•Databases
show databases;
CREATE DATABASE IF NOT EXISTS test;
drop database test;
use test;
•Creating tables
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...)
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
•CREATE TABLE creates a table with the given name. If a table with the same name already exists, an exception is thrown; the IF NOT EXISTS option can be used to ignore the exception
•The EXTERNAL keyword lets the user create an external table and specify a path (LOCATION) pointing to the actual data at creation time
•LIKE allows the user to copy the structure of an existing table without copying its data
•COMMENT adds a description to the table and to individual columns
•ROW FORMAT
DELIMITED [FIELDS TERMINATED BY char] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
When creating a table, the user can define a custom SerDe or use the built-in one. If ROW FORMAT is not specified, or ROW FORMAT DELIMITED is specified, the built-in SerDe is used. When creating a table the user also specifies the columns; a custom SerDe can be specified along with them, and Hive uses the SerDe to determine the actual column data of the table.
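For example, a table can bind an explicit SerDe instead of the delimited defaults. A minimal sketch, assuming a newer Hive where RegexSerDe lives under org.apache.hadoop.hive.serde2 (older releases ship it as org.apache.hadoop.hive.contrib.serde2.RegexSerDe); the table name and regex are only illustrative:
CREATE TABLE IF NOT EXISTS access_log (host STRING, request STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "([^ ]*) (.*)")
STORED AS TEXTFILE;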
•STORED AS
SEQUENCEFILE
| TEXTFILE
| RCFILE
| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
If the file data is plain text, use STORED AS TEXTFILE. If the data needs to be compressed, use STORED AS SEQUENCEFILE.
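A sketch of a compressed SequenceFile table; the seq_pokes table name is illustrative, and the two set commands are the standard Hive/Hadoop output-compression switches:
set hive.exec.compress.output=true;
set mapred.output.compression.type=BLOCK;
CREATE TABLE IF NOT EXISTS seq_pokes (foo STRING, bar STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS SEQUENCEFILE;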
•Column types supported by Hive
TINYINT
SMALLINT
INT
BIGINT
BOOLEAN
FLOAT
DOUBLE
STRING
•Create a simple table
CREATE TABLE IF NOT EXISTS pokes (foo STRING, bar STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
•Create an external table
CREATE EXTERNAL TABLE pokes (foo STRING, bar STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/test/pokes';
•Create a partitioned table
CREATE TABLE IF NOT EXISTS invites (foo STRING, bar STRING)
PARTITIONED BY(d STRING,s STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
•Create a bucketed table
CREATE TABLE IF NOT EXISTS buckets (foo STRING, bar STRING)
CLUSTERED BY (foo) into 4 buckets
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
•Copy an empty table (structure only)
CREATE TABLE invites_copy LIKE invites;
•Create a table and import data from another table (runs a MapReduce job)
CREATE TABLE parts AS SELECT * FROM invites;
•HBase-backed table
CREATE EXTERNAL TABLE workStatisticsNone (
id string,
num int
) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,f:c")
TBLPROPERTIES ("hbase.table.name" = "workStatisticsNone","hbase.mapred.output.outputtable" = "workStatisticsNone");正則表達式
•Drop tables
drop table pokes;
drop table invites;
•Altering table structure
•Add / replace / change columns
ALTER TABLE table_name ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
ALTER TABLE pokes ADD COLUMNS (d STRING COMMENT 'd comment');
ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name]
alter table pokes change d s string comment 'change column name' first;
•Rename a table:
ALTER TABLE pokes RENAME TO poke;
•Repair table partitions:
MSCK REPAIR TABLE invites;
ALTER TABLE invites RECOVER PARTITIONS;
•Create / drop views
CREATE VIEW [IF NOT EXISTS] view_name [(column_name [COMMENT column_comment], ...)] [COMMENT view_comment] [TBLPROPERTIES (property_name = property_value, ...)] AS SELECT ...
create view v_invites(foo,bar) as select foo,bar from invites;
DROP VIEW v_invites;
•Show commands
SHOW TABLES;
SHOW TABLES '.*s'; (the pattern is a regular expression)
desc pokes;
SHOW FUNCTIONS;
DESCRIBE FUNCTION <function_name>;
DESCRIBE FUNCTION EXTENDED <function_name>;
•Loading data
•Load data into a given table
LOAD DATA LOCAL INPATH 'kv.txt' OVERWRITE INTO TABLE pokes;
LOAD DATA LOCAL INPATH 'kv1.txt' INTO TABLE pokes;
LOAD DATA INPATH '/test/kv.txt' INTO TABLE pokes;
The [OVERWRITE] keyword means the data already in the table is overwritten; without it the new data is appended.
The [LOCAL] keyword means the file being loaded comes from the local file system; without it the path refers to a file on HDFS.
•Load into a specific partition of a table
LOAD DATA LOCAL INPATH 'kv.txt' OVERWRITE INTO TABLE invites PARTITION(d='1',s='1');
LOAD DATA LOCAL INPATH 'kv1.txt' INTO TABLE invites PARTITION(d='1',s='1');
LOAD DATA LOCAL INPATH 'kv.txt' OVERWRITE INTO TABLE invites PARTITION(d='1',s='2');
•Insert query results into Hive tables
INSERT OVERWRITE TABLE pokes SELECT foo,bar FROM invites; overwrites the files under the table's directory
INSERT INTO TABLE pokes SELECT foo,bar FROM invites;
INSERT INTO TABLE invites_copy PARTITION(d='1',s='1') SELECT foo,bar FROM invites;
Dynamic partition insert (disabled by default)
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE invites_copy PARTITION(d,s) SELECT * FROM invites;
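Depending on the Hive version, dynamic partitioning itself may also have to be switched on; a sketch of the full sequence (the partition columns must come last in the SELECT list):
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE invites_copy PARTITION(d,s) SELECT foo, bar, d, s FROM invites;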
•Multiple-insert mode
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ...] select_statement2] ...
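For example, one scan of invites can feed several targets (a sketch using the tables defined above):
FROM invites
INSERT OVERWRITE TABLE pokes SELECT foo, bar
INSERT OVERWRITE TABLE invites_copy PARTITION(d='1',s='1') SELECT foo, bar;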
•Write query results to the file system
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
insert overwrite local DIRECTORY 'test.txt' select * from invites_copy;
•Querying data
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[ CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY| ORDER BY col_list]
]
[LIMIT number]
select * from invites limit 2,5;
Differences between ORDER BY and SORT BY
•ORDER BY does a global sort and uses only one reduce task
•SORT BY sorts only within each reducer
Hive distributes rows across reducers according to the DISTRIBUTE BY columns and the number of reducers, using a hash algorithm by default
CLUSTER BY combines DISTRIBUTE BY with SORT BY, but the sort order cannot be specified (it is ascending only); see the sketch below
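A minimal sketch on the invites table (the reducer count of 4 is arbitrary):
set mapred.reduce.tasks=4;
select * from invites distribute by foo sort by foo asc, bar desc;
select * from invites cluster by foo;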
select * from invites where foo=1 or bar=2;
WHERE conditions support AND, OR, BETWEEN, IN, NOT IN, EXISTS, NOT EXISTS
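A couple of sketches with the other predicate forms (the literal values are arbitrary; correlated EXISTS / NOT EXISTS subqueries need Hive 0.13 or later):
select * from invites where foo between '1' and '9' and bar in ('2', '3');
select a.* from invites a where not exists (select 1 from invites_copy b where b.bar = a.bar);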
•JOIN
Hive only supports equality joins (equi-joins), outer joins, and left semi joins. Hive does not support non-equality joins, because they are very hard to translate into map/reduce jobs
•join on is a common join
The most basic join strategy; it is not affected by data size and is also called a reduce-side join (see the example below)
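For example, a plain equi-join between the two example tables runs as a common (reduce-side) join unless Hive converts it to a map join:
select a.foo, b.bar from invites a join invites_copy b on (a.foo = b.foo);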
•left semi joins
left semi join can be seen as a variant of map join (broadcast join): only the join keys of the table are passed to the map stage. If the keys are small enough a map join is still executed; otherwise it falls back to a common join. It is used in place of an IN condition
select a.* from invites a left semi join invites_copy b on (a.bar=b.bar)
•Map Join
SELECT /*+ MAPJOIN(smalltable) */ smalltable.key, bigtable.value
FROM smalltable JOIN bigtable ON smalltable.key = bigtable.key;
Since Hive 0.7 the /*+ MAPJOIN(smalltable) */ hint is no longer required; the conversion is automatic, and Hive decides which table is the small one and which is the big one
set hive.auto.convert.join=true; # whether to automatically convert joins to map joins
set hive.mapjoin.smalltable.filesize=300000000; # maximum file size of the small table; the default is 25000000, i.e. 25MB
set hive.auto.convert.join.noconditionaltask=true; # whether to merge multiple map joins into one
set hive.auto.convert.join.noconditionaltask.size=300000000;
# when merging multiple map joins into one, the maximum total file size of all the small tables; e.g. one big table joined in sequence with 3 small tables a(10M), b(8M), c(12M)
FULL [OUTER] JOIN will not use the MapJoin optimization
•Bucket Map Join
Used when the join key of the two tables being joined is also the bucket column
set hive.optimize.bucketmapjoin = true;
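A sketch, assuming a second bucketed table buckets2 created like the buckets table above (CLUSTERED BY the join key, with a bucket count that is a multiple of the other table's); the MAPJOIN hint is the pre-auto-convert way of asking for a map join:
set hive.optimize.bucketmapjoin = true;
select /*+ MAPJOIN(b) */ a.foo, a.bar
from buckets a join buckets2 b on (a.foo = b.foo);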