1. DDL
Database operations
Dropping a database
drop database if exists <database name> [restrict|cascade];
-- Drops the database. By default Hive refuses to drop a database that still contains tables; the tables must be removed first, otherwise an error is raised.
-- Adding the cascade keyword forces the drop. The default is restrict, i.e. the restricted mode.
e.g. hive> drop database if exists users cascade;
1.1 Partitioned tables (PARTITIONED BY)
There are two kinds of partitioned tables: single-partition tables, where the table directory contains just one level of subdirectories, and multi-partition tables, where the subdirectories are nested.
1.1.1 Creating a single-partition table:
create table day_table (id int, content string) partitioned by (dt string); -- a single-partition table, partitioned by day; the table exposes three columns: id, content, and dt
1.1.2 Creating a two-level partitioned table:
create table day_hour_table (id int, content string) partitioned by (dt string, hour string); -- a two-level partitioned table, by day and by hour; dt and hour become two additional columns
1.1.3 Loading data
Loading into the single-partition table:
LOAD DATA local INPATH '/root/hivedata/dat_table.txt' INTO TABLE day_table partition(dt='2017-07-07');
Loading into the two-level partitioned table:
LOAD DATA local INPATH '/root/hivedata/dat_table.txt' INTO TABLE day_hour_table PARTITION(dt='2017-07-07', hour='08');
1.1.4 Partition-based queries:
SELECT day_table.* FROM day_table WHERE day_table.dt = '2017-07-07';
1.1.5 Viewing partitions:
show partitions day_hour_table;
In short, partitions are an aid to querying: they narrow the scan scope, speed up retrieval, and let the data be managed by a chosen rule or condition.
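For example, a query against the two-level table can prune on both partition columns; assuming the partition loaded above, Hive scans only the files under dt=2017-07-07/hour=08:

select * from day_hour_table where dt='2017-07-07' and hour='08';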
1.2 ROW FORMAT DELIMITED (specifying delimiters)
-- Create a partitioned table with an explicit field delimiter:
create table day_table (id int, content string) partitioned by (dt string) row format delimited fields terminated by ',';

For tables with complex column types, the collection and map-key delimiters are specified as well:
create table complex_array(name string, work_locations array<string>) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',';
Sample data:
zhangsan beijing,shanghai,tianjin,hangzhou
wangwu shanghai,chengdu,wuhan,haerbin

create table t_map(id int, name string, hobby map<string,string>) row format delimited fields terminated by ',' collection items terminated by '-' map keys terminated by ':';
Sample data:
1,zhangsan,唱歌:很是喜歡-跳舞:喜歡-游泳:通常般
2,lisi,打遊戲:很是喜歡-籃球:不喜歡
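Once rows like the samples above are loaded, the complex columns can be addressed directly in queries; a minimal sketch (the array index and the literal map key are taken from the sample data):

select name, work_locations[0] as first_loc, size(work_locations) as num_locs from complex_array;  -- arrays are indexed from 0
select id, name, hobby['唱歌'] as singing from t_map;  -- maps are indexed by key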
1.3 Managed (internal) and external tables
create table student(Sno int,Sname string,Sex string,Sage int,Sdept string) row format delimited fields terminated by ',';
create external table student_ext(Sno int,Sname string,Sex string,Sage int,Sdept string) row format delimited fields terminated by ',' location '/stu';
Loading data into the managed and external tables:
load data local inpath '/root/hivedata/students.txt' overwrite into table student;
load data inpath '/stu' into table student_ext;
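The practical difference shows up on drop: for a managed table Hive deletes both the metadata and the data in the warehouse directory, while for an external table only the metadata goes away. A quick sketch with the two tables above (illustrative only; the bucketing section below still reuses student):

drop table student;      -- managed: metadata and data are both deleted
drop table student_ext;  -- external: metadata only; the files under /stu remain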
1.4 Local mode
set hive.exec.mode.local.auto=true;
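Local mode only kicks in for small jobs. The thresholds are tunable; the property names below are the stock Hive ones, and the values shown are just illustrative:

set hive.exec.mode.local.auto.inputbytes.max=134217728;  -- max total input size in bytes (128 MB)
set hive.exec.mode.local.auto.input.files.max=4;         -- max number of input files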
1.5 Bucketed tables (clustered by ... into num buckets)
Enable bucketing:
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;
TRUNCATE TABLE stu_buck;  -- empty the table first
drop table stu_buck;      -- then drop and recreate it
create table stu_buck(Sno int, Sname string, Sex string, Sage int, Sdept string)
clustered by(Sno)
sorted by(Sno DESC)
into 4 buckets
row format delimited fields terminated by ',';
Loading data into the bucketed table
insert overwrite table stu_buck select * from student cluster by(Sno);
Bucketing and sorting in queries: cluster by, sort by, distribute by
select * from student cluster by(Sno);
insert overwrite table student_buck select * from student cluster by(Sno) sort by(Sage);
-- error: cluster by and sort by cannot be used together
To bucket on one column while sorting on another:
insert overwrite table stu_buck select * from student distribute by(Sno) sort by(Sage asc);
Summary:
cluster by (distributes and sorts, necessarily on the same column) == distribute by (distributes) + sort by (sorts; the columns may differ)
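One payoff of bucketing is cheap sampling: TABLESAMPLE can read a single bucket instead of scanning the whole table. A sketch against stu_buck (bucket 1 of the 4 created above):

select * from stu_buck tablesample(bucket 1 out of 4 on Sno);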
Adding/dropping partitions
drop table t_partition;
create table t_partition(id int,name string) partitioned by (dt string) row format delimited fields terminated by ',';
Adding a partition
alter table t_partition add partition (dt='2008-08-08') location 'hdfs://node-21:9000/t_parti/';
Adding a partition this way does not move the data under /t_parti, and no dt=2008-08-08 partition directory is created; the partition simply points at the given location.
Dropping a partition
alter table t_partition drop partition (dt='2008-08-08');
Dropping the partition deletes the data under /t_parti, and the /t_parti directory itself is removed with it.
Note how this differs from adding a partition via load data, which does move the data and does create the partition directory.
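If partition directories are created on HDFS directly, bypassing Hive, the metastore will not know about them; as a sketch of the usual fix, msck repair table scans the table's path and registers any partitions it finds missing:

msck repair table t_partition;
show partitions t_partition;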
Insert statements
Multi-table insert:
create table source_table (id int, name string) row format delimited fields terminated by ',';
create table test_insert1 (id int) row format delimited fields terminated by ',';
create table test_insert2 (name string) row format delimited fields terminated by ',';

from source_table
insert overwrite table test_insert1 select id
insert overwrite table test_insert2 select name;
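Each insert branch can also take its own where clause, so a single scan of source_table can feed differently filtered targets; a minimal sketch:

from source_table
insert overwrite table test_insert1 select id where id > 2
insert overwrite table test_insert2 select name where id <= 2;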
Dynamic partition insert
set hive.exec.dynamic.partition=true;            -- enable dynamic partitioning (off by default)
set hive.exec.dynamic.partition.mode=nonstrict;  -- the default, strict, requires at least one static partition column; nonstrict lets every partition column be dynamic
Requirement:
Insert the data from dynamic_partition_table into the corresponding partitions of the target table d_p_t, keyed on the date (day).
Source table:
create table dynamic_partition_table(day string, ip string) row format delimited fields terminated by ",";
load data local inpath '/root/hivedata/dynamic_partition_table.txt' into table dynamic_partition_table;
Sample data:
2015-05-10,ip1
2015-05-10,ip2
2015-06-14,ip3
2015-06-14,ip4
2015-06-15,ip1
2015-06-15,ip2
Target table:
create table d_p_t(ip string) partitioned by (month string,day string);
Dynamic insert:
insert overwrite table d_p_t partition (month,day) select ip,substr(day,1,7) as month,day from dynamic_partition_table;
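Hive binds the trailing select columns to the dynamic partition columns by position (here month, then day). Given the six sample rows above, the expected partitions would be:

show partitions d_p_t;
-- month=2015-05/day=2015-05-10
-- month=2015-06/day=2015-06-14
-- month=2015-06/day=2015-06-15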
Exporting query results to the file system
Save the query results to a specified directory (either local or on HDFS):
insert overwrite local directory '/root/123456' select * from t_p;
insert overwrite directory '/aaa/test' select * from t_p;
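By default the exported files use Hive's internal \001 delimiter. Since Hive 0.11 a row format can be given on the directory export, which makes the output human-readable; a hedged sketch:

insert overwrite local directory '/root/123456'
row format delimited fields terminated by ','
select * from t_p;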
The various joins in Hive
Prepare the data:
a.txt:
1,a
2,b
3,c
4,d
7,y
8,u

b.txt:
2,bb
3,cc
7,yy
9,pp
Create the tables:
create table a(id int,name string) row format delimited fields terminated by ',';
create table b(id int,name string) row format delimited fields terminated by ',';
Load the data:
load data local inpath '/root/hivedata/a.txt' into table a;
load data local inpath '/root/hivedata/b.txt' into table b;
Experiment:
** inner join
select * from a inner join b on a.id=b.id;
select a.id,a.name from a join b on a.id = b.id;
select a.* from a join b on a.id = b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 7     | y       | 7     | yy      |
+-------+---------+-------+---------+--+
** left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
+-------+---------+-------+---------+--+
** right join
select * from a right join b on a.id=b.id;
select * from b right join a on b.id=a.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 7     | y       | 7     | yy      |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+--+
** full outer join
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+--+
** Hive's special join: left semi join
select * from a left semi join b on a.id = b.id;
select a.* from a inner join b on a.id=b.id;
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 2     | b       |
| 3     | c       |
| 7     | y       |
+-------+---------+--+
This is equivalent to:
select a.id,a.name from a where a.id in (select b.id from b);  -- extremely inefficient in Hive
select a.id,a.name from a join b on (a.id = b.id);
select * from a inner join b on a.id=b.id;
cross join (## use with caution)
Returns the Cartesian product of the two tables; no join key is required. With the sample data above (6 rows in a, 4 in b), the result has 24 rows.
select a.*, b.* from a cross join b;
Built-in JSON function
select get_json_object(line,'$.movie') as movie, get_json_object(line,'$.rate') as rate from rat_json limit 10;
transform example:
1. First, load the rating.json file into a raw Hive table, rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/root/hivedata/rating.json' into table rat_json;
2. Parse the JSON into four fields and insert the result into a new table, t_rating
drop table if exists t_rating;
create table t_rating(movieid string, rate int, timestring string, uid string) row format delimited fields terminated by '\t';
insert overwrite table t_rating
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timeStamp') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json limit 10;
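As an aside, when several fields come out of the same JSON string, the built-in json_tuple UDTF parses each line once instead of once per get_json_object call; a hedged equivalent of the insert above:

insert overwrite table t_rating
select jt.movieid, jt.rate, jt.timestring, jt.uid
from rat_json
lateral view json_tuple(line, 'movie', 'rate', 'timeStamp', 'uid') jt as movieid, rate, timestring, uid
limit 10;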
3. Use transform + Python to convert unixtime into a weekday
First, write a Python script file:
vi weekday_mapper.py
#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    # isoweekday(): Monday = 1 ... Sunday = 7
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])
Save the file.
Then add the file as a Hive resource so it is shipped to each task:
hive> add FILE /root/hivedata/weekday_mapper.py;
create table u_data_new as
select transform (movieid, rate, timestring, uid)
using 'python weekday_mapper.py'
as (movieid, rate, weekday, uid)
from t_rating;

select distinct(weekday) from u_data_new limit 10;
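A quick sanity check on the result; isoweekday() only produces the values 1 through 7:

select weekday, count(*) as cnt from u_data_new group by weekday;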
To inspect a table's metadata (the Table Type field shows MANAGED_TABLE vs EXTERNAL_TABLE):
desc formatted student;