// Create the big table
create table bigtable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
// Create the small table
create table smalltable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
// Create the table that will hold the JOIN result
create table jointable(id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/bigtable' into table bigtable;
load data local inpath '/opt/module/data/smalltable' into table smalltable;
3) Data download address:
EXPLAIN [EXTENDED | DEPENDENCY | AUTHORIZATION] query-sql
explain select * from bigtable;
explain select click_url, count(*) ct from bigtable group by click_url;
explain extended select * from bigtable;
explain extended select click_url, count(*) ct from bigtable group by click_url;
dept_20200401.log:
10 ACCOUNTING 1700
20 RESEARCH 1800
dept_20200402.log:
30 SALES 1900
40 OPERATIONS 1700
dept_20200403.log:
50 TEST 2000
60 DEV 1900
load data local inpath '/opt/module/data/dept_20200401.log' into table dept_partition partition(day='20200401');
load data local inpath '/opt/module/data/dept_20200402.log' into table dept_partition partition(day='20200402');
load data local inpath '/opt/module/data/dept_20200403.log' into table dept_partition partition(day='20200403');
select * from dept_partition where day='20200401';
select * from dept_partition where day='20200401' union select * from dept_partition where day='20200402' union select * from dept_partition where day='20200403';
select * from dept_partition where day='20200401' or day='20200402' or day='20200403';
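To verify that each of these forms scans only the intended partitions rather than the whole table, the plan can be checked with the EXPLAIN command introduced earlier (a quick sketch reusing the last query):
explain select * from dept_partition where day='20200401' or day='20200402' or day='20200403';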
alter table dept_partition add partition(day='20200404');
(2) Add multiple partitions at once
alter table dept_partition add partition(day='20200405') partition(day='20200406');
alter table dept_partition drop partition(day='20200406');
(2) Drop multiple partitions at once
alter table dept_partition drop partition(day='20200404'),partition(day='20200405');
show partitions dept_partition;
desc formatted dept_partition;
create table dept_partition2( deptno int, dname string, loc string) partitioned by (day string, hour string) row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/dept_20200401.log' into table dept_partition2 partition(day='20200401', hour='12');
select * from dept_partition2 where day='20200401' and hour='12';
# Enable dynamic partitioning (default true)
set hive.exec.dynamic.partition=true;
# nonstrict lets every partition column be determined dynamically; strict requires at least one static partition column
set hive.exec.dynamic.partition.mode=nonstrict;
# Maximum number of dynamic partitions the whole job may create
set hive.exec.max.dynamic.partitions=1000;
# Maximum number of dynamic partitions a single node may create
set hive.exec.max.dynamic.partitions.pernode=400;
# Maximum number of HDFS files the whole job may create
set hive.exec.max.created.files=100000;
# Whether to raise an error when a dynamic partition insert produces an empty partition
set hive.error.on.empty.partition=false;
create table dept_partition_dy(id int, name string) partitioned by (loc int) row format delimited fields terminated by '\t';
set hive.exec.dynamic.partition.mode = nonstrict;
insert into table dept_partition_dy partition(loc) select deptno, dname, loc from dept_partition;
show partitions dept_partition_dy;
vim /opt/module/data/student.txt
1001 ss1
1002 ss2
1003 ss3
1004 ss4
1005 ss5
1006 ss6
1007 ss7
1008 ss8
1009 ss9
1010 ss10
1011 ss11
1012 ss12
1013 ss13
1014 ss14
1015 ss15
1016 ss16
create table stu_buck(id int, name string) clustered by(id) into 4 buckets row format delimited fields terminated by ' ';
desc formatted stu_buck;
load data local inpath '/opt/module/data/student.txt' into table stu_buck;
select * from stu_buck;
insert into table stu_buck select * from student_insert;
TABLESAMPLE(BUCKET x OUT OF y)
select * from stu_buck tablesample(bucket 1 out of 4 on id);
hive (test)> select * from stu_buck tablesample(bucket 5 out of 4 on id);
FAILED: SemanticException [Error 10061]: Numerator should not be bigger than denominator in sample clause for table stu_buck
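A further sketch against the same 4-bucket stu_buck table: y may also be smaller than the number of buckets, in which case 4/y buckets are drawn, starting at bucket x and stepping by y. With bucket 1 out of 2 this pulls 2 of the 4 buckets (buckets 1 and 3):
select * from stu_buck tablesample(bucket 1 out of 2 on id);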
Compression format / Codec:
DEFLATE  org.apache.hadoop.io.compress.DefaultCodec
gzip     org.apache.hadoop.io.compress.GzipCodec
bzip2    org.apache.hadoop.io.compress.BZip2Codec
LZO      com.hadoop.compression.lzo.LzopCodec
Snappy   org.apache.hadoop.io.compress.SnappyCodec
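As a sketch of how one of the codecs above is actually wired in (Snappy is picked here purely as an example; the properties are the standard Hive/Hadoop compression switches):
# Compress intermediate data passed between MR stages
set hive.exec.compress.intermediate=true;
# Compress map output with Snappy
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
# Compress the final query output
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;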
Comparison of compression performance:
# Enable partial aggregation on the map side
set hive.map.aggr = true;
# Number of rows processed on the map side before the aggregation is checked
set hive.groupby.mapaggr.checkinterval = 100000;
# When data is skewed, add an extra MR stage that spreads GROUP BY keys randomly for partial aggregation before the final aggregation
set hive.groupby.skewindata = true;
create table emp(empno int,empname string,deptno int) partitioned by (day string) row format delimited fields terminated by ' ';
vim /opt/module/data/emp.txt
1 aa 10
2 bb 20
3 cc 30
load data local inpath '/opt/module/data/emp.txt' into table emp;
select deptno from emp group by deptno;
set hive.groupby.skewindata = true;
select deptno from emp group by deptno;
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
insert .... select id,name,sex, age from student where age > 17;
insert .... select id,name,sex, age from student where age > 18;
insert .... select id,name,sex, age from student where age > 19;
insert into table t_ptn partition(city='A') select id,name,sex, age from student where city='A';
insert into table t_ptn partition(city='B') select id,name,sex, age from student where city='B';
insert into table t_ptn partition(city='C') select id,name,sex, age from student where city='C';
from student
insert into table t_ptn partition(city='A') select id,name,sex, age where city='A'
insert into table t_ptn partition(city='B') select id,name,sex, age where city='B';
select a.id, a.name from a where a.id in (select b.id from b);
select a.id, a.name from a where exists (select id from b where a.id = b.id);
select a.id, a.name from a join b on a.id = b.id;
select a.id, a.name from a left semi join b on a.id = b.id;
select a.*, b.*, c.* from a join b on a.id = b.id join c on a.id = c.id;
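Because b and c both join on a.id, the statement above is compiled into a single MapReduce job. As a contrasting sketch (assuming b and c also share a name column, which is hypothetical here), joining on different keys produces an extra job:
select a.*, b.*, c.* from a join b on a.id = b.id join c on b.name = c.name;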
set hive.cbo.enable=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;
# Predicate pushdown; default is true
set hive.optimize.ppd = true;
explain select o.id from bigtable b join bigtable o on o.id = b.id where o.id <= 10;
explain select b.id from bigtable b join (select id from bigtable where id <= 10) o on b.id = o.id;
# Enable automatic map join; default is true
set hive.auto.convert.join=true;
# Threshold (in bytes) below which a table is treated as the small table; default 25 MB
set hive.mapjoin.smalltable.filesize=25000000;
# Default is true
set hive.auto.convert.join = true;
insert overwrite table jointable select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url from smalltable s left join bigtable b on s.id = b.id;
Time taken: 52.581 seconds
insert overwrite table jointable select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url from bigtable b left join smalltable s on s.id = b.id;
Time taken: 55.997 seconds
create table bigtable2( id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/bigtable' into table bigtable2;
insert overwrite table jointable select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url from bigtable a join bigtable2 b on a.id = b.id;
create table bigtable_buck1( id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) clustered by(id) sorted by(id) into 6 buckets row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/bigtable' into table bigtable_buck1;
create table bigtable_buck2( id bigint, t bigint, uid string, keyword string, url_rank int, click_num int, click_url string) clustered by(id) sorted by(id) into 6 buckets row format delimited fields terminated by '\t';
load data local inpath '/opt/module/data/bigtable' into table bigtable_buck2;
set hive.optimize.bucketmapjoin = true;
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
insert overwrite table jointable select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url from bigtable_buck1 s join bigtable_buck2 b on b.id = s.id;
Test result: Time taken: 96.226 seconds
set hive.map.aggr = true;
set hive.groupby.mapaggr.checkinterval = 100000;
set hive.groupby.skewindata = true;
# Amount of data handled by each reducer; default 256 MB
set hive.exec.reducers.bytes.per.reducer = 256000000;
# Maximum number of reducers per job; default 1009
set hive.exec.reducers.max = 1009;
N = min(parameter 2, total input data size / parameter 1)   (parameter 2 is the 1009 above; parameter 1 is the 256 MB)
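A worked example under the defaults above: for 1 GB of total input, N = min(1009, 1024 MB / 256 MB) = 4 reducers; the 1009 cap only matters once the input exceeds roughly 1009 × 256 MB ≈ 252 GB.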
set mapreduce.job.reduces = 15;
# If the number of records for a join key exceeds this value, the key is split and handled separately; set it according to the actual data volume
set hive.skewjoin.key=100000;
# If skew shows up during the join, this should be set to true
set hive.optimize.skewjoin=false;
# Number of map tasks used for the follow-up map join that handles the skewed keys; default 10000
set hive.skewjoin.mapjoin.map.tasks=10000;
select count(*) from emp;
# Shrink the maximum split size to 100 bytes so that more splits, and therefore more map tasks, are created
set mapreduce.input.fileinputformat.split.maxsize=100;
select count(*) from emp;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
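CombineHiveInputFormat packs small files into larger splits before the map phase; how aggressively it does so is governed by the standard MapReduce split-size parameters (a sketch, the concrete values are only an assumption):
# Maximum size of a combined split
set mapreduce.input.fileinputformat.split.maxsize=128000000;
# Minimum bytes kept together on one node / one rack before spilling into a cross-node or cross-rack split
set mapreduce.input.fileinputformat.split.minsize.per.node=100000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=100000000;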
# Merge small files at the end of a map-only job; default true
set hive.merge.mapfiles = true;
# Merge small files at the end of a map-reduce job; default false
set hive.merge.mapredfiles = true;
# Target size of the merged files; default 256 MB
set hive.merge.size.per.task = 268435456;
# When the average output file size is below this value, launch a separate map-reduce job to merge the files
set hive.merge.smallfiles.avgsize = 16777216;
# Equivalent to running a combiner on the map side
set hive.map.aggr=true;
# Default is true
set mapred.map.tasks.speculative.execution = true;
set hive.exec.reducers.bytes.per.reducer = 256000000;
set hive.exec.reducers.max = 1009;
N = min(parameter 2, total input data size / parameter 1)   (parameter 2 is the 1009 above; parameter 1 is the 256 MB)
set mapreduce.job.reduces = 15;
mapred.reduce.tasks.speculative.execution (the Hadoop parameter)
hive.mapred.reduce.tasks.speculative.execution (the equivalent parameter on the Hive side; it has the same effect as the Hadoop one, and either of the two can be used)
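When reduce tasks run for a long time or their duplicated output is expensive to throw away, speculative execution can waste cluster resources; it can be switched off (a sketch; mapreduce.reduce.speculative is the current Hadoop name for the same switch):
set mapred.reduce.tasks.speculative.execution=false;
set mapreduce.reduce.speculative=false;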
<property>
    <name>hive.fetch.task.conversion</name>
    <value>more</value>
    <description>
      Expects one of [none, minimal, more].
      Some select queries can be converted to single FETCH task minimizing latency.
      Currently the query should be single sourced not having any subquery and should not have
      any aggregations or distincts (which incurs RS), lateral views and joins.
      0. none : disable hive.fetch.task.conversion
      1. minimal : SELECT STAR, FILTER on partition columns, LIMIT only
      2. more : SELECT, FILTER, LIMIT only (support TABLESAMPLE and virtual columns)
    </description>
</property>
set hive.fetch.task.conversion=none;
select * from emp;
select empname from emp;
select empname from emp limit 3;
set hive.fetch.task.conversion=more;
select * from emp;
select empname from emp;
select empname from emp limit 3;
// Enable local MR
set hive.exec.mode.local.auto=true;
// Maximum input size for local MR; local mode is used when the input is smaller than this value (default 134217728, i.e. 128 MB)
set hive.exec.mode.local.auto.inputbytes.max=50000000;
// Maximum number of input files for local MR; local mode is used when the file count is below this value (default 4)
set hive.exec.mode.local.auto.input.files.max=10;
set hive.exec.mode.local.auto=true;
select * from emp cluster by deptno;
Time taken: 1.443 seconds, Fetched: 3 row(s)
set hive.exec.mode.local.auto=false;
select * from emp cluster by deptno;
Time taken: 19.493 seconds, Fetched: 3 row(s)
// Enable parallel execution of stages; default false
set hive.exec.parallel=true;
// Maximum degree of parallelism allowed within a single SQL statement; default 8
set hive.exec.parallel.thread.number=16;
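Parallel execution only pays off when a query contains stages with no dependency on each other, for example the two independent branches of a UNION ALL; a sketch reusing the emp table created earlier (the query itself is purely illustrative):
select * from (
    select deptno, count(*) cnt from emp group by deptno
    union all
    select deptno, count(*) cnt from emp where deptno > 10 group by deptno
) t;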
yarn.nodemanager.resource.memory-mb*(spark.executor.cores/yarn.nodemanager.resource.cpu-vcores)
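A worked example with assumed cluster numbers: if yarn.nodemanager.resource.memory-mb = 57344 MB (56 GB), yarn.nodemanager.resource.cpu-vcores = 16 and spark.executor.cores = 4, each executor is entitled to 56 GB × (4 / 16) = 14 GB, which the settings below split into 11.2 GB of executor memory plus 2.8 GB of memory overhead.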
set spark.executor.memory=11.2g;
set spark.yarn.executor.memoryOverhead=2.8g;
set hive.execution.engine=spark;
set spark.executor.memory=11.2g;
set spark.yarn.executor.memoryOverhead=2.8g;
set spark.executor.cores=4;
set spark.executor.instances=40;
set spark.dynamicAllocation.enabled=true;
set spark.serializer=org.apache.spark.serializer.KryoSerializer;