1、準備數據sql
造數據腳本 : gendata.shbash
#!/bin/bashsession
file=$1app
s=$2oop
touch $file性能
for((i=0;i<10000000;i++))測試
do優化
str=','$s;spa
name=${i}${str}${i}orm
#echo $name
echo $name>> $file
done
echo 'show testdata'
head $file
造數據:
先造十個小文件,每一個1000w記錄:
bash gendata.sh name.txt name ; bash gendata.sh zhuzhi.txt zhuzhi ; bash gendata.sh minzu.txt minzu ; bash gendata.sh jg.txt jg ;bash gendata.sh gj.txt gj ; bash gendata.sh dz.txt dz ; bash gendata.sh abcd.txt abcd ; bash gendata.sh efgh.txt efgh ; bash gendata.sh xyz.txt xyz ;bash gendata.sh opq.txt opq
total 1.8G
-rw-r--r-- 1 root root 189M Feb 9 10:35 abcd.txt
-rw-r--r-- 1 root root 170M Feb 9 10:32 dz.txt
-rw-r--r-- 1 root root 189M Feb 9 10:38 efgh.txt
-rw-r--r-- 1 root root 170M Feb 9 10:28 gj.txt
-rw-r--r-- 1 root root 170M Feb 9 10:25 jg.txt
-rw-r--r-- 1 root root 199M Feb 9 10:22 minzu.txt
-rw-r--r-- 1 root root 189M Feb 9 10:08 name.txt
-rw-r--r-- 1 root root 180M Feb 9 10:49 opq.txt
-rw-r--r-- 1 root root 180M Feb 9 10:41 xyz.txt
-rw-r--r-- 1 root root 208M Feb 9 10:19 zhuzhi.txt
大文件,1億記錄
bash gendata.sh name1000.txt name
-rw-r--r-- 1 root root 2.1G Feb 9 10:50 name1000.txt
2、測試10個小文件,每一個文件1000萬記錄,180MB大小,總1億記錄,1.8G ,不作任何優化的數據分析
hive中建表:
create table hyl_test_par(id int,name string) partitioned by(sys_sj string,sys_type string) row format delimited fields terminated by ',' stored as textfile;
手動創建分區文件夾:
hadoop fs -mkdir -p /apps/hive/warehouse/hyl_test_par/sys_sj=20170209/sys_type=2003
上傳數據、修改權限:
hadoop fs -put *.txt /apps/hive/warehouse/hyl_test_par/sys_sj=20170209/sys_type=2003/
hadoop fs -chown -R hive /apps/hive/warehouse/hyl_test_par/
修復分區信息:
0: jdbc:hive2://cluster09.hzhz.co:10000> show partitions hyl_test_par;
+------------+--+
| partition |
+------------+--+
+------------+--+
No rows selected (0.125 seconds)
0: jdbc:hive2://cluster09.hzhz.co:10000> msck repair table hyl_test_par;
No rows affected (0.551 seconds)
0: jdbc:hive2://cluster09.hzhz.co:10000> show partitions hyl_test_par;
+--------------------------------+--+
| partition |
+--------------------------------+--+
| sys_sj=20170209/sys_type=2003 |
+--------------------------------+--+
1 row selected (0.123 seconds)
測試:
select count(*) from hyl_test_par where name <> ' ' ;
第一次運行
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (189.116 seconds)
第二次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (118.107 seconds)
第三次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (117.551 seconds)
第四次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (117.44 seconds)
第五次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (113.291 seconds)
======================================莫名的分割線=======================================================
下午從新跑,性能塊了10倍以上!!!
第一次運行
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.274 seconds)
第二次運行
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.525 seconds)
第三次運行
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.11 seconds)
第四次:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.722 seconds)
3、測試10個小文件,每一個文件1000萬記錄,180MB大小,總1億記錄,1.8G,通過analysis分區信息的表
同理,建立測試analysis的表,並導入數據:
create table hyl_test_par_ana(id int,name string) partitioned by(sys_sj string,sys_type string) row format delimited fields terminated by ',' stored as textfile;
hadoop fs -mkdir -p /apps/hive/warehouse/hyl_test_par_ana/sys_sj=20170209/sys_type=2003
hadoop fs -put *.txt /apps/hive/warehouse/hyl_test_par_ana/sys_sj=20170209/sys_type=2003/
hadoop fs -chown -R hive /apps/hive/warehouse/hyl_test_par_ana/
show partitions hyl_test_par_ana;
msck repair table hyl_test_par_ana;
show partitions hyl_test_par_ana;
分區信息分析:
0: jdbc:hive2://cluster09.hzhz.co:10000> analyze table hyl_test_par_ana partition(sys_sj=20170209,sys_type=2003) compute statistics ;
INFO : Session is already open
INFO : Dag name: analyze table hyl_test_par_ana ...statistics(Stage-0)
INFO :
INFO : Status: Running (Executing on YARN cluster with App id application_1486351392526_0021)
.
.
.
INFO : Partition default.hyl_test_par_ana{sys_sj=20170209, sys_type=2003} stats: [numFiles=10, numRows=100000000, totalSize=1927777800, rawDataSize=1827777800]
No rows affected (126.81 seconds)
測試:
select count(*) from hyl_test_par_ana;
0: jdbc:hive2://cluster09.hzhz.co:10000> select count(*) from hyl_test_par_ana;
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (0.086 seconds)
換sql:
select count(*) from hyl_test_par_ana where name <> ' ';
第一次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (118.239 seconds)
第二次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (121.687 seconds)
第三次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (121.319 seconds)
======================================莫名的分割線=======================================================
下午從新跑,性能塊了10倍以上!!!
第一次運行
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (10.923 seconds)
第二次運行
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (6.058 seconds)
第三次運行
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (6.45 seconds)
第四次:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (6.218 seconds)
3、測試1個大文件,2.0G大小,總1億記錄,沒有任何優化的表
建立使用一個大文件的同結構表,並上傳數據:
create table hyl_test_par_big(id int,name string) partitioned by(sys_sj string,sys_type string) row format delimited fields terminated by ',' stored as textfile;
hadoop fs -mkdir -p /apps/hive/warehouse/hyl_test_par_big/sys_sj=20170209/sys_type=2003
hadoop fs -put name1000.txt /apps/hive/warehouse/hyl_test_par_big/sys_sj=20170209/sys_type=2003/
hadoop fs -chown -R hive /apps/hive/warehouse/hyl_test_par_big/
[hdfs@cluster13 tmp]$ hadoop fs -du -h /apps/hive/warehouse/hyl_test_par_big/
2.0 G /apps/hive/warehouse/hyl_test_par_big/sys_sj=20170209
show partitions hyl_test_par_big;
msck repair table hyl_test_par_big;
show partitions hyl_test_par_big;
測試:
select count(*) from hyl_test_par_big where name <> ' ';
第一次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.356 seconds)
第二次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.3 seconds)
第三次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (5.861 seconds)
第四次:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (5.675 seconds)
第五次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.814 seconds)
======================================莫名的分割線=======================================================
下午測試:
第一次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.933 seconds)
第二次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (4.435 seconds)
第三次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (5.868 seconds)
第四次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.403 seconds)
第五次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.814 seconds)
4、測試10個小文件,180MB大小,總1億記錄,是通過insert into方式創建的表
經過insert into 從第一張表中導入數據(這個過程當中hive會自動analysis信息)到新表,測試:
create table hyl_test_par_auto as select * from hyl_test_par distribute by rand(123);
0: jdbc:hive2://cluster09.hzhz.co:10000> create table hyl_test_par_auto as select * from hyl_test_par distribute by rand(123);
INFO : Session is already open
INFO : Dag name: create table hyl_test_par_auto a...rand(123)(Stage-1)
INFO : Tez session was closed. Reopening...
INFO : Session re-established.
INFO :
INFO : Status: Running (Executing on YARN cluster with App id application_1486351392526_0022)
INFO : Map 1: -/- Reducer 2: 0/10
INFO : Map 1: 0/119 Reducer 2: 0/10
.
.
.
INFO : Moving data to directory hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto from hdfs://myBigdata/apps/hive/warehouse/.hive-staging_hive_2017-02-09_13-05-59_036_2906983779886430780-1/-ext-10001
INFO : Table default.hyl_test_par_auto stats: [numFiles=10, numRows=100000000, totalSize=3327777800, rawDataSize=3227777800]
[hdfs@cluster13 tmp]$ hadoop fs -ls -h hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003
Found 10 items
-rwxrwxrwx 2 hive hdfs 183.9 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000000_0
-rwxrwxrwx 2 hive hdfs 183.7 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000001_0
-rwxrwxrwx 2 hive hdfs 183.6 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000002_0
-rwxrwxrwx 2 hive hdfs 184.6 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000003_0
-rwxrwxrwx 2 hive hdfs 183.7 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000004_0
-rwxrwxrwx 2 hive hdfs 183.3 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000005_0
-rwxrwxrwx 2 hive hdfs 184.2 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000006_0
-rwxrwxrwx 2 hive hdfs 184.0 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000007_0
-rwxrwxrwx 2 hive hdfs 184.2 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000008_0
-rwxrwxrwx 2 hive hdfs 183.4 M 2017-02-09 13:28 hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/000009_0
測試:
select count(*) from hyl_test_par_auto where name <> ' ';
第一次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (14.653 seconds)
第二次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (13.989 seconds)
第三次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (9.236 seconds)
drop table hyl_test_auto;
create table hyl_test_par_auto(id int,name string) partitioned by(sys_sj string,sys_type string) row format delimited fields terminated by ',' stored as textfile;
insert into table hyl_test_par_auto partition(sys_sj=20170209,sys_type=2003) select id,name from hyl_test_par distribute by rand(123);
0: jdbc:hive2://cluster09.hzhz.co:10000> insert into table hyl_test_par_auto partition(sys_sj=20170209,sys_type=2003) select id,name from hyl_test_par distribute by rand(123);
INFO : Session is already open
INFO : Dag name: insert into table hyl_test_par_a...rand(123)(Stage-1)
INFO :
INFO : Status: Running (Executing on YARN cluster with App id application_1486351392526_0022)
INFO : Map 1: 0/119 Reducer 2: 0/10
.
.
.
INFO : Loading data to table default.hyl_test_par_auto partition (sys_sj=20170209, sys_type=2003) from hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_auto/sys_sj=20170209/sys_type=2003/.hive-staging_hive_2017-02-09_13-26-47_915_3059621581501435386-1/-ext-10000
INFO : Partition default.hyl_test_par_auto{sys_sj=20170209, sys_type=2003} stats: [numFiles=10, numRows=100000000, totalSize=1927777800, rawDataSize=1827777800]
No rows affected (135.325 seconds)
測試:
select count(*) from hyl_test_par_auto where name <> ' ';
第一次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.303 seconds)
第二次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.56 seconds)
第三次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.446 seconds)
第四次:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.643 seconds)
0: jdbc:hive2://cluster09.hzhz.co:10000> select count(*) from hyl_test_par_auto ;
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (0.098 seconds)
======================================莫名的分割線=======================================================
下午測試:
第一次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.282 seconds)
第二次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.414 seconds)
第三次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (6.452 seconds)
第四次運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (6.24 seconds)
5、統計:
耗時\條件 | 10*180MB*NONE | 10*180MB*analyze | 1*2GB*none | 10*180MB*insert into |
第一次運行 | 11.274 | 10.923 | 11.933 | 11.282 |
第二次運行 | 11.525 | 6.058 | 4.435 | 3.414 |
第三次運行 | 11.11 | 6.45 | 5.868 | 6.452 |
第四次運行 | 11.722 | 6.218 | 3.403 | 6.24 |
平均時間 | 11.40775 | 7.41225 | 6.40975 | 6.847 |
熱數據平均時間(去掉第一次) | 11.45233333 |
6.242 |
4.568666667 |
5.368666667 |
6、繼續尋找優化項:
單個文件,經過insert into方式插入數據的表:
set mapred.reduce.tasks=1;
create table hyl_test_par_big_auto(id int,name string) partitioned by(sys_sj string,sys_type string) row format delimited fields terminated by ',' stored as textfile;
insert into table hyl_test_par_big_auto partition(sys_sj=20170209,sys_type=2003) select id,name from hyl_test_par distribute by rand(123);
0: jdbc:hive2://cluster09.hzhz.co:10000> insert into table hyl_test_par_big_auto partition(sys_sj=20170209,sys_type=2003) select id,name from hyl_test_par distribute by rand(123);
INFO : Tez session hasn't been created yet. Opening session
INFO : Dag name: insert into table hyl_test_par_b...rand(123)(Stage-1)
INFO :
INFO : Status: Running (Executing on YARN cluster with App id application_1486351392526_0023)
INFO : Map 1: -/- Reducer 2: 0/1
INFO : Map 1: 0/119 Reducer 2: 0/1
.
.
.
INFO : Loading data to table default.hyl_test_par_big_auto partition (sys_sj=20170209, sys_type=2003) from hdfs://myBigdata/apps/hive/warehouse/hyl_test_par_big_auto/sys_sj=20170209/sys_type=2003/.hive-staging_hive_2017-02-09_14-30-05_480_3967529948260649900-1/-ext-10000
INFO : Partition default.hyl_test_par_big_auto{sys_sj=20170209, sys_type=2003} stats: [numFiles=1, numRows=100000000, totalSize=1927777800, rawDataSize=1827777800]
No rows affected (104.637 seconds)
測試:
select count(*) from hyl_test_par_big_auto where name <> ' ';
運行:
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.744 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (4.188 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (4.041 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (5.198 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.788 seconds)
耗時\條件 | 10*180MB*NONE | 10*180MB*analyze | 1*2GB*none | 10*180MB*insert into | 1*2GB*none*insert into |
第一次運行 | 11.274 | 10.923 | 11.933 | 11.282 | 11.744 |
第二次運行 | 11.525 | 6.058 | 4.435 | 3.414 | 4.188 |
第三次運行 | 11.11 | 6.45 | 5.868 | 6.452 | 4.041 |
第四次運行 | 11.722 | 6.218 | 3.403 | 6.24 | 5.198 |
平均時間 | 11.40775 | 7.41225 | 6.40975 | 6.847 | 6.29275 |
熱數據平均時間(去掉第一次) | 11.45233333 |
6.242 |
4.56866666 |
5.368666667 |
4.475666667 |
優化,列信息統計分析
analyze table hyl_test_par_ana compute statistics for columns;
select count(*) from hyl_test_par_ana where name <> ' ';
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.519 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (7.688 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (6.456 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (5.651 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.413 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.62 seconds)
矢量查詢(Vectorized query) 每次處理數據時會將1024行數據組成一個batch進行處理,而不是一行一行進行處理:
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
select count(*) from hyl_test_par where name <> ' ';
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.54 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (3.859 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (4.134 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.844 seconds)
+------------+--+
| _c0 |
+------------+--+
| 100000000 |
+------------+--+
1 row selected (11.089 seconds)