1.安裝git和下載tpc-h-impala腳本
[root@ip-172-31-34-31 ~]# yum install git
[root@ip-172-31-34-31 ~]# git clone https://github.com/kj-ki/tpc-h-impala
[root@ip-172-31-34-31 ~]# cd tpc-h-impala/
[root@ip-172-31-34-31 tpc-h-impala]# ls
benchmark.conf confs data README.md tpch_benchmark.sh tpch_hive tpch_impala tpch_prepare
2.將tpch dbgen tool生成好的數據移動到指定目錄
[root@ip-172-31-34-31 data]# mv /root/tpch_2_17_0/data10g/*.tbl /root/tpc-h-impala/data/
3.調整tpc-h-impala腳本
因爲涉及到權限問題,調整tpch_prepare_data.sh腳本:將第一行改成以下:
sudo -u hdfs /usr/bin/hadoop fs -mkdir /tpch/
並增加一行:
sudo -u hdfs /usr/bin/hadoop fs -chown root /tpch
4.運行腳本tpch_prepare_data.sh,將數據從本地寫到HDFS
[root@ip-172-31-34-31 data]# ./tpch_prepare_data.sh
5.調整tpch_benchmark.sh腳本
因爲在運行過程當中會在Hive上建表,這些表要對impala可見,需要運行invalidate metadata,所以在運行impala查詢的語句前加入如下一行:
$IMPALA_CMD -q 'invalidate metadata' 2>&1
#!/usr/bin/env bash
#
# TPC-H benchmark driver for Impala.
#
# For every trial and every query this script:
#   1. runs the Hive "prepare" script for the query,
#   2. issues 'invalidate metadata' through impala-shell,
#   3. runs the Impala version of the query,
# appending all command output to $LOG_FILE and printing only the
# lines that start with "Time:" to the terminal.
#
# Expects benchmark.conf (sourced from the current directory) to provide:
#   LOG_FILE, LOG_DIR, NUM_OF_TRIALS, TPCH_QUERIES_ALL, BASE_DIR,
#   TIME_CMD, HIVE_CMD, IMPALA_CMD

# set up configurations
source benchmark.conf

# Keep the log of a previous run: move it aside under a name that carries
# the old file's modification time (date --reference).
if [ -e "$LOG_FILE" ]; then
  timestamp=$(date "+%F-%R" --reference="$LOG_FILE")
  backupFile="$LOG_FILE.$timestamp"
  mv "$LOG_FILE" "$LOG_DIR/$backupFile"
fi

echo ""
echo "***********************************************"
echo "* TPC-H benchmark on Impala *"
echo "***********************************************"
echo " "
echo "See $LOG_FILE for more details of query errors."
echo ""

trial=0
while [ "$trial" -lt "$NUM_OF_TRIALS" ]; do
  trial=$((trial + 1))
  echo "Executing Trial #$trial of $NUM_OF_TRIALS trial(s)..."

  # Left unquoted on purpose: benchmark.conf is not visible here, so this
  # keeps the original word-splitting of the query list.
  # shellcheck disable=SC2068
  for query in ${TPCH_QUERIES_ALL[@]}; do
    echo "Running query: $query" | tee -a "$LOG_FILE"

    # TIME_CMD / HIVE_CMD / IMPALA_CMD hold a command plus its arguments,
    # so they must stay unquoted to be split into words.
    echo "Running Hive prepare query: $query" >> "$LOG_FILE"
    # shellcheck disable=SC2086
    $TIME_CMD $HIVE_CMD -f "$BASE_DIR/tpch_prepare/${query}.hive" 2>&1 \
      | tee -a "$LOG_FILE" \
      | grep '^Time:'
    # PIPESTATUS[0] is the exit status of the query command itself,
    # not of tee/grep.
    returncode=${PIPESTATUS[0]}
    if [ "$returncode" -ne 0 ]; then
      echo "ABOVE QUERY FAILED:$returncode"
    fi

    # If you want to use old beta, enable below.
    #$TIME_CMD $IMPALA_CMD -q 'refresh' 2>&1 | tee -a $LOG_FILE | grep '^Time:'
    #returncode=${PIPESTATUS[0]}
    #if [ $returncode -ne 0 ]; then
    #  echo "ABOVE QUERY FAILED:$returncode"
    #fi

    echo "Running Impala query: $query" >> "$LOG_FILE"
    # Tables were just (re)created through Hive; reload Impala's metadata
    # before querying them.
    # shellcheck disable=SC2086
    $IMPALA_CMD -q 'invalidate metadata' 2>&1
    # shellcheck disable=SC2086
    $TIME_CMD $IMPALA_CMD --query_file="$BASE_DIR/tpch_impala/${query}.impala" 2>&1 \
      | tee -a "$LOG_FILE" \
      | grep '^Time:'
    returncode=${PIPESTATUS[0]}
    if [ "$returncode" -ne 0 ]; then
      echo "ABOVE QUERY FAILED:$returncode"
    fi

    #echo "Running Hive query: $query" >> $LOG_FILE
    #$TIME_CMD $HIVE_CMD -f $BASE_DIR/tpch_hive/${query}.hive 2>&1 | tee -a $LOG_FILE | grep '^Time:'
    #returncode=${PIPESTATUS[0]}
    #if [ $returncode -ne 0 ]; then
    #  echo "ABOVE QUERY FAILED:$returncode"
    #fi
  done
done # TRIAL

echo "***********************************************"
6.修改配置文件benchmark.conf,使指向正確的impala master:
因爲在運行impala-shell的機器上沒有配置impala-daemon,因此需要這個修改
# impala
IMPALA_CMD="/usr/bin/impala-shell --impalad=172.31.25.244:21000"
7.啓動MR、Hive和Impala
注意,要運行impala,hive必須先啓動MR
8.運行benchmark腳本
[root@ip-172-31-34-31 tpc-h-impala]# pwd
/root/tpc-h-impala
[root@ip-172-31-34-31 tpc-h-impala]# ./tpch_benchmark.sh