第一步: 需求分析
須要哪些字段(時間:每一天,各個時段,id,url,guid,trackTime) 須要分區爲天/時 PV(統計記錄數) UV(guid去重)
第二步: 實施步驟
建Hive表,表列分隔符和文件保持一致 Load數據到Hive表中 寫HiveSql進行統計,將結果放入Hive另外一張表中(數據清洗) 從Hive的另外一張表中的數據導出到Mysql,使用sqoop 網站項目從Mysql讀取這張表的信息
預期結果
日期 小時 PV UV
第三步: 實施
# 建源表(注意進入beeline用戶名密碼是linux的) create database if not exists track_log; use track_log; create table if not exists yhd_source( id string, url string, referer string, keyword string, type string, guid string, pageId string, moduleId string, linkId string, attachedInfo string, sessionId string, trackerU string, trackerType string, ip string, trackerSrc string, cookie string, orderCode string, trackTime string, endUserId string, firstLink string, sessionViewNo string, productId string, curMerchantId string, provinceId string, cityId string, fee string, edmActivity string, edmEmail string, edmJobId string, ieVersion string, platform string, internalKeyword string, resultSum string, currentPage string, linkPosition string, buttonPosition string )row format delimited fields terminated by '\t' stored as textfile load data local inpath '/home/liuwl/opt/datas/2015082818' into table yhd_source; load data local inpath '/home/liuwl/opt/datas/2015082819' into table yhd_source; # 建立清洗表 create table if not exists yhd_clean( id string, url string, guid string, date string, hour string) row format delimited fields terminated by '\t' insert into table yhd_clean select id,url,guid,substring(trackTime,9,2) date,substring(trackTime,12,2) hour from yhd_source; select id,date,hour from yhd_clean limit 5; # 改建分區表(靜態分區) create table if not exists yhd_part1( id string, url string, guid string ) partitioned by (date string,hour string) row format delimited fields terminated by '\t' insert into table yhd_part1 partition (date='28',hour='18') select id,url,guid from yhd_clean where date='28' and hour='18'; insert into table yhd_part1 partition (date='28',hour='19') select id,url,guid from yhd_clean where date='28' and hour='19'; select id,date ,hour from yhd_part1 where date ='28' and hour='18' limit 10; # 使用動態分區須要修改部分參數 hive.exec.dynamic.partition--true hive.exec.dynamic.partition.mode--nonstrict create table if not exists yhd_part2( id string, url string, guid string ) partitioned by (date 
string,hour string) row format delimited fields terminated by '\t' # 動態分區根據partition字段進行匹配 insert into table yhd_part2 partition (date,hour) select * from yhd_clean; select id,date ,hour from yhd_part2 where date ='28' and hour='18' limit 10; # 實現需求 PV: select date,hour,count(url) PV from yhd_part1 group by date,hour; 0: jdbc:hive2://hadoop09-linux-01.ibeifeng.co> select date,hour,count(url) PV from yhd_part1 group by date,hour; +-------+-------+--------+--+ | date | hour | pv | +-------+-------+--------+--+ | 28 | 18 | 64972 | | 28 | 19 | 61162 | +-------+-------+--------+--+ UV: select date,hour,count(distinct(guid)) UV from yhd_part1 group by date,hour; 0: jdbc:hive2://hadoop09-linux-01.ibeifeng.co> select date,hour,count(distinct(guid)) UV from yhd_part1 group by date,hour; +-------+-------+--------+--+ | date | hour | uv | +-------+-------+--------+--+ | 28 | 18 | 23938 | | 28 | 19 | 22330 | +-------+-------+--------+--+ # 結合放入log_result表 create table if not exists log_result as select date,hour,count(url) PV,count(distinct(guid)) UV from yhd_part1 group by date,hour; select date,hour,pv,uv from log_result; 0: jdbc:hive2://hadoop09-linux-01.ibeifeng.co> select date,hour,pv,uv from log_result; +-------+-------+--------+--------+--+ | date | hour | pv | uv | +-------+-------+--------+--------+--+ | 28 | 18 | 64972 | 23938 | | 28 | 19 | 61162 | 22330 | +-------+-------+--------+--------+--+ # 將結果表導出到Mysql,使用Sqoop # 在Mysql中建立數據庫和表 create database if not exists track_result; use track_result; create table if not exists log_track_result( date varchar(10) not null, hour varchar(10) not null, pv varchar(10) not null, uv varchar(10) not null, primary key(date,hour) ); # 使用sqoop export 導出到log_track_result表 bin/sqoop export \ --connect jdbc:mysql://hadoop09-linux-01.ibeifeng.com:3306/track_result \ --username root \ --password root \ --table log_track_result \ --export-dir /user/hive/warehouse/track_log.db/log_result \ --num-mappers 1 \ --input-fields-terminated-by 
'\001' # 在Mysql中查詢測試 select * from log_track_result; mysql> select * from log_track_result; +------+------+-------+-------+ | date | hour | pv | uv | +------+------+-------+-------+ | 28 | 18 | 64972 | 23938 | | 28 | 19 | 61162 | 22330 | +------+------+-------+-------+ 2 rows in set (0.00 sec)