一 使用SpagoBI和Hive進行互聯網統計的分區問題
1 原來根據年月日進行分區,拼接SQL時會異常複雜,需要在Where條件中判斷是否跨年、跨月、跨日等問題。
2 Hive的自定義函數不能在Where條件後返回字符串條件來進行數據篩選,但是可以返回boolean進行數據過濾。
比如:自定義函數date_where(startTime, endTime),根據開始日期和結束日期返回格式爲:
year=2015 and month=08 and day > 1 and day < 10 的字符串,將其拼接到Where date_where("2015-08-01", "2015-08-10")是不支持的。
二 針對上述問題,通過日期(dt)字符串創建分區
Step1 建立數據庫
1 建立tvlog_test數據庫 create database tvlog_test;
Step2 建立數據表
1 建立tvlog_tcl數據表
-- Target table for the rows copied from tvlog.tvlog_tcl.
-- Every data column is declared as a plain string.
-- Rows are partitioned by one string column (dt) and stored as ORC.
create table if not exists tvlog_test.tvlog_tcl
(
    id          string,
    userid      string,
    channelid   string,
    channelname string,
    region      string,
    channelcode string,
    ip          string,
    starttime   string,
    endtime     string,
    fromchannel string,
    tochannel   string,
    mac         string,
    deviceid    string,
    dnum        string
)
partitioned by (dt string)
stored as orc;
2 建立epg_wiki_info數據表
-- Target table for the rows copied from tvlog.epg_wiki_info.
-- wikiscreenshots and tags are string arrays, wikicover is a string-to-string map,
-- and every other data column is a plain string.
-- Rows are partitioned by one string column (dt) and stored as ORC.
-- The column named timestamp is back-quoted because TIMESTAMP is a reserved
-- keyword in Hive 1.2.0 and later, where the unquoted form fails to parse.
create table if not exists tvlog_test.epg_wiki_info
(
    id              string,
    name            string,
    starttime       string,
    endtime         string,
    wikiscreenshots array<string>,
    wikicover       map<string, string>,
    wikititle       string,
    tags            array<string>,
    wikiid          string,
    channelcode     string,
    channelname     string,
    `timestamp`     string
)
partitioned by (dt string)
stored as orc;
Step3 開啓Hive動態分區插入
-- Session settings needed by the dynamic-partition inserts below.
-- Turn dynamic partitioning on.
set hive.exec.dynamic.partition=true;
-- nonstrict mode allows every partition column to be dynamic (no static partition value required).
set hive.exec.dynamic.partition.mode=nonstrict;
-- Raise the per-node limit on dynamically created partitions to 1000.
set hive.exec.max.dynamic.partitions.pernode=1000;
Step4 插入數據(2015-09-01 ~ 2015-09-05)
1 向tvlog_test.tvlog_tcl表插入數據
-- Copy the rows for days 1 through 5 of September 2015 from tvlog.tvlog_tcl
-- into the test table, overwriting the partitions that receive data.
-- The last selected column (dt) supplies the dynamic partition value.
-- NOTE(review): the filter uses year/month/day while the select list reads dt -
-- confirm that the source table exposes all four columns.
insert overwrite table tvlog_test.tvlog_tcl partition (dt)
select
    id,
    userid,
    channelid,
    channelname,
    region,
    channelcode,
    ip,
    starttime,
    endtime,
    fromchannel,
    tochannel,
    mac,
    deviceid,
    dnum,
    dt
from tvlog.tvlog_tcl
where year = 2015
  and month = 9
  and day between 1 and 5;
2 向tvlog_test.epg_wiki_info表插入數據
-- Copy the rows whose dt lies in the closed range 2015-09-01 to 2015-09-05
-- from tvlog.epg_wiki_info into the test table, overwriting the partitions
-- that receive data.
-- The last selected column (dt) supplies the dynamic partition value.
-- The column named timestamp is back-quoted because TIMESTAMP is a reserved
-- keyword in Hive 1.2.0 and later, where the unquoted form fails to parse.
insert overwrite table tvlog_test.epg_wiki_info partition (dt)
select
    id,
    name,
    starttime,
    endtime,
    wikiscreenshots,
    wikicover,
    wikititle,
    tags,
    wikiid,
    channelcode,
    channelname,
    `timestamp`,
    dt
from tvlog.epg_wiki_info
where dt between '2015-09-01' and '2015-09-05';