分組取topN的方法:
數據的預處理爲使用mapreduce
每日新訪客:
回頭/單次訪客統計:
漏斗模型:
使用python產生數據
1.統計每一個步驟的總訪問人數
-- Create an empty table tmp_page_views whose column definitions are copied
-- from ods_weblog_origin (CREATE TABLE ... LIKE copies the schema, not the data).
create table tmp_page_views like ods_weblog_origin;

-- Show the columns of the new table to confirm the schema was copied.
desc tmp_page_views;
把python生成的mylog.log數據導入
統計每一個step的數量:(爲何要加 'step1' as step?由於後面要用union把各個步驟的結果縱向拼接在一塊兒,加上這個常量列才能區分每一行屬於哪個步驟)
-- Count the distinct visitors that reached step 1. The constant 'step1' labels
-- the row so it can still be identified once the per-step results are unioned.
-- The string literal must use ASCII single quotes; full-width quotes are not
-- string delimiters.
-- NOTE(review): the combined statement further down these notes counts
-- remote_addr and matches '/item%'; remote_order and '%/iterm%' here look like
-- typos -- confirm against the tmp_page_views schema and the loaded data.
select
    'step1' as step,
    count(distinct remote_order) as number
from tmp_page_views
where request like '%/iterm%';
union就是把行並在一塊兒,把表縱向加起來
把拼接的表放到dw_oute_numbs中
-- Build the funnel table dw_oute_numbs: one row per funnel step, holding the
-- step label and the number of distinct remote_addr values whose request
-- starts with that step's URL prefix.
-- UNION ALL is used because every branch carries a different step label, so
-- the rows can never be duplicates and the de-duplication done by plain UNION
-- is wasted work.
create table dw_oute_numbs as
select
    'step1' as step,
    count(distinct remote_addr) as numbs
from ods_click_pageviews
where request like '/item%'
union all
select
    'step2' as step,
    count(distinct remote_addr) as numbs
from ods_click_pageviews
where request like '/category%'
union all
select
    'step3' as step,
    count(distinct remote_addr) as numbs
from ods_click_pageviews
where request like '/order%'
union all
select
    'step4' as step,
    count(distinct remote_addr) as numbs
from ods_click_pageviews
where request like '/index%';
2.查詢每一步驟相對於路徑起點人數的比例
-- Pair every funnel step (rn) with every funnel step (rr). The cartesian
-- self-join is deliberate: later queries filter these pairs to compare a step
-- with the first step or with its predecessor.
-- Reads dw_oute_numbs, the funnel table created earlier in these notes, and is
-- terminated with ';' so it does not run into the following statement.
select
    rn.step  as rnstep,
    rn.numbs as rnnumbs,
    rr.step  as rrstep,
    rr.numbs as rrnumbs
from dw_oute_numbs rn
cross join dw_oute_numbs rr;
-- For every funnel step, divide its visitor count by the visitor count of
-- 'step1' (the start of the path). Output columns: rnstep, ratio.
select
    cur.step                  as rnstep,
    cur.numbs / origin.numbs  as ratio
from dw_oute_numbs cur
cross join dw_oute_numbs origin
-- keep only the pairs whose right-hand side is the path origin
where origin.step = 'step1';
3.查詢每一步驟相對於上一步驟的漏出率
-- For every funnel step that has a predecessor, divide its visitor count by
-- the visitor count of the immediately preceding step.
-- Output columns: rrstep (the current step) and ratio.
-- The step number is everything after the 4-character prefix 'step', so
-- substr(step, 5) is used without a length; this also handles step10 and up.
-- The predecessor match lives in the ON clause, so only matching pairs are
-- joined instead of filtering a full cartesian product.
select
    cur.step              as rrstep,
    cur.numbs / prev.numbs as ratio
from dw_oute_numbs prev
inner join dw_oute_numbs cur
    on cast(substr(prev.step, 5) as int) = cast(substr(cur.step, 5) as int) - 1;
4.把查詢的兩個結果彙總到一個表中
-- Combined funnel report, one row per step:
--   step      : step label
--   numbs     : distinct visitors at that step
--   abs_ratio : numbs divided by the visitor count of 'step1'
--   rel_ratio : numbs divided by the visitor count of the preceding step
--               (NULL for a step with no predecessor, because of the left join)
-- Reads dw_oute_numbs, the funnel table created earlier in these notes.
select
    abs_r.step,
    abs_r.numbs,
    abs_r.ratio as abs_ratio,
    rel_r.ratio as rel_ratio
from (
    -- every step compared with the path origin
    select
        cur.step                 as step,
        cur.numbs                as numbs,
        cur.numbs / origin.numbs as ratio
    from dw_oute_numbs cur
    cross join dw_oute_numbs origin
    where origin.step = 'step1'
) abs_r
left outer join (
    -- every step compared with its predecessor; substr(step, 5) takes the
    -- whole numeric suffix so multi-digit step numbers also work
    select
        cur.step               as step,
        cur.numbs / prev.numbs as ratio
    from dw_oute_numbs prev
    inner join dw_oute_numbs cur
        on cast(substr(prev.step, 5) as int) = cast(substr(cur.step, 5) as int) - 1
) rel_r
    on abs_r.step = rel_r.step;