@sql
這個問題能夠擴展到不少類似的問題:連續幾個月充值會員、連續天數有商品賣出、連續打車、連續逾期……
用戶ID、登入日期
user01,2018-02-28
user01,2018-03-01
user01,2018-03-02
user01,2018-03-04
user01,2018-03-05
user01,2018-03-06
user01,2018-03-07
user02,2018-03-01
user02,2018-03-02
user02,2018-03-03
user02,2018-03-06
+---------+--------+-------------+-------------+--+
| uid     | times  | start_date  | end_date    |
+---------+--------+-------------+-------------+--+
先對每一個用戶的登陸日期排序,而後拿第n行的日期,減第n-2行的日期,若是等於2,就說明連續三天登陸了。
開窗,窗口內部排序而後作差
row_number() over
-- Login table used by the consecutive-login examples: each row holds a user id
-- and a login date. Rows are stored as delimited text with ',' between fields.
create table wedw_dw.t_login_info (
    user_id    string comment '用戶ID',
    login_date date   comment '登陸日期'
)
row format delimited
    fields terminated by ',';
# Copy the local sample file /test/login.txt into the HDFS directory
# /data/hive/test/wedw/dw/t_login_info/ (presumably the storage directory of
# wedw_dw.t_login_info; confirm against the table's LOCATION).
hdfs dfs -put /test/login.txt /data/hive/test/wedw/dw/t_login_info/
-- Show the loaded sample rows (both columns of the table, named explicitly).
select
    user_id,
    login_date
from wedw_dw.t_login_info;
+----------+-------------+--+
| user_id  | login_date  |
+----------+-------------+--+
| user01   | 2018-02-28  |
| user01   | 2018-03-01  |
| user01   | 2018-03-02  |
| user01   | 2018-03-04  |
| user01   | 2018-03-05  |
| user01   | 2018-03-06  |
| user01   | 2018-03-07  |
| user02   | 2018-03-01  |
| user02   | 2018-03-02  |
| user02   | 2018-03-03  |
| user02   | 2018-03-06  |
+----------+-------------+--+
-- List every run of at least three consecutive login days per user.
--
-- Technique (gaps and islands): number each user's login days in date order;
-- for consecutive calendar days, login_date minus that number is the same
-- value, so it can be used as the grouping key of one streak.
select
    t2.user_id         as user_id,
    count(*)           as times,       -- number of distinct days in the streak
    min(t2.login_date) as start_date,  -- first day of the streak
    max(t2.login_date) as end_date     -- last day of the streak
from (
    select
        t1.user_id,
        t1.login_date,
        -- Constant within one streak; changes whenever a day is skipped.
        date_sub(t1.login_date, t1.rn) as date_diff
    from (
        select
            d.user_id,
            d.login_date,
            row_number() over (partition by d.user_id order by d.login_date asc) as rn
        from (
            -- Keep one row per user per day. A repeated login on the same day
            -- would advance rn without advancing the date, which splits a real
            -- streak and inflates the day count. Rows without a date cannot
            -- belong to a streak and are dropped.
            select distinct
                user_id,
                login_date
            from wedw_dw.t_login_info
            where login_date is not null
        ) d
    ) t1
) t2
group by
    t2.user_id,
    t2.date_diff
-- Repeat the aggregate rather than referencing the select alias, which not
-- every SQL engine accepts in HAVING.
having count(*) >= 3;
+----------+--------+-------------+-------------+--+
| user_id  | times  | start_date  | end_date    |
+----------+--------+-------------+-------------+--+
| user01   | 3      | 2018-02-28  | 2018-03-02  |
| user01   | 4      | 2018-03-04  | 2018-03-07  |
| user02   | 3      | 2018-03-01  | 2018-03-03  |
+----------+--------+-------------+-------------+--+
-- Step 1: number each user's login days in chronological order.
-- The input is reduced to one row per user per day first: a repeated login on
-- the same day would otherwise receive its own rn and break the
-- "date minus rn" grouping used in the next step. Rows without a date are
-- dropped because they cannot belong to a streak.
select
    user_id,
    login_date,
    row_number() over (partition by user_id order by login_date asc) as rn
from (
    select distinct
        user_id,
        login_date
    from wedw_dw.t_login_info
    where login_date is not null
) d
;
+----------+-------------+-----+--+
| user_id  | login_date  | rn  |
+----------+-------------+-----+--+
| user01   | 2018-02-28  | 1   |
| user01   | 2018-03-01  | 2   |
| user01   | 2018-03-02  | 3   |
| user01   | 2018-03-04  | 4   |
| user01   | 2018-03-05  | 5   |
| user01   | 2018-03-06  | 6   |
| user01   | 2018-03-07  | 7   |
| user02   | 2018-03-01  | 1   |
| user02   | 2018-03-02  | 2   |
| user02   | 2018-03-03  | 3   |
| user02   | 2018-03-06  | 4   |
+----------+-------------+-----+--+
-- Step 2: subtract the row number from the login date.
-- Within one user, consecutive calendar days yield the same date_diff, so the
-- value identifies a streak; it changes whenever a day is skipped.
select
    t1.user_id,
    t1.login_date,
    date_sub(t1.login_date, t1.rn) as date_diff
from (
    select
        d.user_id,
        d.login_date,
        row_number() over (partition by d.user_id order by d.login_date asc) as rn
    from (
        -- One row per user per day: a repeated login on the same day would
        -- advance rn without advancing the date and change date_diff in the
        -- middle of a real streak. Rows without a date are dropped.
        select distinct
            user_id,
            login_date
        from wedw_dw.t_login_info
        where login_date is not null
    ) d
) t1
;
+----------+-------------+-------------+--+
| user_id  | login_date  | date_diff   |
+----------+-------------+-------------+--+
| user01   | 2018-02-28  | 2018-02-27  |
| user01   | 2018-03-01  | 2018-02-27  |
| user01   | 2018-03-02  | 2018-02-27  |
| user01   | 2018-03-04  | 2018-02-28  |
| user01   | 2018-03-05  | 2018-02-28  |
| user01   | 2018-03-06  | 2018-02-28  |
| user01   | 2018-03-07  | 2018-02-28  |
| user02   | 2018-03-01  | 2018-02-28  |
| user02   | 2018-03-02  | 2018-02-28  |
| user02   | 2018-03-03  | 2018-02-28  |
| user02   | 2018-03-06  | 2018-03-02  |
+----------+-------------+-------------+--+
-- Step 3: group by user and date_diff; each group is one streak of consecutive
-- login days. Keep the streaks that last at least three days and report their
-- length, first day and last day.
select
    t2.user_id         as user_id,
    count(*)           as times,       -- number of distinct days in the streak
    min(t2.login_date) as start_date,
    max(t2.login_date) as end_date
from (
    select
        t1.user_id,
        t1.login_date,
        -- Constant within one streak; changes whenever a day is skipped.
        date_sub(t1.login_date, t1.rn) as date_diff
    from (
        select
            d.user_id,
            d.login_date,
            row_number() over (partition by d.user_id order by d.login_date asc) as rn
        from (
            -- One row per user per day: a repeated login on the same day would
            -- advance rn without advancing the date, which splits a real
            -- streak and inflates the day count. Rows without a date are
            -- dropped because they cannot belong to a streak.
            select distinct
                user_id,
                login_date
            from wedw_dw.t_login_info
            where login_date is not null
        ) d
    ) t1
) t2
group by
    t2.user_id,
    t2.date_diff
-- Repeat the aggregate rather than referencing the select alias, which not
-- every SQL engine accepts in HAVING.
having count(*) >= 3
;
+----------+--------+-------------+-------------+--+
| user_id  | times  | start_date  | end_date    |
+----------+--------+-------------+-------------+--+
| user01   | 3      | 2018-02-28  | 2018-03-02  |
| user01   | 4      | 2018-03-04  | 2018-03-07  |
| user02   | 3      | 2018-03-01  | 2018-03-03  |
+----------+--------+-------------+-------------+--+