15. 數據倉庫分層之DWS層、ADS層--用戶主題活躍

    在完成了ODS層的數據加載與DWD層的數據解析之後,這一節將具體分析特定指標:DWS層對DWD層數據做輕度彙總,ADS層則對彙總結果進行統計。


用戶活躍主題


  1. DWS層目標:統計當日、當週、當月活躍的每一個設備明細。

    1. 每日活躍設備明細

      • 建表語句
      hive (gmall)>
      -- Daily active-device detail table (DWS layer).
      -- External Parquet table, partitioned by the string column dt.
      -- The table is dropped first so the DDL can be re-run.
      DROP TABLE IF EXISTS dws_uv_detail_day;

      CREATE EXTERNAL TABLE dws_uv_detail_day (
          `mid_id`       STRING COMMENT '設備惟一標識',
          `user_id`      STRING COMMENT '用戶標識',
          `version_code` STRING COMMENT '程序版本號',
          `version_name` STRING COMMENT '程序版本名',
          `lang`         STRING COMMENT '系統語言',
          `source`       STRING COMMENT '渠道號',
          `os`           STRING COMMENT '安卓系統版本',
          `area`         STRING COMMENT '區域',
          `model`        STRING COMMENT '手機型號',
          `brand`        STRING COMMENT '手機品牌',
          `sdk_version`  STRING COMMENT 'sdkVersion',
          `gmail`        STRING COMMENT 'gmail',
          `height_width` STRING COMMENT '屏幕寬高',
          `app_time`     STRING COMMENT '客戶端日誌產生時的時間',
          `network`      STRING COMMENT '網絡模式',
          `lng`          STRING COMMENT '經度',
          `lat`          STRING COMMENT '緯度'
      )
      PARTITIONED BY (`dt` STRING)
      STORED AS PARQUET
      LOCATION '/warehouse/gmall/dws/dws_uv_detail_day';
      • 插入數據
      hive (gmall)>
      -- Session setting from the original walkthrough. The insert below targets a
      -- static partition, so the setting matters only for later dynamic-partition inserts.
      set hive.exec.dynamic.partition.mode=nonstrict;

      -- Rebuild the 2020-02-03 partition with one row per device (mid_id).
      -- For every other attribute, the distinct values seen in that day of
      -- dwd_start_log are collected and joined with a pipe character.
      INSERT OVERWRITE TABLE dws_uv_detail_day
      PARTITION (dt = '2020-02-03')
      SELECT
          mid_id,
          concat_ws('|', collect_set(user_id))      AS user_id,
          concat_ws('|', collect_set(version_code)) AS version_code,
          concat_ws('|', collect_set(version_name)) AS version_name,
          concat_ws('|', collect_set(lang))         AS lang,
          concat_ws('|', collect_set(source))       AS source,
          concat_ws('|', collect_set(os))           AS os,
          concat_ws('|', collect_set(area))         AS area,
          concat_ws('|', collect_set(model))        AS model,
          concat_ws('|', collect_set(brand))        AS brand,
          concat_ws('|', collect_set(sdk_version))  AS sdk_version,
          concat_ws('|', collect_set(gmail))        AS gmail,
          concat_ws('|', collect_set(height_width)) AS height_width,
          concat_ws('|', collect_set(app_time))     AS app_time,
          concat_ws('|', collect_set(network))      AS network,
          concat_ws('|', collect_set(lng))          AS lng,
          concat_ws('|', collect_set(lat))          AS lat
      FROM dwd_start_log
      WHERE dt = '2020-02-03'
      GROUP BY mid_id;
      • 查看結果
      hive (gmall)> select * from dws_uv_detail_day limit 1;
    2. 每週活躍設備明細

      • 建表語句
      hive (gmall)>
      -- Weekly active-device detail table (DWS layer).
      -- Same attribute columns as the daily table plus the Monday and Sunday dates
      -- of the week. External Parquet table, partitioned by the string column wk_dt.
      DROP TABLE IF EXISTS dws_uv_detail_wk;

      CREATE EXTERNAL TABLE dws_uv_detail_wk (
          `mid_id`       STRING COMMENT '設備惟一標識',
          `user_id`      STRING COMMENT '用戶標識',
          `version_code` STRING COMMENT '程序版本號',
          `version_name` STRING COMMENT '程序版本名',
          `lang`         STRING COMMENT '系統語言',
          `source`       STRING COMMENT '渠道號',
          `os`           STRING COMMENT '安卓系統版本',
          `area`         STRING COMMENT '區域',
          `model`        STRING COMMENT '手機型號',
          `brand`        STRING COMMENT '手機品牌',
          `sdk_version`  STRING COMMENT 'sdkVersion',
          `gmail`        STRING COMMENT 'gmail',
          `height_width` STRING COMMENT '屏幕寬高',
          `app_time`     STRING COMMENT '客戶端日誌產生時的時間',
          `network`      STRING COMMENT '網絡模式',
          `lng`          STRING COMMENT '經度',
          `lat`          STRING COMMENT '緯度',
          `monday_date`  STRING COMMENT '週一日期',
          `sunday_date`  STRING COMMENT '週日日期'
      )
      COMMENT '活躍用戶按周明細'
      PARTITIONED BY (`wk_dt` STRING)
      STORED AS PARQUET
      LOCATION '/warehouse/gmall/dws/dws_uv_detail_wk/';
      • 插入數據
      -- The target partition wk_dt is fully dynamic, which needs nonstrict mode.
      set hive.exec.dynamic.partition.mode=nonstrict;

      -- Roll the daily detail rows up to one row per device for the week that
      -- contains 2020-02-03.
      --   next_day(d, 'mo') - 7  is the Monday of that week
      --   next_day(d, 'mo') - 1  is the Sunday of that week
      -- The last selected expression feeds the dynamic partition column wk_dt
      -- in the form monday_sunday.
      INSERT OVERWRITE TABLE dws_uv_detail_wk
      PARTITION (wk_dt)
      SELECT
          mid_id,
          concat_ws('|', collect_set(user_id))      AS user_id,
          concat_ws('|', collect_set(version_code)) AS version_code,
          concat_ws('|', collect_set(version_name)) AS version_name,
          concat_ws('|', collect_set(lang))         AS lang,
          concat_ws('|', collect_set(source))       AS source,
          concat_ws('|', collect_set(os))           AS os,
          concat_ws('|', collect_set(area))         AS area,
          concat_ws('|', collect_set(model))        AS model,
          concat_ws('|', collect_set(brand))        AS brand,
          concat_ws('|', collect_set(sdk_version))  AS sdk_version,
          concat_ws('|', collect_set(gmail))        AS gmail,
          concat_ws('|', collect_set(height_width)) AS height_width,
          concat_ws('|', collect_set(app_time))     AS app_time,
          concat_ws('|', collect_set(network))      AS network,
          concat_ws('|', collect_set(lng))          AS lng,
          concat_ws('|', collect_set(lat))          AS lat,
          date_add(next_day('2020-02-03', 'mo'), -7) AS monday_date,
          date_add(next_day('2020-02-03', 'mo'), -1) AS sunday_date,
          concat(
              date_add(next_day('2020-02-03', 'mo'), -7),
              '_',
              date_add(next_day('2020-02-03', 'mo'), -1)
          ) AS wk_dt
      FROM dws_uv_detail_day
      WHERE dt >= date_add(next_day('2020-02-03', 'mo'), -7)
        AND dt <= date_add(next_day('2020-02-03', 'mo'), -1)
      GROUP BY mid_id;
      • 查詢導入結果
      -- Spot-check the load by fetching a single row from the weekly table.
      select * from dws_uv_detail_wk limit 1;
    3. 每月活躍設備明細

      • 建表語句
      hive (gmall)>
      -- Monthly active-device detail table (DWS layer).
      -- Same attribute columns as the daily table. External Parquet table,
      -- partitioned by the string column mn.
      DROP TABLE IF EXISTS dws_uv_detail_mn;

      CREATE EXTERNAL TABLE dws_uv_detail_mn (
          `mid_id`       STRING COMMENT '設備惟一標識',
          `user_id`      STRING COMMENT '用戶標識',
          `version_code` STRING COMMENT '程序版本號',
          `version_name` STRING COMMENT '程序版本名',
          `lang`         STRING COMMENT '系統語言',
          `source`       STRING COMMENT '渠道號',
          `os`           STRING COMMENT '安卓系統版本',
          `area`         STRING COMMENT '區域',
          `model`        STRING COMMENT '手機型號',
          `brand`        STRING COMMENT '手機品牌',
          `sdk_version`  STRING COMMENT 'sdkVersion',
          `gmail`        STRING COMMENT 'gmail',
          `height_width` STRING COMMENT '屏幕寬高',
          `app_time`     STRING COMMENT '客戶端日誌產生時的時間',
          `network`      STRING COMMENT '網絡模式',
          `lng`          STRING COMMENT '經度',
          `lat`          STRING COMMENT '緯度'
      )
      COMMENT '活躍用戶按月明細'
      PARTITIONED BY (`mn` STRING)
      STORED AS PARQUET
      LOCATION '/warehouse/gmall/dws/dws_uv_detail_mn/';
      • 插入數據
      -- The target partition mn is fully dynamic. Hive rejects such an insert while
      -- hive.exec.dynamic.partition.mode is strict, so set it here, as the weekly
      -- load does, instead of relying on an earlier statement in the same session.
      set hive.exec.dynamic.partition.mode=nonstrict;

      -- Roll the daily detail rows up to one row per device for the month that
      -- contains 2020-02-03. The last selected expression (yyyy-MM) feeds the
      -- dynamic partition column mn.
      insert overwrite table dws_uv_detail_mn
      partition(mn)
      select
        mid_id,
        concat_ws('|', collect_set(user_id)) user_id,
        concat_ws('|', collect_set(version_code)) version_code,
        concat_ws('|', collect_set(version_name)) version_name,
        concat_ws('|', collect_set(lang)) lang,
        concat_ws('|', collect_set(source)) source,
        concat_ws('|', collect_set(os)) os,
        concat_ws('|', collect_set(area)) area,
        concat_ws('|', collect_set(model)) model,
        concat_ws('|', collect_set(brand)) brand,
        concat_ws('|', collect_set(sdk_version)) sdk_version,
        concat_ws('|', collect_set(gmail)) gmail,
        concat_ws('|', collect_set(height_width)) height_width,
        concat_ws('|', collect_set(app_time)) app_time,
        concat_ws('|', collect_set(network)) network,
        concat_ws('|', collect_set(lng)) lng,
        concat_ws('|', collect_set(lat)) lat,
        date_format('2020-02-03', 'yyyy-MM') mn
      from dws_uv_detail_day
      where date_format(dt, 'yyyy-MM')=date_format('2020-02-03', 'yyyy-MM')
      group by mid_id;
      • 查詢結果
      hive (gmall)> select * from dws_uv_detail_mn limit 2;
      • 將剩餘數據導入
  2. ADS層目標:計算當日、當週、當月活躍設備數

    • 建表語句
    hive (gmall)>
    -- Active-device count table (ADS layer).
    -- External, tab-delimited text table with one set of day, week and month
    -- counts per statistics date, plus flags for week end and month end.
    DROP TABLE IF EXISTS ads_uv_count;

    CREATE EXTERNAL TABLE ads_uv_count (
        `dt`          STRING COMMENT '統計日期',
        `day_count`   BIGINT COMMENT '當日用戶數量',
        `wk_count`    BIGINT COMMENT '當週用戶數量',
        `mn_count`    BIGINT COMMENT '當月用戶數量',
        `is_weekend`  STRING COMMENT 'Y,N是不是週末,用於獲得本週最終結果',
        `is_monthend` STRING COMMENT 'Y,N是不是月末,用於獲得本月最終結果'
    )
    COMMENT '活躍設備數'
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LOCATION '/warehouse/gmall/ads/ads_uv_count/';
    • 導入數據
    hive (gmall)> 
    -- Append the active-device counts for the statistics date 2020-02-03.
    -- Each subquery yields exactly one row tagged with the same dt literal, so the
    -- joins on dt only stitch the three counts into a single output row.
    -- Every date expression uses the one statistics date. The month filter
    -- previously hard-coded 2020-02-10, which would select the wrong month
    -- partition once the date is substituted for another run.
    -- NOTE: this is INSERT INTO, so re-running it for the same date appends another row.
    insert into table ads_uv_count
    select
       '2020-02-03' dt,
       daycount.ct day_count,
       weekcount.ct wk_count,
       mncount.ct mn_count,
       -- Y when the statistics date is the Sunday of its week.
       if(date_add(next_day('2020-02-03','mo'),-1)='2020-02-03', 'Y', 'N') is_weekend,
       -- Y when the statistics date is the last day of its month.
       if(last_day('2020-02-03')='2020-02-03','Y','N') is_monthend
    from
    (
       -- Devices active on the statistics date.
       select
           '2020-02-03' dt,
           count(*) ct
       from dws_uv_detail_day
       where dt='2020-02-03'
    )daycount join
    (
       -- Devices active in the week (monday_sunday partition) containing the date.
       select
           '2020-02-03' dt,
           count(*) ct
       from dws_uv_detail_wk
       where wk_dt=concat(date_add(next_day('2020-02-03','mo'),-7),'_',date_add(next_day('2020-02-03','mo'),-1))
    )weekcount on daycount.dt=weekcount.dt join
    (
       -- Devices active in the month (yyyy-MM partition) containing the date.
       select
           '2020-02-03' dt,
           count(*) ct
       from dws_uv_detail_mn
       where mn=date_format('2020-02-03','yyyy-MM')
    )mncount on daycount.dt=mncount.dt;
    • 加載剩餘數據
    [hadoop@hadoop151 bin]$ ads_uv_log.sh 2020-01-01 2020-01-31
  3. 查詢ADS層中的數據。

    hive (gmall)> select * from ads_uv_count;
    OK
    ads_uv_count.dt    ads_uv_count.day_count    ads_uv_count.wk_count    ads_uv_count.mn_count    ads_uv_count.is_weekend    ads_uv_count.is_monthend
    2020-02-03    741    741     741     N    N
    2020-01-01    521    990     1000    N    N
    2020-01-10    728    999     1000    N    N
    2020-01-11    763    999     1000    N    N
    2020-01-12    742    999     1000    Y    N
    2020-01-13    444    1000    1000    N    N
    2020-01-14    757    1000    1000    N    N
    2020-01-15    757    1000    1000    N    N
    2020-01-16    756    1000    1000    N    N
    2020-01-17    744    1000    1000    N    N
    2020-01-18    746    1000    1000    N    N
    2020-01-19    722    1000    1000    Y    N
    2020-01-02    506    990     1000    N    N
    2020-01-20    751    1000    1000    N    N
    2020-01-21    742    1000    1000    N    N
    2020-01-22    760    1000    1000    N    N
    2020-01-23    750    1000    1000    N    N
    2020-01-24    784    1000    1000    N    N
    2020-01-25    578    1000    1000    N    N
    2020-01-26    866    1000    1000    Y    N
    2020-01-27    750    999     1000    N    N
    2020-01-28    757    999     1000    N    N
    2020-01-29    745    999     1000    N    N
    2020-01-03    736    990     1000    N    N
    2020-01-30    766    999     1000    N    N
    2020-01-31    657    999     1000    N    Y
    2020-01-04    502    990     1000    N    N
    2020-01-05    759    990     1000    Y    N
    2020-01-06    762    999     1000    N    N
    2020-01-07    772    999     1000    N    N
    2020-01-08    735    999     1000    N    N
    2020-01-09    29    999     1000     N    N
    Time taken: 0.042 seconds, Fetched: 32 row(s)

    造成週活躍設備數接近月活躍設備數的原因,很可能是模擬生成的埋點數據所致;由於沒有真實的生產數據,這樣的結果應在意料之中。

相關文章
相關標籤/搜索