The recall step has already reduced the problem size: for each user we selected N articles as the candidate set, and on top of those recalled candidates we built features related to the user's click history, the user's own attribute features, the article's own attribute features, and user-article interaction features. The next step is to train machine learning models on these constructed features, predict on the test set to obtain the click probability of every candidate article for every user, and return the top-k articles with the highest predicted click probability as the final result.
For the ranking stage we chose three fairly representative ranking models: LightGBM's ranker (LGBMRanker), LightGBM's classifier (LGBMClassifier), and the deep-learning model DIN.
After obtaining the outputs of these ranking models, we also apply two classic model-ensembling methods: weighted fusion of the prediction scores, and Stacking (using the out-of-fold predictions as features for a second-stage model).
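As a quick illustration of the weighted-fusion idea (the full implementations appear later in this notebook), here is a minimal sketch; the score values and the 0.5/0.3/0.2 weights are made up for the example and are not the weights used below.

import pandas as pd

# Hypothetical per-candidate scores from three models for the same (user_id, article_id) pairs
scores = pd.DataFrame({
    'user_id':    [1, 1, 1],
    'article_id': [10, 11, 12],
    'ranker':     [0.9, 0.2, 0.4],   # made-up scores
    'cls':        [0.8, 0.3, 0.1],
    'din':        [0.7, 0.5, 0.2],
})

def min_max(s):
    # scale each model's scores to [0, 1] so they are comparable before fusing
    return (s - s.min()) / (s.max() - s.min())

weights = {'ranker': 0.5, 'cls': 0.3, 'din': 0.2}  # assumed example weights
scores['fused'] = sum(w * min_max(scores[m]) for m, w in weights.items())
print(scores.sort_values('fused', ascending=False))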
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')
# Note the trailing slashes: file names are appended directly to these paths below
data_path = './data_raw/'
save_path = './temp_results/'
offline = False
# When re-reading the data we found click_article_id had become a float, so convert it back to int
trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)

if offline:
    val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')
    val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)
else:
    val_user_item_feats_df = None

tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)

# For convenience the test set was also given a dummy label during feature engineering; drop it here
del tst_user_item_feats_df['label']
def submit(recall_df, topk=5, model_name=None):
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Check that every user has at least topk candidate articles
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk

    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]

    # Rename columns to match the submission format
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2',
                                    3: 'article_3', 4: 'article_4', 5: 'article_5'})

    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)
# Min-max normalization of ranking scores
def norm_sim(sim_df, weight=0.0):
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))

    sim_df = sim_df.apply(lambda sim: sim + weight)  # optional offset
    return sim_df
# Make copies for the LightGBM ranking models
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()

tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()
# Feature columns used by the LightGBM models
lgb_cols = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum',
            'sim_mean', 'score', 'click_size', 'time_diff_mean', 'active_level',
            'click_environment', 'click_deviceGroup', 'click_os', 'click_country',
            'click_region', 'click_referrer_type', 'user_time_hob1', 'user_time_hob2',
            'words_hbo', 'category_id', 'created_at_ts', 'words_count']
# Group sizes for the ranking model: LGBMRanker needs the number of candidates per user
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values

if offline:
    val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values
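The `group` argument tells LGBMRanker how many consecutive rows belong to each query (here, each user), which is why the data must be sorted by `user_id` first. Below is a minimal sketch of how the group sizes line up with rows, using made-up users and candidate counts:

import pandas as pd

# Hypothetical candidate table: user 1 has 3 candidates, user 2 has 2
toy = pd.DataFrame({'user_id': [1, 1, 1, 2, 2],
                    'label':   [0, 1, 0, 0, 1]})
toy = toy.sort_values('user_id')

# Same construction as g_train above: one count per user, in user order
groups = toy.groupby('user_id', as_index=False).count()['label'].values
print(groups)        # [3 2]
print(groups.sum())  # 5 == number of rows, which LGBMRanker requires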
# Ranking model definition
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7,
                            subsample_freq=1, learning_rate=0.01, min_child_weight=50,
                            random_state=2018, n_jobs=16)
# Train the ranking model
if offline:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,
                   eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])],
                   eval_group=[g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50)
else:
    lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)
# Predict on the test set
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols],
                                                          num_iteration=lgb_ranker.best_iteration_)

# Save these ranking scores for the later model fusion
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)
# Re-rank the predictions and generate the submission file
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')
# Five-fold cross validation, splitting by user
# This part is independent of the single train/validation run above
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# Run the five folds and keep the out-of-fold results for stacking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)]
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]

    # Group sizes for the training and validation users
    train_idx.sort_values(by=['user_id'], inplace=True)
    g_train = train_idx.groupby(['user_id'], as_index=False).count()["label"].values

    valid_idx.sort_values(by=['user_id'], inplace=True)
    g_val = valid_idx.groupby(['user_id'], as_index=False).count()["label"].values

    # Define the model
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                                max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7,
                                subsample_freq=1, learning_rate=0.01, min_child_weight=50,
                                random_state=2018, n_jobs=16)
    # Train the model
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group=[g_val],
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50)

    # Predict on the validation fold
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

    # Normalize the ranker's output scores
    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Collect the validation-fold predictions; they are concatenated later
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # For the online setting, accumulate the test predictions of each fold and average at the end
    if not offline:
        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)

# Average the test-set predictions over the folds and save the score and rank features for later
# stacking (more features could be constructed here as well)
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the cross-validation features for the test set
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)
# Re-rank the predictions and generate the submission file for the single model
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')
# Model and parameter definition for the LightGBM classifier
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                                       max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7,
                                       subsample_freq=1, learning_rate=0.01, min_child_weight=50,
                                       random_state=2018, n_jobs=16, verbose=10)
# Train the classifier
if offline:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],
                          eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])],
                          eval_metric=['auc', ], early_stopping_rounds=50)
else:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])
# Predict on the test set
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:, 1]

# Save these scores for the later model fusion
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)
# Re-rank the predictions and generate the submission file
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')
# Five-fold cross validation, splitting by user
# This part is independent of the single train/validation run above
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# Run the five folds and keep the out-of-fold results for stacking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)]
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]

    # Model and parameter definition
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                                           max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7,
                                           subsample_freq=1, learning_rate=0.01, min_child_weight=50,
                                           random_state=2018, n_jobs=16, verbose=10)
    # Train the model
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],
                          eval_set=[(valid_idx[lgb_cols], valid_idx['label'])],
                          eval_metric=['auc', ], early_stopping_rounds=50)

    # Predict on the validation fold
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols],
                                                              num_iteration=lgb_Classfication.best_iteration_)[:, 1]

    # No normalization needed here: the classifier already outputs a probability
    # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Collect the validation-fold predictions; they are concatenated later
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # For the online setting, accumulate the test predictions of each fold and average at the end
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols],
                                                     num_iteration=lgb_Classfication.best_iteration_)[:, 1]

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)

# Average the test-set predictions over the folds and save the score and rank features for later
# stacking (more features could be constructed here as well)
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the cross-validation features for the test set
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)
# Re-rank the predictions and generate the submission file
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')
The following prepares the user's historical click sequence, which the DIN model used later needs.
if offline:
    all_data = pd.read_csv('./data_raw/train_click_log.csv')
else:
    trn_data = pd.read_csv('./data_raw/train_click_log.csv')
    tst_data = pd.read_csv('./data_raw/testA_click_log.csv')
    all_data = trn_data.append(tst_data)

hist_click = all_data[['user_id', 'click_article_id']].groupby('user_id').agg({list}).reset_index()
his_behavior_df = pd.DataFrame()
his_behavior_df['user_id'] = hist_click['user_id']
his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']
trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df.copy()
else:
    val_user_item_feats_df_din_model = None

tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()
trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
else:
    val_user_item_feats_df_din_model = None

tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
Next we try the DIN model. DIN stands for Deep Interest Network; Alibaba proposed it in 2018 because earlier deep models could not express a user's diverse interests. It computes a representation vector of the user's interest by considering the relevance between a given candidate ad and the user's historical behavior. Concretely, it introduces a local activation unit that soft-searches the relevant parts of the behavior history and takes a weighted sum to obtain the user-interest representation with respect to the candidate ad. Behaviors that are more relevant to the candidate receive higher activation weights and dominate the interest representation, so the representation differs from ad to ad, which greatly improves the model's expressive power. This makes DIN a good fit for the news-recommendation task here: we use the relevance between the current candidate article and the user's historically clicked articles to compute the user's interest in that article. The model structure is as follows:
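To make the local-activation idea concrete, here is a minimal numpy sketch of the weighted sum over history embeddings. The toy vectors and the plain dot-product-plus-softmax scorer are assumptions for illustration only; DIN itself learns the activation weights with a small MLP, as the deepctr implementation below does.

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

# Toy embeddings: 4 historically clicked articles and 1 candidate article, dimension 3
hist_emb = np.array([[0.9, 0.1, 0.0],
                     [0.8, 0.2, 0.1],
                     [0.0, 0.9, 0.8],
                     [0.1, 0.8, 0.9]])
cand_emb = np.array([0.85, 0.15, 0.05])

# Local activation: score each history item against the candidate, then weight and sum
scores = hist_emb @ cand_emb          # relevance of each clicked article to the candidate
weights = softmax(scores)             # DIN actually uses an MLP scorer and need not normalize
user_interest = weights @ hist_emb    # candidate-dependent user-interest vector

print(weights)        # items similar to the candidate dominate
print(user_interest)  # this vector changes when the candidate changes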
Here we simply call an off-the-shelf implementation of the model; the inner details will be covered in the next recommendation-system study session. Below is how to use it in practice. The deepctr function prototype is as follows:
def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False, dnn_hidden_units=(200, 80),
        dnn_activation='relu', att_hidden_size=(80, 40), att_activation="dice",
        att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0,
        seed=1024, task='binary'):
    """
    dnn_feature_columns: list of all feature columns in the data
    history_feature_list: list of features describing the user's historical behavior
    dnn_use_bn: whether to use BatchNormalization
    dnn_hidden_units: number of layers and units per layer of the fully connected DNN, as a list or tuple
    dnn_activation: activation of the fully connected DNN
    att_hidden_size: number of layers and units per layer of the attention MLP
    att_activation: activation of the attention MLP
    att_weight_normalization: whether to normalize the attention scores
    l2_reg_dnn: L2 regularization coefficient of the DNN
    l2_reg_embedding: L2 regularization coefficient of the embedding vectors
    dnn_dropout: dropout rate of the DNN units
    task: 'binary' for classification or 'regression' for regression
    """
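A minimal usage sketch of this prototype with a toy feature set; the column names, vocabulary sizes, and embedding dimensions here are made up for illustration, and the notebook's real feature-column construction follows further below.

from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat

# Toy feature columns: one sparse id, one dense feature, and a variable-length history
# sequence that shares its embedding table with the candidate article id
toy_columns = [
    SparseFeat('click_article_id', vocabulary_size=1000, embedding_dim=8),
    DenseFeat('sim0', 1),
    VarLenSparseFeat(SparseFeat('hist_click_article_id', vocabulary_size=1000,
                                embedding_dim=8, embedding_name='click_article_id'),
                     maxlen=20),
]
behavior_feature_list = ['click_article_id']  # names of the candidate-side behavior features

model = DIN(toy_columns, behavior_feature_list)
model.compile('adam', 'binary_crossentropy')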
To use it, we must pass in the feature columns and the history-behavior column list, but before doing so the feature columns need some preprocessing. Specifically:
First we process the dataset to obtain the data. Since we predict whether the user will click the current article based on their past behavior, we split the feature columns into three parts: numeric (dense) features, categorical (sparse) features, and history-behavior features. DIN handles each part differently.
The code below makes this concrete. The logic is: first write a data-preparation function that follows the steps above and returns the data and feature columns, then build and train the DIN model, and finally run prediction on the test set.
# Import deepctr and the Keras/TensorFlow utilities
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
import tensorflow as tf

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Data-preparation function
def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):
    """
    df: dataset
    dense_fea: numeric feature columns
    sparse_fea: categorical feature columns
    behavior_fea: candidate-side behavior feature columns
    his_behavior_fea: user history-behavior feature columns
    emb_dim: embedding dimension; for simplicity all categorical features share the same dimension
    max_len: maximum length of the user behavior sequence
    """
    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim)
                              for feat in sparse_fea]

    dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]

    var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,
                                                       embedding_dim=emb_dim, embedding_name='click_article_id'),
                                            maxlen=max_len) for feat in his_behavior_fea]

    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns

    # Build x as a dict of feature name -> array
    x = {}
    for name in get_feature_names(dnn_feature_columns):
        if name in his_behavior_fea:
            # History behavior sequence: pad to max_len, giving a 2-d array
            his_list = [l for l in df[name]]
            x[name] = pad_sequences(his_list, maxlen=max_len, padding='post')
        else:
            x[name] = df[name].values

    return x, dnn_feature_columns

# Split the features into the three groups
sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup',
              'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']

behavior_fea = ['click_article_id']

hist_behavior_fea = ['hist_click_article_id']

dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',
             'rank', 'click_size', 'time_diff_mean', 'active_level', 'user_time_hob1', 'user_time_hob2',
             'words_hbo', 'words_count']
# Normalize the dense features; neural-network training needs the numeric inputs scaled
mm = MinMaxScaler()

# Special handling: if invalid values (e.g. inf) are produced elsewhere, normalization will fail.
# You can leave these lines commented at first; if the code below errors, first think about
# how to avoid producing inf-like values at all
# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)

for feat in dense_fea:
    trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])

    if val_user_item_feats_df_din_model is not None:
        val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])

    tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])
# Prepare the training data
x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea,
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_trn = trn_user_item_feats_df_din_model['label'].values

if offline:
    # Prepare the validation data
    x_val, dnn_feature_columns = get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea,
                                                       sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_val = val_user_item_feats_df_din_model['label'].values

dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea,
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
# Build the model
model = DIN(dnn_feature_columns, behavior_fea)

# Inspect the model structure
model.summary()

# Compile the model
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy', tf.keras.metrics.AUC()])
(Output: TensorFlow deprecation warnings, followed by the Keras model summary. The summary lists the input layers for each sparse and dense feature, the embedding layers, the attention_sequence_pooling_layer over the clicked-article history, the concatenation and flatten layers, and the final DNN plus prediction layer. Total params: 2,239,602; Trainable params: 2,239,362; Non-trainable params: 240.)
# Train the model
if offline:
    history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val), batch_size=256)
else:
    # A sampled validation set could also be used here:
    # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)
    history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)
Epoch 1/2
290964/290964 [==============================] - 55s 189us/sample - loss: 0.4209 - binary_crossentropy: 0.4206 - auc: 0.7842
Epoch 2/2
290964/290964 [==============================] - 52s 178us/sample - loss: 0.3630 - binary_crossentropy: 0.3618 - auc: 0.8478
# Predict on the test set
tst_user_item_feats_df_din_model['pred_score'] = model.predict(x_tst, verbose=1, batch_size=256)
tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'din_rank_score.csv', index=False)
500000/500000 [==============================] - 20s 39us/sample
# Re-rank the predictions and generate the submission file
rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='din')
# Five-fold cross validation, splitting by user
# This part is independent of the single train/validation run above
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_din_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea,
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)

# Run the five folds and keep the out-of-fold results for stacking
# (note: this reuses the DIN model built above and keeps fitting it across folds)
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)]
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]

    # Prepare the training data
    x_trn, dnn_feature_columns = get_din_feats_columns(train_idx, dense_fea,
                                                       sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_trn = train_idx['label'].values

    # Prepare the validation data
    x_val, dnn_feature_columns = get_din_feats_columns(valid_idx, dense_fea,
                                                       sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_val = valid_idx['label'].values

    history = model.fit(x_trn, y_trn, verbose=1, epochs=2, validation_data=(x_val, y_val), batch_size=256)

    # Predict on the validation fold
    valid_idx['pred_score'] = model.predict(x_val, verbose=1, batch_size=256)
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Collect the validation-fold predictions; they are concatenated later
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # For the online setting, accumulate the test predictions of each fold and average at the end
    if not offline:
        sub_preds += model.predict(x_tst, verbose=1, batch_size=256)[:, 0]

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_din_cls_feats.csv', index=False)

# Average the test-set predictions over the folds and save the score and rank features for later
# stacking (more features could be constructed here as well)
tst_user_item_feats_df_din_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_din_model['pred_score'] = tst_user_item_feats_df_din_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_din_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_din_model['pred_rank'] = tst_user_item_feats_df_din_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the cross-validation features for the test set
tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_din_cls_feats.csv', index=False)
# Read the score files produced by the individual models
lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')
lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')
din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')

# The cross-validation test outputs could also be used here for the weighted fusion
rank_model = {'lgb_ranker': lgb_ranker, 'lgb_cls': lgb_cls, 'din_ranker': din_ranker}
def get_ensumble_predict_topk(rank_model, topk=5):
    final_recall = rank_model['lgb_cls'].append(rank_model['din_ranker'])
    rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))

    final_recall = final_recall.append(rank_model['lgb_ranker'])
    final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()

    submit(final_recall, topk=topk, model_name='ensemble_fuse')
get_ensumble_predict_topk(rank_model)
# Read the cross-validation feature files produced by the three models
# Training set
trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')
trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')
trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')

# Test set
tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')
tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')
tst_din_cls_feats = pd.read_csv(save_path + 'tst_din_cls_feats.csv')
# Concatenate the features output by the three models
finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]
finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]

for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats]):
    for feat in ['pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_trn_ranker_feats[col_name] = trn_model[feat]

for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats]):
    for feat in ['pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_tst_ranker_feats[col_name] = tst_model[feat]
# Fit a logistic regression on the cross-validation features and predict on the test set.
# Note: during cross validation you can construct more features related to the predicted values
# to enrich this simple second-stage model
from sklearn.linear_model import LogisticRegression

feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1', 'pred_score_2', 'pred_rank_2']

trn_x = finall_trn_ranker_feats[feat_cols]
trn_y = finall_trn_ranker_feats['label']

tst_x = finall_tst_ranker_feats[feat_cols]

# Define the model
lr = LogisticRegression()

# Train the model
lr.fit(trn_x, trn_y)

# Predict: the positive-class probability is used as the final score
finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]
# Re-rank the predictions and generate the submission file
rank_results = finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='ensumble_staking')
This chapter covered three ranking models: LightGBM's ranker (LGBMRanker), LightGBM's classifier (LGBMClassifier), and the deep-learning model DIN. We also applied simple model-fusion strategies, namely weighted fusion of the scores and Stacking.