'channelGrouping', 'customDimensions', 'date', 'device', 'fullVisitorId', 'geoNetwork', 'hits', 'socialEngagementType', 'totals', 'trafficSource', 'visitId', 'visitNumber', 'visitStartTime'
# Standard-library and third-party imports for the revenue-prediction notebook.
import os
import json
import warnings
from ast import literal_eval
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Fix: `pandas.io.json.json_normalize` is deprecated since pandas 1.0 and
# removed in pandas 2.x; the supported public name is `pandas.json_normalize`.
# The imported name is unchanged, so all later calls still work.
from pandas import json_normalize

# Silence pandas/matplotlib chatter for the notebook run.
warnings.filterwarnings('ignore')

# Root directory holding the raw Kaggle CSV files.
data_path = 'C:\\Project\\Kaggle\\Revenue_Prediction\\data\\'
def read_df(path, file_name, nrows=None):
    """Read one raw CSV from *path* into a DataFrame.

    Parameters
    ----------
    path : str
        Directory containing the file; the process cwd is changed to it.
    file_name : str
        Name of the CSV file to read.
    nrows : int, optional
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame

    The id columns are forced to ``str`` so that leading zeros in
    visitor/visit ids are not lost.
    """
    os.chdir(path)
    # Bug fix: the original passed ``chunksize=10000`` (which returns a
    # TextFileReader iterator) and silently ignored ``nrows``; callers such
    # as the column selection later in this file need a real DataFrame of
    # the first ``nrows`` rows.
    df = pd.read_csv(file_name,
                     dtype={'fullVisitorId': 'str', 'visitId': 'str'},
                     nrows=nrows)
    return df


# Peek at the first 10k rows of the raw training data.
train_head = read_df(data_path, 'train_v2.csv', nrows=10000)
能夠看出數據的結構較爲複雜,對於JSON列和類JSON列,須要通過處理,才能進行有效使用。在處理的過程當中,我也參考了其餘參賽者分享的一些Kernels,再經過拆分計算的思想,完成了數據的解析。
def split_df(df, path, num_split):
    """Split *df* into ``num_split`` chunks of 20,000 rows and save each
    chunk under *path* as ``<i>.csv``, so the JSON parsing below can be
    run piecewise on limited memory."""
    os.chdir(path)
    for i in range(num_split):
        temp = df[i * 20000:(i + 1) * 20000]
        temp.to_csv(str(i) + '.csv', index=False)
        print('No. %s is done.' % i)


def load_df(csv_name, nrows=None):
    """Read one chunk CSV and flatten its JSON / JSON-like columns.

    csv_name: file path; nrows: number of rows to read.
    ``JSON_COLUMNS`` / ``NEW_COLUMNS`` are module-level lists naming the
    JSON and JSON-like columns (defined elsewhere in this notebook).
    Saves the flattened frame to ``exjson_<name>.csv`` and returns it.
    """
    df = pd.read_csv(csv_name,
                     converters={column: json.loads for column in JSON_COLUMNS},  # json.loads: json -> python
                     dtype={'fullVisitorId': 'str', 'visitId': 'str'},
                     nrows=nrows)
    for col in NEW_COLUMNS:
        # Bug fix: the original used chained indexing
        # (``df[col][df[col] == "[]"] = "[{}]"``), which writes through a
        # temporary and is not guaranteed to modify ``df``; ``.loc`` is the
        # supported in-place form.
        df.loc[df[col] == "[]", col] = "[{}]"
        # NEW_COLUMNS hold Python-literal strings like "[{...}]"; take the
        # first element of each parsed list.
        df[col] = df[col].apply(literal_eval).str[0]
    for column in JSON_COLUMNS + NEW_COLUMNS:
        column_as_df = json_normalize(df[column])  # JSON column -> flat DataFrame
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # Extract the product and promo names from the complex nested
        # structure into a simple flat list:
        if 'hits.product' in column_as_df.columns:
            column_as_df['hits.v2ProductName'] = column_as_df['hits.product'].apply(
                lambda x: [p['v2ProductName'] for p in x] if type(x) == list else [])
            column_as_df['hits.v2ProductCategory'] = column_as_df['hits.product'].apply(
                lambda x: [p['v2ProductCategory'] for p in x] if type(x) == list else [])
            del column_as_df['hits.product']
        if 'hits.promotion' in column_as_df.columns:
            column_as_df['hits.promoId'] = column_as_df['hits.promotion'].apply(
                lambda x: [p['promoId'] for p in x] if type(x) == list else [])
            column_as_df['hits.promoName'] = column_as_df['hits.promotion'].apply(
                lambda x: [p['promoName'] for p in x] if type(x) == list else [])
            del column_as_df['hits.promotion']
        # Replace the raw JSON column by its flattened sub-columns.
        df = df.drop(column, axis=1).merge(column_as_df, left_index=True, right_index=True)
    df.to_csv('exjson_' + csv_name.split('.')[0] + '.csv', index=False)
    return df


def exjson(path, num):
    """Run :func:`load_df` over the chunk files ``0.csv`` .. ``<num-1>.csv``."""
    os.chdir(path)
    files = [str(d) + '.csv' for d in range(num)]
    for i in files:
        load_df(i)
        # The source had this literal broken across lines by the blog
        # formatting; reconstructed as one string.
        print('No. {} is done.'.format(i.split('.')[0]))


def concat_df(path, num, outname):
    """Concatenate the ``exjson_*.csv`` chunk files back into one frame.

    path: path_train/path_test; num: 86/21.
    Saves the combined frame to *outname* and returns it.
    """
    os.chdir(path)
    file_list = ['exjson_{}.csv'.format(i) for i in range(num)]
    df_list = []
    for file in file_list:
        # (removed a dead assignment that was immediately overwritten)
        chunk = pd.read_csv(file, dtype={'fullVisitorId': 'str', 'visitId': 'str'})
        df_list.append(chunk)
    df = pd.concat(df_list, ignore_index=True)
    df.to_csv(outname, index=False)
    return df


def bug_fix(df):
    """Drop malformed rows whose ``date`` equals the literal string "No"."""
    drop_list = df[df['date'] == "No"].index.tolist()
    df = df.drop(drop_list)
    print(df)
    return df
因爲比較擔憂計算能力,拆分、解析、組合的過程被分別執行,且存儲了過程結果,三者的主要函數見上面摺疊的代碼。
此後又對數據作出了一些簡單處理,分離了年月日的信息,將totals.transactionRevenue取了對數(np.log1p),去掉了缺失值過多和數值單一的列,下面將主要對瀏覽、購買次數和時間進行分析。
# Load the pre-cleaned combined dataset and keep only the features used in
# this EDA section.
# Bug fix: the original referenced an undefined name ``path_data``; the
# data-directory constant defined at the top of this file is ``data_path``.
all_precleaning = read_df(data_path, 'all_data_precleaning.csv')
all_eda = all_precleaning[['fullVisitorId', 'visitStartTime', 'visitNumber',
                           'totals.transactionRevenue', 'totals.hits',
                           'totals.pageviews', 'totals.timeOnSite',
                           'totals.newVisits', 'date']]
all_precleaning 總共有70列,爲了突出重點展示,本文只對以上特徵進行分析。
提取年和月做爲一列,方便後續分組。
# Derive a year-month key ('2017-08-01' -> '201708') for month-level grouping.
# Vectorized .str operations replace the original row-wise
# ``DataFrame.apply(..., axis=1)``, which made one Python call per row and is
# very slow on a dataset this size; the result is identical for 'Y-M-D' strings.
_date_parts = all_eda['date'].str.split('-')
all_eda['yearMonth'] = _date_parts.str[0] + _date_parts.str[1]
計算過程當中,將僅瀏覽一次的數據單獨計算;其他數據根據 fullVisitorId 進行分組累計,每一個分組內按照瀏覽時間由小到大排列,以便標記次數。
計算特徵的代碼較長,摺疊於下方,結果爲29列。
def add_groupby_col(df, new_column_names, by = 'fullVisitorId', agg_cols = ['totals.transactionRevenue'], aggfunc = ['count']):
    """Aggregate *agg_cols* per *by* group and left-join the result onto df.

    new_column_names: a list of col names, one per aggregated column.
    By default this counts non-null transactionRevenue entries per visitor.
    NOTE(review): the list defaults are mutable default arguments; they are
    never mutated here, so behavior is safe, but None-sentinels would be
    cleaner.
    """
    temp = df.groupby(by)[agg_cols].aggregate(aggfunc)
    temp.columns = new_column_names
    # Left join keeps every original row; groups absent from temp get NaN.
    df = pd.merge(df, temp, left_on = 'fullVisitorId', right_index = True, how = 'left')
    return df


def calculate_id_features(df):
    """Per-visitor feature engineering; *df* is one visitor's visits.

    Applied via ``groupby('fullVisitorId').apply(...)``, so every column
    added here is either constant or cumulative within a single visitor.
    """
    # Order visits chronologically so cumulative counters are meaningful.
    df = df.sort_values(by = 'visitNumber')
    # 1 when the visit produced revenue, else 0.
    df['buy'] = df.apply(lambda x: 1 if x['totals.transactionRevenue']>0 else 0, axis = 1)
    # Running number of purchases up to and including this visit.
    df['buyNumber'] = df['buy'].cumsum()
    # Id shared by all visits that lead up to the same *next* purchase.
    df['nextBuyGroup'] = df['buyNumber'] - df['buy']
    # Latest visit time within each next-buy group = time of that purchase.
    next_buy_time = df.groupby('nextBuyGroup').agg({'visitStartTime': 'max'})
    next_buy_time.columns = ['nextBuyTime']
    df = pd.merge(df, next_buy_time, left_on = 'buyNumber', right_index = True, how = 'left')
    # Visitor-level totals, broadcast to every row of the group.
    df['sumRevenue'] = df['totals.transactionRevenue'].sum()
    df['everBuy'] = df.apply(lambda x: 1 if x['sumRevenue']>0 else 0, axis = 1)
    df['buyTimes'] = df['buy'].sum()
    df['averageRevenue'] = df.apply(lambda x: x['sumRevenue']/x['buyTimes'] if x['buyTimes']>0 else 0, axis = 1)
    df['firstVisitTime'] = df['visitStartTime'].min()
    df['lastVisitTime'] = df['visitStartTime'].max()
    # Seconds since the visitor's first visit (visitStartTime is assumed to
    # be a unix timestamp in seconds -- TODO confirm against the raw data).
    df['sinceFirstVisit'] = df['visitStartTime'] - df['firstVisitTime']
    df['sinceFirstVisit.day'] = df['sinceFirstVisit'] // (24*3600)
    df['sinceFirstVisit.period'] = pd.cut(df['sinceFirstVisit.day'], [-1, 30, 60, 120, 240, 800], labels = ['within30', '30-60', '60-120', '120-240', '240-800'])

    def get_timegap(df_l):
        # Seconds until the next purchase; returns None (-> NaN) when the
        # purchase happened at or before this visit.
        timegap = df_l['nextBuyTime'] - df_l['visitStartTime']
        if timegap > 0:
            return timegap

    df['timeToBuy'] = df.apply(lambda x: get_timegap(x), axis = 1)
    df['timeToBuy'].fillna(0, inplace = True)
    # Days until purchase; -10 flags visitors who never bought.
    df['timeToBuy.day'] = df.apply(lambda x: x['timeToBuy']/(24*3600) if x['everBuy']==1 else -10, axis = 1)
    # Ordinal number of the purchase for buying visits, 0 otherwise.
    df['revNum'] = df.apply(lambda x: x['buyNumber'] if x['buy']==1 else 0, axis = 1)
    df['firstBuy'] = df.apply(lambda x: 1 if x['revNum']==1 else 0, axis = 1)
    df['reBuy'] = df.apply(lambda x: 1 if x['revNum']>1 else 0, axis = 1)
    return df


def one_visit_features(df):
    """Feature columns for visitors with exactly one visit.

    Mirrors :func:`calculate_id_features` but runs on the whole one-visit
    frame at once: because each visitor contributes a single row, every
    per-visitor aggregate equals that row's own value.

    Bug fix: the original assigned frame-wide scalars
    (``df['totals.transactionRevenue'].sum()``, ``df['buy'].sum()``) as if
    they were per-visitor aggregates, which summed revenue over *all*
    one-visit visitors and marked every one of them as a buyer whenever any
    of them bought. Per-row values are the correct per-visitor aggregates.
    """
    df['buy'] = df.apply(lambda x: 1 if x['totals.transactionRevenue']>0 else 0, axis = 1)
    # One visit per visitor -> the visitor's total revenue is this row's own
    # revenue (NaN means no transaction, hence 0).
    df['sumRevenue'] = df['totals.transactionRevenue'].fillna(0)
    df['everBuy'] = df['buy']
    df['buyTimes'] = df['buy']
    df['averageRevenue'] = df.apply(lambda x: x['sumRevenue']/x['buyTimes'] if x['buyTimes']>0 else 0, axis = 1)
    df['firstVisitTime'] = df['visitStartTime']
    df['lastVisitTime'] = df['visitStartTime']
    df['revNum'] = df['buy']
    df['firstBuy'] = df['buy']
    df['reBuy'] = 0
    return df


# viewTimes = number of visits per visitor; split single-visit visitors off
# so the expensive per-group apply only runs on multi-visit visitors.
all_eda = add_groupby_col(all_eda, ['viewTimes'])
all_eda_oneview = all_eda[all_eda['viewTimes'] == 1]
all_eda_views = all_eda[all_eda['viewTimes'] > 1]
all_eda_oneview_cal = one_visit_features(all_eda_oneview)
all_eda_views_cal = all_eda_views.groupby('fullVisitorId').apply(calculate_id_features)
all_eda_cal = pd.concat([all_eda_views_cal, all_eda_oneview_cal], ignore_index = True)
all_eda_cal.to_csv('all_eda_cal.csv', index = False)
def view_range_agg(df):
    """Distribution of visitors over visit-count buckets.

    df: all_eda_cal. Returns a frame indexed by viewRange with the number
    of visitors in each bucket.
    """
    per_visitor = df.groupby('fullVisitorId').agg({'viewTimes': 'max'})
    bucket_counts = per_visitor.groupby('viewTimes').agg({'viewTimes': 'count'})
    bucket_counts.columns = ['num']
    bucket_counts.reset_index(inplace = True)
    # Bin the raw visit counts into readable ranges.
    bucket_counts['viewRange'] = pd.cut(
        bucket_counts['viewTimes'],
        [-1, 1, 2, 3, 6, 10, 20, 40, 80, 500],
        labels = ['1', '2', '3', '4-6', '7-10', '11-20', '21-40', '41-80', '81-500'])
    return bucket_counts.groupby('viewRange').agg({'num': 'sum'})


def buy_range_agg(df):
    """Distribution of visitors over purchase-count buckets.

    df: all_eda_agg. Returns a frame indexed by buyRange with the number
    of visitors in each bucket.
    """
    per_visitor = df.groupby('fullVisitorId').agg({'buyTimes': 'max'})
    bucket_counts = per_visitor.groupby('buyTimes').agg({'buyTimes': 'count'})
    bucket_counts.columns = ['num']
    bucket_counts.reset_index(inplace = True)
    bucket_counts['buyRange'] = pd.cut(
        bucket_counts['buyTimes'],
        [-1, 0, 1, 2, 3, 6, 10, 33],
        labels = ['0', '1', '2', '3', '4-6', '7-10', '11-33'])
    return bucket_counts.groupby('buyRange').agg({'num': 'sum'})


view_range = view_range_agg(all_eda_cal)
buy_range = buy_range_agg(all_eda_cal)
print('瀏覽次數分佈以下:')
print(view_range)
print('-' * 10)
print('購買次數分佈以下:')
print(buy_range)
包含全部取值可能,會致使部分數據沒法得到直觀展示。
# Register a CJK-capable font so the Chinese plot titles render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Side-by-side horizontal bar charts of the visit- and purchase-count
# distributions computed above.
fig, axes = plt.subplots(1, 2, figsize = (20, 6))
view_range.plot.barh(ax = axes[0])
axes[0].set_title('瀏覽次數分佈')
buy_range.plot.barh(ax = axes[1])
axes[1].set_title('購買次數分佈')
# Re-plot skipping the first two buckets of each distribution: those buckets
# dominate the scale and squash the remaining bars beyond readability.
fig, axes = plt.subplots(1, 2, figsize = (20, 6))
view_range[2:].plot.barh(ax = axes[0])
axes[0].set_title('瀏覽次數分佈')
buy_range[2:].plot.barh(ax = axes[1])
axes[1].set_title('購買次數分佈')
def yearMonth_des(df): "df: all_eda_cal" # 總購買數 新增瀏覽 總銷售額 yearmonth_1 = df.groupby('yearMonth').agg({'buy': 'sum', 'totals.newVisits': 'sum', 'totals.transactionRevenue': 'sum'}) yearmonth_1.columns = ['month_buyTimes', 'month_newVisits', 'month_totalRev'] # 總瀏覽數 yearmonth_visit_time = df.groupby('yearMonth').apply(lambda x: len(x)).reset_index() yearmonth_visit_time.columns = ['yearMonth', 'month_visitTime'] yearmonth_visit_time.index = yearmonth_visit_time['yearMonth'] # 新增購買 / 重複購買 銷售額 # 此時的重複購買指:不是第一次購買,有可能第一次購買就發生於當月 first_buy_rev = df[df['firstBuy']==1].groupby('yearMonth').agg({'totals.transactionRevenue': 'sum'}) rebuy_rev = df[df['reBuy']==1].groupby('yearMonth').agg({'totals.transactionRevenue': 'sum'}) first_buy_rev.columns = ['firstBuyRev'] rebuy_rev.columns = ['reBuyRev'] # 統計新增/重複購買人數 按年月分組 yearmonth_2 = df.groupby('yearMonth').agg({'firstBuy': 'sum', 'reBuy': 'sum'}) # 將分散的groupby特徵整合到一塊兒 yearmonth_des = pd.concat([yearmonth_visit_time, yearmonth_1, yearmonth_2, first_buy_rev, rebuy_rev], axis = 1) # 計算首次購買和重複購買的金額均值 yearmonth_des['avgFirst'] = yearmonth_des['firstBuyRev'] / yearmonth_des['firstBuy'] yearmonth_des['avgRev'] = yearmonth_des['reBuyRev'] / yearmonth_des['reBuy'] #yearmonth_des.to_csv('yearmonth_group.csv', index = False) return yearmonth_des yearmonth_des = yearMonth_des(all_eda_cal) yearmonth_des.index = yearmonth_des.index.astype(str) yearmonth_des.tail(6)
# Two stacked time-series panels over the monthly summary table.
fig, ax = plt.subplots(2, 1, figsize = (20, 16))
# Panel 1: total visits vs new visits vs purchases per month.
ax[0].plot(yearmonth_des['month_visitTime'])
ax[0].plot(yearmonth_des['month_newVisits'])
ax[0].plot(yearmonth_des['month_buyTimes'])
ax[0].legend()
ax[0].set_title('瀏覽次數 新增瀏覽 購買次數')
# Panel 2: total purchases split into first-time vs repeat purchases.
ax[1].plot(yearmonth_des['month_buyTimes'])
ax[1].plot(yearmonth_des['firstBuy'])
ax[1].plot(yearmonth_des['reBuy'])
ax[1].legend()
ax[1].set_title('購買次數 首次購買 重複購買')
## 表2 # 首次購買的用戶須要的瀏覽次數 區間 all_eda_cal['visitNumRange'] = pd.cut(all_eda_cal['visitNumber'], [0, 1, 2, 5, 10, 20, 388], labels = ['1', '2', '3-5', '6-10', '11-20', '21-388']) firstBuy_visitNum_pivot = all_eda_cal[all_eda_cal['firstBuy']==1].pivot_table(index = 'yearMonth', columns = 'visitNumRange', aggfunc = {'visitNumRange': 'count'}) firstBuy_visitNum_pivot.tail(6) plt.figure(figsize = (12, 10)) #yearmonth_buy_pivot.fillna(0, inplace = True) sns.heatmap(firstBuy_visitNum_pivot, annot = True, # 是否顯示數值 fmt = '.0f', # 格式化字符串 linewidths = 0.1, # 格子邊線寬度 center = 300, # 調色盤的色彩中心值,若沒有指定,則以cmap爲主 cmap = 'Blues', # 設置調色盤 cbar = True, # 是否顯示圖例色帶 #cbar_kws={"orientation": "horizontal"}, # 是否橫向顯示圖例色帶 #square = True, # 是否正方形顯示圖表 ) plt.title('首次購買時的瀏覽次數分佈')
## 表3表4 # 首次購買和重複購買與首次瀏覽時間間隔的分佈 firstBuy_sinceFisrtVisit_pivot = all_eda_cal[all_eda_cal['firstBuy']==1].pivot_table(index = 'yearMonth', columns = 'sinceFirstVisit.period', aggfunc = {'sinceFirstVisit.period': 'count'}) reBuy_sinceFisrtVisit_pivot = all_eda_cal[all_eda_cal['reBuy']==1].pivot_table(index = 'yearMonth', columns = 'sinceFirstVisit.period', aggfunc = {'sinceFirstVisit.period': 'count'}) firstBuy_sinceFisrtVisit_pivot.columns = [['120-240', '240-800', '30-60', '60-120', 'within30']] reBuy_sinceFisrtVisit_pivot.columns = [['120-240', '240-800', '30-60', '60-120', 'within30']] firstBuy_sinceFisrtVisit_pivot = firstBuy_sinceFisrtVisit_pivot[['within30', '30-60', '60-120', '120-240', '240-800']] reBuy_sinceFisrtVisit_pivot = reBuy_sinceFisrtVisit_pivot[['within30', '30-60', '60-120', '120-240', '240-800']] firstBuy_sinceFisrtVisit_pivot.tail(6)
plt.figure(figsize = (12, 10)) #yearmonth_buy_pivot.fillna(0, inplace = True) sns.heatmap(firstBuy_sinceFisrtVisit_pivot.drop('within30', axis = 1), annot = True, # 是否顯示數值 fmt = '.0f', # 格式化字符串 linewidths = 0.1, # 格子邊線寬度 center = 30, # 調色盤的色彩中心值,若沒有指定,則以cmap爲主 cmap = 'Blues', # 設置調色盤 cbar = True, # 是否顯示圖例色帶 #cbar_kws={"orientation": "horizontal"}, # 是否橫向顯示圖例色帶 #square = True, # 是否正方形顯示圖表 ) plt.title('首次購買與首次瀏覽的時間間隔')
plt.figure(figsize = (12, 10)) #yearmonth_buy_pivot.fillna(0, inplace = True) sns.heatmap(reBuy_sinceFisrtVisit_pivot, annot = True, # 是否顯示數值 fmt = '.0f', # 格式化字符串 linewidths = 0.1, # 格子邊線寬度 center = 35, # 調色盤的色彩中心值,若沒有指定,則以cmap爲主 cmap = 'Blues', # 設置調色盤 cbar = True, # 是否顯示圖例色帶 #cbar_kws={"orientation": "horizontal"}, # 是否橫向顯示圖例色帶 #square = True, # 是否正方形顯示圖表 ) plt.title('重複購買與首次瀏覽的時間間隔')
至此,咱們已經對這個預測問題的基本狀況有了一個初步的認識,這些數據能夠爲本身的交叉驗證作出有效的補充。