原書作者使用字典dict實現推薦算法,而且驚歎於18行代碼實現了向量的餘弦夾角公式。
我用pandas實現相同的公式只要3行。
特別說明:本篇筆記是針對矩陣數據,下篇筆記是針對條目數據。
"""User-based collaborative filtering on a user x item rating matrix.

The module loads a small sample rating matrix into the module-level
DataFrame ``df`` (rows are users, columns are items, NaN means "not rated")
and offers several distance / similarity metrics plus two recommenders
built on top of them.
"""
import json
from io import StringIO

import pandas as pd

# Data format 1: CSV matrix (user x item); suitable for small data sets.
csv_txt = '''"user","Blues Traveler","Broken Bells","Deadmau5","Norah Jones","Phoenix","Slightly Stoopid","The Strokes","Vampire Weekend"
"Angelica",3.5,2.0,,4.5,5.0,1.5,2.5,2.0
"Bill",2.0,3.5,4.0,,2.0,3.5,,3.0
"Chan",5.0,1.0,1.0,3.0,5,1.0,,
"Dan",3.0,4.0,4.5,,3.0,4.5,4.0,2.0
"Hailey",,4.0,1.0,4.0,,,4.0,1.0
"Jordyn",,4.5,4.0,5.0,5.0,4.5,4.0,4.0
"Sam",5.0,2.0,,3.0,5.0,4.0,5.0,
"Veronica",3.0,,,5.0,4.0,2.5,3.0,'''

# Data format 2: JSON mapping user -> {item: rating}.
json_txt = '''{"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
 "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
 "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
 "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
 "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
 "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
 "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
 "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
}'''

# Rating matrix shared by every function below; filled in by a loader.
df = None

# Metrics where a smaller value means "more similar"; any other metric in
# ``metric_funcs`` is treated as a similarity (larger means "more similar").
_DISTANCE_METRICS = ('manhattan', 'euclidean', 'minkowski')


def load_csv_txt():
    """Load the CSV sample into the module-level ``df`` (users as index)."""
    global df
    df = pd.read_csv(StringIO(csv_txt), header=0, index_col="user")


def load_json_txt():
    """Load the JSON sample into the module-level ``df`` (users as index)."""
    global df
    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas releases, so hand it a file-like object instead.
    df = pd.read_json(StringIO(json_txt), orient='index')


# Demo: load the data (swap in load_json_txt() to try the JSON source).
load_csv_txt()


def build_xy(user_name1, user_name2):
    """Return the two users' ratings restricted to items both have rated.

    Returns:
        A pair of pd.Series (one per user) sharing the same item index.
    """
    # ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
    both_rated = df.loc[user_name1].notnull() & df.loc[user_name2].notnull()
    return df.loc[user_name1, both_rated], df.loc[user_name2, both_rated]


def manhattan(user_name1, user_name2):
    """Manhattan (L1) distance over the items both users rated."""
    x, y = build_xy(user_name1, user_name2)
    return (x - y).abs().sum()


def euclidean(user_name1, user_name2):
    """Euclidean (L2) distance over the items both users rated."""
    x, y = build_xy(user_name1, user_name2)
    return ((x - y) ** 2).sum() ** 0.5


def minkowski(user_name1, user_name2, r):
    """Minkowski distance of order ``r`` over the items both users rated."""
    x, y = build_xy(user_name1, user_name2)
    return ((x - y).abs() ** r).sum() ** (1 / r)


def pearson(user_name1, user_name2):
    """Pearson correlation over the items both users rated.

    Returns 0.0 when the correlation is undefined, i.e. when either user's
    common ratings have zero variance or the users share no rated items.
    """
    x, y = build_xy(user_name1, user_name2)
    dx, dy = x - x.mean(), y - y.mean()
    denominator = ((dx ** 2).sum() * (dy ** 2).sum()) ** 0.5
    # Check before dividing: the old ``[a / d, 0][d == 0]`` form evaluated the
    # division first and could raise ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return (dx * dy).sum() / denominator


def cosine(user_name1, user_name2):
    """Cosine similarity over the items both users rated.

    Returns 0.0 when either vector has zero length (including the case where
    the users share no rated items).
    """
    x, y = build_xy(user_name1, user_name2)
    denominator = ((x * x).sum() * (y * y).sum()) ** 0.5
    if denominator == 0:
        return 0.0
    return (x * y).sum() / denominator


metric_funcs = {
    'manhattan': manhattan,
    'euclidean': euclidean,
    'minkowski': minkowski,
    'pearson': pearson,
    'cosine': cosine
}

print(manhattan("Angelica", "Bill"))


def computeNearestNeighbor(user_name, metric='pearson', k=3, r=2):
    """Find the ``k`` users closest to ``user_name``.

    Args:
        user_name: row label of the target user in ``df``.
        metric: key of ``metric_funcs`` naming the metric to use.
        k: number of neighbours to return.
        r: order of the Minkowski distance (only used by 'minkowski').

    Returns:
        pd.Series whose index holds the neighbour names and whose values are
        the metric values: the k smallest for distance metrics, the k largest
        for similarity metrics.

    Raises:
        ValueError: if ``metric`` is not a key of ``metric_funcs``.
    """
    if metric not in metric_funcs:
        raise ValueError(
            'unknown metric %r; expected one of %s'
            % (metric, sorted(metric_funcs)))
    func = metric_funcs[metric]
    extra_args = (r,) if metric == 'minkowski' else ()
    others = df.index.drop(user_name)
    scores = pd.Series(
        [func(other, user_name, *extra_args) for other in others],
        index=others, dtype=float)
    if metric in _DISTANCE_METRICS:
        return scores.nsmallest(k)
    return scores.nlargest(k)


print(computeNearestNeighbor('Hailey', metric='pearson'))


def recommend(user_name):
    """Recommend items using the single nearest neighbour (default metric).

    Returns:
        pd.Series indexed by item name holding the neighbour's ratings for
        items the neighbour rated but ``user_name`` did not, sorted by rating
        in ascending order.
    """
    # The single closest user under the default metric.
    nearest_username = computeNearestNeighbor(user_name).index[0]
    # Items the neighbour rated that the target user has not rated yet.
    unseen = df.loc[user_name].isnull() & df.loc[nearest_username].notnull()
    return df.loc[nearest_username, unseen].sort_values()


# Recommend for Hailey.
print(recommend('Hailey'))


def recommend2(user_name, metric='pearson', k=3, n=5, r=2):
    """Recommend items from the weighted ratings of the k nearest neighbours.

    Args:
        user_name: row label of the target user in ``df``.
        metric: key of ``metric_funcs`` naming the metric to use.
        k: number of nearest neighbours contributing to the recommendation.
        n: number of items to recommend.
        r: order of the Minkowski distance (only used by 'minkowski').

    Returns:
        pd.Series indexed by item name whose values are the weighted ratings,
        limited to the ``n`` largest.
    """
    # Honour the requested metric (it used to be hard-coded to 'pearson').
    nearest_neighbors = computeNearestNeighbor(
        user_name, metric=metric, k=k, r=r)

    # Turn metric values into weights.
    if metric in _DISTANCE_METRICS:
        # Smaller distance means more similar, so weight by the reciprocal.
        exact_match = nearest_neighbors == 0
        if exact_match.any():
            # 1/0 would give inf weights that normalise to NaN; instead let
            # the neighbours at distance zero share all of the weight.
            weights = exact_match.astype(float)
        else:
            weights = 1 / nearest_neighbors
    else:
        # Larger similarity means more similar; use the values directly.
        weights = nearest_neighbors
    weights = weights / weights.sum()

    # For each neighbour, take the ratings of items the target user has not
    # rated and scale them by that neighbour's weight.
    not_rated = df.loc[user_name].isnull()
    neighbors_rate_with_weight = [
        df.loc[neighbor_name, not_rated & df.loc[neighbor_name].notnull()]
        * weight
        for neighbor_name, weight in weights.items()
    ]

    # Align the weighted ratings item by item, add them up per item and keep
    # the n highest-scoring items.
    combined = pd.concat(neighbors_rate_with_weight, axis=1)
    return combined.sum(axis=1, skipna=True).nlargest(n)


# Recommend for Hailey.
print(recommend2('Hailey', metric='manhattan', k=3, n=5))

# Recommend for Hailey.
print(recommend2('Hailey', metric='euclidean', k=3, n=5, r=2))

# Recommend for Hailey.
print(recommend2('Hailey', metric='pearson', k=1, n=5))