【參考文獻】:Sarwar B M . Item-based collaborative filtering recommendation algorithms[C]// International Conference on World Wide Web. ACM, 2001.
背景:推薦領域必讀文獻之一,經典之作,本博客主要記錄了該文章的主要思想和相關實現代碼,歡迎觀摩!
前提或假設
數據集
咱們選用MovieLens 100K Dataset,=> 100,000 ratings from 1000 users on 1700 movies.
下載地址:movielens數據集
算法理論
算法框架:如圖,輸入是user-item的評分矩陣,該矩陣非常稀疏。算法的任務是預測特定用戶對特定項目的評分,填補矩陣中的空白單元格,接着根據預測評分從高到低爲特定用戶進行top-N推薦。
算法預測:算法認爲,若某用戶喜歡某項目,則在很大程度上也會對和該項目較相似的項目產生興趣。因此預測分兩步進行:先計算項目之間的相似性,再根據相似性預測評分。
文章提供了三個相似性計算公式:
Cosine-based Similarity
$$ sim(i,j)= cos(\vec{i},\vec{j})= \frac{\vec{i}\cdot \vec{j}}{\left \| \vec{i} \right \|_{2}*\left \| \vec{j} \right \|_{2}} $$
Correlation-based Similarity
$$ sim(i,j)= \frac{\sum _{u\in U}(R_{u,i}-\bar{R}_{i})(R_{u,j}-\bar{R}_{j})}{\sqrt{\sum _{u\in U}(R_{u,i}-\bar{R}_{i})^{2}}\sqrt{\sum _{u\in U}(R_{u,j}-\bar{R}_{j})^{2}}} $$
Adjusted Cosine Similarity
$$ sim(i,j)= \frac{\sum _{u\in U}(R_{u,i}-\bar{R}_{u})(R_{u,j}-\bar{R}_{u})}{\sqrt{\sum _{u\in U}(R_{u,i}-\bar{R}_{u})^{2}}\sqrt{\sum _{u\in U}(R_{u,j}-\bar{R}_{u})^{2}}} $$
但是,所有的相似性計算都必須在共同評分項上進行,即只使用同時評價過 i 和 j 的用戶的歷史評分。
算法選取和該項目最相似的前N個項目作爲預測基礎,預測公式如下:
$$ P_{u,i}=\frac{\sum _{all similar items,N}(S_{i,N}*R_{u,N})}{\sum _{all similar items,N}(\left | S_{i,N} \right |)} $$
算法最後一步,根據預測評分值從高到低進行推薦。
實驗度量
文章採用MAE進行誤差度量,公式如下:
$$ MAE = \frac{\sum_{i=1}^{N}\left | p_{i}-q_{i} \right |}{N} $$
Python 代碼
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Item-based collaborative filtering with adjusted-cosine similarity.

The script reads tab-separated ``user, item, rating`` rows, scores every
(user, item) pair of a test file with a similarity-weighted average of the
user's own ratings on the item's N most similar items, writes the scores
to a prediction file and can report the mean absolute error (MAE).
"""
import math
import operator

# Default file locations (unchanged from the original script).
TRAIN_FILE = './dataset/u1.base'
TEST_FILE = "../Collaborative Filtering/dataset/u1.test"
PREDICT_FILE = "../Collaborative Filtering/predict"

# Value getItemSim returns when two items have no rater in common.
NO_COMMON_USERS_SIM = -9999


def _readRatings(path):
    """Parse a tab-separated ratings file into two nested dicts.

    Returns a ``(userItems, itemUsers)`` tuple:
    ``userItems[user][item] == rating`` and
    ``itemUsers[item][user] == rating``.  Ids are kept as strings and
    ratings are converted to ``float``.
    """
    user_items = {}
    item_users = {}
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue  # blank or truncated line
            user_id = fields[0]
            item_id = fields[1]
            rating = float(fields[2])
            # setdefault keeps the first rating if a (user, item) pair repeats.
            user_items.setdefault(user_id, {}).setdefault(item_id, rating)
            item_users.setdefault(item_id, {}).setdefault(user_id, rating)
    return user_items, item_users


def loadData(trainFile=TRAIN_FILE):
    """Load the training split.

    Args:
        trainFile: path of the tab-separated training file.

    Returns:
        A two-element list ``[trainSet, movieUser]`` where
        ``trainSet[user][item]`` and ``movieUser[item][user]`` both hold the
        rating as a float.

    The test split is not loaded here; saveFile reads it directly.
    """
    train_set, movie_user = _readRatings(trainFile)
    return [train_set, movie_user]


def i_j_users(i_id, j_id, movieUser):
    """Return ``{(i_id, j_id): {user: None, ...}}`` for users who rated both items.

    An item missing from ``movieUser`` is treated as having no raters.
    """
    i_raters = movieUser.get(i_id, {})
    j_raters = movieUser.get(j_id, {})
    common = dict.fromkeys(user for user in i_raters if user in j_raters)
    return {(i_id, j_id): common}


def getAverageRating(trainSet, userid):
    """Return the mean of all ratings ``userid`` has in ``trainSet``."""
    ratings = trainSet[userid]
    return (sum(ratings.values()) * 1.0) / len(ratings)


def getItemSim(i_j_users, i_id, j_id, trainSet):
    """Compute the adjusted-cosine similarity of items ``i_id`` and ``j_id``.

    Args:
        i_j_users: mapping as produced by the ``i_j_users`` function, i.e.
            ``{(i_id, j_id): {user: None, ...}}``.
        i_id, j_id: the two item ids.
        trainSet: ``trainSet[user][item] == rating``.

    Returns:
        ``NO_COMMON_USERS_SIM`` (-9999) when no user rated both items;
        ``1`` when either item's deviations are all zero (denominator 0);
        otherwise the similarity as a float.
    """
    common_users = i_j_users[(i_id, j_id)]
    if not common_users:
        return NO_COMMON_USERS_SIM

    numerator = 0
    i_squares = 0
    j_squares = 0
    for user in common_users:
        # Ratings are centred on the user's own mean, not the item's mean.
        mean = getAverageRating(trainSet, user)
        i_dev = trainSet[user][i_id] - mean
        j_dev = trainSet[user][j_id] - mean
        numerator += i_dev * j_dev
        i_squares += i_dev * i_dev
        j_squares += j_dev * j_dev

    if i_squares == 0 or j_squares == 0:
        # NOTE(review): the similarity is undefined here; the original
        # author chose 1.  Kept as-is so results do not change -- consider 0.
        return 1
    return numerator * 1.0 / (math.sqrt(i_squares) * math.sqrt(j_squares))


def i_allitem_sort(i_id, movieUser, trainSet, N):
    """Return the (at most) N items most similar to ``i_id``.

    Returns:
        ``{j_id: similarity}`` for the N highest similarities.  Items that
        share no rater with ``i_id`` are left out entirely, so the -9999
        sentinel can never be used as a weight by ``prediction``.
    """
    similarities = {}
    for j_id in movieUser:
        if j_id == i_id:
            continue
        pair_users = i_j_users(i_id, j_id, movieUser)
        sim = getItemSim(pair_users, i_id, j_id, trainSet)
        if sim == NO_COMMON_USERS_SIM:
            continue
        similarities[j_id] = sim
    ranked = sorted(similarities.items(),
                    key=operator.itemgetter(1), reverse=True)
    return dict(ranked[:N])


def _predictFromNeighbours(userid, neighbours, trainSet):
    """Weighted-average rating of ``userid`` over a ``{item: sim}`` neighbourhood."""
    # A user absent from the training data has no ratings to average.
    user_ratings = trainSet.get(userid, {})
    weighted_sum = 0
    weight_total = 0
    for j_id, sim in neighbours.items():
        if j_id not in user_ratings:
            continue  # the user never rated this neighbour
        weighted_sum += sim * user_ratings[j_id]
        weight_total += abs(sim)
    if weight_total == 0:
        return 0
    return weighted_sum * 1.0 / weight_total


def prediction(userid, itemid, moviUser, trainSet, N):
    """Predict the rating ``userid`` would give ``itemid``.

    Args:
        userid, itemid: ids of the pair to score.
        moviUser: ``moviUser[item][user] == rating``.
        trainSet: ``trainSet[user][item] == rating``.
        N: neighbourhood size.

    Returns:
        ``sum(sim * rating) / sum(|sim|)`` over the neighbours the user has
        rated, or ``0`` when there is nothing to average (unknown user, or
        no rated neighbour with non-zero similarity).
    """
    neighbours = i_allitem_sort(itemid, moviUser, trainSet, N)
    return _predictFromNeighbours(userid, neighbours, trainSet)


def saveFile(moviUser, trainSet, N, testFile=TEST_FILE, predictFile=PREDICT_FILE):
    """Score every row of ``testFile`` and write the results to ``predictFile``.

    Each output line is ``user<TAB>item<TAB>rating<TAB>prediction``.

    Args:
        moviUser: ``moviUser[item][user] == rating``.
        trainSet: ``trainSet[user][item] == rating``.
        N: neighbourhood size.
        testFile: tab-separated input whose first three fields are
            user, item and rating.
        predictFile: output path; overwritten.
    """
    # The neighbourhood depends only on the item, so compute it once per item
    # instead of once per test row.
    neighbour_cache = {}
    with open(testFile) as source, open(predictFile, 'w') as sink:
        for line in source:
            fields = line.split('\t')
            if len(fields) < 3:
                continue  # blank or truncated line
            uid = fields[0].strip()
            item = fields[1].strip()
            rating = float(fields[2].strip())
            if item not in neighbour_cache:
                neighbour_cache[item] = i_allitem_sort(item, moviUser, trainSet, N)
            score = _predictFromNeighbours(uid, neighbour_cache[item], trainSet)
            sink.write(uid + "\t" + item + "\t" + str(rating) + "\t" + str(score) + "\n")


def getMAE(predictFile=PREDICT_FILE):
    """Print and return the mean absolute error of a prediction file.

    Rows whose prediction is ``0`` (the value ``prediction`` returns when it
    cannot score a pair) are excluded from both the error sum and the count.
    Returns NaN when no row can be scored.
    """
    total_error = 0.0
    scored = 0
    with open(predictFile) as handle:
        for line in handle:
            fields = line.split('\t')
            if len(fields) < 4:
                continue  # blank or truncated line
            rating = float(fields[2].strip())
            predicted = float(fields[3].strip())
            if predicted == 0:
                continue
            total_error += abs(predicted - rating)
            scored += 1
    mae = total_error / scored if scored else float('nan')
    print(mae)
    return mae


if __name__ == '__main__':
    N = 30
    arr = loadData()
    trainSet = arr[0]
    movieUser = arr[1]
    saveFile(movieUser, trainSet, N)
    # getMAE()