基於用戶的協同過濾算法
基於用戶的協同過濾算法是推薦系統中最古老的算法,能夠說這個算法的誕生標誌了推薦系統的誕生。該算法在1992年被提出,並應用於郵件過濾系統,1994年被GroupLens用於新聞過濾。
在一個在線個性化推薦系統中,當一個用戶A須要個性化推薦時,能夠先找到和他有類似興趣的其餘用戶,而後把那些用戶喜歡的而用戶A沒有接觸過的物品推薦給A。這種方法稱爲基於用戶的協同過濾算法。
給定用戶u和用戶v,令N(u)表示用戶u曾經有過正反饋的物品集合,經過餘弦類似度計算用戶的類似度:$w_{uv} = \frac{|N(u) \cap N(v)|}{\sqrt{|N(u)||N(v)|}}$。因爲不少用戶相互之間並無對一樣的物品產生過行爲,即 $|N(u) \cap N(v)| = 0$,所以能夠先計算 $|N(u) \cap N(v)| \neq 0$ 的用戶對(u,v)。爲此,能夠首先創建物品到用戶的倒查表,對於每一個物品保存對該物品產生過行爲的用戶列表。令稀疏矩陣 $C[u][v] = |N(u) \cap N(v)|$,假設用戶u和用戶v同時屬於倒查表中K個物品對應的用戶列表,就有 $C[u][v] = K$(即用戶u和v對相同物品產生正反饋的物品數),從而能夠掃描倒查表中每一個物品對應的用戶列表,將用戶列表中的兩兩用戶對應的 $C[u][v]$ 加1,最終就能夠得到全部用戶之間不爲0的 $C[u][v]$(也就是餘弦類似度的分子)。
獲得用戶之間的興趣類似度以後,基於用戶的協同過濾算法(User Based Collaborative Filtering)會給用戶推薦和他興趣最類似的K個用戶喜歡的物品。以下公式度量了UserCF算法中用戶u對物品i的感興趣程度:$p(u,i) = \sum_{v \in S(u,k) \cap N(i)} w_{uv} r_{vi}$
其中,S(u,k)包含和用戶u興趣類似度最接近的k個用戶集合,N(i)是對物品i有過行爲的用戶集合,$w_{uv}$ 是用戶u和用戶v的興趣類似度,$r_{vi}$ 表明用戶v對物品i的興趣,由於使用的是單一行爲的隱反饋數據,所以 $r_{vi}$ 爲1。
根據以上思路,使用Python實現UserCF算法的代碼以下:
import random |
002 |
import math |
003 |
class UserBasedCF: |
004 |
def __init__( self ,datafile = None ): |
005 |
self .datafile = datafile |
006 |
self .readData() |
007 |
self .splitData( 3 , 47 ) |
008 |
|
009 |
def readData( self ,datafile = None ): |
010 |
""" |
011 |
read the data from the data file which is a data set |
012 |
""" |
013 |
self .datafile = datafile or self .datafile |
014 |
self .data = [] |
015 |
for line in open ( self .datafile): |
016 |
userid,itemid,record,_ = line.split() |
017 |
self .data.append((userid,itemid, int (record))) |
018 |
019 |
def splitData( self ,k,seed,data = None ,M = 8 ): |
020 |
""" |
021 |
split the data set |
022 |
testdata is a test data set |
023 |
traindata is a train set |
024 |
test data set / train data set is 1:M-1 |
025 |
""" |
026 |
self .testdata = {} |
027 |
self .traindata = {} |
028 |
data = data or self .data |
029 |
random.seed(seed) |
030 |
for user,item, record in self .data: |
031 |
if random.randint( 0 ,M) = = k: |
032 |
self .testdata.setdefault(user,{}) |
033 |
self .testdata[user][item] = record |
034 |
else : |
035 |
self .traindata.setdefault(user,{}) |
036 |
self .traindata[user][item] = record |
037 |
|
038 |
def userSimilarityBest( self ,train = None ): |
039 |
""" |
040 |
the other method of getting user similarity which is better than above |
041 |
you can get the method on page 46 |
042 |
In this experiment,we use this method |
043 |
""" |
044 |
train = train or self .traindata |
045 |
self .userSimBest = dict () |
046 |
item_users = dict () |
047 |
for u,item in train.items(): |
048 |
for i in item.keys(): |
049 |
item_users.setdefault(i, set ()) |
050 |
item_users[i].add(u) |
051 |
user_item_count = dict () |
052 |
count = dict () |
053 |
for item,users in item_users.items(): |
054 |
for u in users: |
055 |
user_item_count.setdefault(u, 0 ) |
056 |
user_item_count[u] + = 1 |
057 |
for v in users: |
058 |
if u = = v: continue |
059 |
count.setdefault(u,{}) |
060 |
count[u].setdefault(v, 0 ) |
061 |
count[u][v] + = 1 |
062 |
for u ,related_users in count.items(): |
063 |
self .userSimBest.setdefault(u, dict ()) |
064 |
for v, cuv in related_users.items(): |
065 |
self .userSimBest[u][v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v] * 1.0 ) |
066 |
|
067 |
def recommend( self ,user,train = None ,k = 8 ,nitem = 40 ): |
068 |
train = train or self .traindata |
069 |
rank = dict () |
070 |
interacted_items = train.get(user,{}) |
071 |
for v ,wuv in sorted ( self .userSimBest[user].items(),key = lambda x : x[ 1 ],reverse = True )[ 0 :k]: |
072 |
for i , rvi in train[v].items(): |
073 |
if i in interacted_items: |
074 |
continue |
075 |
rank.setdefault(i, 0 ) |
076 |
rank[i] + = wuv |
077 |
return dict ( sorted (rank.items(),key = lambda x :x[ 1 ],reverse = True )[ 0 :nitem]) |
078 |
079 |
def recallAndPrecision( self ,train = None ,test = None ,k = 8 ,nitem = 10 ): |
080 |
""" |
081 |
Get the recall and precision, the method you want to know is listed |
082 |
in the page 43 |
083 |
""" |
084 |
train = train or self .traindata |
085 |
test = test or self .testdata |
086 |
hit = 0 |
087 |
recall = 0 |
088 |
precision = 0 |
089 |
for user in train.keys(): |
090 |
tu = test.get(user,{}) |
091 |
rank = self .recommend(user, train = train,k = k,nitem = nitem) |
092 |
for item,_ in rank.items(): |
093 |
if item in tu: |
094 |
hit + = 1 |
095 |
recall + = len (tu) |
096 |
precision + = nitem |
097 |
return (hit / (recall * 1.0 ),hit / (precision * 1.0 )) |
098 |
|
099 |
def coverage( self ,train = None ,test = None ,k = 8 ,nitem = 10 ): |
100 |
train = train or self .traindata |
101 |
test = test or self .testdata |
102 |
recommend_items = set () |
103 |
all_items = set () |
104 |
for user in train.keys(): |
105 |
for item in train[user].keys(): |
106 |
all_items.add(item) |
107 |
rank = self .recommend(user, train, k = k, nitem = nitem) |
108 |
for item,_ in rank.items(): |
109 |
recommend_items.add(item) |
110 |
return len (recommend_items) / ( len (all_items) * 1.0 ) |
111 |
|
112 |
def popularity( self ,train = None ,test = None ,k = 8 ,nitem = 10 ): |
113 |
""" |
114 |
Get the popularity |
115 |
the algorithm on page 44 |
116 |
""" |
117 |
train = train or self .traindata |
118 |
test = test or self .testdata |
119 |
item_popularity = dict () |
120 |
for user ,items in train.items(): |
121 |
for item in items.keys(): |
122 |
item_popularity.setdefault(item, 0 ) |
123 |
item_popularity[item] + = 1 |
124 |
ret = 0 |
125 |
n = 0 |
126 |
for user in train.keys(): |
127 |
rank = self .recommend(user, train, k = k, nitem = nitem) |
128 |
for item ,_ in rank.items(): |
129 |
ret + = math.log( 1 + item_popularity[item]) |
130 |
n + = 1 |
131 |
return ret / (n * 1.0 ) |
132 |
133 |
def testUserBasedCF(): |
134 |
cf = UserBasedCF( 'u.data' ) |
135 |
cf.userSimilarityBest() |
136 |
print "%3s%20s%20s%20s%20s" % ( 'K' , "recall" , 'precision' , 'coverage' , 'popularity' ) |
137 |
for k in [ 5 , 10 , 20 , 40 , 80 , 160 ]: |
138 |
recall,precision = cf.recallAndPrecision( k = k) |
139 |
coverage = cf.coverage(k = k) |
140 |
popularity = cf.popularity(k = k) |
141 |
print "%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k,recall * 100 ,precision * 100 ,coverage * 100 ,popularity) |
142 |
143 |
if __name__ = = "__main__" : |
144 |
testUserBasedCF() |
基於物品的協同過濾算法(Item-Based Collaborative Filtering)是目前業界應用最多的算法,亞馬遜、Netflix、Hulu、YouTube都採用該算法做爲其基礎推薦算法。
基於用戶的協同過濾算法有一些缺點:隨着網站的用戶數目愈來愈大,計算用戶興趣類似度矩陣將愈來愈困難,其運算時間複雜度和空間複雜度的增加和用戶數的增加近似於平方關係。而且,基於用戶的協同過濾算法很難對推薦結果作出解釋。所以亞馬遜提出了基於物品的協同過濾算法。
基於物品的協同過濾算法給用戶推薦那些和他們以前喜歡的物品類似的物品。不過ItemCF算法並不利用物品的內容屬性計算物品之間的類似度,它主要經過分析用戶的行爲記錄計算物品之間的類似度,也就是說物品A和物品B具備很大的類似度是由於喜歡物品A的用戶大都也喜歡物品B(這一點也是基於物品的協同過濾算法和基於內容的推薦算法最主要的區別)。同時,基於物品的協同過濾算法能夠利用用戶的歷史行爲給推薦結果提供推薦解釋,用於解釋的物品都是用戶以前喜歡的或者購買的物品。
「Customers Who Bought This Item Also Bought」(亞馬遜顯示相關物品推薦時的標題),從這句話的定義出發,利用如下公式定義物品之間的類似度:$w_{ij} = \frac{|N(i) \cap N(j)|}{\sqrt{|N(i)||N(j)|}}$
其中 $|N(i)|$ 是喜歡物品i的用戶數,分子 $|N(i) \cap N(j)|$ 是同時喜歡物品i和物品j的用戶數。這個公式懲罰了熱門物品j的權重,減輕了熱門物品會和不少物品類似的可能性(不然 $w_{ij}$ 的值會很大,接近於1)。這個公式說明兩個物品產生類似度是由於它們共同被不少用戶喜歡,也就是說每一個用戶均可以經過他們的歷史興趣列表給物品「貢獻」類似度。
和UserCF算法相似,用ItemCF算法計算物品類似度時也能夠首先創建用戶-物品倒排表,即對每一個用戶創建一個包含他喜歡的物品的列表,而後對於每一個用戶,將他物品列表中的物品兩兩在共現矩陣C中加1,最終就能夠獲得全部物品之間不爲0的 $C[i][j]$,也就是公式中的分子。
在獲得物品之間的類似度後,ItemCF經過以下公式計算用戶u對一個物品i的興趣:$p(u,i) = \sum_{j \in N(u) \cap S(i,K)} w_{ji} r_{uj}$
其中,N(u)是用戶喜歡的物品集合,S(i,K)是和物品i最類似的K個物品的集合,$w_{ji}$ 是物品j和物品i的類似度,$r_{uj}$ 是用戶u對物品j的興趣,對於隱反饋數據集,若是用戶u對物品j有過行爲,便可令 $r_{uj}$ 爲1。
根據以上思路,使用Python實現ItemCF算法的代碼以下:
001 |
import math |
002 |
import random |
003 |
class ItemBasedCF: |
004 |
def __init__( self , datafile = None ): |
005 |
self .datafile = datafile |
006 |
self .readData() |
007 |
self .splitData( 3 , 47 ) |
008 |
|
009 |
def readData( self ,datafile = None ): |
010 |
self .datafile = datafile or self .datafile |
011 |
self .data = [] |
012 |
file = open ( self .datafile, 'r' ) |
013 |
for line in file .readlines()[ 0 : 100 * 1000 ]: |
014 |
userid, itemid, record,_ = line.split() |
015 |
self .data.append((userid,itemid, int (record))) |
016 |
|
017 |
def splitData( self ,k,seed,data = None ,M = 8 ): |
018 |
self .testdata = {} |
019 |
self .traindata = {} |
020 |
data = data or self .data |
021 |
random.seed(seed) |
022 |
for user,item,record in self .data: |
023 |
if random.randint( 0 , 7 ) = = k: |
024 |
self .testdata.setdefault(item,{}) |
025 |
self .testdata[item][user] = record |
026 |
else : |
027 |
self .traindata.setdefault(item,{}) |
028 |
self .traindata[item][user] = record |
029 |
|
030 |
def ItemSimilarity( self , train = None ): |
031 |
train = train or self .traindata |
032 |
self .itemSim = dict () |
033 |
#user_items = dict() |
034 |
item_user_count = dict () #item_user_count{item: likeCount} the number of users who like the item |
035 |
count = dict () #count{i:{j:value}} the number of users who both like item i and j |
036 |
for user, item in train.items(): #initialize the user_items{user: items} |
037 |
for i in item.keys(): |
038 |
item_user_count.setdefault(i, 0 ) |
039 |
item_user_count[i] + = 1 |
040 |
for j in item.keys(): |
041 |
if i = = j: |
042 |
continue |
043 |
count.setdefault(i,{}) |
044 |
count[i].setdefault(j, 0 ) |
045 |
count[i][j] + = 1 |
046 |
for i, related_items in count.items(): |
047 |
self .itemSim.setdefault(i, dict ()) |
048 |
for j, cuv in related_items.items(): |
049 |
self .itemSim[i].setdefault(j, 0 ) |
050 |
self .itemSim[i][j] = cuv / math.sqrt(item_user_count[i] * item_user_count[j] * 1.0 ) |
051 |
|
052 |
def recommend( self ,user,train = None , k = 8 , nitem = 40 ): |
053 |
train = train or self .traindata |
054 |
rank = dict () |
055 |
ru = train.get(user,{}) |
056 |
for i,pi in ru.items(): |
057 |
for j,wj in sorted ( self .itemSim[i].items(), key = lambda x:x[ 1 ], reverse = True )[ 0 :k]: |
058 |
if j in ru: |
059 |
continue |
060 |
rank.setdefault(j, 0 ) |
061 |
rank[j] + = wj |
062 |
#print dict(sorted(rank.items(), key = lambda x:x[1], reverse = True)[0:nitem]) |
063 |
return dict ( sorted (rank.items(), key = lambda x:x[ 1 ], reverse = True )[ 0 :nitem]) |
064 |
|
065 |
def recallAndPrecision( self ,train = None ,test = None ,k = 8 ,nitem = 10 ): |
066 |
train = train or self .traindata |
067 |
test = test or self .testdata |
068 |
hit = 0 |
069 |
recall = 0 |
070 |
precision = 0 |
071 |
for user in train.keys(): |
072 |
tu = test.get(user,{}) |
073 |
rank = self .recommend(user,train = train,k = k,nitem = nitem) |
074 |
for item,_ in rank.items(): |
075 |
if item in tu: |
076 |
hit + = 1 |
077 |
recall + = len (tu) |
078 |
precision + = nitem |
079 |
return (hit / (recall * 1.0 ),hit / (precision * 1.0 )) |
080 |
|
081 |
def coverage( self ,train = None ,test = None ,k = 8 ,nitem = 10 ): |
082 |
train = train or self .traindata |
083 |
test = test or self .testdata |
084 |
recommend_items = set () |
085 |
all_items = set () |
086 |
for user in train.keys(): |
087 |
for item in train[user].keys(): |
088 |
all_items.add(item) |
089 |
rank = self .recommend(user, train, k = k, nitem = nitem) |
090 |
for item,_ in rank.items(): |
091 |
recommend_items.add(item) |
092 |
return len (recommend_items) / ( len (all_items) * 1.0 ) |
093 |
|
094 |
def popularity( self ,train = None ,test = None ,k = 8 ,nitem = 10 ): |
095 |
""" |
096 |
Get the popularity |
097 |
the algorithm on page 44 |
098 |
""" |
099 |
train = train or self .traindata |
100 |
test = test or self .testdata |
101 |
item_popularity = dict () |
102 |
for user ,items in train.items(): |
103 |
for item in items.keys(): |
104 |
item_popularity.setdefault(item, 0 ) |
105 |
item_popularity[item] + = 1 |
106 |
ret = 0 |
107 |
n = 0 |
108 |
for user in train.keys(): |
109 |
rank = self .recommend(user, train, k = k, nitem = nitem) |
110 |
for item ,_ in rank.items(): |
111 |
ret + = math.log( 1 + item_popularity[item]) |
112 |
n + = 1 |
113 |
return ret / (n * 1.0 ) |
114 |
|
115 |
def testRecommend(): |
116 |
ubcf = ItemBasedCF( 'u.data' ) |
117 |
ubcf.readData() |
118 |
ubcf.splitData( 4 , 100 ) |
119 |
ubcf.ItemSimilarity() |
120 |
user = "345" |
121 |
rank = ubcf.recommend(user,k = 3 ) |
122 |
for i,rvi in rank.items(): |
123 |
items = ubcf.testdata.get(user,{}) |
124 |
record = items.get(i, 0 ) |
125 |
print "%5s: %.4f--%.4f" % (i,rvi,record) |
126 |
|
127 |
def testItemBasedCF(): |
128 |
cf = ItemBasedCF( 'u.data' ) |
129 |
cf.ItemSimilarity() |
130 |
print "%3s%20s%20s%20s%20s" % ( 'K' , "recall" , 'precision' , 'coverage' , 'popularity' ) |
131 |
for k in [ 5 , 10 , 20 , 40 , 80 , 160 ]: |
132 |
recall,precision = cf.recallAndPrecision( k = k) |
133 |
coverage = cf.coverage(k = k) |
134 |
popularity = cf.popularity(k = k) |
135 |
print "%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (k,recall * 100 ,precision * 100 ,coverage * 100 ,popularity) |
136 |
|
137 |
if __name__ = = "__main__" : |
138 |
testItemBasedCF() |
UserCF給用戶推薦那些和他有共同興趣愛好的用戶喜歡的物品,而ItemCF給用戶推薦那些和他以前喜歡的物品相似的物品。從這個原理能夠看到,UserCF的推薦結果着重於反映和用戶興趣類似的小羣體的熱點,而ItemCF的推薦結果着重於維繫用戶的歷史興趣。UserCF的推薦更社會化,反映了用戶所在的小型興趣羣體中物品的熱門程度,而ItemCF的推薦更加個性化,反映了用戶本身的興趣傳承。同時,從技術上來講,UserCF須要維護一個用戶類似度的矩陣,而ItemCF須要維護一個物品類似度矩陣。從存儲的角度來講,若是用戶不少,那麼維護用戶興趣類似度矩陣須要很大的空間,同理,若是物品不少,維護物品類似度矩陣代價較大。
對於UserCF和ItemCF,咱們採用http://www.grouplens.org/node/73 的數據集進行測試,使用準確率/召回率、覆蓋率和流行度對實驗結果進行評測。
對用戶u推薦N個物品R(u),令用戶u在測試集上喜歡的物品集合爲T(u),則:
召回率描述有多少比例的用戶-物品評分記錄包含在最終的推薦列表中,而準確率描述最終的推薦列表中有多少比例是發生過的用戶-物品評分記錄。
覆蓋率表示最終的推薦列表中包含多大比例的物品,若是全部的物品都被推薦給至少一個用戶,那麼覆蓋率就是100%。
最後還須要評測推薦的新穎度,這裏用推薦列表中物品的平均流行度度量推薦結果的新穎度,若是推薦出的物品都很熱門,說明推薦的新穎度較低,不然說明推薦結果比較新穎。
圖1 UserCF實驗結果
圖2 ItemCF實驗結果
對於以上UserCF和ItemCF的實驗結果能夠看出,推薦系統的精度指標(準確率和召回率)並不和參數k成線性關係。推薦結果的精度對k也不是特別敏感,只要選在必定的區域內,就能夠得到不錯的精度。
對於覆蓋率而言,k越大則推薦結果的覆蓋率越低,覆蓋率的下降是由於流行度的增長,隨着流行度增長,推薦算法愈來愈傾向於推薦熱門的物品,這是由於k決定了推薦算法在作推薦時參考多少和你興趣類似的其餘用戶的興趣,若是k越大,參考的人或者物品越多,結果就愈來愈趨近於全局熱門的物品。