from surprise import KNNBasic, SVD
from surprise import Dataset
from surprise import evaluate, print_perf

# Use the public MovieLens benchmark dataset (ml-100k) for recommendation.
# NOTE(review): `evaluate`/`print_perf` and `data.split` belong to the old
# surprise (<1.1) API; recent releases replaced them with
# surprise.model_selection.cross_validate -- confirm the installed version.
data = Dataset.load_builtin('ml-100k')

# k-fold cross-validation (3 folds).
data.split(n_folds=3)

# Factorize with SVD and report RMSE / MAE on every fold.
algo = SVD()
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

# Pretty-print the per-fold and mean scores.
print_perf(perf)
Evaluating RMSE, MAE of algorithm SVD. ------------ Fold 1 RMSE: 0.9506 MAE: 0.7511 ------------ Fold 2 RMSE: 0.9452 MAE: 0.7456 ------------ Fold 3 RMSE: 0.9442 MAE: 0.7444 ------------ ------------ Mean RMSE: 0.9467 Mean MAE : 0.7470 ------------ ------------ Fold 1 Fold 2 Fold 3 Mean RMSE 0.9506 0.9452 0.9442 0.9467 MAE 0.7511 0.7456 0.7444 0.7470
咱們使用sklearn經常使用到的網格搜索交叉驗證(GridSearchCV)來選擇最優的參數
# Hyper-parameter tuning, mirroring sklearn's GridSearchCV workflow.
from surprise import GridSearch

# Epoch count, learning rate, and regularization strength:
# three parameters with two candidates each, 2^3 = 8 combinations.
param_grid = {'n_epochs': [5, 10],
              'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

# Tune SVD over those three parameters; score each combination by
# RMSE and by FCP (fraction of concordant pairs).
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])

data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)
grid_search.evaluate(data)
Running grid search for the following parameter combinations: {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4} {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6} {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4} {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6} {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4} {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6} {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4} {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
Resulsts: {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4} {'RMSE': 0.9973640543212537, 'FCP': 0.6834505918617332} ---------- {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6} {'RMSE': 1.0033367804212159, 'FCP': 0.6863671726311678} ---------- {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4} {'RMSE': 0.9740022047005671, 'FCP': 0.693822773157699} ---------- {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6} {'RMSE': 0.9828360526820644, 'FCP': 0.6939377853330241} ---------- {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4} {'RMSE': 0.9783154591562983, 'FCP': 0.6919014896389958} ---------- {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6} {'RMSE': 0.9863470326305794, 'FCP': 0.6925580320424597} ---------- {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4} {'RMSE': 0.9641597864074152, 'FCP': 0.6973875277009212} ---------- {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6} {'RMSE': 0.9740231673256359, 'FCP': 0.6976928768968366}
# Best parameter combination found by the grid search:
# first the best RMSE score itself ...
print(grid_search.best_score['RMSE'])
# ... then the parameter set that achieved it.
print(grid_search.best_params['RMSE'])
0.9641597864074152 {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
# Best FCP score over the grid ...
print(grid_search.best_score['FCP'])
# ... and the parameter set that achieved it.
print(grid_search.best_params['FCP'])
0.6983253171588012 {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
該如何作?
1. 載入本身的數據集
import os

from surprise import Reader, Dataset

# Path to our own ratings file.
# NOTE(review): the file name spells "suprise" -- kept as-is because it must
# match the file on disk.
file_path = os.path.expanduser('./popular_music_suprise_format.txt')

# Describe the layout of each raw line so surprise can parse it.
reader = Reader(line_format='user item rating timestamp', sep=',')

# Load the ratings from the file.
music_data = Dataset.load_from_file(file_path, reader=reader)

# Split into 5 folds for cross-validation.
music_data.split(n_folds=5)
2. 使用不一樣的推薦算法進行建模比較
from surprise import (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                      KNNBaseline, SVD, SVDpp, NMF, evaluate)

# Benchmark a range of surprise algorithms on the same 5-fold split.
# Each iteration builds a fresh model and prints its per-fold RMSE / MAE,
# exactly as the original one-stanza-per-algorithm script did.
for algorithm_cls in (NormalPredictor,   # rates from a fitted normal distribution
                      BaselineOnly,      # user/item baseline estimates only
                      KNNBasic,          # basic collaborative filtering
                      KNNWithMeans,      # mean-centered collaborative filtering
                      KNNBaseline,       # collaborative filtering with baselines
                      SVD,               # matrix factorization
                      SVDpp,             # SVD++
                      NMF):              # non-negative matrix factorization
    algo = algorithm_cls()
    perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

# Pretty-print the scores of the last run (NMF), matching the original script.
print_perf(perf)
# 在協同過濾算法建模之後,根據item取回類似度最高的item # 使用的是 algo.get_neighbors() from __future__ import (absolute_import, division, print_function, unicode_literals) import os import io from surprise import KNNBaseline from surprise import Dataset
# 獲取電影名到電影id 和 電影id到電影名的映射 def read_item_names(): file_name = (os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u.item') rid_to_name = {} name_to_rid = {} with io.open(file_name, 'r', encoding='ISO-8859-1') as f: for line in f: line = line.split('|') rid_to_name[line[0]] = line[1] name_to_rid[line[1]] = line[0] return rid_to_name, name_to_rid # 用算法計算相互間的類似度 data = Dataset.load_builtin('ml-100k') trainest = data.build_full_trainset() sim_options = {'name': 'pearson_baseline', 'user_based': False} algo = KNNBaseline(sim_options=sim_options) algo.train(trainest)
Estimating biases using als... Computing the pearson_baseline similarity matrix... Done computing similarity matrix.
# Build the raw-id <-> title mappings, then look up Toy Story's raw id.
rid_to_name, name_to_rid = read_item_names()

toy_story_raw_id = name_to_rid['Toy Story (1995)']
# Echo the value (notebook-style cell output).
toy_story_raw_id
'1'
# Translate the raw ml-100k id into the trainset's internal id,
# which is what get_neighbors() expects.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
# Echo the value (notebook-style cell output).
toy_story_inner_id
24
# The 10 items most similar to Toy Story, returned as internal ids.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
# Echo the value (notebook-style cell output).
toy_story_neighbors
[433, 101, 302, 309, 971, 95, 26, 561, 816, 347]
# Map the neighbors' internal ids back to raw ids, then raw ids to titles.
toy_story_neighbors = [rid_to_name[algo.trainset.to_raw_iid(inner_id)]
                       for inner_id in toy_story_neighbors]

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)
The 10 nearest neighbors of Toy Story are: Beauty and the Beast (1991) Raiders of the Lost Ark (1981) That Thing You Do! (1996) Lion King, The (1994) Craft, The (1996) Liar Liar (1997) Aladdin (1992) Cool Hand Luke (1967) Winnie the Pooh and the Blustery Day (1968) Indiana Jones and the Last Crusade (1989)
參考文章:https://blog.csdn.net/mycafe_/article/details/79146764