Python推薦系統庫--Surprise實戰

時間 2019-11-26

原文鏈接

1、使用movieLens數據集

from surprise import KNNBasic, SVD
from surprise import Dataset
from surprise import evaluate, print_perf

# Load the public MovieLens 100k recommender-system dataset.
data = Dataset.load_builtin('ml-100k')
# Split the data into 3 folds for k-fold cross-validation.
# NOTE(review): data.split(), evaluate() and print_perf() are the legacy
# Surprise API; newer releases appear to replace them with
# surprise.model_selection.cross_validate -- confirm against the installed version.
data.split(n_folds=3)
# Use the SVD algorithm.
algo = SVD()
# Evaluate on the folds, computing RMSE (root mean squared error) and
# MAE (mean absolute error).
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
# Print the per-fold scores and their mean.
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9506
MAE:  0.7511
------------
Fold 2
RMSE: 0.9452
MAE:  0.7456
------------
Fold 3
RMSE: 0.9442
MAE:  0.7444
------------
------------
Mean RMSE: 0.9467
Mean MAE : 0.7470
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.9506  0.9452  0.9442  0.9467  
MAE     0.7511  0.7456  0.7444  0.7470

2、算法調參

我們使用 sklearn 中常用的網格搜索交叉驗證（GridSearchCV）方法來爲算法選擇最優參數

# Hyper-parameter tuning with a grid search.
# NOTE(review): GridSearch and data.split() are the legacy Surprise API; newer
# releases appear to provide surprise.model_selection.GridSearchCV instead
# -- confirm against the installed version.
from surprise import GridSearch
# Hyper-parameters searched: number of epochs, learning rate and (presumably)
# the regularization term -- TODO confirm the meaning of 'reg_all'.
# Three hyper-parameters with two candidate values each: 2^3 = 8 combinations.
param_grid = {'n_epochs':[5, 10], 'lr_all':[0.002, 0.005],
             'reg_all':[0.4, 0.6]}

# Tune SVD over the three hyper-parameters, scoring each combination with
# RMSE and FCP (fraction of concordant pairs).
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])
# Reload the dataset and split it into 3 folds for the search.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# Run the search over every parameter combination.
grid_search.evaluate(data)

Running grid search for the following parameter combinations:
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}

Resulsts:
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.9973640543212537, 'FCP': 0.6834505918617332}
----------
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 1.0033367804212159, 'FCP': 0.6863671726311678}
----------
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.9740022047005671, 'FCP': 0.693822773157699}
----------
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.9828360526820644, 'FCP': 0.6939377853330241}
----------
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.9783154591562983, 'FCP': 0.6919014896389958}
----------
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 0.9863470326305794, 'FCP': 0.6925580320424597}
----------
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.9641597864074152, 'FCP': 0.6973875277009212}
----------
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.9740231673256359, 'FCP': 0.6976928768968366}

# Report the best parameter set found by the grid search.
# Print the best RMSE score.
print(grid_search.best_score['RMSE'])

# Print the parameter combination that produced the best RMSE.
print(grid_search.best_params['RMSE'])

0.9641597864074152
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}

# Print the best FCP score.
print(grid_search.best_score['FCP'])

# Print the parameter combination that produced the best FCP.
print(grid_search.best_params['FCP'])

0.6983253171588012
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}

在自己的數據集上訓練模型

該如何做？

1. 載入自己的數據集

import os
from surprise import Reader, Dataset
# Path of the ratings file.
# Note: os.path.expanduser only expands a leading '~', so this relative
# './...' path is returned unchanged.
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# Describe the file format: four comma-separated fields per line, in the
# order user, item, rating, timestamp.
reader = Reader(line_format='user item rating timestamp', sep=',')
# Load the ratings from the file.
music_data = Dataset.load_from_file(file_path, reader=reader)
# Split into 5 folds.
# NOTE(review): split() is the legacy Surprise API -- confirm it exists in
# the installed version.
music_data.split(n_folds=5)

2. 使用不同的推薦算法進行建模比較

# Each section below evaluates one algorithm on music_data with RMSE and MAE.
# `perf` is overwritten by every section, and print_perf() is only called
# once at the very end, so only the final (NMF) result is printed there.
# NOTE(review): evaluate()/print_perf() are the legacy Surprise API; newer
# releases appear to replace them with surprise.model_selection.cross_validate
# -- confirm against the installed version.

### NormalPredictor
from surprise import NormalPredictor, evaluate
algo = NormalPredictor()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### BaselineOnly
from surprise import BaselineOnly, evaluate
algo = BaselineOnly()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### Basic collaborative filtering (KNNBasic)
from surprise import KNNBasic, evaluate
algo = KNNBasic()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### Collaborative filtering with means (KNNWithMeans)
from surprise import KNNWithMeans, evaluate
algo = KNNWithMeans()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### Collaborative filtering with baseline (KNNBaseline)
from surprise import KNNBaseline, evaluate
algo = KNNBaseline()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### SVD
from surprise import SVD, evaluate
algo = SVD()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### SVD++
from surprise import SVDpp, evaluate
algo = SVDpp()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])

### NMF
# Relies on evaluate and print_perf having been imported by an earlier snippet.
from surprise import NMF
algo = NMF()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])
print_perf(perf)

推薦系統--不同電影之間的相似度

1、載入數據，使用算法算出相互間的相似度

# 在協同過濾算法建模之後，根據item取回相似度最高的item
# 使用的是 algo.get_neighbors()

from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io

from surprise import KNNBaseline
from surprise import Dataset

# 獲取電影名到電影id 和 電影id到電影名的映射
def read_item_names():
    """Build the two lookup tables between raw item ids and movie titles.

    Reads the '|'-separated ``u.item`` file from the ``.surprise_data``
    directory under the user's home directory. On every line the first
    field is used as the raw item id and the second field as the title.

    Returns:
        A ``(rid_to_name, name_to_rid)`` tuple of dicts: raw id -> title
        and title -> raw id. Ids are kept as strings, exactly as read.
    """
    item_path = os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u.item'
    with io.open(item_path, 'r', encoding='ISO-8859-1') as handle:
        # Materialise the (id, title) pairs while the file is still open.
        pairs = [
            (fields[0], fields[1])
            for fields in (row.split('|') for row in handle)
        ]
    rid_to_name = {raw_id: title for raw_id, title in pairs}
    name_to_rid = {title: raw_id for raw_id, title in pairs}
    return rid_to_name, name_to_rid

# Compute the similarities between items.
data = Dataset.load_builtin('ml-100k')
# Build a trainset from the whole dataset (no cross-validation folds here).
trainest = data.build_full_trainset()
# 'user_based': False requests item-item similarities, computed with the
# pearson_baseline measure.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
# fit() is the current name of the training entry point; the older train()
# method was deprecated and later removed from Surprise.
algo.fit(trainest)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

# Build the title -> raw id and raw id -> title lookup tables.
rid_to_name, name_to_rid = read_item_names()

# Look up the raw id of Toy Story (the id used in the data file, not the
# trainset's inner id).
toy_story_raw_id = name_to_rid['Toy Story (1995)']
# Bare expression: displays the value when run interactively.
toy_story_raw_id

'1'

# Convert the raw item id into the trainset's inner item id, which is what
# get_neighbors() is given below.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id

# Fetch the 10 nearest neighbours of Toy Story (returned as inner ids).
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
toy_story_neighbors

[433, 101, 302, 309, 971, 95, 26, 561, 816, 347]

2、獲取相似度最高的10部電影

# Convert the neighbours' inner ids into movie titles.
# Step 1: inner id -> raw id (a lazy generator; nothing is computed yet).
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors)

# Step 2: raw id -> title, chained lazily on top of the previous generator.
toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
# Iterating here drives both generators; they are exhausted afterwards.
for movie in toy_story_neighbors:
    print(movie)

The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)

參考文章：https://blog.csdn.net/mycafe_/article/details/79146764

相關標籤/搜索