Mahout學習(主要學習內容是Mahout中推薦部分的ItemCF、UserCF、Hadoop集羣部署運行)
一、Mahout是什麼?
二、Mahout是用來幹嗎的?
2.1 推薦引擎
服務商或網站會根據你過去的行爲爲你推薦書籍、電影或文章。
2.2 聚類
Google news使用聚類技術通過標題把新聞文章進行分組,從而按照邏輯線索來顯示新聞,而不是給出所有新聞的原始列表。
2.3 分類
雅虎郵箱基於用戶之前對正常郵件和垃圾郵件的報告,以及電子郵件自身的特徵,來判別到來的消息是不是垃圾郵件。
三、Mahout協同過濾算法
Mahout使用了Taste來提供協同過濾算法的實現,它是一個基於Java實現的可擴展的、高效的推薦引擎。Taste既實現了最基本的基於用戶的和基於內容的推薦算法,同時也提供了擴展接口,使用戶可以方便地定義和實現自己的推薦算法。同時,Taste不僅僅適用於Java應用程序,它還可以作爲內部服務器的一個組件,以HTTP和Web Service的形式向外界提供推薦的邏輯。Taste的設計使它能滿足企業對推薦引擎在性能、靈活性和可擴展性等方面的要求。
Taste主要包括以下幾個接口:
四、Mahout協同過濾算法編程
一、建立maven項目
二、導入mahout依賴
<!-- Mahout 0.11.1 dependencies for the examples below.
     slf4j-log4j12 is excluded from mahout-examples.
     NOTE(review): org.apache.mahout:mahout looks like the parent/aggregator
     artifact; confirm it resolves as a regular (jar) dependency. -->
<dependencies>
    <dependency>
        <groupId>org.apache.mahout</groupId>
        <artifactId>mahout</artifactId>
        <version>0.11.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.mahout</groupId>
        <artifactId>mahout-examples</artifactId>
        <version>0.11.1</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
</dependencies>
三、下載電影評分數據
下載地址:http://grouplens.org/datasets/movielens/
數據類別:ml-10M100K數據集,約7.2萬用戶對約1萬部電影的1000萬條評分和10萬個標籤數據
四、基於用戶的推薦
package com.ahu.learnmahout;

import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;
import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel;

import java.io.File;
import java.util.List;

/**
 * User-based collaborative filtering example: loads the movie ratings file,
 * builds a user-based recommender and prints 10 recommendations for user 5.
 *
 * <p>Created by ahu_lichang on 2017/6/23.
 */
public class BaseUserRecommender {

    /** Ratings file used when no path is given on the command line. */
    private static final String DEFAULT_RATINGS_PATH = "E:\\ml-10M100K\\ratings.dat";

    /** Size of the fixed-size user neighborhood. */
    private static final int NEIGHBORHOOD_SIZE = 100;

    /** User the recommendations are computed for. */
    private static final long USER_ID = 5L;

    /** Number of items to recommend. */
    private static final int HOW_MANY = 10;

    /**
     * Runs the example.
     *
     * @param args optional; {@code args[0]} overrides the path of the ratings file,
     *             otherwise {@link #DEFAULT_RATINGS_PATH} is used
     * @throws Exception if the data cannot be loaded or the recommendation fails
     */
    public static void main(String[] args) throws Exception {
        // Prepare the data: the movie ratings file.
        File file = new File(args.length > 0 ? args[0] : DEFAULT_RATINGS_PATH);
        // Load the data into memory; GroupLensDataModel targets the open movie rating data.
        DataModel dataModel = new GroupLensDataModel(file);
        // Compute similarity; many measures exist (Euclidean, Pearson, ...), Pearson is used here.
        UserSimilarity similarity = new PearsonCorrelationSimilarity(dataModel);
        // Compute the neighborhood. Neighbors can be chosen by fixed count or by
        // similarity threshold; a fixed count is used here.
        UserNeighborhood userNeighborhood =
                new NearestNUserNeighborhood(NEIGHBORHOOD_SIZE, similarity, dataModel);
        // Build the recommender. Collaborative filtering is either user-based or
        // item-based; this one is user-based.
        Recommender recommender =
                new GenericUserBasedRecommender(dataModel, userNeighborhood, similarity);
        // Recommend 10 movies to the user whose ID is 5.
        List<RecommendedItem> recommendedItemList = recommender.recommend(USER_ID, HOW_MANY);
        // Print the recommendations.
        System.out.println("使用基於用戶的協同過濾算法");
        System.out.println("爲用戶5推薦10個商品");
        for (RecommendedItem recommendedItem : recommendedItemList) {
            System.out.println(recommendedItem);
        }
    }
}
運行結果:
五、基於物品的推薦
package com.ahu.learnmahout;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel;

import java.io.File;
import java.util.List;

/**
 * Item-based collaborative filtering example: loads the movie ratings file and
 * prints the 10 items most similar to item 2398 and to item 34. The second
 * lookup is timed and its elapsed milliseconds are printed.
 *
 * <p>Created by ahu_lichang on 2017/6/24.
 */
public class BaseItemRecommender {

    /** Ratings file used when no path is given on the command line. */
    private static final String DEFAULT_RATINGS_PATH = "E:\\ml-10M100K\\ratings.dat";

    /** User named in the printed messages as the one browsing the item. */
    private static final long USER_ID = 5L;

    /** Number of similar items to list per query. */
    private static final int HOW_MANY = 10;

    /**
     * Runs the example.
     *
     * @param args optional; {@code args[0]} overrides the path of the ratings file,
     *             otherwise {@link #DEFAULT_RATINGS_PATH} is used
     * @throws Exception if the data cannot be loaded or a lookup fails
     */
    public static void main(String[] args) throws Exception {
        // Prepare the data: the movie ratings file.
        File file = new File(args.length > 0 ? args[0] : DEFAULT_RATINGS_PATH);
        // Load the data into memory; GroupLensDataModel targets the open movie rating data.
        DataModel dataModel = new GroupLensDataModel(file);
        // Compute similarity; many measures exist (Euclidean, Pearson, ...), Pearson is used here.
        ItemSimilarity itemSimilarity = new PearsonCorrelationSimilarity(dataModel);
        // Build the recommender. Collaborative filtering is either user-based or
        // item-based; this one is item-based.
        GenericItemBasedRecommender recommender =
                new GenericItemBasedRecommender(dataModel, itemSimilarity);

        printSimilarItems(recommender, 2398L);

        // Time the second lookup (query plus printing).
        long start = System.currentTimeMillis();
        printSimilarItems(recommender, 34L);
        System.out.println(System.currentTimeMillis() - start);
    }

    /**
     * Looks up and prints the items most similar to {@code itemId}.
     *
     * <p>Uses {@code mostSimilarItems} rather than {@code recommendedBecause}: the
     * latter lists items the user already has preferences for that were most
     * influential in recommending the given item, which is not "items similar to
     * the one being browsed" as the printed message states.
     *
     * @param recommender item-based recommender to query
     * @param itemId      item the user is currently browsing
     * @throws TasteException if the similarity lookup fails
     */
    private static void printSimilarItems(GenericItemBasedRecommender recommender, long itemId)
            throws TasteException {
        List<RecommendedItem> similarItems = recommender.mostSimilarItems(itemId, HOW_MANY);
        // Print the result.
        System.out.println("使用基於物品的協同過濾算法");
        System.out.println("根據用戶" + USER_ID + "當前瀏覽的商品" + itemId
                + ",推薦" + HOW_MANY + "個類似的商品");
        for (RecommendedItem similarItem : similarItems) {
            System.out.println(similarItem);
        }
    }
}
運行結果:
六、評估推薦模型
package com.ahu.learnmahout;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;
import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel;

import java.io.File;

/**
 * Scores a user-based recommender on the movie ratings data using the
 * average absolute difference evaluator and prints the resulting score.
 *
 * <p>Created by ahu_lichang on 2017/6/24.
 */
public class MyEvaluator {

    /**
     * Builds the recommender under evaluation: Pearson similarity with a
     * fixed-size neighborhood of 2 users.
     */
    private static final class UserBasedBuilder implements RecommenderBuilder {
        @Override
        public Recommender buildRecommender(DataModel model) throws TasteException {
            UserSimilarity pearson = new PearsonCorrelationSimilarity(model);
            UserNeighborhood nearestTwo = new NearestNUserNeighborhood(2, pearson, model);
            return new GenericUserBasedRecommender(model, nearestTwo, pearson);
        }
    }

    /**
     * Runs the evaluation and prints the score.
     *
     * @param args unused
     * @throws Exception if the data cannot be loaded or the evaluation fails
     */
    public static void main(String[] args) throws Exception {
        // Movie ratings file, loaded into memory through the GroupLens data model.
        File ratings = new File("E:\\ml-10M100K\\ratings.dat");
        DataModel ratingsModel = new GroupLensDataModel(ratings);

        // Evaluate by average absolute difference. RMSRecommenderEvaluator
        // (root-mean-square) is the alternative mentioned by the original author.
        RecommenderEvaluator averageDifference = new AverageAbsoluteDifferenceRecommenderEvaluator();
        RecommenderBuilder userBased = new UserBasedBuilder();

        // 70% of the data is used for training, the remaining 30% for testing.
        double trainingPercentage = 0.7;
        double evaluationPercentage = 1.0;
        double result = averageDifference.evaluate(
                userBased, null, ratingsModel, trainingPercentage, evaluationPercentage);

        // The smaller the score, the better the recommendations.
        System.out.println(result);
    }
}
七、獲取推薦的準確率和召回率
package com.ahu.learnmahout;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.eval.IRStatistics;
import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator;
import org.apache.mahout.cf.taste.impl.eval.GenericRecommenderIRStatsEvaluator;
import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;
import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel;

import java.io.File;

/**
 * Prints the precision and recall of a user-based recommender on the movie
 * ratings data, measured on the top-4 recommendations.
 *
 * <p>Created by ahu_lichang on 2017/6/24.
 */
public class MyIRStatistics {

    /**
     * Builds the recommender under evaluation: Pearson similarity with a
     * fixed-size neighborhood of 4 users.
     */
    private static final class UserBasedBuilder implements RecommenderBuilder {
        @Override
        public Recommender buildRecommender(DataModel model) throws TasteException {
            UserSimilarity pearson = new PearsonCorrelationSimilarity(model);
            UserNeighborhood nearestFour = new NearestNUserNeighborhood(4, pearson, model);
            return new GenericUserBasedRecommender(model, nearestFour, pearson);
        }
    }

    /**
     * Runs the evaluation and prints precision, then recall.
     *
     * @param args unused
     * @throws Exception if the data cannot be loaded or the evaluation fails
     */
    public static void main(String[] args) throws Exception {
        // Movie ratings file, loaded into memory through the GroupLens data model.
        File ratings = new File("E:\\ml-10M100K\\ratings.dat");
        DataModel ratingsModel = new GroupLensDataModel(ratings);

        RecommenderIRStatsEvaluator irEvaluator = new GenericRecommenderIRStatsEvaluator();
        RecommenderBuilder userBased = new UserBasedBuilder();

        // "Precision and recall at 4": evaluate on the top-4 recommendations.
        int at = 4;
        IRStatistics result = irEvaluator.evaluate(
                userBased,
                null,
                ratingsModel,
                null,
                at,
                GenericRecommenderIRStatsEvaluator.CHOOSE_THRESHOLD,
                1.0);

        System.out.println(result.getPrecision());
        System.out.println(result.getRecall());
    }
}
五、Mahout運行在Hadoop集羣
1、Hadoop 執行腳本
# Run Mahout's item-based RecommenderJob on Hadoop with log-likelihood similarity.
hadoop jar mahout-examples-0.9-job.jar \
    org.apache.mahout.cf.taste.hadoop.item.RecommenderJob \
    --input /sanbox/movie/10M.txt \
    --output /sanbox/movie/r \
    -s SIMILARITY_LOGLIKELIHOOD
參數說明
2、執行結果
上面命令運行完成以後,會在當前用戶的hdfs主目錄生成temp目錄,該目錄可由 --tempDir (path) 參數設置.
後期學習補充:
Mahout 是基於Hadoop的機器學習和數據挖掘的一個分佈式框架。Mahout用MapReduce實現了部分數據挖掘算法,解決了並行挖掘的問題。
Mahout應用場景: