應用場景:預測反欺詐用戶算法
一、構建用戶畫像,用戶畫像由多種業務指標組成(例如用戶申請後回訪次數、ip城市與gps城市是否一致等等)
二、提取用戶畫像的業務指標--->將指標數值變成向量和矩陣
三、先驗數據集:總數據條數:75568 正常用戶條數:72723 確定欺詐用戶條數:2845
訓練數據集條數:60162 預測數據集條數:15406
四、使用RandomForestClassifier算法優化
代碼如下
package mllib

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature._
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Created by dongdong on 17/6/16.
 *
 * One parsed row of the user-profile feature table.
 *
 * Field names do not all match their source column index:
 *  - `cid` and `label` are input columns 0 and 1;
 *  - `f2` .. `f29` are input columns 2 .. 29;
 *  - `f30`, `f31`, `f32` are input columns 35, 36, 37;
 *  - `text` is input columns 40 and 41 joined with `|`;
 *  - `f38`, `f39` are input columns 47 and 48.
 */
case class Feature(cid: String,
                   label: String,
                   f2: Double,
                   f3: Double,
                   f4: Double,
                   f5: Double,
                   f6: Double,
                   f7: Double,
                   f8: Double,
                   f9: Double,
                   f10: Double,
                   f11: Double,
                   f12: Double,
                   f13: Double,
                   f14: Double,
                   f15: Double,
                   f16: Double,
                   f17: Double,
                   f18: Double,
                   f19: Double,
                   f20: Double,
                   f21: Double,
                   f22: Double,
                   f23: Double,
                   f24: Double,
                   f25: Double,
                   f26: Double,
                   f27: Double,
                   f28: Double,
                   f29: Double,
                   f30: Double,
                   f31: Double,
                   f32: Double,
                   text: String,
                   f38: Double,
                   f39: Double
                  )

/**
 * Trains a Spark ML pipeline (StringIndexer -> RegexTokenizer -> Word2Vec ->
 * VectorAssembler -> RandomForestClassifier -> IndexToString) on the feature
 * table and reports how many test rows of each label were predicted correctly.
 */
object UserProfile_Forest {

  /** Column delimiter of the input text files (the 0x01 control character). */
  private val FieldSeparator = "\u0001"

  /** Marker found in the input where a value is missing; numeric columns map it to 0. */
  private val NullMarker = "\\N"

  /** Highest column index read by [[parseLine]] is 48, so 49 columns are required. */
  private val RequiredColumns = 49

  /**
   * Parses one delimited input line into a [[Feature]].
   *
   * @param line a raw line whose fields are separated by the 0x01 character
   * @return the parsed feature row
   * @throws IllegalArgumentException if the line has fewer than 49 fields
   * @throws NumberFormatException    if a numeric field is neither a number nor the null marker
   */
  private def parseLine(line: String): Feature = {
    // Limit -1 keeps trailing empty fields, so the column count below is accurate.
    val cols = line.split(FieldSeparator, -1)
    require(
      cols.length >= RequiredColumns,
      s"expected at least $RequiredColumns fields but found ${cols.length} in line: $line")

    // Same conversion the original applied to every numeric column.
    def num(i: Int): Double = cols(i).replace(NullMarker, "0").toDouble

    Feature(
      cid = cols(0),
      label = cols(1),
      f2 = num(2),
      f3 = num(3),
      f4 = num(4),
      f5 = num(5),
      f6 = num(6),
      f7 = num(7),
      f8 = num(8),
      f9 = num(9),
      f10 = num(10),
      f11 = num(11),
      f12 = num(12),
      f13 = num(13),
      f14 = num(14),
      f15 = num(15),
      f16 = num(16),
      f17 = num(17),
      f18 = num(18),
      f19 = num(19),
      f20 = num(20),
      f21 = num(21),
      f22 = num(22),
      f23 = num(23),
      f24 = num(24),
      f25 = num(25),
      f26 = num(26),
      f27 = num(27),
      f28 = num(28),
      f29 = num(29),
      // Columns 30-34 are skipped; f30..f32 read columns 35..37.
      f30 = num(35),
      f31 = num(36),
      f32 = num(37),
      text = cols(40) + "|" + cols(41),
      // Columns 44-46 were disabled in the original code and stay unused.
      f38 = num(47),
      f39 = num(48)
    )
  }

  /**
   * Entry point: reads the feature table, trains on a random 80% split and
   * prints the number of correctly predicted rows per label on the other 20%.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val inpath = "/user/hive/warehouse/user_profile_tmp_db.db/t_cid_feature/*"

    val spark = SparkSession
      .builder()
      .master("local[3]")
      .appName("UserProfile_Forest")
      .getOrCreate()

    // Stop the session even when reading, training or scoring fails.
    try {
      import spark.implicits._

      // Build a dataset and read data
      val originalData = spark.sparkContext
        .textFile(inpath)
        .map(line => parseLine(line))
        .toDS

      // Build label index; fitted on the full dataset, before the split.
      val labelIndexer = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("indexedLabel")
        .fit(originalData)

      // Split the combined text column back into its two parts.
      val tokenizer = new RegexTokenizer()
        .setInputCol("text")
        .setOutputCol("words")
        .setPattern("\\|")

      val word2Vec = new Word2Vec()
        .setInputCol("words")
        .setOutputCol("feature_one")
        .setVectorSize(100)
        //.setMinCount(1)
        .setMaxIter(20)

      // f2 .. f32, then the Word2Vec output, then f38 and f39 (same order as before).
      val featureColumns: Array[String] =
        ((2 to 32).map(i => s"f$i") ++ Seq("feature_one", "f38", "f39")).toArray

      val vectorAssembler = new VectorAssembler()
        .setInputCols(featureColumns)
        .setOutputCol("featureVector")

      // Number of trees is deliberately kept at 1, as in the original.
      val rfClassifier = new RandomForestClassifier()
        .setLabelCol("indexedLabel")
        .setFeaturesCol("featureVector")
        .setNumTrees(1)

      // Map the predicted index back to the original label string.
      val labelConverter = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("predictedLabel")
        .setLabels(labelIndexer.labels)

      // NOTE(review): no seed is passed, so the split differs between runs.
      val Array(trainingData, testData) = originalData.randomSplit(Array(0.8, 0.2))

      // Build pipeline
      val pipeline = new Pipeline().setStages(
        Array(labelIndexer, tokenizer, word2Vec, vectorAssembler, rfClassifier, labelConverter))

      // Train
      val model = pipeline.fit(trainingData)

      // Predict
      val predictionResultDF = model.transform(testData)

      // Rows whose predicted label equals the true label.
      val correctPredictions = predictionResultDF
        .select("cid", "label", "predictedLabel")
        .filter($"label" === $"predictedLabel")

      // Correctly predicted bad users (label 1)
      val correctBadUsers = correctPredictions.filter($"label" === 1).count()

      // Correctly predicted good users (label 0)
      val correctGoodUsers = correctPredictions.filter($"label" === 0).count()

      // The original computed these counts but never reported them.
      println(s"correctly predicted bad users: $correctBadUsers")
      println(s"correctly predicted good users: $correctGoodUsers")
    } finally {
      spark.stop()
    }
  }
}
debug時的一些向量特徵如下
+--------------------+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+----+---+---+---+-----+---+---+------------+--------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+-----------------------------------------+----------+--------------+ |cid |label|f2 |f3 |f4 |f5 |f6 |f7 |f8 |f9 |f10|f11|f12|f13|f14|f15|f16|f17|f18|f19|f20|f21|f22|f23|f24 |f25|f26 |f27|f28|f29 |f30|f31|f32|text |f38|f39|indexedLabel|words |feature_one |featureVector |rawPrediction |probability |prediction|predictedLabel| +--------------------+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+----+---+---+---+-----+---+---+------------+--------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+-----------------------------------------+----------+--------------+ |2**60327000*0017**12|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**050300000031**55|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**051800000043**09|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 
|0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**051900000044**35|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**052100000047**47|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**052600000051**75|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**053100000057**95|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**060100000057**90|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**060300000060**02|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, 
\n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**061500000072**13|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**061700000073**10|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**061700000074**37|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**061900000077**27|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**062100000080**02|1 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|1.0|0.0|0.0|0.0|1.0|1.0|0.0|0.0|21.0|1.0|20.0|0.0|0.0|18.0|1.0|1.0|1.0|\N|\N|0.5|1.0|1.0 |[\n, \n]|[-0.6971070766448975]|(34,[14,18,19,22,23,24,27,28,29,30,31,32,33],[1.0,1.0,1.0,21.0,1.0,20.0,18.0,1.0,1.0,1.0,-0.6971070766448975,0.5,1.0]) |[0.8320209973753281,0.1679790026246719] |[0.8320209973753281,0.1679790026246719] |0.0 |0 | |20**062400000083**16|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 
|0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**062400000084**81|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**070500000098**50|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**070600000099**12|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | |20**070900000102**72|0 |1.0|0.0|0.0|0.0|0.0|0.0|1.0|0.0|0.0|1.0|0.0|0.0|0.0|1.0|1.0|1.0|0.0|0.0|1.0|1.0|0.0|0.0|44.0|1.0|43.0|0.0|1.0|14.0|1.0|1.0|1.0|\N|高 |0.0|0.0|0.0 |[\n, 高] |[-0.8806669116020203]|(34,[0,6,9,13,14,15,18,19,22,23,24,26,27,28,29,30,31],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,44.0,1.0,43.0,1.0,14.0,1.0,1.0,1.0,-0.8806669116020203])|[0.9938829787234043,0.006117021276595745]|[0.9938829787234043,0.006117021276595745]|0.0 |0 | |20**071700000112**30|0 |0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0 |0.0|0.0 |0.0|0.0|0.0 |0.0|0.0|0.0|\N|\N|0.0|0.0|0.0 |[\n, \n]|[-0.6971070766448975]|(34,[31],[-0.6971070766448975]) |[0.964332892998679,0.035667107001321] |[0.964332892998679,0.035667107001321] |0.0 |0 | 
+--------------------+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+----+---+---+---+-----+---+---+------------+--------+---------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------+-----------------------------------------+----------+--------------+
結果:
程序預測爲欺詐用戶爲73人,其中正確爲欺詐用戶爲57人,16人預測不正確(本身是正常用戶被預測爲欺詐用戶)
程序預測爲正常用戶爲14995人,其中正確爲正常用戶爲14498人,其中407人預測不正確
優化方向:一、gooduser 數據量和baduser數據量分佈不均勻,可以提取更多baduser的先驗數據
二、用戶畫像中的指標數據量不全,導致有些用戶的指標爲null
三、根據業務設置更多有用指標
總結:使用過lr算法和kmeans,但是效果不怎麼好