Spark MLPC Text Classification Example

1. Task: categorize company names (abbreviated as cc codes)

2. Algorithm: multilayer perceptron classification (MLPC)

3. Overall approach: text segmentation --> Word2Vec --> feature matrix --> MultilayerPerceptronClassifier

    ① Chinese word segmentation uses IK Analyzer.

     For example: 浙江工人日報社印刷廠 ----> after segmentation ---> 浙江|工人日報|社|印刷廠|

     Code:

import java.io.StringReader
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.spark.{SparkConf, SparkContext}
import org.wltea.analyzer.lucene.IKAnalyzer

/**
  * Created by dongdong on 17/4/24.
  */
object Participles {

  def main(args: Array[String]): Unit = {
  
    val inpath = "/Users/dongdong/Desktop/cc/small_data/mlj_total_cc.txt"
    val outpath = "/Users/dongdong/Desktop/cc/participles_small"
    val conf = new SparkConf().setMaster("local[2]").setAppName("Participles")
    val sc = new SparkContext(conf)

    // read data: each line is "company_name \t label \t cNumber"
    val originalData = sc
      .textFile(inpath)
      .map(line => line.split("\t"))
      // keep only well-formed rows with exactly 3 tab-separated fields
      .filter(_.length == 3)

    // segment each company name with IK Analyzer
    val participles_data = originalData.map(t => {
      var words = ""
      val company_name = t(0).trim
      val label = t(1).trim
      val cNumber = t(2).trim
      //  val address = t(3).trim
      // true enables IK's smart (coarse-grained) segmentation mode
      val anal = new IKAnalyzer(true)
      val reader = new StringReader(company_name)
      val ts = anal.tokenStream("", reader)
      ts.reset()
      val term: CharTermAttribute = ts.getAttribute(classOf[CharTermAttribute])
      while (ts.incrementToken()) {
        words += term.toString + "|"
      }
      ts.end()
      ts.close()
      // strip commas so they cannot break the comma-separated output line
      val words_replace = words.replaceAll(",", "")

      words_replace + "," + label + "," + cNumber
    })

    //save data
    participles_data.repartition(1).saveAsTextFile(outpath)

    sc.stop()
  }

}
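
The segmentation job above writes lines of the form 浙江|工人日報|社|印刷廠|,label,cNumber. The core code in ② expects this output to be loaded back into a DataFrame named originalData with columns text and label. That loading step is not shown in the post; below is a minimal sketch, assuming Spark 2.x, the output path used above, and column names matching what ② reads (text, label, cNumber).

// Minimal sketch (assumption): load the segmented output into the DataFrame
// that the pipeline code in ② expects.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[2]")
  .appName("LoadParticiples")
  .getOrCreate()
import spark.implicits._

// each line looks like: 浙江|工人日報|社|印刷廠|,label,cNumber
val originalData = spark.read
  .textFile("/Users/dongdong/Desktop/cc/participles_small")
  .map(_.split(","))
  .filter(_.length == 3)
  .map(arr => (arr(0), arr(1), arr(2)))
  .toDF("text", "label", "cNumber")
  .cache()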

 ② MultilayerPerceptronClassifier

 Core code:

//Index the String label column as a numeric indexedLabel column
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(originalData)
    val labelIndexer_data: DataFrame = labelIndexer.transform(originalData)
   
    //Split the pre-segmented text on "|" into individual words
    val tokenizer = new RegexTokenizer()
      .setInputCol("text")
      .setOutputCol("words")
      .setPattern("\\|")
    val tokenizer_ts_data: DataFrame = tokenizer.transform(labelIndexer_data)
    
    //Remove generic company-suffix words that carry no class information
    val arr = Array("有限公司", "有限責任公司", "", "公司", "分公司", "責任公司", "有限", "責任")
    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered")
      .setStopWords(arr)
    val filtered_data: DataFrame = remover.transform(tokenizer_ts_data)

    //Embed each word sequence as a fixed-length feature vector with Word2Vec
    val word2Vec = new Word2Vec()
      .setInputCol("filtered")
      .setOutputCol("features")
      //Dimension of the feature vector
      .setVectorSize(VECTOR_SIZE)
      .setMinCount(1)
      .setMaxIter(100)
    // .setNumPartitions(3)

   
    // hidden layer sizes follow the 2n+1 rule of thumb, where n is the number of input nodes (VECTOR_SIZE)
    // the output layer has 43 nodes, one per target class
    val layers = Array[Int](VECTOR_SIZE, 101, 100, 43)
    val mlpc = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(512)
      .setSeed(1234L)
      .setMaxIter(128)
      .setFeaturesCol("features")
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")

    //Map the predicted indices back to the original String labels
    val labelConverter = new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labels)

    val Array(trainingData, testData) = originalData.randomSplit(Array(0.8, 0.2))

    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, labelIndexer, word2Vec, mlpc, labelConverter))
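
The snippet stops at building the Pipeline. Below is a minimal sketch of the remaining training and evaluation steps, using the trainingData/testData split defined above; the MulticlassClassificationEvaluator usage is my addition and not part of the original code.

// Minimal sketch (assumption): train the pipeline, predict on the held-out
// split, and compute the overall accuracy reported in ④.
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

val model = pipeline.fit(trainingData)

val predictions = model.transform(testData)
predictions.select("text", "predictedLabel", "label").show(10, truncate = false)

// accuracy over all 43 classes
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
println("accuracy = " + evaluator.evaluate(predictions))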

 ③ I also tried combining TF-IDF with LogisticRegression (logistic regression), and TF-IDF with NaiveBayes (naive Bayes). Neither performed well, and LogisticRegression only supports binary classification. A sketch of the TF-IDF + NaiveBayes variant is shown below.
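
For comparison, a minimal sketch of the TF-IDF + NaiveBayes combination mentioned above, reusing the tokenizer, remover, labelIndexer and labelConverter stages from ②; the HashingTF/IDF parameters are assumptions.

// Minimal sketch (assumption): TF-IDF features fed into NaiveBayes,
// in place of Word2Vec + MultilayerPerceptronClassifier.
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{HashingTF, IDF}

// term frequencies over the filtered words, then inverse-document-frequency weighting
val hashingTF = new HashingTF()
  .setInputCol("filtered")
  .setOutputCol("rawFeatures")
  .setNumFeatures(10000)
val idf = new IDF()
  .setInputCol("rawFeatures")
  .setOutputCol("features")

val nb = new NaiveBayes()
  .setFeaturesCol("features")
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")

val nbPipeline = new Pipeline()
  .setStages(Array(tokenizer, remover, labelIndexer, hashingTF, idf, nb, labelConverter))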

 ④ Because the prior data set is unevenly distributed across the classes, the final accuracy is only 0.606549930730621:

total_rate		659490		527397		132093		80121		0.606549930730621