1、導入需要的分詞包
import org.ansj.domain.Term
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.ToAnalysis

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
2、停用詞過濾
/**
  * Builds the ansj `StopRecognition` that is later applied to a parse result
  * to discard unwanted terms.
  *
  * The returned instance has four stop natures, four stop regexes and every
  * entry of `stopWords` registered on it, in that order.
  *
  * @param stopWords stop words to register, one `insertStopWords` call per entry
  * @return the configured `StopRecognition`
  */
def filter(stopWords: Array[String]): StopRecognition = {
  // Named `recognition` rather than `filter` so the local does not shadow this method.
  val recognition = new StopRecognition

  // Stop natures.
  recognition.insertStopNatures("w")      // filter punctuation
  recognition.insertStopNatures("m")      // filter the "m" nature
  recognition.insertStopNatures("null")   // filter null
  recognition.insertStopNatures("<br />") // filter <br />

  // Stop regexes.
  recognition.insertStopRegexes("^[a-zA-Z]{1,}") // filter English alphabet
  recognition.insertStopRegexes("^[0-9]+")       // filter numbers
  // Runs of characters that are not ASCII letters, digits or in the \u4e00-\u9fa5 range.
  recognition.insertStopRegexes("[^a-zA-Z0-9\\u4e00-\\u9fa5]+")
  recognition.insertStopRegexes("\t")            // filter tab characters

  // Caller-supplied stop words.
  stopWords.foreach(word => recognition.insertStopWords(word))

  recognition
}
3、分詞
/**
  * Segments `text` with ansj `ToAnalysis`, applies `filter` to the parse
  * result, and collects the names of the remaining terms.
  *
  * Only names whose length is at least `MIN_WORD_LENGTH` are kept; the order
  * of the returned buffer follows the order of the term list.
  *
  * NOTE(review): `MIN_WORD_LENGTH` is not defined in this snippet; it must be
  * in scope wherever this method is placed.
  *
  * @param text   text to segment
  * @param filter stop recognition applied to the parse result
  * @return buffer of the surviving term names
  */
def getWords(text: String, filter: StopRecognition): ArrayBuffer[String] = {
  val terms: java.util.List[Term] = ToAnalysis.parse(text).recognition(filter).getTerms

  val words = new ArrayBuffer[String]()
  // Walk the Java list by index so no Java/Scala collection converter is required.
  for {
    i <- 0 until terms.size()
    name = terms.get(i).getName
    if name.length >= MIN_WORD_LENGTH
  } words += name

  words
}