package com.kkrgwbj.util;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;

/**
 * Custom stop-word analyzer: letter tokenization, lower-casing, then
 * stop-word removal against a caller-supplied word list merged with
 * Lucene's default English stop words.
 *
 * Created by lvbingyang on 2015/11/25 0025.
 */
public class MyStopAnalyzer extends Analyzer {

    /**
     * The merged stop-word set. Built as a {@link CharArraySet} with
     * ignoreCase=true, so no per-call copy is needed in
     * {@link #createComponents(String, Reader)}.
     */
    private final CharArraySet stops;

    /**
     * Builds an analyzer whose stop words are {@code sws} plus Lucene's
     * default English stop words.
     *
     * @param sws additional stop words (matched case-insensitively)
     */
    public MyStopAnalyzer(String[] sws) {
        // makeStopSet returns a mutable, case-insensitive CharArraySet.
        stops = StopFilter.makeStopSet(Version.LUCENE_45, sws, true);
        // Merge in the stock English stop words.
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    /**
     * Default constructor: uses only Lucene's default English stop words.
     */
    public MyStopAnalyzer() {
        stops = new CharArraySet(Version.LUCENE_45, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    }

    /**
     * Builds the token-stream chain for a field.
     *
     * @param fieldName field being analyzed (unused here)
     * @param reader    character source to tokenize
     * @return tokenizer -> LowerCaseFilter -> StopFilter
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Split the input into maximal runs of letters.
        Tokenizer tokenizer = new LetterTokenizer(Version.LUCENE_45, reader);
        // BUG FIX: lower-case BEFORE stop filtering. The original wrapped
        // StopFilter inside LowerCaseFilter, so stop filtering saw the raw
        // (possibly upper-case) tokens and let words like "The" through.
        TokenStream stream = new LowerCaseFilter(Version.LUCENE_45, tokenizer);
        stream = new StopFilter(Version.LUCENE_45, stream, stops);
        return new TokenStreamComponents(tokenizer, stream);
    }
}
JUnit 測試:
@Test public void test2() { Analyzer analyzer = new MyStopAnalyzer(new String[]{"I", "you", "hate"}); Analyzer analyzer1 = new StopAnalyzer(Version.LUCENE_45); String txt = "i love you,i hate you"; //自定義的停用詞分詞器 AnalyzerUtils.displayToken(txt, analyzer); //默認的停用詞分詞器 AnalyzerUtils.displayToken(txt, analyzer1); }
在這裏,咱們停用了 i、you、hate,運行結果: