在應用程序中使用
maven座標:
<!-- Maven coordinates of the mmseg4j-core artifact, version 1.10.0 -->
<dependency>
    <groupId>com.chenlb.mmseg4j</groupId>
    <artifactId>mmseg4j-core</artifactId>
    <version>1.10.0</version>
</dependency>
默認加載詞典的路徑代碼如下(源碼單詞拼寫有錯誤,將就着看吧,readonly):
另外,可以創建自己的詞庫,文件名爲words*.dic,並且文件要以UTF-8無BOM格式編碼。
* 每個分詞文件必須以words開頭,.dic結尾,如:words-canmou.dic
* 每個分詞文件大小必須控制在50M以內,否則很可能會OOM
/**
 * Resolves the default dictionary directory and caches it in the static
 * {@code defalutPath} field, so the lookup only runs while that field is null.
 *
 * <p>Lookup order:
 * <ol>
 *   <li>the {@code mmseg.dic.path} system property;</li>
 *   <li>the {@code data} resource found through the class loader of {@code Dictionary};</li>
 *   <li>{@code <user.dir>/data}.</li>
 * </ol>
 *
 * <p>When words.dic is loaded from inside a jar, the default directory may not
 * exist. In that case a warning is logged and the non-existent {@code File} is
 * still returned.
 *
 * <p>The "defalut" spelling is the identifier's actual spelling in this code.
 *
 * @return the resolved (possibly non-existent) default dictionary directory
 */
public static File getDefalutPath() {
    if(defalutPath == null) {
        // 1. An explicit override via -Dmmseg.dic.path=... takes precedence.
        String defPath = System.getProperty("mmseg.dic.path");
        log.info("look up in mmseg.dic.path="+defPath);
        if(defPath == null) {
            // 2. Otherwise look for a "data" resource on the classpath.
            URL url = Dictionary.class.getClassLoader().getResource("data");
            if(url != null) {
                // NOTE(review): URL.getFile() is not URL-decoded, so a path containing
                // spaces or non-ASCII characters may not resolve as a File -- confirm.
                defPath = url.getFile();
                log.info("look up in classpath="+defPath);
            } else {
                // 3. Last resort: a "data" directory under the current working directory.
                defPath = System.getProperty("user.dir")+"/data";
                log.info("look up in user.dir="+defPath);
            }
        }
        defalutPath = new File(defPath);
        if(!defalutPath.exists()) {
            // Only warn: a missing directory is tolerated here.
            log.warning("defalut dic path="+defalutPath+" not exist");
        }
    }
    return defalutPath;
}
創建如下工具類:
package com.caiya.software.service.utils;

import com.chenlb.mmseg4j.*;
import org.apache.commons.lang3.StringUtils;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

/**
 * Text segmentation helpers built on the mmseg4j segmenter, which offers three
 * segmentation modes (simple, complex and max-word).
 *
 * Created by caiya on 16/4/27.
 */
public class MMSeg4jUtils {

    /** Mode name selecting {@link SimpleSeg}. */
    public static final String mode_simple = "simple";
    /** Mode name selecting {@link ComplexSeg}. */
    public static final String mode_complex = "complex";
    /** Mode name selecting {@link MaxWordSeg}. */
    public static final String mode_max_word = "max-word";
    /** Mode used by the overloads that do not take a mode. */
    public static final String mode_default = mode_max_word;

    /** Separator used by the overloads that do not take a separator. */
    public static final String operator_default = " ";

    /** Dictionary loaded from the library's default location. */
    public static Dictionary dictionary_default = Dictionary.getInstance();

    /**
     * Loads the dictionary stored under the given directory.
     *
     * @param path dictionary directory; must not be blank
     * @return the dictionary instance for {@code path}
     * @throws IllegalArgumentException if {@code path} is null, empty or whitespace only
     */
    public static Dictionary getDictionary(String path) {
        if (StringUtils.isBlank(path)) {
            throw new IllegalArgumentException("詞典目錄不可爲空");
        }
        return Dictionary.getInstance(path);
    }

    /**
     * Picks the segmentation algorithm for a mode name.
     *
     * <p>Any value other than {@link #mode_simple} or {@link #mode_complex},
     * including {@code null}, selects max-word segmentation. The comparison is
     * constant-first so a null mode no longer throws.
     */
    private static Seg getSeg(String mode, Dictionary dic) {
        if (mode_simple.equals(mode)) {
            return new SimpleSeg(dic);
        }
        if (mode_complex.equals(mode)) {
            return new ComplexSeg(dic);
        }
        return new MaxWordSeg(dic);
    }

    /**
     * Segments everything readable from {@code input} and joins the tokens
     * with {@code wordSpilt}.
     */
    private static String segWords(Reader input, String wordSpilt, String mode, Dictionary dic) throws IOException {
        StringBuilder sb = new StringBuilder();
        // Choose the concrete segmentation algorithm for the requested mode.
        MMSeg mmSeg = new MMSeg(input, getSeg(mode, dic));
        Word word;
        boolean first = true;
        while ((word = mmSeg.next()) != null) {
            if (!first) {
                sb.append(wordSpilt);
            }
            sb.append(word.getString());
            first = false;
        }
        return sb.toString();
    }

    /**
     * Segments {@code txt} and joins the tokens with {@code wordSpilt}.
     *
     * @param txt       text to segment
     * @param wordSpilt separator inserted between consecutive tokens
     * @param mode      one of the {@code mode_*} constants; anything else means max-word
     * @param dic       dictionary to segment with
     * @return the joined tokens, or an empty string when no token was produced
     * @throws IOException if the segmenter fails while reading the text
     */
    public static String segWords(String txt, String wordSpilt, String mode, Dictionary dic) throws IOException {
        return segWords(new StringReader(txt), wordSpilt, mode, dic);
    }

    /**
     * Segments {@code txt} with the default mode and the default dictionary.
     *
     * @param txt       text to segment
     * @param wordSpilt separator inserted between consecutive tokens
     * @return the joined tokens
     * @throws IOException if the segmenter fails while reading the text
     */
    public static String segWords(String txt, String wordSpilt) throws IOException {
        return segWords(txt, wordSpilt, mode_default, dictionary_default);
    }

    /**
     * Segments {@code txt} into the set of distinct tokens.
     *
     * <p>Tokens are collected straight from the segmenter. The earlier approach
     * joined them with {@code wordSpilt} and split the result again, which
     * treated the separator as a regular expression (so separators such as
     * {@code "|"} or {@code "."} broke it) and returned a set holding one empty
     * string for input that produced no tokens.
     *
     * @param txt       text to segment
     * @param wordSpilt kept for backward compatibility; it no longer affects the result
     * @param mode      one of the {@code mode_*} constants; anything else means max-word
     * @param dic       dictionary to segment with
     * @return the distinct tokens; empty when no token was produced
     * @throws IOException if the segmenter fails while reading the text
     */
    public static Set<String> segWordsSet(String txt, String wordSpilt, String mode, Dictionary dic) throws IOException {
        Set<String> words = new HashSet<String>();
        MMSeg mmSeg = new MMSeg(new StringReader(txt), getSeg(mode, dic));
        Word word;
        while ((word = mmSeg.next()) != null) {
            words.add(word.getString());
        }
        return words;
    }

    /**
     * Segments {@code txt} into the set of distinct tokens.
     *
     * @param txt  text to segment
     * @param mode one of the {@code mode_*} constants; anything else means max-word
     * @param dic  dictionary to segment with
     * @return the distinct tokens
     * @throws IOException if the segmenter fails while reading the text
     */
    public static Set<String> segWordsSet(String txt, String mode, Dictionary dic) throws IOException {
        return segWordsSet(txt, operator_default, mode, dic);
    }

    /**
     * Segments {@code txt} into the set of distinct tokens using the default
     * mode and the default dictionary.
     *
     * @param txt text to segment
     * @return the distinct tokens
     * @throws IOException if the segmenter fails while reading the text
     */
    public static Set<String> segWordsSet(String txt) throws IOException {
        return segWordsSet(txt, mode_default, dictionary_default);
    }
}
調用方式:
// Build a boosted multi-field Solr query from the user's keyword.
SolrQuery params = new SolrQuery();
StringBuffer buffer = new StringBuffer();
if (StringUtils.isNotBlank(softwareQuery.getKeyWord())) {
    String kw = softwareQuery.getKeyWord();
    try {
        // Alternative using the default mode and dictionary:
        // kw = MMSeg4jUtils.segWords(kw, " OR ");
        kw = MMSeg4jUtils.segWords(kw, " OR ", MMSeg4jUtils.mode_complex, SoftwareConstants.dictionary);
    } catch (IOException e) {
        // Best effort: on failure the un-segmented keyword is used as-is.
        logger.error("mmseg4j切詞出現異常", e);
    }
    if (StringUtils.isBlank(kw)) {
        // Segmentation produced no tokens; "title:()^50 ..." would be an invalid
        // query, so match everything instead.
        buffer.append("*:*");
    } else {
        // NOTE(review): kw is spliced into the query without escaping Solr query
        // syntax characters; consider escaping each token if the keyword is
        // untrusted user input.
        buffer.append("title:(").append(kw).append(")^50");
        buffer.append(" OR tag_name:(").append(kw).append(")^1.7");
        buffer.append(" OR author:(").append(kw).append(")^1.1");
        buffer.append(" OR origin:(").append(kw).append(")^1.1");
        buffer.append(" OR description:(").append(kw).append(")^1.1");
        buffer.append(" OR txt:(").append(kw).append(")^1.1");
    }
} else {
    // No keyword supplied: match all documents.
    buffer.append("*:*");
}
params.setQuery(buffer.toString());

// Where the dictionary is initialized from a dictionary directory,
// e.g. /Users/caiya/workspace/test/dic:
// dictionary = MMSeg4jUtils.getDictionary(dicPath);
分詞效果:
擴展詞典內容:
$ cat words.dic 十衣素 韓都衣舍專賣旗艦旗艦店 十衣素啊哈哈
由此可見,一般的分詞器都是基於正向最大匹配,稍後我們可以對這塊進行拓展和挖掘。
在solr中的使用
目前solr 4.7.2使用mmseg4j 2.0.0版本
<!-- Field type tokenized by mmseg4j in "complex" mode, dictionary directory "dic" -->
<fieldType name="textComplex" class="solr.TextField" positionIncrementGap="100" >
    <analyzer>
        <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="complex" dicPath="dic"/>
    </analyzer>
</fieldType>
<!-- Field type tokenized by mmseg4j in "max-word" mode, dictionary directory "dic" -->
<fieldType name="textMaxWord" class="solr.TextField" positionIncrementGap="100" >
    <analyzer>
        <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="max-word" dicPath="dic"/>
    </analyzer>
</fieldType>
<!-- Field type tokenized by mmseg4j in "simple" mode, using an absolute dictionary directory -->
<fieldType name="textSimple" class="solr.TextField" positionIncrementGap="100" >
    <analyzer>
        <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="simple" dicPath="n:/OpenSource/apache-solr-1.3.0/example/solr/my_dic"/>
    </analyzer>
</fieldType>
dicPath 指定詞庫位置(每個MMSegTokenizerFactory可以指定不同的目錄,當是相對目錄時,是相對 solr.home 的目錄),mode 指定分詞模式(simple|complex|max-word,默認是max-word)。