stanford corenlp的中文切詞有時不盡如人意,那我們就需要實現一個自定義切詞類,來完全滿足我們的私人定製(加各類詞典干預)。上篇文章《IKAnalyzer》介紹了IKAnalyzer的自由度,本篇文章就說下怎麼把IKAnalyzer作爲corenlp的切詞工具。
《stanford corenlp的TokensRegex》提到了corenlp的配置CoreNLP-chinese.properties,其中customAnnotatorClass.segment就是用於指定切詞類的,在這裏我們只需要模仿ChineseSegmenterAnnotator來實現一個自己的Annotator,並設置在配置文件中即可。
customAnnotatorClass.segment = edu.stanford.nlp.pipeline.ChineseSegmenterAnnotator
下面是我的實現:
public class IKSegmenterAnnotator extends ChineseSegmenterAnnotator { public IKSegmenterAnnotator() { super(); } public IKSegmenterAnnotator(boolean verbose) { super(verbose); } public IKSegmenterAnnotator(String segLoc, boolean verbose) { super(segLoc, verbose); } public IKSegmenterAnnotator(String segLoc, boolean verbose, String serDictionary, String sighanCorporaDict) { super(segLoc, verbose, serDictionary, sighanCorporaDict); } public IKSegmenterAnnotator(String name, Properties props) { super(name, props); } private List<String> splitWords(String str) { try { List<String> words = new ArrayList<String>(); IKSegmenter ik = new IKSegmenter(new StringReader(str), true); Lexeme lex = null; while ((lex = ik.next()) != null) { words.add(lex.getLexemeText()); } return words; } catch (IOException e) { //LOGGER.error(e.getMessage(), e); System.out.println(e); List<String> words = new ArrayList<String>(); words.add(str); return words; } } @Override public void runSegmentation(CoreMap annotation) { //0 2 // A BC D E // 1 10 1 1 // 0 12 3 4 // 0, 0+1 , String text = annotation.get(CoreAnnotations.TextAnnotation.class); List<CoreLabel> sentChars = annotation.get(ChineseCoreAnnotations.CharactersAnnotation.class); List<CoreLabel> tokens = new ArrayList<CoreLabel>(); annotation.set(CoreAnnotations.TokensAnnotation.class, tokens); //List<String> words = segmenter.segmentString(text); List<String> words = splitWords(text); System.err.println(text); System.err.println("--->"); System.err.println(words); int pos = 0; for (String w : words) { CoreLabel fl = sentChars.get(pos); fl.set(CoreAnnotations.ChineseSegAnnotation.class, "1"); if (w.length() == 0) { continue; } CoreLabel token = new CoreLabel(); token.setWord(w); token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); pos += w.length(); fl = sentChars.get(pos - 1); token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, 
fl.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); tokens.add(token); } } }
在外面爲IKAnalyzer初始化詞典,指定擴展詞典和刪除詞典:
//爲ik初始化詞典,刪除干擾詞 Dictionary.initial(DefaultConfig.getInstance()); String delDic = System.getProperty(READ_IK_DEL_DIC, null); BufferedReader reader = new BufferedReader(new FileReader(delDic)); String line = null; List<String> delWords = new ArrayList<>(); while ((line = reader.readLine()) != null) { delWords.add(line); } Dictionary.getSingleton().disableWords(delWords);