Last time (http://www.cnblogs.com/stGeekpower/p/3457746.html) I described what the main method of the LexicalizedParser class does, following its javadoc; this time let's look at how main actually works. The method runs to roughly 350 lines and its work falls into three parts: declaring variables (mostly flags), parsing the arguments passed in, and then carrying out the requested tasks step by step according to those option arguments.
In option order, that work consists of: tokenization setup (which must happen first), initializing the LexicalizedParser (by reading one in or training one), setting the encoding, testing, saving the parser (if requested), and finally parsing and printing the results.
As for the parsing itself: a single sentence is parsed through the parse method of a ParserQuery object created by the LexicalizedParser, while files are parsed by the parseFiles method of the ParseFiles class (which ultimately also goes through ParserQuery).
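To make that per-sentence path concrete, here is a minimal sketch. The model path is a placeholder, and note that the ParserQuery import has moved between CoreNLP releases (edu.stanford.nlp.parser.common in newer versions, edu.stanford.nlp.parser.lexparser in older ones):

import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.common.ParserQuery;  // edu.stanford.nlp.parser.lexparser in older versions
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class ParserQueryDemo {
  public static void main(String[] args) {
    // Load a serialized grammar (the path is a placeholder for whatever model you use).
    LexicalizedParser lp = LexicalizedParser.loadModel(
        "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    // A ParserQuery holds the state of one parse, exactly as main() uses it below.
    ParserQuery pq = lp.parserQuery();
    List<HasWord> sentence = Sentence.toWordList("This", "is", "an", "easy", "sentence", ".");
    if (pq.parse(sentence)) {         // returns true if a parse was found
      Tree best = pq.getBestParse();
      best.pennPrint();               // print the tree in Penn Treebank format
    }
  }
}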
This first part declares the flags and the variables needed to construct the parser:
boolean train = false;                    // train or parse
boolean saveToSerializedFile = false;     // whether to save the parser as a serialized file
boolean saveToTextFile = false;           // whether to save the parser as a text file
String serializedInputFileOrUrl = null;   // serialized input file or URL
String textInputFileOrUrl = null;         // text input file or URL
String serializedOutputFileOrUrl = null;  // serialized output file or URL
String textOutputFileOrUrl = null;        // text output file or URL
String treebankPath = null;               // treebank path
Treebank testTreebank = null;
Treebank tuneTreebank = null;
String testPath = null;
FileFilter testFilter = null;
String tunePath = null;
FileFilter tuneFilter = null;
FileFilter trainFilter = null;            // file range to train on
String secondaryTreebankPath = null;
double secondaryTreebankWeight = 1.0;
FileFilter secondaryTrainFilter = null;

// variables needed to process the files to be parsed
TokenizerFactory<? extends HasWord> tokenizerFactory = null;  // tokenizer factory
String tokenizerOptions = null;           // options passed to the tokenizer
String tokenizerFactoryClass = null;      // class used for tokenization
String tokenizerMethod = null;            // factory method used for tokenization
boolean tokenized = false;                // whether or not the input file has already been tokenized
Function<List<HasWord>, List<HasWord>> escaper = null;  // escaper for special characters
String tagDelimiter = null;               // word/tag delimiter
String sentenceDelimiter = null;
String elementDelimiter = null;
Next, the option arguments supplied by the user are processed and stored in the variables declared above:
int argIndex = 0;
if (args.length < 1) {  // no arguments at all: print usage and return
  System.err.println("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
  return;
}

Options op = new Options();  // holds the parser options
List<String> optionArgs = new ArrayList<String>();
String encoding = null;
// while loop through option arguments
while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
  if (args[argIndex].equalsIgnoreCase("-train") ||
      args[argIndex].equalsIgnoreCase("-trainTreebank")) {  // training mode?
    train = true;
    // pull the treebank path and file-range filter out of the training
    // arguments and keep them in treebankDescription
    Pair<String, FileFilter> treebankDescription =
        ArgUtils.getTreebankDescription(args, argIndex, "-test");
    argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
    treebankPath = treebankDescription.first();
    trainFilter = treebankDescription.second();
  } else if (args[argIndex].equalsIgnoreCase("-train2")) {
    // TODO: we could use the fully expressive -train options if
    // we add some mechanism for returning leftover options from
    // ArgUtils.getTreebankDescription
    // train = true;     // cdm july 2005: should require -train for this
    int numSubArgs = ArgUtils.numSubArgs(args, argIndex);
    argIndex++;
    if (numSubArgs < 2) {
      throw new RuntimeException("Error: -train2 <treebankPath> [<ranges>] <weight>.");
    }
    secondaryTreebankPath = args[argIndex++];
    secondaryTrainFilter = (numSubArgs == 3) ? new NumberRangesFileFilter(args[argIndex++], true) : null;
    secondaryTreebankWeight = Double.parseDouble(args[argIndex++]);
  } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
    // A TreebankLangParserParams must be specified when using a language other
    // than English or a treebank other than the English Penn Treebank; the
    // option has to precede all other language-specific options, since each
    // language comes with its own parameters.
    try {
      op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
    } catch (ClassNotFoundException e) {
      System.err.println("Class not found: " + args[argIndex + 1]);
      throw new RuntimeException(e);
    } catch (InstantiationException e) {
      System.err.println("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
      throw new RuntimeException(e);
    } catch (IllegalAccessException e) {
      System.err.println("Illegal access" + e);
      throw new RuntimeException(e);
    }
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-encoding")) {  // character encoding
    // sets encoding for TreebankLangParserParams
    // redone later to override any serialized parser one read in
    encoding = args[argIndex + 1];
    op.tlpParams.setInputEncoding(encoding);
    op.tlpParams.setOutputEncoding(encoding);
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {  // input already tokenized?
    tokenized = true;
    argIndex += 1;
  } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
    try {
      escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
    } catch (Exception e) {
      System.err.println("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
    }
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {
    // options handed to the TokenizerFactory that performs the tokenization
    tokenizerOptions = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {
    // TokenizerFactory class that performs the tokenization
    tokenizerFactoryClass = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {
    // static factory method to invoke on that class
    tokenizerMethod = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-sentences")) {
    // token that marks sentence boundaries, i.e. the basis for sentence splitting
    sentenceDelimiter = args[argIndex + 1];
    if (sentenceDelimiter.equalsIgnoreCase("newline")) {
      sentenceDelimiter = "\n";
    }
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {
    // element within which text is parsed, e.g. individual sentences
    elementDelimiter = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {
    // separator between a word and its POS tag
    tagDelimiter = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") ||
             args[argIndex].equalsIgnoreCase("-model")) {
    // load the parser from a binary serialized file
    // the next argument must be the path to the parser file
    serializedInputFileOrUrl = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
    // load the parser from declarative text file
    // the next argument must be the path to the parser file
    textInputFileOrUrl = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
    saveToSerializedFile = true;
    if (ArgUtils.numSubArgs(args, argIndex) < 1) {
      System.err.println("Missing path: -saveToSerialized filename");
    } else {
      serializedOutputFileOrUrl = args[argIndex + 1];
    }
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
    // save the parser to declarative text file
    saveToTextFile = true;
    textOutputFileOrUrl = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
    // save the training trees to a binary file
    op.trainOptions.trainTreeFile = args[argIndex + 1];
    argIndex += 2;
  } else if (args[argIndex].equalsIgnoreCase("-treebank") ||
             args[argIndex].equalsIgnoreCase("-testTreebank") ||
             args[argIndex].equalsIgnoreCase("-test")) {
    // train-and-test mode: arguments needed for testing
    Pair<String, FileFilter> treebankDescription =
        ArgUtils.getTreebankDescription(args, argIndex, "-test");
    argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
    testPath = treebankDescription.first();
    testFilter = treebankDescription.second();
  } else if (args[argIndex].equalsIgnoreCase("-tune")) {
    Pair<String, FileFilter> treebankDescription =
        ArgUtils.getTreebankDescription(args, argIndex, "-tune");
    argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
    tunePath = treebankDescription.first();
    tuneFilter = treebankDescription.second();
  } else {
    int oldIndex = argIndex;
    argIndex = op.setOptionOrWarn(args, argIndex);
    for (int i = oldIndex; i < argIndex; i++) {
      optionArgs.add(args[i]);
    }
  }
} // end while loop through arguments
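Putting the flags together, two representative invocations might look like the following; the treebank path, file range, and file names are placeholders, and only flags handled by the loop above are used:

# train a parser on a treebank (path plus optional file range) and serialize it
java edu.stanford.nlp.parser.lexparser.LexicalizedParser \
  -train /path/to/treebank 200-2199 -saveToSerializedFile wsjPCFG.ser.gz

# load a serialized grammar and parse a file
java edu.stanford.nlp.parser.lexparser.LexicalizedParser \
  -loadFromSerializedFile wsjPCFG.ser.gz sample.txt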
Parsing presupposes that the sentence has already been correctly tokenized, so tokenization is set up here; of course, we can plug in whatever tokenizer suits our data:
// set up tokenizerFactory with options if provided
if (tokenizerFactoryClass != null || tokenizerOptions != null) {
  try {
    // the factory class and factory method are taken from the command line;
    // if none is given, PTBTokenizer is the default
    if (tokenizerFactoryClass != null) {
      Class<TokenizerFactory<? extends HasWord>> clazz =
          ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
      Method factoryMethod;
      if (tokenizerOptions != null) {
        factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
        tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
      } else {
        factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
        tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
      }
    } else {
      // have options but no tokenizer factory; default to PTB
      tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(tokenizerOptions);
    }
  } catch (IllegalAccessException e) {
    System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
    throw new RuntimeException(e);
  } catch (NoSuchMethodException e) {
    System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
    throw new RuntimeException(e);
  } catch (ClassNotFoundException e) {
    System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
    throw new RuntimeException(e);
  } catch (InvocationTargetException e) {
    System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
    throw new RuntimeException(e);
  }
}
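For the default case, the reflective construction above amounts to calling the PTB factory directly. A minimal sketch; the options string "untokenizable=noneKeep" is just an illustrative choice:

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class TokenizerDemo {
  public static void main(String[] args) {
    // Same factory main() falls back to when only -tokenizerOptions is given.
    TokenizerFactory<Word> tf =
        PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory("untokenizable=noneKeep");
    Tokenizer<Word> tokenizer = tf.getTokenizer(new StringReader("It's a test."));
    List<Word> tokens = tokenizer.tokenize();
    System.out.println(tokens);  // [It, 's, a, test, .]
  }
}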
There are three ways to initialize the LexicalizedParser: train one from treebank data, read one from a text grammar file, or read one from a serialized file;
if (tuneFilter != null || tunePath != null) {  // handle the tune treebank
  if (tunePath == null) {
    if (treebankPath == null) {
      throw new RuntimeException("No tune treebank path specified...");
    } else {
      System.err.println("No tune treebank path specified. Using train path: \"" + treebankPath + '\"');
      tunePath = treebankPath;
    }
  }
  tuneTreebank = op.tlpParams.testMemoryTreebank();
  tuneTreebank.loadPath(tunePath, tuneFilter);
}

if (!train && op.testOptions.verbose) {
  StringUtils.printErrInvocationString("LexicalizedParser", args);
}
edu.stanford.nlp.parser.lexparser.LexicalizedParser lp; // always initialized in next if-then-else block
if (train) {
  StringUtils.printErrInvocationString("LexicalizedParser", args);

  // so we train a parser using the treebank
  GrammarCompactor compactor = null;
  if (op.trainOptions.compactGrammar() == 3) {
    compactor = new ExactGrammarCompactor(op, false, false);
  }

  Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);

  Treebank secondaryTrainTreebank = null;
  if (secondaryTreebankPath != null) {
    secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
  }

  List<List<TaggedWord>> extraTaggedWords = null;
  if (op.trainOptions.taggedFiles != null) {
    extraTaggedWords = new ArrayList<List<TaggedWord>>();
    List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(), op.trainOptions.taggedFiles);
    for (TaggedFileRecord record : fileRecords) {
      for (List<TaggedWord> sentence : record.reader()) {
        extraTaggedWords.add(sentence);
      }
    }
  }

  // training branch: lp is built from the annotated treebank data
  lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight,
                             compactor, op, tuneTreebank, extraTaggedWords);
} else if (textInputFileOrUrl != null) {
  // so we load the parser from a text grammar file
  lp = getParserFromTextFile(textInputFileOrUrl, op);
} else {
  // so we load a serialized parser
  if (serializedInputFileOrUrl == null && argIndex < args.length) {
    // the next argument must be the path to the serialized parser
    serializedInputFileOrUrl = args[argIndex];
    argIndex++;
  }
  if (serializedInputFileOrUrl == null) {
    System.err.println("No grammar specified, exiting...");
    return;
  }
  String[] extraArgs = new String[optionArgs.size()];
  extraArgs = optionArgs.toArray(extraArgs);
  try {
    lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
    op = lp.op;
  } catch (IllegalArgumentException e) {
    System.err.println("Error loading parser, exiting...");
    throw e;
  }
}
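Programmatically, the serialized-file branch corresponds to the public static LexicalizedParser.loadModel, which likewise accepts leftover option flags. A minimal sketch; the model path and the -maxLength value are placeholders:

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

public class LoadModelDemo {
  public static void main(String[] args) {
    // Equivalent of "-loadFromSerializedFile wsjPCFG.ser.gz" plus extra option flags.
    LexicalizedParser lp = LexicalizedParser.loadModel("wsjPCFG.ser.gz", "-maxLength", "80");
    System.err.println("Loaded " + lp.getClass().getName());
  }
}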
// the following has to go after reading parser to make sure
// op and tlpParams are the same for train and test
// THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
// OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
if (encoding != null) {
  op.tlpParams.setInputEncoding(encoding);
  op.tlpParams.setOutputEncoding(encoding);
}
if (testFilter != null || testPath != null) {
  if (testPath == null) {
    if (treebankPath == null) {
      throw new RuntimeException("No test treebank path specified...");
    } else {
      System.err.println("No test treebank path specified. Using train path: \"" + treebankPath + '\"');
      testPath = treebankPath;
    }
  }
  testTreebank = op.tlpParams.testMemoryTreebank();
  testTreebank.loadPath(testPath, testFilter);
}
op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

// at this point we should be sure that op.tlpParams is
// set appropriately (from command line, or from grammar file),
// and will never change again. -- Roger

// Now what do we do with the parser we've made
if (saveToTextFile) {
  // save the parser to textGrammar format
  if (textOutputFileOrUrl != null) {
    lp.saveParserToTextFile(textOutputFileOrUrl);
  } else {
    System.err.println("Usage: must specify a text grammar output path");
  }
}
if (saveToSerializedFile) {
  if (serializedOutputFileOrUrl != null) {
    lp.saveParserToSerialized(serializedOutputFileOrUrl);
  } else if (textOutputFileOrUrl == null && testTreebank == null) {
    // no saving/parsing request has been specified
    System.err.println("usage: java edu.stanford.nlp.parser.lexparser.LexicalizedParser " +
        "-train trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
  }
}
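The two save methods can also be called directly on a parser object. A hedged sketch, with placeholder file names:

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

public class SaveParserDemo {
  public static void main(String[] args) {
    LexicalizedParser lp = LexicalizedParser.loadModel("wsjPCFG.ser.gz");  // placeholder path
    lp.saveParserToSerialized("copy.ser.gz");  // binary form: compact and fast to reload
    lp.saveParserToTextFile("copy.txt");       // human-readable text grammar
  }
}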
if (op.testOptions.verbose || train) {
  // Tell the user a little or a lot about what we have made
  // get lexicon size separately as it may have its own prints in it....
  String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";
  System.err.println("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
  System.err.println("Grammar\t" +
      lp.stateIndex.size() + '\t' +
      lp.tagIndex.size() + '\t' +
      lp.wordIndex.size() + '\t' +
      (lp.ug != null ? lp.ug.numRules() : "") + '\t' +
      (lp.bg != null ? lp.bg.numRules() : "") + '\t' +
      lexNumRules);
  System.err.println("ParserPack is " + op.tlpParams.getClass().getName());
  System.err.println("Lexicon is " + lp.lex.getClass().getName());
  if (op.testOptions.verbose) {
    System.err.println("Tags are: " + lp.tagIndex);
    // System.err.println("States are: " + lp.pd.stateIndex); // This is too verbose. It was already
    // printed out by the below printOptions command if the flag -printStates is given (at training time)!
  }
  printOptions(false, op);
}
Finally, parsing can be done sentence by sentence, or the methods of the ParseFiles class can be used to parse multiple files at once.
if (testTreebank != null) {
  // test parser on treebank
  EvaluateTreebank evaluator = new EvaluateTreebank(lp);
  evaluator.testOnTreebank(testTreebank);
} else if (argIndex >= args.length) {
  // no more arguments, so we just parse our own test sentence
  PrintWriter pwOut = op.tlpParams.pw();
  PrintWriter pwErr = op.tlpParams.pw(System.err);
  ParserQuery pq = lp.parserQuery();
  if (pq.parse(op.tlpParams.defaultTestSentence())) {  // parse
    lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
  } else {
    pwErr.println("Error. Can't parse test sentence: " + op.tlpParams.defaultTestSentence());
  }
} else {
  // We parse filenames given by the remaining arguments
  ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter,
                        sentenceDelimiter, escaper, tagDelimiter, op, lp.getTreePrint(), lp);
}
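So the file-parsing branch is what the basic usage string at the top of main describes: a serialized grammar followed by file names. A hedged example invocation, with placeholder file names and -outputFormat as one commonly documented output option:

java edu.stanford.nlp.parser.lexparser.LexicalizedParser \
  -outputFormat penn wsjPCFG.ser.gz sample1.txt sample2.txt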