Stanford Parser Study Notes: Analyzing the LexicalizedParser Class

      Last time (http://www.cnblogs.com/stGeekpower/p/3457746.html) I went through what the main function of the LexicalizedParser class does, following the Javadoc; this time we look at how main actually processes things. The function is roughly 350 lines long, and its work breaks down into: initializing variables (various flags), parsing the arguments passed in, and then carrying out the various tasks step by step according to the option arguments.

      In option order, the main tasks are: tokenization (which must happen first), initializing the LexicalizedParser (by loading or training one), setting the encoding, testing, saving the parser (if requested), and parsing and printing the results.

      As for the parsing itself: individual sentences are parsed by the parse function of a ParserQuery object produced by the LexicalizedParser, while files are parsed by the parseFiles function of the ParseFiles class (which ultimately also goes through ParserQuery).
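
To make this concrete before diving into main, here is a minimal sketch of the same flow driven directly from code: load a serialized model, obtain a ParserQuery, and parse the default test sentence. The model file name englishPCFG.ser.gz is only illustrative, and the ParserQuery import path follows recent CoreNLP releases (older 3.x versions keep the interface in the lexparser package).

import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class ParserQueryDemo {
    public static void main(String[] args) {
        // same effect as main's -loadFromSerializedFile / -model branch
        LexicalizedParser lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
        // sentences are parsed through a ParserQuery created by the parser
        ParserQuery pq = lp.parserQuery();
        if (pq.parse(lp.op.tlpParams.defaultTestSentence())) {
            Tree best = pq.getBestParse();
            best.pennPrint(); // print the best parse in Penn Treebank bracketing
        }
    }
}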

1. Initializing the variables

This part declares the flags and the variables needed to build the parser:

boolean train = false; // train a new parser, or load one and parse
boolean saveToSerializedFile = false; // whether to save the parser to a serialized (binary) file
boolean saveToTextFile = false; // whether to save the parser to a text file
String serializedInputFileOrUrl = null; // serialized input file or URL
String textInputFileOrUrl = null; // text input file or URL
String serializedOutputFileOrUrl = null; // serialized output file or URL
String textOutputFileOrUrl = null; // text output file or URL
String treebankPath = null; // path to the training treebank
Treebank testTreebank = null;
Treebank tuneTreebank = null;
String testPath = null;
FileFilter testFilter = null;
String tunePath = null;
FileFilter tuneFilter = null;
FileFilter trainFilter = null; // file range used for training
String secondaryTreebankPath = null;
double secondaryTreebankWeight = 1.0;
FileFilter secondaryTrainFilter = null;

// variables needed to process the files to be parsed
TokenizerFactory<? extends HasWord> tokenizerFactory = null; // tokenizer factory
String tokenizerOptions = null; // options passed to the tokenizer
String tokenizerFactoryClass = null; // class used for tokenization
String tokenizerMethod = null; // factory method used to build the tokenizer
boolean tokenized = false; // whether or not the input file has already been tokenized
Function<List<HasWord>, List<HasWord>> escaper = null; // escapes/normalizes input tokens
String tagDelimiter = null; // separator between a word and its POS tag
String sentenceDelimiter = null;
String elementDelimiter = null;

2. Parsing the command-line arguments

This part walks over the option arguments supplied by the user and stores them in the variables declared above:

int argIndex = 0;
if (args.length < 1) { // no arguments: print basic usage and return
    System.err.println("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser.LexicalizedParser parserFileOrUrl filename*");
    return;
}

Options op = new Options(); // collects the parser options
List<String> optionArgs = new ArrayList<String>();
String encoding = null;
// while loop through option arguments
while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
    if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) { // train a parser
        train = true;
        // read the training arguments: the treebank path and the file-range filter go into treebankDescription
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-train");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        treebankPath = treebankDescription.first();
        trainFilter = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-train2")) {
        // TODO: we could use the fully expressive -train options if
        // we add some mechanism for returning leftover options from
        // ArgUtils.getTreebankDescription
        // train = true;     // cdm july 2005: should require -train for this
        int numSubArgs = ArgUtils.numSubArgs(args, argIndex);
        argIndex++;
        if (numSubArgs < 2) {
            throw new RuntimeException("Error: -train2 <treebankPath> [<ranges>] <weight>.");
        }
        secondaryTreebankPath = args[argIndex++];
        secondaryTrainFilter = (numSubArgs == 3) ? new NumberRangesFileFilter(args[argIndex++], true) : null;
        secondaryTreebankWeight = Double.parseDouble(args[argIndex++]);
    } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {
        // A TreebankLangParserParams class must be specified when using a language other
        // than English or a treebank other than the English Penn Treebank; this option
        // must precede the other language-specific options, since different languages
        // take different parameters.
        try {
            op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();
        } catch (ClassNotFoundException e) {
            System.err.println("Class not found: " + args[argIndex + 1]);
            throw new RuntimeException(e);
        } catch (InstantiationException e) {
            System.err.println("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());
            throw new RuntimeException(e);
        } catch (IllegalAccessException e) {
            System.err.println("Illegal access" + e);
            throw new RuntimeException(e);
        }
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-encoding")) { // character encoding
        // sets encoding for TreebankLangParserParams
        // redone later to override any serialized parser one read in
        encoding = args[argIndex + 1];
        op.tlpParams.setInputEncoding(encoding);
        op.tlpParams.setOutputEncoding(encoding);
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenized")) { // input has already been tokenized
        tokenized = true;
        argIndex += 1;
    } else if (args[argIndex].equalsIgnoreCase("-escaper")) {
        try {
            escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);
        } catch (Exception e) {
            System.err.println("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);
        }
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) { // options handed to the TokenizerFactory doing the tokenization
        tokenizerOptions = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) { // the TokenizerFactory class to tokenize with
        tokenizerFactoryClass = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) { // factory method used to build the tokenizer
        tokenizerMethod = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-sentences")) { // token that marks sentence boundaries, i.e. the basis for sentence splitting
        sentenceDelimiter = args[argIndex + 1];
        if (sentenceDelimiter.equalsIgnoreCase("newline")) {
            sentenceDelimiter = "\n";
        }
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-parseInside")) { // only parse text inside this element
        elementDelimiter = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) { // separator between a word and its POS tag
        tagDelimiter = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") ||
               args[argIndex].equalsIgnoreCase("-model")) {
        // load the parser from a binary serialized file
        // the next argument must be the path to the parser file
        serializedInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        // load the parser from declarative text file
        // the next argument must be the path to the parser file
        textInputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
        saveToSerializedFile = true;
        if (ArgUtils.numSubArgs(args, argIndex) < 1) {
            System.err.println("Missing path: -saveToSerialized filename");
        } else {
            serializedOutputFileOrUrl = args[argIndex + 1];
        }
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
        // save the parser to declarative text file
        saveToTextFile = true;
        textOutputFileOrUrl = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {
        // save the training trees to a binary file
        op.trainOptions.trainTreeFile = args[argIndex + 1];
        argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-treebank") ||
               args[argIndex].equalsIgnoreCase("-testTreebank") ||
               args[argIndex].equalsIgnoreCase("-test")) { // train and test: the treebank used for testing
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        testPath = treebankDescription.first();
        testFilter = treebankDescription.second();
    } else if (args[argIndex].equalsIgnoreCase("-tune")) {
        Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");
        argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
        tunePath = treebankDescription.first();
        tuneFilter = treebankDescription.second();
    } else {
        // anything else is a generic option handled by Options
        int oldIndex = argIndex;
        argIndex = op.setOptionOrWarn(args, argIndex);
        for (int i = oldIndex; i < argIndex; i++) {
            optionArgs.add(args[i]);
        }
    }
} // end while loop through arguments
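
Putting the flags together, a typical invocation might look like the following (the model file englishPCFG.ser.gz and input file testsent.txt are illustrative; the trailing model path is picked up by the serialized-parser fallback shown in section 4, and the remaining argument is parsed as a file):

java -mx600m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
  -encoding UTF-8 -outputFormat penn englishPCFG.ser.gz testsent.txt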

3. Tokenization

Syntactic parsing presupposes that the sentence has already been correctly tokenized, and this step sets tokenization up; of course, we are free to plug in a tokenizer of our own choosing:

// set up tokenizerFactory with options if provided
if (tokenizerFactoryClass != null || tokenizerOptions != null) {
    try { // the factory class and method come from the arguments; if unspecified, default to PTBTokenizer
        if (tokenizerFactoryClass != null) {
            Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName(tokenizerFactoryClass));
            Method factoryMethod;
            if (tokenizerOptions != null) {
                factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newWordTokenizerFactory", String.class);
                tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));
            } else {
                factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod : "newTokenizerFactory");
                tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));
            }
        } else {
            // have options but no tokenizer factory; default to PTB
            tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(tokenizerOptions);
        }
    } catch (IllegalAccessException e) {
        System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
        throw new RuntimeException(e);
    } catch (NoSuchMethodException e) {
        System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
        throw new RuntimeException(e);
    } catch (ClassNotFoundException e) {
        System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
        throw new RuntimeException(e);
    } catch (InvocationTargetException e) {
        System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " + tokenizerOptions);
        throw new RuntimeException(e);
    }
}
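
Stripped of the reflection, the default branch above simply builds a PTBTokenizer factory from the options string. A minimal sketch of the equivalent direct calls (the options value ptb3Escaping=false is just an illustrative example of what -tokenizerOptions might carry):

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class TokenizerFactoryDemo {
    public static void main(String[] args) {
        // what the default branch resolves to when no factory class is given
        TokenizerFactory<? extends HasWord> tf =
                PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory("ptb3Escaping=false");
        Tokenizer<? extends HasWord> tokenizer =
                tf.getTokenizer(new StringReader("The parser can't run without tokens."));
        List<? extends HasWord> tokens = tokenizer.tokenize();
        System.out.println(tokens);
    }
}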

4. Initializing the LexicalizedParser

A LexicalizedParser can be initialized in three ways: by training one from treebank data, by loading one from a text grammar file, or by loading one from a serialized file:

if (tuneFilter != null || tunePath != null) { // set up the tune treebank
            if (tunePath == null) {
                if (treebankPath == null) {
                    throw new RuntimeException("No tune treebank path specified...");
                } else {
                    System.err.println("No tune treebank path specified.  Using train path: \"" + treebankPath + '\"');
                    tunePath = treebankPath;
                }
            }
            tuneTreebank = op.tlpParams.testMemoryTreebank();
            tuneTreebank.loadPath(tunePath, tuneFilter);
        }

        if (!train && op.testOptions.verbose) {
            StringUtils.printErrInvocationString("LexicalizedParser", args);
        }
        edu.stanford.nlp.parser.lexparser.LexicalizedParser lp; // always initialized in next if-then-else block
        if (train) {
            StringUtils.printErrInvocationString("LexicalizedParser", args);

            // so we train a parser using the treebank
            GrammarCompactor compactor = null;
            if (op.trainOptions.compactGrammar() == 3) {
                compactor = new ExactGrammarCompactor(op, false, false);
            }

            Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);

            Treebank secondaryTrainTreebank = null;
            if (secondaryTreebankPath != null) {
                secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);
            }

            List<List<TaggedWord>> extraTaggedWords = null;
            if (op.trainOptions.taggedFiles != null) {
                extraTaggedWords = new ArrayList<List<TaggedWord>>();
                List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(),
                        op.trainOptions.taggedFiles);
                for (TaggedFileRecord record : fileRecords) {
                    for (List<TaggedWord> sentence : record.reader()) {
                        extraTaggedWords.add(sentence);
                    }
                }
            }
            // when training, lp is initialized here by training a parser from the annotated treebank data
            lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op,
                    tuneTreebank, extraTaggedWords);
        } else if (textInputFileOrUrl != null) {
            // so we load the parser from a text grammar file (lp is read directly from a text file)
            lp = getParserFromTextFile(textInputFileOrUrl, op);
        } else {
            // so we load a serialized parser (lp is read from a serialized file)
            if (serializedInputFileOrUrl == null && argIndex < args.length) {
                // the next argument must be the path to the serialized parser
                serializedInputFileOrUrl = args[argIndex];
                argIndex++;
            }
            if (serializedInputFileOrUrl == null) {
                System.err.println("No grammar specified, exiting...");
                return;
            }
            String[] extraArgs = new String[optionArgs.size()];
            extraArgs = optionArgs.toArray(extraArgs);
            try {
                lp = loadModel(serializedInputFileOrUrl, op, extraArgs);
                op = lp.op;
            } catch (IllegalArgumentException e) {
                System.err.println("Error loading parser, exiting...");
                throw e;
            }
        }
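
The two most common routes above are also exposed as public entry points, so the same initialization can be reproduced without going through main. A hedged sketch (the treebank path and the file range 200-2199 are illustrative; the text-grammar route has no equally convenient public method and is handled internally by getParserFromTextFile):

import java.io.FileFilter;

import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.Options;

public class ParserInitDemo {
    public static void main(String[] args) {
        // route 1: train a parser from a treebank
        Options op = new Options();
        FileFilter trainFilter = new NumberRangesFileFilter("200-2199", true);
        LexicalizedParser trained =
                LexicalizedParser.trainFromTreebank("/path/to/treebank", trainFilter, op);

        // route 2: load a previously serialized parser
        LexicalizedParser loaded = LexicalizedParser.loadModel("englishPCFG.ser.gz");
        System.err.println("Loaded " + loaded.getClass().getName());
    }
}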

5. Encoding control

 // the following has to go after reading parser to make sure
 // op and tlpParams are the same for train and test
 // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING
 // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER
 if (encoding != null) {
    op.tlpParams.setInputEncoding(encoding);
    op.tlpParams.setOutputEncoding(encoding);
 }

6. Setting up the test data

       if (testFilter != null || testPath != null) {
            if (testPath == null) {
                if (treebankPath == null) {
                    throw new RuntimeException("No test treebank path specified...");
                } else {
                    System.err.println("No test treebank path specified.  Using train path: \"" + treebankPath + '\"');
                    testPath = treebankPath;
                }
            }
            testTreebank = op.tlpParams.testMemoryTreebank();
            testTreebank.loadPath(testPath, testFilter);
        }

7. Saving the trained parser if needed

        op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

        // at this point we should be sure that op.tlpParams is
        // set appropriately (from command line, or from grammar file),
        // and will never change again.  -- Roger

        // Now what do we do with the parser we've made
        if (saveToTextFile) {
            // save the parser to textGrammar format
            if (textOutputFileOrUrl != null) {
                lp.saveParserToTextFile(textOutputFileOrUrl);
            } else {
                System.err.println("Usage: must specify a text grammar output path");
            }
        }
        if (saveToSerializedFile) {
            if (serializedOutputFileOrUrl != null) {
                lp.saveParserToSerialized(serializedOutputFileOrUrl);
            } else if (textOutputFileOrUrl == null && testTreebank == null) {
                // no saving/parsing request has been specified
                System.err.println("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-train " +
                        "trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");
            }
        }
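
In code, the same save/reload round trip is just a pair of method calls on the parser object. A minimal sketch (file names are illustrative):

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;

public class SaveReloadDemo {
    public static void main(String[] args) {
        LexicalizedParser lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
        lp.saveParserToSerialized("myParser.ser.gz"); // compact binary form, fast to reload
        lp.saveParserToTextFile("myParser.txt");      // human-readable text grammar
        LexicalizedParser reloaded = LexicalizedParser.loadModel("myParser.ser.gz");
        System.err.println("Reloaded " + reloaded.getClass().getName());
    }
}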

8. Printing information when training or in verbose mode

        if (op.testOptions.verbose || train) {
            // Tell the user a little or a lot about what we have made
            // get lexicon size separately as it may have its own prints in it....
            String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";
            System.err.println("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
            System.err.println("Grammar\t" +
                    lp.stateIndex.size() + '\t' +
                    lp.tagIndex.size() + '\t' +
                    lp.wordIndex.size() + '\t' +
                    (lp.ug != null ? lp.ug.numRules() : "") + '\t' +
                    (lp.bg != null ? lp.bg.numRules() : "") + '\t' +
                    lexNumRules);
            System.err.println("ParserPack is " + op.tlpParams.getClass().getName());
            System.err.println("Lexicon is " + lp.lex.getClass().getName());
            if (op.testOptions.verbose) {
                System.err.println("Tags are: " + lp.tagIndex);
                // System.err.println("States are: " + lp.pd.stateIndex); // This is too verbose. It was already
                // printed out by the below printOptions command if the flag -printStates is given (at training time)!
            }
            printOptions(false, op);
        }

9. Performing the parse

Parsing can be done sentence by sentence, or over multiple files via the ParseFiles class.

        if (testTreebank != null) {
            // test parser on treebank
            EvaluateTreebank evaluator = new EvaluateTreebank(lp);
            evaluator.testOnTreebank(testTreebank);
        } else if (argIndex >= args.length) {
            // no more arguments, so we just parse our own test sentence
            PrintWriter pwOut = op.tlpParams.pw();
            PrintWriter pwErr = op.tlpParams.pw(System.err);
            ParserQuery pq = lp.parserQuery();
            if (pq.parse(op.tlpParams.defaultTestSentence())) { // parse the default test sentence
                lp.getTreePrint().printTree(pq.getBestParse(), pwOut);
            } else {
                pwErr.println("Error. Can't parse test sentence: " +
                        op.tlpParams.defaultTestSentence());
            }
        } else {
            // We parse the files named by the remaining arguments
            ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter,
                    escaper, tagDelimiter, op, lp.getTreePrint(), lp);
        }
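
For completeness, parsing a pre-tokenized sentence of our own, rather than the default test sentence or input files, looks like this, in the style of the ParserDemo class bundled with the parser (the model file name is illustrative):

import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class ParseTokenizedDemo {
    public static void main(String[] args) {
        LexicalizedParser lp = LexicalizedParser.loadModel("englishPCFG.ser.gz");
        // the sentence is supplied already tokenized, as with the -tokenized flag
        String[] sent = { "This", "is", "an", "easy", "sentence", "." };
        List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
        Tree parse = lp.parse(rawWords);
        lp.getTreePrint().printTree(parse); // print in the configured output format
    }
}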