The search process, in general terms, reads the term dictionary and posting-list information from the index, merges the posting lists according to the user's query, obtains the result document set, and scores each document.
Overall it includes the following steps:
(1) IndexReader opens the index files and reads the segment and dictionary metadata into memory.
(2) An IndexSearcher is created on top of the reader.
(3) QueryParser parses the query string into a Query object tree.
(4) searcher.search(query, n) performs the search: it builds the Weight tree and the Scorer tree, merges the posting lists, scores and collects the documents, and returns the top N results.
To trace how Lucene searches the index files, the following files were indexed in advance:
file01.txt: apple apples cat dog
file02.txt: apple boy cat category
file03.txt: apply dog eat etc
file04.txt: apply cat foods
The code is:
IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
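For reference, a minimal sketch of how such an index might have been built (hypothetical setup code, assuming the Lucene 2.9-era API used throughout this article; the file contents are inlined for brevity):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
    new StandardAnalyzer(Version.LUCENE_CURRENT),
    true, IndexWriter.MaxFieldLength.UNLIMITED);
String[] contents = {
    "apple apples cat dog",   // file01.txt
    "apple boy cat category", // file02.txt
    "apply dog eat etc",      // file03.txt
    "apply cat foods"         // file04.txt
};
for (String text : contents) {
  Document doc = new Document();
  doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
  writer.addDocument(doc);
}
writer.optimize(); // optional: merge everything into a single segment
writer.close();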
This actually calls DirectoryReader.open(Directory, IndexDeletionPolicy, IndexCommit, boolean, int), whose main job is to create a SegmentInfos.FindSegmentsFile object and use it to find all the segments in the index and open them.
SegmentInfos.FindSegmentsFile.run(IndexCommit commit) mainly does the following:
String[] files = directory.listAll();
long genA = getCurrentSegmentGeneration(files);
long getCurrentSegmentGeneration(String[] files) {
  long max = -1;
  for (int i = 0; i < files.length; i++) {
    String file = files[i];
    if (file.startsWith(IndexFileNames.SEGMENTS)        // "segments_N"
        && !file.equals(IndexFileNames.SEGMENTS_GEN)) { // "segments.gen"
      long gen = generationFromSegmentsFileName(file);
      if (gen > max) {
        max = gen;
      }
    }
  }
  return max;
}
IndexInput genInput = directory.openInput(IndexFileNames.SEGMENTS_GEN);
int version = genInput.readInt();
long gen0 = genInput.readLong();
long gen1 = genInput.readLong();
if (gen0 == gen1) {
  genB = gen0;
}
if (genA > genB)
  gen = genA;
else
  gen = genB;
String segmentFileName = IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", gen); // segmentFileName, e.g. "segments_4"
SegmentInfos infos = new SegmentInfos();
infos.read(directory, segmentFileName);
The code of SegmentInfos.read(Directory, String) is as follows:

int format = input.readInt();
version = input.readLong();
counter = input.readInt();
for (int i = input.readInt(); i > 0; i--) {
  // read each segment and construct a SegmentInfo object
  add(new SegmentInfo(directory, format, input));
}
The SegmentInfo(Directory dir, int format, IndexInput input) constructor is as follows:

name = input.readString();
docCount = input.readInt();
delGen = input.readLong();
docStoreOffset = input.readInt();
if (docStoreOffset != -1) {
  docStoreSegment = input.readString();
  docStoreIsCompoundFile = (1 == input.readByte());
} else {
  docStoreSegment = name;
  docStoreIsCompoundFile = false;
}
hasSingleNormFile = (1 == input.readByte());
int numNormGen = input.readInt();
normGen = new long[numNormGen];
for (int j = 0; j < numNormGen; j++) {
  normGen[j] = input.readLong();
}
isCompoundFile = input.readByte();
delCount = input.readInt();
hasProx = input.readByte() == 1;

This needs no further explanation: it is easy to follow after reading the chapter "Lucene Learning Summary Part 3: Lucene's Index File Format (2)".
SegmentReader[] readers = new SegmentReader[sis.size()];
for (int i = sis.size()-1; i >= 0; i--) { // open each segment
  readers[i] = SegmentReader.get(readOnly, sis.info(i), termInfosIndexDivisor);
}
The code of SegmentReader.get(boolean, Directory, SegmentInfo, int, boolean, int) is as follows:

instance.core = new CoreReaders(dir, si, readBufferSize, termInfosIndexDivisor);
instance.core.openDocStores(si); // create the objects that read stored fields and term vectors
instance.loadDeletedDocs();      // read the deleted-documents file (.del)
instance.openNorms(instance.core.cfsDir, readBufferSize); // read the normalization factors (.nrm)
The CoreReaders(Directory dir, SegmentInfo si, int readBufferSize, int termsIndexDivisor) constructor is as follows:

cfsReader = new CompoundFileReader(dir, segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION, readBufferSize); // reader for the compound (cfs) file
fieldInfos = new FieldInfos(cfsDir, segment + "." + IndexFileNames.FIELD_INFOS_EXTENSION); // reads the field metadata (.fnm)
TermInfosReader reader = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize, termsIndexDivisor); // reads the term dictionary (.tii, .tis)
freqStream = cfsDir.openInput(segment + "." + IndexFileNames.FREQ_EXTENSION, readBufferSize); // used to read freq
proxStream = cfsDir.openInput(segment + "." + IndexFileNames.PROX_EXTENSION, readBufferSize); // used to read prox
The FieldInfos(Directory d, String name) constructor is as follows:

IndexInput input = d.openInput(name);
int firstInt = input.readVInt();
size = input.readVInt();
for (int i = 0; i < size; i++) {
  // read the field name
  String name = StringHelper.intern(input.readString());
  // read the field's flag bits
  byte bits = input.readByte();
  boolean isIndexed = (bits & IS_INDEXED) != 0;
  boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
  boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
  boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
  boolean omitNorms = (bits & OMIT_NORMS) != 0;
  boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
  boolean omitTermFreqAndPositions = (bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0;
  // wrap the field just read in a FieldInfo object and add it to fieldInfos
  addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector,
              storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions);
}
The main code of CoreReaders.openDocStores(SegmentInfo) is:

fieldsReaderOrig = new FieldsReader(storeDir, storesSegment, fieldInfos, readBufferSize,
    si.getDocStoreOffset(), si.docCount); // reads the stored fields (.fdx, .fdt)
termVectorsReaderOrig = new TermVectorsReader(storeDir, storesSegment, fieldInfos, readBufferSize,
    si.getDocStoreOffset(), si.docCount); // reads the term vectors (.tvx, .tvd, .tvf)
In Lucene, document numbers inside each segment start from 0. Since an index contains multiple segments, the documents must be renumbered: an array starts[] holds the document-number offset of each segment, so the documents of the i-th segment are numbered from starts[i] up to starts[i] plus that segment's document count.

private void initialize(SegmentReader[] subReaders) {
  this.subReaders = subReaders;
  starts = new int[subReaders.length + 1];
  for (int i = 0; i < subReaders.length; i++) {
    starts[i] = maxDoc;
    maxDoc += subReaders[i].maxDoc();
    if (subReaders[i].hasDeletions())
      hasDeletions = true;
  }
  starts[subReaders.length] = maxDoc;
}
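Given the starts[] array, a global document number can be mapped back to a (segment, segment-local document number) pair. A minimal sketch of that mapping (a hypothetical helper, not Lucene code; Lucene's own utility code does this with a binary search):

// returns the index of the segment containing docID; the local number
// within that segment is then docID - starts[i]
static int subIndex(int docID, int[] starts) {
  for (int i = starts.length - 2; i >= 0; i--) {
    if (docID >= starts[i]) {
      return i;
    }
  }
  throw new IllegalArgumentException("negative docID: " + docID);
}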
The resulting reader object looks roughly like this:

reader  ReadOnlyDirectoryReader (id=466)
    // the index folder
    // the segment metadata
    // a Reader for each segment
From the process above we can see how IndexReader behaves: when it is opened, it reads the segment metadata, the field metadata, the term-dictionary index, and related structures for every segment into memory, which is why opening an IndexReader is expensive and reusing one is recommended.
The code is:
IndexSearcher searcher = new IndexSearcher(reader);
The process is very simple:
private IndexSearcher(IndexReader r, boolean closeReader) {
  reader = r;
  // whether to close the reader when the searcher is closed
  this.closeReader = closeReader;
  // renumber the documents
  List subReadersList = new ArrayList();
  gatherSubReaders(subReadersList, reader);
  subReaders = subReadersList.toArray(new IndexReader[subReadersList.size()]);
  docStarts = new int[subReaders.length];
  int maxDoc = 0;
  for (int i = 0; i < subReaders.length; i++) {
    docStarts[i] = maxDoc;
    maxDoc += subReaders[i].maxDoc();
  }
}
On the surface IndexSearcher looks like a mere wrapper around the reader: many of its functions, such as int docFreq(Term term), Document doc(int i), and int maxDoc(), simply delegate to the reader's counterparts. However, it provides two very important pieces of functionality beyond that: building the Weight tree (createWeight) and driving the search and scoring process (the search family of functions).
Therefore, in applications where you only want the posting list of a term, it is better not to use IndexSearcher but to call IndexReader.termDocs(Term) directly, which skips the score computation entirely.
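For example, a minimal sketch of reading a posting list directly through the TermDocs API, with no scoring involved:

TermDocs termDocs = reader.termDocs(new Term("contents", "apple"));
while (termDocs.next()) {
  // document number within this reader, and how often "apple" occurs in it
  System.out.println("doc=" + termDocs.doc() + " freq=" + termDocs.freq());
}
termDocs.close();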
The code is:
QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "contents",
    new StandardAnalyzer(Version.LUCENE_CURRENT));
Query query = parser.parse("+(+apple* -boy) (cat* dog) -(eat~ foods)");
This step is relatively complex, involving JavaCC, QueryParser, analyzers, query syntax, and so on. It is not discussed in detail in this chapter; later chapters cover it one topic at a time.
The only thing to note here is that parsing the query string produces a Query tree. This tree is important: other trees are generated from it, and it runs through the entire search process.
query  BooleanQuery (id=96)
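For reference, the same tree can be assembled without QueryParser. A roughly equivalent sketch (the exact tree QueryParser produces may differ in boosts and analyzer effects):

BooleanQuery inner1 = new BooleanQuery();                 // (+apple* -boy)
inner1.add(new PrefixQuery(new Term("contents", "apple")), BooleanClause.Occur.MUST);
inner1.add(new TermQuery(new Term("contents", "boy")), BooleanClause.Occur.MUST_NOT);

BooleanQuery inner2 = new BooleanQuery();                 // (cat* dog)
inner2.add(new PrefixQuery(new Term("contents", "cat")), BooleanClause.Occur.SHOULD);
inner2.add(new TermQuery(new Term("contents", "dog")), BooleanClause.Occur.SHOULD);

BooleanQuery inner3 = new BooleanQuery();                 // (eat~ foods)
inner3.add(new FuzzyQuery(new Term("contents", "eat")), BooleanClause.Occur.SHOULD);
inner3.add(new TermQuery(new Term("contents", "foods")), BooleanClause.Occur.SHOULD);

BooleanQuery query = new BooleanQuery();
query.add(inner1, BooleanClause.Occur.MUST);     // +(...)
query.add(inner2, BooleanClause.Occur.SHOULD);   // (...)
query.add(inner3, BooleanClause.Occur.MUST_NOT); // -(...)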
Note that the Query object is a tree of BooleanClauses, each carrying an Occur flag (MUST, SHOULD, or MUST_NOT).
The code is:
TopDocs docs = searcher.search(query, 50);
Ultimately this calls search(createWeight(query), filter, n);
The search consists of the following sub-steps: creating the Weight object tree (which computes part of the score), creating the Scorer and SumScorer object trees, merging the posting lists, and collecting and returning the score-ranked documents.
The code of IndexSearcher(Searcher).createWeight(Query) is as follows:
protected Weight createWeight(Query query) throws IOException {
  return query.weight(this);
}
The code of BooleanQuery(Query).weight(Searcher) is:

public Weight weight(Searcher searcher) throws IOException {
  // rewrite the Query object tree
  Query query = searcher.rewrite(this);
  // build the Weight object tree
  Weight weight = query.createWeight(searcher);
  // compute the term-weight part of the score
  float sum = weight.sumOfSquaredWeights();
  float norm = getSimilarity(searcher).queryNorm(sum);
  weight.normalize(norm);
  return weight;
}
This in turn involves: rewriting the Query object tree, creating the Weight object tree, and computing the term-weight part of the score (sumOfSquaredWeights, queryNorm, normalize).
As BooleanQuery's rewrite function shows, rewriting is also a recursive process that goes all the way down to the leaf nodes of the Query tree.
The code of BooleanQuery.rewrite(IndexReader) is as follows:

BooleanQuery clone = null;
for (int i = 0; i < clauses.size(); i++) {
  BooleanClause c = clauses.get(i);
  // rewrite the Query object of each sub-clause
  Query query = c.getQuery().rewrite(reader);
  if (query != c.getQuery()) {
    if (clone == null)
      clone = (BooleanQuery)this.clone();
    // put the rewritten Query into the cloned new Query tree
    clone.clauses.set(i, new BooleanClause(query, c.getOccur()));
  }
}
if (clone != null) {
  return clone; // if any sub-clause was rewritten, return the cloned new Query tree
} else
  return this;  // otherwise return the old Query tree
Let's focus on the leaf nodes. They come in basically two kinds: TermQuery and MultiTermQuery. From the Lucene source we can see that TermQuery's rewrite simply returns the object itself; what really needs rewriting is MultiTermQuery, a Query that stands for multiple terms, such as the PrefixQuery and FuzzyQuery in our example.
Lucene cannot execute such a Query directly; it must be rewritten first:
As the Query tree above shows, every MultiTermQuery has a RewriteMethod member variable, which is used to rewrite the Query. There are several kinds:
ConstantScoreFilterRewrite (method one) wraps the MultiTermQuery in a filter-backed ConstantScoreQuery:

public Query rewrite(IndexReader reader, MultiTermQuery query) {
  Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
  result.setBoost(query.getBoost());
  return result;
}
The getDocIdSet function of MultiTermQueryWrapperFilter is implemented as follows:
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
  // get the term enumerator of the MultiTermQuery
  final TermEnum enumerator = query.getEnum(reader);
  try {
    if (enumerator.term() == null)
      return DocIdSet.EMPTY_DOCIDSET;
    // create the set of document numbers covering the multiple terms
    final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
    final int[] docs = new int[32];
    final int[] freqs = new int[32];
    TermDocs termDocs = reader.termDocs();
    try {
      int termCount = 0;
      // a loop that walks all terms of the MultiTermQuery, reads their
      // document numbers, and adds them to the set
      do {
        Term term = enumerator.term();
        if (term == null)
          break;
        termCount++;
        termDocs.seek(term);
        while (true) {
          final int count = termDocs.read(docs, freqs);
          if (count != 0) {
            for (int i = 0; i < count; i++) {
              bitSet.set(docs[i]);
            }
          } else {
            break;
          }
        }
      } while (enumerator.next());
      query.incTotalNumberOfTerms(termCount);
    } finally {
      termDocs.close();
    }
    return bitSet;
  } finally {
    enumerator.close();
  }
}
ScoringBooleanQueryRewrite expands the MultiTermQuery into a scoring BooleanQuery:

public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
  // get the term enumerator of the MultiTermQuery
  FilteredTermEnum enumerator = query.getEnum(reader);
  BooleanQuery result = new BooleanQuery(true);
  int count = 0;
  try {
    // a loop that takes all terms of the MultiTermQuery and adds them to the BooleanQuery
    do {
      Term t = enumerator.term();
      if (t != null) {
        TermQuery tq = new TermQuery(t);
        tq.setBoost(query.getBoost() * enumerator.difference());
        result.add(tq, BooleanClause.Occur.SHOULD);
        count++;
      }
    } while (enumerator.next());
  } finally {
    enumerator.close();
  }
  query.incTotalNumberOfTerms(count);
  return result;
}
The code of ConstantScoreAutoRewrite.rewrite is as follows:

public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
  final Collection pendingTerms = new ArrayList();
  // compute the document-count cutoff; docCountPercent defaults to 0.1,
  // i.e. 0.1% of the total number of documents in the index
  final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
  // compute the term-count limit, by default 350
  final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
  int docVisitCount = 0;
  FilteredTermEnum enumerator = query.getEnum(reader);
  try {
    // a loop that fetches all terms related to the MultiTermQuery
    while (true) {
      Term t = enumerator.term();
      if (t != null) {
        pendingTerms.add(t);
        docVisitCount += reader.docFreq(t);
      }
      // if the number of terms or the number of documents exceeds the limit,
      // merging the posting lists could become very expensive, so method one
      // (ConstantScoreFilterRewrite) is chosen
      if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
        Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
        result.setBoost(query.getBoost());
        return result;
      } else if (!enumerator.next()) {
        // if there are not too many terms and not too many documents, merging
        // will not suffer, so method two (ConstantScoreBooleanQueryRewrite) is chosen
        BooleanQuery bq = new BooleanQuery(true);
        for (final Term term : pendingTerms) {
          TermQuery tq = new TermQuery(term);
          bq.add(tq, BooleanClause.Occur.SHOULD);
        }
        Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
        result.setBoost(query.getBoost());
        query.incTotalNumberOfTerms(pendingTerms.size());
        return result;
      }
    }
  } finally {
    enumerator.close();
  }
}
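The rewrite method can also be selected explicitly on a MultiTermQuery. A small usage sketch, assuming the Lucene 2.9-era constants:

PrefixQuery prefixQuery = new PrefixQuery(new Term("contents", "apple"));
// force the scoring BooleanQuery expansion instead of the default
// ConstantScoreAutoRewrite heuristic
prefixQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);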
From the above we know that when the Query tree is rewritten, the TermEnum obtained from the MultiTermQuery is crucial: it yields all the terms corresponding to the MultiTermQuery. How is that done?
MultiTermQuery's getEnum returns a FilteredTermEnum, which has two member variables: TermEnum actualEnum enumerates all the terms in the index, while Term currentTerm points to the current term that satisfies the condition. FilteredTermEnum's next() is as follows:
public boolean next() throws IOException {
  if (actualEnum == null)
    return false;
  currentTerm = null;
  // keep fetching the next term in the index
  while (currentTerm == null) {
    if (endEnum())
      return false;
    if (actualEnum.next()) {
      Term term = actualEnum.term();
      // if the current term in the index satisfies the condition, make it the current term
      if (termCompare(term)) {
        currentTerm = term;
        return true;
      }
    } else
      return false;
  }
  currentTerm = null;
  return false;
}
Different MultiTermQueries implement termCompare differently:
For PrefixQuery (in PrefixTermEnum):

protected boolean termCompare(Term term) {
  // any term with the same prefix satisfies the condition
  if (term.field() == prefix.field() && term.text().startsWith(prefix.text())) {
    return true;
  }
  endEnum = true;
  return false;
}
For FuzzyQuery (in FuzzyTermEnum):

protected final boolean termCompare(Term term) {
  // for FuzzyQuery the prefix is set to "", so this condition always holds;
  // what is really computed is the similarity
  if (field == term.field() && term.text().startsWith(prefix)) {
    final String target = term.text().substring(prefix.length());
    this.similarity = similarity(target);
    return (similarity > minimumSimilarity);
  }
  endEnum = true;
  return false;
}

similarity computes the Levenshtein distance, also called the edit distance: for two strings, the minimum number of basic operations (insert, delete, replace) needed to turn one into the other.
private synchronized final float similarity(final String target) {
  final int m = target.length();
  final int n = text.length();
  // init matrix d
  for (int i = 0; i <= n; ++i) {
    p[i] = i;
  }
  // start computing edit distance
  for (int j = 1; j <= m; ++j) { // iterates through target
    int bestPossibleEditDistance = m;
    final char t_j = target.charAt(j-1); // jth character of t
    d[0] = j;
    for (int i = 1; i <= n; ++i) { // iterates through text
      // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
      if (t_j != text.charAt(i-1)) {
        d[i] = Math.min(Math.min(d[i-1], p[i]), p[i-1]) + 1;
      } else {
        d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]);
      }
      bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]);
    }
    // copy current distance counts to 'previous row' distance counts: swap p and d
    int _d[] = p;
    p = d;
    d = _d;
  }
  return 1.0f - ((float)p[n] / (float) (Math.min(n, m)));
}
For details of the edit distance algorithm, see http://www.merriampark.com/ld.htm. The algorithm for computing the edit distance between two strings s and t is:

Step 1: Set n to the length of s and m to the length of t. If n = 0, return m and exit; if m = 0, return n and exit. Construct a matrix d with rows 0..n and columns 0..m.
Step 2: Initialize the first row to 0..n and the first column to 0..m.
Step 3: Examine each character of s (i from 1 to n).
Step 4: Examine each character of t (j from 1 to m).
Step 5: If s[i] equals t[j], the cost is 0; otherwise the cost is 1.
Step 6: Set cell d[i,j] to the minimum of: the cell immediately above plus 1 (d[i-1,j] + 1), the cell immediately to the left plus 1 (d[i,j-1] + 1), and the cell diagonally above and to the left plus the cost (d[i-1,j-1] + cost).
Step 7: After the iteration completes, the distance is found in cell d[n,m].

For example, comparing "GUMBO" and "GAMBOL" gives an edit distance of 2 (replace U with A, then append L).
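As an illustration, a minimal, self-contained Java sketch of the classic dynamic-programming algorithm described above (not Lucene's optimized two-row variant shown in similarity(), which reuses the p[] and d[] arrays):

static int editDistance(String s, String t) {
  int n = s.length(), m = t.length();
  if (n == 0) return m;
  if (m == 0) return n;
  int[][] d = new int[n + 1][m + 1];
  for (int i = 0; i <= n; i++) d[i][0] = i; // deleting i characters
  for (int j = 0; j <= m; j++) d[0][j] = j; // inserting j characters
  for (int i = 1; i <= n; i++) {
    for (int j = 1; j <= m; j++) {
      int cost = (s.charAt(i - 1) == t.charAt(j - 1)) ? 0 : 1;
      d[i][j] = Math.min(Math.min(d[i - 1][j] + 1,   // delete
                                  d[i][j - 1] + 1),  // insert
                         d[i - 1][j - 1] + cost);    // replace (or match)
    }
  }
  return d[n][m]; // editDistance("GUMBO", "GAMBOL") == 2
}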
Here is an experiment showing how the ConstantScoreXXX rewrites affect scoring:
The following four documents were added to an index:

file01.txt : apple other other other other
file02.txt : apple apple other other other
file03.txt : apple apple apple other other
file04.txt : apple apple apple apple other

Searching for "apple" gives:

docid : 3 score : 0.67974937

That is, the documents are ranked by how many times they contain "apple".

Searching for "apple*" instead gives:

docid : 0 score : 1.0

That is, Lucene gives up computing the score.
After rewriting, the new Query tree is as follows:
query  BooleanQuery (id=89)
    // "apple*" rewritten via method one into a ConstantScoreQuery
    // "cat*" rewritten via method one into a ConstantScoreQuery
    // "eat~" (a FuzzyQuery) rewritten into a BooleanQuery
BooleanQuery.createWeight(Searcher) finally returns new BooleanWeight(searcher). The BooleanWeight constructor is implemented as follows:
public BooleanWeight(Searcher searcher) {
  this.similarity = getSimilarity(searcher);
  weights = new ArrayList(clauses.size());
  // also a recursive process, walking the new Query tree down to its leaves
  for (int i = 0; i < clauses.size(); i++) {
    weights.add(clauses.get(i).getQuery().createWeight(searcher));
  }
}
For a TermQuery leaf node, TermQuery.createWeight(Searcher) returns a new TermWeight(searcher) object, whose constructor is:
public TermWeight(Searcher searcher) {
  this.similarity = getSimilarity(searcher);
  // the idf is computed here
  idfExp = similarity.idfExplain(term, searcher);
  idf = idfExp.getIdf();
}
// the idf computation follows the formula in the documentation exactly:
public IDFExplanation idfExplain(final Term term, final Searcher searcher) {
  final int df = searcher.docFreq(term);
  final int max = searcher.maxDoc();
  final float idf = idf(df, max);
  return new IDFExplanation() {
    public float getIdf() {
      return idf;
    }
  };
}
public float idf(int docFreq, int numDocs) {
  return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
}
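As a worked example against the four files indexed earlier: the term "apple" occurs in file01.txt and file02.txt, so docFreq = 2 and numDocs = 4, giving idf = ln(4 / (2 + 1)) + 1 ≈ 1.2877.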
ConstantScoreQuery.createWeight(Searcher), on the other hand, only creates a ConstantScoreQuery.ConstantWeight(searcher) object and computes no idf at all.
The resulting Weight tree is as follows:
weight  BooleanQuery$BooleanWeight (id=169)
    // ConstantScore(contents:apple*)
    // contents:boy
    // ConstantScore(contents:cat*)
    // contents:dog
    // contents:eat
    // contents:cat^0.33333325
    // contents:foods
(1) First, compute sumOfSquaredWeights
According to the formula:

sumOfSquaredWeights = q.getBoost()^2 * Σ_{t in q} ( idf(t) * t.getBoost() )^2
The code is as follows:
float sum = weight.sumOfSquaredWeights();
// as we can see, this is again a recursive process
public float sumOfSquaredWeights() throws IOException {
  float sum = 0.0f;
  for (int i = 0; i < weights.size(); i++) {
    float s = weights.get(i).sumOfSquaredWeights();
    if (!clauses.get(i).isProhibited())
      sum += s;
  }
  sum *= getBoost() * getBoost(); // multiply by the query boost
  return sum;
}
For a TermWeight leaf node, TermQuery$TermWeight.sumOfSquaredWeights() is implemented as follows:
public float sumOfSquaredWeights() {
  // compute part of the score, idf * t.getBoost(); it will be used again later
  queryWeight = idf * getBoost();
  // compute (idf * t.getBoost())^2
  return queryWeight * queryWeight;
}
For a ConstantWeight leaf node, ConstantScoreQuery$ConstantWeight.sumOfSquaredWeights() is:
public float sumOfSquaredWeights() {
  // apart from the user-specified boost, nothing enters the score
  queryWeight = getBoost();
  return queryWeight * queryWeight;
}
(2) Compute queryNorm
Its formula is:

queryNorm(q) = 1 / sqrt(sumOfSquaredWeights)
Its code is as follows:
public float queryNorm(float sumOfSquaredWeights) {
  return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
}
(3) Factor queryNorm into the score
The code is:
weight.normalize(norm);
// again a recursive process
public void normalize(float norm) {
  norm *= getBoost();
  for (Weight w : weights) {
    w.normalize(norm);
  }
}
For a TermWeight leaf node, TermQuery$TermWeight.normalize(float) is as follows:
public void normalize(float queryNorm) {
  this.queryNorm = queryNorm;
  // queryWeight was idf*t.getBoost(); now it becomes queryNorm*idf*t.getBoost()
  queryWeight *= queryNorm;
  // so far the score has accumulated queryNorm*idf*t.getBoost()*idf = queryNorm*idf^2*t.getBoost()
  value = queryWeight * idf;
}
Lucene's overall scoring formula is:

score(q,d) = coord(q,d) * queryNorm(q) * Σ_{t in q} ( tf(t in d) * idf(t)^2 * t.getBoost() * norm(t,d) )

At this point we have computed the queryNorm(q) * idf(t)^2 * t.getBoost() part (the part highlighted in red in the original figure).
Once the Weight tree has been built, IndexSearcher.search(Weight, Filter, int) is called. Its code is as follows:
// (a) create the document collector
TopScoreDocCollector collector = TopScoreDocCollector.create(nDocs, !weight.scoresDocsOutOfOrder());
search(weight, filter, collector);
// (b) return the search results
return collector.topDocs();
public void search(Weight weight, Filter filter, Collector collector) throws IOException {
  if (filter == null) {
    for (int i = 0; i < subReaders.length; i++) {
      collector.setNextReader(subReaders[i], docStarts[i]);
      // (c) create the Scorer tree, and the SumScorer tree used to merge posting lists
      Scorer scorer = weight.scorer(subReaders[i], !collector.acceptsDocsOutOfOrder(), true);
      if (scorer != null) {
        // (d) merge the posting lists, (e) collect the document numbers
        scorer.score(collector);
      }
    }
  } else {
    for (int i = 0; i < subReaders.length; i++) {
      collector.setNextReader(subReaders[i], docStarts[i]);
      searchWithFilter(subReaders[i], weight, filter, collector);
    }
  }
}
This section focuses on (c), creating the Scorer tree and the SumScorer tree used to merge the posting lists. Section 2.4.3 analyzes (d), merging the posting lists; section 2.4.4 analyzes the creation of the result collector (a), the collection of result documents (e), and the return of the documents (b).
The code of BooleanQuery$BooleanWeight.scorer(IndexReader, boolean, boolean) is as follows:
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) {
  // Scorers for the MUST clauses
  List required = new ArrayList();
  // Scorers for the MUST_NOT clauses
  List prohibited = new ArrayList();
  // Scorers for the SHOULD clauses
  List optional = new ArrayList();
  // walk every sub-clause, create its sub-Scorer, and add it to the proper
  // list; this is a recursive process
  Iterator cIter = clauses.iterator();
  for (Weight w : weights) {
    BooleanClause c = cIter.next();
    Scorer subScorer = w.scorer(reader, true, false);
    if (subScorer == null) {
      if (c.isRequired()) {
        return null;
      }
    } else if (c.isRequired()) {
      required.add(subScorer);
    } else if (c.isProhibited()) {
      prohibited.add(subScorer);
    } else {
      optional.add(subScorer);
    }
  }
  // this case is described in detail in the section on BooleanScorer and scoreDocsInOrder
  if (!scoreDocsInOrder && topScorer && required.size() == 0 && prohibited.size() < 32) {
    return new BooleanScorer(similarity, minNrShouldMatch, optional, prohibited);
  }
  // build the Scorer object tree, and at the same time the SumScorer object tree
  return new BooleanScorer2(similarity, minNrShouldMatch, required, prohibited, optional);
}
對其葉子節點TermWeight來講,TermQuery$TermWeight.scorer(IndexReader, boolean, boolean) 代碼以下:
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
  // the posting list of this term
  TermDocs termDocs = reader.termDocs(term);
  if (termDocs == null)
    return null;
  return new TermScorer(this, termDocs, similarity, reader.norms(term.field()));
}
TermScorer(Weight weight, TermDocs td, Similarity similarity, byte[] norms) {
  super(similarity);
  this.weight = weight;
  this.termDocs = td;
  // the normalization factors
  this.norms = norms;
  // the score computed so far: queryNorm*idf^2*t.getBoost()
  this.weightValue = weight.getValue();
  for (int i = 0; i < SCORE_CACHE_SIZE; i++)
    scoreCache[i] = getSimilarity().tf(i) * weightValue;
}
對其葉子節點ConstantWeight來講,ConstantScoreQuery$ConstantWeight.scorer(IndexReader, boolean, boolean) 代碼以下:
public ConstantScorer(Similarity similarity, IndexReader reader, Weight w) {
  super(similarity);
  theScore = w.getValue();
  // get all the document numbers, forming a single posting list that takes part in the merge
  DocIdSet docIdSet = filter.getDocIdSet(reader);
  DocIdSetIterator docIdSetIterator = docIdSet.iterator();
}
For BooleanWeight, what is finally produced is a BooleanScorer2, whose constructor is:
public BooleanScorer2(Similarity similarity, int minNrShouldMatch, List required, List prohibited, List optional) {
  super(similarity);
  // bookkeeping for the coord factor of the scoring formula
  coordinator = new Coordinator();
  this.minNrShouldMatch = minNrShouldMatch;
  // the SHOULD part
  optionalScorers = optional;
  coordinator.maxCoord += optional.size();
  // the MUST part
  requiredScorers = required;
  coordinator.maxCoord += required.size();
  // the MUST_NOT part
  prohibitedScorers = prohibited;
  // precompute the coord values for every possible overlap
  coordinator.init();
  // build the SumScorer in preparation for merging the posting lists
  countingSumScorer = makeCountingSumScorer();
}
Coordinator.init() {
  coordFactors = new float[maxCoord + 1];
  Similarity sim = getSimilarity();
  for (int i = 0; i <= maxCoord; i++) {
    // relates the total number of sub-clauses to the number a document
    // satisfies; naturally, the more sub-clauses a document satisfies,
    // the higher its score
    coordFactors[i] = sim.coord(i, maxCoord);
  }
}
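In DefaultSimilarity, coord(overlap, maxOverlap) simply returns overlap / maxOverlap, so, for example, a document matching 2 of 3 sub-clauses has its summed score multiplied by 2/3.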
Besides the Scorer tree, a SumScorer tree is also built to represent the relationships between the clauses, in preparation for merging the posting lists.
Before analyzing BooleanScorer2.makeCountingSumScorer(), let's look at what relationships can exist between clauses and how they affect the merge.
Clauses fall into three kinds: MUST, SHOULD, and MUST_NOT.
Clauses can be combined in several ways, and each combination affects how the posting lists are merged.
Now let's analyze how the SumScorer is generated:
BooleanScorer2.makeCountingSumScorer() distinguishes two cases: when there are required clauses it calls makeCountingSumScorerSomeReq(), otherwise makeCountingSumScorerNoReq().
First, the code of makeCountingSumScorerSomeReq:
private Scorer makeCountingSumScorerSomeReq() {
  if (optionalScorers.size() == minNrShouldMatch) {
    // if the number of optional clauses exactly equals the minimum number of
    // optional clauses that must match, all optional clauses become required.
    // So all of them first form a ConjunctionScorer (intersection), and then
    // addProhibitedScorers adds the prohibited part, forming a ReqExclScorer
    // (required exclusive)
    ArrayList allReq = new ArrayList(requiredScorers);
    allReq.addAll(optionalScorers);
    return addProhibitedScorers(countingConjunctionSumScorer(allReq));
  } else {
    // first, all required clauses form a ConjunctionScorer (intersection)
    Scorer requiredCountingSumScorer =
        requiredScorers.size() == 1
        ? new SingleMatchScorer(requiredScorers.get(0))
        : countingConjunctionSumScorer(requiredScorers);
    if (minNrShouldMatch > 0) {
      // if there is a lower bound on the number of optional clauses that must
      // match, part of the optional clauses effectively become required and
      // affect the merge. So the ConjunctionScorer (intersection) of the
      // required part and the DisjunctionSumScorer (union) of the optional
      // part are combined into one ConjunctionScorer, and then the prohibited
      // part is added, forming a ReqExclScorer
      return addProhibitedScorers(
          dualConjunctionSumScorer(
              requiredCountingSumScorer,
              countingDisjunctionSumScorer(optionalScorers, minNrShouldMatch)));
    } else { // minNrShouldMatch == 0
      // if there is no lower bound on matching optional clauses, the optional
      // part does not affect the merge at all; it only raises the score of
      // documents that contain it. So the required and prohibited parts first
      // form a ReqExclScorer, and then the optional part is added, forming a
      // ReqOptSumScorer (required optional)
      return new ReqOptSumScorer(
          addProhibitedScorers(requiredCountingSumScorer),
          optionalScorers.size() == 1
          ? new SingleMatchScorer(optionalScorers.get(0))
          : countingDisjunctionSumScorer(optionalScorers, 1));
    }
  }
}
Next, the code of makeCountingSumScorerNoReq:
private Scorer makeCountingSumScorerNoReq() {
  // minNrShouldMatch optional scorers are required, but at least 1
  int nrOptRequired = (minNrShouldMatch < 1) ? 1 : minNrShouldMatch;
  Scorer requiredCountingSumScorer;
  if (optionalScorers.size() > nrOptRequired)
    // if there are more optional clauses than the minimum that must match,
    // part of the optional clauses effectively become required and affect the
    // merge, so a DisjunctionSumScorer is created
    requiredCountingSumScorer = countingDisjunctionSumScorer(optionalScorers, nrOptRequired);
  else if (optionalScorers.size() == 1)
    // if there is only one optional clause, return a SingleMatchScorer;
    // no merging is involved
    requiredCountingSumScorer = new SingleMatchScorer(optionalScorers.get(0));
  else
    // if there are no more optional clauses than the minimum that must match,
    // all optional clauses count as required, so a ConjunctionScorer is created
    requiredCountingSumScorer = countingConjunctionSumScorer(optionalScorers);
  // add the prohibited part, forming a ReqExclScorer
  return addProhibitedScorers(requiredCountingSumScorer);
}
After this step, the generated Scorer object tree is as follows:
scorer  BooleanScorer2 (id=50)
    // ConstantScore(contents:cat*)
    // weight(contents:dog)
    // weight(contents:eat)
    // weight(contents:cat^0.33333325)
    // weight(contents:foods)
    // weight(contents:boy)
    // ConstantScore(contents:apple*)
The generated SumScorer object tree is as follows:
scorer  BooleanScorer2 (id=50)
    // ConstantScore(contents:cat*)
    // weight(contents:dog)
    // weight(contents:eat)
    // weight(contents:cat^0.33333325)
    // weight(contents:foods)
    // weight(contents:boy)
    // ConstantScore(contents:apple*)
With the Scorer tree and the SumScorer tree in hand, what follows is the merging of the posting lists and the computation of the scores.
Merging the posting lists is analyzed in this section; how the Scorer tree computes the scores is analyzed in the next.
The code of BooleanScorer2.score(Collector) is as follows:
public void score(Collector collector) throws IOException {
  collector.setScorer(this);
  while ((doc = countingSumScorer.nextDoc()) != NO_MORE_DOCS) {
    collector.collect(doc);
  }
}
From the code we can see that this process simply keeps fetching the next document number and adding it to the result set.
Fetching the next document is the posting-list merge itself: it produces the number of the next document after all the query conditions have been taken into account.
Since the SumScorer is a tree, the merge also follows the tree structure: subtrees are merged first, then subtrees with subtrees, all the way up to the root.
According to the analysis in the previous section, merging the posting lists mainly uses the following SumScorers: ConjunctionScorer (intersection), DisjunctionSumScorer (union), ReqExclScorer (difference: required excluding prohibited), and ReqOptSumScorer (required plus optional).
Let's analyze them one by one:
ConjunctionScorer has a member variable Scorer[] scorers, an array of Scorers, each representing a posting list. ConjunctionScorer takes the intersection of these posting lists and returns the document numbers in the intersection one by one from nextDoc().
To make this clear, here is a concrete example of the merging process:
(1) Initially the posting lists are as follows:
(2) In ConjunctionScorer's constructor, each Scorer's nextDoc() is first called, so that each Scorer obtains its first document number.
for (int i = 0; i < scorers.length; i++) {
  if (scorers[i].nextDoc() == NO_MORE_DOCS) {
    // since we take the intersection, if any posting list is empty,
    // the intersection is empty
    lastDoc = NO_MORE_DOCS;
    return;
  }
}
(3) In ConjunctionScorer's constructor, the Scorers are sorted in increasing order of their first document number.
Arrays.sort(scorers, new Comparator() {
  public int compare(Scorer o1, Scorer o2) {
    return o1.docID() - o2.docID();
  }
});
The posting lists now look like this:
(4) In ConjunctionScorer's constructor, doNext() is called for the first time.
if (doNext() == NO_MORE_DOCS) {
  lastDoc = NO_MORE_DOCS;
  return;
}
private int doNext() throws IOException {
  int first = 0;
  int doc = scorers[scorers.length - 1].docID();
  Scorer firstScorer;
  while ((firstScorer = scorers[first]).docID() < doc) {
    doc = firstScorer.advance(doc);
    first = first == scorers.length - 1 ? 0 : first + 1;
  }
  return doc;
}
Let's call the Scorer holding the smallest document number "first". In fact, from the line first = first == scorers.length - 1 ? 0 : first + 1; in doNext(), we can see that during processing the Scorer array is treated as a circular array (a ring).
At this moment scorers[scorers.length - 1] holds the largest document number. The loop in doNext() uses firstScorer.advance(doc) (which jumps to the first document greater than or equal to doc) to skip over every document smaller than the current largest document number in the array: since those documents are smaller than the largest one, and ConjunctionScorer takes the intersection, they cannot possibly be in the intersection.
The process proceeds as follows:
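To make the ring strategy concrete, here is a minimal standalone sketch of the same intersection over sorted int arrays (a hypothetical illustration, not Lucene code; advance() is modeled by a linear scan):

import java.util.ArrayList;
import java.util.List;

static List<Integer> intersect(int[][] lists) {
  int k = lists.length;
  int[] pos = new int[k]; // cursor into each sorted list
  List<Integer> out = new ArrayList<Integer>();
  outer:
  while (pos[0] < lists[0].length) {
    int doc = lists[0][pos[0]]; // current candidate document number
    int agree = 1;              // how many lists currently sit on doc
    int i = 1;                  // walk the other lists as a ring
    while (agree < k) {
      // advance list i to the first element >= doc
      while (pos[i] < lists[i].length && lists[i][pos[i]] < doc) pos[i]++;
      if (pos[i] >= lists[i].length) break outer; // some list exhausted: done
      if (lists[i][pos[i]] == doc) {
        agree++;                // one more list contains the candidate
      } else {
        doc = lists[i][pos[i]]; // larger candidate: start counting again
        agree = 1;
      }
      i = (i + 1) % k;          // the ring step, as in doNext()
    }
    out.add(doc);               // all k lists contain doc
    pos[0]++;                   // move past it and look for the next match
  }
  return out;
}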
(5) When BooleanScorer2.score(Collector) calls ConjunctionScorer.nextDoc() for the first time, lastDoc is -1. Per the implementation of nextDoc, it returns lastDoc = scorers[scorers.length - 1].docID(), i.e. 11, and lastDoc is set to 11.
public int nextDoc() throws IOException {
  if (lastDoc == NO_MORE_DOCS) {
    return lastDoc;
  } else if (lastDoc == -1) {
    return lastDoc = scorers[scorers.length - 1].docID();
  }
  scorers[(scorers.length - 1)].nextDoc();
  return lastDoc = doNext();
}
(6) In BooleanScorer2.score(Collector), after nextDoc() is called, collector.collect(doc) collects the document number (the collection process is analyzed in the next section). During collection, ConjunctionScorer.docID() is called and returns lastDoc, i.e. the current document number 11.
(7) When BooleanScorer2.score(Collector) calls ConjunctionScorer.nextDoc() for the second time:
(8) lastDoc is set to 13; during collection, ConjunctionScorer.docID() is called and returns lastDoc, i.e. the current document number 13.
(9) When nextDoc() is called once more, it returns NO_MORE_DOCS and the posting-list merge is finished.
DisjunctionSumScorer has a member variable List subScorers, a list of Scorers, each representing a posting list. DisjunctionSumScorer takes the union of these posting lists and returns the document numbers in the union one by one from nextDoc().
DisjunctionSumScorer also has a member variable minimumNrMatchers, the minimum number of sub-conditions that must be satisfied: a document number is returned only if at least minimumNrMatchers of the subScorers contain it.
To make this clear, here is a concrete example of the merging process:
(1) Suppose minimumNrMatchers = 4; initially the posting lists are as follows:
(2) In DisjunctionSumScorer's constructor, the posting lists are put into a priority queue scorerDocQueue (implemented as a min-heap); the Scorers in the queue are ordered by their first document number.
private void initScorerDocQueue() throws IOException {
  scorerDocQueue = new ScorerDocQueue(nrScorers);
  for (Scorer se : subScorers) {
    if (se.nextDoc() != NO_MORE_DOCS) { // this nextDoc gives each Scorer its first document number
      scorerDocQueue.insert(se);
    }
  }
}
(3) When BooleanScorer2.score(Collector) calls nextDoc() for the first time, advanceAfterCurrent is invoked.
public int nextDoc() throws IOException {
  if (scorerDocQueue.size() < minimumNrMatchers || !advanceAfterCurrent()) {
    currentDoc = NO_MORE_DOCS;
  }
  return currentDoc;
}
protected boolean advanceAfterCurrent() throws IOException {
  do {
    currentDoc = scorerDocQueue.topDoc();     // the current document number is the heap top
    currentScore = scorerDocQueue.topScore(); // the current document's score
    nrMatchers = 1; // the number of sub-conditions the current document satisfies,
                    // i.e. the number of Scorers containing the current document number
    do {
      // topNextAndAdjustElsePop means: the top Scorer advances to its next
      // document (Next); if that succeeds, the heap top may no longer be the
      // minimum, so it is adjusted (Adjust, i.e. downHeap()); if it fails, the
      // top Scorer is exhausted and is popped off the queue (Pop).
      if (!scorerDocQueue.topNextAndAdjustElsePop()) {
        if (scorerDocQueue.size() == 0) {
          break; // nothing more to advance, check for last match.
        }
      }
      // after the top Scorer has advanced and the heap has been adjusted, look
      // at the new top Scorer's first document; if it is not currentDoc, then
      // nrMatchers for currentDoc is complete, so leave the inner loop.
      if (scorerDocQueue.topDoc() != currentDoc) {
        break; // All remaining subscorers are after currentDoc.
      }
      // otherwise increment nrMatchers: one more Scorer contains this document number.
      currentScore += scorerDocQueue.topScore();
      nrMatchers++;
    } while (true);
    // if nrMatchers reaches the minimum number of sub-conditions that must be
    // satisfied, currentDoc qualifies, so return true; during collection,
    // DisjunctionSumScorer.docID() will be called and return currentDoc.
    if (nrMatchers >= minimumNrMatchers) {
      return true;
    } else if (scorerDocQueue.size() < minimumNrMatchers) {
      return false;
    }
  } while (true);
}
The concrete steps of advanceAfterCurrent are as follows:
(4) currentDoc is set to 7; during collection, DisjunctionSumScorer.docID() is called and returns currentDoc, i.e. the current document number 7.
(5) When nextDoc() is called again, documents 8, 9, and 11 do not meet the requirement; finally NO_MORE_DOCS is returned and the merge is finished.
ReqExclScorer has a member variable Scorer reqScorer for the part that must match (required) and a member variable DocIdSetIterator exclDisi for the part that must not match. ReqExclScorer returns the difference of the two posting lists, i.e. the documents in reqScorer's posting list with those in exclDisi removed.
When nextDoc() is called, it first gets the first document number from reqScorer; toNonExcluded() then checks whether that document is excluded by exclDisi. If not, the document number is returned; if it is excluded, the next document number is fetched and checked in turn, until a document is found or NO_MORE_DOCS is returned.
public int nextDoc() throws IOException {
  if (reqScorer == null) {
    return doc;
  }
  doc = reqScorer.nextDoc();
  if (doc == NO_MORE_DOCS) {
    reqScorer = null;
    return doc;
  }
  if (exclDisi == null) {
    return doc;
  }
  return doc = toNonExcluded();
}
private int toNonExcluded() throws IOException {
  // the current excluded document number
  int exclDoc = exclDisi.docID();
  // the current required document number
  int reqDoc = reqScorer.docID();
  do {
    // if the required document number is smaller than the excluded one, then
    // since posting lists are sorted in increasing order, this required
    // document cannot be excluded: return it.
    if (reqDoc < exclDoc) {
      return reqDoc;
    } else if (reqDoc > exclDoc) {
      // if the required document number is larger than the excluded one, it
      // may be excluded, so advance exclDisi to the first document greater
      // than or equal to the required document number.
      exclDoc = exclDisi.advance(reqDoc);
      // if the excluded posting list is exhausted, the required document
      // cannot be excluded: return it.
      if (exclDoc == NO_MORE_DOCS) {
        exclDisi = null;
        return reqDoc;
      }
      // if after advancing, exclDisi is beyond the required document number,
      // the required document is not excluded: return it.
      if (exclDoc > reqDoc) {
        return reqDoc; // not excluded
      }
    }
    // if the required document number equals the excluded one, it is excluded;
    // fetch the next required document number.
  } while ((reqDoc = reqScorer.nextDoc()) != NO_MORE_DOCS);
  reqScorer = null;
  return NO_MORE_DOCS;
}
ReqOptSumScorer has two member variables: Scorer reqScorer for the required posting list and Scorer optScorer for the optional posting list.
As the code shows, nextDoc() simply returns the required posting list; the optional part only raises the score when score() is computed.
public int nextDoc() throws IOException {
  return reqScorer.nextDoc();
}
When BooleanWeight.scorer builds the Scorer tree, besides the BooleanScorer2 described above it can also produce a BooleanScorer. This happens when scoreDocsInOrder is false, the scorer is a top-level scorer, there are no MUST clauses, and there are fewer than 32 MUST_NOT clauses:
public boolean scoresDocsOutOfOrder() {
  int numProhibited = 0;
  for (BooleanClause c : clauses) {
    if (c.isRequired()) {
      return false;
    } else if (c.isProhibited()) {
      ++numProhibited;
    }
  }
  if (numProhibited > 32) {
    return false;
  }
  return true;
}
As we can see, the last two conditions are consistent with the logic in the scoresDocsOutOfOrder function.
Now let's see how BooleanScorer merges posting lists:
public int nextDoc() throws IOException {
  boolean more;
  do {
    // bucketTable is effectively a queue of documents from the merged posting lists
    while (bucketTable.first != null) {
      // take the first document off the queue and return it
      current = bucketTable.first;
      bucketTable.first = current.next;
      if ((current.bits & prohibitedMask) == 0 &&
          (current.bits & requiredMask) == requiredMask &&
          current.coord >= minNrShouldMatch) {
        return doc = current.doc;
      }
    }
    // if the queue is empty, refill it
    more = false;
    end += BucketTable.SIZE;
    // fill the queue from each Scorer's posting list in turn, until it is full
    for (SubScorer sub = scorers; sub != null; sub = sub.next) {
      Scorer scorer = sub.scorer;
      sub.collector.setScorer(scorer);
      int doc = scorer.docID();
      while (doc < end) {
        sub.collector.collect(doc);
        doc = scorer.nextDoc();
      }
      more |= (doc != NO_MORE_DOCS);
    }
  } while (bucketTable.first != null || more);
  return doc = NO_MORE_DOCS;
}
public final void collect(final int doc) throws IOException {
  final BucketTable table = bucketTable;
  final int i = doc & BucketTable.MASK;
  Bucket bucket = table.buckets[i];
  if (bucket == null)
    table.buckets[i] = bucket = new Bucket();
  if (bucket.doc != doc) {
    bucket.doc = doc;
    bucket.score = scorer.score();
    bucket.bits = mask;
    bucket.coord = 1;
    bucket.next = table.first;
    table.first = bucket;
  } else {
    bucket.score += scorer.score();
    bucket.bits |= mask;
    bucket.coord++;
  }
}
From the implementation above we can see that when BooleanScorer merges posting lists, the documents do not come out in increasing order of document number.
Conceptually, under an AND condition the merge algorithm must walk the posting lists in increasing document-number order. When there is no AND condition, i.e. everything is OR, it no longer matters whether the document numbers are returned in order, so scoreDocsInOrder is false.
So, regarding the DisjunctionSumScorer above: the query "apple boy dog" by itself cannot actually produce a DisjunctionSumScorer; one is produced only when an AND condition is present.
We can verify this with an experiment:
For the query "apple boy dog", the generated Scorer is:
scorer  BooleanScorer (id=34)
For the query "+hello (apple boy dog)", the generated Scorer object is:
scorer  BooleanScorer2 (id=40)
    // weight(contents:apple)
    // weight(contents:boy)
    // weight(contents:cat)
In IndexSearcher.search(Weight, Filter, int), we have the following code:
TopScoreDocCollector collector = TopScoreDocCollector.create(nDocs, !weight.scoresDocsOutOfOrder());
search(weight, filter, collector);
return collector.topDocs();
TopScoreDocCollector collector = TopScoreDocCollector.create(nDocs, !weight.scoresDocsOutOfOrder());
public static TopScoreDocCollector create(int numHits, boolean docsScoredInOrder) {
  if (docsScoredInOrder) {
    return new InOrderTopScoreDocCollector(numHits);
  } else {
    return new OutOfOrderTopScoreDocCollector(numHits);
  }
}
Depending on whether documents will be returned in increasing document-number order, it creates either an InOrderTopScoreDocCollector or an OutOfOrderTopScoreDocCollector; the two differ in how they collect documents.
Once the Scorer tree and the SumScorer tree have been built, IndexSearcher.search(Weight, Filter, Collector) makes the following call:
scorer.score(collector), which, as the code below shows, keeps fetching document numbers from the merged posting lists and collects them.
public void score(Collector collector) throws IOException {
  collector.setScorer(this);
  while ((doc = countingSumScorer.nextDoc()) != NO_MORE_DOCS) {
    collector.collect(doc);
  }
}
The collect function of InOrderTopScoreDocCollector is as follows:
public void collect(int doc) throws IOException {
  float score = scorer.score();
  totalHits++;
  if (score <= pqTop.score) {
    return;
  }
  pqTop.doc = doc + docBase;
  pqTop.score = score;
  pqTop = pq.updateTop();
}
The collect function of OutOfOrderTopScoreDocCollector is as follows:
public void collect(int doc) throws IOException {
  float score = scorer.score();
  totalHits++;
  doc += docBase;
  if (score < pqTop.score || (score == pqTop.score && doc > pqTop.doc)) {
    return;
  }
  pqTop.doc = doc;
  pqTop.score = score;
  pqTop = pq.updateTop();
}
From the code above, the collector's job is to compute each document's score, put the document into a priority queue (a min-heap) according to its score, and finally take the top N documents from the queue.
One question remains: if we want 10 documents and documents 8, 9, 10, 11, 12 all have the same score, which are discarded? Lucene's policy is that on equal scores, the smaller document number wins.
That is, 8, 9, 10 are kept and 11, 12 are discarded.
As described above, when the collector is created, an InOrderTopScoreDocCollector or OutOfOrderTopScoreDocCollector is chosen depending on whether documents will be returned in increasing document-number order.
For InOrderTopScoreDocCollector, since documents come in order, a later document number is always larger than the earlier ones, so a document can be discarded immediately when score <= pqTop.score.
For OutOfOrderTopScoreDocCollector, since documents do not come in order, a document is discarded when score < pqTop.score, or when score == pqTop.score and doc > pqTop.doc, because a document with the same score but a smaller number may still arrive later.
BooleanScorer2's score function is as follows:
public float score() throws IOException {
  coordinator.nrMatchers = 0;
  float sum = countingSumScorer.score();
  return sum * coordinator.coordFactors[coordinator.nrMatchers];
}
ConjunctionScorer's score function is as follows:
public float score() throws IOException {
  float sum = 0.0f;
  for (int i = 0; i < scorers.length; i++) {
    sum += scorers[i].score();
  }
  return sum * coord;
}
DisjunctionSumScorer's score function is as follows:
public float score() throws IOException {
  return currentScore;
}

currentScore is computed as:

currentScore += scorerDocQueue.topScore();

This happens inside DisjunctionSumScorer's posting-list merge described above; topScore takes the score of the heap top:

public final float topScore() throws IOException {
  return topHSD.scorer.score();
}
ReqExclScorer's score function is as follows:
public float score() throws IOException {
  return reqScorer.score();
}
ReqOptSumScorer's score function is as follows:
public float score() throws IOException {
  int curDoc = reqScorer.docID();
  float reqScore = reqScorer.score();
  if (optScorer == null) {
    return reqScore;
  }
  int optScorerDoc = optScorer.docID();
  if (optScorerDoc < curDoc && (optScorerDoc = optScorer.advance(curDoc)) == NO_MORE_DOCS) {
    optScorer = null;
    return reqScore;
  }
  return optScorerDoc == curDoc ? reqScore + optScorer.score() : reqScore;
}
TermScorer's score function is as follows:
public float score() {
  int f = freqs[pointer];
  float raw = f < SCORE_CACHE_SIZE ? scoreCache[f] : getSimilarity().tf(f)*weightValue;
  return norms == null ? raw : raw * SIM_NORM_DECODER[norms[doc] & 0xFF];
}
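In DefaultSimilarity, tf(f) = sqrt(f), so each term's contribution here works out to sqrt(freq) * queryNorm * idf^2 * t.getBoost() * norm(t,d), with norm(t,d) decoded from a single byte by SIM_NORM_DECODER; this is exactly the per-term factor of the overall formula below.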
Recall Lucene's overall scoring formula:

score(q,d) = coord(q,d) * queryNorm(q) * Σ_{t in q} ( tf(t in d) * idf(t)^2 * t.getBoost() * norm(t,d) )

Section 2.4.1 computed the queryNorm(q) * idf(t)^2 * t.getBoost() part (red in the original figure); this step computed the tf(t in d) * norm(t,d) part (blue in the original figure).
This concludes the score computation.
In IndexSearcher.search(Weight, Filter, int), after the documents have been collected, collector.topDocs() is called to return the N highest-scoring documents:
public final TopDocs topDocs() {
  return topDocs(0, totalHits < pq.size() ? totalHits : pq.size());
}
public final TopDocs topDocs(int start, int howMany) {
  int size = totalHits < pq.size() ? totalHits : pq.size();
  howMany = Math.min(size - start, howMany);
  ScoreDoc[] results = new ScoreDoc[howMany];
  // since pq is a min-heap, the lowest-scoring documents must be popped first.
  // For example, if pq holds 50 documents in total and we want documents 5
  // through 10, we first pop the 40 lowest-scoring documents.
  for (int i = pq.size() - start - howMany; i > 0; i--) {
    pq.pop();
  }
  populateResults(results, howMany);
  return newTopDocs(results, start);
}
protected void populateResults(ScoreDoc[] results, int howMany) {
  // then pop documents 5 through 10 from pq and put them into results in
  // descending order of score
  for (int i = howMany - 1; i >= 0; i--) {
    results[i] = pq.pop();
  }
}
protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
  return results == null ? EMPTY_TOPDOCS : new TopDocs(totalHits, results);
}
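Putting it all together, a minimal sketch of consuming the returned TopDocs:

TopDocs docs = searcher.search(query, 50);
for (ScoreDoc sd : docs.scoreDocs) {
  Document d = searcher.doc(sd.doc); // load the stored fields of the hit
  System.out.println("doc=" + sd.doc + " score=" + sd.score + " contents=" + d.get("contents"));
}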
The above described how posting lists are merged and scores are computed during search. The index information itself, however, is read from the index files; below we analyze how that reading happens.
In fact only two kinds of information are read: term-dictionary information and posting-list information.
Dictionary information is read while the Scorer tree is being built; the actual reading happens at the leaf nodes, when the TermScorer is created.
Posting-list information is read while the posting lists are being merged; the actual reading happens in the leaf node's TermScorer.nextDoc().
This step happens in TermWeight.scorer(IndexReader, boolean, boolean), whose code is as follows:
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) {
  TermDocs termDocs = reader.termDocs(term);
  if (termDocs == null)
    return null;
  return new TermScorer(this, termDocs, similarity, reader.norms(term.field()));
}
ReadOnlySegmentReader.termDocs(Term) locates the term and creates the TermDocs object used to read its posting list:
public TermDocs termDocs(Term term) throws IOException {
  ensureOpen();
  TermDocs termDocs = termDocs();
  termDocs.seek(term);
  return termDocs;
}
The termDocs() function first creates a SegmentTermDocs object for reading the posting list:
protected SegmentTermDocs(SegmentReader parent) {
  this.parent = parent;
  this.freqStream = (IndexInput) parent.core.freqStream.clone(); // used to read freq
  synchronized (parent) {
    this.deletedDocs = parent.deletedDocs;
  }
  this.skipInterval = parent.core.getTermsReader().getSkipInterval();
  this.maxSkipLevels = parent.core.getTermsReader().getMaxSkipLevels();
}
SegmentTermDocs.seek(Term) reads the term from the dictionary and points freqStream at the term's posting list:
public void seek(Term term) throws IOException {
  TermInfo ti = parent.core.getTermsReader().get(term);
  seek(ti, term);
}
TermInfosReader.get(Term, boolean) mainly reads the term from the dictionary to obtain a TermInfo. The code is as follows:

private TermInfo get(Term term, boolean useCache) {
  if (size == 0) return null;
  ensureIndexIsRead();
  TermInfo ti;
  ThreadResources resources = getThreadResources();
  SegmentTermEnum enumerator = resources.termEnum;
  seekEnum(enumerator, getIndexOffset(term));
  enumerator.scanTo(term);
  if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
    ti = enumerator.termInfo();
  } else {
    ti = null;
  }
  return ti;
}
When IndexReader opens an index folder, it reads the term index from the .tii file into the indexPointers array. TermInfosReader.seekEnum(SegmentTermEnum enumerator, int indexOffset) looks in the indexPointers array for the position of the region of the .tis file in which the term resides.
private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
  enumerator.seek(indexPointers[indexOffset],
      (indexOffset * totalIndexInterval) - 1,
      indexTerms[indexOffset], indexInfos[indexOffset]);
}
final void SegmentTermEnum.seek(long pointer, int p, Term t, TermInfo ti) {
  input.seek(pointer);
  position = p;
  termBuffer.set(t);
  prevBuffer.reset();
  termInfo.set(ti);
}
SegmentTermEnum.scanTo(Term) then walks through that region entry by entry until the term is found:
final int scanTo(Term term) throws IOException {
  scanBuffer.set(term);
  int count = 0;
  // keep reading the next term into termBuffer; the target term is in
  // scanBuffer; when the two are equal, the target term has been found
  while (scanBuffer.compareTo(termBuffer) > 0 && next()) {
    count++;
  }
  return count;
}
public final boolean next() throws IOException {
  if (position++ >= size - 1) {
    prevBuffer.set(termBuffer);
    termBuffer.reset();
    return false;
  }
  prevBuffer.set(termBuffer);
  // read the term's string
  termBuffer.read(input, fieldInfos);
  // read docFreq, i.e. how many documents contain this term
  termInfo.docFreq = input.readVInt();
  // read the pointer deltas
  termInfo.freqPointer += input.readVLong();
  termInfo.proxPointer += input.readVLong();
  if (termInfo.docFreq >= skipInterval)
    termInfo.skipOffset = input.readVInt();
  indexPointer += input.readVLong();
  return true;
}
The code of TermBuffer.read(IndexInput, FieldInfos) is:

public final void read(IndexInput input, FieldInfos fieldInfos) {
  this.term = null;
  int start = input.readVInt();
  int length = input.readVInt();
  int totalLength = start + length;
  text.setLength(totalLength);
  input.readChars(text.result, start, length);
  this.field = fieldInfos.fieldName(input.readVInt());
}
SegmentTermDocs.seek(TermInfo ti, Term term) uses the TermInfo to point freqStream at the term's posting-list position:
void seek(TermInfo ti, Term term) {
  count = 0;
  FieldInfo fi = parent.core.fieldInfos.fieldInfo(term.field);
  df = ti.docFreq;
  doc = 0;
  freqBasePointer = ti.freqPointer;
  proxBasePointer = ti.proxPointer;
  skipPointer = freqBasePointer + ti.skipOffset;
  freqStream.seek(freqBasePointer);
  haveSkipped = false;
}
Once the term's TermInfo has been read and freqStream points at the term's posting list, the posting-list information is read in TermScorer.nextDoc():
public int nextDoc() throws IOException {
  pointer++;
  if (pointer >= pointerMax) {
    pointerMax = termDocs.read(docs, freqs);
    if (pointerMax != 0) {
      pointer = 0;
    } else {
      termDocs.close();
      return doc = NO_MORE_DOCS;
    }
  }
  doc = docs[pointer];
  return doc;
}
The code of SegmentTermDocs.read(int[], int[]) is as follows:
public int read(final int[] docs, final int[] freqs) {
  final int length = docs.length;
  int i = 0;
  while (i < length && count < df) {
    // read the doc id (delta-encoded, with the low bit flagging freq == 1)
    final int docCode = freqStream.readVInt();
    doc += docCode >>> 1;
    if ((docCode & 1) != 0)
      freq = 1;
    else
      freq = freqStream.readVInt(); // read the freq
    count++;
    if (deletedDocs == null || !deletedDocs.get(doc)) {
      docs[i] = doc;
      freqs[i] = freq;
      ++i;
    }
  }
  return i;
}