Lucene系列（三）查詢及高亮

時間 2019-11-09

標籤 lucene 系列查詢简体版

原文鏈接

系列文章：

Lucene系列（一）快速入門

Lucene系列（二）luke使用及索引文檔的基本操作

Lucene系列（三）查詢及高亮

一、準備

建立項目並添加Maven依賴

<dependencies>
	<!-- JUnit 4, used by the test classes only -->
	<dependency>
		<groupId>junit</groupId>
		<artifactId>junit</artifactId>
		<version>4.12</version>
		<scope>test</scope>
	</dependency>
	<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
	<!-- Lucene core library -->
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-core</artifactId>
		<version>7.2.1</version>
	</dependency>
	<!-- Lucene query parser -->
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-queryparser</artifactId>
		<version>7.2.1</version>
	</dependency>
	<!-- Additional Lucene analyzers -->
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-analyzers-common</artifactId>
		<version>7.2.1</version>
	</dependency>
	<!-- Highlighting of matched terms -->
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-highlighter</artifactId>
		<version>7.2.1</version>
	</dependency>

	<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-smartcn -->
	<!-- Chinese word segmentation; kept on the same version as the other Lucene modules -->
	<dependency>
		<groupId>org.apache.lucene</groupId>
		<artifactId>lucene-analyzers-smartcn</artifactId>
		<version>7.2.1</version>
	</dependency>
</dependencies>

複製代碼

二、對特定單詞查詢／模糊查詢和查詢表達式

寫索引

import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Indexes the regular files of a data directory into a filesystem-backed Lucene index,
 * analysing text with {@link StandardAnalyzer}.
 */
public class Indexer {

	private IndexWriter writer; // index writer instance

	/**
	 * Creates the {@link IndexWriter} used by this indexer.
	 *
	 * @param indexDir path of the directory that holds the index
	 * @throws Exception if the directory or the writer cannot be opened
	 */
	public Indexer(String indexDir)throws Exception{
		Directory dir=FSDirectory.open(Paths.get(indexDir));
		Analyzer analyzer=new StandardAnalyzer(); // standard analyzer
		IndexWriterConfig iwc=new IndexWriterConfig(analyzer);
		writer=new IndexWriter(dir, iwc);
	}

	/**
	 * Closes the index writer.
	 *
	 * @throws Exception if closing the writer fails
	 */
	public void close()throws Exception{
		writer.close();
	}

	/**
	 * Indexes every regular file directly inside {@code dataDir}.
	 *
	 * @param dataDir directory whose files are indexed
	 * @return the document count reported by the writer after indexing
	 * @throws IllegalArgumentException if {@code dataDir} cannot be listed as a directory
	 * @throws Exception if reading a file or writing to the index fails
	 */
	public int index(String dataDir)throws Exception{
		File[] files=new File(dataDir).listFiles();
		if(files==null){
			// listFiles() yields null when the path is not a directory or cannot be read
			throw new IllegalArgumentException("Not a readable directory: "+dataDir);
		}
		for(File f:files){
			// Sub-directories cannot be opened with FileReader, so only regular files are indexed
			if(f.isFile()){
				indexFile(f);
			}
		}
		return writer.numDocs();
	}

	/**
	 * Indexes a single file.
	 *
	 * @param f file to add to the index
	 * @throws Exception if the file cannot be read or the document cannot be added
	 */
	private void indexFile(File f) throws Exception{
		System.out.println("索引文件："+f.getCanonicalPath());
		Document doc=getDocument(f);
		writer.addDocument(doc);
	}

	/**
	 * Builds the document for a file with three fields: {@code contents} (read from the file,
	 * not stored), {@code fileName} and {@code fullPath} (both stored).
	 *
	 * @param f file to describe
	 * @return the populated document
	 * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
	 */
	private Document getDocument(File f)throws Exception {
		Document doc=new Document();
		// NOTE(review): FileReader decodes with the platform default charset; confirm the data files match it
		doc.add(new TextField("contents",new FileReader(f)));
		doc.add(new TextField("fileName", f.getName(),Field.Store.YES));
		doc.add(new TextField("fullPath",f.getCanonicalPath(),Field.Store.YES));
		return doc;
	}

	/**
	 * Indexes the sample data directory and prints how many documents were indexed and how long it took.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		String indexDir="D:\\lucene\\searchindex";
		String dataDir="D:\\lucene\\data";
		Indexer indexer=null;
		int numIndexed=0;
		long start=System.currentTimeMillis();
		try {
			indexer = new Indexer(indexDir);
			numIndexed=indexer.index(dataDir);
		} catch (Exception e) {
			e.printStackTrace();
		}finally{
			// The constructor may have thrown before indexer was assigned; closing null would
			// raise a NullPointerException that hides the original failure
			if(indexer!=null){
				try {
					indexer.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}
		long end=System.currentTimeMillis();
		System.out.println("索引："+numIndexed+" 個文件 花費了"+(end-start)+" 毫秒");
	}
}
複製代碼

讀取索引

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * JUnit fixture that opens the index under {@code D:\lucene\searchindex} before each test
 * and releases it afterwards.
 */
public class SearchTest {

	private Directory dir;
	private IndexReader reader;
	private IndexSearcher is;

	/**
	 * Opens the index directory, a reader over it, and a searcher on that reader.
	 *
	 * @throws Exception if the directory or the index cannot be opened
	 */
	@Before
	public void setUp() throws Exception {
		dir=FSDirectory.open(Paths.get("D:\\lucene\\searchindex"));
		reader=DirectoryReader.open(dir);
		is=new IndexSearcher(reader);
	}

	/**
	 * Closes the reader and then the directory.
	 *
	 * @throws Exception if closing either resource fails
	 */
	@After
	public void tearDown() throws Exception {
		// JUnit runs @After even when @Before threw, so either field may still be null here
		try {
			if(reader!=null){
				reader.close();
			}
		} finally {
			// The directory is closed even if closing the reader failed
			if(dir!=null){
				dir.close();
			}
		}
	}
}
複製代碼

對特定單詞查詢和模糊查詢

/**
 * Runs an exact term query and a fuzzy query against the {@code contents} field and prints,
 * for each, the total hit count followed by the {@code fullPath} of the returned documents.
 *
 * @throws Exception if searching or loading a document fails
 */
	@Test
	public void testTermQuery() throws Exception {
		final String field = "contents";
		// The exact query text has to be a whole word, otherwise nothing is found
		final String word = "authorship";
		final String misspelled = "authorshioo";

		// A Term stands for one word from the text; TermQuery searches for that term
		Query exactQuery = new TermQuery(new Term(field, word));
		// Second argument: maximum number of edits (0, 1 or 2), i.e. how many characters
		// of the query text are allowed to be wrong
		Query fuzzyQuery = new FuzzyQuery(new Term(field, misspelled), 1);

		TopDocs exactHits = is.search(exactQuery, 10);
		// totalHits: in how many documents the given word was found
		System.out.println("匹配 '" + word + "'，總共查詢到" + exactHits.totalHits + "個文檔");
		for (int i = 0; i < exactHits.scoreDocs.length; i++) {
			System.out.println(is.doc(exactHits.scoreDocs[i].doc).get("fullPath"));
		}

		TopDocs fuzzyHits = is.search(fuzzyQuery, 10);
		System.out.println("匹配 '" + misspelled + "'，總共查詢到" + fuzzyHits.totalHits + "個文檔");
		for (int i = 0; i < fuzzyHits.scoreDocs.length; i++) {
			System.out.println(is.doc(fuzzyHits.scoreDocs[i].doc).get("fullPath"));
		}
	}


複製代碼

我們上面查詢了單詞「authorship」以及模糊查詢了單詞「authorshioo」，結果如下：

可以看到只在LICENSE.txt文檔下找到該單詞。

那麼模糊查詢為什麼查不到單詞「authorshioo」呢？這是因為我們在這裡把允許錯誤的字符個數（最大編輯距離）設為1，但是「authorshioo」與索引中的「authorship」相差2個編輯操作（把p替換成o，再多插入一個o），所以就查不到。

Query query2 = new FuzzyQuery(new Term(searchField,"authorshioo"),1);  
複製代碼

解析表達式的使用

/**
 * Parses two query expressions against the {@code contents} field and prints, for each,
 * the total hit count followed by the {@code fullPath} of the returned documents.
 *
 * @throws Exception if parsing, searching or loading a document fails
 */
	@Test
	public void testQueryParser() throws Exception {
		// Standard analyzer, shared by both parsers
		Analyzer standardAnalyzer = new StandardAnalyzer();
		String field = "contents";
		String[] expressions = {
			"atomic a atomicReader",
			"AtomicReader and AtomicReaderContext"
		};

		for (String expression : expressions) {
			// A fresh parser per expression: field to search plus the analyzer instance
			QueryParser queryParser = new QueryParser(field, standardAnalyzer);
			Query parsed = queryParser.parse(expression);
			// Only the top 10 results are returned, even if more documents match
			TopDocs top = is.search(parsed, 10);
			System.out.println("匹配 " + expression + "查詢到" + top.totalHits + "個記錄");
			for (ScoreDoc hit : top.scoreDocs) {
				System.out.println(is.doc(hit.doc).get("fullPath"));
			}
		}
	}
複製代碼

我們上面分別查詢了：「atomic a atomicReader」和「AtomicReader and AtomicReaderContext」，通過查詢結果可以看出即使稍微改變查詢內容，也還是可以查詢到和我們給出的表達式相關的文檔。

三、中文查詢及高亮

寫索引

import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;  
  
/**
 * Writes three sample city documents into a Lucene index, analysing the description
 * text with {@link SmartChineseAnalyzer}.
 */
public class Indexer {

    private final String[] ids={"1","2","3"};
    private final String[] citys={"青島","南京","上海"};
    private final String[] descs={
            "青島是一個漂亮的城市。",
            "南京是一個文化的城市。",
            "上海是一個繁華的城市。"
    };

    /**
     * Creates an index writer over the given directory.
     *
     * @param dir directory that holds the index
     * @param analyzer Chinese analyzer used for the analysed fields
     * @return the new writer; the caller is responsible for closing it
     * @throws Exception if the writer cannot be created
     */
    private IndexWriter getWriter(Directory dir, SmartChineseAnalyzer analyzer)throws Exception{
        IndexWriterConfig iwc=new IndexWriterConfig(analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Adds one document per sample entry to the index at {@code indexDir}.
     *
     * @param indexDir path of the directory that holds the index
     * @throws Exception if the index cannot be opened or a document cannot be added
     */
    private void index(String indexDir)throws Exception{
        // try-with-resources closes the writer, analyzer and directory (in that order)
        // even when addDocument throws, so none of them is left open after a failure
        try (Directory dir=FSDirectory.open(Paths.get(indexDir));
             SmartChineseAnalyzer analyzer=new SmartChineseAnalyzer();
             IndexWriter writer=getWriter(dir, analyzer)) {

            for(int i=0;i<ids.length;i++){
                Document doc=new Document();
                // id and city are StringFields, desc is a TextField; all three are stored
                doc.add(new StringField("id", ids[i], Field.Store.YES));
                doc.add(new StringField("city",citys[i],Field.Store.YES));
                doc.add(new TextField("desc", descs[i], Field.Store.YES));
                writer.addDocument(doc);
            }
        }
    }

    /**
     * Builds the sample index and prints a confirmation.
     *
     * @param args unused
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        new Indexer().index("D:\\lucene\\dataindex2");
        System.out.println("Success Indexer");
    }

}
複製代碼

中文查詢及高亮顯示

import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;  
  
/**
 * Searches the {@code desc} field of an index and prints every hit together with its
 * best fragment, with the matched terms wrapped in HTML highlight tags.
 *
 * @author LXY
 */
public class SearchTest {

    /**
     * Parses {@code par} as a query on the {@code desc} field, runs it and prints the results.
     *
     * @param indexDir path of the directory that holds the index
     * @param par query expression to search for
     * @throws Exception if the index cannot be read, the query cannot be parsed,
     *         or highlighting fails
     */
    public static void search(String indexDir, String par) throws Exception{

        // try-with-resources releases the analyzer, reader and directory even when
        // parsing, searching or highlighting throws
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir);
             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {

            IndexSearcher searcher = new IndexSearcher(reader);

            // First argument: field to search; second argument: analyzer
            QueryParser parser = new QueryParser("desc", analyzer);
            Query query = parser.parse(par);

            // Time only the search call itself
            long start = System.currentTimeMillis();
            // Second argument: maximum number of results to return
            TopDocs topDocs = searcher.search(query, 10);
            long end = System.currentTimeMillis();

            System.out.println("匹配"+par+",總共花費了"+(end-start)+"毫秒,共查到"+topDocs.totalHits+"條記錄。");

            Highlighter highlighter = newHighlighter(query);

            for(ScoreDoc scoreDoc : topDocs.scoreDocs){
                Document document = searcher.doc(scoreDoc.doc);
                String desc = document.get("desc");

                // Print the stored city and description of the hit
                System.out.println(document.get("city"));
                System.out.println(desc);

                if(desc!=null){
                    // Re-analyse the stored text so the highlighter can locate the matched terms
                    TokenStream tokenStream=analyzer.tokenStream("desc", new StringReader(desc));
                    System.out.println(highlighter.getBestFragment(tokenStream, desc));
                }
            }
        }
    }

    /**
     * Builds a highlighter that wraps terms matching {@code query} in bold red HTML tags.
     *
     * @param query query whose terms are highlighted
     * @return the configured highlighter
     */
    private static Highlighter newHighlighter(Query query) {
        // The scorer rates text against the query; the fragmenter uses the same scorer
        QueryScorer scorer=new QueryScorer(query);
        Fragmenter fragmenter=new SimpleSpanFragmenter(scorer);

        // First argument: opening tag placed before a match; second: closing tag placed after it
        SimpleHTMLFormatter simpleHTMLFormatter=new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");

        Highlighter highlighter=new Highlighter(simpleHTMLFormatter, scorer);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter;
    }

    /**
     * Searches the sample index for a fixed query and prints the highlighted results.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        // Location of the index
        String indexDir = "D:\\lucene\\dataindex2";

        // Query text
        String par = "南京";

        try {
            search(indexDir,par);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
複製代碼

結果會把我們查詢的「南京」單詞給高亮顯示，這在我們平時搜索中很常見了。

我們平時搜索中的高亮就像下圖：

歡迎關注我的微信公眾號：「Java面試通關手冊」（堅持原創，分享美文，分享各類Java學習資源、面試題，以及企業級Java實戰項目，回覆關鍵字免費領取）：

Lucene我想暫時先更新到這裡，僅僅靠這三篇文章想掌握Lucene是遠遠不夠的。另外我這三篇文章都用的是最新的jar包，Lucene更新太快，5系列之後的版本和之前的有些地方還是有挺大差距的，比如為文檔域設置權值的setBoost方法在6.6之後已經被廢除了等等。由於時間有限，所以我只是草草地看了一下Lucene的官方文檔，大多數內容還是看java1234網站的這個視頻來學習的，然後在版本和部分代碼上做了改進。截止2018/4/1，上述代碼所用的jar包皆為最新。

最後推薦一下自己覺得還不錯的Lucene學習網站/博客：

官方網站：[Welcome to Apache Lucene](https://lucene.apache.org/)

Github:Apache Lucene and Solr

Lucene專欄

搜索系統18：lucene索引文件結構

Lucene6.6的介紹和使用