全文檢索是計算機程序經過掃描文章中的每個詞,對每個詞創建一個索引,指明該詞在文章中出現的次數和位置。當用戶查詢時根據創建的索引查找,相似於經過字典的檢索字表查字的過程。
全文檢索(Full-Text Retrieval)是指以文本做爲檢索對象,找出含有指定詞彙的文本。全面、準確和快速是衡量全文檢索系統的關鍵指標。
全文檢索特色:
一、只處理文本
二、不處理語義
三、搜索時英文不區分大小寫
四、結果列表有相關度排序。
Lucene 是一個開源的全文檢索框架工具,其核心類如下:
IndexWriter:建立和維護索引庫
Analyzer:在建立索引時會用到分詞器,在使用字符串搜索時也會用到分詞器,這兩個地方要使用同一個分詞器,不然可能會搜索不出結果。Analyzer(分詞器)的做用是把一段文本中的詞按規則取出所包含的全部詞。對應的是Analyzer類,這是一個抽象類,切分詞的具體規則是由子類實現的,因此對於不一樣的語言(規則),要用不一樣的分詞器。
Directory:索引庫的位置
Document:文檔是索引和搜索的單元。一個文檔能夠包含一組Field(字段)
Field:字段,包含名稱和值,字段能夠選擇性的存儲在索引中
IndexSearcher:檢索工具
Query:查詢,lucene中支持模糊查詢,語義查詢,短語查詢,組合查詢等等,若有TermQuery,BooleanQuery,RangeQuery,WildcardQuery等一些類。
QueryParser: 是一個解析用戶輸入的工具,能夠經過掃描用戶輸入的字符串,生成Query對象。
Hits:在搜索完成以後,須要把搜索結果返回並顯示給用戶,只有這樣纔算是完成搜索的目的。在lucene中,搜索的結果的集合是用Hits類的實例來表示的。
package lucenedemo.lucenedemo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

/**
 * Lucene 3.0 demo: builds an index from a text file, then runs a plain
 * term query and a highlighted query against it.
 */
public class LuceneDemo {

    /**
     * Rebuilds the index at D:/lucene from the lines of D:/test.txt.
     * Each line becomes one Document with a single analyzed, stored
     * "line" field.
     *
     * @throws Exception on any I/O or indexing failure
     */
    @Test
    public void buildIndex() throws Exception {
        File path = new File("D:/lucene");
        Directory d = FSDirectory.open(path);
        Analyzer a = new StandardAnalyzer(Version.LUCENE_30);
        // args: index location, analyzer, whether field length is limited
        IndexWriter indexWriter = new IndexWriter(d, a, MaxFieldLength.UNLIMITED);
        try {
            indexWriter.deleteAll();
            BufferedReader reader = new BufferedReader(new FileReader(new File(
                    "D:/test.txt")));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    Document doc = new Document();
                    // Index.ANALYZED is required for the field to be searchable
                    Field field = new Field("line", line, Field.Store.YES,
                            Index.ANALYZED);
                    doc.add(field);
                    indexWriter.addDocument(doc);
                }
            } finally {
                // BUGFIX: close the reader even if indexing throws (the
                // original closed it only on the success path)
                reader.close();
            }
            indexWriter.optimize();
            indexWriter.commit();
        } finally {
            // BUGFIX: always release the index write lock
            indexWriter.close(true);
        }
    }

    /**
     * Parses "Spark" against the "line" field and prints the total hit count.
     *
     * @throws Exception on any I/O, parse, or search failure
     */
    @Test
    public void searchTermQuery() throws Exception {
        File path = new File("D:/lucene");
        Directory d = FSDirectory.open(path);
        IndexSearcher indexSearcher = new IndexSearcher(d);
        try {
            Analyzer a = new StandardAnalyzer(Version.LUCENE_30);
            QueryParser queryParser = new QueryParser(Version.LUCENE_30, "line", a);
            Query query = queryParser.parse("Spark");
            // top 5 results only
            TopDocs result = indexSearcher.search(query, 5);
            System.out.println(result.totalHits);
        } finally {
            // BUGFIX: close the searcher even when parse/search throws
            indexSearcher.close();
        }
    }

    /**
     * Highlighted query: searches "spark" in the "line" field and prints
     * each matching line with the hit wrapped in &lt;B&gt;...&lt;/B&gt;.
     *
     * @throws Exception on any I/O, parse, or search failure
     */
    @Test
    public void searchHL() throws Exception {
        File path = new File("D:/lucene");
        Directory d = FSDirectory.open(path);
        IndexSearcher indexSearcher = new IndexSearcher(d);
        try {
            Analyzer a = new StandardAnalyzer(Version.LUCENE_30);
            String[] fields = { "line" };
            String[] q = { "spark" };
            // BUGFIX: use LUCENE_30 like the rest of the class
            // (the original mixed in Version.LUCENE_CURRENT here)
            Query query = MultiFieldQueryParser.parse(Version.LUCENE_30, q,
                    fields, a);
            // BUGFIX: guard the empty index — TopScoreDocCollector.create
            // rejects a non-positive numHits
            int maxDoc = indexSearcher.maxDoc();
            if (maxDoc == 0) {
                return;
            }
            TopScoreDocCollector topCollector = TopScoreDocCollector.create(
                    maxDoc, false);
            indexSearcher.search(query, topCollector);
            // wrap each hit term in <B>...</B>
            SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter(
                    "<B>", "</B>");
            Highlighter highlighter = new Highlighter(simpleHtmlFormatter,
                    new QueryScorer(query));
            ScoreDoc[] docs = topCollector.topDocs(5).scoreDocs;
            for (ScoreDoc scdoc : docs) {
                Document document = indexSearcher.doc(scdoc.doc);
                TokenStream tokenStream = a.tokenStream("line",
                        new StringReader(document.get("line")));
                String str = highlighter.getBestFragment(tokenStream,
                        document.get("line"));
                System.out.println(str);
            }
        } finally {
            // BUGFIX: the original never closed the searcher in this method
            indexSearcher.close();
        }
    }
}
須要的jar包:
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>3.0.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-memory</artifactId>
    <version>3.0.0</version>
</dependency>