【手把手教你全文檢索】Lucene索引的【增、刪、改、查】

時間 2019-11-19

原文鏈接

前言

　　搞檢索的，應該多少都會了解Lucene一些，它開源並且簡單上手，官方API足夠編寫些小DEMO。而且根據倒排索引，實現快速檢索。本文就簡單的實現增量添加索引，刪除索引，通過關鍵字查詢，以及更新索引等操作。

　　目前博豬使用的不爽的地方就是，讀取文件內容進行全文檢索時，需要自己編寫讀取過程（這個solr免費幫我們實現）。並且建立索引的過程比較慢，還有很大的優化空間，這個就要細心下來研究了。

　　建立索引

　　Lucene建立索引的大致流程，前面一篇博客已經講過了，這裏再簡單說下：

1 Directory directory = FSDirectory.open("/tmp/testindex"); 2 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer); 3 IndexWriter iwriter = new IndexWriter(directory, config); 4 Document doc = new Document(); 5 String text = "This is the text to be indexed."; 6 doc.add(new Field("fieldname", text, TextField.TYPE_STORED)); iwriter.close();

　　1 建立Directory，獲取索引目錄

　　2 建立詞法分析器，建立IndexWriter對象

　　3 建立document對象，存儲數據

　　4 關閉IndexWriter，提交

 1 /**
 2  * 創建索引  3  *  4  * @param args  5      */
 6     public static void index() throws Exception {  7         
 8         String text1 = "hello,man!";  9         String text2 = "goodbye,man!"; 10         String text3 = "hello,woman!"; 11         String text4 = "goodbye,woman!"; 12         
13         Date date1 = new Date(); 14         analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 15         directory = FSDirectory.open(new File(INDEX_DIR)); 16 
17         IndexWriterConfig config = new IndexWriterConfig( 18  Version.LUCENE_CURRENT, analyzer); 19         indexWriter = new IndexWriter(directory, config); 20 
21         Document doc1 = new Document(); 22         doc1.add(new TextField("filename", "text1", Store.YES)); 23         doc1.add(new TextField("content", text1, Store.YES)); 24  indexWriter.addDocument(doc1); 25         
26         Document doc2 = new Document(); 27         doc2.add(new TextField("filename", "text2", Store.YES)); 28         doc2.add(new TextField("content", text2, Store.YES)); 29  indexWriter.addDocument(doc2); 30         
31         Document doc3 = new Document(); 32         doc3.add(new TextField("filename", "text3", Store.YES)); 33         doc3.add(new TextField("content", text3, Store.YES)); 34  indexWriter.addDocument(doc3); 35         
36         Document doc4 = new Document(); 37         doc4.add(new TextField("filename", "text4", Store.YES)); 38         doc4.add(new TextField("content", text4, Store.YES)); 39  indexWriter.addDocument(doc4); 40         
41  indexWriter.commit(); 42  indexWriter.close(); 43 
44         Date date2 = new Date(); 45         System.out.println("建立索引耗時：" + (date2.getTime() - date1.getTime()) + "ms\n"); 46     }

　　增量添加索引

　　Lucene擁有增量添加索引的功能，可以在不影響之前索引的情況下添加索引，它會在適當的時機自動合併索引文件。

 1 /**
 2  * 增長索引  3  *  4  * @throws Exception  5      */
 6     public static void insert() throws Exception {  7         String text5 = "hello,goodbye,man,woman";  8         Date date1 = new Date();  9         analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 10         directory = FSDirectory.open(new File(INDEX_DIR)); 11 
12         IndexWriterConfig config = new IndexWriterConfig( 13  Version.LUCENE_CURRENT, analyzer); 14         indexWriter = new IndexWriter(directory, config); 15 
16         Document doc1 = new Document(); 17         doc1.add(new TextField("filename", "text5", Store.YES)); 18         doc1.add(new TextField("content", text5, Store.YES)); 19  indexWriter.addDocument(doc1); 20 
21  indexWriter.commit(); 22  indexWriter.close(); 23 
24         Date date2 = new Date(); 25         System.out.println("增長索引耗時：" + (date2.getTime() - date1.getTime()) + "ms\n"); 26     }

　　刪除索引

　　Lucene也是通過IndexWriter調用它的delete方法，來刪除索引。我們可以通過關鍵字，刪除與這個關鍵字有關的所有內容。如果僅僅是想要刪除一個文檔，那麼最好就定義一個唯一的ID域，通過這個ID域，來進行刪除操作。

 1 /**
 2  * 刪除索引  3  *  4  * @param str 刪除的關鍵字  5  * @throws Exception  6      */
 7     public static void delete(String str) throws Exception {  8         Date date1 = new Date();  9         analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 10         directory = FSDirectory.open(new File(INDEX_DIR)); 11 
12         IndexWriterConfig config = new IndexWriterConfig( 13  Version.LUCENE_CURRENT, analyzer); 14         indexWriter = new IndexWriter(directory, config); 15         
16         indexWriter.deleteDocuments(new Term("filename",str)); 17         
18  indexWriter.close(); 19         
20         Date date2 = new Date(); 21         System.out.println("刪除索引耗時：" + (date2.getTime() - date1.getTime()) + "ms\n"); 22     }

　　更新索引

　　Lucene沒有真正的更新操作，通過某個fieldname，可以更新這個域對應的索引，但是實質上，它是先刪除索引，再重新建立的。

 1 /**
 2  * 更新索引  3  *  4  * @throws Exception  5      */
 6     public static void update() throws Exception {  7         String text1 = "update,hello,man!";  8         Date date1 = new Date();  9          analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 10          directory = FSDirectory.open(new File(INDEX_DIR)); 11 
12          IndexWriterConfig config = new IndexWriterConfig( 13  Version.LUCENE_CURRENT, analyzer); 14          indexWriter = new IndexWriter(directory, config); 15          
16          Document doc1 = new Document(); 17         doc1.add(new TextField("filename", "text1", Store.YES)); 18         doc1.add(new TextField("content", text1, Store.YES)); 19         
20         indexWriter.updateDocument(new Term("filename","text1"), doc1); 21         
22  indexWriter.close(); 23          
24          Date date2 = new Date(); 25          System.out.println("更新索引耗時：" + (date2.getTime() - date1.getTime()) + "ms\n"); 26     }

　　經過索引查詢關鍵字

　　Lucene的查詢方式有很多種，這裏就不做詳細介紹了。它會返回一個ScoreDoc的集合，類似ResultSet的集合，我們可以通過域名獲取想要獲取的內容。

 1 /**
 2  * 關鍵字查詢  3  *  4  * @param str  5  * @throws Exception  6      */
 7     public static void search(String str) throws Exception {  8         directory = FSDirectory.open(new File(INDEX_DIR));  9         analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 10         DirectoryReader ireader = DirectoryReader.open(directory); 11         IndexSearcher isearcher = new IndexSearcher(ireader); 12 
13         QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "content",analyzer); 14         Query query = parser.parse(str); 15 
16         ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; 17         for (int i = 0; i < hits.length; i++) { 18             Document hitDoc = isearcher.doc(hits[i].doc); 19             System.out.println(hitDoc.get("filename")); 20             System.out.println(hitDoc.get("content")); 21  } 22  ireader.close(); 23  directory.close(); 24     }

　　所有代碼

 1 package test;  2 
 3 import java.io.File;  4 import java.util.Date;  5 import java.util.List;  6 
 7 import org.apache.lucene.analysis.Analyzer;  8 import org.apache.lucene.analysis.standard.StandardAnalyzer;  9 import org.apache.lucene.document.Document;  10 import org.apache.lucene.document.LongField;  11 import org.apache.lucene.document.TextField;  12 import org.apache.lucene.document.Field.Store;  13 import org.apache.lucene.index.DirectoryReader;  14 import org.apache.lucene.index.IndexWriter;  15 import org.apache.lucene.index.IndexWriterConfig;  16 import org.apache.lucene.index.Term;  17 import org.apache.lucene.queryparser.classic.QueryParser;  18 import org.apache.lucene.search.IndexSearcher;  19 import org.apache.lucene.search.Query;  20 import org.apache.lucene.search.ScoreDoc;  21 import org.apache.lucene.store.Directory;  22 import org.apache.lucene.store.FSDirectory;  23 import org.apache.lucene.util.Version;  24 
 25 public class TestLucene {  26     // 保存路徑
 27     private static String INDEX_DIR = "D:\\luceneIndex";  28     private static Analyzer analyzer = null;  29     private static Directory directory = null;  30     private static IndexWriter indexWriter = null;  31 
 32     public static void main(String[] args) {  33         try {  34 // index();
 35             search("man");  36 // insert();  37 // delete("text5");  38 // update();
 39         } catch (Exception e) {  40  e.printStackTrace();  41  }  42  }  43     /**
 44  * 更新索引  45  *  46  * @throws Exception  47      */
 48     public static void update() throws Exception {  49         String text1 = "update,hello,man!";  50         Date date1 = new Date();  51          analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);  52          directory = FSDirectory.open(new File(INDEX_DIR));  53 
 54          IndexWriterConfig config = new IndexWriterConfig(  55  Version.LUCENE_CURRENT, analyzer);  56          indexWriter = new IndexWriter(directory, config);  57          
 58          Document doc1 = new Document();  59         doc1.add(new TextField("filename", "text1", Store.YES));  60         doc1.add(new TextField("content", text1, Store.YES));  61         
 62         indexWriter.updateDocument(new Term("filename","text1"), doc1);  63         
 64  indexWriter.close();  65          
 66          Date date2 = new Date();  67          System.out.println("更新索引耗時：" + (date2.getTime() - date1.getTime()) + "ms\n");  68  }  69     /**
 70  * 刪除索引  71  *  72  * @param str 刪除的關鍵字  73  * @throws Exception  74      */
 75     public static void delete(String str) throws Exception {  76         Date date1 = new Date();  77         analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);  78         directory = FSDirectory.open(new File(INDEX_DIR));  79 
 80         IndexWriterConfig config = new IndexWriterConfig(  81  Version.LUCENE_CURRENT, analyzer);  82         indexWriter = new IndexWriter(directory, config);  83         
 84         indexWriter.deleteDocuments(new Term("filename",str));  85         
 86  indexWriter.close();  87         
 88         Date date2 = new Date();  89         System.out.println("刪除索引耗時：" + (date2.getTime() - date1.getTime()) + "ms\n");  90  }  91     /**
 92  * 增長索引  93  *  94  * @throws Exception  95      */
 96     public static void insert() throws Exception {  97         String text5 = "hello,goodbye,man,woman";  98         Date date1 = new Date();  99         analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 100         directory = FSDirectory.open(new File(INDEX_DIR)); 101 
102         IndexWriterConfig config = new IndexWriterConfig( 103  Version.LUCENE_CURRENT, analyzer); 104         indexWriter = new IndexWriter(directory, config); 105 
106         Document doc1 = new Document(); 107         doc1.add(new TextField("filename", "text5", Store.YES)); 108         doc1.add(new TextField("content", text5, Store.YES)); 109  indexWriter.addDocument(doc1); 110 
111  indexWriter.commit(); 112  indexWriter.close(); 113 
114         Date date2 = new Date(); 115         System.out.println("增長索引耗時：" + (date2.getTime() - date1.getTime()) + "ms\n"); 116  } 117     /**
118  * 創建索引 119  * 120  * @param args 121      */
122     public static void index() throws Exception { 123         
124         String text1 = "hello,man!"; 125         String text2 = "goodbye,man!"; 126         String text3 = "hello,woman!"; 127         String text4 = "goodbye,woman!"; 128         
129         Date date1 = new Date(); 130         analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 131         directory = FSDirectory.open(new File(INDEX_DIR)); 132 
133         IndexWriterConfig config = new IndexWriterConfig( 134  Version.LUCENE_CURRENT, analyzer); 135         indexWriter = new IndexWriter(directory, config); 136 
137         Document doc1 = new Document(); 138         doc1.add(new TextField("filename", "text1", Store.YES)); 139         doc1.add(new TextField("content", text1, Store.YES)); 140  indexWriter.addDocument(doc1); 141         
142         Document doc2 = new Document(); 143         doc2.add(new TextField("filename", "text2", Store.YES)); 144         doc2.add(new TextField("content", text2, Store.YES)); 145  indexWriter.addDocument(doc2); 146         
147         Document doc3 = new Document(); 148         doc3.add(new TextField("filename", "text3", Store.YES)); 149         doc3.add(new TextField("content", text3, Store.YES)); 150  indexWriter.addDocument(doc3); 151         
152         Document doc4 = new Document(); 153         doc4.add(new TextField("filename", "text4", Store.YES)); 154         doc4.add(new TextField("content", text4, Store.YES)); 155  indexWriter.addDocument(doc4); 156         
157  indexWriter.commit(); 158  indexWriter.close(); 159 
160         Date date2 = new Date(); 161         System.out.println("建立索引耗時：" + (date2.getTime() - date1.getTime()) + "ms\n"); 162  } 163 
164     /**
165  * 關鍵字查詢 166  * 167  * @param str 168  * @throws Exception 169      */
170     public static void search(String str) throws Exception { 171         directory = FSDirectory.open(new File(INDEX_DIR)); 172         analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 173         DirectoryReader ireader = DirectoryReader.open(directory); 174         IndexSearcher isearcher = new IndexSearcher(ireader); 175 
176         QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "content",analyzer); 177         Query query = parser.parse(str); 178 
179         ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; 180         for (int i = 0; i < hits.length; i++) { 181             Document hitDoc = isearcher.doc(hits[i].doc); 182             System.out.println(hitDoc.get("filename")); 183             System.out.println(hitDoc.get("content")); 184  } 185  ireader.close(); 186  directory.close(); 187  } 188 }