lucene索引

時間 2019-11-18

標籤 lucene 索引简体版

原文原文鏈接

1、Lucene索引

一、文檔層次結構

索引（Index）：一個索引放在一個文件夾中；
段（Segment）：一個索引中可以有很多段，段與段之間是獨立的，添加新的文檔可能產生新段，不同的段可以合併成一個新段；
文檔（Document）：文檔是建立索引的基本單位，不同的文檔保存在不同的段中，一個段可以包含多個文檔；
域（Field）：一個文檔包含不同類型的信息，可以拆分開來分別索引；
詞（Term）：詞是索引的最小單位，是經過詞法分析和語言處理後的數據；

　　文檔是Lucene索引和搜索的原子單位，文檔爲包含一個或多個域的容器，而域則依次包含「真正的」被搜索內容，域值經過分詞技術處理，得到多個詞元。例如一篇小說的信息可以稱爲一個文檔；小說信息又包含多個域，比如標題、作者、簡介、最後更新時間等；對標題這一個域採用分詞技術，又可以得到一個或多個詞元。

二、正向索引與反向索引

正向索引：文檔佔據了中心的位置，每一個文檔指向了一個它所包含的索引項的序列。正向信息就是按層次保存了從索引一直到詞的包含關係：索引 -> 段 -> 文檔 -> 域 -> 詞
反向索引：一種以索引項爲中心來組織文檔的方式，每一個索引項指向一個文檔序列，這個序列中的文檔都包含該索引項。反向信息保存了詞典的倒排表映射：詞 -> 文檔

　　Lucene使用的就是反向索引，如下圖所示：

2、索引操作對象

　　相關示例如下：

  1 package com.test.lucene;
  2 
  3 import java.io.IOException;
  4 import java.nio.file.Paths;
  5 
  6 import org.apache.lucene.analysis.Analyzer;
  7 import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
  8 import org.apache.lucene.document.Document;
  9 import org.apache.lucene.document.Field.Store;
 10 import org.apache.lucene.document.IntField;
 11 import org.apache.lucene.document.StringField;
 12 import org.apache.lucene.document.TextField;
 13 import org.apache.lucene.index.DirectoryReader;
 14 import org.apache.lucene.index.IndexWriter;
 15 import org.apache.lucene.index.IndexWriterConfig;
 16 import org.apache.lucene.index.Term;
 17 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 18 import org.apache.lucene.queryparser.classic.ParseException;
 19 import org.apache.lucene.queryparser.classic.QueryParser;
 20 import org.apache.lucene.search.IndexSearcher;
 21 import org.apache.lucene.search.Query;
 22 import org.apache.lucene.search.TopDocs;
 23 import org.apache.lucene.store.Directory;
 24 import org.apache.lucene.store.FSDirectory;
 25 
 26 /**
 27  * 索引增刪改查
 28  */
 29 public class IndexTest {
 30     /**
 31      * 建立索引
 32      * 
 33      * @param path
 34      *            索引存放路徑
 35      */
 36     public static void create(String path) {
 37         System.out.println("建立開始=============================》");
 38         Analyzer analyzer = new SmartChineseAnalyzer();// 指定分詞技術，這裏使用的是中文分詞
 39 
 40         IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);// indexWriter的配置信息
 41 
 42         indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); // 索引的打開方式：沒有則建立，有則打開
 43 
 44         Directory directory = null;
 45         IndexWriter indexWriter = null;
 46         // 文檔一
 47         Document doc1 = new Document();
 48         doc1.add(new StringField("id", "1111", Store.YES));
 49         doc1.add(new TextField("content", "中國廣州", Store.YES));
 50         doc1.add(new IntField("num", 1, Store.YES));
 51 
 52         // 文檔二
 53         Document doc2 = new Document();
 54         doc2.add(new StringField("id", "2222", Store.YES));
 55         doc2.add(new TextField("content", "中國上海", Store.YES));
 56         doc2.add(new IntField("num", 2, Store.YES));
 57 
 58         try {
 59             directory = FSDirectory.open(Paths.get(path));// 索引在硬盤上的存儲路徑
 60             indexWriter = new IndexWriter(directory, indexWriterConfig);
 61             indexWriter.addDocument(doc1);
 62             indexWriter.addDocument(doc2);
 63             // 將indexWrite操做提交，若是不提交，以前的操做將不會保存到硬盤
 64             // 可是這一步很消耗系統資源，索引執行該操做須要有必定的策略
 65             indexWriter.commit();
 66         } catch (IOException e) {
 67             e.printStackTrace();
 68         } finally { // 關閉資源
 69             try {
 70                 indexWriter.close();
 71                 directory.close();
 72             } catch (IOException e) {
 73                 e.printStackTrace();
 74             }
 75         }
 76         System.out.println("建立索引完成=================================");
 77     }
 78 
 79     /**
 80      * 添加索引
 81      * 
 82      * @param path
 83      *            索引存放路徑
 84      * @param document
 85      *            添加的文檔
 86      */
 87     public static void add(String path, Document document) {
 88         System.out.println("增長索引開始=============================》");
 89         Analyzer analyzer = new SmartChineseAnalyzer();// 指定分詞技術，這裏使用的是中文分詞
 90 
 91         IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);// indexWriter的配置信息
 92 
 93         indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); // 索引的打開方式：沒有則建立，有則打開
 94         Directory directory = null;
 95         IndexWriter indexWriter = null;
 96         try {
 97             directory = FSDirectory.open(Paths.get(path));// 索引在硬盤上的存儲路徑
 98 
 99             indexWriter = new IndexWriter(directory, indexWriterConfig);
100             indexWriter.addDocument(document);
101             indexWriter.commit();
102         } catch (IOException e) {
103             e.printStackTrace();
104         } finally { // 關閉資源
105             try {
106                 indexWriter.close();
107                 directory.close();
108             } catch (IOException e) {
109                 e.printStackTrace();
110             }
111         }
112         System.out.println("增長索引完成=================================");
113     }
114 
115     /**
116      * 刪除索引
117      * 
118      * @param indexpath
119      *            索引存放路徑
120      * @param id
121      *            文檔id
122      */
123     public static void delete(String indexpath, String id) {
124         System.out.println("刪除索引開始=============================》");
125         Analyzer analyzer = new SmartChineseAnalyzer();// 指定分詞技術，這裏使用的是中文分詞
126 
127         IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);// indexWriter的配置信息
128 
129         indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); // 索引的打開方式：沒有則建立，有則打開
130         IndexWriter indexWriter = null;
131         Directory directory = null;
132         try {
133             directory = FSDirectory.open(Paths.get(indexpath));// 索引在硬盤上的存儲路徑
134             indexWriter = new IndexWriter(directory, indexWriterConfig);
135             indexWriter.deleteDocuments(new Term("id", id));// 刪除索引操做
136         } catch (IOException e) {
137             e.printStackTrace();
138         } finally { // 關閉資源
139             try {
140                 indexWriter.close();
141                 directory.close();
142             } catch (IOException e) {
143                 e.printStackTrace();
144             }
145         }
146         System.out.println("刪除索引完成=================================");
147     }
148 
149     /**
150      * Lucene沒有真正的更新操做，經過某個fieldname，能夠更新這個域對應的索引，可是實質上，它是先刪除索引，再從新創建的。
151      * 
152      * @param indexpath
153      *            索引存放路徑
154      * @param newDoc
155      *            更新後的文檔
156      * @param oldDoc
157      *            須要更新的目標文檔
158      */
159     public static void update(String indexpath, Document newDoc, Document oldDoc) {
160         System.out.println("更新索引開始=============================》");
161         Analyzer analyzer = new SmartChineseAnalyzer();
162         IndexWriterConfig config = new IndexWriterConfig(analyzer);
163         config.setOpenMode(OpenMode.CREATE_OR_APPEND);
164         IndexWriter indexWriter = null;
165         Directory directory = null;
166         try {
167             directory = FSDirectory.open(Paths.get(indexpath));
168             indexWriter = new IndexWriter(directory, config);
169             indexWriter.updateDocument(new Term("id", oldDoc.get("id")), newDoc);
170         } catch (IOException e) {
171             e.printStackTrace();
172         } finally { // 關閉資源
173             try {
174                 indexWriter.close();
175                 directory.close();
176             } catch (IOException e) {
177                 e.printStackTrace();
178             }
179         }
180         System.out.println("更新索引完成=================================");
181     }
182 
183     /**
184      * 搜索
185      * 
186      * @param keyword
187      *            關鍵字
188      * @param indexpath
189      *            索引存放路徑
190      */
191     public static void search(String keyword, String indexpath) {
192         Directory directory = null;
193         try {
194             directory = FSDirectory.open(Paths.get(indexpath));// 索引硬盤存儲路徑
195 
196             DirectoryReader directoryReader = DirectoryReader.open(directory);// 讀取索引
197 
198             IndexSearcher searcher = new IndexSearcher(directoryReader);// 建立索引檢索對象
199 
200             Analyzer analyzer = new SmartChineseAnalyzer();// 分詞技術
201 
202             QueryParser parser = new QueryParser("content", analyzer);// 建立Query
203             Query query = parser.parse(keyword);// 查詢content爲廣州的
204             // 檢索索引，獲取符合條件的前10條記錄
205             TopDocs topDocs = searcher.search(query, 10);
206             if (topDocs != null) {
207                 System.out.println("符合條件的記錄爲： " + topDocs.totalHits);
208                 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
209                     Document doc = searcher.doc(topDocs.scoreDocs[i].doc);
210                     System.out.println("id = " + doc.get("id"));
211                     System.out.println("content = " + doc.get("content"));
212                     System.out.println("num = " + doc.get("num"));
213                 }
214             }
215             directory.close();
216             directoryReader.close();
217         } catch (IOException e) {
218             e.printStackTrace();
219         } catch (ParseException e) {
220             e.printStackTrace();
221         }
222     }
223     /**
224      * 測試代碼
225      * @param args
226      */
227     public static void main(String[] args) {
228         String indexpath = "D://index/test";
229         create(indexpath);// 建立索引
230         search("廣州", indexpath);
231         Document doc = new Document();
232         doc.add(new StringField("id", "3333", Store.YES));
233         doc.add(new TextField("content", "中國北京廣州", Store.YES));
234         doc.add(new IntField("num", 2, Store.YES));
235         add(indexpath, doc);// 添加索引
236         search("廣州", indexpath);
237         Document newDoc = new Document();
238         newDoc.add(new StringField("id", "3333", Store.YES));
239         newDoc.add(new TextField("content", "中國北京廣州個人頂頂頂頂頂頂頂頂頂頂頂頂", Store.YES));
240         newDoc.add(new IntField("num", 3, Store.YES));
241         update(indexpath, newDoc, doc);// 更新索引
242         search("廣州", indexpath);
243         delete(indexpath, "3333");// 刪除索引
244         search("廣州", indexpath);
245     }
246 }

　　運行結果如下：

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。