lucene相關

時間 2019-12-10

標籤 lucene 相關简体版

原文原文鏈接

lucene相關：

應用領域：

互聯網全文檢索引擎(好比百度, 谷歌, 必應)
站內全文檢索引擎(淘寶, 京東搜索功能)
優化數據庫查詢(由於數據庫中使用like關鍵字是全表掃描也就是順序掃描算法,查詢慢)

lucene：全文檢索工具包。全文檢索就是先創建索引，再對索引進行搜索的過程。

Lucene下載

官方網站：http://lucene.apache.org/

版本：lucene 4.10.3

Jdk要求：1.7以上

域的各類類型：

lucene的使用：

一、導入jar包：

二、這裏咱們使用的是IKAnalyzer分詞器，導入相關配置：

三、代碼：

1：新建 IndexManagerTest 類，主要用於新增、刪除、修改索引：

  1 package com.dengwei.lucene;
  2 
  3 import org.apache.commons.io.FileUtils;
  4 import org.apache.lucene.analysis.Analyzer;
  5 import org.apache.lucene.document.Document;
  6 import org.apache.lucene.document.Field.Store;
  7 import org.apache.lucene.document.LongField;
  8 import org.apache.lucene.document.TextField;
  9 import org.apache.lucene.index.IndexWriter;
 10 import org.apache.lucene.index.IndexWriterConfig;
 11 import org.apache.lucene.index.Term;
 12 import org.apache.lucene.store.Directory;
 13 import org.apache.lucene.store.FSDirectory;
 14 import org.apache.lucene.util.Version;
 15 import org.junit.Test;
 16 import org.wltea.analyzer.lucene.IKAnalyzer;
 17 
 18 import java.io.File;
 19 import java.util.ArrayList;
 20 import java.util.List;
 21 
 22 public class IndexManagerTest {
 23 
 24     /*
 25      * testIndexCreate();建立索引，數據來源於txt文件
 26      */
 27     @Test
 28     public void testIndexCreate() throws Exception{
 29         //建立文檔列表,保存多個Docuemnt
 30         List<Document> docList = new ArrayList<Document>();
 31         
 32         //指定txt文件所在目錄(須要創建索引的文件)
 33         File dir = new File("E:\\searchsource");
 34         //循環文件夾取出文件
 35         for(File file : dir.listFiles()){
 36             //文件名稱
 37             String fileName = file.getName();
 38             //文件內容
 39             String fileContext = FileUtils.readFileToString(file);
 40             //文件大小
 41             Long fileSize = FileUtils.sizeOf(file);
 42             
 43             //文檔對象,文件系統中的一個文件就是一個Docuemnt對象
 44             Document doc = new Document();
 45             
 46             //第一個參數:域名
 47             //第二個參數:域值
 48             //第三個參數:是否存儲,是爲yes,不存儲爲no
 49             /*TextField nameFiled = new TextField("fileName", fileName, Store.YES);
 50             TextField contextFiled = new TextField("fileContext", fileContext, Store.YES);
 51             TextField sizeFiled = new TextField("fileSize", fileSize.toString(), Store.YES);*/
 52             
 53             //是否分詞:要,由於它要索引,而且它不是一個總體,分詞有意義
 54             //是否索引:要,由於要經過它來進行搜索
 55             //是否存儲:要,由於要直接在頁面上顯示
 56             TextField nameFiled = new TextField("fileName", fileName, Store.YES);
 57             
 58             //是否分詞: 要,由於要根據內容進行搜索,而且它分詞有意義
 59             //是否索引: 要,由於要根據它進行搜索
 60             //是否存儲: 能夠要也能夠不要,不存儲搜索完內容就提取不出來
 61             TextField contextFiled = new TextField("fileContext", fileContext, Store.NO);
 62             
 63             //是否分詞: 要, 由於數字要對比,搜索文檔的時候能夠搜大小, lunene內部對數字進行了分詞算法
 64             //是否索引: 要, 由於要根據大小進行搜索
 65             //是否存儲: 要, 由於要顯示文檔大小
 66             LongField sizeFiled = new LongField("fileSize", fileSize, Store.YES);
 67             
 68             //將全部的域都存入文檔中
 69             doc.add(nameFiled);
 70             doc.add(contextFiled);
 71             doc.add(sizeFiled);
 72             
 73             //將文檔存入文檔集合中
 74             docList.add(doc);
 75         }
 76         
 77         //建立分詞器,StandardAnalyzer標準分詞器,標準分詞器對英文分詞效果很好,對中文是單字分詞
 78         Analyzer analyzer = new IKAnalyzer();
 79         //指定索引和文檔存儲的目錄
 80         Directory directory = FSDirectory.open(new File("E:\\dic"));
 81         //建立寫對象的初始化對象
 82         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_3, analyzer);
 83         //建立索引和文檔寫對象
 84         IndexWriter indexWriter = new IndexWriter(directory, config);
 85         
 86         //將文檔加入到索引和文檔的寫對象中
 87         for(Document doc : docList){
 88             indexWriter.addDocument(doc);
 89         }
 90         //提交
 91         indexWriter.commit();
 92         //關閉流
 93         indexWriter.close();
 94     }
 95 
 96     /*
 97      * testIndexDel();刪除全部和根據詞源進行刪除。
 98      */
 99     @Test
100     public void testIndexDel() throws Exception{
101         //建立分詞器,StandardAnalyzer標準分詞器,標準分詞器對英文分詞效果很好,對中文是單字分詞
102         Analyzer analyzer = new IKAnalyzer();
103         //指定索引和文檔存儲的目錄
104         Directory directory = FSDirectory.open(new File("E:\\dic"));
105         //建立寫對象的初始化對象
106         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_3, analyzer);
107         //建立索引和文檔寫對象
108         IndexWriter indexWriter = new IndexWriter(directory, config);
109         
110         //刪除全部
111         //indexWriter.deleteAll();
112         
113         //根據名稱進行刪除
114         //Term詞元,就是一個詞, 第一個參數:域名, 第二個參數:要刪除含有此關鍵詞的數據
115         indexWriter.deleteDocuments(new Term("fileName", "apache"));
116         
117         //提交
118         indexWriter.commit();
119         //關閉
120         indexWriter.close();
121     }
122     
123     /**
124      * testIndexUpdate();更新：
125      * 更新就是按照傳入的Term進行搜索,若是找到結果那麼刪除,將更新的內容從新生成一個Document對象
126      * 若是沒有搜索到結果,那麼將更新的內容直接添加一個新的Document對象
127      * @throws Exception
128      */
129     @Test
130     public void testIndexUpdate() throws Exception{
131         //建立分詞器,StandardAnalyzer標準分詞器,標準分詞器對英文分詞效果很好,對中文是單字分詞
132         Analyzer analyzer = new IKAnalyzer();
133         //指定索引和文檔存儲的目錄
134         Directory directory = FSDirectory.open(new File("E:\\dic"));
135         //建立寫對象的初始化對象
136         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_3, analyzer);
137         //建立索引和文檔寫對象
138         IndexWriter indexWriter = new IndexWriter(directory, config);
139         
140         
141         //根據文件名稱進行更新
142         Term term = new Term("fileName", "web");
143         //更新的對象
144         Document doc = new Document();
145         doc.add(new TextField("fileName", "xxxxxx", Store.YES));
146         doc.add(new TextField("fileContext", "think in java xxxxxxx", Store.NO));
147         doc.add(new LongField("fileSize", 100L, Store.YES));
148         
149         //更新
150         indexWriter.updateDocument(term, doc);
151         
152         //提交
153         indexWriter.commit();
154         //關閉
155         indexWriter.close();
156     }
157 }

2：新建 IndexSearchTest 類，主要用於搜索索引：

  1 package com.dengwei.lucene;
  2 
  3 import java.io.File;
  4 
  5 import org.apache.lucene.analysis.Analyzer;
  6 import org.apache.lucene.analysis.standard.StandardAnalyzer;
  7 import org.apache.lucene.document.Document;
  8 import org.apache.lucene.index.IndexReader;
  9 import org.apache.lucene.index.Term;
 10 import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
 11 import org.apache.lucene.queryparser.classic.QueryParser;
 12 import org.apache.lucene.search.BooleanClause.Occur;
 13 import org.apache.lucene.search.BooleanQuery;
 14 import org.apache.lucene.search.IndexSearcher;
 15 import org.apache.lucene.search.MatchAllDocsQuery;
 16 import org.apache.lucene.search.NumericRangeQuery;
 17 import org.apache.lucene.search.Query;
 18 import org.apache.lucene.search.ScoreDoc;
 19 import org.apache.lucene.search.TermQuery;
 20 import org.apache.lucene.search.TopDocs;
 21 import org.apache.lucene.store.Directory;
 22 import org.apache.lucene.store.FSDirectory;
 23 import org.junit.Test;
 24 import org.wltea.analyzer.lucene.IKAnalyzer;
 25 
 26 public class IndexSearchTest {
 27     /*
 28      *testIndexSearch()
 29      * 根據關鍵字搜索，並指定默認域
 30      */
 31     @Test
 32     public void testIndexSearch() throws Exception{
 33         
 34         //建立分詞器(建立索引和全部時所用的分詞器必須一致)
 35         Analyzer analyzer = new IKAnalyzer();
 36         //建立查詢對象,第一個參數:默認搜索域, 第二個參數:分詞器
 37         //默認搜索域做用:若是搜索語法中指定域名從指定域中搜索,若是搜索時只寫了查詢關鍵字,則從默認搜索域中進行搜索
 38         QueryParser queryParser = new QueryParser("fileContext", analyzer);
 39         //查詢語法=域名:搜索的關鍵字
 40         Query query = queryParser.parse("fileName:web");
 41         
 42         //指定索引和文檔的目錄
 43         Directory dir = FSDirectory.open(new File("E:\\dic"));
 44         //索引和文檔的讀取對象
 45         IndexReader indexReader = IndexReader.open(dir);
 46         //建立索引的搜索對象
 47         IndexSearcher indexSearcher = new IndexSearcher(indexReader);
 48         //搜索:第一個參數爲查詢語句對象, 第二個參數:指定顯示多少條
 49         TopDocs topdocs = indexSearcher.search(query, 5);
 50         //一共搜索到多少條記錄
 51         System.out.println("=====count=====" + topdocs.totalHits);
 52         //從搜索結果對象中獲取結果集
 53         ScoreDoc[] scoreDocs = topdocs.scoreDocs;
 54         
 55         for(ScoreDoc scoreDoc : scoreDocs){
 56             //獲取docID
 57             int docID = scoreDoc.doc;
 58             //經過文檔ID從硬盤中讀取出對應的文檔
 59             Document document = indexReader.document(docID);
 60             //get域名能夠取出值 打印
 61             System.out.println("fileName:" + document.get("fileName"));
 62             System.out.println("fileSize:" + document.get("fileSize"));
 63             System.out.println("============================================================");
 64         }
 65         
 66     }
 67 
 68     /*
 69      *testIndexTermQuery()
 70      * 根據關鍵字進行搜索，須要指定域名，和上面的對比起來，更推薦
 71      * 使用上面的（能夠指定默認域名的）
 72      */
 73     
 74     @Test
 75     public void testIndexTermQuery() throws Exception{
 76         //建立分詞器(建立索引和全部時所用的分詞器必須一致)
 77         Analyzer analyzer = new IKAnalyzer();
 78         //建立詞元:就是詞,   
 79         Term term = new Term("fileName", "apache");
 80         //使用TermQuery查詢,根據term對象進行查詢
 81         TermQuery termQuery = new TermQuery(term);
 82         
 83         
 84         //指定索引和文檔的目錄
 85         Directory dir = FSDirectory.open(new File("E:\\dic"));
 86         //索引和文檔的讀取對象
 87         IndexReader indexReader = IndexReader.open(dir);
 88         //建立索引的搜索對象
 89         IndexSearcher indexSearcher = new IndexSearcher(indexReader);
 90         //搜索:第一個參數爲查詢語句對象, 第二個參數:指定顯示多少條
 91         TopDocs topdocs = indexSearcher.search(termQuery, 5);
 92         //一共搜索到多少條記錄
 93         System.out.println("=====count=====" + topdocs.totalHits);
 94         //從搜索結果對象中獲取結果集
 95         ScoreDoc[] scoreDocs = topdocs.scoreDocs;
 96         
 97         for(ScoreDoc scoreDoc : scoreDocs){
 98             //獲取docID
 99             int docID = scoreDoc.doc;
100             //經過文檔ID從硬盤中讀取出對應的文檔
101             Document document = indexReader.document(docID);
102             //get域名能夠取出值 打印
103             System.out.println("fileName:" + document.get("fileName"));
104             System.out.println("fileSize:" + document.get("fileSize"));
105             System.out.println("============================================================");
106         }
107     }
108 
109     /*
110      *testNumericRangeQuery();
111      * 用於搜索價格、大小等數值區間
112      */
113     @Test
114     public void testNumericRangeQuery() throws Exception{
115         //建立分詞器(建立索引和全部時所用的分詞器必須一致)
116         Analyzer analyzer = new IKAnalyzer();
117         
118         //根據數字範圍查詢
119         //查詢文件大小,大於100 小於1000的文章
120         //第一個參數:域名      第二個參數:最小值,  第三個參數:最大值, 第四個參數:是否包含最小值,   第五個參數:是否包含最大值
121         Query query = NumericRangeQuery.newLongRange("fileSize", 100L, 1000L, true, true);        
122         
123         //指定索引和文檔的目錄
124         Directory dir = FSDirectory.open(new File("E:\\dic"));
125         //索引和文檔的讀取對象
126         IndexReader indexReader = IndexReader.open(dir);
127         //建立索引的搜索對象
128         IndexSearcher indexSearcher = new IndexSearcher(indexReader);
129         //搜索:第一個參數爲查詢語句對象, 第二個參數:指定顯示多少條
130         TopDocs topdocs = indexSearcher.search(query, 5);
131         //一共搜索到多少條記錄
132         System.out.println("=====count=====" + topdocs.totalHits);
133         //從搜索結果對象中獲取結果集
134         ScoreDoc[] scoreDocs = topdocs.scoreDocs;
135         
136         for(ScoreDoc scoreDoc : scoreDocs){
137             //獲取docID
138             int docID = scoreDoc.doc;
139             //經過文檔ID從硬盤中讀取出對應的文檔
140             Document document = indexReader.document(docID);
141             //get域名能夠取出值 打印
142             System.out.println("fileName:" + document.get("fileName"));
143             System.out.println("fileSize:" + document.get("fileSize"));
144             System.out.println("============================================================");
145         }
146     }
147 
148     /*
149      *testBooleanQuery();
150      * 組合查詢，能夠根據多條件進行查詢
151      */
152     @Test
153     public void testBooleanQuery() throws Exception{
154         //建立分詞器(建立索引和全部時所用的分詞器必須一致)
155         Analyzer analyzer = new IKAnalyzer();
156         
157         //布爾查詢,就是能夠根據多個條件組合進行查詢
158         //文件名稱包含apache的,而且文件大小大於等於100 小於等於1000字節的文章
159         BooleanQuery query = new BooleanQuery();
160         
161         //根據數字範圍查詢
162         //查詢文件大小,大於100 小於1000的文章
163         //第一個參數:域名      第二個參數:最小值,  第三個參數:最大值, 第四個參數:是否包含最小值,   第五個參數:是否包含最大值
164         Query numericQuery = NumericRangeQuery.newLongRange("fileSize", 100L, 1000L, true, true);
165         
166         //建立詞元:就是詞,   
167         Term term = new Term("fileName", "apache");
168         //使用TermQuery查詢,根據term對象進行查詢
169         TermQuery termQuery = new TermQuery(term);
170         
171         //Occur是邏輯條件
172         //must至關於and關鍵字,是而且的意思
173         //should,至關於or關鍵字或者的意思
174         //must_not至關於not關鍵字, 非的意思
175         //注意:單獨使用must_not  或者 獨自使用must_not沒有任何意義
176         query.add(termQuery, Occur.MUST);
177         query.add(numericQuery, Occur.MUST);
178         
179         //指定索引和文檔的目錄
180         Directory dir = FSDirectory.open(new File("E:\\dic"));
181         //索引和文檔的讀取對象
182         IndexReader indexReader = IndexReader.open(dir);
183         //建立索引的搜索對象
184         IndexSearcher indexSearcher = new IndexSearcher(indexReader);
185         //搜索:第一個參數爲查詢語句對象, 第二個參數:指定顯示多少條
186         TopDocs topdocs = indexSearcher.search(query, 5);
187         //一共搜索到多少條記錄
188         System.out.println("=====count=====" + topdocs.totalHits);
189         //從搜索結果對象中獲取結果集
190         ScoreDoc[] scoreDocs = topdocs.scoreDocs;
191         
192         for(ScoreDoc scoreDoc : scoreDocs){
193             //獲取docID
194             int docID = scoreDoc.doc;
195             //經過文檔ID從硬盤中讀取出對應的文檔
196             Document document = indexReader.document(docID);
197             //get域名能夠取出值 打印
198             System.out.println("fileName:" + document.get("fileName"));
199             System.out.println("fileSize:" + document.get("fileSize"));
200             System.out.println("============================================================");
201         }
202     }
203 
204     /*
205      *testMathAllQuery();
206      * 查詢全部：
207      */
208     @Test
209     public void testMathAllQuery() throws Exception{
210         //建立分詞器(建立索引和全部時所用的分詞器必須一致)
211         Analyzer analyzer = new IKAnalyzer();
212         
213         //查詢全部文檔
214         MatchAllDocsQuery query = new MatchAllDocsQuery();
215         
216         //指定索引和文檔的目錄
217         Directory dir = FSDirectory.open(new File("E:\\dic"));
218         //索引和文檔的讀取對象
219         IndexReader indexReader = IndexReader.open(dir);
220         //建立索引的搜索對象
221         IndexSearcher indexSearcher = new IndexSearcher(indexReader);
222         //搜索:第一個參數爲查詢語句對象, 第二個參數:指定顯示多少條
223         TopDocs topdocs = indexSearcher.search(query, 5);
224         //一共搜索到多少條記錄
225         System.out.println("=====count=====" + topdocs.totalHits);
226         //從搜索結果對象中獲取結果集
227         ScoreDoc[] scoreDocs = topdocs.scoreDocs;
228         
229         for(ScoreDoc scoreDoc : scoreDocs){
230             //獲取docID
231             int docID = scoreDoc.doc;
232             //經過文檔ID從硬盤中讀取出對應的文檔
233             Document document = indexReader.document(docID);
234             //get域名能夠取出值 打印
235             System.out.println("fileName:" + document.get("fileName"));
236             System.out.println("fileSize:" + document.get("fileSize"));
237             System.out.println("============================================================");
238         }
239     }
240 
241     /*
242      *testMultiFieldQueryParser();
243      * 從多個域中進行查詢
244      */
245     @Test
246     public void testMultiFieldQueryParser() throws Exception{
247         //建立分詞器(建立索引和全部時所用的分詞器必須一致)
248         Analyzer analyzer = new IKAnalyzer();
249         
250         String [] fields = {"fileName","fileContext"};
251         //從文件名稱和文件內容中查詢,只有含有apache的就查出來
252         MultiFieldQueryParser multiQuery = new MultiFieldQueryParser(fields, analyzer);
253         //輸入須要搜索的關鍵字
254         Query query = multiQuery.parse("apache");
255         
256         //指定索引和文檔的目錄
257         Directory dir = FSDirectory.open(new File("E:\\dic"));
258         //索引和文檔的讀取對象
259         IndexReader indexReader = IndexReader.open(dir);
260         //建立索引的搜索對象
261         IndexSearcher indexSearcher = new IndexSearcher(indexReader);
262         //搜索:第一個參數爲查詢語句對象, 第二個參數:指定顯示多少條
263         TopDocs topdocs = indexSearcher.search(query, 5);
264         //一共搜索到多少條記錄
265         System.out.println("=====count=====" + topdocs.totalHits);
266         //從搜索結果對象中獲取結果集
267         ScoreDoc[] scoreDocs = topdocs.scoreDocs;
268         
269         for(ScoreDoc scoreDoc : scoreDocs){
270             //獲取docID
271             int docID = scoreDoc.doc;
272             //經過文檔ID從硬盤中讀取出對應的文檔
273             Document document = indexReader.document(docID);
274             //get域名能夠取出值 打印
275             System.out.println("fileName:" + document.get("fileName"));
276             System.out.println("fileSize:" + document.get("fileSize"));
277             System.out.println("============================================================");
278         }
279     }
280 }

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。