1. Data Classification:
2. Retrieval Process: The Lucene retrieval process can be divided into two parts. The left side of the figure above shows how an index is created from structured, semi-structured, and unstructured data; the right side shows how that index is queried.
3. Inverted Index:
Lucene retrieves by keyword: through the index, a keyword can be mapped to the documents that contain it. This keyword-to-document mapping is the reverse of the document-to-term mapping, which is why it is called an inverted index.
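To make the idea concrete, here is a minimal sketch of an inverted index built with plain Java collections. It is illustrative only and not part of Lucene; the MiniInvertedIndex class and its methods are made up for the example.

import java.util.*;

public class MiniInvertedIndex {
    // term -> set of document ids that contain the term
    private final Map<String, Set<Integer>> postings = new HashMap<>();

    public void addDocument(int docId, String text) {
        // split on whitespace as a crude stand-in for a real analyzer
        for (String term : text.toLowerCase().split("\\s+")) {
            postings.computeIfAbsent(term, t -> new TreeSet<>()).add(docId);
        }
    }

    public Set<Integer> search(String term) {
        return postings.getOrDefault(term.toLowerCase(), Collections.emptySet());
    }

    public static void main(String[] args) {
        MiniInvertedIndex index = new MiniInvertedIndex();
        index.addDocument(1, "hello world");
        index.addDocument(2, "hello main man test");
        System.out.println(index.search("hello")); // [1, 2]
        System.out.println(index.search("main"));  // [2]
    }
}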
4. Creating an Index:
5. Searching the Index:
1. Key Terms:
2. Weight Calculation:
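As a rough sketch of the classic TF-IDF weighting (ClassicSimilarity; note that Lucene 6.x actually defaults to BM25Similarity, which is built from the same term statistics): tf(t, d) = sqrt(frequency of t in d), idf(t) = 1 + ln(numDocs / (docFreq(t) + 1)), and the weight of t in d is roughly tf(t, d) × idf(t) × norm(t, d), times any boost. Terms that are frequent within a document but rare across the collection therefore receive the largest weights.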
3. Vector Space Model:
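In the vector space model, both the query q and each document d are represented as vectors of those term weights, and relevance is estimated by the cosine of the angle between them: score(q, d) ≈ V(q) · V(d) / (|V(q)| × |V(d)|). Lucene's practical scoring formula is a refinement of this cosine similarity, and the resulting per-document scores are what come back in the ScoreDoc array during a search.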
Run the demo files shipped with the source code and the examples from the Lucene core API to see how Lucene creates an index and searches it.
package Test;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

/**
 * Created by rzx on 2017/6/1.
 */
public class CindexSearch {

    public static void createIndexANDSearchIndex() throws Exception {
        Analyzer analyzer = new StandardAnalyzer();          // standard analyzer
        // RAMDirectory keeps the index in memory
        Directory directory = new RAMDirectory();
        // Directory directory = FSDirectory.open(Paths.get("/tmp/testindex")); // disk-based index (FSDirectory.open takes a Path)
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(directory, config);

        Document document = new Document();
        String text = "hello world main test";
        document.add(new Field("filetest", text, TextField.TYPE_STORED)); // add the field to the document
        writer.addDocument(document);
        writer.close();

        DirectoryReader directoryReader = DirectoryReader.open(directory);
        IndexSearcher isearch = new IndexSearcher(directoryReader);
        QueryParser parser = new QueryParser("filetest", new StandardAnalyzer());
        Query query = parser.parse("main");                  // query for the keyword "main"
        ScoreDoc[] hits = isearch.search(query, 1000).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document hitdoc = isearch.doc(hits[i].doc);
            System.out.print("Matched document content: " + hitdoc.get("filetest"));
        }
        directoryReader.close();
        directory.close();
    }

    public static void main(String[] args) {
        try {
            createIndexANDSearchIndex();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Output:
Two files, test1.txt and test2.txt, were created in the input directory, containing "hello world" and "hello main man test" respectively. Running IndexFiles reads the input directory and automatically creates a testindex index directory. The result is as follows:
The testindex directory is created and holds the index data. Running SearchFiles looks like this, entering the search keywords hello and main in the console:
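Assuming these are the standard Lucene demo classes (org.apache.lucene.demo.IndexFiles and org.apache.lucene.demo.SearchFiles) and that the demo, core, analysis, and queryparser jars are on the classpath, the two runs look roughly like this:

java -cp "lucene-core-6.5.1.jar:lucene-demo-6.5.1.jar:..." org.apache.lucene.demo.IndexFiles -docs input -index testindex
java -cp "lucene-core-6.5.1.jar:lucene-demo-6.5.1.jar:..." org.apache.lucene.demo.SearchFiles -index testindex

SearchFiles then prompts for query keywords interactively on the console.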
Building an index involves: Analyzer, Directory (RAMDirectory, FSDirectory), IndexWriterConfig, IndexWriter, Document; a combined sketch follows the snippet list below.
* Analyzer analyzer = new StandardAnalyzer(); // instantiate the analyzer
* Directory directory = new RAMDirectory(); // in-memory index directory
* Directory directory = FSDirectory.open(Paths.get("indexpath")); // disk-based index directory (FSDirectory.open takes a Path in Lucene 6.x)
* IndexWriterConfig config = new IndexWriterConfig(analyzer); // index writer configuration
* IndexWriter writer = new IndexWriter(directory, config); // the index writer
* Document document = new Document(); // a Document holds the data to be indexed
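Putting those pieces together, here is a minimal sketch of disk-based indexing. The "path"/"contents" field names and the "indexpath" and file paths are placeholders chosen to match the demo above; besides the imports used earlier it needs java.nio.file.Paths, org.apache.lucene.store.FSDirectory, and org.apache.lucene.document.StringField.

Analyzer analyzer = new StandardAnalyzer();
Directory dir = FSDirectory.open(Paths.get("indexpath"));                         // disk-based index directory
IndexWriterConfig config = new IndexWriterConfig(analyzer);
try (IndexWriter writer = new IndexWriter(dir, config)) {
    Document doc = new Document();
    doc.add(new StringField("path", "input/test2.txt", Field.Store.YES));         // stored, not tokenized
    doc.add(new TextField("contents", "hello main man test", Field.Store.YES));   // tokenized and stored
    writer.addDocument(doc);
}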
* DirectoryReader directoryReader = DirectoryReader.open(directory); // reader for the index directory
* IndexSearcher isearch = new IndexSearcher(directoryReader); // index searcher
* Several ways to build a query (combined in the example after this list):
  * QueryParser bound to a single field:
    QueryParser qparser = new QueryParser("field", new StandardAnalyzer()); // query parser: field name plus analyzer
    Query query = qparser.parse("main"); // parse the keyword
  * MultiFieldQueryParser bound to several fields:
    QueryParser qparser2 = new MultiFieldQueryParser(new String[]{"field1", "field2"}, new StandardAnalyzer()); // multi-field query parser
    Query query = qparser2.parse("main"); // parse the keyword
  * Term query bound to a field: new Term(field, keyword);
    Term term = new Term("content", "main");
    Query query = new TermQuery(term);
  * More methods: see http://blog.csdn.net/chenghui0317/article/details/10824789
* ScoreDoc[] hits = isearch.search(query, 1000).scoreDocs; // the matched documents with their scores (and shard index)
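For example, the approaches above can be combined with a BooleanQuery against the in-memory index built earlier. This is a sketch; the field names are illustrative, and it additionally needs org.apache.lucene.search.BooleanQuery, BooleanClause, TermQuery, TopDocs, org.apache.lucene.index.Term, and org.apache.lucene.queryparser.classic.MultiFieldQueryParser.

// exact term lookup, no analysis applied to the keyword
Query termQuery = new TermQuery(new Term("filetest", "main"));

// parsed query over two fields
QueryParser multiParser = new MultiFieldQueryParser(new String[]{"filetest", "contents"}, new StandardAnalyzer());
Query parsedQuery = multiParser.parse("main OR hello");

// boolean combination: the term query must match, the parsed query is optional
BooleanQuery combined = new BooleanQuery.Builder()
        .add(termQuery, BooleanClause.Occur.MUST)
        .add(parsedQuery, BooleanClause.Occur.SHOULD)
        .build();
TopDocs topDocs = isearch.search(combined, 10);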
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(400));
String conten = highlighter.getBestFragment(new StandardAnalyzer(), "contents", "hello main man test");
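Note that SimpleHTMLFormatter, Highlighter, QueryScorer, and SimpleFragmenter live in the org.apache.lucene.search.highlight package, so the lucene-highlighter module has to be on the classpath alongside lucene-core. The snippet wraps every matched term in the given HTML tags and limits each returned fragment to 400 characters.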
QueryParser qparser = new QueryParser("content", new SimpleAnalyzer());
QueryParser qparser = new QueryParser("content", new ClassicAnalyzer());
QueryParser qparser = new QueryParser("content", new KeywordAnalyzer());
QueryParser qparser = new QueryParser("content", new StopAnalyzer());
QueryParser qparser = new QueryParser("content", new UAX29URLEmailAnalyzer());
QueryParser qparser = new QueryParser("content", new UnicodeWhitespaceAnalyzer());
QueryParser qparser = new QueryParser("content", new WhitespaceAnalyzer());
QueryParser qparser = new QueryParser("content", new ArabicAnalyzer());
QueryParser qparser = new QueryParser("content", new ArmenianAnalyzer());
QueryParser qparser = new QueryParser("content", new BasqueAnalyzer());
QueryParser qparser = new QueryParser("content", new BrazilianAnalyzer());
QueryParser qparser = new QueryParser("content", new BulgarianAnalyzer());
QueryParser qparser = new QueryParser("content", new CatalanAnalyzer());
QueryParser qparser = new QueryParser("content", new CJKAnalyzer());
QueryParser qparser = new QueryParser("content", new CollationKeyAnalyzer(collator)); // requires a java.text.Collator instance
QueryParser qparser = new QueryParser("content", new CustomAnalyzer(...)); // CustomAnalyzer(Version defaultMatchVersion, CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] tokenFilters, Integer posIncGap, Integer offsetGap); normally created via CustomAnalyzer.builder()
QueryParser qparser = new QueryParser("content", new SmartChineseAnalyzer()); // Chinese word segmentation
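The choice of analyzer decides how text is tokenized before it reaches the index or the query. A quick way to compare analyzers is to print the tokens each one produces; this is a small sketch, assuming lucene-core and lucene-analyzers-common are on the classpath.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerCompare {

    static void printTokens(Analyzer analyzer, String text) throws Exception {
        try (TokenStream ts = analyzer.tokenStream("content", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.print("[" + term.toString() + "] ");
            }
            ts.end();
            System.out.println();
        }
    }

    public static void main(String[] args) throws Exception {
        String text = "Hello, Main-Man test 2017";
        printTokens(new StandardAnalyzer(), text);   // lower-cases and drops punctuation
        printTokens(new WhitespaceAnalyzer(), text); // splits on whitespace only
    }
}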
public static void searchByIndex(String indexFilePath, String keyword) throws ParseException, InvalidTokenOffsetsException {
    try {
        String indexDataPath = "testindex";
        String keyWord = "main";
        Directory dir = FSDirectory.open(new File(indexDataPath).toPath());
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser queryParser = new QueryParser("contents", new StandardAnalyzer());
        Query query = queryParser.parse("main");
        TopDocs topdocs = searcher.search(query, 10);
        ScoreDoc[] scoredocs = topdocs.scoreDocs;
        System.out.println("Max score: " + topdocs.getMaxScore());
        for (int i = 0; i < scoredocs.length; i++) {
            int doc = scoredocs[i].doc;
            Document document = searcher.doc(doc);
            System.out.println("=====================================");
            System.out.println("Keyword: " + keyWord);
            System.out.println("File path: " + document.get("path"));
            System.out.println("Doc ID: " + scoredocs[i].doc);
            // highlighting starts here
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
            Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
            highlighter.setTextFragmenter(new SimpleFragmenter(400));
            String conten = highlighter.getBestFragment(new StandardAnalyzer(), "contents", "hello main man test");
            // String conten = highlighter.getBestFragment(new StandardAnalyzer(), "contents", document.get("content"));
            System.out.println("Content: " + conten);
            System.out.println("Score: " + scoredocs[i].score);
        }
        reader.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Output:
Source code: https://codeload.github.com/NextNight/luncene6.5.1Test/zip/master