lucene

全文檢索

全文檢索是計算機程序經過掃描文章中的每個詞,對每個詞創建一個索引,指明該詞在文章中出現的次數和位置。當用戶查詢時根據創建的索引查找,相似於經過字典的檢索字表查字的過程。


全文檢索(Full-Text Retrieval)是指以文本做爲檢索對象,找出含有指定詞彙的文本。全面、準確和快速是衡量全文檢索系統的關鍵指標。

全文檢索特色:
一、只處理文本
二、不處理語義
三、搜索時英文不區分大小寫
四、結果列表有相關度排序。

Lucene

一個開源的全文檢索框架工具

lucene相關API介紹

IndexWriter:建立和維護索引庫
Analyzer:在建立索引時會用到分詞器,在使用字符串搜索時也會用到分詞器,這兩個地方要使用同一個分詞器,不然可能會搜索不出結果。Analyzer(分詞器)的做用是把一段文本中的詞按規則取出所包含的全部詞。對應的是Analyzer類,這是一個抽象類,切分詞的具體規則是由子類實現的,因此對於不一樣的語言(規則),要用不一樣的分詞器。
Directory:索引庫的位置
Document:文檔是索引和搜索的單元。一個文檔能夠包含一組Field(字段)
Field:字段,包含名稱和值,字段能夠選擇性的存儲在索引中
IndexSearcher:檢索工具
Query:查詢,lucene中支持模糊查詢,語義查詢,短語查詢,組合查詢等等,若有TermQuery,BooleanQuery,RangeQuery,WildcardQuery等一些類。
QueryParser: 是一個解析用戶輸入的工具,能夠經過掃描用戶輸入的字符串,生成Query對象。
Hits:在搜索完成以後,須要把搜索結果返回並顯示給用戶,只有這樣纔算是完成搜索的目的。在lucene中,搜索的結果的集合是用Hits類的實例來表示的。

 

package lucenedemo.lucenedemo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

public class LuceneDemo {

	/**
	 * Builds the index: reads D:/test.txt line by line and stores each line as
	 * one {@link Document} with a single analyzed, stored "line" field.
	 *
	 * <p>Fix over the original: the reader and writer are now closed in a
	 * finally block, so they no longer leak when indexing throws mid-way.
	 *
	 * @throws Exception if the index directory or input file cannot be accessed
	 */
	@Test
	public void buildIndex() throws Exception {

		File path = new File("D:/lucene");
		Directory d = FSDirectory.open(path);

		Analyzer a = new StandardAnalyzer(Version.LUCENE_30);
		// IndexWriter args: index location, analyzer, whether field length is limited
		IndexWriter indexWriter = new IndexWriter(d, a,
				MaxFieldLength.UNLIMITED);
		BufferedReader reader = null;
		try {
			// Rebuild from scratch on every run.
			indexWriter.deleteAll();
			reader = new BufferedReader(new FileReader(new File(
					"D:/test.txt")));
			String line;
			while ((line = reader.readLine()) != null) {
				Document doc = new Document();
				// Index.ANALYZED is required for the field to be searchable.
				Field field = new Field("line", line, Field.Store.YES,
						Index.ANALYZED);
				doc.add(field);
				indexWriter.addDocument(doc);
			}
			// Merge segments for faster searches (Lucene 3.x API).
			indexWriter.optimize();
			indexWriter.commit();
		} finally {
			// Close even on failure; the original leaked both on any exception.
			if (reader != null) {
				reader.close();
			}
			indexWriter.close(true);
		}
	}

	/**
	 * Searches the "line" field for the term "Spark" and prints the total hit
	 * count of the top-5 search.
	 *
	 * @throws Exception if the index cannot be opened or the query is invalid
	 */
	@Test
	public void searchTermQuery() throws Exception {
		File path = new File("D:/lucene");
		Directory d = FSDirectory.open(path);
		IndexSearcher indexSearcher = new IndexSearcher(d);
		try {
			Analyzer a = new StandardAnalyzer(Version.LUCENE_30);

			QueryParser queryParser = new QueryParser(Version.LUCENE_30, "line", a);
			Query query = queryParser.parse("Spark");
			// Retrieve at most the top 5 documents.
			TopDocs result = indexSearcher.search(query, 5);
			System.out.println(result.totalHits);
		} finally {
			// Release the searcher even if parsing/searching throws.
			indexSearcher.close();
		}
	}

	/**
	 * Highlighted search: finds "spark" in the "line" field and prints each
	 * matching line with the hit wrapped in &lt;B&gt;...&lt;/B&gt;.
	 *
	 * <p>Fixes over the original: the searcher is now closed (it never was),
	 * the per-document TokenStream is closed, and the parser uses the same
	 * {@code Version.LUCENE_30} as the rest of the class instead of
	 * {@code LUCENE_CURRENT}.
	 *
	 * @throws Exception if the index cannot be opened or the query is invalid
	 */
	@Test
	public void searchHL() throws Exception {
		File path = new File("D:/lucene");
		Directory d = FSDirectory.open(path);
		IndexSearcher indexSearcher = new IndexSearcher(d);
		try {
			Analyzer a = new StandardAnalyzer(Version.LUCENE_30);

			String[] fields = { "line" };
			String[] q = { "spark" };
			// Use LUCENE_30 consistently with buildIndex/searchTermQuery.
			Query query = MultiFieldQueryParser.parse(Version.LUCENE_30, q,
					fields, a);

			// Collect every matching document (maxDoc is an upper bound),
			// unsorted-by-field (false = score order).
			TopScoreDocCollector topCollector = TopScoreDocCollector.create(
					indexSearcher.maxDoc(), false);

			indexSearcher.search(query, topCollector);
			// Highlight matched terms by wrapping them in <B> tags.
			SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter(
					"<B>", "</B>");
			Highlighter highlighter = new Highlighter(simpleHtmlFormatter,
					new QueryScorer(query));

			ScoreDoc[] docs = topCollector.topDocs(5).scoreDocs;
			for (ScoreDoc scdoc : docs) {
				Document document = indexSearcher.doc(scdoc.doc);

				// Re-analyze the stored text so the highlighter can locate terms.
				TokenStream tokenStream = a.tokenStream("line", new StringReader(
						document.get("line")));
				try {
					String str = highlighter.getBestFragment(tokenStream,
							document.get("line"));
					System.out.println(str);
				} finally {
					tokenStream.close();
				}
			}
		} finally {
			// The original never closed the searcher — resource leak.
			indexSearcher.close();
		}
	}

}

須要的jar包

		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>3.0.0</version>
		</dependency>

		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers</artifactId>
			<version>3.0.0</version>
		</dependency>

		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-highlighter</artifactId>
			<version>3.0.0</version>
		</dependency>

		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-memory</artifactId>
			<version>3.0.0</version>
		</dependency>
相關文章
相關標籤/搜索