系列文章:
Lucene系列(二)luke使用及索引文檔的基本操作
建立項目並添加Maven依賴
<dependencies>
    <!-- JUnit 4, needed only by the test classes -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
    <!-- Lucene core library -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>7.2.1</version>
    </dependency>
    <!-- Lucene query parser -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queryparser</artifactId>
        <version>7.2.1</version>
    </dependency>
    <!-- Additional Lucene analyzers -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-common</artifactId>
        <version>7.2.1</version>
    </dependency>
    <!-- Hit highlighting -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-highlighter</artifactId>
        <version>7.2.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-smartcn -->
    <!-- Chinese word segmentation; kept on the same version as the other Lucene modules -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-smartcn</artifactId>
        <version>7.2.1</version>
    </dependency>
</dependencies>
複製代碼
寫索引
import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Builds a Lucene index from the regular files found directly inside a data directory.
 *
 * <p>Every file becomes one document with three fields: {@code contents} (tokenized from the
 * file's text), {@code fileName} and {@code fullPath} (both stored so they can be read back
 * from search hits).
 *
 * <p>The class is {@link AutoCloseable} so callers can use try-with-resources; closing releases
 * both the index writer and the underlying directory.
 */
public class Indexer implements AutoCloseable {

    /** Directory the index is written to; closed together with the writer. */
    private final Directory dir;

    /** Writer through which all documents are added to the index. */
    private final IndexWriter writer;

    /**
     * Opens (or creates) the index located at {@code indexDir}.
     *
     * @param indexDir file-system path of the index directory
     * @throws Exception if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDir) throws Exception {
        dir = FSDirectory.open(Paths.get(indexDir));
        IndexWriter opened;
        try {
            Analyzer analyzer = new StandardAnalyzer(); // standard tokenizer/analyzer
            opened = new IndexWriter(dir, new IndexWriterConfig(analyzer));
        } catch (Exception e) {
            // Do not leak the directory handle when the writer cannot be created.
            dir.close();
            throw e;
        }
        writer = opened;
    }

    /**
     * Closes the index writer and then the directory.
     *
     * @throws Exception if closing either resource fails
     */
    @Override
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            dir.close();
        }
    }

    /**
     * Indexes every regular file located directly in {@code dataDir}. Sub-directories and other
     * non-regular entries are skipped.
     *
     * @param dataDir path of the directory whose files are indexed
     * @return the number of documents in the index as reported by the writer
     * @throws IllegalArgumentException if {@code dataDir} is not a directory or cannot be listed
     * @throws Exception if reading a file or writing to the index fails
     */
    public int index(String dataDir) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null for non-directories and on I/O errors.
            throw new IllegalArgumentException("Not a readable directory: " + dataDir);
        }
        for (File f : files) {
            if (f.isFile()) {
                indexFile(f);
            }
        }
        return writer.numDocs();
    }

    /**
     * Logs the file's canonical path and adds its document to the index.
     *
     * @param f file to index
     * @throws Exception if the file cannot be read or the document cannot be added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("索引文件:" + f.getCanonicalPath());
        writer.addDocument(getDocument(f));
    }

    /**
     * Builds the Lucene document for one file.
     *
     * @param f file to convert
     * @return a document holding the {@code contents}, {@code fileName} and {@code fullPath} fields
     * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
     */
    private Document getDocument(File f) throws Exception {
        Document doc = new Document();
        // NOTE(review): FileReader decodes with the platform default charset — confirm the
        // data files match it, otherwise non-ASCII content will be tokenized incorrectly.
        doc.add(new TextField("contents", new FileReader(f)));
        doc.add(new TextField("fileName", f.getName(), Field.Store.YES));
        doc.add(new TextField("fullPath", f.getCanonicalPath(), Field.Store.YES));
        return doc;
    }

    /**
     * Indexes the sample data directory and prints how many documents the index holds and how
     * long the run took. Failures are printed and the summary line is still emitted.
     */
    public static void main(String[] args) {
        String indexDir = "D:\\lucene\\searchindex";
        String dataDir = "D:\\lucene\\data";
        int numIndexed = 0;
        long start = System.currentTimeMillis();
        // try-with-resources closes the indexer only if it was actually constructed,
        // so a constructor failure no longer triggers a NullPointerException on close.
        try (Indexer indexer = new Indexer(indexDir)) {
            numIndexed = indexer.index(dataDir);
        } catch (Exception e) {
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        System.out.println("索引:" + numIndexed + " 個文件 花費了" + (end - start) + " 毫秒");
    }
}
複製代碼
讀取索引
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
/**
 * JUnit fixture that opens the index at {@code D:\lucene\searchindex} before each test and
 * releases it afterwards.
 */
public class SearchTest {

    /** Directory holding the index files. */
    private Directory dir;

    /** Reader opened on {@link #dir}. */
    private IndexReader reader;

    /** Searcher built on {@link #reader}; used by the test methods. */
    private IndexSearcher is;

    /**
     * Opens the index directory, a reader on it and a searcher on the reader.
     *
     * @throws Exception if the index cannot be opened
     */
    @Before
    public void setUp() throws Exception {
        dir = FSDirectory.open(Paths.get("D:\\lucene\\searchindex"));
        reader = DirectoryReader.open(dir);
        is = new IndexSearcher(reader);
    }

    /**
     * Closes the reader and the directory. Both are null-checked because setUp may have failed
     * part-way, leaving one or both unassigned.
     *
     * @throws Exception if closing either resource fails
     */
    @After
    public void tearDown() throws Exception {
        try {
            if (reader != null) {
                reader.close();
            }
        } finally {
            if (dir != null) {
                dir.close();
            }
        }
    }
}
複製代碼
對特定單詞查詢和模糊查詢
/**
 * Runs an exact term query and a fuzzy query against the {@code contents} field and prints,
 * for each, the total hit count followed by the stored {@code fullPath} of the top 10 hits.
 *
 * @throws Exception if searching or loading a document fails
 */
@Test
public void testTermQuery() throws Exception {
    String searchField = "contents";
    // A TermQuery is not analyzed: the text must be a complete indexed term or nothing matches.
    String q = "authorship";
    String fuzzyText = "authorshioo";

    // A Term is one word of one field; TermQuery matches documents containing exactly that term.
    Query query = new TermQuery(new Term(searchField, q));

    // FuzzyQuery(term, maxEdits): maxEdits is the number of character edits tolerated (0, 1 or 2).
    // NOTE(review): "authorshioo" is two edits away from "authorship" (p->o plus an extra 'o'),
    // so with maxEdits = 1 it cannot match that term — confirm whether 2 was intended.
    Query query2 = new FuzzyQuery(new Term(searchField, fuzzyText), 1);

    printTermHits(q, query);
    printTermHits(fuzzyText, query2);
}

/**
 * Executes {@code query} for the top 10 hits and prints the total hit count and each hit's
 * stored {@code fullPath}.
 *
 * @param text  the searched text, used only in the printed summary line
 * @param query the query to execute
 * @throws Exception if searching or loading a document fails
 */
private void printTermHits(String text, Query query) throws Exception {
    TopDocs hits = is.search(query, 10);
    // totalHits is the number of matching documents, which may exceed the 10 returned.
    System.out.println("匹配 '" + text + "',總共查詢到" + hits.totalHits + "個文檔");
    for (ScoreDoc scoreDoc : hits.scoreDocs) {
        Document doc = is.doc(scoreDoc.doc);
        System.out.println(doc.get("fullPath"));
    }
}
複製代碼
我們上面查詢了單詞「authorship」以及模糊查詢了單詞「authorshioo」,結果如下:
Query query2 = new FuzzyQuery(new Term(searchField,"authorshioo"),1);
複製代碼
解析表達式的使用
/**
 * Parses two free-text query expressions with the classic query parser and, for each one,
 * prints the total hit count followed by the stored {@code fullPath} of the top 10 hits.
 *
 * @throws Exception if parsing, searching or loading a document fails
 */
@Test
public void testQueryParser() throws Exception {
    final String field = "contents";
    // The same standard analyzer is used to analyze every expression.
    final Analyzer standard = new StandardAnalyzer();
    final String[] expressions = {
        "atomic a atomicReader",
        "AtomicReader and AtomicReaderContext"
    };

    for (String expression : expressions) {
        // The parser is given the default field to search and the analyzer for the query text.
        QueryParser queryParser = new QueryParser(field, standard);
        // Only the 10 best hits are returned even if more documents match.
        TopDocs top = is.search(queryParser.parse(expression), 10);
        System.out.println("匹配 " + expression + "查詢到" + top.totalHits + "個記錄");
        for (int i = 0; i < top.scoreDocs.length; i++) {
            Document hit = is.doc(top.scoreDocs[i].doc);
            System.out.println(hit.get("fullPath"));
        }
    }
}
複製代碼
我們上面分別查詢了:「atomic a atomicReader」和「AtomicReader and AtomicReaderContext」,通過查詢結果可以看出即使稍微改變查詢內容,也還是可以查詢到和我們給出的表達式相關的文檔。
寫索引
import java.nio.file.Paths;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Writes three sample Chinese documents (id, city, description) into a Lucene index using the
 * smart Chinese analyzer.
 */
public class Indexer {

    // Parallel arrays: element i of each array belongs to the same document.
    private final String[] ids = {"1", "2", "3"};
    private final String[] citys = {"青島", "南京", "上海"};
    private final String[] descs = {
        "青島是一個漂亮的城市。",
        "南京是一個文化的城市。",
        "上海是一個繁華的城市。"
    };

    /** Directory the index is written to; assigned by {@link #index(String)}. */
    private Directory dir;

    /**
     * Creates an index writer on {@link #dir} configured with the Chinese analyzer.
     *
     * @return a new writer; the caller is responsible for closing it
     * @throws Exception if the writer cannot be created
     */
    private IndexWriter getWriter() throws Exception {
        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Opens the index at {@code indexDir} and adds one document per sample entry.
     *
     * <p>{@code id} and {@code city} are added as {@code StringField}s and {@code desc} as a
     * {@code TextField}; all three are stored.
     *
     * @param indexDir file-system path of the index directory
     * @throws Exception if the index cannot be opened or written
     */
    private void index(String indexDir) throws Exception {
        dir = FSDirectory.open(Paths.get(indexDir));
        // Resources are closed in reverse order (writer, then directory), also when
        // addDocument throws.
        try (Directory openDir = dir; IndexWriter writer = getWriter()) {
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new StringField("id", ids[i], Field.Store.YES));
                doc.add(new StringField("city", citys[i], Field.Store.YES));
                doc.add(new TextField("desc", descs[i], Field.Store.YES));
                writer.addDocument(doc);
            }
        }
    }

    /** Builds the sample index and prints a confirmation line. */
    public static void main(String[] args) throws Exception {
        new Indexer().index("D:\\lucene\\dataindex2");
        System.out.println("Success Indexer");
    }
}
複製代碼
中文查詢及高亮顯示
import java.io.StringReader;
import java.nio.file.Paths;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Searches the {@code desc} field of a Lucene index with a Chinese query and prints each hit
 * together with an HTML-highlighted fragment of its description.
 *
 * @author LXY
 */
public class SearchTest {

    /**
     * Parses {@code par} against the {@code desc} field, runs the search and prints, for each
     * of the top 10 hits, the stored {@code city}, the stored {@code desc} and the best
     * highlighted fragment of {@code desc}.
     *
     * @param indexDir file-system path of the index directory
     * @param par      query expression to parse and search for
     * @throws Exception if the index cannot be opened, the query cannot be parsed, or
     *                   searching/highlighting fails
     */
    public static void search(String indexDir, String par) throws Exception {
        // try-with-resources releases the directory, reader and analyzer on every exit path.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir);
             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {

            IndexSearcher searcher = new IndexSearcher(reader);

            // First argument: default field to search; second: analyzer for the query text.
            QueryParser parser = new QueryParser("desc", analyzer);
            Query query = parser.parse(par);

            // Time only the search itself.
            long start = System.currentTimeMillis();
            // Second argument: maximum number of hits to return.
            TopDocs topDocs = searcher.search(query, 10);
            long end = System.currentTimeMillis();
            System.out.println("匹配" + par + ",總共花費了" + (end - start) + "毫秒,共查到" + topDocs.totalHits + "條記錄。");

            // --- highlighting setup ---
            // Scores text fragments against the query.
            QueryScorer scorer = new QueryScorer(query);
            // Splits text into fragments based on the scorer.
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
            // Opening and closing markup wrapped around each matched term.
            SimpleHTMLFormatter simpleHTMLFormatter =
                    new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
            highlighter.setTextFragmenter(fragmenter);

            // Each ScoreDoc carries the hit's document number (and score).
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = searcher.doc(scoreDoc.doc);
                String desc = document.get("desc");
                // Print the stored city and description of the hit.
                System.out.println(document.get("city"));
                System.out.println(desc);
                if (desc != null) {
                    // Re-tokenize the stored text so the highlighter can locate matched terms.
                    TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
                    // Print the best-scoring fragment with matches wrapped in the markup above.
                    System.out.println(highlighter.getBestFragment(tokenStream, desc));
                }
            }
        }
    }

    /** Runs a sample search for "南京" against the sample index and prints any failure. */
    public static void main(String[] args) {
        // Location of the index to search.
        String indexDir = "D:\\lucene\\dataindex2";
        // Query text.
        String par = "南京";
        try {
            search(indexDir, par);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
複製代碼
結果會把我們查詢的「南京」單詞給高亮顯示,這在我們平時搜索中很常見了。
歡迎關注我的微信公衆號:「Java面試通關手冊」(堅持原創,分享美文,分享各種Java學習資源、面試題,以及企業級Java實戰項目,回覆關鍵字免費領取):
Lucene我想暫時先更新到這裏,僅僅這三篇文章想掌握Lucene是遠遠不夠的。另外我這裏三篇文章都用的最新的jar包,Lucene更新太快,5系列後的版本和之前的有些地方還是有挺大差距的,就比如爲文檔域設置權值的setBoost方法6.6之後已經被廢除了等等。因爲時間有限,所以我就草草的看了一下Lucene的官方文檔,大多數內容還是看java1234網站的這個視頻來學習的,然後在版本和部分代碼上做了改進。截止2018/4/1,上述代碼所用的jar包皆爲最新。
最後推薦一下自己覺得還不錯的Lucene學習網站/博客:
官方網站:[Welcome to Apache Lucene](https://lucene.apache.org/)
Github:Apache Lucene and Solr