淺談文本檢索工具Lucene

時間 2019-11-07

原文鏈接

最近項目要用文本檢索工具，就瞭解到了 Lucene。我查了不少 Lucene 的資料，瞭解到 Lucene 的版本比較多，且每一個版本裏面的方法變更比較大。最後確定使用 Lucene 4.7.2 版本。理論就不多說了，網上有不少介紹理論的文章，我就直接把能用的代碼介紹一下。

1.首先創建索引工具

public static void indexBuilding(String indexPath, List<Book> datas){// indexPath表示索引存放的目錄 datas表示數據（能夠從文本里、表裏等取到數據）
try {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
indexWriterConfig.setOpenMode(OpenMode.CREATE);//已建立模式創建索引
IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(indexPath)), indexWriterConfig);
for (Book bood: datas) {
indexWriter.addDocument(Document(wotvMediaInfoModel));//寫入索引
}
indexWriter.close();
} catch (Exception e) {
e.printStackTrace();
}測試

}ui

public static Document Document(Book book) {
Document doc = new Document();
doc.add(new StringField("name", book.getName(), Field.Store.YES));//name建立索引，並保存
doc.add(new StringField("title", book.getTitle(), Field.Store.No));//title建立索引，不保存
doc.add(new TextField("name", book.getAuthor(), Field.Store.YES));//author建立索引，保存
return doc;
}this

//數據封裝對象

/**
 * Plain data holder for one book record that is fed into the index.
 */
public class Book implements Serializable {

    private static final long serialVersionUID = 1L;

    private int id;
    private String name;
    private String title;
    private String author;

    /** Creates an empty book; populate it through the setters. */
    public Book() {
    }

    public void setId(int id) {
        this.id = id;
    }

    public int getId() {
        return id;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return title;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getAuthor() {
        return author;
    }

}

2.創建完索引，就可以進行查詢了

/**
 * Runs a keyword search against the "name" field.
 *
 * <p>The keyword is first segmented with the IK analyzer; the resulting terms
 * are then handed to {@code queryByOneKey}.
 *
 * @param keyword raw user input
 * @return the matching results, or {@code null} when segmentation produced
 *         nothing, when nothing matched, or when an exception was caught
 */
public List<QueryResult> query(String keyword) {
    final long startedAt = System.currentTimeMillis();
    try {
        IndexSearcher searcher = initIndexSearch();
        String segmented = IK_Analyzer(keyword);
        if (!StringUtil.isEmpty(segmented)) {
            List<QueryResult> hits = queryByOneKey(searcher, "name", segmented);
            if (!CollectionUtil.isEmpty(hits)) {
                long elapsedMillis = System.currentTimeMillis() - startedAt;
                logger.info("queryByOneField 耗時：" + elapsedMillis + "ms");
                for (QueryResult hit : hits) {
                    logger.info("queryByOneField查詢結果：" + hit.toString());
                }
                return hits;
            }
        }
        // Segmentation failed or nothing matched: fall through to the null return.
    } catch (Exception e) {
        logger.error("queryByOneField error", e);
    }
    return null;
}

//拿到索引，並建立IndexSearcher

/**
 * Returns the shared {@code IndexSearcher}, opening the index on first use.
 *
 * <p>The searcher is cached in the static {@code indexSearcher} field and
 * reused by later calls. No synchronization is performed here.
 *
 * @return the cached searcher, or {@code null} if the index could not be opened
 */
protected static IndexSearcher initIndexSearch() {
    if (indexSearcher != null) {
        return indexSearcher;
    }
    try {
        DirectoryReader directoryReader =
                DirectoryReader.open(FSDirectory.open(new File(Lucene.Path.indexFilePath)));
        indexSearcher = new IndexSearcher(directoryReader);
    } catch (IOException e) {
        // Report once through the logger, under this method's own name.
        logger.error("initIndexSearch error", e);
        return null;
    }
    return indexSearcher;
}

//利用IK進行分詞

/**
 * Segments {@code str} with the IK analyzer and joins the tokens with spaces.
 *
 * <p>Only tokens whose offset span is longer than one character are kept.
 * Each kept token is followed by a single space, so a non-empty result ends
 * with a trailing space.
 *
 * @param str text to segment; {@code null} is treated as empty input
 * @return the space-separated tokens, or an empty string when there are none
 *         or when tokenization failed
 */
protected static String IK_Analyzer(String str) {
    if (str == null) {
        return "";
    }
    Analyzer ikAnalyzer = new IKAnalyzer();
    StringBuilder results = new StringBuilder();
    TokenStream tokenStream = null;
    try {
        tokenStream = ikAnalyzer.tokenStream("", new StringReader(str));
        // addAttribute returns the existing instance or registers a new one,
        // so it is safe whether or not the tokenizer already declared it.
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            int length = offsetAttribute.endOffset() - offsetAttribute.startOffset();
            if (length > 1) {
                results.append(charTermAttribute.toString()).append(" ");
            }
        }
        // Completes the TokenStream consumer workflow before close().
        tokenStream.end();
    } catch (IOException e) {
        logger.error("IK_Analyzer error", e);
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (IOException e) {
                logger.error("IK_Analyzer error", e);
            }
        }
        ikAnalyzer.close();
    }
    logger.info("IK_Analyzer字典分詞結果:[" + results + "]");
    return results.toString();
}

//查詢結果

/**
 * Searches a single field for the given query text and returns the top hits.
 *
 * <p>The query text is parsed with a {@code StandardAnalyzer}; terms are
 * combined with {@code Operator.OR} (use {@code Operator.AND} to require every
 * term). At most 3 best-scoring documents are returned.
 *
 * @param indexSearcher searcher over the index to query
 * @param field         name of the field to search
 * @param keys          query text (for example space-separated terms)
 * @return the converted hits, or {@code null} if reading the index failed
 * @throws ParseException if {@code keys} cannot be parsed as a query
 */
protected static List<QueryResult> queryByOneKey(IndexSearcher indexSearcher, String field, String keys)
        throws ParseException {
    try {
        long startedAt = System.currentTimeMillis();
        QueryParser queryParser = new QueryParser(Version.LUCENE_47, field,
                new StandardAnalyzer(Version.LUCENE_47));
        // Operator.AND = all terms must match; Operator.OR = any term may match.
        queryParser.setDefaultOperator(Operator.OR);
        Query query = queryParser.parse(keys);
        // Fetch the 3 best-matching records.
        TopDocs topDocs = indexSearcher.search(query, 3);
        long elapsedMillis = System.currentTimeMillis() - startedAt;
        logger.info("queryByOneKey 總共爲您找到 " + topDocs.totalHits + "條記錄\t耗時：" + elapsedMillis
                + "ms");
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<QueryResult> list = new ArrayList<QueryResult>(scoreDocs.length);
        for (ScoreDoc scoreDoc : scoreDocs) {
            list.add(getIndexResult(indexSearcher.doc(scoreDoc.doc), scoreDoc.score));
        }
        return list;
    } catch (IOException e) {
        // Report once through the logger; callers treat null as "no results".
        logger.error("queryByOneKey error", e);
    }
    return null;
}

/**
 * Assembles a {@code QueryResult} from a matched document and its score.
 *
 * <p>Reads the "cid", "res" and "name" fields from the document.
 * NOTE(review): {@code doc.get} presumably yields {@code null} for a field
 * that was not stored in the index — confirm that "cid" and "res" are stored.
 *
 * @param doc   the matched Lucene document
 * @param score the relevance score of the match
 * @return the populated result object
 */
private static QueryResult getIndexResult(Document doc, float score) {
    String cid = doc.get("cid");
    String res = doc.get("res");
    String name = doc.get("name");

    QueryResult result = new QueryResult();
    result.setCid(cid);
    result.setRes(res);
    result.setName(name);
    result.setScore(score);
    return result;
}

//結果對象封裝

/**
 * One search hit: the stored field values of the matched document plus its
 * relevance score.
 */
public class QueryResult {

    private String cid;
    private String res;
    private String name;
    private String title;
    private String author;
    private float score;

    public void setCid(String cid) {
        this.cid = cid;
    }

    public String getCid() {
        return cid;
    }

    public void setRes(String res) {
        this.res = res;
    }

    public String getRes() {
        return res;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return title;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getAuthor() {
        return author;
    }

    public void setScore(float score) {
        this.score = score;
    }

    public float getScore() {
        return score;
    }

    /** Renders all fields so that logged results are readable. */
    @Override
    public String toString() {
        return "QueryResult{cid=" + cid
                + ", res=" + res
                + ", name=" + name
                + ", title=" + title
                + ", author=" + author
                + ", score=" + score + "}";
    }

}

3.利用IK分詞器

IK分詞器對中文分詞的支持比較好，而且可以自行擴展字典。

自定義字典配置

/**
 * IK Analyzer configuration that adds a custom extension dictionary on top of
 * the analyzer's default dictionaries.
 */
public class MyConfig implements Configuration {

    /** Default main dictionary path of the analyzer. */
    private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";

    /** Default quantifier dictionary path of the analyzer. */
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";

    /** Whether the "smart" segmentation mode is enabled. */
    private boolean useSmart;

    /**
     * Reports the segmentation mode.
     *
     * @return {@code true} for the smart segmentation strategy,
     *         {@code false} for fine-grained segmentation
     */
    public boolean useSmart() {
        return this.useSmart;
    }

    /**
     * Selects the segmentation mode.
     *
     * @param useSmart {@code true} for the smart segmentation strategy,
     *                 {@code false} for fine-grained segmentation
     */
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Returns the main dictionary path.
     *
     * @return the main dictionary path
     */
    public String getMainDictionary() {
        return PATH_DIC_MAIN;
    }

    /**
     * Returns the quantifier dictionary path.
     *
     * @return the quantifier dictionary path
     */
    public String getQuantifierDicionary() {
        return PATH_DIC_QUANTIFIER;
    }

    /**
     * Returns the extension dictionary paths.
     *
     * <p>Author's note: IK resolves this file through the ClassLoader, so the
     * custom dictionary is best placed alongside this class; otherwise the
     * extension dictionary does not take effect.
     *
     * @return paths relative to the class loader
     */
    public List<String> getExtDictionarys() {
        List<String> dictionaryPaths = new ArrayList<String>(2);
        dictionaryPaths.add(Lucene.IKAnalyzer.myDicPath);
        return dictionaryPaths;
    }

    /**
     * Returns the extension stop-word dictionary paths.
     *
     * @return always {@code null}; no extension stop-word dictionary is configured
     */
    @Override
    public List<String> getExtStopWordDictionarys() {
        return null;
    }

}

/**
 * Entry point for loading the custom IK dictionary configuration.
 */
public class InitMyDictionary {

    /**
     * Initializes the IK {@code Dictionary} with a fresh {@code MyConfig}.
     */
    public static void init() {
        MyConfig customConfig = new MyConfig();
        Dictionary.initial(customConfig);
    }

}