最近項目要用文本檢索工具,就瞭解到了 Lucene。我對 Lucene 查了不少資料,瞭解到 Lucene 的版本比較多,且每一個版本裏面的方法變更比較大。最後確定使用 Lucene 4.7.2 版本。理論就不多說了,網上有不少介紹理論的文章,我就直接把能用的代碼介紹一下。
1.首先創建索引工具
public static void indexBuilding(String indexPath, List<Book> datas){// indexPath表示索引存放的目錄 datas表示數據(能夠從文本里、表裏等取到數據)
try {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
indexWriterConfig.setOpenMode(OpenMode.CREATE);//已建立模式創建索引
IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(indexPath)), indexWriterConfig);
for (Book bood: datas) {
indexWriter.addDocument(Document(wotvMediaInfoModel));//寫入索引
}
indexWriter.close();
} catch (Exception e) {
e.printStackTrace();
}測試
}ui
public static Document Document(Book book) {
Document doc = new Document();
doc.add(new StringField("name", book.getName(), Field.Store.YES));//name建立索引,並保存
doc.add(new StringField("title", book.getTitle(), Field.Store.No));//title建立索引,不保存
doc.add(new TextField("name", book.getAuthor(), Field.Store.YES));//author建立索引,保存
return doc;
}this
/**
 * Data holder for one record to be indexed.
 */
public class Book implements Serializable {
    private static final long serialVersionUID = 1L;

    private int id;
    private String name;
    private String title;
    private String author;

    /** Creates an empty book; populate it through the setters. */
    public Book() {
    }

    public void setId(int id) {
        this.id = id;
    }

    public int getId() {
        return id;
    }

    // The accessors below were elided in the original listing; they are
    // written out for the remaining declared fields.

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return title;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getAuthor() {
        return author;
    }
}
2.創建完索引,就可以進行查詢了
/**
 * Searches the "name" field for the given keyword.
 *
 * The keyword is first tokenized with {@code IK_Analyzer}; the resulting token
 * string is then handed to {@code queryByOneKey}.
 *
 * @param keyword raw user input
 * @return the matching results, or {@code null} when tokenization yields
 *         nothing, when no result is found, or when any exception occurs
 */
public List<QueryResult> query(String keyword) {
    try {
        long begin = System.currentTimeMillis();
        IndexSearcher searcher = initIndexSearch(); // obtain the IndexSearcher
        String tokens = IK_Analyzer(keyword);       // tokenize with the IK analyzer
        if (StringUtil.isEmpty(tokens)) {
            // tokenization produced nothing usable
            return null;
        }
        List<QueryResult> hits = queryByOneKey(searcher, "name", tokens);
        if (CollectionUtil.isEmpty(hits)) {
            return null;
        }
        long elapsed = System.currentTimeMillis() - begin;
        logger.info("queryByOneField 耗時:" + elapsed + "ms");
        for (QueryResult hit : hits) {
            logger.info("queryByOneField查詢結果:" + hit.toString());
        }
        return hits;
    } catch (Exception e) {
        logger.error("queryByOneField error", e);
        return null;
    }
}
/**
 * Opens the index and returns the {@code IndexSearcher}, creating it on first
 * use and caching it in the {@code indexSearcher} field afterwards.
 *
 * No synchronization is performed here.
 *
 * @return the cached searcher, or {@code null} if the index cannot be opened
 */
protected static IndexSearcher initIndexSearch() {
    if (indexSearcher != null) {
        return indexSearcher;
    }
    try {
        DirectoryReader directoryReader =
                DirectoryReader.open(FSDirectory.open(new File(Lucene.Path.indexFilePath)));
        indexSearcher = new IndexSearcher(directoryReader);
        return indexSearcher;
    } catch (IOException e) {
        // Log once, under this method's own name, with the exception attached.
        logger.error("initIndexSearch error", e);
        return null;
    }
}
/**
 * Tokenizes a string with the IK analyzer.
 *
 * Only tokens whose offset span is greater than one are kept; each kept token
 * is followed by a single space in the returned string.
 *
 * @param str text to tokenize; {@code null} is treated as empty input
 * @return the kept tokens joined by (and ending with) a space, or an empty
 *         string if there are none
 */
protected static String IK_Analyzer(String str) {
    if (str == null) {
        return "";
    }
    StringBuilder results = new StringBuilder();
    Analyzer ikAnalyzer = new IKAnalyzer();
    try {
        TokenStream tokenStream = ikAnalyzer.tokenStream("", new StringReader(str));
        try {
            // addAttribute returns the existing instance or registers a new one,
            // so it is safe whether or not the tokenizer already declared it.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                int span = offsetAttribute.endOffset() - offsetAttribute.startOffset();
                if (span > 1) {
                    results.append(charTermAttribute.toString()).append(' ');
                }
            }
            tokenStream.end(); // required by the TokenStream consumer workflow
        } finally {
            tokenStream.close();
        }
    } catch (IOException e) {
        logger.error("IK_Analyzer error", e);
    } finally {
        ikAnalyzer.close();
    }
    logger.info("IK_Analyzer字典分詞結果:[" + results + "]");
    return results.toString();
}
/**
 * Runs a query against a single field and returns the best matches.
 *
 * @param indexSearcher searcher over the index
 * @param field         default field the query terms apply to
 * @param keys          query string (terms separated by whitespace)
 * @return up to three best-scoring results, or {@code null} if an
 *         {@code IOException} occurs while searching
 * @throws ParseException if {@code keys} is not a valid query string
 */
protected static List<QueryResult> queryByOneKey(IndexSearcher indexSearcher, String field, String keys)
        throws ParseException {
    final int maxHits = 3; // number of best-matching records to fetch
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    try {
        long start = System.currentTimeMillis();
        QueryParser queryParser = new QueryParser(Version.LUCENE_47, field, analyzer);
        // Operator.AND requires every term to match; Operator.OR requires any term.
        queryParser.setDefaultOperator(Operator.OR);
        Query query = queryParser.parse(keys);
        TopDocs topDocs = indexSearcher.search(query, maxHits);
        long elapsed = System.currentTimeMillis() - start;
        logger.info("queryByOneKey 總共爲您找到 " + topDocs.totalHits + "條記錄\t耗時:" + elapsed
                + "ms");
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<QueryResult> list = new ArrayList<QueryResult>(scoreDocs.length);
        for (ScoreDoc scoreDoc : scoreDocs) {
            list.add(getIndexResult(indexSearcher.doc(scoreDoc.doc), scoreDoc.score));
        }
        return list;
    } catch (IOException e) {
        // Log once with the exception attached.
        logger.error("queryByOneKey error", e);
    } finally {
        analyzer.close();
    }
    return null;
}
/**
 * Assembles a {@code QueryResult} from a matched document and its score.
 *
 * The stored fields "cid", "res" and "name" are read from the document.
 */
private static QueryResult getIndexResult(Document doc, float score) {
    String cid = doc.get("cid");
    String res = doc.get("res");
    String name = doc.get("name");

    QueryResult hit = new QueryResult();
    hit.setCid(cid);
    hit.setRes(res);
    hit.setName(name);
    hit.setScore(score);
    return hit;
}
/**
 * Holder for one search hit.
 *
 * Declares every property referenced by its own accessors and by
 * {@code getIndexResult} (cid, res, name, title, author, score).
 */
public class QueryResult {
    private String cid;
    private String res;
    private String name;
    private String title;
    private String author;
    private float score;

    public void setCid(String cid) {
        this.cid = cid;
    }

    public String getCid() {
        return cid;
    }

    public void setRes(String res) {
        this.res = res;
    }

    public String getRes() {
        return res;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return title;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getAuthor() {
        return author;
    }

    public void setScore(float score) {
        this.score = score;
    }

    public float getScore() {
        return score;
    }

    /** Readable form, so logging a result shows its contents. */
    @Override
    public String toString() {
        return "QueryResult{cid=" + cid + ", res=" + res + ", name=" + name + ", title=" + title
                + ", author=" + author + ", score=" + score + "}";
    }
}
3.利用IK分詞器
IK分詞器對中文分詞的支持比較好,而且可以自己擴展字典
自定義字典配置
/**
 * Custom IK Analyzer configuration that registers an extension dictionary.
 */
public class MyConfig implements Configuration {
    /*
     * Default dictionary paths used by the analyzer.
     */
    private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";

    /*
     * Whether to segment in "smart" mode.
     */
    private boolean useSmart;

    /**
     * Returns the useSmart flag: {@code true} selects the smart segmentation
     * strategy, {@code false} selects fine-grained segmentation.
     *
     * @return useSmart
     */
    @Override
    public boolean useSmart() {
        return useSmart;
    }

    /**
     * Sets the useSmart flag: {@code true} selects the smart segmentation
     * strategy, {@code false} selects fine-grained segmentation.
     *
     * @param useSmart the new flag value
     */
    @Override
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Returns the main dictionary path.
     *
     * @return main dictionary path
     */
    @Override
    public String getMainDictionary() {
        return PATH_DIC_MAIN;
    }

    /**
     * Returns the quantifier dictionary path.
     *
     * @return quantifier dictionary path
     */
    @Override
    public String getQuantifierDicionary() {
        return PATH_DIC_QUANTIFIER;
    }

    /**
     * Returns the extension dictionary paths.
     *
     * @return paths relative to the class loader
     */
    @Override
    public List<String> getExtDictionarys() {
        List<String> extDictFiles = new ArrayList<String>(1);
        // Path of the user-defined dictionary. IK resolves this file through the
        // ClassLoader, so (per the original author) it is best kept alongside this
        // class; otherwise the extension dictionary may not take effect.
        String extDictCfg = Lucene.IKAnalyzer.myDicPath;
        extDictFiles.add(extDictCfg);
        return extDictFiles;
    }

    /**
     * Returns the extension stop-word dictionary paths.
     *
     * @return an empty list; no extension stop-word dictionary is configured
     */
    @Override
    public List<String> getExtStopWordDictionarys() {
        return new ArrayList<String>(0);
    }
}
/**
 * Entry point for loading the custom dictionary configuration.
 */
public class InitMyDictionary {
    /**
     * Initializes the IK dictionary with the custom {@code MyConfig} settings.
     */
    public static void init() {
        MyConfig customConfig = new MyConfig();
        Dictionary.initial(customConfig);
    }
}
/**
 * Small driver that exercises the search flow.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        // Build the index first when needed:
        // indexBuilding(<index directory>, <data>);

        // Load the extension dictionary.
        InitMyDictionary.init();

        // Run a query.
        String keyword = "水滸傳";
        query(keyword);
    }
}
以上就是完整的可運行代碼。要想使用,需要先準備好 Lucene 和 IK Analyzer 的 jar 包,然後把自己需要的數據拿到,再把上面創建索引和查詢的方法自己組織一下,就可以測試使用了。
以上就是我所瞭解的 Lucene,還有很多功能沒有瞭解到,但是目前已經夠項目使用了。