Lucene是Apache軟件基金會Jakarta項目組的一個子項目,是一個開放源代碼的全文檢索引擎工具包,即它不是一個完整的全文檢索引擎,而是一個全文檢索引擎的架構,提供了完整的查詢引擎和索引引擎,部分文本分析引擎(英文與德文兩種西方語言)。Lucene的目的是爲軟件開發人員提供一個簡單易用的工具包,以方便地在目標系統中實現全文檢索的功能,或者是以此爲基礎建立起完整的全文檢索引擎。(摘自百度百科)
操作系統:CentOS 5.8
開發環境:Eclipse 4.3
構建工具:Maven(POM modelVersion 4.0.0)
爲了能夠按照書中的例子進行學習,這裏依賴的Lucene版本是3.0.1
<!-- Lucene core 3.0.1, the version used by the book's examples. -->
<dependencies>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>3.0.1</version>
    </dependency>
</dependencies>
完整配置:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.linjl.study.book</groupId>
    <artifactId>book_luceneInAction</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <!--
          The sources contain non-ASCII text (comments and string literals), so pin the
          source encoding instead of relying on the platform default charset.
        -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <build>
        <!-- Sources live directly under src/ rather than Maven's default src/main/java. -->
        <sourceDirectory>src</sourceDirectory>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <!--
                      NOTE(review): source/target are left empty as in the original; presumably
                      the plugin falls back to its defaults. Consider setting them explicitly.
                    -->
                    <source />
                    <target />
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- Lucene core 3.0.1, the version used by the book's examples. -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>3.0.1</version>
        </dependency>
    </dependencies>
</project>
下面將用兩個例子進行Lucene入門講解
案例一主要展示通過對指定路徑下 .txt 文件建立索引的過程
完整源碼:
package com.linjl.study.book.luceneInAction.chapter1; import java.io.File; import java.io.FileFilter; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class Indexer { private IndexWriter indexWriter; public Indexer(String indexDir) throws IOException { //步驟一:建立 Directory Directory dir = FSDirectory.open(new File(indexDir)); //步驟二:建立 IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer( Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED); } public void close() throws CorruptIndexException, IOException { //步驟五:關閉IndexWriter indexWriter.close(); } public int index(String dataDir, FileFilter fileFilter) throws IOException { File[] files = new File(dataDir).listFiles(); for (File file : files) { if (!file.isDirectory() && !file.isHidden() && file.exists() && file.canRead() && (fileFilter == null || fileFilter.accept(file))) { indexFile(file); } } return indexWriter.numDocs(); } private void indexFile(File file) throws IOException { System.out.println("Indexing " + file.getCanonicalPath()); //步驟三:建立Document對象 Document doc = getDocument(file); //步驟四:添加Document indexWriter.addDocument(doc); } protected Document getDocument(File file) throws IOException { Document doc = new Document(); doc.add(new Field("contents", new FileReader(file))); doc.add(new Field("filename", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("fullpath", file.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); return doc; } private static class TextFilesFilter implements FileFilter { public boolean accept(File pathname) { return 
pathname.getName().toLowerCase().endsWith(".txt"); } } public static void main(String[] strs) throws IOException { //存放索引的位置(linux環境下路徑) String indexDir = "/opt/test/lucene/index"; //存放待索引文件的位置(linux環境下路徑) String dataDir = "/opt/test/lucene/files"; long startTime = System.currentTimeMillis(); Indexer indexer = new Indexer(indexDir); int numIndexed; try { numIndexed = indexer.index(dataDir, new TextFilesFilter()); } finally { indexer.close(); } long endTime = System.currentTimeMillis(); System.out.println("Indexing " + numIndexed + " files took " + (endTime - startTime) + "ms"); } }
案例二展示如何通過對指定的索引文件夾進行關鍵詞檢索
完整源碼:
package com.linjl.study.book.luceneInAction.chapter1; import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class Searcher { public static void search(String indexDir, String searchWord) throws IOException, ParseException { //步驟一:建立Directory Directory dir = FSDirectory.open(new File(indexDir)); //步驟二:建立IndexSearcher IndexSearcher indexSearcher = new IndexSearcher(dir); //步驟三:建立QueryParser QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30)); long startTime = System.currentTimeMillis(); //步驟四:解析生成查詢對象 Query query = parser.parse(searchWord); //步驟五:查詢並獲取查詢結果(只是獲取到查詢結果的引用) TopDocs hits = indexSearcher.search(query, 30); long endTime = System.currentTimeMillis(); System.out.println("Found " + hits.totalHits + "document(s) (in " + (endTime - startTime) + "ms) that matched query '" + searchWord + "':"); for (ScoreDoc scoreDoc : hits.scoreDocs) { //步驟六:根據引用生成查詢結果 Document doc = indexSearcher.doc(scoreDoc.doc); System.out.println(doc.get("fullpath")); } //步驟七:關閉IndexSearcher indexSearcher.close(); } public static void main(String[] args) throws IOException, ParseException { String indexDir = "/opt/test/lucene/index"; String searchWord = "牀"; Searcher.search(indexDir, searchWord); }
// Minimal search snippet: look up the exact term "lucene" in the "contents" field
// of the index at /tmp/index and request up to 10 hits.
Directory dir = FSDirectory.open(new File("/tmp/index"));
IndexSearcher searcher = new IndexSearcher(dir);
Query q = new TermQuery(new Term("contents", "lucene"));
TopDocs hits;
try {
    hits = searcher.search(q, 10);
} finally {
    // Close the searcher even if the search throws.
    searcher.close();
}
本文主要是《Lucene in Action》第一章的內容,通過2個例子,對Lucene有了最初的認識,並瞭解了它的基本使用方法。
(全文完 linjl 20130904 深圳)