lucene建立索引的幾種方式(一)

時間 2019-11-17

標籤 lucene 建立索引幾種方式简体版

原文原文鏈接

什麼是索引：

根據你輸入的值去找，這個值就是索引。

第一種建立索引的方式：

根據文件來生成索引，如後綴爲.txt等的文件數組

步驟：

第一步：FSDirectory.open(Paths.get(url)); 根據路徑獲取存儲索引的目錄。

FSDirectory：表示對文件系統目錄的操作。RAMDirectory：表示對內存中目錄的操作。

Paths 爲 NIO(new io) 的一個類；Path 類是 java.io.File 類的升級版，File file = new File("index.html")，而 Path path = Paths.get("index.html")；因爲 Path 類基於字符串建立，所以它引用的資源也有可能不存在。

關於 NIO：傳統的 IO 流都是通過字節的移動來處理的，也就是說輸入/輸出流一次只能處理一個字節，所以面向流的輸入/輸出系統一般效率不高；因此引進了新 IO(new IO)。NIO 採用內存映射文件的方式來處理輸入/輸出：NIO 將文件或文件的一段區域映射到內存中，這樣就能夠像訪問內存一樣來訪問文件了(這種方式模擬了操作系統上的虛擬內存的概念)，因此 NIO 的效率很高。

第二步：new IndexWriter(Directory, IndexWriterConfig) 建立索引

第三步：索引指定目錄的文件

第四步：將文件寫入lucene中的文檔(Document)

package com.wp.util;

import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Indexer {

    private IndexWriter writer; // 寫索引實例

    /**
     * 構造方法 實例化IndexWriter
     * 
     * @param indexDir
     * @throws Exception
     */
    public Indexer(String indexDir) throws Exception {
        Directory dir = FSDirectory.open(Paths.get(indexDir));// 根據路徑獲取存儲索引的目錄
        Analyzer analyzer = new StandardAnalyzer(); // 這裏用了多態，StandardAnalyzer是標準分詞器，Analyzer是一個分詞器

 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); writer = new IndexWriter(dir, iwc); } /** * 關閉寫索引 * * @throws Exception */ public void close() throws Exception { writer.close(); } /** * 索引指定目錄的全部文件 * * @param dataDir * @throws Exception */ public int index(String dataDir) throws Exception { File[] files = new File(dataDir).listFiles(); for (File f : files) { indexFile(f); } return writer.numDocs(); } /** * 索引指定文件 * * @param f */ private void indexFile(File f) throws Exception { // 關於f.getCanonicalPath()查看http://www.blogjava.net/dreamstone/archive/2007/08/08/134968.html System.out.println("索引文件：" + f.getCanonicalPath()); Document doc = getDocument(f); writer.addDocument(doc); } /** * 獲取文檔，文檔裏再設置每一個字段 * * @param f */ private Document getDocument(File f) throws Exception { Document doc = new Document(); doc.add(new TextField("contents", new FileReader(f))); doc.add(new TextField("fileName", f.getName(), Field.Store.YES)); doc .add(new TextField("fullPath", f.getCanonicalPath(), Field.Store.YES)); return doc; } public static void main(String[] args) { String indexDir = "D:\\lucene4"; String dataDir = "D:\\lucene4\\data"; Indexer indexer = null; int numIndexed = 0; long start = System.currentTimeMillis(); try { indexer = new Indexer(indexDir); numIndexed = indexer.index(dataDir); } catch (Exception e) { e.printStackTrace(); } finally { try { indexer.close(); } catch (Exception e) { e.printStackTrace(); } } long end = System.currentTimeMillis(); System.out.println("索引：" + numIndexed + " 個文件 花費了" + (end - start) + " 毫秒"); } }

第二種建立索引的方式：

根據字段來生成索引，我用的是數組

第一步：建立索引

第二步：將字段添加到文檔中

package com.wp.util;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Before;
import org.junit.Test;

public class IndexIngTest {

    private String ids[] = { "1", "2", "3" };
    private String citys[] = { "qingdao", "nanjing", "shanghai" };
    private String descs[] = { "Qingdao is a beautiful city.",
            "Nanjing is a city of culture.", "Shanghai is a bustling city." };

    private Directory dir;// 目錄

    /**
     * 獲取IndexWriter實例
     * 
     * @return
     * @throws Exception
     */
    private IndexWriter getWriter() throws Exception {
        Analyzer analyzer = new StandardAnalyzer(); // 標準分詞器
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(dir, iwc);
        return writer;
    }

    /**
     * 添加文檔
     * 
     * @throws Exception
     */
    @Before
    public void setUp() throws Exception {
        dir = FSDirectory.open(Paths.get("D:\\lucene\\luceneIndex"));// 獲得luceneIndex目錄
        IndexWriter writer = getWriter();// 獲得索引
        for (int i = 0; i < ids.length; i++) {
            Document doc = new Document();// 建立文檔
            doc.add(new StringField("id", ids[i], Field.Store.YES));// 將id屬性存入內存中
            doc.add(new StringField("city", citys[i], Field.Store.YES));
            doc.add(new TextField("desc", descs[i], Field.Store.NO));
            writer.addDocument(doc); // 添加文檔
        }
        writer.close();
    }

    /**
     * 測試寫了幾個文檔
     * 
     * @throws Exception
     */
    @Test
    public void testIndexWriter() throws Exception {
        IndexWriter writer = getWriter();
        System.out.println("寫入了" + writer.numDocs() + "個文檔");
        writer.close();
    }

    /**
     * 測試讀取文檔
     * 
     * @throws Exception
     */
    @Test
    public void testIndexReader() throws Exception {
        IndexReader reader = DirectoryReader.open(dir);
        System.out.println("最大文檔數：" + reader.maxDoc());
        System.out.println("實際文檔數：" + reader.numDocs());
        reader.close();
    }

    /**
     * 測試刪除 在合併前
     * 
     * @throws Exception
     */
    @Test
    public void testDeleteBeforeMerge() throws Exception {
        IndexWriter writer = getWriter();
        System.out.println("刪除前：" + writer.numDocs());
        writer.deleteDocuments(new Term("id", "1"));// term：根據id找到爲1的
        writer.commit();
        System.out.println("writer.maxDoc()：" + writer.maxDoc());
        System.out.println("writer.numDocs()：" + writer.numDocs());
        writer.close();
    }

    /**
     * 測試刪除 在合併後
     * 
     * @throws Exception
     */
    @Test
    public void testDeleteAfterMerge() throws Exception {
        IndexWriter writer = getWriter();
        System.out.println("刪除前：" + writer.numDocs());
        writer.deleteDocuments(new Term("id", "1"));
        writer.forceMergeDeletes(); // 強制刪除
        writer.commit();
        System.out.println("writer.maxDoc()：" + writer.maxDoc());
        System.out.println("writer.numDocs()：" + writer.numDocs());
        writer.close();
    }

    /**
     * 測試更新
     * 
     * @throws Exception
     */
    @Test
    public void testUpdate() throws Exception {
        IndexWriter writer = getWriter();
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        doc.add(new StringField("city", "qingdao", Field.Store.YES));
        doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));
        writer.updateDocument(new Term("id", "1"), doc);
        writer.close();
    }
}

生成的索引文件如下：

關於索引的搜索：

這裏有一個要注意的地方：一定要先建立出索引後才能進行查找，否則會報如下異常：

org.apache.lucene.index.IndexNotFoundException:

no segments* file found in MMapDirectory@D:\lucene lockFactory=org.apache.lucene.store.NativeFSLockFactory@753f67a9: files: [data, lucene-5.3.1, lucene-5.3.1.zip]

package com.wp.lucene;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Runs a query against the "contents" field of an existing Lucene index and
 * prints the stored path of each matching document.
 */
public class Searcher {

    /**
     * Searches the index in {@code indexDir} and prints the hit count, the
     * elapsed time and the "fullPath" field of up to ten top hits.
     *
     * @param indexDir path of the directory that holds the index
     * @param q query string, parsed with the classic query parser against the
     *          "contents" field
     * @throws Exception if the index cannot be opened, the query cannot be
     *         parsed or the search fails
     */
    public static void search(String indexDir, String q) throws Exception {
        // try-with-resources releases the reader and the directory even when
        // parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
                IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher is = new IndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer();
            // "contents" is the default field searched when the query names no
            // field; it is the field Indexer fills with the file content.
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(q);
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10); // at most the ten best hits
            long end = System.currentTimeMillis();
            System.out.println("匹配 " + q + " ，總共花費" + (end - start) + "毫秒" + "查詢到"
                    + hits.totalHits + "個記錄");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                // Load the stored fields of the hit by its document number.
                Document doc = is.doc(scoreDoc.doc);
                System.out.println(doc.get("fullPath"));
            }
        }
    }

    /**
     * Runs a sample query. The index must already exist, so run Indexer
     * before running this method.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Must be the directory Indexer writes to; pointing at a directory
        // without an index fails with IndexNotFoundException.
        String indexDir = "D:\\lucene4";
        String q = "ADD";
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}