初識lucene5.x

時間 2019-11-17

標籤 lucene5.x lucene 简体版

原文原文鏈接

1、Lucene簡介

Lucene 是一個基於 Java 的全文信息檢索工具包，它不是一個完整的搜索應用程序，而是爲你的應用程序提供索引和搜索功能。Lucene 目前是 Apache Jakarta 家族中的一個開源項目，也是目前最爲流行的基於 Java 的開源全文檢索工具包。

目前已經有不少應用程序的搜索功能是基於 Lucene 的，例如 Eclipse 幫助系統的搜索功能。Lucene 可以爲文本類型的數據創建索引，因此你只要能把要索引的數據轉化爲文本格式，Lucene 就能對你的文檔進行索引和搜索。例如你要對一些 HTML 文檔、PDF 文檔進行索引的話，就首先需要把 HTML 文檔和 PDF 文檔轉化成文本格式，然後將轉化後的內容交給 Lucene 進行索引，再把建立好的索引文件保存到磁盤或者內存中，最後根據用戶輸入的查詢條件在索引文件上進行查詢。不限定要索引的文檔的格式，也使 Lucene 幾乎可以適用於所有的搜索應用程序。

2、代碼實現

package com.demo.index;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.util.logging.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;

public class LuceneIndex {

    private static Logger logger = Logger.getLogger(LuceneIndex.class.getName());
    //IndexWriter 索引文件的segment進行寫、合併、優化
    IndexWriter indexWriter;
    //將要存儲索引的目錄
    Directory dir;

    public LuceneIndex(){
        try{
            IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            this.dir = new RAMDirectory();
            this.indexWriter = new IndexWriter(dir,config);
        }catch (Exception e){
            e.printStackTrace();
        }
    }

    //索引保存於內存中
    public LuceneIndex(String dataUrl){
        try{
            IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            this.dir = new RAMDirectory();
            this.indexWriter  = new IndexWriter(this.dir,config);
            //根據傳入的URL添加索引
            addDocument(dataUrl);
            closeWriter();
        }catch (Exception e){
            e.printStackTrace();
        }
    }
    
    //索引保存於指定目錄
    public LuceneIndex(String IndexUrl,String dataUrl){
        try{
            IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            this.dir = FSDirectory.open(FileSystems.getDefault().getPath(IndexUrl));
            this.indexWriter = new IndexWriter(dir,config);
            //根據傳入的URL添加索引
            addDocument(dataUrl);
            closeWriter();
        }catch (Exception e){
            e.printStackTrace();
        }
    }

    public void closeWriter(){
        try {
            this.indexWriter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * 根據提供的URL添加索引,但是文檔或者目錄
     * @param url 文件或者目錄
     */
    public void addDocument(String url){
        try{
            File file = new File(url);
            if(file.exists()){
                if(file.isFile()){
                    Document document = getDocument(file);
                    this.indexWriter.addDocument(document);
                }else if(file.isDirectory()){
                    //File[] dataFiles = new File("/Users/wangjing/Documents/lucene/data/").listFiles();
                    File[] dataFiles = file.listFiles();
                    if(dataFiles != null){
                        for(File f : dataFiles) {
                            if (f.isFile() && f.getName().endsWith(".txt")) {
                                System.out.println("正在創建索引：" + f.getCanonicalPath());
                                //生成Document
                                Document document = getDocument(f);
                                this.indexWriter.addDocument(document);
                            }
                        }
                    }else{
                        logger.info("找不到目錄");
                    }
                }
                this.indexWriter.commit();
            }else{
                logger.info("找不到文件或者目錄");
            }
        }catch (Exception e){
            e.printStackTrace();
        }
    }

    /**
     * 根據file生成Document
     * @param file 文件
     * @return 文檔結構
     */
    public Document getDocument(File file){
        Document document = new Document();
        //document.add(new StringField("path", dataFiles[i].getAbsolutePath(),Store.YES));
        document.add(new TextField("path", file.getAbsolutePath(),Store.YES));
        document.add(new TextField("contents", getFileContent(file),Store.YES));
        return document;
    }


    /**
     * 獲取文件內容
     * @param file 文件
     * @return  文件內容
     */
    public static String getFileContent(File file){
        String result = "";
        try{
            BufferedReader br = new BufferedReader(new FileReader(file));//構造一個BufferedReader類來讀取文件
            String s;
            while((s = br.readLine())!=null){//使用readLine方法，一次讀一行
                result = result+s;
            }
            br.close();
        }catch(Exception e){
            e.printStackTrace();
        }
        return result;
    }

    /**
     * 根據字段以及值進行查找
     * @param field        字段
     * @param queryString   值
     */
    public void search(String field , String queryString){
        try{
            IndexReader reader = DirectoryReader.open(this.dir);
            IndexSearcher searcher = new IndexSearcher(reader);

//            Term term = new Term(field,queryString);
//            TermQuery luceneQuery = new TermQuery(term);
            Query query = new QueryParser(field,new StandardAnalyzer()).parse(queryString);
            TopDocs result = searcher.search(query,10);
            ScoreDoc[] hits = result.scoreDocs;
            System.out.println("找到" + hits.length + "條記錄");
            for(int i=0;i<hits.length;++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("path") + "\t" + d.get("contents"));
            }
        }catch (Exception e){
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        LuceneIndex index = new LuceneIndex("/Users/wangjing/Documents/lucene/data1/");
        index.search("contents","國");
    }
}

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。