Java Lucene入門

時間 2019-12-10

標籤 java lucene 入門欄目 Java 简体版

原文原文鏈接

一、lucene版本：7.2.1

pom文件：

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Lucene demo: Spring Boot web starter plus the three Lucene modules used by the sample code. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.example</groupId>
    <artifactId>demo-lucene</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>war</packaging>

    <name>demo-lucene</name>
    <description>Demo project for Spring Boot</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.0.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
        <!-- Single source of truth for the Lucene version: all Lucene modules must share the same version. -->
        <lucene.version>7.2.1</lucene.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!-- Lucene core library -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <!-- Lucene query parser library -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <!-- Lucene additional (common) analyzers library -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>


</project>

二、代碼如下：

package com.example.demo;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;

/**
 * Minimal Lucene walk-through: rebuilds a three-document index on disk, then runs one
 * {@link TermQuery} against the {@code content} field and prints every hit to standard output.
 *
 * <p>Created by 1 on 2018/11/20.
 */
public class DemoLucene {

    /** Index directory used when no path is passed on the command line. */
    private static final String DEFAULT_INDEX_PATH = "D:\\lucene\\index";

    /**
     * Builds the sample index and then executes the sample query against it.
     *
     * @param args optional; {@code args[0]}, when present, overrides the default index directory
     * @throws IOException if the index cannot be written to or read from disk
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : DEFAULT_INDEX_PATH;

        // try-with-resources guarantees the directory is released even when indexing or searching fails.
        try (Directory directory = FSDirectory.open(Paths.get(path))) {
            buildIndex(directory);
            searchAndPrint(directory);
        }
    }

    /** Writes the three sample documents, replacing whatever index already exists in {@code directory}. */
    private static void buildIndex(Directory directory) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            // CREATE starts from an empty index on every run, so repeated runs do not accumulate duplicates.
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            // The writer must be closed (which commits) before a reader is opened on the same directory.
            try (IndexWriter indexWriter = new IndexWriter(directory, config)) {
                indexWriter.addDocument(newStudent("張三", "1001", "中心小學的張三是個喜歡學習的學生"));
                indexWriter.addDocument(newStudent("李四", "1002", "中心小學的李四是個期末考試成績很好的學生"));
                indexWriter.addDocument(newStudent("王五", "1003", "南寧市中心小學的王五在班級裏是個班長"));
            }
        }
    }

    /** Creates one stored document: {@code name} and {@code no} as StringField, {@code content} as TextField. */
    private static Document newStudent(String name, String no, String content) {
        Document document = new Document();
        document.add(new StringField("name", name, Field.Store.YES));
        document.add(new StringField("no", no, Field.Store.YES));
        document.add(new TextField("content", content, Field.Store.YES));
        return document;
    }

    /** Runs the sample term query (at most 100 hits) and prints the stored fields of each hit. */
    private static void searchAndPrint(Directory directory) throws IOException {
        try (DirectoryReader directoryReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
            // TermQuery looks the text up as one exact term; the query text itself is not analyzed.
            Query query = new TermQuery(new Term("content", "班長"));
            System.out.println("查詢語句 = " + query);
            TopDocs topDocs = indexSearcher.search(query, 100);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            if (scoreDocs.length == 0) {
                System.out.println("未找到數據");
                return;
            }
            for (int i = 0; i < scoreDocs.length; i++) {
                Document docResult = indexSearcher.doc(scoreDocs[i].doc);
                System.out.println(String.format("======================  第%d條  ======================", i + 1));
                System.out.println("name：" + docResult.get("name"));
                System.out.println("no：" + docResult.get("no"));
                System.out.println("content：" + docResult.get("content"));
            }
        }
    }

}

一、Directory是個抽象類，其子類BaseDirectory也是抽象類。

　　BaseDirectory的子類：

　　　　RAMDirectory（普通類）：把索引建立到內存

　　　　FSDirectory（抽象類）：把索引建立到磁盤

本文使用FSDirectory。通過FSDirectory的靜態方法open可以建立一個目錄。看看open方法：

FSDirectory.open方法返回FSDirectory抽象類的其中一個子類，具體返回哪個取決於運行環境：在64位JRE（且支持unmap）上會返回一個MMapDirectory，否則在Windows上返回SimpleFSDirectory，其餘情況返回NIOFSDirectory。

三、Lucene常用查詢的Query抽象類

一、抽象類Query有很多子類（如上圖），這裏只列舉常用的類。

　　一、PhraseQuery類：用於短語查詢（要求多個詞條按順序相鄰出現，例如英文短語），用法如下。

        // Assemble the phrase "tom is student" on the "content" field, one term per word, in order.
        PhraseQuery.Builder phraseBuilder = new PhraseQuery.Builder();
        for (String word : new String[] {"tom", "is", "student"}) {
            phraseBuilder.add(new Term("content", word));
        }
        // Build once and reuse the same query object for logging and for searching.
        PhraseQuery phraseQuery = phraseBuilder.build();
        System.out.println("查詢語句 = " + phraseQuery);
        TopDocs topDocs = indexSearcher.search(phraseQuery, 100);

　　二、WildcardQuery類：用於通配符查詢，用法如下。

        // Wildcard pattern on the "content" field: '*' stands for any run of characters after the prefix.
        Term wildcardTerm = new Term("content", "中心小學*");
        WildcardQuery wildcardQuery = new WildcardQuery(wildcardTerm);
        System.out.println("查詢語句 = " + wildcardQuery);
        // Retrieve at most 100 hits.
        TopDocs topDocs = indexSearcher.search(wildcardQuery, 100);

　　三、FuzzyQuery類：用於模糊查詢，用法如下：

        // Name the two tuning knobs so the constructor call is self-explanatory.
        int maxEdits = 2;      // maximum edit distance
        int prefixLength = 3;  // number of leading characters that must match
        Term fuzzyTerm = new Term("name", "王健林");
        FuzzyQuery fuzzyQuery = new FuzzyQuery(fuzzyTerm, maxEdits, prefixLength);
        System.out.println("查詢語句 = " + fuzzyQuery);
        TopDocs topDocs = indexSearcher.search(fuzzyQuery, 100);

　　　　這裏的FuzzyQuery使用了3個參數的構造函數：

  /**
   * Three-argument convenience constructor: forwards {@code term}, {@code maxEdits} and
   * {@code prefixLength} to the five-argument constructor, supplying
   * {@code defaultMaxExpansions} and {@code defaultTranspositions} for the remaining two arguments.
   */
  public FuzzyQuery(Term term, int maxEdits, int prefixLength) {
    this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions);
  }

　　　　maxEdits：最大編輯距離（該參數範圍是0-2，默認值是2，編輯指：新增一個字符、修改一個字符、刪除一個字符；默認情況下交換相鄰的兩個字符也算一次編輯）

　　　　　　舉個例子，假設模糊查詢的詞是「王健林」，maxEdits=2（這裏假設prefixLength=0，即不要求前綴匹配），那麼將匹配下列內容：

　　　　　　　　一、王健林首富（最大編輯距離maxEdits=2，在「王健林」後面做2次新增字符，分別是「首」、「富」）

　　　　　　　　二、王力宏（最大編輯距離maxEdits=2，把「健」、「林」分別修改成「力」、「宏」）

　　　　　　　　三、王（最大編輯距離maxEdits=2，把「健」、「林」分別刪除）

　　　　　　　　四、王五（最大編輯距離maxEdits=2，把「健」修改成「五」，把「林」刪除）

　　　　　　　　等等。可以看出，最大編輯距離maxEdits=2表示：對查詢詞任意組合「增、刪、改」字符最多2次後能得到的詞條，都會被匹配。

　　　　prefixLength：前綴長度（默認值是0，當prefixLength不爲0時，表示開頭多少個字符必須完全匹配）

　　　　　　舉個例子：
　　　　　　　　FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term("name", "王健林"), 2, 2)：那麼必須匹配「王健」開頭的內容，
　　　　　　　　FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term("name", "王健林"), 2, 3)：那麼必須匹配「王健林」開頭的內容，但是無法匹配「王健林」這條記錄。如果有人知道原因，也請告訴我爲什麼。

　　四、BooleanQuery類：用於組合查詢，用法如下：

        // Every keyword becomes a MUST clause on "content", so a hit has to contain all of them.
        BooleanQuery.Builder booleanBuilder = new BooleanQuery.Builder();
        for (String keyword : new String[] {"首富", "影院"}) {
            booleanBuilder.add(new TermQuery(new Term("content", keyword)), BooleanClause.Occur.MUST);
        }
        // Build once and reuse the same query object for logging and for searching.
        BooleanQuery booleanQuery = booleanBuilder.build();
        System.out.println("查詢語句 = " + booleanQuery);
        TopDocs topDocs = indexSearcher.search(booleanQuery, 100);

　　　　Occur是個枚舉，有以下值：

　　　　MUST、MUST_NOT、FILTER、SHOULD

　　五、IntPoint、LongPoint、FloatPoint、DoublePoint類：用於數值範圍查詢，用法如下：

　　　　　　newRangeQuery：整型範圍查詢

　　　　　　newExactQuery：整型精確查詢

        // Two alternative queries on the "number" point field; only the one assigned to `query` is executed.
        // Range query: "number" between 1 and 10, both bounds inclusive.
        Query rangeQuery = IntPoint.newRangeQuery("number", 1, 10);
        // Exact query: "number" equal to 5.
        Query exactQuery = IntPoint.newExactQuery("number", 5);
        // Select which query to run (assign rangeQuery instead to try the range search).
        Query query = exactQuery;
        System.out.println("查詢語句 = " + query);
        TopDocs topDocs = indexSearcher.search(query, 20);

　　六、TermQuery類：用於詞條查詢，用法如下：

        // Exact-term lookup on the "name" field.
        Term nameTerm = new Term("name", "張三");
        Query query = new TermQuery(nameTerm);
        System.out.println("查詢語句 = " + query);
        // Retrieve at most 20 hits.
        TopDocs topDocs = indexSearcher.search(query, 20);

四、Lucene常用的Field

　　TextField：對Reader或String建立索引，用於全文搜索
　　StringField：將String原樣（不分詞）索引爲單個詞條
　　IntPoint：對int建立索引，用於精確/範圍查詢。
　　LongPoint：對long建立索引，用於精確/範圍查詢。
　　FloatPoint：對float建立索引，用於精確/範圍查詢。
　　DoublePoint：對double建立索引，用於精確/範圍查詢。
　　SortedDocValuesField：按列存儲byte[]，用於排序/分面
　　SortedSetDocValuesField：按列存儲SortedSet<byte[]>，用於排序/分面
　　NumericDocValuesField：按列存儲long，用於排序/分面
　　SortedNumericDocValuesField：按列存儲SortedSet<long>，用於排序/分面
　　StoredField：僅存儲的值，用於在搜索結果摘要中取回顯示
　　
　　常用的Field如下：
　　　　TextField:索引、分詞
　　　　StringField：索引
　　　　StoredField：存儲值

        // Sample document that combines the commonly used field types.
        Document document = new Document();
        // StringField; Field.Store.YES indicates that the value is also stored
        document.add(new StringField("name", "王健林", Field.Store.YES));
        // StringField; Field.Store.YES indicates that the value is also stored
        document.add(new StringField("no", "1006", Field.Store.YES));
        // IntPoint on "number"
        document.add(new IntPoint("number", 6));
        // To store the IntPoint's value as well, add a StoredField with the same name
        document.add(new StoredField("number", 6));
        // To sort on the IntPoint's value, add a SortedNumericDocValuesField with the same name
        document.add(new SortedNumericDocValuesField("number", 6L));
        // TextField; Field.Store.YES indicates that the value is also stored
        document.add(new TextField("content", "王健林是萬達集團的董事長，下有萬達影院、萬達酒店、萬達廣場等產業", Field.Store.YES));

五、Lucene常用的Analyzer(分詞器)

　一、StandardAnalyzer分詞器　　

　　當你運行文章開頭的demo，查詢字段爲"content"，查詢的值爲"班長"，按理說應該查到document3這個對象的數據，因爲已經添加了類型爲TextField的字段，並且根據上面說的，TextField會分詞並建立索引。分詞之後，"班長"這個詞應該可以查詢得到。爲什麼查詢不到數據？這和分詞器有關。

　　Document document3 = new Document();
　　document3.add(new StringField("name", "王五", Field.Store.YES));
　　document3.add(new StringField("no", "1003", Field.Store.YES));
　　document3.add(new TextField("content", "南寧市中心小學的王五在班級裏是個班長", Field.Store.YES));
　　indexWriter.addDocument(document3);　　
　　
　　文章開頭的demo用的分詞器是StandardAnalyzer分詞器，看名字就知道是標準分詞器。

        Analyzer analyzer = new StandardAnalyzer();

　　現在用這個分詞器對上述的"南寧市中心小學的王五在班級裏是個班長"這段文本進行分詞，看看結果。

    /**
     * Entry point: runs the StandardAnalyzer tokenization demo.
     *
     * @param args command-line arguments (not used)
     * @throws IOException propagated from {@code StandardAnalyzer()}
     */
    public static void main(String[] args) throws IOException {
        StandardAnalyzer();
    }

    /**
     * Tokenizes a fixed Chinese sample sentence with a {@code StandardAnalyzer} and prints every
     * token in square brackets on a single line, framed by a start and an end banner line.
     *
     * @throws IOException if the token stream fails while being consumed
     */
    public static void StandardAnalyzer() throws IOException {
        String text = "南寧市中心小學的王五在班級裏是個班長";
        // Analyzer and TokenStream are both Closeable; try-with-resources closes the stream first,
        // then the analyzer, even if tokenization throws.
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream tokenStream = analyzer.tokenStream("word", text)) {
            // Register the attribute before reset(), following the TokenStream consumer workflow.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            System.out.println("==========StandardAnalyzer分詞開始==========");
            while (tokenStream.incrementToken()) {
                System.out.print(String.format("[%s] ", charTermAttribute.toString()));
            }
            // end() must be called after the last incrementToken() and before close().
            tokenStream.end();
            System.out.println("");
            System.out.println("==========StandardAnalyzer分詞結束==========");
        }
    }

　　結果如下：

==========StandardAnalyzer分詞開始==========
[南] [寧] [市] [中] [心] [小] [學] [的] [王] [五] [在] [班] [級] [裏] [是] [個] [班] [長] 
==========StandardAnalyzer分詞結束==========

　　可以看到，StandardAnalyzer這個標準分詞器對中文是一個字一個字地分詞，索引裏只有"班"和"長"這樣的單字詞條，沒有"班長"這個詞條；而TermQuery不會對查詢詞再做分詞，所以當查詢"班長"這個詞的時候查不到。

　二、IKAnalyzer分詞器（開源，github搜索）

　　這是個開源的分詞器（github地址：https://github.com/wks/ik-analyzer），現在來看看這個分詞器的結果。

    /**
     * Tokenizes a fixed Chinese sample sentence with an {@code IKAnalyzer} and prints every
     * token in square brackets on a single line, framed by a start and an end banner line.
     *
     * @throws IOException if the token stream fails while being consumed
     */
    public static void IKAnalyzer() throws IOException {
        String text = "南寧市中心小學的王五在班級裏是個班長";
        // Analyzer and TokenStream are both Closeable; try-with-resources closes the stream first,
        // then the analyzer, even if tokenization throws.
        try (Analyzer analyzer = new IKAnalyzer();
             TokenStream tokenStream = analyzer.tokenStream("word", text)) {
            // Register the attribute before reset(), following the TokenStream consumer workflow.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            System.out.println("==========IKAnalyzer分詞開始==========");
            while (tokenStream.incrementToken()) {
                System.out.print(String.format("[%s] ", charTermAttribute.toString()));
            }
            // end() must be called after the last incrementToken() and before close().
            tokenStream.end();
            System.out.println("");
            System.out.println("==========IKAnalyzer分詞結束==========");
        }
    }

　　結果如下：

==========IKAnalyzer分詞開始==========
[南寧市] [南寧] [市中心] [中心小學] [中心] [小學] [的] [王] [五] [在] [班級] [裏] [是] [個] [班長] 
==========IKAnalyzer分詞結束==========

　　可以看到這個IKAnalyzer分詞器能識別詞語來分詞。所以只需要把文章開頭的demo代碼中的StandardAnalyzer換成IKAnalyzer。

        // Index writer setup for the opening demo, with the analyzer swapped out.
        Directory directory = FSDirectory.open(Paths.get(path));
        // This was originally the StandardAnalyzer; it is replaced with the IKAnalyzer
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(directory, config);

　　這樣就能查詢到文章開頭的demo的數據。