The pom file:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.example</groupId>
    <artifactId>demo-lucene</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>war</packaging>

    <name>demo-lucene</name>
    <description>Demo project for Spring Boot</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.0.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- Lucene core library -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>7.2.1</version>
        </dependency>
        <!-- Lucene query parser -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>7.2.1</version>
        </dependency>
        <!-- Lucene common analyzers -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>7.2.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
package com.example.demo;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;

/**
 * Created by 1 on 2018/11/20.
 */
public class DemoLucene {

    public static void main(String[] args) throws IOException {
        // Create the index directory on disk
        String path = "D:\\lucene\\index";
        Directory directory = FSDirectory.open(Paths.get(path));
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(directory, config);

        // Index three documents
        Document document1 = new Document();
        document1.add(new StringField("name", "張三", Field.Store.YES));
        document1.add(new StringField("no", "1001", Field.Store.YES));
        document1.add(new TextField("content", "中心小學的張三是個喜歡學習的學生", Field.Store.YES));
        indexWriter.addDocument(document1);

        Document document2 = new Document();
        document2.add(new StringField("name", "李四", Field.Store.YES));
        document2.add(new StringField("no", "1002", Field.Store.YES));
        document2.add(new TextField("content", "中心小學的李四是個期末考試成績很好的學生", Field.Store.YES));
        indexWriter.addDocument(document2);

        Document document3 = new Document();
        document3.add(new StringField("name", "王五", Field.Store.YES));
        document3.add(new StringField("no", "1003", Field.Store.YES));
        document3.add(new TextField("content", "南寧市中心小學的王五在班級裏是個班長", Field.Store.YES));
        indexWriter.addDocument(document3);

        indexWriter.close();

        // Search the index for the term 「班長」 in the content field
        DirectoryReader directoryReader = DirectoryReader.open(directory);
        IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
        Query query = new TermQuery(new Term("content", "班長"));
        System.out.println("query = " + query);
        TopDocs topDocs = indexSearcher.search(query, 100);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        if (scoreDocs.length == 0) {
            System.out.println("No results found");
        } else {
            for (int i = 0; i < scoreDocs.length; i++) {
                Document docResult = indexSearcher.doc(scoreDocs[i].doc);
                System.out.println(String.format("====================== Result %d ======================", i + 1));
                System.out.println("name:" + docResult.get("name"));
                System.out.println("no:" + docResult.get("no"));
                System.out.println("content:" + docResult.get("content"));
            }
        }
    }
}
1. Directory is an abstract class, and its subclass BaseDirectory is also abstract.
Subclasses of BaseDirectory:
RAMDirectory (concrete class): builds the index in memory
FSDirectory (abstract class): builds the index on disk
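Either kind of directory can back an IndexWriter. A minimal sketch of creating each (RAMDirectory still exists in Lucene 7.x, though it is deprecated in newer versions):

// In-memory index: handy for tests and small, throwaway indexes
// (org.apache.lucene.store.RAMDirectory)
Directory ramDirectory = new RAMDirectory();

// On-disk index: persists under the given path
Directory fsDirectory = FSDirectory.open(Paths.get("D:\\lucene\\index"));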
This article uses FSDirectory. A directory can be created through FSDirectory's static open method. Let's look at the open method:
The FSDirectory.open method returns one of the concrete subclasses of the abstract FSDirectory class; depending on the platform, it returns an MMapDirectory here.
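Roughly, the selection logic inside FSDirectory.open looks like this (paraphrased from the Lucene 7.x sources; the exact details may differ between versions):

public static FSDirectory open(Path path) throws IOException {
    return open(path, FSLockFactory.getDefault());
}

public static FSDirectory open(Path path, LockFactory lockFactory) throws IOException {
    if (Constants.JRE_IS_64BIT && MMapDirectory.UNMAP_SUPPORTED) {
        // 64-bit JVM with unmapping support: use memory-mapped files
        return new MMapDirectory(path, lockFactory);
    } else if (Constants.WINDOWS) {
        return new SimpleFSDirectory(path, lockFactory);
    } else {
        return new NIOFSDirectory(path, lockFactory);
    }
}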
1. The abstract class Query has many subclasses, as shown in the class diagram above; only the commonly used ones are listed here.
PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("content", "tom"));
builder.add(new Term("content", "is"));
builder.add(new Term("content", "student"));
System.out.println("query = " + builder.build());
TopDocs topDocs = indexSearcher.search(builder.build(), 100);
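By default the terms of a PhraseQuery must appear adjacent to each other and in the given order. If gaps should be allowed, a slop can be set on the builder; a small sketch (the slop value of 2 is only an illustration):

PhraseQuery.Builder builder = new PhraseQuery.Builder();
builder.add(new Term("content", "tom"));
builder.add(new Term("content", "student"));
// Allow up to 2 position moves between the terms, e.g. "tom is a student"
builder.setSlop(2);
TopDocs topDocs = indexSearcher.search(builder.build(), 100);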
WildcardQuery wildcardQuery = new WildcardQuery(new Term("content", "中心小學*"));
System.out.println("query = " + wildcardQuery);
TopDocs topDocs = indexSearcher.search(wildcardQuery, 100);
FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term("name", "王健林"), 2, 3);
System.out.println("query = " + fuzzyQuery);
TopDocs topDocs = indexSearcher.search(fuzzyQuery, 100);
The FuzzyQuery here uses the three-argument constructor:
public FuzzyQuery(Term term, int maxEdits, int prefixLength) {
    this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions);
}
maxEdits: the maximum edit distance (valid range 0-2, default 2; an edit means adding one character, changing one character, or deleting one character)
For example, with the fuzzy query term 「王健林」 above and maxEdits=2, the following values will match:
1. 王健林首富 (maxEdits=2: two characters, 「首」 and 「富」, are added after 「王健林」)
2. 王力宏 (maxEdits=2: 「健」 is changed to 「力」 and 「林」 to 「宏」)
3. 王 (maxEdits=2: both 「健」 and 「林」 are deleted)
4. 王五 (maxEdits=2: 「健」 is changed to 「五」 and 「林」 is deleted)
And so on. In other words, with maxEdits=2 a value matches if it can be reached from the query term by any combination of at most two add/delete/change operations.
prefixLength: the prefix length (default 0; when prefixLength is non-zero, that many leading characters must match exactly)
For example:
FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term("name", "王健林"), 2, 2): only values starting with 「王健」 can match.
FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term("name", "王健林"), 2, 3): only values starting with 「王健林」 can match, yet it fails to match the record 「王健林」 itself; if you know why, please let me know.
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("content", "首富")), BooleanClause.Occur.MUST);
builder.add(new TermQuery(new Term("content", "影院")), BooleanClause.Occur.MUST);
System.out.println("query = " + builder.build());
TopDocs topDocs = indexSearcher.search(builder.build(), 100);
Occur is an enum with the following values:
MUST (the clause must match), MUST_NOT (the clause must not match), FILTER (the clause must match but does not contribute to scoring), SHOULD (the clause should match; when there are no MUST/FILTER clauses, at least one SHOULD clause has to match)
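To illustrate the other Occur values, a small sketch against the demo index above (the choice of terms is only an example):

// Match documents whose content contains 「學生」 or 「班長」,
// but must not contain 「李四」
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("content", "學生")), BooleanClause.Occur.SHOULD);
builder.add(new TermQuery(new Term("content", "班長")), BooleanClause.Occur.SHOULD);
builder.add(new TermQuery(new Term("content", "李四")), BooleanClause.Occur.MUST_NOT);
TopDocs topDocs = indexSearcher.search(builder.build(), 100);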
newRangeQuery: integer range query
newExactQuery: exact integer query
Query query = null;
query = IntPoint.newRangeQuery("number", 1, 10);  // range query
query = IntPoint.newExactQuery("number", 5);      // exact query (overwrites the previous line; only one is used at a time)
System.out.println("query = " + query);
TopDocs topDocs = indexSearcher.search(query, 20);
Query query = new TermQuery(new Term("name", "張三"));
System.out.println("query = " + query);
TopDocs topDocs = indexSearcher.search(query, 20);
TextField: a Reader or String indexed for full-text search
StringField: a String indexed verbatim as a single token
IntPoint: an int indexed for exact/range queries
LongPoint: a long indexed for exact/range queries
FloatPoint: a float indexed for exact/range queries
DoublePoint: a double indexed for exact/range queries
SortedDocValuesField: a byte[] indexed column-wise, for sorting/faceting
SortedSetDocValuesField: a SortedSet<byte[]> indexed column-wise, for sorting/faceting
NumericDocValuesField: a long indexed column-wise, for sorting/faceting
SortedNumericDocValuesField: a sorted set of long values indexed column-wise, for sorting/faceting
StoredField: a stored-only value, retrievable in summary results
經常使用的Field以下:
TextField:索引、分詞
StringField:索引
StoredField:存儲值
Document document = new Document();
// Create a StringField; Field.Store.YES means the value is also stored
document.add(new StringField("name", "王健林", Field.Store.YES));
// Create a StringField; Field.Store.YES means the value is also stored
document.add(new StringField("no", "1006", Field.Store.YES));
// Create an IntPoint
document.add(new IntPoint("number", 6));
// To store the IntPoint's value, add a StoredField with the same name
document.add(new StoredField("number", 6));
// To sort on the IntPoint's value, add a SortedNumericDocValuesField with the same name
document.add(new SortedNumericDocValuesField("number", 6L));
// Create a TextField; Field.Store.YES means the value is also stored
document.add(new TextField("content", "王健林是萬達集團的董事長,下有萬達影院、萬達酒店、萬達廣場等產業", Field.Store.YES));
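Once the SortedNumericDocValuesField has been added, search results can be sorted on that field. A minimal sketch, assuming several documents with a "number" field have been indexed this way:

// Sort matching documents by the "number" field in ascending order
Sort sort = new Sort(new SortedNumericSortField("number", SortField.Type.LONG));
Query query = IntPoint.newRangeQuery("number", 1, 10);
TopDocs topDocs = indexSearcher.search(query, 20, sort);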
When you run the demo at the beginning of the article, querying the field "content" for the value "班長", you would expect document3 to be returned: its content field is a TextField, and as described above a TextField is tokenized and indexed, so after tokenization the word "班長" should be findable. Why does the query return nothing? The reason lies in the analyzer.
Document document3 = new Document();
document3.add(new StringField("name", "王五", Field.Store.YES));
document3.add(new StringField("no", "1003", Field.Store.YES));
document3.add(new TextField("content", "南寧市中心小學的王五在班級裏是個班長", Field.Store.YES));
indexWriter.addDocument(document3);
The demo at the beginning of the article uses StandardAnalyzer which, as the name suggests, is the standard analyzer.
Analyzer analyzer = new StandardAnalyzer();
Now let's feed the text from above, "南寧市中心小學的王五在班級裏是個班長", through this analyzer and look at the result.
public static void main(String[] args) throws IOException {
    standardAnalyzer();
}

public static void standardAnalyzer() throws IOException {
    String text = "南寧市中心小學的王五在班級裏是個班長";
    Analyzer analyzer = new StandardAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("word", text);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    System.out.println("==========StandardAnalyzer tokenization start==========");
    while (tokenStream.incrementToken()) {
        System.out.print(String.format("[%s] ", charTermAttribute.toString()));
    }
    System.out.println("");
    System.out.println("==========StandardAnalyzer tokenization end==========");
}
The result:
==========StandardAnalyzer tokenization start==========
[南] [寧] [市] [中] [心] [小] [學] [的] [王] [五] [在] [班] [級] [裏] [是] [個] [班] [長]
==========StandardAnalyzer tokenization end==========
As you can see, StandardAnalyzer splits Chinese text one character at a time, so the two-character term "班長" never ends up in the index and the query for it finds nothing.
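As a quick check, a single-character term does match, because single characters are exactly what was indexed (a small sketch against the demo index):

// With StandardAnalyzer, searching for the single character 「班」 does find document3
Query query = new TermQuery(new Term("content", "班"));
TopDocs topDocs = indexSearcher.search(query, 100);
System.out.println("hits = " + topDocs.totalHits);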
IK Analyzer is an open-source analyzer (GitHub: https://github.com/wks/ik-analyzer). Let's see how it tokenizes the same text.
public static void ikAnalyzer() throws IOException {
    String text = "南寧市中心小學的王五在班級裏是個班長";
    Analyzer analyzer = new IKAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("word", text);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    System.out.println("==========IKAnalyzer tokenization start==========");
    while (tokenStream.incrementToken()) {
        System.out.print(String.format("[%s] ", charTermAttribute.toString()));
    }
    System.out.println("");
    System.out.println("==========IKAnalyzer tokenization end==========");
}
The result:
==========IKAnalyzer tokenization start==========
[南寧市] [南寧] [市中心] [中心小學] [中心] [小學] [的] [王] [五] [在] [班級] [裏] [是] [個] [班長]
==========IKAnalyzer tokenization end==========
As you can see, IKAnalyzer recognizes whole words when tokenizing. So all that is needed is to replace StandardAnalyzer with IKAnalyzer in the demo code at the beginning of the article.
Directory directory = FSDirectory.open(Paths.get(path));
// This was originally the StandardAnalyzer; replace it with IKAnalyzer
Analyzer analyzer = new IKAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
IndexWriter indexWriter = new IndexWriter(directory, config);
With that change, the query in the demo at the beginning of the article finds the expected data.
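Note that IK Analyzer is not declared in the pom at the top of this article, so the dependency has to be added separately. The coordinates below are an assumption (a repackaging commonly found on Maven Central); the 2012-era releases target older Lucene APIs, so a build or fork patched for Lucene 7 may be needed:

<!-- IK Analyzer (coordinates are an assumption; verify against your repository and Lucene version) -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>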