lucene全文檢索學習記錄，附帶源碼——三種實現，超全超細緻

時間 2019-11-15

標籤 lucene 全文檢索學習記錄附帶源碼三種實現細緻简体版

原文鏈接

Lucene學習記錄

———三種實現，超全超細緻

下載lucene3.6.0.zip http://download.csdn.net/detail/leilovegege/6800405 ，解壓，將裏邊的lucene-core-3.6.0.jar等包拷貝到工程lib中。還需要中文分詞器IKAnalyzer3.2.8.jar和ojdbc14.jar。

至此環境搭配完成。

下面開始實現，只用java類，沒有鏈接web頁面，因此測試時直接執行java文件，在控制檯進行測試。

工程源文件lucene36 http://download.csdn.net/detail/leilovegege/6804669

Test1檢索絕對路徑上的文件

package test1;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.Date;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.FieldSelectorResult;

import org.apache.lucene.document.NumericField;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class TestFileIndexer {

public static void main(String[] args) throws Exception {

/* 指明要索引文件夾的位置,這裏是C盤的source文件夾下 */

File fileDir = new File( "E:\\Documents and Settings\\Administrator\\Workspaces\\MyEclipse 8.6\\lucene36\\source" );

/* 這裏放索引文件的位置 */

File indexDir = new File( ".\\index\\test1" );

Directory dir=FSDirectory.open(indexDir);//將索引存放在磁盤上

Analyzer lucenAnalyzer=new StandardAnalyzer(Version.LUCENE_36);//分析器

IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_36,lucenAnalyzer);

iwc.setOpenMode(OpenMode.CREATE);//建立新的索引文件create 表示建立或追加到已有索引庫

IndexWriter indexWriter=new IndexWriter(dir,iwc);//把文檔寫入到索引庫

File[] textFiles=fileDir.listFiles();//獲得索引文件夾下全部文件

long startTime=new Date().getTime();

//增長document到檢索去

for (int i = 0; i < textFiles.length; i++) {

// if (textFiles[i].isFile()&& textFiles[i].getName().endsWith(".txt")) {

System.out.println(":;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;");

System.out.println("File"+textFiles[i].getCanonicalPath()+"正在被索引...");

String temp=FileReaderAll(textFiles[i].getCanonicalPath(),"GBK");

System.out.println(temp);

Document document=new Document();

Field FieldPath=new Field("path",textFiles[i].getPath(),Field.Store.YES,Field.Index.NO);

Field FieldBody=new Field("body",temp,Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS);

NumericField modifiField=new NumericField("modified");//因此key爲modified

modifiField.setLongValue(fileDir.lastModified());

document.add(FieldPath);

document.add(FieldBody);

document.add(modifiField);

indexWriter.addDocument(document);

// }

}

indexWriter.close();

//計算一下索引的時間

long endTime=new Date().getTime();

System.out.println("花了"+(endTime-startTime)+"毫秒把文檔添加到索引裏面去"+fileDir.getPath());

}

public static String FileReaderAll(String FileName,String charset)throws IOException{

BufferedReader reader=new BufferedReader(new InputStreamReader(new FileInputStream(FileName),charset));

String line=new String();

String temp=new String();

while ((line=reader.readLine())!=null) {

temp+=line;

}

reader.close();

return temp;

}

package test1;

import java.io.BufferedReader;

import java.io.File;

import java.io.IOException;

import java.io.InputStreamReader;

import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryParser.ParseException;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class TestQuery {

public static void main(String[] args) throws ParseException, IOException {

String index="./index/test1";//搜索的索引路徑

IndexReader reader=IndexReader.open(FSDirectory.open(new File(index)));

IndexSearcher searcher=new IndexSearcher(reader);//檢索工具

ScoreDoc[] hits=null;

// BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

// String queryString=reader1.readLine().toString(); //搜索關鍵字

// Scanner sca=new Scanner(System.in);

// String queryString=sca.next().toString();

String queryString="測試";

Query query=null;

Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_36);

try {

QueryParser qp=new QueryParser(Version.LUCENE_36,"body",analyzer);//用於解析用戶輸入的工具

query=qp.parse(queryString);

} catch (Exception e) {

// TODO: handle exception

}

if (searcher!=null) {

TopDocs results=searcher.search(query, 10);//只取排名前十的搜索結果

hits=results.scoreDocs;

Document document=null;

if (hits.length>0) {

System.out.println("找到"+hits.length+"條結果");

for (int i = 0; i < hits.length; i++) {

document=searcher.doc(hits[i].doc);

String body=document.get("body");

String path=document.get("path");

String modifiedtime=document.get("modifiField");

System.out.print(body+" ");

System.out.println(path);

}

}else

System.out.println("沒查到結果");

searcher.close();

reader.close();

}else

System.out.println("沒查找到索引");

}

Test2檢索相對路徑上的文件

package test2;

import java.io.*;

import java.util.HashMap;

import java.util.Iterator;

import java.util.Set;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopScoreDocCollector;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

import org.wltea.analyzer.lucene.IKQueryParser;

public class MyLucene {

private static final File INDEX_PATH = new File(".\\index\\test2"); // 索引文件位置, 當前路徑下的index文件

private static final String filePath = ".\\luceneDataSource\\test.txt";// 索引數據源文件位置，當前路徑下的luceneDataSource\test.txt文件

private static final Analyzer ANALYZER = new IKAnalyzer(); // 中文分詞器

public static void main(String[] args){

/**

* 建立索引

File readFile = new File(filePath); // 獲取數據源文件

HashMap<String, String> words = readFile(readFile);

Document doc = null;

if (words != null) {

try {

IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, ANALYZER);

iwc.setOpenMode(OpenMode.CREATE);//建立新的索引文件create 表示建立或追加到已有索引庫，沒有這句話索引庫會有重複的

IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_PATH), iwc);

Set<String> keys = words.keySet();

for (Iterator<String> it = keys.iterator(); it.hasNext();) {

String key = it.next();

doc = new Document();

Field index = new Field("index", key, Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS);

Field contents = new Field("contents", words.get(key),Field.Store.YES, Field.Index.NO);

doc.add(index);

doc.add(contents);

writer.addDocument(doc);

}

writer.close(); // 這裏不關閉，創建索引會失敗

} catch (Exception e) {

e.printStackTrace();

}

else

System.out.println("文件讀取錯誤");

}

/**

* 判斷索引庫是已否建立

public boolean noIndex() {

File[] indexs = INDEX_PATH.listFiles();

if (indexs.length == 0) {

return true;

} else {

return false;

}

/**

* 讀取文件

* @param file

public static HashMap<String, String> readFile(File file) {

InputStream in = null;

InputStreamReader inR = null;

BufferedReader br = null;

HashMap<String, String> wordsMap = new HashMap<String, String>();

try {

in = new FileInputStream(file);

inR = new InputStreamReader(in, "GBK"); //utf-8

br = new BufferedReader(inR);

String line;

while ((line = br.readLine()) != null) {

System.out.println(line);

wordsMap.put(line.trim(), line.trim());

}

return wordsMap;

} catch (Exception e) {

e.printStackTrace();

return null;

} finally {

try {

if (in != null)

in.close();

if (inR != null)

inR.close();

if (br != null)

br.close();

} catch (Exception e) {

e.printStackTrace();

return null;

}

/**

* 檢索

* @param queryStr

* @param hitsPerPage

public void search(String queryStr) {

try {

IndexReader reader = IndexReader.open(FSDirectory.open(INDEX_PATH));// 獲得索引的目錄

IndexSearcher searcher = new IndexSearcher(reader);

Query query = IKQueryParser.parse("index", queryStr);

TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);

searcher.search(query, collector);

ScoreDoc[] hits = collector.topDocs().scoreDocs;

if(hits.length > 0){

System.out.println("檢索詞："+queryStr+"\t共找到 "+hits.length+"條記錄");

for (int i = 0; i < hits.length; i++) {

Document result = searcher.doc(hits[i].doc);

System.out.println((i+1) +")" + "\n index:" + result.get("index") + "\n contents:" + result.get("contents"));

}

}else{

System.out.println("未找到結果");

}

} catch (Exception e) {

System.out.println("Exception");

}

package test2;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

public class TestMyLucene {

public static void main(String[] args) throws IOException {

MyLucene myLucene = new MyLucene();

// 索引庫是已否建立,若是沒有則建立

if(myLucene.noIndex()){

System.out.println("索引庫尚未建立");

}else{

BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

String queryString=reader1.readLine().toString(); //搜索關鍵字

myLucene.search(queryString);

}

Test3檢索數據庫中的數據（本例爲oracle）

package test3;

import java.io.File;

import java.io.IOException;

import java.sql.Connection;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

import java.text.DateFormat;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexWriterConfig.OpenMode;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.wltea.analyzer.lucene.IKAnalyzer;

public class IndexCreateUtill {

private List<NewsItem> list;

public void createIndexForMynews() throws IOException, ClassNotFoundException{

//存放索引的文件夾

File indxeFile = new File(".\\index\\test3");

//建立Directory對象

Directory directory =FSDirectory.open(indxeFile);

//使用IKAnalyzer分詞器

Analyzer analyzer = new IKAnalyzer();

//建立IndexWriterConfig

IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);

//建立IndexWriter

indexWriterConfig.setOpenMode(OpenMode.CREATE);

IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);

//從數據庫中讀取出全部的新聞記錄以便進行索引的建立

try {

// DBSource dbSource =DBSource.getInstance();

// Connection conn = dbSource.getConnection();

Connection conn=Utils.getConnection();

Statement stmt = null;

ResultSet rs = null;

String sql = "select * from t_newsitem";

stmt = conn.createStatement();

rs = stmt.executeQuery(sql);

list = new ArrayList<NewsItem>();

while(rs.next()){

NewsItem newsItem = new NewsItem();

newsItem.setId(rs.getInt("id"));

newsItem.setNewsTitle(rs.getString("newsTitle"));

newsItem.setNewsContent(rs.getString("newsContent"));

newsItem.setPublishTime(rs.getTimestamp("publishTime"));

newsItem.setResource(rs.getString("resourcer"));

newsItem.setT_newsType_id(rs.getInt("t_newsType_id"));

newsItem.setEditor(rs.getString("editor"));

list.add(newsItem);

}

DateFormat dateFormat = new SimpleDateFormat("yyyy年MM月dd日 HH時mm分ss秒");

for (int i=0;i<list.size();i++) {

//創建一個lucene文檔

Document doc = new Document();

//獲得新聞標題

String newsTitle = list.get(i).getNewsTitle();

//獲得新聞內容

String newsContent = list.get(i).getNewsContent();

//獲得新聞事件

String publishDate = dateFormat.format(list.get(i).getPublishTime());

//獲得新聞主鍵id

String id = list.get(i).getId() + "";

//將新聞標題加入文檔，由於要搜索和高亮，因此index是tokennized，TermVector是WITH_POSITIONS_OFFSETS

doc.add(new Field("title" , newsTitle , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.WITH_POSITIONS_OFFSETS));

//添加新聞內容至文檔，與標題類似

doc.add(new Field("content" , newsContent , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.WITH_POSITIONS_OFFSETS));

//添加時間至文檔，由於要按照此字段降序排列排序，因此tokenzied,不用高亮因此TermVector是no就好了

doc.add(new Field("date" , publishDate , Field.Store.YES , Field.Index.ANALYZED , Field.TermVector.NO));

//添加主鍵至文檔，不分詞，不高亮。

doc.add(new Field("id" , id , Field.Store.YES , Field.Index.NO , Field.TermVector.NO));

indexWriter.addDocument(doc);

}

indexWriter.close();

Utils.closeAll(rs, stmt, conn);

} catch (SQLException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

public static void main(String[] args) throws Exception {

IndexCreateUtill util = new IndexCreateUtill();

util.createIndexForMynews();

}

package test3;

import java.io.Serializable;

import java.util.Date;

/**
 * Plain serializable bean holding one news record.
 */
public class NewsItem implements Serializable {

    private static final long serialVersionUID = 1L;

    private Integer id;
    private String newsTitle;
    private String newsContent;
    private Date publishTime;
    private String resource;
    private Integer t_newsType_id;
    private String editor;

    /** Creates an item with every property unset ({@code null}). */
    public NewsItem() {
    }

    /**
     * Creates a fully populated item.
     *
     * @param id primary key
     * @param newsTitle title text
     * @param newsContent body text
     * @param publishTime publication time
     * @param resource value of the {@code resource} property
     * @param t_newsType_id value of the {@code t_newsType_id} property
     * @param editor value of the {@code editor} property
     */
    public NewsItem(Integer id, String newsTitle, String newsContent,
            Date publishTime, String resource, Integer t_newsType_id, String editor) {
        super();
        this.id = id;
        this.newsTitle = newsTitle;
        this.newsContent = newsContent;
        this.publishTime = publishTime;
        this.resource = resource;
        this.t_newsType_id = t_newsType_id;
        this.editor = editor;
    }

    /** @return the primary key */
    public Integer getId() {
        return id;
    }

    /** @param id the primary key */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the title text */
    public String getNewsTitle() {
        return newsTitle;
    }

    /** @param newsTitle the title text */
    public void setNewsTitle(String newsTitle) {
        this.newsTitle = newsTitle;
    }

    /** @return the body text */
    public String getNewsContent() {
        return newsContent;
    }

    /** @param newsContent the body text */
    public void setNewsContent(String newsContent) {
        this.newsContent = newsContent;
    }

    /** @return the publication time (the stored instance, not a copy) */
    public Date getPublishTime() {
        return publishTime;
    }

    /** @param publishTime the publication time (stored as given, not copied) */
    public void setPublishTime(Date publishTime) {
        this.publishTime = publishTime;
    }

    /** @return the {@code resource} property */
    public String getResource() {
        return resource;
    }

    /** @param resource the {@code resource} property */
    public void setResource(String resource) {
        this.resource = resource;
    }

    /** @return the {@code t_newsType_id} property */
    public Integer getT_newsType_id() {
        return t_newsType_id;
    }

    /** @param t_newsType_id the {@code t_newsType_id} property */
    public void setT_newsType_id(Integer t_newsType_id) {
        this.t_newsType_id = t_newsType_id;
    }

    /** @return the {@code editor} property */
    public String getEditor() {
        return editor;
    }

    /** @param editor the {@code editor} property */
    public void setEditor(String editor) {
        this.editor = editor;
    }
}

package test3;

import java.io.File;

import java.io.IOException;

import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryParser.ParseException;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class TestQuery {

public static void main(String[] args) throws ParseException, IOException {

String index=".\\index\\test3";//搜索的索引路徑

IndexReader reader=IndexReader.open(FSDirectory.open(new File(index)));

IndexSearcher searcher=new IndexSearcher(reader);//檢索工具

ScoreDoc[] hits=null;

// BufferedReader reader1=new BufferedReader(new InputStreamReader(System.in));

// String queryString=reader1.readLine().toString(); //搜索關鍵字

Scanner sca=new Scanner(System.in);

String queryString=sca.next().toString();

System.out.print("搜索關鍵詞爲"+queryString+",");

Query query=null;

Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_36);

try {

QueryParser qp=new QueryParser(Version.LUCENE_36,"content",analyzer);//用於解析用戶輸入的工具

query=qp.parse(queryString);

} catch (Exception e) {

// TODO: handle exception

}

if (searcher!=null) {

TopDocs results=searcher.search(query, 10);//只取排名前十的搜索結果

hits=results.scoreDocs;

Document document=null;

if (hits.length>0) {

System.out.println("找到"+hits.length+"條結果");

for (int i = 0; i < hits.length; i++) {

document=searcher.doc(hits[i].doc);

String title=document.get("title");

String content=document.get("content");

String date=document.get("date");

String id=document.get("id");

System.out.println("標題："+title);

System.out.println("內容："+content);

System.out.println("日期："+date);

System.out.println("ID:"+id);

}

}else

System.out.println("沒查到結果");

searcher.close();

reader.close();

}else

System.out.println("沒查找到索引");

}

package test3;

import java.sql.Connection;

import java.sql.DriverManager;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

/**
 * JDBC helpers: opens the Oracle connection and closes JDBC resources quietly.
 */
public class Utils {

    /**
     * Opens a connection to the local Oracle instance.
     *
     * @return the connection, or {@code null} if the driver class is missing or the connection
     *         attempt fails (the failure is printed, not thrown)
     */
    public static Connection getConnection() {
        Connection con = null;
        try {
            Class.forName("oracle.jdbc.driver.OracleDriver");
            // The URL must not contain whitespace between host and port.
            con = DriverManager.getConnection("jdbc:oracle:thin:@localhost:1521:orcl", "hr", "orcl");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return con;
    }

    /**
     * Closes result set, statement and connection, in that order; each may be {@code null}.
     *
     * @param rs result set to close
     * @param ps statement to close
     * @param conn connection to close
     * @throws SQLException declared for compatibility; close failures are printed by the helpers
     */
    public static void closeAll(ResultSet rs, Statement ps, Connection conn) throws SQLException {
        closeResultSet(rs);
        closeStatement(ps);
        closeConnection(conn);
    }

    /**
     * Closes a connection, printing instead of throwing on failure.
     *
     * @param con connection to close; {@code null} is ignored
     */
    public static void closeConnection(Connection con) {
        try {
            if (con != null) {
                con.close();
            }
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Closes a statement, printing instead of throwing on failure.
     *
     * @param st statement to close; {@code null} is ignored
     */
    public static void closeStatement(Statement st) {
        try {
            if (st != null) {
                st.close();
            }
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Closes a result set, printing instead of throwing on failure.
     *
     * @param rs result set to close; {@code null} is ignored
     */
    public static void closeResultSet(ResultSet rs) {
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (SQLException ex) {
            ex.printStackTrace();
        }
    }
}

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。