public
void
crateIndex()
throws
Exception {
File indexDir =
new
File(
"D://luceneIndex"
);
//
存儲索引文件夾
File dataDir =
new
File(
"D://test"
);
//
須要檢索文件夾
Analyzer luceneAnalyzer = new PaodingAnalyzer();
//
PaodingAnalyzer這個類是庖丁解牛中文分詞分析器類繼承了Lucene的
Analyzer接口,對於檢索中文分詞有很大幫助
File[] dataFiles = dataDir.listFiles();
boolean
fileIsExist =
false
;
if
(indexDir.listFiles().
length
== 0)
fileIsExist =
true
;
IndexWriter indexWriter =
new
IndexWriter(indexDir,
luceneAnalyzer
, fileIsExist);
//
第三個參數是一個布爾型的變量,若是爲
true
的話就表明建立一個新的索引,爲
false
的話就表明在原來索引的基礎上進行操做。
long
startTime =
new
Date().getTime();
this
.doIndex(dataFiles, indexWriter);
indexWriter.optimize();//優化索引
indexWriter.close();//關閉索引
long
endTime =
new
Date().getTime();
System.
out
.println(
"It takes "
+ (endTime - startTime)
+
" milliseconds to create index for the files in directory "
+ dataDir.getPath());
{color:black}}
* private{*}
void
doIndex(File[] dataFiles, IndexWriter indexWriter)
throws
Exception {
for
(
int
i = 0; i < dataFiles.
length
; i++) {
if
(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(
".html"
)) {//索引全部html格式文件
System.
out
.println(
"Indexing file "
+ dataFiles[i].getCanonicalPath());
Reader txtReader =
new
FileReader(dataFiles[i]);
Document document =
new
Document();
// Field.Store.YES
存儲
Field.Store.NO
不存儲
// Field.Index.TOKENIZED
分詞
Field.Index.UN_TOKENIZED
不分詞
document.add(
new
Field(
"path"
, dataFiles[i].getCanonicalPath(), Field.Store.
YES
,
Field.Index.
UN_TOKENIZED
));
document.add(
new
Field(
"filename"
, dataFiles[i].getName(), Field.Store.
YES
, Field.Index.
TOKENIZED
));
//
另一個構造函數
,
接受一個
Reader
對象
document.add(
new
Field(
"contents"
, txtReader));
indexWriter.addDocument(document);
{color:black}}
else
if
(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(
".doc"
)) {//索引全部word文件
FileInputStream in =
new
FileInputStream(dataFiles[i]);//
得到文件流
WordExtractor extractor =
new
WordExtractor(in);//
使用POI對word文件進行解析
String str = extractor.getText();//
返回String
Document document =
new
Document();//生成
Document對象,其中有3個
Field,分別是
path
,
filename,
contents
document.add(
new
Field(
"path"
, dataFiles[i].getCanonicalPath(), Field.Store.
YES
,
Field.Index.
UN_TOKENIZED
));
document.add(
new
Field(
"filename"
, dataFiles[i].getName(), Field.Store.
YES
, Field.Index.
TOKENIZED
));
//
另一個構造函數
,
接受一個
Reader
對象
document.add(
new
Field(
"contents"
, str, Field.Store.
YES
,Field.Index.
TOKENIZED
,
Field.TermVector.
WITH_POSITIONS_OFFSETS
));
indexWriter.addDocument(document);
{color:black}}
else
{
if
(dataFiles[i].isDirectory()) {
doIndex(dataFiles[i].listFiles(), indexWriter);//使用遞歸,繼續索引文件夾
{color:black}}
{color:black}}
{color:black}}
{color:black}}