lucene之索引建立

時間 2019-11-13

標籤：lucene、索引建立 | 简体版

原文鏈接

package org.itat.index;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class CreateIndex {
   private static IndexWriter writer = null;
   private static String[] ids = { "1", "2", "3", "4", "5", "6" };
   private static String[] emails = { "aa@itat.org", "bb@itat.org",
           "cc@cc.org", "dd@sina.org", "ee@zttc.edu", "ff@itat.org" };
   private static String[] contents = {
           "welcome to visited the space,I like book",
           "hello boy, I like pingpeng ball", "my name is cc I like game",
           "I like football", "I like football and I like basketball too",
           "I like movie and swim" };
   private static Date[] dates = null;
   private static int[] attachs = { 2, 3, 1, 4, 5, 5 };
   private static String[] names = { "zhangsan", "lisi", "john", "jetty",
           "mike", "jake" };
   private static Directory directory = null;
   private static Map<String, Float> scores = new HashMap<String, Float>();
   private static IndexReader reader = null;
   static {
       SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
       try {
           // 初始化date數組
           dates = new Date[ids.length];
           dates[0] = sdf.parse("2010-02-19");
           dates[1] = sdf.parse("2012-01-11");
           dates[2] = sdf.parse("2011-09-19");
           dates[3] = sdf.parse("2010-12-22");
           dates[4] = sdf.parse("2012-01-01");
           dates[5] = sdf.parse("2011-05-19");
           // 設置搜索郵箱名的優先級
           scores.put("itat.org", 2.0f);
           scores.put("zttc.edu", 1.5f);
           directory = FSDirectory.open(new
           File("d:/lucene/index02"));//打開磁盤做爲索引的存儲
           //directory = new RAMDirectory();// 索引建在內存中


       } catch (ParseException e) {
           e.printStackTrace();
       } catch (Exception e) {
           // TODO Auto-generated catch block
           e.printStackTrace();
       }

   }
   public CreateIndex() throws Exception, Exception{//利用索引器建立索引
       //實例化了一個索引器

       //writer= new IndexWriter("D:/index/", new PanGuAnalyzer(), true);
       //第一個參數path;第二個參數分詞器；第三個參數 create：它是一個boolean型變量，若是爲true，
       //表示要重寫指定的存放索引目錄下的索引文件；若是爲false，表示在指定存放索引目錄下已經存在的
       //索引文件的基礎上，向其中繼續追加新的索引文件。
       writer = new IndexWriter(directory, new IndexWriterConfig(
               Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));

       Document doc = null;
       for(int i=0;i<ids.length;i++) {
           doc = new Document();
           //ANALYZED進行分詞和索引，適用於標題、內容等不適合精確操做
           doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
           //進行分詞可是不存儲norms信息，這個norms中包括了建立索引的時間和權值等信息
           doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
           doc.add(new Field("email","test"+i+"@test.com",Field.Store.YES,Field.Index.NOT_ANALYZED));
           doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
           doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
           //存儲數字在索引中
           doc.add(new NumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
           //存儲日期在索引中
           doc.add(new NumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
           String et = emails[i].substring(emails[i].lastIndexOf("@")+1);
           System.out.println(et);
           if(scores.containsKey(et)) {
               doc.setBoost(scores.get(et));//設置搜索的優先級
           } else {
               doc.setBoost(0.5f);
           }
           writer.addDocument(doc);

           writer.optimize(); //添加完全部document，咱們對索引進行優化，
           //優化主要是將多個索引文件合併到一個，有利於提升索引速度。
       }
       writer.close();//必定要關閉，否則會出現Lock obtain timed out: NativeFSLock@D:\lucene\index02\write.lock

   }
   public void undelete() throws Exception, Exception{

       /*lucene中文檔的刪除是推遲到IndexReader實例關閉的時候才進行的，因此lucene容許在文檔被標記刪除但還
       * 沒有執行最後的刪除操做以前，恢復被標記爲刪除的文檔。能夠經過調用undelete方法移除索引目錄中的.del文件
       * 。隨後關閉IndexReader實例。這樣就保存了全部標記爲刪除的文檔。若是用IndexReader標記了刪除文檔，那麼
       * 只有同一個IndexReader實例的deleteAll()方法才能在最初的位置恢復各個被標記爲刪除的文檔。也就是說IndexReader實例只能處理自身標記的刪除文檔，沒法恢復其餘實例的文檔。
       lucene恢復被刪除的文檔
       * */
       //使用IndexReader進行恢復
       //恢復時，必須把IndexReader的只讀(readOnly)設置爲false
       IndexReader reader = IndexReader.open(directory,false);
       reader.undeleteAll();
       reader.close();
   }
   public void forceDelete() throws Exception {
       /**
         * 假如你想要強制刪除回收站的信息能夠調用writer.forceMergeDeletes()這個方法，
         * 可是這個方法不推薦使用，比較消耗內存，lucene會自動根據容量的大小刪除所刪除的文件
         */
       Directory directory1 = FSDirectory.open(new
               File("d:/lucene/index02"));//
       IndexWriter writer = null;
       try {
           writer = new IndexWriter(directory1,
                   new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
           writer.forceMergeDeletes();
       } catch (CorruptIndexException e) {
           e.printStackTrace();
       } catch (LockObtainFailedException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       } finally {
           try {
               if(writer!=null) writer.close();
           } catch (CorruptIndexException e) {
               e.printStackTrace();
           } catch (IOException e) {
               e.printStackTrace();
           }
       }
   }
   public void merge() throws Exception, Exception, Exception {
       Directory directory1 = FSDirectory.open(new
               File("d:/lucene/index01"));//

       writer = new IndexWriter(directory1,
               new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
       //會將索引合併爲2段，這兩段中的被刪除的數據會被清空
       //特別注意：此處Lucene在3.5以後不建議使用，由於會消耗大量的開銷，
       //Lucene會根據狀況自動處理的
       writer.commit();
       writer.forceMerge(2);
       writer.close();
   }
   public void delete() {
       IndexWriter writer = null;

       try {
           writer = new IndexWriter(directory,
                   new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
           //參數是一個選項，能夠是一個Query，也能夠是一個term，term是一個精確查找的值
           //此時刪除的文檔並不會被徹底刪除，而是存儲在一個回收站中的，能夠恢復
           writer.deleteDocuments(new Term("id","1"));
           writer.commit();
       } catch (CorruptIndexException e) {
           e.printStackTrace();
       } catch (LockObtainFailedException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       } finally {
           try {
               if(writer!=null) writer.close();
           } catch (CorruptIndexException e) {
               e.printStackTrace();
           } catch (IOException e) {
               e.printStackTrace();
           }
       }
   }
   public void delete02() {

       try {
           reader = IndexReader.open(directory,false);
           reader.deleteDocuments(new Term("id","1"));
       } catch (CorruptIndexException e) {
           e.printStackTrace();
       } catch (LockObtainFailedException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       }
   }
   public void update() {
       IndexWriter writer = null;
       try {
           writer = new IndexWriter(directory,
                   new IndexWriterConfig(Version.LUCENE_35,new StandardAnalyzer(Version.LUCENE_35)));
           /*
           * Lucene並無提供更新，這裏的更新操做實際上是以下兩個操做的合集
           * 先刪除以後再添加
           */
           Document doc = new Document();
           doc.add(new Field("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
           doc.add(new Field("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));
           doc.add(new Field("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));
           doc.add(new Field("name",names[0],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
           writer.updateDocument(new Term("id","1"), doc);
       } catch (CorruptIndexException e) {
           e.printStackTrace();
       } catch (LockObtainFailedException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       } finally {
           try {
               if(writer!=null) writer.close();
           } catch (CorruptIndexException e) {
               e.printStackTrace();
           } catch (IOException e) {
               e.printStackTrace();
           }
       }
   }
   public void query() {
       try {
           IndexReader reader = IndexReader.open(directory);
           //經過reader能夠有效的獲取到文檔的數量
           System.out.println("numDocs:"+reader.numDocs());
           System.out.println("maxDocs:"+reader.maxDoc());
           System.out.println("deleteDocs:"+reader.numDeletedDocs());
           reader.close();
       } catch (CorruptIndexException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       }
   }
   public void search01() {
       try {
           IndexReader reader = IndexReader.open(directory);
           IndexSearcher searcher = new IndexSearcher(reader);
           TermQuery query = new TermQuery(new Term("email","test0@test.com"));
           TopDocs tds = searcher.search(query, 10);
           for(ScoreDoc sd:tds.scoreDocs) {
               Document doc = searcher.doc(sd.doc);
               System.out.println("("+sd.doc+"-"+doc.getBoost()+"-"+sd.score+")"+
                       doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                       doc.get("attach")+","+doc.get("date")+","+doc.getValues("email")[1]);
           }
           reader.close();
       } catch (CorruptIndexException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       }
   }
   public void search02() {
       try {
           if(reader==null) {
               reader = IndexReader.open(directory,false);
           } else {
               IndexReader tr = IndexReader.openIfChanged(reader);
               if(tr!=null) {
                   reader.close();
                   reader = tr;
               }
           }
           IndexSearcher searcher=new IndexSearcher(reader);
           TermQuery query = new TermQuery(new Term("content","like"));
           TopDocs tds = searcher.search(query, 10);
           for(ScoreDoc sd:tds.scoreDocs) {
               Document doc = searcher.doc(sd.doc);
               System.out.println(doc.get("id")+"---->"+
                       doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
                       doc.get("attach")+","+doc.get("date")+","+doc.getValues("email")[1]);
           }
           searcher.close();
       } catch (CorruptIndexException e) {
           e.printStackTrace();
       } catch (IOException e) {
           e.printStackTrace();
       }
   }


   }

}

java

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。