Lucene學習筆記

時間 2019-11-12

標籤 lunece 學習筆記简体版

原文原文鏈接

Lucene是一個全文檢索工具，它能夠對關鍵詞和部分數據創建索引，從而優化查詢效率。

這裏需要引入 Lucene jar（version 3.5）和 JUnit jar（version 4.0）。

學習Lucene需要掌握以下幾個最重要的部分：

索引部分學習
分詞器測試
搜索部分優化

v1.下面先了解索引部分

v2.創建索引

首先創建一個IndexUtil類

//建立索引
public void index() {對象

  IndexWriter writer=null;
  try {
   // 1.建立Directory 將索引創建在什麼地方(是內存中仍是硬盤),這裏保存到硬盤上
   //Directory directory=new RAMDirectory();//創建在內存中
   Directory directroy=FSDirectory.open(new File("F:/lunece"));
   // 2.建立IndexWriter 寫入索引
   IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
   writer=new IndexWriter(directroy,iwc);
   // 3.建立Document對象至關於數據庫中的表,或者硬盤的某個文件
   Document doc=null;
   File f=new File("F:/lunecetest1");
   for(File file:f.listFiles()){
    doc=new Document();
    // 4.爲Document添加Filed 至關於表中的字段,爲那些字段創建索引
    doc.add(new Field("content", new FileReader(file)));
    /**
     * Field.Store.YES或者NO(存儲域選項)
       設置爲YES表示或把這個域中的內容徹底存儲到文件中，方便進行文本的還原
       設置爲NO表示把這個域的內容不存儲到文件中，可是能夠被索引，此時內容沒法徹底還原(doc.get)

     */
    doc.add(new Field("filename", file.getName(),Field.Store.YES,Field.Index.NOT_ANALYZED));
    /***
     * Field.Index(索引選項)
        Index.ANALYZED:進行分詞和索引，適用於標題、內容等
        Index.NOT_ANALYZED:進行索引，可是不進行分詞，若是身份證號，姓名，ID等，適用於精確搜索
        Index.ANALYZED_NOT_NORMS:進行分詞可是不存儲norms信息，這個norms中包括了建立索引的時間和權值等信息
        Index.NOT_ANALYZED_NOT_NORMS:即不進行分詞也不存儲norms信息

     */
    doc.add(new Field("path", file.getAbsolutePath(),Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
    // 5.經過IndexWriter添加文檔到索引中
    writer.addDocument(doc);
   }
  } catch (CorruptIndexException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } catch (LockObtainFailedException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } catch (IOException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  }finally {
   try {
    if(writer!=null)writer.close();//6.這裏要關閉writer,否則寫入索引可能會有問題
   } catch (CorruptIndexException e) {
    e.printStackTrace();
   } catch (IOException e) {
    e.printStackTrace();
   }

  }

}

v3.搜索索引

//搜索索引
/**
 * Searches the on-disk index at f:/lunece for documents whose "content" field matches "java"
 * and prints "filename[path]" for up to ten hits.
 *
 * <p>Any failure is reported with printStackTrace, and the reader is closed on every path.
 */
public void search() {
  IndexReader reader = null;
  try {
    // 1. Open the directory that holds the index.
    Directory directory = FSDirectory.open(new File("f:/lunece"));
    // 2. Open a reader on the index.
    reader = IndexReader.open(directory);
    // 3. Wrap the reader in an IndexSearcher, which executes queries against it.
    IndexSearcher searcher = new IndexSearcher(reader);
    // 4. Build the Query. The parser's second argument is the default field to search;
    //    StandardAnalyzer tokenizes the query text.
    QueryParser parse =
        new QueryParser(Version.LUCENE_35, "content", new StandardAnalyzer(Version.LUCENE_35));
    // Matches documents whose searched field contains "java".
    Query query = parse.parse("java");
    // 5. Run the search, asking for at most the top 10 hits.
    TopDocs tds = searcher.search(query, 10);
    // 6. Each ScoreDoc carries the internal id of one matching document.
    ScoreDoc[] sds = tds.scoreDocs;
    for (ScoreDoc sd : sds) {
      // 7. Load the Document for this hit.
      Document d = searcher.doc(sd.doc);
      // 8. Read the stored field values that are needed.
      System.out.println(d.get("filename") + "[" + d.get("path") + "]");
    }
  } catch (Exception e) {
    // Report the failure instead of silently discarding it.
    e.printStackTrace();
  } finally {
    // Close the reader even when the search failed part-way through.
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}

  //查詢索引
/**
 * Prints document statistics for the index held in the directory field: the number of live
 * documents, the maxDoc value, and the number of deleted documents.
 */
public void query() {
  try {
    final IndexReader indexReader = IndexReader.open(directory);
    // The reader exposes the document counts directly.
    final int liveDocs = indexReader.numDocs();
    System.out.println("numDocs:" + liveDocs);
    final int maxDoc = indexReader.maxDoc();
    System.out.println("maxDocs:" + maxDoc);
    final int deletedDocs = indexReader.numDeletedDocs();
    System.out.println("deleteDocs:" + deletedDocs);
    indexReader.close();
  } catch (IOException e) {
    // CorruptIndexException is a subtype of IOException, so one handler covers both.
    e.printStackTrace();
  }
}

//刪除索引

例如我有以下數據：

// Sample data: element i of every array below describes the same document.
private String[] ids = {"1","2","3","4","5","6"};

private String[] emails = {"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
private String[] contents = {
   "welcome to visited the space,I like book",
   "hello boy, I like pingpeng ball",
   "my name is cc I like game",
   "I like football",
   "I like football and I like basketball too",
   "I like movie and swim"
};
// NOTE(review): left null here; it must be populated (one entry per id) before documents
// can carry a "date" field. No initialization is visible in this section.
private Date[] dates = null;
private int[] attachs = {2,3,1,4,5,5};
private String[] names = {"zhangsan","lisi","john","jetty","mike","jake"};

// In-memory index shared by the index/query/delete/undelete methods.
private Directory directory = new RAMDirectory();

/**
 * Rebuilds the index in the directory field from the sample arrays (ids, emails, contents,
 * names, attachs, dates).
 *
 * <p>All existing documents are removed first. Each document gets two "email" values (the
 * sample address plus a generated test address), a numeric "attach" field and, when a date
 * is available for that position, a numeric "date" field. The document boost comes from
 * scores, keyed by the e-mail domain, and defaults to 0.5. Failures are reported with
 * printStackTrace, and the writer is always closed.
 */
public void index() {
  IndexWriter writer = null;
  try {
    writer = new IndexWriter(directory,
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
    // Start from an empty index so that repeated runs do not accumulate duplicates.
    writer.deleteAll();
    for (int i = 0; i < ids.length; i++) {
      Document doc = new Document();
      doc.add(new Field("id", ids[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
      doc.add(new Field("email", emails[i], Field.Store.YES, Field.Index.NOT_ANALYZED));
      doc.add(new Field("email", "test" + i + "@test.com", Field.Store.YES, Field.Index.NOT_ANALYZED));
      doc.add(new Field("content", contents[i], Field.Store.NO, Field.Index.ANALYZED));
      doc.add(new Field("name", names[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
      // Store a number.
      doc.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i]));
      // Store a date as its epoch-millisecond value. Skip the field when no date exists for
      // this position, so that a null or short dates array cannot abort the run after
      // deleteAll() has already emptied the index.
      if (dates != null && i < dates.length && dates[i] != null) {
        doc.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime()));
      }
      // Domain part of the address, i.e. everything after the last '@'.
      String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
      System.out.println(et);
      // NOTE(review): scores is declared outside this section; presumably it maps an e-mail
      // domain to a boost value - confirm.
      if (scores.containsKey(et)) {
        doc.setBoost(scores.get(et));
      } else {
        doc.setBoost(0.5f);
      }
      writer.addDocument(doc);
    }
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (CorruptIndexException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}

/**
 * Deletes the document whose "id" term is "1" from the index in the directory field and
 * commits the change.
 */
public void delete() {
  IndexWriter indexWriter = null;
  try {
    final IndexWriterConfig config =
        new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
    indexWriter = new IndexWriter(directory, config);
    // deleteDocuments accepts either a Query or a Term; a Term is an exact-match value.
    // The document is only marked as deleted at this point (a kind of recycle bin), so it
    // can still be restored afterwards.
    final Term idOne = new Term("id", "1");
    indexWriter.deleteDocuments(idOne);
    indexWriter.commit();
  } catch (IOException e) {
    // CorruptIndexException and LockObtainFailedException are IOException subtypes.
    e.printStackTrace();
  } finally {
    if (indexWriter != null) {
      try {
        indexWriter.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}

然後創建一個TestLunece測試類

/** Builds the index through a fresh IndexUtil instance. */
@Test
public void testIndex() {
  new IndexUtil().index();
}

/** Runs the delete step through a fresh IndexUtil instance. */
@Test
public void testDelete() {
  new IndexUtil().delete();
}

//恢復索引
/**
 * Restores every document that is marked as deleted in the index held by the directory field.
 *
 * <p>Failures are reported with printStackTrace, and the reader is closed on every path.
 */
public void undelete() {
  // Restoring goes through an IndexReader rather than an IndexWriter.
  IndexReader reader = null;
  try {
    // The reader must be opened with readOnly set to false, otherwise it cannot modify the index.
    reader = IndexReader.open(directory, false);
    reader.undeleteAll();
  } catch (CorruptIndexException e) {
    e.printStackTrace();
  } catch (StaleReaderException e) {
    e.printStackTrace();
  } catch (LockObtainFailedException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    // Close in finally so a failed undeleteAll() does not leave the writable reader open.
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}