Elasticsearch 之(25)重寫IK分詞器源碼來基於mysql熱更新詞庫

熱更新
在上一節《 IK分詞器配置文件講解以及自定義詞庫》自定義詞庫,每次都是在es的擴展詞典中,手動添加新詞語,很坑
(1)每次添加完,都要重啓es才能生效,很是麻煩
(2)es是分佈式的,可能有數百個節點,你不能每次都一個一個節點上面去修改

es不停機,直接咱們在外部某個地方添加新的詞語,es中當即熱加載到這些新詞語

熱更新的方案
(1)修改ik分詞器源碼,而後手動支持從mysql中每隔必定時間,自動加載新的詞庫
(2)基於ik分詞器原生支持的熱更新方案,部署一個web服務器,提供一個http接口,經過modified和tag兩個http響應頭,來提供詞語的熱更新

用第一種方案,第二種,ik git社區官方都不建議採用,以爲不太穩定

一、下載源碼
https://github.com/medcl/elasticsearch-analysis-ik/tree/v5.2.0
ik分詞器,是個標準的java maven工程,直接導入eclipse就能夠看到源碼

二、修改源碼

Dictionary單例類的初始化方法initial,在這裏須要建立一個咱們自定義的線程,而且啓動它
/**
 * 詞典初始化 因爲IK Analyzer的詞典採用Dictionary類的靜態方法進行詞典初始化
 * 只有當Dictionary類被實際調用時,纔會開始載入詞典, 這將延長首次分詞操做的時間 該方法提供了一個在應用加載階段就初始化字典的手段
 * 
 * @return Dictionary
 */
public static synchronized Dictionary initial(Configuration cfg) {
	if (singleton == null) {
		synchronized (Dictionary.class) {
			if (singleton == null) {


				singleton = new Dictionary(cfg);
				singleton.loadMainDict();
				singleton.loadSurnameDict();
				singleton.loadQuantifierDict();
				singleton.loadSuffixDict();
				singleton.loadPrepDict();
				singleton.loadStopWordDict();
				
				new Thread(new HotDictReloadThread()).start();
				
				if(cfg.isEnableRemoteDict()){
					// 創建監控線程
					for (String location : singleton.getRemoteExtDictionarys()) {
						// 10 秒是初始延遲能夠修改的 60是間隔時間 單位秒
						pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
					}
					for (String location : singleton.getRemoteExtStopWordDictionarys()) {
						pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
					}
				}


				return singleton;
			}
		}
	}
	return singleton;
}
HotDictReloadThread類:就是死循環,不斷調用Dictionary.getSingleton().reLoadMainDict(),去從新加載詞典
public class HotDictReloadThread implements Runnable {

private static final Logger logger = ESLoggerFactory.getLogger(HotDictReloadThread.class.getName());

@Override
public void run() {
	while(true) {
		logger.info("[==========]reload hot dict from mysql......");   
		Dictionary.getSingleton().reLoadMainDict();
	}
}

}
Dictionary類:更新詞典 this.loadMySQLExtDict()
/**
 * 加載主詞典及擴展詞典
 */
private void loadMainDict() {
	// 創建一個主詞典實例
	_MainDict = new DictSegment((char) 0);

	// 讀取主詞典文件
	Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);

	InputStream is = null;
	try {
		is = new FileInputStream(file.toFile());
	} catch (FileNotFoundException e) {
		logger.error(e.getMessage(), e);
	}

	try {
		BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
		String theWord = null;
		do {
			theWord = br.readLine();
			if (theWord != null && !"".equals(theWord.trim())) {
				_MainDict.fillSegment(theWord.trim().toCharArray());
			}
		} while (theWord != null);

	} catch (IOException e) {
		logger.error("ik-analyzer", e);

	} finally {
		try {
			if (is != null) {
				is.close();
				is = null;
			}
		} catch (IOException e) {
			logger.error("ik-analyzer", e);
		}
	}
	// 加載擴展詞典
	this.loadExtDict();
	// 加載遠程自定義詞庫
	this.loadRemoteExtDict();
	// 從mysql加載詞典
	this.loadMySQLExtDict();
}

/**
 * 從mysql加載熱更新詞典
 */
private void loadMySQLExtDict() {
	Connection conn = null;
	Statement stmt = null;
	ResultSet rs = null;
	
	try {
		Path file = PathUtils.get(getDictRoot(), "jdbc-reload.properties");   
		prop.load(new FileInputStream(file.toFile()));
		
		logger.info("[==========]jdbc-reload.properties");
		for(Object key : prop.keySet()) {
			logger.info("[==========]" + key + "=" + prop.getProperty(String.valueOf(key)));      
		}
		
		logger.info("[==========]query hot dict from mysql, " + prop.getProperty("jdbc.reload.sql") + "......");  
		
		conn = DriverManager.getConnection(
				prop.getProperty("jdbc.url"),   
				prop.getProperty("jdbc.user"),  
				prop.getProperty("jdbc.password"));  
		stmt = conn.createStatement();
		rs = stmt.executeQuery(prop.getProperty("jdbc.reload.sql"));  
		
		while(rs.next()) {
			String theWord = rs.getString("word"); 
			logger.info("[==========]hot word from mysql: " + theWord); 
			_MainDict.fillSegment(theWord.trim().toCharArray());
		}
		 
		Thread.sleep(Integer.valueOf(String.valueOf(prop.get("jdbc.reload.interval"))));   
	} catch (Exception e) {
		logger.error("erorr", e); 
	} finally {
		if(rs != null) {
			try {
				rs.close();
			} catch (SQLException e) {
				logger.error("error", e); 
			}
		}
		if(stmt != null) {
			try {
				stmt.close();
			} catch (SQLException e) {
				logger.error("error", e); 
			}
		}
		if(conn != null) {
			try {
				conn.close();
			} catch (SQLException e) {
				logger.error("error", e); 
			}
		}
	}
}

Dictionary類:更新分詞 this.loadMySQLStopwordDict();
/**
 * 從mysql加載停用詞
 */
private void loadMySQLStopwordDict() {
	Connection conn = null;
	Statement stmt = null;
	ResultSet rs = null;
	
	try {
		Path file = PathUtils.get(getDictRoot(), "jdbc-reload.properties");   
		prop.load(new FileInputStream(file.toFile()));
		
		logger.info("[==========]jdbc-reload.properties");
		for(Object key : prop.keySet()) {
			logger.info("[==========]" + key + "=" + prop.getProperty(String.valueOf(key)));      
		}
		
		logger.info("[==========]query hot stopword dict from mysql, " + prop.getProperty("jdbc.reload.stopword.sql") + "......");  
		
		conn = DriverManager.getConnection(
				prop.getProperty("jdbc.url"),   
				prop.getProperty("jdbc.user"),  
				prop.getProperty("jdbc.password"));  
		stmt = conn.createStatement();
		rs = stmt.executeQuery(prop.getProperty("jdbc.reload.stopword.sql"));  
		
		while(rs.next()) {
			String theWord = rs.getString("word"); 
			logger.info("[==========]hot stopword from mysql: " + theWord); 
			_StopWords.fillSegment(theWord.trim().toCharArray());
		}
		 
		Thread.sleep(Integer.valueOf(String.valueOf(prop.get("jdbc.reload.interval"))));   
	} catch (Exception e) {
		logger.error("erorr", e); 
	} finally {
		if(rs != null) {
			try {
				rs.close();
			} catch (SQLException e) {
				logger.error("error", e); 
			}
		}
		if(stmt != null) {
			try {
				stmt.close();
			} catch (SQLException e) {
				logger.error("error", e); 
			}
		}
		if(conn != null) {
			try {
				conn.close();
			} catch (SQLException e) {
				logger.error("error", e); 
			}
		}
	}
}
配置
jdbc.url=jdbc:mysql://localhost:3306/test?serverTimezone=GMT
jdbc.user=root
jdbc.password=root
jdbc.reload.sql=select word from hot_words
jdbc.reload.stopword.sql=select stopword as word from hot_stopwords
jdbc.reload.interval=1000

三、mvn package打包代碼

target\releases\elasticsearch-analysis-ik-5.2.0.zip


四、解壓縮ik壓縮包

將mysql驅動jar,放入ik的目錄下


五、重啓es



六、在mysql中添加詞庫與停用詞



七、kibana分詞驗證

GET /my_index/_analyze
{
  "text": "一人飲酒醉",
  "analyzer": "ik_max_word"
}

{
  "tokens": [
    {
      "token": "一人飲酒醉",
      "start_offset": 0,
      "end_offset": 5,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "一人",
      "start_offset": 0,
      "end_offset": 2,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "一",
      "start_offset": 0,
      "end_offset": 1,
      "type": "TYPE_CNUM",
      "position": 2
    },
    {
      "token": "人",
      "start_offset": 1,
      "end_offset": 2,
      "type": "COUNT",
      "position": 3
    },
    {
      "token": "飲酒",
      "start_offset": 2,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 4
    },
    {
      "token": "飲",
      "start_offset": 2,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "酒醉",
      "start_offset": 3,
      "end_offset": 5,
      "type": "CN_WORD",
      "position": 6
    },
    {
      "token": "酒",
      "start_offset": 3,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 7
    },
    {
      "token": "醉",
      "start_offset": 4,
      "end_offset": 5,
      "type": "CN_WORD",
      "position": 8
    }
  ]
}
相關文章
相關標籤/搜索