DFA算法:即確定有窮自動機(Deterministic Finite Automaton)。簡單點說,它是通過event和當前的state得到下一個state,即event+state=nextstate。可以理解爲系統中有多個節點,通過傳入的event來確定走哪一條路由到達另一個節點,而節點的數量是有限的。
廢話不多說,直接貼上代碼:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Sensitive-word filter backed by a character trie (the "DFA" approach).
 *
 * <p>Trie layout: every node is a {@code Map} whose {@code Character} keys lead to
 * child nodes and whose {@code "isEnd"} key holds {@code "1"} when a word ends at
 * that node and {@code "0"} otherwise. The root node ({@link #sensitiveWordMap})
 * has no {@code "isEnd"} entry.
 *
 * <p>Only characters accepted by {@link #pattern} take part in matching: other
 * characters are dropped from words when they are stored, and are skipped (but
 * counted as part of the match) when they appear inside a word in the scanned text.
 *
 * <p>A match must span at least two characters, so a stored one-character word is
 * never reported.
 *
 * <p>All state is static and unsynchronized.
 *
 * @author maojialong (2018-01-30); file loader originally by chenming (2014-04-20)
 */
public class SensitivewordEngine {

    /** Character encoding of the word-list file. */
    private static final String ENCODING = "GBK";

    /** Trie key that carries the end-of-word marker. */
    private static final String END_KEY = "isEnd";

    /** Marker value: a word ends at this node. */
    private static final String END_YES = "1";

    /** Marker value: no word ends at this node. */
    private static final String END_NO = "0";

    /** Root of the shared trie. */
    @SuppressWarnings("rawtypes")
    public static Map sensitiveWordMap = new HashMap();

    /** Match type: stop at the shortest word that starts at a position. */
    public static int minMatchTYpe = 1;

    /** Match type: extend to the longest word that starts at a position. */
    public static int maxMatchType = 2;

    /** Accepts ASCII letters, digits and CJK unified ideographs. */
    public static Pattern pattern = Pattern.compile("^[a-zA-Z0-9\u4E00-\u9FA5]+$");

    /**
     * Loads the word list from {@code D:\SensitiveWord.txt} (one word per line,
     * GBK encoded) and adds every line to the shared trie.
     *
     * @throws IOException if the file is missing or cannot be read
     */
    private void readSensitiveWordFile() throws IOException {
        File file = new File("D:\\SensitiveWord.txt");
        // Check before opening so that a missing file yields the intended message.
        if (!file.isFile()) {
            throw new FileNotFoundException("敏感詞庫文件不存在");
        }
        Set<String> words = new HashSet<String>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), ENCODING))) {
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
        }
        addNewSensitiveWord(words);
    }

    /**
     * Adds words to the shared trie. Existing words are kept.
     *
     * @param keyWordSet words to add; {@code null} and {@code null} elements are ignored
     */
    @SuppressWarnings("rawtypes")
    public static void addNewSensitiveWord(Set<String> keyWordSet) {
        if (keyWordSet == null) {
            return;
        }
        Map root = root();
        for (String word : keyWordSet) {
            insertWord(root, word);
        }
    }

    /**
     * Inserts the given words into the shared trie and returns an empty map.
     *
     * <p>This method has always written straight into {@link #sensitiveWordMap} and
     * returned an empty map. That contract is kept on purpose: callers combine it as
     * {@code sensitiveWordMap.putAll(getNewSensitiveWordToHashMap(words))}, and a
     * populated return value would overwrite subtrees that share a first character.
     * Prefer {@link #addNewSensitiveWord(Set)}.
     *
     * @param keyWordSet words to add; {@code null} is ignored
     * @return a new, empty map
     */
    @SuppressWarnings("rawtypes")
    public static HashMap getNewSensitiveWordToHashMap(Set<String> keyWordSet) {
        addNewSensitiveWord(keyWordSet);
        return new HashMap();
    }

    /**
     * Counts the words stored in the shared trie.
     *
     * @return number of nodes marked as end-of-word; 0 when the trie is {@code null}
     */
    public static int getWordSize() {
        if (sensitiveWordMap == null) {
            return 0;
        }
        return countWords(sensitiveWordMap);
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     *
     * @param txt       text to scan; {@code null} yields {@code false}
     * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
     * @return {@code true} as soon as a word is found at any position
     */
    public static boolean isContaintSensitiveWord(String txt, int matchType) {
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the sensitive words found in the text, scanning left to right and
     * resuming after each match.
     *
     * @param txt       text to scan; {@code null} yields an empty set
     * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
     * @return the matched substrings exactly as they appear in {@code txt}
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> found = new HashSet<String>();
        if (txt == null) {
            return found;
        }
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                i += length;
            } else {
                i++;
            }
        }
        return found;
    }

    /**
     * Masks every sensitive word in the text.
     *
     * <p>The text is rewritten in a single left-to-right pass at the positions where
     * words are found, so the result does not depend on the order in which words are
     * visited and regex metacharacters in the text or the mask are taken literally.
     *
     * @param txt         text to mask; {@code null} is returned unchanged
     * @param matchType   {@link #minMatchTYpe} or {@link #maxMatchType}
     * @param replaceChar mask, repeated once per matched character
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
        if (txt == null) {
            return null;
        }
        StringBuilder masked = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                masked.append(getReplaceChars(replaceChar, length));
                i += length;
            } else {
                masked.append(txt.charAt(i));
                i++;
            }
        }
        return masked.toString();
    }

    /**
     * Builds the mask for one match.
     *
     * @param replaceChar mask unit
     * @param length      number of repetitions
     * @return {@code replaceChar} repeated {@code length} times
     */
    private static String getReplaceChars(String replaceChar, int length) {
        StringBuilder mask = new StringBuilder();
        for (int i = 0; i < length; i++) {
            mask.append(replaceChar);
        }
        return mask.toString();
    }

    /**
     * Measures the sensitive word that starts at {@code beginIndex}.
     *
     * <p>Characters rejected by {@link #pattern} are skipped and counted while a
     * word has been started but not yet completed. The reported length is the one
     * reached at the last complete word, never at a dangling prefix of a longer word.
     *
     * @param txt        text to scan; {@code null} yields 0
     * @param beginIndex index of the first character to examine
     * @param matchType  {@link #minMatchTYpe} stops at the first complete word; any
     *                   other value keeps extending to the longest one
     * @return length of the match in characters of {@code txt}, or 0 when there is
     *         none or it is shorter than two characters
     */
    @SuppressWarnings("rawtypes")
    public static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        Map node = sensitiveWordMap;
        if (txt == null || node == null) {
            return 0;
        }
        int scanned = 0;       // characters consumed so far, separators included
        int matchedLength = 0; // value of scanned at the last complete word
        for (int i = beginIndex; i < txt.length(); i++) {
            char c = txt.charAt(i);
            if (scanned > 0 && matchedLength == 0 && !isWordChar(c)) {
                scanned++;
                continue;
            }
            Object child = node.get(c);
            if (!(child instanceof Map)) {
                break;
            }
            node = (Map) child;
            scanned++;
            if (END_YES.equals(node.get(END_KEY))) {
                matchedLength = scanned;
                if (minMatchTYpe == matchType) {
                    break;
                }
            }
        }
        return matchedLength < 2 ? 0 : matchedLength;
    }

    /**
     * Removes one word from the shared trie.
     *
     * <p>The word is normalized like on insertion (characters rejected by
     * {@link #pattern} are ignored). Its end marker is cleared, then nodes that no
     * longer lead to any word are pruned from the leaf upwards. Longer words that
     * share the prefix, and shorter words that are a prefix of it, are kept.
     *
     * @param keyWord word to remove; {@code null} or an unknown word is a no-op
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static void removeSensitiveWordToHashMap(String keyWord) {
        if (keyWord == null || sensitiveWordMap == null) {
            return;
        }
        List<Map> parents = new ArrayList<Map>();
        List<Character> keys = new ArrayList<Character>();
        Map node = sensitiveWordMap;
        for (int i = 0; i < keyWord.length(); i++) {
            char c = keyWord.charAt(i);
            if (!isWordChar(c)) {
                continue;
            }
            Object child = node.get(c);
            if (!(child instanceof Map)) {
                return; // the word is not stored
            }
            parents.add(node);
            keys.add(c);
            node = (Map) child;
        }
        if (keys.isEmpty() || !END_YES.equals(node.get(END_KEY))) {
            return;
        }
        node.put(END_KEY, END_NO);
        for (int i = keys.size() - 1; i >= 0; i--) {
            Map parent = parents.get(i);
            Character key = keys.get(i);
            Map child = (Map) parent.get(key);
            // Stop at the first node that still has children or still ends a word.
            if (child.size() > 1 || END_YES.equals(child.get(END_KEY))) {
                break;
            }
            parent.remove(key);
        }
    }

    /** Returns the shared trie root, creating it when it has been set to null. */
    @SuppressWarnings("rawtypes")
    private static Map root() {
        if (sensitiveWordMap == null) {
            sensitiveWordMap = new HashMap();
        }
        return sensitiveWordMap;
    }

    /** Tells whether the character is accepted by {@link #pattern}. */
    private static boolean isWordChar(char c) {
        return pattern.matcher(String.valueOf(c)).matches();
    }

    /** Inserts one word below {@code root}, ignoring characters rejected by the pattern. */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static void insertWord(Map root, String word) {
        if (word == null) {
            return;
        }
        Map node = root;
        boolean inserted = false;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            if (!isWordChar(c)) {
                continue;
            }
            Object child = node.get(c);
            if (!(child instanceof Map)) {
                Map fresh = new HashMap();
                fresh.put(END_KEY, END_NO);
                node.put(c, fresh);
                child = fresh;
            }
            node = (Map) child;
            inserted = true;
        }
        // Mark the end after the loop so that trailing skipped characters cannot hide it.
        if (inserted) {
            node.put(END_KEY, END_YES);
        }
    }

    /** Counts the end-of-word nodes in the subtree rooted at {@code node}. */
    @SuppressWarnings("rawtypes")
    private static int countWords(Map node) {
        int count = END_YES.equals(node.get(END_KEY)) ? 1 : 0;
        for (Object value : node.values()) {
            if (value instanceof Map) {
                count += countWords((Map) value);
            }
        }
        return count;
    }

    /**
     * Demo: add words, mask a sentence, add more words, then remove one.
     *
     * @param args unused
     * @throws InterruptedException never thrown; kept for signature compatibility
     */
    public static void main(String[] args) throws InterruptedException {
        Set<String> sensitiveWord = new HashSet<String>();
        sensitiveWord.add("大娃");
        SensitivewordEngine.addNewSensitiveWord(sensitiveWord);
        String result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,咱們都是葫蘆娃", 2, "*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);

        // Add one or several more words.
        sensitiveWord.add("大二娃");
        sensitiveWord.add("大二");
        SensitivewordEngine.addNewSensitiveWord(sensitiveWord);
        result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,咱們如今讀大二,咱們都是葫蘆娃", 2, "*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);

        // Remove a word.
        SensitivewordEngine.removeSensitiveWordToHashMap("大二娃");
        result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,咱們如今讀大二,咱們都是葫蘆娃", 2, "*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);
    }
}
複製代碼即可食用。最後的removeSensitiveWordToHashMap是我一個朋友幫忙寫的,其餘方法是參考網上的其他博文整理的。