DFA算法實現敏感詞過濾

DFA算法:即確定有窮自動機(Deterministic Finite Automaton)。簡單點說,它是通過event和當前的state得到下一個state,即event+state=nextstate。可以理解爲系統中有多個節點,根據傳入的event來確定走哪一條路由到達另一個節點,而節點的數量是有限的。

廢話不多說,直接貼上代碼:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @description: Sensitive-word filter backed by a character trie (a DFA over characters).
 *
 * <p>Every trie node is a {@code Map}: {@code Character} keys lead to child nodes and the
 * {@code "isEnd"} key holds {@code "1"} when a word ends at that node, {@code "0"} otherwise.
 * The root ({@link #sensitiveWordMap}) carries no {@code "isEnd"} entry.
 *
 * <p>Characters rejected by {@link #pattern} (anything other than ASCII letters, digits and
 * U+4E00..U+9FA5) are dropped when a word is stored, and may appear between the characters
 * of a word in scanned text until the first complete word has been recognised.
 *
 * <p>All state is static and no synchronization is performed.
 *
 * @author: maojialong
 * @date: 2018-01-30 10:59:24
 */
public class SensitivewordEngine {

    // Character encoding used when reading the word-list file.
    private String ENCODING = "GBK";

    // Root of the sensitive-word trie.
    public static Map sensitiveWordMap = new HashMap();

    // Match type: stop at the shortest word found at a position.
    public static int minMatchTYpe = 1;

    // Match type: keep going and report the longest word found at a position.
    public static int maxMatchType = 2;

    // Accepts strings made only of ASCII letters, digits and characters in U+4E00..U+9FA5.
    public static Pattern pattern = Pattern.compile("^[a-zA-Z0-9\u4E00-\u9FA5]+$");

    /**
     * Reads the word-list file (one word per line) and registers every line as a
     * sensitive word.
     *
     * <p>The existence check now happens before the file is opened, so a missing file is
     * reported with the intended message instead of a {@code FileNotFoundException}, and
     * the stream is always closed.
     *
     * @author chenming
     * @date 2014-04-20 14:31:18
     * @version 1.0
     * @throws Exception if the file does not exist or cannot be read/decoded
     */
    private void readSensitiveWordFile() throws Exception {
        File file = new File("D:\\SensitiveWord.txt");
        if (!file.isFile()) {
            throw new Exception("敏感詞庫文件不存在");
        }

        Set<String> words = new HashSet<String>();
        FileInputStream in = new FileInputStream(file);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING));
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
        } finally {
            // Closing the underlying stream releases the file even if decoding failed.
            in.close();
        }
        addNewSensitiveWord(words);
    }

    /**
     * @description: Adds the given words to the shared lexicon. Words already present are
     *               left untouched; {@code null} sets and {@code null} elements are ignored.
     * @author: maojialong
     * @date: 2018-02-01 11:55:10
     * @param keyWordSet words to register
     */
    public static void addNewSensitiveWord(Set<String> keyWordSet) {
        // The insertion itself is done by getNewSensitiveWordToHashMap; merging its return
        // value back with putAll (as before) was a no-op and is not needed.
        getNewSensitiveWordToHashMap(keyWordSet);
    }

    /**
     * @description: Inserts the given words into the shared lexicon
     *               ({@link #sensitiveWordMap}) and returns the top-level trie entries
     *               (first character -> subtree) the words were stored under.
     *
     * <p>The returned subtrees are the live nodes of the shared lexicon, not copies, so
     * {@code sensitiveWordMap.putAll(result)} does not change the lexicon.
     *
     * @author: maojialong
     * @date: 2018-01-30 16:28:58
     * @param keyWordSet words to register; may be {@code null}
     * @return map of the affected top-level entries; empty if nothing was stored
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static HashMap getNewSensitiveWordToHashMap(Set<String> keyWordSet) {
        HashMap touched = new HashMap(keyWordSet == null ? 0 : keyWordSet.size());
        if (keyWordSet == null) {
            return touched;
        }
        if (sensitiveWordMap == null) {
            sensitiveWordMap = new HashMap();
        }

        Iterator<String> words = keyWordSet.iterator();
        while (words.hasNext()) {
            Character first = insertWord(sensitiveWordMap, words.next());
            if (first != null) {
                touched.put(first, sensitiveWordMap.get(first));
            }
        }
        return touched;
    }

    /**
     * Stores one word in the trie rooted at {@code root}, skipping characters rejected by
     * {@link #pattern}.
     *
     * @return the first stored character, or {@code null} if the word contributed nothing
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static Character insertWord(Map root, String word) {
        if (word == null) {
            return null;
        }
        Map node = root;
        Character first = null;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            if (!isWordChar(c)) {
                continue;
            }
            Character key = Character.valueOf(c);
            if (first == null) {
                first = key;
            }
            Object child = node.get(key);
            if (child instanceof Map) {
                node = (Map) child;
            } else {
                Map created = new HashMap();
                created.put("isEnd", "0");
                node.put(key, created);
                node = created;
            }
        }
        // Mark the end on the last character actually stored. Doing this inside the loop
        // only for the last raw index missed words ending in a skipped character.
        if (first != null) {
            node.put("isEnd", "1");
        }
        return first;
    }

    /** Returns whether a single character is accepted by {@link #pattern}. */
    private static boolean isWordChar(char c) {
        Matcher matcher = pattern.matcher(String.valueOf(c));
        return matcher.matches();
    }

    /**
     * @description: Number of sensitive words stored in the lexicon, i.e. the number of
     *               trie nodes flagged as a word end.
     * @author: maojialong
     * @date: 2018-01-30 16:07:20
     * @return word count; 0 when the lexicon is {@code null} or empty
     */
    public static int getWordSize() {
        if (SensitivewordEngine.sensitiveWordMap == null) {
            return 0;
        }
        return countWords(SensitivewordEngine.sensitiveWordMap);
    }

    /** Counts word-end nodes in the subtree rooted at {@code node}. */
    @SuppressWarnings("rawtypes")
    private static int countWords(Map node) {
        int count = "1".equals(node.get("isEnd")) ? 1 : 0;
        Iterator children = node.values().iterator();
        while (children.hasNext()) {
            Object child = children.next();
            if (child instanceof Map) {
                count += countWords((Map) child);
            }
        }
        return count;
    }

    /**
     * @description: Tells whether the text contains at least one sensitive word.
     * @author: maojialong
     * @date: 2018-01-30 14:47:37
     * @param txt text to scan; {@code null} yields {@code false}
     * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
     * @return {@code true} as soon as a word is found at any position
     */
    public static boolean isContaintSensitiveWord(String txt, int matchType) {
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * @description: Collects the sensitive words found in the text. Each entry is the
     *               matched substring of {@code txt}, including any skipped separator
     *               characters inside it. Matches do not overlap.
     * @author: maojialong
     * @date: 2018-01-30 14:47:27
     * @param txt text to scan; {@code null} yields an empty set
     * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
     * @return the distinct matched substrings
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> found = new HashSet<String>();
        if (txt == null) {
            return found;
        }

        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                i += length;
            } else {
                i++;
            }
        }
        return found;
    }

    /**
     * @description: Replaces every sensitive word in the text with {@code replaceChar}
     *               repeated once per matched character.
     *
     * <p>Replacement is done by position while scanning, without regular expressions, so
     * matches containing regex metacharacters and replacements containing {@code $} or
     * {@code \} are handled literally, and the result does not depend on set iteration
     * order.
     *
     * @author: maojialong
     * @date: 2018-01-30 14:47:15
     * @param txt text to filter; {@code null} is returned unchanged
     * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
     * @param replaceChar replacement for each matched character
     * @return the filtered text
     */
    public static String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
        if (txt == null) {
            return null;
        }
        StringBuilder result = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                result.append(getReplaceChars(replaceChar, length));
                i += length;
            } else {
                result.append(txt.charAt(i));
                i++;
            }
        }
        return result.toString();
    }

    /**
     * @description: Builds the replacement text for a match.
     * @author: maojialong
     * @date: 2018-01-30 14:46:40
     * @param replaceChar unit to repeat
     * @param length number of repetitions
     * @return {@code replaceChar} concatenated {@code length} times
     */
    private static String getReplaceChars(String replaceChar, int length) {
        StringBuilder repeated = new StringBuilder();
        for (int i = 0; i < length; i++) {
            repeated.append(replaceChar);
        }
        return repeated.toString();
    }

    /**
     * @description: Checks whether a sensitive word starts at {@code beginIndex}.
     *
     * <p>Until the first complete word has been recognised, characters rejected by
     * {@link #pattern} that follow the first matched character are skipped and counted
     * in the length. In max-match mode the scan continues past a word end, but the
     * returned length is that of the last complete word, not of a longer prefix that
     * never completed. Matches shorter than two characters are reported as no match.
     *
     * @author: maojialong
     * @date: 2018-01-30 14:45:50
     * @param txt text to scan; {@code null} yields 0
     * @param beginIndex index in {@code txt} at which the word must start
     * @param matchType {@link #minMatchTYpe} or {@link #maxMatchType}
     * @return number of characters of {@code txt} covered by the match, or 0 if none
     */
    @SuppressWarnings("rawtypes")
    public static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        if (txt == null || SensitivewordEngine.sensitiveWordMap == null) {
            return 0;
        }
        Map node = SensitivewordEngine.sensitiveWordMap;
        int consumed = 0;       // characters walked so far, skipped separators included
        int matchedLength = 0;  // value of 'consumed' at the last word end reached
        for (int i = beginIndex; i < txt.length(); i++) {
            char c = txt.charAt(i);
            if (consumed > 0 && matchedLength == 0 && !isWordChar(c)) {
                consumed++;
                continue;
            }
            Object next = node.get(Character.valueOf(c));
            if (!(next instanceof Map)) {
                break;
            }
            node = (Map) next;
            consumed++;
            if ("1".equals(node.get("isEnd"))) {
                matchedLength = consumed;
                if (SensitivewordEngine.minMatchTYpe == matchType) {
                    break;
                }
            }
        }
        return matchedLength < 2 ? 0 : matchedLength;
    }

    /**
     * @description: Removes a word from the lexicon. Characters rejected by
     *               {@link #pattern} are skipped, mirroring insertion. If the word is a
     *               prefix of a longer stored word only its end flag is cleared; otherwise
     *               its nodes are pruned bottom-up until a node that is still needed
     *               (a word end, or one with other children) is reached. Unknown words
     *               and {@code null} are ignored.
     * @author: maojialong
     * @date: 2018-02-01 11:40:45
     * @param keyWord word to remove
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static void removeSensitiveWordToHashMap(String keyWord) {
        if (keyWord == null || sensitiveWordMap == null) {
            return;
        }
        int length = keyWord.length();
        Map[] parents = new Map[length];  // parents[d] holds the node reached via keys[d]
        char[] keys = new char[length];
        int depth = 0;

        Map node = sensitiveWordMap;
        for (int i = 0; i < length; i++) {
            char c = keyWord.charAt(i);
            if (!isWordChar(c)) {
                continue;
            }
            Object next = node.get(Character.valueOf(c));
            if (!(next instanceof Map)) {
                return;  // path does not exist: the word is not stored
            }
            parents[depth] = node;
            keys[depth] = c;
            depth++;
            node = (Map) next;
        }
        if (depth == 0 || !"1".equals(node.get("isEnd"))) {
            return;
        }

        if (node.size() > 1) {
            // Longer words continue through this node: keep it, just unflag it.
            node.put("isEnd", "0");
            return;
        }

        for (int d = depth - 1; d >= 0; d--) {
            Map parent = parents[d];
            parent.remove(Character.valueOf(keys[d]));
            // Stop at the root, or once the parent is still needed by another word.
            if (d == 0 || parent.size() > 1 || "1".equals(parent.get("isEnd"))) {
                break;
            }
        }
    }

    /** Small demonstration: add words, filter a sentence, remove a word, filter again. */
    public static void main(String[] args) throws InterruptedException {
        Set<String> sensitiveWord = new HashSet<String>();
        sensitiveWord.add("大娃");
        SensitivewordEngine.addNewSensitiveWord(sensitiveWord);
        String result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,咱們都是葫蘆娃", 2,"*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);

        // Add one or several more words.
        sensitiveWord.add("大二娃");
        sensitiveWord.add("大二");
        SensitivewordEngine.addNewSensitiveWord(sensitiveWord);
        result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,咱們如今讀大二,咱們都是葫蘆娃", 2,"*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);

        // Remove a word.
        SensitivewordEngine.removeSensitiveWordToHashMap("大二娃");
        result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,咱們如今讀大二,咱們都是葫蘆娃", 2,"*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);
    }
}

複製代碼即可食用。最後的removeSensitiveWordToHashMap是我一個朋友幫忙寫的,其他方法是參考網上的其他博文整理的。

相關文章
相關標籤/搜索