雙數組字典樹的java實現

時間 2019-12-19

標籤雙數字典 java 實現欄目 Java 简体版

原文原文鏈接

雙數組字典樹的算法思想這裏就不再詳述，有興趣的可以自己谷歌一下。

廢話少說，java代碼如下：

/**
*
*/
package com.kongfz.service.banned.check;

/**
* 雙數組字典樹查找敏感詞算法
*
* 讀代碼前，請先了解字典樹和雙數組字典樹算法思想
* @author Administrator
*
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * Double-array trie with tail compression, used to store a dictionary of
 * banned words and to look them up inside arbitrary text.
 *
 * <p>State encoding used throughout this class (state {@code s} is an index
 * into {@code base}/{@code check}):
 * <ul>
 *   <li>{@code base[s] == 0 && check[s] == 0}: the slot is free;</li>
 *   <li>{@code base[s] > 0}: branch node; the child reached with character
 *       code {@code c} lives at {@code base[s] + c} and has
 *       {@code check[child] == s};</li>
 *   <li>{@code base[s] < 0}: leaf whose remaining suffix, terminated by
 *       {@link #END_CHAR}, is stored in {@code tail} starting at
 *       {@code -base[s]};</li>
 *   <li>{@code base[s] == 0 && check[s] != 0}: leaf reached through
 *       {@link #END_CHAR}, i.e. a word ends at the parent node.</li>
 * </ul>
 * The root is state {@code 1}. This class performs no synchronization.
 */
public class DoubleArrayTrie {

    /** Terminator appended to every stored word; it must not occur inside a word. */
    private static final char END_CHAR = '\0';

    /** Initial length of the base, check and tail arrays. */
    private static final int DEFAULT_LEN = 1024;

    /** Index of the root state. */
    private static final int ROOT = 1;

    /** Base offsets (see the class comment for the encoding). */
    private int base[] = new int[DEFAULT_LEN];

    /** Parent state of every occupied slot. */
    private int check[] = new int[DEFAULT_LEN];

    /** Storage for the compressed (non-branching) word suffixes. */
    private char tail[] = new char[DEFAULT_LEN];

    /** Next free position in {@code tail}. */
    int pos = 1;

    /** Maps a character to its code (codes start at 1). */
    Map<Character, Integer> charMap = new HashMap<Character, Integer>();

    /** Maps a code back to its character; index 0 is an unused placeholder. */
    ArrayList<Character> charList = new ArrayList<Character>();

    /**
     * Creates an empty trie whose alphabet initially holds the terminator
     * (code 1) and the letters {@code 'a'..'z'} (codes 2..27).
     */
    public DoubleArrayTrie() {
        base[ROOT] = 1;

        charMap.put(END_CHAR, 1);
        charList.add(END_CHAR); // placeholder so that list index == code
        charList.add(END_CHAR);
        for (int i = 0; i < 26; ++i) {
            char letter = (char) ('a' + i);
            charMap.put(letter, charMap.size() + 1);
            charList.add(letter);
        }
    }

    /** Doubles the base/check arrays until {@code index} is a valid slot. */
    private void ensureStateCapacity(int index) {
        while (index >= base.length) {
            base = Arrays.copyOf(base, base.length * 2);
            check = Arrays.copyOf(check, check.length * 2);
        }
    }

    /**
     * Returns the code of {@code c}, registering the character first if it
     * is not part of the alphabet yet.
     */
    private int getAndAddCharCode(char c) {
        Integer code = charMap.get(c);
        if (code == null) {
            code = charMap.size() + 1;
            charMap.put(c, code);
            charList.add(c);
        }
        return code;
    }

    /** Returns the code of {@code c}, or {@code -1} if it is not registered. */
    private int getCharCode(char c) {
        Integer code = charMap.get(c);
        return code == null ? -1 : code.intValue();
    }

    /**
     * Appends {@code s.substring(p)} to the tail array at {@code pos},
     * growing the array so that at least one zero cell remains behind the
     * copied text.
     *
     * @return the tail position following the copied characters
     */
    private int copyToTail(String s, int p) {
        while (s.length() - p + 1 > tail.length - pos) {
            tail = Arrays.copyOf(tail, tail.length * 2);
        }
        int next = pos;
        for (int i = p; i < s.length(); ++i) {
            tail[next++] = s.charAt(i);
        }
        return next;
    }

    /**
     * Finds the smallest base value for which the slots of all given codes
     * are free, growing the state arrays as required.
     */
    private int findFreeBase(ArrayList<Integer> codes) {
        for (int candidate = 1;; ++candidate) {
            boolean fits = true;
            for (int code : codes) {
                int slot = candidate + code;
                ensureStateCapacity(slot);
                if (base[slot] != 0 || check[slot] != 0) {
                    fits = false;
                    break;
                }
            }
            if (fits) {
                return candidate;
            }
        }
    }

    /** Returns the character codes of all children of branch node {@code p}. */
    private ArrayList<Integer> getChildList(int p) {
        ArrayList<Integer> codes = new ArrayList<Integer>();
        int origin = base[p];
        for (int code = 1; code <= charMap.size(); ++code) {
            int slot = origin + code;
            if (slot >= check.length) {
                break;
            }
            if (check[slot] == p) {
                codes.add(code);
            }
        }
        return codes;
    }

    /**
     * Follows the transition labelled {@code c} out of {@code state} without
     * modifying the trie.
     *
     * @return the child state, or {@code -1} if {@code state} is not a
     *         branch node or has no such child
     */
    private int transition(int state, char c) {
        int code = getCharCode(c);
        if (code < 0 || base[state] <= 0) {
            return -1;
        }
        int next = base[state] + code;
        if (next >= check.length || check[next] != state) {
            return -1;
        }
        return next;
    }

    /**
     * Compares the suffix stored in the tail at {@code tailStart} with
     * {@code text} starting at {@code from}.
     *
     * @return the exclusive end index in {@code text} when the whole stored
     *         suffix is found there, otherwise {@code -1}
     */
    private int matchTail(int tailStart, String text, int from) {
        int t = tailStart;
        int j = from;
        while (t < tail.length && tail[t] != END_CHAR) {
            if (j >= text.length() || text.charAt(j) != tail[t]) {
                return -1;
            }
            ++t;
            ++j;
        }
        return t < tail.length ? j : -1;
    }

    /**
     * Marks {@code state} as a leaf below {@code parent} for the character
     * {@code key.charAt(i)} and stores the rest of {@code key} in the tail.
     */
    private void claimLeaf(int state, int parent, String key, int i) {
        check[state] = parent;
        if (key.charAt(i) == END_CHAR) {
            // Nothing follows the terminator, so no tail space is needed.
            base[state] = 0;
        } else {
            base[state] = -pos;
            pos = copyToTail(key, i + 1);
        }
    }

    /**
     * Moves all children of {@code parent} to a new base at which the slot
     * for {@code newCode} is free as well, and re-parents the grandchildren.
     */
    private void relocateChildren(int parent, int newCode) {
        ArrayList<Integer> codes = getChildList(parent);
        int oldBase = base[parent];

        codes.add(newCode);
        int newBase = findFreeBase(codes);
        codes.remove(codes.size() - 1);

        base[parent] = newBase;
        for (int code : codes) {
            int from = oldBase + code;
            int to = newBase + code;
            base[to] = base[from];
            check[to] = check[from];
            if (base[from] > 0) {
                // The moved node is a branch: its children must point to
                // the new location.
                for (int grandChild : getChildList(from)) {
                    check[base[from] + grandChild] = to;
                }
            }
            base[from] = 0;
            check[from] = 0;
        }
    }

    /**
     * Inserts a word into the dictionary. Inserting a word that is already
     * present leaves the trie unchanged.
     *
     * @param word the word to store; must not be {@code null} and must not
     *             contain the terminator character {@code '\0'}
     * @throws IllegalArgumentException if {@code word} is {@code null} or
     *                                  contains {@code '\0'}
     * @throws Exception declared for backward compatibility only
     */
    public void insertWord(String word) throws Exception {
        if (word == null) {
            throw new IllegalArgumentException("word must not be null");
        }
        if (word.indexOf(END_CHAR) >= 0) {
            throw new IllegalArgumentException("word must not contain the terminator character");
        }
        String key = word + END_CHAR;
        int parent = ROOT;
        for (int i = 0; i < key.length(); ++i) {
            int code = getAndAddCharCode(key.charAt(i));
            int cur = base[parent] + code;
            ensureStateCapacity(cur);

            // Free slot: the rest of the word becomes a single leaf.
            if (base[cur] == 0 && check[cur] == 0) {
                claimLeaf(cur, parent, key, i);
                return;
            }

            // Slot belongs to another parent: move our children elsewhere.
            if (check[cur] != parent) {
                relocateChildren(parent, code);
                claimLeaf(base[parent] + code, parent, key, i);
                return;
            }

            // From here on cur is an existing child of parent.
            if (base[cur] > 0) {
                parent = cur;
                continue;
            }
            if (base[cur] == 0) {
                // Terminator leaf already present: duplicate word.
                return;
            }

            // cur is a tail leaf: compare the first stored suffix character
            // with the next character of the new word.
            int head = -base[cur];
            char stored = tail[head];
            char incoming = key.charAt(i + 1);

            if (stored == incoming) {
                if (stored == END_CHAR) {
                    // Both end here: duplicate word.
                    return;
                }
                // Shared character: turn cur into a branch with one child
                // that keeps the remainder of the stored suffix.
                int sharedCode = getAndAddCharCode(incoming);
                ArrayList<Integer> single = new ArrayList<Integer>();
                single.add(sharedCode);
                int newBase = findFreeBase(single);
                base[cur] = newBase;
                check[newBase + sharedCode] = cur;
                base[newBase + sharedCode] = -(head + 1);
                parent = cur;
                continue;
            }

            // The words diverge here: cur becomes a branch with two leaves.
            int incomingCode = getAndAddCharCode(incoming);
            int storedCode = getAndAddCharCode(stored);
            ArrayList<Integer> pair = new ArrayList<Integer>();
            pair.add(incomingCode);
            pair.add(storedCode);
            int newBase = findFreeBase(pair);
            base[cur] = newBase;

            int storedChild = newBase + storedCode;
            int incomingChild = newBase + incomingCode;
            check[storedChild] = cur;
            check[incomingChild] = cur;
            base[storedChild] = (stored == END_CHAR) ? 0 : -(head + 1);
            if (incoming == END_CHAR) {
                base[incomingChild] = 0;
            } else {
                base[incomingChild] = -pos;
                pos = copyToTail(key, i + 2);
            }
            return;
        }
    }

    /**
     * Locates the leaf state that terminates {@code word}.
     *
     * @return the leaf state, or {@code -1} if the word is not stored
     */
    private int findTerminal(String word) {
        int state = ROOT;
        for (int i = 0; i < word.length(); ++i) {
            int next = transition(state, word.charAt(i));
            if (next < 0) {
                return -1;
            }
            if (base[next] < 0) {
                // The rest of the stored word is in the tail and must be
                // exactly the rest of the query.
                return matchTail(-base[next], word, i + 1) == word.length() ? next : -1;
            }
            state = next;
        }
        return transition(state, END_CHAR);
    }

    /**
     * Tests whether the dictionary contains exactly the given word. The
     * lookup never modifies the trie or its alphabet.
     *
     * @param word the word to look up
     * @return {@code true} if the word was inserted and not deleted since;
     *         {@code false} otherwise, including for {@code null}
     */
    public boolean Exists(String word) {
        if (word == null || word.indexOf(END_CHAR) >= 0) {
            return false;
        }
        return findTerminal(word) >= 0;
    }

    /** Result of a text scan: the terminal state and the matched word. */
    class FindStruct {
        /** Leaf state of the matched word, or {@code -1} when nothing matched. */
        int p;
        /** The matched word; empty when nothing matched. */
        String prefix = "";
    }

    /**
     * Scans {@code word} for the first occurrence of a dictionary word.
     * Start positions are tried from left to right; at each start position
     * the shortest dictionary word is reported.
     *
     * @param word the text to scan
     * @return a result whose {@code p} is {@code -1} when no dictionary
     *         word occurs in the text
     */
    private FindStruct Find(String word) {
        FindStruct fs = new FindStruct();
        fs.p = -1;
        for (int start = 0; start < word.length(); ++start) {
            int state = ROOT;
            for (int i = start; i < word.length(); ++i) {
                char c = word.charAt(i);
                if (c == END_CHAR) {
                    break;
                }
                int next = transition(state, c);
                if (next < 0) {
                    break;
                }
                if (base[next] < 0) {
                    int end = matchTail(-base[next], word, i + 1);
                    if (end >= 0) {
                        fs.p = next;
                        fs.prefix = word.substring(start, end);
                        return fs;
                    }
                    break;
                }
                state = next;
                int terminal = transition(state, END_CHAR);
                if (terminal >= 0) {
                    fs.p = terminal;
                    fs.prefix = word.substring(start, i + 1);
                    return fs;
                }
            }
        }
        return fs;
    }

    /**
     * Collects every word suffix that can be read from the given state down
     * to a leaf.
     *
     * @param index a state index; {@code 1} is the root and yields all
     *              stored words
     * @return the suffixes below {@code index}; a slot whose base is 0
     *         yields a single empty string
     */
    public ArrayList<String> GetAllChildWord(int index) {
        ArrayList<String> result = new ArrayList<String>();
        if (base[index] == 0) {
            result.add("");
            return result;
        }
        if (base[index] < 0) {
            StringBuilder suffix = new StringBuilder();
            for (int i = -base[index]; i < tail.length && tail[i] != END_CHAR; ++i) {
                suffix.append(tail[i]);
            }
            result.add(suffix.toString());
            return result;
        }
        for (int code : getChildList(index)) {
            char c = charList.get(code);
            for (String rest : GetAllChildWord(base[index] + code)) {
                // The terminator only marks the end of a word and is not
                // part of its text.
                result.add(c == END_CHAR ? rest : c + rest);
            }
        }
        return result;
    }

    /**
     * Returns the first banned word that occurs in the given text.
     *
     * @param word the text to scan
     * @return a list holding the first match (earliest start position,
     *         shortest word at that position), or an empty list when the
     *         text is {@code null} or contains no dictionary word
     */
    public ArrayList<String> findBannedWord(String word) {
        ArrayList<String> result = new ArrayList<String>();
        if (word == null) {
            return result;
        }
        FindStruct fs = Find(word);
        if (fs.p == -1) {
            return result;
        }
        result.add(fs.prefix);
        return result;
    }

    /**
     * Removes a word from the dictionary. Branch nodes that are left
     * without children are released as well.
     *
     * @param word the word to remove
     * @return {@code true} if the word was present and has been removed
     */
    public boolean delBannedWord(String word) {
        if (word == null || word.indexOf(END_CHAR) >= 0) {
            return false;
        }
        int leaf = findTerminal(word);
        if (leaf < 0) {
            return false;
        }
        if (base[leaf] < 0) {
            // Wipe the suffix owned by this leaf.
            for (int i = -base[leaf]; i < tail.length && tail[i] != END_CHAR; ++i) {
                tail[i] = END_CHAR;
            }
        }
        int node = leaf;
        while (node != ROOT) {
            int parent = check[node];
            base[node] = 0;
            check[node] = 0;
            if (parent == ROOT || !getChildList(parent).isEmpty()) {
                break;
            }
            node = parent;
        }
        return true;
    }

    /**
     * Small demonstration: builds a dictionary, deletes and re-inserts some
     * words and scans a few sample texts.
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        DoubleArrayTrie dat = new DoubleArrayTrie();

        dat.insertWord("學校");
        dat.insertWord("學習");
        dat.insertWord("調查");
        dat.delBannedWord("調查");
        dat.insertWord("調查");
        dat.insertWord("北京");
        dat.delBannedWord("學校");
        dat.insertWord("學生");

        System.out.println("The update total time is "
                + ((System.currentTimeMillis() - start)) + "ms");

        System.out.println(dat.base.length);
        System.out.println(dat.tail.length);
        System.out.println("The init total time is "
                + ((System.currentTimeMillis() - start)) + "ms");

        String resStr = "聖鬥士星矢在月宮敗在了嫦娥和玉兔手下，由於星矢不是本地人，也是個不起眼的角色,goodbye."
                + "京華時報訊(記者懷若谷實習記者常鑫)昨天，本報報道河南省潢川縣王香鋪村村民蔡先生家林地被燒一事，"
                + "當事雙方對起火緣由說法不一。昨晚，潢川縣森林公安派出所稱，起火林地類型爲未成林造林地，不予刑事立案。"
                + "潢川縣森林公安派出所李警官對京華時報記者表示，他們聘請淮濱縣林業局工程師從新對起火地點進行勘測，"
                + "「總過火面積爲21991.9平方米，其中有2240平方米範圍內僅有零星樹木痕跡。該林地類型爲未成林造林地。」"
                + "李警官稱，未成林造林地起火不牽扯刑事責任，所以不予刑事立案，但具體起火緣由仍在調查。";
        System.out.println(dat.findBannedWord(resStr));

        // Each call reports the first banned word contained in the text.
        String findStr = "學生是本地人學習";
        System.out.println(findStr + ":::::::" + dat.findBannedWord(findStr));

        findStr = "學校也在本地";
        System.out.println(findStr + ":::::::" + dat.findBannedWord(findStr));

        findStr = "本地的學生本";
        System.out.println(findStr + ":::::::" + dat.findBannedWord(findStr));

        findStr = "北京市的學生";
        System.out.println(findStr + ":::::::" + dat.findBannedWord(findStr));

        System.out.println("The total time is "
                + ((System.currentTimeMillis() - start)) + "ms");
    }
}
數組