雙數組字典樹的算法思想這裏就不再詳述,有興趣的可以自己谷歌一下。
廢話少說,Java 代碼如下:
/**
*
*/
package com.kongfz.service.banned.check;
/**
* 雙數組字典樹查找敏感詞算法
*
* 讀代碼前,請先了解字典樹和雙數組字典樹算法思想
* @author Administrator
*
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
 * Double-array trie that stores a dictionary of banned (sensitive) words and looks
 * them up inside arbitrary text.
 *
 * <p>State layout used by this class:
 * <ul>
 *   <li>A state {@code s} is free when {@code base[s] == 0 && check[s] == 0}.</li>
 *   <li>{@code check[s]} holds the parent state of {@code s}; the root is state 1.</li>
 *   <li>{@code base[s] > 0}: branching node, the child for character code {@code c}
 *       lives at {@code base[s] + c}.</li>
 *   <li>{@code base[s] < 0}: leaf whose remaining characters (terminated by
 *       {@code END_CHAR}) are stored in {@code tail} starting at {@code -base[s]}.</li>
 *   <li>{@code base[s] == 0} with a non-zero {@code check[s]}: terminal state reached
 *       through the {@code END_CHAR} transition, i.e. a word ends here.</li>
 * </ul>
 *
 * <p>The class performs no synchronization.
 */
public class DoubleArrayTrie {
    /** Terminator appended to every stored word. */
    private static final char END_CHAR = '\0';
    /** Character code reserved for {@link #END_CHAR}. */
    private static final int END_CODE = 1;
    /** State index of the trie root. */
    private static final int ROOT = 1;
    /** Initial length of the base, check and tail arrays. */
    private static final int DEFAULT_LEN = 1024;

    /** Base offsets (see class comment for the meaning of the sign). */
    private int[] base = new int[DEFAULT_LEN];
    /** Parent state of every used state. */
    private int[] check = new int[DEFAULT_LEN];
    /** Compressed word suffixes, each terminated by {@link #END_CHAR}. */
    private char[] tail = new char[DEFAULT_LEN];
    /** Next free index in {@link #tail}. */
    int pos = 1;
    /** Maps a character to its 1-based code. */
    Map<Character, Integer> charMap = new HashMap<Character, Integer>();
    /** Inverse of {@link #charMap}: index = code (index 0 is a placeholder). */
    ArrayList<Character> charList = new ArrayList<Character>();

    /**
     * Creates an empty trie and pre-registers the terminator and the letters a-z.
     */
    public DoubleArrayTrie() {
        base[ROOT] = 1;
        charMap.put(END_CHAR, END_CODE);
        // Added twice on purpose: codes are 1-based, so index 0 is only a filler.
        charList.add(END_CHAR);
        charList.add(END_CHAR);
        for (int i = 0; i < 26; ++i) {
            charMap.put((char) ('a' + i), charMap.size() + 1);
            charList.add((char) ('a' + i));
        }
    }

    /**
     * Grows the base and check arrays (by doubling) until {@code index} is valid.
     *
     * @param index state index that must become addressable
     */
    private void ensureStateCapacity(int index) {
        // A single doubling is not always enough, hence the loop.
        while (index >= base.length) {
            base = Arrays.copyOf(base, base.length * 2);
            check = Arrays.copyOf(check, check.length * 2);
        }
    }

    /**
     * Returns the code of {@code c}, registering the character first if it is unknown.
     *
     * @param c character to look up
     * @return the 1-based character code
     */
    private int getAndAddCharCode(char c) {
        Integer code = charMap.get(c);
        if (code == null) {
            code = charMap.size() + 1;
            charMap.put(c, code);
            charList.add(c);
        }
        return code;
    }

    /**
     * Returns the code of {@code c} without modifying the character table.
     *
     * @param c character to look up
     * @return the character code, or -1 when the character is not registered
     */
    private int getCharCode(char c) {
        Integer code = charMap.get(c);
        return code == null ? -1 : code;
    }

    /**
     * Appends {@code s.substring(p)} to the tail array starting at {@link #pos}.
     *
     * @param s source string (already carrying the trailing terminator)
     * @param p index of the first character to copy
     * @return the tail index following the last copied character
     */
    private int copyToTail(String s, int p) {
        int writeAt = pos;
        // Keep one spare slot so that a terminator is always found inside the array.
        while (s.length() - p + 1 > tail.length - pos) {
            tail = Arrays.copyOf(tail, tail.length * 2);
        }
        for (int i = p; i < s.length(); ++i) {
            tail[writeAt++] = s.charAt(i);
        }
        return writeAt;
    }

    /**
     * Finds the smallest base offset at which every given character code maps to a
     * free state.
     *
     * @param codes character codes that must all be placeable
     * @return the base offset
     */
    private int findFreeBase(int... codes) {
        for (int candidate = 1;; ++candidate) {
            boolean allFree = true;
            for (int code : codes) {
                int state = candidate + code;
                ensureStateCapacity(state);
                if (base[state] != 0 || check[state] != 0) {
                    allFree = false;
                    break;
                }
            }
            if (allFree) {
                return candidate;
            }
        }
    }

    /**
     * Lists the character codes of all children of state {@code p}.
     *
     * @param p parent state
     * @return codes {@code c} for which {@code check[base[p] + c] == p}
     */
    private ArrayList<Integer> getChildList(int p) {
        ArrayList<Integer> ret = new ArrayList<Integer>();
        for (int code = 1; code <= charMap.size(); ++code) {
            int child = base[p] + code;
            if (child >= check.length) {
                break;
            }
            if (check[child] == p) {
                ret.add(code);
            }
        }
        return ret;
    }

    /**
     * Checks that the tail stored at {@code start} is exactly {@code s2} followed by
     * the terminator.
     *
     * @param start first tail index of the stored suffix
     * @param s2    expected remaining characters of the word
     * @return true when the stored suffix equals {@code s2}
     */
    private boolean tailMatchString(int start, String s2) {
        String expected = s2 + END_CHAR;
        if (start + expected.length() > tail.length) {
            return false;
        }
        for (int i = 0; i < expected.length(); ++i) {
            if (expected.charAt(i) != tail[start + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks whether the tail stored at {@code start} occurs in {@code text} at
     * position {@code from}.
     *
     * @param start first tail index of the stored suffix
     * @param text  text being scanned
     * @param from  index in {@code text} where the suffix has to begin
     * @return the number of matched characters, or -1 when the suffix does not match
     */
    private int tailPrefixLength(int start, String text, int from) {
        int matched = 0;
        while (tail[start + matched] != END_CHAR) {
            int at = from + matched;
            if (at >= text.length() || text.charAt(at) != tail[start + matched]) {
                return -1;
            }
            matched++;
        }
        return matched;
    }

    /**
     * Occupies a free state for character {@code word.charAt(i)} below {@code parent}
     * and stores the rest of the word in the tail.
     *
     * @param state  free state to occupy
     * @param parent parent state
     * @param word   word being inserted (with trailing terminator)
     * @param i      index of the character that leads to {@code state}
     */
    private void attachSuffix(int state, int parent, String word, int i) {
        check[state] = parent;
        if (word.charAt(i) == END_CHAR) {
            // A terminal state owns no tail; pointing it at -pos would alias the
            // tail of the next inserted word.
            base[state] = 0;
        } else {
            base[state] = -pos;
            pos = copyToTail(word, i + 1);
        }
    }

    /**
     * Moves all children of {@code parent} to a new base offset that also has room
     * for {@code extraCode}.
     *
     * @param parent    state whose children are relocated
     * @param extraCode code of the child that is about to be added
     */
    private void relocateChildren(int parent, int extraCode) {
        ArrayList<Integer> codes = getChildList(parent);
        int[] wanted = new int[codes.size() + 1];
        for (int k = 0; k < codes.size(); ++k) {
            wanted[k] = codes.get(k);
        }
        wanted[codes.size()] = extraCode;

        int oldBase = base[parent];
        int newBase = findFreeBase(wanted);
        base[parent] = newBase;
        for (int code : codes) {
            int from = oldBase + code;
            int to = newBase + code;
            base[to] = base[from];
            check[to] = check[from];
            if (base[from] > 0) {
                // Grandchildren must point at the moved child.
                for (int sub : getChildList(from)) {
                    check[base[from] + sub] = to;
                }
            }
            base[from] = 0;
            check[from] = 0;
        }
    }

    /**
     * Inserts a word into the dictionary. Inserting a word that is already present,
     * or an empty word, is a no-op.
     *
     * @param word word to insert
     * @throws IllegalArgumentException if {@code word} is null or contains the NUL
     *                                  character used internally as terminator
     * @throws Exception                declared for backward compatibility
     */
    public void insertWord(String word) throws Exception {
        if (word == null) {
            throw new IllegalArgumentException("word must not be null");
        }
        if (word.indexOf(END_CHAR) >= 0) {
            throw new IllegalArgumentException("word must not contain the NUL terminator character");
        }
        if (word.isEmpty()) {
            return;
        }
        word += END_CHAR;
        int pre_p = ROOT;
        for (int i = 0; i < word.length(); ++i) {
            int code = getAndAddCharCode(word.charAt(i));
            int cur_p = base[pre_p] + code;
            ensureStateCapacity(cur_p);

            // Free state: hang the rest of the word below it.
            if (base[cur_p] == 0 && check[cur_p] == 0) {
                attachSuffix(cur_p, pre_p, word, i);
                return;
            }
            // Conflict 2: the slot belongs to another parent, move our children away.
            if (check[cur_p] != pre_p) {
                relocateChildren(pre_p, code);
                attachSuffix(base[pre_p] + code, pre_p, word, i);
                return;
            }
            // Existing branching node: keep walking.
            if (base[cur_p] > 0) {
                pre_p = cur_p;
                continue;
            }
            // Existing terminal state: the word is already stored.
            if (base[cur_p] == 0 || i + 1 >= word.length()) {
                return;
            }

            // Conflict 1: a leaf whose suffix is compressed into the tail.
            int head = -base[cur_p];
            char stored = tail[head];
            char next = word.charAt(i + 1);
            if (next == END_CHAR && stored == END_CHAR) {
                return; // duplicate word
            }
            if (stored == next) {
                // Shared character: turn it into a real node and continue with it.
                int nextCode = getAndAddCharCode(next);
                int avail_base = findFreeBase(nextCode);
                base[cur_p] = avail_base;
                check[avail_base + nextCode] = cur_p;
                base[avail_base + nextCode] = -(head + 1);
                pre_p = cur_p;
                continue;
            }
            // The two words diverge here (one of the characters may be the terminator).
            int nextCode = getAndAddCharCode(next);
            int storedCode = getAndAddCharCode(stored);
            int avail_base = findFreeBase(nextCode, storedCode);
            base[cur_p] = avail_base;
            check[avail_base + storedCode] = cur_p;
            check[avail_base + nextCode] = cur_p;
            base[avail_base + storedCode] = (stored == END_CHAR) ? 0 : -(head + 1);
            if (next == END_CHAR) {
                base[avail_base + nextCode] = 0;
            } else {
                base[avail_base + nextCode] = -pos;
                pos = copyToTail(word, i + 2);
            }
            return;
        }
    }

    /**
     * Tells whether the dictionary contains exactly the given word. The lookup does
     * not modify the trie or the character table.
     *
     * @param word word to look up
     * @return true when the word was inserted and not deleted; false for null or
     *         empty input
     */
    public boolean Exists(String word) {
        if (word == null || word.isEmpty()) {
            return false;
        }
        int pre_p = ROOT;
        for (int i = 0; i < word.length(); ++i) {
            int code = getCharCode(word.charAt(i));
            if (code < 0) {
                return false;
            }
            int cur_p = base[pre_p] + code;
            if (cur_p >= check.length || check[cur_p] != pre_p) {
                return false;
            }
            if (base[cur_p] < 0) {
                return tailMatchString(-base[cur_p], word.substring(i + 1));
            }
            if (base[cur_p] == 0) {
                // Terminal state reached although characters remain.
                return false;
            }
            pre_p = cur_p;
        }
        int end = base[pre_p] + END_CODE;
        return end < check.length && check[end] == pre_p;
    }

    /** Result of {@link #Find(String)}: matched state and matched word. */
    class FindStruct {
        /** State at which the match ended, or -1 when nothing matched. */
        int p;
        /** The dictionary word that was found in the text. */
        String prefix = "";
    }

    /**
     * Scans the text from left to right and returns the first dictionary word it
     * contains (leftmost start position, shortest word at that position).
     *
     * @param word text to scan
     * @return a result whose {@code p} is -1 when no dictionary word occurs
     */
    private FindStruct Find(String word) {
        FindStruct fs = new FindStruct();
        fs.p = -1;
        if (word == null) {
            return fs;
        }
        for (int start = 0; start < word.length(); ++start) {
            int pre_p = ROOT;
            for (int i = start; i < word.length(); ++i) {
                int code = getCharCode(word.charAt(i));
                if (code < 0) {
                    break;
                }
                int cur_p = base[pre_p] + code;
                if (cur_p >= check.length || check[cur_p] != pre_p) {
                    break;
                }
                if (base[cur_p] < 0) {
                    // Leaf: the compressed suffix has to occur in the text as well.
                    int len = tailPrefixLength(-base[cur_p], word, i + 1);
                    if (len >= 0) {
                        fs.p = cur_p;
                        fs.prefix = word.substring(start, i + 1 + len);
                        return fs;
                    }
                    break;
                }
                if (base[cur_p] == 0) {
                    break;
                }
                int end = base[cur_p] + END_CODE;
                if (end < check.length && check[end] == cur_p) {
                    fs.p = end;
                    fs.prefix = word.substring(start, i + 1);
                    return fs;
                }
                pre_p = cur_p;
            }
        }
        return fs;
    }

    /**
     * Collects every word suffix that can be read from the given state downwards.
     *
     * @param index state to start from
     * @return the suffixes; a state with {@code base == 0} yields a single empty string
     */
    public ArrayList<String> GetAllChildWord(int index) {
        ArrayList<String> result = new ArrayList<String>();
        if (base[index] == 0) {
            result.add("");
            return result;
        }
        if (base[index] < 0) {
            StringBuilder r = new StringBuilder();
            for (int i = -base[index]; tail[i] != END_CHAR; ++i) {
                r.append(tail[i]);
            }
            result.add(r.toString());
            return result;
        }
        for (int code = 1; code <= charMap.size(); ++code) {
            int child = base[index] + code;
            if (child >= check.length) {
                break;
            }
            if (check[child] != index) {
                continue;
            }
            for (String s : GetAllChildWord(child)) {
                // The terminator only marks the end of a word and is not part of it.
                result.add(code == END_CODE ? s : charList.get(code) + s);
            }
        }
        return result;
    }

    /**
     * Detects banned words in the given text.
     *
     * @param word text to scan
     * @return a list holding the first banned word found in the text, or an empty
     *         list when the text contains none
     */
    public ArrayList<String> findBannedWord(String word) {
        ArrayList<String> result = new ArrayList<String>();
        FindStruct fs = Find(word);
        if (fs.p == -1) {
            return result;
        }
        result.add(fs.prefix);
        return result;
    }

    /**
     * Deletes a banned word from the dictionary.
     *
     * @param word word to delete
     * @return true when the word existed and was removed, false otherwise
     */
    public boolean delBannedWord(String word) {
        if (!this.Exists(word)) {
            return false;
        }
        removeWord(word);
        return true;
    }

    /**
     * Frees the state that terminates {@code word}. The caller must have verified
     * with {@link #Exists(String)} that the word is stored. Branching nodes left
     * without children are not reclaimed.
     *
     * @param word stored word to remove
     */
    private void removeWord(String word) {
        int state = ROOT;
        for (int i = 0; i < word.length(); ++i) {
            int next = base[state] + getCharCode(word.charAt(i));
            if (base[next] < 0) {
                // Leaf: wipe its suffix and release the state.
                for (int t = -base[next]; tail[t] != END_CHAR; ++t) {
                    tail[t] = END_CHAR;
                }
                base[next] = 0;
                check[next] = 0;
                return;
            }
            state = next;
        }
        // The word ends at a branching node: release its terminal state.
        int end = base[state] + END_CODE;
        base[end] = 0;
        check[end] = 0;
    }

    /**
     * Small demo: builds a dictionary, deletes and re-inserts a few words and scans
     * several sample texts.
     *
     * @param args unused
     * @throws Exception propagated from {@link #insertWord(String)}
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        DoubleArrayTrie dat = new DoubleArrayTrie();
        dat.insertWord("學校");
        dat.insertWord("學習");
        dat.insertWord("調查");
        dat.delBannedWord("調查");
        dat.insertWord("調查");
        dat.insertWord("北京");
        dat.delBannedWord("學校");
        dat.insertWord("學生");
        System.out.println("The update total time is "
                + ((System.currentTimeMillis() - start)) + "ms");
        System.out.println(dat.base.length);
        System.out.println(dat.tail.length);
        System.out.println("The init total time is "
                + ((System.currentTimeMillis() - start)) + "ms");
        String resStr = "聖鬥士星 矢在月 宮敗在了嫦 娥和玉 兔手下,由於星矢不是本地人,也是個不起眼的角色,goodbye."
                + "京華時報訊(記者 懷若谷 實習記者 常鑫)昨天,本報報道河南省潢川縣王香鋪村村民蔡先生家林地被燒一事,"
                + "當事雙方對起火緣由說法不一。昨晚,潢川縣森林公安派出所稱,起火林地類型爲未成林造林地,不予刑事立案。"
                + "潢川縣森林公安派出所李警官對京華時報記者表示,他們聘請淮濱縣林業局工程師從新對起火地點進行勘測,"
                + "「總過火面積爲21991.9平方米,其中有2240平方米範圍內僅有零星樹木痕跡。該林地類型爲未成林造林地。」"
                + "李警官稱,未成林造林地起火不牽扯刑事責任,所以不予刑事立案,但具體起火緣由仍在調查。";
        System.out.println(dat.findBannedWord(resStr));
        // Detect banned words; the first one found in the text is returned.
        String findStr = "學生是本地人學習";
        System.out.println(findStr + ":::::::" + dat.findBannedWord(findStr));
        findStr = "學校也在本地";
        System.out.println(findStr + ":::::::" + dat.findBannedWord(findStr));
        findStr = "本地的學生本";
        System.out.println(findStr + ":::::::" + dat.findBannedWord(findStr));
        findStr = "北京市的學生";
        System.out.println(findStr + ":::::::" + dat.findBannedWord(findStr));
        System.out.println("The total time is "
                + ((System.currentTimeMillis() - start)) + "ms");
    }
}
數組