JavaScript: 實現簡單的中文分詞

時間 2019-11-13

原文鏈接

中文分詞在大數據橫行的今天越來越有用武之地了。它不僅被廣泛用於專業的中文搜索引擎中，而且在關鍵詞屏蔽、黑白名單以及文本相似度等方面也能大顯身手。中文分詞最簡單也最常用的方法是基於字典查找的方法：通過遍歷待分詞字符串並在字典中進行查找匹配，以達到分詞的目的。本文採用的就是這種方法。

字典

本文的分詞完全依賴於字典，因此需要先準備好字典。一般面對不同的領域要用不同的字典，比如面向醫學的字典會添加許多醫學術語方面的詞。常用詞的字典很容易找到，比如搜狗輸入法自帶的字典等。

停止詞

停止詞不能用於成詞。停止詞主要包括無意義的字符(如的、地、得)或詞。

常規實現

本文由於只是簡單的介紹和實現，所以只定義了簡單的字典和停止詞，如下代碼所示：

<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>簡單的中文分詞</title>
    <meta name="author" content="" />
    <meta http-equiv="X-UA-Compatible" content="IE=7" />
    <meta name="keywords" content="簡單的中文分詞" />
    <meta name="description" content="簡單的中文分詞" />
</head>
<body>
<script type="text/javascript">
// Dictionary: each key is a dictionary word; every value is 1.
var dict = {
	"家鄉": 1,
	"松花": 1,
	"松花江": 1,
	"那裏": 1,
	"四季": 1,
	"四季迷人": 1,
	"迷人": 1,
	"花香": 1
};

// Stop words: each key is a stop word; every value is 1.
var stop = { "的": 1 };

// The text to be segmented.
var words = "個人家鄉在松花江邊上，那裏有四季迷人的花香。";
</script>
</body>
</html>

dict和stop之所以定義爲Object，是因爲這樣可令查找的時間複雜度爲常數O(1)。分詞的過程有點像正則表達式的惰性匹配。先從words中讀取第一個字符"個"，並在stop和dict中查找：如果是停止詞，則丟掉已讀取的內容，然後從下一個字符重新開始；如果已讀取的內容在dict中，則添加到結果集。接著繼續讀取下一個字符，再同樣去stop和dict中查找，直到處理完成。代碼如下：

<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>簡單的中文分詞</title>
    <meta name="author" content="" />
    <meta http-equiv="X-UA-Compatible" content="IE=7" />
    <meta name="keywords" content="簡單的中文分詞" />
    <meta name="description" content="簡單的中文分詞" />
</head>
<body>
<script type="text/javascript">
// Dictionary consulted by splitWords: each key is a word; every value is 1.
var dict = {
	"家鄉": 1,
	"松花": 1,
	"松花江": 1,
	"那裏": 1,
	"四季": 1,
	"四季迷人": 1,
	"迷人": 1,
	"花香": 1
};

// Stop words consulted by splitWords: each key is a stop word; every value is 1.
var stop = { "的": 1 };

// The text to be segmented.
var words = "個人家鄉在松花江邊上，那裏有四季迷人的花香。";

/**
 * Segments `words` by dictionary lookup.
 *
 * From every start position the text is read one character at a time. Each
 * time the characters read so far form a key of the global `dict`, that word
 * is appended to the result. Reading from a start position ends at the end of
 * the text or at the first character that is a key of the global `stop`.
 *
 * @param {string} words Text to segment.
 * @returns {string[]} Matched dictionary words, ordered by start position and
 *   then by length; overlapping matches are all reported. Empty or missing
 *   input yields an empty array.
 */
function splitWords(words) {
	var result = [];
	// Own-property test, so inherited names such as "constructor" are never
	// mistaken for dictionary words or stop words.
	var has = Object.prototype.hasOwnProperty;

	if (!words) {
		return result;
	}

	var total = words.length;
	// Bounding by `start < total` terminates on every input and lets the last
	// character begin a word of its own.
	for (var start = 0; start < total; start++) {
		var candidate = '';
		for (var i = start; i < total; i++) {
			var ch = words.charAt(i);
			// A stop word ends every candidate that begins at `start`
			if (has.call(stop, ch)) {
				break;
			}
			candidate += ch;
			// Dictionary hit: record it and keep reading for longer words
			if (has.call(dict, candidate)) {
				result.push(candidate);
			}
		}
	}

	return result;
}

// Print the input text and the words returned by splitWords as one console group.
console.group("Base 分詞: ");
console.log("待分詞的字符串: ", words);
console.log("分詞結果:       ", splitWords(words));
console.groupEnd();
</script>
</body>
</html>

Trie樹實現

但是想一下，在實際應用中，字典可能包含非常多的詞，而且字典中很多詞是有共同前綴的。比如上述代碼中的"松花"和"松花江"就有共同的前綴"松花"，存儲重複的前綴將導致字典佔用大量的內存，而這部分其實是可以優化的。還記得我之前的一篇介紹Trie樹的文章嗎？如果您忘了，那請看：Python: Trie樹實現字典排序。事實上還是有不同之處的，因爲之前只是針對26個字母的Trie樹。對於需要支持中文的Trie樹來說，如果直接用一個字符(這個字符可能是ASCII碼字符，也可能是中文字符或其它多字節字符)來表示一個節點，則是不可取的。大家知道最常用的漢字有將近一萬個，如果每一個節點都要用一個數組來保存將近一萬個子節點，那就太嚇人了。所以我這裏選擇Object的方式來保存，這樣的好處是查找時間複雜度爲O(1)。但即使這樣，這個Object還將容納將近一萬個key，所以我這裏將結合另外一種方案來實現。

JavaScript字符串的內碼是UTF-16，每個碼元用2個字節來存儲。如果我們將一個雙字節的碼元轉成UTF8的三個字節(嗯，是的。本文只考慮UTF8的單字節和三字節，因爲雙字節、四字節、五字節和六字節太少見了)，單字節還是不變，以第一個字節爲起始節點，那麼每個節點的子節點數最多就只有256個，然後我們通過起始字節的大小可以知道這是一個單字節還是三字節。這種方式有效的節約了內存。接下來是實現代碼：

<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>簡單的中文分詞</title>
    <meta name="author" content="" />
    <meta http-equiv="X-UA-Compatible" content="IE=7" />
    <meta name="keywords" content="簡單的中文分詞" />
    <meta name="description" content="簡單的中文分詞" />
</head>
<body>
<script type="text/javascript">
// Dictionary: the list of words loaded into the trie.
var dict = [
	"家鄉", "松花", "松花江", "那裏",
	"四季", "四季迷人", "迷人", "花香",
	"hello", "kitty", "fine"
];

// Stop words: each key is a stop word; every value is 1.
var stop = { "的": 1 };

// The text to be segmented.
var words = "hello, kitty!個人家鄉在松花江邊上，那裏有四季迷人的花香。fine~";

// Trie
/**
 * Byte-level trie over the dictionary words.
 *
 * Words are stored as paths of byte values (see toBytes); the node that ends
 * a word's path is flagged as a word boundary. Relies on the file-level
 * `Node` constructor and on the global `stop` table.
 */
function Trie() {
	this.root = new Node(null);
}
Trie.prototype = {
	/**
	 * Converts a string into the byte sequence used as trie keys.
	 *
	 * UTF-16 code units below 0x80 become one byte; every other code unit is
	 * written in the three-byte form produced by toUTF8. That includes values
	 * below 0x800 and surrogate halves, so the output is not standard UTF-8,
	 * but toUTF16 inverts it exactly.
	 *
	 * @param {string} word Text to convert.
	 * @returns {number[]} Byte values.
	 */
	toBytes : function(word) {
		var result = [];
		for (var i = 0; i < word.length; i++) {
			var code = word.charCodeAt(i);
			if (code < 0x80) {
				// Single byte
				result.push(code);
			} else {
				// Three bytes, appended in place rather than re-copying the
				// whole array with concat on every character
				result.push.apply(result, this.toUTF8(code));
			}
		}

		return result;
	},
	/**
	 * Encodes one 16-bit code unit as 1110xxxx 10xxxxxx 10xxxxxx.
	 *
	 * @param {number} c Code unit.
	 * @returns {number[]} The three bytes.
	 */
	toUTF8 : function(c) {
		// 1110xxxx: top four bits
		var byte1 = 0xE0 | ((c >> 12) & 0x0F);
		// 10xxxxxx: middle six bits
		var byte2 = 0x80 | ((c >> 6) & 0x3F);
		// 10xxxxxx: low six bits
		var byte3 = 0x80 | (c & 0x3F);

		return [byte1, byte2, byte3];
	},
	/**
	 * Decodes the three bytes written by toUTF8 back into one code unit.
	 *
	 * @param {number} b1 Lead byte (1110xxxx).
	 * @param {number} b2 Second byte (10xxxxxx).
	 * @param {number} b3 Third byte (10xxxxxx).
	 * @returns {number} The 16-bit code unit.
	 */
	toUTF16 : function(b1, b2, b3) {
		// High output byte: low nibble of b1 followed by payload bits 5..2 of b2
		var byte1 = (b1 << 4) | ((b2 >> 2) & 0x0F);
		// Low output byte: payload bits 1..0 of b2 followed by the payload of b3
		var byte2 = ((b2 & 0x03) << 6) | (b3 & 0x3F);
		// The mask drops the 1110 marker bits that were shifted above bit 7
		var utf16 = ((byte1 & 0x00FF) << 8) | byte2;

		return utf16;
	},
	/**
	 * Adds one word to the trie.
	 *
	 * @param {string} word Word to add.
	 */
	add : function(word) {
		var node = this.root, bytes = this.toBytes(word), len = bytes.length;
		for (var i = 0; i < len; i++) {
			var c = bytes[i];
			// Create the child only when missing; shared prefixes reuse nodes
			if (!(c in node.childs)) {
				node.childs[c] = new Node(c);
			}
			node = node.childs[c];
		}
		node.asWord(); // word boundary
	},
	/**
	 * Follows `bytes` from index `start` down the trie and collects the
	 * dictionary words met on the way.
	 *
	 * The walk ends at the end of `bytes`, at the first byte that has no
	 * matching child, or when the text decoded so far is a key of the global
	 * `stop` table.
	 *
	 * @param {number[]} bytes Byte sequence from toBytes.
	 * @param {number} start Index of the first byte to follow.
	 * @param {boolean} firstOnly Stop after the first (shortest) word.
	 * @returns {string[]} Words found, shortest first.
	 */
	_walk : function(bytes, start, firstOnly) {
		var node = this.root, found = [], text = '', multi = 0;
		for (var i = start; i < bytes.length; i++) {
			var c = bytes[i];
			if (!(c in node.childs)) {
				break;
			}

			if (c < 0x80) {
				text += String.fromCharCode(c);
			} else {
				// Every third multi-byte value completes one character
				multi++;
				if (multi % 3 === 0) {
					text += String.fromCharCode(this.toUTF16(bytes[i - 2], bytes[i - 1], c));
				}
			}
			// Own-property test: a dictionary word such as "constructor"
			// must not be taken for a stop word
			if (Object.prototype.hasOwnProperty.call(stop, text)) {
				break;
			}

			node = node.childs[c];
			if (node.isWord()) {
				found.push(text);
				if (firstOnly) {
					break;
				}
			}
		}

		return found;
	},
	/**
	 * Searches the trie along `bytes`, starting at the root.
	 *
	 * @param {number[]} bytes Byte sequence from toBytes.
	 * @returns {string[]} Every dictionary word that is a prefix of the
	 *   followed path, shortest first.
	 */
	search : function(bytes) {
		return this._walk(bytes, 0, false);
	},
	/**
	 * Segments `words` with the trie.
	 *
	 * Every byte position is tried as a start. For each start only the
	 * shortest dictionary word is reported; a longer word that extends it is
	 * not.
	 *
	 * @param {string} words Text to segment.
	 * @returns {string[]} Words found, ordered by start position. Empty or
	 *   missing input yields an empty array.
	 */
	splitWords : function(words) {
		var result = [];
		if (!words) {
			return result;
		}

		var bytes = this.toBytes(words);
		// `start < bytes.length` terminates on every input and also tries
		// the last byte; each start walks the trie once
		for (var start = 0; start < bytes.length; start++) {
			var finds = this._walk(bytes, start, true);
			if (finds.length > 0) {
				result.push(finds[0]);
			}
		}

		return result;
	},
	/**
	 * Loads every word of `dict` into the trie.
	 *
	 * @param {string[]} dict Words to add.
	 */
	init : function(dict) {
		for (var i = 0; i < dict.length; i++) {
			this.add(dict[i]);
		}
	}
};

// Trie node
/**
 * One node of the trie.
 *
 * @param {?number} _byte Byte carried by this node; any falsy value is stored
 *   as null.
 */
function Node(_byte) {
	// Child nodes
	this.childs = {};
	// Byte stored on this node
	this._byte = _byte ? _byte : null;
	// Boundary flag: true once a word ends at this node
	this._isWord = false;
}
Node.prototype = {
	/** Marks this node as the end of a word. */
	asWord : function() {
		this._isWord = true;
	},
	/** @returns {boolean} Whether a word ends at this node. */
	isWord : function() {
		return this._isWord;
	}
};

// Build the trie from the dictionary and segment the sample text.
var trie = new Trie();
trie.init(dict);
var result = trie.splitWords(words);

// Print the input text and the segmentation result as one console group.
console.group("Trie 分詞: ");
console.log("待分詞的字符串: ", words);
console.log("分詞結果:       ", result);
console.groupEnd();
</script>
</body>
</html>

各位看了輸出結果後就會發現，這個分詞是有問題的，因爲明顯少了"松花江"和"四季迷人"。拿"四季"和"四季迷人"來說，"四季"是"四季迷人"的前綴，在通過節點的isWord()方法來判斷是否成詞時，一遇到"四季"就成功並停止了，所以"四季迷人"沒有機會得到判斷。因此我們需要修改代碼，在Node上加一個屬性，表示已判斷的次數。代碼如下：

<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>簡單的中文分詞</title>
    <meta name="author" content="" />
    <meta http-equiv="X-UA-Compatible" content="IE=7" />
    <meta name="keywords" content="簡單的中文分詞" />
    <meta name="description" content="簡單的中文分詞" />
</head>
<body>
<script type="text/javascript">
// Dictionary: the list of words loaded into the trie.
var dict = [
	"家鄉", "松花", "松花江", "那裏",
	"四季", "四季迷人", "迷人", "花香",
	"hello", "kitty", "fine"
];

// Stop words: each key is a stop word; every value is 1.
var stop = { "的": 1 };

// The text to be segmented.
var words = "hello, kitty!個人家鄉在松花江邊上，那裏有四季迷人的花香。fine~";

// Trie
/**
 * Byte-level trie over the dictionary words.
 *
 * Words are stored as paths of byte values (see toBytes); the node that ends
 * a word's path is flagged as a word boundary. Relies on the file-level
 * `Node` constructor and on the global `stop` table.
 */
function Trie() {
	this.root = new Node(null);
}
Trie.prototype = {
	/**
	 * Converts a string into the byte sequence used as trie keys.
	 *
	 * UTF-16 code units below 0x80 become one byte; every other code unit is
	 * written in the three-byte form produced by toUTF8. That includes values
	 * below 0x800 and surrogate halves, so the output is not standard UTF-8,
	 * but toUTF16 inverts it exactly.
	 *
	 * @param {string} word Text to convert.
	 * @returns {number[]} Byte values.
	 */
	toBytes : function(word) {
		var result = [];
		for (var i = 0; i < word.length; i++) {
			var code = word.charCodeAt(i);
			if (code < 0x80) {
				// Single byte
				result.push(code);
			} else {
				// Three bytes, appended in place rather than re-copying the
				// whole array with concat on every character
				result.push.apply(result, this.toUTF8(code));
			}
		}

		return result;
	},
	/**
	 * Encodes one 16-bit code unit as 1110xxxx 10xxxxxx 10xxxxxx.
	 *
	 * @param {number} c Code unit.
	 * @returns {number[]} The three bytes.
	 */
	toUTF8 : function(c) {
		// 1110xxxx: top four bits
		var byte1 = 0xE0 | ((c >> 12) & 0x0F);
		// 10xxxxxx: middle six bits
		var byte2 = 0x80 | ((c >> 6) & 0x3F);
		// 10xxxxxx: low six bits
		var byte3 = 0x80 | (c & 0x3F);

		return [byte1, byte2, byte3];
	},
	/**
	 * Decodes the three bytes written by toUTF8 back into one code unit.
	 *
	 * @param {number} b1 Lead byte (1110xxxx).
	 * @param {number} b2 Second byte (10xxxxxx).
	 * @param {number} b3 Third byte (10xxxxxx).
	 * @returns {number} The 16-bit code unit.
	 */
	toUTF16 : function(b1, b2, b3) {
		// High output byte: low nibble of b1 followed by payload bits 5..2 of b2
		var byte1 = (b1 << 4) | ((b2 >> 2) & 0x0F);
		// Low output byte: payload bits 1..0 of b2 followed by the payload of b3
		var byte2 = ((b2 & 0x03) << 6) | (b3 & 0x3F);
		// The mask drops the 1110 marker bits that were shifted above bit 7
		var utf16 = ((byte1 & 0x00FF) << 8) | byte2;

		return utf16;
	},
	/**
	 * Adds one word to the trie.
	 *
	 * @param {string} word Word to add.
	 */
	add : function(word) {
		var node = this.root, bytes = this.toBytes(word), len = bytes.length;
		for (var i = 0; i < len; i++) {
			var c = bytes[i];
			// Create the child only when missing; shared prefixes reuse nodes
			if (!(c in node.childs)) {
				node.childs[c] = new Node(c);
			}
			node = node.childs[c];
		}
		node.asWord(); // word boundary
	},
	/**
	 * Follows `bytes` from index `start` down the trie and collects every
	 * dictionary word met on the way.
	 *
	 * The walk ends at the end of `bytes`, at the first byte that has no
	 * matching child, or when the text decoded so far is a key of the global
	 * `stop` table. Nodes are only read, never modified.
	 *
	 * @param {number[]} bytes Byte sequence from toBytes.
	 * @param {number} start Index of the first byte to follow.
	 * @returns {string[]} Words found, shortest first.
	 */
	_walk : function(bytes, start) {
		var node = this.root, found = [], text = '', multi = 0;
		for (var i = start; i < bytes.length; i++) {
			var c = bytes[i];
			if (!(c in node.childs)) {
				break;
			}

			if (c < 0x80) {
				text += String.fromCharCode(c);
			} else {
				// Every third multi-byte value completes one character
				multi++;
				if (multi % 3 === 0) {
					text += String.fromCharCode(this.toUTF16(bytes[i - 2], bytes[i - 1], c));
				}
			}
			// Own-property test: a dictionary word such as "constructor"
			// must not be taken for a stop word
			if (Object.prototype.hasOwnProperty.call(stop, text)) {
				break;
			}

			node = node.childs[c];
			if (node.isWord()) {
				found.push(text);
			}
		}

		return found;
	},
	/**
	 * Searches the trie along `bytes`, starting at the root.
	 *
	 * Does not advance any node counter, so repeated searches return the
	 * same words.
	 *
	 * @param {number[]} bytes Byte sequence from toBytes.
	 * @returns {string[]} Every dictionary word that is a prefix of the
	 *   followed path, shortest first.
	 */
	search : function(bytes) {
		return this._walk(bytes, 0);
	},
	/**
	 * Segments `words` with the trie.
	 *
	 * Every byte position is tried as a start, and every dictionary word that
	 * begins there is reported, so words sharing a prefix are all found. The
	 * trie is walked once per start and is not modified, so a word occurring
	 * several times is reported each time and the method can be called
	 * repeatedly.
	 *
	 * @param {string} words Text to segment.
	 * @returns {string[]} Words found, ordered by start position and then by
	 *   length. Empty or missing input yields an empty array.
	 */
	splitWords : function(words) {
		var result = [];
		if (!words) {
			return result;
		}

		var bytes = this.toBytes(words);
		// `start < bytes.length` terminates on every input and also tries
		// the last byte
		for (var start = 0; start < bytes.length; start++) {
			var finds = this._walk(bytes, start);
			if (finds.length > 0) {
				result.push.apply(result, finds);
			}
		}

		return result;
	},
	/**
	 * Loads every word of `dict` into the trie.
	 *
	 * @param {string[]} dict Words to add.
	 */
	init : function(dict) {
		for (var i = 0; i < dict.length; i++) {
			this.add(dict[i]);
		}
	}
};

// Trie node
/**
 * One node of the trie.
 *
 * @param {?number} _byte Byte carried by this node; any falsy value is stored
 *   as null.
 */
function Node(_byte) {
	// Child nodes
	this.childs = {};
	// Byte stored on this node
	this._byte = _byte ? _byte : null;
	// Boundary flag: true once a word ends at this node
	this._isWord = false;
	// Counter advanced by addCount(); isWord() only succeeds while it is 0
	this._count = 0;
}
Node.prototype = {
	/**
	 * @returns {boolean} Whether a word ends at this node and the counter
	 *   has not been advanced yet.
	 */
	isWord : function() {
		if (!this._isWord) {
			return this._isWord;
		}
		return this._count == 0;
	},
	/** Marks this node as the end of a word. */
	asWord : function() {
		this._isWord = true;
	},
	/** Advances the counter by one. */
	addCount : function() {
		this._count++;
	},
	/** @returns {number} Current counter value. */
	getCount : function() {
		return this._count;
	}
};

// Build the trie from the dictionary and segment the sample text.
var trie = new Trie();
trie.init(dict);
var result = trie.splitWords(words);

// Print the input text and the segmentation result as one console group.
console.group("Trie 分詞: ");
console.log("待分詞的字符串: ", words);
console.log("分詞結果:       ", result);
console.groupEnd();
</script>
</body>
</html>

結束語

現在已經能正確的分詞了，即使有相同的前綴也沒有問題。我上面分詞用到的Trie樹稱爲標準Trie樹，這種標準Trie樹比較直觀。對於需要存儲中文的Trie樹，也有很多是用數組的方式實現的，比如雙數組Trie樹(Double Array Trie，簡稱DAT)、三數組Trie樹等，有興趣的朋友可以去了解一下。

本文只是簡單的實現了中文分詞，還有很多不足的地方。比如沒有考慮未登錄詞的自動成詞、人名、歧義等等。但對於一般的如關鍵詞屏蔽和計算文本相似度等應用已經足夠了。