這個文本比較主要是利用餘弦定理(餘弦相似度)來計算兩段文本的相似度,具體原理請自行搜尋(右轉某度)。主要適用場景是考試系統中的簡答題、概述題,可根據權重自動打分,感覺實用性蠻廣的。
先說下思路:先對兩段文本分別分詞,把兩邊的詞合併去重得到詞表,統計每個詞在兩段文本中出現的次數形成兩個詞頻向量,最後計算這兩個向量夾角的餘弦值作爲相似度。
下面是具體的代碼段(其中的分詞請求部分是在網上找的,原連結找不到了):
/// <summary>
/// Console demo that scores how similar two Chinese texts are: each text is split into
/// words by a remote segmentation web service, and the cosine of the angle between the
/// two term-frequency vectors is used as the similarity.
/// </summary>
class Program
{
    // Inclusive UTF-16 range used to decide whether a character counts as a Chinese character.
    private const char ChineseRangeFirst = '\u4e00';
    private const char ChineseRangeLast = '\u9fbb';

    // Endpoint of the word-segmentation service that Segment posts to.
    private const string SegmentApiUrl = "http://www.ftphp.com/scws/api.php";

    static void Main(string[] args)
    {
        Console.Write(Sim("牀前明月光,疑是地上霜", "牀前明月光,疑是地上霜"));
    }

    /// <summary>
    /// Computes the cosine similarity of two texts after segmenting both into words.
    /// </summary>
    /// <param name="txt1">First text.</param>
    /// <param name="txt2">Second text.</param>
    /// <returns>
    /// A value in [0, 1]. Returns 0 when either text yields no words (for example when the
    /// segmentation request failed) instead of the NaN produced by dividing by zero.
    /// </returns>
    public static double Sim(string txt1, string txt2)
    {
        List<string> words1 = Segment(txt1);
        List<string> words2 = Segment(txt2);
        return CosineSimilarity(words1, words2);
    }

    /// <summary>
    /// Computes the cosine similarity between the term-frequency vectors of two word sequences.
    /// </summary>
    /// <param name="words1">Words of the first text; null is treated as empty.</param>
    /// <param name="words2">Words of the second text; null is treated as empty.</param>
    /// <returns>A value in [0, 1]; 0 when either sequence contains no words.</returns>
    public static double CosineSimilarity(IEnumerable<string> words1, IEnumerable<string> words2)
    {
        Dictionary<string, int> countsA = CountWords(words1);
        Dictionary<string, int> countsB = CountWords(words2);

        // Only words present on both sides contribute to the dot product.
        double dot = 0;
        foreach (KeyValuePair<string, int> pair in countsA)
        {
            int other;
            if (countsB.TryGetValue(pair.Key, out other))
            {
                dot += (double)pair.Value * other;
            }
        }

        double squaresA = SumOfSquares(countsA);
        double squaresB = SumOfSquares(countsB);
        if (squaresA == 0 || squaresB == 0)
        {
            // An empty vector has no direction; report "no similarity" rather than NaN.
            return 0;
        }

        double cos = dot / (Math.Sqrt(squaresA) * Math.Sqrt(squaresB));

        // Rounding in the two square roots can push the quotient marginally above 1.
        return Math.Min(1.0, cos);
    }

    /// <summary>
    /// Splits a text into words by posting it to the remote segmentation service and keeps
    /// only the words that consist entirely of Chinese characters (dropping punctuation).
    /// </summary>
    /// <param name="str">Text to segment.</param>
    /// <returns>
    /// The accepted words in response order. This method is best-effort and never throws:
    /// on any failure the error is written to standard error and the words collected so
    /// far (possibly none) are returned.
    /// </returns>
    public static List<string> Segment(string str)
    {
        List<string> sl = new List<string>();
        try
        {
            // UrlEncode output is plain ASCII, so ASCII encoding of the form body is lossless.
            byte[] postData = System.Text.Encoding.ASCII.GetBytes("data=" + System.Web.HttpUtility.UrlEncode(str) + "&respond=json&charset=utf8&ignore=yes&duality=no&traditional=no&multi=0");

            System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(SegmentApiUrl);
            request.Method = "POST";
            request.KeepAlive = false;
            request.ContentType = "application/x-www-form-urlencoded";
            request.CookieContainer = new System.Net.CookieContainer();
            request.ContentLength = postData.Length;

            using (System.IO.Stream outputStream = request.GetRequestStream())
            {
                outputStream.Write(postData, 0, postData.Length);
            }

            string val;
            using (System.Net.WebResponse response = request.GetResponse())
            using (System.IO.Stream responseStream = response.GetResponseStream())
            using (System.IO.StreamReader reader = new System.IO.StreamReader(responseStream, System.Text.Encoding.UTF8))
            {
                val = reader.ReadToEnd();
            }

            Newtonsoft.Json.Linq.JObject results = Newtonsoft.Json.Linq.JObject.Parse(val);
            Newtonsoft.Json.Linq.JToken words = results["words"];
            if (words == null)
            {
                return sl;
            }

            foreach (Newtonsoft.Json.Linq.JToken item in words.Children())
            {
                Newtonsoft.Json.Linq.JObject entry = item as Newtonsoft.Json.Linq.JObject;
                if (entry == null)
                {
                    continue;
                }

                Newtonsoft.Json.Linq.JToken wordToken = entry["word"];
                if (wordToken == null)
                {
                    continue;
                }

                string word = wordToken.ToString();
                if (IsChineseWord(word))
                {
                    sl.Add(word);
                }
            }
        }
        catch (Exception ex)
        {
            // Callers rely on Segment never throwing, so the failure is reported, not rethrown.
            Console.Error.WriteLine("Segment failed: " + ex.Message);
        }

        return sl;
    }

    // Builds a word -> occurrence-count table; null input and null words are ignored.
    private static Dictionary<string, int> CountWords(IEnumerable<string> words)
    {
        Dictionary<string, int> counts = new Dictionary<string, int>();
        if (words == null)
        {
            return counts;
        }

        foreach (string word in words)
        {
            if (word == null)
            {
                continue;
            }

            int current;
            counts.TryGetValue(word, out current);
            counts[word] = current + 1;
        }

        return counts;
    }

    // Returns the squared Euclidean norm of a term-frequency vector.
    private static double SumOfSquares(Dictionary<string, int> counts)
    {
        double total = 0;
        foreach (int count in counts.Values)
        {
            total += (double)count * count;
        }

        return total;
    }

    // True only when the word is non-empty and every character lies in the Chinese range,
    // so punctuation, digits and Latin tokens are rejected regardless of character order.
    private static bool IsChineseWord(string word)
    {
        if (string.IsNullOrEmpty(word))
        {
            return false;
        }

        foreach (char c in word)
        {
            if (c < ChineseRangeFirst || c > ChineseRangeLast)
            {
                return false;
            }
        }

        return true;
    }
}
以上。感覺不少地方都可以優化,之後想到更好的做法再重新來一遍。