先放上一張Demo的測試圖
需要的基礎知識
隱馬爾科夫模型(hidden Markov model)是關於時序的機率模型,是最簡單的動態貝葉斯網絡
HMM模型由Pi、A、B唯一決定,Pi、A、B稱爲HMM模型的三要素
目前/t 這/rzv 條/q 高速公路/n 之間/f 的/ude1 路段/n 已/d 緊急/a 封閉/v 。/w
所以,可以通過訓練語料,根據詞性、分詞以及二者之間的關係進行統計,得到這三種機率
這就是HMM模型中的預測問題
此時採用維特比算法
注:對維特比算法的舉例理解
a b c d 表示的是四種狀態
1 2 3 是三個觀測時間節點
在t=3的時刻,要計算到達此時4種狀態的所有路徑。例如,到達a狀態的路徑經過t=1、t=2共有4*4=16種情況,要從中找到機率最大的一條路徑並記錄下來;對於其他的狀態點也同樣計算下去,保存最優路徑。
算法步驟3中的式子表示相鄰兩個時刻之間的轉移:每次根據兩者之間的最大機率,記錄下前一時刻對應的狀態。
整個工程的流程
詳細內容(含源代碼)
取前面約80%作爲訓練(train)語料,後面約20%作爲測試語料
/**
 * Builds the word and label vocabularies from a tagged corpus.
 *
 * <p>The corpus is read line by line; each line is split on single spaces and
 * each token of the exact form {@code word/label} contributes its word and its
 * label. Tokens that do not split into exactly two parts on {@code '/'} are
 * ignored.
 */
public class DoIt {
    /** Distinct words in first-seen order. */
    List<String> wordlist;
    /** Distinct labels in first-seen order. */
    List<String> labellist;

    public DoIt() {
        wordlist = new ArrayList<String>();
        labellist = new ArrayList<String>();
    }

    /**
     * Scans the corpus, appends every previously unseen word and label to
     * {@link #wordlist} and {@link #labellist}, and writes both lists to
     * {@code wordlist.dat} and {@code labellist.dat} in the working directory.
     *
     * @param train path of the tagged corpus file
     * @return a two-element array: {@code [number of words, number of labels]}
     * @throws IOException if the path does not exist, is not a regular file,
     *                     or cannot be read
     */
    public int[] creatlist(String train) throws IOException {
        System.out.println("----- 建立List -----");
        System.out.println(".......... . . .");

        File file = new File(train);
        if (!file.exists()) {
            throw new IOException(file + "不存在!!!");
        }
        if (!file.isFile()) {
            throw new IOException(file + "不是文件!!!");
        }

        // Hash sets give O(1) membership tests; the lists keep the first-seen
        // order. Seeding from the lists keeps repeated calls duplicate-free.
        java.util.Set<String> seenWords = new java.util.HashSet<String>(wordlist);
        java.util.Set<String> seenLabels = new java.util.HashSet<String>(labellist);

        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file)))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String token : line.split(" ")) {
                    String[] parts = token.split("/");
                    if (parts.length != 2) {
                        continue;
                    }
                    if (seenWords.add(parts[0])) {
                        wordlist.add(parts[0]);
                    }
                    if (seenLabels.add(parts[1])) {
                        labellist.add(parts[1]);
                    }
                }
            }
        }

        listtodocument(labellist, "labellist.dat");
        listtodocument(wordlist, "wordlist.dat");
        System.out.println("----- 建立List完成 -----");
        return new int[] { wordlist.size(), labellist.size() };
    }

    /**
     * Writes the list to a file as a single line, each entry followed by one
     * space.
     *
     * @param list     entries to write
     * @param filename destination path; an existing file is overwritten
     * @throws IOException if the file cannot be opened for writing
     */
    public void listtodocument(List<String> list, String filename) throws IOException {
        try (PrintWriter pw = new PrintWriter(filename)) {
            for (String entry : list) {
                pw.print(entry + " ");
            }
            pw.flush();
        }
    }
}
public class DoIt { List<String> wordlist; List<String> labellist; public DoIt() { wordlist = new ArrayList<String>(); labellist = new ArrayList<String>(); } public void learn(String train, String model, int[] twonum) throws IOException{ System.out.println("----- 開始訓練 -----"); //System.out.println(twonum[0] +"---------" + twonum[1]); int wordnum = twonum[0]; int labelnum = twonum[1]; double[] pi = new double[labelnum]; double[][] A = new double[labelnum][labelnum]; double[][] B = new double[labelnum][wordnum]; for (int i = 0; i < labelnum; i++) { pi[i] = 1; for (int j = 0; j < labelnum; j++) { A[i][j] = 1; } for (int k = 0; k < wordnum; k++) { B[i][k] = 1; } } File file = new File(train); if(!file.exists()) { throw new IOException(file + "不存在!!!"); } if(!file.isFile()) { throw new IOException(file + "不是文件!!!"); } BufferedReader br = new BufferedReader( new InputStreamReader( new FileInputStream(file))); PrintWriter pw = new PrintWriter(model); String str = null; int frontindex = -1; int rowpi = 0; while((str = br.readLine()) != null) { rowpi ++; System.out.println("--learn讀取到文件的行號: " + rowpi); String[] strarray = str.split(" "); for (String substr: strarray) { String[] tempstr = substr.split("/"); if (tempstr.length == 2) { String word = tempstr[0]; String label = tempstr[1]; int wordindex = wordlist.indexOf(word); int labelindex = labellist.indexOf(label); B[labelindex][wordindex] += 1; if (frontindex != -1) { A[frontindex][labelindex] += 1; } frontindex = labelindex; } } String firstlabel = strarray[0].split("/")[1]; int firstlabelindex = labellist.indexOf(firstlabel); // System.out.println(firstlabel); pi[firstlabelindex] += 1; } System.out.println("----- 寫參數到model -----"); //計算機率 寫入model文件 int factor = 1000; pw.println(3); pw.println(4); pw.println(labelnum + 4); for (int i = 0; i < labelnum; i++) { pw.print(factor * pi[i] / rowpi + " "); } pw.println(); double rowsumA = 0; //pw.println("A"); for (int i = 0; i < labelnum; i++) { for (int j = 0; j < labelnum; 
j++) { rowsumA += A[i][j]; } for (int j = 0; j < labelnum; j++) { pw.print(factor * A[i][j] / rowsumA + " "); } rowsumA = 0; pw.println(); } double rowsumB = 0; //pw.println("B"); for (int i = 0; i < labelnum; i++) { for (int k = 0; k < wordnum; k++) { rowsumB += B[i][k]; } for (int k = 0; k < wordnum; k++) { pw.print(factor * B[i][k] / rowsumB + " "); } rowsumB = 0; pw.println(); } pw.flush(); br.close(); pw.close(); System.out.println("--- 文件寫入完畢 訓練完成 ---"); } //訓練 寫 參數 到model文件中 public void tomodel(String allfile, String train, String model) throws IOException{ //int[] twonum = creatlist(train); int[] twonum = creatlist(allfile); learn(train, model, twonum); } //訓練的入口 public void pleaselearn(String filename) throws IOException{ double start = System.currentTimeMillis(); String train = filename; String model = "model.dat"; String allfile = "dataa.dat"; tomodel(allfile, train, model); double end = System.currentTimeMillis(); System.out.println("訓練用時(s): " + (end - start) / 1000); } }
/**
 * Tags a held-out corpus with a trained HMM using the Viterbi algorithm.
 *
 * <p>The vocabularies are read from {@code wordlist.dat} and
 * {@code labellist.dat} in the working directory. The model file starts with
 * three integer lines giving the line index of pi, of the first row of A and
 * of the first row of B; the parameter rows follow.
 */
public class tDoIt {
    /** Vocabulary of words; a word's position is its column in B. */
    List<String> wordlist;
    /** Vocabulary of labels; a label's position is its state index. */
    List<String> labellist;

    public tDoIt() {
        wordlist = new ArrayList<String>();
        labellist = new ArrayList<String>();
    }

    /**
     * Tags every line of the test file and writes, for each input line, the
     * original line followed by a line of {@code word/predictedLabel} tokens.
     * A line without any {@code word/label} token yields an empty second line.
     *
     * @param model  path of the model file
     * @param test   path of the tagged test corpus
     * @param result path of the output file (overwritten)
     * @throws IOException if the model or test path does not exist, is not a
     *                     regular file, or cannot be read
     */
    public void test(String model, String test, String result) throws IOException {
        System.out.println("----- 開始測試 -----");
        for (String word : readdocument("wordlist.dat")) {
            wordlist.add(word);
        }
        for (String label : readdocument("labellist.dat")) {
            labellist.add(label);
        }

        File filemodel = new File(model);
        File filetest = new File(test);
        if (!filemodel.exists()) {
            throw new IOException(filemodel + "不存在!!!");
        }
        if (!filemodel.isFile()) {
            throw new IOException(filemodel + "不是文件!!!");
        }
        if (!filetest.exists()) {
            throw new IOException(filetest + "不存在!!!");
        }
        if (!filetest.isFile()) {
            throw new IOException(filetest + "不是文件!!!");
        }

        // Read the three header numbers once instead of re-reading the file per value.
        int[] header = tempreadfile(filemodel);
        int rownumpi = header[0];
        int rownumA = header[1];
        int rownumB = header[2];

        double[] pi = new double[rownumB - rownumA];
        double[][] A = new double[rownumB - rownumA][];
        double[][] B = new double[rownumB - rownumA][];

        try (BufferedReader brmodel = new BufferedReader(
                new InputStreamReader(new FileInputStream(filemodel)))) {
            int nextA = 0;
            int nextB = 0;
            String strmodel;
            for (int i = 0; (strmodel = brmodel.readLine()) != null; i++) {
                if (i >= rownumpi && i < rownumA) {
                    pi = strtodouble(strmodel.split(" "));
                } else if (i >= rownumA && i < rownumB) {
                    A[nextA++] = strtodouble(strmodel.split(" "));
                } else if (i >= rownumB) {
                    B[nextB++] = strtodouble(strmodel.split(" "));
                }
            }
        }

        try (BufferedReader brtest = new BufferedReader(
                     new InputStreamReader(new FileInputStream(filetest)));
             PrintWriter pw = new PrintWriter(result)) {
            int row = 1;
            String teststr;
            while ((teststr = brtest.readLine()) != null) {
                pw.println(teststr);
                System.out.println("--test讀取到文件的行號: " + row++);

                // Keep only the word part of each well-formed word/label token.
                StringBuilder strbd = new StringBuilder();
                for (String substr : teststr.split(" ")) {
                    String[] tempstr = substr.split("/");
                    if (tempstr.length == 2) {
                        strbd.append(tempstr[0] + " ");
                    }
                }

                int[] labelindex = viterbi(strbd.toString(), pi, A, B);
                String[] strwords = strbd.toString().split(" ");
                for (int i = 0; i < labelindex.length; i++) {
                    pw.print(strwords[i] + "/" + labellist.get(labelindex[i]) + " ");
                }
                pw.println();
            }
            pw.flush();
        }
    }

    /**
     * Finds the highest-scoring label sequence for a space-separated sentence.
     *
     * <p>{@code delta[t][i]} holds the best score of any path that ends in
     * state {@code i} at position {@code t}; {@code way[t][i]} holds the
     * previous state on that path.
     *
     * @param string words separated by single spaces
     * @param pi     initial score per state
     * @param A      transition scores, {@code A[from][to]}
     * @param B      emission scores, {@code B[state][wordIndex]}
     * @return one state index per word; an empty array for empty input
     * @throws IllegalArgumentException if a word is not in {@link #wordlist}
     */
    public int[] viterbi(String string, double[] pi, double[][] A, double[][] B)
            throws IOException {
        String[] words = string.split(" ");
        if (string.isEmpty() || words.length == 0) {
            return new int[0];
        }

        int steps = words.length;
        int states = pi.length;

        // Resolve each word once; the lookup is a linear scan of the
        // vocabulary and does not depend on the state loops below.
        int[] observed = new int[steps];
        for (int t = 0; t < steps; t++) {
            int column = wordlist.indexOf(words[t]);
            if (column < 0) {
                throw new IllegalArgumentException(
                        "word \"" + words[t] + "\" is not in the vocabulary");
            }
            observed[t] = column;
        }

        double[][] delta = new double[steps][states];
        int[][] way = new int[steps][states];
        int[] labelindex = new int[steps];

        for (int i = 0; i < states; i++) {
            delta[0][i] = pi[i] * B[i][observed[0]];
        }
        for (int t = 1; t < steps; t++) {
            for (int i = 0; i < states; i++) {
                for (int j = 0; j < states; j++) {
                    double candidate = delta[t - 1][j] * A[j][i] * B[i][observed[t]];
                    if (delta[t][i] < candidate) {
                        delta[t][i] = candidate;
                        way[t][i] = j;
                    }
                }
            }
        }

        // Pick the best final state, then follow the back-pointers.
        double max = delta[steps - 1][0];
        labelindex[steps - 1] = 0;
        for (int i = 0; i < states; i++) {
            if (delta[steps - 1][i] > max) {
                max = delta[steps - 1][i];
                labelindex[steps - 1] = i;
            }
        }
        for (int t = steps - 2; t >= 0; t--) {
            labelindex[t] = way[t + 1][labelindex[t + 1]];
        }
        return labelindex;
    }

    /**
     * Reads the first line of a file and splits it on single spaces.
     *
     * @param filename path of the file
     * @return the entries of the first line; an empty array if the file is empty
     * @throws IOException if the file cannot be read
     */
    public String[] readdocument(String filename) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename)))) {
            String first = br.readLine();
            if (first == null) {
                return new String[0];
            }
            return first.split(" ");
        }
    }

    /**
     * Reads up to the first three lines of a file as integers.
     *
     * @param file file to read
     * @return a three-element array; positions without a line stay 0
     * @throws IOException           if the file cannot be read
     * @throws NumberFormatException if one of those lines is not an integer
     */
    public int[] tempreadfile(File file) throws IOException {
        int[] threenum = new int[3];
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file)))) {
            String str;
            for (int i = 0; i < 3 && (str = br.readLine()) != null; i++) {
                threenum[i] = Integer.parseInt(str);
            }
        }
        return threenum;
    }

    /**
     * Parses every string of the array as a double.
     *
     * @param strarray decimal strings
     * @return the parsed values in the same order
     */
    public double[] strtodouble(String[] strarray) {
        double[] dbs = new double[strarray.length];
        for (int i = 0; i < strarray.length; i++) {
            dbs[i] = Double.valueOf(strarray[i]);
        }
        return dbs;
    }

    /**
     * Test entry point: tags {@code filename} with {@code model.dat}, writes
     * the result file, and prints the elapsed time in minutes.
     *
     * @param filename   path of the test corpus
     * @param resultname path of the output file
     * @throws IOException if an input file cannot be read
     */
    public void pleasetest(String filename, String resultname) throws IOException {
        double start = System.currentTimeMillis();
        String test = filename;
        String model = "model.dat";
        String result = resultname;
        test(model, test, result);
        double end = System.currentTimeMillis();
        System.out.println("測試用時(min): " + (end - start) / 1000 / 60);
    }
}
/**
 * Computes tagging accuracy from a result file in which every reference line
 * is followed by the corresponding predicted line.
 */
public class eDoIt {

    /**
     * Reads the file two lines at a time (reference line, then predicted line),
     * compares the labels of their {@code word/label} tokens position by
     * position, and prints the token count, the number of matching labels and
     * the accuracy in percent.
     *
     * <p>Tokens that do not split into exactly two parts on {@code '/'} are
     * dropped from both lines before the comparison, so an untagged token or
     * a doubled space in the reference line does not shift the alignment.
     * If the two lines still differ in length, only the common prefix is
     * scored. A trailing unpaired line is ignored. When nothing was scored the
     * accuracy is reported as 0.0.
     *
     * @param filename path of the result file
     * @throws IOException if the path does not exist, is not a regular file,
     *                     or cannot be read
     */
    public void evaluation(String filename) throws IOException {
        File file = new File(filename);
        if (!file.exists()) {
            throw new IOException(file + "不存在!!!");
        }
        if (!file.isFile()) {
            throw new IOException(file + "不是文件");
        }

        int sum = 0;
        int correct = 0;
        // try-with-resources closes the reader even if a read fails.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file)))) {
            while (true) {
                String reference = br.readLine();
                if (reference == null) {
                    break;
                }
                String predicted = br.readLine();
                if (predicted == null) {
                    break;
                }

                java.util.List<String> expected = labelsOf(reference);
                java.util.List<String> actual = labelsOf(predicted);
                int pairs = Math.min(expected.size(), actual.size());
                for (int i = 0; i < pairs; i++) {
                    sum++;
                    if (actual.get(i).equals(expected.get(i))) {
                        correct++;
                    }
                }
            }
        }

        double accuracy = (sum == 0) ? 0.0 : 100.0 * correct / sum;
        System.out.println("總單詞的個數:" + sum + "\n正確標註的單詞個數:" + correct);
        System.out.println("準確率爲:" + accuracy + "%");
    }

    /** Returns the label part of every well-formed {@code word/label} token of the line, in order. */
    private static java.util.List<String> labelsOf(String line) {
        java.util.List<String> labels = new java.util.ArrayList<String>();
        for (String token : line.split(" ")) {
            String[] parts = token.split("/");
            if (parts.length == 2) {
                labels.add(parts[1]);
            }
        }
        return labels;
    }
}
/**
 * Interactive console tagger: loads the vocabularies and a trained model, then
 * repeatedly reads a space-separated sentence from standard input and prints
 * its most likely label sequence.
 */
public class dDoIt {
    /** Vocabulary of words; a word's position is its column in B. */
    List<String> wordlist;
    /** Vocabulary of labels; a label's position is its state index. */
    List<String> labellist;

    public dDoIt() {
        wordlist = new ArrayList<String>();
        labellist = new ArrayList<String>();
    }

    /**
     * Runs the interactive loop. The vocabularies come from
     * {@code wordlist.dat} and {@code labellist.dat} in the working directory.
     * Entering {@code 0}, or reaching the end of standard input, terminates
     * the JVM with exit status 0. A sentence that cannot be tagged (for
     * example one containing a word outside the vocabulary) prints a notice
     * and the loop continues.
     *
     * @param model path of the model file
     * @throws Exception if the model path does not exist, is not a regular
     *                   file, or the model or vocabulary files cannot be read
     */
    public void decode(String model) throws Exception {
        System.out.println("[MADE BY XINGLICHAO]");
        System.out.println("詞性標註系統加載中...");
        System.out.println("------------------------------------");
        for (String word : readdocument("wordlist.dat")) {
            wordlist.add(word);
        }
        for (String label : readdocument("labellist.dat")) {
            labellist.add(label);
        }

        File filemodel = new File(model);
        if (!filemodel.exists()) {
            throw new IOException(filemodel + "不存在!!!");
        }
        if (!filemodel.isFile()) {
            throw new IOException(filemodel + "不是文件!!!");
        }

        // Read the three header numbers once instead of re-reading the file per value.
        int[] header = tempreadfile(filemodel);
        int rownumpi = header[0];
        int rownumA = header[1];
        int rownumB = header[2];

        double[] pi = new double[rownumB - rownumA];
        double[][] A = new double[rownumB - rownumA][];
        double[][] B = new double[rownumB - rownumA][];

        // The model is fully loaded here, so the reader can be closed right away.
        try (BufferedReader brmodel = new BufferedReader(
                new InputStreamReader(new FileInputStream(filemodel)))) {
            int nextA = 0;
            int nextB = 0;
            String strmodel;
            for (int i = 0; (strmodel = brmodel.readLine()) != null; i++) {
                if (i >= rownumpi && i < rownumA) {
                    pi = strtodouble(strmodel.split(" "));
                } else if (i >= rownumA && i < rownumB) {
                    A[nextA++] = strtodouble(strmodel.split(" "));
                } else if (i >= rownumB) {
                    B[nextB++] = strtodouble(strmodel.split(" "));
                }
            }
        }

        // One Scanner for the whole session: a fresh Scanner per prompt can
        // drop input that the previous one had already buffered.
        Scanner console = new Scanner(System.in);
        while (true) {
            System.out.println("依次輸入句子的各個分詞並以空格分離:");
            System.out.println("[結束使用 請按 0 ]");
            System.out.println("------------------------------------");

            // Without this check, end of input makes nextLine() throw on every
            // iteration and the loop below would print the notice forever.
            if (!console.hasNextLine()) {
                quit(console);
                return;
            }
            String str = console.nextLine();
            if (str.equals("0")) {
                quit(console);
                return;
            }

            try {
                long start = System.currentTimeMillis();
                int[] labelindex = viterbi(str, pi, A, B);
                String[] strwords = str.split(" ");
                System.out.println();
                System.out.println("------------------------------------");
                System.out.println("標註結果:");
                for (int i = 0; i < labelindex.length; i++) {
                    System.out.print(strwords[i] + "/" + labellist.get(labelindex[i]) + " ");
                }
                System.out.println();
                long end = System.currentTimeMillis();
                System.out.println("\n[本次標註用時 " + (end - start) + " ms]");
                System.out.println("------------------------------------");
            } catch (Exception e) {
                // Deliberately broad: any failure on one sentence is reported
                // to the user and the session keeps running.
                System.out.println("\n你的分詞超出了個人能力範圍!!");
                System.out.println("------------------------------------");
            }
        }
    }

    /** Closes the console, prints the farewell message and terminates the JVM. */
    private void quit(Scanner console) {
        console.close();
        System.out.println();
        System.out.println("應用結束...");
        System.exit(0);
    }

    /**
     * Finds the highest-scoring label sequence for a space-separated sentence.
     *
     * <p>{@code delta[t][i]} holds the best score of any path that ends in
     * state {@code i} at position {@code t}; {@code way[t][i]} holds the
     * previous state on that path.
     *
     * @param string words separated by single spaces
     * @param pi     initial score per state
     * @param A      transition scores, {@code A[from][to]}
     * @param B      emission scores, {@code B[state][wordIndex]}
     * @return one state index per word; an empty array for empty input
     * @throws IllegalArgumentException if a word is not in {@link #wordlist}
     */
    public int[] viterbi(String string, double[] pi, double[][] A, double[][] B)
            throws IOException {
        String[] words = string.split(" ");
        if (string.isEmpty() || words.length == 0) {
            return new int[0];
        }

        int steps = words.length;
        int states = pi.length;

        // Resolve each word once; the lookup is a linear scan of the
        // vocabulary and does not depend on the state loops below.
        int[] observed = new int[steps];
        for (int t = 0; t < steps; t++) {
            int column = wordlist.indexOf(words[t]);
            if (column < 0) {
                throw new IllegalArgumentException(
                        "word \"" + words[t] + "\" is not in the vocabulary");
            }
            observed[t] = column;
        }

        double[][] delta = new double[steps][states];
        int[][] way = new int[steps][states];
        int[] labelindex = new int[steps];

        for (int i = 0; i < states; i++) {
            delta[0][i] = pi[i] * B[i][observed[0]];
        }
        for (int t = 1; t < steps; t++) {
            for (int i = 0; i < states; i++) {
                for (int j = 0; j < states; j++) {
                    double candidate = delta[t - 1][j] * A[j][i] * B[i][observed[t]];
                    if (delta[t][i] < candidate) {
                        delta[t][i] = candidate;
                        way[t][i] = j;
                    }
                }
            }
        }

        // Pick the best final state, then follow the back-pointers.
        double max = delta[steps - 1][0];
        labelindex[steps - 1] = 0;
        for (int i = 0; i < states; i++) {
            if (delta[steps - 1][i] > max) {
                max = delta[steps - 1][i];
                labelindex[steps - 1] = i;
            }
        }
        for (int t = steps - 2; t >= 0; t--) {
            labelindex[t] = way[t + 1][labelindex[t + 1]];
        }
        return labelindex;
    }

    /**
     * Reads the first line of a file and splits it on single spaces.
     *
     * @param filename path of the file
     * @return the entries of the first line; an empty array if the file is empty
     * @throws IOException if the file cannot be read
     */
    public String[] readdocument(String filename) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename)))) {
            String first = br.readLine();
            if (first == null) {
                return new String[0];
            }
            return first.split(" ");
        }
    }

    /**
     * Reads up to the first three lines of a file as integers.
     *
     * @param file file to read
     * @return a three-element array; positions without a line stay 0
     * @throws IOException           if the file cannot be read
     * @throws NumberFormatException if one of those lines is not an integer
     */
    public int[] tempreadfile(File file) throws IOException {
        int[] threenum = new int[3];
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file)))) {
            String str;
            for (int i = 0; i < 3 && (str = br.readLine()) != null; i++) {
                threenum[i] = Integer.parseInt(str);
            }
        }
        return threenum;
    }

    /**
     * Parses every string of the array as a double.
     *
     * @param strarray decimal strings
     * @return the parsed values in the same order
     */
    public double[] strtodouble(String[] strarray) {
        double[] dbs = new double[strarray.length];
        for (int i = 0; i < strarray.length; i++) {
            dbs[i] = Double.valueOf(strarray[i]);
        }
        return dbs;
    }
}
在進行test時,標註的速度慢。因爲數據較大,一次讀取全部數據進行測試,時間花費太長,因此想到將文件按行切分成多個文件,分別進行測試,最後再將得到的結果小文件整合成一個大文件用於評估。
按行切分文件 與 合併文件 點這裏
Github:https://github.com/xinglicha0/Chinese-word-tagging-system-based-on-Hmm