[NLP Natural Language Processing] Computing entropy and KL divergence: recognizing Chinese characters and English words in Java, with variable-length UTF-8 character reading

Algorithm tasks:

1. Given a file, compute the relative frequency of every character in it (the relative frequency is the probability of that character: its occurrence count divided by the total number of characters), and compute the entropy of the file.

2. Given a second file, compute its character distribution in the same way, then compute the KL divergence between the character distributions of the two files.

(Entropy and KL divergence are both terms from natural language processing; only one or two formulas are involved, and they will not get in the way of understanding the code, so just try! Both formulas are written out below.)
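For reference, these are the only two formulas involved. With p(x) and q(x) the estimated probabilities of a character/word x in file 1 and file 2 respectively (the code uses the natural logarithm, so both values come out in nats):

    entropy:        H(p)      = - Σ_x p(x) · ln p(x)
    KL divergence:  KL(p ‖ q) =   Σ_x p(x) · ln( p(x) / q(x) )

KL(p ‖ q) is not symmetric, and the implementation below only sums terms for the characters/words that occur in both files.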

 

Notes:

1. The given files can be two Chinese files, two English files, or two mixed Chinese/English files. For Chinese, count characters; for English, count words.

2. Valid characters do not include spaces, line breaks, or punctuation.

3. Write the Chinese characters, the English words, and the remaining invalid characters, together with their occurrence counts, into three separate files.

4. The code is written in Java.

 

Key points of this article:

1. How to decide that a character is a Chinese character rather than ASCII, punctuation, Japanese, Arabic, and so on.

2. Understanding how Chinese characters are encoded. UTF-8 is absolutely something that will cost you a whole afternoon to figure out.

   Recommended post: http://www.cnblogs.com/chenwenbiao/archive/2011/08/11/2134503.html

3. Regular expressions. Anyone with a formal computer science background should already know them, so I will not presume to lecture on them here. (A small sketch illustrating points 1 and 2 follows below.)
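To make points 1 and 2 concrete before the full program, here is a minimal stand-alone sketch (the class and method names are only illustrative and are not part of the program below; the regular expression is the same one the program uses). It tests whether a string consists of CJK ideographs and reads off the byte length of a UTF-8 sequence from its lead byte:

import java.util.regex.Pattern;

public class Utf8Demo {
    // one or more characters from the "CJK Unified Ideographs" Unicode block
    private static final Pattern CJK = Pattern.compile("\\p{InCJK Unified Ideographs}+");

    // byte length of a UTF-8 sequence, determined by its lead byte:
    // 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4
    static int utf8Length(byte lead) {
        int v = lead & 0xFF;       // treat the byte as an unsigned value
        if (v >= 0xF0) return 4;
        if (v >= 0xE0) return 3;   // most Chinese characters are 3-byte sequences
        if (v >= 0xC0) return 2;
        return 1;                  // plain ASCII
    }

    public static void main(String[] args) {
        System.out.println(CJK.matcher("漢").matches());  // true:  a Chinese character
        System.out.println(CJK.matcher("ab").matches());  // false: English letters
        System.out.println(utf8Length((byte) 0xE6));      // 3: typical lead byte of a Chinese character
        System.out.println(utf8Length((byte) 'a'));       // 1: ASCII
    }
}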

 

The code follows. It consists of two source files: NLPFileUnit.java, which reads one file and computes its word/character counts, probabilities, and entropy, and NLPWork.java, which reads two file paths, builds an NLPFileUnit for each, and computes the KL divergence between them:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class NLPFileUnit {
    public HashMap<String, Integer> WordOccurrenceNumber;//occurrence count of each single Chinese character
    //or single English word in the file
    public HashMap<String, Float> WordProbability;//probability of each single Chinese character or English word
    public HashMap<String, Integer> Punctuations;//the punctuation (non-word tokens) screened out of the file
    public float entropy;//entropy, computed here over single Chinese characters or single English words
    private String filePath;

    //constructor
    public NLPFileUnit(String filePath) throws Exception {
        this.filePath = filePath;
        WordOccurrenceNumber = createHash(createReader(filePath));
        Punctuations = filterPunctuation(WordOccurrenceNumber);
        WordProbability = calProbability(WordOccurrenceNumber);
        this.entropy = calEntropy(this.WordProbability);

        System.out.println("all punctuations were saved at " + filePath.replace(".", "_punctuation.") + "!");
        this.saveFile(Punctuations, filePath.replace(".", "_punctuation."));
        System.out.println("all words(En & Ch) were saved at " + filePath.replace(".", "_AllWords.") + "!");
        this.saveFile(this.WordOccurrenceNumber, filePath.replace(".", "_AllWords."));
    }

    /**
     * read the English words from the file into the HashMap
     * @param hash
     * @param path
     * @throws Exception
     */
    public void getEnWords(HashMap<String, Integer> hash, String path) throws Exception {
        FileReader fr = new FileReader(path);
        BufferedReader br = new BufferedReader(fr);

        //read all lines into content
        String content = "";
        String line = null;
        while ((line = br.readLine()) != null) {
            content += line;
        }
        br.close();

        //extract the words with a regular expression
        Pattern enWordsPattern = Pattern.compile("([A-Za-z]+)");
        Matcher matcher = enWordsPattern.matcher(content);
        while (matcher.find()) {
            String word = matcher.group();
            if (hash.containsKey(word))
                hash.put(word, 1 + hash.get(word));
            else {
                hash.put(word, 1);
            }
        }
    }

    private boolean isPunctuation(String tmp) {
        //a token counts as punctuation if it is neither a Chinese character nor an English word
        final String cnregex = "\\p{InCJK Unified Ideographs}";
        final String enregex = "[A-Za-z]+";
        return !(tmp.matches(cnregex) || tmp.matches(enregex));
    }

    /**
     * judge whether the file is UTF-8 encoded by checking for the UTF-8 byte order mark (BOM);
     * UTF-8 files without a BOM are therefore rejected. Performing the check also consumes the BOM bytes.
     * @param fs
     * @return
     * @throws Exception
     */
    private boolean isUTF8(FileInputStream fs) throws Exception {
        if (fs.read() == 0xEF && fs.read() == 0xBB && fs.read() == 0xBF)//the UTF-8 BOM is the byte sequence 0xEF 0xBB 0xBF
            return true;
        return false;
    }

    /**
     * For a UTF-8 encoded character, the first byte determines how many bytes the character
     * occupies (a Chinese character usually takes three bytes, ASCII takes one).
     * @param b
     * @return
     */
    private int getlength(byte b) {
        int v = b & 0xff;//treat the byte as an unsigned value (0-255)
        // 11110xxx: a 4-byte sequence
        if (v >= 0xF0) {
            return 4;
        }
        // 1110xxxx: a 3-byte sequence
        else if (v >= 0xE0) {
            return 3;
        }
        // 110xxxxx: a 2-byte sequence
        else if (v >= 0xC0) {
            return 2;
        }
        // 0xxxxxxx: a single byte (ASCII)
        return 1;
    }

    /**
     * Read one character: the first byte tells how many bytes the character occupies
     * (for example 1110xxxx means the character takes three bytes), then the remaining
     * bytes are read and the whole sequence is decoded as UTF-8.
     * @param fs
     * @return
     * @throws Exception
     */
    private String readUnit(FileInputStream fs) throws Exception {
        byte b = (byte) fs.read();
        if (b == -1)
            return null;
        int len = getlength(b);
        byte[] units = new byte[len];
        units[0] = b;
        for (int i = 1; i < len; i++) {
            units[i] = (byte) fs.read();
        }
        String ret = new String(units, "UTF-8");
        return ret;
    }

    /**
     * read every unit (Chinese characters, letters, punctuation, ...) of the file into a HashMap,
     * then add the English words extracted by regex
     * @param inputStream
     * @return
     * @throws Exception
     */
    private HashMap<String, Integer> createHash(FileInputStream inputStream)
            throws Exception {
        HashMap<String, Integer> hash = new HashMap<String, Integer>();
        String key = null;
        while ((key = readUnit(inputStream)) != null) {
            if (hash.containsKey(key)) {
                hash.put(key, 1 + (int) hash.get(key));
            } else {
                hash.put(key, 1);
            }
        }
        inputStream.close();
        getEnWords(hash, this.filePath);
        return hash;
    }

    /**
     * open the file with a FileInputStream; returns null if the file is not UTF-8 encoded
     * (note: a null return will cause a NullPointerException in createHash, so in practice
     * only UTF-8 files with a BOM are supported)
     * @param path
     * @return
     * @throws Exception
     */
    private FileInputStream createReader(String path) throws Exception {
        FileInputStream br = new FileInputStream(path);
        if (!isUTF8(br))
            return null;
        return br;
    }

    /**
     * move the punctuation entries filtered out of (HashMap)hash into a new HashMap and return it;
     * the punctuation entries are removed from hash at the same time
     * @param hash
     * @return
     */
    private HashMap<String, Integer> filterPunctuation(
            HashMap<String, Integer> hash) {
        HashMap<String, Integer> puncs = new HashMap<String, Integer>();
        Iterator<?> iterator = hash.entrySet().iterator();

        while (iterator.hasNext()) {
            Entry<?, ?> entry = (Entry<?, ?>) iterator.next();
            String key = entry.getKey().toString();
            if (isPunctuation(key)) {
                puncs.put(key, hash.get(key));
                iterator.remove();
            }
        }
        return puncs;
    }

    /**
     * calculate the probability of each word in hash
     * @param hash
     * @return
     */
    private HashMap<String, Float> calProbability(HashMap<String, Integer> hash) {
        float count = countWords(hash);
        HashMap<String, Float> prob = new HashMap<String, Float>();
        Iterator<?> iterator = hash.entrySet().iterator();
        while (iterator.hasNext()) {
            Entry<?, ?> entry = (Entry<?, ?>) iterator.next();
            String key = entry.getKey().toString();
            prob.put(key, hash.get(key) / count);
        }
        return prob;
    }

    /**
     * save the contents of the hash into a text file at the given path
     * @param hash
     * @param path
     * @throws Exception
     */
    private void saveFile(HashMap<String, Integer> hash, String path)
            throws Exception {
        FileWriter fw = new FileWriter(path);
        fw.write(hash.toString());
        fw.close();
    }

    /**
     * calculate the total number of words counted in hash
     * @param hash
     * @return
     */
    private int countWords(HashMap<String, Integer> hash) {
        int count = 0;
        for (Entry<String, Integer> entry : hash.entrySet()) {
            count += entry.getValue();
        }
        return count;
    }

    /**
     * calculate the entropy of the characters/words (natural logarithm, so the result is in nats)
     * @param hash
     * @return
     */
    private float calEntropy(HashMap<String, Float> hash) {
        float entropy = 0;
        Iterator<Entry<String, Float>> iterator = hash.entrySet().iterator();
        while (iterator.hasNext()) {
            Entry<String, Float> entry = (Entry<String, Float>) iterator.next();
            Float prob = entry.getValue();//probability of this character/word
            entropy += 0 - (prob * Math.log(prob));//accumulate -p * ln(p)
        }
        return entropy;
    }
}

// NLPWork.java (the two public classes must be placed in separate source files)

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;

public class NLPWork {

    /**
     * calculate the KL divergence from file u1 to file u2
     * @param u1
     * @param u2
     * @return
     */
    public static float calKL(NLPFileUnit u1, NLPFileUnit u2) {
        HashMap<String, Float> hash1 = u1.WordProbability;
        HashMap<String, Float> hash2 = u2.WordProbability;
        float KLdistance = 0;
        Iterator<Entry<String, Float>> iterator = hash1.entrySet().iterator();
        while (iterator.hasNext()) {
            Entry<String, Float> entry = iterator.next();
            String key = entry.getKey().toString();

            if (hash2.containsKey(key)) {//only words that occur in both files contribute a term
                Float value1 = entry.getValue();
                Float value2 = hash2.get(key);
                KLdistance += value1 * Math.log(value1 / value2);
            }
        }
        return KLdistance;
    }

    public static void main(String[] args) throws IOException, Exception {
        //all output files are saved next to the corresponding input file
        System.out.println("Now only UTF8 encoded file is supported!!!");
        System.out.println("PLS input file 1 path:");
        BufferedReader cin = new BufferedReader(
                new InputStreamReader(System.in));
        String file1 = cin.readLine();
        System.out.println("PLS input file 2 path:");
        String file2 = cin.readLine();
        NLPFileUnit u1 = null;
        NLPFileUnit u2 = null;
        try {
            u1 = new NLPFileUnit(file1);//NLP: Natural Language Processing
            u2 = new NLPFileUnit(file2);
        }
        catch (FileNotFoundException e) {
            System.out.println("File Not Found!!");
            e.printStackTrace();
            return;
        }
        float KLdistance = calKL(u1, u2);
        System.out.println("KLdistance is :" + KLdistance);
        System.out.println("File 1 Entropy: " + u1.entropy);
        System.out.println("File 2 Entropy: " + u2.entropy);
    }
}

Results:
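Compile the two source files and run the driver class (for example: javac NLPFileUnit.java NLPWork.java, then java NLPWork). Going by the print statements in main(), a run produces console output of the following shape; the numeric values depend entirely on the two input files and are shown here only as placeholders:

Now only UTF8 encoded file is supported!!!
PLS input file 1 path:
<path to file 1>
PLS input file 2 path:
<path to file 2>
all punctuations were saved at <file 1>_punctuation.<ext>!
all words(En & Ch) were saved at <file 1>_AllWords.<ext>!
all punctuations were saved at <file 2>_punctuation.<ext>!
all words(En & Ch) were saved at <file 2>_AllWords.<ext>!
KLdistance is :<KL divergence>
File 1 Entropy: <entropy of file 1>
File 2 Entropy: <entropy of file 2>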

 
