今晚寫代碼玩,用到了java.io.RandomAccessFile.writeUTF(String)函數,而文件默認保存爲GBK,顯然是亂碼。忽然想起來去看看存儲編碼規則,就去找了些文章瞭解writeUTF(String)的原理,在此記錄。首先需要弄明白Unicode與UTF-8的表示規則,搜到@Feng哥的一篇文章《字符編碼筆記:ASCII,Unicode和UTF-8》,寫得很明白,在此摘錄一段:
| Unicode符號範圍 | UTF-8編碼方式 |
| --- | --- |
| 0000 0000-0000 007F | 0xxxxxxx |
| 0000 0080-0000 07FF | 110xxxxx 10xxxxxx |
| 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
下面,還是以漢字"嚴"爲例,演示如何實現UTF-8編碼。已知"嚴"的Unicode是4E25(100111000100101),根據上表,可以發現4E25處在第三行的範圍內(0000 0800-0000 FFFF),因此"嚴"的UTF-8編碼需要三個字節,即格式是"1110xxxx 10xxxxxx 10xxxxxx"。然後,從"嚴"的最後一個二進制位開始,依次從後向前填入格式中的x,多出的位補0。這樣就得到了,"嚴"的UTF-8編碼是"11100100 10111000 10100101",轉換成十六進制就是E4B8A5。
也就是將4E25(100 111000 100101)依次填充到(1110xxxx 10xxxxxx 10xxxxxx)的x位置裏面!
文章中還強調的一點思想就是:Unicode是爲統一世界多種編碼問題而制定的統一編碼,而UTF-8只是Unicode的一種實現方式。
執行 `System.out.println(Integer.toHexString('嚴'));`,打印結果爲 `4e25`,即Unicode編碼。當使用RandomAccessFile.writeUTF(String)時,"嚴"以UTF-8格式寫入文件。(嚴格來說,writeUTF寫出的是Java的"modified UTF-8":字符0用兩個字節表示,而且只會產生一到三字節的編碼,不會出現上表中的四字節形式。)
下面是源碼,以及我做的一些備註,以後回憶的時候好用:
/**
 * Writes {@code str} to {@code out} as a two-byte big-endian length followed by
 * the encoded characters (the "modified UTF-8" layout of {@code DataOutput}).
 *
 * <p>Each {@code char} is encoded on its own: 0x0001-0x007F as one byte,
 * 0x0000 and 0x0080-0x07FF as two bytes, everything above 0x07FF as three bytes.
 *
 * @param str the string to write
 * @param out the destination; when it is a {@code DataOutputStream} its internal
 *            scratch buffer is reused (and grown if too small)
 * @return the total number of bytes written, i.e. encoded length + 2
 * @throws UTFDataFormatException if the encoded form is longer than 65535 bytes
 * @throws IOException if {@code out.write} fails
 */
static int writeUTF(String str, DataOutput out) throws IOException {
    int strlen = str.length();
    // Counted in a long: at up to 3 bytes per char an int counter can wrap around
    // for very long strings and slip past the 65535 check below.
    long utflen = 0;
    int c, count = 0;

    /* Work out the encoded length from the size of each char; the maximum is
       65535 bytes (64 KB) because the length is stored in two bytes. */
    for (int i = 0; i < strlen; i++) {
        c = str.charAt(i);
        if ((c >= 0x0001) && (c <= 0x007F)) {
            utflen++;
        } else if (c > 0x07FF) {
            utflen += 3;
        } else {
            utflen += 2;
        }
    }

    if (utflen > 65535) {
        throw new UTFDataFormatException(
            "encoded string too long: " + utflen + " bytes");
    }
    // Safe narrowing: the value is known to be within 0..65535 here.
    int encodedLen = (int) utflen;

    /* Obtain a byte array of suitable length; the +2 is for the two leading
       bytes that hold the encoded length. */
    byte[] bytearr = null;
    if (out instanceof DataOutputStream) {
        DataOutputStream dos = (DataOutputStream) out;
        if (dos.bytearr == null || (dos.bytearr.length < (encodedLen + 2))) {
            dos.bytearr = new byte[(encodedLen * 2) + 2];
        }
        bytearr = dos.bytearr;
    } else {
        bytearr = new byte[encodedLen + 2];
    }

    // Length prefix, high byte first.
    bytearr[count++] = (byte) ((encodedLen >>> 8) & 0xFF);
    bytearr[count++] = (byte) ((encodedLen >>> 0) & 0xFF);

    /* Fast path: leading ASCII chars are copied straight into the buffer,
       skipping the branching of the general loop below. */
    int i = 0;
    for (i = 0; i < strlen; i++) {
        c = str.charAt(i);
        if (!((c >= 0x0001) && (c <= 0x007F))) {
            break;
        }
        bytearr[count++] = (byte) c;
    }

    /* General path: handles whatever the fast path stopped at. */
    for (; i < strlen; i++) {
        c = str.charAt(i);
        if ((c >= 0x0001) && (c <= 0x007F)) {
            /* One byte: 0xxxxxxx */
            bytearr[count++] = (byte) c;
        } else if (c > 0x07FF) {
            /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
               E.g. U+4E25 (100 111000 100101) becomes
               1110_0100 10_111000 10_100101. */
            bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
            bytearr[count++] = (byte) (0x80 | ((c >>  6) & 0x3F));
            bytearr[count++] = (byte) (0x80 | ((c >>  0) & 0x3F));
        } else {
            /* Two bytes: 110xxxxx 10xxxxxx */
            bytearr[count++] = (byte) (0xC0 | ((c >>  6) & 0x1F));
            bytearr[count++] = (byte) (0x80 | ((c >>  0) & 0x3F));
        }
    }

    out.write(bytearr, 0, encodedLen + 2);
    return encodedLen + 2;
}
public final static String readUTF(DataInput in) throws IOException { int utflen = in.readUnsignedShort(); byte[] bytearr = null; char[] chararr = null; if (in instanceof DataInputStream) { DataInputStream dis = (DataInputStream)in; if (dis.bytearr.length < utflen){ dis.bytearr = new byte[utflen*2]; dis.chararr = new char[utflen*2]; } chararr = dis.chararr; bytearr = dis.bytearr; } else { bytearr = new byte[utflen]; chararr = new char[utflen]; } int c, char2, char3; int count = 0; int chararr_count=0; in.readFully(bytearr, 0, utflen); while (count < utflen) { c = (int) bytearr[count] & 0xff; if (c > 127) break; count++; chararr[chararr_count++]=(char)c; } while (count < utflen) { c = (int) bytearr[count] & 0xff; switch (c >> 4) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: /* 0xxxxxxx*/ count++; chararr[chararr_count++]=(char)c; break; case 12: case 13: /* 110x xxxx 10xx xxxx*/ count += 2; if (count > utflen) throw new UTFDataFormatException( "malformed input: partial character at end"); char2 = (int) bytearr[count-1]; if ((char2 & 0xC0) != 0x80) throw new UTFDataFormatException( "malformed input around byte " + count); chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | (char2 & 0x3F)); break; case 14: /* 1110 xxxx 10xx xxxx 10xx xxxx */ count += 3; if (count > utflen) throw new UTFDataFormatException( "malformed input: partial character at end"); char2 = (int) bytearr[count-2]; char3 = (int) bytearr[count-1]; if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) throw new UTFDataFormatException( "malformed input around byte " + (count-1)); chararr[chararr_count++]=(char)(((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | ((char3 & 0x3F) << 0)); break; default: /* 10xx xxxx, 1111 xxxx */ throw new UTFDataFormatException( "malformed input around byte " + count); } } // The number of chars produced may be less than utflen return new String(chararr, 0, chararr_count); }
一切就顯得很明瞭了。
第一次用markdown寫東西,有些語法還不太熟悉,但確實能讓人更加關注文章的內容(缺點就是丟失了一些排版性)。
使用工具:st2 + Table Editor