通過ZooKeeper的BinaryOutputArchive類學習UTF-8的實現

時間 2019-11-15

標籤經過 zoopkeeper binaryoutputarchive 學習 utf 實現简体版

原文原文鏈接

BinaryOutputArchive類位於org.apache.jute包中，是序列化組件中的一個類。從字面意思理解就是輸出類。這個類實現了OutputArchive接口，並且在構造函數中需要傳遞一個DataOutput接口的實現類。

在這個類中有一段代碼引起了我的注意：

 /**
     * Encodes a character sequence into UTF-8 with a hand-written encoder.
     * The original authors state this is faster than
     * {@code String.getBytes("UTF8")}.
     *
     * <p>The output is written into the buffer {@code bb} declared outside this
     * method: it is cleared on entry, replaced by a buffer of twice the capacity
     * whenever fewer than 3 bytes remain, and flipped before it is returned.
     *
     * <p>NOTE(review): every char at or above 0x800 takes the three-byte branch,
     * so each half of a surrogate pair is written as its own three-byte sequence
     * instead of the pair becoming one four-byte sequence. Confirm whether
     * callers depend on that.
     *
     * @param s the string to encode into utf8
     * @return utf8 byte sequence.
     */
    final private ByteBuffer stringToByteBuffer(CharSequence s) {
        bb.clear();
        final int len = s.length();
        for (int i = 0; i < len; i++) {
            // One char writes at most 3 bytes below, so grow when fewer than 3 remain.
            if (bb.remaining() < 3) {
                // Double the capacity.
                ByteBuffer n = ByteBuffer.allocate(bb.capacity() << 1);
                // Switch the old buffer to read mode and copy what was written so far.
                bb.flip();
                n.put(bb);
                bb = n;
            }
            char c = s.charAt(i);
            // One byte: 0xxxxxxx
            if (c < 0x80) {
                bb.put((byte) c);
            }
            // Two bytes: 110xxxxx 10xxxxxx
            else if (c < 0x800) {
                bb.put((byte) (0xc0 | (c >> 6)));
                bb.put((byte) (0x80 | (c & 0x3f)));
            } else {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                bb.put((byte) (0xe0 | (c >> 12)));
                bb.put((byte) (0x80 | ((c >> 6) & 0x3f)));
                bb.put((byte) (0x80 | (c & 0x3f)));
            }
        }
        // Switch to read mode so the caller sees exactly the encoded bytes.
        bb.flip();
        return bb;
    }

　根據代碼的註釋，這裏的實現比Java自身的String.getBytes要高效。那麼到底是怎麼樣呢？值得對比學習一番。

首先，在對比之前要先弄清楚它們在做什麼。這個方法實現了把一個字符串轉化成UTF-8編碼格式的字節數組。那麼UTF-8編碼格式是怎麼規定轉換方式的呢？大致的規則如下：

　　1.對於單字節的符號，字節的第一位設爲0，後面7位爲這個符號的Unicode碼。所以對於英語字母，UTF-8編碼和ASCII碼是相同的。

2.對於n字節的符號（n>1），第一個字節的前n位都設爲1，第n+1位設爲0，後面字節的前兩位一律設爲10。剩下的沒有提及的二進制位，全部爲這個符號的Unicode碼。

實例如下：

一個字節 0000 0000-0000 007F | 0xxxxxxx

兩個字節 0000 0080-0000 07FF | 110xxxxx 10xxxxxx

三個字節 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx

四個字節 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

　根據上面的規則再去理解代碼就方便一些了。stringToByteBuffer方法的邏輯大體是：

循環遍歷字符串中的每個字符。如果緩衝區ByteBuffer的剩餘空間不足3個字節，則把容量擴充爲原來的2倍（這裏使用了位運算 << 1）。
如果字符只需要1個字節（c < 0x80），直接保存。
如果需要2個字節（c < 0x800），先把字符的高位部分根據規則放入第一個字節中，然後把低6位放入第二個字節中。0xc0-->11000000，用它或上字符的高位，剛好滿足上面規則中110xxxxx的要求。代碼　c >> 6　讓我理解了好半天（可能是我比較笨的原因吧）。最後我發現，根據規則，除了第一個字節，其他字節都是前兩位爲10，那麼每個字節就只剩下6位有效位了。所以移位計算的位數是以6的倍數來進行的。0x80-->10000000、0x3f-->00111111。代碼　0x80 | (c & 0x3f)　剛好滿足了第二個字節的條件。
3個字節的代碼和2個字節的代碼比較類似。

理解了stringToByteBuffer這個方法之後，我們再來看看Java中的實現：

 public byte[] getBytes(String charsetName)
            throws UnsupportedEncodingException {
        if (charsetName == null) throw new NullPointerException();
        return StringCoding.encode(charsetName, value, 0, value.length);
    }

沒什麼好說的，只是調用了StringCoding的一個方法。繼續跟蹤代碼：

static byte[] encode(String charsetName, char[] ca, int off, int len)
        throws UnsupportedEncodingException
    {
        StringEncoder se = deref(encoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        if ((se == null) || !(csn.equals(se.requestedCharsetName())
                              || csn.equals(se.charsetName()))) {
            se = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null)
                    se = new StringEncoder(cs, csn);
            } catch (IllegalCharsetNameException x) {}
            if (se == null)
                throw new UnsupportedEncodingException (csn);
            set(encoder, se);
        }
        return se.encode(ca, off, len);
    }

　這段代碼的大概意思我猜是：先根據傳遞的字符集名稱去查找字符集，然後根據字符集創建一個StringEncoder類型的對象，再調用該對象的encode方法。繼續跟蹤代碼：

byte[] encode(char[] ca, int off, int len) {
            int en = scale(len, ce.maxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0)
                return ba;
            if (ce instanceof ArrayEncoder) {
                int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba);
                return safeTrim(ba, blen, cs, isTrusted);
            } else {
                ce.reset();
                ByteBuffer bb = ByteBuffer.wrap(ba);
                CharBuffer cb = CharBuffer.wrap(ca, off, len);
                try {
                    CoderResult cr = ce.encode(cb, bb, true);
                    if (!cr.isUnderflow())
                        cr.throwException();
                    cr = ce.flush(bb);
                    if (!cr.isUnderflow())
                        cr.throwException();
                } catch (CharacterCodingException x) {
                    // Substitution is always enabled,
                    // so this shouldn't happen
                    throw new Error(x);
                }
                return safeTrim(ba, bb.position(), cs, isTrusted);
            }
        }

　這段代碼判斷編碼器（ce）的類型。我寫了個測試用例 byte[] x = "a".getBytes("utf-8"); 跟蹤代碼到這裏，發現走的是ArrayEncoder這段邏輯。繼續跟蹤發現ArrayEncoder有多個實現，其中有一個UTF_8類。這個類在nio相關的包（sun.nio.cs）中。這裏的代碼只能通過反編譯來看，沒有找到源碼包：

/*     */     public int encode(char[] paramArrayOfChar, int paramInt1, int paramInt2, byte[] paramArrayOfByte)
/*     */     {
/* 627 */       int i = paramInt1 + paramInt2;
/* 628 */       int j = 0;
/* 629 */       int k = j + Math.min(paramInt2, paramArrayOfByte.length);
/* 632 */       while ((j < k) && (paramArrayOfChar[paramInt1] < '')) {
/* 633 */         paramArrayOfByte[(j++)] = ((byte)paramArrayOfChar[(paramInt1++)]);
/*     */       }
/* 635 */       while (paramInt1 < i)
/*     */       {
/* 636 */         char c = paramArrayOfChar[(paramInt1++)];
/* 637 */         if (c < '-')
/*     */         {
/* 639 */           paramArrayOfByte[(j++)] = ((byte)c);
/*     */         }
/* 640 */         else if (c < 'ࠀ')
/*     */         {
/* 642 */           paramArrayOfByte[(j++)] = ((byte)(0xC0 | c >> '\006'));
/* 643 */           paramArrayOfByte[(j++)] = ((byte)(0x80 | c & 0x3F));
/*     */         }
/* 644 */         else if (Character.isSurrogate(c))
/*     */         {
/* 645 */           if (this.sgp == null) {
/* 646 */             this.sgp = new Surrogate.Parser();
/*     */           }
/* 647 */           int m = this.sgp.parse(c, paramArrayOfChar, paramInt1 - 1, i);
/* 648 */           if (m < 0)
/*     */           {
/* 649 */             if (malformedInputAction() != CodingErrorAction.REPLACE) {
/* 650 */               return -1;
/*     */             }
/* 651 */             paramArrayOfByte[(j++)] = replacement()[0];
/*     */           }
/*     */           else
/*     */           {
/* 653 */             paramArrayOfByte[(j++)] = ((byte)(0xF0 | m >> 18));
/* 654 */             paramArrayOfByte[(j++)] = ((byte)(0x80 | m >> 12 & 0x3F));
/* 655 */             paramArrayOfByte[(j++)] = ((byte)(0x80 | m >> 6 & 0x3F));
/* 656 */             paramArrayOfByte[(j++)] = ((byte)(0x80 | m & 0x3F));
/* 657 */             paramInt1++;
/*     */           }
/*     */         }
/*     */         else
/*     */         {
/* 661 */           paramArrayOfByte[(j++)] = ((byte)(0xE0 | c >> '\f'));
/* 662 */           paramArrayOfByte[(j++)] = ((byte)(0x80 | c >> '\006' & 0x3F));
/* 663 */           paramArrayOfByte[(j++)] = ((byte)(0x80 | c & 0x3F));
/*     */         }
/*     */       }
/* 666 */       return j;
/*     */     }

　和ZooKeeper中的差不多，都是在用位運算來實現UTF-8的規則。

　對比代碼的理解如下：