Emoji與unicode特殊字符的處理

時間 2019-11-12

原文原文鏈接

最近遇到了一個很讓人糾結的問題：emoji表情在使用的過程當中，會莫名其妙地消失，或者變成亂碼。數據庫雖然用utf8mb4來存儲，可是也出現了問題：冷備之後，重新導入數據庫時，emoji變成了不可見字符，神奇地消失了！查閱了網上的資料，沒有找到相應的解決方案，因此決定自己研究unicode並加以處理，發現了幾個主要知識點：unicode在邏輯上被分爲17個平面（Plane），每一個Plane存65536個代碼點。而java的char只有2字節(16 bit)，也就是說，它最多只能表示65536個不同的值。那麼問題來了，大於等於0x10000的這些字符怎麼處理？java用了一個比較委婉的辦法來解決：用一對代理字符（surrogate pair，即兩個char）表示一個補充平面的字符，並通過String.codePointAt()等方法以int形式返回完整的代碼點。直接看代碼吧，代碼中有註釋解釋：

package etna.core.util;

import org.eclipse.jetty.util.StringUtil;
import com.google.common.base.Strings;
import com.google.common.hash.Hashing;

/**
 * <pre>
 * Converts strings containing emoji into a visible, ASCII-only escaped form and
 * converts that escaped form back into the original characters.
 *
 * Background:
 * <b>
 * Unicode planes:
 * BMP characters can be handled with charAt(index) and counted with length().
 * Characters in other planes must be read with codePointAt(index) and counted
 * with codePointCount(0, str.length()).</b>
 *
 * Unicode is logically divided into 17 planes, each holding 65536 (= 2^16) code
 * points, although only a few planes are in use today.
 * Plane 0 (0000-FFFF): Basic Multilingual Plane, BMP.
 * Plane 1 (10000-1FFFF): Supplementary Multilingual Plane, SMP.
 * Plane 2 (20000-2FFFF): Supplementary Ideographic Plane, SIP.
 * Plane 3 (30000-3FFFF): Tertiary Ideographic Plane, TIP.
 * Planes 4 to 13 (40000-DFFFF): not yet used.
 * Plane 14 (E0000-EFFFF): Supplementary Special-purpose Plane, SSP.
 * Plane 15 (F0000-FFFFF): reserved as Private Use Area, PUA.
 * Plane 16 (100000-10FFFF): reserved as Private Use Area, PUA.
 *
 * References:
 * Wikipedia: http://en.wikipedia.org/wiki/Emoji
 * GITHUB: http://punchdrunker.github.io/iOSEmoji/
 * Miscellaneous symbols and pictographs: 1F300-1F5FF
 * Emoticons: 1F600-1F64F
 * Transport and map symbols: 1F680-1F6FF
 * Miscellaneous symbols: 2600-26FF
 * Dingbats: 2700-27BF
 * National flags: 1F100-1F1FF
 * Arrows: 2B00-2BFF 2900-297F
 * Miscellaneous technical symbols: 2300-23FF
 * Letterlike symbols: 2100-214F
 * CJK symbols: 303D 3200-32FF 2049 203C
 *  Private Use Area: E000-F8FF
 *  High Surrogates: D800..DB7F
 *  High Private Use Surrogates: DB80..DBFF
 *  Low Surrogates: DC00..DFFF  D800-DFFF E000-F8FF
 *  Punctuation: 2000-200F 2028-202F 205F 2065-206F
 *  Variation selectors (iOS specific): FE00-FE0F
 * </pre>
 *
 * <p>Escaped form: the separator character, the number of hex digits (4 to 6),
 * the letter {@code u}, a colon, then the lowercase hex code point. For example
 * U+1F600 becomes the ten characters {@code [sep]5u:1f600}, where {@code [sep]}
 * is the ampersand. This is a private format and is not interchangeable with
 * standard Unicode escapes.
 *
 * <p>Limitation: {@link #escape(String)} does not escape a literal separator
 * character in the input, so input text that already looks like a token will be
 * decoded by {@link #reverse(String)}.
 *
 * @author Daniel.Zhan
 * @version 1.0
 * @date 2015-05-20
 */
public class EmojiCharacterUtil {

    // Markers that make up an escaped token.
    private static final char UNICODE_SEPARATOR = '&';
    private static final char UNICODE_PREFIX = 'u';
    private static final char SEPARATOR = ':';

    /** Number of hex digits in the short "separator + u + hex" token form. */
    private static final int SHORT_FORM_HEX_LENGTH = 4;

    /**
     * Tells whether a code point belongs to one of the ranges this class escapes or filters.
     *
     * @param codePoint the Unicode code point to test
     * @return {@code true} if the code point is in a listed range or is at or above 0x10000
     */
    private static boolean isEmojiCharacter(int codePoint) {
        return (codePoint >= 0x2600 && codePoint <= 0x27BF) // miscellaneous symbols and dingbats
                || codePoint == 0x303D
                || codePoint == 0x2049
                || codePoint == 0x203C
                || (codePoint >= 0x2000 && codePoint <= 0x200F)
                || (codePoint >= 0x2028 && codePoint <= 0x202F)
                || codePoint == 0x205F
                || (codePoint >= 0x2065 && codePoint <= 0x206F)
                /* the four ranges above are the punctuation area */
                || (codePoint >= 0x2100 && codePoint <= 0x214F) // letterlike symbols
                || (codePoint >= 0x2300 && codePoint <= 0x23FF) // miscellaneous technical symbols
                || (codePoint >= 0x2B00 && codePoint <= 0x2BFF) // arrows A
                || (codePoint >= 0x2900 && codePoint <= 0x297F) // arrows B
                || (codePoint >= 0x3200 && codePoint <= 0x32FF) // CJK symbols
                || (codePoint >= 0xD800 && codePoint <= 0xDFFF) // surrogate area (unpaired surrogates)
                || (codePoint >= 0xE000 && codePoint <= 0xF8FF) // private use area
                || (codePoint >= 0xFE00 && codePoint <= 0xFE0F) // variation selectors
                || codePoint >= 0x10000; // supplementary planes do not fit in one char, escape them all
    }

    /**
     * Replaces every emoji-like code point in a string with its visible escaped token.
     *
     * @param src the text to escape; may be {@code null} or empty
     * @return the escaped text, or {@code null} when {@code src} is {@code null}
     */
    public static String escape(String src) {
        if (src == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder(src.length());
        int index = 0;
        while (index < src.length()) {
            int codePoint = src.codePointAt(index);
            if (isEmojiCharacter(codePoint)) {
                String hex = Integer.toHexString(codePoint);
                sb.append(UNICODE_SEPARATOR)
                        .append(hex.length())
                        .append(UNICODE_PREFIX)
                        .append(SEPARATOR)
                        .append(hex);
            } else {
                sb.appendCodePoint(codePoint);
            }
            // Step over both chars of a surrogate pair so the loop always makes progress.
            index += Character.charCount(codePoint);
        }
        return sb.toString();
    }

    /**
     * Decodes escaped tokens back into the characters they stand for.
     *
     * <p>Two token forms are recognised: the form produced by {@link #escape(String)}
     * (separator, digit count 4 to 6, {@code u}, colon, that many lowercase hex digits)
     * and a short form (separator, {@code u}, exactly four lowercase hex digits).
     * Anything that is truncated, contains a non-hex character, or names a value
     * outside the Unicode range is copied to the output unchanged.
     *
     * @param src the escaped text; may be {@code null} or empty
     * @return the decoded text, or {@code null} when {@code src} is {@code null}
     */
    public static String reverse(String src) {
        if (src == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder(src.length());
        char[] sourceChar = src.toCharArray();
        int index = 0;
        while (index < sourceChar.length) {
            if (sourceChar[index] == UNICODE_SEPARATOR) {
                int consumed = decodeToken(sourceChar, index, sb);
                if (consumed > 0) {
                    index += consumed;
                    continue;
                }
            }
            // Not a token, or a malformed one: keep the character as it is.
            sb.append(sourceChar[index]);
            index++;
        }
        return sb.toString();
    }

    /**
     * Tries to decode one token that starts at the separator found at {@code start}.
     *
     * @param chars the full input
     * @param start index of the separator character
     * @param out receives the decoded code point when the token is valid
     * @return the number of chars consumed, or 0 when no valid token starts here
     *     (in which case nothing is appended to {@code out})
     */
    private static int decodeToken(char[] chars, int start, StringBuilder out) {
        int remaining = chars.length - start;
        int headerLength;
        int hexLength;
        if (remaining >= 4
                && chars[start + 1] >= '4'
                && chars[start + 1] <= '6'
                && chars[start + 2] == UNICODE_PREFIX
                && chars[start + 3] == SEPARATOR) {
            // Private format: the digit after the separator is the hex length.
            headerLength = 4;
            hexLength = chars[start + 1] - '0';
        } else if (remaining >= 2 && chars[start + 1] == UNICODE_PREFIX) {
            // Short form: always four hex digits.
            headerLength = 2;
            hexLength = SHORT_FORM_HEX_LENGTH;
        } else {
            return 0;
        }
        int codePoint = parseLowerHex(chars, start + headerLength, hexLength);
        if (codePoint < 0 || !Character.isValidCodePoint(codePoint)) {
            return 0;
        }
        out.appendCodePoint(codePoint);
        return headerLength + hexLength;
    }

    /**
     * Parses {@code length} lowercase hex digits starting at {@code from}.
     *
     * @return the parsed value, or -1 if the range runs past the end of the array
     *     or contains a character other than 0-9 or a-f
     */
    private static int parseLowerHex(char[] chars, int from, int length) {
        if (from + length > chars.length) {
            return -1;
        }
        int value = 0;
        for (int i = from; i < from + length; i++) {
            char ch = chars[i];
            int digit;
            if (ch >= '0' && ch <= '9') {
                digit = ch - '0';
            } else if (ch >= 'a' && ch <= 'f') {
                digit = ch - 'a' + 10;
            } else {
                return -1;
            }
            value = (value << 4) | digit;
        }
        return value;
    }

    /**
     * Removes every emoji-like code point from a string.
     *
     * @param src the text to filter; may be {@code null} or empty
     * @return the text without emoji-like code points, or {@code null} when {@code src} is {@code null}
     */
    public static String filter(String src) {
        if (src == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder(src.length());
        int index = 0;
        while (index < src.length()) {
            int codePoint = src.codePointAt(index);
            if (!isEmojiCharacter(codePoint)) {
                sb.appendCodePoint(codePoint);
            }
            index += Character.charCount(codePoint);
        }
        return sb.toString();
    }
}

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。