mysql沒法壓縮存儲表情

原文連接:https://www.cnblogs.com/SimonHu1993/p/7573868.htmlhtml

mysql沒法壓縮存儲表情compress(str),就選擇過濾把emoji表情符號替換成□。查了unicode碼錶,作了一個過濾的工具類,使用正則表達式過濾表情字符,有更好解決方案歡迎留言java

import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
import org.apache.commons.lang.StringUtils;
/**
 *
 * @ClassName: EmojiUtils
 * @Description:過濾emoji表情
 * @author: Simon
 * @date: 2017年9月22日 上午11:11:00
 */
public class EmojiUtils {
    public final static String unicodeReg = "[" + "\u4E00-\u9FBF" + // :CJK
                                                                    // 統一表意符號
                                                                    // (CJK
                                                                    // Unified
                                                                    // Ideographs)
            "\u4DC0-\u4DFF" + // :易經六十四卦符號 (Yijing Hexagrams Symbols)
            "\u0000-\u007F" + // :C0控制符及基本拉丁文 (C0 Control and Basic Latin)
            "\u0080-\u00FF" + // :C1控制符及拉丁:補充-1 (C1 Control and Latin 1
                                // Supplement)
            "\u0100-\u017F" + // :拉丁文擴展-A (Latin Extended-A)
            "\u0180-\u024F" + // :拉丁文擴展-B (Latin Extended-B)
            "\u0250-\u02AF" + // :國際音標擴展 (IPA Extensions)
            "\u02B0-\u02FF" + // :空白修飾字母 (Spacing Modifiers)
            "\u0300-\u036F" + // :結合用讀音符號 (Combining Diacritics Marks)
            "\u0370-\u03FF" + // :希臘文及科普特文 (Greek and Coptic)
            "\u0400-\u04FF" + // :西裏爾字母 (Cyrillic)
            "\u0500-\u052F" + // :西裏爾字母補充 (Cyrillic Supplement)
            "\u0530-\u058F" + // :亞美尼亞語 (Armenian)
            "\u0590-\u05FF" + // :希伯來文 (Hebrew)
            "\u0600-\u06FF" + // :阿拉伯文 (Arabic)
            "\u0700-\u074F" + // :敘利亞文 (Syriac)
            "\u0750-\u077F" + // :阿拉伯文補充 (Arabic Supplement)
            "\u0780-\u07BF" + // :馬爾代夫語 (Thaana)
            // "\u07C0-\u077F"+//:西非書面語言 (N'Ko)
            "\u0800-\u085F" + // :阿維斯塔語及巴列維語 (Avestan and Pahlavi)
            "\u0860-\u087F" + // :Mandaic
            "\u0880-\u08AF" + // :撒馬利亞語 (Samaritan)
            "\u0900-\u097F" + // :天城文書 (Devanagari)
            "\u0980-\u09FF" + // :孟加拉語 (Bengali)
            "\u0A00-\u0A7F" + // :錫克教文 (Gurmukhi)
            "\u0A80-\u0AFF" + // :古吉拉特文 (Gujarati)
            "\u0B00-\u0B7F" + // :奧里亞文 (Oriya)
            "\u0B80-\u0BFF" + // :泰米爾文 (Tamil)
            "\u0C00-\u0C7F" + // :泰盧固文 (Telugu)
            "\u0C80-\u0CFF" + // :卡納達文 (Kannada)
            "\u0D00-\u0D7F" + // :德拉維族語 (Malayalam)
            "\u0D80-\u0DFF" + // :僧伽羅語 (Sinhala)
            "\u0E00-\u0E7F" + // :泰文 (Thai)
            "\u0E80-\u0EFF" + // :老撾文 (Lao)
            "\u0F00-\u0FFF" + // :藏文 (Tibetan)
            "\u1000-\u109F" + // :緬甸語 (Myanmar)
            "\u10A0-\u10FF" + // :格魯吉亞語 (Georgian)
            "\u1100-\u11FF" + // :朝鮮文 (Hangul Jamo)
            "\u1200-\u137F" + // :埃塞俄比亞語 (Ethiopic)
            "\u1380-\u139F" + // :埃塞俄比亞語補充 (Ethiopic Supplement)
            "\u13A0-\u13FF" + // :切羅基語 (Cherokee)
            "\u1400-\u167F" + // :統一加拿大土著語音節 (Unified Canadian Aboriginal
                                // Syllabics)
            "\u1680-\u169F" + // :歐甘字母 (Ogham)
            "\u16A0-\u16FF" + // :如尼文 (Runic)
            "\u1700-\u171F" + // :塔加拉語 (Tagalog)
            "\u1720-\u173F" + // :Hanunóo
            "\u1740-\u175F" + // :Buhid
            "\u1760-\u177F" + // :Tagbanwa
            "\u1780-\u17FF" + // :高棉語 (Khmer)
            "\u1800-\u18AF" + // :蒙古文 (Mongolian)
            "\u18B0-\u18FF" + // :Cham
            "\u1900-\u194F" + // :Limbu
            "\u1950-\u197F" + // :德宏泰語 (Tai Le)
            "\u1980-\u19DF" + // :新傣仂語 (New Tai Lue)
            "\u19E0-\u19FF" + // :高棉語記號 (Kmer Symbols)
            "\u1A00-\u1A1F" + // :Buginese
            "\u1A20-\u1A5F" + // :Batak
            "\u1A80-\u1AEF" + // :Lanna
            "\u1B00-\u1B7F" + // :巴釐語 (Balinese)
            "\u1B80-\u1BB0" + // :巽他語 (Sundanese)
            "\u1BC0-\u1BFF" + // :Pahawh Hmong
            "\u1C00-\u1C4F" + // :雷布查語(Lepcha)
            "\u1C50-\u1C7F" + // :Ol Chiki
            "\u1C80-\u1CDF" + // :曼尼普爾語 (Meithei/Manipuri)
            "\u1D00-\u1D7F" + // :語音學擴展 (Phone tic Extensions)
            "\u1D80-\u1DBF" + // :語音學擴展補充 (Phonetic Extensions Supplement)
            "\u1DC0-\u1DFF" + // 結合用讀音符號補充 (Combining Diacritics Marks
                                // Supplement)
            "\u1E00-\u1EFF" + // :拉丁文擴充附加 (Latin Extended Additional)
            "\u1F00-\u1FFF" + // :希臘語擴充 (Greek Extended)
            "\u2000-\u206F" + // :經常使用標點 (General Punctuation)
            "\u2070-\u209F" + // :上標及下標 (Superscripts and Subscripts)
            "\u20A0-\u20CF" + // :貨幣符號 (Currency Symbols)
            "\u20D0-\u20FF" + // :組合用記號 (Combining Diacritics Marks for Symbols)
            "\u2100-\u214F" + // :字母式符號 (Letterlike Symbols)
            "\u2150-\u218F" + // :數字形式 (Number Form)
            "\u2190-\u21FF" + // :箭頭 (Arrows)
            "\u2200-\u22FF" + // :數學運算符 (Mathematical Operator)
            "\u2300-\u23FF" + // :雜項工業符號 (Miscellaneous Technical)
            "\u2400-\u243F" + // :控制圖片 (Control Pictures)
            "\u2440-\u245F" + // :光學識別符 (Optical Character Recognition)
            "\u2460-\u24FF" + // :封閉式字母數字 (Enclosed Alphanumerics)
            "\u2500-\u257F" + // :製表符 (Box Drawing)
            "\u2580-\u259F" + // :方塊元素 (Block Element)
            "\u25A0-\u25FF" + // :幾何圖形 (Geometric Shapes)
            "\u2600-\u26FF" + // :雜項符號 (Miscellaneous Symbols)
            "\u2700-\u27BF" + // :印刷符號 (Dingbats)
            "\u27C0-\u27EF" + // :雜項數學符號-A (Miscellaneous Mathematical
                                // Symbols-A)
            "\u27F0-\u27FF" + // :追加箭頭-A (Supplemental Arrows-A)
            "\u2800-\u28FF" + // :盲文點字模型 (Braille Patterns)
            "\u2900-\u297F" + // :追加箭頭-B (Supplemental Arrows-B)
            "\u2980-\u29FF" + // :雜項數學符號-B (Miscellaneous Mathematical
                                // Symbols-B)
            "\u2A00-\u2AFF" + // :追加數學運算符 (Supplemental Mathematical Operator)
            "\u2B00-\u2BFF" + // :雜項符號和箭頭 (Miscellaneous Symbols and Arrows)
            "\u2C00-\u2C5F" + // :格拉哥里字母 (Glagolitic)
            "\u2C60-\u2C7F" + // :拉丁文擴展-C (Latin Extended-C)
            "\u2C80-\u2CFF" + // :古埃及語 (Coptic)
            "\u2D00-\u2D2F" + // :格魯吉亞語補充 (Georgian Supplement)
            "\u2D30-\u2D7F" + // :提非納文 (Tifinagh)
            "\u2D80-\u2DDF" + // :埃塞俄比亞語擴展 (Ethiopic Extended)
            "\u2E00-\u2E7F" + // :追加標點 (Supplemental Punctuation)
            "\u2E80-\u2EFF" + // :CJK 部首補充 (CJK Radicals Supplement)
            "\u2F00-\u2FDF" + // :康熙字典部首 (Kangxi Radicals)
            "\u2FF0-\u2FFF" + // :表意文字描述符 (Ideographic Description Characters)
            "\u3000-\u303F" + // :CJK 符號和標點 (CJK Symbols and Punctuation)
            "\u3040-\u309F" + // :日文平假名 (Hiragana)
            "\u30A0-\u30FF" + // :日文片假名 (Katakana)
            "\u3100-\u312F" + // :注音字母 (Bopomofo)
            "\u3130-\u318F" + // :朝鮮文兼容字母 (Hangul Compatibility Jamo)
            "\u3190-\u319F" + // :象形字註釋標誌 (Kanbun)
            "\u31A0-\u31BF" + // :注音字母擴展 (Bopomofo Extended)
            "\u31C0-\u31EF" + // :CJK 筆畫 (CJK Strokes)
            "\u31F0-\u31FF" + // :日文片假名語音擴展 (Katakana Phonetic Extensions)
            "\u3200-\u32FF" + // :封閉式 CJK 文字和月份 (Enclosed CJK Letters and
                                // Months)
            "\u3300-\u33FF" + // :CJK 兼容 (CJK Compatibility)
            "\u3400-\u4DBF" + // :CJK 統一表意符號擴展 A (CJK Unified Ideographs
                                // Extension A)
            "\u4DC0-\u4DFF" + // :易經六十四卦符號 (Yijing Hexagrams Symbols)
            "\u4E00-\u9FBF" + // :CJK 統一表意符號 (CJK Unified Ideographs)
            "\uA000-\uA48F" + // :彝文音節 (Yi Syllables)
            "\uA490-\uA4CF" + // :彝文字根 (Yi Radicals)
            "\uA500-\uA61F" + // :Vai
            "\uA660-\uA6FF" + // :統一加拿大土著語音節補充 (Unified Canadian Aboriginal
                                // Syllabics Supplement)
            "\uA700-\uA71F" + // :聲調修飾字母 (Modifier Tone Letters)
            "\uA720-\uA7FF" + // :拉丁文擴展-D (Latin Extended-D)
            "\uA800-\uA82F" + // :Syloti Nagri
            "\uA840-\uA87F" + // :八思巴字 (Phags-pa)
            "\uA880-\uA8DF" + // :Saurashtra
            "\uA900-\uA97F" + // :爪哇語 (Javanese)
            "\uA980-\uA9DF" + // :Chakma
            "\uAA00-\uAA3F" + // :Varang Kshiti
            "\uAA40-\uAA6F" + // :Sorang Sompeng
            "\uAA80-\uAADF" + // :Newari
            "\uAB00-\uAB5F" + // :越南傣語 (Vi?t Thái)
            "\uAB80-\uABA0" + // :Kayah Li
            "\uAC00-\uD7AF" + // :朝鮮文音節 (Hangul Syllables)
            // "\uD800-\uDBFF"+//:High-half zone of UTF-16
            // "\uDC00-\uDFFF"+//:Low-half zone of UTF-16
            "\uE000-\uF8FF" + // :自行使用區域 (Private Use Zone)
            "\uF900-\uFAFF" + // :CJK 兼容象形文字 (CJK Compatibility Ideographs)
            "\uFB00-\uFB4F" + // :字母表達形式 (Alphabetic Presentation Form)
            "\uFB50-\uFDFF" + // :阿拉伯表達形式A (Arabic Presentation Form-A)
            "\uFE00-\uFE0F" + // :變量選擇符 (Variation Selector)
            "\uFE10-\uFE1F" + // :豎排形式 (Vertical Forms)
            "\uFE20-\uFE2F" + // :組合用半符號 (Combining Half Marks)
            "\uFE30-\uFE4F" + // :CJK 兼容形式 (CJK Compatibility Forms)
            "\uFE50-\uFE6F" + // :小型變體形式 (Small Form Variants)
            "\uFE70-\uFEFF" + // :阿拉伯表達形式B (Arabic Presentation Form-B)
            "\uFF00-\uFFEF" + // :半型及全型形式 (Halfwidth and Fullwidth Form)
            "\uFFF0-\uFFFF]";// :特殊 (Specials);
 
    /**
     * 將字符串轉成unicode
     *
     * @param str
     *            待轉字符串
     * @return unicode字符串
     */
    public static String convert(String str) {
        str = (str == null ? "" : str);
        String tmp;
        StringBuffer sb = new StringBuffer(1000);
        char c;
        int i, j;
        sb.setLength(0);
        for (i = 0; i < str.length(); i++) {
            c = str.charAt(i);
            sb.append("\\u");
            j = (c >>> 8); // 取出高8位
            tmp = Integer.toHexString(j);
            if (tmp.length() == 1) {
                sb.append("0");
            }
            sb.append(tmp);
            j = (c & 0xFF); // 取出低8位
            tmp = Integer.toHexString(j);
            if (tmp.length() == 1) {
                sb.append("0");
            }
            sb.append(tmp);
 
        }
        return (new String(sb).toUpperCase());
    }
 
    /**
     * 2)unicode轉成字符串,與上述過程反向操做便可 將unicode 字符串
     *
     * @param str
     *            待轉字符串
     * @return 普通字符串
     */
    public static String revert(String str) {
        str = (str == null ? "" : str);
        if (str.indexOf("\\u") == -1)// 若是不是unicode碼則原樣返回
            return str;
 
        StringBuffer sb = new StringBuffer(1000);
 
        for (int i = 0; i < str.length() - 6;) {
            String strTemp = str.substring(i, i + 6);
            String value = strTemp.substring(2);
            int c = 0;
            for (int j = 0; j < value.length(); j++) {
                char tempChar = value.charAt(j);
                int t = 0;
                switch (tempChar) {
                case 'a':
                    t = 10;
                    break;
                case 'b':
                    t = 11;
                    break;
                case 'c':
                    t = 12;
                    break;
                case 'd':
                    t = 13;
                    break;
                case 'e':
                    t = 14;
                    break;
                case 'f':
                    t = 15;
                    break;
                default:
                    t = tempChar - 48;
                    break;
                }
 
                c += t * ((int) Math.pow(16, (value.length() - j - 1)));
            }
            sb.append((char) c);
            i = i + 6;
        }
        return sb.toString();
    }
    /**
     *
     * @author: Simon
     * @Description:傳入須要過濾的字符
     * @date: 2017年9月22日 上午11:11:33
     * @param string
     * @return
     */
    public static String emojiChange(String string) {
        // System.out.println("__________________________________");
        try {
            // System.out.println("all-string:"+string);
            // System.out.println("all-unicode:"+convert(string));
            Pattern pattern = Pattern.compile(unicodeReg);
            StringBuffer sbBuffer = new StringBuffer();
            for (int i = 0; i < string.length(); i++) {
                char c = string.charAt(i);
                String temp = String.valueOf(c);
                Matcher matcher = pattern.matcher(temp);
                if (matcher.find()) {
                    sbBuffer.append(temp);
                } else {
                    sbBuffer.append("□");
                }
                // System.out.println("temp:"+temp+";unicode:"+convert(temp));
            }
            // System.out.println("sb:"+sbBuffer.toString());
            // System.out.println("--------------------------------------");
            return sbBuffer.toString();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }
}
相關文章
相關標籤/搜索