【Java】如何檢測、替換4個字節的utf-8編碼(此範圍編碼包含emoji表情)

> 參考的優秀文章

一、十分鐘搞清字符集和字符編碼

二、Java中byte與16進制字符串的互相轉換

三、【異常處理】Incorrect string value: '\xF0\x90\x8D\x83...' for column... Emoji表情字符過濾的Java實現

四、Why a surrogate java regexp finds hypen-minus

 

> 如何檢測、替換4個字節的utf-8編碼(此範圍編碼包含emoji)

項目有個需求,是保存從手機端H5頁面提交的信息。

大家知道,手機端輸入法中常常有自帶的表情,其中emoji表情非常流行。如果用戶輸入emoji表情,由於有部分emoji表情是4個字節的utf-8編碼,而我們的MySQL數據庫在現有版本和編碼設置下只能保存3個字節的utf-8編碼(如要保存4個字節的utf-8編碼則需升級版本並設置另外一種編碼,即utf8mb4),保存時就會報「Incorrect string value」錯誤。相關信息可見文章《十分鐘搞清字符集和字符編碼》。

我們的需求不需要支持emoji表情,如果遇到emoji,彈出提示或過濾即可。

通過瀏覽《【異常處理】Incorrect string value: '\xF0\x90\x8D\x83...' for column... Emoji表情字符過濾的Java實現》和《Why a surrogate java regexp finds hypen-minus》,我們得知可以通過以下代碼進行替換:

// Removes every character that UTF-8 encodes in 4 bytes, plus any stray surrogate.
// NOTE(review): this relies on java.util.regex.Pattern combining an escaped high+low
// surrogate pair into one code point, so the class reads as the range
// U+10000..U+10FFFF followed by the lone-surrogate range U+D800..U+DFFF - confirm
// against the Pattern documentation for the JDK in use.
// replaceAll returns a new String; the result must be assigned to take effect.
msg.replaceAll("[\\ud800\\udc00-\\udbff\\udfff\\ud800-\\udfff]", "");

效果是OK的。

 

但是,由於能力原因,始終沒能理解上述代碼十六進制正則表達式的原理,自己寫了一段代碼來檢測、替換4個字節的utf-8編碼(但未能通過完整測試,僅用於描述大概思路)。

其中UTF-8編碼規則閱讀自《十分鐘搞清字符集和字符編碼》,字節與十六進制的轉換參考自《 Java中byte與16進制字符串的互相轉換》。

package com.nicchagil.tc.emojifilter;

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * Helpers for detecting and stripping characters whose UTF-8 encoding is 4 bytes long
 * (supplementary-plane characters, which include many emoji).
 *
 * <p>Every String-to-byte conversion in this class uses UTF-8 explicitly, so the results
 * do not depend on the platform default charset. The length of a UTF-8 sequence is
 * derived from the high nibble (first hex digit) of its lead byte via {@link #byteMap}
 * and {@link #hexMap}.
 */
public class UTF8Utils {

    /** Lower-case hex digits, indexed by nibble value. */
    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    /** First hex digit of the lead byte of a 4-byte UTF-8 sequence. */
    private static final String FOUR_BYTE_LEAD = "f";

    public static void main(String[] args) {
        String s = "琥珀蜜蠟因爲硬度很低,打磨起來非123常簡單,須要的工具也很是簡單,本身去買蜜蠟是很是不划算的,徹底能夠本身磨蜜蠟原石的樣子";
        System.out.println(UTF8Utils.bytesToHex(s.getBytes(StandardCharsets.UTF_8)));
        System.out.println(UTF8Utils.bytesToHex(UTF8Utils.remove4BytesUTF8Char(s)));
    }

    /** First hex digit of a UTF-8 lead byte -> width of the whole sequence in hex digits. */
    public static Map<String, Integer> hexMap = new HashMap<String, Integer>();

    /** First hex digit of a UTF-8 lead byte -> width of the whole sequence in bytes. */
    public static Map<String, Integer> byteMap = new HashMap<String, Integer>();

    static {
        // 0xxxxxxx: single-byte (ASCII) sequences start with hex digit 0..7.
        for (char digit = '0'; digit <= '7'; digit++) {
            register(String.valueOf(digit), 1);
        }
        register("c", 2); // 110xxxxx
        register("d", 2); // 110xxxxx
        register("e", 3); // 1110xxxx
        register("f", 4); // 11110xxx
        // 8, 9, a, b are continuation bytes and deliberately have no mapping.
    }

    /** Records the sequence width for one lead digit in both lookup tables. */
    private static void register(String leadDigit, int byteCount) {
        byteMap.put(leadDigit, byteCount);
        hexMap.put(leadDigit, byteCount * 2);
    }

    /** Returns the first (high-nibble) hex digit of the given byte as a one-character string. */
    private static String leadDigit(byte b) {
        return String.valueOf(HEX_DIGITS[(b & 0xFF) >>> 4]);
    }

    /** Appends the two-digit, zero-padded, lower-case hex form of {@code b} to {@code sb}. */
    private static StringBuilder appendHex(StringBuilder sb, byte b) {
        int value = b & 0xFF;
        return sb.append(HEX_DIGITS[value >>> 4]).append(HEX_DIGITS[value & 0x0F]);
    }

    /**
     * Tells whether the string contains a character encoded as 4 bytes in UTF-8.
     * This variant converts the whole string to hex first and walks the hex digits.
     *
     * @param s the string to inspect; may be {@code null}
     * @return {@code true} if a 4-byte character is found; {@code false} otherwise,
     *         including for {@code null}, blank input, or an unmapped lead digit
     */
    public static boolean contains4BytesChar(String s) {
        if (s == null || s.trim().length() == 0) {
            return false;
        }

        String hex = UTF8Utils.bytesToHex(s.getBytes(StandardCharsets.UTF_8));
        if (hex == null) {
            return false;
        }

        // Walk with an index instead of re-slicing the remaining hex on every step.
        int index = 0;
        while (index < hex.length()) {
            String firstChar = hex.substring(index, index + 1);

            if (FOUR_BYTE_LEAD.equals(firstChar)) {
                return true;
            }

            Integer width = hexMap.get(firstChar);
            if (width == null) {
                System.out.println("fail to check whether contains 4 bytes char(no firstchar mapping), default return false.");
                // todo, throw exception for this case
                return false;
            }

            index += width;
        }

        return false;
    }

    /**
     * Tells whether the string contains a character encoded as 4 bytes in UTF-8.
     * This variant walks the encoded bytes directly.
     *
     * @param s the string to inspect; may be {@code null}
     * @return {@code true} if a 4-byte character is found; {@code false} otherwise,
     *         including for {@code null}, blank input, or an unmapped lead byte
     */
    public static boolean contains4BytesChar2(String s) {
        if (s == null || s.trim().length() == 0) {
            return false;
        }

        byte[] bytes = s.getBytes(StandardCharsets.UTF_8);

        int index = 0;
        while (index < bytes.length) {
            String firstChar = leadDigit(bytes[index]);

            if (FOUR_BYTE_LEAD.equals(firstChar)) {
                return true;
            }

            Integer step = byteMap.get(firstChar);
            if (step == null) {
                System.out.println("fail to check whether contains 4 bytes char(no firstchar mapping), default return false.");
                // todo, throw exception for this case
                return false;
            }

            index += step;
        }

        return false;
    }

    /**
     * Encodes the string as UTF-8 and drops every 4-byte sequence.
     *
     * @param s the string to filter; may be {@code null}
     * @return the UTF-8 bytes of {@code s} without any 4-byte sequences; an empty array
     *         for {@code null}; {@code null} if a lead byte has no mapping or a sequence
     *         is truncated
     */
    public static byte[] remove4BytesUTF8Char(String s) {
        if (s == null) {
            return new byte[0];
        }

        byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
        byte[] kept = new byte[bytes.length];
        int keptLength = 0;

        int i = 0;
        while (i < bytes.length) {
            String firstChar = leadDigit(bytes[i]);

            Integer step = byteMap.get(firstChar);
            if (step == null) {
                System.out.println("fail to remove 4 bytes char(no firstchar mapping), default return null.");
                // todo, throw exception for this case
                return null;
            }

            // Guard against a sequence that claims more bytes than are left.
            if (i + step > bytes.length) {
                System.out.println("fail to remove 4 bytes char(truncated sequence), default return null.");
                // todo, throw exception for this case
                return null;
            }

            if (!FOUR_BYTE_LEAD.equals(firstChar)) {
                System.arraycopy(bytes, i, kept, keptLength, step);
                keptLength += step;
            }
            i += step;
        }

        return Arrays.copyOf(kept, keptLength);
    }

    /**
     * Converts the string to UTF-8 hex and inserts a space after the hex of each character,
     * so the encoding of every character can be read separately.
     *
     * @param s the string to format; may be {@code null}
     * @return the hex groups, each followed by one space; an empty string for {@code null},
     *         blank input, an unmapped lead digit or a truncated sequence
     */
    public static String splitForReading(String s) {
        if (s == null || s.trim().length() == 0) {
            return "";
        }

        String hex = UTF8Utils.bytesToHex(s.getBytes(StandardCharsets.UTF_8));

        if (hex == null || hex.length() == 0) {
            System.out.println("fail to translate the bytes to hex.");
            // todo, throw exception for this case
            return "";
        }

        StringBuilder sb = new StringBuilder(hex.length() + hex.length() / 2);
        int index = 0;
        while (index < hex.length()) {
            Integer width = hexMap.get(hex.substring(index, index + 1));

            if (width == null) {
                System.out.println("fail to split hex for reading(no firstchar mapping), default return empty string.");
                // todo, throw exception for this case
                return "";
            }

            int end = index + width;
            if (end > hex.length()) {
                System.out.println("fail to split hex for reading(truncated sequence), default return empty string.");
                // todo, throw exception for this case
                return "";
            }

            sb.append(hex, index, end).append(' ');
            index = end;
        }

        return sb.toString();
    }

    /**
     * Converts a byte array to a lower-case hex string, two digits per byte.
     *
     * @param bytes the bytes to convert; may be {@code null}
     * @return the hex string, or {@code null} if {@code bytes} is {@code null} or empty
     */
    public static String bytesToHex(byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return null;
        }

        StringBuilder sb = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            appendHex(sb, b);
        }
        return sb.toString();
    }

    /**
     * Converts one byte to its two-digit, zero-padded, lower-case hex form.
     *
     * @param b the byte to convert
     * @return the two-character hex string
     */
    public static String byteToHex(byte b) {
        return appendHex(new StringBuilder(2), b).toString();
    }

}
View Code

 

再隨便看下各種字符的UTF-8編碼是什麼:

package com.nicchagil.tc.emojifilter;

public class UTF8HexTester {
    
    public static void main(String[] args) {
        String s = "1";
        System.out.println("the hex of 「" + s + "」 : " + UTF8Utils.bytesToHex(s.getBytes()));
        
        s = "a";
        System.out.println("the hex of 「" + s + "」 : " + UTF8Utils.bytesToHex(s.getBytes()));
        
        s = "我";
        System.out.println("the hex of 「" + s + "」 : " + UTF8Utils.bytesToHex(s.getBytes()));
        
        s = "我很帥";
        System.out.println("the hex of 「" + s + "」 : " + UTF8Utils.bytesToHex(s.getBytes()));
    }

}
View Code

日誌:

the hex of 「1」 : 31
the hex of 「a」 : 61
the hex of 「我」 : e68891
the hex of 「我很帥」 : e68891e5be88e5b885
View Code

 

 

> 搭建一個測試渠道來測試

由於emoji表情在PC不易輸入,最好的輸入途徑始終在手機上,那麼我們搭一個簡單的web程序來接收emoji表情吧~

<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Emoji</title>
<!-- Loaded inside <head>: a <script> element between </head> and <body> is not valid HTML. -->
<script type="text/javascript" src="https://code.jquery.com/jquery-1.12.3.min.js"></script>
</head>
<body>

<!-- Test form: the same "msg" field can be sent either via AJAX or via a normal form submit. -->
<form id="myform" action="http://192.168.1.3:8080/emoji/EmojiFilterServlet" >

    Input parameter : 
    <input type='text' name='msg' />
    <br/>

    <input type='button' value=' ajax submit ' onclick="save();" />
    <input type='submit' value=' form submit ' />
</form>

<!-- Kept inside <body>: content after </body> is not valid HTML. -->
<script type="text/javascript">

/**
 * Sends the serialized form fields to the servlet with an AJAX POST.
 * The target URL is read from the form's "action" attribute so it is defined in one place.
 * The response body is shown in an alert; a failed request is reported as well.
 */
function save() {
    var form = $('#myform');

    $.ajax({
        type : "POST",
        url : form.attr('action'),
        data : form.serialize(),
        success : function(d) {
            alert(d);
        },
        error : function(xhr, status, err) {
            alert('submit failed: ' + status + ' ' + err);
        }
    });
}

</script>

</body>
</html>
View Code

 

package com.nicchagil.tc.emojifilter;

import java.io.IOException;
import java.nio.charset.Charset;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Servlet implementation class EmojiFilterServlet
 */
public class EmojiFilterServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public EmojiFilterServlet() {
        super();
        // TODO Auto-generated constructor stub
    }

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        this.doPost(request, response);
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        String msg = request.getParameter("msg");
        System.out.println("msg -> " + msg);
    }

}
View Code
相關文章
相關標籤/搜索