字符串 UTF-8 和 GBK 之間的轉換以及判斷

時間 2019-11-11

標籤字符串 utf gbk 之間轉換以及斷定简体版

原文原文鏈接

1、判斷字符串是不是 UTF-8 編碼

/**
 * Checks whether a NUL-terminated byte string is structurally valid UTF-8.
 *
 * The check follows the legacy 1-6 byte UTF-8 layout: a lead byte announces the
 * sequence length and must be followed by the matching number of continuation
 * bytes of the form 10xxxxxx. Pure ASCII input, including the empty string,
 * is accepted. Overlong encodings and surrogate code points are not rejected.
 *
 * @param str NUL-terminated input; a null pointer is reported as not UTF-8.
 * @return true if every byte belongs to a well-formed sequence, false otherwise.
 */
bool is_str_utf8(const char* str)
{
    if (!str)
    {
        return false;
    }

    // Number of continuation bytes still owed by the current multi-byte sequence.
    unsigned int pending = 0;

    for (const unsigned char* p = reinterpret_cast<const unsigned char*>(str); *p != '\0'; ++p)
    {
        const unsigned char chr = *p;

        if (pending != 0)
        {
            // Every non-lead byte of a multi-byte sequence must look like 10xxxxxx.
            if ((chr & 0xC0) != 0x80)
            {
                return false;
            }
            --pending;
            continue;
        }

        if (chr < 0x80)
        {
            // Plain ASCII: 0xxxxxxx, a complete character on its own.
            continue;
        }

        if (chr >= 0xFE)
        {
            // 0xFE and 0xFF are never lead bytes, not even in the 6-byte scheme.
            return false;
        }
        else if (chr >= 0xFC)
        {
            pending = 5;
        }
        else if (chr >= 0xF8)
        {
            pending = 4;
        }
        else if (chr >= 0xF0)
        {
            pending = 3;
        }
        else if (chr >= 0xE0)
        {
            pending = 2;
        }
        else if (chr >= 0xC0)
        {
            pending = 1;
        }
        else
        {
            // 0x80-0xBF is a continuation byte with no lead byte in front of it.
            return false;
        }
    }

    // A sequence cut short by the terminator violates the encoding rules.
    return pending == 0;
}

2、判斷字符串是不是 GBK 編碼

/**
 * Checks whether a NUL-terminated byte string is structurally valid GBK.
 *
 * GBK uses one byte for ASCII and two bytes for everything else: a lead byte in
 * 0x81-0xFE followed by a trail byte in 0x40-0x7E or 0x80-0xFE. Pure ASCII
 * input, including the empty string, is accepted.
 *
 * The test is purely structural. Byte ranges overlap with other encodings, so
 * text in another multi-byte encoding (UTF-8 included) can also pass.
 *
 * @param str NUL-terminated input; a null pointer is reported as not GBK.
 * @return true if the bytes form well-formed GBK characters, false otherwise.
 */
bool is_str_gbk(const char* str)
{
    if (!str)
    {
        return false;
    }

    // True while the previous byte was a lead byte still waiting for its trail byte.
    bool expectTrail = false;

    for (const unsigned char* p = reinterpret_cast<const unsigned char*>(str); *p != '\0'; ++p)
    {
        const unsigned char chr = *p;

        if (expectTrail)
        {
            // 0x7F is excluded from the trail range even though it lies inside 0x40-0xFE.
            if (chr < 0x40 || chr == 0x7F || chr > 0xFE)
            {
                return false;
            }
            expectTrail = false;
        }
        else if (chr >= 0x80)
        {
            // Non-ASCII byte: it has to start a double-byte character.
            if (chr < 0x81 || chr > 0xFE)
            {
                return false;
            }
            expectTrail = true;
        }
    }

    // A lead byte directly before the terminator is a truncated character.
    return !expectTrail;
}

3、字符串由 GBK 編碼轉換成 UTF-8 編碼

/**
 * Re-encodes strGBK in place: ANSI code page bytes -> UTF-16 -> UTF-8.
 *
 * The string is passed straight to MultiByteToWideChar, so this only builds
 * when CString holds narrow characters (non-Unicode/MBCS configuration).
 *
 * NOTE(review): CP_ACP selects the system ANSI code page, so the input is
 * interpreted as GBK only where that code page is 936 - confirm for the
 * deployment targets.
 *
 * @param strGBK In: text in the ANSI code page. Out: the same text as UTF-8.
 *               Left untouched if any conversion step fails.
 */
void ConvertGBKToUtf8(CString &strGBK)
{
    const char* source = static_cast<LPCTSTR>(strGBK);

    // Sizing pass: length in wchar_t units, terminator included.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, source, -1, NULL, 0);
    if (wideLen <= 0)
    {
        return;
    }

    // Self-releasing buffers so nothing leaks if the CString assignment throws.
    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, source, -1, &wide[0], wideLen) <= 0)
    {
        return;
    }

    // Sizing pass for the UTF-8 bytes, terminator included.
    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (utf8Len <= 0)
    {
        return;
    }

    std::string utf8(static_cast<size_t>(utf8Len), '\0');
    if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, NULL, NULL) <= 0)
    {
        return;
    }

    // c_str() stops at the terminator the API wrote, so no trimming is needed.
    strGBK = utf8.c_str();
}


/**
 * Converts a NUL-terminated ANSI code page string to UTF-8 via UTF-16.
 *
 * NOTE(review): CP_ACP selects the system ANSI code page, so the input is
 * interpreted as GBK only where that code page is 936 - confirm for the
 * deployment targets.
 *
 * @param strGBK NUL-terminated input text; may be null.
 * @return The UTF-8 text, or an empty string for null input or a failed conversion.
 */
string GBKToUTF8(const char* strGBK)
{
    if (!strGBK)
    {
        return std::string();
    }

    // Sizing pass: length in wchar_t units, terminator included.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, strGBK, -1, NULL, 0);
    if (wideLen <= 0)
    {
        return std::string();
    }

    // Self-releasing buffers: nothing to free on any exit path.
    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, strGBK, -1, &wide[0], wideLen) <= 0)
    {
        return std::string();
    }

    // Sizing pass for the UTF-8 bytes, terminator included.
    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (utf8Len <= 0)
    {
        return std::string();
    }

    std::string utf8(static_cast<size_t>(utf8Len), '\0');
    if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, NULL, NULL) <= 0)
    {
        return std::string();
    }

    // The reported length counts the terminator; it is not part of the result.
    utf8.resize(static_cast<size_t>(utf8Len) - 1);
    return utf8;
}

4、字符串由 UTF-8 編碼轉換成 GBK 編碼

/**
 * Converts a NUL-terminated UTF-8 string to the ANSI code page via UTF-16.
 *
 * NOTE(review): CP_ACP selects the system ANSI code page, so the output is GBK
 * only where that code page is 936 - confirm for the deployment targets.
 *
 * @param utf8 NUL-terminated UTF-8 text; may be null.
 * @return The converted text, or an empty string for null input or a failed conversion.
 */
string UtfToGbk(const char* utf8)
{
    if (!utf8)
    {
        return std::string();
    }

    // Sizing pass: length in wchar_t units, terminator included.
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
    if (wideLen <= 0)
    {
        return std::string();
    }

    // Self-releasing buffers: the output buffer used to be returned without being freed.
    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &wide[0], wideLen) <= 0)
    {
        return std::string();
    }

    // Sizing pass for the ANSI bytes, terminator included.
    const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (ansiLen <= 0)
    {
        return std::string();
    }

    std::string ansi(static_cast<size_t>(ansiLen), '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL) <= 0)
    {
        return std::string();
    }

    // The reported length counts the terminator; it is not part of the result.
    ansi.resize(static_cast<size_t>(ansiLen) - 1);
    return ansi;
}

/**
 * Converts UTF-8 text to GBK with the C locale machinery
 * (mbstowcs under "zh_CN.utf8", then wcstombs under "zh_CN.gbk").
 *
 * Both locales must be installed on the host. The process-wide locale is
 * switched during the call and put back before returning. setlocale affects
 * the whole process, so do not call this while other threads depend on the
 * locale.
 *
 * @param gbkStr Receives the GBK bytes on success; left untouched on failure.
 * @param srcStr UTF-8 input. Conversion stops at the first NUL byte.
 * @return true on success. false if a locale is unavailable, the input cannot
 *         be converted, or the input is empty (an empty source is reported as
 *         a failure, with the same message as a conversion error).
 */
bool Utf82gbk(std::string &gbkStr, std::string &srcStr)
{
    // Puts the caller's locale back on every exit path.
    struct LocaleRestorer
    {
        std::string saved;
        bool active;

        LocaleRestorer() : active(false)
        {
            const char* current = setlocale(LC_ALL, NULL);
            if (current != NULL)
            {
                // Copy now: later setlocale calls may overwrite the returned buffer.
                saved = current;
                active = true;
            }
        }

        ~LocaleRestorer()
        {
            if (active)
            {
                setlocale(LC_ALL, saved.c_str());
            }
        }
    } restoreLocale;

    // mbstowcs / wcstombs signal failure with (size_t)-1.
    const size_t kConversionFailed = static_cast<size_t>(-1);

    // Step 1: decode the UTF-8 bytes into wide characters.
    if (NULL == setlocale(LC_ALL, "zh_CN.utf8"))
    {
        printf("Bad Parameter\n");
        return false;
    }

    const size_t wideLen = mbstowcs(NULL, srcStr.c_str(), 0);
    if (wideLen == kConversionFailed || wideLen == 0)
    {
        printf("Can not Transfer!!!\n");
        return false;
    }

    // One extra slot so the terminator always fits.
    std::wstring wide(wideLen + 1, L'\0');
    mbstowcs(&wide[0], srcStr.c_str(), wideLen + 1);

    // Step 2: encode the wide characters as GBK.
    if (NULL == setlocale(LC_ALL, "zh_CN.gbk"))
    {
        printf("Bad Parameter\n");
        return false;
    }

    const size_t gbkLen = wcstombs(NULL, wide.c_str(), 0);
    if (gbkLen == kConversionFailed || gbkLen == 0)
    {
        printf("Can not Transfer!!!\n");
        return false;
    }

    // Sized from the measured length, so long inputs cannot overrun the buffer.
    std::string converted(gbkLen + 1, '\0');
    wcstombs(&converted[0], wide.c_str(), gbkLen + 1);
    converted.resize(gbkLen);

    gbkStr = converted;
    return true;
}


/**
 * Converts a UTF-8 std::string to the ANSI code page via UTF-16.
 *
 * The input is handed over as a C string, so conversion stops at the first
 * embedded NUL byte.
 *
 * NOTE(review): CP_ACP selects the system ANSI code page, so the output is GBK
 * only where that code page is 936 - confirm for the deployment targets.
 *
 * @param strUTF8 UTF-8 input text.
 * @return The converted text, or an empty string if a conversion step fails.
 */
string UTF8ToGBK(const std::string& strUTF8)
{
    const char* source = strUTF8.c_str();

    // Sizing pass: length in WCHAR units, terminator included.
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, source, -1, NULL, 0);
    if (wideLen <= 0)
    {
        return std::string();
    }

    // Self-releasing buffers: nothing leaks if building the result throws.
    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_UTF8, 0, source, -1, &wide[0], wideLen) <= 0)
    {
        return std::string();
    }

    // Sizing pass for the ANSI bytes, terminator included.
    const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (ansiLen <= 0)
    {
        return std::string();
    }

    std::string ansi(static_cast<size_t>(ansiLen), '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL) <= 0)
    {
        return std::string();
    }

    // The reported length counts the terminator; it is not part of the result.
    ansi.resize(static_cast<size_t>(ansiLen) - 1);
    return ansi;
}

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。