tinyXml處理UTF-8編碼詳解——寫入和讀取

時間 2019-11-11

原文鏈接

tinyXml的特色是不對xml節點內容的具體編碼做處理，這一切都交給用戶。所以tinyXml中和字符有關的函數都只接受char*的數據類型。
例如：

// Create an element named "test".
TiXmlElement *pRoot=new TiXmlElement("test");

// The attribute value is a plain narrow string literal, so its bytes are in whatever
// encoding the compiler gave the literal (presumably the local ANSI code page such as
// GBK, per the surrounding text), not UTF-8.
pRoot->SetAttribute("name","名字");

上述代碼產生的節點，如果用TiXmlDocument的SaveFile函數直接保存，只能是ANSI的本地編碼（不管程序是不是unicode），即使TiXmlDeclaration指定爲utf-8。一種方法是輸出到TiXmlPrinter，將TiXmlPrinter.CStr()轉換到utf-8編碼的char*後保存。

char*在雙字節編碼下是一種很奇特的字符串，中文平臺下的VC的編譯器，char*可以存放GBK漢字，編譯器能正確識別字符，因爲ASCII碼的最高位爲0，而GBK雙字節字符的首字節最高位爲1。

在使用utf-8字符串時，必須樹立一個觀念：utf-8應當只在傳輸時使用，不適合作爲函數過程的處理對象。什麼是傳輸場合？網絡傳輸和文件讀寫。以文件讀寫爲例，文件以utf-8編碼存放，在讀入到內存後，應當馬上轉換爲unicode寬字符串。程序的內部處理過程中只有unicode寬字符串。直到寫入文件時，unicode寬字符串才轉換爲utf-8字符串。

utf-8字符串本身是變長字符串，並沒有特定的數據類型。它是以char*形式存放，它的byte的表現不符合任何雙字節編碼，當成雙字節編碼處理會馬上出錯。事實上，char*只是一個存放空間，用void*、unsigned char*本質上沒有區別。（假如你喜歡，甚至可以拿char*來存放unicode寬字符串，一次memcpy兩個byte就是了）。

脫離雙字節編碼（如GBK）的tinyXml使用方法是存在的。
例如上述代碼可以改成：

// Create an element named "test".
TiXmlElement *pRoot=new TiXmlElement("test");

// Convert the wide literal to UTF-8 (CP_UTF8) and keep the converted bytes in a CStringA.
CStringA UTF8Str=CW2A(L"名字",CP_UTF8);

// Pass the UTF-8 bytes as the attribute value; tinyXml stores them as-is.
pRoot->SetAttribute("name",UTF8Str);

UTF8Str變量名就是內含的char*字符串的起始指針。CW2A函數可以自己寫一個代替，並不難實現。此時可以直接調用TiXmlDocument的SaveFile函數保存爲無BOM的UTF-8文檔。要保存爲含BOM的UTF-8文檔，仍然需要TiXmlPrinter，但此時不需要對TiXmlPrinter.CStr()進行任何處理。

XmlEntityTree=new TiXmlDocument;

TiXmlDeclaration *dec=new TiXmlDeclaration("1.0","utf-8","");

XmlEntityTree->LinkEndChild(dec);

TiXmlElement *pRoot=new TiXmlElement("test");

CStringA UTF8Str=CW2A(L"名字",CP_UTF8);

pRoot->SetAttribute("name",UTF8Str);

XmlEntityTree->LinkEndChild(pRoot);

TiXmlPrinter printer;

XmlEntityTree->Accept(&printer);


char UTF8BOM[3]={'\xEF','\xBB','\xBF'};


CFile theFile;

theFile.Open(_T("test.xml"),CFile::modeCreate|CFile::modeWrite);

theFile.Write(UTF8BOM,3);

theFile.Write(printer.CStr(),strlen(printer.CStr()));

theFile.Close();

tinyXml在加載xml文檔時有一個標記，TiXmlDocument.LoadFile(TiXmlEncoding encoding);
這個標記沒多大作用，不管設爲TIXML_ENCODING_UTF8還是TIXML_ENCODING_LEGACY，讀入的節點的數據類型一樣是char*。
設爲TIXML_ENCODING_UTF8標記的唯一作用是tinyXml會自動處理文檔的BOM。

對於下面文檔，怎樣才能正確讀取到TemplateStr節點的內容？很簡單，在讀取時進行轉換就行。

<?xml version="1.0" encoding="utf-8" ?>

<config>

    <TemplateStr>中文</TemplateStr>

    <AutoFixCue>true</AutoFixCue>

    <AutoFixTTA>true</AutoFixTTA>

    <AcceptDragFLAC>true</AcceptDragFLAC>

    <AcceptDragTAK>true</AcceptDragTAK>

    <AcceptDragAPE>true</AcceptDragAPE>

</config>

TiXmlDocument *xmlfile= new TiXmlDocument(FilePath);

xmlfile->LoadFile(TIXML_ENCODING_UTF8);


TiXmlHandle hRoot(xmlfile);

TiXmlElement *pElem;

TiXmlHandle hXmlHandle(0);


//config節點

pElem=hRoot.FirstChildElement().Element();

if (!pElem) return FALSE;

if (strcmp(pElem->Value(),"config")!=0)

	return FALSE;


//TemplateStr節點

hXmlHandle=TiXmlHandle(pElem);

pElem=hXmlHandle.FirstChild("TemplateStr").Element();

if (!pElem) return FALSE;

CString TemplateStr=UTF8toUnicode(pElem->GetText());

UTF8toUnicode函數：

CString UTF8toUnicode(const char* utf8Str,UINT length)

{

	CString unicodeStr;

	unicodeStr=_T("");


	if (!utf8Str)

		return unicodeStr;


	if (length==0)

		return unicodeStr;


	//轉換

	WCHAR chr=0;

	for (UINT i=0;i<length;)

	{

		if ((0x80&utf8Str[i])==0) // ASCII

		{

			chr=utf8Str[i];

			i++;

		}

		else if((0xE0&utf8Str[i])==0xC0) // 110xxxxx 10xxxxxx

		{

			chr =(utf8Str[i+0]&0x3F)<<6;

			chr|=(utf8Str[i+1]&0x3F);

			i+=2;

		}

		else if((0xF0&utf8Str[i])==0xE0) // 1110xxxx 10xxxxxx 10xxxxxx

		{

			chr =(utf8Str[i+0]&0x1F)<<12;

			chr|=(utf8Str[i+1]&0x3F)<<6;

			chr|=(utf8Str[i+2]&0x3F);

			i+=3;

		}

		/*

		else if() // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

		{}

		else if() // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx  10xxxxxx

		{}

		else if() // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx  10xxxxxx 10xxxxxx 

		{}

		*/

		else // 不是UTF-8字符串

		{

			return unicodeStr;

		}

		unicodeStr.AppendChar(chr);

	}


	return unicodeStr;

}


CString UTF8toUnicode(const char* utf8Str)

{

	UINT theLength=strlen(utf8Str);

	return UTF8toUnicode(utf8Str,theLength);

}

strlen取char*的長度等於字節數（不含終止符），不是utf-8字串的真正字符個數。