下面這段話轉自:https://blog.csdn.net/lightlater/article/details/6326338
關於文本文件的文件頭(BOM)
第一 ANSI文件的文件頭爲空,不須要處理;
第二 UNICODE文件(即UTF-16 LE,Windows記事本所稱的「Unicode」)的文件頭爲0xFF,0xFE共計兩個字節,讀取時須要偏移兩個字節再行讀取;
第三 UTF-8文件的文件頭爲0xEF,0xBB,0xBF共計三個字節,讀取時須要偏移三個字節後再行讀取;
1.ansi格式txt文件
// Reads the ANSI-encoded file "ansi.txt" from the current directory and prints
// its entire content to std::cout, followed by a newline.
// ANSI files carry no BOM, so the bytes are copied through unchanged.
// If the file cannot be opened, prints "open failed!" and returns.
// Requires <fstream>, <iostream>, <iterator>, <string>.
void readAnsiTXT() {
    const std::string filename = "ansi.txt";
    std::ifstream fin(filename.c_str());
    if (!fin.is_open()) {
        std::cout << "open failed!\n";
        return;  // nothing to read; do not fall through and print an empty line
    }

    // Slurp the whole stream in one go instead of appending char by char.
    const std::string msg((std::istreambuf_iterator<char>(fin)),
                          std::istreambuf_iterator<char>());
    std::cout << msg << "\n";
}
2.Unicode格式
轉載:http://www.javashuo.com/article/p-vzubwbix-nt.html
memset函數:https://baike.baidu.com/item/memset/4747579?fr=aladdin
setlocale函數:https://www.runoob.com/cprogramming/c-function-setlocale.html
// Requires <clocale>, <cstddef>, <cstdint>, <cstdlib>, <fstream>, <iostream>,
// <string>, <vector>.

// Converts a wide string to a multibyte string using the "chs" (Simplified
// Chinese, Windows naming) C locale, then restores the previous C locale.
// If "chs" is not available, setlocale leaves the current locale in place and
// the conversion runs under that locale instead.
// Returns an empty string when the text cannot be represented in the locale.
// Conversion stops at the first embedded L'\0', as wcstombs does.
std::string ws2s(const std::wstring& ws) {
    // setlocale(LC_ALL, nullptr) only queries; copy the result at once because
    // the next setlocale call may invalidate the returned pointer.
    const char* active = std::setlocale(LC_ALL, nullptr);
    const std::string savedLocale = active != nullptr ? active : "C";
    std::setlocale(LC_ALL, "chs");

    // MB_CUR_MAX is the longest multibyte sequence in the *current* locale, so
    // this buffer is large enough for any encoding (a fixed 2 bytes is not).
    std::vector<char> buffer(ws.size() * static_cast<std::size_t>(MB_CUR_MAX) + 1, '\0');
    const std::size_t written = std::wcstombs(buffer.data(), ws.c_str(), buffer.size());

    std::setlocale(LC_ALL, savedLocale.c_str());

    if (written == static_cast<std::size_t>(-1)) {
        return std::string();  // unrepresentable character
    }
    return std::string(buffer.data(), written);
}

// Reads the UTF-16 LE file "unicode.txt" from the current directory, converts
// it with ws2s() and prints the result to std::cout followed by a newline.
// A leading 0xFF 0xFE byte-order mark is skipped when present.
// If the file cannot be opened, prints "open failed!" and returns.
void readUnicodeTXT() {
    const std::string filename = "unicode.txt";
    std::ifstream fin(filename, std::ios::binary);
    if (!fin.is_open()) {
        std::cout << "open failed!\n";
        return;
    }

    // Skip the BOM only if it is really there; otherwise rewind so that the
    // first character of a BOM-less file is not lost.
    char bom[2] = {0, 0};
    fin.read(bom, 2);
    const bool hasBom = fin.gcount() == 2 &&
                        static_cast<unsigned char>(bom[0]) == 0xFF &&
                        static_cast<unsigned char>(bom[1]) == 0xFE;
    if (!hasBom) {
        fin.clear();
        fin.seekg(0, std::ios::beg);
    }

    std::wstring text;
    unsigned char raw[2];
    std::uint32_t pendingHigh = 0;  // high surrogate waiting for its partner
    // Loop on the read itself: a failed read ends the loop, so no stale code
    // unit is appended at end of file.
    while (fin.read(reinterpret_cast<char*>(raw), 2)) {
        // Assemble the little-endian code unit explicitly; this is independent
        // of sizeof(wchar_t) and of host byte order.
        std::uint32_t unit = static_cast<std::uint32_t>(raw[0]) |
                             (static_cast<std::uint32_t>(raw[1]) << 8);
        if (sizeof(wchar_t) > 2) {
            // wchar_t holds a full code point here: merge surrogate pairs.
            if (unit >= 0xD800 && unit <= 0xDBFF) {
                pendingHigh = unit;
                continue;
            }
            if (pendingHigh != 0 && unit >= 0xDC00 && unit <= 0xDFFF) {
                unit = 0x10000 + ((pendingHigh - 0xD800) << 10) + (unit - 0xDC00);
            }
            pendingHigh = 0;
        }
        text.push_back(static_cast<wchar_t>(unit));
    }

    std::cout << ws2s(text) << '\n';
}
3.UTF-8格式:
// Reads the UTF-8 file "utf8.txt" (a leading BOM is consumed by the facet) and
// prints its text to std::wcout followed by a newline.
// Line breaks are normalised to '\n' for CRLF, LF and CR files alike.
// If the file cannot be opened, only the trailing newline is printed.
// Requires <codecvt>, <fstream>, <iostream>, <locale>, <stdexcept>, <string>.
void readUtf8TXT() {
    const std::string path = "utf8.txt";

    // "chs" is the Windows name of the Simplified Chinese locale. Other
    // platforms reject it with std::runtime_error; in that case keep the
    // current global locale instead of terminating.
    std::locale consoleLocale;
    try {
        consoleLocale = std::locale("chs");
    } catch (const std::runtime_error&) {
        // fall back to the global locale already held by consoleLocale
    }
    std::wcout.imbue(consoleLocale);

    std::wstring text;
    // Binary mode: the bytes reach the UTF-8 facet untouched.
    std::wifstream wif(path, std::ios::binary);
    if (wif.is_open()) {
        // The facet is created only once the file is open and is handed
        // straight to std::locale, which owns it from here on; it must not be
        // deleted manually.
        wif.imbue(std::locale(
            wif.getloc(),
            new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>));

        std::wstring line;
        while (std::getline(wif, line)) {
            for (const wchar_t c : line) {
                // Drop stray NULs only; '?' is ordinary text and is kept.
                if (c != L'\0') {
                    text += c;
                }
            }
            // getline swallowed a '\n' unless it stopped at end of file.
            // CRLF lines still end in '\r', which becomes '\n' below, so only
            // LF-terminated lines need their break restored here.
            const bool endsWithCr = !line.empty() && line.back() == L'\r';
            if (!wif.eof() && !endsWithCr) {
                text += L'\n';
            }
        }
        wif.close();
    }

    // Turn every remaining '\r' into '\n'; a bare '\r' garbles console output.
    for (wchar_t& c : text) {
        if (c == L'\r') {
            c = L'\n';
        }
    }
    std::wcout << text << L'\n';
}
Windows下使用std::wifstream讀取Unicode文本的方法:
1 std::locale loc("chs"); //windows下ok 2 std::wcout.imbue(loc); 3 // open as a byte stream 4 std::wifstream wif("路徑", std::ios::binary); 5 std::codecvt_utf16<wchar_t, 0x10ffff, std::consume_header>* codecvtToUnicode = new std::codecvt_utf16 < wchar_t, 0x10ffff, std::consume_header >; 6 if (wif.is_open()) 7 { 8 // apply BOM-sensitive UTF-16 facet 9 wif.imbue(std::locale(wif.getloc(), codecvtToUnicode)); 10 std::wstring wline; 11 while (std::getline(wif, wline)) 12 { 13 std::wstring convert; 14 for (auto c : wline) 15 { 16 if (c != L'\0' && c != L'?') 17 convert += c; 18 } 19 wcout << convert << endl; 20 } 21 wif.close(); 22 //delete codecvtToUnicode; //new和delete,應該不用手動delete,在哪裏delete都會崩潰(親測) 23 }
Windows下使用std::wifstream讀取UTF-8文本的方法:
1 std::locale loc("chs"); //windows下ok 2 std::wcout.imbue(loc); 3 // open as a byte stream 4 std::wifstream wif("路徑", std::ios::binary); 5 std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>* codecvtToUnicode = new std::codecvt_utf8 < wchar_t, 0x10ffff, std::consume_header >; 6 if (wif.is_open()) 7 { 8 // apply BOM-sensitive UTF-8 facet 9 wif.imbue(std::locale(wif.getloc(), codecvtToUnicode)); 10 std::wstring wline; 11 while (std::getline(wif, wline)) 12 { 13 std::wstring convert; 14 for (auto c : wline) 15 { 16 if (c != L'\0' && c != L'?') 17 convert += c; 18 } 19 wcout << convert << endl; 20 } 21 wif.close(); 22 //delete codecvtToUnicode; //new和delete,應該不用手動delete,在哪裏delete都會崩潰(親測) 23 }