在用 Python 處理文本的過程中,常常會遇到需要轉換文本字符集的情況,
而我們希望有一個方法,不用關心文本原本的字符集是什麼,就能直接轉換成想要的任何字符集。Python 代碼如下:
def convert_encoding(data, new_coding='UTF-8'):
    """Re-encode a byte string into ``new_coding``, auto-detecting its charset.

    The source encoding is guessed with ``chardet``; the caller does not
    need to know it in advance.

    Args:
        data: The encoded byte string to convert.
        new_coding: Name of the target encoding (default ``'UTF-8'``).

    Returns:
        ``data`` re-encoded as ``new_coding``. The input is returned
        unchanged when it is empty or when the detected encoding name
        already equals ``new_coding`` (compared case-insensitively).

    Raises:
        ValueError: If the source encoding cannot be detected.
        LookupError: If either encoding name is unknown to Python.
        UnicodeError: If the data cannot be decoded with the detected
            encoding or encoded into ``new_coding``.
    """
    if not data:
        # An empty payload gives the detector nothing to work with.
        return data

    # Imported lazily: chardet is a third-party package and is only needed
    # once there is something to detect.
    import chardet

    encoding = chardet.detect(data)['encoding']
    if encoding is None:
        raise ValueError('unable to detect the character encoding of data')

    if new_coding.upper() != encoding.upper():
        # The second argument of decode() is the error-handler name, so the
        # payload must not be passed there.
        data = data.decode(encoding).encode(new_coding)
    return data
def convert_encoding2(data, new_coding='UTF-8'):
    """Re-encode a byte string into ``new_coding``, detecting its charset with ICU.

    Same contract as the chardet-based variant, but the source encoding is
    guessed with PyICU's ``CharsetDetector``.

    Args:
        data: The encoded byte string to convert.
        new_coding: Name of the target encoding (default ``'UTF-8'``).

    Returns:
        ``data`` re-encoded as ``new_coding``. The input is returned
        unchanged when it is empty or when the detected encoding name
        already equals ``new_coding`` (compared case-insensitively).

    Raises:
        ValueError: If the source encoding cannot be detected.
        LookupError: If either encoding name is unknown to Python (ICU
            names are passed to Python's codec registry as-is).
        UnicodeError: If the data cannot be decoded with the detected
            encoding or encoded into ``new_coding``.
    """
    if not data:
        # An empty payload gives the detector nothing to work with.
        return data

    # Imported lazily: PyICU is a third-party package and is only needed
    # once there is something to detect.
    import icu

    match = icu.CharsetDetector(data).detect()
    # NOTE(review): guarding against a missing match defensively; confirm
    # how PyICU reports "no match" (None vs. exception).
    encoding = match.getName() if match is not None else None
    if not encoding:
        raise ValueError('unable to detect the character encoding of data')

    if new_coding.upper() != encoding.upper():
        # decode() works on both Python 2 str and Python 3 bytes, unlike
        # the Python 2-only unicode() builtin.
        data = data.decode(encoding).encode(new_coding)
    return data
def convert_encoding3(data, new_coding='UTF-8'):
    """Re-encode a byte string into ``new_coding``, detecting its charset with cchardet.

    Same contract as the chardet-based variant, but the source encoding is
    guessed with ``cchardet``.

    Args:
        data: The encoded byte string to convert.
        new_coding: Name of the target encoding (default ``'UTF-8'``).

    Returns:
        ``data`` re-encoded as ``new_coding``. The input is returned
        unchanged when it is empty or when the detected encoding name
        already equals ``new_coding`` (compared case-insensitively).

    Raises:
        ValueError: If the source encoding cannot be detected.
        LookupError: If either encoding name is unknown to Python.
        UnicodeError: If the data cannot be decoded with the detected
            encoding or encoded into ``new_coding``.
    """
    if not data:
        # An empty payload gives the detector nothing to work with.
        return data

    # Imported lazily: cchardet is a third-party package and is only needed
    # once there is something to detect.
    import cchardet

    encoding = cchardet.detect(data)['encoding']
    if encoding is None:
        raise ValueError('unable to detect the character encoding of data')

    if new_coding.upper() != encoding.upper():
        # The second argument of decode() is the error-handler name, so the
        # payload must not be passed there.
        data = data.decode(encoding).encode(new_coding)
    return data
此處使用方法一(convert_encoding):
# convert_encoding returns the converted bytes rather than modifying its
# argument, so each result is bound to a name.

# Convert to UTF-8
utf8_data = convert_encoding(data, 'utf-8')

# Convert to GBK
gbk_data = convert_encoding(data, 'gbk')

# Convert to GB2312 (use the gb2312 codec itself; GBK is a superset and can
# emit bytes that are not valid GB2312)
gb2312_data = convert_encoding(data, 'gb2312')