py2默認的編碼是ascii, 而ascii只支持英文字符:
In [1]: import sys

In [2]: sys.getdefaultencoding()
Out[2]: 'ascii'

In [4]: a = '你好'

In [5]: a
Out[5]: '\xe4\xbd\xa0\xe5\xa5\xbd'  # 輸出的是UTF-8編碼後各字節的十六進制轉義(不是內存地址), 其實仍是bytes類型

In [6]: type(a)  # 類型是str, 其實就是bytes
Out[6]: str

In [7]: b = 'hello'

In [8]: b
Out[8]: 'hello'

In [9]: type(b)
Out[9]: str

In [11]: a1 = a.decode('utf-8')

In [12]: a1
Out[12]: u'\u4f60\u597d'  # 將bytes二進制按utf-8字符集解碼

In [13]: type(a1)
Out[13]: unicode  # 變成了py2獨有的unicode類型

In [16]: sys.getsizeof(a)  # bytes類型的你好
Out[16]: 43  # 對象佔用的內存字節數(含對象自身的開銷)

In [17]: sys.getsizeof(a1)  # 解碼後的 你好
Out[17]: 54

In [19]: b1 = b.decode('utf-8')

In [20]: b1
Out[20]: u'hello'

In [22]: sys.getsizeof(b)  # bytes類型佔的字節數少
Out[22]: 42

In [23]: sys.getsizeof(b1)  # unicode佔字節數多
Out[23]: 60
注意: python2的basestring和str是不一樣的, basestring是str(即bytes)和unicode兩種類型的共同抽象基類, 所以用isinstance判斷時兩種類型都算basestring, 而str就只是bytes類型.
print isinstance(u'aa', basestring)  # True
print isinstance('aa', basestring)  # True
print isinstance(u'aa', str)  # False
print isinstance('aa', str)  # True
py3的字符串(str)默認就是unicode文本, 默認的編碼是utf-8:
In [1]: import sys

In [2]: sys.getdefaultencoding()
Out[2]: 'utf-8'

In [3]: a = '你好'

In [4]: a
Out[4]: '你好'

In [5]: type(a)  # py3的str就是unicode文本, 不再是bytes
Out[5]: str

In [6]: a1 = a.encode('utf-8')

In [7]: a1
Out[7]: b'\xe4\xbd\xa0\xe5\xa5\xbd'  # 按默認編碼格式編碼後纔是bytes

In [8]: type(a1)
Out[8]: bytes