字符串對象是「變長對象」。
Python中字符串(str)對象最重要的創建方法爲PyUnicode_DecodeUTF8Stateful,如下Python語句最終會調用到PyUnicode_DecodeUTF8Stateful:
# Two ways of producing a str object: a plain literal and an explicit
# str() call on a literal.
a = 'hello'
b = str('world')
詞法解析,最終調到PyUnicode_DecodeUTF8Stateful,調用順序如下:
// ast.c ast_for_expr =>ast_for_power =>ast_for_atom_expr =>ast_for_atom (case STRING) =>parsestrplus =>parsestr // unicodeobject.c => PyUnicode_DecodeUTF8Stateful
// unicodeobject.c PyObject * PyUnicode_DecodeUTF8Stateful(const char *s, Py_ssize_t size, const char *errors, Py_ssize_t *consumed) { _PyUnicodeWriter writer; const char *starts = s; const char *end = s + size; Py_ssize_t startinpos; Py_ssize_t endinpos; const char *errmsg = ""; PyObject *error_handler_obj = NULL; PyObject *exc = NULL; _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) *consumed = 0; _Py_RETURN_UNICODE_EMPTY(); } /* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && (unsigned char)s[0] < 128) { if (consumed) *consumed = 1; return get_latin1_char((unsigned char)s[0]); } _PyUnicodeWriter_Init(&writer); writer.min_length = size; if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) goto onError; writer.pos = ascii_decode(s, end, writer.data); s += writer.pos; while (s < end) { // ascii解碼後的size小於傳入的size } End: if (consumed) *consumed = s - starts; Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: Py_XDECREF(error_handler_obj); Py_XDECREF(exc); _PyUnicodeWriter_Dealloc(&writer); return NULL; }
可以看到:
// unicodeobject.c static PyObject *unicode_empty = NULL; #define _Py_INCREF_UNICODE_EMPTY() \ do { \ if (unicode_empty != NULL) \ Py_INCREF(unicode_empty); \ else { \ unicode_empty = PyUnicode_New(0, 0); \ if (unicode_empty != NULL) { \ Py_INCREF(unicode_empty); \ assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ } \ } \ } while (0) #define _Py_RETURN_UNICODE_EMPTY() \ do { \ _Py_INCREF_UNICODE_EMPTY(); \ return unicode_empty; \ } while (0) // PyUnicode_DecodeUTF8Stateful-> // _PyUnicodeWriter_Finish-> // unicode_result_ready static PyObject* unicode_result_ready(PyObject *unicode) { Py_ssize_t length; length = PyUnicode_GET_LENGTH(unicode); if (length == 0) { if (unicode != unicode_empty) { Py_DECREF(unicode); _Py_RETURN_UNICODE_EMPTY(); } return unicode_empty; } if (length == 1) { void *data = PyUnicode_DATA(unicode); int kind = PyUnicode_KIND(unicode); Py_UCS4 ch = PyUnicode_READ(kind, data, 0); if (ch < 256) { PyObject *latin1_char = unicode_latin1[ch]; if (latin1_char != NULL) { if (unicode != latin1_char) { Py_INCREF(latin1_char); Py_DECREF(unicode); } return latin1_char; } else { assert(_PyUnicode_CheckConsistency(unicode, 1)); Py_INCREF(unicode); unicode_latin1[ch] = unicode; return unicode; } } } assert(_PyUnicode_CheckConsistency(unicode, 1)); return unicode; }
// unicodeobject.c static PyObject *unicode_latin1[256] = {NULL}; PyObject * PyUnicode_DecodeUTF8Stateful(const char *s, Py_ssize_t size, const char *errors, Py_ssize_t *consumed) { // do sth. /* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && (unsigned char)s[0] < 128) { if (consumed) *consumed = 1; return get_latin1_char((unsigned char)s[0]); } // do sth. } static PyObject* get_latin1_char(unsigned char ch) { PyObject *unicode = unicode_latin1[ch]; if (!unicode) { unicode = PyUnicode_New(1, ch); if (!unicode) return NULL; PyUnicode_1BYTE_DATA(unicode)[0] = ch; assert(_PyUnicode_CheckConsistency(unicode, 1)); unicode_latin1[ch] = unicode; } Py_INCREF(unicode); return unicode; }
# Two names bound to equal string literals.
a = 'hello'
b = 'hello'
# Identity (not equality) check: the article reports that both names refer
# to the same object.
a is b # True
由上例可以看出Python對常量字符串做了緩存。緩存的關鍵性實現在PyUnicode_InternInPlace方法中。
// compile.c assemble =>makecode // codeobject.c =>PyCode_New =>intern_string_constants // unicodeobject.c =>PyUnicode_InternInPlace
// unicodeobject.c static PyObject *interned = NULL; void PyUnicode_InternInPlace(PyObject **p) { PyObject *s = *p; PyObject *t; #ifdef Py_DEBUG assert(s != NULL); assert(_PyUnicode_CHECK(s)); #else if (s == NULL || !PyUnicode_Check(s)) return; #endif /* If it's a subclass, we don't really know what putting it in the interned dict might do. */ if (!PyUnicode_CheckExact(s)) return; if (PyUnicode_CHECK_INTERNED(s)) return; if (interned == NULL) { interned = PyDict_New(); if (interned == NULL) { PyErr_Clear(); /* Don't leave an exception */ return; } } Py_ALLOW_RECURSION t = PyDict_SetDefault(interned, s, s); Py_END_ALLOW_RECURSION if (t == NULL) { PyErr_Clear(); return; } if (t != s) { Py_INCREF(t); Py_SETREF(*p, t); return; } /* The two references in interned are not counted by refcnt. The deallocator will take care of this */ Py_REFCNT(s) -= 2; _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; }
其中最關鍵的方法爲PyDict_SetDefault,該方法存在於字典對象dictobject.c中。如果沒有相同的key(此處爲s),則返回defaultobject(此處也爲s),否則如果有相同的key則返回對應的value。所以如果t與s不同,則說明字典中有相應的key,此時將t的計數加1,並且將之前常量字符串的對象指向t。
如此一來,常量字符串的對象地址就一致了,此時s的計數會被減1(Py_SETREF),如果s的計數爲0,則會被釋放。值得注意的是,常量字符串的對象每次仍舊會被多分配一次內存,只是如果之前有分配過,且如果這次分配的對象計數爲0,則會被釋放。
有些情況下(字符串包含0-9、a-z、A-Z、下劃線_以外的字符)不會放到字典裏,這時候可以通過sys.intern進行性能優化:
import sys

# Two separate literals containing a non-ASCII character.
a = '啊'
b = '啊'
# The article reports False here.
# NOTE(review): this presumably holds only when the statements are compiled
# separately (e.g. typed one by one at the REPL) — confirm.
a is b # False

# Explicitly interning both strings makes the two names share one object.
a = sys.intern('啊')
b = sys.intern('啊')
a is b # True
具體可以參考:memory - What does python sys.intern do, and when should it be used? - Stack Overflow
支持tp_as_number、tp_as_sequence、tp_as_mapping這三種操作。
// unicodeobject.c &unicode_as_number, /* tp_as_number */
// unicodeobject.c &unicode_as_sequence, /* tp_as_sequence */
// unicodeobject.c static PySequenceMethods unicode_as_sequence = { (lenfunc) unicode_length, /* sq_length */ PyUnicode_Concat, /* sq_concat */ (ssizeargfunc) unicode_repeat, /* sq_repeat */ (ssizeargfunc) unicode_getitem, /* sq_item */ 0, /* sq_slice */ 0, /* sq_ass_item */ 0, /* sq_ass_slice */ PyUnicode_Contains, /* sq_contains */ };
由於沒有實現PySequenceMethods中的設置方法,因此字符串不可變。
其中:
len('hello')  # illustrates the sq_length slot (unicode_length)
'hello' + 'world'  # illustrates the sq_concat slot (PyUnicode_Concat)
多個字符串相加效率低於join,join只分配一次內存;
'hello'*10  # illustrates the sq_repeat slot (unicode_repeat)
效率要高於同個字符串相加;
'h' in 'hello'  # illustrates the sq_contains slot (PyUnicode_Contains)
// unicodeobject.c &unicode_as_mapping, /* tp_as_mapping */
// unicodeobject.c static PyMappingMethods unicode_as_mapping = { (lenfunc)unicode_length, /* mp_length */ (binaryfunc)unicode_subscript, /* mp_subscript */ (objobjargproc)0, /* mp_ass_subscript */ };
其中:
test = 'hello world'
test[1]    # integer subscript
test[0:5]  # slice subscript
test[1]會走unicode_subscript方法的index分支,test[0:5]會走slice分支;
// unicodeobject.c unicode_repr, /* tp_repr */ (reprfunc) unicode_str, /* tp_str */
// unicodeobject.c (hashfunc) unicode_hash, /* tp_hash*/
// unicodeobject.c PyUnicode_RichCompare, /* tp_richcompare */
// unicodeobject.c unicode_methods, /* tp_methods */