python requests模塊模擬請求的響應內容亂碼問題（源碼分析）

時間 2020-05-10

標籤：python requests 模塊 模擬請求 響應內容 亂碼問題 源碼分析　欄目：Python

原文鏈接

def request(url, data=None, get_or_post=None):
    """Send an HTTP request and return the body as correctly decoded text.

    Relies on a module-level ``headers`` mapping (defined outside this
    block) that is sent with every request.

    Args:
        url: Target URL. For a GET with ``data``, the URL-encoded ``data``
            is appended directly to ``url``, so ``url`` must already end
            with ``?`` (or ``&``).
        data: Optional mapping of parameters. Sent as the form body for a
            POST, or URL-encoded onto ``url`` for a GET.
        get_or_post: Truthy to send a POST; falsy (the default) to send a GET.

    Returns:
        The decoded response text when the status code is 200, otherwise
        ``None``. ``None`` is also returned (after printing a message) when
        the request itself raises ``RequestException``.
    """
    try:
        if get_or_post:
            response = requests.post(url=url, data=data, headers=headers)
        else:
            if data:
                url = url + urlencode(data)
            response = requests.get(url=url, headers=headers)
    except RequestException:
        print('請求' + url + '出錯')
        return None

    if response.status_code != 200:
        return None

    # Why the text has to be re-decoded (analysis of the requests source).
    #
    # Values observed for the JD search page:
    #   response.headers                -> {..., 'Content-Type': 'text/html',
    #                                       'Content-Encoding': 'gzip', ...}
    #   type(response)                  -> <class 'requests.models.Response'>
    #   type(response.text)             -> <class 'str'>
    #   response.headers['content-type'] -> 'text/html'   (no charset!)
    #   response.encoding               -> 'ISO-8859-1'
    #       the codec requests will use to decode the body
    #   response.apparent_encoding      -> 'utf-8'
    #       the codec detected from the body bytes themselves
    #   requests.utils.get_encodings_from_content(response.text) -> ['utf-8']
    #       the charset declared inside the HTML document
    #
    # HTTPAdapter.build_response() constructs the Response object and sets
    #     response.encoding = get_encoding_from_headers(response.headers)
    #
    # get_encoding_from_headers() looks at the Content-Type header:
    #     content_type = headers.get('content-type')
    #     if not content_type:
    #         return None
    #     content_type, params = cgi.parse_header(content_type)
    #     if 'charset' in params:
    #         return params['charset'].strip("'\"")
    #     if 'text' in content_type:
    #         return 'ISO-8859-1'
    # i.e. it returns the charset if one is declared; otherwise, for any
    # text/* type, it falls back to ISO-8859-1. The search page declares
    # only "text/html", so the fallback is what we get.
    #
    # Response.text then decodes the raw bytes with that encoding:
    #     content = str(self.content, encoding, errors='replace')
    # The server actually encoded the body as UTF-8, so decoding it as
    # ISO-8859-1 produces mojibake.
    #
    # The original workaround reversed the bad decode:
    #     response.text.encode(response.encoding)
    #                  .decode(response.apparent_encoding)
    # That round trip only works when response.encoding is ISO-8859-1. It
    # raises TypeError when response.encoding is None, and can lose data or
    # raise when the first decode already replaced undecodable bytes.
    #
    # Telling requests which codec to use and decoding the raw bytes once
    # gives the same result in the working case and avoids those failures;
    # Response.text itself falls back gracefully if the detected codec is
    # missing or unknown.
    response.encoding = response.apparent_encoding
    return response.text
 def search(keyword, page):
     """Fetch one page of JD.com search results for ``keyword``.

     Builds the query parameters (``keyword``, a fixed ``enc`` of "utf-8",
     and ``page``) and hands them, together with the search endpoint, to
     ``request``.

     Returns:
         Whatever ``request`` returns for that URL and parameter mapping.
     """
     query = dict(keyword=keyword, enc="utf-8", page=page)
     return request("https://search.jd.com/Search?", query)