def request(url, data=None, get_or_post=None, timeout=10):
    """Fetch *url* and return the response body as correctly decoded text.

    The request is sent with the module-level ``headers`` mapping.

    Args:
        url: Target URL. For a GET with ``data`` the URL-encoded query string
            is appended verbatim, so ``url`` must already end with ``?``
            (or ``&``).
        data: Optional mapping of parameters. Sent as the form body for a
            POST, or URL-encoded and appended to ``url`` for a GET.
        get_or_post: Truthy to issue a POST; falsy (the default) for a GET.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller forever.

    Returns:
        The decoded body when the server answers with HTTP 200; ``None`` for
        any other status code or when the request itself fails (the failure
        is reported on stdout).
    """
    try:
        if get_or_post:
            response = requests.post(
                url=url, data=data, headers=headers, timeout=timeout
            )
        else:
            if data:
                url = url + urlencode(data)
            response = requests.get(url=url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            return None

        # Why not simply ``response.text``: when the Content-Type header is
        # ``text/*`` without a ``charset`` (as observed for the JD search
        # page, which sends plain ``text/html``), requests sets
        # ``response.encoding`` to ISO-8859-1, while the body is really
        # UTF-8, so ``response.text`` comes back as mojibake.
        # ``response.apparent_encoding`` is detected from the body bytes
        # themselves, so decode the raw bytes with it instead.
        #
        # Decoding ``response.content`` directly (rather than re-encoding
        # ``response.text`` with ``response.encoding`` first) also works when
        # either encoding is None -- missing Content-Type header or an empty
        # body -- and never fails on characters the declared charset cannot
        # represent.
        detected = response.apparent_encoding or 'utf-8'
        return response.content.decode(detected, errors='replace')
    except RequestException:
        print('請求' + url + '出錯')
        return None
def search(keyword, page):
    """Fetch one page of JD search results for *keyword*.

    Args:
        keyword: Search term to look up.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        Whatever ``request`` returns for the search URL: the page HTML on
        success, otherwise ``None``.
    """
    query = dict(keyword=keyword, enc="utf-8", page=page)
    return request("https://search.jd.com/Search?", query)
# Fetch page 2 of the search results for the keyword and keep the HTML
# (or None on failure) in the module-level ``html`` variable.
html = search(keyword='顯卡', page=2)