Python 爬蟲：使用帳號、密碼和驗證碼登錄知乎網頁

時間 2019-11-25

原文鏈接

先上代碼，後分析出現的問題：

 1 #coding:utf-8
 2 import re
 3 from bs4 import BeautifulSoup
 4 import gzip
 5 import urllib.request
 6 import urllib.parse
 7 import http.cookiejar
 8 import ssl
 9 import time
10 
def get_opener(heads):
    """Build a cookie-aware URL opener that sends ``heads`` on every request.

    Args:
        heads: mapping of HTTP header name to header value.

    Returns:
        An opener built with an ``HTTPCookieProcessor`` around a fresh
        in-memory ``CookieJar``; its ``addheaders`` is the list of
        ``(name, value)`` pairs from ``heads`` in iteration order.
    """
    cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
    opener = urllib.request.build_opener(cookie_handler)
    # Replaces the opener's default header list with the caller's headers.
    opener.addheaders = list(heads.items())
    return opener
20 
def ungzip(data):
    """Return ``data`` gunzipped when it is a gzip stream, else unchanged.

    Args:
        data: raw response body as bytes.

    Returns:
        The decompressed bytes if ``data`` is valid gzip; otherwise the
        original ``data`` object, untouched.
    """
    # Local import: zlib.error is needed only for the except clause below.
    import zlib

    try:
        print("正在解壓....")
        data = gzip.decompress(data)
        print("解壓完成")
    except (OSError, EOFError, zlib.error):
        # OSError covers "not a gzip file" (gzip.BadGzipFile subclasses it),
        # EOFError a truncated stream, zlib.error a corrupt deflate payload.
        # Anything else (e.g. KeyboardInterrupt, TypeError) now propagates
        # instead of being silently swallowed by a bare ``except``.
        print("無需解壓")
    return data
29 
if __name__ == "__main__":
    # Local imports: only this script entry point needs them.
    import getpass
    import json

    # SECURITY NOTE: this disables TLS certificate verification for every
    # HTTPS request made by this process. It works around
    # CERTIFICATE_VERIFY_FAILED, but it also removes protection against
    # man-in-the-middle attacks; use it for experiments only.
    ssl._create_default_https_context = ssl._create_unverified_context

    heads = {
        "Accept": "text/html, application/xhtml+xml, */*",
        "Accept-Language": "zh-CN",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "DNT": "1",
        "Connection": "Keep-Alive",
    }
    opener = get_opener(heads)

    # Step 1: load the home page and read the value of the first hidden
    # <input>, which is sent back as the "_xsrf" form field on login.
    with opener.open("https://www.zhihu.com/") as op:
        data1 = ungzip(op.read()).decode('utf-8')
    soup = BeautifulSoup(data1, "html.parser")
    xsrf_input = soup.find("input", {'type': 'hidden'})
    if xsrf_input is None:
        # Without this guard the script died with an opaque AttributeError.
        raise SystemExit("no hidden <input> found on the home page; cannot read _xsrf")
    _xsrf = xsrf_input.get("value")

    # Step 2: ask for credentials instead of hard-coding them in the source.
    # They are requested before the captcha is fetched so the captcha is as
    # fresh as possible when it is submitted.
    phone_num = input("phone_num:")
    password = getpass.getpass("password:")

    # Step 3: download the captcha image through the same opener (same
    # cookie jar) and let the user read it from 1.gif.
    captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
    with opener.open(captcha_url) as captcha_resp:
        captchadata = captcha_resp.read()
    with open("1.gif", 'wb') as file:
        file.write(captchadata)
    yanzhengma = input("captcha:")

    # Step 4: post the login form. The "captcha_type" field must NOT be
    # included (per the original author's note).
    postdata = urllib.parse.urlencode({
        "_xsrf": _xsrf,
        "password": password,
        "phone_num": phone_num,
        "captcha": yanzhengma,
    }).encode()
    with opener.open("https://www.zhihu.com/login/phone_num", postdata) as op2:
        data = ungzip(op2.read()).decode("utf-8")
    print(data)

    # SECURITY: the response used to be passed to eval(), which executes
    # arbitrary server-supplied text and also fails on JSON literals such as
    # true/false/null. The body is JSON, so parse it as JSON.
    result = json.loads(data)
    if isinstance(result, dict) and result.get("r") == 0:
        print("登陸成功")
81

一、出現「SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)」：

Python 2.7.9 及以後的版本引入了一個新特性：

當你用 urllib.urlopen 打開一個 https 鏈接時，會驗證一次 SSL 證書。

當目標使用的是自簽名的證書時，就會報出如下錯誤消息：

urllib.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)>

處理方法（注意：這會全局關閉 SSL 證書驗證，只適合在測試時使用）：

import ssl 
ssl._create_default_https_context = ssl._create_unverified_context

二、出現驗證碼錯誤，服務器返回如下響應（驗證碼會話無效，即驗證碼已過期）：{ "r": 1, "errcode": 1991829, "data": {"captcha":"驗證碼回話無效 :(","name":"ERR_VERIFY_CAPTCHA_SESSION_INVALID"}, "msg": "驗證碼回話無效 :(" }

原因一：發給服務器的 POST 數據沒有帶驗證碼字段 "captcha"。解決辦法：在 postdata 中加入該字段：
postdata={
        "_xsrf":_xsrf,
        "password":password,
        #"captcha_type":captcha_type,#不能帶有這個字段
        "phone_num":phone_num,
        "captcha":yanzhengma
        }
原因二：驗證碼過期。解決辦法：先從 captcha_url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000) 下載驗證碼圖片保存在本地，然後人工識別，手動輸入驗證碼：

1 captcha_url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000)
2 captchadata=opener.open(captcha_url).read()
3 with open("1.gif",'wb') as file:
4       file.write(captchadata)
5 yanzhengma=input("captcha:")