python爬取豆瓣登錄驗證碼

時間 2019-12-10

原文原文鏈接

先寫一個爬取圖片的方法

# -*- coding: utf-8 -*-
from urllib.request import Request
from urllib.request import urlopen

# Request headers that imitate a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}

# Example image URL.
url = 'https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality=100&size=b4000_4000&sec=1510537362&di=3f1f93bb6bf35c7724e3b5c435528187&src=http://www.zhlzw.com/UploadFiles/Article_UploadFiles/201204/20120412123921838.jpg'

def getImg(url,imgName):
    """Download the image at *url* and save it as ``./imgs/<imgName>.jpg``.

    The request is sent with the module-level ``headers`` and a
    five-second timeout.  A network failure is reported on stdout
    instead of being raised, and no file is written in that case.

    Args:
        url: Address of the image to fetch.
        imgName: File name (without extension) to use inside ``./imgs``.
    """
    # Imported locally so the function brings its own dependency along.
    import os

    req_timeout = 5
    req = Request(url=url, headers=headers)
    try:
        # The context manager closes the response even if read() fails.
        with urlopen(req, None, req_timeout) as f:
            pic = f.read()
    except OSError:
        # urllib.error.URLError (including HTTPError), socket timeouts and
        # connection resets are all OSError subclasses.  The previous
        # handler named Request.exceptions.ConnectionError, which does not
        # exist, so any failure surfaced as an AttributeError instead.
        print(u'連接失敗')
        return

    # Make sure the target directory exists before writing into it.
    os.makedirs('./imgs', exist_ok=True)
    imgPath = './imgs/%s.jpg'%(imgName)
    with open(imgPath, 'wb') as fp:
        fp.write(pic)
##再寫一個爬取豆瓣登錄頁面的代碼，並調用上述所寫的方法

import http.cookiejar
import urllib.request

from lxml import etree

from spiderImg import getImg

# Browser-like HTTP headers; handed to makeMyOpener() so the opener it
# returns carries them in its addheaders list.
head = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}
def makeMyOpener(head):
    """Build a cookie-aware URL opener that carries the given headers.

    Args:
        head: Mapping of header name to header value.

    Returns:
        An opener backed by a fresh ``CookieJar`` whose ``addheaders``
        holds one ``(name, value)`` tuple per entry of *head*.
    """
    cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
    opener = urllib.request.build_opener(cookie_handler)
    # Replace the opener's default header list with the caller's pairs.
    opener.addheaders = list(head.items())
    return opener

# Log in to Douban: fetch the login page, download its captcha image so the
# user can read it, then submit the login form with the typed answer and save
# the response page to disk.

# urlencode() lives in urllib.parse; import it explicitly instead of relying
# on urllib.request having loaded it as a side effect.
import urllib.parse

url = 'https://accounts.douban.com/login'

# The opener is built with a cookie processor, so the same instance is
# reused for the login POST further down.
oper = makeMyOpener(head)
with oper.open(url, timeout = 1000) as uop:
    data = uop.read()
html = data.decode()

# Parse the page with lxml and download the captcha image(s).
selector = etree.HTML(html)
links = selector.xpath('//img[@id="captcha_image"]/@src')
for link in links:
    print(link)
    getImg(link,'captcs')

# Read the captcha id from the hidden form field; stays 0 when none is found.
captcha_ids = selector.xpath('//input[@name="captcha-id"]/@value')
captcha_id = 0
for cid in captcha_ids:
    captcha_id = cid
    print(captcha_id)


# Ask the user to type the captcha shown in the downloaded image.
captcha=input("請輸入驗證碼：")
print(captcha)

datas = {'source':'index_nav','redir':'https://www.douban.com/','form_email':'你的帳號','form_password':'你的密碼','captcha-solution':captcha,'captcha-id':captcha_id}
data_encoded = urllib.parse.urlencode(datas).encode(encoding='utf-8')
# Same timeout as the initial GET so the POST cannot hang indefinitely.
with oper.open(url, data_encoded, timeout = 1000) as response:
    content = response.read()
html = content.decode()

# Keep the returned page for inspection.
spath = './douban.html'
with open(spath,"w",encoding='utf-8') as f:
    f.write(html)

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。