在利用 Scrapy 框架爬取各類網站時,一定會碰到某些網站需要登錄才能獲取信息。
這兩天也在學習怎麼去模擬登錄,通過自己寫的代碼和借鑑別人的項目,成功調試了豆瓣的模擬登錄,順便解決了如何自動化地處理驗證碼。
python
驗證碼一般都是通過打碼平臺處理的,當然你也可以用機器學習的知識去識別驗證碼。後期我想自己做一個基於機器學習識別驗證碼的API,針對主流網站進行訓練,方便自己調用。(還不知道能不能做出來呢,走一步看一步咯!)
1、實現登錄豆瓣的關鍵點
相關代碼已經調試成功----2017-4-5
目標網站:豆瓣網
實現:模擬登錄豆瓣,處理驗證碼,登錄到個人主頁就算是success
數據:沒有抓取數據,此實戰主要是學習模擬登錄和處理驗證碼。若需要抓取數據,編寫相關的抓取規則即可抓取內容。
登錄成功展示如圖:
我在這裏貼出主要代碼,完整代碼請移步我的github:https://github.com/pujinxiao/douban_login
spiders文件夾中DouBan.py主要代碼如下:
# -*- coding: utf-8 -*-
import re
import urllib

import scrapy
from scrapy.http import Request, FormRequest

import ruokuai


class DoubanSpider(scrapy.Spider):
    """Log in to douban.com, solving the login captcha when one is shown.

    The spider scrapes no data: it requests the login page, submits the
    login form (with a captcha solution when required) and reports whether
    the login succeeded.
    """

    name = "DouBan"
    allowed_domains = ["douban.com"]
    # start_urls is not used; start_requests() issues the first request.
    # start_urls = ['http://douban.com/']

    # Browser-like User-Agent sent with the login form submission.
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}

    def start_requests(self):
        """Request the login page, tagging the request with cookie jar 1."""
        url = 'https://www.douban.com/accounts/login'
        # The "cookiejar" meta key is an identifier; different identifiers
        # let one spider keep several separate cookie sessions.
        return [Request(url=url, meta={"cookiejar": 1}, callback=self.parse)]

    def parse(self, response):
        """Fill in and submit the login form found in *response*.

        Returns a one-element list holding the login ``FormRequest``, or an
        empty list when a captcha is shown but could not be solved.
        """
        # URL(s) of the captcha image; empty when the page shows no captcha.
        captcha = response.xpath('//*[@id="captcha_image"]/@src').extract()
        print(captcha)

        # NOTE(review): credentials are hard-coded sample values; replace
        # them with your own account before running.
        data = {
            "form_email": "weisuen007@163.com",
            "form_password": "weijc7789",
            # Optional redirect target after login, e.g. the profile page:
            # "redir": "https://www.douban.com/people/151968962/",
        }

        if captcha:
            # A captcha is present.
            #
            # Manual alternative: download the image and type the answer.
            # urllib.urlretrieve(captcha[0], filename="C:/Users/pujinxiao/Desktop/learn/douban20170405/douban/douban/spiders/captcha.png")
            # captcha_value = raw_input('Open captcha.png and type the captcha: ')

            # Solve the captcha through the ruokuai platform. The captcha is
            # an arbitrary-length string of letters, so the success rate is
            # fairly low.
            captcha_xml = ruokuai.get_captcha(captcha[0])
            matches = re.findall(r'<Result>(.*?)</Result>', captcha_xml or '')
            if not matches:
                # The service answered without a <Result> element (error
                # reply or empty response): do not submit a doomed form.
                print('驗證碼識別失敗,請重試!')
                return []
            captcha_value = matches[0]
            print('驗證碼爲: %s' % captcha_value)
            data["captcha-solution"] = captcha_value
        else:
            # No captcha on the page.
            print('無驗證碼')

        print('正在登錄中......')
        # FormRequest.from_response() pre-fills the form found in the page
        # and overrides the fields given in formdata.
        return [
            FormRequest.from_response(
                response,
                meta={"cookiejar": response.meta["cookiejar"]},
                headers=self.header,
                formdata=data,
                callback=self.get_content,
            )
        ]

    def get_content(self, response):
        """Report whether the login succeeded, judged by the page title."""
        titles = response.xpath('//title/text()').extract()
        title = titles[0] if titles else u''
        # A page without a title cannot be confirmed as logged in, so it is
        # reported as a failure instead of raising IndexError.
        # NOTE(review): confirm this literal matches the title actually
        # served by the Douban login page.
        if not title or u'登陸豆瓣' in title:
            print('登陸失敗,請重試!')
        else:
            print('登陸成功')
            # Follow-up crawling can continue from here.
ruokuai.py代碼如下:
我所用的是若快打碼平臺,選擇url方式識別驗證碼:直接把驗證碼圖片的鏈接地址傳給打碼平臺,平臺傳回驗證碼的值。
# -*- coding: utf-8 -*-
"""Small client for the ruokuai captcha-solving HTTP API."""
import sys
import hashlib
import os
import random
import urllib
from datetime import *  # noqa: F401,F403 -- kept as in the original module

try:  # Python 2
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.parse import urlencode

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


def _encode_form(params):
    """Return *params* as URL-encoded form bytes.

    Values are percent-encoded so that characters such as ``&`` and ``=``
    inside a value (for example an image URL with a query string) cannot
    split the value into extra form fields.
    """
    pairs = []
    for key in params:
        value = params[key]
        if not isinstance(value, bytes):
            value = (u'%s' % (value,)).encode('utf-8')
        pairs.append((key, value))
    return urlencode(pairs).encode('ascii')


def _account_params():
    """Return the fixed account/software fields shared by solve requests."""
    return {
        'username': '********',
        'password': '********',
        'typeid': '2000',
        'timeout': '90',
        'softid': '76693',
        'softkey': 'ec2b5b2a576840619bc885a47a025ef6',
    }


class APIClient(object):
    """HTTP helper used to talk to the ruokuai endpoints."""

    def http_request(self, url, paramDict):
        """POST *paramDict* as a URL-encoded form to *url*.

        Returns the response body as text.
        """
        post_content = _encode_form(paramDict)
        req = urllib2.Request(url, data=post_content)
        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(req)
        try:
            body = response.read()
        finally:
            response.close()
        # Python 3 yields bytes; Python 2 already yields str (left as is).
        if not isinstance(body, str):
            body = body.decode('utf-8', 'replace')
        return body

    def http_upload_image(self, url, paramKeys, paramDict, filebytes):
        """POST the form fields and an image as multipart/form-data.

        paramKeys lists, in order, the keys of *paramDict* to send as text
        fields; *filebytes* is the raw image content sent as field "image".
        Returns the response body as text.
        """
        timestr = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # The boundary is derived from the current time.
        boundary = '------------' + hashlib.md5(timestr.encode('ascii')).hexdigest().lower()
        delimiter = ('\r\n--%s\r\n' % boundary).encode('ascii')

        parts = []
        for key in paramKeys:
            field = "Content-Disposition: form-data; name=\"%s\"\r\n\r\n%s" % (key, paramDict[key])
            parts.append(delimiter)
            parts.append(field.encode('utf8'))
        parts.append(delimiter)

        file_header = 'Content-Disposition: form-data; name=\"image\"; filename=\"%s\"\r\nContent-Type: image/gif\r\n\r\n' % ('sample')
        parts.append(file_header.encode('utf8'))
        parts.append(filebytes)
        parts.append(('\r\n--%s--\r\n' % boundary).encode('ascii'))
        body = b''.join(parts)

        import requests
        headers = {'Content-Type': 'multipart/form-data; boundary=%s' % boundary,
                   'Connection': 'Keep-Alive',
                   'Expect': '100-continue',
                   }
        response = requests.post(url, params='', data=body, headers=headers)
        return response.text


def arguments_to_dict(args):
    """Convert ``key=value`` command-line arguments into a dict.

    *args* is an argv-style sequence whose first element (the program name)
    is skipped. Items without ``=`` are ignored; a value keeps any further
    ``=`` characters it contains. Returns an empty dict when *args* is None
    or holds no arguments besides the program name.
    """
    argDict = {}
    if args is None:
        return argDict

    if len(args) <= 1:
        print('exit:need arguments.')
        return argDict

    # Visit every argument after the program name, not just the first and
    # the last one.
    for arg in args[1:]:
        key, sep, value = arg.partition('=')
        if not sep:
            continue
        argDict[key] = value

    return argDict


def get_captcha(image_url):
    """Interactively run a ruokuai action and return its raw response.

    The user is prompted for an action name. ``url`` asks the service to
    solve the captcha image at *image_url*; the other actions (``info``,
    ``register``, ``recharge``, ``report``, ``upload``) prompt for their
    own inputs. ``help`` lists the actions and prompts again, as does an
    unknown action or an unreadable upload image.

    Returns the response text of the first action that produced one, or
    ``''`` when the user types ``exit``.
    """
    # NOTE(review): the original listing lost its indentation, so where
    # ``return result`` sat relative to the loop is an assumption.
    client = APIClient()
    while True:
        paramDict = {}
        result = ''
        act = _read_line('請輸入打碼方式url:')
        if act == 'exit':
            return ''
        if act == 'help':
            for action in ('info', 'register', 'recharge', 'url',
                           'report', 'upload', 'help', 'exit'):
                print(action)
            continue

        if act == 'info':
            paramDict['username'] = _read_line('username:')
            paramDict['password'] = _read_line('password:')
            result = client.http_request('http://api.ruokuai.com/info.xml', paramDict)
        elif act == 'register':
            paramDict['username'] = _read_line('username:')
            paramDict['password'] = _read_line('password:')
            paramDict['email'] = _read_line('email:')
            result = client.http_request('http://api.ruokuai.com/register.xml', paramDict)
        elif act == 'recharge':
            paramDict['username'] = _read_line('username:')
            paramDict['id'] = _read_line('id:')
            paramDict['password'] = _read_line('password:')
            result = client.http_request('http://api.ruokuai.com/recharge.xml', paramDict)
        elif act == 'url':
            paramDict.update(_account_params())
            paramDict['imageurl'] = image_url
            result = client.http_request('http://api.ruokuai.com/create.xml', paramDict)
        elif act == 'report':
            paramDict['username'] = _read_line('username:')
            paramDict['password'] = _read_line('password:')
            paramDict['id'] = _read_line('id:')
            # NOTE(review): this posts to create.xml like the solve actions;
            # confirm against the ruokuai API docs that this is the intended
            # endpoint for reporting.
            result = client.http_request('http://api.ruokuai.com/create.xml', paramDict)
        elif act == 'upload':
            paramDict.update(_account_params())
            paramKeys = ['username',
                         'password',
                         'typeid',
                         'timeout',
                         'softid',
                         'softkey',
                         ]

            from PIL import Image
            imagePath = _read_line('Image Path:')
            try:
                img = Image.open(imagePath)
            except IOError:
                # Image.open raises rather than returning None.
                print('get file error!')
                continue
            img.save("upload.gif", format="gif")
            with open("upload.gif", "rb") as gif_file:
                filebytes = gif_file.read()
            result = client.http_upload_image("http://api.ruokuai.com/create.xml", paramKeys, paramDict, filebytes)

        if result:
            return result
return [ FormRequest.from_response( response, meta={"cookiejar":response.meta["cookiejar"]}, headers=self.header, formdata=data, callback=self.get_content, ) ]
作者:今孝
出處:http://www.cnblogs.com/jinxiao-pu/p/6670672.html 本文版權歸作者和博客園共有,歡迎轉載,但未經作者同意必須保留此段聲明,且在文章頁面明顯位置給出原文連接。