Note: when simulating a login, make sure COOKIES_ENABLED (the cookies middleware) is switched on in settings.py:
COOKIES_ENABLED = True
or leave the default line commented out, since cookies are enabled by default: # COOKIES_ENABLED = False

Whenever a page requires POST data, you can use this approach. In the example below, the POSTed data is an account name and password.
Use yield scrapy.FormRequest(url, formdata, callback) to send the POST request. To have the spider issue the POST right at startup, override the start_requests(self) method; Scrapy will then no longer fetch the URLs in start_urls:

import scrapy

class mySpider(scrapy.Spider):
    # start_urls = ["http://www.example.com/"]

    def start_requests(self):
        # the form action URL extracted from the page source
        url = 'http://www.renren.com/PLogin.do'
        # FormRequest is how Scrapy sends a POST request
        yield scrapy.FormRequest(
            url = url,
            formdata = {"email" : "mr_mao_hacker@163.com", "password" : "axxxxxxxe"},
            callback = self.parse_page
        )

    def parse_page(self, response):
        # do something
        # business logic goes here
        pass
The canonical simulated-login approach:

- First send a GET request for the login page and pull out the parameters the login requires (for example, the _xsrf token on Zhihu's login page)
- Then POST them to the server together with the account name and password to complete the login
- Use the FormRequest.from_response() method to simulate the user login
import scrapy

class LoginSpider(scrapy.Spider):
    name = 'example.com'
    start_urls = ['http://www.example.com/users/login.php']

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.after_login
        )

    def after_login(self, response):
        # check that the login succeeded before going on
        # (response.body is bytes, so compare against a bytes literal)
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return
        # continue scraping with authenticated session...
Simulating a browser login

The start_requests() method can return the request(s) for the spider's starting site; what it returns stands in for start_urls and replaces the requests that would be generated from it.

Request() issues a GET request; you can set the url, the cookies, and the callback function.

FormRequest.from_response() submits a form via POST. The first, required argument is the response object whose cookies the request carries forward; the remaining arguments cover cookies, url, form fields, and so on.
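A compact sketch of how these three pieces chain together, before the full example below. Every URL, form field, and credential in it is a placeholder of mine, not part of the original example:

import scrapy

class SketchSpider(scrapy.Spider):
    name = 'login_sketch'

    def start_requests(self):
        # replaces start_urls as the entry point: a GET with cookies and a callback
        yield scrapy.Request('http://example.com/login',
                             cookies={'tracking': 'placeholder'},
                             callback=self.parse_login_page)

    def parse_login_page(self, response):
        # from_response copies the hidden form fields out of the login page,
        # so only the visible fields need to be supplied here
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'pass'},
            callback=self.after_login)

    def after_login(self, response):
        # continue crawling with the authenticated session
        pass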
import scrapy

# The canonical simulated-login approach:
# first send a GET request for the login page and pull out the parameters the
# login requires (e.g. Zhihu's _xsrf), then POST them to the server together
# with the account name and password.

# the second standard form
def parse(self, response):
    print(response.body.decode('utf-8'), "@@" * 40)
    yield scrapy.FormRequest.from_response(
        response,
        formdata={
            "email": "18588403840",
            "origURL": "http://www.renren.com/422167102/profile",
            "domain": "renren.com",
            "key_id": "1",
            "captcha_type": "web_login",
            "password": "97bfc03b0eec4df7c76eaec10cd08ea57b01eefd0c0ffd4c0e5061ebd66460d9",
            "rkey": "26615a8e93fee56fc1fb3d679afa3cc4",
            "f": ""
        },
        dont_filter=True,
        headers=self.headers,
        callback=self.get_page)

def get_page(self, response):
    print("===================", response.url)
    print(response.body.decode('utf-8'))
    url = "http://www.renren.com/353111356/profile"
    yield scrapy.Request(url, callback=self.get_info)

def get_info(self, response):
    print('*******' * 30)
    print(response.body.decode('utf-8'))
yield Request() hands a new request back to the spider to execute.

Cookie handling when sending requests:
- meta={'cookiejar': 1} turns cookie recording on; set it on the first Request()
- meta={'cookiejar': response.meta['cookiejar']} reuses the cookies from the previous response; set it on the FormRequest.from_response() that POSTs the login credentials
- meta={'cookiejar': True} uses the authorized cookies to visit pages that require a login to view
import scrapy
from scrapy.http import Request, HtmlResponse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MyrenSpider(CrawlSpider):
    name = 'myren'
    allowed_domains = ['renren.com']
    start_urls = ["http://www.renren.com/353111356/profile"]
    rules = [Rule(LinkExtractor(allow=(r'(\d+)/profile',)), callback='get_info', follow=True)]
    headers = {
        "Accept": "*/*",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
    }

    def start_requests(self):
        yield scrapy.Request(url="http://www.renren.com/",
                             meta={'cookiejar': 1},
                             callback=self.post_login)

    # the second standard form
    def post_login(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            url="http://www.renren.com/PLogin.do",
            meta={'cookiejar': response.meta['cookiejar']},  # requires meta={'cookiejar': 1} on the earlier request
            headers=self.headers,
            formdata={
                "email": "18588403840",
                "password": "Changeme_123"
            },
            dont_filter=True,
            callback=self.after_login)

    def after_login(self, response):
        for url in self.start_urls:
            # yield self.make_requests_from_url(url)
            yield scrapy.Request(url, meta={'cookiejar': response.meta['cookiejar']})

    def get_info(self, response):
        print('*******' * 30)
        print(response.body.decode('utf-8'))

    def _requests_to_follow(self, response):
        """Overridden so followed links carry the updated cookiejar."""
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                # this line is the actual override: propagate the cookiejar
                r.meta.update(rule=n, link_text=link.text, cookiejar=response.meta['cookiejar'])
                yield rule.process_request(r)
If nothing else works, you can simulate the login this way; it is a bit more work, but the success rate is essentially 100%.

ChangeCookies: parse the cookie string copied from the browser into dict form
class transCookie:
    def __init__(self, cookie):
        self.cookie = cookie

    def stringToDict(self):
        '''
        Convert a cookie string copied from the browser into a dict Scrapy can use
        :return:
        '''
        itemDict = {}
        items = self.cookie.split(';')
        for item in items:
            # split on the first '=' only, since cookie values may themselves contain '='
            key, value = item.split('=', 1)
            itemDict[key.strip()] = value
        return itemDict


if __name__ == "__main__":
    cookie = "your cookie string here"
    trans = transCookie(cookie)
    print(trans.stringToDict())
Put the parsed cookies into the request:
# -*- coding: utf-8 -*-
import scrapy


class RenrenSpider(scrapy.Spider):
    name = "renren"
    allowed_domains = ["renren.com"]
    # list of starting URLs
    start_urls = [
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    ]
    cookies = {
        "anonymid": "ixrna3fysufnwv",
        "_r01_": "1",
        "ap": "327550029",
        "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
        "depovince": "GW",
        "springskin": "set",
        "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
        "t": "691808127750a83d33704a565d8340ae9",
        "societyguester": "691808127750a83d33704a565d8340ae9",
        "id": "327550029",
        "xnsid": "f42b25cf",
        "loginfrom": "syshome"
    }

    # override Spider's start_requests method to attach the cookie values to the initial requests
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse)

    # handle the response body
    def parse(self, response):
        print("===========" + response.url)
        with open("deng.html", "wb") as filename:
            filename.write(response.body)
1. spider.browser.page_source — the response's source, from the Selenium browser
2. session.get(request.url).text — the response's source, from a requests session
3. requests manages cookies through a Session
4. urllib manages cookies through a CookieJar
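Points 3 and 4 deserve a minimal sketch; the example.com URLs below are placeholders, not part of the original code:

import urllib.request
from http.cookiejar import CookieJar

import requests

# 3. requests: a Session keeps cookies across requests automatically
session = requests.session()
session.get("http://example.com/login")                 # cookies set by this response...
page = session.get("http://example.com/profile").text   # ...are sent with this request

# 4. urllib: a CookieJar plugged into an opener plays the same role
jar = CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
html = opener.open("http://example.com/profile").read().decode("utf-8")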
Simulating a Taobao login
import scrapy


class TaobaoSpider(scrapy.Spider):
    name = 'mytaobao'
    allowed_domains = ['taobao.com']
    start_urls = ['https://login.m.taobao.com/login.htm',
                  "http://h5.m.taobao.com/mlapp/olist.html?spm=a2141.7756461.2.6"]

    def __init__(self):
        # initialization
        self.browser = None
        self.cookies = None
        super(TaobaoSpider, self).__init__()  # delegate the rest to the parent class

    def parse(self, response):
        # print the URL and the page source
        print(response.url)
        print(response.body.decode("utf-8", "ignore"))
# middleware: a custom LoginMiddleware that performs the login
from scrapy import signals
from selenium import webdriver
from scrapy.http import HtmlResponse  # page response
import requests
import time


class LoginMiddleware(object):
    '''
    Find the username and password inputs and send_keys to them,
    click the login button and capture the cookies with spider.browser.get_cookies(),
    then hand the page back as an HtmlResponse.
    '''
    def process_request(self, request, spider):
        if spider.name == "mytaobao":  # only handle the spider with this name
            if request.url.find("login") != -1:  # is this the login page?
                mobilesetting = {"deviceName": "iPhone 6 Plus"}
                options = webdriver.ChromeOptions()  # browser options
                options.add_experimental_option("mobileEmulation", mobilesetting)  # emulate a phone
                spider.browser = webdriver.Chrome(chrome_options=options)  # create a browser instance
                spider.browser.set_window_size(400, 800)  # phone-sized window
                spider.browser.get(request.url)  # open the login URL
                time.sleep(3)  # must sleep: typing the username and password takes time
                print("login visit", request.url)
                username = spider.browser.find_element_by_id("username")
                password = spider.browser.find_element_by_id("password")
                time.sleep(1)
                username.send_keys("2403239393@qq.com")  # account
                time.sleep(2)
                password.send_keys("bama100")  # password
                time.sleep(2)
                spider.browser.find_element_by_id("btn-submit").click()
                time.sleep(4)
                spider.cookies = spider.browser.get_cookies()  # capture all the cookies
                # spider.browser.close()
                return HtmlResponse(url=spider.browser.current_url,  # current URL
                                    body=spider.browser.page_source,  # page source
                                    encoding="utf-8")  # hand the page back
            else:  # already logged in
                '''
                1. use requests.session to keep the cookies
                2. set the cookies: session.cookies.set(name, value)
                3. clear the headers: session.headers.clear()
                4. issue a GET request: session.get(url)
                '''
                print("request visit")
                session = requests.session()  # session
                for cookie in spider.cookies:
                    session.cookies.set(cookie['name'], cookie["value"])
                session.headers.clear()  # clear the headers
                newpage = session.get(request.url)
                print("---------------------")
                print(request.url)
                print("---------------------")
                print(newpage.text)  # the page
                print("---------------------")
                time.sleep(3)
                return HtmlResponse(url=request.url,  # current URL
                                    body=newpage.text,  # page source
                                    encoding="utf-8")  # hand the page back
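One thing the snippet above does not show: a downloader middleware only runs if it is registered in the project's settings. A minimal sketch, assuming the project package is named myproject and the class lives in middlewares.py (both names are assumptions; adjust the dotted path and priority to your own project):

# settings.py (assumed layout: myproject/middlewares.py holds LoginMiddleware)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.LoginMiddleware': 543,
}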