Features: how to send a request carrying cookies to access a page behind a login, and how to send a POST request to log in
Sites crawled: bilibili, github
Full code: https://files.cnblogs.com/files/bookwed/login.zip
Main code:
bili.py
# -*- coding: utf-8 -*-
import scrapy
import re


class BiliSpider(scrapy.Spider):
    """Access a logged-in bilibili page by carrying cookies directly"""
    name = 'bili'
    allowed_domains = ['bilibili.com']
    # My profile page after login
    start_urls = ['https://account.bilibili.com/home/userInfo']

    def start_requests(self):
        cookies = "_uuid=738F48A9-E13A-9445-3577-3068FADC9F6A05981infoc; buvid3=5DE9F436-F051-44E1-9B97-AB53E60C3ED448999infoc"
        # Turn the raw cookie string into the dict that scrapy expects
        cookies = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookies.split("; ")}
        # Putting the cookie string into headers does not work;
        # the cookies argument has to be passed separately
        # headers = {"Cookie": cookies}
        print(cookies)
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies,
            # headers=headers
        )

    def parse(self, response):
        # Verify that the login state carried over
        print("*" * 30)
        print(re.findall("bookwed", response.body.decode()))
        print("*" * 30)
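The dict comprehension above does the cookie-string-to-dict conversion by hand. A minimal sketch of the same conversion using the standard library's http.cookies module instead of manual splitting (an alternative, not part of the original code; the raw string is whatever you copy from the browser's developer tools):

from http.cookies import SimpleCookie

# raw_cookies: the Cookie header value copied from the browser
raw_cookies = "_uuid=738F48A9-...; buvid3=5DE9F436-..."
jar = SimpleCookie()
jar.load(raw_cookies)
cookies = {name: morsel.value for name, morsel in jar.items()}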
github.py
# -*- coding: utf-8 -*-
import scrapy
import re


class GithubSpider(scrapy.Spider):
    """Use scrapy to send a POST request and simulate logging in to github.

    Note: when the form has an action URL, you can request the action
    directly; see github2.py.
    """
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # Collect the hidden form fields the login endpoint expects
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        commit = response.xpath("//input[@name='commit']/@value").extract_first()
        utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
        webauthn_support = response.xpath("//input[@name='webauthn-support']/@value").extract_first()
        post_data = dict(
            login="aa@163.com",
            password="aaaaaa",
            commit=commit,
            utf8=utf8,
            authenticity_token=authenticity_token,
            webauthn_support=webauthn_support,
        )
        yield scrapy.FormRequest(
            "https://github.com/session",  # POST to the login endpoint
            formdata=post_data,
            callback=self.after_login
        )
        # Another way to send a POST request: use a plain Request with
        # method='POST' (see the sketch after this file)
        # yield scrapy.Request(
        #     "https://github.com/session",
        #     method='POST',
        #     body=...
        # )

    def after_login(self, response):
        # When unsure, save the response to a local file first and inspect it
        # with open('aa.html', 'w', encoding='utf-8') as f:
        #     f.write(response.body.decode())
        print("*" * 30)
        print(re.findall('wed', response.body.decode()))
        print("*" * 30)
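To complete the commented-out alternative: a plain scrapy.Request with method='POST' needs the body urlencoded and the matching Content-Type header set by hand, which FormRequest otherwise does for you. A minimal sketch, assuming the same post_data dict built in parse above (the helper name is mine, not from the original code):

from urllib.parse import urlencode

import scrapy


def make_login_request(post_data, callback):
    # Equivalent of the FormRequest above, spelled out with a plain Request
    return scrapy.Request(
        "https://github.com/session",
        method="POST",
        body=urlencode(post_data),
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        callback=callback,
    )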
github2.py
# -*- coding: utf-8 -*-
import scrapy
import re


class Github2Spider(scrapy.Spider):
    """When the form has an action URL, you can request the action directly;
    only the username and password need to be supplied."""
    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    # Note: when a page contains several forms, pass formname, formid,
    # formnumber or formxpath to pick the right one (see the sketch below)
    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,  # scrapy locates the form in the response automatically
            formdata={"login": "aa@163.com", "password": "aaaaaa"},  # keys are the input names on the page
            callback=self.after_login
        )

    def after_login(self, response):
        print("*" * 30)
        print(re.findall('wed', response.body.decode()))
        print("*" * 30)
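When a page holds several forms, from_response grabs the first one by default; the arguments mentioned in the note above pin down a specific form. A minimal sketch using formxpath (the spider name and the XPath are assumed examples, not taken from github's actual markup):

import scrapy


class PickFormSpider(scrapy.Spider):
    name = 'pickform'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formxpath="//form[contains(@action, 'session')]",  # assumed selector for the login form
            formdata={"login": "aa@163.com", "password": "aaaaaa"},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Log the status code as a quick sanity check
        self.logger.info("login response status: %s", response.status)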