import requests
from lxml import etree


class Login(object):
    """Log in to GitHub with a persistent requests session.

    Flow: GET /login to pull the CSRF authenticity token, POST it together
    with the credentials to /session, then fetch the profile page with the
    now-authenticated session and dump both pages to disk.
    """

    def __init__(self):
        self.headers = {
            'Origin': 'https://github.com',
            # Fixed: was 'https: // github.com /' — a malformed URL with
            # stray spaces that no server would match as a Referer.
            'Referer': 'https://github.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Host': 'github.com'
        }
        # Page that serves the login form (and the CSRF token).
        self.login_url = 'https://github.com/login'
        # Endpoint that receives the credential POST.
        self.login_post_url = 'https://github.com/session'
        # One session so cookies persist from the token GET to the login POST.
        self.session = requests.session()

    def get_token(self):
        """Fetch the login page and return its CSRF authenticity token.

        Note: the two requests deliberately use different headers — the
        token GET sends a minimal header set, the login POST sends the
        full set from __init__ (mismatched headers yield HTTP 422).
        """
        headers = {
            'Host': 'github.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
        }
        res_text = self.session.get(url=self.login_url, headers=headers).text
        tree = etree.HTML(res_text)
        # Target the named hidden field instead of the positional
        # '//div//input[2]' — robust against markup reordering.
        token = tree.xpath('//input[@name="authenticity_token"]/@value')[0]
        return token

    def login(self, username, password):
        """POST credentials plus CSRF token; on success save the pages.

        :param username: GitHub login name
        :param password: GitHub password
        """
        token = self.get_token()
        print(token)
        post_data = {
            "login": username,
            "password": password,
            "commit": "Sign in",
            "utf8": "✓",
            "authenticity_token": token
        }
        res = self.session.post(url=self.login_post_url, data=post_data)
        print(res.status_code)
        if res.status_code == 200:
            self.get_email_page()
            # res.content is bytes, hence the binary mode.
            with open("github.html", "wb") as f:
                f.write(res.content)

    def get_email_page(self):
        """Fetch the profile page with the logged-in session and save it."""
        print('獲取我的頁')
        email_data = self.session.get('https://github.com/tjp40922').text
        with open('xxxxxx.html', 'w', encoding='utf8') as f:
            f.write(email_data)


if __name__ == '__main__':
    login = Login()
    username = input('請輸入用戶名')
    password = input('請輸入密碼')
    login.login(username, password)
注意點：
1. 要獲取 token，並且 token 要與本次會話一一對應。
2. 兩次請求的請求頭（headers）是不同的，不然會報錯，返回 422 狀態碼。
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest


class LoginrrSpider(scrapy.Spider):
    """Renren login spider.

    Requests the login endpoint with an enabled cookie jar, submits the
    credential form, then prints the source of the post-login page.
    """

    name = 'loginrr'
    allowed_domains = ['renren.com']
    start_urls = ['http://renren.com/']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    def start_requests(self):
        # Executed first. Visit the login page once (this is also where a
        # captcha check would go: add a captcha field if one appears).
        # meta={"cookiejar": 1} turns the cookie jar on so the session
        # cookie survives into the form submission; parse() is the callback.
        first_request = Request(
            "http://www.renren.com/PLogin.do",
            meta={"cookiejar": 1},
            callback=self.parse,
        )
        return [first_request]

    def parse(self, response):
        # Credential form to submit — no captcha this time.
        # Put your own account and password here.
        data = {
            'email': 'xxxxxx',
            'password': 'xxxxxxxx'
        }
        print("正在登錄...")
        # Log in via FormRequest.from_response, carrying the cookie jar
        # forward and landing in next() once the POST completes.
        login_request = FormRequest.from_response(
            response,
            meta={"cookiejar": response.meta["cookiejar"]},
            headers=self.headers,
            formdata=data,
            callback=self.next,
        )
        return [login_request]

    def next(self, response):
        # Dump the raw source of the page reached after logging in.
        print(response.body)
import urllib.parse

import scrapy
from faker import Factory

# Faker factory, used to forge a realistic User-Agent string.
f = Factory.create()


class MailSpider(scrapy.Spider):
    """Douban mail spider.

    Logs in (a captcha, if shown, is solved by hand via stdin), then crawls
    https://www.douban.com/doumail/ and yields one item per mail.

    NOTE(review): this block was Python 2 (`print` statements, `raw_input`,
    module `urlparse` — which was never even imported); converted to
    Python 3 to match the rest of the file. `DoubanMailItem` is not defined
    or imported anywhere visible — it must come from the project's items
    module; confirm that import exists before running.
    """

    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }
    formdata = {
        'form_email': '您的帳號',
        'form_password': '您的密碼',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登陸',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        # Open the login page with the cookie jar enabled so the session
        # cookie carries through the whole login flow.
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        """Submit the login form; if a captcha is shown, ask a human."""
        # response.body is bytes under Python 3 — test the decoded text.
        if 'captcha_image' in response.text:
            print('Copy the link:')
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print(link)
            captcha_solution = input('captcha-solution:')
            # The captcha id is the 'id' query parameter of the image URL.
            captcha_id = urllib.parse.parse_qs(
                urllib.parse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        """Once logged in, fetch the mail list page."""
        print(response.status)
        # Subsequent requests hit www.douban.com, not accounts.douban.com.
        self.headers['Host'] = "www.douban.com"
        return scrapy.Request(url='https://www.douban.com/doumail/',
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_mail)

    def parse_mail(self, response):
        """Yield one DoubanMailItem per entry in the mail list."""
        print(response.status)
        for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
            mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
            mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
            mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
            print(mail)
            yield mail