A cookie is a local storage mechanism: cookie data is stored locally by the browser.
A session works by the server encrypting the user's information (username, password, etc.) into a string and returning it to the browser; every subsequent browser request carries this session id back to the server.
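You can watch this round trip with requests: the server answers the login POST with a Set-Cookie header carrying the session id, and the session object replays it automatically afterwards. A minimal sketch (example.com/login and its form fields are placeholders, not a real endpoint):

import requests

s = requests.Session()
# hypothetical login endpoint and form fields, for illustration only
resp = s.post("https://example.com/login", data={"user": "u", "password": "p"})

# the server hands out the session id in a Set-Cookie response header...
print(resp.headers.get("Set-Cookie"))
# ...and the session's cookie jar sends it back on every later request
print(s.cookies.get_dict())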
Status codes are usually defined by the server itself, but they can also be defined by the framework or by your own code.
Under F12 → Network you can see each request's status code.
301: permanent redirect, e.g. the site changed domains but you still want the old domain to be reachable.
302: temporary redirect, e.g. clicking "my profile" while not logged in redirects you to the login page.
404: usually means the url is invalid. You could also return an empty page with 200 in that case, but that is bad practice, because a 404 can be filtered out.
500: usually some function on the server raised an exception that was not caught; most web frameworks handle this for you.
503: generally means pause the crawler. In general only 200 pages are crawled and 404s are skipped; a status-handling sketch follows below.
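A sketch of a fetch helper following these rules (the function name and retry policy are illustrative):

import time
import requests

def fetch(session, url, headers, max_retries=3):
    # only parse 200s, skip 404s, back off and retry on 503
    for _ in range(max_retries):
        response = session.get(url, headers=headers)
        if response.status_code == 200:
            return response
        if response.status_code == 404:
            return None              # invalid url, nothing worth parsing
        if response.status_code == 503:
            time.sleep(60)           # server is overloaded: pause the crawler
            continue
        return None                  # other codes: give up on this url
    return None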
To crawl Zhihu, you must log in first.
By deliberately submitting a wrong username/password, you can find the POST url and POST parameters in the Network panel.
The default header that requests.get() sends identifies the client as Python 2 or Python 3, so a direct request gets a 500.
So you have to pass your own headers when using requests.
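You can check what requests sends by default:

import requests

# the stock User-Agent announces the client as python-requests,
# which Zhihu rejects with a 500
print(requests.utils.default_headers()["User-Agent"])
# e.g. python-requests/2.18.4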
agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" header = { "HOST":"www.zhihu.com", "Referer": "https://www.zhizhu.com", 'User-Agent': agent } response = session.get("https://www.zhihu.com", headers=header)
For a simulated login, don't call requests directly; use requests' session instead. A session represents one connection, so you don't have to re-establish a connection on every requests.get call, which is more efficient.
session = requests.session()
response = session.get("https://www.zhihu.com", headers=header)
All later requests calls can then be replaced with session.
session.cookies has no save() method by default; assign session.cookies = cookielib.LWPCookieJar() and the resulting cookie jar does have save().
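A minimal sketch of the save/load cycle (Python 3 names; the full script below keeps the Python 2 cookielib fallback):

import requests
import http.cookiejar as cookielib

session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")

try:
    # ignore_discard=True also restores session cookies that would
    # normally be thrown away when the browser closes
    session.cookies.load(ignore_discard=True)
except FileNotFoundError:
    pass  # first run: no cookies.txt yet

# ... log in with session.post(...) ...
session.cookies.save()  # persists the cookies to cookies.txt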
Zhihu simulated-login code (without Scrapy):
# -*- coding: utf-8 -*-
__author__ = 'bobby'

import requests
try:
    import cookielib                        # Python 2
except ImportError:
    import http.cookiejar as cookielib      # Python 3

import re

session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename="cookies.txt")
try:
    session.cookies.load(ignore_discard=True)
except:
    print("cookies could not be loaded")

agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
header = {
    "HOST": "www.zhihu.com",
    "Referer": "https://www.zhihu.com",
    'User-Agent': agent
}

def is_login():
    # judge login state from the status code of a page that requires login
    inbox_url = "https://www.zhihu.com/question/56250357/answer/148534773"
    response = session.get(inbox_url, headers=header, allow_redirects=False)
    if response.status_code != 200:
        return False
    else:
        return True

def get_xsrf():
    # fetch the xsrf code (re.DOTALL so '.' also matches across newlines)
    response = session.get("https://www.zhihu.com", headers=header)
    match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
    if match_obj:
        return (match_obj.group(1))
    else:
        return ""

def get_index():
    response = session.get("https://www.zhihu.com", headers=header)
    with open("index_page.html", "wb") as f:
        f.write(response.text.encode("utf-8"))
    print("ok")

def get_captcha():
    import time
    t = str(int(time.time() * 1000))
    captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
    t = session.get(captcha_url, headers=header)
    with open("captcha.jpg", "wb") as f:
        f.write(t.content)

    from PIL import Image
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except:
        pass
    captcha = input("Enter the captcha\n>")
    return captcha

def zhihu_login(account, password):
    # Zhihu login
    if re.match("^1\d{10}", account):
        print("logging in by phone number")
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            "_xsrf": get_xsrf(),
            "phone_num": account,
            "password": password,
            "captcha": get_captcha()
        }
    else:
        if "@" in account:
            # treat the account as an email address
            print("logging in by email")
            post_url = "https://www.zhihu.com/login/email"
            post_data = {
                "_xsrf": get_xsrf(),
                "email": account,
                "password": password
            }
    response_text = session.post(post_url, data=post_data, headers=header)
    session.cookies.save()

zhihu_login("18782902568", "admin123")
# get_index()
is_login()
# get_captcha()
The entry point of a Scrapy spider is start_requests, so to log in before crawling we need to override start_requests.
def start_requests(self):
    return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)]
callback is given without parentheses because we are passing the function object itself, to be called later; if you added parentheses you would be calling it right now and passing its return value instead, so just pass the function name.
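The same distinction in plain Python:

def greet():
    return "hi"

callback = greet    # passes the function object; nothing runs yet
result = greet()    # calls it right now; result is the string "hi"

print(callback())   # the stored function is invoked later, as Scrapy does
print(result)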
Scrapy uses robots.txt to decide which pages should be filtered out.
ROBOTSTXT_OBEY = False turns off obeying the robots protocol.
By default '.' in a regular expression does not match newlines, so a pattern effectively matches within a single line.
Passing re.DOTALL lets it match across the whole text.
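For example, pulling the _xsrf hidden field out of a multi-line page only works once re.DOTALL is passed:

import re

html = '<html>\n<input type="hidden" name="_xsrf" value="abc123"/>\n</html>'

print(re.match('.*name="_xsrf" value="(.*?)"', html))  # None: '.' stops at \n
match_obj = re.match('.*name="_xsrf" value="(.*?)"', html, re.DOTALL)
print(match_obj.group(1))  # abc123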
To test selectors with a browser User-Agent, override it when starting scrapy shell:
scrapy shell -s USER_AGENT="Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/60.0" https://blog.csdn.net/weixin_42471384/article/details/81556531
# -*- coding: utf-8 -*-
import re
import json
import datetime
try:
    import urlparse as parse            # Python 2
except ImportError:
    from urllib import parse            # Python 3

import scrapy
from scrapy.loader import ItemLoader
from items import ZhihuQuestionItem, ZhihuAnswerItem


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    # request url for the first page of answers of a question
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    custom_settings = {
        "COOKIES_ENABLED": True
    }

    def parse(self, response):
        """
        Extract all urls from the html page and follow them for further crawling.
        If an extracted url has the form /question/xxx, download it and hand it
        straight to the question parser.
        """
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls)
        for url in all_urls:
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", url)
            if match_obj:
                # a question page: download it and pass it to the extraction function
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
            else:
                # not a question page: keep following it
                yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse_question(self, response):
        # handle a question page and extract the concrete question item
        if "QuestionHeader-title" in response.text:
            # new page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()
        else:
            # item extraction for the old page layout
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer)
        yield question_item

    def parse_answer(self, response):
        # handle the answers of a question
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]

        # extract the concrete answer fields
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["parise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()

            yield answer_item

        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

    def start_requests(self):
        return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)]

    def login(self, response):
        response_text = response.text
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
        xsrf = ''
        if match_obj:
            xsrf = (match_obj.group(1))

        if xsrf:
            post_url = "https://www.zhihu.com/login/phone_num"
            post_data = {
                "_xsrf": xsrf,
                "phone_num": "",
                "password": "",
                "captcha": ""
            }

            import time
            t = str(int(time.time() * 1000))
            captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
            yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data": post_data}, callback=self.login_after_captcha)

    def login_after_captcha(self, response):
        with open("captcha.jpg", "wb") as f:
            f.write(response.body)

        from PIL import Image
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except:
            pass
        captcha = input("Enter the captcha\n>")

        post_data = response.meta.get("post_data", {})
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data["captcha"] = captcha
        return [scrapy.FormRequest(
            url=post_url,
            formdata=post_data,
            headers=self.headers,
            callback=self.check_login
        )]

    def check_login(self, response):
        # check the data returned by the server to judge whether login succeeded
        text_json = json.loads(response.text)
        # the server answers with msg == "登陸成功" ("login successful")
        if "msg" in text_json and text_json["msg"] == "登陸成功":
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)