scrapy模擬知乎登陸(無驗證碼機制)

時間 2019-11-24

標籤 scrapy 模擬登陸驗證碼機制欄目 Python 简体版

原文原文鏈接

---恢復內容開始---

spiders 文件夾下新建zhihu.py文件（從dos窗口中進入虛擬環境，再進入工程目錄以後輸入命令 scrapy genspider zhihu www.zhihu.com）

#zhihu.py

import datetime
import json
import re

import scrapy
from scrapy.loader import ItemLoader

from Item import ZhihuQuestionItem, ZhihuAnswerItem

try:
    import urlparse as parse
except ImportError:
    from urllib import parse

class ZhuhuSpider(scrapy.Spider):
    """Spider that crawls zhihu.com question pages and their answers.

    NOTE: the callbacks used by this spider (``parse``, ``parse_question``,
    ``parse_answer``, ``start_requests``, ``login``) take ``self`` and are
    referenced through ``self.<name>``, so they must live in this class body.
    """

    name = 'zhihu'

    # Scrapy reads ``allowed_domains``; the original spelling is kept as an
    # alias so any code that referenced ``allow_domains`` keeps working.
    allowed_domains = ["www.zhihu.com"]
    allow_domains = allowed_domains

    start_urls = ["http://www.zhihu.com/"]

    # Headers attached to every request issued by the callbacks.
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
    }

    # URL template for a page of answers to one question.
    # Placeholders: {0} = question id, {1} = page size (limit), {2} = offset.
    # The template must contain no whitespace, otherwise the query string is
    # corrupted.
    start_answer_url = (
        "https://www.zhihu.com/api/v4/questions/{0}/answers?"
        "sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky"
        "%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts"
        "%2Creviewing_comments_count%2Ccan_comment%2Ccontent"
        "%2Ceditable_content%2Cvoteup_count%2Creshipment_settings"
        "%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time"
        "%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp"
        "%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking"
        "%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token"
        "%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics"
        "&limit={1}&offset={2}"
    )

def parse(self, response):
    """Follow every link on the page, routing question pages to ``parse_question``.

    Every ``href`` found in ``response`` is resolved to an absolute URL and
    only ``https`` URLs are kept. A URL of the form ``.../question/<id>`` is
    requested (trimmed to that canonical prefix) with ``parse_question`` as
    the callback; any other URL is requested with ``parse`` again so the
    crawl keeps following links.

    Args:
        response: the downloaded page.

    Yields:
        ``scrapy.Request`` objects for each retained link.
    """
    hrefs = response.css("a::attr(href)").extract()
    absolute_urls = (response.urljoin(href) for href in hrefs)

    for url in absolute_urls:
        if not url.startswith("https"):
            continue

        # group(1) is the question URL without any trailing path,
        # group(2) is the numeric question id.
        match_obj = re.match(r"(.*zhihu\.com/question/(\d+))(/|$).*", url)
        if match_obj:
            # Question page: hand it to parse_question.
            yield scrapy.Request(
                match_obj.group(1),
                headers=self.headers,
                callback=self.parse_question,
            )
        else:
            # Anything else: keep crawling from that page.
            yield scrapy.Request(url, headers=self.headers, callback=self.parse)

def parse_question(self, response):
    """Extract the question item from a question page and request its answers.

    The question id is taken from ``response.url``. Fields are loaded with
    CSS/XPath selectors for either the new page layout (detected by the
    ``QuestionHeader-title`` marker in the page text) or the old one.

    Args:
        response: the downloaded question page.

    Yields:
        A ``scrapy.Request`` for the first page of answers (limit 20,
        offset 0) handled by ``parse_answer``, then the loaded question item.
        Nothing is yielded when the URL carries no question id.
    """
    match_obj = re.match(r"(.*zhihu\.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        # Without an id neither the item nor the answers URL can be built.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value("url", response.url)
    # NOTE(review): the key is spelt "zhuhu_id" here while parse_answer uses
    # "zhihu_id" -- confirm against the ZhihuQuestionItem field definition.
    item_loader.add_value("zhuhu_id", question_id)

    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comment_num", ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topic", ".QuestionHeader-topics .Popover div::text")
    else:
        # Old page layout.
        item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comment_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topic", ".zm-tag-editor-labels a::text")

    question_item = item_loader.load_item()

    yield scrapy.Request(
        self.start_answer_url.format(question_id, 20, 0),
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield question_item

def parse_answer(self, response):
    """Turn one page of the answers API into answer items and follow paging.

    ``response.text`` is parsed as JSON. One ``ZhihuAnswerItem`` is yielded
    per entry of ``data``; afterwards, unless ``paging.is_end`` is set, a
    single request for ``paging.next`` is yielded with this method as the
    callback.

    Args:
        response: the downloaded answers-API page.

    Yields:
        ``ZhihuAnswerItem`` objects, then at most one ``scrapy.Request``.
    """
    ans_json = json.loads(response.text)
    paging = ans_json["paging"]

    for answer in ans_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["zhihu_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        # The author id and the content may be absent from an entry.
        answer_item["author_id"] = answer["author"].get("id")
        answer_item["content"] = answer.get("content")
        answer_item["praise_num"] = answer["voteup_count"]
        answer_item["comment_num"] = answer["comment_count"]
        # NOTE(review): item key is spelt "creat_time" -- confirm against the
        # ZhihuAnswerItem field definition.
        answer_item["creat_time"] = answer["created_time"]
        # The API field requested in start_answer_url is "updated_time".
        answer_item["update_time"] = answer["updated_time"]
        answer_item["crawl_time"] = datetime.datetime.now()
        yield answer_item

    # Request the next page once per response, not once per answer.
    if not paging["is_end"]:
        yield scrapy.Request(
            paging["next"],
            headers=self.headers,
            callback=self.parse_answer,
        )

#重寫start_Request方法

def start_requests(self):
    """Start the crawl at the sign-in page and hand the response to ``self.login``.

    Returns:
        A one-element list holding the sign-in ``scrapy.Request``.
    """
    # An explicit callback is required here: without one the response would
    # be delivered to parse(self, response) by default.
    signin_request = scrapy.Request(
        url="https://www.zhihu.com/#signin",
        headers=self.headers,
        callback=self.login,
    )
    return [signin_request]

def login(self,response):

　　response_text=response.text

　　match_obj=re,match(' .*name="_xsrf" value="(.*?)" ',response_text,re.DOTALL)　　#注意使用單雙引號

　　xsrf=""

　　if match_obj:

　　　　xsrf=(match_obj.group(1))

　　if xsrf:

　　　　post_url="https://www.zhihu.com/login/phone_num"

　　　　post_data={

　　　　　　"_xsrf"　　:　　xsrf,

　　　　　　"phone_num"　　:　　"18282902586",

　　　　　　"password"　　:　　"admin123"