The overall program flow: the login module logs in and obtains a cookie; the Scrapy spider starts from a set of seed users, parses each profile page, and adds that user's followers to the task queue; the pipeline writes the parsed items into MySQL.
Login module (obtains the cookie) — this is the `zhihu/getCookie.py` that the middleware later imports from:
```python
# encoding=utf-8
import requests
import re
import sys

# Request headers
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'www.zhihu.com',
    'Origin': 'https://www.zhihu.com',
    'Referer': 'https://www.zhihu.com/',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'x-hd-token': 'hello',
}

# Fill in your account and password below
post_data = {
    '_xsrf': '***',
    'password': '****',
    'captcha': '***',
    'phone_num': '*****',
}

req = requests.Session()


def login():
    page = req.get(url="https://www.zhihu.com/#signin", headers=headers)
    parser = re.compile(u'<input type="hidden" name="_xsrf" value="(.*?)"/>', re.S)
    xsrf = re.findall(parser, page.text)[0]
    headers['X-Xsrftoken'] = xsrf
    post_data['_xsrf'] = xsrf
    # Download the captcha image
    with open("../code.jpg", 'wb') as w:
        p = req.get(url="https://www.zhihu.com/captcha.gif?r=1495546872530&type=login", headers=headers)
        w.write(p.content)
    code = input("Please enter the captcha: ")
    if not code:
        sys.exit(1)
    post_data['captcha'] = code
    res = req.post(url='https://www.zhihu.com/login/phone_num', data=post_data, headers=headers)
    print(res.text)
    return req


cookie = login().cookies.get_dict()
```
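One optional refinement, not part of the original post: cache the cookie dict on disk so the captcha prompt only has to be answered once. A minimal sketch; the file name and the `load_or_login` helper are my own additions:

```python
# Optional: persist the login cookies so the captcha flow does not have to be
# repeated on every run. File name and helper are illustrative choices, not
# part of the original project.
import json
import os

COOKIE_FILE = "../zhihu_cookie.json"


def load_or_login():
    if os.path.exists(COOKIE_FILE):
        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    c = login().cookies.get_dict()
    with open(COOKIE_FILE, "w", encoding="utf-8") as f:
        json.dump(c, f)
    return c

# cookie = load_or_login()  # drop-in replacement for the module's last line
```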
The spider is below. It uses `re` (regular expressions) and XPath to parse the pages; if you are not familiar with them, it is worth spending some time learning them first. A standalone example of the regex is shown after the spider code.
```python
# -*- coding: utf-8 -*-
import scrapy
from zhihu.items import *
import re


class ZhSpider(scrapy.Spider):
    name = 'zh'
    allowed_domains = ['zhihu.com']
    start_urls = ['http://zhihu.com/']
    url = 'http://www.zhihu.com/'
    # Seed user id tokens that bootstrap the crawl (this reassignment of
    # start_urls overrides the placeholder above; start_requests() is what
    # actually drives the spider).
    start_urls = ['ruan-fu-zhong-tong-zhi', 'mu-huan-98',
                  'zeus871219', 'a-li-ai-di-10', 'dyxxg', 'hao-er-8',
                  'liu-miao-miao-47-17', 'peng-chen-xi-39', 'song-ling-shi-liao-63-56']
    task_set = set(start_urls)   # users still to be crawled
    tasked_set = set()           # users already crawled

    def start_requests(self):
        while len(self.task_set) > 0:
            print("**********user pool (start)**********")
            print(str(self.task_set))
            print("********************")
            id = self.task_set.pop()
            if id in self.tasked_set:
                print("Already crawled: %s" % (id))
                continue
            self.tasked_set.add(id)
            userinfo_url = 'https://www.zhihu.com/people/{}/answers'.format(id)
            user_item = UserItem()
            user_item['Id'] = id
            user_item['Url'] = userinfo_url
            # Profile page -> parse the user's own information
            yield scrapy.Request(
                userinfo_url,
                meta={"item": user_item}, callback=self.User_parse, dont_filter=True
            )
            # Followers page -> discover new users to crawl
            yield scrapy.Request(
                'https://www.zhihu.com/people/{}/followers'.format(id),
                callback=self.Add_user, dont_filter=True
            )

    def Add_user(self, response):
        sel = scrapy.selector.Selector(response)
        # Follower links look like:
        # <a class="UserLink-link" target="_blank" href="/people/12321-89">12321</a>
        co = sel.xpath('//*[@id="root"]/div/main/div/div/div[2]').extract_first()
        if not co:
            return  # page layout changed or not logged in; nothing to extract
        patten = re.compile(u'<a class="UserLink-link" target="_blank" href="/people/(.*?)">.*?</a>', re.S)
        l = re.findall(patten, co)
        for i in l:
            if str(i) not in self.tasked_set and str(i) not in self.task_set:
                self.task_set.add(i)
        print("**********user pool**********")
        print(str(self.task_set))
        print("********************")

    def User_parse(self, response):
        item = response.meta["item"]
        sel = scrapy.selector.Selector(response)
        nick_name = sel.xpath('//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[1]/h1/span[1]/text()').extract_first()
        print(nick_name)
        summary = sel.xpath('//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[1]/h1/span[2]/text()').extract_first()
        print(summary)
        item['Summary'] = summary
        item['Nick_name'] = nick_name
        # The block below the header holds location / industry / employment / education
        co = sel.xpath('//*[@id="ProfileHeader"]/div/div[2]/div/div[2]/div[2]').extract_first()
        patten = re.compile(u'.*?</div>(.*?)<div.*?>', re.S)
        l = re.findall(patten, co) if co else []
        print("**********************")
        print(str(l))
        item['Content'] = str(l)
        print('**********************')
        yield item
```
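To make the parsing step in `Add_user` easier to follow, here is the follower-link regex from the spider run against a made-up HTML snippet (the snippet is invented purely for illustration):

```python
# Standalone illustration of the regex used in Add_user: it pulls the user id
# token out of each follower link. The HTML below is a fabricated example.
import re

html = (
    '<a class="UserLink-link" target="_blank" href="/people/mu-huan-98">mu-huan</a>'
    '<a class="UserLink-link" target="_blank" href="/people/zeus871219">zeus</a>'
)
pattern = re.compile(u'<a class="UserLink-link" target="_blank" href="/people/(.*?)">.*?</a>', re.S)
print(re.findall(pattern, html))  # ['mu-huan-98', 'zeus871219']
```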
The pipelines module:
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class ZhihuPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(
            host='localhost',   # local, i.e. 127.0.0.1
            port=3306,          # default MySQL port
            user='root',        # MySQL user with full privileges
            passwd='****',      # password for that user
            db='zh',            # database name
            charset='utf8'
        )

    def process_item(self, item, spider):
        self._conditional_insert(self.conn.cursor(), item)  # insert the item
        # query.addErrback(self._handle_error, item, spider)  # error handler hook (unused here)
        return item

    def _conditional_insert(self, tx, item):
        sql = "insert into user(id,url,nick_name,summary,content) values(%s,%s,%s,%s,%s)"
        params = (item["Id"], item["Url"], item['Nick_name'], item['Summary'], item['Content'])
        tx.execute(sql, params)
        print('Inserted one row!')
        tx.close()
        self.conn.commit()
        # self.conn.close()

    # Error handler
    def _handle_error(self, failue, item, spider):
        print(failue)
```
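The pipeline assumes a `zh` database with a `user` table whose columns match the INSERT statement. The original post does not show the schema, so the following one-off helper is only a sketch; the column types and lengths are my guesses:

```python
# One-off helper to create the `zh` database and `user` table the pipeline
# expects. Column types/lengths are assumptions; adjust to your data.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='****', charset='utf8')
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS zh DEFAULT CHARACTER SET utf8")
    cur.execute("USE zh")
    cur.execute("""
        CREATE TABLE IF NOT EXISTS user (
            id        VARCHAR(100) PRIMARY KEY,  -- the user's url token
            url       VARCHAR(255),
            nick_name VARCHAR(100),
            summary   VARCHAR(255),
            content   TEXT
        ) DEFAULT CHARACTER SET utf8
    """)
conn.commit()
conn.close()
```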
The items module:
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field


class ZhihuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class UserItem(scrapy.Item):
    """
    A Zhihu user's username, location, industry, employment history,
    education history, and personal summary.
    """
    Id = Field()
    Url = Field()
    Nick_name = Field()
    Summary = Field()
    # Home_Position = Field()
    # Compmany = Field()
    # Edu = Field()
    Content = Field()
```
The middlewares module:
```python
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from zhihu.getCookie import cookie
from scrapy import signals


class CookiesMiddleware(object):
    """Attach the login cookie to every outgoing request."""

    def process_request(self, request, spider):
        # cookie = random.choice(cookies)  # could rotate over a pool of cookies
        request.cookies = cookie


class ZhihuSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
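None of this takes effect until the middleware and pipeline are registered in `settings.py`. The original post does not show its settings, so this is a minimal sketch; the dotted paths assume the project package is named `zhihu`, as in the imports above:

```python
# settings.py (sketch): wiring the cookie middleware and MySQL pipeline together.
BOT_NAME = 'zhihu'
SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'

ROBOTSTXT_OBEY = False   # Zhihu's robots.txt would otherwise likely block the crawl
DOWNLOAD_DELAY = 2       # the two-second delay mentioned below

DOWNLOADER_MIDDLEWARES = {
    'zhihu.middlewares.CookiesMiddleware': 543,
}

ITEM_PIPELINES = {
    'zhihu.pipelines.MysqlPipeline': 300,
}
```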
The code runs on a single machine. If everything is set up correctly it can keep running indefinitely, and in theory it will eventually crawl every user. In my test I used a two-second download delay, which is fairly slow: roughly ten to twenty thousand fields per hour. A sample of the crawled data:
Project address:
GitHub: https://github.com/nanxung/-Scrapy