1. Install Scrapy (Windows)
Run the following command in CMD to verify that the installation is complete.
If no error is reported, the installation succeeded.
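For example, assuming Scrapy was installed with pip, a minimal install-and-verify sequence looks like this:

pip install scrapy
scrapy version
python -c "import scrapy; print(scrapy.__version__)"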
2. Start a project
scrapy startproject zhihuapi
Check the generated directory and open it in PyCharm.
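For reference, the layout that scrapy startproject generates looks roughly like this (middlewares.py is present in recent Scrapy versions; the spider file zhihu.py is added later under spiders/):

zhihuapi/
    scrapy.cfg
    zhihuapi/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            zhihu.py        # the spider written below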
settings.py
# Defaults to True; obeying robots.txt can block this crawl, so set it to False for now
ROBOTSTXT_OBEY = False

# Enable the default request headers and add a User-Agent
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class UsersItem(Item):
    answer_count = Field()
    articles_count = Field()
    avatar_url = Field()
    avatar_url_template = Field()
    badge = Field()
    follower_count = Field()
    gender = Field()
    headline = Field()
    id = Field()
    is_advertiser = Field()
    is_followed = Field()
    is_following = Field()
    is_org = Field()
    name = Field()
    type = Field()
    url = Field()
    url_token = Field()
    user_type = Field()
zhihu.py
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
import json
import re
from ..items import UsersItem

f = open(r'F:\zhihu.txt', 'w', encoding='utf-8')


class UserinfoSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    start_user = 'kaifulee'
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    follow_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        # Seed user's profile page and followees page
        # url = 'https://www.zhihu.com/api/v4/members/kaifulee/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20'
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.user_parse)
        yield Request(self.follow_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follow_parse)

    def user_parse(self, response):
        result = json.loads(response.text)
        # Collect the user's profile fields
        item = UsersItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(self.follow_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follow_parse)

    def follow_parse(self, response):
        results = json.loads(response.text)
        # Take each followee's url_token and feed it back into the user page for recursion
        if 'data' in results.keys():
            # As a sanity check, save the user names to a txt file
            for result in results.get('data'):
                print(result['name'])
                f.writelines(result['name'] + ' | ')
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Possibly as an anti-crawling measure, the "next" url returned here cannot be accessed directly,
            # so extract url_token and offset with a regex and build the next-page url ourselves
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follow_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follow_parse)
Run it directly from PyCharm's terminal.
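The crawl command uses the spider name defined in the class above ('zhihu'), run from the project root:

scrapy crawl zhihu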
Opening zhihu.txt shows that quite a few users were indeed captured, and a half-hour test run without any proxy was never blocked by a Zhihu captcha.
3. Crawl the followers page at the same time and store the results in MongoDB
The code above crawls url_token values from the followees page (people I follow). Of course you can also crawl url_token values from the followers page (people who follow me) at the same time: just add a followers URL next to the followees URL (the include query is identical, no change needed) and add a follower_parse function.
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
import json
import re
from ..items import UsersItem

# Kept from the first version so follower_parse can still log names to a txt file
f = open(r'F:\zhihu.txt', 'w', encoding='utf-8')


class UserinfoSpider(Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['http://www.zhihu.com/']

    start_user = 'kaifulee'
    # User profile url
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'
    # People this user follows (followees)
    follow_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    # People following this user (followers)
    follower_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    follow_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        # Seed user's profile page, followees page and followers page
        # url = 'https://www.zhihu.com/api/v4/members/kaifulee/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20'
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query),
                      callback=self.user_parse)
        yield Request(self.follow_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follow_parse)
        yield Request(self.follower_url.format(user=self.start_user, include=self.follow_query, offset=0, limit=20),
                      callback=self.follower_parse)

    def user_parse(self, response):
        result = json.loads(response.text)
        # Collect the user's profile fields
        item = UsersItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(self.follow_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follow_parse)
        yield Request(self.follower_url.format(user=result.get('url_token'), include=self.follow_query, limit=20, offset=0),
                      callback=self.follower_parse)

    def follow_parse(self, response):
        results = json.loads(response.text)
        # Take each followee's url_token and feed it back into the user page for recursion
        if 'data' in results.keys():
            for result in results.get('data'):
                print(result['name'])
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the followee list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Possibly as an anti-crawling measure, the "next" url returned here cannot be accessed directly,
            # so extract url_token and offset with a regex and build the next-page url ourselves
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follow_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follow_parse)

    def follower_parse(self, response):
        results = json.loads(response.text)
        # Take each follower's url_token and feed it back into the user page for recursion
        if 'data' in results.keys():
            # As a sanity check, save the user names to a txt file
            for result in results.get('data'):
                print(result['name'])
                f.writelines(result['name'] + ' | ')
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.user_parse)
        # Paginate through the follower list
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            num_re = re.compile(r'members/(.*?)/.*?offset=(\d+)').search(next_page)
            # Same trick as above: rebuild the next-page url from url_token and offset
            print('get url token {} offset {}'.format(num_re.group(1), num_re.group(2)))
            user = num_re.group(1)
            offset = num_re.group(2)
            next_page_url = self.follower_url.format(user=user, include=self.follow_query, limit=20, offset=offset)
            print('next_page_url={}'.format(next_page_url))
            yield Request(next_page_url, self.follower_parse)
To save the results to MongoDB, copy the MongoDB demo from the official Scrapy documentation and use it almost as-is; only the database write needs a small change so that records are deduplicated before being stored.
Official docs: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
pipelines.py
import pymongo


class MongoPipeline(object):
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert on url_token (third argument True) so duplicate users are updated instead of inserted again
        self.db['users'].update({'url_token': item['url_token']}, {'$set': item}, True)
        return item
In settings.py, enable the pipeline and add the MongoDB configuration.
ITEM_PIPELINES = {
    # The dotted path must match your project package; for the zhihuapi project created above
    # it would be 'zhihuapi.pipelines.MongoPipeline'
    'first_scrapy.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
Why these two variable names for MongoDB? Because MongoPipeline reads exactly these two settings in from_crawler, so the names must match; if you want to rename them, you also have to change the names inside pipelines.py.
@classmethod
def from_crawler(cls, crawler):
    return cls(
        mongo_uri=crawler.settings.get('MONGO_URI'),
        mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
    )
The results in MongoDB:
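To spot-check what the pipeline wrote, here is a minimal pymongo sketch (my own addition, not from the original post), using the same MONGO_URI and MONGO_DATABASE values as above; count_documents requires pymongo 3.7+:

import pymongo

# Connect with the same settings the pipeline uses
client = pymongo.MongoClient('localhost')
db = client['zhihu']

# Number of unique users stored so far (one document per url_token thanks to the upsert)
print(db['users'].count_documents({}))

# Peek at one stored document, showing a few fields
print(db['users'].find_one({}, {'name': 1, 'url_token': 1, 'follower_count': 1}))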
In one location I ran it for half an hour without a proxy and scraped over 20,000 records without ever seeing a captcha; in another location, using a proxy, the captcha appeared after about 4,000 records. The captcha problem can be handled with a dynamic proxy pool maintained behind a Flask API.
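A rough sketch of how such a proxy pool could be wired into Scrapy, as a downloader middleware. The pool endpoint http://127.0.0.1:5000/get and its plain-text response format are assumptions; the proxy-pool service itself is not covered in this post:

import requests


class RandomProxyMiddleware(object):
    """Downloader middleware that attaches a proxy fetched from a Flask-based proxy pool."""

    # Hypothetical endpoint of the proxy-pool service; adjust to your own deployment
    proxy_pool_url = 'http://127.0.0.1:5000/get'

    def get_random_proxy(self):
        try:
            resp = requests.get(self.proxy_pool_url, timeout=5)
            if resp.status_code == 200:
                return resp.text.strip()  # e.g. '123.45.67.89:8080'
        except requests.RequestException:
            pass
        return None

    def process_request(self, request, spider):
        proxy = self.get_random_proxy()
        if proxy:
            # Scrapy's HTTP downloader picks the proxy up from request.meta['proxy']
            request.meta['proxy'] = 'http://{}'.format(proxy)

Enable it in settings.py with something like DOWNLOADER_MIDDLEWARES = {'zhihuapi.middlewares.RandomProxyMiddleware': 543}, adjusting the dotted path to wherever you put the class.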