# Account used for the phone-number login form.
# NOTE(review): credentials are hard-coded in the source file; consider
# loading them from the environment or Scrapy settings instead.
phone = '13889083841'
password = '登陸密碼'

# Site root; every other URL in this module is built from it.
domain = 'https://www.zhihu.com/'

# The crawl starts from this user's followee list.
firstUser = 'liaoxuefeng'

# Followee-list API template; `{0}` is replaced with a user's url_token.
firstUrl = (
    domain
    + 'api/v4/members/{0}/followees?include=data[*].'
    'answer_count,articles_count,follower_count,following_count'
)

# url_tokens of high-follower users still waiting to be crawled.
topUsers = set()
from scrapy import Spider,Request,FormRequest
from ..items import ZhihuloginItem
from io import BytesIO
from PIL import Image
import json,time,urllib
class ZhihuusersSpider(Spider):
    """Log in to Zhihu by phone number, then crawl followee lists.

    Request chain: home page (to read ``_xsrf``) -> captcha image (answered
    interactively on the console) -> login form -> followee API of
    ``firstUser``.  Every followee is emitted as a ``ZhihuloginItem``; those
    with more than ``topFollowerThreshold`` followers are queued in the
    module-level ``topUsers`` set and crawled one after another.
    """

    name = 'zhihuUsers'
    allowed_domains = ['www.zhihu.com']
    # start_urls = ['https://www.zhihu.com/']

    # Followees with more followers than this are queued for crawling.
    topFollowerThreshold = 10000

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # url_tokens that were already crawled or queued.  Requesting the same
        # followee URL twice would be dropped by Scrapy's duplicate filter and
        # the one-request-at-a-time crawl chain would stop with users still
        # queued, so such users are never queued again.
        self._queuedUsers = {firstUser}

    def start_requests(self):
        """Fetch the home page so that ``captcha`` can read the ``_xsrf`` token."""
        # Original author's note, translated: leave `COOKIES_ENABLED = False`
        # in settings in its default commented-out state.
        # NOTE(review): presumably because the login session needs cookies.
        # Returning [Request(...)] instead of yielding would work as well.
        yield Request(domain, self.captcha, dont_filter=True)

    def captcha(self, response):
        """Read ``_xsrf`` from the page and request a fresh captcha image."""
        xsrf = response.css('[name=_xsrf]::attr(value)').extract_first()
        if xsrf is None:
            self.logger.warning('No _xsrf token found on %s', response.url)
        r = int(time.time() * 1000)  # millisecond timestamp sent as `r`
        captchaUrl = domain + f'captcha.gif?r={r}&type=login&lang=cn'
        # Request style 1: pass everything to the constructor.
        yield Request(captchaUrl, self.getCaptcha, meta={'xsrf': xsrf})
        # Request style 2: sometimes one Request has to be built in several
        # statements, e.g. (Lagou) when a cookie obtained by an earlier request
        # needs an extra key: request.cookies['LGUID'] = ...['user_trace_token'][0]
        #   request = Request(captchaUrl, self.getCaptcha)
        #   request.meta['xsrf'] = xsrf
        #   yield request

    @staticmethod
    def _captchaPayload(answer):
        """Turn console input such as ``'1-3'`` into the captcha JSON string.

        Each 1-based index ``i`` becomes the point ``[i * 23, 23]`` inside the
        declared 200x44 image.

        Raises:
            ValueError: if a part is not an integer or there are more than
                two indices (only one- and two-point answers are supported).
        """
        # NOTE(review): presumably 23 px is the per-character spacing and the
        # vertical centre of the captcha image -- confirm.
        xs = [int(part) * 23 for part in answer.split('-')]
        if len(xs) > 2:
            raise ValueError('expected one or two indices, got %d' % len(xs))
        points = ','.join(f'[{x},23]' for x in xs)
        return '{"img_size":[200,44],"input_points":[%s]}' % points

    def getCaptcha(self, response):
        """Show the captcha, ask the user for the answer and submit the login form."""
        # Alternative way to obtain _xsrf: read it from the request's Cookie
        # header.  The header value is bytes, so decode it, strip the spaces
        # and parse it with parse_qs():
        #   cookies = response.request.headers.get(b'Cookie')
        #   if cookies:
        #       xsrf = urllib.parse.parse_qs(cookies.decode().replace(' ', ''))['_xsrf'][0]
        Image.open(BytesIO(response.body)).show()
        # The captcha usually contains two upside-down characters, occasionally
        # one.  Keep asking until the answer can be encoded.
        while True:
            answer = input('輸入各倒字的序號如1-3,自1始,以-隔:')
            try:
                captcha = self._captchaPayload(answer)
            except ValueError as err:
                self.logger.warning('Invalid captcha answer %r: %s', answer, err)
            else:
                break
        fd = {
            'captcha_type': 'cn',
            'captcha': captcha,
            'phone_num': phone,
            'password': password,
        }
        xsrf = response.meta.get('xsrf')
        if xsrf is not None:
            # FormRequest cannot encode None, so the field is only sent when
            # the token was actually found.
            fd['_xsrf'] = xsrf
        yield FormRequest(domain + 'login/phone_num', self.login, formdata=fd)

    def login(self, response):
        """Check the login result and start crawling the followees of ``firstUser``."""
        try:
            loginResult = json.loads(response.text)
        except ValueError:
            self.logger.error('Login response from %s is not valid JSON', response.url)
            return
        if isinstance(loginResult, dict) and loginResult.get('r') == 0:
            print('登陸成功,開始抓取用戶信息。。。')
            yield Request(firstUrl.format(firstUser), self.followers)
        else:
            print('登陸失敗。。。', loginResult, sep='\n')

    def followers(self, response):
        """Yield one item per followee, then follow pagination or the next top user."""
        try:
            result = json.loads(response.text)
        except ValueError:
            self.logger.error('Followee response from %s is not valid JSON', response.url)
            result = {}
        for user in result.get('data') or []:
            # A fresh item per user: reusing one instance would make every
            # yielded item refer to the same, repeatedly overwritten object.
            item = ZhihuloginItem()
            item['name'] = user['name']
            # NOTE(review): every truthy gender value maps to '男'; if the API
            # uses a non-zero code for "unknown" this mislabels it -- confirm.
            item['gender'] = '男' if user['gender'] else '女'
            item['answer_count'] = user['answer_count']
            item['articles_count'] = user['articles_count']
            item['follower_count'] = user['follower_count']
            item['following_count'] = user['following_count']
            item['url_token'] = 'https://www.zhihu.com/people/' + user['url_token']
            token = user['url_token']
            if (user['follower_count'] > self.topFollowerThreshold
                    and token not in self._queuedUsers):
                self._queuedUsers.add(token)
                topUsers.add(token)
            yield item
        paging = result.get('paging') or {}
        # `is_end` is an unquoted JSON false, i.e. a real boolean.
        if not paging.get('is_end', True) and paging.get('next'):
            yield Request(paging['next'], self.followers)
        elif topUsers:
            nextTopUser = topUsers.pop()
            yield Request(firstUrl.format(nextTopUser), self.followers)
        else:
            self.logger.info('No queued top users left; crawl finished.')