Scrapy: crawling a site with arguments

Use Scrapy with an argument (a search keyword) to crawl Zhihu and pull the top few topics returned for that keyword: title, upvote count, author, author tagline, content, and comment count.
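
The mechanism Scrapy provides for this is the -a option: every -a name=value pair given to scrapy crawl is passed to the spider's __init__ as a keyword argument. A minimal sketch of just that mechanism, separate from the Zhihu project below (the spider name "demo" and the example.com URL are placeholders):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider


class DemoSpider(Spider):
    name = "demo"

    def __init__(self, category=None, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        # "scrapy crawl demo -a category=foo" delivers category="foo" here,
        # so the start URL can be built from the command-line keyword
        self.start_urls = ["https://example.com/search?q=%s" % category]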

####################################################################

Project directory structure
├── scrapy.cfg
└── zhihu
    ├── __init__.py
    ├── __init__.pyc
    ├── items.py
    ├── items.pyc
    ├── pipelines.py
    ├── settings.py
    ├── settings.pyc
    └── spiders
        ├── __init__.py
        ├── __init__.pyc
        ├── myzhihu.py
        └── myzhihu.pyc
########################################################################

items.py

[hadoop@iZ25s7cmfyrZ zhihu]$ cat zhihu/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZhihuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()        # topic title
    vote = scrapy.Field()         # upvote count
    author = scrapy.Field()
    author_tail = scrapy.Field()  # the author's one-line tagline
    txt = scrapy.Field()          # answer summary
    pinglun = scrapy.Field()      # comment count
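
The project tree also lists pipelines.py, whose contents are not shown in this post. As one possibility, a minimal pipeline that appends every scraped item to a JSON-lines file could look like the sketch below (the class name, the zhihu.jl file name, and the whole approach are illustrative assumptions, not the project's actual pipeline):

# -*- coding: utf-8 -*-
import json


class ZhihuPipeline(object):
    # hypothetical pipeline: write one JSON object per line to zhihu.jl
    def open_spider(self, spider):
        self.outfile = open('zhihu.jl', 'w')

    def process_item(self, item, spider):
        self.outfile.write(json.dumps(dict(item)) + '\n')
        return item

    def close_spider(self, spider):
        self.outfile.close()

To take effect it would also have to be registered under ITEM_PIPELINES in settings.py.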


#################################################################################

The spider file is named myzhihu.py. (Do not give the spider file the same name as the project itself: a spiders/zhihu.py would shadow the zhihu package, and the import of ZhihuItem below would fail.)

[hadoop@iZ25s7cmfyrZ zhihu]$ cat zhihu/spiders/myzhihu.py 
#!/usr/bin/env python
#-*- coding: utf-8 -*-

import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from faker import Factory

from zhihu.items import ZhihuItem

# Faker generates realistic fake data; used here for a random User-Agent
f = Factory.create()

class ZhihuSpider(Spider):
    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    # example of the search URL this spider builds, for the keyword 機器學習:
    # https://www.zhihu.com/search?type=content&sort=upvote&q=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0

    def __init__(self, category=None, *args, **kwargs):
        super(ZhihuSpider, self).__init__(*args, **kwargs)
        # category arrives from the command line via -a category=...
        self.start_urls = [
            "https://www.zhihu.com/search?type=content&sort=upvote&q=%s" % category,
            'https://www.zhihu.com',
        ]
    # default request headers; Faker supplies a random User-Agent
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'User-Agent': f.user_agent(),
    }

    def start_requests(self):
        # meta={'cookiejar': 1} turns on per-session cookie tracking for
        # this request and the responses that follow it
        return [scrapy.Request(url=self.start_urls[0],
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_with_cookie)]


    def parse_with_cookie(self, response):
        sel = Selector(response)
        sites = sel.xpath('//li[@class="item clearfix"]')
        for site in sites:
            item = ZhihuItem()
            # the title match comes back as raw HTML; strip the tags by hand
            title = site.xpath('div[@class="title"]/a').re('\">.+</a')
            title = (title[0].replace('\">', '').replace('/', '')
                             .replace('<em>', '').replace('<a', '')
                             .replace('a>', '')) if title else ""
            vote = site.xpath('div/div/div/a/text()').extract()
            author = site.xpath("div/div/div/div/div/span/a[@class='author author-link']/text()").extract()
            author_tail = site.xpath('div/div/div/div/div/span/@title').extract()
            txt = site.xpath('div/div/div/div/div[@class="summary hidden-expanded"]').re('\">.+<a')
            pinglun = site.xpath('div/div/div/div/a/span[@class="label"]/text()').extract()
            # every xpath can come back empty, so fall back to a default
            vote = vote[0] if vote else None
            author = author[0] if author else ""
            author_tail = author_tail[0] if author_tail else ""
            pinglun = pinglun[0] if pinglun else ""
            item["title"] = title
            item["vote"] = vote
            item["author"] = author
            item["author_tail"] = author_tail
            item["txt"] = txt
            item["pinglun"] = pinglun
            yield item

#####################################################
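
A side note on the tag-stripping in parse_with_cookie: the chain of .replace() calls works, but w3lib, which is installed as a dependency of Scrapy, already ships a helper for this. A sketch of the same cleanup with it (the HTML fragment is made up for illustration):

# -*- coding: utf-8 -*-
from w3lib.html import remove_tags

fragment = u'<a href="#">What is <em>machine learning</em>?</a>'
print(remove_tags(fragment))  # -> What is machine learning?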

You can now run scrapy crawl zhihu -a category='機器學習' to get the top few topics for 機器學習 (machine learning),

or scrapy crawl zhihu -a category='中國' to get the top few topics for 中國 (China).
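
The scraped items can also be written straight to a file through Scrapy's built-in feed exports; the output file name topics.json here is just an example:

scrapy crawl zhihu -a category='機器學習' -o topics.json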

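The same category argument can also be supplied from a plain Python script instead of the command line. A minimal sketch, assuming Scrapy 1.0+ and a working directory at the project root (the script name run_zhihu.py is made up):

# run_zhihu.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zhihu.spiders.myzhihu import ZhihuSpider

process = CrawlerProcess(get_project_settings())
# keyword arguments are forwarded to ZhihuSpider.__init__, like -a does
process.crawl(ZhihuSpider, category=u'機器學習')
process.start()  # blocks until the crawl finishes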