Scrapy: crawling a site with arguments

Use Scrapy with an argument (a search keyword) to crawl Zhihu and pull the top few topics returned for that keyword: title, upvote count, author, author tagline, content, and comment count.
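
The mechanism Scrapy provides for this is the -a option: every -a name=value pair given to scrapy crawl is passed to the spider's __init__ as a keyword argument. A minimal sketch of just that mechanism, separate from the Zhihu project below (the spider name "demo" and the example.com URL are placeholders):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider


class DemoSpider(Spider):
    name = "demo"

    def __init__(self, category=None, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        # "scrapy crawl demo -a category=foo" delivers category="foo" here,
        # so the start URL can be built from the command-line keyword
        self.start_urls = ["https://example.com/search?q=%s" % category]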

####################################################################

Project directory structure
├── scrapy.cfg
└── zhihu
    ├── __init__.py
    ├── __init__.pyc
    ├── items.py
    ├── items.pyc
    ├── pipelines.py
    ├── settings.py
    ├── settings.pyc
    └── spiders
        ├── __init__.py
        ├── __init__.pyc
        ├── myzhihu.py
        └── myzhihu.pyc
########################################################################

items.py

[hadoop@iZ25s7cmfyrZ zhihu]$ cat zhihu/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZhihuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()        # topic title
    vote = scrapy.Field()         # upvote count
    author = scrapy.Field()
    author_tail = scrapy.Field()  # the author's one-line tagline
    txt = scrapy.Field()          # answer summary
    pinglun = scrapy.Field()      # comment count
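
The project tree also lists pipelines.py, whose contents are not shown in this post. As one possibility, a minimal pipeline that appends every scraped item to a JSON-lines file could look like the sketch below (the class name, the zhihu.jl file name, and the whole approach are illustrative assumptions, not the project's actual pipeline):

# -*- coding: utf-8 -*-
import json


class ZhihuPipeline(object):
    # hypothetical pipeline: write one JSON object per line to zhihu.jl
    def open_spider(self, spider):
        self.outfile = open('zhihu.jl', 'w')

    def process_item(self, item, spider):
        self.outfile.write(json.dumps(dict(item)) + '\n')
        return item

    def close_spider(self, spider):
        self.outfile.close()

To take effect it would also have to be registered under ITEM_PIPELINES in settings.py.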


#################################################################################

The spider file is named myzhihu.py. (Do not give the spider file the same name as the project itself: a spiders/zhihu.py would shadow the zhihu package, and the import of ZhihuItem below would fail.)

[hadoop@iZ25s7cmfyrZ zhihu]$ cat zhihu/spiders/myzhihu.py 
#!/usr/bin/env python
#-*- coding: utf-8 -*-

import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from faker import Factory

from zhihu.items import ZhihuItem

# Faker generates realistic fake data; used here for a random User-Agent
f = Factory.create()

class ZhihuSpider(Spider):
    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    # example of the search URL this spider builds, for the keyword 機器學習:
    # https://www.zhihu.com/search?type=content&sort=upvote&q=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0

    def __init__(self, category=None, *args, **kwargs):
        super(ZhihuSpider, self).__init__(*args, **kwargs)
        # category arrives from the command line via -a category=...
        self.start_urls = [
            "https://www.zhihu.com/search?type=content&sort=upvote&q=%s" % category,
            'https://www.zhihu.com',
        ]
    # default request headers; Faker supplies a random User-Agent
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'User-Agent': f.user_agent(),
    }

    def start_requests(self):
        # meta={'cookiejar': 1} turns on per-session cookie tracking for
        # this request and the responses that follow it
        return [scrapy.Request(url=self.start_urls[0],
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_with_cookie)]


    def parse_with_cookie(self, response):
        sel = Selector(response)
        sites = sel.xpath('//li[@class="item clearfix"]')
        for site in sites:
            item = ZhihuItem()
            # the title match comes back as raw HTML; strip the tags by hand
            title = site.xpath('div[@class="title"]/a').re('\">.+</a')
            title = (title[0].replace('\">', '').replace('/', '')
                             .replace('<em>', '').replace('<a', '')
                             .replace('a>', '')) if title else ""
            vote = site.xpath('div/div/div/a/text()').extract()
            author = site.xpath("div/div/div/div/div/span/a[@class='author author-link']/text()").extract()
            author_tail = site.xpath('div/div/div/div/div/span/@title').extract()
            txt = site.xpath('div/div/div/div/div[@class="summary hidden-expanded"]').re('\">.+<a')
            pinglun = site.xpath('div/div/div/div/a/span[@class="label"]/text()').extract()
            # every xpath can come back empty, so fall back to a default
            vote = vote[0] if vote else None
            author = author[0] if author else ""
            author_tail = author_tail[0] if author_tail else ""
            pinglun = pinglun[0] if pinglun else ""
            item["title"] = title
            item["vote"] = vote
            item["author"] = author
            item["author_tail"] = author_tail
            item["txt"] = txt
            item["pinglun"] = pinglun
            yield item

#####################################################
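
A side note on the tag-stripping in parse_with_cookie: the chain of .replace() calls works, but w3lib, which is installed as a dependency of Scrapy, already ships a helper for this. A sketch of the same cleanup with it (the HTML fragment is made up for illustration):

# -*- coding: utf-8 -*-
from w3lib.html import remove_tags

fragment = u'<a href="#">What is <em>machine learning</em>?</a>'
print(remove_tags(fragment))  # -> What is machine learning?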

You can now run scrapy crawl zhihu -a category='機器學習' to get the top few topics for 機器學習 (machine learning),

or scrapy crawl zhihu -a category='中國' to get the top few topics for 中國 (China).
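
The scraped items can also be written straight to a file through Scrapy's built-in feed exports; the output file name topics.json here is just an example:

scrapy crawl zhihu -a category='機器學習' -o topics.json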

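The same category argument can also be supplied from a plain Python script instead of the command line. A minimal sketch, assuming Scrapy 1.0+ and a working directory at the project root (the script name run_zhihu.py is made up):

# run_zhihu.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zhihu.spiders.myzhihu import ZhihuSpider

process = CrawlerProcess(get_project_settings())
# keyword arguments are forwarded to ZhihuSpider.__init__, like -a does
process.crawl(ZhihuSpider, category=u'機器學習')
process.start()  # blocks until the crawl finishes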