scrapy高級用法之自動分頁

原理

根據規則從頁面的 HTML 中提取「下一頁」或「其餘分頁」的連結，再由爬蟲自動跟進這些連結。

用到模塊

from pyquery import PyQuery as pq
  from scrapy.contrib.spiders import CrawlSpider, Rule
  from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

完整代碼

# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class SegSpider(CrawlSpider):
    """Crawl the SegmentFault ``html5`` tag listing and emit one item per question.

    The spider starts at page 1 of the "newest" listing. Two link-extraction
    rules drive the crawl: one matches further listing pages (pagination),
    the other matches question pages and hands them to :meth:`parse_item`.
    """

    name = "seg"
    allowed_domains = ["segmentfault.com"]
    start_urls = (
        'http://segmentfault.com/t/html5?type=newest&page=1',
    )

    # Patterns are raw strings so the regex backslashes are not interpreted
    # as (invalid) string escape sequences; the pattern text is unchanged.
    rules = (
        # Listing pages: no callback is given, so these links are only used
        # to discover more links (Rule's `follow` default when callback is
        # omitted, per the Scrapy documentation).
        Rule(SgmlLinkExtractor(allow=(r'\/t\/html5\?type=newest\&page=\d', ))),
        # Question pages: each matching response is passed to parse_item.
        Rule(SgmlLinkExtractor(allow=(r'\/q\/\d+', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Yield a dict with the URL and ``<title>`` text of a question page.

        Args:
            response: the downloaded page; its ``body`` is parsed with
                PyQuery and its ``url`` is copied into the item.

        Yields:
            dict: ``{'url': ..., 'title': ...}`` for the page.
        """
        doc = pq(response.body)
        yield {
            'url': response.url,
            'title': doc('title').text(),
        }
相關文章
相關標籤/搜索