根據規則從頁面中提取到「下一頁」或「其餘分頁」的連結 html
from pyquery import PyQuery as pq from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class SegSpider(CrawlSpider):
    """Crawl the newest HTML5-tagged listing on segmentfault.com.

    Starting from page 1 of the listing, the spider applies two link rules:
    one matching further listing pages and one matching question pages.
    Each matched question page is handed to :meth:`parse_item`.
    """

    name = "seg"
    allowed_domains = ["segmentfault.com"]
    start_urls = (
        'http://segmentfault.com/t/html5?type=newest&page=1',
    )

    # The patterns are raw strings so that the regex escapes (``\/``, ``\?``,
    # ``\&``, ``\d``) reach the regex engine unchanged and do not trigger
    # Python's invalid-escape-sequence warnings.
    rules = (
        # Pagination links of the listing. No callback is given; per the
        # Scrapy Rule documentation, ``follow`` then defaults to True, so
        # these pages are only crawled for more links.
        Rule(SgmlLinkExtractor(allow=(r'\/t\/html5\?type=newest\&page=\d', ))),
        # Question pages (``/q/<digits>``) are passed to ``parse_item``.
        Rule(SgmlLinkExtractor(allow=(r'\/q\/\d+', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Yield one item holding the URL and ``<title>`` text of a page.

        Args:
            response: The response for a page matched by the ``/q/<digits>``
                rule; its ``body`` and ``url`` attributes are read.

        Yields:
            dict: ``{'url': <response URL>, 'title': <text of the title tag>}``.
        """
        # Parse the raw response body with PyQuery to query it by selector.
        document = pq(response.body)
        yield {
            'url': response.url,
            'title': document('title').text(),
        }