import scrapy


class QuotesSpider(scrapy.Spider):
    """Spider that collects quotes from quotes.toscrape.com page by page.

    Each scraped item is a dict with the keys ``text``, ``author`` and
    ``tags``; every value is the list returned by ``.extract()`` on the
    corresponding XPath query.
    """

    name = "quotes"

    def start_requests(self):
        """Yield one request per seed URL, handled by :meth:`parse`."""
        # Only the first page is seeded here; parse() follows the "next"
        # link found on each page it processes.
        seed_urls = (
            'http://quotes.toscrape.com/page/1/',
        )
        yield from (
            scrapy.Request(url=seed, callback=self.parse)
            for seed in seed_urls
        )

    def parse(self, response):
        """Yield one item per quote block, then a request for the next page.

        Args:
            response: The downloaded page to extract quotes from.

        Yields:
            A dict per ``<div class="quote">`` element, followed by a
            ``scrapy.Request`` for the next page when a "next" link exists.
        """
        for quote_node in response.xpath('//div[@class="quote"]'):
            # A leading "." keeps each query relative to the current quote
            # node instead of searching the whole document again.
            text = quote_node.xpath('.//span[@class="text"]/text()').extract()
            author = quote_node.xpath(
                './/small[@class="author"]/text()'
            ).extract()
            # content attribute of <meta> elements under the quote's direct
            # child <div> elements.
            tags = quote_node.xpath('./div/meta/@content').extract()
            yield {
                'text': text,
                'author': author,
                'tags': tags,
            }

        next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_href is None:
            # No "next" link on this page: nothing more to schedule.
            return

        # Resolve the extracted href against this response before requesting it.
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
知識點(Python):
1.xpath 如何在循環中訪問當前節點下的內容:路徑以 './/' 開頭(例如 './/span[@class="text"]/text()')(scrapy)
2.當前循環節點下的值也能夠按照相對於當前節點的逐級路徑獲取(例如 './div/meta/@content')
3.如何以當前頁面的 url 為基準,把相對連結轉換成完整的 url:response.urljoin(url)