# -*- coding: utf-8 -*-
import scrapy

from douban_top250.items import DoubanTop250Item


class MovieSpider(scrapy.Spider):
    """Spider that walks the Douban Top 250 list and yields one item per entry.

    Each listing page is parsed into ``DoubanTop250Item`` objects, after which
    the spider follows the "next page" link until none is found.
    """

    name = 'movie'

    # Listing URL; also the prefix that relative "next page" hrefs are
    # appended to in ``parse``.
    base_url = 'https://movie.douban.com/top250'

    # Sent with every request so it carries a browser-like User-Agent.
    header = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/59.0.3071.109 Safari/537.36"
        )
    }

    def start_requests(self):
        """Yield the initial request for the first listing page."""
        yield scrapy.Request(url=self.base_url, headers=self.header)

    def parse(self, response):
        """Extract every list entry on the page, then follow pagination.

        Yields:
            One ``DoubanTop250Item`` per ``<li>`` in the listing, followed by
            a ``scrapy.Request`` for the next page when a next link exists.
        """
        entries = response.xpath("//*[@id='content']/div/div[1]/ol/li")
        for entry in entries:
            # Build a fresh item for every entry: reusing one instance across
            # yields would make all emitted items alias the same object, so
            # later rows could overwrite earlier ones before they are stored.
            item = DoubanTop250Item()
            # ``extract()`` returns a list of matched strings for each field.
            item['ranking'] = entry.xpath("div/div[1]/em/text()").extract()
            item['name'] = entry.xpath(
                "div/div[2]/div[1]/a/span[1]/text()"
            ).extract()
            item['grade'] = entry.xpath(
                "div/div[2]/div[2]/div/span[2]/text()"
            ).extract()
            item['score_num'] = entry.xpath(
                "div/div[2]/div[2]/div/span[4]/text()"
            ).extract()
            yield item

        next_href = response.xpath(
            "//*[@id='content']/div/div[1]/div[2]/span[3]/link/@href"
        ).extract()
        if next_href:
            # The href is appended to the listing URL as-is (presumably a
            # query string such as "?start=25" -- confirm against the page).
            next_url = self.base_url + next_href[0]
            # No explicit callback, so Scrapy routes the response to ``parse``.
            yield scrapy.Request(next_url, headers=self.header)