# -*- coding: utf-8 -*-
"""Scrapy spider that scrapes the Douban Movie Top 250 list."""
import scrapy

from douban.items import DoubanItem


class DoubanspiderSpider(scrapy.Spider):
    """Crawl the Douban Top 250 pages and yield one ``DoubanItem`` per movie."""

    name = "doubanspider"
    # NOTE: allowed_domains is deliberately left unset. A value such as
    # "movie.douban.com/top250" restricts the crawl to the landing page, so
    # requests for the following pages may fall outside the allowed range.
    start_urls = ['http://movie.douban.com/top250']

    # Separator between the director segment and the cast segment of the
    # first description line (three non-breaking spaces).
    _CREDITS_SEPARATOR = '\xa0\xa0\xa0'
    # Separator between year, country and genre on the second description line.
    _INFO_SEPARATOR = '/'

    def parse(self, response):
        """Yield an item for every movie on the page, then follow the next page.

        Args:
            response: The response for one page of the Top 250 list.

        Yields:
            A ``DoubanItem`` per movie entry, followed by a ``scrapy.Request``
            for the next page when the paginator exposes one.
        """
        for movie in response.css('.article .grid_view li'):
            yield self._parse_movie(movie)

        # Build the request for the next page.
        next_href = response.css('.paginator .next a::attr(href)').extract_first()
        if next_href:
            # Resolve the (relative) href against the URL that was actually
            # served instead of concatenating onto a hard-coded base.
            url = response.urljoin(next_href)
            self.logger.info('Following next page: %s', url)
            yield scrapy.Request(url=url, callback=self.parse)

    def _parse_movie(self, movie):
        """Build a fresh ``DoubanItem`` from one ``<li>`` movie selector.

        A new item is created on every call so that yielded items never share
        state. Any field whose source text is absent is set to ``None``
        rather than raising ``IndexError``.
        """
        lines = movie.css('.item .bd p::text').extract()
        # First text node: director and cast; second: year / country / genre.
        credits = lines[0].strip() if lines else None
        info = lines[1].strip() if len(lines) > 1 else None

        return DoubanItem(
            # Movie title
            title=movie.css('.item .hd .title:nth-child(1)::text').extract_first(),
            # Director
            director=self._nth_part(credits, self._CREDITS_SEPARATOR, 0),
            # Cast
            actor=self._nth_part(credits, self._CREDITS_SEPARATOR, 1),
            # Release year
            year=self._nth_part(info, self._INFO_SEPARATOR, 0),
            # Country
            country=self._nth_part(info, self._INFO_SEPARATOR, 1),
            # Genre
            type=self._nth_part(info, self._INFO_SEPARATOR, 2),
            # Rating
            rating_num=movie.css('.item .bd .star .rating_num::text').extract_first(),
            # Famous quote
            quote=movie.css('.item .bd .quote span::text').extract_first(),
            # Poster image URL
            image=movie.css('.item .pic a img::attr(src)').extract_first(),
        )

    @staticmethod
    def _nth_part(text, separator, index):
        """Return the stripped ``index``-th piece of ``text`` split on ``separator``.

        Returns ``None`` when ``text`` is ``None`` or has fewer than
        ``index + 1`` pieces.
        """
        if text is None:
            return None
        parts = text.split(separator)
        if index >= len(parts):
            return None
        return parts[index].strip()