咱們知道蜘蛛運行時會下載start_urls中要爬取的頁面,頁面的返回對象response響應體就會封裝到parse方法的response對象裏面,而後經過response對象的css選擇器定位元素,返回一個selector對象,再經過extract()方法來提取selector對象中標籤的信息。
那如今咱們使用dribbble網站來試着解析字段信息,建立一個dribbble蜘蛛,就和以前建立csdn同樣,而後將測試頁面中的execute()方法中的參數改成須要測試的蜘蛛頁面中的name屬性值。
from urllib import parse

import scrapy
from scrapy.http import Request


class DribbbleSpider(scrapy.Spider):
    """Follow teaser links from the Dribbble stories page and print story details.

    ``parse`` handles the listing response for ``start_urls`` and schedules one
    request per teaser link; ``parse_analyse`` handles each followed page and
    prints the thumbnail URL, the title and the date it finds there.
    """

    name = 'dribbble'
    allowed_domains = ['dribbble.com']
    start_urls = ['https://dribbble.com/stories']

    def parse(self, response):
        """Yield a detail-page request for every teaser link that has an image.

        Args:
            response: The downloaded listing page.

        Yields:
            Request: One request per ``header div.teaser a`` node that has both
            an ``href`` and a nested ``<img src>``. The image URL travels with
            the request in ``meta['a_image_url']``.
        """
        for a_node in response.css('header div.teaser a'):
            # extract_first() returns None instead of raising when nothing
            # matches, so one incomplete anchor no longer aborts the generator
            # and drops every link that follows it.
            a_url = a_node.css('::attr(href)').extract_first()
            a_image_url = a_node.css('img::attr(src)').extract_first()
            if not a_url or not a_image_url:
                continue

            yield Request(
                # hrefs may be relative; resolve them against the page URL.
                url=parse.urljoin(response.url, a_url),
                callback=self.parse_analyse,
                meta={'a_image_url': a_image_url},
            )

    def parse_analyse(self, response):
        """Print the image URL, title and date for one followed page.

        Args:
            response: The downloaded detail page; ``response.meta`` carries the
                ``a_image_url`` set by ``parse``.

        A missing title or date is printed as ``None`` rather than raising.
        """
        a_image_url = response.meta.get('a_image_url')
        title = response.css('.post header h1::text').extract_first()
        date = response.css('span.date::text').extract_first()
        # Only strip when a date node was actually found.
        date = date.strip() if date else date

        print('圖片的url是:{}'.format(a_image_url))
        print('標題是: {}'.format(title))
        print('時間是:{}'.format(date))
scrapy.Item
,而後咱們能夠根據咱們的需求在自動生成的這個modle中隨意建立字段;
import scrapy


# Item container with the three fields this tutorial collects for each page.
# Each attribute is declared as a scrapy.Field() and is read/written by key,
# e.g. item['title'].
class XkdDribbbleSpiderItem(scrapy.Item):
    # Title text of the scraped page.
    title = scrapy.Field()
    # URL of the image associated with the page.
    a_image_url = scrapy.Field()
    # Date associated with the page.
    date = scrapy.Field()
from datetime import datetime
from urllib import parse

import scrapy
from scrapy.http import Request

from ..items import XkdDribbbleSpiderItem


class DribbbleSpider(scrapy.Spider):
    """Follow teaser links from the Dribbble stories page and emit one item per page.

    ``parse`` handles the listing response for ``start_urls`` and schedules one
    request per teaser link; ``parse_analyse`` handles each followed page and
    yields an ``XkdDribbbleSpiderItem`` with the image URL, title and date.
    """

    name = 'dribbble'
    allowed_domains = ['dribbble.com']
    start_urls = ['https://dribbble.com/stories']

    def parse(self, response):
        """Yield a detail-page request for every teaser link that has an image.

        Args:
            response: The downloaded listing page.

        Yields:
            Request: One request per ``header div.teaser a`` node that has both
            an ``href`` and a nested ``<img src>``. The image URL travels with
            the request in ``meta['a_image_url']``.
        """
        for a_node in response.css('header div.teaser a'):
            # extract_first() returns None instead of raising when nothing
            # matches, so one incomplete anchor no longer aborts the generator
            # and drops every link that follows it.
            a_url = a_node.css('::attr(href)').extract_first()
            a_image_url = a_node.css('img::attr(src)').extract_first()
            if not a_url or not a_image_url:
                continue

            yield Request(
                # hrefs may be relative; resolve them against the page URL.
                url=parse.urljoin(response.url, a_url),
                callback=self.parse_analyse,
                meta={'a_image_url': a_image_url},
            )

    def parse_analyse(self, response):
        """Build and yield the item for one followed page.

        Args:
            response: The downloaded detail page; ``response.meta`` carries the
                ``a_image_url`` set by ``parse``.

        Yields:
            XkdDribbbleSpiderItem: With ``a_image_url``, ``title`` and ``date``
            set. ``date`` is a ``datetime.date``, or ``None`` when the page has
            no parsable date. Nothing is yielded when the page has no title.
        """
        title = response.css('.post header h1::text').extract_first()
        if title is None:
            # Previously this surfaced as an IndexError traceback; the outcome
            # (no item for this page) is the same, but the cause is now logged.
            self.logger.warning('No title found on %s; skipping', response.url)
            return

        date_text = response.css('span.date::text').extract_first()

        # Build the item.
        dri_item = XkdDribbbleSpiderItem()
        dri_item['a_image_url'] = response.meta.get('a_image_url')
        dri_item['title'] = title
        dri_item['date'] = self._parse_date(date_text, response.url)
        yield dri_item

    def _parse_date(self, date_text, page_url):
        """Convert text such as ``'Jan 02, 2020'`` into a ``datetime.date``.

        Args:
            date_text: Raw text of the date node, or ``None`` if none was found.
            page_url: URL of the page being parsed, used only in log messages.

        Returns:
            The parsed ``datetime.date``, or ``None`` when ``date_text`` is
            empty or does not match the ``'%b %d, %Y'`` format.
        """
        if not date_text:
            self.logger.warning('No date found on %s', page_url)
            return None
        try:
            # NOTE: %b matches the locale's abbreviated month name, so this
            # expects English month names under the default C locale.
            return datetime.strptime(date_text.strip(), '%b %d, %Y').date()
        except ValueError:
            self.logger.warning('Unrecognised date %r on %s', date_text, page_url)
            return None
參考:https://www.9xkd.com/user/plan-view.html?id=4097329051