Items
Items就是結構化數據的模塊,相當於字典,比如定義一個{"title":"","author":""};ItemLoader就是從網頁中提取title和author字段填充到items裏,比如{"title":"初學scrapy","author":"Alex"};然後items把結構化的數據傳給pipeline,pipeline可以把數據插入MySQL裏。
實例
items.py
import scrapy


class JobBoleArticleItem(scrapy.Item):
    """Declare the fields collected for a single scraped article.

    Each attribute is a ``scrapy.Field()`` with no extra metadata; the
    declaration order is kept exactly as in the original definition.
    """

    # Article headline.
    title = scrapy.Field()
    # Publication date text of the article.
    create_date = scrapy.Field()
    # Address of the article page.
    url = scrapy.Field()
    # Identifier derived from the URL (presumably an MD5 digest; confirm
    # against the spider that fills it).
    url_object_id = scrapy.Field()
    # Cover image address of the article.
    front_image_url = scrapy.Field()
    # Presumably the local path of the downloaded cover image; nothing in
    # this file populates it -- TODO confirm against the image pipeline.
    front_image_path = scrapy.Field()
    # Up-vote, comment and bookmark counters.
    praise_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()
jobbole.py
# -*- coding: utf-8 -*-
import datetime
import re
from urllib import parse

import scrapy
from scrapy.http import Request
from scrapy.loader import ItemLoader

from ArticleSpider.items import JobBoleArticleItem
from utils.common import get_md5


class JpbboleSpider(scrapy.Spider):
    """Crawl the blog.jobbole.com post listing and load every article into an item.

    Flow:
      1. Scrapy downloads ``start_urls`` and hands the response to ``parse``.
      2. ``parse`` schedules one request per article link on the listing
         page (handled by ``parse_detail``) and then one request for the
         next listing page (handled by ``parse`` again).
      3. ``parse_detail`` fills a ``JobBoleArticleItem`` through an
         ``ItemLoader`` and yields it.
    """

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    # The listing page is downloaded first and passed to ``parse``.
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """Schedule requests for each article on a listing page and for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``Request`` objects: one per article link (callback
            ``parse_detail``, with the cover image URL carried in ``meta``)
            and, when a next-page link exists, one for that page (callback
            ``parse``).
        """
        # Every article link on the listing page; the cover <img> sits inside the <a>.
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            if not post_url:
                # Without an href, urljoin would return the listing URL itself
                # and the listing page would be parsed as if it were an article.
                continue
            yield Request(
                # href may be relative, so resolve it against the current page.
                url=parse.urljoin(response.url, post_url),
                # Carry the cover image URL over to the detail callback.
                meta={"front_image_url": image_url},
                callback=self.parse_detail,
            )

        # Follow pagination: the next listing page goes through ``parse`` again.
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            # Resolved against the current page, same as the article links.
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Extract the article fields from a detail page into an item.

        Args:
            response: The downloaded article page; ``response.meta`` may
                carry ``front_image_url`` set by ``parse``.

        Yields:
            The ``JobBoleArticleItem`` produced by the item loader.
        """
        # The cover image URL is passed in by ``parse`` through the request meta;
        # fall back to an empty string when it is absent.
        front_image_url = response.meta.get("front_image_url", "")

        # Bind the loader to a fresh item and to the downloaded page.
        item_loader = ItemLoader(item=JobBoleArticleItem(), response=response)

        # Values that are already known and need no selector.
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])

        # Values extracted from the page with CSS selectors.
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")

        # Build the populated item and hand it back to Scrapy instead of
        # dropping it at the end of the callback.
        article_item = item_loader.load_item()
        yield article_item
debug調試,可以看到拿到的信息