Asynchronously storing data crawled by Scrapy into MySQL

As an example, we will use a Scrapy crawler to scrape the detail data of all article pages on Jianshu.

1. In cmd, inside the jbook project, run scrapy genspider -t crawl js jianshu.com to generate a CrawlSpider-based spider.

After creating the crawler project, it is best to add a launcher script, start.py, in the project root:

from scrapy import cmdline   # run the crawl command programmatically

cmdline.execute('scrapy crawl js'.split())

Next, change the default settings in the project's settings.py:

1. The robots.txt setting must be changed to False:

ROBOTSTXT_OBEY = False

2. Add prepared request headers so the crawler is less likely to be identified by the site:

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv 11.0) like Gecko',
}

At this point the new project's configuration is complete, and we can start writing the spider.

First decide which fields to collect and define them in items.py:

import scrapy


class JbookItem(scrapy.Item):
    title = scrapy.Field()          # title
    content = scrapy.Field()        # body content
    article_id = scrapy.Field()     # article ID
    source_url = scrapy.Field()     # source URL
    author = scrapy.Field()         # author
    avatar = scrapy.Field()         # author's avatar
    pub_time = scrapy.Field()       # publish date

The spider code:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jbook.items import JbookItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']
    rules = (
        # match every article detail URL on the current page
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath('//div[@class="article"]/h1/text()').get().strip()      # title
        # content = response.xpath('//div[@class="show-content-free"]//text()').getall()
        # cont = ''.join(content).strip()                                              # content as plain text
        content = response.xpath('//div[@class="show-content-free"]').get()            # content as HTML
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split('/')[-1]                                                # article id
        author = response.xpath('//div[@class="info"]/span[@class="name"]/a/text()').get().strip()   # author
        avatar = response.xpath('//div[@class="author"]/a[@class="avatar"]/img/@src').get()
        avat = "https:" + avatar.split('?')[0]                                          # avatar
        pub_time = response.xpath('//span[@class="publish-time"]/text()').get().strip()  # publish date
        item = JbookItem(
            title=title,
            content=content,
            article_id=article_id,
            source_url=url,
            author=author,
            avatar=avat,
            pub_time=pub_time
        )
        yield item
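The Rule above relies on Jianshu article detail URLs having the form /p/ followed by a 12-character id. A quick standalone check of the regex (the sample URL below is made up purely for illustration):

import re

sample = 'https://www.jianshu.com/p/0a1b2c3d4e5f'         # hypothetical article URL
print(bool(re.search(r'.*/p/[0-9a-z]{12}.*', sample)))    # prints True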

The data in the item object is then handed to the pipeline file, pipelines.py, for processing:

import pymysql


class JbookPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root1234',
            'database': 'jbook',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)   # create the database connection
        self.cursor = self.conn.cursor()          # create a cursor
        self._sql = None                          # the SQL statement, built lazily

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (
            item['title'],
            item['content'],
            item['author'],
            item['avatar'],
            item['pub_time'],
            item['article_id'],
            item['source_url'],
        ))
        self.conn.commit()   # commit the insert
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article (id, title, content, author, avatar, pub_time, article_id, source_url)
                values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
The above is the traditional, synchronous way of storing the data.
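This pipeline (and the asynchronous one below) assumes an article table already exists in the jbook database. A minimal sketch of a schema that matches the insert statement; the column names come from the SQL above, while the types and lengths are assumptions:

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root1234', database='jbook', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        create table if not exists article (
            id int primary key auto_increment,
            title varchar(255),
            content longtext,
            author varchar(255),
            avatar varchar(255),
            pub_time varchar(255),
            article_id varchar(255),
            source_url varchar(255)
        ) default charset=utf8
    """)
conn.commit()
conn.close()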
import pymysql
from pymysql import cursors
from twisted.enterprise import adbapi


class JbookTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root1234',
            'database': 'jbook',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)   # Twisted database connection pool
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article (id, title, content, author, avatar, pub_time, article_id, source_url)
                values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

    def process_item(self, item, spider):
        # hand the insert off to the connection pool
        defer = self.dbpool.runInteraction(self.insert_item, item)   # run insert_item to insert the data
        defer.addErrback(self.handle_error, item, spider)            # call handle_error if anything goes wrong
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (
            item['title'],
            item['content'],
            item['author'],
            item['avatar'],
            item['pub_time'],
            item['article_id'],
            item['source_url'],
        ))

    def handle_error(self, error, item, spider):
        print('=' * 20 + 'error' + '=' * 20)
        print(error)
        print('=' * 20 + 'error' + '=' * 20)
This stores the data asynchronously through a database connection pool.
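runInteraction runs the blocking pymysql call in one of the pool's worker threads and immediately returns a Deferred, so the Scrapy engine never blocks waiting on MySQL. A minimal standalone sketch of that pattern (outside Scrapy, reusing the connection parameters above):

from twisted.enterprise import adbapi
from twisted.internet import reactor

dbpool = adbapi.ConnectionPool('pymysql', host='127.0.0.1', port=3306,
                               user='root', password='root1234',
                               database='jbook', charset='utf8')


def count_articles(cursor):
    # runs in a worker thread with a cursor from the pool
    cursor.execute('select count(*) from article')
    return cursor.fetchone()


def on_result(row):
    print('rows in article:', row)
    reactor.stop()


def on_error(failure):
    print('query failed:', failure)
    reactor.stop()


d = dbpool.runInteraction(count_articles)   # returns a Deferred immediately
d.addCallback(on_result)
d.addErrback(on_error)
reactor.run()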

Finally, enable the pipeline in the settings file:

ITEM_PIPELINES = {
    # 'jbook.pipelines.JbookPipeline': 300,
    'jbook.pipelines.JbookTwistedPipeline': 300,
}
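Once the crawl has run for a while, a quick query confirms rows are landing in MySQL (a sketch using the same connection parameters as above):

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root1234', database='jbook', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('select title, author, pub_time from article limit 5')
    for row in cursor.fetchall():
        print(row)
conn.close()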