scrapy crawl 爬蟲名稱 -o xxx.json
scrapy crawl 爬蟲名稱 -o xxx.xml
scrapy crawl 爬蟲名稱 -o xxx.csv
- 爬蟲文件:qiubaiDemo.py
# -*- coding: utf-8 -*-
import scrapy

from secondblood.items import SecondbloodItem


class QiubaidemoSpider(scrapy.Spider):
    """Spider that extracts author/content pairs from the configured start URL."""

    name = 'qiubaiDemo'
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['http://www.qiushibaike.com/']

    def parse(self, response):
        """Yield one SecondbloodItem per post node found in the response.

        Args:
            response: the downloaded page handed over by Scrapy.

        Yields:
            SecondbloodItem with 'author' and 'content' set; each yielded
            item is passed on to the item pipelines (pipelines.py).
        """
        # xpath() returns a list of Selector objects; the parsed text is
        # wrapped inside each Selector and has to be pulled out with
        # extract_first() / extract().
        for post in response.xpath('//div[@id="content-left"]/div'):
            author = post.xpath(
                './/div[@class="author clearfix"]//h2/text()'
            ).extract_first(default='')
            content = post.xpath(
                './/div[@class="content"]/span/text()'
            ).extract_first()

            # extract_first() returns None when nothing matches; a node
            # without any content text has nothing worth storing, and
            # calling .strip() on None would crash the whole crawl.
            if content is None:
                continue

            # Wrap the parsed data in an item object; surrounding newlines
            # are stripped from both fields.
            item = SecondbloodItem()
            item['author'] = author.strip('\n')
            item['content'] = content.strip('\n')
            yield item
- items文件:items.py
import scrapy


class SecondbloodItem(scrapy.Item):
    """Container for one scraped post: who wrote it and what it says."""

    # Define the fields for your item here like:
    #     name = scrapy.Field()

    # Author of the post.
    author = scrapy.Field()
    # Text content of the post.
    content = scrapy.Field()
- 管道文件:pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class SecondbloodPipeline(object):
    """Item pipeline that writes every item to ./data.txt as 'author:content'.

    The file is opened once in open_spider and closed once in close_spider,
    because process_item runs once per item and must not reopen the file
    each time.
    """

    def __init__(self):
        # File handle for the output file; set in open_spider.
        self.fp = None

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        print('爬蟲開始')
        # Explicit UTF-8 so non-ASCII text is written identically on every
        # platform instead of depending on the locale's default encoding.
        self.fp = open('./data.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Persist one item as a single 'author:content' line.

        Returns:
            The same item, so that later pipelines can keep processing it.
        """
        self.fp.write(item['author'] + ':' + item['content'] + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        # The handle is still None if open_spider never ran or failed.
        if self.fp is not None:
            self.fp.close()
            self.fp = None
        print('爬蟲結束')
- 配置文件:settings.py
# Enable the item pipeline.
ITEM_PIPELINES = {
    # 300 is the priority; the smaller the value, the higher the priority.
    'secondblood.pipelines.SecondbloodPipeline': 300,
}
- pipelines.py文件
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Database driver
import pymysql


class QiubaiproPipelineByMysql(object):
    """Item pipeline that inserts every item into the MySQL table `qiubai`."""

    conn = None    # MySQL connection, created in open_spider
    cursor = None  # cursor on that connection, created in open_spider

    def open_spider(self, spider):
        """Connect to the database once when the spider starts."""
        print('開始爬蟲')
        self.conn = pymysql.Connect(host='', port=3306, user='root',
                                    password='123456', db='qiubai')
        # One cursor is reused for all items instead of creating a new,
        # never-closed cursor for every item.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item (author, content) as a row and commit.

        On any failure the error is printed and the transaction is rolled
        back; the item is returned either way so later pipelines still
        receive it.
        """
        # Values are passed as query parameters so the driver escapes them:
        # scraped text may contain quotes, which would break (or inject
        # into) a statement assembled with string formatting.
        sql = 'insert into qiubai values(%s, %s)'
        try:
            self.cursor.execute(sql, (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        print('爬蟲結束')
        # Either attribute is still None if open_spider failed part-way.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()
- settings.py
# Enable the MySQL-backed pipeline.
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipelineByMysql': 300,
}
- pipelines.py文件
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json

import redis


class QiubaiproPipelineByRedis(object):
    """Item pipeline that pushes every item onto the Redis list 'data'."""

    conn = None  # Redis client, created in open_spider

    def open_spider(self, spider):
        """Create the Redis client once when the spider starts."""
        print('開始爬蟲')
        self.conn = redis.Redis(host='', port=6379)

    def process_item(self, item, spider):
        """Push one item onto the 'data' list as a JSON string.

        Returns:
            The same item, so that later pipelines can keep processing it.
        """
        record = {
            'author': item['author'],
            'content': item['content'],
        }
        # redis-py 3.0+ only accepts bytes, str, int or float as values and
        # raises DataError for a dict, so the record is serialised first.
        # ensure_ascii=False keeps non-ASCII text readable in the store.
        self.conn.lpush('data', json.dumps(record, ensure_ascii=False))
        return item
- settings.py文件
# Enable the Redis-backed pipeline.
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipelineByRedis': 300,
}
# A pipeline class: its process_item method is where the persistence
# logic for scraped items goes.
class DoublekillPipeline(object):
    """Pipeline skeleton for the first persistence approach (disk file)."""

    def process_item(self, item, spider):
        # Persistence code goes here (approach 1: write to a file on disk).
        return item


# To persist the same items in a second way, define one more pipeline class.
class DoublekillPipeline_db(object):
    """Pipeline skeleton for the second persistence approach (database)."""

    def process_item(self, item, spider):
        # Persistence code goes here (approach 2: write to a database).
        return item
# The dictionary below names the pipeline classes to enable; each value is
# the priority with which that pipeline is executed.
ITEM_PIPELINES = {
    'doublekill.pipelines.DoublekillPipeline': 300,
    'doublekill.pipelines.DoublekillPipeline_db': 200,
}
# With both entries present, the process_item method of each of the two
# pipeline classes is executed, giving two different forms of persistence.