1. Framework overview: high-performance asynchronous downloading, parsing, and persistent storage
2. Install Scrapy and create a project -----------
pip install wheel
Twisted is why Windows needs a 5-step install!
Installation
Linux:
    pip3 install scrapy
Windows:
    a. pip3 install wheel
    b. Download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    c. cd into the download directory and run: pip3 install Twisted‑17.1.0‑cp35‑cp35m‑win_amd64.whl
    d. pip3 install pywin32
    e. pip3 install scrapy
scrapy startproject <project name>
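For orientation, the layout generated by startproject looks roughly like the sketch below; the exact file list can vary a little between Scrapy versions, and firstProject simply stands in for whatever project name is used:

firstProject/
    scrapy.cfg                # project/deploy configuration
    firstProject/
        __init__.py
        items.py              # Item field declarations
        middlewares.py
        pipelines.py          # item pipelines for persistence
        settings.py           # UA, ROBOTSTXT_OBEY, ITEM_PIPELINES, ...
        spiders/
            __init__.py       # spider files created by genspider live here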
3. Using a project -- 5 steps, summarized from the video:
1. Create a project: scrapy startproject fristBlood
2. cd fristBlood, then create a spider file: scrapy genspider chouti www.chouti.com (a new chouti.py appears under spiders; check the name and start_urls, and comment out #allowed_domains)
3. Write the parse method in chouti.py
4. Configure settings.py: set a disguised User-Agent (UA) and ROBOTSTXT_OBEY = False
5. After configuring, run from the command line: scrapy crawl <spider name>
1. Crawling chouti (project: fristBlood)
# -*- coding: utf-8 -*-
import scrapy


class ChoutiSpider(scrapy.Spider):
    # Spider name: identifies this specific spider file
    name = 'chouti'
    # Allowed domains:
    #allowed_domains = ['www.chouti.com']
    # Start URL list: when the project runs, the pages for these URLs are fetched
    start_urls = ['https://dig.chouti.com/']

    # Parses the page data of each URL in start_urls.
    # The response parameter is the response object for the request sent to that URL.
    def parse(self, response):
        print(response)
2. Crawling Qiushibaike --- note the parse method (project: qiubaiPro)
# extract() pulls the text content stored in a Selector object
Wrap the parsed data in an iterable type (parse must return it for terminal-based export)
Terminal-command persistence: scrapy crawl -o data.csv qiubai --nolog --- not commonly used
# -*- coding: utf-8 -*-
import scrapy


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    #allowed_domains = ['www.fdsfds.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath() returns a list of Selector objects
        div_list = response.xpath('//div[@id="content-left"]/div')
        # List used to collect the parsed data
        all_data = []
        for div in div_list:
            # extract() pulls the text content stored in a Selector object
            #author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()  # same as the line above, without the [0]
            content = div.xpath('.//div[@class="content"]/span//text()').extract()  # //text() matches several nodes, so extract() returns a list
            content = "".join(content)  # join the list into a single string
            dict = {
                'author': author,
                'content': content
            }
            all_data.append(dict)
        return all_data
        # Persistence options:
        # 1. Terminal command: parse must return an iterable object
        # 2. Item pipeline
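To make the extract()/extract_first() note above concrete, here is a rough scrapy shell sketch (scrapy shell https://www.qiushibaike.com/text/) using the same XPath as the spider; it assumes the page structure still matches:

sel = response.xpath('//div[@class="content"]/span//text()')   # response is predefined in the shell
sel.extract()            # list of strings, one per matched text node
sel.extract_first()      # only the first string, or None if nothing matched
"".join(sel.extract())   # collapse the pieces into one string, as parse() does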
3. Crawling Qiushibaike -- pipeline-based persistence -- note the item (project: pipeLinePro)
Write pipelines.py
Enable ITEM_PIPELINES in settings (around lines 67-69 of the generated file)
In ITEM_PIPELINES, the lower the number, the higher the pipeline's priority
One pipeline writes to disk, the other writes to the database
Suppress log output: scrapy crawl chouti --nolog
cls clears the screen (Windows cmd)
import scrapy


class PipelineproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
# -*- coding: utf-8 -*-

# Scrapy settings for the pipeLinePro project
# https://doc.scrapy.org/en/latest/topics/settings.html
# (only the settings that were changed are shown here; the remaining
#  auto-generated, commented-out options are left at their defaults)

BOT_NAME = 'pipeLinePro'

SPIDER_MODULES = ['pipeLinePro.spiders']
NEWSPIDER_MODULE = 'pipeLinePro.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'pipeLinePro (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'pipeLinePro.pipelines.PipelineproPipeline': 300,
   'pipeLinePro.pipelines.MyPipeline': 301,
}
# -*- coding: utf-8 -*-
import scrapy
from pipeLinePro.items import PipelineproItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    #allowed_domains = ['www.ds.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath() returns a list of Selector objects
        div_list = response.xpath('//div[@id="content-left"]/div')
        # List used to collect the parsed data
        all_data = []
        for div in div_list:
            # extract() pulls the text content stored in a Selector object
            #author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('.//div[@class="content"]/span//text()').extract()
            content = "".join(content)
            # Instantiate the item object
            item = PipelineproItem()
            # Store the parsed values in the item object
            item['author'] = author
            item['content'] = content
            # Submit the item object to the pipeline
            yield item

        # Persistence options:
        # 1. Terminal command: parse must return an iterable object
        # 2. Item pipeline:
        #    1. items.py: instantiate the class defined there (the item object stores the parsed values)
        #    2. pipelines.py: the pipeline receives the item objects submitted by the spider and persists their data
# -*- coding: utf-8 -*-
import pymysql


class PipelineproPipeline(object):
    fp = None

    # Called only once, when the crawl starts (base-class hook)
    def open_spider(self, spider):
        print('crawl started')
        self.fp = open('./qiubai_data.txt', 'w', encoding='utf-8')

    # Called once every time the spider submits an item to the pipeline;
    # the item parameter is the item object submitted by the spider file
    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        self.fp.write(author + ":" + content)
        return item

    # Called only once, after the crawl ends
    def close_spider(self, spider):
        print('crawl finished')
        self.fp.close()


class MyPipeline(object):
    conn = None
    cursor = None

    # Called once when the crawl starts: open the MySQL connection
    def open_spider(self, spider):
        self.conn = pymysql.Connect(host="192.168.12.65", port=3306, db="scrapyDB", charset="utf8", user="root")
        self.cursor = self.conn.cursor()
        print('mysql')

    # Called once for every item submitted by the spider
    def process_item(self, item, spider):
        author = item['author']
        content = item['content']
        sql = "insert into qiubai values('%s','%s')" % (author, content)  # qiubai is the table name
        try:
            self.cursor.execute(sql)  # run the sql
            self.conn.commit()        # commit on success, roll back on error
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item
Pipeline workflow in 4 steps --- my own summary from the video:
Prerequisite: the parsed data is already available in the parse method.
1. Store the parsed values in an item object (the fields must first be declared in items.py)
2. Use the yield keyword to submit the item object to the pipeline
3. In pipelines.py, write the PipelineproPipeline class and implement process_item (a minimal skeleton follows the recap below)
4. Enable the pipeline in the settings file
1. # instantiate the item object
item = PipelineproItem()
2. Declare the fields in items.py
3. # store the parsed values in the item object
item['author'] = author
item['content'] = content
4. # submit the item object to the pipeline
yield item
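As referenced in step 3 above, a stripped-down sketch of the pipeline side (field names follow the items.py shown earlier; the full file-plus-MySQL version is the pipelines.py above):

class PipelineproPipeline(object):
    # Called once for every item the spider yields
    def process_item(self, item, spider):
        print(item['author'], item['content'])  # persist here instead of printing
        return item  # hand the item on to the next (lower-priority) pipeline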
Writing the data to the database: create the database and table first
select * from qiubai to check what was written
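A minimal sketch of that database setup with pymysql, reusing the host and user from MyPipeline; the column types and sizes are assumptions, and the connection is assumed to need no password:

import pymysql

# Connect without selecting a database so scrapyDB can be created first
conn = pymysql.Connect(host="192.168.12.65", port=3306, user="root", charset="utf8")
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS scrapyDB CHARACTER SET utf8")
cursor.execute("USE scrapyDB")
# Two text columns matching what MyPipeline inserts (author, content)
cursor.execute("CREATE TABLE IF NOT EXISTS qiubai (author VARCHAR(100), content VARCHAR(5000))")
conn.commit()

# After a crawl, check what was written
cursor.execute("SELECT * FROM qiubai")
for row in cursor.fetchall():
    print(row)

cursor.close()
conn.close()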