What is a framework?
A framework is a project template that already integrates a lot of ready-made functionality, so you only have to add the logic specific to your own project.
How to learn a framework:
Master the framework's features; being able to use each of them proficiently is enough.
Scrapy:
Environment setup:
a. pip3 install wheel
b. download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
c. change into the download directory and run: pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
d. pip3 install pywin32
e. pip3 install scrapy
Test: type scrapy in the terminal.
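A side note: with a recent pip, prebuilt Twisted wheels are usually pulled in automatically, so on many machines step e alone is enough. Either way, the installation can be double-checked with:

scrapy version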
Usage workflow:
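The usual sequence of commands, as a minimal sketch (the project and spider names match the qiubai example used below; the domain argument is just a placeholder):

scrapy startproject qiubaiPro          # create a new project
cd qiubaiPro
scrapy genspider qiubai www.xxx.com    # generate a spider file under spiders/
# edit the spider and settings.py, then run:
scrapy crawl qiubai

Before running, configure settings.py along these lines: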
# UA spoofing
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# do not obey the robots.txt protocol
ROBOTSTXT_OBEY = False
# only log error messages
LOG_LEVEL = 'ERROR'
Data parsing in Scrapy
Persistent storage in Scrapy. Two approaches are shown below: terminal-command-based storage (the -o switch) and pipeline-based storage.
# Terminal-command-based persistent storage; this goes straight into your spider file.
# Run it with: scrapy crawl qiubai -o filePath
# -*- coding: utf-8 -*-
import scrapy
# from qiubaiPro.items import QiubaiproItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    # every url placed in this list is requested automatically by scrapy
    start_urls = ['https://www.qiushibaike.com/text/']

    # terminal-command-based storage: the return value of parse() is written to a local file on disk
    def parse(self, response):
        all_data = []
        # parse out the author and the joke content
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # when xpath is used in scrapy to extract text from a tag, what comes back is a Selector object,
            # and the string data we need is wrapped inside that object
            # if the xpath is sure to return a single-element list, use extract_first(); otherwise use extract()
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a/div/span/text()').extract()
            dic = {"author": author, "content": content}
            all_data.append(dic)
        return all_data
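A usage note: the -o switch serializes the return value of parse(); the output format is taken from the file extension, and only Scrapy's feed-export formats (json, jsonlines, csv, xml, ...) are accepted, e.g.:

scrapy crawl qiubai -o qiubai.csv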
# Pipeline-based crawling and storage with scrapy
# -*- coding: utf-8 -*-
import scrapy
from qiubaiPro.items import QiubaiproItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    # every url placed in this list is requested automatically by scrapy
    start_urls = ['https://www.qiushibaike.com/text/']

    # (the terminal-command-based parse() shown above is omitted here)

    # pipeline-based persistent storage
    def parse(self, response):
        # parse out the author and the joke content
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # xpath returns Selector objects; use extract_first() for a single element, extract() otherwise
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            if not author:
                author = '匿名用戶'
            content = div.xpath('./a/div/span/text()').extract()
            content = ''.join(content)
            # create an item object (it holds one group of parsed data)
            item = QiubaiproItem()
            # store the parsed data in the item object
            item['author'] = author
            item['content'] = content
            # hand the item over to the pipeline classes
            yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql
from redis import Redis


# one pipeline class corresponds to one storage backend
class QiubaiproPipeline(object):
    fp = None

    # overridden parent method: executed exactly once, when the spider starts
    def open_spider(self, spider):
        print('spider started......')
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    # process an item object
    # what does "process" mean?
    #   extract the values wrapped in the item object and persist them
    # the parameter item is the item object submitted by the spider file
    # this method is called once for every item it receives
    def process_item(self, item, spider):
        print('this is process_item()')
        author = item['author']
        content = item['content']
        self.fp.write(author + ':' + content + "\n")
        # the returned item is passed on to the next pipeline class in line
        return item

    def close_spider(self, spider):
        print('spider finished!')
        self.fp.close()


# store the same data in mysql as well
class mysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, db='spider', user='root', password='', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        sql = 'insert into qiubai values ("%s","%s")' % (item['author'], item['content'])
        # create a cursor object
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


# store the same data in redis as well
class redisPileLine(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        # redis-py cannot lpush a python dict directly (newer versions raise an error),
        # so serialize it to a JSON string first
        self.conn.lpush('qiubaiData', json.dumps(dic))
        return item
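The mysqlPileLine class above assumes a MySQL database named spider with a table qiubai that has two string columns; a minimal sketch of a matching schema (column names and sizes are assumptions, not from the original project):

CREATE DATABASE IF NOT EXISTS spider;
USE spider;
CREATE TABLE qiubai (author VARCHAR(100), content TEXT);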
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class QiubaiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
    # 'qiubaiPro.pipelines.mysqlPileLine': 301,
    'qiubaiPro.pipelines.redisPileLine': 302,
    # 300 is the priority; the smaller the number, the higher the priority
}
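With the pipelines registered, the spider is started from inside the project directory:

scrapy crawl qiubai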
Crawling the whole site's data
- sending requests manually (GET requests)
- yield scrapy.Request(url,callback)
- callback: the function used for data parsing
To send POST requests instead, override start_requests and yield FormRequest objects:

def start_requests(self):
    for url in self.start_urls:
        data = {}
        yield scrapy.FormRequest(url, callback=self.parse, formdata=data)
# -*- coding: utf-8 -*-
import scrapy
# crawl multiple pages
from qiubaiByPages.items import QiubaibypagesItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']
    # a generic url template
    url = 'https://www.qiushibaike.com/text/page/%d/'
    pageNum = 1

    def parse(self, response):
        print('crawling page {}......'.format(self.pageNum))
        # parse out the author and the joke content
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # xpath returns Selector objects; use extract_first() for a single element, extract() otherwise
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            if not author:
                author = '匿名用戶'
            content = div.xpath('./a/div/span/text()').extract()
            content = ''.join(content)
            # create an item object (it holds one group of parsed data)
            item = QiubaibypagesItem()
            # store the parsed data in the item object
            item['author'] = author
            item['content'] = content
            # hand the item over to the pipeline classes
            yield item
        # recursive parsing + manual request sending ==> whole-site crawling
        if self.pageNum <= 13:
            self.pageNum += 1
            new_url = format(self.url % self.pageNum)
            yield scrapy.Request(url=new_url, callback=self.parse)
class QiubaibypagesPipeline(object):
    fp = None

    # overridden parent method: executed exactly once, when the spider starts
    def open_spider(self, spider):
        print('spider started......')
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    # process an item object: extract the values wrapped in the item and persist them
    # the parameter item is the item object submitted by the spider file
    # this method is called once for every item it receives
    def process_item(self, item, spider):
        # print('this is process_item()')
        author = item['author']
        content = item['content']
        self.fp.write(author + ':' + content + "\n")
        # the returned item is passed on to the next pipeline class in line
        return item

    def close_spider(self, spider):
        print('spider finished!')
        self.fp.close()
Finally, don't forget to configure the settings and items files (a sketch of the pipeline registration is shown below).
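A minimal sketch of the ITEM_PIPELINES entry for this project, assuming it is named qiubaiByPages as in the import above (the priority value is illustrative):

ITEM_PIPELINES = {
    'qiubaiByPages.pipelines.QiubaibypagesPipeline': 300,
}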
Crawling image data from http://www.521609.com/meinvxiaohua/
# the spider file
import scrapy
from xiaohuaPro.items import XiaohuaproItem


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.521609.com/meinvxiaohua/']
    url = 'http://www.521609.com/meinvxiaohua/list12%d.html'
    page_num = 1

    def parse(self, response):
        li_list = response.xpath('//*[@id="content"]/div[2]/div[2]/ul/li')
        for li in li_list:
            # create an item object
            item = XiaohuaproItem()
            img_src = 'http://www.521609.com' + li.xpath('./a[1]/img/@src').extract_first()
            title = li.xpath('./a[1]/img/@alt').extract_first()
            item['title'] = title
            item['img_src'] = img_src
            yield item
        if self.page_num < 21:
            self.page_num += 1
            new_url = format(self.url % self.page_num)
            # recursively call parse() via yield to fetch the next page
            yield scrapy.Request(new_url, self.parse)
Define the fields in items:
import scrapy


class XiaohuaproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    img_src = scrapy.Field()
Receive and store the items in pipelines:
from scrapy.pipelines.images import ImagesPipeline
import scrapy


class XiaohuaproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item


# use ImagesPipeline, a pipeline class scrapy ships specifically for downloading and persisting file data
class imgPileLine(ImagesPipeline):
    # issue the request for the file
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['img_src'])

    # specify the final file name used when the file is persisted
    def file_path(self, request, response=None, info=None):
        img_src = request.url
        img_name = img_src.split('/')[-1]
        return img_name

    def item_completed(self, results, item, info):
        print(results)
        return item  # pass the item on to the next pipeline class in line
在這些代碼操做以前不要忘了對你的工程項目進行settings的配置
# UA spoofing
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# do not obey the robots.txt protocol
ROBOTSTXT_OBEY = False
# only log error messages
LOG_LEVEL = 'ERROR'
# pipeline priorities
ITEM_PIPELINES = {
    'xiaohuaPro.pipelines.XiaohuaproPipeline': 301,
    'xiaohuaPro.pipelines.imgPileLine': 300,
}
# the directory the downloaded images are stored in
IMAGES_STORE = './xiaohuas'
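One extra dependency worth noting: ImagesPipeline stores images through the Pillow imaging library, so it normally has to be installed as well:

pip3 install Pillow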
The five core components of Scrapy
Engine (Scrapy): controls the data flow among all the other components and triggers events as actions occur
Scheduler: accepts requests from the engine, queues and de-duplicates them, and hands them back when the engine asks for more
Downloader: downloads the pages for those requests and feeds the responses back to the engine
Spiders: parse the responses, extract the data (items), and produce follow-up requests
Item Pipeline: processes the items handed over by the spiders (cleaning, validation, persistent storage)
Passing data between requests (request meta): when the data for one item is spread across a list page and a detail page, the partially filled item is passed to the detail-page callback through the request's meta dictionary.
Example: crawling a movie list and the detail page of each movie with Scrapy
import scrapy
from moviePro.items import MovieproItem


class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/5.html']
    # a generic url template
    url = 'https://www.4567tv.tv/index.php/vod/show/id/5/page/%d.html'
    page_num = 1

    def parse(self, response):
        print('crawling page {}......'.format(self.page_num))
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            item = MovieproItem()
            item['title'] = title
            # issue a GET request for the detail page
            # meta is a dict; it is handed to the callback function along with the response
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        if self.page_num <= 33:
            self.page_num += 1
            new_url = format(self.url % self.page_num)
            yield scrapy.Request(new_url, callback=self.parse)

    # parses the movie synopsis on the detail page
    def parse_detail(self, response):
        # pull the item back out of meta
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
import scrapy


class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    # movie title
    title = scrapy.Field()
    # movie synopsis
    desc = scrapy.Field()
class MovieproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item
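As in the earlier projects, the pipeline still needs to be registered in settings.py; a minimal sketch, assuming the project is named moviePro as in the import above (the priority value is illustrative):

ITEM_PIPELINES = {
    'moviePro.pipelines.MovieproPipeline': 300,
}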