scrapy
Install scrapy (on Windows, install the Twisted wheel first, then pip install scrapy)
pip install Twisted-17.1.0-cp36-cp36m-win_amd64.whl
Create a project: scrapy startproject PRONAME
cd PRONAME
Create a spider: scrapy genspider spiderName www.xxx.com
Example User-Agent for UA spoofing in settings.py: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
Run the project: scrapy crawl spiderName
# 1. After the configuration above, work inside the spider class created in spiderName.py
class FirstSpider(scrapy.Spider):
    name = 'first'
    # Allowed domains; usually commented out
    # allowed_domains = ['www.baidu.com']
    # Start url list: every url in it is requested automatically by scrapy
    start_urls = ['https://www.qiushibaike.com/text/']

    # Parses the data returned for the urls in start_urls
    def parse(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # The extracted strings are stored inside Selector objects
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0]
            # content = div.xpath('./a/div/span//text()')
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()  # pull the string out of a single Selector
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()  # equivalent to [0].extract()
            # called on a list, extract() is applied to every list element
            content = div.xpath('./a/div/span//text()').extract()
            content = ''.join(content)  # join the list elements into one string
            print(author, content)
# Terminal-command-based persistent storage
def parse(self, response):
    div_list = response.xpath('//div[@id="content-left"]/div')
    all_data = []
    for div in div_list:
        # The extracted strings are stored inside Selector objects
        author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
        # called on a list, extract() is applied to every list element
        content = div.xpath('./a/div/span//text()').extract()
        content = ''.join(content)
        dic = {
            'author': author,
            'content': content,
        }
        all_data.append(dic)
    return all_data

Then run the command in the terminal:
scrapy crawl first -o qiushibaike.csv
# Why CSV instead of the more common TXT? Let's try TXT:
scrapy crawl first -o qiushibaike.txt
# You will get an error:
"""
crawl: error: Unrecognized output format 'txt', set one using the '-t' switch or as a file extension from the supported list ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
"""
# It tells us the file extension must be one of the formats in that list. That is the limitation of terminal-command-based persistence, so it is rarely used in practice: it is very convenient, but heavily restricted, and it can only save data to local files.
Method 2: pipeline-based persistence
The pipeline receives the item and calls the pipeline class's process_item method to persist the data.
from firstblood.items import FirstbloodItem

# Pipeline-based persistence
def parse(self, response):
    div_list = response.xpath('//div[@id="content-left"]/div')
    for div in div_list:
        author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
        # called on a list, extract() is applied to every list element
        content = div.xpath('./a/div/span//text()').extract()
        content = ''.join(content)
        # Instantiate the item object inside the loop
        item = FirstbloodItem()
        # Pack the parsed data into the item
        item['author'] = author
        item['content'] = content  # store the parsed content in the item's content field
        # Submit the item to the pipeline -- one submission per loop iteration
        yield item
import scrapy

class FirstbloodItem(scrapy.Item):
    # define the fields for your item here like:
    author = scrapy.Field()   # a string -- what about binary or stream data?
    content = scrapy.Field()  # Field() is a universal type; it can hold data of any type
class FirstbloodPipeline(object):
    f = None

    # One of the two parent-class methods to override; called only once, when the spider opens
    def open_spider(self, spider):
        print('start crawling')
        self.f = open('qiushi.txt', 'w', encoding='utf8')

    def process_item(self, item, spider):
        """
        process_item is called once for every item it receives
        :param item: the item object submitted by the spider file
        :param spider:
        :return:
        """
        print(item)
        self.f.write(item['author'] + ':' + item['content'] + '\n')
        return item

    # The other parent-class method to override; called once, when the spider closes
    def close_spider(self, spider):
        print('finished crawling')
        self.f.close()
ITEM_PIPELINES = {
    'firstblood.pipelines.FirstbloodPipeline': 300,  # 300 is the priority; multiple pipeline classes can be registered, and a smaller number means a higher priority
}
Create the database
show databases;
create database spider;
use spider;
create table qiushi (author varchar(50), content varchar(8000));
Create the pipeline class
import pymysql

class mysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        # Connect to the database
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root',
                                    db='spider', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        # Insert into the table created above
        sql = 'insert into qiushi values ("{}","{}")'.format(item['author'], item['content'])
        # Create a cursor to execute the SQL statement; cursors can be created repeatedly
        self.cursor = self.conn.cursor()
        # Transaction handling
        try:
            self.cursor.execute(sql)
            self.conn.commit()  # commit the data
        except Exception as e:
            print(e)
            self.conn.rollback()  # roll back the transaction
        return item  # pass the item on to the next pipeline class to be executed

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.conn.close()
Register the pipeline class
ITEM_PIPELINES = {
    'firstblood.pipelines.FirstbloodPipeline': 300,  # 300 is the priority; a smaller number means a higher priority
    'firstblood.pipelines.mysqlPileLine': 301,
}
# Redis pipeline class (register it in ITEM_PIPELINES as well)
from redis import Redis

class redisPileLine(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        # Switch the redis package to version 2.10.6 first: pip install -U redis==2.10.6
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        self.conn.lpush('qiubai', dic)
        return item
import scrapy
from xiaohuaPro.items import XiaohuaproItem

# http://www.521609.com/meinvxiaohua/list12%d.html
class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.521609.com/meinvxiaohua/']
    # Generic url template (do not modify)
    url = 'http://www.521609.com/meinvxiaohua/list12%d.html'
    page_num = 2

    # When is this method called: for POST requests
    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield scrapy.FormRequest(url, formdata={}, callback=self.parse)

    def parse(self, response):
        print('crawling page {}......'.format(self.page_num))
        li_list = response.xpath('//*[@id="content"]/div[2]/div[2]/ul/li')
        for li in li_list:
            img_name = li.xpath('./a[2]/text() | ./a[2]/b/text()').extract_first()
            item = XiaohuaproItem()
            # Pack the data into the item
            item['img_name'] = img_name
            yield item  # submit the item
        # Termination condition of the recursion
        if self.page_num <= 11:
            new_url = format(self.url % self.page_num)
            self.page_num += 1
            # Manual request sending
            yield scrapy.Request(url=new_url, callback=self.parse)  # callback function
items file
import scrapy

class XiaohuaproItem(scrapy.Item):
    # define the fields for your item here like:
    img_name = scrapy.Field()
scrapy startproject proname
scrapy crawl spiderName -o filePath
settings.py
UA spoofing
template url (a reusable url pattern for pagination)
Manual request sending (see the sketch after this list):
yield scrapy.Request(url,callback)
yield scrapy.FormRequest(url,formdata,callback)
scrapy enables cookie handling automatically
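A minimal sketch of the two manual-request patterns listed above, assuming a hypothetical DemoSpider; the example url, the formdata keys and the callback are placeholders:

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://www.example.com/']

    # Overriding start_requests lets you choose GET or POST for the start urls
    def start_requests(self):
        for url in self.start_urls:
            # GET request with a manual callback
            yield scrapy.Request(url=url, callback=self.parse)
            # POST request: the formdata dict is sent as the request body
            yield scrapy.FormRequest(url=url, formdata={'key': 'value'}, callback=self.parse)

    def parse(self, response):
        print(response.url)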
Crawling the Qiushibaike picture section with persistent storage — spider file
import scrapy
from qiushiPic.items import QiushipicItem

class PictureSpider(scrapy.Spider):
    name = 'picture'
    # allowed_domains = ['www.baidu.com']  # allowed domains, usually commented out
    start_urls = ['https://www.qiushibaike.com/pic/']
    # Generic url template
    url = 'https://www.qiushibaike.com/pic/page/%d/'
    page_num = 2

    def parse(self, response):
        print('crawling page {}......'.format(self.page_num - 1))
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # Get the image url
            img_url = 'https:' + div.xpath('./div[2]/a/img/@src').extract_first()
            # print(img_url)
            item = QiushipicItem()
            item['img_url'] = img_url
            yield item  # submit the item
        # Termination condition of the recursion
        if self.page_num <= 12:
            new_url = format(self.url % self.page_num)
            self.page_num += 1
            # Manual request sending
            yield scrapy.Request(url=new_url, callback=self.parse)  # callback function
items file
class QiushipicItem(scrapy.Item):
    img_url = scrapy.Field()  # stores the image url
pipelines file
For image crawling, scrapy already ships a ready-made pipeline class, ImagesPipeline: just inherit from it and override its methods.
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class ImgproPipeline(object):
    def process_item(self, item, spider):
        return item

# Customized pipeline class derived from the built-in parent
class ImgPileline(ImagesPipeline):
    # Issue the request for the image data based on the image url
    def get_media_requests(self, item, info):
        # No callback needs to be specified
        yield scrapy.Request(url=item['img_url'])

    # Specify the file name used to store the image
    def file_path(self, request, response=None, info=None):
        url = request.url  # image url
        name = url.split('/')[-1]
        return name

    # Pass the item on to the next pipeline class to be executed
    def item_completed(self, results, item, info):
        return item
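ImagesPipeline also needs the pipeline registration and a storage directory in settings.py. A minimal sketch, assuming the project module is qiushiPic and the directory ./imgs (both are example values):

# settings.py (sketch) -- register the image pipeline and set the save directory
ITEM_PIPELINES = {
    'qiushiPic.pipelines.ImgPileline': 300,  # module path assumed for this example
}
IMAGES_STORE = './imgs'  # ImagesPipeline writes the downloaded images under this directory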
How the components work together
Engine (Scrapy)
Handles the data flow between all components of the system and triggers events (the core of the framework).
Scheduler
Accepts requests from the engine, pushes them into a queue, and returns them when the engine asks again. Think of it as a priority queue of URLs (the addresses of the pages to crawl); it decides which URL to crawl next and removes duplicate URLs.
Downloader
Downloads page content and hands it back to the spiders (the downloader is built on Twisted, an efficient asynchronous model).
Spiders
The spiders do the main work: they extract the needed information (the items) from specific pages. They can also extract links from the pages so that Scrapy keeps crawling the next page.
Item Pipeline
Processes the items extracted by the spiders; its main jobs are persisting items, validating them, and cleaning out unneeded data. After a page is parsed by a spider, the items are sent to the pipeline and pass through several processing steps in a fixed order.
Improving scrapy's crawling efficiency (all options are collected in the snippet below):
Increase concurrency:
By default scrapy opens 32 concurrent requests, which can be raised. In the settings file set CONCURRENT_REQUESTS = 100 to allow 100 concurrent requests.
Lower the log level:
Running scrapy produces a large amount of log output. To reduce CPU usage, restrict the log output to INFO or ERROR: LOG_LEVEL = 'INFO'
Disable cookies:
If you do not really need cookies, disable them to reduce CPU usage and speed up crawling: COOKIES_ENABLED = False
Disable retries:
Re-requesting (retrying) failed HTTP requests slows crawling down, so retries can be disabled: RETRY_ENABLED = False
Reduce the download timeout:
When crawling a very slow link, a smaller download timeout lets stuck links be abandoned quickly, improving efficiency: DOWNLOAD_TIMEOUT = 10 (a 10-second timeout)
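A sketch of a settings.py fragment applying all of the tuning options above (the values are the examples from this section):

# settings.py (sketch) -- efficiency-related options discussed above
CONCURRENT_REQUESTS = 100   # raise the number of concurrent requests
LOG_LEVEL = 'ERROR'         # or 'INFO'; less logging means less CPU spent on output
COOKIES_ENABLED = False     # skip cookie handling if it is not needed
RETRY_ENABLED = False       # do not retry failed requests
DOWNLOAD_TIMEOUT = 10       # give up on slow links after 10 seconds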
Downloader middleware
Handling dynamically loaded data with scrapy
settings file
BOT_NAME = 'midllePro'

LOG_LEVEL = 'ERROR'
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    'midllePro.middlewares.MidlleproDownloaderMiddleware': 543,
}
middlewares file
import random

class MidlleproDownloaderMiddleware(object):
    # UA pool
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # proxy ip pools
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    # Intercepts normal requests
    def process_request(self, request, spider):
        print('this is process_request!!!')
        # UA spoofing
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # For testing: force a proxy
        request.meta['proxy'] = 'http://27.208.92.247:8060'
        return None

    # Intercepts responses
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercepts failed requests
    def process_exception(self, request, exception, spider):
        # Set a proxy ip
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = random.choice(self.PROXY_https)
        return request  # re-send the corrected request
spider file
import scrapy

class MiddleSpider(scrapy.Spider):
    name = 'middle'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.baidu.com/s?ie=UTF-8&wd=ip']

    def parse(self, response):
        page_text = response.text
        with open('./ip.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)
settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

ITEM_PIPELINES = {
    'MoviePro.pipelines.MovieproPipeline': 300,
}
spider file
import scrapy
from ..items import MovieproItem

class MovieSpider(scrapy.Spider):
    name = 'movie'
    # allowed_domains = ['www.baidu.com']
    msg = input('movie genre: ')
    start_urls = [f'https://www.4567tv.tv/index.php/vod/show/class/{msg}/id/1.html']
    url = f'https://www.4567tv.tv/index.php/vod/show/class/{msg}/id/%d.html'
    page_num = 2

    def parse(self, response):
        # Parse the li elements of the list page
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            # Parse the movie name and the detail-page url
            name = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            # print(name, detail_url)
            item = MovieproItem()
            item['name'] = name
            # meta is a dict that is passed on to the callback
            yield scrapy.Request(detail_url, callback=self.desc_prase, meta={'item': item})
        if self.page_num <= 5:
            new_url = format(self.url % self.page_num)
            self.page_num += 1
            yield scrapy.Request(new_url, callback=self.parse)

    # Parse the description on the detail page
    def desc_prase(self, response):
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']  # the item passed in via meta
        item['desc'] = desc
        yield item
items.py
class MovieproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
pipelines.py
class MovieproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item
settings file
BOT_NAME = 'wangyiPro'

SPIDER_MODULES = ['wangyiPro.spiders']
NEWSPIDER_MODULE = 'wangyiPro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

# Enable the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}
spider file
import scrapy
from selenium import webdriver
from ..items import WangyiproItem

class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    start_urls = ['https://news.163.com/']
    # Stores the urls of the 5 news sections
    five_model_urls = []

    # Create the selenium browser object
    def __init__(self):
        self.bro = webdriver.Chrome(executable_path=r'F:\spiderlearn\chromedriver.exe')

    # Parse the urls of the 5 sections from the NetEase news home page
    def parse(self, response):
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        alist = [3, 4, 6, 7, 8]  # indices of the li tags of the 5 sections
        for a in alist:
            li = li_list[a]  # li of one of the 5 sections
            # url of the section page
            news_url = li.xpath('./a/@href').extract_first()
            self.five_model_urls.append(news_url)
            # Request each of the five section pages
            yield scrapy.Request(news_url, callback=self.new_parse)

    # response is the response object of one of the five sections;
    # its body does NOT contain the dynamically loaded news data
    def new_parse(self, response):
        # Parse the news items in each section
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        # print(div_list)
        for div in div_list:
            # Parse the news title and the detail-page url
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()  # news title
            detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()  # detail-page url
            key_words = div.xpath('./div/div[2]/div//text()').extract()  # tags / categories
            key_words = ''.join(key_words)
            # print(title, key_words, detail_url)
            if detail_url is not None:
                item = WangyiproItem()
                item['title'] = title
                item['key_words'] = key_words
                # Request the detail page for the news body; meta passes the item to the callback
                yield scrapy.Request(detail_url, callback=self.detail_parse, meta={'item': item})

    # Parse the detail page
    def detail_parse(self, response):
        item = response.meta['item']
        content = response.xpath('//*[@id="endText"]//text()').extract()  # list of text nodes of the article
        content = ''.join(content)
        item['content'] = content
        yield item  # submit to the pipeline

    # Close the selenium browser when the spider closes
    def closed(self, spider):
        self.bro.quit()
items file
import scrapy

class WangyiproItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    key_words = scrapy.Field()
pipelines file
class WangyiproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item


import pymysql

class mysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        # Connect to the database
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root',
                                    db='wangyi', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        sql = 'insert into qiubai values ("{}","{}","{}")'.format(item['title'], item['content'], item['key_words'])
        # Create a cursor to execute the SQL statement; cursors can be created repeatedly
        self.cursor = self.conn.cursor()
        # Transaction handling
        try:
            self.cursor.execute(sql)
            self.conn.commit()  # commit the data
        except Exception as e:
            print(e)
            self.conn.rollback()  # roll back the transaction
        return item  # pass the item on to the next pipeline class to be executed

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.conn.close()
middlewares file
from scrapy import signals
from time import sleep
from scrapy.http import HtmlResponse

class WangyiproDownloaderMiddleware(object):
    def process_request(self, request, spider):
        return None

    # Intercepts responses (1 + 5 + n responses)
    def process_response(self, request, response, spider):
        # This method intercepts every response (1 + 5 + n).
        # We only need to tamper with the responses of the five sections.
        # How to pick out exactly those five responses:
        #   locate the request by its url, then locate the response via that request.
        urls = spider.five_model_urls
        if request.url in urls:
            # Replace the original, insufficient response with a new one that meets our needs:
            # fetch the required page data first, wrap it in a new response object, and return that object
            bro = spider.bro
            bro.get(request.url)
            sleep(2)
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            bro.execute_script(js)
            sleep(1)
            bro.execute_script(js)
            sleep(1)
            bro.execute_script(js)
            sleep(1)
            # The page source now contains the dynamically loaded news data;
            # page_text becomes the body of the new response object
            page_text = bro.page_source
            new_response = HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        pass
CrawlSpider
A subclass of Spider
Purpose: designed for full-site data crawling
Usage:
scrapy startproject proname
scrapy genspider -t crawl spiderName www.xxx.com
Key features:
Link extractor:
LinkExtractor: extracts the links that match the given rule (allow='regex')
Rule (rule parser):
The link extractor is used inside a Rule. The links it extracts are requested, and the response data is then parsed with the specified rule (callback).
Note: one link extractor always corresponds to exactly one Rule (see the sketch below).
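A minimal sketch of that pairing; the spider name, start url, regex and callback are placeholders:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class SketchSpider(CrawlSpider):
    name = 'sketch'
    start_urls = ['https://www.example.com/list/1.html']

    rules = (
        # one LinkExtractor paired with exactly one Rule:
        # follow the matched links and parse each response with parse_item
        Rule(LinkExtractor(allow=r'/list/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response.url)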
Deep (detail-page) data crawling based on CrawlSpider
settings file
BOT_NAME = 'SunlinePro'

SPIDER_MODULES = ['SunlinePro.spiders']
NEWSPIDER_MODULE = 'SunlinePro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

ITEM_PIPELINES = {
    'SunlinePro.pipelines.SunlineproPipeline': 300,
}
items file
import scrapy

class SunlineproItem(scrapy.Item):
    number = scrapy.Field()
    title = scrapy.Field()
    status = scrapy.Field()

class ContentItem(scrapy.Item):
    number = scrapy.Field()
    content = scrapy.Field()
CrawlSpider file
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import SunlineproItem, ContentItem

class SunlineSpider(CrawlSpider):
    name = 'sunline'
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    # A link extractor: extracts the links that match the given rule
    link = LinkExtractor(allow=r'type=4&page=\d+')  # regex
    # link_all = LinkExtractor(allow=r'')  # would extract every link
    # Extract the detail-page urls
    link_detail = LinkExtractor(allow=r'question/\d+/\d+\.shtml')

    rules = (
        # Rule objects: follow=True parses every page, follow=False only the current page
        Rule(link, callback='parse_item', follow=True),
        Rule(link_detail, callback='parse_detail', follow=True),
    )

    # Parse the number, title and status
    def parse_item(self, response):
        # If a tbody tag appears in the xpath expression it must be skipped
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            number = tr.xpath('./td/text()').extract_first()
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            status = tr.xpath('./td[3]/span/text()').extract_first()
            item = SunlineproItem()
            item['number'] = number
            item['title'] = title
            item['status'] = status
            yield item

    # Parse the complaint content on the detail page
    def parse_detail(self, response):
        number = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        try:
            number = number.split(':')[-1]
        except AttributeError:
            print('exception!!')
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td//text()').extract()
        content = ''.join(content)
        item = ContentItem()
        item['number'] = number
        item['content'] = content
        # print(number)
        yield item
pipelines file
class SunlineproPipeline(object):
    def process_item(self, item, spider):
        dic = {}
        # Check which item class this item belongs to
        if item.__class__.__name__ == 'ContentItem':
            content = item['content']
            num = item['number']
            dic['content'] = content
        else:
            number = item['number']
            title = item['title']
            status = item['status']
        return item
Build a distributed cluster and let all of its machines run the same program together, so the data is crawled in a distributed way.
How is distributed crawling implemented?
The scrapy-redis component combined with native scrapy.
Why can't native scrapy do it alone? Because the machines cannot share a single scheduler or a single pipeline.
What scrapy-redis provides: a scheduler and a pipeline that can be shared across the whole cluster.
Create the project
Create the spider file (a Spider or a CrawlSpider)
Modify the spider file: import RedisCrawlSpider and make it the spider's parent class, delete allowed_domains and start_urls, and add a new attribute redis_key = 'xxx' (the name of the scheduler queue)
from scrapy_redis.spiders import RedisCrawlSpider
Modify the configuration file:
Specify the pipeline
ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 400
}
Specify the scheduler:
# Add a dedupe-container class: it uses a Redis set to store request fingerprints, so request de-duplication persists
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler that ships with the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler persists, i.e. when the crawler ends, should the Redis request queue and fingerprint set be kept.
# True means persist (do not clear the data); otherwise the data is cleared
SCHEDULER_PERSIST = True
Specify the Redis database
REDIS_HOST = 'ip address of the Redis server'
REDIS_PORT = 6379
Modify the Redis configuration file redis.windows.conf (typical edits are sketched after this list)
Start the Redis server with that configuration file, then start the client
redis-server ./redis.windows.conf
redis-cli
Launch the distributed project:
scrapy crawl fbs
or scrapy runspider ./xxx.py
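A sketch of the usual redis.windows.conf edits so that the other machines in the cluster can reach the Redis server (the exact lines depend on your Redis version):

# redis.windows.conf (sketch)
# bind 127.0.0.1        <- comment this line out so connections from other hosts are accepted
protected-mode no       # turn off protected mode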
http://wz.sun0769.com/index.php/question/questionType?type=4&page=
settings.py
BOT_NAME = 'fbsPro'

SPIDER_MODULES = ['fbsPro.spiders']
NEWSPIDER_MODULE = 'fbsPro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 32

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}

# Add a dedupe-container class: it uses a Redis set to store request fingerprints, so request de-duplication persists
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scheduler that ships with the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the scheduler persists: keep the Redis request queue and fingerprint set when the crawler ends (True) or clear them (False)
SCHEDULER_PERSIST = True

REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_ENCODING = 'utf-8'
items.py
import scrapy

class FbsproItem(scrapy.Item):
    title = scrapy.Field()
    status = scrapy.Field()
spiders file
from scrapy_redis.spiders import RedisCrawlSpider
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import FbsproItem

class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    redis_key = 'sun'  # name of the scheduler queue

    rules = (
        Rule(LinkExtractor(allow=r'type=4&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            status = tr.xpath('./td[3]/span/text()').extract_first()
            item = FbsproItem()
            item['title'] = title
            item['status'] = status
            yield item
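Once the project is running on every machine, the crawl is kicked off by pushing one start url into the scheduler queue (named by redis_key, here 'sun') from the Redis client. A sketch using the portal url from this example:

# inside redis-cli
lpush sun http://wz.sun0769.com/index.php/question/questionType?type=4&page=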
Incremental crawling: monitor the site for newly added data. The key point: de-duplication.
https://www.4567tv.tv/index.php/vod/show/class/%E5%8A%A8%E4%BD%9C/id/1.html
spider file
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from ..items import ZlsMovieProItem

class ZlsSpider(CrawlSpider):
    name = 'zls'
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/class/喜劇/id/1.html']

    rules = (
        Rule(LinkExtractor(allow=r'page/\d+\.html'), callback='parse_item', follow=True),
    )
    conn = Redis()

    # Parse the movie names
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            title = li.xpath('./div/a/@title').extract_first()
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            item = ZlsMovieProItem()
            item['title'] = title
            # Store every detail_url in a Redis set
            ex = self.conn.sadd('movie_detail_urls', detail_url)
            if ex == 1:  # detail_url was not yet in Redis
                print('new data to crawl......')
                # Request the detail page to get the movie description
                yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('no new data!!!')

    # Parse the movie description
    def parse_detail(self, response):
        item = response.meta['item']
        item['desc'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        yield item
items.py
import scrapy

class ZlsMovieProItem(scrapy.Item):
    title = scrapy.Field()
    desc = scrapy.Field()
pipelines.py
class ZlsMovieProPipeline(object):
    def process_item(self, item, spider):
        conn = spider.conn  # reuse the Redis connection created in the spider
        conn.lpush('movie_data', item)
        print(item)
        return item
settings.py
BOT_NAME = 'zls_movie_Pro'

SPIDER_MODULES = ['zls_movie_Pro.spiders']
NEWSPIDER_MODULE = 'zls_movie_Pro.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
LOG_LEVEL = 'ERROR'
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'zls_movie_Pro.pipelines.ZlsMovieProPipeline': 300,
}
import requests
import re
import json

url = 'http://newsapi.eastmoney.com/kuaixun/v1/getlist_103_ajaxResult_50_%d_.html'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
}
for i in range(1, 21):
    new_url = format(url % i)  # new url for each page
    # Fetch the page text
    page_text = requests.get(url=new_url, headers=headers).text
    # Use a regex to pull out the dict-like string
    page_str = re.findall('\{.*\}', page_text)[0]
    # Deserialize the string
    page_dic = json.loads(page_str)
    page_list = page_dic['LivesList']
    content = []
    for dic in page_list:
        digest = dic['digest']
        content.append(digest)
    with open(f'./news_page_{i}.txt', 'w', encoding='utf8') as f:
        f.write('\n'.join(content))
    print(f'page {i} downloaded!')
import requests
from lxml import etree

class Login(object):
    def __init__(self):
        self.headers = {
            "Referer": "https://github.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            "Host": "github.com",
        }
        self.login_url = "https://github.com/login"
        self.post_url = "https://github.com/session"
        self.session = requests.Session()

    def token(self):
        response = self.session.get(self.login_url, headers=self.headers)
        selector = etree.HTML(response.text)
        token = selector.xpath('//div//input[2]/@value')[0]
        print(selector)
        return token

    def login(self, email, password):
        post_data = {
            "commit": "Sign in",
            "utf8": "✓",
            "authenticity_token": self.token(),
            "login": email,
            "password": password,
        }
        response = self.session.post(self.post_url, data=post_data, headers=self.headers)
        if response.status_code == 200:
            print("success")
            page_text = response.text
            with open('./login_main.html', 'w', encoding='utf8') as f:
                f.write(page_text)

if __name__ == "__main__":
    login = Login()
    login.login(email='msd.yze@gmail.com', password='111111')
1. Briefly describe what a cookie is and what it is for.
   A cookie is a set of key-value pairs that the server stores in the browser to record our information; it is used to keep track of the user's state, for example when logging in or registering.
2. Briefly describe how scrapy's core components work together.
   The urls in the spider are wrapped into request objects and handed to the engine. The engine passes them all to the scheduler; the scheduler filters out duplicate urls with its internal dupe filter, pushes the remaining request objects into a queue, then pops one request at a time and returns it to the engine. The engine hands the request to the downloader, which fetches the data from the internet; the downloaded data is wrapped in a response, which the downloader passes back to the engine, and the engine passes it to the spider. The spider's callback parses the response and produces items; the spider hands the items to the engine, the engine hands them to the pipeline, and the pipeline persists them.
3. The workflow for crawling data with CrawlSpider:
   1. Create the project: scrapy startproject proname
   2. cd into the project directory and create the spider file: scrapy genspider -t crawl spiderName www.baidu.com
   3. Run it
4. In scrapy, how do you store the same data in several different databases?
   Create one pipeline class per storage backend, implement one storage method in each, and register them all in settings.
5. What is scrapy's downloader middleware for, and what are the key methods of the class?
   It intercepts requests and responses in bulk. UA spoofing in process_request: request.headers['User-Agent'] = 'xxxx'. Proxy setting in process_exception: request.meta['proxy'] = 'http://ip:port'. Intercepting responses lets you tamper with the response data or the response object.
6. What does scrapy's pipeline do and how does it work?
   It persists data: it takes the items sent over by the engine and stores them.
7. What should you keep in mind about the return value of process_item in a scrapy pipeline?
   The return value is passed on to the next pipeline that needs to process the item.
8. How many ways does scrapy offer for persistent storage, and how are they implemented?
   Terminal-based: the return value of the parse method can be saved to a local file. Pipeline-based: parse the data, define the corresponding fields in the item class, pack the parsed data into an item object, submit the item to the pipeline, and the pipeline's process_item method persists it.
9. Describe the flow of parsing data with xpath.
   Instantiate an etree object, load the page to be parsed into it, and call the object's xpath method with different xpath expressions to locate tags and extract data (see the sketch below).
10. How do you handle dynamically loaded page data?
    1. selenium makes it very easy to obtain dynamically loaded data. 2. A packet-capture tool can be used to analyse the dynamic loading and extract the underlying url.
11. How do you implement distributed crawling? Briefly describe the implementation and deployment.
    The scrapy-redis component combined with native scrapy.
    Deployment: create the project; create the spider file; modify the spider file (import and use RedisCrawlSpider as the parent class, delete allowed_domains and start_urls, add redis_key = 'xxx' as the scheduler queue name, complete the spider code: link extractor, rules, parse methods); modify the configuration file (specify the pipeline, the scheduler and the Redis database); modify the Redis configuration; start the Redis server and client; launch the distributed project; push one start url into the scheduler queue.
12. Explain your understanding of the encryption used by https.
    Symmetric encryption: a single key encrypts a piece of content and is the only key that can decrypt it.
    Asymmetric encryption: two keys, a public key and a private key; data encrypted with the public key can only be opened with the private key, and vice versa.
    Digital certificate: before using HTTPS a site applies to a CA for a certificate; the server sends the certificate to the browser, which takes the public key from it. The certificate acts like an ID card.
    Digital signature: a signature is generated from the certificate content; comparing the content with the signature reveals whether the certificate was tampered with. The plaintext of the certificate is hashed and the hash is encrypted with the private key to produce the signature.
13. Why can't the native scrapy framework do distributed crawling?
    The machines cannot share one scheduler, and they cannot share one pipeline.
14. What are the common anti-crawling measures and how do you handle them?
    Headers: send the expected headers with the request. IP limits (blocking by per-IP access frequency or count): build your own proxy pool and pick a random proxy for each request. UA limits (browser identification): build a UA pool and pick a random UA for each request. Captchas / forced login: captcha recognition or a captcha-solving platform. Ajax dynamic loading: packet-capture tools or selenium.
15. How do you clean data in a crawler (three methods)?
    De-duplicate, drop invalid values, and drop missing values.
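A minimal sketch of the xpath parsing flow described in question 9; the HTML snippet and the expression are placeholders:

from lxml import etree

# load the page to be parsed into an etree object
page_text = '<html><body><div class="item"><a href="/detail/1">first</a></div></body></html>'
tree = etree.HTML(page_text)
# locate tags and extract data with an xpath expression
titles = tree.xpath('//div[@class="item"]/a/text()')
print(titles)  # ['first']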