from scrapy.http import HtmlResponse
from scrapy.http import Request


class Md1(object):
    @classmethod
    def from_crawler(cls, crawler):
        # This method gives access to the crawler that is currently running
        s = cls()
        return s

    def process_request(self, request, spider):
        print('md1.process_request', request)
        return None  # returning None lets the next middleware's process_request run; any other return value changes the flow
        """
        # 1. Return a Response
        #    The remaining process_request calls are skipped and the last middleware's process_response runs next
        # import requests
        # result = requests.get(request.url)
        # return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)

        # 2. Return a Request
        #    The current request is discarded and the returned one goes back to the scheduler, i.e. a new task is created
        # return Request('https://dig.chouti.com/r/tec/hot/1')

        # 3. Raise an exception
        #    A raised exception must be caught by a process_exception method, otherwise an error is reported;
        #    process_exception then decides what to do with it
        # from scrapy.exceptions import IgnoreRequest
        # raise IgnoreRequest

        # 4. Modify the request (*)
        #    Most of the time we just modify the request and return nothing so processing continues
        # request.headers['user-agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        # return None
        """

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object   # a Response that replaces the current one
        # - return a Request object    # a Request that starts a new task
        # - or raise IgnoreRequest     # trigger the request's error handling
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        #   (usually returning None is all we need)
        # - return a Response object: stops the process_exception() chain;
        #   the response is handed to the middlewares' process_response methods
        # - return a Request object: stops the process_exception() chain;
        #   the current task is abandoned and the returned request is rescheduled
        pass
DOWNLOADER_MIDDLEWARES = {
    # 'xdb.middlewares.XdbDownloaderMiddleware': 543,
    # 'xdb.proxy.XdbProxyMiddleware': 751,
    'xdb.md.Md1': 666,   # priorities still range from 0 to 1000; the lower the number, the earlier it runs
    'xdb.md.Md2': 667,
}
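The settings above also register xdb.md.Md2, which is not shown here; as a placeholder, a second middleware that simply passes everything through could look like the sketch below (assumed to live in the same xdb/md.py, not necessarily the author's actual Md2):

class Md2(object):
    def process_request(self, request, spider):
        print('md2.process_request', request)
        return None  # let the request continue down the chain

    def process_response(self, request, response, spider):
        print('md2.process_response', request, response)
        return response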
When the scheduler hands a request to the downloader, it passes through every middleware's process_request (from the first middleware to the last), and the return value decides what happens next:
Return None: continue with the next middleware's process_request
Return a Response: skip ahead to the last downloader middleware's process_response chain
Return a Request: go back to the scheduler and start a new task
Raise an exception: the middlewares' process_exception methods handle it (see the retry sketch after these lists)
When the downloader hands the response back toward the spider, it passes through process_response (from the last middleware back to the first), and again the return value decides the next step:
Return a Response (the one passed in, or a replacement): the previous middleware's process_response runs next
Return a Request: the current task is abandoned and the request goes back to the scheduler as a new task
Raise IgnoreRequest: the request's errback is called
(Unlike process_request, process_response may not return None.)
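For the exception path, a downloader middleware can catch a failed download in process_exception and put the request back on the scheduler; the sketch below is illustrative only (the class name and the "retried" meta key are made up for the example):

class RetryOnceMiddleware(object):
    # Illustrative sketch: re-schedule a failed request a single time
    def process_exception(self, request, exception, spider):
        if request.meta.get('retried'):
            return None                                  # already retried once; let other handlers deal with it
        new_request = request.replace(dont_filter=True)  # copy the request and bypass the dupefilter
        new_request.meta['retried'] = True
        return new_request                               # goes back to the scheduler as a new task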
An open-source component (fake-useragent) can also handle the random User-Agent. Import it:
from fake_useragent import UserAgent
Choose the selection mode in the settings file:
RANDOM_UA_TYPE = "random"
Then have the middleware read that mode from the settings:
class RandomUserAgentMiddlware(object):
    # Switch the user-agent randomly on every request
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
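The middleware only takes effect once it is enabled in settings.py; the module path and priority below are assumptions (adjust them to wherever the class actually lives), and the built-in UserAgentMiddleware is switched off so it does not overwrite the header:

# settings.py -- example registration, module path assumed
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # disable the default UA middleware
    'xdb.md.RandomUserAgentMiddlware': 543,
}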
Write a script that crawls the free proxy IPs from xicidaili and stores them in a database:
# -*- coding: utf-8 -*-
import requests
from scrapy.selector import Selector
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root",
                       db="article_spider", charset="utf8")
cursor = conn.cursor()


def crawl_ips():
    # Crawl the free proxy IPs from xicidaili
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    for i in range(1568):
        re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)

        selector = Selector(text=re.text)
        all_trs = selector.css("#ip_list tr")

        ip_list = []
        for tr in all_trs[1:]:
            speed = 0
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            all_texts = tr.css("td::text").extract()

            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]

            ip_list.append((ip, port, proxy_type, speed))

        for ip_info in ip_list:
            cursor.execute(
                "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format(
                    ip_info[0], ip_info[1], ip_info[3]
                )
            )
            conn.commit()


class GetIP(object):
    def delete_ip(self, ip):
        # Remove an invalid ip from the database
        delete_sql = """
            delete from proxy_ip where ip='{0}'
        """.format(ip)
        cursor.execute(delete_sql)
        conn.commit()
        return True

    def judge_ip(self, ip, port):
        # Check whether the ip:port proxy is still usable
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        try:
            proxy_dict = {
                "http": proxy_url,
            }
            response = requests.get(http_url, proxies=proxy_dict)
        except Exception as e:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if code >= 200 and code < 300:
                print("effective ip")
                return True
            else:
                print("invalid ip and port")
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        # Fetch one random usable ip from the database
        random_sql = """
            SELECT ip, port FROM proxy_ip
            ORDER BY RAND()
            LIMIT 1
        """
        cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]

            judge_re = self.judge_ip(ip, port)
            if judge_re:
                return "http://{0}:{1}".format(ip, port)
            else:
                return self.get_random_ip()


# print(crawl_ips())
if __name__ == "__main__":
    get_ip = GetIP()
    get_ip.get_random_ip()
Set up a downloader middleware that calls the script above to attach a proxy IP to each request:
class RandomProxyMiddleware(object):
    # Dynamically set a proxy ip on every request
    def process_request(self, request, spider):
        get_ip = GetIP()
        request.meta["proxy"] = get_ip.get_random_ip()
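Like the other downloader middlewares, this one has to be registered before it does anything; the module path below is an assumption, and the priority just needs to stay below 750 so it runs before the built-in HttpProxyMiddleware picks up request.meta["proxy"]:

# settings.py -- example registration, module path assumed
DOWNLOADER_MIDDLEWARES = {
    'xdb.proxy.RandomProxyMiddleware': 544,
}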
class Sd1(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Runs only once, when the spider starts.
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r
SPIDER_MIDDLEWARES = {
    # 'xdb.middlewares.XdbSpiderMiddleware': 543,
    'xdb.sd.Sd1': 666,   # same 0-1000 priority rule as the downloader middleware
    'xdb.sd.Sd2': 667,
}
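xdb.sd.Sd2 is referenced in the settings but not shown; a minimal pass-through version could look like this sketch (not necessarily the author's actual class):

class Sd2(object):
    def process_spider_output(self, response, result, spider):
        print('sd2.process_spider_output', response)
        for i in result:
            yield i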
1. When a spider file starts for the first time, the start requests it builds go through process_start_requests before being handed up to the engine
2. The engine passes the wrapped requests to the scheduler
3. The scheduler passes them on to the downloader
4. After downloading, the downloader returns the content to the engine
5. When the engine hands the response back to the spider file, it goes through process_spider_input
6. After the spider file has processed the response, anything it yields goes through process_spider_output on its way back to the engine
Typical things the built-in spider middlewares take care of (see the settings sketch right after this list):
- crawl depth
- request priority
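Both are driven by Scrapy's built-in DepthMiddleware (itself a spider middleware) through settings; the values below are only examples, not defaults:

# settings.py -- example values
DEPTH_LIMIT = 3      # drop requests nested deeper than 3 levels (0 means no limit)
DEPTH_PRIORITY = 1   # positive values lower the priority of deeper requests, approximating breadth-first order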
Extensions use hooks that the framework reserves to let you plug in custom functionality.
Usage example:
from scrapy import signals


class MyExtend(object):
    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        self = cls()
        # Bind the functions that run when each signal fires
        crawler.signals.connect(self.x1, signal=signals.spider_opened)
        crawler.signals.connect(self.x2, signal=signals.spider_closed)
        return self

    def x1(self, spider):
        print('open')

    def x2(self, spider):
        print('close')
# Available signal types, as listed in scrapy/signals.py (from scrapy import signals)
engine_started = object()
engine_stopped = object()
spider_opened = object()
spider_idle = object()
spider_closed = object()
spider_error = object()
request_scheduled = object()
request_dropped = object()
response_received = object()
response_downloaded = object()
item_scraped = object()
item_dropped = object()
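Each signal sends its own keyword arguments, and Scrapy only passes a handler the arguments it declares; spider_closed, for instance, also carries the close reason, so the x2 method of the example class above could be rewritten as this fragment:

    def x2(self, spider, reason):
        # reason is e.g. 'finished', 'cancelled' or 'shutdown'
        print('close', spider.name, reason)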
# settings.py
EXTENSIONS = {
    'xdb.ext.MyExtend': 666,
}
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Run a single spider from a script instead of typing the command by hand
    execute(["scrapy", "crawl", "chouti", "--nolog"])
- Create a directory with any name, e.g. commands, at the same level as spiders
- Create a crawlall.py file inside it (the file name becomes the name of the custom command)
- Add the setting COMMANDS_MODULE = '<project name>.<directory name>' to settings.py
- Run the command from the project directory: scrapy crawlall
# crawlall.py
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
# settings.py
COMMANDS_MODULE = "xdb.commands"
Pausing and resuming a crawl needs file support: you point Scrapy at a job directory when launching.
Different spiders must not share the same directory;
if the same spider is launched again with the same directory, it resumes from the state saved there last time.
To interrupt the crawl, use Ctrl+C on Windows or send the process a signal on Linux (e.g. kill <pid>; avoid kill -9, which skips the graceful shutdown so the state cannot be saved).
Because of that, stopping the run from inside PyCharm does not work reliably; interrupt it from the command line instead.
scrapy crawl lagou -s JOBDIR=job_info/001
The job directory can also be set in settings.py, which makes it global:
JOBDIR="job_info/001"
Or set it per spider via the spider class:
custom_settings = {
    "JOBDIR": "job_info/001",
}
But the same caveat as above applies: you cannot interrupt cleanly from inside PyCharm, so this is really only useful when the spider is run from the command line.
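For reference, custom_settings sits on the spider class itself; a minimal sketch follows (the spider name lagou matches the command above, everything else is illustrative):

import scrapy


class LagouSpider(scrapy.Spider):
    name = "lagou"
    custom_settings = {
        "JOBDIR": "job_info/001",   # per-spider job directory for pause/resume
    }

    def parse(self, response):
        pass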