定向抓取時,目標站點的數據不能單次請求獲取,須要 3、4 次或者更多,請求之間有依賴關係,也就是須要連續請求才能完成這個下載事務。
前面講過的 js 動態頁面下載。
......react
下載器不可以影響 twisted 框架本身的異步機制。
與 scrapy 原有的下載器調用接口一致,符合插件規範。
其餘插件依賴的狀態要保留。
1. 新建項目
# 建立項目 scrapy startproject jstest # 建立蜘蛛 scrapy genspider -t basic testSpider 'sina.com.cn'
2. 修改蜘蛛文件 testSpider.py
from scrapy.spider import Spider


class TestspiderSpider(Spider):
    """Minimal test spider used to exercise the custom download handler.

    Fetches the Sina front page and prints the downloaded body plus the
    download latency recorded by the handler.  Python 2 syntax (print
    statements), matching the tutorial's python2.7 environment.
    """
    name = "testSpider"
    allowed_domains = ["sina.com.cn"]
    start_urls = (
        'http://www.sina.com.cn/',
    )

    def parse(self, response):
        # Dump the page body produced by the custom downloader.
        print response.body
        # 'download_latency' is set by MyLogicDownloader.parseData in
        # mydownloader.py; the AutoThrottle extension also reads it.
        print 'download_latency:', response.meta['download_latency']
3. 建立 jstest/handler 目錄,新建文件 mydownloader.py
參照 /usr/local/lib/python2.7/dist-packages/scrapy/core/downloader/handlers/http11.py 文件
#!/usr/bin/env python
# encoding: utf-8
# Custom download handler for Scrapy (Python 2 / old Scrapy API).
# Delegates the actual page fetch to an external phantomjs wrapper script
# (scrapyweb.js) so JS-rendered pages can be downloaded without blocking
# the twisted reactor's asynchronous machinery.
import re
from time import time
from cStringIO import StringIO
from urlparse import urldefrag
from zope.interface import implements
from twisted.internet import defer, reactor, protocol
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer
from twisted.internet.error import TimeoutError
from twisted.web.http import PotentialDataLoss
from scrapy.xlib.tx import Agent, ProxyAgent, ResponseDone, HTTPConnectionPool, TCP4ClientEndpoint
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.core.downloader.webclient import _parse
from scrapy.utils.misc import load_object
from scrapy.http import HtmlResponse
from twisted.internet import utils


class MyLogicDownloader(object):
    '''Custom download logic.

    Wraps one request/response cycle around an external phantomjs process
    instead of Scrapy's stock HTTP client.
    '''

    def __init__(self, agent=None):
        '''agent: asynchronous download agent (unused in this example).'''
        self._agent = agent

    def download(self, request):
        '''Start an asynchronous download and return a twisted Deferred.

        Must not block.  This example simply invokes a small phantomjs
        wrapper script; the print below fires before the child process
        finishes, demonstrating that the call really is asynchronous.
        '''
        begintime = time()
        d = self._download(request)
        d.addCallback(self.parseData, request, begintime)
        print '證明我是異步的'
        return d

    def _download(self, request):
        '''Spawn the phantomjs wrapper as an async child process via twisted.'''
        # getProcessOutput returns a Deferred that fires with the child
        # process's stdout (the rendered HTML).
        d = utils.getProcessOutput('scrapyweb.js', args=(request.url, '24000'), reactor=reactor)

        def getOutput(result):
            # Pass-through callback; kept from the original example.
            return result
        d.addCallback(getOutput)
        return d

    def parseData(self, htmldoc, request, begintime):
        '''Callback run when the request completes; builds the Response.'''
        # This download time is consumed by the AutoThrottle extension,
        # which adjusts the crawl speed based on it.
        request.meta['download_latency'] = time() - begintime
        return HtmlResponse(request.url, body=htmldoc + '\n證明我被解析過', request=request)


class MyDownloadHandler(object):
    '''Download interface invoked by the upper (Scrapy core) layer.'''

    def __init__(self, settings):
        # Recreate the connection-pool state Scrapy's stock HTTP11 handler
        # sets up, so other plugins that depend on it keep working.
        self._pool = HTTPConnectionPool(reactor, persistent=True)
        self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self._pool._factory.noisy = False
        self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._contextFactory = self._contextFactoryClass()

    def download_request(self, request, spider):
        '''Main (asynchronous) download entry point; returns a Deferred.'''
        myDownloader = MyLogicDownloader()
        return myDownloader.download(request)

    def close(self):
        # Release pooled connections on shutdown.
        return self._pool.closeCachedConnections()
4. 添加配置到 settings.py
# Route all http:// requests through the custom download handler defined in
# jstest/handler/mydownloader.py instead of Scrapy's default HTTP handler.
DOWNLOAD_HANDLERS = {
    'http': 'jstest.handler.mydownloader.MyDownloadHandler'
}
5. 在系統目錄下新建一個 phantomjs 包裝腳本 scrapyweb.js,並添加可執行權限
#!/usr/bin/env phantomjs
// PhantomJS wrapper: fetch a URL, let its JS run, dump the rendered HTML to
// stdout.  Invoked by the Scrapy handler as: phantomjs scrapyweb.js url timeout
if (phantom.args.length < 1) {
    // No URL supplied — print usage and fail.
    console.log('Usage:');
    console.log('\tphantomjs scrapyweb.js url timeout');
    phantom.exit(1);
} else {
    var targetUrl = phantom.args[0];
    // Default wait of 10s; an optional second argument overrides it,
    // clamped to the range [0, 30000] ms.
    var waitMs = 10000;
    if (phantom.args.length === 2) {
        waitMs = Math.min(30000, Math.max(0, phantom.args[1]));
    }

    var webPage = require('webpage').create();
    // Browser-like request headers so the target site serves normal content.
    webPage.customHeaders = {
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'DNT': '1'
    };
    webPage.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36';

    webPage.open(encodeURI(targetUrl), function (status) {
        if (status != 'success') {
            console.log('Err, status=' + status);
            phantom.exit(1);
        }
        // Emit the fully rendered page and finish.
        console.log(webPage.content);
        phantom.exit();
    });

    // Safety net: if the page never finishes loading, dump whatever has
    // rendered so far once the timeout elapses.
    setTimeout(function () {
        console.log(webPage.content);
        phantom.exit();
    }, waitMs);
}
6. 運行
scrapy crawl testSpider
本例演示了 js 動態頁面的下載。若是須要更複雜的連續下載,須要參照 scrapy 框架的 http11.py 文件修改。
request.meta['download_latency'] 須要賦值,調整下載速度的 AutoThrottle 擴展依賴這個值來判定。