Scrapy是一個爲了爬取網站數據、提取結構性數據而編寫的應用框架。它可以應用在數據挖掘、信息處理或存儲歷史數據等一系列的程序中。
它最初是爲了頁面抓取(更確切地說,網絡抓取)而設計的,也可以用於獲取API所返回的數據(例如 Amazon Associates Web Services)或者通用的網絡爬蟲。Scrapy用途廣泛,可以用於數據挖掘、監測和自動化測試。
Scrapy 使用了 Twisted 異步網絡庫來處理網絡通信。整體架構大致如下:
Scrapy運行流程大概如下:
安裝:
# Scrapy depends on pywin32, pyOpenSSL, Twisted, lxml and zope.interface.
# (Read the error output carefully while installing.)

# Install wheel.
pip3 install wheel -i http://pypi.douban.com/simple --trusted-host pypi.douban.com

# Install Incremental first; Twisted cannot be installed without it.
pip3 install Incremental -i http://pypi.douban.com/simple --trusted-host pypi.douban.com

# Try installing Twisted through pip3. This still fails with an error,
# but it resolves Twisted's other dependencies.
pip3 install Twisted -i http://pypi.douban.com/simple --trusted-host pypi.douban.com

# Change into the directory holding the downloaded wheel and install it
# from there; this time the installation succeeds.
pip3 install Twisted-17.1.0-cp35-cp35m-win32.whl

# Install Scrapy.
pip3 install scrapy -i http://pypi.douban.com/simple --trusted-host pypi.douban.com

# pywin32 download: https://sourceforge.net/projects/pywin32/files/
創建:
# Create the project. The project is named differently from the spider
# because `scrapy genspider` refuses a spider whose name equals the
# project name.
scrapy startproject beauty

# Enter the project directory.
cd beauty

# Create the spider: scrapy genspider <spider name> <domain>.
scrapy genspider xiaohuar xiaohuar.com

# Run the spider that was just created (--nolog suppresses log output).
scrapy crawl xiaohuar --nolog
目錄:
project_name/
    scrapy.cfg
    project_name/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
解釋:
注意:一般創建爬蟲文件時,以網站域名命名
選擇器:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Selector examples: wrap a literal HTML page in an HtmlResponse and query it with XPath.

Only the response is built when this script runs; every query below is left
commented out so that one example at a time can be enabled and printed.
"""
# Only ``Selector`` is imported: the legacy ``HtmlXPathSelector`` class is no
# longer shipped by current Scrapy releases and importing it would fail.
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

html = """<!DOCTYPE html>
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html">first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
    </body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# --- Every <a> element in the document
# hxs = Selector(response=response).xpath('//a')
# print(hxs)

# --- <a> elements that are the second <a> child of their parent
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)

# --- <a> elements that carry an id attribute
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)

# --- <a> elements whose id equals "i1"
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)

# --- Several predicates combined: href AND id must both match
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)

# --- href contains "link"
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)

# --- href starts with "link"
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)

# --- Regular-expression test on an attribute (raw string keeps "\d" intact)
# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]')
# print(hxs)

# --- Text of the matched elements, extracted as a list of strings
# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)

# --- href attribute of the matched elements
# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)

# --- Absolute path from the document root
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)

# --- Only the first match
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# --- Relative queries starting from each matched <li>
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
自定義擴展:
自定義擴展時,利用信號在指定位置註冊指定操作
from scrapy import signals


class MyExtension(object):
    """Extension that prints a marker when a spider is opened and when it is closed."""

    def __init__(self, value):
        # Value read from the 'MMMM' setting by from_crawler().
        self.value = value

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension from *crawler* and register its signal handlers.

        The integer setting 'MMMM' is passed to the constructor, then the two
        handlers are connected to the spider_opened / spider_closed signals.
        """
        extension = cls(crawler.settings.getint('MMMM'))
        hooks = (
            (extension.spider_opened, signals.spider_opened),
            (extension.spider_closed, signals.spider_closed),
        )
        for handler, sig in hooks:
            crawler.signals.connect(handler, signal=sig)
        return extension

    def spider_opened(self, spider):
        """Handler for the spider_opened signal."""
        print('open')

    def spider_closed(self, spider):
        """Handler for the spider_closed signal."""
        print('close')
自定義去重複:
scrapy默認使用 scrapy.dupefilters.RFPDupeFilter 進行去重,相關配置有:
# Dotted path of the duplicate-request filter. The module is named
# ``scrapy.dupefilters`` (plural); the old ``scrapy.dupefilter`` path no longer
# resolves on current Scrapy releases.
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
DUPEFILTER_DEBUG = False
# Directory in which the record of visited requests is saved.
# With "/root/" the resulting file is /root/requests.seen
JOBDIR = "/root/"
自定義:
class RepeatFilter(object):
    """Loosely coupled URL de-duplication filter backed by an in-memory set."""

    def __init__(self):
        # URLs of every request seen so far.
        self.visited_set = set()

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor; *settings* is accepted but not consulted."""
        return cls()

    def request_seen(self, request):
        """Report whether ``request.url`` was seen before, recording it if not.

        Returns True for a URL already in ``visited_set``; otherwise the URL
        is added to the set and False is returned.
        """
        already_visited = request.url in self.visited_set
        if not already_visited:
            self.visited_set.add(request.url)
        return already_visited

    def open(self):
        """Hook called every time crawling starts; nothing to set up."""
        # print('open')
        return None

    def close(self, reason):
        """Hook called every time crawling ends; nothing to tear down."""
        # print('close')
        return None

    def log(self, request, spider):
        """Hook called for every duplicate URL that is caught; nothing is logged."""
        # print('log....')
        return None
settings:
# 1. Bot (crawler) name
# BOT_NAME = 'step8_king'

# 2. Spider module paths
# SPIDER_MODULES = ['step8_king.spiders']
# NEWSPIDER_MODULE = 'step8_king.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# 3. Client User-Agent request header
# USER_AGENT = 'step8_king (+http://www.yourdomain.com)'

# Obey robots.txt rules
# 4. Whether to honour the crawling rules published in robots.txt
# ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# 5. Number of concurrent requests
# CONCURRENT_REQUESTS = 4

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 6. Download delay in seconds
# DOWNLOAD_DELAY = 2

# The download delay setting will honor only one of:
# 7. Concurrent requests per domain; the download delay is then applied per domain
# CONCURRENT_REQUESTS_PER_DOMAIN = 2
# Concurrent requests per IP; when set, CONCURRENT_REQUESTS_PER_DOMAIN is
# ignored and the download delay is applied per IP
# CONCURRENT_REQUESTS_PER_IP = 3

# Disable cookies (enabled by default)
# 8. Whether cookies are supported; cookies are handled through a cookiejar
# COOKIES_ENABLED = True
# COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
# 9. Telnet console for inspecting and controlling the running crawler;
#    connect with `telnet ip port`, then operate it through commands
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]

# Override the default request headers:
# 10. Default request headers
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# 11. Pipelines that process the scraped items
# ITEM_PIPELINES = {
#     'step8_king.pipelines.CustomPipeline': 500,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# 12. Custom extensions, invoked through signals
# EXTENSIONS = {
#     # 'step8_king.extensions.MyExtension': 500,
# }

# 13. Maximum crawl depth; the current depth can be read from meta;
#     0 means no depth limit
# DEPTH_LIMIT = 3

# 14. Crawl order: 0 means depth-first, LIFO (default); 1 means breadth-first, FIFO
# Last in, first out: depth-first
# DEPTH_PRIORITY = 0
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
# First in, first out: breadth-first
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

# 15. Scheduler
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler

# 16. De-duplication of visited URLs
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# Turn automatic throttling on
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 10
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to each
# remote server (a concurrency target, not a per-second rate)
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Spider middlewares
SPIDER_MIDDLEWARES = {
    'step8_king.middlewares.MyCustomSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# Downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    # 'step8_king.middlewares.MyCustomDownloaderMiddleware': 500,
}
自定義pipeline
一個簡單的爬蟲:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re
import urllib.request

import scrapy
from scrapy.http import Request

# Directory the downloaded pictures are written to.
_PICTURE_DIR = "/Users/wupeiqi/PycharmProjects/beauty/pic"
# Prefix shared by every list page; used to decide which links to follow.
_LIST_URL_PREFIX = 'http://www.xiaohuar.com/list-1-'
# Full list-page URL. Dots are escaped so they only match a literal ".".
_LIST_URL_RE = re.compile(r'http://www\.xiaohuar\.com/list-1-\d+\.html')


class XiaoHuarSpider(scrapy.spiders.Spider):
    """Save the pictures found on xiaohuar.com list pages and follow further list pages."""

    name = "xiaohuar"
    allowed_domains = ["xiaohuar.com"]
    start_urls = [
        "http://www.xiaohuar.com/list-1-1.html",
    ]

    def parse(self, response):
        """Download the pictures of a list page, then yield requests for linked list pages.

        Each entry of the item list is queried relative to its own node.
        Addressing entries as ``div[i]`` with ``i`` starting at 0 would miss
        the last entry, because XPath positions start at 1.
        """
        if _LIST_URL_RE.match(response.url):
            entries = response.xpath('//div[@class="item_list infinite_scroll"]/div')
            for entry in entries:
                src = entry.xpath('.//div[@class="img"]/a/img/@src').extract_first()
                if not src:
                    continue
                name = entry.xpath(
                    './/div[@class="img"]/span/text()').extract_first(default='')
                school = entry.xpath(
                    './/div[@class="img"]/div[@class="btns"]/a/text()'
                ).extract_first(default='')

                # urljoin copes with both site-relative and absolute src values.
                ab_src = response.urljoin(src)
                file_name = "%s_%s.jpg" % (school, name)
                file_path = os.path.join(_PICTURE_DIR, file_name)
                # NOTE: urlretrieve blocks until the download finishes; that is
                # acceptable for a demo, but an images pipeline is the better
                # fit for real crawls.
                urllib.request.urlretrieve(ab_src, file_path)

        # Collect every link on the page and keep crawling the list pages.
        for url in response.xpath('//a/@href').extract():
            if url.startswith(_LIST_URL_PREFIX):
                yield Request(url, callback=self.parse)
以上代碼將符合規則的頁面中的圖片保存在指定目錄,而且在HTML源碼中找到全部的其餘 a 標籤的href屬性,從而「遞歸」的執行下去,直到全部的頁面都被訪問過爲止。以上代碼之因此能夠進行「遞歸」的訪問相關URL,關鍵在於parse方法使用了 yield Request對象。
注:可以修改 settings.py 中的配置,以此來指定「遞歸」的層數,如: DEPTH_LIMIT = 1
獲取響應的cookie:
def parse(self, response):
    """Print the cookies extracted from *response* and the request that produced it."""
    from scrapy.http.cookies import CookieJar

    jar = CookieJar()
    jar.extract_cookies(response, response.request)
    # _cookies is the jar's internal store; it is printed as-is.
    print(jar._cookies)
格式化處理:
上述實例只是簡單的圖片處理,因此在parse方法中直接處理。如果想要獲取更多的數據(獲取頁面的價格、商品名稱、QQ等),則可以利用Scrapy的items將數據格式化,然後統一交由pipelines來處理。
import scrapy


class JieYiCaiItem(scrapy.Item):
    """Item declaring the fields collected per record: company, title, qq, info and more."""

    company = scrapy.Field()
    title = scrapy.Field()
    qq = scrapy.Field()
    info = scrapy.Field()
    more = scrapy.Field()
上述定義模板,以後對於從請求的源碼中獲取的數據統一按照此結構來獲取,所以在spider中需要有以下操作:
import hashlib

import scrapy
from beauty.items import JieYiCaiItem
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class JieYiCaiSpider(scrapy.spiders.Spider):
    """Crawl jieyicai.com, yielding a JieYiCaiItem for every product detail page."""

    count = 0
    # MD5 digests of the URLs already parsed; shared by all instances.
    url_set = set()

    name = "jieyicai"
    domain = 'http://www.jieyicai.com'
    allowed_domains = ["jieyicai.com"]

    start_urls = [
        "http://www.jieyicai.com",
    ]

    rules = [
        # URLs matching this rule are not scraped; only their links are
        # extracted (the URL is made up, replace it before use).
        # Rule(LinkExtractor(allow=(r'http://test_url/test\?page_index=\d+'))),
        # URLs matching this rule are scraped (the URL is made up, replace it
        # before use).
        # Rule(LinkExtractor(allow=(r'http://www\.jieyicai\.com/Product/Detail\.aspx\?pid=\d+')), callback="parse"),
    ]

    def parse(self, response):
        """Yield the item of a detail page plus requests for every site-relative link.

        A response whose URL digest is already in ``url_set`` is skipped.
        """
        # hashlib needs bytes on Python 3, so the URL is encoded first.
        md5_url = hashlib.md5(response.url.encode('utf-8')).hexdigest()
        if md5_url in JieYiCaiSpider.url_set:
            return
        JieYiCaiSpider.url_set.add(md5_url)

        if response.url.startswith('http://www.jieyicai.com/Product/Detail.aspx'):
            item = JieYiCaiItem()
            item['company'] = response.xpath(
                '//span[@class="username g-fs-14"]/text()').extract()
            item['qq'] = response.xpath(
                '//span[@class="g-left bor1qq"]/a/@href').re(r'.*uin=(?P<qq>\d*)&')
            item['info'] = response.xpath(
                '//div[@class="padd20 bor1 comard"]/text()').extract()
            item['more'] = response.xpath(
                '//li[@class="style4"]/a/@href').extract()
            item['title'] = response.xpath(
                '//div[@class="g-left prodetail-text"]/h2/text()').extract()
            yield item

        # Follow only site-relative links, made absolute with the site root.
        for url in response.xpath('//a/@href').extract():
            if url.startswith('/'):
                yield Request(JieYiCaiSpider.domain + url, callback=self.parse)
此處代碼的關鍵在於:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import re

import MySQLdb.cursors
from twisted.enterprise import adbapi

mobile_re = re.compile(r'(13[0-9]|15[012356789]|17[678]|18[0-9]|14[57])[0-9]{8}')
phone_re = re.compile(r'(\d+-\d+|\d+)')


class JsonPipeline(object):
    """Append one "<company> <title>" line per item to a fixed output file."""

    def __init__(self):
        # Text mode with an explicit encoding: process_item writes str, which
        # a binary-mode handle rejects on Python 3.
        self.file = open(
            '/Users/wupeiqi/PycharmProjects/beauty/beauty/jieyicai.json',
            'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write the first company and title values of *item*, then pass it on."""
        line = "%s %s\n" % (item['company'][0], item['title'][0])
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes so buffered lines are flushed."""
        self.file.close()


class DBPipeline(object):
    """Insert each item's company into MySQL unless a row for it already exists."""

    def __init__(self):
        # NOTE(review): database credentials are hard-coded here; consider
        # reading them from the project settings instead.
        self.db_pool = adbapi.ConnectionPool('MySQLdb',
                                             db='DbCenter',
                                             user='root',
                                             passwd='123',
                                             cursorclass=MySQLdb.cursors.DictCursor,
                                             use_unicode=True)

    def process_item(self, item, spider):
        """Schedule the conditional insert on the connection pool and pass *item* on."""
        query = self.db_pool.runInteraction(self._conditional_insert, item)
        # Failures inside the interaction are reported by handle_error.
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        """Insert the company row through cursor *tx* when it is not stored yet."""
        tx.execute("select nid from company where company = %s", (item['company'][0], ))
        if tx.fetchone():
            # The company is already stored; nothing to insert.
            return

        # info[0] holds the phone text, info[1] the mobile text and info[2]
        # the address; a single space is stored when no number is found.
        phone_obj = phone_re.search(item['info'][0].strip())
        phone = phone_obj.group() if phone_obj else ' '

        mobile_obj = mobile_re.search(item['info'][1].strip())
        mobile = mobile_obj.group() if mobile_obj else ' '

        values = (
            item['company'][0],
            item['qq'][0],
            phone,
            mobile,
            item['info'][2].strip(),
            item['more'][0])
        tx.execute("insert into company(company,qq,phone,mobile,address,more) values(%s,%s,%s,%s,%s,%s)", values)

    def handle_error(self, e):
        """Errback: print the failure raised by the database interaction."""
        print('error', e)
上述的pipelines中有多個類,到底Scrapy會自動執行哪個?哈哈哈哈,當然需要先配置了,不然Scrapy就蒙逼了。。。
在settings.py中做如下配置:
# The integer after each entry determines the run order: items pass through
# the pipelines from the lowest number to the highest. These numbers are
# conventionally kept within the 0-1000 range.
ITEM_PIPELINES = {
    'beauty.pipelines.DBPipeline': 300,
    'beauty.pipelines.JsonPipeline': 100,
}
一個小蜘蛛:
import hashlib

import scrapy
from scrapy import FormRequest
from scrapy.http.cookies import CookieJar
from scrapy.http.request import Request


class ChouTiSpider(scrapy.Spider):
    """Log in to dig.chouti.com, vote for every listed item and walk the pagination."""

    # Name of the spider; the crawl command starts the spider by this name.
    name = "chouti"
    # Allowed domains.
    allowed_domains = ["chouti.com"]

    # Cookies captured from the first response and sent with later requests.
    cookie_dict = {}
    # MD5 digest of each page URL already requested -> the URL itself.
    has_request_set = {}

    def start_requests(self):
        """Open the home page first so that its cookies can be captured."""
        url = 'http://dig.chouti.com/'
        yield Request(url=url, callback=self.login)

    def login(self, response):
        """Store the cookies of *response*, then post the login form with them."""
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        # _cookies is a three-level nested mapping whose innermost level maps
        # cookie name -> cookie object; only name and value are kept.
        for level_one in cookie_jar._cookies.values():
            for level_two in level_one.values():
                for cookie_name, cookie in level_two.items():
                    self.cookie_dict[cookie_name] = cookie.value

        # NOTE(review): the account phone number and password are hard-coded
        # in the request body; replace them with your own credentials.
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615131255089&password=pppppppp&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login
        )

    def check_login(self, response):
        """Request the home page again, this time with the stored cookies."""
        yield Request(
            url='http://dig.chouti.com/',
            method='GET',
            callback=self.show,
            cookies=self.cookie_dict,
            # The home page was already requested once; skip the dupefilter.
            dont_filter=True
        )

    def show(self, response):
        """Vote for every item on the page and follow pagination links not yet requested."""
        news_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for news in news_list:
            link_id = news.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            if link_id is None:
                # No share-linkid on this item: a vote URL cannot be built.
                continue
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.do_favor
            )

        page_list = response.xpath(
            r'//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href'
        ).extract()
        for page in page_list:
            page_url = 'http://dig.chouti.com%s' % page
            key = hashlib.md5(page_url.encode('utf-8')).hexdigest()
            if key in self.has_request_set:
                continue
            self.has_request_set[key] = page_url
            yield Request(
                url=page_url,
                method='GET',
                callback=self.show
            )

    def do_favor(self, response):
        """Print the body returned by the vote request."""
        print(response.text)