pip3 install Scrapy
process_request(request, spider)
process_response(request, response, spider)
process_exception(request, exception, spider)
process_spider_input(response, spider)
process_spider_output(response, result, spider)
process_spider_exception(response, exception, spider)
process_start_requests(start_requests, spider)
open_spider(spider)
close_spider(spider)
from_crawler(cls, crawler)
# Build a Selector directly from raw HTML text, then extract with xpath()/css().
# Bug fix: the class is `Selector` (capital S) — `from scrapy import selector`
# raises ImportError.
from scrapy import Selector

body = '...'
selector = Selector(text=body)
# Extract the content inside the <title> tag.
title = selector.xpath('//title/text()').extract_first()
print(title)
構建時傳入text參數,生成Selector對象,經過xpath(), css()等方法提取內容
# A Response exposes .selector; xpath() and css() queries can be chained on
# the resulting SelectorList.
result = response.selector.xpath('//a')
result
# XPath: the leading "." makes the query relative to the selected nodes.
# Bug fix: the original was missing the "." before extract_first().
result.xpath('.//a[@href="image1.html"]/text()').extract_first()
# CSS: attribute selectors use [href="..."] (not XPath's [@href=...]), and
# ::text selects the text node. Bug fix: the original mixed XPath syntax into
# the CSS query and had a mismatched bracket.
result.css('a[href="image1.html"]::text').extract_first()
scrapy startproject tutorial
tutorial/ scrapy.cfg tutorial/ __init__.py items.py pipelines.py settings.py spiders/ __init__.py ...
scrapy genspider quotes quotes.toscrape.com
import scrapy


class QuoteItem(scrapy.Item):
    """Container for one scraped quote.

    Fields are declared with scrapy.Field(); items behave like dicts.
    """
    # Original example fields.
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    # Consistency fix: the quotes spider and TextPipeline in this file
    # populate/read item['text'], item['author'] and item['tags'] — without
    # these declarations Scrapy raises KeyError on assignment.
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
scrapy crawl quotes
quotes = response.css('.quote') #選擇quote的區塊 for quote in quotes: item = QuoteItem() item['text'] = quote.css('.text::text').extract_first() item['author'] = quote.css('.author::text').extract_first() item['tags'] = quote.css('.tags .tag::text').extract() yield item
scrapy shell quotes.toscrape.com #能夠在命令行交互
next = response.css('.pager .next a::attr(href)').extract_first() url = response.urljoin(next) yield scrapy.Request(url=url, callback=self.parse)
scrapy crawl quotes -o quotes.json #也能夠保存成csv,xml等不一樣文件格式
from scrapy.exceptions import DropItem


class TextPipeline(object):
    """Item pipeline that truncates over-long quote text and drops empty items."""

    def __init__(self):
        # Maximum number of characters kept from item['text'].
        self.limit = 50

    def process_item(self, item, spider):
        """Truncate item['text'] to self.limit characters.

        Returns the (possibly truncated) item.
        Raises DropItem when the item has no 'text' value.
        """
        if item['text']:
            if len(item['text']) > self.limit:
                # Truncate at the limit and mark the cut with an ellipsis.
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        # Bug fix: DropItem must be *raised*, not returned — returning it
        # passes the exception object down the pipeline as if it were an item.
        raise DropItem('Missing Text')
##本系列內容爲《python3爬蟲開發實戰》學習筆記。本系列博客列表以下:
(零)學習路線
(一)開發環境配置
(二)爬蟲基礎
(三)基本庫使用
(四)解析庫使用
(五)數據存儲
持續更新...