Python Scrapy - Instagram Crawler

Preface

After finishing that article in the morning, I slept through the afternoon; in the evening I wanted to try Scrapy and compare which of the two is faster. This was my first time using Scrapy to download images. The first time around I downloaded with requests... painfully slow, since it was single-threaded. Later I went through the docs and adapted the official example until it more or less worked. This article covers the pitfalls I ran into; the speed comparison of the two approaches is at the end.

Main content

I'll skip the site analysis; see the previous article.

First, create a new project:

➜  scrapy git:(master) ✗ scrapy startproject ins_crawl

Then generate the spider:

➜  scrapy git:(master) ✗ cd ins_crawl
➜  ins_crawl git:(master) ✗ scrapy genspider ins instagram.com

For readability, here is a tree of the project:

.
├── ins_crawl
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-37.pyc
│   │   ├── items.cpython-37.pyc
│   │   ├── middlewares.cpython-37.pyc
│   │   ├── pipelines.cpython-37.pyc
│   │   └── settings.cpython-37.pyc
│   ├── images
│   │   ├── InsImagesPipeline.py
│   │   ├── __init__.py
│   │   └── __pycache__
│   │       ├── InsImagesPipeline.cpython-37.pyc
│   │       └── __init__.cpython-37.pyc
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-37.pyc
│       │   ├── config.cpython-37.pyc
│       │   └── ins.cpython-37.pyc
│       ├── config.py
│       └── ins.py
└── scrapy.cfg

6 directories, 21 files

Open the ins_crawl/spiders/ins.py file; the code is below, pay attention to the comments:

# -*- coding: utf-8 -*-
import scrapy
import requests
import json
import logging

from urllib.parse import (urlencode, urljoin)
from ins_crawl.spiders.config import *  # I created a config.py in the same directory
from ins_crawl.items import InsCrawlItem

LOGGER = logging.getLogger(__name__)


class InsSpider(scrapy.Spider):
    name = 'ins'
    allowed_domains = ['instagram.com']
    start_urls = ['http://instagram.com/']

    def __init__(self, username='taeri__taeri', *args, **kwargs):
        """ :params username:用戶名,能夠在命令行傳參 """
        super(InsSpider, self).__init__(*args, **kwargs)
        self.username = username
        self.shared_data = self.get_shared_data()  # fetch shared_data
        
    def request(self, end_cursor, callback):
        """ request 方法,做用如其名 """
        url = urljoin(self.start_urls[0], 'graphql/query/') + '?'
        params = {
            'query_hash': 'f2405b236d85e8296cf30347c9f08c2a',
            'variables':
                '{{"id":"{0}","first":{1},"after":"{2}"}}'.format(
                    self.user_id, 50, end_cursor),
        }
        url = url + urlencode(params)
        request = scrapy.Request(url=url, callback=callback, meta={'proxy': 'http://127.0.0.1:8001'})  # a proxy is needed here, so pass it in via meta
        # add cookies
        request.cookies['csrftoken'] = CSRFTOKEN
        # add headers (this could actually be removed; I forgot it's already set in settings)
        request.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        return request
    
    def start_requests(self):
        """ 重寫start_requests方法 """
        # first get the user id and the photo count
        if self.shared_data is not None:
            user = self.shared_data['entry_data']['ProfilePage'][0]['graphql']['user']
            self.user_id = user['id']
            self.count = user['edge_owner_to_timeline_media']['count']
            LOGGER.info('\n{}\nUser id:{}\nTotal {} photos.\n{}\n'.format('-'*20, self.user_id, self.count, '-'*20))
            for i, url in enumerate(self.start_urls):
                yield self.request("", self.parse_item)
        else:
            LOGGER.error('-----[ERROR] shared_data is None.')

    def parse_item(self, response):
        j = json.loads(response.text)
        edge_media = j['data']['user']['edge_owner_to_timeline_media']
        edges = edge_media['edges']
        if edges:
            for edge in edges:
                item = InsCrawlItem()
                item['image_url'] = edge['node']['display_url']
                item['username'] = self.username
                yield item
            has_next_page = edge_media['page_info']['has_next_page']
            if has_next_page:
                end_cursor = edge_media['page_info']['end_cursor']
                yield self.request(end_cursor, self.parse_item)
            else:
                LOGGER.info('Finished fetching photos.')

    def get_shared_data(self):
        """ 獲取 shared data :return: """
        try:
            proxies = {
                'http': 'http://' + PROXY,
                'https': 'https://' + PROXY
            }
            with requests.get(self.start_urls[0] + self.username, proxies=proxies) as resp:
            # with scrapy.Request(self.start_urls[0] + self.username, meta={'proxy':'http://' + PROXY}) as resp:
                html = resp.text
                if html is not None and '_sharedData' in html:
                    shared_data = html.split("window._sharedData = ")[1].split(
                        ";</script>")[0]
                    if not shared_data:
                        print('Not found [share data]')
                        exit(1)
                    return json.loads(shared_data)
        except Exception as exc:
            LOGGER.error('[-----] %s', repr(exc))

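Since __init__ accepts a username, the target account can be switched from the command line through Scrapy's standard -a spider arguments, for example:

scrapy crawl ins -a username=taeri__taeri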

config.py

PROXY = '127.0.0.1:8001' # proxy
CSRFTOKEN = '' # the csrftoken from your cookies
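CSRFTOKEN is left empty here on purpose: fill it with the csrftoken value from your own instagram.com cookies (it can be copied out of the browser's developer tools).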

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class InsCrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_url = scrapy.Field() # photo URL
    username = scrapy.Field() # username (not actually used)


I didn't change pipelines.py, so I won't paste it.
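For reference, the untouched pipeline is just the pass-through stub that scrapy startproject generates, roughly:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class InsCrawlPipeline(object):
    def process_item(self, item, spider):
        return item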

InsImagesPipeline.py, adapted from the official media pipeline example:

import logging
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

LOGGER = logging.getLogger(__name__)


class InsImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info): # issue the image download request
        image_url = item['image_url']
        yield scrapy.Request(image_url, meta={'proxy': 'http://127.0.0.1:8001'})

    def item_completed(self, results, item, info): # handle the download results
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        print('-----[DOWNLOADED]', item['image_url'])
        return item


InsProxyMiddlewares.py

from ins_crawl.spiders.config import *

class InsProxyMiddlewares(object):

    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://' + PROXY # apply the proxy from config.py to every request
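Note that this middleware only takes effect if it is registered in DOWNLOADER_MIDDLEWARES. In the settings below I register the generated InsCrawlDownloaderMiddleware and apply the proxy per request via meta in the spider instead; if you wanted the middleware to handle the proxy globally, the registration would look roughly like this (assuming the class sits in ins_crawl/middlewares.py):

DOWNLOADER_MIDDLEWARES = {
    'ins_crawl.middlewares.InsProxyMiddlewares': 543,
}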

settings.py

BOT_NAME = 'scrapy'

SPIDER_MODULES = ['ins_crawl.spiders']
NEWSPIDER_MODULE = 'ins_crawl.spiders'

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

DOWNLOADER_MIDDLEWARES = {
   'ins_crawl.middlewares.InsCrawlDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
   'ins_crawl.pipelines.InsCrawlPipeline': 2,
   'ins_crawl.images.InsImagesPipeline.InsImagesPipeline':1,
}

# directory where downloaded images are stored
IMAGES_STORE = '/Users/2h0n91i2hen/Pictures/Instagram/'
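One thing to watch out for: the ImagesPipeline relies on Pillow for image processing, so it needs to be installed (pip install Pillow), otherwise no images will be saved.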

Running

The user taeri__taeri currently has 430 photos.

Crawling all 430 photos:

Scrapy took 87 seconds, i.e. 0.20232558139534884 seconds per photo.


asyncio+aiohttp took 21 seconds, i.e. 0.04883720930232558 seconds per photo.



After that I switched to another user, ponysmakeup, with 964 photos:

asyncio+aiohttp took 42.9 seconds, i.e. 0.04450207468879668 seconds per photo,


while Scrapy took 159.9 seconds, i.e. 0.1658713692946058 seconds per photo.


Summary

I had assumed Scrapy would be fast, but it turns out it can't beat the asyncio+aiohttp lineup. Next I plan to write a proxy pool with aiohttp+asyncio+aioredis; I'm budgeting about a week for it, though who knows how long it will actually take (:

Project: ins_crawl
