Crawler notes (written by myself)

scrapy

custom_settings

import scrapy

class XxSpider(scrapy.Spider):
    name = 'xxxx'
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'cl.pipelines.XxxxxPipeline': 400,
            'cl.pipelines.GifPipeline': 400
        }
    }
Using custom_settings
# Base class

class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """

    name = None
    custom_settings = None


    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')
How it works: update_settings is applied per spider, merging that spider's custom_settings into the project settings at 'spider' priority, which overrides project-level values.
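A minimal sketch of that merge, driving scrapy.settings.Settings directly (DemoSpider and the values are hypothetical, not from the notes above):

from scrapy.settings import Settings

class DemoSpider(object):
    # hypothetical spider stand-in; only custom_settings matters here
    custom_settings = {'DOWNLOAD_DELAY': 5}

settings = Settings({'DOWNLOAD_DELAY': 0.25})       # project-level value
settings.setdict(DemoSpider.custom_settings or {}, priority='spider')
print(settings.getfloat('DOWNLOAD_DELAY'))          # 5.0: 'spider' priority beats 'project'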

 

A pipeline that supports GIFs

import requests
import os
import hashlib
import time

class GifPipeline(object):
    def __init__(self, images_store, download_delay):
        self.download_delay = download_delay
        self.images_store = images_store
        # Make sure the target directory exists so open() below cannot fail.
        if not os.path.exists(images_store):
            os.makedirs(images_store)

    def process_item(self, item, spider):
        for url in item["img_urls"]:
            print(url)
            # Keep the original extension (a gif stays a gif); fall back to jpg.
            suffix = "jpg" if "." not in url else url.rsplit(".", 1)[-1]
            _file_name = hashlib.sha1(bytes(url, encoding="utf-8")).hexdigest()
            title = item["title"]
            file_name = '{}{}.{}'.format(title, _file_name, suffix)
            response = requests.get(url, stream=True)
            with open(os.path.join(self.images_store, file_name), mode='wb') as f:
                f.write(response.content)
            time.sleep(self.download_delay)
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Called once at startup to build the pipeline instance
        from the crawler's settings."""
        download_delay = crawler.settings.getfloat('DOWNLOAD_DELAY')
        images_store = crawler.settings.get('IMAGES_STORE')
        return cls(images_store, download_delay)

A custom pipeline, with a few modifications, supporting GIF.
My own GIF-supporting pipeline; rough.
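Scrapy's built-in ImagesPipeline re-encodes everything it downloads to JPEG, which destroys GIF animation; presumably that is why a hand-rolled pipeline is used here. For from_crawler above to find its two values, the settings (project-wide or via custom_settings) must define both keys. A minimal sketch, with placeholder values:

# settings.py (values are placeholders)
IMAGES_STORE = '/tmp/imgs'   # directory GifPipeline writes into
DOWNLOAD_DELAY = 1           # seconds slept between two image downloads

ITEM_PIPELINES = {
    'cl.pipelines.GifPipeline': 400,
}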
1. items.py




import scrapy


class HupuGifItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    hupu_image_url = scrapy.Field()
    images = scrapy.Field()

2. pipelines.py



# -*- coding: utf-8 -*-

from scrapy.pipelines.images import ImagesPipeline
from hupu_gif import settings
import requests
import os


class HupuGifPipeline(ImagesPipeline):
    # Inherits ImagesPipeline but overrides process_item entirely, so the
    # parent's download machinery is bypassed.
    def process_item(self, item, spider):
        # Nothing to do for items that carry no image URLs.
        if 'hupu_image_url' not in item:
            return item

        images = []
        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        for image_url in item['hupu_image_url']:
            us = image_url.split('/')[-1]
            file_path = '%s/%s' % (dir_path, us)
            images.append(file_path)
            if os.path.exists(file_path):
                continue
            with open(file_path, 'wb') as handle:
                # URLs appear to be protocol-relative ('//...'), hence the 'http:' prefix.
                response = requests.get('http:' + image_url, stream=True)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)

        item['images'] = images
        return item
Source: Lee007008, CSDN: https://blog.csdn.net/qaz2170/article/details/61417514

Written by someone else. I don't understand what the images field is for here; it is a list of file_paths, which I guess is left for a later pipeline to do follow-up persistence (see the sketch below).
Someone else's GIF-supporting pipeline, equally rough.
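If that guess is right, a downstream pipeline could persist the item together with those paths. A hypothetical sketch (JsonWriterPipeline and items.jl are my own names, not from the original post):

import json

class JsonWriterPipeline(object):
    # Runs after HupuGifPipeline (give it a higher ITEM_PIPELINES number),
    # so item['images'] already holds the saved file paths.
    def open_spider(self, spider):
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item)) + '\n')
        return item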

 

Other notes

Hotlink protection

Problem: the site refuses to serve resources referenced as an img src from an external page.

Solution: point the img src at your own backend, and have the backend issue the request to fetch the data. It may also need to send some cookie and Referer fields; a sketch follows.
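A minimal sketch of that backend, assuming Flask (any framework would do; the Referer value is a placeholder):

import requests
from flask import Flask, Response, request

app = Flask(__name__)

@app.route('/img')
def img_proxy():
    # The page's <img src="/img?url=..."> hits this endpoint instead of the
    # protected host; we re-request with the headers that host expects.
    url = request.args['url']
    headers = {
        'Referer': 'https://example.com/',  # placeholder: whatever the host checks
        'User-Agent': 'Mozilla/5.0',
    }
    upstream = requests.get(url, headers=headers, stream=True)
    return Response(upstream.iter_content(1024),
                    content_type=upstream.headers.get('Content-Type', 'image/jpeg'))

if __name__ == '__main__':
    app.run()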

 

Request headers

user-agent: which device/browser the current user claims to be
Referer: "xxx" # which url the request came from
content-type: application/json, or Content-Type: application/x-www-form-urlencoded
host: the target host, as in
  requests.get('www....')

cookies are the key part; a sketch pulling these together follows.
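A minimal sketch combining these headers and cookies with requests (all URLs and values are placeholders):

import requests

headers = {
    'User-Agent': 'Mozilla/5.0',            # what device/browser the client claims to be
    'Referer': 'https://example.com/list',  # the URL the request supposedly came from
}
cookies = {'sessionid': 'xxx'}              # often the decisive part

# data= sends Content-Type: application/x-www-form-urlencoded
requests.post('https://example.com/login', data={'user': 'a'},
              headers=headers, cookies=cookies)

# json= sends Content-Type: application/json
requests.post('https://example.com/api', json={'q': 'b'},
              headers=headers, cookies=cookies)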
