Crawler notes (written by myself)

scrapy

custom_settings

import scrapy

class XxSpider(scrapy.Spider):
    name = 'xxxx'
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'cl.pipelines.XxxxxPipeline': 400,
            'cl.pipelines.GifPipeline': 400
        }
    }
Using custom_settings
# Base class

class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """

    name = None
    custom_settings = None


    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')
How it works: update_settings is applied per spider, merging that spider's custom_settings into the project settings at 'spider' priority, which overrides project-level values.
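A minimal sketch of that merge, driving scrapy.settings.Settings directly (DemoSpider and the values are hypothetical, not from the notes above):

from scrapy.settings import Settings

class DemoSpider(object):
    # hypothetical spider stand-in; only custom_settings matters here
    custom_settings = {'DOWNLOAD_DELAY': 5}

settings = Settings({'DOWNLOAD_DELAY': 0.25})       # project-level value
settings.setdict(DemoSpider.custom_settings or {}, priority='spider')
print(settings.getfloat('DOWNLOAD_DELAY'))          # 5.0: 'spider' priority beats 'project'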

 

A pipeline that supports GIFs

import requests
import os
import hashlib
import time

class GifPipeline(object):
    def __init__(self, images_store, download_delay):
        self.download_delay = download_delay
        self.images_store = images_store
        # Make sure the target directory exists so open() below cannot fail.
        if not os.path.exists(images_store):
            os.makedirs(images_store)

    def process_item(self, item, spider):
        for url in item["img_urls"]:
            print(url)
            # Keep the original extension (a gif stays a gif); fall back to jpg.
            suffix = "jpg" if "." not in url else url.rsplit(".", 1)[-1]
            _file_name = hashlib.sha1(bytes(url, encoding="utf-8")).hexdigest()
            title = item["title"]
            file_name = '{}{}.{}'.format(title, _file_name, suffix)
            response = requests.get(url, stream=True)
            with open(os.path.join(self.images_store, file_name), mode='wb') as f:
                f.write(response.content)
            time.sleep(self.download_delay)
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Called once at startup to build the pipeline instance
        from the crawler's settings."""
        download_delay = crawler.settings.getfloat('DOWNLOAD_DELAY')
        images_store = crawler.settings.get('IMAGES_STORE')
        return cls(images_store, download_delay)

A custom pipeline, with a few modifications, supporting GIF.
My own GIF-supporting pipeline; rough.
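Scrapy's built-in ImagesPipeline re-encodes everything it downloads to JPEG, which destroys GIF animation; presumably that is why a hand-rolled pipeline is used here. For from_crawler above to find its two values, the settings (project-wide or via custom_settings) must define both keys. A minimal sketch, with placeholder values:

# settings.py (values are placeholders)
IMAGES_STORE = '/tmp/imgs'   # directory GifPipeline writes into
DOWNLOAD_DELAY = 1           # seconds slept between two image downloads

ITEM_PIPELINES = {
    'cl.pipelines.GifPipeline': 400,
}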
1. items.py




import scrapy


class HupuGifItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    hupu_image_url = scrapy.Field()
    images = scrapy.Field()

2. pipelines.py



# -*- coding: utf-8 -*-

from scrapy.pipelines.images import ImagesPipeline
from hupu_gif import settings
import requests
import os


class HupuGifPipeline(ImagesPipeline):
    # Inherits ImagesPipeline but overrides process_item entirely, so the
    # parent's download machinery is bypassed.
    def process_item(self, item, spider):
        # Nothing to do for items that carry no image URLs.
        if 'hupu_image_url' not in item:
            return item

        images = []
        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        for image_url in item['hupu_image_url']:
            us = image_url.split('/')[-1]
            file_path = '%s/%s' % (dir_path, us)
            images.append(file_path)
            if os.path.exists(file_path):
                continue
            with open(file_path, 'wb') as handle:
                # URLs appear to be protocol-relative ('//...'), hence the 'http:' prefix.
                response = requests.get('http:' + image_url, stream=True)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)

        item['images'] = images
        return item
Source: Lee007008, CSDN: https://blog.csdn.net/qaz2170/article/details/61417514

Written by someone else. I don't understand what the images field is for here; it is a list of file_paths, which I guess is left for a later pipeline to do follow-up persistence (see the sketch below).
Someone else's GIF-supporting pipeline, equally rough.
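If that guess is right, a downstream pipeline could persist the item together with those paths. A hypothetical sketch (JsonWriterPipeline and items.jl are my own names, not from the original post):

import json

class JsonWriterPipeline(object):
    # Runs after HupuGifPipeline (give it a higher ITEM_PIPELINES number),
    # so item['images'] already holds the saved file paths.
    def open_spider(self, spider):
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item)) + '\n')
        return item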

 

Other notes

Hotlink protection

Problem: the site refuses to serve resources referenced as an img src from an external page.

Solution: point the img src at your own backend, and have the backend issue the request to fetch the data. It may also need to send some cookie and Referer fields; a sketch follows.
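A minimal sketch of that backend, assuming Flask (any framework would do; the Referer value is a placeholder):

import requests
from flask import Flask, Response, request

app = Flask(__name__)

@app.route('/img')
def img_proxy():
    # The page's <img src="/img?url=..."> hits this endpoint instead of the
    # protected host; we re-request with the headers that host expects.
    url = request.args['url']
    headers = {
        'Referer': 'https://example.com/',  # placeholder: whatever the host checks
        'User-Agent': 'Mozilla/5.0',
    }
    upstream = requests.get(url, headers=headers, stream=True)
    return Response(upstream.iter_content(1024),
                    content_type=upstream.headers.get('Content-Type', 'image/jpeg'))

if __name__ == '__main__':
    app.run()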

 

Request headers

user-agent: which device/browser the current user claims to be
Referer: "xxx" # which url the request came from
content-type: application/json, or Content-Type: application/x-www-form-urlencoded
host: the target host, as in
  requests.get('www....')

cookies are the key part; a sketch pulling these together follows.
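A minimal sketch combining these headers and cookies with requests (all URLs and values are placeholders):

import requests

headers = {
    'User-Agent': 'Mozilla/5.0',            # what device/browser the client claims to be
    'Referer': 'https://example.com/list',  # the URL the request supposedly came from
}
cookies = {'sessionid': 'xxx'}              # often the decisive part

# data= sends Content-Type: application/x-www-form-urlencoded
requests.post('https://example.com/login', data={'user': 'a'},
              headers=headers, cookies=cookies)

# json= sends Content-Type: application/json
requests.post('https://example.com/api', json={'q': 'b'},
              headers=headers, cookies=cookies)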
