class XxSpider(scrapy.Spider):
    """Spider that routes its items through GifPipeline only.

    ITEM_PIPELINES is overridden per-spider via custom_settings, so this
    spider uses GifPipeline regardless of the project-wide pipeline config.
    """

    name = 'xxxx'
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'cl.pipelines.XxxxxPipeline': 400,  # default pipeline, disabled here
            'cl.pipelines.GifPipeline': 400,
        }
    }
# Base class (excerpt from scrapy's source, shown to explain how
# custom_settings from a spider ends up in the crawler settings).
class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from
    this class.
    """

    name = None
    custom_settings = None

    @classmethod
    def update_settings(cls, settings):
        # Merge the spider's custom_settings into the crawler settings at
        # 'spider' priority, so they override project-level values.
        settings.setdict(cls.custom_settings or {}, priority='spider')
import requests
import os
import hashlib
import time


# Custom pipeline, slightly modified (from the stock image pipeline) to
# also support .gif files.
class GifPipeline(object):
    """Download every URL in item["img_urls"] into IMAGES_STORE.

    Unlike scrapy's built-in ImagesPipeline, this fetches with `requests`
    and keeps the original file extension, so animated gifs survive
    (ImagesPipeline would re-encode them).
    """

    def __init__(self, images_store, download_delay):
        # Directory to write downloaded files into (IMAGES_STORE setting).
        self.images_store = images_store
        # Seconds to sleep between downloads (DOWNLOAD_DELAY setting).
        self.download_delay = download_delay

    def process_item(self, item, spider):
        """Download each image URL of the item, then return the item.

        File name is "<title><sha1-of-url>.<suffix>", so the same URL
        always maps to the same file; suffix falls back to "jpg" when the
        URL has no extension.
        """
        for url in item["img_urls"]:
            suffix = "jpg" if "." not in url else url.rsplit(".", 1)[-1]
            digest = hashlib.sha1(bytes(url, encoding="utf-8")).hexdigest()
            file_name = '{}{}.{}'.format(item["title"], digest, suffix)
            response = requests.get(url, stream=True)
            # Fail loudly on HTTP errors instead of writing an error page
            # to disk as if it were an image.
            response.raise_for_status()
            with open(os.path.join(self.images_store, file_name), mode='wb') as f:
                # Stream in chunks so a large gif is never held in memory
                # whole (the original read response.content, which defeats
                # stream=True).
                for chunk in response.iter_content(8192):
                    f.write(chunk)
            # Be polite to the remote server between downloads.
            time.sleep(self.download_delay)
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Construct the pipeline from crawler settings (called by scrapy)."""
        download_delay = crawler.settings.get('DOWNLOAD_DELAY')
        images_store = crawler.settings.get('IMAGES_STORE')
        return cls(images_store, download_delay)
# 1. items.py
import scrapy


class HupuGifItem(scrapy.Item):
    # URLs of the gif images scraped from the page.
    hupu_image_url = scrapy.Field()
    # Local file paths of the downloaded images, filled in by the pipeline.
    images = scrapy.Field()


# 2. pipelines.py
# -*- coding: utf-8 -*-
from scrapy.pipelines.images import ImagesPipeline
from hupu_gif import settings
import requests
import os


class HupuGifPipeline(ImagesPipeline):
    """Download each URL in item['hupu_image_url'] under IMAGES_STORE/<spider>."""

    def process_item(self, item, spider):
        # Items without image URLs pass through untouched.
        if 'hupu_image_url' not in item:
            return item
        saved_paths = []
        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for image_url in item['hupu_image_url']:
            # Use the last URL path segment as the local file name.
            file_path = '%s/%s' % (dir_path, image_url.split('/')[-1])
            saved_paths.append(file_path)
            # Skip files downloaded on a previous run.
            if os.path.exists(file_path):
                continue
            with open(file_path, 'wb') as handle:
                # URLs are scheme-relative ("//..."), so prefix "http:".
                response = requests.get('http:' + image_url, stream=True)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
        item['images'] = saved_paths
        return item


# Author: Lee007008, source: CSDN
# https://blog.csdn.net/qaz2170/article/details/61417514
# NOTE(review): someone else's code. item['images'] collects the saved
# file paths — presumably for a later pipeline to persist; TODO confirm.
問題:目標網站有防盜鏈,img 標籤的 src 不讓直接外鏈它的圖片資源。
解決:把 img 的 src 指向本身的後臺,再由後臺發請求獲取數據。後臺請求可能還須要帶上 cookie、referer 等字段。
User-Agent: 標識當前用戶使用的瀏覽器/設備
Referer: "xxx" # 從什麼url轉過來的
content-type: application/json,Content-Type:application/x-www-form-urlencoded
host
requests.get('http://www....')
app