scrapy學習之路2(圖片下載與下載的路徑獲取)

圖片下載和拿到下載後的路徑

1

items.py

import scrapy

class InfoItem(scrapy.Item):
    """Container for the metadata scraped from one video detail page.

    Populated in JxxxSpider.parse_info; the *_path fields are filled in
    later by the images pipeline (see settings.py), not by the spider.
    """
    url = scrapy.Field()               # detail-page URL
    url_object_id = scrapy.Field()     # MD5 of url (see utils/common.get_md5), stable primary key
    small_image = scrapy.Field()       # small cover image URL (from the listing page)
    small_image_path = scrapy.Field()  # local path after pipeline download
    big_image = scrapy.Field()         # large cover image URL(s) — stored as a list for the images pipeline
    big_image_path = scrapy.Field()    # local path after pipeline download
    code = scrapy.Field()              # video identification code
    date = scrapy.Field()              # release date
    lengths = scrapy.Field()           # runtime/length
    author = scrapy.Field()            # director; "不明" (unknown) when absent
    cate = scrapy.Field()              # comma-joined genre names
    av_artor = scrapy.Field()          # comma-joined performer names

spider/jxxx.py

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from scrapy.http import Request
from JaSpider.items import InfoItem
from JaSpider.utils.common import get_md5


class JxxxSpider(scrapy.Spider):
    """Crawl the listing pages of www.jxxx.com and scrape each video's detail page.

    parse()      -> listing page: one Request per video + pagination.
    parse_info() -> detail page: yields a populated InfoItem.
    """
    name = 'jxxx'
    allowed_domains = ['www.jxxx.com']
    start_urls = ['http://www.jxxx.com/cn/vl_update.php']

    def parse(self, response):
        """Yield a detail-page request for every video on a listing page, then follow pagination."""
        for video in response.css('.video'):
            # Small cover image URL, forwarded to parse_info via meta.
            small_image = video.css('img::attr(src)').extract_first()
            link = video.css('a::attr(href)').extract_first()
            if not link:
                continue  # malformed entry: no detail link to follow
            real_url = parse.urljoin(response.url, link)  # absolute detail-page URL
            yield Request(url=real_url, meta={'small_image': small_image}, callback=self.parse_info)
        # Follow the "next page" link when one exists.
        next_url = response.css('.page_selector .page.next::attr(href)').extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_info(self, response):
        """Scrape one detail page into an InfoItem."""
        # Image URLs on the site are scheme-relative ("//host/..."); add the
        # scheme only when a value was actually extracted, instead of crashing
        # on "http:" + None as the original code did.
        small_image = response.meta.get('small_image')
        small_image = "http:" + small_image if small_image else None
        big_image = response.xpath('//div[@id="video_jacket"]/img/@src').extract_first()
        big_image = "http:" + big_image if big_image else None

        code = response.css('#video_id .text::text').extract_first()
        date = response.css('#video_date .text::text').extract_first()
        lengths = response.css('#video_length .text::text').extract_first()
        # Extract once; fall back to "不明" (unknown) for missing/empty director.
        author = response.css('#video_director .director a::text').extract_first() or "不明"
        # Extract each name once per element (the original called extract_first() twice).
        genre_names = (g.css('a::text').extract_first() for g in response.css('#video_genres .genre'))
        cate = ','.join(name for name in genre_names if name)
        star_names = (s.css('a::text').extract_first() for s in response.css('.star'))
        av_artor = ','.join(name for name in star_names if name)

        info_item = InfoItem()
        info_item['url'] = response.url
        info_item['url_object_id'] = get_md5(response.url)
        info_item['small_image'] = small_image
        # The images pipeline expects a list of URLs; never hand it [None].
        info_item['big_image'] = [big_image] if big_image else []
        info_item['code'] = code
        info_item['date'] = date
        info_item['lengths'] = lengths
        info_item['author'] = author
        info_item['cate'] = cate
        info_item['av_artor'] = av_artor
        yield info_item

2

打開pipeline功能 settings.py

clipboard.png
注意:
spider/jxxx.py
clipboard.png

3

如要進一步定製功能
settings.py
clipboard.png

pipeline.py

clipboard.png

4

補充
新建utils/common.py

import hashlib


def get_md5(url):
    """Return the hex MD5 digest of *url*.

    Accepts either str (UTF-8 encoded before hashing) or bytes, so the
    same URL hashes identically regardless of which form the caller has.
    """
    data = url.encode("utf-8") if isinstance(url, str) else url
    return hashlib.md5(data).hexdigest()


if __name__ == "__main__":
    # Manual smoke test: print the digest of a sample URL.
    print(get_md5('http://www.haddu.com'))
相關文章
相關標籤/搜索