1. Target analysis:
The data we want to scrape is shown in the figure below:
1). The title of each book
2). The price of each book
3). The description of each book
2. Page analysis:
Site URL: http://e.dangdang.com/list-WY1-dd_sale-0-1.html
As shown in the figure below, every time we scroll to the bottom of the page, more data is loaded automatically while the URL does not change. Data loaded this way is loaded via AJAX.
Step 1: capture the requests made during loading with Fiddler and look for a pattern:
Figure 1: as shown below, scroll so that the data loads three times. The figure shows the requests Fiddler captured during those three loads; we compare three points of interest in the captured traffic.
Focus point 1: the pattern. The two captures below show that only start and end change between loads, that end - start = 20, and that for two consecutive loads the later start is the earlier start plus 21 (see the sketch after the two captures).
First data load:
Second data load:
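To make the arithmetic concrete, here is a minimal sketch (plain Python, independent of the Scrapy project; page_ranges and its defaults are illustrative names, and the assumption that the first AJAX load starts at 21 comes from the captures above) that reproduces the start/end sequence:

import itertools

# Each request covers 21 items (end - start = 20, inclusive),
# and consecutive requests advance start by 21.
def page_ranges(first_start=21):
    for start in itertools.count(first_start, 21):
        yield start, start + 20

print(list(itertools.islice(page_ranges(), 3)))
# [(21, 41), (42, 62), (63, 83)]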
Focus point 2 (first load): the parameters after the first question mark in the URL in the red box below are exactly the QueryString data from focus point 1.
Copy the red URL from the figure above here:
Paste that URL into Chrome (with the XPath plugin installed) and we get the JSON data below (the same data as in focus point 3). In other words, by sending requests to the URL captured above we can get the corresponding JSON data; the fields we want to scrape are marked with red boxes in the figure. A quick way to verify this is sketched below.
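The following one-off check (a sketch using the requests library; it assumes the endpoint and parameters still behave as in the capture, and the field names are taken from the JSON shown above) fetches a single page and prints the fields we are after:

import requests

# QueryString copied from the Fiddler capture; only start/end vary between pages
params = {
    'action': 'mediaCategoryLeaf', 'promotionType': '1',
    'deviceSerialNo': 'html5', 'macAddr': 'html5', 'channelType': 'html5',
    'permanentId': '20180603095934721824754968589520886',
    'returnType': 'json', 'channelId': '70000', 'clientVersionNo': '5.8.4',
    'platformSource': 'DDDS-P', 'fromPlatform': '106', 'deviceType': 'pconline',
    'token': '', 'start': '21', 'end': '41', 'category': 'SK',
    'dimension': 'dd_sale', 'order': '0',
}
resp = requests.get('http://e.dangdang.com/media/api.go', params=params)
for sale in resp.json()['data']['saleList']:
    media = sale['mediaList'][0]
    print(media['title'], media['salePrice'], media['coverPic'])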
3. The Scrapy code:
1). Project structure:
2). The book.py file
# -*- coding: utf-8 -*-
import json

import scrapy

from Ebook.items import EbookItem


class BookSpider(scrapy.Spider):
    name = "book"
    allowed_domains = ["dangdang.com"]
    # the parameter-less URL captured with Fiddler
    api_url = 'http://e.dangdang.com/media/api.go?'
    # request header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    }

    # override start_requests to send the first API request;
    # the initial value of the start parameter in the QueryString is 21
    def start_requests(self):
        yield self.make_request(start_num=21)

    # build the request for one page of data
    def make_request(self, start_num):
        # construct the QueryString; the end parameter is start + 20,
        # all other parameters are constant
        formdata = {
            'action': 'mediaCategoryLeaf',
            'promotionType': '1',
            'deviceSerialNo': 'html5',
            'macAddr': 'html5',
            'channelType': 'html5',
            'permanentId': '20180603095934721824754968589520886',
            'returnType': 'json',
            'channelId': '70000',
            'clientVersionNo': '5.8.4',
            'platformSource': 'DDDS-P',
            'fromPlatform': '106',
            'deviceType': 'pconline',
            'token': '',
            'start': str(start_num),
            'end': str(start_num + 20),
            'category': 'SK',
            'dimension': 'dd_sale',
            'order': '0',
        }
        # send the request, remembering which page it belongs to
        return scrapy.FormRequest(url=self.api_url, formdata=formdata,
                                  headers=self.headers, callback=self.parse,
                                  meta={'start_num': start_num})

    # parse handles the response, i.e. the JSON data we scraped
    def parse(self, response):
        data = json.loads(response.text)['data']['saleList']
        for each in data:
            item = EbookItem()
            info_dict = each['mediaList'][0]
            # book title
            item['book_title'] = info_dict['title']
            # book price
            item['book_price'] = info_dict['salePrice']
            # book description
            item['book_desc'] = info_dict['descs']
            # cover image URL
            item['book_coverPic'] = info_dict['coverPic']
            yield item
        # add 21 to start to request the next page, and stop
        # once the API returns an empty list
        if data:
            yield self.make_request(response.meta['start_num'] + 21)
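With the project in place, run the spider from the project root:

scrapy crawl book

The EbookPipeline configured below writes the scraped items to bookinfo.json, and EbookImagesPipeline saves the cover images under IMAGES_STORE.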
3). The items.py file
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EbookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title
    book_title = scrapy.Field()
    # price
    book_price = scrapy.Field()
    # description
    book_desc = scrapy.Field()
    # cover image URL
    book_coverPic = scrapy.Field()
4). The pipelines.py file (reference documentation for the ImagesPipeline class: http://scrapy-chs.readthedocs.io/zh_CN/1.0/topics/media-pipeline.html#scrapy.pipeline.images.ImagesPipeline.item_completed)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


# EbookImagesPipeline downloads the book cover images;
# it is a custom class inheriting from ImagesPipeline
class EbookImagesPipeline(ImagesPipeline):
    # the storage path for downloaded images, set in settings.py
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    # send the requests that download the images
    def get_media_requests(self, item, info):
        # get the cover image URL from the item
        image_url = item['book_coverPic']
        headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36'
        }
        yield scrapy.Request(image_url, headers=headers)

    # item_completed is called once all image requests for an item have finished
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_error) tuples:
        # success: boolean, whether the image was downloaded successfully
        # file_info_or_error: a dict with the keys
        #   url:      the URL the image was downloaded from
        #   path:     the storage path of the image
        #   checksum: the MD5 hash of the image content
        image_paths = [x['path'] for ok, x in results if ok]
        if image_paths:
            # rename the downloaded image after the book title
            os.rename(os.path.join(self.IMAGES_STORE, image_paths[0]),
                      os.path.join(self.IMAGES_STORE, item['book_title'] + '.jpg'))
        return item


# EbookPipeline saves the scraped data to a local file
class EbookPipeline(object):
    def __init__(self):
        # open the output file
        self.filename = open('bookinfo.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # serialize the item data to one line of JSON
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # write it to the file
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        # close the file
        self.filename.close()
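For reference, per the ImagesPipeline documentation linked above, the results argument for a successfully downloaded cover looks roughly like this (all values here are illustrative placeholders, not real data):

results = [
    (True, {
        'url': 'http://img60.ddimg.cn/some/cover.jpg',        # download URL (placeholder)
        'path': 'full/0a79c461a4062ac383dc4fade7bc09f1.jpg',  # path under IMAGES_STORE
        'checksum': '2b00042f7481c7b056c4b410d28f33cf',       # MD5 of the image content
    }),
]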
5). The settings.py file: configure it according to the needs of the project
# -*- coding: utf-8 -*-

# Scrapy settings for Ebook project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Ebook'

SPIDER_MODULES = ['Ebook.spiders']
NEWSPIDER_MODULE = 'Ebook.spiders'

# image storage path (added for this project)
IMAGES_STORE = '/home/python/Desktop/01-爬蟲/01-爬蟲0530/ajax形式加載/Ebook/Ebook/spiders/bookimage'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'

# log file and log level
LOG_FILE = '123.log'
LOG_LEVEL = 'INFO'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# download delay
DOWNLOAD_DELAY = 2.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Ebook.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Ebook.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Ebook.pipelines.EbookPipeline': 300,
    'Ebook.pipelines.EbookImagesPipeline': 350,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
4. Results:
1). The bookinfo.json file
2). The book cover images