1. Target analysis:
The data we want to scrape is shown in the figure below:
1). The title of each book
2). The price of each book
3). The description of each book
2. Page analysis:
Site URL: http://e.dangdang.com/list-WY1-dd_sale-0-1.html
As shown in the figure below, every time we scroll to the bottom of the page, more data is loaded automatically while the URL does not change. Data loaded this way is loaded via AJAX.
Step 1: capture the requests made during loading with Fiddler and look for a pattern:
Figure 1: as shown below, scroll so that the data loads three times. The figure shows the requests Fiddler captured during those three loads; we compare three points of interest in the captured traffic.
Focus point 1: the pattern. The two captures below show that only start and end change between loads, that end - start = 20, and that for two consecutive loads the later start is the earlier start plus 21 (see the sketch after the two captures).
First data load:
Second data load:
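To make the arithmetic concrete, here is a minimal sketch (plain Python, independent of the Scrapy project; page_ranges and its defaults are illustrative names, and the assumption that the first AJAX load starts at 21 comes from the captures above) that reproduces the start/end sequence:

import itertools

# Each request covers 21 items (end - start = 20, inclusive),
# and consecutive requests advance start by 21.
def page_ranges(first_start=21):
    for start in itertools.count(first_start, 21):
        yield start, start + 20

print(list(itertools.islice(page_ranges(), 3)))
# [(21, 41), (42, 62), (63, 83)]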
Focus point 2 (first load): the parameters after the first question mark in the URL in the red box below are exactly the QueryString data from focus point 1.
Copy the red URL from the figure above here:
Paste that URL into Chrome (with the XPath plugin installed) and we get the JSON data below (the same data as in focus point 3). In other words, by sending requests to the URL captured above we can get the corresponding JSON data; the fields we want to scrape are marked with red boxes in the figure. A quick way to verify this is sketched below.
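The following one-off check (a sketch using the requests library; it assumes the endpoint and parameters still behave as in the capture, and the field names are taken from the JSON shown above) fetches a single page and prints the fields we are after:

import requests

# QueryString copied from the Fiddler capture; only start/end vary between pages
params = {
    'action': 'mediaCategoryLeaf', 'promotionType': '1',
    'deviceSerialNo': 'html5', 'macAddr': 'html5', 'channelType': 'html5',
    'permanentId': '20180603095934721824754968589520886',
    'returnType': 'json', 'channelId': '70000', 'clientVersionNo': '5.8.4',
    'platformSource': 'DDDS-P', 'fromPlatform': '106', 'deviceType': 'pconline',
    'token': '', 'start': '21', 'end': '41', 'category': 'SK',
    'dimension': 'dd_sale', 'order': '0',
}
resp = requests.get('http://e.dangdang.com/media/api.go', params=params)
for sale in resp.json()['data']['saleList']:
    media = sale['mediaList'][0]
    print(media['title'], media['salePrice'], media['coverPic'])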
3. The Scrapy code:
1). Project structure:
2). The book.py file
# -*- coding: utf-8 -*-
import json

import scrapy

from Ebook.items import EbookItem


class BookSpider(scrapy.Spider):
    name = "book"
    allowed_domains = ["dangdang.com"]
    # the parameter-less URL captured with Fiddler
    api_url = 'http://e.dangdang.com/media/api.go?'
    # request header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    }

    # override start_requests to send the first API request;
    # the initial value of the start parameter in the QueryString is 21
    def start_requests(self):
        yield self.make_request(start_num=21)

    # build the request for one page of data
    def make_request(self, start_num):
        # construct the QueryString; the end parameter is start + 20,
        # all other parameters are constant
        formdata = {
            'action': 'mediaCategoryLeaf',
            'promotionType': '1',
            'deviceSerialNo': 'html5',
            'macAddr': 'html5',
            'channelType': 'html5',
            'permanentId': '20180603095934721824754968589520886',
            'returnType': 'json',
            'channelId': '70000',
            'clientVersionNo': '5.8.4',
            'platformSource': 'DDDS-P',
            'fromPlatform': '106',
            'deviceType': 'pconline',
            'token': '',
            'start': str(start_num),
            'end': str(start_num + 20),
            'category': 'SK',
            'dimension': 'dd_sale',
            'order': '0',
        }
        # send the request, remembering which page it belongs to
        return scrapy.FormRequest(url=self.api_url, formdata=formdata,
                                  headers=self.headers, callback=self.parse,
                                  meta={'start_num': start_num})

    # parse handles the response, i.e. the JSON data we scraped
    def parse(self, response):
        data = json.loads(response.text)['data']['saleList']
        for each in data:
            item = EbookItem()
            info_dict = each['mediaList'][0]
            # book title
            item['book_title'] = info_dict['title']
            # book price
            item['book_price'] = info_dict['salePrice']
            # book description
            item['book_desc'] = info_dict['descs']
            # cover image URL
            item['book_coverPic'] = info_dict['coverPic']
            yield item
        # add 21 to start to request the next page, and stop
        # once the API returns an empty list
        if data:
            yield self.make_request(response.meta['start_num'] + 21)
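With the project in place, run the spider from the project root:

scrapy crawl book

The EbookPipeline configured below writes the scraped items to bookinfo.json, and EbookImagesPipeline saves the cover images under IMAGES_STORE.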
3). The items.py file
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EbookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title
    book_title = scrapy.Field()
    # price
    book_price = scrapy.Field()
    # description
    book_desc = scrapy.Field()
    # cover image URL
    book_coverPic = scrapy.Field()
4). The pipelines.py file (reference documentation for the ImagesPipeline class: http://scrapy-chs.readthedocs.io/zh_CN/1.0/topics/media-pipeline.html#scrapy.pipeline.images.ImagesPipeline.item_completed)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


# EbookImagesPipeline downloads the book cover images;
# it is a custom class inheriting from ImagesPipeline
class EbookImagesPipeline(ImagesPipeline):
    # the storage path for downloaded images, set in settings.py
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    # send the requests that download the images
    def get_media_requests(self, item, info):
        # get the cover image URL from the item
        image_url = item['book_coverPic']
        headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36'
        }
        yield scrapy.Request(image_url, headers=headers)

    # item_completed is called once all image requests for an item have finished
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_error) tuples:
        # success: boolean, whether the image was downloaded successfully
        # file_info_or_error: a dict with the keys
        #   url:      the URL the image was downloaded from
        #   path:     the storage path of the image
        #   checksum: the MD5 hash of the image content
        image_paths = [x['path'] for ok, x in results if ok]
        if image_paths:
            # rename the downloaded image after the book title
            os.rename(os.path.join(self.IMAGES_STORE, image_paths[0]),
                      os.path.join(self.IMAGES_STORE, item['book_title'] + '.jpg'))
        return item


# EbookPipeline saves the scraped data to a local file
class EbookPipeline(object):
    def __init__(self):
        # open the output file
        self.filename = open('bookinfo.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # serialize the item data to one line of JSON
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # write it to the file
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        # close the file
        self.filename.close()
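For reference, per the ImagesPipeline documentation linked above, the results argument for a successfully downloaded cover looks roughly like this (all values here are illustrative placeholders, not real data):

results = [
    (True, {
        'url': 'http://img60.ddimg.cn/some/cover.jpg',        # download URL (placeholder)
        'path': 'full/0a79c461a4062ac383dc4fade7bc09f1.jpg',  # path under IMAGES_STORE
        'checksum': '2b00042f7481c7b056c4b410d28f33cf',       # MD5 of the image content
    }),
]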
5). The settings.py file: configure it according to the needs of the project
# -*- coding: utf-8 -*-

# Scrapy settings for Ebook project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Ebook'

SPIDER_MODULES = ['Ebook.spiders']
NEWSPIDER_MODULE = 'Ebook.spiders'

# image storage path (added for this project)
IMAGES_STORE = '/home/python/Desktop/01-爬蟲/01-爬蟲0530/ajax形式加載/Ebook/Ebook/spiders/bookimage'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'

# log file and log level
LOG_FILE = '123.log'
LOG_LEVEL = 'INFO'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# download delay
DOWNLOAD_DELAY = 2.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Ebook.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'Ebook.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Ebook.pipelines.EbookPipeline': 300,
    'Ebook.pipelines.EbookImagesPipeline': 350,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
4. Results:
1). The bookinfo.json file
2). The book cover images