一.環境 OS:win10
python:3.7
scrapy:1.3.2
pymongo:3.2
二.先上效果圖
python
能夠看到咱們爬下京東的指定產品信息 包含 產品名 店鋪 價格 產品參數數據大部分 都拿到了哦 很詳細了
上一些代碼: 代碼是又臭又長 哈哈哈
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrapy spider for JD.com.

Crawl flow: category directory -> category/list pages -> product detail
pages -> price / promotion / comment JSON endpoints.

All item classes (CategoriesItem, ShopItem, ProductsItem,
CommentSummaryItem, HotCommentTagItem, CommentItem, CommentImageItem) are
declared in JDSpider.items; each carries an ``_id`` field, presumably used
as the MongoDB primary key downstream (TODO confirm against the pipeline).
"""
import re
import logging
import json

import requests
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from JDSpider.items import *

# Category filter used in parse_category.
# NOTE(review): the code compares the URL *subdomain* (e.g. 'list') against
# these Chinese keywords — only the empty-string entries can ever match, so
# the intended comparison may have been against the link text; verify.
key_word = ['筆記本電腦', '筆記本', '', '', '']
Base_url = 'https://list.jd.com'
# JSON endpoints fetched with plain requests.get (synchronous).
price_url = 'https://p.3.cn/prices/mgets?skuIds=J_'
comment_url = 'https://club.jd.com/comment/productPageComments.action?productId=%s&score=0&sortType=5&page=%s&pageSize=10'
favourable_url = 'https://cd.jd.com/promotion/v2?skuId=%s&area=1_72_2799_0&shopId=%s&venderId=%s&cat=%s'


class JDSpider(Spider):
    """Crawl JD.com starting from the full category directory page."""

    name = "JDSpider"
    allowed_domains = ["jd.com"]
    start_urls = ['https://www.jd.com/allSort.aspx']
    # The requests library logs every HTTP call at INFO; silence it.
    logging.getLogger("requests").setLevel(logging.WARNING)

    # ------------------------------------------------------------------ #
    # Field-name tables for the comment JSON API.  Every name listed here
    # is copied 1:1 from the JSON payload into the matching item field;
    # keeping them in one place removes ~150 duplicated assignment lines
    # that previously existed in parse_comments AND parse_comments2.
    # ------------------------------------------------------------------ #

    # Copied from data['productCommentSummary'].
    _SUMMARY_FIELDS = (
        'goodRateShow', 'poorRateShow', 'poorCountStr', 'averageScore',
        'generalCountStr', 'showCount', 'showCountStr', 'goodCount',
        'generalRate', 'generalCount', 'skuId', 'goodCountStr', 'poorRate',
        'afterCount', 'goodRateStyle', 'poorCount', 'skuIds',
        'poorRateStyle', 'generalRateStyle', 'commentCountStr',
        'commentCount', 'productId', 'afterCountStr', 'goodRate',
        'generalRateShow',
    )
    # Copied from the top-level comment payload.
    _SUMMARY_DATA_FIELDS = (
        'jwotestProduct', 'maxPage', 'score', 'soType', 'imageListCount',
    )
    # Copied from each entry of data['hotCommentTagStatistics'].
    _HOT_TAG_FIELDS = (
        'name', 'status', 'rid', 'productId', 'count', 'created',
        'modified', 'type', 'canBeFiltered',
    )
    # Copied from each entry of data['comments'].
    _COMMENT_FIELDS = (
        'guid', 'content', 'creationTime', 'isTop', 'referenceId',
        'referenceName', 'referenceType', 'referenceTypeId',
        'firstCategory', 'secondCategory', 'thirdCategory', 'replyCount',
        'score', 'status', 'title', 'usefulVoteCount', 'uselessVoteCount',
        'userLevelId', 'userProvince', 'viewCount', 'orderId',
        'isReplyGrade', 'nickname', 'userClient', 'mergeOrderStatus',
        'discussionId', 'productColor', 'productSize', 'imageCount',
        'integral', 'userImgFlag', 'anonymousFlag', 'userLevelName',
        'plusAvailable', 'recommend', 'userLevelColor', 'userClientShow',
        'isMobile', 'days', 'afterDays',
    )
    # Copied from each entry of a comment's 'images' list.
    _IMAGE_FIELDS = (
        'associateId', 'productId', 'available', 'pin', 'dealt',
        'imgTitle', 'isMain',
    )

    def start_requests(self):
        """Seed the crawl with the category directory page(s)."""
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse_category)

    def parse_category(self, response):
        """Category directory page.

        For every <a> link: recurse into non-list pages, and for list pages
        yield a CategoriesItem plus a request handled by parse_list.
        """
        selector = Selector(response)
        try:
            texts = selector.xpath(
                '//div[@class="category-item m"]/div[@class="mc"]'
                '/div[@class="items"]/dl/dd/a').extract()
            for text in texts:
                items = re.findall(
                    r'<a href="(.*?)" target="_blank">(.*?)</a>', text)
                for item in items:
                    # '//list.jd.com/...' -> 'list' (scheme-less href)
                    subdomain = item[0].split('.')[0][2:]
                    if subdomain in key_word:
                        if subdomain != 'list':
                            # Not a list page yet: keep descending.
                            yield Request(url='https:' + item[0],
                                          callback=self.parse_category)
                        else:
                            categoriesItem = CategoriesItem()
                            categoriesItem['name'] = item[1]
                            categoriesItem['url'] = 'https:' + item[0]
                            # cat id, e.g. ...list.html?cat=670,671,672
                            categoriesItem['_id'] = \
                                item[0].split('=')[1].split('&')[0]
                            yield categoriesItem
                            yield Request(url='https:' + item[0],
                                          callback=self.parse_list)
        except Exception as e:
            print('error:', e)

    def parse_list(self, response):
        """Product list page: request each product page and the next page."""
        # cat id from the list URL; passed down so products know their category
        meta = {'category': response.url.split('=')[1].split('&')[0]}
        selector = Selector(response)
        texts = selector.xpath(
            '//*[@id="plist"]/ul/li/div/div[@class="p-img"]/a').extract()
        for text in texts:
            items = re.findall(r'<a target="_blank" href="(.*?)">', text)
            yield Request(url='https:' + items[0],
                          callback=self.parse_product, meta=meta)
        # Follow pagination.
        next_list = response.xpath('//a[@class="pn-next"]/@href').extract()
        if next_list:
            yield Request(url=Base_url + next_list[0],
                          callback=self.parse_list)

    def _extract_shop_name(self, response, shop_id):
        """Best-effort seller-name extraction.

        JD uses several page templates, so a list of selectors is tried in
        order.  shop_id '0' means the product is sold by JD itself.
        """
        if shop_id == '0':
            return '京東自營'
        nodes = response.xpath(
            '//ul[@class="parameter2 p-parameter-list"]/li/a//text()'
        ).extract()
        if nodes:
            return nodes[0]  # original code did not strip() this variant
        for xp in ('//div[@class="name"]/a//text()',
                   '//div[@class="shopName"]/strong/span/a//text()',
                   '//div[@class="seller-infor"]/a//text()'):
            nodes = response.xpath(xp).extract()
            if nodes:
                return nodes[0].strip()
        return u'京東自營'

    def parse_product(self, response):
        """Product detail page.

        Yields a ShopItem and a ProductsItem, then requests the first page
        of comments.  Price and promotion data come from JSON endpoints
        fetched synchronously with requests.get (NOTE: this blocks the
        reactor; scheduling them as scrapy Requests would parallelize).
        """
        category = response.meta['category']
        # vender/shop ids live in an inline <script>; the quoting around
        # shopId differs between templates, hence two regex attempts.
        ids = re.findall(r"venderId:(.*?),\s.*?shopId:'(.*?)'",
                         response.text)
        if not ids:
            ids = re.findall(r"venderId:(.*?),\s.*?shopId:(.*?),",
                             response.text)
        vender_id = ids[0][0]
        shop_id = ids[0][1]

        # --- shop --------------------------------------------------------
        shopItem = ShopItem()
        shopItem['shopId'] = shop_id
        shopItem['venderId'] = vender_id
        shopItem['url1'] = 'http://mall.jd.com/index-%s.html' % (shop_id)
        try:
            shopItem['url2'] = 'https:' + response.xpath(
                '//ul[@class="parameter2 p-parameter-list"]/li/a/@href'
            ).extract()[0]
        except IndexError:
            shopItem['url2'] = shopItem['url1']
        name = self._extract_shop_name(response, shop_id)
        shopItem['name'] = name
        shopItem['_id'] = name
        yield shopItem

        # --- product -----------------------------------------------------
        productsItem = ProductsItem()
        productsItem['shopId'] = shop_id
        productsItem['category'] = category
        try:
            title = response.xpath(
                '//div[@class="sku-name"]/text()'
            ).extract()[0].replace(u"\xa0", "").strip()
        except Exception:
            # Older page template keeps the title elsewhere.
            title = response.xpath('//div[@id="name"]/h1/text()').extract()[0]
        productsItem['name'] = title
        # e.g. https://item.jd.com/123456.html -> '123456'
        product_id = response.url.split('/')[-1][:-5]
        productsItem['_id'] = product_id
        productsItem['url'] = response.url
        # Parameter table flattened into one ';'-separated string.
        desc = response.xpath(
            '//ul[@class="parameter2 p-parameter-list"]//text()').extract()
        productsItem['description'] = ';'.join(i.strip() for i in desc)

        # price — bound to a fresh name; the original rebound `response`
        # here, shadowing the scrapy response argument.
        price_resp = requests.get(url=price_url + product_id)
        price_json = price_resp.json()
        productsItem['reallyPrice'] = price_json[0]['p']
        productsItem['originalPrice'] = price_json[0]['m']

        # promotions / coupons
        res_url = favourable_url % (product_id, shop_id, vender_id,
                                    category.replace(',', '%2c'))
        fav_data = requests.get(res_url).json()
        if fav_data['skuCoupon']:
            desc1 = []
            for item in fav_data['skuCoupon']:
                desc1.append(u'有效期%s至%s,滿%s減%s' % (
                    item['beginTime'], item['endTime'],
                    item['quota'], item['discount']))
            productsItem['favourableDesc1'] = ';'.join(desc1)
        if fav_data['prom'] and fav_data['prom']['pickOneTag']:
            desc2 = [item['content']
                     for item in fav_data['prom']['pickOneTag']]
            # TODO(review): this overwrites the coupon text stored above;
            # it was probably meant to go into a separate favourableDesc2
            # field — cannot change without seeing ProductsItem's fields.
            productsItem['favourableDesc1'] = ';'.join(desc2)

        yield productsItem
        yield Request(url=comment_url % (product_id, '0'),
                      callback=self.parse_comments,
                      meta={'product_id': product_id})

    def _emit_comment_items(self, data, product_id):
        """Yield every item found in one page of the comment JSON API.

        Emits one CommentSummaryItem, then HotCommentTagItems, then
        CommentItems with their CommentImageItems.  Shared by
        parse_comments and parse_comments2.
        """
        commentSummary = data.get('productCommentSummary')
        summary = CommentSummaryItem()
        for field in self._SUMMARY_FIELDS:
            summary[field] = commentSummary.get(field)
        for field in self._SUMMARY_DATA_FIELDS:
            summary[field] = data.get(field)
        # Same value as the ProductsItem id.
        summary['_id'] = commentSummary.get('productId')
        yield summary

        for hotComment in data['hotCommentTagStatistics']:
            tag = HotCommentTagItem()
            tag['_id'] = hotComment.get('id')
            for field in self._HOT_TAG_FIELDS:
                tag[field] = hotComment.get(field)
            yield tag

        for comment_item in data['comments']:
            comment = CommentItem()
            comment['_id'] = comment_item.get('id')
            comment['productId'] = product_id
            for field in self._COMMENT_FIELDS:
                comment[field] = comment_item.get(field)
            # Scheme-less in the JSON; prefix it.  A missing key raises
            # TypeError here, exactly like the original concatenation.
            comment['userImage'] = 'http://' + comment_item.get('userImage')
            comment['userImageUrl'] = ('http://'
                                       + comment_item.get('userImageUrl'))
            yield comment

            if 'images' in comment_item:
                for image in comment_item['images']:
                    img = CommentImageItem()
                    img['_id'] = image.get('id')
                    img['imgUrl'] = 'http:' + image.get('imgUrl')
                    # NOTE: image 'productId' is NOT the ProductsItem id
                    # (observed value 0); 'associateId' matches the
                    # comment's discussionId.
                    for field in self._IMAGE_FIELDS:
                        img[field] = image.get(field)
                    yield img

    def parse_comments(self, response):
        """First comment page: yield its items, then schedule the rest.

        Remaining pages (capped at 60 — JD serves no more) go through
        parse_comments2 so pagination is triggered only once per product.
        """
        try:
            data = json.loads(response.text)
        except Exception as e:
            print('get comment failed:', e)
            return
        product_id = response.meta['product_id']
        for item in self._emit_comment_items(data, product_id):
            yield item
        # next pages; page 0 is this response, so start at 1
        max_page = int(data.get('maxPage', '1'))
        if max_page > 60:
            max_page = 60
        for i in range(1, max_page):
            yield Request(url=comment_url % (product_id, str(i)),
                          callback=self.parse_comments2,
                          meta={'product_id': product_id})

    def parse_comments2(self, response):
        """Subsequent comment pages: same extraction, no re-pagination."""
        try:
            data = json.loads(response.text)
        except Exception as e:
            print('get comment failed:', e)
            return
        product_id = response.meta['product_id']
        for item in self._emit_comment_items(data, product_id):
            yield item
完整代碼備份上傳到GitHub上了
其餘相關爬蟲案例能夠在blog看到 嘻嘻 以爲對你有幫助就點個贊
想了解更多的歡迎用下面的聯繫方式聯繫我 (contact me)
加好友請備註: 數據 VX: umakedown 郵箱: 404487132@qq.com github: https://github.com/ChanJeff123
轉載註明出處[瑪祖採集]