```python
# -*- coding: utf-8 -*-
import scrapy
import json
import csv

from milk.items import MilkItem


class MilkspiderSpider(scrapy.Spider):
    name = 'milkspider'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E7%89%9B%E5%A5%B6&enc=utf-8&suggest=3.def.0.V09--12s0,20s0,38s0&wq=%E8%BF%9B%E5%8F%A3&pvid=96ab0296e9ce494fb251b716911d93ec']
    data_list = []

    def parse(self, response):
        li_list = response.xpath('//li[@class="gl-item"]')
        for li in li_list:
            good_id = li.xpath('./@data-sku').get()  # relative XPath: search from the current <li>
            # print(good_id)
            shop_name = li.xpath('.//a[@class="curr-shop"]/text()').get()
            # print(shop_name)
            good_name = li.xpath('.//div[@class="p-name p-name-type-2"]/a/em/text()').getall()
            good_name = ','.join(good_name).strip().replace(",", "").replace("\n\t", "")
            # print(good_name)
            good_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href').get()
            if not good_url.startswith('https:'):
                good_url = 'https:' + good_url
            # print(good_url)
            good_price = li.xpath('.//div[@class="p-price"]/strong//text()').getall()
            good_price = ','.join(good_price).replace(",", "")
            # print(good_price)
            # The comment counts are not in the search-page source, so they cannot be
            # extracted here; they are fetched via the detail/comment requests below.
            item = MilkItem()
            item["shop_name"] = shop_name
            item["good_name"] = good_name
            item["good_price"] = good_price
            item["good_id"] = good_id
            item['good_url'] = good_url
            yield scrapy.Request(url=good_url, meta={"item": item}, callback=self.parse_detail)

    def parse_detail(self, response):
        # The comment data is loaded dynamically, so build the comment-summary URL
        # for this product and request it separately.
        item = response.meta['item']
        comment_info_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + item['good_id']
        # print(comment_info_url)
        yield scrapy.Request(url=comment_info_url, meta={"item": item}, callback=self.parse_comment)

    def parse_comment(self, response):
        item = response.meta['item']
        # response.body is bytes; decode it to str, then map the U+FFFD replacement
        # characters produced by errors='replace' back to 萬 ("ten thousand").
        body = response.body.decode('utf-8', 'replace')
        json_str = body.replace('\ufffd\ufffd', '萬')
        comment_data = json.loads(json_str)
        counts = comment_data['CommentsCount'][0]
        item['total_comment'] = counts['CommentCountStr']
        item['good_comment'] = counts['GoodCountStr']
        item['video_count'] = counts['VideoCountStr']
        item['general_count'] = counts['GeneralCountStr']
        item['poor_count'] = counts['PoorCountStr']
        self.data_list.append(item)
        # print(self.data_list)
        # Rewrite the CSV with everything collected so far.
        with open('./京東進口牛奶.csv', 'w', encoding='utf-8', errors='ignore', newline="") as csvfile:
            fieldnames = ['good_id', 'good_name', 'shop_name', 'good_url',
                          'total_comment', 'good_comment', 'video_count',
                          'general_count', 'poor_count', 'good_price']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.data_list)
        return self.data_list
```
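For this spider to get real search results, the Scrapy project has to skip robots.txt and send a browser-like User-Agent. Below is a minimal settings.py sketch; the original project's settings are not shown, so every value here is an illustrative assumption:

```python
# settings.py -- a minimal sketch; values are illustrative assumptions,
# not taken from the original project.
BOT_NAME = 'milk'
SPIDER_MODULES = ['milk.spiders']
NEWSPIDER_MODULE = 'milk.spiders'

# A desktop browser User-Agent is assumed to be needed for JD's search pages.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36')

ROBOTSTXT_OBEY = False   # assumption: robots.txt would otherwise filter these requests
DOWNLOAD_DELAY = 1       # be gentle with the site
LOG_LEVEL = 'WARNING'    # keep the console output readable
```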
items.py
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MilkItem(scrapy.Item):
    # define the fields for your item here like:
    good_id = scrapy.Field()
    good_name = scrapy.Field()
    shop_name = scrapy.Field()
    good_url = scrapy.Field()
    total_comment = scrapy.Field()
    good_comment = scrapy.Field()
    video_count = scrapy.Field()
    general_count = scrapy.Field()
    poor_count = scrapy.Field()
    good_price = scrapy.Field()
```
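Rewriting the whole CSV inside parse_comment for every item works, but the more idiomatic place for export in Scrapy is an item pipeline. The sketch below is a hypothetical MilkCsvPipeline, not part of the original project; with it, parse_comment would simply `yield item` instead of keeping data_list and writing the file itself, and the pipeline would be enabled with `ITEM_PIPELINES = {'milk.pipelines.MilkCsvPipeline': 300}` in settings.py.

```python
# pipelines.py -- hypothetical MilkCsvPipeline, sketched as an alternative to the
# CSV writing inside parse_comment; it is not part of the original project.
import csv


class MilkCsvPipeline:
    fieldnames = ['good_id', 'good_name', 'shop_name', 'good_url',
                  'total_comment', 'good_comment', 'video_count',
                  'general_count', 'poor_count', 'good_price']

    def open_spider(self, spider):
        # Open the output file once when the crawl starts and write the header row.
        self.file = open('./京東進口牛奶.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames)
        self.writer.writeheader()

    def process_item(self, item, spider):
        # Append each finished item as one CSV row and pass it on unchanged.
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.file.close()
```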
start.py
```python
from scrapy import cmdline

# Launch the spider from a script instead of typing the command in a terminal.
cmdline.execute("scrapy crawl milkspider".split())
```
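With this helper the crawl starts with `python start.py`, which is equivalent to running `scrapy crawl milkspider` from the project root; Scrapy's built-in feed export (`scrapy crawl milkspider -o milk.csv`) could also replace the manual CSV writing entirely if the spider just yielded its items.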