```python
# -*- coding: utf-8 -*-
import scrapy
import json
import csv

from milk.items import MilkItem


class MilkspiderSpider(scrapy.Spider):
    name = 'milkspider'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E8%BF%9B%E5%8F%A3%E7%89%9B%E5%A5%B6&enc=utf-8&suggest=3.def.0.V09--12s0,20s0,38s0&wq=%E8%BF%9B%E5%8F%A3&pvid=96ab0296e9ce494fb251b716911d93ec']
    data_list = []

    def parse(self, response):
        li_list = response.xpath('//li[@class="gl-item"]')
        for li in li_list:
            good_id = li.xpath('./@data-sku').get()  # relative XPath: search from the current <li>
            # print(good_id)
            shop_name = li.xpath('.//a[@class="curr-shop"]/text()').get()
            # print(shop_name)
            good_name = li.xpath('.//div[@class="p-name p-name-type-2"]/a/em/text()').getall()
            good_name = ','.join(good_name).strip().replace(",", "").replace("\n\t", "")
            # print(good_name)
            good_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href').get()
            if not good_url.startswith('https:'):
                good_url = 'https:' + good_url
            # print(good_url)
            good_price = li.xpath('.//div[@class="p-price"]/strong//text()').getall()
            good_price = ','.join(good_price).replace(",", "")
            # print(good_price)
            # The comment counts are not in the search-page source, so they cannot be
            # extracted here; they are fetched via the detail/comment requests below.
            item = MilkItem()
            item["shop_name"] = shop_name
            item["good_name"] = good_name
            item["good_price"] = good_price
            item["good_id"] = good_id
            item['good_url'] = good_url
            yield scrapy.Request(url=good_url, meta={"item": item}, callback=self.parse_detail)

    def parse_detail(self, response):
        # The comment data is loaded dynamically, so build the comment-summary URL
        # for this product and request it separately.
        item = response.meta['item']
        comment_info_url = 'https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + item['good_id']
        # print(comment_info_url)
        yield scrapy.Request(url=comment_info_url, meta={"item": item}, callback=self.parse_comment)

    def parse_comment(self, response):
        item = response.meta['item']
        # response.body is bytes; decode it to str, then map the U+FFFD replacement
        # characters produced by errors='replace' back to 萬 ("ten thousand").
        body = response.body.decode('utf-8', 'replace')
        json_str = body.replace('\ufffd\ufffd', '萬')
        comment_data = json.loads(json_str)
        counts = comment_data['CommentsCount'][0]
        item['total_comment'] = counts['CommentCountStr']
        item['good_comment'] = counts['GoodCountStr']
        item['video_count'] = counts['VideoCountStr']
        item['general_count'] = counts['GeneralCountStr']
        item['poor_count'] = counts['PoorCountStr']
        self.data_list.append(item)
        # print(self.data_list)
        # Rewrite the CSV with everything collected so far.
        with open('./京東進口牛奶.csv', 'w', encoding='utf-8', errors='ignore', newline="") as csvfile:
            fieldnames = ['good_id', 'good_name', 'shop_name', 'good_url',
                          'total_comment', 'good_comment', 'video_count',
                          'general_count', 'poor_count', 'good_price']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(self.data_list)
        return self.data_list
```
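For this spider to get real search results, the Scrapy project has to skip robots.txt and send a browser-like User-Agent. Below is a minimal settings.py sketch; the original project's settings are not shown, so every value here is an illustrative assumption:

```python
# settings.py -- a minimal sketch; values are illustrative assumptions,
# not taken from the original project.
BOT_NAME = 'milk'
SPIDER_MODULES = ['milk.spiders']
NEWSPIDER_MODULE = 'milk.spiders'

# A desktop browser User-Agent is assumed to be needed for JD's search pages.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36')

ROBOTSTXT_OBEY = False   # assumption: robots.txt would otherwise filter these requests
DOWNLOAD_DELAY = 1       # be gentle with the site
LOG_LEVEL = 'WARNING'    # keep the console output readable
```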
items.py
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MilkItem(scrapy.Item):
    # define the fields for your item here like:
    good_id = scrapy.Field()
    good_name = scrapy.Field()
    shop_name = scrapy.Field()
    good_url = scrapy.Field()
    total_comment = scrapy.Field()
    good_comment = scrapy.Field()
    video_count = scrapy.Field()
    general_count = scrapy.Field()
    poor_count = scrapy.Field()
    good_price = scrapy.Field()
```
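Rewriting the whole CSV inside parse_comment for every item works, but the more idiomatic place for export in Scrapy is an item pipeline. The sketch below is a hypothetical MilkCsvPipeline, not part of the original project; with it, parse_comment would simply `yield item` instead of keeping data_list and writing the file itself, and the pipeline would be enabled with `ITEM_PIPELINES = {'milk.pipelines.MilkCsvPipeline': 300}` in settings.py.

```python
# pipelines.py -- hypothetical MilkCsvPipeline, sketched as an alternative to the
# CSV writing inside parse_comment; it is not part of the original project.
import csv


class MilkCsvPipeline:
    fieldnames = ['good_id', 'good_name', 'shop_name', 'good_url',
                  'total_comment', 'good_comment', 'video_count',
                  'general_count', 'poor_count', 'good_price']

    def open_spider(self, spider):
        # Open the output file once when the crawl starts and write the header row.
        self.file = open('./京東進口牛奶.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames)
        self.writer.writeheader()

    def process_item(self, item, spider):
        # Append each finished item as one CSV row and pass it on unchanged.
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.file.close()
```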
start.py
```python
from scrapy import cmdline

# Launch the spider from a script instead of typing the command in a terminal.
cmdline.execute("scrapy crawl milkspider".split())
```
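With this helper the crawl starts with `python start.py`, which is equivalent to running `scrapy crawl milkspider` from the project root; Scrapy's built-in feed export (`scrapy crawl milkspider -o milk.csv`) could also replace the manual CSV writing entirely if the spider just yielded its items.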