A quick summary of the Scrapy crawler workflow:
1. Create the project: `scrapy startproject book` (book is the project name)
2. Enter the project directory: `cd book`
3. Generate the spider file: `scrapy genspider jd jd.com` (jd is the spider name, jd.com the allowed domain)
4. Write the spider code
5. Run the spider: `scrapy crawl jd` (jd is the spider name)
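
For reference, step 3 drops a spider skeleton into book/spiders/jd.py, roughly like this (the exact template varies a little between Scrapy versions):

```python
# book/spiders/jd.py -- skeleton produced by `scrapy genspider jd jd.com` (approximate)
import scrapy


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    def parse(self, response):
        pass
```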
jd.py

```python
# -*- coding: utf-8 -*-
import scrapy
import copy, json


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    def parse(self, response):
        # Top-level category list
        dt_list = response.xpath('//div[@class="mc"]/dl/dt')
        for dt in dt_list:
            item = {}
            item["b_cate"] = dt.xpath("./a/text()").extract_first()
            # Sub-category list
            em_list = dt.xpath("./following-sibling::dd[1]/em")
            for em in em_list:
                item["s_href"] = "https:" + em.xpath("./a/@href").extract_first()
                item["s_cate"] = em.xpath("./a/text()").extract_first()
                yield scrapy.Request(
                    item["s_href"],
                    callback=self.parse_book_list,
                    # deepcopy: the same dict is mutated on every loop iteration and
                    # requests are handled asynchronously, so each one needs its own snapshot
                    meta={"item": copy.deepcopy(item)}
                )

    def parse_book_list(self, response):
        item = response.meta["item"]
        book_list = response.xpath("//div[@id='plist']//li")
        for book in book_list:
            # Cover image: lazy-loaded images keep the URL in data-lazy-img instead of src
            img_url = book.xpath(
                ".//div[@class='p-img']/a/img/@src").extract_first()
            if img_url is None:
                img_url = book.xpath(
                    ".//div[@class='p-img']/a/img/@data-lazy-img").extract_first()
            item["book_img"] = "https:" + img_url
            item["book_title"] = book.xpath(
                ".//div[@class='p-name']/a/em/text()").extract_first().strip()
            item["book_man"] = book.xpath(
                ".//div[@class='p-bookdetails']//span[@class='author_type_1']/a/text()").extract()
            item["book_publish_place"] = book.xpath(
                ".//div[@class='p-bookdetails']//span[@class='p-bi-store']/a/@title").extract_first()
            skuId = book.xpath(
                ".//div[@class='p-operate']/a/@data-sku").extract_first()
            # The price is not in the page HTML; fetch it from JD's price API by SKU id
            yield scrapy.Request(
                'https://p.3.cn/prices/mgets?skuIds=J_{}'.format(skuId),
                callback=self.parse_book_price_json,
                meta={"item": copy.deepcopy(item)}
            )
        # Pagination: follow the next-page link
        next_url = response.xpath("//a[@class='pn-next']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                "https:" + next_url,
                callback=self.parse_book_list,
                meta={"item": copy.deepcopy(item)}
            )

    def parse_book_price_json(self, response):
        item = response.meta["item"]
        item["book_price"] = json.loads(response.body.decode())[0]["p"]
        item.pop("s_href")
        yield item
```
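
parse_book_price_json assumes the price endpoint answers with a one-element JSON array whose "p" field holds the price. A standalone sketch of that parsing step, with a made-up sample payload (only the "p" field matters to the spider; the real response may carry extra fields):

```python
import json

# Hypothetical sample of what https://p.3.cn/prices/mgets?skuIds=J_<sku> returns.
sample_body = b'[{"id": "J_12345678", "p": "25.00", "m": "32.00"}]'

price = json.loads(sample_body.decode())[0]["p"]
print(price)  # -> "25.00" (a string; convert before doing arithmetic)
```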
pipelines.py

```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class BookPipeline(object):
    def open_spider(self, spider):
        # One MySQL connection per spider run
        self.client = pymysql.Connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='6028',
            db='jd_book',
            charset='utf8'
        )

    def process_item(self, item, spider):
        print(item)
        cursor = self.client.cursor()
        cursor.execute(
            "insert into book (b_cate, s_cate, book_img, book_title, book_man, "
            "book_publish_place, book_price) values (%s, %s, %s, %s, %s, %s, %s)",
            (item["b_cate"], item["s_cate"], item["book_img"], item["book_title"],
             item["book_man"][0] if item["book_man"] else None,
             item["book_publish_place"], item["book_price"]))
        self.client.commit()
        cursor.close()
        return item

    def close_spider(self, spider):
        self.client.close()
```
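
The pipeline expects a jd_book database with a book table to exist already. A one-off setup sketch with pymysql; the column types and lengths are my own assumptions, tune them to your data:

```python
import pymysql

# Create the jd_book.book table the pipeline writes to.
# Column names match the INSERT above; types/lengths are assumptions.
client = pymysql.Connect(host='localhost', port=3306, user='root',
                         passwd='6028', charset='utf8')
cursor = client.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS jd_book DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jd_book.book (
        id INT AUTO_INCREMENT PRIMARY KEY,
        b_cate VARCHAR(100),
        s_cate VARCHAR(100),
        book_img VARCHAR(500),
        book_title VARCHAR(500),
        book_man VARCHAR(200),
        book_publish_place VARCHAR(200),
        book_price VARCHAR(20)
    ) DEFAULT CHARACTER SET utf8
""")
client.commit()
cursor.close()
client.close()
```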
settings.py

```python
# -*- coding: utf-8 -*-
BOT_NAME = 'book'

SPIDER_MODULES = ['book.spiders']
NEWSPIDER_MODULE = 'book.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'

LOG_LEVEL = "WARNING"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# scrapy_redis: deduplicate requests and keep the scheduler queue in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://127.0.0.1:6379"

ITEM_PIPELINES = {
    'book.pipelines.BookPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
```
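
With the scrapy_redis scheduler and dupefilter enabled, the request queue and fingerprints live in Redis, so a stopped crawl can be resumed (SCHEDULER_PERSIST = True keeps them after the spider closes). A quick way to peek at what ends up in Redis, assuming the redis-py client and the default scrapy_redis key names:

```python
import redis

# Inspect what scrapy_redis keeps for the 'jd' spider. Default key names are
# jd:requests, jd:dupefilter, and jd:items (when RedisPipeline is enabled).
r = redis.Redis(host='127.0.0.1', port=6379)

for key in r.keys('jd:*'):
    t = r.type(key)
    if t == b'zset':        # pending request queue (priority queue by default)
        size = r.zcard(key)
    elif t == b'set':       # dupefilter request fingerprints
        size = r.scard(key)
    elif t == b'list':      # items serialized by RedisPipeline
        size = r.llen(key)
    else:
        size = '?'
    print(key, t, size)
```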
{"error":"pdos_captcha"}
不能爬的太快,或者要弄個代理池,這不我就被封了!!!
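
To avoid the ban, throttle the crawl and/or rotate proxies. A minimal sketch of both, assuming a hand-maintained proxy list (the PROXY_POOL entries below are placeholders, not working proxies):

```python
# settings.py -- throttle the crawl (starting values, tune for your target)
DOWNLOAD_DELAY = 2                  # seconds between requests to the same domain
RANDOMIZE_DOWNLOAD_DELAY = True     # jitter the delay (0.5x - 1.5x)
CONCURRENT_REQUESTS_PER_DOMAIN = 4
AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay to server latency
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 10

# middlewares.py -- minimal random-proxy downloader middleware (sketch)
import random

PROXY_POOL = [
    'http://127.0.0.1:8888',   # placeholder; plug in your own proxy source
]

class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        # Scrapy routes the request through whatever proxy is set in request.meta
        request.meta['proxy'] = random.choice(PROXY_POOL)

# and enable it in settings.py:
# DOWNLOADER_MIDDLEWARES = {'book.middlewares.RandomProxyMiddleware': 543}
```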