Scrapy distributed crawler - RedisSpider

Crawling Dangdang book information.

Multiple machines crawl at the same time and share a single Redis instance for bookkeeping, using scrapy_redis.

The requests waiting to be crawled are stored in Redis; each machine pops a request object (removing its record from Redis) and crawls it. This is what makes the spider distributed.
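For scrapy_redis to share the request queue and the duplicate-filter fingerprints between machines, the project settings must point the scheduler and dupefilter at Redis. Below is a minimal settings.py sketch; the Redis address is an assumption and should point at the shared Redis instance.

# settings.py (minimal sketch for scrapy_redis)
# Share one request queue and one set of dedup fingerprints via Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Keep the queue and fingerprints in Redis when the spider stops, so a crawl can resume.
SCHEDULER_PERSIST = True
# Assumed address of the shared Redis instance.
REDIS_URL = "redis://127.0.0.1:6379"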

import scrapy
from scrapy_redis.spiders import RedisSpider
from copy import deepcopy


class DangdangSpider(RedisSpider):
    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    # The spider reads its start URL(s) from this Redis key instead of a start_urls list.
    redis_key = "dangdang" # lpush dangdang 'http://book.dangdang.com/'

    def parse(self, response):
        # top-level categories
        div_list = response.xpath("//div[@class='con flq_body']/div")[:-4]
        print('top-level categories found:', len(div_list))
        for div in div_list:
            item = {}
            item['b_cate'] = div.xpath("./dl/dt//text()").extract()
            item['b_cate'] = [i.strip() for i in item['b_cate'] if len(i.strip()) > 0]  # filter out empty strings
            print('b_cate:', item['b_cate'])
            # mid-level categories
            if item['b_cate'] == ['創意文具']:
                print(item['b_cate'], "pass......")
                item['m_cate'] = None
                item['s_cate_url'] = div.xpath("./dl/dt/a/@ddt-src").extract_first()
                print('s_cate_url:', item['s_cate_url'])
                # yield scrapy.Request(
                #     item['s_cate_url'],
                #     callback=self.parse_special,
                #     meta={'item': deepcopy(item)}
                # )
            else:
                dl_list = div.xpath(".//dl[@class='inner_dl']")
                for dl in dl_list:
                    item['m_cate'] = dl.xpath("./dt//text()").extract()
                    item['m_cate'] = [i.strip() for i in item['m_cate'] if len(i.strip())>0]
                    # sub-categories
                    dd_list = dl.xpath("./dd")
                    for dd in dd_list:
                        item['s_cate'] = dd.xpath("./a/@title").extract_first()
                        item['s_cate_url'] = dd.xpath("./a/@ddt-src").extract_first()
                        # all books in this sub-category
                        if item['s_cate_url'] is not None:
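                            # deepcopy: the same item dict is mutated on later loop iterations, so each request carries its own snapshot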
                            yield scrapy.Request(
                                item['s_cate_url'],
                                callback=self.parse_books,
                                meta={'item': deepcopy(item)}
                            )

    def parse_special(self, response):
        ''' Stationery details (not implemented) '''
        pass

    def parse_books(self, response):
        item = response.meta['item']
        # books in the current sub-category
        li_list = response.xpath("//ul[@class='list_aa ']/li")
        if li_list:  # xpath() returns a SelectorList, never None; check for emptiness instead
            for li in li_list:
                try:
                    item['book_price'] = li.xpath(".//span[@class='num']/text()").extract_first() + \
                                         li.xpath(".//span[@class='tail']/text()").extract_first()
                except TypeError:
                    # extract_first() returned None for one of the price parts
                    item['book_price'] = 'Unknown'
                item['book_url'] = li.xpath("./a/@href").extract_first()
                if item['book_url'] is not None:
                    yield scrapy.Request(
                        item['book_url'],
                        callback=self.parse_book_detail,
                        meta={'item': deepcopy(item)}
                    )

    def parse_book_detail(self, response):
        item = response.meta['item']
        item['book_name'] = response.xpath("//div[@class='name_info']/h1/img/text()").extract_first()
        item['book_desc'] = response.xpath("//span[@class='head_title_name']/text()").extract_first()
        # detailed information for this book
        info_box = response.xpath("//div[@class='messbox_info']")
        item['book_author'] = info_box.xpath("./span[1]/a/text()").extract()  # there may be multiple authors
        item['publisher'] = info_box.xpath("./span[2]/a/text()").extract_first()
        item['pub_date'] = info_box.xpath("./span[3]/text()").extract_first()
        print(item)
        # yield item
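To run the distributed crawl, start the same spider on every machine with scrapy crawl dangdang; each process blocks until URLs appear under the dangdang key. The key can be seeded with redis-cli (the lpush command in the comment above) or from Python with redis-py. A minimal seeding sketch, assuming Redis listens on 127.0.0.1:6379:

import redis

# Connection details are assumptions; they should match REDIS_URL in settings.py.
r = redis.Redis(host='127.0.0.1', port=6379)
# Push the start URL onto the key the spider is listening on (redis_key = "dangdang").
r.lpush('dangdang', 'http://book.dangdang.com/')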