爬取當當書籍信息
多臺機器同時爬取,共用一個 redis 記錄(使用 scrapy_redis)
待爬取的 request 對象儲存在 redis 中,每臺機器讀取 request 對象並刪除記錄,進行爬取,實現分佈式爬蟲。
from copy import deepcopy

import scrapy
from scrapy_redis.spiders import RedisSpider


def _strip_nonblank(texts):
    """Return the stripped form of every string in *texts* that is not blank."""
    stripped = (text.strip() for text in texts)
    return [text for text in stripped if text]


class DangdangSpider(RedisSpider):
    """Spider that walks the dangdang.com book category tree.

    Start URLs are not hard-coded; they are read from the redis key named by
    ``redis_key``. Seed a crawl with, for example::

        lpush dangdang 'http://book.dangdang.com/'
    """

    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    redis_key = "dangdang"

    def parse(self, response):
        """Walk the category menu and request every small-category listing.

        Yields one ``scrapy.Request`` (callback ``parse_books``) per small
        category that has a URL. A deep copy of the category fields collected
        so far (``b_cate``, ``m_cate``, ``s_cate``, ``s_cate_url``) travels
        with each request in ``meta['item']``.
        """
        # Top-level categories; the last four blocks are skipped.
        # NOTE(review): presumably those four are not book categories --
        # confirm against the live page.
        div_list = response.xpath("//div[@class='con flq_body']/div")[:-4]
        print(len(div_list), 'duoshao')

        for div in div_list:
            item = {}
            item['b_cate'] = _strip_nonblank(div.xpath("./dl/dt//text()").extract())
            print('b_cate:', item['b_cate'])

            # NOTE(review): this literal is Traditional Chinese; if the site
            # serves Simplified text the comparison never matches -- confirm.
            if item['b_cate'] == ['創意文具']:
                # Stationery category: its URL is extracted but nothing is
                # scheduled for it.
                print(item['b_cate'], "pass......")
                item['m_cate'] = None
                item['s_cate_url'] = div.xpath("./dl/dt/a/@ddt-src").extract_first()
                # Fixed: this used to print item['m_cate'] (always None)
                # under the 's_cate_url' label.
                print('s_cate_url:', item['s_cate_url'])
                # TODO: schedule a Request with callback=self.parse_special
                # once parse_special is implemented.
                continue

            # Middle categories.
            for dl in div.xpath(".//dl[@class='inner_dl']"):
                item['m_cate'] = _strip_nonblank(dl.xpath("./dt//text()").extract())

                # Small categories.
                for dd in dl.xpath("./dd"):
                    item['s_cate'] = dd.xpath("./a/@title").extract_first()
                    item['s_cate_url'] = dd.xpath("./a/@ddt-src").extract_first()
                    if item['s_cate_url'] is None:
                        continue
                    # urljoin leaves absolute URLs untouched and resolves
                    # relative or scheme-less ones, which scrapy.Request
                    # would otherwise reject.
                    yield scrapy.Request(
                        response.urljoin(item['s_cate_url']),
                        callback=self.parse_books,
                        # Copy: `item` keeps being mutated by later iterations.
                        meta={'item': deepcopy(item)},
                    )

    def parse_special(self, response):
        """Handle the stationery category page (not implemented)."""
        pass

    def parse_books(self, response):
        """Request the detail page of every book listed for a small category.

        Reads the category item from ``response.meta['item']``, adds
        ``book_price`` and ``book_url`` for each list entry, and yields one
        ``scrapy.Request`` (callback ``parse_book_detail``) per entry that has
        a link, carrying a deep copy of the item in ``meta['item']``.
        """
        item = response.meta['item']

        # The class literal ends with a space on purpose: @class='...' is an
        # exact string comparison in XPath.
        for li in response.xpath("//ul[@class='list_aa ']/li"):
            # The price is split across two spans.
            price_num = li.xpath(".//span[@class='num']/text()").extract_first()
            price_tail = li.xpath(".//span[@class='tail']/text()").extract_first()
            if price_num is None or price_tail is None:
                # Replaces a bare `except:` that hid every error; a missing
                # span is the only failure the concatenation is expected to hit.
                item['book_price'] = 'Unknown'
            else:
                item['book_price'] = price_num + price_tail

            item['book_url'] = li.xpath("./a/@href").extract_first()
            if item['book_url'] is None:
                continue
            yield scrapy.Request(
                response.urljoin(item['book_url']),
                callback=self.parse_book_detail,
                meta={'item': deepcopy(item)},
            )

    def parse_book_detail(self, response):
        """Fill in the per-book fields from a detail page and print the item.

        Adds ``book_name``, ``book_desc``, ``book_author`` (a list, since a
        book may have several authors), ``publisher`` and ``pub_date`` to the
        item carried in ``response.meta['item']``.
        """
        item = response.meta['item']

        # NOTE(review): text() is requested from an <img>, which has no text
        # children in HTML, so this likely always yields None -- confirm which
        # node actually carries the title.
        item['book_name'] = response.xpath(
            "//div[@class='name_info']/h1/img/text()"
        ).extract_first()
        item['book_desc'] = response.xpath(
            "//span[@class='head_title_name']/text()"
        ).extract_first()

        # Detail fields are the 1st/2nd/3rd <span> of the info box. Fixed: the
        # box's spans used to be selected first and ./span[n] queried relative
        # to them, which looks for spans nested inside spans.
        info_box = response.xpath("//div[@class='messbox_info']")
        item['book_author'] = info_box.xpath("./span[1]/a/text()").extract()
        item['publisher'] = info_box.xpath("./span[2]/a/text()").extract_first()
        item['pub_date'] = info_box.xpath("./span[3]/text()").extract_first()

        print(item)
        # TODO: yielding the item to the pipeline is disabled in the original
        # (`# yield item`); enable it once a pipeline is configured.