spiders.py
```python
# -*- coding: utf-8 -*-
import scrapy
from weather.items import WeatherItem
from scrapy.crawler import CrawlerProcess
import re

'''Crawl goods across multiple category levels.'''

class IgxSpider(scrapy.Spider):
    name = 'igx_result'
    allowed_domains = ['www.igxpt.com']
    # start_urls = ['http://www.igxpt.com/cate/192/']

    def start_requests(self):
        # The category ids are hardcoded here for brevity; you could also start
        # from the site's main page and crawl these category URLs, which would
        # add one more level of classification.
        start_urls = ['http://www.igxpt.com/cate/{}/'.format(str(i)) for i in range(192, 194)]
        for url in start_urls:
            yield scrapy.Request(url=url)

    def parse(self, response):
        # --- extract the total page count: start ---
        page = response.xpath('//div[@class="dataTables_paginate paging_simple_numbers"]/span/text()').extract_first()
        # matches "共N頁" ("N pages in total") in the pager text
        ret = re.search(r'共(\d+)頁', page)
        number = ret.group(1)
        page_link = response.xpath('//ul[@class="pagination"]/li/a/@href').extract_first()
        current_url = 'http://www.igxpt.com' + page_link.split('=')[0] + '='
        # --- extract the total page count: end ---

        clearfix = response.xpath('//ul[@class="shop-list-recommend mt20 clearfix"]/li')
        for li in clearfix:
            item = WeatherItem()
            item['name'] = li.xpath('./a/p[1]/text()').extract_first()
            url_img = li.xpath('./a/div/img/@src').extract_first()
            item['url'] = "http://www.igxpt.com" + url_img
            price_alia = li.xpath('./a/p[2]/span[@class="blue"]/text()').extract_first()
            item['price'] = price_alia + "元"  # append the currency unit (yuan)
            yield item

        # Build the URL for every page and recurse into parse(); Scrapy's
        # built-in duplicate filter keeps already-seen pages from being re-crawled.
        urls = [current_url + '{}'.format(str(i)) for i in range(1, int(number) + 1)]
        for se in urls:
            yield scrapy.Request(url=se, callback=self.parse)
```
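The spider imports `CrawlerProcess` but never uses it; that import only matters if you want to launch the crawl from a plain Python script instead of `scrapy crawl igx_result`. A minimal runner sketch follows — the module path `weather.spiders.igx` is an assumption, so adjust it to wherever the spider file actually lives:

```python
# Runner sketch: start the crawl from a script placed at the project root,
# so the 'weather' package and its settings module are importable.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from weather.spiders.igx import IgxSpider  # hypothetical module path

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # load settings.py
    process.crawl(IgxSpider)
    process.start()  # blocks until the crawl finishes
```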
items.py
```python
import scrapy


class WeatherItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    url = scrapy.Field()
    price = scrapy.Field()
```
pipelines.py
```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import urllib.request


class WeatherPipeline(object):
    def process_item(self, item, spider):
        name = item['name']
        url = item['url']
        price = item['price']
        connection = pymysql.connect(
            host='127.0.0.1',
            user='root',
            passwd='root',
            db='scrapy',
            # charset='utf8mb4',  # note: pymysql expects 'utf8'/'utf8mb4', not 'utf-8'
            cursorclass=pymysql.cursors.DictCursor
        )
        try:
            # '''download the image'''
            # imgname = url.split('/')[-1]
            # path = r"D:\Python\weather\weather\images\%s" % (imgname)
            # urllib.request.urlretrieve(url, filename=path)

            # insert the item into the database
            with connection.cursor() as cursor:
                sql = """INSERT INTO `goods_info_detail` (name, url, price)
                         VALUES (%s, %s, %s)"""
                cursor.execute(sql, (name, url, price))
            connection.commit()
        except pymysql.MySQLError as e:  # catch database errors, not ValueError
            print(e)
        finally:
            connection.close()
        return item
```
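The `INSERT` above assumes a `goods_info_detail` table already exists in the `scrapy` database. A one-off setup sketch is below; the column names come from the pipeline's `INSERT`, but the types and lengths are assumptions:

```python
# Setup sketch: create the target table once before running the crawl.
import pymysql

connection = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='scrapy')
try:
    with connection.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `goods_info_detail` (
                `id`    INT AUTO_INCREMENT PRIMARY KEY,
                `name`  VARCHAR(255),  -- goods name from the listing page
                `url`   VARCHAR(512),  -- absolute image URL
                `price` VARCHAR(64)    -- price string with the appended unit
            ) CHARACTER SET utf8mb4
        """)
    connection.commit()
finally:
    connection.close()
```

Note that `process_item` opens and closes a fresh connection for every item; for a small crawl like this it is tolerable, but moving the connect/close into `open_spider`/`close_spider` would avoid the per-item overhead.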
settings.py
```python
LOG_LEVEL = 'WARNING'
BOT_NAME = 'weather'
SPIDER_MODULES = ['weather.spiders']
NEWSPIDER_MODULE = 'weather.spiders'

'''pipelines'''
ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300,
}
```