spiders.py
```python
# -*- coding: utf-8 -*-
import scrapy
from weather.items import WeatherItem
from scrapy.crawler import CrawlerProcess
import re

'''Crawl goods across multiple category levels.'''

class IgxSpider(scrapy.Spider):
    name = 'igx_result'
    allowed_domains = ['www.igxpt.com']
    # start_urls = ['http://www.igxpt.com/cate/192/']

    def start_requests(self):
        # The category ids are hardcoded here for brevity; you could also start
        # from the site's main page and crawl these category URLs, which would
        # add one more level of classification.
        start_urls = ['http://www.igxpt.com/cate/{}/'.format(str(i)) for i in range(192, 194)]
        for url in start_urls:
            yield scrapy.Request(url=url)

    def parse(self, response):
        # --- extract the total page count: start ---
        page = response.xpath('//div[@class="dataTables_paginate paging_simple_numbers"]/span/text()').extract_first()
        # matches "共N頁" ("N pages in total") in the pager text
        ret = re.search(r'共(\d+)頁', page)
        number = ret.group(1)
        page_link = response.xpath('//ul[@class="pagination"]/li/a/@href').extract_first()
        current_url = 'http://www.igxpt.com' + page_link.split('=')[0] + '='
        # --- extract the total page count: end ---

        clearfix = response.xpath('//ul[@class="shop-list-recommend mt20 clearfix"]/li')
        for li in clearfix:
            item = WeatherItem()
            item['name'] = li.xpath('./a/p[1]/text()').extract_first()
            url_img = li.xpath('./a/div/img/@src').extract_first()
            item['url'] = "http://www.igxpt.com" + url_img
            price_alia = li.xpath('./a/p[2]/span[@class="blue"]/text()').extract_first()
            item['price'] = price_alia + "元"  # append the currency unit (yuan)
            yield item

        # Build the URL for every page and recurse into parse(); Scrapy's
        # built-in duplicate filter keeps already-seen pages from being re-crawled.
        urls = [current_url + '{}'.format(str(i)) for i in range(1, int(number) + 1)]
        for se in urls:
            yield scrapy.Request(url=se, callback=self.parse)
```
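The spider imports `CrawlerProcess` but never uses it; that import only matters if you want to launch the crawl from a plain Python script instead of `scrapy crawl igx_result`. A minimal runner sketch follows — the module path `weather.spiders.igx` is an assumption, so adjust it to wherever the spider file actually lives:

```python
# Runner sketch: start the crawl from a script placed at the project root,
# so the 'weather' package and its settings module are importable.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from weather.spiders.igx import IgxSpider  # hypothetical module path

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # load settings.py
    process.crawl(IgxSpider)
    process.start()  # blocks until the crawl finishes
```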
items.py
```python
import scrapy


class WeatherItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    url = scrapy.Field()
    price = scrapy.Field()
```
pipelines.py
```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import urllib.request


class WeatherPipeline(object):
    def process_item(self, item, spider):
        name = item['name']
        url = item['url']
        price = item['price']
        connection = pymysql.connect(
            host='127.0.0.1',
            user='root',
            passwd='root',
            db='scrapy',
            # charset='utf8mb4',  # note: pymysql expects 'utf8'/'utf8mb4', not 'utf-8'
            cursorclass=pymysql.cursors.DictCursor
        )
        try:
            # '''download the image'''
            # imgname = url.split('/')[-1]
            # path = r"D:\Python\weather\weather\images\%s" % (imgname)
            # urllib.request.urlretrieve(url, filename=path)

            # insert the item into the database
            with connection.cursor() as cursor:
                sql = """INSERT INTO `goods_info_detail` (name, url, price)
                         VALUES (%s, %s, %s)"""
                cursor.execute(sql, (name, url, price))
            connection.commit()
        except pymysql.MySQLError as e:  # catch database errors, not ValueError
            print(e)
        finally:
            connection.close()
        return item
```
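The `INSERT` above assumes a `goods_info_detail` table already exists in the `scrapy` database. A one-off setup sketch is below; the column names come from the pipeline's `INSERT`, but the types and lengths are assumptions:

```python
# Setup sketch: create the target table once before running the crawl.
import pymysql

connection = pymysql.connect(host='127.0.0.1', user='root', passwd='root', db='scrapy')
try:
    with connection.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `goods_info_detail` (
                `id`    INT AUTO_INCREMENT PRIMARY KEY,
                `name`  VARCHAR(255),  -- goods name from the listing page
                `url`   VARCHAR(512),  -- absolute image URL
                `price` VARCHAR(64)    -- price string with the appended unit
            ) CHARACTER SET utf8mb4
        """)
    connection.commit()
finally:
    connection.close()
```

Note that `process_item` opens and closes a fresh connection for every item; for a small crawl like this it is tolerable, but moving the connect/close into `open_spider`/`close_spider` would avoid the per-item overhead.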
settings.py
```python
LOG_LEVEL = 'WARNING'
BOT_NAME = 'weather'
SPIDER_MODULES = ['weather.spiders']
NEWSPIDER_MODULE = 'weather.spiders'

'''pipelines'''
ITEM_PIPELINES = {
    'weather.pipelines.WeatherPipeline': 300,
}
```