汽車之家車型的簡單爬取
spider
# -*- coding: utf-8 -*-
"""Scrapy spider that collects car models from the autohome.com.cn brand index."""
import string
import sys

import scrapy
from scrapy import Request

from mininova.items import carItem

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8 so the
    # non-ASCII price unit below can be mixed with scraped unicode text.
    # Guarded so that importing this module on Python 3 does not fail on the
    # missing ``reload`` builtin.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')


class SplashSpider(scrapy.Spider):
    """Crawl the per-letter brand pages and emit one ``carItem`` per car model."""

    # Name Scrapy uses to identify this spider.
    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    # Left empty on purpose: the requests are generated in start_requests().
    start_urls = []

    # Spider-specific settings: route items through the car pipeline only.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial letter A-Z.

        Each request carries its own letter in ``meta['word']``. The URL and
        the letter are produced in the same loop iteration, so the letter
        always matches the page being fetched, and the class-level
        ``start_urls`` list is no longer mutated.
        """
        for word in string.ascii_uppercase:
            url = 'https://www.autohome.com.cn/grade/carhtml/' + word + '.html'
            yield Request(url, meta={'word': word})

    def parse(self, response):
        """Extract every car listed on one letter page.

        The page is read as a sequence of ``<dl>`` brand blocks. Inside a
        block, each ``div.h3-tit`` heading (sub-brand) is paired, in document
        order, with the ``ul.rank-list-ul`` list of cars that belongs to it.

        Args:
            response: the downloaded letter page; ``response.meta['word']``
                must hold the initial letter set by ``start_requests``.

        Returns:
            A list of ``carItem`` objects, one per car that has both a name
            and a link. Nodes missing those are skipped instead of raising.
        """
        print('url')
        print(response.url)
        word = response.meta['word']
        # Unit suffix that appears at the end of the price range text.
        price_base = '萬'
        total_cars = []

        for brand_node in response.xpath('//dl'):
            brand = brand_node.xpath('dt/div[1]/a/text()').extract_first()
            if brand is None:
                # Not a brand block (no brand link); nothing to collect here.
                continue
            print('brand:' + brand)
            brand_logo_url = brand_node.xpath('dt//img[1]/@src').extract_first()

            # Sub-brand headings and their car lists, matched by position.
            headings = brand_node.xpath('dd//div[@class="h3-tit"]/a')
            car_lists = brand_node.xpath('dd//ul[@class="rank-list-ul"]')
            for heading, car_list in zip(headings, car_lists):
                brand_item = heading.xpath('text()').extract_first(default='')
                brand_item_url = heading.xpath('@href').extract_first(default='')
                print('brand_item:' + brand_item)
                print('brand_item_url:' + brand_item_url)

                car_nodes = car_list.xpath('li[@id]')
                print('cars_count:' + str(len(car_nodes)))
                for car_node in car_nodes:
                    name = car_node.xpath('h4/a/text()').extract_first()
                    url = car_node.xpath('h4/a/@href').extract_first()
                    if name is None or url is None:
                        continue
                    # An absent price node is treated like an unparsable one.
                    price = car_node.xpath('div[1]/a/text()').extract_first(default='')
                    min_price, max_price = self._split_price(price, price_base)
                    print('car:' + name + ' max_price:' + str(max_price) + ' min_price:' + str(min_price) + ' price_base:' + price_base)

                    total_cars.append(carItem(
                        name=name,
                        url=url,
                        brand_item=brand_item,
                        first_word=word,
                        brand=brand,
                        brand_logo_url=brand_logo_url,
                        max_price=max_price,
                        min_price=min_price,
                    ))
        return total_cars

    @staticmethod
    def _split_price(price, price_base):
        """Split a "low-high<unit>" price text into ``(min_price, max_price)``.

        Anything that is not exactly two parts separated by ``-`` yields the
        "not available" placeholder for both values.
        """
        prices = price.split('-')
        if len(prices) != 2:
            return '暫無', '暫無'
        return str(prices[0]), str(prices[1].replace(price_base, ''))
item
# -*- coding: utf-8 -*-
import scrapy


class carItem(scrapy.Item):
    """One car model scraped from a brand index page."""

    # --- brand level ---
    # Initial letter of the brand
    first_word = scrapy.Field()
    # Brand name
    brand = scrapy.Field()
    # Brand logo image URL
    brand_logo_url = scrapy.Field()

    # --- sub-brand level ---
    # Sub-brand (brand sub-category) name
    brand_item = scrapy.Field()

    # --- car level ---
    # Car model name
    name = scrapy.Field()
    # URL of the car's introduction page
    url = scrapy.Field()
    # Lowest quoted price, unit: 萬 (ten thousand)
    min_price = scrapy.Field()
    # Highest quoted price, unit: 萬 (ten thousand)
    max_price = scrapy.Field()
mongo_car
from mininova.mongodb import Mongo
from mininova.settings import mongo_setting


class MongoCar():
    """Store scraped cars in MongoDB as brand / sub-brand / car documents.

    NOTE(review): ``Mongo.set(db_name, set_name)`` is a project helper that is
    not visible here; it is used as if it returns a pymongo-style collection
    (``find_one`` / ``insert_one`` / ``update_one``) - confirm.
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        """Open the connection described by ``mongo_setting``."""
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password'],
        )

    def insert(self, item):
        """Upsert the brand, then insert the sub-brand and car if missing.

        Args:
            item: mapping with the keys ``brand``, ``first_word``,
                ``brand_logo_url``, ``brand_item``, ``name``, ``url``,
                ``max_price`` and ``min_price``.

        Returns:
            True when a car document exists after the call, otherwise False.
        """
        brand_where = {'name': item['brand']}
        # The logo is part of the document on first insert too, not only
        # when an already-existing brand is refreshed.
        brand_fields = {
            'name': item['brand'],
            'first_word': item['first_word'],
            'logo_url': item['brand_logo_url'],
        }
        if self.brand_exist(self.db, brand_where) is False:
            brand = self.insert_brand(self.db, brand_fields)
            print('brand insert ok!')
        else:
            brand = self.update_brand(self.db, brand_where, brand_fields)
            print('brand_exist!')

        brand_item = self.brand_item_exist(self.db, {'name': item['brand_item']})
        if brand_item is False:
            brand_item = self.insert_brand_item(self.db, {
                'name': item['brand_item'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
            })
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        # A car is identified by its name within its sub-brand. The previous
        # filter repeated the 'name' key, so the sub-brand part was silently
        # overwritten and cars were matched by name alone.
        car_where = {'brand_item_id': brand_item['_id'], 'name': item['name']}
        car = self.car_exist(self.db, car_where)
        if car is False:
            car = self.insert_car(self.db, {
                'name': item['name'],
                'url': item['url'],
                'max_price': item['max_price'],
                'min_price': item['min_price'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
                'brand_item_id': brand_item['_id'],
            })
            print('car insert ok!')
        else:
            print('car_exist!')

        return car is not False

    def _find(self, db, set_name, where):
        """Return the first document matching ``where``, or False if none."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert_and_fetch(self, db, set_name, doc):
        """Insert ``doc`` and return it as read back from the collection."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """Apply ``brand`` as a ``$set`` update; return the document or False."""
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {'$set': brand})
        exist = my_set.find_one(brand_where)
        return False if exist is None else exist

    def brand_exist(self, db, brand):
        """Return the brand document matching ``brand``, or False."""
        return self._find(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return the stored document."""
        return self._insert_and_fetch(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the sub-brand document matching ``brand_item``, or False."""
        return self._find(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a sub-brand document and return the stored document."""
        return self._insert_and_fetch(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching ``car``, or False."""
        return self._find(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return the stored document."""
        return self._insert_and_fetch(db, self.car_set_name, car)
pipeline
from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
from mininova.mongo_car import MongoCar
import copy


class CarPipeline(object):
    """Item pipeline that writes every scraped car into MongoDB."""

    # Created on first use and then reused, so a new MongoCar (and the Mongo
    # connection its constructor opens) is not built for every single item.
    _mongo_car = None

    def process_item(self, item, spider):
        """Store ``item`` and pass it on.

        Args:
            item: the scraped car item.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``. Scrapy requires process_item to return the
            item so later pipeline stages and item signals still receive it.
        """
        if self._mongo_car is None:
            self._mongo_car = MongoCar()
        self._mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        return item
setting
# MongoDB connection parameters; the values below are placeholders.
mongo_setting = dict(
    mongo_host='xxx.xxx.xxx.xxx',
    mongo_port=27017,
    mongo_user='username',
    mongo_password='password',
)