該程序主要爲了抓取人人車賣車信息,包括車系、車型號、購車日期、賣車價格、行駛路程、首付價格等信息。話不多說,直接上代碼。
入庫以後將 MongoDB 裏的信息導出成 Excel 的語句:
mongoexport -d myDB -c user -f _id,name,password,adress --csv -o ./user.csv
-d 表示 數據庫
-c 表示 數據表
-f 須要提取的 field,用逗號分隔
-o 輸出路徑
車系 py 文件:
# -*- coding: utf-8 -*-
"""Spider that crawls renrenche.com brand / car-series index pages.

Stores every brand name in MongoDB (db ``renrenche``, collection
``Carclass``) with ``pid=None``, then follows each brand page and stores
its car series with ``pid`` set to the brand's ``_id``.  For every
series it also pushes the string ``"classid,url,pid"`` onto the Redis
list ``carurl`` so the detail spider (Carinfo2) can pick the URLs up.
"""
import re
from urllib.request import urlopen

from scrapy.http import Request
# from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy.selector import HtmlXPathSelector

client = pymongo.MongoClient(host="127.0.0.1")
db = client.renrenche
collection = db.Carclass  # classification collection (brands and series)

import redis  # Redis is used as a work queue between the two spiders

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class renrencheSpider(scrapy.Spider):
    name = "Carinfo1"
    allowed_domains = ["renrenche.com"]  # domains the spider may visit
    start_urls = [
        "https://www.renrenche.com/bj/ershouche/"
    ]

    def parse(self, response):
        """Parse the brand index page; invoked once per fetched page."""
        hxs = HtmlXPathSelector(response)
        hx = hxs.select('//div[@class="brand-more-content"]/div[@class="brand-section brand-section-1"]/p[@class="bl"]/span[@class="bn"]/a')
        for secItem in hx:
            url = secItem.select("@href").extract()
            c = "https://www.renrenche.com" + url[0]
            name = secItem.select("text()").extract()
            classid = self.insertMongo(name, None)
            print(c)
            print(name)
            # Bind classid as a lambda default so each callback keeps its
            # own pid (avoids the late-binding closure pitfall in loops).
            request = Request(c, callback=lambda response, pid=str(classid): self.parse_subclass(response, pid))
            yield request

    def parse_subclass(self, response, pid):
        """Parse one brand page; record each car series under *pid*."""
        hxs = HtmlXPathSelector(response)
        hx = hxs.select('//ul[@id="filter_series"]/li[@class=""]/a')
        for secItem in hx:
            urls = secItem.select("@href").extract()
            url = "https://www.renrenche.com" + urls[0]
            name = secItem.select("text()").extract()
            print(url)
            print(name)
            classid = self.insertMongo(name, pid)
            self.pushRedis(classid, url, pid)

    def insertMongo(self, classname, pid):
        """Insert one class document and return its new ``_id``.

        ``Collection.insert()`` is deprecated since PyMongo 3;
        ``insert_one`` preserves the original return value (the new
        ``_id``) via ``inserted_id``.
        """
        classid = collection.insert_one({'classname': classname, 'pid': pid}).inserted_id
        return classid

    def pushRedis(self, classid, url, pid):
        """Push ``"classid,url,pid"`` onto the Redis list ``carurl``."""
        carurl = '%s,%s,%s' % (classid, url, pid)
        r.lpush('carurl', carurl)
賣車各類信息 py 文件:
# -*- coding: utf-8 -*-
"""Spider that crawls the renrenche.com listing pages queued in Redis.

Consumes ``"classid,url,pid"`` entries from the Redis list ``carurl``
(produced by the Carinfo1 spider), fetches each listing page, and stores
model / purchase-year-and-mileage / price / down-payment strings in
MongoDB (db ``renrenche``, collection ``Carinfo``).
"""
import re
from urllib.request import urlopen

from scrapy.http import Request
import pymongo
import scrapy
from time import sleep
from scrapy.selector import HtmlXPathSelector

client = pymongo.MongoClient(host="127.0.0.1")
db = client.renrenche
collection = db.Carinfo

import redis  # Redis client: source of the crawl queue

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


class renrencheSpider(scrapy.Spider):
    name = "Carinfo2"
    allowed_domains = ["renrenche.com"]
    dict = {}        # url -> {"pid": parent series _id, "num": page counter}
    start_urls = []

    def __init__(self):
        # Load the work queue that the Carinfo1 spider pushed into Redis.
        a = r.lrange('carurl', 0, -1)
        for item in a:
            novelurl = bytes.decode(item)
            arr = novelurl.split(',')  # "classid,url,pid"
            renrencheSpider.start_urls.append(arr[1])
            pid = arr[0]
            url = arr[1]
            self.dict[url] = {"pid": pid, "num": 0}

    def parse(self, response):
        """Extract car model/year-mileage/price/down-payment from one page."""
        classInfo = self.dict[response.url]
        pid = classInfo['pid']
        num = classInfo['num']
        # NOTE(review): 'num' is never incremented anywhere, so this
        # guard can never trigger — confirm whether pagination depth
        # limiting was intended here.
        if num > 3:
            return None
        hxs = HtmlXPathSelector(response)
        hx = hxs.select('//ul[@class="row-fluid list-row js-car-list"]')
        s = ""
        # NOTE(review): the inner selectors below use absolute '//'
        # paths, so they match against the whole document (not just
        # secItem) and only name[0]/name[1] — i.e. the first car on the
        # page — ends up recorded.  Confirm whether all listings were
        # meant to be captured.
        for secItem in hx:
            hx1 = secItem.select('//li[@class="span6 list-item car-item"]/a[@rrc-event-param="search"]/h3')
            name = hx1.select("text()").extract()
            a = "型號:" + name[0]
            s += a + "\n"
        for secItem in hx:
            hx2 = secItem.select('//div[@class="mileage"]/span[@class="basic"]')
            name = hx2.select("text()").extract()
            b = "購車年份/千米數:" + name[0] + "/" + name[1]
            s += b + "\n"
        for secItem in hx:
            hx3 = secItem.select('//div[@class="tags-box"]/div[@class="price"]')
            name = hx3.select("text()").extract()
            c = str(name[0])
            c = c.strip()
            c = "賣車價格:" + c + "萬"
            s += c + "\n"
        for secItem in hx:
            hx4 = secItem.select('//div[@class="down-payment"]/div[@class="m-l"]')
            name = hx4.select("text()").extract()
            d = "首付:" + name[0] + "萬"
            s += d + "\n"
        arr = s.split('\n')
        print(arr[0])
        # The original source had this statement broken across a line
        # boundary ("classid = " / "self.insertMongo(...)"); rejoined.
        classid = self.insertMongo(arr[0], arr[1], arr[2], arr[3], pid)

    def insertMongo(self, classname, classname1, classname2, classname3, pid):
        """Insert one car-info document and return its new ``_id``.

        Uses ``insert_one`` (``insert`` is deprecated since PyMongo 3);
        ``inserted_id`` preserves the original return value.
        """
        classid = collection.insert_one({
            'classname': classname,
            'classname1': classname1,
            'classname2': classname2,
            'classname3': classname3,
            'pid': pid,
        }).inserted_id
        return classid