#coding:utf8
# Crawl car data from Cheshi (網上車市) [http://www.cheshi.com/]
import requests, json, time, re, os, sys, urllib2, shutil, string
import threading
import MySQLdb
import redis
from pyquery import PyQuery as pq
from urlparse import urljoin
from selenium import webdriver

# Set the default encoding to utf-8 (Python 2)
reload(sys)
sys.setdefaultencoding("utf-8")

# Read all lines of a file
def getLines(filename):
    file_object = open(filename,'rb')
    lines = file_object.readlines()
    file_object.close()
    return lines

# Look up url_type_id by url_name.
# Each line of brand.ini appears to be 'url_type_id,category,name', e.g.
#   3000,品牌,奧迪
#   4000,奧迪,奧迪A6
def get_url_type_id(v_url_name):
    url_type_id = ''
    for line in getLines('/home/shutong/crawl/car/script/brand.ini'):
        line = line.strip()
        url_name = line.split(',')[2]
        if v_url_name.strip() == url_name.strip():
            url_type_id = line.split(',')[0]
            return url_type_id
    return url_type_id

class ResultData(object):
    '''A single crawled record.'''
    def __init__(self,industry_id,url_type_id,url_name,url_value,web_type_id,web_name,date_id):
        self.industry_id = industry_id
        self.url_type_id = url_type_id
        self.url_name = url_name
        self.url_value = url_value
        self.web_type_id = web_type_id
        self.web_name = web_name
        self.date_id = date_id

    def __str__(self):
        return '^'.join([self.industry_id,self.url_type_id,self.url_name,self.url_value,
                         self.web_type_id,self.web_name,self.date_id])

class Base(object):
    '''Base class for saving records to a file.'''
    def __init__(self,dev_prd_flag):
        self.dev_prd_flag = dev_prd_flag

    # Join the fields with '^' and print them (dev) or append them to the file (prd)
    def _saveContext(self,filename,*fields):
        sep = '^'
        context = fields[0]
        for i in fields[1:]:
            context = context + sep + str(i)
        # normalise full-width punctuation to ASCII
        context = str(context).replace('（','(').replace('）',')').replace('，',',').replace('：',':')
        if self.dev_prd_flag != 'prd':
            print context
        else:
            # strip leading/trailing whitespace from the file path
            filename = filename.strip()
            # create the parent directory if it does not exist
            path = os.path.dirname(filename)
            if not os.path.exists(path):
                os.makedirs(path)
            # append the record to the file
            fp = open(filename,'a')
            fp.write(context+'\n')
            fp.close()

    def saveData(self,filename,result_data):
        if result_data.url_type_id:
            self._saveContext(filename,result_data.industry_id,result_data.url_type_id,result_data.url_name,
                              result_data.url_value,result_data.web_type_id,result_data.web_name,result_data.date_id)
        else:
            # records without a url_type_id are kept in Redis for later inspection
            r = redis.Redis(host='192.168.122.140',port=6379,db=0)
            r.sadd('errorList',result_data.industry_id+'^'+result_data.url_name+'^'+result_data.url_value)

    def __str__(self):
        return 'Base class for saving records'

class Crawl(Base):
    '''Base crawler class.'''
    driver = None

    def __init__(self,name,dev_prd_flag):
        super(Crawl,self).__init__(dev_prd_flag)
        self.name = name
        #self.driver = self.init_driver()

    def init_driver(self):
        '''Initialise the headless PhantomJS browser.'''
        ua = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.3 Safari/537.36"
        cap = webdriver.DesiredCapabilities.PHANTOMJS
        cap["phantomjs.page.settings.resourceTimeout"] = 20000
        cap["phantomjs.page.settings.loadImages"] = True
        cap["phantomjs.page.settings.disk-cache"] = True
        cap["phantomjs.page.settings.userAgent"] = ua
        cap["phantomjs.page.customHeaders.User-Agent"] = ua
        cap["phantomjs.page.customHeaders.Referer"] = "http://tj.ac.10086.cn/login/"
        driver = webdriver.PhantomJS(executable_path='/home/shutong/phantomjs/bin/phantomjs',
                                     desired_capabilities=cap,
                                     service_args=['--ignore-ssl-errors=true'])
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        self.driver = driver

    # Fetch page HTML, via PhantomJS if it has been started, otherwise via urllib2
    def getHtml(self,url,code='utf-8'):
        html = ''
        try:
            if self.driver:
                self.driver.get(url)
                html = self.driver.page_source
            else:
                headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
                request = urllib2.Request(url,headers=headers)
                response = urllib2.urlopen(request,data=None,timeout=60)
                if code:
                    if code == 'gbk':
                        html = unicode(response.read(),'gbk')
                    else:
                        html = unicode(response.read(),str(code))
        except:
            pass
        finally:
            return html

    def __del__(self):
        '''Destructor: close the browser if it was opened.'''
        if self.driver:
            self.driver.quit()
            print "Browser closed"
        else:
            print "Browser was never opened"

    def __str__(self):
        return "Base crawler class"

# Legacy crawler for newcar.xcar.com.cn; not called by the main flow
def start_crawl(url):
    # connect to Redis
    r = redis.Redis(host='192.168.122.140',port=6379,db=0)
    urllist = []
    # NOTE: the original referenced an undefined `crawl`; a local instance is created here
    crawl = Crawl('xcar','dev')
    html = crawl.getHtml(url,'gbk')
    d = pq(html)
    for a in d('a'):
        a = pq(a)
        try:
            url_value = urljoin(url,a.attr('href'))
            name = a.text()
            # series pages look like http://newcar.xcar.com.cn/162/
            if re.match(r'http://newcar.xcar.com.cn/[0-9]{1,10}/$', url_value, re.M|re.I):
                #print url_value,name
                #urllist.append(url_value)
                #r.sadd('urllist',url_value)
                pass
            elif re.match(r'http://newcar.xcar.com.cn/m[0-9]{1,10}/$',url_value,re.M|re.I):
                # store the URL in Redis
                r.sadd('urllist',url_value)
        except:
            pass
    for index in list(set(urllist)):
        print index
        try:
            #return start_crawl(index)
            pass
        except:
            pass

def start_wscs_crawl(url):
    # environment flag: 'dev' prints records, 'prd' writes them to the output file
    flag = 'prd'
    # industry id for the automobile sector
    industry_id = '004004'
    # web_type_id: 0 = PC site
    web_type_id = '0'
    # site name
    web_name = '網上車市'
    crawl = Crawl('網上車市',flag)
    # start the headless browser (disabled; plain HTTP requests are used instead)
    #crawl.init_driver()
    html = crawl.getHtml(url)
    d = pq(html)
    for div in d('div').filter('.list-box'):
        div = pq(div)
        # brand name and brand URL (left column)
        brand = div('div').filter('.lb').find('span').text()
        brand_url = urljoin(url,div('div').filter('.lb')('a').attr('href'))
        url_type_id = '3000'
        url_name = brand
        url_value = brand_url
        # save the brand record
        resultData = ResultData(industry_id,url_type_id,url_name,url_value,web_type_id,web_name,date_id)
        crawl.saveData(filename,resultData)

        # brand name and URL (right column heading)
        brand = div('div').filter('.rb')('dl')('dt')('a').text().replace('>>','')
        brand_url = urljoin(url,div('div').filter('.rb')('dl')('dt')('a').attr('href'))
        url_type_id = '3000'
        url_name = brand
        url_value = brand_url
        # save the brand record
        resultData = ResultData(industry_id,url_type_id,url_name,url_value,web_type_id,web_name,date_id)
        crawl.saveData(filename,resultData)

        for dd in div('div').filter('.rb')('dl')('dd'):
            dd = pq(dd)
            car_name = dd('div').filter('.con')('h4').text()
            car_url = urljoin(url,dd('div').filter('.con')('h4')('a').attr('href'))
            url_type_id = get_url_type_id(car_name)
            url_name = car_name
            url_value = car_url
            # save the car-series record
            resultData = ResultData(industry_id,url_type_id,url_name,url_value,web_type_id,web_name,date_id)
            crawl.saveData(filename,resultData)
            # TODO: build full car entities: brand, sub-brand, series name, price, image URL, site name

# Launch one crawl thread per A-Z letter page
def start_multi_crawl():
    urls = []
    for word in string.uppercase:
        #url = 'http://www.autohome.com.cn/grade/carhtml/%s.html' % (word)
        url = 'http://product.cheshi.com/static/selectcar/%s.html?t=1519713137030' % (word)
        urls.append(url)
    # create the threads
    threads = []
    for i in range(len(urls)):
        t = threading.Thread(target=start_wscs_crawl,args=(urls[i],))
        threads.append(t)
    # start the threads, then wait for them all to finish
    for i in range(len(urls)):
        threads[i].start()
    for i in range(len(urls)):
        threads[i].join()

# run date (e.g. 20180227) and output file path come from the command line
#filename = '/home/shutong/crawl/car/script/wscs.csv'
#date_id = '20180227'
date_id = sys.argv[1]
filename = sys.argv[2]
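
# A record written in 'prd' mode is one '^'-separated line per brand or car series.
# Illustrative example only (the values below are hypothetical; the field order
# follows Crawl.saveData):
#   industry_id^url_type_id^url_name^url_value^web_type_id^web_name^date_id
#   004004^3000^奧迪^http://product.cheshi.com/...^0^網上車市^20180227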

# single-page test run
#url = 'http://product.cheshi.com/static/selectcar/B.html?t=1519713137030'
#start_wscs_crawl(url)

# multi-threaded run over all letter pages
start_multi_crawl()
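
# Example invocation (the script name is hypothetical; date_id and the output
# path are read from sys.argv above, the path matching the commented example):
#   python wscs_crawl.py 20180227 /home/shutong/crawl/car/script/wscs.csv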