Preface:
There are already plenty of tutorials on the internet about using Python 3 to scrape car data from Autohome (mainly the basic specifications, configuration options, colour options and interior options). They broadly fall into two approaches:
1. Fetch the page of a given car series, use regular expressions to pull out the obfuscated data object and the obfuscated JS, evaluate the obfuscated JS with PyV8 to recover the real characters, and then match those characters back into the data object. For the details see this blog post: https://www.cnblogs.com/my8100/p/js_qichezhijia.html (thanks to this author for the first half of the idea).
2. Fetch the page of a given car series, use regular expressions to pull out the obfuscated data object and the obfuscated JS, and then handle the obfuscated JS by hand. The obfuscated functions fall into roughly eight categories (no-argument functions returning a constant, no-argument functions returning another function, identity functions that return their argument, constants or functions used directly during string concatenation, and so on). Each category is parsed out with its own regular expression and substituted one by one, which again yields an ordered string; replacing the placeholders in the data object with that string turns every span back into readable Chinese. For the details see this blog post: https://www.cnblogs.com/dyfblog/p/6753251.html (thanks to this author for the first half of the idea). A minimal illustration of the obfuscation itself follows below.
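To make the obfuscation concrete before diving into the steps, here is a small self-contained sketch. The class name and characters are made up rather than taken from a real Autohome page, but the shape is the one both approaches above (and the scripts below) deal with: the JSON values ship with empty <span> placeholders, while dynamically injected CSS ::before rules carry the hidden characters, so de-obfuscation boils down to mapping each class name to its content string.

import re

# Made-up sample of an obfuscated JSON value and the injected CSS rule that goes with it.
sample_value = "<span class='hs_kw10_demo'></span>置后驱"
sample_css = '.hs_kw10_demo::before { content:"前" }'

# Build a class -> hidden-text mapping from the CSS, then fill in the placeholders.
mapping = dict(re.findall(r'\.(\w+)::before\s*\{\s*content:"(.*?)"', sample_css))
restored = re.sub(r"<span class='(.*?)'></span>",
                  lambda m: mapping.get(m.group(1), ""),
                  sample_value)
print(restored)  # -> 前置后驱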
Given my limited skill, I never managed to get either of those two approaches working end to end, even after a week of trying. But I am the kind of person who refuses to let a problem go, so below is the approach I worked out myself. The workflow is a little long-winded, but it is solid and it does get the data out. Enough talk; here are the steps.
import bs4
import requests as req

'''
Step 1: download the page of every car series.
'''
def mainMethod():
    '''
    Crawl every Autohome car series and save the pages to the D: drive.
    '''
    li = [chr(i) for i in range(ord("A"), ord("Z") + 1)]  # brand index pages are grouped by first letter
    firstSite = "https://www.autohome.com.cn/grade/carhtml/"
    firstSiteSurfixe = ".html"
    secondSite = "https://car.autohome.com.cn/config/series/"
    secondSiteSurfixe = ".html"
    for a in li:
        requestUrl = firstSite + a + firstSiteSurfixe
        print(requestUrl)
        # fetch the brand index page and collect every series it lists
        resp = req.get(requestUrl)
        bs = bs4.BeautifulSoup(str(resp.content, "gbk"), "html.parser")
        bss = bs.find_all("li")
        con = 0
        for b in bss:
            d = b.h4
            if d is not None:
                her = str(d.a.attrs['href'])
                her = her.split("#")[0]
                her = her[her.index(".cn") + 3:].replace("/", '')  # keep only the series id
                secSite = secondSite + her + secondSiteSurfixe
                print("secSite=" + secSite)
                # download the configuration page of this series
                resp = req.get(secSite)
                text = str(resp.content, encoding="utf-8")
                print(a)
                fil = open("d:\\autoHome\\html\\" + str(her), "a", encoding="utf-8")
                fil.write(text)
                fil.close()
                con = con + 1
        else:
            print(con)

if __name__ == "__main__":
    mainMethod()
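Step 1 fires off a few thousand HTTP requests in a row, and in practice some of them will time out or be refused. The helper below is an optional hardening sketch, not part of the original script; the retry count, delay and timeout values are assumptions you would tune yourself. Calling fetch(...) instead of req.get(...) keeps a single failed request from killing the whole crawl.

import time
import requests as req

def fetch(url, retries=3, delay=1.0, timeout=10):
    '''Hypothetical helper: GET a URL with a timeout, a few retries and a polite pause.'''
    for attempt in range(retries):
        try:
            resp = req.get(url, timeout=timeout)
            resp.raise_for_status()
            time.sleep(delay)  # be gentle with the server
            return resp
        except Exception as exc:
            print("retry", attempt + 1, "for", url, ":", exc)
            time.sleep(delay * (attempt + 1))
    return None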
import os
import re

'''
Step 2: for every series, extract the key JS and assemble it into an HTML file.
'''
if __name__ == "__main__":
    print("Start...")
    rootPath = "D:\\autoHome\\html\\"
    files = os.listdir(rootPath)
    for file in files:
        print("fileName==" + file.title())
        text = ""
        for fi in open(rootPath + file, 'r', encoding="utf-8"):
            text = text + fi
        # JS stub: fake just enough of the DOM so that the obfuscated script can run
        # standalone and dump every CSS rule it would normally inject into `rules`
        alljs = ("var rules = '2';"
                 "var document = {};"
                 "function getRules(){return rules}"
                 "document.createElement = function() {"
                 "    return {"
                 "        sheet: {"
                 "            insertRule: function(rule, i) {"
                 "                if (rules.length == 0) {"
                 "                    rules = rule;"
                 "                } else {"
                 "                    rules = rules + '#' + rule;"
                 "                }"
                 "            }"
                 "        }"
                 "    }"
                 "};"
                 "document.querySelectorAll = function() {"
                 "    return {};"
                 "};"
                 "document.head = {};"
                 "document.head.appendChild = function() {};"
                 "var window = {};"
                 "window.decodeURIComponent = decodeURIComponent;")
        try:
            # grab every obfuscated immediately-invoked function, i.e. (function(xx_){...})(document);
            js = re.findall(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)', text)
            for item in js:
                alljs = alljs + item
        except Exception as e:
            print('makejs function exception')
        newHtml = "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /><head></head><body> <script type='text/javascript'>"
        alljs = newHtml + alljs + " document.write(rules)</script></body></html>"
        f = open("D:\\autoHome\\newhtml\\" + file + ".html", "a", encoding="utf-8")
        f.write(alljs)
        f.close()
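The regular expression in step 2 is the heart of that script, so here is a tiny standalone check of what it is meant to capture. The snippet is a made-up miniature (real Autohome pages embed far longer functions), but it shows the shape being matched: an immediately-invoked function whose parameter is two letters plus an underscore and which is finally called with (document).

import re

# Made-up miniature of the kind of obfuscated IIFE the page ships.
snippet = ("var config = {};"
           "(function(Gr_){ document.createElement('style'); })(document);"
           "var option = {};")
print(re.findall(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)', snippet))
# -> ["(function(Gr_){ document.createElement('style'); })(document);"]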
import os
import re

'''
Step 3: extract each series' data JSON (config / option / bag) and save it locally.
'''
if __name__ == "__main__":
    print("Start...")
    rootPath = "D:\\autoHome\\html\\"
    files = os.listdir(rootPath)
    for file in files:
        print("fileName==" + file.title())
        text = ""
        for fi in open(rootPath + file, 'r', encoding="utf-8"):
            text = text + fi
        # pull out the three JS variables that hold the (still obfuscated) data
        jsonData = ""
        config = re.search('var config = (.*?){1,};', text)
        if config is not None:
            print(config.group(0))
            jsonData = jsonData + config.group(0)
        option = re.search('var option = (.*?)};', text)
        if option is not None:
            print(option.group(0))
            jsonData = jsonData + option.group(0)
        bag = re.search('var bag = (.*?);', text)
        if bag is not None:
            print(bag.group(0))
            jsonData = jsonData + bag.group(0)
        f = open("D:\\autoHome\\json\\" + file, "a", encoding="utf-8")
        f.write(jsonData)
        f.close()
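Before moving on, it can be worth listing the series for which the regexes above found no config block at all (usually pages with a different layout; step 6 later logs such series to an exception file anyway). A quick check over the output directory, as a hedged sketch:

import os

rootPath = "D:\\autoHome\\json\\"
for name in os.listdir(rootPath):
    with open(rootPath + name, 'r', encoding="utf-8") as f:
        if "var config = " not in f.read():
            print("no config block found in", name)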
import os
from selenium import webdriver

'''
Step 4: open every HTML file generated in step 2 in a browser, capture the output
of the obfuscated JS, and save it locally.
'''
class Crack():
    def __init__(self, keyword, username, passod):
        # keyword/username/passod are unused leftovers from another script
        self.url = 'https://www.baidu.com'
        self.browser = webdriver.Chrome('E:\\work\\ChromePortable\\App\\Google Chrome\\chromedriver.exe')

if __name__ == "__main__":
    lists = os.listdir("D:/autoHome/newHtml/")
    crack = Crack('測試公司', '17610177519', '17610177519')
    for fil in lists:
        if os.path.exists("D:\\autoHome\\content\\" + fil):
            print('File already parsed, skipping... ' + str(fil))
            continue
        print(fil)
        crack.browser.get("file:///D:/autoHome/newHtml/" + fil)
        text = crack.browser.find_element_by_tag_name('body')
        print(text.text)
        f = open("D:\\autoHome\\content\\" + fil, "a", encoding="utf-8")
        f.write(text.text)
        f.close()
    crack.browser.close()
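One practical tweak: step 4 pops up a visible Chrome window for every file. If that gets annoying, Chrome can be run headless instead. This is a hedged sketch for the older Selenium API used above (positional chromedriver path, find_element_by_tag_name); the driver path and the file name are placeholders to adjust to your own setup.

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')      # no visible browser window
options.add_argument('--disable-gpu')
browser = webdriver.Chrome('E:\\work\\ChromePortable\\App\\Google Chrome\\chromedriver.exe',
                           chrome_options=options)
browser.get('file:///D:/autoHome/newHtml/3170.html')  # hypothetical file name
print(browser.find_element_by_tag_name('body').text)
browser.quit()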
import os
import re

'''
Step 5: match each series' style (content) file against its JSON data file and
produce a de-obfuscated data file.
'''
if __name__ == "__main__":
    rootPath = "D:\\autoHome\\json\\"
    listdir = os.listdir(rootPath)
    for json_s in listdir:
        print(json_s.title())
        jso = ""
        # read the JSON data file from step 3
        for fi in open(rootPath + json_s, 'r', encoding="utf-8"):
            jso = jso + fi
        content = ""
        # read the style file produced in step 4
        spansPath = "D:\\autoHome\\content\\" + json_s.title() + ".html"
        for spans in open(spansPath, "r", encoding="utf-8"):
            content = content + spans
        print(content)
        # collect every placeholder span in the JSON
        jsos = re.findall("<span(.*?)></span>", jso)
        num = 0
        for js in jsos:
            print("matched span=>>" + js)
            num = num + 1
            # extract the class attribute value
            sea = re.search("'(.*?)'", js)
            print("matched class==>" + sea.group(1))
            spanContent = str(sea.group(1)) + "::before { content:(.*?)}"
            # look up the ::before rule holding the hidden characters for that class
            spanContentRe = re.search(spanContent, content)
            if spanContentRe is not None and sea.group(1) is not None:
                print("matched style value=" + spanContentRe.group(1))
                jso = jso.replace("<span class='" + sea.group(1) + "'></span>",
                                  re.search("\"(.*?)\"", spanContentRe.group(1)).group(1))
        print(jso)
        fi = open("D:\\autoHome\\newJson\\" + json_s.title(), "a", encoding="utf-8")
        fi.write(jso)
        fi.close()
import json
import os
import re
import xlwt

'''
Step 6: read the de-obfuscated data files and generate an Excel workbook.
'''
if __name__ == "__main__":
    rootPath = "D:\\autoHome\\newJson\\"
    workbook = xlwt.Workbook(encoding='ascii')   # create a workbook
    worksheet = workbook.add_sheet('汽車之家')    # create a sheet
    files = os.listdir(rootPath)
    startRow = 0
    isFlag = True  # the header row still needs to be written
    for file in files:
        carItem = {}
        print("fileName==" + file.title())
        text = ""
        for fi in open(rootPath + file, 'r', encoding="utf-8"):
            text = text + fi
        # pull out the three blocks: basic specs (config), equipment (option), colours and the rest (bag)
        configRe = re.findall("var config = (.*?);", text)
        optionRe = re.findall("var option = (.*?);var", text)
        bagRe = re.findall("var bag = (.*?);", text)
        config = option = bag = ""
        for a in configRe:
            config = a
        for b in optionRe:
            option = b
        for c in bagRe:
            bag = c
        try:
            config = json.loads(config)
            option = json.loads(option)
            bag = json.loads(bag)
            configItem = config['result']['paramtypeitems'][0]['paramitems']
            optionItem = option['result']['configtypeitems'][0]['configitems']
        except Exception as e:
            # series whose data does not parse are logged and skipped
            f = open("D:\\autoHome\\異常數據\\exception.txt", "a", encoding="utf-8")
            f.write(file.title() + "\n")
            f.close()
            continue
        # basic parameters
        for car in configItem:
            carItem[car['name']] = []
            for ca in car['valueitems']:
                carItem[car['name']].append(ca['value'])
        # configuration parameters
        for car in optionItem:
            carItem[car['name']] = []
            for ca in car['valueitems']:
                carItem[car['name']].append(ca['value'])
        if isFlag:
            # write the header row once
            co1s = 0
            for co in carItem:
                co1s = co1s + 1
                worksheet.write(startRow, co1s, co)
            startRow = startRow + 1
            isFlag = False
        # one spreadsheet row per trim; the '车型名称' (model name) column holds one entry per trim
        endRowNum = startRow + len(carItem['车型名称'])
        for row in range(startRow, endRowNum):
            print(row)
            colNum = 0
            for col in carItem:
                colNum = colNum + 1
                print(str(carItem[col][row - startRow]), end='|')
                worksheet.write(row, colNum, str(carItem[col][row - startRow]))
        startRow = endRowNum
    workbook.save('d:\\autoHome\\Mybook.xls')
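As a final sanity check, the workbook can be read back to confirm how many rows actually landed in it (and that the count stays comfortably below the 65,536-row limit of the .xls format). A minimal sketch, assuming xlrd is installed:

import xlrd

book = xlrd.open_workbook('d:\\autoHome\\Mybook.xls')
sheet = book.sheet_by_index(0)
print(sheet.nrows, 'rows x', sheet.ncols, 'cols')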
The result is roughly 8,300 rows of data. I'll be using this as a reference the next time I buy a car.