本文的文字及圖片來源於網絡,僅供學習、交流使用,不具備任何商業用途,版權歸原作者所有,如有問題請及時聯繫我們以作處理。
作者: 孫藝方
PS:如有需要Python學習資料的小夥伴可以點擊下方鏈接自行獲取
http://note.youdao.com/noteshare?id=3054cce4add8a909e784ad934f956cef
首先導入庫bs4、lxml、requests:
import requests
import json
from bs4 import BeautifulSoup
import lxml.html
獲取地鐵路線網址
# Shortcut to lxml's etree module, reached through lxml.html. Nothing in the
# code visible here references this name.
etree = lxml.html.etree
def get_urls():
    """Collect the metro line links listed on the 8684 search page.

    The page is requested with ``dtcity=sz`` (presumably Shenzhen -- confirm
    against the site), and the links are read from the first table cell of
    the element whose class is ``fs-fdLink``.

    Returns:
        A list of dicts, one per link, each holding ``'key'`` (the link
        text) and ``'value'`` (the absolute URL of the linked page). The
        list is empty when the expected markup is not found.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    url = 'https://dt.8684.cn/so.php?k=pp&dtcity=sz&q=wedwed'
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url=url, timeout=10)
    response.raise_for_status()

    # Walk fs-fdLink -> first <tr> -> first <td>, stopping as soon as a level
    # is missing instead of calling .find() on None.
    node = BeautifulSoup(response.content, 'lxml').find(
        attrs={'class': 'fs-fdLink'})
    for tag_name in ('tr', 'td'):
        if node is None:
            return []
        node = node.find(tag_name)
    if node is None:
        return []

    route_list = []
    for anchor in node.find_all('a'):
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be turned into a page URL.
            continue
        route_list.append({
            # get_text() also copes with anchors that wrap nested tags,
            # where .string would be None.
            'key': anchor.get_text(),
            'value': 'https://dt.8684.cn' + href,
        })
    return route_list
# Cell values the timetable uses where no time is listed.
_PLACEHOLDER_TIMES = ('--', '—')
# Fallback service hours used when the table yields no usable time.
_DEFAULT_START_TIME = '06:00'
_DEFAULT_END_TIME = '23:00'


def _child_text(child):
    """Return the plain-text content of one parsed child node."""
    # Text nodes are str subclasses; element nodes expose get_text().
    return str(child) if isinstance(child, str) else child.get_text()


def _cell_texts(cell):
    """Return the stripped, non-empty text of each direct child of a cell."""
    stripped = (_child_text(child).strip() for child in cell)
    return [text for text in stripped if text]


def _pick_time(times, default, from_end=False):
    """Return the first non-placeholder entry of *times*, else *default*.

    The list is scanned from its end when *from_end* is true.
    """
    ordered = reversed(times) if from_end else times
    for value in ordered:
        if value not in _PLACEHOLDER_TIMES:
            return value
    return default


def _collect_columns(rows, column_count):
    """Gather the cell texts of the first *column_count* columns of *rows*.

    Rows with fewer ``<td>`` cells than *column_count* (header rows among
    them) are skipped. Returns one list of texts per column.
    """
    columns = [[] for _ in range(column_count)]
    for row in rows:
        cells = row.find_all('td')
        if len(cells) < column_count:
            continue
        for column, cell in zip(columns, cells):
            column.extend(_cell_texts(cell))
    return columns


def get_schedule(url):
    """Scrape the station list and service hours of one metro line page.

    Args:
        url: Dict with ``'key'`` (the line name) and ``'value'`` (the URL of
            the line's page), in the shape returned by ``get_urls``.

    Returns:
        A list of route dicts with the keys ``'name'``, ``'startTime'``,
        ``'endTime'`` and ``'stations'``. When the line name contains
        neither '內圈' nor '外圈', two routes are returned: the name suffixed
        with '內環' and stations in page order, then the name suffixed with
        '外環' and stations reversed. Otherwise one route carrying the line
        name unchanged is returned. A time that cannot be read from the
        table falls back to '06:00' (start) or '23:00' (end).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url['value'], timeout=10)
    response.raise_for_status()

    container = BeautifulSoup(response.content, 'lxml').find(
        attrs={'class': 'ib-mn'})
    table = None
    if container is not None:
        table = container.find(attrs={'class': 'pi-table tl-table'})
    # A page without the timetable is treated as a table without rows, so
    # the default service hours below still apply.
    rows = table.find_all('tr') if table is not None else []

    line_name = url['key']
    if '內圈' not in line_name and '外圈' not in line_name:
        # Five columns: station, first/last columns for each direction.
        stations, inner_first, outer_first, inner_last, outer_last = (
            _collect_columns(rows, 5))
        inner_route = {
            'name': line_name + '內環',
            'startTime': _pick_time(inner_first, _DEFAULT_START_TIME),
            'endTime': _pick_time(
                inner_last, _DEFAULT_END_TIME, from_end=True),
            'stations': stations,
        }
        # The opposite direction runs the stations in reverse, so its start
        # time is read from the bottom of its column and its end time from
        # the top.
        outer_route = {
            'name': line_name + '外環',
            'startTime': _pick_time(
                outer_first, _DEFAULT_START_TIME, from_end=True),
            'endTime': _pick_time(outer_last, _DEFAULT_END_TIME),
            'stations': list(reversed(stations)),
        }
        return [inner_route, outer_route]

    # The line name already names its direction: station, first, last.
    stations, first_times, last_times = _collect_columns(rows, 3)
    return [{
        'name': line_name,
        'startTime': _pick_time(first_times, _DEFAULT_START_TIME),
        'endTime': _pick_time(last_times, _DEFAULT_END_TIME, from_end=True),
        'stations': stations,
    }]
其中,if語句(Part2的大部分代碼)都是用於對非常規線路數據進行特殊處理,這也印證了「80%的代碼處理20%的情況」,常規線路數據只需前六行代碼即可爬取,最終返回線路列表route_list。
# Visit every line page found by get_urls() and print each route record as
# one JSON document per output line.
for line_link in get_urls():
    routes = get_schedule(line_link)
    for route in routes:
        # ensure_ascii=False writes non-ASCII characters as-is instead of
        # \uXXXX escapes.
        print(json.dumps(route, ensure_ascii=False))
部分結果展示
{"name": "1號線內環", "startTime": "06:30", "endTime": "00:04", "stations": ["羅湖", "國貿", "老街", "大劇院", "科學館", "華強路", "崗廈", "會展中心", "購物公園", "香蜜湖", "車公廟", "竹子林", "僑城東", "華僑城", "世界之窗", "白石洲", "高新園", "深大", "桃園", "大新", "鯉魚門", "前海灣", "新安", "寶安中心", "寶體", "坪洲", "西鄉", "固戍", "後瑞", "機場東"]}
以上就是本期的全部內容了,希望對大家有所幫助。