Today I'm posting the code for a bus route scraper written in Python.
Without further ado, here is the approach: analyse the site and its URLs ----> locate the "switch city" control and click it ----> locate the city matching the input string and click it, returning an error message if it cannot be found ----> fetch the route categories for that city ----> click each category, collect the link of every route under it, and store them in a dictionary ----> loop over every link and write the scraped content into a spreadsheet. (A short driver sketch showing how the pieces fit together follows the full listing below.)
The site being scraped is www.8684.cn.
Here is the full code:
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import xlwt

def search_bus(city):
    # Open the home page
    url = 'http://beijing.8684.cn/line1'
    browser = webdriver.Chrome()
    browser.implicitly_wait(10)
    browser.get(url)
    browser.maximize_window()
    # Click the "switch city" control
    a = browser.find_element_by_xpath('//span[@class="city_switch"]')
    WebDriverWait(browser, 15, 0.3).until(EC.presence_of_element_located((By.XPATH, '//span[@class="city_switch"]')))
    ActionChains(browser).click(a).perform()
    # Click the target city
    b = browser.find_element_by_link_text(city)
    WebDriverWait(browser, 15, 0.4).until(EC.visibility_of_element_located((By.LINK_TEXT, city)))
    ActionChains(browser).click(b).perform()
    d = browser.current_url
    # The category links that lead to the route-listing pages
    c = browser.find_elements_by_xpath('//div[@class="bus_layer"]//div[4]//div[@class="bus_layer_r"]//a')
    WebDriverWait(browser, 15, 0.3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@class="bus_layer"]//div[4]//div[@class="bus_layer_r"]//a')))
    # Collect the link of each category
    url_dict = {}
    for x in c:
        url_dict[x.text] = x.get_attribute('href')
    browser.quit()
    return url_dict

def go_url(dicts):
    dict_1 = {}
    for k, x in dicts.items():
        # Open the listing page of one category
        browser = webdriver.Chrome()
        browser.get(x)
        list_1 = browser.find_elements_by_xpath('//div[@id="con_site_1"][@class="stie_list"]//a')
        WebDriverWait(browser, 15, 0.3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@id="con_site_1"][@class="stie_list"]//a')))
        # Map every route name to its page URL
        dict_2 = {}
        for x in list_1:
            dict_2[x.text] = x.get_attribute('href')
        dict_1[k] = dict_2
        browser.quit()
    return dict_1

def write_1(dict_1, city):
    work = xlwt.Workbook()
    for k, v in dict_1.items():
        # One worksheet per category
        table = work.add_sheet(k)
        t = 0
        g = 0
        for ka, va in v.items():
            try:
                browser = webdriver.Chrome()
                browser.get(va)
                list_1 = browser.find_elements_by_xpath('//div[@class="bus_line_site "][1]//a')
                WebDriverWait(browser, 15, 0.3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@class="bus_line_site "][1]//a')))
                num = len(list_1)
                # Route name in column 0, its stops in the following columns
                table.write(t, 0, ka)
                for w in range(num):
                    table.write(t, w + 1, list_1[w].text)
                t += 1
                browser.quit()
            except:
                print('%s is false' % ka)
                #g += 1
                continue
    return work.save('%s.xls' % city)
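For reference, here is a minimal driver sketch showing how the three functions above might be chained end to end. The city name '上海' is only an example; any city listed on 8684.cn should work, and each step opens its own Chrome windows exactly as the functions above do.

if __name__ == '__main__':
    city = '上海'                    # example city; use whichever city you want to query
    categories = search_bus(city)    # {category name: category URL}
    routes = go_url(categories)      # {category name: {route name: route URL}}
    write_1(routes, city)            # writes <city>.xls, one sheet per category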
Now, section by section:
1. Import the modules. Nothing to explain here, you all know these...
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import xlwt
2. Define a function whose parameter is city; the argument passed in is the city name. It returns a dictionary whose keys are the category names and whose values are the links for each category.
def search_bus(city):
    # Open the home page
    url = 'http://beijing.8684.cn/line1'
    browser = webdriver.Chrome()
    browser.implicitly_wait(10)
    browser.get(url)
    browser.maximize_window()
    # Click the "switch city" control
    a = browser.find_element_by_xpath('//span[@class="city_switch"]')
    WebDriverWait(browser, 15, 0.3).until(EC.presence_of_element_located((By.XPATH, '//span[@class="city_switch"]')))
    ActionChains(browser).click(a).perform()
    # Click the target city
    b = browser.find_element_by_link_text(city)
    WebDriverWait(browser, 15, 0.4).until(EC.visibility_of_element_located((By.LINK_TEXT, city)))
    ActionChains(browser).click(b).perform()
    d = browser.current_url
    # The category links that lead to the route-listing pages
    c = browser.find_elements_by_xpath('//div[@class="bus_layer"]//div[4]//div[@class="bus_layer_r"]//a')
    WebDriverWait(browser, 15, 0.3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@class="bus_layer"]//div[4]//div[@class="bus_layer_r"]//a')))
    # Collect the link of each category
    url_dict = {}
    for x in c:
        url_dict[x.text] = x.get_attribute('href')
    browser.quit()
    return url_dict
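One small aside on the waits: in the function above each element is located first and the WebDriverWait comes afterwards, so it is really the implicit wait of 10 seconds doing the work. The usual pattern is to let the explicit wait return the element itself; and if you run a recent Selenium 4 release, the find_element_by_* helpers no longer exist and find_element(By.XPATH, ...) is used instead. A sketch of that pattern with the same locators (not part of the original code):

# Wait up to 15s for the "switch city" control and use the element the wait returns
switch = WebDriverWait(browser, 15, 0.3).until(
    EC.presence_of_element_located((By.XPATH, '//span[@class="city_switch"]')))
ActionChains(browser).click(switch).perform()

# Wait for the city link to become visible, then click it directly
city_link = WebDriverWait(browser, 15, 0.4).until(
    EC.visibility_of_element_located((By.LINK_TEXT, city)))
city_link.click()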
3. Visit the link stored under each key to obtain the link of every route, and store them in a dictionary. This time it is a dictionary nested inside a dictionary. For example: dict = {'category 1': {'Route 1': 'http://xxxxxxx.cn/wwwww'}, 'category 2': {'Route 20': 'http://ssss.cn........', 'Route 35': 'http://ww.ddff.cn/ddffffff'}}
The function returns this dictionary.
def go_url(dicts):
    dict_1 = {}
    for k, x in dicts.items():
        # Open the listing page of one category
        browser = webdriver.Chrome()
        browser.get(x)
        list_1 = browser.find_elements_by_xpath('//div[@id="con_site_1"][@class="stie_list"]//a')
        WebDriverWait(browser, 15, 0.3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@id="con_site_1"][@class="stie_list"]//a')))
        # Map every route name to its page URL
        dict_2 = {}
        for x in list_1:
            dict_2[x.text] = x.get_attribute('href')
        dict_1[k] = dict_2
        browser.quit()
    return dict_1
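To make the nested structure concrete, this is how the result of go_url could be walked (purely illustrative; the actual category and route names depend on what the site returns):

# Build the nested dictionary for one city and print it
routes = go_url(search_bus('北京'))
for category, lines in routes.items():
    print(category)                                   # category name, later used as a sheet name
    for line_name, line_url in lines.items():
        print('  %s -> %s' % (line_name, line_url))   # route name and its page URL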
4. Finally, visit each of the stored links, fetch every bus route, and write the stops into an .xls file.
# Parameter 1 is the dictionary returned by the previous function; parameter 2 is the name of the city being queried.
def write_1(dict_1, city):
    work = xlwt.Workbook()
    for k, v in dict_1.items():
        # One worksheet per category
        table = work.add_sheet(k)
        t = 0
        for ka, va in v.items():
            try:
                browser = webdriver.Chrome()
                browser.get(va)
                list_1 = browser.find_elements_by_xpath('//div[@class="bus_line_site "][1]//a')
                WebDriverWait(browser, 15, 0.3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@class="bus_line_site "][1]//a')))
                num = len(list_1)
                # Route name in column 0, its stops in the following columns
                table.write(t, 0, ka)
                for w in range(num):
                    table.write(t, w + 1, list_1[w].text)
                t += 1
                browser.quit()
            except:
                print('%s is false' % ka)
                #g += 1
                continue
    return work.save('%s.xls' % city)
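The table.write(row, column, value) calls follow the normal xlwt convention: row and column indices are zero-based, and save() writes the whole workbook at once. A tiny standalone example of the same row layout used here, with made-up route and stop names:

import xlwt

wb = xlwt.Workbook()
sheet = wb.add_sheet('demo')
sheet.write(0, 0, 'Route 1')   # row 0, column 0: route name
sheet.write(0, 1, 'Stop A')    # row 0, column 1: first stop
sheet.write(0, 2, 'Stop B')    # row 0, column 2: second stop
wb.save('demo.xls')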
That's all of the code. If anything is unclear or you see room for improvement, please point it out; I'd be glad to discuss it with you.
Thanks for reading.