在利用Selenium爬取頁面信息的時候忽然報錯,第一條信息爬取的時候還好好的,第二條就不行了。web
請參考網上的爬取代碼:windows
1 # coding=utf-8 2 """ 3 Created on 2015-12-10 @author: Eastmount 4 利用Selenium爬取百度百科5A級景區的內容介紹的代碼 5 """ 6 7 import time 8 import re 9 import os 10 import sys 11 import codecs 12 import shutil 13 from selenium import webdriver 14 from selenium.webdriver.common.keys import Keys 15 import selenium.webdriver.support.ui as ui 16 from selenium.webdriver.common.action_chains import ActionChains 17 18 # Open PhantomJS 19 # driver = webdriver.PhantomJS(executable_path="D:\phantomjs-2.1.1-windows\sbin\phantomjs.exe") 20 driver = webdriver.PhantomJS(executable_path="D:\phantomjs-1.9.8-windows\phantomjs.exe") 21 # driver = webdriver.Firefox() 22 wait = ui.WebDriverWait(driver, 10) # 顯示等待時間(實例,最大等待時間) 23 24 25 # Get the Content of 5A tourist spots 26 def getInfobox(entityName, fileName): 27 try: 28 # create paths and txt files 29 print(u'文件名稱: ', fileName) 30 info = codecs.open(fileName, 'w', 'utf-8') 31 32 # locate input notice: 1.visit url by unicode 2.write files 33 # Error: Message: Element not found in the cache 34 # Perhaps the page has changed since it was looked up 35 # 解決方法: 使用Selenium和Phantomjs 36 37 print(u'實體名稱: ', entityName.rstrip('\n')) 38 driver.get("http://baike.baidu.com/") 39 elem_inp = driver.find_element_by_xpath("//form[@id='searchForm']/input") 40 # elem_inp = driver.find_elements_by_xpath("//div[@class='lemma-summary']/div") 41 elem_inp.send_keys(entityName) 42 elem_inp.send_keys(Keys.RETURN) 43 info.write(entityName.rstrip('\n') + '\r\n') # codecs不支持'\n'換行 44 45 # load content 摘要 46 elem_value = driver.find_elements_by_xpath("//div[@class='lemma-summary']/div") 47 for value in elem_value: 48 print(value.text) 49 info.writelines(value.text + '\r\n') 50 51 # 爬取文本信息 52 # 爬取全部段落<div class='para'>的內容 class='para-title'爲標題 [省略] 53 54 time.sleep(2) 55 # except Exception as e: # 'utf8' codec can't decode byte 56 # print("Error: ", e) 57 finally: 58 print('\n') 59 info.close() 60 61 62 # Main function 63 def main(): 64 # By function get information 65 path = "BaiduSpider\\" 66 if os.path.isdir(path): 67 shutil.rmtree(path, True) 68 os.makedirs(path) 69 source = open("Tourist_spots_5A.txt", 'r') 70 num = 1 71 for entityName in source: 72 # entityName = unicode(entityName, "utf-8") 73 if u'故宮' in entityName: # else add a '?' 74 entityName = '北京故宮' 75 # else: Name = entityName.rstrip('\n') 76 name = "%04d" % num 77 fileName = path + str(name) + ".txt" 78 getInfobox(entityName, fileName) 79 num = num + 1 80 print('End Read Files!') 81 source.close() 82 driver.close() 83 84 85 if __name__ == '__main__': 86 main()
執行報錯信息爲:ide
Traceback (most recent call last): File "D:/pycharm/untitled_DB/wordcloud/selenium爬取百度百科/Selenium_baidu.py", line 85, in <module> main() File "D:/pycharm/untitled_DB/wordcloud/selenium爬取百度百科/Selenium_baidu.py", line 77, in main getInfobox(entityName, fileName) File "D:/pycharm/untitled_DB/wordcloud/selenium爬取百度百科/Selenium_baidu.py", line 41, in getInfobox elem_inp.send_keys(Keys.RETURN) File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 479, in send_keys 'value': keys_to_typing(value)}) File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 628, in _execute return self._parent.execute(command, params) File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 312, in execute self.error_handler.check_response(response) File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 208, in check_response raise exception_class(value) selenium.common.exceptions.WebDriverException: Message: TypeError - 'undefined' is not a function (evaluating '_getTagName(currWindow).toLowerCase()')
找了1天都沒找到緣由,真的 死煩 ,找到緣由是71行代碼寫死,然而要是不加判斷也會出現這樣的報錯,比較鬱悶,後來查了半天資料,在Stackoverflow的評論中找到思路,頗有多是read文件的時候,讀取到的內容格式有問題,因而查看了一下格式發現,果不其然,多了一個"/n",修改代碼:ui
if u'故宮' in entityName: # else add a '?' entityName = '北京故宮' else:
entityName = entityName.rstrip('\n') name = "%04d" % num fileName = path + str(name) + ".txt" getInfobox(entityName, fileName) num = num + 1
在執行,ok,請忽略渣渣排版lua