Let's look at the code first:
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/16 19:05
#softwave: PyCharm
import lxml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

boswer = webdriver.Chrome()
wait = WebDriverWait(boswer, 10)
url = 'https://www.douyu.com/directory/all'
boswer.get(url)
f = open('F:\douyu.txt', 'w', encoding='utf-8')
i = 0
while True:
    html = boswer.page_source
    soup = BeautifulSoup(html, 'lxml')
    soup_titles = soup.select('.ellipsis')
    soup_nums = soup.select('.dy-num.fr')
    for soup_title, soup_num in zip(soup_titles, soup_nums):
        title = soup_title.get_text().replace('\n', '').strip()
        num = soup_num.get_text()
        print('標題:{} | 人氣:{}'.format(title, num))
        f.write(title)
        f.write(num)
    i += 1
    print('*' * 25 + '\n' + '第 {} 頁爬取成功'.format(i) + '\n' + '*' * 25)
    # wait until the "next page" button is clickable
    next_page = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    )
    # read the last page number
    end_page = boswer.find_element_by_xpath("//a[@class='shark-pager-item'][last()]").text
    if i == int(end_page):
        break
    next_page.click()
    # wait until the current page number equals the next page, proving the page turn actually happened
    wait.until(
        EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".shark-pager-item.current"), str(i + 1))
    )
boswer.close()
f.close()
After running it, the script crawled only two pages before throwing this error:
Traceback (most recent call last):
  File "C:/Users/yao/PycharmProjects/test20181003/test1016.py", line 38, in <module>
    next_page.click()
  File "C:\Users\yao\PycharmProjects\test20181003\venv\lib\site-packages\selenium\webdriver\remote\webelement.py", line 80, in click
    self._execute(Command.CLICK_ELEMENT)
  File "C:\Users\yao\PycharmProjects\test20181003\venv\lib\site-packages\selenium\webdriver\remote\webelement.py", line 633, in _execute
    return self._parent.execute(command, params)
  File "C:\Users\yao\PycharmProjects\test20181003\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "C:\Users\yao\PycharmProjects\test20181003\venv\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=71.0.3559.6)
  (Driver info: chromedriver=70.0.3538.16 (16ed95b41bb05e565b11fb66ac33c660b721f778),platform=Windows NT 10.0.17134 x86_64)
This error means the "next page" click had already been issued but the page turn had not yet completed, so the current page got scraped a second time; by the time the script tried to turn the page again, the page had refreshed and the pagination element located earlier had gone stale, so it could no longer be clicked.
The most reliable fix, of course, is to force a sleep of a few seconds after clicking "next page", but that wastes time and is rather crude. So I added a wait for the current page number to become the next page, which still didn't help, and then also tried waiting for the "next page" button to become clickable again after the page turn.
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/16 19:05
#softwave: PyCharm
import lxml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

boswer = webdriver.Chrome()
wait = WebDriverWait(boswer, 10)
url = 'https://www.douyu.com/directory/all'
boswer.get(url)
f = open('F:\douyu.txt', 'w', encoding='utf-8')
i = 0
while True:
    html = boswer.page_source
    soup = BeautifulSoup(html, 'lxml')
    soup_titles = soup.select('.ellipsis')
    soup_nums = soup.select('.dy-num.fr')
    for soup_title, soup_num in zip(soup_titles, soup_nums):
        title = soup_title.get_text().replace('\n', '').strip()
        num = soup_num.get_text()
        print('標題:{} | 人氣:{}'.format(title, num))
        f.write(title)
        f.write(num)
    i += 1
    print('*' * 25 + '\n' + '第 {} 頁爬取成功'.format(i) + '\n' + '*' * 25)
    # wait until the "next page" button is clickable
    next_page = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    )
    # read the last page number
    end_page = boswer.find_element_by_xpath("//a[@class='shark-pager-item'][last()]").text
    if i == int(end_page):
        break
    next_page.click()
    # wait until the current page number equals the next page, proving the page turn actually happened
    wait.until(
        EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".shark-pager-item.current"), str(i + 1))
    )
    # then wait for the "next page" button to be clickable again, to make sure the page has finished refreshing
    wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    )
boswer.close()
f.close()
Watching it flip through 20 pages, I thought the job was done, but it died again on page 21. So I clicked "next page" manually to see what happens: the current page number jumps to the next page instantly, and the "next page" button becomes clickable only slightly later, so neither condition fully proves that the page has finished refreshing.
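For reference, another common Selenium pattern for this situation (not the route I took below) is to keep a reference to an element from the old page and, after clicking, wait for it to go stale with EC.staleness_of, which only fires once that old DOM node is actually detached. A minimal sketch, assuming the .ellipsis room cards are replaced on every page turn:

# hypothetical sketch: detect the page turn by waiting for an old-page element to go stale
old_card = boswer.find_element_by_css_selector('.ellipsis')  # any element from the current page
next_page = wait.until(
    EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
)
next_page.click()
# blocks until old_card is detached from the DOM, i.e. the list has really been re-rendered
wait.until(EC.staleness_of(old_card))

Whether this helps depends on whether the pager swaps the card nodes out or just mutates them in place, so treat it as a variant to try rather than a guaranteed fix.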
After more back and forth, I added exception handling: if the element has gone stale, sleep 1 s and re-locate the "next page" element, retrying up to 3 times.
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/16 19:05
#softwave: PyCharm
import lxml
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

boswer = webdriver.Chrome()
wait = WebDriverWait(boswer, 10)
url = 'https://www.douyu.com/directory/all'
boswer.get(url)
f = open('F:\douyu.txt', 'w', encoding='utf-8')
i = 0
while True:
    html = boswer.page_source
    soup = BeautifulSoup(html, 'lxml')
    soup_titles = soup.select('.ellipsis')
    soup_nums = soup.select('.dy-num.fr')
    for soup_title, soup_num in zip(soup_titles, soup_nums):
        title = soup_title.get_text().replace('\n', '').strip()
        num = soup_num.get_text()
        print('標題:{} | 人氣:{}'.format(title, num))
        f.write(title)
        f.write(num)
    i += 1
    print('*' * 25 + '\n' + '第 {} 頁爬取成功'.format(i) + '\n' + '*' * 25)
    # stop when the "next page" button is disabled, i.e. this is the last page
    if boswer.page_source.find("shark-pager-disable-next") != -1:
        break
    # try up to 3 times to click "next page"; on a stale reference, wait 1 s and re-locate the element
    for _ in range(3):
        try:
            next_page = wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
            )
            next_page.click()
            break
        except StaleElementReferenceException:
            time.sleep(1)
            print('try to find element click')
    # wait until the current page number equals the next page, proving the page turn actually happened
    # wait.until(
    #     EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".shark-pager-item.current"), str(i + 1))
    # )
    # then wait for the "next page" button to be clickable again, to make sure the page has finished refreshing
    # wait.until(
    #     EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    # )
    # time.sleep(0.5)
boswer.close()
f.close()
With this change it no longer crashed, but sometimes it scraped the same page twice. In the end I had no choice but to fall back on the crude forced sleep. Then again, by choosing Selenium I had already accepted trading speed for a simple way to turn pages; if crawl speed really matters, the answer is to capture and analyze the network requests and drop Selenium altogether.
Below is the final Selenium version of the crawler; after all that fiddling it still forces a 2-second sleep.
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/16 19:05
#softwave: PyCharm
import lxml
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

boswer = webdriver.Chrome()
wait = WebDriverWait(boswer, 10)
url = 'https://www.douyu.com/directory/all'
boswer.get(url)
f = open('F:\douyu.txt', 'w', encoding='utf-8')
i = 0
while True:
    html = boswer.page_source
    soup = BeautifulSoup(html, 'lxml')
    soup_titles = soup.select('.ellipsis')
    soup_nums = soup.select('.dy-num.fr')
    for soup_title, soup_num in zip(soup_titles, soup_nums):
        title = soup_title.get_text().replace('\n', '').strip()
        num = soup_num.get_text()
        print('標題:{} | 人氣:{}'.format(title, num))
        f.write(title)
        f.write(num)
    i += 1
    print('*' * 25 + '\n' + '第 {} 頁爬取成功'.format(i) + '\n' + '*' * 25)
    # the final compromise: adjust the delay to your network speed
    time.sleep(2)
    # stop when the "next page" button is disabled, i.e. this is the last page
    if boswer.page_source.find("shark-pager-disable-next") != -1:
        break
    # try up to 3 times to click "next page"; on a stale reference, wait 1 s and re-locate the element
    for _ in range(3):
        try:
            next_page = wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
            )
            next_page.click()
            break
        except StaleElementReferenceException:
            time.sleep(1)
            print('try to find element click')
    # wait until the current page number equals the next page, proving the page turn actually happened
    # wait.until(
    #     EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".shark-pager-item.current"), str(i + 1))
    # )
    # then wait for the "next page" button to be clickable again, to make sure the page has finished refreshing
    # wait.until(
    #     EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    # )
    # time.sleep(0.5)
boswer.close()
f.close()
After wrestling with Selenium for so long I still wasn't satisfied, so I tried capturing the page's network requests instead, and unexpectedly discovered a URL that carries the page number; from there the rest looked simple.
Requesting this paginated URL returns a response that json.loads can turn into a dict, from which the fields I want are easy to pull out. Looping over 200 pages with a multiprocessing pool, the efficiency speaks for itself: the whole run finishes in under a minute!
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/25 0:07
#softwave: PyCharm
import requests
import json
from multiprocessing import Pool
import pymongo

client = pymongo.MongoClient('localhost')
db = client['douyu']

def single_page_info(page):
    respones = requests.get('https://www.douyu.com/gapi/rkc/directory/0_0/{}'.format(page))
    datas = json.loads(respones.text)
    items = datas['data']['rl']
    for item in items:
        data = {
            '標題': item['rn'],
            '主播': item['nn'],
            '人氣': item['ol'],
        }
        print(data)
        db['ajax_spider'].insert(data)
    print('已經完成第{}頁'.format(page))

if __name__ == '__main__':
    pool = Pool()
    # fetch 200 pages with a process pool
    pool.map(single_page_info, [page for page in range(1, 200)])
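One caveat: Collection.insert() works on the PyMongo 3.x installs of that era, but it has since been removed in PyMongo 4.x; on a newer install the equivalent call would be, for example:

# PyMongo 4.x removed Collection.insert(); insert_one() is the replacement
db['ajax_spider'].insert_one(data)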
A small revision: pull out more fields, and don't save duplicate records.
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/25 0:07
#softwave: PyCharm
import requests
import json
from multiprocessing import Pool
import pymongo
import datetime

client = pymongo.MongoClient('localhost')
db = client['douyu']
cur_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

def single_page_info(page):
    respones = requests.get('https://www.douyu.com/gapi/rkc/directory/0_0/{}'.format(page))
    datas = json.loads(respones.text)
    items = datas['data']['rl']
    for item in items:
        data = {
            '標題': item['rn'],
            '主播': item['nn'],
            '人氣': item['ol'],
            '類別': item['c2name'],
            '房間號': item['rid'],
            '時間': cur_time
        }
        # upsert so that records with the same host name and timestamp are not saved twice
        if db['host_info'].update({'主播': data['主播'], '時間': data['時間']}, {'$set': data}, True):
            print('Save to Mongo, {}'.format(data))
        else:
            print('Save to Mongo failed, {}'.format(data))
    print('已經完成第{}頁'.format(page))

if __name__ == '__main__':
    pool = Pool()
    # fetch 200 pages with a process pool
    pool.map(single_page_info, [page for page in range(1, 201)])
Once the run finishes, you can query MongoDB for whatever you need, for example filtering by category, popularity, time, or other keys.
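As a rough sketch of such queries (the field names match the dict keys above, but the category value and timestamp shown here are made-up examples):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['douyu']

# rooms in one category (the category string is hypothetical)
for doc in db['host_info'].find({'類別': '英雄聯盟'}):
    print(doc)

# top 10 rooms by popularity, assuming item['ol'] was stored as a number
for doc in db['host_info'].find().sort('人氣', pymongo.DESCENDING).limit(10):
    print(doc['主播'], doc['人氣'])

# everything from one crawl run, identified by its timestamp (example value)
for doc in db['host_info'].find({'時間': '2018-10-25 00:07'}):
    print(doc)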