Let's look at the code first:
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/16 19:05
#softwave: PyCharm
import lxml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

boswer = webdriver.Chrome()
wait = WebDriverWait(boswer, 10)
url = 'https://www.douyu.com/directory/all'
boswer.get(url)
f = open('F:\douyu.txt', 'w', encoding='utf-8')
i = 0
while True:
    html = boswer.page_source
    soup = BeautifulSoup(html, 'lxml')
    soup_titles = soup.select('.ellipsis')
    soup_nums = soup.select('.dy-num.fr')
    for soup_title, soup_num in zip(soup_titles, soup_nums):
        title = soup_title.get_text().replace('\n', '').strip()
        num = soup_num.get_text()
        print('標題:{} | 人氣:{}'.format(title, num))
        f.write(title)
        f.write(num)
    i += 1
    print('*' * 25 + '\n' + '第 {} 頁爬取成功'.format(i) + '\n' + '*' * 25)
    # wait until the "next page" button is clickable
    next_page = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    )
    # read the last page number
    end_page = boswer.find_element_by_xpath("//a[@class='shark-pager-item'][last()]").text
    if i == int(end_page):
        break
    next_page.click()
    # wait until the current page number equals the next page, proving the page turn actually happened
    wait.until(
        EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".shark-pager-item.current"), str(i + 1))
    )
boswer.close()
f.close()
After running it, the script crawled only two pages before throwing this error:
Traceback (most recent call last):
  File "C:/Users/yao/PycharmProjects/test20181003/test1016.py", line 38, in <module>
    next_page.click()
  File "C:\Users\yao\PycharmProjects\test20181003\venv\lib\site-packages\selenium\webdriver\remote\webelement.py", line 80, in click
    self._execute(Command.CLICK_ELEMENT)
  File "C:\Users\yao\PycharmProjects\test20181003\venv\lib\site-packages\selenium\webdriver\remote\webelement.py", line 633, in _execute
    return self._parent.execute(command, params)
  File "C:\Users\yao\PycharmProjects\test20181003\venv\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "C:\Users\yao\PycharmProjects\test20181003\venv\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=71.0.3559.6)
  (Driver info: chromedriver=70.0.3538.16 (16ed95b41bb05e565b11fb66ac33c660b721f778),platform=Windows NT 10.0.17134 x86_64)
This error means the "next page" click had already been issued but the page turn had not yet completed, so the current page got scraped a second time; by the time the script tried to turn the page again, the page had refreshed and the pagination element located earlier had gone stale, so it could no longer be clicked.
The most reliable fix, of course, is to force a sleep of a few seconds after clicking "next page", but that wastes time and is rather crude. So I added a wait for the current page number to become the next page, which still didn't help, and then also tried waiting for the "next page" button to become clickable again after the page turn.
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/16 19:05
#softwave: PyCharm
import lxml
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

boswer = webdriver.Chrome()
wait = WebDriverWait(boswer, 10)
url = 'https://www.douyu.com/directory/all'
boswer.get(url)
f = open('F:\douyu.txt', 'w', encoding='utf-8')
i = 0
while True:
    html = boswer.page_source
    soup = BeautifulSoup(html, 'lxml')
    soup_titles = soup.select('.ellipsis')
    soup_nums = soup.select('.dy-num.fr')
    for soup_title, soup_num in zip(soup_titles, soup_nums):
        title = soup_title.get_text().replace('\n', '').strip()
        num = soup_num.get_text()
        print('標題:{} | 人氣:{}'.format(title, num))
        f.write(title)
        f.write(num)
    i += 1
    print('*' * 25 + '\n' + '第 {} 頁爬取成功'.format(i) + '\n' + '*' * 25)
    # wait until the "next page" button is clickable
    next_page = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    )
    # read the last page number
    end_page = boswer.find_element_by_xpath("//a[@class='shark-pager-item'][last()]").text
    if i == int(end_page):
        break
    next_page.click()
    # wait until the current page number equals the next page, proving the page turn actually happened
    wait.until(
        EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".shark-pager-item.current"), str(i + 1))
    )
    # then wait for the "next page" button to be clickable again, to make sure the page has finished refreshing
    wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    )
boswer.close()
f.close()
Watching it flip through 20 pages, I thought the job was done, but it died again on page 21. So I clicked "next page" manually to see what happens: the current page number jumps to the next page instantly, and the "next page" button becomes clickable only slightly later, so neither condition fully proves that the page has finished refreshing.
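For reference, another common Selenium pattern for this situation (not the route I took below) is to keep a reference to an element from the old page and, after clicking, wait for it to go stale with EC.staleness_of, which only fires once that old DOM node is actually detached. A minimal sketch, assuming the .ellipsis room cards are replaced on every page turn:

# hypothetical sketch: detect the page turn by waiting for an old-page element to go stale
old_card = boswer.find_element_by_css_selector('.ellipsis')  # any element from the current page
next_page = wait.until(
    EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
)
next_page.click()
# blocks until old_card is detached from the DOM, i.e. the list has really been re-rendered
wait.until(EC.staleness_of(old_card))

Whether this helps depends on whether the pager swaps the card nodes out or just mutates them in place, so treat it as a variant to try rather than a guaranteed fix.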
After more back and forth, I added exception handling: if the element has gone stale, sleep 1 s and re-locate the "next page" element, retrying up to 3 times.
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/16 19:05
#softwave: PyCharm
import lxml
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

boswer = webdriver.Chrome()
wait = WebDriverWait(boswer, 10)
url = 'https://www.douyu.com/directory/all'
boswer.get(url)
f = open('F:\douyu.txt', 'w', encoding='utf-8')
i = 0
while True:
    html = boswer.page_source
    soup = BeautifulSoup(html, 'lxml')
    soup_titles = soup.select('.ellipsis')
    soup_nums = soup.select('.dy-num.fr')
    for soup_title, soup_num in zip(soup_titles, soup_nums):
        title = soup_title.get_text().replace('\n', '').strip()
        num = soup_num.get_text()
        print('標題:{} | 人氣:{}'.format(title, num))
        f.write(title)
        f.write(num)
    i += 1
    print('*' * 25 + '\n' + '第 {} 頁爬取成功'.format(i) + '\n' + '*' * 25)
    # stop when the "next page" button is disabled, i.e. this is the last page
    if boswer.page_source.find("shark-pager-disable-next") != -1:
        break
    # try up to 3 times to click "next page"; on a stale reference, wait 1 s and re-locate the element
    for _ in range(3):
        try:
            next_page = wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
            )
            next_page.click()
            break
        except StaleElementReferenceException:
            time.sleep(1)
            print('try to find element click')
    # wait until the current page number equals the next page, proving the page turn actually happened
    # wait.until(
    #     EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".shark-pager-item.current"), str(i + 1))
    # )
    # then wait for the "next page" button to be clickable again, to make sure the page has finished refreshing
    # wait.until(
    #     EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    # )
    # time.sleep(0.5)
boswer.close()
f.close()
With this change it no longer crashed, but sometimes it scraped the same page twice. In the end I had no choice but to fall back on the crude forced sleep. Then again, by choosing Selenium I had already accepted trading speed for a simple way to turn pages; if crawl speed really matters, the answer is to capture and analyze the network requests and drop Selenium altogether.
Below is the final Selenium version of the crawler; after all that fiddling it still forces a 2-second sleep.
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/16 19:05
#softwave: PyCharm
import lxml
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

boswer = webdriver.Chrome()
wait = WebDriverWait(boswer, 10)
url = 'https://www.douyu.com/directory/all'
boswer.get(url)
f = open('F:\douyu.txt', 'w', encoding='utf-8')
i = 0
while True:
    html = boswer.page_source
    soup = BeautifulSoup(html, 'lxml')
    soup_titles = soup.select('.ellipsis')
    soup_nums = soup.select('.dy-num.fr')
    for soup_title, soup_num in zip(soup_titles, soup_nums):
        title = soup_title.get_text().replace('\n', '').strip()
        num = soup_num.get_text()
        print('標題:{} | 人氣:{}'.format(title, num))
        f.write(title)
        f.write(num)
    i += 1
    print('*' * 25 + '\n' + '第 {} 頁爬取成功'.format(i) + '\n' + '*' * 25)
    # the final compromise: adjust the delay to your network speed
    time.sleep(2)
    # stop when the "next page" button is disabled, i.e. this is the last page
    if boswer.page_source.find("shark-pager-disable-next") != -1:
        break
    # try up to 3 times to click "next page"; on a stale reference, wait 1 s and re-locate the element
    for _ in range(3):
        try:
            next_page = wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
            )
            next_page.click()
            break
        except StaleElementReferenceException:
            time.sleep(1)
            print('try to find element click')
    # wait until the current page number equals the next page, proving the page turn actually happened
    # wait.until(
    #     EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".shark-pager-item.current"), str(i + 1))
    # )
    # then wait for the "next page" button to be clickable again, to make sure the page has finished refreshing
    # wait.until(
    #     EC.element_to_be_clickable((By.CLASS_NAME, "shark-pager-next"))
    # )
    # time.sleep(0.5)
boswer.close()
f.close()
After wrestling with Selenium for so long I still wasn't satisfied, so I tried capturing the page's network requests instead, and unexpectedly discovered a URL that carries the page number; from there the rest looked simple.
Requesting this paginated URL returns a response that json.loads can turn into a dict, from which the fields I want are easy to pull out. Looping over 200 pages with a multiprocessing pool, the efficiency speaks for itself: the whole run finishes in under a minute!
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/25 0:07
#softwave: PyCharm
import requests
import json
from multiprocessing import Pool
import pymongo

client = pymongo.MongoClient('localhost')
db = client['douyu']

def single_page_info(page):
    respones = requests.get('https://www.douyu.com/gapi/rkc/directory/0_0/{}'.format(page))
    datas = json.loads(respones.text)
    items = datas['data']['rl']
    for item in items:
        data = {
            '標題': item['rn'],
            '主播': item['nn'],
            '人氣': item['ol'],
        }
        print(data)
        db['ajax_spider'].insert(data)
    print('已經完成第{}頁'.format(page))

if __name__ == '__main__':
    pool = Pool()
    # fetch 200 pages with a process pool
    pool.map(single_page_info, [page for page in range(1, 200)])
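One caveat: Collection.insert() works on the PyMongo 3.x installs of that era, but it has since been removed in PyMongo 4.x; on a newer install the equivalent call would be, for example:

# PyMongo 4.x removed Collection.insert(); insert_one() is the replacement
db['ajax_spider'].insert_one(data)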
A small revision: pull out more fields, and don't save duplicate records.
#-*- coding:utf-8 -*-
#_author:John
#date:2018/10/25 0:07
#softwave: PyCharm
import requests
import json
from multiprocessing import Pool
import pymongo
import datetime

client = pymongo.MongoClient('localhost')
db = client['douyu']
cur_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')

def single_page_info(page):
    respones = requests.get('https://www.douyu.com/gapi/rkc/directory/0_0/{}'.format(page))
    datas = json.loads(respones.text)
    items = datas['data']['rl']
    for item in items:
        data = {
            '標題': item['rn'],
            '主播': item['nn'],
            '人氣': item['ol'],
            '類別': item['c2name'],
            '房間號': item['rid'],
            '時間': cur_time
        }
        # upsert so that records with the same host name and timestamp are not saved twice
        if db['host_info'].update({'主播': data['主播'], '時間': data['時間']}, {'$set': data}, True):
            print('Save to Mongo, {}'.format(data))
        else:
            print('Save to Mongo failed, {}'.format(data))
    print('已經完成第{}頁'.format(page))

if __name__ == '__main__':
    pool = Pool()
    # fetch 200 pages with a process pool
    pool.map(single_page_info, [page for page in range(1, 201)])
Once the run finishes, you can query MongoDB for whatever you need, for example filtering by category, popularity, time, or other keys.
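As a rough sketch of such queries (the field names match the dict keys above, but the category value and timestamp shown here are made-up examples):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['douyu']

# rooms in one category (the category string is hypothetical)
for doc in db['host_info'].find({'類別': '英雄聯盟'}):
    print(doc)

# top 10 rooms by popularity, assuming item['ol'] was stored as a number
for doc in db['host_info'].find().sort('人氣', pymongo.DESCENDING).limit(10):
    print(doc['主播'], doc['人氣'])

# everything from one crawl run, identified by its timestamp (example value)
for doc in db['host_info'].find({'時間': '2018-10-25 00:07'}):
    print(doc)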