selenium模塊的做用:經過編寫代碼模擬人工對瀏覽器的事件,觸發相關操做,從而獲取網頁信息,相對於使用requests模塊,selenium模塊對動態數據的爬取更爲方便
安裝selenium:pip install selenium -i https://pypi.douban.com/simple
1:使用內置的webdriver類實例化一個瀏覽器對象diver(實例化瀏覽器對象時,須要傳入一個瀏覽器驅動的路徑)如下實例化一個谷歌瀏覽器:diver = webdriver.Chrome(r'./chromedriver.exe')
2:採用實例化的對象中的方法進行模擬人工操做瀏覽器
經常使用內置方法:
打開網頁:diver.get("要訪問的url")
查詢標籤:diver.find_element_by_id("id值") # 根據標籤id查找,能夠修改成class,tagname等值,與js查找標籤相似,找到標籤返回一個obj對象
obj對象的方法: obj.click(點擊)
obj.send_keys(輸入)
獲取網頁源碼:diver.page_source
關閉瀏覽器:diver.close()/diver.quit()
執行js代碼:diver.execute_script("js代碼") # 如window.scrollTo(0,document.body.scrollHeight)放到js代碼中,瀏覽器會執行滾輪下滑必定高度的動做
截圖:diver.save_screenshot("圖片保存路徑和文件名")
前進:diver.forward()
後退:diver.back()
切換到iframe標籤:diver.switch_to.frame("iframe標籤")
實現鼠標按住不鬆手:先實例化動作鏈對象action = ActionChains(diver) # ActionChains從selenium.webdriver中導入
而後保持不鬆開:action.click_and_hold("標籤對象") # 點擊標籤對象並保持不鬆開
移動標籤:action.move_by_offset(x,y) # 移動標籤對象,若是是移動到另外一個標籤裏,能夠使用action.drag_and_drop(被移動標籤對象, 目標標籤對象)
執行上述代碼:action.perform()
獲取cookie值:diver.get_cookies()
經過添加參數能夠讓selenium操做瀏覽器在後臺運行,不會有界面顯示
# Build an Options object that makes Chrome run in headless (no-UI) mode.
chrome_options = Options()
chrome_options.add_argument('--headless')  # no visible browser window
chrome_options.add_argument('--disable-gpu')  # recommended alongside --headless on some platforms
# Instantiate a Chrome driver, passing the headless configuration via chrome_options
diver = webdriver.Chrome(r'./chromedriver.exe',chrome_options=chrome_options)
# Adding this switch lowers the risk of the site detecting the browser as an automated program.
# Instantiate a ChromeOptions object and add the detection-evasion parameter
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path='./chromedriver.exe',options=options)
# 使用selenium爬取網易新聞裏面["國內", "國際", "軍事", "航空"]四個版塊裏面的新聞數據
# Use selenium to scrape news data from the four NetEase news sections
# ["國內", "國際", "軍事", "航空"] (domestic, international, military, aviation).
import requests, random
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
from lxml import etree
from multiprocessing.dummy import Pool

# Headless configuration: browse without opening a visible window.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")

# Evade basic automation detection by the target site.
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# Create the Chrome driver and load the NetEase news front page.
diver = webdriver.Chrome('chromedriver.exe', chrome_options=chrome_options, options=options)
diver.get("https://news.163.com")
# Grab the fully rendered page source.
response_text = diver.page_source

# Parse the front page with lxml and extract the URL of each target section.
tree = etree.HTML(response_text)
guonei_url = tree.xpath('//li[@class="menu_guonei"]/a/@href')[0]
guoji_url = tree.xpath('//li[@class="menu_guoji"]/a/@href')[0]
war_url = tree.xpath('//li[@class="menu_war"]/a/@href')[0]
hangkong_url = tree.xpath('//li[@class="menu_hangkong"]/a/@href')[0]
diver.close()


def get_new(url):
    '''Scroll a section page to the bottom like a human user would and
    return the full rendered page source.'''
    new_diver = webdriver.Chrome('chromedriver.exe', options=options, chrome_options=chrome_options)
    try:
        new_diver.get(url)
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        # The "load_more_tip" element becomes visible once the true bottom is reached.
        check_bottom = new_diver.find_element_by_class_name("load_more_tip")
        while check_bottom.get_attribute('style') == 'display: none;':
            new_diver.execute_script(js)
            # Click the "load more" button whenever it is shown, then keep scrolling.
            obj = new_diver.find_element_by_class_name("post_addmore")
            if obj.get_attribute('style') == 'visibility: visible;':
                obj.click()
            new_diver.execute_script(js)
        return new_diver.page_source
    finally:
        # Fix: always release the browser, even if scrolling/parsing raises.
        new_diver.close()


def mark_url(html_text):
    '''Extract the detail-page URL of every news item on a section page.'''
    mark_tree = etree.HTML(html_text)
    title_url_list = mark_tree.xpath('//div[@class="ndi_main"]/div/div/div/h3/a/@href')
    return title_url_list


def get_new_detail(title_url_list):
    '''Fetch every news detail page in the list and append its title and body
    text to a randomly named local .txt file.'''
    filename = str(random.randint(1000, 9999)) + ".txt"
    with open(filename, "w", encoding="utf-8") as f:
        for title_url in title_url_list:
            detail_diver = webdriver.Chrome('chromedriver.exe', options=options, chrome_options=chrome_options)
            try:
                detail_diver.get(title_url)
                detail_tree = etree.HTML(detail_diver.page_source)
                title = detail_tree.xpath('//div[@id="epContentLeft"]/h1/text()')[0]
                text = ''.join(detail_tree.xpath('//div[@id="endText"]/p/text()'))
                f.write(title)
                f.write(text)
            finally:
                # Fix: the original leaked one Chrome instance per article.
                detail_diver.close()


# The four section URLs to crawl.
url_list = [guonei_url, guoji_url, war_url, hangkong_url]
# Thread pool (multiprocessing.dummy) — one worker per section.
pool = Pool(4)
# Fetch the fully scrolled page source of each section.
data_list = pool.map(get_new, url_list)
# Extract every detail-page URL from each section's source.
title_url_list = pool.map(mark_url, data_list)
# Download and save all detail pages.
pool.map(get_new_detail, title_url_list)
# 使用線程池爬取梨視頻app的視頻
# Use a thread pool to scrape videos (10 of them) from pearvideo.com.
import requests, re, random
from lxml import etree
from multiprocessing.dummy import Pool

# Fix: the original rebound the name `requests` to a Session, shadowing the
# module. Use a distinct name; the Session also pools TCP connections.
session = requests.Session()

url = 'https://www.pearvideo.com/category_4'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    "connection": "close"
}

# Fetch the category page and collect the relative detail-page links.
response_text = session.get(url=url, headers=headers).text
tree = etree.HTML(response_text)
video_url_list = tree.xpath('//ul[@id="listvideoListUl"]/li/div/a/@href | //ul[@id="categoryList"]/li/div/a/@href')
# The hrefs are relative; prefix them with the site root.
video_url_list = ['https://www.pearvideo.com/' + href for href in video_url_list]

'''
The real video address is embedded in the detail page's inline JS, e.g.:
srcUrl="https://video.pearvideo.com/mp4/third/20191023/cont-1615387-11549790-203859-hd.mp4",vdoUrl=srcUrl,
'''

def get_data_url(url):
    '''Extract the real .mp4 address from one video detail page.'''
    response_text = session.get(url=url, headers=headers).text
    data_url = re.findall('srcUrl="(.*?)",vdoUrl=srcUrl,', response_text)[0]
    return data_url

def get_data(data_url):
    '''Download one video and save it under a random 4-digit .mp4 filename.'''
    data = session.get(url=data_url, headers=headers).content
    filename = str(random.randint(1000, 9999)) + ".mp4"
    with open(filename, "wb") as f:
        f.write(data)

# Resolve the real video URLs, then download them concurrently (5 threads).
pool = Pool(5)
data_url_list = pool.map(get_data_url, video_url_list)
pool.map(get_data, data_url_list)