Python爬蟲學習第一天--環境準備

時間 2019-12-11

原文鏈接

BeautifulSoup

　　from bs4 import BeautifulSoup
　　soup=BeautifulSoup(html,'lxml')#html是網頁源代碼字符串，不是URL；lxml解析庫速度快，文檔容錯能力強
　　soup=BeautifulSoup(html,'xml')
　　soup=BeautifulSoup(html,'html5lib')

　　soup.prettify()#自動補全

標籤選擇器
　　from bs4 import BeautifulSoup
　　import lxml
　　import requests
　　url='https://movie.douban.com/'
　　content=requests.get(url).text #以文本形式打印出來
　　soup=BeautifulSoup(content,'lxml') #利用beautifulsoup解析
　　print(soup)
　　print(soup.head)
　　print(soup.title)
　　print(soup.p)
　　print(soup.div)

獲取標籤屬性
　　soup.p['class']

獲取標籤內容
　　soup.title.text

標準選擇器
　　soup.find_all(self, name=None, attrs={}, recursive=True, text=None,limit=None, **kwargs)#以列表返回多個元素
　　name#Tag標籤
　　attrs#通過字典匹配內容

　　soup.find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)#返回單個元素

CSS選擇器

　　soup.select(self, selector, _candidate_generator=None, limit=None)
　　selector
　　　　選擇class 用.表示
　　　　選擇id 用#表示

獲取屬性
　　soup.select('#id')#按id選擇，返回列表
　　soup.select('.class')#按class選擇，返回列表

獲取內容
　　soup.select('#id')[0].get_text()#select返回列表，需先取出元素

PyQuery庫
　　pip3 install pyquery#安裝PyQuery
　　from pyquery import PyQuery as pq
　　dpc=pq('html')
　　dpc=pq(url='http://www.baidu.com/')
　　dpc=pq(filename=r'C:/123.txt')

CSS選擇器
　　class ---> .
　　id ---> #

　　dpc2=dpc('#head .head_wrapper #lg')
　　dpc3=dpc2.find("img")
　　print(dpc3)

遍歷元素
　　dpc4=dpc('#head .head_wrapper #lg').items()#items()方法
　　for it in dpc4:
　　　　print(it)

獲取屬性
　　dpc5=dpc2.find("img").attr('href')

獲取標籤的文本
　　dpc6=dpc2.find("img").text()
　　dpc6=dpc2.find("img").html()

DOM操作
　　dpc2.addClass()#class標籤
　　dpc2.removeClass()

　　dpc2.attr()#更改屬性
　　dpc2.css()
　　dpc2.find("img").remove()

僞類選擇器
　　dpc2.find('li:first-child')

selenium庫
　　#自動化測試工具，驅動瀏覽器，解決JS渲染

pip3 install selenium
from selenium import webdriver
browser=webdriver.Chrome()#聲明瀏覽器對象,需要提供ChromeDriver驅動
browser=webdriver.Firefox()
browser=webdriver.Edge()
browser=webdriver.Safari()

訪問頁面
　　browser.get('http://www.baidu.com')
　　browser.close()#關閉瀏覽器

查找單個元素
　　input_f=browser.find_element_by_id('kw')#'kw'關鍵字需要在網頁源代碼尋找
　　browser.find_element_by_css_selector()
　　browser.find_element_by_tag_name()
　　browser.find_element_by_xpath('q')
　　browser.find_element_by_class_name()
　　browser.find_element_by_partial_link_text()

查找多個元素
input_f=browser.find_elements_by_id('kw')#以列表返回結果

元素交互操作
　　input_f.send_keys('關鍵字')#在查詢框中輸入關鍵字
　　input_f.clear()
　　input_f.submit()
　　input_f.click()

交互動作：將動作附加到動作鏈中串行執行
　　from selenium.webdriver import ActionChains
　　actions=ActionChains(browser)
　　actions.drag_and_drop()
　　actions.click_and_hold()

執行JavaScript
　　browser.execute_script('JS代碼')#重點掌握的方法

獲取元素信息

獲取屬性
　　browser.find_element_by_id('kw').get_attribute('class')
獲取文本值
　　browser.find_element_by_id('kw').text #element單個返回值
獲取ID、位置、標籤名、大小
　　browser.find_element_by_id('kw').id
　　browser.find_element_by_id('kw').location
　　browser.find_element_by_id('kw').tag_name
　　browser.find_element_by_id('kw').size

Frame
　　browser.switch_to.parent_frame()
　　
等待瀏覽器

隱式等待
　　browser.implicitly_wait(10)#元素未加載完成時會等待一定時間來加載
顯式等待
　　from selenium.webdriver.support.ui import WebDriverWait
　　from selenium.webdriver.support import expected_conditions
　　wait=WebDriverWait(browser,10)#聲明等待對象
　　wait.until(expected_conditions.title_is('chenwei'))#expected_conditions等待條件

瀏覽器前進後退
　　browser.back()
　　time.sleep(1)
　　browser.forward()

Cookies
　　browser.get_cookies()
　　browser.add_cookie(name)
　　browser.delete_all_cookies()

選項卡管理
　　browser.execute_script('window.open()')#執行JS腳本打開新選項卡
　　browser.get('https://taobao.com')
　　print(browser.window_handles)#顯示目前選項卡的name
　　browser.switch_to.window(browser.window_handles[1])#切換至新選項卡
　　time.sleep(2)
　　browser.get('http://www.baidu.com')

異常處理
　　from selenium.common.exceptions import TimeoutException
　　try:
　　　　browser.get('https://taobao.com')
　　except TimeoutException:
　　　　print('鏈接超時')
　　finally:
　　　　browser.close()