目錄
import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree


class HtmlSprider(threading.Thread):
    """Producer thread: download listing pages and queue the images on them.

    Takes page URLs from ``page_queue`` and puts ``(img_url, img_name)``
    tuples on ``img_queue``.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(HtmlSprider, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse pages until ``page_queue`` is drained."""
        while True:
            # get_nowait() instead of empty()+get(): with several producers,
            # another thread can take the last URL between the two calls and
            # leave this one blocked in get() forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_page(url)
            except requests.RequestException as exc:
                # One unreachable page should not stop the remaining pages.
                print('failed to fetch %s: %s' % (url, exc))

    def parse_page(self, url):
        """Fetch ``url`` and enqueue every non-gif image found on the page."""
        # Without a timeout a stalled connection would block this thread
        # indefinitely.
        response = requests.get(url, headers=self.headers, timeout=10)
        html = etree.HTML(response.text)
        imgs = html.xpath("//div[@class='page-content text-center']//a//img")
        for img in imgs:
            if img.get('class') == 'gif':
                continue
            sources = img.xpath(".//@data-original")
            if not sources:
                # No data-original attribute, so there is nothing to download.
                continue
            img_url = sources[0]
            suffix = os.path.splitext(img_url)[1]
            suffix = re.sub(r"!dta", "", suffix)
            alts = img.xpath(".//@alt")
            alt = alts[0] if alts else ''
            # Strip punctuation and path separators from the file name.
            alt = re.sub(r'[,。??,/\\·]', '', alt)
            img_name = alt + suffix
            self.img_queue.put((img_url, img_name))


class DownloadPicture(threading.Thread):
    """Consumer thread: download every ``(url, filename)`` from ``img_queue``.

    ``producers`` is an optional iterable of the threads that feed
    ``img_queue``. When given, the consumer keeps running until all of them
    have finished, so images queued late are not missed.
    """

    def __init__(self, page_queue, img_queue, *args, producers=(), **kwargs):
        super(DownloadPicture, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue
        self._producers = tuple(producers)

    def _producers_finished(self):
        """Return True when no further images can be queued."""
        if not self.page_queue.empty():
            return False
        return not any(p.is_alive() for p in self._producers)

    def run(self):
        """Download queued images until the producers are done and the queue is empty."""
        while True:
            # Sample the producer state BEFORE polling the queue: if the
            # producers were already finished and the queue is still empty
            # afterwards, nothing more can arrive.
            finished = self._producers_finished()
            try:
                url, filename = self.img_queue.get(timeout=1)
            except Empty:
                if finished:
                    return
                continue
            try:
                request.urlretrieve(url, 'images/' + filename)
            except OSError as exc:
                # URLError is an OSError; skip the bad image, keep the thread.
                print('failed to download %s: %s' % (url, exc))
                continue
            print(filename + ' 下載完成!')


def main():
    """Queue the listing pages and start five producer and five consumer threads."""
    page_queue = Queue(100)
    img_queue = Queue(500)
    # urlretrieve does not create the target directory.
    os.makedirs('images', exist_ok=True)
    for x in range(1, 10):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        page_queue.put(url)

    producers = [HtmlSprider(page_queue, img_queue) for _ in range(5)]
    for t in producers:
        t.start()

    for _ in range(5):
        t = DownloadPicture(page_queue, img_queue, producers=producers)
        t.start()


if __name__ == '__main__':
    main()
AJAX(Asynchronous JavaScript And XML)
,翻譯爲異步JavaScript和XML。是在後臺與服務器進行少量數據交換,Ajax 可以使網頁實現異步更新。這意味着可以在不重新加載整個網頁的情況下,對網頁的某部分進行更新。我們有兩種辦法來處理這種情況:
selenium
和chromedriver
來模擬登錄。這樣雖然繁瑣,不過很穩定。Selenium
至關因而一個機器人。能夠模擬人類在瀏覽器上的一些行爲,自動處理瀏覽器上的一些行爲,好比點擊,填充數據,刪除cookie
等。chromedriver是一個驅動Chrome瀏覽器的驅動程序,使用他才能夠驅動瀏覽器。selenium
快速入門與基本操做selenium文檔官網python
from selenium import webdriver
import time

# Path of the chromedriver executable
driver_path = "D:\\chromedriver\\chromedriver.exe"
# Pass the chromedriver path when creating the browser driver.
# The driver gets its own name so the handle is not lost when the
# input element is looked up below.
driver = webdriver.Chrome(executable_path=driver_path)
# Visit the page
driver.get("http://www.baidu.com")
inputTag = driver.find_element_by_id("kw")
inputTag.send_keys("python")
time.sleep(4)
driver.close()
:關閉當前頁面driver.quit()
:退出瀏覽器find_element_by_id
:根據id來查找元素find_element_by_class_name
:根據類名查找元素find_element_by_name
:根據name屬性的值來查找元素find_element_by_tag_name
:根據標籤名來查找元素find_element_by_xpath
:根據xpath語法來獲取元素find_element_by_css_selector
:根據css選擇器選擇元素注意:把以上方法中的element
改成elements
就能夠獲取全部相關元素了git
# 1. First locate the form field that has to be filled in.
inputTag = driver.find_element_by_id("kw")
# 2. Use the send_keys method to send the value to type.
inputTag.send_keys("python")
inputTag.clear()
# Look up the element whose name attribute is "rememberMe", then click it.
rememberTag = driver.find_element_by_name("rememberMe")
rememberTag.click()
click_and_hold(element)
:點擊但不鬆開鼠標context_click(element)
:右鍵點擊double_click(element)
:雙擊inputTag = driver.find_element_by_id('kw') submitTag = driver.find_element_by_id('su') ## 咱們能夠使用鼠標行爲鏈類ActionChains來完成 actions = ActionChains(driver) actions.move_to_element(inputTag) actions.send_keys_to_element(inputTag,'python') actions.move_to_element(submitTag) actions.click(submitTag) actions.perform()
Cookie
操做
# Print every cookie returned by the driver.
for cookie in driver.get_cookies():
    print(cookie)
value = driver.get_cookie(key)
driver.delete_all_cookies()
driver.delete_cookie(key)
現在的網頁越來越多採用了 Ajax 技術,這樣程序便不能確定何時某個元素完全加載出來了。如果實際頁面等待時間過長導致某個 DOM 元素還沒出來,但是你的代碼直接使用了這個 WebElement,那麼就會拋出 NoSuchElementException 異常。因此,selenium 提供了以下兩種解決辦法:
driver = webdriver.Chrome(executable_path=driver_path)
# Set an implicit wait of 10 on the driver before loading the page.
driver.implicitly_wait(10)
# Request the page
driver.get("https://www.baidu.com/")
顯式等待是指某個條件成立後才執行獲取元素的操作。也可以在等待的時候指定一個最大的時間,如果超過這個時間那麼就拋出一個異常。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
    # Explicit wait: WebDriverWait is given the driver and 10, and waits
    # until the element with id "myDynamicElement" is present.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))
    )
finally:
    # Quit the browser whether or not the wait succeeded.
    driver.quit()
一些其餘的等待條件:ajax
presence_of_element_located
:某個元素已經加載完畢presence_of_all_elements_located
:網頁中所有滿足條件的元素都加載完畢了element_to_be_clickable
:某個元素能夠點擊了from selenium import webdriver driver_path = r"D:\chromedriver\chromedriver.exe" driver = webdriver.Chrome(executable_path=driver_path) driver.get("https://baidu.com/") ##打開新的一個頁面 driver.execute_script("window.open('http://www.douban.com/')") ##切換到這個新的頁面中 driver.driver.switch_to.window(driver.window_handles[1])
from selenium import webdriver

# Build Chrome options carrying a --proxy-server argument.
options = webdriver.ChromeOptions()
options.add_argument("--proxy-server=http://110.52.235.241:9999")
driver_path = r"D:\chromedriver\chromedriver.exe"
# Pass the options when creating the driver, then load the page.
driver = webdriver.Chrome(executable_path=driver_path,options=options)
driver.get("http://www.ip138.com/")
tesseract
tesseract
安裝與配置GIT官網chrome
在Windows下把tesseract.exe所在的路徑添加到PATH環境變量中。還有一個環境變量需要設置的是,要把訓練的數據文件路徑也放到環境變量中。在環境變量中,添加一個TESSDATA_PREFIX=C:\path_to_tesseractdata\tessdata。
tesseract 圖片路徑 文件路徑
文件路徑不需要添加後綴名也會默認爲.txt

import pytesseract
from PIL import Image

# Path of the tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"D:\tesseract\tesseract.exe"
# Open the image file with PIL; the with-block closes it after recognition.
with Image.open(r"C:\Users\DELL\Desktop\a.png") as image:
    # image_to_string() converts the image into a string
    text = pytesseract.image_to_string(image)
print(text)