土福曾說,百度指數很難抓,在淘寶上面是20塊1個關鍵字:
css
哥那麼叼的人怎麼會被他嚇到,因而乎花了零零碎碎加起來大約2天半搞定,在此鄙視一下土福。
谷歌圖像識別 tesseract-ocr
pip3 install pillow
pip3 install pyocr
selenium 2.45
Chrome 47.0.2526.106 m or Firefox 32.0.1
chromedriver.exe
def _read_account(path="../baidu/account.txt"):
    """Return the stripped lines of the account file (account, then password).

    On any read problem, or when fewer than two lines are present, the error
    is printed, the user is prompted to fix the file, and the program exits.
    """
    try:
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(path) as account_file:
            account = [line.strip() for line in account_file]
        if len(account) < 2:
            raise ValueError(
                "account.txt needs the account on line 1 and the password on line 2"
            )
    except (OSError, ValueError) as err:
        print(err)
        input("請正確在account.txt裏面寫入帳號密碼")
        # Same effect as exit(), without depending on the ``site`` builtins.
        raise SystemExit
    return account


def _submit_credentials():
    """Clear the login form, fill it from the account file and submit it."""
    user_box = browser.find_element_by_id("TANGRAM__PSP_3__userName")
    password_box = browser.find_element_by_id("TANGRAM__PSP_3__password")
    user_box.clear()
    password_box.clear()

    account = _read_account()
    user_box.send_keys(account[0])
    password_box.send_keys(account[1])
    browser.find_element_by_id("TANGRAM__PSP_3__submit").click()


def openbrowser():
    """Open Chrome on the Baidu login page and log in interactively.

    Stores the driver in the module-level ``browser``. Credentials come from
    ``../baidu/account.txt``. The user confirms on the console whether the
    login worked; on failure they can resubmit the credentials (``0``) or
    solve a captcha by hand in the browser (``1``). Returns once the user
    answers ``y``.
    """
    global browser
    url = "https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F"
    # webdriver.Firefox() can be used here instead of Chrome.
    browser = webdriver.Chrome()
    browser.get(url)

    _submit_credentials()

    print("等待網址加載完畢...")
    prompt = "請觀察瀏覽器網站是否已經登錄(y/n):"
    select = input(prompt)
    while True:
        if select in ("y", "Y"):
            print("登錄成功!")
            print("準備打開新的窗口...")
            break
        if select in ("n", "N"):
            selectno = input("帳號密碼錯誤請按0,驗證碼出現請按1...")
            if selectno == "0":
                # Wrong credentials: reload them from the file and resubmit.
                _submit_credentials()
                # Ask again, otherwise a successful retry could never be
                # confirmed (the original left ``select`` stuck at "n").
                select = input(prompt)
            elif selectno == "1":
                # The captcha has to be solved by hand in the browser.
                input("請在瀏覽器中輸入驗證碼並登錄...")
                select = input(prompt)
            # Any other answer: loop and ask for 0/1 again.
        else:
            print("請輸入「y」或者「n」!")
            select = input(prompt)
# Open Baidu Index in a new window by running JavaScript in the current page.
js = 'window.open("http://index.baidu.com");'
browser.execute_script(js)

# window_handles is a list of every open window; this code relies on the
# newly opened window being the last entry.
handles = browser.window_handles
# switch_to.window replaces the deprecated switch_to_window().
browser.switch_to.window(handles[-1])
# Type the keyword into the search box and submit the search.
search_box = browser.find_element_by_id("schword")
search_box.clear()
search_box.send_keys(keyword)
# <input type="submit" value="" id="searchWords" onclick="searchDemoWords()">
browser.find_element_by_id("searchWords").click()
time.sleep(2)

browser.maximize_window()

# Map the menu choice to a number of days. Keep asking until the answer is
# valid: a non-numeric or out-of-range answer used to crash or leave day == 0,
# which builds an XPath that matches nothing.
days_by_choice = {0: 7, 1: 30, 2: 90, 3: 180}
while True:
    try:
        day = days_by_choice[int(input("查詢7天請按0,30天請按1,90天請按2,半年請按3:"))]
    except (ValueError, KeyError):
        continue
    break

# Click the link whose rel attribute equals the chosen number of days.
sel = '//a[@rel="' + str(day) + '"]'
browser.find_element_by_xpath(sel).click()
# Pause before continuing; without it the following steps run too early.
time.sleep(2)
# Collect every <rect> under #trend and keep the third one as the element
# that the mouse offsets are measured from.
trend_rects = browser.find_elements_by_css_selector("#trend rect")
xoyelement = trend_rects[2]
第一個點的橫座標爲1031.66666
第二個點的橫座標爲1234
因此7天兩個座標之間的差爲:202.33,其餘的天數相似
from selenium.webdriver.common.action_chains import ActionChains

# Move the mouse to the point (x_0, y_0) relative to xoyelement.
ActionChains(browser).move_to_element_with_offset(xoyelement, x_0, y_0).perform()
也就是矩形的左上角,這裏是不會加載js顯示彈出框的,因此要給橫座標+1:
# Start one pixel to the right of the rectangle's top-left corner; per the
# surrounding article, the corner itself does not trigger the popup.
x_0 = 1
y_0 = 0
# Horizontal distance between two neighbouring data points for each supported
# range. Any other value of ``day`` leaves x_0 unchanged.
step_by_days = {7: 202.33, 30: 41.68, 90: 13.64, 180: 6.78}
x_step = step_by_days.get(day, 0)

# One iteration per day in the selected range.
for i in range(day):
    x_0 = x_0 + x_step
# <div class="imgtxt" style="margin-left:-117px;"></div>
imgelement = browser.find_element_by_xpath('//div[@id="viewbox"]')

# Position of the element on the page.
locations = imgelement.location
print(locations)
# Size of the element.
sizes = imgelement.size
print(sizes)

# Crop box as (left, upper, right, lower), covering the whole element.
rangle = (
    int(locations['x']),
    int(locations['y']),
    int(locations['x'] + sizes['width']),
    int(locations['y'] + sizes['height']),
)
截取的圖形爲:
- 將整個屏幕截圖下來
- 打開截圖用上面獲得的這個座標rangle進行裁剪
# Extra left margin that grows with the keyword length.
add_length = (len(keyword) - 2) * sizes['width'] / 15

# Crop box as (left, upper, right, lower), narrowed to the part of the
# element that holds the index number.
rangle = (
    int(locations['x'] + sizes['width'] / 4 + add_length),
    int(locations['y'] + sizes['height'] / 2 - 40),
    int(locations['x'] + sizes['width'] * 2 / 3),
    int(locations['y'] + sizes['height'] - 40),
)
# <div class="imgtxt" style="margin-left:-117px;"></div>
imgelement = browser.find_element_by_xpath('//div[@id="viewbox"]')

# Position and size of the element on the page.
locations = imgelement.location
print(locations)
sizes = imgelement.size
print(sizes)

# Extra left margin that grows with the keyword length.
add_length = (len(keyword) - 2) * sizes['width'] / 15

# Crop box as (left, upper, right, lower) around the index number.
rangle = (
    int(locations['x'] + sizes['width'] / 4 + add_length),
    int(locations['y'] + sizes['height'] / 2 - 40),
    int(locations['x'] + sizes['width'] * 2 / 3),
    int(locations['y'] + sizes['height'] - 40),
)

# Screenshot the browser, then cut the box out of the screenshot.
path = "../baidu/" + str(num)
browser.save_screenshot(str(path) + ".png")

# The context manager closes the screenshot file once the crop is saved.
with Image.open(str(path) + ".png") as img:
    # Screenshots may carry an alpha channel, which JPEG cannot store;
    # convert to RGB so the save does not fail.
    jpg = img.crop(rangle).convert("RGB")
    jpg.save(str(path) + ".jpg")
# Double the size of the cropped image before OCR.
jpgzoom = Image.open(str(path) + ".jpg")
(x, y) = jpgzoom.size
# Derive the target from the actual crop size instead of hard-coding 146x58,
# which is only correct for a 73x29 crop.
x_s = x * 2
y_s = y * 2
# Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
out = jpgzoom.resize((x_s, y_s), Image.LANCZOS)
# NOTE: the file keeps its "zoom.jpg" name but is written in PNG format.
out.save(path + 'zoom.jpg', 'png', quality=95)
原圖大小請 右鍵->屬性->詳細信息 查看,我的是長73像素,寬29像素
# Run OCR on the enlarged image and keep the text if anything was recognised.
index = []
image = Image.open(str(path) + "zoom.jpg")
code = pytesseract.image_to_string(image)
if code:
    index.append(code)