# -*-coding:utf-8 -*-
"""Selenium script that tries to solve a slider ("gap") captcha.

The captcha page exposes a background image and a puzzle-piece image through
elements whose CSS classes start with ``shumei_captcha_``.  Both images are
downloaded, resized, and matched with OpenCV template matching to find the
horizontal offset of the gap; the slider button is then dragged along a
generated accelerate/decelerate track.
"""
import random
import time
from urllib.request import urlretrieve

import cv2
import PIL.Image as image
from PIL import ImageChops
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class Crack(object):
    """Drive a Chrome browser through one slider-captcha challenge."""

    # File names used for the downloaded and the resized captcha images.
    BG_FILENAME = 'bg.jpg'
    FG_FILENAME = 'fg.jpg'
    BG_RESIZED = '1bg.jpg'
    FG_RESIZED = '1fg.png'

    # NOTE(review): apart from ``user-agent`` these look like HTTP request
    # headers rather than Chrome command-line switches, so they presumably
    # have no effect on the browser -- confirm before relying on them.  They
    # are passed through unchanged to keep the launch arguments identical.
    EXTRA_ARGUMENTS = (
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"',
        'upgrade-insecure-requests="1"',
        'sec-fetch-user="?1"',
        'sec-fetch-site="none"',
        'sec-fetch-mode="navigate"',
        'pragma="no-cache"',
        'cache-control="no-cache"',
        'accept-language="en-US,en;q=0.9"',
        'accept-encoding="gzip, deflate, br"',
        ':scheme="https"',
        ':method="GET"',
        ':authority="www.xiaohongshu.com"',
        'accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"',
    )

    def __init__(self, url, driver_path=r'D:\CaiPan\Chrome\chromedriver.exe'):
        """Start Chrome and remember the captcha page to visit.

        :param url: address of the captcha page opened by :meth:`open`.
        :param driver_path: path of the chromedriver executable.  The default
            is the location the script was originally written for.
        """
        self.options = Options()
        self.options.add_argument('--disable-gpu')
        self.options.add_argument("--no-sandbox")
        for argument in self.EXTRA_ARGUMENTS:
            self.options.add_argument(argument)
        self.url = url
        self.browser = webdriver.Chrome(driver_path, chrome_options=self.options)
        # Explicit wait helper with a 100 second timeout.
        self.wait = WebDriverWait(self.browser, 100)
        # Pixel margin subtracted from the gap position in :meth:`crack`.
        self.BORDER = 6

    def open(self, cookies=None):
        """Load the captcha page.

        :param cookies: optional iterable of cookie dicts (``name``/``value``
            keys, as accepted by ``add_cookie``).  When given, the cookies are
            added after the first page load and the page is loaded again.
            Pass session cookies in from the caller instead of hard-coding
            them in this file.
        """
        self.browser.get(self.url)
        if cookies:
            for cookie in cookies:
                self.browser.add_cookie(cookie)
            self.browser.get(self.url)
        # Element lookups made after this call wait up to 30 seconds.
        self.browser.implicitly_wait(30)

    def get_size(self):
        """Print and return the browser window size.

        :return: the dict returned by ``get_window_size`` (width/height).
        """
        screenSize = self.browser.get_window_size()
        print(f"當前屏幕尺寸爲{screenSize}")
        return screenSize

    def _fetch_captcha_image(self, class_name, filename, done_message,
                             max_attempts=5):
        """Find one captcha image element and download its ``src``.

        :param class_name: CSS class of the ``img`` element.
        :param filename: local file the image is saved to.
        :param done_message: text printed once the download finished.
        :param max_attempts: how many times to look for the element / its
            ``src`` before giving up.
        :return: x coordinate at 80% of the element's width (page pixels).
        :raises TimeoutException: when the image could not be obtained.
        """
        for _ in range(max_attempts):
            try:
                element = self.browser.find_element(By.CLASS_NAME, class_name)
            except NoSuchElementException:
                # Not rendered yet: pause briefly and look again.
                time.sleep(0.5)
                continue
            # Give the widget time to finish loading the picture.
            time.sleep(2)
            location = element.location
            size = element.size
            print(location)
            print(size)
            start_x = location["x"] + int(size['width']) * 0.2
            start_y = location["y"] + int(size['height']) * 0.5
            end_x = location['x'] + int(size['width']) * 0.8
            end_y = location['y'] + int(size['height']) * 0.5
            print(start_x, start_y, end_x, end_y)
            image_url = element.get_attribute("src")
            if image_url:
                print(image_url)
                urlretrieve(url=image_url, filename=filename)
                print(done_message)
                return end_x
        raise TimeoutException(
            'captcha image %r was not available after %d attempts'
            % (class_name, max_attempts))

    def get_images(self):
        """Download the puzzle-piece and background captcha images.

        The piece is saved as ``fg.jpg`` and the background as ``bg.jpg``.

        :return: difference between the 80%-width x coordinate of the
            background element and that of the piece element.
        :raises TimeoutException: when either image cannot be obtained.
        """
        fg_end_x = self._fetch_captcha_image(
            "shumei_captcha_loaded_img_fg", self.FG_FILENAME, '缺口圖片下載完成')
        bg_end_x = self._fetch_captcha_image(
            "shumei_captcha_loaded_img_bg", self.BG_FILENAME, '背景圖片下載完成')
        distance = bg_end_x - fg_end_x
        print(distance)
        return distance

    def get_gap(self, img1, img2):
        """Return the x offset of the first column where two images differ.

        :param img1: image without the gap.
        :param img2: image with the gap.
        :return: x of the first differing pixel at or after column 15, or 15
            when no pixel differs.
        """
        left = 15
        for x in range(left, img1.size[0]):
            for y in range(img1.size[1]):
                if not self.is_pixel_equal(img1, img2, x, y):
                    return x
        return left

    def is_pixel_equal(self, img1, img2, x, y):
        """Tell whether two images have (nearly) the same colour at a pixel.

        :param img1: first image.
        :param img2: second image.
        :param x: pixel column.
        :param y: pixel row.
        :return: True when each of the first three channels differs by less
            than the threshold of 60.
        """
        pix1 = img1.load()[x, y]
        pix2 = img2.load()[x, y]
        threshold = 60
        # The absolute difference is compared with the threshold; the
        # comparison must stay outside abs().
        return all(
            abs(channel1 - channel2) < threshold
            for channel1, channel2 in zip(pix1[:3], pix2[:3])
        )

    def _locate_gap(self):
        """Download and resize both images, then return the gap's x offset."""
        self.get_images()
        self.fixed_size(self.BG_FILENAME, self.BG_RESIZED, 400, 200)
        self.fixed_size(self.FG_FILENAME, self.FG_RESIZED, 60, 200)
        return self.FindPic(self.BG_RESIZED, self.FG_RESIZED)

    def crack(self):
        """Open the page, locate the gap and print the drag track.

        NOTE(review): the gap is located with the same template matching that
        :meth:`process` uses, because :meth:`get_images` returns a single
        number and not the two location lists the earlier merge-based code
        expected -- confirm this matches the intended behaviour.
        """
        self.open()
        gap = self._locate_gap()
        print('缺口位置', gap)
        track = self.get_track(gap - self.BORDER)
        print('滑動滑塊')
        print(track)

    def get_merge_image(self, filename, location_list):
        """Rebuild an image from 10px-wide slices and overwrite the file.

        :param filename: image file to read; the merged result is saved back
            to the same path.
        :param location_list: dicts with ``x`` and ``y`` keys.  Entries with
            ``y == -100`` select slices from rows 100-200 (pasted into the
            upper half); entries with ``y == 0`` select slices from rows
            0-100 (pasted into the lower half).
        :return: the merged 400x200 RGB image.
        """
        upper_slices = []
        lower_slices = []
        # Close the source before the same path is written again below.
        with image.open(filename) as source:
            for location in location_list:
                left = abs(location['x'])
                if location['y'] == -100:
                    upper_slices.append(source.crop((left, 100, left + 10, 200)))
                if location['y'] == 0:
                    lower_slices.append(source.crop((left, 0, left + 10, 100)))
        new_im = image.new('RGB', (400, 200))
        for row_top, slices in ((0, upper_slices), (100, lower_slices)):
            x_offset = 0
            for piece in slices:
                new_im.paste(piece, (x_offset, row_top))
                x_offset += piece.size[0]
        new_im.save(filename)
        print(new_im)
        return new_im

    def fixed_size(self, infile, outfile, width, height):
        """Resize an image file to a fixed size.

        :param infile: path of the image to read.
        :param outfile: path the resized image is written to.
        :param width: target width in pixels.
        :param height: target height in pixels.
        """
        with image.open(infile) as im:
            # LANCZOS is the filter formerly exposed as ANTIALIAS, a name
            # that recent Pillow releases no longer provide.
            out = im.resize((width, height), image.LANCZOS)
            out.save(outfile)

    def FindPic(self, target, template):
        """Find where the template matches best inside the target image.

        :param target: path of the background image.
        :param template: path of the piece image to look for.
        :return: x coordinate of the best match (the maximum location of
            ``cv2.matchTemplate`` with ``TM_CCOEFF_NORMED``).
        :raises FileNotFoundError: when either image cannot be read.
        """
        target_bgr = cv2.imread(target)
        if target_bgr is None:
            raise FileNotFoundError('cannot read image: %s' % target)
        target_gray = cv2.cvtColor(target_bgr, cv2.COLOR_BGR2GRAY)
        template_gray = cv2.imread(template, 0)
        if template_gray is None:
            raise FileNotFoundError('cannot read image: %s' % template)
        res = cv2.matchTemplate(target_gray, template_gray, cv2.TM_CCOEFF_NORMED)
        value = cv2.minMaxLoc(res)
        print('*****')
        print(value)
        # minMaxLoc returns (min_val, max_val, min_loc, max_loc).
        return value[-1][0]

    def get_slider(self):
        """Return the slider button element, polling until it exists.

        :return: the slider WebElement.
        """
        while True:
            try:
                return self.browser.find_element(
                    By.XPATH, "//div[@class='shumei_captcha_slide_btn']")
            except NoSuchElementException:
                # Only "not there yet" is retried; other driver errors
                # (and Ctrl-C) propagate to the caller.
                time.sleep(0.5)

    def get_track(self, distance):
        """Build the list of per-step x offsets used to drag the slider.

        The motion accelerates (a = 4) for the first 4/5 of the distance and
        decelerates (a = -3.5) afterwards, with a time step of 0.2.

        :param distance: total offset to travel.
        :return: list of integer steps whose sum equals ``round(distance)``;
            empty when ``distance`` is not positive.
        """
        print("=" * 10, distance)
        track = []
        current = 0
        mid = distance * 4 / 5
        print(mid)
        t = 0.2
        v = 0
        emitted = 0
        while current < distance:
            a = 4 if current < mid else -3.5
            v0 = v
            v = v0 + a * t
            move = v0 * t + 1 / 2 * a * t * t
            # Never travel past the target.
            current = min(current + move, distance)
            # Round the cumulative position rather than each step so that
            # rounding errors do not accumulate.
            step = round(current) - emitted
            emitted += step
            track.append(step)
        return track

    def move_to_gap(self, slider, track):
        """Drag the slider along the given track and release it.

        :param slider: slider WebElement.
        :param track: iterable of x offsets, one per mouse move.
        """
        ActionChains(self.browser).click_and_hold(slider).perform()
        try:
            for x in track:
                ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
            time.sleep(0.8)
        finally:
            # Always let go of the mouse button, even if a move failed.
            ActionChains(self.browser).release().perform()

    def _solve_once(self):
        """Run one locate-and-drag attempt and return the resulting page."""
        x = self._locate_gap()
        slider = self.get_slider()
        track = self.get_track(x)
        self.move_to_gap(slider, track)
        time.sleep(2)
        return self.result_html()

    def result_html(self):
        """Wrap the current page in an ``HtmlResponse`` and check the result.

        When the page contains the failure message another attempt is made;
        otherwise the page text is printed.

        :return: the response of the first attempt that does not show the
            failure message.
        """
        response = HtmlResponse(url=self.browser.current_url,
                                body=self.browser.page_source,
                                encoding='utf-8')
        if '驗證失敗,請從新再試' in response.text:
            return self._solve_once()
        print(response.text)
        return response

    def close(self):
        """Shut the browser down, including the chromedriver process."""
        self.browser.quit()

    def process(self):
        """Solve the captcha on the already opened page, then close Chrome.

        :return: the final page as an ``HtmlResponse``.
        """
        try:
            return self._solve_once()
        finally:
            self.close()


def main():
    """Open the captcha page and run one solving session."""
    url = 'https://www.xiaohongshu.com/web-login/captcha?redirectPath=http%3A%2F%2Fwww.xiaohongshu.com%2Fuser%2Fprofile%2F590d4d5950c4b4281396ea20'
    c = Crack(url)
    c.open()
    c.get_size()
    c.process()


if __name__ == '__main__':
    main()
# 缺口驗證碼的驗證 (verification of the slider-gap captcha)