import requests
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}


def get_ID(pages):
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    ID_list = []
    for every_page in range(1, pages + 1):
        params = {
            "on": "true",
            "page": str(every_page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        data = requests.post(url=url, params=params, headers=headers).json()
        for each_dict in data["list"]:
            ID_list.append(each_dict['ID'])
        time.sleep(0.1)  # throttle the requests
    return ID_list


def get_all_detail(ID_list):
    url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"
    with open('化妝品生產許可證信息.txt', 'a', encoding='utf-8') as f:
        for ID in ID_list:
            params = {"id": ID}
            data = requests.post(url=url, params=params, headers=headers).text
            f.write(data + '\n')
            time.sleep(0.1)  # throttle the requests
    print("Data written to file successfully!")


ID_list = get_ID(10)
get_all_detail(ID_list)
import requests

# Spoof the User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
# Target url
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
# Build the request parameters
area = input('Enter a place name: ')
params = {
    "cname": "",
    "pid": "",
    "keyword": area,
    "pageIndex": "1",
    "pageSize": "10",
}
# Send the request and get the response data
data = requests.get(url=url, params=params, headers=headers).json()
print(data)
import re
import os
import time
import requests
from urllib import request

# Create the folder for the image data
if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')

url = "https://www.qiushibaike.com/pic/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
page_text = requests.get(url=url, headers=headers).text
img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt.*?</div>', page_text, re.S)
for img_url in img_url_list:
    img_url = 'https:' + img_url
    img_name = img_url.split('/')[-1]
    img_path = './qiutu/' + img_name
    request.urlretrieve(img_url, img_path)
    print(img_path, 'downloaded successfully!')
    time.sleep(0.1)  # throttle the requests
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}


def parse_content(url):
    # Fetch the chapter body page
    page_text = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page_text, 'lxml')
    # Locate the content tag
    ele = soup.find('div', class_='chapter_content')
    # Extract the text inside the tag
    content = ele.text
    return content


if __name__ == "__main__":
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    response = requests.get(url=url, headers=headers)
    page_text = response.text
    # Build the soup object
    soup = BeautifulSoup(page_text, 'lxml')
    # Parse out the chapter links
    a_eles = soup.select('.book-mulu > ul > li > a')
    print(a_eles)
    cap = 1
    for ele in a_eles:
        print('Start downloading chapter %d' % cap)
        title = ele.string
        content_url = 'http://www.shicimingju.com' + ele['href']
        content = parse_content(content_url)
        # Append each chapter so earlier chapters are not overwritten
        with open('./sanguo.txt', 'a', encoding='utf-8') as fp:
            fp.write(title + ":" + content + '\n\n\n\n\n')
        print('Finished downloading chapter %d' % cap)
        cap += 1
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

if __name__ == '__main__':
    url = 'https://bj.58.com/ershoufang/?PGTID=0d200001-0000-1376-eb9f-25ca6cacedce&ClickID=1'
    page_text = requests.get(url=url, headers=headers).text
    # Parse the data
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
    for li in li_list:
        title = li.xpath('./div[2]/h2/a/text()')[0].strip()
        print(title)
import requests, os
from lxml import etree
from urllib import request

# Create an empty folder for the image data
if not os.path.exists('./images'):
    os.mkdir('./images')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
url = 'http://pic.netbian.com/4kmeinv/'
# Fetch the response data
page_text = requests.get(url=url, headers=headers).text
# Instantiate the etree object
tree = etree.HTML(page_text)
# Parse with xpath
li_list = tree.xpath('//div[@class="slist"]/ul/li')
for li in li_list:
    img_name = li.xpath('./a/img/@alt')[0]
    # Fix the garbled Chinese: re-encode the latin-1 bytes as gbk
    img_name = img_name.encode('ISO-8859-1').decode('gbk')
    img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_path = './images/' + img_name + '.jpg'
    request.urlretrieve(url=img_url, filename=img_path)
    print("Download complete!!!")
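The per-string encode/decode dance above works, but an alternative worth knowing (a sketch, under the assumption that pic.netbian.com really serves gbk-encoded pages) is to declare the response encoding once, before reading .text:

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
# Assumption: the site serves gbk. Declaring the encoding up front means
# names parsed out of page_text need no further per-string fixing.
response = requests.get(url='http://pic.netbian.com/4kmeinv/', headers=headers)
response.encoding = 'gbk'
page_text = response.text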
from lxml import etree
from urllib import request
import requests
import base64
import os

if not os.path.exists('./jiandan'):
    os.mkdir('./jiandan')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
url = 'http://jandan.net/ooxx/page-62'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')
for img_hash in img_hash_list:
    # The real image url is base64-encoded in the page
    img_url = "http:" + base64.b64decode(img_hash).decode('utf8')
    # Build the local path for the image
    img_path = './jiandan/' + img_url.split('/')[-1]
    # Persist to disk
    request.urlretrieve(url=img_url, filename=img_path)
    print("Download complete!!", img_url)
print('over!')
import requests
import random
import os
from lxml import etree

if not os.path.exists('./jianli'):
    os.mkdir('./jianli')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

for i in range(1, 6):
    # 1. Build the url (the first page has no numeric suffix)
    if i == 1:
        url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        url = f'http://sc.chinaz.com/jianli/free_{i}.html'
    # 2. Send the request
    response = requests.get(url=url, headers=headers)
    # 2.1 Set the encoding
    response.encoding = 'utf8'
    # 3. Get the response text
    page_text = response.text
    # 4. Instantiate the etree object and load the page source into it
    tree = etree.HTML(page_text)
    # 5. Locate the template links with xpath
    a_list = tree.xpath('//a[@class="title_wl"]')
    for a in a_list:
        jianli_name = a.xpath('./text()')[0]
        jianli_url = a.xpath('./@href')[0]
        print(jianli_name)
        print(jianli_url)
        print('----------------------------------------------')
        response2 = requests.get(url=jianli_url, headers=headers)
        response2.encoding = 'utf8'
        each_jianli_text = response2.text
        tree2 = etree.HTML(each_jianli_text)
        # All download mirrors for this resume template
        download_url_list = tree2.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        # Pick one mirror at random
        download_url = random.choice(download_url_list)
        # Fetch the binary data
        res = requests.get(url=download_url, headers=headers).content
        # Persist to disk
        filepath = './jianli/' + jianli_name + '.rar'
        with open(filepath, 'wb') as f:
            f.write(res)
        print(jianli_name, 'download complete!')
print('over!')
""" 解析全部城市名稱 https://www.aqistudy.cn/historydata/ """ import requests from lxml import etree url = 'https://www.aqistudy.cn/historydata/' headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36" } response = requests.get(url=url, headers=headers) response.encoding = 'utf8' page_text = response.text tree = etree.HTML(page_text) hot_city_list = tree.xpath('//div[@class="hot"]/div[2]/ul/li/a/text()') all_city_list = tree.xpath('//div[@class="all"]/div[2]/ul/div[2]/li/a/text()') # 能夠用管道符鏈接以上兩者 cityname_list = tree.xpath( '//div[@class="hot"]/div[2]/ul/li/a/text() | //div[@class="all"]/div[2]/ul/div[2]/li/a/text()') print('-----------------------------------------------------------') print(hot_city_list) print('***********************************************************') print(all_city_list) print('###########################################################') print(cityname_list)
""" 圖片懶加載概念: - 圖片懶加載是一種網頁優化技術.圖片做爲一種網絡資源, 在被請求時也與普通靜態資源同樣,將佔用網絡資源, 而一次性將整個頁面的全部圖片加載完, 將大大增長頁面的首屏加載時間.爲了解決這種問題,經過先後端配合, 使圖片僅在瀏覽器當前視窗內出現時才加載該圖片, 達到減小首屏圖片請求數的技術就被稱爲"圖片懶加載". 網站通常如何實現圖片懶加載技術呢? - 在網頁源碼中,在img標籤中首先會使用一個"僞屬性"(一般使用src2,original...) 去存放真正的圖片連接而並不是是直接存放在src屬性中.當圖片出現到頁面的可視化區域中, 會動態將僞屬性替換成src屬性,完成圖片的加載. """ import os import requests from urllib import request from lxml import etree if not os.path.exists('./images'): os.mkdir('./images') url = 'http://sc.chinaz.com/tupian/' headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36" } response = requests.get(url=url, headers=headers) response.encoding = 'utf8' page_text = response.text tree = etree.HTML(page_text) img_list = tree.xpath('//div[@class="box picblock col3"]/div/a/img') for img in img_list: img_name = img.xpath('./@alt')[0] img_url = img.xpath('./@src2')[0] file_path = './images/' + img_name + '.jpg' request.urlretrieve(img_url, file_path) print("下載完成!!!", img_name) print('over!') """ 站長素材案例後續分析: - 經過細緻觀察頁面的結構後發現,網頁中圖片的連接是存儲在了src2這個僞屬性中 """
ydmhttp.py
import json, time, requests


class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey,
                'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if response:
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if cid > 0:
            for i in range(0, timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if response:
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text


# This function returns the recognized captcha text
def getCodeData(username, password, filename, codetype, timeout):
    # Username of your yundama account
    username = username
    # Password
    password = password
    # Software ID, required for the developer revenue share; found under "My Software" in the developer console
    appid = 1234
    # Software key, required for the developer revenue share; found under "My Software" in the developer console
    appkey = 'xxx'
    # Image file
    filename = filename
    # Captcha type, e.g. 1004 means 4 alphanumeric characters. Pricing differs by type; fill it in
    # accurately or recognition accuracy suffers. Full list: http://www.yundama.com/price.html
    codetype = codetype
    # Timeout in seconds
    timeout = timeout
    # Sanity check
    if username == 'username':
        print('Please set the parameters before testing')
    else:
        # Initialize
        yundama = YDMHttp(username, password, appid, appkey)
        # Log in to yundama
        uid = yundama.login()
        print('uid: %s' % uid)
        # Check the balance
        balance = yundama.balance()
        print('balance: %s' % balance)
        # Start recognition: image path, captcha type ID, timeout (s) -> recognition result
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
        return result
""" cookie的處理: 1. 手動處理 - cookie封裝到headers 2. 自動處理 - (1)獲取一個session對象 - (2)使用session對象進行請求的發送 - (3)做用: 在使用session進行請求發送的過程當中若是產生了cookie, 則cookie會被自動存儲到session對象中. """ from ydmhttp import getCodeData # 識別人人網中的驗證碼圖片 import requests from urllib import request from lxml import etree headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36" } url = 'http://www.renren.com' page_text = requests.get(url=url, headers=headers).text tree = etree.HTML(page_text) code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0] if code_img_url: request.urlretrieve(url=code_img_url, filename='./code.jpg') # 識別驗證碼圖片中的數據值,2004表示4位純漢字,其餘類型代碼參考雲打碼幫助文檔 # 查看驗證碼類型: http://www.yundama.com/price.html code_data = getCodeData('username', 'password', './code.jpg', 2004, 30) print(code_data) # code_data爲識別結果 else: print('不須要識別驗證碼') code_data = '' # 指定登陸請求的url login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019241516668' data = { # 如下數據經過fiddler抓包工具抓取到網頁後便可拿到 "email": "xxx", # 你的email "icode": code_data, "origURL": "http://www.renren.com/home", "domain": "renren.com", "key_id": "1", "captcha_type": "web_login", "password": "xxx", # 你的password密文 "rkey": "xxx", "f": "http%3A%2F%2Fwww.renren.com%2F970153909" } # 建立session對象 session = requests.Session() # 使用session進行請求的發送: 獲取cookie,且將cookie保存到session中 session.post(url=login_url, data=data, headers=headers) # 指定我的主頁對應的頁面url url = 'http://www.renren.com/970153909/profile' # 攜帶session發送該請求,並獲取響應數據 page_text = session.get(url=url, headers=headers).text # 持久化存儲 with open('renren.html', 'w', encoding='utf8') as f: f.write(page_text)
""" cookie的處理: 1. 手動處理: - 把cookie封裝到headers中 2. 自動處理: - (1)獲取一個session對象 - (2)使用session對象進行請求的發送 - (3)做用: 在使用session進行請求發送的過程當中, 若是產生了cookie,cookie就會被自動存儲到session對象中. """ from ydmhttp import getCodeData # 識別人人網中的驗證碼圖片 from urllib import request from lxml import etree import requests, os, uuid # 建立資源文件存放目錄 if not os.path.exists('./sources'): os.mkdir('./sources') headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36" } # 實例化session對象 session = requests.Session() # 指定登錄頁面的url url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx' # 獲取登錄頁面的HTML文本 page_text = requests.get(url=url, headers=headers).text # 建立etree對象 tree = etree.HTML(page_text) # 獲取登錄頁面的識別驗證碼 code_img_url = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0] # 持久化存儲 驗證碼圖片 filepath = f'./sources/{uuid.uuid4()}' filename = filepath + '.jpg' # 注意:驗證碼圖片必須用攜帶session去獲取,不然獲取的驗證碼沒法該帳戶相匹配 img_data = session.get(url=code_img_url, headers=headers).content with open(filename, 'wb') as fp: fp.write(img_data) # 識別驗證碼圖片中的數據. 驗證碼類型查詢: http://www.yundama.com/price.html # 這裏應該填寫你雲打碼平臺 普通用戶的用戶名和密碼,而不是開發者用戶; 1004是驗證碼類型, 50是延遲時間 code_data = getCodeData('username', 'password', filename, 1004, 50) # 指定登陸請求的url login_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx' # 登陸該網站時須要在請求頭中加入動態參數 __VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0] __VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0] data = { "__VIEWSTATE": __VIEWSTATE, "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR, "from": "http://so.gushiwen.org/user/collect.aspx", "email": "xxx", # 你的email "pwd": "xxx", # 你的密碼 "code": code_data, "denglu": "登陸", } # 模擬登錄,拿到登陸後的首頁數據 index_text = session.post(url=login_url, data=data, headers=headers).content # 持久化存儲 filename2 = filepath + '.html' with open(filename2, 'wb') as f: f.write(index_text) print('下載成功!!!')
import requests, re, os
from lxml import etree
from uuid import uuid4
# Import the thread pool module
from multiprocessing.dummy import Pool

# Create a pool with 10 threads
pool = Pool(10)

"""
When to use a thread pool: for any time-consuming (blocking) operation.
"""

# Create the directory for downloaded resources
if not os.path.exists('./sources'):
    os.mkdir('./sources')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
video_url_list = []  # holds the links of all videos
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    print(detail_url)
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    # The video link lives inside a JS snippet, so xpath cannot reach it; use a regex instead
    video_url = re.findall('ldUrl="",srcUrl="(.*?)",vdoUrl=srcUrl', detail_page_text, re.S)[0]
    video_url_list.append(video_url)
print(video_url_list)


def getVideoData(url):
    video_data = requests.get(url=url, headers=headers).content
    return video_data


def saveVideoData(data):
    filename = f'./sources/{uuid4()}.mp4'
    with open(filename, 'wb') as f:
        f.write(data)
    print('Download complete!')


# Request each video link and collect the binary data;
# video_data_list holds the binary data of all videos
video_data_list = pool.map(getVideoData, video_url_list)
# Use the thread pool again to persist the video data
pool.map(saveVideoData, video_data_list)
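To make the "any time-consuming operation" claim concrete, here is a minimal self-contained sketch (simulating the slow download with time.sleep): ten one-second tasks finish in roughly one second on a 10-thread pool instead of ten seconds run serially.

import time
from multiprocessing.dummy import Pool


def slow_task(n):
    time.sleep(1)  # stand-in for a blocking download
    return n * n


pool = Pool(10)
start = time.time()
results = pool.map(slow_task, range(10))  # the ten tasks run concurrently
print(results, '%.1fs' % (time.time() - start))  # ~1s instead of ~10s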
Blog post: https://www.cnblogs.com/bobo-zhang/p/9685362.html
Chrome driver download: http://chromedriver.storage.googleapis.com/index.html
The driver you download must match your browser version; the version mapping table at http://blog.csdn.net/huilan_same/article/details/51896672 shows which driver goes with which browser.
PhantomJS download: https://pan.baidu.com/s/11KMIKitILGpVU33oxxzcJA  # access code: og8o
""" selenium: 可讓瀏覽器完成相關自動化的操做 環境安裝: - pip install selenium 編碼流程: - 導包 - 建立某一款瀏覽器對象 - 制定相關的行爲動做 """ from selenium import webdriver import time, os if not os.path.exists('./sources'): os.mkdir('./sources') # 配置瀏覽器 browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe') time.sleep(3) browser.get('https://www.baidu.com/') time.sleep(3) # find系列的函數能夠幫助咱們定位到相關的標籤 text_input = browser.find_element_by_id('kw') # 向文本框錄入一個關鍵字 text_input.send_keys('中國') time.sleep(3) btn = browser.find_element_by_id('su') btn.click() time.sleep(3) # 獲取當前瀏覽器顯示的頁面源碼數據(動態加載的數據) page_text = browser.page_source # 持久化存儲 with open('./sources/zhongguo.html', 'w', encoding='utf-8') as fp: fp.write(page_text) time.sleep(3) browser.quit()
from selenium import webdriver
import time, os

if not os.path.exists('./sources'):
    os.mkdir('./sources')

# Configure the browser
browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe')
browser.get('https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=')
time.sleep(3)
# Scroll to the bottom three times to trigger the dynamic loading
for _ in range(3):
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(3)
# Get the page source the browser is currently rendering
page_text = browser.page_source
# Persist to disk
with open('./sources/douban.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
time.sleep(3)
browser.quit()
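Scrolling a fixed three times is arbitrary. A common variant (a sketch, not part of the original script; it reuses the browser object created above) keeps scrolling until document.body.scrollHeight stops growing, i.e. until nothing more loads:

# Scroll until the page height stops growing, i.e. no more content loads.
last_height = browser.execute_script('return document.body.scrollHeight')
while True:
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(3)  # give the newly triggered content time to load
    new_height = browser.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    last_height = new_height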
from selenium import webdriver
import time, os

if not os.path.exists('./sources'):
    os.mkdir('./sources')

# Configure the PhantomJS (headless) browser
browser = webdriver.PhantomJS(
    executable_path=r'F:\phantomjs-2.1.1-windows\bin\phantomjs.exe')
browser.get('https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=')
time.sleep(3)
# Scroll to the bottom three times to trigger the dynamic loading
for _ in range(3):
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(3)
# Get the page source the browser is currently rendering
page_text = browser.page_source
# Persist to disk
with open('./sources/douban2.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
time.sleep(3)
browser.quit()
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep

# These configuration options are required for headless Chrome
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Pass chrome_options when creating the browser
bro = webdriver.Chrome(executable_path=r'C:\Users\chromedriver.exe', chrome_options=chrome_options)
sleep(3)
bro.get('https://www.baidu.com/')
sleep(3)
# The find_* family of methods locates tags on the page
text_input = bro.find_element_by_id('kw')
# Type a keyword into the text box
text_input.send_keys('中國')
sleep(3)
btn = bro.find_element_by_id('su')
btn.click()
sleep(3)
# Get the page source the browser is currently rendering (including dynamically loaded data)
page_text = bro.page_source
print(page_text)
bro.quit()
from selenium import webdriver
from lxml import etree
import time, os

if not os.path.exists('./sources'):
    os.mkdir('./sources')

# Configure the browser
browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe')
# Send the request
browser.get('https://qzone.qq.com/')
time.sleep(5)

"""
Web apps often nest pages inside frames. WebDriver can only locate elements
on one page at a time, so elements inside a nested frame cannot be located
directly. You first have to switch the driver's focus into the frame with
switch_to.frame().
"""
# Switch into the iframe with id="login_frame"
browser.switch_to.frame('login_frame')
# Click the tag with id="switcher_plogin"
browser.find_element_by_id('switcher_plogin').click()
time.sleep(1)
# Fill in the tag with id="u"
browser.find_element_by_id("u").send_keys("username")  # your username
# time.sleep(1)
# Fill in the tag with id="p"
browser.find_element_by_id("p").send_keys("password")  # your password
# time.sleep(3)
# Click the tag with id="login_button"
browser.find_element_by_id("login_button").click()
time.sleep(1)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(1)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(1)
# Get the full page source
page_text = browser.page_source
# Persist to disk
with open('./sources/QQzone.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

tree = etree.HTML(page_text)
li_list = tree.xpath('//*[@id="feed_friend_list"]/li')
for li in li_list:
    text_list = li.xpath('.//div[@class="f-info"]//text() | .//div[@class="f-info qz_info_cut"]//text()')
    text = ''.join(text_list)
    print(text + '\n\n\n')
browser.close()