A crawler is a program that simulates a browser to fetch data (HTML) from the internet.
It follows the same request flow a browser does.
The robots protocol constrains what data a crawler may fetch; it amounts to a verbal agreement and is not enforced (a checking sketch follows these definitions).
Anti-crawler: strategies and technical measures a site uses to prevent its data from being scraped.
Anti-anti-crawler: circumventing those anti-crawler measures.
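Since the robots protocol is only advisory, a compliant crawler checks it itself. A minimal sketch using the standard library's urllib.robotparser (the target site is only an illustration):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')
rp.read()  # fetch and parse the robots.txt file
# ask whether a given user agent may fetch a given path
print(rp.can_fetch('*', 'https://www.baidu.com/s?wd=test'))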
import urllib.request

url='https://www.baidu.com/'
response=urllib.request.urlopen(url=url)
page_text=response.read()  # response body as bytes
print(page_text)
with open('1.html','wb') as f:
    f.write(page_text)
print('written')
A URL must not contain non-ASCII characters, so Chinese keywords have to be percent-encoded first.
Character-encoding conversion:
import urllib.request
import urllib.parse

url='http://www.baidu.com/s?wd='
word=urllib.parse.quote('人民幣')  # percent-encode the Chinese keyword
url+=word  # append the encoded keyword to the url
response=urllib.request.urlopen(url=url)
page_text=response.read()
with open('2.html','wb') as f:
    f.write(page_text)
print('written')
User-Agent (UA): identifies the client that sends the request.
Anti-crawler: the site inspects the UA of incoming requests.
Anti-anti-crawler: disguise the crawler's UA as a real browser's.
import urllib.request

url='http://www.baidu.com/'
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}  # custom request headers
request=urllib.request.Request(url=url,headers=headers)
response=urllib.request.urlopen(request)
text=response.read()
with open('3.html','wb') as f:
    f.write(text)
Find the request in the XHR tab of the Network panel in the browser's developer tools.
import urllib.request
import urllib.parse

url='https://fanyi.baidu.com/sug'
data={
    'kw':'西瓜'
}  # wrap the POST parameters in a dict
data=urllib.parse.urlencode(data)  # url-encode into a string
data=data.encode()  # convert the string to bytes
response=urllib.request.urlopen(url=url,data=data)
ret=response.read()  # a JSON string
with open('4.html','wb') as f:
    f.write(ret)
import requests

url='https://www.sogou.com/'
response=requests.get(url=url)
page_data=response.text  # page data as a string
with open('sogou.html','w',encoding='utf-8') as f:
    f.write(page_data)
response.text  # page data as a string
response.content  # page data as bytes
response.status_code  # response status code
response.headers  # response header info
response.url  # the requested url
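A quick look at these attributes, as a minimal sketch (the URL is only an illustration):

import requests

response = requests.get('https://www.sogou.com/')
print(response.status_code)  # e.g. 200
print(response.url)  # final url after any redirects
print(response.headers['Content-Type'])  # one field from the response headers
print(len(response.content))  # body size in bytes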
Method 1: put the query string directly in the URL.
import requests

url='https://www.sogou.com/web?query=周杰倫&ie=utf8'
response=requests.get(url=url)
page_text=response.text
Method 2: pass the parameters separately via params.
import requests

url='https://www.sogou.com/web'
params={
    'query':'周杰倫',
    'ie':'utf8'
}
response=requests.get(url=url,params=params)
page_text=response.text
import requests

url='https://www.sogou.com/web'
params={
    'query':'周杰倫',
    'ie':'utf8'
}
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response=requests.get(url=url,params=params,headers=headers)
page_text=response.text
First locate the login URL in the developer tools and capture its form data.
import requests

url='https://www.douban.com/accounts/login'
data={
    'source': 'index_nav',
    'form_email': '18668573649@163.com',
    'form_password': 'k365532902',
}
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response=requests.post(url=url,data=data,headers=headers)
page_text=response.text
with open('douban.html','w',encoding='utf8') as f:
    f.write(page_text)
Find the AJAX URL and its parameters.
import requests

url='https://movie.douban.com/j/chart/top_list?'
params={
    'type': '17',
    'interval_id': '100:90',
    'action': '',
    'start': '60',
    'limit': '20',
}
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response=requests.get(url=url,params=params,headers=headers)
page_text=response.text
import requests

url='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
data={
    'cname': '',
    'pid': '',
    'keyword': '寧波',
    'pageIndex': '1',
    'pageSize': '10',
}
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response=requests.post(url=url,data=data,headers=headers)
page_text=response.text
Fetch a given range of result pages for a specified keyword.
import requests
import os

if not os.path.exists('pages'):
    os.mkdir('pages')
word=input('enter a word')  # keyword
url='https://zhihu.sogou.com/zhihu'
start_pagenum=int(input('enter a start pagenum'))  # first page number
end_pagenum=int(input('enter a end pagenum'))  # last page number
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
for page in range(start_pagenum,end_pagenum+1):
    params={
        'query':word,
        'pane':page,
        'ie':'utf-8'
    }
    response=requests.get(url=url,params=params,headers=headers)
    page_text = response.text
    file_name=word+str(page)+'.html'
    filepath='pages/'+file_name
    with open(filepath,'w',encoding='utf-8') as f:
        f.write(page_text)
A bare login request gets blocked by a captcha; use a session so the cookies from logging in carry over to subsequent requests.
import requests

session = requests.session()
login_url = 'https://www.douban.com/accounts/login'
data = {
    'source': 'None',
    'redir': 'https://www.douban.com/people/186654449/',
    'form_email': '18668573649@163.com',
    'form_password': 'k365532902',
    'captcha-solution': 'cough',
    'captcha-id': 'TONAWBuNAp3yeI8r67VCHiYx:en',
    'remember': 'on',
    'login': '登陸',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
login_response = session.post(url=login_url, data=data, headers=headers)  # the session stores the login cookies
url = 'https://www.douban.com/people/186654449/'
response = session.get(url=url, headers=headers)  # this request reuses those cookies
page_text = response.text
with open('5.html', 'w', encoding='utf8') as f:
    f.write(page_text)
import requests

url = 'http://www.baidu.com/s?word=ip'
proxy = {
    'http': '223.111.254.83:80',
}  # route http requests through this proxy
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response = requests.get(url=url, proxies=proxy, headers=headers)
with open('daili.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
The Yundama cloud captcha-solving platform:
import requests, json, time, re
from lxml import etree
import yan  # the pre-wrapped Yundama client module

def get_code(code_img):
    username = 'qych1988gw'
    # password
    password = 'k365532902'
    # software ID, a required developer parameter; obtained from "My Software" in the developer console
    appid = 6570
    # software key, a required developer parameter; obtained from "My Software" in the developer console
    appkey = 'b1237a57c579e506735ffffed31b675c'
    # image file
    filename = code_img
    # captcha type, e.g. 1004 means 4 alphanumeric characters; types are priced differently, so fill this in accurately or recognition suffers; see http://www.yundama.com/price.html
    codetype = 3000
    # timeout in seconds
    timeout = 20
    # sanity check
    if (username == 'username'):
        print('set the parameters before testing')
    else:
        # initialize
        yundama = yan.YDMHttp(username, password, appid, appkey)
        # log in to Yundama
        uid = yundama.login()
        print('uid: %s' % uid)
        # check the balance
        balance = yundama.balance()
        print('balance: %s' % balance)
        # recognize: image path, captcha type ID, timeout (s); returns the result
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
        return result

# fetch the login page and extract the captcha image
url = 'https://www.douban.com/accounts/login'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
code_image_url = tree.xpath('//*[@id="captcha_image"]/@src')[0]
# <img id="captcha_image" src="https://www.douban.com/misc/captcha?id=52yHi3MS1mj5PeSbamzYkPEp:en&size=s" alt="captcha" class="captcha_image">
code_img = requests.get(url=code_image_url, headers=headers).content
# extract the captcha id
c_id = re.findall('<img id="captcha_image".*?id=(.*?)&.*?>', page_text, re.S)[0]
with open('code_img.png', 'wb') as f:
    f.write(code_img)
# solve the captcha image
codetext = get_code('code_img.png')
post = 'https://www.douban.com/accounts/login'
data = {
    'source': 'None',
    'redir': 'https://www.douban.com/people/186654449/',
    'form_email': '18668573649@163.com',
    'form_password': 'k365532902',
    'captcha-solution': codetext,
    'captcha-id': c_id,
    'login': '登陸'
}
login_text=requests.post(url=post,data=data,headers=headers).text
with open('login.html','w',encoding='utf-8') as f:
    f.write(login_text)
Downloading images:
import requests, re, os

url = 'https://www.qiushibaike.com/pic/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
page_text = response.text
# pull the image urls out with a regex
img_list = re.findall('<div class="thumb">.*?<img src="(.*?)".*?>.*?</div>', page_text, re.S)
if not os.path.exists('imgs'):
    os.mkdir('imgs')
for i in img_list:
    img_url = 'https:' + i
    img_data = requests.get(url=img_url, headers=headers).content
    img_name=i.split('/')[-1]
    img_path='imgs/'+img_name
    with open(img_path, 'wb') as f:
        f.write(img_data)
from lxml import etree
Create an etree object to parse the specified data.
xpath() returns a list.
Attribute selection:
Find the div tag whose class attribute is "song": //div[@class="song"]
Hierarchy & index selection:
Find the a tag that is a direct child of the second li under the ul directly under the div with class "tang": //div[@class="tang"]/ul/li[2]/a
Logical operators:
Find a tags whose href attribute is empty and whose class attribute is "du": //a[@href="" and @class="du"]
Fuzzy matching:
//div[contains(@class,"ng")]
//div[starts-with(@class,"ta")]
Getting text:
/text() gets the text directly under a tag: //div[@class="song"]/p[1]/text()
//text() gets the text of all descendant tags: //div[@class="tang"]//text()
Getting attributes:
//div[@class="tang"]//li[2]/a/@href
from lxml import etree

tree=etree.parse('text.html')
tree.xpath('//div[@class="song"]')
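A self-contained sketch exercising several of these selectors on an inline HTML snippet (the markup below is invented for illustration):

from lxml import etree

html = '''
<html><body>
  <div class="song"><p>first</p><p>second</p></div>
  <div class="tang">
    <ul>
      <li><a href="http://a.example">one</a></li>
      <li><a href="http://b.example">two</a></li>
    </ul>
  </div>
</body></html>'''
tree = etree.HTML(html)
print(tree.xpath('//div[@class="song"]/p[1]/text()'))  # ['first']
print(tree.xpath('//div[@class="tang"]/ul/li[2]/a/@href'))  # ['http://b.example']
print(tree.xpath('//div[starts-with(@class,"ta")]//text()'))  # all text nodes under the tang div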
import requests
from lxml import etree

url = 'https://ishuo.cn/joke'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="list"]/ul/li')
for li in li_list:
    content = li.xpath('./div[@class="content"]/text()')[0]  # relative xpath from each li
    title = li.xpath('./div[@class="info"]/a/text()')[0]
    print(title,content)
BeautifulSoup is Python-specific and is simpler, more convenient, and efficient to use.
If the HTML document is local: BeautifulSoup(open('local_file.html'), 'lxml')
If the HTML comes from the network: BeautifulSoup(page_text_from_request, 'lxml')
from bs4 import BeautifulSoup

f = open('text.html')
soup = BeautifulSoup(f, 'lxml')
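And the network form, as a minimal sketch (the URL is only an illustration):

import requests
from bs4 import BeautifulSoup

page_text = requests.get('https://www.sogou.com/').text  # page data fetched over the network
soup = BeautifulSoup(page_text, 'lxml')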
屬性和方法:
Look up by tag name; only finds the first match:
soup.div
Get attributes:
soup.a.attrs  # all attributes, returned as a dict
soup.a.attrs['href'] or soup.a['href']  # a specific attribute
Get content:
soup.a.string  # like /text(): only the direct text
soup.a.text  # like //text(): all descendant text
soup.a.get_text()  # like //text()
Look up with find:
soup.find('a')
soup.find('a',title="")
soup.find('a',alt="")
soup.find('a',class_="")
有個下劃線
soup.find('a',id="")
Look up with find_all; returns a list:
soup.find_all('a')
soup.find_all(['a','div'])
soup.find_all('a',limit=2)  # only the first two matches
Look up by CSS selector with select; returns a list:
soup.select('#feng')
soup.select('div>img')
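A self-contained sketch exercising these lookups on an inline snippet (the markup below is invented for illustration):

from bs4 import BeautifulSoup

html = '''
<div>
  <a id="feng" href="http://a.example" title="qing">one</a>
  <a class="du" href="">two</a>
  <img src="x.png"/>
</div>'''
soup = BeautifulSoup(html, 'lxml')
print(soup.a['href'])  # href of the first a tag
print(soup.find('a', class_="du").text)  # 'two'
print(soup.find_all('a', limit=2))  # the first two a tags
print(soup.select('#feng'))  # list of tags with id="feng"
print(soup.select('div>img'))  # img tags that are direct children of a div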
from bs4 import BeautifulSoup
import requests

url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')  # links to the chapters

def get_content(content_url):
    content_page = requests.get(url=content_url, headers=headers).text
    soup = BeautifulSoup(content_page, 'lxml')
    div = soup.find('div', class_="chapter_content")
    return div.text

f=open('sanguo.txt','w',encoding='utf-8')
for a in a_list:
    title = a.string
    a_url = a['href']
    content_url = 'http://www.shicimingju.com' + a_url
    content = get_content(content_url)
    f.write(title+'\n\n'+content+'\n\n\n')
f.close()
Selenium handles scraping of data that pages load dynamically. Commonly used lookup methods:
find_element_by_id: find a node by id
find_element_by_name: find a node by name
find_element_by_xpath: find a node by xpath
find_element_by_tag_name: find a node by tag name
find_element_by_class_name: find a node by class name

from selenium import webdriver
import time

bro = webdriver.Chrome(executable_path='chromedriver')
bro.get('http://www.baidu.com')
time.sleep(1)
text=bro.find_element_by_id('kw')  # get the input box
text.send_keys('人民幣')  # type into the input box
time.sleep(1)
button=bro.find_element_by_id('su')
button.click()  # click the search button
time.sleep(3)
bro.quit()  # close the browser
PhantomJS is a headless browser; its automation flow is the same as Chrome's, and it can take screenshots.
from selenium import webdriver
import time

bro=webdriver.PhantomJS(executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe')
bro.get('http://www.baidu.com')
bro.save_screenshot('1.png')
time.sleep(1)
text=bro.find_element_by_id('kw')  # get the input box
text.send_keys('人民幣')  # type into the input box
bro.save_screenshot('2.png')
time.sleep(1)
button=bro.find_element_by_id('su')
button.click()  # click the search button
bro.save_screenshot('3.png')
time.sleep(3)
bro.save_screenshot('3.png')  # screenshot again after the results have loaded
bro.quit()  # close the browser
from selenium import webdriver
import time

bro = webdriver.PhantomJS(executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = 'https://movie.douban.com/typerank?type_name=%E7%A7%91%E5%B9%BB&type=17&interval_id=100:90&action='
bro.get(url)
time.sleep(1)
# have the browser execute js: scroll to the bottom of the page
js = 'window.scrollTo(0,document.body.scrollHeight)'
for i in range(5):
    bro.execute_script(js)
    time.sleep(2)
bro.save_screenshot('2.png')
# grab the page source after the data has loaded
page_text = bro.page_source
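From here page_text can be parsed like any statically fetched page. A sketch continuing from the block above (the xpath below is hypothetical and would need checking against the live markup):

from lxml import etree

tree = etree.HTML(page_text)
# hypothetical selector for the movie entries; verify against the actual page structure
titles = tree.xpath('//div[@class="movie-name"]//text()')
print(titles)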