res = requests.get(url, headers=headers) sends a request to the site and returns the response object.
Parameters
Response object (res) attributes
import requests

url = 'http://www.baidu.com/'   # crawl the Baidu homepage
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}

res = requests.get(url, headers=headers)
print(res.encoding)        # check the site's encoding: ISO-8859-1

# text attribute: response content as a string (page source)
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'
html = res.text

# content attribute: response content as bytes (page source)
res = requests.get(url, headers=headers)
html = res.content.decode('utf-8')

print(res.status_code)     # check the response status code: 200
print(res.url)             # check the requested URL: https://www.baidu.com/
Saving unstructured data
Compressed archives (zip), image files, and the like can all be saved as unstructured data.
with open('xxx.jpg', 'wb') as f:
    f.write(res.content)
Example: save a picture of Zhao Liying to the local disk
import requests

url = 'http://dingyue.nosdn.127.net/lL1JH2YdpAWrzEhfp8BrJ8lTHa1602AEX9E7qpTpH5NzW1535203788506compressflag.jpg'
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get(url, headers=headers).content

# save the image to the local disk
with open('趙麗穎.jpg', 'wb') as f:
    f.write(html)
Goal: scrape all images from a given Tieba forum
Approach
Tieba URL pattern: http://tieba.baidu.com/f?kw=??&pn=50
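The spider further down builds its page URLs from this pattern; a minimal sketch of that step (the forum name and page range here are example values):

from urllib import parse

name = '趙麗穎吧'                # example forum name
kw = parse.quote(name)           # URL-encode the forum name
for page in range(1, 4):         # example: pages 1-3
    pn = (page - 1) * 50         # each page lists 50 posts
    url = 'http://tieba.baidu.com/f?kw={}&pn={}'.format(kw, pn)
    print(url)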
XPath expressions
1. XPath for post links. Why select by the class attribute? Because elements of the same kind share the same class (they need the same styling).
//div[@class="t_con cleafix"]/div/div/div/a/@href
2. XPath for image links
//div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src
3. XPath for video links
//div[@class="video_src_wrapper"]/embed/@data-video
# Note: the front end rewrites the video links in the response, so check the raw page source instead (copy the HTML into an online formatter to read it).
Baidu Tieba's anti-scraping trick for videos (it rewrites the response content)
The page source is:
<div class="video_src_wrapper">
<embed data-video="http://tb-video.bdstatic.com/tieba-smallvideo-transcode-cae/2754153_8fcd225842344de9901c1489e49defbe_0_cae.mp4"
The code located with the F12 debugger is:
<div class="video_src_wrapper"> <div class="video_src_wrap_main"> <video src="http://tb-video.bdstatic.com/tie-cae/f2358e8_0_cae.mp4" "></video> </div> </div>
If you write the XPath against what F12 shows, you will scrape nothing, because requests fetches the raw page source; always base the XPath on the page source.
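To see why the source-based expression works, you can run the XPath from step 3 against the raw <embed> markup quoted above; a small illustrative check (the HTML string is just that snippet, trimmed to a well-formed fragment):

from lxml import etree

# the raw page source quoted above, trimmed to a well-formed fragment
raw_html = '''
<div class="video_src_wrapper">
    <embed data-video="http://tb-video.bdstatic.com/tieba-smallvideo-transcode-cae/2754153_8fcd225842344de9901c1489e49defbe_0_cae.mp4">
</div>
'''
parse_html = etree.HTML(raw_html)
video_links = parse_html.xpath('//div[@class="video_src_wrapper"]/embed/@data-video')
print(video_links)   # matches the .mp4 link; an XPath written against the F12 <video src> markup would not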
import requests
from lxml import etree
import random
import time
from urllib import parse


class BaiduImageSpider(object):
    def __init__(self):
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
        ]

    # fetch the page source
    def get_html(self, url):
        headers = {'User-Agent': random.choice(self.ua_list)}
        html = requests.get(url=url, headers=headers).content.decode('utf-8', 'ignore')
        return html

    # run an XPath expression against the HTML
    def xpath_func(self, html, xpath_bds):
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath(xpath_bds)
        return r_list

    # scrape the images from one forum page
    def parse_html(self, one_url):
        html = self.get_html(one_url)
        xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        r_list = self.xpath_func(html, xpath_bds)   # extracted post links, e.g. ['/p/32323', '', '']
        for r in r_list:
            t_url = 'http://tieba.baidu.com' + r    # build the full post URL
            self.get_image(t_url)                   # save every image in the post to the local disk
            time.sleep(random.uniform(0, 2))        # sleep 0-2 seconds after finishing one post

    # given one post URL, save every image in the post to the local disk
    def get_image(self, t_url):
        html = self.get_html(t_url)
        # use XPath's "or" (|): image links + video links
        xpath_bds = '//div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src | //div[@class="video_src_wrapper"]/embed/@data-video'
        img_list = self.xpath_func(html, xpath_bds)   # e.g. ['http://xxx.jpg', '']
        print(img_list)
        for img in img_list:
            html_bytes = requests.get(url=img, headers={'User-Agent': random.choice(self.ua_list)}).content
            self.save_img(html_bytes, img)

    # save one image
    def save_img(self, html_bytes, img):
        filename = img[-10:]
        with open(filename, 'wb') as f:
            f.write(html_bytes)
            print('%s downloaded successfully' % filename)

    # main function
    def main(self):
        name = input('Tieba forum name: ')
        begin = int(input('Start page: '))
        end = int(input('End page: '))
        # URL-encode the forum name
        kw = parse.quote(name)
        for page in range(begin, end + 1):
            pn = (page - 1) * 50
            url = self.url.format(kw, pn)
            # run the main pipeline for this page
            self.parse_html(url)


if __name__ == '__main__':
    spider = BaiduImageSpider()
    spider.main()
res = requests.get(url, params=params, headers=headers)
url is the base URL without the query string; this method URL-encodes the params dict automatically and appends it to the url.
Parameter type: dict; its key-value pairs are sent as the query parameters.
import requests

baseurl = 'http://tieba.baidu.com/f?'
params = {
    'kw': '趙麗穎吧',
    'pn': '50'
}
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

# params is URL-encoded automatically, joined onto the URL, and the request is sent
res = requests.get(baseurl, headers=headers, params=params)
res.encoding = 'utf-8'
print(res.text)
res = requests.get(url, headers=headers, auth=('username','password'))
For sites that require web-client username/password authentication, pass auth=('username', 'password').
import requests
from lxml import etree
import random
import os


class CodeSpider(object):
    def __init__(self):
        self.url = 'http://code.tarena.com.cn/AIDCode/aid1904/14-redis/'
        self.auth = ('tarenacode', 'code_2013')
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
        ]

    def parse_html(self):
        # fetch the response content
        html = requests.get(url=self.url, headers={'User-Agent': random.choice(self.ua_list)}, auth=self.auth)
        html = html.content.decode('utf-8', 'ignore')
        # parse it
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//a/@href')
        # r_list: ['../', 'day01', 'day02', 'redis_day01.zip', ...]
        for r in r_list:
            if r.endswith('.zip') or r.endswith('.rar'):
                self.save_files(r)

    def save_files(self, r):
        directory = '/home/tarena/AID/redis/'
        if not os.path.exists(directory):
            os.makedirs(directory)
        # build the full URL so the zip file can be saved to the target directory
        url = self.url + r
        # filename: /home/tarena/AID/redis/xxx.zip
        filename = directory + r
        html = requests.get(url=url, headers={'User-Agent': random.choice(self.ua_list)}, auth=self.auth).content
        with open(filename, 'wb') as f:
            f.write(html)
            print('%s downloaded successfully' % r)


if __name__ == '__main__':
    spider = CodeSpider()
    spider.parse_html()
response = requests.get(url=url, params=params, headers=headers, verify=False)
The SSL certificate parameter is for HTTPS sites whose certificates were not issued by a trusted certificate authority. Such sites usually raise an SSLError, in which case consider using this parameter.
verify: True (default) verifies the certificate; False (commonly used) skips certificate verification.
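A minimal sketch of using verify=False (the URL below is only a placeholder for a site with an untrusted certificate); skipping verification triggers urllib3's InsecureRequestWarning, which can be silenced:

import requests
import urllib3

# silence the InsecureRequestWarning that verify=False produces
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = 'https://self-signed.example/'   # placeholder: a site with a self-signed certificate
headers = {'User-Agent': 'Mozilla/5.0'}
res = requests.get(url, headers=headers, verify=False, timeout=5)
print(res.status_code)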
Definition: an IP address that talks to the network on your behalf, in place of your own. It hides your real IP and helps you avoid getting banned.
Sites that provide proxy IPs: Xici Proxy, Kuaidaili, Quanwang Proxy, Daili Jingling, ...
Syntax
proxies = {'protocol': 'protocol://IP:port'}

# http and https use the same proxy
proxies = {
    'http': 'http://59.172.27.6:38380',
    'https': 'https://59.172.27.6:38380'
}
Use a free ordinary proxy IP to access the test site: http://httpbin.org/get
import requests

url = 'http://httpbin.org/get'
headers = {'User-Agent': 'Mozilla/5.0'}

# define the proxy; free proxy IPs can be found on the proxy sites listed above
proxies = {
    'http': 'http://309435365:szayclhp@43.226.164.156:16818',
    'https': 'https://309435365:szayclhp@43.226.164.156:16818'}

html = requests.get(url, proxies=proxies, headers=headers, timeout=5).text
print(html)
Scrape IPs from Xici Proxy, test each one to see whether it works, and build your own proxy IP pool that can be refreshed at any time for scraping.
import requests
from lxml import etree
import time
import random
from fake_useragent import UserAgent


class GetProxyIP(object):
    def __init__(self):
        self.url = 'https://www.xicidaili.com/nn/{}'   # {} is the page number
        self.proxies = {
            'http': 'http://163.204.247.219:9999',
            'https': 'http://163.204.247.219:9999'}

    # generate a random User-Agent
    def get_random_ua(self):
        ua = UserAgent()        # create a UserAgent object
        useragent = ua.random
        return useragent

    # scrape proxy IPs from one page of the Xici proxy site
    def get_ip_file(self, url):
        headers = {'User-Agent': self.get_random_ua()}
        # request the domestic high-anonymity page and collect all tr nodes
        html = requests.get(url=url, proxies=self.proxies, headers=headers, timeout=5).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        # base XPath: one node per proxy IP row
        tr_list = parse_html.xpath('//tr')
        for tr in tr_list[1:]:
            ip = tr.xpath('./td[2]/text()')[0]
            port = tr.xpath('./td[3]/text()')[0]
            # test whether ip:port works
            self.test_proxy_ip(ip, port)

    # test whether a scraped proxy IP works
    def test_proxy_ip(self, ip, port):
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port),
        }
        test_url = 'http://www.baidu.com/'
        try:
            res = requests.get(url=test_url, proxies=proxies, timeout=8)
            if res.status_code == 200:
                print(ip, ":", port, 'Success')
                with open('proxies.txt', 'a') as f:
                    f.write(ip + ':' + port + '\n')
        except Exception as e:
            print(ip, port, 'Failed')

    # main function
    def main(self):
        for i in range(1, 1001):
            url = self.url.format(i)
            self.get_ip_file(url)
            time.sleep(random.randint(5, 10))


if __name__ == '__main__':
    spider = GetProxyIP()
    spider.main()
Write a crawler that picks a random proxy IP from the file
import random
import requests


class BaiduSpider(object):
    def __init__(self):
        self.url = 'http://www.baidu.com/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.flag = 1    # retry counter

    def get_proxies(self):
        with open('proxies.txt', 'r') as f:
            result = f.readlines()                 # read all lines into a list
        proxy_ip = random.choice(result)[:-1]      # pick one proxy IP and strip the trailing newline
        L = proxy_ip.split(':')
        proxy_ip = {
            'http': 'http://{}:{}'.format(L[0], L[1]),
            'https': 'https://{}:{}'.format(L[0], L[1])
        }
        return proxy_ip

    def get_html(self):
        proxies = self.get_proxies()
        if self.flag <= 3:
            try:
                html = requests.get(url=self.url, proxies=proxies, headers=self.headers, timeout=5).text
                print(html)
            except Exception as e:
                print('Retry')
                self.flag += 1
                self.get_html()


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.get_html()
Write an interface that fetches open proxies from a paid API
# interface for fetching open proxies
import requests
from fake_useragent import UserAgent

ua = UserAgent()        # create a UserAgent object
useragent = ua.random
headers = {'User-Agent': useragent}


def ip_test(ip):
    url = 'http://www.baidu.com/'
    ip_port = ip.split(':')
    proxies = {
        'http': 'http://{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'https://{}:{}'.format(ip_port[0], ip_port[1]),
    }
    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    if res.status_code == 200:
        return True
    else:
        return False


# fetch and filter the proxy IPs
def get_ip_list():
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dev.kdlapi.com/api/getproxy/?orderid=946562662041898&num=100&protocol=1&method=2&an_an=1&an_ha=1&sep=2'
    html = requests.get(api_url).content.decode('utf-8', 'ignore')
    ip_port_list = html.split('\n')
    for ip in ip_port_list:
        with open('proxy_ip.txt', 'a') as f:
            if ip_test(ip):
                f.write(ip + '\n')


if __name__ == '__main__':
    get_ip_list()
1. Syntax
proxies = {'protocol': 'protocol://username:password@IP:port'}

proxies = {
    'http': 'http://username:password@IP:port',
    'https': 'https://username:password@IP:port'
}

proxies = {
    'http': 'http://309435365:szayclhp@106.75.71.140:16816',
    'https': 'https://309435365:szayclhp@106.75.71.140:16816',
}
The username and password are provided to you along with the API_URL; they are not your account name and account password.
# interface for fetching private (username/password) proxies
import requests
from fake_useragent import UserAgent

ua = UserAgent()        # create a UserAgent object
useragent = ua.random
headers = {'User-Agent': useragent}


def ip_test(ip):
    url = 'https://blog.csdn.net/qq_34218078/article/details/90901602/'
    ip_port = ip.split(':')
    proxies = {
        'http': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
    }
    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    if res.status_code == 200:
        print("OK")
        return True
    else:
        print(res.status_code)
        print("Error")
        return False


# fetch and filter the proxy IPs
def get_ip_list():
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dps.kdlapi.com/api/getdps/?orderid=986603271748760&num=1000&signature=z4a5b2rpt062iejd6h7wvox16si0f7ct&pt=1&sep=2'
    html = requests.get(api_url).content.decode('utf-8', 'ignore')
    ip_port_list = html.split('\n')
    for ip in ip_port_list:
        with open('proxy_ip.txt', 'a') as f:
            if ip_test(ip):
                f.write(ip + '\n')


if __name__ == '__main__':
    get_ip_list()