Requests is an HTTP client library for Python.

Requests supports HTTP keep-alive and connection pooling, cookie-based session persistence, file uploads, automatic decoding of response content, and automatic encoding of internationalized URLs and POST data.

It is a high-level wrapper around Python's built-in modules that makes issuing network requests from Python far more pleasant; with Requests you can easily do anything a browser can. Modern, international, friendly.

Requests maintains persistent connections (keep-alive) automatically.
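A quick sketch of what that means in practice: requests issued through a single `Session` reuse the same underlying TCP connection instead of opening a new one each time (httpbin.org is used here purely as a test endpoint):

```python
import requests

# A Session reuses the underlying TCP connection (keep-alive) and keeps a
# per-host connection pool, so repeated requests skip new TCP handshakes.
session = requests.Session()
for _ in range(3):
    response = session.get("http://httpbin.org/get")
    print(response.status_code)
```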
Source code: https://github.com/kennethreitz/requests
Chinese documentation: http://docs.python-requests.org/zh_CN/latest/index.html
Contents
1. Requests basics
2. Sending requests and receiving responses (basic GET)
3. Sending requests and receiving responses (basic POST)
4. Response attributes
5. Proxies
6. Cookies and sessions
7. Case studies
pip install requests
```python
import requests

response = requests.get(url)
```
response = requests.get("http://httpbin.org/get?name=zhangsan&age=22") print(response.text)
```python
data = {"name": "zhangsan", "age": 30}
response = requests.get("http://httpbin.org/get", params=data)
print(response.text)
```
```python
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}
response = requests.get("http://httpbin.org/get", headers=headers)
print(response.text)
```
```python
response = requests.post(url, data=data, headers=headers)
```
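As a concrete, runnable variant (httpbin.org simply echoes back what it receives, so the posted fields appear under the "form" key):

```python
import requests

data = {"name": "zhangsan", "age": 30}
response = requests.post("http://httpbin.org/post", data=data)
print(response.text)  # the submitted fields are echoed under "form"
```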
Attribute | Description
---|---
response.text | the response body as a str (decoded to Unicode)
response.content | the response body as bytes
response.status_code | the HTTP status code of the response
response.headers | the response headers
response.request | the request that produced this response
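A short example that exercises each attribute (httpbin.org again serves as the test server):

```python
import requests

response = requests.get("http://httpbin.org/get")
print(response.status_code)              # e.g. 200
print(response.headers["Content-Type"])  # headers behave as a case-insensitive dict
print(response.text[:100])               # body as str
print(response.content[:100])            # body as bytes
print(response.request.method)           # the request that produced this response
```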
```python
proxies = {
    "http": "https://175.44.148.176:9000",
    "https": "https://183.129.207.86:14002"
}
response = requests.get("https://www.baidu.com/", proxies=proxies)
```
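If a proxy requires authentication, Requests accepts credentials embedded in the proxy URL; the hosts, ports, and credentials below are placeholders, not working proxies:

```python
import requests

proxies = {
    "http": "http://user:password@10.10.1.10:3128",   # placeholder proxy
    "https": "http://user:password@10.10.1.10:1080",  # placeholder proxy
}
response = requests.get("http://httpbin.org/ip", proxies=proxies)
```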
1. Avoid using cookies whenever they are not needed.
2. To fetch pages that require a login, we must send requests that carry cookies; in that case, to keep the account safe, throttle the collection rate as much as possible.
```python
response.cookies
```
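When a request does need cookies, they can be passed explicitly as a dict; the cookie name and value here are made up for illustration:

```python
import requests

# httpbin echoes back whatever cookies the request carried
response = requests.get("http://httpbin.org/cookies", cookies={"sessionid": "abc123"})
print(response.text)
print(response.cookies)  # cookies the server set on this response (empty here)
```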
```python
session = requests.session()
```
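A session stores cookies set by the server and sends them back on later requests automatically, which is what makes the login example below work. A minimal demonstration against httpbin:

```python
import requests

session = requests.session()
session.get("http://httpbin.org/cookies/set/token/42")  # the server sets a cookie
response = session.get("http://httpbin.org/cookies")    # the cookie is sent back automatically
print(response.text)  # {"cookies": {"token": "42"}}
```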
Example:
```python
import requests


def login_renren():
    login_url = 'http://www.renren.com/SysHome.do'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    session = requests.session()
    login_data = {
        "email": "account",      # fill in your account
        "password": "password"   # fill in your password
    }
    response = session.post(login_url, data=login_data, headers=headers)
    # The session now carries the login cookies, so this page is accessible.
    response = session.get("http://www.renren.com/971909762/newsfeed/photo")
    print(response.text)

login_renren()
```
```python
import requests
import sys


class BaiduTieBa:
    def __init__(self, name, pn):
        self.name = name
        # Base URL; the page offset is appended below.
        self.url = "http://tieba.baidu.com/f?kw={}&ie=utf-8&pn=".format(name)
        self.headers = {
            # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
            # Use an old User-Agent: that browser does not support JS,
            # so the server returns a simpler, script-free page.
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)"
        }
        # Each page shows 50 posts, so the offset advances in steps of 50.
        self.url_list = [self.url + str(i * 50) for i in range(pn)]
        print(self.url_list)

    def get_data(self, url):
        """Fetch one page and return its body as bytes."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def save_data(self, data, num):
        """Save one page to ./pages/<name>_<num>.html."""
        file_name = "./pages/" + self.name + "_" + str(num) + ".html"
        with open(file_name, "wb") as f:
            f.write(data)

    def run(self):
        for num, url in enumerate(self.url_list):
            data = self.get_data(url)
            self.save_data(data, num)


if __name__ == "__main__":
    name = sys.argv[1]
    pn = int(sys.argv[2])
    baidu = BaiduTieBa(name, pn)
    baidu.run()
```
```python
import requests
import sys
import json


class JinshanCiBa:
    def __init__(self, words):
        self.url = "http://fy.iciba.com/ajax.php?a=fy"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.post_data = {
            "f": "auto",
            "t": "auto",
            "w": words
        }

    def get_data(self):
        """POST the word to the translation API and return the raw response text."""
        response = requests.post(self.url, data=self.post_data, headers=self.headers)
        return response.text

    def show_translation(self):
        """Parse the JSON response and print the translation."""
        response = self.get_data()
        # json.loads no longer accepts an `encoding` argument in Python 3.
        json_data = json.loads(response)
        if json_data['status'] == 0:
            translation = json_data['content']['word_mean']
        elif json_data['status'] == 1:
            translation = json_data['content']['out']
        else:
            translation = None
        print(translation)

    def run(self):
        self.show_translation()


if __name__ == "__main__":
    words = sys.argv[1]
    ciba = JinshanCiBa(words)
    ciba.run()
```
Extract image URLs from the pages downloaded earlier and crawl the images (see Case 1 for how the pages were downloaded).
```python
from lxml import etree
import requests


class DownloadPhoto:
    def __init__(self):
        pass

    def download_img(self, url):
        response = requests.get(url)
        # Take everything after the last '/' as the file name.
        index = url.rfind('/')
        file_name = url[index + 1:]
        print("Downloading image: " + file_name)
        save_name = "./photo/" + file_name
        with open(save_name, "wb") as f:
            f.write(response.content)

    def parse_photo_url(self, page):
        html = etree.parse(page, etree.HTMLParser())
        nodes = html.xpath("//a[contains(@class, 'thumbnail')]/img/@bpic")
        print(nodes)
        print(len(nodes))
        for node in nodes:
            self.download_img(node)


if __name__ == "__main__":
    download = DownloadPhoto()
    for i in range(6000):
        download.parse_photo_url("./pages/校花_{}.html".format(i))
```
main.py
```python
import requests
from lxml import etree

from file_download import DownLoadExecutioner, file_download


class XiaoHua:
    def __init__(self, init_url):
        self.init_url = init_url
        self.download_executioner = DownLoadExecutioner()

    def start(self):
        self.download_executioner.start()
        self.download_img(self.init_url)

    def download_img(self, url):
        html_text = file_download(url, type='text')
        html = etree.HTML(html_text)
        img_urls = html.xpath("//a[contains(@class,'thumbnail')]/img/@bpic")
        self.download_executioner.put_task(img_urls)

        # Get the link to the next page; stop when there is none.
        next_page = html.xpath("//div[@id='frs_list_pager']/a[contains(@class,'next')]/@href")
        if not next_page:
            return
        next_page = "http:" + next_page[0]
        self.download_img(next_page)


if __name__ == '__main__':
    x = XiaoHua("http://tieba.baidu.com/f?kw=校花&ie=utf-8")
    x.start()
```
file_download.py
```python
import requests
import threading
from queue import Queue


def file_download(url, type='content'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    r = requests.get(url, headers=headers)
    if type == 'text':
        return r.text
    return r.content


class DownLoadExecutioner(threading.Thread):
    def __init__(self):
        super().__init__()
        self.q = Queue(maxsize=50)
        # Directory where images are saved
        self.save_dir = './img/'
        # Image counter
        self.index = 0

    def put_task(self, urls):
        if isinstance(urls, list):
            for url in urls:
                self.q.put(url)
        else:
            self.q.put(urls)

    def run(self):
        while True:
            url = self.q.get()
            content = file_download(url)
            # Take everything after the last '/' as the file name.
            index = url.rfind('/')
            file_name = url[index + 1:]
            save_name = self.save_dir + file_name
            with open(save_name, 'wb+') as f:
                f.write(content)
            self.index += 1
            print(save_name + " downloaded. Total images so far: " + str(self.index))
```
main.py
```python
import requests
from lxml import etree

from file_download_pool import DownLoadExecutionerPool, file_download


class XiaoHua:
    def __init__(self, init_url):
        self.init_url = init_url
        self.download_executioner = DownLoadExecutionerPool()

    def start(self):
        self.download_img(self.init_url)

    def download_img(self, url):
        html_text = file_download(url, type='text')
        html = etree.HTML(html_text)
        img_urls = html.xpath("//a[contains(@class,'thumbnail')]/img/@bpic")
        self.download_executioner.put_task(img_urls)

        # Get the link to the next page; stop when there is none.
        next_page = html.xpath("//div[@id='frs_list_pager']/a[contains(@class,'next')]/@href")
        if not next_page:
            return
        next_page = "http:" + next_page[0]
        self.download_img(next_page)


if __name__ == '__main__':
    x = XiaoHua("http://tieba.baidu.com/f?kw=校花&ie=utf-8")
    x.start()
```
file_download_pool.py
```python
import requests
import concurrent.futures as futures


def file_download(url, type='content'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    r = requests.get(url, headers=headers)
    if type == 'text':
        return r.text
    return r.content


class DownLoadExecutionerPool:
    def __init__(self):
        super().__init__()
        # Directory where images are saved
        self.save_dir = './img_pool/'
        # Image counter (incremented from worker threads; the count is
        # informational only and not strictly thread-safe)
        self.index = 0
        # Thread pool
        self.ex = futures.ThreadPoolExecutor(max_workers=30)

    def put_task(self, urls):
        if isinstance(urls, list):
            for url in urls:
                self.ex.submit(self.save_img, url)
        else:
            self.ex.submit(self.save_img, urls)

    def save_img(self, url):
        content = file_download(url)
        # Take everything after the last '/' as the file name.
        index = url.rfind('/')
        file_name = url[index + 1:]
        save_name = self.save_dir + file_name
        with open(save_name, 'wb+') as f:
            f.write(content)
        self.index += 1
        print(save_name + " downloaded. Total images so far: " + str(self.index))
```
Author: Recalcitrant
Link: https://www.jianshu.com/p/140012f88f8e