Mimic the behavior of a browser to crawl information from web pages.
Example without parameters:

import requests

ret = requests.get('https://github.com/timeline.json')
print(ret.text)

Example with parameters:

import requests

ret = requests.get("http://httpbin.org/get", params={'key1': 'value1', 'key2': 'value2'})
print(ret.text)
import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'v1': 'k1'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)

# All of the methods above are built on top of this one
requests.request(method, url, **kwargs)
def param_method_url():
    ret = requests.request(method='get', url='http://127.0.0.1:8000/test/')
    ret = requests.request(method='post', url='http://127.0.0.1:8000/test/')
import requests

requests.get(url='http://127.0.0.1:8000/test/', params={'k1': 'v1', 'k2': 'v2'})
# This is essentially the same as requests.get(url='xxxxx?k1=v1&k2=v2')
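A quick way to confirm that equivalence is to look at the URL requests actually built. A minimal sketch (it uses the public httpbin.org echo service purely for illustration; any reachable URL would do):

import requests

ret = requests.get('http://httpbin.org/get', params={'k1': 'v1', 'k2': 'v2'})
# requests encodes the params dict into the query string for you
print(ret.url)  # http://httpbin.org/get?k1=v1&k2=v2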
# data can be a dict
# data can be a string
# data can be bytes
# data can be a file object

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data={'k1': 'v1', 'k2': '水電費'})

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1; k2=v2; k3=v3; k3=v4"
#                  )

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1;k2=v2;k3=v3;k3=v4",
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
#                  )

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data=open('data_file.py', mode='r', encoding='utf-8'),  # file content: k1=v1;k2=v2;k3=v3;k3=v4
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
#                  )
# If the request body should be a JSON payload, pass it via the json parameter
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水電費'})
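To see the practical difference between data= and json=, the sketch below posts the same dict both ways and inspects what is echoed back (httpbin.org is an assumption here, used only because it reflects the request body):

import requests

# data= sends an application/x-www-form-urlencoded body
r1 = requests.post('http://httpbin.org/post', data={'k1': 'v1'})
print(r1.json()['form'])   # {'k1': 'v1'}

# json= serializes the dict and sets Content-Type: application/json
r2 = requests.post('http://httpbin.org/post', json={'k1': 'v1'})
print(r2.json()['json'])   # {'k1': 'v1'}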
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
)
ret1_cookies = ret1.cookies.get_dict()
# ret1.cookies is the cookies object returned when visiting the URL
# get_dict() converts it into a plain dict of cookies
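If you would rather not carry the cookie dict around by hand, requests.Session keeps cookies across requests automatically. A minimal sketch along the lines of the example above (the shortened User-Agent is just a placeholder):

import requests

session = requests.Session()
# any Set-Cookie headers in the response are stored on the session
session.get(
    url='https://dig.chouti.com/',
    headers={'User-Agent': 'Mozilla/5.0'}
)
# later requests made through the same session send those cookies automatically
print(session.cookies.get_dict())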
headers
# Send request headers to the server
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水電費'},
                 headers={'Content-Type': 'application/x-www-form-urlencoded'}
                 )
# Which request headers are required depends on the server
files
# Send a file
# file_dict = {
#     'f1': open('readme', 'rb')
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename
# file_dict = {
#     'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename and inline content
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom filename, inline content, content type and extra headers
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

pass
Set a timeout: if the request takes longer than the timeout, stop waiting.

# ret = requests.get('http://google.com/', timeout=1)
# print(ret)

# ret = requests.get('http://google.com/', timeout=(5, 1))
# print(ret)

pass
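For reference, a single number is used as both the connect and read timeout, while a tuple is (connect timeout, read timeout); when either is exceeded, requests raises requests.exceptions.Timeout. A small sketch of how you might catch it (google.com is just the placeholder host from the example above):

import requests

try:
    # up to 5 seconds to connect, then up to 1 second to read the response
    ret = requests.get('http://google.com/', timeout=(5, 1))
    print(ret.status_code)
except requests.exceptions.Timeout:
    print('request timed out')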
# Whether to follow redirects; the default is True
ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.text)
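With allow_redirects=False, requests hands back the 3xx response itself instead of following it, so you can inspect the Location header yourself. A minimal sketch (httpbin.org/redirect/1 is used only because it is guaranteed to redirect; the local test URL above may or may not):

import requests

ret = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
print(ret.status_code)           # 302
print(ret.headers['Location'])   # where the server wanted to send us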
This module parses the HTML or XML it receives and lets you quickly find the tags you want by working with objects.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    ...
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
# tag = soup.find('a')
# name = tag.name  # get
# print(name)
# tag.name = 'span'  # set
# tag = soup.find('a')
# attrs = tag.attrs  # get
# print(attrs)
# tag.attrs = {'ik': 123}  # set
# tag.attrs['id'] = 'iiiii'  # set
# body = soup.find('body')
# v = body.children
# body = soup.find('body')
# v = body.descendants
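The difference between the two: children yields only a tag's direct children, while descendants walks every nested node, including text nodes. A throwaway sketch to illustrate, independent of the html_doc above:

from bs4 import BeautifulSoup

demo = BeautifulSoup("<body><div><a>link</a></div></body>", 'html.parser')
body = demo.find('body')

# children: only the direct children of <body>
print([child.name for child in body.children])    # ['div']

# descendants: every nested node, however deep (text nodes have name None)
print([node.name for node in body.descendants])   # ['div', 'a', None]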
# tag = soup.find('body')
# tag.clear()
# print(soup)
# body = soup.find('body')
# v = body.extract()
# print(soup)
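The practical difference: clear() keeps the tag in the tree but empties its contents, while extract() detaches the tag from the tree and returns it. A throwaway sketch:

from bs4 import BeautifulSoup

demo = BeautifulSoup("<div><p>hello</p></div>", 'html.parser')
removed = demo.find('p').extract()   # <p> is detached and returned
print(demo)                          # <div></div>
print(removed)                       # <p>hello</p>

demo2 = BeautifulSoup("<div><p>hello</p></div>", 'html.parser')
demo2.find('p').clear()              # <p> stays, but its contents are removed
print(demo2)                         # <div><p></p></div>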
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)

# ####### list of values #######
# v = soup.find_all(name=['a', 'div'])
# print(v)
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
import requests
from bs4 import BeautifulSoup  # this module parses HTML

# Download the page
ret = requests.get(url='https://www.autohome.com.cn/news/')
# print(ret.apparent_encoding)  # detect the page's encoding
# print(ret.content)
# ret.encoding = 'gbk'
ret.encoding = ret.apparent_encoding
# print(ret.text)

# Parse the page and pull out the content we want
soup = BeautifulSoup(ret.text, features='html.parser')  # in production, lxml is common (must be installed separately)

# find returns the first successful match
div = soup.find(name='div', id='auto-channel-lazyload-article')
# When matching with a class as well:
# div = soup.find(name='div', attrs={'class': 'dddd', 'id': 'dfa'})

li_list = div.find_all(name='li')  # find_all returns a list, so you cannot call .find on it
# print(li_list)

for row in li_list:
    h3 = row.find(name='h3')
    if not h3:
        continue
    a = row.find(name='a')
    print(a.get('href'))
    p = row.find(name='p')
    print(p.text)
    li_img = row.find(name='img')
    src = li_img.get('src')
    file_name = src.rsplit('__', maxsplit=1)[1]
    ret_img = requests.get('https:' + src)
    with open(file_name, 'wb') as f:
        f.write(ret_img.content)
import requests
from bs4 import BeautifulSoup

# The first visit returns an unauthorized cookie
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    })
ret1_cookies = ret1.cookies.get_dict()

# After a successful login, that cookie becomes authorized
ret = requests.post(
    url='https://dig.chouti.com/login',
    data={
        'phone': '8613612201458',
        'password': 'wo3384451',
        'oneMonth': '1'
    },
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    },
    cookies=ret1_cookies,
)

for num_page in range(2, 10):
    ret_index = requests.get(
        url='https://dig.chouti.com/all/hot/recent/%s' % (num_page),
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        },
    )
    soup = BeautifulSoup(ret_index.text, 'html.parser')
    div = soup.find(name='div', id='content-list')
    item_list = div.find_all(attrs={'class': 'part2'})
    for item in item_list:
        num = item.get('share-linkid')
        # Upvote while carrying the now-authorized cookie
        ret3 = requests.post(
            url='https://dig.chouti.com/link/vote?linksId=%s' % (num),
            # data={'linksId': '%s' % (num)},
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
            },
            cookies=ret1_cookies
        )
        print(ret3.text)
import requests
import re
from bs4 import BeautifulSoup


class Github(object):
    def __init__(self, username=None, password=None):
        self.username = username
        self.password = password
        self.all_cookies = {}
        self.process()

    def process(self):
        if not (self.username and self.password):
            raise Exception('Please provide a username and password')
        self.get_login_key()

    def get_login_key(self):
        # Fetch the authenticity_token from the login page
        login_result = requests.get(
            url='https://github.com/login',
            headers={
                'Host': 'github.com',
            }
        )
        auth_key = BS4xpath.get_auth_key(login_result.text)
        self.all_cookies = login_result.cookies.get_dict()
        self.login(auth_key)

    def login(self, auth_key):
        # Log in and collect the authenticated cookies
        login_result = requests.post(
            url='https://github.com/session',
            headers={
                'Upgrade-Insecure-Requests': '1',
                'Host': 'github.com',
            },
            data={
                'utf8': '✓',
                'authenticity_token': auth_key,
                'login': self.username,
                'password': self.password,
                'commit': 'Sign in'
            },
            cookies=self.all_cookies
        )
        self.all_cookies.update(login_result.cookies.get_dict())
        if self.all_cookies['logged_in'] == 'no':
            raise Exception('Invalid username or password')

    def get_msg(self):
        msg_obj = requests.get(
            url='https://github.com/settings/profile',
            headers={
                'Host': 'github.com',
                'Referer': 'https://github.com/',
            },
            cookies=self.all_cookies
        )
        msg = BS4xpath.get_msg_dict(msg_obj.text)
        return msg


class BS4xpath(object):
    @classmethod
    def get_auth_key(cls, text):
        soup = BeautifulSoup(text, 'html.parser')
        auth_key = soup.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
        return auth_key

    @classmethod
    def get_msg_dict(cls, text):
        response = {}
        ret2_data = BeautifulSoup(text, 'html.parser')
        div = ret2_data.find(name='div', attrs={'class': "column two-thirds"})
        dl_list = div.find_all(name='dl', attrs={'class': "form-group"})
        for row in dl_list:
            rowname = row.find('label').text
            dd_input = row.find('input')
            if dd_input:
                response[rowname] = dd_input.get('value')
        return response


obj = Github(username='a3384451', password='wo3384451')
ret = obj.get_msg()
print(ret)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import requests

all_cookie = {}

# ############### 1. View the login page ###############
r1 = requests.get(
    url='https://passport.lagou.com/login/login.html',
    headers={
        'Host': 'passport.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
)
all_cookie.update(r1.cookies.get_dict())
X_Anti_Forge_Token = re.findall(r"window.X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"window.X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# ############### 2. Log in with username and password ###############
r2 = requests.post(
    url='https://passport.lagou.com/login/login.json',
    headers={
        'Host': 'passport.lagou.com',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    },
    data={
        'isValidate': True,
        'username': '15131255089',
        'password': 'ab18d270d7126ea65915cc22c0d',
        'request_form_verifyCode': '',
        'submit': '',
    },
    cookies=r1.cookies.get_dict()
)
all_cookie.update(r2.cookies.get_dict())

# ############### 3. User authorization ###############
r3 = requests.get(
    url='https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r3.cookies.get_dict())

# ############### 4. User authentication: follow the redirect chain by hand ###############
r4 = requests.get(
    url=r3.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r4.cookies.get_dict())

r5 = requests.get(
    url=r4.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r5.cookies.get_dict())

r6 = requests.get(
    url=r5.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r6.cookies.get_dict())

r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r7.cookies.get_dict())

# ############### 5. View the personal page ###############
r5 = requests.get(
    url='https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    cookies=all_cookie
)
print('武沛齊' in r5.text)

# ############### 6. View account info ###############
r6 = requests.get(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'X-L-REQ-HEADER': "{deviceType:1}",
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
    },
    cookies=all_cookie
)
r6_json = r6.json()
all_cookie.update(r6.cookies.get_dict())

# ############### 7. Update the personal profile ###############
r7 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
        'X-Anit-Forge-Code': r6_json['submitCode'],
        'X-Anit-Forge-Token': r6_json['submitToken'],
        'X-L-REQ-HEADER': "{deviceType:1}",
    },
    cookies=all_cookie,
    json={"userName": "wupeiqi888", "sex": "MALE", "portrait": "images/myresume/default_headpic.png",
          "positionName": '...', "introduce": '....'}
)
print(r7.text)
from bs4 import BeautifulSoup


class XSSFilter(object):
    __instance = None

    def __init__(self):
        # XSS whitelist: allowed tags and, for each tag, its allowed attributes
        self.valid_tags = {
            "font": ['color', 'size', 'face', 'style'],
            'b': [],
            'div': [],
            "span": [],
            "table": ['border', 'cellspacing', 'cellpadding'],
            'th': ['colspan', 'rowspan'],
            'td': ['colspan', 'rowspan'],
            "a": ['href', 'target', 'name'],
            "img": ['src', 'alt', 'title'],
            'p': ['align'],
            "pre": ['class'],
            "hr": ['class'],
            'strong': []
        }

    def __new__(cls, *args, **kwargs):
        # Singleton: only one filter instance is ever created
        if not cls.__instance:
            obj = object.__new__(cls)
            cls.__instance = obj
        return cls.__instance

    def process(self, content):
        soup = BeautifulSoup(content, 'html.parser')
        # Walk every HTML tag
        for tag in soup.find_all():
            # If the tag name is not in the whitelist, hide it and wipe its
            # contents (except for the html/body wrappers themselves)
            if tag.name not in self.valid_tags:
                tag.hidden = True
                if tag.name not in ['html', 'body']:
                    tag.hidden = True
                    tag.clear()
                continue
            # Attribute whitelist for the current tag
            attr_rules = self.valid_tags[tag.name]
            keys = list(tag.attrs.keys())
            for key in keys:
                if key not in attr_rules:
                    del tag[key]
        return soup.decode()  # the decoded string is the filtered content


content = """
<p class='c1' id='i1'>
    asdfaa<span style="font-family:NSimSun;" class='c1'>sdf<a>a</a>sdf</span>sdf
</p>
<p>
    <strong class='c2' id='i2'>asdf</strong>
    <script>alert(123)</script>
</p>
<h2>
    asdf
</h2>
"""

content = XSSFilter().process(content)
print('content', content)
Reference: http://www.cnblogs.com/wupeiqi/articles/6283017.html
Official documentation: http://cn.python-requests.org/zh_CN/latest/user/quickstart.html#id4