爬蟲 http原理,梨視頻,github登錄實例,requests請求參數小總結

回顧:http協議基於請求響應的方式。請求:請求首行,請求頭{'key':'value'},請求體;響應:響應首行,響應頭{'key':'value'},響應體。

import socket


# Minimal HTTP server built directly on a TCP socket: every request, whatever
# its path or method, is answered with the contents of login.html.
sock = socket.socket()
sock.bind(("127.0.0.1", 8808))
sock.listen(5)

while True:
    print("server waiting.....")
    conn, addr = sock.accept()
    # The with-block closes the client socket even when reading the request
    # or the HTML file raises, so a failed request does not leak it.
    with conn:
        # Only the first 1024 bytes of the request are read and printed.
        data = conn.recv(1024)
        print("data", data)

        # Read the HTML file that is sent back as the response body.
        with open("login.html", "rb") as f:
            data = f.read()

        # sendall() keeps writing until the whole response has been sent;
        # send() may transmit only part of a large page.
        conn.sendall(b"HTTP/1.1 200 OK\r\nContent-type:text/html\r\n\r\n%s" % data)
基於socket的瀏覽器交互
'''
    GET請求
    # 請求首行
    GET / HTTP/1.1\r\n
    # get請求後面的參數
    b'GET /?name=wd&age=11 HTTP/1.1\r\n
    # 請求頭
    Host: 127.0.0.1:8008\r\n
    Connection: keep-alive\r\n
    Cache-Control: max-age=0\r\n
    Upgrade-Insecure-Requests: 1\r\n
    User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36\r\n
    Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\r\n
    Accept-Encoding: gzip, deflate, br\r\n
    Accept-Language: zh-CN,zh;q=0.9\r\n
    Cookie:csrftoken=7xx6BxQDJ6KB0PM7qS8uTA892ACtooNbnnF4LDwlYk1Y7S7nTS81FBqwruizHsxF\r\n\r\n'
    # 請求體(get請求,請求體爲空)    
    b''
    '''
    '''
    POST請求
    # 請求首行
    b'POST /?name=wd&age=11 HTTP/1.1\r\n
    # 請求頭
    Host: 127.0.0.1:8008\r\n
Connection: keep-alive\r\n
Content-Length: 21\r\n
Cache-Control: max-age=0\r\n
Origin: http://127.0.0.1:8008\r\n
Upgrade-Insecure-Requests: 1\r\n
Content-Type: application/x-www-form-urlencoded\r\n
User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36\r\n
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\r\n
Referer: http://127.0.0.1:8008/?name=lqz&age=18\r\n
Accept-Encoding: gzip, deflate, br\r\n
Accept-Language: zh-CN,zh;q=0.9\r\n
Cookie:csrftoken=7xx6BxQDJ6KB0PM7qS8uTA892ACtooNbnnF4LDwlYk1Y7S7nTS81FBqwruizHsxF\r\n\r\n'
    # 請求體
    b'name=wd&password=11'
    
    '''
 
請求
b"HTTP/1.1 200 OK\r\n
Content-type:text/html\r\n\r\n
%s"%data
響應

http原理

點擊詳情

梨視頻案例

#返回數據3種格式
#1.text                    匹配需要的東西
#2.content(二進制)    保存成圖片,視頻等
#3.json                    反序列化成字典或列表

#下載功能
def download(videos,title):
    """Download one video and save it as ``video/<title>.mp4``.

    Args:
        videos: URL of the video file to fetch.
        title: Base name (without extension) of the saved file.
    """
    # exist_ok avoids the check-then-create race: when several workers call
    # this function at once, two of them can both see the directory as
    # missing and the second mkdir would raise FileExistsError.
    os.makedirs('video', exist_ok=True)
    path = os.path.join('video', title) + '.mp4'
    res = requests.get(videos)
    # res.content is the raw response body, written out unchanged.
    with open(path, 'wb') as f:
        f.write(res.content)

#起線程執行
if __name__ == '__main__':
    from concurrent.futures import ThreadPoolExecutor
    # Leaving the with-block waits for every submitted download, exactly like
    # shutdown(wait=True), and it also runs when the loop below raises.
    with ThreadPoolExecutor(10) as p:
        # NOTE(review): parser_index, get_index, video_info and get_video are
        # not defined in this part of the file; video_info is expected to
        # return a mapping with 'video' and 'title' keys.
        for i in parser_index(get_index()):
            dic = video_info(get_video(i))
            print(dic)
            p.submit(download, dic['video'], dic['title'])

#注意問題:梨視頻下滑加載視頻(是根據url的參數,例如分類下的視頻顯示多少)

github登錄實例

#get請求登錄頁面 獲取csrf隨機字符串和cookies

#post請求登錄操作:必須攜帶csrf、輸入的用戶名密碼等(請求體數據)和 cookies、user-agent、referer等(請求頭數據)

數據是請求體還是請求頭數據?(我的理解是:比如ajax裏的data、django的返回數據都是請求體/響應體的數據;而像 response.set_cookie('islogin', 'true') 這樣設置的cookie則屬於頭部數據)

"""
1.請求登錄頁面 獲取token cookie
2.發送登錄的post請求,將用戶名密碼 和token 放在請求體中,cookie放在請求頭中

"""
import requests
import re
login_url = "https://github.com/login"
# Browser identification sent with every request.
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
# Step 1: request the login page.
res1 = requests.get(login_url, headers=headers)

print(res1.status_code)
# Extract the token from the response body. Fail with a clear message when
# the field is missing instead of an AttributeError on None.
token_match = re.search(r'name="authenticity_token" value="(.*?)"', res1.text)
if token_match is None:
    raise RuntimeError("authenticity_token not found in the login page")
token = token_match.group(1)

# Keep the cookies set by the login page; they are sent with the login POST.
login_cookie = res1.cookies.get_dict()
print(login_cookie)

# Step 2: send the login request. Token and credentials go in the request
# body, the cookies and user-agent go in the request headers.
res2 = requests.post(
    "https://github.com/session",
    headers=headers,
    cookies=login_cookie,
    data={
        "commit": "Sign in",
        "utf8": "",
        "authenticity_token": token,
        "login": "xxxxxxxxxxx",
        "password": "xxxxxxxxxxx",
    },
    # Do not follow redirects, so res2 is the login response itself rather
    # than the page it redirects to.
    allow_redirects=False,
)
print(res2.status_code)

# Cookies set by the login response.
user_cookie = res2.cookies.get_dict()

# Step 3: request a page with the user's cookies attached.
res3 = requests.get("https://github.com/settings/profile", cookies=user_cookie, headers=headers)
print(res3.status_code)
print(res3.text)
# "https://github.com/settings/profile"
# "https://github.com/settings/profile"

requests請求參數小總結

# GET request: query-string parameters are passed through ``params``.
kwd = "吳秀波出軌門"
url = "https://www.baidu.com/s"
search_params = {"wd": kwd}
requests.get(url, params=search_params, headers=headers)

# POST request: form fields are passed through ``data``.
ua_headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
login_form = {
    "commit": "Sign in",
    "utf8": "",
    "authenticity_token": token,
    "login": "ssssss",
    "password": "ssssss",
}
requests.post(
    "https://github.com/session",
    data=login_form,
    cookies=login_cookie,
    headers=ua_headers,
    # Do not follow redirects automatically.
    allow_redirects=False,
)
# Working with the response object:
# response.cookies.get_dict()  # cookies as a dict
# response.status_code         # HTTP status code
# response.text                # body as text
# response.content             # body as raw bytes
# response.json()              # body deserialized into a dict or list
主要代碼內容
相關文章
相關標籤/搜索