原生socket請求url獲取狀態碼、消息報頭、響應正文

時間 2019-11-25

標籤原生 socket 請求 url 獲取狀態消息報頭響應正文欄目系統網絡简体版

原文原文鏈接

需求：

(1) 使用 socket 及 ssl 模塊寫通用的 web 客戶端
(2) 向服務器發起請求
(3) 接收響應內容並解析出狀態碼、消息報頭、響應正文
(4) 最核心的函數：輸入一個 url，返回狀態碼、消息報頭、響應正文；當然這也是最後實現的效果

知識儲備：

網絡基礎知識

python 的 web 編程（socket）

最後實現代碼：

  1 # __author__ = "wyb"
  2 # date: 2018/6/5
  3 # 代碼: 高內聚低耦合 -> 使用函數封裝一些邏輯代碼 -> 功能函數
  4 
  5 import socket
  6 import ssl
  7 """
  8 在 Python3 中，bytes 和 str 的互相轉換方式是
  9 str.encode('utf-8')
 10 bytes.decode('utf-8')
 11 
 12 send 函數的參數和 recv 函數的返回值都是 bytes 類型
 13 
 14 1、使用 https
 15     1, https 請求的默認端口是 443
 16     2, https 的 socket 連接需要 import ssl
 17         並且使用 s = ssl.wrap_socket(socket.socket()) 來初始化（注意：ssl.wrap_socket 已在 Python 3.12 中移除，新代碼應改用 ssl.SSLContext.wrap_socket）
 18 
 19 2、HTTP 協議的 301 狀態
 20     請求豆瓣電影 top250 (注意協議)
 21     http://movie.douban.com/top250
 22     返回結果是一個 301
 23     301 狀態會在 HTTP 頭的 Location 部分告訴你應該轉向的 URL
 24     因此, 如果遇到 301, 就請求新地址並且返回
 25         HTTP/1.1 301 Moved Permanently
 26         Date: Sun, 05 Jun 2016 12:37:55 GMT
 27         Content-Type: text/html
 28         Content-Length: 178
 29         Connection: keep-alive
 30         Keep-Alive: timeout=30
 31         Location: https://movie.douban.com/top250
 32         Server: dae
 33         X-Content-Type-Options: nosniff
 34 
 35         <html>
 36         <head><title>301 Moved Permanently</title></head>
 37         <body bgcolor="white">
 38         <center><h1>301 Moved Permanently</h1></center>
 39         <hr><center>nginx</center>
 40         </body>
 41         </html>
 42 
 43 https 的默認端口是 443, 因此你需要在 get 函數中根據協議設置不同的默認端口
 44 """
 45 
 46 
 47 # 功能函數:
 48 # 解析url產生protocol、host、port、path
 49 def parsed_url(url):
 50     """
 51     :param url: 字符串, 可能的值以下
 52     'g.cn'
 53     'g.cn/'
 54     'g.cn:3000'
 55     'g.cn:3000/search'
 56     'http://g.cn'
 57     'https://g.cn'
 58     'http://g.cn/'
 59     :return: 返回一個 tuple, 內容: (protocol, host, port, path)
 60     """
 61     protocol = "http"
 62     if url[:7] == "http://":
 63         u = url.split("://")[1]
 64     elif url[:8] == "https://":
 65         protocol = "https"
 66         u = url.split("://")[1]
 67     else:
 68         u = url
 69 
 70     # 檢查默認path
 71     i = u.find("/")
 72     if i == -1:
 73         host = u
 74         path = "/"
 75     else:
 76         host = u[:i]
 77         path = u[i:]
 78 
 79     # 檢查端口
 80     port_dict = {
 81         "http": 80,
 82         "https": 443,
 83     }
 84     # 默認端口
 85     port = port_dict[protocol]
 86     if ":" in host:
 87         h = host.split(":")
 88         host = h[0]
 89         port = int(h[1])
 90 
 91     return protocol, host, port, path
 92 
 93 
 94 # 根據協議返回socket實例
 95 def socket_by_protocol(protocol):
 96     """
 97     根據協議返回socket實例
 98     :param protocol: 協議
 99     :return: socket實例
100     """
101     if protocol == "http":
102         s = socket.socket()             # 生成一個socket對象
103 
104     else:
105         # HTTPS 協議須要使用 ssl.wrap_socket 包裝一下原始的 socket
106         # 除此以外無其餘差異
107         s = ssl.wrap_socket(socket.socket())
108     return s
109 
110 
111 # 根據socket對象接受數據
def response_by_socket(s, buffer_size=1024):
    """
    Read from a connected socket until the peer closes the connection.

    :param s: connected socket instance (anything with a ``recv`` method)
    :param buffer_size: maximum number of bytes requested per ``recv`` call
    :return: all received bytes concatenated into a single ``bytes`` object
    """
    # Collect the chunks and join once at the end; repeated ``bytes +=`` copies
    # the whole accumulated response on every iteration.
    chunks = []
    while True:
        chunk = s.recv(buffer_size)
        if not chunk:
            # An empty read means the peer has closed the connection.
            break
        chunks.append(chunk)
    return b"".join(chunks)
126 
127 
128 # 把 response 解析出 狀態碼 headers body 返回
def parsed_response(r):
    """
    Parse a decoded HTTP response into status code, headers and body.

    :param r: the full response text (status line, headers, blank line, body)
    :return: tuple (status_code, headers, body) where ``status_code`` is an
        int, ``headers`` is a dict of header name to value, and ``body`` is
        the text after the first blank line ("" when there is none)
    :raises IndexError, ValueError: if the status line is malformed
    """
    # partition never fails: a response without a blank line has an empty body.
    head, _, body = r.partition('\r\n\r\n')
    lines = head.split('\r\n')

    # Status line looks like: HTTP/1.1 200 OK
    status_code = int(lines[0].split()[1])

    headers = {}
    for line in lines[1:]:
        # Split on the first colon only: values may themselves contain ": ",
        # and the space after the colon is optional.
        name, sep, value = line.partition(':')
        if not sep:
            # Not a "name: value" line; skip it rather than abort the parse.
            continue
        headers[name.strip()] = value.strip()
    return status_code, headers, body
146 
147 
148 # 主邏輯函數:
149 # 把向服務器發送 HTTP 請求而且得到數據這個過程封裝成函數 -> 複雜的邏輯(具備重用性)封裝成函數
def get(url, max_redirects=10):
    """
    Fetch a URL over a raw socket and return the parsed response.

    301 and 302 responses are followed through their Location header, up to
    ``max_redirects`` times. Once the limit is reached, or the redirect
    carries no Location header, the redirect response itself is returned.

    :param url: address to request, for example
        'g.cn'
        'g.cn/'
        'g.cn:3000'
        'g.cn:3000/search'
        'http://g.cn'
        'https://g.cn'
        'http://g.cn/'
    :param max_redirects: maximum number of redirects to follow
    :return: tuple (status_code, headers, body)
    """
    encoding = 'utf-8'
    while True:
        protocol, host, port, path = parsed_url(url)

        s = socket_by_protocol(protocol)
        try:
            s.connect((host, port))
            request = 'GET {} HTTP/1.1\r\nhost: {}\r\nConnection: close\r\n\r\n'.format(path, host)
            # sendall keeps writing until the whole request is sent; send may
            # stop after a partial write.
            s.sendall(request.encode(encoding))
            response = response_by_socket(s)
        finally:
            # Always release the socket, including on connect/read errors.
            s.close()

        # The body is not guaranteed to be UTF-8; replace undecodable bytes
        # instead of failing the whole request.
        r = response.decode(encoding, errors='replace')
        status_code, headers, body = parsed_response(r)

        if status_code not in (301, 302) or max_redirects <= 0:
            return status_code, headers, body

        # Header names are case-insensitive on the wire.
        location = next(
            (value for name, value in headers.items() if name.lower() == 'location'),
            None,
        )
        if not location:
            return status_code, headers, body

        if location.startswith('//'):
            # Protocol-relative redirect: reuse the current protocol.
            location = '{}:{}'.format(protocol, location)
        elif location.startswith('/'):
            # Path-only redirect: stay on the current server.
            location = '{}://{}:{}{}'.format(protocol, host, port, location)

        url = location
        max_redirects -= 1
186 
187 
188 # 單元測試:
def test_parsed_url():
    """
    Check parsed_url against a table of URLs and their expected tuples.

    parsed_url is easy to get wrong, so each case is asserted; the first
    mismatch raises AssertionError with the URL, the actual result and the
    expected result in the message.
    """
    hostname = "g.cn"
    root = "/"
    cases = {
        'http://g.cn': ("http", hostname, 80, root),
        'http://g.cn/': ("http", hostname, 80, root),
        'http://g.cn:90': ("http", hostname, 90, root),
        'http://g.cn:90/': ("http", hostname, 90, root),
        'https://g.cn': ("https", hostname, 443, root),
        'https://g.cn:233': ("https", hostname, 233, root),
    }
    for url, expected in cases.items():
        actual = parsed_url(url)
        # Build the failure message up front so it names the offending case.
        message = "parsed_url ERROR, ({}) ({}) ({})".format(url, actual, expected)
        assert actual == expected, message
212 
213 
def test_get():
    """
    Fetch the same page over HTTP and over HTTPS and print each result.

    This is a manual smoke test: it needs network access and asserts nothing.
    """
    targets = (
        'http://movie.douban.com/top250',
        'https://movie.douban.com/top250',
    )
    for target in targets:
        print(get(target))
225 
226 
227 # 使用:
def main():
    """Request the demo URL and print its status code, headers and body."""
    status_code, headers, body = get('http://movie.douban.com/top250')
    report = (
        ("status_code: ", status_code),
        ("headers: ", headers),
        ("body: ", body),
    )
    for label, value in report:
        print(label, value)
236 
237 
if __name__ == "__main__":
    # Runs the demo request; call test_parsed_url() or test_get() here
    # instead to run the checks.
    main()