Python爬蟲之urllib示例

一、最簡單:直接抓取頁面代碼

import urllib.request
import urllib.error

# Simplest case: pass the URL string straight to urlopen() and print the
# page body decoded as UTF-8.
url = 'http://test.com/test.html'
try:
    resp = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be handled first.
    # `reason` is the documented attribute holding the status message.
    print(e.code, e.reason)
except urllib.error.URLError as e:
    # No HTTP response at all (DNS failure, refused connection, ...).
    print(e.reason)
else:
    # The response wraps an open connection; the `with` block guarantees it
    # is closed even if reading or decoding fails.
    with resp:
        result = resp.read().decode('utf-8')
    print(result)

二、使用 Request

import urllib.request
import urllib.error

# Same fetch as a plain urlopen(url), but going through an explicit Request
# object, which is the recommended form because headers and data can be
# attached to it later.
url = 'http://test.com/test.html'
# Building the Request does no network I/O, so it stays outside the `try`;
# only urlopen() can raise the errors handled below.
req = urllib.request.Request(url)
try:
    resp = urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be handled first.
    # `reason` is the documented attribute holding the status message.
    print(e.code, e.reason)
except urllib.error.URLError as e:
    # No HTTP response at all (DNS failure, refused connection, ...).
    print(e.reason)
else:
    # Close the underlying connection as soon as the body has been read.
    with resp:
        result = resp.read().decode('utf-8')
    print(result)

三、發送數據,GET

import urllib.request
import urllib.parse

# GET, variant 1: the query string is written directly into the URL.
url = 'http://test.com/a.php?act=login&id=123'
req = urllib.request.Request(url)
resp = urllib.request.urlopen(req)
# This variant only demonstrates issuing the request; the body is not read
# here, so release the connection explicitly.
resp.close()

# or

# GET, variant 2: build the query string from a dict. urlencode() converts
# each value with str() and percent-encodes non-ASCII text as UTF-8.
url = 'http://test.com/a.php'
params = {
    'act': 'login',
    'id': 123,
    'name': '張三',
}
geturl = url + '?' + urllib.parse.urlencode(params)
req = urllib.request.Request(geturl)

# The `with` block closes the connection once the body has been printed.
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode('utf-8'))
# Example response (depends on what the server echoes back):
# {"act":"login","name":"\u5f35\u4e09","id":"123"}

四、發送數據,POST

import urllib.request
import urllib.parse

# POST: supplying `data` to Request switches the method from GET to POST.
url = 'http://test.com/a.php'
params = {
    'act': 'login',
    'login[name]': '張三',
    'login[password]': '123456',
}
# The request body must be bytes, so the urlencoded string is encoded.
data = urllib.parse.urlencode(params).encode('utf-8')

req = urllib.request.Request(url, data)

# The `with` block closes the connection once the body has been printed.
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode('utf-8'))
# Example response (depends on what the server echoes back):
# {"act":"login","login":{"password":"123456","name":"\u5f35\u4e09"}}

五、發送數據和header

import urllib.request
import urllib.parse

# POST with custom request headers: the third positional argument of
# Request is a dict of header names to values.
url = 'http://test.com/a.php'
params = {
    'act': 'login',
    'login[name]': '張三',
    'login[password]': '123456',
}
# The request body must be bytes, so the urlencoded string is encoded.
data = urllib.parse.urlencode(params).encode('utf-8')

headers = {
    # Present the request as coming from a desktop browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/54.0.2840.99 Safari/537.36',
    'Referer': 'http://www.baidu.com',
    # Arbitrary custom header, showing that any name/value pair can be sent.
    'haha': 'xixi',
}

req = urllib.request.Request(url, data, headers)

# The `with` block closes the connection once the body has been printed.
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode('utf-8'))
相關文章
相關標籤/搜索