3.0版本中已經將urllib2、urlparse、和robotparser併入了urllib中,而且修改urllib模塊,其中包含5個子模塊,便是help()中看到的那五個名字。
Python2中的urllib模塊,在Python3中被修改成:
20.5. urllib.request — Extensible library for opening URLs 20.6. urllib.response — Response classes used by urllib 20.7. urllib.parse — Parse URLs into components 20.8. urllib.error — Exception classes raised by urllib.request 20.9. urllib.robotparser — Parser for robots.txt
這幾個模塊,經常使用的urllib.urlopen()方法變成了urllib.request.urlopen()方法,其它方法的改變,能夠參考Python3的文檔。
Python3文檔的互聯網協議與支持部分:http://docs.python.org/py3k/library/internet.html
Python2使用庫:
urllib http://docs.python.org/library/urllib.html【下載】
urllib2 http://docs.python.org/library/urllib2.html【抓取】
urlparse http://docs.python.org/library/urlparse.html【url切分用到】
sgmllib http://docs.python.org/library/sgmllib.html【html解析用到】
# Python urllib2遞歸抓取某個網站下圖片
#!/usr/bin/python # -*- coding:utf-8 -*- # author: wklken # 2012-03-17 wklken@yeah.net #1實現url解析 #2實現圖片下載 #3優化重構 #4多線程 還沒有加入 import os,sys,urllib,urllib2,urlparse from sgmllib import SGMLParser img = [] class URLLister(SGMLParser): def reset(self): SGMLParser.reset(self) self.urls=[] self.imgs=[] def start_a(self, attrs): href = [ v for k,v in attrs if k=="href" and v.startswith("http")] if href: self.urls.extend(href) def start_img(self, attrs): src = [ v for k,v in attrs if k=="src" and v.startswith("http") ] if src: self.imgs.extend(src) def get_url_of_page(url, if_img = False): urls = [] try: f = urllib2.urlopen(url, timeout=1).read() url_listen = URLLister() url_listen.feed(f) if if_img: urls.extend(url_listen.imgs) else: urls.extend(url_listen.urls) except urllib2.URLError, e: print e.reason return urls #遞歸處理頁面 def get_page_html(begin_url, depth, ignore_outer, main_site_domain): #如果設置排除外站 過濾之 if ignore_outer: if not main_site_domain in begin_url: return if depth == 1: urls = get_url_of_page(begin_url, True) img.extend(urls) else: urls = get_url_of_page(begin_url) if urls: for url in urls: get_page_html(url, depth-1) #下載圖片 def download_img(save_path, min_size): print "download begin..." for im in img: filename = im.split("/")[-1] dist = os.path.join(save_path, filename) #此方式判斷圖片的大小太浪費了 #if len(urllib2.urlopen(im).read()) < min_size: # continue #這種方式先拉頭部,應該好多了,不用再下載一次 connection = urllib2.build_opener().open(urllib2.Request(im)) if int(connection.headers.dict['content-length']) < min_size: continue urllib.urlretrieve(im, dist,None) print "Done: ", filename print "download end..." 
if __name__ == "__main__":
    # First page to crawl for images.
    url = "http://www.baidu.com/"
    # Directory where images are saved.
    # BUG FIX: the path was misspelled "./downlaod" in the original.
    save_path = os.path.abspath("./download")
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # Minimum image size in bytes; smaller files are skipped.
    min_size = 92
    # Crawl depth.
    max_depth = 1
    # Whether to skip pages outside the start domain.
    ignore_outer = True
    main_site_domain = urlparse.urlsplit(url).netloc

    get_page_html(url, max_depth, ignore_outer, main_site_domain)
    download_img(save_path, min_size)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import sys
import gzip
import socket
import urllib.request, urllib.parse, urllib.error
import http.cookiejar


class HttpTester:
    """Small urllib-based HTTP client: GET/POST with cookie, proxy and basic-auth support."""

    def __init__(self, timeout=10, addHeaders=True):
        socket.setdefaulttimeout(timeout)  # global socket timeout for all requests
        self.__opener = urllib.request.build_opener()
        urllib.request.install_opener(self.__opener)
        if addHeaders:
            self.__addHeaders()

    def __error(self, e):
        """Error handler: just print the exception."""
        print(e)

    def __addHeaders(self):
        """Install default browser-like request headers on the opener."""
        self.__opener.addheaders = [
            ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'),
            ('Connection', 'keep-alive'),
            ('Cache-Control', 'no-cache'),
            # BUG FIX: the header name had a stray trailing colon
            # ('Accept-Language:'), producing a malformed request header.
            ('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3'),
            ('Accept-Encoding', 'gzip, deflate'),
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]

    def __decode(self, webPage, charset):
        """Gunzip the payload if it is gzip-compressed, then decode it with `charset`."""
        # BUG FIX: the gzip magic number is the two bytes \x1f\x8b (RFC 1952);
        # the original compared against the literal text b'x1fx8b' (lost
        # backslashes), so gzip responses were never decompressed.
        if webPage.startswith(b'\x1f\x8b'):
            return gzip.decompress(webPage).decode(charset)
        else:
            return webPage.decode(charset)

    def addCookiejar(self):
        """Attach a CookieJar handler to self.__opener so cookies persist across calls."""
        cj = http.cookiejar.CookieJar()
        self.__opener.add_handler(urllib.request.HTTPCookieProcessor(cj))

    def addProxy(self, host, type='http'):
        """Route requests of the given scheme through the proxy `host`."""
        proxy = urllib.request.ProxyHandler({type: host})
        self.__opener.add_handler(proxy)

    def addAuth(self, url, user, pwd):
        """Register HTTP basic-auth credentials for `url`."""
        pwdMsg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        pwdMsg.add_password(None, url, user, pwd)
        auth = urllib.request.HTTPBasicAuthHandler(pwdMsg)
        self.__opener.add_handler(auth)

    def get(self, url, params={}, headers={}, charset='UTF-8'):
        """HTTP GET; `params` are urlencoded into the query string.

        Returns the decoded body, or None on HTTPError (which is printed).
        """
        if params:
            url += '?' + urllib.parse.urlencode(params)
        request = urllib.request.Request(url)
        for k, v in headers.items():
            request.add_header(k, v)  # extra headers for this request only
        try:
            response = urllib.request.urlopen(request)
        except urllib.error.HTTPError as e:
            self.__error(e)
        else:
            return self.__decode(response.read(), charset)
def post(self, url, params={}, headers={}, charset='UTF-8'): '''HTTP POST 方法''' params = urllib.parse.urlencode(params) request = urllib.request.Request(url, data=params.encode(charset)) # 帶 data 參數的 request 被認爲是 POST 方法。 for k,v in headers.items(): request.add_header(k, v) try: response = urllib.request.urlopen(request) except urllib.error.HTTPError as e: self.__error(e) else: return self.__decode(response.read(), charset) def download(self, url, savefile): '''下載文件或網頁''' header_gzip = None for header in self.__opener.addheaders: # 移除支持 gzip 壓縮的 header if 'Accept-Encoding' in header: header_gzip = header self.__opener.addheaders.remove(header) __perLen = 0
def reporthook(a, b, c): # a:已經下載的數據大小; b:數據大小; c:遠程文件大小; if c > 1000000: nonlocal __perLen per = (100.0 * a * b) / c if per>100: per=100 per = '{:.2f}%'.format(per) print('b'*__perLen, per, end='') # 打印下載進度百分比 sys.stdout.flush() __perLen = len(per)+1 print('--> {}t'.format(url), end='') try: urllib.request.urlretrieve(url, savefile, reporthook) # reporthook 爲回調鉤子函數,用於顯示下載進度 except urllib.error.HTTPError as e: self.__error(e) finally: self.__opener.addheaders.append(header_gzip) print() 2、應用實例 在OSC上動彈一下 ht = HttpTester() ht.addCookiejar() # 爲了隱私,把有些關鍵字隱藏了 ht.get('https://www.oschina.net/home/login?goto_page=http%3A%2F%2Fwww.oschina.net%2F') ht.post(url = 'https://www.oschina.net/action/user/hash_login', params = {'email': '****@foxmail.com','pwd': 'e4a1425583d37fcd33b9*************','save_login': '1'})#密碼哈希,Firefox開發工具抓取的
ht.get('http://www.oschina.net/')
ht.post(url='http://www.oschina.net/action/tweet/pub',
        params={'user_code': '8VZTqhkJOqhnuugHvzBtME4***********',
                'user': '102*****',
                'msg': '你們在動彈什麼? via:(python3, uillib) ->{t}'.format(t=time.time())})

# Daily sign-in on Kuaipan (kuaipan.cn) to earn free storage space.
# Some keys are masked for privacy.
ht = HttpTester()
ht.addCookiejar()
ht.get('https://www.kuaipan.cn/account_login.htm')
ht.post(url='https://www.kuaipan.cn/index.php?ac=account&op=login',
        params={'username': '****@qq.com', 'userpwd': 'lyb********', 'isajax': 'yes'})
ht.get('http://www.kuaipan.cn/index.php?ac=zone&op=taskdetail')
ht.get('http://www.kuaipan.cn/index.php?ac=common&op=usersign')