Python3之經常使用模塊

大綱>>

  1. time & datetime模塊
  2. random模塊
  3. OS模塊
  4. sys模塊
  5. shelve模塊
  6. shutil模塊
  7. xml模塊
  8. configparser模塊
  9. Hashlib、Hmac模塊
  10. zipfile&tarfile模塊
  11. PyYAML模塊
  12. re正則表達式

 

time & datetime模塊

# !/usr/bin/env python
import time, datetime

"""
    Commonly used standard library: time

    Three ways to represent time:
        1. timestamp: offset in seconds since 1970-01-01 00:00:00
        2. formatted time string
        3. tuple (struct_time): struct_time has 9 fields

        strftime/strptime directives:
            %a    locale's abbreviated weekday name
            %A    locale's full weekday name
            %b    locale's abbreviated month name
            %B    locale's full month name
            %c    locale's date and time representation
            %d    day of the month (01 - 31)
            %H    hour, 24-hour clock (00 - 23)
            %I    hour, 12-hour clock (01 - 12)
            %j    day of the year (001 - 366)
            %m    month (01 - 12)
            %M    minute (00 - 59)
            %p    locale's AM/PM indicator
            %S    second (00 - 61, to allow for leap seconds)
            %U    week number of the year (00 - 53, Sunday starts the week;
                  days before the first Sunday fall in week 0)
            %w    weekday (0 - 6, 0 is Sunday)
            %W    like %U, but Monday starts the week
            %x    locale's date representation
            %X    locale's time representation
            %y    year without century (00 - 99)
            %Y    full year
            %Z    time zone name (empty string when no zone exists)
            %%    a literal '%' character
"""
# print(help(time))
# print(help(time.ctime)) # show usage of a specific function

# current time as a timestamp
print(time.time())
# high-resolution timer; time.clock() was deprecated in 3.3 and removed
# in 3.8 -- time.perf_counter() is the documented replacement
print(time.perf_counter())

# sleep for the given number of seconds
# print(time.sleep(1))

# struct_time in UTC; time.gmtime(x) accepts an optional timestamp x
print(time.gmtime())

# struct_time in local time (UTC+8 for the original author); time.localtime(x) accepts an optional timestamp x
print(time.localtime())


x = time.localtime()
print("x:", x)
# format a struct_time as a custom string: time.strftime(str_format, x)
print(time.strftime("%Y-%m-%d %H:%M:%S", x))

# format a timestamp, e.g.: Tue Jun 16 11:53:31 2009
print(time.ctime(1245124411))

# individual fields of a struct_time: year / month / day ...
print(x.tm_year, x.tm_mon, x.tm_mday, x.tm_hour, x.tm_min, x.tm_sec)

# convert a struct_time back to a timestamp
print(time.mktime(x))

# timestamp -> string
# one day earlier (86400 s; the original had 86640, an apparent typo), as a UTC struct_time
print(time.gmtime(time.time()-86400))
print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))   # UTC struct_time -> formatted string


"""
    datetime module: date/time arithmetic.
"""
print("時間加減datetime模塊".center(50, "~"))

# e.g. 2018-01-20 23:20:49.418354
print(datetime.datetime.now())

# timestamp straight to a date, e.g. 2018-01-20
print(datetime.date.fromtimestamp(time.time()))

# now + 3 days
print(datetime.datetime.now() + datetime.timedelta(3))

# now - 3 days
print(datetime.datetime.now() + datetime.timedelta(-3))

# now + 3 hours
print(datetime.datetime.now() + datetime.timedelta(hours=3))

# now + 30 minutes
print(datetime.datetime.now() + datetime.timedelta(minutes=30))

c_time = datetime.datetime.now()
# replace individual fields of a datetime
print(c_time.replace(minute=54, hour=5))

  

時間關係轉換圖:

 

random模塊

# Author:Allister.Liu
# !/usr/bin/env python
import random

"""
    random模塊:
        

"""

# 用於生成一個0到1的隨機符點數: 0 <= n < 1.0
print(random.random())

#  random.randint(a, b),用於生成一個指定範圍內的整數。其中參數a是下限,參數b是上限,生成的隨機數n: a <= n <= b
print(random.randint(1, 10))


# random.randrange([start], stop[, step]),
# 從指定範圍內,按指定基數遞增的集合中 獲取一個隨機數。如:random.randrange(10, 100, 2),
# 結果至關於從[10, 12, 14, 16, ... 96, 98]序列中獲取一個隨機數。
# random.randrange(10, 100, 2)在結果上與 random.choice(range(10, 100, 2) 等效。
print(random.randrange(1, 10))
print(random.choice(range(10, 100, 2)))

# 從序列中獲取一個隨機元素。 random.choice(sequence) sequence在python不是一種特定的類型,而是泛指一系列的類型。 list, tuple, 字符串都屬於sequence
print(random.choice("abcdef"))

print(random.choice("學習Python的小夥伴"))  # 夥
print(random.choice(["JGood", "is", "a", "handsome", "boy"]))  # boy--  List
print(random.choice(("Tuple","List","Dict")))   # Tuple

# random.sample(sequence, k),從指定序列中隨機獲取指定長度的片段。sample函數不會修改原有序列。
print(random.sample([1, 2, 3, 4, 5, 6, 7, 8, 9], 5))  # [2, 1, 9, 5, 7]


# 隨機整數:
print(random.randint(0, 99))  # 70

# 隨機選取0到100間的偶數:
print(random.randrange(0, 101, 2))  # 4

# 隨機浮點數:
print(random.random())  # 0.2746445568079129
print(random.uniform(1, 10))  # 9.887001463194844

# 隨機字符:
print(random.choice('abcdefg&#%^*f'))  # e

# 多個字符中選取特定數量的字符:
print(random.sample('abcdefghij123', 3))  # ['3', 'j', 'i']

# 隨機選取字符串:
print(random.choice(['apple', 'pear', 'peach', 'orange', 'lemon']))  # peach

# 洗牌#
items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
print(items)  # [1, 2, 3, 4, 5, 6, 7, 8, 9]

random.shuffle(items)
print(items)  # [8, 3, 6, 1, 4, 9, 5, 7, 2]


"""
 生成6爲驗證碼:由數字, 大寫字母, 小寫字母組成的6位隨機驗證碼
"""


def produce_check_code(scope=6):
    """Return a random verification code of `scope` characters.

    Each character is drawn uniformly from digits plus upper- and
    lowercase ASCII letters.  (The original drew a category first with a
    biased 6/2/3 split between digits, uppercase and lowercase; a uniform
    draw over one combined charset is simpler and at least as random.)
    """
    import string
    charset = string.digits + string.ascii_letters
    # NOTE(review): `random` is fine for a demo captcha; use
    # `secrets.choice` for anything security-sensitive.
    return "".join(random.choice(charset) for _ in range(scope))


print(produce_check_code(8))

  

0.21786963196954112
3
2
34
b
的
JGood
List
[7, 2, 6, 4, 8]
12
14
0.5355914470942843
3.3065568721321013
%
['2', 'g', 'f']
pear
[1, 2, 3, 4, 5, 6, 7, 8, 9]
[6, 7, 5, 9, 1, 2, 3, 4, 8]
D626EbYt

 

OS模塊

提供對操作系統進行調用的接口:

# Author:Allister.Liu
# !/usr/bin/env python
import os

"""
    os module: interface to operating-system services.
"""
path = "E:/logo/ic2c/logo.png"

# current working directory of this script === linux: pwd
print(os.getcwd())

# change the working directory, like shell `cd`
# os.chdir("dirname")

# the current directory: '.'
print(os.curdir)

# the parent directory: '..'
print(os.pardir)

# create nested directories recursively
# os.makedirs('dirname1/dirname2')

# remove a directory, then each now-empty parent, recursively
# os.removedirs('dirname1')

# create a single directory, like shell `mkdir dirname`
# os.mkdir('dirname')

# remove a single empty directory (errors if non-empty), like shell `rmdir dirname`
# os.rmdir('dirname')

# list every entry of a directory (hidden files included) as a list;
# guarded so the demo does not crash when the sample directory is absent
if os.path.isdir('E:/logo'):
    print(os.listdir('E:/logo'))

# delete a file
# os.remove()

# rename a file/directory
# os.rename("oldname","newname")

# stat a file/directory
# os.stat('path/filename')

# platform path separator: "\\" on Windows, "/" on Linux
os.sep

# platform line terminator: "\r\n" on Windows, "\n" on Linux
# (the original comment said "\t\n", which is wrong)
os.linesep

# separator used in path lists, e.g. the PATH environment variable
os.pathsep

# platform name: Windows -> 'nt'; Linux -> 'posix'
os.name

# run a shell command; output goes straight to the console
os.system("dir")

# environment variables
print(os.environ)

# normalized absolute version of path
print(os.path.abspath(path))

# split path into a (directory, filename) tuple
print(os.path.split(path))

# the directory part, i.e. os.path.split(path)[0]
print(os.path.dirname(path))

# the final component; empty when path ends with / or \; i.e. os.path.split(path)[1]
print(os.path.basename(path))

# True when path exists, False otherwise
print(os.path.exists(path))

# True when path is absolute
print(os.path.isabs(path))

# True when path is an existing file
print(os.path.isfile(path))

# True when path is an existing directory
print(os.path.isdir(path))

# join paths; components before the first absolute one are discarded
# os.path.join(path1[, path2[, ...]])

# last access / modification times; getatime/getmtime raise OSError on a
# missing path, so guard the demo
if os.path.exists(path):
    print(os.path.getatime(path))
    print(os.path.getmtime(path))

  

sys模塊

# Author:Allister.Liu
# !/usr/bin/env python

import sys


print(help(sys))
# the command-line argument list; the first element is the script's own path
sys.argv

# exit the program; exit(0) is a normal exit
# sys.exit(0)

# version information of the Python interpreter
print(sys.version)

# the largest value a container index can take -- NOT a maximum int
# (Python 3 ints are unbounded); the original comment was misleading
print(sys.maxsize)

# module search path, seeded from the PYTHONPATH environment variable
print(sys.path)

# operating-system platform name
print(sys.platform)

# write without a trailing newline -- the building block for progress bars
sys.stdout.write('please:')
val = sys.stdin.readline()[:-1]
print(val)

  

shelve模塊

# Author:Allister.Liu
# !/usr/bin/env python
import shelve
import os, datetime


"""
    shelve module: a simple key/value persistence layer on top of pickle --
    any object that pickle can handle may be stored under a string key.
"""

file_path = "datas"
# create the data directory when it is missing
if not os.path.exists(file_path):
    os.mkdir(file_path)


class Test(object):
    def __init__(self, n):
        self.n = n


t1 = Test(123)
t2 = Test(123334)

names = ["Allister", "Linde", "Heddy", "Daty"]

# The original never closed the shelf, risking lost buffered writes; the
# context manager guarantees sync/close even on error.
with shelve.open(file_path + "/shelve_file.data") as d:
    # persist a list under key "names"
    d["names"] = names

    # persist class instances
    d["t1"] = t1

    d["t2"] = t2

    d["date"] = datetime.datetime.now()

    """
        Read values back.
    """
    # look up values by key
    print(d.get("names"))
    print(d.get("t1"))
    print(d.get("date"))

    print(d.items())

  

shutil模塊

# Author:Allister.Liu
# !/usr/bin/env python

import shutil

"""
    shutil module: high-level file operations.

    shutil.copyfileobj(fsrc, fdst[, length]): copy one file object's content into another (optionally partial);

    shutil.copyfile(src, dst): copy a file;

    shutil.copymode(src, dst): copy permission bits only -- content, group and owner untouched;

    shutil.copystat(src, dst): copy stat info: mode bits, atime, mtime, flags;

    shutil.copy(src, dst): copy file content and permissions;

    shutil.copy2(src, dst): copy file content plus stat info and permissions;

    shutil.rmtree(path[, ignore_errors[, onerror]]): recursively delete a tree;

    shutil.move(src, dst): recursively move files;

    shutil.copytree(src, dst, symlinks=False, ignore=None): recursively copy a tree;

    shutil.make_archive(base_name, format,...): create an archive (zip/tar) and return its path;
        base_name: archive name, or a path.  A bare name like "ic2c" saves to the
            current directory; "/Users/Allister/ic2c" saves to /Users/Allister/;
        format: archive kind -- "zip", "tar", "bztar" or "gztar";
        root_dir: directory to archive (defaults to the current directory);
        owner: user, defaults to the current user;
        group: group, defaults to the current group;
        logger: usually a logging.Logger instance;
"""

"""
Copy "筆記.data" to "筆記1.data".
"""
# Guard the demo: the original crashed with FileNotFoundError whenever the
# sample file was absent.
try:
    with open("筆記.data", "r", encoding="utf-8") as f1:
        with open("筆記1.data", "w", encoding="utf-8") as f2:
            shutil.copyfileobj(f1, f2)
except FileNotFoundError as e:
    print(e)


# copyfile opens both files itself -- no manual open needed
# shutil.copyfile("筆記.data", "筆記2.data")

# recursively copy every file under a directory
# shutil.copytree("../day4", "../day5/copys")

# delete the tree copied above
# shutil.rmtree("copys")

# create an archive and return its path
# print(shutil.make_archive("H:/wx/432", "zip" ,root_dir="H:/PycharmProjects/python_tutorial/"))

  

xml模塊

<data>
    <country name="Liechtenstein">
        <rank updated="yes">2</rank>
        <year updated="yes">2009</year>
        <gdppc>141100</gdppc>
        <neighbor direction="E" name="Austria" />
        <neighbor direction="W" name="Switzerland" />
    </country>
    <country name="Singapore">
        <rank updated="yes">5</rank>
        <year updated="yes">2012</year>
        <gdppc>59900</gdppc>
        <neighbor direction="N" name="Malaysia" />
    </country>
    <country name="Panama">
        <rank updated="yes">69</rank>
        <year updated="yes">2012</year>
        <gdppc>13600</gdppc>
        <neighbor direction="W" name="Costa Rica" />
        <neighbor direction="E" name="Colombia" />
    </country>
</data>
xml_test.xml

 

# Author:Allister.Liu
# !/usr/bin/env python
import os
import xml.etree.ElementTree as ET


"""
xml module: XML is a protocol for exchanging data between different languages
and programs, much like json (json is simpler to use); many legacy systems --
e.g. in finance -- still expose XML-based interfaces.
"""

# XML is supported in every mainstream language; in Python use the module above.

# Make sure the data directory exists before any of the write() calls below.
os.makedirs("datas", exist_ok=True)

# Guard the parsing demos: the original crashed with FileNotFoundError when
# the sample file was absent.
if os.path.exists("datas/xml_test.xml"):
    tree = ET.parse("datas/xml_test.xml")

    root = tree.getroot()
    print("父節點:", root.tag)

    # # walk the whole document
    # for child in root:
    #     print(child.tag, child.attrib)
    #     for i in child:
    #         print(i.tag, i.text)
    #
    # # walk only the 'year' nodes
    # for node in root.iter('year'):
    #     print(node.tag, node.text)

    """
        Modify and delete document content.
    """
    # modify: bump every year by one and mark it updated
    for node in root.iter('year'):
        new_year = int(node.text) + 1
        node.text = str(new_year)
        node.set("updated", "yes")

    tree.write("datas/xmltest.xml")

    # delete nodes; findall() returns a list, so removing children while
    # looping over it is safe
    for country in root.findall('country'):
        rank = int(country.find('rank').text)
        if rank > 50:
            root.remove(country)

    tree.write('datas/output.xml')


"""
    Create an XML document from scratch.
"""
new_xml = ET.Element("namelist")
name = ET.SubElement(new_xml, "name", attrib={"enrolled": "yes"})
age = ET.SubElement(name, "age", attrib={"checked": "no"})
sex = ET.SubElement(name, "sex")
sex.text = '33'
name2 = ET.SubElement(new_xml, "name", attrib={"enrolled": "no"})
age = ET.SubElement(name2, "age")
age.text = '19'

et = ET.ElementTree(new_xml)  # build the document object
et.write("datas/test.xml", encoding="utf-8", xml_declaration=True)

ET.dump(new_xml)  # print the generated document

  

configparser模塊

  • 文件的生成:
# Author:Allister.Liu
# !/usr/bin/env python
import configparser
import os

"""
    Generate a mysql-style configuration file with configparser.
"""
config = configparser.ConfigParser()
# style 1: assign a whole section as a dict
config["client"] = {'port': '3306',
                    'default-character-set': 'utf8'}

# style 2: create an empty section, then fill it key by key
config['mysqld'] = {}
config['mysqld']['port'] = '3306'
config['mysqld']['character_set_server'] = 'utf8'
config['mysqld']['collation-server'] = 'utf8_general_ci'
config['mysqld']['lower_case_table_names'] = '1'
config['mysqld']['max_connections'] = '200'

# style 3: keep a reference to the section proxy and assign through it
config['mysqld_safe'] = {}
topsecret = config['mysqld_safe']
topsecret['log-error'] = '/usr/local/mysql/error.log'

config['mysqld']['datadir'] = '/usr/local/mysql/data'

# the original assumed ./datas already existed and crashed otherwise
os.makedirs('datas', exist_ok=True)
with open('datas/my.ini', 'w') as configfile:
    config.write(configfile)
  • 文件的讀取:
# Author:Allister.Liu
# !/usr/bin/env python
import configparser

"""
    Reading a configuration file with configparser.
"""

config = configparser.ConfigParser()

# read() returns the list of files successfully parsed; a missing file is
# silently skipped, so guard everything on a successful read (the original
# crashed on key access when datas/my.ini did not exist).
loaded = config.read('datas/my.ini')

if loaded:
    # top-level sections of the file
    print(config.sections())  # ['client', 'mysqld', 'mysqld_safe', 'logs']

    # membership test for a section
    print("mysqld" in config)  # True

    # individual values inside a section
    print(config["mysqld"]["port"])  # 3306
    print(config["mysqld_safe"]["log-error"])   # /usr/local/mysql/error.log

    topsecret = config["mysqld_safe"]
    print(topsecret["log-error"])   # /usr/local/mysql/error.log

    print("遍歷配置文件".center(50, "~"))
    for key in config["mysqld"]:
        print(key)

    # the section's options as (key, value) tuples
    print(config.items("mysqld"))

    print(" 改寫 ".center(50, "#"))
    # remove a section, then rewrite the file
    # sec = config.remove_section('mysqld')
    # config.write(open('datas/my.ini', "w"))

    # # test whether a section exists
    # sec = config.has_section('mysqld')
    # print(sec)

    # set() raises NoSectionError for a missing section -- the original never
    # added 'logs' (add_section was commented out), so create it first.
    if not config.has_section('logs'):
        config.add_section('logs')

    # add log_path under the logs section
    config.set('logs', 'log_path', "/usr/logs")
    # the original `config.write(open(...))` leaked the file handle
    with open('datas/my.ini', "w") as configfile:
        config.write(configfile)

  

Hashlib、Hmac模塊

# Author:Allister.Liu
# !/usr/bin/env python

import hashlib

"""
    hashlib module: cryptographic hashing.  In Python 3 it replaces the old
    md5 and sha modules and provides SHA1, SHA224, SHA256, SHA384, SHA512
    and MD5.
"""
m1 = hashlib.md5()
m1.update("asdfghjkl".encode("utf-8"))

# binary digest
print(m1.digest())
# hexadecimal digest
print(m1.hexdigest())

# The same 'admin' digest under every algorithm.  The original repeated this
# block five times and shadowed the builtin name `hash`; a loop over
# hashlib.new() produces identical output.
for algo in ("md5", "sha1", "sha256", "sha384", "sha512"):
    print(" {} ".format(algo).center(50, "#"))
    digest = hashlib.new(algo)
    digest.update('admin'.encode("utf-8"))
    print(digest.hexdigest())


"""
    The hmac module processes a key together with the message before hashing.

    HMAC (keyed-Hash Message Authentication Code): both parties agree on a
    secret key in advance; the sender derives a MAC from key + message and
    the receiver recomputes it over the received plaintext and compares, so
    the message's authenticity and the sender's legitimacy can be verified.
"""

import hmac
# digestmod is mandatory since Python 3.8 (the original call relied on the
# removed implicit md5 default); passing md5 keeps the original output.
h = hmac.new('中華好兒女'.encode("utf-8"), '美麗的山河'.encode("utf-8"), digestmod=hashlib.md5)
print(h.hexdigest())

  

zipfile&tarfile模塊

# Author:Allister.Liu
# !/usr/bin/env python

"""
zip compression / extraction
"""
import os
import zipfile

# compress -- only add demo files that actually exist; the original crashed
# with FileNotFoundError when they were missing
with zipfile.ZipFile('Allister.zip', 'w') as z:
    for fname in ('筆記.data', 'sys_test.py'):
        if os.path.exists(fname):
            z.write(fname)

# extract
with zipfile.ZipFile('Allister.zip', 'r') as z:
    z.extractall()


"""
    tar compression / extraction
"""

import tarfile

# compress -- same guard as above for the demo source files
with tarfile.open('your.tar', 'w') as tar:
    for src, arc in (('/home/dsa.tools/mysql.zip', 'mysql.zip'),
                     ('/Users/wupeiqi/PycharmProjects/cmdb.zip', 'cmdb.zip')):
        if os.path.exists(src):
            tar.add(src, arcname=arc)

# extract
# NOTE(review): extractall() on an untrusted archive can write outside the
# target directory; pass filter='data' on Python 3.12+.
with tarfile.open('your.tar', 'r') as tar:
    tar.extractall()  # an extraction path can be passed as an argument

  

a、zipfile

 1 """
 2 Read and write ZIP files.  3 
 4 XXX references to utf-8 need further investigation.  5 """
 6 import io  7 import os  8 import re  9 import importlib.util  10 import sys  11 import time  12 import stat  13 import shutil  14 import struct  15 import binascii  16 
 17 try:  18     import threading  19 except ImportError:  20     import dummy_threading as threading  21 
 22 try:  23     import zlib # We may need its compression method
 24     crc32 = zlib.crc32  25 except ImportError:  26     zlib = None  27     crc32 = binascii.crc32  28 
 29 try:  30     import bz2 # We may need its compression method
 31 except ImportError:  32     bz2 = None  33 
 34 try:  35     import lzma # We may need its compression method
 36 except ImportError:  37     lzma = None  38 
 39 __all__ = ["BadZipFile", "BadZipfile", "error",  40            "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA",  41            "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"]  42 
 43 class BadZipFile(Exception):  44     pass
 45 
 46 
 47 class LargeZipFile(Exception):  48     """
 49  Raised when writing a zipfile, the zipfile requires ZIP64 extensions  50  and those extensions are disabled.  51     """
 52 
 53 error = BadZipfile = BadZipFile      # Pre-3.2 compatibility names
 54 
 55 
 56 ZIP64_LIMIT = (1 << 31) - 1
 57 ZIP_FILECOUNT_LIMIT = (1 << 16) - 1
 58 ZIP_MAX_COMMENT = (1 << 16) - 1
 59 
 60 # constants for Zip file compression methods
 61 ZIP_STORED = 0  62 ZIP_DEFLATED = 8
 63 ZIP_BZIP2 = 12
 64 ZIP_LZMA = 14
 65 # Other ZIP compression methods not supported
 66 
 67 DEFAULT_VERSION = 20
 68 ZIP64_VERSION = 45
 69 BZIP2_VERSION = 46
 70 LZMA_VERSION = 63
 71 # we recognize (but not necessarily support) all features up to that version
 72 MAX_EXTRACT_VERSION = 63
 73 
 74 # Below are some formats and associated data for reading/writing headers using
 75 # the struct module. The names and structures of headers/records are those used
 76 # in the PKWARE description of the ZIP file format:
 77 # http://www.pkware.com/documents/casestudies/APPNOTE.TXT
 78 # (URL valid as of January 2008)
 79 
 80 # The "end of central directory" structure, magic number, size, and indices
 81 # (section V.I in the format document)
 82 structEndArchive = b"<4s4H2LH"
 83 stringEndArchive = b"PK\005\006"
 84 sizeEndCentDir = struct.calcsize(structEndArchive)  85 
 86 _ECD_SIGNATURE = 0  87 _ECD_DISK_NUMBER = 1
 88 _ECD_DISK_START = 2
 89 _ECD_ENTRIES_THIS_DISK = 3
 90 _ECD_ENTRIES_TOTAL = 4
 91 _ECD_SIZE = 5
 92 _ECD_OFFSET = 6
 93 _ECD_COMMENT_SIZE = 7
 94 # These last two indices are not part of the structure as defined in the
 95 # spec, but they are used internally by this module as a convenience
 96 _ECD_COMMENT = 8
 97 _ECD_LOCATION = 9
 98 
 99 # The "central directory" structure, magic number, size, and indices
 100 # of entries in the structure (section V.F in the format document)
 101 structCentralDir = "<4s4B4HL2L5H2L"
 102 stringCentralDir = b"PK\001\002"
 103 sizeCentralDir = struct.calcsize(structCentralDir)  104 
 105 # indexes of entries in the central directory structure
 106 _CD_SIGNATURE = 0  107 _CD_CREATE_VERSION = 1
 108 _CD_CREATE_SYSTEM = 2
 109 _CD_EXTRACT_VERSION = 3
 110 _CD_EXTRACT_SYSTEM = 4
 111 _CD_FLAG_BITS = 5
 112 _CD_COMPRESS_TYPE = 6
 113 _CD_TIME = 7
 114 _CD_DATE = 8
 115 _CD_CRC = 9
 116 _CD_COMPRESSED_SIZE = 10
 117 _CD_UNCOMPRESSED_SIZE = 11
 118 _CD_FILENAME_LENGTH = 12
 119 _CD_EXTRA_FIELD_LENGTH = 13
 120 _CD_COMMENT_LENGTH = 14
 121 _CD_DISK_NUMBER_START = 15
 122 _CD_INTERNAL_FILE_ATTRIBUTES = 16
 123 _CD_EXTERNAL_FILE_ATTRIBUTES = 17
 124 _CD_LOCAL_HEADER_OFFSET = 18
 125 
 126 # The "local file header" structure, magic number, size, and indices
 127 # (section V.A in the format document)
 128 structFileHeader = "<4s2B4HL2L2H"
 129 stringFileHeader = b"PK\003\004"
 130 sizeFileHeader = struct.calcsize(structFileHeader)  131 
 132 _FH_SIGNATURE = 0  133 _FH_EXTRACT_VERSION = 1
 134 _FH_EXTRACT_SYSTEM = 2
 135 _FH_GENERAL_PURPOSE_FLAG_BITS = 3
 136 _FH_COMPRESSION_METHOD = 4
 137 _FH_LAST_MOD_TIME = 5
 138 _FH_LAST_MOD_DATE = 6
 139 _FH_CRC = 7
 140 _FH_COMPRESSED_SIZE = 8
 141 _FH_UNCOMPRESSED_SIZE = 9
 142 _FH_FILENAME_LENGTH = 10
 143 _FH_EXTRA_FIELD_LENGTH = 11
 144 
 145 # The "Zip64 end of central directory locator" structure, magic number, and size
 146 structEndArchive64Locator = "<4sLQL"
 147 stringEndArchive64Locator = b"PK\x06\x07"
 148 sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator)  149 
 150 # The "Zip64 end of central directory" record, magic number, size, and indices
 151 # (section V.G in the format document)
 152 structEndArchive64 = "<4sQ2H2L4Q"
 153 stringEndArchive64 = b"PK\x06\x06"
 154 sizeEndCentDir64 = struct.calcsize(structEndArchive64)  155 
 156 _CD64_SIGNATURE = 0  157 _CD64_DIRECTORY_RECSIZE = 1
 158 _CD64_CREATE_VERSION = 2
 159 _CD64_EXTRACT_VERSION = 3
 160 _CD64_DISK_NUMBER = 4
 161 _CD64_DISK_NUMBER_START = 5
 162 _CD64_NUMBER_ENTRIES_THIS_DISK = 6
 163 _CD64_NUMBER_ENTRIES_TOTAL = 7
 164 _CD64_DIRECTORY_SIZE = 8
 165 _CD64_OFFSET_START_CENTDIR = 9
 166 
 167 def _check_zipfile(fp):  168     try:  169         if _EndRecData(fp):  170             return True         # file has correct magic number
 171     except OSError:  172         pass
 173     return False  174 
 175 def is_zipfile(filename):  176     """Quickly see if a file is a ZIP file by checking the magic number.  177 
 178  The filename argument may be a file or file-like object too.  179     """
 180     result = False  181     try:  182         if hasattr(filename, "read"):  183             result = _check_zipfile(fp=filename)  184         else:  185             with open(filename, "rb") as fp:  186                 result = _check_zipfile(fp)  187     except OSError:  188         pass
 189     return result  190 
 191 def _EndRecData64(fpin, offset, endrec):  192     """
 193  Read the ZIP64 end-of-archive records and use that to update endrec  194     """
 195     try:  196         fpin.seek(offset - sizeEndCentDir64Locator, 2)  197     except OSError:  198         # If the seek fails, the file is not large enough to contain a ZIP64
 199         # end-of-archive record, so just return the end record we were given.
 200         return endrec  201 
 202     data = fpin.read(sizeEndCentDir64Locator)  203     if len(data) != sizeEndCentDir64Locator:  204         return endrec  205     sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)  206     if sig != stringEndArchive64Locator:  207         return endrec  208 
 209     if diskno != 0 or disks != 1:  210         raise BadZipFile("zipfiles that span multiple disks are not supported")  211 
 212     # Assume no 'zip64 extensible data'
 213     fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2)  214     data = fpin.read(sizeEndCentDir64)  215     if len(data) != sizeEndCentDir64:  216         return endrec  217  sig, sz, create_version, read_version, disk_num, disk_dir, \  218         dircount, dircount2, dirsize, diroffset = \  219  struct.unpack(structEndArchive64, data)  220     if sig != stringEndArchive64:  221         return endrec  222 
 223     # Update the original endrec using data from the ZIP64 record
 224     endrec[_ECD_SIGNATURE] = sig  225     endrec[_ECD_DISK_NUMBER] = disk_num  226     endrec[_ECD_DISK_START] = disk_dir  227     endrec[_ECD_ENTRIES_THIS_DISK] = dircount  228     endrec[_ECD_ENTRIES_TOTAL] = dircount2  229     endrec[_ECD_SIZE] = dirsize  230     endrec[_ECD_OFFSET] = diroffset  231     return endrec  232 
 233 
 234 def _EndRecData(fpin):  235     """Return data from the "End of Central Directory" record, or None.  236 
 237  The data is a list of the nine items in the ZIP "End of central dir"  238  record followed by a tenth item, the file seek offset of this record."""
 239 
 240     # Determine file size
 241     fpin.seek(0, 2)  242     filesize = fpin.tell()  243 
 244     # Check to see if this is ZIP file with no archive comment (the
 245     # "end of central directory" structure should be the last item in the
 246     # file if this is the case).
 247     try:  248         fpin.seek(-sizeEndCentDir, 2)  249     except OSError:  250         return None  251     data = fpin.read()  252     if (len(data) == sizeEndCentDir and
 253         data[0:4] == stringEndArchive and
 254         data[-2:] == b"\000\000"):  255         # the signature is correct and there's no comment, unpack structure
 256         endrec = struct.unpack(structEndArchive, data)  257         endrec=list(endrec)  258 
 259         # Append a blank comment and record start offset
 260         endrec.append(b"")  261         endrec.append(filesize - sizeEndCentDir)  262 
 263         # Try to read the "Zip64 end of central directory" structure
 264         return _EndRecData64(fpin, -sizeEndCentDir, endrec)  265 
 266     # Either this is not a ZIP file, or it is a ZIP file with an archive
 267     # comment. Search the end of the file for the "end of central directory"
 268     # record signature. The comment is the last item in the ZIP file and may be
 269     # up to 64K long. It is assumed that the "end of central directory" magic
 270     # number does not appear in the comment.
 271     maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0)  272  fpin.seek(maxCommentStart, 0)  273     data = fpin.read()  274     start = data.rfind(stringEndArchive)  275     if start >= 0:  276         # found the magic number; attempt to unpack and interpret
 277         recData = data[start:start+sizeEndCentDir]  278         if len(recData) != sizeEndCentDir:  279             # Zip file is corrupted.
 280             return None  281         endrec = list(struct.unpack(structEndArchive, recData))  282         commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file
 283         comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize]  284  endrec.append(comment)  285         endrec.append(maxCommentStart + start)  286 
 287         # Try to read the "Zip64 end of central directory" structure
 288         return _EndRecData64(fpin, maxCommentStart + start - filesize,  289  endrec)  290 
 291     # Unable to find a valid end of central directory structure
 292     return None  293 
 294 
 295 class ZipInfo (object):  296     """Class with attributes describing each file in the ZIP archive."""
 297 
 298     __slots__ = (  299         'orig_filename',  300         'filename',  301         'date_time',  302         'compress_type',  303         'comment',  304         'extra',  305         'create_system',  306         'create_version',  307         'extract_version',  308         'reserved',  309         'flag_bits',  310         'volume',  311         'internal_attr',  312         'external_attr',  313         'header_offset',  314         'CRC',  315         'compress_size',  316         'file_size',  317         '_raw_time',  318  )  319 
 320     def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):  321         self.orig_filename = filename   # Original file name in archive
 322 
 323         # Terminate the file name at the first null byte. Null bytes in file
 324         # names are used as tricks by viruses in archives.
 325         null_byte = filename.find(chr(0))  326         if null_byte >= 0:  327             filename = filename[0:null_byte]  328         # This is used to ensure paths in generated ZIP files always use
 329         # forward slashes as the directory separator, as required by the
 330         # ZIP format specification.
 331         if os.sep != "/" and os.sep in filename:  332             filename = filename.replace(os.sep, "/")  333 
 334         self.filename = filename        # Normalized file name
 335         self.date_time = date_time      # year, month, day, hour, min, sec
 336 
 337         if date_time[0] < 1980:  338             raise ValueError('ZIP does not support timestamps before 1980')  339 
 340         # Standard values:
 341         self.compress_type = ZIP_STORED # Type of compression for the file
 342         self.comment = b""              # Comment for each file
 343         self.extra = b""                # ZIP extra data
 344         if sys.platform == 'win32':  345             self.create_system = 0          # System which created ZIP archive
 346         else:  347             # Assume everything else is unix-y
 348             self.create_system = 3          # System which created ZIP archive
 349         self.create_version = DEFAULT_VERSION  # Version which created ZIP archive
 350         self.extract_version = DEFAULT_VERSION # Version needed to extract archive
 351         self.reserved = 0               # Must be zero
 352         self.flag_bits = 0              # ZIP flag bits
 353         self.volume = 0                 # Volume number of file header
 354         self.internal_attr = 0          # Internal attributes
 355         self.external_attr = 0          # External file attributes
 356         # Other attributes are set by class ZipFile:
 357         # header_offset Byte offset to the file header
 358         # CRC CRC-32 of the uncompressed file
 359         # compress_size Size of the compressed file
 360         # file_size Size of the uncompressed file
 361 
 362     def __repr__(self):  363         result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)]  364         if self.compress_type != ZIP_STORED:  365             result.append(' compress_type=%s' %
 366  compressor_names.get(self.compress_type,  367  self.compress_type))  368         hi = self.external_attr >> 16
 369         lo = self.external_attr & 0xFFFF
 370         if hi:  371             result.append(' filemode=%r' % stat.filemode(hi))  372         if lo:  373             result.append(' external_attr=%#x' % lo)  374         isdir = self.filename[-1:] == '/'
 375         if not isdir or self.file_size:  376             result.append(' file_size=%r' % self.file_size)  377         if ((not isdir or self.compress_size) and
 378             (self.compress_type != ZIP_STORED or
 379              self.file_size != self.compress_size)):  380             result.append(' compress_size=%r' % self.compress_size)  381         result.append('>')  382         return ''.join(result)  383 
 384     def FileHeader(self, zip64=None):  385         """Return the per-file header as a string."""
 386         dt = self.date_time  387         dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]  388         dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)  389         if self.flag_bits & 0x08:  390             # Set these to zero because we write them after the file data
 391             CRC = compress_size = file_size = 0  392         else:  393             CRC = self.CRC  394             compress_size = self.compress_size  395             file_size = self.file_size  396 
 397         extra = self.extra  398 
 399         min_version = 0  400         if zip64 is None:  401             zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT  402         if zip64:  403             fmt = '<HHQQ'
 404             extra = extra + struct.pack(fmt,  405                                         1, struct.calcsize(fmt)-4, file_size, compress_size)  406         if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:  407             if not zip64:  408                 raise LargeZipFile("Filesize would require ZIP64 extensions")  409             # File is larger than what fits into a 4 byte integer,
 410             # fall back to the ZIP64 extension
 411             file_size = 0xffffffff
 412             compress_size = 0xffffffff
 413             min_version = ZIP64_VERSION  414 
 415         if self.compress_type == ZIP_BZIP2:  416             min_version = max(BZIP2_VERSION, min_version)  417         elif self.compress_type == ZIP_LZMA:  418             min_version = max(LZMA_VERSION, min_version)  419 
 420         self.extract_version = max(min_version, self.extract_version)  421         self.create_version = max(min_version, self.create_version)  422         filename, flag_bits = self._encodeFilenameFlags()  423         header = struct.pack(structFileHeader, stringFileHeader,  424  self.extract_version, self.reserved, flag_bits,  425  self.compress_type, dostime, dosdate, CRC,  426  compress_size, file_size,  427  len(filename), len(extra))  428         return header + filename + extra  429 
 430     def _encodeFilenameFlags(self):  431         try:  432             return self.filename.encode('ascii'), self.flag_bits  433         except UnicodeEncodeError:  434             return self.filename.encode('utf-8'), self.flag_bits | 0x800
 435 
 436     def _decodeExtra(self):  437         # Try to decode the extra field.
 438         extra = self.extra  439         unpack = struct.unpack  440         while len(extra) >= 4:  441             tp, ln = unpack('<HH', extra[:4])  442             if tp == 1:  443                 if ln >= 24:  444                     counts = unpack('<QQQ', extra[4:28])  445                 elif ln == 16:  446                     counts = unpack('<QQ', extra[4:20])  447                 elif ln == 8:  448                     counts = unpack('<Q', extra[4:12])  449                 elif ln == 0:  450                     counts = ()  451                 else:  452                     raise RuntimeError("Corrupt extra field %s"%(ln,))  453 
 454                 idx = 0  455 
 456                 # ZIP64 extension (large files and/or large archives)
 457                 if self.file_size in (0xffffffffffffffff, 0xffffffff):  458                     self.file_size = counts[idx]  459                     idx += 1
 460 
 461                 if self.compress_size == 0xFFFFFFFF:  462                     self.compress_size = counts[idx]  463                     idx += 1
 464 
 465                 if self.header_offset == 0xffffffff:  466                     old = self.header_offset  467                     self.header_offset = counts[idx]  468                     idx+=1
 469 
 470             extra = extra[ln+4:]  471 
 472 
 473 class _ZipDecrypter:  474     """Class to handle decryption of files stored within a ZIP archive.  475 
 476  ZIP supports a password-based form of encryption. Even though known  477  plaintext attacks have been found against it, it is still useful  478  to be able to get data out of such a file.  479 
 480  Usage:  481  zd = _ZipDecrypter(mypwd)  482  plain_char = zd(cypher_char)  483  plain_text = map(zd, cypher_text)  484     """
 485 
 486     def _GenerateCRCTable():  487         """Generate a CRC-32 table.  488 
 489  ZIP encryption uses the CRC32 one-byte primitive for scrambling some  490  internal keys. We noticed that a direct implementation is faster than  491  relying on binascii.crc32().  492         """
 493         poly = 0xedb88320
 494         table = [0] * 256
 495         for i in range(256):  496             crc = i  497             for j in range(8):  498                 if crc & 1:  499                     crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly  500                 else:  501                     crc = ((crc >> 1) & 0x7FFFFFFF)  502             table[i] = crc  503         return table  504     crctable = None  505 
 506     def _crc32(self, ch, crc):  507         """Compute the CRC32 primitive on one byte."""
 508         return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ch) & 0xff]  509 
 510     def __init__(self, pwd):  511         if _ZipDecrypter.crctable is None:  512             _ZipDecrypter.crctable = _ZipDecrypter._GenerateCRCTable()  513         self.key0 = 305419896
 514         self.key1 = 591751049
 515         self.key2 = 878082192
 516         for p in pwd:  517  self._UpdateKeys(p)  518 
 519     def _UpdateKeys(self, c):  520         self.key0 = self._crc32(c, self.key0)  521         self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295
 522         self.key1 = (self.key1 * 134775813 + 1) & 4294967295
 523         self.key2 = self._crc32((self.key1 >> 24) & 255, self.key2)  524 
 525     def __call__(self, c):  526         """Decrypt a single character."""
 527         assert isinstance(c, int)  528         k = self.key2 | 2
 529         c = c ^ (((k * (k^1)) >> 8) & 255)  530  self._UpdateKeys(c)  531         return c  532 
 533 
 534 class LZMACompressor:  535 
 536     def __init__(self):  537         self._comp = None  538 
 539     def _init(self):  540         props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1})  541         self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[  542  lzma._decode_filter_properties(lzma.FILTER_LZMA1, props)  543  ])  544         return struct.pack('<BBH', 9, 4, len(props)) + props  545 
 546     def compress(self, data):  547         if self._comp is None:  548             return self._init() + self._comp.compress(data)  549         return self._comp.compress(data)  550 
 551     def flush(self):  552         if self._comp is None:  553             return self._init() + self._comp.flush()  554         return self._comp.flush()  555 
 556 
class LZMADecompressor:
    """Streaming decompressor for ZIP's LZMA (method 14) member data.

    Member data begins with a 4-byte header (version major/minor plus a
    16-bit property length) followed by the raw filter properties, then
    the raw LZMA1 stream.  Input is buffered until the whole header has
    arrived, after which the real decompressor takes over.
    """

    def __init__(self):
        # Real decompressor; created only once the property header is
        # fully buffered in _unconsumed.
        self._decomp = None
        self._unconsumed = b''
        self.eof = False

    def decompress(self, data):
        """Decompress *data*, returning b'' until the header is complete."""
        if self._decomp is None:
            # Still waiting for the 4-byte header plus psize property bytes.
            self._unconsumed += data
            if len(self._unconsumed) <= 4:
                return b''
            psize, = struct.unpack('<H', self._unconsumed[2:4])
            if len(self._unconsumed) <= 4 + psize:
                return b''

            self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[
                lzma._decode_filter_properties(lzma.FILTER_LZMA1,
                                               self._unconsumed[4:4 + psize])
            ])
            # Everything past the properties is compressed payload; the
            # staging buffer is no longer needed after this point.
            data = self._unconsumed[4 + psize:]
            del self._unconsumed

        result = self._decomp.decompress(data)
        self.eof = self._decomp.eof
        return result
 584 
# Human-readable names for the compression-method IDs defined by the ZIP
# specification; used for error messages and ZipExtFile.__repr__.
compressor_names = {
    0: 'store',
    1: 'shrink',
    2: 'reduce',
    3: 'reduce',
    4: 'reduce',
    5: 'reduce',
    6: 'implode',
    7: 'tokenize',
    8: 'deflate',
    9: 'deflate64',
    10: 'implode',
    12: 'bzip2',
    14: 'lzma',
    18: 'terse',
    19: 'lz77',
    97: 'wavpack',
    98: 'ppmd',
}
 605 def _check_compression(compression):  606     if compression == ZIP_STORED:  607         pass
 608     elif compression == ZIP_DEFLATED:  609         if not zlib:  610             raise RuntimeError(  611                 "Compression requires the (missing) zlib module")  612     elif compression == ZIP_BZIP2:  613         if not bz2:  614             raise RuntimeError(  615                 "Compression requires the (missing) bz2 module")  616     elif compression == ZIP_LZMA:  617         if not lzma:  618             raise RuntimeError(  619                 "Compression requires the (missing) lzma module")  620     else:  621         raise RuntimeError("That compression method is not supported")  622 
 623 
 624 def _get_compressor(compress_type):  625     if compress_type == ZIP_DEFLATED:  626         return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,  627                                 zlib.DEFLATED, -15)  628     elif compress_type == ZIP_BZIP2:  629         return bz2.BZ2Compressor()  630     elif compress_type == ZIP_LZMA:  631         return LZMACompressor()  632     else:  633         return None  634 
 635 
 636 def _get_decompressor(compress_type):  637     if compress_type == ZIP_STORED:  638         return None  639     elif compress_type == ZIP_DEFLATED:  640         return zlib.decompressobj(-15)  641     elif compress_type == ZIP_BZIP2:  642         return bz2.BZ2Decompressor()  643     elif compress_type == ZIP_LZMA:  644         return LZMADecompressor()  645     else:  646         descr = compressor_names.get(compress_type)  647         if descr:  648             raise NotImplementedError("compression type %d (%s)" % (compress_type, descr))  649         else:  650             raise NotImplementedError("compression type %d" % (compress_type,))  651 
 652 
 653 class _SharedFile:  654     def __init__(self, file, pos, close, lock):  655         self._file = file  656         self._pos = pos  657         self._close = close  658         self._lock = lock  659 
 660     def read(self, n=-1):  661  with self._lock:  662  self._file.seek(self._pos)  663             data = self._file.read(n)  664             self._pos = self._file.tell()  665             return data  666 
 667     def close(self):  668         if self._file is not None:  669             fileobj = self._file  670             self._file = None  671  self._close(fileobj)  672 
 673 # Provide the tell method for unseekable stream
 674 class _Tellable:  675     def __init__(self, fp):  676         self.fp = fp  677         self.offset = 0  678 
 679     def write(self, data):  680         n = self.fp.write(data)  681         self.offset += n  682         return n  683 
 684     def tell(self):  685         return self.offset  686 
 687     def flush(self):  688  self.fp.flush()  689 
 690     def close(self):  691  self.fp.close()  692 
 693 
class ZipExtFile(io.BufferedIOBase):
    """File-like object for reading an archive member.

    Is returned by ZipFile.open().  Wraps the (possibly shared) archive
    file object, transparently decrypting and decompressing, and verifies
    the member's CRC-32 once the last byte has been read.
    """

    # Max size supported by decompressor.
    # NOTE(review): '1 << 31 - 1' parses as 1 << 30 because '-' binds
    # tighter than '<<'; presumably 2**31 - 1 was intended — confirm.
    MAX_N = 1 << 31 - 1

    # Read from compressed files in 4k blocks.
    MIN_READ_SIZE = 4096

    # Search for universal newlines or line chunks.
    PATTERN = re.compile(br'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)')

    def __init__(self, fileobj, mode, zipinfo, decrypter=None,
                 close_fileobj=False):
        # fileobj: positioned at the start of this member's compressed data.
        self._fileobj = fileobj
        self._decrypter = decrypter
        self._close_fileobj = close_fileobj

        self._compress_type = zipinfo.compress_type
        self._compress_left = zipinfo.compress_size  # compressed bytes left
        self._left = zipinfo.file_size               # uncompressed bytes left

        self._decompressor = _get_decompressor(self._compress_type)

        self._eof = False
        self._readbuffer = b''   # decompressed data not yet handed out
        self._offset = 0         # read position inside _readbuffer

        self._universal = 'U' in mode
        self.newlines = None

        # Adjust read size for encrypted files since the first 12 bytes
        # are for the encryption/password information.
        if self._decrypter is not None:
            self._compress_left -= 12

        self.mode = mode
        self.name = zipinfo.filename

        if hasattr(zipinfo, 'CRC'):
            self._expected_crc = zipinfo.CRC
            self._running_crc = crc32(b'')
        else:
            # No CRC available (e.g. info built by hand) — skip checking.
            self._expected_crc = None

    def __repr__(self):
        """Debug representation: name, mode and compression while open."""
        result = ['<%s.%s' % (self.__class__.__module__,
                              self.__class__.__qualname__)]
        if not self.closed:
            result.append(' name=%r mode=%r' % (self.name, self.mode))
            if self._compress_type != ZIP_STORED:
                result.append(' compress_type=%s' %
                              compressor_names.get(self._compress_type,
                                                   self._compress_type))
        else:
            result.append(' [closed]')
        result.append('>')
        return ''.join(result)

    def readline(self, limit=-1):
        """Read and return a line from the stream.

        If limit is specified, at most limit bytes will be read.
        """

        if not self._universal and limit < 0:
            # Shortcut common case - newline found in buffer.
            i = self._readbuffer.find(b'\n', self._offset) + 1
            if i > 0:
                line = self._readbuffer[self._offset: i]
                self._offset = i
                return line

        if not self._universal:
            # Fall back to the generic buffered implementation.
            return io.BufferedIOBase.readline(self, limit)

        line = b''
        while limit < 0 or len(line) < limit:
            readahead = self.peek(2)
            if readahead == b'':
                return line

            #
            # Search for universal newlines or line chunks.
            #
            # The pattern returns either a line chunk or a newline, but not
            # both. Combined with peek(2), we are assured that the sequence
            # '\r\n' is always retrieved completely and never split into
            # separate newlines - '\r', '\n' due to coincidental readaheads.
            #
            match = self.PATTERN.search(readahead)
            newline = match.group('newline')
            if newline is not None:
                if self.newlines is None:
                    self.newlines = []
                if newline not in self.newlines:
                    self.newlines.append(newline)
                self._offset += len(newline)
                return line + b'\n'

            chunk = match.group('chunk')
            if limit >= 0:
                # Honour the byte limit: trim the chunk to what remains.
                chunk = chunk[: limit - len(line)]

            self._offset += len(chunk)
            line += chunk

        return line

    def peek(self, n=1):
        """Returns buffered bytes without advancing the position."""
        if n > len(self._readbuffer) - self._offset:
            # Not enough buffered: read more, then push it back onto the
            # front of the buffer so the position is unchanged.
            chunk = self.read(n)
            if len(chunk) > self._offset:
                self._readbuffer = chunk + self._readbuffer[self._offset:]
                self._offset = 0
            else:
                self._offset -= len(chunk)

        # Return up to 512 bytes to reduce allocation overhead for tight loops.
        return self._readbuffer[self._offset: self._offset + 512]

    def readable(self):
        return True

    def read(self, n=-1):
        """Read and return up to n bytes.

        If the argument is omitted, None, or negative, data is read and
        returned until EOF is reached.
        """
        if n is None or n < 0:
            # Drain the buffer, then pull everything that remains.
            buf = self._readbuffer[self._offset:]
            self._readbuffer = b''
            self._offset = 0
            while not self._eof:
                buf += self._read1(self.MAX_N)
            return buf

        end = n + self._offset
        if end < len(self._readbuffer):
            # Fully satisfiable from the buffer.
            buf = self._readbuffer[self._offset:end]
            self._offset = end
            return buf

        # Take what the buffer has, then read the remaining n bytes.
        n = end - len(self._readbuffer)
        buf = self._readbuffer[self._offset:]
        self._readbuffer = b''
        self._offset = 0
        while n > 0 and not self._eof:
            data = self._read1(n)
            if n < len(data):
                # Got more than asked for: keep the surplus buffered.
                self._readbuffer = data
                self._offset = n
                buf += data[:n]
                break
            buf += data
            n -= len(data)
        return buf

    def _update_crc(self, newdata):
        # Update the CRC using the given data.
        if self._expected_crc is None:
            # No need to compute the CRC if we don't have a reference value
            return
        self._running_crc = crc32(newdata, self._running_crc)
        # Check the CRC if we're at the end of the file
        if self._eof and self._running_crc != self._expected_crc:
            raise BadZipFile("Bad CRC-32 for file %r" % self.name)

    def read1(self, n):
        """Read up to n bytes with at most one read() system call."""

        if n is None or n < 0:
            buf = self._readbuffer[self._offset:]
            self._readbuffer = b''
            self._offset = 0
            while not self._eof:
                data = self._read1(self.MAX_N)
                if data:
                    buf += data
                    break
            return buf

        end = n + self._offset
        if end < len(self._readbuffer):
            buf = self._readbuffer[self._offset:end]
            self._offset = end
            return buf

        n = end - len(self._readbuffer)
        buf = self._readbuffer[self._offset:]
        self._readbuffer = b''
        self._offset = 0
        if n > 0:
            while not self._eof:
                data = self._read1(n)
                if n < len(data):
                    # Surplus beyond n stays buffered for the next call.
                    self._readbuffer = data
                    self._offset = n
                    buf += data[:n]
                    break
                if data:
                    buf += data
                    break
        return buf

    def _read1(self, n):
        # Read up to n compressed bytes with at most one read() system call,
        # decrypt and decompress them.
        if self._eof or n <= 0:
            return b''

        # Read from file.
        if self._compress_type == ZIP_DEFLATED:
            ## Handle unconsumed data.
            data = self._decompressor.unconsumed_tail
            if n > len(data):
                data += self._read2(n - len(data))
        else:
            data = self._read2(n)

        if self._compress_type == ZIP_STORED:
            self._eof = self._compress_left <= 0
        elif self._compress_type == ZIP_DEFLATED:
            n = max(n, self.MIN_READ_SIZE)
            data = self._decompressor.decompress(data, n)
            self._eof = (self._decompressor.eof or
                         self._compress_left <= 0 and
                         not self._decompressor.unconsumed_tail)
            if self._eof:
                data += self._decompressor.flush()
        else:
            data = self._decompressor.decompress(data)
            self._eof = self._decompressor.eof or self._compress_left <= 0

        # Never hand out more than the member's declared file size.
        data = data[:self._left]
        self._left -= len(data)
        if self._left <= 0:
            self._eof = True
        self._update_crc(data)
        return data

    def _read2(self, n):
        # Read up to n raw (still compressed/encrypted) bytes from the
        # archive, decrypting them if needed.
        if self._compress_left <= 0:
            return b''

        n = max(n, self.MIN_READ_SIZE)
        n = min(n, self._compress_left)

        data = self._fileobj.read(n)
        self._compress_left -= len(data)
        if not data:
            # Compressed data ended before compress_size bytes arrived.
            raise EOFError

        if self._decrypter is not None:
            data = bytes(map(self._decrypter, data))
        return data

    def close(self):
        # Close the underlying file only when we own it (ZipFile.open
        # handed us a shared view with close_fileobj=True).
        try:
            if self._close_fileobj:
                self._fileobj.close()
        finally:
            super().close()
 960 
 961 class ZipFile:  962     """ Class with methods to open, read, write, close, list zip files.  963 
 964  z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True)  965 
 966  file: Either the path to the file, or a file-like object.  967  If it is a path, the file will be opened and closed by ZipFile.  968  mode: The mode can be either read 'r', write 'w', exclusive create 'x',  969  or append 'a'.  970  compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib),  971  ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma).  972  allowZip64: if True ZipFile will create files with ZIP64 extensions when  973  needed, otherwise it will raise an exception when this would  974  be necessary.  975 
 976     """
 977 
 978     fp = None                   # Set here since __del__ checks it
 979     _windows_illegal_name_trans_table = None  980 
    def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True):
        """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
        or append 'a'.

        file may be a path string or an already-open file-like object;
        compression must be one of the ZIP_* method constants, and
        allowZip64 controls whether ZIP64 extensions may be written.
        """
        if mode not in ('r', 'w', 'x', 'a'):
            raise RuntimeError("ZipFile requires mode 'r', 'w', 'x', or 'a'")

        _check_compression(compression)

        self._allowZip64 = allowZip64
        self._didModify = False
        self.debug = 0  # Level of printing: 0 through 3
        self.NameToInfo = {}    # Find file info given name
        self.filelist = []      # List of ZipInfo instances for archive
        self.compression = compression  # Method of compression
        self.mode = mode
        self.pwd = None
        self._comment = b''

        # Check if we were passed a file-like object
        if isinstance(file, str):
            # No, it's a filename
            self._filePassed = 0
            self.filename = file
            # The dict doubles as a fallback chain: if opening with the
            # preferred OS mode fails, retry with modeDict[filemode]
            # (e.g. append on a missing file falls back to 'w+b').
            modeDict = {'r' : 'rb', 'w': 'w+b', 'x': 'x+b', 'a' : 'r+b',
                        'r+b': 'w+b', 'w+b': 'wb', 'x+b': 'xb'}
            filemode = modeDict[mode]
            while True:
                try:
                    self.fp = io.open(file, filemode)
                except OSError:
                    if filemode in modeDict:
                        filemode = modeDict[filemode]
                        continue
                    raise
                break
        else:
            self._filePassed = 1
            self.fp = file
            self.filename = getattr(file, 'name', None)
        self._fileRefCnt = 1
        self._lock = threading.RLock()
        self._seekable = True

        try:
            if mode == 'r':
                self._RealGetContents()
            elif mode in ('w', 'x'):
                # set the modified flag so central directory gets written
                # even if no files are added to the archive
                self._didModify = True
                try:
                    self.start_dir = self.fp.tell()
                except (AttributeError, OSError):
                    # Stream cannot even tell(): wrap it so we can track
                    # the output position ourselves.
                    self.fp = _Tellable(self.fp)
                    self.start_dir = 0
                    self._seekable = False
                else:
                    # Some file-like objects can provide tell() but not seek()
                    try:
                        self.fp.seek(self.start_dir)
                    except (AttributeError, OSError):
                        self._seekable = False
            elif mode == 'a':
                try:
                    # See if file is a zip file
                    self._RealGetContents()
                    # seek to start of directory and overwrite
                    self.fp.seek(self.start_dir)
                except BadZipFile:
                    # file is not a zip file, just append
                    self.fp.seek(0, 2)

                    # set the modified flag so central directory gets written
                    # even if no files are added to the archive
                    self._didModify = True
                    self.start_dir = self.fp.tell()
            else:
                raise RuntimeError("Mode must be 'r', 'w', 'x', or 'a'")
        except:
            # Any failure above must not leak the file handle we opened.
            fp = self.fp
            self.fp = None
            self._fpclose(fp)
            raise
1064 
    def __enter__(self):
        # Context-manager entry: the archive object itself is the resource.
        return self
    def __exit__(self, type, value, traceback):
        # Context-manager exit: always close, whether or not an exception
        # is propagating (exceptions are not suppressed).
        self.close()
1071     def __repr__(self): 1072         result = ['<%s.%s' % (self.__class__.__module__, 1073                               self.__class__.__qualname__)] 1074         if self.fp is not None: 1075             if self._filePassed: 1076                 result.append(' file=%r' % self.fp) 1077             elif self.filename is not None: 1078                 result.append(' filename=%r' % self.filename) 1079             result.append(' mode=%r' % self.mode) 1080         else: 1081             result.append(' [closed]') 1082         result.append('>') 1083         return ''.join(result) 1084 
    def _RealGetContents(self):
        """Read in the table of contents for the ZIP file.

        Locates the end-of-central-directory record, then walks the
        central directory populating self.filelist and self.NameToInfo
        with one ZipInfo per member.  Raises BadZipFile if the file is
        not a ZIP archive or the directory is malformed.
        """
        fp = self.fp
        try:
            endrec = _EndRecData(fp)
        except OSError:
            raise BadZipFile("File is not a zip file")
        if not endrec:
            raise BadZipFile("File is not a zip file")
        if self.debug > 1:
            print(endrec)
        size_cd = endrec[_ECD_SIZE]             # bytes in central directory
        offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
        self._comment = endrec[_ECD_COMMENT]    # archive comment

        # "concat" is zero, unless zip was concatenated to another file
        concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
        if endrec[_ECD_SIGNATURE] == stringEndArchive64:
            # If Zip64 extension structures are present, account for them
            concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)

        if self.debug > 2:
            inferred = concat + offset_cd
            print("given, inferred, offset", offset_cd, inferred, concat)
        # self.start_dir:  Position of start of central directory
        self.start_dir = offset_cd + concat
        fp.seek(self.start_dir, 0)
        # Read the whole directory at once and parse it from memory.
        data = fp.read(size_cd)
        fp = io.BytesIO(data)
        total = 0
        while total < size_cd:
            centdir = fp.read(sizeCentralDir)
            if len(centdir) != sizeCentralDir:
                raise BadZipFile("Truncated central directory")
            centdir = struct.unpack(structCentralDir, centdir)
            if centdir[_CD_SIGNATURE] != stringCentralDir:
                raise BadZipFile("Bad magic number for central directory")
            if self.debug > 2:
                print(centdir)
            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
            flags = centdir[5]
            if flags & 0x800:
                # UTF-8 file names extension
                filename = filename.decode('utf-8')
            else:
                # Historical ZIP filename encoding
                filename = filename.decode('cp437')
            # Create ZipInfo instance to store file information
            x = ZipInfo(filename)
            x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
            x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
            x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
            (x.create_version, x.create_system, x.extract_version, x.reserved,
             x.flag_bits, x.compress_type, t, d,
             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
            if x.extract_version > MAX_EXTRACT_VERSION:
                raise NotImplementedError("zip file version %.1f" %
                                          (x.extract_version / 10))
            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
            # Convert date/time code to (year, month, day, hour, min, sec)
            x._raw_time = t
            x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
                            t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )

            # Apply any ZIP64 extra-field values, then correct the header
            # offset for data concatenated before the archive proper.
            x._decodeExtra()
            x.header_offset = x.header_offset + concat
            self.filelist.append(x)
            self.NameToInfo[x.filename] = x

            # update total bytes read from central directory
            total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
                     + centdir[_CD_EXTRA_FIELD_LENGTH]
                     + centdir[_CD_COMMENT_LENGTH])

            if self.debug > 2:
                print("total", total)
1162 
1163     def namelist(self): 1164         """Return a list of file names in the archive."""
1165         return [data.filename for data in self.filelist] 1166 
    def infolist(self):
        """Return the list of ZipInfo instances, one per archive member,
        in central-directory order.  The list is not a copy."""
        return self.filelist
1172     def printdir(self, file=None): 1173         """Print a table of contents for the zip file."""
1174         print("%-46s %19s %12s" % ("File Name", "Modified ", "Size"), 1175               file=file) 1176         for zinfo in self.filelist: 1177             date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] 1178             print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size), 1179                   file=file) 1180 
1181     def testzip(self): 1182         """Read all the files and check the CRC."""
1183         chunk_size = 2 ** 20
1184         for zinfo in self.filelist: 1185             try: 1186                 # Read by chunks, to avoid an OverflowError or a
1187                 # MemoryError with very large embedded files.
1188                 with self.open(zinfo.filename, "r") as f: 1189                     while f.read(chunk_size):     # Check CRC-32
1190                         pass
1191             except BadZipFile: 1192                 return zinfo.filename 1193 
1194     def getinfo(self, name): 1195         """Return the instance of ZipInfo given 'name'."""
1196         info = self.NameToInfo.get(name) 1197         if info is None: 1198             raise KeyError( 1199                 'There is no item named %r in the archive' % name) 1200 
1201         return info 1202 
1203     def setpassword(self, pwd): 1204         """Set default password for encrypted files."""
1205         if pwd and not isinstance(pwd, bytes): 1206             raise TypeError("pwd: expected bytes, got %s" % type(pwd)) 1207         if pwd: 1208             self.pwd = pwd 1209         else: 1210             self.pwd = None 1211 
    @property
    def comment(self):
        """The comment text associated with the ZIP file."""
        return self._comment

    @comment.setter
    def comment(self, comment):
        # Setting the comment marks the archive dirty so the central
        # directory (which carries it) gets rewritten on close.
        if not isinstance(comment, bytes):
            raise TypeError("comment: expected bytes, got %s" % type(comment))
        # check for valid comment length
        if len(comment) > ZIP_MAX_COMMENT:
            import warnings
            warnings.warn('Archive comment is too long; truncating to %d bytes'
                          % ZIP_MAX_COMMENT, stacklevel=2)
            comment = comment[:ZIP_MAX_COMMENT]
        self._comment = comment
        self._didModify = True
1230     def read(self, name, pwd=None): 1231         """Return file bytes (as a string) for name."""
1232         with self.open(name, "r", pwd) as fp: 1233             return fp.read() 1234 
    def open(self, name, mode="r", pwd=None):
        """Return file-like object for 'name'.

        ``name`` may be a member name or a ZipInfo object.  ``pwd`` is the
        decryption password (bytes) for encrypted members.  Raises
        BadZipFile if the local file header disagrees with the central
        directory, and NotImplementedError for unsupported flag bits.
        """
        if mode not in ("r", "U", "rU"):
            raise RuntimeError('open() requires mode "r", "U", or "rU"')
        if 'U' in mode:
            import warnings
            warnings.warn("'U' mode is deprecated",
                          DeprecationWarning, 2)
        if pwd and not isinstance(pwd, bytes):
            raise TypeError("pwd: expected bytes, got %s" % type(pwd))
        if not self.fp:
            raise RuntimeError(
                "Attempt to read ZIP archive that was already closed")

        # Make sure we have an info object
        if isinstance(name, ZipInfo):
            # 'name' is already an info object
            zinfo = name
        else:
            # Get info object for name
            zinfo = self.getinfo(name)

        # The returned ZipExtFile shares self.fp; the refcount keeps the
        # underlying file alive until every reader is closed.
        self._fileRefCnt += 1
        zef_file = _SharedFile(self.fp, zinfo.header_offset, self._fpclose, self._lock)
        try:
            # Skip the file header:
            fheader = zef_file.read(sizeFileHeader)
            if len(fheader) != sizeFileHeader:
                raise BadZipFile("Truncated file header")
            fheader = struct.unpack(structFileHeader, fheader)
            if fheader[_FH_SIGNATURE] != stringFileHeader:
                raise BadZipFile("Bad magic number for file header")

            fname = zef_file.read(fheader[_FH_FILENAME_LENGTH])
            if fheader[_FH_EXTRA_FIELD_LENGTH]:
                zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])

            if zinfo.flag_bits & 0x20:
                # Zip 2.7: compressed patched data
                raise NotImplementedError("compressed patched data (flag bit 5)")

            if zinfo.flag_bits & 0x40:
                # strong encryption
                raise NotImplementedError("strong encryption (flag bit 6)")

            if zinfo.flag_bits & 0x800:
                # UTF-8 filename
                fname_str = fname.decode("utf-8")
            else:
                # ZIP spec default encoding for non-UTF-8 names
                fname_str = fname.decode("cp437")

            # Cross-check the name in the local header against the one
            # from the central directory to catch corrupt archives.
            if fname_str != zinfo.orig_filename:
                raise BadZipFile(
                    'File name in directory %r and header %r differ.'
                    % (zinfo.orig_filename, fname))

            # check for encrypted flag & handle password
            is_encrypted = zinfo.flag_bits & 0x1
            zd = None
            if is_encrypted:
                if not pwd:
                    pwd = self.pwd
                if not pwd:
                    raise RuntimeError("File %s is encrypted, password "
                                       "required for extraction" % name)

                zd = _ZipDecrypter(pwd)
                # The first 12 bytes in the cypher stream is an encryption header
                # used to strengthen the algorithm. The first 11 bytes are
                # completely random, while the 12th contains the MSB of the CRC,
                # or the MSB of the file time depending on the header type
                # and is used to check the correctness of the password.
                header = zef_file.read(12)
                h = list(map(zd, header[0:12]))
                if zinfo.flag_bits & 0x8:
                    # compare against the file type from extended local headers
                    check_byte = (zinfo._raw_time >> 8) & 0xff
                else:
                    # compare against the CRC otherwise
                    check_byte = (zinfo.CRC >> 24) & 0xff
                if h[11] != check_byte:
                    raise RuntimeError("Bad password for file", name)

            return ZipExtFile(zef_file, mode, zinfo, zd, True)
        except:
            # On any failure, drop our reference to the shared file so
            # the refcount stays balanced, then re-raise.
            zef_file.close()
            raise
1323     def extract(self, member, path=None, pwd=None): 1324         """Extract a member from the archive to the current working directory, 1325  using its full name. Its file information is extracted as accurately 1326  as possible. `member' may be a filename or a ZipInfo object. You can 1327  specify a different directory using `path'. 1328         """
1329         if not isinstance(member, ZipInfo): 1330             member = self.getinfo(member) 1331 
1332         if path is None: 1333             path = os.getcwd() 1334 
1335         return self._extract_member(member, path, pwd) 1336 
1337     def extractall(self, path=None, members=None, pwd=None): 1338         """Extract all members from the archive to the current working 1339  directory. `path' specifies a different directory to extract to. 1340  `members' is optional and must be a subset of the list returned 1341  by namelist(). 1342         """
1343         if members is None: 1344             members = self.namelist() 1345 
1346         for zipinfo in members: 1347  self.extract(zipinfo, path, pwd) 1348 
1349  @classmethod 1350     def _sanitize_windows_name(cls, arcname, pathsep): 1351         """Replace bad characters and remove trailing dots from parts."""
1352         table = cls._windows_illegal_name_trans_table 1353         if not table: 1354             illegal = ':<>|"?*'
1355             table = str.maketrans(illegal, '_' * len(illegal)) 1356             cls._windows_illegal_name_trans_table = table 1357         arcname = arcname.translate(table) 1358         # remove trailing dots
1359         arcname = (x.rstrip('.') for x in arcname.split(pathsep)) 1360         # rejoin, removing empty parts.
1361         arcname = pathsep.join(x for x in arcname if x) 1362         return arcname 1363 
    def _extract_member(self, member, targetpath, pwd):
        """Extract the ZipInfo object 'member' to a physical
        file on the path targetpath.

        Returns the final filesystem path written.  The archive name is
        sanitized first so a hostile member name cannot escape
        ``targetpath`` via absolute paths or ".." components.
        """
        # build the destination pathname, replacing
        # forward slashes to platform specific separators.
        arcname = member.filename.replace('/', os.path.sep)

        if os.path.altsep:
            arcname = arcname.replace(os.path.altsep, os.path.sep)
        # interpret absolute pathname as relative, remove drive letter or
        # UNC path, redundant separators, "." and ".." components.
        arcname = os.path.splitdrive(arcname)[1]
        invalid_path_parts = ('', os.path.curdir, os.path.pardir)
        arcname = os.path.sep.join(x for x in arcname.split(os.path.sep)
                                   if x not in invalid_path_parts)
        if os.path.sep == '\\':
            # filter illegal characters on Windows
            arcname = self._sanitize_windows_name(arcname, os.path.sep)

        targetpath = os.path.join(targetpath, arcname)
        targetpath = os.path.normpath(targetpath)

        # Create all upper directories if necessary.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            os.makedirs(upperdirs)

        # A trailing '/' in the member name marks a directory entry:
        # just ensure the directory exists, there is no data to copy.
        if member.filename[-1] == '/':
            if not os.path.isdir(targetpath):
                os.mkdir(targetpath)
            return targetpath

        # Stream the member's bytes to disk without loading it all
        # into memory.
        with self.open(member, pwd=pwd) as source, \
             open(targetpath, "wb") as target:
            shutil.copyfileobj(source, target)

        return targetpath
1403     def _writecheck(self, zinfo): 1404         """Check for errors before writing a file to the archive."""
1405         if zinfo.filename in self.NameToInfo: 1406             import warnings 1407             warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3) 1408         if self.mode not in ('w', 'x', 'a'): 1409             raise RuntimeError("write() requires mode 'w', 'x', or 'a'") 1410         if not self.fp: 1411             raise RuntimeError( 1412                 "Attempt to write ZIP archive that was already closed") 1413  _check_compression(zinfo.compress_type) 1414         if not self._allowZip64: 1415             requires_zip64 = None 1416             if len(self.filelist) >= ZIP_FILECOUNT_LIMIT: 1417                 requires_zip64 = "Files count"
1418             elif zinfo.file_size > ZIP64_LIMIT: 1419                 requires_zip64 = "Filesize"
1420             elif zinfo.header_offset > ZIP64_LIMIT: 1421                 requires_zip64 = "Zipfile size"
1422             if requires_zip64: 1423                 raise LargeZipFile(requires_zip64 +
1424                                    " would require ZIP64 extensions") 1425 
    def write(self, filename, arcname=None, compress_type=None):
        """Put the bytes from filename into the archive under the name
        arcname.

        ``filename`` is a path on disk; ``arcname`` defaults to a
        normalized form of it.  ``compress_type`` overrides the
        archive-level default compression for this member.
        """
        if not self.fp:
            raise RuntimeError(
                "Attempt to write to ZIP archive that was already closed")

        st = os.stat(filename)
        isdir = stat.S_ISDIR(st.st_mode)
        mtime = time.localtime(st.st_mtime)
        date_time = mtime[0:6]
        # Create ZipInfo instance to store file information
        if arcname is None:
            arcname = filename
        # Archive names are always relative: drop drive and leading
        # separators.
        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
        while arcname[0] in (os.sep, os.altsep):
            arcname = arcname[1:]
        if isdir:
            # Directory entries carry a trailing slash in the archive.
            arcname += '/'
        zinfo = ZipInfo(arcname, date_time)
        zinfo.external_attr = (st[0] & 0xFFFF) << 16      # Unix attributes
        if isdir:
            zinfo.compress_type = ZIP_STORED
        elif compress_type is None:
            zinfo.compress_type = self.compression
        else:
            zinfo.compress_type = compress_type

        zinfo.file_size = st.st_size
        zinfo.flag_bits = 0x00
        with self._lock:
            if self._seekable:
                self.fp.seek(self.start_dir)
            zinfo.header_offset = self.fp.tell()    # Start of header bytes
            if zinfo.compress_type == ZIP_LZMA:
                # Compressed data includes an end-of-stream (EOS) marker
                zinfo.flag_bits |= 0x02

            self._writecheck(zinfo)
            self._didModify = True

            if isdir:
                # Directories carry no data; record the entry and return.
                zinfo.file_size = 0
                zinfo.compress_size = 0
                zinfo.CRC = 0
                zinfo.external_attr |= 0x10  # MS-DOS directory flag
                self.filelist.append(zinfo)
                self.NameToInfo[zinfo.filename] = zinfo
                self.fp.write(zinfo.FileHeader(False))
                self.start_dir = self.fp.tell()
                return

            cmpr = _get_compressor(zinfo.compress_type)
            if not self._seekable:
                # Cannot seek back to patch the header, so use a data
                # descriptor after the file data (flag bit 3).
                zinfo.flag_bits |= 0x08
            with open(filename, "rb") as fp:
                # Must overwrite CRC and sizes with correct data later
                zinfo.CRC = CRC = 0
                zinfo.compress_size = compress_size = 0
                # Compressed size can be larger than uncompressed size
                zip64 = self._allowZip64 and \
                    zinfo.file_size * 1.05 > ZIP64_LIMIT
                self.fp.write(zinfo.FileHeader(zip64))
                file_size = 0
                # Stream the file through the compressor in 8 KiB chunks,
                # accumulating the CRC and both sizes as we go.
                while 1:
                    buf = fp.read(1024 * 8)
                    if not buf:
                        break
                    file_size = file_size + len(buf)
                    CRC = crc32(buf, CRC)
                    if cmpr:
                        buf = cmpr.compress(buf)
                        compress_size = compress_size + len(buf)
                    self.fp.write(buf)
            if cmpr:
                buf = cmpr.flush()
                compress_size = compress_size + len(buf)
                self.fp.write(buf)
                zinfo.compress_size = compress_size
            else:
                zinfo.compress_size = file_size
            zinfo.CRC = CRC
            zinfo.file_size = file_size
            if zinfo.flag_bits & 0x08:
                # Write CRC and file sizes after the file data
                fmt = '<LQQ' if zip64 else '<LLL'
                self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size,
                                          zinfo.file_size))
                self.start_dir = self.fp.tell()
            else:
                if not zip64 and self._allowZip64:
                    if file_size > ZIP64_LIMIT:
                        raise RuntimeError('File size has increased during compressing')
                    if compress_size > ZIP64_LIMIT:
                        raise RuntimeError('Compressed size larger than uncompressed size')
                # Seek backwards and write file header (which will now include
                # correct CRC and file sizes)
                self.start_dir = self.fp.tell() # Preserve current position in file
                self.fp.seek(zinfo.header_offset)
                self.fp.write(zinfo.FileHeader(zip64))
                self.fp.seek(self.start_dir)
            self.filelist.append(zinfo)
            self.NameToInfo[zinfo.filename] = zinfo
1530     def writestr(self, zinfo_or_arcname, data, compress_type=None): 1531         """Write a file into the archive. The contents is 'data', which 1532  may be either a 'str' or a 'bytes' instance; if it is a 'str', 1533  it is encoded as UTF-8 first. 1534  'zinfo_or_arcname' is either a ZipInfo instance or 1535  the name of the file in the archive."""
1536         if isinstance(data, str): 1537             data = data.encode("utf-8") 1538         if not isinstance(zinfo_or_arcname, ZipInfo): 1539             zinfo = ZipInfo(filename=zinfo_or_arcname, 1540                             date_time=time.localtime(time.time())[:6]) 1541             zinfo.compress_type = self.compression 1542             if zinfo.filename[-1] == '/': 1543                 zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
1544                 zinfo.external_attr |= 0x10           # MS-DOS directory flag
1545             else: 1546                 zinfo.external_attr = 0o600 << 16     # ?rw-------
1547         else: 1548             zinfo = zinfo_or_arcname 1549 
1550         if not self.fp: 1551             raise RuntimeError( 1552                 "Attempt to write to ZIP archive that was already closed") 1553 
1554         zinfo.file_size = len(data)            # Uncompressed size
1555  with self._lock: 1556             if self._seekable: 1557  self.fp.seek(self.start_dir) 1558             zinfo.header_offset = self.fp.tell()    # Start of header data
1559             if compress_type is not None: 1560                 zinfo.compress_type = compress_type 1561             zinfo.header_offset = self.fp.tell()    # Start of header data
1562             if compress_type is not None: 1563                 zinfo.compress_type = compress_type 1564             if zinfo.compress_type == ZIP_LZMA: 1565                 # Compressed data includes an end-of-stream (EOS) marker
1566                 zinfo.flag_bits |= 0x02
1567 
1568  self._writecheck(zinfo) 1569             self._didModify = True 1570             zinfo.CRC = crc32(data)       # CRC-32 checksum
1571             co = _get_compressor(zinfo.compress_type) 1572             if co: 1573                 data = co.compress(data) + co.flush() 1574                 zinfo.compress_size = len(data)    # Compressed size
1575             else: 1576                 zinfo.compress_size = zinfo.file_size 1577             zip64 = zinfo.file_size > ZIP64_LIMIT or \ 1578                 zinfo.compress_size > ZIP64_LIMIT 1579             if zip64 and not self._allowZip64: 1580                 raise LargeZipFile("Filesize would require ZIP64 extensions") 1581  self.fp.write(zinfo.FileHeader(zip64)) 1582  self.fp.write(data) 1583             if zinfo.flag_bits & 0x08: 1584                 # Write CRC and file sizes after the file data
1585                 fmt = '<LQQ' if zip64 else '<LLL'
1586  self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size, 1587  zinfo.file_size)) 1588  self.fp.flush() 1589             self.start_dir = self.fp.tell() 1590  self.filelist.append(zinfo) 1591             self.NameToInfo[zinfo.filename] = zinfo 1592 
    def __del__(self):
        """Call the "close()" method in case the user forgot."""
        # close() is idempotent (returns immediately when self.fp is
        # None), so this is safe even after an explicit close.
        self.close()
    def close(self):
        """Close the file, and for mode 'w', 'x' and 'a' write the ending
        records."""
        if self.fp is None:
            # Already closed; make repeat calls a no-op.
            return

        try:
            if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records
                with self._lock:
                    if self._seekable:
                        self.fp.seek(self.start_dir)
                    self._write_end_record()
        finally:
            # Release the underlying file even if writing the end record
            # failed; clear self.fp first so __del__ cannot double-close.
            fp = self.fp
            self.fp = None
            self._fpclose(fp)
    def _write_end_record(self):
        """Write the central directory and end-of-archive record(s).

        Emits one central-directory entry per member, then the classic
        end record — preceded by ZIP64 end/locator records when any
        count, size, or offset exceeds the classic format's limits.
        """
        for zinfo in self.filelist:         # write central directory
            dt = zinfo.date_time
            # Pack the timestamp into MS-DOS date/time format (2-second
            # resolution, years since 1980).
            dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
            dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
            extra = []
            if zinfo.file_size > ZIP64_LIMIT \
               or zinfo.compress_size > ZIP64_LIMIT:
                # Sizes overflow 32 bits: store 0xffffffff sentinels and
                # put the real values in the ZIP64 extra field.
                extra.append(zinfo.file_size)
                extra.append(zinfo.compress_size)
                file_size = 0xffffffff
                compress_size = 0xffffffff
            else:
                file_size = zinfo.file_size
                compress_size = zinfo.compress_size

            if zinfo.header_offset > ZIP64_LIMIT:
                extra.append(zinfo.header_offset)
                header_offset = 0xffffffff
            else:
                header_offset = zinfo.header_offset

            extra_data = zinfo.extra
            min_version = 0
            if extra:
                # Append a ZIP64 field to the extra's
                extra_data = struct.pack(
                    '<HH' + 'Q'*len(extra),
                    1, 8*len(extra), *extra) + extra_data

                min_version = ZIP64_VERSION

            if zinfo.compress_type == ZIP_BZIP2:
                min_version = max(BZIP2_VERSION, min_version)
            elif zinfo.compress_type == ZIP_LZMA:
                min_version = max(LZMA_VERSION, min_version)

            extract_version = max(min_version, zinfo.extract_version)
            create_version = max(min_version, zinfo.create_version)
            try:
                filename, flag_bits = zinfo._encodeFilenameFlags()
                centdir = struct.pack(structCentralDir,
                                      stringCentralDir, create_version,
                                      zinfo.create_system, extract_version, zinfo.reserved,
                                      flag_bits, zinfo.compress_type, dostime, dosdate,
                                      zinfo.CRC, compress_size, file_size,
                                      len(filename), len(extra_data), len(zinfo.comment),
                                      0, zinfo.internal_attr, zinfo.external_attr,
                                      header_offset)
            except DeprecationWarning:
                # Dump the offending values for diagnosis before
                # propagating the warning-as-error.
                print((structCentralDir, stringCentralDir, create_version,
                       zinfo.create_system, extract_version, zinfo.reserved,
                       zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
                       zinfo.CRC, compress_size, file_size,
                       len(zinfo.filename), len(extra_data), len(zinfo.comment),
                       0, zinfo.internal_attr, zinfo.external_attr,
                       header_offset), file=sys.stderr)
                raise
            self.fp.write(centdir)
            self.fp.write(filename)
            self.fp.write(extra_data)
            self.fp.write(zinfo.comment)

        pos2 = self.fp.tell()
        # Write end-of-zip-archive record
        centDirCount = len(self.filelist)
        centDirSize = pos2 - self.start_dir
        centDirOffset = self.start_dir
        requires_zip64 = None
        if centDirCount > ZIP_FILECOUNT_LIMIT:
            requires_zip64 = "Files count"
        elif centDirOffset > ZIP64_LIMIT:
            requires_zip64 = "Central directory offset"
        elif centDirSize > ZIP64_LIMIT:
            requires_zip64 = "Central directory size"
        if requires_zip64:
            # Need to write the ZIP64 end-of-archive records
            if not self._allowZip64:
                raise LargeZipFile(requires_zip64 +
                                   " would require ZIP64 extensions")
            zip64endrec = struct.pack(
                structEndArchive64, stringEndArchive64,
                44, 45, 45, 0, 0, centDirCount, centDirCount,
                centDirSize, centDirOffset)
            self.fp.write(zip64endrec)

            zip64locrec = struct.pack(
                structEndArchive64Locator,
                stringEndArchive64Locator, 0, pos2, 1)
            self.fp.write(zip64locrec)
            # The classic end record still follows; clamp its fields to
            # their maximum representable values.
            centDirCount = min(centDirCount, 0xFFFF)
            centDirSize = min(centDirSize, 0xFFFFFFFF)
            centDirOffset = min(centDirOffset, 0xFFFFFFFF)

        endrec = struct.pack(structEndArchive, stringEndArchive,
                             0, 0, centDirCount, centDirCount,
                             centDirSize, centDirOffset, len(self._comment))
        self.fp.write(endrec)
        self.fp.write(self._comment)
        self.fp.flush()
1715     def _fpclose(self, fp): 1716         assert self._fileRefCnt > 0 1717         self._fileRefCnt -= 1
1718         if not self._fileRefCnt and not self._filePassed: 1719  fp.close() 1720 
1721 
class PyZipFile(ZipFile):
    """Class to create ZIP archives with Python library files and packages."""

    def __init__(self, file, mode="r", compression=ZIP_STORED,
                 allowZip64=True, optimize=-1):
        # 'optimize' selects which bytecode optimization level to
        # archive; -1 means "legacy": use whatever .pyc already exists.
        ZipFile.__init__(self, file, mode=mode, compression=compression,
                         allowZip64=allowZip64)
        self._optimize = optimize

    def writepy(self, pathname, basename="", filterfunc=None):
        """Add all files from "pathname" to the ZIP archive.

        If pathname is a package directory, search the directory and
        all package subdirectories recursively for all *.py and enter
        the modules into the archive.  If pathname is a plain
        directory, listdir *.py and enter all modules.  Else, pathname
        must be a Python *.py file and the module will be put into the
        archive.  Added modules are always module.pyc.
        This method will compile the module.py into module.pyc if
        necessary.
        If filterfunc(pathname) is given, it is called with every argument.
        When it is False, the file or directory is skipped.
        """
        if filterfunc and not filterfunc(pathname):
            if self.debug:
                label = 'path' if os.path.isdir(pathname) else 'file'
                print('%s "%s" skipped by filterfunc' % (label, pathname))
            return
        dir, name = os.path.split(pathname)
        if os.path.isdir(pathname):
            initname = os.path.join(pathname, "__init__.py")
            if os.path.isfile(initname):
                # This is a package directory, add it
                if basename:
                    basename = "%s/%s" % (basename, name)
                else:
                    basename = name
                if self.debug:
                    print("Adding package in", pathname, "as", basename)
                fname, arcname = self._get_codename(initname[0:-3], basename)
                if self.debug:
                    print("Adding", arcname)
                self.write(fname, arcname)
                dirlist = os.listdir(pathname)
                dirlist.remove("__init__.py")
                # Add all *.py files and package subdirectories
                for filename in dirlist:
                    path = os.path.join(pathname, filename)
                    root, ext = os.path.splitext(filename)
                    if os.path.isdir(path):
                        if os.path.isfile(os.path.join(path, "__init__.py")):
                            # This is a package directory, add it
                            self.writepy(path, basename,
                                         filterfunc=filterfunc)  # Recursive call
                    elif ext == ".py":
                        if filterfunc and not filterfunc(path):
                            if self.debug:
                                print('file "%s" skipped by filterfunc' % path)
                            continue
                        fname, arcname = self._get_codename(path[0:-3],
                                                            basename)
                        if self.debug:
                            print("Adding", arcname)
                        self.write(fname, arcname)
            else:
                # This is NOT a package directory, add its files at top level
                if self.debug:
                    print("Adding files from directory", pathname)
                for filename in os.listdir(pathname):
                    path = os.path.join(pathname, filename)
                    root, ext = os.path.splitext(filename)
                    if ext == ".py":
                        if filterfunc and not filterfunc(path):
                            if self.debug:
                                print('file "%s" skipped by filterfunc' % path)
                            continue
                        fname, arcname = self._get_codename(path[0:-3],
                                                            basename)
                        if self.debug:
                            print("Adding", arcname)
                        self.write(fname, arcname)
        else:
            # A single file: it must be a .py source file.
            if pathname[-3:] != ".py":
                raise RuntimeError(
                    'Files added with writepy() must end with ".py"')
            fname, arcname = self._get_codename(pathname[0:-3], basename)
            if self.debug:
                print("Adding file", arcname)
            self.write(fname, arcname)

    def _get_codename(self, pathname, basename):
        """Return (filename, archivename) for the path.

        Given a module name path, return the correct file path and
        archive name, compiling if necessary.  For example, given
        /python/lib/string, return (/python/lib/string.pyc, string).
        """
        def _compile(file, optimize=-1):
            # Compile 'file' to bytecode; returns False (after printing
            # the error) instead of raising on a compile failure.
            import py_compile
            if self.debug:
                print("Compiling", file)
            try:
                py_compile.compile(file, doraise=True, optimize=optimize)
            except py_compile.PyCompileError as err:
                print(err.msg)
                return False
            return True

        # Candidate bytecode locations: legacy side-by-side .pyc and the
        # PEP 3147 __pycache__ files for each optimization level.
        file_py  = pathname + ".py"
        file_pyc = pathname + ".pyc"
        pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='')
        pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1)
        pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2)
        if self._optimize == -1:
            # legacy mode: use whatever file is present
            if (os.path.isfile(file_pyc) and
                  os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime):
                # Use .pyc file.
                arcname = fname = file_pyc
            elif (os.path.isfile(pycache_opt0) and
                  os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime):
                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
                # file name in the archive.
                fname = pycache_opt0
                arcname = file_pyc
            elif (os.path.isfile(pycache_opt1) and
                  os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime):
                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
                # file name in the archive.
                fname = pycache_opt1
                arcname = file_pyc
            elif (os.path.isfile(pycache_opt2) and
                  os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime):
                # Use the __pycache__/*.pyc file, but write it to the legacy pyc
                # file name in the archive.
                fname = pycache_opt2
                arcname = file_pyc
            else:
                # Compile py into PEP 3147 pyc file.
                if _compile(file_py):
                    if sys.flags.optimize == 0:
                        fname = pycache_opt0
                    elif sys.flags.optimize == 1:
                        fname = pycache_opt1
                    else:
                        fname = pycache_opt2
                    arcname = file_pyc
                else:
                    # Compilation failed: fall back to archiving the source.
                    fname = arcname = file_py
        else:
            # new mode: use given optimization level
            if self._optimize == 0:
                fname = pycache_opt0
                arcname = file_pyc
            else:
                arcname = file_pyc
                if self._optimize == 1:
                    fname = pycache_opt1
                elif self._optimize == 2:
                    fname = pycache_opt2
                else:
                    msg = "invalid value for 'optimize': {!r}".format(self._optimize)
                    raise ValueError(msg)
            # (Re)compile when the chosen bytecode is missing or stale.
            if not (os.path.isfile(fname) and
                    os.stat(fname).st_mtime >= os.stat(file_py).st_mtime):
                if not _compile(file_py, optimize=self._optimize):
                    fname = arcname = file_py
        archivename = os.path.split(arcname)[1]
        if basename:
            archivename = "%s/%s" % (basename, archivename)
        return (fname, archivename)
1894 
def main(args=None):
    """Command-line entry point: list, test, extract, or create a ZIP file."""
    import textwrap
    USAGE = textwrap.dedent("""\
        Usage:
            zipfile.py -l zipfile.zip        # Show listing of a zipfile
            zipfile.py -t zipfile.zip        # Test if a zipfile is valid
            zipfile.py -e zipfile.zip target # Extract zipfile into target dir
            zipfile.py -c zipfile.zip src ... # Create zipfile from sources
        """)
    if args is None:
        args = sys.argv[1:]

    def bail():
        # Print usage and exit with failure status.
        print(USAGE)
        sys.exit(1)

    if not args or args[0] not in ('-l', '-c', '-e', '-t'):
        bail()

    command = args[0]

    if command == '-l':
        if len(args) != 2:
            bail()
        with ZipFile(args[1], 'r') as zf:
            zf.printdir()

    elif command == '-t':
        if len(args) != 2:
            bail()
        with ZipFile(args[1], 'r') as zf:
            badfile = zf.testzip()
        if badfile:
            print("The following enclosed file is corrupted: {!r}".format(badfile))
        print("Done testing")

    elif command == '-e':
        if len(args) != 3:
            bail()
        with ZipFile(args[1], 'r') as zf:
            zf.extractall(args[2])

    elif command == '-c':
        if len(args) < 3:
            bail()

        def addToZip(zf, path, zippath):
            # Recursively add a file or directory tree to the archive.
            if os.path.isfile(path):
                zf.write(path, zippath, ZIP_DEFLATED)
            elif os.path.isdir(path):
                if zippath:
                    zf.write(path, zippath)
                for nm in os.listdir(path):
                    addToZip(zf,
                             os.path.join(path, nm), os.path.join(zippath, nm))
            # else: ignore

        with ZipFile(args[1], 'w') as zf:
            for path in args[2:]:
                zippath = os.path.basename(path)
                if not zippath:
                    zippath = os.path.basename(os.path.dirname(path))
                if zippath in ('', os.curdir, os.pardir):
                    zippath = ''
                addToZip(zf, path, zippath)

if __name__ == "__main__":
    main()
View zipfile Code

 

b. tarfile (standard-library source, for reference)

 1 #!/usr/bin/env python3
 2 #-------------------------------------------------------------------
 3 # tarfile.py
 4 #-------------------------------------------------------------------
 5 # Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
 6 # All rights reserved.
 7 #  8 # Permission is hereby granted, free of charge, to any person
 9 # obtaining a copy of this software and associated documentation
 10 # files (the "Software"), to deal in the Software without
 11 # restriction, including without limitation the rights to use,
 12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
 13 # copies of the Software, and to permit persons to whom the
 14 # Software is furnished to do so, subject to the following
 15 # conditions:
 16 #  17 # The above copyright notice and this permission notice shall be
 18 # included in all copies or substantial portions of the Software.
 19 #  20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 27 # OTHER DEALINGS IN THE SOFTWARE.
 28 #  29 """Read from and write to tar format archives.  30 """
 31 
 32 version     = "0.9.0"
 33 __author__  = "Lars Gust\u00e4bel (lars@gustaebel.de)"
 34 __date__    = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
 35 __cvsid__   = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
 36 __credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
 37 
 38 #---------
 39 # Imports
 40 #---------
 41 from builtins import open as bltn_open  42 import sys  43 import os  44 import io  45 import shutil  46 import stat  47 import time  48 import struct  49 import copy  50 import re  51 
 52 try:  53     import grp, pwd  54 except ImportError:  55     grp = pwd = None  56 
 57 # os.symlink on Windows prior to 6.0 raises NotImplementedError
 58 symlink_exception = (AttributeError, NotImplementedError)  59 try:  60     # OSError (winerror=1314) will be raised if the caller does not hold the
 61     # SeCreateSymbolicLinkPrivilege privilege
 62     symlink_exception += (OSError,)  63 except NameError:  64     pass
 65 
 66 # from tarfile import *
 67 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]  68 
 69 #---------------------------------------------------------
 70 # tar constants
 71 #---------------------------------------------------------
 72 NUL = b"\0"                     # the null character
 73 BLOCKSIZE = 512                 # length of processing blocks
 74 RECORDSIZE = BLOCKSIZE * 20     # length of records
 75 GNU_MAGIC = b"ustar \0"        # magic gnu tar string
 76 POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string
 77 
 78 LENGTH_NAME = 100               # maximum length of a filename
 79 LENGTH_LINK = 100               # maximum length of a linkname
 80 LENGTH_PREFIX = 155             # maximum length of the prefix field
 81 
 82 REGTYPE = b"0"                  # regular file
 83 AREGTYPE = b"\0"                # regular file
 84 LNKTYPE = b"1"                  # link (inside tarfile)
 85 SYMTYPE = b"2"                  # symbolic link
 86 CHRTYPE = b"3"                  # character special device
 87 BLKTYPE = b"4"                  # block special device
 88 DIRTYPE = b"5"                  # directory
 89 FIFOTYPE = b"6"                 # fifo special device
 90 CONTTYPE = b"7"                 # contiguous file
 91 
 92 GNUTYPE_LONGNAME = b"L"         # GNU tar longname
 93 GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
 94 GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
 95 
 96 XHDTYPE = b"x"                  # POSIX.1-2001 extended header
 97 XGLTYPE = b"g"                  # POSIX.1-2001 global header
 98 SOLARIS_XHDTYPE = b"X"          # Solaris extended header
 99 
 100 USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
 101 GNU_FORMAT = 1                  # GNU tar format
 102 PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
 103 DEFAULT_FORMAT = GNU_FORMAT  104 
 105 #---------------------------------------------------------
 106 # tarfile constants
 107 #---------------------------------------------------------
 108 # File types that tarfile supports:
 109 SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,  110  SYMTYPE, DIRTYPE, FIFOTYPE,  111  CONTTYPE, CHRTYPE, BLKTYPE,  112  GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,  113  GNUTYPE_SPARSE)  114 
 115 # File types that will be treated as a regular file.
 116 REGULAR_TYPES = (REGTYPE, AREGTYPE,  117  CONTTYPE, GNUTYPE_SPARSE)  118 
 119 # File types that are part of the GNU tar format.
 120 GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,  121  GNUTYPE_SPARSE)  122 
 123 # Fields from a pax header that override a TarInfo attribute.
 124 PAX_FIELDS = ("path", "linkpath", "size", "mtime",  125               "uid", "gid", "uname", "gname")  126 
 127 # Fields from a pax header that are affected by hdrcharset.
 128 PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}  129 
 130 # Fields in a pax header that are numbers, all other fields
 131 # are treated as strings.
 132 PAX_NUMBER_FIELDS = {  133     "atime": float,  134     "ctime": float,  135     "mtime": float,  136     "uid": int,  137     "gid": int,  138     "size": int  139 }  140 
 141 #---------------------------------------------------------
 142 # initialization
 143 #---------------------------------------------------------
 144 if os.name in ("nt", "ce"):  145     ENCODING = "utf-8"
 146 else:  147     ENCODING = sys.getfilesystemencoding()  148 
 149 #---------------------------------------------------------
 150 # Some useful functions
 151 #---------------------------------------------------------
 152 
 153 def stn(s, length, encoding, errors):  154     """Convert a string to a null-terminated bytes object.  155     """
 156     s = s.encode(encoding, errors)  157     return s[:length] + (length - len(s)) * NUL  158 
 159 def nts(s, encoding, errors):  160     """Convert a null-terminated bytes object to a string.  161     """
 162     p = s.find(b"\0")  163     if p != -1:  164         s = s[:p]  165     return s.decode(encoding, errors)  166 
 167 def nti(s):  168     """Convert a number field to a python number.  169     """
 170     # There are two possible encodings for a number field, see
 171     # itn() below.
 172     if s[0] in (0o200, 0o377):  173         n = 0  174         for i in range(len(s) - 1):  175             n <<= 8
 176             n += s[i + 1]  177         if s[0] == 0o377:  178             n = -(256 ** (len(s) - 1) - n)  179     else:  180         try:  181             s = nts(s, "ascii", "strict")  182             n = int(s.strip() or "0", 8)  183         except ValueError:  184             raise InvalidHeaderError("invalid header")  185     return n  186 
 187 def itn(n, digits=8, format=DEFAULT_FORMAT):  188     """Convert a python number to a number field.  189     """
 190     # POSIX 1003.1-1988 requires numbers to be encoded as a string of
 191     # octal digits followed by a null-byte, this allows values up to
 192     # (8**(digits-1))-1. GNU tar allows storing numbers greater than
 193     # that if necessary. A leading 0o200 or 0o377 byte indicate this
 194     # particular encoding, the following digits-1 bytes are a big-endian
 195     # base-256 representation. This allows values up to (256**(digits-1))-1.
 196     # A 0o200 byte indicates a positive number, a 0o377 byte a negative
 197     # number.
 198     if 0 <= n < 8 ** (digits - 1):  199         s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL  200     elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):  201         if n >= 0:  202             s = bytearray([0o200])  203         else:  204             s = bytearray([0o377])  205             n = 256 ** digits + n  206 
 207         for i in range(digits - 1):  208             s.insert(1, n & 0o377)  209             n >>= 8
 210     else:  211         raise ValueError("overflow in number field")  212 
 213     return s  214 
 215 def calc_chksums(buf):  216     """Calculate the checksum for a member's header by summing up all  217  characters except for the chksum field which is treated as if  218  it was filled with spaces. According to the GNU tar sources,  219  some tars (Sun and NeXT) calculate chksum with signed char,  220  which will be different if there are chars in the buffer with  221  the high bit set. So we calculate two checksums, unsigned and  222  signed.  223     """
 224     unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))  225     signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))  226     return unsigned_chksum, signed_chksum  227 
 228 def copyfileobj(src, dst, length=None, exception=OSError):  229     """Copy length bytes from fileobj src to fileobj dst.  230  If length is None, copy the entire content.  231     """
 232     if length == 0:  233         return
 234     if length is None:  235  shutil.copyfileobj(src, dst)  236         return
 237 
 238     BUFSIZE = 16 * 1024
 239     blocks, remainder = divmod(length, BUFSIZE)  240     for b in range(blocks):  241         buf = src.read(BUFSIZE)  242         if len(buf) < BUFSIZE:  243             raise exception("unexpected end of data")  244  dst.write(buf)  245 
 246     if remainder != 0:  247         buf = src.read(remainder)  248         if len(buf) < remainder:  249             raise exception("unexpected end of data")  250  dst.write(buf)  251     return
 252 
 253 def filemode(mode):  254     """Deprecated in this location; use stat.filemode."""
 255     import warnings  256     warnings.warn("deprecated in favor of stat.filemode",  257                   DeprecationWarning, 2)  258     return stat.filemode(mode)  259 
 260 def _safe_print(s):  261     encoding = getattr(sys.stdout, 'encoding', None)  262     if encoding is not None:  263         s = s.encode(encoding, 'backslashreplace').decode(encoding)  264     print(s, end=' ')  265 
 266 
 267 class TarError(Exception):  268     """Base exception."""
 269     pass
 270 class ExtractError(TarError):  271     """General exception for extract errors."""
 272     pass
 273 class ReadError(TarError):  274     """Exception for unreadable tar archives."""
 275     pass
 276 class CompressionError(TarError):  277     """Exception for unavailable compression methods."""
 278     pass
 279 class StreamError(TarError):  280     """Exception for unsupported operations on stream-like TarFiles."""
 281     pass
 282 class HeaderError(TarError):  283     """Base exception for header errors."""
 284     pass
 285 class EmptyHeaderError(HeaderError):  286     """Exception for empty headers."""
 287     pass
 288 class TruncatedHeaderError(HeaderError):  289     """Exception for truncated headers."""
 290     pass
 291 class EOFHeaderError(HeaderError):  292     """Exception for end of file headers."""
 293     pass
 294 class InvalidHeaderError(HeaderError):  295     """Exception for invalid headers."""
 296     pass
 297 class SubsequentHeaderError(HeaderError):  298     """Exception for missing and invalid extended headers."""
 299     pass
 300 
 301 #---------------------------
 302 # internal stream interface
 303 #---------------------------
 304 class _LowLevelFile:  305     """Low-level file object. Supports reading and writing.  306  It is used instead of a regular file object for streaming  307  access.  308     """
 309 
 310     def __init__(self, name, mode):  311         mode = {  312             "r": os.O_RDONLY,  313             "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,  314  }[mode]  315         if hasattr(os, "O_BINARY"):  316             mode |= os.O_BINARY  317         self.fd = os.open(name, mode, 0o666)  318 
 319     def close(self):  320  os.close(self.fd)  321 
 322     def read(self, size):  323         return os.read(self.fd, size)  324 
 325     def write(self, s):  326  os.write(self.fd, s)  327 
 328 class _Stream:  329     """Class that serves as an adapter between TarFile and  330  a stream-like object. The stream-like object only  331  needs to have a read() or write() method and is accessed  332  blockwise. Use of gzip or bzip2 compression is possible.  333  A stream-like object could be for example: sys.stdin,  334  sys.stdout, a socket, a tape device etc.  335 
 336  _Stream is intended to be used only internally.  337     """
 338 
 339     def __init__(self, name, mode, comptype, fileobj, bufsize):  340         """Construct a _Stream object.  341         """
 342         self._extfileobj = True  343         if fileobj is None:  344             fileobj = _LowLevelFile(name, mode)  345             self._extfileobj = False  346 
 347         if comptype == '*':  348             # Enable transparent compression detection for the
 349             # stream interface
 350             fileobj = _StreamProxy(fileobj)  351             comptype = fileobj.getcomptype()  352 
 353         self.name     = name or ""
 354         self.mode     = mode  355         self.comptype = comptype  356         self.fileobj  = fileobj  357         self.bufsize  = bufsize  358         self.buf      = b""
 359         self.pos      = 0  360         self.closed   = False  361 
 362         try:  363             if comptype == "gz":  364                 try:  365                     import zlib  366                 except ImportError:  367                     raise CompressionError("zlib module is not available")  368                 self.zlib = zlib  369                 self.crc = zlib.crc32(b"")  370                 if mode == "r":  371  self._init_read_gz()  372                     self.exception = zlib.error  373                 else:  374  self._init_write_gz()  375 
 376             elif comptype == "bz2":  377                 try:  378                     import bz2  379                 except ImportError:  380                     raise CompressionError("bz2 module is not available")  381                 if mode == "r":  382                     self.dbuf = b""
 383                     self.cmp = bz2.BZ2Decompressor()  384                     self.exception = OSError  385                 else:  386                     self.cmp = bz2.BZ2Compressor()  387 
 388             elif comptype == "xz":  389                 try:  390                     import lzma  391                 except ImportError:  392                     raise CompressionError("lzma module is not available")  393                 if mode == "r":  394                     self.dbuf = b""
 395                     self.cmp = lzma.LZMADecompressor()  396                     self.exception = lzma.LZMAError  397                 else:  398                     self.cmp = lzma.LZMACompressor()  399 
 400             elif comptype != "tar":  401                 raise CompressionError("unknown compression type %r" % comptype)  402 
 403         except:  404             if not self._extfileobj:  405  self.fileobj.close()  406             self.closed = True  407             raise
 408 
 409     def __del__(self):  410         if hasattr(self, "closed") and not self.closed:  411  self.close()  412 
 413     def _init_write_gz(self):  414         """Initialize for writing with gzip compression.  415         """
 416         self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,  417                                             -self.zlib.MAX_WBITS,  418  self.zlib.DEF_MEM_LEVEL,  419  0)  420         timestamp = struct.pack("<L", int(time.time()))  421         self.__write(b"\037\213\010\010" + timestamp + b"\002\377")  422         if self.name.endswith(".gz"):  423             self.name = self.name[:-3]  424         # RFC1952 says we must use ISO-8859-1 for the FNAME field.
 425         self.__write(self.name.encode("iso-8859-1", "replace") + NUL)  426 
 427     def write(self, s):  428         """Write string s to the stream.  429         """
 430         if self.comptype == "gz":  431             self.crc = self.zlib.crc32(s, self.crc)  432         self.pos += len(s)  433         if self.comptype != "tar":  434             s = self.cmp.compress(s)  435         self.__write(s)  436 
 437     def __write(self, s):  438         """Write string s to the stream if a whole new block  439  is ready to be written.  440         """
 441         self.buf += s  442         while len(self.buf) > self.bufsize:  443  self.fileobj.write(self.buf[:self.bufsize])  444             self.buf = self.buf[self.bufsize:]  445 
 446     def close(self):  447         """Close the _Stream object. No operation should be  448  done on it afterwards.  449         """
 450         if self.closed:  451             return
 452 
 453         self.closed = True  454         try:  455             if self.mode == "w" and self.comptype != "tar":  456                 self.buf += self.cmp.flush()  457 
 458             if self.mode == "w" and self.buf:  459  self.fileobj.write(self.buf)  460                 self.buf = b""
 461                 if self.comptype == "gz":  462                     self.fileobj.write(struct.pack("<L", self.crc))  463                     self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))  464         finally:  465             if not self._extfileobj:  466  self.fileobj.close()  467 
 468     def _init_read_gz(self):  469         """Initialize for reading a gzip compressed fileobj.  470         """
 471         self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)  472         self.dbuf = b""
 473 
 474         # taken from gzip.GzipFile with some alterations
 475         if self.__read(2) != b"\037\213":  476             raise ReadError("not a gzip file")  477         if self.__read(1) != b"\010":  478             raise CompressionError("unsupported compression method")  479 
 480         flag = ord(self.__read(1))  481         self.__read(6)  482 
 483         if flag & 4:  484             xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))  485  self.read(xlen)  486         if flag & 8:  487             while True:  488                 s = self.__read(1)  489                 if not s or s == NUL:  490                     break
 491         if flag & 16:  492             while True:  493                 s = self.__read(1)  494                 if not s or s == NUL:  495                     break
 496         if flag & 2:  497             self.__read(2)  498 
 499     def tell(self):  500         """Return the stream's file pointer position.  501         """
 502         return self.pos  503 
 504     def seek(self, pos=0):  505         """Set the stream's file pointer to pos. Negative seeking  506  is forbidden.  507         """
 508         if pos - self.pos >= 0:  509             blocks, remainder = divmod(pos - self.pos, self.bufsize)  510             for i in range(blocks):  511  self.read(self.bufsize)  512  self.read(remainder)  513         else:  514             raise StreamError("seeking backwards is not allowed")  515         return self.pos  516 
 517     def read(self, size=None):  518         """Return the next size number of bytes from the stream.  519  If size is not defined, return all bytes of the stream  520  up to EOF.  521         """
 522         if size is None:  523             t = []  524             while True:  525                 buf = self._read(self.bufsize)  526                 if not buf:  527                     break
 528  t.append(buf)  529             buf = "".join(t)  530         else:  531             buf = self._read(size)  532         self.pos += len(buf)  533         return buf  534 
 535     def _read(self, size):  536         """Return size bytes from the stream.  537         """
 538         if self.comptype == "tar":  539             return self.__read(size)  540 
 541         c = len(self.dbuf)  542         while c < size:  543             buf = self.__read(self.bufsize)  544             if not buf:  545                 break
 546             try:  547                 buf = self.cmp.decompress(buf)  548             except self.exception:  549                 raise ReadError("invalid compressed data")  550             self.dbuf += buf  551             c += len(buf)  552         buf = self.dbuf[:size]  553         self.dbuf = self.dbuf[size:]  554         return buf  555 
 556     def __read(self, size):  557         """Return size bytes from stream. If internal buffer is empty,  558  read another block from the stream.  559         """
 560         c = len(self.buf)  561         while c < size:  562             buf = self.fileobj.read(self.bufsize)  563             if not buf:  564                 break
 565             self.buf += buf  566             c += len(buf)  567         buf = self.buf[:size]  568         self.buf = self.buf[size:]  569         return buf  570 # class _Stream
 571 
 572 class _StreamProxy(object):  573     """Small proxy class that enables transparent compression  574  detection for the Stream interface (mode 'r|*').  575     """
 576 
 577     def __init__(self, fileobj):  578         self.fileobj = fileobj  579         self.buf = self.fileobj.read(BLOCKSIZE)  580 
 581     def read(self, size):  582         self.read = self.fileobj.read  583         return self.buf  584 
 585     def getcomptype(self):  586         if self.buf.startswith(b"\x1f\x8b\x08"):  587             return "gz"
 588         elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":  589             return "bz2"
 590         elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):  591             return "xz"
 592         else:  593             return "tar"
 594 
 595     def close(self):  596  self.fileobj.close()  597 # class StreamProxy
 598 
 599 #------------------------
 600 # Extraction file object
 601 #------------------------
 602 class _FileInFile(object):  603     """A thin wrapper around an existing file object that  604  provides a part of its data as an individual file  605  object.  606     """
 607 
 608     def __init__(self, fileobj, offset, size, blockinfo=None):  609         self.fileobj = fileobj  610         self.offset = offset  611         self.size = size  612         self.position = 0  613         self.name = getattr(fileobj, "name", None)  614         self.closed = False  615 
 616         if blockinfo is None:  617             blockinfo = [(0, size)]  618 
 619         # Construct a map with data and zero blocks.
 620         self.map_index = 0  621         self.map = []  622         lastpos = 0  623         realpos = self.offset  624         for offset, size in blockinfo:  625             if offset > lastpos:  626  self.map.append((False, lastpos, offset, None))  627             self.map.append((True, offset, offset + size, realpos))  628             realpos += size  629             lastpos = offset + size  630         if lastpos < self.size:  631  self.map.append((False, lastpos, self.size, None))  632 
 633     def flush(self):  634         pass
 635 
 636     def readable(self):  637         return True  638 
 639     def writable(self):  640         return False  641 
 642     def seekable(self):  643         return self.fileobj.seekable()  644 
 645     def tell(self):  646         """Return the current file position.  647         """
 648         return self.position  649 
 650     def seek(self, position, whence=io.SEEK_SET):  651         """Seek to a position in the file.  652         """
 653         if whence == io.SEEK_SET:  654             self.position = min(max(position, 0), self.size)  655         elif whence == io.SEEK_CUR:  656             if position < 0:  657                 self.position = max(self.position + position, 0)  658             else:  659                 self.position = min(self.position + position, self.size)  660         elif whence == io.SEEK_END:  661             self.position = max(min(self.size + position, self.size), 0)  662         else:  663             raise ValueError("Invalid argument")  664         return self.position  665 
 666     def read(self, size=None):  667         """Read data from the file.  668         """
 669         if size is None:  670             size = self.size - self.position  671         else:  672             size = min(size, self.size - self.position)  673 
 674         buf = b""
 675         while size > 0:  676             while True:  677                 data, start, stop, offset = self.map[self.map_index]  678                 if start <= self.position < stop:  679                     break
 680                 else:  681                     self.map_index += 1
 682                     if self.map_index == len(self.map):  683                         self.map_index = 0  684             length = min(size, stop - self.position)  685             if data:  686                 self.fileobj.seek(offset + (self.position - start))  687                 b = self.fileobj.read(length)  688                 if len(b) != length:  689                     raise ReadError("unexpected end of data")  690                 buf += b  691             else:  692                 buf += NUL * length  693             size -= length  694             self.position += length  695         return buf  696 
 697     def readinto(self, b):  698         buf = self.read(len(b))  699         b[:len(buf)] = buf  700         return len(buf)  701 
 702     def close(self):  703         self.closed = True  704 #class _FileInFile
 705 
 706 class ExFileObject(io.BufferedReader):  707 
 708     def __init__(self, tarfile, tarinfo):  709         fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,  710  tarinfo.size, tarinfo.sparse)  711         super().__init__(fileobj)  712 #class ExFileObject
 713 
 714 #------------------
 715 # Exported Classes
 716 #------------------
 717 class TarInfo(object):  718     """Informational class which holds the details about an  719  archive member given by a tar header block.  720  TarInfo objects are returned by TarFile.getmember(),  721  TarFile.getmembers() and TarFile.gettarinfo() and are  722  usually created internally.  723     """
 724 
 725     __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",  726                  "chksum", "type", "linkname", "uname", "gname",  727                  "devmajor", "devminor",  728                  "offset", "offset_data", "pax_headers", "sparse",  729                  "tarfile", "_sparse_structs", "_link_target")  730 
 731     def __init__(self, name=""):  732         """Construct a TarInfo object. name is the optional name  733  of the member.  734         """
 735         self.name = name        # member name
 736         self.mode = 0o644       # file permissions
 737         self.uid = 0            # user id
 738         self.gid = 0            # group id
 739         self.size = 0           # file size
 740         self.mtime = 0          # modification time
 741         self.chksum = 0         # header checksum
 742         self.type = REGTYPE     # member type
 743         self.linkname = ""      # link name
 744         self.uname = ""         # user name
 745         self.gname = ""         # group name
 746         self.devmajor = 0       # device major number
 747         self.devminor = 0       # device minor number
 748 
 749         self.offset = 0         # the tar header starts here
 750         self.offset_data = 0    # the file's data starts here
 751 
 752         self.sparse = None      # sparse member information
 753         self.pax_headers = {}   # pax header information
 754 
 755     # In pax headers the "name" and "linkname" field are called
 756     # "path" and "linkpath".
 757     def _getpath(self):  758         return self.name  759     def _setpath(self, name):  760         self.name = name  761     path = property(_getpath, _setpath)  762 
 763     def _getlinkpath(self):  764         return self.linkname  765     def _setlinkpath(self, linkname):  766         self.linkname = linkname  767     linkpath = property(_getlinkpath, _setlinkpath)  768 
 769     def __repr__(self):  770         return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))  771 
 772     def get_info(self):  773         """Return the TarInfo's attributes as a dictionary.  774         """
 775         info = {  776             "name": self.name,  777             "mode":     self.mode & 0o7777,  778             "uid": self.uid,  779             "gid": self.gid,  780             "size": self.size,  781             "mtime": self.mtime,  782             "chksum": self.chksum,  783             "type": self.type,  784             "linkname": self.linkname,  785             "uname": self.uname,  786             "gname": self.gname,  787             "devmajor": self.devmajor,  788             "devminor": self.devminor  789  }  790 
 791         if info["type"] == DIRTYPE and not info["name"].endswith("/"):  792             info["name"] += "/"
 793 
 794         return info  795 
 796     def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):  797         """Return a tar header as a string of 512 byte blocks.  798         """
 799         info = self.get_info()  800 
 801         if format == USTAR_FORMAT:  802             return self.create_ustar_header(info, encoding, errors)  803         elif format == GNU_FORMAT:  804             return self.create_gnu_header(info, encoding, errors)  805         elif format == PAX_FORMAT:  806             return self.create_pax_header(info, encoding)  807         else:  808             raise ValueError("invalid format")  809 
 810     def create_ustar_header(self, info, encoding, errors):  811         """Return the object as a ustar header block.  812         """
 813         info["magic"] = POSIX_MAGIC  814 
 815         if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:  816             raise ValueError("linkname is too long")  817 
 818         if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:  819             info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)  820 
 821         return self._create_header(info, USTAR_FORMAT, encoding, errors)  822 
 823     def create_gnu_header(self, info, encoding, errors):  824         """Return the object as a GNU header block sequence.  825         """
 826         info["magic"] = GNU_MAGIC  827 
 828         buf = b""
 829         if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:  830             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)  831 
 832         if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:  833             buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)  834 
 835         return buf + self._create_header(info, GNU_FORMAT, encoding, errors)  836 
    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
        represented this way, prepend a pax extended header sequence
        with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                # Non-ASCII value: must be carried in the pax record.
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                # Too long for the fixed-width ustar field.
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            # Octal fields hold digits-1 characters plus a terminator, hence
            # the 8 ** (digits - 1) upper bound. Floats (e.g. mtime with
            # sub-second precision) can only be stored in the pax record.
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        # The fallback ustar header uses ASCII with replacement since all
        # problematic values have been moved into the pax records above.
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.

        A global header (XGLTYPE) applies to every member that follows
        it in the archive; values are always encoded as UTF-8.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
    def _posix_split_name(self, name, encoding, errors):
        """Split a name longer than 100 chars into a prefix
        and a name part.

        Tries every "/" split point from the left until both halves fit
        their ustar fields (prefix <= LENGTH_PREFIX, name <= LENGTH_NAME,
        measured in encoded bytes). Raises ValueError if no split works.
        """
        components = name.split("/")
        for i in range(1, len(components)):
            prefix = "/".join(components[:i])
            name = "/".join(components[i:])
            if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
                    len(name.encode(encoding, errors)) <= LENGTH_NAME:
                break
        else:
            # Loop exhausted without break: no valid split point exists.
            raise ValueError("name is too long")

        return prefix, name
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.

        The block is assembled from fixed-width fields (ustar layout),
        then the checksum is computed over the block with the checksum
        field set to spaces and spliced back in.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # The checksum field occupies bytes 148..155 of the 512-byte block:
        # 512-364 = 148 and 512-357 = 155, so this writes the 7 bytes of
        # "%06o\0" and leaves the trailing space from the placeholder.
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
 936  @staticmethod  937     def _create_payload(payload):  938         """Return the string payload filled with zero bytes  939  up to the next 512 byte border.  940         """
 941         blocks, remainder = divmod(len(payload), BLOCKSIZE)  942         if remainder > 0:  943             payload += (BLOCKSIZE - remainder) * NUL  944         return payload  945 
    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
        for name.

        The long string is stored as the *data* of a pseudo-member whose
        header carries the fixed name "././@LongLink".
        """
        # GNU long headers store the string NUL-terminated.
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # Each record is "%d %s=%s\n" where the leading number is the
            # total record length *including itself*. Since the number's own
            # digit count depends on the total, iterate until the length
            # value is self-consistent (a fixed point).
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError depending on how the block is malformed.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            # An all-NUL block marks the end of the archive.
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # Decode the fixed-width ustar fields at their standard offsets.
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save the them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj
    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
        tarfile.

        Reads one 512-byte header block at the current file position,
        records the member's offset, and lets _proc_member() consume any
        continuation blocks (long names, sparse maps, pax records).
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        # The header started BLOCKSIZE bytes before the current position.
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)
    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
1098     def _proc_member(self, tarfile): 1099         """Choose the right processing method depending on 1100  the type and call it. 1101         """
1102         if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): 1103             return self._proc_gnulong(tarfile) 1104         elif self.type == GNUTYPE_SPARSE: 1105             return self._proc_sparse(tarfile) 1106         elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): 1107             return self._proc_pax(tarfile) 1108         else: 1109             return self._proc_builtin(tarfile) 1110 
    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
        will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        # The next member's header starts right after the (padded) data.
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self
    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
        or longlink member.

        The long string is this pseudo-member's data; the real member
        follows immediately and gets its name/linkname patched.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each extension block holds up to 21 (offset, numbytes) pairs
            # of 12 octal digits each (21 * 24 = 504 bytes).
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # Byte 504 flags whether yet another extension block follows.
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # Report the real (expanded) file size, not the on-archive size.
        self.size = origsize
        return self
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
        POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            # The value spans from just past "=" to just before the record's
            # trailing newline (record start + declared length - 1).
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next
1280     def _proc_gnusparse_00(self, next, pax_headers, buf): 1281         """Process a GNU tar extended sparse header, version 0.0. 1282         """
1283         offsets = [] 1284         for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf): 1285             offsets.append(int(match.group(1))) 1286         numbytes = [] 1287         for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf): 1288             numbytes.append(int(match.group(1))) 1289         next.sparse = list(zip(offsets, numbytes)) 1290 
1291     def _proc_gnusparse_01(self, next, pax_headers): 1292         """Process a GNU tar extended sparse header, version 0.1. 1293         """
1294         sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")] 1295         next.sparse = list(zip(sparse[::2], sparse[1::2])) 1296 
    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.

        Format 1.0 stores the sparse map in the member's *data* area as
        newline-separated decimal numbers: a count followed by
        count * 2 values (offset/numbytes pairs), padded to BLOCKSIZE.
        """
        fields = None
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        # First line is the number of (offset, numbytes) pairs.
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            # Refill the buffer block-by-block until a full line is present.
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        # The real file data starts at the next block boundary.
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))
    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
        pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                # Real (expanded) size of a sparse member.
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    # Convert via the field's registered converter; fall back
                    # to 0 on malformed numbers rather than failing the read.
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        # Keep a copy of the records that were applied to this member.
        self.pax_headers = pax_headers.copy()
1336     def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors): 1337         """Decode a single field from a pax record. 1338         """
1339         try: 1340             return value.decode(encoding, "strict") 1341         except UnicodeDecodeError: 1342             return value.decode(fallback_encoding, fallback_errors) 1343 
1344     def _block(self, count): 1345         """Round up a byte count by BLOCKSIZE and return it, 1346  e.g. _block(834) => 1024. 1347         """
1348         blocks, remainder = divmod(count, BLOCKSIZE) 1349         if remainder: 1350             blocks += 1
1351         return blocks * BLOCKSIZE 1352 
    def isreg(self):
        """Return True if the member is a regular file."""
        return self.type in REGULAR_TYPES
    def isfile(self):
        """Alias for isreg()."""
        return self.isreg()
    def isdir(self):
        """Return True if the member is a directory."""
        return self.type == DIRTYPE
    def issym(self):
        """Return True if the member is a symbolic link."""
        return self.type == SYMTYPE
    def islnk(self):
        """Return True if the member is a hard link."""
        return self.type == LNKTYPE
    def ischr(self):
        """Return True if the member is a character device."""
        return self.type == CHRTYPE
    def isblk(self):
        """Return True if the member is a block device."""
        return self.type == BLKTYPE
    def isfifo(self):
        """Return True if the member is a FIFO."""
        return self.type == FIFOTYPE
    def issparse(self):
        """Return True if the member has a sparse map attached."""
        return self.sparse is not None
    def isdev(self):
        """Return True if the member is a device of any kind."""
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo
1374 
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        """
        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
        if mode not in modes:
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        self.mode = mode
        self._mode = modes[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            # We own this file object and must close it ourselves.
            self._extfileobj = False
        else:
            # Derive name/mode from the caller's file object where possible.
            if (name is None and hasattr(fileobj, "name") and
                    isinstance(fileobj.name, (str, bytes))):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        # User-supplied pax headers only apply to PAX_FORMAT archives.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in ("a", "w", "x"):
                self._loaded = True

                # Write the user-supplied global pax header up front.
                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # Initialization failed: close what we opened ourselves and
            # mark the object closed before re-raising.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
1498 
    #--------------------------------------------------------------------------
    # Below are the classmethods which act as alternate constructors to the
    # TarFile class. The open() method is the only one that is needed for
    # public use; it is the "super"-constructor and is able to select an
    # adequate "sub"-constructor for a particular compression using the mapping
    # from OPEN_METH.
    #
    # This concept allows one to subclass TarFile without losing the comfort of
    # the super-constructor. A sub-constructor is registered and made available
    # by adding it to the mapping in OPEN_METH.
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.

           mode:
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'r:xz'       open for reading with lzma compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression
           'w:xz'       open for writing with lzma compression

           'x' or 'x:'  create a tarfile exclusively without compression, raise
                        an exception if the file is already created
           'x:gz'       create a gzip compressed tarfile, raise an exception
                        if the file is already created
           'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                        if the file is already created
           'x:xz'       create an lzma compressed tarfile, raise an exception
                        if the file is already created

           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'r|xz'       open an lzma compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
           'w|xz'       open an lzma compressed stream for writing
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # Wrong compression type: rewind and try the next one.
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            # Explicit compression, e.g. "w:gz".
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            # Non-seekable stream mode, e.g. "r|gz".
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in ("r", "w"):
                raise ValueError("mode must be 'r' or 'w'")

            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                stream.close()
                raise
            # The TarFile owns the stream and must close it.
            t._extfileobj = False
            return t

        elif mode in ("a", "w", "x"):
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")
    @classmethod
    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
        """Open uncompressed tar archive name for reading or writing.

        Unlike the compressed variants below, plain tar supports all four
        modes, including append ('a').
        """
        if mode not in ("r", "a", "w", "x"):
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        return cls(name, mode, fileobj, **kwargs)
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            import gzip
            # Probe the attribute too: a partially stripped-down build may
            # ship a gzip module without GzipFile.
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        except OSError:
            if fileobj is not None and mode == 'r':
                raise ReadError("not a gzip file")
            raise

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            # taropen failed: close the gzip wrapper we created above.
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a gzip file")
            raise
        except:
            fileobj.close()
            raise
        # The gzip wrapper belongs to us, so close() must close it.
        t._extfileobj = False
        return t
    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            import bz2
        except ImportError:
            raise CompressionError("bz2 module is not available")

        # BZ2File accepts either an open file object or a filename.
        fileobj = bz2.BZ2File(fileobj or name, mode,
                              compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError):
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a bzip2 file")
            raise
        except:
            # Any other failure: still close the wrapper, then re-raise.
            fileobj.close()
            raise
        # The bz2 wrapper belongs to us, so close() must close it.
        t._extfileobj = False
        return t
    @classmethod
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            import lzma
        except ImportError:
            raise CompressionError("lzma module is not available")

        # LZMAFile accepts either an open file object or a filename.
        fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (lzma.LZMAError, EOFError):
            fileobj.close()
            if mode == 'r':
                raise ReadError("not an lzma file")
            raise
        except:
            # Any other failure: still close the wrapper, then re-raise.
            fileobj.close()
            raise
        # The lzma wrapper belongs to us, so close() must close it.
        t._extfileobj = False
        return t
    # All *open() methods are registered here.
    # Maps the compression suffix used in mode strings ("r:gz", "w|bz2",
    # ...) to the name of the classmethod that opens that format; open()
    # also iterates this mapping to auto-detect compression for "r:*".
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }
1706     #--------------------------------------------------------------------------
1707     # The public methods which TarFile provides:
1708 
    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
           appended to the archive.
        """
        # Idempotent: a second close() is a no-op.
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode in ("a", "w", "x"):
                # End-of-archive marker: two consecutive zero blocks.
                self.fileobj.write(NUL * (BLOCKSIZE * 2))
                self.offset += (BLOCKSIZE * 2)
                # fill up the end with zero-blocks
                # (like option -b20 for tar does)
                blocks, remainder = divmod(self.offset, RECORDSIZE)
                if remainder > 0:
                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
        finally:
            # Only close the underlying file if we opened it ourselves.
            if not self._extfileobj:
                self.fileobj.close()
1730     def getmember(self, name): 1731         """Return a TarInfo object for member `name'. If `name' can not be 1732  found in the archive, KeyError is raised. If a member occurs more 1733  than once in the archive, its last occurrence is assumed to be the 1734  most up-to-date version. 1735         """
1736         tarinfo = self._getmember(name) 1737         if tarinfo is None: 1738             raise KeyError("filename %r not found" % name) 1739         return tarinfo 1740 
1741     def getmembers(self): 1742         """Return the members of the archive as a list of TarInfo objects. The 1743  list has the same order as the members in the archive. 1744         """
1745  self._check() 1746         if not self._loaded:    # if we want to obtain a list of
1747             self._load()        # all members, we first have to
1748                                 # scan the whole archive.
1749         return self.members 1750 
1751     def getnames(self): 1752         """Return the members of the archive as a list of their names. It has 1753  the same order as the list returned by getmembers(). 1754         """
1755         return [tarinfo.name for tarinfo in self.getmembers()] 1756 
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object from the result of os.stat or equivalent
           on an existing file. The file is either named by `name', or
           specified as a file object `fileobj' with a file descriptor. If
           given, `arcname' specifies an alternative name for the file in the
           archive, otherwise, the name is taken from the 'name' attribute of
           'fileobj', or the 'name' argument. The name should be a text
           string.

        Returns None for file types that cannot be stored in a tar
        archive (e.g. sockets).
        """
        self._check("awx")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self  # Not needed

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        # NOTE: `type` shadows the builtin here; kept as-is for history.
        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (socket, door, ...).
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            # Only regular files carry data in the archive.
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # pwd/grp are None on platforms without them (e.g. Windows).
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[2 - 2]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
    def list(self, verbose=True, *, members=None):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced. `members' is optional and must be a subset of the
           list returned by getmembers().
        """
        self._check()

        # Iterating self reads members lazily from the archive.
        if members is None:
            members = self
        for tarinfo in members:
            if verbose:
                _safe_print(stat.filemode(tarinfo.mode))
                _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                       tarinfo.gname or tarinfo.gid))
                if tarinfo.ischr() or tarinfo.isblk():
                    # Device nodes show "major,minor" instead of a size.
                    _safe_print("%10s" %
                            ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
                else:
                    _safe_print("%10d" % tarinfo.size)
                _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
                            % time.localtime(tarinfo.mtime)[:6])

            # Directories get a trailing slash, like `ls -p'.
            _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))

            if verbose:
                if tarinfo.issym():
                    _safe_print("-> " + tarinfo.linkname)
                if tarinfo.islnk():
                    _safe_print("link to " + tarinfo.linkname)
            print()
    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
           (directory, fifo, symbolic link, etc.). If given, `arcname'
           specifies an alternative name for the file in the archive.
           Directories are added recursively by default. This can be avoided by
           setting `recursive' to False. `exclude' is a function that should
           return True for each filename to be excluded. `filter' is a function
           that expects a TarInfo object argument and returns the changed
           TarInfo object, if it returns None the TarInfo object will be
           excluded from the archive.
        """
        self._check("awx")

        if arcname is None:
            arcname = name

        # Exclude pathnames.
        # `exclude' is deprecated in favour of `filter'.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                    DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            # gettarinfo() returns None for unsupported file types.
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                # Recurse into the directory's entries.
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                            recursive, exclude, filter=filter)

        else:
            # Member types without data (fifo, device, link, ...).
            self.addfile(tarinfo)
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
           given, it should be a binary file, and tarinfo.size bytes are read
           from it and added to the archive. You can create TarInfo objects
           directly, or by using gettarinfo().
        """
        self._check("awx")

        # Work on a copy so the caller's TarInfo is not mutated.
        tarinfo = copy.copy(tarinfo)

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # If there's data to follow, append it.
        if fileobj is not None:
            copyfileobj(fileobj, self.fileobj, tarinfo.size)
            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
            if remainder > 0:
                # Pad the final partial block with NULs to a full block.
                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
                blocks += 1
            self.offset += blocks * BLOCKSIZE

        self.members.append(tarinfo)
    def extractall(self, path=".", members=None, *, numeric_owner=False):
        """Extract all members from the archive to the current working
           directory and set owner, modification time and permissions on
           directories afterwards. `path' specifies a different directory
           to extract to. `members' is optional and must be a subset of the
           list returned by getmembers(). If `numeric_owner` is True, only
           the numbers for user/group names are used and not the names.
        """
        directories = []

        if members is None:
            members = self

        for tarinfo in members:
            if tarinfo.isdir():
                # Extract directories with a safe mode.
                # (A read-only directory would block extraction of its
                # own contents; the real mode is applied below.)
                directories.append(tarinfo)
                tarinfo = copy.copy(tarinfo)
                tarinfo.mode = 0o700
            # Do not set_attrs directories, as we will do that further down
            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(),
                         numeric_owner=numeric_owner)

        # Reverse sort directories.
        # Deepest-first, so setting a parent's mtime/mode happens after
        # all of its children have been processed.
        directories.sort(key=lambda a: a.name)
        directories.reverse()

        # Set correct owner, mtime and filemode on directories.
        for tarinfo in directories:
            dirpath = os.path.join(path, tarinfo.name)
            try:
                self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError as e:
                # errorlevel <= 1: log and continue instead of raising.
                if self.errorlevel > 1:
                    raise
                else:
                    self._dbg(1, "tarfile: %s" % e)
    def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
           specify a different directory using `path'. File attributes (owner,
           mtime, mode) are set unless `set_attrs' is False. If `numeric_owner`
           is True, only the numbers for user/group names are used and not
           the names.
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        # Prepare the link target for makelink().
        if tarinfo.islnk():
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

        try:
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs,
                                 numeric_owner=numeric_owner)
        except OSError as e:
            # errorlevel 0: OS errors are only logged, not raised.
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError as e:
            # errorlevel <= 1: non-fatal extraction problems are logged.
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)
2053     def extractfile(self, member): 2054         """Extract a member from the archive as a file object. `member' may be 2055  a filename or a TarInfo object. If `member' is a regular file or a 2056  link, an io.BufferedReader object is returned. Otherwise, None is 2057  returned. 2058         """
2059         self._check("r") 2060 
2061         if isinstance(member, str): 2062             tarinfo = self.getmember(member) 2063         else: 2064             tarinfo = member 2065 
2066         if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: 2067             # Members with unknown types are treated as regular files.
2068             return self.fileobject(self, tarinfo) 2069 
2070         elif tarinfo.islnk() or tarinfo.issym(): 2071             if isinstance(self.fileobj, _Stream): 2072                 # A small but ugly workaround for the case that someone tries
2073                 # to extract a (sym)link as a file-object from a non-seekable
2074                 # stream of tar blocks.
2075                 raise StreamError("cannot extract (sym)link as file object") 2076             else: 2077                 # A (sym)link's file object is its target's file object.
2078                 return self.extractfile(self._find_link_target(tarinfo)) 2079         else: 2080             # If there's no data associated with the member (directory, chrdev,
2081             # blkdev, etc.), return None instead of a file object.
2082             return None 2083 
    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                        numeric_owner=False):
        """Extract the TarInfo object tarinfo to a physical
           file called targetpath.
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        # Dispatch on member type to the matching make*() method;
        # subclasses may override these hooks individually.
        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        if set_attrs:
            self.chown(tarinfo, targetpath, numeric_owner)
            if not tarinfo.issym():
                # chmod/utime on a symlink would affect its target.
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)
2128     #--------------------------------------------------------------------------
2129     # Below are the different file methods. They are called via
2130     # _extract_member() when extract() is called. They can be replaced in a
2131     # subclass to implement other functionality.
2132 
2133     def makedir(self, tarinfo, targetpath): 2134         """Make a directory called targetpath. 2135         """
2136         try: 2137             # Use a safe mode for the directory, the real mode is set
2138             # later in _extract_member().
2139  os.mkdir(targetpath, 0o700) 2140         except FileExistsError: 2141             pass
2142 
    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.
        """
        source = self.fileobj
        # Position the archive at the member's data area.
        source.seek(tarinfo.offset_data)
        with bltn_open(targetpath, "wb") as target:
            if tarinfo.sparse is not None:
                # Sparse member: write only the stored data runs, then
                # extend the file to its full logical size.
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size, ReadError)
                target.seek(tarinfo.size)
                target.truncate()
            else:
                copyfileobj(source, target, tarinfo.size, ReadError)
2158     def makeunknown(self, tarinfo, targetpath): 2159         """Make a file from a TarInfo object with an unknown type 2160  at targetpath. 2161         """
2162  self.makefile(tarinfo, targetpath) 2163         self._dbg(1, "tarfile: Unknown file type %r, " \ 2164                      "extracted as regular file." % tarinfo.type) 2165 
2166     def makefifo(self, tarinfo, targetpath): 2167         """Make a fifo called targetpath. 2168         """
2169         if hasattr(os, "mkfifo"): 2170  os.mkfifo(targetpath) 2171         else: 2172             raise ExtractError("fifo not supported by system") 2173 
2174     def makedev(self, tarinfo, targetpath): 2175         """Make a character or block device called targetpath. 2176         """
2177         if not hasattr(os, "mknod") or not hasattr(os, "makedev"): 2178             raise ExtractError("special devices not supported by system") 2179 
2180         mode = tarinfo.mode 2181         if tarinfo.isblk(): 2182             mode |= stat.S_IFBLK 2183         else: 2184             mode |= stat.S_IFCHR 2185 
2186  os.mknod(targetpath, mode, 2187  os.makedev(tarinfo.devmajor, tarinfo.devminor)) 2188 
    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
           (platform limitation), we try to make a copy of the referenced file
           instead of a link.
        """
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                os.symlink(tarinfo.linkname, targetpath)
            else:
                # See extract().
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)
                else:
                    # Target not on disk: extract its data directly.
                    self._extract_member(self._find_link_target(tarinfo),
                                         targetpath)
        except symlink_exception:
            # Links are unsupported here; fall back to copying the
            # referenced member's data.
            try:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
            except KeyError:
                raise ExtractError("unable to resolve link inside archive")
    def chown(self, tarinfo, targetpath, numeric_owner):
        """Set owner of targetpath according to tarinfo. If numeric_owner
           is True, use .gid/.uid instead of .gname/.uname.
        """
        if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
            if numeric_owner:
                g = tarinfo.gid
                u = tarinfo.uid
            else:
                # Prefer the symbolic names; fall back to the numeric
                # ids when the name is unknown on this system.
                try:
                    g = grp.getgrnam(tarinfo.gname)[2]
                except KeyError:
                    g = tarinfo.gid
                try:
                    u = pwd.getpwnam(tarinfo.uname)[2]
                except KeyError:
                    u = tarinfo.uid
            try:
                if tarinfo.issym() and hasattr(os, "lchown"):
                    # Change the link itself, not its target.
                    os.lchown(targetpath, u, g)
                else:
                    os.chown(targetpath, u, g)
            except OSError as e:
                raise ExtractError("could not change owner")
2238     def chmod(self, tarinfo, targetpath): 2239         """Set file permissions of targetpath according to tarinfo. 2240         """
2241         if hasattr(os, 'chmod'): 2242             try: 2243  os.chmod(targetpath, tarinfo.mode) 2244             except OSError as e: 2245                 raise ExtractError("could not change mode") 2246 
2247     def utime(self, tarinfo, targetpath): 2248         """Set modification time of targetpath according to tarinfo. 2249         """
2250         if not hasattr(os, 'utime'): 2251             return
2252         try: 2253  os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) 2254         except OSError as e: 2255             raise ExtractError("could not change modification time") 2256 
2257     #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        if self.firstmember is not None:
            # A member read ahead during open() is handed out first.
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            # Seek one byte short and read it so a truncated archive
            # fails here with a clean error instead of deeper down.
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise ReadError("unexpected end of data")

        # Read the next block.
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # With ignore_zeros, skip over the zero block and retry.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # A bad header at offset 0 means: not a tar file.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            # No further headers: end of archive reached.
            self._loaded = True

        return tarinfo
2309     #--------------------------------------------------------------------------
2310     # Little helper methods:
2311 
2312     def _getmember(self, name, tarinfo=None, normalize=False): 2313         """Find an archive member by name from bottom to top. 2314  If tarinfo is given, it is used as the starting point. 2315         """
2316         # Ensure that all members have been loaded.
2317         members = self.getmembers() 2318 
2319         # Limit the member search list up to tarinfo.
2320         if tarinfo is not None: 2321             members = members[:members.index(tarinfo)] 2322 
2323         if normalize: 2324             name = os.path.normpath(name) 2325 
2326         for member in reversed(members): 2327             if normalize: 2328                 member_name = os.path.normpath(member.name) 2329             else: 2330                 member_name = member.name 2331 
2332             if name == member_name: 2333                 return member 2334 
2335     def _load(self): 2336         """Read through the entire archive file and look for readable 2337  members. 2338         """
2339         while True: 2340             tarinfo = self.next() 2341             if tarinfo is None: 2342                 break
2343         self._loaded = True 2344 
2345     def _check(self, mode=None): 2346         """Check if TarFile is still open, and if the operation's mode 2347  corresponds to TarFile's mode. 2348         """
2349         if self.closed: 2350             raise OSError("%s is closed" % self.__class__.__name__) 2351         if mode is not None and self.mode not in mode: 2352             raise OSError("bad operation for mode %r" % self.mode) 2353 
2354     def _find_link_target(self, tarinfo): 2355         """Find the target member of a symlink or hardlink member in the 2356  archive. 2357         """
2358         if tarinfo.issym(): 2359             # Always search the entire archive.
2360             linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname))) 2361             limit = None 2362         else: 2363             # Search the archive before the link, because a hard link is
2364             # just a reference to an already archived file.
2365             linkname = tarinfo.linkname 2366             limit = tarinfo 2367 
2368         member = self._getmember(linkname, tarinfo=limit, normalize=True) 2369         if member is None: 2370             raise KeyError("linkname %r not found" % linkname) 2371         return member 2372 
def __iter__(self):
    """Iterate over the archive's members.

    Once the archive has been fully scanned the cached member list is
    used directly; otherwise a lazy TarIter walks the file on demand.
    """
    if not self._loaded:
        return TarIter(self)
    return iter(self.members)
def _dbg(self, level, msg):
    """Emit *msg* on sys.stderr when *level* does not exceed the
    configured debug verbosity (self.debug).
    """
    if level > self.debug:
        return
    print(msg, file=sys.stderr)
def __enter__(self):
    """Context-manager entry: make sure the archive is still open and
    return the TarFile object itself for use in a ``with`` block.
    """
    self._check()
    return self
2391     def __exit__(self, type, value, traceback): 2392         if type is None: 2393  self.close() 2394         else: 2395             # An exception occurred. We must not call close() because
2396             # it would try to write end-of-archive blocks and padding.
2397             if not self._extfileobj: 2398  self.fileobj.close() 2399             self.closed = True 2400 # class TarFile
2401 
class TarIter:
    """Lazy iterator over the members of a TarFile.

        for tarinfo in TarFile(...):
            suite...
    """

    def __init__(self, tarfile):
        """Remember the archive to walk and start before the first member."""
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """An iterator is its own iterator."""
        return self

    def __next__(self):
        """Return the next member, reading further into the archive when
        needed; mark the TarFile as fully loaded once exhausted.
        """
        # Fix for SF #1100429: getmembers() may be called during
        # iteration, which appends to tarfile.members behind our back;
        # serve cached entries first so iteration does not stop early.
        tf = self.tarfile
        if self.index == 0 and tf.firstmember is not None:
            tarinfo = tf.next()
        elif self.index < len(tf.members):
            tarinfo = tf.members[self.index]
        elif not tf._loaded:
            tarinfo = tf.next()
            if not tarinfo:
                tf._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
2440 #--------------------
2441 # exported functions
2442 #--------------------
def is_tarfile(name):
    """Return True if *name* points to a tar archive this module can
    read, False otherwise.
    """
    try:
        # open here is the module-level alias for TarFile.open.
        archive = open(name)
        archive.close()
    except TarError:
        return False
    return True
# Module-level convenience alias: tarfile.open(...) is TarFile.open(...).
# NOTE: this deliberately shadows the builtin open() inside this module.
open = TarFile.open
2456 
def main():
    """Command-line entry point: test, list, extract, or create a tar
    archive depending on the mutually exclusive option chosen.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='A simple command line interface for tarfile module.')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    actions = parser.add_mutually_exclusive_group()
    actions.add_argument('-l', '--list', metavar='<tarfile>',
                         help='Show listing of a tarfile')
    actions.add_argument('-e', '--extract', nargs='+',
                         metavar=('<tarfile>', '<output_dir>'),
                         help='Extract tarfile into target dir')
    actions.add_argument('-c', '--create', nargs='+',
                         metavar=('<name>', '<file>'),
                         help='Create tarfile from sources')
    actions.add_argument('-t', '--test', metavar='<tarfile>',
                         help='Test if a tarfile is valid')
    args = parser.parse_args()

    if args.test:
        src = args.test
        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        with open(src, 'r') as tar:
            # Reading the member list verifies the archive's integrity.
            tar.getmembers()
            print(tar.getmembers(), file=sys.stderr)
        if args.verbose:
            print('{!r} is a tar archive.'.format(src))

    elif args.list:
        src = args.list
        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        with TarFile.open(src, 'r:*') as tf:
            tf.list(verbose=args.verbose)

    elif args.extract:
        if len(args.extract) == 1:
            src, curdir = args.extract[0], os.curdir
        elif len(args.extract) == 2:
            src, curdir = args.extract
        else:
            parser.exit(1, parser.format_help())
        if not is_tarfile(src):
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
        with TarFile.open(src, 'r:*') as tf:
            tf.extractall(path=curdir)
        if args.verbose:
            if curdir == '.':
                msg = '{!r} file is extracted.'.format(src)
            else:
                msg = ('{!r} file is extracted '
                       'into {!r} directory.').format(src, curdir)
            print(msg)

    elif args.create:
        tar_name = args.create.pop(0)
        _, ext = os.path.splitext(tar_name)
        # Map the archive suffix to the matching compression mode.
        compressions = {
            # gz
            '.gz': 'gz',
            '.tgz': 'gz',
            # xz
            '.xz': 'xz',
            '.txz': 'xz',
            # bz2
            '.bz2': 'bz2',
            '.tbz': 'bz2',
            '.tbz2': 'bz2',
            '.tb2': 'bz2',
        }
        tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w'
        with TarFile.open(tar_name, tar_mode) as tf:
            for file_name in args.create:
                tf.add(file_name)
        if args.verbose:
            print('{!r} file created.'.format(tar_name))

    else:
        parser.exit(1, parser.format_help())
# Allow the module to be run as a script: python -m tarfile ...
if __name__ == '__main__':
    main()
View tarfile Code

 

PyYAML模塊 

Python也能夠很容易的處理yaml文檔格式,只不過須要安裝一個模塊,參考文檔:http://pyyaml.org/wiki/PyYAMLDocumentation。

 

re正則表達式

 

正則表達式:
    
    '.'     默認匹配除\n以外的任意一個字符,若指定flag DOTALL,則匹配任意字符,包括換行
    '^'     匹配字符開頭,若指定flags MULTILINE,這種也能夠匹配上(r"^a","\nabc\neee",flags=re.MULTILINE)
    '$'     匹配字符結尾,或re.search("foo$","bfoo\nsdfsf",flags=re.MULTILINE).group()也能夠
    '*'     匹配*號前的字符0次或屢次,re.findall("ab*","cabb3abcbbac")  結果爲['abb', 'ab', 'a']
    '+'     匹配前一個字符1次或屢次,re.findall("ab+","ab+cd+abb+bba") 結果['ab', 'abb']
    '?'     匹配前一個字符1次或0次
    '{m}'   匹配前一個字符m次
    '{n,m}' 匹配前一個字符n到m次,re.findall("ab{1,3}","abb abc abbcbbb") 結果['abb', 'ab', 'abb']
    '|'     匹配|左或|右的字符,re.search("abc|ABC","ABCBabcCD").group() 結果'ABC'
    '(...)' 分組匹配,re.search("(abc){2}a(123|456)c", "abcabca456c").group() 結果 abcabca456c
     
     
    '\A'    只從字符開頭匹配,re.search("\Aabc","alexabc") 是匹配不到的
    '\Z'    匹配字符結尾,同$
    '\d'    匹配數字0-9
    '\D'    匹配非數字
    '\w'    匹配[A-Za-z0-9]
    '\W'    匹配非[A-Za-z0-9]
    '\s'     匹配空白字符、\t、\n、\r , re.search("\s+","ab\tc1\n3").group() 結果 '\t'
    
    

    re.match 從頭開始匹配
    re.search 匹配包含
    re.findall 把全部匹配到的字符放到以列表中的元素返回
    re.split    以匹配到的字符當作列表分隔符
    re.sub      匹配字符並替換
    
    
    匹配模式:
        re.I:忽略大小寫
        re.M:多行模式,改變'^'和'$'的行爲
        re.S:點任意匹配模式,改變'.'的行爲

 

  

import re

# Regular-expression examples (see the syntax table above).
# All patterns use raw strings: "\w"/"\d" in a plain string are invalid
# escape sequences (a SyntaxWarning since Python 3.12).

# Starts with "A", then 1-7 word chars, 1+ digits, more word chars,
# and the string must end with "n".
print(re.match(r"^A\w{1,7}\d+\w*n$", "Allister12365HaoSen"))

# 18-digit ID card: 17 leading digits, final char a digit, "x" or "X".
print(re.match(r"^\d{17}(\d|x|X)$", "42210319630213275X"))
# 15-digit ID card: starts with 15 digits.
print(re.match(r"^\d{15}", "422103196302132"))

# Starts with "A", one or more ASCII letters, followed by "r".
print(re.search(r"^A[a-zA-Z]+r", "Allister123Allister&ds"))

# '?' matches the preceding character 0 or 1 times.
print(re.search(r"aaa?", "aaEEEEaaa"))  # aa

print(re.findall(r"abf?.", "abf%dafsgaabfterftw"))

# Split the string on runs of digits.
print(re.split(r"[0-9]+", "rf123Allister89ljp"))  # ['rf', 'Allister', 'ljp']

# sub(pattern, repl, string, count=0, flags=0): replace matches,
# optionally limiting how many are replaced.  count is passed by
# keyword — positional count is deprecated since Python 3.13.
print(re.sub(r"[0-9]+", "|", "rf123Allister89ljp", count=5))  # rf|Allister|ljp

# Decompose an ID-card number into province/city/county/year/month/day.
# {'province': '51', 'city': '09', 'county': '21', 'year': '1990', 'month': '08', 'day': '06'}
print(re.search(r"(?P<province>[0-9]{2})(?P<city>[0-9]{2})(?P<county>[0-9]{2})"
                r"(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})",
                "51092119900806181X").groupdict())

# Flags: re.I ignore case, re.M multi-line ^/$, re.S lets '.' match '\n'.
print(re.search(r"[a-z]+", "abcdEFg", re.I))  # abcdEFg

  

 

# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : re_test.py
# @Author: Allister.Liu
# @Date  : 2018/1/22
# @Desc  : Regular-expression examples

import re

"""
Regular-expression syntax reference:

    '.'     any character except '\n' (with re.DOTALL also '\n')
    '^'     start of the string (each line with re.MULTILINE)
    '$'     end of the string (each line with re.MULTILINE)
    '*'     0 or more of the preceding element
    '+'     1 or more of the preceding element
    '?'     0 or 1 of the preceding element
    '{m}'   exactly m repetitions
    '{n,m}' between n and m repetitions
    '|'     alternation: match the left or the right pattern
    '(...)' grouping

    '\A'    match only at the start of the string
    '\Z'    match only at the end of the string (like $)
    '\d'    a digit 0-9            '\D'  a non-digit
    '\w'    a word char [A-Za-z0-9_]   '\W'  a non-word char
    '\s'    whitespace (space, \t, \n, \r)

    re.match    match from the start of the string
    re.search   match anywhere in the string
    re.findall  return all matches as a list
    re.split    split the string on the pattern
    re.sub      replace matches

    Flags:
        re.I: ignore case
        re.M: multi-line mode, changes '^' and '$'
        re.S: dot-all mode, changes '.'
"""

# NOTE: all patterns below use raw strings — "\w"/"\d" in a plain string
# are invalid escape sequences (a SyntaxWarning since Python 3.12).

# Starts with "A", then 1-7 word chars, 1+ digits, more word chars,
# and the string must end with "n".
print(re.match(r"^A\w{1,7}\d+\w*n$", "Allister12365HaoSen"))

# 18-digit ID card: 17 leading digits, final char a digit, "x" or "X".
print(re.match(r"^\d{17}(\d|x|X)$", "42210319630213275X"))
# 15-digit ID card: starts with 15 digits.
print(re.match(r"^\d{15}", "422103196302132"))

# Starts with "A", one or more ASCII letters, followed by "r".
print(re.search(r"^A[a-zA-Z]+r", "Allister123Allister&ds"))

# '?' matches the preceding character 0 or 1 times.
print(re.search(r"aaa?", "aaEEEEaaa"))  # aa

print(re.findall(r"abf?.", "abf%dafsgaabfterftw"))

# Split the string on runs of digits.
print(re.split(r"[0-9]+", "rf123Allister89ljp"))  # ['rf', 'Allister', 'ljp']

# sub(pattern, repl, string, count=0, flags=0): replace matches,
# optionally limiting how many are replaced.  count is passed by
# keyword — positional count is deprecated since Python 3.13.
print(re.sub(r"[0-9]+", "|", "rf123Allister89ljp", count=5))  # rf|Allister|ljp

# Decompose an ID-card number into province/city/county/year/month/day.
# {'province': '51', 'city': '09', 'county': '21', 'year': '1990', 'month': '08', 'day': '06'}
print(re.search(r"(?P<province>[0-9]{2})(?P<city>[0-9]{2})(?P<county>[0-9]{2})"
                r"(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})",
                "51092119900806181X").groupdict())

# re.I: ignore case.
print(re.search(r"[a-z]+", "abcdEFg", re.I))  # abcdEFg
相關文章
相關標籤/搜索