在機器學習中免不了要和圖片打交道。有時候收集來的圖片,其後綴名和實際類型並不一致,這時就需要對圖片文件進行批量處理。簡單粗暴,直接上代碼(Python):
import struct
import os
def gci(filepath):
    """Recursively delete every file under *filepath* that is not a JPEG.

    Each directory entry is visited; sub-directories are descended into
    recursively, and every other entry is classified with ``filetype``.
    Any file whose detected type is not ``'JPEG'`` (including PNG, GIF and
    unrecognised files) has its path printed and is then removed with
    ``os.remove``.

    WARNING: this is destructive -- removed files are not recoverable
    through this script.

    Args:
        filepath: Directory to scan.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be
            opened or removed.
    """
    for fi in os.listdir(filepath):
        fi_d = os.path.join(filepath, fi)
        if os.path.isdir(fi_d):
            gci(fi_d)
            continue
        # fi_d already contains filepath; joining it with filepath a second
        # time duplicates the prefix for relative roots ("a" + "a/x.jpg"
        # -> "a/a/x.jpg") and points at the wrong file.
        if filetype(fi_d) != 'JPEG':
            print(fi_d)
            os.remove(fi_d)
def typeList():
    """Return the table of known image signatures.

    Returns:
        A new dict mapping the upper-case hexadecimal string of a file's
        leading bytes to the image type name it identifies.
    """
    return {
        "FFD8FF": "JPEG",
        "89504E47": "PNG",
        "47494638": "GIF",
    }
def bytes2hex(bytes):
    """Render a sequence of integers as one upper-case hexadecimal string.

    Each value is formatted in hex and left-padded with a single ``0`` when
    its digit count is odd, so byte values (0-255) always occupy exactly two
    characters.

    Args:
        bytes: Iterable of integers, e.g. the tuple produced by
            ``struct.unpack`` with ``"B"`` codes. (The parameter name
            shadows the builtin; it is kept for caller compatibility.)

    Returns:
        The concatenated upper-case hex digits; ``""`` for empty input.
    """
    pieces = []
    for value in bytes:
        digits = u"%x" % value
        if len(digits) % 2:
            # Pad to an even number of digits so 0xA becomes "0a", not "a".
            digits = u"0" + digits
        pieces.append(digits)
    return u"".join(pieces).upper()
def filetype(filename):
    """Identify an image file by the signature in its leading bytes.

    The start of the file is compared against every signature returned by
    ``typeList``; the file extension is never consulted.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The type name of the first matching signature, or ``'unknown'``
        when no signature matches (including files shorter than a
        signature).

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    tl = typeList()
    # The context manager closes the file even if reading raises.
    with open(filename, 'rb') as binfile:
        for hcode in tl:
            # Two hex digits per byte. Floor division keeps this an int on
            # Python 3, where "/" would yield a float and break both the
            # format-string repetition and read().
            numOfBytes = len(hcode) // 2
            binfile.seek(0)
            header = binfile.read(numOfBytes)
            if len(header) < numOfBytes:
                # File is shorter than this signature: it cannot match, and
                # unpack_from would raise struct.error on the short buffer.
                continue
            hbytes = struct.unpack_from("B" * numOfBytes, header)
            if bytes2hex(hbytes) == hcode:
                return tl[hcode]
    return 'unknown'
# Script entry: run the cleanup on the placeholder folder path below.
# NOTE: gci removes (os.remove) every file it does not classify as JPEG,
# so substitute the real image folder with care before running.
gci('your_img_folder_path')
複製代碼