import pandas as pd

# Load the game-log dataset.
game_logs_path = './pandas/data/game_logs.csv'
gl = pd.read_csv(game_logs_path)

# Report the frame's memory consumption; 'deep' also measures the contents
# of object columns instead of only the pointers to them.
gl.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 171907 entries, 0 to 171906 Columns: 161 entries, date to acquisition_info dtypes: float64(77), int64(6), object(78) memory usage: 859.4 MB
# Print the mean per-column memory footprint, in MB, for each dtype family.
# memory_usage() reports the index as well by default, so the index entry
# takes part in the mean.
for dtype in ('float64', 'object', 'int64'):
    columns_of_dtype = gl.select_dtypes(include=[dtype])
    mean_bytes = columns_of_dtype.memory_usage(deep=True).mean()
    mean_megabytes = mean_bytes / 1024 / 1024
    print('[%s] memory usage %0.2f MB' % (dtype, mean_megabytes))
[float64] memory usage 1.29 MB [object] memory usage 9.50 MB [int64] memory usage 1.12 MB
import numpy as np

# Show the value ranges of uint8, int8, int16, int32 and int64.
INTEGER_DTYPES = ('uint8', 'int8', 'int16', 'int32', 'int64')
for type_info in map(np.iinfo, INTEGER_DTYPES):
    print(type_info)
Machine parameters for uint8 --------------------------------------------------------------- min = 0 max = 255 --------------------------------------------------------------- Machine parameters for int8 --------------------------------------------------------------- min = -128 max = 127 --------------------------------------------------------------- Machine parameters for int16 --------------------------------------------------------------- min = -32768 max = 32767 --------------------------------------------------------------- Machine parameters for int32 --------------------------------------------------------------- min = -2147483648 max = 2147483647 --------------------------------------------------------------- Machine parameters for int64 --------------------------------------------------------------- min = -9223372036854775808 max = 9223372036854775807 ---------------------------------------------------------------
# Memory used by the data before and after type conversion.
def mem_usage(data):
    """Return the deep memory footprint of *data* formatted as ``"<n> MB"``.

    For a DataFrame the per-column figures are summed; any other object is
    expected to expose ``memory_usage(deep=True)`` returning a single number
    (as a Series does).
    """
    if isinstance(data, pd.DataFrame):
        mem_b = data.memory_usage(deep=True).sum()
    else:
        mem_b = data.memory_usage(deep=True)
    return "{:03.2f} MB".format(mem_b / 1024**2)


gl_int64 = gl.select_dtypes(include=['int64'])
# Downcast the int64 columns selected just above (gl_int64).
# NOTE: despite its name, gl_int32 holds whichever unsigned integer type
# pd.to_numeric chooses for each column; the name is kept because later
# code refers to it.
gl_int32 = gl_int64.apply(pd.to_numeric, downcast='unsigned')
print(mem_usage(gl_int64))
print(mem_usage(gl_int32))

# Downcast float64 columns to a smaller float type.
gl_float64 = gl.select_dtypes(include=['float64'])
gl_float = gl_float64.apply(pd.to_numeric, downcast='float')
print("轉換前:"+mem_usage(gl_float64))
print("轉換後"+mem_usage(gl_float))
7.87 MB 1.48 MB 轉換前:100.99 MB 轉換後50.49 MB
# Build an optimized copy of the data: start from the original frame and
# overwrite the numeric columns with their downcast versions.
opt_gl = gl.copy()
for downcast_frame in (gl_int32, gl_float):
    opt_gl[downcast_frame.columns] = downcast_frame

# Compare the footprint of the original and the optimized frame.
print("原數據的大小:"+mem_usage(gl))
print("轉換後的數據大小:"+mem_usage(opt_gl))
原數據的大小:859.43 MB 轉換後的數據大小:802.54 MB
# Work on an independent copy of the object-dtype columns and print their
# summary statistics.
object_columns = gl.select_dtypes(include=['object'])
gl_obj = object_columns.copy()
object_summary = gl_obj.describe()
print(object_summary)
day_of_week v_name v_league h_name h_league day_night \ count 171907 171907 171907 171907 171907 140150 unique 7 148 7 148 7 2 top Sat CHN NL CHN NL D freq 28891 8870 88866 9024 88867 82724 completion forefeit protest park_id ... h_player_6_id \ count 116 145 180 171907 ... 140838 unique 116 3 5 245 ... 4774 top 19590602,PIT06,2,1,39 H V STL07 ... grimc101 freq 1 69 90 7022 ... 427 h_player_6_name h_player_7_id h_player_7_name h_player_8_id \ count 140838 140838 140838 140838 unique 4720 5253 5197 4760 top Charlie Grimm grimc101 Charlie Grimm lopea102 freq 427 491 491 676 h_player_8_name h_player_9_id h_player_9_name additional_info \ count 140838 140838 140838 1456 unique 4710 5193 5142 332 top Al Lopez spahw101 Warren Spahn HTBF freq 676 339 339 1112 acquisition_info count 140841 unique 1 top Y freq 140841 [4 rows x 78 columns]
# Take the day_of_week column as an example of an object column.
dow = gl_obj['day_of_week']
print(dow.head())

# The same column converted to the 'category' dtype.
dow_cat = dow.astype('category')
print(dow_cat.head())

# Memory footprint before and after the conversion.
for label, series in (("轉換前", dow), ("轉換後", dow_cat)):
    print(label+mem_usage(series))
# Convert columns with many repeated values to 'category' to shrink memory.
# A column is converted when its distinct values make up less than half of
# its entries; every other column is carried over unchanged.
convert_obj = pd.DataFrame()
for col in gl_obj.columns:
    column = gl_obj[col]
    distinct_ratio = len(column.unique()) / len(column)
    convert_obj.loc[:, col] = (
        column.astype('category') if distinct_ratio < 0.5 else column
    )

print('數據轉換前:'+mem_usage(gl_obj))
print('數據轉換後:'+mem_usage(convert_obj))
# Write the converted object columns back into the optimized frame and
# print its resulting memory footprint.
converted_columns = convert_obj.columns
opt_gl[converted_columns] = convert_obj
print(mem_usage(opt_gl))
# apply 操作
# Load the Titanic training data and look at the row at position 99.
titanic_path = './pandas/data/titanic_train.csv'
titanic = pd.read_csv(titanic_path)
titanic.iloc[99]
# Fetch the data of the row at position 99.
def get_row(data):
    """Return the element at position 99 (zero-based) of *data*."""
    return data.iloc[99]


# With axis=0 (the default) apply() hands over one column at a time, so
# collecting element 99 of every column yields that row.
row = titanic.apply(get_row, axis=0)
row
# Count the null entries in every column.
def get_null_count(data):
    """Return how many entries of *data* are null."""
    missing_entries = data[pd.isnull(data)]
    return len(missing_entries)


null_count = titanic.apply(get_null_count)
print(null_count)
# Data transformation: map the numeric passenger class to a text label.
def which_class(row):
    """Return a text label for the row's 'Pclass' value.

    Returns "Unknown" when the value is null, "One"/"Two"/"Three" for
    1/2/3, and None for any other value.
    """
    # Lookup table replacing the if/elif chain; the labels are spelled
    # correctly ("Two", "Unknown") so the derived data is usable.
    labels = {1: "One", 2: "Two", 3: "Three"}
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    return labels.get(pclass)


# axis=1 makes apply() pass one row at a time.
classes = titanic.apply(which_class, axis=1)
print(classes)
# Find the records of passengers who are minors.
def is_minor(row):
    """Return True when the row's 'Age' is below 18, otherwise False."""
    return bool(row['Age'] < 18)


# Evaluate the predicate row by row and use the result as a boolean mask.
minor = titanic.apply(is_minor, axis=1)
print(titanic[minor])