DataFrame 是一個帶有索引的二維數據結構,每列可以有自己的名字,並且可以有不同的數據類型。你可以把它想像成一個 Excel 表格或者數據庫中的一張表。DataFrame 是最常用的 Pandas 對象。
index = pd.Index(data=["Tom", "Bob", "Mary", "James"], name="name") data = { "age": [18, 30, 25, 40], "city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen"] } user_info = pd.DataFrame(data=data, index=index) user_info Out[35]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen
data = [{'name':'xiaohong','age':23,'tel':10086},{'name':'xiaogang','age':12},{'name':'xiaozhang','tel':10010}] user_info = pd.DataFrame(data=data) user_info Out[36]: age name tel 0 23.0 xiaohong 10086.0 1 12.0 xiaogang NaN 2 NaN xiaozhang 10010.0
data = [[18, "BeiJing"], [30, "ShangHai"], [25, "GuangZhou"], [40, "ShenZhen"]] columns = ["age", "city"] user_info = pd.DataFrame(data=data, index=index, columns=columns) user_info Out[37]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen
result = {'name': 'zhangyafei','age': 24, 'city':'shanxi','weather':'sunny','date':'2019-3-11'} data = pd.DataFrame.from_dict(result,orient='index').T data Out[44]: name age city weather date 0 zhangyafei 24 shanxi sunny 2019-3-11
index = pd.Index(data=["Tom", "Bob", "Mary", "James"], name="name") data = { "age": [18, 30, 25, 40], "city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen"] } df = pd.DataFrame(data=data, index=index) df Out[45]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen
# Add a row.
# NOTE: enlarging a DataFrame through .loc is very inefficient; do not use it
# inside a loop.
df.loc[len(df)] = [23, 'shanxi']

# Add a column. The list needs exactly one value per row; df has five rows
# after the append above, so five values are required (four would raise
# ValueError).
df['sex'] = [1, 1, 1, 0, 1]

# assign() returns a new DataFrame with the extra column; df itself is unchanged.
df.assign(age_add_one=df.age + 1)
df.loc[len(df)] = [23, 'shanxi'] df Out[47]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen 4 23 shanxi df['sex'] = [1,1,1,0,1] df Out[49]: age city sex name Tom 18 BeiJing 1 Bob 30 ShangHai 1 Mary 25 GuangZhou 1 James 40 ShenZhen 0 4 23 shanxi 1 df.assign(age_add_one = user_info["age"] + 1) Out[79]: age city age_add_one name Tom 18 BeiJing 19 Bob 30 ShangHai 31 Mary 25 GuangZhou 26 James 40 ShenZhen 41
# Remove a row by its index label.
# With inplace=True, drop() modifies df and returns None, so the result must
# NOT be assigned back: ``df = df.drop(..., inplace=True)`` would rebind df to None.
df.drop(4, axis=0, inplace=True)
# Equivalent without inplace:
#     df = df.drop(4, axis=0)

# Remove a column by name. Use ONE of the three forms below; once the column
# is gone, any further attempt to remove it raises KeyError.
df.drop('sex', axis=1, inplace=True)   # in place, returns None
# df.pop('sex')                        # in place, returns the removed column as a Series
# del df['sex']                        # in place, no return value
df.drop('sex', axis=1, inplace=True) df Out[65]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen 23 shanxi df.drop(4,axis=0, inplace=True) df Out[67]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen df['sex'] = [1,1,0,1] df Out[71]: age city sex name Tom 18 BeiJing 1 Bob 30 ShangHai 1 Mary 25 GuangZhou 0 James 40 ShenZhen 1 del df['sex'] df Out[73]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen df.pop('sex') Out[77]: name Tom 1 Bob 1 Mary 0 James 1 Name: sex, dtype: int64
# Option 1: overwrite the labels wholesale. The new list must contain exactly
# one label per existing column / row, otherwise pandas raises ValueError.
df.columns = ['Age', 'City', 'Sex']          # assumes df currently has the columns age, city, sex
df.index = ['tom', 'bob', 'mary', 'james']   # one label for each of the four rows

# Option 2 (recommended): rename only the labels listed in the mapping.
# rename() returns a new DataFrame; assign the result (or pass inplace=True)
# if the change should be kept.
df.rename(columns={"age": "Age", "city": "City", "sex": "Sex"})
df.rename(index={"Tom": "tom", "Bob": "bob"})
# --- Rows ---
df[0:4:2]             # slice rows by position
df.loc['Tom',]        # select a row by index label
df.iloc[:1,]          # select rows by position

# --- Columns ---
df['age']             # by column name; pass a list of names for several columns
df.loc[:, 'age']
df.iloc[:, 0:1]

# --- Rows and columns together ---
df.loc['Tom', 'age']  # by index label and column name
df.iloc[:1, :1]       # by row and column position

# --- Boolean-mask selection ---
df[df.age >= 30]
df[1:2] Out[82]: age city name Bob 30 ShangHai df[0:4:2] Out[83]: age city name Tom 18 BeiJing Mary 25 GuangZhou df.loc['Tom',] Out[88]: age 18 city BeiJing Name: Tom, dtype: object df Out[89]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen df.iloc[:1,] Out[90]: age city name Tom 18 BeiJing df['age'] Out[91]: name Tom 18 Bob 30 Mary 25 James 40 Name: age, dtype: int64 df[['age','city']] Out[92]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen df.loc[:, 'age'] Out[94]: name Tom 18 Bob 30 Mary 25 James 40 Name: age, dtype: int64 df.iloc[:, 1] Out[95]: name Tom BeiJing Bob ShangHai Mary GuangZhou James ShenZhen Name: city, dtype: object df.iloc[:, 0] Out[96]: name Tom 18 Bob 30 Mary 25 James 40 Name: age, dtype: int64 df.iloc[:, 0:1] Out[97]: age name Tom 18 Bob 30 Mary 25 James 40 df.loc['Tom', 'age'] Out[98]: 18 df.iloc[:1, :1] Out[99]: age name Tom 18 df Out[100]: age city name Tom 18 BeiJing Bob 30 ShangHai Mary 25 GuangZhou James 40 ShenZhen df[df.age >= 30] Out[101]: age city name Bob 30 ShangHai James 40 ShenZhen df = pd.DataFrame({'BoolCol': [1, 2, 3, 3, 4],'attr': [22, 33, 22, 44, 66]}, index=[10,20,30,40,50]) print(df) value= df[(df.BoolCol==3)&(df.attr==22)].values.tolist()[0] type(value) print(" ".join(str(id) for id in value)) index = df[(df.BoolCol==3)&(df.attr==22)].index.tolist() print(index) BoolCol attr 10 1 22 20 2 33 30 3 22 40 3 44 50 4 66 3 22 [30]
# Iterate over the column names (iterating a DataFrame yields its column labels).
for r in df:
    print(r)

# Iterate over the columns: look each column up by its label.
for cName in df:
    print('df的列:\n', cName)
    print('df的值:\n', df[cName])
    print("-"*10)


# ---- Iterating over rows ----

# Method 1 (recommended): apply() with axis=1 passes each row to the function.
def new_data(row):
    """Split a trailing '(alias)' out of the drug name into an alias column.

    Reads row['藥品名稱'], splits it at the last full-width '(' and stores the
    part after it (with the full-width ')' stripped) in row['別名'], leaving the
    part before it in row['藥品名稱']. When the name contains no '(' the alias
    is set to NaN and the name is left untouched.

    Returns the modified row.
    """
    drug_name = row['藥品名稱']
    try:
        # [1] raises IndexError when there is no '(' to split on.
        row['別名'] = drug_name.rsplit('(', 1)[1].strip(')')
        row['藥品名稱'] = drug_name.rsplit('(', 1)[0]
    except IndexError:
        # np.nan: the np.NAN alias was removed in NumPy 2.0.
        row['別名'] = np.nan
    return row


new_drug = data.apply(new_data, axis=1)

# Method 2: DataFrame.iterrows() yields (index label, row Series) pairs.
for key, row in data.iterrows():
    # The cell is already a plain str; it has no ``.values`` attribute.
    drug_name = row['藥品名稱']
    parts = drug_name.rsplit('(', 1)
    # Names without a '(' have no alias; avoid the IndexError on parts[1].
    drug_alias = parts[1].strip(')') if len(parts) == 2 else None
    print(drug_name)
    print(drug_alias)

# Method 3: iterate over the index labels.
# .loc looks rows up by label, so this also works when the index is not 0..n-1
# (.iloc expects integer positions, not labels).
resoved_drug_list = []
for row in data.index:
    drug_name = '{}[{}]'.format(data.loc[row, '藥品名稱'], data.loc[row, '藥品ID'])
    resoved_drug_list.append(drug_name)

# Method 4: iterate over the underlying array, one row at a time.
for r in df.values:
    print(r)
    print(r[0])
    print(r[1])
    print('-'*10)

# Method 5: positional while loop over a DataFrame.
df = pd.DataFrame({
    'age': pd.Series([21, 22, 23]),
    'name': pd.Series(['zhang', 'liu', 'kang']),
})
rowCount = len(df)
i = 0
while i < rowCount:
    print(df.iloc[i])
    i += 1


# ---- Supplement: iterating over other objects ----

# Iterate over a string.
for letter in 'python':
    print('如今是:', letter)

# Iterate over a list.
fruits = ['banana', 'apple', 'mango']
for fruit in fruits:
    print('如今是:', fruit)

# Iterate over a Series.
x = pd.Series(['a', True, 1], index=['first', 'second', 'third'])
x.iloc[0]      # by position; plain x[0] on a label-indexed Series is deprecated
x['second']    # by label
x.iloc[2]
for v in x:
    print('x中的值:', v)
for index in x.index:
    print('X中的索引:', index)
    print('x中的值:', x[index])
    print('*'*10)
雖然 Pandas 爲我們提供了非常豐富的函數,但有時候我們可能需要自己定製一些函數,並將它應用到 DataFrame 或 Series。常用到的函數有:map、apply、applymap(自 pandas 2.1 起,DataFrame.applymap 已更名爲 DataFrame.map)。
index = pd.Index(data=["Tom", "Bob", "Mary", "James"], name="name") data = { "age": [18, 30, 25, 40], "city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen"], "sex": ["male", "male", "female", "male"] } user_info = pd.DataFrame(data=data, index=index) user_info Out[117]: age city sex name Tom 18 BeiJing male Bob 30 ShangHai male Mary 25 GuangZhou female James 40 ShenZhen male
# 經過年齡判斷用戶是否屬於中年人(30歲以上爲中年) user_info.age.map(lambda x: "yes" if x >= 30 else "no") Out[118]: name Tom no Bob yes Mary no James yes Name: age, dtype: object # 經過城市來判斷是南方仍是北方 city_map = { "BeiJing": "north", "ShangHai": "south", "GuangZhou": "south", "ShenZhen": "south" } user_info.city.map(city_map) Out[119]: name Tom north Bob south Mary south James south Name: city, dtype: object # 求每一列的最大值 user_info.apply(lambda x: x.max(), axis=0) Out[120]: age 40 city ShenZhen sex male dtype: object # 將每一個值轉化爲小寫字符串 user_info.applymap(lambda x: str(x).lower()) Out[121]: age city sex name Tom 18 beijing male Bob 30 shanghai male Mary 25 guangzhou female James 40 shenzhen male
index = pd.Index(data=["Tom", "Bob", "Mary", "James"], name="name") data = { "age": [18, 30, 25, 40], "city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen"], "sex": ["male", "male", "female", "male"] } user_info = pd.DataFrame(data=data, index=index) user_info Out[122]: age city sex name Tom 18 BeiJing male Bob 30 ShangHai male Mary 25 GuangZhou female James 40 ShenZhen male
user_info.shape      # shape as (rows, columns)
user_info.columns    # column labels
user_info.dtypes     # dtype of every column (a DataFrame has ``dtypes``, not ``dtype``)
user_info.ndim       # number of dimensions
user_info.T          # transpose; same as user_info.transpose()
user_info.values     # the data as a NumPy array
user_info.index      # row labels
user_info.shape Out[127]: (4, 3) user_info.T Out[128]: name Tom Bob Mary James age 18 30 25 40 city BeiJing ShangHai GuangZhou ShenZhen sex male male female male user_info.values Out[129]: array([[18, 'BeiJing', 'male'], [30, 'ShangHai', 'male'], [25, 'GuangZhou', 'female'], [40, 'ShenZhen', 'male']], dtype=object) user_info.index Out[130]: Index(['Tom', 'Bob', 'Mary', 'James'], dtype='object', name='name') user_info.ndim Out[158]: 2 user_info.columns Out[159]: Index(['age', 'city', 'sex', 'height'], dtype='object') user_info.dtypes Out[160]: age int64 city object sex object height object dtype: object
# Overall summary: index, columns, non-null counts, dtypes, memory usage.
user_info.info()

# First five rows (5 is the default for head()).
user_info.head(5)

# Last five rows (5 is the default for tail()).
user_info.tail(5)

# Names of the columns that have a particular dtype.
user_info.select_dtypes(include=['float64']).columns
user_info.info() <class 'pandas.core.frame.DataFrame'> Index: 4 entries, Tom to James Data columns (total 3 columns): age 4 non-null int64 city 4 non-null object sex 4 non-null object dtypes: int64(1), object(2) memory usage: 288.0+ bytes user_info.head() Out[125]: age city sex name Tom 18 BeiJing male Bob 30 ShangHai male Mary 25 GuangZhou female James 40 ShenZhen male user_info.tail() Out[126]: age city sex name Tom 18 BeiJing male Bob 30 ShangHai male Mary 25 GuangZhou female James 40 ShenZhen male user_info.select_dtypes(include=['object']).columns Out[175]: Index(['city', 'sex', 'height'], dtype='object') user_info.select_dtypes(include=['int64']).columns Out[179]: Index(['age'], dtype='object')
# Total and running total of one column.
user_info["age"].sum()
user_info["age"].cumsum()

# Summary statistics of the numeric columns.
user_info.describe()
# Summary of the non-numeric (object) columns.
user_info.describe(include=['object'])

# How often each value occurs in a column -- comparable to grouping by it.
user_info["sex"].value_counts()
user_info.groupby('sex')['sex'].count()

# Index label of the largest / smallest value in a column.
user_info["age"].idxmax()
user_info["age"].idxmin()
user_info.age.sum() Out[131]: 113 user_info.age.cumsum() Out[133]: name Tom 18 Bob 48 Mary 73 James 113 Name: age, dtype: int64 user_info.describe() Out[134]: age count 4.000000 mean 28.250000 std 9.251126 min 18.000000 25% 23.250000 50% 27.500000 75% 32.500000 max 40.000000 user_info.sex.value_counts() Out[135]: male 3 female 1 Name: sex, dtype: int64 user_info.groupby('sex')['sex'].count() Out[136]: sex female 1 male 3 Name: sex, dtype: int64 user_info.age.idxmax() Out[137]: 'James' user_info.age.idxmin() Out[138]: 'Tom'
# cut() bins by value: an integer asks for that many equal-width bins ...
pd.cut(user_info.age, bins=3)
# ... while a list gives the bin edges explicitly.
pd.cut(user_info.age, bins=[1, 18, 30, 50])
# labels= names the resulting bins.
pd.cut(user_info.age, bins=[1, 18, 30, 50], labels=["childhood", "youth", "middle"])

# qcut() also discretizes, but by quantiles: where cut() splits on the size of
# the values, qcut() splits on how many values fall into each bin.
pd.qcut(user_info.age, q=3)
pd.cut(user_info.age,3) Out[140]: name Tom (17.978, 25.333] Bob (25.333, 32.667] Mary (17.978, 25.333] James (32.667, 40.0] Name: age, dtype: category Categories (3, interval[float64]): [(17.978, 25.333] < (25.333, 32.667] < (32.667, 40.0]] pd.cut(user_info.age, [1, 18, 30, 50]) Out[141]: name Tom (1, 18] Bob (18, 30] Mary (18, 30] James (30, 50] Name: age, dtype: category Categories (3, interval[int64]): [(1, 18] < (18, 30] < (30, 50]] pd.cut(user_info.age, [1, 18, 30, 50], labels=["childhood", "youth", "middle"]) Out[142]: name Tom childhood Bob youth Mary youth James middle Name: age, dtype: category Categories (3, object): [childhood < youth < middle] pd.qcut(user_info.age, 3) Out[143]: name Tom (17.999, 25.0] Bob (25.0, 30.0] Mary (17.999, 25.0] James (30.0, 40.0] Name: age, dtype: category Categories (3, interval[float64]): [(17.999, 25.0] < (25.0, 30.0] < (30.0, 40.0]]
# Sorting
# pandas supports two kinds of sorting: by axis labels (index or columns) and
# by the actual values.

# sort_index() sorts by the index in ascending order by default.
user_info.sort_index()
# axis=1 sorts the column labels instead; ascending=False reverses the order.
user_info.sort_index(axis=1, ascending=False)

# Sort by the values of one column, or of several columns in turn.
user_info.sort_values("age")
user_info.sort_values(["age", "city"])

# To get only the n largest or n smallest values, use nlargest / nsmallest;
# this is much faster than sorting everything and then calling head(n).
user_info["age"].nlargest(2)
user_info Out[144]: age city sex name Tom 18 BeiJing male Bob 30 ShangHai male Mary 25 GuangZhou female James 40 ShenZhen male user_info.sort_index() Out[145]: age city sex name Bob 30 ShangHai male James 40 ShenZhen male Mary 25 GuangZhou female Tom 18 BeiJing male user_info.sort_index(axis=1, ascending=False)#按照列進行倒序排 Out[148]: sex city age name Tom male BeiJing 18 Bob male ShangHai 30 Mary female GuangZhou 25 James male ShenZhen 40 user_info.sort_values(by="age") Out[149]: age city sex name Tom 18 BeiJing male Mary 25 GuangZhou female Bob 30 ShangHai male James 40 ShenZhen male user_info.sort_values(by=["age", "city"]) Out[150]: age city sex name Tom 18 BeiJing male Mary 25 GuangZhou female Bob 30 ShangHai male James 40 ShenZhen male user_info.age.nlargest(2) Out[151]: name James 40 Bob 30 Name: age, dtype: int64
如果想要獲取每種類型的列數的話,可以使用 get_dtype_counts 方法(該方法自 pandas 0.25 起棄用、1.0 起移除,新版本請改用 dtypes.value_counts())。
如果想要轉換數據類型的話,可以通過 astype 來完成。
有時候會涉及到將 object 類型轉爲其他類型,常見的有轉爲數字、日期、時間差,Pandas 中分別對應 to_numeric、to_datetime、to_timedelta 方法。
這裏給這些用戶都添加一些關於身高的信息。現在將身高這一列轉爲數字,很明顯,180cm 並非數字,爲了強制轉換,我們可以傳入 errors 參數,這個參數的作用是指定強轉失敗時的處理方式。默認情況下,errors='raise',這意味着強轉失敗後直接拋出異常;設置 errors='coerce' 可以在強轉失敗時將有問題的元素賦值爲 pd.NaT(對於 datetime 和 timedelta)或 np.nan(數字);設置 errors='ignore' 可以在強轉失敗時返回原有的數據(該選項自 pandas 2.2 起已棄用)。
# Number of columns per dtype.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in
# 1.0; dtypes.value_counts() produces the same tally.
user_info.dtypes.value_counts()

# Convert a column to another dtype.
user_info["age"].astype(float)

# Add a height column stored as strings; "180cm" is deliberately not a number.
user_info["height"] = ["178", "168", "178", "180cm"]
user_info

# errors="coerce": values that cannot be parsed become NaN.
pd.to_numeric(user_info.height, errors="coerce")

# errors="ignore": return the input unchanged when parsing fails.
# NOTE: this option is deprecated since pandas 2.2; on newer versions catch the
# exception raised by the default errors="raise" instead.
pd.to_numeric(user_info.height, errors="ignore")
user_info.get_dtype_counts() Out[152]: int64 1 object 2 dtype: int64 user_info["age"].astype(float) Out[153]: name Tom 18.0 Bob 30.0 Mary 25.0 James 40.0 Name: age, dtype: float64 user_info["height"] = ["178", "168", "178", "180cm"] user_info Out[155]: age city sex height name Tom 18 BeiJing male 178 Bob 30 ShangHai male 168 Mary 25 GuangZhou female 178 James 40 ShenZhen male 180cm pd.to_numeric(user_info.height, errors="coerce") Out[156]: name Tom 178.0 Bob 168.0 Mary 178.0 James NaN Name: height, dtype: float64 pd.to_numeric(user_info.height, errors="ignore") Out[157]: name Tom 178 Bob 168 Mary 178 James 180cm Name: height, dtype: object