Python Data Analysis Library(簡稱 pandas)是基於NumPy 的一種工具,該工具是爲了解決數據分析任務而建立的。pandas 納入了大量庫和一些標準的數據模型,提供了高效地操作大型數據集所需的工具。pandas提供了大量能使我們快速便捷地處理數據的函數和方法。
Series:一維數組,與NumPy中的一維array相似。兩者與Python基本的數據結構List也很相近,其區別是:List中的元素可以是不同的數據類型,而Array和Series中則只允許存儲相同的數據類型,這樣可以更有效地使用內存,提高運算效率。
Time-Series:以時間爲索引的Series。
DataFrame:二維的表格型數據結構。很多功能與R中的data.frame相似。可以將DataFrame理解爲Series的容器。以下的內容主要以DataFrame爲主。
Panel:三維的數組,可以理解爲DataFrame的容器。
本文主要介紹DataFrame和Series,其中DataFrame重點介紹。
本文中用到的數據文件地址:pandas的基本使用.zip
本文只是結合實例介紹pandas的基本使用,若要詳細深入學習,請參閱pandas官方文檔。
使用pandas我們可以很方便地對二維表結構進行一些常規操作。
1. 使用pandas讀取csv(或excel等)文件
import pandas

# Load the CSV file into a DataFrame.
# (Use pandas.read_excel() instead to read an Excel file.)
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))  # food_info is a DataFrame object
print(food_info.dtypes)  # data type of each column
<class 'pandas.core.frame.DataFrame'> NDB_No int64 Shrt_Desc object Water_(g) float64 Energ_Kcal int64 Protein_(g) float64 Lipid_Tot_(g) float64 Ash_(g) float64 Carbohydrt_(g) float64 Fiber_TD_(g) float64 Sugar_Tot_(g) float64 Calcium_(mg) float64 Iron_(mg) float64 Magnesium_(mg) float64 Phosphorus_(mg) float64 Potassium_(mg) float64 Sodium_(mg) float64 Zinc_(mg) float64 Copper_(mg) float64 Manganese_(mg) float64 Selenium_(mcg) float64 Vit_C_(mg) float64 Thiamin_(mg) float64 Riboflavin_(mg) float64 Niacin_(mg) float64 Vit_B6_(mg) float64 Vit_B12_(mcg) float64 Vit_A_IU float64 Vit_A_RAE float64 Vit_E_(mg) float64 Vit_D_mcg float64 Vit_D_IU float64 Vit_K_(mcg) float64 FA_Sat_(g) float64 FA_Mono_(g) float64 FA_Poly_(g) float64 Cholestrl_(mg) float64 dtype: object
2. 獲取數據
food_info.head(10)  # first 10 rows; head() returns 5 rows by default
# first_rows = food_info.head()
# first_rows
# food_info.tail(8)  # last 8 rows; tail() returns 5 rows by default
# print(food_info.tail())
print(food_info.columns)  # column labels of food_info (the table header)
# print(food_info.shape)  # (rows, columns); this file is 8618 rows x 36 columns
Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)'], dtype='object')
# print(food_info.loc[0])  # row whose index label is 0
print(food_info.loc[6000])  # row whose index label is 6000
# food_info.loc[10000]  # label 10000 is past the end of the data file and raises
#                       # KeyError: 'the label [10000] is not in the [index]'
NDB_No 18995 Shrt_Desc KELLOGG'S EGGO BISCUIT SCRAMBLERS BACON EGG & CHS Water_(g) 42.9 Energ_Kcal 258 Protein_(g) 8.8 Lipid_Tot_(g) 7.9 Ash_(g) NaN Carbohydrt_(g) 38.3 Fiber_TD_(g) 2.1 Sugar_Tot_(g) 4.7 Calcium_(mg) 124 Iron_(mg) 2.7 Magnesium_(mg) 14 Phosphorus_(mg) 215 Potassium_(mg) 225 Sodium_(mg) 610 Zinc_(mg) 0.5 Copper_(mg) NaN Manganese_(mg) NaN Selenium_(mcg) NaN Vit_C_(mg) NaN Thiamin_(mg) 0.3 Riboflavin_(mg) 0.26 Niacin_(mg) 2.4 Vit_B6_(mg) 0.02 Vit_B12_(mcg) 0.1 Vit_A_IU NaN Vit_A_RAE NaN Vit_E_(mg) 0 Vit_D_mcg 0 Vit_D_IU 0 Vit_K_(mcg) NaN FA_Sat_(g) 4.1 FA_Mono_(g) 1.5 FA_Poly_(g) 1.1 Cholestrl_(mg) 27 Name: 6000, dtype: object
# food_info.loc[3:6]  # rows labelled 3 through 6
two_five_ten = [2, 5, 10]
print(food_info.loc[two_five_ten])  # rows labelled 2, 5 and 10
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) \ 2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28 5 1006 CHEESE BRIE 48.42 334 20.75 10 1011 CHEESE COLBY 38.20 394 23.76 Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) \ 2 99.48 0.00 0.00 0.0 0.00 5 27.68 2.70 0.45 0.0 0.45 10 32.11 3.36 2.57 0.0 0.52 ... Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU \ 2 ... 3069.0 840.0 2.80 1.8 73.0 5 ... 592.0 174.0 0.24 0.5 20.0 10 ... 994.0 264.0 0.28 0.6 24.0 Vit_K_(mcg) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg) 2 8.6 61.924 28.732 3.694 256.0 5 2.3 17.410 8.013 0.826 100.0 10 2.7 20.218 9.280 0.953 95.0
# food_info['Shrt_Desc']  # select the column named 'Shrt_Desc'
ndb_col = food_info['NDB_No']  # select the column named 'NDB_No'
# print(ndb_col)
col_name = 'Shrt_Desc'
print(food_info[col_name])
0 BUTTER WITH SALT 1 BUTTER WHIPPED WITH SALT 2 BUTTER OIL ANHYDROUS 3 CHEESE BLUE 4 CHEESE BRICK 5 CHEESE BRIE 6 CHEESE CAMEMBERT 7 CHEESE CARAWAY 8 CHEESE CHEDDAR 9 CHEESE CHESHIRE 10 CHEESE COLBY 11 CHEESE COTTAGE CRMD LRG OR SML CURD 12 CHEESE COTTAGE CRMD W/FRUIT 13 CHEESE COTTAGE NONFAT UNCRMD DRY LRG OR SML CURD 14 CHEESE COTTAGE LOWFAT 2% MILKFAT 15 CHEESE COTTAGE LOWFAT 1% MILKFAT 16 CHEESE CREAM 17 CHEESE EDAM 18 CHEESE FETA 19 CHEESE FONTINA 20 CHEESE GJETOST 21 CHEESE GOUDA 22 CHEESE GRUYERE 23 CHEESE LIMBURGER 24 CHEESE MONTEREY 25 CHEESE MOZZARELLA WHL MILK 26 CHEESE MOZZARELLA WHL MILK LO MOIST 27 CHEESE MOZZARELLA PART SKIM MILK 28 CHEESE MOZZARELLA LO MOIST PART-SKIM 29 CHEESE MUENSTER ... 8588 BABYFOOD CRL RICE W/ PEARS & APPL DRY INST 8589 BABYFOOD BANANA NO TAPIOCA STR 8590 BABYFOOD BANANA APPL DSSRT STR 8591 SNACKS TORTILLA CHIPS LT (BAKED W/ LESS OIL) 8592 CEREALS RTE POST HONEY BUNCHES OF OATS HONEY RSTD 8593 POPCORN MICROWAVE LOFAT&NA 8594 BABYFOOD FRUIT SUPREME DSSRT 8595 CHEESE SWISS LOW FAT 8596 BREAKFAST BAR CORN FLAKE CRUST W/FRUIT 8597 CHEESE MOZZARELLA LO NA 8598 MAYONNAISE DRSNG NO CHOL 8599 OIL CORN PEANUT AND OLIVE 8600 SWEETENERS TABLETOP FRUCTOSE LIQ 8601 CHEESE FOOD IMITATION 8602 CELERY FLAKES DRIED 8603 PUDDINGS CHOC FLAVOR LO CAL INST DRY MIX 8604 BABYFOOD GRAPE JUC NO SUGAR CND 8605 JELLIES RED SUGAR HOME PRESERVED 8606 PIE FILLINGS BLUEBERRY CND 8607 COCKTAIL MIX NON-ALCOHOLIC CONCD FRZ 8608 PUDDINGS CHOC FLAVOR LO CAL REG DRY MIX 8609 PUDDINGS ALL FLAVORS XCPT CHOC LO CAL REG DRY MIX 8610 PUDDINGS ALL FLAVORS XCPT CHOC LO CAL INST DRY... 8611 VITAL WHEAT GLUTEN 8612 FROG LEGS RAW 8613 MACKEREL SALTED 8614 SCALLOP (BAY&SEA) CKD STMD 8615 SYRUP CANE 8616 SNAIL RAW 8617 TURTLE GREEN RAW Name: Shrt_Desc, Length: 8618, dtype: object
# Passing a list of column names selects several columns at once.
columns = ['Water_(g)', 'Shrt_Desc']
# NOTE(review): despite its name, zinc_copper holds the 'Water_(g)' and
# 'Shrt_Desc' columns.
zinc_copper = food_info[columns]
print(zinc_copper)
# Select every column whose name ends with "(mg)".
col_names = food_info.columns.tolist()
# print(col_names)
milligram_columns = [name for name in col_names if name.endswith("(mg)")]
milligram_df = food_info[milligram_columns]
print(milligram_df)
3. 對數據的簡單處理:
import pandas

food_info = pandas.read_csv('food_info.csv')
# food_info.head(3)
# print(food_info.shape)
# print(food_info['Iron_(mg)'])

# The 'Iron_(mg)' column is in milligrams; dividing by 1000 converts it to grams.
div_1000 = food_info['Iron_(mg)'] / 1000
# print(div_1000)

# Arithmetic between two columns is applied row by row.
water_energy = food_info['Water_(g)'] * food_info['Energ_Kcal']
# print(food_info.shape)

# Insert a new column named 'water_energy' holding the values computed above.
food_info['water_energy'] = water_energy
# print(food_info[['Water_(g)', 'Energ_Kcal', 'water_energy']])
# print(food_info.shape)

# Maximum value of a column.
max_calories = food_info['Energ_Kcal'].max()
# print(max_calories)

# Sort by a column. inplace=False returns a new, sorted DataFrame, while
# inplace=True sorts the existing one. The default order is ascending.
# food_info.sort_values('Sodium_(mg)', inplace=True)
# print(food_info['Sodium_(mg)'])
a = food_info.sort_values('Sodium_(mg)', inplace=False, ascending=False)  # ascending=False sorts in descending order
# print(food_info['Sodium_(mg)'])
# print(a['Sodium_(mg)'])
4. 對數據的常規操作
import pandas as pd
import numpy as np

titanic_survival = pd.read_csv('titanic_train.csv')
# titanic_survival.head()

age = titanic_survival['Age']
# print(age.loc[0:10])

# Boolean mask that is True wherever Age is missing; it can be used as an indexer.
age_is_null = pd.isnull(age)
# print(age_is_null)
age_null_true = age[age_is_null]  # only the entries whose Age is missing
# print(age_null_true)
print(len(age_null_true))  # how many ages are missing

# Compute the mean by hand, using only the non-missing values.
good_ages = age[~age_is_null]
# print(good_ages)
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# Or use the built-in Series.mean(), which skips missing values.
correct_mean_age = age.mean()
print(correct_mean_age)

# pivot_table aggregates with the mean by default, so aggfunc can be omitted
# when the mean is what is wanted.
# index tells the method which column to group by
# values is the column that we want to apply the calculation to
# aggfunc specifies the calculation we want to perform
# The string 'mean' is used instead of np.mean, which newer pandas versions
# warn about when passed as aggfunc.
passenger_surival = titanic_survival.pivot_table(index='Pclass', values='Survived', aggfunc='mean')
print(passenger_surival)

# Group and sum several columns at once (total fare and number of survivors
# per port of embarkation).
# port_stats = titanic_survival.pivot_table(index="Embarked", values=['Fare', "Survived"], aggfunc='sum')
# print(port_stats)

# Dropping missing data.
# axis=1 drops every COLUMN that contains a missing value. inplace=False
# returns a new DataFrame; inplace=True would modify the current object.
drop_na_columns = titanic_survival.dropna(axis=1, inplace=False)
# print(drop_na_columns)
# axis=0 drops every ROW that has a missing value in one of the columns listed
# in subset.
new_titanic_survival = titanic_survival.dropna(axis=0, subset=['Age', 'Sex'], inplace=False)
# print(new_titanic_survival)

# Address a single cell by row label and column name.
row_index_83_age = titanic_survival.loc[83, 'Age']
row_index_766_pclass = titanic_survival.loc[766, 'Pclass']
print(row_index_83_age)
print(row_index_766_pclass)

# Sort the rows by Age in descending order.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival[0:10])
print('------------------------>')
# Replace the index of the sorted rows with a fresh 0..n-1 index.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed[0:20])


# Custom functions can be applied to every column or every row.
def null_count(column):
    """Return the number of missing values in *column*."""
    return int(pd.isnull(column).sum())


# axis=0 passes each column to the function: count the missing values per column.
column_null_count = titanic_survival.apply(null_count, axis=0)
print(column_null_count)


def which_class(row):
    """Return a text label for the 'Pclass' value of *row*."""
    pclass = row['Pclass']
    if pclass == 1:
        return 'First Class'
    elif pclass == 2:
        return 'Second Class'
    elif pclass == 3:
        return 'Third Class'
    else:
        return 'Unknow'


# axis=1 passes each row to the function: map every row's Pclass to its label.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
5. 配合numpy將數據載入後進行預處理
import pandas as pd
import numpy as np

fandango = pd.read_csv('fandango_score_comparison.csv')
# print(type(fandango))

# set_index returns a new DataFrame that uses the given column as its index.
# drop=True removes that column from the data; drop=False keeps it as well.
fandango_films = fandango.set_index('FILM', drop=False)
# fandango_films
# print(fandango_films.index)

# Select data through the index labels.
fandango_films["Avengers: Age of Ultron (2015)" : "Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)" : "Hot Tub Time Machine 2 (2015)"]
fandango_films.loc['Southpaw (2015)']
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
fandango_films.loc[movies]

# def func(coloumn):
#     return np.std(coloumn)

types = fandango_films.dtypes
# print(types)
float_columns = types[types.values == 'float64'].index  # labels of the float64 columns
# print(float_columns)
float_df = fandango_films[float_columns]  # only the float64 columns
# print(float_df.dtypes)
# float_df
# print(float_df)
deviations = float_df.apply(lambda x: np.std(x))  # standard deviation of each column
print(deviations)
# print('----------------------->')
# print(float_df.apply(func))
# help(np.std)

rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
print(rt_mt_user.apply(np.std, axis=1))  # standard deviation of each row
# rt_mt_user.apply(np.std, axis=0)
Series爲DataFrame中一行或一列的數據結構
1. 獲取一個Series對象
import pandas as pd
from pandas import Series

fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']  # the FILM column of fandango, as a Series
# print(type(series_film))
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']  # the RottenTomatoes column of fandango
print(series_rt[0:5])
2. 對Series對象的一些常規操作
import numpy as np

file_names = series_film.values  # all values of series_film, as a numpy.ndarray
# print(type(file_names))
# print(file_names)
rt_scores = series_rt.values
# print(rt_scores)

# Build a new Series whose index is file_names and whose values are rt_scores.
series_custom = Series(rt_scores, index=file_names)
# help(Series)
print(series_custom[['Top Five (2014)', 'Night at the Museum: Secret of the Tomb (2014)']])  # select by index label
# print(type(series_custom))
print('--------------------------------->')
# Positional slice; .iloc makes it explicit that 5:10 are positions, not labels.
print(series_custom.iloc[5:10])
# print(series_custom[["'71 (2015)"]])

original_index = series_custom.index.tolist()  # all index labels as a list
# print(original_index)
sorted_index = sorted(original_index)  # sort the list of labels
# print(sorted_index)
sorted_by_index = series_custom.reindex(sorted_index)  # reorder series_custom to follow the sorted labels
print(sorted_by_index)
sc2 = series_custom.sort_index()  # sort the whole Series by index, ascending
# print(sc2)
sc3 = series_custom.sort_values(ascending=False)  # sort the whole Series by value, descending
print(sc3)

# NumPy functions can be applied to a Series directly.
# print(np.add(series_custom, series_custom))
print(np.sin(series_custom))
print(np.max(series_custom))

# series_custom > 50
series_greater_than_50 = series_custom[series_custom > 50]  # entries with a value greater than 50
# series_greater_than_50
criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]  # entries greater than 50 and less than 75
print(both_criteria)

rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users) / 2  # average of the critic and user scores
print(rt_mean)