Series對象本質上是有兩個數組組成,一個數組構成對象的鍵(index),一個數組構成對象的值(values)python
import string import pandas as pd import numpy as np # 建立Series t1 = pd.Series(np.arange(5),index=list("abcde")) print(t1) """ 索引能夠指定,默認爲012... a 0 b 1 c 2 d 3 e 4 dtype: int64 """ print(type(t1)) # <class 'pandas.core.series.Series'> # 使用字典建立Series a = {string.ascii_uppercase[i]:i for i in range(5)} # 建立Series print(pd.Series(a)) """ A 0 B 1 C 2 D 3 E 4 dtype: int64 """ print(pd.Series(a,index=list("CDEFG"))) """ C 2.0 D 3.0 E 4.0 F NaN G NaN dtype: float64 """ # 切片 print(t1[0:4:2]) """ a 0 c 2 dtype: int64 """ print(t1[[2,3,4]]) """ c 2 d 3 e 4 dtype: int64 """ print(t1[t1>2]) """ d 3 e 4 dtype: int64 """ print(t1["b"]) # 1 print(t1[["a","e","f"]]) """ a 0.0 e 4.0 f NaN dtype: float64 """ # 索引和值 print(t1.index) # Index(['a', 'b', 'c', 'd', 'e'], dtype='object') print(type(t1.index)) # <class 'pandas.core.indexes.base.Index'> print(t1.values) # [0 1 2 3 4] print(type(t1.values)) # <class 'numpy.ndarray'>
# 建立DataFrame對象 t1 = pd.DataFrame(np.arange(12).reshape(3,4)) print(t1) """ DataFrame對象既有行索引,又有列索引 行索引,代表不一樣行,橫向索引,叫index,0軸,axis=0 列索引,表名不一樣列,縱向索引,叫columns,1軸,axis=1 0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 """ t2 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("EFGH")) print(t2) """ E F G H a 0 1 2 3 b 4 5 6 7 c 8 9 10 11 """ # 將字典轉換成dataframe temp_dict = [{"name":"zhangsan","age":15,"tel":10086}, {"name":"lisi","age":15}, {"name":"wangwu","tel":10086} ] t3 = pd.DataFrame(temp_dict) print(t3) """ age name tel 0 15.0 zhangsan 10086.0 1 15.0 lisi NaN 2 NaN wangwu 10086.0 """
# 獲取DataFrame的基本信息 # 行數,列數 print(t1.shape) # 列數據類型 print(t1.dtypes) # 數據維度 print(t1.ndim) # 2 # 行索引 print(t1.index) # RangeIndex(start=0, stop=3, step=1) # 列索引 print(t2.columns) # Index(['E', 'F', 'G', 'H'], dtype='object') # 對象值 print(t1.values) """ [[ 0 1 2 3] [ 4 5 6 7] [ 8 9 10 11]] """ # 顯示頭幾行,默認是5 print(t1.head(2)) # 顯示末尾幾行 print(t1.tail(2)) # 相關信息概覽:行數,列數,列索引,咧非空值個數,行列類型,內存佔用 print(t1.info()) """ <class 'pandas.core.frame.DataFrame'> RangeIndex: 3 entries, 0 to 2 Data columns (total 4 columns): 0 3 non-null int64 1 3 non-null int64 2 3 non-null int64 3 3 non-null int64 dtypes: int64(4) memory usage: 176.0 bytes None """ # 快速綜合統計結果:計數,均值,標準差,最大值,1/4值,最小值 print(t2.describe()) """ 是根據列來計算的 E F G H count 3.0 3.0 3.0 3.0 mean 4.0 5.0 6.0 7.0 std 4.0 4.0 4.0 4.0 min 0.0 1.0 2.0 3.0 25% 2.0 3.0 4.0 5.0 50% 4.0 5.0 6.0 7.0 75% 6.0 7.0 8.0 9.0 max 8.0 9.0 10.0 11.0 """
import numpy as np import pandas as pd # 加載csv數據 t = pd.read_csv("./dogNames2.csv") # 按照字段進行排序,ascending desc/asc t2 = t.sort_values("Count_AnimalName",ascending=False).head(10) print(t2)
import string import numpy as np import pandas as pd # loc和iloc方法 # df.loc:經過標籤獲取行數據 # df.iloc:經過位置獲取行數據 t1 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("EFGH")) # 獲取行列交叉部分 print(t1.loc["a","E"]) # 0 # 獲取行與多列交叉部分 print(t1.loc["a",["E","F"]]) """ E 0 F 1 """ # 獲取行與多列交叉部分 print(t1.loc["a","E":"G"]) """ E 0 F 1 G 2 """ # 獲取行與連續多列交叉部分 print(t1.loc["a":"c","G"]) """ 注意:loc裏的:是包括最後的那個的 a 2 b 6 c 10 """ # iloc和loc是同樣的,只不過採用的是索引來進行的操做
import numpy as np import pandas as pd # pandas中的布爾索引 t1 = pd.read_csv("./dogNames2.csv") # 找出其中名字使用次數超過800的狗 print(t1[t1["Count_AnimalName"]>800]) """ Row_Labels Count_AnimalName 1156 BELLA 1195 2660 CHARLIE 856 3251 COCO 852 9140 MAX 1153 12368 ROCKY 823 """ # 找出狗名字符串長度超過4的狗 print(t1[t1["Row_Labels"].str.len()>4].head(3)) """ Row_Labels Count_AnimalName 2 40804 1 3 90201 1 4 90203 1 """ # 多條件,要使用()分割,&或|作鏈接符 print(t1[(t1["Row_Labels"].str.len()>4)&(t1["Row_Labels"].str.len()<6)].head(3))
字符串方法數組
import pandas as pd import numpy as np temp_dict = [{"name":"zhangsan","age":0,"tel":10086}, {"name":"lisi","age":15}, {"name":"wangwu","tel":10010}] t1 = pd.DataFrame(temp_dict) print(t1) """ age name tel 0 15.0 zhangsan 10086.0 1 15.0 lisi NaN 2 NaN wangwu 10010.0 """ ## 缺失數據的處理 # - 處理方式1:刪除NaN所在的行列dropna (axis=0, how='any', inplace=False) t2 = t1.dropna(axis=0, how='any', inplace=False) print(t2) """ age name tel 0 15.0 zhangsan 10086.0 """ # - 處理方式2:填充數據,t.fillna(t.mean()),t.fiallna(t.median()),t.fillna(0) t3 = t1.fillna(t1.mean()) print(t3) """ age name tel 0 15.0 zhangsan 10086.0 1 15.0 lisi 10048.0 2 15.0 wangwu 10010.0 """ ### 處理爲0的數據:將0改成nan,而後使用上面的方法進行填充 t1[t1==0] = np.nan print(t1) ### 查看是否爲nan,返回布爾索引 print(pd.isnull(t1)) """ age name tel 0 True False False 1 False False True 2 True False False """ print(pd.notnull(t1)) """ age name tel 0 False True True 1 True True False 2 False True True """