import numpy as np
import pandas as pd

# A one-dimensional Series built from a plain list. The np.nan entry marks a
# missing value; pandas assigns a default integer index 0..5.
s = pd.Series([1, 3, 6, np.nan, 4, 1])
print(s)
# Six consecutive daily timestamps starting at 2019-10-09; used below as row labels.
dates = pd.date_range('20191009', periods=6)
print(dates)

# A 6x4 frame of random normal samples. `index` supplies the row labels and
# `columns` supplies the column labels.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
# Use the default labelling: no index/columns are passed, so both axes are
# numbered from 0.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)
# Build a frame from a dict: every key becomes a column. Scalar values ('A',
# 'B', 'F') are repeated for each row; the row count comes from the
# length-4 entries.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20191009'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2)
print(df2.index)       # row labels
print(df2.columns)     # column labels
print(df2.values)      # the data as a NumPy array
print(df2.describe())  # summary statistics
print(df2.T)           # transpose

# The sort methods return a new frame and leave df2 unchanged, so each result
# is printed; a bare call would discard it.
print(df2.sort_index(axis=1, ascending=False))  # order by column label, descending
print(df2.sort_index(axis=0, ascending=False))  # order by row label, descending
print(df2.sort_values(by='E'))                  # order rows by the values in column E
# Selecting data
dates = pd.date_range('20191009', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# Further selection forms, kept disabled as in the original script:
# print(df['A'], df.A)
# print(df[0:3], df['20191009':'20191011'])

# Select by label: loc
# print(df.loc['20191010'])
# print(df.loc[:, ['A', 'B']])
print(df.loc['20191009', ['A', 'B']])

# Select by position: iloc
print(df.iloc[1:2, 1:3])        # positional slice: row 1, columns 1-2
print(df.iloc[[1, 3, 5], 1:3])  # hand-picked rows 1, 3 and 5, columns 1-2

# Mixed selection (rows by position, columns by label). `DataFrame.ix` no
# longer exists in pandas >= 1.0, so the positional row slice is turned into
# labels and passed to `.loc`. The slice end is exclusive: rows 0, 1 and 2.
head_ac = df.loc[df.index[:3], ['A', 'C']]
print(head_ac)

# Boolean mask: keep only the rows whose value in column A is greater than 8.
print(df[df.A > 8])
# Setting values
dates = pd.date_range('20191009', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# Further assignment forms, kept disabled as in the original script:
# df.iloc[2, 2] = 1111
# df.loc['20191009', 'B'] = 222
# df[df.A > 4] = 0

# Zero out column A wherever it exceeds 4. This is a single `.loc` assignment
# rather than the chained `df.A[mask] = 0`, which assigns through an
# intermediate Series and is not guaranteed to write back into `df`.
df.loc[df.A > 4, 'A'] = 0

df['F'] = np.nan  # add a new column filled with NaN

# Assigning a Series aligns on index labels, so its index must match the
# frame's row labels.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20191009', periods=6))
print(df)
# Handling missing data
dates = pd.date_range('20191009', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# NaN is a float, so it cannot live in an integer column. Cast the two columns
# that receive NaN up front instead of relying on implicit upcasting during
# assignment.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

# Further inspection forms, kept disabled as in the original script:
# print(df.fillna(value=0))
# print(df.isnull())

# For a large table, reduce the null mask to one flag: True if any cell is missing.
has_missing = df.isnull().values.any()
print(has_missing)