import numpy as np
import pandas as pd
from pandas import DataFrame,Series
# Small mixed-type frame in which every row carries exactly one missing value.
df = DataFrame(
    [
        [3, 5, np.nan],
        [1, 3, np.nan],
        ['tom', np.nan, 'ivan'],
        [np.nan, 'a', 'b'],
    ]
)
df

# Boolean mask of missing cells, then the count per column and the grand total.
df.isna()
df.isna().sum()
df.isna().sum().sum()

# The complementary mask: cells that hold a value.
df.notna()
df.notna().sum()
df.notna().sum().sum()

# Drop rows containing any NaN / rows that are entirely NaN.
df.dropna(axis='index', how='any')
df.dropna(axis='index', how='all')

# Same two policies applied to columns.
df.dropna(axis='columns', how='any')
df.dropna(axis='columns', how='all')
# --- Filling missing values with fillna ---------------------------------
df = DataFrame([[3, 5, np.nan], [1, 3, np.nan], ['tom', np.nan, 'ivan'], [np.nan, 'a', 'b']])
df2 = DataFrame([[0, 1, 2, np.nan], [4, 5, 6, np.nan], [np.nan, np.nan, np.nan, np.nan]])

# inplace=False returns a filled copy and leaves df untouched.
df3 = df.fillna(0, inplace=False)
# inplace=True modifies df itself. From this point df holds no NaN, so the
# remaining calls on df show the available call forms without altering values.
df.fillna(0, inplace=True)

# Column 0 mixes numbers and text, so calling .mean() on it directly raises
# TypeError; coerce non-numeric entries to NaN before averaging.
df[0].fillna(pd.to_numeric(df[0], errors='coerce').mean())

# Fill only the first two rows (row slice).
df[0:2] = df[0:2].fillna('enene')
df

# Fill only columns 0 and 2.
df[[0, 2]] = df[[0, 2]].fillna('dada')
df

# A dict maps column label -> fill value for that column.
df.fillna({0: pd.to_numeric(df[0], errors='coerce').mean()})
df2.fillna({0: 0, 2: 'aa'})

# Forward fill: propagate the last valid value downwards.
# (The original passed the misspelled method name 'ffifll'.)
df.ffill()

# Fill a single row with that row's numeric mean. The result is assigned back
# because an in-place fill on the temporary returned by .loc[0] is discarded.
df.loc[0] = df.loc[0].fillna(pd.to_numeric(df.loc[0], errors='coerce').mean())

# Label-based row selection: .loc slices include the end label.
df.loc[0:2] = df.loc[0:2].fillna('enene')
df
df.loc[[0, 2]] = df.loc[[0, 2]].fillna('dada')
df

# fillna with a dict works per column; transpose so that the dict keys
# address rows, fill, then transpose back.
df = df.T.fillna({2: 'aa'}).T
df = df.T.fillna({0: 0, 3: 'aa'}).T
# --- Duplicates and value replacement -----------------------------------
# NOTE(review): the calls below reference 'sex' and 'year' columns plus the
# values '' and 2001, none of which exist in the frame built earlier in this
# file. Presumably the original notebook loaded a different data set here;
# this small stand-in lets the cell run -- replace it with the real data.
df = DataFrame(
    {
        'name': ['tom', 'ivan', 'tom', ''],
        'sex': ['m', 'm', 'm', 'f'],
        'year': [2001, 2002, 2001, 2003],
    }
)

# Boolean mask: True for rows that repeat an earlier row.
df.duplicated()

# The method is drop_duplicates (drop_duplicated does not exist on DataFrame).
df.drop_duplicates()
# keep='last' retains the final occurrence instead of the first.
df.drop_duplicates(keep='last')
# Only the listed columns are compared when deciding what is a duplicate.
df.drop_duplicates(subset=['sex', 'year'], keep='last')

# Replace one value, parallel lists of values, or a {old: new} mapping.
df.replace('', '未知')
df.replace(['', 2001], ['未知', 2002])
df.replace({'': '未知', 2001: 2002})
# --- Mapping numeric grades to a category label --------------------------
data = {
    'name': ['張三', '李四', '王五', '馬六'],
    'grade': [75, 52, 63, 99],
}
df = DataFrame(data)


def f(x):
    """Return the rating label for a numeric grade.

    Thresholds: >= 90 -> '優秀', [70, 90) -> '良好', [60, 70) -> '合格',
    anything else -> '不合格'.
    """
    # Checks run from the highest band down, so each test only needs the
    # lower bound of its band.
    if x >= 90:
        return '優秀'
    if x >= 70:
        return '良好'
    if x >= 60:
        return '合格'
    return '不合格'


# Apply the classifier element-wise to the grade column.
df['class'] = df.grade.map(f)
df
# Ten points on the line Y = 2X + 0.5 ...
df = DataFrame({'X': np.arange(10)})
df['Y'] = df['X'].mul(2).add(0.5)
# ... except the last one, whose Y value is overwritten with 185.
df.loc[9, 'Y'] = 185
# Scatter plot of Y against X.
df.plot.scatter(x='X', y='Y')
# --- One-hot encoding a single-valued categorical column ------------------
df = DataFrame(
    {
        # The comma after this entry was missing, which is a SyntaxError.
        '朝向': ['東', '南', '東', '西', '北'],
        '價格': [1200, 2000, 1200, 1100, 800],
    }
)
df
# One indicator column per distinct value of '朝向'.
pd.get_dummies(df['朝向'])
# --- Indicator columns for a multi-valued ('/'-separated) column ----------
df = DataFrame(
    {
        # The comma after this entry was missing, which is a SyntaxError.
        '朝向': ['東/北', '南/西', '東', '西/北', '北'],
        '價格': [1200, 2000, 1200, 1100, 800],
    }
)
df
# Split each cell on '/', count the parts, and let pandas align the per-row
# counts into one column per distinct value. Values absent from a row come
# back as NaN, so they are filled with 0 and cast to int to give a 0/1 matrix.
dummies = (
    df['朝向']
    .apply(lambda x: Series(x.split('/')).value_counts())
    .fillna(0)
    .astype(int)
)
dummies
# --- Merging two frames on a shared key column ----------------------------
price = DataFrame(dict(fruit=['apple', 'banana', 'orange'], price=[23, 32, 45]))
amount = DataFrame(
    dict(
        fruit=['apple', 'banana', 'apple', 'apple', 'banana', 'pear'],
        amount=[5, 3, 6, 3, 5, 7],
    )
)

# Default (inner) join: only fruits present in both frames.
amount.merge(price, on='fruit')
# Outer join: every fruit from either frame.
amount.merge(price, how='outer', on='fruit')
# Left join: every row of `amount`, priced where a price exists.
amount.merge(price, how='left', on='fruit')
# --- Merging when the key repeats on both sides ---------------------------
# 'apple' appears twice in price2 and three times in amount2.
price2 = DataFrame(
    dict(fruit=['apple', 'banana', 'orange', 'apple'], price=[23, 32, 45, 25])
)
amount2 = DataFrame(
    dict(
        fruit=['apple', 'banana', 'apple', 'apple', 'banana', 'pear'],
        amount=[5, 3, 6, 3, 5, 7],
    )
)
amount2.merge(price2, how='outer', on='fruit')
# --- Merging on several key columns, and naming overlapping columns -------
df1 = DataFrame(dict(k1=['one', 'one', 'two'], k2=['a', 'b', 'a'], v1=[2, 3, 4]))
df2 = DataFrame(
    dict(
        k1=['one', 'one', 'two', 'two'],
        k2=['a', 'a', 'a', 'b'],
        v1=[5, 6, 7, 8],
    )
)

# Rows match only when both k1 and k2 agree.
df1.merge(df2, how='outer', on=['k1', 'k2'])
# Matching on k1 alone leaves k2/v1 on both sides; suffixes tell them apart.
df1.merge(df2, how='outer', on='k1', suffixes=('_left', '_right'))
# --- Merging a key column against the other frame's index -----------------
df1 = DataFrame(
    {
        'k': ['a', 'b', 'a'],
        'v1': [2, 3, 4],
    }
)
# Fixes: a comma was missing before `index=`, and the index needs one label
# per row.
# NOTE(review): labels 'c' and 'd' were added to pair with the four values;
# confirm against the intended data (the alternative is two values only).
df2 = DataFrame({'v2': [5, 6, 7, 8]}, index=['a', 'b', 'c', 'd'])

# Match df1's 'k' column (there is no column named 'key') to df2's index.
pd.merge(df1, df2, left_on='k', right_index=True)
# left_on and left_index cannot be combined; the key column is used on the
# left and the index on the right.
pd.merge(df1, df2, left_on='k', right_index=True, how='outer')
# join() matches on the index by default; df1's index (0..2) shares no label
# with df2's, so the caller's 'k' column is named as the join key.
df1.join(df2, on='k', how='outer')
# --- Concatenating Series --------------------------------------------------
s1 = Series({'a': 0, 'b': 1})
s2 = Series({'a': 0, 'd': 3})
s3 = Series({'e': 4, 'f': 5})

# Default axis=0 stacks end to end; axis=1 places the Series side by side.
pd.concat([s1, s2, s3])
pd.concat([s1, s2, s3], axis=1)

# join='outer' keeps the union of labels, join='inner' the intersection.
pd.concat([s1, s2], join='outer', axis=1)
pd.concat([s1, s2], join='outer', axis=0)
pd.concat([s1, s2], join='inner', axis=1)
pd.concat([s1, s2], join='inner', axis=0)

# keys label each input; passing a dict supplies the same labels.
pd.concat([s1, s2], keys=['first', 'second'], join='inner', axis=0)
pd.concat({'first': s1, 'second': s2}, join='inner', axis=0)
pd.concat([s1, s2], keys=['first', 'second'], join='inner', axis=1)
pd.concat({'first': s1, 'second': s2}, join='inner', axis=1)
# --- Concatenating DataFrames side by side ---------------------------------
price = DataFrame(dict(fruit=['apple', 'banana', 'orange'], price=[23, 32, 45]))
amount = DataFrame(
    dict(
        fruit=['apple', 'banana', 'apple', 'apple', 'banana', 'pear'],
        amount=[5, 3, 6, 3, 5, 7],
    )
)
# Rows are aligned on the index labels; columns are placed next to each other.
pd.concat([price, amount], axis='columns')
# --- Concatenating DataFrames top to bottom --------------------------------
price = DataFrame(
    dict(fruit=['apple', 'banana', 'orange'], price=[23, 32, 45]),
    index=list('abc'),
)
amount = DataFrame(
    dict(
        fruit=['apple', 'banana', 'apple', 'apple', 'banana', 'pear'],
        amount=[5, 3, 6, 3, 5, 7],
    ),
    index=list('abcdef'),
)
# The original row labels are kept, so 'a', 'b' and 'c' each appear twice.
pd.concat([price, amount], axis='index')
# ignore_index=True discards the labels and renumbers the rows from 0.
pd.concat([price, amount], axis='index', ignore_index=True)
# --- Patching missing values from another frame ----------------------------
price = DataFrame(
    dict(
        fruit=['apple', 'banana', np.nan, 'pear'],
        price=[23, 32, np.nan, np.nan],
    ),
    index=list('abcd'),
)
amount = DataFrame(
    dict(
        fruit=['apple', 'banana', 'apple', 'apple', 'banana', 'pear'],
        price=[5, 3, 6, 3, 5, 7],
        amount=[1, 1, 1, 1, 1, 1],
    ),
    index=list('abcdef'),
)
# Values from `price` win; its gaps (and labels/columns it lacks) are taken
# from `amount`.
price.combine_first(amount)
# --- Reshaping with stack / unstack ----------------------------------------
# 3x3 frame of 0..8 whose row axis is named 'alph' and column axis 'number'.
df = DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'b', 'c'],
    columns=['one', 'two', 'three'],
).rename_axis(index='alph', columns='number')

# stack() moves the column labels into an inner index level.
Se = df.stack()
type(Se)

# Three ways to name the inner level: the default, by position, by name.
(Se.unstack(), Se.unstack(level=1), Se.unstack(level='number'))
# Unstacking the outer level instead, by position and by name.
(Se.unstack(level=0), Se.unstack(level='alph'))
# --- Splitting a delimited text column with apply --------------------------
data = {
    'data': ['張三|男', '李四|女', '王五|女', '馬六|男']
}
df = DataFrame(data)

# Approach 1: take each piece of the split separately, then drop the source.
df['name'] = df['data'].apply(lambda x: x.split("|")[0])
df['gender'] = df['data'].apply(lambda x: x.split("|")[1])
del df['data']
df

# Approach 2: have the lambda return a Series so that apply expands every
# piece into its own column. The 'data' column was deleted above, so the frame
# is rebuilt from `data` first (reading df['data'] here raised KeyError).
df = DataFrame(data)['data'].apply(lambda x: Series(x.split("|")))
df.columns = ['name', 'gender']
df
# --- Splitting a delimited text column with the .str accessor --------------
data = dict(data=['張三|男', '李四|女', '王五|女', '馬六|男'])
df = DataFrame(data)

# Each element of `se` is the list of pieces produced by splitting on "|".
se = df['data'].str.split("|")
# Pull the first and second piece out of every list.
df['name'] = se.str.get(0)
df['gender'] = se.str.get(1)
# The combined source column is no longer needed.
df.drop(columns='data', inplace=True)
df
# --- Extracting text with a regular expression -----------------------------
df = DataFrame(dict(email=['100@qq.com', '111@qq.com', '222@qq.com']))

# findall returns, per row, the list of captured groups: the text before '@'.
se = df['email'].str.findall('(.*?)@')
type(se)

# Take the first (only) match of each row as the new 'qq' column.
df['qq'] = df['email'].str.findall('(.*?)@').str[0]
df
# Same extraction as the preceding cell, repeated on a freshly built frame.
df = DataFrame(
    {
        'email': ['100@qq.com', '111@qq.com', '222@qq.com'],
    }
)
# Per-row list of everything captured in front of the '@'.
se = df.email.str.findall('(.*?)@')
type(se)
# Run the match again and keep element 0 of each list as column 'qq'.
qq_matches = df.email.str.findall('(.*?)@')
df['qq'] = qq_matches.str.get(0)
del qq_matches
df
# Displayed result of the cell above:
#
# |   | email      | qq  |
# |---|------------|-----|
# | 0 | 100@qq.com | 100 |
# | 1 | 111@qq.com | 111 |
# | 2 | 222@qq.com | 222 |
# Reference / 參考資料:《從零開始學 Python數據分析》羅攀