import pandas as pd
import numpy as np
# Missing-value and duplicate handling on a 6x3 frame of random numbers.
df = pd.DataFrame(np.random.randn(6, 3))
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df
# thresh=2 keeps rows with at least 2 non-NaN values, so with 3 columns a row
# is dropped only when it holds 2 or more NaN.
df.dropna(thresh=2)
# Same rule applied column-wise: keep columns with at least 3 non-NaN values.
df.dropna(axis=1, thresh=3)
df.duplicated()
df.drop_duplicates()
df.drop_duplicates(keep='last')
# Deduplicate on a single column. The frame has integer labels 0..2 (there is
# no 'k2' column), so an existing label is used as the subset.
df.drop_duplicates(subset=[1])
# Discretisation: bucket continuous values with pd.cut / pd.qcut.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
cats.codes  # bin label (integer code) for each entry of ages
cats.categories
# Series.value_counts replaces the deprecated top-level pd.value_counts.
pd.Series(cats).value_counts()
# Switch to left-closed, right-open intervals
pd.cut(ages, bins, right=False)
# Custom bin names
pd.cut(ages, bins, labels=['Youth', 'YoungAdult', 'MiddleAged', 'Senior'])
# Pass a number of bins instead of explicit edges
data = np.random.rand(20)
pd.cut(data, 4, precision=2)
# Bin by sample quantiles
data = np.random.randn(1000)
cats = pd.qcut(data, 4)
cats
pd.Series(cats).value_counts()
# Custom quantiles
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
# Detecting outliers in a 1000x4 frame of standard-normal samples.
data = pd.DataFrame(np.random.randn(1000, 4))
# Rows where at least one value has absolute value greater than 3.
# axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
data[(np.abs(data) > 3).any(axis=1)]
# Plant a row in which every value exceeds 3 in magnitude.
data.iloc[500] = [4, 5, 6, -7]
# Rows where all values have absolute value greater than 3
data[(np.abs(data) > 3).all(axis=1)]
np.sign(data).head()
# Indicator (dummy) columns for a categorical column.
# NOTE(review): the frame bound to `df` before this point has integer column
# labels and no 'key' column, so a small sample frame is defined here to make
# the example runnable; swap in the intended data if it exists elsewhere.
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
pd.get_dummies(df['key'])
# When a row belongs to several categories at once, building the indicator
# columns takes more work than a single get_dummies call.
# engine='python' is requested explicitly because the separator '::' is longer
# than one character, which the default C parser does not support.
movies = pd.read_csv('datasets/movielens/movies.dat', sep='::', header=None,
                     names=['movie_id', 'title', 'genres'], engine='python')
movies.head()
# Collect every genre token from the '|'-separated genres column.
all_genres = []
for genre_field in movies['genres']:
    all_genres.extend(genre_field.split('|'))
# Wrap in a Series: calling pd.unique on a plain list is deprecated.
genres = pd.unique(pd.Series(all_genres))
genres
count_df = pd.DataFrame(np.zeros(shape=(len(movies), len(genres))), columns=genres)
count_df.head()
# Looping over the distinct genres avoids a Python-level loop over the much
# larger movies table, so it is faster.
for genre in genres:
    # regex=False: match the genre name literally, not as a regular expression.
    count_df.loc[movies['genres'].str.contains(genre, regex=False), genre] = 1
count_df.head()
# Built-in string searching versus regular expressions.
val = 'a,b, guido'
val.index(',')
# index raises ValueError when the substring is missing; find returns -1
val.find(':')
import re
text = 'foo bar\t baz \tqux'
# Compiled regular-expression object. A raw string is used so that '\s' is not
# treated as an (invalid) string escape sequence.
regex = re.compile(r'\s+')
regex.findall(text)
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# With parenthesised groups in the pattern, findall returns tuples
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern=pattern, flags=re.IGNORECASE)
regex.findall(text)
# \1, \2, \3 refer to the three captured groups of each match.
print(regex.sub(r'Username:\1,Domain:\2,Suffix:\3', text))
# Joining frames: merge on a column against an index, then concat variants.
left = pd.DataFrame({'key': list('abaabc'), 'value': range(6)})
left
right = pd.DataFrame({'group_val': [3.5, 7]}, index=list('ab'))
right
# Use right's row index as its join key
left.merge(right, left_on='key', right_index=True)
# When the frames do not share a join key, concat is an option
df1 = pd.DataFrame(
    np.arange(6).reshape((3, 2)),
    index=['a', 'b', 'c'],
    columns=['one', 'two'],
)
df1
df2 = pd.DataFrame(
    np.arange(5, 9).reshape((2, 2)),
    index=['a', 'c'],
    columns=['three', 'four'],
)
df2
_frames = [df1, df2]
_labels = ['df1', 'df2']
# Stack row-wise, labelling each piece with an outer index level
pd.concat(_frames, keys=_labels, sort=True)
# Place side by side, labelling each piece with an outer column level
pd.concat(_frames, axis=1, keys=_labels, sort=True)
# Discard the original index and generate a fresh one
pd.concat(_frames, ignore_index=True, sort=True)
# Patching holes in one frame with values from another.
df1 = pd.DataFrame(dict(
    a=[1.0, np.nan, 5.0, np.nan],
    b=[np.nan, 2.0, np.nan, 6.0],
    c=range(2, 18, 4),
))
df1
df2 = pd.DataFrame(dict(
    a=[5.0, 4.0, np.nan, 3.0, 7.0],
    b=[np.nan, 3.0, 4.0, 6.0, 8.0],
))
df2
# Equivalent to np.where(pd.isnull(a), b, a), aligned on index and columns
df1.combine_first(df2)
# Reshape a quarterly table into long format and pivot it back to wide.
# NOTE(review): this section reads data.year / data.quarter and the columns
# realgdp, infl and unemp, but the only `data` bound earlier in this file is a
# random 1000x4 frame with integer column labels. Presumably a dataset with
# those columns must be loaded into `data` before this point; confirm.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
periods
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
columns
data = data.reindex(columns=columns)
data.head()
# Replace the index with day-frequency timestamps at the end ('e') of each period.
data.index = periods.to_timestamp('D', 'e')
data.head()
# stack() moves the 'item' columns into the index; reset_index() then yields the
# long-format columns date / item / 0, and the 0 column is renamed to 'value'.
data = data.stack().reset_index().rename(columns={0: 'value'})
data.head()
# Keyword arguments are required here: pandas 2.0 made pivot's arguments
# keyword-only. Order of meaning: rows, columns, values.
data.pivot(index='date', columns='item', values='value')[:5]
# Wide-to-long reshaping with melt.
df = pd.DataFrame(dict(
    key=['foo', 'bar', 'baz'],
    A=[1, 2, 3],
    B=[4, 5, 6],
    C=[7, 8, 9],
))
df
# melt is the inverse of pivot: 'key' stays as the identifier column while the
# remaining columns are unpivoted into variable / value pairs.
df.melt(id_vars=['key'])