import pandas as pd import numpy as np df=pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8)}) print(df) grouped=df.groupby('A') print('-'*30) print(grouped.count()) print('-'*30) grouped=df.groupby(['A','B']) print(grouped.count()) print('-'*30) # 經過函數分組 def get_letter_type(letter): if letter.lower() in 'aeiou': return 'a' else: return 'b' grouped=df.groupby(get_letter_type,axis=1) print(grouped.count())
A B C D 0 foo one 1.429387 0.643569 1 bar one -0.858448 -0.213034 2 foo two 0.375644 0.214584 3 bar three 0.042284 -0.330481 4 foo two -1.421967 0.768176 5 bar two 1.293483 -0.399003 6 foo one -1.101385 -0.236341 7 foo three -0.852603 -1.718694 ------------------------------ B C D A bar 3 3 3 foo 5 5 5 ------------------------------ C D A B bar one 1 1 three 1 1 two 1 1 foo one 2 2 three 1 1 two 2 2 ------------------------------ a b 0 1 3 1 1 3 2 1 3 3 1 3 4 1 3 5 1 3 6 1 3 7 1 3
se=pd.Series([1,2,3,4,5],[6,9,8,9,8]) print(se) se.groupby(level=0)
6 1 9 2 8 3 9 4 8 5 dtype: int64 <pandas.core.groupby.generic.SeriesGroupBy object at 0x111b00040>
# 分組求和 grouped=se.groupby(level=0).sum() print(grouped)
6 1 8 8 9 6 dtype: int64
df2=pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]}) print(df2)
X Y 0 A 1 1 B 2 2 A 3 3 B 4
# 按X分組,並查詢A列的數據 grp=df2.groupby('X').get_group('A') print(grp)
X Y 0 A 1 2 A 3
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] index=pd.MultiIndex.from_arrays(arrays,names=['first','second']) print(index)
MultiIndex([('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')], names=['first', 'second'])
s=pd.Series(np.random.randn(8),index=index) print(s)
first second bar one 0.120979 two -0.440384 baz one 0.515106 two -0.019882 foo one 1.149595 two -0.369984 qux one -0.930438 two 0.146044 dtype: float64
# 分組求和 grouped=s.groupby(level='first') print(grouped.sum())
first bar -0.319405 baz 0.495224 foo 0.779611 qux -0.784394 dtype: float64
grouped=df.groupby(['A','B']) print(grouped.size())
A B bar one 1 three 1 two 1 foo one 2 three 1 two 2 dtype: int64
print(df)
A B C D 0 foo one 1.429387 0.643569 1 bar one -0.858448 -0.213034 2 foo two 0.375644 0.214584 3 bar three 0.042284 -0.330481 4 foo two -1.421967 0.768176 5 bar two 1.293483 -0.399003 6 foo one -1.101385 -0.236341 7 foo three -0.852603 -1.718694
print(grouped.describe().head())
C \ count mean std min 25% 50% 75% A B bar one 1.0 -0.858448 NaN -0.858448 -0.858448 -0.858448 -0.858448 three 1.0 0.042284 NaN 0.042284 0.042284 0.042284 0.042284 two 1.0 1.293483 NaN 1.293483 1.293483 1.293483 1.293483 foo one 2.0 0.164001 1.789526 -1.101385 -0.468692 0.164001 0.796694 three 1.0 -0.852603 NaN -0.852603 -0.852603 -0.852603 -0.852603 D \ max count mean std min 25% 50% A B bar one -0.858448 1.0 -0.213034 NaN -0.213034 -0.213034 -0.213034 three 0.042284 1.0 -0.330481 NaN -0.330481 -0.330481 -0.330481 two 1.293483 1.0 -0.399003 NaN -0.399003 -0.399003 -0.399003 foo one 1.429387 2.0 0.203614 0.622191 -0.236341 -0.016364 0.203614 three -0.852603 1.0 -1.718694 NaN -1.718694 -1.718694 -1.718694
75% max A B bar one -0.213034 -0.213034 three -0.330481 -0.330481 two -0.399003 -0.399003 foo one 0.423592 0.643569 three -1.718694 -1.718694
grouped=df.groupby('A') grouped['C'].agg([np.sum,np.mean,np.std])
<div>
<style scoped>python
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
</style>dom
sum | mean | std | |
---|---|---|---|
A | |||
bar | 0.477319 | 0.159106 | 1.080712 |
foo | -1.570925 | -0.314185 | 1.188767 |
</div>函數
import pandas as pd import numpy as np s=pd.Series(['A','b','c','D',np.nan]) print(s) # 轉小寫 print(s.str.lower()) # 轉大寫 print(s.str.upper()) # 每一個字符的長度 print(s.str.len())
0 A 1 b 2 c 3 D 4 NaN dtype: object 0 a 1 b 2 c 3 d 4 NaN dtype: object 0 A 1 B 2 C 3 D 4 NaN dtype: object 0 1.0 1 1.0 2 1.0 3 1.0 4 NaN dtype: float64
index=pd.Index([' Index','ru ',' men']) # 去掉空格 print(index.str.strip()) # 去掉左邊的空格 print(index.str.lstrip()) # 去掉右邊的空格 print(index.str.rstrip())
Index(['Index', 'ru', 'men'], dtype='object') Index(['Index', 'ru ', 'men'], dtype='object') Index([' Index', 'ru', ' men'], dtype='object')
df=pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3)) print(df)
A a B b 0 3.005273 0.486696 1 1.093889 1.054230 2 -2.846352 0.302465
# 列替換 print(df.columns.str.replace(' ','_'))
Index(['A_a', 'B_b'], dtype='object')
s=pd.Series(['a_b_C','c_d_e','f_g_h']) print(s)
0 a_b_C 1 c_d_e 2 f_g_h dtype: object
print(s.str.split('_'))
0 [a, b, C] 1 [c, d, e] 2 [f, g, h] dtype: object
print(s.str.split('_',expand=True,n=1))
0 1 0 a b_C 1 c d_e 2 f g_h
s = pd.Series(['A','rumen','ru','rumen','xiao','zhan'])
print(s.str.contains('ru'))
0 False 1 True 2 True 3 True 4 False 5 False dtype: bool
s=pd.Series(['a','a|b','a|c']) print(s)
0 a 1 a|b 2 a|c dtype: object
print(s.str.get_dummies(sep='|'))
a b c 0 1 0 0 1 1 1 0 2 1 0 1
s=pd.Series(np.arange(5),np.arange(5)[::-1],dtype='int64')
s
4 0 3 1 2 2 1 3 0 4 dtype: int64
print(s[s>2])
1 3 0 4 dtype: int64
# isin查詢索引在某個範圍 print(s.isin([1,3,4]))
4 False 3 True 2 False 1 True 0 True dtype: bool
# 根據索引查詢數據 print(s[s.isin([1,3,4])])
3 1 1 3 0 4 dtype: int64
# 構造一個聯合索引的數據 s=pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[1,2],['a','b','c']]))
print(s)
1 a 0 b 1 c 2 2 a 3 b 4 c 5 dtype: int64
print(s.iloc[s.index.isin([(1,'b'),(2,'c')])])
1 b 1 2 c 5 dtype: int64
# 構造一個時間序列 dates=pd.date_range('20200920',periods=8) print(dates)
DatetimeIndex(['2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27'], dtype='datetime64[ns]', freq='D')
df=pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D']) print(df)
A B C D 2020-09-20 -1.218522 2.067088 0.015009 0.158780 2020-09-21 -0.546837 -0.601178 -0.894882 0.172037 2020-09-22 0.189848 -0.910520 0.196186 -0.073495 2020-09-23 -0.566892 0.899193 -0.450925 0.633253 2020-09-24 0.038838 1.577004 0.580927 0.609050 2020-09-25 1.562094 0.020813 -0.618859 -0.515212 2020-09-26 -1.333947 0.275765 0.139325 1.124207 2020-09-27 -1.271748 1.082302 1.036805 -1.041206
# 查詢A列數據 print(df['A'])
2020-09-20 -1.218522 2020-09-21 -0.546837 2020-09-22 0.189848 2020-09-23 -0.566892 2020-09-24 0.038838 2020-09-25 1.562094 2020-09-26 -1.333947 2020-09-27 -1.271748 Freq: D, Name: A, dtype: float64
# 查詢小於0的數字,大於0的值默認被置爲NaN df.where(df<0)
<div>
<style scoped>spa
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
</style>code
A | B | C | D | |
---|---|---|---|---|
2020-09-20 | -1.218522 | NaN | NaN | NaN |
2020-09-21 | -0.546837 | -0.601178 | -0.894882 | NaN |
2020-09-22 | NaN | -0.910520 | NaN | -0.073495 |
2020-09-23 | -0.566892 | NaN | -0.450925 | NaN |
2020-09-24 | NaN | NaN | NaN | NaN |
2020-09-25 | NaN | NaN | -0.618859 | -0.515212 |
2020-09-26 | -1.333947 | NaN | NaN | NaN |
2020-09-27 | -1.271748 | NaN | NaN | -1.041206 |
</div>索引
# 查詢小於0的數字,大於0的值變成負數 print(df.where(df<0,-df))
A B C D 2020-09-20 -1.218522 -2.067088 -0.015009 -0.158780 2020-09-21 -0.546837 -0.601178 -0.894882 -0.172037 2020-09-22 -0.189848 -0.910520 -0.196186 -0.073495 2020-09-23 -0.566892 -0.899193 -0.450925 -0.633253 2020-09-24 -0.038838 -1.577004 -0.580927 -0.609050 2020-09-25 -1.562094 -0.020813 -0.618859 -0.515212 2020-09-26 -1.333947 -0.275765 -0.139325 -1.124207 2020-09-27 -1.271748 -1.082302 -1.036805 -1.041206
# 查詢小於0的數據,大於0的置爲1000 print(df.where(df<0,1000))
A B C D 2020-09-20 -1.218522 1000.000000 1000.000000 1000.000000 2020-09-21 -0.546837 -0.601178 -0.894882 1000.000000 2020-09-22 1000.000000 -0.910520 1000.000000 -0.073495 2020-09-23 -0.566892 1000.000000 -0.450925 1000.000000 2020-09-24 1000.000000 1000.000000 1000.000000 1000.000000 2020-09-25 1000.000000 1000.000000 -0.618859 -0.515212 2020-09-26 -1.333947 1000.000000 1000.000000 1000.000000 2020-09-27 -1.271748 1000.000000 1000.000000 -1.041206
# 構造一個10行3列的數據 df=pd.DataFrame(np.random.randn(10,3),columns=list('abc')) print(df)
a b c 0 1.761415 0.528009 -0.347271 1 -0.682149 0.353312 0.337229 2 1.080733 -0.272290 1.020335 3 -0.979681 -1.753745 0.836387 4 0.243748 2.085531 -0.993318 5 -1.041006 1.518130 -0.087383 6 -1.400354 -0.095196 3.043639 7 -0.835144 0.926415 -1.217102 8 0.326098 1.079906 0.156884 9 1.836618 -1.288516 -2.492620
# 查詢a>b的數據 print(df.query('a>b'))
a b c 0 1.761415 0.528009 -0.347271 2 1.080733 -0.272290 1.020335 3 -0.979681 -1.753745 0.836387 9 1.836618 -1.288516 -2.492620
# 查詢c>b>a的數據 print(df.query('(c<b) & (b<a)'))
a b c 0 1.761415 0.528009 -0.347271 9 1.836618 -1.288516 -2.492620