pandas 常規操作大全

時間 2019-12-13

標籤 pandas 常規做大简体版

原文原文鏈接

前言

pandas 抓住 Series (排序的字典)、DataFrame (row + 多個 Series) 這兩個對象，就如同 numpy 裏抓住 ndarray 多維數組一樣
但是人的精力始終是有限的，沒有過目不忘的本領，那就記住 API 以及常用參數，其他的交給文檔 (字典) 吧

下面學習示例可能會用到的兩個函數

def pretty_print(text='', star_cnt=20):
    """Print ``text`` as a banner line.

    The text is framed by ``star_cnt`` asterisks on each side, with a single
    space between each run of asterisks and the text.
    """
    border = star_cnt * '*'
    print(border, text, border)

def generate_df(rows, cols):
    """Return a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row. Rows are labelled ``row_0``, ``row_1``,
    ... and columns ``col_0``, ``col_1``, ...
    """
    row_labels = ['row_{}'.format(r) for r in range(rows)]
    col_labels = ['col_{}'.format(c) for c in range(cols)]
    grid = np.arange(rows * cols).reshape(rows, cols)
    return DataFrame(grid, index=row_labels, columns=col_labels)

# Demo call: builds a 3 x 4 frame (the result is not stored).
generate_df(3, 4)

修改 dataframe 中數據

from pandas import DataFrame
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)

def pretty_print(text='', star_cnt=20):
    """Print ``text`` with ``star_cnt`` asterisks glued directly to each side."""
    border = star_cnt * '*'
    print(border, text, border, sep='')

# Four days of quotes, one list per price column.
data = {
    'open': [8.08, 7.93, 7.97, 8.00],
    'close': [7.93, 8.05, 7.97, 8.05],
    'high': [8.10, 8.12, 8.00, 8.09],
    'low': [7.88, 7.92, 7.91, 8.00],
}

dates = pd.date_range('02/01/2016', '02/04/2016')
df = DataFrame(data, index=dates)
print(df.head(10))
pretty_print('華麗的分隔符')
# Zero out every cell whose value is not 7.93; cells equal to 7.93 are kept.
keep = df.isin([7.93])
df[~keep] = 0
print(df)
"""
            open  close  high   low
2016-02-01  8.08   7.93  8.10  7.88
2016-02-02  7.93   8.05  8.12  7.92
2016-02-03  7.97   7.97  8.00  7.91
2016-02-04  8.00   8.05  8.09  8.00
********************華麗的分隔符********************
            open  close  high  low
2016-02-01  0.00   7.93   0.0  0.0
2016-02-02  7.93   0.00   0.0  0.0
2016-02-03  0.00   0.00   0.0  0.0
2016-02-04  0.00   0.00   0.0  0.0
"""

"""
df['col'] 返回 Series，df[['col']]、切片或布爾索引返回 DataFrame；df.loc 、df.iloc 取單行或單列時返回 Series，取多行多列時返回 DataFrame (df.ix 已在 pandas 1.0 中移除)
"""

apply + map == applymap

from pandas import DataFrame
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)
from functools import reduce

def pretty_print(text='',star_cnt=20):
    """Print ``text`` with ``star_cnt`` asterisks glued directly to each side."""
    stars = '*'*star_cnt
    print('{stars}{text}{stars}'.format(text=text,stars=stars))

# Four days of quotes, one list per price column.
data = {'open':[8.08, 7.93, 7.97, 8.00],
        'close':[7.93,8.05,7.97,8.05],
        'high':[8.10,8.12,8.00,8.09],
        'low':[7.88,7.92,7.91,8.00]}

df = DataFrame(data,index=pd.date_range('02/01/2016','02/04/2016'))
print(df.head(10))
pretty_print('華麗的分隔線')
# Add a row-total column in two equivalent ways. Both are computed from the
# original price columns only: running the second apply over the whole frame
# would also sum the freshly added 'new' column and double the total.
price_cols = list(data)
df['new'] = df[price_cols].apply(lambda cols:reduce(lambda x,y:x+y,cols), axis=1)
df['new_2'] = df[price_cols].apply(sum,axis=1)
print(df.head(10))
"""
            open  close  high   low
2016-02-01  8.08   7.93  8.10  7.88
2016-02-02  7.93   8.05  8.12  7.92
2016-02-03  7.97   7.97  8.00  7.91
2016-02-04  8.00   8.05  8.09  8.00
********************華麗的分隔線********************
            open  close  high   low    new  new_2
2016-02-01  8.08   7.93  8.10  7.88  31.99  31.99
2016-02-02  7.93   8.05  8.12  7.92  32.02  32.02
2016-02-03  7.97   7.97  8.00  7.91  31.85  31.85
2016-02-04  8.00   8.05  8.09  8.00  32.14  32.14
"""

分組後，用來分組的 col 會被作爲結果的 key 索引

from pandas import Series,DataFrame
# One record per (student, course) pair: name, sex, course, score.
a = [
    ['Li', '男', 'PE', 98.],
    ['Li', '男', 'MATH', 60.],
    ['liu', '男', 'MATH', 60.],
    ['yu', '男', 'PE', 100.],
]

af = DataFrame(a, columns=['name', 'sex', 'course', 'score'])
separator = '*' * 50
# Grouping by two columns turns both of them into index levels of the result.
score_totals = af.groupby(['name', 'course'])['score'].sum()
print(af.head(10))
print(separator)
print(score_totals)
print(separator)
# Selecting one name on the outer level leaves a Series keyed by course.
print(score_totals['Li'])
"""
name sex course  score
0   Li   男     PE   98.0
1   Li   男   MATH   60.0
2  liu   男   MATH   60.0
3   yu   男     PE  100.0
**************************************************
name  course
Li    MATH       60.0
      PE         98.0
liu   MATH       60.0
yu    PE        100.0
Name: score, dtype: float64
**************************************************
course
MATH    60.0
PE      98.0
Name: score, dtype: float64
"""

騷操做

# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
from pandas import Series,DataFrame
import pandas as pd
import pdb
addr = pd.Series([
    'Washington, D.C. 20003',
    'Brooklyn, NY 11211-1755',
    'Omaha, NE 68154',
    'Pittsburgh, PA 15211',
])
# Original note: Series._accessors holds the str, cat and dt accessors.
# str.upper() returns a new upper-cased Series; addr itself is not modified
# and the result is discarded here.
addr.str.upper()
print(addr.str.count(r'\d'))  # number of digits in each element

regex = (r'(?P<city>[A-Za-z ]+), '      # one or more letters or spaces
         r'(?P<state>[A-Z]{2}) '        # two upper-case letters
         r'(?P<zip>\d{5}(?:-\d{4})?)')  # five digits plus optional -NNNN extension
# regex=False: the dots must be removed literally, never read as the regex
# wildcard, regardless of the pandas version's default for `regex`.
parsed = addr.str.replace('.', '', regex=False).str.extract(regex)
print(parsed)

print([i for i in dir(pd.Series.str) if not i.startswith('_')])
"""
0    5
1    9
2    5
3    5
dtype: int64
         city state         zip
0  Washington    DC       20003
1    Brooklyn    NY  11211-1755
2       Omaha    NE       68154
3  Pittsburgh    PA       15211
['capitalize', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace', 'split', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'wrap', 'zfill']
"""

# Nine consecutive calendar quarter-end dates starting in 2017. An explicit
# offset object is used instead of the 'Q' string alias, which newer pandas
# versions deprecate; the generated dates are the same.
quarter_end = pd.offsets.QuarterEnd(startingMonth=12)
daterng = pd.Series(pd.date_range('2017', periods=9, freq=quarter_end))
print(daterng)
print(daterng.dt.day_name())
# Second half of the year: quarters 3 and 4.
print(daterng[daterng.dt.quarter > 2])
print(daterng[daterng.dt.is_year_end])
"""
Series.dt.day_name()：從日期判斷出所處星期數；

Series.dt.quarter：從日期判斷所處季度 (1–4)；

Series.dt.is_year_end：從日期判斷是否處在年末
"""

colors = pd.Series([
    'periwinkle',
    'mint green',
    'burnt orange',
    'periwinkle',
    'burnt orange',
    'rose',
    'rose',
    'mint green',
    'rose',
    'navy',
])
import sys
# Size reported by sys.getsizeof for each individual string.
print(colors.apply(sys.getsizeof))
# Give every distinct colour an integer code, in order of first appearance.
mapper = {color: code for code, color in enumerate(colors.unique())}
as_int = colors.map(mapper)
print(as_int)
print(as_int.apply(sys.getsizeof))
# Memory saving: compare the plain column with its categorical encoding.
primary_usage = colors.memory_usage(index=False, deep=True)
as_category = colors.astype('category')
category_usage = as_category.memory_usage(index=False, deep=True)
print('primary: {}\ncategory_usage: {}'.format(primary_usage,category_usage))
# Recorded output:
#   primary: 370
#   category_usage: 291
# The difference looks small here, so repeat the data several times and
# compare again.
manycolors = colors.repeat(10)
print(len(manycolors) / manycolors.nunique())

print(manycolors.memory_usage(index=False, deep=True))
print(manycolors.astype('category').memory_usage(index=False, deep=True))

數據清洗與準備

# 處理缺失數據
API 
dropna
fillna
isnull
notnull

# 過濾缺失值
data.dropna()   等價於 data[data.notnull()]

對於 DataFrame 還有 axis = 0 or 1、how = 'all' 等可選參數：how='all' 表示只有整行 (axis=1 時爲整列) 全爲 NaN 才刪除；thresh=2 表示至少要有 2 個非 NA 值才保留，否則刪除
df.dropna(axis=1,how='all')
df.fillna({1: 0.5, 2: 0}, inplace=True)  爲 1 列、2 列分別填充不同的默認值 (value 與 method 不能同時指定；要前向填充請另寫 df.ffill(limit=2))
fillna 參數 有：
value 標量值或字典型對象，用於填充缺失值
method 插值方法 ('ffill' / 'bfill')，默認爲 None，不能與 value 同時使用
axis  須要填充的軸， 默認 axis = 0
inplace 
limit 用於前向或後向填充時最大的填充範圍

### 查詢與刪除重複值
data.duplicated()
data.drop_duplicates(['col1','col2'], keep='last' or 'first')

### 使用函數或映射進行數據轉換
data = {
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 5, 12, 6, 7.5, 8, 3, 5, 6],
}
df = DataFrame(data)
print(df.head(10))
print('*' * 50)
# Lower-cased meat name -> animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Look each food up case-insensitively; anything not listed becomes 'unknown'.
df['meat_to_animall'] = [
    meat_to_animal.get(food.lower(), 'unknown') for food in df['food']
]
print(df.head(10))

### 替代值
data.replace(-999, np.nan)

### 重命名軸索引
data.index.map(lambda x: x[:4].upper())
data.rename(index=str.title, columns=str.upper)
data.rename(index={'old_idx':'new_idx'}, columns={'old_col':'new_col'}, inplace=True)


### 離散化和分箱  cut , qcut
pd.cut(ages, bins)

from pandas import Series,DataFrame
import pandas as pd

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 35, 50, 70, 90, 120]
# Default binning: intervals are closed on the right, e.g. (18, 35].
# .codes gives, for every age, the position of the bin it fell into.
cats = pd.cut(ages, bins)
print(cats.codes)

# Same edges, but intervals closed on the left, e.g. [18, 35).
left_closed = pd.cut(ages, bins, right=False)
df = DataFrame({'ages': ages})
df['ages_dicretes'] = left_closed.codes
print(df.head(10))

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.random.randn(1000, 4))
# Cap outliers: every value beyond +/-3 is replaced by 3 carrying its own sign.
beyond_limit = np.abs(data) > 3
data[beyond_limit] = np.sign(data) * 3
print(data.head(10))

# 置換和隨機抽樣
numpy.random.permutation
df.sample , series.sample

df = DataFrame(np.arange(20).reshape(5, 4))
print(df)
print('*' * 50)
# A random ordering of the five row positions; take() returns the rows in
# that order.
sampler = np.random.permutation(5)
shuffled = df.take(sampler)
print(shuffled)


# 組合使用
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

def pretty_print(text='',star_cnt=20):
    """Print ``text`` framed by ``star_cnt`` asterisks and one space per side."""
    stars = '*'*star_cnt
    print('{stars} {text} {stars}'.format(text=text,stars=stars))

# Fixed seed so the ten uniform samples are reproducible.
np.random.seed(12345)
values = np.random.rand(10)
print(values)
pretty_print('離散分箱')
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Bin the values, then one-hot encode the bins. dtype=int keeps the 0/1
# indicator layout; newer pandas versions otherwise produce True/False.
df = pd.get_dummies(pd.cut(values,bins), dtype=int)
print(df.head(10))

"""
[0.92961609 0.31637555 0.18391881 0.20456028 0.56772503 0.5955447
 0.96451452 0.6531771  0.74890664 0.65356987]
******************** 離散分箱 ********************
   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           0           1
1           0           1           0           0           0
2           1           0           0           0           0
3           0           1           0           0           0
4           0           0           1           0           0
5           0           0           1           0           0
6           0           0           0           0           1
7           0           0           0           1           0
8           0           0           0           1           0
9           0           0           0           1           0
"""

### 向量化 字符串函數
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print a banner: ``star_cnt`` asterisks, a space, ``text``, a space, asterisks."""
    edge = '*' * star_cnt
    print(f'{edge} {text} {edge}')

# Single-row frame whose three columns each hold the same 12-digit string.
data = {'col1':['001100110111'],
        'col2': ['001100110111'],
        'col3': ['001100110111']}
df = DataFrame(data)
print(df.head(10))
pretty_print('華麗的分割線')
# Row by row (axis=1), split every 12-digit string into four named 3-digit
# groups with str.extract. Per the recorded output that follows this snippet,
# df2.values holds a single DataFrame with one row per original column.
# NOTE(review): the shape of an apply() result whose function returns a
# DataFrame may differ between pandas versions - confirm on the target version.
df2 = df.apply(lambda s:s.str.extract(r'(?P<nums_1>\d{3})(?P<nums_2>\d{3})(?P<nums_3>\d{3})(?P<nums_4>\d{3})') ,axis=1)
print(df2.values)
"""
           col1          col2          col3
0  001100110111  001100110111  001100110111
******************** 華麗的分割線 ********************
[     nums_1 nums_2 nums_3 nums_4
col1    001    100    110    111
col2    001    100    110    111
col3    001    100    110    111]
"""

# 部分向量化字符串方法列表
cat 根據可選的分隔符按元素黏合字符串
contains 返回是否含有某個模式 / 正則表達式的 布爾值數組
count 模式出現次數的計數
extract 使用正則表達式從字符串Series 中分組抽取一個 或多個字符串， 返回的結果是 每一個分組造成一列的 DataFrame
endswith  等價於對每一個元素使用 x.endswith 
startswith 等價於 對每一個元素使用 x.startswith
findall  找出字符串中全部的 模式 / 正則表達式 匹配項 ，以列表返回
get 對每一個元素進行索引 （得到第 i 個元素）
isalnum
isalpha
isdecimal
isdigit
islower
isnumeric
isupper
join
len
lower, upper
match
pad
center
repeat
replace
slice

split
strip
rstrip
lstrip

第八章數據規整，鏈接，聯合重塑

# 分層索引  部分索引
分層索引容許你在一個軸向上擁有多個 (兩個或兩個以上) 索引層級，籠統地說，分層索引提供了一種在更低維度的形式中處理更高維度數據的方法。

# -*- coding: utf-8 -*-
__author__ = 'Frank Li'
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print ``text`` between two runs of ``star_cnt`` asterisks, space-separated."""
    border = star_cnt * '*'
    print(border, text, border)


# Two parallel label lists form a two-level (hierarchical) index.
outer_keys = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd']
inner_keys = [1, 2, 3, 1, 3, 1, 2, 2, 3]
data = pd.Series(np.random.randn(9), index=[outer_keys, inner_keys])
print(data.head(10))
pretty_print('華麗的分割線')
# Partial indexing: select everything under outer label 'b'.
print(data['b'])
pretty_print('華麗的分割線')
# unstack() pivots the inner level into columns; stack() reverses it.
wide = data.unstack()
print(wide)
pretty_print('華麗的分割線')
print(wide.stack())

#  重排序 和 層級排序
swaplevel   sort_index
# 按 層級進行彙總統計

# 使用 DataFrame 的列 進行索引

set_index()   將一個或多個列設爲 (分層) 索引
reset_index()  反操作，分層索引會被移動到列中

# 聯合與合併數據集 
相似於 SQL 表關聯操做  merge
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print a separator line: asterisks, space, ``text``, space, asterisks.

    ``star_cnt`` is the number of asterisks printed on each side.
    """
    edge = '*' * star_cnt
    print(f'{edge} {text} {edge}')


df1 = pd.DataFrame({'key1': list('bbacaab'),
                    'data1': range(7)})
df2 = pd.DataFrame({'key1': list('abd'),
                    'data2': range(3)})

# SQL-style join on the key column. `how` may be inner, left, right or outer.
# To join on several columns pass on=['key1', 'key2']; `suffixes` renames
# columns that exist in both frames; left_index=True / right_index=True join
# on the index instead of a column.
df3 = df1.merge(df2, left_on='key1', right_on='key1', how='inner',
                suffixes=('_left', '_right'))

print(df3.head(10))

# join() matches rows by index rather than by a key column.
df4 = df1.join(df2, how='inner', lsuffix='_left', rsuffix='_right')
pretty_print('華麗的分割線')
print(df4.head(10))

left.join([right1, right2, right3], how='outer')  join 的 how 默認是 'left'

concat  相似於  union all 其實又不單單是 

arr = np.arange(12).reshape(3, 4)
print(arr)
pretty_print('華麗的分割線')
# axis=1 joins the two copies side by side, axis=0 stacks them vertically.
pair = [arr, arr]
result = np.concatenate(pair, axis=1)
result_2 = np.concatenate(pair, axis=0)
print(result)
pretty_print('華麗的分割線')
print(result_2)

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print ``text`` framed by ``star_cnt`` asterisks and one space on each side."""
    border = star_cnt * '*'
    print(border, text, border)

def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` frame of consecutive integers starting at 0.

    The numbers fill the frame row by row; the index is ``row_0``, ``row_1``,
    ... and the columns are ``col_0``, ``col_1``, ...
    """
    row_labels = ['row_{}'.format(r) for r in range(rows)]
    col_labels = ['col_{}'.format(c) for c in range(cols)]
    grid = np.arange(rows * cols).reshape(rows, cols)
    return DataFrame(grid, index=row_labels, columns=col_labels)

arr = generate_df(3, 4)
# Three Series with pairwise disjoint labels.
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

parts = [s1, s2, s3]
# axis=0 appends the Series one after another.
print(pd.concat(parts, axis=0, sort=False))
pretty_print('華麗的分割線')
# axis=1 makes each Series its own column, aligned on the union of labels.
print(pd.concat(parts, axis=1, sort=False))
"""
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
******************** 華麗的分割線 ********************
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
"""

# s4 carries the labels of both s1 and s3.
s4 = pd.concat([s1, s3])
# join may be 'outer' or 'inner'; 'inner' keeps only labels found in both inputs.
shared = pd.concat([s1, s4], axis=1, join='inner', sort=False)
print(shared)

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print a banner line made of asterisks, ``text`` and asterisks.

    Each side gets ``star_cnt`` asterisks and is separated from the text by
    one space.
    """
    edge = '*' * star_cnt
    print(f'{edge} {text} {edge}')

def generate_df(rows, cols):
    """Return a labelled ``rows`` x ``cols`` DataFrame of 0 .. rows*cols-1.

    Integers are placed row by row; index labels are ``row_<i>`` and column
    labels are ``col_<j>``.
    """
    grid = np.arange(rows * cols).reshape(rows, cols)
    return DataFrame(
        grid,
        index=['row_%d' % r for r in range(rows)],
        columns=['col_%d' % c for c in range(cols)],
    )

arr = generate_df(3,4)
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

# keys adds an outer index level naming the Series each value came from.
s5 = pd.concat([s1,s2,s3],axis=0,keys=['one', 'two', 'three'],sort=False)
print(s5)
pretty_print('華麗的分割線')
# The recorded output shows a wide table (one row per key) after the
# separator; unstack() pivots the inner labels into columns to produce it.
print(s5.unstack())
"""
one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64
******************** 華麗的分割線 ********************
         a    b    c    d    e    f    g
one    0.0  1.0  NaN  NaN  NaN  NaN  NaN
two    NaN  NaN  2.0  3.0  4.0  NaN  NaN
three  NaN  NaN  NaN  NaN  NaN  5.0  6.0

"""

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='',star_cnt=20):
    """Print ``text`` framed by ``star_cnt`` asterisks and one space per side."""
    stars = '*'*star_cnt
    print('{stars} {text} {stars}'.format(text=text,stars=stars))

def generate_df(rows, cols):
    """Return a ``rows`` x ``cols`` DataFrame holding 0 .. rows*cols-1.

    Values are laid out row by row; the index is ``row_<i>`` and the columns
    are ``col_<j>``.
    """
    data = np.arange(rows*cols).reshape((rows,cols))
    columns = ['col_'+str(i) for i in range(cols)]
    indices = ['row_'+str(j) for j in range(rows)]
    return DataFrame(data,columns=columns,index=indices)

df1 = generate_df(3,2)
# Vectorised addition replaces the element-wise applymap(lambda x: x+5):
# same result, and it does not rely on the deprecated DataFrame.applymap.
df2 = generate_df(2,2) + 5
print(df1)
pretty_print('華麗的分割線')
print(df2)
pretty_print('華麗的分割線')
# Side-by-side concat; keys adds an outer column level naming each source frame.
print(pd.concat([df1,df2],axis=1,sort=False,keys=('lvl1','lvl2')))
"""
       col_0  col_1
row_0      0      1
row_1      2      3
row_2      4      5
******************** 華麗的分割線 ********************
       col_0  col_1
row_0      5      6
row_1      7      8
******************** 華麗的分割線 ********************
       lvl1        lvl2      
      col_0 col_1 col_0 col_1
row_0     0     1   5.0   6.0
row_1     2     3   7.0   8.0
row_2     4     5   NaN   NaN
"""

聯合重疊數據

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print ``text`` between two runs of ``star_cnt`` asterisks, space-separated."""
    border = star_cnt * '*'
    print(border, text, border)

def generate_df(rows, cols):
    """Build a ``rows`` x ``cols`` frame of consecutive integers starting at 0.

    The numbers fill the frame row by row; rows are named ``row_<i>`` and
    columns ``col_<j>``.
    """
    row_labels = ['row_{}'.format(r) for r in range(rows)]
    col_labels = ['col_{}'.format(c) for c in range(cols)]
    grid = np.arange(rows * cols).reshape(rows, cols)
    return DataFrame(grid, index=row_labels, columns=col_labels)

# Two Series sharing the same labels: a has gaps, b is fully populated.
labels = ['f', 'e', 'd', 'c', 'b', 'a']
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=labels)
b = pd.Series(np.arange(len(a), dtype=np.float64), index=labels)
print(a)
pretty_print('華麗的分割線')
print(b)
pretty_print('華麗的分割線')
# Element-wise choice: take b where a is missing, otherwise keep a.
a_is_missing = pd.isnull(a)
print(np.where(a_is_missing, b, a))
"""
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
******************** 華麗的分割線 ********************
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64
******************** 華麗的分割線 ********************
[0.  2.5 2.  3.5 4.5 5. ]
"""
# Fill a's missing values from b, aligned by label. b has no gaps, so
# b.combine_first(a) would simply print b back unchanged.
print(a.combine_first(b))

8.3 重中之重重塑或透視

stack 
unstack
每每結合 分層索引來作

pivot <==> 等價於  set_index 建立分層索引， 而後調用 unstack 拆堆
pivot 反過來操做 就是 pd.melt

import pandas as pd
import numpy as np

"""
對比 某一行 第二三列的 差值 == 下一行 第一列的值，找出這樣的行
"""

# 20 x 3 frame: col_0 is all zeros, col_1 and col_2 are all ones.
data = np.zeros((20,3))
df = pd.DataFrame(data, columns=['col_'+str(i) for i in range(3)], index=['row_'+str(i) for i in range(20)])
df.iloc[:, 1:] = 1

print(df.head(10))

print('{stars} {text} {stars}'.format(stars='*'*20,text='華麗的分割線'))
def func(row):
    """Return the absolute difference between a row's second and third values.

    Values are taken by position (.iloc), because plain ``row[1]`` on a
    label-indexed Series is a label lookup in newer pandas versions.
    """
    return abs(row.iloc[1] - row.iloc[2])

df['col_3'] = df.apply(func, axis=1)
# shift(-1) lines each row up with the NEXT row's col_0, which is what the
# task calls for (shift(1) would compare against the previous row). The last
# row has no successor, so it compares against NaN and is never selected.
df['col_4'] = df['col_3'] == df['col_0'].shift(-1)

# Keep only the rows where the comparison held.
df = df[df['col_4']]
print(df.head(20))

判斷相似 gene_1|gene_2 ==> gene_1 = 1 , gene_2 = 1 gene_3 = 0....

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import re

def pretty_print(text='', star_cnt=20):
    """Print a separator line: asterisks, space, ``text``, space, asterisks.

    ``star_cnt`` is the number of asterisks on each side.
    """
    edge = '*' * star_cnt
    print(f'{edge} {text} {edge}')

def generate_df(rows, cols):
    """Return a labelled ``rows`` x ``cols`` DataFrame of 0 .. rows*cols-1.

    Integers are placed row by row; index labels are ``row_<i>`` and column
    labels are ``col_<j>``.
    """
    grid = np.arange(rows * cols).reshape(rows, cols)
    return DataFrame(
        grid,
        index=['row_%d' % r for r in range(rows)],
        columns=['col_%d' % c for c in range(cols)],
    )

df = generate_df(3,6)
print(df.head(10))
# col_1 holds a '|'-separated list of the column names to flag for that row.
df['col_1'] = 'col_2|col_4'
# .ix has been removed from pandas. With string row labels its integer slice
# 1:2 was positional (second row only), so .iloc with the column's position
# is the direct replacement.
# NOTE(review): if rows 1 and 2 were both intended, use 1:3 - confirm.
df.iloc[1:2, df.columns.get_loc('col_1')] = 'col_3|col_5'

# Every column except the key column col_0 is a candidate to be flagged.
cols = [col for col in df.columns if col!='col_0']

def do_apply(row):
    """Flag the columns named in ``row['col_1']`` for this row's key.

    ``row['col_1']`` is a '|'-separated list of column names. For every listed
    name that is also in the module-level ``cols``, each row of the global
    ``df`` whose ``col_0`` equals ``row['col_0']`` gets that column set to 1.
    Columns that are not listed keep their current values. Returns None.
    """
    wanted = set(row['col_1'].split('|'))  # split once, not once per column
    key = row['col_0']
    for col in cols:
        if col in wanted:
            df.loc[df['col_0'] == key, col] = 1

# Walk a snapshot of the two columns do_apply reads, so df is never modified
# while pandas is iterating over it (DataFrame.apply does not support
# functions that mutate the frame they are applied to).
for _, row in df[['col_0', 'col_1']].copy().iterrows():
    do_apply(row)
print('df shape: {}'.format(df.shape))
print('df 前十行: ',df.head(10))