python | 數據分析（二）- Pandas數據包

時間 2019-11-22

標籤 python 數據分析 pandas 欄目 Python 简体版

原文原文鏈接

Pandas 是基於 NumPy 的另一個 Python 數據分析庫，提供了名爲 DataFrame 的數據結構，用於實現高性能的數據操作和分析。

Pandas 的主要特點：

　　快速高效的 DataFrame 對象，具有默認索引和自定義索引；

　　將數據從不同文件格式加載到內存數據對象中的工具；

　　數據對齊以及對缺失數據的綜合處理；

　　數據集的重塑（reshape）與透視（pivot）；

　　基於標籤的切片、索引以及大數據集的子集選取；

　　可以刪除或插入數據結構中的列；

　　按數據分組進行聚合和轉換；

　　高性能的數據合併與連接（merge / join）；

　　時間序列功能；

詳細教程： https://www.yiibai.com/pandas/

10 Minutes to pandas: http://pandas.pydata.org/pandas-docs/stable/10min.html

以下爲代碼筆記：

# Imports used by the snippets below
import os
import sqlite3

import numpy as np
import pandas as pd

#1 Basic DataFrame operations
# Build a 6x4 frame of standard-normal samples with columns A-D.
df = pd.DataFrame(np.random.randn(6, 4), columns=list('ABCD'))

df.dtypes               # dtype of each column
df.head(3)              # first three rows
df.tail(5)              # last five rows
df.describe()           # descriptive statistics
df.T                    # transpose
df.sort_values(by='C')  # sort rows by column C (DataFrame.sort no longer exists)
df.iloc[1:3, :]         # positional slice: rows 1-2, all columns

#2 Filtering rows
# Keep the rows where D is positive and C is negative.
df.loc[(df.D > 0) & (df.C < 0)]              # combine several conditions
df.loc[(df.D > 0) & (df.C < 0), ['A', 'B']]  # same rows, only columns A and B

#3 Reading CSV data
os.getcwd()     # current working directory
df = pd.read_csv('self/…', engine='python', encoding='gbk')  # load the file

#4 Selecting data
# A list of labels selects several columns. Writing the two literals side by
# side (without a comma) would concatenate them into one column label.
df[[u'專業名稱', u'學號']][:3]  # first three rows of the two columns

#5 Counting values
counts = df[u'專業名稱'].value_counts()  # occurrences of each distinct value in the column

#6 Grouping data
# Sample data: two key columns (A, B) and two random value columns (C, D).
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'three', 'one', 'two', 'one', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

# Group by the (A, B) pairs.
grouped = df.groupby(['A', 'B'])
print(grouped.last())  # last row of each group

#按函數分組
def get_type(letter):
    """Classify a letter as a vowel or a consonant.

    Args:
        letter: A string; the comparison is case-insensitive.

    Returns:
        'vowel' if ``letter`` is exactly one of a, e, i, o, u; otherwise
        'consonant' (this includes the empty string and longer strings).
    """
    # The previous check used the string 'abem', which labelled b and m as
    # vowels and missed i, o and u. Tuple membership also avoids substring
    # matches such as '' or 'ei'.
    if letter.lower() in ('a', 'e', 'i', 'o', 'u'):
        return 'vowel'
    return 'consonant'

# Group along axis 1 (the columns), using the label get_type returns for each
# column name as the group key, then print the first entry of each group.
# NOTE(review): recent pandas releases deprecate axis=1 in groupby — confirm
# against the installed version.
grouped = df.groupby(get_type, axis = 1)
print (grouped.first())

#7 transformation: standardising data
# Turn a column into standard scores (mean 0, standard deviation 1) per group.

# Series of 100 random draws indexed by consecutive daily timestamps.
index = pd.date_range('1/1/2014', periods=100)
ts = pd.Series(data=np.random.normal(0.5, 2, 100), index=index)


def key(x):
    """Return the month of ``x``; used as the grouping key."""
    return x.month


def zscore(x):
    """Return ``x`` centred on its mean and divided by its standard deviation."""
    centred = x - x.mean()
    return centred / x.std()


transformed = ts.groupby(key).transform(zscore)

# Per-month statistics of the standardised values.
monthly = transformed.groupby(key)
print(monthly.mean())
print(monthly.std())

#8 agg: several aggregations per group
# Build a sample DataFrame first.
import numpy as np
# pandas.util.testing is no longer available in pandas 2.x. It was only used
# here to reach NumPy (tm.np), so NumPy's random generator is called directly.

colors = np.random.choice(['red', 'green'], size=10)
foods = np.random.choice(['egg', 'ham'], size=10)

index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
df = pd.DataFrame(np.random.randn(10, 2), index=index)
df.columns = ['a', 'b']


grouped = df.groupby(level='color')

# Sum, mean and standard deviation of every column, per colour.
print(grouped.agg(['sum', 'mean', 'std']))

# The same three statistics for column a only.
a_stats = grouped['a'].agg(['sum', 'mean', 'std'])
# Custom result column titles: named aggregation replaces the old
# dict-of-names form, which current pandas rejects.
a_named = grouped['a'].agg(**{'SUM result': 'sum', 'Mean result': 'mean'})
# An arbitrary function, here the mean absolute value, under a chosen title.
a_lambda = grouped['a'].agg(**{'lambda': lambda x: np.mean(abs(x))})

#9 Grouping by month
def key(x):
    """Return the month of ``x``; used as the grouping key."""
    return x.month


grouped = ts.groupby(key)

# NOTE(review): `date` and `data` are not defined anywhere in these notes —
# presumably equal-length sequences of dates and values; confirm before running.
df = pd.DataFrame({'date': date, 'data': data})
# Group the rows by the month of the date column and print the GroupBy object.
print(df.groupby(df['date'].apply(key)))

#10 Converting date strings to datetimes
# Example input; the original notes left the actual values out.
date_stngs = ('2008-12-20', '2008-12-21', '2008-12-22', '2008-12-23')
# to_datetime parses the whole sequence in one call.
a = pd.Series(pd.to_datetime(list(date_stngs)))

#11 Adding, copying, moving and deleting columns
# NOTE(review): assumes df has columns a and b and 10 rows — confirm which
# frame this section is meant to run against.
df['c'] = pd.Series(np.random.randn(10), index=df.index)  # add a column
df.insert(1, 'e', df['a'])  # insert a copy of column a: (position, name, values)

# Move column b to the front: pop removes and returns it, insert puts it back
# at position 0. This must run before b is dropped, otherwise pop('b') raises
# KeyError.
b = df.pop('b')
df.insert(0, 'b', b)

df = df.drop(['a', 'b'], axis=1)  # drop columns a and b

#12 Series: creating indexed data
# Series is reached through the pd namespace; the bare name is never imported.
ser1 = pd.Series([1, 2, 3, 4])                               # default integer index
ser2 = pd.Series(range(4), index=["a", "b", "c", "d"])       # custom index

# Built from a dict: the keys become the index.
ser3 = pd.Series({'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000})

可以通過索引訪問值，也可以通過 ser.values / ser.index 獲取全部值 / 全部索引。

#13 String operations
s = pd.Series(list('ABCDEF'))
print(s)

# The .str accessor applies the usual string methods element by element.
s.str.upper()
s.str.len()
s.str.split('_')
# regex=True is spelled out: pandas 2.0 made literal matching the default,
# which would treat '^a|b$' as plain text and replace nothing.
replaced = s.str.replace('^a|b$', 'X', case=False, regex=True)

# Example pattern; the original notes used an undefined placeholder here.
condition = '[A-C]'

s.str.extract('()()')                # extraction: each pair of parentheses is one capture group
s.str.contains(condition, na=False)  # whether the pattern occurs; missing values count as False
s.str.match(condition)               # whether the pattern matches at the start (as_indexer no longer exists)
s.str.startswith('A')
s.str.endswith('F')

#14 Reading from and writing to a SQL database
# read_sql takes a SQL statement and a connection (con) and returns a DataFrame.
con = sqlite3.connect("xx.sqlite")
sql = "select * from weather_2012 LIMIT 3"
df = pd.read_sql(sql, con, index_col='id')  # use the id column as the index

# Writing data
con2 = sqlite3.connect("xx.sqlite")
# Plain ASCII quotes are required for string literals.
con2.execute("drop table if exists weather_2012")
# DataFrame.to_sql replaces pd.io.sql.write_frame, which no longer exists.
df.to_sql("weather_2012", con2)

#15 Broadcasting
# Apply the same operation to every element of the matrix.
df = pd.DataFrame({'one': pd.Series(np.random.randn(4), index=list('abcd'))})
df['two'] = 1
df['thr'] = 2

# Take one row and one column.
row = df.iloc[1]      # second row by position (.ix no longer exists); iloc[1, :-1] would omit the last column
columns = df['two']

# Subtract row from every row of df.
diff = df.sub(row, axis='columns')  # axis names the dimension the Series is matched against
print(diff)

#16 Handling missing values
# In simple arithmetic a missing input gives a missing result at the same position.
df.fillna(0)                          # fill with 0 (a string etc. works too)
df.bfill(limit=1)                     # fill from the following value, at most one consecutive gap
df.fillna(df.mean())                  # fill each column with its own mean
# A list selects several entries of the mean Series; a bare 'one', 'two'
# would be treated as a single tuple key and raise KeyError.
df.fillna(df.mean()[['one', 'two']])  # fill only columns one and two
df.interpolate()                      # estimate by interpolation, linear by default; method='values'/'time' also exist

df.dropna(axis=0)                     # drop rows with missing values; axis=1 drops columns

#17 Replacing values
# Example data; the original notes elided the remaining values.
ser = pd.Series([0, 1, 2, 3, 4])
ser.replace(0, 6)               # replace one value with another
ser.replace({1: 11, 2: 12})     # dict mapping: old value -> new value

# The same works on DataFrame objects.
df[['a', 'b']].replace(2, 10)   # replace within the selected columns only
# To replace a different value in each column with one common value, give a
# dict of column -> value to replace, followed by the replacement.
df.replace({'a': 0, 'b': 5}, np.nan)

備註：目前寫博客是爲了進行知識和筆記梳理。博客本身可能還存在一些錯誤，如有發現，敬請斧正，謝謝。