Simple arithmetic with Series and DataFrame:
import numpy as np
import pandas as pd
Series arithmetic:
m = np.nan
n = 1
m + n   # NaN plus any number is NaN
---------------
nan
s1 = pd.Series([1,2,3],index=['A','B','C'])
s2 = pd.Series([4,5,6,7],index=['B','C','D','E'])
print(s1)
print(s2)
---------------------------------------------
A    1
B    2
C    3
B    4
C    5
D    6
E    7
Where the indexes match, the values are added; indexes without a match become NaN.
s3 = s1 + s2
print(s3)
-------------------
A    NaN
B    6.0
C    8.0
D    NaN
E    NaN
DataFrame arithmetic:
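The definitions of df1 and df2 are not included in these notes; a minimal sketch that reproduces the output below (index and column names are assumptions inferred from that output):

df1 = pd.DataFrame(np.arange(4).reshape(2,2), index=['A','B'], columns=['SX','GX'])    # assumed definition
df2 = pd.DataFrame(np.arange(9).reshape(3,3), index=['A','B','C'], columns=['SX','GX','HN'])  # assumed definition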
df3 = df1 + df2
print(df3)
---------------------
    GX   HN   SX
A  2.0  NaN  0.0
B  7.0  NaN  5.0
C  NaN  NaN  NaN
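The df3 used in the sum examples that follow has columns c1, c2 and c3, so it is not the frame just shown; its definition is missing here. A sketch whose values are assumptions chosen to be consistent with the printed sums:

df3 = pd.DataFrame({'c1': [1, 4, 7],
                    'c2': [2, 5, 8],
                    'c3': [3, np.nan, 9]},
                   index=['A', 'B', 'C'])   # assumed values, matching the outputs below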
s1 = df3.sum()   # sum of each column
print(s1)
------------------------
c1    12.0
c2    15.0
c3    12.0

s2 = df3.sum(axis=1)   # sum of each row; the default is axis=0
print(s2)
------------------------------------
A     6.0
B     9.0
C    24.0

s1 = df3.sum(axis=0, skipna=False)   # skipna controls whether NaN values are skipped
print(s1)
------------------------------------
c1    12.0
c2    15.0
c3     NaN
Sorting Series and DataFrames:
Sorting a Series:
s1 = pd.Series(np.random.randn(10))
print(s1)
------------------------------
0   -0.228264
1    0.087456
2   -0.717422
3    0.126064
4   -0.190748
5   -0.600853
6    0.201084
7    0.152117
8   -0.349236
9   -0.120914
s2 = s1.sort_values(ascending=False)   # sort by values; ascending defaults to True, False sorts descending
print(s2)
-------------------------------
6    0.201084
7    0.152117
3    0.126064
1    0.087456
9   -0.120914
4   -0.190748
0   -0.228264
8   -0.349236
5   -0.600853
2   -0.717422

s1.sort_values(ascending=False, inplace=True)   # inplace controls whether the original is modified; default is False
print(s1)
--------------------------------
6    0.201084
7    0.152117
3    0.126064
1    0.087456
9   -0.120914
4   -0.190748
0   -0.228264
8   -0.349236
5   -0.600853
2   -0.717422

s1.sort_index()   # sort by index
------------------------
0   -0.228264
1    0.087456
2   -0.717422
3    0.126064
4   -0.190748
5   -0.600853
6    0.201084
7    0.152117
8   -0.349236
9   -0.120914
Sorting a DataFrame:
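The df1 sorted below is not defined in this snippet; a minimal sketch (shape and column names are assumptions):

df1 = pd.DataFrame(np.random.randn(6, 4), columns=['A', 'B', 'C', 'D'])   # assumed definition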
df2 = df1['A'].sort_values()   # take column A and sort its values ascending
print(df2)
---------------------------------
4   -1.146598
1   -0.334181
2   -0.165665
3    0.650646
0    0.771585
5    1.973264

df2.sort_index()
-------------------
0    0.771585
1   -0.334181
2   -0.165665
3    0.650646
4   -1.146598
5    1.973264
Merging DataFrames:
import pandas as pd
import numpy as np
To merge two DataFrames they must share a common column, otherwise an error is raised; the common column must also contain at least one matching value, otherwise the merge result is empty.
df1 = pd.DataFrame({"key1":["A","B","C"],"key":["X","Y","Z"],"values1":[1,2,3]}) df2 = pd.DataFrame({"key1":["A","B","C"],"key":["X","Y","Z"],"values2":[4,5,6]}) print(df1) print(df2) pd.merge(df1,df2) #融合 ------------------------------------ key1 key values1 0 A X 1 1 B Y 2 <-----df1 2 C Z 3 key1 key values2 0 A X 4 1 B Y 5 <-----df2 2 C Z 6
pd.merge(df1,df2,on='key')   # specify which common column to merge on
df1 = pd.DataFrame({"key":["X","Y","Z"],"values1":[1,2,3]}) df2 = pd.DataFrame({"key":["X","B","C"],"values2":[4,5,6]}) print(df1) print(df2) pd.merge(df1,df2) ---------------------------------- key values1 0 X 1 1 Y 2 <-----df1 2 Z 3 key values2 0 X 4 1 B 5 <-----df2 2 C 6
Concatenation and combination:
Concatenating numpy arrays
import numpy as np
import pandas as pd
arr1 = np.arange(9).reshape(3,3)
arr2 = arr1
print(np.concatenate([arr1,arr2]))          # axis=0 by default: stack vertically
print(np.concatenate([arr1,arr2],axis=1))   # stack horizontally
--------------------------------------------------------------
[[0 1 2]
 [3 4 5]
 [6 7 8]
 [0 1 2]
 [3 4 5]
 [6 7 8]]
[[0 1 2 0 1 2]
 [3 4 5 3 4 5]
 [6 7 8 6 7 8]]
Horizontal concatenation requires the same number of rows; vertical concatenation requires the same number of columns; otherwise an error is raised.
Concatenating Series with pd.concat
s1 = pd.Series([1,2,3],index=['X','Y','Z'])
s2 = pd.Series([4,5],index=['A','B'])
print(pd.concat([s1,s2]))
print(pd.concat([s2,s1]))
--------------------------------------
X    1
Y    2
Z    3    <----- s1,s2
A    4
B    5
A    4
B    5
X    1    <----- s2,s1
Y    2
Z    3
pd.concat([s1,s2],axis=1,sort=True)
Concatenating DataFrames with pd.concat
df1 = pd.DataFrame(np.random.randn(4,3),columns=["X","Y","Z"])
df2 = pd.DataFrame(np.random.randn(3,3),columns=["X","Y","A"])
print(df1)
print(df2)
---------------------------------------------
print(pd.concat([df1,df2],sort=True))
print(pd.concat([df1,df2],axis=1))
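The outputs of the two concat calls above are not reproduced here; a quick sketch of what to expect with the frames defined above:

print(pd.concat([df1, df2], sort=True).shape)   # (7, 4): rows stacked, columns are the union A/X/Y/Z, gaps become NaN
print(pd.concat([df1, df2], axis=1).shape)      # (4, 6): frames side by side, aligned on the row index; df2's missing row 3 becomes NaN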
Combining Series with combine_first
s1 = pd.Series([2,np.nan,4,np.nan],index=["A","B","C","D"])
s2 = pd.Series([1,2,3,4],index=["A","B","C","D"])
print(s1)
print(s2)
-----------------------------
A    2.0
B    NaN    <----- s1
C    4.0
D    NaN
A    1
B    2    <----- s2
C    3
D    4
s1.combine_first(s2)   # combine s2 into s1: wherever s1 is NaN, the value at the same position in s2 is used
------------------------------
A    2.0
B    2.0
C    4.0
D    4.0
s2.combine_first(s1)
-------------------------
A    1.0
B    2.0
C    3.0
D    4.0
Combining DataFrames with combine_first
df1 = pd.DataFrame({"X":[1,np.nan,3,np.nan], "Y":[5,np.nan,7,np.nan], "Z":[9,np.nan,11,np.nan]}) df2 = pd.DataFrame({"Z":[np.nan,10,np.nan,12], "A":[1,2,3,4]}) print(df1) print(df2)
df1.combine_first(df2)   # combine df2 into df1: df1's values take priority and its NaNs are filled from df2
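The result is not printed above; roughly what combine_first should return here: column A comes entirely from df2, the gaps in Z are filled, and X and Y keep their NaNs because df2 has no such columns.

print(df1.combine_first(df2))
# expected result:
#      A    X    Y     Z
# 0  1.0  1.0  5.0   9.0
# 1  2.0  NaN  NaN  10.0
# 2  3.0  3.0  7.0  11.0
# 3  4.0  NaN  NaN  12.0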
Data preprocessing with apply:
# copy the following data to the clipboard
         time                               data
0  1473411962  Symbol: APPL Seqno: 0 Price: 1623
1  1473411962  Symbol: APPL Seqno: 0 Price: 1623
2  1473411963  Symbol: APPL Seqno: 0 Price: 1623
3  1473411963  Symbol: APPL Seqno: 0 Price: 1623
4  1473411963  Symbol: APPL Seqno: 1 Price: 1649
import numpy as np
import pandas as pd
df = pd.read_clipboard()   # read the copied data from the clipboard
print(df)
df.to_csv("apply_demo.csv") df = pd.read_csv("apply_demo.csv")
s1 = pd.Series(['x']*5)
df['X'] = s1.values   # add a new column to df whose contents are the values of s1
df
df['X'] = df['X'].apply(str.upper)   # pass every value of column X to str.upper
df
l1 = df["data"][0] print(l1) l1 = df["data"][0].split(" ") print(l1) -------------------------- 'Symbol: APPL Seqno: 0 Price: 1623' ['Symbol:', 'APPL', 'Seqno:', '0', 'Price:', '1623']
def foo(line):
    items = line.split(' ')
    return pd.Series([items[1], items[3], items[5]])
df_tmp = df['data'].apply(foo)   # pass each value of the data column to foo
df_tmp
df_tmp = df_tmp.rename(columns={0:'Symbol',1:'Seqno',2:'Price'})   # rename the columns (rename returns a copy, so reassign)
df_tmp
df_new = df.combine_first(df_tmp)   # combine df_tmp into df
df_new
del df_new['data']
del df_new['X']
df_new
Data cleaning by removing duplicates
import numpy as np
import pandas as pd
This continues with the df data from above.
df['Seqno'].unique()   # unique drops the duplicate values in the Seqno column and returns the distinct values
------------------------- [0.,1.]
df["Seqno"].duplicated() #查看是否爲重複值 df.drop_duplicates(['Seqno']) #去除Seqno有重複值的行
Basics of time series operations
import numpy as np
import pandas as pd
from datetime import datetime
date_list = [datetime(2017,3,1),datetime(2018,3,1),datetime(2018,6,20),
             datetime(2018,7,2),datetime(2018,10,1),datetime(2018,10,9)]
s1 = pd.Series(np.random.randn(6),index=date_list)
print(s1)
---------------------------------------
2017-03-01    0.664216
2018-03-01    0.259339
2018-06-20    0.710367
2018-07-02    0.920019
2018-10-01   -0.297789
2018-10-09    1.337095
print(s1['2018-10-09'])
print(s1['2018-10'])
print(s1['2018'])
-----------------------------
1.3370947151802048
2018-10-01   -0.297789
2018-10-09    1.337095
2018-03-01    0.259339
2018-06-20    0.710367
2018-07-02    0.920019
2018-10-01   -0.297789
2018-10-09    1.337095
# generate a range of dates in bulk
date_list_new = pd.date_range(start='2018-01-01',end='2018-01-31')
print(date_list_new)
s2 = pd.Series(np.random.rand(),index=date_list_new)   # a single random number is broadcast to every date
print(s2)
------------------------------------------
['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
 '2018-01-06', '2018-01-07', '2018-01-08', '2018-01-09', '2018-01-10',
 '2018-01-11', '2018-01-12', '2018-01-13', '2018-01-14', '2018-01-15',
 '2018-01-16', '2018-01-17', '2018-01-18', '2018-01-19', '2018-01-20',
 '2018-01-21', '2018-01-22', '2018-01-23', '2018-01-24', '2018-01-25',
 '2018-01-26', '2018-01-27', '2018-01-28', '2018-01-29', '2018-01-30',
 '2018-01-31']
2018-01-01    0.810988
2018-01-02    0.810988
2018-01-03    0.810988
2018-01-04    0.810988
2018-01-05    0.810988
2018-01-06    0.810988
2018-01-07    0.810988
2018-01-08    0.810988
2018-01-09    0.810988
2018-01-10    0.810988
2018-01-11    0.810988
2018-01-12    0.810988
2018-01-13    0.810988
2018-01-14    0.810988
2018-01-15    0.810988
2018-01-16    0.810988
2018-01-17    0.810988
2018-01-18    0.810988
2018-01-19    0.810988
2018-01-20    0.810988
2018-01-21    0.810988
2018-01-22    0.810988
2018-01-23    0.810988
2018-01-24    0.810988
2018-01-25    0.810988
2018-01-26    0.810988
2018-01-27    0.810988
2018-01-28    0.810988
2018-01-29    0.810988
2018-01-30    0.810988
2018-01-31    0.810988
t_range = pd.date_range("2018-01-01","2018-12-31")
print(t_range)
----------------------------------
['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
 '2018-01-06', '2018-01-07', '2018-01-08', '2018-01-09', '2018-01-10',
 ...
 '2018-12-22', '2018-12-23', '2018-12-24', '2018-12-25', '2018-12-26',
 '2018-12-27', '2018-12-28', '2018-12-29', '2018-12-30', '2018-12-31']
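The monthly means below cover all of 2018, so s1 has evidently been re-created over t_range; that cell is missing from these notes. A plausible sketch:

s1 = pd.Series(np.random.randn(len(t_range)), index=t_range)   # assumed re-definition of s1 over the full year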
s1_month = s1.resample('M')    # resample by month
s1_month = s1_month.mean()     # mean of each month
---------------------------------
2018-01-31    0.309961
2018-02-28   -0.116868
2018-03-31   -0.053147
2018-04-30   -0.053538
2018-05-31    0.026336
2018-06-30   -0.236869
2018-07-31    0.067183
2018-08-31    0.054198
2018-09-30   -0.170730
2018-10-31   -0.195967
2018-11-30    0.009427
2018-12-31    0.476446
# resample by hour; the data has no hourly values, so they have to be filled in
s1.resample("H").ffill().head()
-------------------------------------
2018-01-01 00:00:00    0.113076
2018-01-01 01:00:00    0.113076
2018-01-01 02:00:00    0.113076
2018-01-01 03:00:00    0.113076
2018-01-01 04:00:00    0.113076
Binning data:
import numpy as np
import pandas as pd
Binning is most useful when you are building histograms.
# create 20 scores between 25 and 100
score_list = np.random.randint(25,100,size=20)
bins = [0,59,70,80,100]   # grade boundaries: 0-59 fail, 60-70 pass, 71-80 good, 81-100 excellent
print(score_list)
-------------------------------
[86 75 57 66 37 83 61 92 61 36 62 96 76 27 31 77 47 52 68 56]
categories = pd.cut(score_list,bins)   # bin score_list according to bins
print(categories)
----------------------------
[(80, 100], (70, 80], (0, 59], (59, 70], (0, 59], ..., (70, 80], (0, 59], (0, 59], (59, 70], (0, 59]]
Length: 20
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
pd.value_counts(categories)   # count how many scores fall into each bin
--------------------
(0, 59]      8
(59, 70]     5
(80, 100]    4
(70, 80]     3
Binning with a DataFrame:
df = pd.DataFrame() df["score"] = score_list df.head() -----------------------
# generate random strings to use as student names
df['student'] = [pd.util.testing.rands(3) for i in range(20)]
df.head()
-------------------------------------
df['categories'] = pd.cut(df["score"],bins,labels=["fail","pass","good","excellent"])   # give each score range a label
df.head()
-------------------------
Grouping data:
import numpy as np
import pandas as pd
df = pd.read_clipboard()        # read the clipboard data
df.to_csv('city_weather.csv')   # save it as a csv file
df = pd.read_csv('city_weather.csv')
del df['Unnamed: 0']
df.head()
-------------------------------------
g = df.groupby(df['city'])   # group by city
g.groups
------------------------------------------
{'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
 'GZ': Int64Index([14, 15, 16, 17], dtype='int64'),
 'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'),
 'SZ': Int64Index([18, 19], dtype='int64')}
df_bj = g.get_group('BJ')   # get the BJ group
df_bj
-------------------------------
df_bj.mean()   # mean of each column in the BJ group
g.max()        # per-group maximum of each column
g.min()        # per-group minimum of each column
Aggregating data:
import numpy as np
import pandas as pd
df = pd.read_csv('city_weather.csv')
g = df.groupby('city')
del df['Unnamed: 0']
print(df)
g.groups
-----------------
          date city  temperature  wind
0   03/01/2016   BJ            8     5
1   17/01/2016   BJ           12     2
2   31/01/2016   BJ           19     2
3   14/02/2016   BJ           -3     3
4   28/02/2016   BJ           19     2
5   13/03/2016   BJ            5     3
6   27/03/2016   SH           -4     4
7   10/04/2016   SH           19     3
8   24/04/2016   SH           20     3
9   08/05/2016   SH           17     3
10  22/05/2016   SH            4     2
11  05/06/2016   SH          -10     4
12  19/06/2016   SH            0     5
13  03/07/2016   SH           -9     5
14  17/07/2016   GZ           10     2
15  31/07/2016   GZ           -1     5
16  14/08/2016   GZ            1     5
17  28/08/2016   GZ           25     4
18  11/09/2016   SZ           20     1
19  25/09/2016   SZ          -10     4
{'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
 'GZ': Int64Index([14, 15, 16, 17], dtype='int64'),
 'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'),
 'SZ': Int64Index([18, 19], dtype='int64')}
print(g.min())
print(g.agg('min'))   # aggregate the minimum of each group (same as g.min())
---------------------------------
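agg also accepts custom functions; a small sketch (not in the original notes) that aggregates each city's temperature spread:

print(g['temperature'].agg(lambda x: x.max() - x.min()))   # per-city max minus min temperature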
# group and aggregate on several keys
g_new = df.groupby(['city','wind'])
g_new.groups
g_new.get_group(('BJ',3))
--------------------------------------
{('BJ', 2): Int64Index([1, 2, 4], dtype='int64'),
 ('BJ', 3): Int64Index([3, 5], dtype='int64'),
 ('BJ', 5): Int64Index([0], dtype='int64'),
 ('GZ', 2): Int64Index([14], dtype='int64'),
 ('GZ', 4): Int64Index([17], dtype='int64'),
 ('GZ', 5): Int64Index([15, 16], dtype='int64'),
 ('SH', 2): Int64Index([10], dtype='int64'),
 ('SH', 3): Int64Index([7, 8, 9], dtype='int64'),
 ('SH', 4): Int64Index([6, 11], dtype='int64'),
 ('SH', 5): Int64Index([12, 13], dtype='int64'),
 ('SZ', 1): Int64Index([18], dtype='int64'),
 ('SZ', 4): Int64Index([19], dtype='int64')}
Pivot tables:
import numpy as np
import pandas as pd
df = pd.read_excel('sales-funnel.xlsx')   # read the data file
df
-------------------------------------------
# pivot table: with Name as the index, every numeric column is averaged per name (aggfunc defaults to mean)
pd.pivot_table(df,index='Name')
------------------------
# pivot table with the sum of every numeric column per Name
pd.pivot_table(df,index=['Name'],aggfunc='sum')
-------------------------
# data for each sales rep under each manager
pd.pivot_table(df,index=['Manager','Rep'])
--------------------------------
# for each manager's reps: how much of each Product was sold, with total Price and total Quantity
pd.pivot_table(df,index=['Manager','Rep'],
               values=['Price','Quantity'],columns='Product',
               fill_value=0,aggfunc='sum')   # fill_value replaces NaN in the table with 0
-----------------------------------------------
Grouping and pivot tables in practice:
import numpy as np
import pandas as pd
df = pd.read_csv('2008.csv')
df.head()
-------------------------------
# the 10 flights with the longest delays
# sort_values sorts ascending by default; sort by ArrDelay descending and take the first ten rows
df.sort_values('ArrDelay',ascending=False)[:10][['DepTime','FlightNum',"ArrTime","Origin","Dest","ArrDelay"]]
-------------------------------------------------------
# look at the ratio of cancelled to non-cancelled flights
print(df['Cancelled'].value_counts())
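A hedged follow-up (not in the original notes) that turns those counts into an actual proportion:

print(df['Cancelled'].value_counts(normalize=True))   # fraction of flights cancelled (1) vs not cancelled (0)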
Simple plotting with matplotlib - plot:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# magic command: figures appear automatically without calling show() after each plot
a = [1,2,3,4,5,6]
plt.plot(a)   # by default the index of a is x and the values of a are y
-------------------------------
a = [1,2,3,4,5,6]
b = [4,5,6,22,12,10]
plt.plot(a,b)   # a as x, b as y
-----------------------------
c = [10,8,6,4,2]
d = [1,8,3,2,9]
plt.plot(a,b,'--',c,d,'*')   # a vs b with a dashed line, c vs d with star markers
------------------------------
t = np.arange(0.0,2.0,0.1)
print(t)
s = np.sin(t*np.pi)
print(s)
----------------------------------
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.  1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9]
[ 0.00000000e+00  3.09016994e-01  5.87785252e-01  8.09016994e-01
  9.51056516e-01  1.00000000e+00  9.51056516e-01  8.09016994e-01
  5.87785252e-01  3.09016994e-01  1.22464680e-16 -3.09016994e-01
 -5.87785252e-01 -8.09016994e-01 -9.51056516e-01 -1.00000000e+00
 -9.51056516e-01 -8.09016994e-01 -5.87785252e-01 -3.09016994e-01]
plt.plot(t,s,'r--',label='sin(t*pi)')    # red dashed line with t as x and s as y; label sets the legend text
plt.plot(t*2,s,'b--',label='sin(2t*pi)')
plt.xlabel('X Axis')    # name the x axis
plt.ylabel('Y Axis')    # name the y axis
plt.title('Sin Wave')   # chart title
plt.legend()            # show the labels
----------------------------------
Drawing subplots with matplotlib - subplot:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
x = np.linspace(0.0,5.0)   # 50 evenly spaced numbers from 0 to 5.0
y1 = np.sin(np.pi*x)
y2 = np.cos(np.pi*x*2)
plt.plot(x,y1,'b--',label='sin(pi*x)')
plt.plot(x,y2,'r--',label='cos(pi*2x)')
plt.ylabel('y value')
plt.xlabel('x value')
plt.legend()
--------------------------------------------------
plt.subplot(2,2,1)   # draw in the first cell of a 2x2 grid; plt.subplot(221) does the same
plt.plot(x,y1,'b--')
plt.ylabel('y1')
plt.subplot(222)
plt.plot(x,y2,'r--')
plt.ylabel('y2')
plt.subplot(223)
plt.plot(x,y1,'c*')
plt.subplot(224)
plt.plot(x,y1,'k^')
-----------------------------------
plt.subplot(211)
plt.plot(x,y1,'b--')
plt.ylabel('y1')
plt.subplot(212)
plt.plot(x,y2,'ro')   # red dots
plt.ylabel('y2')
plt.xlabel('x')
-------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
s1 = pd.Series(np.random.randn(1000)).cumsum()   # cumsum is the cumulative sum: (((1+2)+3)+4)...
s2 = pd.Series(np.random.randn(1000)).cumsum()
s1.plot(kind='line',label='S1',title='This is Series')   # Series and DataFrame have their own plot method
s2.plot(label='S2',grid=True)                            # grid controls whether grid lines are shown
plt.legend()
-----------------------------------
# draw the same thing with plt's plot
figure,ax = plt.subplots(2,1)
ax[0].plot(s1,label='S1')
ax[1].plot(s2,label='S2')
plt.legend()
----------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.DataFrame(np.random.randint(1,10,40).reshape(10,4),
                  columns=['A','B','C','D'])   # 40 random integers between 1 and 9, reshaped to 10 rows x 4 columns
df
-----------------------------------------
# using the DataFrame's own plot
df.plot()
-------------------
# using plt's plot
plt.plot(df['A'],label='A')
plt.plot(df['B'],label='B')
plt.plot(df['C'],label='C')
plt.plot(df['D'],label='D')
plt.legend()
----------------------
Summary: in many situations the plot method built into DataFrame and Series is the more convenient option.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# histogram
s = pd.Series(np.random.randn(1000))
plt.hist(s,bins=20,rwidth=0.5,color='y')
# hist draws a histogram; bins sets 20 intervals, rwidth is the bar width, color is the bar colour
-----------------------------------------
# density plot
s.plot(kind='kde')
---------------------
Plotting with Seaborn:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
iris = pd.read_csv(r"iris.csv")
del iris['Unnamed: 0']
iris.head()   # sepal, petal and species columns
----------------------------------------
iris.Species.unique()   # deduplicate Species to see how many kinds of flower there are
----------------------------
['setosa', 'versicolor', 'virginica']
# use zip to assign a colour to each species
color_map = dict(zip(iris.Species.unique(),['blue','green','red']))
color_map
----------------------
{'setosa': 'blue', 'versicolor': 'green', 'virginica': 'red'}
# group iris and plot each group
for species,group in iris.groupby('Species'):   # loop over the data grouped by Species
    plt.scatter(group['Petal.Width'],group['Sepal.Width'],
                color=color_map[species],   # look up the colour for this species in color_map
                alpha=0.3,                  # transparency
                edgecolor='black',          # outline of each point
                label=species)
plt.legend(frameon=True,title='Species')    # frameon controls whether the legend has a border
plt.xlabel('Petal.Width')
plt.ylabel('Sepal.Width')
----------------------------
# the same plot with Seaborn
sns.lmplot('Petal.Width','Sepal.Width',iris,hue='Species',fit_reg=False)
# hue splits the data by Species and picks colours and labels automatically; fit_reg controls whether the regression line is drawn
----------------------------------------------
Seaborn density plots
s1 = pd.Series(np.random.randn(1000))
sns.distplot(s1,bins=20,hist=True,kde=True,rug=True)
# hist toggles the histogram, kde toggles the density curve, rug toggles the rug marks
-----------------------------
sns.heatmap(data,annot=True,fmt='d')   # heat map; annot shows the number in each cell, fmt sets its format ('d' for integers, '.2f' for two-decimal floats)
sns.barplot(x=..., y=...)              # seaborn bar chart: pass the data for the x axis and the y axis
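A concrete, hedged example of these two calls, reusing the city_weather.csv file saved in the grouping section; the pivot layout and column choices are assumptions, not part of the original notes:

import pandas as pd
import seaborn as sns

weather = pd.read_csv('city_weather.csv')
# heat map of mean temperature per city and wind level ('d' would fail on floats, so '.1f' is used here)
pivot = weather.pivot_table(index='city', columns='wind', values='temperature', aggfunc='mean')
sns.heatmap(pivot, annot=True, fmt='.1f')
# in a separate cell: bar chart of average temperature per city
sns.barplot(x='city', y='temperature', data=weather)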
Stock market analysis:
import numpy as np
import pandas as pd
import pandas_datareader as pdr   # module for fetching the data
import matplotlib.pylab as plt    # plotting
import seaborn as sns
from datetime import datetime
# magic command so figures appear without calling show()
%matplotlib inline
# fetch Alibaba and Amazon data starting from 2015-09-20
start = datetime(2015,9,20)
alibaba = pdr.get_data_yahoo('BABA',start=start)
amazon = pdr.get_data_yahoo('AMZN',start=start)
print(alibaba.head())
print(amazon.head())
--------------------------------------------------
# save the data
alibaba.to_csv('BABA.csv')
amazon.to_csv('AMZN.csv')
alibaba['Adj Close'].plot(legend=True)   # plot the trend of Alibaba's Adj Close
# Series and DataFrame both have a plot method; legend controls whether the label is shown
--------------------------------------------
# compare the Adj Close of the two stocks
alibaba['Adj Close'].plot(legend=True,label='alibaba Adj close')
amazon['Adj Close'].plot(legend=True,label='amazon Adj close')
-----------------------------------
# add a new column with each day's largest swing: High minus Low
alibaba['high-low'] = alibaba['High'] - alibaba['Low']
amazon['high-low'] = amazon['High'] - amazon['Low']
print(alibaba.head())
print(amazon.head())
-------------------------------
                 Open       High        Low      Close  Adj Close    Volume  \
Date
2015-09-21  65.379997  66.400002  62.959999  63.900002  63.900002  22355100
2015-09-22  62.939999  63.270000  61.580002  61.900002  61.900002  14897900
2015-09-23  61.959999  62.299999  59.680000  60.000000  60.000000  22684600
2015-09-24  59.419998  60.340000  58.209999  59.919998  59.919998  20645700
2015-09-25  60.630001  60.840000  58.919998  59.240002  59.240002  17009100

            high-low
Date
2015-09-21  3.440003
2015-09-22  1.689998
2015-09-23  2.619999
2015-09-24  2.130001
2015-09-25  1.920002

                  Open        High         Low       Close   Adj Close  \
Date
2015-09-21  544.330017  549.780029  539.590027  548.390015  548.390015
2015-09-22  539.710022  543.549988  532.659973  538.400024  538.400024
2015-09-23  538.299988  541.210022  534.000000  536.070007  536.070007
2015-09-24  530.549988  534.559998  522.869995  533.750000  533.750000
2015-09-25  542.570007  542.799988  521.400024  524.250000  524.250000

             Volume   high-low
Date
2015-09-21  3283300  10.190002
2015-09-22  3841700  10.890015
2015-09-23  2237600   7.210022
2015-09-24  3501000  11.690003
2015-09-25  4031000  21.399964
# plot the daily swings
alibaba['high-low'].plot(label='alibaba',legend=True)
amazon['high-low'].plot(label='amazon',legend=True)
---------------------------------------
# daily return: the change in adjusted close between consecutive trading days
alibaba['daily-return'] = alibaba['Adj Close'].pct_change()   # pct_change() computes the percentage change between adjacent rows
amazon['daily-return'] = amazon['Adj Close'].pct_change()
alibaba['daily-return'].plot(figsize=(10,4),legend=True,label='alibaba')   # figsize adjusts the figure size
amazon['daily-return'].plot(figsize=(10,4),legend=True,label='amazon')
----------------------------------------------------------------------------
alibaba['daily-return'].plot(kind='hist')   # histogram
sns.distplot(amazon['daily-return'].dropna(),bins=100,color='blue',
             label='amazon',rug=True)
# seaborn density plot; the first value of daily-return is NaN, so dropna() is needed
# rug adds the rug marks along the axis
top_tech_df = pd.read_csv('top5.csv',index_col='Date')   # read the file with Date as the index
top_tech_df.head()
-----------------------------------
top_tech_dr = top_tech_df.pct_change()   # daily change of each stock's Adj Close
top_tech_dr.dropna().head()
------------------------------------
top_tech_df.plot()
--------------------------
top_tech_df[['apple','facebook','microsoft']].plot()   # plot only these three companies
----------------------------
# pairplot draws a scatter plot of every column against each of the other columns
# the first row of top_tech_dr is all NaN, so it has to be dropped
sns.pairplot(top_tech_dr.dropna())
--------------------------------------