數據分析——pandas

簡介

1 import pandas as pd
2 
3 # 在數據挖掘前一個數據分析、篩選、清理的多功能工具
4 '''
5 pandas 能夠讀入excel、csv等文件;能夠建立Series序列,DataFrame表格,日期數組date_range
6 '''

數據類型

 

 1 # 將excel文件,csv文件讀取並轉換爲pandas的DataFrame
 2 # df_score = pd.read_csv()
 3 df_score = pd.read_excel('./score.xlsx')
 4 # df_score.values   #數據
 5 # df_score.columns  #列名
 6 # print df_score.describe() #計算表的各項數據,count,mean,std,中位數等
 7 
 8 # 建立一個默認索引從0開始的Series
 9 s = pd.Series([1, 2, 3, 4, 5, 6])
10 # 建立自定義索引的數組,索引由index指定,和前面數組依次對應
11 s = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f'], dtype=int)
12 # 使用字典建立一個DataFrame,字典的Key會自動成爲列名,一個Key默認對應一列數據
13 df1 = pd.DataFrame({'math': [1, 2, 3, 4, 5], 'physic': [5, 6, 7, 8, 9]}, index=['a', 'b', 'c', 'd', 'e'])
14 '''
15 # df1.values  數據
16 # df1.head(2)   前兩行數據
17 # df1.tail(2)   最後兩行數據
18 # df1.index     索引
19 # df1.columns   列名
20 '''
21 # 生成從20180101開始的時間序列,peroids是增長量,默認增長單位是天D,H小時,s秒
22 dates = pd.date_range('20180101', periods=10, freq='D')
23 # 建立使用時間索引的Series
24 # s = pd.Series(range(10),index=dates)
25 # 取出指定間隔的行數據
26 # s['2018-01-01':'2018-01-05']
27 # print dates

票房分析

 

 1 df_imdb = pd.read_csv('./IMDB.csv')  # load the IMDB CSV used by the examples below
 2 
 3 # print df_imdb
 4 # print df_imdb.columns
 5 # df_imdb['Title'].head(5)  # first five rows of the Title column
 6 # df_imdb['Title'].tail(3)
 7 # df_imdb.Title.head(3)     # attribute access, same as the [] form
 8 # df_imdb['Revenue (Millions)'].max()   # largest revenue value
 9 # df_imdb['Revenue (Millions)'].idxmax()    # index label of the largest revenue
10 # df_imdb[50:51]
11 # df_imdb[50:51]['Title']
12 # df_imdb[50:51]['Revenue (Millions)']  # row 50 only; the slice end 51 is excluded
13 # Rows 50-56 with both ends included; the first axis selects rows, the second columns
14 # df_imdb.loc[50:56,['Director','Year']]
15 # df_imdb[50:56].loc[:,['Director','Year']]
16 # Rows 1-4 and columns 2-3 by integer position (slice ends excluded), much like numpy
17 # df_imdb.iloc[1:5,2:4]
18 # Count how often each director appears in the Director column
19 # df_imdb['Director'].value_counts()
20 # Select the movies whose revenue exceeds 500 million dollars
21 # df_imdb[df_imdb['Revenue (Millions)']>500].Director
22 # df_imdb[df_imdb['Revenue (Millions)']>700]['Title']
23 # Find the movies whose Genre text contains the keyword Sci-Fi
24 # df_imdb[df_imdb['Genre'].str.contains('Sci-Fi')]
25 
26 # Fill missing values (NaN) with 0, or with whatever value the project calls for
27 # df_score.fillna(0)
28 # Drop rows that contain missing values (the default); axis=1 drops columns: df_score.dropna(axis=1)
29 # axis=0 drops rows, axis=1 drops columns
30 # df_score.dropna()
31 # Add a column avg to the DataFrame holding the mean of each row
32 # The assigned data needs one value per row; axis=1 computes across each row, axis=0 (default) down each column
33 # df_score['avg'] = df_score.mean(axis=1)
34 # Group by gender and sum the selected score columns
35 # df_score.iloc[:,4:7].groupby(u'性別').sum()
36 # df_score.loc[:,[u'音樂',u'性別']].groupby(u'性別').sum()
37 # Count rows per gender and plot them: 'bar' for a bar chart, 'pie' for a pie chart
38 # df_score[u'性別'].value_counts().plot(kind='bar')
39 # df_score[u'性別'].value_counts().plot(kind='pie')
40 # & combines conditions: math above 80 AND chemistry above 60
41 # df_score[(df_score[u'數學']>80) &(df_score[u'化學']>60) ]
42 
43 # Use a lambda with apply to extract the year or month from date strings
44 # apply calls the lambda once for every element of the Series
45 # datas = pd.Series(['20190901','20190902','20190903'])
46 # datas.apply(lambda x:x[0:4])
47 # datas.apply(lambda x:x[4:6])
48 
49 # Make a copy of the data
50 # df_copy = df.copy()
51 # df_copy['R_Sum'] = df['SibSp']+df['Parch']
52 
53 # Aggregate the math column (sum, mean, ...); each string must name an existing aggregation function
54 # df[u'數學'].agg(['sum','mean','max','std'])
55 
56 # Convert pandas objects (Series, DataFrame) to numpy arrays
57 # df[u'數學'].values
58 # df.loc[:,[u'數學',u'化學']].values
59 
60 # Sort by the values of a column; ascending by default, descending on request
61 # df[u'數學'].sort_values()
62 # Sort by index
63 # df[u'數學'].sort_index()
64 # df[u'數學'].sort_values(ascending=False)
65 # Add a column sum holding each row's total, then sort it in descending order
66 # df['sum'] = df.sum(axis=1)
67 # df[u'sum'].sort_values(ascending=False)
68 
69 
70 # Take the Embarked and Survived columns, group hierarchically by both (in that order) and count the rows per group
71 # r = df.loc[:,['Embarked','Survived']].groupby(['Embarked','Survived']).size()
72 # r.C
73 # r.C[1]
74 # r.Q
75 # r.Q[0]
76 # r.Q[1]
77 # r1 = df.loc[:,['Embarked','Survived']].groupby(['Survived','Embarked']).size()
78 # r2 = df.loc[:,['Embarked','Survived']].groupby('Embarked').size()
79 # r3 = df.loc[:,['Embarked','Survived']].groupby('Survived').size()

運行結果

 

"""
上面的運行結果 r
Embarked  Survived
C         0            75
          1            93
Q         0            47
          1            30
S         0           427
          1           217
dtype: int64

r.C結果
Survived
0    75
1    93
dtype: int64

r.C[1]結果
93

r1結果
Survived  Embarked
0         C            75
          Q            47
          S           427
1         C            93
          Q            30
          S           217
dtype: int64

r2結果
Embarked
C    168
Q     77
S    644
dtype: int64

r3結果
Survived
0    549
1    342
dtype: int64
"""

  

標註:

'''
1.axis指定計算方向:axis=0(默認)按列計算(沿行的方向聚合),axis=1按行計算(沿列的方向聚合)
2.DataFrame篩選一行或一列時會轉化爲Series類型,能夠直接後面加[數字]直接進行選擇;但Series沒有列,不能像DataFrame那樣按列名分組(如groupby('列名'))
3.篩選出來的數據的索引還是原索引,不會從新排列新索引
'''

  

統計拍片數前10的導演各自執導電影的總票房

 1 def piaofang():
 2     director10 = df_imdb['Director'].value_counts().head(10)
 3     # print director10.index[0]
 4     revenues = 0
 5     for d in director10.index:
 6         print df_imdb[df_imdb['Director'] == d]['Revenue (Millions)'].sum()
 7 
 8 # piaofang()
 9 
10 # df_imdb[df_imdb['Director']=='']['Revenue (Millions)'].sum()

 

泰坦尼克號乘客分析

特徵

'''
PassengerId:乘客的惟一標誌
Survived:1獲救,0死亡
Pclass:座艙等級 1最好(頭等艙),3最差(三等艙)
Name,Sex,Age,
SibSp:同船的兄弟姐妹及配偶人數
Parch:同船的父母及子女人數
Ticket,
Fare:票價或消費
Cabin:座艙號
Embarked:從哪一個港口登船
891
'''

  

導入類庫

1 import numpy as np
2 import matplotlib.pyplot as pt
3 import pandas as pd

準備數據

 

# Load the passenger table used by the Titanic analysis below.
titanic = pd.read_csv('./Titanic.csv')

# fillna returns a new object instead of modifying in place, so the result must
# be assigned back. Only the Age column is filled: the replacement value is the
# (truncated) mean age, which would be meaningless in any other column.
titanic[u'Age'] = titanic[u'Age'].fillna(int(titanic[u'Age'].mean()))

測試代碼

 

 1 # print titanic['Age']
 2 
 3 # print titanic[u'Age'].mean()
 4 # print titanic.loc[:,u'Survived'].value_counts()     #存活比例
 5 # print titanic.loc[:,u'Survived'].count()            #總人數
 6 
 7 # print titanic.loc[:, u'Sex'].value_counts()             #男女分類
 8 # print titanic[titanic[u'Sex'] == u'male']['Survived'].value_counts()    #男性生死分類
 9 
10 # print titanic.columns
11 # print titanic[titanic[u'Age'] <= 18][u'Survived'].value_counts()
12 # print titanic[(titanic[u'Age'] > 18) & (titanic[u'Age'] < 60)][u'Survived'].value_counts()
13 # print titanic[titanic[u'Age'] >= 60][u'Survived'].value_counts()
14 
15 # print titanic[u'Fare']
16 # print titanic[u'Fare'].max()          #貧富差距
17 # print titanic[u'Fare'].min()
18 
19 # print titanic[u'Pclass'].value_counts()
20 # print titanic[u'Pclass'].value_counts()[1]
21 # print titanic[u'Pclass'].value_counts()[3]              #座艙
22 # print titanic[titanic[u'Pclass'] == 1]['Survived'].value_counts()
23 # print titanic[titanic[u'Pclass'] == 3]['Survived'].value_counts()
24 
25 # print titanic[u'SibSp'].value_counts()
26 # print titanic[u'Parch'].value_counts()
27 
28 # print titanic[u'Embarked'].value_counts()

案例源碼

 

  1 class Titanic(object):
  2     def __init__(self):
  3         self.data = titanic
  4 
  5     # 1.存活率是多少
  6     def rate_survive(self):
  7         survived = self.data.loc[:, 'Survived'].value_counts()[1]
  8         death = self.data.loc[:, 'Survived'].value_counts()[0]
  9         rate = float(survived) / (float(death) + float(survived))
 10         print '總人數:{},存活人數:{},死亡人數:{}'.format(survived + death, survived, death)
 11         return u'存活率:' + '%.2f' % rate
 12 
 13     # 2.哪一個年齡段存活率最高
 14     def max_survive(self):
 15         age18_survived = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[1]
 16         age18_death = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[0]
 17         age18_rate = float(age18_survived) / (float(age18_survived) + float(age18_death))
 18 
 19         age1860_survived = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[1]
 20         age1860_death = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[0]
 21         age1860_rate = float(age1860_survived) / (float(age1860_survived) + float(age1860_death))
 22 
 23         age60_survived = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[1]
 24         age60_death = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[0]
 25         age60_rate = float(age60_survived) / (float(age60_survived) + float(age60_death))
 26 
 27         rate = [age18_rate, age60_rate, age1860_rate]
 28         age_data = ['18歲如下', '18-60歲', '60歲以上']
 29         max_rate = max(rate)
 30         age_range = age_data[rate.index(max(rate))]
 31         return '存活率最高的年齡段是{},存活率爲{}'.format(age_range, max_rate)
 32 
 33     # 3.女性存活率是否高於男性
 34     def than_survive(self):
 35         male_survied = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[1]
 36         male_death = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[0]
 37         rate_male = float(male_survied) / (float(male_survied) + float(male_death))
 38         print '男性共有{}人,存活{}人,死亡{}人'.format(male_death + male_survied, male_survied, male_death)
 39         female_survied = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[1]
 40         female_death = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[0]
 41         rate_female = float(female_survied) / (float(female_survied) + float(female_death))
 42         print '女性共有{}人,存活{}人,死亡{}人'.format(female_death + female_survied, female_survied, female_death)
 43         if rate_male > rate_female:
 44             return u'男性存活率更高,存活率爲:%.2f' % rate_male
 45         else:
 46             return u'女性存活率更高,存活率爲:%.2f' % rate_female
 47 
 48     # 4.船上是否出現貧富差距
 49     def poor_wealth(self):
 50         max_wealth = self.data[u'Fare'].max()
 51         max_poor = self.data[u'Fare'].min()
 52         if max_wealth - max_poor > 500:
 53             return '船上乘客最多消費了{},最少消費了{},存在貧富差距'.format(max_wealth, max_poor)
 54         else:
 55             return '船上乘客最多消費了{},最少消費了{},不存在貧富差距'.format(max_wealth, max_poor)
 56 
 57     # 5.頭等艙乘客的存活率是否高於經濟艙
 58     def pclass_survive(self):
 59         pclass1_survived = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[1]
 60         pclass1_death = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[0]
 61         pclass1_rate = float(pclass1_survived) / (float(pclass1_survived) + float(pclass1_death))
 62 
 63         pclass3_survived = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[1]
 64         pclass3_death = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[0]
 65         pclass3_rate = float(pclass3_survived) / (float(pclass3_survived) + float(pclass3_death))
 66 
 67         if pclass3_rate > pclass1_rate:
 68             return '頭等艙乘客存活率更高,存活率爲{}'.format(pclass3_rate)
 69         else:
 70             return '經濟艙乘客存活率更高,存活率爲{}'.format(pclass1_rate)
 71 
 72     # 6.有親屬在船上乘客比率,有親屬是否會影響存活率
 73     def family_survive(self):
 74         has_family = self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'PassengerId'].count()
 75         no_family = self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'PassengerId'].count()
 76         rate_family = float(has_family) / (float(has_family) + float(no_family))
 77 
 78         has_family_survived = \
 79             self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[1]
 80         has_family_death = \
 81             self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[0]
 82         has_family_rate = float(has_family_survived) / (float(has_family_survived) + float(has_family_death))
 83 
 84         no_family_survived = \
 85             self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[1]
 86         no_family_death = \
 87             self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[0]
 88         no_family_rate = float(no_family_survived) / (float(no_family_survived) + float(no_family_death))
 89 
 90         print '船上乘客中有親屬也在船上的有{}人,無親屬在船上的有{}人,有親屬在船上的乘客的比率爲{}'.format(has_family, no_family, rate_family)
 91         if has_family_rate > no_family_rate:
 92             return '有親屬在船上的乘客存活率更高,存活率爲{}'.format(has_family_rate)
 93         else:
 94             return '無親屬在船上的乘客存活率更高,存活率爲{}'.format(no_family_rate)
 95 
 96     # 7.從哪一個港口登船是否影響獲救
 97     def emarked_survive(self):
 98         Embarked_S_survived = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[1]
 99         Embarked_S_death = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[0]
100         Embarked_S_rate = float(Embarked_S_survived) / (float(Embarked_S_survived) + float(Embarked_S_death))
101 
102         Embarked_C_survived = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[1]
103         Embarked_C_death = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[0]
104         Embarked_C_rate = float(Embarked_C_survived) / (float(Embarked_C_survived) + float(Embarked_C_death))
105 
106         Embarked_Q_survived = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[1]
107         Embarked_Q_death = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[0]
108         Embarked_Q_rate = float(Embarked_Q_survived) / (float(Embarked_Q_survived) + float(Embarked_Q_death))
109 
110         embarked = ['S港口', 'C港口', 'Q港口']
111         rate = [Embarked_S_rate, Embarked_C_rate, Embarked_Q_rate]
112         max_rate = max(rate)
113         return '{}存活率最大,爲{}'.format(embarked[rate.index(max_rate)], max_rate)
114 
115     # 8.不一樣年齡段女性的獲救率
116     def female_survive(self):
117         female18_survived = \
118             self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1]
119         female18_death = \
120             self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0]
121         female18_rate = float(female18_survived) / (float(female18_survived) + float(female18_death))
122 
123         female1850_survived = \
124             self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][
125                 u'Survived'].value_counts()[1]
126         female1850_death = \
127             self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][
128                 u'Survived'].value_counts()[0]
129         female1850_rate = float(female1850_survived) / (float(female1850_survived) + float(female1850_death))
130 
131         female50_survived = \
132             self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1]
133         female50_death = \
134             self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0]
135         female50_rate = float(female50_survived) / (float(female50_survived) + float(female50_death))
136         
137         return '18歲如下女性存活率:{},18-50歲女性存活率:{},50歲以上女性存活率:{}'.format(female18_rate, female1850_rate, female50_rate)
138 
139 
if __name__ == '__main__':
    # Build the analyser on the module-level passenger table and print one
    # report; uncomment any of the other lines to print that report instead.
    # print(x) with a single argument gives the same output on Python 2 and 3.
    tt = Titanic()
    # print(tt.rate_survive())
    # print(tt.than_survive())
    # print(tt.max_survive())
    # print(tt.poor_wealth())
    # print(tt.pclass_survive())
    # print(tt.family_survive())
    # print(tt.emarked_survive())
    print(tt.female_survive())

 

DATA-->INFORMATION-->KNOWLEDGE-->WISDOM

數據-->信息-->知識-->智慧

爬蟲-->數據庫-->數據分析-->機器學習

  • 信息:經過某種方式組織和處理數據,分析數據間的關係,數據就有了意義
  • 知識:若是說數據是一個事實的集合,從中能夠得出關於事實的結論。那麼知識(Knowledge)就是信息的集合,它使信息變得有用。知識是對信息的應用,是一個對信息判斷和確認的過程,這個過程結合了經驗、上下文、詮釋和檢討。知識能夠回答「如何?」的問題,能夠幫助咱們建模和仿真
  • 智慧:智慧能夠簡單的概括爲作正確判斷和決定的能力,包括對知識的最佳使用。智慧能夠回答「爲何」的問題。回到前面的例子,根據故障對客戶的業務影響能夠識別改進點
相關文章
相關標籤/搜索