1 import pandas as pd 2 3 # 在數據挖掘前一個數據分析、篩選、清理的多功能工具 4 ''' 5 pandas 能夠讀入excel、csv等文件;能夠建立Series序列,DataFrame表格,日期數組data_range 6 '''
1 # 將excel文件,csv文件讀取並轉換爲pandas的DataFrame 2 # df_score = pd.read_csv() 3 df_score = pd.read_excel('./score.xlsx') 4 # df_score.values #數據 5 # df_score.columns #列名 6 # print df_score.describe() #計算表的各項數據,count,mean,std,中位數等 7 8 # 建立一個默認索引從0開始的Series 9 s = pd.Series([1, 2, 3, 4, 5, 6]) 10 # 建立自定義索引的數組,索引由index指定,和前面數組依次對應 11 s = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f'], dtype=int) 12 # 使用字典建立一個DataFrame,字典的Key會自動成爲列名,一個Key默認對應一列數據 13 df1 = pd.DataFrame({'math': [1, 2, 3, 4, 5], 'physic': [5, 6, 7, 8, 9]}, index=['a', 'b', 'c', 'd', 'e']) 14 ''' 15 # df1.values 數據 16 # df1.head(2) 前兩行數據 17 # df1.tail(2) 最後兩行數據 18 # df1.index 索引 19 # df1.columns 列名 20 ''' 21 # 生成從20180101開始的時間序列,peroids是增長量,默認增長單位是天D,H小時,s秒 22 dates = pd.date_range('20180101', periods=10, freq='D') 23 # 建立使用時間索引的Series 24 # s = pd.Series(range(10),index=dates) 25 # 取出指定間隔的行數據 26 # s['2018-01-01':'2018-01-05'] 27 # print dates
1 df_imdb = pd.read_csv('./IMDB.csv') 2 3 # print df_imdb 4 # print df_imdb.columns 5 # df_imdb['Title'].head(5) #選出Title列的前五行 6 # df_imdb['Title'].tail(3) 7 # df_imdb.Title.head(3) #同[]的形式 8 # df_imdb['Revenue (Millions)'].max() #最大票房 9 # df_imdb['Revenue (Millions)'].idxmax() #最大票房的索引 10 # df_imdb[50:51] 11 # df_imdb[50:51]['Title'] 12 # df_imdb[50:51]['Revenue (Millions)'] #取出50行,不包括51行 13 # 取出50-56行,收尾都包含,第一維度是行,第二維度是列 14 # df_imdb.loc[50:56,['Director','Year']] 15 # df_imdb[50:56].loc[:,'Director','Year'] 16 # 取出1-5行(不包含第5行),2-4(不包含第4列)列的數據,使用整數索引操做,與numpy用法相似 17 # df_imdb.iloc[1:5,2:4] 18 # 統計Director列中不一樣導演出現的次數 19 # df_imdb['Director'].value_counts() 20 # 將票房大於5億美圓的電影選出來 21 # df_imdb[df_imdb['Revenue (Millions)']>500].Director 22 # df_imdb[df_imdb['Revenue (Millions)']>700]['Title'] 23 # 將電影風格描述中含有Sci-Fi(科幻) 關鍵字的找出 24 # df_imdb[df_imdb['Genre'].str.contains('Sci-Fi')] 25 26 # 將缺失數據(NaN)填充爲0,也能夠本身根據項目需求指定其餘數據 27 # df_score.fillna(0) 28 # 將缺失數據的行移除(默認操做,可使用axis=1指定刪除列df_score.dropna(axis=1)) 29 # 0刪除行,1刪除列 30 # df_score.dropna() 31 # 在DataFram中增長一列平均值avg,計算當前DataFram中每行的平均值做爲avg的數據 32 # 先後賦值數據的行數要對應,axis=1表示按行計算,axis=0(默認值),表示按列計算 33 # df_score['avg'] = df_score.mean(axis=1) 34 # 按照性別分組並求和指定成績 35 # df_score.iloc[:,4:7].groupby(u'性別').sum() 36 # df_score.loc[:,[u'音樂',u'性別']].groupby(u'性別').sum() 37 # 按照男女分組並繪圖,bar柱狀圖,pie餅狀圖 38 # df_score[u'性別'].value_counts().plot(kind='bar') 39 # df_score[u'性別'].value_counts().plot(kind='pie') 40 # & 數學大於80且化學大於60 41 # df_score[(df_score[u'數學']>80) &(df_score[u'化學']>60) ] 42 43 # 使用lambda,配合apply方法將日期中的指定年份或月份等提取出來 44 # apply函數會將lambda一次做用到數據集的每一個元素 45 # datas = pd.Series(['20190901','20190902','20190903']) 46 # datas.apply(lambda x:x[0:4]) 47 # datas.apply(lambda x:x[4:6]) 48 49 # 建立一個數據的副本 50 # df_copy = df.copy() 51 # df_copy['R_Sum'] = df['SibSp']+df['Parch'] 52 53 # 計算數學列的總和、平均值等,裏面的字符串必須有同名函數 54 # df[u'數學'].agg(['sum','mean','max','std']) 55 56 # pandas(Series、DataFrame)類型轉換爲numpy(array)類型 57 # df[u'數學'].values 58 # df.loc[:,[u'數學',u'化學']].values 59 60 # 按照指定列的值排序,可指定正序倒序,默認正序 61 # df[u'數學'].sort_values() 62 # 按照索引排序 63 # df[u'數學'].sort_index() 64 # df[u'數學'].sort_values(ascending=False) 65 # 添加新列sum,值爲每行總和,並倒序排列 66 # df['sum'] = df.sum(axis=1) 67 # df[u'sum'].sort_values(ascending=False) 68 69 70 # 取出Embarked,Survived字段,按照兩個字段順序作層次分組,而後作計算總和 71 # r = df.loc[:,['Embarked','Survived']].groupby(['Embarked','Survived']).size() 72 # r.C 73 # r.C[1] 74 # r.Q 75 # r.Q[0] 76 # r.Q[1] 77 # r1 = df.loc[:,['Embarked','Survived']].groupby(['Survived','Embarked']).size() 78 # r2 = df.loc[:,['Embarked','Survived']].groupby('Embarked').size() 79 # r3 = df.loc[:,['Embarked','Survived']].groupby('Survived').size()
""" 上面的運行結果 r Embarked Survived C 0 75 1 93 Q 0 47 1 30 S 0 427 1 217 dtype: int64 r.C結果 Survived 0 75 1 93 dtype: int64 r.C[1]結果 93 r1結果 Survived Embarked 0 C 75 Q 47 S 427 1 C 93 Q 30 S 217 dtype: int64 r2結果 Embarked C 168 Q 77 S 644 dtype: int64 r3結果 Survived 0 549 1 342 dtype: int64 """
''' 1.axis轉換行列 2.DataFrame篩選一行或一列時會轉化爲Series類型,能夠直接後面加[數字]直接進行選擇,但Series不能使用DataFrame的方法(groupby等) 3.篩選出來的數據的索引還是原索引,不會從新排列新索引 '''
1 def piaofang(): 2 director10 = df_imdb['Director'].value_counts().head(10) 3 # print director10.index[0] 4 revenues = 0 5 for d in director10.index: 6 print df_imdb[df_imdb['Director'] == d]['Revenue (Millions)'].sum() 7 8 # piaofang() 9 10 # df_imdb[df_imdb['Director']=='']['Revenue (Millions)'].sum()
''' PassengerId:乘客的惟一標誌 Survived:1獲救,0死亡 Pclass:座艙等級 3最好,1最差 Name,Sex,Age, SibSp:船上有沒有兄弟姐妹 Parch:父母等直系親屬是否在船上 Ticket, Fare:票價或消費 Cabin:座艙號 Embarked:從哪一個港口登船 891 '''
1 import numpy as np 2 import matplotlib.pyplot as pt 3 import pandas as pd
1 titanic = pd.read_csv('./Titanic.csv') 2 3 titanic.fillna(int(titanic[u'Age'].mean()))
1 # print titanic['Age'] 2 3 # print titanic[u'Age'].mean() 4 # print titanic.loc[:,u'Survived'].value_counts() #存活比例 5 # print titanic.loc[:,u'Survived'].count() #總人數 6 7 # print titanic.loc[:, u'Sex'].value_counts() #男女分類 8 # print titanic[titanic[u'Sex'] == u'male']['Survived'].value_counts() #男性生死分類 9 10 # print titanic.columns 11 # print titanic[titanic[u'Age'] <= 18][u'Survived'].value_counts() 12 # print titanic[(titanic[u'Age'] > 18) & (titanic[u'Age'] < 60)][u'Survived'].value_counts() 13 # print titanic[titanic[u'Age'] >= 60][u'Survived'].value_counts() 14 15 # print titanic[u'Fare'] 16 # print titanic[u'Fare'].max() #貧富差距 17 # print titanic[u'Fare'].min() 18 19 # print titanic[u'Pclass'].value_counts() 20 # print titanic[u'Pclass'].value_counts()[1] 21 # print titanic[u'Pclass'].value_counts()[3] #座艙 22 # print titanic[titanic[u'Pclass'] == 1]['Survived'].value_counts() 23 # print titanic[titanic[u'Pclass'] == 3]['Survived'].value_counts() 24 25 # print titanic[u'SibSp'].value_counts() 26 # print titanic[u'Parch'].value_counts() 27 28 # print titanic[u'Embarked'].value_counts()
1 class Titanic(object): 2 def __init__(self): 3 self.data = titanic 4 5 # 1.存活率是多少 6 def rate_survive(self): 7 survived = self.data.loc[:, 'Survived'].value_counts()[1] 8 death = self.data.loc[:, 'Survived'].value_counts()[0] 9 rate = float(survived) / (float(death) + float(survived)) 10 print '總人數:{},存活人數:{},死亡人數:{}'.format(survived + death, survived, death) 11 return u'存活率:' + '%.2f' % rate 12 13 # 2.哪一個年齡段存活率最高 14 def max_survive(self): 15 age18_survived = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[1] 16 age18_death = self.data[self.data[u'Age'] <= 18][u'Survived'].value_counts()[0] 17 age18_rate = float(age18_survived) / (float(age18_survived) + float(age18_death)) 18 19 age1860_survived = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[1] 20 age1860_death = self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 60)][u'Survived'].value_counts()[0] 21 age1860_rate = float(age1860_survived) / (float(age1860_survived) + float(age1860_death)) 22 23 age60_survived = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[1] 24 age60_death = self.data[self.data[u'Age'] >= 60][u'Survived'].value_counts()[0] 25 age60_rate = float(age60_survived) / (float(age60_survived) + float(age60_death)) 26 27 rate = [age18_rate, age60_rate, age1860_rate] 28 age_data = ['18歲如下', '18-60歲', '60歲以上'] 29 max_rate = max(rate) 30 age_range = age_data[rate.index(max(rate))] 31 return '存活率最高的年齡段是{},存活率爲{}'.format(age_range, max_rate) 32 33 # 3.女性存活率是否高於男性 34 def than_survive(self): 35 male_survied = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[1] 36 male_death = self.data[self.data[u'Sex'] == u'male'][u'Survived'].value_counts()[0] 37 rate_male = float(male_survied) / (float(male_survied) + float(male_death)) 38 print '男性共有{}人,存活{}人,死亡{}人'.format(male_death + male_survied, male_survied, male_death) 39 female_survied = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[1] 40 female_death = self.data[self.data[u'Sex'] == u'female'][u'Survived'].value_counts()[0] 41 rate_female = float(female_survied) / (float(female_survied) + float(female_death)) 42 print '女性共有{}人,存活{}人,死亡{}人'.format(female_death + female_survied, female_survied, female_death) 43 if rate_male > rate_female: 44 return u'男性存活率更高,存活率爲:%.2f' % rate_male 45 else: 46 return u'女性存活率更高,存活率爲:%.2f' % rate_female 47 48 # 4.船上是否出現貧富差距 49 def poor_wealth(self): 50 max_wealth = self.data[u'Fare'].max() 51 max_poor = self.data[u'Fare'].min() 52 if max_wealth - max_poor > 500: 53 return '船上乘客最多消費了{},最少消費了{},存在貧富差距'.format(max_wealth, max_poor) 54 else: 55 return '船上乘客最多消費了{},最少消費了{},不存在貧富差距'.format(max_wealth, max_poor) 56 57 # 5.頭等艙乘客的存活率是否高於經濟艙 58 def pclass_survive(self): 59 pclass1_survived = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[1] 60 pclass1_death = self.data[self.data[u'Pclass'] == 1]['Survived'].value_counts()[0] 61 pclass1_rate = float(pclass1_survived) / (float(pclass1_survived) + float(pclass1_death)) 62 63 pclass3_survived = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[1] 64 pclass3_death = self.data[self.data[u'Pclass'] == 3]['Survived'].value_counts()[0] 65 pclass3_rate = float(pclass3_survived) / (float(pclass3_survived) + float(pclass3_death)) 66 67 if pclass3_rate > pclass1_rate: 68 return '頭等艙乘客存活率更高,存活率爲{}'.format(pclass3_rate) 69 else: 70 return '經濟艙乘客存活率更高,存活率爲{}'.format(pclass1_rate) 71 72 # 6.有親屬在船上乘客比率,有親屬是否會影響存活率 73 def family_survive(self): 74 has_family = self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'PassengerId'].count() 75 no_family = self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'PassengerId'].count() 76 rate_family = float(has_family) / (float(has_family) + float(no_family)) 77 78 has_family_survived = \ 79 self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[1] 80 has_family_death = \ 81 self.data[(self.data[u'Parch'] != 0) | (self.data[u'SibSp'] != 0)][u'Survived'].value_counts()[0] 82 has_family_rate = float(has_family_survived) / (float(has_family_survived) + float(has_family_death)) 83 84 no_family_survived = \ 85 self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[1] 86 no_family_death = \ 87 self.data[(self.data[u'Parch'] == 0) & (self.data[u'SibSp'] == 0)][u'Survived'].value_counts()[0] 88 no_family_rate = float(no_family_survived) / (float(no_family_survived) + float(no_family_death)) 89 90 print '船上乘客中有親屬也在船上的有{}人,無親屬在船上的有{}人,有親屬在船上的乘客的比率爲{}'.format(has_family, no_family, rate_family) 91 if has_family_rate > no_family_rate: 92 return '有親屬在船上的乘客存活率更高,存活率爲{}'.format(has_family_rate) 93 else: 94 return '無親屬在船上的乘客存活率更高,存活率爲{}'.format(no_family_rate) 95 96 # 7.從哪一個港口登船是否影響獲救 97 def emarked_survive(self): 98 Embarked_S_survived = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[1] 99 Embarked_S_death = self.data[self.data[u'Embarked'] == 'S'][u'Survived'].value_counts()[0] 100 Embarked_S_rate = float(Embarked_S_survived) / (float(Embarked_S_survived) + float(Embarked_S_death)) 101 102 Embarked_C_survived = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[1] 103 Embarked_C_death = self.data[self.data[u'Embarked'] == 'C'][u'Survived'].value_counts()[0] 104 Embarked_C_rate = float(Embarked_C_survived) / (float(Embarked_C_survived) + float(Embarked_C_death)) 105 106 Embarked_Q_survived = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[1] 107 Embarked_Q_death = self.data[self.data[u'Embarked'] == 'Q'][u'Survived'].value_counts()[0] 108 Embarked_Q_rate = float(Embarked_Q_survived) / (float(Embarked_Q_survived) + float(Embarked_Q_death)) 109 110 embarked = ['S港口', 'C港口', 'Q港口'] 111 rate = [Embarked_S_rate, Embarked_C_rate, Embarked_Q_rate] 112 max_rate = max(rate) 113 return '{}存活率最大,爲{}'.format(embarked[rate.index(max_rate)], max_rate) 114 115 # 8.不一樣年齡段女性的獲救率 116 def female_survive(self): 117 female18_survived = \ 118 self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1] 119 female18_death = \ 120 self.data[(self.data[u'Age'] <= 18) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0] 121 female18_rate = float(female18_survived) / (float(female18_survived) + float(female18_death)) 122 123 female1850_survived = \ 124 self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][ 125 u'Survived'].value_counts()[1] 126 female1850_death = \ 127 self.data[(self.data[u'Age'] > 18) & (self.data[u'Age'] < 50) & (self.data[u'Sex'] == u'female')][ 128 u'Survived'].value_counts()[0] 129 female1850_rate = float(female1850_survived) / (float(female1850_survived) + float(female1850_death)) 130 131 female50_survived = \ 132 self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[1] 133 female50_death = \ 134 self.data[(self.data[u'Age'] >= 50) & (self.data[u'Sex'] == u'female')][u'Survived'].value_counts()[0] 135 female50_rate = float(female50_survived) / (float(female50_survived) + float(female50_death)) 136 137 return '18歲如下女性存活率:{},18-50歲女性存活率:{},50歲以上女性存活率:{}'.format(female18_rate, female1850_rate, female50_rate) 138 139 140 if __name__ == '__main__': 141 tt = Titanic() 142 # print tt.rate_survive() 143 # print tt.than_survive() 144 # print tt.max_survive() 145 # print tt.poor_wealth() 146 # print tt.pclass_survive() 147 # print tt.family_survive() 148 # print tt.emarked_survive() 149 print tt.female_survive()
DATA-->INFOMATION-->KNOWLEDGE-->WISDOMpython
數據-->信息-->知識-->智慧數據庫
爬蟲-->數據庫-->數據分析-->機器學習數組