Titanic 案例是 Kaggle 入門案例,鏈接地址:https://www.kaggle.com/c/titanic 。如下是摘自官網上的描述信息:
加載訓練數據
# Read the Titanic training CSV into the DataFrame used by every later step.
data_train = pd.read_csv(filepath_or_buffer="./input/train.csv")
預覽數據
# Preview the first rows of the training DataFrame.
# NOTE(review): the result is not assigned, so this only shows output in an
# interactive/notebook session -- confirm that is the intended environment.
data_train.head()
訓練集數據說明:
查看數據集信息
# Print the summary that DataFrame.info() reports for the training data.
data_train.info()
查看有缺失值的列
# List the names of the columns that contain at least one missing value.
# (Fixes the `ata_train` typo, which raised NameError.)
data_train.columns[data_train.isnull().any()].tolist()
計算缺失數
# Count the missing entries in the Age, Cabin and Embarked columns.
# The statements were collapsed onto a single line (a SyntaxError); they are
# restored to one statement per line.
age_null_count = data_train.Age.isnull().sum()
cabin_null_count = data_train.Cabin.isnull().sum()
embarked_null_count = data_train.Embarked.isnull().sum()

# Report each count.
print('Age列缺失:%s' % age_null_count)
print('Cabin列缺失:%s' % cabin_null_count)
print('Embarked列缺失:%s' % embarked_null_count)
Age列缺失值
使用Age列中位數填充缺失值
# Fill missing ages with the column median.
# Series.fillna returns a new Series rather than modifying in place, so the
# result must be assigned back for the fill to persist in data_train.
data_train["Age"] = data_train["Age"].fillna(data_train["Age"].median())
Cabin列缺失值
Cabin列數據缺失條目較多,計算Survived列與Cabin列數據關係
# Compare survival counts for passengers with and without a recorded cabin.
# Each original line held two statements with no separator (a SyntaxError);
# they are split so the block actually runs.

# Passengers whose Cabin value is present.
Survived_cabin = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()
print(Survived_cabin)

# Passengers whose Cabin value is missing.
Survived_nocabin = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()
print(Survived_nocabin)
可以發現有Cabin信息的乘客獲救概率更大。將Cabin列數據作爲一個分類標籤處理。
Embarked列缺失值
使用Embarked列衆數填充缺失值
# Fill missing embarkation ports with the most frequent port.
# Series.mode() returns a Series (there can be ties), and passing a Series to
# fillna aligns by index instead of filling every gap, so take the first mode
# as a scalar. fillna also returns a new Series, so assign the result back.
embarked_mode = data_train["Embarked"].mode().iloc[0]
data_train["Embarked"] = data_train["Embarked"].fillna(embarked_mode)
獲救人數狀況
# Plot how many passengers did / did not survive.
# The original line began with '#', so the whole collapsed line was a comment
# and none of the plotting statements ran; they are split out here.
# value_counts() orders by frequency, while the tick labels below are assigned
# by position (0 -> not survived, 1 -> survived); sort_index() makes the bar
# order follow the Survived value so the labels cannot end up swapped.
data_train.Survived.value_counts().sort_index().plot(kind='bar')
plt.title("獲救狀況")
plt.xticks([0,1], ["未獲救","獲救"], rotation=0)
plt.ylabel("人數")
各等級的乘客年齡分佈
# Plot the age density (KDE) for each passenger class.
# The statements were collapsed onto one line (a SyntaxError). The three
# per-class plots are drawn in class order 1, 2, 3 so that they match the
# legend entries below.
for pclass in (1, 2, 3):
    data_train.Age[data_train.Pclass == pclass].plot(kind='kde')
plt.xlabel("年齡")
plt.ylabel("密度")
plt.title("各等級的乘客年齡分佈")
plt.legend(('一等艙', '二等艙','三等艙'),loc='best')
各乘客等級的獲救狀況
# Stacked bar chart of survivors / non-survivors per passenger class.
# The statements were collapsed onto one line (a SyntaxError); they are
# restored to one statement per line.
Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()

# Building the frame from the two Series aligns them on the Pclass value.
df = pd.DataFrame({'獲救':Survived_1, '未獲救':Survived_0})
df.plot(kind='bar', stacked=True)
plt.title("船艙等級的獲救狀況")
plt.xlabel("船艙等級")
plt.ylabel("人數")
plt.xticks(rotation=0)
繪製登船口岸上船人數
# Bar chart of the number of passengers per embarkation port.
# The statements were collapsed onto one line (a SyntaxError); they are
# restored to one statement per line.
data_train.Embarked.value_counts().plot(kind='bar')
plt.title("各登船口岸上船人數")
plt.ylabel("人數")
plt.xticks(rotation=0)
各登船港口的獲救狀況
# Stacked bar chart of survivors / non-survivors per embarkation port.
# The statements were collapsed onto one line (a SyntaxError); they are
# restored to one statement per line.
Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()

# Building the frame from the two Series aligns them on the port code.
df = pd.DataFrame({'獲救':Survived_1, '未獲救':Survived_0})
df.plot(kind='bar', stacked=True)
plt.title("登錄港口乘客的獲救狀況")
plt.xlabel("登錄港口")
plt.ylabel("人數")
plt.xticks(rotation=0)
各性別的獲救狀況
# Stacked bar chart of survival outcome split by sex.
# The statements were collapsed onto one line (a SyntaxError); they are
# restored to one statement per line.
Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()

# Rows are the Survived values; columns are the two sexes.
df = pd.DataFrame({'男性':Survived_m, '女性':Survived_f})
df.plot(kind='bar', stacked=True)
plt.title("男女性別獲救狀況")
plt.xlabel("性別")
plt.ylabel("人數")
# Relabel the Survived values on the x axis (position 0 / position 1).
plt.xticks([0,1], ["未獲救","獲救"], rotation=0)
SibSp字段獲救狀況
# Stacked bar chart of survivors / non-survivors per SibSp value.
# The statements were collapsed onto one line (a SyntaxError); they are
# restored to one statement per line.
SibSp_0 = data_train.SibSp[data_train.Survived == 0].value_counts()
SibSp_1 = data_train.SibSp[data_train.Survived == 1].value_counts()

# Building the frame from the two Series aligns them on the SibSp value.
SibSp_df = pd.DataFrame({'未獲救':SibSp_0, '獲救':SibSp_1})
SibSp_df.plot(kind='bar',stacked=True)
# NOTE(review): the label text below says "cousins", but the Kaggle data
# dictionary appears to define SibSp as siblings/spouses aboard -- confirm
# and correct the wording if so. The strings are left unchanged here.
plt.title("堂兄弟/妹個數獲救狀況")
plt.xlabel("堂兄弟/妹個數")
plt.ylabel("人數")
plt.xticks(rotation=0)
Parch字段獲救狀況
# Stacked bar chart of survivors / non-survivors per Parch value.
# The statements were collapsed onto one line (a SyntaxError); they are
# restored to one statement per line.
Parch_0 = data_train.Parch[data_train.Survived == 0].value_counts()
Parch_1 = data_train.Parch[data_train.Survived == 1].value_counts()

# Building the frame from the two Series aligns them on the Parch value.
Parch_df = pd.DataFrame({'未獲救':Parch_0, '獲救':Parch_1})
Parch_df.plot(kind='bar',stacked=True)
plt.title("父母與小孩個數獲救狀況")
plt.xlabel("父母與小孩個數")
plt.ylabel("人數")
plt.xticks(rotation=0)