機器學習算法完整版見 fenghaootong 的 GitHub
數據共有81個特徵
SalePrice - the property’s sale price in dollars. This is the target variable that you’re trying to predict.
MSSubClass: The building class
MSZoning: The general zoning classification
LotFrontage: Linear feet of street connected to property
LotArea: Lot size in square feet
Street: Type of road access
Alley: Type of alley access
LotShape: General shape of property
LandContour: Flatness of the property
Utilities: Type of utilities available
LotConfig: Lot configuration
LandSlope: Slope of property
…
導入所需模塊
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math as mat
from scipy import stats
from scipy.stats import norm
from sklearn import preprocessing
import statsmodels.api as sm
from patsy import dmatrices
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import sklearn.linear_model as LinReg
import sklearn.metrics as metrics
導入數據
# Read the training and test tables from the local DATA directory.
data_train, data_test = map(
    pd.read_csv,
    ('../DATA/SalePrice_train.csv', '../DATA/SalePrice_test.csv'),
)
數據共有81個特徵,爲了便於說明只挑選7個特徵(注意:下文代碼的特徵列表實際只包含其中6個,未使用 1stFlrSF)
OverallQual
GrLivArea
GarageCars
TotalBsmtSF
1stFlrSF
FullBath
YearBuilt
選擇這些特徵是由於它們與房子的售賣價格相關性比較大
具體如何選擇特徵,見數據清理
數據預處理
data_train.shape
(1460, 81)
# Predictor columns kept for this walkthrough. The name `vars` shadows the
# builtin of the same name; it is left unchanged in case other cells use it.
vars = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
Y = data_train[['SalePrice']]  # target, dim (1460, 1)
ID_train = data_train[['Id']]  # dim (1460, 1)
ID_test = data_test[['Id']]  # dim (1459, 1)
# Keep only the features whose correlation with SalePrice exceeds 0.5.
# Explicit copies make the feature frames independent of the source tables,
# so the column assignments done later (fillna on X_test) are unambiguous
# and do not trigger pandas' SettingWithCopyWarning.
X_matrix = data_train[vars].copy()
X_matrix.shape  # dim (1460, 6)
X_test = data_test[vars].copy()
X_test.shape  # dim (1459, 6)
(1459, 6)
查看丟失數據
# Summarise missing values per training feature: absolute count and fraction.
total = X_matrix.isnull().sum().sort_values(ascending=False)
# isnull().mean() is the missing fraction relative to ALL rows. Dividing the
# null count by count() would use only the non-null rows as the denominator
# and overstate the fraction whenever values are missing.
percent = X_matrix.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
# no missing data in this training set
Total | Percent | |
---|---|---|
YearBuilt | 0 | 0.0 |
FullBath | 0 | 0.0 |
TotalBsmtSF | 0 | 0.0 |
GarageCars | 0 | 0.0 |
GrLivArea | 0 | 0.0 |
OverallQual | 0 | 0.0 |
# Summarise missing values per test feature: absolute count and fraction.
total = X_test.isnull().sum().sort_values(ascending=False)
# Fraction of all rows that are missing (not null count / non-null count).
percent = X_test.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
# missing data in this test set
Total | Percent | |
---|---|---|
TotalBsmtSF | 1 | 0.000686 |
GarageCars | 1 | 0.000686 |
YearBuilt | 0 | 0.000000 |
FullBath | 0 | 0.000000 |
GrLivArea | 0 | 0.000000 |
OverallQual | 0 | 0.000000 |
#help(mat.ceil) #去上限
使用均值代替缺失的數據
# Replace the missing test-set values with the mean of the same column.
X_test['TotalBsmtSF'] = X_test['TotalBsmtSF'].fillna(X_test['TotalBsmtSF'].mean())
# The GarageCars mean is rounded up with ceil so the filled value is a whole
# number (presumably because GarageCars is a count -- confirm).
X_test['GarageCars'] = X_test['GarageCars'].fillna(mat.ceil(X_test['GarageCars'].mean()))
# Re-run the missing-value summary to confirm nothing is left unfilled.
total = X_test.isnull().sum().sort_values(ascending=False)
# Fraction of all rows that are missing (not null count / non-null count).
percent = X_test.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
Total | Percent | |
---|---|---|
YearBuilt | 0 | 0.0 |
FullBath | 0 | 0.0 |
TotalBsmtSF | 0 | 0.0 |
GarageCars | 0 | 0.0 |
GrLivArea | 0 | 0.0 |
OverallQual | 0 | 0.0 |
X_test.shape
(1459, 6)
# Scale each training feature by its maximum absolute value. The fitted
# scaler keeps the per-feature scale factors for reuse on other data.
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_matrix)
print(X_train_maxabs)
[[ 0.7 0.30308401 0.5 0.1400982 0.66666667 0.99651741] [ 0.6 0.22367955 0.5 0.20654664 0.66666667 0.98308458] [ 0.7 0.31655441 0.5 0.15057283 0.66666667 0.99552239] ..., [ 0.7 0.41474654 0.25 0.18854337 0.66666667 0.96567164] [ 0.5 0.191067 0.25 0.17643208 0.33333333 0.97014925] [ 0.5 0.22261609 0.25 0.20556465 0.33333333 0.97761194]]
# Apply the scale factors learned from the TRAINING data. Calling
# fit_transform here would refit the scaler on the test set, putting train
# and test features on different scales and invalidating the coefficients
# learned on the training scale. With transform(), the printed values differ
# from output produced by a scaler refit on the test set.
X_test_maxabs = max_abs_scaler.transform(X_test)
print(X_test_maxabs)
[[ 0.5 0.17585868 0.2 0.17311089 0.25 0.97562189] [ 0.6 0.26084396 0.2 0.26084396 0.25 0.97412935] [ 0.5 0.31972522 0.4 0.18213935 0.5 0.99353234] ..., [ 0.5 0.24023553 0.4 0.24023553 0.25 0.97512438] [ 0.5 0.19038273 0. 0.17899902 0.25 0.99104478] [ 0.7 0.39254171 0.6 0.19548577 0.5 0.99154229]]
模型訓練
# Fit a linear regression of SalePrice on the scaled training features.
lr = LinReg.LinearRegression()
lr.fit(X_train_maxabs, Y)
模型預測
# Predict on the training features and report the in-sample R-squared.
Y_pred_train = lr.predict(X_train_maxabs)
# NOTE(review): "Los Reg" looks like a typo for "Lin Reg" (compare the
# test-set message); left unchanged so it still matches the recorded output.
print("Los Reg performance evaluation on Y_pred_train")
print("R-squared =", metrics.r2_score(Y, Y_pred_train))
Los Reg performance evaluation on Y_pred_train R-squared = 0.768647335422
# Predict sale prices for the scaled test features.
Y_pred_test = lr.predict(X_test_maxabs)
print("Lin Reg performance evaluation on X_test")
# R-squared is left disabled: Y holds the 1460 training targets while
# Y_pred_test has 1459 rows, so the two cannot be compared.
#print("R-squared =", metrics.r2_score(Y, Y_pred_test))
# Coefficients are on the max-abs-scaled feature scale.
print("Coefficients =", lr.coef_)
Lin Reg performance evaluation on X_test Coefficients = [[ 205199.68775757 305095.8264889 58585.26325362 178302.68126933 -16511.92112734 676458.9666186 ]]
導入模塊
#導入模塊
import pandas as pd
import numpy as np
數據預處理
# Column headers assigned to the table when it is read.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
# Read the local CSV with pandas.read_csv, turn every '?' placeholder into the
# standard missing-value marker, then drop any row that has at least one
# missing field.
data = (
    pd.read_csv('DATA/data.csv', names=column_names)
    .replace(to_replace='?', value=np.nan)
    .dropna(how='any')
)
# Number of rows and columns left after cleaning
data.shape
(683, 11)
data.head(10)
Sample code number | Clump Thickness | Uniformity of Cell Size | Uniformity of Cell Shape | Marginal Adhesion | Single Epithelial Cell Size | Bare Nuclei | Bland Chromatin | Normal Nucleoli | Mitoses | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000025 | 5 | 1 | 1 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
1 | 1002945 | 5 | 4 | 4 | 5 | 7 | 10 | 3 | 2 | 1 | 2 |
2 | 1015425 | 3 | 1 | 1 | 1 | 2 | 2 | 3 | 1 | 1 | 2 |
3 | 1016277 | 6 | 8 | 8 | 1 | 3 | 4 | 3 | 7 | 1 | 2 |
4 | 1017023 | 4 | 1 | 1 | 3 | 2 | 1 | 3 | 1 | 1 | 2 |
5 | 1017122 | 8 | 10 | 10 | 8 | 7 | 10 | 9 | 7 | 1 | 4 |
6 | 1018099 | 1 | 1 | 1 | 1 | 2 | 10 | 3 | 1 | 1 | 2 |
7 | 1018561 | 2 | 1 | 2 | 1 | 2 | 1 | 3 | 1 | 1 | 2 |
8 | 1033078 | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 5 | 2 |
9 | 1033078 | 4 | 2 | 1 | 1 | 2 | 1 | 2 | 1 | 1 | 2 |
因爲原始數據沒有提供對應的測試樣本用於評估模型性能,這裏對帶標記的數據進行分割,25%做爲測試集,其他做爲訓練集
# train_test_split is provided by sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
# Randomly hold out 25% of the labelled rows for testing and train on the
# remaining 75%. Columns 1-9 are the features, column 10 ('Class') the label.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)
# Number of training samples per class
y_train.value_counts()
2 344 4 168 Name: Class, dtype: int64
#查看測試樣本的數量和類別分佈
y_test.value_counts()
2 100 4 71 Name: Class, dtype: int64
創建模型,預測數據
#從sklearn.preprocessing導入StandardScaler
from sklearn.preprocessing import StandardScaler
#從sklearn.linear_model導入LogisticRegression(邏輯斯蒂迴歸)
from sklearn.linear_model import LogisticRegression
#從sklearn.linear_model導入SGDClassifier(隨機梯度參數)
from sklearn.linear_model import SGDClassifier
# Standardise the features: fit the scaler on the training split only, then
# apply the same transformation to the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
lr = LogisticRegression()
# Train the logistic-regression model on the standardised training data.
lr.fit(X_train, y_train)
# Predict class labels for the standardised test data.
lr_y_predict = lr.predict(X_test)
# Display the predicted labels. (SGDClassifier is imported but no SGD model
# is trained in this section.)
lr_y_predict
array([2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 2, 4, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 4, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 2, 4, 2, 4, 4], dtype=int64)
使用線性分類模型進行良/惡性腫瘤預測任務的性能分析
# Import classification_report from sklearn.metrics
from sklearn.metrics import classification_report
# Accuracy on the test split, from the model's built-in score method
print('Accuracy of LR Classifier:',lr.score(X_test,y_test))
# Remaining metrics (precision, recall, F1) per class from classification_report;
# target_names labels the two classes as Benign and Malignant
print(classification_report(y_test,lr_y_predict,target_names=['Benign','Malignant']))
Accuracy of LR Classifier: 0.988304093567 precision recall f1-score support Benign 0.99 0.99 0.99 100 Malignant 0.99 0.99 0.99 71 avg / total 0.99 0.99 0.99 171