數據共有81個特徵

SalePrice - the property’s sale price in dollars. This is the target variable that you’re trying to predict.
MSSubClass: The building class
MSZoning: The general zoning classification
LotFrontage: Linear feet of street connected to property
LotArea: Lot size in square feet
Street: Type of road access
Alley: Type of alley access
LotShape: General shape of property
LandContour: Flatness of the property
Utilities: Type of utilities available
LotConfig: Lot configuration
LandSlope: Slope of property

導入所需模塊

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
import math as mat

from scipy import stats
from scipy.stats import norm
from sklearn import preprocessing

import statsmodels.api as sm
from patsy import dmatrices

import warnings
%matplotlib inline

import sklearn.linear_model as LinReg
import sklearn.metrics as metrics

導入數據

# Load the training and test splits of the sale-price data from CSV.
# Paths are relative to the current working directory.
data_train = pd.read_csv('../DATA/SalePrice_train.csv')
data_test = pd.read_csv('../DATA/SalePrice_test.csv')



數據預處理

(1460, 81)
# Feature columns used by the regression model.
# NOTE(review): `vars` shadows the Python builtin of the same name.
vars = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath','YearBuilt']
# Target and Id columns are kept as single-column DataFrames (double brackets).
Y = data_train[['SalePrice']] #dim (1460, 1)
ID_train = data_train[['Id']] #dim (1460, 1)
ID_test = data_test[['Id']]   #dim (1459, 1)
# Keep only the features listed in `vars`; per the author these are the ones
# whose correlation with SalePrice is greater than 0.5.
X_matrix = data_train[vars]
X_matrix.shape  #dim (1460,6)

# Same column selection applied to the test split.
X_test = data_test[vars]  
X_test.shape   #dim (1459,6)
(1459, 6)

查看丟失數據

# Check the training features for missing data.
# `total` is the number of missing values per column.
total = X_matrix.isnull().sum().sort_values(ascending=False)
# `percent` is the fraction of rows that are missing in each column. Divide by
# the total row count: DataFrame.count() only counts non-null cells, so using
# it as the denominator gives missing/non-missing instead of missing/total.
percent = (X_matrix.isnull().sum() / len(X_matrix)).sort_values(ascending=False)
# Side-by-side summary table, one row per feature.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
#no missing data in this training set
Total Percent
YearBuilt 0 0.0
FullBath 0 0.0
TotalBsmtSF 0 0.0
GarageCars 0 0.0
GrLivArea 0 0.0
OverallQual 0 0.0
# Check the test features for missing data.
# `total` is the number of missing values per column.
total = X_test.isnull().sum().sort_values(ascending=False)
# Fraction of rows missing per column. Divide by the total row count:
# DataFrame.count() only counts non-null cells, so using it as the denominator
# gives missing/non-missing instead of missing/total.
percent = (X_test.isnull().sum() / len(X_test)).sort_values(ascending=False)
# Side-by-side summary table, one row per feature.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
#missing data in this test set
Total Percent
TotalBsmtSF 1 0.000686
GarageCars 1 0.000686
YearBuilt 0 0.000000
FullBath 0 0.000000
GrLivArea 0 0.000000
OverallQual 0 0.000000
#help(mat.ceil) #去上限

使用均值代替缺失的數據

# Impute the two incomplete test-set columns with their column means.
# GarageCars is a count, so its mean is rounded up to a whole number.
# DataFrame.fillna returns a new frame, so X_test is rebound instead of
# assigning into a slice of data_test (which raises SettingWithCopyWarning).
X_test = X_test.fillna({
    'TotalBsmtSF': X_test['TotalBsmtSF'].mean(),
    'GarageCars': mat.ceil(X_test['GarageCars'].mean()),
})

# Re-check the test features after imputation.
# `total` is the number of missing values per column.
total = X_test.isnull().sum().sort_values(ascending=False)
# Fraction of rows missing per column. Divide by the total row count:
# DataFrame.count() only counts non-null cells, so using it as the denominator
# gives missing/non-missing instead of missing/total.
percent = (X_test.isnull().sum() / len(X_test)).sort_values(ascending=False)
# Side-by-side summary table, one row per feature.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
Total Percent
YearBuilt 0 0.0
FullBath 0 0.0
TotalBsmtSF 0 0.0
GarageCars 0 0.0
GrLivArea 0 0.0
OverallQual 0 0.0
(1459, 6)
  • 而後使用預處理模塊進行特徵縮放和均值歸一化。預處理模塊還提供了實用類StandardScaler,它可以在訓練集上計算均值和標準差,以便稍後在測試集上重新應用相同的變換。下面的代碼使用的是接口相同的MaxAbsScaler。
# Fit a MaxAbsScaler on the training features and scale them in one step.
# The fitted scaler object is kept so it can be applied to other data later.
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_matrix)
[[ 0.7         0.30308401  0.5         0.1400982   0.66666667  0.99651741]       
 [ 0.6         0.22367955  0.5         0.20654664  0.66666667  0.98308458]     
 [ 0.7         0.31655441  0.5         0.15057283  0.66666667  0.99552239] 
 [ 0.7         0.41474654  0.25        0.18854337  0.66666667  0.96567164]
 [ 0.5         0.191067    0.25        0.17643208  0.33333333  0.97014925]  
 [ 0.5         0.22261609  0.25        0.20556465  0.33333333  0.97761194]]
# Scale the test features with the factors already learned from the training
# set. Use transform(), not fit_transform(): refitting on the test set would
# scale the two sets differently and make the model's inputs inconsistent.
X_test_maxabs = max_abs_scaler.transform(X_test)
[[ 0.5         0.17585868  0.2         0.17311089  0.25        0.97562189]   
 [ 0.6         0.26084396  0.2         0.26084396  0.25        0.97412935]    
 [ 0.5         0.31972522  0.4         0.18213935  0.5         0.99353234] 
 [ 0.5         0.24023553  0.4         0.24023553  0.25        0.97512438]   
 [ 0.5         0.19038273  0.          0.17899902  0.25        0.99104478]
 [ 0.7         0.39254171  0.6         0.19548577  0.5         0.99154229]]




# Create and fit the linear regression model on the scaled training features.
# Without this step `lr` is undefined when predict() is called below.
lr = LinReg.LinearRegression()
lr.fit(X_train_maxabs, Y)

# In-sample predictions and goodness of fit on the training set.
Y_pred_train = lr.predict(X_train_maxabs)
print("Los Reg performance evaluation on Y_pred_train")
print("R-squared =", metrics.r2_score(Y, Y_pred_train))
Los Reg performance evaluation on Y_pred_train   
R-squared = 0.768647335422
# Predict sale prices for the scaled test features.
Y_pred_test = lr.predict(X_test_maxabs)  
print("Lin Reg performance evaluation on X_test")
# R-squared is left disabled: Y holds the training targets (1460 rows) while
# Y_pred_test has one prediction per test row (1459 rows).
#print("R-squared =", metrics.r2_score(Y, Y_pred_test))
# Fitted coefficients, one per feature column.
print("Coefficients =", lr.coef_)
Lin Reg performance evaluation on X_test 
Coefficients = [[ 205199.68775757  305095.8264889    58585.26325362  178302.68126933
   -16511.92112734  676458.9666186 ]]

Logistic Regression


import pandas as pd
import numpy as np


# Column headers for the data file, supplied explicitly via `names=`.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Read the file, turn the '?' placeholders into NaN, then drop every row that
# has at least one missing value.
data = (
    pd.read_csv('DATA/data.csv', names=column_names)
    .replace(to_replace='?', value=np.nan)
    .dropna(how='any')
)
(683, 11)
Sample code number Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape Marginal Adhesion Single Epithelial Cell Size Bare Nuclei Bland Chromatin Normal Nucleoli Mitoses Class
0 1000025 5 1 1 1 2 1 3 1 1 2
1 1002945 5 4 4 5 7 10 3 2 1 2
2 1015425 3 1 1 1 2 2 3 1 1 2
3 1016277 6 8 8 1 3 4 3 7 1 2
4 1017023 4 1 1 3 2 1 3 1 1 2
5 1017122 8 10 10 8 7 10 9 7 1 4
6 1018099 1 1 1 1 2 10 3 1 1 2
7 1018561 2 1 2 1 2 1 3 1 1 2
8 1033078 2 1 1 1 2 1 1 1 5 2
9 1033078 4 2 1 1 2 1 2 1 1 2


# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
# Features are columns 1-9 of column_names, the target is column 10 ('Class').
# 25% of the rows are held out for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(data[column_names[1:10]],data[column_names[10]],test_size = 0.25,random_state = 33)
2    344
4    168
Name: Class, dtype: int64
2    100
4     71
Name: Class, dtype: int64


from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
# Standardise the features: statistics are learned from the training set only
# and then reused unchanged on the test set.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# Train the classifier before predicting; calling predict() on an unfitted
# LogisticRegression raises NotFittedError.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)
array([2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 4, 2, 2, 4, 4,
       2, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 2,
       4, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2,
       2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 4, 4,
       2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 
       2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 4, 2, 2, 2,
       2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4,
       4, 4, 2, 4, 2, 2, 4, 2, 4, 4], dtype=int64)


from sklearn.metrics import classification_report

# Mean accuracy of the fitted classifier on the held-out test set.
print('Accuracy of LR Classifier:',lr.score(X_test,y_test))
# Per-class precision / recall / F1. Class labels are 2 and 4; target_names
# follow sorted label order, so 2 -> Benign and 4 -> Malignant.
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
Accuracy of LR Classifier: 0.988304093567
             precision    recall  f1-score   support

     Benign       0.99      0.99      0.99       100
  Malignant       0.99      0.99      0.99        71

avg / total       0.99      0.99      0.99       171