from IPython.display import HTML from IPython.display import Image import sys sys.path.append('.') HTML('''<script> code_show=true; function code_toggle() { if (code_show){ $('div.input').hide(); } else { $('div.input').show(); } code_show = !code_show } $( document ).ready(code_toggle); </script> <form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
""" pandas numpy : data_process matplotlib seaborn : data visualization warning: avoid warning from packages """ import warnings def ignore_warn(*args, **kwargs): pass warnings.warn = ignore_warn # data process import pandas as pd import numpy as np # data visualization %matplotlib inline import matplotlib.pyplot as plt import seaborn as sns # set option for visilazition color = sns.color_palette() sns.set_style('darkgrid') #sns.set(style='white', context='notebook', palette='deep') """ # avoid warning ignore annoying warning (from sklearn and seaborn and other packages xgboost and lightgbm) # we can use !ls or !pip install package_name to ahcieve some magic command line """ # Set visualisation colours mycols = ["#66c2ff", "#5cd6d6", "#00cc99", "#85e085", "#ffd966", "#ffb366", "#ffb3b3", "#dab3ff", "#c2c2d6"] sns.set_palette(palette = mycols, n_colors = 4) #or sns.set(style='white', context='notebook', palette='deep') print('Data Manipulation, Mathematical Computation and Visualisation packages imported!')
Data Manipulation, Mathematical Computation and Visualisation packages imported!
""" function:Statistical packages used for transformations stats: staticstic function in scipy skew: for partial norm distributions skewed coefficient. boxcox1p: transform data or feature to normal distribution https://blog.csdn.net/u012735708/article/details/84755595,determine the lambda估算的值) pearsonr: 皮爾遜係數 """ from scipy import stats from scipy.stats import skew, norm from scipy.special import boxcox1p from scipy.stats.stats import pearsonr print('Statistical packages imported!')
Statistical packages imported!
""" ElasticNet:彈性網絡 Lasso: 奧卡姆剃刀迴歸,正則化 BayesianRidge: 貝葉斯迴歸 常見的線性迴歸模型:http://blog.sina.com.cn/s/blog_62970c250102xfgb.html,LassoLarsIC這個模型不熟悉 ensemble 方法: 隨即森林迴歸,GBDT迴歸,xgboost迴歸,lightGBM 迴歸 numpy.dtype size changed, may indicate binary incompatibility 問題解決方案: numpy 版本太高,調低numpy版本 """ # Algorithms used for modeling from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.kernel_ridge import KernelRidge import xgboost as xgb import lightgbm as lgb print('Algorithm packages imported!')
Algorithm packages imported!
""" make_pipeline: construct pipeline for processing data RobustScaler: 針對離羣點的RobustScaler有些時候,數據集中存在離羣點,用Z-Score進行標準化,可是結果不理想, 由於離羣點在標準化後喪失了利羣特性。RobustScaler針對離羣點作標準化處理,該方法對數據中心化的數據的縮放健壯性有更強的參數控制能力。 StandScaler(Z-Score): 新數據=(原數據-均值)/標準差 歸一化Max-Min:新數據=(原數據-最小值)/(最大值-最小值) """ # Pipeline and scaling preprocessing will be used for models that are sensitive from sklearn.pipeline import make_pipeline from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import LabelEncoder # from sklearn.feature_selection import SelectFromModel # from sklearn.feature_selection import SelectKBest # from sklearn.feature_selection import chi2 # 模型選擇的模塊用的比較少 print('Pipeline and preprocessing packages imported!')
Pipeline and preprocessing packages imported!
# Model select packages used for sampling dataset and optimising parameters """ KFold: 它將原始數據分紅K組(K-Fold),將每一個子集數據分別作一次驗證集,其他的K-1組子集數據做爲訓練集,這樣會獲得K個模型。 cross_val_score: 交叉驗證的評估值 train_test_split: 數據切割成訓練集和測試集(驗證集) GridSearchCV:網格搜索參數,進行模型搜索 ShuffleSplit: train_test_split的參數中shuffle參數設定爲True """ from sklearn import model_selection from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score, train_test_split from sklearn.model_selection import GridSearchCV from sklearn.model_selection import ShuffleSplit print('Model selection packages imported!')
Model selection packages imported!
from subprocess import check_output print(check_output(['ls']).decode("utf8")) # check the files available in the directory pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #設定pandas數字格式,小數點後3位
1-house-prices-solution-top-1.ipynb Stacked Regressions _ Top 4% on LeaderBoard.ipynb __pycache__ concat_kaggle_house_price.ipynb data_description.txt data_description.zip final_submission.csv input kaggle house price.ipynb laod_Algorithms.py stacking-house-prices-walkthrough-to-top-5.ipynb submission.csv
def load_data(): #Now let's import and put the train and test datasets in pandas dataframe train = pd.read_csv('input/train.csv') test = pd.read_csv('input/test.csv') return train, test
train,test = load_data()
train.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.000 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.000 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.000 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.000 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.000 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columnsjavascript
test.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1461 | 20 | RH | 80.000 | 11622 | Pave | NaN | Reg | Lvl | AllPub | ... | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
1 | 1462 | 20 | RL | 81.000 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
2 | 1463 | 60 | RL | 74.000 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
3 | 1464 | 60 | RL | 78.000 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal |
4 | 1465 | 120 | RL | 43.000 | 5005 | Pave | NaN | IR1 | HLS | AllPub | ... | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal |
5 rows × 80 columnshtml
train_ID = train['Id'] test_ID = test['Id'] #去掉Id字段,由於這個特徵沒意義 train.drop("Id", axis = 1, inplace = True) test.drop("Id", axis = 1, inplace = True)
plt.subplots(figsize=(12,6)) # 設定畫布大小 plt.subplot(1,2,1) # 圖片的排列方式1行2列的第一張圖 g= sns.regplot(x=train['GrLivArea'],y= train['SalePrice'],fit_reg=False).set_title('Beofre') plt.subplot(1,2,2) # 圖片的排列方式1行2列的第二張圖 train= train.drop(train[train['GrLivArea']>4000].index) # 去掉面積大於4000的樣本,axis=0 默認人蔘數 g=sns.regplot(x=train['GrLivArea'],y=train['SalePrice'],fit_reg=False).set_title('After')
""" P-P圖是根據變量的累積機率對應於所指定的理論分佈累積機率繪製的散點圖,用於直觀地檢測樣本數據是否符合某一律率分佈。 若是被檢驗的數據符合所指定的分佈,則表明樣本數據的點應當基本在表明理論分佈的對角線上。 """ plt.subplots(figsize=(15,6)) plt.subplot(1,2,1) g=sns.distplot(train['SalePrice'],fit=norm) mu, sigma, = norm.fit(train['SalePrice']) # 均值,標準差 skew_co =train['SalePrice'].skew() # 偏態係數 g.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} ),skew{:.2f}'.format(mu, sigma,skew_co)], loc='best') plt.subplot(1,2,2) g = stats.probplot(train['SalePrice'], plot=plt)
#We use the numpy fuction log1p which applies log(1+x) to all elements of the column train["SalePrice"] = np.log1p(train["SalePrice"]) #Check the new distribution sns.distplot(train['SalePrice'] , fit=norm); # Get the fitted parameters used by the function mu, sigma, = norm.fit(train['SalePrice']) # 均值,標準差 skew_co =train['SalePrice'].skew() # 偏態係數 #Now plot the distribution plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} ) skew{:.2f}'.format(mu, sigma,skew_co)], loc='best') plt.ylabel('Frequency') plt.title('SalePrice distribution') #Get also the QQ-plot fig = plt.figure() res = stats.probplot(train['SalePrice'], plot=plt) plt.show()
ntrain = train.shape[0] ntest = test.shape[0] y_train = train.SalePrice.values all_data = pd.concat((train, test)).reset_index(drop=True) all_data.drop(['SalePrice'], axis=1, inplace=True) print("all_data size is : {}".format(all_data.shape))
all_data size is : (2915, 79)
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100 all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30] missing_data = pd.DataFrame({'Missing Ratio' :all_data_na}) missing_data.head(20)
Missing Ratio | |
---|---|
PoolQC | 99.726 |
MiscFeature | 96.398 |
Alley | 93.208 |
Fence | 80.446 |
FireplaceQu | 48.714 |
LotFrontage | 16.672 |
GarageQual | 5.455 |
GarageCond | 5.455 |
GarageFinish | 5.455 |
GarageYrBlt | 5.455 |
GarageType | 5.386 |
BsmtExposure | 2.813 |
BsmtCond | 2.813 |
BsmtQual | 2.779 |
BsmtFinType2 | 2.744 |
BsmtFinType1 | 2.710 |
MasVnrType | 0.823 |
MasVnrArea | 0.789 |
MSZoning | 0.137 |
BsmtFullBath | 0.069 |
plt.subplots(figsize=(12,5)) # 設定畫布大小 sns.barplot(x=all_data_na.index,y=all_data_na) plt.xticks(rotation='90') # 設定x軸的標籤 plt.ylabel('percentage',fontsize=15) plt.xlabel('feature',fontsize=15) plt.title('Percent missing data by feature', fontsize=15)
Text(0.5, 1.0, 'Percent missing data by feature')
all_data["PoolQC"] = all_data["PoolQC"].fillna("None") # 字段的說明中若是沒有游泳池,所以用None填充
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None") all_data["Alley"] = all_data["Alley"].fillna("None") all_data["Fence"] = all_data["Fence"].fillna("None") all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None") for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'): all_data[col] = all_data[col].fillna('None')
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform( lambda x: x.fillna(x.median()))
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'): all_data[col] = all_data[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'): all_data[col] = all_data[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'): all_data[col] = all_data[col].fillna('None')
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None") all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0]) all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data = all_data.drop(['Utilities'], axis=1)
all_data["Functional"] = all_data["Functional"].fillna("Typ")
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0]) all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0]) all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0]) all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")
#Check remaining missing values if any all_data_na = (all_data.isnull().sum() / len(all_data)) * 100 all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False) missing_data = pd.DataFrame({'Missing Ratio' :all_data_na}) missing_data.head()
Missing Ratio |
---|
from sklearn.preprocessing import LabelEncoder cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold') # process columns, apply LabelEncoder to categorical features for c in cols: lbl = LabelEncoder() lbl.fit(list(all_data[c].values)) all_data[c] = lbl.transform(list(all_data[c].values)) # shape print('Shape all_data: {}'.format(all_data.shape))
Shape all_data: (2915, 78)
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index # Check the skew of all numerical features skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False) print("\nSkew in numerical features: \n") skewness = pd.DataFrame({'Skew' :skewed_feats}) skewness.head(20)
Skew in numerical features:
Skew | |
---|---|
MiscVal | 21.932 |
PoolArea | 18.702 |
LotArea | 13.124 |
LowQualFinSF | 12.080 |
3SsnPorch | 11.368 |
LandSlope | 4.971 |
KitchenAbvGr | 4.299 |
BsmtFinSF2 | 4.143 |
EnclosedPorch | 4.001 |
ScreenPorch | 3.944 |
BsmtHalfBath | 3.943 |
MasVnrArea | 2.601 |
OpenPorchSF | 2.529 |
WoodDeckSF | 1.848 |
1stFlrSF | 1.253 |
LotFrontage | 1.093 |
GrLivArea | 0.978 |
BsmtFinSF1 | 0.974 |
TotalSF | 0.936 |
BsmtUnfSF | 0.920 |
skewness = skewness[abs(skewness) > 0.75] print("There are {} highly skewed numerical features to Box Cox transform".format(skewness.shape[0])) skewed_features = skewness.index lam = 0.15 for feat in skewed_features: #all_data[feat] += 1 all_data[feat] = boxcox1p(all_data[feat], lam) #all_data[skewed_features] = np.log1p(all_data[skewed_features])
There are 59 highly skewed numerical features to Box Cox transform
all_data = pd.get_dummies(all_data) print(all_data.shape)
(2915, 220)
def get_data_coorelation(data): corr = data.corr() plt.subplots(figsize=(30,30)) cmap = sns.diverging_palette(150, 250, as_cmap=True) # 之後能夠固定下來這樣的格式,尤爲是對於數據的相關係數 sns.heatmap(corr, cmap="RdYlBu", vmax=1, vmin=-0.6, center=0.2, square=True, linewidths=0, cbar_kws={"shrink": .5}, annot = True)
get_data_coorelation(train)
train = all_data[:ntrain] test = all_data[ntrain:]
from sklearn.metrics import mean_squared_error
# for alg in models: # model_name = alg.__class__.__name__ # before_model_compare.loc[row_index,'Name'] = model_name # before_model_compare.loc[row_index,'Parameters'] = str(alg.get_params()) # alg.fit(X_train,Y_train) # # for cross_validation but the results are negative,we need to convert it to postive,均方偏差 # training_results = np.sqrt((-cross_val_score(alg,X_train,Y_train,cv=shuff,scoring='neg_mean_squared_error')).mean()) # test_results = np.sqrt(((Y_test-alg.predict(X_test))**2).mean()) # before_model_compare.loc[row_index,"Train mean_squared_error"] = training_results*100 # before_model_compare.loc[row_index,'Test mean_squared_error'] = test_results*100 # row_index+=1 # print(row_index,model_name,"trained>>>>")
#Validation function n_folds = 5 def rmsle_cv(model): kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values) rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf)) return(rmse)
LASSO Regressionjava
This model may be very sensitive to outliers. So we need to made it more robust on them. For that we use the sklearn's Robustscaler() method on pipeline 這個模型對於異常值很是敏感,須要使用Robustscaler方法python
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1)) # alpha參數怎麼定的?這裏面都是須要使用調參數進行解決的
###Elastic Net Regression,一樣是針對異常值處理的一個模型 ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
###Kernel Ridge Regression : KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
###Gradient Boosting Regression : GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =5)
###XGBoost : model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213, silent=1, random_state =7, nthread = -1)
####LightGBM : model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin = 55, bagging_fraction = 0.8, bagging_freq = 5, feature_fraction = 0.2319, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
score = rmsle_cv(lasso) print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Lasso score: 0.1112 (0.0071)
score = rmsle_cv(ENet) print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
ElasticNet score: 0.1112 (0.0072)
score = rmsle_cv(KRR) print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Kernel Ridge score: 0.1152 (0.0071)
score = rmsle_cv(GBoost) print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Gradient Boosting score: 0.1163 (0.0085)
score = rmsle_cv(model_xgb) print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Xgboost score: 0.1161 (0.0051)
score = rmsle_cv(model_lgb) print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))
LGBM score: 0.1154 (0.0052)
from sklearn.base import BaseEstimator,RegressorMixin,TransformerMixin,clone class Averaging_models(BaseEstimator,RegressorMixin,TransformerMixin): def __inti__(self,models): self.models = models # we define clones of the original models to fit the data in def fit(self, X, y): self.models_ = [clone(x) for x in self.models] # Train cloned base models for model in self.models_: model.fit(X, y) return self def predict(self,X): predictions = np.column_stack([model.predict(X) for model in self.models_]) return np.mean(predictions,axis=1) # 返回全部預測結果的平均值 # class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin): # def __init__(self, models): # self.models = models # # we define clones of the original models to fit the data in # def fit(self, X, y): # self.models_ = [clone(x) for x in self.models] # # Train cloned base models # for model in self.models_: # model.fit(X, y) # return self # #Now we do the predictions for cloned models and average them # def predict(self, X): # predictions = np.column_stack([ # model.predict(X) for model in self.models_ # ]) # return np.mean(predictions, axis=1)
平均基模型的結果git
# 選擇了基本6個模型ENET,GBoost,KRR,Lasso,model_lgb averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso,model_lgb)) score = rmsle_cv(averaged_models) print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Averaged base models score: 0.1082 (0.0068)
print(""" Split the total training set into two disjoint sets (here train and .holdout ) 1. 將原始的train set 分割爲兩部分,train1,驗證集(其對應的目標數據爲y-valid) Train several base models on the first part (train) 2. 用基模型在train1訓練獲得不一樣的基模型M1,M2,M3,M4... Test these base models on the second part (holdout) 3. 用上面的模型預測 驗證集,獲得rsult1,result2,reuslt3,result4... Use the predictions from 3) (called out-of-folds predictions) as the inputs, 將result1,result2,result3,result4....組成新的訓練集做爲輸入 train2 and the correct responses (target variable) 驗證集的y-valid as the outputs to train a higher level learner called meta-model. y-valid 做爲輸出,而後能夠訓練獲得一個更加高級的模型 """) # 前三步 通常是迭代而來,若是採用5fold的stacking,就須要先將訓練集train分割爲5份,而後重複5次,獲得模型對於整個訓練集的預測結果 # 將原始的訓練集組變成一個新的訓練集的一個特徵M1r, # 將不一樣的模型訓練獲得的結果排列成新的輸入集[Mr1,Mr2,Mr3,Mr4....],將整個的訓練集的y值做爲out-put,獲得新的metal model # 下圖的上層描述的是單獨一個模型的的5-flod過程,而後得到該模型處理訓練數據以後的New_feature, # 而後分別得到不一樣模型的上述特徵,組成新的輸入,訓練獲得metal model # 下層是咱們上面用的平均的方法,得到不一樣的結果,而後取平均
Split the total training set into two disjoint sets (here train and .holdout ) 1. 將原始的train set 分割爲兩部分,train1,驗證集(其對應的目標數據爲y-valid) Train several base models on the first part (train) 2. 用基模型在train1訓練獲得不一樣的基模型M1,M2,M3,M4... Test these base models on the second part (holdout) 3. 用上面的模型預測 驗證集,獲得rsult1,result2,reuslt3,result4... Use the predictions from 3) (called out-of-folds predictions) as the inputs, 將result1,result2,result3,result4....組成新的訓練集做爲輸入 train2 and the correct responses (target variable) 驗證集的y-valid as the outputs to train a higher level learner called meta-model. y-valid 做爲輸出,而後能夠訓練獲得一個更加高級的模型
(Image taken from Faron)github
print(""" On this gif, the base models are algorithms 0, 1, 2 and the meta-model is algorithm 3. The entire training dataset is A+B (target variable y known) that we can split into train part (A) and holdout part (B). And the test dataset is C. B1 (which is the prediction from the holdout part) is the new feature used to train the meta-model 3 and C1 (which is the prediction from the test dataset) is the meta-feature on which the final prediction is done. """)
On this gif, the base models are algorithms 0, 1, 2 and the meta-model is algorithm 3. The entire training dataset is A+B (target variable y known) that we can split into train part (A) and holdout part (B). And the test dataset is C. B1 (which is the prediction from the holdout part) is the new feature used to train the meta-model 3 and C1 (which is the prediction from the test dataset) is the meta-feature on which the final prediction is done.
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin): def __init__(self, base_models, meta_model, n_fold=5): self.base_models = base_models self.meta_model = meta_model self.n_fold = n_fold def fit(self, X, y): self.base_models_ = [list() for x in self.base_models] # 建立空列表,用於放kfold中的各個模型 self.meta_model_ = clone(self.meta_model) k_fold = KFold(n_splits=self.n_fold, shuffle=True, random_state=43) out_of_flods_predictions = np.zeros((X.shape[0], len(self.base_models))) for i, model in enumerate(self.base_models): for train_index, hold_index in k_fold.split(X, y): instance = clone(model) self.base_models_[i].append(instance) instance.fit(X[train_index], y[train_index]) y_pred = instance.predict(X[hold_index]) out_of_flods_predictions[hold_index, i] = y_pred self.meta_model_.fit(out_of_flods_predictions, y) return self def predict(self, X): meta_features = np.column_stack([np.column_stack([model.predict(X) for model in base_models]).mean( axis=1) for base_models in self.base_models_]) return self.meta_model_.predict(meta_features)
Stacking Averaged models Score算法
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR), meta_model = lasso) score = rmsle_cv(stacked_averaged_models) print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
Stacking Averaged models score: 0.1081 (0.0069)
def rmsle(y,y_pred): return np.sqrt(mean_squared_error(y,y_pred))
stacked_averaged_models.fit(train.values,y_train) stacked_train_pred = stacked_averaged_models.predict(train.values) stacked_pred = np.expm1(stacked_averaged_models.predict(test.values)) print(rmsle(y_train,stacked_train_pred))
0.07662229886245185
model_xgb.fit(train.values,y_train) xgb_train_pred = model_xgb.predict(train.values) xgb_pred = np.expm1(model_xgb.predict(test.values)) print(rmsle(y_train,xgb_train_pred))
0.07978944418551953
model_lgb.fit(train.values,y_train) # 這些模型的參數,都是經過GridSearch獲得
LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=0.2319, feature_fraction_seed=9, importance_type='split', learning_rate=0.05, max_bin=55, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=6, min_split_gain=0.0, min_sum_hessian_in_leaf=11, n_estimators=720, n_jobs=-1, num_leaves=5, objective='regression', random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
lgb_train_pred= model_lgb.predict(train.values) lgb_pred = np.expm1(model_lgb.predict(test.values)) print(rmsle(y_train,lgb_train_pred))
0.07145250287861045
'''RMSE on the entire Train data when averaging''' print('RMSLE score on train data:') print(rmsle(y_train,stacked_train_pred*0.70 + xgb_train_pred*0.15 + lgb_train_pred*0.15 ))
RMSLE score on train data: 0.07431573219850335
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lbg_pred*0.15 sub = pd.DataFrame() sub['Id'] = test_ID sub['SalePrice'] = ensemble sub.to_csv('submission.csv',index=False)
github地址:https://github.com/point6013/essay_for_kaggle_test網絡