from xgboost import XGBClassifier
import xgboost as xgb

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# path to where the data lies
#dpath = '/Users/qing/desktop/XGBoost/data/'
dpath = './data/'
train = pd.read_csv(dpath + "Otto_train.csv")
#train.head()
sns.countplot(train.target);
pyplot.xlabel('target');
pyplot.ylabel('Number of occurrences');
# drop ids and get labels
y_train = train['target']
y_train = y_train.map(lambda s: s[6:])        # strip the "Class_" prefix
y_train = y_train.map(lambda s: int(s) - 1)   # zero-based integer labels 0..8
train = train.drop(["id", "target"], axis=1)
X_train = np.array(train)
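The target strings look like `Class_1` through `Class_9`, so stripping the prefix and subtracting one yields the zero-based integer labels that XGBoost's multiclass objective expects. A quick sanity check, assuming the cells above have run:

# Sanity check: the nine Otto classes should map to the integers 0..8
print(np.unique(y_train))             # expected: [0 1 2 3 4 5 6 7 8]
print(X_train.shape, y_train.shape)   # feature matrix and label vector sizes match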
# prepare cross validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
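StratifiedKFold keeps each fold's class distribution close to that of the full training set, which matters here because the Otto classes are imbalanced (see the count plot above). A minimal check, again assuming the earlier cells have run:

# Each validation fold should roughly preserve the overall class proportions
for fold_id, (tr_idx, va_idx) in enumerate(kfold.split(X_train, y_train)):
    va_dist = np.bincount(y_train.iloc[va_idx], minlength=9) / len(va_idx)
    print(f"fold {fold_id}: validation class proportions {np.round(va_dist, 3)}")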
Tune the number of weak learners (n_estimators) again
def modelfit(alg, X_train, y_train, useTrainCV=True, cv_folds=None, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = 9
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          folds=cv_folds, metrics='mlogloss',
                          early_stopping_rounds=early_stopping_rounds)
        n_estimators = cvresult.shape[0]
        alg.set_params(n_estimators=n_estimators)
        print(cvresult)
        # xgb.cv returns a DataFrame by default, so it can be saved directly
        cvresult.to_csv('my_preds4_6.csv', index_label='n_estimators')

        # plot the cross-validated mlogloss curves
        test_means = cvresult['test-mlogloss-mean']
        test_stds = cvresult['test-mlogloss-std']
        train_means = cvresult['train-mlogloss-mean']
        train_stds = cvresult['train-mlogloss-std']

        x_axis = range(0, n_estimators)
        pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
        pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
        pyplot.title("XGBoost n_estimators vs Log Loss")
        pyplot.xlabel('n_estimators')
        pyplot.ylabel('Log Loss')
        pyplot.savefig('n_estimators4_6.png')

    # Fit the algorithm on the full training data
    alg.fit(X_train, y_train, eval_metric='mlogloss')

    # Predict training set
    train_predprob = alg.predict_proba(X_train)
    logloss = log_loss(y_train, train_predprob)

    # Print model report
    print("logloss of train :")
    print(logloss)
xgb6 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,  # a large value is fine; cv returns a suitable n_estimators
        max_depth=6,
        min_child_weight=4,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.6,
        colsample_bylevel=0.7,
        reg_alpha=1,
        reg_lambda=0.5,
        objective='multi:softprob',
        seed=3)

modelfit(xgb6, X_train, y_train, cv_folds=kfold)
cvresult = pd.read_csv('my_preds4_6.csv', index_col=0)  # DataFrame.from_csv is deprecated
cvresult = cvresult.iloc[100:]  # zoom in past the first 100 boosting rounds

# plot
test_means = cvresult['test-mlogloss-mean']
test_stds = cvresult['test-mlogloss-std']
train_means = cvresult['train-mlogloss-mean']
train_stds = cvresult['train-mlogloss-std']

x_axis = range(100, cvresult.shape[0] + 100)

fig = pyplot.figure(figsize=(10, 10), dpi=100)
pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.savefig('n_estimators_detail4_6.png')
pyplot.show()
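Because xgb.cv with early stopping truncates the result at the best iteration, the last row of the saved CSV corresponds to the selected n_estimators; the minimum of the test curve can also be read off directly. A minimal sketch, assuming `my_preds4_6.csv` was written by the cell above:

# Boosting round (0-based index in the CSV) with the lowest cross-validated mlogloss
best_round = cvresult['test-mlogloss-mean'].idxmin()
best_score = cvresult['test-mlogloss-mean'].min()
print('best round: %d, test mlogloss: %.5f' % (best_round, best_score))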