# Deep feature synthesis: build the feature matrix (and the list of feature
# definitions) for the 'clients' entity.
# NOTE(review): `ft`, `es`, `agg_primitives` and `trans_primitives` are not
# defined in this section - presumably set up earlier; confirm.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity='clients',
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
)
詳細內容請見 Featuretools 官方文件(原文此處的連結已遺失)。
from pprint import pprint

# Baseline forest with a fixed seed so its settings can be inspected.
# NOTE(review): RandomForestRegressor is not imported in this section -
# presumably imported earlier from sklearn.ensemble; confirm.
rf = RandomForestRegressor(random_state=42)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())
Step 2:為了使用 RandomizedSearchCV,我們首先需要建立一個參數網格,以便在擬合過程中進行採樣:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in the random forest: 10 evenly spaced values in [200, 2000]
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 1.0 (use all features) replaces the legacy 'auto' alias: for a regressor
# 'auto' meant "all features", and recent scikit-learn releases reject the
# string outright, so the float keeps the old behaviour and still runs.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None
max_depth = [int(x) for x in np.linspace(10, 110, num=11)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid that the randomized search samples from
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
pprint(random_grid)
Step 3:訓練
# Use a randomized search to look for the best hyperparameters.
# First create the base model to tune.
rf = RandomForestRegressor()
# Random search over the parameter grid using 3-fold cross-validation,
# trying 100 different combinations and using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model.
# NOTE(review): train_features / train_labels are not defined in this
# section - presumably prepared earlier; confirm.
rf_random.fit(train_features, train_labels)
Step 4:得到最佳參數
# Best hyperparameter combination found by the fitted random search
# (a bare expression: only displayed when run in a notebook/REPL).
rf_random.best_params_
Step 5:使用優化後的參數進行訓練和比較驗證。
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000],
}
# Create a base model
rf = RandomForestRegressor()
# Instantiate the grid search model: exhaustive search over param_grid with
# 3-fold cross-validation, using all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
Step 2:Fit 模型並重新訓練和比較驗證
# Fit the grid search on the training data
grid_search.fit(train_features, train_labels)
# Best parameter combination found (displayed only in a notebook/REPL)
grid_search.best_params_
# Take the best estimator from the search and score it on the held-out
# test set.
# NOTE(review): `evaluate` is not defined in this section - presumably a
# helper defined earlier; confirm its signature.
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_features, test_labels)
詳細內容請見:https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
def objective(hyperparameters):
    """Return the validation score obtained with the given hyperparameters.

    Schematic outline of an objective function: build a model from
    ``hyperparameters``, cross-validate it on the training data and hand the
    resulting loss back to the optimizer.

    NOTE(review): ``Classifier``, ``cross_validation`` and ``training_data``
    are placeholders not defined in this section.
    """
    model = Classifier(hyperparameters)
    validation_loss = cross_validation(model, training_data)
    return validation_loss
import lightgbm as lgb
from hyperopt import STATUS_OK

N_FOLDS = 10

# Create the dataset
# NOTE(review): train_features / train_labels are not defined in this
# section - presumably prepared earlier; confirm.
train_set = lgb.Dataset(train_features, train_labels)


def objective(params, n_folds=N_FOLDS):
    """Objective function for Gradient Boosting Machine hyperparameter tuning.

    Args:
        params: Hyperparameters for one trial. ``boosting_type`` may be a
            plain string or the nested dict
            ``{'boosting_type': ..., 'subsample': ...}`` produced by a
            conditional search space.
        n_folds: Number of cross-validation folds.

    Returns:
        A dict with the ``loss`` to minimize (1 - best mean ROC AUC), the
        flattened ``params`` actually used, and ``status`` set to STATUS_OK.
    """
    # Work on a copy so the caller's dictionary is not mutated.
    params = dict(params)

    # A conditional search space nests the subsample under boosting_type;
    # lift both to top-level keys before handing them to LightGBM
    # (subsample defaults to 1.0 when the chosen branch does not define it).
    boosting = params.get('boosting_type')
    if isinstance(boosting, dict):
        params['subsample'] = boosting.get('subsample', 1.0)
        params['boosting_type'] = boosting['boosting_type']

    # hp.quniform yields floats; these parameters must be integers.
    for name in ('num_leaves', 'subsample_for_bin', 'min_child_samples'):
        if name in params:
            params[name] = int(params[name])

    # Core step: n_folds cross-validation with early stopping, evaluated on
    # ROC AUC.
    # NOTE(review): newer LightGBM releases appear to have replaced the
    # early_stopping_rounds argument with callbacks and to prefix result keys
    # with 'valid ' - confirm against the installed version.
    cv_results = lgb.cv(params, train_set, nfold=n_folds,
                        num_boost_round=10000, early_stopping_rounds=100,
                        metrics='auc', seed=50)

    # Extract the best score
    best_score = max(cv_results['auc-mean'])

    # Loss must be minimized
    loss = 1 - best_score

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}
from hyperopt import hp

# Define the search space.
# 'boosting_type' is a conditional choice: each option is a dict carrying the
# boosting type and, for gbdt/dart, its own subsample distribution (goss
# defines no subsample).
# NOTE(review): the labels 'gdbt_subsample' and 'colsample_by_tree' are
# spelled differently from their keys; kept byte-for-byte since labels are
# runtime strings - confirm whether that is intended.
space = {
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
    'boosting_type': hp.choice('boosting_type', [
        {'boosting_type': 'gbdt',
         'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
        {'boosting_type': 'dart',
         'subsample': hp.uniform('dart_subsample', 0.5, 1)},
        {'boosting_type': 'goss'},
    ]),
    'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
}
# Sample from the full space
# NOTE(review): `sample` is not imported in this section - presumably
# hyperopt.pyll.stochastic.sample; confirm.
example = sample(space)

# Dictionary get method with default: the chosen boosting_type branch may not
# define a subsample, in which case 1.0 is used
subsample = example['boosting_type'].get('subsample', 1.0)

# Assign top-level keys (replace the nested dict by its boosting type string)
example['boosting_type'] = example['boosting_type']['boosting_type']
example['subsample'] = subsample

# Display the flattened sample (only shown in a notebook/REPL)
example
from hyperopt import tpe

# Algorithm used to suggest the next hyperparameters to try
tpe_algorithm = tpe.suggest
from hyperopt import Trials

# Trials object to track progress
bayes_trials = Trials()
from hyperopt import fmin

# Upper bound on the number of objective evaluations
MAX_EVALS = 500

# Optimize: minimize `objective` over `space`, recording every evaluation in
# `bayes_trials`
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=MAX_EVALS, trials=bayes_trials)