# L1-based feature selection: fit a sparse linear SVM, then keep only the
# features whose coefficients survive the L1 penalty.
# NOTE(review): X and y must already be defined before this snippet runs; the
# (150, 4) shape suggests the iris data -- confirm against the preceding text.
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X.shape
# (150, 4)

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
# prefit=True: reuse the already-fitted estimator instead of fitting it again.
model = SelectFromModel(lsvc, prefit=True)

# Original data --> transformed into --> the reduced-dimension data
X_new = model.transform(X)
X_new.shape
# (150, 3)
# Author: Manoj Kumar
# License: BSD 3 clause

print(__doc__)

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset.
# NOTE: load_boston was removed in scikit-learn 1.2, so this snippet needs an
# older scikit-learn release to run unchanged.
boston = load_boston()
X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)

# Bind X_transform before the loop so the plotting code below still works when
# the initial threshold already leaves two (or fewer) features.
X_transform = sfm.transform(X)
n_features = X_transform.shape[1]

# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
plt.title(
    "Features selected from Boston using SelectFromModel with "
    "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("Feature number 1")
plt.ylabel("Feature number 2")
plt.ylim([np.min(feature2), np.max(feature2)])
print(__doc__)

# Author: Fabian Pedregosa
#         Alexandre Gramfort
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import datasets

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

print("Computing regularization path using the LARS ...")
_, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True)

# Note 1: sum the absolute coefficients along each row of coefs.T, then divide
# by the last entry so the values become a ratio that ends at 1.
xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]

# Plot every coefficient against the normalised ratio, with a dashed vertical
# line at each value of xx.
plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / max|coef|')
plt.ylabel('Coefficients')
plt.title('LASSO Path')
plt.axis('tight')
「註釋一」 的結果顯示:
Computing regularization path using the LARS ...
[ 0. 60.11926965 663.66995526 888.91024335 1250.6953637 1440.79804251 1537.06598321 1914.57052862 2115.73774356 2195.55885543 2802.37509283 2863.01080401 3460.00495515]
[0. 0.01737549 0.19181185 0.25691011 0.36147213 0.41641502 0.44423809 0.55334329 0.61148402 0.63455367 0.80993384 0.82745858 1. ]
from sklearn.linear_model import LogisticRegression


class LR(LogisticRegression):
    """L1-penalised logistic regression that also trains an L2 companion model.

    After fitting, each non-zero L1 coefficient is shared equally with the
    features that the L1 model zeroed out but whose L2 coefficients lie within
    ``threshold`` of that feature's L2 coefficient.

    Parameters
    ----------
    threshold : float, default=0.01
        Maximum absolute difference between two L2 coefficients for the
        corresponding features to be treated as one group.

    All remaining parameters are forwarded unchanged to both the L1 model
    (this instance) and the internal L2 ``LogisticRegression``.
    """

    def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        # Threshold below which two L2 weights are considered close.
        self.threshold = threshold

        # Initialise this model as the L1-penalised logistic regression.
        LogisticRegression.__init__(
            self, penalty='l1', dual=dual, tol=tol, C=C,
            fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
            class_weight=class_weight, random_state=random_state,
            solver=solver, max_iter=max_iter, multi_class=multi_class,
            verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)

        # Build an L2 logistic regression with the same parameters.
        self.l2 = LogisticRegression(
            penalty='l2', dual=dual, tol=tol, C=C,
            fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
            class_weight=class_weight, random_state=random_state,
            solver=solver, max_iter=max_iter, multi_class=multi_class,
            verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)

    def fit(self, X, y, sample_weight=None):
        """Fit the L1 and L2 models, then redistribute the L1 coefficients.

        The untouched L1 coefficients are kept in ``coef_old_``; ``coef_`` is
        modified in place. Returns ``self``.
        """
        # Train the L1 logistic regression and keep a copy of its raw weights.
        super(LR, self).fit(X, y, sample_weight=sample_weight)
        self.coef_old_ = self.coef_.copy()

        # Train the L2 logistic regression on the same data.
        self.l2.fit(X, y, sample_weight=sample_weight)

        cntOfRow, cntOfCol = self.coef_.shape
        # The number of rows of the coefficient matrix corresponds to the
        # number of target classes.
        for i in range(cntOfRow):
            for j in range(cntOfCol):
                coef = self.coef_[i][j]
                # Only features the L1 model kept (non-zero weight) seed a group.
                if coef != 0:
                    idx = [j]
                    # Weight of the same feature in the L2 model.
                    coef1 = self.l2.coef_[i][j]
                    for k in range(cntOfCol):
                        coef2 = self.l2.coef_[i][k]
                        # Group feature k with j when their L2 weights differ
                        # by less than the threshold and L1 zeroed k out.
                        if abs(coef1 - coef2) < self.threshold and j != k and self.coef_[i][k] == 0:
                            idx.append(k)
                    # Spread the L1 weight evenly over the whole group.
                    mean = coef / len(idx)
                    self.coef_[i][idx] = mean
        return self
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

# Feature selection using the logistic regression with both L1 and L2
# penalties (the LR class) as the base model.
# The threshold parameter is the cut-off on the difference between weights.
# NOTE(review): the original fit_transform arguments were lost in extraction;
# the iris data/target pair is assumed here -- confirm against the source post.
iris = load_iris()
SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target)
基於樹的預測模型(見 sklearn.tree 模塊,森林見 sklearn.ensemble 模塊)可以用來計算特徵的重要程度,所以能用來去除不相關的特徵(結合 sklearn.feature_selection.SelectFromModel 使用)。
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features
# (a custom synthetic dataset generated on demand -- a handy tool).
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X, y)

# The forest holds many trees; each tree yields its own importance estimates,
# so the spread across trees is computed alongside the forest-level values.
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])