scikit-learn預處理實例之一:使用FunctionTransformer選擇列

本例展示如何在管道(pipeline)中使用FunctionTransformer。如果你知道數據集的第一主成分與分類任務無關,可以使用FunctionTransformer選取經PCA轉換後的數據中除第一列以外的所有列。


# coding:utf-8

from pylab import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


# Font properties loaded from a font file path; passed as ``fontproperties``
# to the figure titles in the script section below.
# NOTE(review): presumably needed so the CJK title text renders; the font file
# must be resolvable at this relative path -- confirm on the target machine.
myfont = matplotlib.font_manager.FontProperties(fname="Microsoft-Yahei-UI-Light.ttc")
# Disable the ``axes.unicode_minus`` rc setting.
# NOTE(review): presumably so minus signs on the axes do not depend on a
# Unicode minus glyph being available in the font -- confirm.
mpl.rcParams['axes.unicode_minus'] = False

def _generate_vector(shift=0.5, noise=15):
    return np.arange(1000) + (np.random.rand(1000) - shift) * noise


def generate_dataset():
    """Build the two-class toy dataset used by this example.

    The samples lie along two lines of slope 1: class 0 has an intercept of
    100 and class 1 has an intercept of 0. Coordinates come from
    ``_generate_vector`` and are therefore noisy.

    Returns:
        A tuple ``(X, y)``: ``X`` stacks the class-0 points on top of the
        class-1 points, two columns per point; ``y`` holds 1000 zeros
        followed by 1000 ones.
    """
    # Draw the four coordinate vectors in the same order as they are used,
    # so the sequence of random draws is fixed.
    class0_x = _generate_vector()
    class0_y = _generate_vector() + 100
    class1_x = _generate_vector()
    class1_y = _generate_vector()

    class0_points = np.column_stack((class0_x, class0_y))
    class1_points = np.column_stack((class1_x, class1_y))

    points = np.vstack((class0_points, class1_points))
    labels = np.hstack((np.zeros(1000), np.ones(1000)))
    return points, labels


def all_but_first_column(X):
    """Return every column of ``X`` except the first.

    ``X`` is indexed two-dimensionally: all rows are kept, and columns are
    taken from index 1 onward.
    """
    trailing_columns = slice(1, None)
    return X[:, trailing_columns]


def drop_first_component(X, y):
    """Project held-out samples with PCA and discard the first component.

    Builds a pipeline of ``PCA`` followed by a ``FunctionTransformer`` that
    wraps ``all_but_first_column``, fits it on the training part of a
    ``train_test_split`` of ``(X, y)``, and applies it to the test part.

    Returns:
        A tuple of the transformed test samples and their labels.
    """
    column_selector = FunctionTransformer(all_but_first_column)
    reducer = make_pipeline(PCA(), column_selector)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y)
    reducer.fit(X_fit, y_fit)

    projected = reducer.transform(X_holdout)
    return projected, y_holdout


if __name__ == '__main__':
    # Generate the dataset once so that both figures describe the same
    # samples; regenerating it for the second figure would plot the
    # transformation of different random points than those shown first.
    X, y = generate_dataset()
    lw = 0  # no outline around the scatter markers

    # Figure 1: the raw two-column data, coloured by class label.
    plt.figure()
    plt.scatter(X[:, 0], X[:, 1], c=y, lw=lw)
    plt.title(u"FunctionTransformer選擇數據列",fontproperties=myfont)

    # Figure 2: the test split of the same data after PCA with the first
    # component dropped. Only one column remains, so it is drawn along a
    # horizontal line at zero.
    plt.figure()
    X_transformed, y_transformed = drop_first_component(X, y)
    plt.scatter(
        X_transformed[:, 0],
        np.zeros(len(X_transformed)),
        c=y_transformed,
        lw=lw,
        s=60
    )
    plt.title(u"FunctionTransformer選擇數據列",fontproperties=myfont)
    plt.show()
相關文章
相關標籤/搜索