I wrote a simple multi-class classification program: given a dataset, it runs 10-fold cross-validation on it and reports the loss as well as the per-sample classification results.
The key function is def do_cross_validation(dict_feature_list, y_list, num_features, num_fold, clf). Its parameters are as follows (a toy example of the inputs follows the list):
dict_feature_list is a Python list of dicts; each element represents one sample, e.g. one document, and each <k, v> entry in a dict maps a feature k to its value v.
y_list holds the label of each sample.
num_features is the dimensionality of the dataset.
num_fold is the number of cross-validation folds; 10 means 10-fold cross-validation.
clf is the classification algorithm.
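For concreteness, here is a minimal sketch of what these inputs might look like; the tiny three-feature dataset is made up purely for illustration:

#toy inputs for do_cross_validation (made-up data, 3 features in total)
dict_feature_list = [{0: 1.0, 2: 3.0},  #sample 0: feature 0 is 1.0, feature 2 is 3.0
                     {1: 2.0},          #sample 1: only feature 1 is non-zero
                     {0: 4.0, 1: 5.0}]  #sample 2
y_list = [0, 1, 0]  #one class label per sample
num_features = 3    #dimensionality of the dataset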
The key line for cross-validation is skf = StratifiedKFold(y, n_folds=num_fold, shuffle=False, random_state=None). It splits the dataset in a stratified way according to the class distribution, keeping the class proportions in each fold as close as possible to those of the original dataset. After all, a trained model works best when the data it classifies follows the same distribution as its training data. A for loop, for i, (train_index, test_index) in enumerate(skf):, then iterates over the ten train/test splits. At the end, the code uses metrics to compute the loss; one could be lazy and call classification_report directly, but here I computed precision, recall, and f1-score under the micro, macro, and weighted averaging schemes.
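To see the stratification in action, here is a minimal sketch using the same pre-0.18 scikit-learn API as the full code below; the toy labels are made up, and each test fold keeps the 2:1 class ratio of the full label set:

from collections import Counter
from sklearn.cross_validation import StratifiedKFold
import numpy as np

y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1])  #made-up labels: six of class 0, three of class 1
skf = StratifiedKFold(y, n_folds=3, shuffle=False, random_state=None)
for i, (train_index, test_index) in enumerate(skf):
    print i, Counter(y[test_index])  #each test fold holds two samples of class 0 and one of class 1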
The function def make_2d_matrix_to_dict_lst(matrix) exists purely for testing; it turns a dense matrix into a list of dicts.
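Concretely, the helper keeps only the non-zero entries of each row (toy matrix made up for illustration):

matrix = [[1.0, 0.0, 3.0],
          [0.0, 2.0, 0.0]]
#make_2d_matrix_to_dict_lst(matrix) returns [{0: 1.0, 2: 3.0}, {1: 2.0}]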
The function def dict_lst_to_coo_sparse_matrix(dict_lst, num_features) converts a list of dicts into a sparse matrix, which can save a great deal of memory, especially for text classification.
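A quick usage sketch of that helper (defined in the full code below): a COO matrix stores only the (row, column, value) triples of the non-zero entries, which is where the memory savings come from on high-dimensional, mostly-zero text features.

dict_lst = [{0: 1.0, 2: 3.0}, {1: 2.0}]
m = dict_lst_to_coo_sparse_matrix(dict_lst, num_features=3)
print m.toarray()
#[[ 1.  0.  3.]
# [ 0.  2.  0.]]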
When actually running it, I lazily used the one-vs-rest multi-class strategy, with logistic regression as the base algorithm: clf = OneVsRestClassifier(LogisticRegression()).
# -*- coding: utf-8 -*-
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

#transfer a python dict list to scipy COO sparse matrix
#dict_lst: [{a:b},{a:b,c:d}], each dict is the feature set of an instance
#num_features: the total number of features in dataset
def dict_lst_to_coo_sparse_matrix(dict_lst, num_features):
    from scipy.sparse import coo_matrix
    n_doc = len(dict_lst)
    #collect the (row, col, value) triples of the non-zero elements
    row_vec = []
    col_vec = []
    data_vec = []
    for d_index in range(len(dict_lst)):
        for k in dict_lst[d_index]:
            row_vec.append(d_index)
            col_vec.append(k)
            data_vec.append(dict_lst[d_index][k])
    row_vec = np.array(row_vec)
    col_vec = np.array(col_vec)
    data_vec = np.array(data_vec)
    return coo_matrix((data_vec, (row_vec, col_vec)), shape=(n_doc, num_features))

#transfer a dense 2d matrix to a dict lst
def make_2d_matrix_to_dict_lst(matrix):
    lst = []
    for row in matrix:
        d = {}
        for j in range(len(row)):
            if row[j] != 0:
                d[j] = row[j]
        lst.append(d)
    return lst

#base experimental code
def do_cross_validation(dict_feature_list, y_list, num_features, num_fold, clf):
    X = dict_feature_list  #instance set
    y = np.array(y_list)   #label set
    ids = np.arange(len(X))  #instance id set
    id2result = {}
    loss_lst = []
    #make cross validation set
    skf = StratifiedKFold(y, n_folds=num_fold, shuffle=False, random_state=None)
    for i, (train_index, test_index) in enumerate(skf):
        #split dataset into train and test
        y_train = y[train_index]
        id_train = ids[train_index]
        X_train = []
        for t in train_index:
            X_train.append(X[t])
        y_test = y[test_index]
        id_test = ids[test_index]
        X_test = []
        for t in test_index:
            X_test.append(X[t])
        #make sparse representation
        sparse_X_train = dict_lst_to_coo_sparse_matrix(X_train, num_features)
        sparse_X_test = dict_lst_to_coo_sparse_matrix(X_test, num_features)
        #train a classifier on the training set
        clf.fit(sparse_X_train, y_train)
        #do prediction on the test set
        predicted_labels = clf.predict(sparse_X_test)
        #store results for later comparison
        for index in range(len(id_test)):
            id2result[id_test[index]] = (y_test[index], predicted_labels[index])
        #compute loss
        macro_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='macro')
        macro_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='macro')
        macro_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='macro')
        micro_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='micro')
        micro_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='micro')
        micro_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='micro')
        weighted_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='weighted')
        weighted_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='weighted')
        weighted_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='weighted')
        loss_lst.append((macro_pr, macro_re, macro_f1,
                         micro_pr, micro_re, micro_f1,
                         weighted_pr, weighted_re, weighted_f1))
    return loss_lst, id2result

#load digit recognition dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
num_features = len(X[0])
#make dict lst features
feature_lst = make_2d_matrix_to_dict_lst(X)
clf = OneVsRestClassifier(LogisticRegression())
loss_lst, id2result = do_cross_validation(feature_lst, y, num_features, 10, clf)
for loss in loss_lst:
    print ['%.3f' % r for r in loss]
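One caveat: sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20. On a modern version, the equivalent split looks roughly like this sketch (the rest of this post's code still targets the old API):

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=num_fold, shuffle=False)
#split() requires an X argument, but only y drives the stratified grouping
for i, (train_index, test_index) in enumerate(skf.split(np.zeros(len(y)), y)):
    pass  #build train/test sets exactly as in do_cross_validation above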