學習下Python的sklearn包作分類

寫了一個簡單的多類分類程序,給定一個數據集,在其上作10-fold交叉檢驗,輸出loss,以及分類的結果。

最關鍵的函數是def do_cross_validation(dict_feature_list, y_list, num_features, num_fold, clf)。各個參數的含義是

dict_feature_list是一個python的dict列表,列表中每個元素表明一個樣本,好比一個文檔,dict做爲&lt;k,v&gt;,k表明特徵,v是特徵的值。

y_list是樣本的標籤

num_features是數據集的維度大小

num_fold是幾回交叉檢驗,10則表明10-fold交叉檢驗

clf是分類算法函數

作交叉檢驗時,關鍵代碼是skf = StratifiedKFold(y, n_folds=num_fold, shuffle=False, random_state=None),這個方法將根據類別的分佈狀況,對數據集作stratified分隔,儘可能使得每一個fold裏的類別分佈與原始數據集相同。畢竟,機器學習train出來的model假設作分類時,所面對的數據和訓練數據有一樣的分佈纔好。經過一個for循環,for i, (train_index, test_index) in enumerate(skf):將十次交叉檢驗的數據展開。代碼最後用了metrics計算loss,固然能夠偷懶直接用classification_report,我這裏根據micro、macro、weighted三種方式,計算了下precision,recall和f1-score。

函數def make_2d_matrix_to_dict_lst(matrix)徹底是爲了測試代碼,做用是將一個dense的矩陣,變成dict的列表。

函數def dict_lst_to_coo_sparse_matrix(dict_lst, num_features):是將一個dict的列表,轉成sparse matrix,這樣能夠很大幅度的節約內存,尤爲是在作文本分類的時候。

具體用的時候,偷懶用了one-vs-rest的多分類策略,基礎算法使用的邏輯迴歸clf = OneVsRestClassifier(LogisticRegression())

# -*- coding: utf-8 -*-
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

#transfer a python dict list to scipy COO sparse matrix
#dict_lst: [{a:b},{a:b,c:d}], each dict is the feature set of an instance
#num_features: the total number of features in dataset
def dict_lst_to_coo_sparse_matrix(dict_lst, num_features):
	"""Convert a list of {feature_index: value} dicts to a COO sparse matrix.

	Each dict is one instance (row); keys are column indices, values are the
	feature values. Returns a scipy.sparse.coo_matrix of shape
	(len(dict_lst), num_features). Sparse storage saves a lot of memory for
	high-dimensional data such as text features.
	"""
	from scipy.sparse import coo_matrix
	import numpy as np

	# collect the coordinates and data of every non-zero entry
	rows, cols, vals = [], [], []
	for row_idx, feats in enumerate(dict_lst):
		for col_idx, value in feats.items():
			rows.append(row_idx)
			cols.append(col_idx)
			vals.append(value)
	return coo_matrix(
		(np.array(vals), (np.array(rows), np.array(cols))),
		shape=(len(dict_lst), num_features))

#transfer a dense 2d matrix to dict lst
def make_2d_matrix_to_dict_lst(matrix):
	"""Turn a dense 2-D matrix into a list of {column_index: value} dicts.

	Zero entries are dropped, giving a sparse per-row representation.
	Exists only to build test fixtures from dense data.
	"""
	return [
		{col: val for col, val in enumerate(row) if val != 0}
		for row in matrix
	]

#base experimental code	
def do_cross_validation(dict_feature_list, y_list, num_features, num_fold, clf):
	X = dict_feature_list#instance set
	y = np.array(y_list)#label set
	ids = np.arange(len(X))#instance id set
	
	id2result = {}
	
	loss_lst = []
	predicted_lst = []
	#make cross validation set
	skf = StratifiedKFold(y, n_folds=num_fold, shuffle=False, random_state=None)
	for i, (train_index, test_index) in enumerate(skf):
		#split dataset into train and test
		y_train = y[train_index]
		id_train = ids[train_index]
		X_train = []		
		for t in train_index:
			X_train.append(X[t])
				
		y_test = y[test_index]
		id_test = ids[test_index]
		X_test = []
		for t in test_index:
			X_test.append(X[t])
		
		#make sparse representation
		sparse_X_train = dict_lst_to_coo_sparse_matrix(X_train, num_features)
		sparse_X_test = dict_lst_to_coo_sparse_matrix(X_test, num_features)
				
		#train a classifier on the training set
		clf.fit(sparse_X_train, y_train)
		
		#do prediction on the test set
		predicted_labels = clf.predict(sparse_X_test)
		
		#store results for later comparision
		for index in range(len(id_test)):
			id2result[id_test[index]] = (y_test[index], predicted_labels[index])
		
		#compute loss
		macro_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='macro')
		macro_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='macro')
		macro_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='macro')	
		
		micro_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='micro')
		micro_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='micro')
		micro_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='micro')	
		
		weighted_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='weighted')
		weighted_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='weighted')
		weighted_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='weighted')	
		
		loss_lst.append((macro_pr, macro_re, macro_f1, micro_pr, micro_re, micro_f1, weighted_pr, weighted_re, weighted_f1))
	return loss_lst, id2result

#load digital recognition dataset	
digits = datasets.load_digits()

X = digits.data
y = digits.target
num_features = len(X[0])

#make dict lst features
feature_lst = make_2d_matrix_to_dict_lst(X)

clf = OneVsRestClassifier(LogisticRegression())

loss_lst, id2result = do_cross_validation(feature_lst, y, num_features, 10, clf)
for loss in loss_lst:
	print ['%.3f' % r for r in loss]
相關文章
相關標籤/搜索