This post is based on the tutorial on the official TensorFlow website. The input data is MNIST (Modified National Institute of Standards and Technology), a dataset of scanned handwritten-digit images collected by that institute, together with a label for each image, lightly preprocessed so that it is convenient for machine learning algorithms to read. The dataset can be downloaded from the website of the renowned Professor Yann LeCun.
This post first trains the model with sklearn's LogisticRegression(); the estimated parameters are plotted below (red indicates a negative estimated coefficient, blue a positive one, and green a coefficient of zero):
Judging from the plot, the outline formed by the blue points is fairly close to the outline of the corresponding digit.
Then the same dataset is trained with softmax regression in tensorflow; the estimated parameters are plotted below:
Again the outline formed by the blue points is fairly close to the corresponding digit, but comparing the two screenshots, the tensorflow result looks somewhat smoother. In terms of test-set accuracy, however, both models reach about 92%, with sklearn slightly ahead. Note that while 92% accuracy may look decent, it is actually quite low; as the official tutorial puts it, we should be a bit ashamed of it.
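For reference, both runs fit the same softmax (multinomial logistic) regression model. In the notation used by the code below (V = xW + b followed by a softmax, trained with a cross-entropy loss), the model can be written as:

\[
\hat{y} = \operatorname{softmax}(xW + b), \qquad
\hat{y}_j = \frac{\exp\big((xW)_j + b_j\big)}{\sum_{k=0}^{9} \exp\big((xW)_k + b_k\big)}, \qquad
L = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=0}^{9} y^{*}_{ij} \log \hat{y}_{ij}
\]

where x is a flattened 784-pixel image, W is a 784×10 weight matrix, b is a length-10 bias vector, and y* is the one-hot encoded label.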
#!/usr/bin/env python
# -*- coding=utf-8 -*-
# @author: 陳水平
# @date: 2017-01-10
# @description: implement a softmax regression model upon MNIST handwritten digits
# @ref: http://yann.lecun.com/exdb/mnist/

import gzip
import struct
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import tensorflow as tf

# MNIST data is stored in binary format,
# and we transform them into numpy ndarray objects by the following two utility functions
def read_image(file_name):
    with gzip.open(file_name, 'rb') as f:
        buf = f.read()

        index = 0
        magic, images, rows, columns = struct.unpack_from('>IIII', buf, index)
        index += struct.calcsize('>IIII')

        image_size = '>' + str(images * rows * columns) + 'B'
        ims = struct.unpack_from(image_size, buf, index)

        im_array = np.array(ims).reshape(images, rows, columns)
        return im_array

def read_label(file_name):
    with gzip.open(file_name, 'rb') as f:
        buf = f.read()

        index = 0
        magic, labels = struct.unpack_from('>II', buf, index)
        index += struct.calcsize('>II')

        label_size = '>' + str(labels) + 'B'
        labels = struct.unpack_from(label_size, buf, index)

        label_array = np.array(labels)
        return label_array

print "Start processing MNIST handwritten digits data..."
train_x_data = read_image("MNIST_data/train-images-idx3-ubyte.gz")
train_x_data = train_x_data.reshape(train_x_data.shape[0], -1).astype(np.float32)
train_y_data = read_label("MNIST_data/train-labels-idx1-ubyte.gz")
test_x_data = read_image("MNIST_data/t10k-images-idx3-ubyte.gz")
test_x_data = test_x_data.reshape(test_x_data.shape[0], -1).astype(np.float32)
test_y_data = read_label("MNIST_data/t10k-labels-idx1-ubyte.gz")

train_x_minmax = train_x_data / 255.0
test_x_minmax = test_x_data / 255.0

# Of course you can also use the utility function to read in MNIST provided by tensorflow
# from tensorflow.examples.tutorials.mnist import input_data
# mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
# train_x_minmax = mnist.train.images
# train_y_data = mnist.train.labels
# test_x_minmax = mnist.test.images
# test_y_data = mnist.test.labels

# We evaluate the softmax regression model by sklearn first
eval_sklearn = False
if eval_sklearn:
    print "Start evaluating softmax regression model by sklearn..."
    reg = LogisticRegression(solver="lbfgs", multi_class="multinomial")
    reg.fit(train_x_minmax, train_y_data)
    np.savetxt('coef_softmax_sklearn.txt', reg.coef_, fmt='%.6f')  # Save coefficients to a text file

    test_y_predict = reg.predict(test_x_minmax)
    print "Accuracy of test set: %f" % accuracy_score(test_y_data, test_y_predict)

eval_tensorflow = True
batch_gradient = False
if eval_tensorflow:
    print "Start evaluating softmax regression model by tensorflow..."
    # reformat y into one-hot encoding style
    lb = preprocessing.LabelBinarizer()
    lb.fit(train_y_data)
    train_y_data_trans = lb.transform(train_y_data)
    test_y_data_trans = lb.transform(test_y_data)

    x = tf.placeholder(tf.float32, [None, 784])
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    V = tf.matmul(x, W) + b
    y = tf.nn.softmax(V)

    y_ = tf.placeholder(tf.float32, [None, 10])

    loss = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

    optimizer = tf.train.GradientDescentOptimizer(0.5)
    train = optimizer.minimize(loss)

    init = tf.initialize_all_variables()

    sess = tf.Session()
    sess.run(init)

    if batch_gradient:
        for step in range(300):
            sess.run(train, feed_dict={x: train_x_minmax, y_: train_y_data_trans})
            if step % 10 == 0:
                print "Batch Gradient Descent processing step %d" % step
        print "Finally we got the estimated results, take such a long time..."
    else:
        for step in range(1000):
            sample_index = np.random.choice(train_x_minmax.shape[0], 100)
            batch_xs = train_x_minmax[sample_index, :]
            batch_ys = train_y_data_trans[sample_index, :]
            sess.run(train, feed_dict={x: batch_xs, y_: batch_ys})
            if step % 100 == 0:
                print "Stochastic Gradient Descent processing step %d" % step

    np.savetxt('coef_softmax_tf.txt', np.transpose(sess.run(W)), fmt='%.6f')  # Save coefficients to a text file

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print "Accuracy of test set: %f" % sess.run(accuracy, feed_dict={x: test_x_minmax, y_: test_y_data_trans})
The output is as follows:
Start processing MNIST handwritten digits data...
Start evaluating softmax regression model by sklearn...
Accuracy of test set: 0.926300
Start evaluating softmax regression model by tensorflow...
Stochastic Gradient Descent processing step 0
Stochastic Gradient Descent processing step 100
Stochastic Gradient Descent processing step 200
Stochastic Gradient Descent processing step 300
Stochastic Gradient Descent processing step 400
Stochastic Gradient Descent processing step 500
Stochastic Gradient Descent processing step 600
Stochastic Gradient Descent processing step 700
Stochastic Gradient Descent processing step 800
Stochastic Gradient Descent processing step 900
Accuracy of test set: 0.917400
The sklearn estimation takes quite a while, because every parameter update computes the loss over the full training set, then the gradient, and only then improves the result.
When tensorflow uses the batch gradient descent algorithm, it also takes a long time, for the same reason.
When tensorflow uses stochastic gradient descent, estimation is fast and the final result is still quite good. Each iteration computes the loss and gradient on only a small subset of the training set, which speeds things up but makes the gradient estimates noisier; increasing the number of iterations averages out that extra variance, so the overall error ends up not much worse than batch gradient descent.
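A back-of-envelope comparison makes the speed difference concrete. This is a rough sketch: the step counts and batch size come from the script above, and 60,000 is the size of the MNIST training set.

# Rough cost comparison: number of example-gradient evaluations per run.
# Step counts and batch size are taken from the script above; 60,000 is the
# size of the MNIST training set.
train_size = 60000

batch_gd_steps = 300                        # full-batch gradient descent
batch_gd_cost = batch_gd_steps * train_size  # 18,000,000 example-gradients

sgd_steps = 1000                            # stochastic (mini-batch) gradient descent
sgd_batch_size = 100
sgd_cost = sgd_steps * sgd_batch_size        # 100,000 example-gradients

print("Batch GD examples processed: %d" % batch_gd_cost)
print("SGD examples processed:      %d" % sgd_cost)
print("Ratio: %.0fx" % (float(batch_gd_cost) / sgd_cost))

Batch gradient descent touches roughly 180 times more examples here, which is why it feels so much slower even though it runs fewer steps.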
The parameter plots are drawn with R; example code follows:
library(dplyr)
library(tidyr)
library(ggplot2)

t <- read.table("coef_softmax_tf.txt")
n <- t %>%
  tibble::rownames_to_column("digit") %>%
  gather(var_name, var_value, -digit) %>%
  mutate(var_name=stringr::str_sub(var_name, 2))
n$var_name <- as.numeric(n$var_name)
n$digit <- as.numeric(n$digit)
n <- n %>%
  mutate(digit=digit-1, var_name=var_name-1,
         y=28 - floor(var_name/28), x=var_name %% 28,
         v=ifelse(var_value>0, 1, ifelse(var_value<0, -1, 0)))
ggplot(n) + geom_point(aes(x=x, y=y, color=as.factor(v))) + facet_wrap(~digit)
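For readers who prefer to stay in Python, here is a minimal matplotlib sketch of the same sign-based visualization. It is not part of the original post; it assumes coef_softmax_tf.txt has been written by the script above (10 rows, one per digit, with 784 coefficients each), and it uses a blue/white/red colormap instead of the blue/green/red scheme of the R plot.

# Minimal matplotlib alternative to the R plot above (a sketch, not the
# original post's code). Assumes coef_softmax_tf.txt was written by the
# script above: 10 rows (digits 0-9), 784 coefficients per row.
import numpy as np
import matplotlib.pyplot as plt

coef = np.loadtxt("coef_softmax_tf.txt")              # shape (10, 784)
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for digit, ax in enumerate(axes.ravel()):
    signs = np.sign(coef[digit]).reshape(28, 28)       # -1 / 0 / +1 per pixel
    # blue = positive, white = zero, red = negative (approximates the scheme above)
    ax.imshow(signs, cmap="bwr_r", vmin=-1, vmax=1)
    ax.set_title(str(digit))
    ax.axis("off")
plt.tight_layout()
plt.show()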