# Make division default to floating-point, saving confusion
# i.e. 3/4 = 0.75 3//4 = 0
from __future__ import division
# Necessary libraries
import scipy as sp
import numpy as np
import matplotlib.pyplot as pl
# Put the graphs where we can see them:
# embed the matplotlib figures directly in the notebook.
# (`%matplotlib inline` is an IPython magic, not plain Python syntax.)
%matplotlib inline
# Display a warning on important floating-point errors
# (division by zero and invalid floating-point operations).
np.seterr(divide='warn', invalid='warn');
# Load the data set and split it into its named arrays.
# NOTE(review): the `np.load(` call below is truncated in this copy -- the
# file argument and the closing parenthesis are missing and must be restored
# before this cell can run.
data = np.load(
# training data
xtrain = data['xtrain']
ytrain = data['ytrain']
# test data
xtest = data['xtest']
ytest = data['ytest']
# which class is which?
class_label_strings = data['class_label_strings']
# we don't need the original any more
# NOTE(review): no statement follows this comment in this copy; presumably a
# `del data` was lost -- confirm against the original notebook.
# Report the shapes of the arrays and the number of class labels.
print("X training data dimensions = {!r}".format(xtrain.shape))
print("Y training data dimensions = {!r}".format(ytrain.shape))
print("X test data dimensions = {!r}".format(xtest.shape))
print("Y test data dimensions = {!r}".format(ytest.shape))
print("Number of y labels = {!r}".format(len(class_label_strings)))
# Sum over axis 0: one total per column (feature) of xtrain.
np.sum(xtrain, axis=0)
# Sum over axis 1: one total per row of xtrain.
np.sum(xtrain, axis=1)

# NOTE(review): the `pl.bar(...` call prefixes in this cell were missing from
# the source and have been reconstructed from the surviving arguments
# (`width=1`, `alpha=0.5`) -- confirm against the original notebook.

# Bar chart of the per-column totals over the whole training set.
pl.bar(np.arange(xtrain.shape[1]), np.sum(xtrain, axis=0), width=1);

# Select the rows whose label column 2 equals 1 (i.e. the samples of class
# index 2); keep every column of xtrain.
x2 = xtrain[ytrain[:, 2] == 1, :]
pl.bar(np.arange(x2.shape[1]), np.mean(x2, axis=0), width=1, alpha=0.5);

# Same selection for class index 3, drawn semi-transparently as well.
x3 = xtrain[ytrain[:, 3] == 1, :]
pl.bar(np.arange(x3.shape[1]), np.mean(x3, axis=0), width=1, alpha=0.5);

# Zoom in: only look at part of the data, here the first 100 terms.
pl.bar(np.arange(100), np.mean(x2[:, :100], axis=0), width=1, alpha=0.5);
pl.bar(np.arange(100), np.mean(x3[:, :100], axis=0), width=1, alpha=0.5);
def categorical_bar(val, **kwargs):
    """
    Convenient categorical bar plot, labelled with the class strings.

    This is handy if you want to plot something versus class.

    `val` holds one bar height per entry of the module-level
    `class_label_strings`; any extra keyword arguments are forwarded to
    `pl.bar`. Returns whatever `pl.bar` returns.
    """
    n_cat = len(class_label_strings)
    cat_index = np.arange(n_cat)
    # NOTE(review): `pl.bar(cat_index` was missing from the source line
    # (`bar =, val, width=1, **kwargs)`) and has been reconstructed -- confirm.
    bar = pl.bar(cat_index, val, width=1, **kwargs)
    # Label each bar position with its class name.
    pl.xticks(cat_index, class_label_strings)
    return bar
# Column sums of ytrain: for one-hot labels this is the number of training
# samples in each class. Computed once, then plotted and printed.
class_counts = np.sum(ytrain, axis=0)
categorical_bar(class_counts);
for label_string, n_in_class in zip(class_label_strings, class_counts):
    print("{}: {}".format(label_string, n_in_class))
def fit_naive_bayes_ml(x, y):
    """
    Given an array of features `x` and an array of labels `y`, return ML
    estimates of class probabilities `pi` and class-conditional feature
    probabilities `theta`.

    `x` is indexed as (sample, feature) and `y` as (sample, class); a sample
    belongs to class `c` when `y[sample, c] == 1`.

    Returns a tuple `(pi, theta)` where `pi` has one entry per class and
    `theta` has shape (n_features, n_classes).

    Note: no smoothing is applied, so `theta` can contain exact 0s and 1s,
    and a class without any samples yields a 0/0 division.

    The intermediate values are printed as a learning/debugging aid.
    """
    n_class = y.shape[1]
    n_feat = x.shape[1]
    print(n_feat)
    print(len(x[0]))

    # Number of samples per class.
    pi_counts = np.sum(y, axis=0)
    print(pi_counts)

    # np.sum(y) gives the total over all elements of the matrix directly;
    # for these labels it equals np.sum(pi_counts).
    pi = pi_counts / np.sum(y)
    print("pi: ", pi)

    print((n_feat, n_class))
    theta = np.zeros((n_feat, n_class))
    print("theta: ", theta)

    for cls in range(n_class):
        docs_in_class = (y[:, cls] == 1)
        # Restrict to the training data of this one class and total each
        # feature (word) over the documents of that class.
        class_feat_count = x[docs_in_class, :].sum(axis=0)
        # Two equivalent ways to sum: np.sum(a) or a.sum().
        theta[:, cls] = class_feat_count / docs_in_class.sum()

    return pi, theta
# Fit the maximum-likelihood estimates on the training set and show them.
pi_hat, theta_hat = fit_naive_bayes_ml(xtrain, ytrain)
print("pi_hat: ", pi_hat)
print("theta_hat: ", theta_hat)
# Output:
# 1703
# 1703
# [ 165. 99. 345. 62. 31.]
# ('pi: ', array([ 0.23504274, 0.14102564, 0.49145299, 0.08831909, 0.04415954]))
# (1703, 5)
# ('theta: ', array([[ 0., 0., 0., 0., 0.],
#        [ 0., 0., 0., 0., 0.],
#        [ 0., 0., 0., 0., 0.],
#        ...,
#        [ 0., 0., 0., 0., 0.],
#        [ 0., 0., 0., 0., 0.],
#        [ 0., 0., 0., 0., 0.]]))
# ('pi_hat: ', array([ 0.23504274, 0.14102564, 0.49145299, 0.08831909, 0.04415954]))
# ('theta_hat: ', array([[ 0.01818182, 0.04040404, 0.00289855, 0. , 0. ],
#        [ 0. , 0.01010101, 0.0115942 , 0. , 0.03225806],
#        [ 0. , 0.03030303, 0.00869565, 0.01612903, 0. ],
#        ...,
#        [ 0.01212121, 0. , 0.0115942 , 0.01612903, 0.03225806],
#        [ 0.39393939, 0.45454545, 0.53913043, 0.43548387, 0.41935484],
#        [ 0. , 0. , 0. , 0. , 0. ]]))
# Bar chart (in orange) of the class probabilities predicted for the first
# row of the test set, using the fitted pi_hat / theta_hat.
categorical_bar(
    predict_class_prob(xtest[0, :], pi_hat, theta_hat),
    color='orange'
);
try:
    # Current home of logsumexp.
    from scipy.special import logsumexp
except ImportError:
    # Older SciPy releases only expose it under scipy.misc.
    from scipy.misc import logsumexp


def predict_class_prob(x, pi, theta):
    """
    Return the posterior class probabilities for one sample.

    `x` is a vector of 0/1 feature indicators, `pi` the class
    probabilities and `theta` the (n_features, n_classes) array of
    class-conditional feature probabilities.

    The result has one entry per class and sums to 1 when the log
    likelihoods are finite. Zero entries in `theta` or `pi` produce
    `log(0) = -inf` terms.

    The number of features present in `x` is printed as a debugging aid.
    """
    class_feat_l = np.zeros_like(theta)
    # calculations in log space to avoid underflow
    present = (x == 1)
    absent = (x == 0)

    # Features (words) that occur in this sample contribute log(theta) ...
    print(len(theta[present, :]))
    class_feat_l[present, :] = np.log(theta[present, :])
    # ... and the ones that do not occur contribute log(1 - theta).
    class_feat_l[absent, :] = np.log(1 - theta[absent, :])

    # Unnormalised log posterior: log-likelihood plus log prior, per class.
    class_l = class_feat_l.sum(axis=0) + np.log(pi)

    # logsumexp(a) is equivalent to np.log(np.sum(np.exp(a))).
    # Subtracting it normalises in log space:
    #   exp(l_c - log(sum_k exp(l_k))) == exp(l_c) / sum_k exp(l_k)
    return np.exp(class_l - logsumexp(class_l))
# Proportion of test rows for which `predict_class`, run with the fitted
# pi_hat / theta_hat, reproduces the true label row.
test_correct_ml = predictive_accuracy(
    xtest, ytest, predict_class, pi_hat, theta_hat
)
def predictive_accuracy(xdata, ydata, predictor, *args):
    """
    Given an N-by-D array of features `xdata`, an N-by-C array of
    one-hot-encoded true classes `ydata` and a predictor function
    `predictor`, return the proportion of correct predictions.

    We accept an additional argument list `args` that will be passed
    to the predictor function.

    A prediction counts as correct only if it equals the true label row in
    every position.
    """
    correct = np.zeros(xdata.shape[0])
    for i, x in enumerate(xdata):
        prediction = predictor(x, *args)
        # 1.0 when the whole predicted row matches the true row, else 0.0.
        correct[i] = np.all(ydata[i, :] == prediction)
    return correct.mean()
def predict_class(x, pi, theta):
    """
    Return a one-hot prediction for the sample `x`.

    The class probabilities are obtained from `predict_class_prob(x, pi,
    theta)`; the returned array has the same shape, with a 1 at the position
    of the largest probability and 0 elsewhere.

    The probabilities are printed as a debugging aid.
    """
    probs = predict_class_prob(x, pi, theta)
    print(probs)
    prediction = np.zeros_like(probs)
    # np.argmax returns the position (index) of the largest probability.
    prediction[np.argmax(probs)] = 1
    return prediction
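# The experiments above are essentially a critique of the weaknesses of
# maximum-likelihood estimation, namely the various problems caused by
# infinities and zeros.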