Naïve Bayes Classifier
Specifically, we will use the Bernoulli-Dirichlet model for text classification.
We will train the model using both maximum-likelihood estimates and Bayesian updating, and compare the two in terms of predictive success and of what can go wrong.
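In symbols (a sketch of the model we have in mind; $\pi_c$ is the prior probability of class $c$, $\theta_{dc}$ the probability that term $d$ appears in a document of class $c$, and $x_d \in \{0,1\}$ indicates whether term $d$ occurs in the document):

$$p(c \mid x) \;\propto\; \pi_c \prod_{d} \theta_{dc}^{\,x_d}\,(1-\theta_{dc})^{\,1-x_d}$$

Maximum likelihood estimates these quantities by simple counting, $\hat\pi_c = N_c/N$ and $\hat\theta_{dc} = n_{dc}/N_c$ (the fraction of class-$c$ documents containing term $d$), while the Bayesian treatment with a Dirichlet/Beta prior adds pseudo-counts to the same ratios.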
We will be using the WebKB dataset (original data: the WebKB dataset website).
[Scikit-learn] 1.9 Naive Bayes
[ML] Naive Bayes for Text Classification
[ML] Naive Bayes for email classification
# Make division default to floating-point, saving confusion
# i.e. 3/4 = 0.75 3//4 = 0
from __future__ import division
# Necessary libraries
import scipy as sp
import numpy as np
import matplotlib.pyplot as pl
# Put the graphs where we can see them
# (i.e. embed matplotlib figures directly in the notebook)
%matplotlib inline
# Display a warning on important floating-point errors
np.seterr(divide='warn', invalid='warn');
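For example (a quick illustrative check, not part of the original notes): with 'warn' in effect, taking the log of zero emits a RuntimeWarning rather than failing silently, which is exactly the failure mode that will matter for the maximum-likelihood estimates later on.
np.log(np.zeros(1))  # RuntimeWarning: divide by zero encountered in log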
data = np.load('webkb.npz')
print(type(data))
# training data
xtrain = data['xtrain']
ytrain = data['ytrain']
# test data
xtest = data['xtest']
ytest = data['ytest']
# which class is which?
class_label_strings = data['class_label_strings']
# we don't need the original any more
del(data)
print("X training data dimensions = {!r}".format(xtrain.shape)) # -->
print("Y training data dimensions = {!r}".format(ytrain.shape))
print("X test data dimensions = {!r}".format(xtest.shape))
print("Y test data dimensions = {!r}".format(ytest.shape))
print("Number of y labels = {!r}".format(len(class_label_strings)))
# Peek at the raw data: a binary document-by-term matrix
xtrain
# number of terms (features)
xtrain.shape[1]
# an index for each training document
np.arange(xtrain.shape[0])
# Sum over documents (axis=0): how many documents contain each term
np.sum(xtrain, axis=0)
# Sum over terms (axis=1): how many distinct terms each document contains
np.sum(xtrain, axis=1)
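To double-check the axis convention, a tiny toy example (hypothetical array, not the WebKB data):
m = np.array([[1, 2, 3],
              [4, 5, 6]])
print(np.sum(m, axis=0))  # [5 7 9]  -- one total per column (per term)
print(np.sum(m, axis=1))  # [6 15]   -- one total per row (per document)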
# Plot the per-term document counts across the whole training set
pl.bar(np.arange(xtrain.shape[1]), np.sum(xtrain, axis=0), width=1);
# Select the training documents whose one-hot label has a 1 in column 2
# (i.e. class index 2), keeping all of their features
x2 = xtrain[ytrain[:, 2]==1, :]
pl.bar( np.arange(x2.shape[1]), np.mean(x2, axis=0), width=1, alpha=0.5);
x3 = xtrain[ytrain[:, 3]==1, :]
pl.bar(np.arange(x3.shape[1]), np.mean(x3, axis=0), width=1, alpha=0.5);
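The x2 and x3 above are built with boolean-mask row indexing; a toy sketch of how that works (hypothetical 4-by-3 array, not the WebKB data):
a = np.arange(12).reshape(4, 3)
mask = np.array([True, False, True, False])
print(a[mask, :])  # keeps rows 0 and 2, all columns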
# Effectively zoom in on part of the data (magnifying it): look only at the first 100 terms
pl.bar(np.arange(100), np.mean(x2[:, :100], axis=0), width=1, alpha=0.5);
pl.bar(np.arange(100), np.mean(x3[:, :100], axis=0), width=1, alpha=0.5);
def categorical_bar(val, **kwargs):
"""
Convenient categorical bar plot, labelled with the class strings.
This is handy if you want to plot something versus class.
"""
n_cat = len(class_label_strings)
cat_index = np.arange(n_cat)
bar = pl.bar(cat_index, val, width=1, **kwargs);
pl.xticks(cat_index, class_label_strings)
return bar
categorical_bar(np.sum(ytrain, axis=0));
Or, print the counts directly as numbers:
for label_string, n_in_class in zip(class_label_strings, np.sum(ytrain, axis=0)):
print("{}: {}".format(label_string, n_in_class))
Fitting just takes simple averages: a thoroughly naive method.
def fit_naive_bayes_ml(x, y):
    """
    Given an array of features `x` and an array of labels `y`,
    return ML estimates of class probabilities `pi`
    and class-conditional feature probabilities `theta`.
    """
    n_class = y.shape[1]
    n_feat = x.shape[1]
    print(n_feat)
    print(len(x[0]))
    pi_counts = np.sum(y, axis=0)
    print(pi_counts)
    # pi = pi_counts/np.sum(pi_counts)
    # np.sum(y) also gives the total over all elements of the matrix at once
    pi = pi_counts/np.sum(y)
    print("pi: ", pi)
    print((n_feat, n_class))
    theta = np.zeros((n_feat, n_class))
    print("theta: ", theta)
    for cls in range(n_class):
        docs_in_class = (y[:, cls]==1)
        # Restrict to the training documents of this class and count,
        # for each term, how many of those documents contain it
        class_feat_count = x[docs_in_class, :].sum(axis=0)
        # Two ways to sum a matrix: np.sum(...) or x.sum()
        theta[:, cls] = class_feat_count/(docs_in_class.sum())
        # theta[:, cls] = class_feat_count/np.sum(docs_in_class)
    return pi, theta
pi_hat, theta_hat = fit_naive_bayes_ml(xtrain, ytrain)
print("pi_hat: ", pi_hat)
print("theta_hat: ", theta_hat)
The output:
1703
1703
[ 165.   99.  345.   62.   31.]
('pi: ', array([ 0.23504274,  0.14102564,  0.49145299,  0.08831909,  0.04415954]))
(1703, 5)
('theta: ', array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]]))
('pi_hat: ', array([ 0.23504274,  0.14102564,  0.49145299,  0.08831909,  0.04415954]))
('theta_hat: ', array([[ 0.01818182,  0.04040404,  0.00289855,  0.        ,  0.        ],
       [ 0.        ,  0.01010101,  0.0115942 ,  0.        ,  0.03225806],
       [ 0.        ,  0.03030303,  0.00869565,  0.01612903,  0.        ],
       ...,
       [ 0.01212121,  0.        ,  0.0115942 ,  0.01612903,  0.03225806],
       [ 0.39393939,  0.45454545,  0.53913043,  0.43548387,  0.41935484],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ]]))
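Note the exact zeros in theta_hat (for example the whole last row). A quick extra sanity check, not in the original notes, counting how many class-conditional term probabilities are exactly 0 or exactly 1 (both will blow up once we take logs):
print("exact zeros: {}, exact ones: {}".format(np.sum(theta_hat == 0), np.sum(theta_hat == 1)))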
Next, define a function that predicts the class probabilities for a single document. The calculations are done in log space to avoid underflow:
from scipy.misc import logsumexp

def predict_class_prob(x, pi, theta):
    class_feat_l = np.zeros_like(theta)
    # Calculations in log space to avoid underflow.
    # Only look up theta for the terms that occur in this document;
    # terms that do not occur contribute their (1 - theta) factor instead.
    print(len(theta[x==1, :]))
    class_feat_l[x==1, :] = np.log(theta[x==1, :])
    class_feat_l[x==0, :] = np.log(1 - theta[x==0, :])
    class_l = class_feat_l.sum(axis=0) + np.log(pi)
    # logsumexp(a) is equivalent to np.log(np.sum(np.exp(a))), so subtracting it and
    # exponentiating normalises the class scores into probabilities that sum to 1
    return np.exp(class_l - logsumexp(class_l))

Now take one test example x and try a prediction:
categorical_bar( predict_class_prob(xtest[0,:], pi_hat, theta_hat), color='orange' );
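As an aside on the logsumexp step inside predict_class_prob: subtracting logsumexp(a) before exponentiating gives the same answer as normalising exp(a) directly, but stays stable when the unnormalised log-probabilities are very negative. A tiny hypothetical check:
a = np.array([-1000.0, -1001.0, -1002.0])
print(np.exp(a) / np.sum(np.exp(a)))  # naive way underflows: 0/0 -> nan
print(np.exp(a - logsumexp(a)))       # stable: [ 0.66524096  0.24472847  0.09003057]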
def predictive_accuracy(xdata, ydata, predictor, *args):
    """
    Given an N-by-D array of features `xdata`, an N-by-C array of
    one-hot-encoded true classes `ydata` and a predictor function `predictor`,
    return the proportion of correct predictions.
    We accept an additional argument list `args` that will be passed
    to the predictor function.
    """
    correct = np.zeros(xdata.shape[0])
    for i, x in enumerate(xdata):
        prediction = predictor(x, *args)
        correct[i] = np.all(ydata[i, :] == prediction)
    return correct.mean()

def predict_class(x, pi, theta):
    probs = predict_class_prob(x, pi, theta)
    print(probs)
    prediction = np.zeros_like(probs)
    # Put a 1 at the index of the largest probability,
    # i.e. predict the most probable class
    prediction[np.argmax(probs)] = 1
    return prediction

test_correct_ml = predictive_accuracy(xtest, ytest, predict_class, pi_hat, theta_hat)
The experiments above essentially expose the weaknesses of maximum likelihood: the assorted problems caused by probabilities that are exactly zero and the infinities they produce once we take logs.
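As promised in the introduction, Bayesian updating fixes this: with a symmetric Beta prior on each theta and a Dirichlet prior on pi, the posterior-mean estimates just add pseudo-counts to the same ratios, so no probability is ever exactly 0 or 1. A minimal sketch (the function name fit_naive_bayes_bayes and the choice alpha = 1 are my own, not from the original notes):
def fit_naive_bayes_bayes(x, y, alpha=1.0):
    """
    Posterior-mean (Dirichlet/Beta-smoothed) estimates of `pi` and `theta`.
    `alpha` is the symmetric prior pseudo-count.
    """
    n_class = y.shape[1]
    n_feat = x.shape[1]
    # class prior: add alpha pseudo-documents to every class
    pi_counts = np.sum(y, axis=0) + alpha
    pi = pi_counts / np.sum(pi_counts)
    theta = np.zeros((n_feat, n_class))
    for cls in range(n_class):
        docs_in_class = (y[:, cls] == 1)
        class_feat_count = x[docs_in_class, :].sum(axis=0)
        # Beta(alpha, alpha) posterior mean: never exactly 0 or 1
        theta[:, cls] = (class_feat_count + alpha) / (docs_in_class.sum() + 2*alpha)
    return pi, theta

pi_bayes, theta_bayes = fit_naive_bayes_bayes(xtrain, ytrain)
test_correct_bayes = predictive_accuracy(xtest, ytest, predict_class, pi_bayes, theta_bayes)
print("ML    test accuracy: {:.3f}".format(test_correct_ml))
print("Bayes test accuracy: {:.3f}".format(test_correct_bayes))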