首先需要指出的是,筆者認爲,在閱讀Weka相關算法實現的源碼之前,應該先對所閱讀的算法有原理上的認識與理解,這樣纔會在閱讀時有所心得和收穫。也正是由於這個原因,本文不再對算法的原理作出交代,敬請見諒!
同樣,NaiveBayes在繼承了AbstractClassifier的基礎上,也實現了幾個接口。OptionHandler:返回可用選項的枚舉
WeightedInstancesHandler:如果有對象用到實例權重提供的信息,就會用到這個接口
TechnicalInformationHandler:返回關於分類算法的技術信息和相關出版物的資料
Aggregateable<NaiveBayes>:返回NaiveBayes的聚合結果(暫時還不太理解)
具體實現:
public class NaiveBayes extends AbstractClassifier implements OptionHandler,
WeightedInstancesHandler, TechnicalInformationHandler,
Aggregateable<NaiveBayes> {
/** Serialization version identifier. */
static final long serialVersionUID = 5995231201785697655L;
/**
 * Per-attribute, per-class estimators. buildClassifier allocates this as
 * [numAttributes - 1][numClasses] and fills one estimator per cell.
 */
protected Estimator[][] m_Distributions;
/** Estimator for the class distribution (built as a DiscreteEstimator over the classes). */
protected Estimator m_ClassDistribution;
/**
 * When true, numeric attributes get a KernelEstimator; when false, a
 * NormalEstimator.
 */
protected boolean m_UseKernelEstimator = false;
/**
 * When true, buildClassifier passes the training data through a supervised
 * Discretize filter before creating the estimators.
 * NOTE(review): buildClassifier stores that filter in m_Disc, whose
 * declaration is not among these fields -- confirm it is declared elsewhere.
 */
protected boolean m_UseDiscretization = false;
/** Number of classes, taken from the training data in buildClassifier. */
protected int m_NumClasses;
/** Working copy of the training data used while the model is being built. */
protected Instances m_Instances;
/**
 * Fallback precision for numeric attributes, used when no precision can be
 * derived from the training data.
 */
protected static final double DEFAULT_NUM_PRECISION = 0.01;
//下面是對算法的全局描述
/**
 * Returns a human-readable description of this classifier, followed by the
 * string form of its technical information.
 *
 * NOTE(review): the text quotes a default precision of 0.1 for
 * NaiveBayesUpdateable, while DEFAULT_NUM_PRECISION in this class is
 * declared as 0.01 -- confirm which value is intended.
 *
 * @return the description text
 */
public String globalInfo() {
  StringBuilder description = new StringBuilder();

  // What the classifier is and why it is not updateable.
  description
    .append("Class for a Naive Bayes classifier using estimator classes. Numeric")
    .append(" estimator precision values are chosen based on analysis of the ")
    .append(" training data. For this reason, the classifier is not an")
    .append(" UpdateableClassifier (which in typical usage are initialized with zero")
    .append(" training instances) -- if you need the UpdateableClassifier functionality,");

  // Pointer to the updateable variant.
  description
    .append(" use the NaiveBayesUpdateable classifier. The NaiveBayesUpdateable")
    .append(" classifier will use a default precision of 0.1 for numeric attributes")
    .append(" when buildClassifier is called with zero training instances.\n\n");

  // Literature reference.
  description
    .append("For more information on Naive Bayes classifiers, see\n\n")
    .append(getTechnicalInformation().toString());

  return description.toString();
}
//設置技術信息的內容,技術信息TechnicalInformation是通過在Field下枚舉信息的各個條目實現的。
/**
 * Returns the bibliographic record attached to this classifier: the 1995
 * conference paper by John and Langley, described field by field.
 *
 * @return a TechnicalInformation entry of type INPROCEEDINGS
 */
@Override
public TechnicalInformation getTechnicalInformation() {
  final TechnicalInformation paper = new TechnicalInformation(Type.INPROCEEDINGS);

  // Authors and titles.
  paper.setValue(Field.AUTHOR, "George H. John and Pat Langley");
  paper.setValue(Field.TITLE,
    "Estimating Continuous Distributions in Bayesian Classifiers");
  paper.setValue(Field.BOOKTITLE,
    "Eleventh Conference on Uncertainty in Artificial Intelligence");

  // Publication details.
  paper.setValue(Field.YEAR, "1995");
  paper.setValue(Field.PAGES, "338-345");
  paper.setValue(Field.PUBLISHER, "Morgan Kaufmann");
  paper.setValue(Field.ADDRESS, "San Mateo");

  return paper;
}
//作用:重寫buildClassifier()方法,生成分類器
@Override
public void buildClassifier(Instances instances) throws Exception {
// 檢查分類器是否能負載測試集
getCapabilities().testWithFail(instances);
// 移除缺失分類的實例
instances = new Instances(instances);
instances.deleteWithMissingClass();
//獲得分類數
m_NumClasses = instances.numClasses();
// 複製測試集,而不是直接對其操做。
m_Instances = new Instances(instances);
// 若是測試集須要離散化,將其離散化,畢竟有些分類算法沒法處理連續值或處理結果精度較低。
if (m_UseDiscretization) {
m_Disc = new weka.filters.supervised.attribute.Discretize();
m_Disc.setInputFormat(m_Instances);
m_Instances = weka.filters.Filter.useFilter(m_Instances, m_Disc);
} else {
m_Disc = null;
}
// 爲離散化預備空間
m_Distributions = new Estimator[m_Instances.numAttributes() - 1][m_Instances
.numClasses()];
m_ClassDistribution = new DiscreteEstimator(m_Instances.numClasses(), true);
int attIndex = 0;
Enumeration<Attribute> enu = m_Instances.enumerateAttributes();
while (enu.hasMoreElements()) {
Attribute attribute = enu.nextElement();
// 若是屬性是數值型,就評價兩個相鄰值的數值精度
double numPrecision = DEFAULT_NUM_PRECISION;
if (attribute.type() == Attribute.NUMERIC) {
m_Instances.sort(attribute);
if ((m_Instances.numInstances() > 0)
&& !m_Instances.instance(0).isMissing(attribute)) {
double lastVal = m_Instances.instance(0).value(attribute);
double currentVal, deltaSum = 0;
int distinct = 0;
for (int i = 1; i < m_Instances.numInstances(); i++) {
Instance currentInst = m_Instances.instance(i);
if (currentInst.isMissing(attribute)) {
break;
}
currentVal = currentInst.value(attribute);
if (currentVal != lastVal) {
deltaSum += currentVal - lastVal;
lastVal = currentVal;
distinct++;
}
}
if (distinct > 0) {
numPrecision = deltaSum / distinct;
}
}
}
for (int j = 0; j < m_Instances.numClasses(); j++) {
switch (attribute.type()) {
case Attribute.NUMERIC:
if (m_UseKernelEstimator) {
m_Distributions[attIndex][j] = new KernelEstimator(numPrecision);
} else {
m_Distributions[attIndex][j] = new NormalEstimator(numPrecision);
}
break;
case Attribute.NOMINAL:
m_Distributions[attIndex][j] = new DiscreteEstimator(
attribute.numValues(), true);
break;
default:
throw new Exception("Attribute type unknown to NaiveBayes");
}
}
attIndex++;
}
// 計算次數
Enumeration<Instance> enumInsts = m_Instances.enumerateInstances();
while (enumInsts.hasMoreElements()) {
Instance instance = enumInsts.nextElement();
updateClassifier(instance);
}
// 存儲空間 m_Instances = new Instances(m_Instances, 0); }