我使用的是Weka自帶的示例數據集。
基本的讀取數據方式爲:
// Simplest load: DataSource picks the right loader from the file extension.
Instances data1=DataSource.read("data\\iris.arff");
若是文件的拓展名未知,咱們能夠指定加載器進行加載,例如咱們能夠把以前的iris.arff文件改爲iris.data,而後經過指定加載器加載本地數據。
package weka.loaddata;

import java.io.File;

import weka.core.Instances;
import weka.core.converters.ArffLoader;

/**
 * Loads ARFF content from a file whose extension is not ".arff" by
 * choosing the loader explicitly instead of relying on DataSource's
 * extension-based auto-detection.
 */
public class Test {
    public static void main(String[] args) {
        try {
            ArffLoader loader = new ArffLoader();
            // The file is ARFF content renamed to ".data"; the explicit
            // loader ignores the extension.
            loader.setSource(new File("data\\iris.data"));
            Instances data1 = loader.getDataSet();
            // Fix: print "done" only on success — the original printed it
            // even when loading had failed.
            System.out.println("done");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
arff和csv須要人爲指定做爲類別的字段。
package weka.loaddata; import weka.core.Instances; import weka.core.converters.ConverterUtils.DataSource; public class Test { public static void main(String[] args) { try { Instances data1=DataSource.read("data\\iris.arff"); System.out.println(data1.classIndex()); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("done"); } }
返回-1表明此時並無指定類別屬性。
package weka.loaddata; import weka.core.Instances; import weka.core.converters.ConverterUtils.DataSource; public class Test { public static void main(String[] args) { try { Instances data1=DataSource.read("data\\iris.arff"); data1.setClassIndex(data1.numAttributes()-1); System.out.println(data1.classIndex()); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("done"); } }
因而咱們經過上述程序將最後一個屬性做爲類別屬性。
一種是InstanceQuery,容許檢索稀疏數據,一種是DatabaseLoader,容許增量檢索。
package weka.loaddata;

import weka.core.Instances;
import weka.experiment.InstanceQuery;

/**
 * Retrieves the iris table from MySQL in one batch via InstanceQuery.
 * InstanceQuery is a thin wrapper over JDBC: URL, user, password, SQL.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        InstanceQuery query = new InstanceQuery();
        query.setDatabaseURL("jdbc:mysql://localhost:3306/new_schema");
        query.setUsername("root");
        query.setPassword("*******");
        query.setQuery("select * from iris");
        try {
            Instances data = query.retrieveInstances();
        } finally {
            // Fix: release the JDBC connection — the original leaked it.
            query.disconnect();
        }
        System.out.println("done");
    }
}
我首先將iris數據加載進mysql數據庫了
若是你用過jdbc的話,會發現這幾個東西就是用的jdbc。
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.DatabaseLoader;

/** Batch retrieval: DatabaseLoader pulls the whole result set at once. */
public class Test {
    public static void main(String[] args) throws Exception {
        final DatabaseLoader dbLoader = new DatabaseLoader();
        dbLoader.setSource("jdbc:mysql://localhost:3306/new_schema", "root", "*******");
        dbLoader.setQuery("select * from iris");
        final Instances instances = dbLoader.getDataSet();
    }
}
批量檢索。
package weka.loaddata;

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.DatabaseLoader;

/** Incremental retrieval: rows are streamed in one Instance at a time. */
public class Test {
    public static void main(String[] args) throws Exception {
        final DatabaseLoader dbLoader = new DatabaseLoader();
        dbLoader.setSource("jdbc:mysql://localhost:3306/new_schema", "root", "zxy123456");
        dbLoader.setQuery("select * from iris");
        // First fetch only the header (attribute definitions)...
        final Instances header = dbLoader.getStructure();
        final Instances rows = new Instances(header);
        // ...then pull the rows in one by one until the stream ends.
        for (Instance row = dbLoader.getNextInstance(header);
                row != null;
                row = dbLoader.getNextInstance(header)) {
            rows.add(row);
        }
        System.out.println("done");
    }
}
增量檢索。
package weka.loaddata;

import java.io.File;
import java.io.FileOutputStream;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSink;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.XRFFSaver;

/** Saves one dataset as CSV, ARFF and XRFF using three different APIs. */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data/iris.arff");
        // 1) Write directly by file name (format chosen from extension).
        DataSink.write("data/write_iris.csv", data);
        // 2) Write through an explicit stream; try-with-resources closes it
        //    even if write() throws (the original leaked on failure).
        try (FileOutputStream arff = new FileOutputStream("data/write_iris.arff")) {
            DataSink.write(arff, data);
        }
        // 3) Use a dedicated saver for the XRFF format.
        XRFFSaver saver = new XRFFSaver();
        saver.setInstances(data);
        saver.setFile(new File("data/write_iris.xrff"));
        saver.writeBatch();
        System.out.println("done");
    }
}
能夠直接寫,也能夠指定加載器
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.DatabaseSaver;

/** Writes a dataset into a MySQL table in one batch. */
public class Test {
    public static void main(String[] args) throws Exception {
        final Instances iris = DataSource.read("data/iris.arff");
        final DatabaseSaver dbSaver = new DatabaseSaver();
        dbSaver.setDestination("jdbc:mysql://localhost:3306/new_schema", "root", "zxy123456");
        // Use the explicit table name, not the dataset's relation name.
        dbSaver.setTableName("write_iris");
        dbSaver.setRelationForTableName(false);
        dbSaver.setInstances(iris);
        dbSaver.writeBatch();
        System.out.println("done");
    }
}
saver.setRelationForTableName(false);
若是是true的話,只能將數據的relation名做爲表名,固然也能夠改關係名啦
data.setRelationName(newName);
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.DatabaseSaver;

/** Writes a dataset into MySQL row by row (incremental mode). */
public class Test {
    public static void main(String[] args) throws Exception {
        final Instances iris = DataSource.read("data/iris.arff");
        final DatabaseSaver dbSaver = new DatabaseSaver();
        dbSaver.setDestination("jdbc:mysql://localhost:3306/new_schema", "root", "zxy123456");
        dbSaver.setTableName("write_iris");
        dbSaver.setRelationForTableName(false);
        // Switch the saver into incremental mode before handing it data.
        dbSaver.setRetrieval(DatabaseSaver.INCREMENTAL);
        dbSaver.setInstances(iris);
        for (int row = 0; row < iris.numInstances(); row++) {
            dbSaver.writeIncremental(iris.instance(row));
        }
        // A null instance signals the end of the incremental stream.
        dbSaver.writeIncremental(null);
        System.out.println("done");
    }
}
增量保存,看起來就是一條一條存
package weka.loaddata;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/** Drops the first attribute of iris via the Remove filter. */
public class Test {
    public static void main(String[] args) throws Exception {
        final Instances data = DataSource.read("data/iris.arff");
        System.out.println(data);
        System.out.println("----------------");
        final Remove remove = new Remove();
        // Same effect as setOptions(new String[] {"-R", "1"}).
        remove.setAttributeIndices("1");
        remove.setInputFormat(data);
        final Instances filtered = Filter.useFilter(data, remove);
        System.out.println(filtered);
    }
}
意思是去除一個屬性,其餘的東西該如何應用,仍是看看api吧,
Instances inst1 = Filter.useFilter(data, rm);
這個應該是
的使用方法,猜的,應該是。
能夠經過內存提取數據,總共分兩步
首先設置屬性定義數據格式
其次一行一行地添加實際數據
1.1 定義數據格式
// Numeric attribute: only a name is required.
Attribute numeric=new Attribute("attribute_name");
// Date attribute: a name plus a SimpleDateFormat pattern.
Attribute date=new Attribute("attribute_name","yyyy-MM-dd");
具體日期格式參照SimpleDateFormat中對日期的規定
// Nominal attribute: the constructor takes the list of allowed labels.
ArrayList<String> labels=new ArrayList<String>(); labels.add("label_a"); labels.add("label_b"); Attribute nominal=new Attribute("attribute_name",labels);
// String attribute: a null label list declares free-form text values.
Attribute string = new Attribute("attribute_name",(ArrayList<String>)null);
提供一個ArrayList類型的null對象。
// Build the internal structure of a relational attribute: one numeric
// and one nominal sub-attribute, wrapped in an empty Instances header.
ArrayList<Attribute> atts = new ArrayList<Attribute>();
atts.add(new Attribute("rel.numeric"));
ArrayList<String> values = new ArrayList<String>();
values.add("val_A");
values.add("val_B");
values.add("val_C");
// Bug fix: the label list must be passed to the constructor. The
// original called new Attribute("rel.nominal") without it, creating a
// NUMERIC attribute and silently discarding the labels.
atts.add(new Attribute("rel.nominal", values));
Instances rel_struct = new Instances("rel", atts, 0);
Attribute relational = new Attribute("attribute_name", rel_struct);
atts裏有一個numeric屬性和一個nominal屬性,而後建立了一個大小爲0的instances對象。而後利用這個instances建立了這個relation數據屬性。
// Complete header: two numeric attributes (num1, num2) plus a nominal
// class attribute {no, yes}, assembled into an initially empty dataset.
Attribute num1 = new Attribute("num1"); Attribute num2 = new Attribute("num2"); ArrayList<String> labels = new ArrayList<String>(); labels.add("no"); labels.add("yes"); Attribute cls = new Attribute("class", labels); ArrayList<Attribute> attributes = new ArrayList<>(); attributes.add(num1); attributes.add(num2); attributes.add(cls); Instances dataset = new Instances("relation_name", attributes, 0);
咱們建立了num1,num2,cls三個屬性,而後建立了這個數據集的instances對象,
1.2 添加數據
package weka.api;

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

/**
 * Builds a dataset in memory (numeric, date, nominal and string
 * attributes) and adds a single row to it.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Attribute numeric = new Attribute("numeric");
        Attribute date = new Attribute("date", "yyyy-MM-dd");
        ArrayList<String> label = new ArrayList<String>();
        label.add("label_a");
        label.add("label_b");
        label.add("label_c");
        Attribute nominal = new Attribute("nominal", label);
        // A null label list declares a free-text string attribute.
        // Fix: typed cast instead of the original's raw-type (ArrayList) cast.
        Attribute string = new Attribute("string", (ArrayList<String>) null);
        // NOTE(review): the original sketched a relational attribute here
        // but left it commented out; omitted for clarity.

        ArrayList<Attribute> attributes = new ArrayList<>();
        attributes.add(numeric);
        attributes.add(date);
        attributes.add(nominal);
        attributes.add(string);
        Instances data = new Instances("data", attributes, 1);

        // Every cell is stored as a double: dates become timestamps,
        // nominal/string values become indices into the attribute's pool.
        double[] values = new double[data.numAttributes()];
        values[0] = 1.23;
        values[1] = data.attribute(1).parseDate("2017-8-19");
        values[2] = data.attribute(2).indexOfValue("label_c");
        System.out.println(values[2]);
        values[3] = data.attribute(3).addStringValue("A string");

        Instance inst = new DenseInstance(1, values);
        data.add(inst);
        System.out.println(data);
    }
}
relation這個東西我還不太會用。。。因此註釋掉了
須要注意的是在使用nominal屬性的時候,若是添加的值不在以前的聲明之中,他會返回-1,卻不會報錯,而在使用的時候纔會報錯,並且還找不到哪裏錯誤,從這點來看他們這個API寫的實在有點= =粗糙。。。。
package weka.api;

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

/**
 * Builds a dataset using every attribute type Weka supports — numeric,
 * nominal, string, date and relational — then adds one row.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        // --- header definition ------------------------------------------
        final ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("att1")); // numeric

        final ArrayList<String> attVals = new ArrayList<String>();
        for (int v = 0; v < 5; v++) {
            attVals.add("val" + (v + 1));
        }
        atts.add(new Attribute("att2", attVals)); // nominal
        atts.add(new Attribute("att3", (ArrayList<String>) null)); // string
        atts.add(new Attribute("att4", "yyyy-MM-dd")); // date

        // Relational attribute: its "value" is itself a small dataset.
        final ArrayList<Attribute> attsRel = new ArrayList<Attribute>();
        attsRel.add(new Attribute("att5.1"));
        final ArrayList<String> attValsRel = new ArrayList<String>();
        for (int v = 0; v < 5; v++) {
            attValsRel.add("val5." + (v + 1));
        }
        attsRel.add(new Attribute("att5.2", attValsRel));
        Instances dataRel = new Instances("att5", attsRel, 0);
        atts.add(new Attribute("att5", dataRel, 0));

        final Instances data = new Instances("MyRelation", atts, 0);

        // --- one data row -----------------------------------------------
        final double[] vals = new double[data.numAttributes()];
        vals[0] = Math.PI;
        vals[1] = attVals.indexOf("val3");
        vals[2] = data.attribute(2).addStringValue("a string");
        vals[3] = data.attribute(3).parseDate("2017-8-19");

        // Fill the relational cell with its own mini-dataset.
        dataRel = new Instances(data.attribute(4).relation(), 0);
        final double[] valsRel = new double[2];
        valsRel[0] = Math.PI + 1;
        valsRel[1] = attValsRel.indexOf("val5.3");
        dataRel.add(new DenseInstance(1, valsRel));
        vals[4] = data.attribute(4).addRelation(dataRel);

        data.add(new DenseInstance(1, vals));
        System.out.println(data);
    }
}
這個例子比以前個人好,不過關係型屬性是真的麻煩,不過理解起來就好像是,一組數據被當作一個特徵。
package weka.api;

import java.util.Random;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

/** Shuffles a copy of the dataset with a fixed, reproducible seed. */
public class Test {
    public static void main(String[] args) throws Exception {
        final Instances original = DataSource.read("data\\iris.arff");
        System.out.println(original);
        final long seed = 123456;
        // Shuffle a copy so the source ordering stays intact.
        final Instances shuffled = new Instances(original);
        shuffled.randomize(new Random(seed));
        System.out.println(shuffled);
    }
}
這是其中一種方法,在這種方法中,推薦使用種子,另外還有可使用filter的方法進行隨機排序,後文繼續介紹
如今要增長一個數值屬性和一個標稱屬性,並添加隨機值
package weka.api;

import java.util.Random;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Add;

/**
 * Appends one numeric and one nominal attribute via the Add filter,
 * then fills both new columns with random values.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        final Instances data = DataSource.read("data\\weather.numeric.arff");
        Instances result = new Instances(data);

        // Append an (initially missing) numeric attribute at the end.
        Add addNumeric = new Add();
        addNumeric.setAttributeIndex("last");
        addNumeric.setAttributeName("NumericAttribute");
        addNumeric.setInputFormat(result);
        result = Filter.useFilter(result, addNumeric);

        // Append a nominal attribute with labels A, B, C.
        Add addNominal = new Add();
        addNominal.setAttributeIndex("last");
        addNominal.setNominalLabels("A,B,C");
        addNominal.setAttributeName("NominalAttribute");
        addNominal.setInputFormat(result);
        result = Filter.useFilter(result, addNominal);

        // Fill the two new columns; draw order (double, then int) matches
        // the original so the seeded sequence is identical.
        final Random rand = new Random(1234);
        for (int row = 0; row < result.numInstances(); row++) {
            result.instance(row).setValue(result.numAttributes() - 2, rand.nextDouble());
            result.instance(row).setValue(result.numAttributes() - 1, rand.nextInt(3));
        }
        System.out.println("過濾後的數據集:");
        System.out.println(result);
    }
}
運用了Standardize,將數據集中全部數字屬性標準化,零均值與單位方差
package weka.api;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Standardize;

/**
 * Standardizes all numeric attributes to zero mean / unit variance.
 * The filter learns its statistics from the training set only and then
 * applies that same transformation to the test set.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        final Instances train = DataSource.read("data\\segment-challenge.arff");
        final Instances test = DataSource.read("data\\segment-test.arff");
        final Standardize standardize = new Standardize();
        standardize.setInputFormat(train); // learn mean/variance from train
        final Instances scaledTrain = Filter.useFilter(train, standardize);
        final Instances scaledTest = Filter.useFilter(test, standardize);
        System.out.println("new trainer");
        System.out.println(scaledTrain);
        System.out.println("new test");
        System.out.println(scaledTest);
    }
}
package weka.api;

import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Trains an unpruned J48 inside a FilteredClassifier that removes the
 * first attribute, then prints actual vs. predicted class per test row.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("data\\segment-challenge.arff");
        Instances test = DataSource.read("data\\segment-test.arff");
        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);
        if (!train.equalHeaders(test)) {
            // Fix: typo in the message ("測試機" -> "測試集").
            throw new Exception("訓練集與測試集不兼容:\n" + train.equalHeadersMsg(test));
        }

        // The wrapped filter is applied consistently to train and test.
        Remove rm = new Remove();
        rm.setAttributeIndices("1");

        J48 j48 = new J48();
        j48.setUnpruned(true);

        FilteredClassifier fc = new FilteredClassifier();
        fc.setFilter(rm);
        fc.setClassifier(j48);
        fc.buildClassifier(train);

        for (int i = 0; i < test.numInstances(); i++) {
            double pred = fc.classifyInstance(test.instance(i));
            System.out.print("index: " + (i + 1));
            System.out.print(", class: "
                    + test.classAttribute().value((int) test.instance(i).classValue()));
            System.out.println(", predict class: " + test.classAttribute().value((int) pred));
        }
    }
}
解釋一下
分類器分爲批量分類器和增量分類器
構建批量分類器分爲兩步
示例
增量分類器都實現了UpdateableClassifier接口
增量分類器用於處理規模較大的數據,不會將數據一次加載進內存,arff文件能夠增量讀取,一樣也分兩步
示例
爲啥不帶數據,由於以前loader進行的是加載結構的方法
構建分類器的評價標準有兩種方式,交叉驗證和專用測試集驗證
評價由Evaluation類實現
示例
3.1 批量分類器構建
package weka.api;

import java.io.File;

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

/** Batch training: load the whole dataset, then build an unpruned J48. */
public class Test {
    public static void main(String[] args) throws Exception {
        final ArffLoader loader = new ArffLoader();
        loader.setFile(new File("data\\weather.nominal.arff"));
        final Instances data = loader.getDataSet();
        data.setClassIndex(data.numAttributes() - 1);
        final J48 tree = new J48();
        tree.setUnpruned(true); // same effect as the "-U" option
        tree.buildClassifier(data);
        System.out.println(tree);
    }
}
4.2 增量分類器構建
package weka.api;

import java.io.File;

import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

/**
 * Incremental training: initialize the classifier on the header only,
 * then feed it one instance at a time.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        final ArffLoader loader = new ArffLoader();
        loader.setFile(new File("data\\weather.nominal.arff"));
        // Header only — no data rows are loaded yet.
        final Instances header = loader.getStructure();
        header.setClassIndex(header.numAttributes() - 1);
        final NaiveBayesUpdateable bayes = new NaiveBayesUpdateable();
        bayes.buildClassifier(header);
        for (Instance row = loader.getNextInstance(header);
                row != null;
                row = loader.getNextInstance(header)) {
            bayes.updateClassifier(row);
        }
        System.out.println(bayes);
    }
}
4.3 輸出類別分佈
package weka.api;

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

/**
 * For every test instance prints its actual class, predicted class,
 * correctness flag, and the full class probability distribution.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        final Instances train = DataSource.read("data\\segment-challenge.arff");
        final Instances test = DataSource.read("data\\segment-test.arff");
        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);
        if (!train.equalHeaders(test)) {
            throw new Exception("不相容");
        }
        final J48 tree = new J48();
        tree.buildClassifier(train);
        for (int row = 0; row < test.numInstances(); row++) {
            final double predicted = tree.classifyInstance(test.instance(row));
            final double[] distribution = tree.distributionForInstance(test.instance(row));
            System.out.print((row + 1) + " - "
                    + test.instance(row).toString(test.classIndex()) + " - "
                    + test.classAttribute().value((int) predicted) + " - ");
            if (predicted != test.instance(row).classValue()) {
                System.out.print("wrong");
            } else {
                System.out.print("correct");
            }
            System.out.println(" - " + Utils.arrayToString(distribution));
        }
    }
}
訓練了一個分類器,而後一條一條地過測試集。
double pred = classifier.classifyInstance(test.instance(i));是預測結果
double[] dist = classifier.distributionForInstance(test.instance(i));獲得的是這條數據的預測各個類的機率
4.5 交叉驗證並預測
package weka.api;

import java.util.Random;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.core.Instances;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;

/**
 * Manual 10-fold cross-validation of J48 on ionosphere: shuffle,
 * stratify, train a fresh classifier copy per fold, and accumulate all
 * fold results into a single Evaluation object.
 */
public class Test {
    public static void main(String[] args) throws Exception {
        final Instances data = DataSource.read("data\\ionosphere.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // Instantiate the classifier from its class name and options.
        final String classname = "weka.classifiers.trees.J48";
        final String[] options = {"-C", "0.25"};
        final Classifier template =
                (Classifier) Utils.forName(Classifier.class, classname, options);

        final int seed = 1234;
        final int folds = 10;

        // Work on a shuffled copy; stratify only for nominal classes.
        final Instances shuffled = new Instances(data);
        shuffled.randomize(new Random(seed));
        if (shuffled.classAttribute().isNominal()) {
            shuffled.stratify(folds);
        }

        final Evaluation eval = new Evaluation(shuffled);
        for (int fold = 0; fold < folds; fold++) {
            final Instances train = shuffled.trainCV(folds, fold);
            final Instances test = shuffled.testCV(folds, fold);
            // Train an untouched copy so folds never share state.
            final Classifier clsCopy = AbstractClassifier.makeCopy(template);
            clsCopy.buildClassifier(train);
            eval.evaluateModel(clsCopy, test);
        }

        System.out.println("===分類器設置===");
        System.out.println("分類器:" + Utils.toCommandLine(template));
        System.out.println("數據集:" + data.relationName());
        System.out.println("折數:" + folds);
        System.out.println("隨機種子:" + seed);
        System.out.println();
        System.out.println(eval.toSummaryString("=== " + folds + "折交叉認證===", false));
    }
}
其實不難理解,不過有幾個地方須要說
newData.randomize(rand);這個是將數據隨機打亂
newData.stratify(folds);這個的api是這麼寫的
Stratifies a set of instances according to its class values if the class attribute is nominal (so that afterwards a stratified cross-validation can be performed).
意思應該是,若是這個類信息是標稱的,那麼咱們以後若是用的是n折的,好比99個個體共3類,每類都33個,那假如分3折,那前33個裏應該每類大約11個左右這樣。
4.5 交叉驗證並預測
// 4.6 Cross-validation that also collects per-instance predictions.
// Per fold: a fresh copy of the classifier is trained and its results
// folded into `eval`; additionally an AddClassification filter is run
// over the fold's test set so each instance gains three extra
// attributes (predicted class, class distribution, error flag). The
// augmented test folds are concatenated into `predictedData`.
// NOTE(review): the filter is handed the untrained template
// `classifier`; AddClassification trains it on the first batch it sees,
// so the first Filter.useFilter(train, filter) call trains and the
// second (on test) predicts — the two identical-looking calls do
// different jobs.
package weka.api; import java.util.Random; import weka.classifiers.AbstractClassifier; import weka.classifiers.Classifier; import weka.classifiers.Evaluation; import weka.core.Instances; import weka.core.OptionHandler; import weka.core.Utils; import weka.core.converters.ConverterUtils.DataSource; import weka.filters.Filter; import weka.filters.supervised.attribute.AddClassification; public class Test { public static void main(String[] args) throws Exception { Instances data = DataSource.read("data\\ionosphere.arff"); data.setClassIndex(data.numAttributes() - 1); String[] options = new String[2]; String classname = "weka.classifiers.trees.J48"; options[0] = "-C"; options[1] = "0.25"; Classifier classifier = (Classifier) Utils.forName(Classifier.class, classname, options); int seed = 1234; int folds = 10; Random rand = new Random(seed); Instances newData = new Instances(data); newData.randomize(rand); if (newData.classAttribute().isNominal()) { newData.stratify(folds); } Instances predictedData = null; Evaluation eval = new Evaluation(newData); for (int i = 0; i < folds; i++) { Instances train = newData.trainCV(folds, i); Instances test = newData.testCV(folds, i); Classifier clsCopy = AbstractClassifier.makeCopy(classifier); clsCopy.buildClassifier(train); eval.evaluateModel(clsCopy, test); AddClassification filter = new AddClassification(); filter.setClassifier(classifier); filter.setOutputClassification(true); filter.setOutputDistribution(true); filter.setOutputErrorFlag(true); filter.setInputFormat(train); Filter.useFilter(train, filter); Instances pred = Filter.useFilter(test, filter); if (predictedData == null) predictedData = new Instances(pred, 0); for (int j = 0; j < pred.numInstances(); j++) predictedData.add(pred.instance(j)); } System.out.println("===分類器設置==="); if (classifier instanceof OptionHandler) System.out.println("分類器: " + classifier.getClass().getName() + " " + Utils.joinOptions( ((OptionHandler) classifier).getOptions())); else 
System.out.println("分類器:" + Utils.toCommandLine(classifier)); System.out.println("數據集:" + data.relationName()); System.out.println("折數:" + folds); System.out.println("隨機種子:" + seed); System.out.println(); System.out.println( eval.toSummaryString("=== " + folds + "折交叉認證===", false)); } }
這個得好好掰扯掰扯
Classifier clsCopy = AbstractClassifier.makeCopy(classifier); clsCopy.buildClassifier(train);
建立了一空的原始的啥都不知道的分類器,而後再訓練集進行了訓練
eval.evaluateModel(clsCopy, test);
這是將這個訓練好的分類器,運用到測試集上進行測試,這是個累加的過程,能夠看到好比第一折測試的時候,測試集有35個,那麼這個eval記錄了這35個的測試結果,第二折測試集有31個,那麼這個eval記錄了35+31總共的分類結果。
AddClassification filter = new AddClassification(); filter.setClassifier(classifier); filter.setOutputClassification(true); filter.setOutputDistribution(true); filter.setOutputErrorFlag(true);
doc上寫
這是一個過濾器,用於把分類結果、類別分佈和錯誤標記作爲新屬性添加到數據集中。其內部的分類器既能夠直接在數據上訓練,也能夠以序列化模型的形式提供。
其實應該相似於把這個空的Classifier包裝了起來,包裝成一個過濾器
filter.setInputFormat(train); Filter.useFilter(train, filter); Instances pred = Filter.useFilter(test, filter);
先設置數據,Filter.useFilter(train, filter);是訓練,後一個是預測,運用這個過濾器,在預測的同時還會給數據後面加上三條屬性。
可是這兩條命令明明相同啊
以後就是把預測結果丟進去就能夠了。end