Author: 凱魯嘎吉 - 博客园 (cnblogs) http://www.cnblogs.com/kailugaji/
This post explains the main ideas of Hinton's 2006 Science paper "Reducing the dimensionality of data with neural networks" and walks through the accompanying MATLAB code.
The deep autoencoder is first pretrained layer by layer with restricted Boltzmann machines (RBMs) to obtain initial weights and biases; these are updated with the CD-1 contrastive divergence algorithm. The stacked RBMs are then unrolled into an autoencoder that reconstructs the data, and all weights and biases are fine-tuned globally by backpropagation, with the updates computed by the Polak-Ribière conjugate gradient method.
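For reference, the CD-1 update at the heart of the pretraining step can be written in a few lines. The sketch below is a minimal illustration for a binary-binary RBM, using the same variable-name conventions as the scripts that follow (data, vishid, hidbiases, visbiases, epsilonw, ...); momentum and weight decay, which rbm.m does apply, are omitted here for clarity.

% One CD-1 mini-batch update (sketch): data is numcases*numdims, vishid is numdims*numhid
poshidprobs  = 1./(1 + exp(-data*vishid - repmat(hidbiases,numcases,1)));          % p(h=1|v0)
poshidstates = poshidprobs > rand(numcases,numhid);                                % sample h0
negdata      = 1./(1 + exp(-poshidstates*vishid' - repmat(visbiases,numcases,1))); % reconstruction v1
neghidprobs  = 1./(1 + exp(-negdata*vishid - repmat(hidbiases,numcases,1)));       % p(h=1|v1)
% CD-1 gradient estimate: <v*h>_data - <v*h>_recon
vishid    = vishid    + epsilonw *(data'*poshidprobs - negdata'*neghidprobs)/numcases;
visbiases = visbiases + epsilonvb*(sum(data) - sum(negdata))/numcases;
hidbiases = hidbiases + epsilonhb*(sum(poshidprobs) - sum(neghidprobs))/numcases;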
%% Main script -- autoencoder network architecture: 784->1000->500->250->30->250->500->1000->784
clear all
close all

maxepoch=50; %In the Science paper we use maxepoch=50, but it works just fine. (maximum number of epochs)
numhid=1000; numpen=500; numpen2=250; numopen=30; % units per RBM hidden layer: 1000-500-250-30

%% Data preprocessing
% Convert the raw data files into MATLAB format (see converter.m below)
fprintf(1,'Converting Raw files into Matlab format \n');
converter;

fprintf(1,'Pretraining a deep autoencoder. \n');
fprintf(1,'The Science paper used 50 epochs. This uses %3i \n', maxepoch);

% Split the data into mini-batches (see makebatches.m below)
makebatches;
[numcases numdims numbatches]=size(batchdata); % cases per batch, input dimensionality, number of batches

%% Layer-by-layer pretraining (RBMs)
%% visible layer -> 1000-unit hidden layer
fprintf(1,'Pretraining Layer 1 with RBM: %d-%d \n',numdims,numhid);
restart=1;
rbm; % binary (0/1) units; produces the initial weight and bias values
hidrecbiases=hidbiases;
save mnistvh vishid hidrecbiases visbiases; % save RBM 1 weights, hidden biases, visible biases to mnistvh.mat: 784*1000, 1*1000, 1*784

%% 1000-unit hidden layer -> 500-unit hidden layer
fprintf(1,'\nPretraining Layer 2 with RBM: %d-%d \n',numhid,numpen);
batchdata=batchposhidprobs; % the hidden probabilities of RBM 1 become the data for RBM 2
numhid=numpen;
restart=1;
rbm;
hidpen=vishid; penrecbiases=hidbiases; hidgenbiases=visbiases;
save mnisthp hidpen penrecbiases hidgenbiases; % save RBM 2 parameters to mnisthp.mat: 1000*500, 1*500, 1*1000

%% 500-unit hidden layer -> 250-unit hidden layer
fprintf(1,'\nPretraining Layer 3 with RBM: %d-%d \n',numpen,numpen2);
batchdata=batchposhidprobs;
numhid=numpen2;
restart=1;
rbm;
hidpen2=vishid; penrecbiases2=hidbiases; hidgenbiases2=visbiases;
save mnisthp2 hidpen2 penrecbiases2 hidgenbiases2; % save RBM 3 parameters to mnisthp2.mat: 500*250, 1*250, 1*500

%% 250-unit hidden layer -> 30-unit code layer
fprintf(1,'\nPretraining Layer 4 with RBM: %d-%d \n',numpen2,numopen);
batchdata=batchposhidprobs;
numhid=numopen;
restart=1;
rbmhidlinear; % hidden activation is f(x)=x, i.e. real-valued linear units (see rbmhidlinear.m below)
hidtop=vishid; toprecbiases=hidbiases; topgenbiases=visbiases;
save mnistpo hidtop toprecbiases topgenbiases; % save RBM 4 parameters to mnistpo.mat: 250*30, 1*30, 1*250

%% Global fine-tuning with backpropagation
backprop; % fine-tune all weights and biases (see backprop.m below)
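Once the four RBMs have been pretrained and saved, the stacked weights can already be used to map an image to its 30-dimensional code, even before fine-tuning. The sketch below mirrors the encoder half of the forward pass in backprop.m; X is a hypothetical N*784 matrix of row images with pixel values in [0,1].

% Sketch: encode images X (N*784, values in [0,1]) into 30-D codes with the pretrained weights
load mnistvh;  % vishid, hidrecbiases, visbiases
load mnisthp;  % hidpen, penrecbiases, hidgenbiases
load mnisthp2; % hidpen2, penrecbiases2, hidgenbiases2
load mnistpo;  % hidtop, toprecbiases, topgenbiases
N = size(X,1);
h1 = 1./(1 + exp(-X*vishid   - repmat(hidrecbiases, N,1))); % 784  -> 1000
h2 = 1./(1 + exp(-h1*hidpen  - repmat(penrecbiases, N,1))); % 1000 -> 500
h3 = 1./(1 + exp(-h2*hidpen2 - repmat(penrecbiases2,N,1))); % 500  -> 250
code = h3*hidtop + repmat(toprecbiases,N,1);                % 250  -> 30 (linear code layer)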
% converter.m -- convert the gunzipped MNIST files into MATLAB .mat format.
% The sample sets are converted from .ubyte format to .ascii format, and then to .mat format.
%
% Purpose: convert the test set and training set to .mat format
% Resulting test sets:     test(0~9).mat
% Resulting training sets: digit(0~9).mat

%% Work with the test files first
fprintf(1,'You first need to download files:\n train-images-idx3-ubyte.gz\n train-labels-idx1-ubyte.gz\n t10k-images-idx3-ubyte.gz\n t10k-labels-idx1-ubyte.gz\n from http://yann.lecun.com/exdb/mnist/\n and gunzip them \n');

% The first four 32-bit integers of the image file are header information:
% magic number, number of images, number of rows, number of columns
f = fopen('t10k-images-idx3-ubyte','r');
[a,count] = fread(f,4,'int32');

% The first two 32-bit integers of the label file are header information:
% magic number, number of items
g = fopen('t10k-labels-idx1-ubyte','r');
[l,count] = fread(g,2,'int32');

fprintf(1,'Starting to convert Test MNIST images (prints 10 dots) \n');
n = 1000;

% Df holds the handles of the .ascii files, one per digit class
Df = cell(1,10);
for d=0:9,
  Df{d+1} = fopen(['test' num2str(d) '.ascii'],'w');
end;

% Read 1000 images and labels at a time from the 10k test set
% (rawlabels: 1000*1, rawimages: 784*1000)
for i=1:10,
  fprintf('.');
  rawimages = fread(f,28*28*n,'uchar');
  rawlabels = fread(g,n,'uchar');
  rawimages = reshape(rawimages,28*28,n);
  % Write each image's pixel values (3-digit integers) to the file for its label, one image per line
  for j=1:n,
    fprintf(Df{rawlabels(j)+1},'%3d ',rawimages(:,j));
    fprintf(Df{rawlabels(j)+1},'\n');
  end;
end;
fprintf(1,'\n');

for d=0:9,
  fclose(Df{d+1});
  D = load(['test' num2str(d) '.ascii'],'-ascii'); % D contains (number of samples)*784 values
  fprintf('%5d Digits of class %d\n',size(D,1),d);
  save(['test' num2str(d) '.mat'],'D','-mat'); % convert to a .mat file
end;

% Then work with the training files
f = fopen('train-images-idx3-ubyte','r');
[a,count] = fread(f,4,'int32');
g = fopen('train-labels-idx1-ubyte','r');
[l,count] = fread(g,2,'int32');

fprintf(1,'Starting to convert Training MNIST images (prints 60 dots)\n');
n = 1000;
Df = cell(1,10);
for d=0:9,
  Df{d+1} = fopen(['digit' num2str(d) '.ascii'],'w');
end;

for i=1:60,
  fprintf('.');
  rawimages = fread(f,28*28*n,'uchar');
  rawlabels = fread(g,n,'uchar');
  rawimages = reshape(rawimages,28*28,n);
  for j=1:n,
    fprintf(Df{rawlabels(j)+1},'%3d ',rawimages(:,j));
    fprintf(Df{rawlabels(j)+1},'\n');
  end;
end;
fprintf(1,'\n');

for d=0:9,
  fclose(Df{d+1});
  D = load(['digit' num2str(d) '.ascii'],'-ascii');
  fprintf('%5d Digits of class %d\n',size(D,1),d);
  save(['digit' num2str(d) '.mat'],'D','-mat');
end;

dos('rm *.ascii'); % delete the intermediate .ascii files (shell rm; on Windows, delete('*.ascii') would serve the same purpose)
% makebatches.m -- pack the dataset and its labels into mini-batches for batch processing.
% The original 2-D dataset becomes 3-D: the extra dimension indexes the batch.
% Because the dataset is large, processing it in batches speeds up learning.
% Packed training data and labels: batchdata, batchtargets
% Packed test data and labels:     testbatchdata, testbatchtargets

digitdata=[];
targets=[];
% Each load puts the samples of one digit class into D; digitdata grows to (samples)*784, targets to (samples)*10 (one-hot)
load digit0; digitdata = [digitdata; D]; targets = [targets; repmat([1 0 0 0 0 0 0 0 0 0], size(D,1), 1)];
load digit1; digitdata = [digitdata; D]; targets = [targets; repmat([0 1 0 0 0 0 0 0 0 0], size(D,1), 1)];
load digit2; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 1 0 0 0 0 0 0 0], size(D,1), 1)];
load digit3; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 1 0 0 0 0 0 0], size(D,1), 1)];
load digit4; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 1 0 0 0 0 0], size(D,1), 1)];
load digit5; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 1 0 0 0 0], size(D,1), 1)];
load digit6; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 0 1 0 0 0], size(D,1), 1)];
load digit7; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 0 0 1 0 0], size(D,1), 1)];
load digit8; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 0 0 0 1 0], size(D,1), 1)];
load digit9; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 0 0 0 0 1], size(D,1), 1)];
digitdata = digitdata/255; % concatenate and normalize pixel values to [0,1]
totnum=size(digitdata,1); % number of training samples: 60000
fprintf(1, 'Size of the training dataset= %5d \n', totnum);
rand('state',0); % so we know the permutation of the training data; randomorder holds 60000 distinct indices
randomorder=randperm(totnum);
numbatches=totnum/100; % number of batches: 600
numdims = size(digitdata,2); % dimensionality: 784
batchsize = 100; % samples per batch
batchdata = zeros(batchsize, numdims, numbatches); % 100*784*600
batchtargets = zeros(batchsize, 10, numbatches); % 100*10*600
for b=1:numbatches
  % store the shuffled data and labels in batchdata and batchtargets
  batchdata(:,:,b) = digitdata(randomorder(1+(b-1)*batchsize:b*batchsize), :);
  batchtargets(:,:,b) = targets(randomorder(1+(b-1)*batchsize:b*batchsize), :);
end;
clear digitdata targets;

digitdata=[];
targets=[];
load test0; digitdata = [digitdata; D]; targets = [targets; repmat([1 0 0 0 0 0 0 0 0 0], size(D,1), 1)];
load test1; digitdata = [digitdata; D]; targets = [targets; repmat([0 1 0 0 0 0 0 0 0 0], size(D,1), 1)];
load test2; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 1 0 0 0 0 0 0 0], size(D,1), 1)];
load test3; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 1 0 0 0 0 0 0], size(D,1), 1)];
load test4; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 1 0 0 0 0 0], size(D,1), 1)];
load test5; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 1 0 0 0 0], size(D,1), 1)];
load test6; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 0 1 0 0 0], size(D,1), 1)];
load test7; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 0 0 1 0 0], size(D,1), 1)];
load test8; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 0 0 0 1 0], size(D,1), 1)];
load test9; digitdata = [digitdata; D]; targets = [targets; repmat([0 0 0 0 0 0 0 0 0 1], size(D,1), 1)];
digitdata = digitdata/255;
totnum=size(digitdata,1);
fprintf(1, 'Size of the test dataset= %5d \n', totnum);
rand('state',0); % so we know the permutation of the test data
randomorder=randperm(totnum);
numbatches=totnum/100;
numdims = size(digitdata,2);
batchsize = 100;
testbatchdata = zeros(batchsize, numdims, numbatches);
testbatchtargets = zeros(batchsize, 10, numbatches);
for b=1:numbatches
  testbatchdata(:,:,b) = digitdata(randomorder(1+(b-1)*batchsize:b*batchsize), :);
  testbatchtargets(:,:,b) = targets(randomorder(1+(b-1)*batchsize:b*batchsize), :);
end;
clear digitdata targets;

%%% Reset random seeds
rand('state',sum(100*clock));
randn('state',sum(100*clock));
% rbmhidlinear.m -- train the top-level RBM: 250->30
% maxepoch  -- maximum number of epochs
% numhid    -- number of hidden units
% batchdata -- the data that is divided into batches (numcases numdims numbatches)
% restart   -- set to 1 if learning starts from beginning
%
% Visible, binary, stochastic pixels are connected to hidden, real-valued
% feature detectors drawn from a unit-variance Gaussian whose mean is
% determined by the input from the logistic visible units.
% The hidden activation is f(x)=x (linear) rather than the sigmoid,
% hence the script name rbmhidlinear.m.

epsilonw  = 0.001; % Learning rate for weights
epsilonvb = 0.001; % Learning rate for biases of visible units
epsilonhb = 0.001; % Learning rate for biases of hidden units
weightcost = 0.0002;
initialmomentum = 0.5;
finalmomentum = 0.9;

[numcases numdims numbatches]=size(batchdata);

if restart ==1
  restart=0;
  epoch=1;
  % Initializing symmetric weights and biases.
  vishid = 0.1*randn(numdims, numhid);
  hidbiases = zeros(1,numhid);
  visbiases = zeros(1,numdims);
  poshidprobs = zeros(numcases,numhid);
  neghidprobs = zeros(numcases,numhid);
  posprods = zeros(numdims,numhid);
  negprods = zeros(numdims,numhid);
  vishidinc = zeros(numdims,numhid);
  hidbiasinc = zeros(1,numhid);
  visbiasinc = zeros(1,numdims);
  sigmainc = zeros(1,numhid);
  batchposhidprobs=zeros(numcases,numhid,numbatches);
end

for epoch = epoch:maxepoch
  fprintf(1,'epoch %d\r',epoch);
  errsum=0;
  for batch = 1:numbatches
    fprintf(1,'epoch %d batch %d\r',epoch,batch);

    %%%%%%%%% START POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    data = batchdata(:,:,batch);
    poshidprobs = (data*vishid) + repmat(hidbiases,numcases,1); % hidden output on the first upward pass: mean of p(h_j|v0) is W_ji*v0+b_j, since the hidden activation is linear
    batchposhidprobs(:,:,batch)=poshidprobs; % store the outputs in a 3-D array
    posprods = data' * poshidprobs; % positive gradient v*h', used in the weight update
    poshidact = sum(poshidprobs); % summed hidden activities, used in the hidden-bias update
    posvisact = sum(data); % summed visible activities, used in the visible-bias update
    %%%%%%%%% END OF POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    % Gibbs sampling: real-valued hidden states = Gaussian mean plus unit-variance noise
    poshidstates = poshidprobs+randn(numcases,numhid); % h0: a real-valued sample, not a probability

    %%%%%%%%% START NEGATIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    negdata = 1./(1 + exp(-poshidstates*vishid' - repmat(visbiases,numcases,1)));
    neghidprobs = (negdata*vishid) + repmat(hidbiases,numcases,1); % hidden output on the second upward pass: mean of p(h_j|v1) is W_ji*v1+b_j
    negprods = negdata'*neghidprobs;
    neghidact = sum(neghidprobs);
    negvisact = sum(negdata);
    %%%%%%%%% END OF NEGATIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    err= sum(sum( (data-negdata).^2 ));
    errsum = err + errsum;
    if epoch>5
      momentum=finalmomentum;
    else
      momentum=initialmomentum;
    end

    %%%%%%%%% UPDATE WEIGHTS AND BIASES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    vishidinc = momentum*vishidinc + ...
        epsilonw*( (posprods-negprods)/numcases - weightcost*vishid);
    visbiasinc = momentum*visbiasinc + (epsilonvb/numcases)*(posvisact-negvisact);
    hidbiasinc = momentum*hidbiasinc + (epsilonhb/numcases)*(poshidact-neghidact);
    vishid = vishid + vishidinc;
    visbiases = visbiases + visbiasinc;
    hidbiases = hidbiases + hidbiasinc;
    %%%%%%%%%%%%%%%% END OF UPDATES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  end
  fprintf(1, 'epoch %4i error %f \n', epoch, errsum);
end
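Why the positive phase above omits the sigmoid: for an RBM with logistic visible units and linear hidden units carrying unit-variance Gaussian noise, the conditional distributions are (a standard Gaussian-linear RBM formulation, written in the same ASCII style as the comments above):

p(h_j | v)     = N( b_j + ∑_i W_ij*v_i , 1 )       % Gaussian hidden unit with unit variance
p(v_i = 1 | h) = 1/(1 + exp(-a_i - ∑_j W_ij*h_j))  % visible units stay logistic

So poshidprobs = data*vishid + hidbiases is exactly the Gaussian mean, and poshidstates = poshidprobs + randn(numcases,numhid) draws a sample from it.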
% backprop.m -- unroll the four RBMs into one deep autoencoder and fine-tune its parameters with backpropagation
maxepoch=200;
fprintf(1,'\nFine-tuning deep autoencoder by minimizing cross entropy error. \n');
fprintf(1,'60 batches of 1000 cases each. \n');

% Load the pretrained parameters (weights and biases)
load mnistvh  % RBM 1: 784 -> 1000
load mnisthp  % RBM 2: 1000 -> 500
load mnisthp2 % RBM 3: 500 -> 250
load mnistpo  % RBM 4: 250 -> 30

% Split the data into batches
makebatches;
[numcases numdims numbatches]=size(batchdata);
N=numcases; % cases per batch

%%%% PREINITIALIZE WEIGHTS OF THE AUTOENCODER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
w1=[vishid; hidrecbiases];   % 784 -> 1000 weights plus biases, (784+1)*1000
w2=[hidpen; penrecbiases];   % 1000 -> 500, 1001*500
w3=[hidpen2; penrecbiases2]; % 500 -> 250, 501*250
w4=[hidtop; toprecbiases];   % 250 -> 30, 251*30
w5=[hidtop'; topgenbiases];  % 30 -> 250, 31*250
w6=[hidpen2'; hidgenbiases2];% 250 -> 500, 251*500
w7=[hidpen'; hidgenbiases];  % 500 -> 1000, 501*1000
w8=[vishid'; visbiases];     % 1000 -> visible layer, 1001*784
%%%%%%%%%% END OF PREINITIALIZATION OF WEIGHTS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

l1=size(w1,1)-1; % units per layer: 784
l2=size(w2,1)-1; % 1000
l3=size(w3,1)-1; % 500
l4=size(w4,1)-1; % 250
l5=size(w5,1)-1; % 30
l6=size(w6,1)-1; % 250
l7=size(w7,1)-1; % 500
l8=size(w8,1)-1; % 1000
l9=l1; % the output layer has the same number of units as the input layer: 784
test_err=[];
train_err=[];

for epoch = 1:maxepoch % repeat for maxepoch iterations

  %%%%%%%%%%%%%%%%%%%% COMPUTE TRAINING RECONSTRUCTION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  err=0;
  [numcases numdims numbatches]=size(batchdata); % cases per batch, dimensionality, number of batches
  N=numcases;
  for batch = 1:numbatches % compute the reconstruction error batch by batch, then average
    data = [batchdata(:,:,batch)]; % 100*784
    data = [data ones(N,1)]; % append a constant 1 to each sample, because w1 contains both weights and biases; 100*785
    % Forward pass: compute each layer's output probabilities p(h|v), appending a constant-1 column at every layer
    w1probs = 1./(1 + exp(-data*w1)); w1probs = [w1probs ones(N,1)]; % (100*785)*(785*1000)=100*1000; w1probs: 100*1001
    w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)]; % (100*1001)*(1001*500)=100*500; w2probs: 100*501
    w3probs = 1./(1 + exp(-w2probs*w3)); w3probs = [w3probs ones(N,1)]; % (100*501)*(501*250)=100*250; w3probs: 100*251
    w4probs = w3probs*w4; w4probs = [w4probs ones(N,1)]; % (100*251)*(251*30)=100*30; w4probs: 100*31; layer 5 (the code layer) is linear, not logistic
    w5probs = 1./(1 + exp(-w4probs*w5)); w5probs = [w5probs ones(N,1)]; % (100*31)*(31*250)=100*250; w5probs: 100*251
    w6probs = 1./(1 + exp(-w5probs*w6)); w6probs = [w6probs ones(N,1)]; % (100*251)*(251*500)=100*500; w6probs: 100*501
    w7probs = 1./(1 + exp(-w6probs*w7)); w7probs = [w7probs ones(N,1)]; % (100*501)*(501*1000)=100*1000; w7probs: 100*1001
    dataout = 1./(1 + exp(-w7probs*w8)); % (100*1001)*(1001*784)=100*784; output probabilities of the last layer = the reconstruction
    err = err + 1/N*sum(sum( (data(:,1:end-1)-dataout).^2 )); % drop the constant column; per-batch mean squared error err = ∑∑(||X-H||^2)/N
  end
  train_err(epoch)=err/numbatches; % average training error for epoch
  %%%%%%%%%%%%%% END OF COMPUTING TRAINING RECONSTRUCTION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

  %%%% DISPLAY FIGURE TOP ROW REAL DATA BOTTOM ROW RECONSTRUCTIONS %%%%%%%%%%%%%%%%%%%%%%%%%
  fprintf(1,'Displaying in figure 1: Top row - real data, Bottom row -- reconstructions \n');
  output=[];
  for ii=1:15 % display 15 digits at a time
    output = [output data(ii,1:end-1)' dataout(ii,:)']; % pairs of columns: real data and its reconstruction
  end
  if epoch==1
    close all
    figure('Position',[100,600,1000,200]);
  else
    figure(1)
  end
  mnistdisp(output); % draw one row of originals above one row of reconstructions
  drawnow;

  %%%%%%%%%%%%%%%%%%%% COMPUTE TEST RECONSTRUCTION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  [testnumcases testnumdims testnumbatches]=size(testbatchdata); % [100 784 100]: 100 test batches, each with 100 samples of dimension 784
  N=testnumcases;
  err=0;
  for batch = 1:testnumbatches
    data = [testbatchdata(:,:,batch)];
    data = [data ones(N,1)];
    w1probs = 1./(1 + exp(-data*w1)); w1probs = [w1probs ones(N,1)];
    w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)];
    w3probs = 1./(1 + exp(-w2probs*w3)); w3probs = [w3probs ones(N,1)];
    w4probs = w3probs*w4; w4probs = [w4probs ones(N,1)]; % before unrolling, the hidden units of RBM 4 used f(x)=x rather than the logistic function; after the four RBMs are unrolled into the 9-layer network, layer 5 keeps that linear activation
    w5probs = 1./(1 + exp(-w4probs*w5)); w5probs = [w5probs ones(N,1)];
    w6probs = 1./(1 + exp(-w5probs*w6)); w6probs = [w6probs ones(N,1)];
    w7probs = 1./(1 + exp(-w6probs*w7)); w7probs = [w7probs ones(N,1)];
    dataout = 1./(1 + exp(-w7probs*w8)); % output probabilities = the reconstructed data
    err = err + 1/N*sum(sum( (data(:,1:end-1)-dataout).^2 ));
  end
  test_err(epoch)=err/testnumbatches;
  fprintf(1,'Before epoch %d Train squared error: %6.3f Test squared error: %6.3f \t \t \n',epoch,train_err(epoch),test_err(epoch));
  %%%%%%%%%%%%%% END OF COMPUTING TEST RECONSTRUCTION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

  % Regroup the batches: from 600 mini-batches of 100 cases into 60 larger batches of 1000 cases
  tt=0;
  for batch = 1:numbatches/10
    fprintf(1,'epoch %d batch %d\r',epoch,batch);

    %%%%%%%%%%% COMBINE 10 MINIBATCHES INTO 1 LARGER MINIBATCH %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    tt=tt+1;
    data=[];
    for kk=1:10
      data=[data; batchdata(:,:,(tt-1)*10+kk)]; % stack 10 batches of 100 rows into one batch of 1000 rows (vertical concatenation)
    end

    %%%%%%%%%%%%%%% PERFORM CONJUGATE GRADIENT WITH 3 LINESEARCHES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    max_iter=3; % 3 line searches
    % Flatten all weight matrices (which already include the biases) into one long column vector
    VV = [w1(:)' w2(:)' w3(:)' w4(:)' w5(:)' w6(:)' w7(:)' w8(:)']';
    Dim = [l1; l2; l3; l4; l5; l6; l7; l8; l9]; % units per layer (excluding the bias unit)
    [X, fX] = minimize(VV,'CG_MNIST',max_iter,Dim,data); % conjugate gradient; X is the optimized parameter vector after 3 line searches
    % VV holds the parameters; CG_MNIST returns the cost and its partial derivatives

    % Unpack the column vector X back into the weight matrices
    w1 = reshape(X(1:(l1+1)*l2),l1+1,l2); % (784+1)*1000
    xxx = (l1+1)*l2;
    w2 = reshape(X(xxx+1:xxx+(l2+1)*l3),l2+1,l3);
    xxx = xxx+(l2+1)*l3;
    w3 = reshape(X(xxx+1:xxx+(l3+1)*l4),l3+1,l4);
    xxx = xxx+(l3+1)*l4;
    w4 = reshape(X(xxx+1:xxx+(l4+1)*l5),l4+1,l5);
    xxx = xxx+(l4+1)*l5;
    w5 = reshape(X(xxx+1:xxx+(l5+1)*l6),l5+1,l6);
    xxx = xxx+(l5+1)*l6;
    w6 = reshape(X(xxx+1:xxx+(l6+1)*l7),l6+1,l7);
    xxx = xxx+(l6+1)*l7;
    w7 = reshape(X(xxx+1:xxx+(l7+1)*l8),l7+1,l8);
    xxx = xxx+(l7+1)*l8;
    w8 = reshape(X(xxx+1:xxx+(l8+1)*l9),l8+1,l9); % each matrix is reassigned to its optimized values
    %%%%%%%%%%%%%%% END OF CONJUGATE GRADIENT WITH 3 LINESEARCHES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  end

  save mnist_weights w1 w2 w3 w4 w5 w6 w7 w8
  save mnist_error test_err train_err;
end
% CG_MNIST.m -- compute the network cost f and its partial derivatives df with respect to
% every parameter; weights and biases are handled together.
% VV:  all network parameters as one long column vector (initialized from the pretrained weights and biases)
% Dim: the number of units in each layer
% XX:  the training samples
% f:   the cost function, i.e. the cross-entropy error -1/N*∑∑(X*log(H)+(1-X)*log(1-H))
% df:  the partial derivatives of the cost with respect to the parameters
function [f, df] = CG_MNIST(VV,Dim,XX)

l1 = Dim(1); % units per layer (excluding the bias unit): 784
l2 = Dim(2); % 1000
l3 = Dim(3); % 500
l4 = Dim(4); % 250
l5 = Dim(5); % 30
l6 = Dim(6); % 250
l7 = Dim(7); % 500
l8 = Dim(8); % 1000
l9 = Dim(9); % 784
N = size(XX,1); % number of samples

% Do deconversion: unpack the long column vector VV (which includes the biases) back into matrices
w1 = reshape(VV(1:(l1+1)*l2),l1+1,l2); % take out each layer's weights and biases in turn; 785*1000
xxx = (l1+1)*l2; % xxx is the number of entries consumed so far
w2 = reshape(VV(xxx+1:xxx+(l2+1)*l3),l2+1,l3); % 1001*500
xxx = xxx+(l2+1)*l3;
w3 = reshape(VV(xxx+1:xxx+(l3+1)*l4),l3+1,l4); % 501*250
xxx = xxx+(l3+1)*l4;
w4 = reshape(VV(xxx+1:xxx+(l4+1)*l5),l4+1,l5); % 251*30
xxx = xxx+(l4+1)*l5;
w5 = reshape(VV(xxx+1:xxx+(l5+1)*l6),l5+1,l6); % 31*250
xxx = xxx+(l5+1)*l6;
w6 = reshape(VV(xxx+1:xxx+(l6+1)*l7),l6+1,l7); % 251*500
xxx = xxx+(l6+1)*l7;
w7 = reshape(VV(xxx+1:xxx+(l7+1)*l8),l7+1,l8); % 501*1000
xxx = xxx+(l7+1)*l8;
w8 = reshape(VV(xxx+1:xxx+(l8+1)*l9),l8+1,l9); % 1001*784

XX = [XX ones(N,1)]; % append a constant 1 so XX can be multiplied by w1, which includes the biases
w1probs = 1./(1 + exp(-XX*w1)); w1probs = [w1probs ones(N,1)];
w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)];
w3probs = 1./(1 + exp(-w2probs*w3)); w3probs = [w3probs ones(N,1)];
w4probs = w3probs*w4; w4probs = [w4probs ones(N,1)]; % layer 5 (the code layer) is linear, not logistic
w5probs = 1./(1 + exp(-w4probs*w5)); w5probs = [w5probs ones(N,1)];
w6probs = 1./(1 + exp(-w5probs*w6)); w6probs = [w6probs ones(N,1)];
w7probs = 1./(1 + exp(-w6probs*w7)); w7probs = [w7probs ones(N,1)];
XXout = 1./(1 + exp(-w7probs*w8)); % output probabilities of the last layer, i.e. the reconstructed data

% Backpropagate the derivatives layer by layer (see 邱錫鵬, Neural Networks and Deep Learning, p. 100)
f = -1/N*sum(sum( XX(:,1:end-1).*log(XXout) + (1-XX(:,1:end-1)).*log(1-XXout))); % cross-entropy cost -1/N*∑∑(X*log(H)+(1-X)*log(1-H))
IO = 1/N*(XXout-XX(:,1:end-1)); % error term, i.e. the "residual" of the output layer
Ix8=IO;
dw8 = w7probs'*Ix8; % derivative for w8 = (activations of layer 7)' * residual
Ix7 = (Ix8*w8').*w7probs.*(1-w7probs); % residual of layer 7
Ix7 = Ix7(:,1:end-1); % drop the bias column
dw7 = w6probs'*Ix7; % derivative for w7 = (activations of layer 6)' * residual
Ix6 = (Ix7*w7').*w6probs.*(1-w6probs);
Ix6 = Ix6(:,1:end-1);
dw6 = w5probs'*Ix6;
Ix5 = (Ix6*w6').*w5probs.*(1-w5probs);
Ix5 = Ix5(:,1:end-1);
dw5 = w4probs'*Ix5;
Ix4 = (Ix5*w5'); % layer 5 is linear, so there is no sigmoid-derivative factor here
Ix4 = Ix4(:,1:end-1);
dw4 = w3probs'*Ix4;
Ix3 = (Ix4*w4').*w3probs.*(1-w3probs);
Ix3 = Ix3(:,1:end-1);
dw3 = w2probs'*Ix3;
Ix2 = (Ix3*w3').*w2probs.*(1-w2probs);
Ix2 = Ix2(:,1:end-1);
dw2 = w1probs'*Ix2;
Ix1 = (Ix2*w2').*w1probs.*(1-w1probs);
Ix1 = Ix1(:,1:end-1);
dw1 = XX'*Ix1;

df = [dw1(:)' dw2(:)' dw3(:)' dw4(:)' dw5(:)' dw6(:)' dw7(:)' dw8(:)']'; % the gradient of the cost, packed as one column vector
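The Ix* computations above are the standard backpropagation recursion. With cross-entropy cost f and sigmoid layers, the output residual and the recursion for an inner layer l are (cf. [4], around p. 100; written in the document's ASCII notation):

f    = -1/N*∑∑( X.*log(Xout) + (1-X).*log(1-Xout) )   % cross-entropy over the batch
Ix8  = 1/N*(Xout - X)                                 % output residual: the sigmoid derivative cancels against the cross-entropy gradient
Ix_l = (Ix_{l+1}*W_{l+1}').*h_l.*(1-h_l)              % residual of sigmoid layer l
dW_l = h_{l-1}'*Ix_l                                  % gradient of f w.r.t. the weights entering layer l

At the linear code layer (Ix4 in the code) the factor h.*(1-h) is dropped, because f(x)=x has derivative 1.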
The rbm.m script is explained in detail in the earlier post 受限玻爾茲曼機 (Restricted Boltzmann Machine), and the minimize.m script in the post minimize.m: 共軛梯度法更新BP算法權值 (updating BP weights with the conjugate gradient method).
[1] Hinton G E, Salakhutdinov R R. Reducing the dimensionality of data with neural networks[J]. Science, 2006, 313(5786): 504-507.
[2] Hinton G E. Training a deep autoencoder or a classifier on MNIST digits.
[3] Hinton G E. Supporting Online Material.
[4] 邱錫鵬. 神經網絡與深度學習 (Neural Networks and Deep Learning)[M]. 2019.