InfogainLossLayer is similar to SoftmaxWithLossLayer, except that it adds an information gain matrix \(H\) that specifies the weight applied when a sample of a given true class is predicted as some class. It is commonly used when the number of samples per class is imbalanced. When \(H\) is the identity matrix, the layer is equivalent to SoftmaxWithLossLayer.
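Concretely, with \(p_{n,k}\) the softmax probability of sample \(n\) for class \(k\), \(l_n\) its true label and \(H\) the \(K \times K\) infogain matrix, the loss computed in Forward_cpu() below is \(E = -\frac{1}{N'} \sum\limits_{n} \sum\limits_{k=1}^K H_{l_n,k} \log(p_{n,k})\), where the sum over \(n\) runs over the samples with valid labels and \(N'\) is the normalization coefficient returned by get_normalizer().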
The total number of predictions is the outer count (outer_num_) multiplied by the inner count (inner_num_), i.e. \(N=\tilde N * \tilde H * \tilde W\) when the prediction blob has shape \(\tilde N * \tilde C * \tilde H * \tilde W\) and the softmax axis is 1.

```cpp
template <typename Dtype>
void InfogainLossLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::LayerSetUp(bottom, top);  // base-class setup
  // internal softmax layer
  LayerParameter softmax_layer_param(this->layer_param_);  // layer parameter used to create the softmax layer
  SoftmaxParameter* softmax_param = softmax_layer_param.mutable_softmax_param();  // softmax parameter inside the layer parameter
  softmax_param->set_axis(this->layer_param_.infogain_loss_param().axis());  // axis along which the softmax is computed
  softmax_layer_param.set_type("Softmax");  // set the layer type
  softmax_layer_param.clear_loss_weight();
  softmax_layer_param.add_loss_weight(1);  // clear the loss weights and set a single weight of 1
  softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_layer_param);  // create the softmax layer from the layer parameter
  softmax_bottom_vec_.clear();
  softmax_bottom_vec_.push_back(bottom[0]);  // input blob of the softmax layer
  softmax_top_vec_.clear();
  softmax_top_vec_.push_back(&prob_);        // output blob of the softmax layer
  softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);  // set up the softmax layer

  // ignore label
  has_ignore_label_ = this->layer_param_.loss_param().has_ignore_label();  // whether an ignore label is specified
  if (has_ignore_label_) {
    ignore_label_ = this->layer_param_.loss_param().ignore_label();  // store it in the layer
  }

  // normalization
  CHECK(!this->layer_param_.loss_param().has_normalize())
      << "normalize is deprecated. use \"normalization\"";  // the old normalize parameter is deprecated
  normalization_ = this->layer_param_.loss_param().normalization();  // normalization specifies the normalization mode

  // matrix H
  if (bottom.size() < 3) {  // fewer than 3 input blobs, so the infogain matrix H is not given as an input
    CHECK(this->layer_param_.infogain_loss_param().has_source())
        << "Infogain matrix source must be specified.";  // the source file of H must be specified in the layer parameter
    BlobProto blob_proto;  // read the message from the binary file into blob_proto
    ReadProtoFromBinaryFile(
        this->layer_param_.infogain_loss_param().source(), &blob_proto);
    infogain_.FromProto(blob_proto);  // convert blob_proto into a blob and store it as the infogain matrix H
  }
}

template <typename Dtype>
void InfogainLossLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::Reshape(bottom, top);  // base-class reshape
  softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);  // reshape the softmax layer
  // read the axis parameter from the message and store the canonical dimension in infogain_axis_;
  // softmax values are then computed along dimension infogain_axis_
  infogain_axis_ =
      bottom[0]->CanonicalAxisIndex(this->layer_param_.infogain_loss_param().axis());
  outer_num_ = bottom[0]->count(0, infogain_axis_);   // outer count, product of dimensions [0, infogain_axis_)
  inner_num_ = bottom[0]->count(infogain_axis_ + 1);  // inner count, product of dimensions [infogain_axis_ + 1, end)
  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())  // the total count, outer count times inner count, must equal the count of the label blob
      << "Number of labels must match number of predictions; "
      << "e.g., if infogain axis == 1 and prediction shape is (N, C, H, W), "
      << "label count (number of labels) must be N*H*W, "
      << "with integer values in {0, 1, ..., C-1}.";
  // likewise, assuming infogain_axis_ = 1: outer_num_ = N, inner_num_ = H*W, number of classes K = C
  num_labels_ = bottom[0]->shape(infogain_axis_);  // number of classes K
  Blob<Dtype>* infogain = NULL;  // infogain matrix
  if (bottom.size() < 3) {
    infogain = &infogain_;  // specified in the layer parameter
  } else {
    infogain = bottom[2];   // specified as an input blob
  }
  CHECK_EQ(infogain->count(), num_labels_*num_labels_);  // the infogain matrix H must be of size K*K, K being the number of classes
  sum_rows_H_.Reshape(vector<int>(1, num_labels_));  // holds the sum of each row of H
  if (bottom.size() == 2) {
    // H is provided as a parameter and will not change. sum rows once
    sum_rows_of_H(infogain);  // if H comes from the layer parameter its row sums are fixed during training and can be computed once
  }
  if (top.size() >= 2) {
    // softmax output
    top[1]->ReshapeLike(*bottom[0]);  // if a second output blob is set, top[1] is used as the softmax output; reshape it accordingly
  }
}

template <typename Dtype>
Dtype InfogainLossLayer<Dtype>::get_normalizer(
    LossParameter_NormalizationMode normalization_mode, int valid_count) {
  // compute the normalization coefficient according to the normalization mode
  Dtype normalizer;
  switch (normalization_mode) {
    case LossParameter_NormalizationMode_FULL:
      normalizer = Dtype(outer_num_ * inner_num_);  // FULL mode: the coefficient is the total count
      break;
    case LossParameter_NormalizationMode_VALID:
      if (valid_count == -1) {
        normalizer = Dtype(outer_num_ * inner_num_);  // VALID mode: equivalent to FULL when no ignore label is set
      } else {
        normalizer = Dtype(valid_count);  // with an ignore label, the coefficient is the number of valid samples
      }
      break;
    case LossParameter_NormalizationMode_BATCH_SIZE:  // BATCH_SIZE mode: the coefficient is the outer count
      normalizer = Dtype(outer_num_);
      break;
    case LossParameter_NormalizationMode_NONE:  // NONE mode: no normalization, coefficient is 1
      normalizer = Dtype(1);
      break;
    default:
      LOG(FATAL) << "Unknown normalization mode: "
          << LossParameter_NormalizationMode_Name(normalization_mode);
  }
  // Some users will have no labels for some examples in order to 'turn off' a
  // particular loss in a multi-task setup. The max prevents NaNs in that case.
  return std::max(Dtype(1.0), normalizer);  // likewise, guard against division by zero when the number of valid labels is 0
}

template <typename Dtype>
void InfogainLossLayer<Dtype>::sum_rows_of_H(const Blob<Dtype>* H) {
  // compute the sum of each row of H and store it in sum_rows_H_
  CHECK_EQ(H->count(), num_labels_*num_labels_)
      << "H must be " << num_labels_ << "x" << num_labels_;  // H must be of size K*K
  const Dtype* infogain_mat = H->cpu_data();    // data pointer of matrix H
  Dtype* sum = sum_rows_H_.mutable_cpu_data();  // data pointer of sum_rows_H_
  for ( int row = 0; row < num_labels_ ; row++ ) {
    sum[row] = 0;
    for ( int col = 0; col < num_labels_ ; col++ ) {
      sum[row] += infogain_mat[row*num_labels_+col];  // accumulate the sum of each row
    }
  }
}

template <typename Dtype>
void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  // The forward pass computes the softmax prob values.
  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);  // first compute the softmax output
  const Dtype* prob_data = prob_.cpu_data();          // data pointer of the softmax output
  const Dtype* bottom_label = bottom[1]->cpu_data();  // data pointer of the labels
  const Dtype* infogain_mat = NULL;                   // data pointer of the infogain matrix
  if (bottom.size() < 3) {
    infogain_mat = infogain_.cpu_data();   // from the layer parameter
  } else {
    infogain_mat = bottom[2]->cpu_data();  // from the input blob
  }
  int count = 0;
  Dtype loss = 0;
  for (int i = 0; i < outer_num_; ++i) {    // N
    for (int j = 0; j < inner_num_; j++) {  // H*W
      // bottom_label has N*H*W elements; fetch the true label of the sample at position (i,j)
      const int label_value = static_cast<int>(bottom_label[i * inner_num_ + j]);
      if (has_ignore_label_ && label_value == ignore_label_) {
        continue;  // an ignore label is set and the current label is invalid, skip it
      }
      DCHECK_GE(label_value, 0);  // the label value must lie in [0, num_labels_)
      DCHECK_LT(label_value, num_labels_);
      for (int l = 0; l < num_labels_; l++) {
        // infogain_mat[label_value * num_labels_ + l] is the weight for true label label_value predicted as class l
        // prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j] is the probability (softmax value) that the sample at (i,j) is predicted as class l
        loss -= infogain_mat[label_value * num_labels_ + l] *
            log(std::max(prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j],
                Dtype(kLOG_THRESHOLD)));
      }
      ++count;  // number of samples with valid labels
    }
  }
  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);  // finally divide by the normalization coefficient
  if (top.size() == 2) {
    top[1]->ShareData(prob_);  // with two output blobs, the softmax output is used as the second output
  }
}

template <typename Dtype>
void InfogainLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[1]) {  // backpropagation to the label blob is not allowed
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
  if (propagate_down.size() > 2 && propagate_down[2]) {  // backpropagation to the infogain matrix H is likewise not allowed
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to infogain inputs.";
  }
  if (propagate_down[0]) {  // the prediction blob requires gradients
    const Dtype* prob_data = prob_.cpu_data();          // softmax output
    const Dtype* bottom_label = bottom[1]->cpu_data();  // label data
    const Dtype* infogain_mat = NULL;
    if (bottom.size() < 3) {
      infogain_mat = infogain_.cpu_data();   // H comes from the layer parameter (its row sums were already computed in Reshape())
    } else {
      infogain_mat = bottom[2]->cpu_data();  // H comes from the input blob
      // H is provided as a "bottom" and might change. sum rows every time.
      sum_rows_of_H(bottom[2]);  // so recompute the row sums
    }
    const Dtype* sum_rows_H = sum_rows_H_.cpu_data();    // row sums of H
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();  // gradient data pointer of the input blob
    const int dim = bottom[0]->count() / outer_num_;     // C*H*W
    int count = 0;
    for (int i = 0; i < outer_num_; ++i) {    // N
      for (int j = 0; j < inner_num_; ++j) {  // H*W
        const int label_value = static_cast<int>(bottom_label[i * inner_num_ + j]);  // true label at (i,j)
        DCHECK_GE(label_value, 0);  // check that the label value lies in [0, num_labels_)
        DCHECK_LT(label_value, num_labels_);
        if (has_ignore_label_ && label_value == ignore_label_) {  // the label at this position is ignored
          for (int l = 0; l < num_labels_; ++l) {
            bottom_diff[i * dim + l * inner_num_ + j] = 0;  // clear the gradients of the predictions for every class at (i,j)
          }
        } else {
          for (int l = 0; l < num_labels_; ++l) {
            // prob_data[i*dim + l*inner_num_ + j] is the predicted probability of class l for the sample at (i,j)
            // sum_rows_H[label_value] is the sum of the row of H indexed by the true label label_value
            // infogain_mat[label_value * num_labels_ + l] is the weight for true label label_value predicted as class l
            bottom_diff[i * dim + l * inner_num_ + j] =
               prob_data[i*dim + l*inner_num_ + j]*sum_rows_H[label_value] -
               infogain_mat[label_value * num_labels_ + l];
          }
          ++count;  // number of valid samples
        }
      }
    }
    // Scale gradient
    Dtype loss_weight = top[0]->cpu_diff()[0] /
                        get_normalizer(normalization_, count);  // divide by the normalization coefficient to obtain the scaling factor
    caffe_scal(bottom[0]->count(), loss_weight, bottom_diff);   // bottom_diff *= loss_weight
  }
}
```
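Not part of the original source, but a minimal sketch of how a K*K infogain matrix H could be serialized into the binaryproto file that infogain_loss_param.source points at, so that ReadProtoFromBinaryFile() and Blob::FromProto() in LayerSetUp() can load it. The function name WriteInfogainMatrix, the row-major layout and the example file name are illustrative assumptions.

```cpp
#include <string>
#include <vector>

#include "caffe/proto/caffe.pb.h"
#include "caffe/util/io.hpp"

// Serialize a K*K matrix H (row-major) into a binaryproto file.
void WriteInfogainMatrix(const std::vector<float>& H, int K,
                         const std::string& path) {
  caffe::BlobProto proto;
  // Legacy 4-D shape 1*1*K*K; the layer only checks count() == K*K in Reshape().
  proto.set_num(1);
  proto.set_channels(1);
  proto.set_height(K);
  proto.set_width(K);
  for (int i = 0; i < K * K; ++i) {
    proto.add_data(H[i]);  // entry (row, col) stored at H[row * K + col]
  }
  caffe::WriteProtoToBinaryFile(proto, path);
}

// Example (assumed values): a 2-class matrix that up-weights errors on class 1.
// std::vector<float> H = {1.0f, 0.0f, 0.0f, 4.0f};
// WriteInfogainMatrix(H, 2, "infogain_H.binaryproto");
```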
The EuclideanLossLayer class computes the Euclidean distance loss between predictions and ground-truth values, and is used in regression tasks.
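Written out in the notation of the comments below (predictions \(a\), targets \(b\), loss weight \(\lambda\)), the layer computes \(E = \frac{1}{2N} \sum\limits_{n=1}^N \|a_n - b_n\|^2\), with gradients \(\frac{\partial E}{\partial a} = \frac{\lambda}{N}(a-b)\) and \(\frac{\partial E}{\partial b} = -\frac{\lambda}{N}(a-b)\).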
```cpp
template <typename Dtype>
void EuclideanLossLayer<Dtype>::Reshape(
  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::Reshape(bottom, top);  // base-class reshape
  CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
      << "Inputs must have the same dimension.";  // check that the C*H*W counts match
  diff_.ReshapeLike(*bottom[0]);  // diff_ takes the shape of bottom[0]
}

template <typename Dtype>
void EuclideanLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  int count = bottom[0]->count();  // total number of elements N*C*H*W
  // diff_ = bottom[0] - bottom[1]    // a - b
  caffe_sub(
      count,
      bottom[0]->cpu_data(),
      bottom[1]->cpu_data(),
      diff_.mutable_cpu_data());
  Dtype dot = caffe_cpu_dot(count, diff_.cpu_data(), diff_.cpu_data());  // inner product, dot = diff_ * diff_
  Dtype loss = dot / bottom[0]->num() / Dtype(2);  // loss = dot / N / 2
  // E = 1 / 2 / N * (a - b) * (a - b)
  top[0]->mutable_cpu_data()[0] = loss;
}

// EuclideanLossLayer does not strictly fix which input blob holds the predictions and
// which holds the targets, and it computes gradients for both input blobs
template <typename Dtype>
void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  for (int i = 0; i < 2; ++i) {
    if (propagate_down[i]) {  // backpropagation is allowed for this blob
      const Dtype sign = (i == 0) ? 1 : -1;
      const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
      // bottom[i] = alpha * diff_ + 0 * bottom[i]
      // a_diff =  1 * λ / N * (a - b)
      // b_diff = -1 * λ / N * (a - b)
      caffe_cpu_axpby(
          bottom[i]->count(),
          alpha,
          diff_.cpu_data(),
          Dtype(0),
          bottom[i]->mutable_cpu_diff());
    }
  }
}
```
The HingeLossLayer class computes the hinge loss and is used for one-vs-all classification tasks. The hinge loss is the loss used in SVMs, and it is precisely this property of the hinge loss that makes the SVM hyperplane depend on only a small number of samples.
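In the notation of the comments below, where \(t_{nk}\) is the score of sample \(n\) for class \(k\), \(l_n\) its true label, and \(\delta_k = 1\) if \(k = l_n\) and \(-1\) otherwise, Forward_cpu() computes \(E = \frac{1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^K \max(0, 1-\delta_k t_{nk})^p\), with \(p=1\) for the L1 norm and \(p=2\) for the L2 norm.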
```cpp
template <typename Dtype>
void HingeLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();    // prediction data pointer
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();  // prediction gradient data pointer
  const Dtype* label = bottom[1]->cpu_data();          // label data pointer
  int num = bottom[0]->num();      // N, the number of samples
  int count = bottom[0]->count();  // N*C*H*W
  int dim = count / num;           // C*H*W, the number of classes K
  caffe_copy(count, bottom_data, bottom_diff);  // bottom_diff = bottom_data
  for (int i = 0; i < num; ++i) {
    // label[i] is the true label of sample i
    bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;  // gives -δ*t_nk, with δ = -1 (k≠l_n) or 1 (k=l_n)
  }
  for (int i = 0; i < num; ++i) {
    for (int j = 0; j < dim; ++j) {
      // value of sample i for class j
      bottom_diff[i * dim + j] = std::max(
        Dtype(0), 1 + bottom_diff[i * dim + j]);  // max(0, 1-δ*t_nk)
    }
  }
  Dtype* loss = top[0]->mutable_cpu_data();  // output loss
  switch (this->layer_param_.hinge_loss_param().norm()) {  // norm type
  case HingeLossParameter_Norm_L1:
    loss[0] = caffe_cpu_asum(count, bottom_diff) / num;  // L1 norm: sum of absolute values, divided by the number of samples
    break;
  case HingeLossParameter_Norm_L2:
    loss[0] = caffe_cpu_dot(count, bottom_diff, bottom_diff) / num;  // L2 norm: sum of squares, divided by the number of samples
    break;
  default:
    LOG(FATAL) << "Unknown Norm";
  }
}

template <typename Dtype>
void HingeLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[1]) {  // backpropagation to the label blob is not allowed
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
  if (propagate_down[0]) {
    // the prediction gradient data, which already holds max(0, 1-δ*t_nk) from Forward_cpu()
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    const Dtype* label = bottom[1]->cpu_data();  // labels
    int num = bottom[0]->num();      // N, the number of samples
    int count = bottom[0]->count();  // N*C*H*W
    int dim = count / num;           // C*H*W, the number of classes K

    for (int i = 0; i < num; ++i) {
      // label[i] is the true label of sample i, giving:
      // bottom_diff =  max(0, 1-δ*t_nk)  {k≠l_n},
      //               -max(0, 1-δ*t_nk)  {k=l_n}
      bottom_diff[i * dim + static_cast<int>(label[i])] *= -1;
    }

    // the detailed derivation of this part is given in the explanation earlier in this post
    const Dtype loss_weight = top[0]->cpu_diff()[0];
    switch (this->layer_param_.hinge_loss_param().norm()) {
    case HingeLossParameter_Norm_L1:  // L1 norm
      // sign(bottom_diff) =  1 {k≠l_n, 1-δ*t_nk > 0},
      //                      0 {k≠l_n, 1-δ*t_nk ≤ 0},
      //                     -1 {k=l_n, 1-δ*t_nk > 0},
      //                      0 {k=l_n, 1-δ*t_nk ≤ 0}
      caffe_cpu_sign(count, bottom_diff, bottom_diff);     // take the sign, bottom_diff = sign(bottom_diff)
      caffe_scal(count, loss_weight / num, bottom_diff);   // bottom_diff *= loss_weight / num
      break;
    case HingeLossParameter_Norm_L2:  // L2 norm
      caffe_scal(count, loss_weight * 2 / num, bottom_diff);  // bottom_diff *= loss_weight * 2 / num
      break;
    default:
      LOG(FATAL) << "Unknown Norm";
    }
  }
}
```
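Combining the sign flip and the scaling above, the gradient written to bottom_diff is \(\frac{\partial E}{\partial t_{nk}} = -\delta_k \frac{\lambda}{N}\) when \(1-\delta_k t_{nk} > 0\) and \(0\) otherwise for the L1 norm, and \(\frac{\partial E}{\partial t_{nk}} = -\delta_k \frac{2\lambda}{N} \max(0, 1-\delta_k t_{nk})\) for the L2 norm, where \(\lambda\) is the loss weight stored in top[0]->cpu_diff()[0].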
The ContrastiveLossLayer class computes the contrastive loss. The idea behind this loss is that the Euclidean distance between samples of the same class should be as small as possible, while the Euclidean distance between samples of different classes should be no smaller than a given threshold. It is commonly used for training siamese networks.
Let \(d_n\) be the Euclidean distance between the two input feature vectors of pair \(n\) and \(y_n\) indicate whether the pair belongs to the same class (\(y_n=1\)) or not (\(y_n=0\)). With legacy_version=false (the default), the loss is \(E = \frac{1}{2N} \sum\limits_{n=1}^N [y_n*d_n^2 + (1-y_n)*\max (margin-d_n, 0)^2]\); with legacy_version=true, it is \(E = \frac{1}{2N} \sum\limits_{n=1}^N [y_n*d_n^2 + (1-y_n)*\max (margin-d_n^2, 0)]\).

```cpp
template <typename Dtype>
void ContrastiveLossLayer<Dtype>::LayerSetUp(
  const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::LayerSetUp(bottom, top);  // base-class setup
  CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());  // the C dimensions must match
  CHECK_EQ(bottom[0]->height(), 1);  // bottom[0] must have shape N*C*1*1
  CHECK_EQ(bottom[0]->width(), 1);
  CHECK_EQ(bottom[1]->height(), 1);  // bottom[1] must have shape N*C*1*1
  CHECK_EQ(bottom[1]->width(), 1);
  CHECK_EQ(bottom[2]->channels(), 1);  // bottom[2] must have shape N*1*1*1; labels indicating whether the samples in bottom[0] and bottom[1] belong to the same class
  CHECK_EQ(bottom[2]->height(), 1);
  CHECK_EQ(bottom[2]->width(), 1);
  diff_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);     // reshaped to N*C*1*1; stores the differences of the feature vectors of all pairs
  diff_sq_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1);  // reshaped to N*C*1*1; temporary variable for GPU computation
  dist_sq_.Reshape(bottom[0]->num(), 1, 1, 1);                      // reshaped to N*1*1*1; stores the squared Euclidean distances
  // vector of ones used to sum along channels
  summer_vec_.Reshape(bottom[0]->channels(), 1, 1, 1);  // reshaped to C*1*1*1
  for (int i = 0; i < bottom[0]->channels(); ++i)
    summer_vec_.mutable_cpu_data()[i] = Dtype(1);  // initialized to 1
}

template <typename Dtype>
void ContrastiveLossLayer<Dtype>::Forward_cpu(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  int count = bottom[0]->count();
  // diff_ = bottom[0] - bottom[1]    // a_ij - b_ij
  caffe_sub(
      count,
      bottom[0]->cpu_data(),
      bottom[1]->cpu_data(),
      diff_.mutable_cpu_data());
  const int channels = bottom[0]->channels();  // feature length of each sample
  // distance threshold; in the contrastive loss the Euclidean distance of dissimilar pairs must exceed margin, otherwise the corresponding loss is non-zero
  Dtype margin = this->layer_param_.contrastive_loss_param().margin();
  // legacy_version=false (the default) uses the (margin - d)^2 formula, true uses the (margin - d^2) formula
  bool legacy_version =
      this->layer_param_.contrastive_loss_param().legacy_version();
  Dtype loss(0.0);
  for (int i = 0; i < bottom[0]->num(); ++i) {  // each pair
    // diff_.cpu_data() + (i*channels) is the start of the i-th difference vector;
    // the inner product of the difference vector with itself gives d^2
    // d^2 = Σ_{j} (a_ij-b_ij) * (a_ij-b_ij)
    dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels,
        diff_.cpu_data() + (i*channels), diff_.cpu_data() + (i*channels));
    if (static_cast<int>(bottom[2]->cpu_data()[i])) {  // similar pairs  // the two vectors belong to the same class
      loss += dist_sq_.cpu_data()[i];  // E += y*d^2   (y=1)
    } else {  // dissimilar pairs  // different classes
      if (legacy_version) {
        loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0));  // E += (1-y)*max(0, margin - d^2)   (y=0)
      } else {
        Dtype dist = std::max<Dtype>(margin - sqrt(dist_sq_.cpu_data()[i]),
          Dtype(0.0));
        loss += dist*dist;  // E += (1-y)*max(0, margin - d)^2   (y=0)
      }
    }
  }
  loss = loss / static_cast<Dtype>(bottom[0]->num()) / Dtype(2);  // E = E / N / 2
  top[0]->mutable_cpu_data()[0] = loss;  // final loss
}

template <typename Dtype>
void ContrastiveLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  Dtype margin = this->layer_param_.contrastive_loss_param().margin();  // margin distance threshold
  bool legacy_version =
      this->layer_param_.contrastive_loss_param().legacy_version();  // version
  for (int i = 0; i < 2; ++i) {
    if (propagate_down[i]) {
      const Dtype sign = (i == 0) ? 1 : -1;  // δ = 1 (a_ij) or -1 (b_ij)
      // alpha = δ * λ / N
      const Dtype alpha = sign * top[0]->cpu_diff()[0] /
          static_cast<Dtype>(bottom[i]->num());
      int num = bottom[i]->num();            // number of pairs
      int channels = bottom[i]->channels();  // length of each feature vector
      for (int j = 0; j < num; ++j) {
        Dtype* bout = bottom[i]->mutable_cpu_diff();  // gradient data pointer
        if (static_cast<int>(bottom[2]->cpu_data()[j])) {  // similar pairs  // same class
          // for similar pairs the loss is E += y*d^2 (y=1) with d^2 = Σ_{j} (a_ij-b_ij) * (a_ij-b_ij),
          // so the gradient w.r.t. a_ij or b_ij is δ * λ / N * (a_ij-b_ij)
          caffe_cpu_axpby(
              channels,
              alpha,
              diff_.cpu_data() + (j*channels),
              Dtype(0.0),
              bout + (j*channels));
        } else {  // dissimilar pairs  // different classes
          Dtype mdist(0.0);
          Dtype beta(0.0);
          if (legacy_version) {
            // corresponds to E += (1-y)*max(0, margin - d^2)   (y=0)
            mdist = margin - dist_sq_.cpu_data()[j];  // mdist = margin - d^2
            beta = -alpha;  // beta = -δ * λ / N
          } else {
            // corresponds to E += (1-y)*max(0, margin - d)^2   (y=0)
            Dtype dist = sqrt(dist_sq_.cpu_data()[j]);      // d = sqrt(d^2)
            mdist = margin - dist;                          // mdist = margin - d
            beta = -alpha * mdist / (dist + Dtype(1e-4));   // beta = -δ * λ / N * (margin - d) / d
          }
          if (mdist > Dtype(0.0)) {
            // max(0, mdist) picks mdist
            // with legacy_version=true,  bout = -δ * λ / N * (a_ij-b_ij)
            // with legacy_version=false, bout = -δ * λ / N * (margin - d) / d * (a_ij-b_ij)
            caffe_cpu_axpby(
                channels,
                beta,
                diff_.cpu_data() + (j*channels),
                Dtype(0.0),
                bout + (j*channels));
          } else {
            // max(0, mdist) picks 0
            caffe_set(channels, Dtype(0), bout + (j*channels));  // set to 0
          }
        }
      }
    }
  }
}
```
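Collecting the per-case comments above into one expression (a restatement of what Backward_cpu() computes, with \(\lambda\) the loss weight in top[0]->cpu_diff()[0] and \(\delta = 1\) for bottom[0], \(-1\) for bottom[1]): for a similar pair \(j\) the gradient is \(\frac{\delta\lambda}{N}(a_j-b_j)\); for a dissimilar pair with an active margin term it is \(-\frac{\delta\lambda}{N}(a_j-b_j)\) when legacy_version=true and \(-\frac{\delta\lambda}{N} \cdot \frac{margin-d_j}{d_j}(a_j-b_j)\) when legacy_version=false; otherwise it is 0.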
This is my first time reading the Caffe source code, and I am taking notes as I read, so my understanding and analysis of the code may contain mistakes or omissions. I would appreciate readers pointing them out. Thank you for your support!