YOLO v2 損失函數源碼分析

時間 2019-12-07

標籤 yolo v2 損失函數源碼分析简体版

原文鏈接

損失函數的定義是在region_layer.c文件中，關於region層使用的參數在cfg文件的最後一個section中定義。

首先來看一看region_layer 都定義了哪些屬性值：

/*
 * Build a REGION layer: record its shape, allocate its host-side buffers
 * and install the forward/backward callbacks.
 *
 *   batch   - batch size
 *   w, h    - width / height of the feature map
 *   n       - number of anchors (5 in the paper)
 *   classes - number of classes to detect
 *   coords  - number of box coordinates stored per anchor
 *
 * Returns the initialised layer by value. As a side effect it prints
 * "detection" to stderr and reseeds the C RNG with srand(0).
 */
layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
{
    const int per_anchor = classes + coords + 1; /* box coords + confidence + class scores */
    const int bias_count = n*2;                  /* each anchor is stored as two values */
    int total;
    int k;
    layer l = {0};

    l.type = REGION;

    /* Shape of the layer; the output has the same shape as the input. */
    l.batch = batch;
    l.n = n;
    l.w = w;
    l.h = h;
    l.c = n*per_anchor; /* number of output channels */
    l.out_w = w;
    l.out_h = h;
    l.out_c = l.c;
    l.classes = classes;
    l.coords = coords;

    /* Size of one output tensor, e.g. 13*13*5*(20+4+1). */
    l.outputs = h*w*n*per_anchor;
    l.inputs = l.outputs;
    /* Note 1 (注1): 30 is a fixed value here; the article reads it as the
     * maximum number of ground-truth boxes per image. */
    l.truths = 30*(coords + 1);

    /* Host buffers. */
    total = batch*l.outputs;
    l.cost = calloc(1, sizeof(float));
    l.biases = calloc(bias_count, sizeof(float)); /* anchor storage */
    l.bias_updates = calloc(bias_count, sizeof(float));
    l.delta = calloc(total, sizeof(float));  /* gradients for the whole batch */
    l.output = calloc(total, sizeof(float)); /* outputs for the whole batch */

    /* Anchors default to 0.5. */
    for (k = 0; k < bias_count; ++k) {
        l.biases[k] = .5;
    }

    l.forward = forward_region_layer;
    /* Per the original annotation, delta is already produced by the forward
     * pass, so the backward function is empty. */
    l.backward = backward_region_layer;
#ifdef GPU
    l.forward_gpu = forward_region_layer_gpu;
    l.backward_gpu = backward_region_layer_gpu;
    l.output_gpu = cuda_make_array(l.output, total);
    l.delta_gpu = cuda_make_array(l.delta, total);
#endif
    fprintf(stderr, "detection\n");
    srand(0);
    return l;
}

 1 layer parse_region(list *options, size_params params)
 2 {
 3     int coords = option_find_int(options, "coords", 4);
 4     int classes = option_find_int(options, "classes", 20);
 5     int num = option_find_int(options, "num", 1);// 每個cell對應的anchors個數， 文中num=5
 6 
 7     layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords);
 8     assert(l.outputs == params.inputs);
 9 
10     l.log = option_find_int_quiet(options, "log", 0); // 是否計算log,這個標誌定義了，卻未使用
11     l.sqrt = option_find_int_quiet(options, "sqrt", 0); // 輸出預測值的w,h是否開方
12 
13     l.softmax = option_find_int(options, "softmax", 0); // 採用softmax分類
14     l.background = option_find_int_quiet(options, "background", 0);
15     l.max_boxes = option_find_int_quiet(options, "max",30); //******** 注2 **************
16     // 圖片中最多真實boxes的個數，這個應該和make_region_layer中的30有關
17     l.jitter = option_find_float(options, "jitter", .2);//抖動，cfg中設置爲.3
18     l.rescore = option_find_int_quiet(options, "rescore",0); //******** 注3 **************
19 
20     l.thresh = option_find_float(options, "thresh", .5); // .6 大於該值的時候認爲包含目標
21     l.classfix = option_find_int_quiet(options, "classfix", 0);
22     l.absolute = option_find_int_quiet(options, "absolute", 0); // 1
23     l.random = option_find_int_quiet(options, "random", 0); // 1
24 
25     l.coord_scale = option_find_float(options, "coord_scale", 1); // 座標損失的權重，1
26     l.object_scale = option_find_float(options, "object_scale", 1); // 有目標的權重, 5
27     l.noobject_scale = option_find_float(options, "noobject_scale", 1); // 無目標的權重, 1
28     l.mask_scale = option_find_float(options, "mask_scale", 1);
29     l.class_scale = option_find_float(options, "class_scale", 1); // 類別權重, 1
30     l.bias_match = option_find_int_quiet(options, "bias_match",0); // 1
31     // 下面幾句未執行
32     char *tree_file = option_find_str(options, "tree", 0);
33     if (tree_file) l.softmax_tree = read_tree(tree_file);
34     char *map_file = option_find_str(options, "map", 0);
35     if (map_file) l.map = read_map(map_file);
36 
37     char *a = option_find_str(options, "anchors", 0);
38     if(a){
39         int len = strlen(a);
40         int n = 1;
41         int i;
42         for(i = 0; i < len; ++i){
43             if (a[i] == ',') ++n;
44         }
45         for(i = 0; i < n; ++i){
46             float bias = atof(a);
47             l.biases[i] = bias;
48             a = strchr(a, ',')+1;
49         }
50     }
51     // l.biases存放了anchor的數值
52     return l;
53 }

注1: 這裏的30應該是限制了每幀圖像中目標的最大個數，我認爲應該和注2 相關，但這裏設爲了定值。

注2: 應該和注1 相關，即應在調用make_region_layer方法之前定義，並將後面的30都替換成 l.max_boxes。

注3: rescore是一個標誌位，推測是regression of confidence score的表示。當該標誌爲1的時候，在計算損失時需要迴歸出被選擇的anchor與真實target的iou；否則當該標誌爲0的時候，直接認爲置信度爲1。源碼中該值在cfg中設置爲1。

OK，接下來看一看region_layer 的forward方法是如何實現的。

在看這部分源碼之前，先了解一下數據的存儲結構，方便看懂源碼中尋找各類值的索引。

首先是net.truth，即真實target的存儲格式 : x,y,w,h,class,x,y,w,h,class,...

然後是*output的存儲格式：維度 w->h->entry->n->batch, 其中entry對應着每一個anchor生成的向量維度，文章中就是長度爲(4+1+20)的向量，該向量中存儲順序爲 box, confidence, classes

  1 void forward_region_layer(const layer l, network net)
  2 {
  3     int i,j,b,t,n;
  4     memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
  5     
  6 #ifndef GPU 
  7     for (b = 0; b < l.batch; ++b){
  8         for(n = 0; n < l.n; ++n){
  9             int index = entry_index(l, b, n*l.w*l.h, 0);
 10             activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
 11             index = entry_index(l, b, n*l.w*l.h, l.coords);
 12             fprintf(stderr,"background %s \n", l.background)
 13             if(!l.background) activate_array(l.output + index,   l.w*l.h, LOGISTIC);
 14         }
 15     }
 16     if (l.softmax_tree){
 17         int i;
 18         int count = l.coords + 1;
 19         for (i = 0; i < l.softmax_tree->groups; ++i) {
 20             int group_size = l.softmax_tree->group_size[i];
 21             softmax_cpu(net.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count);
 22             count += group_size;
 23         }
 24     } else if (l.softmax){
 25         int index = entry_index(l, 0, 0, l.coords + !l.background);
 26         softmax_cpu(net.input + index, l.classes + l.background, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output + index);
 27     }
 28 #endif
 29 
 30     memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); // 梯度清零
 31     if(!net.train) return; // 非訓練模式直接返回
 32     float avg_iou = 0;  // average iou
 33     float recall = 0; // 召回數
 34     float avg_cat = 0;  // 平均的類別辨識率
 35     float avg_obj = 0;
 36     float avg_anyobj = 0;
 37     int count = 0; // 該batch內檢測的target數
 38     int class_count = 0;
 39     *(l.cost) = 0; // 損失
 40     for (b = 0; b < l.batch; ++b) { // 遍歷batch內數據
 41         if(l.softmax_tree){// 不執行
 42             int onlyclass = 0;
 43             for(t = 0; t < 30; ++t){
 44                 box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
 45                 if(!truth.x) break;
 46                 int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords];
 47                 float maxp = 0;
 48                 int maxi = 0;
 49                 if(truth.x > 100000 && truth.y > 100000){
 50                     for(n = 0; n < l.n*l.w*l.h; ++n){
 51                         int class_index = entry_index(l, b, n, l.coords + 1);
 52                         int obj_index = entry_index(l, b, n, l.coords);
 53                         float scale =  l.output[obj_index];
 54                         l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
 55                         float p = scale*get_hierarchy_probability(l.output + class_index, l.softmax_tree, class, l.w*l.h);
 56                         if(p > maxp){
 57                             maxp = p;
 58                             maxi = n;
 59                         }
 60                     }
 61                     int class_index = entry_index(l, b, maxi, l.coords + 1);
 62                     int obj_index = entry_index(l, b, maxi, l.coords);
 63                     delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat);
 64                     if(l.output[obj_index] < .3) l.delta[obj_index] = l.object_scale * (.3 - l.output[obj_index]);
 65                     else  l.delta[obj_index] = 0;
 66                     l.delta[obj_index] = 0;
 67                     ++class_count;
 68                     onlyclass = 1;
 69                     break;
 70                 }
 71             }
 72             if(onlyclass) continue;
 73         }
 74         for (j = 0; j < l.h; ++j) {
 75             for (i = 0; i < l.w; ++i) {
 76                 for (n = 0; n < l.n; ++n) {
 77                     int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
 78                     //帶入 entry_index, 由output tensor的存儲格式能夠知道這裏是第n類anchor在(i,j)上對應box的首地址
 79                     box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
 80                     // 在cell（i，j）上相對於anchor n的預測結果， 相對於feature map的值
 81                     float best_iou = 0;
 82                     for(t = 0; t < 30; ++t){//net.truth存放的是真實數據
 83                         // net.truth存儲格式：x,y,w,h,c,x,y,w,h,c,....
 84                         box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
 85                         //讀取一個真實目標框
 86                         if(!truth.x) break;//遍歷完全部真實box則跳出循環
 87                         float iou = box_iou(pred, truth);//計算iou
 88                         if (iou > best_iou) {
 89                             best_iou = iou;//找到與當前預測box的最大iou
 90                         }
 91                     }
 92                     int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, l.coords);
 93                     // 存儲第n個anchor在cell (i,j)的預測的confidence的index
 94                     avg_anyobj += l.output[obj_index]; // 有目標的機率
 95                     
 96                     l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
 97                   // *********** 注4 **********
 98                     // 全部的predict box都當作noobject，計算其損失梯度，主要是爲了計算速度考慮
 99                     if(l.background) l.delta[obj_index] = l.noobject_scale * (1 - l.output[obj_index]);//未執行
100                     if (best_iou > l.thresh) {//該預測框中有目標
101                       // *********** 注5 ***********
102                         l.delta[obj_index] = 0;
103                     }
104 
105                     if(*(net.seen) < 12800){// net.seen 已訓練樣本的個數
106                       // *********** 注6 ***********
107                         box truth = {0}; // 當前cell爲中心對應的第n個anchor的box
108                         truth.x = (i + .5)/l.w; // cell的中點 // 對應tx=0.5
109                         truth.y = (j + .5)/l.h; //ty=0.5
110                         truth.w = l.biases[2*n]/l.w; //相對於feature map的大小 // tw=0
111                         truth.h = l.biases[2*n+1]/l.h; //th=0
112                         delta_region_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, l.delta, .01, l.w*l.h);
113                         //將預測的tx,ty,tw,th和上面的box差值存入l.delta
114                     }
115                 }
116             }
117         }
118         for(t = 0; t < 30; ++t){
119             box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
120             //對應的真實值，歸一化的真實值
121 
122             if(!truth.x) break;
123             float best_iou = 0;
124             int best_n = 0;
125             i = (truth.x * l.w);// 類型的強制轉換，計算該truth所在的cell的i,j座標
126             j = (truth.y * l.h);
127             //printf("%d %f %d %f\n", i, truth.x*l.w, j, truth.y*l.h);
128             box truth_shift = truth;
129             truth_shift.x = 0;
130             truth_shift.y = 0;
131             //printf("index %d %d\n",i, j);
132             for(n = 0; n < l.n; ++n){ // 遍歷對應的cell預測出的n個anchor
133               // 即經過該cell對應的anchors與truth的iou來判斷使用哪個anchor產生的predict來回歸
134                 int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
135                 box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
136                 // 預測box，歸一化的值
137               //下面這幾句是將truth與anchor中心對齊後，計算anchor與truch的iou
138                 if(l.bias_match){ // ********* 注7 ***************
139                     pred.w = l.biases[2*n]/l.w; // 由於是和anchor比較，因此直接使用anchor的相對大小
140                     pred.h = l.biases[2*n+1]/l.h;
141                 }
142                 //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h);
143                 pred.x = 0;
144                 pred.y = 0;
145                 float iou = box_iou(pred, truth_shift);
146                 if (iou > best_iou){
147                     best_iou = iou;
148                     best_n = n;// 最優iou對應的anchor索引，而後使用該anchor預測的predict box計算與真實box的偏差
149                 }
150             }
151             //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h);
152 
153             int box_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 0);
154             float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, l.delta, l.coord_scale *  (2 - truth.w*truth.h), l.w*l.h);
155             // 注意這裏的關於box的損失權重 ************* 注 8 **********************
156             if(l.coords > 4){// 不執行
157                 int mask_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 4);
158                 delta_region_mask(net.truth + t*(l.coords + 1) + b*l.truths + 5, l.output, l.coords - 4, mask_index, l.delta, l.w*l.h, l.mask_scale);
159             }
160             if(iou > .5) recall += 1;// 若是iou> 0.5, 認爲找到該目標，召回數+1
161             avg_iou += iou;
162 
163             //l.delta[best_index + 4] = iou - l.output[best_index + 4];
164             int obj_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords);// 對應predict預測的confidence
165             avg_obj += l.output[obj_index];
166             l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]);//有目標時的損失
167             if (l.rescore) { //定義了rescore表示同時對confidence score進行迴歸
168                 l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]); 
169             }
170             if(l.background){//不執行
171                 l.delta[obj_index] = l.object_scale * (0 - l.output[obj_index]);
172             }
173 
174             int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords];// 真實類別
175             if (l.map) class = l.map[class];//不執行
176             int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords + 1);//預測的class向量首地址
177             delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat);
178             ++count;
179             ++class_count;
180         }
181     }
182     //printf("\n");
183     *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);//MSEloss
184     printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f,  count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);
185 }

注4，5： 這兩個地方定義了iou的損失

注6： 這段代碼主要是計算anchors中沒能提供truth的有效預測的那些anchor如何計算損失。有點類似於包含object和不包含object的cell的損失差別，這裏沒有提供有效預測的anchors則使用scale=0.01的權重計算損失。主要目的是爲了在模型訓練的前期更加穩定。參見yolo v1中關於object和非object cell的論述

> Also, in every image many grid cells do not contain any object. This pushes the "confidence" scores of those cells towards zero, often overpowering the gradient from cells that do contain objects. This can lead to model instability, causing training to diverge early on.

注7： bias_match標誌位用來確定是由anchor還是由anchor對應的prediction來決定用哪一個anchor產生的prediction來回歸。如果bias_match=1（即cfg中的設置），那麼先用anchor與truth box的iou來選擇每一個cell使用哪一個anchor的預測框計算損失。如果bias_match=0的話，使用每一個anchor的預測框與truth box的iou選擇使用哪個anchor的預測框計算損失。這裏我剛開始納悶，bias_match=0計算的iou和後面rescore=1裏面用的iou不是一樣了嗎，那delta就一直爲0啊？其實這裏在選擇anchor時計算iou是在中心對齊的情況下計算的，所以和後面rescore計算的iou還是不一樣的。

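注8： 這裏計算了box的梯度，注意loss的權重爲 $\text{coord\_scale} \times (2 - w_{truth} \cdot h_{truth})$，其中 $w_{truth}$、$h_{truth}$ 是歸一化後truth box的寬和高，即box越小權重越大。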

在yolo中有這麼一段

> Sum-squared error also equally weights errors in large boxes and small boxes. Our error metric should reflect that small deviations in large boxes matter less than in small boxes. To partially address this we predict the square root of the bounding box width and height instead of the width and height directly.

即yolo v1中使用w和h的開方來緩和該問題，而在yolo v2中則通過賦予一個和w，h相關的權重函數達到該目的。

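　　3. 因此總結起來，代碼中計算的損失包括以下各項（以寫入l.delta的梯度表示，損失爲l.delta的平方和），其中最後一項只在訓練初期使用：

- 無目標置信度項：對每個預測框 $\delta_{obj} = \text{noobject\_scale} \times (0 - c)$；若該預測框與某個truth box的iou大於thresh，則置爲0。
- 座標項：對負責預測truth的anchor，權重爲 $\text{coord\_scale} \times (2 - w_{truth} \cdot h_{truth})$。
- 有目標置信度項：$\delta_{obj} = \text{object\_scale} \times (1 - c)$；rescore=1時爲 $\text{object\_scale} \times (iou - c)$。
- 類別項：權重爲class_scale。
- 先驗框項：當已訓練樣本數seen < 12800時，每個預測框向以當前cell爲中心的anchor（$t_x = t_y = 0.5$，$t_w = t_h = 0$）迴歸，權重爲0.01。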

源碼中計算loss的步驟：