less
首先來看一看 region_layer 都定義了哪些屬性值:
layer make_region_layer(int batch, int w, int h, int n, int classes, int coords) { layer l = {0}; l.type = REGION; l.n = n; // anchors 的個數, 文章中選擇爲5 l.batch = batch; // batchsize l.h = h; l.w = w; l.c = n*(classes + coords + 1); // 輸出的通道數 l.out_w = l.w; l.out_h = l.h; l.out_c = l.c; l.classes = classes; // 檢測的類別數 l.coords = coords; l.cost = calloc(1, sizeof(float)); l.biases = calloc(n*2, sizeof(float)); // anchors的存儲位置,一個anchor對應兩個值 l.bias_updates = calloc(n*2, sizeof(float)); l.outputs = h*w*n*(classes + coords + 1); //輸出tensor的存儲空間大小 13*13*5*(20+4+1) l.inputs = l.outputs; l.truths = 30*(l.coords + 1); // ***********注1************ l.delta = calloc(batch*l.outputs, sizeof(float)); // 批量梯度 l.output = calloc(batch*l.outputs, sizeof(float));// 批量輸出tensor的存儲空間 int i; for(i = 0; i < n*2; ++i){ l.biases[i] = .5;//anchors的默認值設爲0.5 } l.forward = forward_region_layer; // 前向計算函數 l.backward = backward_region_layer;//反向計算函數,這裏delta在前向計算函數中得到了,因此該函數爲空 #ifdef GPU l.forward_gpu = forward_region_layer_gpu; l.backward_gpu = backward_region_layer_gpu; l.output_gpu = cuda_make_array(l.output, batch*l.outputs); l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs); #endif fprintf(stderr, "detection\n"); srand(0); return l; }
1 layer parse_region(list *options, size_params params) 2 { 3 int coords = option_find_int(options, "coords", 4); 4 int classes = option_find_int(options, "classes", 20); 5 int num = option_find_int(options, "num", 1);// 每個cell對應的anchors個數, 文中num=5 6 7 layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords); 8 assert(l.outputs == params.inputs); 9 10 l.log = option_find_int_quiet(options, "log", 0); // 是否計算log,這個標誌定義了,卻未使用 11 l.sqrt = option_find_int_quiet(options, "sqrt", 0); // 輸出預測值的w,h是否開方 12 13 l.softmax = option_find_int(options, "softmax", 0); // 採用softmax分類 14 l.background = option_find_int_quiet(options, "background", 0); 15 l.max_boxes = option_find_int_quiet(options, "max",30); //******** 注2 ************** 16 // 圖片中最多真實boxes的個數,這個應該和make_region_layer中的30有關 17 l.jitter = option_find_float(options, "jitter", .2);//抖動,cfg中設置爲.3 18 l.rescore = option_find_int_quiet(options, "rescore",0); //******** 注3 ************** 19 20 l.thresh = option_find_float(options, "thresh", .5); // .6 大於該值的時候認爲包含目標 21 l.classfix = option_find_int_quiet(options, "classfix", 0); 22 l.absolute = option_find_int_quiet(options, "absolute", 0); // 1 23 l.random = option_find_int_quiet(options, "random", 0); // 1 24 25 l.coord_scale = option_find_float(options, "coord_scale", 1); // 座標損失的權重,1 26 l.object_scale = option_find_float(options, "object_scale", 1); // 有目標的權重, 5 27 l.noobject_scale = option_find_float(options, "noobject_scale", 1); // 無目標的權重, 1 28 l.mask_scale = option_find_float(options, "mask_scale", 1); 29 l.class_scale = option_find_float(options, "class_scale", 1); // 類別權重, 1 30 l.bias_match = option_find_int_quiet(options, "bias_match",0); // 1 31 // 下面幾句未執行 32 char *tree_file = option_find_str(options, "tree", 0); 33 if (tree_file) l.softmax_tree = read_tree(tree_file); 34 char *map_file = option_find_str(options, "map", 0); 35 if (map_file) l.map = read_map(map_file); 36 37 char *a = option_find_str(options, "anchors", 0); 38 if(a){ 39 int 
len = strlen(a); 40 int n = 1; 41 int i; 42 for(i = 0; i < len; ++i){ 43 if (a[i] == ',') ++n; 44 } 45 for(i = 0; i < n; ++i){ 46 float bias = atof(a); 47 l.biases[i] = bias; 48 a = strchr(a, ',')+1; 49 } 50 } 51 // l.biases存放了anchor的數值 52 return l; 53 }
ide
注2: 應該和注1 相關,即應在調用 make_region_layer 方法以前定義,並將後面的 30 都替換成 l.max_boxes
注3: rescore是一個標誌位,推測是regression of confidence score的表示。 當該標誌爲1的時候,在計算損失時須要迴歸出被選擇的anchor與真實target的iou,不然當該標誌爲0的時候,直接認爲置信度爲1。源碼中該值在cfg中設置爲1。
post
在看這部分源碼以前,先了解一下數據的存儲結構,方便看懂源碼中尋找各類值的索引。
首先是 net.truth,即真實 target 的存儲格式: x,y,w,h,class,x,y,w,h,class,...
而後是*output的存儲格式: 維度 w->h->entry->n->batch, 其中entry對應着每一個anchor生成的向量維度,文章中就是長度爲(4+1+20)的向量,該向量中存儲順序爲 box, confidence, classes
1 void forward_region_layer(const layer l, network net) 2 { 3 int i,j,b,t,n; 4 memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float)); 5 6 #ifndef GPU 7 for (b = 0; b < l.batch; ++b){ 8 for(n = 0; n < l.n; ++n){ 9 int index = entry_index(l, b, n*l.w*l.h, 0); 10 activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); 11 index = entry_index(l, b, n*l.w*l.h, l.coords); 12 fprintf(stderr,"background %s \n", l.background) 13 if(!l.background) activate_array(l.output + index, l.w*l.h, LOGISTIC); 14 } 15 } 16 if (l.softmax_tree){ 17 int i; 18 int count = l.coords + 1; 19 for (i = 0; i < l.softmax_tree->groups; ++i) { 20 int group_size = l.softmax_tree->group_size[i]; 21 softmax_cpu(net.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count); 22 count += group_size; 23 } 24 } else if (l.softmax){ 25 int index = entry_index(l, 0, 0, l.coords + !l.background); 26 softmax_cpu(net.input + index, l.classes + l.background, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output + index); 27 } 28 #endif 29 30 memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); // 梯度清零 31 if(!net.train) return; // 非訓練模式直接返回 32 float avg_iou = 0; // average iou 33 float recall = 0; // 召回數 34 float avg_cat = 0; // 平均的類別辨識率 35 float avg_obj = 0; 36 float avg_anyobj = 0; 37 int count = 0; // 該batch內檢測的target數 38 int class_count = 0; 39 *(l.cost) = 0; // 損失 40 for (b = 0; b < l.batch; ++b) { // 遍歷batch內數據 41 if(l.softmax_tree){// 不執行 42 int onlyclass = 0; 43 for(t = 0; t < 30; ++t){ 44 box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1); 45 if(!truth.x) break; 46 int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords]; 47 float maxp = 0; 48 int maxi = 0; 49 if(truth.x > 100000 && truth.y > 100000){ 50 for(n = 0; n < l.n*l.w*l.h; ++n){ 51 int class_index = entry_index(l, b, n, l.coords + 1); 52 int obj_index = entry_index(l, b, n, l.coords); 53 float scale = l.output[obj_index]; 54 l.delta[obj_index] = 
l.noobject_scale * (0 - l.output[obj_index]); 55 float p = scale*get_hierarchy_probability(l.output + class_index, l.softmax_tree, class, l.w*l.h); 56 if(p > maxp){ 57 maxp = p; 58 maxi = n; 59 } 60 } 61 int class_index = entry_index(l, b, maxi, l.coords + 1); 62 int obj_index = entry_index(l, b, maxi, l.coords); 63 delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat); 64 if(l.output[obj_index] < .3) l.delta[obj_index] = l.object_scale * (.3 - l.output[obj_index]); 65 else l.delta[obj_index] = 0; 66 l.delta[obj_index] = 0; 67 ++class_count; 68 onlyclass = 1; 69 break; 70 } 71 } 72 if(onlyclass) continue; 73 } 74 for (j = 0; j < l.h; ++j) { 75 for (i = 0; i < l.w; ++i) { 76 for (n = 0; n < l.n; ++n) { 77 int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); 78 //帶入 entry_index, 由output tensor的存儲格式能夠知道這裏是第n類anchor在(i,j)上對應box的首地址 79 box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h); 80 // 在cell(i,j)上相對於anchor n的預測結果, 相對於feature map的值 81 float best_iou = 0; 82 for(t = 0; t < 30; ++t){//net.truth存放的是真實數據 83 // net.truth存儲格式:x,y,w,h,c,x,y,w,h,c,.... 
84 box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1); 85 //讀取一個真實目標框 86 if(!truth.x) break;//遍歷完全部真實box則跳出循環 87 float iou = box_iou(pred, truth);//計算iou 88 if (iou > best_iou) { 89 best_iou = iou;//找到與當前預測box的最大iou 90 } 91 } 92 int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, l.coords); 93 // 存儲第n個anchor在cell (i,j)的預測的confidence的index 94 avg_anyobj += l.output[obj_index]; // 有目標的機率 95 96 l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]); 97 // *********** 注4 ********** 98 // 全部的predict box都當作noobject,計算其損失梯度,主要是爲了計算速度考慮 99 if(l.background) l.delta[obj_index] = l.noobject_scale * (1 - l.output[obj_index]);//未執行 100 if (best_iou > l.thresh) {//該預測框中有目標 101 // *********** 注5 *********** 102 l.delta[obj_index] = 0; 103 } 104 105 if(*(net.seen) < 12800){// net.seen 已訓練樣本的個數 106 // *********** 注6 *********** 107 box truth = {0}; // 當前cell爲中心對應的第n個anchor的box 108 truth.x = (i + .5)/l.w; // cell的中點 // 對應tx=0.5 109 truth.y = (j + .5)/l.h; //ty=0.5 110 truth.w = l.biases[2*n]/l.w; //相對於feature map的大小 // tw=0 111 truth.h = l.biases[2*n+1]/l.h; //th=0 112 delta_region_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, l.delta, .01, l.w*l.h); 113 //將預測的tx,ty,tw,th和上面的box差值存入l.delta 114 } 115 } 116 } 117 } 118 for(t = 0; t < 30; ++t){ 119 box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1); 120 //對應的真實值,歸一化的真實值 121 122 if(!truth.x) break; 123 float best_iou = 0; 124 int best_n = 0; 125 i = (truth.x * l.w);// 類型的強制轉換,計算該truth所在的cell的i,j座標 126 j = (truth.y * l.h); 127 //printf("%d %f %d %f\n", i, truth.x*l.w, j, truth.y*l.h); 128 box truth_shift = truth; 129 truth_shift.x = 0; 130 truth_shift.y = 0; 131 //printf("index %d %d\n",i, j); 132 for(n = 0; n < l.n; ++n){ // 遍歷對應的cell預測出的n個anchor 133 // 即經過該cell對應的anchors與truth的iou來判斷使用哪個anchor產生的predict來回歸 134 int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); 135 box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h); 136 // 
預測box,歸一化的值 137 //下面這幾句是將truth與anchor中心對齊後,計算anchor與truch的iou 138 if(l.bias_match){ // ********* 注7 *************** 139 pred.w = l.biases[2*n]/l.w; // 由於是和anchor比較,因此直接使用anchor的相對大小 140 pred.h = l.biases[2*n+1]/l.h; 141 } 142 //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h); 143 pred.x = 0; 144 pred.y = 0; 145 float iou = box_iou(pred, truth_shift); 146 if (iou > best_iou){ 147 best_iou = iou; 148 best_n = n;// 最優iou對應的anchor索引,而後使用該anchor預測的predict box計算與真實box的偏差 149 } 150 } 151 //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h); 152 153 int box_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 0); 154 float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, l.delta, l.coord_scale * (2 - truth.w*truth.h), l.w*l.h); 155 // 注意這裏的關於box的損失權重 ************* 注 8 ********************** 156 if(l.coords > 4){// 不執行 157 int mask_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 4); 158 delta_region_mask(net.truth + t*(l.coords + 1) + b*l.truths + 5, l.output, l.coords - 4, mask_index, l.delta, l.w*l.h, l.mask_scale); 159 } 160 if(iou > .5) recall += 1;// 若是iou> 0.5, 認爲找到該目標,召回數+1 161 avg_iou += iou; 162 163 //l.delta[best_index + 4] = iou - l.output[best_index + 4]; 164 int obj_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords);// 對應predict預測的confidence 165 avg_obj += l.output[obj_index]; 166 l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]);//有目標時的損失 167 if (l.rescore) { //定義了rescore表示同時對confidence score進行迴歸 168 l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]); 169 } 170 if(l.background){//不執行 171 l.delta[obj_index] = l.object_scale * (0 - l.output[obj_index]); 172 } 173 174 int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords];// 真實類別 175 if (l.map) class = l.map[class];//不執行 176 int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords + 1);//預測的class向量首地址 177 delta_region_class(l.output, l.delta, 
class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat); 178 ++count; 179 ++class_count; 180 } 181 } 182 //printf("\n"); 183 *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);//MSEloss 184 printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f, count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count); 185 }
code
注6:
>Also, in every image many grid cells do not contain any object. This pushes the confidence scores of those cells towards zero, often overpowering the gradient from cells that do contain objects. This can lead to model instability, causing training to diverge early on.
在yolo中有這麼一段
> Sum-squared error also equally weights errors in large boxes and small boxes. Our error metric should reflect that small deviations in large boxes matter less than in small boxes. To partially address this we predict the square root of the bounding box width and height instead of the width and height directly.
即yolo v1中使用w和h的開方來緩和該問題,而在yolo v2中則經過賦值一個和w,h相關的權重函數達到該目的。
3. 因此總結起來,代碼中計算的損失包括:其中最後一項只在訓練初期使用
計算包含目標和不包含目標的anchors的iou損失
12800樣本以前計算未預測到target的anchors的梯度
針對於每個target,計算最接近的anchor的coord梯度
計算類別預測的損失和梯度。