GOCR v0.50 原理分析

一,簡介:

GOCR是一個用C寫的開源OCR庫,採用GNU Public License,作者:Joerg Schulenburg

項目主頁:http://jocr.sourceforge.net/index.html

源代碼(v0.50) : http://pan.baidu.com/s/1y1Jj1 (VS2005工程項目)

Update : http://pan.baidu.com/s/1c0b278O (windows下通過liblept支持jpeg/png等格式的OCR)

版本(version.h):

/* GOCR version number, as a string literal */
#define version_string "0.50"
/* release identifier; presumably the release date as YYYYMMDD -- TODO confirm */
#define release_string "20130305"

二,原理分析:

1,GOCR的主要流程如下:

/*
 * Main OCR pipeline for the image stored in job->src.p.
 *
 * Stages, in the order they are called below:
 *   otsu / thresholding            - choose and apply the gray threshold cs
 *   scan_boxes                     - build the box list (job->res)
 *   remove_dust, detect_barcode, detect_pictures, remove_pictures
 *   glue_holes_inside_chars, detect_rotation_angle, detect_text_lines
 *   add_line_info, divide_vert_glued_boxes, remove_melted_serifs
 *   glue_broken_chars, remove_rest_of_dust, list_sort, measure_pitch
 *   char_recognition               - run a 2nd time if adjust_text_lines
 *                                    returns non-zero
 *   compare_unknown_with_known_chars, try_to_divide_boxes
 *   list_insert_spaces, context_correction (skipped if mode&32)
 *   store_boxtree_lines
 *
 * job: job description; must not be NULL.
 * Returns 0 when the pipeline ran to the end, 1 when scan_boxes found
 * no boxes (job->res.numC == 0).
 */
int pgm2asc(job_t *job)
{
  pix *pp;
  progress_counter_t *pc;
  static int multi_image_count=0;  /* number of image within multi-image */
  int orig_cs=0; 

  /* check the pointer before it is dereferenced for the first time */
  assert(job);

  /* NOTE(review): orig_cs is not static, so it is only non-zero on the
     very first call; on later calls the otsu result is always used.
     Confirm that this matches the "save for multi-images" intent. */
  if (!multi_image_count) orig_cs = job->cfg.cs; /* save for multi-images */
  
  multi_image_count++;

  /* FIXME jb: remove pp */
  pp = &(job->src.p);

  pc = open_progress(100,"pgm2asc_main");
  progress(0,pc); /* start progress output 0% 0% */
#if 0 /* dont vast memory */
  /* FIXME jb: malloc */
  if ( job->cfg.verbose & 32 ) { 
    // generate 2nd imagebuffer for debugging output
    job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x);     
    // buffer
    assert(job->tmp.ppo.p);
    copybox(&job->src.p,
            0, 0, job->src.p.x, job->src.p.y,
            &job->tmp.ppo,
            job->src.p.x * job->src.p.y);
  }
#else
  job->tmp.ppo=job->src.p; /* temporarely, removed later */
#endif
  // if (job->cfg.verbose&32) debug_img("out000.ppm",job,0);

  /* ----- count colors ------ create histogram -------
     - this should be used to create a upper and lower limit for cs
     - cs is the optimum gray value between cs_min and cs_max
     - also inverse scans could be detected here later */
  if (orig_cs==0)
    job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
  else  // dont set cs, output stats + do inversion if needed 2010-10-07
    otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
//  if (job->cfg.verbose&32) debug_img("out001.ppm",job,0);
  /* renormalize the image and set the normalized threshold value */
  job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs );
  if( job->cfg.verbose ) 
    fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs);
//  if (job->cfg.verbose&32) debug_img("out002.ppm",job,0);

  progress(5,pc); /* progress is only estimated */

  
  /* this is first step for reorganize the PG
     ---- look for letters, put rectangular frames around letters
     letter = connected points near color F
     should be used by dust removing (faster) and line detection!
     ---- 0..cs = black letters, last change = Mai99 */
  
  progress(8,pc); /* progress is only estimated */

//  if (job->cfg.verbose&32) debug_img("out008.ppm",job,8);
  scan_boxes( job, pp );
  if ( !job->res.numC ){ 
    fprintf( stderr,"# no boxes found - stopped\n" );
    if(job->cfg.verbose&32) debug_img("out01",job,8);
    /***** should free stuff, etc) */
    /* close the progress counter opened above, as the normal exit does */
    close_progress(pc);
    return(1);
  }
  // tmp10/bug100818a.pgm creates artefacts on image
//  if (job->cfg.verbose&32) debug_img("out00",job,4+8);

  progress(10,pc); /* progress is only estimated */
  // if(job->cfg.verbose&32) debug_img("out01",job,4+8);
  // output_list(job);  // for debugging 
  // ToDo: matrix printer preprocessing

  remove_dust( job ); /* from the &(job->res.boxlist)! */
// if(job->cfg.verbose&32) debug_img("out02",job,4+8);
// output_list(job);  // for debugging 
#if 0 // ToDo 2010-10-15 destroys QR-barcodes
  smooth_borders( job ); /* only for big chars */
#endif
  progress(12,pc); /* progress is only estimated */
// if(job->cfg.verbose&32) debug_img("out03",job,4+8);
// output_list(job);  // for debugging 

  detect_barcode( job );  /* mark barcode */
// if(job->cfg.verbose&32) debug_img("out04",job,4+8);
// output_list(job);  // for debugging 

  detect_pictures( job ); /* mark pictures */
//  if(job->cfg.verbose&32) debug_img("out05",job,4+8);
// output_list(job);  // for debugging 

  remove_pictures( job ); /* do this as early as possible, before layout */
//  if(job->cfg.verbose&32) debug_img("out06",job,4+8);
// output_list(job);  // for debugging

  glue_holes_inside_chars( pp ); /* including count subboxes (holes)  */

  detect_rotation_angle( job );

#if 1         /* Rotate the whole picture! move boxes */
  if( job->res.lines.dy!=0 ){  // move down lowest first, move up highest first
    // in work! ??? (at end set dy=0) think on ppo!
  }
#endif
  detect_text_lines( pp, job->cfg.mode ); /* detect and mark job->tmp.ppo */
// if(job->cfg.verbose&32) debug_img("out07",job,4+8);
  progress(20,pc); /* progress is only estimated */

  add_line_info( job /* , &(job->res.boxlist) */);
  if (job->cfg.verbose&32) debug_img("out10",job,4+8);

  divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */
//  if(job->cfg.verbose&32) debug_img("out11",job,0);

  remove_melted_serifs( job, pp ); /* make some corrections on pixmap */
  /* list_ins seems to sort in the boxes on the wrong place ??? */
//  if(job->cfg.verbose&32) debug_img("out12",job,4+8);

  glue_broken_chars( job, pp ); /* 2nd glue */
//  if(job->cfg.verbose&32) debug_img("out14",job,4+8);
// 2010-09-24 overall box size is correct here, but later broken

  remove_rest_of_dust( job );
//  if(job->cfg.verbose&32) debug_img("out15",job,4+8);

  /* better sort after dust is removed (slow for lot of pixels) */ 
  list_sort(&(job->res.boxlist), sort_box_func);

  measure_pitch( job );

  if(job->cfg.mode&64) find_same_chars( pp );
  progress(30,pc); /* progress is only estimated */
//  if(job->cfg.verbose&32) debug_img("out16",job,4+8);

  char_recognition( pp, job->cfg.mode);
  progress(60,pc); /* progress is only estimated */
//  if(job->cfg.verbose&32) debug_img("out17",job,4+8);

  if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */
    /* may be, characters/pictures have changed line number */
    list_sort(&(job->res.boxlist), sort_box_func);
    // 2nd recognition call if lines are adjusted
    char_recognition( pp, job->cfg.mode);
  }

#define BlownUpDrawing 0     /* german: Explosionszeichnung, temporarly */
#if     BlownUpDrawing == 1  /* german: Explosionszeichnung */
{ /* just for debugging */
  int i,ii,ni; struct box *box2;
  i=ii=ni=0;
  for_each_data(&(job->res.boxlist)) { /* count boxes */
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c==UNKNOWN)  i++;
    if (box2->c==PICTURE) ii++;
    ni++;
  } end_for_each(&(job->res.boxlist)); 
  if (job->cfg.verbose)
    fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni);
}
#endif
  // ----------- write out20.pgm ----------- mark lines + boxes
  if (job->cfg.verbose&32) debug_img("out20",job,1+4+8);

  compare_unknown_with_known_chars( pp, job->cfg.mode);
  progress(70,pc); /* progress is only estimated */

  try_to_divide_boxes( pp, job->cfg.mode);
  progress(80,pc); /* progress is only estimated */

  /* --- list output ---- for debugging --- */
  if (job->cfg.verbose&6) output_list(job);

  /* ---- insert spaces ---- */
  list_insert_spaces( pp , job );

  // ---- proof difficult chars Il1 by context view ----
  if (job->cfg.verbose)
    fprintf(stderr,"# context correction if !(mode&32)\n");
  if (!(job->cfg.mode&32)) context_correction( job );
  
  store_boxtree_lines( job, job->cfg.mode );
  progress(90,pc); /* progress is only estimated */

/* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?)
 * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz
 *  awk 'BEGIN{num=0}/1<\/box>/{num++;}END{print num}' o
 * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes
 *  9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized
 * 1*1 1*7 not recognized (Oct04)
 *  33*SPC 76*NL = 109 spaces + 36*unknown sum=241 * 16 missed
 */
#if     BlownUpDrawing == 1  /* german: Explosionszeichnung */
{ /* just for debugging */
  int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK";
    i=ii=ni=0;
  for_each_data(&(job->res.boxlist)) { /* count boxes */
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c==UNKNOWN)  i++;
    if (box2->c==PICTURE) ii++;
    if (box2->c>' ' && box2->c<='z') ni++;
  } end_for_each(&(job->res.boxlist)); 
  if(job->cfg.verbose)
    fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni);
  for (i=0;i<20;i++) {
    ni=0;
    for_each_data(&(job->res.boxlist)) { /* count boxes */
      box2 = (struct box *)list_get_current(&(job->res.boxlist));
      if (box2->c==testc[i]) ni++;
    } end_for_each(&(job->res.boxlist)); 
    if(job->cfg.verbose && ni>0)
      fprintf(stderr," (%c)=%d",testc[i],ni);
  }
  if(job->cfg.verbose)
    fprintf(stderr,"\n");
}
#endif

  // ---- frame-size-histogram
  // ---- (my own defined) distance between letters
  // ---- write internal picture of textsite
  // ----------- write out30.pgm -----------
  if( job->cfg.verbose&32 ) debug_img("out30",job,2+4);
    
  progress(100,pc); /* progress is only estimated */

  close_progress(pc);
  
  return 0;     /* what should I return? error-state? num-of-chars? */
}

 

2,Scan boxes分析:

流程:從上往下,分別在X,Y軸方向投影,得到box list。

helloworld

3,去除噪點:

/* ---- remove dust ---------------------------------
   What is dust? I think, this is a very small pixel cluster without
   neighbours. Of course not all dust clusters can be detected correct.
   This feature should be possible to switch off via option.
   -> may be, all clusters should be stored here?
   speed is very slow, I know, but I am happy that it is working well
*/

 

4,detect barcode and pictures , remove pictures:

圖片:所有box的平均寬度爲avgwidth,平均高度爲avgheight;滿足 box.width > 4*avgwidth || box.height > 4*avgheight 條件,並且大小相近的box少於4個的box,被認爲是圖像box。

5,glue holes inside chars:

/* ---- join holes to chars( before step1 ) v0.42  -----------------------
   join boxes lying inside another box (usually holes, ex: "aeobdg46890")
   Dont add dust to a char!
   lines are not detected yet
*/

 

6,detect rotation angle:

/*
** Detect rotation angle (one for whole image)
** old: longest text-line and determining the angle of this line.
 *
 * search right nearest neighbour of each box and average vectors
 * to get the text orientation,
 * upside down decision is not made here (I dont know how to do it)
 *  ToDo: set job->res.lines.{dx,dy}
 * pass 1: get mean vector to nearest char
 * pass 2: get mean vector to nearest char without outriders to pass 1
 * estimate direction as (dx,dy,num)[pass]
 * ToDo: estimate an error, boxes only work fine for zero-rotation
 *       for 45 degree use vectors, not boxes to get base line
 */

 

7,detect text lines:

http://en.wikipedia.org/wiki/Cap_height

8,measure pitch:

估計空格的寬度。

9,識別字符:

gocr的識別不是機器學習式的,沒有training過程,完全依靠先驗的規則,所以只能識別英文字符、數字、標點等。識別過程主要是一條filter鏈路:每一個filter判斷box是否是某個字符,若是則略過後續filter。

a,從box外引出一條射線從某個方向(左,右,上,下)某個座標(x,y)向box內部,第一個交點位置必須符合某個字符的先驗規則;

ray

代碼:

/* Walk through image p starting at (x,y), one pixel per step in
 * direction r (UP: y-1, DO: y+1, LE: x-1, RI: x+1).
 * The walk ends at the first pixel for which ((pixel < cs) XOR col)
 * is true, when the image border is left, or after l steps.
 * Returns the number of steps done; 0 if (x,y) lies outside the image
 * or r is none of the four directions. */
int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){ 
  int step_x = 0, step_y = 0;
  int steps = 0;

  /* a start point outside the image yields no steps */
  if (x < 0 || y < 0 || x >= p->x || y >= p->y)
    return 0;

  switch (r) {
  case UP: step_y = -1; break;
  case DO: step_y =  1; break;
  case LE: step_x = -1; break;
  case RI: step_x =  1; break;
  default: return 0;
  }

  /* only one coordinate moves, in one direction, so the full bounds
     test is equivalent to a single per-direction border test */
  while (steps < l && x >= 0 && y >= 0 && x < p->x && y < p->y) {
    if ( (getpixel(p,x,y)<cs)^col )
      break;
    steps++;
    x += step_x;
    y += step_y;
  }
  return steps;
}

b,通過box的一條直線與字符的交點個數必須符合某個字符的先驗規則,算法:計算這樣的點(如從左向右:Pixel(x,y) = white && Pixel(x+1,y) = black ) 的個數

line

代碼:

/* Count white-to-black transitions along the straight line from
 * (x0,y0) to (x1,y1) in image p; a pixel counts as black when its
 * value is below cs. The line is sampled at max(|dx|,|dy|)+1 points
 * using integer interpolation; the colour before the first sample is
 * taken as white, so a black first pixel counts as one crossing.
 * Returns the number of transitions. */
int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) {
  const int dx = x1 - x0;
  const int dy = y1 - y0;
  const int steps = MAX(abs(dx), abs(dy));
  int crossings = 0;
  int prev_black = 0;        /* colour of the previous sample, 0=white */
  int i;

  for (i = 0; i <= steps; i++) {
    int x = x0;
    int y = y0;
    int is_black;

    if (steps != 0) {        /* single-point line: stay on (x0,y0) */
      x = x0 + i * dx / steps;
      y = y0 + i * dy / steps;
    }
    is_black = (getpixel(p, x, y) < cs) ? 1 : 0;
    if (is_black == 1 && prev_black == 0)
      crossings++;
    prev_black = is_black;
  }
  return crossings;
}

c,孔洞的個數必須符合某個字符的先驗規則,比如A有一個洞;這一步只是判斷,實際工作在第5步已經完成。

hole

d,如下面識別「}」的代碼:

意思是橫穿過dy條線,全部線與字符的交點個數均爲1;在字符的前半面,豎直穿過dx/2條線,交點個數均爲2,即左凸起部分;等等。

//  --------- test {} --------------------------------
//  Candidate test for '}' (c_ask and Setac below both use '}').
//  Entered only for boxes with dx>2, dy>5 and 2*dy>3*dx. The for
//  statement has no increment: the body leaves through break / Break
//  (Break is a macro defined elsewhere; presumably it also leaves
//  this loop -- TODO confirm). ad starts at 99 and is scaled down by
//  the soft checks; it is passed to Setac at the end.
   for(ad=d=99;dx>2 && dy>5 && 2*dy>3*dx;){
      DBG( wchar_t c_ask='}'; )
      if (!hchar) ad=97*ad/100; 
      // every horizontal line must cross the glyph exactly once
      for(y=0;y<dy;y++){
        if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break;
      } if (y<dy) Break;
      // every vertical line in the left half must cross exactly twice
      for(x=0;x<dx/2;x++){
        if( num_cross(x,x,0,dy-1,bp,cs) != 2 ) break;
      } if (x<dx/2) Break; // fixed: tested y before, which could never be < dx/2 here
      // middle part of the left border: no crossing; right border: one
      if ( num_cross(   0,   0,dy/4,dy-1-dy/4,bp,cs) != 0 ) Break;
      if ( num_cross(dx-1,dx-1,dy/4,dy-1-dy/4,bp,cs) != 1 ) Break;
      // i1: stroke thickness seen from the right at row dy/4
      i1=loop(bp,dx-1   ,dy/4,dx,cs,0,LE);
      i1=loop(bp,dx-1-i1,dy/4,dx,cs,1,LE); // thickness1
      // find the row near dy/2 with the smallest gap from the right edge
      for (i2=dx,i3=y=dy/2-1-dy/16;y<dy/2+2+dy/16;y++)
       { x=loop(bp,dx-1   , y,dx,cs,0,LE); if (x<i2) {i2=x;i3=y;} }
      i2=  loop(bp,dx-1-i2,i3,dx,cs,1,LE); // thickness2
      // the stroke at that row must be clearly thicker than thickness1
      if (i2<i1+dx/16+1) Break;
      if ( loop(bp,dx-1,dy-1,dx,cs,0,LE)>3*dx/4 ) {ad=99*ad/100;MSG({})}
      if ( loop(bp,dx-1,   0,dx,cs,0,LE)>3*dx/4 ) {ad=99*ad/100;MSG({})} // >
      if ( loop(bp,dx-1,   0,dy,cs,0,DO)<dy/2-dy/8-1 ) {ad=98*ad/100;MSG({})}
      if ( loop(bp,dx-1,dy-1,dy,cs,0,UP)<dy/2-dy/8-1 ) {ad=98*ad/100;MSG({})} // )
      if ( loop(bp,dx-1,   0,dy,cs,0,DO)<=dy/4) Break;
      if (dy>=8)
      if (   loop(bp,0,   0,dx,cs,0,RI)
         +   loop(bp,0,dy/4,dx,cs,0,RI)
         - 2*loop(bp,0,dy/8,dx,cs,0,RI) >=dx/8 ) {ad=98*ad/100;MSG({})} // <
      if ( loop(bp,1,dy-1,dy,cs,0,UP)>dy/4 ) Break; // ???
      // get_bw is defined elsewhere; presumably it reports whether a
      // black pixel exists on the given segment -- TODO confirm
      if ( get_bw(x1,x1,y0,y0+dy/4,box1->p,cs,1) == 1 
        || get_bw(x1,x1,y1-dy/4,y1,box1->p,cs,1) == 1 ) Break;
      Setac(box1,(bc='}'),ad);break;
   }

10,compare_unknown_with_known_chars try_to_divide_boxes等後處理;

11,list_insert_spaces 插入空格;

12,store_boxtree_lines;

13,輸出識別結果。

相關文章
相關標籤/搜索