#include "varable.h"
#include "ocrblock.h"
#include "ocrrow.h"
#include "notdll.h"
Go to the source code of this file.
void apply_box_testing | ( | BLOCK_LIST * | block_list | ) |
Test for a TESS screw-up on a word.
block_list | List of blocks |
Definition at line 789 of file applybox.cpp.
References ASSERT_HOST, WERD::blob_list(), count(), fix_quotes(), WERD::gblob_list(), make_bln_copy(), tess_default_matcher(), tess_segment_pass1(), WERD::text(), tprintf(), word_count, ROW::word_list(), and ROW::x_height().
00789 { 00790 BLOCK_IT block_it(block_list); 00791 ROW_IT row_it; 00792 ROW *row; 00793 INT16 row_count = 0; 00794 WERD_IT word_it; 00795 WERD *word; 00796 WERD *bln_word; 00797 INT16 word_count = 0; 00798 PBLOB_IT blob_it; 00799 DENORM denorm; 00800 INT16 count = 0; 00801 char ch[2]; 00802 WERD *outword; //bln best choice 00803 //segmentation 00804 WERD_CHOICE *best_choice; //tess output 00805 WERD_CHOICE *raw_choice; //top choice permuter 00806 //detailed results 00807 BLOB_CHOICE_LIST_CLIST blob_choices; 00808 INT16 char_count = 0; 00809 INT16 correct_count = 0; 00810 INT16 err_count = 0; 00811 INT16 rej_count = 0; 00812 #ifndef SECURE_NAMES 00813 WERDSTATS wordstats; //As from newdiff 00814 #endif 00815 char tess_rej_str[3]; 00816 char tess_long_str[3]; 00817 00818 ch[1] = '\0'; 00819 strcpy (tess_rej_str, "|A"); 00820 strcpy (tess_long_str, "|B"); 00821 00822 for (block_it.mark_cycle_pt (); 00823 !block_it.cycled_list (); block_it.forward ()) { 00824 row_it.set_to_list (block_it.data ()->row_list ()); 00825 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00826 row = row_it.data (); 00827 row_count++; 00828 word_count = 0; 00829 word_it.set_to_list (row->word_list ()); 00830 for (word_it.mark_cycle_pt (); 00831 !word_it.cycled_list (); word_it.forward ()) { 00832 word = word_it.data (); 00833 word_count++; 00834 if ((strlen (word->text ()) == 1) && 00835 !STRING (applybox_test_exclusions).contains (*word->text ()) 00836 && (word->gblob_list ()->length () == 1)) { 00837 /* Here is a word with a single char label and a single blob so test it */ 00838 bln_word = 00839 make_bln_copy (word, row, row->x_height (), &denorm); 00840 blob_it.set_to_list (bln_word->blob_list ()); 00841 ch[0] = *word->text (); 00842 char_count++; 00843 best_choice = tess_segment_pass1 (bln_word, 00844 &denorm, 00845 tess_default_matcher, 00846 raw_choice, 00847 &blob_choices, outword); 00848 00849 /* Test for TESS screw up on word. 
*/ 00850 if ((best_choice->string ().length () == 0) || 00851 (strspn (best_choice->string ().string (), " ") == 00852 best_choice->string ().length ())) { 00853 rej_count++; 00854 tprintf ("%d:%d: \"%s\" -> TESS FAILED\n", 00855 row_count, word_count, ch); 00856 #ifndef SECURE_NAMES 00857 wordstats.word (tess_rej_str, 2, ch, 1); 00858 #endif 00859 } 00860 else { 00861 if ((best_choice->string ().length () != 00862 outword->blob_list ()->length ()) || 00863 (best_choice->string ().length () != 00864 blob_choices.length ())) { 00865 tprintf 00866 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", 00867 best_choice->string ().string (), 00868 best_choice->string ().length (), 00869 outword->blob_list ()->length (), 00870 blob_choices.length ()); 00871 } 00872 ASSERT_HOST (best_choice->string ().length () == 00873 outword->blob_list ()->length ()); 00874 ASSERT_HOST (best_choice->string ().length () == 00875 blob_choices.length ()); 00876 fix_quotes ((char *) best_choice->string ().string (), 00877 //turn to double 00878 outword, &blob_choices); 00879 if (strcmp (best_choice->string ().string (), ch) != 0) { 00880 err_count++; 00881 tprintf ("%d:%d: \"%s\" -> \"%s\"\n", 00882 row_count, word_count, ch, 00883 best_choice->string ().string ()); 00884 } 00885 else 00886 correct_count++; 00887 #ifndef SECURE_NAMES 00888 if (best_choice->string ().length () > 2) 00889 wordstats.word (tess_long_str, 2, ch, 1); 00890 else 00891 wordstats.word ((char *) best_choice->string (). 
00892 string (), 00893 best_choice->string ().length (), ch, 00894 1); 00895 #endif 00896 } 00897 delete bln_word; 00898 delete outword; 00899 delete best_choice; 00900 delete raw_choice; 00901 blob_choices.deep_clear (); 00902 count++; 00903 } 00904 } 00905 } 00906 } 00907 #ifndef SECURE_NAMES 00908 wordstats.print (1, 100.0); 00909 wordstats.conf_matrix (); 00910 tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n", 00911 char_count, correct_count, rej_count, err_count); 00912 #endif 00913 }
void apply_box_training | ( | BLOCK_LIST * | block_list | ) |
Generates training data.
block_list | List of blocks |
Definition at line 727 of file applybox.cpp.
References WERD::baseline_denormalise(), WERD::blob_list(), count(), WERD::gblob_list(), make_bln_copy(), NULL, tess_training_tester(), WERD::text(), tprintf(), TRUE, ROW::word_list(), and ROW::x_height().
Referenced by TessBaseAPI::Recognize().
00727 { 00728 BLOCK_IT block_it(block_list); 00729 ROW_IT row_it; 00730 ROW *row; 00731 WERD_IT word_it; 00732 WERD *word; 00733 WERD *bln_word; 00734 WERD copy_outword; // copy to denorm 00735 PBLOB_IT blob_it; 00736 DENORM denorm; 00737 INT16 count = 0; 00738 char ch[2]; 00739 00740 ch[1] = '\0'; 00741 00742 tprintf ("Generating training data\n"); 00743 for (block_it.mark_cycle_pt (); 00744 !block_it.cycled_list (); block_it.forward ()) { 00745 row_it.set_to_list (block_it.data ()->row_list ()); 00746 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00747 row = row_it.data (); 00748 word_it.set_to_list (row->word_list ()); 00749 for (word_it.mark_cycle_pt (); 00750 !word_it.cycled_list (); word_it.forward ()) { 00751 word = word_it.data (); 00752 if ((strlen (word->text ()) == 1) && 00753 (word->gblob_list ()->length () == 1)) { 00754 /* Here is a word with a single char label and a single blob so train on it */ 00755 bln_word = 00756 make_bln_copy (word, row, row->x_height (), &denorm); 00757 blob_it.set_to_list (bln_word->blob_list ()); 00758 ch[0] = *word->text (); 00759 tess_training_tester (blob_it.data (), 00760 //single blob 00761 &denorm, TRUE, //correct 00762 ch, //correct ASCII char 00763 1, //ASCII length 00764 NULL); 00765 copy_outword = *(bln_word); 00766 copy_outword.baseline_denormalise (&denorm); 00767 blob_it.set_to_list (copy_outword.blob_list ()); 00768 ch[0] = *word->text (); 00769 delete bln_word; 00770 count++; 00771 } 00772 } 00773 } 00774 } 00775 tprintf ("Generated training data for %d blobs\n", count); 00776 }
void apply_boxes | ( | BLOCK_LIST * | block_list | ) |
Reassigns outlines to form words each with ONE labelled blob.
block_list | Real blocks |
The box file is assumed to contain box definitions, one per line, of the following format:
<Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
The approach taken is to search the WHOLE page for stuff overlapping each box.
A warning is given if the box is on the same row as the previous box, but NOT on the same row as the previous blob.
Any OUTLINE which overlaps the box is put into the new word.
Ascender chars must ascend above xht significantly; xht chars must not rise above row xht significantly; bl chars must not descend below baseline significantly; descender chars must descend below baseline significantly.
?? Certain chars are DROPPED - to limit the training data.
Definition at line 100 of file applybox.cpp.
References CANTOPENFILE, clear_any_old_text(), ERRCODE::error(), EXIT, find_row_of_box(), imagefile, BOX::left(), MAX_INT16, NULL, read_next_box(), report_failed_box(), resegment_box(), BOX::right(), STRING::string(), tidy_up(), and tprintf().
Referenced by TessBaseAPI::Recognize().
00101 { 00102 INT16 boxfile_lineno = 0; 00103 INT16 boxfile_charno = 0; 00104 BOX box; //boxfile box 00105 char ch[2]; //correct ch from boxfile 00106 ROW *row; 00107 ROW *prev_row = NULL; 00108 INT16 prev_box_right = MAX_INT16; 00109 INT16 block_id; 00110 INT16 row_id; 00111 INT16 box_count = 0; 00112 INT16 box_failures = 0; 00113 INT16 labels_ok; 00114 INT16 rows_ok; 00115 INT16 bad_blobs; 00116 INT16 tgt_char_counts[128]; //No. of box samples 00117 // INT16 labelled_char_counts[128]; //No. of unique labelled samples 00118 INT16 i; 00119 INT16 rebalance_count = 0; 00120 char min_char; 00121 INT16 min_samples; 00122 INT16 final_labelled_blob_count; 00123 00124 for (i = 0; i < 128; i++) 00125 tgt_char_counts[i] = 0; 00126 00127 FILE* box_file; 00128 STRING filename = imagefile; 00129 filename += ".box"; 00130 if (!(box_file = fopen (filename.string(), "r"))) { 00131 CANTOPENFILE.error ("read_next_box", EXIT, 00132 "Cant open box file %s %d", 00133 filename.string(), errno); 00134 } 00135 00136 ch[1] = '\0'; 00137 clear_any_old_text(block_list); 00138 while (read_next_box (box_file, &box, &ch[0])) { 00139 box_count++; 00140 tgt_char_counts[ch[0]]++; 00141 row = find_row_of_box (block_list, box, block_id, row_id); 00142 if (box.left () < prev_box_right) { 00143 boxfile_lineno++; 00144 boxfile_charno = 1; 00145 } 00146 else 00147 boxfile_charno++; 00148 00149 if (row == NULL) { 00150 box_failures++; 00151 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00152 "FAILURE! box overlaps no blobs or blobs in multiple rows"); 00153 } 00154 else { 00155 if ((box.left () >= prev_box_right) && (row != prev_row)) 00156 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00157 "WARNING! 
false row break"); 00158 box_failures += resegment_box (row, box, ch, block_id, row_id, 00159 boxfile_lineno, boxfile_charno); 00160 prev_row = row; 00161 } 00162 prev_box_right = box.right (); 00163 } 00164 tidy_up(block_list, 00165 labels_ok, 00166 rows_ok, 00167 bad_blobs, 00168 tgt_char_counts, 00169 rebalance_count, 00170 min_char, 00171 min_samples, 00172 final_labelled_blob_count); 00173 tprintf ("APPLY_BOXES:\n"); 00174 tprintf (" Boxes read from boxfile: %6d\n", box_count); 00175 tprintf (" Initially labelled blobs: %6d in %d rows\n", 00176 labels_ok, rows_ok); 00177 tprintf (" Box failures detected: %6d\n", box_failures); 00178 tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count); 00179 tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples); 00180 tprintf (" Total unlabelled words: %6d\n", 00181 bad_blobs); 00182 tprintf (" Final labelled words: %6d\n", 00183 final_labelled_blob_count); 00184 }
void clear_any_old_text | ( | BLOCK_LIST * | block_list | ) |
Remove correct text
Definition at line 190 of file applybox.cpp.
Referenced by apply_boxes().
00192 { 00193 BLOCK_IT block_it(block_list); 00194 ROW_IT row_it; 00195 WERD_IT word_it; 00196 00197 for (block_it.mark_cycle_pt (); 00198 !block_it.cycled_list (); block_it.forward ()) { 00199 row_it.set_to_list (block_it.data ()->row_list ()); 00200 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00201 word_it.set_to_list (row_it.data ()->word_list ()); 00202 for (word_it.mark_cycle_pt (); 00203 !word_it.cycled_list (); word_it.forward ()) { 00204 word_it.data ()->set_text (""); 00205 } 00206 } 00207 } 00208 }
ROW* find_row_of_box | ( | BLOCK_LIST * | block_list, | |
BOX | box, | |||
INT16 & | block_id, | |||
INT16 & | row_id_to_process | |||
) |
Find row to process.
Error if box REALLY overlaps more than one row. That is, if it overlaps blobs in the row, not just overlaps the bounding box of the whole row.
Definition at line 263 of file applybox.cpp.
References WERD::bounding_box(), ROW::bounding_box(), WERD::flag(), gblob_bounding_box(), WERD::gblob_list(), gblob_out_list(), goutline_bounding_box(), NULL, outline_it, BOX::overlap(), W_POLYGON, and ROW::word_list().
Referenced by apply_boxes().
00267 { 00268 BLOCK_IT block_it(block_list); 00269 BLOCK *block; 00270 ROW_IT row_it; 00271 ROW *row; 00272 ROW *row_to_process = NULL; 00273 INT16 row_id; 00274 WERD_IT word_it; 00275 WERD *word; 00276 BOOL8 polyg; 00277 PBLOB_IT blob_it; 00278 PBLOB *blob; 00279 OUTLINE_IT outline_it; 00280 OUTLINE *outline; 00281 00282 /* 00283 Find row to process - error if box REALLY overlaps more than one row. (I.e 00284 it overlaps blobs in the row - not just overlaps the bounding box of the 00285 whole row.) 00286 */ 00287 00288 block_id = 0; 00289 for (block_it.mark_cycle_pt (); 00290 !block_it.cycled_list (); block_it.forward ()) { 00291 block_id++; 00292 row_id = 0; 00293 block = block_it.data (); 00294 if (block->bounding_box ().overlap (box)) { 00295 row_it.set_to_list (block->row_list ()); 00296 for (row_it.mark_cycle_pt (); 00297 !row_it.cycled_list (); row_it.forward ()) { 00298 row_id++; 00299 row = row_it.data (); 00300 if (row->bounding_box ().overlap (box)) { 00301 word_it.set_to_list (row->word_list ()); 00302 for (word_it.mark_cycle_pt (); 00303 !word_it.cycled_list (); word_it.forward ()) { 00304 word = word_it.data (); 00305 polyg = word->flag (W_POLYGON); 00306 if (word->bounding_box ().overlap (box)) { 00307 blob_it.set_to_list (word->gblob_list ()); 00308 for (blob_it.mark_cycle_pt (); 00309 !blob_it.cycled_list (); blob_it.forward ()) { 00310 blob = blob_it.data (); 00311 if (gblob_bounding_box (blob, polyg). 00312 overlap (box)) { 00313 outline_it. 
00314 set_to_list (gblob_out_list 00315 (blob, polyg)); 00316 for (outline_it.mark_cycle_pt (); 00317 !outline_it.cycled_list (); 00318 outline_it.forward ()) { 00319 outline = outline_it.data (); 00320 if (goutline_bounding_box 00321 (outline, polyg).major_overlap (box)) { 00322 if ((row_to_process == NULL) || 00323 (row_to_process == row)) { 00324 row_to_process = row; 00325 row_id_to_process = row_id; 00326 } 00327 else 00328 /* RETURN ERROR Box overlaps blobs in more than one row */ 00329 return NULL; 00330 } 00331 } 00332 } 00333 } 00334 } 00335 } 00336 } 00337 } 00338 } 00339 } 00340 return row_to_process; 00341 }
read_next_box(): Read another box from file <outputbase>.box
The box file is assumed to contain box definitions, one per line, of the following format:
<Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
Definition at line 220 of file applybox.cpp.
References count(), FALSE, INT32FORMAT, tprintf(), and TRUE.
Referenced by apply_boxes().
00222 { 00223 char buff[256]; //boxfile read buffer 00224 char *buffptr = buff; 00225 STRING box_filename; 00226 static INT16 line = 0; 00227 INT32 x_min; 00228 INT32 y_min; 00229 INT32 x_max; 00230 INT32 y_max; 00231 INT32 count = 0; 00232 00233 while (!feof (box_file)) { 00234 fgets (buff, sizeof (buff) - 1, box_file); 00235 line++; 00236 00237 /* Check for blank lines in box file */ 00238 for (buffptr = buff; isspace (*buffptr); buffptr++) 00239 ; 00240 if (*buffptr != '\0') { 00241 count = 00242 sscanf (buff, 00243 "%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " " 00244 INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max); 00245 if (count != 5) { 00246 tprintf ("Box file format error on line %i ignored\n", line); 00247 } 00248 else { 00249 *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max)); 00250 return TRUE; //read a box ok 00251 } 00252 } 00253 } 00254 return FALSE; //EOF 00255 }
void report_failed_box | ( | INT16 | boxfile_lineno, | |
INT16 | boxfile_charno, | |||
BOX | box, | |||
char * | box_ch, | |||
const char * | err_msg | |||
) |
Print info on failed box.
Output is printed only if the global variable applybox_debug > 4.
Definition at line 707 of file applybox.cpp.
References BOX::bottom(), BOX::left(), BOX::right(), BOX::top(), and tprintf().
Referenced by apply_boxes(), and resegment_box().
00711 { 00712 if (applybox_debug > 4) 00713 tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n", 00714 boxfile_lineno, 00715 boxfile_charno, 00716 box_ch, 00717 box.left (), box.bottom (), box.right (), box.top (), err_msg); 00718 }
INT16 resegment_box | ( | ROW * | row, | |
BOX | box, | |||
char * | ch, | |||
INT16 | block_id, | |||
INT16 | row_id, | |||
INT16 | boxfile_lineno, | |||
INT16 | boxfile_charno | |||
) |
Resegments blobs in box, checking that letters follow prescribed rules.
The error messages show what's going on: box overlaps blob in labelled word; caps-ht char didn't ascend; Odd top char below xht; x-ht char didn't have top near xht; non-ambig BL char didnt have bottom near baseline; Odd bottom char above baseline; Descender doesn't descend; and, finally, Couldn't find any blobs
Definition at line 351 of file applybox.cpp.
References ROW::base_line(), baseline, BOX::bottom(), WERD::bounding_box(), dummy, f, WERD::flag(), gblob_bounding_box(), WERD::gblob_list(), gblob_out_list(), gblob_sort_list(), goutline_bounding_box(), BOX::left(), NULL, outline_it, BOX::overlap(), report_failed_box(), BOX::right(), WERD::set_text(), WERD::shallow_copy(), WERD::text(), BOX::top(), tprintf(), W_POLYGON, ROW::word_list(), and ROW::x_height().
Referenced by apply_boxes().
00358 { 00359 WERD_IT word_it; 00360 WERD *word; 00361 WERD *new_word = NULL; 00362 BOOL8 polyg = false; 00363 PBLOB_IT blob_it; 00364 PBLOB_IT new_blob_it; 00365 PBLOB *blob; 00366 PBLOB *new_blob; 00367 OUTLINE_IT outline_it; 00368 OUTLINE_LIST dummy; // Just to initialize new_outline_it. 00369 OUTLINE_IT new_outline_it = &dummy; 00370 OUTLINE *outline; 00371 BOX new_word_box; 00372 float word_x_centre; 00373 float baseline; 00374 INT16 error_count = 0; //number of chars lost 00375 00376 word_it.set_to_list (row->word_list ()); 00377 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00378 word = word_it.data (); 00379 polyg = word->flag (W_POLYGON); 00380 if (word->bounding_box ().overlap (box)) { 00381 blob_it.set_to_list (word->gblob_list ()); 00382 for (blob_it.mark_cycle_pt (); 00383 !blob_it.cycled_list (); blob_it.forward ()) { 00384 blob = blob_it.data (); 00385 if (gblob_bounding_box (blob, polyg).overlap (box)) { 00386 outline_it.set_to_list (gblob_out_list (blob, polyg)); 00387 for (outline_it.mark_cycle_pt (); 00388 !outline_it.cycled_list (); outline_it.forward ()) { 00389 outline = outline_it.data (); 00390 if (goutline_bounding_box (outline, polyg). 00391 major_overlap (box)) { 00392 if (strlen (word->text ()) > 0) { 00393 if (error_count == 0) { 00394 error_count = 1; 00395 if (applybox_debug > 4) 00396 report_failed_box (boxfile_lineno, 00397 boxfile_charno, 00398 box, ch, 00399 "FAILURE! 
box overlaps blob in labelled word"); 00400 } 00401 if (applybox_debug > 4) 00402 tprintf 00403 ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n", 00404 block_id, row_id, 00405 word_it.data ()->text ()); 00406 word_it.data ()->set_text (""); 00407 //UN label it 00408 error_count++; 00409 } 00410 00411 if (error_count == 0) { 00412 if (new_word == NULL) { 00413 /* Make a new word with a single blob */ 00414 new_word = word->shallow_copy (); 00415 new_word->set_text (ch); 00416 if (polyg) 00417 new_blob = new PBLOB; 00418 else 00419 new_blob = (PBLOB *) new C_BLOB; 00420 new_blob_it.set_to_list (new_word-> 00421 gblob_list ()); 00422 new_blob_it.add_to_end (new_blob); 00423 new_outline_it. 00424 set_to_list (gblob_out_list 00425 (new_blob, polyg)); 00426 } 00427 new_outline_it.add_to_end (outline_it. 00428 extract ()); 00429 //move blob 00430 } 00431 } 00432 } 00433 //no outlines in blob 00434 if (outline_it.empty ()) 00435 //so delete blob 00436 delete blob_it.extract (); 00437 } 00438 } 00439 if (blob_it.empty ()) //no blobs in word 00440 //so delete word 00441 delete word_it.extract (); 00442 } 00443 } 00444 if (error_count > 0) 00445 return error_count; 00446 00447 if (new_word != NULL) { 00448 gblob_sort_list (new_word->gblob_list (), polyg); 00449 word_it.add_to_end (new_word); 00450 new_word_box = new_word->bounding_box (); 00451 word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f; 00452 baseline = row->base_line (word_x_centre); 00453 00454 if (STRING (chs_caps_ht).contains (ch[0]) && 00455 (new_word_box.top () < 00456 baseline + (1 + applybox_error_band) * row->x_height ())) { 00457 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00458 "FAILURE! 
caps-ht char didn't ascend"); 00459 new_word->set_text (""); 00460 return 1; 00461 } 00462 if (STRING (chs_odd_top).contains (ch[0]) && 00463 (new_word_box.top () < 00464 baseline + (1 - applybox_error_band) * row->x_height ())) { 00465 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00466 "FAILURE! Odd top char below xht"); 00467 new_word->set_text (""); 00468 return 1; 00469 } 00470 if (STRING (chs_x_ht).contains (ch[0]) && 00471 ((new_word_box.top () > 00472 baseline + (1 + applybox_error_band) * row->x_height ()) || 00473 (new_word_box.top () < 00474 baseline + (1 - applybox_error_band) * row->x_height ()))) { 00475 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00476 "FAILURE! x-ht char didn't have top near xht"); 00477 new_word->set_text (""); 00478 return 1; 00479 } 00480 if (STRING (chs_non_ambig_bl).contains (ch[0]) && 00481 ((new_word_box.bottom () < 00482 baseline - applybox_error_band * row->x_height ()) || 00483 (new_word_box.bottom () > 00484 baseline + applybox_error_band * row->x_height ()))) { 00485 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00486 "FAILURE! non ambig BL char didnt have bottom near baseline"); 00487 new_word->set_text (""); 00488 return 1; 00489 } 00490 if (STRING (chs_odd_bot).contains (ch[0]) && 00491 (new_word_box.bottom () > 00492 baseline + applybox_error_band * row->x_height ())) { 00493 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00494 "FAILURE! Odd bottom char above baseline"); 00495 new_word->set_text (""); 00496 return 1; 00497 } 00498 if (STRING (chs_desc).contains (ch[0]) && 00499 (new_word_box.bottom () > 00500 baseline - applybox_error_band * row->x_height ())) { 00501 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00502 "FAILURE! Descender doesn't descend"); 00503 new_word->set_text (""); 00504 return 1; 00505 } 00506 return 0; 00507 } 00508 else { 00509 report_failed_box (boxfile_lineno, boxfile_charno, box, ch, 00510 "FAILURE! 
Couldn't find any blobs"); 00511 return 1; 00512 } 00513 }
void tidy_up | ( | BLOCK_LIST * | block_list, | |
INT16 & | ok_char_count, | |||
INT16 & | ok_row_count, | |||
INT16 & | unlabelled_words, | |||
INT16 * | tgt_char_counts, | |||
INT16 & | rebalance_count, | |||
char & | min_char, | |||
INT16 & | min_samples, | |||
INT16 & | final_labelled_blob_count | |||
) |
Tidy up blocks; apply boxes.
Definition at line 525 of file applybox.cpp.
References WERD::bounding_box(), FALSE, WERD::gblob_list(), BOX::left(), NULL, BOX::print(), WERD::text(), tprintf(), TRUE, word_comparator(), and ROW::word_list().
Referenced by apply_boxes().
00534 { 00535 BLOCK_IT block_it(block_list); 00536 ROW_IT row_it; 00537 ROW *row; 00538 WERD_IT word_it; 00539 WERD *word; 00540 WERD *duplicate_word; 00541 INT16 block_idx = 0; 00542 INT16 row_idx; 00543 INT16 all_row_idx = 0; 00544 BOOL8 row_ok; 00545 BOOL8 rebalance_needed = FALSE; 00546 //No. of unique labelled samples 00547 INT16 labelled_char_counts[128]; 00548 INT16 i; 00549 char ch; 00550 char prev_ch = '\0'; 00551 BOOL8 at_dupe_of_prev_word; 00552 ROW *prev_row = NULL; 00553 INT16 left; 00554 INT16 prev_left = -1; 00555 00556 for (i = 0; i < 128; i++) 00557 labelled_char_counts[i] = 0; 00558 00559 ok_char_count = 0; 00560 ok_row_count = 0; 00561 unlabelled_words = 0; 00562 if ((applybox_debug > 4) && (block_it.length () != 1)) 00563 00564 tprintf ("APPLY_BOXES: More than one block??\n"); 00565 00566 for (block_it.mark_cycle_pt (); 00567 !block_it.cycled_list (); block_it.forward ()) { 00568 block_idx++; 00569 row_idx = 0; 00570 row_ok = FALSE; 00571 row_it.set_to_list (block_it.data ()->row_list ()); 00572 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00573 row_idx++; 00574 all_row_idx++; 00575 row = row_it.data (); 00576 word_it.set_to_list (row->word_list ()); 00577 word_it.sort (word_comparator); 00578 for (word_it.mark_cycle_pt (); 00579 !word_it.cycled_list (); word_it.forward ()) { 00580 word = word_it.data (); 00581 if (strlen (word->text ()) == 0) { 00582 unlabelled_words++; 00583 if (applybox_debug > 4) { 00584 tprintf 00585 ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n", 00586 block_idx, row_idx, all_row_idx); 00587 } 00588 } 00589 else { 00590 if (word->gblob_list ()->length () != 1) 00591 tprintf 00592 ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n", 00593 block_idx, row_idx, all_row_idx); 00594 00595 ok_char_count++; 00596 labelled_char_counts[*word->text ()]++; 00597 row_ok = TRUE; 00598 } 00599 } 00600 if ((applybox_debug > 4) && (!row_ok)) { 00601 tprintf 00602 
("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n", 00603 block_idx, row_idx, all_row_idx); 00604 } 00605 else 00606 ok_row_count++; 00607 } 00608 } 00609 00610 min_samples = 9999; 00611 for (i = 0; i < 128; i++) { 00612 if (tgt_char_counts[i] > labelled_char_counts[i]) { 00613 if (labelled_char_counts[i] <= 1) { 00614 tprintf 00615 ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n", 00616 labelled_char_counts[i], (char) i, tgt_char_counts[i]); 00617 } 00618 else { 00619 rebalance_needed = TRUE; 00620 if (applybox_debug > 0) 00621 tprintf 00622 ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n", 00623 (char) i, tgt_char_counts[i], labelled_char_counts[i]); 00624 } 00625 } 00626 if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) { 00627 min_samples = labelled_char_counts[i]; 00628 min_char = (char) i; 00629 } 00630 } 00631 00632 while (applybox_rebalance && rebalance_needed) { 00633 block_it.set_to_list (block_list); 00634 for (block_it.mark_cycle_pt (); 00635 !block_it.cycled_list (); block_it.forward ()) { 00636 row_it.set_to_list (block_it.data ()->row_list ()); 00637 for (row_it.mark_cycle_pt (); 00638 !row_it.cycled_list (); row_it.forward ()) { 00639 row = row_it.data (); 00640 word_it.set_to_list (row->word_list ()); 00641 for (word_it.mark_cycle_pt (); 00642 !word_it.cycled_list (); word_it.forward ()) { 00643 word = word_it.data (); 00644 left = word->bounding_box ().left (); 00645 ch = *word->text (); 00646 at_dupe_of_prev_word = ((row == prev_row) && 00647 (left = prev_left) && 00648 (ch == prev_ch)); 00649 if ((ch != '\0') && 00650 (labelled_char_counts[ch] > 1) && 00651 (tgt_char_counts[ch] > labelled_char_counts[ch]) && 00652 (!at_dupe_of_prev_word)) { 00653 /* Duplicate the word to rebalance the labelled samples */ 00654 if (applybox_debug > 9) { 00655 tprintf ("Duping \"%c\" from ", ch); 00656 word->bounding_box ().print (); 00657 } 00658 duplicate_word = new 
WERD; 00659 *duplicate_word = *word; 00660 word_it.add_after_then_move (duplicate_word); 00661 rebalance_count++; 00662 labelled_char_counts[ch]++; 00663 } 00664 prev_row = row; 00665 prev_left = left; 00666 prev_ch = ch; 00667 } 00668 } 00669 } 00670 rebalance_needed = FALSE; 00671 for (i = 0; i < 128; i++) { 00672 if ((tgt_char_counts[i] > labelled_char_counts[i]) && 00673 (labelled_char_counts[i] > 1)) { 00674 rebalance_needed = TRUE; 00675 break; 00676 } 00677 } 00678 } 00679 00680 /* Now final check - count labelled blobs */ 00681 final_labelled_blob_count = 0; 00682 block_it.set_to_list (block_list); 00683 for (block_it.mark_cycle_pt (); 00684 !block_it.cycled_list (); block_it.forward ()) { 00685 row_it.set_to_list (block_it.data ()->row_list ()); 00686 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00687 row = row_it.data (); 00688 word_it.set_to_list (row->word_list ()); 00689 word_it.sort (word_comparator); 00690 for (word_it.mark_cycle_pt (); 00691 !word_it.cycled_list (); word_it.forward ()) { 00692 word = word_it.data (); 00693 if ((strlen (word->text ()) == 1) && 00694 (word->gblob_list ()->length () == 1)) 00695 final_labelled_blob_count++; 00696 } 00697 } 00698 } 00699 }