00001
00020
00021
00022
00023
00024
00025
00026
00027 #include "mfcpch.h"
00028 #include "applybox.h"
00029 #include <ctype.h>
00030 #include <string.h>
00031 #ifdef __UNIX__
00032 #include <assert.h>
00033 #include <errno.h>
00034 #endif
00035 #include "mainblk.h"
00036 #include "genblob.h"
00037 #include "fixxht.h"
00038 #include "control.h"
00039 #include "tessbox.h"
00040 #include "globals.h"
00041 #include "secname.h"
00042 #ifdef TEXT_VERBOSE
00043 #include "callcpp.h"
00044 #endif
00045
00046 #define SECURE_NAMES
00047 #ifndef SECURE_NAMES
00048 #include "wordstats.h"
00049 #endif
00050
00051 #define EXTERN
00052
00055 EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
00056 EXTERN INT_VAR (applybox_debug, 0, "Debug level");
00057 EXTERN STRING_VAR (applybox_test_exclusions, "|",
00058 "Chars ignored for testing");
00059 EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");
00100 void apply_boxes(BLOCK_LIST *block_list
00101 ) {
00102 INT16 boxfile_lineno = 0;
00103 INT16 boxfile_charno = 0;
00104 BOX box;
00105 char ch[2];
00106 ROW *row;
00107 ROW *prev_row = NULL;
00108 INT16 prev_box_right = MAX_INT16;
00109 INT16 block_id;
00110 INT16 row_id;
00111 INT16 box_count = 0;
00112 INT16 box_failures = 0;
00113 INT16 labels_ok;
00114 INT16 rows_ok;
00115 INT16 bad_blobs;
00116 INT16 tgt_char_counts[128];
00117
00118 INT16 i;
00119 INT16 rebalance_count = 0;
00120 char min_char;
00121 INT16 min_samples;
00122 INT16 final_labelled_blob_count;
00123
00124 for (i = 0; i < 128; i++)
00125 tgt_char_counts[i] = 0;
00126
00127 FILE* box_file;
00128 STRING filename = imagefile;
00129 filename += ".box";
00130 if (!(box_file = fopen (filename.string(), "r"))) {
00131 CANTOPENFILE.error ("read_next_box", EXIT,
00132 "Cant open box file %s %d",
00133 filename.string(), errno);
00134 }
00135
00136 ch[1] = '\0';
00137 clear_any_old_text(block_list);
00138 while (read_next_box (box_file, &box, &ch[0])) {
00139 box_count++;
00140 tgt_char_counts[ch[0]]++;
00141 row = find_row_of_box (block_list, box, block_id, row_id);
00142 if (box.left () < prev_box_right) {
00143 boxfile_lineno++;
00144 boxfile_charno = 1;
00145 }
00146 else
00147 boxfile_charno++;
00148
00149 if (row == NULL) {
00150 box_failures++;
00151 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00152 "FAILURE! box overlaps no blobs or blobs in multiple rows");
00153 }
00154 else {
00155 if ((box.left () >= prev_box_right) && (row != prev_row))
00156 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00157 "WARNING! false row break");
00158 box_failures += resegment_box (row, box, ch, block_id, row_id,
00159 boxfile_lineno, boxfile_charno);
00160 prev_row = row;
00161 }
00162 prev_box_right = box.right ();
00163 }
00164 tidy_up(block_list,
00165 labels_ok,
00166 rows_ok,
00167 bad_blobs,
00168 tgt_char_counts,
00169 rebalance_count,
00170 min_char,
00171 min_samples,
00172 final_labelled_blob_count);
00173 tprintf ("APPLY_BOXES:\n");
00174 tprintf (" Boxes read from boxfile: %6d\n", box_count);
00175 tprintf (" Initially labelled blobs: %6d in %d rows\n",
00176 labels_ok, rows_ok);
00177 tprintf (" Box failures detected: %6d\n", box_failures);
00178 tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count);
00179 tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples);
00180 tprintf (" Total unlabelled words: %6d\n",
00181 bad_blobs);
00182 tprintf (" Final labelled words: %6d\n",
00183 final_labelled_blob_count);
00184 }
00185
00186
00190 void clear_any_old_text(
00191 BLOCK_LIST *block_list
00192 ) {
00193 BLOCK_IT block_it(block_list);
00194 ROW_IT row_it;
00195 WERD_IT word_it;
00196
00197 for (block_it.mark_cycle_pt ();
00198 !block_it.cycled_list (); block_it.forward ()) {
00199 row_it.set_to_list (block_it.data ()->row_list ());
00200 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00201 word_it.set_to_list (row_it.data ()->word_list ());
00202 for (word_it.mark_cycle_pt ();
00203 !word_it.cycled_list (); word_it.forward ()) {
00204 word_it.data ()->set_text ("");
00205 }
00206 }
00207 }
00208 }
00209
00210
00220 BOOL8 read_next_box(FILE* box_file,
00221 BOX *box,
00222 char *ch) {
00223 char buff[256];
00224 char *buffptr = buff;
00225 STRING box_filename;
00226 static INT16 line = 0;
00227 INT32 x_min;
00228 INT32 y_min;
00229 INT32 x_max;
00230 INT32 y_max;
00231 INT32 count = 0;
00232
00233 while (!feof (box_file)) {
00234 fgets (buff, sizeof (buff) - 1, box_file);
00235 line++;
00236
00237
00238 for (buffptr = buff; isspace (*buffptr); buffptr++)
00239 ;
00240 if (*buffptr != '\0') {
00241 count =
00242 sscanf (buff,
00243 "%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
00244 INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
00245 if (count != 5) {
00246 tprintf ("Box file format error on line %i ignored\n", line);
00247 }
00248 else {
00249 *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
00250 return TRUE;
00251 }
00252 }
00253 }
00254 return FALSE;
00255 }
00256
00263 ROW *find_row_of_box(
00264 BLOCK_LIST *block_list,
00265 BOX box,
00266 INT16 &block_id,
00267 INT16 &row_id_to_process) {
00268 BLOCK_IT block_it(block_list);
00269 BLOCK *block;
00270 ROW_IT row_it;
00271 ROW *row;
00272 ROW *row_to_process = NULL;
00273 INT16 row_id;
00274 WERD_IT word_it;
00275 WERD *word;
00276 BOOL8 polyg;
00277 PBLOB_IT blob_it;
00278 PBLOB *blob;
00279 OUTLINE_IT outline_it;
00280 OUTLINE *outline;
00281
00282
00283
00284
00285
00286
00287
00288 block_id = 0;
00289 for (block_it.mark_cycle_pt ();
00290 !block_it.cycled_list (); block_it.forward ()) {
00291 block_id++;
00292 row_id = 0;
00293 block = block_it.data ();
00294 if (block->bounding_box ().overlap (box)) {
00295 row_it.set_to_list (block->row_list ());
00296 for (row_it.mark_cycle_pt ();
00297 !row_it.cycled_list (); row_it.forward ()) {
00298 row_id++;
00299 row = row_it.data ();
00300 if (row->bounding_box ().overlap (box)) {
00301 word_it.set_to_list (row->word_list ());
00302 for (word_it.mark_cycle_pt ();
00303 !word_it.cycled_list (); word_it.forward ()) {
00304 word = word_it.data ();
00305 polyg = word->flag (W_POLYGON);
00306 if (word->bounding_box ().overlap (box)) {
00307 blob_it.set_to_list (word->gblob_list ());
00308 for (blob_it.mark_cycle_pt ();
00309 !blob_it.cycled_list (); blob_it.forward ()) {
00310 blob = blob_it.data ();
00311 if (gblob_bounding_box (blob, polyg).
00312 overlap (box)) {
00313 outline_it.
00314 set_to_list (gblob_out_list
00315 (blob, polyg));
00316 for (outline_it.mark_cycle_pt ();
00317 !outline_it.cycled_list ();
00318 outline_it.forward ()) {
00319 outline = outline_it.data ();
00320 if (goutline_bounding_box
00321 (outline, polyg).major_overlap (box)) {
00322 if ((row_to_process == NULL) ||
00323 (row_to_process == row)) {
00324 row_to_process = row;
00325 row_id_to_process = row_id;
00326 }
00327 else
00328
00329 return NULL;
00330 }
00331 }
00332 }
00333 }
00334 }
00335 }
00336 }
00337 }
00338 }
00339 }
00340 return row_to_process;
00341 }
00342
00351 INT16 resegment_box(
00352 ROW *row,
00353 BOX box,
00354 char *ch,
00355 INT16 block_id,
00356 INT16 row_id,
00357 INT16 boxfile_lineno,
00358 INT16 boxfile_charno) {
00359 WERD_IT word_it;
00360 WERD *word;
00361 WERD *new_word = NULL;
00362 BOOL8 polyg = false;
00363 PBLOB_IT blob_it;
00364 PBLOB_IT new_blob_it;
00365 PBLOB *blob;
00366 PBLOB *new_blob;
00367 OUTLINE_IT outline_it;
00368 OUTLINE_LIST dummy;
00369 OUTLINE_IT new_outline_it = &dummy;
00370 OUTLINE *outline;
00371 BOX new_word_box;
00372 float word_x_centre;
00373 float baseline;
00374 INT16 error_count = 0;
00375
00376 word_it.set_to_list (row->word_list ());
00377 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00378 word = word_it.data ();
00379 polyg = word->flag (W_POLYGON);
00380 if (word->bounding_box ().overlap (box)) {
00381 blob_it.set_to_list (word->gblob_list ());
00382 for (blob_it.mark_cycle_pt ();
00383 !blob_it.cycled_list (); blob_it.forward ()) {
00384 blob = blob_it.data ();
00385 if (gblob_bounding_box (blob, polyg).overlap (box)) {
00386 outline_it.set_to_list (gblob_out_list (blob, polyg));
00387 for (outline_it.mark_cycle_pt ();
00388 !outline_it.cycled_list (); outline_it.forward ()) {
00389 outline = outline_it.data ();
00390 if (goutline_bounding_box (outline, polyg).
00391 major_overlap (box)) {
00392 if (strlen (word->text ()) > 0) {
00393 if (error_count == 0) {
00394 error_count = 1;
00395 if (applybox_debug > 4)
00396 report_failed_box (boxfile_lineno,
00397 boxfile_charno,
00398 box, ch,
00399 "FAILURE! box overlaps blob in labelled word");
00400 }
00401 if (applybox_debug > 4)
00402 tprintf
00403 ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
00404 block_id, row_id,
00405 word_it.data ()->text ());
00406 word_it.data ()->set_text ("");
00407
00408 error_count++;
00409 }
00410
00411 if (error_count == 0) {
00412 if (new_word == NULL) {
00413
00414 new_word = word->shallow_copy ();
00415 new_word->set_text (ch);
00416 if (polyg)
00417 new_blob = new PBLOB;
00418 else
00419 new_blob = (PBLOB *) new C_BLOB;
00420 new_blob_it.set_to_list (new_word->
00421 gblob_list ());
00422 new_blob_it.add_to_end (new_blob);
00423 new_outline_it.
00424 set_to_list (gblob_out_list
00425 (new_blob, polyg));
00426 }
00427 new_outline_it.add_to_end (outline_it.
00428 extract ());
00429
00430 }
00431 }
00432 }
00433
00434 if (outline_it.empty ())
00435
00436 delete blob_it.extract ();
00437 }
00438 }
00439 if (blob_it.empty ())
00440
00441 delete word_it.extract ();
00442 }
00443 }
00444 if (error_count > 0)
00445 return error_count;
00446
00447 if (new_word != NULL) {
00448 gblob_sort_list (new_word->gblob_list (), polyg);
00449 word_it.add_to_end (new_word);
00450 new_word_box = new_word->bounding_box ();
00451 word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
00452 baseline = row->base_line (word_x_centre);
00453
00454 if (STRING (chs_caps_ht).contains (ch[0]) &&
00455 (new_word_box.top () <
00456 baseline + (1 + applybox_error_band) * row->x_height ())) {
00457 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00458 "FAILURE! caps-ht char didn't ascend");
00459 new_word->set_text ("");
00460 return 1;
00461 }
00462 if (STRING (chs_odd_top).contains (ch[0]) &&
00463 (new_word_box.top () <
00464 baseline + (1 - applybox_error_band) * row->x_height ())) {
00465 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00466 "FAILURE! Odd top char below xht");
00467 new_word->set_text ("");
00468 return 1;
00469 }
00470 if (STRING (chs_x_ht).contains (ch[0]) &&
00471 ((new_word_box.top () >
00472 baseline + (1 + applybox_error_band) * row->x_height ()) ||
00473 (new_word_box.top () <
00474 baseline + (1 - applybox_error_band) * row->x_height ()))) {
00475 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00476 "FAILURE! x-ht char didn't have top near xht");
00477 new_word->set_text ("");
00478 return 1;
00479 }
00480 if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
00481 ((new_word_box.bottom () <
00482 baseline - applybox_error_band * row->x_height ()) ||
00483 (new_word_box.bottom () >
00484 baseline + applybox_error_band * row->x_height ()))) {
00485 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00486 "FAILURE! non ambig BL char didnt have bottom near baseline");
00487 new_word->set_text ("");
00488 return 1;
00489 }
00490 if (STRING (chs_odd_bot).contains (ch[0]) &&
00491 (new_word_box.bottom () >
00492 baseline + applybox_error_band * row->x_height ())) {
00493 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00494 "FAILURE! Odd bottom char above baseline");
00495 new_word->set_text ("");
00496 return 1;
00497 }
00498 if (STRING (chs_desc).contains (ch[0]) &&
00499 (new_word_box.bottom () >
00500 baseline - applybox_error_band * row->x_height ())) {
00501 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00502 "FAILURE! Descender doesn't descend");
00503 new_word->set_text ("");
00504 return 1;
00505 }
00506 return 0;
00507 }
00508 else {
00509 report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00510 "FAILURE! Couldn't find any blobs");
00511 return 1;
00512 }
00513 }
00514
00515
00525 void tidy_up(
00526 BLOCK_LIST *block_list,
00527 INT16 &ok_char_count,
00528 INT16 &ok_row_count,
00529 INT16 &unlabelled_words,
00530 INT16 *tgt_char_counts,
00531 INT16 &rebalance_count,
00532 char &min_char,
00533 INT16 &min_samples,
00534 INT16 &final_labelled_blob_count) {
00535 BLOCK_IT block_it(block_list);
00536 ROW_IT row_it;
00537 ROW *row;
00538 WERD_IT word_it;
00539 WERD *word;
00540 WERD *duplicate_word;
00541 INT16 block_idx = 0;
00542 INT16 row_idx;
00543 INT16 all_row_idx = 0;
00544 BOOL8 row_ok;
00545 BOOL8 rebalance_needed = FALSE;
00546
00547 INT16 labelled_char_counts[128];
00548 INT16 i;
00549 char ch;
00550 char prev_ch = '\0';
00551 BOOL8 at_dupe_of_prev_word;
00552 ROW *prev_row = NULL;
00553 INT16 left;
00554 INT16 prev_left = -1;
00555
00556 for (i = 0; i < 128; i++)
00557 labelled_char_counts[i] = 0;
00558
00559 ok_char_count = 0;
00560 ok_row_count = 0;
00561 unlabelled_words = 0;
00562 if ((applybox_debug > 4) && (block_it.length () != 1))
00563
00564 tprintf ("APPLY_BOXES: More than one block??\n");
00565
00566 for (block_it.mark_cycle_pt ();
00567 !block_it.cycled_list (); block_it.forward ()) {
00568 block_idx++;
00569 row_idx = 0;
00570 row_ok = FALSE;
00571 row_it.set_to_list (block_it.data ()->row_list ());
00572 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00573 row_idx++;
00574 all_row_idx++;
00575 row = row_it.data ();
00576 word_it.set_to_list (row->word_list ());
00577 word_it.sort (word_comparator);
00578 for (word_it.mark_cycle_pt ();
00579 !word_it.cycled_list (); word_it.forward ()) {
00580 word = word_it.data ();
00581 if (strlen (word->text ()) == 0) {
00582 unlabelled_words++;
00583 if (applybox_debug > 4) {
00584 tprintf
00585 ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
00586 block_idx, row_idx, all_row_idx);
00587 }
00588 }
00589 else {
00590 if (word->gblob_list ()->length () != 1)
00591 tprintf
00592 ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n",
00593 block_idx, row_idx, all_row_idx);
00594
00595 ok_char_count++;
00596 labelled_char_counts[*word->text ()]++;
00597 row_ok = TRUE;
00598 }
00599 }
00600 if ((applybox_debug > 4) && (!row_ok)) {
00601 tprintf
00602 ("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n",
00603 block_idx, row_idx, all_row_idx);
00604 }
00605 else
00606 ok_row_count++;
00607 }
00608 }
00609
00610 min_samples = 9999;
00611 for (i = 0; i < 128; i++) {
00612 if (tgt_char_counts[i] > labelled_char_counts[i]) {
00613 if (labelled_char_counts[i] <= 1) {
00614 tprintf
00615 ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
00616 labelled_char_counts[i], (char) i, tgt_char_counts[i]);
00617 }
00618 else {
00619 rebalance_needed = TRUE;
00620 if (applybox_debug > 0)
00621 tprintf
00622 ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
00623 (char) i, tgt_char_counts[i], labelled_char_counts[i]);
00624 }
00625 }
00626 if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
00627 min_samples = labelled_char_counts[i];
00628 min_char = (char) i;
00629 }
00630 }
00631
00632 while (applybox_rebalance && rebalance_needed) {
00633 block_it.set_to_list (block_list);
00634 for (block_it.mark_cycle_pt ();
00635 !block_it.cycled_list (); block_it.forward ()) {
00636 row_it.set_to_list (block_it.data ()->row_list ());
00637 for (row_it.mark_cycle_pt ();
00638 !row_it.cycled_list (); row_it.forward ()) {
00639 row = row_it.data ();
00640 word_it.set_to_list (row->word_list ());
00641 for (word_it.mark_cycle_pt ();
00642 !word_it.cycled_list (); word_it.forward ()) {
00643 word = word_it.data ();
00644 left = word->bounding_box ().left ();
00645 ch = *word->text ();
00646 at_dupe_of_prev_word = ((row == prev_row) &&
00647 (left = prev_left) &&
00648 (ch == prev_ch));
00649 if ((ch != '\0') &&
00650 (labelled_char_counts[ch] > 1) &&
00651 (tgt_char_counts[ch] > labelled_char_counts[ch]) &&
00652 (!at_dupe_of_prev_word)) {
00653
00654 if (applybox_debug > 9) {
00655 tprintf ("Duping \"%c\" from ", ch);
00656 word->bounding_box ().print ();
00657 }
00658 duplicate_word = new WERD;
00659 *duplicate_word = *word;
00660 word_it.add_after_then_move (duplicate_word);
00661 rebalance_count++;
00662 labelled_char_counts[ch]++;
00663 }
00664 prev_row = row;
00665 prev_left = left;
00666 prev_ch = ch;
00667 }
00668 }
00669 }
00670 rebalance_needed = FALSE;
00671 for (i = 0; i < 128; i++) {
00672 if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
00673 (labelled_char_counts[i] > 1)) {
00674 rebalance_needed = TRUE;
00675 break;
00676 }
00677 }
00678 }
00679
00680
00681 final_labelled_blob_count = 0;
00682 block_it.set_to_list (block_list);
00683 for (block_it.mark_cycle_pt ();
00684 !block_it.cycled_list (); block_it.forward ()) {
00685 row_it.set_to_list (block_it.data ()->row_list ());
00686 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00687 row = row_it.data ();
00688 word_it.set_to_list (row->word_list ());
00689 word_it.sort (word_comparator);
00690 for (word_it.mark_cycle_pt ();
00691 !word_it.cycled_list (); word_it.forward ()) {
00692 word = word_it.data ();
00693 if ((strlen (word->text ()) == 1) &&
00694 (word->gblob_list ()->length () == 1))
00695 final_labelled_blob_count++;
00696 }
00697 }
00698 }
00699 }
00700
00701
00707 void report_failed_box(INT16 boxfile_lineno,
00708 INT16 boxfile_charno,
00709 BOX box,
00710 char *box_ch,
00711 const char *err_msg) {
00712 if (applybox_debug > 4)
00713 tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
00714 boxfile_lineno,
00715 boxfile_charno,
00716 box_ch,
00717 box.left (), box.bottom (), box.right (), box.top (), err_msg);
00718 }
00719
00720
00727 void apply_box_training(BLOCK_LIST *block_list) {
00728 BLOCK_IT block_it(block_list);
00729 ROW_IT row_it;
00730 ROW *row;
00731 WERD_IT word_it;
00732 WERD *word;
00733 WERD *bln_word;
00734 WERD copy_outword;
00735 PBLOB_IT blob_it;
00736 DENORM denorm;
00737 INT16 count = 0;
00738 char ch[2];
00739
00740 ch[1] = '\0';
00741
00742 tprintf ("Generating training data\n");
00743 for (block_it.mark_cycle_pt ();
00744 !block_it.cycled_list (); block_it.forward ()) {
00745 row_it.set_to_list (block_it.data ()->row_list ());
00746 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00747 row = row_it.data ();
00748 word_it.set_to_list (row->word_list ());
00749 for (word_it.mark_cycle_pt ();
00750 !word_it.cycled_list (); word_it.forward ()) {
00751 word = word_it.data ();
00752 if ((strlen (word->text ()) == 1) &&
00753 (word->gblob_list ()->length () == 1)) {
00754
00755 bln_word =
00756 make_bln_copy (word, row, row->x_height (), &denorm);
00757 blob_it.set_to_list (bln_word->blob_list ());
00758 ch[0] = *word->text ();
00759 tess_training_tester (blob_it.data (),
00760
00761 &denorm, TRUE,
00762 ch,
00763 1,
00764 NULL);
00765 copy_outword = *(bln_word);
00766 copy_outword.baseline_denormalise (&denorm);
00767 blob_it.set_to_list (copy_outword.blob_list ());
00768 ch[0] = *word->text ();
00769 delete bln_word;
00770 count++;
00771 }
00772 }
00773 }
00774 }
00775 tprintf ("Generated training data for %d blobs\n", count);
00776 }
00777
00778
00789 void apply_box_testing(BLOCK_LIST *block_list) {
00790 BLOCK_IT block_it(block_list);
00791 ROW_IT row_it;
00792 ROW *row;
00793 INT16 row_count = 0;
00794 WERD_IT word_it;
00795 WERD *word;
00796 WERD *bln_word;
00797 INT16 word_count = 0;
00798 PBLOB_IT blob_it;
00799 DENORM denorm;
00800 INT16 count = 0;
00801 char ch[2];
00802 WERD *outword;
00803
00804 WERD_CHOICE *best_choice;
00805 WERD_CHOICE *raw_choice;
00806
00807 BLOB_CHOICE_LIST_CLIST blob_choices;
00808 INT16 char_count = 0;
00809 INT16 correct_count = 0;
00810 INT16 err_count = 0;
00811 INT16 rej_count = 0;
00812 #ifndef SECURE_NAMES
00813 WERDSTATS wordstats;
00814 #endif
00815 char tess_rej_str[3];
00816 char tess_long_str[3];
00817
00818 ch[1] = '\0';
00819 strcpy (tess_rej_str, "|A");
00820 strcpy (tess_long_str, "|B");
00821
00822 for (block_it.mark_cycle_pt ();
00823 !block_it.cycled_list (); block_it.forward ()) {
00824 row_it.set_to_list (block_it.data ()->row_list ());
00825 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00826 row = row_it.data ();
00827 row_count++;
00828 word_count = 0;
00829 word_it.set_to_list (row->word_list ());
00830 for (word_it.mark_cycle_pt ();
00831 !word_it.cycled_list (); word_it.forward ()) {
00832 word = word_it.data ();
00833 word_count++;
00834 if ((strlen (word->text ()) == 1) &&
00835 !STRING (applybox_test_exclusions).contains (*word->text ())
00836 && (word->gblob_list ()->length () == 1)) {
00837
00838 bln_word =
00839 make_bln_copy (word, row, row->x_height (), &denorm);
00840 blob_it.set_to_list (bln_word->blob_list ());
00841 ch[0] = *word->text ();
00842 char_count++;
00843 best_choice = tess_segment_pass1 (bln_word,
00844 &denorm,
00845 tess_default_matcher,
00846 raw_choice,
00847 &blob_choices, outword);
00848
00849
00850 if ((best_choice->string ().length () == 0) ||
00851 (strspn (best_choice->string ().string (), " ") ==
00852 best_choice->string ().length ())) {
00853 rej_count++;
00854 tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
00855 row_count, word_count, ch);
00856 #ifndef SECURE_NAMES
00857 wordstats.word (tess_rej_str, 2, ch, 1);
00858 #endif
00859 }
00860 else {
00861 if ((best_choice->string ().length () !=
00862 outword->blob_list ()->length ()) ||
00863 (best_choice->string ().length () !=
00864 blob_choices.length ())) {
00865 tprintf
00866 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00867 best_choice->string ().string (),
00868 best_choice->string ().length (),
00869 outword->blob_list ()->length (),
00870 blob_choices.length ());
00871 }
00872 ASSERT_HOST (best_choice->string ().length () ==
00873 outword->blob_list ()->length ());
00874 ASSERT_HOST (best_choice->string ().length () ==
00875 blob_choices.length ());
00876 fix_quotes ((char *) best_choice->string ().string (),
00877
00878 outword, &blob_choices);
00879 if (strcmp (best_choice->string ().string (), ch) != 0) {
00880 err_count++;
00881 tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
00882 row_count, word_count, ch,
00883 best_choice->string ().string ());
00884 }
00885 else
00886 correct_count++;
00887 #ifndef SECURE_NAMES
00888 if (best_choice->string ().length () > 2)
00889 wordstats.word (tess_long_str, 2, ch, 1);
00890 else
00891 wordstats.word ((char *) best_choice->string ().
00892 string (),
00893 best_choice->string ().length (), ch,
00894 1);
00895 #endif
00896 }
00897 delete bln_word;
00898 delete outword;
00899 delete best_choice;
00900 delete raw_choice;
00901 blob_choices.deep_clear ();
00902 count++;
00903 }
00904 }
00905 }
00906 }
00907 #ifndef SECURE_NAMES
00908 wordstats.print (1, 100.0);
00909 wordstats.conf_matrix ();
00910 tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
00911 char_count, correct_count, rej_count, err_count);
00912 #endif
00913 }