00001
00020 #include "mfcpch.h"
00021 #include "ocrshell.h"
00022 #include <string.h>
00023 #include <ctype.h>
00024 #ifdef __UNIX__
00025 #include <assert.h>
00026 #include <unistd.h>
00027 #include <errno.h>
00028 #endif
00029 #include "mainblk.h"
00030 #include "tfacep.h"
00031 #include "tessvars.h"
00032 #include "control.h"
00033 #include "secname.h"
00034 #include "reject.h"
00035 #include "docqual.h"
00036 #include "output.h"
00037 #include "bestfirst.h"
00038 #ifdef TEXT_VERBOSE
00039 #include "callcpp.h"
00040 #endif
00041
00042 #define EXTERN
00043
00044 #define EPAPER_EXT ".ep"
00045 #define PAGE_YSIZE 3508
00046 #define CTRL_INSET '\024' // dc4=text inset
00047 #define CTRL_FONT '\016' // so=font change
00048 #define CTRL_DEFAULT '\017' // si=default font
00049 #define CTRL_SHIFT '\022' // dc2=x shift
00050 #define CTRL_TAB '\011' // tab
00051 #define CTRL_NEWLINE '\012' // newline
00052 #define CTRL_HARDLINE '\015' // cr
00053 int NO_BLOCK = 0;
00054
00055
00056 INT16 XOFFSET = 0;
00057 INT16 YOFFSET = 0;
00058
00061 EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
00062 "Write block separators in output");
00063
00064 EXTERN BOOL_VAR (tessedit_write_raw_output, TRUE,
00065 "Write raw stuff to name.raw");
00066 EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
00067 EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
00068 "Return ratings in IPEOCRAPI data");
00069 EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
00070 "Write .txt to .etx map file");
00071 EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
00072 "Write repetition char code");
00073 EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
00074 EXTERN STRING_EVAR (unrecognised_char, "|",
00075 "Output char for unidentified blobs");
00076 EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
00077 EXTERN INT_VAR (suspect_space_level, 100,
00078 "Min suspect level for rejecting spaces");
00079 EXTERN INT_VAR (suspect_short_words, 2,
00080 "Dont Suspect dict wds longer than this");
00081 EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
00082 "UNLV keep 1Il chars rejected");
00083 EXTERN double_VAR (suspect_rating_per_ch, 999.9,
00084 "Dont touch bad rating limit");
00085 EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");
00086
00087 EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
00088 "Only reject tess failures");
00089 EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
00090 EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
00091 "Make output have exactly one word per WERD");
00092 EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
00093 "Dont reject ANYTHING AT ALL");
00094 EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
00095 "Force all rep chars the same");
00098 FILE *txt_mapfile = NULL;
00099 FILE *unlv_file = NULL;
00100
00108 INT32 pixels_to_pts(
00109 INT32 pixels,
00110 INT32 pix_res
00111 ) {
00112 float pts;
00113
00114 pts = pixels * 72.0 / pix_res;
00115 return (INT32) (pts + 0.5);
00116 }
00117
00118
00122 void output_pass(
00123 PAGE_RES_IT &page_res_it,
00124 BOOL8 write_to_shm) {
00125 BLOCK_RES *block_of_last_word;
00126 INT16 block_id;
00127 BOOL8 force_eol;
00128 BLOCK *nextblock;
00129 WERD *nextword;
00130
00131 #ifdef TEXT_VERBOSE
00132
00133 cprintf("w");
00134 #endif
00135 if (tessedit_write_txt_map)
00136 txt_mapfile = open_outfile (".map");
00137 if (tessedit_write_unlv)
00138 unlv_file = open_outfile (".unlv");
00139 page_res_it.restart_page ();
00140 block_of_last_word = NULL;
00141 while (page_res_it.word () != NULL) {
00142 check_debug_pt (page_res_it.word (), 120);
00143 if (tessedit_write_block_separators &&
00144 block_of_last_word != page_res_it.block ()) {
00145 block_of_last_word = page_res_it.block ();
00146 if (block_of_last_word->block->text_region () == NULL) {
00147 if (block_of_last_word->block->poly_block () == NULL)
00148 block_id = 1;
00149 else
00150 block_id =
00151 ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())->
00152 id_no();
00153 }
00154 else
00155 block_id = block_of_last_word->block->text_region ()->id_no ();
00156 if (!NO_BLOCK)
00157 fprintf (textfile, "|^~tr%d\n", block_id);
00158 fprintf (txt_mapfile, "|^~tr%d\n", block_id);
00159 }
00160
00161 force_eol = (tessedit_write_block_separators &&
00162 (page_res_it.block () != page_res_it.next_block ())) ||
00163 (page_res_it.next_word () == NULL);
00164
00165 if (page_res_it.next_word () != NULL)
00166 nextword = page_res_it.next_word ()->word;
00167 else
00168 nextword = NULL;
00169 if (page_res_it.next_block () != NULL)
00170 nextblock = page_res_it.next_block ()->block;
00171 else
00172 nextblock = NULL;
00173 write_results (page_res_it,
00174 determine_newline_type (page_res_it.word ()->word,
00175 page_res_it.block ()->block, nextword, nextblock),
00176 force_eol, write_to_shm);
00177 page_res_it.forward ();
00178 }
00179 if (write_to_shm)
00180 ocr_send_text(FALSE);
00181 if (tessedit_write_block_separators) {
00182 if (!NO_BLOCK)
00183 fprintf (textfile, "|^~tr\n");
00184 fprintf (txt_mapfile, "|^~tr\n");
00185 }
00186 if (tessedit_write_txt_map) {
00187 fprintf (txt_mapfile, "\n");
00188 #ifdef __UNIX__
00189 fsync (fileno (txt_mapfile));
00190 #endif
00191 fclose(txt_mapfile);
00192 }
00193 }
00194
00195
00214 void write_results(
00215 PAGE_RES_IT &page_res_it,
00216 char newline_type,
00217 BOOL8 force_eol,
00218 BOOL8 write_to_shm
00219 ) {
00220 WERD_RES *word = page_res_it.word ();
00221 WERD_CHOICE *ep_choice;
00222 STRING repetition_code;
00223 const STRING *wordstr;
00224 const char *text;
00225 int i;
00226 char unrecognised = STRING (unrecognised_char)[0];
00227 char ep_chars[32];
00228 int ep_chars_index = 0;
00229 char txt_chs[32];
00230 char map_chs[32];
00231 int txt_index = 0;
00232 static BOOL8 tilde_crunch_written = FALSE;
00233 static BOOL8 last_char_was_newline = TRUE;
00234 static BOOL8 last_char_was_tilde = FALSE;
00235 static BOOL8 empty_block = TRUE;
00236 BOOL8 need_reject = FALSE;
00237 char *ptr;
00238 PBLOB_IT blob_it;
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253 if (word->unlv_crunch_mode != CR_NONE
00254 && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
00255 if ((word->unlv_crunch_mode != CR_DELETE) &&
00256 (!tilde_crunch_written ||
00257 ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
00258 (word->word->space () > 0) &&
00259 !word->word->flag (W_FUZZY_NON) &&
00260 !word->word->flag (W_FUZZY_SP)))) {
00261 if (!word->word->flag (W_BOL) &&
00262 (word->word->space () > 0) &&
00263 !word->word->flag (W_FUZZY_NON) &&
00264 !word->word->flag (W_FUZZY_SP)) {
00265
00266 txt_chs[txt_index] = ' ';
00267 map_chs[txt_index++] = '1';
00268 ep_chars[ep_chars_index++] = ' ';
00269 last_char_was_tilde = FALSE;
00270 }
00271 need_reject = TRUE;
00272 }
00273 if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
00274
00275 last_char_was_tilde = TRUE;
00276 txt_chs[txt_index] = unrecognised;
00277 if (tessedit_zero_rejection || (suspect_level == 0)) {
00278 map_chs[txt_index++] = '1';
00279 ep_chars[ep_chars_index++] = unrecognised;
00280 }
00281 else {
00282 map_chs[txt_index++] = '0';
00283
00284
00285
00286 ep_chars[ep_chars_index++] = CTRL_INSET;
00287 ep_chars[ep_chars_index++] = 1;
00288 ep_chars[ep_chars_index++] = 1;
00289 ep_chars[ep_chars_index++] = 2;
00290 ep_chars[ep_chars_index++] = 1;
00291 ep_chars[ep_chars_index++] = 1;
00292 }
00293 tilde_crunch_written = TRUE;
00294 last_char_was_newline = FALSE;
00295 empty_block = FALSE;
00296 }
00297
00298 if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
00299
00300 txt_chs[txt_index] = '\n';
00301 map_chs[txt_index++] = '\n';
00302 ep_chars[ep_chars_index++] = newline_type;
00303
00304 tilde_crunch_written = FALSE;
00305 last_char_was_newline = TRUE;
00306 last_char_was_tilde = FALSE;
00307 }
00308 txt_chs[txt_index] = '\0';
00309 map_chs[txt_index] = '\0';
00310 if (tessedit_write_output && !NO_BLOCK)
00311 fprintf (textfile, "%s", txt_chs);
00312
00313 if (tessedit_write_unlv)
00314 fprintf (unlv_file, "%s", txt_chs);
00315
00316 if (tessedit_write_txt_map)
00317 fprintf (txt_mapfile, "%s", map_chs);
00318
00319 ep_chars[ep_chars_index] = '\0';
00320 word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);
00321
00322 if (force_eol)
00323 empty_block = TRUE;
00324 return;
00325 }
00326
00327
00328
00329 tilde_crunch_written = FALSE;
00330 if (newline_type)
00331 last_char_was_newline = TRUE;
00332 else
00333 last_char_was_newline = FALSE;
00334 empty_block = force_eol;
00335
00336 if (unlv_tilde_crunching &&
00337 last_char_was_tilde &&
00338 (word->word->space () == 0) &&
00339 !(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) &&
00340 (word->best_choice->string ()[0] == ' ')) {
00341
00342
00343 ptr = (char *) word->best_choice->string ().string ();
00344 strcpy (ptr, ptr + 1);
00345 word->reject_map.remove_pos (0);
00346 blob_it = word->outword->blob_list ();
00347 delete blob_it.extract ();
00348 }
00349 if (newline_type ||
00350 (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
00351 last_char_was_tilde = FALSE;
00352 else {
00353 if (word->reject_map.length () > 0) {
00354 if (word->best_choice->string ()[word->reject_map.length () - 1] ==
00355 ' ')
00356 last_char_was_tilde = TRUE;
00357 else
00358 last_char_was_tilde = FALSE;
00359 }
00360 else if (word->word->space () > 0)
00361 last_char_was_tilde = FALSE;
00362
00363 }
00364
00365 ptr = (char *) word->best_choice->string ().string ();
00366 ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
00367
00368 if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
00369 ensure_rep_chars_are_consistent(word);
00370
00371 set_unlv_suspects(word);
00372 check_debug_pt (word, 120);
00373 if (tessedit_rejection_debug) {
00374 tprintf ("Dict word: \"%s\": %d\n",
00375 word->best_choice->string ().string (),
00376 dict_word (word->best_choice->string ().string ()));
00377 }
00378
00379 if (tessedit_write_unlv) {
00380 write_unlv_text(word);
00381 }
00382
00383 if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
00384 repetition_code = "|^~R";
00385 repetition_code += get_rep_char (word);
00386 wordstr = &repetition_code;
00387 }
00388 else {
00389 wordstr = &(word->best_choice->string ());
00390 if (tessedit_zero_rejection) {
00391
00392 text = wordstr->string ();
00393 for (i = 0; text[i] != '\0'; i++) {
00394 if (word->reject_map[i].rejected ())
00395 word->reject_map[i].setrej_minimal_rej_accept ();
00396 }
00397 }
00398 if (tessedit_minimal_rejection) {
00399
00400 text = wordstr->string ();
00401 for (i = 0; text[i] != '\0'; i++) {
00402 if ((text[i] != ' ') && word->reject_map[i].rejected ())
00403 word->reject_map[i].setrej_minimal_rej_accept ();
00404 }
00405 }
00406 }
00407
00408 if (write_to_shm)
00409 write_shm_text (word, page_res_it.block ()->block,
00410 page_res_it.row (), *wordstr);
00411
00412 if (tessedit_write_output)
00413 write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
00414
00415 if (tessedit_write_raw_output)
00416 write_cooked_text (word->word, word->raw_choice->string (),
00417 TRUE, FALSE, rawfile);
00418
00419 if (tessedit_write_txt_map)
00420 write_map(txt_mapfile, word);
00421
00422 ep_choice = make_epaper_choice (word, newline_type);
00423 word->ep_choice = ep_choice;
00424
00425 character_count += word->best_choice->string ().length ();
00426 word_count++;
00427 }
00428
00439 WERD_CHOICE *make_epaper_choice(
00440 WERD_RES *word,
00441 char newline_type
00442 ) {
00443 INT16 index = 0;
00444 INT16 blobindex;
00445 INT16 prevright = 0;
00446 INT16 nextleft;
00447 PBLOB *blob;
00448 BOX inset_box;
00449 PBLOB_IT blob_it;
00450 char word_string[MAX_PATH];
00451 BOOL8 force_total_reject;
00452 char unrecognised = STRING (unrecognised_char)[0];
00453
00454 blob_it.set_to_list (word->outword->blob_list ());
00455
00456 ASSERT_HOST (word->reject_map.length () ==
00457 word->best_choice->string ().length ());
00458
00459
00460
00461
00462
00463
00464
00465
00466 if (word->best_choice->string ().length () == 0)
00467 force_total_reject = TRUE;
00468 else {
00469 force_total_reject = FALSE;
00470 ASSERT_HOST (blob_it.length () ==
00471 word->best_choice->string ().length ());
00472 }
00473 if (!blob_it.empty ()) {
00474 for (index = 0; index < word->word->space (); index++)
00475 word_string[index] = ' ';
00476 }
00477
00478
00479
00480
00481
00482
00483 if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
00484 strcpy (word_string + index, "|^~R");
00485 index += 4;
00486 word_string[index++] = get_rep_char (word);
00487 }
00488 else {
00489 if (!blob_it.empty ())
00490 prevright = blob_it.data ()->bounding_box ().left ();
00491
00492 for (blobindex = 0, blob_it.mark_cycle_pt ();
00493 !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
00494 blob = blob_it.data ();
00495 if (word->reject_map[blobindex].accepted ()) {
00496 if (word->best_choice->string ()[blobindex] == ' ')
00497 word_string[index++] = unrecognised;
00498 else
00499 word_string[index++] =
00500 word->best_choice->string ()[blobindex];
00501 }
00502 else {
00503 inset_box = blob->bounding_box ();
00504
00505 while (!blob_it.at_last () &&
00506 (force_total_reject ||
00507 (word->reject_map[blobindex + 1].rejected ()))) {
00508 blobindex++;
00509 blob = blob_it.forward ();
00510 inset_box += blob->bounding_box ();
00511 }
00512 if (blob_it.at_last ())
00513 nextleft = inset_box.right ();
00514 else
00515 nextleft = blob_it.data_relative (1)->bounding_box ().left ();
00516
00517
00518
00519
00520
00521 index += make_reject (&inset_box, prevright, nextleft,
00522 &word->denorm, &word_string[index]);
00523 }
00524 prevright = blob->bounding_box ().right ();
00525 }
00526 }
00527 if (newline_type)
00528 word_string[index++] = newline_type;
00529 word_string[index] = '\0';
00530 if (strlen (word_string) != index) {
00531 tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
00532 word_string, index, strlen (word_string));
00533 }
00534 ASSERT_HOST (strlen (word_string) == index);
00535 return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
00536 }
00537
00538
00548 INT16 make_reject (
00549 BOX * inset_box,
00550 INT16 prevright,
00551 INT16 nextleft,
00552 DENORM * denorm,
00553 char word_string[]
00554 ) {
00555 INT16 index;
00556 INT16 xpos;
00557 INT16 ypos;
00558 INT16 width;
00559 INT16 height;
00560 INT16 left_offset;
00561 INT16 right_offset;
00562 INT16 baseline_offset;
00563 INT16 inset_index = 0;
00564 INT16 min_chars;
00565 INT16 max_chars;
00566 float x_centre;
00567
00568 index = 0;
00569 x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
00570 left_offset =
00571 (INT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
00572 right_offset =
00573 (INT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
00574 xpos = (INT16) floor (denorm->x (inset_box->left ()));
00575 width = (INT16) ceil (denorm->x (inset_box->right ())) - xpos;
00576 ypos = (INT16) floor (denorm->y (inset_box->bottom (), x_centre));
00577 height = (INT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
00578 baseline_offset = ypos - (INT16) denorm->y (bln_baseline_offset, x_centre);
00579 word_string[index++] = CTRL_INSET;
00580 min_chars = (INT16) ceil (0.27 * width / denorm->row ()->x_height ());
00581 max_chars = (INT16) floor (1.8 * width / denorm->row ()->x_height ());
00582
00583
00584
00585
00586
00587 if (min_chars < 0)
00588 min_chars = 0;
00589 if (min_chars > 254)
00590 min_chars = 254;
00591 if (max_chars < min_chars)
00592 max_chars = min_chars;
00593 if (max_chars > 254)
00594 max_chars = 254;
00595 word_string[index++] = min_chars + 1;
00596 word_string[index++] = max_chars + 1;
00597 word_string[index++] = 2;
00598 word_string[index++] = inset_index / 255 + 1;
00599 word_string[index++] = inset_index % 255 + 1;
00600 return index;
00601 }
00602
00603
00613 char determine_newline_type(
00614 WERD *word,
00615 BLOCK *block,
00616 WERD *next_word,
00617 BLOCK *next_block
00618 ) {
00619 INT16 end_gap;
00620 INT16 width;
00621 BOX word_box;
00622 BOX next_box;
00623 BOX block_box;
00624
00625 if (!word->flag (W_EOL))
00626 return FALSE;
00627 if (next_word == NULL || next_block == NULL || block != next_block)
00628 return CTRL_NEWLINE;
00629 if (next_word->space () > 0)
00630 return CTRL_HARDLINE;
00631 word_box = word->bounding_box ();
00632 next_box = next_word->bounding_box ();
00633 block_box = block->bounding_box ();
00634
00635 end_gap = block_box.right () - word_box.right ();
00636 end_gap -= (INT32) block->space ();
00637 width = next_box.right () - next_box.left ();
00638
00639
00640
00641
00642 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
00643 }
00644
00645
00657 void write_cooked_text(
00658 WERD *word,
00659 const STRING &text,
00660 BOOL8 acceptable,
00661 BOOL8 pass2,
00662 FILE *fp
00663 ) {
00664 INT16 index;
00665 int status;
00666 static int newaline = 1;
00667 static int havespace = 0;
00668 char buff[512];
00669 const char *wordstr = text.string ();
00670 int i = 0;
00671 char unrecognised = STRING (unrecognised_char)[0];
00672 static int old_segs = 0;
00673 BOX mybox;
00674 for (i = 0; wordstr[i] != '\0'; i++) {
00675 if (wordstr[i] == ' ')
00676 buff[i] = unrecognised;
00677 else
00678 buff[i] = wordstr[i];
00679 }
00680 buff[i] = '\0';
00681
00682 if (fp == stdout) {
00683 tprintf ("Cooked=%s, %d segs, acceptable=%d",
00684 buff, num_popped - old_segs, acceptable);
00685 old_segs = num_popped;
00686 return;
00687 }
00688
00689 if (text.length () > 0) {
00690 for (index = 0; index < word->space (); index++) {
00691 status = fprintf (fp, " ");
00692 havespace = 1;
00693 if (status < 0)
00694 WRITEFAILED.error ("write_cooked_text", EXIT,
00695 "Space Errno: %d", errno);
00696 }
00697 if (pass2) {
00698 status = fprintf (fp, BOLD_ON);
00699 if (status < 0)
00700 WRITEFAILED.error ("write_cooked_text", EXIT,
00701 "Bold Errno: %d", errno);
00702 }
00703 if (!acceptable) {
00704 status = fprintf (fp, UNDERLINE_ON);
00705 if (status < 0)
00706 WRITEFAILED.error ("write_cooked_text", EXIT,
00707 "Underline Errno: %d", errno);
00708 }
00709
00710
00711 if (NO_BLOCK && word && strlen (buff)) {
00712 mybox = word->bounding_box ();
00713 if (newaline || !havespace) {
00714 fprintf (fp, " ");
00715 newaline = 0;
00716 }
00717 fprintf (fp, "(%d," INT32FORMAT ",%d," INT32FORMAT ")",
00718 XOFFSET + mybox.left (),
00719 YOFFSET + page_image.get_ysize () - mybox.top (),
00720 XOFFSET + mybox.right (),
00721 YOFFSET + page_image.get_ysize () - mybox.bottom ());
00722 havespace = 0;
00723 }
00724
00725 status = fprintf (fp, "%s", buff);
00726 if (status < 0)
00727 WRITEFAILED.error ("write_cooked_text", EXIT,
00728 "Word Errno: %d", errno);
00729 if (pass2) {
00730 status = fprintf (fp, BOLD_OFF);
00731 if (status < 0)
00732 WRITEFAILED.error ("write_cooked_text", EXIT,
00733 "Bold off Errno: %d", errno);
00734 }
00735 if (!acceptable) {
00736 status = fprintf (fp, UNDERLINE_OFF);
00737 if (status < 0)
00738 WRITEFAILED.error ("write_cooked_text", EXIT,
00739 "Underline off Errno: %d", errno);
00740 }
00741 }
00742 if (word->flag (W_EOL)) {
00743 status = fprintf (fp, "\n");
00744 newaline = 1;
00745 if (status < 0)
00746 WRITEFAILED.error ("write_cooked_text", EXIT,
00747 "Newline Errno: %d", errno);
00748 }
00749 status = fflush (fp);
00750 if (status != 0)
00751 WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
00752 }
00753
00754
00758 void write_shm_text(
00759 WERD_RES *word,
00760 BLOCK *block,
00761 ROW_RES *row,
00762 const STRING &text
00763 ) {
00764 INT32 index;
00765 INT32 index2;
00766 INT32 length;
00767 INT32 ptsize;
00768 INT8 blanks;
00769 UINT8 enhancement;
00770 UINT8 font;
00771 char unrecognised = STRING (unrecognised_char)[0];
00772 PBLOB *blob;
00773 BOX blob_box;
00774 PBLOB_IT blob_it;
00775 WERD copy_outword;
00776 UINT32 rating;
00777 BOOL8 lineend;
00778
00779
00780 ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
00781 if (word->word->flag (W_BOL) && ocr_char_space () < 128
00782 && ocr_send_text (TRUE) != OKAY)
00783 return;
00784 copy_outword = *(word->outword);
00785 copy_outword.baseline_denormalise (&word->denorm);
00786 blob_it.set_to_list (copy_outword.blob_list ());
00787 length = text.length ();
00788
00789 if (length > 0) {
00790 blanks = word->word->space ();
00791 if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
00792 blanks = 1;
00793 for (index = 0; index < length; index++, blob_it.forward ()) {
00794 blob = blob_it.data ();
00795 blob_box = blob->bounding_box ();
00796
00797 enhancement = 0;
00798 if (word->italic > 0 || word->italic == 0 && row->italic > 0)
00799 enhancement |= EUC_ITALIC;
00800 if (word->bold > 0 || word->bold == 0 && row->bold > 0)
00801 enhancement |= EUC_BOLD;
00802 if (tessedit_write_ratings)
00803 rating = (UINT32) (-word->best_choice->certainty () / 0.035);
00804 else if (tessedit_zero_rejection)
00805 rating = text[index] == ' ' ? 100 : 0;
00806 else
00807 rating = word->reject_map[index].accepted ()? 0 : 100;
00808 if (rating > 255)
00809 rating = 255;
00810 if (word->font1_count > 2)
00811 font = word->font1;
00812 else if (row->font1_count > 8)
00813 font = row->font1;
00814 else
00815
00816 font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
00817
00818 lineend = word->word->flag (W_EOL) && index == length - 1;
00819 if (word->word->flag (W_EOL) && tessedit_zero_rejection
00820 && index < length - 1 && text[index + 1] == ' ') {
00821 for (index2 = index + 1; index2 < length && text[index2] == ' ';
00822 index2++);
00823 if (index2 == length)
00824 lineend = TRUE;
00825 }
00826
00827 if (!tessedit_zero_rejection || text[index] != ' '
00828 || tessedit_word_for_word) {
00829
00830 ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating,
00831 ptsize,
00832 blanks, enhancement,
00833 OCR_CDIR_LEFT_RIGHT,
00834 OCR_LDIR_DOWN_RIGHT,
00835 lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
00836 blanks = 0;
00837 }
00838
00839 }
00840 }
00841 else if (tessedit_word_for_word) {
00842 blanks = word->word->space ();
00843 if (blanks == 0 && !word->word->flag (W_BOL))
00844 blanks = 1;
00845 blob_box = word->word->bounding_box ();
00846
00847 enhancement = 0;
00848 if (word->italic > 0)
00849 enhancement |= EUC_ITALIC;
00850 if (word->bold > 0)
00851 enhancement |= EUC_BOLD;
00852 rating = 100;
00853 if (word->font1_count > 2)
00854 font = word->font1;
00855 else if (row->font1_count > 8)
00856 font = row->font1;
00857 else
00858
00859 font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
00860
00861 lineend = word->word->flag (W_EOL);
00862
00863
00864 ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font,
00865 rating,
00866 ptsize,
00867 blanks, enhancement,
00868 OCR_CDIR_LEFT_RIGHT,
00869 OCR_LDIR_DOWN_RIGHT,
00870 lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
00871 }
00872 }
00873
00874
00891 void write_map(
00892 FILE *mapfile,
00893 WERD_RES *word) {
00894 INT16 index;
00895 int status;
00896 STRING mapstr = "";
00897
00898 if (word->best_choice->string ().length () > 0) {
00899 for (index = 0; index < word->word->space (); index++) {
00900 if (word->reject_spaces &&
00901 (suspect_level >= suspect_space_level) &&
00902 !tessedit_minimal_rejection && !tessedit_zero_rejection)
00903
00904
00905
00906
00907
00908 status = fprintf (mapfile, "0");
00909 else
00910 status = fprintf (mapfile, "1");
00911 if (status < 0)
00912 WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
00913 }
00914
00915 if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
00916 for (index = 0; index < 5; index++)
00917 mapstr += '1';
00918 }
00919 else {
00920 ASSERT_HOST (word->reject_map.length () ==
00921 word->best_choice->string ().length ());
00922
00923 for (index = 0; index < word->reject_map.length (); index++) {
00924 if (word->reject_map[index].accepted ())
00925 mapstr += '1';
00926 else
00927 mapstr += '0';
00928 }
00929 }
00930 status = fprintf (mapfile, "%s", mapstr.string ());
00931 if (status < 0)
00932 WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
00933 }
00934 if (word->word->flag (W_EOL)) {
00935 status = fprintf (mapfile, "\n");
00936 if (status < 0)
00937 WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
00938 }
00939 status = fflush (mapfile);
00940 if (status != 0)
00941 WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
00942 }
00943
00944
00952 FILE *open_outfile(
00953 const char *extension) {
00954 STRING file_name;
00955 FILE *outfile;
00956
00957 file_name = imagebasename + extension;
00958 if (!(outfile = fopen (file_name.string (), "w"))) {
00959 CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
00960 file_name.string (), errno);
00961 }
00962 return outfile;
00963 }
00964
00965
00971 void write_unlv_text(WERD_RES *word) {
00972 const char *wordstr;
00973
00974 char buff[512];
00975 int i = 0;
00976 int j = 0;
00977 char unrecognised = STRING (unrecognised_char)[0];
00978 int status;
00979 char space_str[3];
00980
00981 wordstr = word->best_choice->string ().string ();
00982
00983
00984
00985
00986
00987
00988
00989 for (; wordstr[i] != '\0'; i++) {
00990 if ((wordstr[i] == ' ') ||
00991 (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
00992 buff[j++] = unrecognised;
00993 else {
00994 if (word->reject_map[i].rejected ())
00995 buff[j++] = '^';
00996 buff[j++] = wordstr[i];
00997 }
00998 }
00999 buff[j] = '\0';
01000
01001 if (strlen (wordstr) > 0) {
01002 if (word->reject_spaces &&
01003 (suspect_level >= suspect_space_level) &&
01004 !tessedit_minimal_rejection && !tessedit_zero_rejection)
01005 strcpy (space_str, "^ ");
01006 else
01007 strcpy (space_str, " ");
01008
01009 for (i = 0; i < word->word->space (); i++) {
01010 status = fprintf (unlv_file, "%s", space_str);
01011 if (status < 0)
01012 WRITEFAILED.error ("write_unlv_text", EXIT,
01013 "Space Errno: %d", errno);
01014 }
01015
01016 status = fprintf (unlv_file, "%s", buff);
01017 if (status < 0)
01018 WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
01019 }
01020 if (word->word->flag (W_EOL)) {
01021 status = fprintf (unlv_file, "\n");
01022 if (status < 0)
01023 WRITEFAILED.error ("write_unlv_text", EXIT,
01024 "Newline Errno: %d", errno);
01025 }
01026 status = fflush (unlv_file);
01027 if (status != 0)
01028 WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
01029 }
01030
01031
01040 char get_rep_char(
01041 WERD_RES *word) {
01042 int i;
01043
01044 for (i = 0;
01045 ((i < word->reject_map.length ()) &&
01046 (word->reject_map[i].rejected ())); i++);
01047 if (i < word->reject_map.length ())
01048 return word->best_choice->string ()[i];
01049 else
01050 return STRING (unrecognised_char)[0];
01051 }
01052
01053
01059 void ensure_rep_chars_are_consistent(WERD_RES *word) {
01060 char rep_char = get_rep_char (word);
01061 char *ptr;
01062
01063 ptr = (char *) word->best_choice->string ().string ();
01064 for (; *ptr != '\0'; ptr++) {
01065 if (*ptr != rep_char)
01066 *ptr = rep_char;
01067 }
01068 }
01069
01070
01085 void set_unlv_suspects(WERD_RES *word) {
01086 int len = word->reject_map.length ();
01087 int i;
01088 const char *ptr;
01089 float rating_per_ch;
01090
01091 ptr = word->best_choice->string ().string ();
01092
01093 if (suspect_level == 0) {
01094 for (i = 0; i < len; i++) {
01095 if (word->reject_map[i].rejected ())
01096 word->reject_map[i].setrej_minimal_rej_accept ();
01097 }
01098 return;
01099 }
01100
01101 if (suspect_level >= 3)
01102 return;
01103
01104
01105
01106 if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) {
01107
01108 for (i = 0; i < len; i++) {
01109 if (word->reject_map[i].rejected () && isalpha (ptr[i]))
01110 word->reject_map[i].setrej_minimal_rej_accept ();
01111 }
01112 }
01113
01114 rating_per_ch = word->best_choice->rating () / word->reject_map.length ();
01115
01116 if (rating_per_ch >= suspect_rating_per_ch)
01117 return;
01118
01119 if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
01120
01121 for (i = 0; i < len; i++) {
01122 if (word->reject_map[i].rejected () && (ptr[i] != ' '))
01123 word->reject_map[i].setrej_minimal_rej_accept ();
01124 }
01125 }
01126
01127 for (i = 0; i < len; i++) {
01128 if (word->reject_map[i].rejected ()) {
01129 if (word->reject_map[i].flag (R_DOC_REJ))
01130 word->reject_map[i].setrej_minimal_rej_accept ();
01131 if (word->reject_map[i].flag (R_BLOCK_REJ))
01132 word->reject_map[i].setrej_minimal_rej_accept ();
01133 if (word->reject_map[i].flag (R_ROW_REJ))
01134 word->reject_map[i].setrej_minimal_rej_accept ();
01135 }
01136 }
01137
01138 if (suspect_level == 2)
01139 return;
01140
01141 if (!suspect_constrain_1Il ||
01142 (word->reject_map.length () <= suspect_short_words)) {
01143 for (i = 0; i < len; i++) {
01144 if (word->reject_map[i].rejected ()) {
01145 if ((word->reject_map[i].flag (R_1IL_CONFLICT) ||
01146 word->reject_map[i].flag (R_POSTNN_1IL)))
01147 word->reject_map[i].setrej_minimal_rej_accept ();
01148
01149 if (!suspect_constrain_1Il &&
01150 word->reject_map[i].flag (R_MM_REJECT))
01151 word->reject_map[i].setrej_minimal_rej_accept ();
01152 }
01153 }
01154 }
01155
01156 if ((acceptable_word_string (word->best_choice->string ().string ())
01157 != AC_UNACCEPTABLE) ||
01158 acceptable_number_string (word->best_choice->string ().string ())) {
01159 if (word->reject_map.length () > suspect_short_words) {
01160 for (i = 0; i < len; i++) {
01161 if (word->reject_map[i].rejected () &&
01162 (!word->reject_map[i].perm_rejected () ||
01163 word->reject_map[i].flag (R_1IL_CONFLICT) ||
01164 word->reject_map[i].flag (R_POSTNN_1IL) ||
01165 word->reject_map[i].flag (R_MM_REJECT))) {
01166 word->reject_map[i].setrej_minimal_rej_accept ();
01167 }
01168 }
01169 }
01170 }
01171 }
01172
01173
01180 INT16 count_alphas(
01181 const char *s) {
01182 int count = 0;
01183
01184 for (; *s != '\0'; s++) {
01185 if (isalpha (*s))
01186 count++;
01187 }
01188 return count;
01189 }
01190
01191
01198 INT16 count_alphanums(
01199 const char *s) {
01200 int count = 0;
01201
01202 for (; *s != '\0'; s++) {
01203 if (isalnum (*s))
01204 count++;
01205 }
01206 return count;
01207 }
01208
01209
01216 BOOL8 acceptable_number_string(const char *s) {
01217 BOOL8 prev_digit = FALSE;
01218
01219 if (*s == '(')
01220 s++;
01221
01222 if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))
01223 s++;
01224
01225 for (; *s != '\0'; s++) {
01226 if (isdigit (*s))
01227 prev_digit = TRUE;
01228 else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
01229 prev_digit = FALSE;
01230 else if (prev_digit &&
01231 (*(s + 1) == '\0') && ((*s == '%') || (*s == ')')))
01232 return TRUE;
01233 else if (prev_digit &&
01234 (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0'))
01235 return TRUE;
01236 else
01237 return FALSE;
01238 }
01239 return TRUE;
01240 }