#include "varable.h"
#include "pageres.h"
#include "notdll.h"
Go to the source code of this file.
BOOL8 acceptable_number_string | ( | const char * | s | ) |
Rules for determining if string is a valid number (of some kind).
s | String |
Definition at line 1216 of file output.cpp.
Referenced by set_unlv_suspects().
01216 { 01217 BOOL8 prev_digit = FALSE; 01218 01219 if (*s == '(') 01220 s++; 01221 01222 if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')) 01223 s++; 01224 01225 for (; *s != '\0'; s++) { 01226 if (isdigit (*s)) 01227 prev_digit = TRUE; 01228 else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-'))) 01229 prev_digit = FALSE; 01230 else if (prev_digit && 01231 (*(s + 1) == '\0') && ((*s == '%') || (*s == ')'))) 01232 return TRUE; 01233 else if (prev_digit && 01234 (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0')) 01235 return TRUE; 01236 else 01237 return FALSE; 01238 } 01239 return TRUE; 01240 }
INT16 count_alphanums | ( | const char * | s | ) |
Count the number of characters in the string for which isalnum() is true.
s | String |
Definition at line 1198 of file output.cpp.
References count().
01199 { 01200 int count = 0; 01201 01202 for (; *s != '\0'; s++) { 01203 if (isalnum (*s)) 01204 count++; 01205 } 01206 return count; 01207 }
INT16 count_alphas | ( | const char * | s | ) |
Count number of isalpha()s in string.
s | String |
Definition at line 1180 of file output.cpp.
References count().
Referenced by set_unlv_suspects().
01181 { 01182 int count = 0; 01183 01184 for (; *s != '\0'; s++) { 01185 if (isalpha (*s)) 01186 count++; 01187 } 01188 return count; 01189 }
determine_newline_type(): Find whether we have a wrapping or hard newline.
word | word to do | |
block | current block | |
next_word | next word | |
next_block | block of next word |
Definition at line 613 of file output.cpp.
References WERD::bounding_box(), CTRL_HARDLINE, CTRL_NEWLINE, FALSE, WERD::flag(), BOX::left(), NULL, BOX::right(), WERD::space(), and W_EOL.
Referenced by output_pass().
00618 { 00619 INT16 end_gap; // to right edge 00620 INT16 width; // of next word 00621 BOX word_box; // bounding 00622 BOX next_box; // next word 00623 BOX block_box; // block bounding 00624 00625 if (!word->flag (W_EOL)) 00626 return FALSE; // not end of line 00627 if (next_word == NULL || next_block == NULL || block != next_block) 00628 return CTRL_NEWLINE; 00629 if (next_word->space () > 0) 00630 return CTRL_HARDLINE; //it is tabbed 00631 word_box = word->bounding_box (); 00632 next_box = next_word->bounding_box (); 00633 block_box = block->bounding_box (); 00634 //gap to eol 00635 end_gap = block_box.right () - word_box.right (); 00636 end_gap -= (INT32) block->space (); 00637 width = next_box.right () - next_box.left (); 00638 // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n", 00639 // block_box.right(),word_box.right(),end_gap, 00640 // next_box.right(),next_box.left(),width, 00641 // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE); 00642 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE; 00643 }
void ensure_rep_chars_are_consistent | ( | WERD_RES * | word | ) |
Enforce repeating characters acceptable.
And this means what?
Definition at line 1059 of file output.cpp.
References WERD_RES::best_choice, and get_rep_char().
Referenced by write_results().
01059 { 01060 char rep_char = get_rep_char (word); 01061 char *ptr; 01062 01063 ptr = (char *) word->best_choice->string ().string (); 01064 for (; *ptr != '\0'; ptr++) { 01065 if (*ptr != rep_char) 01066 *ptr = rep_char; 01067 } 01068 }
char get_rep_char | ( | WERD_RES * | word | ) |
Return the first accepted character from the repetition string.
word | Word |
Definition at line 1040 of file output.cpp.
References WERD_RES::best_choice, REJMAP::length(), and WERD_RES::reject_map.
Referenced by ensure_rep_chars_are_consistent(), make_epaper_choice(), and write_results().
01041 { 01042 int i; 01043 01044 for (i = 0; 01045 ((i < word->reject_map.length ()) && 01046 (word->reject_map[i].rejected ())); i++); 01047 if (i < word->reject_map.length ()) 01048 return word->best_choice->string ()[i]; 01049 else 01050 return STRING (unrecognised_char)[0]; 01051 }
WERD_CHOICE* make_epaper_choice | ( | WERD_RES * | word, | |
char | newline_type | |||
) |
Convert one word.
word | word to do | |
newline_type | type of newline |
Definition at line 439 of file output.cpp.
References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), PBLOB::bounding_box(), WERD_RES::denorm, FALSE, WERD::flag(), get_rep_char(), REJMAP::length(), make_reject(), MAX_PATH, NO_PERM, WERD_RES::outword, WERD_RES::reject_map, BOX::right(), WERD::space(), tprintf(), TRUE, W_REP_CHAR, and WERD_RES::word.
Referenced by write_results().
00442 { 00443 INT16 index = 0; //to string 00444 INT16 blobindex; //to word 00445 INT16 prevright = 0; //right of previous blob 00446 INT16 nextleft; //left of next blob 00447 PBLOB *blob; 00448 BOX inset_box; //bounding box 00449 PBLOB_IT blob_it; //blob iterator 00450 char word_string[MAX_PATH]; //converted string 00451 BOOL8 force_total_reject; 00452 char unrecognised = STRING (unrecognised_char)[0]; 00453 00454 blob_it.set_to_list (word->outword->blob_list ()); 00455 00456 ASSERT_HOST (word->reject_map.length () == 00457 word->best_choice->string ().length ()); 00458 /* 00459 tprintf( "\"%s\" -> length: %d; blobcount: %d (%d)\n", 00460 word->best_choice->string().string(), 00461 word->best_choice->string().length(), 00462 blob_it.length(), 00463 blob_count( word->outword ) ); 00464 */ 00465 00466 if (word->best_choice->string ().length () == 0) 00467 force_total_reject = TRUE; 00468 else { 00469 force_total_reject = FALSE; 00470 ASSERT_HOST (blob_it.length () == 00471 word->best_choice->string ().length ()); 00472 } 00473 if (!blob_it.empty ()) { 00474 for (index = 0; index < word->word->space (); index++) 00475 word_string[index] = ' '; //leading blanks 00476 } 00477 /* 00478 Why does this generate leading blanks regardless of whether the 00479 word_choice string is empty, when write_cooked_text only generates leading 00480 blanks when the string is NOT empty???. 
00481 */ 00482 00483 if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { 00484 strcpy (word_string + index, "|^~R"); 00485 index += 4; 00486 word_string[index++] = get_rep_char (word); 00487 } 00488 else { 00489 if (!blob_it.empty ()) 00490 prevright = blob_it.data ()->bounding_box ().left (); 00491 //actually first left 00492 for (blobindex = 0, blob_it.mark_cycle_pt (); 00493 !blob_it.cycled_list (); blobindex++, blob_it.forward ()) { 00494 blob = blob_it.data (); 00495 if (word->reject_map[blobindex].accepted ()) { 00496 if (word->best_choice->string ()[blobindex] == ' ') 00497 word_string[index++] = unrecognised; // but not rejected!! 00498 else 00499 word_string[index++] = 00500 word->best_choice->string ()[blobindex]; 00501 } 00502 else { 00503 inset_box = blob->bounding_box (); // start reject 00504 /* Extend reject box to include rejected neighbours */ 00505 while (!blob_it.at_last () && 00506 (force_total_reject || 00507 (word->reject_map[blobindex + 1].rejected ()))) { 00508 blobindex++; 00509 blob = blob_it.forward (); 00510 inset_box += blob->bounding_box (); // get total box 00511 } 00512 if (blob_it.at_last ()) 00513 nextleft = inset_box.right (); 00514 else 00515 nextleft = blob_it.data_relative (1)->bounding_box ().left (); 00516 00517 // tprintf("Making reject from (%d,%d)->(%d,%d)\n", 00518 // inset_box.left(),inset_box.bottom(), 00519 // inset_box.right(),inset_box.top()); 00520 00521 index += make_reject (&inset_box, prevright, nextleft, 00522 &word->denorm, &word_string[index]); 00523 } 00524 prevright = blob->bounding_box ().right (); 00525 } 00526 } 00527 if (newline_type) 00528 word_string[index++] = newline_type; // end line 00529 word_string[index] = '\0'; // terminate string 00530 if (strlen (word_string) != index) { 00531 tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n", 00532 word_string, index, strlen (word_string)); 00533 } 00534 ASSERT_HOST (strlen (word_string) == index); // don't pass any zeros 00535 return new 
WERD_CHOICE (word_string, 0, 0, NO_PERM); 00536 }
INT16 make_reject | ( | BOX * | inset_box, | |
INT16 | prevright, | |||
INT16 | nextleft, | |||
DENORM * | denorm, | |||
char | word_string[] | |||
) |
Add the escape code to the string for the reject.
inset_box | bounding box | |
prevright | previous char | |
nextleft | next char | |
denorm | de-normalizer | |
word_string | output string |
Definition at line 548 of file output.cpp.
References BOX::bottom(), CTRL_INSET, BOX::left(), BOX::right(), DENORM::row(), BOX::top(), DENORM::x(), ROW::x_height(), and DENORM::y().
Referenced by make_epaper_choice().
00554 { 00555 INT16 index; // to string 00556 INT16 xpos; // start of inset 00557 INT16 ypos; 00558 INT16 width; // size of inset 00559 INT16 height; 00560 INT16 left_offset; // shift form prev char 00561 INT16 right_offset; // shift to next char 00562 INT16 baseline_offset; // shift from baseline 00563 INT16 inset_index = 0; // number of inset 00564 INT16 min_chars; // min width estimate 00565 INT16 max_chars; // max width estimate 00566 float x_centre; // centre of box 00567 00568 index = 0; 00569 x_centre = (inset_box->left () + inset_box->right ()) / 2.0; 00570 left_offset = 00571 (INT16) (denorm->x (inset_box->left ()) - denorm->x (prevright)); 00572 right_offset = 00573 (INT16) (denorm->x (nextleft) - denorm->x (inset_box->right ())); 00574 xpos = (INT16) floor (denorm->x (inset_box->left ())); 00575 width = (INT16) ceil (denorm->x (inset_box->right ())) - xpos; 00576 ypos = (INT16) floor (denorm->y (inset_box->bottom (), x_centre)); 00577 height = (INT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos; 00578 baseline_offset = ypos - (INT16) denorm->y (bln_baseline_offset, x_centre); 00579 word_string[index++] = CTRL_INSET; // escape code 00580 min_chars = (INT16) ceil (0.27 * width / denorm->row ()->x_height ()); 00581 max_chars = (INT16) floor (1.8 * width / denorm->row ()->x_height ()); 00582 /* 00583 Ensure min_chars and max_chars are in the range 0..254. This ensures that 00584 we can add 1 to them to avoid putting \0 in a string, and still not exceed 00585 the max value in a byte. 00586 */ 00587 if (min_chars < 0) 00588 min_chars = 0; 00589 if (min_chars > 254) 00590 min_chars = 254; 00591 if (max_chars < min_chars) 00592 max_chars = min_chars; 00593 if (max_chars > 254) 00594 max_chars = 254; 00595 word_string[index++] = min_chars + 1; // min chars 00596 word_string[index++] = max_chars + 1; // max chars 00597 word_string[index++] = 2; //type? 
00598 word_string[index++] = inset_index / 255 + 1; // store index 00599 word_string[index++] = inset_index % 255 + 1; 00600 return index; // size of string 00601 }
FILE* open_outfile | ( | const char * | extension | ) |
Open .map & .unlv file.
extension | MSDOS extension, added to imagebasename |
Definition at line 952 of file output.cpp.
References CANTOPENFILE, ERRCODE::error(), EXIT, imagebasename, and STRING::string().
Referenced by output_pass().
00953 { 00954 STRING file_name; 00955 FILE *outfile; 00956 00957 file_name = imagebasename + extension; 00958 if (!(outfile = fopen (file_name.string (), "w"))) { 00959 CANTOPENFILE.error ("open_outfile", EXIT, "%s %d", 00960 file_name.string (), errno); 00961 } 00962 return outfile; 00963 }
void output_pass | ( | PAGE_RES_IT & | page_res_it, | |
BOOL8 | write_to_shm | |||
) |
Tess output to pass to file or API.
Definition at line 122 of file output.cpp.
References BLOCK_RES::block, PAGE_RES_IT::block(), check_debug_pt(), cprintf(), determine_newline_type(), FALSE, PAGE_RES_IT::forward(), PAGE_RES_IT::next_block(), PAGE_RES_IT::next_word(), NO_BLOCK, NULL, ocr_send_text(), open_outfile(), PAGE_RES_IT::restart_page(), textfile, txt_mapfile, unlv_file, WERD_RES::word, PAGE_RES_IT::word(), and write_results().
00124 { 00125 BLOCK_RES *block_of_last_word; 00126 INT16 block_id; 00127 BOOL8 force_eol; // During output 00128 BLOCK *nextblock; // block of next word 00129 WERD *nextword; // next word 00130 00131 #ifdef TEXT_VERBOSE 00132 // gets a 'w', see ccmain/tesseractmain.dox 00133 cprintf("w"); 00134 #endif 00135 if (tessedit_write_txt_map) 00136 txt_mapfile = open_outfile (".map"); 00137 if (tessedit_write_unlv) 00138 unlv_file = open_outfile (".unlv"); 00139 page_res_it.restart_page (); 00140 block_of_last_word = NULL; 00141 while (page_res_it.word () != NULL) { 00142 check_debug_pt (page_res_it.word (), 120); 00143 if (tessedit_write_block_separators && 00144 block_of_last_word != page_res_it.block ()) { 00145 block_of_last_word = page_res_it.block (); 00146 if (block_of_last_word->block->text_region () == NULL) { 00147 if (block_of_last_word->block->poly_block () == NULL) 00148 block_id = 1; 00149 else 00150 block_id = 00151 ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())-> 00152 id_no(); 00153 } 00154 else 00155 block_id = block_of_last_word->block->text_region ()->id_no (); 00156 if (!NO_BLOCK) 00157 fprintf (textfile, "|^~tr%d\n", block_id); 00158 fprintf (txt_mapfile, "|^~tr%d\n", block_id); 00159 } 00160 00161 force_eol = (tessedit_write_block_separators && 00162 (page_res_it.block () != page_res_it.next_block ())) || 00163 (page_res_it.next_word () == NULL); 00164 00165 if (page_res_it.next_word () != NULL) 00166 nextword = page_res_it.next_word ()->word; 00167 else 00168 nextword = NULL; 00169 if (page_res_it.next_block () != NULL) 00170 nextblock = page_res_it.next_block ()->block; 00171 else 00172 nextblock = NULL; 00173 write_results (page_res_it, // regardless of tilde crunching 00174 determine_newline_type (page_res_it.word ()->word, 00175 page_res_it.block ()->block, nextword, nextblock), 00176 force_eol, write_to_shm); 00177 page_res_it.forward (); 00178 } 00179 if (write_to_shm) 00180 ocr_send_text(FALSE); 00181 if 
(tessedit_write_block_separators) { 00182 if (!NO_BLOCK) 00183 fprintf (textfile, "|^~tr\n"); 00184 fprintf (txt_mapfile, "|^~tr\n"); 00185 } 00186 if (tessedit_write_txt_map) { 00187 fprintf (txt_mapfile, "\n"); // because txt gets one 00188 #ifdef __UNIX__ 00189 fsync (fileno (txt_mapfile)); 00190 #endif 00191 fclose(txt_mapfile); 00192 } 00193 }
void set_unlv_suspects | ( | WERD_RES * | word | ) |
Modifies reject_map of word based on suspect_level.
word | Word |
To reject JUST tess failures in the .map file set suspect_level 3 and tessedit_minimal_rejection.
Definition at line 1085 of file output.cpp.
References AC_UNACCEPTABLE, acceptable_number_string(), acceptable_word_string(), WERD_RES::best_choice, count_alphas(), REJMAP::length(), R_1IL_CONFLICT, R_BLOCK_REJ, R_DOC_REJ, R_MM_REJECT, R_POSTNN_1IL, R_ROW_REJ, WERD_RES::reject_map, safe_dict_word(), and WERD_RES::tess_accepted.
Referenced by write_results().
01085 { 01086 int len = word->reject_map.length (); 01087 int i; 01088 const char *ptr; 01089 float rating_per_ch; 01090 01091 ptr = word->best_choice->string ().string (); 01092 01093 if (suspect_level == 0) { 01094 for (i = 0; i < len; i++) { 01095 if (word->reject_map[i].rejected ()) 01096 word->reject_map[i].setrej_minimal_rej_accept (); 01097 } 01098 return; 01099 } 01100 01101 if (suspect_level >= 3) 01102 return; // Use defaults 01103 01104 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ 01105 01106 if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) { 01107 /* Unreject alphas in dictionary words */ 01108 for (i = 0; i < len; i++) { 01109 if (word->reject_map[i].rejected () && isalpha (ptr[i])) 01110 word->reject_map[i].setrej_minimal_rej_accept (); 01111 } 01112 } 01113 01114 rating_per_ch = word->best_choice->rating () / word->reject_map.length (); 01115 01116 if (rating_per_ch >= suspect_rating_per_ch) 01117 return; //Dont touch bad ratings 01118 01119 if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { 01120 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ 01121 for (i = 0; i < len; i++) { 01122 if (word->reject_map[i].rejected () && (ptr[i] != ' ')) 01123 word->reject_map[i].setrej_minimal_rej_accept (); 01124 } 01125 } 01126 01127 for (i = 0; i < len; i++) { 01128 if (word->reject_map[i].rejected ()) { 01129 if (word->reject_map[i].flag (R_DOC_REJ)) 01130 word->reject_map[i].setrej_minimal_rej_accept (); 01131 if (word->reject_map[i].flag (R_BLOCK_REJ)) 01132 word->reject_map[i].setrej_minimal_rej_accept (); 01133 if (word->reject_map[i].flag (R_ROW_REJ)) 01134 word->reject_map[i].setrej_minimal_rej_accept (); 01135 } 01136 } 01137 01138 if (suspect_level == 2) 01139 return; 01140 01141 if (!suspect_constrain_1Il || 01142 (word->reject_map.length () <= suspect_short_words)) { 01143 for (i = 0; i < len; i++) { 01144 if (word->reject_map[i].rejected ()) { 01145 if ((word->reject_map[i].flag 
(R_1IL_CONFLICT) || 01146 word->reject_map[i].flag (R_POSTNN_1IL))) 01147 word->reject_map[i].setrej_minimal_rej_accept (); 01148 01149 if (!suspect_constrain_1Il && 01150 word->reject_map[i].flag (R_MM_REJECT)) 01151 word->reject_map[i].setrej_minimal_rej_accept (); 01152 } 01153 } 01154 } 01155 01156 if ((acceptable_word_string (word->best_choice->string ().string ()) 01157 != AC_UNACCEPTABLE) || 01158 acceptable_number_string (word->best_choice->string ().string ())) { 01159 if (word->reject_map.length () > suspect_short_words) { 01160 for (i = 0; i < len; i++) { 01161 if (word->reject_map[i].rejected () && 01162 (!word->reject_map[i].perm_rejected () || 01163 word->reject_map[i].flag (R_1IL_CONFLICT) || 01164 word->reject_map[i].flag (R_POSTNN_1IL) || 01165 word->reject_map[i].flag (R_MM_REJECT))) { 01166 word->reject_map[i].setrej_minimal_rej_accept (); 01167 } 01168 } 01169 } 01170 } 01171 }
void write_cooked_text | ( | WERD * | word, | |
const STRING & | text, | |||
BOOL8 | acceptable, | |||
BOOL8 | pass2, | |||
FILE * | fp | |||
) |
Write the cooked text (with bold for pass2 and underline for reject) to the given file.
word | word to do | |
text | text to write | |
acceptable | good stuff | |
pass2 | done on pass2 | |
fp | file to write |
Definition at line 657 of file output.cpp.
References BOLD_OFF, BOLD_ON, BOX::bottom(), WERD::bounding_box(), ERRCODE::error(), EXIT, WERD::flag(), IMAGE::get_ysize(), INT32FORMAT, BOX::left(), STRING::length(), NO_BLOCK, num_popped, page_image, BOX::right(), WERD::space(), STRING::string(), BOX::top(), tprintf(), UNDERLINE_OFF, UNDERLINE_ON, W_EOL, WRITEFAILED, XOFFSET, and YOFFSET.
Referenced by classify_word_pass1(), classify_word_pass2(), and write_results().
00663 { 00664 INT16 index; //blank counter 00665 int status; 00666 static int newaline = 1; 00667 static int havespace = 0; 00668 char buff[512]; 00669 const char *wordstr = text.string (); 00670 int i = 0; 00671 char unrecognised = STRING (unrecognised_char)[0]; 00672 static int old_segs = 0; 00673 BOX mybox; 00674 for (i = 0; wordstr[i] != '\0'; i++) { 00675 if (wordstr[i] == ' ') 00676 buff[i] = unrecognised; 00677 else 00678 buff[i] = wordstr[i]; 00679 } 00680 buff[i] = '\0'; 00681 00682 if (fp == stdout) { 00683 tprintf ("Cooked=%s, %d segs, acceptable=%d", 00684 buff, num_popped - old_segs, acceptable); 00685 old_segs = num_popped; 00686 return; 00687 } 00688 00689 if (text.length () > 0) { 00690 for (index = 0; index < word->space (); index++) { 00691 status = fprintf (fp, " "); 00692 havespace = 1; 00693 if (status < 0) 00694 WRITEFAILED.error ("write_cooked_text", EXIT, 00695 "Space Errno: %d", errno); 00696 } 00697 if (pass2) { 00698 status = fprintf (fp, BOLD_ON); 00699 if (status < 0) 00700 WRITEFAILED.error ("write_cooked_text", EXIT, 00701 "Bold Errno: %d", errno); 00702 } 00703 if (!acceptable) { 00704 status = fprintf (fp, UNDERLINE_ON); 00705 if (status < 0) 00706 WRITEFAILED.error ("write_cooked_text", EXIT, 00707 "Underline Errno: %d", errno); 00708 } 00709 00710 //xiaofan 00711 if (NO_BLOCK && word && strlen (buff)) { 00712 mybox = word->bounding_box (); 00713 if (newaline || !havespace) { 00714 fprintf (fp, " "); 00715 newaline = 0; 00716 } 00717 fprintf (fp, "(%d," INT32FORMAT ",%d," INT32FORMAT ")", 00718 XOFFSET + mybox.left (), 00719 YOFFSET + page_image.get_ysize () - mybox.top (), 00720 XOFFSET + mybox.right (), 00721 YOFFSET + page_image.get_ysize () - mybox.bottom ()); 00722 havespace = 0; 00723 } 00724 00725 status = fprintf (fp, "%s", buff); 00726 if (status < 0) 00727 WRITEFAILED.error ("write_cooked_text", EXIT, 00728 "Word Errno: %d", errno); 00729 if (pass2) { 00730 status = fprintf (fp, BOLD_OFF); 00731 if (status < 0) 00732 
WRITEFAILED.error ("write_cooked_text", EXIT, 00733 "Bold off Errno: %d", errno); 00734 } 00735 if (!acceptable) { 00736 status = fprintf (fp, UNDERLINE_OFF); 00737 if (status < 0) 00738 WRITEFAILED.error ("write_cooked_text", EXIT, 00739 "Underline off Errno: %d", errno); 00740 } 00741 } 00742 if (word->flag (W_EOL)) { 00743 status = fprintf (fp, "\n"); 00744 newaline = 1; 00745 if (status < 0) 00746 WRITEFAILED.error ("write_cooked_text", EXIT, 00747 "Newline Errno: %d", errno); 00748 } 00749 status = fflush (fp); 00750 if (status != 0) 00751 WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno); 00752 }
void write_map | ( | FILE * | mapfile, | |
WERD_RES * | word | |||
) |
Generates a map file, approx a 'cmp -l' of .txt file and .etx file.
mapfile | mapfile to write to | |
word | Word |
The parameterised input is because I thought I might be able to generate multiple map files in a single run. However, it didn't work because newdiff needs etx files!
Definition at line 891 of file output.cpp.
References ASSERT_HOST, WERD_RES::best_choice, ERRCODE::error(), EXIT, WERD::flag(), REJMAP::length(), WERD_RES::reject_map, WERD_RES::reject_spaces, WERD::space(), STRING::string(), W_EOL, W_REP_CHAR, WERD_RES::word, and WRITEFAILED.
Referenced by write_results().
00893 { 00894 INT16 index; 00895 int status; 00896 STRING mapstr = ""; 00897 00898 if (word->best_choice->string ().length () > 0) { 00899 for (index = 0; index < word->word->space (); index++) { 00900 if (word->reject_spaces && 00901 (suspect_level >= suspect_space_level) && 00902 !tessedit_minimal_rejection && !tessedit_zero_rejection) 00903 /* 00904 Write rejected spaces to .map file ONLY. Newdiff converts 00905 these back to accepted spaces AFTER generating basic space 00906 stats but BEFORE using .etx 00907 */ 00908 status = fprintf (mapfile, "0"); 00909 else 00910 status = fprintf (mapfile, "1"); 00911 if (status < 0) 00912 WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno); 00913 } 00914 00915 if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) { 00916 for (index = 0; index < 5; index++) 00917 mapstr += '1'; 00918 } 00919 else { 00920 ASSERT_HOST (word->reject_map.length () == 00921 word->best_choice->string ().length ()); 00922 00923 for (index = 0; index < word->reject_map.length (); index++) { 00924 if (word->reject_map[index].accepted ()) 00925 mapstr += '1'; 00926 else 00927 mapstr += '0'; 00928 } 00929 } 00930 status = fprintf (mapfile, "%s", mapstr.string ()); 00931 if (status < 0) 00932 WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno); 00933 } 00934 if (word->word->flag (W_EOL)) { 00935 status = fprintf (mapfile, "\n"); 00936 if (status < 0) 00937 WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno); 00938 } 00939 status = fflush (mapfile); 00940 if (status != 0) 00941 WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno); 00942 }
void write_results | ( | PAGE_RES_IT & | page_res_it, | |
char | newline_type, | |||
BOOL8 | force_eol, | |||
BOOL8 | write_to_shm | |||
) |
All recognition and rejection has now been done so write out word.
page_res_it | full info | |
newline_type | type of newline | |
force_eol | 0 or 1, override tilde crunch? | |
write_to_shm | 0 or 1, send to api |
Definition at line 214 of file output.cpp.
References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), BLOCK_RES::block, PAGE_RES_IT::block(), character_count, check_debug_pt(), CR_DELETE, CR_KEEP_SPACE, CR_NONE, CTRL_INSET, dict_word(), ensure_rep_chars_are_consistent(), WERD_RES::ep_choice, FALSE, WERD::flag(), get_rep_char(), REJMAP::length(), make_epaper_choice(), NO_BLOCK, NO_PERM, WERD_RES::outword, rawfile, WERD_RES::reject_map, REJMAP::remove_pos(), PAGE_RES_IT::row(), set_unlv_suspects(), WERD::space(), STRING::string(), textfile, tprintf(), TRUE, txt_mapfile, WERD_RES::unlv_crunch_mode, unlv_file, W_BOL, W_EOL, W_FUZZY_NON, W_FUZZY_SP, W_REP_CHAR, WERD_RES::word, PAGE_RES_IT::word(), word_count, write_cooked_text(), write_map(), write_shm_text(), and write_unlv_text().
Referenced by output_pass().
00219 { 00220 WERD_RES *word = page_res_it.word (); //word to do 00221 WERD_CHOICE *ep_choice; //ep format 00222 STRING repetition_code; 00223 const STRING *wordstr; 00224 const char *text; 00225 int i; 00226 char unrecognised = STRING (unrecognised_char)[0]; 00227 char ep_chars[32]; //Only for unlv_tilde_crunch 00228 int ep_chars_index = 0; 00229 char txt_chs[32]; //Only for unlv_tilde_crunch 00230 char map_chs[32]; //Only for unlv_tilde_crunch 00231 int txt_index = 0; 00232 static BOOL8 tilde_crunch_written = FALSE; 00233 static BOOL8 last_char_was_newline = TRUE; 00234 static BOOL8 last_char_was_tilde = FALSE; 00235 static BOOL8 empty_block = TRUE; 00236 BOOL8 need_reject = FALSE; 00237 char *ptr; //string ptr 00238 PBLOB_IT blob_it; //blobs 00239 00240 /* 00241 if (word->best_choice->string().length() == 0) 00242 { 00243 tprintf("No output: to output\n"); 00244 } 00245 else if (word->best_choice->string()[0]==' ') 00246 { 00247 tprintf("spaceword to output\n"); 00248 } 00249 else if (word->best_choice->string()[0]== '\0' ) 00250 { 00251 tprintf("null to output\n"); 00252 }*/ 00253 if (word->unlv_crunch_mode != CR_NONE 00254 && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { 00255 if ((word->unlv_crunch_mode != CR_DELETE) && 00256 (!tilde_crunch_written || 00257 ((word->unlv_crunch_mode == CR_KEEP_SPACE) && 00258 (word->word->space () > 0) && 00259 !word->word->flag (W_FUZZY_NON) && 00260 !word->word->flag (W_FUZZY_SP)))) { 00261 if (!word->word->flag (W_BOL) && 00262 (word->word->space () > 0) && 00263 !word->word->flag (W_FUZZY_NON) && 00264 !word->word->flag (W_FUZZY_SP)) { 00265 /* Write a space to separate from preceeding good text */ 00266 txt_chs[txt_index] = ' '; 00267 map_chs[txt_index++] = '1'; 00268 ep_chars[ep_chars_index++] = ' '; 00269 last_char_was_tilde = FALSE; 00270 } 00271 need_reject = TRUE; 00272 } 00273 if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) { 00274 /* Write a reject char - mark as rejected 
unless zero_rejection mode */ 00275 last_char_was_tilde = TRUE; 00276 txt_chs[txt_index] = unrecognised; 00277 if (tessedit_zero_rejection || (suspect_level == 0)) { 00278 map_chs[txt_index++] = '1'; 00279 ep_chars[ep_chars_index++] = unrecognised; 00280 } 00281 else { 00282 map_chs[txt_index++] = '0'; 00283 /* The ep_choice string is a faked reject to allow newdiff to 00284 sync the .etx with the .txt and .map files. 00285 */ 00286 ep_chars[ep_chars_index++] = CTRL_INSET; // escape code 00287 ep_chars[ep_chars_index++] = 1; // dummy reject 00288 ep_chars[ep_chars_index++] = 1; // dummy reject 00289 ep_chars[ep_chars_index++] = 2; // type 00290 ep_chars[ep_chars_index++] = 1; // dummy reject 00291 ep_chars[ep_chars_index++] = 1; // dummy reject 00292 } 00293 tilde_crunch_written = TRUE; 00294 last_char_was_newline = FALSE; 00295 empty_block = FALSE; 00296 } 00297 00298 if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) { 00299 /* Add a new line output */ 00300 txt_chs[txt_index] = '\n'; 00301 map_chs[txt_index++] = '\n'; 00302 ep_chars[ep_chars_index++] = newline_type; // end line 00303 00304 tilde_crunch_written = FALSE; // Coordinates of the real newline 00305 last_char_was_newline = TRUE; 00306 last_char_was_tilde = FALSE; 00307 } 00308 txt_chs[txt_index] = '\0'; 00309 map_chs[txt_index] = '\0'; 00310 if (tessedit_write_output && !NO_BLOCK) // xiaofan ? 
00311 fprintf (textfile, "%s", txt_chs); 00312 00313 if (tessedit_write_unlv) 00314 fprintf (unlv_file, "%s", txt_chs); 00315 00316 if (tessedit_write_txt_map) 00317 fprintf (txt_mapfile, "%s", map_chs); 00318 00319 ep_chars[ep_chars_index] = '\0'; // terminate string 00320 word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM); 00321 00322 if (force_eol) 00323 empty_block = TRUE; 00324 return; 00325 } 00326 00327 /* NORMAL PROCESSING of non tilde crunched words */ 00328 00329 tilde_crunch_written = FALSE; 00330 if (newline_type) 00331 last_char_was_newline = TRUE; 00332 else 00333 last_char_was_newline = FALSE; 00334 empty_block = force_eol; // About to write a real word 00335 00336 if (unlv_tilde_crunching && 00337 last_char_was_tilde && 00338 (word->word->space () == 0) && 00339 !(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) && 00340 (word->best_choice->string ()[0] == ' ')) { 00341 /* Prevent adjacent tilde across words - we know that adjacent 00342 tildes within words have been removed */ 00343 ptr = (char *) word->best_choice->string ().string (); 00344 strcpy (ptr, ptr + 1); //shuffle up 00345 word->reject_map.remove_pos (0); 00346 blob_it = word->outword->blob_list (); 00347 delete blob_it.extract (); // get rid of reject blob 00348 } 00349 if (newline_type || 00350 (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) 00351 last_char_was_tilde = FALSE; 00352 else { 00353 if (word->reject_map.length () > 0) { 00354 if (word->best_choice->string ()[word->reject_map.length () - 1] == 00355 ' ') 00356 last_char_was_tilde = TRUE; 00357 else 00358 last_char_was_tilde = FALSE; 00359 } 00360 else if (word->word->space () > 0) 00361 last_char_was_tilde = FALSE; 00362 /* else it is unchanged as there are no output chars */ 00363 } 00364 00365 ptr = (char *) word->best_choice->string ().string (); 00366 ASSERT_HOST (strlen (ptr) == word->reject_map.length ()); 00367 00368 if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps) 00369 
ensure_rep_chars_are_consistent(word); 00370 00371 set_unlv_suspects(word); 00372 check_debug_pt (word, 120); 00373 if (tessedit_rejection_debug) { 00374 tprintf ("Dict word: \"%s\": %d\n", 00375 word->best_choice->string ().string (), 00376 dict_word (word->best_choice->string ().string ())); 00377 } 00378 00379 if (tessedit_write_unlv) { 00380 write_unlv_text(word); 00381 } 00382 00383 if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { 00384 repetition_code = "|^~R"; 00385 repetition_code += get_rep_char (word); 00386 wordstr = &repetition_code; 00387 } 00388 else { 00389 wordstr = &(word->best_choice->string ()); 00390 if (tessedit_zero_rejection) { 00391 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ 00392 text = wordstr->string (); 00393 for (i = 0; text[i] != '\0'; i++) { 00394 if (word->reject_map[i].rejected ()) 00395 word->reject_map[i].setrej_minimal_rej_accept (); 00396 } 00397 } 00398 if (tessedit_minimal_rejection) { 00399 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ 00400 text = wordstr->string (); 00401 for (i = 0; text[i] != '\0'; i++) { 00402 if ((text[i] != ' ') && word->reject_map[i].rejected ()) 00403 word->reject_map[i].setrej_minimal_rej_accept (); 00404 } 00405 } 00406 } 00407 00408 if (write_to_shm) 00409 write_shm_text (word, page_res_it.block ()->block, 00410 page_res_it.row (), *wordstr); 00411 00412 if (tessedit_write_output) 00413 write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile); 00414 00415 if (tessedit_write_raw_output) 00416 write_cooked_text (word->word, word->raw_choice->string (), 00417 TRUE, FALSE, rawfile); 00418 00419 if (tessedit_write_txt_map) 00420 write_map(txt_mapfile, word); 00421 00422 ep_choice = make_epaper_choice (word, newline_type); 00423 word->ep_choice = ep_choice; 00424 00425 character_count += word->best_choice->string ().length (); 00426 word_count++; 00427 }
write_shm_text(): Write the cooked text to the shared memory for the API.
Definition at line 758 of file output.cpp.
References ROW::ascenders(), WERD::baseline_denormalise(), WERD_RES::best_choice, WERD::blob_list(), ROW_RES::bold, WERD_RES::bold, BOX::bottom(), WERD::bounding_box(), PBLOB::bounding_box(), WERD_RES::denorm, ROW::descenders(), EUC_BOLD, EUC_ITALIC, WERD::flag(), ROW_RES::font1, WERD_RES::font1, ROW_RES::font1_count, WERD_RES::font1_count, IMAGE::get_ysize(), ROW_RES::italic, WERD_RES::italic, BOX::left(), STRING::length(), ocr_append_char(), OCR_CDIR_LEFT_RIGHT, ocr_char_space(), OCR_LDIR_DOWN_RIGHT, OCR_NL_NEWLINE, OCR_NL_NONE, ocr_send_text(), OKAY, WERD_RES::outword, page_image, pixels_to_pts(), WERD_RES::reject_map, BOX::right(), ROW_RES::row, WERD::space(), BOX::top(), TRUE, W_BOL, W_DONT_CHOP, W_EOL, WERD_RES::word, and ROW::x_height().
Referenced by write_results().
00763 { 00764 INT32 index; //char counter 00765 INT32 index2; //char counter 00766 INT32 length; //chars in word 00767 INT32 ptsize; //font size 00768 INT8 blanks; //blanks in word 00769 UINT8 enhancement; //bold etc 00770 UINT8 font; //font index 00771 char unrecognised = STRING (unrecognised_char)[0]; 00772 PBLOB *blob; 00773 BOX blob_box; //bounding box 00774 PBLOB_IT blob_it; //blob iterator 00775 WERD copy_outword; // copy to denorm 00776 UINT32 rating; //of char 00777 BOOL8 lineend; //end of line 00778 00779 //point size 00780 ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300); 00781 if (word->word->flag (W_BOL) && ocr_char_space () < 128 00782 && ocr_send_text (TRUE) != OKAY) 00783 return; //release failed 00784 copy_outword = *(word->outword); 00785 copy_outword.baseline_denormalise (&word->denorm); 00786 blob_it.set_to_list (copy_outword.blob_list ()); 00787 length = text.length (); 00788 00789 if (length > 0) { 00790 blanks = word->word->space (); 00791 if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL)) 00792 blanks = 1; 00793 for (index = 0; index < length; index++, blob_it.forward ()) { 00794 blob = blob_it.data (); 00795 blob_box = blob->bounding_box (); 00796 00797 enhancement = 0; 00798 if (word->italic > 0 || word->italic == 0 && row->italic > 0) 00799 enhancement |= EUC_ITALIC; 00800 if (word->bold > 0 || word->bold == 0 && row->bold > 0) 00801 enhancement |= EUC_BOLD; 00802 if (tessedit_write_ratings) 00803 rating = (UINT32) (-word->best_choice->certainty () / 0.035); 00804 else if (tessedit_zero_rejection) 00805 rating = text[index] == ' ' ? 100 : 0; 00806 else 00807 rating = word->reject_map[index].accepted ()? 0 : 100; 00808 if (rating > 255) 00809 rating = 255; 00810 if (word->font1_count > 2) 00811 font = word->font1; 00812 else if (row->font1_count > 8) 00813 font = row->font1; 00814 else 00815 //font index 00816 font = word->word->flag (W_DONT_CHOP) ? 
0 : 1; 00817 00818 lineend = word->word->flag (W_EOL) && index == length - 1; 00819 if (word->word->flag (W_EOL) && tessedit_zero_rejection 00820 && index < length - 1 && text[index + 1] == ' ') { 00821 for (index2 = index + 1; index2 < length && text[index2] == ' '; 00822 index2++); 00823 if (index2 == length) 00824 lineend = TRUE; 00825 } 00826 00827 if (!tessedit_zero_rejection || text[index] != ' ' 00828 || tessedit_word_for_word) { 00829 //confidence 00830 ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating, 00831 ptsize, //point size 00832 blanks, enhancement, //enhancement 00833 OCR_CDIR_LEFT_RIGHT, 00834 OCR_LDIR_DOWN_RIGHT, 00835 lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); 00836 blanks = 0; 00837 } 00838 00839 } 00840 } 00841 else if (tessedit_word_for_word) { 00842 blanks = word->word->space (); 00843 if (blanks == 0 && !word->word->flag (W_BOL)) 00844 blanks = 1; 00845 blob_box = word->word->bounding_box (); 00846 00847 enhancement = 0; 00848 if (word->italic > 0) 00849 enhancement |= EUC_ITALIC; 00850 if (word->bold > 0) 00851 enhancement |= EUC_BOLD; 00852 rating = 100; 00853 if (word->font1_count > 2) 00854 font = word->font1; 00855 else if (row->font1_count > 8) 00856 font = row->font1; 00857 else 00858 //font index 00859 font = word->word->flag (W_DONT_CHOP) ? 0 : 1; 00860 00861 lineend = word->word->flag (W_EOL); 00862 00863 //font index 00864 ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, 00865 rating, //confidence 00866 ptsize, //point size 00867 blanks, enhancement, //enhancement 00868 OCR_CDIR_LEFT_RIGHT, 00869 OCR_LDIR_DOWN_RIGHT, 00870 lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); 00871 } 00872 }
void write_unlv_text(WERD_RES *word)
Write text to .unlv file.
Parameters: word - the word to write.
Definition at line 971 of file output.cpp.
References WERD_RES::best_choice, ERRCODE::error(), EXIT, WERD::flag(), WERD_RES::reject_map, WERD_RES::reject_spaces, WERD::space(), unlv_file, W_EOL, WERD_RES::word, and WRITEFAILED.
Referenced by write_results().
00971 { 00972 const char *wordstr; 00973 00974 char buff[512]; //string to output 00975 int i = 0; 00976 int j = 0; 00977 char unrecognised = STRING (unrecognised_char)[0]; 00978 int status; 00979 char space_str[3]; 00980 00981 wordstr = word->best_choice->string ().string (); 00982 00983 /* 00984 DONT need to do anything special for repeated char words - at 00985 this stage the repetition char has been identified and any other 00986 chars have been rejected. 00987 */ 00988 00989 for (; wordstr[i] != '\0'; i++) { 00990 if ((wordstr[i] == ' ') || 00991 (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|')) 00992 buff[j++] = unrecognised; 00993 else { 00994 if (word->reject_map[i].rejected ()) 00995 buff[j++] = '^'; //Add suspect marker 00996 buff[j++] = wordstr[i]; 00997 } 00998 } 00999 buff[j] = '\0'; 01000 01001 if (strlen (wordstr) > 0) { 01002 if (word->reject_spaces && 01003 (suspect_level >= suspect_space_level) && 01004 !tessedit_minimal_rejection && !tessedit_zero_rejection) 01005 strcpy (space_str, "^ "); //Suspect space 01006 else 01007 strcpy (space_str, " "); //Certain space 01008 01009 for (i = 0; i < word->word->space (); i++) { 01010 status = fprintf (unlv_file, "%s", space_str); 01011 if (status < 0) 01012 WRITEFAILED.error ("write_unlv_text", EXIT, 01013 "Space Errno: %d", errno); 01014 } 01015 01016 status = fprintf (unlv_file, "%s", buff); 01017 if (status < 0) 01018 WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno); 01019 } 01020 if (word->word->flag (W_EOL)) { 01021 status = fprintf (unlv_file, "\n"); 01022 if (status < 0) 01023 WRITEFAILED.error ("write_unlv_text", EXIT, 01024 "Newline Errno: %d", errno); 01025 } 01026 status = fflush (unlv_file); 01027 if (status != 0) 01028 WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno); 01029 }