ccmain/output.cpp

Go to the documentation of this file.
00001 
00020 #include "mfcpch.h"
00021 #include          "ocrshell.h"
00022 #include          <string.h>
00023 #include          <ctype.h>
00024 #ifdef __UNIX__
00025 #include          <assert.h>
00026 #include          <unistd.h>
00027 #include                    <errno.h>
00028 #endif
00029 #include          "mainblk.h"
00030 #include          "tfacep.h"
00031 #include          "tessvars.h"
00032 #include          "control.h"
00033 #include          "secname.h"
00034 #include          "reject.h"
00035 #include          "docqual.h"
00036 #include          "output.h"
00037 #include "bestfirst.h"
00038 #ifdef TEXT_VERBOSE
00039 #include    "callcpp.h"
00040 #endif
00041 
// EXTERN expands to nothing here, so every EXTERN-prefixed line below is
// written without any storage-class prefix in this file.
#define EXTERN

#define EPAPER_EXT      ".ep"
#define PAGE_YSIZE      3508
// Control characters; write_results, make_epaper_choice, make_reject and
// determine_newline_type use CTRL_INSET, CTRL_NEWLINE and CTRL_HARDLINE.
#define CTRL_INSET      '\024'   // dc4=text inset
#define CTRL_FONT       '\016'   // so=font change
#define CTRL_DEFAULT      '\017' // si=default font
#define CTRL_SHIFT      '\022'   // dc2=x shift
#define CTRL_TAB        '\011'   // tab
#define CTRL_NEWLINE      '\012' // newline
#define CTRL_HARDLINE   '\015'   // cr
// When non-zero, block separators are kept out of the text file and
// write_cooked_text prefixes each word with its bounding-box coordinates.
int NO_BLOCK = 0;                // don't output block information
/* the image can be a part of bigger picture and we want
   to have the original coordinates */
// Added to the x/y box coordinates printed by write_cooked_text.
INT16 XOFFSET = 0;
INT16 YOFFSET = 0;

// Tunable variables controlling which outputs are written.
EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
"Write block separators in output");
// Previous default, kept for reference:
//EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
EXTERN BOOL_VAR (tessedit_write_raw_output, TRUE,
"Write raw stuff to name.raw");
EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
"Return ratings in IPEOCRAPI data");
EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
"Write .txt to .etx map file");
EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
"Write repetition char code");
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
// Only the first character of this string is used by the writers below.
EXTERN STRING_EVAR (unrecognised_char, "|",
"Output char for unidentified blobs");
// Suspect-marking thresholds.
EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
EXTERN INT_VAR (suspect_space_level, 100,
"Min suspect level for rejecting spaces");
EXTERN INT_VAR (suspect_short_words, 2,
"Dont Suspect dict wds longer than this");
EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
"UNLV keep 1Il chars rejected");
EXTERN double_VAR (suspect_rating_per_ch, 999.9,
"Dont touch bad rating limit");
EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");

// Rejection overrides consulted while writing results.
EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
"Only reject tess failures");
EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
"Make output have exactly one word per WERD");
EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
"Dont reject ANYTHING AT ALL");
EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
"Force all rep chars the same");
// Opened by output_pass with extension ".map" when tessedit_write_txt_map is set.
FILE *txt_mapfile = NULL;        // reject map
// Opened by output_pass with extension ".unlv" when tessedit_write_unlv is set.
FILE *unlv_file = NULL;          // .unlv output file
00100 
00108 INT32 pixels_to_pts(
00109                     INT32 pixels,
00110                     INT32 pix_res
00111                    ) {
00112   float pts;                     // converted value
00113 
00114   pts = pixels * 72.0 / pix_res;
00115   return (INT32) (pts + 0.5);    // round it
00116 }
00117 
00118 
00122 void output_pass(  
00123                  PAGE_RES_IT &page_res_it,
00124                  BOOL8 write_to_shm) {
00125   BLOCK_RES *block_of_last_word;
00126   INT16 block_id;
00127   BOOL8 force_eol;               // During output
00128   BLOCK *nextblock;              // block of next word
00129   WERD *nextword;                // next word
00130 
00131 #ifdef TEXT_VERBOSE
00132   // gets a 'w', see ccmain/tesseractmain.dox
00133   cprintf("w");
00134 #endif
00135   if (tessedit_write_txt_map)
00136     txt_mapfile = open_outfile (".map");
00137   if (tessedit_write_unlv)
00138     unlv_file = open_outfile (".unlv");
00139   page_res_it.restart_page ();
00140   block_of_last_word = NULL;
00141   while (page_res_it.word () != NULL) {
00142     check_debug_pt (page_res_it.word (), 120);
00143     if (tessedit_write_block_separators &&
00144     block_of_last_word != page_res_it.block ()) {
00145       block_of_last_word = page_res_it.block ();
00146       if (block_of_last_word->block->text_region () == NULL) {
00147         if (block_of_last_word->block->poly_block () == NULL)
00148           block_id = 1;
00149         else
00150           block_id =
00151             ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())->
00152             id_no(); 
00153       }
00154       else
00155         block_id = block_of_last_word->block->text_region ()->id_no ();
00156       if (!NO_BLOCK)
00157         fprintf (textfile, "|^~tr%d\n", block_id);
00158       fprintf (txt_mapfile, "|^~tr%d\n", block_id);
00159     }
00160 
00161     force_eol = (tessedit_write_block_separators &&
00162       (page_res_it.block () != page_res_it.next_block ())) ||
00163       (page_res_it.next_word () == NULL);
00164 
00165     if (page_res_it.next_word () != NULL)
00166       nextword = page_res_it.next_word ()->word;
00167     else
00168       nextword = NULL;
00169     if (page_res_it.next_block () != NULL)
00170       nextblock = page_res_it.next_block ()->block;
00171     else
00172       nextblock = NULL;
00173     write_results (page_res_it, // regardless of tilde crunching
00174       determine_newline_type (page_res_it.word ()->word,
00175          page_res_it.block ()->block, nextword, nextblock),
00176          force_eol, write_to_shm);
00177     page_res_it.forward ();
00178   }
00179   if (write_to_shm)
00180     ocr_send_text(FALSE); 
00181   if (tessedit_write_block_separators) {
00182     if (!NO_BLOCK)
00183       fprintf (textfile, "|^~tr\n");
00184     fprintf (txt_mapfile, "|^~tr\n");
00185   }
00186   if (tessedit_write_txt_map) {
00187     fprintf (txt_mapfile, "\n"); // because txt gets one
00188     #ifdef __UNIX__
00189     fsync (fileno (txt_mapfile));
00190     #endif
00191     fclose(txt_mapfile); 
00192   }
00193 }
00194 
00195 
00214 void write_results(
00215                    PAGE_RES_IT &page_res_it,
00216                    char newline_type,
00217                    BOOL8 force_eol,
00218                    BOOL8 write_to_shm
00219                   ) {
00220   WERD_RES *word = page_res_it.word (); //word to do
00221   WERD_CHOICE *ep_choice;        //ep format
00222   STRING repetition_code;
00223   const STRING *wordstr;
00224   const char *text;
00225   int i;
00226   char unrecognised = STRING (unrecognised_char)[0];
00227   char ep_chars[32];             //Only for unlv_tilde_crunch
00228   int ep_chars_index = 0;
00229   char txt_chs[32];              //Only for unlv_tilde_crunch
00230   char map_chs[32];              //Only for unlv_tilde_crunch
00231   int txt_index = 0;
00232   static BOOL8 tilde_crunch_written = FALSE;
00233   static BOOL8 last_char_was_newline = TRUE;
00234   static BOOL8 last_char_was_tilde = FALSE;
00235   static BOOL8 empty_block = TRUE;
00236   BOOL8 need_reject = FALSE;
00237   char *ptr;                     //string ptr
00238   PBLOB_IT blob_it;              //blobs
00239 
00240   /*
00241    if (word->best_choice->string().length() == 0)
00242     {
00243       tprintf("No output: to output\n");
00244     }
00245     else if (word->best_choice->string()[0]==' ')
00246     {
00247       tprintf("spaceword to output\n");
00248     }
00249     else if (word->best_choice->string()[0]== '\0' )
00250     {
00251       tprintf("null to output\n");
00252     }*/
00253   if (word->unlv_crunch_mode != CR_NONE
00254   && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
00255     if ((word->unlv_crunch_mode != CR_DELETE) &&
00256       (!tilde_crunch_written ||
00257       ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
00258       (word->word->space () > 0) &&
00259       !word->word->flag (W_FUZZY_NON) &&
00260     !word->word->flag (W_FUZZY_SP)))) {
00261       if (!word->word->flag (W_BOL) &&
00262         (word->word->space () > 0) &&
00263         !word->word->flag (W_FUZZY_NON) &&
00264       !word->word->flag (W_FUZZY_SP)) {
00265         /* Write a space to separate from preceeding good text */
00266         txt_chs[txt_index] = ' ';
00267         map_chs[txt_index++] = '1';
00268         ep_chars[ep_chars_index++] = ' ';
00269         last_char_was_tilde = FALSE;
00270       }
00271       need_reject = TRUE;
00272     }
00273     if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
00274       /* Write a reject char - mark as rejected unless zero_rejection mode */
00275       last_char_was_tilde = TRUE;
00276       txt_chs[txt_index] = unrecognised;
00277       if (tessedit_zero_rejection || (suspect_level == 0)) {
00278         map_chs[txt_index++] = '1';
00279         ep_chars[ep_chars_index++] = unrecognised;
00280       }
00281       else {
00282         map_chs[txt_index++] = '0';
00283         /* The ep_choice string is a faked reject to allow newdiff to
00284          sync the .etx with the .txt and .map files.
00285          */
00286         ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
00287         ep_chars[ep_chars_index++] = 1; // dummy reject
00288         ep_chars[ep_chars_index++] = 1; // dummy reject
00289         ep_chars[ep_chars_index++] = 2; // type
00290         ep_chars[ep_chars_index++] = 1; // dummy reject
00291         ep_chars[ep_chars_index++] = 1; // dummy reject
00292       }
00293       tilde_crunch_written = TRUE;
00294       last_char_was_newline = FALSE;
00295       empty_block = FALSE;
00296     }
00297 
00298     if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
00299       /* Add a new line output */
00300       txt_chs[txt_index] = '\n';
00301       map_chs[txt_index++] = '\n';
00302       ep_chars[ep_chars_index++] = newline_type; // end line
00303 
00304       tilde_crunch_written = FALSE; // Coordinates of the real newline
00305       last_char_was_newline = TRUE;
00306       last_char_was_tilde = FALSE;
00307     }
00308     txt_chs[txt_index] = '\0';
00309     map_chs[txt_index] = '\0';
00310     if (tessedit_write_output && !NO_BLOCK) // xiaofan ?
00311       fprintf (textfile, "%s", txt_chs);
00312 
00313     if (tessedit_write_unlv)
00314       fprintf (unlv_file, "%s", txt_chs);
00315 
00316     if (tessedit_write_txt_map)
00317       fprintf (txt_mapfile, "%s", map_chs);
00318 
00319     ep_chars[ep_chars_index] = '\0'; // terminate string
00320     word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);
00321 
00322     if (force_eol)
00323       empty_block = TRUE;
00324     return;
00325   }
00326 
00327   /* NORMAL PROCESSING of non tilde crunched words */
00328 
00329   tilde_crunch_written = FALSE;
00330   if (newline_type)
00331     last_char_was_newline = TRUE;
00332   else
00333     last_char_was_newline = FALSE;
00334   empty_block = force_eol;       // About to write a real word
00335 
00336   if (unlv_tilde_crunching &&
00337     last_char_was_tilde &&
00338     (word->word->space () == 0) &&
00339     !(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) &&
00340   (word->best_choice->string ()[0] == ' ')) {
00341     /* Prevent adjacent tilde across words - we know that adjacent
00342       tildes within words have been removed */
00343     ptr = (char *) word->best_choice->string ().string ();
00344     strcpy (ptr, ptr + 1);       //shuffle up
00345     word->reject_map.remove_pos (0);
00346     blob_it = word->outword->blob_list ();
00347     delete blob_it.extract ();   // get rid of reject blob
00348   }
00349   if (newline_type ||
00350     (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
00351     last_char_was_tilde = FALSE;
00352   else {
00353     if (word->reject_map.length () > 0) {
00354       if (word->best_choice->string ()[word->reject_map.length () - 1] ==
00355         ' ')
00356         last_char_was_tilde = TRUE;
00357       else
00358         last_char_was_tilde = FALSE;
00359     }
00360     else if (word->word->space () > 0)
00361       last_char_was_tilde = FALSE;
00362     /* else it is unchanged as there are no output chars */
00363   }
00364 
00365   ptr = (char *) word->best_choice->string ().string ();
00366   ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
00367 
00368   if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
00369     ensure_rep_chars_are_consistent(word); 
00370 
00371   set_unlv_suspects(word); 
00372   check_debug_pt (word, 120);
00373   if (tessedit_rejection_debug) {
00374     tprintf ("Dict word: \"%s\": %d\n",
00375       word->best_choice->string ().string (),
00376       dict_word (word->best_choice->string ().string ()));
00377   }
00378 
00379   if (tessedit_write_unlv) {
00380     write_unlv_text(word); 
00381   }
00382 
00383   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
00384     repetition_code = "|^~R";
00385     repetition_code += get_rep_char (word);
00386     wordstr = &repetition_code;
00387   }
00388   else {
00389     wordstr = &(word->best_choice->string ());
00390     if (tessedit_zero_rejection) {
00391       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00392       text = wordstr->string ();
00393       for (i = 0; text[i] != '\0'; i++) {
00394         if (word->reject_map[i].rejected ())
00395           word->reject_map[i].setrej_minimal_rej_accept ();
00396       }
00397     }
00398     if (tessedit_minimal_rejection) {
00399       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00400       text = wordstr->string ();
00401       for (i = 0; text[i] != '\0'; i++) {
00402         if ((text[i] != ' ') && word->reject_map[i].rejected ())
00403           word->reject_map[i].setrej_minimal_rej_accept ();
00404       }
00405     }
00406   }
00407 
00408   if (write_to_shm)
00409     write_shm_text (word, page_res_it.block ()->block,
00410       page_res_it.row (), *wordstr);
00411 
00412   if (tessedit_write_output)
00413     write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
00414 
00415   if (tessedit_write_raw_output)
00416     write_cooked_text (word->word, word->raw_choice->string (),
00417       TRUE, FALSE, rawfile);
00418 
00419   if (tessedit_write_txt_map)
00420     write_map(txt_mapfile, word); 
00421 
00422   ep_choice = make_epaper_choice (word, newline_type);
00423   word->ep_choice = ep_choice;
00424 
00425   character_count += word->best_choice->string ().length ();
00426   word_count++;
00427 }
00428 
00439 WERD_CHOICE *make_epaper_choice(                   //convert one word
00440                                 WERD_RES *word,    //word to do
00441                                 char newline_type  //type of newline
00442                                ) {
00443   INT16 index = 0;               //to string
00444   INT16 blobindex;               //to word
00445   INT16 prevright = 0;           //right of previous blob
00446   INT16 nextleft;                //left of next blob
00447   PBLOB *blob;
00448   BOX inset_box;                 //bounding box
00449   PBLOB_IT blob_it;              //blob iterator
00450   char word_string[MAX_PATH];    //converted string
00451   BOOL8 force_total_reject;
00452   char unrecognised = STRING (unrecognised_char)[0];
00453 
00454   blob_it.set_to_list (word->outword->blob_list ());
00455 
00456   ASSERT_HOST (word->reject_map.length () ==
00457     word->best_choice->string ().length ());
00458   /*
00459   tprintf( "\"%s\" -> length: %d;  blobcount: %d (%d)\n",
00460       word->best_choice->string().string(),
00461         word->best_choice->string().length(),
00462       blob_it.length(),
00463         blob_count( word->outword ) );
00464   */
00465 
00466   if (word->best_choice->string ().length () == 0)
00467     force_total_reject = TRUE;
00468   else {
00469     force_total_reject = FALSE;
00470     ASSERT_HOST (blob_it.length () ==
00471       word->best_choice->string ().length ());
00472   }
00473   if (!blob_it.empty ()) {
00474     for (index = 0; index < word->word->space (); index++)
00475       word_string[index] = ' ';  //leading blanks
00476   }
00477   /*
00478   Why does this generate leading blanks regardless of whether the
00479   word_choice string is empty, when write_cooked_text only generates leading
00480   blanks when the string is NOT empty???. 
00481   */
00482 
00483   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
00484     strcpy (word_string + index, "|^~R");
00485     index += 4;
00486     word_string[index++] = get_rep_char (word);
00487   }
00488   else {
00489     if (!blob_it.empty ())
00490       prevright = blob_it.data ()->bounding_box ().left ();
00491     //actually first left
00492     for (blobindex = 0, blob_it.mark_cycle_pt ();
00493     !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
00494       blob = blob_it.data ();
00495       if (word->reject_map[blobindex].accepted ()) {
00496         if (word->best_choice->string ()[blobindex] == ' ')
00497           word_string[index++] = unrecognised; // but not rejected!!
00498         else
00499           word_string[index++] =
00500             word->best_choice->string ()[blobindex];
00501       }
00502       else {
00503         inset_box = blob->bounding_box (); // start reject
00504         /* Extend reject box to include rejected neighbours */
00505         while (!blob_it.at_last () &&
00506           (force_total_reject ||
00507         (word->reject_map[blobindex + 1].rejected ()))) {
00508           blobindex++;
00509           blob = blob_it.forward ();
00510           inset_box += blob->bounding_box (); // get total box
00511         }
00512         if (blob_it.at_last ())
00513           nextleft = inset_box.right ();
00514         else
00515           nextleft = blob_it.data_relative (1)->bounding_box ().left ();
00516 
00517         //       tprintf("Making reject from (%d,%d)->(%d,%d)\n",
00518         //          inset_box.left(),inset_box.bottom(),
00519         //          inset_box.right(),inset_box.top());
00520 
00521         index += make_reject (&inset_box, prevright, nextleft,
00522           &word->denorm, &word_string[index]);
00523       }
00524       prevright = blob->bounding_box ().right ();
00525     }
00526   }
00527   if (newline_type)
00528     word_string[index++] = newline_type; // end line
00529   word_string[index] = '\0';     // terminate string
00530   if (strlen (word_string) != index) {
00531     tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
00532       word_string, index, strlen (word_string));
00533   }
00534   ASSERT_HOST (strlen (word_string) == index); // don't pass any zeros
00535   return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
00536 }
00537 
00538 
00548 INT16 make_reject (
00549 BOX * inset_box, 
00550 INT16 prevright,
00551 INT16 nextleft,
00552 DENORM * denorm,
00553 char word_string[]
00554 ) {
00555   INT16 index;                   // to string
00556   INT16 xpos;                    // start of inset
00557   INT16 ypos;
00558   INT16 width;                   // size of inset
00559   INT16 height;
00560   INT16 left_offset;             // shift form prev char
00561   INT16 right_offset;            // shift to next char
00562   INT16 baseline_offset;         // shift from baseline
00563   INT16 inset_index = 0;         // number of inset
00564   INT16 min_chars;               // min width estimate
00565   INT16 max_chars;               // max width estimate
00566   float x_centre;                // centre of box
00567 
00568   index = 0;
00569   x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
00570   left_offset =
00571     (INT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
00572   right_offset =
00573     (INT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
00574   xpos = (INT16) floor (denorm->x (inset_box->left ()));
00575   width = (INT16) ceil (denorm->x (inset_box->right ())) - xpos;
00576   ypos = (INT16) floor (denorm->y (inset_box->bottom (), x_centre));
00577   height = (INT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
00578   baseline_offset = ypos - (INT16) denorm->y (bln_baseline_offset, x_centre);
00579   word_string[index++] = CTRL_INSET; // escape code
00580   min_chars = (INT16) ceil (0.27 * width / denorm->row ()->x_height ());
00581   max_chars = (INT16) floor (1.8 * width / denorm->row ()->x_height ());
00582   /*
00583   Ensure min_chars and max_chars are in the range 0..254. This ensures that
00584   we can add 1 to them to avoid putting \0 in a string, and still not exceed
00585   the max value in a byte.
00586   */
00587   if (min_chars < 0)
00588     min_chars = 0;
00589   if (min_chars > 254)
00590     min_chars = 254;
00591   if (max_chars < min_chars)
00592     max_chars = min_chars;
00593   if (max_chars > 254)
00594     max_chars = 254;
00595   word_string[index++] = min_chars + 1; // min chars
00596   word_string[index++] = max_chars + 1; // max chars
00597   word_string[index++] = 2;      //type?
00598   word_string[index++] = inset_index / 255 + 1; // store index
00599   word_string[index++] = inset_index % 255 + 1;
00600   return index;                  // size of string
00601 }
00602 
00603 
00613 char determine_newline_type(
00614                             WERD *word,
00615                             BLOCK *block,
00616                             WERD *next_word,
00617                             BLOCK *next_block
00618                            ) {
00619   INT16 end_gap;                 // to right edge
00620   INT16 width;                   // of next word
00621   BOX word_box;                  // bounding
00622   BOX next_box;                  // next word
00623   BOX block_box;                 // block bounding
00624 
00625   if (!word->flag (W_EOL))
00626     return FALSE;                // not end of line
00627   if (next_word == NULL || next_block == NULL || block != next_block)
00628     return CTRL_NEWLINE;
00629   if (next_word->space () > 0)
00630     return CTRL_HARDLINE;        //it is tabbed
00631   word_box = word->bounding_box ();
00632   next_box = next_word->bounding_box ();
00633   block_box = block->bounding_box ();
00634                                  //gap to eol
00635   end_gap = block_box.right () - word_box.right ();
00636   end_gap -= (INT32) block->space ();
00637   width = next_box.right () - next_box.left ();
00638   //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
00639   //              block_box.right(),word_box.right(),end_gap,
00640   //              next_box.right(),next_box.left(),width,
00641   //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
00642   return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
00643 }
00644 
00645 
00657 void write_cooked_text(
00658                        WERD *word,
00659                        const STRING &text,
00660                        BOOL8 acceptable,
00661                        BOOL8 pass2,
00662                        FILE *fp
00663                       ) {
00664   INT16 index;                   //blank counter
00665   int status;
00666   static int newaline = 1;
00667   static int havespace = 0;
00668   char buff[512];
00669   const char *wordstr = text.string ();
00670   int i = 0;
00671   char unrecognised = STRING (unrecognised_char)[0];
00672   static int old_segs = 0;
00673   BOX mybox;
00674   for (i = 0; wordstr[i] != '\0'; i++) {
00675     if (wordstr[i] == ' ')
00676       buff[i] = unrecognised;
00677     else
00678       buff[i] = wordstr[i];
00679   }
00680   buff[i] = '\0';
00681 
00682   if (fp == stdout) {
00683     tprintf ("Cooked=%s, %d segs, acceptable=%d",
00684       buff, num_popped - old_segs, acceptable);
00685     old_segs = num_popped;
00686     return;
00687   }
00688 
00689   if (text.length () > 0) {
00690     for (index = 0; index < word->space (); index++) {
00691       status = fprintf (fp, " ");
00692       havespace = 1;
00693       if (status < 0)
00694         WRITEFAILED.error ("write_cooked_text", EXIT,
00695           "Space Errno: %d", errno);
00696     }
00697     if (pass2) {
00698       status = fprintf (fp, BOLD_ON);
00699       if (status < 0)
00700         WRITEFAILED.error ("write_cooked_text", EXIT,
00701           "Bold Errno: %d", errno);
00702     }
00703     if (!acceptable) {
00704       status = fprintf (fp, UNDERLINE_ON);
00705       if (status < 0)
00706         WRITEFAILED.error ("write_cooked_text", EXIT,
00707           "Underline Errno: %d", errno);
00708     }
00709 
00710                                  //xiaofan
00711     if (NO_BLOCK && word && strlen (buff)) {
00712       mybox = word->bounding_box ();
00713       if (newaline || !havespace) {
00714         fprintf (fp, " ");
00715         newaline = 0;
00716       }
00717       fprintf (fp, "(%d," INT32FORMAT ",%d," INT32FORMAT ")",
00718         XOFFSET + mybox.left (),
00719         YOFFSET + page_image.get_ysize () - mybox.top (),
00720         XOFFSET + mybox.right (),
00721         YOFFSET + page_image.get_ysize () - mybox.bottom ());
00722       havespace = 0;
00723     }
00724 
00725     status = fprintf (fp, "%s", buff);
00726     if (status < 0)
00727       WRITEFAILED.error ("write_cooked_text", EXIT,
00728         "Word Errno: %d", errno);
00729     if (pass2) {
00730       status = fprintf (fp, BOLD_OFF);
00731       if (status < 0)
00732         WRITEFAILED.error ("write_cooked_text", EXIT,
00733           "Bold off Errno: %d", errno);
00734     }
00735     if (!acceptable) {
00736       status = fprintf (fp, UNDERLINE_OFF);
00737       if (status < 0)
00738         WRITEFAILED.error ("write_cooked_text", EXIT,
00739           "Underline off Errno: %d", errno);
00740     }
00741   }
00742   if (word->flag (W_EOL)) {
00743     status = fprintf (fp, "\n");
00744     newaline = 1;
00745     if (status < 0)
00746       WRITEFAILED.error ("write_cooked_text", EXIT,
00747         "Newline Errno: %d", errno);
00748   }
00749   status = fflush (fp);
00750   if (status != 0)
00751     WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
00752 }
00753 
00754 
00758 void write_shm_text(                    //write output
00759                     WERD_RES *word,     //word to do
00760                     BLOCK *block,       //block it is from
00761                     ROW_RES *row,       //row it is from
00762                     const STRING &text  //text to write
00763                    ) {
00764   INT32 index;                   //char counter
00765   INT32 index2;                  //char counter
00766   INT32 length;                  //chars in word
00767   INT32 ptsize;                  //font size
00768   INT8 blanks;                   //blanks in word
00769   UINT8 enhancement;             //bold etc
00770   UINT8 font;                    //font index
00771   char unrecognised = STRING (unrecognised_char)[0];
00772   PBLOB *blob;
00773   BOX blob_box;                  //bounding box
00774   PBLOB_IT blob_it;              //blob iterator
00775   WERD copy_outword;             // copy to denorm
00776   UINT32 rating;                 //of char
00777   BOOL8 lineend;                 //end of line
00778 
00779                                  //point size
00780   ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
00781   if (word->word->flag (W_BOL) && ocr_char_space () < 128
00782     && ocr_send_text (TRUE) != OKAY)
00783     return;                      //release failed
00784   copy_outword = *(word->outword);
00785   copy_outword.baseline_denormalise (&word->denorm);
00786   blob_it.set_to_list (copy_outword.blob_list ());
00787   length = text.length ();
00788 
00789   if (length > 0) {
00790     blanks = word->word->space ();
00791     if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
00792       blanks = 1;
00793     for (index = 0; index < length; index++, blob_it.forward ()) {
00794       blob = blob_it.data ();
00795       blob_box = blob->bounding_box ();
00796 
00797       enhancement = 0;
00798       if (word->italic > 0 || word->italic == 0 && row->italic > 0)
00799         enhancement |= EUC_ITALIC;
00800       if (word->bold > 0 || word->bold == 0 && row->bold > 0)
00801         enhancement |= EUC_BOLD;
00802       if (tessedit_write_ratings)
00803         rating = (UINT32) (-word->best_choice->certainty () / 0.035);
00804       else if (tessedit_zero_rejection)
00805         rating = text[index] == ' ' ? 100 : 0;
00806       else
00807         rating = word->reject_map[index].accepted ()? 0 : 100;
00808       if (rating > 255)
00809         rating = 255;
00810       if (word->font1_count > 2)
00811         font = word->font1;
00812       else if (row->font1_count > 8)
00813         font = row->font1;
00814       else
00815                                  //font index
00816         font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
00817 
00818       lineend = word->word->flag (W_EOL) && index == length - 1;
00819       if (word->word->flag (W_EOL) && tessedit_zero_rejection
00820       && index < length - 1 && text[index + 1] == ' ') {
00821         for (index2 = index + 1; index2 < length && text[index2] == ' ';
00822           index2++);
00823         if (index2 == length)
00824           lineend = TRUE;
00825       }
00826 
00827       if (!tessedit_zero_rejection || text[index] != ' '
00828       || tessedit_word_for_word) {
00829                                  //confidence
00830         ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating,
00831           ptsize,                //point size
00832           blanks, enhancement,   //enhancement
00833           OCR_CDIR_LEFT_RIGHT,
00834           OCR_LDIR_DOWN_RIGHT,
00835           lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
00836         blanks = 0;
00837       }
00838 
00839     }
00840   }
00841   else if (tessedit_word_for_word) {
00842     blanks = word->word->space ();
00843     if (blanks == 0 && !word->word->flag (W_BOL))
00844       blanks = 1;
00845     blob_box = word->word->bounding_box ();
00846 
00847     enhancement = 0;
00848     if (word->italic > 0)
00849       enhancement |= EUC_ITALIC;
00850     if (word->bold > 0)
00851       enhancement |= EUC_BOLD;
00852     rating = 100;
00853     if (word->font1_count > 2)
00854       font = word->font1;
00855     else if (row->font1_count > 8)
00856       font = row->font1;
00857     else
00858                                  //font index
00859       font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
00860 
00861     lineend = word->word->flag (W_EOL);
00862 
00863                                  //font index
00864     ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font,
00865       rating,                    //confidence
00866       ptsize,                    //point size
00867       blanks, enhancement,       //enhancement
00868       OCR_CDIR_LEFT_RIGHT,
00869       OCR_LDIR_DOWN_RIGHT,
00870       lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
00871   }
00872 }
00873 
00874 
00891 void write_map(                //output a map file
00892                FILE *mapfile,  //mapfile to write to
00893                WERD_RES *word) {
00894   INT16 index;
00895   int status;
00896   STRING mapstr = "";
00897 
00898   if (word->best_choice->string ().length () > 0) {
00899     for (index = 0; index < word->word->space (); index++) {
00900       if (word->reject_spaces &&
00901         (suspect_level >= suspect_space_level) &&
00902         !tessedit_minimal_rejection && !tessedit_zero_rejection)
00903         /*
00904        Write rejected spaces to .map file ONLY. Newdiff converts
00905        these back to accepted spaces AFTER generating basic space
00906        stats but BEFORE using .etx
00907        */
00908         status = fprintf (mapfile, "0");
00909       else
00910         status = fprintf (mapfile, "1");
00911       if (status < 0)
00912         WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
00913     }
00914 
00915     if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
00916       for (index = 0; index < 5; index++)
00917         mapstr += '1';
00918     }
00919     else {
00920       ASSERT_HOST (word->reject_map.length () ==
00921         word->best_choice->string ().length ());
00922 
00923       for (index = 0; index < word->reject_map.length (); index++) {
00924         if (word->reject_map[index].accepted ())
00925           mapstr += '1';
00926         else
00927           mapstr += '0';
00928       }
00929     }
00930     status = fprintf (mapfile, "%s", mapstr.string ());
00931     if (status < 0)
00932       WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
00933   }
00934   if (word->word->flag (W_EOL)) {
00935     status = fprintf (mapfile, "\n");
00936     if (status < 0)
00937       WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
00938   }
00939   status = fflush (mapfile);
00940   if (status != 0)
00941     WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
00942 }
00943 
00944 
00952 FILE *open_outfile(
00953                    const char *extension) {
00954   STRING file_name;
00955   FILE *outfile;
00956 
00957   file_name = imagebasename + extension;
00958   if (!(outfile = fopen (file_name.string (), "w"))) {
00959     CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
00960       file_name.string (), errno);
00961   }
00962   return outfile;
00963 }
00964 
00965 
00971 void write_unlv_text(WERD_RES *word) { 
00972   const char *wordstr;
00973 
00974   char buff[512];                //string to output
00975   int i = 0;
00976   int j = 0;
00977   char unrecognised = STRING (unrecognised_char)[0];
00978   int status;
00979   char space_str[3];
00980 
00981   wordstr = word->best_choice->string ().string ();
00982 
00983   /*
00984   DONT need to do anything special for repeated char words - at
00985   this stage the repetition char has been identified and any other
00986   chars have been rejected.
00987   */
00988 
00989   for (; wordstr[i] != '\0'; i++) {
00990     if ((wordstr[i] == ' ') ||
00991       (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
00992       buff[j++] = unrecognised;
00993     else {
00994       if (word->reject_map[i].rejected ())
00995         buff[j++] = '^';         //Add suspect marker
00996       buff[j++] = wordstr[i];
00997     }
00998   }
00999   buff[j] = '\0';
01000 
01001   if (strlen (wordstr) > 0) {
01002     if (word->reject_spaces &&
01003       (suspect_level >= suspect_space_level) &&
01004       !tessedit_minimal_rejection && !tessedit_zero_rejection)
01005       strcpy (space_str, "^ ");  //Suspect space
01006     else
01007       strcpy (space_str, " ");   //Certain space
01008 
01009     for (i = 0; i < word->word->space (); i++) {
01010       status = fprintf (unlv_file, "%s", space_str);
01011       if (status < 0)
01012         WRITEFAILED.error ("write_unlv_text", EXIT,
01013           "Space Errno: %d", errno);
01014     }
01015 
01016     status = fprintf (unlv_file, "%s", buff);
01017     if (status < 0)
01018       WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
01019   }
01020   if (word->word->flag (W_EOL)) {
01021     status = fprintf (unlv_file, "\n");
01022     if (status < 0)
01023       WRITEFAILED.error ("write_unlv_text", EXIT,
01024         "Newline Errno: %d", errno);
01025   }
01026   status = fflush (unlv_file);
01027   if (status != 0)
01028     WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
01029 }
01030 
01031 
01040 char get_rep_char(
01041                   WERD_RES *word) {
01042   int i;
01043 
01044   for (i = 0;
01045     ((i < word->reject_map.length ()) &&
01046     (word->reject_map[i].rejected ())); i++);
01047   if (i < word->reject_map.length ())
01048     return word->best_choice->string ()[i];
01049   else
01050     return STRING (unrecognised_char)[0];
01051 }
01052 
01053 
01059 void ensure_rep_chars_are_consistent(WERD_RES *word) { 
01060   char rep_char = get_rep_char (word);
01061   char *ptr;
01062 
01063   ptr = (char *) word->best_choice->string ().string ();
01064   for (; *ptr != '\0'; ptr++) {
01065     if (*ptr != rep_char)
01066       *ptr = rep_char;
01067   }
01068 }
01069 
01070 
01085 void set_unlv_suspects(WERD_RES *word) { 
01086   int len = word->reject_map.length ();
01087   int i;
01088   const char *ptr;
01089   float rating_per_ch;
01090 
01091   ptr = word->best_choice->string ().string ();
01092 
01093   if (suspect_level == 0) {
01094     for (i = 0; i < len; i++) {
01095       if (word->reject_map[i].rejected ())
01096         word->reject_map[i].setrej_minimal_rej_accept ();
01097     }
01098     return;
01099   }
01100 
01101   if (suspect_level >= 3)
01102     return;                      // Use defaults
01103 
01104   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
01105 
01106   if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) {
01107     /* Unreject alphas in dictionary words */
01108     for (i = 0; i < len; i++) {
01109       if (word->reject_map[i].rejected () && isalpha (ptr[i]))
01110         word->reject_map[i].setrej_minimal_rej_accept ();
01111     }
01112   }
01113 
01114   rating_per_ch = word->best_choice->rating () / word->reject_map.length ();
01115 
01116   if (rating_per_ch >= suspect_rating_per_ch)
01117     return;                      //Dont touch bad ratings
01118 
01119   if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
01120     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
01121     for (i = 0; i < len; i++) {
01122       if (word->reject_map[i].rejected () && (ptr[i] != ' '))
01123         word->reject_map[i].setrej_minimal_rej_accept ();
01124     }
01125   }
01126 
01127   for (i = 0; i < len; i++) {
01128     if (word->reject_map[i].rejected ()) {
01129       if (word->reject_map[i].flag (R_DOC_REJ))
01130         word->reject_map[i].setrej_minimal_rej_accept ();
01131       if (word->reject_map[i].flag (R_BLOCK_REJ))
01132         word->reject_map[i].setrej_minimal_rej_accept ();
01133       if (word->reject_map[i].flag (R_ROW_REJ))
01134         word->reject_map[i].setrej_minimal_rej_accept ();
01135     }
01136   }
01137 
01138   if (suspect_level == 2)
01139     return;
01140 
01141   if (!suspect_constrain_1Il ||
01142   (word->reject_map.length () <= suspect_short_words)) {
01143     for (i = 0; i < len; i++) {
01144       if (word->reject_map[i].rejected ()) {
01145         if ((word->reject_map[i].flag (R_1IL_CONFLICT) ||
01146           word->reject_map[i].flag (R_POSTNN_1IL)))
01147           word->reject_map[i].setrej_minimal_rej_accept ();
01148 
01149         if (!suspect_constrain_1Il &&
01150           word->reject_map[i].flag (R_MM_REJECT))
01151           word->reject_map[i].setrej_minimal_rej_accept ();
01152       }
01153     }
01154   }
01155 
01156   if ((acceptable_word_string (word->best_choice->string ().string ())
01157     != AC_UNACCEPTABLE) ||
01158   acceptable_number_string (word->best_choice->string ().string ())) {
01159     if (word->reject_map.length () > suspect_short_words) {
01160       for (i = 0; i < len; i++) {
01161         if (word->reject_map[i].rejected () &&
01162           (!word->reject_map[i].perm_rejected () ||
01163           word->reject_map[i].flag (R_1IL_CONFLICT) ||
01164           word->reject_map[i].flag (R_POSTNN_1IL) ||
01165         word->reject_map[i].flag (R_MM_REJECT))) {
01166           word->reject_map[i].setrej_minimal_rej_accept ();
01167         }
01168       }
01169     }
01170   }
01171 }
01172 
01173 
/**********************************************************************
 * count_alphas
 *
 * Return the number of characters in the NUL terminated string s for
 * which isalpha() is true.
 **********************************************************************/
INT16 count_alphas(
                   const char *s) {
  int count = 0;

  for (; *s != '\0'; s++) {
                                 //cast: a negative char passed to isalpha
                                 //is undefined behaviour
    if (isalpha ((unsigned char) *s))
      count++;
  }
  return count;
}
01190 
01191 
/**********************************************************************
 * count_alphanums
 *
 * Return the number of characters in the NUL terminated string s for
 * which isalnum() is true.
 **********************************************************************/
INT16 count_alphanums(
                      const char *s) {
  int count = 0;

  for (; *s != '\0'; s++) {
                                 //cast: a negative char passed to isalnum
                                 //is undefined behaviour
    if (isalnum ((unsigned char) *s))
      count++;
  }
  return count;
}
01208 
01209 
/**********************************************************************
 * acceptable_number_string
 *
 * Return TRUE if s looks like a number:
 *   - an optional leading '(' followed by an optional one of "$.+-";
 *   - then digits, where a single '.', ',' or '-' may follow a digit;
 *   - optionally ending, straight after a digit, in "%", ")" or "%)".
 * Any other character makes the result FALSE.  Note that a string
 * which is empty once the optional prefix has been skipped is
 * accepted.
 **********************************************************************/
BOOL8 acceptable_number_string(const char *s) {
  BOOL8 prev_digit = FALSE;      //was previous char a digit?

  if (*s == '(')
    s++;

  if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))
    s++;

  for (; *s != '\0'; s++) {
                                 //cast: a negative char passed to isdigit
                                 //is undefined behaviour
    if (isdigit ((unsigned char) *s))
      prev_digit = TRUE;
    else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
      prev_digit = FALSE;        //separator must be followed by a digit
    else if (prev_digit &&
      (*(s + 1) == '\0') && ((*s == '%') || (*s == ')')))
      return TRUE;               //trailing % or )
    else if (prev_digit &&
      (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0'))
      return TRUE;               //trailing %)
    else
      return FALSE;
  }
  return TRUE;
}

Generated on Wed Feb 28 19:49:07 2007 for Tesseract by  doxygen 1.5.1