ccmain/output.h File Reference

#include "varable.h"
#include "pageres.h"
#include "notdll.h"

Go to the source code of this file.

Functions


Function Documentation

BOOL8 acceptable_number_string ( const char *  s  ) 

Rules for determining if string is a valid number (of some kind).

Parameters:
s String
Returns:
TRUE if string matches what COULD be a number

Definition at line 1216 of file output.cpp.

References FALSE, and TRUE.

Referenced by set_unlv_suspects().

01216                                               { 
01217   BOOL8 prev_digit = FALSE;
01218 
01219   if (*s == '(')
01220     s++;
01221 
01222   if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))
01223     s++;
01224 
01225   for (; *s != '\0'; s++) {
01226     if (isdigit (*s))
01227       prev_digit = TRUE;
01228     else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
01229       prev_digit = FALSE;
01230     else if (prev_digit &&
01231       (*(s + 1) == '\0') && ((*s == '%') || (*s == ')')))
01232       return TRUE;
01233     else if (prev_digit &&
01234       (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0'))
01235       return TRUE;
01236     else
01237       return FALSE;
01238   }
01239   return TRUE;
01240 }

INT16 count_alphanums ( const char *  s  ) 

Count the number of characters in the string for which isalnum() is true.

Parameters:
s String
Returns:
count of characters that are alphanumeric

Definition at line 1198 of file output.cpp.

References count().

01199                                      {
01200   int count = 0;
01201 
01202   for (; *s != '\0'; s++) {
01203     if (isalnum (*s))
01204       count++;
01205   }
01206   return count;
01207 }

INT16 count_alphas ( const char *  s  ) 

Count the number of characters in the string for which isalpha() is true.

Parameters:
s String
Returns:
count of characters that are alphabetic

Definition at line 1180 of file output.cpp.

References count().

Referenced by set_unlv_suspects().

01181                                   {
01182   int count = 0;
01183 
01184   for (; *s != '\0'; s++) {
01185     if (isalpha (*s))
01186       count++;
01187   }
01188   return count;
01189 }

char determine_newline_type ( WERD word,
BLOCK block,
WERD next_word,
BLOCK next_block 
)

Find whether we have a wrapping or hard newline.

Parameters:
word word to do
block current block
next_word next word
next_block block of next word
Returns:
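FALSE if the word is not at end of line; otherwise CTRL_NEWLINE (wrapping newline, also used when there is no next word or the next word is in a different block) or CTRL_HARDLINE (hard newline).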

Definition at line 613 of file output.cpp.

References WERD::bounding_box(), CTRL_HARDLINE, CTRL_NEWLINE, FALSE, WERD::flag(), BOX::left(), NULL, BOX::right(), WERD::space(), and W_EOL.

Referenced by output_pass().

00618                              {
00619   INT16 end_gap;                 // to right edge
00620   INT16 width;                   // of next word
00621   BOX word_box;                  // bounding
00622   BOX next_box;                  // next word
00623   BOX block_box;                 // block bounding
00624 
00625   if (!word->flag (W_EOL))
00626     return FALSE;                // not end of line
00627   if (next_word == NULL || next_block == NULL || block != next_block)
00628     return CTRL_NEWLINE;
00629   if (next_word->space () > 0)
00630     return CTRL_HARDLINE;        //it is tabbed
00631   word_box = word->bounding_box ();
00632   next_box = next_word->bounding_box ();
00633   block_box = block->bounding_box ();
00634                                  //gap to eol
00635   end_gap = block_box.right () - word_box.right ();
00636   end_gap -= (INT32) block->space ();
00637   width = next_box.right () - next_box.left ();
00638   //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
00639   //              block_box.right(),word_box.right(),end_gap,
00640   //              next_box.right(),next_box.left(),width,
00641   //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
00642   return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
00643 }

void ensure_rep_chars_are_consistent ( WERD_RES word  ) 

Enforce repeating characters acceptable.

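And this means what? As the listing below shows: every character of the word's best_choice string is overwritten in place with the character returned by get_rep_char().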

Definition at line 1059 of file output.cpp.

References WERD_RES::best_choice, and get_rep_char().

Referenced by write_results().

01059                                                      { 
01060   char rep_char = get_rep_char (word);
01061   char *ptr;
01062 
01063   ptr = (char *) word->best_choice->string ().string ();
01064   for (; *ptr != '\0'; ptr++) {
01065     if (*ptr != rep_char)
01066       *ptr = rep_char;
01067   }
01068 }

char get_rep_char ( WERD_RES word  ) 

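Return the first accepted (i.e. not rejected) character from the repetition string, or the first character of unrecognised_char if every character in the reject map is rejected.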

Parameters:
word Word
This is the character which is repeated, as determined earlier by fix_rep_char()

Definition at line 1040 of file output.cpp.

References WERD_RES::best_choice, REJMAP::length(), and WERD_RES::reject_map.

Referenced by ensure_rep_chars_are_consistent(), make_epaper_choice(), and write_results().

01041                                   {
01042   int i;
01043 
01044   for (i = 0;
01045     ((i < word->reject_map.length ()) &&
01046     (word->reject_map[i].rejected ())); i++);
01047   if (i < word->reject_map.length ())
01048     return word->best_choice->string ()[i];
01049   else
01050     return STRING (unrecognised_char)[0];
01051 }

WERD_CHOICE* make_epaper_choice ( WERD_RES word,
char  newline_type 
)

Convert one word.

Parameters:
word word to do
newline_type type of newline
Returns:
word
Construct the epaper text string for a word, using the reject map to determine whether each blob should be rejected.

Definition at line 439 of file output.cpp.

References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), PBLOB::bounding_box(), WERD_RES::denorm, FALSE, WERD::flag(), get_rep_char(), REJMAP::length(), make_reject(), MAX_PATH, NO_PERM, WERD_RES::outword, WERD_RES::reject_map, BOX::right(), WERD::space(), tprintf(), TRUE, W_REP_CHAR, and WERD_RES::word.

Referenced by write_results().

00442                                  {
00443   INT16 index = 0;               //to string
00444   INT16 blobindex;               //to word
00445   INT16 prevright = 0;           //right of previous blob
00446   INT16 nextleft;                //left of next blob
00447   PBLOB *blob;
00448   BOX inset_box;                 //bounding box
00449   PBLOB_IT blob_it;              //blob iterator
00450   char word_string[MAX_PATH];    //converted string
00451   BOOL8 force_total_reject;
00452   char unrecognised = STRING (unrecognised_char)[0];
00453 
00454   blob_it.set_to_list (word->outword->blob_list ());
00455 
00456   ASSERT_HOST (word->reject_map.length () ==
00457     word->best_choice->string ().length ());
00458   /*
00459   tprintf( "\"%s\" -> length: %d;  blobcount: %d (%d)\n",
00460       word->best_choice->string().string(),
00461         word->best_choice->string().length(),
00462       blob_it.length(),
00463         blob_count( word->outword ) );
00464   */
00465 
00466   if (word->best_choice->string ().length () == 0)
00467     force_total_reject = TRUE;
00468   else {
00469     force_total_reject = FALSE;
00470     ASSERT_HOST (blob_it.length () ==
00471       word->best_choice->string ().length ());
00472   }
00473   if (!blob_it.empty ()) {
00474     for (index = 0; index < word->word->space (); index++)
00475       word_string[index] = ' ';  //leading blanks
00476   }
00477   /*
00478   Why does this generate leading blanks regardless of whether the
00479   word_choice string is empty, when write_cooked_text only generates leading
00480   blanks when the string is NOT empty???. 
00481   */
00482 
00483   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
00484     strcpy (word_string + index, "|^~R");
00485     index += 4;
00486     word_string[index++] = get_rep_char (word);
00487   }
00488   else {
00489     if (!blob_it.empty ())
00490       prevright = blob_it.data ()->bounding_box ().left ();
00491     //actually first left
00492     for (blobindex = 0, blob_it.mark_cycle_pt ();
00493     !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
00494       blob = blob_it.data ();
00495       if (word->reject_map[blobindex].accepted ()) {
00496         if (word->best_choice->string ()[blobindex] == ' ')
00497           word_string[index++] = unrecognised; // but not rejected!!
00498         else
00499           word_string[index++] =
00500             word->best_choice->string ()[blobindex];
00501       }
00502       else {
00503         inset_box = blob->bounding_box (); // start reject
00504         /* Extend reject box to include rejected neighbours */
00505         while (!blob_it.at_last () &&
00506           (force_total_reject ||
00507         (word->reject_map[blobindex + 1].rejected ()))) {
00508           blobindex++;
00509           blob = blob_it.forward ();
00510           inset_box += blob->bounding_box (); // get total box
00511         }
00512         if (blob_it.at_last ())
00513           nextleft = inset_box.right ();
00514         else
00515           nextleft = blob_it.data_relative (1)->bounding_box ().left ();
00516 
00517         //       tprintf("Making reject from (%d,%d)->(%d,%d)\n",
00518         //          inset_box.left(),inset_box.bottom(),
00519         //          inset_box.right(),inset_box.top());
00520 
00521         index += make_reject (&inset_box, prevright, nextleft,
00522           &word->denorm, &word_string[index]);
00523       }
00524       prevright = blob->bounding_box ().right ();
00525     }
00526   }
00527   if (newline_type)
00528     word_string[index++] = newline_type; // end line
00529   word_string[index] = '\0';     // terminate string
00530   if (strlen (word_string) != index) {
00531     tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
00532       word_string, index, strlen (word_string));
00533   }
00534   ASSERT_HOST (strlen (word_string) == index); // don't pass any zeros
00535   return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
00536 }

INT16 make_reject ( BOX inset_box,
INT16  prevright,
INT16  nextleft,
DENORM denorm,
char  word_string[] 
)

Add the escape code to the string for the reject.

Parameters:
inset_box bounding box
prevright previous char
nextleft next char
denorm de-normalizer
word_string output string

Definition at line 548 of file output.cpp.

References BOX::bottom(), CTRL_INSET, BOX::left(), BOX::right(), DENORM::row(), BOX::top(), DENORM::x(), ROW::x_height(), and DENORM::y().

Referenced by make_epaper_choice().

00554   {
00555   INT16 index;                   // to string
00556   INT16 xpos;                    // start of inset
00557   INT16 ypos;
00558   INT16 width;                   // size of inset
00559   INT16 height;
00560   INT16 left_offset;             // shift form prev char
00561   INT16 right_offset;            // shift to next char
00562   INT16 baseline_offset;         // shift from baseline
00563   INT16 inset_index = 0;         // number of inset
00564   INT16 min_chars;               // min width estimate
00565   INT16 max_chars;               // max width estimate
00566   float x_centre;                // centre of box
00567 
00568   index = 0;
00569   x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
00570   left_offset =
00571     (INT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
00572   right_offset =
00573     (INT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
00574   xpos = (INT16) floor (denorm->x (inset_box->left ()));
00575   width = (INT16) ceil (denorm->x (inset_box->right ())) - xpos;
00576   ypos = (INT16) floor (denorm->y (inset_box->bottom (), x_centre));
00577   height = (INT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
00578   baseline_offset = ypos - (INT16) denorm->y (bln_baseline_offset, x_centre);
00579   word_string[index++] = CTRL_INSET; // escape code
00580   min_chars = (INT16) ceil (0.27 * width / denorm->row ()->x_height ());
00581   max_chars = (INT16) floor (1.8 * width / denorm->row ()->x_height ());
00582   /*
00583   Ensure min_chars and max_chars are in the range 0..254. This ensures that
00584   we can add 1 to them to avoid putting \0 in a string, and still not exceed
00585   the max value in a byte.
00586   */
00587   if (min_chars < 0)
00588     min_chars = 0;
00589   if (min_chars > 254)
00590     min_chars = 254;
00591   if (max_chars < min_chars)
00592     max_chars = min_chars;
00593   if (max_chars > 254)
00594     max_chars = 254;
00595   word_string[index++] = min_chars + 1; // min chars
00596   word_string[index++] = max_chars + 1; // max chars
00597   word_string[index++] = 2;      //type?
00598   word_string[index++] = inset_index / 255 + 1; // store index
00599   word_string[index++] = inset_index % 255 + 1;
00600   return index;                  // size of string
00601 }

FILE* open_outfile ( const char *  extension  ) 

Open an output file for writing (used for the .map and .unlv files).

Parameters:
extension MSDOS extension, added to imagebasename
Note:
Global: imagebasename
Returns:
file handle to file

Definition at line 952 of file output.cpp.

References CANTOPENFILE, ERRCODE::error(), EXIT, imagebasename, and STRING::string().

Referenced by output_pass().

00953                                           {
00954   STRING file_name;
00955   FILE *outfile;
00956 
00957   file_name = imagebasename + extension;
00958   if (!(outfile = fopen (file_name.string (), "w"))) {
00959     CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
00960       file_name.string (), errno);
00961   }
00962   return outfile;
00963 }

void output_pass ( PAGE_RES_IT page_res_it,
BOOL8  write_to_shm 
)

Tess output to pass to file or API.

Definition at line 122 of file output.cpp.

References BLOCK_RES::block, PAGE_RES_IT::block(), check_debug_pt(), cprintf(), determine_newline_type(), FALSE, PAGE_RES_IT::forward(), PAGE_RES_IT::next_block(), PAGE_RES_IT::next_word(), NO_BLOCK, NULL, ocr_send_text(), open_outfile(), PAGE_RES_IT::restart_page(), textfile, txt_mapfile, unlv_file, WERD_RES::word, PAGE_RES_IT::word(), and write_results().

00124                                      {
00125   BLOCK_RES *block_of_last_word;
00126   INT16 block_id;
00127   BOOL8 force_eol;               // During output
00128   BLOCK *nextblock;              // block of next word
00129   WERD *nextword;                // next word
00130 
00131 #ifdef TEXT_VERBOSE
00132   // gets a 'w', see ccmain/tesseractmain.dox
00133   cprintf("w");
00134 #endif
00135   if (tessedit_write_txt_map)
00136     txt_mapfile = open_outfile (".map");
00137   if (tessedit_write_unlv)
00138     unlv_file = open_outfile (".unlv");
00139   page_res_it.restart_page ();
00140   block_of_last_word = NULL;
00141   while (page_res_it.word () != NULL) {
00142     check_debug_pt (page_res_it.word (), 120);
00143     if (tessedit_write_block_separators &&
00144     block_of_last_word != page_res_it.block ()) {
00145       block_of_last_word = page_res_it.block ();
00146       if (block_of_last_word->block->text_region () == NULL) {
00147         if (block_of_last_word->block->poly_block () == NULL)
00148           block_id = 1;
00149         else
00150           block_id =
00151             ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())->
00152             id_no(); 
00153       }
00154       else
00155         block_id = block_of_last_word->block->text_region ()->id_no ();
00156       if (!NO_BLOCK)
00157         fprintf (textfile, "|^~tr%d\n", block_id);
00158       fprintf (txt_mapfile, "|^~tr%d\n", block_id);
00159     }
00160 
00161     force_eol = (tessedit_write_block_separators &&
00162       (page_res_it.block () != page_res_it.next_block ())) ||
00163       (page_res_it.next_word () == NULL);
00164 
00165     if (page_res_it.next_word () != NULL)
00166       nextword = page_res_it.next_word ()->word;
00167     else
00168       nextword = NULL;
00169     if (page_res_it.next_block () != NULL)
00170       nextblock = page_res_it.next_block ()->block;
00171     else
00172       nextblock = NULL;
00173     write_results (page_res_it, // regardless of tilde crunching
00174       determine_newline_type (page_res_it.word ()->word,
00175          page_res_it.block ()->block, nextword, nextblock),
00176          force_eol, write_to_shm);
00177     page_res_it.forward ();
00178   }
00179   if (write_to_shm)
00180     ocr_send_text(FALSE); 
00181   if (tessedit_write_block_separators) {
00182     if (!NO_BLOCK)
00183       fprintf (textfile, "|^~tr\n");
00184     fprintf (txt_mapfile, "|^~tr\n");
00185   }
00186   if (tessedit_write_txt_map) {
00187     fprintf (txt_mapfile, "\n"); // because txt gets one
00188     #ifdef __UNIX__
00189     fsync (fileno (txt_mapfile));
00190     #endif
00191     fclose(txt_mapfile); 
00192   }
00193 }

void set_unlv_suspects ( WERD_RES word  ) 

Modifies reject_map of word based on suspect_level.

Parameters:
word Word
Note:
Global:
  • tessedit_minimal_rejection,
  • suspect_level,
  • suspect_rating_per_ch,
  • suspect_accept_rating
  • suspect_level = 0 - don't reject ANYTHING
  • suspect_level = 1 or 2 - partial rejection
  • suspect_level = 3 (or higher) - BEST; the reject map is left at its defaults

To reject JUST tess failures in the .map file, set suspect_level to 3 and set tessedit_minimal_rejection.

Definition at line 1085 of file output.cpp.

References AC_UNACCEPTABLE, acceptable_number_string(), acceptable_word_string(), WERD_RES::best_choice, count_alphas(), REJMAP::length(), R_1IL_CONFLICT, R_BLOCK_REJ, R_DOC_REJ, R_MM_REJECT, R_POSTNN_1IL, R_ROW_REJ, WERD_RES::reject_map, safe_dict_word(), and WERD_RES::tess_accepted.

Referenced by write_results().

01085                                        { 
01086   int len = word->reject_map.length ();
01087   int i;
01088   const char *ptr;
01089   float rating_per_ch;
01090 
01091   ptr = word->best_choice->string ().string ();
01092 
01093   if (suspect_level == 0) {
01094     for (i = 0; i < len; i++) {
01095       if (word->reject_map[i].rejected ())
01096         word->reject_map[i].setrej_minimal_rej_accept ();
01097     }
01098     return;
01099   }
01100 
01101   if (suspect_level >= 3)
01102     return;                      // Use defaults
01103 
01104   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
01105 
01106   if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) {
01107     /* Unreject alphas in dictionary words */
01108     for (i = 0; i < len; i++) {
01109       if (word->reject_map[i].rejected () && isalpha (ptr[i]))
01110         word->reject_map[i].setrej_minimal_rej_accept ();
01111     }
01112   }
01113 
01114   rating_per_ch = word->best_choice->rating () / word->reject_map.length ();
01115 
01116   if (rating_per_ch >= suspect_rating_per_ch)
01117     return;                      //Dont touch bad ratings
01118 
01119   if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
01120     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
01121     for (i = 0; i < len; i++) {
01122       if (word->reject_map[i].rejected () && (ptr[i] != ' '))
01123         word->reject_map[i].setrej_minimal_rej_accept ();
01124     }
01125   }
01126 
01127   for (i = 0; i < len; i++) {
01128     if (word->reject_map[i].rejected ()) {
01129       if (word->reject_map[i].flag (R_DOC_REJ))
01130         word->reject_map[i].setrej_minimal_rej_accept ();
01131       if (word->reject_map[i].flag (R_BLOCK_REJ))
01132         word->reject_map[i].setrej_minimal_rej_accept ();
01133       if (word->reject_map[i].flag (R_ROW_REJ))
01134         word->reject_map[i].setrej_minimal_rej_accept ();
01135     }
01136   }
01137 
01138   if (suspect_level == 2)
01139     return;
01140 
01141   if (!suspect_constrain_1Il ||
01142   (word->reject_map.length () <= suspect_short_words)) {
01143     for (i = 0; i < len; i++) {
01144       if (word->reject_map[i].rejected ()) {
01145         if ((word->reject_map[i].flag (R_1IL_CONFLICT) ||
01146           word->reject_map[i].flag (R_POSTNN_1IL)))
01147           word->reject_map[i].setrej_minimal_rej_accept ();
01148 
01149         if (!suspect_constrain_1Il &&
01150           word->reject_map[i].flag (R_MM_REJECT))
01151           word->reject_map[i].setrej_minimal_rej_accept ();
01152       }
01153     }
01154   }
01155 
01156   if ((acceptable_word_string (word->best_choice->string ().string ())
01157     != AC_UNACCEPTABLE) ||
01158   acceptable_number_string (word->best_choice->string ().string ())) {
01159     if (word->reject_map.length () > suspect_short_words) {
01160       for (i = 0; i < len; i++) {
01161         if (word->reject_map[i].rejected () &&
01162           (!word->reject_map[i].perm_rejected () ||
01163           word->reject_map[i].flag (R_1IL_CONFLICT) ||
01164           word->reject_map[i].flag (R_POSTNN_1IL) ||
01165         word->reject_map[i].flag (R_MM_REJECT))) {
01166           word->reject_map[i].setrej_minimal_rej_accept ();
01167         }
01168       }
01169     }
01170   }
01171 }

void write_cooked_text ( WERD word,
const STRING text,
BOOL8  acceptable,
BOOL8  pass2,
FILE *  fp 
)

Write the cooked text (with bold for pass2 and underline for reject) to the given file.

Parameters:
word word to do
text text to write
acceptable good stuff
pass2 done on pass2
fp file to write
Returns:
none

Definition at line 657 of file output.cpp.

References BOLD_OFF, BOLD_ON, BOX::bottom(), WERD::bounding_box(), ERRCODE::error(), EXIT, WERD::flag(), IMAGE::get_ysize(), INT32FORMAT, BOX::left(), STRING::length(), NO_BLOCK, num_popped, page_image, BOX::right(), WERD::space(), STRING::string(), BOX::top(), tprintf(), UNDERLINE_OFF, UNDERLINE_ON, W_EOL, WRITEFAILED, XOFFSET, and YOFFSET.

Referenced by classify_word_pass1(), classify_word_pass2(), and write_results().

00663                         {
00664   INT16 index;                   //blank counter
00665   int status;
00666   static int newaline = 1;
00667   static int havespace = 0;
00668   char buff[512];
00669   const char *wordstr = text.string ();
00670   int i = 0;
00671   char unrecognised = STRING (unrecognised_char)[0];
00672   static int old_segs = 0;
00673   BOX mybox;
00674   for (i = 0; wordstr[i] != '\0'; i++) {
00675     if (wordstr[i] == ' ')
00676       buff[i] = unrecognised;
00677     else
00678       buff[i] = wordstr[i];
00679   }
00680   buff[i] = '\0';
00681 
00682   if (fp == stdout) {
00683     tprintf ("Cooked=%s, %d segs, acceptable=%d",
00684       buff, num_popped - old_segs, acceptable);
00685     old_segs = num_popped;
00686     return;
00687   }
00688 
00689   if (text.length () > 0) {
00690     for (index = 0; index < word->space (); index++) {
00691       status = fprintf (fp, " ");
00692       havespace = 1;
00693       if (status < 0)
00694         WRITEFAILED.error ("write_cooked_text", EXIT,
00695           "Space Errno: %d", errno);
00696     }
00697     if (pass2) {
00698       status = fprintf (fp, BOLD_ON);
00699       if (status < 0)
00700         WRITEFAILED.error ("write_cooked_text", EXIT,
00701           "Bold Errno: %d", errno);
00702     }
00703     if (!acceptable) {
00704       status = fprintf (fp, UNDERLINE_ON);
00705       if (status < 0)
00706         WRITEFAILED.error ("write_cooked_text", EXIT,
00707           "Underline Errno: %d", errno);
00708     }
00709 
00710                                  //xiaofan
00711     if (NO_BLOCK && word && strlen (buff)) {
00712       mybox = word->bounding_box ();
00713       if (newaline || !havespace) {
00714         fprintf (fp, " ");
00715         newaline = 0;
00716       }
00717       fprintf (fp, "(%d," INT32FORMAT ",%d," INT32FORMAT ")",
00718         XOFFSET + mybox.left (),
00719         YOFFSET + page_image.get_ysize () - mybox.top (),
00720         XOFFSET + mybox.right (),
00721         YOFFSET + page_image.get_ysize () - mybox.bottom ());
00722       havespace = 0;
00723     }
00724 
00725     status = fprintf (fp, "%s", buff);
00726     if (status < 0)
00727       WRITEFAILED.error ("write_cooked_text", EXIT,
00728         "Word Errno: %d", errno);
00729     if (pass2) {
00730       status = fprintf (fp, BOLD_OFF);
00731       if (status < 0)
00732         WRITEFAILED.error ("write_cooked_text", EXIT,
00733           "Bold off Errno: %d", errno);
00734     }
00735     if (!acceptable) {
00736       status = fprintf (fp, UNDERLINE_OFF);
00737       if (status < 0)
00738         WRITEFAILED.error ("write_cooked_text", EXIT,
00739           "Underline off Errno: %d", errno);
00740     }
00741   }
00742   if (word->flag (W_EOL)) {
00743     status = fprintf (fp, "\n");
00744     newaline = 1;
00745     if (status < 0)
00746       WRITEFAILED.error ("write_cooked_text", EXIT,
00747         "Newline Errno: %d", errno);
00748   }
00749   status = fflush (fp);
00750   if (status != 0)
00751     WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
00752 }

void write_map ( FILE *  mapfile,
WERD_RES word 
)

Generates a map file, approximately a 'cmp -l' of the .txt file and the .etx file.

Parameters:
mapfile mapfile to write to
word Word
Write a map file of 0s and 1s which associates characters from the .txt file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char is kept. Note that there may be reject regions in the .etx file WITHOUT .txt chars being rejected. The map file should be the same length, and have the same number of lines, as the .txt file.

The parameterised input is there because I thought I might be able to generate multiple map files in a single run. However, it didn't work because newdiff needs etx files!

Definition at line 891 of file output.cpp.

References ASSERT_HOST, WERD_RES::best_choice, ERRCODE::error(), EXIT, WERD::flag(), REJMAP::length(), WERD_RES::reject_map, WERD_RES::reject_spaces, WERD::space(), STRING::string(), W_EOL, W_REP_CHAR, WERD_RES::word, and WRITEFAILED.

Referenced by write_results().

00893                                {
00894   INT16 index;
00895   int status;
00896   STRING mapstr = "";
00897 
00898   if (word->best_choice->string ().length () > 0) {
00899     for (index = 0; index < word->word->space (); index++) {
00900       if (word->reject_spaces &&
00901         (suspect_level >= suspect_space_level) &&
00902         !tessedit_minimal_rejection && !tessedit_zero_rejection)
00903         /*
00904        Write rejected spaces to .map file ONLY. Newdiff converts
00905        these back to accepted spaces AFTER generating basic space
00906        stats but BEFORE using .etx
00907        */
00908         status = fprintf (mapfile, "0");
00909       else
00910         status = fprintf (mapfile, "1");
00911       if (status < 0)
00912         WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
00913     }
00914 
00915     if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
00916       for (index = 0; index < 5; index++)
00917         mapstr += '1';
00918     }
00919     else {
00920       ASSERT_HOST (word->reject_map.length () ==
00921         word->best_choice->string ().length ());
00922 
00923       for (index = 0; index < word->reject_map.length (); index++) {
00924         if (word->reject_map[index].accepted ())
00925           mapstr += '1';
00926         else
00927           mapstr += '0';
00928       }
00929     }
00930     status = fprintf (mapfile, "%s", mapstr.string ());
00931     if (status < 0)
00932       WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
00933   }
00934   if (word->word->flag (W_EOL)) {
00935     status = fprintf (mapfile, "\n");
00936     if (status < 0)
00937       WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
00938   }
00939   status = fflush (mapfile);
00940   if (status != 0)
00941     WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
00942 }

void write_results ( PAGE_RES_IT &  page_res_it,
char  newline_type,
BOOL8  force_eol,
BOOL8  write_to_shm 
)

All recognition and rejection has now been done so write out word.

Parameters:
page_res_it full info
newline_type type of newline
force_eol 0 or 1, override tilde crunch?
write_to_shm 0 or 1, send to api
Returns:
none
Generate the following:

Definition at line 214 of file output.cpp.

References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), BLOCK_RES::block, PAGE_RES_IT::block(), character_count, check_debug_pt(), CR_DELETE, CR_KEEP_SPACE, CR_NONE, CTRL_INSET, dict_word(), ensure_rep_chars_are_consistent(), WERD_RES::ep_choice, FALSE, WERD::flag(), get_rep_char(), REJMAP::length(), make_epaper_choice(), NO_BLOCK, NO_PERM, WERD_RES::outword, rawfile, WERD_RES::reject_map, REJMAP::remove_pos(), PAGE_RES_IT::row(), set_unlv_suspects(), WERD::space(), STRING::string(), textfile, tprintf(), TRUE, txt_mapfile, WERD_RES::unlv_crunch_mode, unlv_file, W_BOL, W_EOL, W_FUZZY_NON, W_FUZZY_SP, W_REP_CHAR, WERD_RES::word, PAGE_RES_IT::word(), word_count, write_cooked_text(), write_map(), write_shm_text(), and write_unlv_text().

Referenced by output_pass().

/*
 * write_results (body listing): writes one word to every enabled output
 * and stores its epaper choice in word->ep_choice.
 *
 *   page_res_it   supplies the word, and the block and row handed to
 *                 write_shm_text
 *   newline_type  copied into ep_chars at a line end; when non-zero the
 *                 normal path also records that a newline was last written
 *   force_eol     forces a newline for tilde-crunched words and marks the
 *                 block as empty again
 *   write_to_shm  when set, the word is also passed to write_shm_text
 *
 * The four static flags carry state from one call to the next, so the
 * result of a call depends on the calls made before it.
 *
 * Change relative to the original listing: the in-place "shuffle up" of
 * the word string used strcpy on overlapping regions, which is undefined
 * behaviour; it now uses memmove.
 */
00219                     {
00220   WERD_RES *word = page_res_it.word (); //word to do
00221   WERD_CHOICE *ep_choice;        //ep format
00222   STRING repetition_code;        //"|^~R" + rep char, for repeated-char words
00223   const STRING *wordstr;         //text that is written: best choice or repetition_code
00224   const char *text;              //raw chars of *wordstr
00225   int i;
00226   char unrecognised = STRING (unrecognised_char)[0];  //char written in place of crunched text
00227   char ep_chars[32];             //Only for unlv_tilde_crunch (at most 8 chars + NUL are stored)
00228   int ep_chars_index = 0;
00229   char txt_chs[32];              //Only for unlv_tilde_crunch (at most 3 chars + NUL are stored)
00230   char map_chs[32];              //Only for unlv_tilde_crunch (filled in step with txt_chs)
00231   int txt_index = 0;             //shared index into txt_chs and map_chs
00232   static BOOL8 tilde_crunch_written = FALSE;  //a reject char has been written since the last newline
00233   static BOOL8 last_char_was_newline = TRUE;
00234   static BOOL8 last_char_was_tilde = FALSE;
00235   static BOOL8 empty_block = TRUE;
00236   BOOL8 need_reject = FALSE;     //this crunched word wants a reject char
00237   char *ptr;                     //string ptr
00238   PBLOB_IT blob_it;              //blobs
00239 
00240   /*
00241    if (word->best_choice->string().length() == 0)
00242     {
00243       tprintf("No output: to output\n");
00244     }
00245     else if (word->best_choice->string()[0]==' ')
00246     {
00247       tprintf("spaceword to output\n");
00248     }
00249     else if (word->best_choice->string()[0]== '\0' )
00250     {
00251       tprintf("null to output\n");
00252     }*/
00253   if (word->unlv_crunch_mode != CR_NONE
00254   && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
00255     if ((word->unlv_crunch_mode != CR_DELETE) &&
00256       (!tilde_crunch_written ||
00257       ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
00258       (word->word->space () > 0) &&
00259       !word->word->flag (W_FUZZY_NON) &&
00260     !word->word->flag (W_FUZZY_SP)))) {
00261       if (!word->word->flag (W_BOL) &&
00262         (word->word->space () > 0) &&
00263         !word->word->flag (W_FUZZY_NON) &&
00264       !word->word->flag (W_FUZZY_SP)) {
00265         /* Write a space to separate from preceding good text */
00266         txt_chs[txt_index] = ' ';
00267         map_chs[txt_index++] = '1';  //the separating space is always marked accepted
00268         ep_chars[ep_chars_index++] = ' ';
00269         last_char_was_tilde = FALSE;
00270       }
00271       need_reject = TRUE;
00272     }
00273     if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
00274       /* Write a reject char - mark as rejected unless zero_rejection mode */
00275       last_char_was_tilde = TRUE;
00276       txt_chs[txt_index] = unrecognised;
00277       if (tessedit_zero_rejection || (suspect_level == 0)) {
00278         map_chs[txt_index++] = '1';
00279         ep_chars[ep_chars_index++] = unrecognised;
00280       }
00281       else {
00282         map_chs[txt_index++] = '0';
00283         /* The ep_choice string is a faked reject to allow newdiff to
00284          sync the .etx with the .txt and .map files.
00285          */
00286         ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
00287         ep_chars[ep_chars_index++] = 1; // dummy reject
00288         ep_chars[ep_chars_index++] = 1; // dummy reject
00289         ep_chars[ep_chars_index++] = 2; // type
00290         ep_chars[ep_chars_index++] = 1; // dummy reject
00291         ep_chars[ep_chars_index++] = 1; // dummy reject
00292       }
00293       tilde_crunch_written = TRUE;
00294       last_char_was_newline = FALSE;
00295       empty_block = FALSE;
00296     }
00297 
00298     if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
00299       /* Add a new line output */
00300       txt_chs[txt_index] = '\n';
00301       map_chs[txt_index++] = '\n';
00302       ep_chars[ep_chars_index++] = newline_type; // end line
00303 
00304       tilde_crunch_written = FALSE; // Coordinates of the real newline
00305       last_char_was_newline = TRUE;
00306       last_char_was_tilde = FALSE;
00307     }
00308     txt_chs[txt_index] = '\0';
00309     map_chs[txt_index] = '\0';
00310     if (tessedit_write_output && !NO_BLOCK) // xiaofan ?
00311       fprintf (textfile, "%s", txt_chs);
00312 
00313     if (tessedit_write_unlv)
00314       fprintf (unlv_file, "%s", txt_chs);
00315 
00316     if (tessedit_write_txt_map)
00317       fprintf (txt_mapfile, "%s", map_chs);
00318 
00319     ep_chars[ep_chars_index] = '\0'; // terminate string
00320     word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);
00321 
00322     if (force_eol)
00323       empty_block = TRUE;
00324     return;                      //crunched words never reach the normal path below
00325   }
00326 
00327   /* NORMAL PROCESSING of non tilde crunched words */
00328 
00329   tilde_crunch_written = FALSE;
00330   if (newline_type)
00331     last_char_was_newline = TRUE;
00332   else
00333     last_char_was_newline = FALSE;
00334   empty_block = force_eol;       // About to write a real word
00335 
00336   if (unlv_tilde_crunching &&
00337     last_char_was_tilde &&
00338     (word->word->space () == 0) &&
00339     !(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) &&
00340   (word->best_choice->string ()[0] == ' ')) {
00341     /* Prevent adjacent tilde across words - we know that adjacent
00342       tildes within words have been removed */
00343     ptr = (char *) word->best_choice->string ().string ();  //edits the word string in place
00344     memmove (ptr, ptr + 1, strlen (ptr));  //shuffle up incl. the NUL; regions overlap, so strcpy is not allowed
00345     word->reject_map.remove_pos (0);  //keep the reject map in step with the text
00346     blob_it = word->outword->blob_list ();
00347     delete blob_it.extract ();   // get rid of reject blob
00348   }
00349   if (newline_type ||
00350     (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
00351     last_char_was_tilde = FALSE;
00352   else {
00353     if (word->reject_map.length () > 0) {
00354       if (word->best_choice->string ()[word->reject_map.length () - 1] ==
00355         ' ')                     //a trailing ' ' in the text is what sets the tilde flag
00356         last_char_was_tilde = TRUE;
00357       else
00358         last_char_was_tilde = FALSE;
00359     }
00360     else if (word->word->space () > 0)
00361       last_char_was_tilde = FALSE;
00362     /* else it is unchanged as there are no output chars */
00363   }
00364 
00365   ptr = (char *) word->best_choice->string ().string ();
00366   ASSERT_HOST (strlen (ptr) == word->reject_map.length ());  //text and reject map must agree in length
00367 
00368   if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
00369     ensure_rep_chars_are_consistent(word); 
00370 
00371   set_unlv_suspects(word); 
00372   check_debug_pt (word, 120);
00373   if (tessedit_rejection_debug) {
00374     tprintf ("Dict word: \"%s\": %d\n",
00375       word->best_choice->string ().string (),
00376       dict_word (word->best_choice->string ().string ()));
00377   }
00378 
00379   if (tessedit_write_unlv) {
00380     write_unlv_text(word); 
00381   }
00382 
00383   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
00384     repetition_code = "|^~R";
00385     repetition_code += get_rep_char (word);
00386     wordstr = &repetition_code;  //the code replaces the word text in the outputs below
00387   }
00388   else {
00389     wordstr = &(word->best_choice->string ());
00390     if (tessedit_zero_rejection) {
00391       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00392       text = wordstr->string ();
00393       for (i = 0; text[i] != '\0'; i++) {
00394         if (word->reject_map[i].rejected ())
00395           word->reject_map[i].setrej_minimal_rej_accept ();
00396       }
00397     }
00398     if (tessedit_minimal_rejection) {
00399       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00400       text = wordstr->string ();
00401       for (i = 0; text[i] != '\0'; i++) {
00402         if ((text[i] != ' ') && word->reject_map[i].rejected ())  //unlike zero rejection, ' ' chars stay rejected
00403           word->reject_map[i].setrej_minimal_rej_accept ();
00404       }
00405     }
00406   }
00407 
00408   if (write_to_shm)
00409     write_shm_text (word, page_res_it.block ()->block,
00410       page_res_it.row (), *wordstr);
00411 
00412   if (tessedit_write_output)
00413     write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
00414 
00415   if (tessedit_write_raw_output)
00416     write_cooked_text (word->word, word->raw_choice->string (),
00417       TRUE, FALSE, rawfile);
00418 
00419   if (tessedit_write_txt_map)
00420     write_map(txt_mapfile, word); 
00421 
00422   ep_choice = make_epaper_choice (word, newline_type);
00423   word->ep_choice = ep_choice;
00424 
00425   character_count += word->best_choice->string ().length ();  //counts the best choice, not the repetition code
00426   word_count++;
00427 }

void write_shm_text ( WERD_RES *  word,
BLOCK *  block,
ROW_RES *  row,
const STRING &  text 
)

Write the cooked text to the shared memory for the api.

Definition at line 758 of file output.cpp.

References ROW::ascenders(), WERD::baseline_denormalise(), WERD_RES::best_choice, WERD::blob_list(), ROW_RES::bold, WERD_RES::bold, BOX::bottom(), WERD::bounding_box(), PBLOB::bounding_box(), WERD_RES::denorm, ROW::descenders(), EUC_BOLD, EUC_ITALIC, WERD::flag(), ROW_RES::font1, WERD_RES::font1, ROW_RES::font1_count, WERD_RES::font1_count, IMAGE::get_ysize(), ROW_RES::italic, WERD_RES::italic, BOX::left(), STRING::length(), ocr_append_char(), OCR_CDIR_LEFT_RIGHT, ocr_char_space(), OCR_LDIR_DOWN_RIGHT, OCR_NL_NEWLINE, OCR_NL_NONE, ocr_send_text(), OKAY, WERD_RES::outword, page_image, pixels_to_pts(), WERD_RES::reject_map, BOX::right(), ROW_RES::row, WERD::space(), BOX::top(), TRUE, W_BOL, W_DONT_CHOP, W_EOL, WERD_RES::word, and ROW::x_height().

Referenced by write_results().

/*
 * write_shm_text (body listing): passes the text of one word to the API,
 * one ocr_append_char call per character, each with a bounding box, font,
 * rating, point size, leading blanks, enhancement and newline marker.
 * In word-for-word mode a word with empty text is sent as a single
 * unrecognised character using the bounding box of the whole word.
 * The block argument is not referenced anywhere in this body.
 */
00763                      {
00764   INT32 index;                   //char counter
00765   INT32 index2;                  //char counter (look-ahead over trailing spaces)
00766   INT32 length;                  //chars in word
00767   INT32 ptsize;                  //font size
00768   INT8 blanks;                   //blanks in word
00769   UINT8 enhancement;             //bold etc
00770   UINT8 font;                    //font index
00771   char unrecognised = STRING (unrecognised_char)[0];  //sent in place of ' ' chars
00772   PBLOB *blob;
00773   BOX blob_box;                  //bounding box
00774   PBLOB_IT blob_it;              //blob iterator
00775   WERD copy_outword;             // copy to denorm
00776   UINT32 rating;                 //of char
00777   BOOL8 lineend;                 //end of line
00778 
00779                                  //point size from x-height + ascenders - descenders; 300 is presumably the resolution in dpi -- TODO confirm
00780   ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
00781   if (word->word->flag (W_BOL) && ocr_char_space () < 128
00782     && ocr_send_text (TRUE) != OKAY)
00783     return;                      //release failed
00784   copy_outword = *(word->outword);  //work on a copy so outword itself is not denormalised
00785   copy_outword.baseline_denormalise (&word->denorm);
00786   blob_it.set_to_list (copy_outword.blob_list ());
00787   length = text.length ();
00788 
00789   if (length > 0) {
00790     blanks = word->word->space ();
00791     if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
00792       blanks = 1;                //word-for-word mode forces a blank before words that are not at a line start
00793     for (index = 0; index < length; index++, blob_it.forward ()) {  //assumes one blob per char of text -- TODO confirm
00794       blob = blob_it.data ();
00795       blob_box = blob->bounding_box ();
00796 
00797       enhancement = 0;
00798       if (word->italic > 0 || word->italic == 0 && row->italic > 0)  //row value is used only when the word value is 0
00799         enhancement |= EUC_ITALIC;
00800       if (word->bold > 0 || word->bold == 0 && row->bold > 0)
00801         enhancement |= EUC_BOLD;
00802       if (tessedit_write_ratings)
00803         rating = (UINT32) (-word->best_choice->certainty () / 0.035);  //same value for every char of the word
00804       else if (tessedit_zero_rejection)
00805         rating = text[index] == ' ' ? 100 : 0;
00806       else
00807         rating = word->reject_map[index].accepted ()? 0 : 100;
00808       if (rating > 255)
00809         rating = 255;            //keep within the UINT8 passed to ocr_append_char
00810       if (word->font1_count > 2)
00811         font = word->font1;
00812       else if (row->font1_count > 8)
00813         font = row->font1;
00814       else
00815                                  //font index
00816         font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
00817 
00818       lineend = word->word->flag (W_EOL) && index == length - 1;
00819       if (word->word->flag (W_EOL) && tessedit_zero_rejection
00820       && index < length - 1 && text[index + 1] == ' ') {
00821         for (index2 = index + 1; index2 < length && text[index2] == ' ';
00822           index2++);             //skip the run of ' ' chars that follows
00823         if (index2 == length)
00824           lineend = TRUE;        //only skipped ' ' chars remain, so this char ends the line
00825       }
00826 
00827       if (!tessedit_zero_rejection || text[index] != ' '
00828       || tessedit_word_for_word) {
00829                                  //confidence
00830         ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating,
00831           ptsize,                //point size
00832           blanks, enhancement,   //enhancement
00833           OCR_CDIR_LEFT_RIGHT,
00834           OCR_LDIR_DOWN_RIGHT,
00835           lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
00836         blanks = 0;              //blanks are sent only with the first appended char
00837       }
00838 
00839     }
00840   }
00841   else if (tessedit_word_for_word) {  //empty text: still send one placeholder char for the word
00842     blanks = word->word->space ();
00843     if (blanks == 0 && !word->word->flag (W_BOL))
00844       blanks = 1;
00845     blob_box = word->word->bounding_box ();
00846 
00847     enhancement = 0;
00848     if (word->italic > 0)
00849       enhancement |= EUC_ITALIC;
00850     if (word->bold > 0)
00851       enhancement |= EUC_BOLD;
00852     rating = 100;
00853     if (word->font1_count > 2)
00854       font = word->font1;
00855     else if (row->font1_count > 8)
00856       font = row->font1;
00857     else
00858                                  //font index
00859       font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
00860 
00861     lineend = word->word->flag (W_EOL);
00862 
00863                                  //font index
00864     ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font,
00865       rating,                    //confidence
00866       ptsize,                    //point size
00867       blanks, enhancement,       //enhancement
00868       OCR_CDIR_LEFT_RIGHT,
00869       OCR_LDIR_DOWN_RIGHT,
00870       lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
00871   }
00872 }

void write_unlv_text ( WERD_RES *  word  ) 

Write text to .unlv file.

Parameters:
word Word

Definition at line 971 of file output.cpp.

References WERD_RES::best_choice, ERRCODE::error(), EXIT, WERD::flag(), WERD_RES::reject_map, WERD_RES::reject_spaces, WERD::space(), unlv_file, W_EOL, WERD_RES::word, and WRITEFAILED.

Referenced by write_results().

/*
 * write_unlv_text (body listing): writes one word to unlv_file.
 *
 *   word  the word to write; its best_choice text, reject_map, space
 *         count, reject_spaces and W_EOL flag are read
 *
 * ' ', '~', '^' and '|' in the text are replaced by the unrecognised
 * char; any other char whose reject_map entry is rejected is preceded by
 * a '^' suspect marker. Spaces before the word are written as " " or as
 * "^ " when they are suspect. Every failed write is reported through
 * WRITEFAILED.error with EXIT.
 *
 * Change relative to the original listing: the output was assembled in a
 * fixed char[512] with no bounds check although each input char can
 * produce two output chars, so a long word overran the buffer. It is now
 * assembled in a STRING, which grows as needed.
 */
00971                                      { 
00972   const char *wordstr;
00973 
00974   STRING buff = "";              //string to output
00975   int i = 0;
00976                                  //char written in place of unreadable text
00977   char unrecognised = STRING (unrecognised_char)[0];
00978   int status;                    //result of the most recent fprintf/fflush
00979   char space_str[3];             //" " or "^ " plus terminator
00980 
00981   wordstr = word->best_choice->string ().string ();
00982 
00983   /*
00984   DONT need to do anything special for repeated char words - at
00985   this stage the repetition char has been identified and any other
00986   chars have been rejected.
00987   */
00988 
00989   for (; wordstr[i] != '\0'; i++) {
00990     if ((wordstr[i] == ' ') ||
00991       (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
00992       buff += unrecognised;      //these chars are never written as themselves
00993     else {
00994       if (word->reject_map[i].rejected ())
00995         buff += '^';             //Add suspect marker
00996       buff += wordstr[i];
00997     }
00998   }
00999   /* buff is a STRING: no manual terminator and no length limit */
01000 
01001   if (strlen (wordstr) > 0) {
01002     if (word->reject_spaces &&
01003       (suspect_level >= suspect_space_level) &&
01004       !tessedit_minimal_rejection && !tessedit_zero_rejection)
01005       strcpy (space_str, "^ ");  //Suspect space
01006     else
01007       strcpy (space_str, " ");   //Certain space
01008 
01009     for (i = 0; i < word->word->space (); i++) {  //one separator per space before the word
01010       status = fprintf (unlv_file, "%s", space_str);
01011       if (status < 0)
01012         WRITEFAILED.error ("write_unlv_text", EXIT,
01013           "Space Errno: %d", errno);
01014     }
01015 
01016     status = fprintf (unlv_file, "%s", buff.string ());
01017     if (status < 0)
01018       WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
01019   }
01020   if (word->word->flag (W_EOL)) {  //newline is written even when the word text is empty
01021     status = fprintf (unlv_file, "\n");
01022     if (status < 0)
01023       WRITEFAILED.error ("write_unlv_text", EXIT,
01024         "Newline Errno: %d", errno);
01025   }
01026   status = fflush (unlv_file);   //flushed on every call
01027   if (status != 0)
01028     WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
01029 }


Generated on Wed Feb 28 19:49:15 2007 for Tesseract by  doxygen 1.5.1