wordrec/wordclass.cpp File Reference

#include <stdio.h>
#include "wordclass.h"
#include "fxid.h"
#include "tordvars.h"
#include "associate.h"
#include "render.h"
#include "metrics.h"
#include "matchtab.h"
#include "permute.h"
#include "context.h"
#include "badwords.h"
#include "callcpp.h"

Go to the source code of this file.

Defines

Functions

Variables


Define Documentation

#define BOLD_OFF   "&d@(s0B"

Definition at line 54 of file wordclass.cpp.

#define BOLD_ON   "&dB(s3B"

These strings look like device-specific printer control sequences, so they are unlikely to be portable.

Definition at line 53 of file wordclass.cpp.

#define UNDERLINE_OFF   "&d@"

Definition at line 56 of file wordclass.cpp.

#define UNDERLINE_ON   "&dD"

Definition at line 55 of file wordclass.cpp.


Function Documentation

LIST call_matcher ( TBLOB *  ptblob,
TBLOB *  tessblob,
TBLOB *  ntblob,
void *  ,
TEXTROW  
)

Call a matcher.

Parameters:
ptblob previous blob (may be NULL)
tessblob blob to match
ntblob next blob (may be NULL)
Returns:
list of choices/results.
Called from Tess with a blob in tess form. Converts the blob (and its neighbours, when present) to editor form, calls the matcher that the segmenter set up in tess_matcher, and converts the output choices back to tess form. Returns NULL if the blob cannot be converted.

Definition at line 350 of file tfacepp.cpp.

References append_choice(), BLOB_CHOICE::certainty(), BLOB_CHOICE::char_class(), BLOB_CHOICE::config(), make_ed_blob(), NULL, BLOB_CHOICE::rating(), ratings, tess_denorm, and tess_word.

00356                    {
00357   PBLOB *pblob;                  //converted blob
00358   PBLOB *blob;                   //converted blob
00359   PBLOB *nblob;                  //converted blob
00360   LIST result;                   //tess output
00361   BLOB_CHOICE *choice;           //current choice
00362   char string[2];                //char converted
00363   BLOB_CHOICE_LIST ratings;      //matcher result
00364   BLOB_CHOICE_IT it;             //iterator
00365 
00366   blob = make_ed_blob (tessblob);//convert blob
00367   if (blob == NULL)
00368     return NULL;                 //can't do it
00369   pblob = ptblob != NULL ? make_ed_blob (ptblob) : NULL;
00370   nblob = ntblob != NULL ? make_ed_blob (ntblob) : NULL;
00371   (*tess_matcher) (pblob, blob, nblob, tess_word, tess_denorm, ratings);
00372   //match it
00373   delete blob;                   //don't need that now
00374   if (pblob != NULL)
00375     delete pblob;
00376   if (nblob != NULL)
00377     delete nblob;
00378   it.set_to_list (&ratings);     //get list
00379   result = NULL;
00380   string[1] = '\0';
00381   for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
00382     choice = it.data ();
00383     string[0] = choice->char_class ();
00384     result = append_choice (result, string,
00385       choice->rating (), choice->certainty (),
00386       choice->config ());
00387   }
00388   return result;                 //converted list
00389 }

CHOICES classify_blob ( TBLOB *  pblob,
TBLOB *  blob,
TBLOB *  nblob,
TEXTROW row,
int  fx,
const char *  string,
C_COL  color,
STATE *  this_state,
STATE *  best_state,
INT32  pass,
INT32  blob_index 
)

Classify blob if not already recorded in the match table.

Parameters:
pblob Pointer to previous TBLOB
blob Pointer to TBLOB in question
nblob Pointer to next TBLOB
row Row of this blob
fx IGNORED, number of feature extractor to use, from cc_recog()
string 'rebuild', 'improve 1:', 'improve 2:', 'chop_word:', or 'pieces:'
color 'Orange', 'Red', 'Yellow', 'Green', or 'White' respectively
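this_state State passed as the second argument to compare_states() when pass is nonzero (FIX: exact meaning still not sure)
best_state State passed as the first argument to compare_states() when pass is nonzero (FIX: exact meaning still not sure)
pass 1 or 2, how hard to try
blob_index Starting index; compare_states() updates the local copy, which is then used to pick blob_answer from word_answer[blob_index]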
Note:
Globals:
  • blob_type 1, 2, 4, or 5, used by ClassPruner()
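  • blob_skip If set, the function returns NIL immediately without classifying the blob
  • display_all_blobs If set (and graphics are not disabled), the blob is drawn with display_blob() in the given color
  • display_ratings Debugging: if set and string is non-NULL (and graphics are not disabled), the choices are printed with print_choices()
  • blob_pause If set (and graphics are not disabled), the function waits on blob_window via window_wait() before returning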
Returns:
CHOICES
Attempt to recognize this blob as a character. The recognition rating (probability) for this blob will be stored as a part of the blob. This function is called multiple times, from different functions (see the "Referenced by" list below).

The rating will also be returned to the caller.

Definition at line 97 of file wordclass.cpp.

Referenced by chop_word_main(), classify_piece(), improve_one_blob(), and rebuild_current_state().

00107                                         {
00108   CHOICES rating;
00109   INT32 old_index;
00110 
00111   chars_classified++;            /* Global value */
00112   if (blob_skip)
00113     return (NIL);
00114 
00115 #ifndef GRAPHICS_DISABLED
00116   if (display_all_blobs)
00117     display_blob(blob, color); 
00118 #endif
00119   rating = get_match (blob);
00120   if (rating == NIL) {
00121     if (pass) {
00122       old_index = blob_index;
00123                                  //?cast to int*
00124       blob_type = compare_states (best_state, this_state, (int *) &blob_index);
00125       blob_answer = word_answer[blob_index];
00126       if (blob_answer < '!')
00127         fprintf (matcher_fp,
00128           "Bad compare states: best state=0x%x%x, this=0x%x%x, bits="
00129           INT32FORMAT ", index=" INT32FORMAT ", outdex="
00130           INT32FORMAT ", word=%s\n", best_state->part1,
00131           best_state->part2, this_state->part1, this_state->part2,
00132           bits_in_states, old_index, blob_index, word_answer);
00133     }
00134     else
00135       blob_type = 0;
00136     rating = /*(*blob_matchers [fx]) */ (CHOICES) call_matcher (pblob, blob,
00137       nblob, NULL,
00138       row);
00139     put_match(blob, rating); 
00140   }
00141 
00142 #ifndef GRAPHICS_DISABLED
00143   if (display_ratings && string)
00144     print_choices(string, rating); 
00145 
00146   if (blob_pause)
00147     window_wait(blob_window); 
00148 #endif
00149 
00150   return (rating);
00151 }

TBLOB* newblob (  ) 

Note:
File: wordclass.cpp (Formerly wordclass.c)
Word classifier
Author:
Mark Seaman, OCR Technology
Date:
Created: Tue Jan 30 14:03:25 1990. Modified: Fri Jul 12 16:03:06 1991 (Mark Seaman) marks
(c) Copyright 1990, Hewlett-Packard Company.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

void save_answer ( TWERD *  word,
TEXTROW *  row,
A_CHOICE *  best_choice,
A_CHOICE *  raw_choice,
int  firstpass 
)

Write an answer to the output file that is the raw guess (without context) directly from the classifier.

Not actually called from anywhere in tesseract 1.02...

Definition at line 209 of file wordclass.cpp.

00213                                 {
00214   static TEXTROW *last_row;
00215   char raw_answer[CHARS_PER_LINE];
00216   int answer_already;
00217   int good_answer;
00218   char *string = NULL;
00219 
00220   if (best_choice) {
00221     good_answer = AcceptableResult (best_choice, raw_choice);
00222     string = class_string (best_choice);
00223   }
00224   else {
00225     good_answer = FALSE;
00226   }
00227 
00228   if (firstpass) {
00229                                  /* First pass */
00230     if (string) {
00231                                  /* Got answer */
00232       add_document_word(best_choice); 
00233 
00234       word->guess = string;
00235       fix_quotes (word->guess);
00236       strcpy (raw_answer, word->guess);
00237 
00238       record_certainty (class_certainty (best_choice), 1);
00239 
00240       if (good_answer) {
00241         record_certainty (class_certainty (best_choice), 2);
00242         strcat (raw_answer, " ");
00243         strcat (raw_answer, class_string (raw_choice));
00244         word->guess = strsave (raw_answer);
00245         word->guess[strlen (string)] = 0;
00246         if (string) {
00247           strfree(string); 
00248           class_string (best_choice) = NULL;
00249         }
00250       }
00251       else {
00252                                  /* Not good enough */
00253         if (word->guess)
00254           strfree (word->guess);
00255         word->guess = NULL;
00256       }
00257     }
00258     else {
00259       word->guess = NULL;
00260       raw_answer[0] = '\0';
00261     }
00262   }
00263   else {
00264                                  /* Second pass */
00265     answer_already = (word->guess != NULL);
00266     if (answer_already) {
00267       write_text_files (word,
00268         &word->guess[strlen (word->guess) + 1],
00269         (row != last_row), TRUE, TRUE);
00270     }
00271     else {
00272                                  /* Required second pass */
00273       if (string) {
00274         if (!good_answer && tessedit_save_stats) {
00275           SaveBadWord (string, class_certainty (best_choice));
00276         }
00277         record_certainty (class_certainty (best_choice), 2);
00278         word->guess = class_string (best_choice);
00279         fix_quotes (word->guess);
00280         write_text_files (word, class_string (raw_choice),
00281           (row != last_row), good_answer, FALSE);
00282       }
00283     }
00284   }
00285   /* Word Display */
00286   if (display_text) {
00287     if (row != last_row)
00288       cprintf ("\n");
00289     if (word->guess && strlen (word->guess))
00290       cprintf ("%s ", word->guess);
00291     else
00292       cprintf ("%s ", raw_answer);
00293     fflush(stdout); 
00294   }
00295 
00296   last_row = row;
00297 }

void write_text_files ( TWERD *  word,
char *  raw_choice,
int  same_row,
int  good_word,
int  firstpass 
)

Write an answer to the output file that is the raw guess (without context) directly from the classifier.

Not actually called from anywhere in tesseract 1.02, because save_answer() (its only caller) is itself never called.

Definition at line 162 of file wordclass.cpp.

00166                                      {
00167   int x;
00168   /* Raw output */
00169   if (write_raw_output) {
00170     if (same_row)
00171       fprintf (rawfile, "\n");
00172     if (raw_choice && strlen (raw_choice)) {
00173       fprintf (rawfile, "%s ", raw_choice);
00174       fflush(rawfile); 
00175     }
00176   }
00177   /* Text file output */
00178   if (write_output) {
00179     if (same_row)
00180       fprintf (textfile, "\n");
00181     if (word->guess && strlen (word->guess)) {
00182       for (x = 0; x < word->blanks; x++)
00183         fprintf (textfile, " ");
00184       if (!firstpass)
00185         fprintf(textfile, BOLD_ON); 
00186       if (!good_word)
00187         fprintf(textfile, UNDERLINE_ON); 
00188       fprintf (textfile, "%s", word->guess);
00189       if (!good_word)
00190         fprintf(textfile, UNDERLINE_OFF); 
00191       if (!firstpass)
00192         fprintf(textfile, BOLD_OFF); 
00193       fflush(textfile); 
00194     }
00195   }
00196   /* Global counters */
00197   character_count += (word->guess ? strlen (word->guess) : 0);
00198   word_count++;
00199 }


Variable Documentation

INT16 first_pass

Flag: classifying with first pass heuristics

Definition at line 46 of file wordclass.cpp.

Referenced by attempt_blob_chop(), chop_word_main(), improve_by_chopping(), record_search_status(), set_pass1(), and set_pass2().


Generated on Wed Feb 28 19:49:29 2007 for Tesseract by  doxygen 1.5.1