dict/permdawg.cpp File Reference

#include "permdawg.h"
#include "debug.h"
#include "hyphen.h"
#include "permute.h"
#include "tordvars.h"
#include "context.h"
#include "stopper.h"
#include "freelist.h"
#include "globals.h"
#include "dawg.h"
#include <ctype.h>

Go to the source code of this file.

Defines

Functions

Variables


Define Documentation

#define FREQ_WERD   1.0

Note:
File: permdawg.cpp (Formerly permdawg.c)
Author:
Mark Seaman, OCR Technology
Date:
Oct 16 14:37:00 1987; Jul 9 15:43:18 1991 (Mark Seaman) marks
(c) Copyright 1987, Hewlett-Packard Company.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Definition at line 37 of file permdawg.cpp.

#define GOOD_WERD   1.1

Definition at line 38 of file permdawg.cpp.

#define MAX_FREQ_EDGES   1000

Definition at line 40 of file permdawg.cpp.

Referenced by init_permdawg().

#define NO_RATING   -1

Definition at line 41 of file permdawg.cpp.

Referenced by append_next_choice(), and dawg_permute_and_select().

#define OK_WERD   1.25

Definition at line 39 of file permdawg.cpp.


Function Documentation

void adjust_word ( A_CHOICE *  best_choice,
float *  certainty_array 
)

Assign an adjusted value to a string that is a word.

Parameters:
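best_choice Word choice to adjust; its rating (class_probability) is scaled in place by freq_word, good_word or ok_word, and its permuter is set to FREQ_DAWG_PERM when the word is found in frequent_words
certainty_array Passed unchanged to LogNewWordChoice()
Note:
Global: adjust_debug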
Returns:
none
The value that this word choice has is based on case and punctuation rules.

Definition at line 77 of file permdawg.cpp.

References adjust_debug, case_ok(), class_permuter, class_probability, class_string, cprintf(), FREQ_DAWG_PERM, freq_word, frequent_words, good_word, LogNewWordChoice(), ok_word, punctuation_ok(), RATING_PAD, and word_in_dawg().

Referenced by append_next_choice().

00077                                                                 { 
00078   char *this_word;
00079   int punct_status;
00080   float adjust_factor;
00081 
00082   if (adjust_debug)
00083     cprintf ("%s %4.2f ",
00084       class_string (best_choice), class_probability (best_choice));
00085 
00086   this_word = class_string (best_choice);
00087   punct_status = punctuation_ok (this_word);
00088 
00089   class_probability (best_choice) += RATING_PAD;
00090   if (case_ok (this_word) && punct_status != -1) {
00091     if (punct_status < 1 && word_in_dawg (frequent_words, this_word)) {
00092       class_probability (best_choice) *= freq_word;
00093       class_permuter (best_choice) = FREQ_DAWG_PERM;
00094       adjust_factor = freq_word;
00095       if (adjust_debug)
00096         cprintf (", F, %4.2f ", freq_word);
00097     }
00098     else {
00099       class_probability (best_choice) *= good_word;
00100       adjust_factor = good_word;
00101       if (adjust_debug)
00102         cprintf (", %4.2f ", good_word);
00103     }
00104   }
00105   else {
00106     class_probability (best_choice) *= ok_word;
00107     adjust_factor = ok_word;
00108     if (adjust_debug) {
00109       if (!case_ok (this_word))
00110         cprintf (", C");
00111       if (punctuation_ok (this_word) == -1)
00112         cprintf (", P");
00113       cprintf (", %4.2f ", ok_word);
00114     }
00115   }
00116 
00117   class_probability (best_choice) -= RATING_PAD;
00118 
00119   LogNewWordChoice(best_choice, adjust_factor, certainty_array);
00120 
00121   if (adjust_debug)
00122     cprintf (" --> %4.2f\n", class_probability (best_choice));
00123 }

void append_next_choice ( EDGE_ARRAY  dawg,
NODE_REF  node,
char  permuter,
char *  word,
CHOICES_LIST  choices,
int  char_index,
A_CHOICE *  this_choice,
char  prevchar,
float *  limit,
float  rating,
float  certainty,
float *  rating_array,
float *  certainty_array,
int  word_ending,
int  last_word,
CHOICES *  result 
)

Check to see if next choice is worth appending to the string being generated; if so, keep going deeper into the word.

Parameters:
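dawg DAWG in which the word is looked up; passed to letter_is_okay() and dawg_permute()
node Current node in the DAWG; passed by address to letter_is_okay(), and by value to set_hyphen_word() and dawg_permute()
permuter Permuter code given to new_choice() for each choice created
word Buffer holding the word built so far; the new character is written at char_index and the string is terminated after it
choices Character choices for the word, forwarded to dawg_permute()
char_index Position in word of the character being added
this_choice Character choice to append; the first character of its class_string is used (a space is stored if that character is the terminator)
prevchar Character forwarded to letter_is_okay(); dawg_permute() supplies the first character of the previously tried alternative at this position, or '\0' for the first alternative
limit Set to the current rating each time a new word choice is created
rating Rating accumulated before this character; class_probability of this_choice is added to it
certainty Certainty so far; replaced by the minimum of itself and class_certainty of this_choice
rating_array Per-position ratings used for pruning; an entry equal to NO_RATING is filled with the current rating, otherwise the word is pruned when its rating exceeds the entry times rating_margin plus rating_pad
certainty_array Per-position certainties; the entry at char_index is set from this_choice
word_ending Nonzero if this character is at the last position of the word
last_word Forwarded to dawg_permute(); a trailing '-' is accepted as a hyphen choice only when this and word_ending are both nonzero and char_index > 0
result List that receives the new choices (pushed directly, or joined from the recursive dawg_permute() call)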
Note:
Global: dawg_debug
Returns:
none

Definition at line 149 of file permdawg.cpp.

References adjust_word(), class_certainty, class_probability, class_string, cprintf(), dawg_debug, dawg_permute(), hyphen_base_size, hyphen_tail, JOIN_ON, letter_is_okay(), min, new_choice(), NO_RATING, push_on, rating_margin, rating_pad, and set_hyphen_word().

Referenced by dawg_permute().

00165                                          {
00166   A_CHOICE *better_choice;
00167   /* Add new character */
00168   word[char_index] = class_string (this_choice)[0];
00169   word[char_index + 1] = 0;
00170   if (word[char_index] == 0)
00171     word[char_index] = ' ';
00172   certainty_array[char_index] = class_certainty (this_choice);
00173 
00174   rating += class_probability (this_choice);
00175   certainty = min (class_certainty (this_choice), certainty);
00176 
00177   if (rating_array[char_index] == NO_RATING) {
00178                                  /* Prune bad subwords */
00179     rating_array[char_index] = rating;
00180   }
00181   else {
00182     if (rating_array[char_index] * rating_margin + rating_pad < rating) {
00183       if (dawg_debug)
00184         cprintf ("early pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
00185           word, rating, *limit);
00186       return;
00187     }
00188   }
00189 
00190   /* Deal with hyphens */
00191   if (word_ending && last_word && word[char_index] == '-' && char_index > 0) {
00192     *limit = rating;
00193     if (dawg_debug)
00194       cprintf ("new hyphen choice = %s\n", word);
00195 
00196     better_choice = new_choice (word, rating, certainty, -1, permuter);
00197     adjust_word(better_choice, certainty_array);
00198     push_on(*result, better_choice);
00199     set_hyphen_word(word, rating, node);
00200   }
00201   /* Look up char in DAWG */
00202   else if (letter_is_okay (dawg, &node, char_index, prevchar,
00203   word, word_ending)) {
00204     /* Add a new word choice */
00205     if (word_ending) {
00206       if (dawg_debug == 1)
00207         cprintf ("new choice = %s\n", word);
00208       *limit = rating;
00209 
00210       better_choice = new_choice (hyphen_tail (word), rating, certainty,
00211         -1, permuter);
00212       adjust_word (better_choice, &certainty_array[hyphen_base_size ()]);
00213       push_on(*result, better_choice);
00214     }
00215     else {
00216                                  /* Search the next letter */
00217       JOIN_ON (*result,
00218         dawg_permute (dawg, node, permuter,
00219         choices, char_index + 1, limit,
00220         word, rating, certainty,
00221         rating_array, certainty_array, last_word));
00222     }
00223   }
00224 }

CHOICES dawg_permute ( EDGE_ARRAY  dawg,
NODE_REF  node,
char  permuter,
CHOICES_LIST  choices,
int  char_index,
float *  limit,
char *  word,
float  rating,
float  certainty,
float *  rating_array,
float *  certainty_array,
int  last_word 
)

Permute all the valid words that can be created with this starting point.

Note:
Global: dawg_debug
Returns:
List of the word choices generated from this starting point (NIL if none were found)
The node (in the DAWG) and the word string define a base from which to start adding the remaining character choices.

Definition at line 237 of file permdawg.cpp.

References append_next_choice(), array_count, array_index, best_string, cprintf(), dawg_debug, FALSE, first, hyphen_base_size, iterate_list, NIL, NULL, print_choices(), and TRUE.

Referenced by append_next_choice(), and dawg_permute_and_select().

00248                                     {
00249   CHOICES result = NIL;
00250   CHOICES c;
00251   char *prevchar;
00252   int word_ending = FALSE;
00253 
00254   if (dawg_debug) {
00255     cprintf ("dawg_permute (node=%d, char_index=%d, limit=%4.2f, ",
00256       node, char_index, *limit);
00257     cprintf ("word=%s, rating=%4.2f, certainty=%4.2f)\n",
00258       word, rating, certainty);
00259   }
00260   /* Check for End of Word (EOW) */
00261   if (1 + char_index == array_count (choices) + hyphen_base_size ())
00262     word_ending = TRUE;
00263 
00264   if (char_index < array_count (choices) + hyphen_base_size ()) {
00265     prevchar = NULL;
00266     iterate_list (c,
00267       (CHOICES) array_index (choices,
00268     char_index - hyphen_base_size ())) {
00269       append_next_choice (dawg, node, permuter, word, choices, char_index,
00270         (A_CHOICE *) first (c),
00271         prevchar != NULL ? *prevchar : '\0', limit,
00272         rating, certainty, rating_array, certainty_array,
00273         word_ending, last_word, &result);
00274       prevchar = best_string (c);
00275     }
00276   }
00277 
00278   if (result && (dawg_debug == 1))
00279     print_choices ("dawg_permute", result);
00280   return (result);
00281 }

void dawg_permute_and_select ( const char *  string,
EDGE_ARRAY  dawg,
char  permuter,
CHOICES_LIST  character_choices,
A_CHOICE *  best_choice,
INT16  system_words 
)

Use dawg to match word better.

Parameters:
string Label passed to print_choices() when display_ratings is set: 'system words:', 'document_words', or 'user words'
dawg For dawg_permute()
permuter SYSTEM_DAWG_PERM, DOC_DAWG_PERM, or USER_DAWG_PERM
character_choices LIST of choices for characters in word
best_choice Best choice (modified)
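system_words TRUE if referencing 'tessdata/word-dawg'; when it is set, hyphen_string is present and is_last_word() is false, the search starts at hyphen_state instead of node 0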
Returns:
none
Use a DAWG type data structure to enumerate all the valid strings in some grammar; compare each of the choices against the best choice so far and update the best choice if needed.

If you enable display_ratings, tesseract will show you what combinations it considered to arrive at any particular word it recognizes. Very useful!

Definition at line 302 of file permdawg.cpp.

References best_probability, class_probability, clone_choice, cprintf(), dawg_permute(), display_ratings, first, free_choice(), good_word, hyphen_state, hyphen_string, is_last_word, MAX_WERD_LENGTH, NIL, NO_RATING, ok_word, pop_off, print_choices(), and rating_margin.

Referenced by permute_words().

00307                                                  {
00308   CHOICES result = NIL;
00309   char word[MAX_WERD_LENGTH + 1];
00310   float certainty_array[MAX_WERD_LENGTH + 1];
00311   float rating_array[MAX_WERD_LENGTH + 1];
00312   float rating;
00313   int char_index;
00314   NODE_REF dawg_node = 0;
00315 
00316                                  /* Pruning margin ratio */
00317   rating_margin = ok_word / good_word;
00318 
00319   word[0] = '\0';
00320   rating = class_probability (best_choice);
00321 
00322   for (char_index = 0; char_index < MAX_WERD_LENGTH + 1; char_index++)
00323     rating_array[char_index] = NO_RATING;
00324   char_index = 0;
00325 
00326   if (!is_last_word () && hyphen_string) {
00327     strcpy(word, hyphen_string);
00328     char_index = strlen (hyphen_string);
00329     if (system_words)
00330       dawg_node = hyphen_state;
00331   }
00332   result = dawg_permute (dawg, dawg_node, permuter, character_choices,
00333     char_index, &rating, word, 0.0, 0.0,
00334     rating_array, certainty_array, is_last_word ());
00335 
00336 #ifdef TEXT_VERBOSE
00337   // gets a 'p', see ccmain/tesseractmain.dox
00338   cprintf("p");
00339 #endif
00340   if (display_ratings && result)
00341     print_choices(string, result);
00342 
00343   while (result != NIL) {
00344     if (best_probability (result) < class_probability (best_choice)) {
00345       clone_choice (best_choice, first (result));
00346     }
00347     free_choice (first (result));
00348     pop_off(result);
00349   }
00350 }

void end_permdawg (  ) 

Free up memory taken by DAWG.

Definition at line 372 of file permdawg.cpp.

References frequent_words, memfree(), and NULL.

Referenced by program_editdown().

00372                     {
00373   memfree(frequent_words);
00374   frequent_words = NULL;
00375 }

void init_permdawg (  ) 

Initialize the variables needed.

Definition at line 355 of file permdawg.cpp.

References demodir, frequent_words, MAX_FREQ_EDGES, memalloc(), and read_squished_dawg().

Referenced by init_permute().

00355                      { 
00356   char name[1024];
00357   make_dawg_debug();
00358   make_ok_word();
00359   make_good_word();
00360   make_freq_word();
00361 
00362   frequent_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) *
00363     MAX_FREQ_EDGES);
00364   strcpy(name, demodir);
00365   strcat (name, "tessdata/freq-dawg");
00366   read_squished_dawg(name, frequent_words, MAX_FREQ_EDGES);
00367 }

int test_freq_words ( const char *  word  ) 

Tests a word against the frequent word dawg.

Definition at line 380 of file permdawg.cpp.

References frequent_words, and word_in_dawg().

Referenced by dict_word().

00380                                       { 
00381   return (word_in_dawg (frequent_words, word));
00382 }


Variable Documentation

EDGE_ARRAY frequent_words [static]

Definition at line 46 of file permdawg.cpp.

Referenced by adjust_word(), end_permdawg(), init_permdawg(), and test_freq_words().

float rating_margin [static]

Definition at line 47 of file permdawg.cpp.

Referenced by append_next_choice(), and dawg_permute_and_select().

float rating_pad = 5.0 [static]

Definition at line 48 of file permdawg.cpp.

Referenced by append_next_choice().


Generated on Wed Feb 28 19:49:22 2007 for Tesseract by  doxygen 1.5.1