#include "permdawg.h"
#include "debug.h"
#include "hyphen.h"
#include "permute.h"
#include "tordvars.h"
#include "context.h"
#include "stopper.h"
#include "freelist.h"
#include "globals.h"
#include "dawg.h"
#include <ctype.h>
Go to the source code of this file.
/*
 * (c) Copyright 1987, Hewlett-Packard Company.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Word-class rating constants.
 * NOTE(review): FREQ_WERD, GOOD_WERD and OK_WERD are not referenced by the
 * code shown on this page; presumably they are the defaults behind the
 * freq_word / good_word / ok_word variables -- confirm.
 */

/* Definition at line 37 of file permdawg.cpp. */
#define FREQ_WERD 1.0

/* Definition at line 38 of file permdawg.cpp. */
#define GOOD_WERD 1.1

/* Definition at line 39 of file permdawg.cpp. */
#define OK_WERD 1.25

/*
 * Number of EDGE_RECORD slots allocated for the frequent-word DAWG and
 * passed to read_squished_dawg() by init_permdawg().
 */
#define MAX_FREQ_EDGES 1000

/*
 * Sentinel stored in rating_array slots that have not been rated yet.
 * Definition at line 41 of file permdawg.cpp.
 * Referenced by append_next_choice(), and dawg_permute_and_select().
 * Parenthesised so the negative literal is safe inside larger expressions.
 */
#define NO_RATING (-1)
void adjust_word | ( | A_CHOICE * | best_choice, | |
float * | certainty_array | |||
) |
Assign an adjusted value to a string that is a word.
best_choice | ||
certainty_array |
Definition at line 77 of file permdawg.cpp.
References adjust_debug, case_ok(), class_permuter, class_probability, class_string, cprintf(), FREQ_DAWG_PERM, freq_word, frequent_words, good_word, LogNewWordChoice(), ok_word, punctuation_ok(), RATING_PAD, and word_in_dawg().
Referenced by append_next_choice().
00077 { 00078 char *this_word; 00079 int punct_status; 00080 float adjust_factor; 00081 00082 if (adjust_debug) 00083 cprintf ("%s %4.2f ", 00084 class_string (best_choice), class_probability (best_choice)); 00085 00086 this_word = class_string (best_choice); 00087 punct_status = punctuation_ok (this_word); 00088 00089 class_probability (best_choice) += RATING_PAD; 00090 if (case_ok (this_word) && punct_status != -1) { 00091 if (punct_status < 1 && word_in_dawg (frequent_words, this_word)) { 00092 class_probability (best_choice) *= freq_word; 00093 class_permuter (best_choice) = FREQ_DAWG_PERM; 00094 adjust_factor = freq_word; 00095 if (adjust_debug) 00096 cprintf (", F, %4.2f ", freq_word); 00097 } 00098 else { 00099 class_probability (best_choice) *= good_word; 00100 adjust_factor = good_word; 00101 if (adjust_debug) 00102 cprintf (", %4.2f ", good_word); 00103 } 00104 } 00105 else { 00106 class_probability (best_choice) *= ok_word; 00107 adjust_factor = ok_word; 00108 if (adjust_debug) { 00109 if (!case_ok (this_word)) 00110 cprintf (", C"); 00111 if (punctuation_ok (this_word) == -1) 00112 cprintf (", P"); 00113 cprintf (", %4.2f ", ok_word); 00114 } 00115 } 00116 00117 class_probability (best_choice) -= RATING_PAD; 00118 00119 LogNewWordChoice(best_choice, adjust_factor, certainty_array); 00120 00121 if (adjust_debug) 00122 cprintf (" --> %4.2f\n", class_probability (best_choice)); 00123 }
void append_next_choice | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node, | |||
char | permuter, | |||
char * | word, | |||
CHOICES_LIST | choices, | |||
int | char_index, | |||
A_CHOICE * | this_choice, | |||
char | prevchar, | |||
float * | limit, | |||
float | rating, | |||
float | certainty, | |||
float * | rating_array, | |||
float * | certainty_array, | |||
int | word_ending, | |||
int | last_word, | |||
CHOICES * | result | |||
) |
Check to see if next choice is worth appending to the string being generated; if so, keep going deeper into the word.
dawg | Previous option | |
node | FIX: | |
permuter | FIX: | |
word | FIX: | |
choices | FIX: | |
char_index | FIX: | |
this_choice | FIX: | |
prevchar | FIX: | |
limit | FIX: | |
rating | FIX: | |
certainty | FIX: | |
rating_array | FIX: | |
certainty_array | FIX: | |
word_ending | FIX: | |
last_word | FIX: | |
result | FIX: |
Definition at line 149 of file permdawg.cpp.
References adjust_word(), class_certainty, class_probability, class_string, cprintf(), dawg_debug, dawg_permute(), hyphen_base_size, hyphen_tail, JOIN_ON, letter_is_okay(), min, new_choice(), NO_RATING, push_on, rating_margin, rating_pad, and set_hyphen_word().
Referenced by dawg_permute().
00165 { 00166 A_CHOICE *better_choice; 00167 /* Add new character */ 00168 word[char_index] = class_string (this_choice)[0]; 00169 word[char_index + 1] = 0; 00170 if (word[char_index] == 0) 00171 word[char_index] = ' '; 00172 certainty_array[char_index] = class_certainty (this_choice); 00173 00174 rating += class_probability (this_choice); 00175 certainty = min (class_certainty (this_choice), certainty); 00176 00177 if (rating_array[char_index] == NO_RATING) { 00178 /* Prune bad subwords */ 00179 rating_array[char_index] = rating; 00180 } 00181 else { 00182 if (rating_array[char_index] * rating_margin + rating_pad < rating) { 00183 if (dawg_debug) 00184 cprintf ("early pruned word (%s, rating=%4.2f, limit=%4.2f)\n", 00185 word, rating, *limit); 00186 return; 00187 } 00188 } 00189 00190 /* Deal with hyphens */ 00191 if (word_ending && last_word && word[char_index] == '-' && char_index > 0) { 00192 *limit = rating; 00193 if (dawg_debug) 00194 cprintf ("new hyphen choice = %s\n", word); 00195 00196 better_choice = new_choice (word, rating, certainty, -1, permuter); 00197 adjust_word(better_choice, certainty_array); 00198 push_on(*result, better_choice); 00199 set_hyphen_word(word, rating, node); 00200 } 00201 /* Look up char in DAWG */ 00202 else if (letter_is_okay (dawg, &node, char_index, prevchar, 00203 word, word_ending)) { 00204 /* Add a new word choice */ 00205 if (word_ending) { 00206 if (dawg_debug == 1) 00207 cprintf ("new choice = %s\n", word); 00208 *limit = rating; 00209 00210 better_choice = new_choice (hyphen_tail (word), rating, certainty, 00211 -1, permuter); 00212 adjust_word (better_choice, &certainty_array[hyphen_base_size ()]); 00213 push_on(*result, better_choice); 00214 } 00215 else { 00216 /* Search the next letter */ 00217 JOIN_ON (*result, 00218 dawg_permute (dawg, node, permuter, 00219 choices, char_index + 1, limit, 00220 word, rating, certainty, 00221 rating_array, certainty_array, last_word)); 00222 } 00223 } 00224 }
CHOICES dawg_permute | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node, | |||
char | permuter, | |||
CHOICES_LIST | choices, | |||
int | char_index, | |||
float * | limit, | |||
char * | word, | |||
float | rating, | |||
float | certainty, | |||
float * | rating_array, | |||
float * | certainty_array, | |||
int | last_word | |||
) |
Permute all the valid words that can be created with this starting point.
Definition at line 237 of file permdawg.cpp.
References append_next_choice(), array_count, array_index, best_string, cprintf(), dawg_debug, FALSE, first, hyphen_base_size, iterate_list, NIL, NULL, print_choices(), and TRUE.
Referenced by append_next_choice(), and dawg_permute_and_select().
00248 { 00249 CHOICES result = NIL; 00250 CHOICES c; 00251 char *prevchar; 00252 int word_ending = FALSE; 00253 00254 if (dawg_debug) { 00255 cprintf ("dawg_permute (node=%d, char_index=%d, limit=%4.2f, ", 00256 node, char_index, *limit); 00257 cprintf ("word=%s, rating=%4.2f, certainty=%4.2f)\n", 00258 word, rating, certainty); 00259 } 00260 /* Check for End of Word (EOW) */ 00261 if (1 + char_index == array_count (choices) + hyphen_base_size ()) 00262 word_ending = TRUE; 00263 00264 if (char_index < array_count (choices) + hyphen_base_size ()) { 00265 prevchar = NULL; 00266 iterate_list (c, 00267 (CHOICES) array_index (choices, 00268 char_index - hyphen_base_size ())) { 00269 append_next_choice (dawg, node, permuter, word, choices, char_index, 00270 (A_CHOICE *) first (c), 00271 prevchar != NULL ? *prevchar : '\0', limit, 00272 rating, certainty, rating_array, certainty_array, 00273 word_ending, last_word, &result); 00274 prevchar = best_string (c); 00275 } 00276 } 00277 00278 if (result && (dawg_debug == 1)) 00279 print_choices ("dawg_permute", result); 00280 return (result); 00281 }
void dawg_permute_and_select | ( | const char * | string, | |
EDGE_ARRAY | dawg, | |||
char | permuter, | |||
CHOICES_LIST | character_choices, | |||
A_CHOICE * | best_choice, | |||
INT16 | system_words | |||
) |
Use dawg to match word better.
string | 'system words:', 'document_words', or 'user words' | |
dawg | For dawg_permute() | |
permuter | SYSTEM_DAWG_PERM, DOC_DAWG_PERM, or USER_DAWG_PERM | |
character_choices | LIST of choices for characters in word | |
best_choice | Best choice (modified) | |
system_words | TRUE if referencing 'tessdata/word-dawg', FIX: |
If you enable display_ratings, tesseract will show you what combinations it considered to arrive at any particular word it recognizes. Very useful!
Definition at line 302 of file permdawg.cpp.
References best_probability, class_probability, clone_choice, cprintf(), dawg_permute(), display_ratings, first, free_choice(), good_word, hyphen_state, hyphen_string, is_last_word, MAX_WERD_LENGTH, NIL, NO_RATING, ok_word, pop_off, print_choices(), and rating_margin.
Referenced by permute_words().
00307 { 00308 CHOICES result = NIL; 00309 char word[MAX_WERD_LENGTH + 1]; 00310 float certainty_array[MAX_WERD_LENGTH + 1]; 00311 float rating_array[MAX_WERD_LENGTH + 1]; 00312 float rating; 00313 int char_index; 00314 NODE_REF dawg_node = 0; 00315 00316 /* Pruning margin ratio */ 00317 rating_margin = ok_word / good_word; 00318 00319 word[0] = '\0'; 00320 rating = class_probability (best_choice); 00321 00322 for (char_index = 0; char_index < MAX_WERD_LENGTH + 1; char_index++) 00323 rating_array[char_index] = NO_RATING; 00324 char_index = 0; 00325 00326 if (!is_last_word () && hyphen_string) { 00327 strcpy(word, hyphen_string); 00328 char_index = strlen (hyphen_string); 00329 if (system_words) 00330 dawg_node = hyphen_state; 00331 } 00332 result = dawg_permute (dawg, dawg_node, permuter, character_choices, 00333 char_index, &rating, word, 0.0, 0.0, 00334 rating_array, certainty_array, is_last_word ()); 00335 00336 #ifdef TEXT_VERBOSE 00337 // gets a 'p', see ccmain/tesseractmain.dox 00338 cprintf("p"); 00339 #endif 00340 if (display_ratings && result) 00341 print_choices(string, result); 00342 00343 while (result != NIL) { 00344 if (best_probability (result) < class_probability (best_choice)) { 00345 clone_choice (best_choice, first (result)); 00346 } 00347 free_choice (first (result)); 00348 pop_off(result); 00349 } 00350 }
void end_permdawg | ( | ) |
Free up memory taken by DAWG.
Definition at line 372 of file permdawg.cpp.
References frequent_words, memfree(), and NULL.
Referenced by program_editdown().
00372 { 00373 memfree(frequent_words); 00374 frequent_words = NULL; 00375 }
void init_permdawg | ( | ) |
Initialize the variables needed.
Definition at line 355 of file permdawg.cpp.
References demodir, frequent_words, MAX_FREQ_EDGES, memalloc(), and read_squished_dawg().
Referenced by init_permute().
00355 { 00356 char name[1024]; 00357 make_dawg_debug(); 00358 make_ok_word(); 00359 make_good_word(); 00360 make_freq_word(); 00361 00362 frequent_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * 00363 MAX_FREQ_EDGES); 00364 strcpy(name, demodir); 00365 strcat (name, "tessdata/freq-dawg"); 00366 read_squished_dawg(name, frequent_words, MAX_FREQ_EDGES); 00367 }
int test_freq_words | ( | const char * | word | ) |
Tests a word against the frequent word dawg.
Definition at line 380 of file permdawg.cpp.
References frequent_words, and word_in_dawg().
Referenced by dict_word().
00380 { 00381 return (word_in_dawg (frequent_words, word)); 00382 }
/*
 * Edge table of the frequent-word DAWG: allocated and loaded by
 * init_permdawg(), released by end_permdawg().
 * Definition at line 46 of file permdawg.cpp.
 * Referenced by adjust_word(), end_permdawg(), init_permdawg(), and
 * test_freq_words().
 */
static EDGE_ARRAY frequent_words;
/*
 * Pruning margin ratio: set to ok_word / good_word by
 * dawg_permute_and_select() and used as the multiplier in the early-pruning
 * test of append_next_choice().
 * Definition at line 47 of file permdawg.cpp.
 * Referenced by append_next_choice(), and dawg_permute_and_select().
 */
static float rating_margin;
/*
 * Constant slack added on top of the scaled reference rating in the
 * early-pruning test of append_next_choice().
 */
static float rating_pad = 5.0;