dict/dawg.cpp File Reference

#include "dawg.h"
#include "cutil.h"
#include "callcpp.h"
#include "context.h"
#include "strngs.h"

Go to the source code of this file.

Functions

Variables


Function Documentation

EDGE_REF edge_char_of ( EDGE_ARRAY  dawg,
NODE_REF  node,
int  character,
int  word_end 
)

Return the edge that corresponds to the letter out of this node.

Definition at line 41 of file dawg.cpp.

References case_sensative, edge_letter, edge_loop, edge_occupied, end_of_word, and NO_EDGE.

Referenced by add_word_to_dawg(), and letter_is_okay().

00044                                     {
00045   EDGE_REF   edge = node;
00046 
00047   if (! case_sensative) character = tolower (character);
00048 
00049   if (edge_occupied (dawg, edge)) {
00050     do {
00051       if ((edge_letter (dawg, edge) == character) &&
00052         (! word_end || end_of_word(dawg,edge)))
00053         return (edge);
00054 
00055     } edge_loop (dawg, edge);
00056   }
00057 
00058   return (NO_EDGE);
00059 }

INT32 edges_in_node ( EDGE_ARRAY  dawg,
NODE_REF  node 
)

Count the number of edges in this node in the DAWG; includes both forward and back links.

Definition at line 66 of file dawg.cpp.

References backward_edge, edge_loop, and edge_occupied.

Referenced by add_edge_linkage(), add_word_to_dawg(), move_node(), remove_edge_linkage(), and room_in_node().

00066                                                     {
00067   EDGE_REF   edge = node;
00068 
00069   if (edge_occupied (dawg, edge)) {
00070     edge_loop(dawg, edge);
00071     if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) {
00072       edge_loop(dawg, edge);
00073       return (edge - node);
00074     }
00075     else {
00076       return (edge - node);
00077     }
00078   }
00079   else {
00080     return (edge - node);
00081   }
00082 }

INT32 letter_is_okay ( EDGE_ARRAY  dawg,
NODE_REF node,
INT32  char_index,
char  prevchar,
const char *  word,
INT32  word_end 
)

Check this letter in light of the current state.

Parameters:
dawg Array of edges
node Node ?
char_index ?
prevchar Previous character
word string of word
word_end 0 or 1, reached end of word
Returns:
TRUE if letter checks out in light of the current state

Definition at line 96 of file dawg.cpp.

References case_is_okay, case_sensative, edge_char_of(), FALSE, leading_punc, NO_EDGE, punctuation_ok(), STRING::string(), trailing_punc, TRUE, verify_trailing_punct(), and word_in_dawg().

Referenced by append_next_choice(), and word_in_dawg().

00101                                      {
00102   EDGE_REF     edge;
00103   STRING dummy_word(word);  // Auto-deleting string fixes memory leak.
00104 
00105   if (*node == NO_EDGE) {        /* Trailing punctuation */
00106     if (trailing_punc (dummy_word [char_index])
00107       && (!trailing_punc (prevchar)
00108       || punctuation_ok(dummy_word.string())>=0))
00109       return (TRUE);
00110     else
00111       return (FALSE);
00112   }
00113   else {
00114     /* Leading punctuation */
00115     if (*node == 0                               &&
00116       char_index != 0                            &&
00117       isalpha (dummy_word [char_index])          &&
00118       ! leading_punc (dummy_word [char_index-1]) &&
00119     dummy_word [char_index-1] != '-') {
00120       return (FALSE);
00121     }
00122   }
00123   /* Handle compund words */
00124   if (dummy_word [char_index] == '-') {
00125     if (char_index>0 && !word_end
00126       && word [char_index-1] == '-'
00127       && word [char_index+1] == '-')
00128       return FALSE;              /*not allowed*/
00129     dummy_word [char_index] = (char) 0;
00130     if (word_in_dawg (dawg, dummy_word.string())) {
00131       dummy_word [char_index] = '-';
00132       *node = 0;
00133       return (TRUE);
00134     }
00135     else {
00136       dummy_word [char_index] = '-';
00137       return (FALSE);
00138     }
00139   }
00140   /* Check the DAWG */
00141   edge = edge_char_of (dawg, *node, dummy_word [char_index], word_end);
00142 
00143   if (edge != NO_EDGE) {         /* Normal edge in DAWG */
00144     if (case_sensative || case_is_okay (dummy_word, char_index)) {
00145                                  //next_node (dawg, edge);
00146       *node = (dawg)[edge] & NO_EDGE;
00147       return (TRUE);
00148     }
00149     else {
00150       return (FALSE);
00151     }
00152   }
00153   else {
00154     /* Leading punctuation */
00155     if (leading_punc (word [char_index]) &&
00156     (char_index == 0  ||  leading_punc (dummy_word [char_index-1]))) {
00157       *node = 0;
00158       if (leading_punc (prevchar) || punctuation_ok (word)>=0)
00159         return (TRUE);
00160       else
00161         return FALSE;
00162     }
00163     /* Trailing punctuation */
00164     if (verify_trailing_punct (dawg, &dummy_word[0], char_index)) {
00165       *node = NO_EDGE;
00166       return (TRUE);
00167     }
00168 
00169     return (FALSE);
00170   }
00171 }

INT32 num_forward_edges ( EDGE_ARRAY  dawg,
NODE_REF  node 
)

Count and return the number of forward edges for this node.

Definition at line 177 of file dawg.cpp.

References edge_loop, and forward_edge.

Referenced by remove_edge_linkage().

00177                                                         { 
00178   EDGE_REF   edge = node;
00179   INT32        num  = 0;
00180 
00181   if (forward_edge (dawg, edge)) {
00182     do {
00183       num++;
00184     } edge_loop (dawg, edge);
00185   }
00186 
00187   return (num);
00188 }

void print_dawg_node ( EDGE_ARRAY  dawg,
NODE_REF  node 
)

Print the contents of one of the nodes in the DAWG.

Definition at line 194 of file dawg.cpp.

References backward_edge, cprintf(), direction(), edge_letter, edge_loop, edge_occupied, end_of_word, forward_edge, last_edge, MAX_NODE_EDGES, new_line, and next_node.

Referenced by move_node(), remove_edge_linkage(), and word_in_dawg().

00194                                                      { 
00195   EDGE_REF   edge = node;
00196   const char       *forward_string  = "FORWARD";
00197   const char       *backward_string = "       ";
00198 
00199   const char       *last_string     = "LAST";
00200   const char       *not_last_string = "    ";
00201 
00202   const char       *eow_string      = "EOW";
00203   const char       *not_eow_string  = "   ";
00204 
00205   const char       *direction;
00206   const char       *is_last;
00207   const char       *eow;
00208 
00209   char       ch;
00210 
00211   if (edge_occupied (dawg, edge)) {
00212     do {
00213       if (forward_edge (dawg, edge)) direction = forward_string;
00214       else                           direction = backward_string;
00215 
00216       if (last_edge    (dawg, edge)) is_last   = last_string;
00217       else                           is_last   = not_last_string;
00218 
00219       if (end_of_word  (dawg, edge)) eow       = eow_string;
00220       else                           eow       = not_eow_string;
00221 
00222       ch = edge_letter (dawg, edge);
00223       cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n",
00224         (int) edge, (int) next_node (dawg, edge), ch,
00225         direction, is_last, eow);
00226 
00227       if (edge - node > MAX_NODE_EDGES) return;
00228     } edge_loop (dawg, edge);
00229 
00230     if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) {
00231       do {
00232         if (forward_edge (dawg, edge)) direction = forward_string;
00233         else                           direction = backward_string;
00234 
00235         if (last_edge    (dawg, edge)) is_last   = last_string;
00236         else                           is_last   = not_last_string;
00237 
00238         if (end_of_word  (dawg, edge)) eow       = eow_string;
00239         else                           eow       = not_eow_string;
00240 
00241         ch = edge_letter (dawg, edge);
00242         cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n",
00243           (int) edge, (int) next_node (dawg, edge), ch,
00244           direction, is_last, eow);
00245 
00246         if (edge - node > MAX_NODE_EDGES) return;
00247       } edge_loop (dawg, edge);
00248     }
00249   }
00250   else {
00251     cprintf ("%5d : no edges in this node\n", node);
00252   }
00253   new_line();
00254 }

void read_squished_dawg ( char *  filename,
EDGE_ARRAY  dawg,
INT32  max_num_edges 
)

Read the compressed, binary DAWG from the file 'tessdata/word-dawg'.

First record has 179 edges, the second/last one has 47971 edges

Definition at line 262 of file dawg.cpp.

References clear_all_edges, debug, last_edge, open_file(), print_string, and reverse32().

Referenced by init_permdawg(), and init_permute().

00262                                                                               { 
00263   FILE       *file;
00264   EDGE_REF   edge;
00265   INT32      num_edges;
00266   INT32      node_count = 0;
00267 
00268   if (debug) print_string ("read_debug");
00269 
00270   clear_all_edges(dawg, edge, max_num_edges);
00271 
00272   #ifdef __UNIX__
00273   file = open_file (filename, "r");
00274   #else
00275   file = open_file (filename, "rb");
00276   #endif
00277   fseek(file, 0, SEEK_END);
00278   long fsize = ftell(file);
00279   rewind(file);
00280   fread (&num_edges,  sizeof (int), 1, file);
00281   // Auto-detect relative endianness of file and OS as future DAWG
00282   // files may be little-endian.
00283   long diff1 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize;
00284   reverse32(&num_edges);
00285   long diff2 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize;
00286   reverse32(&num_edges);
00287   // One of diff1 and diff2 should now be 0, but find the smallest
00288   // just in case.
00289   if (diff1 < 0) diff1 = -diff1;
00290   if (diff2 < 0) diff2 = -diff2;
00291   bool swap = diff2 < diff1;
00292   if (swap)
00293     reverse32(&num_edges);
00294   fread (&dawg[0], sizeof (EDGE_RECORD), num_edges, file);
00295   fclose(file);
00296   if (swap)
00297     for (edge=0;edge<num_edges;edge++)
00298       reverse32(&dawg[edge]);
00299 
00300   for  (edge=0; edge<max_num_edges; edge++)
00301     if (last_edge (dawg, edge)) node_count++;
00302 }

INT32 verify_trailing_punct ( EDGE_ARRAY  dawg,
char *  word,
INT32  char_index 
)

Make sure that there is a valid transition from the word core to a string of trailing puntuation.

Parameters:
dawg Array of edges
word The word in question
char_index Which letter in the word
Returns:
TRUE if everything is OK.

Definition at line 314 of file dawg.cpp.

References FALSE, leading_punc, trailing_punc, TRUE, and word_in_dawg().

Referenced by letter_is_okay().

00314                                                                            { 
00315   char       last_char;
00316   char       *first_char;
00317 
00318   if (trailing_punc (word [char_index])) {
00319 
00320     last_char = word [char_index];
00321     word [char_index] = (char) 0;
00322 
00323     for (first_char = word; leading_punc (first_char[0]); first_char++);
00324 
00325     if (word_in_dawg (dawg, first_char)) {
00326       word [char_index] = last_char;
00327       return (TRUE);
00328     }
00329     word [char_index] = last_char;
00330   }
00331   return (FALSE);
00332 }

INT32 word_in_dawg ( EDGE_ARRAY  dawg,
const char *  string 
)

Test to see if the word can be found in the DAWG.

Definition at line 338 of file dawg.cpp.

References debug, FALSE, letter_is_okay(), new_line, print_dawg_node(), and TRUE.

Referenced by add_document_word(), adjust_word(), letter_is_okay(), read_word_list(), test_freq_words(), valid_word(), and verify_trailing_punct().

00338                                                         { 
00339   NODE_REF   node = 0;
00340   INT32        i;
00341   INT32         length;
00342 
00343   length=strlen(string);
00344   if (length==0)
00345     return FALSE;
00346   for (i=0; i<length; i++) {
00347     if (debug) {
00348       print_dawg_node(dawg, node);
00349       new_line();
00350     }
00351 
00352     if (! letter_is_okay (dawg, &node, i,'\0', string, (string[i+1]==0))) {
00353       return (FALSE);
00354     }
00355   }
00356 
00357   return (TRUE);
00358 }


Variable Documentation

INT32 case_sensative = 0

Always = FALSE/0 in Tesseract 1.02

Definition at line 33 of file dawg.cpp.

Referenced by add_document_word(), add_word_to_dawg(), edge_char_of(), init_permute(), letter_is_okay(), permute_words(), and valid_word().

INT32 debug = 0

Note:
File: dawg.cpp (Formerly dawg.c)
Use a Directed Accyclic Word Graph
Author:
Mark Seaman, OCR Technology
Date:
Oct 16 14:37:00 1987 Jul 24 16:59:16 1991 (Mark Seaman) marks
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.

Definition at line 32 of file dawg.cpp.

Referenced by add_edge_linkage(), add_new_edge(), add_word_to_dawg(), move_node(), read_squished_dawg(), relocate_edge(), remove_edge_linkage(), and word_in_dawg().


Generated on Wed Feb 28 19:49:22 2007 for Tesseract by  doxygen 1.5.1