dict/dawg.h File Reference

#include <ctype.h>
#include "general.h"

Go to the source code of this file.

Defines

Typedefs

Functions

Variables


Define Documentation

#define backward_edge ( edges,
 ) 

Value:

(! ((edges)[e] & (DIRECTION_FLAG << FLAG_START_BIT)) && \
   edge_occupied (edges,e))
TRUE if this edge-spot is in the non-forward/backward direction.

Definition at line 339 of file dawg.h.

Referenced by add_edge_linkage(), edges_in_node(), move_node(), print_dawg_node(), relocate_edge(), and remove_edge_linkage().

#define case_is_okay ( word,
 ) 

Value:

(i ?                                                      \
   ((isupper(word[i]) && islower(word[i-1])) ?              \
   FALSE :                                                 \
   ((islower(word[i]) && isupper(word[i-1]) &&             \
      i>1 && isalpha (word[i-2])) ?                       \
   FALSE :                                                \
   TRUE)) :                                               \
   TRUE)
Check that case of letter in word is OK.

Check the case of this character in the character string to make sure that there is not problem with the case. Checks not only for 'aB' but also '?Ab' where '?' is any alphanumeric character.

Definition at line 358 of file dawg.h.

Referenced by letter_is_okay().

#define clear_all_edges ( dawg,
edge,
max_num_edges   ) 

Value:

for  (edge=0; edge<max_num_edges; edge++)      \
   set_empty_edge (dawg, edge);
Go through all edges in DAWG and clear each out.

Definition at line 299 of file dawg.h.

Referenced by initialize_dawg(), and read_squished_dawg().

#define DIRECTION_FLAG   (INT32) 2

Definition at line 226 of file dawg.h.

Referenced by add_edge_linkage().

#define edge_letter ( edges,
 )     ((edges)[e] >> LETTER_START_BIT)

The letter choice that corresponds to this edge in the DAWG.

Definition at line 312 of file dawg.h.

Referenced by edge_char_of(), print_dawg_node(), and remove_edge_linkage().

#define edge_loop ( edges,
 )     while (! last_edge (edges,e++))

Loop for each of the edge-spots in the forward direction.

This macro is an iterator: 'while (iterator condition)'

Definition at line 348 of file dawg.h.

Referenced by add_edge_linkage(), edge_char_of(), edges_in_node(), move_node(), num_forward_edges(), print_dawg_node(), and relocate_edge().

#define edge_occupied ( edges,
 )     ((edges)[e] != NO_EDGE)

TRUE if this edge-spot is occupied.

Definition at line 306 of file dawg.h.

Referenced by edge_char_of(), edges_in_node(), new_dawg_node(), print_dawg_node(), and room_in_node().

#define end_of_word ( edges,
 )     ((edges)[e] & (WERD_END_FLAG << FLAG_START_BIT))

TRUE if this edge-spot marks end of a word.

Definition at line 326 of file dawg.h.

Referenced by edge_char_of(), print_dawg_node(), and remove_edge_linkage().

#define FLAG_START_BIT   21

Definition at line 230 of file dawg.h.

#define forward_edge ( edges,
 ) 

Value:

((edges)[e] & (DIRECTION_FLAG << FLAG_START_BIT) && \
   edge_occupied (edges,e))
TRUE if this edge-spot is in the forward direction.

Definition at line 332 of file dawg.h.

Referenced by add_edge_linkage(), move_node(), num_forward_edges(), print_dawg_node(), relocate_edge(), and remove_edge_linkage().

#define last_edge ( edges,
 )     ((edges)[e] & (LAST_FLAG << FLAG_START_BIT))

TRUE if this edge is the last edge in the sequence.

TRUE for last edge in both forward and backward part.

Definition at line 320 of file dawg.h.

Referenced by print_dawg_node(), read_squished_dawg(), and remove_edge_linkage().

#define LAST_FLAG   (INT32) 1

Definition at line 225 of file dawg.h.

Referenced by add_edge_linkage().

#define leading_punc ( ch   ) 

Value:

((ch == '\"' ) ||       \
   (ch == '('  ) ||       \
   (ch == '{'  ) ||       \
   (ch == '['  ) ||       \
   (ch == '`'  ) ||       \
   (ch == '\'' ))
Check for leading punctuation.

Definition at line 389 of file dawg.h.

Referenced by letter_is_okay(), and verify_trailing_punct().

#define LETTER_START_BIT   24

Check that letter-choice corresponds an edge in DAWG.

Used in the bit-test 'dawg[edge] >> LETTER_START_BIT' to check that a letter choice corresponds to an edge in DAWG

Definition at line 239 of file dawg.h.

#define MAX_NODE_EDGES   (INT32) 100

FIX:.

Only used in print_dawg_node()

Definition at line 224 of file dawg.h.

Referenced by print_dawg_node().

#define MAX_WERD_LENGTH   (INT32) 40

Max lenght of a werd to be permuted.

fmg thinks this is greater than the "longest English word" as 'letters'/blobs in a werd may end up getting removed/collapsed in the process of permuting.

Definition at line 217 of file dawg.h.

Referenced by dawg_permute_and_select(), new_dawg_node(), number_permute_and_select(), permute_compound_words(), and permute_words().

#define next_node ( edges,
 )     ((edges)[e] & NO_EDGE)

The next node visited in the DAWG by following this edge.

Definition at line 281 of file dawg.h.

Referenced by add_word_to_dawg(), move_node(), print_dawg_node(), relocate_edge(), and remove_edge_linkage().

#define NO_EDGE   (INT32) 0x1fffff

Denotes that edge-spot contains no edge info.

Used in conjunction with class_probability() macro

Referenced by add_word_to_dawg(), edge_char_of(), and letter_is_okay().

#define set_empty_edge ( edges,
 )     ((edges)[e] = NO_EDGE)

Initializes edge-spot e to NO_EDGE.

Definition at line 293 of file dawg.h.

Referenced by remove_edge_linkage().

#define set_next_edge ( edges,
e,
value   )     ( (edges)[e] = ((edges)[e] & (INT32) 0xffe00000) | (value & NO_EDGE) )

Set the next node link for this edge in the DAWG.

Definition at line 287 of file dawg.h.

Referenced by relocate_edge().

#define trailing_punc ( ch   ) 

Value:

((ch == '}'  ) ||       \
   (ch == ':'  ) ||       \
   (ch == ';'  ) ||       \
   (ch == '-'  ) ||       \
   (ch == ']'  ) ||       \
   (ch == '!'  ) ||       \
   (ch == '?'  ) ||       \
   (ch == '`'  ) ||       \
   (ch == ','  ) ||       \
   (ch == '.'  ) ||       \
   (ch == ')'  ) ||       \
   (ch == '\"' ) ||       \
   (ch == '\'' ))
Check for trailing punctuation.

Definition at line 371 of file dawg.h.

Referenced by letter_is_okay(), and verify_trailing_punct().

#define WERD_END_FLAG   (INT32) 4

End-of-word (EOW) flag

Definition at line 228 of file dawg.h.

Referenced by add_edge_linkage().


Typedef Documentation

EDGE_ARRAY

Type of pointer to EDGE_RECORD, holds pointer to one edge record.

Definition at line 256 of file dawg.h.

EDGE_RECORD

Type of UINT32, holds one edge record.

Definition at line 251 of file dawg.h.

EDGE_REF

Type of INT32, holds one edge reference.

Definition at line 261 of file dawg.h.

NODE_REF

Type of INT32, holds one node reference.

Definition at line 266 of file dawg.h.


Function Documentation

EDGE_REF edge_char_of ( EDGE_ARRAY  dawg,
NODE_REF  node,
int  character,
int  word_end 
)

Return the edge that corresponds to the letter out of this node.

Definition at line 41 of file dawg.cpp.

References case_sensative, edge_letter, edge_loop, edge_occupied, end_of_word, and NO_EDGE.

Referenced by add_word_to_dawg(), and letter_is_okay().

00044                                     {
00045   EDGE_REF   edge = node;
00046 
00047   if (! case_sensative) character = tolower (character);
00048 
00049   if (edge_occupied (dawg, edge)) {
00050     do {
00051       if ((edge_letter (dawg, edge) == character) &&
00052         (! word_end || end_of_word(dawg,edge)))
00053         return (edge);
00054 
00055     } edge_loop (dawg, edge);
00056   }
00057 
00058   return (NO_EDGE);
00059 }

INT32 edges_in_node ( EDGE_ARRAY  dawg,
NODE_REF  node 
)

Count the number of edges in this node in the DAWG; includes both forward and back links.

Definition at line 66 of file dawg.cpp.

References backward_edge, edge_loop, and edge_occupied.

Referenced by add_edge_linkage(), add_word_to_dawg(), move_node(), remove_edge_linkage(), and room_in_node().

00066                                                     {
00067   EDGE_REF   edge = node;
00068 
00069   if (edge_occupied (dawg, edge)) {
00070     edge_loop(dawg, edge);
00071     if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) {
00072       edge_loop(dawg, edge);
00073       return (edge - node);
00074     }
00075     else {
00076       return (edge - node);
00077     }
00078   }
00079   else {
00080     return (edge - node);
00081   }
00082 }

INT32 letter_is_okay ( EDGE_ARRAY  dawg,
NODE_REF node,
INT32  char_index,
char  prevchar,
const char *  word,
INT32  word_end 
)

Check this letter in light of the current state.

Parameters:
dawg Array of edges
node Node ?
char_index ?
prevchar Previous character
word string of word
word_end 0 or 1, reached end of word
Returns:
TRUE if letter checks out in light of the current state

Definition at line 96 of file dawg.cpp.

References case_is_okay, case_sensative, edge_char_of(), FALSE, leading_punc, NO_EDGE, punctuation_ok(), STRING::string(), trailing_punc, TRUE, verify_trailing_punct(), and word_in_dawg().

Referenced by append_next_choice(), and word_in_dawg().

00101                                      {
00102   EDGE_REF     edge;
00103   STRING dummy_word(word);  // Auto-deleting string fixes memory leak.
00104 
00105   if (*node == NO_EDGE) {        /* Trailing punctuation */
00106     if (trailing_punc (dummy_word [char_index])
00107       && (!trailing_punc (prevchar)
00108       || punctuation_ok(dummy_word.string())>=0))
00109       return (TRUE);
00110     else
00111       return (FALSE);
00112   }
00113   else {
00114     /* Leading punctuation */
00115     if (*node == 0                               &&
00116       char_index != 0                            &&
00117       isalpha (dummy_word [char_index])          &&
00118       ! leading_punc (dummy_word [char_index-1]) &&
00119     dummy_word [char_index-1] != '-') {
00120       return (FALSE);
00121     }
00122   }
00123   /* Handle compund words */
00124   if (dummy_word [char_index] == '-') {
00125     if (char_index>0 && !word_end
00126       && word [char_index-1] == '-'
00127       && word [char_index+1] == '-')
00128       return FALSE;              /*not allowed*/
00129     dummy_word [char_index] = (char) 0;
00130     if (word_in_dawg (dawg, dummy_word.string())) {
00131       dummy_word [char_index] = '-';
00132       *node = 0;
00133       return (TRUE);
00134     }
00135     else {
00136       dummy_word [char_index] = '-';
00137       return (FALSE);
00138     }
00139   }
00140   /* Check the DAWG */
00141   edge = edge_char_of (dawg, *node, dummy_word [char_index], word_end);
00142 
00143   if (edge != NO_EDGE) {         /* Normal edge in DAWG */
00144     if (case_sensative || case_is_okay (dummy_word, char_index)) {
00145                                  //next_node (dawg, edge);
00146       *node = (dawg)[edge] & NO_EDGE;
00147       return (TRUE);
00148     }
00149     else {
00150       return (FALSE);
00151     }
00152   }
00153   else {
00154     /* Leading punctuation */
00155     if (leading_punc (word [char_index]) &&
00156     (char_index == 0  ||  leading_punc (dummy_word [char_index-1]))) {
00157       *node = 0;
00158       if (leading_punc (prevchar) || punctuation_ok (word)>=0)
00159         return (TRUE);
00160       else
00161         return FALSE;
00162     }
00163     /* Trailing punctuation */
00164     if (verify_trailing_punct (dawg, &dummy_word[0], char_index)) {
00165       *node = NO_EDGE;
00166       return (TRUE);
00167     }
00168 
00169     return (FALSE);
00170   }
00171 }

INT32 num_forward_edges ( EDGE_ARRAY  dawg,
NODE_REF  node 
)

Count and return the number of forward edges for this node.

Definition at line 177 of file dawg.cpp.

References edge_loop, and forward_edge.

Referenced by remove_edge_linkage().

00177                                                         { 
00178   EDGE_REF   edge = node;
00179   INT32        num  = 0;
00180 
00181   if (forward_edge (dawg, edge)) {
00182     do {
00183       num++;
00184     } edge_loop (dawg, edge);
00185   }
00186 
00187   return (num);
00188 }

void print_dawg_node ( EDGE_ARRAY  dawg,
NODE_REF  node 
)

Print the contents of one of the nodes in the DAWG.

Definition at line 194 of file dawg.cpp.

References backward_edge, cprintf(), direction(), edge_letter, edge_loop, edge_occupied, end_of_word, forward_edge, last_edge, MAX_NODE_EDGES, new_line, and next_node.

Referenced by move_node(), remove_edge_linkage(), and word_in_dawg().

00194                                                      { 
00195   EDGE_REF   edge = node;
00196   const char       *forward_string  = "FORWARD";
00197   const char       *backward_string = "       ";
00198 
00199   const char       *last_string     = "LAST";
00200   const char       *not_last_string = "    ";
00201 
00202   const char       *eow_string      = "EOW";
00203   const char       *not_eow_string  = "   ";
00204 
00205   const char       *direction;
00206   const char       *is_last;
00207   const char       *eow;
00208 
00209   char       ch;
00210 
00211   if (edge_occupied (dawg, edge)) {
00212     do {
00213       if (forward_edge (dawg, edge)) direction = forward_string;
00214       else                           direction = backward_string;
00215 
00216       if (last_edge    (dawg, edge)) is_last   = last_string;
00217       else                           is_last   = not_last_string;
00218 
00219       if (end_of_word  (dawg, edge)) eow       = eow_string;
00220       else                           eow       = not_eow_string;
00221 
00222       ch = edge_letter (dawg, edge);
00223       cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n",
00224         (int) edge, (int) next_node (dawg, edge), ch,
00225         direction, is_last, eow);
00226 
00227       if (edge - node > MAX_NODE_EDGES) return;
00228     } edge_loop (dawg, edge);
00229 
00230     if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) {
00231       do {
00232         if (forward_edge (dawg, edge)) direction = forward_string;
00233         else                           direction = backward_string;
00234 
00235         if (last_edge    (dawg, edge)) is_last   = last_string;
00236         else                           is_last   = not_last_string;
00237 
00238         if (end_of_word  (dawg, edge)) eow       = eow_string;
00239         else                           eow       = not_eow_string;
00240 
00241         ch = edge_letter (dawg, edge);
00242         cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n",
00243           (int) edge, (int) next_node (dawg, edge), ch,
00244           direction, is_last, eow);
00245 
00246         if (edge - node > MAX_NODE_EDGES) return;
00247       } edge_loop (dawg, edge);
00248     }
00249   }
00250   else {
00251     cprintf ("%5d : no edges in this node\n", node);
00252   }
00253   new_line();
00254 }

void read_squished_dawg ( char *  filename,
EDGE_ARRAY  dawg,
INT32  max_num_edges 
)

Read the compressed, binary DAWG from the file 'tessdata/word-dawg'.

First record has 179 edges, the second/last one has 47971 edges

Definition at line 262 of file dawg.cpp.

References clear_all_edges, debug, last_edge, open_file(), print_string, and reverse32().

Referenced by init_permdawg(), and init_permute().

00262                                                                               { 
00263   FILE       *file;
00264   EDGE_REF   edge;
00265   INT32      num_edges;
00266   INT32      node_count = 0;
00267 
00268   if (debug) print_string ("read_debug");
00269 
00270   clear_all_edges(dawg, edge, max_num_edges);
00271 
00272   #ifdef __UNIX__
00273   file = open_file (filename, "r");
00274   #else
00275   file = open_file (filename, "rb");
00276   #endif
00277   fseek(file, 0, SEEK_END);
00278   long fsize = ftell(file);
00279   rewind(file);
00280   fread (&num_edges,  sizeof (int), 1, file);
00281   // Auto-detect relative endianness of file and OS as future DAWG
00282   // files may be little-endian.
00283   long diff1 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize;
00284   reverse32(&num_edges);
00285   long diff2 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize;
00286   reverse32(&num_edges);
00287   // One of diff1 and diff2 should now be 0, but find the smallest
00288   // just in case.
00289   if (diff1 < 0) diff1 = -diff1;
00290   if (diff2 < 0) diff2 = -diff2;
00291   bool swap = diff2 < diff1;
00292   if (swap)
00293     reverse32(&num_edges);
00294   fread (&dawg[0], sizeof (EDGE_RECORD), num_edges, file);
00295   fclose(file);
00296   if (swap)
00297     for (edge=0;edge<num_edges;edge++)
00298       reverse32(&dawg[edge]);
00299 
00300   for  (edge=0; edge<max_num_edges; edge++)
00301     if (last_edge (dawg, edge)) node_count++;
00302 }

INT32 verify_trailing_punct ( EDGE_ARRAY  dawg,
char *  word,
INT32  char_index 
)

Make sure that there is a valid transition from the word core to a string of trailing puntuation.

Parameters:
dawg Array of edges
word The word in question
char_index Which letter in the word
Returns:
TRUE if everything is OK.

Definition at line 314 of file dawg.cpp.

References FALSE, leading_punc, trailing_punc, TRUE, and word_in_dawg().

Referenced by letter_is_okay().

00314                                                                            { 
00315   char       last_char;
00316   char       *first_char;
00317 
00318   if (trailing_punc (word [char_index])) {
00319 
00320     last_char = word [char_index];
00321     word [char_index] = (char) 0;
00322 
00323     for (first_char = word; leading_punc (first_char[0]); first_char++);
00324 
00325     if (word_in_dawg (dawg, first_char)) {
00326       word [char_index] = last_char;
00327       return (TRUE);
00328     }
00329     word [char_index] = last_char;
00330   }
00331   return (FALSE);
00332 }

INT32 word_in_dawg ( EDGE_ARRAY  dawg,
const char *  string 
)

Test to see if the word can be found in the DAWG.

Definition at line 338 of file dawg.cpp.

References debug, FALSE, letter_is_okay(), new_line, print_dawg_node(), and TRUE.

Referenced by add_document_word(), adjust_word(), letter_is_okay(), read_word_list(), test_freq_words(), valid_word(), and verify_trailing_punct().

00338                                                         { 
00339   NODE_REF   node = 0;
00340   INT32        i;
00341   INT32         length;
00342 
00343   length=strlen(string);
00344   if (length==0)
00345     return FALSE;
00346   for (i=0; i<length; i++) {
00347     if (debug) {
00348       print_dawg_node(dawg, node);
00349       new_line();
00350     }
00351 
00352     if (! letter_is_okay (dawg, &node, i,'\0', string, (string[i+1]==0))) {
00353       return (FALSE);
00354     }
00355   }
00356 
00357   return (TRUE);
00358 }


Variable Documentation

INT32 case_sensative

Always = FALSE/0 in Tesseract 1.02

Definition at line 33 of file dawg.cpp.

Referenced by add_document_word(), add_word_to_dawg(), edge_char_of(), init_permute(), letter_is_okay(), permute_words(), and valid_word().

INT32 debug

Note:
File: dawg.cpp (Formerly dawg.c)
Use a Directed Accyclic Word Graph
Author:
Mark Seaman, OCR Technology
Date:
Oct 16 14:37:00 1987 Jul 24 16:59:16 1991 (Mark Seaman) marks
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.

Definition at line 32 of file dawg.cpp.

Referenced by add_edge_linkage(), add_new_edge(), add_word_to_dawg(), move_node(), read_squished_dawg(), relocate_edge(), remove_edge_linkage(), and word_in_dawg().


Generated on Wed Feb 28 19:49:22 2007 for Tesseract by  doxygen 1.5.1