dict/dawg.cpp

Go to the documentation of this file.
00001 
00020 /*----------------------------------------------------------------------
00021               I n c l u d e s
00022 ----------------------------------------------------------------------*/
00023 #include "dawg.h"
00024 #include "cutil.h"
00025 #include "callcpp.h"
00026 #include "context.h"
00027 #include "strngs.h"
00028 
00029 /*----------------------------------------------------------------------
00030               V a r i a b l e s
00031 ----------------------------------------------------------------------*/
00032 INT32 debug          = 0;
00033 INT32 case_sensative = 0;
00034 
00035 /*----------------------------------------------------------------------
00036               F u n c t i o n s
00037 ----------------------------------------------------------------------*/
00041 EDGE_REF edge_char_of(EDGE_ARRAY dawg,
00042                       NODE_REF node,
00043                       int character,
00044                       int word_end) {
00045   EDGE_REF   edge = node;
00046 
00047   if (! case_sensative) character = tolower (character);
00048 
00049   if (edge_occupied (dawg, edge)) {
00050     do {
00051       if ((edge_letter (dawg, edge) == character) &&
00052         (! word_end || end_of_word(dawg,edge)))
00053         return (edge);
00054 
00055     } edge_loop (dawg, edge);
00056   }
00057 
00058   return (NO_EDGE);
00059 }
00060 
00061 
00066 INT32 edges_in_node(EDGE_ARRAY dawg, NODE_REF node) {
00067   EDGE_REF   edge = node;
00068 
00069   if (edge_occupied (dawg, edge)) {
00070     edge_loop(dawg, edge);
00071     if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) {
00072       edge_loop(dawg, edge);
00073       return (edge - node);
00074     }
00075     else {
00076       return (edge - node);
00077     }
00078   }
00079   else {
00080     return (edge - node);
00081   }
00082 }
00083 
00084 
00096 INT32 letter_is_okay(EDGE_ARRAY dawg,
00097                      NODE_REF *node,
00098                      INT32 char_index,
00099                      char prevchar,
00100                      const char *word,
00101                      INT32 word_end) {
00102   EDGE_REF     edge;
00103   STRING dummy_word(word);  // Auto-deleting string fixes memory leak.
00104 
00105   if (*node == NO_EDGE) {        /* Trailing punctuation */
00106     if (trailing_punc (dummy_word [char_index])
00107       && (!trailing_punc (prevchar)
00108       || punctuation_ok(dummy_word.string())>=0))
00109       return (TRUE);
00110     else
00111       return (FALSE);
00112   }
00113   else {
00114     /* Leading punctuation */
00115     if (*node == 0                               &&
00116       char_index != 0                            &&
00117       isalpha (dummy_word [char_index])          &&
00118       ! leading_punc (dummy_word [char_index-1]) &&
00119     dummy_word [char_index-1] != '-') {
00120       return (FALSE);
00121     }
00122   }
00123   /* Handle compund words */
00124   if (dummy_word [char_index] == '-') {
00125     if (char_index>0 && !word_end
00126       && word [char_index-1] == '-'
00127       && word [char_index+1] == '-')
00128       return FALSE;              /*not allowed*/
00129     dummy_word [char_index] = (char) 0;
00130     if (word_in_dawg (dawg, dummy_word.string())) {
00131       dummy_word [char_index] = '-';
00132       *node = 0;
00133       return (TRUE);
00134     }
00135     else {
00136       dummy_word [char_index] = '-';
00137       return (FALSE);
00138     }
00139   }
00140   /* Check the DAWG */
00141   edge = edge_char_of (dawg, *node, dummy_word [char_index], word_end);
00142 
00143   if (edge != NO_EDGE) {         /* Normal edge in DAWG */
00144     if (case_sensative || case_is_okay (dummy_word, char_index)) {
00145                                  //next_node (dawg, edge);
00146       *node = (dawg)[edge] & NO_EDGE;
00147       return (TRUE);
00148     }
00149     else {
00150       return (FALSE);
00151     }
00152   }
00153   else {
00154     /* Leading punctuation */
00155     if (leading_punc (word [char_index]) &&
00156     (char_index == 0  ||  leading_punc (dummy_word [char_index-1]))) {
00157       *node = 0;
00158       if (leading_punc (prevchar) || punctuation_ok (word)>=0)
00159         return (TRUE);
00160       else
00161         return FALSE;
00162     }
00163     /* Trailing punctuation */
00164     if (verify_trailing_punct (dawg, &dummy_word[0], char_index)) {
00165       *node = NO_EDGE;
00166       return (TRUE);
00167     }
00168 
00169     return (FALSE);
00170   }
00171 }
00172 
00173 
00177 INT32 num_forward_edges(EDGE_ARRAY dawg, NODE_REF node) { 
00178   EDGE_REF   edge = node;
00179   INT32        num  = 0;
00180 
00181   if (forward_edge (dawg, edge)) {
00182     do {
00183       num++;
00184     } edge_loop (dawg, edge);
00185   }
00186 
00187   return (num);
00188 }
00189 
00190 
00194 void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node) { 
00195   EDGE_REF   edge = node;
00196   const char       *forward_string  = "FORWARD";
00197   const char       *backward_string = "       ";
00198 
00199   const char       *last_string     = "LAST";
00200   const char       *not_last_string = "    ";
00201 
00202   const char       *eow_string      = "EOW";
00203   const char       *not_eow_string  = "   ";
00204 
00205   const char       *direction;
00206   const char       *is_last;
00207   const char       *eow;
00208 
00209   char       ch;
00210 
00211   if (edge_occupied (dawg, edge)) {
00212     do {
00213       if (forward_edge (dawg, edge)) direction = forward_string;
00214       else                           direction = backward_string;
00215 
00216       if (last_edge    (dawg, edge)) is_last   = last_string;
00217       else                           is_last   = not_last_string;
00218 
00219       if (end_of_word  (dawg, edge)) eow       = eow_string;
00220       else                           eow       = not_eow_string;
00221 
00222       ch = edge_letter (dawg, edge);
00223       cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n",
00224         (int) edge, (int) next_node (dawg, edge), ch,
00225         direction, is_last, eow);
00226 
00227       if (edge - node > MAX_NODE_EDGES) return;
00228     } edge_loop (dawg, edge);
00229 
00230     if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) {
00231       do {
00232         if (forward_edge (dawg, edge)) direction = forward_string;
00233         else                           direction = backward_string;
00234 
00235         if (last_edge    (dawg, edge)) is_last   = last_string;
00236         else                           is_last   = not_last_string;
00237 
00238         if (end_of_word  (dawg, edge)) eow       = eow_string;
00239         else                           eow       = not_eow_string;
00240 
00241         ch = edge_letter (dawg, edge);
00242         cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n",
00243           (int) edge, (int) next_node (dawg, edge), ch,
00244           direction, is_last, eow);
00245 
00246         if (edge - node > MAX_NODE_EDGES) return;
00247       } edge_loop (dawg, edge);
00248     }
00249   }
00250   else {
00251     cprintf ("%5d : no edges in this node\n", node);
00252   }
00253   new_line();
00254 }
00255 
00256 
00262 void read_squished_dawg(char *filename, EDGE_ARRAY dawg, INT32 max_num_edges) { 
00263   FILE       *file;
00264   EDGE_REF   edge;
00265   INT32      num_edges;
00266   INT32      node_count = 0;
00267 
00268   if (debug) print_string ("read_debug");
00269 
00270   clear_all_edges(dawg, edge, max_num_edges);
00271 
00272   #ifdef __UNIX__
00273   file = open_file (filename, "r");
00274   #else
00275   file = open_file (filename, "rb");
00276   #endif
00277   fseek(file, 0, SEEK_END);
00278   long fsize = ftell(file);
00279   rewind(file);
00280   fread (&num_edges,  sizeof (int), 1, file);
00281   // Auto-detect relative endianness of file and OS as future DAWG
00282   // files may be little-endian.
00283   long diff1 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize;
00284   reverse32(&num_edges);
00285   long diff2 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize;
00286   reverse32(&num_edges);
00287   // One of diff1 and diff2 should now be 0, but find the smallest
00288   // just in case.
00289   if (diff1 < 0) diff1 = -diff1;
00290   if (diff2 < 0) diff2 = -diff2;
00291   bool swap = diff2 < diff1;
00292   if (swap)
00293     reverse32(&num_edges);
00294   fread (&dawg[0], sizeof (EDGE_RECORD), num_edges, file);
00295   fclose(file);
00296   if (swap)
00297     for (edge=0;edge<num_edges;edge++)
00298       reverse32(&dawg[edge]);
00299 
00300   for  (edge=0; edge<max_num_edges; edge++)
00301     if (last_edge (dawg, edge)) node_count++;
00302 }
00303 
00304 
00314 INT32 verify_trailing_punct(EDGE_ARRAY dawg, char *word, INT32 char_index) { 
00315   char       last_char;
00316   char       *first_char;
00317 
00318   if (trailing_punc (word [char_index])) {
00319 
00320     last_char = word [char_index];
00321     word [char_index] = (char) 0;
00322 
00323     for (first_char = word; leading_punc (first_char[0]); first_char++);
00324 
00325     if (word_in_dawg (dawg, first_char)) {
00326       word [char_index] = last_char;
00327       return (TRUE);
00328     }
00329     word [char_index] = last_char;
00330   }
00331   return (FALSE);
00332 }
00333 
00334 
00338 INT32 word_in_dawg(EDGE_ARRAY dawg, const char *string) { 
00339   NODE_REF   node = 0;
00340   INT32        i;
00341   INT32         length;
00342 
00343   length=strlen(string);
00344   if (length==0)
00345     return FALSE;
00346   for (i=0; i<length; i++) {
00347     if (debug) {
00348       print_dawg_node(dawg, node);
00349       new_line();
00350     }
00351 
00352     if (! letter_is_okay (dawg, &node, i,'\0', string, (string[i+1]==0))) {
00353       return (FALSE);
00354     }
00355   }
00356 
00357   return (TRUE);
00358 }

Generated on Wed Feb 28 19:49:10 2007 for Tesseract by  doxygen 1.5.1