dict/dawg.h

Go to the documentation of this file.
00001 
00020 #ifndef DAWG_H
00021 #define DAWG_H
00022 
00023 /*----------------------------------------------------------------------
00024               I n c l u d e s
00025 ----------------------------------------------------------------------*/
00026 #include <ctype.h>
00027 #include "general.h"
00028 
00207 /*----------------------------------------------------------------------
00208               T y p e s
00209 ----------------------------------------------------------------------*/
/*
 * Size limits and bit-layout constants for packed DAWG edge records.
 *
 * Every cast-style constant is wrapped in an outer pair of parentheses so
 * that it always expands as one primary expression (for example after
 * `sizeof`, or in front of a postfix operator).
 */
#define MAX_WERD_LENGTH        ((INT32) 40)

#define MAX_NODE_EDGES         ((INT32) 100)

/* Flag values; the edge macros shift them left by FLAG_START_BIT. */
#define LAST_FLAG              ((INT32) 1)
#define DIRECTION_FLAG         ((INT32) 2)

#define WERD_END_FLAG          ((INT32) 4)

/* Bit position of the lowest flag bit inside an edge record. */
#define FLAG_START_BIT         21

/* Bit position of the lowest letter bit inside an edge record. */
#define LETTER_START_BIT       24

/*
 * Mask covering the low 21 bits of an edge record (the next-node field).
 * The same value is stored in a record to mark it as an empty edge.
 */
#define NO_EDGE                ((INT32) 0x1fffff)
00246 
00251 typedef UINT32 EDGE_RECORD;
00256 typedef EDGE_RECORD *EDGE_ARRAY;
00261 typedef INT32 EDGE_REF;
00266 typedef INT32 NODE_REF;
00267 
00268 /*---------------------------------------------------------------------
00269               V a r i a b l e s
00270 ----------------------------------------------------------------------*/
00272 extern INT32 case_sensative;
00273 extern INT32 debug;
00274 
00275 /*----------------------------------------------------------------------
00276               M a c r o s
00277 ----------------------------------------------------------------------*/
/* Yield the bits of edge record `e` selected by NO_EDGE (the next-node field). */
#define next_node(edges, e) \
  (NO_EDGE & (edges)[(e)])
00283 
/*
 * Overwrite the next-node field (the bits covered by NO_EDGE) of edge
 * record `e` with `value`, leaving the remaining high bits untouched.
 *
 * `value` is parenthesized before masking so that an expression argument
 * such as `a | b` is masked as a whole and cannot spill into the flag or
 * letter bits.  The "keep" mask is derived from NO_EDGE instead of being
 * spelled as a separate magic number.
 *
 * Note: `edges` and `e` are each evaluated twice; do not pass expressions
 * with side effects.
 */
#define set_next_edge(edges,e,value)                          \
( (edges)[e] = ((edges)[e] & ~(EDGE_RECORD) NO_EDGE) |        \
               ((value) & NO_EDGE) )
00289 
/* Mark edge record `e` as empty by storing NO_EDGE in it. */
#define set_empty_edge(edges, e) \
  ((edges)[(e)] = NO_EDGE)
00295 
/*
 * Set the first `max_num_edges` records of `dawg` to the empty-edge value.
 * `edge` must be a modifiable integer lvalue; it is used as the loop
 * counter and equals `max_num_edges` when the loop finishes.
 *
 * The loop bound and counter are parenthesized so that expression
 * arguments (e.g. `limit & mask`) are compared as a whole.
 *
 * Caution: the expansion is a complete `for` statement that already ends
 * in `;` (kept for compatibility with existing call sites).  Do not use it
 * as the unbraced body of an `if` that has an `else`.
 */
#define clear_all_edges(dawg,edge,max_num_edges)           \
for  ((edge) = 0; (edge) < (max_num_edges); (edge)++)      \
   set_empty_edge (dawg, edge);
00302 
/* True (1) when edge record `e` holds anything other than NO_EDGE. */
#define edge_occupied(edges, e) \
  (NO_EDGE != (edges)[(e)])
00308 
/* Yield the letter of edge record `e`: the bits from LETTER_START_BIT upward. */
#define edge_letter(edges, e) \
  ((edges)[(e)] >> LETTER_START_BIT)
00314 
/*
 * Nonzero when the LAST_FLAG bit is set in edge record `e`.
 * The result is the masked bit itself, not a normalised 0/1.
 */
#define last_edge(edges, e) \
  ((LAST_FLAG << FLAG_START_BIT) & (edges)[(e)])
00322 
/*
 * Nonzero when the WERD_END_FLAG bit is set in edge record `e`.
 * The result is the masked bit itself, not a normalised 0/1.
 */
#define end_of_word(edges, e) \
  ((WERD_END_FLAG << FLAG_START_BIT) & (edges)[(e)])
00328 
/*
 * 1 when edge record `e` has the DIRECTION_FLAG bit set and is not the
 * empty-edge value NO_EDGE; 0 otherwise.
 */
#define forward_edge(edges, e)                                      \
  ((((edges)[(e)] & (DIRECTION_FLAG << FLAG_START_BIT)) != 0) &&    \
   ((edges)[(e)] != NO_EDGE))
00335 
/*
 * 1 when edge record `e` has the DIRECTION_FLAG bit clear and is not the
 * empty-edge value NO_EDGE; 0 otherwise.
 */
#define backward_edge(edges, e)                                     \
  ((((edges)[(e)] & (DIRECTION_FLAG << FLAG_START_BIT)) == 0) &&    \
   ((edges)[(e)] != NO_EDGE))
00342 
/*
 * Loop header: runs the statement that follows while the edge record just
 * examined does NOT have its LAST_FLAG bit set.  `e` is post-incremented on
 * every test, so it must be a modifiable lvalue; when the loop ends it
 * indexes one past the record that carried LAST_FLAG.
 */
#define edge_loop(edges, e) \
  while (((edges)[(e)++] & (LAST_FLAG << FLAG_START_BIT)) == 0)
00350 
/*
 * Check the capitalisation of word[i] against the characters before it.
 * Yields FALSE when either
 *   - word[i] is upper case and word[i-1] is lower case, or
 *   - word[i] is lower case, word[i-1] is upper case, i > 1 and
 *     word[i-2] is alphabetic;
 * and TRUE otherwise (always TRUE when i is 0).
 *
 * Both parameters are parenthesized so expression arguments such as
 * `ptr + 1` work, and each character is converted to unsigned char before
 * it reaches the <ctype.h> classifiers, whose behaviour is undefined for
 * negative values other than EOF.
 *
 * Note: `word` and `i` are evaluated several times; do not pass
 * expressions with side effects.
 */
#define case_is_okay(word,i)                                          \
((i) ?                                                                \
   ((isupper((unsigned char) (word)[(i)]) &&                          \
     islower((unsigned char) (word)[(i) - 1])) ?                      \
      FALSE :                                                         \
      ((islower((unsigned char) (word)[(i)]) &&                       \
        isupper((unsigned char) (word)[(i) - 1]) &&                   \
        (i) > 1 && isalpha((unsigned char) (word)[(i) - 2])) ?        \
         FALSE :                                                      \
         TRUE)) :                                                     \
   TRUE)
00367 
/*
 * 1 when `ch` is one of the characters  } : ; - ] ! ? ` , . ) " '
 * and 0 otherwise.
 *
 * `ch` is parenthesized in every comparison so that an expression argument
 * such as `c & 0x7f` is compared as a whole.
 *
 * Note: `ch` may be evaluated up to thirteen times; do not pass an
 * expression with side effects.
 */
#define trailing_punc(ch)   \
(((ch) == '}'  ) ||         \
 ((ch) == ':'  ) ||         \
 ((ch) == ';'  ) ||         \
 ((ch) == '-'  ) ||         \
 ((ch) == ']'  ) ||         \
 ((ch) == '!'  ) ||         \
 ((ch) == '?'  ) ||         \
 ((ch) == '`'  ) ||         \
 ((ch) == ','  ) ||         \
 ((ch) == '.'  ) ||         \
 ((ch) == ')'  ) ||         \
 ((ch) == '\"' ) ||         \
 ((ch) == '\'' ))
00385 
/*
 * 1 when `ch` is one of the characters  " ( { [ ` '  and 0 otherwise.
 *
 * `ch` is parenthesized in every comparison so that an expression argument
 * such as `c & 0x7f` is compared as a whole.
 *
 * Note: `ch` may be evaluated up to six times; do not pass an expression
 * with side effects.
 */
#define leading_punc(ch)    \
(((ch) == '\"' ) ||         \
 ((ch) == '('  ) ||         \
 ((ch) == '{'  ) ||         \
 ((ch) == '['  ) ||         \
 ((ch) == '`'  ) ||         \
 ((ch) == '\'' ))
00396 
00397 /*----------------------------------------------------------------------
00398               F u n c t i o n s
00399 ----------------------------------------------------------------------*/
00400 EDGE_REF edge_char_of(EDGE_ARRAY dawg,
00401                       NODE_REF node,
00402                       int character,
00403                       int word_end);
00404 
00405 INT32 edges_in_node(EDGE_ARRAY dawg, NODE_REF node); 
00406 
00407 INT32 letter_is_okay(EDGE_ARRAY dawg,
00408                      NODE_REF *node,
00409                      INT32 char_index,
00410                      char prevchar,
00411                      const char *word,
00412                      INT32 word_end);
00413 
00414 INT32 num_forward_edges(EDGE_ARRAY dawg, NODE_REF node); 
00415 
00416 void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node); 
00417 
00418 void read_squished_dawg(char *filename, EDGE_ARRAY dawg, INT32 max_num_edges); 
00419 
00420 INT32 verify_trailing_punct(EDGE_ARRAY dawg, char *word, INT32 char_index); 
00421 
00422 INT32 word_in_dawg(EDGE_ARRAY dawg, const char *string); 
00423 
00424 /*
00425 #if defined(__STDC__) || defined(__cplusplus) || MAC_OR_DOS
00426 # define _ARGS(s) s
00427 #else
00428 # define _ARGS(s) ()
00429 #endif*/
00430 
00431 /* dawg.c
00432 EDGE_REF edge_char_of
00433   _ARGS((EDGE_ARRAY dawg,
00434   NODE_REF node,
00435   int character,
00436   int word_end));
00437 
00438 INT32 edges_in_node
00439   _ARGS((EDGE_ARRAY dawg,
00440   NODE_REF node));
00441 
00442 INT32 letter_is_okay
00443   _ARGS((EDGE_ARRAY dawg,
00444   NODE_REF *node,
00445   INT32 char_index,
00446   char *word,
00447   INT32 word_end));
00448 
00449 INT32 num_forward_edges
00450   _ARGS((EDGE_ARRAY dawg,
00451   NODE_REF node));
00452 
00453 void print_dawg_node
00454   _ARGS((EDGE_ARRAY dawg,
00455   NODE_REF node));
00456 
00457 void read_squished_dawg
00458   _ARGS((char *filename,
00459   EDGE_ARRAY dawg,
00460   INT32 max_num_edges));
00461 
00462 INT32 verify_trailing_punct
00463   _ARGS((EDGE_ARRAY dawg,
00464   char *word,
00465   INT32 char_index));
00466 
00467 INT32 word_in_dawg
00468   _ARGS((EDGE_ARRAY dawg,
00469   char *string));
00470 
00471 #undef _ARGS
00472 */
00473 #endif

Generated on Wed Feb 28 19:49:10 2007 for Tesseract by  doxygen 1.5.1