#include <ctype.h>
#include "general.h"
Go to the source code of this file.
#define backward_edge | ( | edges, | |||
e | ) |
Value:
(! ((edges)[e] & (DIRECTION_FLAG << FLAG_START_BIT)) && \ edge_occupied (edges,e))
Definition at line 339 of file dawg.h.
Referenced by add_edge_linkage(), edges_in_node(), move_node(), print_dawg_node(), relocate_edge(), and remove_edge_linkage().
#define case_is_okay | ( | word, | |||
i | ) |
Value:
(i ? \ ((isupper(word[i]) && islower(word[i-1])) ? \ FALSE : \ ((islower(word[i]) && isupper(word[i-1]) && \ i>1 && isalpha (word[i-2])) ? \ FALSE : \ TRUE)) : \ TRUE)
Check the case of this character in the character string to make sure that there is not problem with the case. Checks not only for 'aB' but also '?Ab' where '?' is any alphanumeric character.
Definition at line 358 of file dawg.h.
Referenced by letter_is_okay().
#define clear_all_edges | ( | dawg, | |||
edge, | |||||
max_num_edges | ) |
Value:
for (edge=0; edge<max_num_edges; edge++) \ set_empty_edge (dawg, edge);
Definition at line 299 of file dawg.h.
Referenced by initialize_dawg(), and read_squished_dawg().
#define DIRECTION_FLAG (INT32) 2 |
#define edge_letter | ( | edges, | |||
e | ) | ((edges)[e] >> LETTER_START_BIT) |
The letter choice that corresponds to this edge in the DAWG.
Definition at line 312 of file dawg.h.
Referenced by edge_char_of(), print_dawg_node(), and remove_edge_linkage().
#define edge_loop | ( | edges, | |||
e | ) | while (! last_edge (edges,e++)) |
Loop for each of the edge-spots in the forward direction.
This macro is an iterator: 'while (iterator condition)'
Definition at line 348 of file dawg.h.
Referenced by add_edge_linkage(), edge_char_of(), edges_in_node(), move_node(), num_forward_edges(), print_dawg_node(), and relocate_edge().
#define edge_occupied | ( | edges, | |||
e | ) | ((edges)[e] != NO_EDGE) |
TRUE if this edge-spot is occupied.
Definition at line 306 of file dawg.h.
Referenced by edge_char_of(), edges_in_node(), new_dawg_node(), print_dawg_node(), and room_in_node().
#define end_of_word | ( | edges, | |||
e | ) | ((edges)[e] & (WERD_END_FLAG << FLAG_START_BIT)) |
TRUE if this edge-spot marks end of a word.
Definition at line 326 of file dawg.h.
Referenced by edge_char_of(), print_dawg_node(), and remove_edge_linkage().
#define forward_edge | ( | edges, | |||
e | ) |
Value:
((edges)[e] & (DIRECTION_FLAG << FLAG_START_BIT) && \ edge_occupied (edges,e))
Definition at line 332 of file dawg.h.
Referenced by add_edge_linkage(), move_node(), num_forward_edges(), print_dawg_node(), relocate_edge(), and remove_edge_linkage().
#define last_edge | ( | edges, | |||
e | ) | ((edges)[e] & (LAST_FLAG << FLAG_START_BIT)) |
TRUE if this edge is the last edge in the sequence.
TRUE for last edge in both forward and backward part.
Definition at line 320 of file dawg.h.
Referenced by print_dawg_node(), read_squished_dawg(), and remove_edge_linkage().
#define LAST_FLAG (INT32) 1 |
#define leading_punc | ( | ch | ) |
Value:
((ch == '\"' ) || \ (ch == '(' ) || \ (ch == '{' ) || \ (ch == '[' ) || \ (ch == '`' ) || \ (ch == '\'' ))
Definition at line 389 of file dawg.h.
Referenced by letter_is_okay(), and verify_trailing_punct().
#define LETTER_START_BIT 24 |
#define MAX_NODE_EDGES (INT32) 100 |
FIX:.
Only used in print_dawg_node()
Definition at line 224 of file dawg.h.
Referenced by print_dawg_node().
#define MAX_WERD_LENGTH (INT32) 40 |
Max lenght of a werd to be permuted.
fmg thinks this is greater than the "longest English word" as 'letters'/blobs in a werd may end up getting removed/collapsed in the process of permuting.
Definition at line 217 of file dawg.h.
Referenced by dawg_permute_and_select(), new_dawg_node(), number_permute_and_select(), permute_compound_words(), and permute_words().
#define next_node | ( | edges, | |||
e | ) | ((edges)[e] & NO_EDGE) |
The next node visited in the DAWG by following this edge.
Definition at line 281 of file dawg.h.
Referenced by add_word_to_dawg(), move_node(), print_dawg_node(), relocate_edge(), and remove_edge_linkage().
#define NO_EDGE (INT32) 0x1fffff |
Denotes that edge-spot contains no edge info.
Used in conjunction with class_probability() macro
Referenced by add_word_to_dawg(), edge_char_of(), and letter_is_okay().
#define set_empty_edge | ( | edges, | |||
e | ) | ((edges)[e] = NO_EDGE) |
Initializes edge-spot e to NO_EDGE.
Definition at line 293 of file dawg.h.
Referenced by remove_edge_linkage().
#define set_next_edge | ( | edges, | |||
e, | |||||
value | ) | ( (edges)[e] = ((edges)[e] & (INT32) 0xffe00000) | (value & NO_EDGE) ) |
Set the next node link for this edge in the DAWG.
Definition at line 287 of file dawg.h.
Referenced by relocate_edge().
#define trailing_punc | ( | ch | ) |
Value:
((ch == '}' ) || \ (ch == ':' ) || \ (ch == ';' ) || \ (ch == '-' ) || \ (ch == ']' ) || \ (ch == '!' ) || \ (ch == '?' ) || \ (ch == '`' ) || \ (ch == ',' ) || \ (ch == '.' ) || \ (ch == ')' ) || \ (ch == '\"' ) || \ (ch == '\'' ))
Definition at line 371 of file dawg.h.
Referenced by letter_is_okay(), and verify_trailing_punct().
#define WERD_END_FLAG (INT32) 4 |
EDGE_REF edge_char_of | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node, | |||
int | character, | |||
int | word_end | |||
) |
Return the edge that corresponds to the letter out of this node.
Definition at line 41 of file dawg.cpp.
References case_sensative, edge_letter, edge_loop, edge_occupied, end_of_word, and NO_EDGE.
Referenced by add_word_to_dawg(), and letter_is_okay().
00044 { 00045 EDGE_REF edge = node; 00046 00047 if (! case_sensative) character = tolower (character); 00048 00049 if (edge_occupied (dawg, edge)) { 00050 do { 00051 if ((edge_letter (dawg, edge) == character) && 00052 (! word_end || end_of_word(dawg,edge))) 00053 return (edge); 00054 00055 } edge_loop (dawg, edge); 00056 } 00057 00058 return (NO_EDGE); 00059 }
INT32 edges_in_node | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node | |||
) |
Count the number of edges in this node in the DAWG; includes both forward and back links.
Definition at line 66 of file dawg.cpp.
References backward_edge, edge_loop, and edge_occupied.
Referenced by add_edge_linkage(), add_word_to_dawg(), move_node(), remove_edge_linkage(), and room_in_node().
00066 { 00067 EDGE_REF edge = node; 00068 00069 if (edge_occupied (dawg, edge)) { 00070 edge_loop(dawg, edge); 00071 if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) { 00072 edge_loop(dawg, edge); 00073 return (edge - node); 00074 } 00075 else { 00076 return (edge - node); 00077 } 00078 } 00079 else { 00080 return (edge - node); 00081 } 00082 }
INT32 letter_is_okay | ( | EDGE_ARRAY | dawg, | |
NODE_REF * | node, | |||
INT32 | char_index, | |||
char | prevchar, | |||
const char * | word, | |||
INT32 | word_end | |||
) |
Check this letter in light of the current state.
dawg | Array of edges | |
node | Node ? | |
char_index | ? | |
prevchar | Previous character | |
word | string of word | |
word_end | 0 or 1, reached end of word |
Definition at line 96 of file dawg.cpp.
References case_is_okay, case_sensative, edge_char_of(), FALSE, leading_punc, NO_EDGE, punctuation_ok(), STRING::string(), trailing_punc, TRUE, verify_trailing_punct(), and word_in_dawg().
Referenced by append_next_choice(), and word_in_dawg().
00101 { 00102 EDGE_REF edge; 00103 STRING dummy_word(word); // Auto-deleting string fixes memory leak. 00104 00105 if (*node == NO_EDGE) { /* Trailing punctuation */ 00106 if (trailing_punc (dummy_word [char_index]) 00107 && (!trailing_punc (prevchar) 00108 || punctuation_ok(dummy_word.string())>=0)) 00109 return (TRUE); 00110 else 00111 return (FALSE); 00112 } 00113 else { 00114 /* Leading punctuation */ 00115 if (*node == 0 && 00116 char_index != 0 && 00117 isalpha (dummy_word [char_index]) && 00118 ! leading_punc (dummy_word [char_index-1]) && 00119 dummy_word [char_index-1] != '-') { 00120 return (FALSE); 00121 } 00122 } 00123 /* Handle compund words */ 00124 if (dummy_word [char_index] == '-') { 00125 if (char_index>0 && !word_end 00126 && word [char_index-1] == '-' 00127 && word [char_index+1] == '-') 00128 return FALSE; /*not allowed*/ 00129 dummy_word [char_index] = (char) 0; 00130 if (word_in_dawg (dawg, dummy_word.string())) { 00131 dummy_word [char_index] = '-'; 00132 *node = 0; 00133 return (TRUE); 00134 } 00135 else { 00136 dummy_word [char_index] = '-'; 00137 return (FALSE); 00138 } 00139 } 00140 /* Check the DAWG */ 00141 edge = edge_char_of (dawg, *node, dummy_word [char_index], word_end); 00142 00143 if (edge != NO_EDGE) { /* Normal edge in DAWG */ 00144 if (case_sensative || case_is_okay (dummy_word, char_index)) { 00145 //next_node (dawg, edge); 00146 *node = (dawg)[edge] & NO_EDGE; 00147 return (TRUE); 00148 } 00149 else { 00150 return (FALSE); 00151 } 00152 } 00153 else { 00154 /* Leading punctuation */ 00155 if (leading_punc (word [char_index]) && 00156 (char_index == 0 || leading_punc (dummy_word [char_index-1]))) { 00157 *node = 0; 00158 if (leading_punc (prevchar) || punctuation_ok (word)>=0) 00159 return (TRUE); 00160 else 00161 return FALSE; 00162 } 00163 /* Trailing punctuation */ 00164 if (verify_trailing_punct (dawg, &dummy_word[0], char_index)) { 00165 *node = NO_EDGE; 00166 return (TRUE); 00167 } 00168 00169 return (FALSE); 00170 } 00171 }
INT32 num_forward_edges | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node | |||
) |
Count and return the number of forward edges for this node.
Definition at line 177 of file dawg.cpp.
References edge_loop, and forward_edge.
Referenced by remove_edge_linkage().
00177 { 00178 EDGE_REF edge = node; 00179 INT32 num = 0; 00180 00181 if (forward_edge (dawg, edge)) { 00182 do { 00183 num++; 00184 } edge_loop (dawg, edge); 00185 } 00186 00187 return (num); 00188 }
void print_dawg_node | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node | |||
) |
Print the contents of one of the nodes in the DAWG.
Definition at line 194 of file dawg.cpp.
References backward_edge, cprintf(), direction(), edge_letter, edge_loop, edge_occupied, end_of_word, forward_edge, last_edge, MAX_NODE_EDGES, new_line, and next_node.
Referenced by move_node(), remove_edge_linkage(), and word_in_dawg().
00194 { 00195 EDGE_REF edge = node; 00196 const char *forward_string = "FORWARD"; 00197 const char *backward_string = " "; 00198 00199 const char *last_string = "LAST"; 00200 const char *not_last_string = " "; 00201 00202 const char *eow_string = "EOW"; 00203 const char *not_eow_string = " "; 00204 00205 const char *direction; 00206 const char *is_last; 00207 const char *eow; 00208 00209 char ch; 00210 00211 if (edge_occupied (dawg, edge)) { 00212 do { 00213 if (forward_edge (dawg, edge)) direction = forward_string; 00214 else direction = backward_string; 00215 00216 if (last_edge (dawg, edge)) is_last = last_string; 00217 else is_last = not_last_string; 00218 00219 if (end_of_word (dawg, edge)) eow = eow_string; 00220 else eow = not_eow_string; 00221 00222 ch = edge_letter (dawg, edge); 00223 cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n", 00224 (int) edge, (int) next_node (dawg, edge), ch, 00225 direction, is_last, eow); 00226 00227 if (edge - node > MAX_NODE_EDGES) return; 00228 } edge_loop (dawg, edge); 00229 00230 if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) { 00231 do { 00232 if (forward_edge (dawg, edge)) direction = forward_string; 00233 else direction = backward_string; 00234 00235 if (last_edge (dawg, edge)) is_last = last_string; 00236 else is_last = not_last_string; 00237 00238 if (end_of_word (dawg, edge)) eow = eow_string; 00239 else eow = not_eow_string; 00240 00241 ch = edge_letter (dawg, edge); 00242 cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n", 00243 (int) edge, (int) next_node (dawg, edge), ch, 00244 direction, is_last, eow); 00245 00246 if (edge - node > MAX_NODE_EDGES) return; 00247 } edge_loop (dawg, edge); 00248 } 00249 } 00250 else { 00251 cprintf ("%5d : no edges in this node\n", node); 00252 } 00253 new_line(); 00254 }
void read_squished_dawg | ( | char * | filename, | |
EDGE_ARRAY | dawg, | |||
INT32 | max_num_edges | |||
) |
Read the compressed, binary DAWG from the file 'tessdata/word-dawg'.
First record has 179 edges, the second/last one has 47971 edges
Definition at line 262 of file dawg.cpp.
References clear_all_edges, debug, last_edge, open_file(), print_string, and reverse32().
Referenced by init_permdawg(), and init_permute().
00262 { 00263 FILE *file; 00264 EDGE_REF edge; 00265 INT32 num_edges; 00266 INT32 node_count = 0; 00267 00268 if (debug) print_string ("read_debug"); 00269 00270 clear_all_edges(dawg, edge, max_num_edges); 00271 00272 #ifdef __UNIX__ 00273 file = open_file (filename, "r"); 00274 #else 00275 file = open_file (filename, "rb"); 00276 #endif 00277 fseek(file, 0, SEEK_END); 00278 long fsize = ftell(file); 00279 rewind(file); 00280 fread (&num_edges, sizeof (int), 1, file); 00281 // Auto-detect relative endianness of file and OS as future DAWG 00282 // files may be little-endian. 00283 long diff1 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize; 00284 reverse32(&num_edges); 00285 long diff2 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize; 00286 reverse32(&num_edges); 00287 // One of diff1 and diff2 should now be 0, but find the smallest 00288 // just in case. 00289 if (diff1 < 0) diff1 = -diff1; 00290 if (diff2 < 0) diff2 = -diff2; 00291 bool swap = diff2 < diff1; 00292 if (swap) 00293 reverse32(&num_edges); 00294 fread (&dawg[0], sizeof (EDGE_RECORD), num_edges, file); 00295 fclose(file); 00296 if (swap) 00297 for (edge=0;edge<num_edges;edge++) 00298 reverse32(&dawg[edge]); 00299 00300 for (edge=0; edge<max_num_edges; edge++) 00301 if (last_edge (dawg, edge)) node_count++; 00302 }
INT32 verify_trailing_punct | ( | EDGE_ARRAY | dawg, | |
char * | word, | |||
INT32 | char_index | |||
) |
Make sure that there is a valid transition from the word core to a string of trailing puntuation.
dawg | Array of edges | |
word | The word in question | |
char_index | Which letter in the word |
Definition at line 314 of file dawg.cpp.
References FALSE, leading_punc, trailing_punc, TRUE, and word_in_dawg().
Referenced by letter_is_okay().
00314 { 00315 char last_char; 00316 char *first_char; 00317 00318 if (trailing_punc (word [char_index])) { 00319 00320 last_char = word [char_index]; 00321 word [char_index] = (char) 0; 00322 00323 for (first_char = word; leading_punc (first_char[0]); first_char++); 00324 00325 if (word_in_dawg (dawg, first_char)) { 00326 word [char_index] = last_char; 00327 return (TRUE); 00328 } 00329 word [char_index] = last_char; 00330 } 00331 return (FALSE); 00332 }
INT32 word_in_dawg | ( | EDGE_ARRAY | dawg, | |
const char * | string | |||
) |
Test to see if the word can be found in the DAWG.
Definition at line 338 of file dawg.cpp.
References debug, FALSE, letter_is_okay(), new_line, print_dawg_node(), and TRUE.
Referenced by add_document_word(), adjust_word(), letter_is_okay(), read_word_list(), test_freq_words(), valid_word(), and verify_trailing_punct().
00338 { 00339 NODE_REF node = 0; 00340 INT32 i; 00341 INT32 length; 00342 00343 length=strlen(string); 00344 if (length==0) 00345 return FALSE; 00346 for (i=0; i<length; i++) { 00347 if (debug) { 00348 print_dawg_node(dawg, node); 00349 new_line(); 00350 } 00351 00352 if (! letter_is_okay (dawg, &node, i,'\0', string, (string[i+1]==0))) { 00353 return (FALSE); 00354 } 00355 } 00356 00357 return (TRUE); 00358 }
Always = FALSE/0 in Tesseract 1.02
Definition at line 33 of file dawg.cpp.
Referenced by add_document_word(), add_word_to_dawg(), edge_char_of(), init_permute(), letter_is_okay(), permute_words(), and valid_word().
* (c) Copyright 1987, Hewlett-Packard Company. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License.
Definition at line 32 of file dawg.cpp.
Referenced by add_edge_linkage(), add_new_edge(), add_word_to_dawg(), move_node(), read_squished_dawg(), relocate_edge(), remove_edge_linkage(), and word_in_dawg().