#include "dawg.h"
#include "cutil.h"
#include "callcpp.h"
#include "context.h"
#include "strngs.h"
Go to the source code of this file.
EDGE_REF edge_char_of | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node, | |||
int | character, | |||
int | word_end | |||
) |
Return the edge that corresponds to the letter out of this node.
Definition at line 41 of file dawg.cpp.
References case_sensative, edge_letter, edge_loop, edge_occupied, end_of_word, and NO_EDGE.
Referenced by add_word_to_dawg(), and letter_is_okay().
00044 { 00045 EDGE_REF edge = node; 00046 00047 if (! case_sensative) character = tolower (character); 00048 00049 if (edge_occupied (dawg, edge)) { 00050 do { 00051 if ((edge_letter (dawg, edge) == character) && 00052 (! word_end || end_of_word(dawg,edge))) 00053 return (edge); 00054 00055 } edge_loop (dawg, edge); 00056 } 00057 00058 return (NO_EDGE); 00059 }
INT32 edges_in_node | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node | |||
) |
Count the number of edges in this node in the DAWG; includes both forward and back links.
Definition at line 66 of file dawg.cpp.
References backward_edge, edge_loop, and edge_occupied.
Referenced by add_edge_linkage(), add_word_to_dawg(), move_node(), remove_edge_linkage(), and room_in_node().
00066 { 00067 EDGE_REF edge = node; 00068 00069 if (edge_occupied (dawg, edge)) { 00070 edge_loop(dawg, edge); 00071 if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) { 00072 edge_loop(dawg, edge); 00073 return (edge - node); 00074 } 00075 else { 00076 return (edge - node); 00077 } 00078 } 00079 else { 00080 return (edge - node); 00081 } 00082 }
INT32 letter_is_okay | ( | EDGE_ARRAY | dawg, | |
NODE_REF * | node, | |||
INT32 | char_index, | |||
char | prevchar, | |||
const char * | word, | |||
INT32 | word_end | |||
) |
Check this letter in light of the current state.
dawg | Array of edges | |
node | Node ? | |
char_index | ? | |
prevchar | Previous character | |
word | string of word | |
word_end | 0 or 1, reached end of word |
Definition at line 96 of file dawg.cpp.
References case_is_okay, case_sensative, edge_char_of(), FALSE, leading_punc, NO_EDGE, punctuation_ok(), STRING::string(), trailing_punc, TRUE, verify_trailing_punct(), and word_in_dawg().
Referenced by append_next_choice(), and word_in_dawg().
00101 { 00102 EDGE_REF edge; 00103 STRING dummy_word(word); // Auto-deleting string fixes memory leak. 00104 00105 if (*node == NO_EDGE) { /* Trailing punctuation */ 00106 if (trailing_punc (dummy_word [char_index]) 00107 && (!trailing_punc (prevchar) 00108 || punctuation_ok(dummy_word.string())>=0)) 00109 return (TRUE); 00110 else 00111 return (FALSE); 00112 } 00113 else { 00114 /* Leading punctuation */ 00115 if (*node == 0 && 00116 char_index != 0 && 00117 isalpha (dummy_word [char_index]) && 00118 ! leading_punc (dummy_word [char_index-1]) && 00119 dummy_word [char_index-1] != '-') { 00120 return (FALSE); 00121 } 00122 } 00123 /* Handle compund words */ 00124 if (dummy_word [char_index] == '-') { 00125 if (char_index>0 && !word_end 00126 && word [char_index-1] == '-' 00127 && word [char_index+1] == '-') 00128 return FALSE; /*not allowed*/ 00129 dummy_word [char_index] = (char) 0; 00130 if (word_in_dawg (dawg, dummy_word.string())) { 00131 dummy_word [char_index] = '-'; 00132 *node = 0; 00133 return (TRUE); 00134 } 00135 else { 00136 dummy_word [char_index] = '-'; 00137 return (FALSE); 00138 } 00139 } 00140 /* Check the DAWG */ 00141 edge = edge_char_of (dawg, *node, dummy_word [char_index], word_end); 00142 00143 if (edge != NO_EDGE) { /* Normal edge in DAWG */ 00144 if (case_sensative || case_is_okay (dummy_word, char_index)) { 00145 //next_node (dawg, edge); 00146 *node = (dawg)[edge] & NO_EDGE; 00147 return (TRUE); 00148 } 00149 else { 00150 return (FALSE); 00151 } 00152 } 00153 else { 00154 /* Leading punctuation */ 00155 if (leading_punc (word [char_index]) && 00156 (char_index == 0 || leading_punc (dummy_word [char_index-1]))) { 00157 *node = 0; 00158 if (leading_punc (prevchar) || punctuation_ok (word)>=0) 00159 return (TRUE); 00160 else 00161 return FALSE; 00162 } 00163 /* Trailing punctuation */ 00164 if (verify_trailing_punct (dawg, &dummy_word[0], char_index)) { 00165 *node = NO_EDGE; 00166 return (TRUE); 00167 } 00168 00169 return (FALSE); 00170 } 00171 }
INT32 num_forward_edges | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node | |||
) |
Count and return the number of forward edges for this node.
Definition at line 177 of file dawg.cpp.
References edge_loop, and forward_edge.
Referenced by remove_edge_linkage().
00177 { 00178 EDGE_REF edge = node; 00179 INT32 num = 0; 00180 00181 if (forward_edge (dawg, edge)) { 00182 do { 00183 num++; 00184 } edge_loop (dawg, edge); 00185 } 00186 00187 return (num); 00188 }
void print_dawg_node | ( | EDGE_ARRAY | dawg, | |
NODE_REF | node | |||
) |
Print the contents of one of the nodes in the DAWG.
Definition at line 194 of file dawg.cpp.
References backward_edge, cprintf(), direction(), edge_letter, edge_loop, edge_occupied, end_of_word, forward_edge, last_edge, MAX_NODE_EDGES, new_line, and next_node.
Referenced by move_node(), remove_edge_linkage(), and word_in_dawg().
00194 { 00195 EDGE_REF edge = node; 00196 const char *forward_string = "FORWARD"; 00197 const char *backward_string = " "; 00198 00199 const char *last_string = "LAST"; 00200 const char *not_last_string = " "; 00201 00202 const char *eow_string = "EOW"; 00203 const char *not_eow_string = " "; 00204 00205 const char *direction; 00206 const char *is_last; 00207 const char *eow; 00208 00209 char ch; 00210 00211 if (edge_occupied (dawg, edge)) { 00212 do { 00213 if (forward_edge (dawg, edge)) direction = forward_string; 00214 else direction = backward_string; 00215 00216 if (last_edge (dawg, edge)) is_last = last_string; 00217 else is_last = not_last_string; 00218 00219 if (end_of_word (dawg, edge)) eow = eow_string; 00220 else eow = not_eow_string; 00221 00222 ch = edge_letter (dawg, edge); 00223 cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n", 00224 (int) edge, (int) next_node (dawg, edge), ch, 00225 direction, is_last, eow); 00226 00227 if (edge - node > MAX_NODE_EDGES) return; 00228 } edge_loop (dawg, edge); 00229 00230 if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) { 00231 do { 00232 if (forward_edge (dawg, edge)) direction = forward_string; 00233 else direction = backward_string; 00234 00235 if (last_edge (dawg, edge)) is_last = last_string; 00236 else is_last = not_last_string; 00237 00238 if (end_of_word (dawg, edge)) eow = eow_string; 00239 else eow = not_eow_string; 00240 00241 ch = edge_letter (dawg, edge); 00242 cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n", 00243 (int) edge, (int) next_node (dawg, edge), ch, 00244 direction, is_last, eow); 00245 00246 if (edge - node > MAX_NODE_EDGES) return; 00247 } edge_loop (dawg, edge); 00248 } 00249 } 00250 else { 00251 cprintf ("%5d : no edges in this node\n", node); 00252 } 00253 new_line(); 00254 }
void read_squished_dawg | ( | char * | filename, | |
EDGE_ARRAY | dawg, | |||
INT32 | max_num_edges | |||
) |
Read the compressed, binary DAWG from the file 'tessdata/word-dawg'.
First record has 179 edges, the second/last one has 47971 edges
Definition at line 262 of file dawg.cpp.
References clear_all_edges, debug, last_edge, open_file(), print_string, and reverse32().
Referenced by init_permdawg(), and init_permute().
00262 { 00263 FILE *file; 00264 EDGE_REF edge; 00265 INT32 num_edges; 00266 INT32 node_count = 0; 00267 00268 if (debug) print_string ("read_debug"); 00269 00270 clear_all_edges(dawg, edge, max_num_edges); 00271 00272 #ifdef __UNIX__ 00273 file = open_file (filename, "r"); 00274 #else 00275 file = open_file (filename, "rb"); 00276 #endif 00277 fseek(file, 0, SEEK_END); 00278 long fsize = ftell(file); 00279 rewind(file); 00280 fread (&num_edges, sizeof (int), 1, file); 00281 // Auto-detect relative endianness of file and OS as future DAWG 00282 // files may be little-endian. 00283 long diff1 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize; 00284 reverse32(&num_edges); 00285 long diff2 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize; 00286 reverse32(&num_edges); 00287 // One of diff1 and diff2 should now be 0, but find the smallest 00288 // just in case. 00289 if (diff1 < 0) diff1 = -diff1; 00290 if (diff2 < 0) diff2 = -diff2; 00291 bool swap = diff2 < diff1; 00292 if (swap) 00293 reverse32(&num_edges); 00294 fread (&dawg[0], sizeof (EDGE_RECORD), num_edges, file); 00295 fclose(file); 00296 if (swap) 00297 for (edge=0;edge<num_edges;edge++) 00298 reverse32(&dawg[edge]); 00299 00300 for (edge=0; edge<max_num_edges; edge++) 00301 if (last_edge (dawg, edge)) node_count++; 00302 }
INT32 verify_trailing_punct | ( | EDGE_ARRAY | dawg, | |
char * | word, | |||
INT32 | char_index | |||
) |
Make sure that there is a valid transition from the word core to a string of trailing puntuation.
dawg | Array of edges | |
word | The word in question | |
char_index | Which letter in the word |
Definition at line 314 of file dawg.cpp.
References FALSE, leading_punc, trailing_punc, TRUE, and word_in_dawg().
Referenced by letter_is_okay().
00314 { 00315 char last_char; 00316 char *first_char; 00317 00318 if (trailing_punc (word [char_index])) { 00319 00320 last_char = word [char_index]; 00321 word [char_index] = (char) 0; 00322 00323 for (first_char = word; leading_punc (first_char[0]); first_char++); 00324 00325 if (word_in_dawg (dawg, first_char)) { 00326 word [char_index] = last_char; 00327 return (TRUE); 00328 } 00329 word [char_index] = last_char; 00330 } 00331 return (FALSE); 00332 }
INT32 word_in_dawg | ( | EDGE_ARRAY | dawg, | |
const char * | string | |||
) |
Test to see if the word can be found in the DAWG.
Definition at line 338 of file dawg.cpp.
References debug, FALSE, letter_is_okay(), new_line, print_dawg_node(), and TRUE.
Referenced by add_document_word(), adjust_word(), letter_is_okay(), read_word_list(), test_freq_words(), valid_word(), and verify_trailing_punct().
00338 { 00339 NODE_REF node = 0; 00340 INT32 i; 00341 INT32 length; 00342 00343 length=strlen(string); 00344 if (length==0) 00345 return FALSE; 00346 for (i=0; i<length; i++) { 00347 if (debug) { 00348 print_dawg_node(dawg, node); 00349 new_line(); 00350 } 00351 00352 if (! letter_is_okay (dawg, &node, i,'\0', string, (string[i+1]==0))) { 00353 return (FALSE); 00354 } 00355 } 00356 00357 return (TRUE); 00358 }
INT32 case_sensative = 0 |
Always = FALSE/0 in Tesseract 1.02
Definition at line 33 of file dawg.cpp.
Referenced by add_document_word(), add_word_to_dawg(), edge_char_of(), init_permute(), letter_is_okay(), permute_words(), and valid_word().
* (c) Copyright 1987, Hewlett-Packard Company. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License.
Definition at line 32 of file dawg.cpp.
Referenced by add_edge_linkage(), add_new_edge(), add_word_to_dawg(), move_node(), read_squished_dawg(), relocate_edge(), remove_edge_linkage(), and word_in_dawg().