00001
00020
00021
00022
00023 #include "permute.h"
00024 #include "globals.h"
00025 #include "permdawg.h"
00026 #include "debug.h"
00027 #include "tordvars.h"
00028 #include "hyphen.h"
00029 #include "stopper.h"
00030 #include "trie.h"
00031 #include "context.h"
00032 #include "permnum.h"
00033 #include "freelist.h"
00034 #include "callcpp.h"
00035
00036 #include <math.h>
00037
00039 int permutation_count;
00040
00041
00042
00050 #define MAX_NUM_EDGES 60000
00051
00055 #define MAX_DOC_EDGES 250000
00056
00065 #define RESERVED_DOC_EDGES 10000
00066
00071 #define MAX_USER_EDGES 80000
00072
00076 #define USER_RESERVED_EDGES 2000
00077
00079 #define NON_WERD 1.25
00080
00081 #define GARBAGE_STRING 1.5
00082
00083 #define MAX_PERM_LENGTH 128
00084
00085 static EDGE_ARRAY pending_words;
00086 static EDGE_ARRAY document_words;
00087 static EDGE_ARRAY user_words;
00088 static EDGE_ARRAY word_dawg;
00089
00092 make_toggle_var (adjust_debug, 0, make_adjust_debug,
00093 8, 13, set_adjust_debug, "Adjustment Debug");
00094 make_toggle_var (compound_debug, 0, make_compound_debug,
00095 8, 14, set_compound_debug, "Compound Debug");
00096 make_float_var (non_word, NON_WERD, make_non_word,
00097 8, 20, set_non_word, "Non-word adjustment");
00098 make_float_var (garbage, GARBAGE_STRING, make_garbage,
00099 8, 21, set_garbage, "Garbage adjustment");
00100 make_toggle_var (save_doc_words, 0, make_doc_words,
00101 8, 22, set_doc_words, "Save Document Words ");
00102 make_toggle_var (doc_dict_enable, 1, make_doc_dict,
00103 8, 25, set_doc_dict, "Enable Document Dictionary ");
00104
00107 int permute_only_top = 0;
00108
00113 static INT32 bigram_counts[256][3] = {
00114 {
00115 0, 0, 0
00116 },
00117 {
00118 0, 0, 0
00119 },
00120 {
00121 0, 0, 0
00122 },
00123 {
00124 0, 0, 0
00125 },
00126 {
00127 0, 0, 0
00128 },
00129 {
00130 0, 0, 0
00131 },
00132 {
00133 0, 0, 0
00134 },
00135 {
00136 0, 0, 0
00137 },
00138 {
00139 0, 0, 0
00140 },
00141 {
00142 0, 0, 0
00143 },
00144 {
00145 93, 28, 0
00146 },
00147 {
00148 0, 0, 0
00149 },
00150 {
00151 0, 0, 0
00152 },
00153 {
00154 0, 0, 0
00155 },
00156 {
00157 0, 0, 0
00158 },
00159 {
00160 0, 0, 0
00161 },
00162 {
00163 0, 0, 0
00164 },
00165 {
00166 0, 0, 0
00167 },
00168 {
00169 0, 0, 0
00170 },
00171 {
00172 0, 0, 0
00173 },
00174 {
00175 0, 0, 0
00176 },
00177 {
00178 0, 0, 0
00179 },
00180 {
00181 0, 0, 0
00182 },
00183 {
00184 0, 0, 0
00185 },
00186 {
00187 0, 0, 0
00188 },
00189 {
00190 0, 0, 0
00191 },
00192 {
00193 0, 0, 0
00194 },
00195 {
00196 0, 0, 0
00197 },
00198 {
00199 0, 0, 0
00200 },
00201 {
00202 0, 0, 0
00203 },
00204 {
00205 0, 0, 0
00206 },
00207 {
00208 0, 0, 0
00209 },
00210 {
00211 324, 377, 2
00212 },
00213 {
00214 2, 1, 0
00215 },
00216 {
00217 2, 1, 0
00218 },
00219 {
00220 1, 0, 1
00221 },
00222 {
00223 2, 1, 0
00224 },
00225 {
00226 2, 0, 0
00227 },
00228 {
00229 2, 1, 0
00230 },
00231 {
00232 1, 21, 8
00233 },
00234 {
00235 2, 1, 0
00236 },
00237 {
00238 19, 0, 0
00239 },
00240 {
00241 2, 1, 0
00242 },
00243 {
00244 1, 0, 0
00245 },
00246 {
00247 75, 4, 0
00248 },
00249 {
00250 52, 7, 0
00251 },
00252 {
00253 190, 16, 3
00254 },
00255 {
00256 53, 2, 0
00257 },
00258 {
00259 399, 0, 0
00260 },
00261 {
00262 220, 0, 0
00263 },
00264 {
00265 226, 0, 0
00266 },
00267 {
00268 128, 0, 0
00269 },
00270 {
00271 147, 0, 0
00272 },
00273 {
00274 179, 0, 1
00275 },
00276 {
00277 173, 0, 0
00278 },
00279 {
00280 115, 0, 0
00281 },
00282 {
00283 107, 0, 0
00284 },
00285 {
00286 934, 0, 1
00287 },
00288 {
00289 27, 0, 1
00290 },
00291 {
00292 2, 1, 0
00293 },
00294 {
00295 2, 1, 0
00296 },
00297 {
00298 2, 1, 0
00299 },
00300 {
00301 2, 1, 0
00302 },
00303 {
00304 2, 1, 0
00305 },
00306 {
00307 2, 1, 0
00308 },
00309 {
00310 3, 1, 0
00311 },
00312 {
00313 1, 73, 0
00314 },
00315 {
00316 1, 6, 0
00317 },
00318 {
00319 1, 24, 0
00320 },
00321 {
00322 1, 2, 0
00323 },
00324 {
00325 1, 19, 0
00326 },
00327 {
00328 1, 2, 0
00329 },
00330 {
00331 3, 2, 1
00332 },
00333 {
00334 0, 68, 0
00335 },
00336 {
00337 1, 2, 0
00338 },
00339 {
00340 1, 2, 0
00341 },
00342 {
00343 1, 82, 0
00344 },
00345 {
00346 10, 10, 0
00347 },
00348 {
00349 3, 239, 0
00350 },
00351 {
00352 1, 10, 0
00353 },
00354 {
00355 0, 1, 3
00356 },
00357 {
00358 2, 3, 0
00359 },
00360 {
00361 1, 43, 0
00362 },
00363 {
00364 1, 53, 0
00365 },
00366 {
00367 2, 18, 0
00368 },
00369 {
00370 1, 2, 0
00371 },
00372 {
00373 1, 17, 0
00374 },
00375 {
00376 1, 5, 0
00377 },
00378 {
00379 1, 6, 0
00380 },
00381 {
00382 1, 2, 0
00383 },
00384 {
00385 1, 2, 0
00386 },
00387 {
00388 2, 1, 0
00389 },
00390 {
00391 2, 1, 0
00392 },
00393 {
00394 2, 1, 0
00395 },
00396 {
00397 2, 1, 0
00398 },
00399 {
00400 2, 1, 0
00401 },
00402 {
00403 1, 0, 2
00404 },
00405 {
00406 0, 0, 671
00407 },
00408 {
00409 0, 1, 16
00410 },
00411 {
00412 0, 2, 1
00413 },
00414 {
00415 0, 14, 0
00416 },
00417 {
00418 0, 0, 763
00419 },
00420 {
00421 0, 186, 0
00422 },
00423 {
00424 0, 2, 1
00425 },
00426 {
00427 0, 2, 1
00428 },
00429 {
00430 0, 0, 818
00431 },
00432 {
00433 0, 2, 1
00434 },
00435 {
00436 0, 4, 1
00437 },
00438 {
00439 0, 26, 3
00440 },
00441 {
00442 0, 69, 0
00443 },
00444 {
00445 0, 885, 0
00446 },
00447 {
00448 0, 17, 722
00449 },
00450 {
00451 0, 1, 5
00452 },
00453 {
00454 2, 1, 0
00455 },
00456 {
00457 0, 21, 0
00458 },
00459 {
00460 3, 49, 0
00461 },
00462 {
00463 0, 219, 5
00464 },
00465 {
00466 0, 0, 56
00467 },
00468 {
00469 0, 4, 0
00470 },
00471 {
00472 0, 2, 1
00473 },
00474 {
00475 0, 2, 1
00476 },
00477 {
00478 0, 1, 23
00479 },
00480 {
00481 0, 2, 1
00482 },
00483 {
00484 2, 1, 0
00485 },
00486 {
00487 59, 0, 3
00488 },
00489 {
00490 2, 1, 0
00491 },
00492 {
00493 2, 1, 0
00494 },
00495 {
00496 0, 0, 0
00497 },
00498 {
00499 0, 0, 0
00500 },
00501 {
00502 0, 0, 0
00503 },
00504 {
00505 0, 0, 0
00506 },
00507 {
00508 0, 0, 0
00509 },
00510 {
00511 0, 0, 0
00512 },
00513 {
00514 0, 0, 0
00515 },
00516 {
00517 0, 0, 0
00518 },
00519 {
00520 0, 0, 0
00521 },
00522 {
00523 0, 0, 0
00524 },
00525 {
00526 0, 0, 0
00527 },
00528 {
00529 0, 0, 0
00530 },
00531 {
00532 0, 0, 0
00533 },
00534 {
00535 0, 0, 0
00536 },
00537 {
00538 0, 0, 0
00539 },
00540 {
00541 0, 0, 0
00542 },
00543 {
00544 0, 0, 0
00545 },
00546 {
00547 0, 0, 0
00548 },
00549 {
00550 0, 0, 0
00551 },
00552 {
00553 0, 0, 0
00554 },
00555 {
00556 0, 0, 0
00557 },
00558 {
00559 0, 0, 0
00560 },
00561 {
00562 0, 0, 0
00563 },
00564 {
00565 0, 0, 0
00566 },
00567 {
00568 0, 0, 0
00569 },
00570 {
00571 0, 0, 0
00572 },
00573 {
00574 0, 0, 0
00575 },
00576 {
00577 0, 0, 0
00578 },
00579 {
00580 0, 0, 0
00581 },
00582 {
00583 0, 0, 0
00584 },
00585 {
00586 0, 0, 0
00587 },
00588 {
00589 0, 0, 0
00590 },
00591 {
00592 0, 0, 0
00593 },
00594 {
00595 0, 0, 0
00596 },
00597 {
00598 0, 0, 0
00599 },
00600 {
00601 0, 0, 0
00602 },
00603 {
00604 0, 0, 0
00605 },
00606 {
00607 0, 0, 0
00608 },
00609 {
00610 0, 0, 0
00611 },
00612 {
00613 0, 0, 0
00614 },
00615 {
00616 0, 0, 0
00617 },
00618 {
00619 0, 0, 0
00620 },
00621 {
00622 0, 0, 0
00623 },
00624 {
00625 0, 0, 0
00626 },
00627 {
00628 0, 0, 0
00629 },
00630 {
00631 0, 0, 0
00632 },
00633 {
00634 0, 0, 0
00635 },
00636 {
00637 0, 0, 0
00638 },
00639 {
00640 0, 0, 0
00641 },
00642 {
00643 0, 0, 0
00644 },
00645 {
00646 0, 0, 0
00647 },
00648 {
00649 0, 0, 0
00650 },
00651 {
00652 0, 0, 0
00653 },
00654 {
00655 0, 0, 0
00656 },
00657 {
00658 0, 0, 0
00659 },
00660 {
00661 0, 0, 0
00662 },
00663 {
00664 0, 0, 0
00665 },
00666 {
00667 0, 0, 0
00668 },
00669 {
00670 0, 0, 0
00671 },
00672 {
00673 0, 0, 0
00674 },
00675 {
00676 0, 0, 0
00677 },
00678 {
00679 0, 0, 0
00680 },
00681 {
00682 0, 0, 0
00683 },
00684 {
00685 0, 0, 0
00686 },
00687 {
00688 0, 0, 0
00689 },
00690 {
00691 0, 0, 0
00692 },
00693 {
00694 0, 0, 0
00695 },
00696 {
00697 0, 0, 0
00698 },
00699 {
00700 0, 0, 0
00701 },
00702 {
00703 0, 0, 0
00704 },
00705 {
00706 0, 0, 0
00707 },
00708 {
00709 0, 0, 0
00710 },
00711 {
00712 0, 0, 0
00713 },
00714 {
00715 0, 0, 0
00716 },
00717 {
00718 0, 0, 0
00719 },
00720 {
00721 0, 0, 0
00722 },
00723 {
00724 0, 0, 0
00725 },
00726 {
00727 0, 0, 0
00728 },
00729 {
00730 0, 0, 0
00731 },
00732 {
00733 0, 0, 0
00734 },
00735 {
00736 0, 0, 0
00737 },
00738 {
00739 0, 0, 0
00740 },
00741 {
00742 0, 0, 0
00743 },
00744 {
00745 0, 0, 0
00746 },
00747 {
00748 0, 0, 0
00749 },
00750 {
00751 0, 0, 0
00752 },
00753 {
00754 0, 0, 0
00755 },
00756 {
00757 0, 0, 0
00758 },
00759 {
00760 0, 0, 0
00761 },
00762 {
00763 0, 0, 0
00764 },
00765 {
00766 0, 0, 0
00767 },
00768 {
00769 0, 0, 0
00770 },
00771 {
00772 0, 0, 0
00773 },
00774 {
00775 0, 0, 0
00776 },
00777 {
00778 0, 0, 0
00779 },
00780 {
00781 0, 0, 0
00782 },
00783 {
00784 0, 0, 0
00785 },
00786 {
00787 0, 0, 0
00788 },
00789 {
00790 0, 0, 0
00791 },
00792 {
00793 0, 0, 0
00794 },
00795 {
00796 0, 0, 0
00797 },
00798 {
00799 0, 0, 0
00800 },
00801 {
00802 0, 0, 0
00803 },
00804 {
00805 0, 0, 0
00806 },
00807 {
00808 0, 0, 0
00809 },
00810 {
00811 0, 0, 0
00812 },
00813 {
00814 0, 0, 0
00815 },
00816 {
00817 0, 0, 0
00818 },
00819 {
00820 0, 0, 0
00821 },
00822 {
00823 0, 0, 0
00824 },
00825 {
00826 0, 0, 0
00827 },
00828 {
00829 0, 0, 0
00830 },
00831 {
00832 0, 0, 0
00833 },
00834 {
00835 0, 0, 0
00836 },
00837 {
00838 0, 0, 0
00839 },
00840 {
00841 0, 0, 0
00842 },
00843 {
00844 0, 0, 0
00845 },
00846 {
00847 0, 0, 0
00848 },
00849 {
00850 0, 0, 0
00851 },
00852 {
00853 0, 0, 0
00854 },
00855 {
00856 0, 0, 0
00857 },
00858 {
00859 0, 0, 0
00860 },
00861 {
00862 0, 0, 0
00863 },
00864 {
00865 0, 0, 0
00866 },
00867 {
00868 0, 0, 0
00869 },
00870 {
00871 0, 0, 0
00872 },
00873 {
00874 0, 0, 0
00875 },
00876 {
00877 0, 0, 0
00878 },
00879 {
00880 0, 0, 0
00881 },
00882 };
00883
00884
00885
/* Constants used by good_choice() when similarity_enable is set. */

/* Multiplier applied to a choice's rating when deriving a certainty. */
#define SIM_CERTAINTY_SCALE   (-10.0)

/* Constant term added to the scaled rating. */
#define SIM_CERTAINTY_OFFSET  (-10.0)

/* Choices whose (rating + 1) * certainty exceeds this are rejected. */
#define SIMILARITY_FLOOR      (100.0)
00892
00893
00894
00895
00899 int good_choice(A_CHOICE *choice) {
00900 register float certainty;
00901 if (choice == NULL)
00902 return (FALSE);
00903 if (similarity_enable) {
00904 if ((class_probability (choice) + 1) * class_certainty (choice) >
00905 SIMILARITY_FLOOR)
00906 return (FALSE);
00907 certainty =
00908 SIM_CERTAINTY_OFFSET +
00909 class_probability (choice) * SIM_CERTAINTY_SCALE;
00910 }
00911
00912 else {
00913 certainty = class_certainty (choice);
00914 }
00915 if (certainty > certainty_threshold) {
00916 return (TRUE);
00917 }
00918
00919 else {
00920 return (FALSE);
00921 }
00922 }
00923
00924
00928 void add_document_word(A_CHOICE *best_choice) {
00929 char filename[CHARS_PER_LINE];
00930 FILE *doc_word_file;
00931 char *string;
00932 int stringlen;
00933
00934 string = class_string (best_choice);
00935 stringlen = strlen (string);
00936
00937 if (!doc_dict_enable
00938 || valid_word (string) || CurrentWordAmbig () || stringlen < 2)
00939 return;
00940
00941 if (!good_choice (best_choice) || stringlen == 2) {
00942 if (class_certainty (best_choice) < permuter_pending_threshold)
00943 return;
00944 if (!word_in_dawg (pending_words, string)) {
00945 if (stringlen > 2 || isupper (string[0]) && isupper (string[1]))
00946 add_word_to_dawg(pending_words,
00947 string,
00948 MAX_DOC_EDGES,
00949 RESERVED_DOC_EDGES);
00950 return;
00951 }
00952 }
00953
00954 if (save_doc_words) {
00955 strcpy(filename, imagefile);
00956 strcat (filename, ".doc");
00957 doc_word_file = open_file (filename, "a");
00958 fprintf (doc_word_file, "%s\n", string);
00959 fclose(doc_word_file);
00960 }
00961 add_word_to_dawg(document_words, string, MAX_DOC_EDGES, RESERVED_DOC_EDGES);
00962 case_sensative = FALSE;
00963 }
00964
00965
00970 void
00971 adjust_non_word (A_CHOICE * best_choice, float certainties[]) {
00972 char *this_word;
00973 float adjust_factor;
00974
00975 if (adjust_debug)
00976 cprintf ("%s %4.2f ",
00977 class_string (best_choice), class_probability (best_choice));
00978
00979 this_word = class_string (best_choice);
00980
00981 class_probability (best_choice) += RATING_PAD;
00982 if (case_ok (this_word) && punctuation_ok (this_word) != -1) {
00983 class_probability (best_choice) *= non_word;
00984 adjust_factor = non_word;
00985 if (adjust_debug)
00986 cprintf (", %4.2f ", non_word);
00987 }
00988 else {
00989 class_probability (best_choice) *= garbage;
00990 adjust_factor = garbage;
00991 if (adjust_debug) {
00992 if (!case_ok (this_word))
00993 cprintf (", C");
00994 if (punctuation_ok (this_word) == -1)
00995 cprintf (", P");
00996 cprintf (", %4.2f ", garbage);
00997 }
00998 }
00999
01000 class_probability (best_choice) -= RATING_PAD;
01001
01002 LogNewWordChoice(best_choice, adjust_factor, certainties);
01003
01004 if (adjust_debug)
01005 cprintf (" --> %4.2f\n", class_probability (best_choice));
01006 }
01007
01008
01016 void init_permute() {
01017 char name[1024];
01018 make_adjust_debug();
01019 make_compound_debug();
01020 make_non_word();
01021 make_garbage();
01022 make_doc_words();
01023 make_doc_dict();
01024
01025 init_permdawg();
01026 init_permnum();
01027
01028 #ifdef TEXT_VERBOSE
01029
01030 cprintf("g");
01031 #endif
01032
01033 word_dawg = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_NUM_EDGES);
01034 strcpy(name, demodir);
01035 strcat (name, "tessdata/word-dawg");
01036 read_squished_dawg(name, word_dawg, MAX_NUM_EDGES);
01037
01038 document_words =
01039 (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
01040 initialize_dawg(document_words, MAX_DOC_EDGES);
01041
01042 pending_words =
01043 (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
01044 initialize_dawg(pending_words, MAX_DOC_EDGES);
01045
01046 user_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_USER_EDGES);
01047 strcpy(name, demodir);
01048 strcat (name, "tessdata/user-words");
01049 read_word_list(name, user_words, MAX_USER_EDGES, USER_RESERVED_EDGES);
01050 case_sensative = FALSE;
01051 }
01052
01056 void end_permute() {
01057 memfree(word_dawg);
01058 word_dawg = NULL;
01059 memfree(document_words);
01060 document_words = NULL;
01061 memfree(pending_words);
01062 pending_words = NULL;
01063 memfree(user_words);
01064 user_words = NULL;
01065 }
01066
01073 A_CHOICE *permute_all(CHOICES_LIST char_choices,
01074 float rating_limit,
01075 A_CHOICE *raw_choice) {
01076 A_CHOICE *result_1;
01077 A_CHOICE *result_2 = NULL;
01078 BOOL8 any_alpha;
01079
01080 result_1 = permute_top_choice (char_choices, rating_limit, raw_choice,
01081 &any_alpha);
01082 if (result_1 == NULL)
01083 return (NULL);
01084 if (permute_only_top)
01085 return result_1;
01086 if (any_alpha && array_count (char_choices) <= 20) {
01087 result_2 = permute_words (char_choices, rating_limit);
01088
01089 if (class_probability (result_1) < class_probability (result_2)
01090 || class_string (result_2) == NULL) {
01091 free_choice(result_2);
01092 }
01093 else {
01094 free_choice(result_1);
01095 result_1 = result_2;
01096 }
01097 }
01098
01099 result_2 = number_permute_and_select (char_choices, rating_limit);
01100
01101 if (class_probability (result_1) < class_probability (result_2)
01102 || class_string (result_2) == NULL) {
01103 free_choice(result_2);
01104 }
01105 else {
01106 free_choice(result_1);
01107 result_1 = result_2;
01108 }
01109
01110 result_2 = permute_compound_words (char_choices, rating_limit);
01111
01112 if (!result_2 ||
01113 class_probability (result_1) < class_probability (result_2)
01114 || class_string (result_2) == NULL) {
01115 free_choice(result_2);
01116 }
01117 else {
01118 free_choice(result_1);
01119 result_1 = result_2;
01120 }
01121
01122 return (result_1);
01123 }
01124
01125
01133 void permute_characters(CHOICES_LIST char_choices,
01134 float limit,
01135 A_CHOICE *best_choice,
01136 A_CHOICE *raw_choice) {
01137 A_CHOICE *this_choice;
01138
01139 permutation_count++;
01140
01141 this_choice = permute_all (char_choices, limit, raw_choice);
01142
01143 if (this_choice &&
01144 class_probability (this_choice) < class_probability (best_choice)) {
01145 clone_choice(best_choice, this_choice);
01146 }
01147 free_choice(this_choice);
01148
01149 if (display_ratings)
01150 cprintf ("permute_characters: %-15s %4.2f %4.2f\n",
01151 class_string (best_choice),
01152 class_probability (best_choice), class_certainty (best_choice));
01153 }
01154
01155
01159 A_CHOICE *permute_compound_words(CHOICES_LIST character_choices,
01160 float rating_limit) {
01161 A_CHOICE *first_choice;
01162 A_CHOICE *best_choice = NULL;
01163 char word[MAX_WERD_LENGTH + 1];
01164 float rating = 0;
01165 float certainty = 10000;
01166 char char_choice;
01167 int x;
01168 int first_index = 0;
01169 char *ptr;
01170
01171 word[0] = '\0';
01172
01173 if (array_count (character_choices) > MAX_WERD_LENGTH) {
01174 return (new_choice (NULL, MAX_FLOAT32, -MAX_FLOAT32, -1, NO_PERM));
01175 }
01176
01177 array_loop(character_choices, x) {
01178
01179 first_choice =
01180 (A_CHOICE *) first ((CHOICES) array_value (character_choices, x));
01181
01182 ptr = class_string (first_choice);
01183 char_choice = ptr != NULL ? *ptr : '\0';
01184 if (x > first_index && (char_choice == '-' || char_choice == '/')) {
01185 if (compound_debug)
01186 cprintf ("Hyphenated word found\n");
01187
01188 permute_subword (character_choices, rating_limit,
01189 first_index, x - 1, word, &rating, &certainty);
01190
01191 if (rating > rating_limit)
01192 break;
01193 first_index = x + 1;
01194 strcat (word, class_string (first_choice));
01195 rating += class_probability (first_choice);
01196 certainty = min (class_certainty (first_choice), certainty);
01197 }
01198 }
01199
01200 if (first_index > 0 && first_index < x && rating <= rating_limit) {
01201 permute_subword (character_choices, rating_limit,
01202 first_index, x - 1, word, &rating, &certainty);
01203
01204 best_choice = new_choice (word, rating, certainty, -1, COMPOUND_PERM);
01205 }
01206 return (best_choice);
01207 }
01208
01209
01230 void permute_subword(CHOICES_LIST character_choices,
01231 float rating_limit,
01232 int start,
01233 int end,
01234 char *word,
01235 float *rating,
01236 float *certainty) {
01237 int x;
01238 A_CHOICE *best_choice = NULL;
01239 A_CHOICE raw_choice;
01240 CHOICES_LIST subchoices;
01241 CHOICES choices;
01242 char this_char;
01243 char *ptr;
01244
01245 DisableChoiceAccum();
01246 raw_choice.string = NULL;
01247 raw_choice.rating = MAX_INT16;
01248 raw_choice.certainty = -MAX_INT16;
01249
01250 subchoices = new_choice_list ();
01251 for (x = start; x <= end; x++) {
01252 choices = (CHOICES) array_value (character_choices, x);
01253 ptr = best_string (choices);
01254 this_char = ptr != NULL ? *ptr : '\0';
01255 if (this_char != '-' && this_char != '/') {
01256 subchoices = array_push (subchoices, choices);
01257 } else {
01258 const char* str = best_string(choices);
01259 strcat (word, str);
01260 }
01261 }
01262
01263 if (array_count (subchoices)) {
01264 if (compound_debug)
01265 dawg_debug = TRUE;
01266 best_choice = permute_all (subchoices, rating_limit, &raw_choice);
01267 if (compound_debug)
01268 dawg_debug = FALSE;
01269
01270 if (best_choice && class_string (best_choice)) {
01271 strcat (word, class_string (best_choice));
01272 *rating += class_probability (best_choice);
01273 *certainty = min (class_certainty (best_choice), *certainty);
01274 }
01275 else {
01276 *rating = MAX_FLOAT32;
01277 }
01278 }
01279 else {
01280 *rating = MAX_FLOAT32;
01281 }
01282
01283 free_choice_list(subchoices);
01284 if (best_choice)
01285 free_choice(best_choice);
01286
01287 if (compound_debug && *rating < MAX_FLOAT32) {
01288 cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
01289 word, *rating, *certainty);
01290 }
01291 if (raw_choice.string)
01292 strfree(raw_choice.string);
01293
01294
01295
01296 EnableChoiceAccum();
01297 }
01298
01299
01313 A_CHOICE *permute_top_choice(CHOICES_LIST character_choices,
01314 float rating_limit,
01315 A_CHOICE *raw_choice,
01316 BOOL8 *any_alpha) {
01317 CHOICES char_list;
01318 A_CHOICE *first_choice;
01319 A_CHOICE *best_choice;
01320 A_CHOICE *other_choice;
01321 char *ptr;
01322 char first_char;
01323 char second_char;
01324 char third_char;
01325 char prev_char = '\0';
01326 char next_char = '\0';
01327 char next_next_char = '\0';
01328
01329 char word[MAX_PERM_LENGTH + 1];
01330 char capital_word[MAX_PERM_LENGTH + 1];
01331 char lower_word[MAX_PERM_LENGTH + 1];
01332 int x;
01333 BOOL8 char_alpha;
01334
01335 float rating = 0;
01336 float upper_rating = 0;
01337 float lower_rating = 0;
01338 float first_rating = 0;
01339
01340 float certainty = 10000;
01341 float upper_certainty = 10000;
01342 float lower_certainty = 10000;
01343
01344 float certainties[MAX_PERM_LENGTH + 1];
01345 float lower_certainties[MAX_PERM_LENGTH + 1];
01346 float upper_certainties[MAX_PERM_LENGTH + 1];
01347
01348 register CHOICES this_char;
01349 register char ch;
01350 register INT8 lower_done;
01351 register INT8 upper_done;
01352
01353 if (any_alpha != NULL)
01354 *any_alpha = FALSE;
01355
01356 if (array_count (character_choices) > MAX_PERM_LENGTH) {
01357 return (NULL);
01358 }
01359
01360 array_loop(character_choices, x) {
01361 if (x + 1 < array_count (character_choices)) {
01362 char_list = (CHOICES) array_value (character_choices, x + 1);
01363 first_choice = (A_CHOICE *) first (char_list);
01364
01365 ptr = class_string (first_choice);
01366 next_char = (ptr != NULL && *ptr != '\0') ? *ptr : ' ';
01367 }
01368 else
01369 next_char = '\0';
01370 if (x + 2 < array_count (character_choices)) {
01371 char_list = (CHOICES) array_value (character_choices, x + 2);
01372 first_choice = (A_CHOICE *) first (char_list);
01373
01374 ptr = class_string (first_choice);
01375 next_next_char = (ptr != NULL && *ptr != '\0') ? *ptr : ' ';
01376 }
01377 else
01378 next_next_char = '\0';
01379
01380 char_list = (CHOICES) array_value (character_choices, x);
01381 first_choice = (A_CHOICE *) first (char_list);
01382
01383 ptr = class_string (first_choice);
01384 word[x] = (ptr != NULL && *ptr != '\0') ? *ptr : ' ';
01385
01386 lower_word[x] = word[x];
01387 capital_word[x] = word[x];
01388 first_char = word[x];
01389 first_rating = class_probability (first_choice);
01390 upper_rating += class_probability (first_choice);
01391 lower_rating += class_probability (first_choice);
01392 lower_certainty = min (class_certainty (first_choice), lower_certainty);
01393 upper_certainty = min (class_certainty (first_choice), upper_certainty);
01394
01395 certainties[x] = class_certainty (first_choice);
01396 lower_certainties[x] = class_certainty (first_choice);
01397 upper_certainties[x] = class_certainty (first_choice);
01398
01399 lower_done = FALSE;
01400 upper_done = FALSE;
01401 char_alpha = FALSE;
01402 second_char = '\0';
01403 third_char = '\0';
01404 iterate_list(this_char, char_list) {
01405 ptr = best_string (this_char);
01406 ch = ptr != NULL ? *ptr : '\0';
01407 if (ch == 'l' && rest (this_char) != NULL
01408 && best_probability (rest (this_char)) == first_rating) {
01409 ptr = best_string (rest (this_char));
01410 if (ptr != NULL && (*ptr == '1' || *ptr == 'I')) {
01411 second_char = *ptr;
01412 this_char = rest (this_char);
01413 if (rest (this_char) != NULL
01414 && best_probability (rest (this_char)) == first_rating) {
01415 ptr = best_string (rest (this_char));
01416 if (ptr != NULL && (*ptr == '1' || *ptr == 'I')) {
01417 third_char = *ptr;
01418 this_char = rest (this_char);
01419 }
01420 }
01421 ch = choose_il1 (first_char, second_char, third_char,
01422 prev_char, next_char, next_next_char);
01423 if (ch != 'l' && word[x] == 'l') {
01424 word[x] = ch;
01425 lower_word[x] = ch;
01426 capital_word[x] = ch;
01427 }
01428 }
01429 }
01430
01431 if (!lower_done && (islower (ch) || (isupper (ch) && x == 0))) {
01432 lower_word[x] = ch;
01433 lower_rating += best_probability (this_char);
01434 lower_rating -= class_probability (first_choice);
01435 lower_certainty = min (best_certainty (this_char), lower_certainty);
01436 lower_certainties[x] = best_certainty (this_char);
01437 lower_done = TRUE;
01438 }
01439
01440 if (!upper_done && isupper (ch)) {
01441 capital_word[x] = ch;
01442 upper_rating += best_probability (this_char);
01443 upper_rating -= class_probability (first_choice);
01444 upper_certainty = min (best_certainty (this_char), upper_certainty);
01445 upper_certainties[x] = best_certainty (this_char);
01446 upper_done = TRUE;
01447 }
01448 if (!char_alpha && isalpha (ch))
01449 char_alpha = TRUE;
01450 if (lower_done && upper_done)
01451 break;
01452 }
01453 if (char_alpha && any_alpha != NULL)
01454 *any_alpha = TRUE;
01455
01456 if (first_choice == NULL) {
01457 cprintf ("Permuter giving up due to null choices list");
01458 word[x + 1] = '$';
01459 word[x + 2] = '\0';
01460 cprintf (" word=%s\n", word);
01461 return (NULL);
01462 }
01463
01464 rating += class_probability (first_choice);
01465 if (rating > rating_limit)
01466 return (NULL);
01467
01468 certainty = min (class_certainty (first_choice), certainty);
01469 prev_char = word[x];
01470 }
01471
01472 lower_word[x] = '\0';
01473 capital_word[x] = '\0';
01474 word[x] = '\0';
01475
01476 if (rating < class_probability (raw_choice)) {
01477 if (class_string (raw_choice))
01478 strfree (class_string (raw_choice));
01479
01480 class_probability (raw_choice) = rating;
01481 class_certainty (raw_choice) = certainty;
01482 class_string (raw_choice) = strsave (word);
01483 class_permuter (raw_choice) = TOP_CHOICE_PERM;
01484
01485 LogNewRawChoice (raw_choice, 1.0, certainties);
01486 }
01487
01488 best_choice = new_choice (word, rating, certainty, -1, TOP_CHOICE_PERM);
01489 adjust_non_word(best_choice, certainties);
01490
01491 other_choice = new_choice (lower_word, lower_rating, lower_certainty,
01492 -1, LOWER_CASE_PERM);
01493 adjust_non_word(other_choice, lower_certainties);
01494 if (class_probability (best_choice) > class_probability (other_choice)) {
01495 clone_choice(best_choice, other_choice);
01496 }
01497 free_choice(other_choice);
01498
01499 other_choice = new_choice (capital_word, upper_rating, upper_certainty,
01500 -1, UPPER_CASE_PERM);
01501 adjust_non_word(other_choice, upper_certainties);
01502 if (class_probability (best_choice) > class_probability (other_choice)) {
01503 clone_choice(best_choice, other_choice);
01504 }
01505 free_choice(other_choice);
01506
01507 return (best_choice);
01508 }
01509
01510
01522 char choose_il1(char first_char,
01523 char second_char,
01524 char third_char,
01525 char prev_char,
01526 char next_char,
01527 char next_next_char) {
01528 INT32 type1;
01529 INT32 type2;
01530 INT32 type3;
01531
01532 if (first_char == 'l' && second_char != '\0') {
01533 if (second_char == 'I'
01534 && (isupper (prev_char) && !islower (next_char)
01535 && !isdigit (next_char) || isupper (next_char)
01536 && !islower (prev_char) && !isdigit (prev_char)))
01537 first_char = second_char;
01538 else if (second_char == '1' || third_char == '1') {
01539 if (isdigit (next_char) || isdigit (prev_char)
01540 || next_char == 'l' && isdigit (next_next_char)) {
01541 first_char = '1';
01542 }
01543 else if (!islower (prev_char)
01544 && (!islower (next_char) || next_char == 's'
01545 && next_next_char == 't')) {
01546 if ((prev_char != '\'' && prev_char != '`' || next_char != '\0')
01547 && (next_char != '\'' && next_char != '`'
01548 || prev_char != '\0')) {
01549 first_char = '1';
01550 }
01551 }
01552 }
01553 if (first_char == 'l' && next_char != '\0' && !isalpha (prev_char)) {
01554 type1 = 2;
01555
01556 if (second_char == '1')
01557 type2 = 0;
01558 else if (second_char == 'I')
01559 type2 = 1;
01560 else if (second_char == 'l')
01561 type2 = 2;
01562 else
01563 type2 = type1;
01564
01565 if (third_char == '1')
01566 type3 = 0;
01567 else if (third_char == 'I')
01568 type3 = 1;
01569 else if (third_char == 'l')
01570 type3 = 2;
01571 else
01572 type3 = type1;
01573
01574 if (bigram_counts[next_char][type2] >
01575 bigram_counts[next_char][type1]) {
01576 first_char = second_char;
01577 type1 = type2;
01578 }
01579 if (bigram_counts[next_char][type3] >
01580 bigram_counts[next_char][type1]) {
01581 first_char = third_char;
01582 }
01583 }
01584 }
01585 return first_char;
01586 }
01587
01588
01593 A_CHOICE *permute_words(CHOICES_LIST char_choices, float rating_limit) {
01594 A_CHOICE *best_choice;
01595 int hyphen_len;
01596
01597 best_choice = new_choice (NULL, rating_limit, -MAX_FLOAT32, -1, NO_PERM);
01598
01599 hyphen_len = hyphen_string != NULL ? strlen (hyphen_string) : 0;
01600 if (hyphen_len + array_count (char_choices) > MAX_WERD_LENGTH) {
01601 class_probability (best_choice) = MAX_FLOAT32;
01602 }
01603 else {
01604
01605 dawg_permute_and_select ("system words:", word_dawg, SYSTEM_DAWG_PERM,
01606 char_choices, best_choice, TRUE);
01607
01608 dawg_permute_and_select ("document_words", document_words,
01609 DOC_DAWG_PERM, char_choices, best_choice,
01610 FALSE);
01611
01612 dawg_permute_and_select ("user words", user_words, USER_DAWG_PERM,
01613 char_choices, best_choice, FALSE);
01614 case_sensative = FALSE;
01615 }
01616
01617 return (best_choice);
01618 }
01619
01620
01624 int valid_word(const char *string) {
01625 int result = NO_PERM;
01626
01627 if (word_in_dawg (word_dawg, string))
01628 result = SYSTEM_DAWG_PERM;
01629 else {
01630 if (word_in_dawg (document_words, string))
01631 result = DOC_DAWG_PERM;
01632 else if (word_in_dawg (user_words, string))
01633 result = USER_DAWG_PERM;
01634 case_sensative = FALSE;
01635 }
01636 return (result);
01637 }