dict/permute.cpp

Go to the documentation of this file.
00001 
00020 /*----------------------------------------------------------------------
00021             I n c l u d e s
00022 ---------------------------------------------------------------------*/
00023 #include "permute.h"
00024 #include "globals.h"
00025 #include "permdawg.h"
00026 #include "debug.h"
00027 #include "tordvars.h"
00028 #include "hyphen.h"
00029 #include "stopper.h"
00030 #include "trie.h"
00031 #include "context.h"
00032 #include "permnum.h"
00033 #include "freelist.h"
00034 #include "callcpp.h"
00035 
00036 #include <math.h>
00037 
/** Global counter, incremented once per call to permute_characters(). */
int permutation_count;

/*----------------------------------------------------------------------
              V a r i a b l e s
----------------------------------------------------------------------*/

/** Number of edge records allocated for the main word dawg. */
#define MAX_NUM_EDGES        60000

/** Number of edge records allocated for each of the document-word and
    pending-word dawgs. */
#define MAX_DOC_EDGES        250000

/** Reserved-edge argument passed together with MAX_DOC_EDGES to
    add_word_to_dawg(). */
#define RESERVED_DOC_EDGES   10000

/** Number of edge records allocated for the user-word dawg. */
#define MAX_USER_EDGES       80000

/** Reserved-edge argument passed to read_word_list() for the user words. */
#define USER_RESERVED_EDGES  2000

/** Default value of the "non_word" adjustment variable. */
#define NON_WERD             1.25

/** Default value of the "garbage" adjustment variable. */
#define GARBAGE_STRING       1.5

/** NOTE(review): not referenced in the visible part of this file. */
#define MAX_PERM_LENGTH      128
00084 
00085 static EDGE_ARRAY pending_words;
00086 static EDGE_ARRAY document_words;
00087 static EDGE_ARRAY user_words;
00088 static EDGE_ARRAY word_dawg;
00089 
00092 make_toggle_var (adjust_debug, 0, make_adjust_debug,
00093 8, 13, set_adjust_debug, "Adjustment Debug");
00094 make_toggle_var (compound_debug, 0, make_compound_debug,
00095 8, 14, set_compound_debug, "Compound Debug");
00096 make_float_var (non_word, NON_WERD, make_non_word,
00097 8, 20, set_non_word, "Non-word adjustment");
00098 make_float_var (garbage, GARBAGE_STRING, make_garbage,
00099 8, 21, set_garbage, "Garbage adjustment");
00100 make_toggle_var (save_doc_words, 0, make_doc_words,
00101 8, 22, set_doc_words, "Save Document Words ");
00102 make_toggle_var (doc_dict_enable, 1, make_doc_dict,
00103 8, 25, set_doc_dict, "Enable Document Dictionary ");
00104 /* PREV DEFAULT 0 */
00107 int permute_only_top = 0;
00108 
00113 static INT32 bigram_counts[256][3] = {
00114   {                              //0x0=.
00115     0, 0, 0
00116   },
00117   {                              //0x1=.
00118     0, 0, 0
00119   },
00120   {                              //0x2=.
00121     0, 0, 0
00122   },
00123   {                              //0x3=.
00124     0, 0, 0
00125   },
00126   {                              //0x4=.
00127     0, 0, 0
00128   },
00129   {                              //0x5=.
00130     0, 0, 0
00131   },
00132   {                              //0x6=.
00133     0, 0, 0
00134   },
00135   {                              //0x7=.
00136     0, 0, 0
00137   },
00138   {                              //0x8=.
00139     0, 0, 0
00140   },
00141   {                              //0x9=.
00142     0, 0, 0
00143   },
00144   {                              //0xa=.
00145     93, 28, 0
00146   },
00147   {                              //0xb=.
00148     0, 0, 0
00149   },
00150   {                              //0xc=.
00151     0, 0, 0
00152   },
00153   {                              //0xd=.
00154     0, 0, 0
00155   },
00156   {                              //0xe=.
00157     0, 0, 0
00158   },
00159   {                              //0xf=.
00160     0, 0, 0
00161   },
00162   {                              //0x10=.
00163     0, 0, 0
00164   },
00165   {                              //0x11=.
00166     0, 0, 0
00167   },
00168   {                              //0x12=.
00169     0, 0, 0
00170   },
00171   {                              //0x13=.
00172     0, 0, 0
00173   },
00174   {                              //0x14=.
00175     0, 0, 0
00176   },
00177   {                              //0x15=.
00178     0, 0, 0
00179   },
00180   {                              //0x16=.
00181     0, 0, 0
00182   },
00183   {                              //0x17=.
00184     0, 0, 0
00185   },
00186   {                              //0x18=.
00187     0, 0, 0
00188   },
00189   {                              //0x19=.
00190     0, 0, 0
00191   },
00192   {                              //0x1a=.
00193     0, 0, 0
00194   },
00195   {                              //0x1b=.
00196     0, 0, 0
00197   },
00198   {                              //0x1c=.
00199     0, 0, 0
00200   },
00201   {                              //0x1d=.
00202     0, 0, 0
00203   },
00204   {                              //0x1e=.
00205     0, 0, 0
00206   },
00207   {                              //0x1f=.
00208     0, 0, 0
00209   },
00210   {                              //0x20=
00211     324, 377, 2
00212   },
00213   {                              //0x21=!
00214     2, 1, 0
00215   },
00216   {                              //0x22="
00217     2, 1, 0
00218   },
00219   {                              //0x23=#
00220     1, 0, 1
00221   },
00222   {                              //0x24=$
00223     2, 1, 0
00224   },
00225   {                              //0x25=%
00226     2, 0, 0
00227   },
00228   {                              //0x26=&
00229     2, 1, 0
00230   },
00231   {                              //0x27='
00232     1, 21, 8
00233   },
00234   {                              //0x28=(
00235     2, 1, 0
00236   },
00237   {                              //0x29=)
00238     19, 0, 0
00239   },
00240   {                              //0x2a=*
00241     2, 1, 0
00242   },
00243   {                              //0x2b=+
00244     1, 0, 0
00245   },
00246   {                              //0x2c=,
00247     75, 4, 0
00248   },
00249   {                              //0x2d=-
00250     52, 7, 0
00251   },
00252   {                              //0x2e=.
00253     190, 16, 3
00254   },
00255   {                              //0x2f=/
00256     53, 2, 0
00257   },
00258   {                              //0x30=0
00259     399, 0, 0
00260   },
00261   {                              //0x31=1
00262     220, 0, 0
00263   },
00264   {                              //0x32=2
00265     226, 0, 0
00266   },
00267   {                              //0x33=3
00268     128, 0, 0
00269   },
00270   {                              //0x34=4
00271     147, 0, 0
00272   },
00273   {                              //0x35=5
00274     179, 0, 1
00275   },
00276   {                              //0x36=6
00277     173, 0, 0
00278   },
00279   {                              //0x37=7
00280     115, 0, 0
00281   },
00282   {                              //0x38=8
00283     107, 0, 0
00284   },
00285   {                              //0x39=9
00286     934, 0, 1
00287   },
00288   {                              //0x3a=:
00289     27, 0, 1
00290   },
00291   {                              //0x3b=;
00292     2, 1, 0
00293   },
00294   {                              //0x3c=<
00295     2, 1, 0
00296   },
00297   {                              //0x3d==
00298     2, 1, 0
00299   },
00300   {                              //0x3e=>
00301     2, 1, 0
00302   },
00303   {                              //0x3f=?
00304     2, 1, 0
00305   },
00306   {                              //0x40=@
00307     2, 1, 0
00308   },
00309   {                              //0x41=A
00310     3, 1, 0
00311   },
00312   {                              //0x42=B
00313     1, 73, 0
00314   },
00315   {                              //0x43=C
00316     1, 6, 0
00317   },
00318   {                              //0x44=D
00319     1, 24, 0
00320   },
00321   {                              //0x45=E
00322     1, 2, 0
00323   },
00324   {                              //0x46=F
00325     1, 19, 0
00326   },
00327   {                              //0x47=G
00328     1, 2, 0
00329   },
00330   {                              //0x48=H
00331     3, 2, 1
00332   },
00333   {                              //0x49=I
00334     0, 68, 0
00335   },
00336   {                              //0x4a=J
00337     1, 2, 0
00338   },
00339   {                              //0x4b=K
00340     1, 2, 0
00341   },
00342   {                              //0x4c=L
00343     1, 82, 0
00344   },
00345   {                              //0x4d=M
00346     10, 10, 0
00347   },
00348   {                              //0x4e=N
00349     3, 239, 0
00350   },
00351   {                              //0x4f=O
00352     1, 10, 0
00353   },
00354   {                              //0x50=P
00355     0, 1, 3
00356   },
00357   {                              //0x51=Q
00358     2, 3, 0
00359   },
00360   {                              //0x52=R
00361     1, 43, 0
00362   },
00363   {                              //0x53=S
00364     1, 53, 0
00365   },
00366   {                              //0x54=T
00367     2, 18, 0
00368   },
00369   {                              //0x55=U
00370     1, 2, 0
00371   },
00372   {                              //0x56=V
00373     1, 17, 0
00374   },
00375   {                              //0x57=W
00376     1, 5, 0
00377   },
00378   {                              //0x58=X
00379     1, 6, 0
00380   },
00381   {                              //0x59=Y
00382     1, 2, 0
00383   },
00384   {                              //0x5a=Z
00385     1, 2, 0
00386   },
00387   {                              //0x5b=[
00388     2, 1, 0
00389   },
00390   {                              //0x5c=backslash
00391     2, 1, 0
00392   },
00393   {                              //0x5d=]
00394     2, 1, 0
00395   },
00396   {                              //0x5e=^
00397     2, 1, 0
00398   },
00399   {                              //0x5f=_
00400     2, 1, 0
00401   },
00402   {                              //0x60=`
00403     1, 0, 2
00404   },
00405   {                              //0x61=a
00406     0, 0, 671
00407   },
00408   {                              //0x62=b
00409     0, 1, 16
00410   },
00411   {                              //0x63=c
00412     0, 2, 1
00413   },
00414   {                              //0x64=d
00415     0, 14, 0
00416   },
00417   {                              //0x65=e
00418     0, 0, 763
00419   },
00420   {                              //0x66=f
00421     0, 186, 0
00422   },
00423   {                              //0x67=g
00424     0, 2, 1
00425   },
00426   {                              //0x68=h
00427     0, 2, 1
00428   },
00429   {                              //0x69=i
00430     0, 0, 818
00431   },
00432   {                              //0x6a=j
00433     0, 2, 1
00434   },
00435   {                              //0x6b=k
00436     0, 4, 1
00437   },
00438   {                              //0x6c=l
00439     0, 26, 3
00440   },
00441   {                              //0x6d=m
00442     0, 69, 0
00443   },
00444   {                              //0x6e=n
00445     0, 885, 0
00446   },
00447   {                              //0x6f=o
00448     0, 17, 722
00449   },
00450   {                              //0x70=p
00451     0, 1, 5
00452   },
00453   {                              //0x71=q
00454     2, 1, 0
00455   },
00456   {                              //0x72=r
00457     0, 21, 0
00458   },
00459   {                              //0x73=s
00460     3, 49, 0
00461   },
00462   {                              //0x74=t
00463     0, 219, 5
00464   },
00465   {                              //0x75=u
00466     0, 0, 56
00467   },
00468   {                              //0x76=v
00469     0, 4, 0
00470   },
00471   {                              //0x77=w
00472     0, 2, 1
00473   },
00474   {                              //0x78=x
00475     0, 2, 1
00476   },
00477   {                              //0x79=y
00478     0, 1, 23
00479   },
00480   {                              //0x7a=z
00481     0, 2, 1
00482   },
00483   {                              //0x7b={
00484     2, 1, 0
00485   },
00486   {                              //0x7c=|
00487     59, 0, 3
00488   },
00489   {                              //0x7d=}
00490     2, 1, 0
00491   },
00492   {                              //0x7e=~
00493     2, 1, 0
00494   },
00495   {                              //0x7f=.
00496     0, 0, 0
00497   },
00498   {                              //0x80=.
00499     0, 0, 0
00500   },
00501   {                              //0x81=.
00502     0, 0, 0
00503   },
00504   {                              //0x82=.
00505     0, 0, 0
00506   },
00507   {                              //0x83=.
00508     0, 0, 0
00509   },
00510   {                              //0x84=.
00511     0, 0, 0
00512   },
00513   {                              //0x85=.
00514     0, 0, 0
00515   },
00516   {                              //0x86=.
00517     0, 0, 0
00518   },
00519   {                              //0x87=.
00520     0, 0, 0
00521   },
00522   {                              //0x88=.
00523     0, 0, 0
00524   },
00525   {                              //0x89=.
00526     0, 0, 0
00527   },
00528   {                              //0x8a=.
00529     0, 0, 0
00530   },
00531   {                              //0x8b=.
00532     0, 0, 0
00533   },
00534   {                              //0x8c=.
00535     0, 0, 0
00536   },
00537   {                              //0x8d=.
00538     0, 0, 0
00539   },
00540   {                              //0x8e=.
00541     0, 0, 0
00542   },
00543   {                              //0x8f=.
00544     0, 0, 0
00545   },
00546   {                              //0x90=.
00547     0, 0, 0
00548   },
00549   {                              //0x91=.
00550     0, 0, 0
00551   },
00552   {                              //0x92=.
00553     0, 0, 0
00554   },
00555   {                              //0x93=.
00556     0, 0, 0
00557   },
00558   {                              //0x94=.
00559     0, 0, 0
00560   },
00561   {                              //0x95=.
00562     0, 0, 0
00563   },
00564   {                              //0x96=.
00565     0, 0, 0
00566   },
00567   {                              //0x97=.
00568     0, 0, 0
00569   },
00570   {                              //0x98=.
00571     0, 0, 0
00572   },
00573   {                              //0x99=.
00574     0, 0, 0
00575   },
00576   {                              //0x9a=.
00577     0, 0, 0
00578   },
00579   {                              //0x9b=.
00580     0, 0, 0
00581   },
00582   {                              //0x9c=.
00583     0, 0, 0
00584   },
00585   {                              //0x9d=.
00586     0, 0, 0
00587   },
00588   {                              //0x9e=.
00589     0, 0, 0
00590   },
00591   {                              //0x9f=.
00592     0, 0, 0
00593   },
00594   {                              //0xa0=.
00595     0, 0, 0
00596   },
00597   {                              //0xa1=.
00598     0, 0, 0
00599   },
00600   {                              //0xa2=.
00601     0, 0, 0
00602   },
00603   {                              //0xa3=.
00604     0, 0, 0
00605   },
00606   {                              //0xa4=.
00607     0, 0, 0
00608   },
00609   {                              //0xa5=.
00610     0, 0, 0
00611   },
00612   {                              //0xa6=.
00613     0, 0, 0
00614   },
00615   {                              //0xa7=.
00616     0, 0, 0
00617   },
00618   {                              //0xa8=.
00619     0, 0, 0
00620   },
00621   {                              //0xa9=.
00622     0, 0, 0
00623   },
00624   {                              //0xaa=.
00625     0, 0, 0
00626   },
00627   {                              //0xab=.
00628     0, 0, 0
00629   },
00630   {                              //0xac=.
00631     0, 0, 0
00632   },
00633   {                              //0xad=.
00634     0, 0, 0
00635   },
00636   {                              //0xae=.
00637     0, 0, 0
00638   },
00639   {                              //0xaf=.
00640     0, 0, 0
00641   },
00642   {                              //0xb0=.
00643     0, 0, 0
00644   },
00645   {                              //0xb1=.
00646     0, 0, 0
00647   },
00648   {                              //0xb2=.
00649     0, 0, 0
00650   },
00651   {                              //0xb3=.
00652     0, 0, 0
00653   },
00654   {                              //0xb4=.
00655     0, 0, 0
00656   },
00657   {                              //0xb5=.
00658     0, 0, 0
00659   },
00660   {                              //0xb6=.
00661     0, 0, 0
00662   },
00663   {                              //0xb7=.
00664     0, 0, 0
00665   },
00666   {                              //0xb8=.
00667     0, 0, 0
00668   },
00669   {                              //0xb9=.
00670     0, 0, 0
00671   },
00672   {                              //0xba=.
00673     0, 0, 0
00674   },
00675   {                              //0xbb=.
00676     0, 0, 0
00677   },
00678   {                              //0xbc=.
00679     0, 0, 0
00680   },
00681   {                              //0xbd=.
00682     0, 0, 0
00683   },
00684   {                              //0xbe=.
00685     0, 0, 0
00686   },
00687   {                              //0xbf=.
00688     0, 0, 0
00689   },
00690   {                              //0xc0=.
00691     0, 0, 0
00692   },
00693   {                              //0xc1=.
00694     0, 0, 0
00695   },
00696   {                              //0xc2=.
00697     0, 0, 0
00698   },
00699   {                              //0xc3=.
00700     0, 0, 0
00701   },
00702   {                              //0xc4=.
00703     0, 0, 0
00704   },
00705   {                              //0xc5=.
00706     0, 0, 0
00707   },
00708   {                              //0xc6=.
00709     0, 0, 0
00710   },
00711   {                              //0xc7=.
00712     0, 0, 0
00713   },
00714   {                              //0xc8=.
00715     0, 0, 0
00716   },
00717   {                              //0xc9=.
00718     0, 0, 0
00719   },
00720   {                              //0xca=.
00721     0, 0, 0
00722   },
00723   {                              //0xcb=.
00724     0, 0, 0
00725   },
00726   {                              //0xcc=.
00727     0, 0, 0
00728   },
00729   {                              //0xcd=.
00730     0, 0, 0
00731   },
00732   {                              //0xce=.
00733     0, 0, 0
00734   },
00735   {                              //0xcf=.
00736     0, 0, 0
00737   },
00738   {                              //0xd0=.
00739     0, 0, 0
00740   },
00741   {                              //0xd1=.
00742     0, 0, 0
00743   },
00744   {                              //0xd2=.
00745     0, 0, 0
00746   },
00747   {                              //0xd3=.
00748     0, 0, 0
00749   },
00750   {                              //0xd4=.
00751     0, 0, 0
00752   },
00753   {                              //0xd5=.
00754     0, 0, 0
00755   },
00756   {                              //0xd6=.
00757     0, 0, 0
00758   },
00759   {                              //0xd7=.
00760     0, 0, 0
00761   },
00762   {                              //0xd8=.
00763     0, 0, 0
00764   },
00765   {                              //0xd9=.
00766     0, 0, 0
00767   },
00768   {                              //0xda=.
00769     0, 0, 0
00770   },
00771   {                              //0xdb=.
00772     0, 0, 0
00773   },
00774   {                              //0xdc=.
00775     0, 0, 0
00776   },
00777   {                              //0xdd=.
00778     0, 0, 0
00779   },
00780   {                              //0xde=.
00781     0, 0, 0
00782   },
00783   {                              //0xdf=.
00784     0, 0, 0
00785   },
00786   {                              //0xe0=.
00787     0, 0, 0
00788   },
00789   {                              //0xe1=.
00790     0, 0, 0
00791   },
00792   {                              //0xe2=.
00793     0, 0, 0
00794   },
00795   {                              //0xe3=.
00796     0, 0, 0
00797   },
00798   {                              //0xe4=.
00799     0, 0, 0
00800   },
00801   {                              //0xe5=.
00802     0, 0, 0
00803   },
00804   {                              //0xe6=.
00805     0, 0, 0
00806   },
00807   {                              //0xe7=.
00808     0, 0, 0
00809   },
00810   {                              //0xe8=.
00811     0, 0, 0
00812   },
00813   {                              //0xe9=.
00814     0, 0, 0
00815   },
00816   {                              //0xea=.
00817     0, 0, 0
00818   },
00819   {                              //0xeb=.
00820     0, 0, 0
00821   },
00822   {                              //0xec=.
00823     0, 0, 0
00824   },
00825   {                              //0xed=.
00826     0, 0, 0
00827   },
00828   {                              //0xee=.
00829     0, 0, 0
00830   },
00831   {                              //0xef=.
00832     0, 0, 0
00833   },
00834   {                              //0xf0=.
00835     0, 0, 0
00836   },
00837   {                              //0xf1=.
00838     0, 0, 0
00839   },
00840   {                              //0xf2=.
00841     0, 0, 0
00842   },
00843   {                              //0xf3=.
00844     0, 0, 0
00845   },
00846   {                              //0xf4=.
00847     0, 0, 0
00848   },
00849   {                              //0xf5=.
00850     0, 0, 0
00851   },
00852   {                              //0xf6=.
00853     0, 0, 0
00854   },
00855   {                              //0xf7=.
00856     0, 0, 0
00857   },
00858   {                              //0xf8=.
00859     0, 0, 0
00860   },
00861   {                              //0xf9=.
00862     0, 0, 0
00863   },
00864   {                              //0xfa=.
00865     0, 0, 0
00866   },
00867   {                              //0xfb=.
00868     0, 0, 0
00869   },
00870   {                              //0xfc=.
00871     0, 0, 0
00872   },
00873   {                              //0xfd=.
00874     0, 0, 0
00875   },
00876   {                              //0xfe=.
00877     0, 0, 0
00878   },
00879   {                              //0xff=.
00880     0, 0, 0
00881   },
00882 };
00883 
00884 //extern "C" double permuter_pending_threshold;
00885 
/* Constants used by good_choice() when similarity_enable is set. */

/** Factor applied to the choice rating when deriving a certainty from it. */
#define SIM_CERTAINTY_SCALE  (-10.0)

/** Constant added to the scaled rating. */
#define SIM_CERTAINTY_OFFSET (-10.0)

/** A choice is rejected outright when (rating + 1) * certainty exceeds this. */
#define SIMILARITY_FLOOR     100.0
00892 /*----------------------------------------------------------------------
00893               F u n c t i o n s
00894 ----------------------------------------------------------------------*/
00895 
00899 int good_choice(A_CHOICE *choice) { 
00900   register float certainty;
00901   if (choice == NULL)
00902     return (FALSE);
00903   if (similarity_enable) {
00904     if ((class_probability (choice) + 1) * class_certainty (choice) >
00905       SIMILARITY_FLOOR)
00906       return (FALSE);
00907     certainty =
00908       SIM_CERTAINTY_OFFSET +
00909       class_probability (choice) * SIM_CERTAINTY_SCALE;
00910   }
00911 
00912   else {
00913     certainty = class_certainty (choice);
00914   }
00915   if (certainty > certainty_threshold) {
00916     return (TRUE);
00917   }
00918 
00919   else {
00920     return (FALSE);
00921   }
00922 }
00923 
00924 
00928 void add_document_word(A_CHOICE *best_choice) { 
00929   char filename[CHARS_PER_LINE];
00930   FILE *doc_word_file;
00931   char *string;
00932   int stringlen;                 //length of word
00933 
00934   string = class_string (best_choice);
00935   stringlen = strlen (string);
00936 
00937   if (!doc_dict_enable
00938     || valid_word (string) || CurrentWordAmbig () || stringlen < 2)
00939     return;
00940 
00941   if (!good_choice (best_choice) || stringlen == 2) {
00942     if (class_certainty (best_choice) < permuter_pending_threshold)
00943       return;
00944     if (!word_in_dawg (pending_words, string)) {
00945       if (stringlen > 2 || isupper (string[0]) && isupper (string[1]))
00946         add_word_to_dawg(pending_words,
00947                          string,
00948                          MAX_DOC_EDGES,
00949                          RESERVED_DOC_EDGES);
00950       return;
00951     }
00952   }
00953 
00954   if (save_doc_words) {
00955     strcpy(filename, imagefile);
00956     strcat (filename, ".doc");
00957     doc_word_file = open_file (filename, "a");
00958     fprintf (doc_word_file, "%s\n", string);
00959     fclose(doc_word_file);
00960   }
00961   add_word_to_dawg(document_words, string, MAX_DOC_EDGES, RESERVED_DOC_EDGES);
00962   case_sensative = FALSE;
00963 }
00964 
00965 
00970 void
00971 adjust_non_word (A_CHOICE * best_choice, float certainties[]) {
00972   char *this_word;
00973   float adjust_factor;
00974 
00975   if (adjust_debug)
00976     cprintf ("%s %4.2f ",
00977       class_string (best_choice), class_probability (best_choice));
00978 
00979   this_word = class_string (best_choice);
00980 
00981   class_probability (best_choice) += RATING_PAD;
00982   if (case_ok (this_word) && punctuation_ok (this_word) != -1) {
00983     class_probability (best_choice) *= non_word;
00984     adjust_factor = non_word;
00985     if (adjust_debug)
00986       cprintf (", %4.2f ", non_word);
00987   }
00988   else {
00989     class_probability (best_choice) *= garbage;
00990     adjust_factor = garbage;
00991     if (adjust_debug) {
00992       if (!case_ok (this_word))
00993         cprintf (", C");
00994       if (punctuation_ok (this_word) == -1)
00995         cprintf (", P");
00996       cprintf (", %4.2f ", garbage);
00997     }
00998   }
00999 
01000   class_probability (best_choice) -= RATING_PAD;
01001 
01002   LogNewWordChoice(best_choice, adjust_factor, certainties);
01003 
01004   if (adjust_debug)
01005     cprintf (" --> %4.2f\n", class_probability (best_choice));
01006 }
01007 
01008 
01016 void init_permute() {
01017   char name[1024];
01018   make_adjust_debug();
01019   make_compound_debug();
01020   make_non_word();
01021   make_garbage();
01022   make_doc_words();
01023   make_doc_dict();
01024 
01025   init_permdawg();
01026   init_permnum();
01027 
01028 #ifdef TEXT_VERBOSE
01029   // gets a 'g', see ccmain/tesseractmain.dox
01030   cprintf("g");
01031 #endif
01032 
01033   word_dawg = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_NUM_EDGES);
01034   strcpy(name, demodir);
01035   strcat (name, "tessdata/word-dawg");
01036   read_squished_dawg(name, word_dawg, MAX_NUM_EDGES);
01037 
01038   document_words =
01039     (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
01040   initialize_dawg(document_words, MAX_DOC_EDGES);
01041 
01042   pending_words =
01043     (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
01044   initialize_dawg(pending_words, MAX_DOC_EDGES);
01045 
01046   user_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_USER_EDGES);
01047   strcpy(name, demodir);
01048   strcat (name, "tessdata/user-words");
01049   read_word_list(name, user_words, MAX_USER_EDGES, USER_RESERVED_EDGES);
01050   case_sensative = FALSE;
01051 }
01052 
01056 void end_permute() {
01057   memfree(word_dawg);
01058   word_dawg = NULL;
01059   memfree(document_words);
01060   document_words =  NULL;
01061   memfree(pending_words);
01062   pending_words = NULL;
01063   memfree(user_words);
01064   user_words = NULL;
01065 }
01066 
01073 A_CHOICE *permute_all(CHOICES_LIST char_choices,
01074                       float rating_limit,
01075                       A_CHOICE *raw_choice) {
01076   A_CHOICE *result_1;
01077   A_CHOICE *result_2 = NULL;
01078   BOOL8 any_alpha;
01079 
01080   result_1 = permute_top_choice (char_choices, rating_limit, raw_choice,
01081     &any_alpha);
01082   if (result_1 == NULL)
01083     return (NULL);
01084   if (permute_only_top)
01085     return result_1;
01086   if (any_alpha && array_count (char_choices) <= 20) {
01087     result_2 = permute_words (char_choices, rating_limit);
01088 
01089     if (class_probability (result_1) < class_probability (result_2)
01090     || class_string (result_2) == NULL) {
01091       free_choice(result_2);
01092     }
01093     else {
01094       free_choice(result_1);
01095       result_1 = result_2;
01096     }
01097   }
01098 
01099   result_2 = number_permute_and_select (char_choices, rating_limit);
01100 
01101   if (class_probability (result_1) < class_probability (result_2)
01102   || class_string (result_2) == NULL) {
01103     free_choice(result_2);
01104   }
01105   else {
01106     free_choice(result_1);
01107     result_1 = result_2;
01108   }
01109 
01110   result_2 = permute_compound_words (char_choices, rating_limit);
01111 
01112   if (!result_2 ||
01113     class_probability (result_1) < class_probability (result_2)
01114   || class_string (result_2) == NULL) {
01115     free_choice(result_2);
01116   }
01117   else {
01118     free_choice(result_1);
01119     result_1 = result_2;
01120   }
01121 
01122   return (result_1);
01123 }
01124 
01125 
01133 void permute_characters(CHOICES_LIST char_choices,
01134                         float limit,
01135                         A_CHOICE *best_choice,
01136                         A_CHOICE *raw_choice) {
01137   A_CHOICE *this_choice;
01138 
01139   permutation_count++;           /* Global counter */
01140 
01141   this_choice = permute_all (char_choices, limit, raw_choice);
01142 
01143   if (this_choice &&
01144   class_probability (this_choice) < class_probability (best_choice)) {
01145     clone_choice(best_choice, this_choice);
01146   }
01147   free_choice(this_choice);
01148 
01149   if (display_ratings)
01150     cprintf ("permute_characters:   %-15s %4.2f %4.2f\n",
01151       class_string (best_choice),
01152       class_probability (best_choice), class_certainty (best_choice));
01153 }
01154 
01155 
01159 A_CHOICE *permute_compound_words(CHOICES_LIST character_choices,
01160                                  float rating_limit) {
01161   A_CHOICE *first_choice;
01162   A_CHOICE *best_choice = NULL;
01163   char word[MAX_WERD_LENGTH + 1];
01164   float rating = 0;
01165   float certainty = 10000;
01166   char char_choice;
01167   int x;
01168   int first_index = 0;
01169   char *ptr;
01170 
01171   word[0] = '\0';
01172 
01173   if (array_count (character_choices) > MAX_WERD_LENGTH) {
01174     return (new_choice (NULL, MAX_FLOAT32, -MAX_FLOAT32, -1, NO_PERM));
01175   }
01176 
01177   array_loop(character_choices, x) {
01178 
01179     first_choice =
01180       (A_CHOICE *) first ((CHOICES) array_value (character_choices, x));
01181 
01182     ptr = class_string (first_choice);
01183     char_choice = ptr != NULL ? *ptr : '\0';
01184     if (x > first_index && (char_choice == '-' || char_choice == '/')) {
01185       if (compound_debug)
01186         cprintf ("Hyphenated word found\n");
01187 
01188       permute_subword (character_choices, rating_limit,
01189         first_index, x - 1, word, &rating, &certainty);
01190 
01191       if (rating > rating_limit)
01192         break;
01193       first_index = x + 1;
01194       strcat (word, class_string (first_choice));
01195       rating += class_probability (first_choice);
01196       certainty = min (class_certainty (first_choice), certainty);
01197     }
01198   }
01199 
01200   if (first_index > 0 && first_index < x && rating <= rating_limit) {
01201     permute_subword (character_choices, rating_limit,
01202       first_index, x - 1, word, &rating, &certainty);
01203 
01204     best_choice = new_choice (word, rating, certainty, -1, COMPOUND_PERM);
01205   }
01206   return (best_choice);
01207 }
01208 
01209 
01230 void permute_subword(CHOICES_LIST character_choices,
01231                      float rating_limit,
01232                      int start,
01233                      int end,
01234                      char *word,
01235                      float *rating,
01236                      float *certainty) {
01237   int x;
01238   A_CHOICE *best_choice = NULL;
01239   A_CHOICE raw_choice;
01240   CHOICES_LIST subchoices;
01241   CHOICES choices;
01242   char this_char;
01243   char *ptr;
01244 
01245   DisableChoiceAccum();
01246   raw_choice.string = NULL;
01247   raw_choice.rating = MAX_INT16;
01248   raw_choice.certainty = -MAX_INT16;
01249 
01250   subchoices = new_choice_list ();
01251   for (x = start; x <= end; x++) {
01252     choices = (CHOICES) array_value (character_choices, x);
01253     ptr = best_string (choices);
01254     this_char = ptr != NULL ? *ptr : '\0';
01255     if (this_char != '-' && this_char != '/') {
01256       subchoices = array_push (subchoices, choices);
01257     } else {
01258       const char* str = best_string(choices);
01259       strcat (word, str);
01260     }
01261   }
01262 
01263   if (array_count (subchoices)) {
01264     if (compound_debug)
01265       dawg_debug = TRUE;
01266     best_choice = permute_all (subchoices, rating_limit, &raw_choice);
01267     if (compound_debug)
01268       dawg_debug = FALSE;
01269 
01270     if (best_choice && class_string (best_choice)) {
01271       strcat (word, class_string (best_choice));
01272       *rating += class_probability (best_choice);
01273       *certainty = min (class_certainty (best_choice), *certainty);
01274     }
01275     else {
01276       *rating = MAX_FLOAT32;
01277     }
01278   }
01279   else {
01280     *rating = MAX_FLOAT32;
01281   }
01282 
01283   free_choice_list(subchoices);
01284   if (best_choice)
01285     free_choice(best_choice);
01286 
01287   if (compound_debug && *rating < MAX_FLOAT32) {
01288     cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
01289       word, *rating, *certainty);
01290   }
01291   if (raw_choice.string)
01292     strfree(raw_choice.string);
01293 
01294   /* Sets flag used to disable accumulation of word choices during
01295   compound word permutation. LogNewRawChoice */
01296   EnableChoiceAccum();
01297 }
01298 
01299 
01313 A_CHOICE *permute_top_choice(CHOICES_LIST character_choices,
01314                              float rating_limit,
01315                              A_CHOICE *raw_choice,
01316                              BOOL8 *any_alpha) {
01317   CHOICES char_list;
01318   A_CHOICE *first_choice;
01319   A_CHOICE *best_choice;
01320   A_CHOICE *other_choice;
01321   char *ptr;
01322   char first_char;               //first choice
01323   char second_char;              //second choice
01324   char third_char;               //third choice
01325   char prev_char = '\0';         //prev in word
01326   char next_char = '\0';         //next in word
01327   char next_next_char = '\0';    //after next next in word
01328 
01329   char word[MAX_PERM_LENGTH + 1];
01330   char capital_word[MAX_PERM_LENGTH + 1];
01331   char lower_word[MAX_PERM_LENGTH + 1];
01332   int x;
01333   BOOL8 char_alpha;
01334 
01335   float rating = 0;
01336   float upper_rating = 0;
01337   float lower_rating = 0;
01338   float first_rating = 0;
01339 
01340   float certainty = 10000;
01341   float upper_certainty = 10000;
01342   float lower_certainty = 10000;
01343 
01344   float certainties[MAX_PERM_LENGTH + 1];
01345   float lower_certainties[MAX_PERM_LENGTH + 1];
01346   float upper_certainties[MAX_PERM_LENGTH + 1];
01347 
01348   register CHOICES this_char;
01349   register char ch;
01350   register INT8 lower_done;
01351   register INT8 upper_done;
01352 
01353   if (any_alpha != NULL)
01354     *any_alpha = FALSE;
01355 
01356   if (array_count (character_choices) > MAX_PERM_LENGTH) {
01357     return (NULL);
01358   }
01359 
01360   array_loop(character_choices, x) {
01361     if (x + 1 < array_count (character_choices)) {
01362       char_list = (CHOICES) array_value (character_choices, x + 1);
01363       first_choice = (A_CHOICE *) first (char_list);
01364 
01365       ptr = class_string (first_choice);
01366       next_char = (ptr != NULL && *ptr != '\0') ? *ptr : ' ';
01367     }
01368     else
01369       next_char = '\0';
01370     if (x + 2 < array_count (character_choices)) {
01371       char_list = (CHOICES) array_value (character_choices, x + 2);
01372       first_choice = (A_CHOICE *) first (char_list);
01373 
01374       ptr = class_string (first_choice);
01375       next_next_char = (ptr != NULL && *ptr != '\0') ? *ptr : ' ';
01376     }
01377     else
01378       next_next_char = '\0';
01379 
01380     char_list = (CHOICES) array_value (character_choices, x);
01381     first_choice = (A_CHOICE *) first (char_list);
01382 
01383     ptr = class_string (first_choice);
01384     word[x] = (ptr != NULL && *ptr != '\0') ? *ptr : ' ';
01385 
01386     lower_word[x] = word[x];
01387     capital_word[x] = word[x];
01388     first_char = word[x];
01389     first_rating = class_probability (first_choice);
01390     upper_rating += class_probability (first_choice);
01391     lower_rating += class_probability (first_choice);
01392     lower_certainty = min (class_certainty (first_choice), lower_certainty);
01393     upper_certainty = min (class_certainty (first_choice), upper_certainty);
01394 
01395     certainties[x] = class_certainty (first_choice);
01396     lower_certainties[x] = class_certainty (first_choice);
01397     upper_certainties[x] = class_certainty (first_choice);
01398 
01399     lower_done = FALSE;
01400     upper_done = FALSE;
01401     char_alpha = FALSE;
01402     second_char = '\0';
01403     third_char = '\0';
01404     iterate_list(this_char, char_list) {
01405       ptr = best_string (this_char);
01406       ch = ptr != NULL ? *ptr : '\0';
01407       if (ch == 'l' && rest (this_char) != NULL
01408       && best_probability (rest (this_char)) == first_rating) {
01409         ptr = best_string (rest (this_char));
01410         if (ptr != NULL && (*ptr == '1' || *ptr == 'I')) {
01411           second_char = *ptr;
01412           this_char = rest (this_char);
01413           if (rest (this_char) != NULL
01414           && best_probability (rest (this_char)) == first_rating) {
01415             ptr = best_string (rest (this_char));
01416             if (ptr != NULL && (*ptr == '1' || *ptr == 'I')) {
01417               third_char = *ptr;
01418               this_char = rest (this_char);
01419             }
01420           }
01421           ch = choose_il1 (first_char, second_char, third_char,
01422             prev_char, next_char, next_next_char);
01423           if (ch != 'l' && word[x] == 'l') {
01424             word[x] = ch;
01425             lower_word[x] = ch;
01426             capital_word[x] = ch;
01427           }
01428         }
01429       }
01430       /* Find lower case */
01431       if (!lower_done && (islower (ch) || (isupper (ch) && x == 0))) {
01432         lower_word[x] = ch;
01433         lower_rating += best_probability (this_char);
01434         lower_rating -= class_probability (first_choice);
01435         lower_certainty = min (best_certainty (this_char), lower_certainty);
01436         lower_certainties[x] = best_certainty (this_char);
01437         lower_done = TRUE;
01438       }
01439       /* Find upper case */
01440       if (!upper_done && isupper (ch)) {
01441         capital_word[x] = ch;
01442         upper_rating += best_probability (this_char);
01443         upper_rating -= class_probability (first_choice);
01444         upper_certainty = min (best_certainty (this_char), upper_certainty);
01445         upper_certainties[x] = best_certainty (this_char);
01446         upper_done = TRUE;
01447       }
01448       if (!char_alpha && isalpha (ch))
01449         char_alpha = TRUE;
01450       if (lower_done && upper_done)
01451         break;
01452     }
01453     if (char_alpha && any_alpha != NULL)
01454       *any_alpha = TRUE;
01455 
01456     if (first_choice == NULL) {
01457       cprintf ("Permuter giving up due to null choices list");
01458       word[x + 1] = '$';
01459       word[x + 2] = '\0';
01460       cprintf (" word=%s\n", word);
01461       return (NULL);
01462     }
01463 
01464     rating += class_probability (first_choice);
01465     if (rating > rating_limit)
01466       return (NULL);
01467 
01468     certainty = min (class_certainty (first_choice), certainty);
01469     prev_char = word[x];
01470   }
01471 
01472   lower_word[x] = '\0';
01473   capital_word[x] = '\0';
01474   word[x] = '\0';
01475 
01476   if (rating < class_probability (raw_choice)) {
01477     if (class_string (raw_choice))
01478       strfree (class_string (raw_choice));
01479 
01480     class_probability (raw_choice) = rating;
01481     class_certainty (raw_choice) = certainty;
01482     class_string (raw_choice) = strsave (word);
01483     class_permuter (raw_choice) = TOP_CHOICE_PERM;
01484 
01485     LogNewRawChoice (raw_choice, 1.0, certainties);
01486   }
01487 
01488   best_choice = new_choice (word, rating, certainty, -1, TOP_CHOICE_PERM);
01489   adjust_non_word(best_choice, certainties);
01490 
01491   other_choice = new_choice (lower_word, lower_rating, lower_certainty,
01492     -1, LOWER_CASE_PERM);
01493   adjust_non_word(other_choice, lower_certainties);
01494   if (class_probability (best_choice) > class_probability (other_choice)) {
01495     clone_choice(best_choice, other_choice);
01496   }
01497   free_choice(other_choice);
01498 
01499   other_choice = new_choice (capital_word, upper_rating, upper_certainty,
01500     -1, UPPER_CASE_PERM);
01501   adjust_non_word(other_choice, upper_certainties);
01502   if (class_probability (best_choice) > class_probability (other_choice)) {
01503     clone_choice(best_choice, other_choice);
01504   }
01505   free_choice(other_choice);
01506 
01507   return (best_choice);
01508 }
01509 
01510 
01522 char choose_il1(char first_char,
01523                 char second_char,
01524                 char third_char,
01525                 char prev_char,
01526                 char next_char,
01527                 char next_next_char) {
01528   INT32 type1;                   //1/I/l type of first choice
01529   INT32 type2;                   //1/I/l type of second choice
01530   INT32 type3;                   //1/I/l type of third choice
01531 
01532   if (first_char == 'l' && second_char != '\0') {
01533     if (second_char == 'I'
01534       && (isupper (prev_char) && !islower (next_char)
01535       && !isdigit (next_char) || isupper (next_char)
01536       && !islower (prev_char) && !isdigit (prev_char)))
01537       first_char = second_char;  //override
01538     else if (second_char == '1' || third_char == '1') {
01539       if (isdigit (next_char) || isdigit (prev_char)
01540       || next_char == 'l' && isdigit (next_next_char)) {
01541         first_char = '1';
01542       }
01543       else if (!islower (prev_char)
01544         && (!islower (next_char) || next_char == 's'
01545       && next_next_char == 't')) {
01546         if ((prev_char != '\'' && prev_char != '`' || next_char != '\0')
01547           && (next_char != '\'' && next_char != '`'
01548         || prev_char != '\0')) {
01549           first_char = '1';
01550         }
01551       }
01552     }
01553     if (first_char == 'l' && next_char != '\0' && !isalpha (prev_char)) {
01554       type1 = 2;
01555 
01556       if (second_char == '1')
01557         type2 = 0;
01558       else if (second_char == 'I')
01559         type2 = 1;
01560       else if (second_char == 'l')
01561         type2 = 2;
01562       else
01563         type2 = type1;
01564 
01565       if (third_char == '1')
01566         type3 = 0;
01567       else if (third_char == 'I')
01568         type3 = 1;
01569       else if (third_char == 'l')
01570         type3 = 2;
01571       else
01572         type3 = type1;
01573 
01574       if (bigram_counts[next_char][type2] >
01575       bigram_counts[next_char][type1]) {
01576         first_char = second_char;
01577         type1 = type2;
01578       }
01579       if (bigram_counts[next_char][type3] >
01580       bigram_counts[next_char][type1]) {
01581         first_char = third_char;
01582       }
01583     }
01584   }
01585   return first_char;
01586 }
01587 
01588 
01593 A_CHOICE *permute_words(CHOICES_LIST char_choices, float rating_limit) { 
01594   A_CHOICE *best_choice;
01595   int hyphen_len;
01596 
01597   best_choice = new_choice (NULL, rating_limit, -MAX_FLOAT32, -1, NO_PERM);
01598 
01599   hyphen_len = hyphen_string != NULL ? strlen (hyphen_string) : 0;
01600   if (hyphen_len + array_count (char_choices) > MAX_WERD_LENGTH) {
01601     class_probability (best_choice) = MAX_FLOAT32;
01602   }
01603   else {
01604 
01605     dawg_permute_and_select ("system words:", word_dawg, SYSTEM_DAWG_PERM,
01606       char_choices, best_choice, TRUE);
01607 
01608     dawg_permute_and_select ("document_words", document_words,
01609       DOC_DAWG_PERM, char_choices, best_choice,
01610       FALSE);
01611 
01612     dawg_permute_and_select ("user words", user_words, USER_DAWG_PERM,
01613       char_choices, best_choice, FALSE);
01614     case_sensative = FALSE;
01615   }
01616 
01617   return (best_choice);
01618 }
01619 
01620 
01624 int valid_word(const char *string) { 
01625   int result = NO_PERM;
01626 
01627   if (word_in_dawg (word_dawg, string))
01628     result = SYSTEM_DAWG_PERM;
01629   else {
01630     if (word_in_dawg (document_words, string))
01631       result = DOC_DAWG_PERM;
01632     else if (word_in_dawg (user_words, string))
01633       result = USER_DAWG_PERM;
01634     case_sensative = FALSE;
01635   }
01636   return (result);
01637 }

Generated on Wed Feb 28 19:49:10 2007 for Tesseract by  doxygen 1.5.1