ccmain/docqual.h

Go to the documentation of this file.
00001 
00020 #ifndef           DOCQUAL_H
00021 #define           DOCQUAL_H
00022 
00023 #include          "control.h"
00024 #include          "notdll.h"
00025 
00026 
00031 enum GARBAGE_LEVEL // result type of garbage_word(); also a parameter of terrible_word_crunch() and potential_word_crunch()
00032 {
00033   G_NEVER_CRUNCH, // implicit value 0; name suggests the word must never be crunched -- TODO confirm in the implementation
00034   G_OK,           // implicit value 1; presumably "no garbage detected" -- TODO confirm
00035   G_DODGY,        // implicit value 2; presumably "suspect" -- TODO confirm
00036   G_TERRIBLE      // implicit value 3; presumably the worst level -- TODO confirm
00037 };
00038 
00041 extern STRING_VAR_H (outlines_odd, "%| ", "Non standard number of outlines"); // each *_VAR_H use lists (name, value, description); the macros are not defined in the lines shown here
00042 extern STRING_VAR_H (outlines_2, "ij!?%\":;",
00043 "Non standard number of outlines"); // NOTE(review): description identical to outlines_odd; name suggests characters expected to have 2 outlines -- confirm
00044 extern BOOL_VAR_H (docqual_excuse_outline_errs, FALSE,
00045 "Allow outline errs in unrejection?");
00046 extern BOOL_VAR_H (tessedit_good_quality_unrej, TRUE,
00047 "Reduce rejection on good docs");
00048 extern BOOL_VAR_H (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
00049 extern double_VAR_H (tessedit_reject_doc_percent, 65.00,
00050 "%rej allowed before rej whole doc");
00051 extern double_VAR_H (tessedit_reject_block_percent, 45.00,
00052 "%rej allowed before rej whole block");
00053 extern double_VAR_H (tessedit_reject_row_percent, 40.00,
00054 "%rej allowed before rej whole row");
00055 extern double_VAR_H (tessedit_whole_wd_rej_row_percent, 70.00,
00056 "%of row rejects in whole word rejects which prevents whole row rejection");
00057 extern BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds, TRUE,
00058 "Only rej partially rejected words in block rejection");
00059 extern BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds, TRUE,
00060 "Only rej partially rejected words in row rejection");
00061 extern BOOL_VAR_H (tessedit_dont_blkrej_good_wds, FALSE,
00062 "Use word segmentation quality metric"); // name suggests this applies to block rejection -- confirm
00063 extern BOOL_VAR_H (tessedit_dont_rowrej_good_wds, FALSE,
00064 "Use word segmentation quality metric"); // NOTE(review): same description as the blkrej flag above; name suggests the row-rejection counterpart
00065 extern INT_VAR_H (tessedit_preserve_min_wd_len, 2,
00066 "Only preserve wds longer than this");
00067 extern BOOL_VAR_H (tessedit_row_rej_good_docs, TRUE,
00068 "Apply row rejection to good docs");
00069 extern double_VAR_H (tessedit_good_doc_still_rowrej_wd, 1.1,
00070 "rej good doc wd if more than this fraction rejected"); // NOTE(review): listed value 1.1 exceeds 1.0; if this is a fraction in [0,1] the test could never trigger -- confirm intent
00071 extern BOOL_VAR_H (tessedit_reject_bad_qual_wds, TRUE,
00072 "Reject all bad quality wds");
00073 extern BOOL_VAR_H (tessedit_debug_doc_rejection, FALSE, "Page stats");
00074 extern BOOL_VAR_H (tessedit_debug_quality_metrics, FALSE,
00075 "Output data to debug file");
00076 extern BOOL_VAR_H (bland_unrej, FALSE, "unrej potential with no chekcs"); // NOTE(review): "chekcs" in the description is a typo for "checks"
00077 extern double_VAR_H (quality_rowrej_pc, 1.1,
00078 "good_quality_doc gte good char limit");
00079 extern BOOL_VAR_H (unlv_tilde_crunching, TRUE,
00080 "Mark v.bad words for tilde crunch"); // the crunch_* parameters below share the same (name, value, description) layout
00081 extern BOOL_VAR_H (crunch_early_merge_tess_fails, TRUE,
00082 "Before word crunch?");
00083 extern BOOL_VAR_H (crunch_early_convert_bad_unlv_chs, FALSE,
00084 "Take out ~^ early?");
00085 extern double_VAR_H (crunch_terrible_rating, 80.0, "crunch rating lt this");
00086 extern BOOL_VAR_H (crunch_terrible_garbage, TRUE, "As it says");
00087 extern double_VAR_H (crunch_poor_garbage_cert, -9.0,
00088 "crunch garbage cert lt this");
00089 extern double_VAR_H (crunch_poor_garbage_rate, 60,
00090 "crunch garbage rating lt this");
00091 extern double_VAR_H (crunch_pot_poor_rate, 40,
00092 "POTENTIAL crunch rating lt this");
00093 extern double_VAR_H (crunch_pot_poor_cert, -8.0,
00094 "POTENTIAL crunch cert lt this");
00095 extern BOOL_VAR_H (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
00096 extern double_VAR_H (crunch_del_rating, 60,
00097 "POTENTIAL crunch rating lt this"); // NOTE(review): description identical to crunch_pot_poor_rate; the "del" name suggests a deletion threshold -- confirm
00098 extern double_VAR_H (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this"); // NOTE(review): description identical to crunch_pot_poor_cert -- confirm
00099 extern double_VAR_H (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
00100 extern double_VAR_H (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
00101 extern double_VAR_H (crunch_del_min_width, 3.0,
00102 "Del if word width lt xht x this");
00103 extern double_VAR_H (crunch_del_high_word, 1.5,
00104 "Del if word gt xht x this above bl");
00105 extern double_VAR_H (crunch_del_low_word, 0.5,
00106 "Del if word gt xht x this below bl");
00107 extern double_VAR_H (crunch_small_outlines_size, 0.6,
00108 "Small if lt xht x this");
00109 extern INT_VAR_H (crunch_rating_max, 10, "For adj length in rating per ch");
00110 extern INT_VAR_H (crunch_pot_indicators, 1,
00111 "How many potential indicators needed");
00112 extern BOOL_VAR_H (crunch_leave_ok_strings, TRUE,
00113 "Dont touch sensible strings");
00114 extern BOOL_VAR_H (crunch_accept_ok, TRUE, "Use acceptability in okstring");
00115 extern BOOL_VAR_H (crunch_leave_accept_strings, FALSE,
00116 "Dont pot crunch sensible strings");
00117 extern BOOL_VAR_H (crunch_include_numerals, FALSE, "Fiddle alpha figures");
00118 extern INT_VAR_H (crunch_leave_lc_strings, 4,
00119 "Dont crunch words with long lower case strings"); // "lc" = lower case, per the description
00120 extern INT_VAR_H (crunch_leave_uc_strings, 4,
00121 "Dont crunch words with long upper case strings"); // "uc" counterpart of crunch_leave_lc_strings; description names upper case to match the variable
00122 extern INT_VAR_H (crunch_long_repetitions, 3,
00123 "Crunch words with long repetitions"); // integer setting; presumably the repetition length that counts as "long" -- TODO confirm
00124 extern INT_VAR_H (crunch_debug, 0, "As it says"); // integer debug setting for the crunch code; the description string gives no further detail
00127 INT16 word_blob_quality(  //Blob seg changes; INT16 result whose exact meaning is set by the implementation (not shown here)
00128                         WERD_RES *word,  //word result to assess
00129                         ROW *row);       //row supplied with the word; presumably its containing row -- confirm
00130 BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2); //takes two blobs; BOOL8 result presumably TRUE when they match -- confirm
00131 INT16 word_outline_errs(  //Outline count errs
00132                         WERD_RES *word);
00133 void word_char_quality(  //Blob seg changes -- NOTE(review): same comment as word_blob_quality; name suggests per-character quality
00134                        WERD_RES *word,
00135                        ROW *row,
00136                        INT16 *match_count,            //non-const pointer on a void function: presumably an output -- confirm
00137                        INT16 *accepted_match_count);  //presumably an output as well -- confirm
00138 void unrej_good_chs(WERD_RES *word, ROW *row); 
00139 void print_boxes(WERD *word); 
00140 INT16 count_outline_errs(char c, INT16 outline_count); //takes a character and an outline count; presumably related to outlines_odd/outlines_2 -- confirm
00141 void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc); //page iterator is taken by non-const reference (C++ declaration)
00142 void unrej_good_quality_words(  //unreject potential
00143                               PAGE_RES_IT &page_res_it);  //page iterator, by non-const reference
00144 void doc_and_block_rejection(  //reject big chunks
00145                              PAGE_RES_IT &page_res_it,    //page iterator, by non-const reference
00146                              BOOL8 good_quality_doc);     //same flag name as in quality_based_rejection
00147 void reject_whole_page(PAGE_RES_IT &page_res_it); 
00148 void tilde_crunch(PAGE_RES_IT &page_res_it); 
00149 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level); //garbage_level uses enum GARBAGE_LEVEL declared in this header
00150 BOOL8 potential_word_crunch(WERD_RES *word,
00151                             GARBAGE_LEVEL garbage_level,
00152                             BOOL8 ok_dict_word);
00153 void tilde_delete(PAGE_RES_IT &page_res_it); 
00154                                  //word to do -- presumably describes word_res of the declaration on the next line
00155 void convert_bad_unlv_chs(WERD_RES *word_res); 
00156                                  //word to do -- presumably describes word_res of the declaration on the next line
00157 void merge_tess_fails(WERD_RES *word_res); 
00158 GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word); //returns one of the GARBAGE_LEVEL enumerators
00159 CRUNCH_MODE word_deletable(WERD_RES *word, INT16 &delete_mode); //delete_mode is a non-const reference: presumably an output -- confirm; CRUNCH_MODE is declared elsewhere
00160 INT16 failure_count(WERD_RES *word); 
00161 BOOL8 noise_outlines(WERD *word); 
00162                                  //word to do -- presumably describes the word parameter of the declaration on the next line
00163 void insert_rej_cblobs(WERD_RES *word); 
00164 #endif

Generated on Wed Feb 28 19:49:07 2007 for Tesseract by  doxygen 1.5.1