ccmain/docqual.h File Reference

#include "control.h"
#include "notdll.h"

Go to the source code of this file.



Enumeration Type Documentation


Type of document quality metrics


Definition at line 31 of file docqual.h.

00032 {
00033   G_NEVER_CRUNCH, // garbage_word(): long, mostly-alphabetic string that is to be left alone
00034   G_OK,           // garbage_word(): no garbage indicator dominates the word
00035   G_DODGY,        // garbage_word(): dodgy-character count is high for the word length
00036   G_TERRIBLE      // garbage_word(): blanks (tess rejects) dominate the word
00037 };

Function Documentation

void convert_bad_unlv_chs ( WERD_RES word_res  ) 

Converts all '~' to '-' & '^' to ' ' in this word.

word_res Word results
none, instead updates word's reject_map

Definition at line 1106 of file docqual.cpp.

References WERD_RES::best_choice, REJMAP::length(), and WERD_RES::reject_map.

Referenced by tilde_crunch().

01107                                               {
        // Rewrites the word's best-choice text in place: every '~' becomes '-'
        // and every '^' becomes ' '. Each converted position that the reject
        // map currently accepts is flagged with setrej_unlv_rej().
01108   char *ptr;                     //writable alias of the best-choice text
01109   int i;
01111   ptr = (char *) word_res->best_choice->string ().string ();
        // Walks reject_map.length() positions; assumes the text is at least
        // that long -- TODO confirm against callers.
01112   for (i = 0; i < word_res->reject_map.length (); i++) {
01113     if (ptr[i] == '~') {
01114       ptr[i] = '-';
01115       if (word_res->reject_map[i].accepted ())
01116         word_res->reject_map[i].setrej_unlv_rej ();
01117     }
01118     if (ptr[i] == '^') {
01119       ptr[i] = ' ';
01120       if (word_res->reject_map[i].accepted ())
01121         word_res->reject_map[i].setrej_unlv_rej ();
01122     }
01123   }
01124 }

INT16 count_outline_errs ( char  c,
INT16  outline_count 

Test if character is in known-odd categories.

c character
outline_count outlines we have
difference between outline_count and what it should be
So, count_outline_errs() returns 0 if c has the EXPECTED number of outlines.

Definition at line 475 of file docqual.cpp.

Referenced by unrej_good_chs(), word_char_quality(), and word_outline_errs().

00475                                                       { 
        // Returns |outline_count - expected|, where expected is 2 for
        // characters listed in outlines_2 and 1 otherwise. Characters listed
        // in outlines_odd are never counted as errors (returns 0).
00476   int expected_outline_count;
00478   if (STRING (outlines_odd).contains (c))
00479     return 0;                    //Don't use this char
00480   else if (STRING (outlines_2).contains (c))
00481     expected_outline_count = 2;
00482   else
00483     expected_outline_count = 1;
00484   return abs (outline_count - expected_outline_count);
00485 }

BOOL8 crude_match_blobs ( PBLOB blob1,
PBLOB blob2 

Compares two blobs' bounding boxes & lengths of outlines.

blob1 first blob
blob2 second blob
TRUE if blob1 == blob2

Definition at line 227 of file docqual.cpp.

References PBLOB::bounding_box(), BOX::contains(), FALSE, PBLOB::out_list(), and TRUE.

Referenced by unrej_good_chs(), word_blob_quality(), and word_char_quality().

00227                                                     { 
        // Crude equality test for two blobs: their bounding boxes must contain
        // each other (i.e. be identical) and both blobs must have the same
        // number of outlines. Returns TRUE on a match, FALSE otherwise.
00228   BOX box1 = blob1->bounding_box ();
00229   BOX box2 = blob2->bounding_box ();
00231   if (box1.contains (box2) &&
00232     box2.contains (box1) &&
        // Compare blob1 against blob2; comparing blob1 with itself made this
        // test vacuously true.
00233     (blob1->out_list ()->length () == blob2->out_list ()->length ()))
00234     return TRUE;
00235   else
00236     return FALSE;
00237 }

void doc_and_block_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 

Reject big chunks.

page_res_it pointer to page in question ?
good_quality_doc 1 or 0, document is good
Reject all of the page, or a block of it, if it's deemed bad (too many rejects)

If the page has too many rejects - reject all of it. If any block has too many rejects - reject all words in the block

Definition at line 608 of file docqual.cpp.

References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, BLOCK_RES::block, PAGE_RES_IT::block(), ROW_RES::char_count, BLOCK_RES::char_count, FALSE, PAGE_RES_IT::forward(), REJMAP::length(), NULL, PAGE_RES_IT::page_res, PAGE_RES_IT::prev_row(), ROW_RES::rej_count, BLOCK_RES::rej_count, REJMAP::rej_word_block_rej(), REJMAP::reject_count(), WERD_RES::reject_map, WERD_RES::reject_spaces, reject_whole_page(), PAGE_RES_IT::restart_page(), ROW_RES::row, PAGE_RES_IT::row(), SECURE_NAMES, WERD::space(), tprintf(), TRUE, ROW_RES::whole_word_rej_count, WERD_RES::word, PAGE_RES_IT::word(), and word_char_quality().

Referenced by quality_based_rejection().

00610                                                      {
        // Three-level rejection pass:
        //  1. If the page reject percentage exceeds tessedit_reject_doc_percent,
        //     reject_whole_page() is called and nothing else is done.
        //  2. Otherwise each block is tested against tessedit_reject_block_percent;
        //     words of a failing block are marked with rej_word_block_rej().
        //  3. Blocks that pass are walked row by row; words of a failing row are
        //     marked with rej_word_row_rej().
        // The tessedit_preserve_* / tessedit_dont_* flags let individual words
        // escape block or row rejection.
00611   INT16 block_no = 0;
00612   INT16 row_no = 0;
00613   BLOCK_RES *current_block;
00614   ROW_RES *current_row;
00616   BOOL8 rej_word;
00617   BOOL8 prev_word_rejected;
00618   INT16 char_quality;
00619   INT16 accepted_char_quality;
        // NOTE(review): unlike the block and row tests below, this division is
        // not guarded by a char_count > 0 check -- confirm an empty page cannot
        // reach here.
00621   if ((page_res_it.page_res->rej_count * 100.0 /
00622   page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
00623     reject_whole_page(page_res_it); 
00624     #ifndef SECURE_NAMES
00625     if (tessedit_debug_doc_rejection) {
00626       tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
00627         page_res_it.page_res->char_count,
00628         page_res_it.page_res->rej_count);
00629     }
00630     #endif
00631   }
00632   else {
00633     #ifndef SECURE_NAMES
00634     if (tessedit_debug_doc_rejection)
00635       tprintf ("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
00636         page_res_it.page_res->char_count,
00637         page_res_it.page_res->rej_count);
00638     #endif
00640     /* Walk blocks testing for block rejection */
00642     page_res_it.restart_page ();
00643     while (page_res_it.word () != NULL) {
00644       current_block = page_res_it.block ();
            // block_no is only used in the debug output; -1 means no text region
00645       if (current_block->block->text_region () != NULL)
00646         block_no = current_block->block->text_region ()->id_no ();
00647       else
00648         block_no = -1;
00649       if ((page_res_it.block ()->char_count > 0) &&
00650         ((page_res_it.block ()->rej_count * 100.0 /
00651         page_res_it.block ()->char_count) >
00652       tessedit_reject_block_percent)) {
00653         #ifndef SECURE_NAMES
00654         if (tessedit_debug_block_rejection)
00655           tprintf ("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
00656             block_no,
00657             page_res_it.block ()->char_count,
00658             page_res_it.block ()->rej_count);
00659         #endif
00660         prev_word_rejected = FALSE;
              // Consume every word of the failing block
00661         while ((page_res_it.word () != NULL) &&
00662         (page_res_it.block () == current_block)) {
00663           if (tessedit_preserve_blk_rej_perfect_wds) {
                  // Only words with rejects, or shorter than the minimum length,
                  // are candidates for rejection
00664             rej_word =
00665               (page_res_it.word ()->reject_map.reject_count () > 0)
00666               || (page_res_it.word ()->reject_map.length () <
00667               tessedit_preserve_min_wd_len);
                  // A long enough, acceptable word is spared when every
                  // character passes word_char_quality()
00668             if (rej_word && tessedit_dont_blkrej_good_wds
00669               && !(page_res_it.word ()->reject_map.length () <
00670               tessedit_preserve_min_wd_len)
00671               &&
00672               (acceptable_word_string
00673               (page_res_it.word ()->best_choice->string ().
00674             string ()) != AC_UNACCEPTABLE)) {
00675               word_char_quality (page_res_it.word (),
00676                 page_res_it.row ()->row,
00677                 &char_quality,
00678                 &accepted_char_quality);
00679               rej_word = char_quality !=
00680                 page_res_it.word ()->reject_map.length ();
00681             }
00682           }
00683           else
00684             rej_word = TRUE;
00685           if (rej_word) {
00686             /*
00687               Reject spacing if both current and prev words are rejected.
00688               NOTE - this is NOT restricted to FUZZY spaces.
00689            When tried this generated more space errors.
00690             */
00691             if (tessedit_use_reject_spaces &&
00692               prev_word_rejected &&
00693               (page_res_it.prev_row () == page_res_it.row ()) &&
00694               (page_res_it.word ()->word->space () == 1))
00695               page_res_it.word ()->reject_spaces = TRUE;
00696             page_res_it.word ()->reject_map.rej_word_block_rej ();
00697           }
00698           prev_word_rejected = rej_word;
00699           page_res_it.forward ();
00700         }
00701       }
00702       else {
00703         #ifndef SECURE_NAMES
00704         if (tessedit_debug_block_rejection)
00705           tprintf
00706             ("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
00707             block_no, page_res_it.block ()->char_count,
00708             page_res_it.block ()->rej_count);
00709         #endif
00711         /* Walk rows in block testing for row rejection */
00712         row_no = 0;
00713         while ((page_res_it.word () != NULL) &&
00714         (page_res_it.block () == current_block)) {
00715           current_row = page_res_it.row ();
00716           row_no++;
00717      /*
00718    Reject whole row if:
00719     fraction of chars on row which are rejected exceed a limit AND
00720     fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit
00721      */
00722           if ((page_res_it.row ()->char_count > 0) &&
00723             ((page_res_it.row ()->rej_count * 100.0 /
00724             page_res_it.row ()->char_count) >
00725             tessedit_reject_row_percent) &&
00726             ((page_res_it.row ()->whole_word_rej_count * 100.0 /
00727             page_res_it.row ()->rej_count) <
00728           tessedit_whole_wd_rej_row_percent)) {
00729             #ifndef SECURE_NAMES
00730             if (tessedit_debug_block_rejection)
00731               tprintf
00732                 ("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
00733                 row_no, page_res_it.row ()->char_count,
00734                 page_res_it.row ()->rej_count);
00735             #endif
00736             prev_word_rejected = FALSE;
                  // Consume every word of the failing row
00737             while ((page_res_it.word () != NULL) &&
00738             (page_res_it.row () == current_row)) {
00739               /* Preserve words on good docs unless they are mostly rejected*/
00740               if (!tessedit_row_rej_good_docs && good_quality_doc) {
                      // Reject only if the word's rejected fraction exceeds the limit
00741                 rej_word =
00742                   page_res_it.word ()->reject_map.
00743                   reject_count () /
00744                   (float) page_res_it.word ()->reject_map.
00745                   length () > tessedit_good_doc_still_rowrej_wd;
00746               }
00748               /* Preserve perfect words anyway */
00749               else if (tessedit_preserve_row_rej_perfect_wds) {
00750                 rej_word =
00751                   (page_res_it.word ()->reject_map.
00752                   reject_count () > 0)
00753                   || (page_res_it.word ()->reject_map.
00754                   length () < tessedit_preserve_min_wd_len);
                      // Same per-word escape as in the block branch, controlled
                      // by tessedit_dont_rowrej_good_wds
00755                 if (rej_word && tessedit_dont_rowrej_good_wds
00756                   && !(page_res_it.word ()->reject_map.
00757                   length () <
00758                   tessedit_preserve_min_wd_len)
00759                   &&
00760                   (acceptable_word_string
00761                   (page_res_it.word ()->best_choice->
00762                 string ().string ()) != AC_UNACCEPTABLE)) {
00763                   word_char_quality (page_res_it.word (),
00764                     page_res_it.row ()->row,
00765                     &char_quality,
00766                     &accepted_char_quality);
00767                   rej_word = char_quality !=
00768                     page_res_it.word ()->reject_map.length ();
00769                 }
00770               }
00771               else
00772                 rej_word = TRUE;
00773               if (rej_word) {
00774                 /*
00775                   Reject spacing if both current and prev words are rejected.
00776                   NOTE - this is NOT restricted to FUZZY spaces.
00777               When tried this generated more space errors.
00778                 */
00779                 if (tessedit_use_reject_spaces &&
00780                   prev_word_rejected &&
00781                   (page_res_it.prev_row () ==
00782                   page_res_it.row ())
00783                   && (page_res_it.word ()->word->space () ==
00784                   1))
00785                   page_res_it.word ()->reject_spaces = TRUE;
00786                 page_res_it.word ()->reject_map.
00787                   rej_word_row_rej(); 
00788               }
00789               prev_word_rejected = rej_word;
00790               page_res_it.forward ();
00791             }
00792           }
00793           else {
00794             #ifndef SECURE_NAMES
00795             if (tessedit_debug_block_rejection)
00796               tprintf
00797                 ("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
00798                 row_no, page_res_it.row ()->char_count,
00799                 page_res_it.row ()->rej_count);
00800             #endif
                  // Row is kept: skip over its words untouched
00801             while ((page_res_it.word () != NULL) &&
00802               (page_res_it.row () == current_row))
00803               page_res_it.forward ();
00804           }
00805         }
00806       }
00807     }
00808   }
00809 }

INT16 failure_count ( WERD_RES word  ) 

Count up all the blanks (' ') in word's best_choice string.

word Word

Definition at line 1484 of file docqual.cpp.

References WERD_RES::best_choice.

Referenced by word_deletable().

01484                                     { 
        // Returns the number of ' ' characters in the word's best-choice text.
        // The string is only read, never modified.
01485   char *str = (char *) word->best_choice->string ().string ();
01486   int tess_rejs = 0;             // running count of blanks
01488   for (; *str != '\0'; str++) {
01489     if (*str == ' ')
01490       tess_rejs++;
01491   }
01492   return tess_rejs;
01493 }

GARBAGE_LEVEL garbage_word ( WERD_RES word,
BOOL8  ok_dict_word 

Determine probability that word is garbage using many heuristics/rules.

word Word in question
ok_dict_word 0 or 1, 1 if word in dictionary
  • crunch_include_numerals,
  • crunch_leave_ok_strings, and
  • crunch_debug
Two steps: first step scans through letters of word and gathers statistics and updates state-machine. Second step works at the word-level together with statistics from first step. Neato.

Definition at line 1181 of file docqual.cpp.

References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, FREQ_DAWG_PERM, G_DODGY, G_NEVER_CRUNCH, G_OK, G_TERRIBLE, REJMAP::length(), NULL, NUMBER_PERM, WERD_RES::reject_map, SYSTEM_DAWG_PERM, tprintf(), and USER_DAWG_PERM.

Referenced by tilde_crunch().

01181                                                                { 
        // Classifies a word as G_NEVER_CRUNCH, G_OK, G_DODGY or G_TERRIBLE.
        //
        // Step 1 scans the best-choice text one character at a time, driving a
        // small state machine that tracks runs of upper-case letters, lower-case
        // letters and digits, and counts isolated letters/digits, blanks
        // (tess rejects) and other "bad" characters.
        // Step 2 combines those counts with the crunch_* settings, the
        // permuter of the best choice and ok_dict_word to pick the level.
01182   enum STATES
01183   {
01184     JUNK,
01185     FIRST_UPPER,
01186     FIRST_LOWER,
01187     FIRST_NUM,
01188     SUBSEQUENT_UPPER,
01189     SUBSEQUENT_LOWER,
01190     SUBSEQUENT_NUM
01191   };
01192   char *str = (char *) word->best_choice->string ().string ();
01193   STATES state = JUNK;
01194   int len = 0;
01195   int isolated_digits = 0;
01196   int isolated_alphas = 0;
01197   int bad_char_count = 0;
01198   int tess_rejs = 0;
01199   int dodgy_chars = 0;
01200   int ok_chars;
01201   char last_char = ' ';
01202   int alpha_repetition_count = 0;
01203   int longest_alpha_repetition_count = 0;
01204   int longest_lower_run_len = 0;
01205   int lower_string_count = 0;
01206   int longest_upper_run_len = 0;
01207   int upper_string_count = 0;
01208   int total_alpha_count = 0;
01209   int total_digit_count = 0;
01211   /* Step 1: Scan letters of word and set up a bunch of variables
01212       Working at the level of individual letters */
01213   for (; *str != '\0'; str++) {
01214     len++;
01215     if (isupper (*str)) {
01216       total_alpha_count++;
01217       switch (state) {
01218         case SUBSEQUENT_UPPER:
01219         case FIRST_UPPER:
01220           state = SUBSEQUENT_UPPER;
01221           upper_string_count++;
01222           if (longest_upper_run_len < upper_string_count)
01223             longest_upper_run_len = upper_string_count;
01224           if (last_char == *str) {
01225             alpha_repetition_count++;
01226             if (longest_alpha_repetition_count < alpha_repetition_count) {
01227               longest_alpha_repetition_count = alpha_repetition_count;
01228             }
01229           }
01230           else {
01231             last_char = *str;
01232             alpha_repetition_count = 1;
01233           }
01234           break;
01235         case FIRST_NUM:
01236           isolated_digits++;     // fall through: start a new upper run
01237         default:
01238           state = FIRST_UPPER;
01239           last_char = *str;
01240           alpha_repetition_count = 1;
01241           upper_string_count = 1;
01242           break;
01243       }
01244     }
01245     else if (islower (*str)) {
01246       total_alpha_count++;
01247       switch (state) {
01248         case SUBSEQUENT_LOWER:
01249         case FIRST_LOWER:
01250           state = SUBSEQUENT_LOWER;
01251           lower_string_count++;
01252           if (longest_lower_run_len < lower_string_count)
01253             longest_lower_run_len = lower_string_count;
01254           if (last_char == *str) {
01255             alpha_repetition_count++;
01256             if (longest_alpha_repetition_count < alpha_repetition_count) {
01257               longest_alpha_repetition_count = alpha_repetition_count;
01258             }
01259           }
01260           else {
01261             last_char = *str;
01262             alpha_repetition_count = 1;
01263           }
01264           break;
01265         case FIRST_NUM:
01266           isolated_digits++;     // fall through: start a new lower run
01267         default:
01268           state = FIRST_LOWER;
01269           last_char = *str;
01270           alpha_repetition_count = 1;
01271           lower_string_count = 1;
01272           break;
01273       }
01274     }
01275     else if (isdigit (*str)) {
01276       total_digit_count++;
01277       switch (state) {
01278         case FIRST_NUM:
01279           state = SUBSEQUENT_NUM; // fall through
01280         case SUBSEQUENT_NUM:
01281           break;
01282         case FIRST_UPPER:
01283         case FIRST_LOWER:
01284           isolated_alphas++;     // fall through: start a new digit run
01285         default:
01286           state = FIRST_NUM;
01287           break;
01288       }
01289     }
01290     else {
01291       if (*str == ' ')
01292         tess_rejs++;
01293       else
01294         bad_char_count++;
01295       switch (state) {
01296         case FIRST_NUM:
01297           isolated_digits++;
01298           break;
01299         case FIRST_UPPER:
01300         case FIRST_LOWER:
01301           isolated_alphas++;     // fall through
01302         default:
01303           break;
01304       }
01305       state = JUNK;
01306     }
01307   }
        // The scan left str on the terminating NUL; rewind it so the string
        // tests below see the whole word rather than an empty string.
01308   str = (char *) word->best_choice->string ().string ();
01309   /* Step 2: Combine result of Step 1 with heuristics to determine
01310      whether word is garbage. Working at the level of whole word */
        // Account for a single letter/digit left pending at end of word
01311   switch (state) {
01312     case FIRST_NUM:
01313       isolated_digits++;
01314       break;
01315     case FIRST_UPPER:
01316     case FIRST_LOWER:
01317       isolated_alphas++;         // fall through
01318     default:
01319       break;
01320   }
01322   if (crunch_include_numerals) {
01323     total_alpha_count += total_digit_count - isolated_digits;
01324   }
01326   if (crunch_leave_ok_strings &&
01327     (len >= 4) &&
01328     (2 * (total_alpha_count - isolated_alphas) > len) &&
01329   (longest_alpha_repetition_count < crunch_long_repetitions)) {
01330     if ((crunch_accept_ok &&
01331       (acceptable_word_string (str) != AC_UNACCEPTABLE)) ||
01332       (longest_lower_run_len > crunch_leave_lc_strings) ||
01333       (longest_upper_run_len > crunch_leave_uc_strings))
01334       return G_NEVER_CRUNCH;
01335   }
        // Multi-character word without blanks that a dictionary/number
        // permuter produced, or that is otherwise acceptable
01336   if ((word->reject_map.length () > 1) &&
01337     (strpbrk (str, " ") == NULL) &&
01338     ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01339     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01340     (word->best_choice->permuter () == USER_DAWG_PERM) ||
01341     (word->best_choice->permuter () == NUMBER_PERM) ||
01342     (acceptable_word_string (str) != AC_UNACCEPTABLE) || ok_dict_word))
01343     return G_OK;
01345   ok_chars = len - bad_char_count - isolated_digits -
01346     isolated_alphas - tess_rejs;
01348   if (crunch_debug > 3) {
01349     tprintf ("garbage_word: \"%s\"\n",
01350       word->best_choice->string ().string ());
01351     tprintf ("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
01352       len,
01353       bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
01354   }
01355   if ((bad_char_count == 0) &&
01356     (tess_rejs == 0) &&
01357     ((len > isolated_digits + isolated_alphas) || (len <= 2)))
01358     return G_OK;
01360   if ((tess_rejs > ok_chars) ||
01361     ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
01362     return G_TERRIBLE;
01364   if (len > 4) {
          // Long words: blanks count double, isolated chars count too
01365     dodgy_chars = 2 * tess_rejs + bad_char_count +
01366       isolated_digits + isolated_alphas;
01367     if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
01368       return G_DODGY;
01369     else
01370       return G_OK;
01371   }
01372   else {
          // Short words: isolated chars are not held against the word
01373     dodgy_chars = 2 * tess_rejs + bad_char_count;
01374     if (((len == 4) && (dodgy_chars > 2)) ||
01375       ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
01376       return G_DODGY;
01377     else
01378       return G_OK;
01379   }
01380 }

void insert_rej_cblobs ( WERD_RES word  ) 

Put rejected word blobs back into the outword.

word Word

Definition at line 1542 of file docqual.cpp.

References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), gblob_sort_list(), REJMAP::initialise(), REJMAP::length(), STRING::length(), WERD_RES::outword, WERD::rej_blob_list(), WERD_RES::reject_map, and TRUE.

Referenced by quality_based_rejection().

01543                                        {
        // Merges the outword's rejected blobs back into its normal blob list,
        // ordered by the left edge of their bounding boxes. For every blob
        // re-inserted, a ' ' is inserted at the matching position of the
        // best-choice text and the matching reject-map entry is flagged with
        // setrej_rej_cblob(). Returns without changes when there are no
        // rejected blobs or the combined length would not fit new_str.
01544   PBLOB_IT blob_it;              // blob iterator
01545   PBLOB_IT rej_blob_it;
01546   const STRING *wordstr;
01547   int old_len;
01548   int rej_len;
01549   char new_str[512];
01550   REJMAP new_map;
01551   int i = 0;                     //new_str index
01552   int j = 0;                     //old_str index
01553   int new_len;
01555   gblob_sort_list (word->outword->rej_blob_list (), TRUE);
01556   rej_blob_it.set_to_list (word->outword->rej_blob_list ());
01557   if (rej_blob_it.empty ())
01558     return;
01559   rej_len = rej_blob_it.length ();
01560   blob_it.set_to_list (word->outword->blob_list ());
01561   wordstr = &(word->best_choice->string ());
01562   old_len = wordstr->length ();
01563   ASSERT_HOST (word->reject_map.length () == old_len);
01564   ASSERT_HOST (blob_it.length () == old_len);
01565   if ((old_len + rej_len) > 511)
01566     return;                      //Word is garbage anyway prevent abort
01567   new_map.initialise (old_len + rej_len);
01569   while (!rej_blob_it.empty ()) {
          // Insert the reject blob when the normal blobs are used up or when
          // it starts at or left of the current normal blob
01570     if ((j >= old_len) ||
01571       (rej_blob_it.data ()->bounding_box ().left () <=
01572     blob_it.data ()->bounding_box ().left ())) {
01573       /* Insert reject blob */
01574       if (j >= old_len)
01575         blob_it.add_to_end (rej_blob_it.extract ());
01576       else
01577         blob_it.add_before_stay_put (rej_blob_it.extract ());
01578       if (!rej_blob_it.empty ())
01579         rej_blob_it.forward ();
01580       new_str[i] = ' ';
01581       new_map[i].setrej_rej_cblob ();
01582       i++;
01583     }
01584     else {
            // Copy the next normal character and its reject-map entry
01585       new_str[i] = (*wordstr)[j];
01586       new_map[i] = word->reject_map[j];
01587       i++;
01588       j++;
01589       blob_it.forward ();
01590     }
01591   }
01592   /* Add any extra normal blobs to strings */
01593   while (j < wordstr->length ()) {
01594     new_str[i] = (*wordstr)[j];
01595     new_map[i] = word->reject_map[j];
01596     i++;
01597     j++;
01598   }
01599   new_str[i] = '\0';
01600   /*
01601     tprintf(
01602           "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
01603           old_len, i, new_str, new_map );
01604   */
01605   ASSERT_HOST (i == blob_it.length ());
01606   ASSERT_HOST (i == old_len + rej_len);
01607   word->reject_map = new_map;
        // wordstr is declared const; the cast lets the best-choice text be
        // overwritten in place
01608   *((STRING *) wordstr) = new_str;
01609   new_len = strlen (word->best_choice->string ().string ());
01610   ASSERT_HOST (word->reject_map.length () == new_len);
01611   ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
01612 }

void merge_tess_fails ( WERD_RES word_res  ) 

Change pairs of tess failures to a single one (merge/collapse).

word_res Results on word in question
none, updates word's reject_map

Definition at line 1133 of file docqual.cpp.

References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), REJMAP::length(), merge_blobs(), WERD_RES::outword, WERD_RES::reject_map, and REJMAP::remove_pos().

Referenced by tilde_crunch(), and tilde_delete().

01134                                           {
        // Collapses every run of adjacent blanks (tess failures) in the
        // best-choice text into a single blank. For each blank removed, the
        // corresponding reject-map position is removed and the two blobs are
        // merged, so text, reject map and blob list keep equal lengths
        // (asserted on entry and on exit).
01135   char *ptr;                     //string ptr
01136   PBLOB_IT blob_it;              //blobs
01137   int i = 0;
01138   int len;
01140   len = strlen (word_res->best_choice->string ().string ());
01141   ASSERT_HOST (word_res->reject_map.length () == len);
01142   ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
01144   ptr = (char *) word_res->best_choice->string ().string ();
01145   blob_it = word_res->outword->blob_list ();
01146   while (*ptr != '\0') {
01147     if ((*ptr == ' ') && (*(ptr + 1) == ' ')) {
            // Source and destination overlap, which strcpy does not allow;
            // memmove is defined for overlap. The +1 copies the terminator.
01148       memmove (ptr + 1, ptr + 2, strlen (ptr + 2) + 1); //shuffle up
01149       word_res->reject_map.remove_pos (i);
01150       merge_blobs (blob_it.data_relative (1), blob_it.data ());
01151       delete blob_it.extract (); //get rid of spare
01152     }
01153     else {
01154       i++;
01155       ptr++;
01156     }
01157     blob_it.forward ();
01158   }
01159   len = strlen (word_res->best_choice->string ().string ());
01160   ASSERT_HOST (word_res->reject_map.length () == len);
01161   ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
01162 }

BOOL8 noise_outlines ( WERD word  ) 

Determine whether the word consists only of 'too small' outlines (noise).

word Word
Global: bln_x_height
TRUE if the count of tiny outlines is >= the count of all outlines in word, i.e. no outline reaches the size limit
Cycle through the blobs of word, looking at the BB of each outline: every outline is counted in outline_count, and it is also counted in small_outline_count if the larger dimension of its BB is less than small_limit. Answer is boolean of the comparison: small_outline_count >= outline_count

Definition at line 1507 of file docqual.cpp.

References WERD::blob_list(), BOX::height(), outline_it, and BOX::width().

Referenced by word_deletable().

01507                                  { 
        // Visits every outline of every blob in the word. outline_count counts
        // all outlines; small_outline_count counts those whose larger
        // bounding-box dimension is below
        // bln_x_height * crunch_small_outlines_size.
        // Returns small_outline_count >= outline_count.
01508   PBLOB_IT blob_it;
01509   OUTLINE_IT outline_it;
01510   BOX box;                       // BB of outline
01511   INT16 outline_count = 0;       // count of all outlines
01512   INT16 small_outline_count = 0; // tiny/noise count
01513   INT16 max_dimension;           // the larger of the BB dims
01514   float small_limit = bln_x_height * crunch_small_outlines_size;
01516   blob_it.set_to_list (word->blob_list ());
01517   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
01518     outline_it.set_to_list (blob_it.data ()->out_list ());
01519     for (outline_it.mark_cycle_pt(); !outline_it.cycled_list(); outline_it.forward()) {
01520       outline_count++;
01521       box = outline_it.data ()->bounding_box ();
01522       if (box.height () > box.width ())
01523         max_dimension = box.height ();
01524       else
01525         max_dimension = box.width ();
01526       if (max_dimension < small_limit)
01527         small_outline_count++;
01528     }
01529   }
01530   return (small_outline_count >= outline_count);
01531 }

BOOL8 potential_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level,
BOOL8  ok_dict_word 

Determine if word could be garbage or otherwise questionable.

word Word
garbage_level garbage level of the word; any value other than G_OK counts as one poor indicator
ok_dict_word 0 or 1, 1 if word in dictionary
  • crunch_pot_indicators,
  • crunch_pot_poor_rate,
  • crunch_pot_poor_cert,
  • crunch_debug
TRUE if count of 'clues' that word sucks >= crunch_pot_indicators

Definition at line 974 of file docqual.cpp.

References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, G_OK, REJMAP::length(), WERD_RES::reject_map, and tprintf().

Referenced by tilde_crunch().

00976                                                 {
        // Counts up to three "poor" indicators for the word -- rating per
        // character, certainty, and garbage level -- and returns whether the
        // count reaches crunch_pot_indicators.
00977   float rating_per_ch;
00978   int adjusted_len;
00979   char *str = (char *) word->best_choice->string ().string (); // word's string
00980   BOOL8 word_crunchable;
00981   int poor_indicator_count = 0;
        // The certainty test below only applies to "crunchable" words: short
        // ones, unacceptable non-dictionary ones, or any word when
        // crunch_leave_accept_strings is off
00983   word_crunchable =
00984     !crunch_leave_accept_strings ||
00985     (word->reject_map.length () < 3) ||
00986     ((acceptable_word_string (str) == AC_UNACCEPTABLE) && !ok_dict_word);
        // Rating is averaged over at most 10 characters
        // NOTE(review): adjusted_len is 0 for an empty reject map -- confirm
        // callers never pass one.
00988   adjusted_len = word->reject_map.length ();
00989   if (adjusted_len > 10)
00990     adjusted_len = 10;
00991   rating_per_ch = word->best_choice->rating () / adjusted_len;
00993   if (rating_per_ch > crunch_pot_poor_rate) {
00994     if (crunch_debug > 2) {
00995       tprintf ("Potential poor rating on \"%s\"\n",
00996         word->best_choice->string ().string ());
00997     }
00998     poor_indicator_count++;
00999   }
01001   if (word_crunchable &&
01002   (word->best_choice->certainty () < crunch_pot_poor_cert)) {
01003     if (crunch_debug > 2) {
01004       tprintf ("Potential poor cert on \"%s\"\n",
01005         word->best_choice->string ().string ());
01006     }
01007     poor_indicator_count++;
01008   }
01010   if (garbage_level != G_OK) {
01011     if (crunch_debug > 2) {
01012       tprintf ("Potential garbage on \"%s\"\n",
01013         word->best_choice->string ().string ());
01014     }
01015     poor_indicator_count++;
01016   }
01017   return (poor_indicator_count >= crunch_pot_indicators);
01018 }

void print_boxes ( WERD word  ) 

Print all bounding boxes for blobs in word.

word Word in question

Definition at line 447 of file docqual.cpp.

References WERD::blob_list(), and BOX::print().

00447                              { 
        // Debug helper: calls print() on the bounding box of every blob in the
        // word, in list order.
00448   PBLOB_IT it;
00449   BOX box;  // bounding box
00451   it.set_to_list (word->blob_list ());
00452   for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
00453     box = it.data ()->bounding_box ();
00454     box.print ();
00455   }
00456 }

void quality_based_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 

Deal with rejected blobs and clean up errant tildes.

page_res_it pointer to page in question ?
good_quality_doc 0 or 1, which pass ?
  • tessedit_good_quality_unrej and
  • unlv_tilde_crunching
none, but word's reject_map is updated

Definition at line 496 of file docqual.cpp.

References doc_and_block_rejection(), insert_rej_cblobs(), NULL, tilde_crunch(), tilde_delete(), and unrej_good_quality_words().

Referenced by recog_all_words().

00497                                                      {
        // Runs the quality-driven rejection steps in order:
        //  1. optionally un-reject good-quality words (good documents only),
        //  2. document/block/row rejection,
        //  3. re-insert rejected blobs into every word on the page,
        //  4. optionally crunch and delete tilde words.
00498   if ((tessedit_good_quality_unrej && good_quality_doc))
00499     unrej_good_quality_words(page_res_it); 
00500   doc_and_block_rejection(page_res_it, good_quality_doc); 
00502   page_res_it.restart_page ();
00503   while (page_res_it.word () != NULL) {
00504     insert_rej_cblobs (page_res_it.word ());
00505     page_res_it.forward ();
00506   }
00508   if (unlv_tilde_crunching) {
00509     tilde_crunch(page_res_it); 
00510     tilde_delete(page_res_it); 
00511   }
00512 }

void reject_whole_page ( PAGE_RES_IT page_res_it  ) 

Don't believe any of it; set map to 00..00 for all words.

Go through all words on page and set their reject_map to rejected, then update page's page_res.

Definition at line 822 of file docqual.cpp.

References PAGE_RES_IT::forward(), NULL, PAGE_RES_IT::page_res, REJMAP::rej_word_doc_rej(), WERD_RES::reject_map, PAGE_RES_IT::restart_page(), TRUE, and PAGE_RES_IT::word().

Referenced by doc_and_block_rejection().

00822                                                  { 
00823   page_res_it.restart_page ();
00824   while (page_res_it.word () != NULL) {
00825     page_res_it.word ()->reject_map.rej_word_doc_rej ();
00826     page_res_it.forward ();
00827   }
00828   page_res_it.page_res->rejected = TRUE; // whole page is rejected
00829 }

BOOL8 terrible_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level 

Determine if word very likely is utter garbage, tuned with several globals.

word Word
  • crunch_debug,
  • crunch_terrible_rating,
  • crunch_terrible_garbage,
  • crunch_poor_garbage_cert,
  • crunch_poor_garbage_rate,
  • crunch_rating_max
TRUE if any of the 'terrible word' tests fires (crunch_mode > 0), FALSE otherwise

Definition at line 926 of file docqual.cpp.

References WERD_RES::best_choice, FALSE, G_OK, G_TERRIBLE, REJMAP::length(), WERD_RES::reject_map, tprintf(), and TRUE.

Referenced by tilde_crunch().

00926                                                                         { 
00927   float rating_per_ch;
00928   int adjusted_len;
00929   int crunch_mode = 0;
00931   if ((word->best_choice->string ().length () == 0) ||
00932     (strspn (word->best_choice->string ().string (), " ") ==
00933     word->best_choice->string ().length ()))
00934     crunch_mode = 1;
00935   else {
00936     adjusted_len = word->reject_map.length ();
00937     if (adjusted_len > crunch_rating_max)
00938       adjusted_len = crunch_rating_max;
00939     rating_per_ch = word->best_choice->rating () / adjusted_len;
00941     if (rating_per_ch > crunch_terrible_rating)
00942       crunch_mode = 2;
00943     else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
00944       crunch_mode = 3;
00945     else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
00946       (garbage_level != G_OK))
00947       crunch_mode = 4;
00948     else if ((rating_per_ch > crunch_poor_garbage_rate) &&
00949       (garbage_level != G_OK))
00950       crunch_mode = 5;
00951   }
00952   if (crunch_mode > 0) {
00953     if (crunch_debug > 2) {
00954       tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
00955         crunch_mode, word->best_choice->string ().string ());
00956     }
00957     return TRUE;
00958   }
00959   else
00960     return FALSE;
00961 }

void tilde_crunch ( PAGE_RES_IT page_res_it  ) 

Definition at line 834 of file docqual.cpp.

References WERD_RES::best_choice, convert_bad_unlv_chs(), CR_KEEP_SPACE, dict_word(), DOC_DAWG_PERM, FALSE, PAGE_RES_IT::forward(), G_NEVER_CRUNCH, garbage_word(), merge_tess_fails(), NULL, potential_word_crunch(), PAGE_RES_IT::restart_page(), terrible_word_crunch(), tprintf(), TRUE, WERD_RES::unlv_crunch_mode, and PAGE_RES_IT::word().

Referenced by quality_based_rejection().

00834                                             { 
00835   WERD_RES *word;
00836   GARBAGE_LEVEL garbage_level;
00837   PAGE_RES_IT copy_it;
00838   BOOL8 prev_potential_marked = FALSE;
00839   BOOL8 found_terrible_word = FALSE;
00840   int dict_type;
00841   BOOL8 ok_dict_word;
00843   page_res_it.restart_page ();
00844   while (page_res_it.word () != NULL) {
00845     word = page_res_it.word ();
00847     if (crunch_early_convert_bad_unlv_chs)
00848       convert_bad_unlv_chs(word); 
00850     if (crunch_early_merge_tess_fails)
00851       merge_tess_fails(word); 
00853     if (word->reject_map.accept_count () != 0) {
00854       found_terrible_word = FALSE;
00855       prev_potential_marked = FALSE; // Forget earlier potential crunches
00856     }
00857     else {
00858       dict_type = dict_word (word->best_choice->string ().string ());
00859       ok_dict_word = (dict_type > 0) && (dict_type != DOC_DAWG_PERM);
00860       garbage_level = garbage_word (word, ok_dict_word);
00862       if ((garbage_level != G_NEVER_CRUNCH) &&
00863       (terrible_word_crunch (word, garbage_level))) {
00864         if (crunch_debug > 0) {
00865           tprintf ("T CRUNCHING: \"%s\"\n",
00866             word->best_choice->string ().string ());
00867         }
00868         word->unlv_crunch_mode = CR_KEEP_SPACE;
00869         if (prev_potential_marked) {
00870           while (copy_it.word () != word) {
00871             if (crunch_debug > 0) {
00872               tprintf ("P1 CRUNCHING: \"%s\"\n",
00873                 copy_it.word ()->best_choice->string ().
00874                 string ());
00875             }
00876             copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
00877             copy_it.forward ();
00878           }
00879           prev_potential_marked = FALSE;
00880         }
00881         found_terrible_word = TRUE;
00882       }
00883       else if ((garbage_level != G_NEVER_CRUNCH) &&
00884         (potential_word_crunch (word,
00885       garbage_level, ok_dict_word))) {
00886         if (found_terrible_word) {
00887           if (crunch_debug > 0) {
00888             tprintf ("P2 CRUNCHING: \"%s\"\n",
00889               word->best_choice->string ().string ());
00890           }
00891           word->unlv_crunch_mode = CR_KEEP_SPACE;
00892         }
00893         else if (!prev_potential_marked) {
00894           copy_it = page_res_it;
00895           prev_potential_marked = TRUE;
00896           if (crunch_debug > 1) {
00897             tprintf ("P3 CRUNCHING: \"%s\"\n",
00898               word->best_choice->string ().string ());
00899           }
00900         }
00901       }
00902       else {
00903         found_terrible_word = FALSE;
00904         prev_potential_marked = FALSE; // Forget earlier potential crunches
00905         if (crunch_debug > 2) {
00906           tprintf ("NO CRUNCH: \"%s\"\n",
00907             word->best_choice->string ().string ());
00908         }
00909       }
00910     }
00911     page_res_it.forward ();
00912   }
00913 }

void tilde_delete ( PAGE_RES_IT page_res_it  ) 

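Mark crunched words for deletion: deletable words running from the beginning of a line, or up to the end of a line, have their unlv_crunch_mode set to the mode returned by word_deletable().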

page_res_it Results of page
Globals: crunch_early_merge_tess_fails, crunch_debug
none, but word's maps are modified!
Calls word_deletable()

Definition at line 1030 of file docqual.cpp.

References WERD_RES::best_choice, CR_NONE, FALSE, WERD::flag(), PAGE_RES_IT::forward(), merge_tess_fails(), NULL, PAGE_RES_IT::restart_page(), tprintf(), TRUE, WERD_RES::unlv_crunch_mode, W_BOL, W_EOL, WERD_RES::word, PAGE_RES_IT::word(), and word_deletable().

Referenced by quality_based_rejection().

01030                                             { 
01031   WERD_RES *word;
01032   PAGE_RES_IT copy_it;
01033   BOOL8 deleting_from_bol = FALSE;
01034   BOOL8 marked_delete_point = FALSE;
01035   INT16 debug_delete_mode;
01036   CRUNCH_MODE delete_mode;
01037   INT16 x_debug_delete_mode;
01038   CRUNCH_MODE x_delete_mode;
01040   page_res_it.restart_page ();
01041   while (page_res_it.word () != NULL) {
01042     word = page_res_it.word ();
01044     delete_mode = word_deletable (word, debug_delete_mode);
01045     if (delete_mode != CR_NONE) {
01046       if (word->word->flag (W_BOL) || deleting_from_bol) {
01047         if (crunch_debug > 0) {
01048           tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
01049             debug_delete_mode,
01050             word->best_choice->string ().string ());
01051         }
01052         word->unlv_crunch_mode = delete_mode;
01053         deleting_from_bol = TRUE;
01054       }
01055       else if (word->word->flag (W_EOL)) {
01056         if (marked_delete_point) {
01057           while (copy_it.word () != word) {
01058             x_delete_mode = word_deletable (copy_it.word (),
01059               x_debug_delete_mode);
01060             if (crunch_debug > 0) {
01061               tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
01062                 x_debug_delete_mode,
01063                 copy_it.word ()->best_choice->string ().
01064                 string ());
01065             }
01066             copy_it.word ()->unlv_crunch_mode = x_delete_mode;
01067             copy_it.forward ();
01068           }
01069         }
01070         if (crunch_debug > 0) {
01071           tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
01072             debug_delete_mode,
01073             word->best_choice->string ().string ());
01074         }
01075         word->unlv_crunch_mode = delete_mode;
01076         deleting_from_bol = FALSE;
01077         marked_delete_point = FALSE;
01078       }
01079       else {
01080         if (!marked_delete_point) {
01081           copy_it = page_res_it;
01082           marked_delete_point = TRUE;
01083         }
01084       }
01085     }
01086     else {
01087       deleting_from_bol = FALSE;
01088       marked_delete_point = FALSE; // Forget earlier potential crunches
01089     }
01090     /* The following step has been left till now as the tess fails are used to
01091       determine if the word is deletable.
01092     */
01093     if (!crunch_early_merge_tess_fails)
01094       merge_tess_fails(word); 
01095     page_res_it.forward ();
01096   }
01097 }

void unrej_good_chs ( WERD_RES word,
ROW row 

Unreject POTENTIAL rejects if the blob passes the blob and outline checks.

word Word in question
row word's row?
  • docqual_excuse_outline_errs and
  • bln_x_height
none, updates word's reject_map

Definition at line 371 of file docqual.cpp.

References WERD_RES::best_choice, WERD::blob_list(), PBLOB::bounding_box(), count_outline_errs(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, WERD_RES::reject_map, DENORM::scale(), and WERD_RES::word.

Referenced by unrej_good_quality_words().

00371                                               { 
00372   WERD *bln_word;                //BL norm init word
00373   TWERD *tessword;               //tess format
00374   WERD *init_word;               //BL norm init word
00375   PBLOB_IT outword_it;
00376   PBLOB_IT initial_it;
00377   INT16 i;
00378   INT16 init_blobs_left;
00379   BOOL8 matched;
00380   BOX out_box;
00381   PBLOB *test_blob;
00382   DENORM denorm;
00383   float bln_xht;
00384   INT16 j = 0;
00386   if (word->word->gblob_list ()->empty ())
00387     return;
00389   bln_xht = bln_x_height / word->denorm.scale (); // xht used for blnorm
00390   bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00391   /* NOTE: Need to convert to tess format and back again to ensure that the
00392     same float -> int rounding of coords is done to source wd as out wd before
00393     comparison
00394   */
00395   tessword = make_tess_word (bln_word, NULL);
00396   init_word = make_ed_word (tessword, bln_word); // convert word
00397   delete bln_word;
00398   delete_word(tessword);  // get rid of it
00400   initial_it.set_to_list (init_word->blob_list ());
00401   init_blobs_left = initial_it.length ();
00402   outword_it.set_to_list (word->outword->blob_list ());
00404   for (outword_it.mark_cycle_pt ();
00405   !outword_it.cycled_list (); outword_it.forward ()) {
00406     out_box = outword_it.data ()->bounding_box ();
00408     /* Skip any initial blobs LEFT of current outword blob */
00409     while (!initial_it.at_last () &&
00410     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00411       initial_it.forward ();
00412       init_blobs_left--;
00413     }
00415     /* See if current outword blob matches any initial blob with the same left
00416       coord. (Normally only one but possibly more - in unknown order) */
00417     i = 0;
00418     matched = FALSE;
00419     do {
00420       test_blob = initial_it.data_relative (i++);
00421       matched = crude_match_blobs (test_blob, outword_it.data ());
00422       if (matched &&
00423         (word->reject_map[j].accept_if_good_quality ()) &&
00424         (docqual_excuse_outline_errs ||
00425         (count_outline_errs (word->best_choice->string ()[j],
00426         outword_it.data ()->out_list ()->
00427         length ()) == 0)))
00428         word->reject_map[j].setrej_quality_accept ();
00429     }
00430     while (!matched &&
00431       (init_blobs_left - i > 0) &&
00432       (i < 129) &&
00433       !initial_it.at_last () &&
00434       test_blob->bounding_box ().left () == out_box.left ());
00435     j++;
00436   }
00437   delete init_word;
00438 }

void unrej_good_quality_words ( PAGE_RES_IT page_res_it  ) 

Unreject potential rejects.

page_res_it pointer to page in question ?
Accept potential rejects in words which pass some checks

- Contains a potential reject
- Word looks like a sensible alpha word.
- Word segmentation is the same as the original image
	- All characters have the expected number of outlines
	- NOTE: the rejection counts are recalculated after unrejection
	- CANT do it in a single pass without a bit of fiddling
	- keep it simple but inefficient

Definition at line 533 of file docqual.cpp.

References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, PAGE_RES_IT::block(), BLOCK_RES::char_count, ROW_RES::char_count, check_debug_pt(), PAGE_RES_IT::forward(), REJMAP::length(), NULL, PAGE_RES_IT::page_res, REJMAP::quality_recoverable_rejects(), BLOCK_RES::rej_count, ROW_RES::rej_count, PAGE_RES_IT::rej_stat_word(), WERD_RES::reject_map, PAGE_RES_IT::restart_page(), ROW_RES::row, PAGE_RES_IT::row(), unrej_good_chs(), ROW_RES::whole_word_rej_count, and PAGE_RES_IT::word().

Referenced by quality_based_rejection().

00534                                                         {
00535   WERD_RES *word;
00536   ROW_RES *current_row;
00537   BLOCK_RES *current_block;
00538   int i;
00540   page_res_it.restart_page ();
00541   while (page_res_it.word () != NULL) {
00542     check_debug_pt (page_res_it.word (), 100);
00543     if (bland_unrej) {
00544       word = page_res_it.word ();
00545       for (i = 0; i < word->reject_map.length (); i++) {
00546         if (word->reject_map[i].accept_if_good_quality ())
00547           word->reject_map[i].setrej_quality_accept ();
00548       }
00549       page_res_it.forward ();
00550     }
00551     else if ((page_res_it.row ()->char_count > 0) &&
00552       ((page_res_it.row ()->rej_count /
00553       (float) page_res_it.row ()->char_count) <=
00554     quality_rowrej_pc)) {
00555       word = page_res_it.word ();
00556       if (word->reject_map.quality_recoverable_rejects () &&
00557         (tessedit_unrej_any_wd ||
00558         acceptable_word_string (word->best_choice->string ().string ())
00559       != AC_UNACCEPTABLE)) {
00560         unrej_good_chs (word, page_res_it.row ()->row);
00561       }
00562       page_res_it.forward ();
00563     }
00564     else {
00565       /* Skip to end of dodgy row */
00566       current_row = page_res_it.row ();
00567       while ((page_res_it.word () != NULL) &&
00568         (page_res_it.row () == current_row))
00569         page_res_it.forward ();
00570     }
00571     check_debug_pt (page_res_it.word (), 110);
00572   }
00573   page_res_it.restart_page ();
00574   page_res_it.page_res->char_count = 0;
00575   page_res_it.page_res->rej_count = 0;
00576   current_block = NULL;
00577   current_row = NULL;
00578   while (page_res_it.word () != NULL) {
00579     if (current_block != page_res_it.block ()) {
00580       current_block = page_res_it.block ();
00581       current_block->char_count = 0;
00582       current_block->rej_count = 0;
00583     }
00584     if (current_row != page_res_it.row ()) {
00585       current_row = page_res_it.row ();
00586       current_row->char_count = 0;
00587       current_row->rej_count = 0;
00588       current_row->whole_word_rej_count = 0;
00589     }
00590     page_res_it.rej_stat_word ();
00591     page_res_it.forward ();
00592   }
00593 }

INT16 word_blob_quality ( WERD_RES word,
ROW row 

Find number of blobs in outword that are identical to those of inword.

word Word in question
row Row the word came from
Count of good blobs
ASSUME blobs in both initial word and outword are in ascending order of left hand blob edge.

outword is word on edges of block vs inword is any word within a block, right (in word_blob_quality())?

Definition at line 139 of file docqual.cpp.

References WERD::blob_list(), PBLOB::bounding_box(), cprintf(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, DENORM::scale(), and WERD_RES::word.

Referenced by recog_all_words(), and recog_interactive().

00141                                   {
00142   WERD *bln_word;                //BL norm init word
00143   TWERD *tessword;               //tess format
00144   WERD *init_word;               //BL norm init word
00145   PBLOB_IT outword_it;
00146   PBLOB_IT initial_it;
00147   INT16 i;
00148   INT16 init_blobs_left;
00149   INT16 match_count = 0;
00150   BOOL8 matched;
00151   BOX out_box;
00152   PBLOB *test_blob;
00153   DENORM denorm;
00154   float bln_xht;
00156 #ifdef TEXT_VERBOSE
00157   // gets a 'v', see ccmain/tesseractmain.dox
00158   cprintf("v");
00159 #endif
00160   if (word->word->gblob_list ()->empty ())
00161     return 0;
00162   bln_xht = bln_x_height / word->denorm.scale (); //xht used for blnorm
00163   bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00165   /* NOTE: Need to convert to tess format and back again to ensure that the
00166     same float -> int rounding of coords is done to source wd as out wd before
00167     comparison
00168   */
00170   //   if (!bln_word->flag(W_POLYGON))
00171   //           tprintf( "NON POLYGON BLN WERD\n");
00172   tessword = make_tess_word (bln_word, NULL);
00173   init_word = make_ed_word (tessword, bln_word); // convert word
00174   //   if (!init_word->flag(W_POLYGON))
00175   //         tprintf( "NON POLYGON INIT WERD\n");
00176   //   tprintf( "SOURCE BLOBS-AFTER TESS:\n");
00177   //   print_boxes( init_word );
00178   //   tprintf( "OUTPUT BLOBS:\n");
00179   //   print_boxes( word->outword );
00181   initial_it.set_to_list (init_word->blob_list ());
00182   init_blobs_left = initial_it.length ();
00183   outword_it.set_to_list (word->outword->blob_list ());
00184   delete bln_word;
00185   delete_word(tessword);  //get rid of it
00187   for (outword_it.mark_cycle_pt ();
00188   !outword_it.cycled_list (); outword_it.forward ()) {
00189     out_box = outword_it.data ()->bounding_box ();
00191     /* Skip any initial blobs LEFT of current outword blob */
00192     while (!initial_it.at_last () &&
00193     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00194       initial_it.forward ();
00195       init_blobs_left--;
00196     }
00198     /* See if current outword blob matches any initial blob with the same left
00199       coord. (Normally only one but possibly more - in unknown order) */
00201     i = 0;
00202     matched = FALSE;
00203     do {
00204       test_blob = initial_it.data_relative (i++);
00205       matched = crude_match_blobs (test_blob, outword_it.data ());
00206       if (matched)
00207         match_count++;
00208     }
00209     while (!matched &&
00210       (init_blobs_left - i > 0) &&
00211       (i < 129) &&
00212       !initial_it.at_last () &&
00213       test_blob->bounding_box ().left () == out_box.left ());
00214   }
00215   delete init_word;
00216   return match_count;
00217 }

void word_char_quality ( WERD_RES word,
ROW row,
INT16 match_count,
INT16 accepted_match_count 

Check word's blobs' quality.

word Word in question
row word's row?
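match_count Output: number of outword blobs that match an initial blob and whose character has the expected number of outlines
accepted_match_count Output: number of those matching characters that are also accepted in the word's reject_map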
Global: bln_x_height
Combines the blob quality and outline quality checks to determine how many good chars there are - i.e., chars which pass the blob AND outline tests.

Definition at line 277 of file docqual.cpp.

References WERD_RES::best_choice, WERD::blob_list(), PBLOB::bounding_box(), count_outline_errs(), cprintf(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, WERD_RES::reject_map, DENORM::scale(), and WERD_RES::word.

Referenced by classify_word_pass2(), doc_and_block_rejection(), recog_all_words(), and recog_interactive().

00281                                                     {
00282   WERD *bln_word;                //BL norm init word
00283   TWERD *tessword;               //tess format
00284   WERD *init_word;               //BL norm init word
00285   PBLOB_IT outword_it;
00286   PBLOB_IT initial_it;
00287   INT16 i;
00288   INT16 init_blobs_left;
00289   BOOL8 matched;
00290   BOX out_box;
00291   PBLOB *test_blob;
00292   DENORM denorm;
00293   float bln_xht;
00294   INT16 j = 0;
00296 #ifdef TEXT_VERBOSE
00297   // gets a 'y', see ccmain/tessvars.doxfg
00298   cprintf("y");
00299 #endif
00300   *match_count = 0;
00301   *accepted_match_count = 0;
00302   if (word->word->gblob_list ()->empty ())
00303     return;
00305   bln_xht = bln_x_height / word->denorm.scale (); //xht used for blnorm
00306   bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00307   /* NOTE: Need to convert to tess format and back again to ensure that the
00308     same float -> int rounding of coords is done to source wd as out wd before
00309     comparison
00310   */
00311   tessword = make_tess_word (bln_word, NULL);
00312   init_word = make_ed_word (tessword, bln_word); //convert word
00313   delete bln_word;
00314   delete_word(tessword);  //get rid of it
00315   //   tprintf( "SOURCE BLOBS-AFTER TESS:\n");
00316   //   print_boxes( init_word );
00317   //   tprintf( "OUTPUT BLOBS:\n");
00318   //   print_boxes( word->outword );
00320   initial_it.set_to_list (init_word->blob_list ());
00321   init_blobs_left = initial_it.length ();
00322   outword_it.set_to_list (word->outword->blob_list ());
00324   for (outword_it.mark_cycle_pt ();
00325   !outword_it.cycled_list (); outword_it.forward ()) {
00326     out_box = outword_it.data ()->bounding_box ();
00328     /* Skip any initial blobs LEFT of current outword blob */
00329     while (!initial_it.at_last () &&
00330     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00331       initial_it.forward ();
00332       init_blobs_left--;
00333     }
00335     /* See if current outword blob matches any initial blob with the same left
00336       coord. (Normally only one but possibly more - in unknown order) */
00338     i = 0;
00339     matched = FALSE;
00340     do {
00341       test_blob = initial_it.data_relative (i++);
00342       matched = crude_match_blobs (test_blob, outword_it.data ());
00343       if (matched &&
00344         (count_outline_errs (word->best_choice->string ()[j],
00345         outword_it.data ()->out_list ()->length ()) == 0)) {
00346         (*match_count)++;
00347         if (word->reject_map[j].accepted ())
00348           (*accepted_match_count)++;
00349       }
00350     }
00351     while (!matched &&
00352       (init_blobs_left - i > 0) &&
00353       (i < 129) &&
00354       !initial_it.at_last () &&
00355       test_blob->bounding_box ().left () == out_box.left ());
00356     j++;
00357   }
00358   delete init_word;
00359 }

CRUNCH_MODE word_deletable ( WERD_RES word,
INT16 delete_mode 

Delete suspicious word if a bunch of constraints are met.

word Word in question
delete_mode Output for tilde_delete(): a debug code (0..11) identifying which test fired; 0 means the word is not deletable
  • bln_baseline_offset,
  • bln_x_height,
  • crunch_del_min_ht,
  • crunch_del_min_width,
  • crunch_del_max_ht,
  • crunch_del_low_word,
  • crunch_del_high_word,
  • crunch_del_rating,
  • crunch_del_cert.
  • CR_NONE,
   Word is crunched AND
   ( string length = 0                                          OR
     > 50% of chars are "|" (before merging)                    OR
     certainty < -10                                            OR
     rating /char > 60                                          OR
     TOP of word is more than 0.5 xht BELOW baseline            OR
     BOTTOM of word is more than 0.5 xht ABOVE xht              OR
     length of word < 3xht                                      OR
     height of word < 0.7 xht                                   OR
     height of word > 3.0 xht                                   OR
     >75% of the outline BBs have longest dimension < 0.5xht )

Definition at line 1409 of file docqual.cpp.

References BOX::bottom(), WERD::bounding_box(), CR_DELETE, CR_LOOSE_SPACE, CR_NONE, failure_count(), BOX::height(), REJMAP::length(), noise_outlines(), WERD_RES::outword, WERD_RES::reject_map, BOX::top(), WERD_RES::unlv_crunch_mode, and BOX::width().

Referenced by tilde_delete().

01409                                                                { 
01410   int word_len = word->reject_map.length ();
01411   float rating_per_ch;
01412   BOX box;                       // BB of word
01414   if (word->unlv_crunch_mode == CR_NONE) {
01415     delete_mode = 0;
01416     return CR_NONE;
01417   }
01419   if (word_len == 0) {
01420     delete_mode = 1;
01421     return CR_DELETE;
01422   }
01424   box = word->outword->bounding_box ();
01425   if (box.height () < crunch_del_min_ht * bln_x_height) {
01426     delete_mode = 4;
01427     return CR_DELETE;
01428   }
01430   if (noise_outlines (word->outword)) {
01431     delete_mode = 5;
01432     return CR_DELETE;
01433   }
01435   if ((failure_count (word) * 1.5) > word_len) {
01436     delete_mode = 2;
01437     return CR_LOOSE_SPACE;
01438   }
01440   if (word->best_choice->certainty () < crunch_del_cert) {
01441     delete_mode = 7;
01442     return CR_LOOSE_SPACE;
01443   }
01445   rating_per_ch = word->best_choice->rating () / word_len;
01447   if (rating_per_ch > crunch_del_rating) {
01448     delete_mode = 8;
01449     return CR_LOOSE_SPACE;
01450   }
01452   if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
01453     delete_mode = 9;
01454     return CR_LOOSE_SPACE;
01455   }
01457   if (box.bottom () >
01458   bln_baseline_offset + crunch_del_high_word * bln_x_height) {
01459     delete_mode = 10;
01460     return CR_LOOSE_SPACE;
01461   }
01463   if (box.height () > crunch_del_max_ht * bln_x_height) {
01464     delete_mode = 11;
01465     return CR_LOOSE_SPACE;
01466   }
01468   if (box.width () < crunch_del_min_width * bln_x_height) {
01469     delete_mode = 3;
01470     return CR_LOOSE_SPACE;
01471   }
01473   delete_mode = 0;
01474   return CR_NONE;
01475 }

INT16 word_outline_errs ( WERD_RES word  ) 

Count errors in word's outlines using count_outline_errs().

word Word
count of errors

Definition at line 246 of file docqual.cpp.

References WERD_RES::best_choice, WERD::blob_list(), count_outline_errs(), and WERD_RES::outword.

Referenced by recog_all_words(), and recog_interactive().

00247                                         {
00248   PBLOB_IT outword_it;
00249   INT16 i = 0;
00250   INT16 err_count = 0;
00252   outword_it.set_to_list (word->outword->blob_list ());
00254   for (outword_it.mark_cycle_pt ();
00255   !outword_it.cycled_list (); outword_it.forward ()) {
00256     err_count += count_outline_errs (word->best_choice->string ()[i],
00257       outword_it.data ()->out_list ()-> length ());
00258     i++;
00259   }
00260   return err_count;
00261 }

Generated on Wed Feb 28 19:49:14 2007 for Tesseract by  doxygen 1.5.1