ccmain/docqual.cpp File Reference

#include "mfcpch.h"
#include <ctype.h>
#include "docqual.h"
#include "tstruct.h"
#include "tfacep.h"
#include "reject.h"
#include "tessvars.h"
#include "genblob.h"
#include "secname.h"

Go to the source code of this file.

Defines

Functions


Define Documentation

#define EXTERN

Note:
File: docqual.cpp (Formerly docqual.c)
Document Quality Metrics
Author:
Phil Cheatle
Date:
Mon May 9 11:27:28 BST 1994
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.

Definition at line 33 of file docqual.cpp.


Function Documentation

void convert_bad_unlv_chs ( WERD_RES word_res  ) 

Converts all '~' to '-' & '^' to ' ' in this word.

Parameters:
word_res Word results
Returns:
none, instead updates word's reject_map

Definition at line 1106 of file docqual.cpp.

References WERD_RES::best_choice, REJMAP::length(), and WERD_RES::reject_map.

Referenced by tilde_crunch().

01107                                               {  // Replaces '~' with '-' and '^' with ' ' in the best-choice text, in place
01108   char *ptr;                     // writable alias of the best-choice text
01109   int i;                         // index shared by the text and the reject map
01110 
01111   ptr = (char *) word_res->best_choice->string ().string ();  // cast so the characters can be overwritten
01112   for (i = 0; i < word_res->reject_map.length (); i++) {  // bound is the reject map length, not strlen (ptr)
01113     if (ptr[i] == '~') {
01114       ptr[i] = '-';
01115       if (word_res->reject_map[i].accepted ())  // only entries that are still accepted are changed
01116         word_res->reject_map[i].setrej_unlv_rej ();
01117     }
01118     if (ptr[i] == '^') {
01119       ptr[i] = ' ';
01120       if (word_res->reject_map[i].accepted ())  // same rule as for '~'
01121         word_res->reject_map[i].setrej_unlv_rej ();
01122     }
01123   }
01124 }

INT16 count_outline_errs ( char  c,
INT16  outline_count 
)

Test if character is in known-odd categories.

Parameters:
c character
outline_count outlines we have
Returns:
difference between outline_count and what it should be
So, count_outline_errs() returns 0 if c has the EXPECTED number of outlines.

Definition at line 475 of file docqual.cpp.

Referenced by unrej_good_chs(), word_char_quality(), and word_outline_errs().

00475                                                       {  // Returns |outline_count - expected count|, or 0 for chars listed in outlines_odd
00476   int expected_outline_count;  // 2 for chars listed in outlines_2, otherwise 1
00477 
00478   if (STRING (outlines_odd).contains (c))
00479     return 0;                    // chars in outlines_odd never contribute an error
00480   else if (STRING (outlines_2).contains (c))
00481     expected_outline_count = 2;
00482   else
00483     expected_outline_count = 1;
00484   return abs (outline_count - expected_outline_count);  // magnitude only: too many and too few count alike
00485 }

BOOL8 crude_match_blobs ( PBLOB blob1,
PBLOB blob2 
)

Compares two blobs' bounding boxes and their numbers of outlines.

Parameters:
blob1 first blob
blob2 second blob
Returns:
TRUE if blob1 == blob2

Definition at line 227 of file docqual.cpp.

References PBLOB::bounding_box(), BOX::contains(), FALSE, PBLOB::out_list(), and TRUE.

Referenced by unrej_good_chs(), word_blob_quality(), and word_char_quality().

00227                                                     {  // TRUE when the two blobs have equal bounding boxes and the same number of outlines
00228   BOX box1 = blob1->bounding_box ();
00229   BOX box2 = blob2->bounding_box ();
00230 
00231   if (box1.contains (box2) &&  // mutual containment is how box equality is tested
00232     box2.contains (box1) &&
00233     (blob1->out_list ()->length () == blob2->out_list ()->length ()))  // compare blob1 against blob2 (was blob1 against itself, always true)
00234     return TRUE;
00235   else
00236     return FALSE;
00237 }

void doc_and_block_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 
)

Reject big chunks.

Parameters:
page_res_it pointer to page in question ?
good_quality_doc 1 or 0, document is good
Returns:
none
Reject all of the page, or a block of it, if it's deemed bad (too many rejects)

If the page has too many rejects - reject all of it. If any block has too many rejects - reject all words in the block

Definition at line 608 of file docqual.cpp.

References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, PAGE_RES_IT::block(), BLOCK_RES::block, BLOCK_RES::char_count, ROW_RES::char_count, FALSE, PAGE_RES_IT::forward(), REJMAP::length(), NULL, PAGE_RES_IT::page_res, PAGE_RES_IT::prev_row(), BLOCK_RES::rej_count, ROW_RES::rej_count, REJMAP::rej_word_block_rej(), REJMAP::reject_count(), WERD_RES::reject_map, WERD_RES::reject_spaces, reject_whole_page(), PAGE_RES_IT::restart_page(), PAGE_RES_IT::row(), ROW_RES::row, SECURE_NAMES, WERD::space(), tprintf(), TRUE, ROW_RES::whole_word_rej_count, PAGE_RES_IT::word(), WERD_RES::word, and word_char_quality().

Referenced by quality_based_rejection().

00610                                                      {  // Rejects the whole page, or else whole blocks / rows, whose reject percentage exceeds the configured limits
00611   INT16 block_no = 0;  // id used in the debug messages only
00612   INT16 row_no = 0;  // row counter within the current block, used in the debug messages only
00613   BLOCK_RES *current_block;
00614   ROW_RES *current_row;
00615 
00616   BOOL8 rej_word;  // whether the current word is to be rejected
00617   BOOL8 prev_word_rejected;  // whether the previously visited word was rejected
00618   INT16 char_quality;
00619   INT16 accepted_char_quality;  // filled in by word_char_quality () but not read here
00620 
00621   if ((page_res_it.page_res->rej_count * 100.0 /  // NOTE(review): no char_count > 0 guard here, unlike the block and row tests below
00622   page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
00623     reject_whole_page(page_res_it); 
00624     #ifndef SECURE_NAMES
00625     if (tessedit_debug_doc_rejection) {
00626       tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
00627         page_res_it.page_res->char_count,
00628         page_res_it.page_res->rej_count);
00629     }
00630     #endif
00631   }
00632   else {
00633     #ifndef SECURE_NAMES
00634     if (tessedit_debug_doc_rejection)
00635       tprintf ("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
00636         page_res_it.page_res->char_count,
00637         page_res_it.page_res->rej_count);
00638     #endif
00639 
00640     /* Walk blocks testing for block rejection */
00641 
00642     page_res_it.restart_page ();
00643     while (page_res_it.word () != NULL) {
00644       current_block = page_res_it.block ();
00645       if (current_block->block->text_region () != NULL)
00646         block_no = current_block->block->text_region ()->id_no ();
00647       else
00648         block_no = -1;  // no text region: debug output shows -1
00649       if ((page_res_it.block ()->char_count > 0) &&
00650         ((page_res_it.block ()->rej_count * 100.0 /
00651         page_res_it.block ()->char_count) >
00652       tessedit_reject_block_percent)) {
00653         #ifndef SECURE_NAMES
00654         if (tessedit_debug_block_rejection)
00655           tprintf ("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
00656             block_no,
00657             page_res_it.block ()->char_count,
00658             page_res_it.block ()->rej_count);
00659         #endif
00660         prev_word_rejected = FALSE;
00661         while ((page_res_it.word () != NULL) &&  // visit every word of the block being rejected
00662         (page_res_it.block () == current_block)) {
00663           if (tessedit_preserve_blk_rej_perfect_wds) {
00664             rej_word =  // reject words that have any reject or are shorter than the minimum length
00665               (page_res_it.word ()->reject_map.reject_count () > 0)
00666               || (page_res_it.word ()->reject_map.length () <
00667               tessedit_preserve_min_wd_len);
00668             if (rej_word && tessedit_dont_blkrej_good_wds
00669               && !(page_res_it.word ()->reject_map.length () <
00670               tessedit_preserve_min_wd_len)
00671               &&
00672               (acceptable_word_string
00673               (page_res_it.word ()->best_choice->string ().
00674             string ()) != AC_UNACCEPTABLE)) {
00675               word_char_quality (page_res_it.word (),
00676                 page_res_it.row ()->row,
00677                 &char_quality,
00678                 &accepted_char_quality);
00679               rej_word = char_quality !=  // keep the word only when char_quality equals its length
00680                 page_res_it.word ()->reject_map.length ();
00681             }
00682           }
00683           else
00684             rej_word = TRUE;  // no preservation: every word in the block is rejected
00685           if (rej_word) {
00686             /*
00687               Reject spacing if both current and prev words are rejected.
00688               NOTE - this is NOT restricted to FUZZY spaces.
00689            When tried this generated more space errors.
00690             */
00691             if (tessedit_use_reject_spaces &&
00692               prev_word_rejected &&
00693               (page_res_it.prev_row () == page_res_it.row ()) &&
00694               (page_res_it.word ()->word->space () == 1))
00695               page_res_it.word ()->reject_spaces = TRUE;
00696             page_res_it.word ()->reject_map.rej_word_block_rej ();
00697           }
00698           prev_word_rejected = rej_word;
00699           page_res_it.forward ();
00700         }
00701       }
00702       else {
00703         #ifndef SECURE_NAMES
00704         if (tessedit_debug_block_rejection)
00705           tprintf
00706             ("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
00707             block_no, page_res_it.block ()->char_count,
00708             page_res_it.block ()->rej_count);
00709         #endif
00710 
00711         /* Walk rows in block testing for row rejection */
00712         row_no = 0;
00713         while ((page_res_it.word () != NULL) &&
00714         (page_res_it.block () == current_block)) {
00715           current_row = page_res_it.row ();
00716           row_no++;
00717      /*
00718    Reject whole row if:
00719     fraction of chars on row which are rejected exceed a limit AND
00720     fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit
00721      */
00722           if ((page_res_it.row ()->char_count > 0) &&
00723             ((page_res_it.row ()->rej_count * 100.0 /
00724             page_res_it.row ()->char_count) >
00725             tessedit_reject_row_percent) &&
00726             ((page_res_it.row ()->whole_word_rej_count * 100.0 /
00727             page_res_it.row ()->rej_count) <
00728           tessedit_whole_wd_rej_row_percent)) {
00729             #ifndef SECURE_NAMES
00730             if (tessedit_debug_block_rejection)
00731               tprintf
00732                 ("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
00733                 row_no, page_res_it.row ()->char_count,
00734                 page_res_it.row ()->rej_count);
00735             #endif
00736             prev_word_rejected = FALSE;
00737             while ((page_res_it.word () != NULL) &&  // visit every word of the row being rejected
00738             (page_res_it.row () == current_row)) {
00739               /* Preserve words on good docs unless they are mostly rejected*/
00740               if (!tessedit_row_rej_good_docs && good_quality_doc) {
00741                 rej_word =  // reject only when the rejected fraction of the word exceeds the threshold
00742                   page_res_it.word ()->reject_map.
00743                   reject_count () /
00744                   (float) page_res_it.word ()->reject_map.
00745                   length () > tessedit_good_doc_still_rowrej_wd;
00746               }
00747 
00748               /* Preserve perfect words anyway */
00749               else if (tessedit_preserve_row_rej_perfect_wds) {
00750                 rej_word =  // same preservation rule as in the block case above
00751                   (page_res_it.word ()->reject_map.
00752                   reject_count () > 0)
00753                   || (page_res_it.word ()->reject_map.
00754                   length () < tessedit_preserve_min_wd_len);
00755                 if (rej_word && tessedit_dont_rowrej_good_wds
00756                   && !(page_res_it.word ()->reject_map.
00757                   length () <
00758                   tessedit_preserve_min_wd_len)
00759                   &&
00760                   (acceptable_word_string
00761                   (page_res_it.word ()->best_choice->
00762                 string ().string ()) != AC_UNACCEPTABLE)) {
00763                   word_char_quality (page_res_it.word (),
00764                     page_res_it.row ()->row,
00765                     &char_quality,
00766                     &accepted_char_quality);
00767                   rej_word = char_quality !=
00768                     page_res_it.word ()->reject_map.length ();
00769                 }
00770               }
00771               else
00772                 rej_word = TRUE;
00773               if (rej_word) {
00774                 /*
00775                   Reject spacing if both current and prev words are rejected.
00776                   NOTE - this is NOT restricted to FUZZY spaces.
00777               When tried this generated more space errors.
00778                 */
00779                 if (tessedit_use_reject_spaces &&
00780                   prev_word_rejected &&
00781                   (page_res_it.prev_row () ==
00782                   page_res_it.row ())
00783                   && (page_res_it.word ()->word->space () ==
00784                   1))
00785                   page_res_it.word ()->reject_spaces = TRUE;
00786                 page_res_it.word ()->reject_map.
00787                   rej_word_row_rej(); 
00788               }
00789               prev_word_rejected = rej_word;
00790               page_res_it.forward ();
00791             }
00792           }
00793           else {
00794             #ifndef SECURE_NAMES
00795             if (tessedit_debug_block_rejection)
00796               tprintf
00797                 ("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
00798                 row_no, page_res_it.row ()->char_count,
00799                 page_res_it.row ()->rej_count);
00800             #endif
00801             while ((page_res_it.word () != NULL) &&  // row kept: step past its words unchanged
00802               (page_res_it.row () == current_row))
00803               page_res_it.forward ();
00804           }
00805         }
00806       }
00807     }
00808   }
00809 }

INT16 failure_count ( WERD_RES word  ) 

Count up all the blanks (' ') in word's best_choice string.

Parameters:
word Word
Returns:
count

Definition at line 1484 of file docqual.cpp.

References WERD_RES::best_choice.

Referenced by word_deletable().

01484                                     {  // Counts the ' ' characters in the word's best-choice text
01485   char *str = (char *) word->best_choice->string ().string ();  // read-only cursor over the text
01486   int tess_rejs = 0;  // blanks seen so far
01487 
01488   for (; *str != '\0'; str++) {
01489     if (*str == ' ')
01490       tess_rejs++;
01491   }
01492   return tess_rejs;
01493 }

GARBAGE_LEVEL garbage_word ( WERD_RES word,
BOOL8  ok_dict_word 
)

Classify how likely the word is to be garbage using many heuristics/rules.

Parameters:
word Word in question
ok_dict_word 0 or 1, 1 if word in dictionary
Note:
Global:
  • crunch_include_numerals,
  • crunch_leave_ok_strings, and
  • crunch_debug
  • also: SYSTEM_DAWG_PERM, FREQ_DAWG_PERM, USER_DAWG_PERM, NUMBER_PERM, and AC_UNACCEPTABLE)
Returns:
G_NEVER_CRUNCH, G_TERRIBLE, G_DODGY, or G_OK
Two steps: first step scans through letters of word and gathers statistics and updates state-machine. Second step works at the word-level together with statistics from first step. Neato.

Definition at line 1181 of file docqual.cpp.

References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, FREQ_DAWG_PERM, G_DODGY, G_NEVER_CRUNCH, G_OK, G_TERRIBLE, REJMAP::length(), NULL, NUMBER_PERM, WERD_RES::reject_map, SYSTEM_DAWG_PERM, tprintf(), and USER_DAWG_PERM.

Referenced by tilde_crunch().

01181                                                                { 
01182   enum STATES
01183   {
01184     JUNK,
01185     FIRST_UPPER,
01186     FIRST_LOWER,
01187     FIRST_NUM,
01188     SUBSEQUENT_UPPER,
01189     SUBSEQUENT_LOWER,
01190     SUBSEQUENT_NUM
01191   };
01192   char *str = (char *) word->best_choice->string ().string ();
01193   STATES state = JUNK;
01194   int len = 0;
01195   int isolated_digits = 0;
01196   int isolated_alphas = 0;
01197   int bad_char_count = 0;
01198   int tess_rejs = 0;
01199   int dodgy_chars = 0;
01200   int ok_chars;
01201   char last_char = ' ';
01202   int alpha_repetition_count = 0;
01203   int longest_alpha_repetition_count = 0;
01204   int longest_lower_run_len = 0;
01205   int lower_string_count = 0;
01206   int longest_upper_run_len = 0;
01207   int upper_string_count = 0;
01208   int total_alpha_count = 0;
01209   int total_digit_count = 0;
01210 
01211   /* Step 1: Scan letters of word and set up a bunch of variables
01212       Working at the level of individual letters */
01213   for (; *str != '\0'; str++) {
01214     len++;
01215     if (isupper (*str)) {
01216       total_alpha_count++;
01217       switch (state) {
01218         case SUBSEQUENT_UPPER:
01219         case FIRST_UPPER:
01220           state = SUBSEQUENT_UPPER;
01221           upper_string_count++;
01222           if (longest_upper_run_len < upper_string_count)
01223             longest_upper_run_len = upper_string_count;
01224           if (last_char == *str) {
01225             alpha_repetition_count++;
01226             if (longest_alpha_repetition_count < alpha_repetition_count) {
01227               longest_alpha_repetition_count = alpha_repetition_count;
01228             }
01229           }
01230           else {
01231             last_char = *str;
01232             alpha_repetition_count = 1;
01233           }
01234           break;
01235         case FIRST_NUM:
01236           isolated_digits++;
01237         default:
01238           state = FIRST_UPPER;
01239           last_char = *str;
01240           alpha_repetition_count = 1;
01241           upper_string_count = 1;
01242           break;
01243       }
01244     }
01245     else if (islower (*str)) {
01246       total_alpha_count++;
01247       switch (state) {
01248         case SUBSEQUENT_LOWER:
01249         case FIRST_LOWER:
01250           state = SUBSEQUENT_LOWER;
01251           lower_string_count++;
01252           if (longest_lower_run_len < lower_string_count)
01253             longest_lower_run_len = lower_string_count;
01254           if (last_char == *str) {
01255             alpha_repetition_count++;
01256             if (longest_alpha_repetition_count < alpha_repetition_count) {
01257               longest_alpha_repetition_count = alpha_repetition_count;
01258             }
01259           }
01260           else {
01261             last_char = *str;
01262             alpha_repetition_count = 1;
01263           }
01264           break;
01265         case FIRST_NUM:
01266           isolated_digits++;
01267         default:
01268           state = FIRST_LOWER;
01269           last_char = *str;
01270           alpha_repetition_count = 1;
01271           lower_string_count = 1;
01272           break;
01273       }
01274     }
01275     else if (isdigit (*str)) {
01276       total_digit_count++;
01277       switch (state) {
01278         case FIRST_NUM:
01279           state = SUBSEQUENT_NUM;
01280         case SUBSEQUENT_NUM:
01281           break;
01282         case FIRST_UPPER:
01283         case FIRST_LOWER:
01284           isolated_alphas++;
01285         default:
01286           state = FIRST_NUM;
01287           break;
01288       }
01289     }
01290     else {
01291       if (*str == ' ')
01292         tess_rejs++;
01293       else
01294         bad_char_count++;
01295       switch (state) {
01296         case FIRST_NUM:
01297           isolated_digits++;
01298           break;
01299         case FIRST_UPPER:
01300         case FIRST_LOWER:
01301           isolated_alphas++;
01302         default:
01303           break;
01304       }
01305       state = JUNK;
01306     }
01307   }
01308 
01309   /* Step 2: Combine result of Step 1 with heuristics to determine
01310      whether word is garbage. Working at the level of whole word */
01311   switch (state) {
01312     case FIRST_NUM:
01313       isolated_digits++;
01314       break;
01315     case FIRST_UPPER:
01316     case FIRST_LOWER:
01317       isolated_alphas++;
01318     default:
01319       break;
01320   }
01321 
01322   if (crunch_include_numerals) {
01323     total_alpha_count += total_digit_count - isolated_digits;
01324   }
01325 
01326   if (crunch_leave_ok_strings &&
01327     (len >= 4) &&
01328     (2 * (total_alpha_count - isolated_alphas) > len) &&
01329   (longest_alpha_repetition_count < crunch_long_repetitions)) {
01330     if ((crunch_accept_ok &&
01331       (acceptable_word_string (str) != AC_UNACCEPTABLE)) ||
01332       (longest_lower_run_len > crunch_leave_lc_strings) ||
01333       (longest_upper_run_len > crunch_leave_uc_strings))
01334       return G_NEVER_CRUNCH;
01335   }
01336   if ((word->reject_map.length () > 1) &&
01337     (strpbrk (str, " ") == NULL) &&
01338     ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01339     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01340     (word->best_choice->permuter () == USER_DAWG_PERM) ||
01341     (word->best_choice->permuter () == NUMBER_PERM) ||
01342     (acceptable_word_string (str) != AC_UNACCEPTABLE) || ok_dict_word))
01343     return G_OK;
01344 
01345   ok_chars = len - bad_char_count - isolated_digits -
01346     isolated_alphas - tess_rejs;
01347 
01348   if (crunch_debug > 3) {
01349     tprintf ("garbage_word: \"%s\"\n",
01350       word->best_choice->string ().string ());
01351     tprintf ("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
01352       len,
01353       bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
01354   }
01355   if ((bad_char_count == 0) &&
01356     (tess_rejs == 0) &&
01357     ((len > isolated_digits + isolated_alphas) || (len <= 2)))
01358     return G_OK;
01359 
01360   if ((tess_rejs > ok_chars) ||
01361     ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
01362     return G_TERRIBLE;
01363 
01364   if (len > 4) {
01365     dodgy_chars = 2 * tess_rejs + bad_char_count +
01366       isolated_digits + isolated_alphas;
01367     if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
01368       return G_DODGY;
01369     else
01370       return G_OK;
01371   }
01372   else {
01373     dodgy_chars = 2 * tess_rejs + bad_char_count;
01374     if (((len == 4) && (dodgy_chars > 2)) ||
01375       ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
01376       return G_DODGY;
01377     else
01378       return G_OK;
01379   }
01380 }

void insert_rej_cblobs ( WERD_RES word  ) 

Put rejected word blobs back into the outword.

Parameters:
word Word
Returns:
none
Note:
NOTE!!! AFTER insert_rej_cblobs(), THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER OF ELEMENTS.

Definition at line 1542 of file docqual.cpp.

References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), gblob_sort_list(), REJMAP::initialise(), STRING::length(), REJMAP::length(), WERD_RES::outword, WERD::rej_blob_list(), WERD_RES::reject_map, and TRUE.

Referenced by quality_based_rejection().

01543                                        {  // Moves the outword's rejected blobs into its blob list, adding a ' ' and a rejected map entry for each
01544   PBLOB_IT blob_it;              // iterator over the word's normal blobs
01545   PBLOB_IT rej_blob_it;          // iterator over the word's rejected blobs
01546   const STRING *wordstr;
01547   int old_len;
01548   int rej_len;
01549   char new_str[512];             // rebuilt text; the 511 test below keeps it inside this buffer
01550   REJMAP new_map;
01551   int i = 0;                     //new_str index
01552   int j = 0;                     //old_str index
01553   int new_len;
01554 
01555   gblob_sort_list (word->outword->rej_blob_list (), TRUE);  // presumably orders the blobs left to right - confirm against gblob_sort_list
01556   rej_blob_it.set_to_list (word->outword->rej_blob_list ());
01557   if (rej_blob_it.empty ())
01558     return;                      // nothing to insert
01559   rej_len = rej_blob_it.length ();
01560   blob_it.set_to_list (word->outword->blob_list ());
01561   wordstr = &(word->best_choice->string ());
01562   old_len = wordstr->length ();
01563   ASSERT_HOST (word->reject_map.length () == old_len);
01564   ASSERT_HOST (blob_it.length () == old_len);
01565   if ((old_len + rej_len) > 511)
01566     return;                      //Word is garbage anyway prevent abort
01567   new_map.initialise (old_len + rej_len);
01568 
01569   while (!rej_blob_it.empty ()) {
01570     if ((j >= old_len) ||  // normal blobs used up, or the reject blob starts at or left of the current normal blob
01571       (rej_blob_it.data ()->bounding_box ().left () <=
01572     blob_it.data ()->bounding_box ().left ())) {
01573       /* Insert reject blob */
01574       if (j >= old_len)
01575         blob_it.add_to_end (rej_blob_it.extract ());
01576       else
01577         blob_it.add_before_stay_put (rej_blob_it.extract ());
01578       if (!rej_blob_it.empty ())
01579         rej_blob_it.forward ();
01580       new_str[i] = ' ';  // an inserted blob appears as a blank in the text
01581       new_map[i].setrej_rej_cblob ();
01582       i++;
01583     }
01584     else {
01585       new_str[i] = (*wordstr)[j];  // copy the existing char and its map entry across
01586       new_map[i] = word->reject_map[j];
01587       i++;
01588       j++;
01589       blob_it.forward ();
01590     }
01591   }
01592   /* Add any extra normal blobs to strings */
01593   while (j < wordstr->length ()) {
01594     new_str[i] = (*wordstr)[j];
01595     new_map[i] = word->reject_map[j];
01596     i++;
01597     j++;
01598   }
01599   new_str[i] = '\0';
01600   /*
01601     tprintf(
01602           "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
01603           old_len, i, new_str, new_map );
01604   */
01605   ASSERT_HOST (i == blob_it.length ());
01606   ASSERT_HOST (i == old_len + rej_len);
01607   word->reject_map = new_map;
01608   *((STRING *) wordstr) = new_str;  // overwrite the best-choice text through a const cast
01609   new_len = strlen (word->best_choice->string ().string ());
01610   ASSERT_HOST (word->reject_map.length () == new_len);
01611   ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
01612 }

void merge_tess_fails ( WERD_RES word_res  ) 

Change pairs of tess failures to a single one (merge/collapse).

Parameters:
word_res Results on word in question
Returns:
none, updates word's reject_map

Definition at line 1133 of file docqual.cpp.

References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), REJMAP::length(), merge_blobs(), WERD_RES::outword, WERD_RES::reject_map, and REJMAP::remove_pos().

Referenced by tilde_crunch(), and tilde_delete().

01134                                           {
01135   char *ptr;                     //string ptr
01136   PBLOB_IT blob_it;              //blobs
01137   int i = 0;
01138   int len;
01139 
01140   len = strlen (word_res->best_choice->string ().string ());
01141   ASSERT_HOST (word_res->reject_map.length () == len);
01142   ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
01143 
01144   ptr = (char *) word_res->best_choice->string ().string ();
01145   blob_it = word_res->outword->blob_list ();
01146   while (*ptr != '\0') {
01147     if ((*ptr == ' ') && (*(ptr + 1) == ' ')) {
01148       strcpy (ptr + 1, ptr + 2); //shuffle up
01149       word_res->reject_map.remove_pos (i);
01150       merge_blobs (blob_it.data_relative (1), blob_it.data ());
01151       delete blob_it.extract (); //get rid of spare
01152     }
01153     else {
01154       i++;
01155       ptr++;
01156     }
01157     blob_it.forward ();
01158   }
01159   len = strlen (word_res->best_choice->string ().string ());
01160   ASSERT_HOST (word_res->reject_map.length () == len);
01161   ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
01162 }

BOOL8 noise_outlines ( WERD word  ) 

Determine whether every outline in the word is 'too small' (i.e. the word is just noise).

Parameters:
word Word
Note:
Global: bln_x_height
Returns:
TRUE if the count of tiny outlines is at least the total outline count, i.e. every outline in the word is tiny (also TRUE when the word has no outlines)
Cycle through blobs for word, looking at BB of outlines: every outline increments outline_count, and those whose larger BB dimension is less than small_limit also increment small_outline_count. Answer is boolean of the comparison: small_outline_count >= outline_count

Definition at line 1507 of file docqual.cpp.

References WERD::blob_list(), BOX::height(), outline_it, and BOX::width().

Referenced by word_deletable().

01507                                  {  // TRUE when no outline in the word has a larger dimension of at least small_limit
01508   PBLOB_IT blob_it;
01509   OUTLINE_IT outline_it;
01510   BOX box;                       // BB of outline
01511   INT16 outline_count = 0;       // every outline seen, small ones included
01512   INT16 small_outline_count = 0; // outlines whose larger dimension is below small_limit
01513   INT16 max_dimension;           // the larger of the BB dims
01514   float small_limit = bln_x_height * crunch_small_outlines_size;
01515 
01516   blob_it.set_to_list (word->blob_list ());
01517   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
01518     outline_it.set_to_list (blob_it.data ()->out_list ());
01519     for (outline_it.mark_cycle_pt(); !outline_it.cycled_list(); outline_it.forward()) {
01520       outline_count++;
01521       box = outline_it.data ()->bounding_box ();
01522       if (box.height () > box.width ())
01523         max_dimension = box.height ();
01524       else
01525         max_dimension = box.width ();
01526       if (max_dimension < small_limit)
01527         small_outline_count++;
01528     }
01529   }
01530   return (small_outline_count >= outline_count);  // small is a subset of all, so this holds only when the counts are equal
01531 }

BOOL8 potential_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level,
BOOL8  ok_dict_word 
)

Determine if word could be garbage or otherwise questionable.

Parameters:
word Word
garbage_level G_NEVER_CRUNCH, G_TERRIBLE, G_DODGY, or G_OK
ok_dict_word 0 or 1, 1 if word in dictionary
Note:
Global:
  • crunch_pot_indicators,
  • crunch_pot_poor_rate,
  • crunch_pot_poor_cert,
  • crunch_debug
Returns:
TRUE if count of 'clues' that word sucks >= crunch_pot_indicators

Definition at line 974 of file docqual.cpp.

References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, G_OK, REJMAP::length(), WERD_RES::reject_map, and tprintf().

Referenced by tilde_crunch().

00976                                                 {  // Counts up to three poor-quality indicators and compares the total with crunch_pot_indicators
00977   float rating_per_ch;
00978   int adjusted_len;
00979   char *str = (char *) word->best_choice->string ().string (); // word's string
00980   BOOL8 word_crunchable;  // gates only the certainty test below
00981   int poor_indicator_count = 0;
00982 
00983   word_crunchable =
00984     !crunch_leave_accept_strings ||
00985     (word->reject_map.length () < 3) ||
00986     ((acceptable_word_string (str) == AC_UNACCEPTABLE) && !ok_dict_word);
00987 
00988   adjusted_len = word->reject_map.length ();
00989   if (adjusted_len > 10)
00990     adjusted_len = 10;  // cap the length used for the per-character rating
00991   rating_per_ch = word->best_choice->rating () / adjusted_len;  // NOTE(review): adjusted_len is not checked for 0 before dividing
00992 
00993   if (rating_per_ch > crunch_pot_poor_rate) {  // indicator 1: poor rating per character
00994     if (crunch_debug > 2) {
00995       tprintf ("Potential poor rating on \"%s\"\n",
00996         word->best_choice->string ().string ());
00997     }
00998     poor_indicator_count++;
00999   }
01000 
01001   if (word_crunchable &&  // indicator 2: poor certainty, for crunchable words only
01002   (word->best_choice->certainty () < crunch_pot_poor_cert)) {
01003     if (crunch_debug > 2) {
01004       tprintf ("Potential poor cert on \"%s\"\n",
01005         word->best_choice->string ().string ());
01006     }
01007     poor_indicator_count++;
01008   }
01009 
01010   if (garbage_level != G_OK) {  // indicator 3: caller-supplied garbage level is anything but G_OK
01011     if (crunch_debug > 2) {
01012       tprintf ("Potential garbage on \"%s\"\n",
01013         word->best_choice->string ().string ());
01014     }
01015     poor_indicator_count++;
01016   }
01017   return (poor_indicator_count >= crunch_pot_indicators);
01018 }

void print_boxes ( WERD word  ) 

Print all bounding boxes for blobs in word.

Parameters:
word Word in question
Returns:
none

Definition at line 447 of file docqual.cpp.

References WERD::blob_list(), and BOX::print().

00447                              {  // Calls print () on the bounding box of every blob in the word's blob list
00448   PBLOB_IT it;  // iterator over the word's blobs
00449   BOX box;  // bounding box
00450 
00451   it.set_to_list (word->blob_list ());
00452   for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {  // one pass round the list
00453     box = it.data ()->bounding_box ();
00454     box.print ();
00455   }
00456 }

void quality_based_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 
)

Deal with rejected blobs and clean up errant tildes.

Parameters:
page_res_it pointer to page in question ?
good_quality_doc 0 or 1, which pass ?
Note:
Global:
  • tessedit_good_quality_unrej and
  • unlv_tilde_crunching
Returns:
none, but word's reject_map is updated

Definition at line 496 of file docqual.cpp.

References doc_and_block_rejection(), insert_rej_cblobs(), NULL, tilde_crunch(), tilde_delete(), and unrej_good_quality_words().

Referenced by recog_all_words().

00497                                                      {  // Runs the quality-based passes in order: optional unrejection, page/block/row rejection, reject-blob insertion, optional tilde handling
00498   if ((tessedit_good_quality_unrej && good_quality_doc))  // unrejection runs only on documents flagged as good quality
00499     unrej_good_quality_words(page_res_it); 
00500   doc_and_block_rejection(page_res_it, good_quality_doc); 
00501 
00502   page_res_it.restart_page ();  // rewind before walking every word
00503   while (page_res_it.word () != NULL) {
00504     insert_rej_cblobs (page_res_it.word ());
00505     page_res_it.forward ();
00506   }
00507 
00508   if (unlv_tilde_crunching) {  // tilde passes are optional and run last
00509     tilde_crunch(page_res_it); 
00510     tilde_delete(page_res_it); 
00511   }
00512 }

void reject_whole_page ( PAGE_RES_IT page_res_it  ) 

Don't believe any of it; set map to 00..00 for all words.

Parameters:
page_res_it Iterator over the words of the page to be rejected
Returns:
none
Go through all words on page and set their reject_map to rejected, then update page's page_res.

Definition at line 822 of file docqual.cpp.

References PAGE_RES_IT::forward(), NULL, PAGE_RES_IT::page_res, REJMAP::rej_word_doc_rej(), WERD_RES::reject_map, PAGE_RES_IT::restart_page(), TRUE, and PAGE_RES_IT::word().

Referenced by doc_and_block_rejection().

00822                                                  { 
00823   page_res_it.restart_page ();
00824   while (page_res_it.word () != NULL) {
00825     page_res_it.word ()->reject_map.rej_word_doc_rej ();
00826     page_res_it.forward ();
00827   }
00828   page_res_it.page_res->rejected = TRUE; // whole page is rejected
00829 }

BOOL8 terrible_word_crunch ( WERD_RES word,
GARBAGE_LEVEL  garbage_level 
)

Determine if word very likely is utter garbage, tuned with several globals.

Parameters:
word Word
garbage_level G_NEVER_CRUNCH, G_TERRIBLE, G_DODGY, or G_OK
Note:
Global:
  • crunch_debug,
  • crunch_terrible_rating,
  • crunch_terrible_garbage,
  • crunch_poor_garbage_cert,
  • crunch_poor_garbage_rate,
  • crunch_rating_max
Returns:
TRUE if any one of the 'terrible word' tests fires (i.e. crunch_mode > 0), otherwise FALSE

Definition at line 926 of file docqual.cpp.

References WERD_RES::best_choice, FALSE, G_OK, G_TERRIBLE, REJMAP::length(), WERD_RES::reject_map, tprintf(), and TRUE.

Referenced by tilde_crunch().

00926                                                                         { 
00927   float rating_per_ch;
00928   int adjusted_len;
00929   int crunch_mode = 0;
00930 
00931   if ((word->best_choice->string ().length () == 0) ||
00932     (strspn (word->best_choice->string ().string (), " ") ==
00933     word->best_choice->string ().length ()))
00934     crunch_mode = 1;
00935   else {
00936     adjusted_len = word->reject_map.length ();
00937     if (adjusted_len > crunch_rating_max)
00938       adjusted_len = crunch_rating_max;
00939     rating_per_ch = word->best_choice->rating () / adjusted_len;
00940 
00941     if (rating_per_ch > crunch_terrible_rating)
00942       crunch_mode = 2;
00943     else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
00944       crunch_mode = 3;
00945     else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
00946       (garbage_level != G_OK))
00947       crunch_mode = 4;
00948     else if ((rating_per_ch > crunch_poor_garbage_rate) &&
00949       (garbage_level != G_OK))
00950       crunch_mode = 5;
00951   }
00952   if (crunch_mode > 0) {
00953     if (crunch_debug > 2) {
00954       tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
00955         crunch_mode, word->best_choice->string ().string ());
00956     }
00957     return TRUE;
00958   }
00959   else
00960     return FALSE;
00961 }

void tilde_crunch ( PAGE_RES_IT page_res_it  ) 

Definition at line 834 of file docqual.cpp.

References WERD_RES::best_choice, convert_bad_unlv_chs(), CR_KEEP_SPACE, dict_word(), DOC_DAWG_PERM, FALSE, PAGE_RES_IT::forward(), G_NEVER_CRUNCH, garbage_word(), merge_tess_fails(), NULL, potential_word_crunch(), PAGE_RES_IT::restart_page(), terrible_word_crunch(), tprintf(), TRUE, WERD_RES::unlv_crunch_mode, and PAGE_RES_IT::word().

Referenced by quality_based_rejection().

00834                                             { 
00835   WERD_RES *word;
00836   GARBAGE_LEVEL garbage_level;
00837   PAGE_RES_IT copy_it;
00838   BOOL8 prev_potential_marked = FALSE;
00839   BOOL8 found_terrible_word = FALSE;
00840   int dict_type;
00841   BOOL8 ok_dict_word;
00842 
00843   page_res_it.restart_page ();
00844   while (page_res_it.word () != NULL) {
00845     word = page_res_it.word ();
00846 
00847     if (crunch_early_convert_bad_unlv_chs)
00848       convert_bad_unlv_chs(word); 
00849 
00850     if (crunch_early_merge_tess_fails)
00851       merge_tess_fails(word); 
00852 
00853     if (word->reject_map.accept_count () != 0) {
00854       found_terrible_word = FALSE;
00855       prev_potential_marked = FALSE; // Forget earlier potential crunches
00856     }
00857     else {
00858       dict_type = dict_word (word->best_choice->string ().string ());
00859       ok_dict_word = (dict_type > 0) && (dict_type != DOC_DAWG_PERM);
00860       garbage_level = garbage_word (word, ok_dict_word);
00861 
00862       if ((garbage_level != G_NEVER_CRUNCH) &&
00863       (terrible_word_crunch (word, garbage_level))) {
00864         if (crunch_debug > 0) {
00865           tprintf ("T CRUNCHING: \"%s\"\n",
00866             word->best_choice->string ().string ());
00867         }
00868         word->unlv_crunch_mode = CR_KEEP_SPACE;
00869         if (prev_potential_marked) {
00870           while (copy_it.word () != word) {
00871             if (crunch_debug > 0) {
00872               tprintf ("P1 CRUNCHING: \"%s\"\n",
00873                 copy_it.word ()->best_choice->string ().
00874                 string ());
00875             }
00876             copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
00877             copy_it.forward ();
00878           }
00879           prev_potential_marked = FALSE;
00880         }
00881         found_terrible_word = TRUE;
00882       }
00883       else if ((garbage_level != G_NEVER_CRUNCH) &&
00884         (potential_word_crunch (word,
00885       garbage_level, ok_dict_word))) {
00886         if (found_terrible_word) {
00887           if (crunch_debug > 0) {
00888             tprintf ("P2 CRUNCHING: \"%s\"\n",
00889               word->best_choice->string ().string ());
00890           }
00891           word->unlv_crunch_mode = CR_KEEP_SPACE;
00892         }
00893         else if (!prev_potential_marked) {
00894           copy_it = page_res_it;
00895           prev_potential_marked = TRUE;
00896           if (crunch_debug > 1) {
00897             tprintf ("P3 CRUNCHING: \"%s\"\n",
00898               word->best_choice->string ().string ());
00899           }
00900         }
00901       }
00902       else {
00903         found_terrible_word = FALSE;
00904         prev_potential_marked = FALSE; // Forget earlier potential crunches
00905         if (crunch_debug > 2) {
00906           tprintf ("NO CRUNCH: \"%s\"\n",
00907             word->best_choice->string ().string ());
00908         }
00909       }
00910     }
00911     page_res_it.forward ();
00912   }
00913 }

void tilde_delete ( PAGE_RES_IT page_res_it  ) 

Mark deletable crunched words at the beginning and end of lines by setting their unlv_crunch_mode to the mode returned by word_deletable().

Parameters:
page_res_it Results of page
Note:
Global: crunch_early_merge_tess_fails, crunch_debug
Returns:
none, but word's maps are modified!
Calls word_deletable()

Definition at line 1030 of file docqual.cpp.

References WERD_RES::best_choice, CR_NONE, FALSE, WERD::flag(), PAGE_RES_IT::forward(), merge_tess_fails(), NULL, PAGE_RES_IT::restart_page(), tprintf(), TRUE, WERD_RES::unlv_crunch_mode, W_BOL, W_EOL, PAGE_RES_IT::word(), WERD_RES::word, and word_deletable().

Referenced by quality_based_rejection().

01030                                             { 
01031   WERD_RES *word;
01032   PAGE_RES_IT copy_it;
01033   BOOL8 deleting_from_bol = FALSE;
01034   BOOL8 marked_delete_point = FALSE;
01035   INT16 debug_delete_mode;
01036   CRUNCH_MODE delete_mode;
01037   INT16 x_debug_delete_mode;
01038   CRUNCH_MODE x_delete_mode;
01039 
01040   page_res_it.restart_page ();
01041   while (page_res_it.word () != NULL) {
01042     word = page_res_it.word ();
01043 
01044     delete_mode = word_deletable (word, debug_delete_mode);
01045     if (delete_mode != CR_NONE) {
01046       if (word->word->flag (W_BOL) || deleting_from_bol) {
01047         if (crunch_debug > 0) {
01048           tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
01049             debug_delete_mode,
01050             word->best_choice->string ().string ());
01051         }
01052         word->unlv_crunch_mode = delete_mode;
01053         deleting_from_bol = TRUE;
01054       }
01055       else if (word->word->flag (W_EOL)) {
01056         if (marked_delete_point) {
01057           while (copy_it.word () != word) {
01058             x_delete_mode = word_deletable (copy_it.word (),
01059               x_debug_delete_mode);
01060             if (crunch_debug > 0) {
01061               tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
01062                 x_debug_delete_mode,
01063                 copy_it.word ()->best_choice->string ().
01064                 string ());
01065             }
01066             copy_it.word ()->unlv_crunch_mode = x_delete_mode;
01067             copy_it.forward ();
01068           }
01069         }
01070         if (crunch_debug > 0) {
01071           tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
01072             debug_delete_mode,
01073             word->best_choice->string ().string ());
01074         }
01075         word->unlv_crunch_mode = delete_mode;
01076         deleting_from_bol = FALSE;
01077         marked_delete_point = FALSE;
01078       }
01079       else {
01080         if (!marked_delete_point) {
01081           copy_it = page_res_it;
01082           marked_delete_point = TRUE;
01083         }
01084       }
01085     }
01086     else {
01087       deleting_from_bol = FALSE;
01088       marked_delete_point = FALSE; // Forget earlier potential crunches
01089     }
01090     /* The following step has been left till now as the tess fails are used to
01091       determine if the word is deletable.
01092     */
01093     if (!crunch_early_merge_tess_fails)
01094       merge_tess_fails(word); 
01095     page_res_it.forward ();
01096   }
01097 }

void unrej_good_chs ( WERD_RES word,
ROW row 
)

Unreject POTENTIAL rejects if the blob passes the blob and outline checks.

Parameters:
word Word in question
row Row containing the word (passed to make_bln_copy())
Note:
Global:
  • docqual_excuse_outline_errs and
  • bln_x_height
Returns:
none, updates word's reject_map

Definition at line 371 of file docqual.cpp.

References WERD_RES::best_choice, WERD::blob_list(), PBLOB::bounding_box(), count_outline_errs(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, WERD_RES::reject_map, DENORM::scale(), and WERD_RES::word.

Referenced by unrej_good_quality_words().

00371                                               { 
00372   WERD *bln_word;                //BL norm init word
00373   TWERD *tessword;               //tess format
00374   WERD *init_word;               //BL norm init word
00375   PBLOB_IT outword_it;
00376   PBLOB_IT initial_it;
00377   INT16 i;
00378   INT16 init_blobs_left;
00379   BOOL8 matched;
00380   BOX out_box;
00381   PBLOB *test_blob;
00382   DENORM denorm;
00383   float bln_xht;
00384   INT16 j = 0;
00385 
00386   if (word->word->gblob_list ()->empty ())
00387     return;
00388 
00389   bln_xht = bln_x_height / word->denorm.scale (); // xht used for blnorm
00390   bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00391   /* NOTE: Need to convert to tess format and back again to ensure that the
00392     same float -> int rounding of coords is done to source wd as out wd before
00393     comparison
00394   */
00395   tessword = make_tess_word (bln_word, NULL);
00396   init_word = make_ed_word (tessword, bln_word); // convert word
00397   delete bln_word;
00398   delete_word(tessword);  // get rid of it
00399 
00400   initial_it.set_to_list (init_word->blob_list ());
00401   init_blobs_left = initial_it.length ();
00402   outword_it.set_to_list (word->outword->blob_list ());
00403 
00404   for (outword_it.mark_cycle_pt ();
00405   !outword_it.cycled_list (); outword_it.forward ()) {
00406     out_box = outword_it.data ()->bounding_box ();
00407 
00408     /* Skip any initial blobs LEFT of current outword blob */
00409     while (!initial_it.at_last () &&
00410     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00411       initial_it.forward ();
00412       init_blobs_left--;
00413     }
00414 
00415     /* See if current outword blob matches any initial blob with the same left
00416       coord. (Normally only one but possibly more - in unknown order) */
00417     i = 0;
00418     matched = FALSE;
00419     do {
00420       test_blob = initial_it.data_relative (i++);
00421       matched = crude_match_blobs (test_blob, outword_it.data ());
00422       if (matched &&
00423         (word->reject_map[j].accept_if_good_quality ()) &&
00424         (docqual_excuse_outline_errs ||
00425         (count_outline_errs (word->best_choice->string ()[j],
00426         outword_it.data ()->out_list ()->
00427         length ()) == 0)))
00428         word->reject_map[j].setrej_quality_accept ();
00429     }
00430     while (!matched &&
00431       (init_blobs_left - i > 0) &&
00432       (i < 129) &&
00433       !initial_it.at_last () &&
00434       test_blob->bounding_box ().left () == out_box.left ());
00435     j++;
00436   }
00437   delete init_word;
00438 }

void unrej_good_quality_words ( PAGE_RES_IT page_res_it  ) 

Unreject potential rejects in good quality words.

Parameters:
page_res_it Iterator over the word results of the page in question
Returns:
none
Accept potential rejects in words which pass the following checks:

- Contains a potential reject
- Word looks like a sensible alpha word.
- Word segmentation is the same as the original image
	- All characters have the expected number of outlines
	- NOTE: the rejection counts are recalculated after unrejection
	- CAN'T do it in a single pass without a bit of fiddling
	- keep it simple but inefficient

Definition at line 533 of file docqual.cpp.

References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, PAGE_RES_IT::block(), ROW_RES::char_count, BLOCK_RES::char_count, check_debug_pt(), PAGE_RES_IT::forward(), REJMAP::length(), NULL, PAGE_RES_IT::page_res, REJMAP::quality_recoverable_rejects(), ROW_RES::rej_count, BLOCK_RES::rej_count, PAGE_RES_IT::rej_stat_word(), WERD_RES::reject_map, PAGE_RES_IT::restart_page(), PAGE_RES_IT::row(), ROW_RES::row, unrej_good_chs(), ROW_RES::whole_word_rej_count, and PAGE_RES_IT::word().

Referenced by quality_based_rejection().

00534                                                         {
00535   WERD_RES *word;
00536   ROW_RES *current_row;
00537   BLOCK_RES *current_block;
00538   int i;
00539 
00540   page_res_it.restart_page ();
00541   while (page_res_it.word () != NULL) {
00542     check_debug_pt (page_res_it.word (), 100);
00543     if (bland_unrej) {
00544       word = page_res_it.word ();
00545       for (i = 0; i < word->reject_map.length (); i++) {
00546         if (word->reject_map[i].accept_if_good_quality ())
00547           word->reject_map[i].setrej_quality_accept ();
00548       }
00549       page_res_it.forward ();
00550     }
00551     else if ((page_res_it.row ()->char_count > 0) &&
00552       ((page_res_it.row ()->rej_count /
00553       (float) page_res_it.row ()->char_count) <=
00554     quality_rowrej_pc)) {
00555       word = page_res_it.word ();
00556       if (word->reject_map.quality_recoverable_rejects () &&
00557         (tessedit_unrej_any_wd ||
00558         acceptable_word_string (word->best_choice->string ().string ())
00559       != AC_UNACCEPTABLE)) {
00560         unrej_good_chs (word, page_res_it.row ()->row);
00561       }
00562       page_res_it.forward ();
00563     }
00564     else {
00565       /* Skip to end of dodgy row */
00566       current_row = page_res_it.row ();
00567       while ((page_res_it.word () != NULL) &&
00568         (page_res_it.row () == current_row))
00569         page_res_it.forward ();
00570     }
00571     check_debug_pt (page_res_it.word (), 110);
00572   }
00573   page_res_it.restart_page ();
00574   page_res_it.page_res->char_count = 0;
00575   page_res_it.page_res->rej_count = 0;
00576   current_block = NULL;
00577   current_row = NULL;
00578   while (page_res_it.word () != NULL) {
00579     if (current_block != page_res_it.block ()) {
00580       current_block = page_res_it.block ();
00581       current_block->char_count = 0;
00582       current_block->rej_count = 0;
00583     }
00584     if (current_row != page_res_it.row ()) {
00585       current_row = page_res_it.row ();
00586       current_row->char_count = 0;
00587       current_row->rej_count = 0;
00588       current_row->whole_word_rej_count = 0;
00589     }
00590     page_res_it.rej_stat_word ();
00591     page_res_it.forward ();
00592   }
00593 }

INT16 word_blob_quality ( WERD_RES word,
ROW row 
)

Find number of blobs in outword that are identical to those of inword.

Parameters:
word Word in question
row Row the word came from
Returns:
Count of good blobs
ASSUME blobs in both initial word and outword are in ascending order of left hand blob edge.

Todo:
outword is word on edges of block vs inword is any word within a block, right (in word_blob_quality())?

Definition at line 139 of file docqual.cpp.

References WERD::blob_list(), PBLOB::bounding_box(), cprintf(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, DENORM::scale(), and WERD_RES::word.

Referenced by recog_all_words(), and recog_interactive().

00141                                   {
00142   WERD *bln_word;                //BL norm init word
00143   TWERD *tessword;               //tess format
00144   WERD *init_word;               //BL norm init word
00145   PBLOB_IT outword_it;
00146   PBLOB_IT initial_it;
00147   INT16 i;
00148   INT16 init_blobs_left;
00149   INT16 match_count = 0;
00150   BOOL8 matched;
00151   BOX out_box;
00152   PBLOB *test_blob;
00153   DENORM denorm;
00154   float bln_xht;
00155 
00156 #ifdef TEXT_VERBOSE
00157   // gets a 'v', see ccmain/tesseractmain.dox
00158   cprintf("v");
00159 #endif
00160   if (word->word->gblob_list ()->empty ())
00161     return 0;
00162   bln_xht = bln_x_height / word->denorm.scale (); //xht used for blnorm
00163   bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00164 
00165   /* NOTE: Need to convert to tess format and back again to ensure that the
00166     same float -> int rounding of coords is done to source wd as out wd before
00167     comparison
00168   */
00169 
00170   //   if (!bln_word->flag(W_POLYGON))
00171   //           tprintf( "NON POLYGON BLN WERD\n");
00172   tessword = make_tess_word (bln_word, NULL);
00173   init_word = make_ed_word (tessword, bln_word); // convert word
00174   //   if (!init_word->flag(W_POLYGON))
00175   //         tprintf( "NON POLYGON INIT WERD\n");
00176   //   tprintf( "SOURCE BLOBS-AFTER TESS:\n");
00177   //   print_boxes( init_word );
00178   //   tprintf( "OUTPUT BLOBS:\n");
00179   //   print_boxes( word->outword );
00180 
00181   initial_it.set_to_list (init_word->blob_list ());
00182   init_blobs_left = initial_it.length ();
00183   outword_it.set_to_list (word->outword->blob_list ());
00184   delete bln_word;
00185   delete_word(tessword);  //get rid of it
00186 
00187   for (outword_it.mark_cycle_pt ();
00188   !outword_it.cycled_list (); outword_it.forward ()) {
00189     out_box = outword_it.data ()->bounding_box ();
00190 
00191     /* Skip any initial blobs LEFT of current outword blob */
00192     while (!initial_it.at_last () &&
00193     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00194       initial_it.forward ();
00195       init_blobs_left--;
00196     }
00197 
00198     /* See if current outword blob matches any initial blob with the same left
00199       coord. (Normally only one but possibly more - in unknown order) */
00200 
00201     i = 0;
00202     matched = FALSE;
00203     do {
00204       test_blob = initial_it.data_relative (i++);
00205       matched = crude_match_blobs (test_blob, outword_it.data ());
00206       if (matched)
00207         match_count++;
00208     }
00209     while (!matched &&
00210       (init_blobs_left - i > 0) &&
00211       (i < 129) &&
00212       !initial_it.at_last () &&
00213       test_blob->bounding_box ().left () == out_box.left ());
00214   }
00215   delete init_word;
00216   return match_count;
00217 }

void word_char_quality ( WERD_RES word,
ROW row,
INT16 match_count,
INT16 accepted_match_count 
)

Check word's blobs' quality.

Parameters:
word Word in question
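row Row containing the word (passed to make_bln_copy())
match_count Output for caller: number of outword blobs that match an initial blob and have no outline errors
accepted_match_count Output for caller: number of those matches whose reject_map entry is accepted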
Note:
Global: bln_x_height
Returns:
none
Combines the blob quality and outline quality checks to determine how many good chars there are, i.e. chars which pass the blob AND outline tests.

Definition at line 277 of file docqual.cpp.

References WERD_RES::best_choice, WERD::blob_list(), PBLOB::bounding_box(), count_outline_errs(), cprintf(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, WERD_RES::reject_map, DENORM::scale(), and WERD_RES::word.

Referenced by classify_word_pass2(), doc_and_block_rejection(), recog_all_words(), and recog_interactive().

00281                                                     {
00282   WERD *bln_word;                //BL norm init word
00283   TWERD *tessword;               //tess format
00284   WERD *init_word;               //BL norm init word
00285   PBLOB_IT outword_it;
00286   PBLOB_IT initial_it;
00287   INT16 i;
00288   INT16 init_blobs_left;
00289   BOOL8 matched;
00290   BOX out_box;
00291   PBLOB *test_blob;
00292   DENORM denorm;
00293   float bln_xht;
00294   INT16 j = 0;
00295 
00296 #ifdef TEXT_VERBOSE
00297   // gets a 'y', see ccmain/tessvars.doxfg
00298   cprintf("y");
00299 #endif
00300   *match_count = 0;
00301   *accepted_match_count = 0;
00302   if (word->word->gblob_list ()->empty ())
00303     return;
00304 
00305   bln_xht = bln_x_height / word->denorm.scale (); //xht used for blnorm
00306   bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00307   /* NOTE: Need to convert to tess format and back again to ensure that the
00308     same float -> int rounding of coords is done to source wd as out wd before
00309     comparison
00310   */
00311   tessword = make_tess_word (bln_word, NULL);
00312   init_word = make_ed_word (tessword, bln_word); //convert word
00313   delete bln_word;
00314   delete_word(tessword);  //get rid of it
00315   //   tprintf( "SOURCE BLOBS-AFTER TESS:\n");
00316   //   print_boxes( init_word );
00317   //   tprintf( "OUTPUT BLOBS:\n");
00318   //   print_boxes( word->outword );
00319 
00320   initial_it.set_to_list (init_word->blob_list ());
00321   init_blobs_left = initial_it.length ();
00322   outword_it.set_to_list (word->outword->blob_list ());
00323 
00324   for (outword_it.mark_cycle_pt ();
00325   !outword_it.cycled_list (); outword_it.forward ()) {
00326     out_box = outword_it.data ()->bounding_box ();
00327 
00328     /* Skip any initial blobs LEFT of current outword blob */
00329     while (!initial_it.at_last () &&
00330     (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00331       initial_it.forward ();
00332       init_blobs_left--;
00333     }
00334 
00335     /* See if current outword blob matches any initial blob with the same left
00336       coord. (Normally only one but possibly more - in unknown order) */
00337 
00338     i = 0;
00339     matched = FALSE;
00340     do {
00341       test_blob = initial_it.data_relative (i++);
00342       matched = crude_match_blobs (test_blob, outword_it.data ());
00343       if (matched &&
00344         (count_outline_errs (word->best_choice->string ()[j],
00345         outword_it.data ()->out_list ()->length ()) == 0)) {
00346         (*match_count)++;
00347         if (word->reject_map[j].accepted ())
00348           (*accepted_match_count)++;
00349       }
00350     }
00351     while (!matched &&
00352       (init_blobs_left - i > 0) &&
00353       (i < 129) &&
00354       !initial_it.at_last () &&
00355       test_blob->bounding_box ().left () == out_box.left ());
00356     j++;
00357   }
00358   delete init_word;
00359 }

CRUNCH_MODE word_deletable ( WERD_RES word,
INT16 delete_mode 
)

Decide whether a suspicious (crunched) word should be deleted, based on a series of tests.

Parameters:
word Word in question
delete_mode Actually a return value (0..11) for tilde_delete()
Note:
Global:
  • bln_baseline_offset,
  • bln_x_height,
  • crunch_del_min_ht,
  • crunch_del_min_width,
  • crunch_del_max_ht,
  • crunch_del_low_word,
  • crunch_del_high_word,
  • crunch_del_rating,
  • crunch_del_cert.
Returns:
  • CR_DELETE,
  • CR_NONE,
  • CR_LOOSE_SPACE
DELETE WERDS AT ENDS OF ROWS IF
   Word is crunched AND
   ( string length = 0                                          OR
     > 50% of chars are "|" (before merging)                    OR
     certainty < -10                                            OR
     rating /char > 60                                          OR
     TOP of word is more than 0.5 xht BELOW baseline            OR
     BOTTOM of word is more than 0.5 xht ABOVE xht              OR
     length of word < 3xht                                      OR
     height of word < 0.7 xht                                   OR
     height of word > 3.0 xht                                   OR
     >75% of the outline BBs have longest dimension < 0.5xht )

Definition at line 1409 of file docqual.cpp.

References BOX::bottom(), WERD::bounding_box(), CR_DELETE, CR_LOOSE_SPACE, CR_NONE, failure_count(), BOX::height(), REJMAP::length(), noise_outlines(), WERD_RES::outword, WERD_RES::reject_map, BOX::top(), WERD_RES::unlv_crunch_mode, and BOX::width().

Referenced by tilde_delete().

01409                                                                { 
01410   int word_len = word->reject_map.length ();
01411   float rating_per_ch;
01412   BOX box;                       // BB of word
01413 
01414   if (word->unlv_crunch_mode == CR_NONE) {
01415     delete_mode = 0;
01416     return CR_NONE;
01417   }
01418 
01419   if (word_len == 0) {
01420     delete_mode = 1;
01421     return CR_DELETE;
01422   }
01423 
01424   box = word->outword->bounding_box ();
01425   if (box.height () < crunch_del_min_ht * bln_x_height) {
01426     delete_mode = 4;
01427     return CR_DELETE;
01428   }
01429 
01430   if (noise_outlines (word->outword)) {
01431     delete_mode = 5;
01432     return CR_DELETE;
01433   }
01434 
01435   if ((failure_count (word) * 1.5) > word_len) {
01436     delete_mode = 2;
01437     return CR_LOOSE_SPACE;
01438   }
01439 
01440   if (word->best_choice->certainty () < crunch_del_cert) {
01441     delete_mode = 7;
01442     return CR_LOOSE_SPACE;
01443   }
01444 
01445   rating_per_ch = word->best_choice->rating () / word_len;
01446 
01447   if (rating_per_ch > crunch_del_rating) {
01448     delete_mode = 8;
01449     return CR_LOOSE_SPACE;
01450   }
01451 
01452   if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
01453     delete_mode = 9;
01454     return CR_LOOSE_SPACE;
01455   }
01456 
01457   if (box.bottom () >
01458   bln_baseline_offset + crunch_del_high_word * bln_x_height) {
01459     delete_mode = 10;
01460     return CR_LOOSE_SPACE;
01461   }
01462 
01463   if (box.height () > crunch_del_max_ht * bln_x_height) {
01464     delete_mode = 11;
01465     return CR_LOOSE_SPACE;
01466   }
01467 
01468   if (box.width () < crunch_del_min_width * bln_x_height) {
01469     delete_mode = 3;
01470     return CR_LOOSE_SPACE;
01471   }
01472 
01473   delete_mode = 0;
01474   return CR_NONE;
01475 }

INT16 word_outline_errs ( WERD_RES word  ) 

Count errors in word's outlines using count_outline_errs().

Parameters:
word Word
Returns:
count of errors

Definition at line 246 of file docqual.cpp.

References WERD_RES::best_choice, WERD::blob_list(), count_outline_errs(), and WERD_RES::outword.

Referenced by recog_all_words(), and recog_interactive().

00247                                         {
00248   PBLOB_IT outword_it;
00249   INT16 i = 0;
00250   INT16 err_count = 0;
00251 
00252   outword_it.set_to_list (word->outword->blob_list ());
00253 
00254   for (outword_it.mark_cycle_pt ();
00255   !outword_it.cycled_list (); outword_it.forward ()) {
00256     err_count += count_outline_errs (word->best_choice->string ()[i],
00257       outword_it.data ()->out_list ()-> length ());
00258     i++;
00259   }
00260   return err_count;
00261 }


Generated on Wed Feb 28 19:49:14 2007 for Tesseract by  doxygen 1.5.1