00001
00021
00022
00023
00024 #include "chopper.h"
00025 #include "wordclass.h"
00026 #include "makechop.h"
00027 #include "associate.h"
00028 #include "metrics.h"
00029 #include "tordvars.h"
00030 #include "stopper.h"
00031 #include "callcpp.h"
00032 #include "structures.h"
00033 #include "findseam.h"
00034 #include "render.h"
00035 #include "seam.h"
00036 #include "const.h"
00037 #include "freelist.h"
00038 #include "pieces.h"
00039 #include "permute.h"
00040
00041
00042 #include <math.h>
00043
/* Defined in another file.  The retry loops in improve_one_blob and
 * improve_by_chopping stop iterating once this is non-zero. */
extern int blob_skip;
/* When non-zero, attempt_blob_chop marks the blob's outlines before trying a
 * seam and restores them if the chop is rejected.  Default is 1. */
INT_VAR (repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");


/* select_blob_to_split only considers blobs whose best certainty is below
 * this value.  Default is -2.25. */
double_VAR (tessedit_certainty_threshold, -2.25, "Good blob limit");
00053
00054
00055
/**********************************************************************
 * bounds_inside
 *
 * Evaluate to non-zero when the box described by (inner_tl, inner_br)
 * lies within, or coincides with, the box (outer_tl, outer_br).  The
 * comparisons require the inner "tl" point to have x >= and y <= the
 * outer "tl" point, and the inner "br" point to have x <= and y >= the
 * outer "br" point.
 *
 * Every argument is parenthesized so that expressions such as *ptr can
 * be passed safely.  Each argument is evaluated exactly twice at most;
 * avoid arguments with side effects.
 **********************************************************************/
#define bounds_inside(inner_tl,inner_br,outer_tl,outer_br)  \
  (((inner_tl).x >= (outer_tl).x) &&                        \
   ((inner_tl).y <= (outer_tl).y) &&                        \
   ((inner_br).x <= (outer_br).x) &&                        \
   ((inner_br).y >= (outer_br).y))
00066
/**********************************************************************
 * set_null_choice
 *
 * Reset a choice record in one comma expression: its string becomes
 * NULL, its probability MAX_FLOAT32 and its certainty -MAX_FLOAT32.
 **********************************************************************/
#define set_null_choice(choice)                  \
  (class_string(choice) = NULL,                  \
   class_probability(choice) = MAX_FLOAT32,      \
   class_certainty(choice) = -MAX_FLOAT32)
00074
00075
00076
00077
00081 void preserve_outline(EDGEPT *start) {
00082 EDGEPT *srcpt;
00083
00084 if (start == NULL)
00085 return;
00086 srcpt = start;
00087 do {
00088 srcpt->flags[1] = 1;
00089 srcpt = srcpt->next;
00090 }
00091 while (srcpt != start);
00092 srcpt->flags[1] = 2;
00093 }
00094
00095
00096
00102 void preserve_outline_tree(TESSLINE *srcline) {
00103 TESSLINE *outline;
00104
00105 for (outline = srcline; outline != NULL; outline = outline->next) {
00106 preserve_outline (outline->loop);
00107 }
00108 if (srcline->child != NULL)
00109 preserve_outline_tree (srcline->child);
00110 }
00111
00112
00113
00117 EDGEPT *restore_outline(EDGEPT *start) {
00118 EDGEPT *srcpt;
00119 EDGEPT *real_start;
00120 EDGEPT *deadpt;
00121
00122 if (start == NULL)
00123 return NULL;
00124 srcpt = start;
00125 do {
00126 if (srcpt->flags[1] == 2)
00127 break;
00128 srcpt = srcpt->next;
00129 }
00130 while (srcpt != start);
00131 real_start = srcpt;
00132 do {
00133 if (srcpt->flags[1] == 0) {
00134 deadpt = srcpt;
00135 srcpt = srcpt->next;
00136 srcpt->prev = deadpt->prev;
00137 deadpt->prev->next = srcpt;
00138 deadpt->prev->vec.x = srcpt->pos.x - deadpt->prev->pos.x;
00139 deadpt->prev->vec.y = srcpt->pos.y - deadpt->prev->pos.y;
00140 oldedgept(deadpt);
00141 }
00142 else
00143 srcpt = srcpt->next;
00144 }
00145 while (srcpt != real_start);
00146 return real_start;
00147 }
00148
00149
00150
00156 void restore_outline_tree(TESSLINE *srcline) {
00157 TESSLINE *outline;
00158
00159 for (outline = srcline; outline != NULL; outline = outline->next) {
00160 outline->loop = restore_outline (outline->loop);
00161 outline->start = outline->loop->pos;
00162 }
00163 if (srcline->child != NULL)
00164 restore_outline_tree (srcline->child);
00165 }
00166
00167
00168
00181 SEAM *attempt_blob_chop(TWERD *word, INT32 blob_number, SEAMS seam_list) {
00182 TBLOB *blob;
00183 TBLOB *other_blob;
00184 SEAM *seam;
00185 TBLOB *last_blob;
00186 TBLOB *next_blob;
00187 INT16 x;
00188
00189 if (first_pass)
00190 chops_attempted1++;
00191 else
00192 chops_attempted2++;
00193
00194 last_blob = NULL;
00195 blob = word->blobs;
00196 for (x = 0; x < blob_number; x++) {
00197 last_blob = blob;
00198 blob = blob->next;
00199 }
00200 next_blob = blob->next;
00201
00202 if (repair_unchopped_blobs)
00203 preserve_outline_tree (blob->outlines);
00204 other_blob = newblob ();
00205 other_blob->next = blob->next;
00206 other_blob->outlines = NULL;
00207 blob->next = other_blob;
00208
00209 seam = pick_good_seam (blob);
00210
00211 #define TEXT_PROGRESS 1
00212 #ifdef TEXT_PROGRESS
00213 #ifndef TEXT_VERBOSE
00214
00215 if (seam != NULL && !chop_debug)
00216 cprintf(".");
00217 #else
00218
00219 if (seam != NULL && !chop_debug)
00220 cprintf("s");
00221 #endif
00222 #endif
00223 if (chop_debug) {
00224 if (seam != NULL) {
00225 print_seam ("Good seam picked=", seam);
00226 }
00227 else
00228 cprintf ("\n** no seam picked *** \n");
00229 }
00230 if (seam) {
00231 apply_seam(blob, other_blob, seam);
00232 }
00233
00234 if ((seam == NULL) ||
00235 (blob->outlines == NULL) ||
00236 (other_blob->outlines == NULL) ||
00237 total_containment (blob, other_blob) ||
00238 check_blob (other_blob) ||
00239 !(check_seam_order (blob, seam) &&
00240 check_seam_order (other_blob, seam)) ||
00241 any_shared_split_points (seam_list, seam) ||
00242 !test_insert_seam(seam_list, blob_number, blob, word->blobs)) {
00243
00244 blob->next = next_blob;
00245 if (seam) {
00246 undo_seam(blob, other_blob, seam);
00247 delete_seam(seam);
00248 #ifndef GRAPHICS_DISABLED
00249 if (chop_debug) {
00250 display_blob(blob, Red);
00251 cprintf ("\n** seam being removed ** \n");
00252 }
00253 #endif
00254 }
00255 else {
00256 oldblob(other_blob);
00257 }
00258
00259 if (repair_unchopped_blobs)
00260 restore_outline_tree (blob->outlines);
00261 return (NULL);
00262 }
00263 return (seam);
00264 }
00265
00266
00267
00271 int any_shared_split_points(SEAMS seam_list, SEAM *seam) {
00272 int length;
00273 int index;
00274
00275 length = array_count (seam_list);
00276 for (index = 0; index < length; index++)
00277 if (shared_split_points ((SEAM *) array_value (seam_list, index), seam))
00278 return TRUE;
00279 return FALSE;
00280 }
00281
00282
00283
00287 int check_blob(TBLOB *blob) {
00288 TESSLINE *outline;
00289 EDGEPT *edgept;
00290
00291 for (outline = blob->outlines; outline != NULL; outline = outline->next) {
00292 edgept = outline->loop;
00293 do {
00294 if (edgept == NULL)
00295 break;
00296 edgept = edgept->next;
00297 }
00298 while (edgept != outline->loop);
00299 if (edgept == NULL)
00300 return 1;
00301 }
00302 return 0;
00303 }
00304
00305
00306
00313 CHOICES_LIST improve_one_blob(TWERD *word,
00314 CHOICES_LIST char_choices,
00315 int fx,
00316 INT32 *blob_number,
00317 SEAMS *seam_list,
00318 DANGERR *fixpt,
00319 STATE *this_state,
00320 STATE *correct_state,
00321 INT32 pass) {
00322 TBLOB *pblob;
00323 TBLOB *blob;
00324 INT16 x = 0;
00325 float rating_ceiling = MAX_FLOAT32;
00326 CHOICES answer;
00327 SEAM *seam;
00328
00329 do {
00330 *blob_number = select_blob_to_split (char_choices, rating_ceiling);
00331 if (*blob_number == -1)
00332 return (NULL);
00333
00334 seam = attempt_blob_chop (word, *blob_number, *seam_list);
00335 if (seam != NULL)
00336 break;
00337
00338 answer = (CHOICES) array_value (char_choices, *blob_number);
00339 if (answer == NIL)
00340 return (NULL);
00341 rating_ceiling = best_probability (answer);
00342 }
00343 while (!blob_skip);
00344
00345 for (blob = word->blobs, pblob = NULL; x < *blob_number; x++) {
00346 pblob = blob;
00347 blob = blob->next;
00348 }
00349
00350 *seam_list =
00351 insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);
00352
00353 free_choices ((CHOICES) array_value (char_choices, *blob_number));
00354
00355 answer =
00356 classify_blob (pblob, blob, blob->next, NULL, fx, "improve 1:", Red,
00357 this_state, correct_state, pass, *blob_number);
00358 char_choices = array_insert (char_choices, *blob_number, answer);
00359
00360 answer =
00361 classify_blob (blob, blob->next, blob->next->next, NULL, fx, "improve 2:",
00362 Yellow, this_state, correct_state, pass, *blob_number + 1);
00363 array_value (char_choices, *blob_number + 1) = (char *) answer;
00364
00365 return (char_choices);
00366 }
00367
00368
00369
00379 INT16 check_seam_order(TBLOB *blob, SEAM *seam) {
00380 TESSLINE *outline;
00381 TESSLINE *last_outline;
00382 INT8 found_em[3];
00383
00384 if (seam->split1 == NULL || seam->split1 == NULL || blob == NULL)
00385 return (TRUE);
00386
00387 found_em[0] = found_em[1] = found_em[2] = FALSE;
00388
00389 for (outline = blob->outlines; outline; outline = outline->next) {
00390 if (!found_em[0] &&
00391 ((seam->split1 == NULL) ||
00392 is_split_outline (outline, seam->split1))) {
00393 found_em[0] = TRUE;
00394 }
00395 if (!found_em[1] &&
00396 ((seam->split2 == NULL) ||
00397 is_split_outline (outline, seam->split2))) {
00398 found_em[1] = TRUE;
00399 }
00400 if (!found_em[2] &&
00401 ((seam->split3 == NULL) ||
00402 is_split_outline (outline, seam->split3))) {
00403 found_em[2] = TRUE;
00404 }
00405 last_outline = outline;
00406 }
00407
00408 if (!found_em[0] || !found_em[1] || !found_em[2])
00409 return (FALSE);
00410 else
00411 return (TRUE);
00412 }
00413
00414
00415
00437 CHOICES_LIST chop_word_main(register TWERD *word,
00438 int fx,
00439 A_CHOICE *best_choice,
00440 A_CHOICE *raw_choice,
00441 BOOL8 tester,
00442 BOOL8 trainer) {
00443 TBLOB *pblob;
00444 TBLOB *blob;
00445 CHOICES_LIST char_choices;
00446 int index;
00447 int did_chopping;
00448 float rating_limit = 1000.0;
00449 STATE state;
00450 SEAMS seam_list = NULL;
00451 CHOICES match_result;
00452 MATRIX ratings = NULL;
00453 DANGERR fixpt;
00454 INT32 state_count;
00455 INT32 bit_count;
00456 static STATE best_state;
00457 static STATE chop_states[64];
00458
00459 state_count = 0;
00460 set_null_choice(best_choice);
00461 set_null_choice(raw_choice);
00462
00463 char_choices = new_choice_list ();
00464
00465 did_chopping = 0;
00466 for (blob = word->blobs, pblob = NULL, index = 0; blob != NULL;
00467 blob = blob->next, index++) {
00468 match_result =
00469 (CHOICES) classify_blob (pblob, blob, blob->next, NULL, fx,
00470 "chop_word:", Green, &chop_states[0],
00471 &best_state, matcher_pass, index);
00472 char_choices = array_push (char_choices, match_result);
00473 pblob = blob;
00474 }
00475 bit_count = index - 1;
00476 permute_characters(char_choices, rating_limit, best_choice, raw_choice);
00477
00478 set_n_ones (&state, array_count (char_choices) - 1);
00479 if (matcher_fp != NULL) {
00480 if (matcher_pass == 0) {
00481 bits_in_states = bit_count;
00482 chop_states[state_count] = state;
00483 }
00484 state_count++;
00485 }
00486
00487 if (!AcceptableChoice (char_choices, best_choice, raw_choice, &fixpt)
00488 || (tester || trainer)
00489 && strcmp (word->correct, class_string (best_choice))) {
00490 did_chopping = 1;
00491 if (first_pass)
00492 words_chopped1++;
00493 else
00494 words_chopped2++;
00495
00496 seam_list = start_seam_list (word->blobs);
00497
00498 if (chop_enable)
00499 improve_by_chopping(word,
00500 &char_choices,
00501 fx,
00502 &state,
00503 best_choice,
00504 raw_choice,
00505 &seam_list,
00506 &fixpt,
00507 chop_states,
00508 &state_count,
00509 &best_state,
00510 matcher_pass);
00511
00512 if (chop_debug)
00513 print_seams ("Final seam list:", seam_list);
00514
00515 if (enable_assoc &&
00516 !AcceptableChoice (char_choices, best_choice, raw_choice, NULL)
00517 || (tester || trainer)
00518 && strcmp (word->correct, class_string (best_choice))) {
00519 ratings = word_associator (word->blobs, seam_list, &state, fx,
00520 best_choice, raw_choice, word->correct,
00521 &fixpt,
00522 &best_state, matcher_pass);
00523 }
00524 bits_in_states = bit_count + state_count - 1;
00525
00526 }
00527 if (ratings != NULL)
00528 free_matrix(ratings);
00529 if (did_chopping || tester || trainer)
00530 char_choices = rebuild_current_state (word->blobs, seam_list, &state,
00531 char_choices, fx);
00532 if (seam_list != NULL)
00533 free_seam_list(seam_list);
00534 if (matcher_fp != NULL) {
00535 best_state = state;
00536 }
00537 FilterWordChoices();
00538 return char_choices;
00539 }
00540
00541
00542
00568 void improve_by_chopping(register TWERD *word,
00569 CHOICES_LIST *char_choices,
00570 int fx,
00571 STATE *best_state,
00572 A_CHOICE *best_choice,
00573 A_CHOICE *raw_choice,
00574 SEAMS *seam_list,
00575 DANGERR *fixpt,
00576 STATE *chop_states,
00577 INT32 *state_count,
00578 STATE *correct_state,
00579 INT32 pass) {
00580 INT32 blob_number;
00581 INT32 index;
00582 CHOICES_LIST choices = *char_choices;
00583 float old_best;
00584 int fixpt_valid = 1;
00585 static INT32 old_count;
00586
00587 do {
00588 if (!fixpt_valid)
00589 fixpt->index = -1;
00590 old_best = class_probability (best_choice);
00591 choices = improve_one_blob (word, *char_choices, fx,
00592 &blob_number, seam_list, fixpt,
00593 chop_states + *state_count, correct_state,
00594 pass);
00595 if (choices != NULL) {
00596 LogNewSplit(blob_number);
00597 permute_characters (choices,
00598 class_probability (best_choice),
00599 best_choice, raw_choice);
00600 *char_choices = choices;
00601
00602 if (old_best > class_probability (best_choice)) {
00603 set_n_ones (best_state, array_count (*char_choices) - 1);
00604 fixpt_valid = 1;
00605 }
00606 else {
00607 insert_new_chunk (best_state, blob_number,
00608 array_count (*char_choices) - 2);
00609 fixpt_valid = 0;
00610 }
00611 if (*state_count > 0) {
00612 if (pass == 0) {
00613 for (index = 0; index < *state_count; index++)
00614 insert_new_chunk (&chop_states[index], blob_number,
00615 array_count (*char_choices) - 2);
00616 set_n_ones (&chop_states[index],
00617 array_count (*char_choices) - 1);
00618 }
00619 (*state_count)++;
00620 }
00621
00622 if (chop_debug)
00623 print_state ("best state = ",
00624 best_state, count_blobs (word->blobs) - 1);
00625 if (first_pass)
00626 chops_performed1++;
00627 else
00628 chops_performed2++;
00629
00630 }
00631 }
00632 while (choices &&
00633 !AcceptableChoice (*char_choices, best_choice, raw_choice, fixpt) &&
00634 !blob_skip && array_count (*char_choices) < MAX_NUM_CHUNKS);
00635 if (pass == 0)
00636 old_count = *state_count;
00637 else {
00638 if (old_count != *state_count)
00639 fprintf (matcher_fp,
00640 "Mis-matched state counts, " INT32FORMAT " pass1, "
00641 INT32FORMAT " pass2\n", old_count, *state_count);
00642 }
00643 if (!fixpt_valid)
00644 fixpt->index = -1;
00645 }
00646
00647
00648
00654 INT16 select_blob_to_split(CHOICES_LIST char_choices, float rating_ceiling) {
00655 CHOICES this_choice;
00656 int x;
00657 float worst = -MAX_FLOAT32;
00658 int worst_index = -1;
00659
00660 if (chop_debug)
00661 if (rating_ceiling < MAX_FLOAT32)
00662 cprintf ("rating_ceiling = %8.4f\n", rating_ceiling);
00663 else
00664 cprintf ("rating_ceiling = No Limit\n");
00665
00666 for_each_choice(char_choices, x) {
00667 this_choice = (CHOICES) array_value (char_choices, x);
00668 if (this_choice == NIL) {
00669 return (x);
00670 }
00671 else {
00672 if (best_probability (this_choice) > worst &&
00673 best_probability (this_choice) < rating_ceiling &&
00674 best_certainty (this_choice) < tessedit_certainty_threshold) {
00675 worst_index = x;
00676 worst = best_probability (this_choice);
00677 }
00678 }
00679 }
00680
00681 if (chop_debug)
00682 cprintf ("blob_number = %4d\n", worst_index);
00683
00684 return (worst_index);
00685 }
00686
00687
00688
00700 SEAMS start_seam_list(TBLOB *blobs) {
00701 TBLOB *blob;
00702 SEAMS seam_list;
00703 TPOINT topleft;
00704 TPOINT botright;
00705 int location;
00706
00707 seam_list = new_seam_list ();
00708
00709 for (blob = blobs; blob->next != NULL; blob = blob->next) {
00710
00711 blob_bounding_box(blob, &topleft, &botright);
00712 location = botright.x;
00713 blob_bounding_box (blob->next, &topleft, &botright);
00714 location += topleft.x;
00715 location /= 2;
00716
00717 seam_list = add_seam (seam_list,
00718 new_seam (0.0, location, NULL, NULL, NULL));
00719 }
00720
00721 return (seam_list);
00722 }
00723
00724
00725
00730 INT16 total_containment(TBLOB *blob1, TBLOB *blob2) {
00731 TPOINT topleft1;
00732 TPOINT botright1;
00733 TPOINT topleft2;
00734 TPOINT botright2;
00735
00736 blob_bounding_box(blob1, &topleft1, &botright1);
00737 blob_bounding_box(blob2, &topleft2, &botright2);
00738
00739 return (bounds_inside (topleft1, botright1, topleft2, botright2) ||
00740 bounds_inside (topleft2, botright2, topleft1, botright1));
00741 }
00742
00743
00744
00749 MATRIX word_associator(TBLOB *blobs,
00750 SEAMS seams,
00751 STATE *state,
00752 int fxid,
00753 A_CHOICE *best_choice,
00754 A_CHOICE *raw_choice,
00755 char *correct,
00756 DANGERR *fixpt,
00757 STATE *best_state,
00758 INT32 pass) {
00759 CHUNKS_RECORD chunks_record;
00760 BLOB_WEIGHTS blob_weights;
00761 int x;
00762 int num_chunks;
00763 A_CHOICE *this_choice;
00764
00765 num_chunks = array_count (seams) + 1;
00766
00767 chunks_record.chunks = blobs;
00768 chunks_record.splits = seams;
00769 chunks_record.ratings = record_piece_ratings (blobs);
00770 chunks_record.char_widths = blobs_widths (blobs);
00771 chunks_record.chunk_widths = blobs_widths (blobs);
00772 chunks_record.fx = fxid;
00773
00774 for (x = 0; x < num_chunks; x++) {
00775 this_choice =
00776 (A_CHOICE *) first (matrix_get (chunks_record.ratings, x, x));
00777 blob_weights[x] = -(INT16) (10 * class_probability (this_choice) /
00778 class_certainty (this_choice));
00779 }
00780 chunks_record.weights = blob_weights;
00781
00782 if (chop_debug)
00783 print_matrix (chunks_record.ratings);
00784 best_first_search(&chunks_record,
00785 best_choice,
00786 raw_choice,
00787 state,
00788 fixpt,
00789 best_state,
00790 pass);
00791
00792 free_widths (chunks_record.chunk_widths);
00793 free_widths (chunks_record.char_widths);
00794 return chunks_record.ratings;
00795 }