#include <gap_map.h>
A block gap map is a quantised histogram of whitespace regions in the block. It is a vertical projection of wide gaps WITHIN lines.
The map is held as an array of counts of rows which have a wide gap covering that region of the row. Each bucket in the map represents a width of about half an xheight; the median of the xhts in the rows is used.
The block is considered RECTANGULAR - delimited by the left and right extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are counted.
Definition at line 25 of file gap_map.h.
GAPMAP::GAPMAP(TO_BLOCK *block)
Constructor.
Definition at line 42 of file gap_map.cpp.
References STATS::add(), alloc_mem(), any_tabs, TO_ROW::blob_list(), box_next(), bucket_size, FALSE, BOX::left(), map, map_max, MAX_INT16, max_right, STATS::median(), min_left, NULL, BOX::right(), total_rows, tprintf(), TRUE, and TO_ROW::xheight.
00044 { 00045 TO_ROW_IT row_it; //row iterator 00046 TO_ROW *row; //current row 00047 BLOBNBOX_IT blob_it; //iterator 00048 BOX blob_box; 00049 BOX prev_blob_box; 00050 INT16 gap_width; 00051 INT16 start_of_row; 00052 INT16 end_of_row; 00053 STATS xht_stats (0, 128); 00054 INT16 min_quantum; 00055 INT16 max_quantum; 00056 INT16 i; 00057 00058 row_it.set_to_list (block->get_rows ()); 00059 /* 00060 Find left and right extremes and bucket size 00061 */ 00062 map = NULL; 00063 min_left = MAX_INT16; 00064 max_right = -MAX_INT16; 00065 total_rows = 0; 00066 any_tabs = FALSE; 00067 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00068 row = row_it.data (); 00069 if (!row->blob_list ()->empty ()) { 00070 total_rows++; 00071 xht_stats.add ((INT16) floor (row->xheight + 0.5), 1); 00072 blob_it.set_to_list (row->blob_list ()); 00073 start_of_row = blob_it.data ()->bounding_box ().left (); 00074 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00075 if (min_left > start_of_row) 00076 min_left = start_of_row; 00077 if (max_right < end_of_row) 00078 max_right = end_of_row; 00079 } 00080 } 00081 if ((total_rows < 3) || (min_left >= max_right)) { 00082 total_rows = 0; 00083 min_left = max_right = 0; 00084 return; 00085 } 00086 bucket_size = (INT16) floor (xht_stats.median () + 0.5) / 2; 00087 map_max = (max_right - min_left) / bucket_size; 00088 map = (INT16 *) alloc_mem ((map_max + 1) * sizeof (INT16)); 00089 for (i = 0; i <= map_max; i++) 00090 map[i] = 0; 00091 00092 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00093 row = row_it.data (); 00094 if (!row->blob_list ()->empty ()) { 00095 blob_it.set_to_list (row->blob_list ()); 00096 blob_it.mark_cycle_pt (); 00097 blob_box = box_next (&blob_it); 00098 prev_blob_box = blob_box; 00099 if (gapmap_use_ends) { 00100 /* Leading space */ 00101 gap_width = blob_box.left () - min_left; 00102 if ((gap_width > gapmap_big_gaps * row->xheight) 00103 && gap_width > 2) 
{ 00104 max_quantum = (blob_box.left () - min_left) / bucket_size; 00105 for (i = 0; i <= max_quantum; i++) 00106 map[i]++; 00107 } 00108 } 00109 while (!blob_it.cycled_list ()) { 00110 blob_box = box_next (&blob_it); 00111 gap_width = blob_box.left () - prev_blob_box.right (); 00112 if ((gap_width > gapmap_big_gaps * row->xheight) 00113 && gap_width > 2) { 00114 min_quantum = 00115 (prev_blob_box.right () - min_left) / bucket_size; 00116 max_quantum = (blob_box.left () - min_left) / bucket_size; 00117 for (i = min_quantum; i <= max_quantum; i++) 00118 map[i]++; 00119 } 00120 prev_blob_box = blob_box; 00121 } 00122 if (gapmap_use_ends) { 00123 /* Trailing space */ 00124 gap_width = max_right - prev_blob_box.right (); 00125 if ((gap_width > gapmap_big_gaps * row->xheight) 00126 && gap_width > 2) { 00127 min_quantum = 00128 (prev_blob_box.right () - min_left) / bucket_size; 00129 for (i = min_quantum; i <= map_max; i++) 00130 map[i]++; 00131 } 00132 } 00133 } 00134 } 00135 for (i = 0; i <= map_max; i++) { 00136 if (map[i] > total_rows / 2) { 00137 if (gapmap_no_isolated_quanta && 00138 (((i == 0) && 00139 (map[i + 1] <= total_rows / 2)) || 00140 ((i == map_max) && 00141 (map[i - 1] <= total_rows / 2)) || 00142 ((i > 0) && 00143 (i < map_max) && 00144 (map[i - 1] <= total_rows / 2) && 00145 (map[i + 1] <= total_rows / 2)))) { 00146 map[i] = 0; //prevent isolated quantum 00147 } 00148 else 00149 any_tabs = TRUE; 00150 } 00151 } 00152 if (gapmap_debug && any_tabs) 00153 tprintf ("Table found\n"); 00154 }
GAPMAP::~GAPMAP() [inline]
Is gap a table?
Is there a bucket in the specified range where more than half the rows in the block have a wide gap?
Definition at line 163 of file gap_map.cpp.
References any_tabs, bucket_size, FALSE, map, min_left, total_rows, and TRUE.
Referenced by ignore_big_gap().
00166 { 00167 INT16 min_quantum; 00168 INT16 max_quantum; 00169 INT16 i; 00170 BOOL8 tab_found = FALSE; 00171 00172 if (!any_tabs) 00173 return FALSE; 00174 00175 min_quantum = (left - min_left) / bucket_size; 00176 max_quantum = (right - min_left) / bucket_size; 00177 for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) 00178 if (map[i] > total_rows / 2) 00179 tab_found = TRUE; 00180 return tab_found; 00181 }
BOOL8 GAPMAP::any_tabs [private]
INT16 GAPMAP::bucket_size [private]
INT16* GAPMAP::map [private]
INT16 GAPMAP::map_max [private]
INT16 GAPMAP::max_right [private]
INT16 GAPMAP::min_left [private]
INT16 GAPMAP::total_rows [private]