Tesseract: ccmain/baseapi.cpp Source File

00001 /**********************************************************************
00002  * File:        baseapi.cpp
00003  * Description: Simple API for calling tesseract.
00004  * Author:      Ray Smith
00005  * Created:     Fri Oct 06 15:35:01 PDT 2006
00006  *
00007  * (C) Copyright 2006, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "baseapi.h"
00021 
00022 #include "tessedit.h"
00023 #include "pageres.h"
00024 #include "tessvars.h"
00025 #include "control.h"
00026 #include "applybox.h"
00027 #include "pgedit.h"
00028 #include "varabled.h"
00029 #include "adaptmatch.h"
00030 
// When set, Recognize() calls apply_boxes() on the block list before
// building the page results.
BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
         "Take segmentation and labeling from box file");
// When set (and interactive_mode is off), Recognize() calls
// apply_box_training() instead of the normal recognition pass.
BOOL_VAR(tessedit_train_from_boxes, FALSE,
         "Generate training data from boxed chars");

// Minimum sensible image size to be worth running tesseract.
// TesseractRect() returns NULL for rectangles whose width or height is
// smaller than this, unless TEXT_VERBOSE is defined.
const int kMinRectSize = 10;
00038 
00039 // Start tesseract.
00040 // The datapath must be the name of the data directory or some other file
00041 // in which the data directory resides (for instance argv[0].)
00042 // The configfile is the name of a file in the tessconfigs directory
00043 // (eg batch) or NULL to run on defaults.
00044 // Outputbase may also be NULL, and is the basename of various output files.
00045 // If the output of any of these files is enabled, then a name nmust be given.
00046 // If numeric_mode is true, only possible digits and roman numbers are
00047 // returned. Returns 0 if successful. Crashes if not.
00048 // The argc and argv may be 0 and NULL respectively. They are used for
00049 // providing config files for debug/display purposes.
00050 // TODO(rays) get the facts straight. Is it OK to call
00051 // it more than once? Make it properly check for errors and return them.
00052 int TessBaseAPI::Init(const char* datapath, const char* outputbase,
00053                       const char* configfile, bool numeric_mode,
00054                       int argc, char* argv[]) {
00055   int result = init_tesseract(datapath, outputbase, configfile, argc, argv);
00056   bln_numericmode.set_value(numeric_mode);
00057   return result;
00058 }
00059 
00060 // Recognize a rectangle from an image and return the result as a string.
00061 // May be called many times for a single Init.
00062 // Currently has no error checking.
00063 // Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
00064 // Palette color images will not work properly and must be converted to
00065 // 24 bit.
00066 // Binary images of 1 bit per pixel may also be given but they must be
00067 // byte packed with the MSB of the first byte being the first pixel, and a
00068 // one pixel is WHITE. For binary images set bytes_per_pixel=0.
00069 // The recognized text is returned as a char* which (in future will be coded
00070 // as UTF8 and) must be freed with the delete [] operator.
00071 char* TessBaseAPI::TesseractRect(const UINT8* imagedata,
00072                                  int bytes_per_pixel,
00073                                  int bytes_per_line,
00074                                  int left, int top,
00075                                  int width, int height) {
00076 #ifndef TEXT_VERBOSE
00077   if (width < kMinRectSize || height < kMinRectSize)
00078     return NULL;  // Nothing worth doing.
00079 #endif // TEXT_VERBOSE - useful for debugging single characters!
00080 
00081   // Copy/Threshold the image to the tesseract global page_image.
00082   CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
00083                        left, top, width, height);
00084 
00085   return RecognizeToString();
00086 }
00087 
00088 // Call between pages or documents etc to free up memory and forget
00089 // adaptive data.
00090 void TessBaseAPI::ClearAdaptiveClassifier() {
00091   ResetAdaptiveClassifier();
00092 }
00093 
00094 // Close down tesseract and free up memory.
00095 void TessBaseAPI::End() {
00096   ResetAdaptiveClassifier();
00097   end_tesseract();
00098 }
00099 
00100 // Dump the internal binary image to a PGM file.
00101 void TessBaseAPI::DumpPGM(const char* filename) {
00102   IMAGELINE line;
00103   line.init(page_image.get_xsize());
00104   FILE *fp = fopen(filename, "w");
00105   fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n", page_image.get_xsize(),
00106           page_image.get_ysize());
00107   for (int j = page_image.get_ysize()-1; j >= 0 ; --j) {
00108     page_image.get_line(0, j, page_image.get_xsize(), &line, 0);
00109     for (int i = 0; i < page_image.get_xsize(); ++i) {
00110       UINT8 b = line.pixels[i] ? 255 : 0;
00111       fwrite(&b, 1, 1, fp);
00112     }
00113   }
00114   fclose(fp);
00115 }
00116 
00117 // Copy the given image rectangle to Tesseract, with adaptive thresholding
00118 // if the image is not already binary.
00119 void TessBaseAPI::CopyImageToTesseract(const UINT8* imagedata,
00120                                        int bytes_per_pixel,
00121                                        int bytes_per_line,
00122                                        int left, int top,
00123                                        int width, int height) {
00124   if (bytes_per_pixel > 0) {
00125     // Threshold grey or color.
00126     int* thresholds = new int[bytes_per_pixel];
00127     int* hi_values = new int[bytes_per_pixel];
00128 
00129     // Compute the thresholds.
00130     OtsuThreshold(imagedata, bytes_per_pixel, bytes_per_line,
00131                   left, top, left + width, top + height,
00132                   thresholds, hi_values);
00133 
00134     // Threshold the image to the tesseract global page_image.
00135     ThresholdRect(imagedata, bytes_per_pixel, bytes_per_line,
00136                   left, top, width, height,
00137                   thresholds, hi_values);
00138     delete [] thresholds;
00139     delete [] hi_values;
00140   } else {
00141     CopyBinaryRect(imagedata, bytes_per_line, left, top, width, height);
00142   }
00143 }
00144 
00145 // Compute the Otsu threshold(s) for the given image rectangle, making one
00146 // for each channel. Each channel is always one byte per pixel.
00147 // Returns an array of threshold values and an array of hi_values, such
00148 // that a pixel value >threshold[channel] is considered foreground if
00149 // hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates
00150 // that there is no apparent foreground. At least one hi_value will not be -1.
00151 // thresholds and hi_values are assumed to be of bytes_per_pixel size.
00152 void TessBaseAPI::OtsuThreshold(const UINT8* imagedata,
00153                                 int bytes_per_pixel,
00154                                 int bytes_per_line,
00155                                 int left, int top, int right, int bottom,
00156                                 int* thresholds,
00157                                 int* hi_values) {
00158   // Of all channels with no good hi_value, keep the best so we can always
00159   // produce at least one answer.
00160   int best_hi_value = 0;
00161   int best_hi_index = 0;
00162   bool any_good_hivalue = false;
00163   double best_hi_dist = 0.0;
00164 
00165   for (int ch = 0; ch < bytes_per_pixel; ++ch) {
00166     thresholds[ch] = 0;
00167     hi_values[ch] = -1;
00168     // Compute the histogram of the image rectangle.
00169     int histogram[256];
00170     HistogramRect(imagedata + ch, bytes_per_pixel, bytes_per_line,
00171                   left, top, right, bottom, histogram);
00172     int H;
00173     int best_omega_0;
00174     int best_t = OtsuStats(histogram, &H, &best_omega_0);
00175     // To be a convincing foreground we must have a small fraction of H
00176     // or to be a convincing background we must have a large fraction of H.
00177     // In between we assume this channel contains no thresholding information.
00178     int hi_value = best_omega_0 < H * 0.5;
00179     thresholds[ch] = best_t;
00180     if (best_omega_0 > H * 0.75) {
00181       any_good_hivalue = true;
00182       hi_values[ch] = 0;
00183     }
00184     else if (best_omega_0 < H * 0.25) {
00185       any_good_hivalue = true;
00186       hi_values[ch] = 1;
00187     }
00188     else {
00189       // In case all channels are like this, keep the best of the bad lot.
00190       double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
00191       if (hi_dist > best_hi_dist) {
00192         best_hi_dist = hi_dist;
00193         best_hi_value = hi_value;
00194         best_hi_index = ch;
00195       }
00196     }
00197   }
00198   if (!any_good_hivalue) {
00199     // Use the best of the ones that were not good enough.
00200     hi_values[best_hi_index] = best_hi_value;
00201   }
00202 }
00203 
00204 // Compute the histogram for the given image rectangle, and the given
00205 // channel. (Channel pointed to by imagedata.) Each channel is always
00206 // one byte per pixel.
00207 // Bytes per pixel is used to skip channels not being
00208 // counted with this call in a multi-channel (pixel-major) image.
00209 // Histogram is always a 256 element array to count occurrences of
00210 // each pixel value.
00211 void TessBaseAPI::HistogramRect(const UINT8* imagedata,
00212                                 int bytes_per_pixel,
00213                                 int bytes_per_line,
00214                                 int left, int top, int right, int bottom,
00215                                 int* histogram) {
00216   int width = right - left;
00217   memset(histogram, 0, sizeof(*histogram) * 256);
00218   const UINT8* pix = imagedata +
00219                      top*bytes_per_line +
00220                      left*bytes_per_pixel;
00221   for (int y = top; y < bottom; ++y) {
00222     for (int x = 0; x < width; ++x) {
00223       ++histogram[pix[x * bytes_per_pixel]];
00224     }
00225     pix += bytes_per_line;
00226   }
00227 }
00228 
00229 // Compute the Otsu threshold(s) for the given histogram.
00230 // Also returns H = total count in histogram, and
00231 // omega0 = count of histogram below threshold.
00232 int TessBaseAPI::OtsuStats(const int* histogram,
00233                            int* H_out,
00234                            int* omega0_out) {
00235   int H = 0;
00236   double mu_T = 0.0;
00237   for (int i = 0; i < 256; ++i) {
00238     H += histogram[i];
00239     mu_T += i * histogram[i];
00240   }
00241 
00242   // Now maximize sig_sq_B over t.
00243   // http://www.ctie.monash.edu.au/hargreave/Cornall_Terry_328.pdf
00244   int best_t = -1;
00245   int omega_0, omega_1;
00246   int best_omega_0 = 0;
00247   double best_sig_sq_B = 0.0;
00248   double mu_0, mu_1, mu_t;
00249   omega_0 = 0;
00250   mu_t = 0.0;
00251   for (int t = 0; t < 255; ++t) {
00252     omega_0 += histogram[t];
00253     mu_t += t * static_cast<double>(histogram[t]);
00254     if (omega_0 == 0)
00255       continue;
00256     omega_1 = H - omega_0;
00257     mu_0 = mu_t / omega_0;
00258     mu_1 = (mu_T - mu_t) / omega_1;
00259     double sig_sq_B = mu_1 - mu_0;
00260     sig_sq_B *= sig_sq_B * omega_0 * omega_1;
00261     if (best_t < 0 || sig_sq_B > best_sig_sq_B) {
00262       best_sig_sq_B = sig_sq_B;
00263       best_t = t;
00264       best_omega_0 = omega_0;
00265     }
00266   }
00267   if (H_out != NULL) *H_out = H;
00268   if (omega0_out != NULL) *omega0_out = best_omega_0;
00269   return best_t;
00270 }
00271 
00272 // Threshold the given grey or color image into the tesseract global
00273 // image ready for recognition. Requires thresholds and hi_value
00274 // produced by OtsuThreshold above.
00275 void TessBaseAPI::ThresholdRect(const UINT8* imagedata,
00276                                 int bytes_per_pixel,
00277                                 int bytes_per_line,
00278                                 int left, int top,
00279                                 int width, int height,
00280                                 const int* thresholds,
00281                                 const int* hi_values) {
00282   IMAGELINE line;
00283   page_image.create(width, height, 1);
00284   line.init(width);
00285   // For each line in the image, fill the IMAGELINE class and put it into the
00286   // Tesseract global page_image. Note that Tesseract stores images with the
00287   // bottom at y=0 and 0 is black, so we need 2 kinds of inversion.
00288   const UINT8* data = imagedata + top*bytes_per_line + left*bytes_per_pixel;
00289   for (int y = height - 1 ; y >= 0; --y) {
00290     const UINT8* pix = data;
00291     for (int x = 0; x < width; ++x, pix += bytes_per_pixel) {
00292       line.pixels[x] = 1;
00293       for (int ch = 0; ch < bytes_per_pixel; ++ch) {
00294         if (hi_values[ch] >= 0 &&
00295             (pix[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
00296           line.pixels[x] = 0;
00297           break;
00298         }
00299       }
00300     }
00301     page_image.put_line(0, y, width, &line, 0);
00302     data += bytes_per_line;
00303   }
00304 }
00305 
00306 // Cut out the requested rectangle of the binary image to the
00307 // tesseract global image ready for recognition.
00308 void TessBaseAPI::CopyBinaryRect(const UINT8* imagedata,
00309                                  int bytes_per_line,
00310                                  int left, int top,
00311                                  int width, int height) {
00312   // Copy binary image, cutting out the required rectangle.
00313   IMAGE image;
00314   image.capture(const_cast<UINT8*>(imagedata),
00315                 bytes_per_line*8, top + height, 1);
00316   page_image.create(width, height, 1);
00317   copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false);
00318 }
00319 
00320 // Low-level function to recognize the current global image to a string.
00321 char* TessBaseAPI::RecognizeToString() {
00322   BLOCK_LIST    block_list;
00323 
00324   FindLines(&block_list);
00325 
00326   // Now run the main recognition.
00327   PAGE_RES* page_res = Recognize(&block_list, NULL);
00328 
00329   return TesseractToText(page_res);
00330 }
00331 
00332 // Find lines from the image making the BLOCK_LIST.
00333 void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
00334   STRING input_file = "noname.tif";
00335   // The following call creates a full-page block and then runs connected
00336   // component analysis and text line creation.
00337   pgeditor_read_file(input_file, block_list);
00338 }
00339 
00340 // Recognize the tesseract global image and return the result as Tesseract
00341 // internal structures.
00342 PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
00343   if (tessedit_resegment_from_boxes)
00344     apply_boxes(block_list);
00345   if (edit_variables)
00346     start_variables_editor();
00347 
00348   PAGE_RES* page_res = new PAGE_RES(block_list);
00349   if (interactive_mode) {
00350     pgeditor_main(block_list);                  //pgeditor user I/F
00351   } else if (tessedit_train_from_boxes) {
00352     apply_box_training(block_list);
00353   } else {
00354     // Now run the main recognition.
00355     recog_all_words(page_res, monitor);
00356   }
00357   return page_res;
00358 }
00359 
00360 // Make a text string from the internal data structures.
00361 // The input page_res is deleted.
00362 char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
00363   if (page_res != NULL) {
00364     int total_length = 2;
00365     PAGE_RES_IT   page_res_it(page_res);
00366     // Iterate over the data structures to extract the recognition result.
00367     for (page_res_it.restart_page(); page_res_it.word () != NULL;
00368          page_res_it.forward()) {
00369       WERD_RES *word = page_res_it.word();
00370       WERD_CHOICE* choice = word->best_choice;
00371       if (choice != NULL) {
00372         total_length += choice->string().length() + 1;
00373       }
00374     }
00375     char* result = new char[total_length];
00376     char* ptr = result;
00377     for (page_res_it.restart_page(); page_res_it.word () != NULL;
00378          page_res_it.forward()) {
00379       WERD_RES *word = page_res_it.word();
00380       WERD_CHOICE* choice = word->best_choice;
00381       if (choice != NULL) {
00382         strcpy(ptr, choice->string().string());
00383         ptr += strlen(ptr);
00384         if (word->word->flag(W_EOL))
00385           *ptr++ = '\n';
00386         else
00387           *ptr++ = ' ';
00388       }
00389     }
00390     *ptr++ = '\n';
00391     *ptr = '\0';
00392     delete page_res;
00393     return result;
00394   }
00395   return NULL;
00396 }
00397