00001 00002 // File: baseapi.h 00003 // Description: Simple API for calling tesseract. 00004 // Author: Ray Smith 00005 // Created: Fri Oct 06 15:35:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__ 00021 #define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__ 00022 00023 #include <string> 00024 00025 #include "host.h" 00026 #include "ocrclass.h" 00027 00028 class PAGE_RES; 00029 class BLOCK_LIST; 00030 00031 // Base class for all tesseract APIs. 00032 // Specific classes can add ability to work on different inputs or produce 00033 // different outputs. 00034 00035 class TessBaseAPI { 00036 public: 00037 // Start tesseract. 00038 // The datapath must be the name of the data directory or some other file 00039 // in which the data directory resides (for instance argv[0].) 00040 // The configfile is the name of a file in the tessconfigs directory 00041 // (eg batch) or NULL to run on defaults. 00042 // Outputbase may also be NULL, and is the basename of various output files. 00043 // If the output of any of these files is enabled, then a name must be given. 00044 // If numeric_mode is true, only possible digits and roman numbers are 00045 // returned. Returns 0 if successful. Crashes if not. 00046 // The argc and argv may be 0 and NULL respectively. They are used for 00047 // providing config files for debug/display purposes. 00048 // TODO(rays) get the facts straight. Is it OK to call 00049 // it more than once? Make it properly check for errors and return them. 00050 static int Init(const char* datapath, const char* outputbase, 00051 const char* configfile, bool numeric_mode, 00052 int argc, char* argv[]); 00053 00054 // Recognize a rectangle from an image and return the result as a string. 00055 // May be called many times for a single Init. 00056 // Currently has no error checking. 00057 // Greyscale of 8 and color of 24 or 32 bits per pixel may be given. 00058 // Palette color images will not work properly and must be converted to 00059 // 24 bit. 00060 // Binary images of 1 bit per pixel may also be given but they must be 00061 // byte packed with the MSB of the first byte being the first pixel, and a 00062 // 1 represents WHITE. For binary images set bytes_per_pixel=0. 00063 // The recognized text is returned as a char* which (in future will be coded 00064 // as UTF8 and) must be freed with the delete [] operator. 00065 static char* TesseractRect(const UINT8* imagedata, 00066 int bytes_per_pixel, 00067 int bytes_per_line, 00068 int left, int top, int width, int height); 00069 00070 // Call between pages or documents etc to free up memory and forget 00071 // adaptive data. 00072 static void ClearAdaptiveClassifier(); 00073 00074 // Close down tesseract and free up memory. 00075 static void End(); 00076 00077 // Dump the internal binary image to a PGM file. 00078 static void DumpPGM(const char* filename); 00079 00080 protected: 00081 // Copy the given image rectangle to Tesseract, with adaptive thresholding 00082 // if the image is not already binary. 00083 static void CopyImageToTesseract(const UINT8* imagedata, 00084 int bytes_per_pixel, 00085 int bytes_per_line, 00086 int left, int top, int width, int height); 00087 00088 // Compute the Otsu threshold(s) for the given image rectangle, making one 00089 // for each channel. Each channel is always one byte per pixel. 00090 // Returns an array of threshold values and an array of hi_values, such 00091 // that a pixel value >threshold[channel] is considered foreground if 00092 // hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates 00093 // that there is no apparent foreground. At least one hi_value will not be -1. 00094 // thresholds and hi_values are assumed to be of bytes_per_pixel size. 00095 static void OtsuThreshold(const UINT8* imagedata, 00096 int bytes_per_pixel, 00097 int bytes_per_line, 00098 int left, int top, int right, int bottom, 00099 int* thresholds, 00100 int* hi_values); 00101 00102 // Compute the histogram for the given image rectangle, and the given 00103 // channel. (Channel pointed to by imagedata.) Each channel is always 00104 // one byte per pixel. 00105 // Bytes per pixel is used to skip channels not being 00106 // counted with this call in a multi-channel (pixel-major) image. 00107 // Histogram is always a 256 element array to count occurrences of 00108 // each pixel value. 00109 static void HistogramRect(const UINT8* imagedata, 00110 int bytes_per_pixel, 00111 int bytes_per_line, 00112 int left, int top, int right, int bottom, 00113 int* histogram); 00114 00115 // Compute the Otsu threshold(s) for the given histogram. 00116 // Also returns H = total count in histogram, and 00117 // omega0 = count of histogram below threshold. 00118 static int OtsuStats(const int* histogram, 00119 int* H_out, 00120 int* omega0_out); 00121 00122 // Threshold the given grey or color image into the tesseract global 00123 // image ready for recognition. Requires thresholds and hi_value 00124 // produced by OtsuThreshold above. 00125 static void ThresholdRect(const UINT8* imagedata, 00126 int bytes_per_pixel, 00127 int bytes_per_line, 00128 int left, int top, 00129 int width, int height, 00130 const int* thresholds, 00131 const int* hi_values); 00132 00133 // Cut out the requested rectangle of the binary image to the 00134 // tesseract global image ready for recognition. 00135 static void CopyBinaryRect(const UINT8* imagedata, 00136 int bytes_per_line, 00137 int left, int top, 00138 int width, int height); 00139 00140 // Low-level function to recognize the current global image to a string. 00141 static char* RecognizeToString(); 00142 00143 // Find lines from the image making the BLOCK_LIST. 00144 static void FindLines(BLOCK_LIST* block_list); 00145 00146 // Recognize the tesseract global image and return the result as Tesseract 00147 // internal structures. 00148 static PAGE_RES* Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor); 00149 00150 // Convert (and free) the internal data structures into a text string. 00151 static char* TesseractToText(PAGE_RES* page_res); 00152 }; 00153 00154 #endif // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__