Tesseract: UNICHAR Class Reference

A UNICHAR may hold a single Unicode character (stored as between 1 and 4 UTF-8 bytes) or multiple Unicode characters representing the NFKC expansion of a ligature such as fi, ffl, etc. These are also stored as UTF-8.

Public Member Functions

Static Public Member Functions

Private Attributes

Constructor & Destructor Documentation

UNICHAR::UNICHAR ( ) [inline]

Definition at line 43 of file unichar.h.

References chars, and UNICHAR_LEN.

00043             {
00044     memset(chars, 0, UNICHAR_LEN);
00045   }

UNICHAR::UNICHAR	(	const char *	utf8_str,
		int	len
	)

Construct a UNICHAR from a UTF-8 string.

If len<0 then the string is null terminated. If the string is too long to fit in the UNICHAR then it takes only what will fit. Checks for illegal input and stops at an illegal sequence.

Note: The resulting UNICHAR may be empty.

Definition at line 33 of file unichar.cpp.

References chars, UNICHAR_LEN, and utf8_step().

00033                                               {
00034   int total_len = 0;
00035   int step = 0;
00036   if (len < 0) {
00037     for (len = 0; utf8_str[len] != 0 && len < UNICHAR_LEN; ++len);
00038   }
00039   for (total_len = 0; total_len < len; total_len += step) {
00040     step = utf8_step(utf8_str + total_len);
00041     if (total_len + step > UNICHAR_LEN)
00042       break;  // Too long.
00043     if (step == 0)
00044       break;  // Illegal first byte.
00045     int i;
00046     for (i = 1; i < step; ++i)
00047       if ((utf8_str[total_len + i] & 0xc0) != 0x80)
00048         break;
00049     if (i < step)
00050       break;  // Illegal surrogate
00051   }
00052   memcpy(chars, utf8_str, total_len);
00053   if (total_len < UNICHAR_LEN) {
00054     chars[UNICHAR_LEN - 1] = total_len;
00055     while (total_len < UNICHAR_LEN - 1)
00056       chars[total_len++] = 0;
00057   }
00058 }

UNICHAR::UNICHAR ( int unicode ) [explicit]

Convert a single UCS-4 character to a UNICHAR (stored as UTF-8).

Illegal values are ignored, resulting in an empty UNICHAR.

Definition at line 65 of file unichar.cpp.

References chars, UNI_MAX_LEGAL_UTF32, and UNICHAR_LEN.

00065                             {
00066   const int bytemask = 0xBF;
00067   const int bytemark = 0x80;
00068 
00069   if (unicode < 0x80) {
00070     chars[UNICHAR_LEN - 1] = 1;
00071     chars[2] = 0;
00072     chars[1] = 0;
00073     chars[0] = static_cast<char>(unicode);
00074   } else if (unicode < 0x800) {
00075     chars[UNICHAR_LEN - 1] = 2;
00076     chars[2] = 0;
00077     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00078     unicode >>= 6;
00079     chars[0] = static_cast<char>(unicode | 0xc0);
00080   } else if (unicode < 0x10000) {
00081     chars[UNICHAR_LEN - 1] = 3;
00082     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
00083     unicode >>= 6;
00084     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00085     unicode >>= 6;
00086     chars[0] = static_cast<char>(unicode | 0xe0);
00087   } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
00088     chars[UNICHAR_LEN - 1] = 4;
00089     chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
00090     unicode >>= 6;
00091     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
00092     unicode >>= 6;
00093     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00094     unicode >>= 6;
00095     chars[0] = static_cast<char>(unicode | 0xf0);
00096   } else {
00097     memset(chars, 0, UNICHAR_LEN);
00098   }
00099 }

Member Function Documentation

int UNICHAR::first_uni ( ) const

Get the first character as UCS-4.

Definition at line 104 of file unichar.cpp.

References chars, and utf8_step().

00104                              {
00105   static const int utf8_offsets[5] = {
00106     0, 0, 0x3080, 0xE2080, 0x3C82080
00107   };
00108   int uni = 0;
00109   int len = utf8_step(chars);
00110   const char* src = chars;
00111 
00112   switch (len) {
00113   default:
00114     break;
00115   case 4:
00116     uni += *src++;
00117     uni <<= 6;
00118   case 3:
00119     uni += *src++;
00120     uni <<= 6;
00121   case 2:
00122     uni += *src++;
00123     uni <<= 6;
00124   case 1:
00125     uni += *src++;
00126   }
00127   uni -= utf8_offsets[len];
00128   return uni;
00129 }

const char* UNICHAR::utf8 ( ) const [inline]

Definition at line 67 of file unichar.h.

References chars.

00067                            {
00068     return chars;
00069   }

int UNICHAR::utf8_len ( ) const [inline]

Definition at line 61 of file unichar.h.

References chars, and UNICHAR_LEN.

Referenced by utf8_str().

00061                        {
00062     int len = chars[UNICHAR_LEN - 1];
00063     return len >=0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
00064   }

int UNICHAR::utf8_step ( const char * utf8_str ) [static]

Get number of bytes in first character of utf8 string.

Definition at line 147 of file unichar.cpp.

Referenced by first_uni(), and UNICHAR().

00147                                            {
00148   static const char utf8_bytes[256] = {
00149     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00150     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00151     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00152     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00153     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00154     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00155     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
00156     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
00157   };
00158 
00159   return utf8_bytes[*utf8_str];
00160 }

char * UNICHAR::utf8_str ( ) const

Get a terminated UTF8 string.

Note: The caller must delete[] the returned string after use.

Definition at line 136 of file unichar.cpp.

References chars, and utf8_len().

00136                               {
00137   int len = utf8_len();
00138   char* str = new char[len + 1];
00139   memcpy(str, chars, len);
00140   str[len] = 0;
00141   return str;
00142 }

Member Data Documentation

char UNICHAR::chars[UNICHAR_LEN] [private]

Definition at line 81 of file unichar.h.

Referenced by first_uni(), UNICHAR(), utf8(), utf8_len(), and utf8_str().