Tesseract: ccutil/unichar.cpp Source File

00001 
00020 #include "unichar.h"
00021 
00022 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF  // Largest code point UNICHAR(int) will encode (U+10FFFF).
00023 
00033 UNICHAR::UNICHAR(const char* utf8_str, int len) {
00034   int total_len = 0;
00035   int step = 0;
00036   if (len < 0) {
00037     for (len = 0; utf8_str[len] != 0 && len < UNICHAR_LEN; ++len);
00038   }
00039   for (total_len = 0; total_len < len; total_len += step) {
00040     step = utf8_step(utf8_str + total_len);
00041     if (total_len + step > UNICHAR_LEN)
00042       break;  // Too long.
00043     if (step == 0)
00044       break;  // Illegal first byte.
00045     int i;
00046     for (i = 1; i < step; ++i)
00047       if ((utf8_str[total_len + i] & 0xc0) != 0x80)
00048         break;
00049     if (i < step)
00050       break;  // Illegal surrogate
00051   }
00052   memcpy(chars, utf8_str, total_len);
00053   if (total_len < UNICHAR_LEN) {
00054     chars[UNICHAR_LEN - 1] = total_len;
00055     while (total_len < UNICHAR_LEN - 1)
00056       chars[total_len++] = 0;
00057   }
00058 }
00059 
00065 UNICHAR::UNICHAR(int unicode) {
00066   const int bytemask = 0xBF;
00067   const int bytemark = 0x80;
00068 
00069   if (unicode < 0x80) {
00070     chars[UNICHAR_LEN - 1] = 1;
00071     chars[2] = 0;
00072     chars[1] = 0;
00073     chars[0] = static_cast<char>(unicode);
00074   } else if (unicode < 0x800) {
00075     chars[UNICHAR_LEN - 1] = 2;
00076     chars[2] = 0;
00077     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00078     unicode >>= 6;
00079     chars[0] = static_cast<char>(unicode | 0xc0);
00080   } else if (unicode < 0x10000) {
00081     chars[UNICHAR_LEN - 1] = 3;
00082     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
00083     unicode >>= 6;
00084     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00085     unicode >>= 6;
00086     chars[0] = static_cast<char>(unicode | 0xe0);
00087   } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
00088     chars[UNICHAR_LEN - 1] = 4;
00089     chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
00090     unicode >>= 6;
00091     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
00092     unicode >>= 6;
00093     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00094     unicode >>= 6;
00095     chars[0] = static_cast<char>(unicode | 0xf0);
00096   } else {
00097     memset(chars, 0, UNICHAR_LEN);
00098   }
00099 }
00100 
00104 int UNICHAR::first_uni() const {
00105   static const int utf8_offsets[5] = {
00106     0, 0, 0x3080, 0xE2080, 0x3C82080
00107   };
00108   int uni = 0;
00109   int len = utf8_step(chars);
00110   const char* src = chars;
00111 
00112   switch (len) {
00113   default:
00114     break;
00115   case 4:
00116     uni += *src++;
00117     uni <<= 6;
00118   case 3:
00119     uni += *src++;
00120     uni <<= 6;
00121   case 2:
00122     uni += *src++;
00123     uni <<= 6;
00124   case 1:
00125     uni += *src++;
00126   }
00127   uni -= utf8_offsets[len];
00128   return uni;
00129 }
00130 
00136 char* UNICHAR::utf8_str() const {
00137   int len = utf8_len();
00138   char* str = new char[len + 1];
00139   memcpy(str, chars, len);
00140   str[len] = 0;
00141   return str;
00142 }
00143 
00147 int UNICHAR::utf8_step(const char* utf8_str) {
00148   static const char utf8_bytes[256] = {
00149     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00150     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00151     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00152     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00153     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00154     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00155     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
00156     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
00157   };
00158 
00159   return utf8_bytes[*utf8_str];
00160 }