#ifndef THIRD_PARTY_TESSERACT_CCUTIL_UNICHAR_H__
#define THIRD_PARTY_TESSERACT_CCUTIL_UNICHAR_H__

#include <memory.h>
#include <string.h>  // memset

// Number of bytes in the fixed-size buffer held by a UNICHAR.
#define UNICHAR_LEN 4

// A short UTF-8 byte sequence stored in a fixed buffer of UNICHAR_LEN bytes.
//
// The buffer is NOT null terminated. When the stored sequence is shorter than
// UNICHAR_LEN bytes, the last byte of the buffer holds the sequence length;
// when the sequence fills the buffer, the last byte is ordinary data.
class UNICHAR {
 public:
  // Construct an empty UNICHAR: every byte is zero, so the length byte is
  // zero as well and utf8_len() returns 0.
  UNICHAR() {
    // sizeof(chars) keeps the cleared size tied to the buffer itself.
    memset(chars, 0, sizeof(chars));
  }

  // Construct from a utf8 string. If len<0 then the string is null terminated.
  // If the string is too long to fit in the UNICHAR then it takes only what
  // will fit.
  UNICHAR(const char* utf8_str, int len);

  // Construct from a single UCS4 character.
  explicit UNICHAR(int unicode);

  // Default copy constructor and operator= are OK.

  // Get the first character as UCS-4.
  int first_uni() const;

  // Get the length of the UTF8 string, in bytes.
  //
  // The last byte of the buffer is read as a length when it lies in
  // [0, UNICHAR_LEN); any other value means the buffer is completely full
  // and the result is UNICHAR_LEN.
  int utf8_len() const {
    // Plain char may be signed or unsigned. The explicit >= 0 test makes a
    // byte with the high bit set count as data (a full buffer) either way.
    int len = chars[UNICHAR_LEN - 1];
    return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
  }

  // Get a UTF8 string, but NOT NULL terminated. The pointer refers to this
  // object's internal buffer; use utf8_len() for the number of valid bytes.
  const char* utf8() const {
    return chars;
  }

  // Get a terminated UTF8 string: Must delete[] it after use.
  char* utf8_str() const;

  // Get the number of bytes in the first character of the given utf8 string.
  static int utf8_step(const char* utf8_str);

 private:
  // A UTF-8 representation of 1 or more Unicode characters.
  // The last element (chars[UNICHAR_LEN - 1]) is a length if
  // its value < UNICHAR_LEN, otherwise it is a genuine character.
  char chars[UNICHAR_LEN];
};

#endif  // THIRD_PARTY_TESSERACT_CCUTIL_UNICHAR_H__