#include <unichar.h>
This may be a single Unicode character (stored as between 1 and 4 utf8 bytes) or multiple Unicode characters representing the NFKC expansion of a ligature such as fi, ffl etc. These are also stored as utf8.
Definition at line 41 of file unichar.h.
UNICHAR::UNICHAR | ( | ) | [inline] |
Definition at line 43 of file unichar.h.
References chars, and UNICHAR_LEN.
00043 { 00044 memset(chars, 0, UNICHAR_LEN); 00045 }
UNICHAR::UNICHAR | ( | const char * | utf8_str, | |
int | len | |||
) |
Convert utf8 string to Unicode.
If len<0 then the string is null terminated. If the string is too long to fit in the UNICHAR then it takes only what will fit. Checks for illegal input and stops at an illegal sequence.
Definition at line 33 of file unichar.cpp.
References chars, UNICHAR_LEN, and utf8_step().
00033 { 00034 int total_len = 0; 00035 int step = 0; 00036 if (len < 0) { 00037 for (len = 0; utf8_str[len] != 0 && len < UNICHAR_LEN; ++len); 00038 } 00039 for (total_len = 0; total_len < len; total_len += step) { 00040 step = utf8_step(utf8_str + total_len); 00041 if (total_len + step > UNICHAR_LEN) 00042 break; // Too long. 00043 if (step == 0) 00044 break; // Illegal first byte. 00045 int i; 00046 for (i = 1; i < step; ++i) 00047 if ((utf8_str[total_len + i] & 0xc0) != 0x80) 00048 break; 00049 if (i < step) 00050 break; // Illegal surrogate 00051 } 00052 memcpy(chars, utf8_str, total_len); 00053 if (total_len < UNICHAR_LEN) { 00054 chars[UNICHAR_LEN - 1] = total_len; 00055 while (total_len < UNICHAR_LEN - 1) 00056 chars[total_len++] = 0; 00057 } 00058 }
UNICHAR::UNICHAR | ( | int | unicode | ) | [explicit] |
Convert a single UCS4 character to Unicode.
Illegal values are ignored, resulting in an empty UNICHAR.
Definition at line 65 of file unichar.cpp.
References chars, UNI_MAX_LEGAL_UTF32, and UNICHAR_LEN.
00065 { 00066 const int bytemask = 0xBF; 00067 const int bytemark = 0x80; 00068 00069 if (unicode < 0x80) { 00070 chars[UNICHAR_LEN - 1] = 1; 00071 chars[2] = 0; 00072 chars[1] = 0; 00073 chars[0] = static_cast<char>(unicode); 00074 } else if (unicode < 0x800) { 00075 chars[UNICHAR_LEN - 1] = 2; 00076 chars[2] = 0; 00077 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00078 unicode >>= 6; 00079 chars[0] = static_cast<char>(unicode | 0xc0); 00080 } else if (unicode < 0x10000) { 00081 chars[UNICHAR_LEN - 1] = 3; 00082 chars[2] = static_cast<char>((unicode | bytemark) & bytemask); 00083 unicode >>= 6; 00084 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00085 unicode >>= 6; 00086 chars[0] = static_cast<char>(unicode | 0xe0); 00087 } else if (unicode <= UNI_MAX_LEGAL_UTF32) { 00088 chars[UNICHAR_LEN - 1] = 4; 00089 chars[3] = static_cast<char>((unicode | bytemark) & bytemask); 00090 unicode >>= 6; 00091 chars[2] = static_cast<char>((unicode | bytemark) & bytemask); 00092 unicode >>= 6; 00093 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00094 unicode >>= 6; 00095 chars[0] = static_cast<char>(unicode | 0xf0); 00096 } else { 00097 memset(chars, 0, UNICHAR_LEN); 00098 } 00099 }
int UNICHAR::first_uni | ( | ) | const |
Get the first character as UCS-4.
Definition at line 104 of file unichar.cpp.
References chars, and utf8_step().
00104 { 00105 static const int utf8_offsets[5] = { 00106 0, 0, 0x3080, 0xE2080, 0x3C82080 00107 }; 00108 int uni = 0; 00109 int len = utf8_step(chars); 00110 const char* src = chars; 00111 00112 switch (len) { 00113 default: 00114 break; 00115 case 4: 00116 uni += *src++; 00117 uni <<= 6; 00118 case 3: 00119 uni += *src++; 00120 uni <<= 6; 00121 case 2: 00122 uni += *src++; 00123 uni <<= 6; 00124 case 1: 00125 uni += *src++; 00126 } 00127 uni -= utf8_offsets[len]; 00128 return uni; 00129 }
const char* UNICHAR::utf8 | ( | ) | const [inline] |
int UNICHAR::utf8_len | ( | ) | const [inline] |
Definition at line 61 of file unichar.h.
References chars, and UNICHAR_LEN.
Referenced by utf8_str().
00061 { 00062 int len = chars[UNICHAR_LEN - 1]; 00063 return len >=0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; 00064 }
int UNICHAR::utf8_step | ( | const char * | utf8_str | ) | [static] |
Get number of bytes in first character of utf8 string.
Definition at line 147 of file unichar.cpp.
Referenced by first_uni(), and UNICHAR().
00147 { 00148 static const char utf8_bytes[256] = { 00149 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00150 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00151 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00152 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00153 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00154 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00155 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 00156 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0 00157 }; 00158 00159 return utf8_bytes[*utf8_str]; 00160 }
char * UNICHAR::utf8_str | ( | ) | const |
Get a terminated UTF8 string.
Definition at line 136 of file unichar.cpp.
References chars, and utf8_len().
00136 { 00137 int len = utf8_len(); 00138 char* str = new char[len + 1]; 00139 memcpy(str, chars, len); 00140 str[len] = 0; 00141 return str; 00142 }
char UNICHAR::chars[UNICHAR_LEN] [private] |
Definition at line 81 of file unichar.h.
Referenced by first_uni(), UNICHAR(), utf8(), utf8_len(), and utf8_str().