1 /* 2 * Copyright 2004-2006, Haiku, Inc. 3 * Distributed under the terms of the MIT License. 4 */ 5 #ifndef _UTF8_FUNCTIONS_H 6 #define _UTF8_FUNCTIONS_H 7 8 9 #include <SupportDefs.h> 10 11 12 static inline bool 13 IsInsideGlyph(uchar ch) 14 { 15 return (ch & 0xc0) == 0x80; 16 } 17 18 19 static inline uint32 20 UTF8NextCharLenUnsafe(const char *text) 21 { 22 const char *ptr = text; 23 24 do { 25 ptr++; 26 } while (IsInsideGlyph(*ptr)); 27 28 return ptr - text; 29 } 30 31 32 static inline uint32 33 UTF8NextCharLen(const char *text) 34 { 35 if (text == NULL || *text == 0) 36 return 0; 37 38 return UTF8NextCharLenUnsafe(text); 39 } 40 41 42 static inline uint32 43 UTF8PreviousCharLen(const char *text, const char *limit) 44 { 45 const char *ptr = text; 46 47 if (ptr == NULL || limit == NULL) 48 return 0; 49 50 do { 51 if (ptr == limit) 52 break; 53 ptr--; 54 } while (IsInsideGlyph(*ptr)); 55 56 return text - ptr; 57 } 58 59 60 /*! UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to 61 numChars characters are read. If numChars is a negative value it is ignored 62 and the string is read up to the terminating 0. 63 */ 64 static inline uint32 65 UTF8CountBytes(const char *bytes, int32 numChars) 66 { 67 if (!bytes) 68 return 0; 69 70 if (numChars < 0) 71 numChars = INT_MAX; 72 73 const char *base = bytes; 74 while (*bytes && numChars-- > 0) { 75 if (bytes[0] & 0x80) { 76 if (bytes[0] & 0x40) { 77 if (bytes[0] & 0x20) { 78 if (bytes[0] & 0x10) { 79 if (bytes[1] == 0 || bytes[2] == 0 || bytes[3] == 0) 80 return (bytes - base); 81 82 bytes += 4; 83 continue; 84 } 85 86 if (bytes[1] == 0 || bytes[2] == 0) 87 return (bytes - base); 88 89 bytes += 3; 90 continue; 91 } 92 93 if (bytes[1] == 0) 94 return (bytes - base); 95 96 bytes += 2; 97 continue; 98 } 99 100 /* Not a startbyte - skip */ 101 bytes += 1; 102 continue; 103 } 104 105 bytes += 1; 106 } 107 108 return (bytes - base); 109 } 110 111 112 /*! UTF8CountChars gets the length (in characters) of a UTF8 string. Up to 113 numBytes bytes are read. If numBytes is a negative value it is ignored 114 and the string is read up to the terminating 0. 115 */ 116 static inline uint32 117 UTF8CountChars(const char *bytes, int32 numBytes) 118 { 119 if (!bytes) 120 return 0; 121 122 uint32 length = 0; 123 const char *last = bytes + numBytes - 1; 124 if (numBytes < 0) 125 last = (const char *)UINT_MAX; 126 127 while (*bytes && bytes <= last) { 128 if (bytes[0] & 0x80) { 129 if (bytes[0] & 0x40) { 130 if (bytes[0] & 0x20) { 131 if (bytes[0] & 0x10) { 132 if (bytes[1] == 0 || bytes[2] == 0 || bytes[3] == 0) 133 return length; 134 135 bytes += 4; 136 length++; 137 continue; 138 } 139 140 if (bytes[1] == 0 || bytes[2] == 0) 141 return length; 142 143 bytes += 3; 144 length++; 145 continue; 146 } 147 148 if (bytes[1] == 0) 149 return length; 150 151 bytes += 2; 152 length++; 153 continue; 154 } 155 156 /* Not a startbyte - skip */ 157 bytes += 1; 158 continue; 159 } 160 161 bytes += 1; 162 length++; 163 } 164 165 return length; 166 } 167 168 169 /*! UTF8ToCharCode converts the input that includes potential multibyte chars 170 to UTF-32 char codes that can be used by FreeType. The string pointer is 171 then advanced to the next character in the string. In case the terminating 172 0 is reached, the string pointer is not advanced anymore and spaces are 173 returned. This makes it safe to overruns and enables streamed processing 174 of UTF8 strings. 175 */ 176 static inline uint32 177 UTF8ToCharCode(const char **bytes) 178 { 179 register uint32 result = 0; 180 181 if ((*bytes)[0] & 0x80) { 182 if ((*bytes)[0] & 0x40) { 183 if ((*bytes)[0] & 0x20) { 184 if ((*bytes)[0] & 0x10) { 185 if ((*bytes)[0] & 0x08) { 186 /* A five byte char?! 187 Something's wrong, substitute. */ 188 result += 0x20; 189 (*bytes)++; 190 return result; 191 } 192 193 if ((*bytes)[1] == 0 || (*bytes)[2] == 0 || (*bytes)[3] == 0) 194 return 0x00; 195 196 /* A four byte char */ 197 result += (*bytes)[0] & 0x07; 198 result <<= 6; 199 result += (*bytes)[1] & 0x3f; 200 result <<= 6; 201 result += (*bytes)[2] & 0x3f; 202 result <<= 6; 203 result += (*bytes)[3] & 0x3f; 204 (*bytes) += 4; 205 return result; 206 } 207 208 if ((*bytes)[1] == 0 || (*bytes)[2] == 0) 209 return 0x00; 210 211 /* A three byte char */ 212 result += (*bytes)[0] & 0x0f; 213 result <<= 6; 214 result += (*bytes)[1] & 0x3f; 215 result <<= 6; 216 result += (*bytes)[2] & 0x3f; 217 (*bytes) += 3; 218 return result; 219 } 220 221 if ((*bytes)[1] == 0) 222 return 0x00; 223 224 /* A two byte char */ 225 result += (*bytes)[0] & 0x1f; 226 result <<= 6; 227 result += (*bytes)[1] & 0x3f; 228 (*bytes) += 2; 229 return result; 230 } 231 232 /* This (10) is not a startbyte. 233 Substitute with a space. */ 234 result += 0x20; 235 (*bytes)++; 236 return result; 237 } 238 239 if ((*bytes)[0] == 0) { 240 /* We do not advance beyond the terminating 0. */ 241 return 0x00; 242 } 243 244 result += (*bytes)[0]; 245 (*bytes)++; 246 return result; 247 } 248 249 #endif // _UTF8_FUNCTIONS_H 250