1 /* 2 * Copyright 2004-2010, Haiku, Inc. 3 * Distributed under the terms of the MIT License. 4 */ 5 #ifndef _UTF8_FUNCTIONS_H 6 #define _UTF8_FUNCTIONS_H 7 8 9 #include <SupportDefs.h> 10 11 12 static inline bool 13 IsInsideGlyph(uchar ch) 14 { 15 return (ch & 0xc0) == 0x80; 16 } 17 18 19 static inline uint32 20 UTF8NextCharLenUnsafe(const char *text) 21 { 22 const char *ptr = text; 23 24 do { 25 ptr++; 26 } while (IsInsideGlyph(*ptr)); 27 28 return ptr - text; 29 } 30 31 32 static inline uint32 33 UTF8NextCharLen(const char *text) 34 { 35 if (text == NULL || *text == 0) 36 return 0; 37 38 return UTF8NextCharLenUnsafe(text); 39 } 40 41 42 static inline uint32 43 UTF8NextCharLen(const char *bytes, size_t length) 44 { 45 if (bytes == NULL || length == 0 || bytes[0] == 0) 46 return 0; 47 48 if ((bytes[0] & 0x80) == 0) { 49 // A single ASCII char - or so... 50 return 1; 51 } 52 53 if (IsInsideGlyph(bytes[0])) { 54 // Not a proper multibyte start. 55 return 0; 56 } 57 58 // We already know that we have the upper two bits set due to the above 59 // two checks. 60 uint8 mask = 0x20; 61 size_t bytesExpected = 2; 62 while ((bytes[0] & mask) != 0) { 63 if (mask == 0x02) { 64 // Seven byte char - invalid. 65 return 0; 66 } 67 68 bytesExpected++; 69 mask >>= 1; 70 } 71 72 // There would need to be more bytes to satisfy the char. 73 if (bytesExpected > length) 74 return 0; 75 76 // We already know the first byte is fine, check the rest. 77 for (size_t i = 1; i < bytesExpected; i++) { 78 if (!IsInsideGlyph(bytes[i])) { 79 // The sequence is incomplete. 80 return 0; 81 } 82 } 83 84 // Puh, everything's fine. 85 return bytesExpected; 86 } 87 88 89 static inline uint32 90 UTF8PreviousCharLen(const char *text, const char *limit) 91 { 92 const char *ptr = text; 93 94 if (ptr == NULL || limit == NULL) 95 return 0; 96 97 do { 98 if (ptr == limit) 99 break; 100 ptr--; 101 } while (IsInsideGlyph(*ptr)); 102 103 return text - ptr; 104 } 105 106 107 /*! UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to 108 numChars characters are read. If numChars is a negative value it is ignored 109 and the string is read up to the terminating 0. 110 */ 111 static inline uint32 112 UTF8CountBytes(const char *bytes, int32 numChars) 113 { 114 if (bytes == NULL) 115 return 0; 116 117 if (numChars < 0) 118 numChars = INT_MAX; 119 120 const char *base = bytes; 121 while (bytes[0] != '\0') { 122 if ((bytes[0] & 0xc0) != 0x80) { 123 if (--numChars < 0) 124 break; 125 } 126 bytes++; 127 } 128 129 return bytes - base; 130 } 131 132 133 /*! UTF8CountChars gets the length (in characters) of a UTF8 string. Up to 134 numBytes bytes are read. If numBytes is a negative value it is ignored 135 and the string is read up to the terminating 0. 136 */ 137 static inline uint32 138 UTF8CountChars(const char *bytes, int32 numBytes) 139 { 140 if (bytes == NULL) 141 return 0; 142 143 uint32 length = 0; 144 const char *last; 145 if (numBytes < 0) 146 last = (const char *)SIZE_MAX; 147 else 148 last = bytes + numBytes - 1; 149 150 while (bytes[0] && bytes <= last) { 151 if ((bytes++[0] & 0xc0) != 0x80) 152 length++; 153 } 154 155 return length; 156 } 157 158 159 /*! UTF8ToCharCode converts the input that includes potential multibyte chars 160 to UTF-32 char codes that can be used by FreeType. The string pointer is 161 then advanced to the next character in the string. In case the terminating 162 0 is reached, the string pointer is not advanced anymore and nulls are 163 returned. This makes it safe to overruns and enables streamed processing 164 of UTF8 strings. 165 */ 166 static inline uint32 167 UTF8ToCharCode(const char **bytes) 168 { 169 #define UTF8_SUBSTITUTE_CHARACTER 0xfffd 170 171 uint32 result; 172 if (((*bytes)[0] & 0x80) == 0) { 173 // a single byte character 174 result = (*bytes)[0]; 175 if (result != '\0') { 176 // do not advance beyond the terminating '\0' 177 (*bytes)++; 178 } 179 180 return result; 181 } 182 183 if (((*bytes)[0] & 0xc0) == 0x80) { 184 // not a proper multibyte start 185 (*bytes)++; 186 return UTF8_SUBSTITUTE_CHARACTER; 187 } 188 189 // start of a multibyte character 190 uint8 mask = 0x80; 191 result = (uint32)((*bytes)[0] & 0xff); 192 (*bytes)++; 193 194 while (result & mask) { 195 if (mask == 0x02) { 196 // seven byte char - invalid 197 return UTF8_SUBSTITUTE_CHARACTER; 198 } 199 200 result &= ~mask; 201 mask >>= 1; 202 } 203 204 while (((*bytes)[0] & 0xc0) == 0x80) { 205 result <<= 6; 206 result += (*bytes)[0] & 0x3f; 207 (*bytes)++; 208 209 mask <<= 1; 210 if (mask == 0x40) 211 return result; 212 } 213 214 if (mask == 0x40) 215 return result; 216 217 if ((*bytes)[0] == '\0') { 218 // string terminated within multibyte char 219 return 0x00; 220 } 221 222 // not enough bytes in multibyte char 223 return UTF8_SUBSTITUTE_CHARACTER; 224 225 #undef UTF8_SUBSTITUTE_CHARACTER 226 } 227 228 #endif // _UTF8_FUNCTIONS_H 229