1 /* 2 * Copyright 2004-2010, Haiku, Inc. 3 * Distributed under the terms of the MIT License. 4 */ 5 #ifndef _UTF8_FUNCTIONS_H 6 #define _UTF8_FUNCTIONS_H 7 8 9 #include <SupportDefs.h> 10 11 12 static inline bool 13 IsInsideGlyph(uchar ch) 14 { 15 return (ch & 0xc0) == 0x80; 16 } 17 18 19 static inline uint32 20 UTF8NextCharLenUnsafe(const char *text) 21 { 22 const char *ptr = text; 23 24 do { 25 ptr++; 26 } while (IsInsideGlyph(*ptr)); 27 28 return ptr - text; 29 } 30 31 32 static inline uint32 33 UTF8NextCharLen(const char *text) 34 { 35 if (text == NULL || *text == 0) 36 return 0; 37 38 return UTF8NextCharLenUnsafe(text); 39 } 40 41 42 static inline uint32 43 UTF8NextCharLen(const char *bytes, size_t length) 44 { 45 if (bytes == NULL || length == 0 || bytes[0] == 0) 46 return 0; 47 48 if ((bytes[0] & 0x80) == 0) { 49 // A single ASCII char - or so... 50 return 1; 51 } 52 53 if (IsInsideGlyph(bytes[0])) { 54 // Not a proper multibyte start. 55 return 0; 56 } 57 58 // We already know that we have the upper two bits set due to the above 59 // two checks. 60 uint8 mask = 0x20; 61 size_t bytesExpected = 2; 62 while ((bytes[0] & mask) != 0) { 63 if (mask == 0x02) { 64 // Seven byte char - invalid. 65 return 0; 66 } 67 68 bytesExpected++; 69 mask >>= 1; 70 } 71 72 // There would need to be more bytes to satisfy the char. 73 if (bytesExpected > length) 74 return 0; 75 76 // We already know the first byte is fine, check the rest. 77 for (size_t i = 1; i < bytesExpected; i++) { 78 if (!IsInsideGlyph(bytes[i])) { 79 // The sequence is incomplete. 80 return 0; 81 } 82 } 83 84 // Puh, everything's fine. 85 return bytesExpected; 86 } 87 88 89 static inline uint32 90 UTF8PreviousCharLen(const char *text, const char *limit) 91 { 92 const char *ptr = text; 93 94 if (ptr == NULL || limit == NULL) 95 return 0; 96 97 do { 98 if (ptr == limit) 99 break; 100 ptr--; 101 } while (IsInsideGlyph(*ptr)); 102 103 return text - ptr; 104 } 105 106 107 /*! UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to 108 numChars characters are read. If numChars is a negative value it is ignored 109 and the string is read up to the terminating 0. 110 */ 111 static inline uint32 112 UTF8CountBytes(const char *bytes, int32 numChars) 113 { 114 if (bytes == NULL) 115 return 0; 116 117 if (numChars < 0) 118 numChars = INT_MAX; 119 120 const char *base = bytes; 121 while (bytes[0] != '\0') { 122 if ((bytes[0] & 0xc0) != 0x80) { 123 if (--numChars < 0) 124 break; 125 } 126 bytes++; 127 } 128 129 return bytes - base; 130 } 131 132 133 /*! UTF8CountChars gets the length (in characters) of a UTF8 string. Up to 134 numBytes bytes are read. If numBytes is a negative value it is ignored 135 and the string is read up to the terminating 0. 136 */ 137 static inline uint32 138 UTF8CountChars(const char *bytes, int32 numBytes) 139 { 140 if (bytes == NULL) 141 return 0; 142 143 uint32 length = 0; 144 if (numBytes < 0) { 145 while (bytes[0]) { 146 if ((bytes++[0] & 0xc0) != 0x80) 147 length++; 148 } 149 } else { 150 const char *last = bytes + numBytes - 1; 151 while (bytes[0] && bytes <= last) { 152 if ((bytes++[0] & 0xc0) != 0x80) 153 length++; 154 } 155 } 156 157 return length; 158 } 159 160 161 /*! UTF8ToCharCode converts the input that includes potential multibyte chars 162 to UTF-32 char codes that can be used by FreeType. The string pointer is 163 then advanced to the next character in the string. In case the terminating 164 0 is reached, the string pointer is not advanced anymore and nulls are 165 returned. This makes it safe to overruns and enables streamed processing 166 of UTF8 strings. 167 */ 168 static inline uint32 169 UTF8ToCharCode(const char **bytes) 170 { 171 #define UTF8_SUBSTITUTE_CHARACTER 0xfffd 172 173 uint32 result; 174 if (((*bytes)[0] & 0x80) == 0) { 175 // a single byte character 176 result = (*bytes)[0]; 177 if (result != '\0') { 178 // do not advance beyond the terminating '\0' 179 (*bytes)++; 180 } 181 182 return result; 183 } 184 185 if (((*bytes)[0] & 0xc0) == 0x80) { 186 // not a proper multibyte start 187 (*bytes)++; 188 return UTF8_SUBSTITUTE_CHARACTER; 189 } 190 191 // start of a multibyte character 192 uint8 mask = 0x80; 193 result = (uint32)((*bytes)[0] & 0xff); 194 (*bytes)++; 195 196 while (result & mask) { 197 if (mask == 0x02) { 198 // seven byte char - invalid 199 return UTF8_SUBSTITUTE_CHARACTER; 200 } 201 202 result &= ~mask; 203 mask >>= 1; 204 } 205 206 while (((*bytes)[0] & 0xc0) == 0x80) { 207 result <<= 6; 208 result += (*bytes)[0] & 0x3f; 209 (*bytes)++; 210 211 mask <<= 1; 212 if (mask == 0x40) 213 return result; 214 } 215 216 if (mask == 0x40) 217 return result; 218 219 if ((*bytes)[0] == '\0') { 220 // string terminated within multibyte char 221 return 0x00; 222 } 223 224 // not enough bytes in multibyte char 225 return UTF8_SUBSTITUTE_CHARACTER; 226 227 #undef UTF8_SUBSTITUTE_CHARACTER 228 } 229 230 #endif // _UTF8_FUNCTIONS_H 231