1 /* 2 * Copyright 2004-2010, Haiku, Inc. 3 * Distributed under the terms of the MIT License. 4 */ 5 #ifndef _UTF8_FUNCTIONS_H 6 #define _UTF8_FUNCTIONS_H 7 8 9 #include <SupportDefs.h> 10 11 12 static inline bool 13 IsInsideGlyph(uchar ch) 14 { 15 return (ch & 0xc0) == 0x80; 16 } 17 18 19 static inline uint32 20 UTF8NextCharLenUnsafe(const char *text) 21 { 22 const char *ptr = text; 23 24 do { 25 ptr++; 26 } while (IsInsideGlyph(*ptr)); 27 28 return ptr - text; 29 } 30 31 32 static inline uint32 33 UTF8NextCharLen(const char *text) 34 { 35 if (text == NULL || *text == 0) 36 return 0; 37 38 return UTF8NextCharLenUnsafe(text); 39 } 40 41 42 static inline uint32 43 UTF8PreviousCharLen(const char *text, const char *limit) 44 { 45 const char *ptr = text; 46 47 if (ptr == NULL || limit == NULL) 48 return 0; 49 50 do { 51 if (ptr == limit) 52 break; 53 ptr--; 54 } while (IsInsideGlyph(*ptr)); 55 56 return text - ptr; 57 } 58 59 60 /*! UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to 61 numChars characters are read. If numChars is a negative value it is ignored 62 and the string is read up to the terminating 0. 63 */ 64 static inline uint32 65 UTF8CountBytes(const char *bytes, int32 numChars) 66 { 67 if (bytes == NULL) 68 return 0; 69 70 if (numChars < 0) 71 numChars = INT_MAX; 72 73 const char *base = bytes; 74 while (bytes[0] != '\0') { 75 if ((bytes[0] & 0xc0) != 0x80) { 76 if (--numChars < 0) 77 break; 78 } 79 bytes++; 80 } 81 82 return bytes - base; 83 } 84 85 86 /*! UTF8CountChars gets the length (in characters) of a UTF8 string. Up to 87 numBytes bytes are read. If numBytes is a negative value it is ignored 88 and the string is read up to the terminating 0. 89 */ 90 static inline uint32 91 UTF8CountChars(const char *bytes, int32 numBytes) 92 { 93 if (bytes == NULL) 94 return 0; 95 96 uint32 length = 0; 97 const char *last; 98 if (numBytes < 0) 99 last = (const char *)SIZE_MAX; 100 else 101 last = bytes + numBytes - 1; 102 103 while (bytes[0] && bytes <= last) { 104 if ((bytes++[0] & 0xc0) != 0x80) 105 length++; 106 } 107 108 return length; 109 } 110 111 112 /*! UTF8ToCharCode converts the input that includes potential multibyte chars 113 to UTF-32 char codes that can be used by FreeType. The string pointer is 114 then advanced to the next character in the string. In case the terminating 115 0 is reached, the string pointer is not advanced anymore and nulls are 116 returned. This makes it safe to overruns and enables streamed processing 117 of UTF8 strings. 118 */ 119 static inline uint32 120 UTF8ToCharCode(const char **bytes) 121 { 122 #define UTF8_SUBSTITUTE_CHARACTER 0xfffd 123 124 uint32 result; 125 if (((*bytes)[0] & 0x80) == 0) { 126 // a single byte character 127 result = (*bytes)[0]; 128 if (result != '\0') { 129 // do not advance beyond the terminating '\0' 130 (*bytes)++; 131 } 132 133 return result; 134 } 135 136 if (((*bytes)[0] & 0xc0) == 0x80) { 137 // not a proper multibyte start 138 (*bytes)++; 139 return UTF8_SUBSTITUTE_CHARACTER; 140 } 141 142 // start of a multibyte character 143 uint8 mask = 0x80; 144 result = (uint32)((*bytes)[0] & 0xff); 145 (*bytes)++; 146 147 while (result & mask) { 148 if (mask == 0x02) { 149 // seven byte char - invalid 150 return UTF8_SUBSTITUTE_CHARACTER; 151 } 152 153 result &= ~mask; 154 mask >>= 1; 155 } 156 157 while (((*bytes)[0] & 0xc0) == 0x80) { 158 result <<= 6; 159 result += (*bytes)[0] & 0x3f; 160 (*bytes)++; 161 162 mask <<= 1; 163 if (mask == 0x40) 164 return result; 165 } 166 167 if (mask == 0x40) 168 return result; 169 170 if ((*bytes)[0] == '\0') { 171 // string terminated within multibyte char 172 return 0x00; 173 } 174 175 // not enough bytes in multibyte char 176 return UTF8_SUBSTITUTE_CHARACTER; 177 178 #undef UTF8_SUBSTITUTE_CHARACTER 179 } 180 181 #endif // _UTF8_FUNCTIONS_H 182