1 /*
2 * Copyright 2004-2010, Haiku, Inc.
3 * Distributed under the terms of the MIT License.
4 */
5 #ifndef _UTF8_FUNCTIONS_H
6 #define _UTF8_FUNCTIONS_H
7
8
9 #include <SupportDefs.h>
10
11
12 static inline bool
IsInsideGlyph(uchar ch)13 IsInsideGlyph(uchar ch)
14 {
15 return (ch & 0xc0) == 0x80;
16 }
17
18
19 static inline uint32
UTF8NextCharLenUnsafe(const char * text)20 UTF8NextCharLenUnsafe(const char *text)
21 {
22 const char *ptr = text;
23
24 do {
25 ptr++;
26 } while (IsInsideGlyph(*ptr));
27
28 return ptr - text;
29 }
30
31
32 static inline uint32
UTF8NextCharLen(const char * text)33 UTF8NextCharLen(const char *text)
34 {
35 if (text == NULL || *text == 0)
36 return 0;
37
38 return UTF8NextCharLenUnsafe(text);
39 }
40
41
42 static inline uint32
UTF8NextCharLen(const char * bytes,size_t length)43 UTF8NextCharLen(const char *bytes, size_t length)
44 {
45 if (bytes == NULL || length == 0 || bytes[0] == 0)
46 return 0;
47
48 if ((bytes[0] & 0x80) == 0) {
49 // A single ASCII char - or so...
50 return 1;
51 }
52
53 if (IsInsideGlyph(bytes[0])) {
54 // Not a proper multibyte start.
55 return 0;
56 }
57
58 // We already know that we have the upper two bits set due to the above
59 // two checks.
60 uint8 mask = 0x20;
61 size_t bytesExpected = 2;
62 while ((bytes[0] & mask) != 0) {
63 if (mask == 0x02) {
64 // Seven byte char - invalid.
65 return 0;
66 }
67
68 bytesExpected++;
69 mask >>= 1;
70 }
71
72 // There would need to be more bytes to satisfy the char.
73 if (bytesExpected > length)
74 return 0;
75
76 // We already know the first byte is fine, check the rest.
77 for (size_t i = 1; i < bytesExpected; i++) {
78 if (!IsInsideGlyph(bytes[i])) {
79 // The sequence is incomplete.
80 return 0;
81 }
82 }
83
84 // Puh, everything's fine.
85 return bytesExpected;
86 }
87
88
89 static inline uint32
UTF8PreviousCharLen(const char * text,const char * limit)90 UTF8PreviousCharLen(const char *text, const char *limit)
91 {
92 const char *ptr = text;
93
94 if (ptr == NULL || limit == NULL)
95 return 0;
96
97 do {
98 if (ptr == limit)
99 break;
100 ptr--;
101 } while (IsInsideGlyph(*ptr));
102
103 return text - ptr;
104 }
105
106
107 /*! UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
108 numChars characters are read. If numChars is a negative value it is ignored
109 and the string is read up to the terminating 0.
110 */
111 static inline uint32
UTF8CountBytes(const char * bytes,int32 numChars)112 UTF8CountBytes(const char *bytes, int32 numChars)
113 {
114 if (bytes == NULL)
115 return 0;
116
117 if (numChars < 0)
118 numChars = INT_MAX;
119
120 const char *base = bytes;
121 while (bytes[0] != '\0') {
122 if ((bytes[0] & 0xc0) != 0x80) {
123 if (--numChars < 0)
124 break;
125 }
126 bytes++;
127 }
128
129 return bytes - base;
130 }
131
132
133 /*! UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
134 numBytes bytes are read. If numBytes is a negative value it is ignored
135 and the string is read up to the terminating 0.
136 */
137 static inline uint32
UTF8CountChars(const char * bytes,int32 numBytes)138 UTF8CountChars(const char *bytes, int32 numBytes)
139 {
140 if (bytes == NULL)
141 return 0;
142
143 uint32 length = 0;
144 if (numBytes < 0) {
145 while (bytes[0]) {
146 if ((bytes++[0] & 0xc0) != 0x80)
147 length++;
148 }
149 } else {
150 const char *last = bytes + numBytes - 1;
151 while (bytes[0] && bytes <= last) {
152 if ((bytes++[0] & 0xc0) != 0x80)
153 length++;
154 }
155 }
156
157 return length;
158 }
159
160
161 /*! UTF8ToCharCode converts the input that includes potential multibyte chars
162 to UTF-32 char codes that can be used by FreeType. The string pointer is
163 then advanced to the next character in the string. In case the terminating
164 0 is reached, the string pointer is not advanced anymore and nulls are
165 returned. This makes it safe to overruns and enables streamed processing
166 of UTF8 strings.
167 */
168 static inline uint32
UTF8ToCharCode(const char ** bytes)169 UTF8ToCharCode(const char **bytes)
170 {
171 #define UTF8_SUBSTITUTE_CHARACTER 0xfffd
172
173 uint32 result;
174 if (((*bytes)[0] & 0x80) == 0) {
175 // a single byte character
176 result = (*bytes)[0];
177 if (result != '\0') {
178 // do not advance beyond the terminating '\0'
179 (*bytes)++;
180 }
181
182 return result;
183 }
184
185 if (((*bytes)[0] & 0xc0) == 0x80) {
186 // not a proper multibyte start
187 (*bytes)++;
188 return UTF8_SUBSTITUTE_CHARACTER;
189 }
190
191 // start of a multibyte character
192 uint8 mask = 0x80;
193 result = (uint32)((*bytes)[0] & 0xff);
194 (*bytes)++;
195
196 while (result & mask) {
197 if (mask == 0x02) {
198 // seven byte char - invalid
199 return UTF8_SUBSTITUTE_CHARACTER;
200 }
201
202 result &= ~mask;
203 mask >>= 1;
204 }
205
206 while (((*bytes)[0] & 0xc0) == 0x80) {
207 result <<= 6;
208 result += (*bytes)[0] & 0x3f;
209 (*bytes)++;
210
211 mask <<= 1;
212 if (mask == 0x40)
213 return result;
214 }
215
216 if (mask == 0x40)
217 return result;
218
219 if ((*bytes)[0] == '\0') {
220 // string terminated within multibyte char
221 return 0x00;
222 }
223
224 // not enough bytes in multibyte char
225 return UTF8_SUBSTITUTE_CHARACTER;
226
227 #undef UTF8_SUBSTITUTE_CHARACTER
228 }
229
230 #endif // _UTF8_FUNCTIONS_H
231