xref: /haiku/headers/private/interface/utf8_functions.h (revision 647b5a29e9ff9b30fc4292c3c8b96124b1b43d69)
1 /*
2  * Copyright 2004-2010, Haiku, Inc.
3  * Distributed under the terms of the MIT License.
4  */
5 #ifndef _UTF8_FUNCTIONS_H
6 #define _UTF8_FUNCTIONS_H
7 
8 
9 #include <SupportDefs.h>
10 
11 
12 static inline bool
IsInsideGlyph(uchar ch)13 IsInsideGlyph(uchar ch)
14 {
15 	return (ch & 0xc0) == 0x80;
16 }
17 
18 
19 static inline uint32
UTF8NextCharLenUnsafe(const char * text)20 UTF8NextCharLenUnsafe(const char *text)
21 {
22 	const char *ptr = text;
23 
24 	do {
25 		ptr++;
26 	} while (IsInsideGlyph(*ptr));
27 
28 	return ptr - text;
29 }
30 
31 
32 static inline uint32
UTF8NextCharLen(const char * text)33 UTF8NextCharLen(const char *text)
34 {
35 	if (text == NULL || *text == 0)
36 		return 0;
37 
38 	return UTF8NextCharLenUnsafe(text);
39 }
40 
41 
42 static inline uint32
UTF8NextCharLen(const char * bytes,size_t length)43 UTF8NextCharLen(const char *bytes, size_t length)
44 {
45 	if (bytes == NULL || length == 0 || bytes[0] == 0)
46 		return 0;
47 
48 	if ((bytes[0] & 0x80) == 0) {
49 		// A single ASCII char - or so...
50 		return 1;
51 	}
52 
53 	if (IsInsideGlyph(bytes[0])) {
54 		// Not a proper multibyte start.
55 		return 0;
56 	}
57 
58 	// We already know that we have the upper two bits set due to the above
59 	// two checks.
60 	uint8 mask = 0x20;
61 	size_t bytesExpected = 2;
62 	while ((bytes[0] & mask) != 0) {
63 		if (mask == 0x02) {
64 			// Seven byte char - invalid.
65 			return 0;
66 		}
67 
68 		bytesExpected++;
69 		mask >>= 1;
70 	}
71 
72 	// There would need to be more bytes to satisfy the char.
73 	if (bytesExpected > length)
74 		return 0;
75 
76 	// We already know the first byte is fine, check the rest.
77 	for (size_t i = 1; i < bytesExpected; i++) {
78 		if (!IsInsideGlyph(bytes[i])) {
79 			// The sequence is incomplete.
80 			return 0;
81 		}
82 	}
83 
84 	// Puh, everything's fine.
85 	return bytesExpected;
86 }
87 
88 
89 static inline uint32
UTF8PreviousCharLen(const char * text,const char * limit)90 UTF8PreviousCharLen(const char *text, const char *limit)
91 {
92 	const char *ptr = text;
93 
94 	if (ptr == NULL || limit == NULL)
95 		return 0;
96 
97 	do {
98 		if (ptr == limit)
99 			break;
100 		ptr--;
101 	} while (IsInsideGlyph(*ptr));
102 
103 	return text - ptr;
104 }
105 
106 
107 /*!	UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
108 	numChars characters are read. If numChars is a negative value it is ignored
109 	and the string is read up to the terminating 0.
110 */
111 static inline uint32
UTF8CountBytes(const char * bytes,int32 numChars)112 UTF8CountBytes(const char *bytes, int32 numChars)
113 {
114 	if (bytes == NULL)
115 		return 0;
116 
117 	if (numChars < 0)
118 		numChars = INT_MAX;
119 
120 	const char *base = bytes;
121 	while (bytes[0] != '\0') {
122 		if ((bytes[0] & 0xc0) != 0x80) {
123 			if (--numChars < 0)
124 				break;
125 		}
126 		bytes++;
127 	}
128 
129 	return bytes - base;
130 }
131 
132 
133 /*!	UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
134 	numBytes bytes are read. If numBytes is a negative value it is ignored
135 	and the string is read up to the terminating 0.
136 */
137 static inline uint32
UTF8CountChars(const char * bytes,int32 numBytes)138 UTF8CountChars(const char *bytes, int32 numBytes)
139 {
140 	if (bytes == NULL)
141 		return 0;
142 
143 	uint32 length = 0;
144 	if (numBytes < 0) {
145 		while (bytes[0]) {
146 			if ((bytes++[0] & 0xc0) != 0x80)
147 				length++;
148 		}
149 	} else {
150 		const char *last = bytes + numBytes - 1;
151 		while (bytes[0] && bytes <= last) {
152 			if ((bytes++[0] & 0xc0) != 0x80)
153 				length++;
154 		}
155 	}
156 
157 	return length;
158 }
159 
160 
161 /*!	UTF8ToCharCode converts the input that includes potential multibyte chars
162 	to UTF-32 char codes that can be used by FreeType. The string pointer is
163 	then advanced to the next character in the string. In case the terminating
164 	0 is reached, the string pointer is not advanced anymore and nulls are
165 	returned. This makes it safe to overruns and enables streamed processing
166 	of UTF8 strings.
167 */
168 static inline uint32
UTF8ToCharCode(const char ** bytes)169 UTF8ToCharCode(const char **bytes)
170 {
171 	#define UTF8_SUBSTITUTE_CHARACTER	0xfffd
172 
173 	uint32 result;
174 	if (((*bytes)[0] & 0x80) == 0) {
175 		// a single byte character
176 		result = (*bytes)[0];
177 		if (result != '\0') {
178 			// do not advance beyond the terminating '\0'
179 			(*bytes)++;
180 		}
181 
182 		return result;
183 	}
184 
185 	if (((*bytes)[0] & 0xc0) == 0x80) {
186 		// not a proper multibyte start
187 		(*bytes)++;
188 		return UTF8_SUBSTITUTE_CHARACTER;
189 	}
190 
191 	// start of a multibyte character
192 	uint8 mask = 0x80;
193 	result = (uint32)((*bytes)[0] & 0xff);
194 	(*bytes)++;
195 
196 	while (result & mask) {
197 		if (mask == 0x02) {
198 			// seven byte char - invalid
199 			return UTF8_SUBSTITUTE_CHARACTER;
200 		}
201 
202 		result &= ~mask;
203 		mask >>= 1;
204 	}
205 
206 	while (((*bytes)[0] & 0xc0) == 0x80) {
207 		result <<= 6;
208 		result += (*bytes)[0] & 0x3f;
209 		(*bytes)++;
210 
211 		mask <<= 1;
212 		if (mask == 0x40)
213 			return result;
214 	}
215 
216 	if (mask == 0x40)
217 		return result;
218 
219 	if ((*bytes)[0] == '\0') {
220 		// string terminated within multibyte char
221 		return 0x00;
222 	}
223 
224 	// not enough bytes in multibyte char
225 	return UTF8_SUBSTITUTE_CHARACTER;
226 
227 	#undef UTF8_SUBSTITUTE_CHARACTER
228 }
229 
230 #endif	// _UTF8_FUNCTIONS_H
231