xref: /haiku/headers/private/interface/utf8_functions.h (revision 1acbe440b8dd798953bec31d18ee589aa3f71b73)
1 /*
2  * Copyright 2004-2006, Haiku, Inc.
3  * Distributed under the terms of the MIT License.
4  */
5 #ifndef _UTF8_FUNCTIONS_H
6 #define _UTF8_FUNCTIONS_H
7 
8 
9 #include <SupportDefs.h>
10 
11 
12 static inline bool
13 IsInsideGlyph(uchar ch)
14 {
15 	return (ch & 0xc0) == 0x80;
16 }
17 
18 
19 static inline uint32
20 UTF8NextCharLenUnsafe(const char *text)
21 {
22 	const char *ptr = text;
23 
24 	do {
25 		ptr++;
26 	} while (IsInsideGlyph(*ptr));
27 
28 	return ptr - text;
29 }
30 
31 
32 static inline uint32
33 UTF8NextCharLen(const char *text)
34 {
35 	if (text == NULL || *text == 0)
36 		return 0;
37 
38 	return UTF8NextCharLenUnsafe(text);
39 }
40 
41 
42 static inline uint32
43 UTF8PreviousCharLen(const char *text, const char *limit)
44 {
45 	const char *ptr = text;
46 
47 	if (ptr == NULL || limit == NULL)
48 		return 0;
49 
50 	do {
51 		if (ptr == limit)
52 			break;
53 		ptr--;
54 	} while (IsInsideGlyph(*ptr));
55 
56 	return text - ptr;
57 }
58 
59 
60 /*!	UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
61 	numChars characters are read. If numChars is a negative value it is ignored
62 	and the string is read up to the terminating 0.
63 */
64 static inline uint32
65 UTF8CountBytes(const char *bytes, int32 numChars)
66 {
67 	if (!bytes)
68 		return 0;
69 
70 	if (numChars < 0)
71 		numChars = INT_MAX;
72 
73 	const char *base = bytes;
74 	while (*bytes && numChars-- > 0) {
75 		if (bytes[0] & 0x80) {
76 			if (bytes[0] & 0x40) {
77 				if (bytes[0] & 0x20) {
78 					if (bytes[0] & 0x10) {
79 						if (bytes[1] == 0 || bytes[2] == 0 || bytes[3] == 0)
80 							return (bytes - base);
81 
82 						bytes += 4;
83 						continue;
84 					}
85 
86 					if (bytes[1] == 0 || bytes[2] == 0)
87 						return (bytes - base);
88 
89 					bytes += 3;
90 					continue;
91 				}
92 
93 				if (bytes[1] == 0)
94 					return (bytes - base);
95 
96 				bytes += 2;
97 				continue;
98 			}
99 
100 			/* Not a startbyte - skip */
101 			bytes += 1;
102 			continue;
103 		}
104 
105 		bytes += 1;
106 	}
107 
108 	return (bytes - base);
109 }
110 
111 
112 /*!	UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
113 	numBytes bytes are read. If numBytes is a negative value it is ignored
114 	and the string is read up to the terminating 0.
115 */
116 static inline uint32
117 UTF8CountChars(const char *bytes, int32 numBytes)
118 {
119 	if (!bytes)
120 		return 0;
121 
122 	uint32 length = 0;
123 	const char *last = bytes + numBytes - 1;
124 	if (numBytes < 0)
125 		last = (const char *)UINT_MAX;
126 
127 	while (*bytes && bytes <= last) {
128 		if (bytes[0] & 0x80) {
129 			if (bytes[0] & 0x40) {
130 				if (bytes[0] & 0x20) {
131 					if (bytes[0] & 0x10) {
132 						if (bytes[1] == 0 || bytes[2] == 0 || bytes[3] == 0)
133 							return length;
134 
135 						bytes += 4;
136 						length++;
137 						continue;
138 					}
139 
140 					if (bytes[1] == 0 || bytes[2] == 0)
141 						return length;
142 
143 					bytes += 3;
144 					length++;
145 					continue;
146 				}
147 
148 				if (bytes[1] == 0)
149 					return length;
150 
151 				bytes += 2;
152 				length++;
153 				continue;
154 			}
155 
156 			/* Not a startbyte - skip */
157 			bytes += 1;
158 			continue;
159 		}
160 
161 		bytes += 1;
162 		length++;
163 	}
164 
165 	return length;
166 }
167 
168 
169 /*!	UTF8ToCharCode converts the input that includes potential multibyte chars
170 	to UTF-32 char codes that can be used by FreeType. The string pointer is
171 	then advanced to the next character in the string. In case the terminating
172 	0 is reached, the string pointer is not advanced anymore and spaces are
173 	returned. This makes it safe to overruns and enables streamed processing
174 	of UTF8 strings.
175 */
176 static inline uint32
177 UTF8ToCharCode(const char **bytes)
178 {
179 	register uint32 result = 0;
180 
181 	if ((*bytes)[0] & 0x80) {
182 		if ((*bytes)[0] & 0x40) {
183 			if ((*bytes)[0] & 0x20) {
184 				if ((*bytes)[0] & 0x10) {
185 					if ((*bytes)[0] & 0x08) {
186 						/*	A five byte char?!
187 							Something's wrong, substitute. */
188 						result += 0x20;
189 						(*bytes)++;
190 						return result;
191 					}
192 
193 					if ((*bytes)[1] == 0 || (*bytes)[2] == 0 || (*bytes)[3] == 0)
194 						return 0x00;
195 
196 					/* A four byte char */
197 					result += (*bytes)[0] & 0x07;
198 					result <<= 6;
199 					result += (*bytes)[1] & 0x3f;
200 					result <<= 6;
201 					result += (*bytes)[2] & 0x3f;
202 					result <<= 6;
203 					result += (*bytes)[3] & 0x3f;
204 					(*bytes) += 4;
205 					return result;
206 				}
207 
208 				if ((*bytes)[1] == 0 || (*bytes)[2] == 0)
209 					return 0x00;
210 
211 				/* A three byte char */
212 				result += (*bytes)[0] & 0x0f;
213 				result <<= 6;
214 				result += (*bytes)[1] & 0x3f;
215 				result <<= 6;
216 				result += (*bytes)[2] & 0x3f;
217 				(*bytes) += 3;
218 				return result;
219 			}
220 
221 			if ((*bytes)[1] == 0)
222 				return 0x00;
223 
224 			/* A two byte char */
225 			result += (*bytes)[0] & 0x1f;
226 			result <<= 6;
227 			result += (*bytes)[1] & 0x3f;
228 			(*bytes) += 2;
229 			return result;
230 		}
231 
232 		/*	This (10) is not a startbyte.
233 			Substitute with a space. */
234 		result += 0x20;
235 		(*bytes)++;
236 		return result;
237 	}
238 
239 	if ((*bytes)[0] == 0) {
240 		/*	We do not advance beyond the terminating 0. */
241 		return 0x00;
242 	}
243 
244 	result += (*bytes)[0];
245 	(*bytes)++;
246 	return result;
247 }
248 
249 #endif	// _UTF8_FUNCTIONS_H
250