xref: /haiku/headers/private/interface/utf8_functions.h (revision d3d8b26997fac34a84981e6d2b649521de2cc45a)
1 /*
2  * Copyright 2004-2006, Haiku, Inc.
3  * Distributed under the terms of the MIT License.
4  */
5 #ifndef _UTF8_FUNCTIONS_H
6 #define _UTF8_FUNCTIONS_H
7 
8 
9 #include <SupportDefs.h>
10 
11 
12 static inline bool
13 IsInsideGlyph(uchar ch)
14 {
15 	return (ch & 0xC0) == 0x80;
16 }
17 
18 static inline uint32
19 UTF8NextCharLenUnsafe(const char *text)
20 {
21 	const char *ptr = text;
22 
23 	do {
24 		ptr++;
25 	} while (IsInsideGlyph(*ptr));
26 
27 	return ptr - text;
28 }
29 
30 static inline uint32
31 UTF8NextCharLen(const char *text)
32 {
33 	if (text == NULL || *text == 0)
34 		return 0;
35 
36 	return UTF8NextCharLenUnsafe(text);
37 }
38 
39 static inline uint32
40 UTF8PreviousCharLen(const char *text, const char *limit)
41 {
42 	const char *ptr = text;
43 
44 	if (ptr == NULL || limit == NULL)
45 		return 0;
46 
47 	do {
48 		if (ptr == limit)
49 			break;
50 		ptr--;
51 	} while (IsInsideGlyph(*ptr));
52 
53 	return text - ptr;
54 }
55 
56 // TODO: use this function in other places of this file...
57 static inline uint32
58 count_utf8_bytes(uchar ch)
59 {
60 	// the number of high bits set until the first
61 	// unset bit determine the count of bytes used for
62 	// this glyph from this byte on
63 	uchar bit = 1 << 7;
64 	uint32 count = 1;
65 	if (ch & bit) {
66 		bit = bit >> 1;
67 		while (ch & bit) {
68 			count++;
69 			bit = bit >> 1;
70 		}
71 	}
72 	return count;
73 }
74 
75 static inline uint32
76 UTF8CountBytes(const char *text, uint32 numChars)
77 {
78 	if (text) {
79 		// iterate over numChars glyphs incrementing ptr by the
80 		// number of bytes for each glyph, which is encoded in
81 		// the first byte of any glyph.
82 		const char *ptr = text;
83 		while (numChars--) {
84 			ptr += count_utf8_bytes(*ptr);
85 		}
86 		return ptr - text;
87 	}
88 	return 0;
89 }
90 
91 static inline uint32
92 UTF8CountChars(const char *text, int32 numBytes)
93 {
94 	const char* ptr = text;
95 	const char* last = ptr + numBytes - 1;
96 
97 	uint32 count = 0;
98 	while (ptr <= last) {
99 		ptr += UTF8NextCharLen(ptr);
100 		count++;
101 	}
102 
103 	return count;
104 }
105 
106 
107 /*	UTF8ToCharCode converts the input that includes potential multibyte chars
108 	to UTF-32 char codes that can be used by FreeType. The string pointer is
109 	then advanced to the next character in the string. In case the terminating
110 	0 is reached, the string pointer is not advanced anymore and spaces are
111 	returned. This makes it safe to overruns and enables streamed processing
112 	of UTF8 strings. */
113 static inline uint32
114 UTF8ToCharCode(const char **bytes)
115 {
116 	register uint32 result = 0;
117 
118 	if ((*bytes)[0] & 0x80) {
119 		if ((*bytes)[0] & 0x40) {
120 			if ((*bytes)[0] & 0x20) {
121 				if ((*bytes)[0] & 0x10) {
122 					if ((*bytes)[0] & 0x08) {
123 						/*	A five byte char?!
124 							Something's wrong, substitute. */
125 						result += 0x20;
126 						(*bytes)++;
127 						return result;
128 					}
129 
130 					/* A four byte char */
131 					result += (*bytes)[0] & 0x07;
132 					result <<= 6;
133 					result += (*bytes)[1] & 0x3f;
134 					result <<= 6;
135 					result += (*bytes)[2] & 0x3f;
136 					result <<= 6;
137 					result += (*bytes)[3] & 0x3f;
138 					(*bytes) += 3;
139 					return result;
140 				}
141 
142 				/* A three byte char */
143 				result += (*bytes)[0] & 0x0f;
144 				result <<= 6;
145 				result += (*bytes)[1] & 0x3f;
146 				result <<= 6;
147 				result += (*bytes)[2] & 0x3f;
148 				(*bytes) += 3;
149 				return result;
150 			}
151 
152 			/* A two byte char */
153 			result += (*bytes)[0] & 0x1f;
154 			result <<= 6;
155 			result += (*bytes)[1] & 0x3f;
156 			(*bytes) += 2;
157 			return result;
158 		}
159 
160 		/*	This (10) is not a startbyte.
161 			Substitute with a space. */
162 		result += 0x20;
163 		(*bytes)++;
164 		return result;
165 	}
166 
167 	if ((*bytes)[0] == 0) {
168 		/*	We do not advance beyond the terminating 0. */
169 		return 0x00;
170 	}
171 
172 	result += (*bytes)[0];
173 	(*bytes)++;
174 	return result;
175 }
176 
177 
178 /*	UTF8ToLength works like strlen() but takes UTF8 encoded multibyte chars
179 	into account. It's a quicker version of UTF8CountChars above. */
180 static inline int32
181 UTF8ToLength(const char *bytes)
182 {
183 	int32 length = 0;
184 	while (*bytes) {
185 		length++;
186 
187 		if (bytes[0] & 0x80) {
188 			if (bytes[0] & 0x40) {
189 				if (bytes[0] & 0x20) {
190 					if (bytes[0] & 0x10) {
191 						bytes += 4;
192 						continue;
193 					}
194 
195 					bytes += 3;
196 					continue;
197 				}
198 
199 				bytes += 2;
200 				continue;
201 			}
202 
203 			/* Not a startbyte - skip */
204 		}
205 
206 		bytes += 1;
207 	}
208 
209 	return length;
210 }
211 
212 #endif	// _UTF8_FUNCTIONS_H
213