xref: /haiku/headers/build/private/interface/utf8_functions.h (revision b46615c55ad2c8fe6de54412055a0713da3d610a)
1 /*
2  * Copyright 2004-2010, Haiku, Inc.
3  * Distributed under the terms of the MIT License.
4  */
5 #ifndef _UTF8_FUNCTIONS_H
6 #define _UTF8_FUNCTIONS_H
7 
8 
9 #include <SupportDefs.h>
10 
11 
/*!	IsInsideGlyph tells whether ch is a UTF8 continuation byte, i.e. a byte
	whose two most significant bits are "10". Such a byte lies inside a
	multibyte sequence and never starts a character.
*/
static inline bool
IsInsideGlyph(uchar ch)
{
	// Flipping bit 7 turns the wanted "10" prefix into "00", so both top
	// bits must be clear afterwards.
	return ((ch ^ 0x80) & 0xc0) == 0;
}
17 
18 
/*!	UTF8NextCharLenUnsafe gets the length (in bytes) of the character that
	starts at text. The first byte is always counted, then every directly
	following continuation byte is added. Neither a NULL pointer nor a
	terminating 0 at text[0] is checked for, hence "unsafe".
*/
static inline uint32
UTF8NextCharLenUnsafe(const char *text)
{
	// Skip the leading byte unconditionally, whatever it is.
	const char *end = text + 1;

	// Then walk over the continuation bytes belonging to the same character.
	while (IsInsideGlyph(*end))
		end++;

	return end - text;
}
30 
31 
/*!	UTF8NextCharLen gets the length (in bytes) of the character that starts
	at text. A NULL pointer or an empty string yields 0; everything else is
	handed to UTF8NextCharLenUnsafe.
*/
static inline uint32
UTF8NextCharLen(const char *text)
{
	// Only measure when there actually is a first byte to look at.
	return (text != NULL && text[0] != '\0')
		? UTF8NextCharLenUnsafe(text) : 0;
}
40 
41 
/*!	UTF8PreviousCharLen gets the length (in bytes) of the character that ends
	right before text. The search walks backwards until it finds a byte that
	is not a continuation byte, but never steps back past limit. If limit is
	reached first, the distance from limit to text is returned. 0 is returned
	if either pointer is NULL or if text already equals limit.
*/
static inline uint32
UTF8PreviousCharLen(const char *text, const char *limit)
{
	if (text == NULL || limit == NULL)
		return 0;

	const char *start = text;
	while (start != limit) {
		start--;

		// A byte that is not a continuation byte starts the character.
		if (!IsInsideGlyph(*start))
			break;
	}

	return text - start;
}
58 
59 
/*!	UTF8CountBytes gets the length (in bytes) of the first numChars
	characters of a UTF8 string. Counting stops early at the terminating 0.
	If numChars is a negative value it is ignored and the string is read up
	to the terminating 0. A NULL string yields 0.
*/
static inline uint32
UTF8CountBytes(const char *bytes, int32 numChars)
{
	if (bytes == NULL)
		return 0;

	// A negative count means "no limit".
	int32 remaining = numChars < 0 ? INT_MAX : numChars;

	const char *cursor;
	for (cursor = bytes; *cursor != '\0'; cursor++) {
		// Continuation bytes belong to the character already accounted for.
		if ((*cursor & 0xc0) == 0x80)
			continue;

		// A new character starts here; stop in front of it once the
		// character budget is used up.
		if (--remaining < 0)
			break;
	}

	return cursor - bytes;
}
84 
85 
/*!	UTF8CountChars gets the length (in characters) of a UTF8 string. At most
	numBytes bytes are read, and counting also stops at the terminating 0. If
	numBytes is a negative value it is ignored and the string is read up to
	the terminating 0. Every byte that is not a continuation byte (bit
	pattern 10xxxxxx) counts as one character. A NULL string yields 0.
*/
static inline uint32
UTF8CountChars(const char *bytes, int32 numBytes)
{
	if (bytes == NULL)
		return 0;

	uint32 length = 0;

	// The byte budget is tested before the byte is read, so a buffer of
	// exactly numBytes bytes without a terminating 0 is never read past its
	// end. A negative numBytes is never decremented and therefore never
	// reaches 0, which makes it mean "unlimited".
	while (numBytes != 0 && bytes[0] != '\0') {
		if ((bytes[0] & 0xc0) != 0x80)
			length++;

		bytes++;
		if (numBytes > 0)
			numBytes--;
	}

	return length;
}
110 
111 
/*!	UTF8ToCharCode decodes the character at *bytes, which may be a multibyte
	sequence, into a UTF-32 char code and advances *bytes behind the bytes it
	consumed.

	- A terminating 0 is returned as 0 and *bytes is not advanced, so calling
	  again keeps returning 0.
	- A continuation byte without a preceding lead byte, a lead byte of 0xfe
	  or 0xff, and a sequence cut short by a byte other than the terminating
	  0 all yield the substitute character 0xfffd.
	- A sequence cut short by the terminating 0 yields 0, with *bytes left
	  pointing at the terminator.

	Sequences of up to six bytes are accepted; the decoded value is not
	validated any further.
*/
static inline uint32
UTF8ToCharCode(const char **bytes)
{
	#define UTF8_SUBSTITUTE_CHARACTER	0xfffd

	const uint8 lead = (uint8)(*bytes)[0];

	if (lead < 0x80) {
		// plain single byte character; stay put on the terminating '\0'
		if (lead != '\0')
			(*bytes)++;

		return lead;
	}

	// every remaining case consumes the first byte
	(*bytes)++;

	if (lead < 0xc0) {
		// 10xxxxxx: continuation byte without a lead byte
		return UTF8_SUBSTITUTE_CHARACTER;
	}

	if (lead >= 0xfe) {
		// would announce a seven (or more) byte char - invalid
		return UTF8_SUBSTITUTE_CHARACTER;
	}

	// The lead byte starts with at least two 1 bits. Every further 1 bit
	// announces one more continuation byte; the first 0 bit ends the prefix.
	uint32 pending = 1;
	uint8 probe = 0x20;
	while (lead & probe) {
		pending++;
		probe >>= 1;
	}

	// payload bits are the ones below the terminating 0 bit of the prefix
	uint32 code = lead & (probe - 1);

	for (; pending > 0; pending--) {
		const uint8 next = (uint8)(*bytes)[0];
		if ((next & 0xc0) != 0x80) {
			// The sequence is incomplete; the offending byte is left for
			// the next call. A string that ends within the multibyte char
			// reports the terminator instead of a substitute.
			return next == '\0' ? 0x00 : UTF8_SUBSTITUTE_CHARACTER;
		}

		code = (code << 6) | (next & 0x3f);
		(*bytes)++;
	}

	return code;

	#undef UTF8_SUBSTITUTE_CHARACTER
}
180 
181 #endif	// _UTF8_FUNCTIONS_H
182