xref: /haiku/src/kits/locale/UnicodeChar.cpp (revision 5c4b63b505326ac530e26f1926b511949888c5c9)
1 /*
2 ** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3 ** Distributed under the terms of the OpenBeOS License.
4 */
5 
6 /* Reads the information out of the data files created by (an edited version of)
7  * IBM's ICU genprops utility. The BUnicodeChar class is mostly the counterpart
8  * to ICU's uchar module, but is not as huge or broad as that one.
9  *
10  * Note, it probably won't be able to handle the output of the original genprops
11  * tool and vice versa - only use the tool provided with this project to create
12  * the Unicode property file.
13  * However, the algorithmic idea behind the property file is still the same as
14  * found in ICU - nothing important has been changed, so more recent versions
15  * of genprops tool/data can probably be ported without too much effort.
16  *
17  * In case no property file can be found it will still provide basic services
18  * for the Latin-1 part of the character tables.
19  */
20 
21 
22 #include <OS.h>
23 
24 #include <UnicodeChar.h>
25 
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 
30 
31 #define FLAG(n) ((uint32)1 << (n))
32 enum {
33 	UF_UPPERCASE		= FLAG(B_UNICODE_UPPERCASE_LETTER),
34 	UF_LOWERCASE		= FLAG(B_UNICODE_LOWERCASE_LETTER),
35 	UF_TITLECASE		= FLAG(B_UNICODE_TITLECASE_LETTER),
36 	UF_MODIFIER_LETTER	= FLAG(B_UNICODE_MODIFIER_LETTER),
37 	UF_OTHER_LETTER		= FLAG(B_UNICODE_OTHER_LETTER),
38 	UF_DECIMAL_NUMBER	= FLAG(B_UNICODE_DECIMAL_DIGIT_NUMBER),
39 	UF_OTHER_NUMBER		= FLAG(B_UNICODE_OTHER_NUMBER),
40 	UF_LETTER_NUMBER	= FLAG(B_UNICODE_LETTER_NUMBER)
41 };
42 
43 
44 static uint32 gStaticProps32Table[] = {
45     /* 0x00 */	0x48f,		0x48f,		0x48f,		0x48f,
46     /* 0x04 */	0x48f,		0x48f,		0x48f,		0x48f,
47     /* 0x08 */	0x48f,		0x20c,		0x1ce,		0x20c,
48     /* 0x0c */	0x24d,		0x1ce,		0x48f,		0x48f,
49     /* 0x10 */	0x48f,		0x48f,		0x48f,		0x48f,
50     /* 0x14 */	0x48f,		0x48f,		0x48f,		0x48f,
51     /* 0x18 */	0x48f,		0x48f,		0x48f,		0x48f,
52     /* 0x1c */	0x1ce,		0x1ce,		0x1ce,		0x20c,
53     /* 0x20 */	0x24c,		0x297,		0x297,		0x117,
54     /* 0x24 */	0x119,		0x117,		0x297,		0x297,
55     /* 0x28 */	0x100a94,	0xfff00a95,	0x297,		0x118,
56     /* 0x2c */	0x197,		0x113,		0x197,		0xd7,
57     /* 0x30 */	0x89,		0x100089,	0x200089,	0x300089,
58     /* 0x34 */	0x400089,	0x500089,	0x600089,	0x700089,
59     /* 0x38 */	0x800089,	0x900089,	0x197,		0x297,
60     /* 0x3c */	0x200a98,	0x298,		0xffe00a98,	0x297,
61     /* 0x40 */	0x297,		0x2000001,	0x2000001,	0x2000001,
62     /* 0x44 */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
63     /* 0x48 */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
64     /* 0x4c */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
65     /* 0x50 */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
66     /* 0x54 */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
67     /* 0x58 */	0x2000001,	0x2000001,	0x2000001,	0x200a94,
68     /* 0x5c */	0x297,		0xffe00a95,	0x29a,		0x296,
69     /* 0x60 */	0x29a,		0x2000002,	0x2000002,	0x2000002,
70     /* 0x64 */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
71     /* 0x68 */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
72     /* 0x6c */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
73     /* 0x70 */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
74     /* 0x74 */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
75     /* 0x78 */	0x2000002,	0x2000002,	0x2000002,	0x200a94,
76     /* 0x7c */	0x298,		0xffe00a95,	0x298,		0x48f,
77     /* 0x80 */	0x48f,		0x48f,		0x48f,		0x48f,
78     /* 0x84 */	0x48f,		0x1ce,		0x48f,		0x48f,
79     /* 0x88 */	0x48f,		0x48f,		0x48f,		0x48f,
80     /* 0x8c */	0x48f,		0x48f,		0x48f,		0x48f,
81     /* 0x90 */	0x48f,		0x48f,		0x48f,		0x48f,
82     /* 0x94 */	0x48f,		0x48f,		0x48f,		0x48f,
83     /* 0x98 */	0x48f,		0x48f,		0x48f,		0x48f,
84     /* 0x9c */	0x48f,		0x48f,		0x48f,		0x48f
85 };
86 
// Positions within the "indexes" array that the variable trie
// configuration below refers to (see sStage3Bits).
// NOTE(review): presumably the header of the property data file - the
// array itself is not defined in this file, confirm against the data format.
enum {
	INDEX_STAGE_2_BITS = 0,
	INDEX_STAGE_3_BITS = 1,
	INDEX_EXCEPTIONS = 2,
	INDEX_STAGE_3_INDEX = 3,
	INDEX_PROPS = 4,
	INDEX_UCHARS = 5
};
95 
/* Constants for access to the exception data.
 * Each constant is the bit position of an optional value in the first
 * exception word; haveExceptionValue() tests that bit, and
 * addExceptionOffset() skips over the values that precede it.
 */
enum {
	EXC_UPPERCASE = 0,
	EXC_LOWERCASE = 1,
	EXC_TITLECASE = 2,
	EXC_DIGIT_VALUE = 3,
	EXC_NUMERIC_VALUE = 4,
	EXC_DENOMINATOR_VALUE = 5,
	EXC_MIRROR_MAPPING = 6,
	EXC_SPECIAL_CASING = 7,
	EXC_CASE_FOLDING = 8
};
108 
// Bit positions of the fields within a 32-bit property word.
// Bits 0-4 (below EXCEPTION_SHIFT) hold the general category.
enum {
	EXCEPTION_SHIFT = 5,
	BIDI_SHIFT = EXCEPTION_SHIFT + 1,
	MIRROR_SHIFT = BIDI_SHIFT + 5,
	VALUE_SHIFT = 20,

	// width of the value field that starts at VALUE_SHIFT
	VALUE_BITS = 32 - VALUE_SHIFT
};
117 
118 /* number of bits in an 8-bit integer value */
119 #define EXC_GROUP 8
120 static uint8 gFlagsOffset[256] = {
121 	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
122 	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
123 	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
124 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
125 	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
126 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
127 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
128 	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
129 	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
130 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
131 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
132 	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
133 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
134 	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
135 	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
136 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
137 };
138 
#if defined(UCHAR_VARIABLE_TRIE_BITS)
	// Access values that are calculated from the indices.
	// NOTE(review): these variables lack the "s" prefix that the macros of
	// the hardcoded branch carry - confirm before enabling this branch.
	static uint16_t stage23Bits;
	static uint16_t stage2Mask;
	static uint16_t stage3Mask;
#	define sStage3Bits   indexes[INDEX_STAGE_3_BITS]
#else
	// Hardcoded bit distribution for the trie table access.
#	define sStage23Bits  10
#	define sStage2Mask   0x3f
#	define sStage3Mask   0xf
#	define sStage3Bits   4
#endif
150 
151 
152 /**	We need to change the char category for ISO 8 controls, since the
153  *	genprops utility we got from IBM's ICU apparently changes it for
154  *	some characters.
155  */
156 
157 static inline bool
158 isISO8Control(uint32 c)
159 {
160 	return ((uint32)c < 0x20 || (uint32)(c - 0x7f) <= 0x20);
161 }
162 
163 
164 static inline uint32
165 getProperties(uint32 c)
166 {
167 	if (c > 0x10ffff)
168 		return 0;
169 
170 	// TODO : Data from unicode
171 
172 	return c > 0x9f ? 0 : gStaticProps32Table[c];
173 }
174 
175 
176 static inline uint8
177 getCategory(uint32 properties)
178 {
179 	return properties & 0x1f;
180 }
181 
182 
183 static inline bool
184 propertyIsException(uint32 properties)
185 {
186 	return properties & (1UL << EXCEPTION_SHIFT);
187 }
188 
189 
190 static inline uint32
191 getUnsignedValue(uint32 properties)
192 {
193 	return properties >> VALUE_SHIFT;
194 }
195 
196 
197 static inline uint32
198 getSignedValue(uint32 properties)
199 {
200 	return (int32)properties >> VALUE_SHIFT;
201 }
202 
203 
204 static inline uint32 *
205 getExceptions(uint32 properties)
206 {
207 	// TODO : data from unicode
208 	return 0;
209 }
210 
211 
212 static inline bool
213 haveExceptionValue(uint32 flags,int16 index)
214 {
215 	return flags & (1UL << index);
216 }
217 
218 
219 static inline void
220 addExceptionOffset(uint32 &flags, int16 &index, uint32 **offset)
221 {
222 	if (index >= EXC_GROUP) {
223 		*offset += gFlagsOffset[flags & ((1 << EXC_GROUP) - 1)];
224 		flags >>= EXC_GROUP;
225 		index -= EXC_GROUP;
226 	}
227 	*offset += gFlagsOffset[flags & ((1 << index) - 1)];
228 }
229 
230 
231 //	#pragma mark -
232 
233 
234 BUnicodeChar::BUnicodeChar()
235 {
236 }
237 
238 
239 bool
240 BUnicodeChar::IsAlpha(uint32 c)
241 {
242 	BUnicodeChar();
243 	return (FLAG(getCategory(getProperties(c)))
244 			& (UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
245 		   ) != 0;
246 }
247 
248 
249 /** Returns the type code of the specified unicode character */
250 int8
251 BUnicodeChar::Type(uint32 c)
252 {
253 	BUnicodeChar();
254 	return (int8)getCategory(getProperties(c));
255 }
256 
257 
258 bool
259 BUnicodeChar::IsLower(uint32 c)
260 {
261 	BUnicodeChar();
262     return getCategory(getProperties(c)) == B_UNICODE_LOWERCASE_LETTER;
263 }
264 
265 
266 bool
267 BUnicodeChar::IsUpper(uint32 c)
268 {
269 	BUnicodeChar();
270 	return getCategory(getProperties(c)) == B_UNICODE_UPPERCASE_LETTER;
271 }
272 
273 
274 bool
275 BUnicodeChar::IsTitle(uint32 c)
276 {
277 	BUnicodeChar();
278 	return getCategory(getProperties(c)) == B_UNICODE_TITLECASE_LETTER;
279 }
280 
281 
282 bool
283 BUnicodeChar::IsDigit(uint32 c)
284 {
285 	BUnicodeChar();
286 	return (FLAG(getCategory(getProperties(c)))
287 			& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER)
288 		   ) != 0;
289 }
290 
291 
292 bool
293 BUnicodeChar::IsAlNum(uint32 c)
294 {
295 	BUnicodeChar();
296 	return (FLAG(getCategory(getProperties(c)))
297 			& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER | UF_UPPERCASE
298 			   | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
299            ) != 0;
300 }
301 
302 
303 bool
304 BUnicodeChar::IsDefined(uint32 c)
305 {
306 	BUnicodeChar();
307 	return getProperties(c) != 0;
308 }
309 
310 
311 /** Returns true if the specified unicode character is a base
312  *	form character that can be used with a diacritic.
313  *	This doesn't mean that the character has to be distinct,
314  *	though.
315  */
316 
317 bool
318 BUnicodeChar::IsBase(uint32 c)
319 {
320 	BUnicodeChar();
321 	return (FLAG(getCategory(getProperties(c)))
322 			& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER
323 			   | UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE
324 			   | UF_MODIFIER_LETTER | UF_OTHER_LETTER | FLAG(B_UNICODE_NON_SPACING_MARK)
325 			   | FLAG(B_UNICODE_ENCLOSING_MARK) | FLAG(B_UNICODE_COMBINING_SPACING_MARK))
326 		   ) != 0;
327 }
328 
329 
330 /** Returns true if the specified unicode character is a
331  *	control character.
332  */
333 
334 bool
335 BUnicodeChar::IsControl(uint32 c)
336 {
337 	BUnicodeChar();
338 	return isISO8Control(c)
339 			|| (FLAG(getCategory(getProperties(c)))
340 				& (FLAG(B_UNICODE_CONTROL_CHAR) | FLAG(B_UNICODE_FORMAT_CHAR)
341 					| FLAG(B_UNICODE_LINE_SEPARATOR) | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
342 			   ) != 0;
343 }
344 
345 
346 /** Returns true if the specified unicode character is a
347  *	punctuation character.
348  */
349 
350 bool
351 BUnicodeChar::IsPunctuation(uint32 c)
352 {
353 	BUnicodeChar();
354 	return (FLAG(getCategory(getProperties(c)))
355 			& (FLAG(B_UNICODE_DASH_PUNCTUATION)
356 				| FLAG(B_UNICODE_START_PUNCTUATION)
357 				| FLAG(B_UNICODE_END_PUNCTUATION)
358 				| FLAG(B_UNICODE_CONNECTOR_PUNCTUATION)
359 				| FLAG(B_UNICODE_OTHER_PUNCTUATION))
360 			) != 0;
361 }
362 
363 
364 /** Returns true if the specified unicode character is some
365  *	kind of a space character.
366  */
367 
368 bool
369 BUnicodeChar::IsSpace(uint32 c)
370 {
371 	BUnicodeChar();
372 	return (FLAG(getCategory(getProperties(c)))
373 			& (FLAG(B_UNICODE_SPACE_SEPARATOR)
374 				| FLAG(B_UNICODE_LINE_SEPARATOR)
375 				| FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
376 		   ) != 0;
377 }
378 
379 
380 /** Returns true if the specified unicode character is a white
381  *	space character.
382  *	This is essentially the same as IsSpace(), but excludes all
383  *	non-breakable spaces.
384  */
385 
386 bool
387 BUnicodeChar::IsWhitespace(uint32 c)
388 {
389 	BUnicodeChar();
390 	return (FLAG(getCategory(getProperties(c)))
391 			& (FLAG(B_UNICODE_SPACE_SEPARATOR)
392 				| FLAG(B_UNICODE_LINE_SEPARATOR)
393 				| FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
394 		   ) != 0 && c != 0xa0 && c != 0x202f && c != 0xfeff; // exclude non-breakable spaces
395 }
396 
397 
398 /** Returns true if the specified unicode character is printable.
399  */
400 
401 bool
402 BUnicodeChar::IsPrintable(uint32 c)
403 {
404 	BUnicodeChar();
405 	return !isISO8Control(c)
406 			&& (FLAG(getCategory(getProperties(c)))
407 				& ~(FLAG(B_UNICODE_UNASSIGNED) | FLAG(B_UNICODE_CONTROL_CHAR)
408 					| FLAG(B_UNICODE_FORMAT_CHAR) | FLAG(B_UNICODE_PRIVATE_USE_CHAR)
409 					| FLAG(B_UNICODE_SURROGATE) | FLAG(B_UNICODE_GENERAL_OTHER_TYPES)
410 					| FLAG(31))
411 				   ) != 0;
412 }
413 
414 
415 //	#pragma mark -
416 
417 
418 /** Transforms the specified unicode character to lowercase.
419  */
420 
421 uint32
422 BUnicodeChar::ToLower(uint32 c)
423 {
424 	BUnicodeChar();
425 
426 	uint32 props = getProperties(c);
427 
428 	if (!propertyIsException(props)) {
429 		if (FLAG(getCategory(props)) & (UF_UPPERCASE | UF_TITLECASE))
430 			return c + getSignedValue(props);
431 	} else {
432 		uint32 *exceptions = getExceptions(props);
433 		uint32 firstExceptionValue = *exceptions;
434 
435 		if (haveExceptionValue(firstExceptionValue, EXC_LOWERCASE)) {
436 			int16 index = EXC_LOWERCASE;
437 			addExceptionOffset(firstExceptionValue, index, &++exceptions);
438 			return *exceptions;
439 		}
440 	}
441 	// no mapping found, just return the character unchanged
442 	return c;
443 }
444 
445 
446 /** Transforms the specified unicode character to uppercase.
447  */
448 
449 uint32
450 BUnicodeChar::ToUpper(uint32 c)
451 {
452 	BUnicodeChar();
453 
454 	uint32 props = getProperties(c);
455 
456 	if (!propertyIsException(props)) {
457 		if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER)
458 			return c - getSignedValue(props);
459 	} else {
460 		uint32 *exceptions = getExceptions(props);
461 		uint32 firstExceptionValue = *exceptions;
462 
463 		if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) {
464 			int16 index = EXC_UPPERCASE;
465 			++exceptions;
466 			addExceptionOffset(firstExceptionValue, index, &exceptions);
467 			return *exceptions;
468 		}
469     }
470 	// no mapping found, just return the character unchanged
471 	return c;
472 }
473 
474 
475 /** Transforms the specified unicode character to title case.
476  */
477 
478 uint32
479 BUnicodeChar::ToTitle(uint32 c)
480 {
481 	BUnicodeChar();
482 
483 	uint32 props = getProperties(c);
484 
485 	if (!propertyIsException(props)) {
486 		if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) {
487 			// here, titlecase is the same as uppercase
488 			return c - getSignedValue(props);
489 		}
490 	} else {
491 		uint32 *exceptions = getExceptions(props);
492 		uint32 firstExceptionValue = *exceptions;
493 
494 		if (haveExceptionValue(firstExceptionValue, EXC_TITLECASE)) {
495 			int16 index = EXC_TITLECASE;
496 			addExceptionOffset(firstExceptionValue, index, &++exceptions);
497 			return (uint32)*exceptions;
498 		} else if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) {
499 			// here, titlecase is the same as uppercase
500 			int16 index = EXC_UPPERCASE;
501 			addExceptionOffset(firstExceptionValue, index, &++exceptions);
502 			return *exceptions;
503 		}
504 	}
505 	// no mapping found, just return the character unchanged
506 	return c;
507 }
508 
509 
510 int32
511 BUnicodeChar::DigitValue(uint32 c)
512 {
513 	BUnicodeChar();
514 
515 	uint32 props = getProperties(c);
516 
517 	if (!propertyIsException(props)) {
518 		if (getCategory(props) == B_UNICODE_DECIMAL_DIGIT_NUMBER)
519 			return getSignedValue(props);
520 	} else {
521 		uint32 *exceptions = getExceptions(props);
522 		uint32 firstExceptionValue = *exceptions;
523 
524 		if (haveExceptionValue(firstExceptionValue, EXC_DIGIT_VALUE)) {
525 			int16 index = EXC_DIGIT_VALUE;
526 			addExceptionOffset(firstExceptionValue, index, &++exceptions);
527 
528 			int32 value = (int32)(int16)*exceptions;
529 				 // the digit value is in the lower 16 bits
530 			if (value != -1)
531 				return value;
532 		}
533 	}
534 
535     // If there is no value in the properties table,
536     // then check for some special characters
537 	switch (c) {
538 		case 0x3007:	return 0;
539 		case 0x4e00:	return 1;
540 		case 0x4e8c:	return 2;
541 		case 0x4e09:	return 3;
542 		case 0x56d8:	return 4;
543 		case 0x4e94:	return 5;
544 		case 0x516d:	return 6;
545 		case 0x4e03:	return 7;
546 		case 0x516b:	return 8;
547 		case 0x4e5d:	return 9;
548 		default:		return -1;
549 	}
550 }
551 
552 
553 void
554 BUnicodeChar::ToUTF8(uint32 c, char **out)
555 {
556 	char *s = *out;
557 
558 	if (c < 0x80)
559 		*(s++) = c;
560 	else if (c < 0x800) {
561 		*(s++) = 0xc0 | (c >> 6);
562 		*(s++) = 0x80 | (c & 0x3f);
563 	} else if (c < 0x10000) {
564 		*(s++) = 0xe0 | (c >> 12);
565 		*(s++) = 0x80 | ((c >> 6) & 0x3f);
566 		*(s++) = 0x80 | (c & 0x3f);
567 	} else if (c <= 0x10ffff) {
568 		*(s++) = 0xf0 | (c >> 18);
569 		*(s++) = 0x80 | ((c >> 12) & 0x3f);
570 		*(s++) = 0x80 | ((c >> 6) & 0x3f);
571 		*(s++) = 0x80 | (c & 0x3f);
572 	}
573 	*out = s;
574 }
575 
576 
577 uint32
578 BUnicodeChar::FromUTF8(const char **in)
579 {
580 	uint8 *bytes = (uint8 *)*in;
581 	if (bytes == NULL)
582 		return 0;
583 
584 	int32 length;
585 	uint8 mask = 0x1f;
586 
587 	switch (bytes[0] & 0xf0) {
588 		case 0xc0:
589 		case 0xd0:	length = 2; break;
590 		case 0xe0:	length = 3; break;
591 		case 0xf0:
592 			mask = 0x0f;
593 			length = 4;
594 			break;
595 		default:
596 			// valid 1-byte character
597 			// and invalid characters
598 			(*in)++;
599 			return bytes[0];
600 	}
601 	uint32 c = bytes[0] & mask;
602 	int32 i = 1;
603 	for (;i < length && (bytes[i] & 0x80) > 0;i++)
604 		c = (c << 6) | (bytes[i] & 0x3f);
605 
606 	if (i < length) {
607 		// invalid character
608 		(*in)++;
609 		return (uint32)bytes[0];
610 	}
611 	*in += length;
612 	return c;
613 }
614 
615 size_t
616 BUnicodeChar::UTF8StringLength(const char *str)
617 {
618 	size_t len = 0;
619 	while (*str) {
620 		FromUTF8(&str);
621 		len++;
622 	}
623 	return len;
624 }
625 
626 size_t
627 BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength)
628 {
629 	size_t len = 0;
630 	while (len < maxLength && *str) {
631 		FromUTF8(&str);
632 		len++;
633 	}
634 	return len;
635 }
636 
637