xref: /haiku/src/kits/locale/UnicodeChar.cpp (revision 23338ed551920aae841646afa77530c41efb42c8)
1 /*
2 ** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3 ** Distributed under the terms of the OpenBeOS License.
4 */
5 
6 /* Reads the information out of the data files created by (an edited version of)
7  * IBM's ICU genprops utility. The BUnicodeChar class is mostly the counterpart
8  * to ICU's uchar module, but is not as huge or broad as that one.
9  *
 * Note, it probably won't be able to handle the output of the original genprops
11  * tool and vice versa - only use the tool provided with this project to create
12  * the Unicode property file.
13  * However, the algorithmic idea behind the property file is still the same as
14  * found in ICU - nothing important has been changed, so more recent versions
15  * of genprops tool/data can probably be ported without too much effort.
16  *
17  * In case no property file can be found it will still provide basic services
18  * for the Latin-1 part of the character tables.
19  */
20 
21 
22 #include <OS.h>
23 
24 #include <UnicodeChar.h>
25 #include "UnicodeProperties.h"
26 #include "PropertyFile.h"
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 
32 
33 #if B_BEOS_VERSION <= B_BEOS_VERSION_5 && !defined(__HAIKU__)
34 // B_BAD_DATA was introduced with DANO, so we define it for R5:
35 #define B_BAD_DATA -2147483632L
36 #endif
37 
38 static const uint16 *sPropsTable = NULL;
39 #define sProps32Table ((uint32 *)sPropsTable)
40 static uint16 *sIndices;
41 static vint32 sHavePropsData = 0;
42 
43 #define FLAG(n) ((uint32)1 << (n))
44 enum {
45 	UF_UPPERCASE		= FLAG(B_UNICODE_UPPERCASE_LETTER),
46 	UF_LOWERCASE		= FLAG(B_UNICODE_LOWERCASE_LETTER),
47 	UF_TITLECASE		= FLAG(B_UNICODE_TITLECASE_LETTER),
48 	UF_MODIFIER_LETTER	= FLAG(B_UNICODE_MODIFIER_LETTER),
49 	UF_OTHER_LETTER		= FLAG(B_UNICODE_OTHER_LETTER),
50 	UF_DECIMAL_NUMBER	= FLAG(B_UNICODE_DECIMAL_DIGIT_NUMBER),
51 	UF_OTHER_NUMBER		= FLAG(B_UNICODE_OTHER_NUMBER),
52 	UF_LETTER_NUMBER	= FLAG(B_UNICODE_LETTER_NUMBER)
53 };
54 
55 
56 static uint32 gStaticProps32Table[] = {
57     /* 0x00 */	0x48f,		0x48f,		0x48f,		0x48f,
58     /* 0x04 */	0x48f,		0x48f,		0x48f,		0x48f,
59     /* 0x08 */	0x48f,		0x20c,		0x1ce,		0x20c,
60     /* 0x0c */	0x24d,		0x1ce,		0x48f,		0x48f,
61     /* 0x10 */	0x48f,		0x48f,		0x48f,		0x48f,
62     /* 0x14 */	0x48f,		0x48f,		0x48f,		0x48f,
63     /* 0x18 */	0x48f,		0x48f,		0x48f,		0x48f,
64     /* 0x1c */	0x1ce,		0x1ce,		0x1ce,		0x20c,
65     /* 0x20 */	0x24c,		0x297,		0x297,		0x117,
66     /* 0x24 */	0x119,		0x117,		0x297,		0x297,
67     /* 0x28 */	0x100a94,	0xfff00a95,	0x297,		0x118,
68     /* 0x2c */	0x197,		0x113,		0x197,		0xd7,
69     /* 0x30 */	0x89,		0x100089,	0x200089,	0x300089,
70     /* 0x34 */	0x400089,	0x500089,	0x600089,	0x700089,
71     /* 0x38 */	0x800089,	0x900089,	0x197,		0x297,
72     /* 0x3c */	0x200a98,	0x298,		0xffe00a98,	0x297,
73     /* 0x40 */	0x297,		0x2000001,	0x2000001,	0x2000001,
74     /* 0x44 */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
75     /* 0x48 */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
76     /* 0x4c */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
77     /* 0x50 */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
78     /* 0x54 */	0x2000001,	0x2000001,	0x2000001,	0x2000001,
79     /* 0x58 */	0x2000001,	0x2000001,	0x2000001,	0x200a94,
80     /* 0x5c */	0x297,		0xffe00a95,	0x29a,		0x296,
81     /* 0x60 */	0x29a,		0x2000002,	0x2000002,	0x2000002,
82     /* 0x64 */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
83     /* 0x68 */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
84     /* 0x6c */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
85     /* 0x70 */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
86     /* 0x74 */	0x2000002,	0x2000002,	0x2000002,	0x2000002,
87     /* 0x78 */	0x2000002,	0x2000002,	0x2000002,	0x200a94,
88     /* 0x7c */	0x298,		0xffe00a95,	0x298,		0x48f,
89     /* 0x80 */	0x48f,		0x48f,		0x48f,		0x48f,
90     /* 0x84 */	0x48f,		0x1ce,		0x48f,		0x48f,
91     /* 0x88 */	0x48f,		0x48f,		0x48f,		0x48f,
92     /* 0x8c */	0x48f,		0x48f,		0x48f,		0x48f,
93     /* 0x90 */	0x48f,		0x48f,		0x48f,		0x48f,
94     /* 0x94 */	0x48f,		0x48f,		0x48f,		0x48f,
95     /* 0x98 */	0x48f,		0x48f,		0x48f,		0x48f,
96     /* 0x9c */	0x48f,		0x48f,		0x48f,		0x48f
97 };
98 
// Positions of the entries at the start of the loaded property data
// (accessed through sIndices)
enum {
    INDEX_STAGE_2_BITS	= 0,
    INDEX_STAGE_3_BITS	= 1,
    INDEX_EXCEPTIONS	= 2,
    INDEX_STAGE_3_INDEX	= 3,
    INDEX_PROPS			= 4,
    INDEX_UCHARS		= 5
};
107 
/* Constants and macros for access to the data.
 * Each EXC_* constant is the bit number in the first word of an exceptions
 * entry that tells whether the respective value is present.
 */
enum {
    EXC_UPPERCASE			= 0,
    EXC_LOWERCASE			= 1,
    EXC_TITLECASE			= 2,
    EXC_DIGIT_VALUE			= 3,
    EXC_NUMERIC_VALUE		= 4,
    EXC_DENOMINATOR_VALUE	= 5,
    EXC_MIRROR_MAPPING		= 6,
    EXC_SPECIAL_CASING		= 7,
    EXC_CASE_FOLDING		= 8
};
120 
// Bit positions of the fields within a 32-bit properties word
enum {
    EXCEPTION_SHIFT	= 5,
    BIDI_SHIFT		= EXCEPTION_SHIFT + 1,
    MIRROR_SHIFT	= BIDI_SHIFT + 5,
    VALUE_SHIFT		= 20,

    // width of the value field that starts at VALUE_SHIFT
    VALUE_BITS		= 32 - VALUE_SHIFT
};
129 
130 /* number of bits in an 8-bit integer value */
131 #define EXC_GROUP 8
132 static uint8 gFlagsOffset[256] = {
133 	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
134 	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
135 	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
136 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
137 	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
138 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
139 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
140 	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
141 	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
142 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
143 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
144 	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
145 	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
146 	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
147 	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
148 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
149 };
150 
#ifdef UCHAR_VARIABLE_TRIE_BITS
	// Access values calculated from the indices of the loaded data; they
	// are filled in by loadPropsData(). The names have to match the ones
	// used by loadPropsData() and getProperties().
	static uint16 sStage23Bits, sStage2Mask, sStage3Mask;
#	define sStage3Bits   sIndices[INDEX_STAGE_3_BITS]
#else
    // Use hardcoded bit distribution for the trie table access
#	define sStage23Bits  10
#	define sStage2Mask   0x3f
#	define sStage3Mask   0xf
#	define sStage3Bits   4
#endif
162 
163 
164 /**	We need to change the char category for ISO 8 controls, since the
165  *	genprops utility we got from IBM's ICU apparently changes it for
166  *	some characters.
167  */
168 
169 static inline bool
170 isISO8Control(uint32 c)
171 {
172 	return ((uint32)c < 0x20 || (uint32)(c - 0x7f) <= 0x20);
173 }
174 
175 
176 static inline uint32
177 getProperties(uint32 c)
178 {
179 	if (c > 0x10ffff)
180 		return 0;
181 
182 	if (sHavePropsData > 0)
183 		return sProps32Table[sPropsTable[
184 					sPropsTable[sPropsTable[8 + (c >> sStage23Bits)]
185 						+ ((c >> sStage3Bits) & sStage2Mask)]
186 					+ (c & sStage3Mask)]];
187 
188 	return c > 0x9f ? 0 : gStaticProps32Table[c];
189 }
190 
191 
192 static inline uint8
193 getCategory(uint32 properties)
194 {
195 	return properties & 0x1f;
196 }
197 
198 
199 static inline bool
200 propertyIsException(uint32 properties)
201 {
202 	return properties & (1UL << EXCEPTION_SHIFT);
203 }
204 
205 
206 static inline uint32
207 getUnsignedValue(uint32 properties)
208 {
209 	return properties >> VALUE_SHIFT;
210 }
211 
212 
213 static inline uint32
214 getSignedValue(uint32 properties)
215 {
216 	return (int32)properties >> VALUE_SHIFT;
217 }
218 
219 
220 static inline uint32 *
221 getExceptions(uint32 properties)
222 {
223 	return sProps32Table + sIndices[INDEX_EXCEPTIONS] + getUnsignedValue(properties);
224 }
225 
226 
227 static inline bool
228 haveExceptionValue(uint32 flags,int16 index)
229 {
230 	return flags & (1UL << index);
231 }
232 
233 
234 static inline void
235 addExceptionOffset(uint32 &flags, int16 &index, uint32 **offset)
236 {
237 	if (index >= EXC_GROUP) {
238 		*offset += gFlagsOffset[flags & ((1 << EXC_GROUP) - 1)];
239 		flags >>= EXC_GROUP;
240 		index -= EXC_GROUP;
241 	}
242 	*offset += gFlagsOffset[flags & ((1 << index) - 1)];
243 }
244 
245 
246 static status_t
247 loadPropsData()
248 {
249 	PropertyFile file;
250 	status_t status = file.SetTo(PROPERTIES_DIRECTORY, PROPERTIES_FILE_NAME);
251 	if (status < B_OK) {
252 		fprintf(stderr, "could not open unicode.properties file: %s\n", strerror(status));
253 		return status;
254 	}
255 
256 	off_t size = file.Size();
257 	uint16 *table = (uint16 *)malloc(size);
258 	if (table == NULL)
259 		return B_NO_MEMORY;
260 
261 	if (file.Read(table, size) < size) {
262 		free(table);
263 		return B_IO_ERROR;
264 	}
265 
266 	// check if the property file matches our needs
267 	if (table[INDEX_STAGE_2_BITS] != 6 || table[INDEX_STAGE_3_BITS] != 4) {
268 		free(table);
269 		return B_BAD_DATA;
270 	}
271 
272 	sIndices = table;
273 #ifdef UCHAR_VARIABLE_TRIE_BITS
274 	sStage23Bits = uint16(sIndices[INDEX_STAGE_2_BITS] + sIndices[INDEX_STAGE_3_BITS]);
275 	sStage2Mask = uint16((1 << sIndices[INDEX_STAGE_2_BITS]) - 1);
276 	sStage3Mask = uint16((1 << sIndices[INDEX_STAGE_3_BITS]) - 1);
277 #endif
278 
279 	sPropsTable = table;
280 	sHavePropsData = 1;
281 
282 	return B_OK;
283 }
284 
285 
286 //	#pragma mark -
287 
288 
289 /**	If the constructor is used for the first time, the property
290  *	file gets loaded from disk.
291  *	It makes sure that this will only happen once throughout the
292  *	application's lifetime.
293  */
294 
295 BUnicodeChar::BUnicodeChar()
296 {
297 	static int32 lock = 0;
298 
299 	if (atomic_add(&lock, 1) > 0) {
300 		while (sHavePropsData == 0)
301 			snooze(10000);
302 
303 		return;
304 	}
305 	if (loadPropsData() < B_OK)
306 		sHavePropsData = -1;
307 }
308 
309 
310 bool
311 BUnicodeChar::IsAlpha(uint32 c)
312 {
313 	BUnicodeChar();
314 	return (FLAG(getCategory(getProperties(c)))
315 			& (UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
316 		   ) != 0;
317 }
318 
319 
320 /** Returns the type code of the specified unicode character */
321 
322 int8
323 BUnicodeChar::Type(uint32 c)
324 {
325 	BUnicodeChar();
326 	return (int8)getCategory(getProperties(c));
327 }
328 
329 
330 bool
331 BUnicodeChar::IsLower(uint32 c)
332 {
333 	BUnicodeChar();
334     return getCategory(getProperties(c)) == B_UNICODE_LOWERCASE_LETTER;
335 }
336 
337 
338 bool
339 BUnicodeChar::IsUpper(uint32 c)
340 {
341 	BUnicodeChar();
342 	return getCategory(getProperties(c)) == B_UNICODE_UPPERCASE_LETTER;
343 }
344 
345 
346 bool
347 BUnicodeChar::IsTitle(uint32 c)
348 {
349 	BUnicodeChar();
350 	return getCategory(getProperties(c)) == B_UNICODE_TITLECASE_LETTER;
351 }
352 
353 
354 bool
355 BUnicodeChar::IsDigit(uint32 c)
356 {
357 	BUnicodeChar();
358 	return (FLAG(getCategory(getProperties(c)))
359 			& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER)
360 		   ) != 0;
361 }
362 
363 
364 bool
365 BUnicodeChar::IsAlNum(uint32 c)
366 {
367 	BUnicodeChar();
368 	return (FLAG(getCategory(getProperties(c)))
369 			& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER | UF_UPPERCASE
370 			   | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER)
371            ) != 0;
372 }
373 
374 
375 bool
376 BUnicodeChar::IsDefined(uint32 c)
377 {
378 	BUnicodeChar();
379 	return getProperties(c) != 0;
380 }
381 
382 
383 /** Returns true if the specified unicode character is a base
384  *	form character that can be used with a diacritic.
385  *	This doesn't mean that the character has to be distinct,
386  *	though.
387  */
388 
389 bool
390 BUnicodeChar::IsBase(uint32 c)
391 {
392 	BUnicodeChar();
393 	return (FLAG(getCategory(getProperties(c)))
394 			& (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER
395 			   | UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE
396 			   | UF_MODIFIER_LETTER | UF_OTHER_LETTER | FLAG(B_UNICODE_NON_SPACING_MARK)
397 			   | FLAG(B_UNICODE_ENCLOSING_MARK) | FLAG(B_UNICODE_COMBINING_SPACING_MARK))
398 		   ) != 0;
399 }
400 
401 
402 /** Returns true if the specified unicode character is a
403  *	control character.
404  */
405 
406 bool
407 BUnicodeChar::IsControl(uint32 c)
408 {
409 	BUnicodeChar();
410 	return isISO8Control(c)
411 			|| (FLAG(getCategory(getProperties(c)))
412 				& (FLAG(B_UNICODE_CONTROL_CHAR) | FLAG(B_UNICODE_FORMAT_CHAR)
413 					| FLAG(B_UNICODE_LINE_SEPARATOR) | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
414 			   ) != 0;
415 }
416 
417 
418 /** Returns true if the specified unicode character is a
419  *	punctuation character.
420  */
421 
422 bool
423 BUnicodeChar::IsPunctuation(uint32 c)
424 {
425 	BUnicodeChar();
426 	return (FLAG(getCategory(getProperties(c)))
427 			& (FLAG(B_UNICODE_DASH_PUNCTUATION)
428 				| FLAG(B_UNICODE_START_PUNCTUATION)
429 				| FLAG(B_UNICODE_END_PUNCTUATION)
430 				| FLAG(B_UNICODE_CONNECTOR_PUNCTUATION)
431 				| FLAG(B_UNICODE_OTHER_PUNCTUATION))
432 			) != 0;
433 }
434 
435 
436 /** Returns true if the specified unicode character is some
437  *	kind of a space character.
438  */
439 
440 bool
441 BUnicodeChar::IsSpace(uint32 c)
442 {
443 	BUnicodeChar();
444 	return (FLAG(getCategory(getProperties(c)))
445 			& (FLAG(B_UNICODE_SPACE_SEPARATOR)
446 				| FLAG(B_UNICODE_LINE_SEPARATOR)
447 				| FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
448 		   ) != 0;
449 }
450 
451 
452 /** Returns true if the specified unicode character is a white
453  *	space character.
454  *	This is essentially the same as IsSpace(), but excludes all
455  *	non-breakable spaces.
456  */
457 
458 bool
459 BUnicodeChar::IsWhitespace(uint32 c)
460 {
461 	BUnicodeChar();
462 	return (FLAG(getCategory(getProperties(c)))
463 			& (FLAG(B_UNICODE_SPACE_SEPARATOR)
464 				| FLAG(B_UNICODE_LINE_SEPARATOR)
465 				| FLAG(B_UNICODE_PARAGRAPH_SEPARATOR))
466 		   ) != 0 && c != 0xa0 && c != 0x202f && c != 0xfeff; // exclude non-breakable spaces
467 }
468 
469 
470 /** Returns true if the specified unicode character is printable.
471  */
472 
473 bool
474 BUnicodeChar::IsPrintable(uint32 c)
475 {
476 	BUnicodeChar();
477 	return !isISO8Control(c)
478 			&& (FLAG(getCategory(getProperties(c)))
479 				& ~(FLAG(B_UNICODE_UNASSIGNED) | FLAG(B_UNICODE_CONTROL_CHAR)
480 					| FLAG(B_UNICODE_FORMAT_CHAR) | FLAG(B_UNICODE_PRIVATE_USE_CHAR)
481 					| FLAG(B_UNICODE_SURROGATE) | FLAG(B_UNICODE_GENERAL_OTHER_TYPES)
482 					| FLAG(31))
483 				   ) != 0;
484 }
485 
486 
487 //	#pragma mark -
488 
489 
490 /** Transforms the specified unicode character to lowercase.
491  */
492 
493 uint32
494 BUnicodeChar::ToLower(uint32 c)
495 {
496 	BUnicodeChar();
497 
498 	uint32 props = getProperties(c);
499 
500 	if (!propertyIsException(props)) {
501 		if (FLAG(getCategory(props)) & (UF_UPPERCASE | UF_TITLECASE))
502 			return c + getSignedValue(props);
503 	} else {
504 		uint32 *exceptions = getExceptions(props);
505 		uint32 firstExceptionValue = *exceptions;
506 
507 		if (haveExceptionValue(firstExceptionValue, EXC_LOWERCASE)) {
508 			int16 index = EXC_LOWERCASE;
509 			addExceptionOffset(firstExceptionValue, index, &++exceptions);
510 			return *exceptions;
511 		}
512 	}
513 	// no mapping found, just return the character unchanged
514 	return c;
515 }
516 
517 
518 /** Transforms the specified unicode character to uppercase.
519  */
520 
521 uint32
522 BUnicodeChar::ToUpper(uint32 c)
523 {
524 	BUnicodeChar();
525 
526 	uint32 props = getProperties(c);
527 
528 	if (!propertyIsException(props)) {
529 		if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER)
530 			return c - getSignedValue(props);
531 	} else {
532 		uint32 *exceptions = getExceptions(props);
533 		uint32 firstExceptionValue = *exceptions;
534 
535 		if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) {
536 			int16 index = EXC_UPPERCASE;
537 			++exceptions;
538 			addExceptionOffset(firstExceptionValue, index, &exceptions);
539 			return *exceptions;
540 		}
541     }
542 	// no mapping found, just return the character unchanged
543 	return c;
544 }
545 
546 
547 /** Transforms the specified unicode character to title case.
548  */
549 
550 uint32
551 BUnicodeChar::ToTitle(uint32 c)
552 {
553 	BUnicodeChar();
554 
555 	uint32 props = getProperties(c);
556 
557 	if (!propertyIsException(props)) {
558 		if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) {
559 			// here, titlecase is the same as uppercase
560 			return c - getSignedValue(props);
561 		}
562 	} else {
563 		uint32 *exceptions = getExceptions(props);
564 		uint32 firstExceptionValue = *exceptions;
565 
566 		if (haveExceptionValue(firstExceptionValue, EXC_TITLECASE)) {
567 			int16 index = EXC_TITLECASE;
568 			addExceptionOffset(firstExceptionValue, index, &++exceptions);
569 			return (uint32)*exceptions;
570 		} else if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) {
571 			// here, titlecase is the same as uppercase
572 			int16 index = EXC_UPPERCASE;
573 			addExceptionOffset(firstExceptionValue, index, &++exceptions);
574 			return *exceptions;
575 		}
576 	}
577 	// no mapping found, just return the character unchanged
578 	return c;
579 }
580 
581 
582 int32
583 BUnicodeChar::DigitValue(uint32 c)
584 {
585 	BUnicodeChar();
586 
587 	uint32 props = getProperties(c);
588 
589 	if (!propertyIsException(props)) {
590 		if (getCategory(props) == B_UNICODE_DECIMAL_DIGIT_NUMBER)
591 			return getSignedValue(props);
592 	} else {
593 		uint32 *exceptions = getExceptions(props);
594 		uint32 firstExceptionValue = *exceptions;
595 
596 		if (haveExceptionValue(firstExceptionValue, EXC_DIGIT_VALUE)) {
597 			int16 index = EXC_DIGIT_VALUE;
598 			addExceptionOffset(firstExceptionValue, index, &++exceptions);
599 
600 			int32 value = (int32)(int16)*exceptions;
601 				 // the digit value is in the lower 16 bits
602 			if (value != -1)
603 				return value;
604 		}
605 	}
606 
607     // If there is no value in the properties table,
608     // then check for some special characters
609 	switch (c) {
610 		case 0x3007:	return 0;
611 		case 0x4e00:	return 1;
612 		case 0x4e8c:	return 2;
613 		case 0x4e09:	return 3;
614 		case 0x56d8:	return 4;
615 		case 0x4e94:	return 5;
616 		case 0x516d:	return 6;
617 		case 0x4e03:	return 7;
618 		case 0x516b:	return 8;
619 		case 0x4e5d:	return 9;
620 		default:		return -1;
621 	}
622 }
623 
624 
625 void
626 BUnicodeChar::ToUTF8(uint32 c, char **out)
627 {
628 	char *s = *out;
629 
630 	if (c < 0x80)
631 		*(s++) = c;
632 	else if (c < 0x800) {
633 		*(s++) = 0xc0 | (c >> 6);
634 		*(s++) = 0x80 | (c & 0x3f);
635 	} else if (c < 0x10000) {
636 		*(s++) = 0xe0 | (c >> 12);
637 		*(s++) = 0x80 | ((c >> 6) & 0x3f);
638 		*(s++) = 0x80 | (c & 0x3f);
639 	} else if (c <= 0x10ffff) {
640 		*(s++) = 0xf0 | (c >> 18);
641 		*(s++) = 0x80 | ((c >> 12) & 0x3f);
642 		*(s++) = 0x80 | ((c >> 6) & 0x3f);
643 		*(s++) = 0x80 | (c & 0x3f);
644 	}
645 	*out = s;
646 }
647 
648 
649 uint32
650 BUnicodeChar::FromUTF8(const char **in)
651 {
652 	uint8 *bytes = (uint8 *)*in;
653 	if (bytes == NULL)
654 		return 0;
655 
656 	int32 length;
657 	uint8 mask = 0x1f;
658 
659 	switch (bytes[0] & 0xf0) {
660 		case 0xc0:
661 		case 0xd0:	length = 2; break;
662 		case 0xe0:	length = 3; break;
663 		case 0xf0:
664 			mask = 0x0f;
665 			length = 4;
666 			break;
667 		default:
668 			// valid 1-byte character
669 			// and invalid characters
670 			(*in)++;
671 			return bytes[0];
672 	}
673 	uint32 c = bytes[0] & mask;
674 	int32 i = 1;
675 	for (;i < length && (bytes[i] & 0x80) > 0;i++)
676 		c = (c << 6) | (bytes[i] & 0x3f);
677 
678 	if (i < length) {
679 		// invalid character
680 		(*in)++;
681 		return (uint32)bytes[0];
682 	}
683 	*in += length;
684 	return c;
685 }
686 
687 size_t
688 BUnicodeChar::UTF8StringLength(const char *str)
689 {
690 	size_t len = 0;
691 	while (*str) {
692 		FromUTF8(&str);
693 		len++;
694 	}
695 	return len;
696 }
697 
698 size_t
699 BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength)
700 {
701 	size_t len = 0;
702 	while (len < maxLength && *str) {
703 		FromUTF8(&str);
704 		len++;
705 	}
706 	return len;
707 }
708 
709