1 /* 2 ** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. 3 ** Distributed under the terms of the OpenBeOS License. 4 */ 5 6 /* Reads the information out of the data files created by (an edited version of) 7 * IBM's ICU genprops utility. The BUnicodeChar class is mostly the counterpart 8 * to ICU's uchar module, but is not as huge or broad as that one. 9 * 10 * Note, it probably won't be able to handle the output of the orginal genprops 11 * tool and vice versa - only use the tool provided with this project to create 12 * the Unicode property file. 13 * However, the algorithmic idea behind the property file is still the same as 14 * found in ICU - nothing important has been changed, so more recent versions 15 * of genprops tool/data can probably be ported without too much effort. 16 * 17 * In case no property file can be found it will still provide basic services 18 * for the Latin-1 part of the character tables. 19 */ 20 21 22 #include <OS.h> 23 24 #include <UnicodeChar.h> 25 #include "UnicodeProperties.h" 26 #include "PropertyFile.h" 27 28 #include <stdlib.h> 29 #include <stdio.h> 30 #include <string.h> 31 32 33 #if B_BEOS_VERSION <= B_BEOS_VERSION_5 && !defined(__HAIKU__) 34 // B_BAD_DATA was introduced with DANO, so we define it for R5: 35 #define B_BAD_DATA -2147483632L 36 #endif 37 38 static const uint16 *sPropsTable = NULL; 39 #define sProps32Table ((uint32 *)sPropsTable) 40 static uint16 *sIndices; 41 static vint32 sHavePropsData = 0; 42 43 #define FLAG(n) ((uint32)1 << (n)) 44 enum { 45 UF_UPPERCASE = FLAG(B_UNICODE_UPPERCASE_LETTER), 46 UF_LOWERCASE = FLAG(B_UNICODE_LOWERCASE_LETTER), 47 UF_TITLECASE = FLAG(B_UNICODE_TITLECASE_LETTER), 48 UF_MODIFIER_LETTER = FLAG(B_UNICODE_MODIFIER_LETTER), 49 UF_OTHER_LETTER = FLAG(B_UNICODE_OTHER_LETTER), 50 UF_DECIMAL_NUMBER = FLAG(B_UNICODE_DECIMAL_DIGIT_NUMBER), 51 UF_OTHER_NUMBER = FLAG(B_UNICODE_OTHER_NUMBER), 52 UF_LETTER_NUMBER = FLAG(B_UNICODE_LETTER_NUMBER) 53 }; 54 55 56 static uint32 gStaticProps32Table[] = { 57 /* 0x00 */ 0x48f, 0x48f, 0x48f, 0x48f, 58 /* 0x04 */ 0x48f, 0x48f, 0x48f, 0x48f, 59 /* 0x08 */ 0x48f, 0x20c, 0x1ce, 0x20c, 60 /* 0x0c */ 0x24d, 0x1ce, 0x48f, 0x48f, 61 /* 0x10 */ 0x48f, 0x48f, 0x48f, 0x48f, 62 /* 0x14 */ 0x48f, 0x48f, 0x48f, 0x48f, 63 /* 0x18 */ 0x48f, 0x48f, 0x48f, 0x48f, 64 /* 0x1c */ 0x1ce, 0x1ce, 0x1ce, 0x20c, 65 /* 0x20 */ 0x24c, 0x297, 0x297, 0x117, 66 /* 0x24 */ 0x119, 0x117, 0x297, 0x297, 67 /* 0x28 */ 0x100a94, 0xfff00a95, 0x297, 0x118, 68 /* 0x2c */ 0x197, 0x113, 0x197, 0xd7, 69 /* 0x30 */ 0x89, 0x100089, 0x200089, 0x300089, 70 /* 0x34 */ 0x400089, 0x500089, 0x600089, 0x700089, 71 /* 0x38 */ 0x800089, 0x900089, 0x197, 0x297, 72 /* 0x3c */ 0x200a98, 0x298, 0xffe00a98, 0x297, 73 /* 0x40 */ 0x297, 0x2000001, 0x2000001, 0x2000001, 74 /* 0x44 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 75 /* 0x48 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 76 /* 0x4c */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 77 /* 0x50 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 78 /* 0x54 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 79 /* 0x58 */ 0x2000001, 0x2000001, 0x2000001, 0x200a94, 80 /* 0x5c */ 0x297, 0xffe00a95, 0x29a, 0x296, 81 /* 0x60 */ 0x29a, 0x2000002, 0x2000002, 0x2000002, 82 /* 0x64 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 83 /* 0x68 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 84 /* 0x6c */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 85 /* 0x70 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 86 /* 0x74 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 87 /* 0x78 */ 0x2000002, 0x2000002, 0x2000002, 0x200a94, 88 /* 0x7c */ 0x298, 0xffe00a95, 0x298, 0x48f, 89 /* 0x80 */ 0x48f, 0x48f, 0x48f, 0x48f, 90 /* 0x84 */ 0x48f, 0x1ce, 0x48f, 0x48f, 91 /* 0x88 */ 0x48f, 0x48f, 0x48f, 0x48f, 92 /* 0x8c */ 0x48f, 0x48f, 0x48f, 0x48f, 93 /* 0x90 */ 0x48f, 0x48f, 0x48f, 0x48f, 94 /* 0x94 */ 0x48f, 0x48f, 0x48f, 0x48f, 95 /* 0x98 */ 0x48f, 0x48f, 0x48f, 0x48f, 96 /* 0x9c */ 0x48f, 0x48f, 0x48f, 0x48f 97 }; 98 99 enum { 100 INDEX_STAGE_2_BITS, 101 INDEX_STAGE_3_BITS, 102 INDEX_EXCEPTIONS, 103 INDEX_STAGE_3_INDEX, 104 INDEX_PROPS, 105 INDEX_UCHARS 106 }; 107 108 /* constants and macros for access to the data */ 109 enum { 110 EXC_UPPERCASE, 111 EXC_LOWERCASE, 112 EXC_TITLECASE, 113 EXC_DIGIT_VALUE, 114 EXC_NUMERIC_VALUE, 115 EXC_DENOMINATOR_VALUE, 116 EXC_MIRROR_MAPPING, 117 EXC_SPECIAL_CASING, 118 EXC_CASE_FOLDING 119 }; 120 121 enum { 122 EXCEPTION_SHIFT = 5, 123 BIDI_SHIFT, 124 MIRROR_SHIFT = BIDI_SHIFT + 5, 125 VALUE_SHIFT = 20, 126 127 VALUE_BITS = 32 - VALUE_SHIFT 128 }; 129 130 /* number of bits in an 8-bit integer value */ 131 #define EXC_GROUP 8 132 static uint8 gFlagsOffset[256] = { 133 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 134 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 135 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 136 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 137 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 138 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 139 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 140 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 141 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 142 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 143 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 144 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 145 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 146 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 147 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 148 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 149 }; 150 151 #ifdef UCHAR_VARIABLE_TRIE_BITS 152 // access values calculated from indices 153 static uint16_t stage23Bits, stage2Mask, stage3Mask; 154 # define sStage3Bits indexes[INDEX_STAGE_3_BITS] 155 #else 156 // Use hardcoded bit distribution for the trie table access 157 # define sStage23Bits 10 158 # define sStage2Mask 0x3f 159 # define sStage3Mask 0xf 160 # define sStage3Bits 4 161 #endif 162 163 164 /** We need to change the char category for ISO 8 controls, since the 165 * genprops utility we got from IBM's ICU apparently changes it for 166 * some characters. 167 */ 168 169 static inline bool 170 isISO8Control(uint32 c) 171 { 172 return ((uint32)c < 0x20 || (uint32)(c - 0x7f) <= 0x20); 173 } 174 175 176 static inline uint32 177 getProperties(uint32 c) 178 { 179 if (c > 0x10ffff) 180 return 0; 181 182 if (sHavePropsData > 0) 183 return sProps32Table[sPropsTable[ 184 sPropsTable[sPropsTable[8 + (c >> sStage23Bits)] 185 + ((c >> sStage3Bits) & sStage2Mask)] 186 + (c & sStage3Mask)]]; 187 188 return c > 0x9f ? 0 : gStaticProps32Table[c]; 189 } 190 191 192 static inline uint8 193 getCategory(uint32 properties) 194 { 195 return properties & 0x1f; 196 } 197 198 199 static inline bool 200 propertyIsException(uint32 properties) 201 { 202 return properties & (1UL << EXCEPTION_SHIFT); 203 } 204 205 206 static inline uint32 207 getUnsignedValue(uint32 properties) 208 { 209 return properties >> VALUE_SHIFT; 210 } 211 212 213 static inline uint32 214 getSignedValue(uint32 properties) 215 { 216 return (int32)properties >> VALUE_SHIFT; 217 } 218 219 220 static inline uint32 * 221 getExceptions(uint32 properties) 222 { 223 return sProps32Table + sIndices[INDEX_EXCEPTIONS] + getUnsignedValue(properties); 224 } 225 226 227 static inline bool 228 haveExceptionValue(uint32 flags,int16 index) 229 { 230 return flags & (1UL << index); 231 } 232 233 234 static inline void 235 addExceptionOffset(uint32 &flags, int16 &index, uint32 **offset) 236 { 237 if (index >= EXC_GROUP) { 238 *offset += gFlagsOffset[flags & ((1 << EXC_GROUP) - 1)]; 239 flags >>= EXC_GROUP; 240 index -= EXC_GROUP; 241 } 242 *offset += gFlagsOffset[flags & ((1 << index) - 1)]; 243 } 244 245 246 static status_t 247 loadPropsData() 248 { 249 PropertyFile file; 250 status_t status = file.SetTo(PROPERTIES_DIRECTORY, PROPERTIES_FILE_NAME); 251 if (status < B_OK) { 252 fprintf(stderr, "could not open unicode.properties file: %s\n", strerror(status)); 253 return status; 254 } 255 256 off_t size = file.Size(); 257 uint16 *table = (uint16 *)malloc(size); 258 if (table == NULL) 259 return B_NO_MEMORY; 260 261 if (file.Read(table, size) < size) { 262 free(table); 263 return B_IO_ERROR; 264 } 265 266 // check if the property file matches our needs 267 if (table[INDEX_STAGE_2_BITS] != 6 || table[INDEX_STAGE_3_BITS] != 4) { 268 free(table); 269 return B_BAD_DATA; 270 } 271 272 sIndices = table; 273 #ifdef UCHAR_VARIABLE_TRIE_BITS 274 sStage23Bits = uint16(sIndices[INDEX_STAGE_2_BITS] + sIndices[INDEX_STAGE_3_BITS]); 275 sStage2Mask = uint16((1 << sIndices[INDEX_STAGE_2_BITS]) - 1); 276 sStage3Mask = uint16((1 << sIndices[INDEX_STAGE_3_BITS]) - 1); 277 #endif 278 279 sPropsTable = table; 280 sHavePropsData = 1; 281 282 return B_OK; 283 } 284 285 286 // #pragma mark - 287 288 289 /** If the constructor is used for the first time, the property 290 * file gets loaded from disk. 291 * It makes sure that this will only happen once throughout the 292 * application's lifetime. 293 */ 294 295 BUnicodeChar::BUnicodeChar() 296 { 297 static int32 lock = 0; 298 299 if (atomic_add(&lock, 1) > 0) { 300 while (sHavePropsData == 0) 301 snooze(10000); 302 303 return; 304 } 305 if (loadPropsData() < B_OK) 306 sHavePropsData = -1; 307 } 308 309 310 bool 311 BUnicodeChar::IsAlpha(uint32 c) 312 { 313 BUnicodeChar(); 314 return (FLAG(getCategory(getProperties(c))) 315 & (UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER) 316 ) != 0; 317 } 318 319 320 /** Returns the type code of the specified unicode character */ 321 322 int8 323 BUnicodeChar::Type(uint32 c) 324 { 325 BUnicodeChar(); 326 return (int8)getCategory(getProperties(c)); 327 } 328 329 330 bool 331 BUnicodeChar::IsLower(uint32 c) 332 { 333 BUnicodeChar(); 334 return getCategory(getProperties(c)) == B_UNICODE_LOWERCASE_LETTER; 335 } 336 337 338 bool 339 BUnicodeChar::IsUpper(uint32 c) 340 { 341 BUnicodeChar(); 342 return getCategory(getProperties(c)) == B_UNICODE_UPPERCASE_LETTER; 343 } 344 345 346 bool 347 BUnicodeChar::IsTitle(uint32 c) 348 { 349 BUnicodeChar(); 350 return getCategory(getProperties(c)) == B_UNICODE_TITLECASE_LETTER; 351 } 352 353 354 bool 355 BUnicodeChar::IsDigit(uint32 c) 356 { 357 BUnicodeChar(); 358 return (FLAG(getCategory(getProperties(c))) 359 & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER) 360 ) != 0; 361 } 362 363 364 bool 365 BUnicodeChar::IsAlNum(uint32 c) 366 { 367 BUnicodeChar(); 368 return (FLAG(getCategory(getProperties(c))) 369 & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER | UF_UPPERCASE 370 | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER) 371 ) != 0; 372 } 373 374 375 bool 376 BUnicodeChar::IsDefined(uint32 c) 377 { 378 BUnicodeChar(); 379 return getProperties(c) != 0; 380 } 381 382 383 /** Returns true if the specified unicode character is a base 384 * form character that can be used with a diacritic. 385 * This doesn't mean that the character has to be distinct, 386 * though. 387 */ 388 389 bool 390 BUnicodeChar::IsBase(uint32 c) 391 { 392 BUnicodeChar(); 393 return (FLAG(getCategory(getProperties(c))) 394 & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER 395 | UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE 396 | UF_MODIFIER_LETTER | UF_OTHER_LETTER | FLAG(B_UNICODE_NON_SPACING_MARK) 397 | FLAG(B_UNICODE_ENCLOSING_MARK) | FLAG(B_UNICODE_COMBINING_SPACING_MARK)) 398 ) != 0; 399 } 400 401 402 /** Returns true if the specified unicode character is a 403 * control character. 404 */ 405 406 bool 407 BUnicodeChar::IsControl(uint32 c) 408 { 409 BUnicodeChar(); 410 return isISO8Control(c) 411 || (FLAG(getCategory(getProperties(c))) 412 & (FLAG(B_UNICODE_CONTROL_CHAR) | FLAG(B_UNICODE_FORMAT_CHAR) 413 | FLAG(B_UNICODE_LINE_SEPARATOR) | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) 414 ) != 0; 415 } 416 417 418 /** Returns true if the specified unicode character is a 419 * punctuation character. 420 */ 421 422 bool 423 BUnicodeChar::IsPunctuation(uint32 c) 424 { 425 BUnicodeChar(); 426 return (FLAG(getCategory(getProperties(c))) 427 & (FLAG(B_UNICODE_DASH_PUNCTUATION) 428 | FLAG(B_UNICODE_START_PUNCTUATION) 429 | FLAG(B_UNICODE_END_PUNCTUATION) 430 | FLAG(B_UNICODE_CONNECTOR_PUNCTUATION) 431 | FLAG(B_UNICODE_OTHER_PUNCTUATION)) 432 ) != 0; 433 } 434 435 436 /** Returns true if the specified unicode character is some 437 * kind of a space character. 438 */ 439 440 bool 441 BUnicodeChar::IsSpace(uint32 c) 442 { 443 BUnicodeChar(); 444 return (FLAG(getCategory(getProperties(c))) 445 & (FLAG(B_UNICODE_SPACE_SEPARATOR) 446 | FLAG(B_UNICODE_LINE_SEPARATOR) 447 | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) 448 ) != 0; 449 } 450 451 452 /** Returns true if the specified unicode character is a white 453 * space character. 454 * This is essentially the same as IsSpace(), but excludes all 455 * non-breakable spaces. 456 */ 457 458 bool 459 BUnicodeChar::IsWhitespace(uint32 c) 460 { 461 BUnicodeChar(); 462 return (FLAG(getCategory(getProperties(c))) 463 & (FLAG(B_UNICODE_SPACE_SEPARATOR) 464 | FLAG(B_UNICODE_LINE_SEPARATOR) 465 | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) 466 ) != 0 && c != 0xa0 && c != 0x202f && c != 0xfeff; // exclude non-breakable spaces 467 } 468 469 470 /** Returns true if the specified unicode character is printable. 471 */ 472 473 bool 474 BUnicodeChar::IsPrintable(uint32 c) 475 { 476 BUnicodeChar(); 477 return !isISO8Control(c) 478 && (FLAG(getCategory(getProperties(c))) 479 & ~(FLAG(B_UNICODE_UNASSIGNED) | FLAG(B_UNICODE_CONTROL_CHAR) 480 | FLAG(B_UNICODE_FORMAT_CHAR) | FLAG(B_UNICODE_PRIVATE_USE_CHAR) 481 | FLAG(B_UNICODE_SURROGATE) | FLAG(B_UNICODE_GENERAL_OTHER_TYPES) 482 | FLAG(31)) 483 ) != 0; 484 } 485 486 487 // #pragma mark - 488 489 490 /** Transforms the specified unicode character to lowercase. 491 */ 492 493 uint32 494 BUnicodeChar::ToLower(uint32 c) 495 { 496 BUnicodeChar(); 497 498 uint32 props = getProperties(c); 499 500 if (!propertyIsException(props)) { 501 if (FLAG(getCategory(props)) & (UF_UPPERCASE | UF_TITLECASE)) 502 return c + getSignedValue(props); 503 } else { 504 uint32 *exceptions = getExceptions(props); 505 uint32 firstExceptionValue = *exceptions; 506 507 if (haveExceptionValue(firstExceptionValue, EXC_LOWERCASE)) { 508 int16 index = EXC_LOWERCASE; 509 addExceptionOffset(firstExceptionValue, index, &++exceptions); 510 return *exceptions; 511 } 512 } 513 // no mapping found, just return the character unchanged 514 return c; 515 } 516 517 518 /** Transforms the specified unicode character to uppercase. 519 */ 520 521 uint32 522 BUnicodeChar::ToUpper(uint32 c) 523 { 524 BUnicodeChar(); 525 526 uint32 props = getProperties(c); 527 528 if (!propertyIsException(props)) { 529 if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) 530 return c - getSignedValue(props); 531 } else { 532 uint32 *exceptions = getExceptions(props); 533 uint32 firstExceptionValue = *exceptions; 534 535 if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) { 536 int16 index = EXC_UPPERCASE; 537 ++exceptions; 538 addExceptionOffset(firstExceptionValue, index, &exceptions); 539 return *exceptions; 540 } 541 } 542 // no mapping found, just return the character unchanged 543 return c; 544 } 545 546 547 /** Transforms the specified unicode character to title case. 548 */ 549 550 uint32 551 BUnicodeChar::ToTitle(uint32 c) 552 { 553 BUnicodeChar(); 554 555 uint32 props = getProperties(c); 556 557 if (!propertyIsException(props)) { 558 if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) { 559 // here, titlecase is the same as uppercase 560 return c - getSignedValue(props); 561 } 562 } else { 563 uint32 *exceptions = getExceptions(props); 564 uint32 firstExceptionValue = *exceptions; 565 566 if (haveExceptionValue(firstExceptionValue, EXC_TITLECASE)) { 567 int16 index = EXC_TITLECASE; 568 addExceptionOffset(firstExceptionValue, index, &++exceptions); 569 return (uint32)*exceptions; 570 } else if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) { 571 // here, titlecase is the same as uppercase 572 int16 index = EXC_UPPERCASE; 573 addExceptionOffset(firstExceptionValue, index, &++exceptions); 574 return *exceptions; 575 } 576 } 577 // no mapping found, just return the character unchanged 578 return c; 579 } 580 581 582 int32 583 BUnicodeChar::DigitValue(uint32 c) 584 { 585 BUnicodeChar(); 586 587 uint32 props = getProperties(c); 588 589 if (!propertyIsException(props)) { 590 if (getCategory(props) == B_UNICODE_DECIMAL_DIGIT_NUMBER) 591 return getSignedValue(props); 592 } else { 593 uint32 *exceptions = getExceptions(props); 594 uint32 firstExceptionValue = *exceptions; 595 596 if (haveExceptionValue(firstExceptionValue, EXC_DIGIT_VALUE)) { 597 int16 index = EXC_DIGIT_VALUE; 598 addExceptionOffset(firstExceptionValue, index, &++exceptions); 599 600 int32 value = (int32)(int16)*exceptions; 601 // the digit value is in the lower 16 bits 602 if (value != -1) 603 return value; 604 } 605 } 606 607 // If there is no value in the properties table, 608 // then check for some special characters 609 switch (c) { 610 case 0x3007: return 0; 611 case 0x4e00: return 1; 612 case 0x4e8c: return 2; 613 case 0x4e09: return 3; 614 case 0x56d8: return 4; 615 case 0x4e94: return 5; 616 case 0x516d: return 6; 617 case 0x4e03: return 7; 618 case 0x516b: return 8; 619 case 0x4e5d: return 9; 620 default: return -1; 621 } 622 } 623 624 625 void 626 BUnicodeChar::ToUTF8(uint32 c, char **out) 627 { 628 char *s = *out; 629 630 if (c < 0x80) 631 *(s++) = c; 632 else if (c < 0x800) { 633 *(s++) = 0xc0 | (c >> 6); 634 *(s++) = 0x80 | (c & 0x3f); 635 } else if (c < 0x10000) { 636 *(s++) = 0xe0 | (c >> 12); 637 *(s++) = 0x80 | ((c >> 6) & 0x3f); 638 *(s++) = 0x80 | (c & 0x3f); 639 } else if (c <= 0x10ffff) { 640 *(s++) = 0xf0 | (c >> 18); 641 *(s++) = 0x80 | ((c >> 12) & 0x3f); 642 *(s++) = 0x80 | ((c >> 6) & 0x3f); 643 *(s++) = 0x80 | (c & 0x3f); 644 } 645 *out = s; 646 } 647 648 649 uint32 650 BUnicodeChar::FromUTF8(const char **in) 651 { 652 uint8 *bytes = (uint8 *)*in; 653 if (bytes == NULL) 654 return 0; 655 656 int32 length; 657 uint8 mask = 0x1f; 658 659 switch (bytes[0] & 0xf0) { 660 case 0xc0: 661 case 0xd0: length = 2; break; 662 case 0xe0: length = 3; break; 663 case 0xf0: 664 mask = 0x0f; 665 length = 4; 666 break; 667 default: 668 // valid 1-byte character 669 // and invalid characters 670 (*in)++; 671 return bytes[0]; 672 } 673 uint32 c = bytes[0] & mask; 674 int32 i = 1; 675 for (;i < length && (bytes[i] & 0x80) > 0;i++) 676 c = (c << 6) | (bytes[i] & 0x3f); 677 678 if (i < length) { 679 // invalid character 680 (*in)++; 681 return (uint32)bytes[0]; 682 } 683 *in += length; 684 return c; 685 } 686 687 size_t 688 BUnicodeChar::UTF8StringLength(const char *str) 689 { 690 size_t len = 0; 691 while (*str) { 692 FromUTF8(&str); 693 len++; 694 } 695 return len; 696 } 697 698 size_t 699 BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength) 700 { 701 size_t len = 0; 702 while (len < maxLength && *str) { 703 FromUTF8(&str); 704 len++; 705 } 706 return len; 707 } 708 709