1 /* 2 ** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. 3 ** Distributed under the terms of the OpenBeOS License. 4 */ 5 6 /* Reads the information out of the data files created by (an edited version of) 7 * IBM's ICU genprops utility. The BUnicodeChar class is mostly the counterpart 8 * to ICU's uchar module, but is not as huge or broad as that one. 9 * 10 * Note, it probably won't be able to handle the output of the orginal genprops 11 * tool and vice versa - only use the tool provided with this project to create 12 * the Unicode property file. 13 * However, the algorithmic idea behind the property file is still the same as 14 * found in ICU - nothing important has been changed, so more recent versions 15 * of genprops tool/data can probably be ported without too much effort. 16 * 17 * In case no property file can be found it will still provide basic services 18 * for the Latin-1 part of the character tables. 19 */ 20 21 22 #include <OS.h> 23 24 #include <UnicodeChar.h> 25 26 #include <stdlib.h> 27 #include <stdio.h> 28 #include <string.h> 29 30 31 #define FLAG(n) ((uint32)1 << (n)) 32 enum { 33 UF_UPPERCASE = FLAG(B_UNICODE_UPPERCASE_LETTER), 34 UF_LOWERCASE = FLAG(B_UNICODE_LOWERCASE_LETTER), 35 UF_TITLECASE = FLAG(B_UNICODE_TITLECASE_LETTER), 36 UF_MODIFIER_LETTER = FLAG(B_UNICODE_MODIFIER_LETTER), 37 UF_OTHER_LETTER = FLAG(B_UNICODE_OTHER_LETTER), 38 UF_DECIMAL_NUMBER = FLAG(B_UNICODE_DECIMAL_DIGIT_NUMBER), 39 UF_OTHER_NUMBER = FLAG(B_UNICODE_OTHER_NUMBER), 40 UF_LETTER_NUMBER = FLAG(B_UNICODE_LETTER_NUMBER) 41 }; 42 43 44 static uint32 gStaticProps32Table[] = { 45 /* 0x00 */ 0x48f, 0x48f, 0x48f, 0x48f, 46 /* 0x04 */ 0x48f, 0x48f, 0x48f, 0x48f, 47 /* 0x08 */ 0x48f, 0x20c, 0x1ce, 0x20c, 48 /* 0x0c */ 0x24d, 0x1ce, 0x48f, 0x48f, 49 /* 0x10 */ 0x48f, 0x48f, 0x48f, 0x48f, 50 /* 0x14 */ 0x48f, 0x48f, 0x48f, 0x48f, 51 /* 0x18 */ 0x48f, 0x48f, 0x48f, 0x48f, 52 /* 0x1c */ 0x1ce, 0x1ce, 0x1ce, 0x20c, 53 /* 0x20 */ 0x24c, 0x297, 0x297, 0x117, 54 /* 0x24 */ 0x119, 0x117, 0x297, 0x297, 55 /* 0x28 */ 0x100a94, 0xfff00a95, 0x297, 0x118, 56 /* 0x2c */ 0x197, 0x113, 0x197, 0xd7, 57 /* 0x30 */ 0x89, 0x100089, 0x200089, 0x300089, 58 /* 0x34 */ 0x400089, 0x500089, 0x600089, 0x700089, 59 /* 0x38 */ 0x800089, 0x900089, 0x197, 0x297, 60 /* 0x3c */ 0x200a98, 0x298, 0xffe00a98, 0x297, 61 /* 0x40 */ 0x297, 0x2000001, 0x2000001, 0x2000001, 62 /* 0x44 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 63 /* 0x48 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 64 /* 0x4c */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 65 /* 0x50 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 66 /* 0x54 */ 0x2000001, 0x2000001, 0x2000001, 0x2000001, 67 /* 0x58 */ 0x2000001, 0x2000001, 0x2000001, 0x200a94, 68 /* 0x5c */ 0x297, 0xffe00a95, 0x29a, 0x296, 69 /* 0x60 */ 0x29a, 0x2000002, 0x2000002, 0x2000002, 70 /* 0x64 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 71 /* 0x68 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 72 /* 0x6c */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 73 /* 0x70 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 74 /* 0x74 */ 0x2000002, 0x2000002, 0x2000002, 0x2000002, 75 /* 0x78 */ 0x2000002, 0x2000002, 0x2000002, 0x200a94, 76 /* 0x7c */ 0x298, 0xffe00a95, 0x298, 0x48f, 77 /* 0x80 */ 0x48f, 0x48f, 0x48f, 0x48f, 78 /* 0x84 */ 0x48f, 0x1ce, 0x48f, 0x48f, 79 /* 0x88 */ 0x48f, 0x48f, 0x48f, 0x48f, 80 /* 0x8c */ 0x48f, 0x48f, 0x48f, 0x48f, 81 /* 0x90 */ 0x48f, 0x48f, 0x48f, 0x48f, 82 /* 0x94 */ 0x48f, 0x48f, 0x48f, 0x48f, 83 /* 0x98 */ 0x48f, 0x48f, 0x48f, 0x48f, 84 /* 0x9c */ 0x48f, 0x48f, 0x48f, 0x48f 85 }; 86 87 enum { 88 INDEX_STAGE_2_BITS, 89 INDEX_STAGE_3_BITS, 90 INDEX_EXCEPTIONS, 91 INDEX_STAGE_3_INDEX, 92 INDEX_PROPS, 93 INDEX_UCHARS 94 }; 95 96 /* constants and macros for access to the data */ 97 enum { 98 EXC_UPPERCASE, 99 EXC_LOWERCASE, 100 EXC_TITLECASE, 101 EXC_DIGIT_VALUE, 102 EXC_NUMERIC_VALUE, 103 EXC_DENOMINATOR_VALUE, 104 EXC_MIRROR_MAPPING, 105 EXC_SPECIAL_CASING, 106 EXC_CASE_FOLDING 107 }; 108 109 enum { 110 EXCEPTION_SHIFT = 5, 111 BIDI_SHIFT, 112 MIRROR_SHIFT = BIDI_SHIFT + 5, 113 VALUE_SHIFT = 20, 114 115 VALUE_BITS = 32 - VALUE_SHIFT 116 }; 117 118 /* number of bits in an 8-bit integer value */ 119 #define EXC_GROUP 8 120 static uint8 gFlagsOffset[256] = { 121 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 122 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 123 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 124 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 125 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 126 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 127 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 128 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 129 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 130 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 131 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 132 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 133 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 134 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 135 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 136 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 137 }; 138 139 #ifdef UCHAR_VARIABLE_TRIE_BITS 140 // access values calculated from indices 141 static uint16_t stage23Bits, stage2Mask, stage3Mask; 142 # define sStage3Bits indexes[INDEX_STAGE_3_BITS] 143 #else 144 // Use hardcoded bit distribution for the trie table access 145 # define sStage23Bits 10 146 # define sStage2Mask 0x3f 147 # define sStage3Mask 0xf 148 # define sStage3Bits 4 149 #endif 150 151 152 /** We need to change the char category for ISO 8 controls, since the 153 * genprops utility we got from IBM's ICU apparently changes it for 154 * some characters. 155 */ 156 157 static inline bool 158 isISO8Control(uint32 c) 159 { 160 return ((uint32)c < 0x20 || (uint32)(c - 0x7f) <= 0x20); 161 } 162 163 164 static inline uint32 165 getProperties(uint32 c) 166 { 167 if (c > 0x10ffff) 168 return 0; 169 170 // TODO : Data from unicode 171 172 return c > 0x9f ? 0 : gStaticProps32Table[c]; 173 } 174 175 176 static inline uint8 177 getCategory(uint32 properties) 178 { 179 return properties & 0x1f; 180 } 181 182 183 static inline bool 184 propertyIsException(uint32 properties) 185 { 186 return properties & (1UL << EXCEPTION_SHIFT); 187 } 188 189 190 static inline uint32 191 getUnsignedValue(uint32 properties) 192 { 193 return properties >> VALUE_SHIFT; 194 } 195 196 197 static inline uint32 198 getSignedValue(uint32 properties) 199 { 200 return (int32)properties >> VALUE_SHIFT; 201 } 202 203 204 static inline uint32 * 205 getExceptions(uint32 properties) 206 { 207 // TODO : data from unicode 208 return 0; 209 } 210 211 212 static inline bool 213 haveExceptionValue(uint32 flags,int16 index) 214 { 215 return flags & (1UL << index); 216 } 217 218 219 static inline void 220 addExceptionOffset(uint32 &flags, int16 &index, uint32 **offset) 221 { 222 if (index >= EXC_GROUP) { 223 *offset += gFlagsOffset[flags & ((1 << EXC_GROUP) - 1)]; 224 flags >>= EXC_GROUP; 225 index -= EXC_GROUP; 226 } 227 *offset += gFlagsOffset[flags & ((1 << index) - 1)]; 228 } 229 230 231 // #pragma mark - 232 233 234 BUnicodeChar::BUnicodeChar() 235 { 236 } 237 238 239 bool 240 BUnicodeChar::IsAlpha(uint32 c) 241 { 242 BUnicodeChar(); 243 return (FLAG(getCategory(getProperties(c))) 244 & (UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER) 245 ) != 0; 246 } 247 248 249 /** Returns the type code of the specified unicode character */ 250 int8 251 BUnicodeChar::Type(uint32 c) 252 { 253 BUnicodeChar(); 254 return (int8)getCategory(getProperties(c)); 255 } 256 257 258 bool 259 BUnicodeChar::IsLower(uint32 c) 260 { 261 BUnicodeChar(); 262 return getCategory(getProperties(c)) == B_UNICODE_LOWERCASE_LETTER; 263 } 264 265 266 bool 267 BUnicodeChar::IsUpper(uint32 c) 268 { 269 BUnicodeChar(); 270 return getCategory(getProperties(c)) == B_UNICODE_UPPERCASE_LETTER; 271 } 272 273 274 bool 275 BUnicodeChar::IsTitle(uint32 c) 276 { 277 BUnicodeChar(); 278 return getCategory(getProperties(c)) == B_UNICODE_TITLECASE_LETTER; 279 } 280 281 282 bool 283 BUnicodeChar::IsDigit(uint32 c) 284 { 285 BUnicodeChar(); 286 return (FLAG(getCategory(getProperties(c))) 287 & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER) 288 ) != 0; 289 } 290 291 292 bool 293 BUnicodeChar::IsAlNum(uint32 c) 294 { 295 BUnicodeChar(); 296 return (FLAG(getCategory(getProperties(c))) 297 & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER | UF_UPPERCASE 298 | UF_LOWERCASE | UF_TITLECASE | UF_MODIFIER_LETTER | UF_OTHER_LETTER) 299 ) != 0; 300 } 301 302 303 bool 304 BUnicodeChar::IsDefined(uint32 c) 305 { 306 BUnicodeChar(); 307 return getProperties(c) != 0; 308 } 309 310 311 /** Returns true if the specified unicode character is a base 312 * form character that can be used with a diacritic. 313 * This doesn't mean that the character has to be distinct, 314 * though. 315 */ 316 317 bool 318 BUnicodeChar::IsBase(uint32 c) 319 { 320 BUnicodeChar(); 321 return (FLAG(getCategory(getProperties(c))) 322 & (UF_DECIMAL_NUMBER | UF_OTHER_NUMBER | UF_LETTER_NUMBER 323 | UF_UPPERCASE | UF_LOWERCASE | UF_TITLECASE 324 | UF_MODIFIER_LETTER | UF_OTHER_LETTER | FLAG(B_UNICODE_NON_SPACING_MARK) 325 | FLAG(B_UNICODE_ENCLOSING_MARK) | FLAG(B_UNICODE_COMBINING_SPACING_MARK)) 326 ) != 0; 327 } 328 329 330 /** Returns true if the specified unicode character is a 331 * control character. 332 */ 333 334 bool 335 BUnicodeChar::IsControl(uint32 c) 336 { 337 BUnicodeChar(); 338 return isISO8Control(c) 339 || (FLAG(getCategory(getProperties(c))) 340 & (FLAG(B_UNICODE_CONTROL_CHAR) | FLAG(B_UNICODE_FORMAT_CHAR) 341 | FLAG(B_UNICODE_LINE_SEPARATOR) | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) 342 ) != 0; 343 } 344 345 346 /** Returns true if the specified unicode character is a 347 * punctuation character. 348 */ 349 350 bool 351 BUnicodeChar::IsPunctuation(uint32 c) 352 { 353 BUnicodeChar(); 354 return (FLAG(getCategory(getProperties(c))) 355 & (FLAG(B_UNICODE_DASH_PUNCTUATION) 356 | FLAG(B_UNICODE_START_PUNCTUATION) 357 | FLAG(B_UNICODE_END_PUNCTUATION) 358 | FLAG(B_UNICODE_CONNECTOR_PUNCTUATION) 359 | FLAG(B_UNICODE_OTHER_PUNCTUATION)) 360 ) != 0; 361 } 362 363 364 /** Returns true if the specified unicode character is some 365 * kind of a space character. 366 */ 367 368 bool 369 BUnicodeChar::IsSpace(uint32 c) 370 { 371 BUnicodeChar(); 372 return (FLAG(getCategory(getProperties(c))) 373 & (FLAG(B_UNICODE_SPACE_SEPARATOR) 374 | FLAG(B_UNICODE_LINE_SEPARATOR) 375 | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) 376 ) != 0; 377 } 378 379 380 /** Returns true if the specified unicode character is a white 381 * space character. 382 * This is essentially the same as IsSpace(), but excludes all 383 * non-breakable spaces. 384 */ 385 386 bool 387 BUnicodeChar::IsWhitespace(uint32 c) 388 { 389 BUnicodeChar(); 390 return (FLAG(getCategory(getProperties(c))) 391 & (FLAG(B_UNICODE_SPACE_SEPARATOR) 392 | FLAG(B_UNICODE_LINE_SEPARATOR) 393 | FLAG(B_UNICODE_PARAGRAPH_SEPARATOR)) 394 ) != 0 && c != 0xa0 && c != 0x202f && c != 0xfeff; // exclude non-breakable spaces 395 } 396 397 398 /** Returns true if the specified unicode character is printable. 399 */ 400 401 bool 402 BUnicodeChar::IsPrintable(uint32 c) 403 { 404 BUnicodeChar(); 405 return !isISO8Control(c) 406 && (FLAG(getCategory(getProperties(c))) 407 & ~(FLAG(B_UNICODE_UNASSIGNED) | FLAG(B_UNICODE_CONTROL_CHAR) 408 | FLAG(B_UNICODE_FORMAT_CHAR) | FLAG(B_UNICODE_PRIVATE_USE_CHAR) 409 | FLAG(B_UNICODE_SURROGATE) | FLAG(B_UNICODE_GENERAL_OTHER_TYPES) 410 | FLAG(31)) 411 ) != 0; 412 } 413 414 415 // #pragma mark - 416 417 418 /** Transforms the specified unicode character to lowercase. 419 */ 420 421 uint32 422 BUnicodeChar::ToLower(uint32 c) 423 { 424 BUnicodeChar(); 425 426 uint32 props = getProperties(c); 427 428 if (!propertyIsException(props)) { 429 if (FLAG(getCategory(props)) & (UF_UPPERCASE | UF_TITLECASE)) 430 return c + getSignedValue(props); 431 } else { 432 uint32 *exceptions = getExceptions(props); 433 uint32 firstExceptionValue = *exceptions; 434 435 if (haveExceptionValue(firstExceptionValue, EXC_LOWERCASE)) { 436 int16 index = EXC_LOWERCASE; 437 addExceptionOffset(firstExceptionValue, index, &++exceptions); 438 return *exceptions; 439 } 440 } 441 // no mapping found, just return the character unchanged 442 return c; 443 } 444 445 446 /** Transforms the specified unicode character to uppercase. 447 */ 448 449 uint32 450 BUnicodeChar::ToUpper(uint32 c) 451 { 452 BUnicodeChar(); 453 454 uint32 props = getProperties(c); 455 456 if (!propertyIsException(props)) { 457 if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) 458 return c - getSignedValue(props); 459 } else { 460 uint32 *exceptions = getExceptions(props); 461 uint32 firstExceptionValue = *exceptions; 462 463 if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) { 464 int16 index = EXC_UPPERCASE; 465 ++exceptions; 466 addExceptionOffset(firstExceptionValue, index, &exceptions); 467 return *exceptions; 468 } 469 } 470 // no mapping found, just return the character unchanged 471 return c; 472 } 473 474 475 /** Transforms the specified unicode character to title case. 476 */ 477 478 uint32 479 BUnicodeChar::ToTitle(uint32 c) 480 { 481 BUnicodeChar(); 482 483 uint32 props = getProperties(c); 484 485 if (!propertyIsException(props)) { 486 if (getCategory(props) == B_UNICODE_LOWERCASE_LETTER) { 487 // here, titlecase is the same as uppercase 488 return c - getSignedValue(props); 489 } 490 } else { 491 uint32 *exceptions = getExceptions(props); 492 uint32 firstExceptionValue = *exceptions; 493 494 if (haveExceptionValue(firstExceptionValue, EXC_TITLECASE)) { 495 int16 index = EXC_TITLECASE; 496 addExceptionOffset(firstExceptionValue, index, &++exceptions); 497 return (uint32)*exceptions; 498 } else if (haveExceptionValue(firstExceptionValue, EXC_UPPERCASE)) { 499 // here, titlecase is the same as uppercase 500 int16 index = EXC_UPPERCASE; 501 addExceptionOffset(firstExceptionValue, index, &++exceptions); 502 return *exceptions; 503 } 504 } 505 // no mapping found, just return the character unchanged 506 return c; 507 } 508 509 510 int32 511 BUnicodeChar::DigitValue(uint32 c) 512 { 513 BUnicodeChar(); 514 515 uint32 props = getProperties(c); 516 517 if (!propertyIsException(props)) { 518 if (getCategory(props) == B_UNICODE_DECIMAL_DIGIT_NUMBER) 519 return getSignedValue(props); 520 } else { 521 uint32 *exceptions = getExceptions(props); 522 uint32 firstExceptionValue = *exceptions; 523 524 if (haveExceptionValue(firstExceptionValue, EXC_DIGIT_VALUE)) { 525 int16 index = EXC_DIGIT_VALUE; 526 addExceptionOffset(firstExceptionValue, index, &++exceptions); 527 528 int32 value = (int32)(int16)*exceptions; 529 // the digit value is in the lower 16 bits 530 if (value != -1) 531 return value; 532 } 533 } 534 535 // If there is no value in the properties table, 536 // then check for some special characters 537 switch (c) { 538 case 0x3007: return 0; 539 case 0x4e00: return 1; 540 case 0x4e8c: return 2; 541 case 0x4e09: return 3; 542 case 0x56d8: return 4; 543 case 0x4e94: return 5; 544 case 0x516d: return 6; 545 case 0x4e03: return 7; 546 case 0x516b: return 8; 547 case 0x4e5d: return 9; 548 default: return -1; 549 } 550 } 551 552 553 void 554 BUnicodeChar::ToUTF8(uint32 c, char **out) 555 { 556 char *s = *out; 557 558 if (c < 0x80) 559 *(s++) = c; 560 else if (c < 0x800) { 561 *(s++) = 0xc0 | (c >> 6); 562 *(s++) = 0x80 | (c & 0x3f); 563 } else if (c < 0x10000) { 564 *(s++) = 0xe0 | (c >> 12); 565 *(s++) = 0x80 | ((c >> 6) & 0x3f); 566 *(s++) = 0x80 | (c & 0x3f); 567 } else if (c <= 0x10ffff) { 568 *(s++) = 0xf0 | (c >> 18); 569 *(s++) = 0x80 | ((c >> 12) & 0x3f); 570 *(s++) = 0x80 | ((c >> 6) & 0x3f); 571 *(s++) = 0x80 | (c & 0x3f); 572 } 573 *out = s; 574 } 575 576 577 uint32 578 BUnicodeChar::FromUTF8(const char **in) 579 { 580 uint8 *bytes = (uint8 *)*in; 581 if (bytes == NULL) 582 return 0; 583 584 int32 length; 585 uint8 mask = 0x1f; 586 587 switch (bytes[0] & 0xf0) { 588 case 0xc0: 589 case 0xd0: length = 2; break; 590 case 0xe0: length = 3; break; 591 case 0xf0: 592 mask = 0x0f; 593 length = 4; 594 break; 595 default: 596 // valid 1-byte character 597 // and invalid characters 598 (*in)++; 599 return bytes[0]; 600 } 601 uint32 c = bytes[0] & mask; 602 int32 i = 1; 603 for (;i < length && (bytes[i] & 0x80) > 0;i++) 604 c = (c << 6) | (bytes[i] & 0x3f); 605 606 if (i < length) { 607 // invalid character 608 (*in)++; 609 return (uint32)bytes[0]; 610 } 611 *in += length; 612 return c; 613 } 614 615 size_t 616 BUnicodeChar::UTF8StringLength(const char *str) 617 { 618 size_t len = 0; 619 while (*str) { 620 FromUTF8(&str); 621 len++; 622 } 623 return len; 624 } 625 626 size_t 627 BUnicodeChar::UTF8StringLength(const char *str, size_t maxLength) 628 { 629 size_t len = 0; 630 while (len < maxLength && *str) { 631 FromUTF8(&str); 632 len++; 633 } 634 return len; 635 } 636 637