1 /* 2 ** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. 3 ** Distributed under the terms of the OpenBeOS License. 4 */ 5 6 7 #include <Collator.h> 8 #include <UnicodeChar.h> 9 #include <String.h> 10 #include <Message.h> 11 12 #include <typeinfo> 13 #include <ctype.h> 14 15 16 // conversion array for character ranges 192 - 223 & 224 - 255 17 static const uint8 kNoDiacrits[] = { 18 'a','a','a','a','a','a','a', 19 'c', 20 'e','e','e','e', 21 'i','i','i','i', 22 240, // eth 23 'n', 24 'o','o','o','o','o', 25 247, // 26 'o', 27 'u','u','u','u', 28 'y', 29 254, // thorn 30 'y' 31 }; 32 33 34 static inline uint32 35 getPrimaryChar(uint32 c) 36 { 37 if (c < 0x80) 38 return tolower(c); 39 40 // this automatically returns lowercase letters 41 if (c >= 192 && c < 223) 42 return kNoDiacrits[c - 192]; 43 if (c == 223) // ß 44 return 's'; 45 if (c >= 224 && c < 256) 46 return kNoDiacrits[c - 224]; 47 48 return BUnicodeChar::ToLower(c); 49 } 50 51 52 BCollatorAddOn::input_context::input_context(bool ignorePunctuation) 53 : 54 ignore_punctuation(ignorePunctuation), 55 next_char(0), 56 reserved1(0), 57 reserved2(0) 58 { 59 } 60 61 62 // #pragma mark - 63 64 65 BCollator::BCollator() 66 : 67 fCollatorImage(B_ERROR), 68 fStrength(B_COLLATE_PRIMARY), 69 fIgnorePunctuation(true) 70 { 71 // ToDo: the collator construction will have to change; the default 72 // collator should be constructed by the Locale/LocaleRoster, so we 73 // only need a constructor where you specify all details 74 75 fCollator = new BCollatorAddOn(); 76 } 77 78 79 BCollator::BCollator(BCollatorAddOn *collator, int8 strength, 80 bool ignorePunctuation) 81 : 82 fCollator(collator), 83 fCollatorImage(B_ERROR), 84 fStrength(strength), 85 fIgnorePunctuation(ignorePunctuation) 86 { 87 if (collator == NULL) 88 fCollator = new BCollatorAddOn(); 89 } 90 91 92 BCollator::BCollator(BMessage *archive) 93 : BArchivable(archive), 94 fCollator(NULL), 95 fCollatorImage(B_ERROR), 96 fIgnorePunctuation(true) 97 { 98 #if HAIKU_TARGET_PLATFORM_HAIKU 99 int32 data; 100 if (archive->FindInt32("loc:strength", &data) == B_OK) 101 fStrength = (uint8)data; 102 else 103 fStrength = B_COLLATE_PRIMARY; 104 105 if (archive->FindBool("loc:punctuation", &fIgnorePunctuation) != B_OK) 106 fIgnorePunctuation = true; 107 108 BMessage collatorArchive; 109 if (archive->FindMessage("loc:collator", &collatorArchive) == B_OK) { 110 BArchivable *unarchived = 111 instantiate_object(&collatorArchive, &fCollatorImage); 112 113 // do we really have a BCollatorAddOn here? 114 fCollator = dynamic_cast<BCollatorAddOn *>(unarchived); 115 if (fCollator == NULL) 116 delete unarchived; 117 } 118 119 if (fCollator == NULL) { 120 fCollator = new BCollatorAddOn(); 121 fCollatorImage = B_ERROR; 122 } 123 #endif 124 } 125 126 127 BCollator::~BCollator() 128 { 129 delete fCollator; 130 131 if (fCollatorImage >= B_OK) 132 unload_add_on(fCollatorImage); 133 } 134 135 136 void 137 BCollator::SetDefaultStrength(int8 strength) 138 { 139 fStrength = strength; 140 } 141 142 143 int8 144 BCollator::DefaultStrength() const 145 { 146 return fStrength; 147 } 148 149 150 void 151 BCollator::SetIgnorePunctuation(bool ignore) 152 { 153 fIgnorePunctuation = ignore; 154 } 155 156 157 bool 158 BCollator::IgnorePunctuation() const 159 { 160 return fIgnorePunctuation; 161 } 162 163 164 status_t 165 BCollator::GetSortKey(const char *string, BString *key, int8 strength) 166 { 167 if (strength == B_COLLATE_DEFAULT) 168 strength = fStrength; 169 170 return fCollator->GetSortKey(string, key, strength, fIgnorePunctuation); 171 } 172 173 174 int 175 BCollator::Compare(const char *a, const char *b, int32 length, int8 strength) 176 { 177 if (length == -1) // match the whole string 178 length = 0x7fffffff; 179 180 return fCollator->Compare(a, b, length, 181 strength == B_COLLATE_DEFAULT ? fStrength : strength, 182 fIgnorePunctuation); 183 } 184 185 186 status_t 187 BCollator::Archive(BMessage *archive, bool deep) const 188 { 189 status_t status = BArchivable::Archive(archive, deep); 190 if (status < B_OK) 191 return status; 192 193 if (status == B_OK) 194 status = archive->AddInt32("loc:strength", fStrength); 195 if (status == B_OK) 196 status = archive->AddBool("loc:punctuation", fIgnorePunctuation); 197 198 BMessage collatorArchive; 199 if (status == B_OK && deep 200 && typeid(*fCollator) != typeid(BCollatorAddOn) 201 // only archive subclasses from BCollatorAddOn 202 && (status = fCollator->Archive(&collatorArchive, true)) == B_OK) 203 status = archive->AddMessage("loc:collator", &collatorArchive); 204 205 return status; 206 } 207 208 209 BArchivable * 210 BCollator::Instantiate(BMessage *archive) 211 { 212 if (validate_instantiation(archive, "BCollator")) 213 return new BCollator(archive); 214 215 return NULL; 216 } 217 218 219 // #pragma mark - 220 221 222 BCollatorAddOn::BCollatorAddOn() 223 { 224 } 225 226 227 BCollatorAddOn::BCollatorAddOn(BMessage *archive) 228 : BArchivable(archive) 229 { 230 } 231 232 233 BCollatorAddOn::~BCollatorAddOn() 234 { 235 } 236 237 238 /** This returns the next Unicode character from the UTF-8 encoded 239 * input string, and bumps it to the next character. 240 * It will ignore punctuation if specified by the context, and 241 * might substitute characters if needed. 242 */ 243 244 uint32 245 BCollatorAddOn::GetNextChar(const char **string, input_context &context) 246 { 247 uint32 c = context.next_char; 248 if (c != 0) { 249 context.next_char = 0; 250 return c; 251 } 252 253 do { 254 c = BUnicodeChar::FromUTF8(string); 255 } while (context.ignore_punctuation 256 && (BUnicodeChar::IsPunctuation(c) || BUnicodeChar::IsSpace(c))); 257 258 if (c == 223) { 259 context.next_char = 's'; 260 return 's'; 261 } 262 263 return c; 264 } 265 266 267 /** Fills the specified buffer with the primary sort key. The buffer 268 * has to be long enough to hold the key. 269 * It returns the position in the buffer immediately after the key; 270 * it does not add a terminating null byte! 271 */ 272 273 char * 274 BCollatorAddOn::PutPrimaryKey(const char *string, char *buffer, int32 length, 275 bool ignorePunctuation) 276 { 277 input_context context(ignorePunctuation); 278 279 uint32 c; 280 for (int32 i = 0; (c = GetNextChar(&string, context)) != 0 && i < length; 281 i++) { 282 if (c < 0x80) 283 *buffer++ = tolower(c); 284 else 285 BUnicodeChar::ToUTF8(getPrimaryChar(c), &buffer); 286 } 287 288 return buffer; 289 } 290 291 292 size_t 293 BCollatorAddOn::PrimaryKeyLength(size_t length) 294 { 295 return length * 2; 296 // the primary key needs to make space for doubled characters (like 'ß') 297 } 298 299 300 status_t 301 BCollatorAddOn::GetSortKey(const char *string, BString *key, int8 strength, 302 bool ignorePunctuation) 303 { 304 if (strength >= B_COLLATE_QUATERNARY) { 305 // the difference between tertiary and quaternary collation strength 306 // are usually a different handling of punctuation characters 307 ignorePunctuation = false; 308 } 309 310 size_t length = strlen(string); 311 312 switch (strength) { 313 case B_COLLATE_PRIMARY: 314 { 315 char *begin = key->LockBuffer(PrimaryKeyLength(length)); 316 if (begin == NULL) 317 return B_NO_MEMORY; 318 319 char *end = PutPrimaryKey(string, begin, length, ignorePunctuation); 320 *end = '\0'; 321 322 key->UnlockBuffer(end - begin); 323 break; 324 } 325 326 case B_COLLATE_SECONDARY: 327 { 328 char *begin 329 = key->LockBuffer(PrimaryKeyLength(length) + length + 1); 330 // the primary key + the secondary key + separator char 331 if (begin == NULL) 332 return B_NO_MEMORY; 333 334 char *buffer = PutPrimaryKey(string, begin, length, ignorePunctuation); 335 *buffer++ = '\01'; 336 // separator 337 338 input_context context(ignorePunctuation); 339 uint32 c; 340 for (uint32 i = 0; 341 (c = GetNextChar(&string, context)) && i < length; i++) { 342 if (c < 0x80) 343 *buffer++ = tolower(c); 344 else 345 BUnicodeChar::ToUTF8(BUnicodeChar::ToLower(c), &buffer); 346 } 347 *buffer = '\0'; 348 349 key->UnlockBuffer(buffer - begin); 350 break; 351 } 352 353 case B_COLLATE_TERTIARY: 354 case B_COLLATE_QUATERNARY: 355 { 356 char *begin 357 = key->LockBuffer(PrimaryKeyLength(length) + length + 1); 358 // the primary key + the tertiary key + separator char 359 if (begin == NULL) 360 return B_NO_MEMORY; 361 362 char *buffer = PutPrimaryKey(string, begin, length, ignorePunctuation); 363 *buffer++ = '\01'; 364 // separator 365 366 input_context context(ignorePunctuation); 367 uint32 c; 368 for (uint32 i = 0; 369 (c = GetNextChar(&string, context)) && i < length; i++) { 370 BUnicodeChar::ToUTF8(c, &buffer); 371 } 372 *buffer = '\0'; 373 374 key->UnlockBuffer(buffer + length - begin); 375 break; 376 } 377 378 case B_COLLATE_IDENTICAL: 379 default: 380 key->SetTo(string, length); 381 // is there any way to check if BString::SetTo() actually succeeded? 382 break; 383 } 384 return B_OK; 385 } 386 387 388 int 389 BCollatorAddOn::Compare(const char *a, const char *b, int32 length, 390 int8 strength, bool ignorePunctuation) 391 { 392 if (strength >= B_COLLATE_QUATERNARY) { 393 // the difference between tertiary and quaternary collation strength 394 // are usually a different handling of punctuation characters 395 ignorePunctuation = false; 396 } 397 398 input_context contextA(ignorePunctuation); 399 input_context contextB(ignorePunctuation); 400 401 switch (strength) { 402 case B_COLLATE_PRIMARY: 403 { 404 for (int32 i = 0; i < length; i++) { 405 uint32 charA = GetNextChar(&a, contextA); 406 uint32 charB = GetNextChar(&b, contextB); 407 if (charA == 0) 408 return charB == 0 ? 0 : -(int32)charB; 409 else if (charB == 0) 410 return (int32)charA; 411 412 charA = getPrimaryChar(charA); 413 charB = getPrimaryChar(charB); 414 415 if (charA != charB) 416 return (int32)charA - (int32)charB; 417 } 418 return 0; 419 } 420 421 case B_COLLATE_SECONDARY: 422 { 423 // diacriticals can only change the order between equal strings 424 int32 compare 425 = Compare(a, b, length, B_COLLATE_PRIMARY, ignorePunctuation); 426 if (compare != 0) 427 return compare; 428 429 for (int32 i = 0; i < length; i++) { 430 uint32 charA = BUnicodeChar::ToLower(GetNextChar(&a, contextA)); 431 uint32 charB = BUnicodeChar::ToLower(GetNextChar(&b, contextB)); 432 433 // the two strings does have the same size when we get here 434 if (charA == 0) 435 return 0; 436 437 if (charA != charB) 438 return (int32)charA - (int32)charB; 439 } 440 return 0; 441 } 442 443 case B_COLLATE_TERTIARY: 444 case B_COLLATE_QUATERNARY: 445 { 446 // diacriticals can only change the order between equal strings 447 int32 compare 448 = Compare(a, b, length, B_COLLATE_PRIMARY, ignorePunctuation); 449 if (compare != 0) 450 return compare; 451 452 for (int32 i = 0; i < length; i++) { 453 uint32 charA = GetNextChar(&a, contextA); 454 uint32 charB = GetNextChar(&b, contextB); 455 456 // the two strings does have the same size when we get here 457 if (charA == 0) 458 return 0; 459 460 if (charA != charB) 461 return (int32)charA - (int32)charB; 462 } 463 return 0; 464 } 465 466 case B_COLLATE_IDENTICAL: 467 default: 468 return strncmp(a, b, length); 469 } 470 } 471 472 473 status_t 474 BCollatorAddOn::Archive(BMessage *archive, bool deep) const 475 { 476 return BArchivable::Archive(archive, deep); 477 } 478 479 480 BArchivable * 481 BCollatorAddOn::Instantiate(BMessage *archive) 482 { 483 if (validate_instantiation(archive, "BCollatorAddOn")) 484 return new BCollatorAddOn(archive); 485 486 return NULL; 487 } 488 489