1 /* 2 ** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. 3 ** Distributed under the terms of the OpenBeOS License. 4 */ 5 6 7 #include <Collator.h> 8 #include <UnicodeChar.h> 9 #include <String.h> 10 #include <Message.h> 11 12 #include <typeinfo> 13 #include <ctype.h> 14 15 16 // conversion array for character ranges 192 - 223 & 224 - 255 17 static const uint8 kNoDiacrits[] = { 18 'a','a','a','a','a','a','a', 19 'c', 20 'e','e','e','e', 21 'i','i','i','i', 22 240, // eth 23 'n', 24 'o','o','o','o','o', 25 247, // 26 'o', 27 'u','u','u','u', 28 'y', 29 254, // thorn 30 'y' 31 }; 32 33 34 static inline uint32 35 getPrimaryChar(uint32 c) 36 { 37 if (c < 0x80) 38 return tolower(c); 39 40 // this automatically returns lowercase letters 41 if (c >= 192 && c < 223) 42 return kNoDiacrits[c - 192]; 43 if (c == 223) // ß 44 return 's'; 45 if (c >= 224 && c < 256) 46 return kNoDiacrits[c - 224]; 47 48 return BUnicodeChar::ToLower(c); 49 } 50 51 52 BCollatorAddOn::input_context::input_context(bool ignorePunctuation) 53 : 54 ignore_punctuation(ignorePunctuation), 55 next_char(0), 56 reserved1(0), 57 reserved2(0) 58 { 59 } 60 61 62 // #pragma mark - 63 64 65 BCollator::BCollator() 66 : 67 fCollatorImage(B_ERROR), 68 fStrength(B_COLLATE_PRIMARY), 69 fIgnorePunctuation(true) 70 { 71 // ToDo: the collator construction will have to change; the default 72 // collator should be constructed by the Locale/LocaleRoster, so we 73 // only need a constructor where you specify all details 74 75 fCollator = new BCollatorAddOn(); 76 } 77 78 79 BCollator::BCollator(BCollatorAddOn *collator, int8 strength, 80 bool ignorePunctuation) 81 : 82 fCollator(collator), 83 fCollatorImage(B_ERROR), 84 fStrength(strength), 85 fIgnorePunctuation(ignorePunctuation) 86 { 87 if (collator == NULL) 88 fCollator = new BCollatorAddOn(); 89 } 90 91 92 BCollator::BCollator(BMessage *archive) 93 : BArchivable(archive), 94 fCollator(NULL), 95 fCollatorImage(B_ERROR) 96 { 97 #if HAIKU_TARGET_PLATFORM_HAIKU 98 int32 data; 99 if (archive->FindInt32("loc:strength", &data) == B_OK) 100 fStrength = (uint8)data; 101 else 102 fStrength = B_COLLATE_PRIMARY; 103 104 if (archive->FindBool("loc:punctuation", &fIgnorePunctuation) != B_OK) 105 fIgnorePunctuation = true; 106 107 BMessage collatorArchive; 108 if (archive->FindMessage("loc:collator", &collatorArchive) == B_OK) { 109 BArchivable *unarchived = 110 instantiate_object(&collatorArchive, &fCollatorImage); 111 112 // do we really have a BCollatorAddOn here? 113 fCollator = dynamic_cast<BCollatorAddOn *>(unarchived); 114 if (fCollator == NULL) 115 delete unarchived; 116 } 117 118 if (fCollator == NULL) { 119 fCollator = new BCollatorAddOn(); 120 fCollatorImage = B_ERROR; 121 } 122 #endif 123 } 124 125 126 BCollator::~BCollator() 127 { 128 delete fCollator; 129 130 if (fCollatorImage >= B_OK) 131 unload_add_on(fCollatorImage); 132 } 133 134 135 void 136 BCollator::SetDefaultStrength(int8 strength) 137 { 138 fStrength = strength; 139 } 140 141 142 int8 143 BCollator::DefaultStrength() const 144 { 145 return fStrength; 146 } 147 148 149 void 150 BCollator::SetIgnorePunctuation(bool ignore) 151 { 152 fIgnorePunctuation = ignore; 153 } 154 155 156 bool 157 BCollator::IgnorePunctuation() const 158 { 159 return fIgnorePunctuation; 160 } 161 162 163 status_t 164 BCollator::GetSortKey(const char *string, BString *key, int8 strength) 165 { 166 if (strength == B_COLLATE_DEFAULT) 167 strength = fStrength; 168 169 return fCollator->GetSortKey(string, key, strength, fIgnorePunctuation); 170 } 171 172 173 int 174 BCollator::Compare(const char *a, const char *b, int32 length, int8 strength) 175 { 176 if (length == -1) // match the whole string 177 length = 0x7fffffff; 178 179 return fCollator->Compare(a, b, length, 180 strength == B_COLLATE_DEFAULT ? fStrength : strength, 181 fIgnorePunctuation); 182 } 183 184 185 status_t 186 BCollator::Archive(BMessage *archive, bool deep) const 187 { 188 status_t status = BArchivable::Archive(archive, deep); 189 if (status < B_OK) 190 return status; 191 192 if (status == B_OK) 193 status = archive->AddInt32("loc:strength", fStrength); 194 if (status == B_OK) 195 status = archive->AddBool("loc:punctuation", fIgnorePunctuation); 196 197 BMessage collatorArchive; 198 if (status == B_OK && deep 199 && typeid(*fCollator) != typeid(BCollatorAddOn) 200 // only archive subclasses from BCollatorAddOn 201 && (status = fCollator->Archive(&collatorArchive, true)) == B_OK) 202 status = archive->AddMessage("loc:collator", &collatorArchive); 203 204 return status; 205 } 206 207 208 BArchivable * 209 BCollator::Instantiate(BMessage *archive) 210 { 211 if (validate_instantiation(archive, "BCollator")) 212 return new BCollator(archive); 213 214 return NULL; 215 } 216 217 218 // #pragma mark - 219 220 221 BCollatorAddOn::BCollatorAddOn() 222 { 223 } 224 225 226 BCollatorAddOn::BCollatorAddOn(BMessage *archive) 227 : BArchivable(archive) 228 { 229 } 230 231 232 BCollatorAddOn::~BCollatorAddOn() 233 { 234 } 235 236 237 /** This returns the next Unicode character from the UTF-8 encoded 238 * input string, and bumps it to the next character. 239 * It will ignore punctuation if specified by the context, and 240 * might substitute characters if needed. 241 */ 242 243 uint32 244 BCollatorAddOn::GetNextChar(const char **string, input_context &context) 245 { 246 uint32 c = context.next_char; 247 if (c != 0) { 248 context.next_char = 0; 249 return c; 250 } 251 252 do { 253 c = BUnicodeChar::FromUTF8(string); 254 } while (context.ignore_punctuation 255 && (BUnicodeChar::IsPunctuation(c) || BUnicodeChar::IsSpace(c))); 256 257 if (c == 223) { 258 context.next_char = 's'; 259 return 's'; 260 } 261 262 return c; 263 } 264 265 266 /** Fills the specified buffer with the primary sort key. The buffer 267 * has to be long enough to hold the key. 268 * It returns the position in the buffer immediately after the key; 269 * it does not add a terminating null byte! 270 */ 271 272 char * 273 BCollatorAddOn::PutPrimaryKey(const char *string, char *buffer, int32 length, 274 bool ignorePunctuation) 275 { 276 input_context context(ignorePunctuation); 277 278 uint32 c; 279 for (int32 i = 0; (c = GetNextChar(&string, context)) != 0 && i < length; 280 i++) { 281 if (c < 0x80) 282 *buffer++ = tolower(c); 283 else 284 BUnicodeChar::ToUTF8(getPrimaryChar(c), &buffer); 285 } 286 287 return buffer; 288 } 289 290 291 size_t 292 BCollatorAddOn::PrimaryKeyLength(size_t length) 293 { 294 return length * 2; 295 // the primary key needs to make space for doubled characters (like 'ß') 296 } 297 298 299 status_t 300 BCollatorAddOn::GetSortKey(const char *string, BString *key, int8 strength, 301 bool ignorePunctuation) 302 { 303 if (strength >= B_COLLATE_QUATERNARY) { 304 // the difference between tertiary and quaternary collation strength 305 // are usually a different handling of punctuation characters 306 ignorePunctuation = false; 307 } 308 309 size_t length = strlen(string); 310 311 switch (strength) { 312 case B_COLLATE_PRIMARY: 313 { 314 char *begin = key->LockBuffer(PrimaryKeyLength(length)); 315 if (begin == NULL) 316 return B_NO_MEMORY; 317 318 char *end = PutPrimaryKey(string, begin, length, ignorePunctuation); 319 *end = '\0'; 320 321 key->UnlockBuffer(end - begin); 322 break; 323 } 324 325 case B_COLLATE_SECONDARY: 326 { 327 char *begin 328 = key->LockBuffer(PrimaryKeyLength(length) + length + 1); 329 // the primary key + the secondary key + separator char 330 if (begin == NULL) 331 return B_NO_MEMORY; 332 333 char *buffer = PutPrimaryKey(string, begin, length, ignorePunctuation); 334 *buffer++ = '\01'; 335 // separator 336 337 input_context context(ignorePunctuation); 338 uint32 c; 339 for (uint32 i = 0; 340 (c = GetNextChar(&string, context)) && i < length; i++) { 341 if (c < 0x80) 342 *buffer++ = tolower(c); 343 else 344 BUnicodeChar::ToUTF8(BUnicodeChar::ToLower(c), &buffer); 345 } 346 *buffer = '\0'; 347 348 key->UnlockBuffer(buffer - begin); 349 break; 350 } 351 352 case B_COLLATE_TERTIARY: 353 case B_COLLATE_QUATERNARY: 354 { 355 char *begin 356 = key->LockBuffer(PrimaryKeyLength(length) + length + 1); 357 // the primary key + the tertiary key + separator char 358 if (begin == NULL) 359 return B_NO_MEMORY; 360 361 char *buffer = PutPrimaryKey(string, begin, length, ignorePunctuation); 362 *buffer++ = '\01'; 363 // separator 364 365 input_context context(ignorePunctuation); 366 uint32 c; 367 for (uint32 i = 0; 368 (c = GetNextChar(&string, context)) && i < length; i++) { 369 BUnicodeChar::ToUTF8(c, &buffer); 370 } 371 *buffer = '\0'; 372 373 key->UnlockBuffer(buffer + length - begin); 374 break; 375 } 376 377 case B_COLLATE_IDENTICAL: 378 default: 379 key->SetTo(string, length); 380 // is there any way to check if BString::SetTo() actually succeeded? 381 break; 382 } 383 return B_OK; 384 } 385 386 387 int 388 BCollatorAddOn::Compare(const char *a, const char *b, int32 length, 389 int8 strength, bool ignorePunctuation) 390 { 391 if (strength >= B_COLLATE_QUATERNARY) { 392 // the difference between tertiary and quaternary collation strength 393 // are usually a different handling of punctuation characters 394 ignorePunctuation = false; 395 } 396 397 input_context contextA(ignorePunctuation); 398 input_context contextB(ignorePunctuation); 399 400 switch (strength) { 401 case B_COLLATE_PRIMARY: 402 { 403 for (int32 i = 0; i < length; i++) { 404 uint32 charA = GetNextChar(&a, contextA); 405 uint32 charB = GetNextChar(&b, contextB); 406 if (charA == 0) 407 return charB == 0 ? 0 : -(int32)charB; 408 else if (charB == 0) 409 return (int32)charA; 410 411 charA = getPrimaryChar(charA); 412 charB = getPrimaryChar(charB); 413 414 if (charA != charB) 415 return (int32)charA - (int32)charB; 416 } 417 return 0; 418 } 419 420 case B_COLLATE_SECONDARY: 421 { 422 // diacriticals can only change the order between equal strings 423 int32 compare 424 = Compare(a, b, length, B_COLLATE_PRIMARY, ignorePunctuation); 425 if (compare != 0) 426 return compare; 427 428 for (int32 i = 0; i < length; i++) { 429 uint32 charA = BUnicodeChar::ToLower(GetNextChar(&a, contextA)); 430 uint32 charB = BUnicodeChar::ToLower(GetNextChar(&b, contextB)); 431 432 // the two strings does have the same size when we get here 433 if (charA == 0) 434 return 0; 435 436 if (charA != charB) 437 return (int32)charA - (int32)charB; 438 } 439 return 0; 440 } 441 442 case B_COLLATE_TERTIARY: 443 case B_COLLATE_QUATERNARY: 444 { 445 // diacriticals can only change the order between equal strings 446 int32 compare 447 = Compare(a, b, length, B_COLLATE_PRIMARY, ignorePunctuation); 448 if (compare != 0) 449 return compare; 450 451 for (int32 i = 0; i < length; i++) { 452 uint32 charA = GetNextChar(&a, contextA); 453 uint32 charB = GetNextChar(&b, contextB); 454 455 // the two strings does have the same size when we get here 456 if (charA == 0) 457 return 0; 458 459 if (charA != charB) 460 return (int32)charA - (int32)charB; 461 } 462 return 0; 463 } 464 465 case B_COLLATE_IDENTICAL: 466 default: 467 return strncmp(a, b, length); 468 } 469 } 470 471 472 status_t 473 BCollatorAddOn::Archive(BMessage *archive, bool deep) const 474 { 475 return BArchivable::Archive(archive, deep); 476 } 477 478 479 BArchivable * 480 BCollatorAddOn::Instantiate(BMessage *archive) 481 { 482 if (validate_instantiation(archive, "BCollatorAddOn")) 483 return new BCollatorAddOn(archive); 484 485 return NULL; 486 } 487 488