1 /* 2 Open Tracker License 3 4 Terms and Conditions 5 6 Copyright (c) 1991-2001, Be Incorporated. All rights reserved. 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy of 9 this software and associated documentation files (the "Software"), to deal in 10 the Software without restriction, including without limitation the rights to 11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 of the Software, and to permit persons to whom the Software is furnished to do 13 so, subject to the following conditions: 14 15 The above copyright notice and this permission notice applies to all licensees 16 and shall be included in all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION 23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 25 Except as contained in this notice, the name of Be Incorporated shall not be 26 used in advertising or otherwise to promote the sale, use or other dealings in 27 this Software without prior written authorization from Be Incorporated. 28 29 BeMail(TM), Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks 30 of Be Incorporated in the United States and other countries. Other brand product 31 names are registered trademarks or trademarks of their respective holders. 32 All rights reserved. 33 */ 34 35 36 #include "Words.h" 37 38 #include <ctype.h> 39 #include <stdio.h> 40 #include <string.h> 41 42 43 enum { 44 FIND_WORD, 45 GET_WORD, 46 GET_FLAGS 47 }; 48 49 50 /* 51 MAXMETAPH is the length of the Metaphone code. 52 53 Four is a good compromise value for English names. For comparing words 54 which are not names or for some non-English names, use a longer code 55 length for more precise matches. 56 57 The default here is 5. 58 */ 59 60 #define MAXMETAPH 6 61 62 63 // Character coding array, A-Z 64 static char vsvfn[26] = { 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 65 2, 4, 4, 1, 0, 0, 0, 8, 0}; 66 67 68 static const char* gCmpKey; 69 70 71 static int 72 word_cmp(BString** firstArg, BString** secondArg) 73 { 74 return word_match(gCmpKey, (*firstArg)->String()) - word_match(gCmpKey, 75 (*secondArg)->String()); 76 } 77 78 79 Words::Words(bool useMetaphone) 80 : 81 fUseMetaphone(useMetaphone) 82 { 83 84 } 85 86 87 Words::Words(BPositionIO* thes, bool useMetaphone) 88 : 89 WIndex(thes), 90 fUseMetaphone(useMetaphone) 91 { 92 93 } 94 95 96 Words::~Words(void) 97 { 98 99 } 100 101 102 Words::Words(const char* dataPath, const char* indexPath, bool useMetaphone) 103 : 104 fUseMetaphone(useMetaphone) 105 { 106 if (!useMetaphone) 107 fEntrySize = sizeof(uint32); 108 SetTo(dataPath, indexPath); 109 } 110 111 112 status_t 113 Words::BuildIndex(void) 114 { 115 // Parse the Words file... 116 117 // Buffer Stuff 118 char buffer[16384]; 119 char *nptr, *eptr; 120 int64 blockOffset; 121 int32 blockSize; 122 123 // The Word Entry 124 WIndexEntry entry; 125 char entryName[256], *namePtr = entryName; 126 char suffixName[256]; 127 char flags[32], *flagsPtr = flags; 128 129 // State Info 130 int32 state = FIND_WORD; 131 132 // Make sure we are at start of file 133 fDataFile->Seek(0, SEEK_SET); 134 entry.offset = -1; 135 136 // Read blocks from thes until eof 137 while (true) { 138 // Get next block 139 blockOffset = fDataFile->Position(); 140 if ((blockSize = fDataFile->Read(buffer, 16384)) == 0) 141 break; 142 143 // parse block 144 for (nptr = buffer, eptr = buffer + blockSize; nptr < eptr; nptr++) { 145 // Looking for start of word? 146 if (state == FIND_WORD) { 147 // Is start of word? 148 if (isalpha(*nptr)) { 149 state = GET_WORD; 150 *namePtr++ = *nptr; // copy word 151 entry.offset = blockOffset + (nptr - buffer); 152 } else { 153 entry.offset++; 154 } 155 } else if ((*nptr == '\n') || (*nptr == '\r')) { 156 // End of word? 157 158 if (namePtr != entryName) { 159 // Add previous entry to word index 160 *namePtr = 0; // terminate word 161 *flagsPtr = 0; // terminate flags 162 NormalizeWord(entryName, entryName); 163 // Add base word 164 entry.key = GetKey(entryName); 165 AddItem(&entry); 166 167 // Add suffixed words if any 168 if (flagsPtr != flags) { 169 // printf("Base: %s, flags: %s\n", entryName, flags); 170 for (flagsPtr = flags; *flagsPtr != 0; flagsPtr++) { 171 if (suffix_word(suffixName, entryName, *flagsPtr)) { 172 // printf("Suffix: %s\n", suffixName); 173 entry.key = GetKey(suffixName); 174 AddItem(&entry); 175 } 176 } 177 } 178 } 179 // Init new entry 180 state = FIND_WORD; 181 namePtr = entryName; 182 flagsPtr = flags; 183 } else if (state == GET_WORD) { 184 // Start of flags? 185 if (*nptr == '/') { 186 *namePtr = 0; // terminate word 187 // printf("Found word: %s\n", entryName); 188 189 state = GET_FLAGS; 190 } else { 191 *namePtr++ = *nptr; // copy word 192 } 193 } else if (state == GET_FLAGS) // Are we getting the flags? 194 *flagsPtr++ = *nptr; // copy flag 195 } // End for (nptr = buffer, eptr = buffer + blockSize; 196 // nptr < eptr; nptr++, entry.size++) 197 } // End while (true) 198 199 SortItems(); 200 return B_OK; 201 } 202 203 204 int32 205 Words::GetKey(const char* s) 206 { 207 if (fUseMetaphone) { 208 char Metaph[12]; 209 const char *sPtr; 210 int32 key = 0; 211 int32 offset; 212 char c; 213 214 metaphone(s, Metaph, GENERATE); 215 // Compact Metaphone from 6 bytes to 4 216 217 // printf("%s -> %s: \n", s, Metaph); 218 219 for (sPtr = Metaph, offset = 25; *sPtr; sPtr++, offset -= 5) { 220 c = *sPtr - 'A'; 221 // printf("%d,", int16(c)); 222 key |= int32(c) << offset; 223 } 224 for (; offset >= 0; offset -= 5) 225 key |= int32(31) << offset; 226 // printf(": %ld\n", key); 227 return key; 228 } else { 229 return WIndex::GetKey(s); 230 } 231 } 232 233 234 // Macros to access the character coding array 235 #define vowel(x) (vsvfn[(x) - 'A'] & 1) // AEIOU 236 #define same(x) (vsvfn[(x) - 'A'] & 2) // FJLMNR 237 #define varson(x) (vsvfn[(x) - 'A'] & 4) // CGPST 238 #define frontv(x) (vsvfn[(x) - 'A'] & 8) // EIY 239 #define noghf(x) (vsvfn[(x) - 'A'] & 16) // BDH 240 #define NUL '\0' 241 242 /* 243 metaphone() 244 245 Arguments: 246 1 - The word to be converted to a metaphone code. 247 2 - A MAXMETAPH + 1 char field for the result. 248 3 - Function flag: 249 If 0: Compute the Metaphone code for the first argument, 250 then compare it to the Metaphone code passed in the second argument. 251 If 1: Compute the Metaphone code for the first argument, 252 then store the result in the area pointed to by the second argument. 253 254 Returns: 255 If function code is 0, returns Success_ for a match, else Error_. 256 If function code is 1, returns Success_. 257 */ 258 259 260 bool 261 metaphone(const char* Word, char* Metaph, metaphlag Flag) 262 { 263 char *n, *n_start, *n_end; // Pointers to string 264 char *metaph = NULL, *metaph_end; // Pointers to metaph 265 char ntrans[512]; // Word with uppercase letters 266 char newm[MAXMETAPH + 4]; // New metaph for comparison 267 int KSflag; // State flag for X translation 268 269 // Copy word to internal buffer, dropping non-alphabetic characters 270 // and converting to upper case. 271 272 for (n = ntrans + 1, n_end = ntrans + sizeof(ntrans) - 2; 273 *Word && n < n_end; ++Word) { 274 if (isalpha(*Word)) 275 *n++ = toupper(*Word); 276 } 277 278 if (n == ntrans + 1) 279 return false; // Return if zero characters 280 else 281 n_end = n; // Set end of string pointer 282 283 // Pad with NULs, front and rear 284 285 *n++ = NUL; 286 *n = NUL; 287 n = ntrans; 288 *n++ = NUL; 289 290 // If doing comparison, redirect pointers 291 292 if (COMPARE == Flag) { 293 metaph = Metaph; 294 Metaph = newm; 295 } 296 297 // Check for PN, KN, GN, WR, WH, and X at start 298 299 switch (*n) { 300 case 'P': 301 case 'K': 302 case 'G': 303 if ('N' == *(n + 1)) 304 *n++ = NUL; 305 break; 306 307 case 'A': 308 if ('E' == *(n + 1)) 309 *n++ = NUL; 310 break; 311 312 case 'W': 313 if ('R' == *(n + 1)) { 314 *n++ = NUL; 315 } else if ('H' == *(n + 1)) { 316 *(n + 1) = *n; 317 *n++ = NUL; 318 } 319 break; 320 321 case 'X': 322 *n = 'S'; 323 break; 324 } 325 326 // Now loop through the string, stopping at the end of the string 327 // or when the computed Metaphone code is MAXMETAPH characters long. 328 329 KSflag = false; // State flag for KStranslation 330 331 for (metaph_end = Metaph + MAXMETAPH, n_start = n; 332 n <= n_end && Metaph < metaph_end; ++n) { 333 if (KSflag) { 334 KSflag = false; 335 *Metaph++ = *n; 336 } else { 337 // Drop duplicates except for CC 338 339 if (*(n - 1) == *n && *n != 'C') 340 continue; 341 342 // Check for F J L M N R or first letter vowel 343 344 if (same(*n) || (n == n_start && vowel(*n))) { 345 *Metaph++ = *n; 346 } else { 347 switch (*n) { 348 case 'B': 349 if (n < n_end || *(n - 1) != 'M') 350 *Metaph++ = *n; 351 break; 352 353 case 'C': 354 if (*(n - 1) != 'S' || !frontv(*(n + 1))) { 355 if ('I' == *(n + 1) && 'A' == *(n + 2)) 356 *Metaph++ = 'X'; 357 else if (frontv(*(n + 1))) 358 *Metaph++ = 'S'; 359 else if ('H' == *(n + 1)) { 360 *Metaph++ = ((n == n_start && !vowel(*(n + 2))) 361 || 'S' == *(n - 1)) ? 'K' : 'X'; 362 } else { 363 *Metaph++ = 'K'; 364 } 365 } 366 break; 367 368 case 'D': 369 *Metaph++ = ('G' == *(n + 1) && frontv(*(n + 2))) 370 ? 'J' : 'T'; 371 break; 372 373 case 'G': 374 if ((*(n + 1) != 'H' || vowel(*(n + 2))) 375 && (*(n + 1) != 'N' || ((n + 1) < n_end 376 && (*(n + 2) != 'E' || *(n + 3) != 'D'))) 377 && (*(n - 1) != 'D' || !frontv(*(n + 1)))) { 378 *Metaph++ = (frontv(*(n + 1)) 379 && *(n + 2) != 'G') ? 'J' : 'K'; 380 } else if ('H' == *(n + 1) && !noghf(*(n - 3)) 381 && *(n - 4) != 'H') { 382 *Metaph++ = 'F'; 383 } 384 break; 385 386 case 'H': 387 if (!varson(*(n - 1)) 388 && (!vowel(*(n - 1)) || vowel(*(n + 1)))) { 389 *Metaph++ = 'H'; 390 } 391 break; 392 393 case 'K': 394 if (*(n - 1) != 'C') 395 *Metaph++ = 'K'; 396 break; 397 398 case 'P': 399 *Metaph++ = ('H' == *(n + 1)) ? 'F' : 'P'; 400 break; 401 402 case 'Q': 403 *Metaph++ = 'K'; 404 break; 405 406 case 'S': 407 *Metaph++ = ('H' == *(n + 1) || ('I' == *(n + 1) 408 && ('O' == *(n + 2) || 'A' == *(n + 2)))) 409 ? 'X' : 'S'; 410 break; 411 412 case 'T': 413 if ('I' == *(n + 1) 414 && ('O' == *(n + 2) || 'A' == *(n + 2))) { 415 *Metaph++ = 'X'; 416 } else if ('H' == *(n + 1)) { 417 *Metaph++ = 'O'; 418 } else if (*(n + 1) != 'C' || *(n + 2) != 'H') { 419 *Metaph++ = 'T'; 420 } 421 break; 422 423 case 'V': 424 *Metaph++ = 'F'; 425 break; 426 427 case 'W': 428 case 'Y': 429 if (vowel(*(n + 1))) 430 *Metaph++ = *n; 431 break; 432 433 case 'X': 434 if (n == n_start) { 435 *Metaph++ = 'S'; 436 } else { 437 *Metaph++ = 'K'; 438 KSflag = true; 439 } 440 break; 441 442 case 'Z': 443 *Metaph++ = 'S'; 444 break; 445 } 446 } 447 } 448 449 // Compare new Metaphone code with old 450 if (COMPARE == Flag 451 && *(Metaph - 1) != metaph[(Metaph - newm) - 1]) { 452 return false; 453 } 454 } 455 456 // If comparing, check if Metaphone codes were equal in length 457 if (COMPARE == Flag && metaph[Metaph - newm]) 458 return false; 459 460 *Metaph = NUL; 461 return true; 462 } 463 464 465 int 466 word_match(const char* reference, const char* test) 467 { 468 const char *s1, *s2; 469 int32 x = 0; 470 char c1, c2; 471 s1 = test; 472 s2 = reference; 473 474 bool a, b; 475 476 while (*s2 || *s1) { 477 c1 = tolower(*s1); 478 c2 = tolower(*s2); 479 480 if (*s2 && *s1) { 481 if (c1 != c2) { 482 a = (tolower(s1[1]) == c2); 483 b = (tolower(s2[1]) == c1); 484 // Reversed pair 485 if (a && b) { 486 x += 1; 487 s1++; 488 s2++; 489 } 490 // Extra character 491 if (a) { 492 x += 1; 493 s1++; 494 } 495 // Missing Character 496 else if (b) { 497 x += 1; 498 s2++; 499 } 500 // Equivalent Character 501 else if (vsvfn[(unsigned)c1] == vsvfn[(unsigned)c2]) 502 x++; 503 // Unrelated Character 504 else 505 x += 3; 506 } 507 } else { 508 x += 1; 509 } 510 511 if (*s2) 512 s2++; 513 if (*s1) 514 s1++; 515 } 516 517 return x; 518 } 519 520 521 int32 522 suffix_word(char* dst, const char* src, char flag) 523 { 524 char* end; 525 526 end = stpcpy(dst, src); 527 flag = toupper(flag); 528 switch (flag) { 529 case 'V': 530 switch (end[-1]) { 531 case 'e': 532 end = stpcpy(end - 1, "ive"); 533 break; 534 default: 535 end = stpcpy(end, "ive"); 536 break; 537 } 538 break; 539 case 'N': 540 switch (end[-1]) { 541 case 'e': 542 end = stpcpy(end - 1, "ion"); 543 break; 544 case 'y': 545 end = stpcpy(end - 1, "ication"); 546 break; 547 default: 548 end = stpcpy(end, "en"); 549 break; 550 } 551 break; 552 case 'X': 553 switch (end[-1]) { 554 case 'e': 555 end = stpcpy(end - 1, "ions"); 556 break; 557 case 'y': 558 end = stpcpy(end - 1, "ications"); 559 break; 560 default: 561 end = stpcpy(end, "ens"); 562 break; 563 } 564 break; 565 case 'H': 566 switch (end[-1]) { 567 case 'y': 568 end = stpcpy(end - 1, "ieth"); 569 break; 570 default: 571 end = stpcpy(end, "th"); 572 break; 573 } 574 break; 575 case 'Y': 576 end = stpcpy(end, "ly"); 577 break; 578 case 'G': 579 switch (end[-1]) { 580 case 'e': 581 end = stpcpy(end - 1, "ing"); 582 break; 583 default: 584 end = stpcpy(end, "ing"); 585 break; 586 } 587 break; 588 case 'J': 589 switch (end[-1]) { 590 case 'e': 591 end = stpcpy(end - 1, "ings"); 592 break; 593 default: 594 end = stpcpy(end, "ings"); 595 break; 596 } 597 break; 598 case 'D': 599 switch (end[-1]) { 600 case 'e': 601 end = stpcpy(end - 1, "ed"); 602 break; 603 case 'y': 604 if (!strchr("aeiou", end[-2])) { 605 end = stpcpy(end - 1, "ied"); 606 break; 607 } 608 // Fall through 609 default: 610 end = stpcpy(end, "ed"); 611 break; 612 } 613 break; 614 case 'T': 615 switch (end[-1]) { 616 case 'e': 617 end = stpcpy(end - 1, "est"); 618 break; 619 case 'y': 620 if (!strchr("aeiou", end[-2])) { 621 end = stpcpy(end - 1, "iest"); 622 break; 623 } 624 // Fall through 625 default: 626 end = stpcpy(end, "est"); 627 break; 628 } 629 break; 630 case 'R': 631 switch (end[-1]) { 632 case 'e': 633 end = stpcpy(end - 1, "er"); 634 break; 635 case 'y': 636 if (!strchr("aeiou", end[-2])) { 637 end = stpcpy(end - 1, "ier"); 638 break; 639 } 640 // Fall through 641 default: 642 end = stpcpy(end, "er"); 643 break; 644 } 645 break; 646 case 'Z': 647 switch (end[-1]) { 648 case 'e': 649 end = stpcpy(end - 1, "ers"); 650 break; 651 case 'y': 652 if (!strchr("aeiou", end[-2])) { 653 end = stpcpy(end - 1, "iers"); 654 break; 655 } 656 // Fall through 657 default: 658 end = stpcpy(end, "ers"); 659 break; 660 } 661 break; 662 case 'S': 663 switch (end[-1]) { 664 case 's': 665 case 'x': 666 case 'z': 667 case 'h': 668 end = stpcpy(end, "es"); 669 break; 670 case 'y': 671 if (!strchr("aeiou", end[-2])) { 672 end = stpcpy(end - 1, "ies"); 673 break; 674 } 675 // Fall through 676 default: 677 end = stpcpy(end, "s"); 678 break; 679 } 680 break; 681 case 'P': 682 switch (end[-1]) { 683 case 'y': 684 if (!strchr("aeiou", end[-2])) { 685 end = stpcpy(end - 1, "iness"); 686 break; 687 } 688 // Fall through 689 default: 690 end = stpcpy(end, "ness"); 691 break; 692 } 693 break; 694 case 'M': 695 end = stpcpy(end, "'s"); 696 break; 697 default: 698 return 0; 699 } 700 return end - dst; 701 } 702 703 704 int32 705 Words::FindBestMatches(BList* matches, const char* s) 706 { 707 int32 index; 708 // printf("*** Looking for %s: ***\n", s); 709 710 if ((index = FindFirst(s)) >= 0) { 711 BString srcWord(s); 712 FileEntry* entry; 713 WIndexEntry* indexEntry; 714 715 int32 key = (ItemAt(index))->key; 716 int32 suffixLength; 717 char word[128], suffixWord[128]; 718 const char *src, *testWord; 719 const char *suffixFlags; 720 char *dst; 721 722 gCmpKey = srcWord.String(); 723 724 uint8 hashTable[32]; 725 uint8 hashValue, highHash, lowHash; 726 for (int32 i = 0; i < 32; i++) 727 hashTable[i] = 0; 728 729 do { 730 indexEntry = ItemAt(index); 731 // Hash the entry offset; we use this to make sure we don't add 732 // the same word file entry twice; 733 // It is possible for the same entry in the words file to have 734 // multiple entries in the index. 735 736 hashValue = indexEntry->offset % 256; 737 highHash = hashValue >> 3; 738 lowHash = 0x01 << (hashValue & 0x07); 739 740 // printf("Testing Entry: %ld: hash=%d, highHash=%d, lowHash=%d\n", 741 // indexEntry->offset, hashValue, (uint16)highHash, 742 // (uint16)lowHash); 743 744 // Has this entry offset been seen before? 745 if (!(hashTable[highHash] & lowHash)) { 746 // printf("New Entry\n"); 747 hashTable[highHash] |= lowHash; // Mark this offset so we don't add it twice 748 749 entry = GetEntry(index); 750 src = entry->String(); 751 while (*src && !isalpha(*src)) 752 src++; 753 dst = word; 754 while (*src && *src != '/') 755 *dst++ = *src++; 756 *dst = 0; 757 if (*src == '/') 758 suffixFlags = src + 1; 759 else 760 suffixFlags = src; 761 762 // printf("Base Word: %s\n", word); 763 // printf("Flags: %s\n", suffixFlags); 764 testWord = word; // Test the base word first 765 do { 766 // printf("Testing: %s\n", testWord); 767 // Does this word match the key 768 if ((GetKey(testWord) == key) 769 // And does it look close enough to the compare key? 770 // word_match(gCmpKey, testWord) 771 // <= int32((strlen(gCmpKey)-1)/2)) 772 && word_match(gCmpKey, testWord) 773 <= int32(float(strlen(gCmpKey)-1)*.75)) { 774 // printf("Added: %s\n", testWord); 775 matches->AddItem(new BString(testWord)); 776 } 777 778 // If suffix, transform and test 779 if (*suffixFlags) { 780 // Repeat until valid suffix found or end is reached 781 suffixLength = 0; 782 while (*suffixFlags 783 && !(suffixLength = suffix_word(suffixWord, 784 word, *suffixFlags++))) {} 785 if (suffixLength) 786 testWord = suffixWord; 787 else 788 testWord = NULL; 789 } else { 790 testWord = NULL; 791 } 792 } while (testWord); 793 delete entry; 794 } 795 // else printf("Redundant entry\n"); 796 797 index++; 798 } while (key == (ItemAt(index))->key); 799 800 return matches->CountItems(); 801 } else { 802 return 0; 803 } 804 } 805 806 807 void 808 sort_word_list(BList* matches, const char* reference) 809 { 810 if (matches->CountItems() > 0) { 811 BString srcWord(reference); 812 gCmpKey = srcWord.String(); 813 matches->SortItems((int(*)(const void*, const void*))word_cmp); 814 } 815 } 816 817