1 /* 2 Open Tracker License 3 4 Terms and Conditions 5 6 Copyright (c) 1991-2001, Be Incorporated. All rights reserved. 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy of 9 this software and associated documentation files (the "Software"), to deal in 10 the Software without restriction, including without limitation the rights to 11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 of the Software, and to permit persons to whom the Software is furnished to do 13 so, subject to the following conditions: 14 15 The above copyright notice and this permission notice applies to all licensees 16 and shall be included in all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION 23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 25 Except as contained in this notice, the name of Be Incorporated shall not be 26 used in advertising or otherwise to promote the sale, use or other dealings in 27 this Software without prior written authorization from Be Incorporated. 28 29 BeMail(TM), Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks 30 of Be Incorporated in the United States and other countries. Other brand product 31 names are registered trademarks or trademarks of their respective holders. 32 All rights reserved. 33 */ 34 35 #include <ctype.h> 36 #include <stdio.h> 37 #include <string.h> 38 #include <List.h> 39 #include "Words.h" 40 41 /* 42 ** File METAPHON.C 43 */ 44 45 /* 46 ** MAXMETAPH is the length of the Metaphone code. 47 ** 48 ** Four is a good compromise value for English names. For comparing words 49 ** which are not names or for some non-English names, use a longer code 50 ** length for more precise matches. 51 ** 52 ** The default here is 5. 53 */ 54 55 #define MAXMETAPH 6 56 57 static const char *gCmpKey; 58 static int word_cmp( BString **firstArg, BString **secondArg ); 59 60 static int word_cmp( BString **firstArg, BString **secondArg ) 61 { 62 return word_match( gCmpKey, (*firstArg)->String() ) - word_match( gCmpKey, (*secondArg)->String() ); 63 } 64 65 Words::Words( bool useMetaphone ) 66 : fUseMetaphone( useMetaphone ) 67 { 68 69 } 70 71 Words::Words( BPositionIO *thes, bool useMetaphone ) 72 : WIndex( thes ), 73 fUseMetaphone( useMetaphone ) 74 { 75 76 } 77 78 Words::~Words( void ) 79 { 80 81 } 82 83 Words::Words( const char *dataPath, const char *indexPath, bool useMetaphone ) 84 : fUseMetaphone( useMetaphone ) 85 { 86 if( !useMetaphone ) 87 entrySize = sizeof( uint32 ); 88 SetTo( dataPath, indexPath ); 89 } 90 91 enum 92 { 93 FIND_WORD, 94 GET_WORD, 95 GET_FLAGS 96 }; 97 98 // Parse the Words file... 99 status_t Words::BuildIndex( void ) 100 { 101 // Buffer Stuff 102 char buffer[16384]; 103 char *nptr, *eptr; 104 int64 blockOffset; 105 int32 blockSize; 106 107 // The Word Entry 108 WIndexEntry entry; 109 char entryName[256], *namePtr = entryName; 110 char suffixName[256]; 111 char flags[32], *flagsPtr = flags; 112 113 // State Info 114 int32 state = FIND_WORD; 115 116 // Make sure we are at start of file 117 dataFile->Seek( 0, SEEK_SET ); 118 entry.offset = -1; 119 120 // Read blocks from thes until eof 121 while( true ) 122 { 123 // Get next block 124 blockOffset = dataFile->Position(); 125 if( (blockSize = dataFile->Read( buffer, 16384 )) == 0 ) 126 break; 127 128 // parse block 129 for( nptr = buffer, eptr = buffer + blockSize; nptr < eptr; nptr++ ) 130 { 131 // Looking for start of word? 132 if( state == FIND_WORD ) 133 { 134 // Is start of word? 135 if( isalpha(*nptr) ) 136 { 137 state = GET_WORD; 138 *namePtr++ = *nptr; // copy word 139 entry.offset = blockOffset + (nptr - buffer); 140 } 141 else 142 entry.offset++; 143 } 144 // End of word? 145 else if( (*nptr == '\n')||(*nptr == '\r') ) 146 { 147 if( namePtr != entryName ) 148 { 149 // Add previous entry to word index 150 *namePtr = 0; // terminate word 151 *flagsPtr = 0; // terminate flags 152 NormalizeWord( entryName, entryName ); 153 // Add base word 154 entry.key = GetKey( entryName ); 155 AddItem( &entry ); 156 157 // Add suffixed words if any 158 if( flagsPtr != flags ) 159 { 160 //printf( "Base: %s, flags: %s\n", entryName, flags ); 161 for( flagsPtr=flags; *flagsPtr != 0; flagsPtr++ ) 162 { 163 if( suffix_word( suffixName, entryName, *flagsPtr ) ) 164 { 165 //printf( "Suffix: %s\n", suffixName ); 166 entry.key = GetKey( suffixName ); 167 AddItem( &entry ); 168 } 169 } 170 } 171 } 172 // Init new entry 173 state = FIND_WORD; 174 namePtr = entryName; 175 flagsPtr = flags; 176 } 177 else if( state == GET_WORD ) // Are we looking for a word? 178 { 179 // Start of flags? 180 if( *nptr == '/' ) 181 { 182 *namePtr = 0; // terminate word 183 //printf( "Found word: %s\n", entryName ); 184 185 // Set state to get flags 186 state = GET_FLAGS; 187 } 188 else 189 *namePtr++ = *nptr; // copy word 190 } 191 else if( state == GET_FLAGS ) // Are we getting the flags? 192 *flagsPtr++ = *nptr; // copy flag 193 } // End for( nptr = buffer, eptr = buffer + blockSize; nptr < eptr; nptr++, entry.size++ ) 194 } // End while( true ) 195 196 SortItems(); 197 return B_OK; 198 } 199 200 /* 201 ** Character coding array 202 */ 203 204 static char vsvfn[26] = { 205 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0}; 206 /* A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */ 207 208 int32 Words::GetKey( const char *s ) 209 { 210 if( fUseMetaphone ) 211 { 212 char Metaph[12]; 213 const char *sPtr; 214 int32 key = 0; 215 int32 offset; 216 char c; 217 218 metaphone( s, Metaph, GENERATE ); 219 // Compact Metaphone from 6 bytes to 4 220 221 // printf( "%s -> %s: \n", s, Metaph ); 222 223 for( sPtr = Metaph, offset = 25; *sPtr; sPtr++, offset -= 5 ) 224 { 225 c = *sPtr - 'A'; 226 // printf( "%d,", int16(c) ); 227 key |= int32(c) << offset; 228 } 229 for( ; offset >= 0; offset -= 5 ) 230 key |= int32(31) << offset; 231 // printf( ": %ld\n", key ); 232 return key; 233 } 234 else 235 return WIndex::GetKey( s ); 236 } 237 238 /* 239 ** Macros to access the character coding array 240 */ 241 242 #define vowel(x) (vsvfn[(x) - 'A'] & 1) /* AEIOU */ 243 #define same(x) (vsvfn[(x) - 'A'] & 2) /* FJLMNR */ 244 #define varson(x) (vsvfn[(x) - 'A'] & 4) /* CGPST */ 245 #define frontv(x) (vsvfn[(x) - 'A'] & 8) /* EIY */ 246 #define noghf(x) (vsvfn[(x) - 'A'] & 16) /* BDH */ 247 #define NUL '\0' 248 /* 249 ** metaphone() 250 ** 251 ** Arguments: 1 - The word to be converted to a metaphone code. 252 ** 2 - A MAXMETAPH+1 char field for the result. 253 ** 3 - Function flag: 254 ** If 0: Compute the Metaphone code for the first argument, 255 ** then compare it to the Metaphone code passed in 256 ** the second argument. 257 ** If 1: Compute the Metaphone code for the first argument, 258 ** then store the result in the area pointed to by the 259 ** second argument. 260 ** 261 ** Returns: If function code is 0, returns Success_ for a match, else Error_. 262 ** If function code is 1, returns Success_. 263 */ 264 265 bool metaphone(const char *Word, char *Metaph, metaphlag Flag) 266 { 267 char *n, *n_start, *n_end; /* Pointers to string */ 268 char *metaph = NULL, *metaph_end; /* Pointers to metaph */ 269 char ntrans[512]; /* Word with uppercase letters */ 270 char newm[MAXMETAPH + 4]; /* New metaph for comparison */ 271 int KSflag; /* State flag for X translation */ 272 273 /* 274 ** Copy word to internal buffer, dropping non-alphabetic characters 275 ** and converting to upper case. 276 */ 277 278 for (n = ntrans + 1, n_end = ntrans + sizeof(ntrans) - 2; 279 *Word && n < n_end; ++Word) 280 { 281 if (isalpha(*Word)) 282 *n++ = toupper(*Word); 283 } 284 285 if (n == ntrans + 1) 286 return false; /* Return if zero characters */ 287 else n_end = n; /* Set end of string pointer */ 288 289 /* 290 ** Pad with NULs, front and rear 291 */ 292 293 *n++ = NUL; 294 *n = NUL; 295 n = ntrans; 296 *n++ = NUL; 297 298 /* 299 ** If doing comparison, redirect pointers 300 */ 301 302 if (COMPARE == Flag) 303 { 304 metaph = Metaph; 305 Metaph = newm; 306 } 307 308 /* 309 ** Check for PN, KN, GN, WR, WH, and X at start 310 */ 311 312 switch (*n) 313 { 314 case 'P': 315 case 'K': 316 case 'G': 317 if ('N' == *(n + 1)) 318 *n++ = NUL; 319 break; 320 321 case 'A': 322 if ('E' == *(n + 1)) 323 *n++ = NUL; 324 break; 325 326 case 'W': 327 if ('R' == *(n + 1)) 328 *n++ = NUL; 329 else if ('H' == *(n + 1)) 330 { 331 *(n + 1) = *n; 332 *n++ = NUL; 333 } 334 break; 335 336 case 'X': 337 *n = 'S'; 338 break; 339 } 340 341 /* 342 ** Now loop through the string, stopping at the end of the string 343 ** or when the computed Metaphone code is MAXMETAPH characters long. 344 */ 345 346 KSflag = false; /* State flag for KStranslation */ 347 for (metaph_end = Metaph + MAXMETAPH, n_start = n; 348 n <= n_end && Metaph < metaph_end; ++n) 349 { 350 if (KSflag) 351 { 352 KSflag = false; 353 *Metaph++ = *n; 354 } 355 else 356 { 357 /* Drop duplicates except for CC */ 358 359 if (*(n - 1) == *n && *n != 'C') 360 continue; 361 362 /* Check for F J L M N R or first letter vowel */ 363 364 if (same(*n) || (n == n_start && vowel(*n))) 365 *Metaph++ = *n; 366 else switch (*n) 367 { 368 case 'B': 369 if (n < n_end || *(n - 1) != 'M') 370 *Metaph++ = *n; 371 break; 372 373 case 'C': 374 if (*(n - 1) != 'S' || !frontv(*(n + 1))) 375 { 376 if ('I' == *(n + 1) && 'A' == *(n + 2)) 377 *Metaph++ = 'X'; 378 else if (frontv(*(n + 1))) 379 *Metaph++ = 'S'; 380 else if ('H' == *(n + 1)) 381 *Metaph++ = ((n == n_start && 382 !vowel(*(n + 2))) || 383 'S' == *(n - 1)) ? 'K' : 'X'; 384 else *Metaph++ = 'K'; 385 } 386 break; 387 388 case 'D': 389 *Metaph++ = ('G' == *(n + 1) && frontv(*(n + 2))) ? 390 'J' : 'T'; 391 break; 392 393 case 'G': 394 if ((*(n + 1) != 'H' || vowel(*(n + 2))) && 395 (*(n + 1) != 'N' || ((n + 1) < n_end && 396 (*(n + 2) != 'E' || *(n + 3) != 'D'))) && 397 (*(n - 1) != 'D' || !frontv(*(n + 1)))) 398 { 399 *Metaph++ = (frontv(*(n + 1)) && 400 *(n + 2) != 'G') ? 'J' : 'K'; 401 } 402 else if ('H' == *(n + 1) && !noghf(*(n - 3)) && 403 *(n - 4) != 'H') 404 { 405 *Metaph++ = 'F'; 406 } 407 break; 408 409 case 'H': 410 if (!varson(*(n - 1)) && (!vowel(*(n - 1)) || 411 vowel(*(n + 1)))) 412 { 413 *Metaph++ = 'H'; 414 } 415 break; 416 417 case 'K': 418 if (*(n - 1) != 'C') 419 *Metaph++ = 'K'; 420 break; 421 422 case 'P': 423 *Metaph++ = ('H' == *(n + 1)) ? 'F' : 'P'; 424 break; 425 426 case 'Q': 427 *Metaph++ = 'K'; 428 break; 429 430 case 'S': 431 *Metaph++ = ('H' == *(n + 1) || ('I' == *(n + 1) && 432 ('O' == *(n + 2) || 'A' == *(n + 2)))) ? 433 'X' : 'S'; 434 break; 435 436 case 'T': 437 if ('I' == *(n + 1) && ('O' == *(n + 2) || 438 'A' == *(n + 2))) 439 { 440 *Metaph++ = 'X'; 441 } 442 else if ('H' == *(n + 1)) 443 *Metaph++ = 'O'; 444 else if (*(n + 1) != 'C' || *(n + 2) != 'H') 445 *Metaph++ = 'T'; 446 break; 447 448 case 'V': 449 *Metaph++ = 'F'; 450 break; 451 452 case 'W': 453 case 'Y': 454 if (vowel(*(n + 1))) 455 *Metaph++ = *n; 456 break; 457 458 case 'X': 459 if (n == n_start) 460 *Metaph++ = 'S'; 461 else 462 { 463 *Metaph++ = 'K'; 464 KSflag = true; 465 } 466 break; 467 468 case 'Z': 469 *Metaph++ = 'S'; 470 break; 471 } 472 } 473 474 /* 475 ** Compare new Metaphone code with old 476 */ 477 478 if (COMPARE == Flag && 479 *(Metaph - 1) != metaph[(Metaph - newm) - 1]) 480 { 481 return false; 482 } 483 } 484 485 /* 486 ** If comparing, check if Metaphone codes were equal in length 487 */ 488 489 if (COMPARE == Flag && metaph[Metaph - newm]) 490 return false; 491 492 *Metaph = NUL; 493 return true; 494 } 495 496 int word_match( const char *reference, const char *test ) 497 { 498 const char *s1, *s2; 499 int32 x = 0; 500 char c1, c2; 501 s1 = test; 502 s2 = reference; 503 504 bool a, b; 505 506 while( *s2 || *s1 ) 507 { 508 c1 = tolower(*s1); 509 c2 = tolower(*s2); 510 511 if( *s2 && *s1 ) 512 { 513 if( c1 != c2 ) 514 { 515 a = (tolower(s1[1]) == c2); 516 b = (tolower(s2[1]) == c1); 517 // Reversed pair 518 if( a && b ) 519 { 520 x += 1; 521 s1++; 522 s2++; 523 } 524 // Extra character 525 if( a ) 526 { 527 x += 1; 528 s1++; 529 } 530 // Missing Character 531 else if( b ) 532 { 533 x += 1; 534 s2++; 535 } 536 // Equivalent Character 537 else if( vsvfn[c1] == vsvfn[c2] ) 538 x++; 539 // Unrelated Character 540 else 541 x += 3; 542 } 543 } 544 else 545 x += 1; 546 if( *s2 ) 547 s2++; 548 if( *s1 ) 549 s1++; 550 } 551 552 return x; 553 } 554 555 int32 suffix_word( char *dst, const char *src, char flag ) 556 { 557 char *end; 558 559 end = stpcpy( dst, src ); 560 flag = toupper( flag ); 561 switch( flag ) 562 { 563 case 'V': 564 switch( end[-1] ) 565 { 566 case 'e': 567 end = stpcpy( end-1, "ive" ); 568 break; 569 default: 570 end = stpcpy( end, "ive" ); 571 break; 572 } 573 break; 574 case 'N': 575 switch( end[-1] ) 576 { 577 case 'e': 578 end = stpcpy( end-1, "ion" ); 579 break; 580 case 'y': 581 end = stpcpy( end-1, "ication" ); 582 break; 583 default: 584 end = stpcpy( end, "en" ); 585 break; 586 } 587 break; 588 case 'X': 589 switch( end[-1] ) 590 { 591 case 'e': 592 end = stpcpy( end-1, "ions" ); 593 break; 594 case 'y': 595 end = stpcpy( end-1, "ications" ); 596 break; 597 default: 598 end = stpcpy( end, "ens" ); 599 break; 600 } 601 break; 602 case 'H': 603 switch( end[-1] ) 604 { 605 case 'y': 606 end = stpcpy( end-1, "ieth" ); 607 break; 608 default: 609 end = stpcpy( end, "th" ); 610 break; 611 } 612 break; 613 case 'Y': 614 end = stpcpy( end, "ly" ); 615 break; 616 case 'G': 617 switch( end[-1] ) 618 { 619 case 'e': 620 end = stpcpy( end-1, "ing" ); 621 break; 622 default: 623 end = stpcpy( end, "ing" ); 624 break; 625 } 626 break; 627 case 'J': 628 switch( end[-1] ) 629 { 630 case 'e': 631 end = stpcpy( end-1, "ings" ); 632 break; 633 default: 634 end = stpcpy( end, "ings" ); 635 break; 636 } 637 break; 638 case 'D': 639 switch( end[-1] ) 640 { 641 case 'e': 642 end = stpcpy( end-1, "ed" ); 643 break; 644 case 'y': 645 if( !strchr( "aeiou", end[-2] ) ) 646 { 647 end = stpcpy( end-1, "ied" ); 648 break; 649 } 650 // Fall through 651 default: 652 end = stpcpy( end, "ed" ); 653 break; 654 } 655 break; 656 case 'T': 657 switch( end[-1] ) 658 { 659 case 'e': 660 end = stpcpy( end-1, "est" ); 661 break; 662 case 'y': 663 if( !strchr( "aeiou", end[-2] ) ) 664 { 665 end = stpcpy( end-1, "iest" ); 666 break; 667 } 668 // Fall through 669 default: 670 end = stpcpy( end, "est" ); 671 break; 672 } 673 break; 674 case 'R': 675 switch( end[-1] ) 676 { 677 case 'e': 678 end = stpcpy( end-1, "er" ); 679 break; 680 case 'y': 681 if( !strchr( "aeiou", end[-2] ) ) 682 { 683 end = stpcpy( end-1, "ier" ); 684 break; 685 } 686 // Fall through 687 default: 688 end = stpcpy( end, "er" ); 689 break; 690 } 691 break; 692 case 'Z': 693 switch( end[-1] ) 694 { 695 case 'e': 696 end = stpcpy( end-1, "ers" ); 697 break; 698 case 'y': 699 if( !strchr( "aeiou", end[-2] ) ) 700 { 701 end = stpcpy( end-1, "iers" ); 702 break; 703 } 704 // Fall through 705 default: 706 end = stpcpy( end, "ers" ); 707 break; 708 } 709 break; 710 case 'S': 711 switch( end[-1] ) 712 { 713 case 's': 714 case 'x': 715 case 'z': 716 case 'h': 717 end = stpcpy( end, "es" ); 718 break; 719 case 'y': 720 if( !strchr( "aeiou", end[-2] ) ) 721 { 722 end = stpcpy( end-1, "ies" ); 723 break; 724 } 725 // Fall through 726 default: 727 end = stpcpy( end, "s" ); 728 break; 729 } 730 break; 731 case 'P': 732 switch( end[-1] ) 733 { 734 case 'y': 735 if( !strchr( "aeiou", end[-2] ) ) 736 { 737 end = stpcpy( end-1, "iness" ); 738 break; 739 } 740 // Fall through 741 default: 742 end = stpcpy( end, "ness" ); 743 break; 744 } 745 break; 746 case 'M': 747 end = stpcpy( end, "'s" ); 748 break; 749 default: 750 return 0; 751 } 752 return end-dst; 753 } 754 755 int32 Words::FindBestMatches( BList *matches, const char *s ) 756 { 757 int32 index; 758 // printf( "*** Looking for %s: ***\n", s ); 759 760 if( (index = FindFirst( s )) >= 0 ) 761 { 762 BString srcWord( s ); 763 FileEntry *entry; 764 WIndexEntry *indexEntry; 765 766 int32 key = (ItemAt( index ))->key; 767 int32 suffixLength; 768 char word[128], suffixWord[128]; 769 const char *src, *testWord; 770 const char *suffixFlags; 771 char *dst; 772 773 gCmpKey = srcWord.String(); 774 775 uint8 hashTable[32]; 776 uint8 hashValue, highHash, lowHash; 777 for( int32 i=0; i<32; i++ ) 778 hashTable[i] = 0; 779 780 do 781 { 782 indexEntry = ItemAt( index ); 783 // Hash the entry offset; we use this to make sure we don't add 784 // the same word file entry twice; 785 // It is possible for the same entry in the words file to have 786 // multiple entries in the index. 787 788 hashValue = indexEntry->offset % 256; 789 highHash = hashValue >> 3; 790 lowHash = 0x01 << (hashValue & 0x07); 791 792 //printf( "Testing Entry: %ld: hash=%d, highHash=%d, lowHash=%d\n", indexEntry->offset, hashValue, (uint16)highHash, (uint16)lowHash ); 793 794 // Has this entry offset been seen before? 795 if( !(hashTable[highHash] & lowHash) ) 796 { 797 //printf( "New Entry\n" ); 798 hashTable[highHash] |= lowHash; // Mark this offset so we don't add it twice 799 800 entry = GetEntry( index ); 801 src = entry->String(); 802 while( *src && !isalpha(*src) ) 803 src++; 804 dst = word; 805 while( *src && *src != '/' ) 806 *dst++ = *src++; 807 *dst = 0; 808 if( *src == '/' ) 809 suffixFlags = src+1; 810 else 811 suffixFlags = src; 812 813 //printf( "Base Word: %s\n", word ); 814 //printf( "Flags: %s\n", suffixFlags ); 815 testWord = word; // Test the base word first 816 do 817 { 818 //printf( "Testing: %s\n", testWord ); 819 // Does this word match the key 820 if( (GetKey( testWord ) == key) && 821 // And does it look close enough to the compare key? 822 //word_match( gCmpKey, testWord ) <= int32((strlen( gCmpKey )-1)/2) ) 823 word_match( gCmpKey, testWord ) <= int32(float(strlen( gCmpKey )-1)*.75) ) 824 { 825 //printf( "Added: %s\n", testWord ); 826 matches->AddItem( (void *)(new BString( testWord )) ); // Add it to the list 827 } 828 829 // If suffix, transform and test 830 if( *suffixFlags ) 831 { 832 // Repeat until valid suffix found or end is reached 833 suffixLength = 0; 834 while( *suffixFlags && !(suffixLength=suffix_word( suffixWord, word, *suffixFlags++ )) ) {} 835 if( suffixLength ) 836 testWord = suffixWord; 837 else 838 testWord = NULL; 839 } 840 else 841 testWord = NULL; 842 }while( testWord ); 843 delete entry; 844 } 845 //else 846 //printf( "Redundant entry\n" ); 847 index++; 848 }while( key == (ItemAt( index ))->key ); 849 850 return matches->CountItems(); 851 } 852 else 853 return 0; 854 } 855 856 void sort_word_list( BList *matches, const char *reference ) 857 { 858 if( matches->CountItems() > 0 ) 859 { 860 BString srcWord( reference ); 861 gCmpKey = srcWord.String(); 862 matches->SortItems( (int (*)(const void *, const void *))word_cmp ); 863 } 864 } 865