1 /*
2 Open Tracker License
3
4 Terms and Conditions
5
6 Copyright (c) 1991-2001, Be Incorporated. All rights reserved.
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
14
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
28
29 BeMail(TM), Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
33 */
34
35
36 #include "Words.h"
37
38 #include <ctype.h>
39 #include <stdio.h>
40 #include <string.h>
41
42
43 enum {
44 FIND_WORD,
45 GET_WORD,
46 GET_FLAGS
47 };
48
49
50 /*
51 MAXMETAPH is the length of the Metaphone code.
52
53 Four is a good compromise value for English names. For comparing words
54 which are not names or for some non-English names, use a longer code
55 length for more precise matches.
56
57 The default here is 5.
58 */
59
60 #define MAXMETAPH 6
61
62
63 // Character coding array, A-Z
64 static char vsvfn[26] = { 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0,
65 2, 4, 4, 1, 0, 0, 0, 8, 0};
66
67
68 static const char* gCmpKey;
69
70
71 static int
word_cmp(BString ** firstArg,BString ** secondArg)72 word_cmp(BString** firstArg, BString** secondArg)
73 {
74 return word_match(gCmpKey, (*firstArg)->String()) - word_match(gCmpKey,
75 (*secondArg)->String());
76 }
77
78
Words(bool useMetaphone)79 Words::Words(bool useMetaphone)
80 :
81 fUseMetaphone(useMetaphone)
82 {
83
84 }
85
86
Words(BPositionIO * thes,bool useMetaphone)87 Words::Words(BPositionIO* thes, bool useMetaphone)
88 :
89 WIndex(thes),
90 fUseMetaphone(useMetaphone)
91 {
92
93 }
94
95
~Words(void)96 Words::~Words(void)
97 {
98
99 }
100
101
Words(const char * dataPath,const char * indexPath,bool useMetaphone)102 Words::Words(const char* dataPath, const char* indexPath, bool useMetaphone)
103 :
104 fUseMetaphone(useMetaphone)
105 {
106 if (!useMetaphone)
107 fEntrySize = sizeof(uint32);
108 SetTo(dataPath, indexPath);
109 }
110
111
112 status_t
BuildIndex(void)113 Words::BuildIndex(void)
114 {
115 // Parse the Words file...
116
117 // Buffer Stuff
118 char buffer[16384];
119 char *nptr, *eptr;
120 int64 blockOffset;
121 int32 blockSize;
122
123 // The Word Entry
124 WIndexEntry entry;
125 char entryName[256], *namePtr = entryName;
126 char suffixName[256];
127 char flags[32], *flagsPtr = flags;
128
129 // State Info
130 int32 state = FIND_WORD;
131
132 // Make sure we are at start of file
133 fDataFile->Seek(0, SEEK_SET);
134 entry.offset = -1;
135
136 // Read blocks from thes until eof
137 while (true) {
138 // Get next block
139 blockOffset = fDataFile->Position();
140 if ((blockSize = fDataFile->Read(buffer, 16384)) == 0)
141 break;
142
143 // parse block
144 for (nptr = buffer, eptr = buffer + blockSize; nptr < eptr; nptr++) {
145 // Looking for start of word?
146 if (state == FIND_WORD) {
147 // Is start of word?
148 if (isalpha(*nptr)) {
149 state = GET_WORD;
150 *namePtr++ = *nptr; // copy word
151 entry.offset = blockOffset + (nptr - buffer);
152 } else {
153 entry.offset++;
154 }
155 } else if ((*nptr == '\n') || (*nptr == '\r')) {
156 // End of word?
157
158 if (namePtr != entryName) {
159 // Add previous entry to word index
160 *namePtr = 0; // terminate word
161 *flagsPtr = 0; // terminate flags
162 NormalizeWord(entryName, entryName);
163 // Add base word
164 entry.key = GetKey(entryName);
165 AddItem(&entry);
166
167 // Add suffixed words if any
168 if (flagsPtr != flags) {
169 // printf("Base: %s, flags: %s\n", entryName, flags);
170 for (flagsPtr = flags; *flagsPtr != 0; flagsPtr++) {
171 if (suffix_word(suffixName, entryName, *flagsPtr)) {
172 // printf("Suffix: %s\n", suffixName);
173 entry.key = GetKey(suffixName);
174 AddItem(&entry);
175 }
176 }
177 }
178 }
179 // Init new entry
180 state = FIND_WORD;
181 namePtr = entryName;
182 flagsPtr = flags;
183 } else if (state == GET_WORD) {
184 // Start of flags?
185 if (*nptr == '/') {
186 *namePtr = 0; // terminate word
187 // printf("Found word: %s\n", entryName);
188
189 state = GET_FLAGS;
190 } else {
191 *namePtr++ = *nptr; // copy word
192 }
193 } else if (state == GET_FLAGS) // Are we getting the flags?
194 *flagsPtr++ = *nptr; // copy flag
195 } // End for (nptr = buffer, eptr = buffer + blockSize;
196 // nptr < eptr; nptr++, entry.size++)
197 } // End while (true)
198
199 SortItems();
200 return B_OK;
201 }
202
203
204 int32
GetKey(const char * s)205 Words::GetKey(const char* s)
206 {
207 if (fUseMetaphone) {
208 char Metaph[12];
209 const char *sPtr;
210 int32 key = 0;
211 int32 offset;
212 char c;
213
214 metaphone(s, Metaph, GENERATE);
215 // Compact Metaphone from 6 bytes to 4
216
217 // printf("%s -> %s: \n", s, Metaph);
218
219 for (sPtr = Metaph, offset = 25; *sPtr; sPtr++, offset -= 5) {
220 c = *sPtr - 'A';
221 // printf("%d,", int16(c));
222 key |= int32(c) << offset;
223 }
224 for (; offset >= 0; offset -= 5)
225 key |= int32(31) << offset;
226 // printf(": %ld\n", key);
227 return key;
228 } else {
229 return WIndex::GetKey(s);
230 }
231 }
232
233
234 // Macros to access the character coding array
235 #define vowel(x) (vsvfn[(x) - 'A'] & 1) // AEIOU
236 #define same(x) (vsvfn[(x) - 'A'] & 2) // FJLMNR
237 #define varson(x) (vsvfn[(x) - 'A'] & 4) // CGPST
238 #define frontv(x) (vsvfn[(x) - 'A'] & 8) // EIY
239 #define noghf(x) (vsvfn[(x) - 'A'] & 16) // BDH
240 #define NUL '\0'
241
242 /*
243 metaphone()
244
245 Arguments:
246 1 - The word to be converted to a metaphone code.
247 2 - A MAXMETAPH + 1 char field for the result.
248 3 - Function flag:
249 If 0: Compute the Metaphone code for the first argument,
250 then compare it to the Metaphone code passed in the second argument.
251 If 1: Compute the Metaphone code for the first argument,
252 then store the result in the area pointed to by the second argument.
253
254 Returns:
255 If function code is 0, returns Success_ for a match, else Error_.
256 If function code is 1, returns Success_.
257 */
258
259
260 bool
metaphone(const char * Word,char * Metaph,metaphlag Flag)261 metaphone(const char* Word, char* Metaph, metaphlag Flag)
262 {
263 char *n, *n_start, *n_end; // Pointers to string
264 char *metaph = NULL, *metaph_end; // Pointers to metaph
265 char ntrans[512]; // Word with uppercase letters
266 char newm[MAXMETAPH + 4]; // New metaph for comparison
267 int KSflag; // State flag for X translation
268
269 // Copy word to internal buffer, dropping non-alphabetic characters
270 // and converting to upper case.
271
272 for (n = ntrans + 1, n_end = ntrans + sizeof(ntrans) - 2;
273 *Word && n < n_end; ++Word) {
274 if (isalpha(*Word))
275 *n++ = toupper(*Word);
276 }
277
278 if (n == ntrans + 1)
279 return false; // Return if zero characters
280 else
281 n_end = n; // Set end of string pointer
282
283 // Pad with NULs, front and rear
284
285 *n++ = NUL;
286 *n = NUL;
287 n = ntrans;
288 *n++ = NUL;
289
290 // If doing comparison, redirect pointers
291
292 if (COMPARE == Flag) {
293 metaph = Metaph;
294 Metaph = newm;
295 }
296
297 // Check for PN, KN, GN, WR, WH, and X at start
298
299 switch (*n) {
300 case 'P':
301 case 'K':
302 case 'G':
303 if ('N' == *(n + 1))
304 *n++ = NUL;
305 break;
306
307 case 'A':
308 if ('E' == *(n + 1))
309 *n++ = NUL;
310 break;
311
312 case 'W':
313 if ('R' == *(n + 1)) {
314 *n++ = NUL;
315 } else if ('H' == *(n + 1)) {
316 *(n + 1) = *n;
317 *n++ = NUL;
318 }
319 break;
320
321 case 'X':
322 *n = 'S';
323 break;
324 }
325
326 // Now loop through the string, stopping at the end of the string
327 // or when the computed Metaphone code is MAXMETAPH characters long.
328
329 KSflag = false; // State flag for KStranslation
330
331 for (metaph_end = Metaph + MAXMETAPH, n_start = n;
332 n <= n_end && Metaph < metaph_end; ++n) {
333 if (KSflag) {
334 KSflag = false;
335 *Metaph++ = *n;
336 } else {
337 // Drop duplicates except for CC
338
339 if (*(n - 1) == *n && *n != 'C')
340 continue;
341
342 // Check for F J L M N R or first letter vowel
343
344 if (same(*n) || (n == n_start && vowel(*n))) {
345 *Metaph++ = *n;
346 } else {
347 switch (*n) {
348 case 'B':
349 if (n < n_end || *(n - 1) != 'M')
350 *Metaph++ = *n;
351 break;
352
353 case 'C':
354 if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
355 if ('I' == *(n + 1) && 'A' == *(n + 2))
356 *Metaph++ = 'X';
357 else if (frontv(*(n + 1)))
358 *Metaph++ = 'S';
359 else if ('H' == *(n + 1)) {
360 *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
361 || 'S' == *(n - 1)) ? 'K' : 'X';
362 } else {
363 *Metaph++ = 'K';
364 }
365 }
366 break;
367
368 case 'D':
369 *Metaph++ = ('G' == *(n + 1) && frontv(*(n + 2)))
370 ? 'J' : 'T';
371 break;
372
373 case 'G':
374 if ((*(n + 1) != 'H' || vowel(*(n + 2)))
375 && (*(n + 1) != 'N' || ((n + 1) < n_end
376 && (*(n + 2) != 'E' || *(n + 3) != 'D')))
377 && (*(n - 1) != 'D' || !frontv(*(n + 1)))) {
378 *Metaph++ = (frontv(*(n + 1))
379 && *(n + 2) != 'G') ? 'J' : 'K';
380 } else if ('H' == *(n + 1) && !noghf(*(n - 3))
381 && *(n - 4) != 'H') {
382 *Metaph++ = 'F';
383 }
384 break;
385
386 case 'H':
387 if (!varson(*(n - 1))
388 && (!vowel(*(n - 1)) || vowel(*(n + 1)))) {
389 *Metaph++ = 'H';
390 }
391 break;
392
393 case 'K':
394 if (*(n - 1) != 'C')
395 *Metaph++ = 'K';
396 break;
397
398 case 'P':
399 *Metaph++ = ('H' == *(n + 1)) ? 'F' : 'P';
400 break;
401
402 case 'Q':
403 *Metaph++ = 'K';
404 break;
405
406 case 'S':
407 *Metaph++ = ('H' == *(n + 1) || ('I' == *(n + 1)
408 && ('O' == *(n + 2) || 'A' == *(n + 2))))
409 ? 'X' : 'S';
410 break;
411
412 case 'T':
413 if ('I' == *(n + 1)
414 && ('O' == *(n + 2) || 'A' == *(n + 2))) {
415 *Metaph++ = 'X';
416 } else if ('H' == *(n + 1)) {
417 *Metaph++ = 'O';
418 } else if (*(n + 1) != 'C' || *(n + 2) != 'H') {
419 *Metaph++ = 'T';
420 }
421 break;
422
423 case 'V':
424 *Metaph++ = 'F';
425 break;
426
427 case 'W':
428 case 'Y':
429 if (vowel(*(n + 1)))
430 *Metaph++ = *n;
431 break;
432
433 case 'X':
434 if (n == n_start) {
435 *Metaph++ = 'S';
436 } else {
437 *Metaph++ = 'K';
438 KSflag = true;
439 }
440 break;
441
442 case 'Z':
443 *Metaph++ = 'S';
444 break;
445 }
446 }
447 }
448
449 // Compare new Metaphone code with old
450 if (COMPARE == Flag
451 && *(Metaph - 1) != metaph[(Metaph - newm) - 1]) {
452 return false;
453 }
454 }
455
456 // If comparing, check if Metaphone codes were equal in length
457 if (COMPARE == Flag && metaph[Metaph - newm])
458 return false;
459
460 *Metaph = NUL;
461 return true;
462 }
463
464
465 int
word_match(const char * reference,const char * test)466 word_match(const char* reference, const char* test)
467 {
468 const char *s1, *s2;
469 int32 x = 0;
470 char c1, c2;
471 s1 = test;
472 s2 = reference;
473
474 bool a, b;
475
476 while (*s2 || *s1) {
477 c1 = tolower(*s1);
478 c2 = tolower(*s2);
479
480 if (*s2 && *s1) {
481 if (c1 != c2) {
482 a = (tolower(s1[1]) == c2);
483 b = (tolower(s2[1]) == c1);
484 // Reversed pair
485 if (a && b) {
486 x += 1;
487 s1++;
488 s2++;
489 }
490 // Extra character
491 if (a) {
492 x += 1;
493 s1++;
494 }
495 // Missing Character
496 else if (b) {
497 x += 1;
498 s2++;
499 }
500 // Equivalent Character
501 else if (vsvfn[(unsigned)c1] == vsvfn[(unsigned)c2])
502 x++;
503 // Unrelated Character
504 else
505 x += 3;
506 }
507 } else {
508 x += 1;
509 }
510
511 if (*s2)
512 s2++;
513 if (*s1)
514 s1++;
515 }
516
517 return x;
518 }
519
520
521 int32
suffix_word(char * dst,const char * src,char flag)522 suffix_word(char* dst, const char* src, char flag)
523 {
524 char* end;
525
526 end = stpcpy(dst, src);
527 flag = toupper(flag);
528 switch (flag) {
529 case 'V':
530 switch (end[-1]) {
531 case 'e':
532 end = stpcpy(end - 1, "ive");
533 break;
534 default:
535 end = stpcpy(end, "ive");
536 break;
537 }
538 break;
539 case 'N':
540 switch (end[-1]) {
541 case 'e':
542 end = stpcpy(end - 1, "ion");
543 break;
544 case 'y':
545 end = stpcpy(end - 1, "ication");
546 break;
547 default:
548 end = stpcpy(end, "en");
549 break;
550 }
551 break;
552 case 'X':
553 switch (end[-1]) {
554 case 'e':
555 end = stpcpy(end - 1, "ions");
556 break;
557 case 'y':
558 end = stpcpy(end - 1, "ications");
559 break;
560 default:
561 end = stpcpy(end, "ens");
562 break;
563 }
564 break;
565 case 'H':
566 switch (end[-1]) {
567 case 'y':
568 end = stpcpy(end - 1, "ieth");
569 break;
570 default:
571 end = stpcpy(end, "th");
572 break;
573 }
574 break;
575 case 'Y':
576 end = stpcpy(end, "ly");
577 break;
578 case 'G':
579 switch (end[-1]) {
580 case 'e':
581 end = stpcpy(end - 1, "ing");
582 break;
583 default:
584 end = stpcpy(end, "ing");
585 break;
586 }
587 break;
588 case 'J':
589 switch (end[-1]) {
590 case 'e':
591 end = stpcpy(end - 1, "ings");
592 break;
593 default:
594 end = stpcpy(end, "ings");
595 break;
596 }
597 break;
598 case 'D':
599 switch (end[-1]) {
600 case 'e':
601 end = stpcpy(end - 1, "ed");
602 break;
603 case 'y':
604 if (!strchr("aeiou", end[-2])) {
605 end = stpcpy(end - 1, "ied");
606 break;
607 }
608 // Fall through
609 default:
610 end = stpcpy(end, "ed");
611 break;
612 }
613 break;
614 case 'T':
615 switch (end[-1]) {
616 case 'e':
617 end = stpcpy(end - 1, "est");
618 break;
619 case 'y':
620 if (!strchr("aeiou", end[-2])) {
621 end = stpcpy(end - 1, "iest");
622 break;
623 }
624 // Fall through
625 default:
626 end = stpcpy(end, "est");
627 break;
628 }
629 break;
630 case 'R':
631 switch (end[-1]) {
632 case 'e':
633 end = stpcpy(end - 1, "er");
634 break;
635 case 'y':
636 if (!strchr("aeiou", end[-2])) {
637 end = stpcpy(end - 1, "ier");
638 break;
639 }
640 // Fall through
641 default:
642 end = stpcpy(end, "er");
643 break;
644 }
645 break;
646 case 'Z':
647 switch (end[-1]) {
648 case 'e':
649 end = stpcpy(end - 1, "ers");
650 break;
651 case 'y':
652 if (!strchr("aeiou", end[-2])) {
653 end = stpcpy(end - 1, "iers");
654 break;
655 }
656 // Fall through
657 default:
658 end = stpcpy(end, "ers");
659 break;
660 }
661 break;
662 case 'S':
663 switch (end[-1]) {
664 case 's':
665 case 'x':
666 case 'z':
667 case 'h':
668 end = stpcpy(end, "es");
669 break;
670 case 'y':
671 if (!strchr("aeiou", end[-2])) {
672 end = stpcpy(end - 1, "ies");
673 break;
674 }
675 // Fall through
676 default:
677 end = stpcpy(end, "s");
678 break;
679 }
680 break;
681 case 'P':
682 switch (end[-1]) {
683 case 'y':
684 if (!strchr("aeiou", end[-2])) {
685 end = stpcpy(end - 1, "iness");
686 break;
687 }
688 // Fall through
689 default:
690 end = stpcpy(end, "ness");
691 break;
692 }
693 break;
694 case 'M':
695 end = stpcpy(end, "'s");
696 break;
697 default:
698 return 0;
699 }
700 return end - dst;
701 }
702
703
704 int32
FindBestMatches(BList * matches,const char * s)705 Words::FindBestMatches(BList* matches, const char* s)
706 {
707 int32 index;
708 // printf("*** Looking for %s: ***\n", s);
709
710 if ((index = FindFirst(s)) >= 0) {
711 BString srcWord(s);
712 FileEntry* entry;
713 WIndexEntry* indexEntry;
714
715 int32 key = (ItemAt(index))->key;
716 int32 suffixLength;
717 char word[128], suffixWord[128];
718 const char *src, *testWord;
719 const char *suffixFlags;
720 char *dst;
721
722 gCmpKey = srcWord.String();
723
724 uint8 hashTable[32];
725 uint8 hashValue, highHash, lowHash;
726 for (int32 i = 0; i < 32; i++)
727 hashTable[i] = 0;
728
729 do {
730 indexEntry = ItemAt(index);
731 // Hash the entry offset; we use this to make sure we don't add
732 // the same word file entry twice;
733 // It is possible for the same entry in the words file to have
734 // multiple entries in the index.
735
736 hashValue = indexEntry->offset % 256;
737 highHash = hashValue >> 3;
738 lowHash = 0x01 << (hashValue & 0x07);
739
740 // printf("Testing Entry: %ld: hash=%d, highHash=%d, lowHash=%d\n",
741 // indexEntry->offset, hashValue, (uint16)highHash,
742 // (uint16)lowHash);
743
744 // Has this entry offset been seen before?
745 if (!(hashTable[highHash] & lowHash)) {
746 // printf("New Entry\n");
747 hashTable[highHash] |= lowHash; // Mark this offset so we don't add it twice
748
749 entry = GetEntry(index);
750 src = entry->String();
751 while (*src && !isalpha(*src))
752 src++;
753 dst = word;
754 while (*src && *src != '/')
755 *dst++ = *src++;
756 *dst = 0;
757 if (*src == '/')
758 suffixFlags = src + 1;
759 else
760 suffixFlags = src;
761
762 // printf("Base Word: %s\n", word);
763 // printf("Flags: %s\n", suffixFlags);
764 testWord = word; // Test the base word first
765 do {
766 // printf("Testing: %s\n", testWord);
767 // Does this word match the key
768 if ((GetKey(testWord) == key)
769 // And does it look close enough to the compare key?
770 // word_match(gCmpKey, testWord)
771 // <= int32((strlen(gCmpKey)-1)/2))
772 && word_match(gCmpKey, testWord)
773 <= int32(float(strlen(gCmpKey)-1)*.75)) {
774 // printf("Added: %s\n", testWord);
775 matches->AddItem(new BString(testWord));
776 }
777
778 // If suffix, transform and test
779 if (*suffixFlags) {
780 // Repeat until valid suffix found or end is reached
781 suffixLength = 0;
782 while (*suffixFlags
783 && !(suffixLength = suffix_word(suffixWord,
784 word, *suffixFlags++))) {}
785 if (suffixLength)
786 testWord = suffixWord;
787 else
788 testWord = NULL;
789 } else {
790 testWord = NULL;
791 }
792 } while (testWord);
793 delete entry;
794 }
795 // else printf("Redundant entry\n");
796
797 index++;
798 } while (key == (ItemAt(index))->key);
799
800 return matches->CountItems();
801 } else {
802 return 0;
803 }
804 }
805
806
807 void
sort_word_list(BList * matches,const char * reference)808 sort_word_list(BList* matches, const char* reference)
809 {
810 if (matches->CountItems() > 0) {
811 BString srcWord(reference);
812 gCmpKey = srcWord.String();
813 matches->SortItems((int(*)(const void*, const void*))word_cmp);
814 }
815 }
816
817