xref: /webtrees/app/Soundex.php (revision 6930e9b42b9925bfc3a874fc2aaa59aabd0d2418)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2022 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees;
21
22/**
23 * Phonetic matching of strings.
24 */
25class Soundex
26{
27    // Determine the Daitch–Mokotoff Soundex code for a word
28    // Original implementation by Gerry Kroll, and analysis by Meliza Amity
29
30    // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
31    private const MAXCHAR = 7;
32
33    /**
34     * Name transformation arrays.
35     * Used to transform the Name string to simplify the "sounds like" table.
36     * This is especially useful in Hebrew.
37     *
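     * Each array entry defines the "from" and "to" arguments of a
     * str_replace($from, $to, $text) call that performs the desired transformation.
     * NOTE(review): because the rules are applied as literal strings, the "$" in
     * some entries is not an end-of-string anchor - confirm this is intended.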
40     *
41     * Note about the use of "\x01":
42     * This code, which can’t legitimately occur in the kind of text we're dealing with,
43     * is used as a place-holder so that conditional string replacements can be done.
44     */
45    private const TRANSFORM_NAMES = [
46        // Force Yiddish ligatures to be treated as separate letters
47        ['װ', 'וו'],
48        ['ײ', 'יי'],
49        ['ױ', 'וי'],
50        ['בו', 'בע'],
51        ['פו', 'פע'],
52        ['ומ', 'עמ'],
53        ['ום', 'עם'],
54        ['ונ', 'ענ'],
55        ['ון', 'ען'],
56        ['וו', 'ב'],
57        ["\x01", ''],
58        ['ייה$', "\x01ה"],
59        ['ייע$', "\x01ע"],
60        ['יי', 'ע'],
61        ["\x01", 'יי'],
62    ];
63
64    /**
65     * The DM sound coding table is organized this way:
66     * key: a variable-length string that corresponds to the UTF-8 character sequence
67     * represented by the table entry. Currently, that string can be up to 7
     * bytes long. This maximum length is defined by the class constant
     * MAXCHAR.
70     *
71     * value: an array as follows:
72     * [0]:  zero if not a vowel
73     * [1]:  sound value when this string is at the beginning of the word
74     * [2]:  sound value when this string is followed by a vowel
75     * [3]:  sound value for other cases
76     * [1],[2],[3] can be repeated several times to create branches in the code
77     * an empty sound value means "ignore in this state"
78     */
79    private const DM_SOUNDS = [
80        'A'       => ['1', '0', '', ''],
81        'À'       => ['1', '0', '', ''],
82        'Á'       => ['1', '0', '', ''],
83        'Â'       => ['1', '0', '', ''],
84        'Ã'       => ['1', '0', '', ''],
85        'Ä'       => ['1', '0', '1', '', '0', '', ''],
86        'Å'       => ['1', '0', '', ''],
87        'Ă'       => ['1', '0', '', ''],
88        'Ą'       => ['1', '', '', '', '', '', '6'],
89        'Ạ'       => ['1', '0', '', ''],
90        'Ả'       => ['1', '0', '', ''],
91        'Ấ'       => ['1', '0', '', ''],
92        'Ầ'       => ['1', '0', '', ''],
93        'Ẩ'       => ['1', '0', '', ''],
94        'Ẫ'       => ['1', '0', '', ''],
95        'Ậ'       => ['1', '0', '', ''],
96        'Ắ'       => ['1', '0', '', ''],
97        'Ằ'       => ['1', '0', '', ''],
98        'Ẳ'       => ['1', '0', '', ''],
99        'Ẵ'       => ['1', '0', '', ''],
100        'Ặ'       => ['1', '0', '', ''],
101        'AE'      => ['1', '0', '1', ''],
102        'Æ'       => ['1', '0', '1', ''],
103        'AI'      => ['1', '0', '1', ''],
104        'AJ'      => ['1', '0', '1', ''],
105        'AU'      => ['1', '0', '7', ''],
106        'AV'      => ['1', '0', '7', '', '7', '7', '7'],
107        'ÄU'      => ['1', '0', '1', ''],
108        'AY'      => ['1', '0', '1', ''],
109        'B'       => ['0', '7', '7', '7'],
110        'C'       => ['0', '5', '5', '5', '34', '4', '4'],
111        'Ć'       => ['0', '4', '4', '4'],
112        'Č'       => ['0', '4', '4', '4'],
113        'Ç'       => ['0', '4', '4', '4'],
114        'CH'      => ['0', '5', '5', '5', '34', '4', '4'],
115        'CHS'     => ['0', '5', '54', '54'],
116        'CK'      => ['0', '5', '5', '5', '45', '45', '45'],
117        'CCS'     => ['0', '4', '4', '4'],
118        'CS'      => ['0', '4', '4', '4'],
119        'CSZ'     => ['0', '4', '4', '4'],
120        'CZ'      => ['0', '4', '4', '4'],
121        'CZS'     => ['0', '4', '4', '4'],
122        'D'       => ['0', '3', '3', '3'],
123        'Ď'       => ['0', '3', '3', '3'],
124        'Đ'       => ['0', '3', '3', '3'],
125        'DRS'     => ['0', '4', '4', '4'],
126        'DRZ'     => ['0', '4', '4', '4'],
127        'DS'      => ['0', '4', '4', '4'],
128        'DSH'     => ['0', '4', '4', '4'],
129        'DSZ'     => ['0', '4', '4', '4'],
130        'DT'      => ['0', '3', '3', '3'],
131        'DDZ'     => ['0', '4', '4', '4'],
132        'DDZS'    => ['0', '4', '4', '4'],
133        'DZ'      => ['0', '4', '4', '4'],
134        'DŹ'      => ['0', '4', '4', '4'],
135        'DŻ'      => ['0', '4', '4', '4'],
136        'DZH'     => ['0', '4', '4', '4'],
137        'DZS'     => ['0', '4', '4', '4'],
138        'E'       => ['1', '0', '', ''],
139        'È'       => ['1', '0', '', ''],
140        'É'       => ['1', '0', '', ''],
141        'Ê'       => ['1', '0', '', ''],
142        'Ë'       => ['1', '0', '', ''],
143        'Ĕ'       => ['1', '0', '', ''],
144        'Ė'       => ['1', '0', '', ''],
145        'Ę'       => ['1', '', '', '6', '', '', ''],
146        'Ẹ'       => ['1', '0', '', ''],
147        'Ẻ'       => ['1', '0', '', ''],
148        'Ẽ'       => ['1', '0', '', ''],
149        'Ế'       => ['1', '0', '', ''],
150        'Ề'       => ['1', '0', '', ''],
151        'Ể'       => ['1', '0', '', ''],
152        'Ễ'       => ['1', '0', '', ''],
153        'Ệ'       => ['1', '0', '', ''],
154        'EAU'     => ['1', '0', '', ''],
155        'EI'      => ['1', '0', '1', ''],
156        'EJ'      => ['1', '0', '1', ''],
157        'EU'      => ['1', '1', '1', ''],
158        'EY'      => ['1', '0', '1', ''],
159        'F'       => ['0', '7', '7', '7'],
160        'FB'      => ['0', '7', '7', '7'],
161        'G'       => ['0', '5', '5', '5', '34', '4', '4'],
162        'Ğ'       => ['0', '', '', ''],
163        'GGY'     => ['0', '5', '5', '5'],
164        'GY'      => ['0', '5', '5', '5'],
165        'H'       => ['0', '5', '5', '', '5', '5', '5'],
166        'I'       => ['1', '0', '', ''],
167        'Ì'       => ['1', '0', '', ''],
168        'Í'       => ['1', '0', '', ''],
169        'Î'       => ['1', '0', '', ''],
170        'Ï'       => ['1', '0', '', ''],
171        'Ĩ'       => ['1', '0', '', ''],
172        'Į'       => ['1', '0', '', ''],
173        'İ'       => ['1', '0', '', ''],
174        'Ỉ'       => ['1', '0', '', ''],
175        'Ị'       => ['1', '0', '', ''],
176        'IA'      => ['1', '1', '', ''],
177        'IE'      => ['1', '1', '', ''],
178        'IO'      => ['1', '1', '', ''],
179        'IU'      => ['1', '1', '', ''],
180        'J'       => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
181        'K'       => ['0', '5', '5', '5'],
182        'KH'      => ['0', '5', '5', '5'],
183        'KS'      => ['0', '5', '54', '54'],
184        'L'       => ['0', '8', '8', '8'],
185        'Ľ'       => ['0', '8', '8', '8'],
186        'Ĺ'       => ['0', '8', '8', '8'],
187        'Ł'       => ['0', '7', '7', '7', '8', '8', '8'],
188        'LL'      => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
189        'LLY'     => ['0', '8', '8', '8', '1', '8', '8'],
190        'LY'      => ['0', '8', '8', '8', '1', '8', '8'],
191        'M'       => ['0', '6', '6', '6'],
192        'MĔ'      => ['0', '66', '66', '66'],
193        'MN'      => ['0', '66', '66', '66'],
194        'N'       => ['0', '6', '6', '6'],
195        'Ń'       => ['0', '6', '6', '6'],
196        'Ň'       => ['0', '6', '6', '6'],
197        'Ñ'       => ['0', '6', '6', '6'],
198        'NM'      => ['0', '66', '66', '66'],
199        'O'       => ['1', '0', '', ''],
200        'Ò'       => ['1', '0', '', ''],
201        'Ó'       => ['1', '0', '', ''],
202        'Ô'       => ['1', '0', '', ''],
203        'Õ'       => ['1', '0', '', ''],
204        'Ö'       => ['1', '0', '', ''],
205        'Ø'       => ['1', '0', '', ''],
206        'Ő'       => ['1', '0', '', ''],
207        'Œ'       => ['1', '0', '', ''],
208        'Ơ'       => ['1', '0', '', ''],
209        'Ọ'       => ['1', '0', '', ''],
210        'Ỏ'       => ['1', '0', '', ''],
211        'Ố'       => ['1', '0', '', ''],
212        'Ồ'       => ['1', '0', '', ''],
213        'Ổ'       => ['1', '0', '', ''],
214        'Ỗ'       => ['1', '0', '', ''],
215        'Ộ'       => ['1', '0', '', ''],
216        'Ớ'       => ['1', '0', '', ''],
217        'Ờ'       => ['1', '0', '', ''],
218        'Ở'       => ['1', '0', '', ''],
219        'Ỡ'       => ['1', '0', '', ''],
220        'Ợ'       => ['1', '0', '', ''],
221        'OE'      => ['1', '0', '', ''],
222        'OI'      => ['1', '0', '1', ''],
223        'OJ'      => ['1', '0', '1', ''],
224        'OU'      => ['1', '0', '', ''],
225        'OY'      => ['1', '0', '1', ''],
226        'P'       => ['0', '7', '7', '7'],
227        'PF'      => ['0', '7', '7', '7'],
228        'PH'      => ['0', '7', '7', '7'],
229        'Q'       => ['0', '5', '5', '5'],
230        'R'       => ['0', '9', '9', '9'],
231        'Ř'       => ['0', '4', '4', '4'],
232        'RS'      => ['0', '4', '4', '4', '94', '94', '94'],
233        'RZ'      => ['0', '4', '4', '4', '94', '94', '94'],
234        'S'       => ['0', '4', '4', '4'],
235        'Ś'       => ['0', '4', '4', '4'],
236        'Š'       => ['0', '4', '4', '4'],
237        'Ş'       => ['0', '4', '4', '4'],
238        'SC'      => ['0', '2', '4', '4'],
239        'ŠČ'      => ['0', '2', '4', '4'],
240        'SCH'     => ['0', '4', '4', '4'],
241        'SCHD'    => ['0', '2', '43', '43'],
242        'SCHT'    => ['0', '2', '43', '43'],
243        'SCHTCH'  => ['0', '2', '4', '4'],
244        'SCHTSCH' => ['0', '2', '4', '4'],
245        'SCHTSH'  => ['0', '2', '4', '4'],
246        'SD'      => ['0', '2', '43', '43'],
247        'SH'      => ['0', '4', '4', '4'],
248        'SHCH'    => ['0', '2', '4', '4'],
249        'SHD'     => ['0', '2', '43', '43'],
250        'SHT'     => ['0', '2', '43', '43'],
251        'SHTCH'   => ['0', '2', '4', '4'],
252        'SHTSH'   => ['0', '2', '4', '4'],
253        'ß'       => ['0', '', '4', '4'],
254        'ST'      => ['0', '2', '43', '43'],
255        'STCH'    => ['0', '2', '4', '4'],
256        'STRS'    => ['0', '2', '4', '4'],
257        'STRZ'    => ['0', '2', '4', '4'],
258        'STSCH'   => ['0', '2', '4', '4'],
259        'STSH'    => ['0', '2', '4', '4'],
260        'SSZ'     => ['0', '4', '4', '4'],
261        'SZ'      => ['0', '4', '4', '4'],
262        'SZCS'    => ['0', '2', '4', '4'],
263        'SZCZ'    => ['0', '2', '4', '4'],
264        'SZD'     => ['0', '2', '43', '43'],
265        'SZT'     => ['0', '2', '43', '43'],
266        'T'       => ['0', '3', '3', '3'],
267        'Ť'       => ['0', '3', '3', '3'],
268        'Ţ'       => ['0', '3', '3', '3', '4', '4', '4'],
269        'TC'      => ['0', '4', '4', '4'],
270        'TCH'     => ['0', '4', '4', '4'],
271        'TH'      => ['0', '3', '3', '3'],
272        'TRS'     => ['0', '4', '4', '4'],
273        'TRZ'     => ['0', '4', '4', '4'],
274        'TS'      => ['0', '4', '4', '4'],
275        'TSCH'    => ['0', '4', '4', '4'],
276        'TSH'     => ['0', '4', '4', '4'],
277        'TSZ'     => ['0', '4', '4', '4'],
278        'TTCH'    => ['0', '4', '4', '4'],
279        'TTS'     => ['0', '4', '4', '4'],
280        'TTSCH'   => ['0', '4', '4', '4'],
281        'TTSZ'    => ['0', '4', '4', '4'],
282        'TTZ'     => ['0', '4', '4', '4'],
283        'TZ'      => ['0', '4', '4', '4'],
284        'TZS'     => ['0', '4', '4', '4'],
285        'U'       => ['1', '0', '', ''],
286        'Ù'       => ['1', '0', '', ''],
287        'Ú'       => ['1', '0', '', ''],
288        'Û'       => ['1', '0', '', ''],
289        'Ü'       => ['1', '0', '', ''],
290        'Ũ'       => ['1', '0', '', ''],
291        'Ū'       => ['1', '0', '', ''],
292        'Ů'       => ['1', '0', '', ''],
293        'Ű'       => ['1', '0', '', ''],
294        'Ų'       => ['1', '0', '', ''],
295        'Ư'       => ['1', '0', '', ''],
296        'Ụ'       => ['1', '0', '', ''],
297        'Ủ'       => ['1', '0', '', ''],
298        'Ứ'       => ['1', '0', '', ''],
299        'Ừ'       => ['1', '0', '', ''],
300        'Ử'       => ['1', '0', '', ''],
301        'Ữ'       => ['1', '0', '', ''],
302        'Ự'       => ['1', '0', '', ''],
303        'UE'      => ['1', '0', '', ''],
304        'UI'      => ['1', '0', '1', ''],
305        'UJ'      => ['1', '0', '1', ''],
306        'UY'      => ['1', '0', '1', ''],
307        'UW'      => ['1', '0', '1', '', '0', '7', '7'],
308        'V'       => ['0', '7', '7', '7'],
309        'W'       => ['0', '7', '7', '7'],
310        'X'       => ['0', '5', '54', '54'],
311        'Y'       => ['1', '1', '', ''],
312        'Ý'       => ['1', '1', '', ''],
313        'Ỳ'       => ['1', '1', '', ''],
314        'Ỵ'       => ['1', '1', '', ''],
315        'Ỷ'       => ['1', '1', '', ''],
316        'Ỹ'       => ['1', '1', '', ''],
317        'Z'       => ['0', '4', '4', '4'],
318        'Ź'       => ['0', '4', '4', '4'],
319        'Ż'       => ['0', '4', '4', '4'],
320        'Ž'       => ['0', '4', '4', '4'],
321        'ZD'      => ['0', '2', '43', '43'],
322        'ZDZ'     => ['0', '2', '4', '4'],
323        'ZDZH'    => ['0', '2', '4', '4'],
324        'ZH'      => ['0', '4', '4', '4'],
325        'ZHD'     => ['0', '2', '43', '43'],
326        'ZHDZH'   => ['0', '2', '4', '4'],
327        'ZS'      => ['0', '4', '4', '4'],
328        'ZSCH'    => ['0', '4', '4', '4'],
329        'ZSH'     => ['0', '4', '4', '4'],
330        'ZZS'     => ['0', '4', '4', '4'],
331        // Cyrillic alphabet
332        'А'       => ['1', '0', '', ''],
333        'Б'       => ['0', '7', '7', '7'],
334        'В'       => ['0', '7', '7', '7'],
335        'Г'       => ['0', '5', '5', '5'],
336        'Д'       => ['0', '3', '3', '3'],
337        'ДЗ'      => ['0', '4', '4', '4'],
338        'Е'       => ['1', '0', '', ''],
339        'Ё'       => ['1', '0', '', ''],
340        'Ж'       => ['0', '4', '4', '4'],
341        'З'       => ['0', '4', '4', '4'],
342        'И'       => ['1', '0', '', ''],
343        'Й'       => ['1', '1', '', '', '4', '4', '4'],
344        'К'       => ['0', '5', '5', '5'],
345        'Л'       => ['0', '8', '8', '8'],
346        'М'       => ['0', '6', '6', '6'],
347        'Н'       => ['0', '6', '6', '6'],
348        'О'       => ['1', '0', '', ''],
349        'П'       => ['0', '7', '7', '7'],
350        'Р'       => ['0', '9', '9', '9'],
351        'РЖ'      => ['0', '4', '4', '4'],
352        'С'       => ['0', '4', '4', '4'],
353        'Т'       => ['0', '3', '3', '3'],
354        'У'       => ['1', '0', '', ''],
355        'Ф'       => ['0', '7', '7', '7'],
356        'Х'       => ['0', '5', '5', '5'],
357        'Ц'       => ['0', '4', '4', '4'],
358        'Ч'       => ['0', '4', '4', '4'],
359        'Ш'       => ['0', '4', '4', '4'],
360        'Щ'       => ['0', '2', '4', '4'],
361        'Ъ'       => ['0', '', '', ''],
362        'Ы'       => ['0', '1', '', ''],
363        'Ь'       => ['0', '', '', ''],
364        'Э'       => ['1', '0', '', ''],
365        'Ю'       => ['0', '1', '', ''],
366        'Я'       => ['0', '1', '', ''],
367        // Greek alphabet
368        'Α'       => ['1', '0', '', ''],
369        'Ά'       => ['1', '0', '', ''],
370        'ΑΙ'      => ['1', '0', '1', ''],
371        'ΑΥ'      => ['1', '0', '1', ''],
372        'Β'       => ['0', '7', '7', '7'],
373        'Γ'       => ['0', '5', '5', '5'],
374        'Δ'       => ['0', '3', '3', '3'],
375        'Ε'       => ['1', '0', '', ''],
376        'Έ'       => ['1', '0', '', ''],
377        'ΕΙ'      => ['1', '0', '1', ''],
378        'ΕΥ'      => ['1', '1', '1', ''],
379        'Ζ'       => ['0', '4', '4', '4'],
380        'Η'       => ['1', '0', '', ''],
381        'Ή'       => ['1', '0', '', ''],
382        'Θ'       => ['0', '3', '3', '3'],
383        'Ι'       => ['1', '0', '', ''],
384        'Ί'       => ['1', '0', '', ''],
385        'Ϊ'       => ['1', '0', '', ''],
386        'ΐ'       => ['1', '0', '', ''],
387        'Κ'       => ['0', '5', '5', '5'],
388        'Λ'       => ['0', '8', '8', '8'],
389        'Μ'       => ['0', '6', '6', '6'],
390        'ΜΠ'      => ['0', '7', '7', '7'],
391        'Ν'       => ['0', '6', '6', '6'],
392        'ΝΤ'      => ['0', '3', '3', '3'],
393        'Ξ'       => ['0', '5', '54', '54'],
394        'Ο'       => ['1', '0', '', ''],
395        'Ό'       => ['1', '0', '', ''],
396        'ΟΙ'      => ['1', '0', '1', ''],
397        'ΟΥ'      => ['1', '0', '1', ''],
398        'Π'       => ['0', '7', '7', '7'],
399        'Ρ'       => ['0', '9', '9', '9'],
400        'Σ'       => ['0', '4', '4', '4'],
401        'ς'       => ['0', '', '', '4'],
402        'Τ'       => ['0', '3', '3', '3'],
403        'ΤΖ'      => ['0', '4', '4', '4'],
404        'ΤΣ'      => ['0', '4', '4', '4'],
405        'Υ'       => ['1', '1', '', ''],
406        'Ύ'       => ['1', '1', '', ''],
407        'Ϋ'       => ['1', '1', '', ''],
408        'ΰ'       => ['1', '1', '', ''],
409        'ΥΚ'      => ['1', '5', '5', '5'],
410        'ΥΥ'      => ['1', '65', '65', '65'],
411        'Φ'       => ['0', '7', '7', '7'],
412        'Χ'       => ['0', '5', '5', '5'],
413        'Ψ'       => ['0', '7', '7', '7'],
414        'Ω'       => ['1', '0', '', ''],
415        'Ώ'       => ['1', '0', '', ''],
416        // Hebrew alphabet
417        'א'       => ['1', '0', '', ''],
418        'או'      => ['1', '0', '7', ''],
419        'אג'      => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
420        'בב'      => ['0', '7', '7', '7', '77', '77', '77'],
421        'ב'       => ['0', '7', '7', '7'],
422        'גג'      => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
423        'גד'      => ['0', '43', '43', '43', '53', '53', '53'],
424        'גה'      => ['0', '45', '45', '45', '55', '55', '55'],
425        'גז'      => ['0', '44', '44', '44', '45', '45', '45'],
426        'גח'      => ['0', '45', '45', '45', '55', '55', '55'],
427        'גכ'      => ['0', '45', '45', '45', '55', '55', '55'],
428        'גך'      => ['0', '45', '45', '45', '55', '55', '55'],
429        'גצ'      => ['0', '44', '44', '44', '45', '45', '45'],
430        'גץ'      => ['0', '44', '44', '44', '45', '45', '45'],
431        'גק'      => ['0', '45', '45', '45', '54', '54', '54'],
432        'גש'      => ['0', '44', '44', '44', '54', '54', '54'],
433        'גת'      => ['0', '43', '43', '43', '53', '53', '53'],
434        'ג'       => ['0', '4', '4', '4', '5', '5', '5'],
435        'דז'      => ['0', '4', '4', '4'],
436        'דד'      => ['0', '3', '3', '3', '33', '33', '33'],
437        'דט'      => ['0', '33', '33', '33'],
438        'דש'      => ['0', '4', '4', '4'],
439        'דצ'      => ['0', '4', '4', '4'],
440        'דץ'      => ['0', '4', '4', '4'],
441        'ד'       => ['0', '3', '3', '3'],
442        'הג'      => ['0', '54', '54', '54', '55', '55', '55'],
443        'הכ'      => ['0', '55', '55', '55'],
444        'הח'      => ['0', '55', '55', '55'],
445        'הק'      => ['0', '55', '55', '55', '5', '5', '5'],
446        'הה'      => ['0', '5', '5', '', '55', '55', ''],
447        'ה'       => ['0', '5', '5', ''],
448        'וי'      => ['1', '', '', '', '7', '7', '7'],
449        'ו'       => ['1', '7', '7', '7', '7', '', ''],
450        'וו'      => ['1', '7', '7', '7', '7', '', ''],
451        'וופ'     => ['1', '7', '7', '7', '77', '77', '77'],
452        'זש'      => ['0', '4', '4', '4', '44', '44', '44'],
453        'זדז'     => ['0', '2', '4', '4'],
454        'ז'       => ['0', '4', '4', '4'],
455        'זג'      => ['0', '44', '44', '44', '45', '45', '45'],
456        'זז'      => ['0', '4', '4', '4', '44', '44', '44'],
457        'זס'      => ['0', '44', '44', '44'],
458        'זצ'      => ['0', '44', '44', '44'],
459        'זץ'      => ['0', '44', '44', '44'],
460        'חג'      => ['0', '54', '54', '54', '53', '53', '53'],
461        'חח'      => ['0', '5', '5', '5', '55', '55', '55'],
462        'חק'      => ['0', '55', '55', '55', '5', '5', '5'],
463        'חכ'      => ['0', '45', '45', '45', '55', '55', '55'],
464        'חס'      => ['0', '5', '54', '54'],
465        'חש'      => ['0', '5', '54', '54'],
466        'ח'       => ['0', '5', '5', '5'],
467        'טש'      => ['0', '4', '4', '4'],
468        'טד'      => ['0', '33', '33', '33'],
469        'טי'      => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
470        'טת'      => ['0', '33', '33', '33'],
471        'טט'      => ['0', '3', '3', '3', '33', '33', '33'],
472        'ט'       => ['0', '3', '3', '3'],
473        'י'       => ['1', '1', '', ''],
474        'יא'      => ['1', '1', '', '', '1', '1', '1'],
475        'כג'      => ['0', '55', '55', '55', '54', '54', '54'],
476        'כש'      => ['0', '5', '54', '54'],
477        'כס'      => ['0', '5', '54', '54'],
478        'ככ'      => ['0', '5', '5', '5', '55', '55', '55'],
479        'כך'      => ['0', '5', '5', '5', '55', '55', '55'],
480        'כ'       => ['0', '5', '5', '5'],
481        'כח'      => ['0', '55', '55', '55', '5', '5', '5'],
482        'ך'       => ['0', '', '5', '5'],
483        'ל'       => ['0', '8', '8', '8'],
484        'לל'      => ['0', '88', '88', '88', '8', '8', '8'],
485        'מנ'      => ['0', '66', '66', '66'],
486        'מן'      => ['0', '66', '66', '66'],
487        'ממ'      => ['0', '6', '6', '6', '66', '66', '66'],
488        'מם'      => ['0', '6', '6', '6', '66', '66', '66'],
489        'מ'       => ['0', '6', '6', '6'],
490        'ם'       => ['0', '', '6', '6'],
491        'נמ'      => ['0', '66', '66', '66'],
492        'נם'      => ['0', '66', '66', '66'],
493        'ננ'      => ['0', '6', '6', '6', '66', '66', '66'],
494        'נן'      => ['0', '6', '6', '6', '66', '66', '66'],
495        'נ'       => ['0', '6', '6', '6'],
496        'ן'       => ['0', '', '6', '6'],
497        'סתש'     => ['0', '2', '4', '4'],
498        'סתז'     => ['0', '2', '4', '4'],
499        'סטז'     => ['0', '2', '4', '4'],
500        'סטש'     => ['0', '2', '4', '4'],
501        'סצד'     => ['0', '2', '4', '4'],
502        'סט'      => ['0', '2', '4', '4', '43', '43', '43'],
503        'סת'      => ['0', '2', '4', '4', '43', '43', '43'],
504        'סג'      => ['0', '44', '44', '44', '4', '4', '4'],
505        'סס'      => ['0', '4', '4', '4', '44', '44', '44'],
506        'סצ'      => ['0', '44', '44', '44'],
507        'סץ'      => ['0', '44', '44', '44'],
508        'סז'      => ['0', '44', '44', '44'],
509        'סש'      => ['0', '44', '44', '44'],
510        'ס'       => ['0', '4', '4', '4'],
511        'ע'       => ['1', '0', '', ''],
512        'פב'      => ['0', '7', '7', '7', '77', '77', '77'],
513        'פוו'     => ['0', '7', '7', '7', '77', '77', '77'],
514        'פפ'      => ['0', '7', '7', '7', '77', '77', '77'],
515        'פף'      => ['0', '7', '7', '7', '77', '77', '77'],
516        'פ'       => ['0', '7', '7', '7'],
517        'ף'       => ['0', '', '7', '7'],
518        'צג'      => ['0', '44', '44', '44', '45', '45', '45'],
519        'צז'      => ['0', '44', '44', '44'],
520        'צס'      => ['0', '44', '44', '44'],
521        'צצ'      => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
522        'צץ'      => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
523        'צש'      => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
524        'צ'       => ['0', '4', '4', '4', '5', '5', '5'],
525        'ץ'       => ['0', '', '4', '4'],
526        'קה'      => ['0', '55', '55', '5'],
527        'קס'      => ['0', '5', '54', '54'],
528        'קש'      => ['0', '5', '54', '54'],
529        'קק'      => ['0', '5', '5', '5', '55', '55', '55'],
530        'קח'      => ['0', '55', '55', '55'],
531        'קכ'      => ['0', '55', '55', '55'],
532        'קך'      => ['0', '55', '55', '55'],
533        'קג'      => ['0', '55', '55', '55', '54', '54', '54'],
534        'ק'       => ['0', '5', '5', '5'],
535        'רר'      => ['0', '99', '99', '99', '9', '9', '9'],
536        'ר'       => ['0', '9', '9', '9'],
537        'שטז'     => ['0', '2', '4', '4'],
538        'שתש'     => ['0', '2', '4', '4'],
539        'שתז'     => ['0', '2', '4', '4'],
540        'שטש'     => ['0', '2', '4', '4'],
541        'שד'      => ['0', '2', '43', '43'],
542        'שז'      => ['0', '44', '44', '44'],
543        'שס'      => ['0', '44', '44', '44'],
544        'שת'      => ['0', '2', '43', '43'],
545        'שג'      => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
546        'שט'      => ['0', '2', '43', '43', '44', '44', '44'],
547        'שצ'      => ['0', '44', '44', '44', '45', '45', '45'],
548        'שץ'      => ['0', '44', '', '44', '45', '', '45'],
549        'שש'      => ['0', '4', '4', '4', '44', '44', '44'],
550        'ש'       => ['0', '4', '4', '4'],
551        'תג'      => ['0', '34', '34', '34'],
552        'תז'      => ['0', '34', '34', '34'],
553        'תש'      => ['0', '4', '4', '4'],
554        'תת'      => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
555        'ת'       => ['0', '3', '3', '3', '4', '4', '4'],
556        // Arabic alphabet
557        'ا'       => ['1', '0', '', ''],
558        'ب'       => ['0', '7', '7', '7'],
559        'ت'       => ['0', '3', '3', '3'],
560        'ث'       => ['0', '3', '3', '3'],
561        'ج'       => ['0', '4', '4', '4'],
562        'ح'       => ['0', '5', '5', '5'],
563        'خ'       => ['0', '5', '5', '5'],
564        'د'       => ['0', '3', '3', '3'],
565        'ذ'       => ['0', '3', '3', '3'],
566        'ر'       => ['0', '9', '9', '9'],
567        'ز'       => ['0', '4', '4', '4'],
568        'س'       => ['0', '4', '4', '4'],
569        'ش'       => ['0', '4', '4', '4'],
570        'ص'       => ['0', '4', '4', '4'],
571        'ض'       => ['0', '3', '3', '3'],
572        'ط'       => ['0', '3', '3', '3'],
573        'ظ'       => ['0', '4', '4', '4'],
574        'ع'       => ['1', '0', '', ''],
575        'غ'       => ['0', '0', '', ''],
576        'ف'       => ['0', '7', '7', '7'],
577        'ق'       => ['0', '5', '5', '5'],
578        'ك'       => ['0', '5', '5', '5'],
579        'ل'       => ['0', '8', '8', '8'],
580        'لا'      => ['0', '8', '8', '8'],
581        'م'       => ['0', '6', '6', '6'],
582        'ن'       => ['0', '6', '6', '6'],
583        'هن'      => ['0', '66', '66', '66'],
584        'ه'       => ['0', '5', '5', ''],
585        'و'       => ['1', '', '', '', '7', '', ''],
586        'ي'       => ['0', '1', '', ''],
587        'آ'       => ['0', '1', '', ''],
588        'ة'       => ['0', '', '', '3'],
589        'ی'       => ['0', '1', '', ''],
590        'ى'       => ['1', '1', '', ''],
591    ];
592
593    /**
594     * Which algorithms are supported.
595     *
596     * @return array<string>
597     */
598    public static function getAlgorithms(): array
599    {
600        return [
601            /* I18N: https://en.wikipedia.org/wiki/Soundex */
602            'std' => I18N::translate('Russell'),
603            /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
604            'dm'  => I18N::translate('Daitch-Mokotoff'),
605        ];
606    }
607
608    /**
609     * Is there a match between two soundex codes?
610     *
611     * @param string $soundex1
612     * @param string $soundex2
613     *
614     * @return bool
615     */
616    public static function compare(string $soundex1, string $soundex2): bool
617    {
618        if ($soundex1 !== '' && $soundex2 !== '') {
619            return array_intersect(explode(':', $soundex1), explode(':', $soundex2)) !== [];
620        }
621
622        return false;
623    }
624
625    /**
626     * Generate Russell soundex codes for a given text.
627     *
628     * @param string $text
629     *
630     * @return string
631     */
632    public static function russell(string $text): string
633    {
634        $words         = explode(' ', $text);
635        $soundex_array = [];
636
637        foreach ($words as $word) {
638            $soundex = soundex($word);
639
640            // Only return codes from recognisable sounds
641            if ($soundex !== '0000') {
642                $soundex_array[] = $soundex;
643            }
644        }
645
646        // Combine words, e.g. “New York” as “Newyork”
647        if (count($words) > 1) {
648            $soundex_array[] = soundex(str_replace(' ', '', $text));
649        }
650
651        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
652        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);
653
654        return implode(':', $soundex_array);
655    }
656
657    /**
658     * Generate Daitch–Mokotoff soundex codes for a given text.
659     *
660     * @param string $text
661     *
662     * @return string
663     */
664    public static function daitchMokotoff(string $text): string
665    {
666        $words         = explode(' ', $text);
667        $soundex_array = [];
668
669        foreach ($words as $word) {
670            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
671        }
672        // Combine words, e.g. “New York” as “Newyork”
673        if (count($words) > 1) {
674            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(str_replace(' ', '', $text)));
675        }
676
677        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
678        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);
679
680        return implode(':', $soundex_array);
681    }
682
683    /**
684     * Calculate the Daitch-Mokotoff soundex for a word.
685     *
686     * @param string $name
687     *
688     * @return array<string> List of possible DM codes for the word.
689     */
690    private static function daitchMokotoffWord(string $name): array
691    {
692        // Apply special transformation rules to the input string
693        $name = I18N::strtoupper($name);
694        foreach (self::TRANSFORM_NAMES as $transformRule) {
695            $name = str_replace($transformRule[0], $transformRule[1], $name);
696        }
697
698        // Initialize
699        $name_script = I18N::textScript($name);
700        $noVowels    = $name_script === 'Hebr' || $name_script === 'Arab';
701
702        $lastPos         = strlen($name) - 1;
703        $currPos         = 0;
704        $state           = 1; // 1: start of input string, 2: before vowel, 3: other
705        $result          = []; // accumulate complete 6-digit D-M codes here
706        $partialResult   = []; // accumulate incomplete D-M codes here
707        $partialResult[] = ['!']; // initialize 1st partial result  ('!' stops "duplicate sound" check)
708
709        // Loop through the input string.
710        // Stop when the string is exhausted or when no more partial results remain
711        while (count($partialResult) !== 0 && $currPos <= $lastPos) {
712            // Find the DM coding table entry for the chunk at the current position
713            $thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
714            while ($thisEntry !== '') {
715                if (isset(self::DM_SOUNDS[$thisEntry])) {
716                    break;
717                }
718                $thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
719            }
720            if ($thisEntry === '') {
721                $currPos++; // Not in table: advance pointer to next byte
722                continue; // and try again
723            }
724
725            $soundTableEntry = self::DM_SOUNDS[$thisEntry];
726            $workingResult   = $partialResult;
727            $partialResult   = [];
728            $currPos += strlen($thisEntry);
729
730            // Not at beginning of input string
731            if ($state !== 1) {
732                if ($currPos <= $lastPos) {
733                    // Determine whether the next chunk is a vowel
734                    $nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
735                    while ($nextEntry !== '') {
736                        if (isset(self::DM_SOUNDS[$nextEntry])) {
737                            break;
738                        }
739                        $nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
740                    }
741                } else {
742                    $nextEntry = '';
743                }
744                if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
745                    $state = 2;
746                } else {
747                    // Next chunk is a vowel
748                    $state = 3;
749                }
750            }
751
752            while ($state < count($soundTableEntry)) {
753                // empty means 'ignore this sound in this state'
754                if ($soundTableEntry[$state] === '') {
755                    foreach ($workingResult as $workingEntry) {
756                        $tempEntry                        = $workingEntry;
757                        $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
758                        $partialResult[]                  = $tempEntry;
759                    }
760                } else {
761                    foreach ($workingResult as $workingEntry) {
762                        if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
763                            // Incoming sound isn't a duplicate of the previous sound
764                            $workingEntry[] = $soundTableEntry[$state];
765                        } elseif ($noVowels) {
766                            // Incoming sound is a duplicate of the previous sound
767                            // For Hebrew and Arabic, we need to create a pair of D-M sound codes,
768                            // one of the pair with only a single occurrence of the duplicate sound,
769                            // the other with both occurrences
770                            $workingEntry[] = $soundTableEntry[$state];
771                        }
772
773                        if (count($workingEntry) < 7) {
774                            $partialResult[] = $workingEntry;
775                        } else {
776                            // This is the 6th code in the sequence
777                            // We're looking for 7 entries because the first is '!' and doesn't count
778                            $tempResult = str_replace('!', '', implode('', $workingEntry));
779                            // Only return codes from recognisable sounds
780                            if ($tempResult) {
781                                $result[] = substr($tempResult . '000000', 0, 6);
782                            }
783                        }
784                    }
785                }
786                $state += 3; // Advance to next triplet while keeping the same basic state
787            }
788        }
789
790        // Zero-fill and copy all remaining partial results
791        foreach ($partialResult as $workingEntry) {
792            $tempResult = str_replace('!', '', implode('', $workingEntry));
793            // Only return codes from recognisable sounds
794            if ($tempResult) {
795                $result[] = substr($tempResult . '000000', 0, 6);
796            }
797        }
798
799        return $result;
800    }
801}
802