xref: /webtrees/app/Soundex.php (revision 1da5dadd02c9f21e0a345e21a19b2c63ae792ed3)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2023 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees;
21
22use function array_slice;
23use function count;
24use function strlen;
25
26/**
27 * Phonetic matching of strings.
28 */
29class Soundex
30{
31    // Determine the Daitch–Mokotoff Soundex code for a word
32    // Original implementation by Gerry Kroll, and analysis by Meliza Amity
33
34    // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
35    private const MAXCHAR = 7;
36
37    /**
38     * Name transformation arrays.
39     * Used to transform the Name string to simplify the "sounds like" table.
40     * This is especially useful in Hebrew.
41     *
42     * Each array entry defines the "from" and "to" arguments of a str_replace($from, $to, $text)
43     * function call (a plain substring replacement, not a regular expression) to achieve the desired transformations.
44     *
45     * Note about the use of "\x01":
46     * This code, which can’t legitimately occur in the kind of text we're dealing with,
47     * is used as a place-holder so that conditional string replacements can be done.
48     */
49    private const TRANSFORM_NAMES = [
50        // Force Yiddish ligatures to be treated as separate letters
51        ['װ', 'וו'],
52        ['ײ', 'יי'],
53        ['ױ', 'וי'],
54        ['בו', 'בע'],
55        ['פו', 'פע'],
56        ['ומ', 'עמ'],
57        ['ום', 'עם'],
58        ['ונ', 'ענ'],
59        ['ון', 'ען'],
60        ['וו', 'ב'],
61        ["\x01", ''],
62        ['ייה$', "\x01ה"],
63        ['ייע$', "\x01ע"],
64        ['יי', 'ע'],
65        ["\x01", 'יי'],
66    ];
67
68    /**
69     * The DM sound coding table is organized this way:
70     * key: a variable-length string that corresponds to the UTF-8 character sequence
71     * represented by the table entry. Currently, that string can be up to 7
72     * bytes long. This maximum length is defined by the value of the class constant
73     * self::MAXCHAR.
74     *
75     * value: an array as follows:
76     * [0]:  zero if not a vowel
77     * [1]:  sound value when this string is at the beginning of the word
78     * [2]:  sound value when this string is followed by a vowel
79     * [3]:  sound value for other cases
80     * [1],[2],[3] can be repeated several times to create branches in the code
81     * an empty sound value means "ignore in this state"
82     */
83    private const DM_SOUNDS = [
84        'A'       => ['1', '0', '', ''],
85        'À'       => ['1', '0', '', ''],
86        'Á'       => ['1', '0', '', ''],
87        'Â'       => ['1', '0', '', ''],
88        'Ã'       => ['1', '0', '', ''],
89        'Ä'       => ['1', '0', '1', '', '0', '', ''],
90        'Å'       => ['1', '0', '', ''],
91        'Ă'       => ['1', '0', '', ''],
92        'Ą'       => ['1', '', '', '', '', '', '6'],
93        'Ạ'       => ['1', '0', '', ''],
94        'Ả'       => ['1', '0', '', ''],
95        'Ấ'       => ['1', '0', '', ''],
96        'Ầ'       => ['1', '0', '', ''],
97        'Ẩ'       => ['1', '0', '', ''],
98        'Ẫ'       => ['1', '0', '', ''],
99        'Ậ'       => ['1', '0', '', ''],
100        'Ắ'       => ['1', '0', '', ''],
101        'Ằ'       => ['1', '0', '', ''],
102        'Ẳ'       => ['1', '0', '', ''],
103        'Ẵ'       => ['1', '0', '', ''],
104        'Ặ'       => ['1', '0', '', ''],
105        'AE'      => ['1', '0', '1', ''],
106        'Æ'       => ['1', '0', '1', ''],
107        'AI'      => ['1', '0', '1', ''],
108        'AJ'      => ['1', '0', '1', ''],
109        'AU'      => ['1', '0', '7', ''],
110        'AV'      => ['1', '0', '7', '', '7', '7', '7'],
111        'ÄU'      => ['1', '0', '1', ''],
112        'AY'      => ['1', '0', '1', ''],
113        'B'       => ['0', '7', '7', '7'],
114        'C'       => ['0', '5', '5', '5', '34', '4', '4'],
115        'Ć'       => ['0', '4', '4', '4'],
116        'Č'       => ['0', '4', '4', '4'],
117        'Ç'       => ['0', '4', '4', '4'],
118        'CH'      => ['0', '5', '5', '5', '34', '4', '4'],
119        'CHS'     => ['0', '5', '54', '54'],
120        'CK'      => ['0', '5', '5', '5', '45', '45', '45'],
121        'CCS'     => ['0', '4', '4', '4'],
122        'CS'      => ['0', '4', '4', '4'],
123        'CSZ'     => ['0', '4', '4', '4'],
124        'CZ'      => ['0', '4', '4', '4'],
125        'CZS'     => ['0', '4', '4', '4'],
126        'D'       => ['0', '3', '3', '3'],
127        'Ď'       => ['0', '3', '3', '3'],
128        'Đ'       => ['0', '3', '3', '3'],
129        'DRS'     => ['0', '4', '4', '4'],
130        'DRZ'     => ['0', '4', '4', '4'],
131        'DS'      => ['0', '4', '4', '4'],
132        'DSH'     => ['0', '4', '4', '4'],
133        'DSZ'     => ['0', '4', '4', '4'],
134        'DT'      => ['0', '3', '3', '3'],
135        'DDZ'     => ['0', '4', '4', '4'],
136        'DDZS'    => ['0', '4', '4', '4'],
137        'DZ'      => ['0', '4', '4', '4'],
138        'DŹ'      => ['0', '4', '4', '4'],
139        'DŻ'      => ['0', '4', '4', '4'],
140        'DZH'     => ['0', '4', '4', '4'],
141        'DZS'     => ['0', '4', '4', '4'],
142        'E'       => ['1', '0', '', ''],
143        'È'       => ['1', '0', '', ''],
144        'É'       => ['1', '0', '', ''],
145        'Ê'       => ['1', '0', '', ''],
146        'Ë'       => ['1', '0', '', ''],
147        'Ĕ'       => ['1', '0', '', ''],
148        'Ė'       => ['1', '0', '', ''],
149        'Ę'       => ['1', '', '', '6', '', '', ''],
150        'Ẹ'       => ['1', '0', '', ''],
151        'Ẻ'       => ['1', '0', '', ''],
152        'Ẽ'       => ['1', '0', '', ''],
153        'Ế'       => ['1', '0', '', ''],
154        'Ề'       => ['1', '0', '', ''],
155        'Ể'       => ['1', '0', '', ''],
156        'Ễ'       => ['1', '0', '', ''],
157        'Ệ'       => ['1', '0', '', ''],
158        'EAU'     => ['1', '0', '', ''],
159        'EI'      => ['1', '0', '1', ''],
160        'EJ'      => ['1', '0', '1', ''],
161        'EU'      => ['1', '1', '1', ''],
162        'EY'      => ['1', '0', '1', ''],
163        'F'       => ['0', '7', '7', '7'],
164        'FB'      => ['0', '7', '7', '7'],
165        'G'       => ['0', '5', '5', '5', '34', '4', '4'],
166        'Ğ'       => ['0', '', '', ''],
167        'GGY'     => ['0', '5', '5', '5'],
168        'GY'      => ['0', '5', '5', '5'],
169        'H'       => ['0', '5', '5', '', '5', '5', '5'],
170        'I'       => ['1', '0', '', ''],
171        'Ì'       => ['1', '0', '', ''],
172        'Í'       => ['1', '0', '', ''],
173        'Î'       => ['1', '0', '', ''],
174        'Ï'       => ['1', '0', '', ''],
175        'Ĩ'       => ['1', '0', '', ''],
176        'Į'       => ['1', '0', '', ''],
177        'İ'       => ['1', '0', '', ''],
178        'Ỉ'       => ['1', '0', '', ''],
179        'Ị'       => ['1', '0', '', ''],
180        'IA'      => ['1', '1', '', ''],
181        'IE'      => ['1', '1', '', ''],
182        'IO'      => ['1', '1', '', ''],
183        'IU'      => ['1', '1', '', ''],
184        'J'       => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
185        'K'       => ['0', '5', '5', '5'],
186        'KH'      => ['0', '5', '5', '5'],
187        'KS'      => ['0', '5', '54', '54'],
188        'L'       => ['0', '8', '8', '8'],
189        'Ľ'       => ['0', '8', '8', '8'],
190        'Ĺ'       => ['0', '8', '8', '8'],
191        'Ł'       => ['0', '7', '7', '7', '8', '8', '8'],
192        'LL'      => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
193        'LLY'     => ['0', '8', '8', '8', '1', '8', '8'],
194        'LY'      => ['0', '8', '8', '8', '1', '8', '8'],
195        'M'       => ['0', '6', '6', '6'],
196        'MĔ'      => ['0', '66', '66', '66'],
197        'MN'      => ['0', '66', '66', '66'],
198        'N'       => ['0', '6', '6', '6'],
199        'Ń'       => ['0', '6', '6', '6'],
200        'Ň'       => ['0', '6', '6', '6'],
201        'Ñ'       => ['0', '6', '6', '6'],
202        'NM'      => ['0', '66', '66', '66'],
203        'O'       => ['1', '0', '', ''],
204        'Ò'       => ['1', '0', '', ''],
205        'Ó'       => ['1', '0', '', ''],
206        'Ô'       => ['1', '0', '', ''],
207        'Õ'       => ['1', '0', '', ''],
208        'Ö'       => ['1', '0', '', ''],
209        'Ø'       => ['1', '0', '', ''],
210        'Ő'       => ['1', '0', '', ''],
211        'Œ'       => ['1', '0', '', ''],
212        'Ơ'       => ['1', '0', '', ''],
213        'Ọ'       => ['1', '0', '', ''],
214        'Ỏ'       => ['1', '0', '', ''],
215        'Ố'       => ['1', '0', '', ''],
216        'Ồ'       => ['1', '0', '', ''],
217        'Ổ'       => ['1', '0', '', ''],
218        'Ỗ'       => ['1', '0', '', ''],
219        'Ộ'       => ['1', '0', '', ''],
220        'Ớ'       => ['1', '0', '', ''],
221        'Ờ'       => ['1', '0', '', ''],
222        'Ở'       => ['1', '0', '', ''],
223        'Ỡ'       => ['1', '0', '', ''],
224        'Ợ'       => ['1', '0', '', ''],
225        'OE'      => ['1', '0', '', ''],
226        'OI'      => ['1', '0', '1', ''],
227        'OJ'      => ['1', '0', '1', ''],
228        'OU'      => ['1', '0', '', ''],
229        'OY'      => ['1', '0', '1', ''],
230        'P'       => ['0', '7', '7', '7'],
231        'PF'      => ['0', '7', '7', '7'],
232        'PH'      => ['0', '7', '7', '7'],
233        'Q'       => ['0', '5', '5', '5'],
234        'R'       => ['0', '9', '9', '9'],
235        'Ř'       => ['0', '4', '4', '4'],
236        'RS'      => ['0', '4', '4', '4', '94', '94', '94'],
237        'RZ'      => ['0', '4', '4', '4', '94', '94', '94'],
238        'S'       => ['0', '4', '4', '4'],
239        'Ś'       => ['0', '4', '4', '4'],
240        'Š'       => ['0', '4', '4', '4'],
241        'Ş'       => ['0', '4', '4', '4'],
242        'SC'      => ['0', '2', '4', '4'],
243        'ŠČ'      => ['0', '2', '4', '4'],
244        'SCH'     => ['0', '4', '4', '4'],
245        'SCHD'    => ['0', '2', '43', '43'],
246        'SCHT'    => ['0', '2', '43', '43'],
247        'SCHTCH'  => ['0', '2', '4', '4'],
248        'SCHTSCH' => ['0', '2', '4', '4'],
249        'SCHTSH'  => ['0', '2', '4', '4'],
250        'SD'      => ['0', '2', '43', '43'],
251        'SH'      => ['0', '4', '4', '4'],
252        'SHCH'    => ['0', '2', '4', '4'],
253        'SHD'     => ['0', '2', '43', '43'],
254        'SHT'     => ['0', '2', '43', '43'],
255        'SHTCH'   => ['0', '2', '4', '4'],
256        'SHTSH'   => ['0', '2', '4', '4'],
257        'ß'       => ['0', '', '4', '4'],
258        'ST'      => ['0', '2', '43', '43'],
259        'STCH'    => ['0', '2', '4', '4'],
260        'STRS'    => ['0', '2', '4', '4'],
261        'STRZ'    => ['0', '2', '4', '4'],
262        'STSCH'   => ['0', '2', '4', '4'],
263        'STSH'    => ['0', '2', '4', '4'],
264        'SSZ'     => ['0', '4', '4', '4'],
265        'SZ'      => ['0', '4', '4', '4'],
266        'SZCS'    => ['0', '2', '4', '4'],
267        'SZCZ'    => ['0', '2', '4', '4'],
268        'SZD'     => ['0', '2', '43', '43'],
269        'SZT'     => ['0', '2', '43', '43'],
270        'T'       => ['0', '3', '3', '3'],
271        'Ť'       => ['0', '3', '3', '3'],
272        'Ţ'       => ['0', '3', '3', '3', '4', '4', '4'],
273        'TC'      => ['0', '4', '4', '4'],
274        'TCH'     => ['0', '4', '4', '4'],
275        'TH'      => ['0', '3', '3', '3'],
276        'TRS'     => ['0', '4', '4', '4'],
277        'TRZ'     => ['0', '4', '4', '4'],
278        'TS'      => ['0', '4', '4', '4'],
279        'TSCH'    => ['0', '4', '4', '4'],
280        'TSH'     => ['0', '4', '4', '4'],
281        'TSZ'     => ['0', '4', '4', '4'],
282        'TTCH'    => ['0', '4', '4', '4'],
283        'TTS'     => ['0', '4', '4', '4'],
284        'TTSCH'   => ['0', '4', '4', '4'],
285        'TTSZ'    => ['0', '4', '4', '4'],
286        'TTZ'     => ['0', '4', '4', '4'],
287        'TZ'      => ['0', '4', '4', '4'],
288        'TZS'     => ['0', '4', '4', '4'],
289        'U'       => ['1', '0', '', ''],
290        'Ù'       => ['1', '0', '', ''],
291        'Ú'       => ['1', '0', '', ''],
292        'Û'       => ['1', '0', '', ''],
293        'Ü'       => ['1', '0', '', ''],
294        'Ũ'       => ['1', '0', '', ''],
295        'Ū'       => ['1', '0', '', ''],
296        'Ů'       => ['1', '0', '', ''],
297        'Ű'       => ['1', '0', '', ''],
298        'Ų'       => ['1', '0', '', ''],
299        'Ư'       => ['1', '0', '', ''],
300        'Ụ'       => ['1', '0', '', ''],
301        'Ủ'       => ['1', '0', '', ''],
302        'Ứ'       => ['1', '0', '', ''],
303        'Ừ'       => ['1', '0', '', ''],
304        'Ử'       => ['1', '0', '', ''],
305        'Ữ'       => ['1', '0', '', ''],
306        'Ự'       => ['1', '0', '', ''],
307        'UE'      => ['1', '0', '', ''],
308        'UI'      => ['1', '0', '1', ''],
309        'UJ'      => ['1', '0', '1', ''],
310        'UY'      => ['1', '0', '1', ''],
311        'UW'      => ['1', '0', '1', '', '0', '7', '7'],
312        'V'       => ['0', '7', '7', '7'],
313        'W'       => ['0', '7', '7', '7'],
314        'X'       => ['0', '5', '54', '54'],
315        'Y'       => ['1', '1', '', ''],
316        'Ý'       => ['1', '1', '', ''],
317        'Ỳ'       => ['1', '1', '', ''],
318        'Ỵ'       => ['1', '1', '', ''],
319        'Ỷ'       => ['1', '1', '', ''],
320        'Ỹ'       => ['1', '1', '', ''],
321        'Z'       => ['0', '4', '4', '4'],
322        'Ź'       => ['0', '4', '4', '4'],
323        'Ż'       => ['0', '4', '4', '4'],
324        'Ž'       => ['0', '4', '4', '4'],
325        'ZD'      => ['0', '2', '43', '43'],
326        'ZDZ'     => ['0', '2', '4', '4'],
327        'ZDZH'    => ['0', '2', '4', '4'],
328        'ZH'      => ['0', '4', '4', '4'],
329        'ZHD'     => ['0', '2', '43', '43'],
330        'ZHDZH'   => ['0', '2', '4', '4'],
331        'ZS'      => ['0', '4', '4', '4'],
332        'ZSCH'    => ['0', '4', '4', '4'],
333        'ZSH'     => ['0', '4', '4', '4'],
334        'ZZS'     => ['0', '4', '4', '4'],
335        // Cyrillic alphabet
336        'А'       => ['1', '0', '', ''],
337        'Б'       => ['0', '7', '7', '7'],
338        'В'       => ['0', '7', '7', '7'],
339        'Г'       => ['0', '5', '5', '5'],
340        'Д'       => ['0', '3', '3', '3'],
341        'ДЗ'      => ['0', '4', '4', '4'],
342        'Е'       => ['1', '0', '', ''],
343        'Ё'       => ['1', '0', '', ''],
344        'Ж'       => ['0', '4', '4', '4'],
345        'З'       => ['0', '4', '4', '4'],
346        'И'       => ['1', '0', '', ''],
347        'Й'       => ['1', '1', '', '', '4', '4', '4'],
348        'К'       => ['0', '5', '5', '5'],
349        'Л'       => ['0', '8', '8', '8'],
350        'М'       => ['0', '6', '6', '6'],
351        'Н'       => ['0', '6', '6', '6'],
352        'О'       => ['1', '0', '', ''],
353        'П'       => ['0', '7', '7', '7'],
354        'Р'       => ['0', '9', '9', '9'],
355        'РЖ'      => ['0', '4', '4', '4'],
356        'С'       => ['0', '4', '4', '4'],
357        'Т'       => ['0', '3', '3', '3'],
358        'У'       => ['1', '0', '', ''],
359        'Ф'       => ['0', '7', '7', '7'],
360        'Х'       => ['0', '5', '5', '5'],
361        'Ц'       => ['0', '4', '4', '4'],
362        'Ч'       => ['0', '4', '4', '4'],
363        'Ш'       => ['0', '4', '4', '4'],
364        'Щ'       => ['0', '2', '4', '4'],
365        'Ъ'       => ['0', '', '', ''],
366        'Ы'       => ['0', '1', '', ''],
367        'Ь'       => ['0', '', '', ''],
368        'Э'       => ['1', '0', '', ''],
369        'Ю'       => ['0', '1', '', ''],
370        'Я'       => ['0', '1', '', ''],
371        // Greek alphabet
372        'Α'       => ['1', '0', '', ''],
373        'Ά'       => ['1', '0', '', ''],
374        'ΑΙ'      => ['1', '0', '1', ''],
375        'ΑΥ'      => ['1', '0', '1', ''],
376        'Β'       => ['0', '7', '7', '7'],
377        'Γ'       => ['0', '5', '5', '5'],
378        'Δ'       => ['0', '3', '3', '3'],
379        'Ε'       => ['1', '0', '', ''],
380        'Έ'       => ['1', '0', '', ''],
381        'ΕΙ'      => ['1', '0', '1', ''],
382        'ΕΥ'      => ['1', '1', '1', ''],
383        'Ζ'       => ['0', '4', '4', '4'],
384        'Η'       => ['1', '0', '', ''],
385        'Ή'       => ['1', '0', '', ''],
386        'Θ'       => ['0', '3', '3', '3'],
387        'Ι'       => ['1', '0', '', ''],
388        'Ί'       => ['1', '0', '', ''],
389        'Ϊ'       => ['1', '0', '', ''],
390        'ΐ'       => ['1', '0', '', ''],
391        'Κ'       => ['0', '5', '5', '5'],
392        'Λ'       => ['0', '8', '8', '8'],
393        'Μ'       => ['0', '6', '6', '6'],
394        'ΜΠ'      => ['0', '7', '7', '7'],
395        'Ν'       => ['0', '6', '6', '6'],
396        'ΝΤ'      => ['0', '3', '3', '3'],
397        'Ξ'       => ['0', '5', '54', '54'],
398        'Ο'       => ['1', '0', '', ''],
399        'Ό'       => ['1', '0', '', ''],
400        'ΟΙ'      => ['1', '0', '1', ''],
401        'ΟΥ'      => ['1', '0', '1', ''],
402        'Π'       => ['0', '7', '7', '7'],
403        'Ρ'       => ['0', '9', '9', '9'],
404        'Σ'       => ['0', '4', '4', '4'],
405        'ς'       => ['0', '', '', '4'],
406        'Τ'       => ['0', '3', '3', '3'],
407        'ΤΖ'      => ['0', '4', '4', '4'],
408        'ΤΣ'      => ['0', '4', '4', '4'],
409        'Υ'       => ['1', '1', '', ''],
410        'Ύ'       => ['1', '1', '', ''],
411        'Ϋ'       => ['1', '1', '', ''],
412        'ΰ'       => ['1', '1', '', ''],
413        'ΥΚ'      => ['1', '5', '5', '5'],
414        'ΥΥ'      => ['1', '65', '65', '65'],
415        'Φ'       => ['0', '7', '7', '7'],
416        'Χ'       => ['0', '5', '5', '5'],
417        'Ψ'       => ['0', '7', '7', '7'],
418        'Ω'       => ['1', '0', '', ''],
419        'Ώ'       => ['1', '0', '', ''],
420        // Hebrew alphabet
421        'א'       => ['1', '0', '', ''],
422        'או'      => ['1', '0', '7', ''],
423        'אג'      => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
424        'בב'      => ['0', '7', '7', '7', '77', '77', '77'],
425        'ב'       => ['0', '7', '7', '7'],
426        'גג'      => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
427        'גד'      => ['0', '43', '43', '43', '53', '53', '53'],
428        'גה'      => ['0', '45', '45', '45', '55', '55', '55'],
429        'גז'      => ['0', '44', '44', '44', '45', '45', '45'],
430        'גח'      => ['0', '45', '45', '45', '55', '55', '55'],
431        'גכ'      => ['0', '45', '45', '45', '55', '55', '55'],
432        'גך'      => ['0', '45', '45', '45', '55', '55', '55'],
433        'גצ'      => ['0', '44', '44', '44', '45', '45', '45'],
434        'גץ'      => ['0', '44', '44', '44', '45', '45', '45'],
435        'גק'      => ['0', '45', '45', '45', '54', '54', '54'],
436        'גש'      => ['0', '44', '44', '44', '54', '54', '54'],
437        'גת'      => ['0', '43', '43', '43', '53', '53', '53'],
438        'ג'       => ['0', '4', '4', '4', '5', '5', '5'],
439        'דז'      => ['0', '4', '4', '4'],
440        'דד'      => ['0', '3', '3', '3', '33', '33', '33'],
441        'דט'      => ['0', '33', '33', '33'],
442        'דש'      => ['0', '4', '4', '4'],
443        'דצ'      => ['0', '4', '4', '4'],
444        'דץ'      => ['0', '4', '4', '4'],
445        'ד'       => ['0', '3', '3', '3'],
446        'הג'      => ['0', '54', '54', '54', '55', '55', '55'],
447        'הכ'      => ['0', '55', '55', '55'],
448        'הח'      => ['0', '55', '55', '55'],
449        'הק'      => ['0', '55', '55', '55', '5', '5', '5'],
450        'הה'      => ['0', '5', '5', '', '55', '55', ''],
451        'ה'       => ['0', '5', '5', ''],
452        'וי'      => ['1', '', '', '', '7', '7', '7'],
453        'ו'       => ['1', '7', '7', '7', '7', '', ''],
454        'וו'      => ['1', '7', '7', '7', '7', '', ''],
455        'וופ'     => ['1', '7', '7', '7', '77', '77', '77'],
456        'זש'      => ['0', '4', '4', '4', '44', '44', '44'],
457        'זדז'     => ['0', '2', '4', '4'],
458        'ז'       => ['0', '4', '4', '4'],
459        'זג'      => ['0', '44', '44', '44', '45', '45', '45'],
460        'זז'      => ['0', '4', '4', '4', '44', '44', '44'],
461        'זס'      => ['0', '44', '44', '44'],
462        'זצ'      => ['0', '44', '44', '44'],
463        'זץ'      => ['0', '44', '44', '44'],
464        'חג'      => ['0', '54', '54', '54', '53', '53', '53'],
465        'חח'      => ['0', '5', '5', '5', '55', '55', '55'],
466        'חק'      => ['0', '55', '55', '55', '5', '5', '5'],
467        'חכ'      => ['0', '45', '45', '45', '55', '55', '55'],
468        'חס'      => ['0', '5', '54', '54'],
469        'חש'      => ['0', '5', '54', '54'],
470        'ח'       => ['0', '5', '5', '5'],
471        'טש'      => ['0', '4', '4', '4'],
472        'טד'      => ['0', '33', '33', '33'],
473        'טי'      => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
474        'טת'      => ['0', '33', '33', '33'],
475        'טט'      => ['0', '3', '3', '3', '33', '33', '33'],
476        'ט'       => ['0', '3', '3', '3'],
477        'י'       => ['1', '1', '', ''],
478        'יא'      => ['1', '1', '', '', '1', '1', '1'],
479        'כג'      => ['0', '55', '55', '55', '54', '54', '54'],
480        'כש'      => ['0', '5', '54', '54'],
481        'כס'      => ['0', '5', '54', '54'],
482        'ככ'      => ['0', '5', '5', '5', '55', '55', '55'],
483        'כך'      => ['0', '5', '5', '5', '55', '55', '55'],
484        'כ'       => ['0', '5', '5', '5'],
485        'כח'      => ['0', '55', '55', '55', '5', '5', '5'],
486        'ך'       => ['0', '', '5', '5'],
487        'ל'       => ['0', '8', '8', '8'],
488        'לל'      => ['0', '88', '88', '88', '8', '8', '8'],
489        'מנ'      => ['0', '66', '66', '66'],
490        'מן'      => ['0', '66', '66', '66'],
491        'ממ'      => ['0', '6', '6', '6', '66', '66', '66'],
492        'מם'      => ['0', '6', '6', '6', '66', '66', '66'],
493        'מ'       => ['0', '6', '6', '6'],
494        'ם'       => ['0', '', '6', '6'],
495        'נמ'      => ['0', '66', '66', '66'],
496        'נם'      => ['0', '66', '66', '66'],
497        'ננ'      => ['0', '6', '6', '6', '66', '66', '66'],
498        'נן'      => ['0', '6', '6', '6', '66', '66', '66'],
499        'נ'       => ['0', '6', '6', '6'],
500        'ן'       => ['0', '', '6', '6'],
501        'סתש'     => ['0', '2', '4', '4'],
502        'סתז'     => ['0', '2', '4', '4'],
503        'סטז'     => ['0', '2', '4', '4'],
504        'סטש'     => ['0', '2', '4', '4'],
505        'סצד'     => ['0', '2', '4', '4'],
506        'סט'      => ['0', '2', '4', '4', '43', '43', '43'],
507        'סת'      => ['0', '2', '4', '4', '43', '43', '43'],
508        'סג'      => ['0', '44', '44', '44', '4', '4', '4'],
509        'סס'      => ['0', '4', '4', '4', '44', '44', '44'],
510        'סצ'      => ['0', '44', '44', '44'],
511        'סץ'      => ['0', '44', '44', '44'],
512        'סז'      => ['0', '44', '44', '44'],
513        'סש'      => ['0', '44', '44', '44'],
514        'ס'       => ['0', '4', '4', '4'],
515        'ע'       => ['1', '0', '', ''],
516        'פב'      => ['0', '7', '7', '7', '77', '77', '77'],
517        'פוו'     => ['0', '7', '7', '7', '77', '77', '77'],
518        'פפ'      => ['0', '7', '7', '7', '77', '77', '77'],
519        'פף'      => ['0', '7', '7', '7', '77', '77', '77'],
520        'פ'       => ['0', '7', '7', '7'],
521        'ף'       => ['0', '', '7', '7'],
522        'צג'      => ['0', '44', '44', '44', '45', '45', '45'],
523        'צז'      => ['0', '44', '44', '44'],
524        'צס'      => ['0', '44', '44', '44'],
525        'צצ'      => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
526        'צץ'      => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
527        'צש'      => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
528        'צ'       => ['0', '4', '4', '4', '5', '5', '5'],
529        'ץ'       => ['0', '', '4', '4'],
530        'קה'      => ['0', '55', '55', '5'],
531        'קס'      => ['0', '5', '54', '54'],
532        'קש'      => ['0', '5', '54', '54'],
533        'קק'      => ['0', '5', '5', '5', '55', '55', '55'],
534        'קח'      => ['0', '55', '55', '55'],
535        'קכ'      => ['0', '55', '55', '55'],
536        'קך'      => ['0', '55', '55', '55'],
537        'קג'      => ['0', '55', '55', '55', '54', '54', '54'],
538        'ק'       => ['0', '5', '5', '5'],
539        'רר'      => ['0', '99', '99', '99', '9', '9', '9'],
540        'ר'       => ['0', '9', '9', '9'],
541        'שטז'     => ['0', '2', '4', '4'],
542        'שתש'     => ['0', '2', '4', '4'],
543        'שתז'     => ['0', '2', '4', '4'],
544        'שטש'     => ['0', '2', '4', '4'],
545        'שד'      => ['0', '2', '43', '43'],
546        'שז'      => ['0', '44', '44', '44'],
547        'שס'      => ['0', '44', '44', '44'],
548        'שת'      => ['0', '2', '43', '43'],
549        'שג'      => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
550        'שט'      => ['0', '2', '43', '43', '44', '44', '44'],
551        'שצ'      => ['0', '44', '44', '44', '45', '45', '45'],
552        'שץ'      => ['0', '44', '', '44', '45', '', '45'],
553        'שש'      => ['0', '4', '4', '4', '44', '44', '44'],
554        'ש'       => ['0', '4', '4', '4'],
555        'תג'      => ['0', '34', '34', '34'],
556        'תז'      => ['0', '34', '34', '34'],
557        'תש'      => ['0', '4', '4', '4'],
558        'תת'      => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
559        'ת'       => ['0', '3', '3', '3', '4', '4', '4'],
560        // Arabic alphabet
561        'ا'       => ['1', '0', '', ''],
562        'ب'       => ['0', '7', '7', '7'],
563        'ت'       => ['0', '3', '3', '3'],
564        'ث'       => ['0', '3', '3', '3'],
565        'ج'       => ['0', '4', '4', '4'],
566        'ح'       => ['0', '5', '5', '5'],
567        'خ'       => ['0', '5', '5', '5'],
568        'د'       => ['0', '3', '3', '3'],
569        'ذ'       => ['0', '3', '3', '3'],
570        'ر'       => ['0', '9', '9', '9'],
571        'ز'       => ['0', '4', '4', '4'],
572        'س'       => ['0', '4', '4', '4'],
573        'ش'       => ['0', '4', '4', '4'],
574        'ص'       => ['0', '4', '4', '4'],
575        'ض'       => ['0', '3', '3', '3'],
576        'ط'       => ['0', '3', '3', '3'],
577        'ظ'       => ['0', '4', '4', '4'],
578        'ع'       => ['1', '0', '', ''],
579        'غ'       => ['0', '0', '', ''],
580        'ف'       => ['0', '7', '7', '7'],
581        'ق'       => ['0', '5', '5', '5'],
582        'ك'       => ['0', '5', '5', '5'],
583        'ل'       => ['0', '8', '8', '8'],
584        'لا'      => ['0', '8', '8', '8'],
585        'م'       => ['0', '6', '6', '6'],
586        'ن'       => ['0', '6', '6', '6'],
587        'هن'      => ['0', '66', '66', '66'],
588        'ه'       => ['0', '5', '5', ''],
589        'و'       => ['1', '', '', '', '7', '', ''],
590        'ي'       => ['0', '1', '', ''],
591        'آ'       => ['0', '1', '', ''],
592        'ة'       => ['0', '', '', '3'],
593        'ی'       => ['0', '1', '', ''],
594        'ى'       => ['1', '1', '', ''],
595    ];
596
597    /**
598     * Which algorithms are supported.
599     *
600     * @return array<string>
601     */
602    public static function getAlgorithms(): array
603    {
604        return [
605            /* I18N: https://en.wikipedia.org/wiki/Soundex */
606            'std' => I18N::translate('Russell'),
607            /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
608            'dm'  => I18N::translate('Daitch-Mokotoff'),
609        ];
610    }
611
612    /**
613     * Is there a match between two soundex codes?
614     *
615     * @param string $soundex1
616     * @param string $soundex2
617     *
618     * @return bool
619     */
620    public static function compare(string $soundex1, string $soundex2): bool
621    {
622        if ($soundex1 !== '' && $soundex2 !== '') {
623            return array_intersect(explode(':', $soundex1), explode(':', $soundex2)) !== [];
624        }
625
626        return false;
627    }
628
629    /**
630     * Generate Russell soundex codes for a given text.
631     *
632     * @param string $text
633     *
634     * @return string
635     */
636    public static function russell(string $text): string
637    {
638        $words         = explode(' ', $text);
639        $soundex_array = [];
640
641        foreach ($words as $word) {
642            $soundex = soundex($word);
643
644            // Only return codes from recognisable sounds
645            if ($soundex !== '0000') {
646                $soundex_array[] = $soundex;
647            }
648        }
649
650        // Combine words, e.g. “New York” as “Newyork”
651        if (count($words) > 1) {
652            $soundex_array[] = soundex(str_replace(' ', '', $text));
653        }
654
655        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
656        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);
657
658        return implode(':', $soundex_array);
659    }
660
661    /**
662     * Generate Daitch–Mokotoff soundex codes for a given text.
663     *
664     * @param string $text
665     *
666     * @return string
667     */
668    public static function daitchMokotoff(string $text): string
669    {
670        $words         = explode(' ', $text);
671        $soundex_array = [];
672
673        foreach ($words as $word) {
674            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
675        }
676        // Combine words, e.g. “New York” as “Newyork”
677        if (count($words) > 1) {
678            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(str_replace(' ', '', $text)));
679        }
680
681        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
682        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);
683
684        return implode(':', $soundex_array);
685    }
686
687    /**
688     * Calculate the Daitch-Mokotoff soundex for a word.
689     *
690     * @param string $name
691     *
692     * @return array<string> List of possible DM codes for the word.
693     */
private static function daitchMokotoffWord(string $name): array
{
    // Upper-case the input, then apply every TRANSFORM_NAMES substitution in table order.
    $name = I18N::strtoupper($name);
    foreach (self::TRANSFORM_NAMES as $rule) {
        $name = str_replace($rule[0], $rule[1], $name);
    }

    // For Hebrew and Arabic script a repeated sound is kept instead of being collapsed.
    $script      = I18N::textScript($name);
    $keepDoubles = $script === 'Hebr' || $script === 'Arab';

    // Longest-match lookup: starting with up to MAXCHAR bytes at $offset, drop one
    // trailing byte at a time until the chunk is a DM_SOUNDS key.
    // Yields '' when not even the single byte at $offset is a key.
    $longestKeyAt = static function (string $text, int $offset): string {
        $chunk = substr($text, $offset, self::MAXCHAR);
        while ($chunk !== '' && !isset(self::DM_SOUNDS[$chunk])) {
            $chunk = substr($chunk, 0, -1);
        }

        return $chunk;
    };

    // Turn a list of sound fragments into a final code: remove the '!' markers, then
    // zero-fill/truncate to six characters. Yields '' when nothing but markers was present.
    $toCode = static function (array $fragments): string {
        $digits = str_replace('!', '', implode('', $fragments));

        return $digits === '' ? '' : substr($digits . '000000', 0, 6);
    };

    $length   = strlen($name); // measured in bytes, like every offset below
    $offset   = 0;
    $state    = 1;       // 1: start of input string, 2: before vowel, 3: other
    $complete = [];      // finished 6-character codes
    $pending  = [['!']]; // unfinished codes; the leading '!' stops the "duplicate sound" check

    // Consume the input chunk by chunk until it is exhausted or no unfinished code is left.
    while ($pending !== [] && $offset < $length) {
        $key = $longestKeyAt($name, $offset);
        if ($key === '') {
            // Nothing in the table starts here: skip one byte and retry.
            $offset++;
            continue;
        }

        $sounds   = self::DM_SOUNDS[$key];
        $previous = $pending;
        $pending  = [];
        $offset  += strlen($key);

        // Past the first coded chunk, the state depends on what follows this chunk.
        if ($state !== 1) {
            $nextKey = $offset < $length ? $longestKeyAt($name, $offset) : '';

            // State 2 ("before vowel") is chosen when the following chunk is in the table
            // and element 0 of its row is not '0'; anything else (including end of input
            // or an unknown chunk) is state 3 ("other").
            // NOTE(review): element 0 is presumably the row's vowel flag; the table is
            // not visible from this block, so confirm against DM_SOUNDS.
            $state = ($nextKey !== '' && self::DM_SOUNDS[$nextKey][0] !== '0') ? 2 : 3;
        }

        // A table row may offer alternative codings; stepping by 3 moves to the next
        // alternative while keeping the same basic state. $state deliberately keeps its
        // advanced value after the loop, so the "start of input" state is left behind.
        for ($columns = count($sounds); $state < $columns; $state += 3) {
            $sound = $sounds[$state];

            if ($sound === '') {
                // Empty means 'ignore this sound in this state'. The last fragment is
                // tagged with '!' so that the next sound cannot be taken for a double.
                foreach ($previous as $partial) {
                    $partial[count($partial) - 1] .= '!';
                    $pending[] = $partial;
                }

                continue;
            }

            foreach ($previous as $partial) {
                // A sound identical to the previous fragment is dropped, except for
                // Hebrew and Arabic, where it is appended regardless.
                // NOTE(review): an earlier comment here described emitting a pair of codes
                // (single and doubled sound); the code only ever produces the doubled one.
                $repeated = $sound === $partial[count($partial) - 1];
                if (!$repeated || $keepDoubles) {
                    $partial[] = $sound;
                }

                // Seven fragments make a complete code: the first is '!' and doesn't count.
                if (count($partial) < 7) {
                    $pending[] = $partial;

                    continue;
                }

                // Only return codes from recognisable sounds.
                $code = $toCode($partial);
                if ($code !== '') {
                    $complete[] = $code;
                }
            }
        }
    }

    // Zero-fill and copy whatever is still unfinished, again skipping marker-only entries.
    foreach ($pending as $partial) {
        $code = $toCode($partial);
        if ($code !== '') {
            $complete[] = $code;
        }
    }

    return $complete;
}
805}
806