xref: /webtrees/app/Soundex.php (revision 46bb661f32cdac2ea7c60006b34b13e316578b11)
1<?php
2/**
3 * webtrees: online genealogy
4 * Copyright (C) 2018 webtrees development team
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16namespace Fisharebest\Webtrees;
17
18/**
19 * Phonetic matching of strings.
20 */
21class Soundex {
22	/**
23	 * Which algorithms are supported.
24	 *
25	 * @return string[]
26	 */
27	public static function getAlgorithms() {
28		return [
29			'std' => /* I18N: http://en.wikipedia.org/wiki/Soundex */ I18N::translate('Russell'),
30			'dm'  => /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */ I18N::translate('Daitch-Mokotoff'),
31		];
32	}
33
34	/**
35	 * Is there a match between two soundex codes?
36	 *
37	 * @param string $soundex1
38	 * @param string $soundex2
39	 *
40	 * @return bool
41	 */
42	public static function compare($soundex1, $soundex2) {
43		if ($soundex1 !== '' && $soundex2 !== '') {
44			return !empty(array_intersect(explode(':', $soundex1), explode(':', $soundex2)));
45		}
46
47		return false;
48	}
49
50	/**
51	 * Generate Russell soundex codes for a given text.
52	 *
53	 * @param $text
54	 *
55	 * @return null|string
56	 */
57	public static function russell($text) {
58		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
59		$soundex_array = [];
60		foreach ($words as $word) {
61			$soundex = soundex($word);
62			// Only return codes from recognisable sounds
63			if ($soundex !== '0000') {
64				$soundex_array[] = $soundex;
65			}
66		}
67		// Combine words, e.g. “New York” as “Newyork”
68		if (count($words) > 1) {
69			$soundex_array[] = soundex(strtr($text, ' ', ''));
70		}
71		// A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
72		$soundex_array = array_slice(array_unique($soundex_array), 0, 51);
73
74		if ($soundex_array) {
75			return implode(':', $soundex_array);
76		} else {
77			return '';
78		}
79	}
80
81	/**
82	 * Generate Daitch–Mokotoff soundex codes for a given text.
83	 *
84	 * @param $text
85	 *
86	 * @return string
87	 */
88	public static function daitchMokotoff($text) {
89		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
90		$soundex_array = [];
91		foreach ($words as $word) {
92			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
93		}
94		// Combine words, e.g. “New York” as “Newyork”
95		if (count($words) > 1) {
96			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(strtr($text, ' ', '')));
97		}
98		// A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
99		$soundex_array = array_slice(array_unique($soundex_array), 0, 36);
100
101		if ($soundex_array) {
102			return implode(':', $soundex_array);
103		} else {
104			return '';
105		}
106	}
107
108	// Determine the Daitch–Mokotoff Soundex code for a word
109	// Original implementation by Gerry Kroll, and analysis by Meliza Amity
110
	// Maximum length of a sound-table key, measured in bytes (NOT in UTF-8 characters!)
112	const MAXCHAR = 7;
113
	/**
	 * Name transformation arrays.
	 * Used to transform the name string to simplify the "sounds like" table.
	 * This is especially useful in Hebrew.
	 *
	 * Each array entry holds the "from" and "to" strings of one replacement,
	 * written in the style of a preg_replace($from, $to, $text) call.
	 * The entries are applied one after another, in the order listed.
	 *
	 * Note about the use of "\x01":
	 * This code, which can’t legitimately occur in the kind of text we’re dealing with,
	 * is used as a placeholder so that conditional string replacements can be done.
	 *
	 * @var string[][]
	 */
128	private static $transformNameTable = [
129		// Force Yiddish ligatures to be treated as separate letters
130		['װ', 'וו'],
131		['ײ', 'יי'],
132		['ױ', 'וי'],
133		['בו', 'בע'],
134		['פו', 'פע'],
135		['ומ', 'עמ'],
136		['ום', 'עם'],
137		['ונ', 'ענ'],
138		['ון', 'ען'],
139		['וו', 'ב'],
140		["\x01", ''],
141		['ייה$', "\x01ה"],
142		['ייע$', "\x01ע"],
143		['יי', 'ע'],
144		["\x01", 'יי'],
145	];
146
	/**
	 * The DM sound coding table is organized this way:
	 * key: a variable-length string that corresponds to the UTF-8 character sequence
	 * represented by the table entry. Currently, that string can be up to 7
	 * bytes long. This maximum length is defined by the class constant MAXCHAR.
	 *
	 * value: an array as follows:
	 * [0]:  '0' if the entry is not a vowel, '1' if it is
	 * [1]:  sound value when this string is at the beginning of the word
	 * [2]:  sound value when this string is followed by a vowel
	 * [3]:  sound value for other cases
	 * The triplet [1],[2],[3] can be repeated several times ([4],[5],[6], ...)
	 * to create branches in the code.
	 * An empty sound value means "ignore in this state".
	 *
	 * @var string[][]
	 */
164	private static $dmsounds = [
165		'A'       => ['1', '0', '', ''],
166		'À'       => ['1', '0', '', ''],
167		'Á'       => ['1', '0', '', ''],
168		'Â'       => ['1', '0', '', ''],
169		'Ã'       => ['1', '0', '', ''],
170		'Ä'       => ['1', '0', '1', '', '0', '', ''],
171		'Å'       => ['1', '0', '', ''],
172		'Ă'       => ['1', '0', '', ''],
173		'Ą'       => ['1', '', '', '', '', '', '6'],
174		'Ạ'       => ['1', '0', '', ''],
175		'Ả'       => ['1', '0', '', ''],
176		'Ấ'       => ['1', '0', '', ''],
177		'Ầ'       => ['1', '0', '', ''],
178		'Ẩ'       => ['1', '0', '', ''],
179		'Ẫ'       => ['1', '0', '', ''],
180		'Ậ'       => ['1', '0', '', ''],
181		'Ắ'       => ['1', '0', '', ''],
182		'Ằ'       => ['1', '0', '', ''],
183		'Ẳ'       => ['1', '0', '', ''],
184		'Ẵ'       => ['1', '0', '', ''],
185		'Ặ'       => ['1', '0', '', ''],
186		'AE'      => ['1', '0', '1', ''],
187		'Æ'       => ['1', '0', '1', ''],
188		'AI'      => ['1', '0', '1', ''],
189		'AJ'      => ['1', '0', '1', ''],
190		'AU'      => ['1', '0', '7', ''],
191		'AV'      => ['1', '0', '7', '', '7', '7', '7'],
192		'ÄU'      => ['1', '0', '1', ''],
193		'AY'      => ['1', '0', '1', ''],
194		'B'       => ['0', '7', '7', '7'],
195		'C'       => ['0', '5', '5', '5', '34', '4', '4'],
196		'Ć'       => ['0', '4', '4', '4'],
197		'Č'       => ['0', '4', '4', '4'],
198		'Ç'       => ['0', '4', '4', '4'],
199		'CH'      => ['0', '5', '5', '5', '34', '4', '4'],
200		'CHS'     => ['0', '5', '54', '54'],
201		'CK'      => ['0', '5', '5', '5', '45', '45', '45'],
202		'CCS'     => ['0', '4', '4', '4'],
203		'CS'      => ['0', '4', '4', '4'],
204		'CSZ'     => ['0', '4', '4', '4'],
205		'CZ'      => ['0', '4', '4', '4'],
206		'CZS'     => ['0', '4', '4', '4'],
207		'D'       => ['0', '3', '3', '3'],
208		'Ď'       => ['0', '3', '3', '3'],
209		'Đ'       => ['0', '3', '3', '3'],
210		'DRS'     => ['0', '4', '4', '4'],
211		'DRZ'     => ['0', '4', '4', '4'],
212		'DS'      => ['0', '4', '4', '4'],
213		'DSH'     => ['0', '4', '4', '4'],
214		'DSZ'     => ['0', '4', '4', '4'],
215		'DT'      => ['0', '3', '3', '3'],
216		'DDZ'     => ['0', '4', '4', '4'],
217		'DDZS'    => ['0', '4', '4', '4'],
218		'DZ'      => ['0', '4', '4', '4'],
219		'DŹ'      => ['0', '4', '4', '4'],
220		'DŻ'      => ['0', '4', '4', '4'],
221		'DZH'     => ['0', '4', '4', '4'],
222		'DZS'     => ['0', '4', '4', '4'],
223		'E'       => ['1', '0', '', ''],
224		'È'       => ['1', '0', '', ''],
225		'É'       => ['1', '0', '', ''],
226		'Ê'       => ['1', '0', '', ''],
227		'Ë'       => ['1', '0', '', ''],
228		'Ĕ'       => ['1', '0', '', ''],
229		'Ė'       => ['1', '0', '', ''],
230		'Ę'       => ['1', '', '', '6', '', '', ''],
231		'Ẹ'       => ['1', '0', '', ''],
232		'Ẻ'       => ['1', '0', '', ''],
233		'Ẽ'       => ['1', '0', '', ''],
234		'Ế'       => ['1', '0', '', ''],
235		'Ề'       => ['1', '0', '', ''],
236		'Ể'       => ['1', '0', '', ''],
237		'Ễ'       => ['1', '0', '', ''],
238		'Ệ'       => ['1', '0', '', ''],
239		'EAU'     => ['1', '0', '', ''],
240		'EI'      => ['1', '0', '1', ''],
241		'EJ'      => ['1', '0', '1', ''],
242		'EU'      => ['1', '1', '1', ''],
243		'EY'      => ['1', '0', '1', ''],
244		'F'       => ['0', '7', '7', '7'],
245		'FB'      => ['0', '7', '7', '7'],
246		'G'       => ['0', '5', '5', '5', '34', '4', '4'],
247		'Ğ'       => ['0', '', '', ''],
248		'GGY'     => ['0', '5', '5', '5'],
249		'GY'      => ['0', '5', '5', '5'],
250		'H'       => ['0', '5', '5', '', '5', '5', '5'],
251		'I'       => ['1', '0', '', ''],
252		'Ì'       => ['1', '0', '', ''],
253		'Í'       => ['1', '0', '', ''],
254		'Î'       => ['1', '0', '', ''],
255		'Ï'       => ['1', '0', '', ''],
256		'Ĩ'       => ['1', '0', '', ''],
257		'Į'       => ['1', '0', '', ''],
258		'İ'       => ['1', '0', '', ''],
259		'Ỉ'       => ['1', '0', '', ''],
260		'Ị'       => ['1', '0', '', ''],
261		'IA'      => ['1', '1', '', ''],
262		'IE'      => ['1', '1', '', ''],
263		'IO'      => ['1', '1', '', ''],
264		'IU'      => ['1', '1', '', ''],
265		'J'       => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
266		'K'       => ['0', '5', '5', '5'],
267		'KH'      => ['0', '5', '5', '5'],
268		'KS'      => ['0', '5', '54', '54'],
269		'L'       => ['0', '8', '8', '8'],
270		'Ľ'       => ['0', '8', '8', '8'],
271		'Ĺ'       => ['0', '8', '8', '8'],
272		'Ł'       => ['0', '7', '7', '7', '8', '8', '8'],
273		'LL'      => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
274		'LLY'     => ['0', '8', '8', '8', '1', '8', '8'],
275		'LY'      => ['0', '8', '8', '8', '1', '8', '8'],
276		'M'       => ['0', '6', '6', '6'],
277		'MĔ'      => ['0', '66', '66', '66'],
278		'MN'      => ['0', '66', '66', '66'],
279		'N'       => ['0', '6', '6', '6'],
280		'Ń'       => ['0', '6', '6', '6'],
281		'Ň'       => ['0', '6', '6', '6'],
282		'Ñ'       => ['0', '6', '6', '6'],
283		'NM'      => ['0', '66', '66', '66'],
284		'O'       => ['1', '0', '', ''],
285		'Ò'       => ['1', '0', '', ''],
286		'Ó'       => ['1', '0', '', ''],
287		'Ô'       => ['1', '0', '', ''],
288		'Õ'       => ['1', '0', '', ''],
289		'Ö'       => ['1', '0', '', ''],
290		'Ø'       => ['1', '0', '', ''],
291		'Ő'       => ['1', '0', '', ''],
292		'Œ'       => ['1', '0', '', ''],
293		'Ơ'       => ['1', '0', '', ''],
294		'Ọ'       => ['1', '0', '', ''],
295		'Ỏ'       => ['1', '0', '', ''],
296		'Ố'       => ['1', '0', '', ''],
297		'Ồ'       => ['1', '0', '', ''],
298		'Ổ'       => ['1', '0', '', ''],
299		'Ỗ'       => ['1', '0', '', ''],
300		'Ộ'       => ['1', '0', '', ''],
301		'Ớ'       => ['1', '0', '', ''],
302		'Ờ'       => ['1', '0', '', ''],
303		'Ở'       => ['1', '0', '', ''],
304		'Ỡ'       => ['1', '0', '', ''],
305		'Ợ'       => ['1', '0', '', ''],
306		'OE'      => ['1', '0', '', ''],
307		'OI'      => ['1', '0', '1', ''],
308		'OJ'      => ['1', '0', '1', ''],
309		'OU'      => ['1', '0', '', ''],
310		'OY'      => ['1', '0', '1', ''],
311		'P'       => ['0', '7', '7', '7'],
312		'PF'      => ['0', '7', '7', '7'],
313		'PH'      => ['0', '7', '7', '7'],
314		'Q'       => ['0', '5', '5', '5'],
315		'R'       => ['0', '9', '9', '9'],
316		'Ř'       => ['0', '4', '4', '4'],
317		'RS'      => ['0', '4', '4', '4', '94', '94', '94'],
318		'RZ'      => ['0', '4', '4', '4', '94', '94', '94'],
319		'S'       => ['0', '4', '4', '4'],
320		'Ś'       => ['0', '4', '4', '4'],
321		'Š'       => ['0', '4', '4', '4'],
322		'Ş'       => ['0', '4', '4', '4'],
323		'SC'      => ['0', '2', '4', '4'],
324		'ŠČ'      => ['0', '2', '4', '4'],
325		'SCH'     => ['0', '4', '4', '4'],
326		'SCHD'    => ['0', '2', '43', '43'],
327		'SCHT'    => ['0', '2', '43', '43'],
328		'SCHTCH'  => ['0', '2', '4', '4'],
329		'SCHTSCH' => ['0', '2', '4', '4'],
330		'SCHTSH'  => ['0', '2', '4', '4'],
331		'SD'      => ['0', '2', '43', '43'],
332		'SH'      => ['0', '4', '4', '4'],
333		'SHCH'    => ['0', '2', '4', '4'],
334		'SHD'     => ['0', '2', '43', '43'],
335		'SHT'     => ['0', '2', '43', '43'],
336		'SHTCH'   => ['0', '2', '4', '4'],
337		'SHTSH'   => ['0', '2', '4', '4'],
338		'ß'       => ['0', '', '4', '4'],
339		'ST'      => ['0', '2', '43', '43'],
340		'STCH'    => ['0', '2', '4', '4'],
341		'STRS'    => ['0', '2', '4', '4'],
342		'STRZ'    => ['0', '2', '4', '4'],
343		'STSCH'   => ['0', '2', '4', '4'],
344		'STSH'    => ['0', '2', '4', '4'],
345		'SSZ'     => ['0', '4', '4', '4'],
346		'SZ'      => ['0', '4', '4', '4'],
347		'SZCS'    => ['0', '2', '4', '4'],
348		'SZCZ'    => ['0', '2', '4', '4'],
349		'SZD'     => ['0', '2', '43', '43'],
350		'SZT'     => ['0', '2', '43', '43'],
351		'T'       => ['0', '3', '3', '3'],
352		'Ť'       => ['0', '3', '3', '3'],
353		'Ţ'       => ['0', '3', '3', '3', '4', '4', '4'],
354		'TC'      => ['0', '4', '4', '4'],
355		'TCH'     => ['0', '4', '4', '4'],
356		'TH'      => ['0', '3', '3', '3'],
357		'TRS'     => ['0', '4', '4', '4'],
358		'TRZ'     => ['0', '4', '4', '4'],
359		'TS'      => ['0', '4', '4', '4'],
360		'TSCH'    => ['0', '4', '4', '4'],
361		'TSH'     => ['0', '4', '4', '4'],
362		'TSZ'     => ['0', '4', '4', '4'],
363		'TTCH'    => ['0', '4', '4', '4'],
364		'TTS'     => ['0', '4', '4', '4'],
365		'TTSCH'   => ['0', '4', '4', '4'],
366		'TTSZ'    => ['0', '4', '4', '4'],
367		'TTZ'     => ['0', '4', '4', '4'],
368		'TZ'      => ['0', '4', '4', '4'],
369		'TZS'     => ['0', '4', '4', '4'],
370		'U'       => ['1', '0', '', ''],
371		'Ù'       => ['1', '0', '', ''],
372		'Ú'       => ['1', '0', '', ''],
373		'Û'       => ['1', '0', '', ''],
374		'Ü'       => ['1', '0', '', ''],
375		'Ũ'       => ['1', '0', '', ''],
376		'Ū'       => ['1', '0', '', ''],
377		'Ů'       => ['1', '0', '', ''],
378		'Ű'       => ['1', '0', '', ''],
379		'Ų'       => ['1', '0', '', ''],
380		'Ư'       => ['1', '0', '', ''],
381		'Ụ'       => ['1', '0', '', ''],
382		'Ủ'       => ['1', '0', '', ''],
383		'Ứ'       => ['1', '0', '', ''],
384		'Ừ'       => ['1', '0', '', ''],
385		'Ử'       => ['1', '0', '', ''],
386		'Ữ'       => ['1', '0', '', ''],
387		'Ự'       => ['1', '0', '', ''],
388		'UE'      => ['1', '0', '', ''],
389		'UI'      => ['1', '0', '1', ''],
390		'UJ'      => ['1', '0', '1', ''],
391		'UY'      => ['1', '0', '1', ''],
392		'UW'      => ['1', '0', '1', '', '0', '7', '7'],
393		'V'       => ['0', '7', '7', '7'],
394		'W'       => ['0', '7', '7', '7'],
395		'X'       => ['0', '5', '54', '54'],
396		'Y'       => ['1', '1', '', ''],
397		'Ý'       => ['1', '1', '', ''],
398		'Ỳ'       => ['1', '1', '', ''],
399		'Ỵ'       => ['1', '1', '', ''],
400		'Ỷ'       => ['1', '1', '', ''],
401		'Ỹ'       => ['1', '1', '', ''],
402		'Z'       => ['0', '4', '4', '4'],
403		'Ź'       => ['0', '4', '4', '4'],
404		'Ż'       => ['0', '4', '4', '4'],
405		'Ž'       => ['0', '4', '4', '4'],
406		'ZD'      => ['0', '2', '43', '43'],
407		'ZDZ'     => ['0', '2', '4', '4'],
408		'ZDZH'    => ['0', '2', '4', '4'],
409		'ZH'      => ['0', '4', '4', '4'],
410		'ZHD'     => ['0', '2', '43', '43'],
411		'ZHDZH'   => ['0', '2', '4', '4'],
412		'ZS'      => ['0', '4', '4', '4'],
413		'ZSCH'    => ['0', '4', '4', '4'],
414		'ZSH'     => ['0', '4', '4', '4'],
415		'ZZS'     => ['0', '4', '4', '4'],
416		// Cyrillic alphabet
417		'А'   => ['1', '0', '', ''],
418		'Б'   => ['0', '7', '7', '7'],
419		'В'   => ['0', '7', '7', '7'],
420		'Г'   => ['0', '5', '5', '5'],
421		'Д'   => ['0', '3', '3', '3'],
422		'ДЗ'  => ['0', '4', '4', '4'],
423		'Е'   => ['1', '0', '', ''],
424		'Ё'   => ['1', '0', '', ''],
425		'Ж'   => ['0', '4', '4', '4'],
426		'З'   => ['0', '4', '4', '4'],
427		'И'   => ['1', '0', '', ''],
428		'Й'   => ['1', '1', '', '', '4', '4', '4'],
429		'К'   => ['0', '5', '5', '5'],
430		'Л'   => ['0', '8', '8', '8'],
431		'М'   => ['0', '6', '6', '6'],
432		'Н'   => ['0', '6', '6', '6'],
433		'О'   => ['1', '0', '', ''],
434		'П'   => ['0', '7', '7', '7'],
435		'Р'   => ['0', '9', '9', '9'],
436		'РЖ'  => ['0', '4', '4', '4'],
437		'С'   => ['0', '4', '4', '4'],
438		'Т'   => ['0', '3', '3', '3'],
439		'У'   => ['1', '0', '', ''],
440		'Ф'   => ['0', '7', '7', '7'],
441		'Х'   => ['0', '5', '5', '5'],
442		'Ц'   => ['0', '4', '4', '4'],
443		'Ч'   => ['0', '4', '4', '4'],
444		'Ш'   => ['0', '4', '4', '4'],
445		'Щ'   => ['0', '2', '4', '4'],
446		'Ъ'   => ['0', '', '', ''],
447		'Ы'   => ['0', '1', '', ''],
448		'Ь'   => ['0', '', '', ''],
449		'Э'   => ['1', '0', '', ''],
450		'Ю'   => ['0', '1', '', ''],
451		'Я'   => ['0', '1', '', ''],
452		// Greek alphabet
453		'Α'   => ['1', '0', '', ''],
454		'Ά'   => ['1', '0', '', ''],
455		'ΑΙ'  => ['1', '0', '1', ''],
456		'ΑΥ'  => ['1', '0', '1', ''],
457		'Β'   => ['0', '7', '7', '7'],
458		'Γ'   => ['0', '5', '5', '5'],
459		'Δ'   => ['0', '3', '3', '3'],
460		'Ε'   => ['1', '0', '', ''],
461		'Έ'   => ['1', '0', '', ''],
462		'ΕΙ'  => ['1', '0', '1', ''],
463		'ΕΥ'  => ['1', '1', '1', ''],
464		'Ζ'   => ['0', '4', '4', '4'],
465		'Η'   => ['1', '0', '', ''],
466		'Ή'   => ['1', '0', '', ''],
467		'Θ'   => ['0', '3', '3', '3'],
468		'Ι'   => ['1', '0', '', ''],
469		'Ί'   => ['1', '0', '', ''],
470		'Ϊ'   => ['1', '0', '', ''],
471		'ΐ'   => ['1', '0', '', ''],
472		'Κ'   => ['0', '5', '5', '5'],
473		'Λ'   => ['0', '8', '8', '8'],
474		'Μ'   => ['0', '6', '6', '6'],
475		'ΜΠ'  => ['0', '7', '7', '7'],
476		'Ν'   => ['0', '6', '6', '6'],
477		'ΝΤ'  => ['0', '3', '3', '3'],
478		'Ξ'   => ['0', '5', '54', '54'],
479		'Ο'   => ['1', '0', '', ''],
480		'Ό'   => ['1', '0', '', ''],
481		'ΟΙ'  => ['1', '0', '1', ''],
482		'ΟΥ'  => ['1', '0', '1', ''],
483		'Π'   => ['0', '7', '7', '7'],
484		'Ρ'   => ['0', '9', '9', '9'],
485		'Σ'   => ['0', '4', '4', '4'],
486		'ς'   => ['0', '', '', '4'],
487		'Τ'   => ['0', '3', '3', '3'],
488		'ΤΖ'  => ['0', '4', '4', '4'],
489		'ΤΣ'  => ['0', '4', '4', '4'],
490		'Υ'   => ['1', '1', '', ''],
491		'Ύ'   => ['1', '1', '', ''],
492		'Ϋ'   => ['1', '1', '', ''],
493		'ΰ'   => ['1', '1', '', ''],
494		'ΥΚ'  => ['1', '5', '5', '5'],
495		'ΥΥ'  => ['1', '65', '65', '65'],
496		'Φ'   => ['0', '7', '7', '7'],
497		'Χ'   => ['0', '5', '5', '5'],
498		'Ψ'   => ['0', '7', '7', '7'],
499		'Ω'   => ['1', '0', '', ''],
500		'Ώ'   => ['1', '0', '', ''],
501		// Hebrew alphabet
502		'א'     => ['1', '0', '', ''],
503		'או'    => ['1', '0', '7', ''],
504		'אג'    => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
505		'בב'    => ['0', '7', '7', '7', '77', '77', '77'],
506		'ב'     => ['0', '7', '7', '7'],
507		'גג'    => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
508		'גד'    => ['0', '43', '43', '43', '53', '53', '53'],
509		'גה'    => ['0', '45', '45', '45', '55', '55', '55'],
510		'גז'    => ['0', '44', '44', '44', '45', '45', '45'],
511		'גח'    => ['0', '45', '45', '45', '55', '55', '55'],
512		'גכ'    => ['0', '45', '45', '45', '55', '55', '55'],
513		'גך'    => ['0', '45', '45', '45', '55', '55', '55'],
514		'גצ'    => ['0', '44', '44', '44', '45', '45', '45'],
515		'גץ'    => ['0', '44', '44', '44', '45', '45', '45'],
516		'גק'    => ['0', '45', '45', '45', '54', '54', '54'],
517		'גש'    => ['0', '44', '44', '44', '54', '54', '54'],
518		'גת'    => ['0', '43', '43', '43', '53', '53', '53'],
519		'ג'     => ['0', '4', '4', '4', '5', '5', '5'],
520		'דז'    => ['0', '4', '4', '4'],
521		'דד'    => ['0', '3', '3', '3', '33', '33', '33'],
522		'דט'    => ['0', '33', '33', '33'],
523		'דש'    => ['0', '4', '4', '4'],
524		'דצ'    => ['0', '4', '4', '4'],
525		'דץ'    => ['0', '4', '4', '4'],
526		'ד'     => ['0', '3', '3', '3'],
527		'הג'    => ['0', '54', '54', '54', '55', '55', '55'],
528		'הכ'    => ['0', '55', '55', '55'],
529		'הח'    => ['0', '55', '55', '55'],
530		'הק'    => ['0', '55', '55', '55', '5', '5', '5'],
531		'הה'    => ['0', '5', '5', '', '55', '55', ''],
532		'ה'     => ['0', '5', '5', ''],
533		'וי'    => ['1', '', '', '', '7', '7', '7'],
534		'ו'     => ['1', '7', '7', '7', '7', '', ''],
535		'וו'    => ['1', '7', '7', '7', '7', '', ''],
536		'וופ'   => ['1', '7', '7', '7', '77', '77', '77'],
537		'זש'    => ['0', '4', '4', '4', '44', '44', '44'],
538		'זדז'   => ['0', '2', '4', '4'],
539		'ז'     => ['0', '4', '4', '4'],
540		'זג'    => ['0', '44', '44', '44', '45', '45', '45'],
541		'זז'    => ['0', '4', '4', '4', '44', '44', '44'],
542		'זס'    => ['0', '44', '44', '44'],
543		'זצ'    => ['0', '44', '44', '44'],
544		'זץ'    => ['0', '44', '44', '44'],
545		'חג'    => ['0', '54', '54', '54', '53', '53', '53'],
546		'חח'    => ['0', '5', '5', '5', '55', '55', '55'],
547		'חק'    => ['0', '55', '55', '55', '5', '5', '5'],
548		'חכ'    => ['0', '45', '45', '45', '55', '55', '55'],
549		'חס'    => ['0', '5', '54', '54'],
550		'חש'    => ['0', '5', '54', '54'],
551		'ח'     => ['0', '5', '5', '5'],
552		'טש'    => ['0', '4', '4', '4'],
553		'טד'    => ['0', '33', '33', '33'],
554		'טי'    => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
555		'טת'    => ['0', '33', '33', '33'],
556		'טט'    => ['0', '3', '3', '3', '33', '33', '33'],
557		'ט'     => ['0', '3', '3', '3'],
558		'י'     => ['1', '1', '', ''],
559		'יא'    => ['1', '1', '', '', '1', '1', '1'],
560		'כג'    => ['0', '55', '55', '55', '54', '54', '54'],
561		'כש'    => ['0', '5', '54', '54'],
562		'כס'    => ['0', '5', '54', '54'],
563		'ככ'    => ['0', '5', '5', '5', '55', '55', '55'],
564		'כך'    => ['0', '5', '5', '5', '55', '55', '55'],
565		'כ'     => ['0', '5', '5', '5'],
566		'כח'    => ['0', '55', '55', '55', '5', '5', '5'],
567		'ך'     => ['0', '', '5', '5'],
568		'ל'     => ['0', '8', '8', '8'],
569		'לל'    => ['0', '88', '88', '88', '8', '8', '8'],
570		'מנ'    => ['0', '66', '66', '66'],
571		'מן'    => ['0', '66', '66', '66'],
572		'ממ'    => ['0', '6', '6', '6', '66', '66', '66'],
573		'מם'    => ['0', '6', '6', '6', '66', '66', '66'],
574		'מ'     => ['0', '6', '6', '6'],
575		'ם'     => ['0', '', '6', '6'],
576		'נמ'    => ['0', '66', '66', '66'],
577		'נם'    => ['0', '66', '66', '66'],
578		'ננ'    => ['0', '6', '6', '6', '66', '66', '66'],
579		'נן'    => ['0', '6', '6', '6', '66', '66', '66'],
580		'נ'     => ['0', '6', '6', '6'],
581		'ן'     => ['0', '', '6', '6'],
582		'סתש'   => ['0', '2', '4', '4'],
583		'סתז'   => ['0', '2', '4', '4'],
584		'סטז'   => ['0', '2', '4', '4'],
585		'סטש'   => ['0', '2', '4', '4'],
586		'סצד'   => ['0', '2', '4', '4'],
587		'סט'    => ['0', '2', '4', '4', '43', '43', '43'],
588		'סת'    => ['0', '2', '4', '4', '43', '43', '43'],
589		'סג'    => ['0', '44', '44', '44', '4', '4', '4'],
590		'סס'    => ['0', '4', '4', '4', '44', '44', '44'],
591		'סצ'    => ['0', '44', '44', '44'],
592		'סץ'    => ['0', '44', '44', '44'],
593		'סז'    => ['0', '44', '44', '44'],
594		'סש'    => ['0', '44', '44', '44'],
595		'ס'     => ['0', '4', '4', '4'],
596		'ע'     => ['1', '0', '', ''],
597		'פב'    => ['0', '7', '7', '7', '77', '77', '77'],
598		'פוו'   => ['0', '7', '7', '7', '77', '77', '77'],
599		'פפ'    => ['0', '7', '7', '7', '77', '77', '77'],
600		'פף'    => ['0', '7', '7', '7', '77', '77', '77'],
601		'פ'     => ['0', '7', '7', '7'],
602		'ף'     => ['0', '', '7', '7'],
603		'צג'    => ['0', '44', '44', '44', '45', '45', '45'],
604		'צז'    => ['0', '44', '44', '44'],
605		'צס'    => ['0', '44', '44', '44'],
606		'צצ'    => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
607		'צץ'    => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
608		'צש'    => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
609		'צ'     => ['0', '4', '4', '4', '5', '5', '5'],
610		'ץ'     => ['0', '', '4', '4'],
611		'קה'    => ['0', '55', '55', '5'],
612		'קס'    => ['0', '5', '54', '54'],
613		'קש'    => ['0', '5', '54', '54'],
614		'קק'    => ['0', '5', '5', '5', '55', '55', '55'],
615		'קח'    => ['0', '55', '55', '55'],
616		'קכ'    => ['0', '55', '55', '55'],
617		'קך'    => ['0', '55', '55', '55'],
618		'קג'    => ['0', '55', '55', '55', '54', '54', '54'],
619		'ק'     => ['0', '5', '5', '5'],
620		'רר'    => ['0', '99', '99', '99', '9', '9', '9'],
621		'ר'     => ['0', '9', '9', '9'],
622		'שטז'   => ['0', '2', '4', '4'],
623		'שתש'   => ['0', '2', '4', '4'],
624		'שתז'   => ['0', '2', '4', '4'],
625		'שטש'   => ['0', '2', '4', '4'],
626		'שד'    => ['0', '2', '43', '43'],
627		'שז'    => ['0', '44', '44', '44'],
628		'שס'    => ['0', '44', '44', '44'],
629		'שת'    => ['0', '2', '43', '43'],
630		'שג'    => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
631		'שט'    => ['0', '2', '43', '43', '44', '44', '44'],
632		'שצ'    => ['0', '44', '44', '44', '45', '45', '45'],
633		'שץ'    => ['0', '44', '', '44', '45', '', '45'],
634		'שש'    => ['0', '4', '4', '4', '44', '44', '44'],
635		'ש'     => ['0', '4', '4', '4'],
636		'תג'    => ['0', '34', '34', '34'],
637		'תז'    => ['0', '34', '34', '34'],
638		'תש'    => ['0', '4', '4', '4'],
639		'תת'    => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
640		'ת'     => ['0', '3', '3', '3', '4', '4', '4'],
641		// Arabic alphabet
642		'ا'   => ['1', '0', '', ''],
643		'ب'   => ['0', '7', '7', '7'],
644		'ت'   => ['0', '3', '3', '3'],
645		'ث'   => ['0', '3', '3', '3'],
646		'ج'   => ['0', '4', '4', '4'],
647		'ح'   => ['0', '5', '5', '5'],
648		'خ'   => ['0', '5', '5', '5'],
649		'د'   => ['0', '3', '3', '3'],
650		'ذ'   => ['0', '3', '3', '3'],
651		'ر'   => ['0', '9', '9', '9'],
652		'ز'   => ['0', '4', '4', '4'],
653		'س'   => ['0', '4', '4', '4'],
654		'ش'   => ['0', '4', '4', '4'],
655		'ص'   => ['0', '4', '4', '4'],
656		'ض'   => ['0', '3', '3', '3'],
657		'ط'   => ['0', '3', '3', '3'],
658		'ظ'   => ['0', '4', '4', '4'],
659		'ع'   => ['1', '0', '', ''],
660		'غ'   => ['0', '0', '', ''],
661		'ف'   => ['0', '7', '7', '7'],
662		'ق'   => ['0', '5', '5', '5'],
663		'ك'   => ['0', '5', '5', '5'],
664		'ل'   => ['0', '8', '8', '8'],
665		'لا'  => ['0', '8', '8', '8'],
666		'م'   => ['0', '6', '6', '6'],
667		'ن'   => ['0', '6', '6', '6'],
668		'هن'  => ['0', '66', '66', '66'],
669		'ه'   => ['0', '5', '5', ''],
670		'و'   => ['1', '', '', '', '7', '', ''],
671		'ي'   => ['0', '1', '', ''],
672		'آ'   => ['0', '1', '', ''],
673		'ة'   => ['0', '', '', '3'],
674		'ی'   => ['0', '1', '', ''],
675		'ى'   => ['1', '1', '', ''],
676	];
677
678	/**
679	 * Calculate the Daitch-Mokotoff soundex for a word.
680	 *
681	 * @param string $name
682	 *
683	 * @return string[] List of possible DM codes for the word.
684	 */
685	private static function daitchMokotoffWord($name) {
686		// Apply special transformation rules to the input string
687		$name = I18N::strtoupper($name);
688		foreach (self::$transformNameTable as $transformRule) {
689			$name = str_replace($transformRule[0], $transformRule[1], $name);
690		}
691
692		// Initialize
693		$name_script = I18N::textScript($name);
694		$noVowels    = ($name_script == 'Hebr' || $name_script == 'Arab');
695
696		$lastPos         = strlen($name) - 1;
697		$currPos         = 0;
698		$state           = 1; // 1: start of input string, 2: before vowel, 3: other
699		$result          = []; // accumulate complete 6-digit D-M codes here
700		$partialResult   = []; // accumulate incomplete D-M codes here
701		$partialResult[] = ['!']; // initialize 1st partial result  ('!' stops "duplicate sound" check)
702
703		// Loop through the input string.
704		// Stop when the string is exhausted or when no more partial results remain
705		while (count($partialResult) !== 0 && $currPos <= $lastPos) {
706			// Find the DM coding table entry for the chunk at the current position
707			$thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
708			while ($thisEntry != '') {
709				if (isset(self::$dmsounds[$thisEntry])) {
710					break;
711				}
712				$thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
713			}
714			if ($thisEntry === '') {
715				$currPos++; // Not in table: advance pointer to next byte
716				continue; // and try again
717			}
718
719			$soundTableEntry = self::$dmsounds[$thisEntry];
720			$workingResult   = $partialResult;
721			$partialResult   = [];
722			$currPos += strlen($thisEntry);
723
724			// Not at beginning of input string
725			if ($state != 1) {
726				if ($currPos <= $lastPos) {
727					// Determine whether the next chunk is a vowel
728					$nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
729					while ($nextEntry != '') {
730						if (isset(self::$dmsounds[$nextEntry])) {
731							break;
732						}
733						$nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
734					}
735				} else {
736					$nextEntry = '';
737				}
738				if ($nextEntry != '' && self::$dmsounds[$nextEntry][0] != '0') {
739					$state = 2;
740				} else {
741					// Next chunk is a vowel
742					$state = 3;
743				}
744			}
745
746			while ($state < count($soundTableEntry)) {
747				// empty means 'ignore this sound in this state'
748				if ($soundTableEntry[$state] == '') {
749					foreach ($workingResult as $workingEntry) {
750						$tempEntry = $workingEntry;
751						$tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
752						$partialResult[] = $tempEntry;
753					}
754				} else {
755					foreach ($workingResult as $workingEntry) {
756						if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
757							// Incoming sound isn't a duplicate of the previous sound
758							$workingEntry[] = $soundTableEntry[$state];
759						} else {
760							// Incoming sound is a duplicate of the previous sound
761							// For Hebrew and Arabic, we need to create a pair of D-M sound codes,
762							// one of the pair with only a single occurrence of the duplicate sound,
763							// the other with both occurrences
764							if ($noVowels) {
765								$workingEntry[] = $soundTableEntry[$state];
766							}
767						}
768						if (count($workingEntry) < 7) {
769							$partialResult[] = $workingEntry;
770						} else {
771							// This is the 6th code in the sequence
772							// We're looking for 7 entries because the first is '!' and doesn't count
773							$tempResult = str_replace('!', '', implode('', $workingEntry));
774							// Only return codes from recognisable sounds
775							if ($tempResult) {
776								$result[] = substr($tempResult . '000000', 0, 6);
777							}
778						}
779					}
780				}
781				$state = $state + 3; // Advance to next triplet while keeping the same basic state
782			}
783		}
784
785		// Zero-fill and copy all remaining partial results
786		foreach ($partialResult as $workingEntry) {
787			$tempResult = str_replace('!', '', implode('', $workingEntry));
788			// Only return codes from recognisable sounds
789			if ($tempResult) {
790				$result[] = substr($tempResult . '000000', 0, 6);
791			}
792		}
793
794		return $result;
795	}
796}
797