xref: /webtrees/app/Soundex.php (revision ed53a9c0a479c5871db36fde10b575b33b9a209b)
1<?php
2/**
3 * webtrees: online genealogy
4 * Copyright (C) 2017 webtrees development team
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16namespace Fisharebest\Webtrees;
17
18/**
19 * Phonetic matching of strings.
20 */
21class Soundex {
22	/**
23	 * Which algorithms are supported.
24	 *
25	 * @return string[]
26	 */
27	public static function getAlgorithms() {
28		return [
29			'std' => /* I18N: http://en.wikipedia.org/wiki/Soundex */ I18N::translate('Russell'),
30			'dm'  => /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */ I18N::translate('Daitch-Mokotoff'),
31		];
32	}
33
34	/**
35	 * Is there a match between two soundex codes?
36	 *
37	 * @param string $soundex1
38	 * @param string $soundex2
39	 *
40	 * @return bool
41	 */
42	public static function compare($soundex1, $soundex2) {
43		if ($soundex1 && $soundex2) {
44			foreach (explode(':', $soundex1) as $code) {
45				if (strpos($soundex2, $code) !== false) {
46					return true;
47				}
48			}
49		}
50
51		return false;
52	}
53
54	/**
55	 * Generate Russell soundex codes for a given text.
56	 *
57	 * @param $text
58	 *
59	 * @return null|string
60	 */
61	public static function russell($text) {
62		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
63		$soundex_array = [];
64		foreach ($words as $word) {
65			$soundex = soundex($word);
66			// Only return codes from recognisable sounds
67			if ($soundex !== '0000') {
68				$soundex_array[] = $soundex;
69			}
70		}
71		// Combine words, e.g. “New York” as “Newyork”
72		if (count($words) > 1) {
73			$soundex_array[] = soundex(strtr($text, ' ', ''));
74		}
75		// A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
76		$soundex_array = array_slice(array_unique($soundex_array), 0, 51);
77
78		if ($soundex_array) {
79			return implode(':', $soundex_array);
80		} else {
81			return '';
82		}
83	}
84
85	/**
86	 * Generate Daitch–Mokotoff soundex codes for a given text.
87	 *
88	 * @param $text
89	 *
90	 * @return null|string
91	 */
92	public static function daitchMokotoff($text) {
93		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
94		$soundex_array = [];
95		foreach ($words as $word) {
96			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
97		}
98		// Combine words, e.g. “New York” as “Newyork”
99		if (count($words) > 1) {
100			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(strtr($text, ' ', '')));
101		}
102		// A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
103		$soundex_array = array_slice(array_unique($soundex_array), 0, 36);
104
105		if ($soundex_array) {
106			return implode(':', $soundex_array);
107		} else {
108			return '';
109		}
110	}
111
	// Determine the Daitch–Mokotoff Soundex code for a word
	// Original implementation by Gerry Kroll, and analysis by Meliza Amity

	/**
	 * Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
	 *
	 * daitchMokotoffWord() reads a chunk of this many bytes from the name and
	 * then shortens it one byte at a time until it matches a key of $dmsounds.
	 */
	const MAXCHAR = 7;
117
	/**
	 * Name transformation arrays.
	 * Used to transform the Name string to simplify the "sounds like" table.
	 * This is especially useful in Hebrew.
	 *
	 * Each array entry is a [from, to] pair. daitchMokotoffWord() applies the
	 * entries in the order listed, passing them to str_replace($from, $to, $name).
	 *
	 * Note about the use of "\x01":
	 * This code, which can’t legitimately occur in the kind of text we're dealing with,
	 * is used as a place-holder so that conditional string replacements can be done.
	 *
	 * NOTE(review): the two "from" values ending in '$' look like regular-expression
	 * end-of-string anchors, but str_replace() treats '$' as a literal character, so
	 * these two rules only match text that really contains '$'. Confirm the intended
	 * behaviour before changing it, because a fix would alter the generated codes.
	 *
	 * @var string[][]
	 */
	private static $transformNameTable = [
		// Force Yiddish ligatures to be treated as separate letters
		['װ', 'וו'],
		['ײ', 'יי'],
		['ױ', 'וי'],
		['בו', 'בע'],
		['פו', 'פע'],
		['ומ', 'עמ'],
		['ום', 'עם'],
		['ונ', 'ענ'],
		['ון', 'ען'],
		['וו', 'ב'],
		// Remove any place-holder already present in the text before using it below
		["\x01", ''],
		['ייה$', "\x01ה"],
		['ייע$', "\x01ע"],
		['יי', 'ע'],
		// Turn the place-holder back into the letters it was protecting
		["\x01", 'יי'],
	];
150
	/**
	 * The DM sound coding table is organized this way:
	 * key: a variable-length string holding the UTF-8 character sequence
	 * represented by the table entry. Currently, that string can be up to 7
	 * bytes long. This maximum length is defined by the class constant
	 * self::MAXCHAR.
	 *
	 * value: an array as follows:
	 * [0]:  '0' if the sequence is not a vowel, '1' if it is
	 * [1]:  sound value when this string is at the beginning of the word
	 * [2]:  sound value when this string is followed by a vowel
	 * [3]:  sound value for other cases
	 * [1],[2],[3] can be repeated several times to create branches in the code.
	 * An empty sound value means "ignore in this state".
	 *
	 * @var string[][]
	 */
168	private static $dmsounds = [
169		'A'       => ['1', '0', '', ''],
170		'À'       => ['1', '0', '', ''],
171		'Á'       => ['1', '0', '', ''],
172		'Â'       => ['1', '0', '', ''],
173		'Ã'       => ['1', '0', '', ''],
174		'Ä'       => ['1', '0', '1', '', '0', '', ''],
175		'Å'       => ['1', '0', '', ''],
176		'Ă'       => ['1', '0', '', ''],
177		'Ą'       => ['1', '', '', '', '', '', '6'],
178		'Ạ'       => ['1', '0', '', ''],
179		'Ả'       => ['1', '0', '', ''],
180		'Ấ'       => ['1', '0', '', ''],
181		'Ầ'       => ['1', '0', '', ''],
182		'Ẩ'       => ['1', '0', '', ''],
183		'Ẫ'       => ['1', '0', '', ''],
184		'Ậ'       => ['1', '0', '', ''],
185		'Ắ'       => ['1', '0', '', ''],
186		'Ằ'       => ['1', '0', '', ''],
187		'Ẳ'       => ['1', '0', '', ''],
188		'Ẵ'       => ['1', '0', '', ''],
189		'Ặ'       => ['1', '0', '', ''],
190		'AE'      => ['1', '0', '1', ''],
191		'Æ'       => ['1', '0', '1', ''],
192		'AI'      => ['1', '0', '1', ''],
193		'AJ'      => ['1', '0', '1', ''],
194		'AU'      => ['1', '0', '7', ''],
195		'AV'      => ['1', '0', '7', '', '7', '7', '7'],
196		'ÄU'      => ['1', '0', '1', ''],
197		'AY'      => ['1', '0', '1', ''],
198		'B'       => ['0', '7', '7', '7'],
199		'C'       => ['0', '5', '5', '5', '34', '4', '4'],
200		'Ć'       => ['0', '4', '4', '4'],
201		'Č'       => ['0', '4', '4', '4'],
202		'Ç'       => ['0', '4', '4', '4'],
203		'CH'      => ['0', '5', '5', '5', '34', '4', '4'],
204		'CHS'     => ['0', '5', '54', '54'],
205		'CK'      => ['0', '5', '5', '5', '45', '45', '45'],
206		'CCS'     => ['0', '4', '4', '4'],
207		'CS'      => ['0', '4', '4', '4'],
208		'CSZ'     => ['0', '4', '4', '4'],
209		'CZ'      => ['0', '4', '4', '4'],
210		'CZS'     => ['0', '4', '4', '4'],
211		'D'       => ['0', '3', '3', '3'],
212		'Ď'       => ['0', '3', '3', '3'],
213		'Đ'       => ['0', '3', '3', '3'],
214		'DRS'     => ['0', '4', '4', '4'],
215		'DRZ'     => ['0', '4', '4', '4'],
216		'DS'      => ['0', '4', '4', '4'],
217		'DSH'     => ['0', '4', '4', '4'],
218		'DSZ'     => ['0', '4', '4', '4'],
219		'DT'      => ['0', '3', '3', '3'],
220		'DDZ'     => ['0', '4', '4', '4'],
221		'DDZS'    => ['0', '4', '4', '4'],
222		'DZ'      => ['0', '4', '4', '4'],
223		'DŹ'      => ['0', '4', '4', '4'],
224		'DŻ'      => ['0', '4', '4', '4'],
225		'DZH'     => ['0', '4', '4', '4'],
226		'DZS'     => ['0', '4', '4', '4'],
227		'E'       => ['1', '0', '', ''],
228		'È'       => ['1', '0', '', ''],
229		'É'       => ['1', '0', '', ''],
230		'Ê'       => ['1', '0', '', ''],
231		'Ë'       => ['1', '0', '', ''],
232		'Ĕ'       => ['1', '0', '', ''],
233		'Ė'       => ['1', '0', '', ''],
234		'Ę'       => ['1', '', '', '6', '', '', ''],
235		'Ẹ'       => ['1', '0', '', ''],
236		'Ẻ'       => ['1', '0', '', ''],
237		'Ẽ'       => ['1', '0', '', ''],
238		'Ế'       => ['1', '0', '', ''],
239		'Ề'       => ['1', '0', '', ''],
240		'Ể'       => ['1', '0', '', ''],
241		'Ễ'       => ['1', '0', '', ''],
242		'Ệ'       => ['1', '0', '', ''],
243		'EAU'     => ['1', '0', '', ''],
244		'EI'      => ['1', '0', '1', ''],
245		'EJ'      => ['1', '0', '1', ''],
246		'EU'      => ['1', '1', '1', ''],
247		'EY'      => ['1', '0', '1', ''],
248		'F'       => ['0', '7', '7', '7'],
249		'FB'      => ['0', '7', '7', '7'],
250		'G'       => ['0', '5', '5', '5', '34', '4', '4'],
251		'Ğ'       => ['0', '', '', ''],
252		'GGY'     => ['0', '5', '5', '5'],
253		'GY'      => ['0', '5', '5', '5'],
254		'H'       => ['0', '5', '5', '', '5', '5', '5'],
255		'I'       => ['1', '0', '', ''],
256		'Ì'       => ['1', '0', '', ''],
257		'Í'       => ['1', '0', '', ''],
258		'Î'       => ['1', '0', '', ''],
259		'Ï'       => ['1', '0', '', ''],
260		'Ĩ'       => ['1', '0', '', ''],
261		'Į'       => ['1', '0', '', ''],
262		'İ'       => ['1', '0', '', ''],
263		'Ỉ'       => ['1', '0', '', ''],
264		'Ị'       => ['1', '0', '', ''],
265		'IA'      => ['1', '1', '', ''],
266		'IE'      => ['1', '1', '', ''],
267		'IO'      => ['1', '1', '', ''],
268		'IU'      => ['1', '1', '', ''],
269		'J'       => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
270		'K'       => ['0', '5', '5', '5'],
271		'KH'      => ['0', '5', '5', '5'],
272		'KS'      => ['0', '5', '54', '54'],
273		'L'       => ['0', '8', '8', '8'],
274		'Ľ'       => ['0', '8', '8', '8'],
275		'Ĺ'       => ['0', '8', '8', '8'],
276		'Ł'       => ['0', '7', '7', '7', '8', '8', '8'],
277		'LL'      => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
278		'LLY'     => ['0', '8', '8', '8', '1', '8', '8'],
279		'LY'      => ['0', '8', '8', '8', '1', '8', '8'],
280		'M'       => ['0', '6', '6', '6'],
281		'MĔ'      => ['0', '66', '66', '66'],
282		'MN'      => ['0', '66', '66', '66'],
283		'N'       => ['0', '6', '6', '6'],
284		'Ń'       => ['0', '6', '6', '6'],
285		'Ň'       => ['0', '6', '6', '6'],
286		'Ñ'       => ['0', '6', '6', '6'],
287		'NM'      => ['0', '66', '66', '66'],
288		'O'       => ['1', '0', '', ''],
289		'Ò'       => ['1', '0', '', ''],
290		'Ó'       => ['1', '0', '', ''],
291		'Ô'       => ['1', '0', '', ''],
292		'Õ'       => ['1', '0', '', ''],
293		'Ö'       => ['1', '0', '', ''],
294		'Ø'       => ['1', '0', '', ''],
295		'Ő'       => ['1', '0', '', ''],
296		'Œ'       => ['1', '0', '', ''],
297		'Ơ'       => ['1', '0', '', ''],
298		'Ọ'       => ['1', '0', '', ''],
299		'Ỏ'       => ['1', '0', '', ''],
300		'Ố'       => ['1', '0', '', ''],
301		'Ồ'       => ['1', '0', '', ''],
302		'Ổ'       => ['1', '0', '', ''],
303		'Ỗ'       => ['1', '0', '', ''],
304		'Ộ'       => ['1', '0', '', ''],
305		'Ớ'       => ['1', '0', '', ''],
306		'Ờ'       => ['1', '0', '', ''],
307		'Ở'       => ['1', '0', '', ''],
308		'Ỡ'       => ['1', '0', '', ''],
309		'Ợ'       => ['1', '0', '', ''],
310		'OE'      => ['1', '0', '', ''],
311		'OI'      => ['1', '0', '1', ''],
312		'OJ'      => ['1', '0', '1', ''],
313		'OU'      => ['1', '0', '', ''],
314		'OY'      => ['1', '0', '1', ''],
315		'P'       => ['0', '7', '7', '7'],
316		'PF'      => ['0', '7', '7', '7'],
317		'PH'      => ['0', '7', '7', '7'],
318		'Q'       => ['0', '5', '5', '5'],
319		'R'       => ['0', '9', '9', '9'],
320		'Ř'       => ['0', '4', '4', '4'],
321		'RS'      => ['0', '4', '4', '4', '94', '94', '94'],
322		'RZ'      => ['0', '4', '4', '4', '94', '94', '94'],
323		'S'       => ['0', '4', '4', '4'],
324		'Ś'       => ['0', '4', '4', '4'],
325		'Š'       => ['0', '4', '4', '4'],
326		'Ş'       => ['0', '4', '4', '4'],
327		'SC'      => ['0', '2', '4', '4'],
328		'ŠČ'      => ['0', '2', '4', '4'],
329		'SCH'     => ['0', '4', '4', '4'],
330		'SCHD'    => ['0', '2', '43', '43'],
331		'SCHT'    => ['0', '2', '43', '43'],
332		'SCHTCH'  => ['0', '2', '4', '4'],
333		'SCHTSCH' => ['0', '2', '4', '4'],
334		'SCHTSH'  => ['0', '2', '4', '4'],
335		'SD'      => ['0', '2', '43', '43'],
336		'SH'      => ['0', '4', '4', '4'],
337		'SHCH'    => ['0', '2', '4', '4'],
338		'SHD'     => ['0', '2', '43', '43'],
339		'SHT'     => ['0', '2', '43', '43'],
340		'SHTCH'   => ['0', '2', '4', '4'],
341		'SHTSH'   => ['0', '2', '4', '4'],
342		'ß'       => ['0', '', '4', '4'],
343		'ST'      => ['0', '2', '43', '43'],
344		'STCH'    => ['0', '2', '4', '4'],
345		'STRS'    => ['0', '2', '4', '4'],
346		'STRZ'    => ['0', '2', '4', '4'],
347		'STSCH'   => ['0', '2', '4', '4'],
348		'STSH'    => ['0', '2', '4', '4'],
349		'SSZ'     => ['0', '4', '4', '4'],
350		'SZ'      => ['0', '4', '4', '4'],
351		'SZCS'    => ['0', '2', '4', '4'],
352		'SZCZ'    => ['0', '2', '4', '4'],
353		'SZD'     => ['0', '2', '43', '43'],
354		'SZT'     => ['0', '2', '43', '43'],
355		'T'       => ['0', '3', '3', '3'],
356		'Ť'       => ['0', '3', '3', '3'],
357		'Ţ'       => ['0', '3', '3', '3', '4', '4', '4'],
358		'TC'      => ['0', '4', '4', '4'],
359		'TCH'     => ['0', '4', '4', '4'],
360		'TH'      => ['0', '3', '3', '3'],
361		'TRS'     => ['0', '4', '4', '4'],
362		'TRZ'     => ['0', '4', '4', '4'],
363		'TS'      => ['0', '4', '4', '4'],
364		'TSCH'    => ['0', '4', '4', '4'],
365		'TSH'     => ['0', '4', '4', '4'],
366		'TSZ'     => ['0', '4', '4', '4'],
367		'TTCH'    => ['0', '4', '4', '4'],
368		'TTS'     => ['0', '4', '4', '4'],
369		'TTSCH'   => ['0', '4', '4', '4'],
370		'TTSZ'    => ['0', '4', '4', '4'],
371		'TTZ'     => ['0', '4', '4', '4'],
372		'TZ'      => ['0', '4', '4', '4'],
373		'TZS'     => ['0', '4', '4', '4'],
374		'U'       => ['1', '0', '', ''],
375		'Ù'       => ['1', '0', '', ''],
376		'Ú'       => ['1', '0', '', ''],
377		'Û'       => ['1', '0', '', ''],
378		'Ü'       => ['1', '0', '', ''],
379		'Ũ'       => ['1', '0', '', ''],
380		'Ū'       => ['1', '0', '', ''],
381		'Ů'       => ['1', '0', '', ''],
382		'Ű'       => ['1', '0', '', ''],
383		'Ų'       => ['1', '0', '', ''],
384		'Ư'       => ['1', '0', '', ''],
385		'Ụ'       => ['1', '0', '', ''],
386		'Ủ'       => ['1', '0', '', ''],
387		'Ứ'       => ['1', '0', '', ''],
388		'Ừ'       => ['1', '0', '', ''],
389		'Ử'       => ['1', '0', '', ''],
390		'Ữ'       => ['1', '0', '', ''],
391		'Ự'       => ['1', '0', '', ''],
392		'UE'      => ['1', '0', '', ''],
393		'UI'      => ['1', '0', '1', ''],
394		'UJ'      => ['1', '0', '1', ''],
395		'UY'      => ['1', '0', '1', ''],
396		'UW'      => ['1', '0', '1', '', '0', '7', '7'],
397		'V'       => ['0', '7', '7', '7'],
398		'W'       => ['0', '7', '7', '7'],
399		'X'       => ['0', '5', '54', '54'],
400		'Y'       => ['1', '1', '', ''],
401		'Ý'       => ['1', '1', '', ''],
402		'Ỳ'       => ['1', '1', '', ''],
403		'Ỵ'       => ['1', '1', '', ''],
404		'Ỷ'       => ['1', '1', '', ''],
405		'Ỹ'       => ['1', '1', '', ''],
406		'Z'       => ['0', '4', '4', '4'],
407		'Ź'       => ['0', '4', '4', '4'],
408		'Ż'       => ['0', '4', '4', '4'],
409		'Ž'       => ['0', '4', '4', '4'],
410		'ZD'      => ['0', '2', '43', '43'],
411		'ZDZ'     => ['0', '2', '4', '4'],
412		'ZDZH'    => ['0', '2', '4', '4'],
413		'ZH'      => ['0', '4', '4', '4'],
414		'ZHD'     => ['0', '2', '43', '43'],
415		'ZHDZH'   => ['0', '2', '4', '4'],
416		'ZS'      => ['0', '4', '4', '4'],
417		'ZSCH'    => ['0', '4', '4', '4'],
418		'ZSH'     => ['0', '4', '4', '4'],
419		'ZZS'     => ['0', '4', '4', '4'],
420		// Cyrillic alphabet
421		'А'   => ['1', '0', '', ''],
422		'Б'   => ['0', '7', '7', '7'],
423		'В'   => ['0', '7', '7', '7'],
424		'Г'   => ['0', '5', '5', '5'],
425		'Д'   => ['0', '3', '3', '3'],
426		'ДЗ'  => ['0', '4', '4', '4'],
427		'Е'   => ['1', '0', '', ''],
428		'Ё'   => ['1', '0', '', ''],
429		'Ж'   => ['0', '4', '4', '4'],
430		'З'   => ['0', '4', '4', '4'],
431		'И'   => ['1', '0', '', ''],
432		'Й'   => ['1', '1', '', '', '4', '4', '4'],
433		'К'   => ['0', '5', '5', '5'],
434		'Л'   => ['0', '8', '8', '8'],
435		'М'   => ['0', '6', '6', '6'],
436		'Н'   => ['0', '6', '6', '6'],
437		'О'   => ['1', '0', '', ''],
438		'П'   => ['0', '7', '7', '7'],
439		'Р'   => ['0', '9', '9', '9'],
440		'РЖ'  => ['0', '4', '4', '4'],
441		'С'   => ['0', '4', '4', '4'],
442		'Т'   => ['0', '3', '3', '3'],
443		'У'   => ['1', '0', '', ''],
444		'Ф'   => ['0', '7', '7', '7'],
445		'Х'   => ['0', '5', '5', '5'],
446		'Ц'   => ['0', '4', '4', '4'],
447		'Ч'   => ['0', '4', '4', '4'],
448		'Ш'   => ['0', '4', '4', '4'],
449		'Щ'   => ['0', '2', '4', '4'],
450		'Ъ'   => ['0', '', '', ''],
451		'Ы'   => ['0', '1', '', ''],
452		'Ь'   => ['0', '', '', ''],
453		'Э'   => ['1', '0', '', ''],
454		'Ю'   => ['0', '1', '', ''],
455		'Я'   => ['0', '1', '', ''],
456		// Greek alphabet
457		'Α'   => ['1', '0', '', ''],
458		'Ά'   => ['1', '0', '', ''],
459		'ΑΙ'  => ['1', '0', '1', ''],
460		'ΑΥ'  => ['1', '0', '1', ''],
461		'Β'   => ['0', '7', '7', '7'],
462		'Γ'   => ['0', '5', '5', '5'],
463		'Δ'   => ['0', '3', '3', '3'],
464		'Ε'   => ['1', '0', '', ''],
465		'Έ'   => ['1', '0', '', ''],
466		'ΕΙ'  => ['1', '0', '1', ''],
467		'ΕΥ'  => ['1', '1', '1', ''],
468		'Ζ'   => ['0', '4', '4', '4'],
469		'Η'   => ['1', '0', '', ''],
470		'Ή'   => ['1', '0', '', ''],
471		'Θ'   => ['0', '3', '3', '3'],
472		'Ι'   => ['1', '0', '', ''],
473		'Ί'   => ['1', '0', '', ''],
474		'Ϊ'   => ['1', '0', '', ''],
475		'ΐ'   => ['1', '0', '', ''],
476		'Κ'   => ['0', '5', '5', '5'],
477		'Λ'   => ['0', '8', '8', '8'],
478		'Μ'   => ['0', '6', '6', '6'],
479		'ΜΠ'  => ['0', '7', '7', '7'],
480		'Ν'   => ['0', '6', '6', '6'],
481		'ΝΤ'  => ['0', '3', '3', '3'],
482		'Ξ'   => ['0', '5', '54', '54'],
483		'Ο'   => ['1', '0', '', ''],
484		'Ό'   => ['1', '0', '', ''],
485		'ΟΙ'  => ['1', '0', '1', ''],
486		'ΟΥ'  => ['1', '0', '1', ''],
487		'Π'   => ['0', '7', '7', '7'],
488		'Ρ'   => ['0', '9', '9', '9'],
489		'Σ'   => ['0', '4', '4', '4'],
490		'ς'   => ['0', '', '', '4'],
491		'Τ'   => ['0', '3', '3', '3'],
492		'ΤΖ'  => ['0', '4', '4', '4'],
493		'ΤΣ'  => ['0', '4', '4', '4'],
494		'Υ'   => ['1', '1', '', ''],
495		'Ύ'   => ['1', '1', '', ''],
496		'Ϋ'   => ['1', '1', '', ''],
497		'ΰ'   => ['1', '1', '', ''],
498		'ΥΚ'  => ['1', '5', '5', '5'],
499		'ΥΥ'  => ['1', '65', '65', '65'],
500		'Φ'   => ['0', '7', '7', '7'],
501		'Χ'   => ['0', '5', '5', '5'],
502		'Ψ'   => ['0', '7', '7', '7'],
503		'Ω'   => ['1', '0', '', ''],
504		'Ώ'   => ['1', '0', '', ''],
505		// Hebrew alphabet
506		'א'     => ['1', '0', '', ''],
507		'או'    => ['1', '0', '7', ''],
508		'אג'    => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
509		'בב'    => ['0', '7', '7', '7', '77', '77', '77'],
510		'ב'     => ['0', '7', '7', '7'],
511		'גג'    => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
512		'גד'    => ['0', '43', '43', '43', '53', '53', '53'],
513		'גה'    => ['0', '45', '45', '45', '55', '55', '55'],
514		'גז'    => ['0', '44', '44', '44', '45', '45', '45'],
515		'גח'    => ['0', '45', '45', '45', '55', '55', '55'],
516		'גכ'    => ['0', '45', '45', '45', '55', '55', '55'],
517		'גך'    => ['0', '45', '45', '45', '55', '55', '55'],
518		'גצ'    => ['0', '44', '44', '44', '45', '45', '45'],
519		'גץ'    => ['0', '44', '44', '44', '45', '45', '45'],
520		'גק'    => ['0', '45', '45', '45', '54', '54', '54'],
521		'גש'    => ['0', '44', '44', '44', '54', '54', '54'],
522		'גת'    => ['0', '43', '43', '43', '53', '53', '53'],
523		'ג'     => ['0', '4', '4', '4', '5', '5', '5'],
524		'דז'    => ['0', '4', '4', '4'],
525		'דד'    => ['0', '3', '3', '3', '33', '33', '33'],
526		'דט'    => ['0', '33', '33', '33'],
527		'דש'    => ['0', '4', '4', '4'],
528		'דצ'    => ['0', '4', '4', '4'],
529		'דץ'    => ['0', '4', '4', '4'],
530		'ד'     => ['0', '3', '3', '3'],
531		'הג'    => ['0', '54', '54', '54', '55', '55', '55'],
532		'הכ'    => ['0', '55', '55', '55'],
533		'הח'    => ['0', '55', '55', '55'],
534		'הק'    => ['0', '55', '55', '55', '5', '5', '5'],
535		'הה'    => ['0', '5', '5', '', '55', '55', ''],
536		'ה'     => ['0', '5', '5', ''],
537		'וי'    => ['1', '', '', '', '7', '7', '7'],
538		'ו'     => ['1', '7', '7', '7', '7', '', ''],
539		'וו'    => ['1', '7', '7', '7', '7', '', ''],
540		'וופ'   => ['1', '7', '7', '7', '77', '77', '77'],
541		'זש'    => ['0', '4', '4', '4', '44', '44', '44'],
542		'זדז'   => ['0', '2', '4', '4'],
543		'ז'     => ['0', '4', '4', '4'],
544		'זג'    => ['0', '44', '44', '44', '45', '45', '45'],
545		'זז'    => ['0', '4', '4', '4', '44', '44', '44'],
546		'זס'    => ['0', '44', '44', '44'],
547		'זצ'    => ['0', '44', '44', '44'],
548		'זץ'    => ['0', '44', '44', '44'],
549		'חג'    => ['0', '54', '54', '54', '53', '53', '53'],
550		'חח'    => ['0', '5', '5', '5', '55', '55', '55'],
551		'חק'    => ['0', '55', '55', '55', '5', '5', '5'],
552		'חכ'    => ['0', '45', '45', '45', '55', '55', '55'],
553		'חס'    => ['0', '5', '54', '54'],
554		'חש'    => ['0', '5', '54', '54'],
555		'ח'     => ['0', '5', '5', '5'],
556		'טש'    => ['0', '4', '4', '4'],
557		'טד'    => ['0', '33', '33', '33'],
558		'טי'    => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
559		'טת'    => ['0', '33', '33', '33'],
560		'טט'    => ['0', '3', '3', '3', '33', '33', '33'],
561		'ט'     => ['0', '3', '3', '3'],
562		'י'     => ['1', '1', '', ''],
563		'יא'    => ['1', '1', '', '', '1', '1', '1'],
564		'כג'    => ['0', '55', '55', '55', '54', '54', '54'],
565		'כש'    => ['0', '5', '54', '54'],
566		'כס'    => ['0', '5', '54', '54'],
567		'ככ'    => ['0', '5', '5', '5', '55', '55', '55'],
568		'כך'    => ['0', '5', '5', '5', '55', '55', '55'],
569		'כ'     => ['0', '5', '5', '5'],
570		'כח'    => ['0', '55', '55', '55', '5', '5', '5'],
571		'ך'     => ['0', '', '5', '5'],
572		'ל'     => ['0', '8', '8', '8'],
573		'לל'    => ['0', '88', '88', '88', '8', '8', '8'],
574		'מנ'    => ['0', '66', '66', '66'],
575		'מן'    => ['0', '66', '66', '66'],
576		'ממ'    => ['0', '6', '6', '6', '66', '66', '66'],
577		'מם'    => ['0', '6', '6', '6', '66', '66', '66'],
578		'מ'     => ['0', '6', '6', '6'],
579		'ם'     => ['0', '', '6', '6'],
580		'נמ'    => ['0', '66', '66', '66'],
581		'נם'    => ['0', '66', '66', '66'],
582		'ננ'    => ['0', '6', '6', '6', '66', '66', '66'],
583		'נן'    => ['0', '6', '6', '6', '66', '66', '66'],
584		'נ'     => ['0', '6', '6', '6'],
585		'ן'     => ['0', '', '6', '6'],
586		'סתש'   => ['0', '2', '4', '4'],
587		'סתז'   => ['0', '2', '4', '4'],
588		'סטז'   => ['0', '2', '4', '4'],
589		'סטש'   => ['0', '2', '4', '4'],
590		'סצד'   => ['0', '2', '4', '4'],
591		'סט'    => ['0', '2', '4', '4', '43', '43', '43'],
592		'סת'    => ['0', '2', '4', '4', '43', '43', '43'],
593		'סג'    => ['0', '44', '44', '44', '4', '4', '4'],
594		'סס'    => ['0', '4', '4', '4', '44', '44', '44'],
595		'סצ'    => ['0', '44', '44', '44'],
596		'סץ'    => ['0', '44', '44', '44'],
597		'סז'    => ['0', '44', '44', '44'],
598		'סש'    => ['0', '44', '44', '44'],
599		'ס'     => ['0', '4', '4', '4'],
600		'ע'     => ['1', '0', '', ''],
601		'פב'    => ['0', '7', '7', '7', '77', '77', '77'],
602		'פוו'   => ['0', '7', '7', '7', '77', '77', '77'],
603		'פפ'    => ['0', '7', '7', '7', '77', '77', '77'],
604		'פף'    => ['0', '7', '7', '7', '77', '77', '77'],
605		'פ'     => ['0', '7', '7', '7'],
606		'ף'     => ['0', '', '7', '7'],
607		'צג'    => ['0', '44', '44', '44', '45', '45', '45'],
608		'צז'    => ['0', '44', '44', '44'],
609		'צס'    => ['0', '44', '44', '44'],
610		'צצ'    => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
611		'צץ'    => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
612		'צש'    => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
613		'צ'     => ['0', '4', '4', '4', '5', '5', '5'],
614		'ץ'     => ['0', '', '4', '4'],
615		'קה'    => ['0', '55', '55', '5'],
616		'קס'    => ['0', '5', '54', '54'],
617		'קש'    => ['0', '5', '54', '54'],
618		'קק'    => ['0', '5', '5', '5', '55', '55', '55'],
619		'קח'    => ['0', '55', '55', '55'],
620		'קכ'    => ['0', '55', '55', '55'],
621		'קך'    => ['0', '55', '55', '55'],
622		'קג'    => ['0', '55', '55', '55', '54', '54', '54'],
623		'ק'     => ['0', '5', '5', '5'],
624		'רר'    => ['0', '99', '99', '99', '9', '9', '9'],
625		'ר'     => ['0', '9', '9', '9'],
626		'שטז'   => ['0', '2', '4', '4'],
627		'שתש'   => ['0', '2', '4', '4'],
628		'שתז'   => ['0', '2', '4', '4'],
629		'שטש'   => ['0', '2', '4', '4'],
630		'שד'    => ['0', '2', '43', '43'],
631		'שז'    => ['0', '44', '44', '44'],
632		'שס'    => ['0', '44', '44', '44'],
633		'שת'    => ['0', '2', '43', '43'],
634		'שג'    => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
635		'שט'    => ['0', '2', '43', '43', '44', '44', '44'],
636		'שצ'    => ['0', '44', '44', '44', '45', '45', '45'],
637		'שץ'    => ['0', '44', '', '44', '45', '', '45'],
638		'שש'    => ['0', '4', '4', '4', '44', '44', '44'],
639		'ש'     => ['0', '4', '4', '4'],
640		'תג'    => ['0', '34', '34', '34'],
641		'תז'    => ['0', '34', '34', '34'],
642		'תש'    => ['0', '4', '4', '4'],
643		'תת'    => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
644		'ת'     => ['0', '3', '3', '3', '4', '4', '4'],
645		// Arabic alphabet
646		'ا'   => ['1', '0', '', ''],
647		'ب'   => ['0', '7', '7', '7'],
648		'ت'   => ['0', '3', '3', '3'],
649		'ث'   => ['0', '3', '3', '3'],
650		'ج'   => ['0', '4', '4', '4'],
651		'ح'   => ['0', '5', '5', '5'],
652		'خ'   => ['0', '5', '5', '5'],
653		'د'   => ['0', '3', '3', '3'],
654		'ذ'   => ['0', '3', '3', '3'],
655		'ر'   => ['0', '9', '9', '9'],
656		'ز'   => ['0', '4', '4', '4'],
657		'س'   => ['0', '4', '4', '4'],
658		'ش'   => ['0', '4', '4', '4'],
659		'ص'   => ['0', '4', '4', '4'],
660		'ض'   => ['0', '3', '3', '3'],
661		'ط'   => ['0', '3', '3', '3'],
662		'ظ'   => ['0', '4', '4', '4'],
663		'ع'   => ['1', '0', '', ''],
664		'غ'   => ['0', '0', '', ''],
665		'ف'   => ['0', '7', '7', '7'],
666		'ق'   => ['0', '5', '5', '5'],
667		'ك'   => ['0', '5', '5', '5'],
668		'ل'   => ['0', '8', '8', '8'],
669		'لا'  => ['0', '8', '8', '8'],
670		'م'   => ['0', '6', '6', '6'],
671		'ن'   => ['0', '6', '6', '6'],
672		'هن'  => ['0', '66', '66', '66'],
673		'ه'   => ['0', '5', '5', ''],
674		'و'   => ['1', '', '', '', '7', '', ''],
675		'ي'   => ['0', '1', '', ''],
676		'آ'   => ['0', '1', '', ''],
677		'ة'   => ['0', '', '', '3'],
678		'ی'   => ['0', '1', '', ''],
679		'ى'   => ['1', '1', '', ''],
680	];
681
682	/**
683	 * Calculate the Daitch-Mokotoff soundex for a word.
684	 *
685	 * @param string $name
686	 *
687	 * @return string[] List of possible DM codes for the word.
688	 */
689	private static function daitchMokotoffWord($name) {
690		// Apply special transformation rules to the input string
691		$name = I18N::strtoupper($name);
692		foreach (self::$transformNameTable as $transformRule) {
693			$name = str_replace($transformRule[0], $transformRule[1], $name);
694		}
695
696		// Initialize
697		$name_script = I18N::textScript($name);
698		$noVowels    = ($name_script == 'Hebr' || $name_script == 'Arab');
699
700		$lastPos         = strlen($name) - 1;
701		$currPos         = 0;
702		$state           = 1; // 1: start of input string, 2: before vowel, 3: other
703		$result          = []; // accumulate complete 6-digit D-M codes here
704		$partialResult   = []; // accumulate incomplete D-M codes here
705		$partialResult[] = ['!']; // initialize 1st partial result  ('!' stops "duplicate sound" check)
706
707		// Loop through the input string.
708		// Stop when the string is exhausted or when no more partial results remain
709		while (count($partialResult) !== 0 && $currPos <= $lastPos) {
710			// Find the DM coding table entry for the chunk at the current position
711			$thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
712			while ($thisEntry != '') {
713				if (isset(self::$dmsounds[$thisEntry])) {
714					break;
715				}
716				$thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
717			}
718			if ($thisEntry === '') {
719				$currPos++; // Not in table: advance pointer to next byte
720				continue; // and try again
721			}
722
723			$soundTableEntry = self::$dmsounds[$thisEntry];
724			$workingResult   = $partialResult;
725			$partialResult   = [];
726			$currPos += strlen($thisEntry);
727
728			// Not at beginning of input string
729			if ($state != 1) {
730				if ($currPos <= $lastPos) {
731					// Determine whether the next chunk is a vowel
732					$nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
733					while ($nextEntry != '') {
734						if (isset(self::$dmsounds[$nextEntry])) {
735							break;
736						}
737						$nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
738					}
739				} else {
740					$nextEntry = '';
741				}
742				if ($nextEntry != '' && self::$dmsounds[$nextEntry][0] != '0') {
743					$state = 2;
744				} else {
745					// Next chunk is a vowel
746					$state = 3;
747				}
748			}
749
750			while ($state < count($soundTableEntry)) {
751				// empty means 'ignore this sound in this state'
752				if ($soundTableEntry[$state] == '') {
753					foreach ($workingResult as $workingEntry) {
754						$tempEntry = $workingEntry;
755						$tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
756						$partialResult[] = $tempEntry;
757					}
758				} else {
759					foreach ($workingResult as $workingEntry) {
760						if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
761							// Incoming sound isn't a duplicate of the previous sound
762							$workingEntry[] = $soundTableEntry[$state];
763						} else {
764							// Incoming sound is a duplicate of the previous sound
765							// For Hebrew and Arabic, we need to create a pair of D-M sound codes,
766							// one of the pair with only a single occurrence of the duplicate sound,
767							// the other with both occurrences
768							if ($noVowels) {
769								$workingEntry[] = $soundTableEntry[$state];
770							}
771						}
772						if (count($workingEntry) < 7) {
773							$partialResult[] = $workingEntry;
774						} else {
775							// This is the 6th code in the sequence
776							// We're looking for 7 entries because the first is '!' and doesn't count
777							$tempResult = str_replace('!', '', implode('', $workingEntry));
778							// Only return codes from recognisable sounds
779							if ($tempResult) {
780								$result[] = substr($tempResult . '000000', 0, 6);
781							}
782						}
783					}
784				}
785				$state = $state + 3; // Advance to next triplet while keeping the same basic state
786			}
787		}
788
789		// Zero-fill and copy all remaining partial results
790		foreach ($partialResult as $workingEntry) {
791			$tempResult = str_replace('!', '', implode('', $workingEntry));
792			// Only return codes from recognisable sounds
793			if ($tempResult) {
794				$result[] = substr($tempResult . '000000', 0, 6);
795			}
796		}
797
798		return $result;
799	}
800}
801