xref: /webtrees/app/Soundex.php (revision 3cf92ae205660ec36316541b9e23f2ecbf0af8bb)
1<?php
2/**
3 * webtrees: online genealogy
4 * Copyright (C) 2015 webtrees development team
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16namespace Fisharebest\Webtrees;
17
18/**
19 * Phonetic matching of strings.
20 */
21class Soundex {
22	/**
23	 * Which algorithms are supported.
24	 *
25	 * @return string[]
26	 */
27	public static function getAlgorithms() {
28		return array(
29			'std' => /* I18N: http://en.wikipedia.org/wiki/Soundex */ I18N::translate('Russell'),
30			'dm'  => /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */ I18N::translate('Daitch-Mokotoff'),
31		);
32	}
33
34	/**
35	 * Is there a match between two soundex codes?
36	 *
37	 * @param string $soundex1
38	 * @param string $soundex2
39	 *
40	 * @return bool
41	 */
42	public static function compare($soundex1, $soundex2) {
43		if ($soundex1 && $soundex2) {
44			foreach (explode(':', $soundex1) as $code) {
45				if (strpos($soundex2, $code) !== false) {
46					return true;
47				}
48			}
49		}
50
51		return false;
52	}
53
54	/**
55	 * Generate Russell soundex codes for a given text.
56	 *
57	 * @param $text
58	 *
59	 * @return null|string
60	 */
61	public static function russell($text) {
62		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
63		$soundex_array = array();
64		foreach ($words as $word) {
65			$soundex = soundex($word);
66			// Only return codes from recognisable sounds
67			if ($soundex !== '0000') {
68				$soundex_array[] = $soundex;
69			}
70		}
71		// Combine words, e.g. “New York” as “Newyork”
72		if (count($words) > 1) {
73			$soundex_array[] = soundex(strtr($text, ' ', ''));
74		}
75		// A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
76		$soundex_array = array_slice(array_unique($soundex_array), 0, 51);
77
78		if ($soundex_array) {
79			return implode(':', $soundex_array);
80		} else {
81			return null;
82		}
83	}
84
85	/**
86	 * Generate Daitch–Mokotoff soundex codes for a given text.
87	 *
88	 * @param $text
89	 *
90	 * @return null|string
91	 */
92	public static function daitchMokotoff($text) {
93		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
94		$soundex_array = array();
95		foreach ($words as $word) {
96			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
97		}
98		// Combine words, e.g. “New York” as “Newyork”
99		if (count($words) > 1) {
100			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(strtr($text, ' ', '')));
101		}
102		// A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
103		$soundex_array = array_slice(array_unique($soundex_array), 0, 36);
104
105		if ($soundex_array) {
106			return implode(':', $soundex_array);
107		} else {
108			return null;
109		}
110	}
111
112	// Determine the Daitch–Mokotoff Soundex code for a word
113	// Original implementation by Gerry Kroll, and analysis by Meliza Amity
114
115	// Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
116	const MAXCHAR = 7;
117
118	/**
119	 * Name transformation arrays.
120	 * Used to transform the Name string to simplify the "sounds like" table.
121	 * This is especially useful in Hebrew.
122	 *
	 * Each array entry holds the "from" and "to" strings of one replacement; the
	 * replacements are applied to the name in the order listed.  A trailing "$" on
	 * a "from" string is meant as an end-of-name anchor, not as a literal character.
	 *
126	 * Note about the use of "\x01":
127	 * This code, which can’t legitimately occur in the kind of text we're dealing with,
128	 * is used as a place-holder so that conditional string replacements can be done.
129	 *
130	 * @var string[][]
131	 */
132	private static $transformNameTable = array(
133		// Force Yiddish ligatures to be treated as separate letters
134		array('װ', 'וו'),
135		array('ײ', 'יי'),
136		array('ױ', 'וי'),
137		array('בו', 'בע'),
138		array('פו', 'פע'),
139		array('ומ', 'עמ'),
140		array('ום', 'עם'),
141		array('ונ', 'ענ'),
142		array('ון', 'ען'),
143		array('וו', 'ב'),
144		array("\x01", ''),
145		array('ייה$', "\x01ה"),
146		array('ייע$', "\x01ע"),
147		array('יי', 'ע'),
148		array("\x01", 'יי'),
149	);
150
151	/**
152	 * The DM sound coding table is organized this way:
	 * key: a variable-length string that corresponds to the UTF-8 character sequence
	 * represented by the table entry.  Currently, that string can be up to 7
	 * bytes long.  This maximum length is defined by the class constant
	 * self::MAXCHAR.
157	 *
158	 * value: an array as follows:
159	 * [0]:  zero if not a vowel
160	 * [1]:  sound value when this string is at the beginning of the word
161	 * [2]:  sound value when this string is followed by a vowel
162	 * [3]:  sound value for other cases
163	 * [1],[2],[3] can be repeated several times to create branches in the code
164	 * an empty sound value means "ignore in this state"
165	 *
166	 * @var string[][]
167	 */
168	private static $dmsounds = array(
169		'A'       => array('1', '0', '', ''),
170		'À'      => array('1', '0', '', ''),
171		'Á'      => array('1', '0', '', ''),
172		'Â'      => array('1', '0', '', ''),
173		'Ã'      => array('1', '0', '', ''),
174		'Ä'      => array('1', '0', '1', '', '0', '', ''),
175		'Å'      => array('1', '0', '', ''),
176		'Ă'      => array('1', '0', '', ''),
177		'Ą'      => array('1', '', '', '', '', '', '6'),
178		'Ạ'     => array('1', '0', '', ''),
179		'Ả'     => array('1', '0', '', ''),
180		'Ấ'     => array('1', '0', '', ''),
181		'Ầ'     => array('1', '0', '', ''),
182		'Ẩ'     => array('1', '0', '', ''),
183		'Ẫ'     => array('1', '0', '', ''),
184		'Ậ'     => array('1', '0', '', ''),
185		'Ắ'     => array('1', '0', '', ''),
186		'Ằ'     => array('1', '0', '', ''),
187		'Ẳ'     => array('1', '0', '', ''),
188		'Ẵ'     => array('1', '0', '', ''),
189		'Ặ'     => array('1', '0', '', ''),
190		'AE'      => array('1', '0', '1', ''),
191		'Æ'      => array('1', '0', '1', ''),
192		'AI'      => array('1', '0', '1', ''),
193		'AJ'      => array('1', '0', '1', ''),
194		'AU'      => array('1', '0', '7', ''),
195		'AV'      => array('1', '0', '7', '', '7', '7', '7'),
196		'ÄU'     => array('1', '0', '1', ''),
197		'AY'      => array('1', '0', '1', ''),
198		'B'       => array('0', '7', '7', '7'),
199		'C'       => array('0', '5', '5', '5', '34', '4', '4'),
200		'Ć'      => array('0', '4', '4', '4'),
201		'Č'      => array('0', '4', '4', '4'),
202		'Ç'      => array('0', '4', '4', '4'),
203		'CH'      => array('0', '5', '5', '5', '34', '4', '4'),
204		'CHS'     => array('0', '5', '54', '54'),
205		'CK'      => array('0', '5', '5', '5', '45', '45', '45'),
206		'CCS'     => array('0', '4', '4', '4'),
207		'CS'      => array('0', '4', '4', '4'),
208		'CSZ'     => array('0', '4', '4', '4'),
209		'CZ'      => array('0', '4', '4', '4'),
210		'CZS'     => array('0', '4', '4', '4'),
211		'D'       => array('0', '3', '3', '3'),
212		'Ď'      => array('0', '3', '3', '3'),
213		'Đ'      => array('0', '3', '3', '3'),
214		'DRS'     => array('0', '4', '4', '4'),
215		'DRZ'     => array('0', '4', '4', '4'),
216		'DS'      => array('0', '4', '4', '4'),
217		'DSH'     => array('0', '4', '4', '4'),
218		'DSZ'     => array('0', '4', '4', '4'),
219		'DT'      => array('0', '3', '3', '3'),
220		'DDZ'     => array('0', '4', '4', '4'),
221		'DDZS'    => array('0', '4', '4', '4'),
222		'DZ'      => array('0', '4', '4', '4'),
223		'DŹ'     => array('0', '4', '4', '4'),
224		'DŻ'     => array('0', '4', '4', '4'),
225		'DZH'     => array('0', '4', '4', '4'),
226		'DZS'     => array('0', '4', '4', '4'),
227		'E'       => array('1', '0', '', ''),
228		'È'      => array('1', '0', '', ''),
229		'É'      => array('1', '0', '', ''),
230		'Ê'      => array('1', '0', '', ''),
231		'Ë'      => array('1', '0', '', ''),
232		'Ĕ'      => array('1', '0', '', ''),
233		'Ė'      => array('1', '0', '', ''),
234		'Ę'      => array('1', '', '', '6', '', '', ''),
235		'Ẹ'     => array('1', '0', '', ''),
236		'Ẻ'     => array('1', '0', '', ''),
237		'Ẽ'     => array('1', '0', '', ''),
238		'Ế'     => array('1', '0', '', ''),
239		'Ề'     => array('1', '0', '', ''),
240		'Ể'     => array('1', '0', '', ''),
241		'Ễ'     => array('1', '0', '', ''),
242		'Ệ'     => array('1', '0', '', ''),
243		'EAU'     => array('1', '0', '', ''),
244		'EI'      => array('1', '0', '1', ''),
245		'EJ'      => array('1', '0', '1', ''),
246		'EU'      => array('1', '1', '1', ''),
247		'EY'      => array('1', '0', '1', ''),
248		'F'       => array('0', '7', '7', '7'),
249		'FB'      => array('0', '7', '7', '7'),
250		'G'       => array('0', '5', '5', '5', '34', '4', '4'),
251		'Ğ'      => array('0', '', '', ''),
252		'GGY'     => array('0', '5', '5', '5'),
253		'GY'      => array('0', '5', '5', '5'),
254		'H'       => array('0', '5', '5', '', '5', '5', '5'),
255		'I'       => array('1', '0', '', ''),
256		'Ì'      => array('1', '0', '', ''),
257		'Í'      => array('1', '0', '', ''),
258		'Î'      => array('1', '0', '', ''),
259		'Ï'      => array('1', '0', '', ''),
260		'Ĩ'      => array('1', '0', '', ''),
261		'Į'      => array('1', '0', '', ''),
262		'İ'      => array('1', '0', '', ''),
263		'Ỉ'     => array('1', '0', '', ''),
264		'Ị'     => array('1', '0', '', ''),
265		'IA'      => array('1', '1', '', ''),
266		'IE'      => array('1', '1', '', ''),
267		'IO'      => array('1', '1', '', ''),
268		'IU'      => array('1', '1', '', ''),
269		'J'       => array('0', '1', '', '', '4', '4', '4', '5', '5', ''),
270		'K'       => array('0', '5', '5', '5'),
271		'KH'      => array('0', '5', '5', '5'),
272		'KS'      => array('0', '5', '54', '54'),
273		'L'       => array('0', '8', '8', '8'),
274		'Ľ'      => array('0', '8', '8', '8'),
275		'Ĺ'      => array('0', '8', '8', '8'),
276		'Ł'      => array('0', '7', '7', '7', '8', '8', '8'),
277		'LL'      => array('0', '8', '8', '8', '58', '8', '8', '1', '8', '8'),
278		'LLY'     => array('0', '8', '8', '8', '1', '8', '8'),
279		'LY'      => array('0', '8', '8', '8', '1', '8', '8'),
280		'M'       => array('0', '6', '6', '6'),
281		'MĔ'     => array('0', '66', '66', '66'),
282		'MN'      => array('0', '66', '66', '66'),
283		'N'       => array('0', '6', '6', '6'),
284		'Ń'      => array('0', '6', '6', '6'),
285		'Ň'      => array('0', '6', '6', '6'),
286		'Ñ'      => array('0', '6', '6', '6'),
287		'NM'      => array('0', '66', '66', '66'),
288		'O'       => array('1', '0', '', ''),
289		'Ò'      => array('1', '0', '', ''),
290		'Ó'      => array('1', '0', '', ''),
291		'Ô'      => array('1', '0', '', ''),
292		'Õ'      => array('1', '0', '', ''),
293		'Ö'      => array('1', '0', '', ''),
294		'Ø'      => array('1', '0', '', ''),
295		'Ő'      => array('1', '0', '', ''),
296		'Œ'      => array('1', '0', '', ''),
297		'Ơ'      => array('1', '0', '', ''),
298		'Ọ'     => array('1', '0', '', ''),
299		'Ỏ'     => array('1', '0', '', ''),
300		'Ố'     => array('1', '0', '', ''),
301		'Ồ'     => array('1', '0', '', ''),
302		'Ổ'     => array('1', '0', '', ''),
303		'Ỗ'     => array('1', '0', '', ''),
304		'Ộ'     => array('1', '0', '', ''),
305		'Ớ'     => array('1', '0', '', ''),
306		'Ờ'     => array('1', '0', '', ''),
307		'Ở'     => array('1', '0', '', ''),
308		'Ỡ'     => array('1', '0', '', ''),
309		'Ợ'     => array('1', '0', '', ''),
310		'OE'      => array('1', '0', '', ''),
311		'OI'      => array('1', '0', '1', ''),
312		'OJ'      => array('1', '0', '1', ''),
313		'OU'      => array('1', '0', '', ''),
314		'OY'      => array('1', '0', '1', ''),
315		'P'       => array('0', '7', '7', '7'),
316		'PF'      => array('0', '7', '7', '7'),
317		'PH'      => array('0', '7', '7', '7'),
318		'Q'       => array('0', '5', '5', '5'),
319		'R'       => array('0', '9', '9', '9'),
320		'Ř'      => array('0', '4', '4', '4'),
321		'RS'      => array('0', '4', '4', '4', '94', '94', '94'),
322		'RZ'      => array('0', '4', '4', '4', '94', '94', '94'),
323		'S'       => array('0', '4', '4', '4'),
324		'Ś'      => array('0', '4', '4', '4'),
325		'Š'      => array('0', '4', '4', '4'),
326		'Ş'      => array('0', '4', '4', '4'),
327		'SC'      => array('0', '2', '4', '4'),
328		'ŠČ'    => array('0', '2', '4', '4'),
329		'SCH'     => array('0', '4', '4', '4'),
330		'SCHD'    => array('0', '2', '43', '43'),
331		'SCHT'    => array('0', '2', '43', '43'),
332		'SCHTCH'  => array('0', '2', '4', '4'),
333		'SCHTSCH' => array('0', '2', '4', '4'),
334		'SCHTSH'  => array('0', '2', '4', '4'),
335		'SD'      => array('0', '2', '43', '43'),
336		'SH'      => array('0', '4', '4', '4'),
337		'SHCH'    => array('0', '2', '4', '4'),
338		'SHD'     => array('0', '2', '43', '43'),
339		'SHT'     => array('0', '2', '43', '43'),
340		'SHTCH'   => array('0', '2', '4', '4'),
341		'SHTSH'   => array('0', '2', '4', '4'),
342		'ß'      => array('0', '', '4', '4'),
343		'ST'      => array('0', '2', '43', '43'),
344		'STCH'    => array('0', '2', '4', '4'),
345		'STRS'    => array('0', '2', '4', '4'),
346		'STRZ'    => array('0', '2', '4', '4'),
347		'STSCH'   => array('0', '2', '4', '4'),
348		'STSH'    => array('0', '2', '4', '4'),
349		'SSZ'     => array('0', '4', '4', '4'),
350		'SZ'      => array('0', '4', '4', '4'),
351		'SZCS'    => array('0', '2', '4', '4'),
352		'SZCZ'    => array('0', '2', '4', '4'),
353		'SZD'     => array('0', '2', '43', '43'),
354		'SZT'     => array('0', '2', '43', '43'),
355		'T'       => array('0', '3', '3', '3'),
356		'Ť'      => array('0', '3', '3', '3'),
357		'Ţ'      => array('0', '3', '3', '3', '4', '4', '4'),
358		'TC'      => array('0', '4', '4', '4'),
359		'TCH'     => array('0', '4', '4', '4'),
360		'TH'      => array('0', '3', '3', '3'),
361		'TRS'     => array('0', '4', '4', '4'),
362		'TRZ'     => array('0', '4', '4', '4'),
363		'TS'      => array('0', '4', '4', '4'),
364		'TSCH'    => array('0', '4', '4', '4'),
365		'TSH'     => array('0', '4', '4', '4'),
366		'TSZ'     => array('0', '4', '4', '4'),
367		'TTCH'    => array('0', '4', '4', '4'),
368		'TTS'     => array('0', '4', '4', '4'),
369		'TTSCH'   => array('0', '4', '4', '4'),
370		'TTSZ'    => array('0', '4', '4', '4'),
371		'TTZ'     => array('0', '4', '4', '4'),
372		'TZ'      => array('0', '4', '4', '4'),
373		'TZS'     => array('0', '4', '4', '4'),
374		'U'       => array('1', '0', '', ''),
375		'Ù'      => array('1', '0', '', ''),
376		'Ú'      => array('1', '0', '', ''),
377		'Û'      => array('1', '0', '', ''),
378		'Ü'      => array('1', '0', '', ''),
379		'Ũ'      => array('1', '0', '', ''),
380		'Ū'      => array('1', '0', '', ''),
381		'Ů'      => array('1', '0', '', ''),
382		'Ű'      => array('1', '0', '', ''),
383		'Ų'      => array('1', '0', '', ''),
384		'Ư'      => array('1', '0', '', ''),
385		'Ụ'     => array('1', '0', '', ''),
386		'Ủ'     => array('1', '0', '', ''),
387		'Ứ'     => array('1', '0', '', ''),
388		'Ừ'     => array('1', '0', '', ''),
389		'Ử'     => array('1', '0', '', ''),
390		'Ữ'     => array('1', '0', '', ''),
391		'Ự'     => array('1', '0', '', ''),
392		'UE'      => array('1', '0', '', ''),
393		'UI'      => array('1', '0', '1', ''),
394		'UJ'      => array('1', '0', '1', ''),
395		'UY'      => array('1', '0', '1', ''),
396		'UW'      => array('1', '0', '1', '', '0', '7', '7'),
397		'V'       => array('0', '7', '7', '7'),
398		'W'       => array('0', '7', '7', '7'),
399		'X'       => array('0', '5', '54', '54'),
400		'Y'       => array('1', '1', '', ''),
401		'Ý'      => array('1', '1', '', ''),
402		'Ỳ'     => array('1', '1', '', ''),
403		'Ỵ'     => array('1', '1', '', ''),
404		'Ỷ'     => array('1', '1', '', ''),
405		'Ỹ'     => array('1', '1', '', ''),
406		'Z'       => array('0', '4', '4', '4'),
407		'Ź'      => array('0', '4', '4', '4'),
408		'Ż'      => array('0', '4', '4', '4'),
409		'Ž'      => array('0', '4', '4', '4'),
410		'ZD'      => array('0', '2', '43', '43'),
411		'ZDZ'     => array('0', '2', '4', '4'),
412		'ZDZH'    => array('0', '2', '4', '4'),
413		'ZH'      => array('0', '4', '4', '4'),
414		'ZHD'     => array('0', '2', '43', '43'),
415		'ZHDZH'   => array('0', '2', '4', '4'),
416		'ZS'      => array('0', '4', '4', '4'),
417		'ZSCH'    => array('0', '4', '4', '4'),
418		'ZSH'     => array('0', '4', '4', '4'),
419		'ZZS'     => array('0', '4', '4', '4'),
420		// Cyrillic alphabet
421		'А'   => array('1', '0', '', ''),
422		'Б'   => array('0', '7', '7', '7'),
423		'В'   => array('0', '7', '7', '7'),
424		'Г'   => array('0', '5', '5', '5'),
425		'Д'   => array('0', '3', '3', '3'),
426		'ДЗ' => array('0', '4', '4', '4'),
427		'Е'   => array('1', '0', '', ''),
428		'Ё'   => array('1', '0', '', ''),
429		'Ж'   => array('0', '4', '4', '4'),
430		'З'   => array('0', '4', '4', '4'),
431		'И'   => array('1', '0', '', ''),
432		'Й'   => array('1', '1', '', '', '4', '4', '4'),
433		'К'   => array('0', '5', '5', '5'),
434		'Л'   => array('0', '8', '8', '8'),
435		'М'   => array('0', '6', '6', '6'),
436		'Н'   => array('0', '6', '6', '6'),
437		'О'   => array('1', '0', '', ''),
438		'П'   => array('0', '7', '7', '7'),
439		'Р'   => array('0', '9', '9', '9'),
440		'РЖ' => array('0', '4', '4', '4'),
441		'С'   => array('0', '4', '4', '4'),
442		'Т'   => array('0', '3', '3', '3'),
443		'У'   => array('1', '0', '', ''),
444		'Ф'   => array('0', '7', '7', '7'),
445		'Х'   => array('0', '5', '5', '5'),
446		'Ц'   => array('0', '4', '4', '4'),
447		'Ч'   => array('0', '4', '4', '4'),
448		'Ш'   => array('0', '4', '4', '4'),
449		'Щ'   => array('0', '2', '4', '4'),
450		'Ъ'   => array('0', '', '', ''),
451		'Ы'   => array('0', '1', '', ''),
452		'Ь'   => array('0', '', '', ''),
453		'Э'   => array('1', '0', '', ''),
454		'Ю'   => array('0', '1', '', ''),
455		'Я'   => array('0', '1', '', ''),
456		// Greek alphabet
457		'Α'   => array('1', '0', '', ''),
458		'Ά'   => array('1', '0', '', ''),
459		'ΑΙ' => array('1', '0', '1', ''),
460		'ΑΥ' => array('1', '0', '1', ''),
461		'Β'   => array('0', '7', '7', '7'),
462		'Γ'   => array('0', '5', '5', '5'),
463		'Δ'   => array('0', '3', '3', '3'),
464		'Ε'   => array('1', '0', '', ''),
465		'Έ'   => array('1', '0', '', ''),
466		'ΕΙ' => array('1', '0', '1', ''),
467		'ΕΥ' => array('1', '1', '1', ''),
468		'Ζ'   => array('0', '4', '4', '4'),
469		'Η'   => array('1', '0', '', ''),
470		'Ή'   => array('1', '0', '', ''),
471		'Θ'   => array('0', '3', '3', '3'),
472		'Ι'   => array('1', '0', '', ''),
473		'Ί'   => array('1', '0', '', ''),
474		'Ϊ'   => array('1', '0', '', ''),
475		'ΐ'   => array('1', '0', '', ''),
476		'Κ'   => array('0', '5', '5', '5'),
477		'Λ'   => array('0', '8', '8', '8'),
478		'Μ'   => array('0', '6', '6', '6'),
479		'ΜΠ' => array('0', '7', '7', '7'),
480		'Ν'   => array('0', '6', '6', '6'),
481		'ΝΤ' => array('0', '3', '3', '3'),
482		'Ξ'   => array('0', '5', '54', '54'),
483		'Ο'   => array('1', '0', '', ''),
484		'Ό'   => array('1', '0', '', ''),
485		'ΟΙ' => array('1', '0', '1', ''),
486		'ΟΥ' => array('1', '0', '1', ''),
487		'Π'   => array('0', '7', '7', '7'),
488		'Ρ'   => array('0', '9', '9', '9'),
489		'Σ'   => array('0', '4', '4', '4'),
490		'ς'   => array('0', '', '', '4'),
491		'Τ'   => array('0', '3', '3', '3'),
492		'ΤΖ' => array('0', '4', '4', '4'),
493		'ΤΣ' => array('0', '4', '4', '4'),
494		'Υ'   => array('1', '1', '', ''),
495		'Ύ'   => array('1', '1', '', ''),
496		'Ϋ'   => array('1', '1', '', ''),
497		'ΰ'   => array('1', '1', '', ''),
498		'ΥΚ' => array('1', '5', '5', '5'),
499		'ΥΥ' => array('1', '65', '65', '65'),
500		'Φ'   => array('0', '7', '7', '7'),
501		'Χ'   => array('0', '5', '5', '5'),
502		'Ψ'   => array('0', '7', '7', '7'),
503		'Ω'   => array('1', '0', '', ''),
504		'Ώ'   => array('1', '0', '', ''),
505		// Hebrew alphabet
506		'א'     => array('1', '0', '', ''),
507		'או'   => array('1', '0', '7', ''),
508		'אג'   => array('1', '4', '4', '4', '5', '5', '5', '34', '34', '34'),
509		'בב'   => array('0', '7', '7', '7', '77', '77', '77'),
510		'ב'     => array('0', '7', '7', '7'),
511		'גג'   => array('0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'),
512		'גד'   => array('0', '43', '43', '43', '53', '53', '53'),
513		'גה'   => array('0', '45', '45', '45', '55', '55', '55'),
514		'גז'   => array('0', '44', '44', '44', '45', '45', '45'),
515		'גח'   => array('0', '45', '45', '45', '55', '55', '55'),
516		'גכ'   => array('0', '45', '45', '45', '55', '55', '55'),
517		'גך'   => array('0', '45', '45', '45', '55', '55', '55'),
518		'גצ'   => array('0', '44', '44', '44', '45', '45', '45'),
519		'גץ'   => array('0', '44', '44', '44', '45', '45', '45'),
520		'גק'   => array('0', '45', '45', '45', '54', '54', '54'),
521		'גש'   => array('0', '44', '44', '44', '54', '54', '54'),
522		'גת'   => array('0', '43', '43', '43', '53', '53', '53'),
523		'ג'     => array('0', '4', '4', '4', '5', '5', '5'),
524		'דז'   => array('0', '4', '4', '4'),
525		'דד'   => array('0', '3', '3', '3', '33', '33', '33'),
526		'דט'   => array('0', '33', '33', '33'),
527		'דש'   => array('0', '4', '4', '4'),
528		'דצ'   => array('0', '4', '4', '4'),
529		'דץ'   => array('0', '4', '4', '4'),
530		'ד'     => array('0', '3', '3', '3'),
531		'הג'   => array('0', '54', '54', '54', '55', '55', '55'),
532		'הכ'   => array('0', '55', '55', '55'),
533		'הח'   => array('0', '55', '55', '55'),
534		'הק'   => array('0', '55', '55', '55', '5', '5', '5'),
535		'הה'   => array('0', '5', '5', '', '55', '55', ''),
536		'ה'     => array('0', '5', '5', ''),
537		'וי'   => array('1', '', '', '', '7', '7', '7'),
538		'ו'     => array('1', '7', '7', '7', '7', '', ''),
539		'וו'   => array('1', '7', '7', '7', '7', '', ''),
540		'וופ' => array('1', '7', '7', '7', '77', '77', '77'),
541		'זש'   => array('0', '4', '4', '4', '44', '44', '44'),
542		'זדז' => array('0', '2', '4', '4'),
543		'ז'     => array('0', '4', '4', '4'),
544		'זג'   => array('0', '44', '44', '44', '45', '45', '45'),
545		'זז'   => array('0', '4', '4', '4', '44', '44', '44'),
546		'זס'   => array('0', '44', '44', '44'),
547		'זצ'   => array('0', '44', '44', '44'),
548		'זץ'   => array('0', '44', '44', '44'),
549		'חג'   => array('0', '54', '54', '54', '53', '53', '53'),
550		'חח'   => array('0', '5', '5', '5', '55', '55', '55'),
551		'חק'   => array('0', '55', '55', '55', '5', '5', '5'),
552		'חכ'   => array('0', '45', '45', '45', '55', '55', '55'),
553		'חס'   => array('0', '5', '54', '54'),
554		'חש'   => array('0', '5', '54', '54'),
555		'ח'     => array('0', '5', '5', '5'),
556		'טש'   => array('0', '4', '4', '4'),
557		'טד'   => array('0', '33', '33', '33'),
558		'טי'   => array('0', '3', '3', '3', '4', '4', '4', '3', '3', '34'),
559		'טת'   => array('0', '33', '33', '33'),
560		'טט'   => array('0', '3', '3', '3', '33', '33', '33'),
561		'ט'     => array('0', '3', '3', '3'),
562		'י'     => array('1', '1', '', ''),
563		'יא'   => array('1', '1', '', '', '1', '1', '1'),
564		'כג'   => array('0', '55', '55', '55', '54', '54', '54'),
565		'כש'   => array('0', '5', '54', '54'),
566		'כס'   => array('0', '5', '54', '54'),
567		'ככ'   => array('0', '5', '5', '5', '55', '55', '55'),
568		'כך'   => array('0', '5', '5', '5', '55', '55', '55'),
569		'כ'     => array('0', '5', '5', '5'),
570		'כח'   => array('0', '55', '55', '55', '5', '5', '5'),
571		'ך'     => array('0', '', '5', '5'),
572		'ל'     => array('0', '8', '8', '8'),
573		'לל'   => array('0', '88', '88', '88', '8', '8', '8'),
574		'מנ'   => array('0', '66', '66', '66'),
575		'מן'   => array('0', '66', '66', '66'),
576		'ממ'   => array('0', '6', '6', '6', '66', '66', '66'),
577		'מם'   => array('0', '6', '6', '6', '66', '66', '66'),
578		'מ'     => array('0', '6', '6', '6'),
579		'ם'     => array('0', '', '6', '6'),
580		'נמ'   => array('0', '66', '66', '66'),
581		'נם'   => array('0', '66', '66', '66'),
582		'ננ'   => array('0', '6', '6', '6', '66', '66', '66'),
583		'נן'   => array('0', '6', '6', '6', '66', '66', '66'),
584		'נ'     => array('0', '6', '6', '6'),
585		'ן'     => array('0', '', '6', '6'),
586		'סתש' => array('0', '2', '4', '4'),
587		'סתז' => array('0', '2', '4', '4'),
588		'סטז' => array('0', '2', '4', '4'),
589		'סטש' => array('0', '2', '4', '4'),
590		'סצד' => array('0', '2', '4', '4'),
591		'סט'   => array('0', '2', '4', '4', '43', '43', '43'),
592		'סת'   => array('0', '2', '4', '4', '43', '43', '43'),
593		'סג'   => array('0', '44', '44', '44', '4', '4', '4'),
594		'סס'   => array('0', '4', '4', '4', '44', '44', '44'),
595		'סצ'   => array('0', '44', '44', '44'),
596		'סץ'   => array('0', '44', '44', '44'),
597		'סז'   => array('0', '44', '44', '44'),
598		'סש'   => array('0', '44', '44', '44'),
599		'ס'     => array('0', '4', '4', '4'),
600		'ע'     => array('1', '0', '', ''),
601		'פב'   => array('0', '7', '7', '7', '77', '77', '77'),
602		'פוו' => array('0', '7', '7', '7', '77', '77', '77'),
603		'פפ'   => array('0', '7', '7', '7', '77', '77', '77'),
604		'פף'   => array('0', '7', '7', '7', '77', '77', '77'),
605		'פ'     => array('0', '7', '7', '7'),
606		'ף'     => array('0', '', '7', '7'),
607		'צג'   => array('0', '44', '44', '44', '45', '45', '45'),
608		'צז'   => array('0', '44', '44', '44'),
609		'צס'   => array('0', '44', '44', '44'),
610		'צצ'   => array('0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'),
611		'צץ'   => array('0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'),
612		'צש'   => array('0', '44', '44', '44', '4', '4', '4', '5', '5', '5'),
613		'צ'     => array('0', '4', '4', '4', '5', '5', '5'),
614		'ץ'     => array('0', '', '4', '4'),
615		'קה'   => array('0', '55', '55', '5'),
616		'קס'   => array('0', '5', '54', '54'),
617		'קש'   => array('0', '5', '54', '54'),
618		'קק'   => array('0', '5', '5', '5', '55', '55', '55'),
619		'קח'   => array('0', '55', '55', '55'),
620		'קכ'   => array('0', '55', '55', '55'),
621		'קך'   => array('0', '55', '55', '55'),
622		'קג'   => array('0', '55', '55', '55', '54', '54', '54'),
623		'ק'     => array('0', '5', '5', '5'),
624		'רר'   => array('0', '99', '99', '99', '9', '9', '9'),
625		'ר'     => array('0', '9', '9', '9'),
626		'שטז' => array('0', '2', '4', '4'),
627		'שתש' => array('0', '2', '4', '4'),
628		'שתז' => array('0', '2', '4', '4'),
629		'שטש' => array('0', '2', '4', '4'),
630		'שד'   => array('0', '2', '43', '43'),
631		'שז'   => array('0', '44', '44', '44'),
632		'שס'   => array('0', '44', '44', '44'),
633		'שת'   => array('0', '2', '43', '43'),
634		'שג'   => array('0', '4', '4', '4', '44', '44', '44', '4', '43', '43'),
635		'שט'   => array('0', '2', '43', '43', '44', '44', '44'),
636		'שצ'   => array('0', '44', '44', '44', '45', '45', '45'),
637		'שץ'   => array('0', '44', '', '44', '45', '', '45'),
638		'שש'   => array('0', '4', '4', '4', '44', '44', '44'),
639		'ש'     => array('0', '4', '4', '4'),
640		'תג'   => array('0', '34', '34', '34'),
641		'תז'   => array('0', '34', '34', '34'),
642		'תש'   => array('0', '4', '4', '4'),
643		'תת'   => array('0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'),
644		'ת'     => array('0', '3', '3', '3', '4', '4', '4'),
645		// Arabic alphabet
646		'ا'   => array('1', '0', '', ''),
647		'ب'   => array('0', '7', '7', '7'),
648		'ت'   => array('0', '3', '3', '3'),
649		'ث'   => array('0', '3', '3', '3'),
650		'ج'   => array('0', '4', '4', '4'),
651		'ح'   => array('0', '5', '5', '5'),
652		'خ'   => array('0', '5', '5', '5'),
653		'د'   => array('0', '3', '3', '3'),
654		'ذ'   => array('0', '3', '3', '3'),
655		'ر'   => array('0', '9', '9', '9'),
656		'ز'   => array('0', '4', '4', '4'),
657		'س'   => array('0', '4', '4', '4'),
658		'ش'   => array('0', '4', '4', '4'),
659		'ص'   => array('0', '4', '4', '4'),
660		'ض'   => array('0', '3', '3', '3'),
661		'ط'   => array('0', '3', '3', '3'),
662		'ظ'   => array('0', '4', '4', '4'),
663		'ع'   => array('1', '0', '', ''),
664		'غ'   => array('0', '0', '', ''),
665		'ف'   => array('0', '7', '7', '7'),
666		'ق'   => array('0', '5', '5', '5'),
667		'ك'   => array('0', '5', '5', '5'),
668		'ل'   => array('0', '8', '8', '8'),
669		'لا' => array('0', '8', '8', '8'),
670		'م'   => array('0', '6', '6', '6'),
671		'ن'   => array('0', '6', '6', '6'),
672		'هن' => array('0', '66', '66', '66'),
673		'ه'   => array('0', '5', '5', ''),
674		'و'   => array('1', '', '', '', '7', '', ''),
675		'ي'   => array('0', '1', '', ''),
676		'آ'   => array('0', '1', '', ''),
677		'ة'   => array('0', '', '', '3'),
678		'ی'   => array('0', '1', '', ''),
679		'ى'   => array('1', '1', '', ''),
680	);
681
682	/**
683	 * Calculate the Daitch-Mokotoff soundex for a word.
684	 *
685	 * @param string $name
686	 *
687	 * @return string[] List of possible DM codes for the word.
688	 */
689	private static function daitchMokotoffWord($name) {
690		// Apply special transformation rules to the input string
691		$name = I18N::strtoupper($name);
692		foreach (self::$transformNameTable as $transformRule) {
693			$name = str_replace($transformRule[0], $transformRule[1], $name);
694		}
695
696		// Initialize
697		$name_script = I18N::textScript($name);
698		$noVowels    = ($name_script == 'Hebr' || $name_script == 'Arab');
699
700		$lastPos         = strlen($name) - 1;
701		$currPos         = 0;
702		$state           = 1; // 1: start of input string, 2: before vowel, 3: other
703		$result          = array(); // accumulate complete 6-digit D-M codes here
704		$partialResult   = array(); // accumulate incomplete D-M codes here
705		$partialResult[] = array('!'); // initialize 1st partial result  ('!' stops "duplicate sound" check)
706
707		// Loop through the input string.
708		// Stop when the string is exhausted or when no more partial results remain
709		while (count($partialResult) !== 0 && $currPos <= $lastPos) {
710			// Find the DM coding table entry for the chunk at the current position
711			$thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
712			while ($thisEntry != '') {
713				if (isset(self::$dmsounds[$thisEntry])) {
714					break;
715				}
716				$thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
717			}
718			if ($thisEntry === '') {
719				$currPos++; // Not in table: advance pointer to next byte
720				continue; // and try again
721			}
722
723			$soundTableEntry = self::$dmsounds[$thisEntry];
724			$workingResult   = $partialResult;
725			$partialResult   = array();
726			$currPos += strlen($thisEntry);
727
728			// Not at beginning of input string
729			if ($state != 1) {
730				if ($currPos <= $lastPos) {
731					// Determine whether the next chunk is a vowel
732					$nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
733					while ($nextEntry != '') {
734						if (isset(self::$dmsounds[$nextEntry])) {
735							break;
736						}
737						$nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
738					}
739				} else {
740					$nextEntry = '';
741				}
742				if ($nextEntry != '' && self::$dmsounds[$nextEntry][0] != '0') {
743					$state = 2;
744				} else {
745					// Next chunk is a vowel
746					$state = 3;
747				}
748			}
749
750			while ($state < count($soundTableEntry)) {
751				// empty means 'ignore this sound in this state'
752				if ($soundTableEntry[$state] == '') {
753					foreach ($workingResult as $workingEntry) {
754						$tempEntry = $workingEntry;
755						$tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
756						$partialResult[] = $tempEntry;
757					}
758				} else {
759					foreach ($workingResult as $workingEntry) {
760						if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
761							// Incoming sound isn't a duplicate of the previous sound
762							$workingEntry[] = $soundTableEntry[$state];
763						} else {
764							// Incoming sound is a duplicate of the previous sound
765							// For Hebrew and Arabic, we need to create a pair of D-M sound codes,
766							// one of the pair with only a single occurrence of the duplicate sound,
767							// the other with both occurrences
768							if ($noVowels) {
769								$workingEntry[] = $soundTableEntry[$state];
770							}
771						}
772						if (count($workingEntry) < 7) {
773							$partialResult[] = $workingEntry;
774						} else {
775							// This is the 6th code in the sequence
776							// We're looking for 7 entries because the first is '!' and doesn't count
777							$tempResult = str_replace('!', '', implode('', $workingEntry));
778							// Only return codes from recognisable sounds
779							if ($tempResult) {
780								$result[] = substr($tempResult . '000000', 0, 6);
781							}
782						}
783					}
784				}
785				$state = $state + 3; // Advance to next triplet while keeping the same basic state
786			}
787		}
788
789		// Zero-fill and copy all remaining partial results
790		foreach ($partialResult as $workingEntry) {
791			$tempResult = str_replace('!', '', implode('', $workingEntry));
792			// Only return codes from recognisable sounds
793			if ($tempResult) {
794				$result[] = substr($tempResult . '000000', 0, 6);
795			}
796		}
797
798		return $result;
799	}
800}
801