xref: /webtrees/app/Soundex.php (revision b90d8acc82ea3080403ec948fc3f1106a64e74cc)
1<?php
2namespace Fisharebest\Webtrees;
3
4/**
5 * webtrees: online genealogy
6 * Copyright (C) 2015 webtrees development team
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19/**
20 * Class Soundex Functions for phonetic matching of strings
21 */
22class Soundex {
23	/**
24	 * @return string[]
25	 */
26	public static function getAlgorithms() {
27		return array(
28			'std' => /* I18N: http://en.wikipedia.org/wiki/Soundex */ I18N::translate('Russell'),
29			'dm'  => /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */ I18N::translate('Daitch-Mokotoff'),
30		);
31	}
32
33	/**
34	 * Is there a match between two soundex codes?
35	 *
36	 * @param string $soundex1
37	 * @param string $soundex2
38	 *
39	 * @return boolean
40	 */
41	public static function compare($soundex1, $soundex2) {
42		if ($soundex1 && $soundex2) {
43			foreach (explode(':', $soundex1) as $code) {
44				if (strpos($soundex2, $code) !== false) {
45					return true;
46				}
47			}
48		}
49
50		return false;
51	}
52
53	/**
54	 * Generate Russell soundex codes for a given text.
55	 *
56	 * @param $text
57	 *
58	 * @return null|string
59	 */
60	public static function russell($text) {
61		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
62		$soundex_array = array();
63		foreach ($words as $word) {
64			$soundex = soundex($word);
65			// Only return codes from recognisable sounds
66			if ($soundex !== '0000') {
67				$soundex_array[] = $soundex;
68			}
69		}
70		// Combine words, e.g. “New York” as “Newyork”
71		if (count($words) > 1) {
72			$soundex_array[] = soundex(strtr($text, ' ', ''));
73		}
74		// A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
75		$soundex_array = array_slice(array_unique($soundex_array), 0, 51);
76
77		if ($soundex_array) {
78			return implode(':', $soundex_array);
79		} else {
80			return null;
81		}
82	}
83
84	/**
85	 * Generate Daitch–Mokotoff soundex codes for a given text.
86	 *
87	 * @param $text
88	 *
89	 * @return null|string
90	 */
91	public static function daitchMokotoff($text) {
92		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
93		$soundex_array = array();
94		foreach ($words as $word) {
95			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
96		}
97		// Combine words, e.g. “New York” as “Newyork”
98		if (count($words) > 1) {
99			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(strtr($text, ' ', '')));
100		}
101		// A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
102		$soundex_array = array_slice(array_unique($soundex_array), 0, 36);
103
104		if ($soundex_array) {
105			return implode(':', $soundex_array);
106		} else {
107			return null;
108		}
109	}
110
111	// Determine the Daitch–Mokotoff Soundex code for a word
112	// Original implementation by Gerry Kroll, and analysis by Meliza Amity
113
114	// Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
115	const MAXCHAR = 7;
116
117	/**
118	 * Name transformation arrays.
119	 * Used to transform the Name string to simplify the "sounds like" table.
120	 * This is especially useful in Hebrew.
121	 *
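	 * Each array entry defines the "from" and "to" arguments of a
	 * str_replace($from, $to, $text) function call to achieve the desired
	 * transformations.  The rules are applied in the order listed.  Because the
	 * replacement is literal, a "$" in a "from" value matches a dollar sign, not
	 * the end of the text.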
124	 *
125	 * Note about the use of "\x01":
126	 * This code, which can’t legitimately occur in the kind of text we're dealing with,
127	 * is used as a place-holder so that conditional string replacements can be done.
128	 *
129	 * @var string[][]
130	 */
131	private static $transformNameTable = array(
132		// Force Yiddish ligatures to be treated as separate letters
133		array('װ', 'וו'),
134		array('ײ', 'יי'),
135		array('ױ', 'וי'),
136		array('בו', 'בע'),
137		array('פו', 'פע'),
138		array('ומ', 'עמ'),
139		array('ום', 'עם'),
140		array('ונ', 'ענ'),
141		array('ון', 'ען'),
142		array('וו', 'ב'),
143		array("\x01", ''),
144		array('ייה$', "\x01ה"),
145		array('ייע$', "\x01ע"),
146		array('יי', 'ע'),
147		array("\x01", 'יי'),
148	);
149
150	/**
151	 * The DM sound coding table is organized this way:
152	 * key: a variable-length string that corresponds to the UTF-8 character sequence
153	 * represented by the table entry.  Currently, that string can be up to 7
	 * bytes long.  This maximum length is defined by the class constant
	 * self::MAXCHAR.
156	 *
157	 * value: an array as follows:
158	 * [0]:  zero if not a vowel
159	 * [1]:  sound value when this string is at the beginning of the word
160	 * [2]:  sound value when this string is followed by a vowel
161	 * [3]:  sound value for other cases
162	 * [1],[2],[3] can be repeated several times to create branches in the code
163	 * an empty sound value means "ignore in this state"
164	 *
165	 * @var string[][]
166	 */
167	private static $dmsounds = array(
168		'A' => array('1', '0', '', ''),
169		'À' => array('1', '0', '', ''),
170		'Á' => array('1', '0', '', ''),
171		'Â' => array('1', '0', '', ''),
172		'Ã' => array('1', '0', '', ''),
173		'Ä' => array('1', '0', '1', '', '0', '', ''),
174		'Å' => array('1', '0', '', ''),
175		'Ă' => array('1', '0', '', ''),
176		'Ą' => array('1', '', '', '', '', '', '6'),
177		'Ạ' => array('1', '0', '', ''),
178		'Ả' => array('1', '0', '', ''),
179		'Ấ' => array('1', '0', '', ''),
180		'Ầ' => array('1', '0', '', ''),
181		'Ẩ' => array('1', '0', '', ''),
182		'Ẫ' => array('1', '0', '', ''),
183		'Ậ' => array('1', '0', '', ''),
184		'Ắ' => array('1', '0', '', ''),
185		'Ằ' => array('1', '0', '', ''),
186		'Ẳ' => array('1', '0', '', ''),
187		'Ẵ' => array('1', '0', '', ''),
188		'Ặ' => array('1', '0', '', ''),
189		'AE' => array('1', '0', '1', ''),
190		'Æ' => array('1', '0', '1', ''),
191		'AI' => array('1', '0', '1', ''),
192		'AJ' => array('1', '0', '1', ''),
193		'AU' => array('1', '0', '7', ''),
194		'AV' => array('1', '0', '7', '', '7', '7', '7'),
195		'ÄU' => array('1', '0', '1', ''),
196		'AY' => array('1', '0', '1', ''),
197		'B' => array('0', '7', '7', '7'),
198		'C' => array('0', '5', '5', '5', '34', '4', '4'),
199		'Ć' => array('0', '4', '4', '4'),
200		'Č' => array('0', '4', '4', '4'),
201		'Ç' => array('0', '4', '4', '4'),
202		'CH' => array('0', '5', '5', '5', '34', '4', '4'),
203		'CHS' => array('0', '5', '54', '54'),
204		'CK' => array('0', '5', '5', '5', '45', '45', '45'),
205		'CCS' => array('0', '4', '4', '4'),
206		'CS' => array('0', '4', '4', '4'),
207		'CSZ' => array('0', '4', '4', '4'),
208		'CZ' => array('0', '4', '4', '4'),
209		'CZS' => array('0', '4', '4', '4'),
210		'D' => array('0', '3', '3', '3'),
211		'Ď' => array('0', '3', '3', '3'),
212		'Đ' => array('0', '3', '3', '3'),
213		'DRS' => array('0', '4', '4', '4'),
214		'DRZ' => array('0', '4', '4', '4'),
215		'DS' => array('0', '4', '4', '4'),
216		'DSH' => array('0', '4', '4', '4'),
217		'DSZ' => array('0', '4', '4', '4'),
218		'DT' => array('0', '3', '3', '3'),
219		'DDZ' => array('0', '4', '4', '4'),
220		'DDZS' => array('0', '4', '4', '4'),
221		'DZ' => array('0', '4', '4', '4'),
222		'DŹ' => array('0', '4', '4', '4'),
223		'DŻ' => array('0', '4', '4', '4'),
224		'DZH' => array('0', '4', '4', '4'),
225		'DZS' => array('0', '4', '4', '4'),
226		'E' => array('1', '0', '', ''),
227		'È' => array('1', '0', '', ''),
228		'É' => array('1', '0', '', ''),
229		'Ê' => array('1', '0', '', ''),
230		'Ë' => array('1', '0', '', ''),
231		'Ĕ' => array('1', '0', '', ''),
232		'Ė' => array('1', '0', '', ''),
233		'Ę' => array('1', '', '', '6', '', '', ''),
234		'Ẹ' => array('1', '0', '', ''),
235		'Ẻ' => array('1', '0', '', ''),
236		'Ẽ' => array('1', '0', '', ''),
237		'Ế' => array('1', '0', '', ''),
238		'Ề' => array('1', '0', '', ''),
239		'Ể' => array('1', '0', '', ''),
240		'Ễ' => array('1', '0', '', ''),
241		'Ệ' => array('1', '0', '', ''),
242		'EAU' => array('1', '0', '', ''),
243		'EI' => array('1', '0', '1', ''),
244		'EJ' => array('1', '0', '1', ''),
245		'EU' => array('1', '1', '1', ''),
246		'EY' => array('1', '0', '1', ''),
247		'F' => array('0', '7', '7', '7'),
248		'FB' => array('0', '7', '7', '7'),
249		'G' => array('0', '5', '5', '5', '34', '4', '4'),
250		'Ğ' => array('0', '', '', ''),
251		'GGY' => array('0', '5', '5', '5'),
252		'GY' => array('0', '5', '5', '5'),
253		'H' => array('0', '5', '5', '', '5', '5', '5'),
254		'I' => array('1', '0', '', ''),
255		'Ì' => array('1', '0', '', ''),
256		'Í' => array('1', '0', '', ''),
257		'Î' => array('1', '0', '', ''),
258		'Ï' => array('1', '0', '', ''),
259		'Ĩ' => array('1', '0', '', ''),
260		'Į' => array('1', '0', '', ''),
261		'İ' => array('1', '0', '', ''),
262		'Ỉ' => array('1', '0', '', ''),
263		'Ị' => array('1', '0', '', ''),
264		'IA' => array('1', '1', '', ''),
265		'IE' => array('1', '1', '', ''),
266		'IO' => array('1', '1', '', ''),
267		'IU' => array('1', '1', '', ''),
268		'J' => array('0', '1', '', '', '4', '4', '4', '5', '5', ''),
269		'K' => array('0', '5', '5', '5'),
270		'KH' => array('0', '5', '5', '5'),
271		'KS' => array('0', '5', '54', '54'),
272		'L' => array('0', '8', '8', '8'),
273		'Ľ' => array('0', '8', '8', '8'),
274		'Ĺ' => array('0', '8', '8', '8'),
275		'Ł' => array('0', '7', '7', '7', '8', '8', '8'),
276		'LL' => array('0', '8', '8', '8', '58', '8', '8', '1', '8', '8'),
277		'LLY' => array('0', '8', '8', '8', '1', '8', '8'),
278		'LY' => array('0', '8', '8', '8', '1', '8', '8'),
279		'M' => array('0', '6', '6', '6'),
280		'MĔ' => array('0', '66', '66', '66'),
281		'MN' => array('0', '66', '66', '66'),
282		'N' => array('0', '6', '6', '6'),
283		'Ń' => array('0', '6', '6', '6'),
284		'Ň' => array('0', '6', '6', '6'),
285		'Ñ' => array('0', '6', '6', '6'),
286		'NM' => array('0', '66', '66', '66'),
287		'O' => array('1', '0', '', ''),
288		'Ò' => array('1', '0', '', ''),
289		'Ó' => array('1', '0', '', ''),
290		'Ô' => array('1', '0', '', ''),
291		'Õ' => array('1', '0', '', ''),
292		'Ö' => array('1', '0', '', ''),
293		'Ø' => array('1', '0', '', ''),
294		'Ő' => array('1', '0', '', ''),
295		'Œ' => array('1', '0', '', ''),
296		'Ơ' => array('1', '0', '', ''),
297		'Ọ' => array('1', '0', '', ''),
298		'Ỏ' => array('1', '0', '', ''),
299		'Ố' => array('1', '0', '', ''),
300		'Ồ' => array('1', '0', '', ''),
301		'Ổ' => array('1', '0', '', ''),
302		'Ỗ' => array('1', '0', '', ''),
303		'Ộ' => array('1', '0', '', ''),
304		'Ớ' => array('1', '0', '', ''),
305		'Ờ' => array('1', '0', '', ''),
306		'Ở' => array('1', '0', '', ''),
307		'Ỡ' => array('1', '0', '', ''),
308		'Ợ' => array('1', '0', '', ''),
309		'OE' => array('1', '0', '', ''),
310		'OI' => array('1', '0', '1', ''),
311		'OJ' => array('1', '0', '1', ''),
312		'OU' => array('1', '0', '', ''),
313		'OY' => array('1', '0', '1', ''),
314		'P' => array('0', '7', '7', '7'),
315		'PF' => array('0', '7', '7', '7'),
316		'PH' => array('0', '7', '7', '7'),
317		'Q' => array('0', '5', '5', '5'),
318		'R' => array('0', '9', '9', '9'),
319		'Ř' => array('0', '4', '4', '4'),
320		'RS' => array('0', '4', '4', '4', '94', '94', '94'),
321		'RZ' => array('0', '4', '4', '4', '94', '94', '94'),
322		'S' => array('0', '4', '4', '4'),
323		'Ś' => array('0', '4', '4', '4'),
324		'Š' => array('0', '4', '4', '4'),
325		'Ş' => array('0', '4', '4', '4'),
326		'SC' => array('0', '2', '4', '4'),
327		'ŠČ' => array('0', '2', '4', '4'),
328		'SCH' => array('0', '4', '4', '4'),
329		'SCHD' => array('0', '2', '43', '43'),
330		'SCHT' => array('0', '2', '43', '43'),
331		'SCHTCH' => array('0', '2', '4', '4'),
332		'SCHTSCH' => array('0', '2', '4', '4'),
333		'SCHTSH' => array('0', '2', '4', '4'),
334		'SD' => array('0', '2', '43', '43'),
335		'SH' => array('0', '4', '4', '4'),
336		'SHCH' => array('0', '2', '4', '4'),
337		'SHD' => array('0', '2', '43', '43'),
338		'SHT' => array('0', '2', '43', '43'),
339		'SHTCH' => array('0', '2', '4', '4'),
340		'SHTSH' => array('0', '2', '4', '4'),
341		'ß' => array('0', '', '4', '4'),
342		'ST' => array('0', '2', '43', '43'),
343		'STCH' => array('0', '2', '4', '4'),
344		'STRS' => array('0', '2', '4', '4'),
345		'STRZ' => array('0', '2', '4', '4'),
346		'STSCH' => array('0', '2', '4', '4'),
347		'STSH' => array('0', '2', '4', '4'),
348		'SSZ' => array('0', '4', '4', '4'),
349		'SZ' => array('0', '4', '4', '4'),
350		'SZCS' => array('0', '2', '4', '4'),
351		'SZCZ' => array('0', '2', '4', '4'),
352		'SZD' => array('0', '2', '43', '43'),
353		'SZT' => array('0', '2', '43', '43'),
354		'T' => array('0', '3', '3', '3'),
355		'Ť' => array('0', '3', '3', '3'),
356		'Ţ' => array('0', '3', '3', '3', '4', '4', '4'),
357		'TC' => array('0', '4', '4', '4'),
358		'TCH' => array('0', '4', '4', '4'),
359		'TH' => array('0', '3', '3', '3'),
360		'TRS' => array('0', '4', '4', '4'),
361		'TRZ' => array('0', '4', '4', '4'),
362		'TS' => array('0', '4', '4', '4'),
363		'TSCH' => array('0', '4', '4', '4'),
364		'TSH' => array('0', '4', '4', '4'),
365		'TSZ' => array('0', '4', '4', '4'),
366		'TTCH' => array('0', '4', '4', '4'),
367		'TTS' => array('0', '4', '4', '4'),
368		'TTSCH' => array('0', '4', '4', '4'),
369		'TTSZ' => array('0', '4', '4', '4'),
370		'TTZ' => array('0', '4', '4', '4'),
371		'TZ' => array('0', '4', '4', '4'),
372		'TZS' => array('0', '4', '4', '4'),
373		'U' => array('1', '0', '', ''),
374		'Ù' => array('1', '0', '', ''),
375		'Ú' => array('1', '0', '', ''),
376		'Û' => array('1', '0', '', ''),
377		'Ü' => array('1', '0', '', ''),
378		'Ũ' => array('1', '0', '', ''),
379		'Ū' => array('1', '0', '', ''),
380		'Ů' => array('1', '0', '', ''),
381		'Ű' => array('1', '0', '', ''),
382		'Ų' => array('1', '0', '', ''),
383		'Ư' => array('1', '0', '', ''),
384		'Ụ' => array('1', '0', '', ''),
385		'Ủ' => array('1', '0', '', ''),
386		'Ứ' => array('1', '0', '', ''),
387		'Ừ' => array('1', '0', '', ''),
388		'Ử' => array('1', '0', '', ''),
389		'Ữ' => array('1', '0', '', ''),
390		'Ự' => array('1', '0', '', ''),
391		'UE' => array('1', '0', '', ''),
392		'UI' => array('1', '0', '1', ''),
393		'UJ' => array('1', '0', '1', ''),
394		'UY' => array('1', '0', '1', ''),
395		'UW' => array('1', '0', '1', '', '0', '7', '7'),
396		'V' => array('0', '7', '7', '7'),
397		'W' => array('0', '7', '7', '7'),
398		'X' => array('0', '5', '54', '54'),
399		'Y' => array('1', '1', '', ''),
400		'Ý' => array('1', '1', '', ''),
401		'Ỳ' => array('1', '1', '', ''),
402		'Ỵ' => array('1', '1', '', ''),
403		'Ỷ' => array('1', '1', '', ''),
404		'Ỹ' => array('1', '1', '', ''),
405		'Z' => array('0', '4', '4', '4'),
406		'Ź' => array('0', '4', '4', '4'),
407		'Ż' => array('0', '4', '4', '4'),
408		'Ž' => array('0', '4', '4', '4'),
409		'ZD' => array('0', '2', '43', '43'),
410		'ZDZ' => array('0', '2', '4', '4'),
411		'ZDZH' => array('0', '2', '4', '4'),
412		'ZH' => array('0', '4', '4', '4'),
413		'ZHD' => array('0', '2', '43', '43'),
414		'ZHDZH' => array('0', '2', '4', '4'),
415		'ZS' => array('0', '4', '4', '4'),
416		'ZSCH' => array('0', '4', '4', '4'),
417		'ZSH' => array('0', '4', '4', '4'),
418		'ZZS' => array('0', '4', '4', '4'),
419		// Cyrillic alphabet
420		'А' => array('1', '0', '', ''),
421		'Б' => array('0', '7', '7', '7'),
422		'В' => array('0', '7', '7', '7'),
423		'Г' => array('0', '5', '5', '5'),
424		'Д' => array('0', '3', '3', '3'),
425		'ДЗ' => array('0', '4', '4', '4'),
426		'Е' => array('1', '0', '', ''),
427		'Ё' => array('1', '0', '', ''),
428		'Ж' => array('0', '4', '4', '4'),
429		'З' => array('0', '4', '4', '4'),
430		'И' => array('1', '0', '', ''),
431		'Й' => array('1', '1', '', '', '4', '4', '4'),
432		'К' => array('0', '5', '5', '5'),
433		'Л' => array('0', '8', '8', '8'),
434		'М' => array('0', '6', '6', '6'),
435		'Н' => array('0', '6', '6', '6'),
436		'О' => array('1', '0', '', ''),
437		'П' => array('0', '7', '7', '7'),
438		'Р' => array('0', '9', '9', '9'),
439		'РЖ' => array('0', '4', '4', '4'),
440		'С' => array('0', '4', '4', '4'),
441		'Т' => array('0', '3', '3', '3'),
442		'У' => array('1', '0', '', ''),
443		'Ф' => array('0', '7', '7', '7'),
444		'Х' => array('0', '5', '5', '5'),
445		'Ц' => array('0', '4', '4', '4'),
446		'Ч' => array('0', '4', '4', '4'),
447		'Ш' => array('0', '4', '4', '4'),
448		'Щ' => array('0', '2', '4', '4'),
449		'Ъ' => array('0', '', '', ''),
450		'Ы' => array('0', '1', '', ''),
451		'Ь' => array('0', '', '', ''),
452		'Э' => array('1', '0', '', ''),
453		'Ю' => array('0', '1', '', ''),
454		'Я' => array('0', '1', '', ''),
455		// Greek alphabet
456		'Α' => array('1', '0', '', ''),
457		'Ά' => array('1', '0', '', ''),
458		'ΑΙ' => array('1', '0', '1', ''),
459		'ΑΥ' => array('1', '0', '1', ''),
460		'Β' => array('0', '7', '7', '7'),
461		'Γ' => array('0', '5', '5', '5'),
462		'Δ' => array('0', '3', '3', '3'),
463		'Ε' => array('1', '0', '', ''),
464		'Έ' => array('1', '0', '', ''),
465		'ΕΙ' => array('1', '0', '1', ''),
466		'ΕΥ' => array('1', '1', '1', ''),
467		'Ζ' => array('0', '4', '4', '4'),
468		'Η' => array('1', '0', '', ''),
469		'Ή' => array('1', '0', '', ''),
470		'Θ' => array('0', '3', '3', '3'),
471		'Ι' => array('1', '0', '', ''),
472		'Ί' => array('1', '0', '', ''),
473		'Ϊ' => array('1', '0', '', ''),
474		'ΐ' => array('1', '0', '', ''),
475		'Κ' => array('0', '5', '5', '5'),
476		'Λ' => array('0', '8', '8', '8'),
477		'Μ' => array('0', '6', '6', '6'),
478		'ΜΠ' => array('0', '7', '7', '7'),
479		'Ν' => array('0', '6', '6', '6'),
480		'ΝΤ' => array('0', '3', '3', '3'),
481		'Ξ' => array('0', '5', '54', '54'),
482		'Ο' => array('1', '0', '', ''),
483		'Ό' => array('1', '0', '', ''),
484		'ΟΙ' => array('1', '0', '1', ''),
485		'ΟΥ' => array('1', '0', '1', ''),
486		'Π' => array('0', '7', '7', '7'),
487		'Ρ' => array('0', '9', '9', '9'),
488		'Σ' => array('0', '4', '4', '4'),
489		'ς' => array('0', '', '', '4'),
490		'Τ' => array('0', '3', '3', '3'),
491		'ΤΖ' => array('0', '4', '4', '4'),
492		'ΤΣ' => array('0', '4', '4', '4'),
493		'Υ' => array('1', '1', '', ''),
494		'Ύ' => array('1', '1', '', ''),
495		'Ϋ' => array('1', '1', '', ''),
496		'ΰ' => array('1', '1', '', ''),
497		'ΥΚ' => array('1', '5', '5', '5'),
498		'ΥΥ' => array('1', '65', '65', '65'),
499		'Φ' => array('0', '7', '7', '7'),
500		'Χ' => array('0', '5', '5', '5'),
501		'Ψ' => array('0', '7', '7', '7'),
502		'Ω' => array('1', '0', '', ''),
503		'Ώ' => array('1', '0', '', ''),
504		// Hebrew alphabet
505		'א' => array('1', '0', '', ''),
506		'או' => array('1', '0', '7', ''),
507		'אג' => array('1', '4', '4', '4', '5', '5', '5', '34', '34', '34'),
508		'בב' => array('0', '7', '7', '7', '77', '77', '77'),
509		'ב' => array('0', '7', '7', '7'),
510		'גג' => array('0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'),
511		'גד' => array('0', '43', '43', '43', '53', '53', '53'),
512		'גה' => array('0', '45', '45', '45', '55', '55', '55'),
513		'גז' => array('0', '44', '44', '44', '45', '45', '45'),
514		'גח' => array('0', '45', '45', '45', '55', '55', '55'),
515		'גכ' => array('0', '45', '45', '45', '55', '55', '55'),
516		'גך' => array('0', '45', '45', '45', '55', '55', '55'),
517		'גצ' => array('0', '44', '44', '44', '45', '45', '45'),
518		'גץ' => array('0', '44', '44', '44', '45', '45', '45'),
519		'גק' => array('0', '45', '45', '45', '54', '54', '54'),
520		'גש' => array('0', '44', '44', '44', '54', '54', '54'),
521		'גת' => array('0', '43', '43', '43', '53', '53', '53'),
522		'ג' => array('0', '4', '4', '4', '5', '5', '5'),
523		'דז' => array('0', '4', '4', '4'),
524		'דד' => array('0', '3', '3', '3', '33', '33', '33'),
525		'דט' => array('0', '33', '33', '33'),
526		'דש' => array('0', '4', '4', '4'),
527		'דצ' => array('0', '4', '4', '4'),
528		'דץ' => array('0', '4', '4', '4'),
529		'ד' => array('0', '3', '3', '3'),
530		'הג' => array('0', '54', '54', '54', '55', '55', '55'),
531		'הכ' => array('0', '55', '55', '55'),
532		'הח' => array('0', '55', '55', '55'),
533		'הק' => array('0', '55', '55', '55', '5', '5', '5'),
534		'הה' => array('0', '5', '5', '', '55', '55', ''),
535		'ה' => array('0', '5', '5', ''),
536		'וי' => array('1', '', '', '', '7', '7', '7'),
537		'ו' => array('1', '7', '7', '7', '7', '', ''),
538		'וו' => array('1', '7', '7', '7', '7', '', ''),
539		'וופ' => array('1', '7', '7', '7', '77', '77', '77'),
540		'זש' => array('0', '4', '4', '4', '44', '44', '44'),
541		'זדז' => array('0', '2', '4', '4'),
542		'ז' => array('0', '4', '4', '4'),
543		'זג' => array('0', '44', '44', '44', '45', '45', '45'),
544		'זז' => array('0', '4', '4', '4', '44', '44', '44'),
545		'זס' => array('0', '44', '44', '44'),
546		'זצ' => array('0', '44', '44', '44'),
547		'זץ' => array('0', '44', '44', '44'),
548		'חג' => array('0', '54', '54', '54', '53', '53', '53'),
549		'חח' => array('0', '5', '5', '5', '55', '55', '55'),
550		'חק' => array('0', '55', '55', '55', '5', '5', '5'),
551		'חכ' => array('0', '45', '45', '45', '55', '55', '55'),
552		'חס' => array('0', '5', '54', '54'),
553		'חש' => array('0', '5', '54', '54'),
554		'ח' => array('0', '5', '5', '5'),
555		'טש' => array('0', '4', '4', '4'),
556		'טד' => array('0', '33', '33', '33'),
557		'טי' => array('0', '3', '3', '3', '4', '4', '4', '3', '3', '34'),
558		'טת' => array('0', '33', '33', '33'),
559		'טט' => array('0', '3', '3', '3', '33', '33', '33'),
560		'ט' => array('0', '3', '3', '3'),
561		'י' => array('1', '1', '', ''),
562		'יא' => array('1', '1', '', '', '1', '1', '1'),
563		'כג' => array('0', '55', '55', '55', '54', '54', '54'),
564		'כש' => array('0', '5', '54', '54'),
565		'כס' => array('0', '5', '54', '54'),
566		'ככ' => array('0', '5', '5', '5', '55', '55', '55'),
567		'כך' => array('0', '5', '5', '5', '55', '55', '55'),
568		'כ' => array('0', '5', '5', '5'),
569		'כח' => array('0', '55', '55', '55', '5', '5', '5'),
570		'ך' => array('0', '', '5', '5'),
571		'ל' => array('0', '8', '8', '8'),
572		'לל' => array('0', '88', '88', '88', '8', '8', '8'),
573		'מנ' => array('0', '66', '66', '66'),
574		'מן' => array('0', '66', '66', '66'),
575		'ממ' => array('0', '6', '6', '6', '66', '66', '66'),
576		'מם' => array('0', '6', '6', '6', '66', '66', '66'),
577		'מ' => array('0', '6', '6', '6'),
578		'ם' => array('0', '', '6', '6'),
579		'נמ' => array('0', '66', '66', '66'),
580		'נם' => array('0', '66', '66', '66'),
581		'ננ' => array('0', '6', '6', '6', '66', '66', '66'),
582		'נן' => array('0', '6', '6', '6', '66', '66', '66'),
583		'נ' => array('0', '6', '6', '6'),
584		'ן' => array('0', '', '6', '6'),
585		'סתש' => array('0', '2', '4', '4'),
586		'סתז' => array('0', '2', '4', '4'),
587		'סטז' => array('0', '2', '4', '4'),
588		'סטש' => array('0', '2', '4', '4'),
589		'סצד' => array('0', '2', '4', '4'),
590		'סט' => array('0', '2', '4', '4', '43', '43', '43'),
591		'סת' => array('0', '2', '4', '4', '43', '43', '43'),
592		'סג' => array('0', '44', '44', '44', '4', '4', '4'),
593		'סס' => array('0', '4', '4', '4', '44', '44', '44'),
594		'סצ' => array('0', '44', '44', '44'),
595		'סץ' => array('0', '44', '44', '44'),
596		'סז' => array('0', '44', '44', '44'),
597		'סש' => array('0', '44', '44', '44'),
598		'ס' => array('0', '4', '4', '4'),
599		'ע' => array('1', '0', '', ''),
600		'פב' => array('0', '7', '7', '7', '77', '77', '77'),
601		'פוו' => array('0', '7', '7', '7', '77', '77', '77'),
602		'פפ' => array('0', '7', '7', '7', '77', '77', '77'),
603		'פף' => array('0', '7', '7', '7', '77', '77', '77'),
604		'פ' => array('0', '7', '7', '7'),
605		'ף' => array('0', '', '7', '7'),
606		'צג' => array('0', '44', '44', '44', '45', '45', '45'),
607		'צז' => array('0', '44', '44', '44'),
608		'צס' => array('0', '44', '44', '44'),
609		'צצ' => array('0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'),
610		'צץ' => array('0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'),
611		'צש' => array('0', '44', '44', '44', '4', '4', '4', '5', '5', '5'),
612		'צ' => array('0', '4', '4', '4', '5', '5', '5'),
613		'ץ' => array('0', '', '4', '4'),
614		'קה' => array('0', '55', '55', '5'),
615		'קס' => array('0', '5', '54', '54'),
616		'קש' => array('0', '5', '54', '54'),
617		'קק' => array('0', '5', '5', '5', '55', '55', '55'),
618		'קח' => array('0', '55', '55', '55'),
619		'קכ' => array('0', '55', '55', '55'),
620		'קך' => array('0', '55', '55', '55'),
621		'קג' => array('0', '55', '55', '55', '54', '54', '54'),
622		'ק' => array('0', '5', '5', '5'),
623		'רר' => array('0', '99', '99', '99', '9', '9', '9'),
624		'ר' => array('0', '9', '9', '9'),
625		'שטז' => array('0', '2', '4', '4'),
626		'שתש' => array('0', '2', '4', '4'),
627		'שתז' => array('0', '2', '4', '4'),
628		'שטש' => array('0', '2', '4', '4'),
629		'שד' => array('0', '2', '43', '43'),
630		'שז' => array('0', '44', '44', '44'),
631		'שס' => array('0', '44', '44', '44'),
632		'שת' => array('0', '2', '43', '43'),
633		'שג' => array('0', '4', '4', '4', '44', '44', '44', '4', '43', '43'),
634		'שט' => array('0', '2', '43', '43', '44', '44', '44'),
635		'שצ' => array('0', '44', '44', '44', '45', '45', '45'),
636		'שץ' => array('0', '44', '', '44', '45', '', '45'),
637		'שש' => array('0', '4', '4', '4', '44', '44', '44'),
638		'ש' => array('0', '4', '4', '4'),
639		'תג' => array('0', '34', '34', '34'),
640		'תז' => array('0', '34', '34', '34'),
641		'תש' => array('0', '4', '4', '4'),
642		'תת' => array('0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'),
643		'ת' => array('0', '3', '3', '3', '4', '4', '4'),
644		// Arabic alphabet
645		'ا' => array('1', '0', '', ''),
646		'ب' => array('0', '7', '7', '7'),
647		'ت' => array('0', '3', '3', '3'),
648		'ث' => array('0', '3', '3', '3'),
649		'ج' => array('0', '4', '4', '4'),
650		'ح' => array('0', '5', '5', '5'),
651		'خ' => array('0', '5', '5', '5'),
652		'د' => array('0', '3', '3', '3'),
653		'ذ' => array('0', '3', '3', '3'),
654		'ر' => array('0', '9', '9', '9'),
655		'ز' => array('0', '4', '4', '4'),
656		'س' => array('0', '4', '4', '4'),
657		'ش' => array('0', '4', '4', '4'),
658		'ص' => array('0', '4', '4', '4'),
659		'ض' => array('0', '3', '3', '3'),
660		'ط' => array('0', '3', '3', '3'),
661		'ظ' => array('0', '4', '4', '4'),
662		'ع' => array('1', '0', '', ''),
663		'غ' => array('0', '0', '', ''),
664		'ف' => array('0', '7', '7', '7'),
665		'ق' => array('0', '5', '5', '5'),
666		'ك' => array('0', '5', '5', '5'),
667		'ل' => array('0', '8', '8', '8'),
668		'لا' => array('0', '8', '8', '8'),
669		'م' => array('0', '6', '6', '6'),
670		'ن' => array('0', '6', '6', '6'),
671		'هن' => array('0', '66', '66', '66'),
672		'ه' => array('0', '5', '5', ''),
673		'و' => array('1', '', '', '', '7', '', ''),
674		'ي' => array('0', '1', '', ''),
675		'آ' => array('0', '1', '', ''),
676		'ة' => array('0', '', '', '3'),
677		'ی' => array('0', '1', '', ''),
678		'ى' => array('1', '1', '', ''),
679	);
680
681	/**
682	 * @param string $name
683	 *
684	 * @return string[] List of possible DM codes for the word.
685	 */
686	private static function daitchMokotoffWord($name) {
687		// Apply special transformation rules to the input string
688		$name = I18N::strtoupper($name);
689		foreach (self::$transformNameTable as $transformRule) {
690			$name = str_replace($transformRule[0], $transformRule[1], $name);
691		}
692
693		// Initialize
694		$name_script = I18N::textScript($name);
695		$noVowels = ($name_script == 'Hebr' || $name_script == 'Arab');
696
697		$lastPos         = strlen($name) - 1;
698		$currPos         = 0;
699		$state           = 1; // 1: start of input string, 2: before vowel, 3: other
700		$result          = array(); // accumulate complete 6-digit D-M codes here
701		$partialResult   = array(); // accumulate incomplete D-M codes here
702		$partialResult[] = array('!'); // initialize 1st partial result  ('!' stops "duplicate sound" check)
703
704		// Loop through the input string.
705		// Stop when the string is exhausted or when no more partial results remain
706		while (count($partialResult) !== 0 && $currPos <= $lastPos) {
707			// Find the DM coding table entry for the chunk at the current position
708			$thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
709			while ($thisEntry != '') {
710				if (isset(self::$dmsounds[$thisEntry])) {
711					break;
712				}
713				$thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
714			}
715			if ($thisEntry === '') {
716				$currPos++; // Not in table: advance pointer to next byte
717				continue; // and try again
718			}
719
720			$soundTableEntry = self::$dmsounds[$thisEntry];
721			$workingResult   = $partialResult;
722			$partialResult   = array();
723			$currPos += strlen($thisEntry);
724
725			// Not at beginning of input string
726			if ($state != 1) {
727				if ($currPos <= $lastPos) {
728					// Determine whether the next chunk is a vowel
729					$nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
730					while ($nextEntry != '') {
731						if (isset(self::$dmsounds[$nextEntry])) {
732							break;
733						}
734						$nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
735					}
736				} else {
737					$nextEntry = '';
738				}
739				if ($nextEntry != '' && self::$dmsounds[$nextEntry][0] != '0') {
740					$state = 2;
741				} else {
742					// Next chunk is a vowel
743					$state = 3;
744				}
745			}
746
747			while ($state < count($soundTableEntry)) {
748				// empty means 'ignore this sound in this state'
749				if ($soundTableEntry[$state] == '') {
750					foreach ($workingResult as $workingEntry) {
751						$tempEntry = $workingEntry;
752						$tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
753						$partialResult[] = $tempEntry;
754					}
755				} else {
756					foreach ($workingResult as $workingEntry) {
757						if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
758							// Incoming sound isn't a duplicate of the previous sound
759							$workingEntry[] = $soundTableEntry[$state];
760						} else {
761							// Incoming sound is a duplicate of the previous sound
762							// For Hebrew and Arabic, we need to create a pair of D-M sound codes,
763							// one of the pair with only a single occurrence of the duplicate sound,
764							// the other with both occurrences
765							if ($noVowels) {
766								$workingEntry[] = $soundTableEntry[$state];
767							}
768						}
769						if (count($workingEntry) < 7) {
770							$partialResult[] = $workingEntry;
771						} else {
772							// This is the 6th code in the sequence
773							// We're looking for 7 entries because the first is '!' and doesn't count
774							$tempResult = str_replace('!', '', implode('', $workingEntry));
775							// Only return codes from recognisable sounds
776							if ($tempResult) {
777								$result[] = substr($tempResult . '000000', 0, 6);
778							}
779						}
780					}
781				}
782				$state = $state + 3; // Advance to next triplet while keeping the same basic state
783			}
784		}
785
786		// Zero-fill and copy all remaining partial results
787		foreach ($partialResult as $workingEntry) {
788			$tempResult = str_replace('!', '', implode('', $workingEntry));
789			// Only return codes from recognisable sounds
790			if ($tempResult) {
791				$result[] = substr($tempResult . '000000', 0, 6);
792			}
793		}
794
795		return $result;
796	}
797}
798