<?php
/**
 * webtrees: online genealogy
 * Copyright (C) 2018 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
namespace Fisharebest\Webtrees;

/**
 * Phonetic matching of strings.
 *
 * Two algorithms are provided: Russell soundex (PHP's built-in soundex())
 * and Daitch–Mokotoff soundex (table driven, implemented in this class).
 * Both produce a colon-separated list of codes for a piece of text.
 */
class Soundex
{
    /**
     * Which algorithms are supported.
     *
     * @return string[] Translated algorithm names, keyed by algorithm code.
     */
    public static function getAlgorithms()
    {
        return [
            'std' => /* I18N: http://en.wikipedia.org/wiki/Soundex */ I18N::translate('Russell'),
            'dm'  => /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */ I18N::translate('Daitch-Mokotoff'),
        ];
    }

    /**
     * Is there a match between two soundex codes?
     *
     * Each argument is a colon-separated list of codes, as produced by
     * russell() or daitchMokotoff(). The lists match when they have at
     * least one code in common. An empty list never matches.
     *
     * @param string $soundex1
     * @param string $soundex2
     *
     * @return bool
     */
    public static function compare($soundex1, $soundex2)
    {
        if ($soundex1 === '' || $soundex2 === '') {
            return false;
        }

        $common = array_intersect(explode(':', $soundex1), explode(':', $soundex2));

        return !empty($common);
    }

    /**
     * Generate Russell soundex codes for a given text.
     *
     * One code is generated for each whitespace-separated word, plus one
     * for all the words joined together. Codes of "0000" (text without any
     * recognisable sound) are discarded.
     *
     * @param string $text
     *
     * @return string Colon-separated list of unique codes, or an empty string.
     */
    public static function russell($text)
    {
        $words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
        $soundex_array = [];

        foreach ($words as $word) {
            $soundex = soundex($word);
            // Only return codes from recognisable sounds
            if ($soundex !== '0000') {
                $soundex_array[] = $soundex;
            }
        }

        // Combine words, e.g. “New York” as “Newyork”.
        // The words are joined explicitly: strtr($text, ' ', '') does not remove
        // anything, because strtr() ignores the extra characters when its "from"
        // and "to" arguments have different lengths.
        if (count($words) > 1) {
            $soundex = soundex(implode('', $words));
            // Only return codes from recognisable sounds
            if ($soundex !== '0000') {
                $soundex_array[] = $soundex;
            }
        }

        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

        return implode(':', $soundex_array);
    }

    /**
     * Generate Daitch–Mokotoff soundex codes for a given text.
     *
     * One set of codes is generated for each whitespace-separated word, plus
     * one set for all the words joined together.
     *
     * @param string $text
     *
     * @return string Colon-separated list of unique codes, or an empty string.
     */
    public static function daitchMokotoff($text)
    {
        $words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
        $soundex_array = [];

        foreach ($words as $word) {
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
        }

        // Combine words, e.g. “New York” as “Newyork”.
        // The words are joined explicitly (strtr($text, ' ', '') is a no-op), so that
        // the vowel look-ahead and multi-letter table keys work across the word break.
        if (count($words) > 1) {
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(implode('', $words)));
        }

        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);

        return implode(':', $soundex_array);
    }

    // Determine the Daitch–Mokotoff Soundex code for a word
    // Original implementation by Gerry Kroll, and analysis by Meliza Amity

    // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
    const MAXCHAR = 7;

    /**
     * Name transformation arrays.
     * Used to transform the Name string to simplify the "sounds like" table.
     * This is especially useful in Hebrew.
     *
     * Each array entry is a [from, to] pair. The pairs are applied in order,
     * using str_replace($from, $to, $name), in daitchMokotoffWord().
     *
     * Note about the use of "\x01":
     * This code, which can’t legitimately occur in the kind of text we're dealing with,
     * is used as a place-holder so that conditional string replacements can be done.
     *
     * NOTE(review): two rules end in "$", which looks like an end-of-word anchor
     * from a regular expression. str_replace() treats "$" literally, so those rules
     * only match text containing a dollar sign. The behaviour is left unchanged
     * here, as changing it would alter the codes that are generated - confirm intent.
     *
     * @var string[][]
     */
    private static $transformNameTable = [
        // Force Yiddish ligatures to be treated as separate letters
        ['װ', 'וו'],
        ['ײ', 'יי'],
        ['ױ', 'וי'],
        ['בו', 'בע'],
        ['פו', 'פע'],
        ['ומ', 'עמ'],
        ['ום', 'עם'],
        ['ונ', 'ענ'],
        ['ון', 'ען'],
        ['וו', 'ב'],
        ["\x01", ''],
        ['ייה$', "\x01ה"],
        ['ייע$', "\x01ע"],
        ['יי', 'ע'],
        ["\x01", 'יי'],
    ];

    /**
     * The DM sound coding table is organized this way:
     * key: a variable-length string that corresponds to the UTF-8 character sequence
     * represented by the table entry. Currently, that string can be up to 7
     * bytes long. This maximum length is defined by the class constant
     * self::MAXCHAR.
     *
     * value: an array as follows:
     * [0]: zero if not a vowel
     * [1]: sound value when this string is at the beginning of the word
     * [2]: sound value when this string is followed by a vowel
     * [3]: sound value for other cases
     * [1],[2],[3] can be repeated several times to create branches in the code
     * an empty sound value means "ignore in this state"
     *
     * @var string[][]
     */
    private static $dmsounds = [
        'A' => ['1', '0', '', ''],
        'À' => ['1', '0', '', ''],
        'Á' => ['1', '0', '', ''],
        'Â' => ['1', '0', '', ''],
        'Ã' => ['1', '0', '', ''],
        'Ä' => ['1', '0', '1', '', '0', '', ''],
        'Å' => ['1', '0', '', ''],
        'Ă' => ['1', '0', '', ''],
        'Ą' => ['1', '', '', '', '', '', '6'],
        'Ạ' => ['1', '0', '', ''],
        'Ả' => ['1', '0', '', ''],
        'Ấ' => ['1', '0', '', ''],
        'Ầ' => ['1', '0', '', ''],
        'Ẩ' => ['1', '0', '', ''],
        'Ẫ' => ['1', '0', '', ''],
        'Ậ' => ['1', '0', '', ''],
        'Ắ' => ['1', '0', '', ''],
        'Ằ' => ['1', '0', '', ''],
        'Ẳ' => ['1', '0', '', ''],
        'Ẵ' => ['1', '0', '', ''],
        'Ặ' => ['1', '0', '', ''],
        'AE' => ['1', '0', '1', ''],
        'Æ' => ['1', '0', '1', ''],
        'AI' => ['1', '0', '1', ''],
        'AJ' => ['1', '0', '1', ''],
        'AU' => ['1', '0', '7', ''],
        'AV' => ['1', '0', '7', '', '7', '7', '7'],
        'ÄU' => ['1', '0', '1', ''],
        'AY' => ['1', '0', '1', ''],
        'B' => ['0', '7', '7', '7'],
        'C' => ['0', '5', '5', '5', '34', '4', '4'],
        'Ć' => ['0', '4', '4', '4'],
        'Č' => ['0', '4', '4', '4'],
        'Ç' => ['0', '4', '4', '4'],
        'CH' => ['0', '5', '5', '5', '34', '4', '4'],
        'CHS' => ['0', '5', '54', '54'],
        'CK' => ['0', '5', '5', '5', '45', '45', '45'],
        'CCS' => ['0', '4', '4', '4'],
        'CS' => ['0', '4', '4', '4'],
        'CSZ' => ['0', '4', '4', '4'],
        'CZ' => ['0', '4', '4', '4'],
        'CZS' => ['0', '4', '4', '4'],
        'D' => ['0', '3', '3', '3'],
        'Ď' => ['0', '3', '3', '3'],
        'Đ' => ['0', '3', '3', '3'],
        'DRS' => ['0', '4', '4', '4'],
        'DRZ' => ['0', '4', '4', '4'],
        'DS' => ['0', '4', '4', '4'],
        'DSH' => ['0', '4', '4', '4'],
        'DSZ' => ['0', '4', '4', '4'],
        'DT' => ['0', '3', '3', '3'],
        'DDZ' => ['0', '4', '4', '4'],
        'DDZS' => ['0', '4', '4', '4'],
        'DZ' => ['0', '4', '4', '4'],
        'DŹ' => ['0', '4', '4', '4'],
        'DŻ' => ['0', '4', '4', '4'],
        'DZH' => ['0', '4', '4', '4'],
        'DZS' => ['0', '4', '4', '4'],
        'E' => ['1', '0', '', ''],
        'È' => ['1', '0', '', ''],
        'É' => ['1', '0', '', ''],
        'Ê' => ['1', '0', '', ''],
        'Ë' => ['1', '0', '', ''],
        'Ĕ' => ['1', '0', '', ''],
        'Ė' => ['1', '0', '', ''],
        'Ę' => ['1', '', '', '6', '', '', ''],
        'Ẹ' => ['1', '0', '', ''],
        'Ẻ' => ['1', '0', '', ''],
        'Ẽ' => ['1', '0', '', ''],
        'Ế' => ['1', '0', '', ''],
        'Ề' => ['1', '0', '', ''],
        'Ể' => ['1', '0', '', ''],
        'Ễ' => ['1', '0', '', ''],
        'Ệ' => ['1', '0', '', ''],
        'EAU' => ['1', '0', '', ''],
        'EI' => ['1', '0', '1', ''],
        'EJ' => ['1', '0', '1', ''],
        'EU' => ['1', '1', '1', ''],
        'EY' => ['1', '0', '1', ''],
        'F' => ['0', '7', '7', '7'],
        'FB' => ['0', '7', '7', '7'],
        'G' => ['0', '5', '5', '5', '34', '4', '4'],
        'Ğ' => ['0', '', '', ''],
        'GGY' => ['0', '5', '5', '5'],
        'GY' => ['0', '5', '5', '5'],
        'H' => ['0', '5', '5', '', '5', '5', '5'],
        'I' => ['1', '0', '', ''],
        'Ì' => ['1', '0', '', ''],
        'Í' => ['1', '0', '', ''],
        'Î' => ['1', '0', '', ''],
        'Ï' => ['1', '0', '', ''],
        'Ĩ' => ['1', '0', '', ''],
        'Į' => ['1', '0', '', ''],
        'İ' => ['1', '0', '', ''],
        'Ỉ' => ['1', '0', '', ''],
        'Ị' => ['1', '0', '', ''],
        'IA' => ['1', '1', '', ''],
        'IE' => ['1', '1', '', ''],
        'IO' => ['1', '1', '', ''],
        'IU' => ['1', '1', '', ''],
        'J' => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
        'K' => ['0', '5', '5', '5'],
        'KH' => ['0', '5', '5', '5'],
        'KS' => ['0', '5', '54', '54'],
        'L' => ['0', '8', '8', '8'],
        'Ľ' => ['0', '8', '8', '8'],
        'Ĺ' => ['0', '8', '8', '8'],
        'Ł' => ['0', '7', '7', '7', '8', '8', '8'],
        'LL' => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
        'LLY' => ['0', '8', '8', '8', '1', '8', '8'],
        'LY' => ['0', '8', '8', '8', '1', '8', '8'],
        'M' => ['0', '6', '6', '6'],
        'MĔ' => ['0', '66', '66', '66'],
        'MN' => ['0', '66', '66', '66'],
        'N' => ['0', '6', '6', '6'],
        'Ń' => ['0', '6', '6', '6'],
        'Ň' => ['0', '6', '6', '6'],
        'Ñ' => ['0', '6', '6', '6'],
        'NM' => ['0', '66', '66', '66'],
        'O' => ['1', '0', '', ''],
        'Ò' => ['1', '0', '', ''],
        'Ó' => ['1', '0', '', ''],
        'Ô' => ['1', '0', '', ''],
        'Õ' => ['1', '0', '', ''],
        'Ö' => ['1', '0', '', ''],
        'Ø' => ['1', '0', '', ''],
        'Ő' => ['1', '0', '', ''],
        'Œ' => ['1', '0', '', ''],
        'Ơ' => ['1', '0', '', ''],
        'Ọ' => ['1', '0', '', ''],
        'Ỏ' => ['1', '0', '', ''],
        'Ố' => ['1', '0', '', ''],
        'Ồ' => ['1', '0', '', ''],
        'Ổ' => ['1', '0', '', ''],
        'Ỗ' => ['1', '0', '', ''],
        'Ộ' => ['1', '0', '', ''],
        'Ớ' => ['1', '0', '', ''],
        'Ờ' => ['1', '0', '', ''],
        'Ở' => ['1', '0', '', ''],
        'Ỡ' => ['1', '0', '', ''],
        'Ợ' => ['1', '0', '', ''],
        'OE' => ['1', '0', '', ''],
        'OI' => ['1', '0', '1', ''],
        'OJ' => ['1', '0', '1', ''],
        'OU' => ['1', '0', '', ''],
        'OY' => ['1', '0', '1', ''],
        'P' => ['0', '7', '7', '7'],
        'PF' => ['0', '7', '7', '7'],
        'PH' => ['0', '7', '7', '7'],
        'Q' => ['0', '5', '5', '5'],
        'R' => ['0', '9', '9', '9'],
        'Ř' => ['0', '4', '4', '4'],
        'RS' => ['0', '4', '4', '4', '94', '94', '94'],
        'RZ' => ['0', '4', '4', '4', '94', '94', '94'],
        'S' => ['0', '4', '4', '4'],
        'Ś' => ['0', '4', '4', '4'],
        'Š' => ['0', '4', '4', '4'],
        'Ş' => ['0', '4', '4', '4'],
        'SC' => ['0', '2', '4', '4'],
        'ŠČ' => ['0', '2', '4', '4'],
        'SCH' => ['0', '4', '4', '4'],
        'SCHD' => ['0', '2', '43', '43'],
        'SCHT' => ['0', '2', '43', '43'],
        'SCHTCH' => ['0', '2', '4', '4'],
        'SCHTSCH' => ['0', '2', '4', '4'],
        'SCHTSH' => ['0', '2', '4', '4'],
        'SD' => ['0', '2', '43', '43'],
        'SH' => ['0', '4', '4', '4'],
        'SHCH' => ['0', '2', '4', '4'],
        'SHD' => ['0', '2', '43', '43'],
        'SHT' => ['0', '2', '43', '43'],
        'SHTCH' => ['0', '2', '4', '4'],
        'SHTSH' => ['0', '2', '4', '4'],
        'ß' => ['0', '', '4', '4'],
        'ST' => ['0', '2', '43', '43'],
        'STCH' => ['0', '2', '4', '4'],
        'STRS' => ['0', '2', '4', '4'],
        'STRZ' => ['0', '2', '4', '4'],
        'STSCH' => ['0', '2', '4', '4'],
        'STSH' => ['0', '2', '4', '4'],
        'SSZ' => ['0', '4', '4', '4'],
        'SZ' => ['0', '4', '4', '4'],
        'SZCS' => ['0', '2', '4', '4'],
        'SZCZ' => ['0', '2', '4', '4'],
        'SZD' => ['0', '2', '43', '43'],
        'SZT' => ['0', '2', '43', '43'],
        'T' => ['0', '3', '3', '3'],
        'Ť' => ['0', '3', '3', '3'],
        'Ţ' => ['0', '3', '3', '3', '4', '4', '4'],
        'TC' => ['0', '4', '4', '4'],
        'TCH' => ['0', '4', '4', '4'],
        'TH' => ['0', '3', '3', '3'],
        'TRS' => ['0', '4', '4', '4'],
        'TRZ' => ['0', '4', '4', '4'],
        'TS' => ['0', '4', '4', '4'],
        'TSCH' => ['0', '4', '4', '4'],
        'TSH' => ['0', '4', '4', '4'],
        'TSZ' => ['0', '4', '4', '4'],
        'TTCH' => ['0', '4', '4', '4'],
        'TTS' => ['0', '4', '4', '4'],
        'TTSCH' => ['0', '4', '4', '4'],
        'TTSZ' => ['0', '4', '4', '4'],
        'TTZ' => ['0', '4', '4', '4'],
        'TZ' => ['0', '4', '4', '4'],
        'TZS' => ['0', '4', '4', '4'],
        'U' => ['1', '0', '', ''],
        'Ù' => ['1', '0', '', ''],
        'Ú' => ['1', '0', '', ''],
        'Û' => ['1', '0', '', ''],
        'Ü' => ['1', '0', '', ''],
        'Ũ' => ['1', '0', '', ''],
        'Ū' => ['1', '0', '', ''],
        'Ů' => ['1', '0', '', ''],
        'Ű' => ['1', '0', '', ''],
        'Ų' => ['1', '0', '', ''],
        'Ư' => ['1', '0', '', ''],
        'Ụ' => ['1', '0', '', ''],
        'Ủ' => ['1', '0', '', ''],
        'Ứ' => ['1', '0', '', ''],
        'Ừ' => ['1', '0', '', ''],
        'Ử' => ['1', '0', '', ''],
        'Ữ' => ['1', '0', '', ''],
        'Ự' => ['1', '0', '', ''],
        'UE' => ['1', '0', '', ''],
        'UI' => ['1', '0', '1', ''],
        'UJ' => ['1', '0', '1', ''],
        'UY' => ['1', '0', '1', ''],
        'UW' => ['1', '0', '1', '', '0', '7', '7'],
        'V' => ['0', '7', '7', '7'],
        'W' => ['0', '7', '7', '7'],
        'X' => ['0', '5', '54', '54'],
        'Y' => ['1', '1', '', ''],
        'Ý' => ['1', '1', '', ''],
        'Ỳ' => ['1', '1', '', ''],
        'Ỵ' => ['1', '1', '', ''],
        'Ỷ' => ['1', '1', '', ''],
        'Ỹ' => ['1', '1', '', ''],
        'Z' => ['0', '4', '4', '4'],
        'Ź' => ['0', '4', '4', '4'],
        'Ż' => ['0', '4', '4', '4'],
        'Ž' => ['0', '4', '4', '4'],
        'ZD' => ['0', '2', '43', '43'],
        'ZDZ' => ['0', '2', '4', '4'],
        'ZDZH' => ['0', '2', '4', '4'],
        'ZH' => ['0', '4', '4', '4'],
        'ZHD' => ['0', '2', '43', '43'],
        'ZHDZH' => ['0', '2', '4', '4'],
        'ZS' => ['0', '4', '4', '4'],
        'ZSCH' => ['0', '4', '4', '4'],
        'ZSH' => ['0', '4', '4', '4'],
        'ZZS' => ['0', '4', '4', '4'],
        // Cyrillic alphabet
        'А' => ['1', '0', '', ''],
        'Б' => ['0', '7', '7', '7'],
        'В' => ['0', '7', '7', '7'],
        'Г' => ['0', '5', '5', '5'],
        'Д' => ['0', '3', '3', '3'],
        'ДЗ' => ['0', '4', '4', '4'],
        'Е' => ['1', '0', '', ''],
        'Ё' => ['1', '0', '', ''],
        'Ж' => ['0', '4', '4', '4'],
        'З' => ['0', '4', '4', '4'],
        'И' => ['1', '0', '', ''],
        'Й' => ['1', '1', '', '', '4', '4', '4'],
        'К' => ['0', '5', '5', '5'],
        'Л' => ['0', '8', '8', '8'],
        'М' => ['0', '6', '6', '6'],
        'Н' => ['0', '6', '6', '6'],
        'О' => ['1', '0', '', ''],
        'П' => ['0', '7', '7', '7'],
        'Р' => ['0', '9', '9', '9'],
        'РЖ' => ['0', '4', '4', '4'],
        'С' => ['0', '4', '4', '4'],
        'Т' => ['0', '3', '3', '3'],
        'У' => ['1', '0', '', ''],
        'Ф' => ['0', '7', '7', '7'],
        'Х' => ['0', '5', '5', '5'],
        'Ц' => ['0', '4', '4', '4'],
        'Ч' => ['0', '4', '4', '4'],
        'Ш' => ['0', '4', '4', '4'],
        'Щ' => ['0', '2', '4', '4'],
        'Ъ' => ['0', '', '', ''],
        'Ы' => ['0', '1', '', ''],
        'Ь' => ['0', '', '', ''],
        'Э' => ['1', '0', '', ''],
        'Ю' => ['0', '1', '', ''],
        'Я' => ['0', '1', '', ''],
        // Greek alphabet
        'Α' => ['1', '0', '', ''],
        'Ά' => ['1', '0', '', ''],
        'ΑΙ' => ['1', '0', '1', ''],
        'ΑΥ' => ['1', '0', '1', ''],
        'Β' => ['0', '7', '7', '7'],
        'Γ' => ['0', '5', '5', '5'],
        'Δ' => ['0', '3', '3', '3'],
        'Ε' => ['1', '0', '', ''],
        'Έ' => ['1', '0', '', ''],
        'ΕΙ' => ['1', '0', '1', ''],
        'ΕΥ' => ['1', '1', '1', ''],
        'Ζ' => ['0', '4', '4', '4'],
        'Η' => ['1', '0', '', ''],
        'Ή' => ['1', '0', '', ''],
        'Θ' => ['0', '3', '3', '3'],
        'Ι' => ['1', '0', '', ''],
        'Ί' => ['1', '0', '', ''],
        'Ϊ' => ['1', '0', '', ''],
        'ΐ' => ['1', '0', '', ''],
        'Κ' => ['0', '5', '5', '5'],
        'Λ' => ['0', '8', '8', '8'],
        'Μ' => ['0', '6', '6', '6'],
        'ΜΠ' => ['0', '7', '7', '7'],
        'Ν' => ['0', '6', '6', '6'],
        'ΝΤ' => ['0', '3', '3', '3'],
        'Ξ' => ['0', '5', '54', '54'],
        'Ο' => ['1', '0', '', ''],
        'Ό' => ['1', '0', '', ''],
        'ΟΙ' => ['1', '0', '1', ''],
        'ΟΥ' => ['1', '0', '1', ''],
        'Π' => ['0', '7', '7', '7'],
        'Ρ' => ['0', '9', '9', '9'],
        'Σ' => ['0', '4', '4', '4'],
        'ς' => ['0', '', '', '4'],
        'Τ' => ['0', '3', '3', '3'],
        'ΤΖ' => ['0', '4', '4', '4'],
        'ΤΣ' => ['0', '4', '4', '4'],
        'Υ' => ['1', '1', '', ''],
        'Ύ' => ['1', '1', '', ''],
        'Ϋ' => ['1', '1', '', ''],
        'ΰ' => ['1', '1', '', ''],
        'ΥΚ' => ['1', '5', '5', '5'],
        'ΥΥ' => ['1', '65', '65', '65'],
        'Φ' => ['0', '7', '7', '7'],
        'Χ' => ['0', '5', '5', '5'],
        'Ψ' => ['0', '7', '7', '7'],
        'Ω' => ['1', '0', '', ''],
        'Ώ' => ['1', '0', '', ''],
        // Hebrew alphabet
        'א' => ['1', '0', '', ''],
        'או' => ['1', '0', '7', ''],
        'אג' => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
        'בב' => ['0', '7', '7', '7', '77', '77', '77'],
        'ב' => ['0', '7', '7', '7'],
        'גג' => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
        'גד' => ['0', '43', '43', '43', '53', '53', '53'],
        'גה' => ['0', '45', '45', '45', '55', '55', '55'],
        'גז' => ['0', '44', '44', '44', '45', '45', '45'],
        'גח' => ['0', '45', '45', '45', '55', '55', '55'],
        'גכ' => ['0', '45', '45', '45', '55', '55', '55'],
        'גך' => ['0', '45', '45', '45', '55', '55', '55'],
        'גצ' => ['0', '44', '44', '44', '45', '45', '45'],
        'גץ' => ['0', '44', '44', '44', '45', '45', '45'],
        'גק' => ['0', '45', '45', '45', '54', '54', '54'],
        'גש' => ['0', '44', '44', '44', '54', '54', '54'],
        'גת' => ['0', '43', '43', '43', '53', '53', '53'],
        'ג' => ['0', '4', '4', '4', '5', '5', '5'],
        'דז' => ['0', '4', '4', '4'],
        'דד' => ['0', '3', '3', '3', '33', '33', '33'],
        'דט' => ['0', '33', '33', '33'],
        'דש' => ['0', '4', '4', '4'],
        'דצ' => ['0', '4', '4', '4'],
        'דץ' => ['0', '4', '4', '4'],
        'ד' => ['0', '3', '3', '3'],
        'הג' => ['0', '54', '54', '54', '55', '55', '55'],
        'הכ' => ['0', '55', '55', '55'],
        'הח' => ['0', '55', '55', '55'],
        'הק' => ['0', '55', '55', '55', '5', '5', '5'],
        'הה' => ['0', '5', '5', '', '55', '55', ''],
        'ה' => ['0', '5', '5', ''],
        'וי' => ['1', '', '', '', '7', '7', '7'],
        'ו' => ['1', '7', '7', '7', '7', '', ''],
        'וו' => ['1', '7', '7', '7', '7', '', ''],
        'וופ' => ['1', '7', '7', '7', '77', '77', '77'],
        'זש' => ['0', '4', '4', '4', '44', '44', '44'],
        'זדז' => ['0', '2', '4', '4'],
        'ז' => ['0', '4', '4', '4'],
        'זג' => ['0', '44', '44', '44', '45', '45', '45'],
        'זז' => ['0', '4', '4', '4', '44', '44', '44'],
        'זס' => ['0', '44', '44', '44'],
        'זצ' => ['0', '44', '44', '44'],
        'זץ' => ['0', '44', '44', '44'],
        'חג' => ['0', '54', '54', '54', '53', '53', '53'],
        'חח' => ['0', '5', '5', '5', '55', '55', '55'],
        'חק' => ['0', '55', '55', '55', '5', '5', '5'],
        'חכ' => ['0', '45', '45', '45', '55', '55', '55'],
        'חס' => ['0', '5', '54', '54'],
        'חש' => ['0', '5', '54', '54'],
        'ח' => ['0', '5', '5', '5'],
        'טש' => ['0', '4', '4', '4'],
        'טד' => ['0', '33', '33', '33'],
        'טי' => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
        'טת' => ['0', '33', '33', '33'],
        'טט' => ['0', '3', '3', '3', '33', '33', '33'],
        'ט' => ['0', '3', '3', '3'],
        'י' => ['1', '1', '', ''],
        'יא' => ['1', '1', '', '', '1', '1', '1'],
        'כג' => ['0', '55', '55', '55', '54', '54', '54'],
        'כש' => ['0', '5', '54', '54'],
        'כס' => ['0', '5', '54', '54'],
        'ככ' => ['0', '5', '5', '5', '55', '55', '55'],
        'כך' => ['0', '5', '5', '5', '55', '55', '55'],
        'כ' => ['0', '5', '5', '5'],
        'כח' => ['0', '55', '55', '55', '5', '5', '5'],
        'ך' => ['0', '', '5', '5'],
        'ל' => ['0', '8', '8', '8'],
        'לל' => ['0', '88', '88', '88', '8', '8', '8'],
        'מנ' => ['0', '66', '66', '66'],
        'מן' => ['0', '66', '66', '66'],
        'ממ' => ['0', '6', '6', '6', '66', '66', '66'],
        'מם' => ['0', '6', '6', '6', '66', '66', '66'],
        'מ' => ['0', '6', '6', '6'],
        'ם' => ['0', '', '6', '6'],
        'נמ' => ['0', '66', '66', '66'],
        'נם' => ['0', '66', '66', '66'],
        'ננ' => ['0', '6', '6', '6', '66', '66', '66'],
        'נן' => ['0', '6', '6', '6', '66', '66', '66'],
        'נ' => ['0', '6', '6', '6'],
        'ן' => ['0', '', '6', '6'],
        'סתש' => ['0', '2', '4', '4'],
        'סתז' => ['0', '2', '4', '4'],
        'סטז' => ['0', '2', '4', '4'],
        'סטש' => ['0', '2', '4', '4'],
        'סצד' => ['0', '2', '4', '4'],
        'סט' => ['0', '2', '4', '4', '43', '43', '43'],
        'סת' => ['0', '2', '4', '4', '43', '43', '43'],
        'סג' => ['0', '44', '44', '44', '4', '4', '4'],
        'סס' => ['0', '4', '4', '4', '44', '44', '44'],
        'סצ' => ['0', '44', '44', '44'],
        'סץ' => ['0', '44', '44', '44'],
        'סז' => ['0', '44', '44', '44'],
        'סש' => ['0', '44', '44', '44'],
        'ס' => ['0', '4', '4', '4'],
        'ע' => ['1', '0', '', ''],
        'פב' => ['0', '7', '7', '7', '77', '77', '77'],
        'פוו' => ['0', '7', '7', '7', '77', '77', '77'],
        'פפ' => ['0', '7', '7', '7', '77', '77', '77'],
        'פף' => ['0', '7', '7', '7', '77', '77', '77'],
        'פ' => ['0', '7', '7', '7'],
        'ף' => ['0', '', '7', '7'],
        'צג' => ['0', '44', '44', '44', '45', '45', '45'],
        'צז' => ['0', '44', '44', '44'],
        'צס' => ['0', '44', '44', '44'],
        'צצ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
        'צץ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
        'צש' => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
        'צ' => ['0', '4', '4', '4', '5', '5', '5'],
        'ץ' => ['0', '', '4', '4'],
        'קה' => ['0', '55', '55', '5'],
        'קס' => ['0', '5', '54', '54'],
        'קש' => ['0', '5', '54', '54'],
        'קק' => ['0', '5', '5', '5', '55', '55', '55'],
        'קח' => ['0', '55', '55', '55'],
        'קכ' => ['0', '55', '55', '55'],
        'קך' => ['0', '55', '55', '55'],
        'קג' => ['0', '55', '55', '55', '54', '54', '54'],
        'ק' => ['0', '5', '5', '5'],
        'רר' => ['0', '99', '99', '99', '9', '9', '9'],
        'ר' => ['0', '9', '9', '9'],
        'שטז' => ['0', '2', '4', '4'],
        'שתש' => ['0', '2', '4', '4'],
        'שתז' => ['0', '2', '4', '4'],
        'שטש' => ['0', '2', '4', '4'],
        'שד' => ['0', '2', '43', '43'],
        'שז' => ['0', '44', '44', '44'],
        'שס' => ['0', '44', '44', '44'],
        'שת' => ['0', '2', '43', '43'],
        'שג' => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
        'שט' => ['0', '2', '43', '43', '44', '44', '44'],
        'שצ' => ['0', '44', '44', '44', '45', '45', '45'],
        'שץ' => ['0', '44', '', '44', '45', '', '45'],
        'שש' => ['0', '4', '4', '4', '44', '44', '44'],
        'ש' => ['0', '4', '4', '4'],
        'תג' => ['0', '34', '34', '34'],
        'תז' => ['0', '34', '34', '34'],
        'תש' => ['0', '4', '4', '4'],
        'תת' => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
        'ת' => ['0', '3', '3', '3', '4', '4', '4'],
        // Arabic alphabet
        'ا' => ['1', '0', '', ''],
        'ب' => ['0', '7', '7', '7'],
        'ت' => ['0', '3', '3', '3'],
        'ث' => ['0', '3', '3', '3'],
        'ج' => ['0', '4', '4', '4'],
        'ح' => ['0', '5', '5', '5'],
        'خ' => ['0', '5', '5', '5'],
        'د' => ['0', '3', '3', '3'],
        'ذ' => ['0', '3', '3', '3'],
        'ر' => ['0', '9', '9', '9'],
        'ز' => ['0', '4', '4', '4'],
        'س' => ['0', '4', '4', '4'],
        'ش' => ['0', '4', '4', '4'],
        'ص' => ['0', '4', '4', '4'],
        'ض' => ['0', '3', '3', '3'],
        'ط' => ['0', '3', '3', '3'],
        'ظ' => ['0', '4', '4', '4'],
        'ع' => ['1', '0', '', ''],
        'غ' => ['0', '0', '', ''],
        'ف' => ['0', '7', '7', '7'],
        'ق' => ['0', '5', '5', '5'],
        'ك' => ['0', '5', '5', '5'],
        'ل' => ['0', '8', '8', '8'],
        'لا' => ['0', '8', '8', '8'],
        'م' => ['0', '6', '6', '6'],
        'ن' => ['0', '6', '6', '6'],
        'هن' => ['0', '66', '66', '66'],
        'ه' => ['0', '5', '5', ''],
        'و' => ['1', '', '', '', '7', '', ''],
        'ي' => ['0', '1', '', ''],
        'آ' => ['0', '1', '', ''],
        'ة' => ['0', '', '', '3'],
        'ی' => ['0', '1', '', ''],
        'ى' => ['1', '1', '', ''],
    ];

    /**
     * Find the longest key of the DM sound table that starts at a given
     * byte offset of a name.
     *
     * @param string $name The (already transformed) name.
     * @param int    $pos  Byte offset into $name.
     *
     * @return string The matching table key, or an empty string if nothing matches.
     */
    private static function dmTableKeyAt($name, $pos)
    {
        // Start with the longest possible chunk, and shorten it one byte at a time.
        $chunk = (string) substr($name, $pos, self::MAXCHAR);
        while ($chunk !== '' && !isset(self::$dmsounds[$chunk])) {
            $chunk = (string) substr($chunk, 0, -1);
        }

        return $chunk;
    }

    /**
     * Convert a list of accumulated sound values into a six-digit DM code.
     *
     * @param string[] $entry Sound values, including any "!" markers.
     *
     * @return string The zero-padded code, or an empty string when there is
     *                nothing to report.
     */
    private static function dmCodeFromEntry(array $entry)
    {
        $digits = str_replace('!', '', implode('', $entry));

        // Only return codes from recognisable sounds.
        // A lone "0" is discarded as well as an empty string: this is what the
        // previous truthiness test ("if ($digits)") did, made explicit.
        if ($digits === '' || $digits === '0') {
            return '';
        }

        return substr($digits . '000000', 0, 6);
    }

    /**
     * Calculate the Daitch-Mokotoff soundex for a word.
     *
     * The word is upper-cased, passed through the name transformation rules,
     * and then consumed from left to right using the longest matching key of
     * the sound table. Table entries with several triplets create branches,
     * so a single word can produce several codes.
     *
     * @param string $name
     *
     * @return string[] List of possible DM codes for the word.
     */
    private static function daitchMokotoffWord($name)
    {
        // Apply special transformation rules to the input string
        $name = I18N::strtoupper($name);
        foreach (self::$transformNameTable as $transformRule) {
            $name = str_replace($transformRule[0], $transformRule[1], $name);
        }

        // Initialize
        $name_script = I18N::textScript($name);
        $noVowels    = ($name_script == 'Hebr' || $name_script == 'Arab');

        $lastPos       = strlen($name) - 1; // Byte offsets - the table is keyed by bytes
        $currPos       = 0;
        $state         = 1;       // 1: start of input string, 2: before vowel, 3: other
        $result        = [];      // accumulate complete 6-digit D-M codes here
        $partialResult = [['!']]; // accumulate incomplete D-M codes here ('!' stops "duplicate sound" check)

        // Loop through the input string.
        // Stop when the string is exhausted or when no more partial results remain
        while (count($partialResult) !== 0 && $currPos <= $lastPos) {
            // Find the DM coding table entry for the chunk at the current position
            $thisEntry = self::dmTableKeyAt($name, $currPos);
            if ($thisEntry === '') {
                $currPos++; // Not in table: advance pointer to next byte
                continue;   // and try again
            }

            $soundTableEntry = self::$dmsounds[$thisEntry];
            $workingResult   = $partialResult;
            $partialResult   = [];
            $currPos += strlen($thisEntry);

            // Not at beginning of input string
            if ($state != 1) {
                // Determine whether the next chunk is a vowel
                $nextEntry = $currPos <= $lastPos ? self::dmTableKeyAt($name, $currPos) : '';

                if ($nextEntry !== '' && self::$dmsounds[$nextEntry][0] != '0') {
                    // Next chunk is a vowel
                    $state = 2;
                } else {
                    // Next chunk is not a vowel, is not in the table, or there is no next chunk
                    $state = 3;
                }
            }

            // Visit the same basic state in each triplet of the table entry
            while ($state < count($soundTableEntry)) {
                $sound = $soundTableEntry[$state];

                if ($sound === '') {
                    // empty means 'ignore this sound in this state'
                    foreach ($workingResult as $workingEntry) {
                        $workingEntry[count($workingEntry) - 1] .= '!'; // Prevent false 'doubles'
                        $partialResult[] = $workingEntry;
                    }
                } else {
                    foreach ($workingResult as $workingEntry) {
                        // A sound that duplicates the previous sound is dropped,
                        // except for Hebrew and Arabic, where both occurrences are kept.
                        if ($sound !== $workingEntry[count($workingEntry) - 1] || $noVowels) {
                            $workingEntry[] = $sound;
                        }

                        if (count($workingEntry) < 7) {
                            $partialResult[] = $workingEntry;
                        } else {
                            // This is the 6th code in the sequence
                            // We're looking for 7 entries because the first is '!' and doesn't count
                            $code = self::dmCodeFromEntry($workingEntry);
                            if ($code !== '') {
                                $result[] = $code;
                            }
                        }
                    }
                }

                $state += 3; // Advance to next triplet while keeping the same basic state
            }
        }

        // Zero-fill and copy all remaining partial results
        foreach ($partialResult as $workingEntry) {
            $code = self::dmCodeFromEntry($workingEntry);
            if ($code !== '') {
                $result[] = $code;
            }
        }

        return $result;
    }
}