<?php
/**
 * webtrees: online genealogy
 * Copyright (C) 2017 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
namespace Fisharebest\Webtrees;

/**
 * Phonetic matching of strings.
 *
 * Codes for a text are stored as a single string of fixed-length codes
 * separated by ":" (4 characters each for Russell, 6 digits each for
 * Daitch–Mokotoff).
 */
class Soundex {
    /**
     * Which algorithms are supported.
     *
     * @return string[] Map of algorithm identifier => translated display name.
     */
    public static function getAlgorithms() {
        return [
            'std' => /* I18N: http://en.wikipedia.org/wiki/Soundex */ I18N::translate('Russell'),
            'dm'  => /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */ I18N::translate('Daitch-Mokotoff'),
        ];
    }

    /**
     * Is there a match between two soundex codes?
     *
     * Each ":"-separated code of $soundex1 is searched for (as a substring)
     * in $soundex2; the first hit is a match.
     *
     * @param string $soundex1 One or more codes, separated by ":".
     * @param string $soundex2 One or more codes, separated by ":".
     *
     * @return bool True if any code of $soundex1 occurs in $soundex2;
     *              false if either argument is empty.
     */
    public static function compare($soundex1, $soundex2) {
        if ($soundex1 && $soundex2) {
            foreach (explode(':', $soundex1) as $code) {
                if (strpos($soundex2, $code) !== false) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Generate Russell soundex codes for a given text.
     *
     * One code is generated per whitespace-separated word, plus one for all
     * the words joined together when there is more than one word.
     *
     * @param string $text
     *
     * @return string Unique codes joined with ":", or '' if no code was generated.
     */
    public static function russell($text) {
        $words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
        $soundex_array = [];
        foreach ($words as $word) {
            $soundex = soundex($word);
            // Only return codes from recognisable sounds
            if ($soundex !== '0000') {
                $soundex_array[] = $soundex;
            }
        }
        // Combine words, e.g. “New York” as “Newyork”
        if (count($words) > 1) {
            // Join the already-split words: strtr($text, ' ', '') returns $text
            // unchanged, because an empty replacement set is ignored.
            $soundex = soundex(implode('', $words));
            // Apply the same "recognisable sound" filter as for single words
            if ($soundex !== '0000') {
                $soundex_array[] = $soundex;
            }
        }
        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

        return implode(':', $soundex_array);
    }

    /**
     * Generate Daitch–Mokotoff soundex codes for a given text.
     *
     * Codes are generated for each whitespace-separated word, plus the codes
     * for all the words joined together when there is more than one word.
     *
     * @param string $text
     *
     * @return string Unique codes joined with ":", or '' if no code was generated.
     */
    public static function daitchMokotoff($text) {
        $words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
        $soundex_array = [];
        foreach ($words as $word) {
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
        }
        // Combine words, e.g. “New York” as “Newyork”
        if (count($words) > 1) {
            // Join the already-split words: strtr($text, ' ', '') returns $text
            // unchanged, which left the spaces in and made the "is the next
            // chunk a vowel?" look-ahead fail at every word boundary.
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(implode('', $words)));
        }
        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);

        return implode(':', $soundex_array);
    }

    // Determine the Daitch–Mokotoff Soundex code for a word
    // Original implementation by Gerry Kroll, and analysis by Meliza Amity

    // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
    const MAXCHAR = 7;

    /**
     * Name transformation arrays.
     * Used to transform the Name string to simplify the "sounds like" table.
     * This is especially useful in Hebrew.
     *
     * Each array entry defines the "from" and "to" arguments of a
     * str_replace($from, $to, $text) call, applied in order by
     * daitchMokotoffWord(), to achieve the desired transformations.
     *
     * Note about the use of "\x01":
     * This code, which can’t legitimately occur in the kind of text we're dealing with,
     * is used as a place-holder so that conditional string replacements can be done.
     *
     * NOTE(review): the two rules ending in "$" look like they were written as
     * end-of-string regex anchors, but str_replace() treats "$" as a literal
     * character, so they only match text containing an actual "$". Confirm the
     * intended behaviour before changing it, as it would alter generated codes.
     *
     * @var string[][]
     */
    private static $transformNameTable = [
        // Force Yiddish ligatures to be treated as separate letters
        ['װ', 'וו'],
        ['ײ', 'יי'],
        ['ױ', 'וי'],
        ['בו', 'בע'],
        ['פו', 'פע'],
        ['ומ', 'עמ'],
        ['ום', 'עם'],
        ['ונ', 'ענ'],
        ['ון', 'ען'],
        ['וו', 'ב'],
        ["\x01", ''],
        ['ייה$', "\x01ה"],
        ['ייע$', "\x01ע"],
        ['יי', 'ע'],
        ["\x01", 'יי'],
    ];

    /**
     * The DM sound coding table is organized this way:
     * key: a variable-length string that corresponds to the UTF-8 character sequence
     * represented by the table entry. Currently, that string can be up to 7
     * bytes long. This maximum length is defined by self::MAXCHAR.
     *
     * value: an array as follows:
     * [0]: zero if not a vowel
     * [1]: sound value when this string is at the beginning of the word
     * [2]: sound value when this string is followed by a vowel
     * [3]: sound value for other cases
     * [1],[2],[3] can be repeated several times to create branches in the code
     * an empty sound value means "ignore in this state"
     *
     * Keys never contain whitespace: input is split on whitespace before lookup,
     * so a key with a leading or trailing space could never match.
     *
     * NOTE(review): several three-letter Hebrew keys below contain U+FFFD
     * replacement characters. They look like the result of an encoding or
     * extraction error and cannot match real input; restore them from a clean
     * copy of this table.
     *
     * @var string[][]
     */
    private static $dmsounds = [
        'A' => ['1', '0', '', ''],
        'À' => ['1', '0', '', ''],
        'Á' => ['1', '0', '', ''],
        'Â' => ['1', '0', '', ''],
        'Ã' => ['1', '0', '', ''],
        'Ä' => ['1', '0', '1', '', '0', '', ''],
        'Å' => ['1', '0', '', ''],
        'Ă' => ['1', '0', '', ''],
        'Ą' => ['1', '', '', '', '', '', '6'],
        'Ạ' => ['1', '0', '', ''],
        'Ả' => ['1', '0', '', ''],
        'Ấ' => ['1', '0', '', ''],
        'Ầ' => ['1', '0', '', ''],
        'Ẩ' => ['1', '0', '', ''],
        'Ẫ' => ['1', '0', '', ''],
        'Ậ' => ['1', '0', '', ''],
        'Ắ' => ['1', '0', '', ''],
        'Ằ' => ['1', '0', '', ''],
        'Ẳ' => ['1', '0', '', ''],
        'Ẵ' => ['1', '0', '', ''],
        'Ặ' => ['1', '0', '', ''],
        'AE' => ['1', '0', '1', ''],
        'Æ' => ['1', '0', '1', ''],
        'AI' => ['1', '0', '1', ''],
        'AJ' => ['1', '0', '1', ''],
        'AU' => ['1', '0', '7', ''],
        'AV' => ['1', '0', '7', '', '7', '7', '7'],
        'ÄU' => ['1', '0', '1', ''],
        'AY' => ['1', '0', '1', ''],
        'B' => ['0', '7', '7', '7'],
        'C' => ['0', '5', '5', '5', '34', '4', '4'],
        'Ć' => ['0', '4', '4', '4'],
        'Č' => ['0', '4', '4', '4'],
        'Ç' => ['0', '4', '4', '4'],
        'CH' => ['0', '5', '5', '5', '34', '4', '4'],
        'CHS' => ['0', '5', '54', '54'],
        'CK' => ['0', '5', '5', '5', '45', '45', '45'],
        'CCS' => ['0', '4', '4', '4'],
        'CS' => ['0', '4', '4', '4'],
        'CSZ' => ['0', '4', '4', '4'],
        'CZ' => ['0', '4', '4', '4'],
        'CZS' => ['0', '4', '4', '4'],
        'D' => ['0', '3', '3', '3'],
        'Ď' => ['0', '3', '3', '3'],
        'Đ' => ['0', '3', '3', '3'],
        'DRS' => ['0', '4', '4', '4'],
        'DRZ' => ['0', '4', '4', '4'],
        'DS' => ['0', '4', '4', '4'],
        'DSH' => ['0', '4', '4', '4'],
        'DSZ' => ['0', '4', '4', '4'],
        'DT' => ['0', '3', '3', '3'],
        'DDZ' => ['0', '4', '4', '4'],
        'DDZS' => ['0', '4', '4', '4'],
        'DZ' => ['0', '4', '4', '4'],
        'DŹ' => ['0', '4', '4', '4'],
        'DŻ' => ['0', '4', '4', '4'],
        'DZH' => ['0', '4', '4', '4'],
        'DZS' => ['0', '4', '4', '4'],
        'E' => ['1', '0', '', ''],
        'È' => ['1', '0', '', ''],
        'É' => ['1', '0', '', ''],
        'Ê' => ['1', '0', '', ''],
        'Ë' => ['1', '0', '', ''],
        'Ĕ' => ['1', '0', '', ''],
        'Ė' => ['1', '0', '', ''],
        'Ę' => ['1', '', '', '6', '', '', ''],
        'Ẹ' => ['1', '0', '', ''],
        'Ẻ' => ['1', '0', '', ''],
        'Ẽ' => ['1', '0', '', ''],
        'Ế' => ['1', '0', '', ''],
        'Ề' => ['1', '0', '', ''],
        'Ể' => ['1', '0', '', ''],
        'Ễ' => ['1', '0', '', ''],
        'Ệ' => ['1', '0', '', ''],
        'EAU' => ['1', '0', '', ''],
        'EI' => ['1', '0', '1', ''],
        'EJ' => ['1', '0', '1', ''],
        'EU' => ['1', '1', '1', ''],
        'EY' => ['1', '0', '1', ''],
        'F' => ['0', '7', '7', '7'],
        'FB' => ['0', '7', '7', '7'],
        'G' => ['0', '5', '5', '5', '34', '4', '4'],
        'Ğ' => ['0', '', '', ''],
        'GGY' => ['0', '5', '5', '5'],
        'GY' => ['0', '5', '5', '5'],
        'H' => ['0', '5', '5', '', '5', '5', '5'],
        'I' => ['1', '0', '', ''],
        'Ì' => ['1', '0', '', ''],
        'Í' => ['1', '0', '', ''],
        'Î' => ['1', '0', '', ''],
        'Ï' => ['1', '0', '', ''],
        'Ĩ' => ['1', '0', '', ''],
        'Į' => ['1', '0', '', ''],
        'İ' => ['1', '0', '', ''],
        'Ỉ' => ['1', '0', '', ''],
        'Ị' => ['1', '0', '', ''],
        'IA' => ['1', '1', '', ''],
        'IE' => ['1', '1', '', ''],
        'IO' => ['1', '1', '', ''],
        'IU' => ['1', '1', '', ''],
        'J' => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
        'K' => ['0', '5', '5', '5'],
        'KH' => ['0', '5', '5', '5'],
        'KS' => ['0', '5', '54', '54'],
        'L' => ['0', '8', '8', '8'],
        'Ľ' => ['0', '8', '8', '8'],
        'Ĺ' => ['0', '8', '8', '8'],
        'Ł' => ['0', '7', '7', '7', '8', '8', '8'],
        'LL' => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
        'LLY' => ['0', '8', '8', '8', '1', '8', '8'],
        'LY' => ['0', '8', '8', '8', '1', '8', '8'],
        'M' => ['0', '6', '6', '6'],
        'MĔ' => ['0', '66', '66', '66'],
        'MN' => ['0', '66', '66', '66'],
        'N' => ['0', '6', '6', '6'],
        'Ń' => ['0', '6', '6', '6'],
        'Ň' => ['0', '6', '6', '6'],
        'Ñ' => ['0', '6', '6', '6'],
        'NM' => ['0', '66', '66', '66'],
        'O' => ['1', '0', '', ''],
        'Ò' => ['1', '0', '', ''],
        'Ó' => ['1', '0', '', ''],
        'Ô' => ['1', '0', '', ''],
        'Õ' => ['1', '0', '', ''],
        'Ö' => ['1', '0', '', ''],
        'Ø' => ['1', '0', '', ''],
        'Ő' => ['1', '0', '', ''],
        'Œ' => ['1', '0', '', ''],
        'Ơ' => ['1', '0', '', ''],
        'Ọ' => ['1', '0', '', ''],
        'Ỏ' => ['1', '0', '', ''],
        'Ố' => ['1', '0', '', ''],
        'Ồ' => ['1', '0', '', ''],
        'Ổ' => ['1', '0', '', ''],
        'Ỗ' => ['1', '0', '', ''],
        'Ộ' => ['1', '0', '', ''],
        'Ớ' => ['1', '0', '', ''],
        'Ờ' => ['1', '0', '', ''],
        'Ở' => ['1', '0', '', ''],
        'Ỡ' => ['1', '0', '', ''],
        'Ợ' => ['1', '0', '', ''],
        'OE' => ['1', '0', '', ''],
        'OI' => ['1', '0', '1', ''],
        'OJ' => ['1', '0', '1', ''],
        'OU' => ['1', '0', '', ''],
        'OY' => ['1', '0', '1', ''],
        'P' => ['0', '7', '7', '7'],
        'PF' => ['0', '7', '7', '7'],
        'PH' => ['0', '7', '7', '7'],
        'Q' => ['0', '5', '5', '5'],
        'R' => ['0', '9', '9', '9'],
        'Ř' => ['0', '4', '4', '4'],
        'RS' => ['0', '4', '4', '4', '94', '94', '94'],
        'RZ' => ['0', '4', '4', '4', '94', '94', '94'],
        'S' => ['0', '4', '4', '4'],
        'Ś' => ['0', '4', '4', '4'],
        'Š' => ['0', '4', '4', '4'],
        'Ş' => ['0',  '4', '4', '4'],
        'SC' => ['0', '2', '4', '4'],
        'ŠČ' => ['0', '2', '4', '4'],
        'SCH' => ['0', '4', '4', '4'],
        'SCHD' => ['0', '2', '43', '43'],
        'SCHT' => ['0', '2', '43', '43'],
        'SCHTCH' => ['0', '2', '4', '4'],
        'SCHTSCH' => ['0', '2', '4', '4'],
        'SCHTSH' => ['0', '2', '4', '4'],
        'SD' => ['0', '2', '43', '43'],
        'SH' => ['0', '4', '4', '4'],
        'SHCH' => ['0', '2', '4', '4'],
        'SHD' => ['0', '2', '43', '43'],
        'SHT' => ['0', '2', '43', '43'],
        'SHTCH' => ['0', '2', '4', '4'],
        'SHTSH' => ['0', '2', '4', '4'],
        'ß' => ['0', '', '4', '4'],
        'ST' => ['0', '2', '43', '43'],
        'STCH' => ['0', '2', '4', '4'],
        'STRS' => ['0', '2', '4', '4'],
        'STRZ' => ['0', '2', '4', '4'],
        'STSCH' => ['0', '2', '4', '4'],
        'STSH' => ['0', '2', '4', '4'],
        'SSZ' => ['0', '4', '4', '4'],
        'SZ' => ['0', '4', '4', '4'],
        'SZCS' => ['0', '2', '4', '4'],
        'SZCZ' => ['0', '2', '4', '4'],
        'SZD' => ['0', '2', '43', '43'],
        'SZT' => ['0', '2', '43', '43'],
        'T' => ['0', '3', '3', '3'],
        'Ť' => ['0', '3', '3', '3'],
        'Ţ' => ['0', '3', '3', '3', '4', '4', '4'],
        'TC' => ['0', '4', '4', '4'],
        'TCH' => ['0', '4', '4', '4'],
        'TH' => ['0', '3', '3', '3'],
        'TRS' => ['0', '4', '4', '4'],
        'TRZ' => ['0', '4', '4', '4'],
        'TS' => ['0', '4', '4', '4'],
        'TSCH' => ['0', '4', '4', '4'],
        'TSH' => ['0', '4', '4', '4'],
        'TSZ' => ['0', '4', '4', '4'],
        'TTCH' => ['0', '4', '4', '4'],
        'TTS' => ['0', '4', '4', '4'],
        'TTSCH' => ['0', '4', '4', '4'],
        'TTSZ' => ['0', '4', '4', '4'],
        'TTZ' => ['0', '4', '4', '4'],
        'TZ' => ['0', '4', '4', '4'],
        'TZS' => ['0', '4', '4', '4'],
        'U' => ['1', '0', '', ''],
        'Ù' => ['1', '0', '', ''],
        'Ú' => ['1', '0', '', ''],
        'Û' => ['1', '0', '', ''],
        'Ü' => ['1', '0', '', ''],
        'Ũ' => ['1', '0', '', ''],
        'Ū' => ['1', '0', '', ''],
        'Ů' => ['1', '0', '', ''],
        'Ű' => ['1', '0', '', ''],
        'Ų' => ['1', '0', '', ''],
        'Ư' => ['1', '0', '', ''],
        'Ụ' => ['1', '0', '', ''],
        'Ủ' => ['1', '0', '', ''],
        'Ứ' => ['1', '0', '', ''],
        'Ừ' => ['1', '0', '', ''],
        'Ử' => ['1', '0', '', ''],
        'Ữ' => ['1', '0', '', ''],
        'Ự' => ['1', '0', '', ''],
        'UE' => ['1', '0', '', ''],
        'UI' => ['1', '0', '1', ''],
        'UJ' => ['1', '0', '1', ''],
        'UY' => ['1', '0', '1', ''],
        'UW' => ['1', '0', '1', '', '0', '7', '7'],
        'V' => ['0', '7', '7', '7'],
        'W' => ['0', '7', '7', '7'],
        'X' => ['0', '5', '54', '54'],
        'Y' => ['1', '1', '', ''],
        'Ý' => ['1', '1', '', ''],
        'Ỳ' => ['1', '1', '', ''],
        'Ỵ' => ['1', '1', '', ''],
        'Ỷ' => ['1', '1', '', ''],
        'Ỹ' => ['1', '1', '', ''],
        'Z' => ['0', '4', '4', '4'],
        'Ź' => ['0', '4', '4', '4'],
        'Ż' => ['0', '4', '4', '4'],
        'Ž' => ['0', '4', '4', '4'],
        'ZD' => ['0', '2', '43', '43'],
        'ZDZ' => ['0', '2', '4', '4'],
        'ZDZH' => ['0', '2', '4', '4'],
        'ZH' => ['0', '4', '4', '4'],
        'ZHD' => ['0', '2', '43', '43'],
        'ZHDZH' => ['0', '2', '4', '4'],
        'ZS' => ['0', '4', '4', '4'],
        'ZSCH' => ['0', '4', '4', '4'],
        'ZSH' => ['0', '4', '4', '4'],
        'ZZS' => ['0', '4', '4', '4'],
        // Cyrillic alphabet
        'А' => ['1', '0', '', ''],
        'Б' => ['0', '7', '7', '7'],
        'В' => ['0', '7', '7', '7'],
        'Г' => ['0', '5', '5', '5'],
        'Д' => ['0', '3', '3', '3'],
        'ДЗ' => ['0', '4', '4', '4'],
        'Е' => ['1', '0', '', ''],
        'Ё' => ['1', '0', '', ''],
        'Ж' => ['0', '4', '4', '4'],
        'З' => ['0', '4', '4', '4'],
        'И' => ['1', '0', '', ''],
        'Й' => ['1', '1', '', '', '4', '4', '4'],
        'К' => ['0', '5', '5', '5'],
        'Л' => ['0', '8', '8', '8'],
        'М' => ['0', '6', '6', '6'],
        'Н' => ['0', '6', '6', '6'],
        'О' => ['1', '0', '', ''],
        'П' => ['0', '7', '7', '7'],
        'Р' => ['0', '9', '9', '9'],
        'РЖ' => ['0', '4', '4', '4'],
        'С' => ['0', '4', '4', '4'],
        'Т' => ['0', '3', '3', '3'],
        'У' => ['1', '0', '', ''],
        'Ф' => ['0', '7', '7', '7'],
        'Х' => ['0', '5', '5', '5'],
        'Ц' => ['0', '4', '4', '4'],
        'Ч' => ['0', '4', '4', '4'],
        'Ш' => ['0', '4', '4', '4'],
        'Щ' => ['0', '2', '4', '4'],
        'Ъ' => ['0', '', '', ''],
        'Ы' => ['0', '1', '', ''],
        'Ь' => ['0', '', '', ''],
        'Э' => ['1', '0', '', ''],
        'Ю' => ['0', '1', '', ''],
        'Я' => ['0', '1', '', ''],
        // Greek alphabet
        'Α' => ['1', '0', '', ''],
        'Ά' => ['1', '0', '', ''],
        'ΑΙ' => ['1', '0', '1', ''],
        'ΑΥ' => ['1', '0', '1', ''],
        'Β' => ['0', '7', '7', '7'],
        'Γ' => ['0', '5', '5', '5'],
        'Δ' => ['0', '3', '3', '3'],
        'Ε' => ['1', '0', '', ''],
        'Έ' => ['1', '0', '', ''],
        'ΕΙ' => ['1', '0', '1', ''],
        'ΕΥ' => ['1', '1', '1', ''],
        'Ζ' => ['0', '4', '4', '4'],
        'Η' => ['1', '0', '', ''],
        'Ή' => ['1', '0', '', ''],
        'Θ' => ['0', '3', '3', '3'],
        'Ι' => ['1', '0', '', ''],
        'Ί' => ['1', '0', '', ''],
        'Ϊ' => ['1', '0', '', ''],
        'ΐ' => ['1', '0', '', ''],
        'Κ' => ['0', '5', '5', '5'],
        'Λ' => ['0', '8', '8', '8'],
        'Μ' => ['0', '6', '6', '6'],
        'ΜΠ' => ['0', '7', '7', '7'],
        'Ν' => ['0', '6', '6', '6'],
        'ΝΤ' => ['0', '3', '3', '3'],
        'Ξ' => ['0', '5', '54', '54'],
        'Ο' => ['1', '0', '', ''],
        'Ό' => ['1', '0', '', ''],
        'ΟΙ' => ['1', '0', '1', ''],
        'ΟΥ' => ['1', '0', '1', ''],
        'Π' => ['0', '7', '7', '7'],
        'Ρ' => ['0', '9', '9', '9'],
        'Σ' => ['0', '4', '4', '4'],
        'ς' => ['0', '', '', '4'],
        'Τ' => ['0', '3', '3', '3'],
        'ΤΖ' => ['0', '4', '4', '4'],
        'ΤΣ' => ['0', '4', '4', '4'],
        'Υ' => ['1', '1', '', ''],
        'Ύ' => ['1', '1', '', ''],
        'Ϋ' => ['1', '1', '', ''],
        'ΰ' => ['1', '1', '', ''],
        'ΥΚ' => ['1', '5', '5', '5'],
        'ΥΥ' => ['1', '65', '65', '65'],
        'Φ' => ['0', '7', '7', '7'],
        'Χ' => ['0', '5', '5', '5'],
        'Ψ' => ['0', '7', '7', '7'],
        'Ω' => ['1', '0', '', ''],
        'Ώ' => ['1', '0', '', ''],
        // Hebrew alphabet
        'א' => ['1', '0', '', ''],
        'או' => ['1', '0', '7', ''],
        'אג' => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
        'בב' => ['0', '7', '7', '7', '77', '77', '77'],
        'ב' => ['0', '7', '7', '7'],
        'גג' => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
        'גד' => ['0', '43', '43', '43', '53', '53', '53'],
        'גה' => ['0', '45', '45', '45', '55', '55', '55'],
        'גז' => ['0', '44', '44', '44', '45', '45', '45'],
        'גח' => ['0', '45', '45', '45', '55', '55', '55'],
        'גכ' => ['0', '45', '45', '45', '55', '55', '55'],
        'גך' => ['0', '45', '45', '45', '55', '55', '55'],
        'גצ' => ['0', '44', '44', '44', '45', '45', '45'],
        'גץ' => ['0', '44', '44', '44', '45', '45', '45'],
        'גק' => ['0', '45', '45', '45', '54', '54', '54'],
        'גש' => ['0', '44', '44', '44', '54', '54', '54'],
        'גת' => ['0', '43', '43', '43', '53', '53', '53'],
        'ג' => ['0', '4', '4', '4', '5', '5', '5'],
        'דז' => ['0', '4', '4', '4'],
        'דד' => ['0', '3', '3', '3', '33', '33', '33'],
        'דט' => ['0', '33', '33', '33'],
        'דש' => ['0', '4', '4', '4'],
        'דצ' => ['0', '4', '4', '4'],
        'דץ' => ['0', '4', '4', '4'],
        'ד' => ['0', '3', '3', '3'],
        'הג' => ['0', '54', '54', '54', '55', '55', '55'],
        'הכ' => ['0', '55', '55', '55'],
        'הח' => ['0', '55', '55', '55'],
        'הק' => ['0', '55', '55', '55', '5', '5', '5'],
        'הה' => ['0', '5', '5', '', '55', '55', ''],
        'ה' => ['0', '5', '5', ''],
        'וי' => ['1', '', '', '', '7', '7', '7'],
        'ו' => ['1', '7', '7', '7', '7', '', ''],
        'וו' => ['1', '7', '7', '7', '7', '', ''],
        'וו� �' => ['1', '7', '7', '7', '77', '77', '77'],
        'זש' => ['0', '4', '4', '4', '44', '44', '44'],
        'זד� �' => ['0', '2', '4', '4'],
        'ז' => ['0', '4', '4', '4'],
        'זג' => ['0', '44', '44', '44', '45', '45', '45'],
        'זז' => ['0', '4', '4', '4', '44', '44', '44'],
        'זס' => ['0', '44', '44', '44'],
        'זצ' => ['0', '44', '44', '44'],
        'זץ' => ['0', '44', '44', '44'],
        'חג' => ['0', '54', '54', '54', '53', '53', '53'],
        'חח' => ['0', '5', '5', '5', '55', '55', '55'],
        'חק' => ['0', '55', '55', '55', '5', '5', '5'],
        'חכ' => ['0', '45', '45', '45', '55', '55', '55'],
        'חס' => ['0', '5', '54', '54'],
        'חש' => ['0', '5', '54', '54'],
        'ח' => ['0', '5', '5', '5'],
        'טש' => ['0', '4', '4', '4'],
        'טד' => ['0', '33', '33', '33'],
        'טי' => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
        'טת' => ['0', '33', '33', '33'],
        'טט' => ['0', '3', '3', '3', '33', '33', '33'],
        'ט' => ['0', '3', '3', '3'],
        'י' => ['1', '1', '', ''],
        'יא' => ['1', '1', '', '', '1', '1', '1'],
        'כג' => ['0', '55', '55', '55', '54', '54', '54'],
        'כש' => ['0', '5', '54', '54'],
        'כס' => ['0', '5', '54', '54'],
        'ככ' => ['0', '5', '5', '5', '55', '55', '55'],
        'כך' => ['0', '5', '5', '5', '55', '55', '55'],
        'כ' => ['0', '5', '5', '5'],
        'כח' => ['0', '55', '55', '55', '5', '5', '5'],
        'ך' => ['0', '', '5', '5'],
        'ל' => ['0', '8', '8', '8'],
        'לל' => ['0', '88', '88', '88', '8', '8', '8'],
        'מנ' => ['0', '66', '66', '66'],
        'מן' => ['0', '66', '66', '66'],
        'ממ' => ['0', '6', '6', '6', '66', '66', '66'],
        'מם' => ['0', '6', '6', '6', '66', '66', '66'],
        'מ' => ['0', '6', '6', '6'],
        'ם' => ['0', '', '6', '6'],
        'נמ' => ['0', '66', '66', '66'],
        'נם' => ['0', '66', '66', '66'],
        'ננ' => ['0', '6', '6', '6', '66', '66', '66'],
        'נן' => ['0', '6', '6', '6', '66', '66', '66'],
        'נ' => ['0', '6', '6', '6'],
        'ן' => ['0', '', '6', '6'],
        'סת� �' => ['0', '2', '4', '4'],
        'סת� �' => ['0', '2', '4', '4'],
        'סט� �' => ['0', '2', '4', '4'],
        'סט� �' => ['0', '2', '4', '4'],
        'סצ� �' => ['0', '2', '4', '4'],
        'סט' => ['0', '2', '4', '4', '43', '43', '43'],
        'סת' => ['0', '2', '4', '4', '43', '43', '43'],
        'סג' => ['0', '44', '44', '44', '4', '4', '4'],
        'סס' => ['0', '4', '4', '4', '44', '44', '44'],
        'סצ' => ['0', '44', '44', '44'],
        'סץ' => ['0', '44', '44', '44'],
        'סז' => ['0', '44', '44', '44'],
        'סש' => ['0', '44', '44', '44'],
        'ס' => ['0', '4', '4', '4'],
        'ע' => ['1', '0', '', ''],
        'פב' => ['0', '7', '7', '7', '77', '77', '77'],
        'פו� �' => ['0', '7', '7', '7', '77', '77', '77'],
        'פפ' => ['0', '7', '7', '7', '77', '77', '77'],
        'פף' => ['0', '7', '7', '7', '77', '77', '77'],
        'פ' => ['0', '7', '7', '7'],
        'ף' => ['0', '', '7', '7'],
        'צג' => ['0', '44', '44', '44', '45', '45', '45'],
        'צז' => ['0', '44', '44', '44'],
        'צס' => ['0', '44', '44', '44'],
        'צצ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
        'צץ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
        'צש' => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
        'צ' => ['0', '4', '4', '4', '5', '5', '5'],
        'ץ' => ['0', '', '4', '4'],
        'קה' => ['0', '55', '55', '5'],
        'קס' => ['0', '5', '54', '54'],
        'קש' => ['0', '5', '54', '54'],
        'קק' => ['0', '5', '5', '5', '55', '55', '55'],
        'קח' => ['0', '55', '55', '55'],
        'קכ' => ['0', '55', '55', '55'],
        'קך' => ['0', '55', '55', '55'],
        'קג' => ['0', '55', '55', '55', '54', '54', '54'],
        'ק' => ['0', '5', '5', '5'],
        'רר' => ['0', '99', '99', '99', '9', '9', '9'],
        'ר' => ['0', '9', '9', '9'],
        'שט� �' => ['0', '2', '4', '4'],
        'שת� �' => ['0', '2', '4', '4'],
        'שת� �' => ['0', '2', '4', '4'],
        'שט� �' => ['0', '2', '4', '4'],
        'שד' => ['0', '2', '43', '43'],
        'שז' => ['0', '44', '44', '44'],
        'שס' => ['0', '44', '44', '44'],
        'שת' => ['0', '2', '43', '43'],
        'שג' => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
        'שט' => ['0', '2', '43', '43', '44', '44', '44'],
        'שצ' => ['0', '44', '44', '44', '45', '45', '45'],
        'שץ' => ['0', '44', '', '44', '45', '', '45'],
        'שש' => ['0', '4', '4', '4', '44', '44', '44'],
        'ש' => ['0', '4', '4', '4'],
        'תג' => ['0', '34', '34', '34'],
        'תז' => ['0', '34', '34', '34'],
        'תש' => ['0', '4', '4', '4'],
        'תת' => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
        'ת' => ['0', '3', '3', '3', '4', '4', '4'],
        // Arabic alphabet
        'ا' => ['1', '0', '', ''],
        'ب' => ['0', '7', '7', '7'],
        'ت' => ['0', '3', '3', '3'],
        'ث' => ['0', '3', '3', '3'],
        'ج' => ['0', '4', '4', '4'],
        'ح' => ['0', '5', '5', '5'],
        'خ' => ['0', '5', '5', '5'],
        'د' => ['0', '3', '3', '3'],
        'ذ' => ['0', '3', '3', '3'],
        'ر' => ['0', '9', '9', '9'],
        'ز' => ['0', '4', '4', '4'],
        'س' => ['0', '4', '4', '4'],
        'ش' => ['0', '4', '4', '4'],
        'ص' => ['0', '4', '4', '4'],
        'ض' => ['0', '3', '3', '3'],
        'ط' => ['0', '3', '3', '3'],
        'ظ' => ['0', '4', '4', '4'],
        'ع' => ['1', '0', '', ''],
        'غ' => ['0', '0', '', ''],
        'ف' => ['0', '7', '7', '7'],
        'ق' => ['0', '5', '5', '5'],
        'ك' => ['0', '5', '5', '5'],
        'ل' => ['0', '8', '8', '8'],
        'لا' => ['0', '8', '8', '8'],
        'م' => ['0', '6', '6', '6'],
        'ن' => ['0', '6', '6', '6'],
        'هن' => ['0', '66', '66', '66'],
        'ه' => ['0', '5', '5', ''],
        'و' => ['1', '', '', '', '7', '', ''],
        'ي' => ['0', '1', '', ''],
        'آ' => ['0', '1', '', ''],
        'ة' => ['0', '', '', '3'],
        'ی' => ['0', '1', '', ''],
        'ى' => ['1', '1', '', ''],
    ];

    /**
     * Find the longest key of the DM sound table that starts at a given byte
     * offset of a name.
     *
     * Starts with a chunk of self::MAXCHAR bytes and drops one byte at a time
     * from the end until the chunk is a key of self::$dmsounds.
     *
     * @param string $name Name being encoded.
     * @param int    $pos  Byte offset (not character offset) into $name.
     *
     * @return string The matching key, or '' if no key starts at this offset.
     */
    private static function findSoundTableKey($name, $pos) {
        $chunk = substr($name, $pos, self::MAXCHAR); // Get maximum length chunk
        while ($chunk != '' && !isset(self::$dmsounds[$chunk])) {
            $chunk = substr($chunk, 0, -1); // Not in table: try a shorter chunk
        }

        return (string) $chunk;
    }

    /**
     * Calculate the Daitch-Mokotoff soundex for a word.
     *
     * The word is upper-cased, rewritten using self::$transformNameTable, and
     * then consumed chunk by chunk using the longest matching key of
     * self::$dmsounds. Table entries with more than one triplet of sound
     * values make the set of partial codes branch.
     *
     * @param string $name
     *
     * @return string[] List of possible DM codes for the word.
     */
    private static function daitchMokotoffWord($name) {
        // Apply special transformation rules to the input string
        $name = I18N::strtoupper($name);
        foreach (self::$transformNameTable as $transformRule) {
            $name = str_replace($transformRule[0], $transformRule[1], $name);
        }

        // Initialize
        $name_script = I18N::textScript($name);
        $noVowels    = ($name_script == 'Hebr' || $name_script == 'Arab');

        $lastPos       = strlen($name) - 1;
        $currPos       = 0;
        $state         = 1; // 1: start of input string, 2: before vowel, 3: other
        $result        = []; // accumulate complete 6-digit D-M codes here
        $partialResult = [['!']]; // incomplete D-M codes; the initial '!' stops the "duplicate sound" check

        // Loop through the input string.
        // Stop when the string is exhausted or when no more partial results remain
        while (count($partialResult) !== 0 && $currPos <= $lastPos) {
            // Find the DM coding table entry for the chunk at the current position
            $thisEntry = self::findSoundTableKey($name, $currPos);
            if ($thisEntry === '') {
                $currPos++; // Not in table: advance pointer to next byte
                continue; // and try again
            }

            $soundTableEntry = self::$dmsounds[$thisEntry];
            $workingResult   = $partialResult;
            $partialResult   = [];
            $currPos += strlen($thisEntry);

            // Not at beginning of input string
            if ($state != 1) {
                // Determine whether the next chunk is a vowel
                $nextEntry = $currPos <= $lastPos ? self::findSoundTableKey($name, $currPos) : '';
                if ($nextEntry != '' && self::$dmsounds[$nextEntry][0] != '0') {
                    // Next chunk is a vowel
                    $state = 2;
                } else {
                    // Next chunk is not a vowel, or there is no next chunk
                    $state = 3;
                }
            }

            // Each pass handles one triplet of the table entry; every triplet
            // after the first adds a branch to the set of partial results.
            while ($state < count($soundTableEntry)) {
                $sound = $soundTableEntry[$state];
                // empty means 'ignore this sound in this state'
                if ($sound == '') {
                    foreach ($workingResult as $workingEntry) {
                        $tempEntry = $workingEntry;
                        $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
                        $partialResult[] = $tempEntry;
                    }
                } else {
                    foreach ($workingResult as $workingEntry) {
                        // A sound that repeats the previous one is dropped,
                        // except for Hebrew and Arabic, where it is kept.
                        // NOTE(review): the original comment described creating a
                        // pair of codes (single and doubled) for Hebrew/Arabic;
                        // only the doubled variant is produced here.
                        if ($noVowels || $sound !== $workingEntry[count($workingEntry) - 1]) {
                            $workingEntry[] = $sound;
                        }
                        if (count($workingEntry) < 7) {
                            $partialResult[] = $workingEntry;
                        } else {
                            // This is the 6th code in the sequence
                            // We're looking for 7 entries because the first is '!' and doesn't count
                            $tempResult = str_replace('!', '', implode('', $workingEntry));
                            // Only return codes from recognisable sounds
                            if ($tempResult) {
                                $result[] = substr($tempResult . '000000', 0, 6);
                            }
                        }
                    }
                }
                $state = $state + 3; // Advance to next triplet while keeping the same basic state
            }
        }

        // Zero-fill and copy all remaining partial results
        foreach ($partialResult as $workingEntry) {
            $tempResult = str_replace('!', '', implode('', $workingEntry));
            // Only return codes from recognisable sounds
            if ($tempResult) {
                $result[] = substr($tempResult . '000000', 0, 6);
            }
        }

        return $result;
    }
}