<?php
/**
 * webtrees: online genealogy
 * Copyright (C) 2015 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
namespace Fisharebest\Webtrees;

/**
 * Phonetic matching of strings.
 *
 * Soundex codes are exchanged as a single string of fixed-length codes
 * separated by colons, e.g. "N000:Y620:N620".
 */
class Soundex {
	/**
	 * Which algorithms are supported.
	 *
	 * @return string[] Translated algorithm names, keyed by algorithm identifier
	 */
	public static function getAlgorithms() {
		return array(
			'std' => /* I18N: http://en.wikipedia.org/wiki/Soundex */ I18N::translate('Russell'),
			'dm'  => /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */ I18N::translate('Daitch-Mokotoff'),
		);
	}

	/**
	 * Is there a match between two soundex codes?
	 *
	 * Both arguments are colon-separated lists of codes. They match when
	 * at least one (non-empty) code occurs in both lists.
	 *
	 * @param string $soundex1
	 * @param string $soundex2
	 *
	 * @return bool
	 */
	public static function compare($soundex1, $soundex2) {
		// russell() and daitchMokotoff() return null when there is no code
		if (!$soundex1 || !$soundex2) {
			return false;
		}

		$codes2 = explode(':', $soundex2);
		foreach (explode(':', $soundex1) as $code) {
			// Compare whole codes. An empty code (from a stray delimiter) must never
			// match - a substring search for '' would succeed on any text.
			if ($code !== '' && in_array($code, $codes2, true)) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Generate Russell soundex codes for a given text.
	 *
	 * One code is generated for each whitespace-separated word, plus one code
	 * for all the words joined together.
	 *
	 * @param string $text
	 *
	 * @return null|string Colon-separated list of codes, or null if there are none
	 */
	public static function russell($text) {
		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
		$soundex_array = array();
		foreach ($words as $word) {
			$soundex = soundex($word);
			// Only return codes from recognisable sounds
			if ($soundex !== '0000') {
				$soundex_array[] = $soundex;
			}
		}
		// Combine words, e.g. “New York” as “Newyork”
		// Note: strtr($text, ' ', '') cannot be used to remove spaces - it leaves
		// the text unchanged when the replacement is shorter than the search string.
		if (count($words) > 1) {
			$soundex = soundex(implode('', $words));
			if ($soundex !== '0000') {
				$soundex_array[] = $soundex;
			}
		}
		// A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
		$soundex_array = array_slice(array_unique($soundex_array), 0, 51);

		if ($soundex_array) {
			return implode(':', $soundex_array);
		} else {
			return null;
		}
	}

	/**
	 * Generate Daitch–Mokotoff soundex codes for a given text.
	 *
	 * Codes are generated for each whitespace-separated word, plus the codes
	 * for all the words joined together.
	 *
	 * @param string $text
	 *
	 * @return null|string Colon-separated list of codes, or null if there are none
	 */
	public static function daitchMokotoff($text) {
		$words         = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
		$soundex_array = array();
		foreach ($words as $word) {
			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
		}
		// Combine words, e.g. “New York” as “Newyork”
		// The whitespace must really be removed, otherwise the "is the next chunk a vowel"
		// look-ahead in daitchMokotoffWord() sees the separator instead of the next letter.
		if (count($words) > 1) {
			$soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(implode('', $words)));
		}
		// A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
		$soundex_array = array_slice(array_unique($soundex_array), 0, 36);

		if ($soundex_array) {
			return implode(':', $soundex_array);
		} else {
			return null;
		}
	}

	// Determine the Daitch–Mokotoff Soundex code for a word
	// Original implementation by Gerry Kroll, and analysis by Meliza Amity

	// Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
	const MAXCHAR = 7;

	/**
	 * Name transformation arrays.
	 * Used to transform the Name string to simplify the "sounds like" table.
	 * This is especially useful in Hebrew.
	 *
	 * Each array entry defines the "search" and "replace" arguments of a
	 * str_replace($search, $replace, $text) call; the entries are applied in order.
	 *
	 * NOTE(review): the two entries ending in "$" look like they were written as
	 * regular expressions (end-of-word anchor). Because str_replace() is used, the
	 * "$" is matched literally - confirm the intent before changing either the
	 * table or the function call, as this affects the codes that are generated.
	 *
	 * Note about the use of "\x01":
	 * This code, which can’t legitimately occur in the kind of text we're dealing with,
	 * is used as a place-holder so that conditional string replacements can be done.
	 *
	 * @var string[][]
	 */
	private static $transformNameTable = array(
		// Force Yiddish ligatures to be treated as separate letters
		array('װ', 'וו'),
		array('ײ', 'יי'),
		array('ױ', 'וי'),
		array('בו', 'בע'),
		array('פו', 'פע'),
		array('ומ', 'עמ'),
		array('ום', 'עם'),
		array('ונ', 'ענ'),
		array('ון', 'ען'),
		array('וו', 'ב'),
		array("\x01", ''),
		array('ייה$', "\x01ה"),
		array('ייע$', "\x01ע"),
		array('יי', 'ע'),
		array("\x01", 'יי'),
	);

	/**
	 * The DM sound coding table is organized this way:
	 * key: a variable-length string that corresponds to the UTF-8 character sequence
	 * represented by the table entry. Currently, that string can be up to 7
	 * bytes long. This maximum length is defined by the class constant
	 * self::MAXCHAR.
	 *
	 * value: an array as follows:
	 * [0]: zero if not a vowel
	 * [1]: sound value when this string is at the beginning of the word
	 * [2]: sound value when this string is followed by a vowel
	 * [3]: sound value for other cases
	 * [1],[2],[3] can be repeated several times to create branches in the code
	 * an empty sound value means "ignore in this state"
	 *
	 * @var string[][]
	 */
	private static $dmsounds = array(
		'A'       => array('1', '0', '', ''),
		'À'       => array('1', '0', '', ''),
		'Á'       => array('1', '0', '', ''),
		'Â'       => array('1', '0', '', ''),
		'Ã'       => array('1', '0', '', ''),
		'Ä'       => array('1', '0', '1', '', '0', '', ''),
		'Å'       => array('1', '0', '', ''),
		'Ă'       => array('1', '0', '', ''),
		'Ą'       => array('1', '', '', '', '', '', '6'),
		'Ạ'       => array('1', '0', '', ''),
		'Ả'       => array('1', '0', '', ''),
		'Ấ'       => array('1', '0', '', ''),
		'Ầ'       => array('1', '0', '', ''),
		'Ẩ'       => array('1', '0', '', ''),
		'Ẫ'       => array('1', '0', '', ''),
		'Ậ'       => array('1', '0', '', ''),
		'Ắ'       => array('1', '0', '', ''),
		'Ằ'       => array('1', '0', '', ''),
		'Ẳ'       => array('1', '0', '', ''),
		'Ẵ'       => array('1', '0', '', ''),
		'Ặ'       => array('1', '0', '', ''),
		'AE'      => array('1', '0', '1', ''),
		'Æ'       => array('1', '0', '1', ''),
		'AI'      => array('1', '0', '1', ''),
		'AJ'      => array('1', '0', '1', ''),
		'AU'      => array('1', '0', '7', ''),
		'AV'      => array('1', '0', '7', '', '7', '7', '7'),
		'ÄU'      => array('1', '0', '1', ''),
		'AY'      => array('1', '0', '1', ''),
		'B'       => array('0', '7', '7', '7'),
		'C'       => array('0', '5', '5', '5', '34', '4', '4'),
		'Ć'       => array('0', '4', '4', '4'),
		'Č'       => array('0', '4', '4', '4'),
		'Ç'       => array('0', '4', '4', '4'),
		'CH'      => array('0', '5', '5', '5', '34', '4', '4'),
		'CHS'     => array('0', '5', '54', '54'),
		'CK'      => array('0', '5', '5', '5', '45', '45', '45'),
		'CCS'     => array('0', '4', '4', '4'),
		'CS'      => array('0', '4', '4', '4'),
		'CSZ'     => array('0', '4', '4', '4'),
		'CZ'      => array('0', '4', '4', '4'),
		'CZS'     => array('0', '4', '4', '4'),
		'D'       => array('0', '3', '3', '3'),
		'Ď'       => array('0', '3', '3', '3'),
		'Đ'       => array('0', '3', '3', '3'),
		'DRS'     => array('0', '4', '4', '4'),
		'DRZ'     => array('0', '4', '4', '4'),
		'DS'      => array('0', '4', '4', '4'),
		'DSH'     => array('0', '4', '4', '4'),
		'DSZ'     => array('0', '4', '4', '4'),
		'DT'      => array('0', '3', '3', '3'),
		'DDZ'     => array('0', '4', '4', '4'),
		'DDZS'    => array('0', '4', '4', '4'),
		'DZ'      => array('0', '4', '4', '4'),
		'DŹ'      => array('0', '4', '4', '4'),
		'DŻ'      => array('0', '4', '4', '4'),
		'DZH'     => array('0', '4', '4', '4'),
		'DZS'     => array('0', '4', '4', '4'),
		'E'       => array('1', '0', '', ''),
		'È'       => array('1', '0', '', ''),
		'É'       => array('1', '0', '', ''),
		'Ê'       => array('1', '0', '', ''),
		'Ë'       => array('1', '0', '', ''),
		'Ĕ'       => array('1', '0', '', ''),
		'Ė'       => array('1', '0', '', ''),
		'Ę'       => array('1', '', '', '6', '', '', ''),
		'Ẹ'       => array('1', '0', '', ''),
		'Ẻ'       => array('1', '0', '', ''),
		'Ẽ'       => array('1', '0', '', ''),
		'Ế'       => array('1', '0', '', ''),
		'Ề'       => array('1', '0', '', ''),
		'Ể'       => array('1', '0', '', ''),
		'Ễ'       => array('1', '0', '', ''),
		'Ệ'       => array('1', '0', '', ''),
		'EAU'     => array('1', '0', '', ''),
		'EI'      => array('1', '0', '1', ''),
		'EJ'      => array('1', '0', '1', ''),
		'EU'      => array('1', '1', '1', ''),
		'EY'      => array('1', '0', '1', ''),
		'F'       => array('0', '7', '7', '7'),
		'FB'      => array('0', '7', '7', '7'),
		'G'       => array('0', '5', '5', '5', '34', '4', '4'),
		'Ğ'       => array('0', '', '', ''),
		'GGY'     => array('0', '5', '5', '5'),
		'GY'      => array('0', '5', '5', '5'),
		'H'       => array('0', '5', '5', '', '5', '5', '5'),
		'I'       => array('1', '0', '', ''),
		'Ì'       => array('1', '0', '', ''),
		'Í'       => array('1', '0', '', ''),
		'Î'       => array('1', '0', '', ''),
		'Ï'       => array('1', '0', '', ''),
		'Ĩ'       => array('1', '0', '', ''),
		'Į'       => array('1', '0', '', ''),
		'İ'       => array('1', '0', '', ''),
		'Ỉ'       => array('1', '0', '', ''),
		'Ị'       => array('1', '0', '', ''),
		'IA'      => array('1', '1', '', ''),
		'IE'      => array('1', '1', '', ''),
		'IO'      => array('1', '1', '', ''),
		'IU'      => array('1', '1', '', ''),
		'J'       => array('0', '1', '', '', '4', '4', '4', '5', '5', ''),
		'K'       => array('0', '5', '5', '5'),
		'KH'      => array('0', '5', '5', '5'),
		'KS'      => array('0', '5', '54', '54'),
		'L'       => array('0', '8', '8', '8'),
		'Ľ'       => array('0', '8', '8', '8'),
		'Ĺ'       => array('0', '8', '8', '8'),
		'Ł'       => array('0', '7', '7', '7', '8', '8', '8'),
		'LL'      => array('0', '8', '8', '8', '58', '8', '8', '1', '8', '8'),
		'LLY'     => array('0', '8', '8', '8', '1', '8', '8'),
		'LY'      => array('0', '8', '8', '8', '1', '8', '8'),
		'M'       => array('0', '6', '6', '6'),
		'MĔ'      => array('0', '66', '66', '66'),
		'MN'      => array('0', '66', '66', '66'),
		'N'       => array('0', '6', '6', '6'),
		'Ń'       => array('0', '6', '6', '6'),
		'Ň'       => array('0', '6', '6', '6'),
		'Ñ'       => array('0', '6', '6', '6'),
		'NM'      => array('0', '66', '66', '66'),
		'O'       => array('1', '0', '', ''),
		'Ò'       => array('1', '0', '', ''),
		'Ó'       => array('1', '0', '', ''),
		'Ô'       => array('1', '0', '', ''),
		'Õ'       => array('1', '0', '', ''),
		'Ö'       => array('1', '0', '', ''),
		'Ø'       => array('1', '0', '', ''),
		'Ő'       => array('1', '0', '', ''),
		'Œ'       => array('1', '0', '', ''),
		'Ơ'       => array('1', '0', '', ''),
		'Ọ'       => array('1', '0', '', ''),
		'Ỏ'       => array('1', '0', '', ''),
		'Ố'       => array('1', '0', '', ''),
		'Ồ'       => array('1', '0', '', ''),
		'Ổ'       => array('1', '0', '', ''),
		'Ỗ'       => array('1', '0', '', ''),
		'Ộ'       => array('1', '0', '', ''),
		'Ớ'       => array('1', '0', '', ''),
		'Ờ'       => array('1', '0', '', ''),
		'Ở'       => array('1', '0', '', ''),
		'Ỡ'       => array('1', '0', '', ''),
		'Ợ'       => array('1', '0', '', ''),
		'OE'      => array('1', '0', '', ''),
		'OI'      => array('1', '0', '1', ''),
		'OJ'      => array('1', '0', '1', ''),
		'OU'      => array('1', '0', '', ''),
		'OY'      => array('1', '0', '1', ''),
		'P'       => array('0', '7', '7', '7'),
		'PF'      => array('0', '7', '7', '7'),
		'PH'      => array('0', '7', '7', '7'),
		'Q'       => array('0', '5', '5', '5'),
		'R'       => array('0', '9', '9', '9'),
		'Ř'       => array('0', '4', '4', '4'),
		'RS'      => array('0', '4', '4', '4', '94', '94', '94'),
		'RZ'      => array('0', '4', '4', '4', '94', '94', '94'),
		'S'       => array('0', '4', '4', '4'),
		'Ś'       => array('0', '4', '4', '4'),
		'Š'       => array('0', '4', '4', '4'),
		'Ş'       => array('0', '4', '4', '4'),
		'SC'      => array('0', '2', '4', '4'),
		'ŠČ'      => array('0', '2', '4', '4'),
		'SCH'     => array('0', '4', '4', '4'),
		'SCHD'    => array('0', '2', '43', '43'),
		'SCHT'    => array('0', '2', '43', '43'),
		'SCHTCH'  => array('0', '2', '4', '4'),
		'SCHTSCH' => array('0', '2', '4', '4'),
		'SCHTSH'  => array('0', '2', '4', '4'),
		'SD'      => array('0', '2', '43', '43'),
		'SH'      => array('0', '4', '4', '4'),
		'SHCH'    => array('0', '2', '4', '4'),
		'SHD'     => array('0', '2', '43', '43'),
		'SHT'     => array('0', '2', '43', '43'),
		'SHTCH'   => array('0', '2', '4', '4'),
		'SHTSH'   => array('0', '2', '4', '4'),
		'ß'       => array('0', '', '4', '4'),
		'ST'      => array('0', '2', '43', '43'),
		'STCH'    => array('0', '2', '4', '4'),
		'STRS'    => array('0', '2', '4', '4'),
		'STRZ'    => array('0', '2', '4', '4'),
		'STSCH'   => array('0', '2', '4', '4'),
		'STSH'    => array('0', '2', '4', '4'),
		'SSZ'     => array('0', '4', '4', '4'),
		'SZ'      => array('0', '4', '4', '4'),
		'SZCS'    => array('0', '2', '4', '4'),
		'SZCZ'    => array('0', '2', '4', '4'),
		'SZD'     => array('0', '2', '43', '43'),
		'SZT'     => array('0', '2', '43', '43'),
		'T'       => array('0', '3', '3', '3'),
		'Ť'       => array('0', '3', '3', '3'),
		'Ţ'       => array('0', '3', '3', '3', '4', '4', '4'),
		'TC'      => array('0', '4', '4', '4'),
		'TCH'     => array('0', '4', '4', '4'),
		'TH'      => array('0', '3', '3', '3'),
		'TRS'     => array('0', '4', '4', '4'),
		'TRZ'     => array('0', '4', '4', '4'),
		'TS'      => array('0', '4', '4', '4'),
		'TSCH'    => array('0', '4', '4', '4'),
		'TSH'     => array('0', '4', '4', '4'),
		'TSZ'     => array('0', '4', '4', '4'),
		'TTCH'    => array('0', '4', '4', '4'),
		'TTS'     => array('0', '4', '4', '4'),
		'TTSCH'   => array('0', '4', '4', '4'),
		'TTSZ'    => array('0', '4', '4', '4'),
		'TTZ'     => array('0', '4', '4', '4'),
		'TZ'      => array('0', '4', '4', '4'),
		'TZS'     => array('0', '4', '4', '4'),
		'U'       => array('1', '0', '', ''),
		'Ù'       => array('1', '0', '', ''),
		'Ú'       => array('1', '0', '', ''),
		'Û'       => array('1', '0', '', ''),
		'Ü'       => array('1', '0', '', ''),
		'Ũ'       => array('1', '0', '', ''),
		'Ū'       => array('1', '0', '', ''),
		'Ů'       => array('1', '0', '', ''),
		'Ű'       => array('1', '0', '', ''),
		'Ų'       => array('1', '0', '', ''),
		'Ư'       => array('1', '0', '', ''),
		'Ụ'       => array('1', '0', '', ''),
		'Ủ'       => array('1', '0', '', ''),
		'Ứ'       => array('1', '0', '', ''),
		'Ừ'       => array('1', '0', '', ''),
		'Ử'       => array('1', '0', '', ''),
		'Ữ'       => array('1', '0', '', ''),
		'Ự'       => array('1', '0', '', ''),
		'UE'      => array('1', '0', '', ''),
		'UI'      => array('1', '0', '1', ''),
		'UJ'      => array('1', '0', '1', ''),
		'UY'      => array('1', '0', '1', ''),
		'UW'      => array('1', '0', '1', '', '0', '7', '7'),
		'V'       => array('0', '7', '7', '7'),
		'W'       => array('0', '7', '7', '7'),
		'X'       => array('0', '5', '54', '54'),
		'Y'       => array('1', '1', '', ''),
		'Ý'       => array('1', '1', '', ''),
		'Ỳ'       => array('1', '1', '', ''),
		'Ỵ'       => array('1', '1', '', ''),
		'Ỷ'       => array('1', '1', '', ''),
		'Ỹ'       => array('1', '1', '', ''),
		'Z'       => array('0', '4', '4', '4'),
		'Ź'       => array('0', '4', '4', '4'),
		'Ż'       => array('0', '4', '4', '4'),
		'Ž'       => array('0', '4', '4', '4'),
		'ZD'      => array('0', '2', '43', '43'),
		'ZDZ'     => array('0', '2', '4', '4'),
		'ZDZH'    => array('0', '2', '4', '4'),
		'ZH'      => array('0', '4', '4', '4'),
		'ZHD'     => array('0', '2', '43', '43'),
		'ZHDZH'   => array('0', '2', '4', '4'),
		'ZS'      => array('0', '4', '4', '4'),
		'ZSCH'    => array('0', '4', '4', '4'),
		'ZSH'     => array('0', '4', '4', '4'),
		'ZZS'     => array('0', '4', '4', '4'),
		// Cyrillic alphabet
		'А'  => array('1', '0', '', ''),
		'Б'  => array('0', '7', '7', '7'),
		'В'  => array('0', '7', '7', '7'),
		'Г'  => array('0', '5', '5', '5'),
		'Д'  => array('0', '3', '3', '3'),
		'ДЗ' => array('0', '4', '4', '4'),
		'Е'  => array('1', '0', '', ''),
		'Ё'  => array('1', '0', '', ''),
		'Ж'  => array('0', '4', '4', '4'),
		'З'  => array('0', '4', '4', '4'),
		'И'  => array('1', '0', '', ''),
		'Й'  => array('1', '1', '', '', '4', '4', '4'),
		'К'  => array('0', '5', '5', '5'),
		'Л'  => array('0', '8', '8', '8'),
		'М'  => array('0', '6', '6', '6'),
		'Н'  => array('0', '6', '6', '6'),
		'О'  => array('1', '0', '', ''),
		'П'  => array('0', '7', '7', '7'),
		'Р'  => array('0', '9', '9', '9'),
		'РЖ' => array('0', '4', '4', '4'),
		'С'  => array('0', '4', '4', '4'),
		'Т'  => array('0', '3', '3', '3'),
		'У'  => array('1', '0', '', ''),
		'Ф'  => array('0', '7', '7', '7'),
		'Х'  => array('0', '5', '5', '5'),
		'Ц'  => array('0', '4', '4', '4'),
		'Ч'  => array('0', '4', '4', '4'),
		'Ш'  => array('0', '4', '4', '4'),
		'Щ'  => array('0', '2', '4', '4'),
		'Ъ'  => array('0', '', '', ''),
		'Ы'  => array('0', '1', '', ''),
		'Ь'  => array('0', '', '', ''),
		'Э'  => array('1', '0', '', ''),
		'Ю'  => array('0', '1', '', ''),
		'Я'  => array('0', '1', '', ''),
		// Greek alphabet
		'Α'  => array('1', '0', '', ''),
		'Ά'  => array('1', '0', '', ''),
		'ΑΙ' => array('1', '0', '1', ''),
		'ΑΥ' => array('1', '0', '1', ''),
		'Β'  => array('0', '7', '7', '7'),
		'Γ'  => array('0', '5', '5', '5'),
		'Δ'  => array('0', '3', '3', '3'),
		'Ε'  => array('1', '0', '', ''),
		'Έ'  => array('1', '0', '', ''),
		'ΕΙ' => array('1', '0', '1', ''),
		'ΕΥ' => array('1', '1', '1', ''),
		'Ζ'  => array('0', '4', '4', '4'),
		'Η'  => array('1', '0', '', ''),
		'Ή'  => array('1', '0', '', ''),
		'Θ'  => array('0', '3', '3', '3'),
		'Ι'  => array('1', '0', '', ''),
		'Ί'  => array('1', '0', '', ''),
		'Ϊ'  => array('1', '0', '', ''),
		'ΐ'  => array('1', '0', '', ''),
		'Κ'  => array('0', '5', '5', '5'),
		'Λ'  => array('0', '8', '8', '8'),
		'Μ'  => array('0', '6', '6', '6'),
		'ΜΠ' => array('0', '7', '7', '7'),
		'Ν'  => array('0', '6', '6', '6'),
		'ΝΤ' => array('0', '3', '3', '3'),
		'Ξ'  => array('0', '5', '54', '54'),
		'Ο'  => array('1', '0', '', ''),
		'Ό'  => array('1', '0', '', ''),
		'ΟΙ' => array('1', '0', '1', ''),
		'ΟΥ' => array('1', '0', '1', ''),
		'Π'  => array('0', '7', '7', '7'),
		'Ρ'  => array('0', '9', '9', '9'),
		'Σ'  => array('0', '4', '4', '4'),
		'ς'  => array('0', '', '', '4'),
		'Τ'  => array('0', '3', '3', '3'),
		'ΤΖ' => array('0', '4', '4', '4'),
		'ΤΣ' => array('0', '4', '4', '4'),
		'Υ'  => array('1', '1', '', ''),
		'Ύ'  => array('1', '1', '', ''),
		'Ϋ'  => array('1', '1', '', ''),
		'ΰ'  => array('1', '1', '', ''),
		'ΥΚ' => array('1', '5', '5', '5'),
		'ΥΥ' => array('1', '65', '65', '65'),
		'Φ'  => array('0', '7', '7', '7'),
		'Χ'  => array('0', '5', '5', '5'),
		'Ψ'  => array('0', '7', '7', '7'),
		'Ω'  => array('1', '0', '', ''),
		'Ώ'  => array('1', '0', '', ''),
		// Hebrew alphabet
		'א'   => array('1', '0', '', ''),
		'או'  => array('1', '0', '7', ''),
		'אג'  => array('1', '4', '4', '4', '5', '5', '5', '34', '34', '34'),
		'בב'  => array('0', '7', '7', '7', '77', '77', '77'),
		'ב'   => array('0', '7', '7', '7'),
		'גג'  => array('0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'),
		'גד'  => array('0', '43', '43', '43', '53', '53', '53'),
		'גה'  => array('0', '45', '45', '45', '55', '55', '55'),
		'גז'  => array('0', '44', '44', '44', '45', '45', '45'),
		'גח'  => array('0', '45', '45', '45', '55', '55', '55'),
		'גכ'  => array('0', '45', '45', '45', '55', '55', '55'),
		'גך'  => array('0', '45', '45', '45', '55', '55', '55'),
		'גצ'  => array('0', '44', '44', '44', '45', '45', '45'),
		'גץ'  => array('0', '44', '44', '44', '45', '45', '45'),
		'גק'  => array('0', '45', '45', '45', '54', '54', '54'),
		'גש'  => array('0', '44', '44', '44', '54', '54', '54'),
		'גת'  => array('0', '43', '43', '43', '53', '53', '53'),
		'ג'   => array('0', '4', '4', '4', '5', '5', '5'),
		'דז'  => array('0', '4', '4', '4'),
		'דד'  => array('0', '3', '3', '3', '33', '33', '33'),
		'דט'  => array('0', '33', '33', '33'),
		'דש'  => array('0', '4', '4', '4'),
		'דצ'  => array('0', '4', '4', '4'),
		'דץ'  => array('0', '4', '4', '4'),
		'ד'   => array('0', '3', '3', '3'),
		'הג'  => array('0', '54', '54', '54', '55', '55', '55'),
		'הכ'  => array('0', '55', '55', '55'),
		'הח'  => array('0', '55', '55', '55'),
		'הק'  => array('0', '55', '55', '55', '5', '5', '5'),
		'הה'  => array('0', '5', '5', '', '55', '55', ''),
		'ה'   => array('0', '5', '5', ''),
		'וי'  => array('1', '', '', '', '7', '7', '7'),
		'ו'   => array('1', '7', '7', '7', '7', '', ''),
		'וו'  => array('1', '7', '7', '7', '7', '', ''),
		'וופ' => array('1', '7', '7', '7', '77', '77', '77'),
		'זש'  => array('0', '4', '4', '4', '44', '44', '44'),
		'זדז' => array('0', '2', '4', '4'),
		'ז'   => array('0', '4', '4', '4'),
		'זג'  => array('0', '44', '44', '44', '45', '45', '45'),
		'זז'  => array('0', '4', '4', '4', '44', '44', '44'),
		'זס'  => array('0', '44', '44', '44'),
		'זצ'  => array('0', '44', '44', '44'),
		'זץ'  => array('0', '44', '44', '44'),
		'חג'  => array('0', '54', '54', '54', '53', '53', '53'),
		'חח'  => array('0', '5', '5', '5', '55', '55', '55'),
		'חק'  => array('0', '55', '55', '55', '5', '5', '5'),
		'חכ'  => array('0', '45', '45', '45', '55', '55', '55'),
		'חס'  => array('0', '5', '54', '54'),
		'חש'  => array('0', '5', '54', '54'),
		'ח'   => array('0', '5', '5', '5'),
		'טש'  => array('0', '4', '4', '4'),
		'טד'  => array('0', '33', '33', '33'),
		'טי'  => array('0', '3', '3', '3', '4', '4', '4', '3', '3', '34'),
		'טת'  => array('0', '33', '33', '33'),
		'טט'  => array('0', '3', '3', '3', '33', '33', '33'),
		'ט'   => array('0', '3', '3', '3'),
		'י'   => array('1', '1', '', ''),
		'יא'  => array('1', '1', '', '', '1', '1', '1'),
		'כג'  => array('0', '55', '55', '55', '54', '54', '54'),
		'כש'  => array('0', '5', '54', '54'),
		'כס'  => array('0', '5', '54', '54'),
		'ככ'  => array('0', '5', '5', '5', '55', '55', '55'),
		'כך'  => array('0', '5', '5', '5', '55', '55', '55'),
		'כ'   => array('0', '5', '5', '5'),
		'כח'  => array('0', '55', '55', '55', '5', '5', '5'),
		'ך'   => array('0', '', '5', '5'),
		'ל'   => array('0', '8', '8', '8'),
		'לל'  => array('0', '88', '88', '88', '8', '8', '8'),
		'מנ'  => array('0', '66', '66', '66'),
		'מן'  => array('0', '66', '66', '66'),
		'ממ'  => array('0', '6', '6', '6', '66', '66', '66'),
		'מם'  => array('0', '6', '6', '6', '66', '66', '66'),
		'מ'   => array('0', '6', '6', '6'),
		'ם'   => array('0', '', '6', '6'),
		'נמ'  => array('0', '66', '66', '66'),
		'נם'  => array('0', '66', '66', '66'),
		'ננ'  => array('0', '6', '6', '6', '66', '66', '66'),
		'נן'  => array('0', '6', '6', '6', '66', '66', '66'),
		'נ'   => array('0', '6', '6', '6'),
		'ן'   => array('0', '', '6', '6'),
		'סתש' => array('0', '2', '4', '4'),
		'סתז' => array('0', '2', '4', '4'),
		'סטז' => array('0', '2', '4', '4'),
		'סטש' => array('0', '2', '4', '4'),
		'סצד' => array('0', '2', '4', '4'),
		'סט'  => array('0', '2', '4', '4', '43', '43', '43'),
		'סת'  => array('0', '2', '4', '4', '43', '43', '43'),
		'סג'  => array('0', '44', '44', '44', '4', '4', '4'),
		'סס'  => array('0', '4', '4', '4', '44', '44', '44'),
		'סצ'  => array('0', '44', '44', '44'),
		'סץ'  => array('0', '44', '44', '44'),
		'סז'  => array('0', '44', '44', '44'),
		'סש'  => array('0', '44', '44', '44'),
		'ס'   => array('0', '4', '4', '4'),
		'ע'   => array('1', '0', '', ''),
		'פב'  => array('0', '7', '7', '7', '77', '77', '77'),
		'פוו' => array('0', '7', '7', '7', '77', '77', '77'),
		'פפ'  => array('0', '7', '7', '7', '77', '77', '77'),
		'פף'  => array('0', '7', '7', '7', '77', '77', '77'),
		'פ'   => array('0', '7', '7', '7'),
		'ף'   => array('0', '', '7', '7'),
		'צג'  => array('0', '44', '44', '44', '45', '45', '45'),
		'צז'  => array('0', '44', '44', '44'),
		'צס'  => array('0', '44', '44', '44'),
		'צצ'  => array('0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'),
		'צץ'  => array('0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'),
		'צש'  => array('0', '44', '44', '44', '4', '4', '4', '5', '5', '5'),
		'צ'   => array('0', '4', '4', '4', '5', '5', '5'),
		'ץ'   => array('0', '', '4', '4'),
		'קה'  => array('0', '55', '55', '5'),
		'קס'  => array('0', '5', '54', '54'),
		'קש'  => array('0', '5', '54', '54'),
		'קק'  => array('0', '5', '5', '5', '55', '55', '55'),
		'קח'  => array('0', '55', '55', '55'),
		'קכ'  => array('0', '55', '55', '55'),
		'קך'  => array('0', '55', '55', '55'),
		'קג'  => array('0', '55', '55', '55', '54', '54', '54'),
		'ק'   => array('0', '5', '5', '5'),
		'רר'  => array('0', '99', '99', '99', '9', '9', '9'),
		'ר'   => array('0', '9', '9', '9'),
		'שטז' => array('0', '2', '4', '4'),
		'שתש' => array('0', '2', '4', '4'),
		'שתז' => array('0', '2', '4', '4'),
		'שטש' => array('0', '2', '4', '4'),
		'שד'  => array('0', '2', '43', '43'),
		'שז'  => array('0', '44', '44', '44'),
		'שס'  => array('0', '44', '44', '44'),
		'שת'  => array('0', '2', '43', '43'),
		'שג'  => array('0', '4', '4', '4', '44', '44', '44', '4', '43', '43'),
		'שט'  => array('0', '2', '43', '43', '44', '44', '44'),
		'שצ'  => array('0', '44', '44', '44', '45', '45', '45'),
		'שץ'  => array('0', '44', '', '44', '45', '', '45'),
		'שש'  => array('0', '4', '4', '4', '44', '44', '44'),
		'ש'   => array('0', '4', '4', '4'),
		'תג'  => array('0', '34', '34', '34'),
		'תז'  => array('0', '34', '34', '34'),
		'תש'  => array('0', '4', '4', '4'),
		'תת'  => array('0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'),
		'ת'   => array('0', '3', '3', '3', '4', '4', '4'),
		// Arabic alphabet
		'ا'  => array('1', '0', '', ''),
		'ب'  => array('0', '7', '7', '7'),
		'ت'  => array('0', '3', '3', '3'),
		'ث'  => array('0', '3', '3', '3'),
		'ج'  => array('0', '4', '4', '4'),
		'ح'  => array('0', '5', '5', '5'),
		'خ'  => array('0', '5', '5', '5'),
		'د'  => array('0', '3', '3', '3'),
		'ذ'  => array('0', '3', '3', '3'),
		'ر'  => array('0', '9', '9', '9'),
		'ز'  => array('0', '4', '4', '4'),
		'س'  => array('0', '4', '4', '4'),
		'ش'  => array('0', '4', '4', '4'),
		'ص'  => array('0', '4', '4', '4'),
		'ض'  => array('0', '3', '3', '3'),
		'ط'  => array('0', '3', '3', '3'),
		'ظ'  => array('0', '4', '4', '4'),
		'ع'  => array('1', '0', '', ''),
		'غ'  => array('0', '0', '', ''),
		'ف'  => array('0', '7', '7', '7'),
		'ق'  => array('0', '5', '5', '5'),
		'ك'  => array('0', '5', '5', '5'),
		'ل'  => array('0', '8', '8', '8'),
		'لا' => array('0', '8', '8', '8'),
		'م'  => array('0', '6', '6', '6'),
		'ن'  => array('0', '6', '6', '6'),
		'هن' => array('0', '66', '66', '66'),
		'ه'  => array('0', '5', '5', ''),
		'و'  => array('1', '', '', '', '7', '', ''),
		'ي'  => array('0', '1', '', ''),
		'آ'  => array('0', '1', '', ''),
		'ة'  => array('0', '', '', '3'),
		'ی'  => array('0', '1', '', ''),
		'ى'  => array('1', '1', '', ''),
	);

	/**
	 * Find the longest key of the DM sound table that starts at a given position.
	 *
	 * @param string $name   The (upper-cased, transformed) name
	 * @param int    $offset Byte offset into $name
	 *
	 * @return string The matching table key, or an empty string if there is none
	 */
	private static function longestTableEntry($name, $offset) {
		// Start with the maximum length chunk.
		// The cast protects against substr() returning false on older versions of PHP.
		$chunk = (string) substr($name, $offset, self::MAXCHAR);
		while ($chunk !== '' && !isset(self::$dmsounds[$chunk])) {
			// Not in table: drop the last byte and try a shorter chunk
			$chunk = (string) substr($chunk, 0, -1);
		}

		return $chunk;
	}

	/**
	 * Convert a list of sound values into a zero-filled 6-digit DM code.
	 *
	 * @param string[] $workingEntry Sound values, possibly containing '!' markers
	 *
	 * @return null|string The 6-digit code, or null if no sound was recognised
	 */
	private static function formatCode($workingEntry) {
		$digits = str_replace('!', '', implode('', $workingEntry));

		// Only return codes from recognisable sounds.
		// Compare with '' explicitly: the single sound '0' (an initial vowel) is valid,
		// but it is "falsy" in PHP and would be discarded by a plain truth test.
		if ($digits === '') {
			return null;
		}

		return substr($digits . '000000', 0, 6);
	}

	/**
	 * Calculate the Daitch-Mokotoff soundex for a word.
	 *
	 * The word is upper-cased, rewritten using self::$transformNameTable and then
	 * consumed left-to-right, always taking the longest chunk that is a key of
	 * self::$dmsounds. Bytes that do not start any key are skipped. Table entries
	 * with more than one triplet of sound values create alternative (branched) codes.
	 *
	 * @param string $name
	 *
	 * @return string[] List of possible DM codes for the word.
	 */
	private static function daitchMokotoffWord($name) {
		// Apply special transformation rules to the input string
		$name = I18N::strtoupper($name);
		foreach (self::$transformNameTable as $transformRule) {
			$name = str_replace($transformRule[0], $transformRule[1], $name);
		}

		// Initialize
		$name_script = I18N::textScript($name);
		$noVowels    = ($name_script == 'Hebr' || $name_script == 'Arab');

		$lastPos       = strlen($name) - 1;
		$currPos       = 0;
		$state         = 1; // 1: start of input string, 2: before vowel, 3: other
		$result        = array(); // accumulate complete 6-digit D-M codes here
		$partialResult = array(); // accumulate incomplete D-M codes here
		$partialResult[] = array('!'); // initialize 1st partial result ('!' stops "duplicate sound" check)

		// Loop through the input string.
		// Stop when the string is exhausted or when no more partial results remain
		while (count($partialResult) !== 0 && $currPos <= $lastPos) {
			// Find the DM coding table entry for the chunk at the current position
			$thisEntry = self::longestTableEntry($name, $currPos);
			if ($thisEntry === '') {
				$currPos++; // Not in table: advance pointer to next byte
				continue; // and try again
			}

			$soundTableEntry = self::$dmsounds[$thisEntry];
			$workingResult   = $partialResult;
			$partialResult   = array();
			$currPos += strlen($thisEntry);

			// Not at beginning of input string
			if ($state != 1) {
				// Determine whether the next chunk is a vowel
				$nextEntry = $currPos <= $lastPos ? self::longestTableEntry($name, $currPos) : '';
				if ($nextEntry !== '' && self::$dmsounds[$nextEntry][0] != '0') {
					// Next chunk is a vowel
					$state = 2;
				} else {
					// Next chunk is not a vowel (or there is no next chunk)
					$state = 3;
				}
			}

			// Visit the sound value for this state in every triplet of the table entry
			while ($state < count($soundTableEntry)) {
				$sound = $soundTableEntry[$state];
				if ($sound == '') {
					// empty means 'ignore this sound in this state'
					foreach ($workingResult as $workingEntry) {
						$workingEntry[count($workingEntry) - 1] .= '!'; // Prevent false 'doubles'
						$partialResult[] = $workingEntry;
					}
				} else {
					foreach ($workingResult as $workingEntry) {
						// A sound that duplicates the previous sound is normally dropped.
						// For Hebrew and Arabic it is kept, i.e. appended a second time.
						// NOTE(review): an earlier comment described emitting a pair of codes
						// (single and doubled sound); only the doubled one is produced here.
						if ($noVowels || $sound !== $workingEntry[count($workingEntry) - 1]) {
							$workingEntry[] = $sound;
						}
						if (count($workingEntry) < 7) {
							$partialResult[] = $workingEntry;
						} else {
							// This is the 6th code in the sequence
							// We're looking for 7 entries because the first is '!' and doesn't count
							$code = self::formatCode($workingEntry);
							if ($code !== null) {
								$result[] = $code;
							}
						}
					}
				}
				$state = $state + 3; // Advance to next triplet while keeping the same basic state
			}
		}

		// Zero-fill and copy all remaining partial results
		foreach ($partialResult as $workingEntry) {
			$code = self::formatCode($workingEntry);
			if ($code !== null) {
				$result[] = $code;
			}
		}

		return $result;
	}
}