<?php
namespace Fisharebest\Webtrees;

/**
 * webtrees: online genealogy
 * Copyright (C) 2015 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Class Soundex Functions for phonetic matching of strings
 *
 * Codes for a text are returned as a single string in which the individual
 * codes are separated by colons, e.g. "N000:Y620:N620".
 */
class Soundex {
    /**
     * List the supported phonetic algorithms.
     *
     * @return string[] Translated algorithm names, keyed by algorithm identifier ('std', 'dm')
     */
    public static function getAlgorithms() {
        return array(
            'std' => /* I18N: http://en.wikipedia.org/wiki/Soundex */ I18N::translate('Russell'),
            'dm'  => /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */ I18N::translate('Daitch-Mokotoff'),
        );
    }

    /**
     * Is there a match between two soundex codes?
     *
     * Each argument is a colon-separated list of codes.  The lists match
     * when at least one code of the first list occurs in the second list.
     *
     * @param string $soundex1
     * @param string $soundex2
     *
     * @return boolean
     */
    public static function compare($soundex1, $soundex2) {
        if ($soundex1 && $soundex2) {
            foreach (explode(':', $soundex1) as $code) {
                // Skip empty codes (e.g. from a leading or doubled delimiter).
                // strpos() treats an empty needle as a match on PHP 8 and
                // raises a warning on earlier versions.
                if ($code !== '' && strpos($soundex2, $code) !== false) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Generate Russell soundex codes for a given text.
     *
     * One code is generated for each whitespace-separated word.  When the
     * text has more than one word, a further code is generated for the
     * words joined together.
     *
     * @param string $text
     *
     * @return null|string Colon-separated codes, or null if no code could be generated
     */
    public static function russell($text) {
        $words = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);

        // Combine words, e.g. “New York” as “Newyork”
        // (strtr() with an empty replacement string leaves its input unchanged,
        // so the words are joined explicitly.)
        $candidates = $words;
        if (count($words) > 1) {
            $candidates[] = implode('', $words);
        }

        $soundex_array = array();
        foreach ($candidates as $candidate) {
            $soundex = soundex($candidate);
            // Only return codes from recognisable sounds.  soundex() gives an
            // empty result for a word that contains no ASCII letters.
            if ($soundex !== false && $soundex !== '' && $soundex !== '0000') {
                $soundex_array[] = $soundex;
            }
        }

        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

        if ($soundex_array) {
            return implode(':', $soundex_array);
        } else {
            return null;
        }
    }

    /**
     * Generate Daitch–Mokotoff soundex codes for a given text.
     *
     * Codes are generated for each whitespace-separated word.  When the
     * text has more than one word, further codes are generated for the
     * words joined together.
     *
     * @param string $text
     *
     * @return null|string Colon-separated codes, or null if no code could be generated
     */
    public static function daitchMokotoff($text) {
        $words = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);

        // Combine words, e.g. “New York” as “Newyork”
        // (strtr() with an empty replacement string leaves its input unchanged,
        // so the words are joined explicitly.)
        $candidates = $words;
        if (count($words) > 1) {
            $candidates[] = implode('', $words);
        }

        $soundex_array = array();
        foreach ($candidates as $candidate) {
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($candidate));
        }

        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);

        if ($soundex_array) {
            return implode(':', $soundex_array);
        } else {
            return null;
        }
    }

    // Determine the Daitch–Mokotoff Soundex code for a word
    // Original implementation by Gerry Kroll, and analysis by Meliza Amity

    // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
    const MAXCHAR = 7;

    /**
     * Name transformation arrays.
     * Used to transform the Name string to simplify the "sounds like" table.
     * This is especially useful in Hebrew.
     *
     * Each array entry defines the "from" and "to" arguments of a
     * str_replace($from, $to, $text) call; the rules are applied in order.
     *
     * Note about the use of "\x01":
     * This code, which can’t legitimately occur in the kind of text we're dealing with,
     * is used as a place-holder so that conditional string replacements can be done.
     *
     * NOTE(review): because the rules are applied with str_replace(), the trailing
     * "$" in two of the rules below is matched as a literal dollar sign and not as
     * an end-of-word anchor, so those two rules do not fire on ordinary names.
     * They are left unchanged here, as anchoring them would change the codes that
     * are generated - confirm the intended behaviour before altering them.
     *
     * @var string[][]
     */
    private static $transformNameTable = array(
        // Force Yiddish ligatures to be treated as separate letters
        array('װ', 'וו'),
        array('ײ', 'יי'),
        array('ױ', 'וי'),
        array('בו', 'בע'),
        array('פו', 'פע'),
        array('ומ', 'עמ'),
        array('ום', 'עם'),
        array('ונ', 'ענ'),
        array('ון', 'ען'),
        array('וו', 'ב'),
        array("\x01", ''),
        array('ייה$', "\x01ה"),
        array('ייע$', "\x01ע"),
        array('יי', 'ע'),
        array("\x01", 'יי'),
    );

    /**
     * The DM sound coding table is organized this way:
     * key: a variable-length string that corresponds to the UTF-8 character sequence
     * represented by the table entry. Currently, that string can be up to 7
     * bytes long. This maximum length is defined by the class constant
     * self::MAXCHAR.
     *
     * value: an array as follows:
     * [0]: zero if not a vowel
     * [1]: sound value when this string is at the beginning of the word
     * [2]: sound value when this string is followed by a vowel
     * [3]: sound value for other cases
     * [1],[2],[3] can be repeated several times to create branches in the code
     * an empty sound value means "ignore in this state"
     *
     * @var string[][]
     */
    private static $dmsounds = array(
        'A' => array('1', '0', '', ''),
        'À' => array('1', '0', '', ''),
        'Á' => array('1', '0', '', ''),
        'Â' => array('1', '0', '', ''),
        'Ã' => array('1', '0', '', ''),
        'Ä' => array('1', '0', '1', '', '0', '', ''),
        'Å' => array('1', '0', '', ''),
        'Ă' => array('1', '0', '', ''),
        'Ą' => array('1', '', '', '', '', '', '6'),
        'Ạ' => array('1', '0', '', ''),
        'Ả' => array('1', '0', '', ''),
        'Ấ' => array('1', '0', '', ''),
        'Ầ' => array('1', '0', '', ''),
        'Ẩ' => array('1', '0', '', ''),
        'Ẫ' => array('1', '0', '', ''),
        'Ậ' => array('1', '0', '', ''),
        'Ắ' => array('1', '0', '', ''),
        'Ằ' => array('1', '0', '', ''),
        'Ẳ' => array('1', '0', '', ''),
        'Ẵ' => array('1', '0', '', ''),
        'Ặ' => array('1', '0', '', ''),
        'AE' => array('1', '0', '1', ''),
        'Æ' => array('1', '0', '1', ''),
        'AI' => array('1', '0', '1', ''),
        'AJ' => array('1', '0', '1', ''),
        'AU' => array('1', '0', '7', ''),
        'AV' => array('1', '0', '7', '', '7', '7', '7'),
        'ÄU' => array('1', '0', '1', ''),
        'AY' => array('1', '0', '1', ''),
        'B' => array('0', '7', '7', '7'),
        'C' => array('0', '5', '5', '5', '34', '4', '4'),
        'Ć' => array('0', '4', '4', '4'),
        'Č' => array('0', '4', '4', '4'),
        'Ç' => array('0', '4', '4', '4'),
        'CH' => array('0', '5', '5', '5', '34', '4', '4'),
        'CHS' => array('0', '5', '54', '54'),
        'CK' => array('0', '5', '5', '5', '45', '45', '45'),
        'CCS' => array('0', '4', '4', '4'),
        'CS' => array('0', '4', '4', '4'),
        'CSZ' => array('0', '4', '4', '4'),
        'CZ' => array('0', '4', '4', '4'),
        'CZS' => array('0', '4', '4', '4'),
        'D' => array('0', '3', '3', '3'),
        'Ď' => array('0', '3', '3', '3'),
        'Đ' => array('0', '3', '3', '3'),
        'DRS' => array('0', '4', '4', '4'),
        'DRZ' => array('0', '4', '4', '4'),
        'DS' => array('0', '4', '4', '4'),
        'DSH' => array('0', '4', '4', '4'),
        'DSZ' => array('0', '4', '4', '4'),
        'DT' => array('0', '3', '3', '3'),
        'DDZ' => array('0', '4', '4', '4'),
        'DDZS' => array('0', '4', '4', '4'),
        'DZ' => array('0', '4', '4', '4'),
        'DŹ' => array('0', '4', '4', '4'),
        'DŻ' => array('0', '4', '4', '4'),
        'DZH' => array('0', '4', '4', '4'),
        'DZS' => array('0', '4', '4', '4'),
        'E' => array('1', '0', '', ''),
        'È' => array('1', '0', '', ''),
        'É' => array('1', '0', '', ''),
        'Ê' => array('1', '0', '', ''),
        'Ë' => array('1', '0', '', ''),
        'Ĕ' => array('1', '0', '', ''),
        'Ė' => array('1', '0', '', ''),
        'Ę' => array('1', '', '', '6', '', '', ''),
        'Ẹ' => array('1', '0', '', ''),
        'Ẻ' => array('1', '0', '', ''),
        'Ẽ' => array('1', '0', '', ''),
        'Ế' => array('1', '0', '', ''),
        'Ề' => array('1', '0', '', ''),
        'Ể' => array('1', '0', '', ''),
        'Ễ' => array('1', '0', '', ''),
        'Ệ' => array('1', '0', '', ''),
        'EAU' => array('1', '0', '', ''),
        'EI' => array('1', '0', '1', ''),
        'EJ' => array('1', '0', '1', ''),
        'EU' => array('1', '1', '1', ''),
        'EY' => array('1', '0', '1', ''),
        'F' => array('0', '7', '7', '7'),
        'FB' => array('0', '7', '7', '7'),
        'G' => array('0', '5', '5', '5', '34', '4', '4'),
        'Ğ' => array('0', '', '', ''),
        'GGY' => array('0', '5', '5', '5'),
        'GY' => array('0', '5', '5', '5'),
        'H' => array('0', '5', '5', '', '5', '5', '5'),
        'I' => array('1', '0', '', ''),
        'Ì' => array('1', '0', '', ''),
        'Í' => array('1', '0', '', ''),
        'Î' => array('1', '0', '', ''),
        'Ï' => array('1', '0', '', ''),
        'Ĩ' => array('1', '0', '', ''),
        'Į' => array('1', '0', '', ''),
        'İ' => array('1', '0', '', ''),
        'Ỉ' => array('1', '0', '', ''),
        'Ị' => array('1', '0', '', ''),
        'IA' => array('1', '1', '', ''),
        'IE' => array('1', '1', '', ''),
        'IO' => array('1', '1', '', ''),
        'IU' => array('1', '1', '', ''),
        'J' => array('0', '1', '', '', '4', '4', '4', '5', '5', ''),
        'K' => array('0', '5', '5', '5'),
        'KH' => array('0', '5', '5', '5'),
        'KS' => array('0', '5', '54', '54'),
        'L' => array('0', '8', '8', '8'),
        'Ľ' => array('0', '8', '8', '8'),
        'Ĺ' => array('0', '8', '8', '8'),
        'Ł' => array('0', '7', '7', '7', '8', '8', '8'),
        'LL' => array('0', '8', '8', '8', '58', '8', '8', '1', '8', '8'),
        'LLY' => array('0', '8', '8', '8', '1', '8', '8'),
        'LY' => array('0', '8', '8', '8', '1', '8', '8'),
        'M' => array('0', '6', '6', '6'),
        'MĔ' => array('0', '66', '66', '66'),
        'MN' => array('0', '66', '66', '66'),
        'N' => array('0', '6', '6', '6'),
        'Ń' => array('0', '6', '6', '6'),
        'Ň' => array('0', '6', '6', '6'),
        'Ñ' => array('0', '6', '6', '6'),
        'NM' => array('0', '66', '66', '66'),
        'O' => array('1', '0', '', ''),
        'Ò' => array('1', '0', '', ''),
        'Ó' => array('1', '0', '', ''),
        'Ô' => array('1', '0', '', ''),
        'Õ' => array('1', '0', '', ''),
        'Ö' => array('1', '0', '', ''),
        'Ø' => array('1', '0', '', ''),
        'Ő' => array('1', '0', '', ''),
        'Œ' => array('1', '0', '', ''),
        'Ơ' => array('1', '0', '', ''),
        'Ọ' => array('1', '0', '', ''),
        'Ỏ' => array('1', '0', '', ''),
        'Ố' => array('1', '0', '', ''),
        'Ồ' => array('1', '0', '', ''),
        'Ổ' => array('1', '0', '', ''),
        'Ỗ' => array('1', '0', '', ''),
        'Ộ' => array('1', '0', '', ''),
        'Ớ' => array('1', '0', '', ''),
        'Ờ' => array('1', '0', '', ''),
        'Ở' => array('1', '0', '', ''),
        'Ỡ' => array('1', '0', '', ''),
        'Ợ' => array('1', '0', '', ''),
        'OE' => array('1', '0', '', ''),
        'OI' => array('1', '0', '1', ''),
        'OJ' => array('1', '0', '1', ''),
        'OU' => array('1', '0', '', ''),
        'OY' => array('1', '0', '1', ''),
        'P' => array('0', '7', '7', '7'),
        'PF' => array('0', '7', '7', '7'),
        'PH' => array('0', '7', '7', '7'),
        'Q' => array('0', '5', '5', '5'),
        'R' => array('0', '9', '9', '9'),
        'Ř' => array('0', '4', '4', '4'),
        'RS' => array('0', '4', '4', '4', '94', '94', '94'),
        'RZ' => array('0', '4', '4', '4', '94', '94', '94'),
        'S' => array('0', '4', '4', '4'),
        'Ś' => array('0', '4', '4', '4'),
        'Š' => array('0', '4', '4', '4'),
        'Ş' => array('0', '4', '4', '4'),
        'SC' => array('0', '2', '4', '4'),
        'ŠČ' => array('0', '2', '4', '4'),
        'SCH' => array('0', '4', '4', '4'),
        'SCHD' => array('0', '2', '43', '43'),
        'SCHT' => array('0', '2', '43', '43'),
        'SCHTCH' => array('0', '2', '4', '4'),
        'SCHTSCH' => array('0', '2', '4', '4'),
        'SCHTSH' => array('0', '2', '4', '4'),
        'SD' => array('0', '2', '43', '43'),
        'SH' => array('0', '4', '4', '4'),
        'SHCH' => array('0', '2', '4', '4'),
        'SHD' => array('0', '2', '43', '43'),
        'SHT' => array('0', '2', '43', '43'),
        'SHTCH' => array('0', '2', '4', '4'),
        'SHTSH' => array('0', '2', '4', '4'),
        'ß' => array('0', '', '4', '4'),
        'ST' => array('0', '2', '43', '43'),
        'STCH' => array('0', '2', '4', '4'),
        'STRS' => array('0', '2', '4', '4'),
        'STRZ' => array('0', '2', '4', '4'),
        'STSCH' => array('0', '2', '4', '4'),
        'STSH' => array('0', '2', '4', '4'),
        'SSZ' => array('0', '4', '4', '4'),
        'SZ' => array('0', '4', '4', '4'),
        'SZCS' => array('0', '2', '4', '4'),
        'SZCZ' => array('0', '2', '4', '4'),
        'SZD' => array('0', '2', '43', '43'),
        'SZT' => array('0', '2', '43', '43'),
        'T' => array('0', '3', '3', '3'),
        'Ť' => array('0', '3', '3', '3'),
        'Ţ' => array('0', '3', '3', '3', '4', '4', '4'),
        'TC' => array('0', '4', '4', '4'),
        'TCH' => array('0', '4', '4', '4'),
        'TH' => array('0', '3', '3', '3'),
        'TRS' => array('0', '4', '4', '4'),
        'TRZ' => array('0', '4', '4', '4'),
        'TS' => array('0', '4', '4', '4'),
        'TSCH' => array('0', '4', '4', '4'),
        'TSH' => array('0', '4', '4', '4'),
        'TSZ' => array('0', '4', '4', '4'),
        'TTCH' => array('0', '4', '4', '4'),
        'TTS' => array('0', '4', '4', '4'),
        'TTSCH' => array('0', '4', '4', '4'),
        'TTSZ' => array('0', '4', '4', '4'),
        'TTZ' => array('0', '4', '4', '4'),
        'TZ' => array('0', '4', '4', '4'),
        'TZS' => array('0', '4', '4', '4'),
        'U' => array('1', '0', '', ''),
        'Ù' => array('1', '0', '', ''),
        'Ú' => array('1', '0', '', ''),
        'Û' => array('1', '0', '', ''),
        'Ü' => array('1', '0', '', ''),
        'Ũ' => array('1', '0', '', ''),
        'Ū' => array('1', '0', '', ''),
        'Ů' => array('1', '0', '', ''),
        'Ű' => array('1', '0', '', ''),
        'Ų' => array('1', '0', '', ''),
        'Ư' => array('1', '0', '', ''),
        'Ụ' => array('1', '0', '', ''),
        'Ủ' => array('1', '0', '', ''),
        'Ứ' => array('1', '0', '', ''),
        'Ừ' => array('1', '0', '', ''),
        'Ử' => array('1', '0', '', ''),
        'Ữ' => array('1', '0', '', ''),
        'Ự' => array('1', '0', '', ''),
        'UE' => array('1', '0', '', ''),
        'UI' => array('1', '0', '1', ''),
        'UJ' => array('1', '0', '1', ''),
        'UY' => array('1', '0', '1', ''),
        'UW' => array('1', '0', '1', '', '0', '7', '7'),
        'V' => array('0', '7', '7', '7'),
        'W' => array('0', '7', '7', '7'),
        'X' => array('0', '5', '54', '54'),
        'Y' => array('1', '1', '', ''),
        'Ý' => array('1', '1', '', ''),
        'Ỳ' => array('1', '1', '', ''),
        'Ỵ' => array('1', '1', '', ''),
        'Ỷ' => array('1', '1', '', ''),
        'Ỹ' => array('1', '1', '', ''),
        'Z' => array('0', '4', '4', '4'),
        'Ź' => array('0', '4', '4', '4'),
        'Ż' => array('0', '4', '4', '4'),
        'Ž' => array('0', '4', '4', '4'),
        'ZD' => array('0', '2', '43', '43'),
        'ZDZ' => array('0', '2', '4', '4'),
        'ZDZH' => array('0', '2', '4', '4'),
        'ZH' => array('0', '4', '4', '4'),
        'ZHD' => array('0', '2', '43', '43'),
        'ZHDZH' => array('0', '2', '4', '4'),
        'ZS' => array('0', '4', '4', '4'),
        'ZSCH' => array('0', '4', '4', '4'),
        'ZSH' => array('0', '4', '4', '4'),
        'ZZS' => array('0', '4', '4', '4'),
        // Cyrillic alphabet
        'А' => array('1', '0', '', ''),
        'Б' => array('0', '7', '7', '7'),
        'В' => array('0', '7', '7', '7'),
        'Г' => array('0', '5', '5', '5'),
        'Д' => array('0', '3', '3', '3'),
        'ДЗ' => array('0', '4', '4', '4'),
        'Е' => array('1', '0', '', ''),
        'Ё' => array('1', '0', '', ''),
        'Ж' => array('0', '4', '4', '4'),
        'З' => array('0', '4', '4', '4'),
        'И' => array('1', '0', '', ''),
        'Й' => array('1', '1', '', '', '4', '4', '4'),
        'К' => array('0', '5', '5', '5'),
        'Л' => array('0', '8', '8', '8'),
        'М' => array('0', '6', '6', '6'),
        'Н' => array('0', '6', '6', '6'),
        'О' => array('1', '0', '', ''),
        'П' => array('0', '7', '7', '7'),
        'Р' => array('0', '9', '9', '9'),
        'РЖ' => array('0', '4', '4', '4'),
        'С' => array('0', '4', '4', '4'),
        'Т' => array('0', '3', '3', '3'),
        'У' => array('1', '0', '', ''),
        'Ф' => array('0', '7', '7', '7'),
        'Х' => array('0', '5', '5', '5'),
        'Ц' => array('0', '4', '4', '4'),
        'Ч' => array('0', '4', '4', '4'),
        'Ш' => array('0', '4', '4', '4'),
        'Щ' => array('0', '2', '4', '4'),
        'Ъ' => array('0', '', '', ''),
        'Ы' => array('0', '1', '', ''),
        'Ь' => array('0', '', '', ''),
        'Э' => array('1', '0', '', ''),
        'Ю' => array('0', '1', '', ''),
        'Я' => array('0', '1', '', ''),
        // Greek alphabet
        'Α' => array('1', '0', '', ''),
        'Ά' => array('1', '0', '', ''),
        'ΑΙ' => array('1', '0', '1', ''),
        'ΑΥ' => array('1', '0', '1', ''),
        'Β' => array('0', '7', '7', '7'),
        'Γ' => array('0', '5', '5', '5'),
        'Δ' => array('0', '3', '3', '3'),
        'Ε' => array('1', '0', '', ''),
        'Έ' => array('1', '0', '', ''),
        'ΕΙ' => array('1', '0', '1', ''),
        'ΕΥ' => array('1', '1', '1', ''),
        'Ζ' => array('0', '4', '4', '4'),
        'Η' => array('1', '0', '', ''),
        'Ή' => array('1', '0', '', ''),
        'Θ' => array('0', '3', '3', '3'),
        'Ι' => array('1', '0', '', ''),
        'Ί' => array('1', '0', '', ''),
        'Ϊ' => array('1', '0', '', ''),
        'ΐ' => array('1', '0', '', ''),
        'Κ' => array('0', '5', '5', '5'),
        'Λ' => array('0', '8', '8', '8'),
        'Μ' => array('0', '6', '6', '6'),
        'ΜΠ' => array('0', '7', '7', '7'),
        'Ν' => array('0', '6', '6', '6'),
        'ΝΤ' => array('0', '3', '3', '3'),
        'Ξ' => array('0', '5', '54', '54'),
        'Ο' => array('1', '0', '', ''),
        'Ό' => array('1', '0', '', ''),
        'ΟΙ' => array('1', '0', '1', ''),
        'ΟΥ' => array('1', '0', '1', ''),
        'Π' => array('0', '7', '7', '7'),
        'Ρ' => array('0', '9', '9', '9'),
        'Σ' => array('0', '4', '4', '4'),
        'ς' => array('0', '', '', '4'),
        'Τ' => array('0', '3', '3', '3'),
        'ΤΖ' => array('0', '4', '4', '4'),
        'ΤΣ' => array('0', '4', '4', '4'),
        'Υ' => array('1', '1', '', ''),
        'Ύ' => array('1', '1', '', ''),
        'Ϋ' => array('1', '1', '', ''),
        'ΰ' => array('1', '1', '', ''),
        'ΥΚ' => array('1', '5', '5', '5'),
        'ΥΥ' => array('1', '65', '65', '65'),
        'Φ' => array('0', '7', '7', '7'),
        'Χ' => array('0', '5', '5', '5'),
        'Ψ' => array('0', '7', '7', '7'),
        'Ω' => array('1', '0', '', ''),
        'Ώ' => array('1', '0', '', ''),
        // Hebrew alphabet
        'א' => array('1', '0', '', ''),
        'או' => array('1', '0', '7', ''),
        'אג' => array('1', '4', '4', '4', '5', '5', '5', '34', '34', '34'),
        'בב' => array('0', '7', '7', '7', '77', '77', '77'),
        'ב' => array('0', '7', '7', '7'),
        'גג' => array('0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'),
        'גד' => array('0', '43', '43', '43', '53', '53', '53'),
        'גה' => array('0', '45', '45', '45', '55', '55', '55'),
        'גז' => array('0', '44', '44', '44', '45', '45', '45'),
        'גח' => array('0', '45', '45', '45', '55', '55', '55'),
        'גכ' => array('0', '45', '45', '45', '55', '55', '55'),
        'גך' => array('0', '45', '45', '45', '55', '55', '55'),
        'גצ' => array('0', '44', '44', '44', '45', '45', '45'),
        'גץ' => array('0', '44', '44', '44', '45', '45', '45'),
        'גק' => array('0', '45', '45', '45', '54', '54', '54'),
        'גש' => array('0', '44', '44', '44', '54', '54', '54'),
        'גת' => array('0', '43', '43', '43', '53', '53', '53'),
        'ג' => array('0', '4', '4', '4', '5', '5', '5'),
        'דז' => array('0', '4', '4', '4'),
        'דד' => array('0', '3', '3', '3', '33', '33', '33'),
        'דט' => array('0', '33', '33', '33'),
        'דש' => array('0', '4', '4', '4'),
        'דצ' => array('0', '4', '4', '4'),
        'דץ' => array('0', '4', '4', '4'),
        'ד' => array('0', '3', '3', '3'),
        'הג' => array('0', '54', '54', '54', '55', '55', '55'),
        'הכ' => array('0', '55', '55', '55'),
        'הח' => array('0', '55', '55', '55'),
        'הק' => array('0', '55', '55', '55', '5', '5', '5'),
        'הה' => array('0', '5', '5', '', '55', '55', ''),
        'ה' => array('0', '5', '5', ''),
        'וי' => array('1', '', '', '', '7', '7', '7'),
        'ו' => array('1', '7', '7', '7', '7', '', ''),
        'וו' => array('1', '7', '7', '7', '7', '', ''),
        'וופ' => array('1', '7', '7', '7', '77', '77', '77'),
        'זש' => array('0', '4', '4', '4', '44', '44', '44'),
        'זדז' => array('0', '2', '4', '4'),
        'ז' => array('0', '4', '4', '4'),
        'זג' => array('0', '44', '44', '44', '45', '45', '45'),
        'זז' => array('0', '4', '4', '4', '44', '44', '44'),
        'זס' => array('0', '44', '44', '44'),
        'זצ' => array('0', '44', '44', '44'),
        'זץ' => array('0', '44', '44', '44'),
        'חג' => array('0', '54', '54', '54', '53', '53', '53'),
        'חח' => array('0', '5', '5', '5', '55', '55', '55'),
        'חק' => array('0', '55', '55', '55', '5', '5', '5'),
        'חכ' => array('0', '45', '45', '45', '55', '55', '55'),
        'חס' => array('0', '5', '54', '54'),
        'חש' => array('0', '5', '54', '54'),
        'ח' => array('0', '5', '5', '5'),
        'טש' => array('0', '4', '4', '4'),
        'טד' => array('0', '33', '33', '33'),
        'טי' => array('0', '3', '3', '3', '4', '4', '4', '3', '3', '34'),
        'טת' => array('0', '33', '33', '33'),
        'טט' => array('0', '3', '3', '3', '33', '33', '33'),
        'ט' => array('0', '3', '3', '3'),
        'י' => array('1', '1', '', ''),
        'יא' => array('1', '1', '', '', '1', '1', '1'),
        'כג' => array('0', '55', '55', '55', '54', '54', '54'),
        'כש' => array('0', '5', '54', '54'),
        'כס' => array('0', '5', '54', '54'),
        'ככ' => array('0', '5', '5', '5', '55', '55', '55'),
        'כך' => array('0', '5', '5', '5', '55', '55', '55'),
        'כ' => array('0', '5', '5', '5'),
        'כח' => array('0', '55', '55', '55', '5', '5', '5'),
        'ך' => array('0', '', '5', '5'),
        'ל' => array('0', '8', '8', '8'),
        'לל' => array('0', '88', '88', '88', '8', '8', '8'),
        'מנ' => array('0', '66', '66', '66'),
        'מן' => array('0', '66', '66', '66'),
        'ממ' => array('0', '6', '6', '6', '66', '66', '66'),
        'מם' => array('0', '6', '6', '6', '66', '66', '66'),
        'מ' => array('0', '6', '6', '6'),
        'ם' => array('0', '', '6', '6'),
        'נמ' => array('0', '66', '66', '66'),
        'נם' => array('0', '66', '66', '66'),
        'ננ' => array('0', '6', '6', '6', '66', '66', '66'),
        'נן' => array('0', '6', '6', '6', '66', '66', '66'),
        'נ' => array('0', '6', '6', '6'),
        'ן' => array('0', '', '6', '6'),
        'סתש' => array('0', '2', '4', '4'),
        'סתז' => array('0', '2', '4', '4'),
        'סטז' => array('0', '2', '4', '4'),
        'סטש' => array('0', '2', '4', '4'),
        'סצד' => array('0', '2', '4', '4'),
        'סט' => array('0', '2', '4', '4', '43', '43', '43'),
        'סת' => array('0', '2', '4', '4', '43', '43', '43'),
        'סג' => array('0', '44', '44', '44', '4', '4', '4'),
        'סס' => array('0', '4', '4', '4', '44', '44', '44'),
        'סצ' => array('0', '44', '44', '44'),
        'סץ' => array('0', '44', '44', '44'),
        'סז' => array('0', '44', '44', '44'),
        'סש' => array('0', '44', '44', '44'),
        'ס' => array('0', '4', '4', '4'),
        'ע' => array('1', '0', '', ''),
        'פב' => array('0', '7', '7', '7', '77', '77', '77'),
        'פוו' => array('0', '7', '7', '7', '77', '77', '77'),
        'פפ' => array('0', '7', '7', '7', '77', '77', '77'),
        'פף' => array('0', '7', '7', '7', '77', '77', '77'),
        'פ' => array('0', '7', '7', '7'),
        'ף' => array('0', '', '7', '7'),
        'צג' => array('0', '44', '44', '44', '45', '45', '45'),
        'צז' => array('0', '44', '44', '44'),
        'צס' => array('0', '44', '44', '44'),
        'צצ' => array('0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'),
        'צץ' => array('0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'),
        'צש' => array('0', '44', '44', '44', '4', '4', '4', '5', '5', '5'),
        'צ' => array('0', '4', '4', '4', '5', '5', '5'),
        'ץ' => array('0', '', '4', '4'),
        'קה' => array('0', '55', '55', '5'),
        'קס' => array('0', '5', '54', '54'),
        'קש' => array('0', '5', '54', '54'),
        'קק' => array('0', '5', '5', '5', '55', '55', '55'),
        'קח' => array('0', '55', '55', '55'),
        'קכ' => array('0', '55', '55', '55'),
        'קך' => array('0', '55', '55', '55'),
        'קג' => array('0', '55', '55', '55', '54', '54', '54'),
        'ק' => array('0', '5', '5', '5'),
        'רר' => array('0', '99', '99', '99', '9', '9', '9'),
        'ר' => array('0', '9', '9', '9'),
        'שטז' => array('0', '2', '4', '4'),
        'שתש' => array('0', '2', '4', '4'),
        'שתז' => array('0', '2', '4', '4'),
        'שטש' => array('0', '2', '4', '4'),
        'שד' => array('0', '2', '43', '43'),
        'שז' => array('0', '44', '44', '44'),
        'שס' => array('0', '44', '44', '44'),
        'שת' => array('0', '2', '43', '43'),
        'שג' => array('0', '4', '4', '4', '44', '44', '44', '4', '43', '43'),
        'שט' => array('0', '2', '43', '43', '44', '44', '44'),
        'שצ' => array('0', '44', '44', '44', '45', '45', '45'),
        'שץ' => array('0', '44', '', '44', '45', '', '45'),
        'שש' => array('0', '4', '4', '4', '44', '44', '44'),
        'ש' => array('0', '4', '4', '4'),
        'תג' => array('0', '34', '34', '34'),
        'תז' => array('0', '34', '34', '34'),
        'תש' => array('0', '4', '4', '4'),
        'תת' => array('0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'),
        'ת' => array('0', '3', '3', '3', '4', '4', '4'),
        // Arabic alphabet
        'ا' => array('1', '0', '', ''),
        'ب' => array('0', '7', '7', '7'),
        'ت' => array('0', '3', '3', '3'),
        'ث' => array('0', '3', '3', '3'),
        'ج' => array('0', '4', '4', '4'),
        'ح' => array('0', '5', '5', '5'),
        'خ' => array('0', '5', '5', '5'),
        'د' => array('0', '3', '3', '3'),
        'ذ' => array('0', '3', '3', '3'),
        'ر' => array('0', '9', '9', '9'),
        'ز' => array('0', '4', '4', '4'),
        'س' => array('0', '4', '4', '4'),
        'ش' => array('0', '4', '4', '4'),
        'ص' => array('0', '4', '4', '4'),
        'ض' => array('0', '3', '3', '3'),
        'ط' => array('0', '3', '3', '3'),
        'ظ' => array('0', '4', '4', '4'),
        'ع' => array('1', '0', '', ''),
        'غ' => array('0', '0', '', ''),
        'ف' => array('0', '7', '7', '7'),
        'ق' => array('0', '5', '5', '5'),
        'ك' => array('0', '5', '5', '5'),
        'ل' => array('0', '8', '8', '8'),
        'لا' => array('0', '8', '8', '8'),
        'م' => array('0', '6', '6', '6'),
        'ن' => array('0', '6', '6', '6'),
        'هن' => array('0', '66', '66', '66'),
        'ه' => array('0', '5', '5', ''),
        'و' => array('1', '', '', '', '7', '', ''),
        'ي' => array('0', '1', '', ''),
        'آ' => array('0', '1', '', ''),
        'ة' => array('0', '', '', '3'),
        'ی' => array('0', '1', '', ''),
        'ى' => array('1', '1', '', ''),
    );

    /**
     * Find the longest key of the DM sound table that starts at a given byte offset.
     *
     * @param string  $name Upper-cased, transformed name
     * @param integer $pos  Byte offset into $name
     *
     * @return string The matching table key, or an empty string if there is none
     */
    private static function longestSoundMatch($name, $pos) {
        // Start with the longest possible chunk and drop one byte at a time.
        // The cast covers PHP versions where substr() returns false.
        $chunk = (string) substr($name, $pos, self::MAXCHAR);
        while ($chunk !== '' && !isset(self::$dmsounds[$chunk])) {
            $chunk = (string) substr($chunk, 0, -1);
        }

        return $chunk;
    }

    /**
     * Convert an accumulated list of sound values into a 6-digit D-M code.
     *
     * @param string[] $workingEntry Sound values, including the '!' markers
     *
     * @return null|string The zero-padded code, or null if no sound was recorded
     */
    private static function formatDmCode(array $workingEntry) {
        $digits = str_replace('!', '', implode('', $workingEntry));

        // Only return codes from recognisable sounds.  The comparison must be
        // strict: a lone leading vowel gives the string "0", which PHP would
        // otherwise treat as false.
        if ($digits === '') {
            return null;
        }

        return substr($digits . '000000', 0, 6);
    }

    /**
     * Generate the Daitch–Mokotoff codes for a single word.
     *
     * @param string $name
     *
     * @return string[] List of possible DM codes for the word.
     */
    private static function daitchMokotoffWord($name) {
        // Apply special transformation rules to the input string
        $name = I18N::strtoupper($name);
        foreach (self::$transformNameTable as $transformRule) {
            $name = str_replace($transformRule[0], $transformRule[1], $name);
        }

        // Initialize
        $name_script = I18N::textScript($name);
        $noVowels    = ($name_script == 'Hebr' || $name_script == 'Arab');

        $lastPos       = strlen($name) - 1;
        $currPos       = 0;
        $state         = 1; // 1: start of input string, 2: before vowel, 3: other
        $result        = array(); // accumulate complete 6-digit D-M codes here
        $partialResult = array(); // accumulate incomplete D-M codes here
        $partialResult[] = array('!'); // initialize 1st partial result ('!' stops "duplicate sound" check)

        // Loop through the input string.
        // Stop when the string is exhausted or when no more partial results remain
        while (count($partialResult) !== 0 && $currPos <= $lastPos) {
            // Find the DM coding table entry for the chunk at the current position
            $thisEntry = self::longestSoundMatch($name, $currPos);
            if ($thisEntry === '') {
                $currPos++; // Not in table: advance pointer to next byte
                continue;   // and try again
            }

            $soundTableEntry = self::$dmsounds[$thisEntry];
            $entryLength     = count($soundTableEntry);
            $workingResult   = $partialResult;
            $partialResult   = array();
            $currPos += strlen($thisEntry);

            // Not at beginning of input string
            if ($state != 1) {
                // Determine whether the next chunk is a vowel
                $nextEntry = $currPos <= $lastPos ? self::longestSoundMatch($name, $currPos) : '';
                if ($nextEntry !== '' && self::$dmsounds[$nextEntry][0] != '0') {
                    // Next chunk is a vowel
                    $state = 2;
                } else {
                    // Next chunk is not a vowel (or there is no next chunk)
                    $state = 3;
                }
            }

            // Visit the sound value for this state in every triplet (branch) of the entry
            while ($state < $entryLength) {
                $sound = $soundTableEntry[$state];
                if ($sound == '') {
                    // empty means 'ignore this sound in this state'
                    foreach ($workingResult as $workingEntry) {
                        $workingEntry[count($workingEntry) - 1] .= '!'; // Prevent false 'doubles'
                        $partialResult[] = $workingEntry;
                    }
                } else {
                    foreach ($workingResult as $workingEntry) {
                        if ($sound !== $workingEntry[count($workingEntry) - 1]) {
                            // Incoming sound isn't a duplicate of the previous sound
                            $workingEntry[] = $sound;
                        } elseif ($noVowels) {
                            // Incoming sound is a duplicate of the previous sound.
                            // For Hebrew and Arabic the repeated sound is kept;
                            // for other scripts it is dropped.
                            $workingEntry[] = $sound;
                        }
                        if (count($workingEntry) < 7) {
                            $partialResult[] = $workingEntry;
                        } else {
                            // This is the 6th code in the sequence
                            // We're looking for 7 entries because the first is '!' and doesn't count
                            $code = self::formatDmCode($workingEntry);
                            if ($code !== null) {
                                $result[] = $code;
                            }
                        }
                    }
                }
                $state += 3; // Advance to next triplet while keeping the same basic state
            }
        }

        // Zero-fill and copy all remaining partial results
        foreach ($partialResult as $workingEntry) {
            $code = self::formatDmCode($workingEntry);
            if ($code !== null) {
                $result[] = $code;
            }
        }

        return $result;
    }
}