<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees;

use function array_slice;
use function count;
use function strlen;

/**
 * Phonetic matching of strings.
 *
 * Two algorithms are provided: Russell soundex (PHP's built-in soundex()) and
 * Daitch–Mokotoff soundex (table driven, see DM_SOUNDS). Both return a list of
 * codes joined with ':' so that a single text can carry several alternatives.
 */
class Soundex
{
    // Determine the Daitch–Mokotoff Soundex code for a word
    // Original implementation by Gerry Kroll, and analysis by Meliza Amity

    // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
    private const MAXCHAR = 7;

    /**
     * Name transformation arrays.
     * Used to transform the Name string to simplify the "sounds like" table.
     * This is especially useful in Hebrew.
     *
     * Each array entry defines the "from" and "to" arguments of a
     * str_replace($from, $to, $text) call. The rules are applied in the order
     * listed, each one to the output of the previous one.
     *
     * Note about the use of "\x01":
     * This code, which can’t legitimately occur in the kind of text we're dealing with,
     * is used as a place-holder so that conditional string replacements can be done.
     *
     * NOTE(review): the two rules ending in "$" look like regular-expression
     * end-of-string anchors, but str_replace() treats "$" as a literal character,
     * so they only fire on text that really contains "$". Left unchanged because
     * altering it would change the codes generated for existing names - confirm
     * the intended behaviour before touching it.
     */
    private const TRANSFORM_NAMES = [
        // Force Yiddish ligatures to be treated as separate letters
        ['װ', 'וו'],
        ['ײ', 'יי'],
        ['ױ', 'וי'],
        ['בו', 'בע'],
        ['פו', 'פע'],
        ['ומ', 'עמ'],
        ['ום', 'עם'],
        ['ונ', 'ענ'],
        ['ון', 'ען'],
        ['וו', 'ב'],
        ["\x01", ''],
        ['ייה$', "\x01ה"],
        ['ייע$', "\x01ע"],
        ['יי', 'ע'],
        ["\x01", 'יי'],
    ];

    /**
     * The DM sound coding table is organized this way:
     * key: a variable-length string that corresponds to the UTF-8 character sequence
     * represented by the table entry. Currently, that string can be up to 7
     * bytes long. This maximum length is defined by the class constant
     * self::MAXCHAR.
     *
     * value: an array as follows:
     * [0]: zero if not a vowel
     * [1]: sound value when this string is at the beginning of the word
     * [2]: sound value when this string is followed by a vowel
     * [3]: sound value for other cases
     * [1],[2],[3] can be repeated several times to create branches in the code
     * an empty sound value means "ignore in this state"
     */
    private const DM_SOUNDS = [
        'A' => ['1', '0', '', ''],
        'À' => ['1', '0', '', ''],
        'Á' => ['1', '0', '', ''],
        'Â' => ['1', '0', '', ''],
        'Ã' => ['1', '0', '', ''],
        'Ä' => ['1', '0', '1', '', '0', '', ''],
        'Å' => ['1', '0', '', ''],
        'Ă' => ['1', '0', '', ''],
        'Ą' => ['1', '', '', '', '', '', '6'],
        'Ạ' => ['1', '0', '', ''],
        'Ả' => ['1', '0', '', ''],
        'Ấ' => ['1', '0', '', ''],
        'Ầ' => ['1', '0', '', ''],
        'Ẩ' => ['1', '0', '', ''],
        'Ẫ' => ['1', '0', '', ''],
        'Ậ' => ['1', '0', '', ''],
        'Ắ' => ['1', '0', '', ''],
        'Ằ' => ['1', '0', '', ''],
        'Ẳ' => ['1', '0', '', ''],
        'Ẵ' => ['1', '0', '', ''],
        'Ặ' => ['1', '0', '', ''],
        'AE' => ['1', '0', '1', ''],
        'Æ' => ['1', '0', '1', ''],
        'AI' => ['1', '0', '1', ''],
        'AJ' => ['1', '0', '1', ''],
        'AU' => ['1', '0', '7', ''],
        'AV' => ['1', '0', '7', '', '7', '7', '7'],
        'ÄU' => ['1', '0', '1', ''],
        'AY' => ['1', '0', '1', ''],
        'B' => ['0', '7', '7', '7'],
        'C' => ['0', '5', '5', '5', '34', '4', '4'],
        'Ć' => ['0', '4', '4', '4'],
        'Č' => ['0', '4', '4', '4'],
        'Ç' => ['0', '4', '4', '4'],
        'CH' => ['0', '5', '5', '5', '34', '4', '4'],
        'CHS' => ['0', '5', '54', '54'],
        'CK' => ['0', '5', '5', '5', '45', '45', '45'],
        'CCS' => ['0', '4', '4', '4'],
        'CS' => ['0', '4', '4', '4'],
        'CSZ' => ['0', '4', '4', '4'],
        'CZ' => ['0', '4', '4', '4'],
        'CZS' => ['0', '4', '4', '4'],
        'D' => ['0', '3', '3', '3'],
        'Ď' => ['0', '3', '3', '3'],
        'Đ' => ['0', '3', '3', '3'],
        'DRS' => ['0', '4', '4', '4'],
        'DRZ' => ['0', '4', '4', '4'],
        'DS' => ['0', '4', '4', '4'],
        'DSH' => ['0', '4', '4', '4'],
        'DSZ' => ['0', '4', '4', '4'],
        'DT' => ['0', '3', '3', '3'],
        'DDZ' => ['0', '4', '4', '4'],
        'DDZS' => ['0', '4', '4', '4'],
        'DZ' => ['0', '4', '4', '4'],
        'DŹ' => ['0', '4', '4', '4'],
        'DŻ' => ['0', '4', '4', '4'],
        'DZH' => ['0', '4', '4', '4'],
        'DZS' => ['0', '4', '4', '4'],
        'E' => ['1', '0', '', ''],
        'È' => ['1', '0', '', ''],
        'É' => ['1', '0', '', ''],
        'Ê' => ['1', '0', '', ''],
        'Ë' => ['1', '0', '', ''],
        'Ĕ' => ['1', '0', '', ''],
        'Ė' => ['1', '0', '', ''],
        'Ę' => ['1', '', '', '6', '', '', ''],
        'Ẹ' => ['1', '0', '', ''],
        'Ẻ' => ['1', '0', '', ''],
        'Ẽ' => ['1', '0', '', ''],
        'Ế' => ['1', '0', '', ''],
        'Ề' => ['1', '0', '', ''],
        'Ể' => ['1', '0', '', ''],
        'Ễ' => ['1', '0', '', ''],
        'Ệ' => ['1', '0', '', ''],
        'EAU' => ['1', '0', '', ''],
        'EI' => ['1', '0', '1', ''],
        'EJ' => ['1', '0', '1', ''],
        'EU' => ['1', '1', '1', ''],
        'EY' => ['1', '0', '1', ''],
        'F' => ['0', '7', '7', '7'],
        'FB' => ['0', '7', '7', '7'],
        'G' => ['0', '5', '5', '5', '34', '4', '4'],
        'Ğ' => ['0', '', '', ''],
        'GGY' => ['0', '5', '5', '5'],
        'GY' => ['0', '5', '5', '5'],
        'H' => ['0', '5', '5', '', '5', '5', '5'],
        'I' => ['1', '0', '', ''],
        'Ì' => ['1', '0', '', ''],
        'Í' => ['1', '0', '', ''],
        'Î' => ['1', '0', '', ''],
        'Ï' => ['1', '0', '', ''],
        'Ĩ' => ['1', '0', '', ''],
        'Į' => ['1', '0', '', ''],
        'İ' => ['1', '0', '', ''],
        'Ỉ' => ['1', '0', '', ''],
        'Ị' => ['1', '0', '', ''],
        'IA' => ['1', '1', '', ''],
        'IE' => ['1', '1', '', ''],
        'IO' => ['1', '1', '', ''],
        'IU' => ['1', '1', '', ''],
        'J' => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
        'K' => ['0', '5', '5', '5'],
        'KH' => ['0', '5', '5', '5'],
        'KS' => ['0', '5', '54', '54'],
        'L' => ['0', '8', '8', '8'],
        'Ľ' => ['0', '8', '8', '8'],
        'Ĺ' => ['0', '8', '8', '8'],
        'Ł' => ['0', '7', '7', '7', '8', '8', '8'],
        'LL' => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
        'LLY' => ['0', '8', '8', '8', '1', '8', '8'],
        'LY' => ['0', '8', '8', '8', '1', '8', '8'],
        'M' => ['0', '6', '6', '6'],
        'MĔ' => ['0', '66', '66', '66'],
        'MN' => ['0', '66', '66', '66'],
        'N' => ['0', '6', '6', '6'],
        'Ń' => ['0', '6', '6', '6'],
        'Ň' => ['0', '6', '6', '6'],
        'Ñ' => ['0', '6', '6', '6'],
        'NM' => ['0', '66', '66', '66'],
        'O' => ['1', '0', '', ''],
        'Ò' => ['1', '0', '', ''],
        'Ó' => ['1', '0', '', ''],
        'Ô' => ['1', '0', '', ''],
        'Õ' => ['1', '0', '', ''],
        'Ö' => ['1', '0', '', ''],
        'Ø' => ['1', '0', '', ''],
        'Ő' => ['1', '0', '', ''],
        'Œ' => ['1', '0', '', ''],
        'Ơ' => ['1', '0', '', ''],
        'Ọ' => ['1', '0', '', ''],
        'Ỏ' => ['1', '0', '', ''],
        'Ố' => ['1', '0', '', ''],
        'Ồ' => ['1', '0', '', ''],
        'Ổ' => ['1', '0', '', ''],
        'Ỗ' => ['1', '0', '', ''],
        'Ộ' => ['1', '0', '', ''],
        'Ớ' => ['1', '0', '', ''],
        'Ờ' => ['1', '0', '', ''],
        'Ở' => ['1', '0', '', ''],
        'Ỡ' => ['1', '0', '', ''],
        'Ợ' => ['1', '0', '', ''],
        'OE' => ['1', '0', '', ''],
        'OI' => ['1', '0', '1', ''],
        'OJ' => ['1', '0', '1', ''],
        'OU' => ['1', '0', '', ''],
        'OY' => ['1', '0', '1', ''],
        'P' => ['0', '7', '7', '7'],
        'PF' => ['0', '7', '7', '7'],
        'PH' => ['0', '7', '7', '7'],
        'Q' => ['0', '5', '5', '5'],
        'R' => ['0', '9', '9', '9'],
        'Ř' => ['0', '4', '4', '4'],
        'RS' => ['0', '4', '4', '4', '94', '94', '94'],
        'RZ' => ['0', '4', '4', '4', '94', '94', '94'],
        'S' => ['0', '4', '4', '4'],
        'Ś' => ['0', '4', '4', '4'],
        'Š' => ['0', '4', '4', '4'],
        'Ş' => ['0', '4', '4', '4'],
        'SC' => ['0', '2', '4', '4'],
        'ŠČ' => ['0', '2', '4', '4'],
        'SCH' => ['0', '4', '4', '4'],
        'SCHD' => ['0', '2', '43', '43'],
        'SCHT' => ['0', '2', '43', '43'],
        'SCHTCH' => ['0', '2', '4', '4'],
        'SCHTSCH' => ['0', '2', '4', '4'],
        'SCHTSH' => ['0', '2', '4', '4'],
        'SD' => ['0', '2', '43', '43'],
        'SH' => ['0', '4', '4', '4'],
        'SHCH' => ['0', '2', '4', '4'],
        'SHD' => ['0', '2', '43', '43'],
        'SHT' => ['0', '2', '43', '43'],
        'SHTCH' => ['0', '2', '4', '4'],
        'SHTSH' => ['0', '2', '4', '4'],
        'ß' => ['0', '', '4', '4'],
        'ST' => ['0', '2', '43', '43'],
        'STCH' => ['0', '2', '4', '4'],
        'STRS' => ['0', '2', '4', '4'],
        'STRZ' => ['0', '2', '4', '4'],
        'STSCH' => ['0', '2', '4', '4'],
        'STSH' => ['0', '2', '4', '4'],
        'SSZ' => ['0', '4', '4', '4'],
        'SZ' => ['0', '4', '4', '4'],
        'SZCS' => ['0', '2', '4', '4'],
        'SZCZ' => ['0', '2', '4', '4'],
        'SZD' => ['0', '2', '43', '43'],
        'SZT' => ['0', '2', '43', '43'],
        'T' => ['0', '3', '3', '3'],
        'Ť' => ['0', '3', '3', '3'],
        'Ţ' => ['0', '3', '3', '3', '4', '4', '4'],
        'TC' => ['0', '4', '4', '4'],
        'TCH' => ['0', '4', '4', '4'],
        'TH' => ['0', '3', '3', '3'],
        'TRS' => ['0', '4', '4', '4'],
        'TRZ' => ['0', '4', '4', '4'],
        'TS' => ['0', '4', '4', '4'],
        'TSCH' => ['0', '4', '4', '4'],
        'TSH' => ['0', '4', '4', '4'],
        'TSZ' => ['0', '4', '4', '4'],
        'TTCH' => ['0', '4', '4', '4'],
        'TTS' => ['0', '4', '4', '4'],
        'TTSCH' => ['0', '4', '4', '4'],
        'TTSZ' => ['0', '4', '4', '4'],
        'TTZ' => ['0', '4', '4', '4'],
        'TZ' => ['0', '4', '4', '4'],
        'TZS' => ['0', '4', '4', '4'],
        'U' => ['1', '0', '', ''],
        'Ù' => ['1', '0', '', ''],
        'Ú' => ['1', '0', '', ''],
        'Û' => ['1', '0', '', ''],
        'Ü' => ['1', '0', '', ''],
        'Ũ' => ['1', '0', '', ''],
        'Ū' => ['1', '0', '', ''],
        'Ů' => ['1', '0', '', ''],
        'Ű' => ['1', '0', '', ''],
        'Ų' => ['1', '0', '', ''],
        'Ư' => ['1', '0', '', ''],
        'Ụ' => ['1', '0', '', ''],
        'Ủ' => ['1', '0', '', ''],
        'Ứ' => ['1', '0', '', ''],
        'Ừ' => ['1', '0', '', ''],
        'Ử' => ['1', '0', '', ''],
        'Ữ' => ['1', '0', '', ''],
        'Ự' => ['1', '0', '', ''],
        'UE' => ['1', '0', '', ''],
        'UI' => ['1', '0', '1', ''],
        'UJ' => ['1', '0', '1', ''],
        'UY' => ['1', '0', '1', ''],
        'UW' => ['1', '0', '1', '', '0', '7', '7'],
        'V' => ['0', '7', '7', '7'],
        'W' => ['0', '7', '7', '7'],
        'X' => ['0', '5', '54', '54'],
        'Y' => ['1', '1', '', ''],
        'Ý' => ['1', '1', '', ''],
        'Ỳ' => ['1', '1', '', ''],
        'Ỵ' => ['1', '1', '', ''],
        'Ỷ' => ['1', '1', '', ''],
        'Ỹ' => ['1', '1', '', ''],
        'Z' => ['0', '4', '4', '4'],
        'Ź' => ['0', '4', '4', '4'],
        'Ż' => ['0', '4', '4', '4'],
        'Ž' => ['0', '4', '4', '4'],
        'ZD' => ['0', '2', '43', '43'],
        'ZDZ' => ['0', '2', '4', '4'],
        'ZDZH' => ['0', '2', '4', '4'],
        'ZH' => ['0', '4', '4', '4'],
        'ZHD' => ['0', '2', '43', '43'],
        'ZHDZH' => ['0', '2', '4', '4'],
        'ZS' => ['0', '4', '4', '4'],
        'ZSCH' => ['0', '4', '4', '4'],
        'ZSH' => ['0', '4', '4', '4'],
        'ZZS' => ['0', '4', '4', '4'],
        // Cyrillic alphabet
        'А' => ['1', '0', '', ''],
        'Б' => ['0', '7', '7', '7'],
        'В' => ['0', '7', '7', '7'],
        'Г' => ['0', '5', '5', '5'],
        'Д' => ['0', '3', '3', '3'],
        'ДЗ' => ['0', '4', '4', '4'],
        'Е' => ['1', '0', '', ''],
        'Ё' => ['1', '0', '', ''],
        'Ж' => ['0', '4', '4', '4'],
        'З' => ['0', '4', '4', '4'],
        'И' => ['1', '0', '', ''],
        'Й' => ['1', '1', '', '', '4', '4', '4'],
        'К' => ['0', '5', '5', '5'],
        'Л' => ['0', '8', '8', '8'],
        'М' => ['0', '6', '6', '6'],
        'Н' => ['0', '6', '6', '6'],
        'О' => ['1', '0', '', ''],
        'П' => ['0', '7', '7', '7'],
        'Р' => ['0', '9', '9', '9'],
        'РЖ' => ['0', '4', '4', '4'],
        'С' => ['0', '4', '4', '4'],
        'Т' => ['0', '3', '3', '3'],
        'У' => ['1', '0', '', ''],
        'Ф' => ['0', '7', '7', '7'],
        'Х' => ['0', '5', '5', '5'],
        'Ц' => ['0', '4', '4', '4'],
        'Ч' => ['0', '4', '4', '4'],
        'Ш' => ['0', '4', '4', '4'],
        'Щ' => ['0', '2', '4', '4'],
        'Ъ' => ['0', '', '', ''],
        'Ы' => ['0', '1', '', ''],
        'Ь' => ['0', '', '', ''],
        'Э' => ['1', '0', '', ''],
        'Ю' => ['0', '1', '', ''],
        'Я' => ['0', '1', '', ''],
        // Greek alphabet
        'Α' => ['1', '0', '', ''],
        'Ά' => ['1', '0', '', ''],
        'ΑΙ' => ['1', '0', '1', ''],
        'ΑΥ' => ['1', '0', '1', ''],
        'Β' => ['0', '7', '7', '7'],
        'Γ' => ['0', '5', '5', '5'],
        'Δ' => ['0', '3', '3', '3'],
        'Ε' => ['1', '0', '', ''],
        'Έ' => ['1', '0', '', ''],
        'ΕΙ' => ['1', '0', '1', ''],
        'ΕΥ' => ['1', '1', '1', ''],
        'Ζ' => ['0', '4', '4', '4'],
        'Η' => ['1', '0', '', ''],
        'Ή' => ['1', '0', '', ''],
        'Θ' => ['0', '3', '3', '3'],
        'Ι' => ['1', '0', '', ''],
        'Ί' => ['1', '0', '', ''],
        'Ϊ' => ['1', '0', '', ''],
        'ΐ' => ['1', '0', '', ''],
        'Κ' => ['0', '5', '5', '5'],
        'Λ' => ['0', '8', '8', '8'],
        'Μ' => ['0', '6', '6', '6'],
        'ΜΠ' => ['0', '7', '7', '7'],
        'Ν' => ['0', '6', '6', '6'],
        'ΝΤ' => ['0', '3', '3', '3'],
        'Ξ' => ['0', '5', '54', '54'],
        'Ο' => ['1', '0', '', ''],
        'Ό' => ['1', '0', '', ''],
        'ΟΙ' => ['1', '0', '1', ''],
        'ΟΥ' => ['1', '0', '1', ''],
        'Π' => ['0', '7', '7', '7'],
        'Ρ' => ['0', '9', '9', '9'],
        'Σ' => ['0', '4', '4', '4'],
        'ς' => ['0', '', '', '4'],
        'Τ' => ['0', '3', '3', '3'],
        'ΤΖ' => ['0', '4', '4', '4'],
        'ΤΣ' => ['0', '4', '4', '4'],
        'Υ' => ['1', '1', '', ''],
        'Ύ' => ['1', '1', '', ''],
        'Ϋ' => ['1', '1', '', ''],
        'ΰ' => ['1', '1', '', ''],
        'ΥΚ' => ['1', '5', '5', '5'],
        'ΥΥ' => ['1', '65', '65', '65'],
        'Φ' => ['0', '7', '7', '7'],
        'Χ' => ['0', '5', '5', '5'],
        'Ψ' => ['0', '7', '7', '7'],
        'Ω' => ['1', '0', '', ''],
        'Ώ' => ['1', '0', '', ''],
        // Hebrew alphabet
        'א' => ['1', '0', '', ''],
        'או' => ['1', '0', '7', ''],
        'אג' => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
        'בב' => ['0', '7', '7', '7', '77', '77', '77'],
        'ב' => ['0', '7', '7', '7'],
        'גג' => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
        'גד' => ['0', '43', '43', '43', '53', '53', '53'],
        'גה' => ['0', '45', '45', '45', '55', '55', '55'],
        'גז' => ['0', '44', '44', '44', '45', '45', '45'],
        'גח' => ['0', '45', '45', '45', '55', '55', '55'],
        'גכ' => ['0', '45', '45', '45', '55', '55', '55'],
        'גך' => ['0', '45', '45', '45', '55', '55', '55'],
        'גצ' => ['0', '44', '44', '44', '45', '45', '45'],
        'גץ' => ['0', '44', '44', '44', '45', '45', '45'],
        'גק' => ['0', '45', '45', '45', '54', '54', '54'],
        'גש' => ['0', '44', '44', '44', '54', '54', '54'],
        'גת' => ['0', '43', '43', '43', '53', '53', '53'],
        'ג' => ['0', '4', '4', '4', '5', '5', '5'],
        'דז' => ['0', '4', '4', '4'],
        'דד' => ['0', '3', '3', '3', '33', '33', '33'],
        'דט' => ['0', '33', '33', '33'],
        'דש' => ['0', '4', '4', '4'],
        'דצ' => ['0', '4', '4', '4'],
        'דץ' => ['0', '4', '4', '4'],
        'ד' => ['0', '3', '3', '3'],
        'הג' => ['0', '54', '54', '54', '55', '55', '55'],
        'הכ' => ['0', '55', '55', '55'],
        'הח' => ['0', '55', '55', '55'],
        'הק' => ['0', '55', '55', '55', '5', '5', '5'],
        'הה' => ['0', '5', '5', '', '55', '55', ''],
        'ה' => ['0', '5', '5', ''],
        'וי' => ['1', '', '', '', '7', '7', '7'],
        'ו' => ['1', '7', '7', '7', '7', '', ''],
        'וו' => ['1', '7', '7', '7', '7', '', ''],
        'וופ' => ['1', '7', '7', '7', '77', '77', '77'],
        'זש' => ['0', '4', '4', '4', '44', '44', '44'],
        'זדז' => ['0', '2', '4', '4'],
        'ז' => ['0', '4', '4', '4'],
        'זג' => ['0', '44', '44', '44', '45', '45', '45'],
        'זז' => ['0', '4', '4', '4', '44', '44', '44'],
        'זס' => ['0', '44', '44', '44'],
        'זצ' => ['0', '44', '44', '44'],
        'זץ' => ['0', '44', '44', '44'],
        'חג' => ['0', '54', '54', '54', '53', '53', '53'],
        'חח' => ['0', '5', '5', '5', '55', '55', '55'],
        'חק' => ['0', '55', '55', '55', '5', '5', '5'],
        'חכ' => ['0', '45', '45', '45', '55', '55', '55'],
        'חס' => ['0', '5', '54', '54'],
        'חש' => ['0', '5', '54', '54'],
        'ח' => ['0', '5', '5', '5'],
        'טש' => ['0', '4', '4', '4'],
        'טד' => ['0', '33', '33', '33'],
        'טי' => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
        'טת' => ['0', '33', '33', '33'],
        'טט' => ['0', '3', '3', '3', '33', '33', '33'],
        'ט' => ['0', '3', '3', '3'],
        'י' => ['1', '1', '', ''],
        'יא' => ['1', '1', '', '', '1', '1', '1'],
        'כג' => ['0', '55', '55', '55', '54', '54', '54'],
        'כש' => ['0', '5', '54', '54'],
        'כס' => ['0', '5', '54', '54'],
        'ככ' => ['0', '5', '5', '5', '55', '55', '55'],
        'כך' => ['0', '5', '5', '5', '55', '55', '55'],
        'כ' => ['0', '5', '5', '5'],
        'כח' => ['0', '55', '55', '55', '5', '5', '5'],
        'ך' => ['0', '', '5', '5'],
        'ל' => ['0', '8', '8', '8'],
        'לל' => ['0', '88', '88', '88', '8', '8', '8'],
        'מנ' => ['0', '66', '66', '66'],
        'מן' => ['0', '66', '66', '66'],
        'ממ' => ['0', '6', '6', '6', '66', '66', '66'],
        'מם' => ['0', '6', '6', '6', '66', '66', '66'],
        'מ' => ['0', '6', '6', '6'],
        'ם' => ['0', '', '6', '6'],
        'נמ' => ['0', '66', '66', '66'],
        'נם' => ['0', '66', '66', '66'],
        'ננ' => ['0', '6', '6', '6', '66', '66', '66'],
        'נן' => ['0', '6', '6', '6', '66', '66', '66'],
        'נ' => ['0', '6', '6', '6'],
        'ן' => ['0', '', '6', '6'],
        'סתש' => ['0', '2', '4', '4'],
        'סתז' => ['0', '2', '4', '4'],
        'סטז' => ['0', '2', '4', '4'],
        'סטש' => ['0', '2', '4', '4'],
        'סצד' => ['0', '2', '4', '4'],
        'סט' => ['0', '2', '4', '4', '43', '43', '43'],
        'סת' => ['0', '2', '4', '4', '43', '43', '43'],
        'סג' => ['0', '44', '44', '44', '4', '4', '4'],
        'סס' => ['0', '4', '4', '4', '44', '44', '44'],
        'סצ' => ['0', '44', '44', '44'],
        'סץ' => ['0', '44', '44', '44'],
        'סז' => ['0', '44', '44', '44'],
        'סש' => ['0', '44', '44', '44'],
        'ס' => ['0', '4', '4', '4'],
        'ע' => ['1', '0', '', ''],
        'פב' => ['0', '7', '7', '7', '77', '77', '77'],
        'פוו' => ['0', '7', '7', '7', '77', '77', '77'],
        'פפ' => ['0', '7', '7', '7', '77', '77', '77'],
        'פף' => ['0', '7', '7', '7', '77', '77', '77'],
        'פ' => ['0', '7', '7', '7'],
        'ף' => ['0', '', '7', '7'],
        'צג' => ['0', '44', '44', '44', '45', '45', '45'],
        'צז' => ['0', '44', '44', '44'],
        'צס' => ['0', '44', '44', '44'],
        'צצ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
        'צץ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
        'צש' => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
        'צ' => ['0', '4', '4', '4', '5', '5', '5'],
        'ץ' => ['0', '', '4', '4'],
        'קה' => ['0', '55', '55', '5'],
        'קס' => ['0', '5', '54', '54'],
        'קש' => ['0', '5', '54', '54'],
        'קק' => ['0', '5', '5', '5', '55', '55', '55'],
        'קח' => ['0', '55', '55', '55'],
        'קכ' => ['0', '55', '55', '55'],
        'קך' => ['0', '55', '55', '55'],
        'קג' => ['0', '55', '55', '55', '54', '54', '54'],
        'ק' => ['0', '5', '5', '5'],
        'רר' => ['0', '99', '99', '99', '9', '9', '9'],
        'ר' => ['0', '9', '9', '9'],
        'שטז' => ['0', '2', '4', '4'],
        'שתש' => ['0', '2', '4', '4'],
        'שתז' => ['0', '2', '4', '4'],
        'שטש' => ['0', '2', '4', '4'],
        'שד' => ['0', '2', '43', '43'],
        'שז' => ['0', '44', '44', '44'],
        'שס' => ['0', '44', '44', '44'],
        'שת' => ['0', '2', '43', '43'],
        'שג' => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
        'שט' => ['0', '2', '43', '43', '44', '44', '44'],
        'שצ' => ['0', '44', '44', '44', '45', '45', '45'],
        'שץ' => ['0', '44', '', '44', '45', '', '45'],
        'שש' => ['0', '4', '4', '4', '44', '44', '44'],
        'ש' => ['0', '4', '4', '4'],
        'תג' => ['0', '34', '34', '34'],
        'תז' => ['0', '34', '34', '34'],
        'תש' => ['0', '4', '4', '4'],
        'תת' => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
        'ת' => ['0', '3', '3', '3', '4', '4', '4'],
        // Arabic alphabet
        'ا' => ['1', '0', '', ''],
        'ب' => ['0', '7', '7', '7'],
        'ت' => ['0', '3', '3', '3'],
        'ث' => ['0', '3', '3', '3'],
        'ج' => ['0', '4', '4', '4'],
        'ح' => ['0', '5', '5', '5'],
        'خ' => ['0', '5', '5', '5'],
        'د' => ['0', '3', '3', '3'],
        'ذ' => ['0', '3', '3', '3'],
        'ر' => ['0', '9', '9', '9'],
        'ز' => ['0', '4', '4', '4'],
        'س' => ['0', '4', '4', '4'],
        'ش' => ['0', '4', '4', '4'],
        'ص' => ['0', '4', '4', '4'],
        'ض' => ['0', '3', '3', '3'],
        'ط' => ['0', '3', '3', '3'],
        'ظ' => ['0', '4', '4', '4'],
        'ع' => ['1', '0', '', ''],
        'غ' => ['0', '0', '', ''],
        'ف' => ['0', '7', '7', '7'],
        'ق' => ['0', '5', '5', '5'],
        'ك' => ['0', '5', '5', '5'],
        'ل' => ['0', '8', '8', '8'],
        'لا' => ['0', '8', '8', '8'],
        'م' => ['0', '6', '6', '6'],
        'ن' => ['0', '6', '6', '6'],
        'هن' => ['0', '66', '66', '66'],
        'ه' => ['0', '5', '5', ''],
        'و' => ['1', '', '', '', '7', '', ''],
        'ي' => ['0', '1', '', ''],
        'آ' => ['0', '1', '', ''],
        'ة' => ['0', '', '', '3'],
        'ی' => ['0', '1', '', ''],
        'ى' => ['1', '1', '', ''],
    ];

    /**
     * Which algorithms are supported.
     *
     * @return array<string,string> Translated algorithm names, keyed by algorithm code ('std', 'dm').
     */
    public static function getAlgorithms(): array
    {
        return [
            /* I18N: https://en.wikipedia.org/wiki/Soundex */
            'std' => I18N::translate('Russell'),
            /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
            'dm'  => I18N::translate('Daitch-Mokotoff'),
        ];
    }

    /**
     * Is there a match between two soundex codes?
     *
     * Each argument is a ':'-delimited list of codes, as produced by russell()
     * and daitchMokotoff(). The lists match when they share at least one code.
     * An empty argument never matches.
     *
     * @param string $soundex1
     * @param string $soundex2
     *
     * @return bool
     */
    public static function compare(string $soundex1, string $soundex2): bool
    {
        if ($soundex1 !== '' && $soundex2 !== '') {
            return array_intersect(explode(':', $soundex1), explode(':', $soundex2)) !== [];
        }

        return false;
    }

    /**
     * Generate Russell soundex codes for a given text.
     *
     * The text is split on spaces and each word is coded separately. When there
     * is more than one word, the text with its spaces removed is coded as well.
     * Codes that carry no sound information are dropped, so the result never
     * contains an empty segment.
     *
     * @param string $text
     *
     * @return string Up to 51 unique codes, joined with ':'. Empty when nothing could be coded.
     */
    public static function russell(string $text): string
    {
        $words = explode(' ', $text);

        // Keep the original ordering: one candidate per word, then the combined text.
        $candidates = $words;

        // Combine words, e.g. “New York” as “Newyork”
        if (count($words) > 1) {
            $candidates[] = str_replace(' ', '', $text);
        }

        $soundex_array = [];

        foreach ($candidates as $candidate) {
            // The cast covers PHP versions before 8.0, where soundex('') returned false.
            $soundex = (string) soundex($candidate);

            // Only return codes from recognisable sounds. Empty words (e.g. from
            // consecutive, leading or trailing spaces) and letterless words yield
            // an empty or all-zero code. If stored, compare() would treat two such
            // segments as a match.
            if ($soundex !== '' && $soundex !== '0000') {
                $soundex_array[] = $soundex;
            }
        }

        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

        return implode(':', $soundex_array);
    }

    /**
     * Generate Daitch–Mokotoff soundex codes for a given text.
     *
     * The text is split on spaces and each word is coded separately. When there
     * is more than one word, the text with its spaces removed is coded as well.
     *
     * @param string $text
     *
     * @return string Up to 36 unique codes, joined with ':'. Empty when nothing could be coded.
     */
    public static function daitchMokotoff(string $text): string
    {
        $words         = explode(' ', $text);
        $soundex_array = [];

        foreach ($words as $word) {
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
        }

        // Combine words, e.g. “New York” as “Newyork”
        if (count($words) > 1) {
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(str_replace(' ', '', $text)));
        }

        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);

        return implode(':', $soundex_array);
    }

    /**
     * Find the longest DM_SOUNDS key that starts at a given byte offset.
     *
     * Takes a chunk of up to self::MAXCHAR bytes and shortens it one byte at a
     * time until it is a key of the table.
     *
     * @param string $name   Text to search. The caller guarantees $offset < strlen($name).
     * @param int    $offset Byte offset into $name.
     *
     * @return string The matching table key, or '' when no key starts at $offset.
     */
    private static function longestTableKeyAt(string $name, int $offset): string
    {
        $chunk = substr($name, $offset, self::MAXCHAR); // Get maximum length chunk

        while ($chunk !== '' && !isset(self::DM_SOUNDS[$chunk])) {
            $chunk = substr($chunk, 0, -1); // Not in table: try a shorter chunk
        }

        return $chunk;
    }

    /**
     * Calculate the Daitch-Mokotoff soundex for a word.
     *
     * The word is upper-cased, rewritten with TRANSFORM_NAMES, and then consumed
     * left to right using the longest matching DM_SOUNDS key at each position.
     * A table entry may hold several (start / before-vowel / other) triplets;
     * each extra triplet forks every partial code, so one word can yield
     * several codes.
     *
     * @param string $name
     *
     * @return array<string> List of possible DM codes for the word.
     */
    private static function daitchMokotoffWord(string $name): array
    {
        // Apply special transformation rules to the input string
        $name = I18N::strtoupper($name);
        foreach (self::TRANSFORM_NAMES as $transformRule) {
            $name = str_replace($transformRule[0], $transformRule[1], $name);
        }

        // Initialize
        $name_script = I18N::textScript($name);
        // Hebrew and Arabic text keeps both variants of a doubled sound (see below).
        $noVowels = $name_script === 'Hebr' || $name_script === 'Arab';

        $lastPos       = strlen($name) - 1;
        $currPos       = 0;
        $state         = 1; // 1: start of input string, 2: before vowel, 3: other
        $result        = []; // accumulate complete 6-digit D-M codes here
        $partialResult = []; // accumulate incomplete D-M codes here
        $partialResult[] = ['!']; // initialize 1st partial result ('!' stops "duplicate sound" check)

        // Loop through the input string.
        // Stop when the string is exhausted or when no more partial results remain
        while ($partialResult !== [] && $currPos <= $lastPos) {
            // Find the DM coding table entry for the chunk at the current position
            $thisEntry = self::longestTableKeyAt($name, $currPos);

            if ($thisEntry === '') {
                $currPos++; // Not in table: advance pointer to next byte
                continue; // and try again
            }

            $soundTableEntry = self::DM_SOUNDS[$thisEntry];
            $workingResult   = $partialResult;
            $partialResult   = [];
            $currPos += strlen($thisEntry);

            // Not at beginning of input string
            if ($state !== 1) {
                // Determine whether the next chunk is a vowel
                $nextEntry = $currPos <= $lastPos ? self::longestTableKeyAt($name, $currPos) : '';

                if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
                    // Next chunk is a vowel
                    $state = 2;
                } else {
                    // Next chunk is not a vowel, is unknown, or there is no next chunk
                    $state = 3;
                }
            }

            // Visit this state's column in every triplet of the table entry.
            while ($state < count($soundTableEntry)) {
                // empty means 'ignore this sound in this state'
                if ($soundTableEntry[$state] === '') {
                    foreach ($workingResult as $workingEntry) {
                        $tempEntry = $workingEntry;
                        $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
                        $partialResult[] = $tempEntry;
                    }
                } else {
                    foreach ($workingResult as $workingEntry) {
                        if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
                            // Incoming sound isn't a duplicate of the previous sound
                            $workingEntry[] = $soundTableEntry[$state];
                        } elseif ($noVowels) {
                            // Incoming sound is a duplicate of the previous sound
                            // For Hebrew and Arabic, we need to create a pair of D-M sound codes,
                            // one of the pair with only a single occurrence of the duplicate sound,
                            // the other with both occurrences
                            $workingEntry[] = $soundTableEntry[$state];
                        }

                        if (count($workingEntry) < 7) {
                            $partialResult[] = $workingEntry;
                        } else {
                            // This is the 6th code in the sequence
                            // We're looking for 7 entries because the first is '!' and doesn't count
                            $tempResult = str_replace('!', '', implode('', $workingEntry));
                            // Only return codes from recognisable sounds
                            if ($tempResult !== '') {
                                $result[] = substr($tempResult . '000000', 0, 6);
                            }
                        }
                    }
                }
                $state += 3; // Advance to next triplet while keeping the same basic state
            }
        }

        // Zero-fill and copy all remaining partial results
        foreach ($partialResult as $workingEntry) {
            $tempResult = str_replace('!', '', implode('', $workingEntry));
            // Only return codes from recognisable sounds
            if ($tempResult !== '') {
                $result[] = substr($tempResult . '000000', 0, 6);
            }
        }

        return $result;
    }
}