<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees;

/**
 * Phonetic matching of strings.
 */
class Soundex
{
    // Determine the Daitch–Mokotoff Soundex code for a word
    // Original implementation by Gerry Kroll, and analysis by Meliza Amity

    // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
    private const MAXCHAR = 7;

    /**
     * Name transformation arrays.
     * Used to transform the Name string to simplify the "sounds like" table.
     * This is especially useful in Hebrew.
     *
     * Each array entry defines the "from" and "to" arguments of a
     * str_replace($from, $to, $text) call; the rules are applied in order.
     *
     * Note about the use of "\x01":
     * This code, which can’t legitimately occur in the kind of text we're dealing with,
     * is used as a place-holder so that conditional string replacements can be done.
     *
     * NOTE(review): the two rules whose "from" ends in "$" look like they were written
     * as end-of-word regular expressions. Because the rules are applied with
     * str_replace(), the "$" is matched literally, so these rules (and the "\x01"
     * place-holder they rely on) only fire for names containing a literal "$".
     * Switching to a regex replacement would change the codes generated for some
     * Hebrew names - confirm the intended behaviour before changing it.
     */
    private const TRANSFORM_NAMES = [
        // Force Yiddish ligatures to be treated as separate letters
        ['װ', 'וו'],
        ['ײ', 'יי'],
        ['ױ', 'וי'],
        ['בו', 'בע'],
        ['פו', 'פע'],
        ['ומ', 'עמ'],
        ['ום', 'עם'],
        ['ונ', 'ענ'],
        ['ון', 'ען'],
        ['וו', 'ב'],
        ["\x01", ''],
        ['ייה$', "\x01ה"],
        ['ייע$', "\x01ע"],
        ['יי', 'ע'],
        ["\x01", 'יי'],
    ];

    /**
     * The DM sound coding table is organized this way:
     * key: a variable-length string that corresponds to the UTF-8 character sequence
     * represented by the table entry. Currently, that string can be up to 7
     * bytes long. This maximum length is defined by the constant self::MAXCHAR.
     *
     * value: an array as follows:
     * [0]: zero if not a vowel
     * [1]: sound value when this string is at the beginning of the word
     * [2]: sound value when this string is followed by a vowel
     * [3]: sound value for other cases
     * [1],[2],[3] can be repeated several times to create branches in the code
     * an empty sound value means "ignore in this state"
     */
    private const DM_SOUNDS = [
        'A' => ['1', '0', '', ''],
        'À' => ['1', '0', '', ''],
        'Á' => ['1', '0', '', ''],
        'Â' => ['1', '0', '', ''],
        'Ã' => ['1', '0', '', ''],
        'Ä' => ['1', '0', '1', '', '0', '', ''],
        'Å' => ['1', '0', '', ''],
        'Ă' => ['1', '0', '', ''],
        'Ą' => ['1', '', '', '', '', '', '6'],
        'Ạ' => ['1', '0', '', ''],
        'Ả' => ['1', '0', '', ''],
        'Ấ' => ['1', '0', '', ''],
        'Ầ' => ['1', '0', '', ''],
        'Ẩ' => ['1', '0', '', ''],
        'Ẫ' => ['1', '0', '', ''],
        'Ậ' => ['1', '0', '', ''],
        'Ắ' => ['1', '0', '', ''],
        'Ằ' => ['1', '0', '', ''],
        'Ẳ' => ['1', '0', '', ''],
        'Ẵ' => ['1', '0', '', ''],
        'Ặ' => ['1', '0', '', ''],
        'AE' => ['1', '0', '1', ''],
        'Æ' => ['1', '0', '1', ''],
        'AI' => ['1', '0', '1', ''],
        'AJ' => ['1', '0', '1', ''],
        'AU' => ['1', '0', '7', ''],
        'AV' => ['1', '0', '7', '', '7', '7', '7'],
        'ÄU' => ['1', '0', '1', ''],
        'AY' => ['1', '0', '1', ''],
        'B' => ['0', '7', '7', '7'],
        'C' => ['0', '5', '5', '5', '34', '4', '4'],
        'Ć' => ['0', '4', '4', '4'],
        'Č' => ['0', '4', '4', '4'],
        'Ç' => ['0', '4', '4', '4'],
        'CH' => ['0', '5', '5', '5', '34', '4', '4'],
        'CHS' => ['0', '5', '54', '54'],
        'CK' => ['0', '5', '5', '5', '45', '45', '45'],
        'CCS' => ['0', '4', '4', '4'],
        'CS' => ['0', '4', '4', '4'],
        'CSZ' => ['0', '4', '4', '4'],
        'CZ' => ['0', '4', '4', '4'],
        'CZS' => ['0', '4', '4', '4'],
        'D' => ['0', '3', '3', '3'],
        'Ď' => ['0', '3', '3', '3'],
        'Đ' => ['0', '3', '3', '3'],
        'DRS' => ['0', '4', '4', '4'],
        'DRZ' => ['0', '4', '4', '4'],
        'DS' => ['0', '4', '4', '4'],
        'DSH' => ['0', '4', '4', '4'],
        'DSZ' => ['0', '4', '4', '4'],
        'DT' => ['0', '3', '3', '3'],
        'DDZ' => ['0', '4', '4', '4'],
        'DDZS' => ['0', '4', '4', '4'],
        'DZ' => ['0', '4', '4', '4'],
        'DŹ' => ['0', '4', '4', '4'],
        'DŻ' => ['0', '4', '4', '4'],
        'DZH' => ['0', '4', '4', '4'],
        'DZS' => ['0', '4', '4', '4'],
        'E' => ['1', '0', '', ''],
        'È' => ['1', '0', '', ''],
        'É' => ['1', '0', '', ''],
        'Ê' => ['1', '0', '', ''],
        'Ë' => ['1', '0', '', ''],
        'Ĕ' => ['1', '0', '', ''],
        'Ė' => ['1', '0', '', ''],
        'Ę' => ['1', '', '', '6', '', '', ''],
        'Ẹ' => ['1', '0', '', ''],
        'Ẻ' => ['1', '0', '', ''],
        'Ẽ' => ['1', '0', '', ''],
        'Ế' => ['1', '0', '', ''],
        'Ề' => ['1', '0', '', ''],
        'Ể' => ['1', '0', '', ''],
        'Ễ' => ['1', '0', '', ''],
        'Ệ' => ['1', '0', '', ''],
        'EAU' => ['1', '0', '', ''],
        'EI' => ['1', '0', '1', ''],
        'EJ' => ['1', '0', '1', ''],
        'EU' => ['1', '1', '1', ''],
        'EY' => ['1', '0', '1', ''],
        'F' => ['0', '7', '7', '7'],
        'FB' => ['0', '7', '7', '7'],
        'G' => ['0', '5', '5', '5', '34', '4', '4'],
        'Ğ' => ['0', '', '', ''],
        'GGY' => ['0', '5', '5', '5'],
        'GY' => ['0', '5', '5', '5'],
        'H' => ['0', '5', '5', '', '5', '5', '5'],
        'I' => ['1', '0', '', ''],
        'Ì' => ['1', '0', '', ''],
        'Í' => ['1', '0', '', ''],
        'Î' => ['1', '0', '', ''],
        'Ï' => ['1', '0', '', ''],
        'Ĩ' => ['1', '0', '', ''],
        'Į' => ['1', '0', '', ''],
        'İ' => ['1', '0', '', ''],
        'Ỉ' => ['1', '0', '', ''],
        'Ị' => ['1', '0', '', ''],
        'IA' => ['1', '1', '', ''],
        'IE' => ['1', '1', '', ''],
        'IO' => ['1', '1', '', ''],
        'IU' => ['1', '1', '', ''],
        'J' => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
        'K' => ['0', '5', '5', '5'],
        'KH' => ['0', '5', '5', '5'],
        'KS' => ['0', '5', '54', '54'],
        'L' => ['0', '8', '8', '8'],
        'Ľ' => ['0', '8', '8', '8'],
        'Ĺ' => ['0', '8', '8', '8'],
        'Ł' => ['0', '7', '7', '7', '8', '8', '8'],
        'LL' => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
        'LLY' => ['0', '8', '8', '8', '1', '8', '8'],
        'LY' => ['0', '8', '8', '8', '1', '8', '8'],
        'M' => ['0', '6', '6', '6'],
        'MĔ' => ['0', '66', '66', '66'],
        'MN' => ['0', '66', '66', '66'],
        'N' => ['0', '6', '6', '6'],
        'Ń' => ['0', '6', '6', '6'],
        'Ň' => ['0', '6', '6', '6'],
        'Ñ' => ['0', '6', '6', '6'],
        'NM' => ['0', '66', '66', '66'],
        'O' => ['1', '0', '', ''],
        'Ò' => ['1', '0', '', ''],
        'Ó' => ['1', '0', '', ''],
        'Ô' => ['1', '0', '', ''],
        'Õ' => ['1', '0', '', ''],
        'Ö' => ['1', '0', '', ''],
        'Ø' => ['1', '0', '', ''],
        'Ő' => ['1', '0', '', ''],
        'Œ' => ['1', '0', '', ''],
        'Ơ' => ['1', '0', '', ''],
        'Ọ' => ['1', '0', '', ''],
        'Ỏ' => ['1', '0', '', ''],
        'Ố' => ['1', '0', '', ''],
        'Ồ' => ['1', '0', '', ''],
        'Ổ' => ['1', '0', '', ''],
        'Ỗ' => ['1', '0', '', ''],
        'Ộ' => ['1', '0', '', ''],
        'Ớ' => ['1', '0', '', ''],
        'Ờ' => ['1', '0', '', ''],
        'Ở' => ['1', '0', '', ''],
        'Ỡ' => ['1', '0', '', ''],
        'Ợ' => ['1', '0', '', ''],
        'OE' => ['1', '0', '', ''],
        'OI' => ['1', '0', '1', ''],
        'OJ' => ['1', '0', '1', ''],
        'OU' => ['1', '0', '', ''],
        'OY' => ['1', '0', '1', ''],
        'P' => ['0', '7', '7', '7'],
        'PF' => ['0', '7', '7', '7'],
        'PH' => ['0', '7', '7', '7'],
        'Q' => ['0', '5', '5', '5'],
        'R' => ['0', '9', '9', '9'],
        'Ř' => ['0', '4', '4', '4'],
        'RS' => ['0', '4', '4', '4', '94', '94', '94'],
        'RZ' => ['0', '4', '4', '4', '94', '94', '94'],
        'S' => ['0', '4', '4', '4'],
        'Ś' => ['0', '4', '4', '4'],
        'Š' => ['0', '4', '4', '4'],
        'Ş' => ['0', '4', '4', '4'],
        'SC' => ['0', '2', '4', '4'],
        'ŠČ' => ['0', '2', '4', '4'],
        'SCH' => ['0', '4', '4', '4'],
        'SCHD' => ['0', '2', '43', '43'],
        'SCHT' => ['0', '2', '43', '43'],
        'SCHTCH' => ['0', '2', '4', '4'],
        'SCHTSCH' => ['0', '2', '4', '4'],
        'SCHTSH' => ['0', '2', '4', '4'],
        'SD' => ['0', '2', '43', '43'],
        'SH' => ['0', '4', '4', '4'],
        'SHCH' => ['0', '2', '4', '4'],
        'SHD' => ['0', '2', '43', '43'],
        'SHT' => ['0', '2', '43', '43'],
        'SHTCH' => ['0', '2', '4', '4'],
        'SHTSH' => ['0', '2', '4', '4'],
        'ß' => ['0', '', '4', '4'],
        'ST' => ['0', '2', '43', '43'],
        'STCH' => ['0', '2', '4', '4'],
        'STRS' => ['0', '2', '4', '4'],
        'STRZ' => ['0', '2', '4', '4'],
        'STSCH' => ['0', '2', '4', '4'],
        'STSH' => ['0', '2', '4', '4'],
        'SSZ' => ['0', '4', '4', '4'],
        'SZ' => ['0', '4', '4', '4'],
        'SZCS' => ['0', '2', '4', '4'],
        'SZCZ' => ['0', '2', '4', '4'],
        'SZD' => ['0', '2', '43', '43'],
        'SZT' => ['0', '2', '43', '43'],
        'T' => ['0', '3', '3', '3'],
        'Ť' => ['0', '3', '3', '3'],
        'Ţ' => ['0', '3', '3', '3', '4', '4', '4'],
        'TC' => ['0', '4', '4', '4'],
        'TCH' => ['0', '4', '4', '4'],
        'TH' => ['0', '3', '3', '3'],
        'TRS' => ['0', '4', '4', '4'],
        'TRZ' => ['0', '4', '4', '4'],
        'TS' => ['0', '4', '4', '4'],
        'TSCH' => ['0', '4', '4', '4'],
        'TSH' => ['0', '4', '4', '4'],
        'TSZ' => ['0', '4', '4', '4'],
        'TTCH' => ['0', '4', '4', '4'],
        'TTS' => ['0', '4', '4', '4'],
        'TTSCH' => ['0', '4', '4', '4'],
        'TTSZ' => ['0', '4', '4', '4'],
        'TTZ' => ['0', '4', '4', '4'],
        'TZ' => ['0', '4', '4', '4'],
        'TZS' => ['0', '4', '4', '4'],
        'U' => ['1', '0', '', ''],
        'Ù' => ['1', '0', '', ''],
        'Ú' => ['1', '0', '', ''],
        'Û' => ['1', '0', '', ''],
        'Ü' => ['1', '0', '', ''],
        'Ũ' => ['1', '0', '', ''],
        'Ū' => ['1', '0', '', ''],
        'Ů' => ['1', '0', '', ''],
        'Ű' => ['1', '0', '', ''],
        'Ų' => ['1', '0', '', ''],
        'Ư' => ['1', '0', '', ''],
        'Ụ' => ['1', '0', '', ''],
        'Ủ' => ['1', '0', '', ''],
        'Ứ' => ['1', '0', '', ''],
        'Ừ' => ['1', '0', '', ''],
        'Ử' => ['1', '0', '', ''],
        'Ữ' => ['1', '0', '', ''],
        'Ự' => ['1', '0', '', ''],
        'UE' => ['1', '0', '', ''],
        'UI' => ['1', '0', '1', ''],
        'UJ' => ['1', '0', '1', ''],
        'UY' => ['1', '0', '1', ''],
        'UW' => ['1', '0', '1', '', '0', '7', '7'],
        'V' => ['0', '7', '7', '7'],
        'W' => ['0', '7', '7', '7'],
        'X' => ['0', '5', '54', '54'],
        'Y' => ['1', '1', '', ''],
        'Ý' => ['1', '1', '', ''],
        'Ỳ' => ['1', '1', '', ''],
        'Ỵ' => ['1', '1', '', ''],
        'Ỷ' => ['1', '1', '', ''],
        'Ỹ' => ['1', '1', '', ''],
        'Z' => ['0', '4', '4', '4'],
        'Ź' => ['0', '4', '4', '4'],
        'Ż' => ['0', '4', '4', '4'],
        'Ž' => ['0', '4', '4', '4'],
        'ZD' => ['0', '2', '43', '43'],
        'ZDZ' => ['0', '2', '4', '4'],
        'ZDZH' => ['0', '2', '4', '4'],
        'ZH' => ['0', '4', '4', '4'],
        'ZHD' => ['0', '2', '43', '43'],
        'ZHDZH' => ['0', '2', '4', '4'],
        'ZS' => ['0', '4', '4', '4'],
        'ZSCH' => ['0', '4', '4', '4'],
        'ZSH' => ['0', '4', '4', '4'],
        'ZZS' => ['0', '4', '4', '4'],
        // Cyrillic alphabet
        'А' => ['1', '0', '', ''],
        'Б' => ['0', '7', '7', '7'],
        'В' => ['0', '7', '7', '7'],
        'Г' => ['0', '5', '5', '5'],
        'Д' => ['0', '3', '3', '3'],
        'ДЗ' => ['0', '4', '4', '4'],
        'Е' => ['1', '0', '', ''],
        'Ё' => ['1', '0', '', ''],
        'Ж' => ['0', '4', '4', '4'],
        'З' => ['0', '4', '4', '4'],
        'И' => ['1', '0', '', ''],
        'Й' => ['1', '1', '', '', '4', '4', '4'],
        'К' => ['0', '5', '5', '5'],
        'Л' => ['0', '8', '8', '8'],
        'М' => ['0', '6', '6', '6'],
        'Н' => ['0', '6', '6', '6'],
        'О' => ['1', '0', '', ''],
        'П' => ['0', '7', '7', '7'],
        'Р' => ['0', '9', '9', '9'],
        'РЖ' => ['0', '4', '4', '4'],
        'С' => ['0', '4', '4', '4'],
        'Т' => ['0', '3', '3', '3'],
        'У' => ['1', '0', '', ''],
        'Ф' => ['0', '7', '7', '7'],
        'Х' => ['0', '5', '5', '5'],
        'Ц' => ['0', '4', '4', '4'],
        'Ч' => ['0', '4', '4', '4'],
        'Ш' => ['0', '4', '4', '4'],
        'Щ' => ['0', '2', '4', '4'],
        'Ъ' => ['0', '', '', ''],
        'Ы' => ['0', '1', '', ''],
        'Ь' => ['0', '', '', ''],
        'Э' => ['1', '0', '', ''],
        'Ю' => ['0', '1', '', ''],
        'Я' => ['0', '1', '', ''],
        // Greek alphabet
        'Α' => ['1', '0', '', ''],
        'Ά' => ['1', '0', '', ''],
        'ΑΙ' => ['1', '0', '1', ''],
        'ΑΥ' => ['1', '0', '1', ''],
        'Β' => ['0', '7', '7', '7'],
        'Γ' => ['0', '5', '5', '5'],
        'Δ' => ['0', '3', '3', '3'],
        'Ε' => ['1', '0', '', ''],
        'Έ' => ['1', '0', '', ''],
        'ΕΙ' => ['1', '0', '1', ''],
        'ΕΥ' => ['1', '1', '1', ''],
        'Ζ' => ['0', '4', '4', '4'],
        'Η' => ['1', '0', '', ''],
        'Ή' => ['1', '0', '', ''],
        'Θ' => ['0', '3', '3', '3'],
        'Ι' => ['1', '0', '', ''],
        'Ί' => ['1', '0', '', ''],
        'Ϊ' => ['1', '0', '', ''],
        'ΐ' => ['1', '0', '', ''],
        'Κ' => ['0', '5', '5', '5'],
        'Λ' => ['0', '8', '8', '8'],
        'Μ' => ['0', '6', '6', '6'],
        'ΜΠ' => ['0', '7', '7', '7'],
        'Ν' => ['0', '6', '6', '6'],
        'ΝΤ' => ['0', '3', '3', '3'],
        'Ξ' => ['0', '5', '54', '54'],
        'Ο' => ['1', '0', '', ''],
        'Ό' => ['1', '0', '', ''],
        'ΟΙ' => ['1', '0', '1', ''],
        'ΟΥ' => ['1', '0', '1', ''],
        'Π' => ['0', '7', '7', '7'],
        'Ρ' => ['0', '9', '9', '9'],
        'Σ' => ['0', '4', '4', '4'],
        'ς' => ['0', '', '', '4'],
        'Τ' => ['0', '3', '3', '3'],
        'ΤΖ' => ['0', '4', '4', '4'],
        'ΤΣ' => ['0', '4', '4', '4'],
        'Υ' => ['1', '1', '', ''],
        'Ύ' => ['1', '1', '', ''],
        'Ϋ' => ['1', '1', '', ''],
        'ΰ' => ['1', '1', '', ''],
        'ΥΚ' => ['1', '5', '5', '5'],
        'ΥΥ' => ['1', '65', '65', '65'],
        'Φ' => ['0', '7', '7', '7'],
        'Χ' => ['0', '5', '5', '5'],
        'Ψ' => ['0', '7', '7', '7'],
        'Ω' => ['1', '0', '', ''],
        'Ώ' => ['1', '0', '', ''],
        // Hebrew alphabet
        'א' => ['1', '0', '', ''],
        'או' => ['1', '0', '7', ''],
        'אג' => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
        'בב' => ['0', '7', '7', '7', '77', '77', '77'],
        'ב' => ['0', '7', '7', '7'],
        'גג' => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
        'גד' => ['0', '43', '43', '43', '53', '53', '53'],
        'גה' => ['0', '45', '45', '45', '55', '55', '55'],
        'גז' => ['0', '44', '44', '44', '45', '45', '45'],
        'גח' => ['0', '45', '45', '45', '55', '55', '55'],
        'גכ' => ['0', '45', '45', '45', '55', '55', '55'],
        'גך' => ['0', '45', '45', '45', '55', '55', '55'],
        'גצ' => ['0', '44', '44', '44', '45', '45', '45'],
        'גץ' => ['0', '44', '44', '44', '45', '45', '45'],
        'גק' => ['0', '45', '45', '45', '54', '54', '54'],
        'גש' => ['0', '44', '44', '44', '54', '54', '54'],
        'גת' => ['0', '43', '43', '43', '53', '53', '53'],
        'ג' => ['0', '4', '4', '4', '5', '5', '5'],
        'דז' => ['0', '4', '4', '4'],
        'דד' => ['0', '3', '3', '3', '33', '33', '33'],
        'דט' => ['0', '33', '33', '33'],
        'דש' => ['0', '4', '4', '4'],
        'דצ' => ['0', '4', '4', '4'],
        'דץ' => ['0', '4', '4', '4'],
        'ד' => ['0', '3', '3', '3'],
        'הג' => ['0', '54', '54', '54', '55', '55', '55'],
        'הכ' => ['0', '55', '55', '55'],
        'הח' => ['0', '55', '55', '55'],
        'הק' => ['0', '55', '55', '55', '5', '5', '5'],
        'הה' => ['0', '5', '5', '', '55', '55', ''],
        'ה' => ['0', '5', '5', ''],
        'וי' => ['1', '', '', '', '7', '7', '7'],
        'ו' => ['1', '7', '7', '7', '7', '', ''],
        'וו' => ['1', '7', '7', '7', '7', '', ''],
        'וופ' => ['1', '7', '7', '7', '77', '77', '77'],
        'זש' => ['0', '4', '4', '4', '44', '44', '44'],
        'זדז' => ['0', '2', '4', '4'],
        'ז' => ['0', '4', '4', '4'],
        'זג' => ['0', '44', '44', '44', '45', '45', '45'],
        'זז' => ['0', '4', '4', '4', '44', '44', '44'],
        'זס' => ['0', '44', '44', '44'],
        'זצ' => ['0', '44', '44', '44'],
        'זץ' => ['0', '44', '44', '44'],
        'חג' => ['0', '54', '54', '54', '53', '53', '53'],
        'חח' => ['0', '5', '5', '5', '55', '55', '55'],
        'חק' => ['0', '55', '55', '55', '5', '5', '5'],
        'חכ' => ['0', '45', '45', '45', '55', '55', '55'],
        'חס' => ['0', '5', '54', '54'],
        'חש' => ['0', '5', '54', '54'],
        'ח' => ['0', '5', '5', '5'],
        'טש' => ['0', '4', '4', '4'],
        'טד' => ['0', '33', '33', '33'],
        'טי' => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
        'טת' => ['0', '33', '33', '33'],
        'טט' => ['0', '3', '3', '3', '33', '33', '33'],
        'ט' => ['0', '3', '3', '3'],
        'י' => ['1', '1', '', ''],
        'יא' => ['1', '1', '', '', '1', '1', '1'],
        'כג' => ['0', '55', '55', '55', '54', '54', '54'],
        'כש' => ['0', '5', '54', '54'],
        'כס' => ['0', '5', '54', '54'],
        'ככ' => ['0', '5', '5', '5', '55', '55', '55'],
        'כך' => ['0', '5', '5', '5', '55', '55', '55'],
        'כ' => ['0', '5', '5', '5'],
        'כח' => ['0', '55', '55', '55', '5', '5', '5'],
        'ך' => ['0', '', '5', '5'],
        'ל' => ['0', '8', '8', '8'],
        'לל' => ['0', '88', '88', '88', '8', '8', '8'],
        'מנ' => ['0', '66', '66', '66'],
        'מן' => ['0', '66', '66', '66'],
        'ממ' => ['0', '6', '6', '6', '66', '66', '66'],
        'מם' => ['0', '6', '6', '6', '66', '66', '66'],
        'מ' => ['0', '6', '6', '6'],
        'ם' => ['0', '', '6', '6'],
        'נמ' => ['0', '66', '66', '66'],
        'נם' => ['0', '66', '66', '66'],
        'ננ' => ['0', '6', '6', '6', '66', '66', '66'],
        'נן' => ['0', '6', '6', '6', '66', '66', '66'],
        'נ' => ['0', '6', '6', '6'],
        'ן' => ['0', '', '6', '6'],
        'סתש' => ['0', '2', '4', '4'],
        'סתז' => ['0', '2', '4', '4'],
        'סטז' => ['0', '2', '4', '4'],
        'סטש' => ['0', '2', '4', '4'],
        'סצד' => ['0', '2', '4', '4'],
        'סט' => ['0', '2', '4', '4', '43', '43', '43'],
        'סת' => ['0', '2', '4', '4', '43', '43', '43'],
        'סג' => ['0', '44', '44', '44', '4', '4', '4'],
        'סס' => ['0', '4', '4', '4', '44', '44', '44'],
        'סצ' => ['0', '44', '44', '44'],
        'סץ' => ['0', '44', '44', '44'],
        'סז' => ['0', '44', '44', '44'],
        'סש' => ['0', '44', '44', '44'],
        'ס' => ['0', '4', '4', '4'],
        'ע' => ['1', '0', '', ''],
        'פב' => ['0', '7', '7', '7', '77', '77', '77'],
        'פוו' => ['0', '7', '7', '7', '77', '77', '77'],
        'פפ' => ['0', '7', '7', '7', '77', '77', '77'],
        'פף' => ['0', '7', '7', '7', '77', '77', '77'],
        'פ' => ['0', '7', '7', '7'],
        'ף' => ['0', '', '7', '7'],
        'צג' => ['0', '44', '44', '44', '45', '45', '45'],
        'צז' => ['0', '44', '44', '44'],
        'צס' => ['0', '44', '44', '44'],
        'צצ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
        'צץ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
        'צש' => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
        'צ' => ['0', '4', '4', '4', '5', '5', '5'],
        'ץ' => ['0', '', '4', '4'],
        'קה' => ['0', '55', '55', '5'],
        'קס' => ['0', '5', '54', '54'],
        'קש' => ['0', '5', '54', '54'],
        'קק' => ['0', '5', '5', '5', '55', '55', '55'],
        'קח' => ['0', '55', '55', '55'],
        'קכ' => ['0', '55', '55', '55'],
        'קך' => ['0', '55', '55', '55'],
        'קג' => ['0', '55', '55', '55', '54', '54', '54'],
        'ק' => ['0', '5', '5', '5'],
        'רר' => ['0', '99', '99', '99', '9', '9', '9'],
        'ר' => ['0', '9', '9', '9'],
        'שטז' => ['0', '2', '4', '4'],
        'שתש' => ['0', '2', '4', '4'],
        'שתז' => ['0', '2', '4', '4'],
        'שטש' => ['0', '2', '4', '4'],
        'שד' => ['0', '2', '43', '43'],
        'שז' => ['0', '44', '44', '44'],
        'שס' => ['0', '44', '44', '44'],
        'שת' => ['0', '2', '43', '43'],
        'שג' => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
        'שט' => ['0', '2', '43', '43', '44', '44', '44'],
        'שצ' => ['0', '44', '44', '44', '45', '45', '45'],
        'שץ' => ['0', '44', '', '44', '45', '', '45'],
        'שש' => ['0', '4', '4', '4', '44', '44', '44'],
        'ש' => ['0', '4', '4', '4'],
        'תג' => ['0', '34', '34', '34'],
        'תז' => ['0', '34', '34', '34'],
        'תש' => ['0', '4', '4', '4'],
        'תת' => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
        'ת' => ['0', '3', '3', '3', '4', '4', '4'],
        // Arabic alphabet
        'ا' => ['1', '0', '', ''],
        'ب' => ['0', '7', '7', '7'],
        'ت' => ['0', '3', '3', '3'],
        'ث' => ['0', '3', '3', '3'],
        'ج' => ['0', '4', '4', '4'],
        'ح' => ['0', '5', '5', '5'],
        'خ' => ['0', '5', '5', '5'],
        'د' => ['0', '3', '3', '3'],
        'ذ' => ['0', '3', '3', '3'],
        'ر' => ['0', '9', '9', '9'],
        'ز' => ['0', '4', '4', '4'],
        'س' => ['0', '4', '4', '4'],
        'ش' => ['0', '4', '4', '4'],
        'ص' => ['0', '4', '4', '4'],
        'ض' => ['0', '3', '3', '3'],
        'ط' => ['0', '3', '3', '3'],
        'ظ' => ['0', '4', '4', '4'],
        'ع' => ['1', '0', '', ''],
        'غ' => ['0', '0', '', ''],
        'ف' => ['0', '7', '7', '7'],
        'ق' => ['0', '5', '5', '5'],
        'ك' => ['0', '5', '5', '5'],
        'ل' => ['0', '8', '8', '8'],
        'لا' => ['0', '8', '8', '8'],
        'م' => ['0', '6', '6', '6'],
        'ن' => ['0', '6', '6', '6'],
        'هن' => ['0', '66', '66', '66'],
        'ه' => ['0', '5', '5', ''],
        'و' => ['1', '', '', '', '7', '', ''],
        'ي' => ['0', '1', '', ''],
        'آ' => ['0', '1', '', ''],
        'ة' => ['0', '', '', '3'],
        'ی' => ['0', '1', '', ''],
        'ى' => ['1', '1', '', ''],
    ];

    /**
     * Which algorithms are supported.
     *
     * @return array<string,string> Translated algorithm names, keyed by algorithm code ('std', 'dm').
     */
    public static function getAlgorithms(): array
    {
        return [
            /* I18N: https://en.wikipedia.org/wiki/Soundex */
            'std' => I18N::translate('Russell'),
            /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
            'dm'  => I18N::translate('Daitch-Mokotoff'),
        ];
    }

    /**
     * Is there a match between two soundex codes?
     *
     * Each argument is a colon-separated list of codes, as produced by russell()
     * or daitchMokotoff(). Two lists match when they share at least one code.
     * An empty list never matches anything.
     *
     * @param string $soundex1
     * @param string $soundex2
     *
     * @return bool
     */
    public static function compare(string $soundex1, string $soundex2): bool
    {
        if ($soundex1 === '' || $soundex2 === '') {
            return false;
        }

        return array_intersect(explode(':', $soundex1), explode(':', $soundex2)) !== [];
    }

    /**
     * Generate Russell soundex codes for a given text.
     *
     * The text is split on spaces and each word is coded separately. When there
     * is more than one word, the words are also joined and coded as a single word.
     *
     * @param string $text
     *
     * @return string Colon-separated list of unique codes; empty if nothing could be coded.
     */
    public static function russell(string $text): string
    {
        $candidates = explode(' ', $text);

        // Combine words, e.g. “New York” as “Newyork”
        if (count($candidates) > 1) {
            $candidates[] = str_replace(' ', '', $text);
        }

        $soundex_array = [];

        foreach ($candidates as $candidate) {
            $soundex = soundex($candidate);

            // Only return codes from recognisable sounds. This applies to the combined
            // word as well as to the individual words. Text without any letters is coded
            // as '0000'; an empty string gives false (PHP < 8) or a letterless code.
            if (is_string($soundex) && $soundex !== '' && $soundex !== '0000') {
                $soundex_array[] = $soundex;
            }
        }

        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

        return implode(':', $soundex_array);
    }

    /**
     * Generate Daitch–Mokotoff soundex codes for a given text.
     *
     * The text is split on spaces and each word is coded separately. When there
     * is more than one word, the words are also joined and coded as a single word.
     *
     * @param string $text
     *
     * @return string Colon-separated list of unique codes; empty if nothing could be coded.
     */
    public static function daitchMokotoff(string $text): string
    {
        $words         = explode(' ', $text);
        $soundex_array = [];

        foreach ($words as $word) {
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
        }

        // Combine words, e.g. “New York” as “Newyork”
        if (count($words) > 1) {
            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(str_replace(' ', '', $text)));
        }

        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);

        return implode(':', $soundex_array);
    }

    /**
     * Calculate the Daitch-Mokotoff soundex for a word.
     *
     * @param string $name
     *
     * @return array<string> List of possible DM codes for the word.
     */
    private static function daitchMokotoffWord(string $name): array
    {
        // Apply special transformation rules to the input string
        $name = I18N::strtoupper($name);
        foreach (self::TRANSFORM_NAMES as $transformRule) {
            $name = str_replace($transformRule[0], $transformRule[1], $name);
        }

        // Initialize
        $name_script = I18N::textScript($name);
        $noVowels    = $name_script === 'Hebr' || $name_script === 'Arab';

        $lastPos       = strlen($name) - 1;
        $currPos       = 0;
        $state         = 1; // 1: start of input string, 2: before vowel, 3: other
        $result        = []; // accumulate complete 6-digit D-M codes here
        $partialResult = [['!']]; // accumulate incomplete D-M codes here ('!' stops "duplicate sound" check)

        // Loop through the input string.
        // Stop when the string is exhausted or when no more partial results remain
        while (count($partialResult) !== 0 && $currPos <= $lastPos) {
            // Find the DM coding table entry for the chunk at the current position
            $thisEntry = self::longestTableKey($name, $currPos);
            if ($thisEntry === '') {
                $currPos++; // Not in table: advance pointer to next byte
                continue;   // and try again
            }

            $soundTableEntry = self::DM_SOUNDS[$thisEntry];
            $workingResult   = $partialResult;
            $partialResult   = [];
            $currPos         += strlen($thisEntry);

            // Not at beginning of input string
            if ($state !== 1) {
                // Determine whether the next chunk is a vowel
                $nextEntry = self::longestTableKey($name, $currPos);
                if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
                    // Next chunk is a vowel
                    $state = 2;
                } else {
                    // Next chunk is a consonant, is not in the table, or there is no next chunk
                    $state = 3;
                }
            }

            // Each pass handles one branch (triplet) of the table entry
            while ($state < count($soundTableEntry)) {
                $sound = $soundTableEntry[$state];

                if ($sound === '') {
                    // empty means 'ignore this sound in this state'
                    foreach ($workingResult as $workingEntry) {
                        $workingEntry[count($workingEntry) - 1] .= '!'; // Prevent false 'doubles'
                        $partialResult[]                        = $workingEntry;
                    }
                } else {
                    foreach ($workingResult as $workingEntry) {
                        // A sound that repeats the previous sound is dropped, except in
                        // Hebrew and Arabic where the repeated sound is always kept.
                        if ($noVowels || $sound !== $workingEntry[count($workingEntry) - 1]) {
                            $workingEntry[] = $sound;
                        }

                        if (count($workingEntry) < 7) {
                            $partialResult[] = $workingEntry;
                        } else {
                            // This is the 6th code in the sequence
                            // We're looking for 7 entries because the first is '!' and doesn't count
                            $code = self::padCode($workingEntry);
                            if ($code !== '') {
                                $result[] = $code;
                            }
                        }
                    }
                }

                $state += 3; // Advance to next triplet while keeping the same basic state
            }
        }

        // Zero-fill and copy all remaining partial results
        foreach ($partialResult as $workingEntry) {
            $code = self::padCode($workingEntry);
            if ($code !== '') {
                $result[] = $code;
            }
        }

        return $result;
    }

    /**
     * Find the longest key of the DM coding table that starts at a given byte offset.
     *
     * @param string $name     Text being coded.
     * @param int    $position Byte offset into $name.
     *
     * @return string The matching table key, or an empty string when nothing matches.
     */
    private static function longestTableKey(string $name, int $position): string
    {
        if ($position >= strlen($name)) {
            return '';
        }

        // Start with the maximum length chunk, and shorten it one byte at a time
        $chunk = substr($name, $position, self::MAXCHAR);

        while ($chunk !== '' && !isset(self::DM_SOUNDS[$chunk])) {
            $chunk = substr($chunk, 0, -1);
        }

        return $chunk;
    }

    /**
     * Convert a list of accumulated sounds into a 6-digit D-M code.
     *
     * @param array<string> $sounds Accumulated sound values, including '!' markers.
     *
     * @return string Zero-filled, 6-digit code, or an empty string when there are no sounds.
     */
    private static function padCode(array $sounds): string
    {
        $digits = str_replace('!', '', implode('', $sounds));

        // Only return codes from recognisable sounds.
        // Compare with '' explicitly: the single digit '0' (an initial vowel) is a valid
        // sound, but PHP treats the string '0' as false.
        if ($digits === '') {
            return '';
        }

        return substr($digits . '000000', 0, 6);
    }
}