xref: /webtrees/app/Soundex.php (revision f9b64f4645b5fb43a1aaed50100ddd750a0b68d8)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2021 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees;
21
22/**
23 * Phonetic matching of strings.
24 */
25class Soundex
26{
27    // Determine the Daitch–Mokotoff Soundex code for a word
28    // Original implementation by Gerry Kroll, and analysis by Meliza Amity
29
30    // Maximum length of a DM_SOUNDS table key, counted in bytes -- NOT in UTF-8 characters!
31    private const MAXCHAR = 7;
32
33    /**
34     * Name transformation rules.
35     * Used to transform the Name string to simplify the "sounds like" table.
36     * This is especially useful in Hebrew.
37     *
38     * Each array entry is a [from, to] pair: the "from" and "to" arguments of a
39     * preg($from, $to, $text) function call that achieves the desired transformation.
40     *
41     * Note about the use of "\x01":
42     * This code, which can’t legitimately occur in the kind of text we're dealing with,
43     * is used as a place-holder so that conditional string replacements can be done.
44     */
45    private const TRANSFORM_NAMES = [
46        // Force Yiddish ligatures (double vav, double yod, vav-yod) to be treated as separate letters
47        [ // double-vav ligature -> vav, vav
48            'װ',
49            'וו',
50        ],
51        [ // double-yod ligature -> yod, yod
52            'ײ',
53            'יי',
54        ],
55        [ // vav-yod ligature -> vav, yod
56            'ױ',
57            'וי',
58        ],
59        [ // bet + vav -> bet + ayin
60            'בו',
61            'בע',
62        ],
63        [ // pe + vav -> pe + ayin
64            'פו',
65            'פע',
66        ],
67        [ // vav + mem -> ayin + mem
68            'ומ',
69            'עמ',
70        ],
71        [ // vav + final mem -> ayin + final mem
72            'ום',
73            'עם',
74        ],
75        [ // vav + nun -> ayin + nun
76            'ונ',
77            'ענ',
78        ],
79        [ // vav + final nun -> ayin + final nun
80            'ון',
81            'ען',
82        ],
83        [ // vav, vav -> bet
84            'וו',
85            'ב',
86        ],
87        [ // drop any \x01 already present, so it is free to serve as the place-holder below
88            "\x01",
89            '',
90        ],
91        [ // yod, yod, he -> place-holder + he. NOTE(review): '$' looks like an end-of-text anchor; confirm how the consumer interprets it
92            'ייה$',
93            "\x01ה",
94        ],
95        [ // yod, yod, ayin -> place-holder + ayin (same '$' caveat as the previous rule)
96            'ייע$',
97            "\x01ע",
98        ],
99        [ // every yod, yod still present -> ayin
100            'יי',
101            'ע',
102        ],
103        [ // turn the place-holder back into yod, yod
104            "\x01",
105            'יי',
106        ],
107    ];
108
109    /**
110     * The DM sound coding table is organized this way:
111     * key: a variable-length string that corresponds to the UTF-8 character sequence
112     * represented by the table entry. Currently, that string can be up to 7
113     * bytes long. This maximum length is defined by the value of the class constant
114     * self::MAXCHAR.
115     *
116     * value: an array as follows:
117     * [0]:  '0' if not a vowel, '1' if it is a vowel
118     * [1]:  sound value when this string is at the beginning of the word
119     * [2]:  sound value when this string is followed by a vowel
120     * [3]:  sound value for other cases
121     * [1],[2],[3] can be repeated several times to create branches in the code
122     * an empty sound value means "ignore in this state"
123     */
124    private const DM_SOUNDS = [
125        'A'       => [
126            '1',
127            '0',
128            '',
129            '',
130        ],
131        'À'       => [
132            '1',
133            '0',
134            '',
135            '',
136        ],
137        'Á'       => [
138            '1',
139            '0',
140            '',
141            '',
142        ],
143        'Â'       => [
144            '1',
145            '0',
146            '',
147            '',
148        ],
149        'Ã'       => [
150            '1',
151            '0',
152            '',
153            '',
154        ],
155        'Ä'       => [
156            '1',
157            '0',
158            '1',
159            '',
160            '0',
161            '',
162            '',
163        ],
164        'Å'       => [
165            '1',
166            '0',
167            '',
168            '',
169        ],
170        'Ă'       => [
171            '1',
172            '0',
173            '',
174            '',
175        ],
176        'Ą'       => [
177            '1',
178            '',
179            '',
180            '',
181            '',
182            '',
183            '6',
184        ],
185        'Ạ'       => [
186            '1',
187            '0',
188            '',
189            '',
190        ],
191        'Ả'       => [
192            '1',
193            '0',
194            '',
195            '',
196        ],
197        'Ấ'       => [
198            '1',
199            '0',
200            '',
201            '',
202        ],
203        'Ầ'       => [
204            '1',
205            '0',
206            '',
207            '',
208        ],
209        'Ẩ'       => [
210            '1',
211            '0',
212            '',
213            '',
214        ],
215        'Ẫ'       => [
216            '1',
217            '0',
218            '',
219            '',
220        ],
221        'Ậ'       => [
222            '1',
223            '0',
224            '',
225            '',
226        ],
227        'Ắ'       => [
228            '1',
229            '0',
230            '',
231            '',
232        ],
233        'Ằ'       => [
234            '1',
235            '0',
236            '',
237            '',
238        ],
239        'Ẳ'       => [
240            '1',
241            '0',
242            '',
243            '',
244        ],
245        'Ẵ'       => [
246            '1',
247            '0',
248            '',
249            '',
250        ],
251        'Ặ'       => [
252            '1',
253            '0',
254            '',
255            '',
256        ],
257        'AE'      => [
258            '1',
259            '0',
260            '1',
261            '',
262        ],
263        'Æ'       => [
264            '1',
265            '0',
266            '1',
267            '',
268        ],
269        'AI'      => [
270            '1',
271            '0',
272            '1',
273            '',
274        ],
275        'AJ'      => [
276            '1',
277            '0',
278            '1',
279            '',
280        ],
281        'AU'      => [
282            '1',
283            '0',
284            '7',
285            '',
286        ],
287        'AV'      => [
288            '1',
289            '0',
290            '7',
291            '',
292            '7',
293            '7',
294            '7',
295        ],
296        'ÄU'      => [
297            '1',
298            '0',
299            '1',
300            '',
301        ],
302        'AY'      => [
303            '1',
304            '0',
305            '1',
306            '',
307        ],
308        'B'       => [
309            '0',
310            '7',
311            '7',
312            '7',
313        ],
314        'C'       => [
315            '0',
316            '5',
317            '5',
318            '5',
319            '34',
320            '4',
321            '4',
322        ],
323        'Ć'       => [
324            '0',
325            '4',
326            '4',
327            '4',
328        ],
329        'Č'       => [
330            '0',
331            '4',
332            '4',
333            '4',
334        ],
335        'Ç'       => [
336            '0',
337            '4',
338            '4',
339            '4',
340        ],
341        'CH'      => [
342            '0',
343            '5',
344            '5',
345            '5',
346            '34',
347            '4',
348            '4',
349        ],
350        'CHS'     => [
351            '0',
352            '5',
353            '54',
354            '54',
355        ],
356        'CK'      => [
357            '0',
358            '5',
359            '5',
360            '5',
361            '45',
362            '45',
363            '45',
364        ],
365        'CCS'     => [
366            '0',
367            '4',
368            '4',
369            '4',
370        ],
371        'CS'      => [
372            '0',
373            '4',
374            '4',
375            '4',
376        ],
377        'CSZ'     => [
378            '0',
379            '4',
380            '4',
381            '4',
382        ],
383        'CZ'      => [
384            '0',
385            '4',
386            '4',
387            '4',
388        ],
389        'CZS'     => [
390            '0',
391            '4',
392            '4',
393            '4',
394        ],
395        'D'       => [
396            '0',
397            '3',
398            '3',
399            '3',
400        ],
401        'Ď'       => [
402            '0',
403            '3',
404            '3',
405            '3',
406        ],
407        'Đ'       => [
408            '0',
409            '3',
410            '3',
411            '3',
412        ],
413        'DRS'     => [
414            '0',
415            '4',
416            '4',
417            '4',
418        ],
419        'DRZ'     => [
420            '0',
421            '4',
422            '4',
423            '4',
424        ],
425        'DS'      => [
426            '0',
427            '4',
428            '4',
429            '4',
430        ],
431        'DSH'     => [
432            '0',
433            '4',
434            '4',
435            '4',
436        ],
437        'DSZ'     => [
438            '0',
439            '4',
440            '4',
441            '4',
442        ],
443        'DT'      => [
444            '0',
445            '3',
446            '3',
447            '3',
448        ],
449        'DDZ'     => [
450            '0',
451            '4',
452            '4',
453            '4',
454        ],
455        'DDZS'    => [
456            '0',
457            '4',
458            '4',
459            '4',
460        ],
461        'DZ'      => [
462            '0',
463            '4',
464            '4',
465            '4',
466        ],
467        'DŹ'      => [
468            '0',
469            '4',
470            '4',
471            '4',
472        ],
473        'DŻ'      => [
474            '0',
475            '4',
476            '4',
477            '4',
478        ],
479        'DZH'     => [
480            '0',
481            '4',
482            '4',
483            '4',
484        ],
485        'DZS'     => [
486            '0',
487            '4',
488            '4',
489            '4',
490        ],
491        'E'       => [
492            '1',
493            '0',
494            '',
495            '',
496        ],
497        'È'       => [
498            '1',
499            '0',
500            '',
501            '',
502        ],
503        'É'       => [
504            '1',
505            '0',
506            '',
507            '',
508        ],
509        'Ê'       => [
510            '1',
511            '0',
512            '',
513            '',
514        ],
515        'Ë'       => [
516            '1',
517            '0',
518            '',
519            '',
520        ],
521        'Ĕ'       => [
522            '1',
523            '0',
524            '',
525            '',
526        ],
527        'Ė'       => [
528            '1',
529            '0',
530            '',
531            '',
532        ],
533        'Ę'       => [
534            '1',
535            '',
536            '',
537            '6',
538            '',
539            '',
540            '',
541        ],
542        'Ẹ'       => [
543            '1',
544            '0',
545            '',
546            '',
547        ],
548        'Ẻ'       => [
549            '1',
550            '0',
551            '',
552            '',
553        ],
554        'Ẽ'       => [
555            '1',
556            '0',
557            '',
558            '',
559        ],
560        'Ế'       => [
561            '1',
562            '0',
563            '',
564            '',
565        ],
566        'Ề'       => [
567            '1',
568            '0',
569            '',
570            '',
571        ],
572        'Ể'       => [
573            '1',
574            '0',
575            '',
576            '',
577        ],
578        'Ễ'       => [
579            '1',
580            '0',
581            '',
582            '',
583        ],
584        'Ệ'       => [
585            '1',
586            '0',
587            '',
588            '',
589        ],
590        'EAU'     => [
591            '1',
592            '0',
593            '',
594            '',
595        ],
596        'EI'      => [
597            '1',
598            '0',
599            '1',
600            '',
601        ],
602        'EJ'      => [
603            '1',
604            '0',
605            '1',
606            '',
607        ],
608        'EU'      => [
609            '1',
610            '1',
611            '1',
612            '',
613        ],
614        'EY'      => [
615            '1',
616            '0',
617            '1',
618            '',
619        ],
620        'F'       => [
621            '0',
622            '7',
623            '7',
624            '7',
625        ],
626        'FB'      => [
627            '0',
628            '7',
629            '7',
630            '7',
631        ],
632        'G'       => [
633            '0',
634            '5',
635            '5',
636            '5',
637            '34',
638            '4',
639            '4',
640        ],
641        'Ğ'       => [
642            '0',
643            '',
644            '',
645            '',
646        ],
647        'GGY'     => [
648            '0',
649            '5',
650            '5',
651            '5',
652        ],
653        'GY'      => [
654            '0',
655            '5',
656            '5',
657            '5',
658        ],
659        'H'       => [
660            '0',
661            '5',
662            '5',
663            '',
664            '5',
665            '5',
666            '5',
667        ],
668        'I'       => [
669            '1',
670            '0',
671            '',
672            '',
673        ],
674        'Ì'       => [
675            '1',
676            '0',
677            '',
678            '',
679        ],
680        'Í'       => [
681            '1',
682            '0',
683            '',
684            '',
685        ],
686        'Î'       => [
687            '1',
688            '0',
689            '',
690            '',
691        ],
692        'Ï'       => [
693            '1',
694            '0',
695            '',
696            '',
697        ],
698        'Ĩ'       => [
699            '1',
700            '0',
701            '',
702            '',
703        ],
704        'Į'       => [
705            '1',
706            '0',
707            '',
708            '',
709        ],
710        'İ'       => [
711            '1',
712            '0',
713            '',
714            '',
715        ],
716        'Ỉ'       => [
717            '1',
718            '0',
719            '',
720            '',
721        ],
722        'Ị'       => [
723            '1',
724            '0',
725            '',
726            '',
727        ],
728        'IA'      => [
729            '1',
730            '1',
731            '',
732            '',
733        ],
734        'IE'      => [
735            '1',
736            '1',
737            '',
738            '',
739        ],
740        'IO'      => [
741            '1',
742            '1',
743            '',
744            '',
745        ],
746        'IU'      => [
747            '1',
748            '1',
749            '',
750            '',
751        ],
752        'J'       => [
753            '0',
754            '1',
755            '',
756            '',
757            '4',
758            '4',
759            '4',
760            '5',
761            '5',
762            '',
763        ],
764        'K'       => [
765            '0',
766            '5',
767            '5',
768            '5',
769        ],
770        'KH'      => [
771            '0',
772            '5',
773            '5',
774            '5',
775        ],
776        'KS'      => [
777            '0',
778            '5',
779            '54',
780            '54',
781        ],
782        'L'       => [
783            '0',
784            '8',
785            '8',
786            '8',
787        ],
788        'Ľ'       => [
789            '0',
790            '8',
791            '8',
792            '8',
793        ],
794        'Ĺ'       => [
795            '0',
796            '8',
797            '8',
798            '8',
799        ],
800        'Ł'       => [
801            '0',
802            '7',
803            '7',
804            '7',
805            '8',
806            '8',
807            '8',
808        ],
809        'LL'      => [
810            '0',
811            '8',
812            '8',
813            '8',
814            '58',
815            '8',
816            '8',
817            '1',
818            '8',
819            '8',
820        ],
821        'LLY'     => [
822            '0',
823            '8',
824            '8',
825            '8',
826            '1',
827            '8',
828            '8',
829        ],
830        'LY'      => [
831            '0',
832            '8',
833            '8',
834            '8',
835            '1',
836            '8',
837            '8',
838        ],
839        'M'       => [
840            '0',
841            '6',
842            '6',
843            '6',
844        ],
845        'MĔ'      => [
846            '0',
847            '66',
848            '66',
849            '66',
850        ],
851        'MN'      => [
852            '0',
853            '66',
854            '66',
855            '66',
856        ],
857        'N'       => [
858            '0',
859            '6',
860            '6',
861            '6',
862        ],
863        'Ń'       => [
864            '0',
865            '6',
866            '6',
867            '6',
868        ],
869        'Ň'       => [
870            '0',
871            '6',
872            '6',
873            '6',
874        ],
875        'Ñ'       => [
876            '0',
877            '6',
878            '6',
879            '6',
880        ],
881        'NM'      => [
882            '0',
883            '66',
884            '66',
885            '66',
886        ],
887        'O'       => [
888            '1',
889            '0',
890            '',
891            '',
892        ],
893        'Ò'       => [
894            '1',
895            '0',
896            '',
897            '',
898        ],
899        'Ó'       => [
900            '1',
901            '0',
902            '',
903            '',
904        ],
905        'Ô'       => [
906            '1',
907            '0',
908            '',
909            '',
910        ],
911        'Õ'       => [
912            '1',
913            '0',
914            '',
915            '',
916        ],
917        'Ö'       => [
918            '1',
919            '0',
920            '',
921            '',
922        ],
923        'Ø'       => [
924            '1',
925            '0',
926            '',
927            '',
928        ],
929        'Ő'       => [
930            '1',
931            '0',
932            '',
933            '',
934        ],
935        'Œ'       => [
936            '1',
937            '0',
938            '',
939            '',
940        ],
941        'Ơ'       => [
942            '1',
943            '0',
944            '',
945            '',
946        ],
947        'Ọ'       => [
948            '1',
949            '0',
950            '',
951            '',
952        ],
953        'Ỏ'       => [
954            '1',
955            '0',
956            '',
957            '',
958        ],
959        'Ố'       => [
960            '1',
961            '0',
962            '',
963            '',
964        ],
965        'Ồ'       => [
966            '1',
967            '0',
968            '',
969            '',
970        ],
971        'Ổ'       => [
972            '1',
973            '0',
974            '',
975            '',
976        ],
977        'Ỗ'       => [
978            '1',
979            '0',
980            '',
981            '',
982        ],
983        'Ộ'       => [
984            '1',
985            '0',
986            '',
987            '',
988        ],
989        'Ớ'       => [
990            '1',
991            '0',
992            '',
993            '',
994        ],
995        'Ờ'       => [
996            '1',
997            '0',
998            '',
999            '',
1000        ],
1001        'Ở'       => [
1002            '1',
1003            '0',
1004            '',
1005            '',
1006        ],
1007        'Ỡ'       => [
1008            '1',
1009            '0',
1010            '',
1011            '',
1012        ],
1013        'Ợ'       => [
1014            '1',
1015            '0',
1016            '',
1017            '',
1018        ],
1019        'OE'      => [
1020            '1',
1021            '0',
1022            '',
1023            '',
1024        ],
1025        'OI'      => [
1026            '1',
1027            '0',
1028            '1',
1029            '',
1030        ],
1031        'OJ'      => [
1032            '1',
1033            '0',
1034            '1',
1035            '',
1036        ],
1037        'OU'      => [
1038            '1',
1039            '0',
1040            '',
1041            '',
1042        ],
1043        'OY'      => [
1044            '1',
1045            '0',
1046            '1',
1047            '',
1048        ],
1049        'P'       => [
1050            '0',
1051            '7',
1052            '7',
1053            '7',
1054        ],
1055        'PF'      => [
1056            '0',
1057            '7',
1058            '7',
1059            '7',
1060        ],
1061        'PH'      => [
1062            '0',
1063            '7',
1064            '7',
1065            '7',
1066        ],
1067        'Q'       => [
1068            '0',
1069            '5',
1070            '5',
1071            '5',
1072        ],
1073        'R'       => [
1074            '0',
1075            '9',
1076            '9',
1077            '9',
1078        ],
1079        'Ř'       => [
1080            '0',
1081            '4',
1082            '4',
1083            '4',
1084        ],
1085        'RS'      => [
1086            '0',
1087            '4',
1088            '4',
1089            '4',
1090            '94',
1091            '94',
1092            '94',
1093        ],
1094        'RZ'      => [
1095            '0',
1096            '4',
1097            '4',
1098            '4',
1099            '94',
1100            '94',
1101            '94',
1102        ],
1103        'S'       => [
1104            '0',
1105            '4',
1106            '4',
1107            '4',
1108        ],
1109        'Ś'       => [
1110            '0',
1111            '4',
1112            '4',
1113            '4',
1114        ],
1115        'Š'       => [
1116            '0',
1117            '4',
1118            '4',
1119            '4',
1120        ],
1121        'Ş'       => [
1122            '0',
1123            '4',
1124            '4',
1125            '4',
1126        ],
1127        'SC'      => [
1128            '0',
1129            '2',
1130            '4',
1131            '4',
1132        ],
1133        'ŠČ'      => [
1134            '0',
1135            '2',
1136            '4',
1137            '4',
1138        ],
1139        'SCH'     => [
1140            '0',
1141            '4',
1142            '4',
1143            '4',
1144        ],
1145        'SCHD'    => [
1146            '0',
1147            '2',
1148            '43',
1149            '43',
1150        ],
1151        'SCHT'    => [
1152            '0',
1153            '2',
1154            '43',
1155            '43',
1156        ],
1157        'SCHTCH'  => [
1158            '0',
1159            '2',
1160            '4',
1161            '4',
1162        ],
1163        'SCHTSCH' => [
1164            '0',
1165            '2',
1166            '4',
1167            '4',
1168        ],
1169        'SCHTSH'  => [
1170            '0',
1171            '2',
1172            '4',
1173            '4',
1174        ],
1175        'SD'      => [
1176            '0',
1177            '2',
1178            '43',
1179            '43',
1180        ],
1181        'SH'      => [
1182            '0',
1183            '4',
1184            '4',
1185            '4',
1186        ],
1187        'SHCH'    => [
1188            '0',
1189            '2',
1190            '4',
1191            '4',
1192        ],
1193        'SHD'     => [
1194            '0',
1195            '2',
1196            '43',
1197            '43',
1198        ],
1199        'SHT'     => [
1200            '0',
1201            '2',
1202            '43',
1203            '43',
1204        ],
1205        'SHTCH'   => [
1206            '0',
1207            '2',
1208            '4',
1209            '4',
1210        ],
1211        'SHTSH'   => [
1212            '0',
1213            '2',
1214            '4',
1215            '4',
1216        ],
1217        'ß'       => [
1218            '0',
1219            '',
1220            '4',
1221            '4',
1222        ],
1223        'ST'      => [
1224            '0',
1225            '2',
1226            '43',
1227            '43',
1228        ],
1229        'STCH'    => [
1230            '0',
1231            '2',
1232            '4',
1233            '4',
1234        ],
1235        'STRS'    => [
1236            '0',
1237            '2',
1238            '4',
1239            '4',
1240        ],
1241        'STRZ'    => [
1242            '0',
1243            '2',
1244            '4',
1245            '4',
1246        ],
1247        'STSCH'   => [
1248            '0',
1249            '2',
1250            '4',
1251            '4',
1252        ],
1253        'STSH'    => [
1254            '0',
1255            '2',
1256            '4',
1257            '4',
1258        ],
1259        'SSZ'     => [
1260            '0',
1261            '4',
1262            '4',
1263            '4',
1264        ],
1265        'SZ'      => [
1266            '0',
1267            '4',
1268            '4',
1269            '4',
1270        ],
1271        'SZCS'    => [
1272            '0',
1273            '2',
1274            '4',
1275            '4',
1276        ],
1277        'SZCZ'    => [
1278            '0',
1279            '2',
1280            '4',
1281            '4',
1282        ],
1283        'SZD'     => [
1284            '0',
1285            '2',
1286            '43',
1287            '43',
1288        ],
1289        'SZT'     => [
1290            '0',
1291            '2',
1292            '43',
1293            '43',
1294        ],
1295        'T'       => [
1296            '0',
1297            '3',
1298            '3',
1299            '3',
1300        ],
1301        'Ť'       => [
1302            '0',
1303            '3',
1304            '3',
1305            '3',
1306        ],
1307        'Ţ'       => [
1308            '0',
1309            '3',
1310            '3',
1311            '3',
1312            '4',
1313            '4',
1314            '4',
1315        ],
1316        'TC'      => [
1317            '0',
1318            '4',
1319            '4',
1320            '4',
1321        ],
1322        'TCH'     => [
1323            '0',
1324            '4',
1325            '4',
1326            '4',
1327        ],
1328        'TH'      => [
1329            '0',
1330            '3',
1331            '3',
1332            '3',
1333        ],
1334        'TRS'     => [
1335            '0',
1336            '4',
1337            '4',
1338            '4',
1339        ],
1340        'TRZ'     => [
1341            '0',
1342            '4',
1343            '4',
1344            '4',
1345        ],
1346        'TS'      => [
1347            '0',
1348            '4',
1349            '4',
1350            '4',
1351        ],
1352        'TSCH'    => [
1353            '0',
1354            '4',
1355            '4',
1356            '4',
1357        ],
1358        'TSH'     => [
1359            '0',
1360            '4',
1361            '4',
1362            '4',
1363        ],
1364        'TSZ'     => [
1365            '0',
1366            '4',
1367            '4',
1368            '4',
1369        ],
1370        'TTCH'    => [
1371            '0',
1372            '4',
1373            '4',
1374            '4',
1375        ],
1376        'TTS'     => [
1377            '0',
1378            '4',
1379            '4',
1380            '4',
1381        ],
1382        'TTSCH'   => [
1383            '0',
1384            '4',
1385            '4',
1386            '4',
1387        ],
1388        'TTSZ'    => [
1389            '0',
1390            '4',
1391            '4',
1392            '4',
1393        ],
1394        'TTZ'     => [
1395            '0',
1396            '4',
1397            '4',
1398            '4',
1399        ],
1400        'TZ'      => [
1401            '0',
1402            '4',
1403            '4',
1404            '4',
1405        ],
1406        'TZS'     => [
1407            '0',
1408            '4',
1409            '4',
1410            '4',
1411        ],
1412        'U'       => [
1413            '1',
1414            '0',
1415            '',
1416            '',
1417        ],
1418        'Ù'       => [
1419            '1',
1420            '0',
1421            '',
1422            '',
1423        ],
1424        'Ú'       => [
1425            '1',
1426            '0',
1427            '',
1428            '',
1429        ],
1430        'Û'       => [
1431            '1',
1432            '0',
1433            '',
1434            '',
1435        ],
1436        'Ü'       => [
1437            '1',
1438            '0',
1439            '',
1440            '',
1441        ],
1442        'Ũ'       => [
1443            '1',
1444            '0',
1445            '',
1446            '',
1447        ],
1448        'Ū'       => [
1449            '1',
1450            '0',
1451            '',
1452            '',
1453        ],
1454        'Ů'       => [
1455            '1',
1456            '0',
1457            '',
1458            '',
1459        ],
1460        'Ű'       => [
1461            '1',
1462            '0',
1463            '',
1464            '',
1465        ],
1466        'Ų'       => [
1467            '1',
1468            '0',
1469            '',
1470            '',
1471        ],
1472        'Ư'       => [
1473            '1',
1474            '0',
1475            '',
1476            '',
1477        ],
1478        'Ụ'       => [
1479            '1',
1480            '0',
1481            '',
1482            '',
1483        ],
1484        'Ủ'       => [
1485            '1',
1486            '0',
1487            '',
1488            '',
1489        ],
1490        'Ứ'       => [
1491            '1',
1492            '0',
1493            '',
1494            '',
1495        ],
1496        'Ừ'       => [
1497            '1',
1498            '0',
1499            '',
1500            '',
1501        ],
1502        'Ử'       => [
1503            '1',
1504            '0',
1505            '',
1506            '',
1507        ],
1508        'Ữ'       => [
1509            '1',
1510            '0',
1511            '',
1512            '',
1513        ],
1514        'Ự'       => [
1515            '1',
1516            '0',
1517            '',
1518            '',
1519        ],
1520        'UE'      => [
1521            '1',
1522            '0',
1523            '',
1524            '',
1525        ],
1526        'UI'      => [
1527            '1',
1528            '0',
1529            '1',
1530            '',
1531        ],
1532        'UJ'      => [
1533            '1',
1534            '0',
1535            '1',
1536            '',
1537        ],
1538        'UY'      => [
1539            '1',
1540            '0',
1541            '1',
1542            '',
1543        ],
1544        'UW'      => [
1545            '1',
1546            '0',
1547            '1',
1548            '',
1549            '0',
1550            '7',
1551            '7',
1552        ],
1553        'V'       => [
1554            '0',
1555            '7',
1556            '7',
1557            '7',
1558        ],
1559        'W'       => [
1560            '0',
1561            '7',
1562            '7',
1563            '7',
1564        ],
1565        'X'       => [
1566            '0',
1567            '5',
1568            '54',
1569            '54',
1570        ],
1571        'Y'       => [
1572            '1',
1573            '1',
1574            '',
1575            '',
1576        ],
1577        'Ý'       => [
1578            '1',
1579            '1',
1580            '',
1581            '',
1582        ],
1583        'Ỳ'       => [
1584            '1',
1585            '1',
1586            '',
1587            '',
1588        ],
1589        'Ỵ'       => [
1590            '1',
1591            '1',
1592            '',
1593            '',
1594        ],
1595        'Ỷ'       => [
1596            '1',
1597            '1',
1598            '',
1599            '',
1600        ],
1601        'Ỹ'       => [
1602            '1',
1603            '1',
1604            '',
1605            '',
1606        ],
1607        'Z'       => [
1608            '0',
1609            '4',
1610            '4',
1611            '4',
1612        ],
1613        'Ź'       => [
1614            '0',
1615            '4',
1616            '4',
1617            '4',
1618        ],
1619        'Ż'       => [
1620            '0',
1621            '4',
1622            '4',
1623            '4',
1624        ],
1625        'Ž'       => [
1626            '0',
1627            '4',
1628            '4',
1629            '4',
1630        ],
1631        'ZD'      => [
1632            '0',
1633            '2',
1634            '43',
1635            '43',
1636        ],
1637        'ZDZ'     => [
1638            '0',
1639            '2',
1640            '4',
1641            '4',
1642        ],
1643        'ZDZH'    => [
1644            '0',
1645            '2',
1646            '4',
1647            '4',
1648        ],
1649        'ZH'      => [
1650            '0',
1651            '4',
1652            '4',
1653            '4',
1654        ],
1655        'ZHD'     => [
1656            '0',
1657            '2',
1658            '43',
1659            '43',
1660        ],
1661        'ZHDZH'   => [
1662            '0',
1663            '2',
1664            '4',
1665            '4',
1666        ],
1667        'ZS'      => [
1668            '0',
1669            '4',
1670            '4',
1671            '4',
1672        ],
1673        'ZSCH'    => [
1674            '0',
1675            '4',
1676            '4',
1677            '4',
1678        ],
1679        'ZSH'     => [
1680            '0',
1681            '4',
1682            '4',
1683            '4',
1684        ],
1685        'ZZS'     => [
1686            '0',
1687            '4',
1688            '4',
1689            '4',
1690        ],
1691        // Cyrillic alphabet
1692        'А'       => [
1693            '1',
1694            '0',
1695            '',
1696            '',
1697        ],
1698        'Б'       => [
1699            '0',
1700            '7',
1701            '7',
1702            '7',
1703        ],
1704        'В'       => [
1705            '0',
1706            '7',
1707            '7',
1708            '7',
1709        ],
1710        'Г'       => [
1711            '0',
1712            '5',
1713            '5',
1714            '5',
1715        ],
1716        'Д'       => [
1717            '0',
1718            '3',
1719            '3',
1720            '3',
1721        ],
1722        'ДЗ'      => [
1723            '0',
1724            '4',
1725            '4',
1726            '4',
1727        ],
1728        'Е'       => [
1729            '1',
1730            '0',
1731            '',
1732            '',
1733        ],
1734        'Ё'       => [
1735            '1',
1736            '0',
1737            '',
1738            '',
1739        ],
1740        'Ж'       => [
1741            '0',
1742            '4',
1743            '4',
1744            '4',
1745        ],
1746        'З'       => [
1747            '0',
1748            '4',
1749            '4',
1750            '4',
1751        ],
1752        'И'       => [
1753            '1',
1754            '0',
1755            '',
1756            '',
1757        ],
1758        'Й'       => [
1759            '1',
1760            '1',
1761            '',
1762            '',
1763            '4',
1764            '4',
1765            '4',
1766        ],
1767        'К'       => [
1768            '0',
1769            '5',
1770            '5',
1771            '5',
1772        ],
1773        'Л'       => [
1774            '0',
1775            '8',
1776            '8',
1777            '8',
1778        ],
1779        'М'       => [
1780            '0',
1781            '6',
1782            '6',
1783            '6',
1784        ],
1785        'Н'       => [
1786            '0',
1787            '6',
1788            '6',
1789            '6',
1790        ],
1791        'О'       => [
1792            '1',
1793            '0',
1794            '',
1795            '',
1796        ],
1797        'П'       => [
1798            '0',
1799            '7',
1800            '7',
1801            '7',
1802        ],
1803        'Р'       => [
1804            '0',
1805            '9',
1806            '9',
1807            '9',
1808        ],
1809        'РЖ'      => [
1810            '0',
1811            '4',
1812            '4',
1813            '4',
1814        ],
1815        'С'       => [
1816            '0',
1817            '4',
1818            '4',
1819            '4',
1820        ],
1821        'Т'       => [
1822            '0',
1823            '3',
1824            '3',
1825            '3',
1826        ],
1827        'У'       => [
1828            '1',
1829            '0',
1830            '',
1831            '',
1832        ],
1833        'Ф'       => [
1834            '0',
1835            '7',
1836            '7',
1837            '7',
1838        ],
1839        'Х'       => [
1840            '0',
1841            '5',
1842            '5',
1843            '5',
1844        ],
1845        'Ц'       => [
1846            '0',
1847            '4',
1848            '4',
1849            '4',
1850        ],
1851        'Ч'       => [
1852            '0',
1853            '4',
1854            '4',
1855            '4',
1856        ],
1857        'Ш'       => [
1858            '0',
1859            '4',
1860            '4',
1861            '4',
1862        ],
1863        'Щ'       => [
1864            '0',
1865            '2',
1866            '4',
1867            '4',
1868        ],
1869        'Ъ'       => [
1870            '0',
1871            '',
1872            '',
1873            '',
1874        ],
1875        'Ы'       => [
1876            '0',
1877            '1',
1878            '',
1879            '',
1880        ],
1881        'Ь'       => [
1882            '0',
1883            '',
1884            '',
1885            '',
1886        ],
1887        'Э'       => [
1888            '1',
1889            '0',
1890            '',
1891            '',
1892        ],
1893        'Ю'       => [
1894            '0',
1895            '1',
1896            '',
1897            '',
1898        ],
1899        'Я'       => [
1900            '0',
1901            '1',
1902            '',
1903            '',
1904        ],
1905        // Greek alphabet
1906        'Α'       => [
1907            '1',
1908            '0',
1909            '',
1910            '',
1911        ],
1912        'Ά'       => [
1913            '1',
1914            '0',
1915            '',
1916            '',
1917        ],
1918        'ΑΙ'      => [
1919            '1',
1920            '0',
1921            '1',
1922            '',
1923        ],
1924        'ΑΥ'      => [
1925            '1',
1926            '0',
1927            '1',
1928            '',
1929        ],
1930        'Β'       => [
1931            '0',
1932            '7',
1933            '7',
1934            '7',
1935        ],
1936        'Γ'       => [
1937            '0',
1938            '5',
1939            '5',
1940            '5',
1941        ],
1942        'Δ'       => [
1943            '0',
1944            '3',
1945            '3',
1946            '3',
1947        ],
1948        'Ε'       => [
1949            '1',
1950            '0',
1951            '',
1952            '',
1953        ],
1954        'Έ'       => [
1955            '1',
1956            '0',
1957            '',
1958            '',
1959        ],
1960        'ΕΙ'      => [
1961            '1',
1962            '0',
1963            '1',
1964            '',
1965        ],
1966        'ΕΥ'      => [
1967            '1',
1968            '1',
1969            '1',
1970            '',
1971        ],
1972        'Ζ'       => [
1973            '0',
1974            '4',
1975            '4',
1976            '4',
1977        ],
1978        'Η'       => [
1979            '1',
1980            '0',
1981            '',
1982            '',
1983        ],
1984        'Ή'       => [
1985            '1',
1986            '0',
1987            '',
1988            '',
1989        ],
1990        'Θ'       => [
1991            '0',
1992            '3',
1993            '3',
1994            '3',
1995        ],
1996        'Ι'       => [
1997            '1',
1998            '0',
1999            '',
2000            '',
2001        ],
2002        'Ί'       => [
2003            '1',
2004            '0',
2005            '',
2006            '',
2007        ],
2008        'Ϊ'       => [
2009            '1',
2010            '0',
2011            '',
2012            '',
2013        ],
2014        'ΐ'       => [
2015            '1',
2016            '0',
2017            '',
2018            '',
2019        ],
2020        'Κ'       => [
2021            '0',
2022            '5',
2023            '5',
2024            '5',
2025        ],
2026        'Λ'       => [
2027            '0',
2028            '8',
2029            '8',
2030            '8',
2031        ],
2032        'Μ'       => [
2033            '0',
2034            '6',
2035            '6',
2036            '6',
2037        ],
2038        'ΜΠ'      => [
2039            '0',
2040            '7',
2041            '7',
2042            '7',
2043        ],
2044        'Ν'       => [
2045            '0',
2046            '6',
2047            '6',
2048            '6',
2049        ],
2050        'ΝΤ'      => [
2051            '0',
2052            '3',
2053            '3',
2054            '3',
2055        ],
2056        'Ξ'       => [
2057            '0',
2058            '5',
2059            '54',
2060            '54',
2061        ],
2062        'Ο'       => [
2063            '1',
2064            '0',
2065            '',
2066            '',
2067        ],
2068        'Ό'       => [
2069            '1',
2070            '0',
2071            '',
2072            '',
2073        ],
2074        'ΟΙ'      => [
2075            '1',
2076            '0',
2077            '1',
2078            '',
2079        ],
2080        'ΟΥ'      => [
2081            '1',
2082            '0',
2083            '1',
2084            '',
2085        ],
2086        'Π'       => [
2087            '0',
2088            '7',
2089            '7',
2090            '7',
2091        ],
2092        'Ρ'       => [
2093            '0',
2094            '9',
2095            '9',
2096            '9',
2097        ],
2098        'Σ'       => [
2099            '0',
2100            '4',
2101            '4',
2102            '4',
2103        ],
2104        'ς'       => [
2105            '0',
2106            '',
2107            '',
2108            '4',
2109        ],
2110        'Τ'       => [
2111            '0',
2112            '3',
2113            '3',
2114            '3',
2115        ],
2116        'ΤΖ'      => [
2117            '0',
2118            '4',
2119            '4',
2120            '4',
2121        ],
2122        'ΤΣ'      => [
2123            '0',
2124            '4',
2125            '4',
2126            '4',
2127        ],
2128        'Υ'       => [
2129            '1',
2130            '1',
2131            '',
2132            '',
2133        ],
2134        'Ύ'       => [
2135            '1',
2136            '1',
2137            '',
2138            '',
2139        ],
2140        'Ϋ'       => [
2141            '1',
2142            '1',
2143            '',
2144            '',
2145        ],
2146        'ΰ'       => [
2147            '1',
2148            '1',
2149            '',
2150            '',
2151        ],
2152        'ΥΚ'      => [
2153            '1',
2154            '5',
2155            '5',
2156            '5',
2157        ],
2158        'ΥΥ'      => [
2159            '1',
2160            '65',
2161            '65',
2162            '65',
2163        ],
2164        'Φ'       => [
2165            '0',
2166            '7',
2167            '7',
2168            '7',
2169        ],
2170        'Χ'       => [
2171            '0',
2172            '5',
2173            '5',
2174            '5',
2175        ],
2176        'Ψ'       => [
2177            '0',
2178            '7',
2179            '7',
2180            '7',
2181        ],
2182        'Ω'       => [
2183            '1',
2184            '0',
2185            '',
2186            '',
2187        ],
2188        'Ώ'       => [
2189            '1',
2190            '0',
2191            '',
2192            '',
2193        ],
2194        // Hebrew alphabet
2195        'א'       => [
2196            '1',
2197            '0',
2198            '',
2199            '',
2200        ],
2201        'או'      => [
2202            '1',
2203            '0',
2204            '7',
2205            '',
2206        ],
2207        'אג'      => [
2208            '1',
2209            '4',
2210            '4',
2211            '4',
2212            '5',
2213            '5',
2214            '5',
2215            '34',
2216            '34',
2217            '34',
2218        ],
2219        'בב'      => [
2220            '0',
2221            '7',
2222            '7',
2223            '7',
2224            '77',
2225            '77',
2226            '77',
2227        ],
2228        'ב'       => [
2229            '0',
2230            '7',
2231            '7',
2232            '7',
2233        ],
2234        'גג'      => [
2235            '0',
2236            '4',
2237            '4',
2238            '4',
2239            '5',
2240            '5',
2241            '5',
2242            '45',
2243            '45',
2244            '45',
2245            '55',
2246            '55',
2247            '55',
2248            '54',
2249            '54',
2250            '54',
2251        ],
2252        'גד'      => [
2253            '0',
2254            '43',
2255            '43',
2256            '43',
2257            '53',
2258            '53',
2259            '53',
2260        ],
2261        'גה'      => [
2262            '0',
2263            '45',
2264            '45',
2265            '45',
2266            '55',
2267            '55',
2268            '55',
2269        ],
2270        'גז'      => [
2271            '0',
2272            '44',
2273            '44',
2274            '44',
2275            '45',
2276            '45',
2277            '45',
2278        ],
2279        'גח'      => [
2280            '0',
2281            '45',
2282            '45',
2283            '45',
2284            '55',
2285            '55',
2286            '55',
2287        ],
2288        'גכ'      => [
2289            '0',
2290            '45',
2291            '45',
2292            '45',
2293            '55',
2294            '55',
2295            '55',
2296        ],
2297        'גך'      => [
2298            '0',
2299            '45',
2300            '45',
2301            '45',
2302            '55',
2303            '55',
2304            '55',
2305        ],
2306        'גצ'      => [
2307            '0',
2308            '44',
2309            '44',
2310            '44',
2311            '45',
2312            '45',
2313            '45',
2314        ],
2315        'גץ'      => [
2316            '0',
2317            '44',
2318            '44',
2319            '44',
2320            '45',
2321            '45',
2322            '45',
2323        ],
2324        'גק'      => [
2325            '0',
2326            '45',
2327            '45',
2328            '45',
2329            '54',
2330            '54',
2331            '54',
2332        ],
2333        'גש'      => [
2334            '0',
2335            '44',
2336            '44',
2337            '44',
2338            '54',
2339            '54',
2340            '54',
2341        ],
2342        'גת'      => [
2343            '0',
2344            '43',
2345            '43',
2346            '43',
2347            '53',
2348            '53',
2349            '53',
2350        ],
2351        'ג'       => [
2352            '0',
2353            '4',
2354            '4',
2355            '4',
2356            '5',
2357            '5',
2358            '5',
2359        ],
2360        'דז'      => [
2361            '0',
2362            '4',
2363            '4',
2364            '4',
2365        ],
2366        'דד'      => [
2367            '0',
2368            '3',
2369            '3',
2370            '3',
2371            '33',
2372            '33',
2373            '33',
2374        ],
2375        'דט'      => [
2376            '0',
2377            '33',
2378            '33',
2379            '33',
2380        ],
2381        'דש'      => [
2382            '0',
2383            '4',
2384            '4',
2385            '4',
2386        ],
2387        'דצ'      => [
2388            '0',
2389            '4',
2390            '4',
2391            '4',
2392        ],
2393        'דץ'      => [
2394            '0',
2395            '4',
2396            '4',
2397            '4',
2398        ],
2399        'ד'       => [
2400            '0',
2401            '3',
2402            '3',
2403            '3',
2404        ],
2405        'הג'      => [
2406            '0',
2407            '54',
2408            '54',
2409            '54',
2410            '55',
2411            '55',
2412            '55',
2413        ],
2414        'הכ'      => [
2415            '0',
2416            '55',
2417            '55',
2418            '55',
2419        ],
2420        'הח'      => [
2421            '0',
2422            '55',
2423            '55',
2424            '55',
2425        ],
2426        'הק'      => [
2427            '0',
2428            '55',
2429            '55',
2430            '55',
2431            '5',
2432            '5',
2433            '5',
2434        ],
2435        'הה'      => [
2436            '0',
2437            '5',
2438            '5',
2439            '',
2440            '55',
2441            '55',
2442            '',
2443        ],
2444        'ה'       => [
2445            '0',
2446            '5',
2447            '5',
2448            '',
2449        ],
2450        'וי'      => [
2451            '1',
2452            '',
2453            '',
2454            '',
2455            '7',
2456            '7',
2457            '7',
2458        ],
2459        'ו'       => [
2460            '1',
2461            '7',
2462            '7',
2463            '7',
2464            '7',
2465            '',
2466            '',
2467        ],
2468        'וו'      => [
2469            '1',
2470            '7',
2471            '7',
2472            '7',
2473            '7',
2474            '',
2475            '',
2476        ],
2477        'וופ'     => [
2478            '1',
2479            '7',
2480            '7',
2481            '7',
2482            '77',
2483            '77',
2484            '77',
2485        ],
2486        'זש'      => [
2487            '0',
2488            '4',
2489            '4',
2490            '4',
2491            '44',
2492            '44',
2493            '44',
2494        ],
2495        'זדז'     => [
2496            '0',
2497            '2',
2498            '4',
2499            '4',
2500        ],
2501        'ז'       => [
2502            '0',
2503            '4',
2504            '4',
2505            '4',
2506        ],
2507        'זג'      => [
2508            '0',
2509            '44',
2510            '44',
2511            '44',
2512            '45',
2513            '45',
2514            '45',
2515        ],
2516        'זז'      => [
2517            '0',
2518            '4',
2519            '4',
2520            '4',
2521            '44',
2522            '44',
2523            '44',
2524        ],
2525        'זס'      => [
2526            '0',
2527            '44',
2528            '44',
2529            '44',
2530        ],
2531        'זצ'      => [
2532            '0',
2533            '44',
2534            '44',
2535            '44',
2536        ],
2537        'זץ'      => [
2538            '0',
2539            '44',
2540            '44',
2541            '44',
2542        ],
2543        'חג'      => [
2544            '0',
2545            '54',
2546            '54',
2547            '54',
2548            '53',
2549            '53',
2550            '53',
2551        ],
2552        'חח'      => [
2553            '0',
2554            '5',
2555            '5',
2556            '5',
2557            '55',
2558            '55',
2559            '55',
2560        ],
2561        'חק'      => [
2562            '0',
2563            '55',
2564            '55',
2565            '55',
2566            '5',
2567            '5',
2568            '5',
2569        ],
2570        'חכ'      => [
2571            '0',
2572            '45',
2573            '45',
2574            '45',
2575            '55',
2576            '55',
2577            '55',
2578        ],
2579        'חס'      => [
2580            '0',
2581            '5',
2582            '54',
2583            '54',
2584        ],
2585        'חש'      => [
2586            '0',
2587            '5',
2588            '54',
2589            '54',
2590        ],
2591        'ח'       => [
2592            '0',
2593            '5',
2594            '5',
2595            '5',
2596        ],
2597        'טש'      => [
2598            '0',
2599            '4',
2600            '4',
2601            '4',
2602        ],
2603        'טד'      => [
2604            '0',
2605            '33',
2606            '33',
2607            '33',
2608        ],
2609        'טי'      => [
2610            '0',
2611            '3',
2612            '3',
2613            '3',
2614            '4',
2615            '4',
2616            '4',
2617            '3',
2618            '3',
2619            '34',
2620        ],
2621        'טת'      => [
2622            '0',
2623            '33',
2624            '33',
2625            '33',
2626        ],
2627        'טט'      => [
2628            '0',
2629            '3',
2630            '3',
2631            '3',
2632            '33',
2633            '33',
2634            '33',
2635        ],
2636        'ט'       => [
2637            '0',
2638            '3',
2639            '3',
2640            '3',
2641        ],
2642        'י'       => [
2643            '1',
2644            '1',
2645            '',
2646            '',
2647        ],
2648        'יא'      => [
2649            '1',
2650            '1',
2651            '',
2652            '',
2653            '1',
2654            '1',
2655            '1',
2656        ],
2657        'כג'      => [
2658            '0',
2659            '55',
2660            '55',
2661            '55',
2662            '54',
2663            '54',
2664            '54',
2665        ],
2666        'כש'      => [
2667            '0',
2668            '5',
2669            '54',
2670            '54',
2671        ],
2672        'כס'      => [
2673            '0',
2674            '5',
2675            '54',
2676            '54',
2677        ],
2678        'ככ'      => [
2679            '0',
2680            '5',
2681            '5',
2682            '5',
2683            '55',
2684            '55',
2685            '55',
2686        ],
2687        'כך'      => [
2688            '0',
2689            '5',
2690            '5',
2691            '5',
2692            '55',
2693            '55',
2694            '55',
2695        ],
2696        'כ'       => [
2697            '0',
2698            '5',
2699            '5',
2700            '5',
2701        ],
2702        'כח'      => [
2703            '0',
2704            '55',
2705            '55',
2706            '55',
2707            '5',
2708            '5',
2709            '5',
2710        ],
2711        'ך'       => [
2712            '0',
2713            '',
2714            '5',
2715            '5',
2716        ],
2717        'ל'       => [
2718            '0',
2719            '8',
2720            '8',
2721            '8',
2722        ],
2723        'לל'      => [
2724            '0',
2725            '88',
2726            '88',
2727            '88',
2728            '8',
2729            '8',
2730            '8',
2731        ],
2732        'מנ'      => [
2733            '0',
2734            '66',
2735            '66',
2736            '66',
2737        ],
2738        'מן'      => [
2739            '0',
2740            '66',
2741            '66',
2742            '66',
2743        ],
2744        'ממ'      => [
2745            '0',
2746            '6',
2747            '6',
2748            '6',
2749            '66',
2750            '66',
2751            '66',
2752        ],
2753        'מם'      => [
2754            '0',
2755            '6',
2756            '6',
2757            '6',
2758            '66',
2759            '66',
2760            '66',
2761        ],
2762        'מ'       => [
2763            '0',
2764            '6',
2765            '6',
2766            '6',
2767        ],
2768        'ם'       => [
2769            '0',
2770            '',
2771            '6',
2772            '6',
2773        ],
2774        'נמ'      => [
2775            '0',
2776            '66',
2777            '66',
2778            '66',
2779        ],
2780        'נם'      => [
2781            '0',
2782            '66',
2783            '66',
2784            '66',
2785        ],
2786        'ננ'      => [
2787            '0',
2788            '6',
2789            '6',
2790            '6',
2791            '66',
2792            '66',
2793            '66',
2794        ],
2795        'נן'      => [
2796            '0',
2797            '6',
2798            '6',
2799            '6',
2800            '66',
2801            '66',
2802            '66',
2803        ],
2804        'נ'       => [
2805            '0',
2806            '6',
2807            '6',
2808            '6',
2809        ],
2810        'ן'       => [
2811            '0',
2812            '',
2813            '6',
2814            '6',
2815        ],
2816        'סתש'     => [
2817            '0',
2818            '2',
2819            '4',
2820            '4',
2821        ],
2822        'סתז'     => [
2823            '0',
2824            '2',
2825            '4',
2826            '4',
2827        ],
2828        'סטז'     => [
2829            '0',
2830            '2',
2831            '4',
2832            '4',
2833        ],
2834        'סטש'     => [
2835            '0',
2836            '2',
2837            '4',
2838            '4',
2839        ],
2840        'סצד'     => [
2841            '0',
2842            '2',
2843            '4',
2844            '4',
2845        ],
2846        'סט'      => [
2847            '0',
2848            '2',
2849            '4',
2850            '4',
2851            '43',
2852            '43',
2853            '43',
2854        ],
2855        'סת'      => [
2856            '0',
2857            '2',
2858            '4',
2859            '4',
2860            '43',
2861            '43',
2862            '43',
2863        ],
2864        'סג'      => [
2865            '0',
2866            '44',
2867            '44',
2868            '44',
2869            '4',
2870            '4',
2871            '4',
2872        ],
2873        'סס'      => [
2874            '0',
2875            '4',
2876            '4',
2877            '4',
2878            '44',
2879            '44',
2880            '44',
2881        ],
2882        'סצ'      => [
2883            '0',
2884            '44',
2885            '44',
2886            '44',
2887        ],
2888        'סץ'      => [
2889            '0',
2890            '44',
2891            '44',
2892            '44',
2893        ],
2894        'סז'      => [
2895            '0',
2896            '44',
2897            '44',
2898            '44',
2899        ],
2900        'סש'      => [
2901            '0',
2902            '44',
2903            '44',
2904            '44',
2905        ],
2906        'ס'       => [
2907            '0',
2908            '4',
2909            '4',
2910            '4',
2911        ],
2912        'ע'       => [
2913            '1',
2914            '0',
2915            '',
2916            '',
2917        ],
2918        'פב'      => [
2919            '0',
2920            '7',
2921            '7',
2922            '7',
2923            '77',
2924            '77',
2925            '77',
2926        ],
2927        'פוו'     => [
2928            '0',
2929            '7',
2930            '7',
2931            '7',
2932            '77',
2933            '77',
2934            '77',
2935        ],
2936        'פפ'      => [
2937            '0',
2938            '7',
2939            '7',
2940            '7',
2941            '77',
2942            '77',
2943            '77',
2944        ],
2945        'פף'      => [
2946            '0',
2947            '7',
2948            '7',
2949            '7',
2950            '77',
2951            '77',
2952            '77',
2953        ],
2954        'פ'       => [
2955            '0',
2956            '7',
2957            '7',
2958            '7',
2959        ],
2960        'ף'       => [
2961            '0',
2962            '',
2963            '7',
2964            '7',
2965        ],
2966        'צג'      => [
2967            '0',
2968            '44',
2969            '44',
2970            '44',
2971            '45',
2972            '45',
2973            '45',
2974        ],
2975        'צז'      => [
2976            '0',
2977            '44',
2978            '44',
2979            '44',
2980        ],
2981        'צס'      => [
2982            '0',
2983            '44',
2984            '44',
2985            '44',
2986        ],
2987        'צצ'      => [
2988            '0',
2989            '4',
2990            '4',
2991            '4',
2992            '5',
2993            '5',
2994            '5',
2995            '44',
2996            '44',
2997            '44',
2998            '54',
2999            '54',
3000            '54',
3001            '45',
3002            '45',
3003            '45',
3004        ],
3005        'צץ'      => [
3006            '0',
3007            '4',
3008            '4',
3009            '4',
3010            '5',
3011            '5',
3012            '5',
3013            '44',
3014            '44',
3015            '44',
3016            '54',
3017            '54',
3018            '54',
3019        ],
3020        'צש'      => [
3021            '0',
3022            '44',
3023            '44',
3024            '44',
3025            '4',
3026            '4',
3027            '4',
3028            '5',
3029            '5',
3030            '5',
3031        ],
3032        'צ'       => [
3033            '0',
3034            '4',
3035            '4',
3036            '4',
3037            '5',
3038            '5',
3039            '5',
3040        ],
3041        'ץ'       => [
3042            '0',
3043            '',
3044            '4',
3045            '4',
3046        ],
3047        'קה'      => [
3048            '0',
3049            '55',
3050            '55',
3051            '5',
3052        ],
3053        'קס'      => [
3054            '0',
3055            '5',
3056            '54',
3057            '54',
3058        ],
3059        'קש'      => [
3060            '0',
3061            '5',
3062            '54',
3063            '54',
3064        ],
3065        'קק'      => [
3066            '0',
3067            '5',
3068            '5',
3069            '5',
3070            '55',
3071            '55',
3072            '55',
3073        ],
3074        'קח'      => [
3075            '0',
3076            '55',
3077            '55',
3078            '55',
3079        ],
3080        'קכ'      => [
3081            '0',
3082            '55',
3083            '55',
3084            '55',
3085        ],
3086        'קך'      => [
3087            '0',
3088            '55',
3089            '55',
3090            '55',
3091        ],
3092        'קג'      => [
3093            '0',
3094            '55',
3095            '55',
3096            '55',
3097            '54',
3098            '54',
3099            '54',
3100        ],
3101        'ק'       => [
3102            '0',
3103            '5',
3104            '5',
3105            '5',
3106        ],
3107        'רר'      => [
3108            '0',
3109            '99',
3110            '99',
3111            '99',
3112            '9',
3113            '9',
3114            '9',
3115        ],
3116        'ר'       => [
3117            '0',
3118            '9',
3119            '9',
3120            '9',
3121        ],
3122        'שטז'     => [
3123            '0',
3124            '2',
3125            '4',
3126            '4',
3127        ],
3128        'שתש'     => [
3129            '0',
3130            '2',
3131            '4',
3132            '4',
3133        ],
3134        'שתז'     => [
3135            '0',
3136            '2',
3137            '4',
3138            '4',
3139        ],
3140        'שטש'     => [
3141            '0',
3142            '2',
3143            '4',
3144            '4',
3145        ],
3146        'שד'      => [
3147            '0',
3148            '2',
3149            '43',
3150            '43',
3151        ],
3152        'שז'      => [
3153            '0',
3154            '44',
3155            '44',
3156            '44',
3157        ],
3158        'שס'      => [
3159            '0',
3160            '44',
3161            '44',
3162            '44',
3163        ],
3164        'שת'      => [
3165            '0',
3166            '2',
3167            '43',
3168            '43',
3169        ],
3170        'שג'      => [
3171            '0',
3172            '4',
3173            '4',
3174            '4',
3175            '44',
3176            '44',
3177            '44',
3178            '4',
3179            '43',
3180            '43',
3181        ],
3182        'שט'      => [
3183            '0',
3184            '2',
3185            '43',
3186            '43',
3187            '44',
3188            '44',
3189            '44',
3190        ],
3191        'שצ'      => [
3192            '0',
3193            '44',
3194            '44',
3195            '44',
3196            '45',
3197            '45',
3198            '45',
3199        ],
3200        'שץ'      => [
3201            '0',
3202            '44',
3203            '',
3204            '44',
3205            '45',
3206            '',
3207            '45',
3208        ],
3209        'שש'      => [
3210            '0',
3211            '4',
3212            '4',
3213            '4',
3214            '44',
3215            '44',
3216            '44',
3217        ],
3218        'ש'       => [
3219            '0',
3220            '4',
3221            '4',
3222            '4',
3223        ],
3224        'תג'      => [
3225            '0',
3226            '34',
3227            '34',
3228            '34',
3229        ],
3230        'תז'      => [
3231            '0',
3232            '34',
3233            '34',
3234            '34',
3235        ],
3236        'תש'      => [
3237            '0',
3238            '4',
3239            '4',
3240            '4',
3241        ],
3242        'תת'      => [
3243            '0',
3244            '3',
3245            '3',
3246            '3',
3247            '4',
3248            '4',
3249            '4',
3250            '33',
3251            '33',
3252            '33',
3253            '44',
3254            '44',
3255            '44',
3256            '34',
3257            '34',
3258            '34',
3259            '43',
3260            '43',
3261            '43',
3262        ],
3263        'ת'       => [
3264            '0',
3265            '3',
3266            '3',
3267            '3',
3268            '4',
3269            '4',
3270            '4',
3271        ],
3272        // Arabic alphabet
3273        'ا'       => [
3274            '1',
3275            '0',
3276            '',
3277            '',
3278        ],
3279        'ب'       => [
3280            '0',
3281            '7',
3282            '7',
3283            '7',
3284        ],
3285        'ت'       => [
3286            '0',
3287            '3',
3288            '3',
3289            '3',
3290        ],
3291        'ث'       => [
3292            '0',
3293            '3',
3294            '3',
3295            '3',
3296        ],
3297        'ج'       => [
3298            '0',
3299            '4',
3300            '4',
3301            '4',
3302        ],
3303        'ح'       => [
3304            '0',
3305            '5',
3306            '5',
3307            '5',
3308        ],
3309        'خ'       => [
3310            '0',
3311            '5',
3312            '5',
3313            '5',
3314        ],
3315        'د'       => [
3316            '0',
3317            '3',
3318            '3',
3319            '3',
3320        ],
3321        'ذ'       => [
3322            '0',
3323            '3',
3324            '3',
3325            '3',
3326        ],
3327        'ر'       => [
3328            '0',
3329            '9',
3330            '9',
3331            '9',
3332        ],
3333        'ز'       => [
3334            '0',
3335            '4',
3336            '4',
3337            '4',
3338        ],
3339        'س'       => [
3340            '0',
3341            '4',
3342            '4',
3343            '4',
3344        ],
3345        'ش'       => [
3346            '0',
3347            '4',
3348            '4',
3349            '4',
3350        ],
3351        'ص'       => [
3352            '0',
3353            '4',
3354            '4',
3355            '4',
3356        ],
3357        'ض'       => [
3358            '0',
3359            '3',
3360            '3',
3361            '3',
3362        ],
3363        'ط'       => [
3364            '0',
3365            '3',
3366            '3',
3367            '3',
3368        ],
3369        'ظ'       => [
3370            '0',
3371            '4',
3372            '4',
3373            '4',
3374        ],
3375        'ع'       => [
3376            '1',
3377            '0',
3378            '',
3379            '',
3380        ],
3381        'غ'       => [
3382            '0',
3383            '0',
3384            '',
3385            '',
3386        ],
3387        'ف'       => [
3388            '0',
3389            '7',
3390            '7',
3391            '7',
3392        ],
3393        'ق'       => [
3394            '0',
3395            '5',
3396            '5',
3397            '5',
3398        ],
3399        'ك'       => [
3400            '0',
3401            '5',
3402            '5',
3403            '5',
3404        ],
3405        'ل'       => [
3406            '0',
3407            '8',
3408            '8',
3409            '8',
3410        ],
3411        'لا'      => [
3412            '0',
3413            '8',
3414            '8',
3415            '8',
3416        ],
3417        'م'       => [
3418            '0',
3419            '6',
3420            '6',
3421            '6',
3422        ],
3423        'ن'       => [
3424            '0',
3425            '6',
3426            '6',
3427            '6',
3428        ],
3429        'هن'      => [
3430            '0',
3431            '66',
3432            '66',
3433            '66',
3434        ],
3435        'ه'       => [
3436            '0',
3437            '5',
3438            '5',
3439            '',
3440        ],
3441        'و'       => [
3442            '1',
3443            '',
3444            '',
3445            '',
3446            '7',
3447            '',
3448            '',
3449        ],
3450        'ي'       => [
3451            '0',
3452            '1',
3453            '',
3454            '',
3455        ],
3456        'آ'       => [
3457            '0',
3458            '1',
3459            '',
3460            '',
3461        ],
3462        'ة'       => [
3463            '0',
3464            '',
3465            '',
3466            '3',
3467        ],
3468        'ی'       => [
3469            '0',
3470            '1',
3471            '',
3472            '',
3473        ],
3474        'ى'       => [
3475            '1',
3476            '1',
3477            '',
3478            '',
3479        ],
3480    ];
3481
/**
 * List the phonetic matching algorithms offered by this class.
 *
 * @return array<string> Translated algorithm names, keyed by algorithm code ('std' and 'dm').
 */
public static function getAlgorithms(): array
{
    $algorithms = [];

    /* I18N: https://en.wikipedia.org/wiki/Soundex */
    $algorithms['std'] = I18N::translate('Russell');

    /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
    $algorithms['dm'] = I18N::translate('Daitch-Mokotoff');

    return $algorithms;
}
3496
/**
 * Do two soundex strings have at least one code in common?
 *
 * Each argument is a ':'-separated list of codes. An empty argument never matches anything.
 *
 * @param string $soundex1
 * @param string $soundex2
 *
 * @return bool
 */
public static function compare(string $soundex1, string $soundex2): bool
{
    // Nothing to compare against: no match
    if ($soundex1 === '' || $soundex2 === '') {
        return false;
    }

    $codes1 = explode(':', $soundex1);
    $codes2 = explode(':', $soundex2);
    $shared = array_intersect($codes1, $codes2);

    return count($shared) > 0;
}
3513
/**
 * Generate Russell soundex codes for a given text.
 *
 * The text is split on spaces and each word is coded separately. When there is more
 * than one word, the words are also coded once more run together. Empty words (caused
 * by leading, trailing or repeated spaces) and texts without any letters produce no
 * code at all, so the result never contains an empty or '0000' entry.
 *
 * @param string $text
 *
 * @return string Up to 51 distinct codes, separated by ':'. Empty if nothing could be coded.
 */
public static function russell(string $text): string
{
    $words      = explode(' ', $text);
    $candidates = $words;

    // Combine words, e.g. “New York” as “Newyork”
    if (count($words) > 1) {
        $candidates[] = str_replace(' ', '', $text);
    }

    $soundex_array = [];

    foreach ($candidates as $candidate) {
        // soundex() has no code for an empty string, so skip it rather than store an empty entry
        if ($candidate === '') {
            continue;
        }

        $soundex = soundex($candidate);

        // Only return codes from recognisable sounds ('0000' means "no letters found")
        if ($soundex !== '0000') {
            $soundex_array[] = $soundex;
        }
    }

    // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
    $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

    return implode(':', $soundex_array);
}
3545
/**
 * Generate Daitch–Mokotoff soundex codes for a given text.
 *
 * The text is split on spaces and each word is coded separately. When there is more
 * than one word, the words are also coded once more run together.
 *
 * @param string $text
 *
 * @return string Up to 36 distinct codes, separated by ':'.
 */
public static function daitchMokotoff(string $text): string
{
    $words  = explode(' ', $text);
    $inputs = $words;

    // Combine words, e.g. “New York” as “Newyork”
    if (count($words) > 1) {
        $inputs[] = str_replace(' ', '', $text);
    }

    // Gather the codes in order: individual words first, then the combined form
    $codes = [];

    foreach ($inputs as $input) {
        foreach (self::daitchMokotoffWord($input) as $code) {
            $codes[] = $code;
        }
    }

    // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
    $codes = array_slice(array_unique($codes), 0, 36);

    return implode(':', $codes);
}
3571
/**
 * Calculate the Daitch-Mokotoff soundex for a word.
 *
 * The word is upper-cased, rewritten using TRANSFORM_NAMES, and then scanned from left
 * to right using byte offsets. At each position the longest key found in DM_SOUNDS is
 * consumed; a byte that starts no key is skipped.
 *
 * Layout of a DM_SOUNDS entry, as used here:
 *  - index 0 is a flag; any value other than '0' makes the entry count as a vowel.
 *  - indexes 1, 2 and 3 hold the sound for "start of word", "before a vowel" and
 *    "anywhere else" respectively.
 *  - further groups of three hold alternative sounds for the same three situations.
 *    Every alternative forks the set of partial codes.
 *
 * @param string $name
 *
 * @return array<string> List of possible DM codes for the word.
 */
private static function daitchMokotoffWord(string $name): array
{
    // Apply special transformation rules to the input string
    $name = I18N::strtoupper($name);
    foreach (self::TRANSFORM_NAMES as $transformRule) {
        $name = str_replace($transformRule[0], $transformRule[1], $name);
    }

    // Initialize
    $name_script = I18N::textScript($name);
    $noVowels    = ($name_script === 'Hebr' || $name_script === 'Arab');

    $lastPos       = strlen($name) - 1; // offsets are in bytes, not characters
    $currPos       = 0;
    $state         = 1; // 1: start of input string, 2: before vowel, 3: other
    $result        = []; // accumulate complete 6-digit D-M codes here
    $partialResult = [['!']]; // 1st partial result ('!' stops "duplicate sound" check)

    // Loop through the input string.
    // Stop when the string is exhausted or when no more partial results remain
    while (count($partialResult) !== 0 && $currPos <= $lastPos) {
        // Find the DM coding table entry for the chunk at the current position
        $thisEntry = self::daitchMokotoffTableKey($name, $currPos);

        if ($thisEntry === '') {
            $currPos++; // Not in table: advance pointer to next byte
            continue; // and try again
        }

        $soundTableEntry = self::DM_SOUNDS[$thisEntry];
        $workingResult   = $partialResult;
        $partialResult   = [];
        $currPos         += strlen($thisEntry);

        // Not at beginning of input string: the state depends on what follows this chunk
        if ($state !== 1) {
            $nextEntry = '';

            if ($currPos <= $lastPos) {
                $nextEntry = self::daitchMokotoffTableKey($name, $currPos);
            }

            if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
                // Next chunk is a vowel
                $state = 2;
            } else {
                // Next chunk is not a vowel, is not in the table, or the input is exhausted
                $state = 3;
            }
        }

        // Visit the sound for this state in each triplet of alternatives
        while ($state < count($soundTableEntry)) {
            $sound = $soundTableEntry[$state];

            foreach ($workingResult as $workingEntry) {
                $lastIndex = count($workingEntry) - 1;

                // empty means 'ignore this sound in this state'
                if ($sound === '') {
                    $workingEntry[$lastIndex] .= '!'; // Prevent false 'doubles'
                    $partialResult[]          = $workingEntry;
                    continue;
                }

                // A sound that repeats the previous one is dropped, except for Hebrew and
                // Arabic text, where it is appended anyway (this branch only ever produces
                // the doubled form).
                if ($sound !== $workingEntry[$lastIndex] || $noVowels) {
                    $workingEntry[] = $sound;
                }

                if (count($workingEntry) < 7) {
                    $partialResult[] = $workingEntry;
                    continue;
                }

                // This is the 6th code in the sequence
                // We're looking for 7 entries because the first is '!' and doesn't count
                $code = self::daitchMokotoffCode($workingEntry);

                if ($code !== '') {
                    $result[] = $code;
                }
            }

            $state += 3; // Advance to next triplet while keeping the same basic state
        }
    }

    // Zero-fill and copy all remaining partial results
    foreach ($partialResult as $workingEntry) {
        $code = self::daitchMokotoffCode($workingEntry);

        if ($code !== '') {
            $result[] = $code;
        }
    }

    return $result;
}

/**
 * Find the longest key of DM_SOUNDS that starts at a given byte offset.
 *
 * Starts with a chunk of MAXCHAR bytes and drops one trailing byte at a time until
 * the chunk is a key of the table.
 *
 * @param string $name   Text being encoded
 * @param int    $offset Byte offset into $name; callers only pass offsets inside the string
 *
 * @return string The matching key, or '' if no key starts at this offset.
 */
private static function daitchMokotoffTableKey(string $name, int $offset): string
{
    $chunk = substr($name, $offset, self::MAXCHAR); // Get maximum length chunk

    while ($chunk !== '' && !isset(self::DM_SOUNDS[$chunk])) {
        $chunk = substr($chunk, 0, -1); // Not in table: try a shorter chunk
    }

    return $chunk;
}

/**
 * Turn a list of accumulated sounds into a 6-digit code.
 *
 * The '!' markers are removed, and the digits are zero-filled or truncated to 6 characters.
 *
 * @param array<string> $sounds
 *
 * @return string The 6-digit code, or '' when the sounds reduce to nothing or to a lone '0'.
 */
private static function daitchMokotoffCode(array $sounds): string
{
    $digits = str_replace('!', '', implode('', $sounds));

    // Only return codes from recognisable sounds.
    // NOTE(review): a lone '0' (e.g. a word coded only as an initial vowel) is discarded
    // as well as the empty string, because the earlier truthiness test treated '0' as
    // false. Kept as-is; confirm that this is intended.
    if ($digits === '' || $digits === '0') {
        return '';
    }

    return substr($digits . '000000', 0, 6);
}
3690}
3691