xref: /webtrees/app/Soundex.php (revision 3976b4703df669696105ed6b024b96d433c8fbdb)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2019 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17declare(strict_types=1);
18
19namespace Fisharebest\Webtrees;
20
21/**
22 * Phonetic matching of strings.
23 */
24class Soundex
25{
26    // Determine the Daitch–Mokotoff Soundex code for a word
27    // Original implementation by Gerry Kroll, and analysis by Meliza Amity
28
29    // Max. DM_SOUNDS table key length, counted in bytes -- NOT in UTF-8 characters (e.g. 'SCHTSCH' is 7 bytes)
30    private const MAXCHAR = 7;
31
32    /**
33     * Name transformation arrays.
34     * Used to transform the Name string to simplify the "sounds like" table.
35     * This is especially useful in Hebrew.
36     *
37     * Each array entry is a [from, to] pair: the "from" and "to" arguments of a
38     * preg($from, $to, $text)-style function call to achieve the desired transformations.
39     *
40     * Note about the use of "\x01":
41     * This code, which can’t legitimately occur in the kind of text we're dealing with,
42     * is used as a place-holder so that conditional string replacements can be done.
     *
     * How the "\x01" rules fit together, in the order they are listed:
     *  1. any "\x01" already present is replaced by the empty string;
     *  2. the double yod in "yod-yod-he$" and "yod-yod-ayin$" is replaced by "\x01";
     *  3. every double yod that is left is replaced by ayin;
     *  4. "\x01" is replaced by a double yod again.
     *
     * NOTE(review): the rules look order-sensitive (the ligatures are expanded first,
     * and later rules match the expanded letters), so they are presumably applied in
     * array order -- confirm against the code that iterates this table.
     *
     * NOTE(review): the "$" in two of the "from" strings only works as an
     * end-of-string anchor if the consumer applies the rules as regular expressions;
     * with a plain string replacement it would be matched literally -- TODO confirm.
43     */
44    private const TRANSFORM_NAMES = [
45        // Force Yiddish ligatures to be treated as separate letters
46        [
47            'װ',
48            'וו',
49        ],
50        [
51            'ײ',
52            'יי',
53        ],
54        [
55            'ױ',
56            'וי',
57        ],
        // Vav after bet or pe becomes ayin
58        [
59            'בו',
60            'בע',
61        ],
62        [
63            'פו',
64            'פע',
65        ],
        // Vav before mem or nun (regular and final forms) becomes ayin
66        [
67            'ומ',
68            'עמ',
69        ],
70        [
71            'ום',
72            'עם',
73        ],
74        [
75            'ונ',
76            'ענ',
77        ],
78        [
79            'ון',
80            'ען',
81        ],
        // Double vav (including one expanded from the ligature above) becomes bet
82        [
83            'וו',
84            'ב',
85        ],
        // Drop any "\x01" already in the text, so it can serve as a place-holder below
86        [
87            "\x01",
88            '',
89        ],
        // Shield the double yod of "yod-yod-he$" / "yod-yod-ayin$" behind the place-holder
90        [
91            'ייה$',
92            "\x01ה",
93        ],
94        [
95            'ייע$',
96            "\x01ע",
97        ],
        // Every other double yod becomes ayin
98        [
99            'יי',
100            'ע',
101        ],
        // Turn the place-holder back into a double yod
102        [
103            "\x01",
104            'יי',
105        ],
106    ];
107
108    /**
109     * The DM sound coding table is organized this way:
110     * key: a variable-length string that corresponds to the UTF-8 character sequence
111     * represented by the table entry. Currently, that string can be up to 7
112     * bytes long. This maximum length is defined by the class constant
113     * self::MAXCHAR.
114     *
115     * value: an array as follows:
116     * [0]:  zero if not a vowel
117     * [1]:  sound value when this string is at the beginning of the word
118     * [2]:  sound value when this string is followed by a vowel
119     * [3]:  sound value for other cases
120     * [1],[2],[3] can be repeated several times to create branches in the code
121     * an empty sound value means "ignore in this state"
122     */
123    private const DM_SOUNDS = [
124        'A'       => [
125            '1',
126            '0',
127            '',
128            '',
129        ],
130        'À'       => [
131            '1',
132            '0',
133            '',
134            '',
135        ],
136        'Á'       => [
137            '1',
138            '0',
139            '',
140            '',
141        ],
142        'Â'       => [
143            '1',
144            '0',
145            '',
146            '',
147        ],
148        'Ã'       => [
149            '1',
150            '0',
151            '',
152            '',
153        ],
154        'Ä'       => [
155            '1',
156            '0',
157            '1',
158            '',
159            '0',
160            '',
161            '',
162        ],
163        'Å'       => [
164            '1',
165            '0',
166            '',
167            '',
168        ],
169        'Ă'       => [
170            '1',
171            '0',
172            '',
173            '',
174        ],
175        'Ą'       => [
176            '1',
177            '',
178            '',
179            '',
180            '',
181            '',
182            '6',
183        ],
184        'Ạ'       => [
185            '1',
186            '0',
187            '',
188            '',
189        ],
190        'Ả'       => [
191            '1',
192            '0',
193            '',
194            '',
195        ],
196        'Ấ'       => [
197            '1',
198            '0',
199            '',
200            '',
201        ],
202        'Ầ'       => [
203            '1',
204            '0',
205            '',
206            '',
207        ],
208        'Ẩ'       => [
209            '1',
210            '0',
211            '',
212            '',
213        ],
214        'Ẫ'       => [
215            '1',
216            '0',
217            '',
218            '',
219        ],
220        'Ậ'       => [
221            '1',
222            '0',
223            '',
224            '',
225        ],
226        'Ắ'       => [
227            '1',
228            '0',
229            '',
230            '',
231        ],
232        'Ằ'       => [
233            '1',
234            '0',
235            '',
236            '',
237        ],
238        'Ẳ'       => [
239            '1',
240            '0',
241            '',
242            '',
243        ],
244        'Ẵ'       => [
245            '1',
246            '0',
247            '',
248            '',
249        ],
250        'Ặ'       => [
251            '1',
252            '0',
253            '',
254            '',
255        ],
256        'AE'      => [
257            '1',
258            '0',
259            '1',
260            '',
261        ],
262        'Æ'       => [
263            '1',
264            '0',
265            '1',
266            '',
267        ],
268        'AI'      => [
269            '1',
270            '0',
271            '1',
272            '',
273        ],
274        'AJ'      => [
275            '1',
276            '0',
277            '1',
278            '',
279        ],
280        'AU'      => [
281            '1',
282            '0',
283            '7',
284            '',
285        ],
286        'AV'      => [
287            '1',
288            '0',
289            '7',
290            '',
291            '7',
292            '7',
293            '7',
294        ],
295        'ÄU'      => [
296            '1',
297            '0',
298            '1',
299            '',
300        ],
301        'AY'      => [
302            '1',
303            '0',
304            '1',
305            '',
306        ],
307        'B'       => [
308            '0',
309            '7',
310            '7',
311            '7',
312        ],
313        'C'       => [
314            '0',
315            '5',
316            '5',
317            '5',
318            '34',
319            '4',
320            '4',
321        ],
322        'Ć'       => [
323            '0',
324            '4',
325            '4',
326            '4',
327        ],
328        'Č'       => [
329            '0',
330            '4',
331            '4',
332            '4',
333        ],
334        'Ç'       => [
335            '0',
336            '4',
337            '4',
338            '4',
339        ],
340        'CH'      => [
341            '0',
342            '5',
343            '5',
344            '5',
345            '34',
346            '4',
347            '4',
348        ],
349        'CHS'     => [
350            '0',
351            '5',
352            '54',
353            '54',
354        ],
355        'CK'      => [
356            '0',
357            '5',
358            '5',
359            '5',
360            '45',
361            '45',
362            '45',
363        ],
364        'CCS'     => [
365            '0',
366            '4',
367            '4',
368            '4',
369        ],
370        'CS'      => [
371            '0',
372            '4',
373            '4',
374            '4',
375        ],
376        'CSZ'     => [
377            '0',
378            '4',
379            '4',
380            '4',
381        ],
382        'CZ'      => [
383            '0',
384            '4',
385            '4',
386            '4',
387        ],
388        'CZS'     => [
389            '0',
390            '4',
391            '4',
392            '4',
393        ],
394        'D'       => [
395            '0',
396            '3',
397            '3',
398            '3',
399        ],
400        'Ď'       => [
401            '0',
402            '3',
403            '3',
404            '3',
405        ],
406        'Đ'       => [
407            '0',
408            '3',
409            '3',
410            '3',
411        ],
412        'DRS'     => [
413            '0',
414            '4',
415            '4',
416            '4',
417        ],
418        'DRZ'     => [
419            '0',
420            '4',
421            '4',
422            '4',
423        ],
424        'DS'      => [
425            '0',
426            '4',
427            '4',
428            '4',
429        ],
430        'DSH'     => [
431            '0',
432            '4',
433            '4',
434            '4',
435        ],
436        'DSZ'     => [
437            '0',
438            '4',
439            '4',
440            '4',
441        ],
442        'DT'      => [
443            '0',
444            '3',
445            '3',
446            '3',
447        ],
448        'DDZ'     => [
449            '0',
450            '4',
451            '4',
452            '4',
453        ],
454        'DDZS'    => [
455            '0',
456            '4',
457            '4',
458            '4',
459        ],
460        'DZ'      => [
461            '0',
462            '4',
463            '4',
464            '4',
465        ],
466        'DŹ'      => [
467            '0',
468            '4',
469            '4',
470            '4',
471        ],
472        'DŻ'      => [
473            '0',
474            '4',
475            '4',
476            '4',
477        ],
478        'DZH'     => [
479            '0',
480            '4',
481            '4',
482            '4',
483        ],
484        'DZS'     => [
485            '0',
486            '4',
487            '4',
488            '4',
489        ],
490        'E'       => [
491            '1',
492            '0',
493            '',
494            '',
495        ],
496        'È'       => [
497            '1',
498            '0',
499            '',
500            '',
501        ],
502        'É'       => [
503            '1',
504            '0',
505            '',
506            '',
507        ],
508        'Ê'       => [
509            '1',
510            '0',
511            '',
512            '',
513        ],
514        'Ë'       => [
515            '1',
516            '0',
517            '',
518            '',
519        ],
520        'Ĕ'       => [
521            '1',
522            '0',
523            '',
524            '',
525        ],
526        'Ė'       => [
527            '1',
528            '0',
529            '',
530            '',
531        ],
532        'Ę'       => [
533            '1',
534            '',
535            '',
536            '6',
537            '',
538            '',
539            '',
540        ],
541        'Ẹ'       => [
542            '1',
543            '0',
544            '',
545            '',
546        ],
547        'Ẻ'       => [
548            '1',
549            '0',
550            '',
551            '',
552        ],
553        'Ẽ'       => [
554            '1',
555            '0',
556            '',
557            '',
558        ],
559        'Ế'       => [
560            '1',
561            '0',
562            '',
563            '',
564        ],
565        'Ề'       => [
566            '1',
567            '0',
568            '',
569            '',
570        ],
571        'Ể'       => [
572            '1',
573            '0',
574            '',
575            '',
576        ],
577        'Ễ'       => [
578            '1',
579            '0',
580            '',
581            '',
582        ],
583        'Ệ'       => [
584            '1',
585            '0',
586            '',
587            '',
588        ],
589        'EAU'     => [
590            '1',
591            '0',
592            '',
593            '',
594        ],
595        'EI'      => [
596            '1',
597            '0',
598            '1',
599            '',
600        ],
601        'EJ'      => [
602            '1',
603            '0',
604            '1',
605            '',
606        ],
607        'EU'      => [
608            '1',
609            '1',
610            '1',
611            '',
612        ],
613        'EY'      => [
614            '1',
615            '0',
616            '1',
617            '',
618        ],
619        'F'       => [
620            '0',
621            '7',
622            '7',
623            '7',
624        ],
625        'FB'      => [
626            '0',
627            '7',
628            '7',
629            '7',
630        ],
631        'G'       => [
632            '0',
633            '5',
634            '5',
635            '5',
636            '34',
637            '4',
638            '4',
639        ],
640        'Ğ'       => [
641            '0',
642            '',
643            '',
644            '',
645        ],
646        'GGY'     => [
647            '0',
648            '5',
649            '5',
650            '5',
651        ],
652        'GY'      => [
653            '0',
654            '5',
655            '5',
656            '5',
657        ],
658        'H'       => [
659            '0',
660            '5',
661            '5',
662            '',
663            '5',
664            '5',
665            '5',
666        ],
667        'I'       => [
668            '1',
669            '0',
670            '',
671            '',
672        ],
673        'Ì'       => [
674            '1',
675            '0',
676            '',
677            '',
678        ],
679        'Í'       => [
680            '1',
681            '0',
682            '',
683            '',
684        ],
685        'Î'       => [
686            '1',
687            '0',
688            '',
689            '',
690        ],
691        'Ï'       => [
692            '1',
693            '0',
694            '',
695            '',
696        ],
697        'Ĩ'       => [
698            '1',
699            '0',
700            '',
701            '',
702        ],
703        'Į'       => [
704            '1',
705            '0',
706            '',
707            '',
708        ],
709        'İ'       => [
710            '1',
711            '0',
712            '',
713            '',
714        ],
715        'Ỉ'       => [
716            '1',
717            '0',
718            '',
719            '',
720        ],
721        'Ị'       => [
722            '1',
723            '0',
724            '',
725            '',
726        ],
727        'IA'      => [
728            '1',
729            '1',
730            '',
731            '',
732        ],
733        'IE'      => [
734            '1',
735            '1',
736            '',
737            '',
738        ],
739        'IO'      => [
740            '1',
741            '1',
742            '',
743            '',
744        ],
745        'IU'      => [
746            '1',
747            '1',
748            '',
749            '',
750        ],
751        'J'       => [
752            '0',
753            '1',
754            '',
755            '',
756            '4',
757            '4',
758            '4',
759            '5',
760            '5',
761            '',
762        ],
763        'K'       => [
764            '0',
765            '5',
766            '5',
767            '5',
768        ],
769        'KH'      => [
770            '0',
771            '5',
772            '5',
773            '5',
774        ],
775        'KS'      => [
776            '0',
777            '5',
778            '54',
779            '54',
780        ],
781        'L'       => [
782            '0',
783            '8',
784            '8',
785            '8',
786        ],
787        'Ľ'       => [
788            '0',
789            '8',
790            '8',
791            '8',
792        ],
793        'Ĺ'       => [
794            '0',
795            '8',
796            '8',
797            '8',
798        ],
799        'Ł'       => [
800            '0',
801            '7',
802            '7',
803            '7',
804            '8',
805            '8',
806            '8',
807        ],
808        'LL'      => [
809            '0',
810            '8',
811            '8',
812            '8',
813            '58',
814            '8',
815            '8',
816            '1',
817            '8',
818            '8',
819        ],
820        'LLY'     => [
821            '0',
822            '8',
823            '8',
824            '8',
825            '1',
826            '8',
827            '8',
828        ],
829        'LY'      => [
830            '0',
831            '8',
832            '8',
833            '8',
834            '1',
835            '8',
836            '8',
837        ],
838        'M'       => [
839            '0',
840            '6',
841            '6',
842            '6',
843        ],
844        'MĔ'      => [
845            '0',
846            '66',
847            '66',
848            '66',
849        ],
850        'MN'      => [
851            '0',
852            '66',
853            '66',
854            '66',
855        ],
856        'N'       => [
857            '0',
858            '6',
859            '6',
860            '6',
861        ],
862        'Ń'       => [
863            '0',
864            '6',
865            '6',
866            '6',
867        ],
868        'Ň'       => [
869            '0',
870            '6',
871            '6',
872            '6',
873        ],
874        'Ñ'       => [
875            '0',
876            '6',
877            '6',
878            '6',
879        ],
880        'NM'      => [
881            '0',
882            '66',
883            '66',
884            '66',
885        ],
886        'O'       => [
887            '1',
888            '0',
889            '',
890            '',
891        ],
892        'Ò'       => [
893            '1',
894            '0',
895            '',
896            '',
897        ],
898        'Ó'       => [
899            '1',
900            '0',
901            '',
902            '',
903        ],
904        'Ô'       => [
905            '1',
906            '0',
907            '',
908            '',
909        ],
910        'Õ'       => [
911            '1',
912            '0',
913            '',
914            '',
915        ],
916        'Ö'       => [
917            '1',
918            '0',
919            '',
920            '',
921        ],
922        'Ø'       => [
923            '1',
924            '0',
925            '',
926            '',
927        ],
928        'Ő'       => [
929            '1',
930            '0',
931            '',
932            '',
933        ],
934        'Œ'       => [
935            '1',
936            '0',
937            '',
938            '',
939        ],
940        'Ơ'       => [
941            '1',
942            '0',
943            '',
944            '',
945        ],
946        'Ọ'       => [
947            '1',
948            '0',
949            '',
950            '',
951        ],
952        'Ỏ'       => [
953            '1',
954            '0',
955            '',
956            '',
957        ],
958        'Ố'       => [
959            '1',
960            '0',
961            '',
962            '',
963        ],
964        'Ồ'       => [
965            '1',
966            '0',
967            '',
968            '',
969        ],
970        'Ổ'       => [
971            '1',
972            '0',
973            '',
974            '',
975        ],
976        'Ỗ'       => [
977            '1',
978            '0',
979            '',
980            '',
981        ],
982        'Ộ'       => [
983            '1',
984            '0',
985            '',
986            '',
987        ],
988        'Ớ'       => [
989            '1',
990            '0',
991            '',
992            '',
993        ],
994        'Ờ'       => [
995            '1',
996            '0',
997            '',
998            '',
999        ],
1000        'Ở'       => [
1001            '1',
1002            '0',
1003            '',
1004            '',
1005        ],
1006        'Ỡ'       => [
1007            '1',
1008            '0',
1009            '',
1010            '',
1011        ],
1012        'Ợ'       => [
1013            '1',
1014            '0',
1015            '',
1016            '',
1017        ],
1018        'OE'      => [
1019            '1',
1020            '0',
1021            '',
1022            '',
1023        ],
1024        'OI'      => [
1025            '1',
1026            '0',
1027            '1',
1028            '',
1029        ],
1030        'OJ'      => [
1031            '1',
1032            '0',
1033            '1',
1034            '',
1035        ],
1036        'OU'      => [
1037            '1',
1038            '0',
1039            '',
1040            '',
1041        ],
1042        'OY'      => [
1043            '1',
1044            '0',
1045            '1',
1046            '',
1047        ],
1048        'P'       => [
1049            '0',
1050            '7',
1051            '7',
1052            '7',
1053        ],
1054        'PF'      => [
1055            '0',
1056            '7',
1057            '7',
1058            '7',
1059        ],
1060        'PH'      => [
1061            '0',
1062            '7',
1063            '7',
1064            '7',
1065        ],
1066        'Q'       => [
1067            '0',
1068            '5',
1069            '5',
1070            '5',
1071        ],
1072        'R'       => [
1073            '0',
1074            '9',
1075            '9',
1076            '9',
1077        ],
1078        'Ř'       => [
1079            '0',
1080            '4',
1081            '4',
1082            '4',
1083        ],
1084        'RS'      => [
1085            '0',
1086            '4',
1087            '4',
1088            '4',
1089            '94',
1090            '94',
1091            '94',
1092        ],
1093        'RZ'      => [
1094            '0',
1095            '4',
1096            '4',
1097            '4',
1098            '94',
1099            '94',
1100            '94',
1101        ],
1102        'S'       => [
1103            '0',
1104            '4',
1105            '4',
1106            '4',
1107        ],
1108        'Ś'       => [
1109            '0',
1110            '4',
1111            '4',
1112            '4',
1113        ],
1114        'Š'       => [
1115            '0',
1116            '4',
1117            '4',
1118            '4',
1119        ],
1120        'Ş'       => [
1121            '0',
1122            '4',
1123            '4',
1124            '4',
1125        ],
1126        'SC'      => [
1127            '0',
1128            '2',
1129            '4',
1130            '4',
1131        ],
1132        'ŠČ'      => [
1133            '0',
1134            '2',
1135            '4',
1136            '4',
1137        ],
1138        'SCH'     => [
1139            '0',
1140            '4',
1141            '4',
1142            '4',
1143        ],
1144        'SCHD'    => [
1145            '0',
1146            '2',
1147            '43',
1148            '43',
1149        ],
1150        'SCHT'    => [
1151            '0',
1152            '2',
1153            '43',
1154            '43',
1155        ],
1156        'SCHTCH'  => [
1157            '0',
1158            '2',
1159            '4',
1160            '4',
1161        ],
1162        'SCHTSCH' => [
1163            '0',
1164            '2',
1165            '4',
1166            '4',
1167        ],
1168        'SCHTSH'  => [
1169            '0',
1170            '2',
1171            '4',
1172            '4',
1173        ],
1174        'SD'      => [
1175            '0',
1176            '2',
1177            '43',
1178            '43',
1179        ],
1180        'SH'      => [
1181            '0',
1182            '4',
1183            '4',
1184            '4',
1185        ],
1186        'SHCH'    => [
1187            '0',
1188            '2',
1189            '4',
1190            '4',
1191        ],
1192        'SHD'     => [
1193            '0',
1194            '2',
1195            '43',
1196            '43',
1197        ],
1198        'SHT'     => [
1199            '0',
1200            '2',
1201            '43',
1202            '43',
1203        ],
1204        'SHTCH'   => [
1205            '0',
1206            '2',
1207            '4',
1208            '4',
1209        ],
1210        'SHTSH'   => [
1211            '0',
1212            '2',
1213            '4',
1214            '4',
1215        ],
1216        'ß'       => [
1217            '0',
1218            '',
1219            '4',
1220            '4',
1221        ],
1222        'ST'      => [
1223            '0',
1224            '2',
1225            '43',
1226            '43',
1227        ],
1228        'STCH'    => [
1229            '0',
1230            '2',
1231            '4',
1232            '4',
1233        ],
1234        'STRS'    => [
1235            '0',
1236            '2',
1237            '4',
1238            '4',
1239        ],
1240        'STRZ'    => [
1241            '0',
1242            '2',
1243            '4',
1244            '4',
1245        ],
1246        'STSCH'   => [
1247            '0',
1248            '2',
1249            '4',
1250            '4',
1251        ],
1252        'STSH'    => [
1253            '0',
1254            '2',
1255            '4',
1256            '4',
1257        ],
1258        'SSZ'     => [
1259            '0',
1260            '4',
1261            '4',
1262            '4',
1263        ],
1264        'SZ'      => [
1265            '0',
1266            '4',
1267            '4',
1268            '4',
1269        ],
1270        'SZCS'    => [
1271            '0',
1272            '2',
1273            '4',
1274            '4',
1275        ],
1276        'SZCZ'    => [
1277            '0',
1278            '2',
1279            '4',
1280            '4',
1281        ],
1282        'SZD'     => [
1283            '0',
1284            '2',
1285            '43',
1286            '43',
1287        ],
1288        'SZT'     => [
1289            '0',
1290            '2',
1291            '43',
1292            '43',
1293        ],
1294        'T'       => [
1295            '0',
1296            '3',
1297            '3',
1298            '3',
1299        ],
1300        'Ť'       => [
1301            '0',
1302            '3',
1303            '3',
1304            '3',
1305        ],
1306        'Ţ'       => [
1307            '0',
1308            '3',
1309            '3',
1310            '3',
1311            '4',
1312            '4',
1313            '4',
1314        ],
1315        'TC'      => [
1316            '0',
1317            '4',
1318            '4',
1319            '4',
1320        ],
1321        'TCH'     => [
1322            '0',
1323            '4',
1324            '4',
1325            '4',
1326        ],
1327        'TH'      => [
1328            '0',
1329            '3',
1330            '3',
1331            '3',
1332        ],
1333        'TRS'     => [
1334            '0',
1335            '4',
1336            '4',
1337            '4',
1338        ],
1339        'TRZ'     => [
1340            '0',
1341            '4',
1342            '4',
1343            '4',
1344        ],
1345        'TS'      => [
1346            '0',
1347            '4',
1348            '4',
1349            '4',
1350        ],
1351        'TSCH'    => [
1352            '0',
1353            '4',
1354            '4',
1355            '4',
1356        ],
1357        'TSH'     => [
1358            '0',
1359            '4',
1360            '4',
1361            '4',
1362        ],
1363        'TSZ'     => [
1364            '0',
1365            '4',
1366            '4',
1367            '4',
1368        ],
1369        'TTCH'    => [
1370            '0',
1371            '4',
1372            '4',
1373            '4',
1374        ],
1375        'TTS'     => [
1376            '0',
1377            '4',
1378            '4',
1379            '4',
1380        ],
1381        'TTSCH'   => [
1382            '0',
1383            '4',
1384            '4',
1385            '4',
1386        ],
1387        'TTSZ'    => [
1388            '0',
1389            '4',
1390            '4',
1391            '4',
1392        ],
1393        'TTZ'     => [
1394            '0',
1395            '4',
1396            '4',
1397            '4',
1398        ],
1399        'TZ'      => [
1400            '0',
1401            '4',
1402            '4',
1403            '4',
1404        ],
1405        'TZS'     => [
1406            '0',
1407            '4',
1408            '4',
1409            '4',
1410        ],
1411        'U'       => [
1412            '1',
1413            '0',
1414            '',
1415            '',
1416        ],
1417        'Ù'       => [
1418            '1',
1419            '0',
1420            '',
1421            '',
1422        ],
1423        'Ú'       => [
1424            '1',
1425            '0',
1426            '',
1427            '',
1428        ],
1429        'Û'       => [
1430            '1',
1431            '0',
1432            '',
1433            '',
1434        ],
1435        'Ü'       => [
1436            '1',
1437            '0',
1438            '',
1439            '',
1440        ],
1441        'Ũ'       => [
1442            '1',
1443            '0',
1444            '',
1445            '',
1446        ],
1447        'Ū'       => [
1448            '1',
1449            '0',
1450            '',
1451            '',
1452        ],
1453        'Ů'       => [
1454            '1',
1455            '0',
1456            '',
1457            '',
1458        ],
1459        'Ű'       => [
1460            '1',
1461            '0',
1462            '',
1463            '',
1464        ],
1465        'Ų'       => [
1466            '1',
1467            '0',
1468            '',
1469            '',
1470        ],
1471        'Ư'       => [
1472            '1',
1473            '0',
1474            '',
1475            '',
1476        ],
1477        'Ụ'       => [
1478            '1',
1479            '0',
1480            '',
1481            '',
1482        ],
1483        'Ủ'       => [
1484            '1',
1485            '0',
1486            '',
1487            '',
1488        ],
1489        'Ứ'       => [
1490            '1',
1491            '0',
1492            '',
1493            '',
1494        ],
1495        'Ừ'       => [
1496            '1',
1497            '0',
1498            '',
1499            '',
1500        ],
1501        'Ử'       => [
1502            '1',
1503            '0',
1504            '',
1505            '',
1506        ],
1507        'Ữ'       => [
1508            '1',
1509            '0',
1510            '',
1511            '',
1512        ],
1513        'Ự'       => [
1514            '1',
1515            '0',
1516            '',
1517            '',
1518        ],
1519        'UE'      => [
1520            '1',
1521            '0',
1522            '',
1523            '',
1524        ],
1525        'UI'      => [
1526            '1',
1527            '0',
1528            '1',
1529            '',
1530        ],
1531        'UJ'      => [
1532            '1',
1533            '0',
1534            '1',
1535            '',
1536        ],
1537        'UY'      => [
1538            '1',
1539            '0',
1540            '1',
1541            '',
1542        ],
1543        'UW'      => [
1544            '1',
1545            '0',
1546            '1',
1547            '',
1548            '0',
1549            '7',
1550            '7',
1551        ],
1552        'V'       => [
1553            '0',
1554            '7',
1555            '7',
1556            '7',
1557        ],
1558        'W'       => [
1559            '0',
1560            '7',
1561            '7',
1562            '7',
1563        ],
1564        'X'       => [
1565            '0',
1566            '5',
1567            '54',
1568            '54',
1569        ],
1570        'Y'       => [
1571            '1',
1572            '1',
1573            '',
1574            '',
1575        ],
1576        'Ý'       => [
1577            '1',
1578            '1',
1579            '',
1580            '',
1581        ],
1582        'Ỳ'       => [
1583            '1',
1584            '1',
1585            '',
1586            '',
1587        ],
1588        'Ỵ'       => [
1589            '1',
1590            '1',
1591            '',
1592            '',
1593        ],
1594        'Ỷ'       => [
1595            '1',
1596            '1',
1597            '',
1598            '',
1599        ],
1600        'Ỹ'       => [
1601            '1',
1602            '1',
1603            '',
1604            '',
1605        ],
1606        'Z'       => [
1607            '0',
1608            '4',
1609            '4',
1610            '4',
1611        ],
1612        'Ź'       => [
1613            '0',
1614            '4',
1615            '4',
1616            '4',
1617        ],
1618        'Ż'       => [
1619            '0',
1620            '4',
1621            '4',
1622            '4',
1623        ],
1624        'Ž'       => [
1625            '0',
1626            '4',
1627            '4',
1628            '4',
1629        ],
1630        'ZD'      => [
1631            '0',
1632            '2',
1633            '43',
1634            '43',
1635        ],
1636        'ZDZ'     => [
1637            '0',
1638            '2',
1639            '4',
1640            '4',
1641        ],
1642        'ZDZH'    => [
1643            '0',
1644            '2',
1645            '4',
1646            '4',
1647        ],
1648        'ZH'      => [
1649            '0',
1650            '4',
1651            '4',
1652            '4',
1653        ],
1654        'ZHD'     => [
1655            '0',
1656            '2',
1657            '43',
1658            '43',
1659        ],
1660        'ZHDZH'   => [
1661            '0',
1662            '2',
1663            '4',
1664            '4',
1665        ],
1666        'ZS'      => [
1667            '0',
1668            '4',
1669            '4',
1670            '4',
1671        ],
1672        'ZSCH'    => [
1673            '0',
1674            '4',
1675            '4',
1676            '4',
1677        ],
1678        'ZSH'     => [
1679            '0',
1680            '4',
1681            '4',
1682            '4',
1683        ],
1684        'ZZS'     => [
1685            '0',
1686            '4',
1687            '4',
1688            '4',
1689        ],
1690        // Cyrillic alphabet
1691        'А'       => [
1692            '1',
1693            '0',
1694            '',
1695            '',
1696        ],
1697        'Б'       => [
1698            '0',
1699            '7',
1700            '7',
1701            '7',
1702        ],
1703        'В'       => [
1704            '0',
1705            '7',
1706            '7',
1707            '7',
1708        ],
1709        'Г'       => [
1710            '0',
1711            '5',
1712            '5',
1713            '5',
1714        ],
1715        'Д'       => [
1716            '0',
1717            '3',
1718            '3',
1719            '3',
1720        ],
1721        'ДЗ'      => [
1722            '0',
1723            '4',
1724            '4',
1725            '4',
1726        ],
1727        'Е'       => [
1728            '1',
1729            '0',
1730            '',
1731            '',
1732        ],
1733        'Ё'       => [
1734            '1',
1735            '0',
1736            '',
1737            '',
1738        ],
1739        'Ж'       => [
1740            '0',
1741            '4',
1742            '4',
1743            '4',
1744        ],
1745        'З'       => [
1746            '0',
1747            '4',
1748            '4',
1749            '4',
1750        ],
1751        'И'       => [
1752            '1',
1753            '0',
1754            '',
1755            '',
1756        ],
1757        'Й'       => [
1758            '1',
1759            '1',
1760            '',
1761            '',
1762            '4',
1763            '4',
1764            '4',
1765        ],
1766        'К'       => [
1767            '0',
1768            '5',
1769            '5',
1770            '5',
1771        ],
1772        'Л'       => [
1773            '0',
1774            '8',
1775            '8',
1776            '8',
1777        ],
1778        'М'       => [
1779            '0',
1780            '6',
1781            '6',
1782            '6',
1783        ],
1784        'Н'       => [
1785            '0',
1786            '6',
1787            '6',
1788            '6',
1789        ],
1790        'О'       => [
1791            '1',
1792            '0',
1793            '',
1794            '',
1795        ],
1796        'П'       => [
1797            '0',
1798            '7',
1799            '7',
1800            '7',
1801        ],
1802        'Р'       => [
1803            '0',
1804            '9',
1805            '9',
1806            '9',
1807        ],
1808        'РЖ'      => [
1809            '0',
1810            '4',
1811            '4',
1812            '4',
1813        ],
1814        'С'       => [
1815            '0',
1816            '4',
1817            '4',
1818            '4',
1819        ],
1820        'Т'       => [
1821            '0',
1822            '3',
1823            '3',
1824            '3',
1825        ],
1826        'У'       => [
1827            '1',
1828            '0',
1829            '',
1830            '',
1831        ],
1832        'Ф'       => [
1833            '0',
1834            '7',
1835            '7',
1836            '7',
1837        ],
1838        'Х'       => [
1839            '0',
1840            '5',
1841            '5',
1842            '5',
1843        ],
1844        'Ц'       => [
1845            '0',
1846            '4',
1847            '4',
1848            '4',
1849        ],
1850        'Ч'       => [
1851            '0',
1852            '4',
1853            '4',
1854            '4',
1855        ],
1856        'Ш'       => [
1857            '0',
1858            '4',
1859            '4',
1860            '4',
1861        ],
1862        'Щ'       => [
1863            '0',
1864            '2',
1865            '4',
1866            '4',
1867        ],
1868        'Ъ'       => [
1869            '0',
1870            '',
1871            '',
1872            '',
1873        ],
1874        'Ы'       => [
1875            '0',
1876            '1',
1877            '',
1878            '',
1879        ],
1880        'Ь'       => [
1881            '0',
1882            '',
1883            '',
1884            '',
1885        ],
1886        'Э'       => [
1887            '1',
1888            '0',
1889            '',
1890            '',
1891        ],
1892        'Ю'       => [
1893            '0',
1894            '1',
1895            '',
1896            '',
1897        ],
1898        'Я'       => [
1899            '0',
1900            '1',
1901            '',
1902            '',
1903        ],
1904        // Greek alphabet
1905        'Α'       => [
1906            '1',
1907            '0',
1908            '',
1909            '',
1910        ],
1911        'Ά'       => [
1912            '1',
1913            '0',
1914            '',
1915            '',
1916        ],
1917        'ΑΙ'      => [
1918            '1',
1919            '0',
1920            '1',
1921            '',
1922        ],
1923        'ΑΥ'      => [
1924            '1',
1925            '0',
1926            '1',
1927            '',
1928        ],
1929        'Β'       => [
1930            '0',
1931            '7',
1932            '7',
1933            '7',
1934        ],
1935        'Γ'       => [
1936            '0',
1937            '5',
1938            '5',
1939            '5',
1940        ],
1941        'Δ'       => [
1942            '0',
1943            '3',
1944            '3',
1945            '3',
1946        ],
1947        'Ε'       => [
1948            '1',
1949            '0',
1950            '',
1951            '',
1952        ],
1953        'Έ'       => [
1954            '1',
1955            '0',
1956            '',
1957            '',
1958        ],
1959        'ΕΙ'      => [
1960            '1',
1961            '0',
1962            '1',
1963            '',
1964        ],
1965        'ΕΥ'      => [
1966            '1',
1967            '1',
1968            '1',
1969            '',
1970        ],
1971        'Ζ'       => [
1972            '0',
1973            '4',
1974            '4',
1975            '4',
1976        ],
1977        'Η'       => [
1978            '1',
1979            '0',
1980            '',
1981            '',
1982        ],
1983        'Ή'       => [
1984            '1',
1985            '0',
1986            '',
1987            '',
1988        ],
1989        'Θ'       => [
1990            '0',
1991            '3',
1992            '3',
1993            '3',
1994        ],
1995        'Ι'       => [
1996            '1',
1997            '0',
1998            '',
1999            '',
2000        ],
2001        'Ί'       => [
2002            '1',
2003            '0',
2004            '',
2005            '',
2006        ],
2007        'Ϊ'       => [
2008            '1',
2009            '0',
2010            '',
2011            '',
2012        ],
2013        'ΐ'       => [
2014            '1',
2015            '0',
2016            '',
2017            '',
2018        ],
2019        'Κ'       => [
2020            '0',
2021            '5',
2022            '5',
2023            '5',
2024        ],
2025        'Λ'       => [
2026            '0',
2027            '8',
2028            '8',
2029            '8',
2030        ],
2031        'Μ'       => [
2032            '0',
2033            '6',
2034            '6',
2035            '6',
2036        ],
2037        'ΜΠ'      => [
2038            '0',
2039            '7',
2040            '7',
2041            '7',
2042        ],
2043        'Ν'       => [
2044            '0',
2045            '6',
2046            '6',
2047            '6',
2048        ],
2049        'ΝΤ'      => [
2050            '0',
2051            '3',
2052            '3',
2053            '3',
2054        ],
2055        'Ξ'       => [
2056            '0',
2057            '5',
2058            '54',
2059            '54',
2060        ],
2061        'Ο'       => [
2062            '1',
2063            '0',
2064            '',
2065            '',
2066        ],
2067        'Ό'       => [
2068            '1',
2069            '0',
2070            '',
2071            '',
2072        ],
2073        'ΟΙ'      => [
2074            '1',
2075            '0',
2076            '1',
2077            '',
2078        ],
2079        'ΟΥ'      => [
2080            '1',
2081            '0',
2082            '1',
2083            '',
2084        ],
2085        'Π'       => [
2086            '0',
2087            '7',
2088            '7',
2089            '7',
2090        ],
2091        'Ρ'       => [
2092            '0',
2093            '9',
2094            '9',
2095            '9',
2096        ],
2097        'Σ'       => [
2098            '0',
2099            '4',
2100            '4',
2101            '4',
2102        ],
2103        'ς'       => [
2104            '0',
2105            '',
2106            '',
2107            '4',
2108        ],
2109        'Τ'       => [
2110            '0',
2111            '3',
2112            '3',
2113            '3',
2114        ],
2115        'ΤΖ'      => [
2116            '0',
2117            '4',
2118            '4',
2119            '4',
2120        ],
2121        'ΤΣ'      => [
2122            '0',
2123            '4',
2124            '4',
2125            '4',
2126        ],
2127        'Υ'       => [
2128            '1',
2129            '1',
2130            '',
2131            '',
2132        ],
2133        'Ύ'       => [
2134            '1',
2135            '1',
2136            '',
2137            '',
2138        ],
2139        'Ϋ'       => [
2140            '1',
2141            '1',
2142            '',
2143            '',
2144        ],
2145        'ΰ'       => [
2146            '1',
2147            '1',
2148            '',
2149            '',
2150        ],
2151        'ΥΚ'      => [
2152            '1',
2153            '5',
2154            '5',
2155            '5',
2156        ],
2157        'ΥΥ'      => [
2158            '1',
2159            '65',
2160            '65',
2161            '65',
2162        ],
2163        'Φ'       => [
2164            '0',
2165            '7',
2166            '7',
2167            '7',
2168        ],
2169        'Χ'       => [
2170            '0',
2171            '5',
2172            '5',
2173            '5',
2174        ],
2175        'Ψ'       => [
2176            '0',
2177            '7',
2178            '7',
2179            '7',
2180        ],
2181        'Ω'       => [
2182            '1',
2183            '0',
2184            '',
2185            '',
2186        ],
2187        'Ώ'       => [
2188            '1',
2189            '0',
2190            '',
2191            '',
2192        ],
2193        // Hebrew alphabet
2194        'א'       => [
2195            '1',
2196            '0',
2197            '',
2198            '',
2199        ],
2200        'או'      => [
2201            '1',
2202            '0',
2203            '7',
2204            '',
2205        ],
2206        'אג'      => [
2207            '1',
2208            '4',
2209            '4',
2210            '4',
2211            '5',
2212            '5',
2213            '5',
2214            '34',
2215            '34',
2216            '34',
2217        ],
2218        'בב'      => [
2219            '0',
2220            '7',
2221            '7',
2222            '7',
2223            '77',
2224            '77',
2225            '77',
2226        ],
2227        'ב'       => [
2228            '0',
2229            '7',
2230            '7',
2231            '7',
2232        ],
2233        'גג'      => [
2234            '0',
2235            '4',
2236            '4',
2237            '4',
2238            '5',
2239            '5',
2240            '5',
2241            '45',
2242            '45',
2243            '45',
2244            '55',
2245            '55',
2246            '55',
2247            '54',
2248            '54',
2249            '54',
2250        ],
2251        'גד'      => [
2252            '0',
2253            '43',
2254            '43',
2255            '43',
2256            '53',
2257            '53',
2258            '53',
2259        ],
2260        'גה'      => [
2261            '0',
2262            '45',
2263            '45',
2264            '45',
2265            '55',
2266            '55',
2267            '55',
2268        ],
2269        'גז'      => [
2270            '0',
2271            '44',
2272            '44',
2273            '44',
2274            '45',
2275            '45',
2276            '45',
2277        ],
2278        'גח'      => [
2279            '0',
2280            '45',
2281            '45',
2282            '45',
2283            '55',
2284            '55',
2285            '55',
2286        ],
2287        'גכ'      => [
2288            '0',
2289            '45',
2290            '45',
2291            '45',
2292            '55',
2293            '55',
2294            '55',
2295        ],
2296        'גך'      => [
2297            '0',
2298            '45',
2299            '45',
2300            '45',
2301            '55',
2302            '55',
2303            '55',
2304        ],
2305        'גצ'      => [
2306            '0',
2307            '44',
2308            '44',
2309            '44',
2310            '45',
2311            '45',
2312            '45',
2313        ],
2314        'גץ'      => [
2315            '0',
2316            '44',
2317            '44',
2318            '44',
2319            '45',
2320            '45',
2321            '45',
2322        ],
2323        'גק'      => [
2324            '0',
2325            '45',
2326            '45',
2327            '45',
2328            '54',
2329            '54',
2330            '54',
2331        ],
2332        'גש'      => [
2333            '0',
2334            '44',
2335            '44',
2336            '44',
2337            '54',
2338            '54',
2339            '54',
2340        ],
2341        'גת'      => [
2342            '0',
2343            '43',
2344            '43',
2345            '43',
2346            '53',
2347            '53',
2348            '53',
2349        ],
2350        'ג'       => [
2351            '0',
2352            '4',
2353            '4',
2354            '4',
2355            '5',
2356            '5',
2357            '5',
2358        ],
2359        'דז'      => [
2360            '0',
2361            '4',
2362            '4',
2363            '4',
2364        ],
2365        'דד'      => [
2366            '0',
2367            '3',
2368            '3',
2369            '3',
2370            '33',
2371            '33',
2372            '33',
2373        ],
2374        'דט'      => [
2375            '0',
2376            '33',
2377            '33',
2378            '33',
2379        ],
2380        'דש'      => [
2381            '0',
2382            '4',
2383            '4',
2384            '4',
2385        ],
2386        'דצ'      => [
2387            '0',
2388            '4',
2389            '4',
2390            '4',
2391        ],
2392        'דץ'      => [
2393            '0',
2394            '4',
2395            '4',
2396            '4',
2397        ],
2398        'ד'       => [
2399            '0',
2400            '3',
2401            '3',
2402            '3',
2403        ],
2404        'הג'      => [
2405            '0',
2406            '54',
2407            '54',
2408            '54',
2409            '55',
2410            '55',
2411            '55',
2412        ],
2413        'הכ'      => [
2414            '0',
2415            '55',
2416            '55',
2417            '55',
2418        ],
2419        'הח'      => [
2420            '0',
2421            '55',
2422            '55',
2423            '55',
2424        ],
2425        'הק'      => [
2426            '0',
2427            '55',
2428            '55',
2429            '55',
2430            '5',
2431            '5',
2432            '5',
2433        ],
2434        'הה'      => [
2435            '0',
2436            '5',
2437            '5',
2438            '',
2439            '55',
2440            '55',
2441            '',
2442        ],
2443        'ה'       => [
2444            '0',
2445            '5',
2446            '5',
2447            '',
2448        ],
2449        'וי'      => [
2450            '1',
2451            '',
2452            '',
2453            '',
2454            '7',
2455            '7',
2456            '7',
2457        ],
2458        'ו'       => [
2459            '1',
2460            '7',
2461            '7',
2462            '7',
2463            '7',
2464            '',
2465            '',
2466        ],
2467        'וו'      => [
2468            '1',
2469            '7',
2470            '7',
2471            '7',
2472            '7',
2473            '',
2474            '',
2475        ],
2476        'וופ'     => [
2477            '1',
2478            '7',
2479            '7',
2480            '7',
2481            '77',
2482            '77',
2483            '77',
2484        ],
2485        'זש'      => [
2486            '0',
2487            '4',
2488            '4',
2489            '4',
2490            '44',
2491            '44',
2492            '44',
2493        ],
2494        'זדז'     => [
2495            '0',
2496            '2',
2497            '4',
2498            '4',
2499        ],
2500        'ז'       => [
2501            '0',
2502            '4',
2503            '4',
2504            '4',
2505        ],
2506        'זג'      => [
2507            '0',
2508            '44',
2509            '44',
2510            '44',
2511            '45',
2512            '45',
2513            '45',
2514        ],
2515        'זז'      => [
2516            '0',
2517            '4',
2518            '4',
2519            '4',
2520            '44',
2521            '44',
2522            '44',
2523        ],
2524        'זס'      => [
2525            '0',
2526            '44',
2527            '44',
2528            '44',
2529        ],
2530        'זצ'      => [
2531            '0',
2532            '44',
2533            '44',
2534            '44',
2535        ],
2536        'זץ'      => [
2537            '0',
2538            '44',
2539            '44',
2540            '44',
2541        ],
2542        'חג'      => [
2543            '0',
2544            '54',
2545            '54',
2546            '54',
2547            '53',
2548            '53',
2549            '53',
2550        ],
2551        'חח'      => [
2552            '0',
2553            '5',
2554            '5',
2555            '5',
2556            '55',
2557            '55',
2558            '55',
2559        ],
2560        'חק'      => [
2561            '0',
2562            '55',
2563            '55',
2564            '55',
2565            '5',
2566            '5',
2567            '5',
2568        ],
2569        'חכ'      => [
2570            '0',
2571            '45',
2572            '45',
2573            '45',
2574            '55',
2575            '55',
2576            '55',
2577        ],
2578        'חס'      => [
2579            '0',
2580            '5',
2581            '54',
2582            '54',
2583        ],
2584        'חש'      => [
2585            '0',
2586            '5',
2587            '54',
2588            '54',
2589        ],
2590        'ח'       => [
2591            '0',
2592            '5',
2593            '5',
2594            '5',
2595        ],
2596        'טש'      => [
2597            '0',
2598            '4',
2599            '4',
2600            '4',
2601        ],
2602        'טד'      => [
2603            '0',
2604            '33',
2605            '33',
2606            '33',
2607        ],
2608        'טי'      => [
2609            '0',
2610            '3',
2611            '3',
2612            '3',
2613            '4',
2614            '4',
2615            '4',
2616            '3',
2617            '3',
2618            '34',
2619        ],
2620        'טת'      => [
2621            '0',
2622            '33',
2623            '33',
2624            '33',
2625        ],
2626        'טט'      => [
2627            '0',
2628            '3',
2629            '3',
2630            '3',
2631            '33',
2632            '33',
2633            '33',
2634        ],
2635        'ט'       => [
2636            '0',
2637            '3',
2638            '3',
2639            '3',
2640        ],
2641        'י'       => [
2642            '1',
2643            '1',
2644            '',
2645            '',
2646        ],
2647        'יא'      => [
2648            '1',
2649            '1',
2650            '',
2651            '',
2652            '1',
2653            '1',
2654            '1',
2655        ],
2656        'כג'      => [
2657            '0',
2658            '55',
2659            '55',
2660            '55',
2661            '54',
2662            '54',
2663            '54',
2664        ],
2665        'כש'      => [
2666            '0',
2667            '5',
2668            '54',
2669            '54',
2670        ],
2671        'כס'      => [
2672            '0',
2673            '5',
2674            '54',
2675            '54',
2676        ],
2677        'ככ'      => [
2678            '0',
2679            '5',
2680            '5',
2681            '5',
2682            '55',
2683            '55',
2684            '55',
2685        ],
2686        'כך'      => [
2687            '0',
2688            '5',
2689            '5',
2690            '5',
2691            '55',
2692            '55',
2693            '55',
2694        ],
2695        'כ'       => [
2696            '0',
2697            '5',
2698            '5',
2699            '5',
2700        ],
2701        'כח'      => [
2702            '0',
2703            '55',
2704            '55',
2705            '55',
2706            '5',
2707            '5',
2708            '5',
2709        ],
2710        'ך'       => [
2711            '0',
2712            '',
2713            '5',
2714            '5',
2715        ],
2716        'ל'       => [
2717            '0',
2718            '8',
2719            '8',
2720            '8',
2721        ],
2722        'לל'      => [
2723            '0',
2724            '88',
2725            '88',
2726            '88',
2727            '8',
2728            '8',
2729            '8',
2730        ],
2731        'מנ'      => [
2732            '0',
2733            '66',
2734            '66',
2735            '66',
2736        ],
2737        'מן'      => [
2738            '0',
2739            '66',
2740            '66',
2741            '66',
2742        ],
2743        'ממ'      => [
2744            '0',
2745            '6',
2746            '6',
2747            '6',
2748            '66',
2749            '66',
2750            '66',
2751        ],
2752        'מם'      => [
2753            '0',
2754            '6',
2755            '6',
2756            '6',
2757            '66',
2758            '66',
2759            '66',
2760        ],
2761        'מ'       => [
2762            '0',
2763            '6',
2764            '6',
2765            '6',
2766        ],
2767        'ם'       => [
2768            '0',
2769            '',
2770            '6',
2771            '6',
2772        ],
2773        'נמ'      => [
2774            '0',
2775            '66',
2776            '66',
2777            '66',
2778        ],
2779        'נם'      => [
2780            '0',
2781            '66',
2782            '66',
2783            '66',
2784        ],
2785        'ננ'      => [
2786            '0',
2787            '6',
2788            '6',
2789            '6',
2790            '66',
2791            '66',
2792            '66',
2793        ],
2794        'נן'      => [
2795            '0',
2796            '6',
2797            '6',
2798            '6',
2799            '66',
2800            '66',
2801            '66',
2802        ],
2803        'נ'       => [
2804            '0',
2805            '6',
2806            '6',
2807            '6',
2808        ],
2809        'ן'       => [
2810            '0',
2811            '',
2812            '6',
2813            '6',
2814        ],
2815        'סתש'     => [
2816            '0',
2817            '2',
2818            '4',
2819            '4',
2820        ],
2821        'סתז'     => [
2822            '0',
2823            '2',
2824            '4',
2825            '4',
2826        ],
2827        'סטז'     => [
2828            '0',
2829            '2',
2830            '4',
2831            '4',
2832        ],
2833        'סטש'     => [
2834            '0',
2835            '2',
2836            '4',
2837            '4',
2838        ],
2839        'סצד'     => [
2840            '0',
2841            '2',
2842            '4',
2843            '4',
2844        ],
2845        'סט'      => [
2846            '0',
2847            '2',
2848            '4',
2849            '4',
2850            '43',
2851            '43',
2852            '43',
2853        ],
2854        'סת'      => [
2855            '0',
2856            '2',
2857            '4',
2858            '4',
2859            '43',
2860            '43',
2861            '43',
2862        ],
2863        'סג'      => [
2864            '0',
2865            '44',
2866            '44',
2867            '44',
2868            '4',
2869            '4',
2870            '4',
2871        ],
2872        'סס'      => [
2873            '0',
2874            '4',
2875            '4',
2876            '4',
2877            '44',
2878            '44',
2879            '44',
2880        ],
2881        'סצ'      => [
2882            '0',
2883            '44',
2884            '44',
2885            '44',
2886        ],
2887        'סץ'      => [
2888            '0',
2889            '44',
2890            '44',
2891            '44',
2892        ],
2893        'סז'      => [
2894            '0',
2895            '44',
2896            '44',
2897            '44',
2898        ],
2899        'סש'      => [
2900            '0',
2901            '44',
2902            '44',
2903            '44',
2904        ],
2905        'ס'       => [
2906            '0',
2907            '4',
2908            '4',
2909            '4',
2910        ],
2911        'ע'       => [
2912            '1',
2913            '0',
2914            '',
2915            '',
2916        ],
2917        'פב'      => [
2918            '0',
2919            '7',
2920            '7',
2921            '7',
2922            '77',
2923            '77',
2924            '77',
2925        ],
2926        'פוו'     => [
2927            '0',
2928            '7',
2929            '7',
2930            '7',
2931            '77',
2932            '77',
2933            '77',
2934        ],
2935        'פפ'      => [
2936            '0',
2937            '7',
2938            '7',
2939            '7',
2940            '77',
2941            '77',
2942            '77',
2943        ],
2944        'פף'      => [
2945            '0',
2946            '7',
2947            '7',
2948            '7',
2949            '77',
2950            '77',
2951            '77',
2952        ],
2953        'פ'       => [
2954            '0',
2955            '7',
2956            '7',
2957            '7',
2958        ],
2959        'ף'       => [
2960            '0',
2961            '',
2962            '7',
2963            '7',
2964        ],
2965        'צג'      => [
2966            '0',
2967            '44',
2968            '44',
2969            '44',
2970            '45',
2971            '45',
2972            '45',
2973        ],
2974        'צז'      => [
2975            '0',
2976            '44',
2977            '44',
2978            '44',
2979        ],
2980        'צס'      => [
2981            '0',
2982            '44',
2983            '44',
2984            '44',
2985        ],
2986        'צצ'      => [
2987            '0',
2988            '4',
2989            '4',
2990            '4',
2991            '5',
2992            '5',
2993            '5',
2994            '44',
2995            '44',
2996            '44',
2997            '54',
2998            '54',
2999            '54',
3000            '45',
3001            '45',
3002            '45',
3003        ],
3004        'צץ'      => [
3005            '0',
3006            '4',
3007            '4',
3008            '4',
3009            '5',
3010            '5',
3011            '5',
3012            '44',
3013            '44',
3014            '44',
3015            '54',
3016            '54',
3017            '54',
3018        ],
3019        'צש'      => [
3020            '0',
3021            '44',
3022            '44',
3023            '44',
3024            '4',
3025            '4',
3026            '4',
3027            '5',
3028            '5',
3029            '5',
3030        ],
3031        'צ'       => [
3032            '0',
3033            '4',
3034            '4',
3035            '4',
3036            '5',
3037            '5',
3038            '5',
3039        ],
3040        'ץ'       => [
3041            '0',
3042            '',
3043            '4',
3044            '4',
3045        ],
3046        'קה'      => [
3047            '0',
3048            '55',
3049            '55',
3050            '5',
3051        ],
3052        'קס'      => [
3053            '0',
3054            '5',
3055            '54',
3056            '54',
3057        ],
3058        'קש'      => [
3059            '0',
3060            '5',
3061            '54',
3062            '54',
3063        ],
3064        'קק'      => [
3065            '0',
3066            '5',
3067            '5',
3068            '5',
3069            '55',
3070            '55',
3071            '55',
3072        ],
3073        'קח'      => [
3074            '0',
3075            '55',
3076            '55',
3077            '55',
3078        ],
3079        'קכ'      => [
3080            '0',
3081            '55',
3082            '55',
3083            '55',
3084        ],
3085        'קך'      => [
3086            '0',
3087            '55',
3088            '55',
3089            '55',
3090        ],
3091        'קג'      => [
3092            '0',
3093            '55',
3094            '55',
3095            '55',
3096            '54',
3097            '54',
3098            '54',
3099        ],
3100        'ק'       => [
3101            '0',
3102            '5',
3103            '5',
3104            '5',
3105        ],
3106        'רר'      => [
3107            '0',
3108            '99',
3109            '99',
3110            '99',
3111            '9',
3112            '9',
3113            '9',
3114        ],
3115        'ר'       => [
3116            '0',
3117            '9',
3118            '9',
3119            '9',
3120        ],
3121        'שטז'     => [
3122            '0',
3123            '2',
3124            '4',
3125            '4',
3126        ],
3127        'שתש'     => [
3128            '0',
3129            '2',
3130            '4',
3131            '4',
3132        ],
3133        'שתז'     => [
3134            '0',
3135            '2',
3136            '4',
3137            '4',
3138        ],
3139        'שטש'     => [
3140            '0',
3141            '2',
3142            '4',
3143            '4',
3144        ],
3145        'שד'      => [
3146            '0',
3147            '2',
3148            '43',
3149            '43',
3150        ],
3151        'שז'      => [
3152            '0',
3153            '44',
3154            '44',
3155            '44',
3156        ],
3157        'שס'      => [
3158            '0',
3159            '44',
3160            '44',
3161            '44',
3162        ],
3163        'שת'      => [
3164            '0',
3165            '2',
3166            '43',
3167            '43',
3168        ],
3169        'שג'      => [
3170            '0',
3171            '4',
3172            '4',
3173            '4',
3174            '44',
3175            '44',
3176            '44',
3177            '4',
3178            '43',
3179            '43',
3180        ],
3181        'שט'      => [
3182            '0',
3183            '2',
3184            '43',
3185            '43',
3186            '44',
3187            '44',
3188            '44',
3189        ],
3190        'שצ'      => [
3191            '0',
3192            '44',
3193            '44',
3194            '44',
3195            '45',
3196            '45',
3197            '45',
3198        ],
3199        'שץ'      => [
3200            '0',
3201            '44',
3202            '',
3203            '44',
3204            '45',
3205            '',
3206            '45',
3207        ],
3208        'שש'      => [
3209            '0',
3210            '4',
3211            '4',
3212            '4',
3213            '44',
3214            '44',
3215            '44',
3216        ],
3217        'ש'       => [
3218            '0',
3219            '4',
3220            '4',
3221            '4',
3222        ],
3223        'תג'      => [
3224            '0',
3225            '34',
3226            '34',
3227            '34',
3228        ],
3229        'תז'      => [
3230            '0',
3231            '34',
3232            '34',
3233            '34',
3234        ],
3235        'תש'      => [
3236            '0',
3237            '4',
3238            '4',
3239            '4',
3240        ],
3241        'תת'      => [
3242            '0',
3243            '3',
3244            '3',
3245            '3',
3246            '4',
3247            '4',
3248            '4',
3249            '33',
3250            '33',
3251            '33',
3252            '44',
3253            '44',
3254            '44',
3255            '34',
3256            '34',
3257            '34',
3258            '43',
3259            '43',
3260            '43',
3261        ],
3262        'ת'       => [
3263            '0',
3264            '3',
3265            '3',
3266            '3',
3267            '4',
3268            '4',
3269            '4',
3270        ],
3271        // Arabic alphabet
3272        'ا'       => [
3273            '1',
3274            '0',
3275            '',
3276            '',
3277        ],
3278        'ب'       => [
3279            '0',
3280            '7',
3281            '7',
3282            '7',
3283        ],
3284        'ت'       => [
3285            '0',
3286            '3',
3287            '3',
3288            '3',
3289        ],
3290        'ث'       => [
3291            '0',
3292            '3',
3293            '3',
3294            '3',
3295        ],
3296        'ج'       => [
3297            '0',
3298            '4',
3299            '4',
3300            '4',
3301        ],
3302        'ح'       => [
3303            '0',
3304            '5',
3305            '5',
3306            '5',
3307        ],
3308        'خ'       => [
3309            '0',
3310            '5',
3311            '5',
3312            '5',
3313        ],
3314        'د'       => [
3315            '0',
3316            '3',
3317            '3',
3318            '3',
3319        ],
3320        'ذ'       => [
3321            '0',
3322            '3',
3323            '3',
3324            '3',
3325        ],
3326        'ر'       => [
3327            '0',
3328            '9',
3329            '9',
3330            '9',
3331        ],
3332        'ز'       => [
3333            '0',
3334            '4',
3335            '4',
3336            '4',
3337        ],
3338        'س'       => [
3339            '0',
3340            '4',
3341            '4',
3342            '4',
3343        ],
3344        'ش'       => [
3345            '0',
3346            '4',
3347            '4',
3348            '4',
3349        ],
3350        'ص'       => [
3351            '0',
3352            '4',
3353            '4',
3354            '4',
3355        ],
3356        'ض'       => [
3357            '0',
3358            '3',
3359            '3',
3360            '3',
3361        ],
3362        'ط'       => [
3363            '0',
3364            '3',
3365            '3',
3366            '3',
3367        ],
3368        'ظ'       => [
3369            '0',
3370            '4',
3371            '4',
3372            '4',
3373        ],
3374        'ع'       => [
3375            '1',
3376            '0',
3377            '',
3378            '',
3379        ],
3380        'غ'       => [
3381            '0',
3382            '0',
3383            '',
3384            '',
3385        ],
3386        'ف'       => [
3387            '0',
3388            '7',
3389            '7',
3390            '7',
3391        ],
3392        'ق'       => [
3393            '0',
3394            '5',
3395            '5',
3396            '5',
3397        ],
3398        'ك'       => [
3399            '0',
3400            '5',
3401            '5',
3402            '5',
3403        ],
3404        'ل'       => [
3405            '0',
3406            '8',
3407            '8',
3408            '8',
3409        ],
3410        'لا'      => [
3411            '0',
3412            '8',
3413            '8',
3414            '8',
3415        ],
3416        'م'       => [
3417            '0',
3418            '6',
3419            '6',
3420            '6',
3421        ],
3422        'ن'       => [
3423            '0',
3424            '6',
3425            '6',
3426            '6',
3427        ],
3428        'هن'      => [
3429            '0',
3430            '66',
3431            '66',
3432            '66',
3433        ],
3434        'ه'       => [
3435            '0',
3436            '5',
3437            '5',
3438            '',
3439        ],
3440        'و'       => [
3441            '1',
3442            '',
3443            '',
3444            '',
3445            '7',
3446            '',
3447            '',
3448        ],
3449        'ي'       => [
3450            '0',
3451            '1',
3452            '',
3453            '',
3454        ],
3455        'آ'       => [
3456            '0',
3457            '1',
3458            '',
3459            '',
3460        ],
3461        'ة'       => [
3462            '0',
3463            '',
3464            '',
3465            '3',
3466        ],
3467        'ی'       => [
3468            '0',
3469            '1',
3470            '',
3471            '',
3472        ],
3473        'ى'       => [
3474            '1',
3475            '1',
3476            '',
3477            '',
3478        ],
3479    ];
3480
3481    /**
3482     * Which algorithms are supported.
3483     *
3484     * @return string[]
3485     */
3486    public static function getAlgorithms(): array
3487    {
3488        return [
3489            /* I18N: http://en.wikipedia.org/wiki/Soundex */
3490            'std' => I18N::translate('Russell'),
3491            /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
3492            'dm'  => I18N::translate('Daitch-Mokotoff'),
3493        ];
3494    }
3495
3496    /**
3497     * Is there a match between two soundex codes?
3498     *
3499     * @param string $soundex1
3500     * @param string $soundex2
3501     *
3502     * @return bool
3503     */
3504    public static function compare($soundex1, $soundex2): bool
3505    {
3506        if ($soundex1 !== '' && $soundex2 !== '') {
3507            return !empty(array_intersect(explode(':', $soundex1), explode(':', $soundex2)));
3508        }
3509
3510        return false;
3511    }
3512
3513    /**
3514     * Generate Russell soundex codes for a given text.
3515     *
3516     * @param string $text
3517     *
3518     * @return string
3519     */
3520    public static function russell(string $text): string
3521    {
3522        $words         = explode(' ', $text);
3523        $soundex_array = [];
3524
3525        foreach ($words as $word) {
3526            $soundex = soundex($word);
3527
3528            // Only return codes from recognisable sounds
3529            if ($soundex !== '0000') {
3530                $soundex_array[] = $soundex;
3531            }
3532        }
3533
3534        // Combine words, e.g. “New York” as “Newyork”
3535        if (count($words) > 1) {
3536            $soundex_array[] = soundex(str_replace(' ', '', $text));
3537        }
3538
3539        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
3540        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);
3541
3542        return implode(':', $soundex_array);
3543    }
3544
3545    /**
3546     * Generate Daitch–Mokotoff soundex codes for a given text.
3547     *
3548     * @param string $text
3549     *
3550     * @return string
3551     */
3552    public static function daitchMokotoff(string $text): string
3553    {
3554        $words         = explode(' ', $text);
3555        $soundex_array = [];
3556
3557        foreach ($words as $word) {
3558            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
3559        }
3560        // Combine words, e.g. “New York” as “Newyork”
3561        if (count($words) > 1) {
3562            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(str_replace(' ', '', $text)));
3563        }
3564
3565        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
3566        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);
3567
3568        return implode(':', $soundex_array);
3569    }
3570
3571    /**
3572     * Calculate the Daitch-Mokotoff soundex for a word.
3573     *
3574     * @param string $name
3575     *
3576     * @return string[] List of possible DM codes for the word.
3577     */
3578    private static function daitchMokotoffWord($name): array
3579    {
3580        // Apply special transformation rules to the input string
3581        $name = I18N::strtoupper($name);
3582        foreach (self::TRANSFORM_NAMES as $transformRule) {
3583            $name = str_replace($transformRule[0], $transformRule[1], $name);
3584        }
3585
3586        // Initialize
3587        $name_script = I18N::textScript($name);
3588        $noVowels    = ($name_script === 'Hebr' || $name_script === 'Arab');
3589
3590        $lastPos         = strlen($name) - 1;
3591        $currPos         = 0;
3592        $state           = 1; // 1: start of input string, 2: before vowel, 3: other
3593        $result          = []; // accumulate complete 6-digit D-M codes here
3594        $partialResult   = []; // accumulate incomplete D-M codes here
3595        $partialResult[] = ['!']; // initialize 1st partial result  ('!' stops "duplicate sound" check)
3596
3597        // Loop through the input string.
3598        // Stop when the string is exhausted or when no more partial results remain
3599        while (count($partialResult) !== 0 && $currPos <= $lastPos) {
3600            // Find the DM coding table entry for the chunk at the current position
3601            $thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
3602            while ($thisEntry !== '') {
3603                if (isset(self::DM_SOUNDS[$thisEntry])) {
3604                    break;
3605                }
3606                $thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
3607            }
3608            if ($thisEntry === '') {
3609                $currPos++; // Not in table: advance pointer to next byte
3610                continue; // and try again
3611            }
3612
3613            $soundTableEntry = self::DM_SOUNDS[$thisEntry];
3614            $workingResult   = $partialResult;
3615            $partialResult   = [];
3616            $currPos += strlen($thisEntry);
3617
3618            // Not at beginning of input string
3619            if ($state !== 1) {
3620                if ($currPos <= $lastPos) {
3621                    // Determine whether the next chunk is a vowel
3622                    $nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
3623                    while ($nextEntry !== '') {
3624                        if (isset(self::DM_SOUNDS[$nextEntry])) {
3625                            break;
3626                        }
3627                        $nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
3628                    }
3629                } else {
3630                    $nextEntry = '';
3631                }
3632                if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
3633                    $state = 2;
3634                } else {
3635                    // Next chunk is a vowel
3636                    $state = 3;
3637                }
3638            }
3639
3640            while ($state < count($soundTableEntry)) {
3641                // empty means 'ignore this sound in this state'
3642                if ($soundTableEntry[$state] === '') {
3643                    foreach ($workingResult as $workingEntry) {
3644                        $tempEntry                        = $workingEntry;
3645                        $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
3646                        $partialResult[]                  = $tempEntry;
3647                    }
3648                } else {
3649                    foreach ($workingResult as $workingEntry) {
3650                        if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
3651                            // Incoming sound isn't a duplicate of the previous sound
3652                            $workingEntry[] = $soundTableEntry[$state];
3653                        } elseif ($noVowels) {
3654                            // Incoming sound is a duplicate of the previous sound
3655                            // For Hebrew and Arabic, we need to create a pair of D-M sound codes,
3656                            // one of the pair with only a single occurrence of the duplicate sound,
3657                            // the other with both occurrences
3658                            $workingEntry[] = $soundTableEntry[$state];
3659                        }
3660
3661                        if (count($workingEntry) < 7) {
3662                            $partialResult[] = $workingEntry;
3663                        } else {
3664                            // This is the 6th code in the sequence
3665                            // We're looking for 7 entries because the first is '!' and doesn't count
3666                            $tempResult = str_replace('!', '', implode('', $workingEntry));
3667                            // Only return codes from recognisable sounds
3668                            if ($tempResult) {
3669                                $result[] = substr($tempResult . '000000', 0, 6);
3670                            }
3671                        }
3672                    }
3673                }
3674                $state += 3; // Advance to next triplet while keeping the same basic state
3675            }
3676        }
3677
3678        // Zero-fill and copy all remaining partial results
3679        foreach ($partialResult as $workingEntry) {
3680            $tempResult = str_replace('!', '', implode('', $workingEntry));
3681            // Only return codes from recognisable sounds
3682            if ($tempResult) {
3683                $result[] = substr($tempResult . '000000', 0, 6);
3684            }
3685        }
3686
3687        return $result;
3688    }
3689}
3690