xref: /webtrees/app/Soundex.php (revision 17907095d917ef2b56d7bdf08f08ea42db03cb32)
1<?php
2/**
3 * webtrees: online genealogy
4 * Copyright (C) 2019 webtrees development team
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16declare(strict_types=1);
17
18namespace Fisharebest\Webtrees;
19
20/**
21 * Phonetic matching of strings.
22 */
23class Soundex
24{
25    // Determine the Daitch–Mokotoff Soundex code for a word
26    // Original implementation by Gerry Kroll, and analysis by Meliza Amity
27
28    // Maximum table key length, counted in bytes -- NOT in UTF-8 characters!
29    private const MAXCHAR = 7;
30
31    /**
32     * Name transformation arrays.
33     * Used to transform the Name string to simplify the "sounds like" table.
34     * This is especially useful in Hebrew.
35     *
36     * Each array entry defines the "from" and "to" arguments of a preg($from, $to, $text)
37     * function call to achieve the desired transformations.
38     *
39     * Note about the use of "\x01":
40     * This code, which can’t legitimately occur in the kind of text we're dealing with,
41     * is used as a place-holder so that conditional string replacements can be done.
42     */
43    private const TRANSFORM_NAMES = [
44        // Force Yiddish ligatures to be treated as separate letters (first three rules)
45        [
46            'װ',
47            'וו',
48        ],
49        [
50            'ײ',
51            'יי',
52        ],
53        [
54            'ױ',
55            'וי',
56        ],
57        [ // The remaining rules rewrite letter sequences; each entry is a [from, to] pair
58            'בו',
59            'בע',
60        ],
61        [
62            'פו',
63            'פע',
64        ],
65        [
66            'ומ',
67            'עמ',
68        ],
69        [
70            'ום',
71            'עם',
72        ],
73        [
74            'ונ',
75            'ענ',
76        ],
77        [
78            'ון',
79            'ען',
80        ],
81        [
82            'וו',
83            'ב',
84        ],
85        [ // Remove any stray "\x01" from the text so it can serve as the place-holder below
86            "\x01",
87            '',
88        ],
89        [ // Hide the leading letter pair behind the "\x01" place-holder so the generic pair rule below skips it.
90            'ייה$', // NOTE(review): the '$' presumably marks end-of-word; whether it acts as an anchor depends on how the consumer applies the rule -- confirm
91            "\x01ה",
92        ],
93        [ // Same place-holder protection for the second special-case ending
94            'ייע$',
95            "\x01ע",
96        ],
97        [ // Generic rule for every occurrence of the pair that was not protected above
98            'יי',
99            'ע',
100        ],
101        [ // Turn the place-holder back into the original letter pair
102            "\x01",
103            'יי',
104        ],
105    ];
106
107    /**
108     * The DM sound coding table is organized this way:
109     * key: a variable-length string that corresponds to the UTF-8 character sequence
110     * represented by the table entry. Currently, that string can be up to 7
110     * bytes long. This maximum length is defined by the class constant
111     * MAXCHAR.
113     *
114     * value: an array as follows:
115     * [0]:  zero if not a vowel
116     * [1]:  sound value when this string is at the beginning of the word
117     * [2]:  sound value when this string is followed by a vowel
118     * [3]:  sound value for other cases
119     * [1],[2],[3] can be repeated several times to create branches in the code
120     * an empty sound value means "ignore in this state"
121     */
122    private const DM_SOUNDS = [
123        'A'       => [
124            '1',
125            '0',
126            '',
127            '',
128        ],
129        'À'       => [
130            '1',
131            '0',
132            '',
133            '',
134        ],
135        'Á'       => [
136            '1',
137            '0',
138            '',
139            '',
140        ],
141        'Â'       => [
142            '1',
143            '0',
144            '',
145            '',
146        ],
147        'Ã'       => [
148            '1',
149            '0',
150            '',
151            '',
152        ],
153        'Ä'       => [
154            '1',
155            '0',
156            '1',
157            '',
158            '0',
159            '',
160            '',
161        ],
162        'Å'       => [
163            '1',
164            '0',
165            '',
166            '',
167        ],
168        'Ă'       => [
169            '1',
170            '0',
171            '',
172            '',
173        ],
174        'Ą'       => [
175            '1',
176            '',
177            '',
178            '',
179            '',
180            '',
181            '6',
182        ],
183        'Ạ'       => [
184            '1',
185            '0',
186            '',
187            '',
188        ],
189        'Ả'       => [
190            '1',
191            '0',
192            '',
193            '',
194        ],
195        'Ấ'       => [
196            '1',
197            '0',
198            '',
199            '',
200        ],
201        'Ầ'       => [
202            '1',
203            '0',
204            '',
205            '',
206        ],
207        'Ẩ'       => [
208            '1',
209            '0',
210            '',
211            '',
212        ],
213        'Ẫ'       => [
214            '1',
215            '0',
216            '',
217            '',
218        ],
219        'Ậ'       => [
220            '1',
221            '0',
222            '',
223            '',
224        ],
225        'Ắ'       => [
226            '1',
227            '0',
228            '',
229            '',
230        ],
231        'Ằ'       => [
232            '1',
233            '0',
234            '',
235            '',
236        ],
237        'Ẳ'       => [
238            '1',
239            '0',
240            '',
241            '',
242        ],
243        'Ẵ'       => [
244            '1',
245            '0',
246            '',
247            '',
248        ],
249        'Ặ'       => [
250            '1',
251            '0',
252            '',
253            '',
254        ],
255        'AE'      => [
256            '1',
257            '0',
258            '1',
259            '',
260        ],
261        'Æ'       => [
262            '1',
263            '0',
264            '1',
265            '',
266        ],
267        'AI'      => [
268            '1',
269            '0',
270            '1',
271            '',
272        ],
273        'AJ'      => [
274            '1',
275            '0',
276            '1',
277            '',
278        ],
279        'AU'      => [
280            '1',
281            '0',
282            '7',
283            '',
284        ],
285        'AV'      => [
286            '1',
287            '0',
288            '7',
289            '',
290            '7',
291            '7',
292            '7',
293        ],
294        'ÄU'      => [
295            '1',
296            '0',
297            '1',
298            '',
299        ],
300        'AY'      => [
301            '1',
302            '0',
303            '1',
304            '',
305        ],
306        'B'       => [
307            '0',
308            '7',
309            '7',
310            '7',
311        ],
312        'C'       => [
313            '0',
314            '5',
315            '5',
316            '5',
317            '34',
318            '4',
319            '4',
320        ],
321        'Ć'       => [
322            '0',
323            '4',
324            '4',
325            '4',
326        ],
327        'Č'       => [
328            '0',
329            '4',
330            '4',
331            '4',
332        ],
333        'Ç'       => [
334            '0',
335            '4',
336            '4',
337            '4',
338        ],
339        'CH'      => [
340            '0',
341            '5',
342            '5',
343            '5',
344            '34',
345            '4',
346            '4',
347        ],
348        'CHS'     => [
349            '0',
350            '5',
351            '54',
352            '54',
353        ],
354        'CK'      => [
355            '0',
356            '5',
357            '5',
358            '5',
359            '45',
360            '45',
361            '45',
362        ],
363        'CCS'     => [
364            '0',
365            '4',
366            '4',
367            '4',
368        ],
369        'CS'      => [
370            '0',
371            '4',
372            '4',
373            '4',
374        ],
375        'CSZ'     => [
376            '0',
377            '4',
378            '4',
379            '4',
380        ],
381        'CZ'      => [
382            '0',
383            '4',
384            '4',
385            '4',
386        ],
387        'CZS'     => [
388            '0',
389            '4',
390            '4',
391            '4',
392        ],
393        'D'       => [
394            '0',
395            '3',
396            '3',
397            '3',
398        ],
399        'Ď'       => [
400            '0',
401            '3',
402            '3',
403            '3',
404        ],
405        'Đ'       => [
406            '0',
407            '3',
408            '3',
409            '3',
410        ],
411        'DRS'     => [
412            '0',
413            '4',
414            '4',
415            '4',
416        ],
417        'DRZ'     => [
418            '0',
419            '4',
420            '4',
421            '4',
422        ],
423        'DS'      => [
424            '0',
425            '4',
426            '4',
427            '4',
428        ],
429        'DSH'     => [
430            '0',
431            '4',
432            '4',
433            '4',
434        ],
435        'DSZ'     => [
436            '0',
437            '4',
438            '4',
439            '4',
440        ],
441        'DT'      => [
442            '0',
443            '3',
444            '3',
445            '3',
446        ],
447        'DDZ'     => [
448            '0',
449            '4',
450            '4',
451            '4',
452        ],
453        'DDZS'    => [
454            '0',
455            '4',
456            '4',
457            '4',
458        ],
459        'DZ'      => [
460            '0',
461            '4',
462            '4',
463            '4',
464        ],
465        'DŹ'      => [
466            '0',
467            '4',
468            '4',
469            '4',
470        ],
471        'DŻ'      => [
472            '0',
473            '4',
474            '4',
475            '4',
476        ],
477        'DZH'     => [
478            '0',
479            '4',
480            '4',
481            '4',
482        ],
483        'DZS'     => [
484            '0',
485            '4',
486            '4',
487            '4',
488        ],
489        'E'       => [
490            '1',
491            '0',
492            '',
493            '',
494        ],
495        'È'       => [
496            '1',
497            '0',
498            '',
499            '',
500        ],
501        'É'       => [
502            '1',
503            '0',
504            '',
505            '',
506        ],
507        'Ê'       => [
508            '1',
509            '0',
510            '',
511            '',
512        ],
513        'Ë'       => [
514            '1',
515            '0',
516            '',
517            '',
518        ],
519        'Ĕ'       => [
520            '1',
521            '0',
522            '',
523            '',
524        ],
525        'Ė'       => [
526            '1',
527            '0',
528            '',
529            '',
530        ],
531        'Ę'       => [
532            '1',
533            '',
534            '',
535            '6',
536            '',
537            '',
538            '',
539        ],
540        'Ẹ'       => [
541            '1',
542            '0',
543            '',
544            '',
545        ],
546        'Ẻ'       => [
547            '1',
548            '0',
549            '',
550            '',
551        ],
552        'Ẽ'       => [
553            '1',
554            '0',
555            '',
556            '',
557        ],
558        'Ế'       => [
559            '1',
560            '0',
561            '',
562            '',
563        ],
564        'Ề'       => [
565            '1',
566            '0',
567            '',
568            '',
569        ],
570        'Ể'       => [
571            '1',
572            '0',
573            '',
574            '',
575        ],
576        'Ễ'       => [
577            '1',
578            '0',
579            '',
580            '',
581        ],
582        'Ệ'       => [
583            '1',
584            '0',
585            '',
586            '',
587        ],
588        'EAU'     => [
589            '1',
590            '0',
591            '',
592            '',
593        ],
594        'EI'      => [
595            '1',
596            '0',
597            '1',
598            '',
599        ],
600        'EJ'      => [
601            '1',
602            '0',
603            '1',
604            '',
605        ],
606        'EU'      => [
607            '1',
608            '1',
609            '1',
610            '',
611        ],
612        'EY'      => [
613            '1',
614            '0',
615            '1',
616            '',
617        ],
618        'F'       => [
619            '0',
620            '7',
621            '7',
622            '7',
623        ],
624        'FB'      => [
625            '0',
626            '7',
627            '7',
628            '7',
629        ],
630        'G'       => [
631            '0',
632            '5',
633            '5',
634            '5',
635            '34',
636            '4',
637            '4',
638        ],
639        'Ğ'       => [
640            '0',
641            '',
642            '',
643            '',
644        ],
645        'GGY'     => [
646            '0',
647            '5',
648            '5',
649            '5',
650        ],
651        'GY'      => [
652            '0',
653            '5',
654            '5',
655            '5',
656        ],
657        'H'       => [
658            '0',
659            '5',
660            '5',
661            '',
662            '5',
663            '5',
664            '5',
665        ],
666        'I'       => [
667            '1',
668            '0',
669            '',
670            '',
671        ],
672        'Ì'       => [
673            '1',
674            '0',
675            '',
676            '',
677        ],
678        'Í'       => [
679            '1',
680            '0',
681            '',
682            '',
683        ],
684        'Î'       => [
685            '1',
686            '0',
687            '',
688            '',
689        ],
690        'Ï'       => [
691            '1',
692            '0',
693            '',
694            '',
695        ],
696        'Ĩ'       => [
697            '1',
698            '0',
699            '',
700            '',
701        ],
702        'Į'       => [
703            '1',
704            '0',
705            '',
706            '',
707        ],
708        'İ'       => [
709            '1',
710            '0',
711            '',
712            '',
713        ],
714        'Ỉ'       => [
715            '1',
716            '0',
717            '',
718            '',
719        ],
720        'Ị'       => [
721            '1',
722            '0',
723            '',
724            '',
725        ],
726        'IA'      => [
727            '1',
728            '1',
729            '',
730            '',
731        ],
732        'IE'      => [
733            '1',
734            '1',
735            '',
736            '',
737        ],
738        'IO'      => [
739            '1',
740            '1',
741            '',
742            '',
743        ],
744        'IU'      => [
745            '1',
746            '1',
747            '',
748            '',
749        ],
750        'J'       => [
751            '0',
752            '1',
753            '',
754            '',
755            '4',
756            '4',
757            '4',
758            '5',
759            '5',
760            '',
761        ],
762        'K'       => [
763            '0',
764            '5',
765            '5',
766            '5',
767        ],
768        'KH'      => [
769            '0',
770            '5',
771            '5',
772            '5',
773        ],
774        'KS'      => [
775            '0',
776            '5',
777            '54',
778            '54',
779        ],
780        'L'       => [
781            '0',
782            '8',
783            '8',
784            '8',
785        ],
786        'Ľ'       => [
787            '0',
788            '8',
789            '8',
790            '8',
791        ],
792        'Ĺ'       => [
793            '0',
794            '8',
795            '8',
796            '8',
797        ],
798        'Ł'       => [
799            '0',
800            '7',
801            '7',
802            '7',
803            '8',
804            '8',
805            '8',
806        ],
807        'LL'      => [
808            '0',
809            '8',
810            '8',
811            '8',
812            '58',
813            '8',
814            '8',
815            '1',
816            '8',
817            '8',
818        ],
819        'LLY'     => [
820            '0',
821            '8',
822            '8',
823            '8',
824            '1',
825            '8',
826            '8',
827        ],
828        'LY'      => [
829            '0',
830            '8',
831            '8',
832            '8',
833            '1',
834            '8',
835            '8',
836        ],
837        'M'       => [
838            '0',
839            '6',
840            '6',
841            '6',
842        ],
843        'MĔ'      => [
844            '0',
845            '66',
846            '66',
847            '66',
848        ],
849        'MN'      => [
850            '0',
851            '66',
852            '66',
853            '66',
854        ],
855        'N'       => [
856            '0',
857            '6',
858            '6',
859            '6',
860        ],
861        'Ń'       => [
862            '0',
863            '6',
864            '6',
865            '6',
866        ],
867        'Ň'       => [
868            '0',
869            '6',
870            '6',
871            '6',
872        ],
873        'Ñ'       => [
874            '0',
875            '6',
876            '6',
877            '6',
878        ],
879        'NM'      => [
880            '0',
881            '66',
882            '66',
883            '66',
884        ],
885        'O'       => [
886            '1',
887            '0',
888            '',
889            '',
890        ],
891        'Ò'       => [
892            '1',
893            '0',
894            '',
895            '',
896        ],
897        'Ó'       => [
898            '1',
899            '0',
900            '',
901            '',
902        ],
903        'Ô'       => [
904            '1',
905            '0',
906            '',
907            '',
908        ],
909        'Õ'       => [
910            '1',
911            '0',
912            '',
913            '',
914        ],
915        'Ö'       => [
916            '1',
917            '0',
918            '',
919            '',
920        ],
921        'Ø'       => [
922            '1',
923            '0',
924            '',
925            '',
926        ],
927        'Ő'       => [
928            '1',
929            '0',
930            '',
931            '',
932        ],
933        'Œ'       => [
934            '1',
935            '0',
936            '',
937            '',
938        ],
939        'Ơ'       => [
940            '1',
941            '0',
942            '',
943            '',
944        ],
945        'Ọ'       => [
946            '1',
947            '0',
948            '',
949            '',
950        ],
951        'Ỏ'       => [
952            '1',
953            '0',
954            '',
955            '',
956        ],
957        'Ố'       => [
958            '1',
959            '0',
960            '',
961            '',
962        ],
963        'Ồ'       => [
964            '1',
965            '0',
966            '',
967            '',
968        ],
969        'Ổ'       => [
970            '1',
971            '0',
972            '',
973            '',
974        ],
975        'Ỗ'       => [
976            '1',
977            '0',
978            '',
979            '',
980        ],
981        'Ộ'       => [
982            '1',
983            '0',
984            '',
985            '',
986        ],
987        'Ớ'       => [
988            '1',
989            '0',
990            '',
991            '',
992        ],
993        'Ờ'       => [
994            '1',
995            '0',
996            '',
997            '',
998        ],
999        'Ở'       => [
1000            '1',
1001            '0',
1002            '',
1003            '',
1004        ],
1005        'Ỡ'       => [
1006            '1',
1007            '0',
1008            '',
1009            '',
1010        ],
1011        'Ợ'       => [
1012            '1',
1013            '0',
1014            '',
1015            '',
1016        ],
1017        'OE'      => [
1018            '1',
1019            '0',
1020            '',
1021            '',
1022        ],
1023        'OI'      => [
1024            '1',
1025            '0',
1026            '1',
1027            '',
1028        ],
1029        'OJ'      => [
1030            '1',
1031            '0',
1032            '1',
1033            '',
1034        ],
1035        'OU'      => [
1036            '1',
1037            '0',
1038            '',
1039            '',
1040        ],
1041        'OY'      => [
1042            '1',
1043            '0',
1044            '1',
1045            '',
1046        ],
1047        'P'       => [
1048            '0',
1049            '7',
1050            '7',
1051            '7',
1052        ],
1053        'PF'      => [
1054            '0',
1055            '7',
1056            '7',
1057            '7',
1058        ],
1059        'PH'      => [
1060            '0',
1061            '7',
1062            '7',
1063            '7',
1064        ],
1065        'Q'       => [
1066            '0',
1067            '5',
1068            '5',
1069            '5',
1070        ],
1071        'R'       => [
1072            '0',
1073            '9',
1074            '9',
1075            '9',
1076        ],
1077        'Ř'       => [
1078            '0',
1079            '4',
1080            '4',
1081            '4',
1082        ],
1083        'RS'      => [
1084            '0',
1085            '4',
1086            '4',
1087            '4',
1088            '94',
1089            '94',
1090            '94',
1091        ],
1092        'RZ'      => [
1093            '0',
1094            '4',
1095            '4',
1096            '4',
1097            '94',
1098            '94',
1099            '94',
1100        ],
1101        'S'       => [
1102            '0',
1103            '4',
1104            '4',
1105            '4',
1106        ],
1107        'Ś'       => [
1108            '0',
1109            '4',
1110            '4',
1111            '4',
1112        ],
1113        'Š'       => [
1114            '0',
1115            '4',
1116            '4',
1117            '4',
1118        ],
1119        'Ş'       => [
1120            '0',
1121            '4',
1122            '4',
1123            '4',
1124        ],
1125        'SC'      => [
1126            '0',
1127            '2',
1128            '4',
1129            '4',
1130        ],
1131        'ŠČ'      => [
1132            '0',
1133            '2',
1134            '4',
1135            '4',
1136        ],
1137        'SCH'     => [
1138            '0',
1139            '4',
1140            '4',
1141            '4',
1142        ],
1143        'SCHD'    => [
1144            '0',
1145            '2',
1146            '43',
1147            '43',
1148        ],
1149        'SCHT'    => [
1150            '0',
1151            '2',
1152            '43',
1153            '43',
1154        ],
1155        'SCHTCH'  => [
1156            '0',
1157            '2',
1158            '4',
1159            '4',
1160        ],
1161        'SCHTSCH' => [
1162            '0',
1163            '2',
1164            '4',
1165            '4',
1166        ],
1167        'SCHTSH'  => [
1168            '0',
1169            '2',
1170            '4',
1171            '4',
1172        ],
1173        'SD'      => [
1174            '0',
1175            '2',
1176            '43',
1177            '43',
1178        ],
1179        'SH'      => [
1180            '0',
1181            '4',
1182            '4',
1183            '4',
1184        ],
1185        'SHCH'    => [
1186            '0',
1187            '2',
1188            '4',
1189            '4',
1190        ],
1191        'SHD'     => [
1192            '0',
1193            '2',
1194            '43',
1195            '43',
1196        ],
1197        'SHT'     => [
1198            '0',
1199            '2',
1200            '43',
1201            '43',
1202        ],
1203        'SHTCH'   => [
1204            '0',
1205            '2',
1206            '4',
1207            '4',
1208        ],
1209        'SHTSH'   => [
1210            '0',
1211            '2',
1212            '4',
1213            '4',
1214        ],
1215        'ß'       => [
1216            '0',
1217            '',
1218            '4',
1219            '4',
1220        ],
1221        'ST'      => [
1222            '0',
1223            '2',
1224            '43',
1225            '43',
1226        ],
1227        'STCH'    => [
1228            '0',
1229            '2',
1230            '4',
1231            '4',
1232        ],
1233        'STRS'    => [
1234            '0',
1235            '2',
1236            '4',
1237            '4',
1238        ],
1239        'STRZ'    => [
1240            '0',
1241            '2',
1242            '4',
1243            '4',
1244        ],
1245        'STSCH'   => [
1246            '0',
1247            '2',
1248            '4',
1249            '4',
1250        ],
1251        'STSH'    => [
1252            '0',
1253            '2',
1254            '4',
1255            '4',
1256        ],
1257        'SSZ'     => [
1258            '0',
1259            '4',
1260            '4',
1261            '4',
1262        ],
1263        'SZ'      => [
1264            '0',
1265            '4',
1266            '4',
1267            '4',
1268        ],
1269        'SZCS'    => [
1270            '0',
1271            '2',
1272            '4',
1273            '4',
1274        ],
1275        'SZCZ'    => [
1276            '0',
1277            '2',
1278            '4',
1279            '4',
1280        ],
1281        'SZD'     => [
1282            '0',
1283            '2',
1284            '43',
1285            '43',
1286        ],
1287        'SZT'     => [
1288            '0',
1289            '2',
1290            '43',
1291            '43',
1292        ],
1293        'T'       => [
1294            '0',
1295            '3',
1296            '3',
1297            '3',
1298        ],
1299        'Ť'       => [
1300            '0',
1301            '3',
1302            '3',
1303            '3',
1304        ],
1305        'Ţ'       => [
1306            '0',
1307            '3',
1308            '3',
1309            '3',
1310            '4',
1311            '4',
1312            '4',
1313        ],
1314        'TC'      => [
1315            '0',
1316            '4',
1317            '4',
1318            '4',
1319        ],
1320        'TCH'     => [
1321            '0',
1322            '4',
1323            '4',
1324            '4',
1325        ],
1326        'TH'      => [
1327            '0',
1328            '3',
1329            '3',
1330            '3',
1331        ],
1332        'TRS'     => [
1333            '0',
1334            '4',
1335            '4',
1336            '4',
1337        ],
1338        'TRZ'     => [
1339            '0',
1340            '4',
1341            '4',
1342            '4',
1343        ],
1344        'TS'      => [
1345            '0',
1346            '4',
1347            '4',
1348            '4',
1349        ],
1350        'TSCH'    => [
1351            '0',
1352            '4',
1353            '4',
1354            '4',
1355        ],
1356        'TSH'     => [
1357            '0',
1358            '4',
1359            '4',
1360            '4',
1361        ],
1362        'TSZ'     => [
1363            '0',
1364            '4',
1365            '4',
1366            '4',
1367        ],
1368        'TTCH'    => [
1369            '0',
1370            '4',
1371            '4',
1372            '4',
1373        ],
1374        'TTS'     => [
1375            '0',
1376            '4',
1377            '4',
1378            '4',
1379        ],
1380        'TTSCH'   => [
1381            '0',
1382            '4',
1383            '4',
1384            '4',
1385        ],
1386        'TTSZ'    => [
1387            '0',
1388            '4',
1389            '4',
1390            '4',
1391        ],
1392        'TTZ'     => [
1393            '0',
1394            '4',
1395            '4',
1396            '4',
1397        ],
1398        'TZ'      => [
1399            '0',
1400            '4',
1401            '4',
1402            '4',
1403        ],
1404        'TZS'     => [
1405            '0',
1406            '4',
1407            '4',
1408            '4',
1409        ],
1410        'U'       => [
1411            '1',
1412            '0',
1413            '',
1414            '',
1415        ],
1416        'Ù'       => [
1417            '1',
1418            '0',
1419            '',
1420            '',
1421        ],
1422        'Ú'       => [
1423            '1',
1424            '0',
1425            '',
1426            '',
1427        ],
1428        'Û'       => [
1429            '1',
1430            '0',
1431            '',
1432            '',
1433        ],
1434        'Ü'       => [
1435            '1',
1436            '0',
1437            '',
1438            '',
1439        ],
1440        'Ũ'       => [
1441            '1',
1442            '0',
1443            '',
1444            '',
1445        ],
1446        'Ū'       => [
1447            '1',
1448            '0',
1449            '',
1450            '',
1451        ],
1452        'Ů'       => [
1453            '1',
1454            '0',
1455            '',
1456            '',
1457        ],
1458        'Ű'       => [
1459            '1',
1460            '0',
1461            '',
1462            '',
1463        ],
1464        'Ų'       => [
1465            '1',
1466            '0',
1467            '',
1468            '',
1469        ],
1470        'Ư'       => [
1471            '1',
1472            '0',
1473            '',
1474            '',
1475        ],
1476        'Ụ'       => [
1477            '1',
1478            '0',
1479            '',
1480            '',
1481        ],
1482        'Ủ'       => [
1483            '1',
1484            '0',
1485            '',
1486            '',
1487        ],
1488        'Ứ'       => [
1489            '1',
1490            '0',
1491            '',
1492            '',
1493        ],
1494        'Ừ'       => [
1495            '1',
1496            '0',
1497            '',
1498            '',
1499        ],
1500        'Ử'       => [
1501            '1',
1502            '0',
1503            '',
1504            '',
1505        ],
1506        'Ữ'       => [
1507            '1',
1508            '0',
1509            '',
1510            '',
1511        ],
1512        'Ự'       => [
1513            '1',
1514            '0',
1515            '',
1516            '',
1517        ],
1518        'UE'      => [
1519            '1',
1520            '0',
1521            '',
1522            '',
1523        ],
1524        'UI'      => [
1525            '1',
1526            '0',
1527            '1',
1528            '',
1529        ],
1530        'UJ'      => [
1531            '1',
1532            '0',
1533            '1',
1534            '',
1535        ],
1536        'UY'      => [
1537            '1',
1538            '0',
1539            '1',
1540            '',
1541        ],
1542        'UW'      => [
1543            '1',
1544            '0',
1545            '1',
1546            '',
1547            '0',
1548            '7',
1549            '7',
1550        ],
1551        'V'       => [
1552            '0',
1553            '7',
1554            '7',
1555            '7',
1556        ],
1557        'W'       => [
1558            '0',
1559            '7',
1560            '7',
1561            '7',
1562        ],
1563        'X'       => [
1564            '0',
1565            '5',
1566            '54',
1567            '54',
1568        ],
1569        'Y'       => [
1570            '1',
1571            '1',
1572            '',
1573            '',
1574        ],
1575        'Ý'       => [
1576            '1',
1577            '1',
1578            '',
1579            '',
1580        ],
1581        'Ỳ'       => [
1582            '1',
1583            '1',
1584            '',
1585            '',
1586        ],
1587        'Ỵ'       => [
1588            '1',
1589            '1',
1590            '',
1591            '',
1592        ],
1593        'Ỷ'       => [
1594            '1',
1595            '1',
1596            '',
1597            '',
1598        ],
1599        'Ỹ'       => [
1600            '1',
1601            '1',
1602            '',
1603            '',
1604        ],
1605        'Z'       => [
1606            '0',
1607            '4',
1608            '4',
1609            '4',
1610        ],
1611        'Ź'       => [
1612            '0',
1613            '4',
1614            '4',
1615            '4',
1616        ],
1617        'Ż'       => [
1618            '0',
1619            '4',
1620            '4',
1621            '4',
1622        ],
1623        'Ž'       => [
1624            '0',
1625            '4',
1626            '4',
1627            '4',
1628        ],
1629        'ZD'      => [
1630            '0',
1631            '2',
1632            '43',
1633            '43',
1634        ],
1635        'ZDZ'     => [
1636            '0',
1637            '2',
1638            '4',
1639            '4',
1640        ],
1641        'ZDZH'    => [
1642            '0',
1643            '2',
1644            '4',
1645            '4',
1646        ],
1647        'ZH'      => [
1648            '0',
1649            '4',
1650            '4',
1651            '4',
1652        ],
1653        'ZHD'     => [
1654            '0',
1655            '2',
1656            '43',
1657            '43',
1658        ],
1659        'ZHDZH'   => [
1660            '0',
1661            '2',
1662            '4',
1663            '4',
1664        ],
1665        'ZS'      => [
1666            '0',
1667            '4',
1668            '4',
1669            '4',
1670        ],
1671        'ZSCH'    => [
1672            '0',
1673            '4',
1674            '4',
1675            '4',
1676        ],
1677        'ZSH'     => [
1678            '0',
1679            '4',
1680            '4',
1681            '4',
1682        ],
1683        'ZZS'     => [
1684            '0',
1685            '4',
1686            '4',
1687            '4',
1688        ],
1689        // Cyrillic alphabet
1690        'А'       => [
1691            '1',
1692            '0',
1693            '',
1694            '',
1695        ],
1696        'Б'       => [
1697            '0',
1698            '7',
1699            '7',
1700            '7',
1701        ],
1702        'В'       => [
1703            '0',
1704            '7',
1705            '7',
1706            '7',
1707        ],
1708        'Г'       => [
1709            '0',
1710            '5',
1711            '5',
1712            '5',
1713        ],
1714        'Д'       => [
1715            '0',
1716            '3',
1717            '3',
1718            '3',
1719        ],
1720        'ДЗ'      => [
1721            '0',
1722            '4',
1723            '4',
1724            '4',
1725        ],
1726        'Е'       => [
1727            '1',
1728            '0',
1729            '',
1730            '',
1731        ],
1732        'Ё'       => [
1733            '1',
1734            '0',
1735            '',
1736            '',
1737        ],
1738        'Ж'       => [
1739            '0',
1740            '4',
1741            '4',
1742            '4',
1743        ],
1744        'З'       => [
1745            '0',
1746            '4',
1747            '4',
1748            '4',
1749        ],
1750        'И'       => [
1751            '1',
1752            '0',
1753            '',
1754            '',
1755        ],
1756        'Й'       => [
1757            '1',
1758            '1',
1759            '',
1760            '',
1761            '4',
1762            '4',
1763            '4',
1764        ],
1765        'К'       => [
1766            '0',
1767            '5',
1768            '5',
1769            '5',
1770        ],
1771        'Л'       => [
1772            '0',
1773            '8',
1774            '8',
1775            '8',
1776        ],
1777        'М'       => [
1778            '0',
1779            '6',
1780            '6',
1781            '6',
1782        ],
1783        'Н'       => [
1784            '0',
1785            '6',
1786            '6',
1787            '6',
1788        ],
1789        'О'       => [
1790            '1',
1791            '0',
1792            '',
1793            '',
1794        ],
1795        'П'       => [
1796            '0',
1797            '7',
1798            '7',
1799            '7',
1800        ],
1801        'Р'       => [
1802            '0',
1803            '9',
1804            '9',
1805            '9',
1806        ],
1807        'РЖ'      => [
1808            '0',
1809            '4',
1810            '4',
1811            '4',
1812        ],
1813        'С'       => [
1814            '0',
1815            '4',
1816            '4',
1817            '4',
1818        ],
1819        'Т'       => [
1820            '0',
1821            '3',
1822            '3',
1823            '3',
1824        ],
1825        'У'       => [
1826            '1',
1827            '0',
1828            '',
1829            '',
1830        ],
1831        'Ф'       => [
1832            '0',
1833            '7',
1834            '7',
1835            '7',
1836        ],
1837        'Х'       => [
1838            '0',
1839            '5',
1840            '5',
1841            '5',
1842        ],
1843        'Ц'       => [
1844            '0',
1845            '4',
1846            '4',
1847            '4',
1848        ],
1849        'Ч'       => [
1850            '0',
1851            '4',
1852            '4',
1853            '4',
1854        ],
1855        'Ш'       => [
1856            '0',
1857            '4',
1858            '4',
1859            '4',
1860        ],
1861        'Щ'       => [
1862            '0',
1863            '2',
1864            '4',
1865            '4',
1866        ],
1867        'Ъ'       => [
1868            '0',
1869            '',
1870            '',
1871            '',
1872        ],
1873        'Ы'       => [
1874            '0',
1875            '1',
1876            '',
1877            '',
1878        ],
1879        'Ь'       => [
1880            '0',
1881            '',
1882            '',
1883            '',
1884        ],
1885        'Э'       => [
1886            '1',
1887            '0',
1888            '',
1889            '',
1890        ],
1891        'Ю'       => [
1892            '0',
1893            '1',
1894            '',
1895            '',
1896        ],
1897        'Я'       => [
1898            '0',
1899            '1',
1900            '',
1901            '',
1902        ],
1903        // Greek alphabet
1904        'Α'       => [
1905            '1',
1906            '0',
1907            '',
1908            '',
1909        ],
1910        'Ά'       => [
1911            '1',
1912            '0',
1913            '',
1914            '',
1915        ],
1916        'ΑΙ'      => [
1917            '1',
1918            '0',
1919            '1',
1920            '',
1921        ],
1922        'ΑΥ'      => [
1923            '1',
1924            '0',
1925            '1',
1926            '',
1927        ],
1928        'Β'       => [
1929            '0',
1930            '7',
1931            '7',
1932            '7',
1933        ],
1934        'Γ'       => [
1935            '0',
1936            '5',
1937            '5',
1938            '5',
1939        ],
1940        'Δ'       => [
1941            '0',
1942            '3',
1943            '3',
1944            '3',
1945        ],
1946        'Ε'       => [
1947            '1',
1948            '0',
1949            '',
1950            '',
1951        ],
1952        'Έ'       => [
1953            '1',
1954            '0',
1955            '',
1956            '',
1957        ],
1958        'ΕΙ'      => [
1959            '1',
1960            '0',
1961            '1',
1962            '',
1963        ],
1964        'ΕΥ'      => [
1965            '1',
1966            '1',
1967            '1',
1968            '',
1969        ],
1970        'Ζ'       => [
1971            '0',
1972            '4',
1973            '4',
1974            '4',
1975        ],
1976        'Η'       => [
1977            '1',
1978            '0',
1979            '',
1980            '',
1981        ],
1982        'Ή'       => [
1983            '1',
1984            '0',
1985            '',
1986            '',
1987        ],
1988        'Θ'       => [
1989            '0',
1990            '3',
1991            '3',
1992            '3',
1993        ],
1994        'Ι'       => [
1995            '1',
1996            '0',
1997            '',
1998            '',
1999        ],
2000        'Ί'       => [
2001            '1',
2002            '0',
2003            '',
2004            '',
2005        ],
2006        'Ϊ'       => [
2007            '1',
2008            '0',
2009            '',
2010            '',
2011        ],
2012        'ΐ'       => [
2013            '1',
2014            '0',
2015            '',
2016            '',
2017        ],
2018        'Κ'       => [
2019            '0',
2020            '5',
2021            '5',
2022            '5',
2023        ],
2024        'Λ'       => [
2025            '0',
2026            '8',
2027            '8',
2028            '8',
2029        ],
2030        'Μ'       => [
2031            '0',
2032            '6',
2033            '6',
2034            '6',
2035        ],
2036        'ΜΠ'      => [
2037            '0',
2038            '7',
2039            '7',
2040            '7',
2041        ],
2042        'Ν'       => [
2043            '0',
2044            '6',
2045            '6',
2046            '6',
2047        ],
2048        'ΝΤ'      => [
2049            '0',
2050            '3',
2051            '3',
2052            '3',
2053        ],
2054        'Ξ'       => [
2055            '0',
2056            '5',
2057            '54',
2058            '54',
2059        ],
2060        'Ο'       => [
2061            '1',
2062            '0',
2063            '',
2064            '',
2065        ],
2066        'Ό'       => [
2067            '1',
2068            '0',
2069            '',
2070            '',
2071        ],
2072        'ΟΙ'      => [
2073            '1',
2074            '0',
2075            '1',
2076            '',
2077        ],
2078        'ΟΥ'      => [
2079            '1',
2080            '0',
2081            '1',
2082            '',
2083        ],
2084        'Π'       => [
2085            '0',
2086            '7',
2087            '7',
2088            '7',
2089        ],
2090        'Ρ'       => [
2091            '0',
2092            '9',
2093            '9',
2094            '9',
2095        ],
2096        'Σ'       => [
2097            '0',
2098            '4',
2099            '4',
2100            '4',
2101        ],
2102        'ς'       => [
2103            '0',
2104            '',
2105            '',
2106            '4',
2107        ],
2108        'Τ'       => [
2109            '0',
2110            '3',
2111            '3',
2112            '3',
2113        ],
2114        'ΤΖ'      => [
2115            '0',
2116            '4',
2117            '4',
2118            '4',
2119        ],
2120        'ΤΣ'      => [
2121            '0',
2122            '4',
2123            '4',
2124            '4',
2125        ],
2126        'Υ'       => [
2127            '1',
2128            '1',
2129            '',
2130            '',
2131        ],
2132        'Ύ'       => [
2133            '1',
2134            '1',
2135            '',
2136            '',
2137        ],
2138        'Ϋ'       => [
2139            '1',
2140            '1',
2141            '',
2142            '',
2143        ],
2144        'ΰ'       => [
2145            '1',
2146            '1',
2147            '',
2148            '',
2149        ],
2150        'ΥΚ'      => [
2151            '1',
2152            '5',
2153            '5',
2154            '5',
2155        ],
2156        'ΥΥ'      => [
2157            '1',
2158            '65',
2159            '65',
2160            '65',
2161        ],
2162        'Φ'       => [
2163            '0',
2164            '7',
2165            '7',
2166            '7',
2167        ],
2168        'Χ'       => [
2169            '0',
2170            '5',
2171            '5',
2172            '5',
2173        ],
2174        'Ψ'       => [
2175            '0',
2176            '7',
2177            '7',
2178            '7',
2179        ],
2180        'Ω'       => [
2181            '1',
2182            '0',
2183            '',
2184            '',
2185        ],
2186        'Ώ'       => [
2187            '1',
2188            '0',
2189            '',
2190            '',
2191        ],
2192        // Hebrew alphabet
2193        'א'       => [
2194            '1',
2195            '0',
2196            '',
2197            '',
2198        ],
2199        'או'      => [
2200            '1',
2201            '0',
2202            '7',
2203            '',
2204        ],
2205        'אג'      => [
2206            '1',
2207            '4',
2208            '4',
2209            '4',
2210            '5',
2211            '5',
2212            '5',
2213            '34',
2214            '34',
2215            '34',
2216        ],
2217        'בב'      => [
2218            '0',
2219            '7',
2220            '7',
2221            '7',
2222            '77',
2223            '77',
2224            '77',
2225        ],
2226        'ב'       => [
2227            '0',
2228            '7',
2229            '7',
2230            '7',
2231        ],
2232        'גג'      => [
2233            '0',
2234            '4',
2235            '4',
2236            '4',
2237            '5',
2238            '5',
2239            '5',
2240            '45',
2241            '45',
2242            '45',
2243            '55',
2244            '55',
2245            '55',
2246            '54',
2247            '54',
2248            '54',
2249        ],
2250        'גד'      => [
2251            '0',
2252            '43',
2253            '43',
2254            '43',
2255            '53',
2256            '53',
2257            '53',
2258        ],
2259        'גה'      => [
2260            '0',
2261            '45',
2262            '45',
2263            '45',
2264            '55',
2265            '55',
2266            '55',
2267        ],
2268        'גז'      => [
2269            '0',
2270            '44',
2271            '44',
2272            '44',
2273            '45',
2274            '45',
2275            '45',
2276        ],
2277        'גח'      => [
2278            '0',
2279            '45',
2280            '45',
2281            '45',
2282            '55',
2283            '55',
2284            '55',
2285        ],
2286        'גכ'      => [
2287            '0',
2288            '45',
2289            '45',
2290            '45',
2291            '55',
2292            '55',
2293            '55',
2294        ],
2295        'גך'      => [
2296            '0',
2297            '45',
2298            '45',
2299            '45',
2300            '55',
2301            '55',
2302            '55',
2303        ],
2304        'גצ'      => [
2305            '0',
2306            '44',
2307            '44',
2308            '44',
2309            '45',
2310            '45',
2311            '45',
2312        ],
2313        'גץ'      => [
2314            '0',
2315            '44',
2316            '44',
2317            '44',
2318            '45',
2319            '45',
2320            '45',
2321        ],
2322        'גק'      => [
2323            '0',
2324            '45',
2325            '45',
2326            '45',
2327            '54',
2328            '54',
2329            '54',
2330        ],
2331        'גש'      => [
2332            '0',
2333            '44',
2334            '44',
2335            '44',
2336            '54',
2337            '54',
2338            '54',
2339        ],
2340        'גת'      => [
2341            '0',
2342            '43',
2343            '43',
2344            '43',
2345            '53',
2346            '53',
2347            '53',
2348        ],
2349        'ג'       => [
2350            '0',
2351            '4',
2352            '4',
2353            '4',
2354            '5',
2355            '5',
2356            '5',
2357        ],
2358        'דז'      => [
2359            '0',
2360            '4',
2361            '4',
2362            '4',
2363        ],
2364        'דד'      => [
2365            '0',
2366            '3',
2367            '3',
2368            '3',
2369            '33',
2370            '33',
2371            '33',
2372        ],
2373        'דט'      => [
2374            '0',
2375            '33',
2376            '33',
2377            '33',
2378        ],
2379        'דש'      => [
2380            '0',
2381            '4',
2382            '4',
2383            '4',
2384        ],
2385        'דצ'      => [
2386            '0',
2387            '4',
2388            '4',
2389            '4',
2390        ],
2391        'דץ'      => [
2392            '0',
2393            '4',
2394            '4',
2395            '4',
2396        ],
2397        'ד'       => [
2398            '0',
2399            '3',
2400            '3',
2401            '3',
2402        ],
2403        'הג'      => [
2404            '0',
2405            '54',
2406            '54',
2407            '54',
2408            '55',
2409            '55',
2410            '55',
2411        ],
2412        'הכ'      => [
2413            '0',
2414            '55',
2415            '55',
2416            '55',
2417        ],
2418        'הח'      => [
2419            '0',
2420            '55',
2421            '55',
2422            '55',
2423        ],
2424        'הק'      => [
2425            '0',
2426            '55',
2427            '55',
2428            '55',
2429            '5',
2430            '5',
2431            '5',
2432        ],
2433        'הה'      => [
2434            '0',
2435            '5',
2436            '5',
2437            '',
2438            '55',
2439            '55',
2440            '',
2441        ],
2442        'ה'       => [
2443            '0',
2444            '5',
2445            '5',
2446            '',
2447        ],
2448        'וי'      => [
2449            '1',
2450            '',
2451            '',
2452            '',
2453            '7',
2454            '7',
2455            '7',
2456        ],
2457        'ו'       => [
2458            '1',
2459            '7',
2460            '7',
2461            '7',
2462            '7',
2463            '',
2464            '',
2465        ],
2466        'וו'      => [
2467            '1',
2468            '7',
2469            '7',
2470            '7',
2471            '7',
2472            '',
2473            '',
2474        ],
2475        'וופ'     => [
2476            '1',
2477            '7',
2478            '7',
2479            '7',
2480            '77',
2481            '77',
2482            '77',
2483        ],
2484        'זש'      => [
2485            '0',
2486            '4',
2487            '4',
2488            '4',
2489            '44',
2490            '44',
2491            '44',
2492        ],
2493        'זדז'     => [
2494            '0',
2495            '2',
2496            '4',
2497            '4',
2498        ],
2499        'ז'       => [
2500            '0',
2501            '4',
2502            '4',
2503            '4',
2504        ],
2505        'זג'      => [
2506            '0',
2507            '44',
2508            '44',
2509            '44',
2510            '45',
2511            '45',
2512            '45',
2513        ],
2514        'זז'      => [
2515            '0',
2516            '4',
2517            '4',
2518            '4',
2519            '44',
2520            '44',
2521            '44',
2522        ],
2523        'זס'      => [
2524            '0',
2525            '44',
2526            '44',
2527            '44',
2528        ],
2529        'זצ'      => [
2530            '0',
2531            '44',
2532            '44',
2533            '44',
2534        ],
2535        'זץ'      => [
2536            '0',
2537            '44',
2538            '44',
2539            '44',
2540        ],
2541        'חג'      => [
2542            '0',
2543            '54',
2544            '54',
2545            '54',
2546            '53',
2547            '53',
2548            '53',
2549        ],
2550        'חח'      => [
2551            '0',
2552            '5',
2553            '5',
2554            '5',
2555            '55',
2556            '55',
2557            '55',
2558        ],
2559        'חק'      => [
2560            '0',
2561            '55',
2562            '55',
2563            '55',
2564            '5',
2565            '5',
2566            '5',
2567        ],
2568        'חכ'      => [
2569            '0',
2570            '45',
2571            '45',
2572            '45',
2573            '55',
2574            '55',
2575            '55',
2576        ],
2577        'חס'      => [
2578            '0',
2579            '5',
2580            '54',
2581            '54',
2582        ],
2583        'חש'      => [
2584            '0',
2585            '5',
2586            '54',
2587            '54',
2588        ],
2589        'ח'       => [
2590            '0',
2591            '5',
2592            '5',
2593            '5',
2594        ],
2595        'טש'      => [
2596            '0',
2597            '4',
2598            '4',
2599            '4',
2600        ],
2601        'טד'      => [
2602            '0',
2603            '33',
2604            '33',
2605            '33',
2606        ],
2607        'טי'      => [
2608            '0',
2609            '3',
2610            '3',
2611            '3',
2612            '4',
2613            '4',
2614            '4',
2615            '3',
2616            '3',
2617            '34',
2618        ],
2619        'טת'      => [
2620            '0',
2621            '33',
2622            '33',
2623            '33',
2624        ],
2625        'טט'      => [
2626            '0',
2627            '3',
2628            '3',
2629            '3',
2630            '33',
2631            '33',
2632            '33',
2633        ],
2634        'ט'       => [
2635            '0',
2636            '3',
2637            '3',
2638            '3',
2639        ],
2640        'י'       => [
2641            '1',
2642            '1',
2643            '',
2644            '',
2645        ],
2646        'יא'      => [
2647            '1',
2648            '1',
2649            '',
2650            '',
2651            '1',
2652            '1',
2653            '1',
2654        ],
2655        'כג'      => [
2656            '0',
2657            '55',
2658            '55',
2659            '55',
2660            '54',
2661            '54',
2662            '54',
2663        ],
2664        'כש'      => [
2665            '0',
2666            '5',
2667            '54',
2668            '54',
2669        ],
2670        'כס'      => [
2671            '0',
2672            '5',
2673            '54',
2674            '54',
2675        ],
2676        'ככ'      => [
2677            '0',
2678            '5',
2679            '5',
2680            '5',
2681            '55',
2682            '55',
2683            '55',
2684        ],
2685        'כך'      => [
2686            '0',
2687            '5',
2688            '5',
2689            '5',
2690            '55',
2691            '55',
2692            '55',
2693        ],
2694        'כ'       => [
2695            '0',
2696            '5',
2697            '5',
2698            '5',
2699        ],
2700        'כח'      => [
2701            '0',
2702            '55',
2703            '55',
2704            '55',
2705            '5',
2706            '5',
2707            '5',
2708        ],
2709        'ך'       => [
2710            '0',
2711            '',
2712            '5',
2713            '5',
2714        ],
2715        'ל'       => [
2716            '0',
2717            '8',
2718            '8',
2719            '8',
2720        ],
2721        'לל'      => [
2722            '0',
2723            '88',
2724            '88',
2725            '88',
2726            '8',
2727            '8',
2728            '8',
2729        ],
2730        'מנ'      => [
2731            '0',
2732            '66',
2733            '66',
2734            '66',
2735        ],
2736        'מן'      => [
2737            '0',
2738            '66',
2739            '66',
2740            '66',
2741        ],
2742        'ממ'      => [
2743            '0',
2744            '6',
2745            '6',
2746            '6',
2747            '66',
2748            '66',
2749            '66',
2750        ],
2751        'מם'      => [
2752            '0',
2753            '6',
2754            '6',
2755            '6',
2756            '66',
2757            '66',
2758            '66',
2759        ],
2760        'מ'       => [
2761            '0',
2762            '6',
2763            '6',
2764            '6',
2765        ],
2766        'ם'       => [
2767            '0',
2768            '',
2769            '6',
2770            '6',
2771        ],
2772        'נמ'      => [
2773            '0',
2774            '66',
2775            '66',
2776            '66',
2777        ],
2778        'נם'      => [
2779            '0',
2780            '66',
2781            '66',
2782            '66',
2783        ],
2784        'ננ'      => [
2785            '0',
2786            '6',
2787            '6',
2788            '6',
2789            '66',
2790            '66',
2791            '66',
2792        ],
2793        'נן'      => [
2794            '0',
2795            '6',
2796            '6',
2797            '6',
2798            '66',
2799            '66',
2800            '66',
2801        ],
2802        'נ'       => [
2803            '0',
2804            '6',
2805            '6',
2806            '6',
2807        ],
2808        'ן'       => [
2809            '0',
2810            '',
2811            '6',
2812            '6',
2813        ],
2814        'סתש'     => [
2815            '0',
2816            '2',
2817            '4',
2818            '4',
2819        ],
2820        'סתז'     => [
2821            '0',
2822            '2',
2823            '4',
2824            '4',
2825        ],
2826        'סטז'     => [
2827            '0',
2828            '2',
2829            '4',
2830            '4',
2831        ],
2832        'סטש'     => [
2833            '0',
2834            '2',
2835            '4',
2836            '4',
2837        ],
2838        'סצד'     => [
2839            '0',
2840            '2',
2841            '4',
2842            '4',
2843        ],
2844        'סט'      => [
2845            '0',
2846            '2',
2847            '4',
2848            '4',
2849            '43',
2850            '43',
2851            '43',
2852        ],
2853        'סת'      => [
2854            '0',
2855            '2',
2856            '4',
2857            '4',
2858            '43',
2859            '43',
2860            '43',
2861        ],
2862        'סג'      => [
2863            '0',
2864            '44',
2865            '44',
2866            '44',
2867            '4',
2868            '4',
2869            '4',
2870        ],
2871        'סס'      => [
2872            '0',
2873            '4',
2874            '4',
2875            '4',
2876            '44',
2877            '44',
2878            '44',
2879        ],
2880        'סצ'      => [
2881            '0',
2882            '44',
2883            '44',
2884            '44',
2885        ],
2886        'סץ'      => [
2887            '0',
2888            '44',
2889            '44',
2890            '44',
2891        ],
2892        'סז'      => [
2893            '0',
2894            '44',
2895            '44',
2896            '44',
2897        ],
2898        'סש'      => [
2899            '0',
2900            '44',
2901            '44',
2902            '44',
2903        ],
2904        'ס'       => [
2905            '0',
2906            '4',
2907            '4',
2908            '4',
2909        ],
2910        'ע'       => [
2911            '1',
2912            '0',
2913            '',
2914            '',
2915        ],
2916        'פב'      => [
2917            '0',
2918            '7',
2919            '7',
2920            '7',
2921            '77',
2922            '77',
2923            '77',
2924        ],
2925        'פוו'     => [
2926            '0',
2927            '7',
2928            '7',
2929            '7',
2930            '77',
2931            '77',
2932            '77',
2933        ],
2934        'פפ'      => [
2935            '0',
2936            '7',
2937            '7',
2938            '7',
2939            '77',
2940            '77',
2941            '77',
2942        ],
2943        'פף'      => [
2944            '0',
2945            '7',
2946            '7',
2947            '7',
2948            '77',
2949            '77',
2950            '77',
2951        ],
2952        'פ'       => [
2953            '0',
2954            '7',
2955            '7',
2956            '7',
2957        ],
2958        'ף'       => [
2959            '0',
2960            '',
2961            '7',
2962            '7',
2963        ],
2964        'צג'      => [
2965            '0',
2966            '44',
2967            '44',
2968            '44',
2969            '45',
2970            '45',
2971            '45',
2972        ],
2973        'צז'      => [
2974            '0',
2975            '44',
2976            '44',
2977            '44',
2978        ],
2979        'צס'      => [
2980            '0',
2981            '44',
2982            '44',
2983            '44',
2984        ],
2985        'צצ'      => [
2986            '0',
2987            '4',
2988            '4',
2989            '4',
2990            '5',
2991            '5',
2992            '5',
2993            '44',
2994            '44',
2995            '44',
2996            '54',
2997            '54',
2998            '54',
2999            '45',
3000            '45',
3001            '45',
3002        ],
3003        'צץ'      => [
3004            '0',
3005            '4',
3006            '4',
3007            '4',
3008            '5',
3009            '5',
3010            '5',
3011            '44',
3012            '44',
3013            '44',
3014            '54',
3015            '54',
3016            '54',
3017        ],
3018        'צש'      => [
3019            '0',
3020            '44',
3021            '44',
3022            '44',
3023            '4',
3024            '4',
3025            '4',
3026            '5',
3027            '5',
3028            '5',
3029        ],
3030        'צ'       => [
3031            '0',
3032            '4',
3033            '4',
3034            '4',
3035            '5',
3036            '5',
3037            '5',
3038        ],
3039        'ץ'       => [
3040            '0',
3041            '',
3042            '4',
3043            '4',
3044        ],
3045        'קה'      => [
3046            '0',
3047            '55',
3048            '55',
3049            '5',
3050        ],
3051        'קס'      => [
3052            '0',
3053            '5',
3054            '54',
3055            '54',
3056        ],
3057        'קש'      => [
3058            '0',
3059            '5',
3060            '54',
3061            '54',
3062        ],
3063        'קק'      => [
3064            '0',
3065            '5',
3066            '5',
3067            '5',
3068            '55',
3069            '55',
3070            '55',
3071        ],
3072        'קח'      => [
3073            '0',
3074            '55',
3075            '55',
3076            '55',
3077        ],
3078        'קכ'      => [
3079            '0',
3080            '55',
3081            '55',
3082            '55',
3083        ],
3084        'קך'      => [
3085            '0',
3086            '55',
3087            '55',
3088            '55',
3089        ],
3090        'קג'      => [
3091            '0',
3092            '55',
3093            '55',
3094            '55',
3095            '54',
3096            '54',
3097            '54',
3098        ],
3099        'ק'       => [
3100            '0',
3101            '5',
3102            '5',
3103            '5',
3104        ],
3105        'רר'      => [
3106            '0',
3107            '99',
3108            '99',
3109            '99',
3110            '9',
3111            '9',
3112            '9',
3113        ],
3114        'ר'       => [
3115            '0',
3116            '9',
3117            '9',
3118            '9',
3119        ],
3120        'שטז'     => [
3121            '0',
3122            '2',
3123            '4',
3124            '4',
3125        ],
3126        'שתש'     => [
3127            '0',
3128            '2',
3129            '4',
3130            '4',
3131        ],
3132        'שתז'     => [
3133            '0',
3134            '2',
3135            '4',
3136            '4',
3137        ],
3138        'שטש'     => [
3139            '0',
3140            '2',
3141            '4',
3142            '4',
3143        ],
3144        'שד'      => [
3145            '0',
3146            '2',
3147            '43',
3148            '43',
3149        ],
3150        'שז'      => [
3151            '0',
3152            '44',
3153            '44',
3154            '44',
3155        ],
3156        'שס'      => [
3157            '0',
3158            '44',
3159            '44',
3160            '44',
3161        ],
3162        'שת'      => [
3163            '0',
3164            '2',
3165            '43',
3166            '43',
3167        ],
3168        'שג'      => [
3169            '0',
3170            '4',
3171            '4',
3172            '4',
3173            '44',
3174            '44',
3175            '44',
3176            '4',
3177            '43',
3178            '43',
3179        ],
3180        'שט'      => [
3181            '0',
3182            '2',
3183            '43',
3184            '43',
3185            '44',
3186            '44',
3187            '44',
3188        ],
3189        'שצ'      => [
3190            '0',
3191            '44',
3192            '44',
3193            '44',
3194            '45',
3195            '45',
3196            '45',
3197        ],
3198        'שץ'      => [
3199            '0',
3200            '44',
3201            '',
3202            '44',
3203            '45',
3204            '',
3205            '45',
3206        ],
3207        'שש'      => [
3208            '0',
3209            '4',
3210            '4',
3211            '4',
3212            '44',
3213            '44',
3214            '44',
3215        ],
3216        'ש'       => [
3217            '0',
3218            '4',
3219            '4',
3220            '4',
3221        ],
3222        'תג'      => [
3223            '0',
3224            '34',
3225            '34',
3226            '34',
3227        ],
3228        'תז'      => [
3229            '0',
3230            '34',
3231            '34',
3232            '34',
3233        ],
3234        'תש'      => [
3235            '0',
3236            '4',
3237            '4',
3238            '4',
3239        ],
3240        'תת'      => [
3241            '0',
3242            '3',
3243            '3',
3244            '3',
3245            '4',
3246            '4',
3247            '4',
3248            '33',
3249            '33',
3250            '33',
3251            '44',
3252            '44',
3253            '44',
3254            '34',
3255            '34',
3256            '34',
3257            '43',
3258            '43',
3259            '43',
3260        ],
3261        'ת'       => [
3262            '0',
3263            '3',
3264            '3',
3265            '3',
3266            '4',
3267            '4',
3268            '4',
3269        ],
3270        // Arabic alphabet
3271        'ا'       => [
3272            '1',
3273            '0',
3274            '',
3275            '',
3276        ],
3277        'ب'       => [
3278            '0',
3279            '7',
3280            '7',
3281            '7',
3282        ],
3283        'ت'       => [
3284            '0',
3285            '3',
3286            '3',
3287            '3',
3288        ],
3289        'ث'       => [
3290            '0',
3291            '3',
3292            '3',
3293            '3',
3294        ],
3295        'ج'       => [
3296            '0',
3297            '4',
3298            '4',
3299            '4',
3300        ],
3301        'ح'       => [
3302            '0',
3303            '5',
3304            '5',
3305            '5',
3306        ],
3307        'خ'       => [
3308            '0',
3309            '5',
3310            '5',
3311            '5',
3312        ],
3313        'د'       => [
3314            '0',
3315            '3',
3316            '3',
3317            '3',
3318        ],
3319        'ذ'       => [
3320            '0',
3321            '3',
3322            '3',
3323            '3',
3324        ],
3325        'ر'       => [
3326            '0',
3327            '9',
3328            '9',
3329            '9',
3330        ],
3331        'ز'       => [
3332            '0',
3333            '4',
3334            '4',
3335            '4',
3336        ],
3337        'س'       => [
3338            '0',
3339            '4',
3340            '4',
3341            '4',
3342        ],
3343        'ش'       => [
3344            '0',
3345            '4',
3346            '4',
3347            '4',
3348        ],
3349        'ص'       => [
3350            '0',
3351            '4',
3352            '4',
3353            '4',
3354        ],
3355        'ض'       => [
3356            '0',
3357            '3',
3358            '3',
3359            '3',
3360        ],
3361        'ط'       => [
3362            '0',
3363            '3',
3364            '3',
3365            '3',
3366        ],
3367        'ظ'       => [
3368            '0',
3369            '4',
3370            '4',
3371            '4',
3372        ],
3373        'ع'       => [
3374            '1',
3375            '0',
3376            '',
3377            '',
3378        ],
3379        'غ'       => [
3380            '0',
3381            '0',
3382            '',
3383            '',
3384        ],
3385        'ف'       => [
3386            '0',
3387            '7',
3388            '7',
3389            '7',
3390        ],
3391        'ق'       => [
3392            '0',
3393            '5',
3394            '5',
3395            '5',
3396        ],
3397        'ك'       => [
3398            '0',
3399            '5',
3400            '5',
3401            '5',
3402        ],
3403        'ل'       => [
3404            '0',
3405            '8',
3406            '8',
3407            '8',
3408        ],
3409        'لا'      => [
3410            '0',
3411            '8',
3412            '8',
3413            '8',
3414        ],
3415        'م'       => [
3416            '0',
3417            '6',
3418            '6',
3419            '6',
3420        ],
3421        'ن'       => [
3422            '0',
3423            '6',
3424            '6',
3425            '6',
3426        ],
3427        'هن'      => [
3428            '0',
3429            '66',
3430            '66',
3431            '66',
3432        ],
3433        'ه'       => [
3434            '0',
3435            '5',
3436            '5',
3437            '',
3438        ],
3439        'و'       => [
3440            '1',
3441            '',
3442            '',
3443            '',
3444            '7',
3445            '',
3446            '',
3447        ],
3448        'ي'       => [
3449            '0',
3450            '1',
3451            '',
3452            '',
3453        ],
3454        'آ'       => [
3455            '0',
3456            '1',
3457            '',
3458            '',
3459        ],
3460        'ة'       => [
3461            '0',
3462            '',
3463            '',
3464            '3',
3465        ],
3466        'ی'       => [
3467            '0',
3468            '1',
3469            '',
3470            '',
3471        ],
3472        'ى'       => [
3473            '1',
3474            '1',
3475            '',
3476            '',
3477        ],
3478    ];
3479
3480    /**
3481     * Which algorithms are supported.
3482     *
3483     * @return string[]
3484     */
3485    public static function getAlgorithms(): array
3486    {
3487        return [
3488            /* I18N: http://en.wikipedia.org/wiki/Soundex */
3489            'std' => I18N::translate('Russell'),
3490            /* I18N: http://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
3491            'dm'  => I18N::translate('Daitch-Mokotoff'),
3492        ];
3493    }
3494
3495    /**
3496     * Is there a match between two soundex codes?
3497     *
3498     * @param string $soundex1
3499     * @param string $soundex2
3500     *
3501     * @return bool
3502     */
3503    public static function compare($soundex1, $soundex2): bool
3504    {
3505        if ($soundex1 !== '' && $soundex2 !== '') {
3506            return !empty(array_intersect(explode(':', $soundex1), explode(':', $soundex2)));
3507        }
3508
3509        return false;
3510    }
3511
3512    /**
3513     * Generate Russell soundex codes for a given text.
3514     *
3515     * @param string $text
3516     *
3517     * @return string
3518     */
3519    public static function russell(string $text): string
3520    {
3521        $words         = explode(' ', $text);
3522        $soundex_array = [];
3523
3524        foreach ($words as $word) {
3525            $soundex = soundex($word);
3526
3527            // Only return codes from recognisable sounds
3528            if ($soundex !== '0000') {
3529                $soundex_array[] = $soundex;
3530            }
3531        }
3532
3533        // Combine words, e.g. “New York” as “Newyork”
3534        if (count($words) > 1) {
3535            $soundex_array[] = soundex(strtr($text, ' ', ''));
3536        }
3537
3538        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
3539        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);
3540
3541        return implode(':', $soundex_array);
3542    }
3543
3544    /**
3545     * Generate Daitch–Mokotoff soundex codes for a given text.
3546     *
3547     * @param string $text
3548     *
3549     * @return string
3550     */
3551    public static function daitchMokotoff(string $text): string
3552    {
3553        $words         = explode(' ', $text);
3554        $soundex_array = [];
3555
3556        foreach ($words as $word) {
3557            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
3558        }
3559        // Combine words, e.g. “New York” as “Newyork”
3560        if (count($words) > 1) {
3561            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(strtr($text, ' ', '')));
3562        }
3563
3564        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
3565        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);
3566
3567        return implode(':', $soundex_array);
3568    }
3569
3570    /**
3571     * Calculate the Daitch-Mokotoff soundex for a word.
3572     *
3573     * @param string $name
3574     *
3575     * @return string[] List of possible DM codes for the word.
3576     */
3577    private static function daitchMokotoffWord($name): array
3578    {
3579        // Apply special transformation rules to the input string
3580        $name = I18N::strtoupper($name);
3581        foreach (self::TRANSFORM_NAMES as $transformRule) {
3582            $name = str_replace($transformRule[0], $transformRule[1], $name);
3583        }
3584
3585        // Initialize
3586        $name_script = I18N::textScript($name);
3587        $noVowels    = ($name_script === 'Hebr' || $name_script === 'Arab');
3588
3589        $lastPos         = strlen($name) - 1;
3590        $currPos         = 0;
3591        $state           = 1; // 1: start of input string, 2: before vowel, 3: other
3592        $result          = []; // accumulate complete 6-digit D-M codes here
3593        $partialResult   = []; // accumulate incomplete D-M codes here
3594        $partialResult[] = ['!']; // initialize 1st partial result  ('!' stops "duplicate sound" check)
3595
3596        // Loop through the input string.
3597        // Stop when the string is exhausted or when no more partial results remain
3598        while (count($partialResult) !== 0 && $currPos <= $lastPos) {
3599            // Find the DM coding table entry for the chunk at the current position
3600            $thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
3601            while ($thisEntry != '') {
3602                if (isset(self::DM_SOUNDS[$thisEntry])) {
3603                    break;
3604                }
3605                $thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
3606            }
3607            if ($thisEntry === '') {
3608                $currPos++; // Not in table: advance pointer to next byte
3609                continue; // and try again
3610            }
3611
3612            $soundTableEntry = self::DM_SOUNDS[$thisEntry];
3613            $workingResult   = $partialResult;
3614            $partialResult   = [];
3615            $currPos += strlen($thisEntry);
3616
3617            // Not at beginning of input string
3618            if ($state != 1) {
3619                if ($currPos <= $lastPos) {
3620                    // Determine whether the next chunk is a vowel
3621                    $nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
3622                    while ($nextEntry != '') {
3623                        if (isset(self::DM_SOUNDS[$nextEntry])) {
3624                            break;
3625                        }
3626                        $nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
3627                    }
3628                } else {
3629                    $nextEntry = '';
3630                }
3631                if ($nextEntry != '' && self::DM_SOUNDS[$nextEntry][0] != '0') {
3632                    $state = 2;
3633                } else {
3634                    // Next chunk is a vowel
3635                    $state = 3;
3636                }
3637            }
3638
3639            while ($state < count($soundTableEntry)) {
3640                // empty means 'ignore this sound in this state'
3641                if ($soundTableEntry[$state] == '') {
3642                    foreach ($workingResult as $workingEntry) {
3643                        $tempEntry                        = $workingEntry;
3644                        $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
3645                        $partialResult[]                  = $tempEntry;
3646                    }
3647                } else {
3648                    foreach ($workingResult as $workingEntry) {
3649                        if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
3650                            // Incoming sound isn't a duplicate of the previous sound
3651                            $workingEntry[] = $soundTableEntry[$state];
3652                        } else {
3653                            // Incoming sound is a duplicate of the previous sound
3654                            // For Hebrew and Arabic, we need to create a pair of D-M sound codes,
3655                            // one of the pair with only a single occurrence of the duplicate sound,
3656                            // the other with both occurrences
3657                            if ($noVowels) {
3658                                $workingEntry[] = $soundTableEntry[$state];
3659                            }
3660                        }
3661                        if (count($workingEntry) < 7) {
3662                            $partialResult[] = $workingEntry;
3663                        } else {
3664                            // This is the 6th code in the sequence
3665                            // We're looking for 7 entries because the first is '!' and doesn't count
3666                            $tempResult = str_replace('!', '', implode('', $workingEntry));
3667                            // Only return codes from recognisable sounds
3668                            if ($tempResult) {
3669                                $result[] = substr($tempResult . '000000', 0, 6);
3670                            }
3671                        }
3672                    }
3673                }
3674                $state = $state + 3; // Advance to next triplet while keeping the same basic state
3675            }
3676        }
3677
3678        // Zero-fill and copy all remaining partial results
3679        foreach ($partialResult as $workingEntry) {
3680            $tempResult = str_replace('!', '', implode('', $workingEntry));
3681            // Only return codes from recognisable sounds
3682            if ($tempResult) {
3683                $result[] = substr($tempResult . '000000', 0, 6);
3684            }
3685        }
3686
3687        return $result;
3688    }
3689}
3690