xref: /webtrees/app/Soundex.php (revision 4096896c4306e7f90f3103a5ead610d5711b7444)
1a25f0a04SGreg Roach<?php
23976b470SGreg Roach
3a25f0a04SGreg Roach/**
4a25f0a04SGreg Roach * webtrees: online genealogy
589f7189bSGreg Roach * Copyright (C) 2021 webtrees development team
6a25f0a04SGreg Roach * This program is free software: you can redistribute it and/or modify
7a25f0a04SGreg Roach * it under the terms of the GNU General Public License as published by
8a25f0a04SGreg Roach * the Free Software Foundation, either version 3 of the License, or
9a25f0a04SGreg Roach * (at your option) any later version.
10a25f0a04SGreg Roach * This program is distributed in the hope that it will be useful,
11a25f0a04SGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12a25f0a04SGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13a25f0a04SGreg Roach * GNU General Public License for more details.
14a25f0a04SGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16a25f0a04SGreg Roach */
17fcfa147eSGreg Roach
18e7f56f2aSGreg Roachdeclare(strict_types=1);
19e7f56f2aSGreg Roach
2076692c8bSGreg Roachnamespace Fisharebest\Webtrees;
21a25f0a04SGreg Roach
22a25f0a04SGreg Roach/**
2376692c8bSGreg Roach * Phonetic matching of strings.
24a25f0a04SGreg Roach */
25c1010edaSGreg Roachclass Soundex
26c1010edaSGreg Roach{
27a25f0a04SGreg Roach    // Determine the Daitch–Mokotoff Soundex code for a word
28a25f0a04SGreg Roach    // Original implementation by Gerry Kroll, and analysis by Meliza Amity
29a25f0a04SGreg Roach
30a25f0a04SGreg Roach    // Maximum table key length, measured in bytes -- NOT in UTF-8 characters!
3116cfb0b9SGreg Roach    private const MAXCHAR = 7;
32a25f0a04SGreg Roach
33a25f0a04SGreg Roach    /**
34a25f0a04SGreg Roach     * Name transformation arrays.
35a25f0a04SGreg Roach     * Used to transform the Name string to simplify the "sounds like" table.
36a25f0a04SGreg Roach     * This is especially useful in Hebrew.
37a25f0a04SGreg Roach     *
38a25f0a04SGreg Roach     * Each array entry defines the "from" and "to" arguments of a preg_replace($from, $to, $text)
39a25f0a04SGreg Roach     * function call to achieve the desired transformations.
40a25f0a04SGreg Roach     *
41a25f0a04SGreg Roach     * Note about the use of "\x01":
42a25f0a04SGreg Roach     * This code, which can’t legitimately occur in the kind of text we're dealing with,
43a25f0a04SGreg Roach     * is used as a place-holder so that conditional string replacements can be done.
44a25f0a04SGreg Roach     */
4516cfb0b9SGreg Roach    private const TRANSFORM_NAMES = [
46a25f0a04SGreg Roach        // Force Yiddish ligatures to be treated as separate letters
47*4096896cSGreg Roach        ['װ', 'וו'],
48*4096896cSGreg Roach        ['ײ', 'יי'],
49*4096896cSGreg Roach        ['ױ', 'וי'],
50*4096896cSGreg Roach        ['בו', 'בע'],
51*4096896cSGreg Roach        ['פו', 'פע'],
52*4096896cSGreg Roach        ['ומ', 'עמ'],
53*4096896cSGreg Roach        ['ום', 'עם'],
54*4096896cSGreg Roach        ['ונ', 'ענ'],
55*4096896cSGreg Roach        ['ון', 'ען'],
56*4096896cSGreg Roach        ['וו', 'ב'],
57*4096896cSGreg Roach        ["\x01", ''],
58*4096896cSGreg Roach        ['ייה$', "\x01ה"],
59*4096896cSGreg Roach        ['ייע$', "\x01ע"],
60*4096896cSGreg Roach        ['יי', 'ע'],
61*4096896cSGreg Roach        ["\x01", 'יי'],
6213abd6f3SGreg Roach    ];
63a25f0a04SGreg Roach
64a25f0a04SGreg Roach    /**
65a25f0a04SGreg Roach     * The DM sound coding table is organized this way:
66a25f0a04SGreg Roach     * key: a variable-length string that corresponds to the UTF-8 character sequence
67a25f0a04SGreg Roach     * represented by the table entry. Currently, that string can be up to 7
68a25f0a04SGreg Roach     * bytes long. This maximum length is defined by the value of the class constant
69a25f0a04SGreg Roach     * MAXCHAR.
70a25f0a04SGreg Roach     *
71a25f0a04SGreg Roach     * value: an array as follows:
72a25f0a04SGreg Roach     * [0]:  zero if not a vowel
73a25f0a04SGreg Roach     * [1]:  sound value when this string is at the beginning of the word
74a25f0a04SGreg Roach     * [2]:  sound value when this string is followed by a vowel
75a25f0a04SGreg Roach     * [3]:  sound value for other cases
76a25f0a04SGreg Roach     * [1],[2],[3] can be repeated several times to create branches in the code
77a25f0a04SGreg Roach     * an empty sound value means "ignore in this state"
78a25f0a04SGreg Roach     */
7916cfb0b9SGreg Roach    private const DM_SOUNDS = [
80*4096896cSGreg Roach        'A'       => ['1', '0', '', ''],
81*4096896cSGreg Roach        'À'       => ['1', '0', '', ''],
82*4096896cSGreg Roach        'Á'       => ['1', '0', '', ''],
83*4096896cSGreg Roach        'Â'       => ['1', '0', '', ''],
84*4096896cSGreg Roach        'Ã'       => ['1', '0', '', ''],
85*4096896cSGreg Roach        'Ä'       => ['1', '0', '1', '', '0', '', ''],
86*4096896cSGreg Roach        'Å'       => ['1', '0', '', ''],
87*4096896cSGreg Roach        'Ă'       => ['1', '0', '', ''],
88*4096896cSGreg Roach        'Ą'       => ['1', '', '', '', '', '', '6'],
89*4096896cSGreg Roach        'Ạ'       => ['1', '0', '', ''],
90*4096896cSGreg Roach        'Ả'       => ['1', '0', '', ''],
91*4096896cSGreg Roach        'Ấ'       => ['1', '0', '', ''],
92*4096896cSGreg Roach        'Ầ'       => ['1', '0', '', ''],
93*4096896cSGreg Roach        'Ẩ'       => ['1', '0', '', ''],
94*4096896cSGreg Roach        'Ẫ'       => ['1', '0', '', ''],
95*4096896cSGreg Roach        'Ậ'       => ['1', '0', '', ''],
96*4096896cSGreg Roach        'Ắ'       => ['1', '0', '', ''],
97*4096896cSGreg Roach        'Ằ'       => ['1', '0', '', ''],
98*4096896cSGreg Roach        'Ẳ'       => ['1', '0', '', ''],
99*4096896cSGreg Roach        'Ẵ'       => ['1', '0', '', ''],
100*4096896cSGreg Roach        'Ặ'       => ['1', '0', '', ''],
101*4096896cSGreg Roach        'AE'      => ['1', '0', '1', ''],
102*4096896cSGreg Roach        'Æ'       => ['1', '0', '1', ''],
103*4096896cSGreg Roach        'AI'      => ['1', '0', '1', ''],
104*4096896cSGreg Roach        'AJ'      => ['1', '0', '1', ''],
105*4096896cSGreg Roach        'AU'      => ['1', '0', '7', ''],
106*4096896cSGreg Roach        'AV'      => ['1', '0', '7', '', '7', '7', '7'],
107*4096896cSGreg Roach        'ÄU'      => ['1', '0', '1', ''],
108*4096896cSGreg Roach        'AY'      => ['1', '0', '1', ''],
109*4096896cSGreg Roach        'B'       => ['0', '7', '7', '7'],
110*4096896cSGreg Roach        'C'       => ['0', '5', '5', '5', '34', '4', '4'],
111*4096896cSGreg Roach        'Ć'       => ['0', '4', '4', '4'],
112*4096896cSGreg Roach        'Č'       => ['0', '4', '4', '4'],
113*4096896cSGreg Roach        'Ç'       => ['0', '4', '4', '4'],
114*4096896cSGreg Roach        'CH'      => ['0', '5', '5', '5', '34', '4', '4'],
115*4096896cSGreg Roach        'CHS'     => ['0', '5', '54', '54'],
116*4096896cSGreg Roach        'CK'      => ['0', '5', '5', '5', '45', '45', '45'],
117*4096896cSGreg Roach        'CCS'     => ['0', '4', '4', '4'],
118*4096896cSGreg Roach        'CS'      => ['0', '4', '4', '4'],
119*4096896cSGreg Roach        'CSZ'     => ['0', '4', '4', '4'],
120*4096896cSGreg Roach        'CZ'      => ['0', '4', '4', '4'],
121*4096896cSGreg Roach        'CZS'     => ['0', '4', '4', '4'],
122*4096896cSGreg Roach        'D'       => ['0', '3', '3', '3'],
123*4096896cSGreg Roach        'Ď'       => ['0', '3', '3', '3'],
124*4096896cSGreg Roach        'Đ'       => ['0', '3', '3', '3'],
125*4096896cSGreg Roach        'DRS'     => ['0', '4', '4', '4'],
126*4096896cSGreg Roach        'DRZ'     => ['0', '4', '4', '4'],
127*4096896cSGreg Roach        'DS'      => ['0', '4', '4', '4'],
128*4096896cSGreg Roach        'DSH'     => ['0', '4', '4', '4'],
129*4096896cSGreg Roach        'DSZ'     => ['0', '4', '4', '4'],
130*4096896cSGreg Roach        'DT'      => ['0', '3', '3', '3'],
131*4096896cSGreg Roach        'DDZ'     => ['0', '4', '4', '4'],
132*4096896cSGreg Roach        'DDZS'    => ['0', '4', '4', '4'],
133*4096896cSGreg Roach        'DZ'      => ['0', '4', '4', '4'],
134*4096896cSGreg Roach        'DŹ'      => ['0', '4', '4', '4'],
135*4096896cSGreg Roach        'DŻ'      => ['0', '4', '4', '4'],
136*4096896cSGreg Roach        'DZH'     => ['0', '4', '4', '4'],
137*4096896cSGreg Roach        'DZS'     => ['0', '4', '4', '4'],
138*4096896cSGreg Roach        'E'       => ['1', '0', '', ''],
139*4096896cSGreg Roach        'È'       => ['1', '0', '', ''],
140*4096896cSGreg Roach        'É'       => ['1', '0', '', ''],
141*4096896cSGreg Roach        'Ê'       => ['1', '0', '', ''],
142*4096896cSGreg Roach        'Ë'       => ['1', '0', '', ''],
143*4096896cSGreg Roach        'Ĕ'       => ['1', '0', '', ''],
144*4096896cSGreg Roach        'Ė'       => ['1', '0', '', ''],
145*4096896cSGreg Roach        'Ę'       => ['1', '', '', '6', '', '', ''],
146*4096896cSGreg Roach        'Ẹ'       => ['1', '0', '', ''],
147*4096896cSGreg Roach        'Ẻ'       => ['1', '0', '', ''],
148*4096896cSGreg Roach        'Ẽ'       => ['1', '0', '', ''],
149*4096896cSGreg Roach        'Ế'       => ['1', '0', '', ''],
150*4096896cSGreg Roach        'Ề'       => ['1', '0', '', ''],
151*4096896cSGreg Roach        'Ể'       => ['1', '0', '', ''],
152*4096896cSGreg Roach        'Ễ'       => ['1', '0', '', ''],
153*4096896cSGreg Roach        'Ệ'       => ['1', '0', '', ''],
154*4096896cSGreg Roach        'EAU'     => ['1', '0', '', ''],
155*4096896cSGreg Roach        'EI'      => ['1', '0', '1', ''],
156*4096896cSGreg Roach        'EJ'      => ['1', '0', '1', ''],
157*4096896cSGreg Roach        'EU'      => ['1', '1', '1', ''],
158*4096896cSGreg Roach        'EY'      => ['1', '0', '1', ''],
159*4096896cSGreg Roach        'F'       => ['0', '7', '7', '7'],
160*4096896cSGreg Roach        'FB'      => ['0', '7', '7', '7'],
161*4096896cSGreg Roach        'G'       => ['0', '5', '5', '5', '34', '4', '4'],
162*4096896cSGreg Roach        'Ğ'       => ['0', '', '', ''],
163*4096896cSGreg Roach        'GGY'     => ['0', '5', '5', '5'],
164*4096896cSGreg Roach        'GY'      => ['0', '5', '5', '5'],
165*4096896cSGreg Roach        'H'       => ['0', '5', '5', '', '5', '5', '5'],
166*4096896cSGreg Roach        'I'       => ['1', '0', '', ''],
167*4096896cSGreg Roach        'Ì'       => ['1', '0', '', ''],
168*4096896cSGreg Roach        'Í'       => ['1', '0', '', ''],
169*4096896cSGreg Roach        'Î'       => ['1', '0', '', ''],
170*4096896cSGreg Roach        'Ï'       => ['1', '0', '', ''],
171*4096896cSGreg Roach        'Ĩ'       => ['1', '0', '', ''],
172*4096896cSGreg Roach        'Į'       => ['1', '0', '', ''],
173*4096896cSGreg Roach        'İ'       => ['1', '0', '', ''],
174*4096896cSGreg Roach        'Ỉ'       => ['1', '0', '', ''],
175*4096896cSGreg Roach        'Ị'       => ['1', '0', '', ''],
176*4096896cSGreg Roach        'IA'      => ['1', '1', '', ''],
177*4096896cSGreg Roach        'IE'      => ['1', '1', '', ''],
178*4096896cSGreg Roach        'IO'      => ['1', '1', '', ''],
179*4096896cSGreg Roach        'IU'      => ['1', '1', '', ''],
180*4096896cSGreg Roach        'J'       => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
181*4096896cSGreg Roach        'K'       => ['0', '5', '5', '5'],
182*4096896cSGreg Roach        'KH'      => ['0', '5', '5', '5'],
183*4096896cSGreg Roach        'KS'      => ['0', '5', '54', '54'],
184*4096896cSGreg Roach        'L'       => ['0', '8', '8', '8'],
185*4096896cSGreg Roach        'Ľ'       => ['0', '8', '8', '8'],
186*4096896cSGreg Roach        'Ĺ'       => ['0', '8', '8', '8'],
187*4096896cSGreg Roach        'Ł'       => ['0', '7', '7', '7', '8', '8', '8'],
188*4096896cSGreg Roach        'LL'      => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
189*4096896cSGreg Roach        'LLY'     => ['0', '8', '8', '8', '1', '8', '8'],
190*4096896cSGreg Roach        'LY'      => ['0', '8', '8', '8', '1', '8', '8'],
191*4096896cSGreg Roach        'M'       => ['0', '6', '6', '6'],
192*4096896cSGreg Roach        'MĔ'      => ['0', '66', '66', '66'],
193*4096896cSGreg Roach        'MN'      => ['0', '66', '66', '66'],
194*4096896cSGreg Roach        'N'       => ['0', '6', '6', '6'],
195*4096896cSGreg Roach        'Ń'       => ['0', '6', '6', '6'],
196*4096896cSGreg Roach        'Ň'       => ['0', '6', '6', '6'],
197*4096896cSGreg Roach        'Ñ'       => ['0', '6', '6', '6'],
198*4096896cSGreg Roach        'NM'      => ['0', '66', '66', '66'],
199*4096896cSGreg Roach        'O'       => ['1', '0', '', ''],
200*4096896cSGreg Roach        'Ò'       => ['1', '0', '', ''],
201*4096896cSGreg Roach        'Ó'       => ['1', '0', '', ''],
202*4096896cSGreg Roach        'Ô'       => ['1', '0', '', ''],
203*4096896cSGreg Roach        'Õ'       => ['1', '0', '', ''],
204*4096896cSGreg Roach        'Ö'       => ['1', '0', '', ''],
205*4096896cSGreg Roach        'Ø'       => ['1', '0', '', ''],
206*4096896cSGreg Roach        'Ő'       => ['1', '0', '', ''],
207*4096896cSGreg Roach        'Œ'       => ['1', '0', '', ''],
208*4096896cSGreg Roach        'Ơ'       => ['1', '0', '', ''],
209*4096896cSGreg Roach        'Ọ'       => ['1', '0', '', ''],
210*4096896cSGreg Roach        'Ỏ'       => ['1', '0', '', ''],
211*4096896cSGreg Roach        'Ố'       => ['1', '0', '', ''],
212*4096896cSGreg Roach        'Ồ'       => ['1', '0', '', ''],
213*4096896cSGreg Roach        'Ổ'       => ['1', '0', '', ''],
214*4096896cSGreg Roach        'Ỗ'       => ['1', '0', '', ''],
215*4096896cSGreg Roach        'Ộ'       => ['1', '0', '', ''],
216*4096896cSGreg Roach        'Ớ'       => ['1', '0', '', ''],
217*4096896cSGreg Roach        'Ờ'       => ['1', '0', '', ''],
218*4096896cSGreg Roach        'Ở'       => ['1', '0', '', ''],
219*4096896cSGreg Roach        'Ỡ'       => ['1', '0', '', ''],
220*4096896cSGreg Roach        'Ợ'       => ['1', '0', '', ''],
221*4096896cSGreg Roach        'OE'      => ['1', '0', '', ''],
222*4096896cSGreg Roach        'OI'      => ['1', '0', '1', ''],
223*4096896cSGreg Roach        'OJ'      => ['1', '0', '1', ''],
224*4096896cSGreg Roach        'OU'      => ['1', '0', '', ''],
225*4096896cSGreg Roach        'OY'      => ['1', '0', '1', ''],
226*4096896cSGreg Roach        'P'       => ['0', '7', '7', '7'],
227*4096896cSGreg Roach        'PF'      => ['0', '7', '7', '7'],
228*4096896cSGreg Roach        'PH'      => ['0', '7', '7', '7'],
229*4096896cSGreg Roach        'Q'       => ['0', '5', '5', '5'],
230*4096896cSGreg Roach        'R'       => ['0', '9', '9', '9'],
231*4096896cSGreg Roach        'Ř'       => ['0', '4', '4', '4'],
232*4096896cSGreg Roach        'RS'      => ['0', '4', '4', '4', '94', '94', '94'],
233*4096896cSGreg Roach        'RZ'      => ['0', '4', '4', '4', '94', '94', '94'],
234*4096896cSGreg Roach        'S'       => ['0', '4', '4', '4'],
235*4096896cSGreg Roach        'Ś'       => ['0', '4', '4', '4'],
236*4096896cSGreg Roach        'Š'       => ['0', '4', '4', '4'],
237*4096896cSGreg Roach        'Ş'       => ['0', '4', '4', '4'],
238*4096896cSGreg Roach        'SC'      => ['0', '2', '4', '4'],
239*4096896cSGreg Roach        'ŠČ'      => ['0', '2', '4', '4'],
240*4096896cSGreg Roach        'SCH'     => ['0', '4', '4', '4'],
241*4096896cSGreg Roach        'SCHD'    => ['0', '2', '43', '43'],
242*4096896cSGreg Roach        'SCHT'    => ['0', '2', '43', '43'],
243*4096896cSGreg Roach        'SCHTCH'  => ['0', '2', '4', '4'],
244*4096896cSGreg Roach        'SCHTSCH' => ['0', '2', '4', '4'],
245*4096896cSGreg Roach        'SCHTSH'  => ['0', '2', '4', '4'],
246*4096896cSGreg Roach        'SD'      => ['0', '2', '43', '43'],
247*4096896cSGreg Roach        'SH'      => ['0', '4', '4', '4'],
248*4096896cSGreg Roach        'SHCH'    => ['0', '2', '4', '4'],
249*4096896cSGreg Roach        'SHD'     => ['0', '2', '43', '43'],
250*4096896cSGreg Roach        'SHT'     => ['0', '2', '43', '43'],
251*4096896cSGreg Roach        'SHTCH'   => ['0', '2', '4', '4'],
252*4096896cSGreg Roach        'SHTSH'   => ['0', '2', '4', '4'],
253*4096896cSGreg Roach        'ß'       => ['0', '', '4', '4'],
254*4096896cSGreg Roach        'ST'      => ['0', '2', '43', '43'],
255*4096896cSGreg Roach        'STCH'    => ['0', '2', '4', '4'],
256*4096896cSGreg Roach        'STRS'    => ['0', '2', '4', '4'],
257*4096896cSGreg Roach        'STRZ'    => ['0', '2', '4', '4'],
258*4096896cSGreg Roach        'STSCH'   => ['0', '2', '4', '4'],
259*4096896cSGreg Roach        'STSH'    => ['0', '2', '4', '4'],
260*4096896cSGreg Roach        'SSZ'     => ['0', '4', '4', '4'],
261*4096896cSGreg Roach        'SZ'      => ['0', '4', '4', '4'],
262*4096896cSGreg Roach        'SZCS'    => ['0', '2', '4', '4'],
263*4096896cSGreg Roach        'SZCZ'    => ['0', '2', '4', '4'],
264*4096896cSGreg Roach        'SZD'     => ['0', '2', '43', '43'],
265*4096896cSGreg Roach        'SZT'     => ['0', '2', '43', '43'],
266*4096896cSGreg Roach        'T'       => ['0', '3', '3', '3'],
267*4096896cSGreg Roach        'Ť'       => ['0', '3', '3', '3'],
268*4096896cSGreg Roach        'Ţ'       => ['0', '3', '3', '3', '4', '4', '4'],
269*4096896cSGreg Roach        'TC'      => ['0', '4', '4', '4'],
270*4096896cSGreg Roach        'TCH'     => ['0', '4', '4', '4'],
271*4096896cSGreg Roach        'TH'      => ['0', '3', '3', '3'],
272*4096896cSGreg Roach        'TRS'     => ['0', '4', '4', '4'],
273*4096896cSGreg Roach        'TRZ'     => ['0', '4', '4', '4'],
274*4096896cSGreg Roach        'TS'      => ['0', '4', '4', '4'],
275*4096896cSGreg Roach        'TSCH'    => ['0', '4', '4', '4'],
276*4096896cSGreg Roach        'TSH'     => ['0', '4', '4', '4'],
277*4096896cSGreg Roach        'TSZ'     => ['0', '4', '4', '4'],
278*4096896cSGreg Roach        'TTCH'    => ['0', '4', '4', '4'],
279*4096896cSGreg Roach        'TTS'     => ['0', '4', '4', '4'],
280*4096896cSGreg Roach        'TTSCH'   => ['0', '4', '4', '4'],
281*4096896cSGreg Roach        'TTSZ'    => ['0', '4', '4', '4'],
282*4096896cSGreg Roach        'TTZ'     => ['0', '4', '4', '4'],
283*4096896cSGreg Roach        'TZ'      => ['0', '4', '4', '4'],
284*4096896cSGreg Roach        'TZS'     => ['0', '4', '4', '4'],
285*4096896cSGreg Roach        'U'       => ['1', '0', '', ''],
286*4096896cSGreg Roach        'Ù'       => ['1', '0', '', ''],
287*4096896cSGreg Roach        'Ú'       => ['1', '0', '', ''],
288*4096896cSGreg Roach        'Û'       => ['1', '0', '', ''],
289*4096896cSGreg Roach        'Ü'       => ['1', '0', '', ''],
290*4096896cSGreg Roach        'Ũ'       => ['1', '0', '', ''],
291*4096896cSGreg Roach        'Ū'       => ['1', '0', '', ''],
292*4096896cSGreg Roach        'Ů'       => ['1', '0', '', ''],
293*4096896cSGreg Roach        'Ű'       => ['1', '0', '', ''],
294*4096896cSGreg Roach        'Ų'       => ['1', '0', '', ''],
295*4096896cSGreg Roach        'Ư'       => ['1', '0', '', ''],
296*4096896cSGreg Roach        'Ụ'       => ['1', '0', '', ''],
297*4096896cSGreg Roach        'Ủ'       => ['1', '0', '', ''],
298*4096896cSGreg Roach        'Ứ'       => ['1', '0', '', ''],
299*4096896cSGreg Roach        'Ừ'       => ['1', '0', '', ''],
300*4096896cSGreg Roach        'Ử'       => ['1', '0', '', ''],
301*4096896cSGreg Roach        'Ữ'       => ['1', '0', '', ''],
302*4096896cSGreg Roach        'Ự'       => ['1', '0', '', ''],
303*4096896cSGreg Roach        'UE'      => ['1', '0', '', ''],
304*4096896cSGreg Roach        'UI'      => ['1', '0', '1', ''],
305*4096896cSGreg Roach        'UJ'      => ['1', '0', '1', ''],
306*4096896cSGreg Roach        'UY'      => ['1', '0', '1', ''],
307*4096896cSGreg Roach        'UW'      => ['1', '0', '1', '', '0', '7', '7'],
308*4096896cSGreg Roach        'V'       => ['0', '7', '7', '7'],
309*4096896cSGreg Roach        'W'       => ['0', '7', '7', '7'],
310*4096896cSGreg Roach        'X'       => ['0', '5', '54', '54'],
311*4096896cSGreg Roach        'Y'       => ['1', '1', '', ''],
312*4096896cSGreg Roach        'Ý'       => ['1', '1', '', ''],
313*4096896cSGreg Roach        'Ỳ'       => ['1', '1', '', ''],
314*4096896cSGreg Roach        'Ỵ'       => ['1', '1', '', ''],
315*4096896cSGreg Roach        'Ỷ'       => ['1', '1', '', ''],
316*4096896cSGreg Roach        'Ỹ'       => ['1', '1', '', ''],
317*4096896cSGreg Roach        'Z'       => ['0', '4', '4', '4'],
318*4096896cSGreg Roach        'Ź'       => ['0', '4', '4', '4'],
319*4096896cSGreg Roach        'Ż'       => ['0', '4', '4', '4'],
320*4096896cSGreg Roach        'Ž'       => ['0', '4', '4', '4'],
321*4096896cSGreg Roach        'ZD'      => ['0', '2', '43', '43'],
322*4096896cSGreg Roach        'ZDZ'     => ['0', '2', '4', '4'],
323*4096896cSGreg Roach        'ZDZH'    => ['0', '2', '4', '4'],
324*4096896cSGreg Roach        'ZH'      => ['0', '4', '4', '4'],
325*4096896cSGreg Roach        'ZHD'     => ['0', '2', '43', '43'],
326*4096896cSGreg Roach        'ZHDZH'   => ['0', '2', '4', '4'],
327*4096896cSGreg Roach        'ZS'      => ['0', '4', '4', '4'],
328*4096896cSGreg Roach        'ZSCH'    => ['0', '4', '4', '4'],
329*4096896cSGreg Roach        'ZSH'     => ['0', '4', '4', '4'],
330*4096896cSGreg Roach        'ZZS'     => ['0', '4', '4', '4'],
331a25f0a04SGreg Roach        // Cyrillic alphabet
332*4096896cSGreg Roach        'А'       => ['1', '0', '', ''],
333*4096896cSGreg Roach        'Б'       => ['0', '7', '7', '7'],
334*4096896cSGreg Roach        'В'       => ['0', '7', '7', '7'],
335*4096896cSGreg Roach        'Г'       => ['0', '5', '5', '5'],
336*4096896cSGreg Roach        'Д'       => ['0', '3', '3', '3'],
337*4096896cSGreg Roach        'ДЗ'      => ['0', '4', '4', '4'],
338*4096896cSGreg Roach        'Е'       => ['1', '0', '', ''],
339*4096896cSGreg Roach        'Ё'       => ['1', '0', '', ''],
340*4096896cSGreg Roach        'Ж'       => ['0', '4', '4', '4'],
341*4096896cSGreg Roach        'З'       => ['0', '4', '4', '4'],
342*4096896cSGreg Roach        'И'       => ['1', '0', '', ''],
343*4096896cSGreg Roach        'Й'       => ['1', '1', '', '', '4', '4', '4'],
344*4096896cSGreg Roach        'К'       => ['0', '5', '5', '5'],
345*4096896cSGreg Roach        'Л'       => ['0', '8', '8', '8'],
346*4096896cSGreg Roach        'М'       => ['0', '6', '6', '6'],
347*4096896cSGreg Roach        'Н'       => ['0', '6', '6', '6'],
348*4096896cSGreg Roach        'О'       => ['1', '0', '', ''],
349*4096896cSGreg Roach        'П'       => ['0', '7', '7', '7'],
350*4096896cSGreg Roach        'Р'       => ['0', '9', '9', '9'],
351*4096896cSGreg Roach        'РЖ'      => ['0', '4', '4', '4'],
352*4096896cSGreg Roach        'С'       => ['0', '4', '4', '4'],
353*4096896cSGreg Roach        'Т'       => ['0', '3', '3', '3'],
354*4096896cSGreg Roach        'У'       => ['1', '0', '', ''],
355*4096896cSGreg Roach        'Ф'       => ['0', '7', '7', '7'],
356*4096896cSGreg Roach        'Х'       => ['0', '5', '5', '5'],
357*4096896cSGreg Roach        'Ц'       => ['0', '4', '4', '4'],
358*4096896cSGreg Roach        'Ч'       => ['0', '4', '4', '4'],
359*4096896cSGreg Roach        'Ш'       => ['0', '4', '4', '4'],
360*4096896cSGreg Roach        'Щ'       => ['0', '2', '4', '4'],
361*4096896cSGreg Roach        'Ъ'       => ['0', '', '', ''],
362*4096896cSGreg Roach        'Ы'       => ['0', '1', '', ''],
363*4096896cSGreg Roach        'Ь'       => ['0', '', '', ''],
364*4096896cSGreg Roach        'Э'       => ['1', '0', '', ''],
365*4096896cSGreg Roach        'Ю'       => ['0', '1', '', ''],
366*4096896cSGreg Roach        'Я'       => ['0', '1', '', ''],
367a25f0a04SGreg Roach        // Greek alphabet
368*4096896cSGreg Roach        'Α'       => ['1', '0', '', ''],
369*4096896cSGreg Roach        'Ά'       => ['1', '0', '', ''],
370*4096896cSGreg Roach        'ΑΙ'      => ['1', '0', '1', ''],
371*4096896cSGreg Roach        'ΑΥ'      => ['1', '0', '1', ''],
372*4096896cSGreg Roach        'Β'       => ['0', '7', '7', '7'],
373*4096896cSGreg Roach        'Γ'       => ['0', '5', '5', '5'],
374*4096896cSGreg Roach        'Δ'       => ['0', '3', '3', '3'],
375*4096896cSGreg Roach        'Ε'       => ['1', '0', '', ''],
376*4096896cSGreg Roach        'Έ'       => ['1', '0', '', ''],
377*4096896cSGreg Roach        'ΕΙ'      => ['1', '0', '1', ''],
378*4096896cSGreg Roach        'ΕΥ'      => ['1', '1', '1', ''],
379*4096896cSGreg Roach        'Ζ'       => ['0', '4', '4', '4'],
380*4096896cSGreg Roach        'Η'       => ['1', '0', '', ''],
381*4096896cSGreg Roach        'Ή'       => ['1', '0', '', ''],
382*4096896cSGreg Roach        'Θ'       => ['0', '3', '3', '3'],
383*4096896cSGreg Roach        'Ι'       => ['1', '0', '', ''],
384*4096896cSGreg Roach        'Ί'       => ['1', '0', '', ''],
385*4096896cSGreg Roach        'Ϊ'       => ['1', '0', '', ''],
386*4096896cSGreg Roach        'ΐ'       => ['1', '0', '', ''],
387*4096896cSGreg Roach        'Κ'       => ['0', '5', '5', '5'],
388*4096896cSGreg Roach        'Λ'       => ['0', '8', '8', '8'],
389*4096896cSGreg Roach        'Μ'       => ['0', '6', '6', '6'],
390*4096896cSGreg Roach        'ΜΠ'      => ['0', '7', '7', '7'],
391*4096896cSGreg Roach        'Ν'       => ['0', '6', '6', '6'],
392*4096896cSGreg Roach        'ΝΤ'      => ['0', '3', '3', '3'],
393*4096896cSGreg Roach        'Ξ'       => ['0', '5', '54', '54'],
394*4096896cSGreg Roach        'Ο'       => ['1', '0', '', ''],
395*4096896cSGreg Roach        'Ό'       => ['1', '0', '', ''],
396*4096896cSGreg Roach        'ΟΙ'      => ['1', '0', '1', ''],
397*4096896cSGreg Roach        'ΟΥ'      => ['1', '0', '1', ''],
398*4096896cSGreg Roach        'Π'       => ['0', '7', '7', '7'],
399*4096896cSGreg Roach        'Ρ'       => ['0', '9', '9', '9'],
400*4096896cSGreg Roach        'Σ'       => ['0', '4', '4', '4'],
401*4096896cSGreg Roach        'ς'       => ['0', '', '', '4'],
402*4096896cSGreg Roach        'Τ'       => ['0', '3', '3', '3'],
403*4096896cSGreg Roach        'ΤΖ'      => ['0', '4', '4', '4'],
404*4096896cSGreg Roach        'ΤΣ'      => ['0', '4', '4', '4'],
405*4096896cSGreg Roach        'Υ'       => ['1', '1', '', ''],
406*4096896cSGreg Roach        'Ύ'       => ['1', '1', '', ''],
407*4096896cSGreg Roach        'Ϋ'       => ['1', '1', '', ''],
408*4096896cSGreg Roach        'ΰ'       => ['1', '1', '', ''],
409*4096896cSGreg Roach        'ΥΚ'      => ['1', '5', '5', '5'],
410*4096896cSGreg Roach        'ΥΥ'      => ['1', '65', '65', '65'],
411*4096896cSGreg Roach        'Φ'       => ['0', '7', '7', '7'],
412*4096896cSGreg Roach        'Χ'       => ['0', '5', '5', '5'],
413*4096896cSGreg Roach        'Ψ'       => ['0', '7', '7', '7'],
414*4096896cSGreg Roach        'Ω'       => ['1', '0', '', ''],
415*4096896cSGreg Roach        'Ώ'       => ['1', '0', '', ''],
416a25f0a04SGreg Roach        // Hebrew alphabet
417*4096896cSGreg Roach        'א'       => ['1', '0', '', ''],
418*4096896cSGreg Roach        'או'      => ['1', '0', '7', ''],
419*4096896cSGreg Roach        'אג'      => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
420*4096896cSGreg Roach        'בב'      => ['0', '7', '7', '7', '77', '77', '77'],
421*4096896cSGreg Roach        'ב'       => ['0', '7', '7', '7'],
422*4096896cSGreg Roach        'גג'      => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
423*4096896cSGreg Roach        'גד'      => ['0', '43', '43', '43', '53', '53', '53'],
424*4096896cSGreg Roach        'גה'      => ['0', '45', '45', '45', '55', '55', '55'],
425*4096896cSGreg Roach        'גז'      => ['0', '44', '44', '44', '45', '45', '45'],
426*4096896cSGreg Roach        'גח'      => ['0', '45', '45', '45', '55', '55', '55'],
427*4096896cSGreg Roach        'גכ'      => ['0', '45', '45', '45', '55', '55', '55'],
428*4096896cSGreg Roach        'גך'      => ['0', '45', '45', '45', '55', '55', '55'],
429*4096896cSGreg Roach        'גצ'      => ['0', '44', '44', '44', '45', '45', '45'],
430*4096896cSGreg Roach        'גץ'      => ['0', '44', '44', '44', '45', '45', '45'],
431*4096896cSGreg Roach        'גק'      => ['0', '45', '45', '45', '54', '54', '54'],
432*4096896cSGreg Roach        'גש'      => ['0', '44', '44', '44', '54', '54', '54'],
433*4096896cSGreg Roach        'גת'      => ['0', '43', '43', '43', '53', '53', '53'],
434*4096896cSGreg Roach        'ג'       => ['0', '4', '4', '4', '5', '5', '5'],
435*4096896cSGreg Roach        'דז'      => ['0', '4', '4', '4'],
436*4096896cSGreg Roach        'דד'      => ['0', '3', '3', '3', '33', '33', '33'],
437*4096896cSGreg Roach        'דט'      => ['0', '33', '33', '33'],
438*4096896cSGreg Roach        'דש'      => ['0', '4', '4', '4'],
439*4096896cSGreg Roach        'דצ'      => ['0', '4', '4', '4'],
440*4096896cSGreg Roach        'דץ'      => ['0', '4', '4', '4'],
441*4096896cSGreg Roach        'ד'       => ['0', '3', '3', '3'],
442*4096896cSGreg Roach        'הג'      => ['0', '54', '54', '54', '55', '55', '55'],
443*4096896cSGreg Roach        'הכ'      => ['0', '55', '55', '55'],
444*4096896cSGreg Roach        'הח'      => ['0', '55', '55', '55'],
445*4096896cSGreg Roach        'הק'      => ['0', '55', '55', '55', '5', '5', '5'],
446*4096896cSGreg Roach        'הה'      => ['0', '5', '5', '', '55', '55', ''],
447*4096896cSGreg Roach        'ה'       => ['0', '5', '5', ''],
448*4096896cSGreg Roach        'וי'      => ['1', '', '', '', '7', '7', '7'],
449*4096896cSGreg Roach        'ו'       => ['1', '7', '7', '7', '7', '', ''],
450*4096896cSGreg Roach        'וו'      => ['1', '7', '7', '7', '7', '', ''],
451*4096896cSGreg Roach        'וופ'     => ['1', '7', '7', '7', '77', '77', '77'],
452*4096896cSGreg Roach        'זש'      => ['0', '4', '4', '4', '44', '44', '44'],
453*4096896cSGreg Roach        'זדז'     => ['0', '2', '4', '4'],
454*4096896cSGreg Roach        'ז'       => ['0', '4', '4', '4'],
455*4096896cSGreg Roach        'זג'      => ['0', '44', '44', '44', '45', '45', '45'],
456*4096896cSGreg Roach        'זז'      => ['0', '4', '4', '4', '44', '44', '44'],
457*4096896cSGreg Roach        'זס'      => ['0', '44', '44', '44'],
458*4096896cSGreg Roach        'זצ'      => ['0', '44', '44', '44'],
459*4096896cSGreg Roach        'זץ'      => ['0', '44', '44', '44'],
460*4096896cSGreg Roach        'חג'      => ['0', '54', '54', '54', '53', '53', '53'],
461*4096896cSGreg Roach        'חח'      => ['0', '5', '5', '5', '55', '55', '55'],
462*4096896cSGreg Roach        'חק'      => ['0', '55', '55', '55', '5', '5', '5'],
463*4096896cSGreg Roach        'חכ'      => ['0', '45', '45', '45', '55', '55', '55'],
464*4096896cSGreg Roach        'חס'      => ['0', '5', '54', '54'],
465*4096896cSGreg Roach        'חש'      => ['0', '5', '54', '54'],
466*4096896cSGreg Roach        'ח'       => ['0', '5', '5', '5'],
467*4096896cSGreg Roach        'טש'      => ['0', '4', '4', '4'],
468*4096896cSGreg Roach        'טד'      => ['0', '33', '33', '33'],
469*4096896cSGreg Roach        'טי'      => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
470*4096896cSGreg Roach        'טת'      => ['0', '33', '33', '33'],
471*4096896cSGreg Roach        'טט'      => ['0', '3', '3', '3', '33', '33', '33'],
472*4096896cSGreg Roach        'ט'       => ['0', '3', '3', '3'],
473*4096896cSGreg Roach        'י'       => ['1', '1', '', ''],
474*4096896cSGreg Roach        'יא'      => ['1', '1', '', '', '1', '1', '1'],
475*4096896cSGreg Roach        'כג'      => ['0', '55', '55', '55', '54', '54', '54'],
476*4096896cSGreg Roach        'כש'      => ['0', '5', '54', '54'],
477*4096896cSGreg Roach        'כס'      => ['0', '5', '54', '54'],
478*4096896cSGreg Roach        'ככ'      => ['0', '5', '5', '5', '55', '55', '55'],
479*4096896cSGreg Roach        'כך'      => ['0', '5', '5', '5', '55', '55', '55'],
480*4096896cSGreg Roach        'כ'       => ['0', '5', '5', '5'],
481*4096896cSGreg Roach        'כח'      => ['0', '55', '55', '55', '5', '5', '5'],
482*4096896cSGreg Roach        'ך'       => ['0', '', '5', '5'],
483*4096896cSGreg Roach        'ל'       => ['0', '8', '8', '8'],
484*4096896cSGreg Roach        'לל'      => ['0', '88', '88', '88', '8', '8', '8'],
485*4096896cSGreg Roach        'מנ'      => ['0', '66', '66', '66'],
486*4096896cSGreg Roach        'מן'      => ['0', '66', '66', '66'],
487*4096896cSGreg Roach        'ממ'      => ['0', '6', '6', '6', '66', '66', '66'],
488*4096896cSGreg Roach        'מם'      => ['0', '6', '6', '6', '66', '66', '66'],
489*4096896cSGreg Roach        'מ'       => ['0', '6', '6', '6'],
490*4096896cSGreg Roach        'ם'       => ['0', '', '6', '6'],
491*4096896cSGreg Roach        'נמ'      => ['0', '66', '66', '66'],
492*4096896cSGreg Roach        'נם'      => ['0', '66', '66', '66'],
493*4096896cSGreg Roach        'ננ'      => ['0', '6', '6', '6', '66', '66', '66'],
494*4096896cSGreg Roach        'נן'      => ['0', '6', '6', '6', '66', '66', '66'],
495*4096896cSGreg Roach        'נ'       => ['0', '6', '6', '6'],
496*4096896cSGreg Roach        'ן'       => ['0', '', '6', '6'],
497*4096896cSGreg Roach        'סתש'     => ['0', '2', '4', '4'],
498*4096896cSGreg Roach        'סתז'     => ['0', '2', '4', '4'],
499*4096896cSGreg Roach        'סטז'     => ['0', '2', '4', '4'],
500*4096896cSGreg Roach        'סטש'     => ['0', '2', '4', '4'],
501*4096896cSGreg Roach        'סצד'     => ['0', '2', '4', '4'],
502*4096896cSGreg Roach        'סט'      => ['0', '2', '4', '4', '43', '43', '43'],
503*4096896cSGreg Roach        'סת'      => ['0', '2', '4', '4', '43', '43', '43'],
504*4096896cSGreg Roach        'סג'      => ['0', '44', '44', '44', '4', '4', '4'],
505*4096896cSGreg Roach        'סס'      => ['0', '4', '4', '4', '44', '44', '44'],
506*4096896cSGreg Roach        'סצ'      => ['0', '44', '44', '44'],
507*4096896cSGreg Roach        'סץ'      => ['0', '44', '44', '44'],
508*4096896cSGreg Roach        'סז'      => ['0', '44', '44', '44'],
509*4096896cSGreg Roach        'סש'      => ['0', '44', '44', '44'],
510*4096896cSGreg Roach        'ס'       => ['0', '4', '4', '4'],
511*4096896cSGreg Roach        'ע'       => ['1', '0', '', ''],
512*4096896cSGreg Roach        'פב'      => ['0', '7', '7', '7', '77', '77', '77'],
513*4096896cSGreg Roach        'פוו'     => ['0', '7', '7', '7', '77', '77', '77'],
514*4096896cSGreg Roach        'פפ'      => ['0', '7', '7', '7', '77', '77', '77'],
515*4096896cSGreg Roach        'פף'      => ['0', '7', '7', '7', '77', '77', '77'],
516*4096896cSGreg Roach        'פ'       => ['0', '7', '7', '7'],
517*4096896cSGreg Roach        'ף'       => ['0', '', '7', '7'],
518*4096896cSGreg Roach        'צג'      => ['0', '44', '44', '44', '45', '45', '45'],
519*4096896cSGreg Roach        'צז'      => ['0', '44', '44', '44'],
520*4096896cSGreg Roach        'צס'      => ['0', '44', '44', '44'],
521*4096896cSGreg Roach        'צצ'      => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
522*4096896cSGreg Roach        'צץ'      => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
523*4096896cSGreg Roach        'צש'      => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
524*4096896cSGreg Roach        'צ'       => ['0', '4', '4', '4', '5', '5', '5'],
525*4096896cSGreg Roach        'ץ'       => ['0', '', '4', '4'],
526*4096896cSGreg Roach        'קה'      => ['0', '55', '55', '5'],
527*4096896cSGreg Roach        'קס'      => ['0', '5', '54', '54'],
528*4096896cSGreg Roach        'קש'      => ['0', '5', '54', '54'],
529*4096896cSGreg Roach        'קק'      => ['0', '5', '5', '5', '55', '55', '55'],
530*4096896cSGreg Roach        'קח'      => ['0', '55', '55', '55'],
531*4096896cSGreg Roach        'קכ'      => ['0', '55', '55', '55'],
532*4096896cSGreg Roach        'קך'      => ['0', '55', '55', '55'],
533*4096896cSGreg Roach        'קג'      => ['0', '55', '55', '55', '54', '54', '54'],
534*4096896cSGreg Roach        'ק'       => ['0', '5', '5', '5'],
535*4096896cSGreg Roach        'רר'      => ['0', '99', '99', '99', '9', '9', '9'],
536*4096896cSGreg Roach        'ר'       => ['0', '9', '9', '9'],
537*4096896cSGreg Roach        'שטז'     => ['0', '2', '4', '4'],
538*4096896cSGreg Roach        'שתש'     => ['0', '2', '4', '4'],
539*4096896cSGreg Roach        'שתז'     => ['0', '2', '4', '4'],
540*4096896cSGreg Roach        'שטש'     => ['0', '2', '4', '4'],
541*4096896cSGreg Roach        'שד'      => ['0', '2', '43', '43'],
542*4096896cSGreg Roach        'שז'      => ['0', '44', '44', '44'],
543*4096896cSGreg Roach        'שס'      => ['0', '44', '44', '44'],
544*4096896cSGreg Roach        'שת'      => ['0', '2', '43', '43'],
545*4096896cSGreg Roach        'שג'      => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
546*4096896cSGreg Roach        'שט'      => ['0', '2', '43', '43', '44', '44', '44'],
547*4096896cSGreg Roach        'שצ'      => ['0', '44', '44', '44', '45', '45', '45'],
548*4096896cSGreg Roach        'שץ'      => ['0', '44', '', '44', '45', '', '45'],
549*4096896cSGreg Roach        'שש'      => ['0', '4', '4', '4', '44', '44', '44'],
550*4096896cSGreg Roach        'ש'       => ['0', '4', '4', '4'],
551*4096896cSGreg Roach        'תג'      => ['0', '34', '34', '34'],
552*4096896cSGreg Roach        'תז'      => ['0', '34', '34', '34'],
553*4096896cSGreg Roach        'תש'      => ['0', '4', '4', '4'],
554*4096896cSGreg Roach        'תת'      => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
555*4096896cSGreg Roach        'ת'       => ['0', '3', '3', '3', '4', '4', '4'],
556a25f0a04SGreg Roach        // Arabic alphabet
557*4096896cSGreg Roach        'ا'       => ['1', '0', '', ''],
558*4096896cSGreg Roach        'ب'       => ['0', '7', '7', '7'],
559*4096896cSGreg Roach        'ت'       => ['0', '3', '3', '3'],
560*4096896cSGreg Roach        'ث'       => ['0', '3', '3', '3'],
561*4096896cSGreg Roach        'ج'       => ['0', '4', '4', '4'],
562*4096896cSGreg Roach        'ح'       => ['0', '5', '5', '5'],
563*4096896cSGreg Roach        'خ'       => ['0', '5', '5', '5'],
564*4096896cSGreg Roach        'د'       => ['0', '3', '3', '3'],
565*4096896cSGreg Roach        'ذ'       => ['0', '3', '3', '3'],
566*4096896cSGreg Roach        'ر'       => ['0', '9', '9', '9'],
567*4096896cSGreg Roach        'ز'       => ['0', '4', '4', '4'],
568*4096896cSGreg Roach        'س'       => ['0', '4', '4', '4'],
569*4096896cSGreg Roach        'ش'       => ['0', '4', '4', '4'],
570*4096896cSGreg Roach        'ص'       => ['0', '4', '4', '4'],
571*4096896cSGreg Roach        'ض'       => ['0', '3', '3', '3'],
572*4096896cSGreg Roach        'ط'       => ['0', '3', '3', '3'],
573*4096896cSGreg Roach        'ظ'       => ['0', '4', '4', '4'],
574*4096896cSGreg Roach        'ع'       => ['1', '0', '', ''],
575*4096896cSGreg Roach        'غ'       => ['0', '0', '', ''],
576*4096896cSGreg Roach        'ف'       => ['0', '7', '7', '7'],
577*4096896cSGreg Roach        'ق'       => ['0', '5', '5', '5'],
578*4096896cSGreg Roach        'ك'       => ['0', '5', '5', '5'],
579*4096896cSGreg Roach        'ل'       => ['0', '8', '8', '8'],
580*4096896cSGreg Roach        'لا'      => ['0', '8', '8', '8'],
581*4096896cSGreg Roach        'م'       => ['0', '6', '6', '6'],
582*4096896cSGreg Roach        'ن'       => ['0', '6', '6', '6'],
583*4096896cSGreg Roach        'هن'      => ['0', '66', '66', '66'],
584*4096896cSGreg Roach        'ه'       => ['0', '5', '5', ''],
585*4096896cSGreg Roach        'و'       => ['1', '', '', '', '7', '', ''],
586*4096896cSGreg Roach        'ي'       => ['0', '1', '', ''],
587*4096896cSGreg Roach        'آ'       => ['0', '1', '', ''],
588*4096896cSGreg Roach        'ة'       => ['0', '', '', '3'],
589*4096896cSGreg Roach        'ی'       => ['0', '1', '', ''],
590*4096896cSGreg Roach        'ى'       => ['1', '1', '', ''],
59113abd6f3SGreg Roach    ];
592a25f0a04SGreg Roach
593a25f0a04SGreg Roach    /**
59416cfb0b9SGreg Roach     * Which algorithms are supported.
59516cfb0b9SGreg Roach     *
59624f2a3afSGreg Roach     * @return array<string>
59716cfb0b9SGreg Roach     */
59816cfb0b9SGreg Roach    public static function getAlgorithms(): array
59916cfb0b9SGreg Roach    {
60016cfb0b9SGreg Roach        return [
601ad3143ccSGreg Roach            /* I18N: https://en.wikipedia.org/wiki/Soundex */
60216cfb0b9SGreg Roach            'std' => I18N::translate('Russell'),
603ad3143ccSGreg Roach            /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
60416cfb0b9SGreg Roach            'dm'  => I18N::translate('Daitch-Mokotoff'),
60516cfb0b9SGreg Roach        ];
60616cfb0b9SGreg Roach    }
60716cfb0b9SGreg Roach
60816cfb0b9SGreg Roach    /**
60916cfb0b9SGreg Roach     * Is there a match between two soundex codes?
61016cfb0b9SGreg Roach     *
61116cfb0b9SGreg Roach     * @param string $soundex1
61216cfb0b9SGreg Roach     * @param string $soundex2
61316cfb0b9SGreg Roach     *
61416cfb0b9SGreg Roach     * @return bool
61516cfb0b9SGreg Roach     */
61624f2a3afSGreg Roach    public static function compare(string $soundex1, string $soundex2): bool
61716cfb0b9SGreg Roach    {
61816cfb0b9SGreg Roach        if ($soundex1 !== '' && $soundex2 !== '') {
61954c1ab5eSGreg Roach            return array_intersect(explode(':', $soundex1), explode(':', $soundex2)) !== [];
62016cfb0b9SGreg Roach        }
62116cfb0b9SGreg Roach
62216cfb0b9SGreg Roach        return false;
62316cfb0b9SGreg Roach    }
62416cfb0b9SGreg Roach
62516cfb0b9SGreg Roach    /**
62616cfb0b9SGreg Roach     * Generate Russell soundex codes for a given text.
62716cfb0b9SGreg Roach     *
62816cfb0b9SGreg Roach     * @param string $text
62916cfb0b9SGreg Roach     *
63016cfb0b9SGreg Roach     * @return string
63116cfb0b9SGreg Roach     */
63216cfb0b9SGreg Roach    public static function russell(string $text): string
63316cfb0b9SGreg Roach    {
63416cfb0b9SGreg Roach        $words         = explode(' ', $text);
63516cfb0b9SGreg Roach        $soundex_array = [];
63616cfb0b9SGreg Roach
63716cfb0b9SGreg Roach        foreach ($words as $word) {
63816cfb0b9SGreg Roach            $soundex = soundex($word);
63916cfb0b9SGreg Roach
64016cfb0b9SGreg Roach            // Only return codes from recognisable sounds
64116cfb0b9SGreg Roach            if ($soundex !== '0000') {
64216cfb0b9SGreg Roach                $soundex_array[] = $soundex;
64316cfb0b9SGreg Roach            }
64416cfb0b9SGreg Roach        }
64516cfb0b9SGreg Roach
64616cfb0b9SGreg Roach        // Combine words, e.g. “New York” as “Newyork”
64716cfb0b9SGreg Roach        if (count($words) > 1) {
648e364afe4SGreg Roach            $soundex_array[] = soundex(str_replace(' ', '', $text));
64916cfb0b9SGreg Roach        }
65016cfb0b9SGreg Roach
65116cfb0b9SGreg Roach        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
65216cfb0b9SGreg Roach        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);
65316cfb0b9SGreg Roach
65416cfb0b9SGreg Roach        return implode(':', $soundex_array);
65516cfb0b9SGreg Roach    }
65616cfb0b9SGreg Roach
65716cfb0b9SGreg Roach    /**
65816cfb0b9SGreg Roach     * Generate Daitch–Mokotoff soundex codes for a given text.
65916cfb0b9SGreg Roach     *
66016cfb0b9SGreg Roach     * @param string $text
66116cfb0b9SGreg Roach     *
66216cfb0b9SGreg Roach     * @return string
66316cfb0b9SGreg Roach     */
66416cfb0b9SGreg Roach    public static function daitchMokotoff(string $text): string
66516cfb0b9SGreg Roach    {
66616cfb0b9SGreg Roach        $words         = explode(' ', $text);
66716cfb0b9SGreg Roach        $soundex_array = [];
66816cfb0b9SGreg Roach
66916cfb0b9SGreg Roach        foreach ($words as $word) {
67016cfb0b9SGreg Roach            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord($word));
67116cfb0b9SGreg Roach        }
67216cfb0b9SGreg Roach        // Combine words, e.g. “New York” as “Newyork”
67316cfb0b9SGreg Roach        if (count($words) > 1) {
674e364afe4SGreg Roach            $soundex_array = array_merge($soundex_array, self::daitchMokotoffWord(str_replace(' ', '', $text)));
67516cfb0b9SGreg Roach        }
67616cfb0b9SGreg Roach
67716cfb0b9SGreg Roach        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
67816cfb0b9SGreg Roach        $soundex_array = array_slice(array_unique($soundex_array), 0, 36);
67916cfb0b9SGreg Roach
68016cfb0b9SGreg Roach        return implode(':', $soundex_array);
68116cfb0b9SGreg Roach    }
68216cfb0b9SGreg Roach
68316cfb0b9SGreg Roach    /**
68476692c8bSGreg Roach     * Calculate the Daitch-Mokotoff soundex for a word.
68576692c8bSGreg Roach     *
686a25f0a04SGreg Roach     * @param string $name
687a25f0a04SGreg Roach     *
68824f2a3afSGreg Roach     * @return array<string> List of possible DM codes for the word.
689a25f0a04SGreg Roach     */
69024f2a3afSGreg Roach    private static function daitchMokotoffWord(string $name): array
691c1010edaSGreg Roach    {
692a25f0a04SGreg Roach        // Apply special transformation rules to the input string
693a25f0a04SGreg Roach        $name = I18N::strtoupper($name);
69416cfb0b9SGreg Roach        foreach (self::TRANSFORM_NAMES as $transformRule) {
695a25f0a04SGreg Roach            $name = str_replace($transformRule[0], $transformRule[1], $name);
696a25f0a04SGreg Roach        }
697a25f0a04SGreg Roach
698a25f0a04SGreg Roach        // Initialize
699a25f0a04SGreg Roach        $name_script = I18N::textScript($name);
700dd71ff6bSGreg Roach        $noVowels    = $name_script === 'Hebr' || $name_script === 'Arab';
701a25f0a04SGreg Roach
702a25f0a04SGreg Roach        $lastPos         = strlen($name) - 1;
703a25f0a04SGreg Roach        $currPos         = 0;
704a25f0a04SGreg Roach        $state           = 1; // 1: start of input string, 2: before vowel, 3: other
70513abd6f3SGreg Roach        $result          = []; // accumulate complete 6-digit D-M codes here
70613abd6f3SGreg Roach        $partialResult   = []; // accumulate incomplete D-M codes here
70713abd6f3SGreg Roach        $partialResult[] = ['!']; // initialize 1st partial result  ('!' stops "duplicate sound" check)
708a25f0a04SGreg Roach
709a25f0a04SGreg Roach        // Loop through the input string.
710a25f0a04SGreg Roach        // Stop when the string is exhausted or when no more partial results remain
711a25f0a04SGreg Roach        while (count($partialResult) !== 0 && $currPos <= $lastPos) {
712a25f0a04SGreg Roach            // Find the DM coding table entry for the chunk at the current position
713a25f0a04SGreg Roach            $thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
714e364afe4SGreg Roach            while ($thisEntry !== '') {
71516cfb0b9SGreg Roach                if (isset(self::DM_SOUNDS[$thisEntry])) {
716a25f0a04SGreg Roach                    break;
717a25f0a04SGreg Roach                }
718a25f0a04SGreg Roach                $thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
719a25f0a04SGreg Roach            }
720a25f0a04SGreg Roach            if ($thisEntry === '') {
721a25f0a04SGreg Roach                $currPos++; // Not in table: advance pointer to next byte
722a25f0a04SGreg Roach                continue; // and try again
723a25f0a04SGreg Roach            }
724a25f0a04SGreg Roach
72516cfb0b9SGreg Roach            $soundTableEntry = self::DM_SOUNDS[$thisEntry];
726a25f0a04SGreg Roach            $workingResult   = $partialResult;
72713abd6f3SGreg Roach            $partialResult   = [];
728a25f0a04SGreg Roach            $currPos += strlen($thisEntry);
729a25f0a04SGreg Roach
730a25f0a04SGreg Roach            // Not at beginning of input string
731e364afe4SGreg Roach            if ($state !== 1) {
732a25f0a04SGreg Roach                if ($currPos <= $lastPos) {
733a25f0a04SGreg Roach                    // Determine whether the next chunk is a vowel
734a25f0a04SGreg Roach                    $nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
735e364afe4SGreg Roach                    while ($nextEntry !== '') {
73616cfb0b9SGreg Roach                        if (isset(self::DM_SOUNDS[$nextEntry])) {
737a25f0a04SGreg Roach                            break;
738a25f0a04SGreg Roach                        }
739a25f0a04SGreg Roach                        $nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
740a25f0a04SGreg Roach                    }
741a25f0a04SGreg Roach                } else {
742a25f0a04SGreg Roach                    $nextEntry = '';
743a25f0a04SGreg Roach                }
744e364afe4SGreg Roach                if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
745a25f0a04SGreg Roach                    $state = 2;
746a25f0a04SGreg Roach                } else {
747a25f0a04SGreg Roach                    // Next chunk is a vowel
748a25f0a04SGreg Roach                    $state = 3;
749a25f0a04SGreg Roach                }
750a25f0a04SGreg Roach            }
751a25f0a04SGreg Roach
752a25f0a04SGreg Roach            while ($state < count($soundTableEntry)) {
753a25f0a04SGreg Roach                // empty means 'ignore this sound in this state'
754e364afe4SGreg Roach                if ($soundTableEntry[$state] === '') {
755a25f0a04SGreg Roach                    foreach ($workingResult as $workingEntry) {
756a25f0a04SGreg Roach                        $tempEntry                        = $workingEntry;
757a25f0a04SGreg Roach                        $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
758a25f0a04SGreg Roach                        $partialResult[]                  = $tempEntry;
759a25f0a04SGreg Roach                    }
760a25f0a04SGreg Roach                } else {
761a25f0a04SGreg Roach                    foreach ($workingResult as $workingEntry) {
762a25f0a04SGreg Roach                        if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
763a25f0a04SGreg Roach                            // Incoming sound isn't a duplicate of the previous sound
764a25f0a04SGreg Roach                            $workingEntry[] = $soundTableEntry[$state];
765e364afe4SGreg Roach                        } elseif ($noVowels) {
766a25f0a04SGreg Roach                            // Incoming sound is a duplicate of the previous sound
767a25f0a04SGreg Roach                            // For Hebrew and Arabic, we need to create a pair of D-M sound codes,
768a25f0a04SGreg Roach                            // one of the pair with only a single occurrence of the duplicate sound,
769a25f0a04SGreg Roach                            // the other with both occurrences
770a25f0a04SGreg Roach                            $workingEntry[] = $soundTableEntry[$state];
771a25f0a04SGreg Roach                        }
772e364afe4SGreg Roach
773a25f0a04SGreg Roach                        if (count($workingEntry) < 7) {
774a25f0a04SGreg Roach                            $partialResult[] = $workingEntry;
775a25f0a04SGreg Roach                        } else {
776a25f0a04SGreg Roach                            // This is the 6th code in the sequence
777a25f0a04SGreg Roach                            // We're looking for 7 entries because the first is '!' and doesn't count
778a25f0a04SGreg Roach                            $tempResult = str_replace('!', '', implode('', $workingEntry));
779a25f0a04SGreg Roach                            // Only return codes from recognisable sounds
780a25f0a04SGreg Roach                            if ($tempResult) {
781a25f0a04SGreg Roach                                $result[] = substr($tempResult . '000000', 0, 6);
782a25f0a04SGreg Roach                            }
783a25f0a04SGreg Roach                        }
784a25f0a04SGreg Roach                    }
785a25f0a04SGreg Roach                }
786e364afe4SGreg Roach                $state += 3; // Advance to next triplet while keeping the same basic state
787a25f0a04SGreg Roach            }
788a25f0a04SGreg Roach        }
789a25f0a04SGreg Roach
790a25f0a04SGreg Roach        // Zero-fill and copy all remaining partial results
791a25f0a04SGreg Roach        foreach ($partialResult as $workingEntry) {
792a25f0a04SGreg Roach            $tempResult = str_replace('!', '', implode('', $workingEntry));
793a25f0a04SGreg Roach            // Only return codes from recognisable sounds
794a25f0a04SGreg Roach            if ($tempResult) {
795a25f0a04SGreg Roach                $result[] = substr($tempResult . '000000', 0, 6);
796a25f0a04SGreg Roach            }
797a25f0a04SGreg Roach        }
798a25f0a04SGreg Roach
799a25f0a04SGreg Roach        return $result;
800a25f0a04SGreg Roach    }
801a25f0a04SGreg Roach}
802