xref: /webtrees/app/Encodings/ANSEL.php (revision b5505f697291435abadf92d9c68555144f816161)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2023 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Encodings;
21
22use function preg_replace;
23use function strtr;
24
25/**
26 * Convert between UTF-8 and ANSEL encoding.
27 *
28 * ANSEL is the common name for the MARC-21 encoding, also known as Z39.47, which
29 * has a number of editions.  These are denoted by a year suffix.
30 *
31 * The GEDCOM 5.5.1 specification (1999-10-02) specifies the Z39.47-1985 edition.
32 * It adds Es Zett (ß) at CF.
33 *
34 * According to Wikipedia, other non-standard characters are also added.
35 *
36 * HEX Unicode Glyph Description
37 * BE  25A1    □     Empty box
38 * BF  25A0    ■     Black box
39 * CD  0065    e     Midline e
40 * CE  006F    o     Midline o
41 * CF  00DF    ß     Es Zett
42 * FC  0338    /     Combining slash
43 *
44 * @link https://en.wikipedia.org/wiki/ANSEL
45 *
46 * The MARC-21 specification has added a number of additional characters since
47 * the 1985 edition.
48 *
49 * HEX Unicode Glyph Description
50 * 88  0098          Start of string
51 * 89  009C          String terminator
52 * 8D  200D          Zero width joiner
53 * 8E  200C          Zero width non-joiner
54 * A7  CAB9       ʹ     Single prime
55 * AC  C6A0    Ơ     LATIN CAPITAL LETTER O WITH HORN
56 * AD  C6AF    Ư     LATIN CAPITAL LETTER U WITH HORN
57 * B7  CABA    ʺ     Double prime
58 * BC  C6A1    ơ     LATIN SMALL LETTER O WITH HORN
59 * BD  C6B0    ư     LATIN SMALL LETTER U WITH HORN
60 * C0  C2B0    °     Degree sign
61 * C1  E28493  ℓ     Script small L
62 * C2  E28497  ℗     Sound recording copyright
63 * C4  E299AF  ♯     Music sharp sign
64 * C7  00DF    ß     Es Zett
65 * C8  20AC    €     Euro sign
66 * E0  0309          Hook above
67 * EB  0361          Breve (first part / double)
68 * EC  0361          Breve (second part)
69 * EF  0310          Candrabindu
70 * F2  0323          Low dot
71 * F3  0324          Diaeresis below
72 * F4  0325          Ring below
73 * F5  0333          Double underline
74 * F6  0332          Underline
75 * F8  031C          Left half ring below
76 * F9  032E          Breve below
77 * FA  0360          Double tilde (first part / double).
78 * FB  0360          Double tilde (second part).
79 * FF  0338          Slash
80 *
81 * @link https://memory.loc.gov/diglib/codetables/45.html
82 *
83 * Note that this means we can expect two different representations of Es Zett.
84 *
85 * There are two multi-part diacritics.  There are two ways to represent these.
86 *
87 * ANSEL       | UTF-8         | UTF-8 (preferred)
88 * ------------+---------------+------------------
89 * FA x FB y   | x FE22 y FE23 | x 0360 y
90 * EB x EC y   | x FE20 y FE21 | x 0361 y
91 */
92class ANSEL extends AbstractEncoding
93{
94    public const NAME = 'ANSEL'; // The name of this encoding.
95
96    protected const TO_UTF8 = [ // Maps each ANSEL byte 0x80-0xFF to a UTF-8 string; REPLACEMENT_CHARACTER marks bytes with no mapping.
97        "\x80" => UTF8::REPLACEMENT_CHARACTER,
98        "\x81" => UTF8::REPLACEMENT_CHARACTER,
99        "\x82" => UTF8::REPLACEMENT_CHARACTER,
100        "\x83" => UTF8::REPLACEMENT_CHARACTER,
101        "\x84" => UTF8::REPLACEMENT_CHARACTER,
102        "\x85" => UTF8::REPLACEMENT_CHARACTER,
103        "\x86" => UTF8::REPLACEMENT_CHARACTER,
104        "\x87" => UTF8::REPLACEMENT_CHARACTER,
105        "\x88" => UTF8::START_OF_STRING,
106        "\x89" => UTF8::STRING_TERMINATOR,
107        "\x8A" => UTF8::REPLACEMENT_CHARACTER,
108        "\x8B" => UTF8::REPLACEMENT_CHARACTER,
109        "\x8C" => UTF8::REPLACEMENT_CHARACTER,
110        "\x8D" => UTF8::ZERO_WIDTH_JOINER,
111        "\x8E" => UTF8::ZERO_WIDTH_NON_JOINER,
112        "\x8F" => UTF8::REPLACEMENT_CHARACTER,
113        "\x90" => UTF8::REPLACEMENT_CHARACTER,
114        "\x91" => UTF8::REPLACEMENT_CHARACTER,
115        "\x92" => UTF8::REPLACEMENT_CHARACTER,
116        "\x93" => UTF8::REPLACEMENT_CHARACTER,
117        "\x94" => UTF8::REPLACEMENT_CHARACTER,
118        "\x95" => UTF8::REPLACEMENT_CHARACTER,
119        "\x96" => UTF8::REPLACEMENT_CHARACTER,
120        "\x97" => UTF8::REPLACEMENT_CHARACTER,
121        "\x98" => UTF8::REPLACEMENT_CHARACTER,
122        "\x99" => UTF8::REPLACEMENT_CHARACTER,
123        "\x9A" => UTF8::REPLACEMENT_CHARACTER,
124        "\x9B" => UTF8::REPLACEMENT_CHARACTER,
125        "\x9C" => UTF8::REPLACEMENT_CHARACTER,
126        "\x9D" => UTF8::REPLACEMENT_CHARACTER,
127        "\x9E" => UTF8::REPLACEMENT_CHARACTER,
128        "\x9F" => UTF8::REPLACEMENT_CHARACTER,
129        "\xA0" => UTF8::REPLACEMENT_CHARACTER,
130        "\xA1" => UTF8::LATIN_CAPITAL_LETTER_L_WITH_STROKE,
131        "\xA2" => UTF8::LATIN_CAPITAL_LETTER_O_WITH_STROKE,
132        "\xA3" => UTF8::LATIN_CAPITAL_LETTER_D_WITH_STROKE,
133        "\xA4" => UTF8::LATIN_CAPITAL_LETTER_THORN,
134        "\xA5" => UTF8::LATIN_CAPITAL_LETTER_AE,
135        "\xA6" => UTF8::LATIN_CAPITAL_LIGATURE_OE,
136        "\xA7" => UTF8::MODIFIER_LETTER_PRIME,
137        "\xA8" => UTF8::MIDDLE_DOT,
138        "\xA9" => UTF8::MUSIC_FLAT_SIGN,
139        "\xAA" => UTF8::REGISTERED_SIGN,
140        "\xAB" => UTF8::PLUS_MINUS_SIGN,
141        "\xAC" => UTF8::LATIN_CAPITAL_LETTER_O_WITH_HORN,
142        "\xAD" => UTF8::LATIN_CAPITAL_LETTER_U_WITH_HORN,
143        "\xAE" => UTF8::MODIFIER_LETTER_APOSTROPHE,
144        "\xAF" => UTF8::REPLACEMENT_CHARACTER,
145        "\xB0" => UTF8::MODIFIER_LETTER_TURNED_COMMA,
146        "\xB1" => UTF8::LATIN_SMALL_LETTER_L_WITH_STROKE,
147        "\xB2" => UTF8::LATIN_SMALL_LETTER_O_WITH_STROKE,
148        "\xB3" => UTF8::LATIN_SMALL_LETTER_D_WITH_STROKE,
149        "\xB4" => UTF8::LATIN_SMALL_LETTER_THORN,
150        "\xB5" => UTF8::LATIN_SMALL_LETTER_AE,
151        "\xB6" => UTF8::LATIN_SMALL_LIGATURE_OE,
152        "\xB7" => UTF8::MODIFIER_LETTER_DOUBLE_PRIME,
153        "\xB8" => UTF8::LATIN_SMALL_LETTER_DOTLESS_I,
154        "\xB9" => UTF8::POUND_SIGN,
155        "\xBA" => UTF8::LATIN_SMALL_LETTER_ETH,
156        "\xBB" => UTF8::REPLACEMENT_CHARACTER,
157        "\xBC" => UTF8::LATIN_SMALL_LETTER_O_WITH_HORN,
158        "\xBD" => UTF8::LATIN_SMALL_LETTER_U_WITH_HORN,
159        "\xBE" => UTF8::WHITE_SQUARE,
160        "\xBF" => UTF8::BLACK_SQUARE,
161        "\xC0" => UTF8::DEGREE_SIGN,
162        "\xC1" => UTF8::SCRIPT_SMALL_L,
163        "\xC2" => UTF8::SOUND_RECORDING_COPYRIGHT,
164        "\xC3" => UTF8::COPYRIGHT_SIGN,
165        "\xC4" => UTF8::MUSIC_SHARP_SIGN,
166        "\xC5" => UTF8::INVERTED_QUESTION_MARK,
167        "\xC6" => UTF8::INVERTED_EXCLAMATION_MARK,
168        "\xC7" => UTF8::LATIN_SMALL_LETTER_SHARP_S, // MARC-21 position of Es Zett (U+00DF); same character as CF.
169        "\xC8" => UTF8::EURO_SIGN,
170        "\xC9" => UTF8::REPLACEMENT_CHARACTER,
171        "\xCA" => UTF8::REPLACEMENT_CHARACTER,
172        "\xCB" => UTF8::REPLACEMENT_CHARACTER,
173        "\xCC" => UTF8::REPLACEMENT_CHARACTER,
174        "\xCD" => UTF8::REPLACEMENT_CHARACTER,
175        "\xCE" => UTF8::REPLACEMENT_CHARACTER,
176        "\xCF" => UTF8::LATIN_SMALL_LETTER_SHARP_S, // GEDCOM 5.5.1 position of Es Zett (U+00DF).
177        "\xD0" => UTF8::REPLACEMENT_CHARACTER,
178        "\xD1" => UTF8::REPLACEMENT_CHARACTER,
179        "\xD2" => UTF8::REPLACEMENT_CHARACTER,
180        "\xD3" => UTF8::REPLACEMENT_CHARACTER,
181        "\xD4" => UTF8::REPLACEMENT_CHARACTER,
182        "\xD5" => UTF8::REPLACEMENT_CHARACTER,
183        "\xD6" => UTF8::REPLACEMENT_CHARACTER,
184        "\xD7" => UTF8::REPLACEMENT_CHARACTER,
185        "\xD8" => UTF8::REPLACEMENT_CHARACTER,
186        "\xD9" => UTF8::REPLACEMENT_CHARACTER,
187        "\xDA" => UTF8::REPLACEMENT_CHARACTER,
188        "\xDB" => UTF8::REPLACEMENT_CHARACTER,
189        "\xDC" => UTF8::REPLACEMENT_CHARACTER,
190        "\xDD" => UTF8::REPLACEMENT_CHARACTER,
191        "\xDE" => UTF8::REPLACEMENT_CHARACTER,
192        "\xDF" => UTF8::REPLACEMENT_CHARACTER,
193        "\xE0" => UTF8::COMBINING_HOOK_ABOVE,
194        "\xE1" => UTF8::COMBINING_GRAVE_ACCENT,
195        "\xE2" => UTF8::COMBINING_ACUTE_ACCENT,
196        "\xE3" => UTF8::COMBINING_CIRCUMFLEX_ACCENT,
197        "\xE4" => UTF8::COMBINING_TILDE,
198        "\xE5" => UTF8::COMBINING_MACRON,
199        "\xE6" => UTF8::COMBINING_BREVE,
200        "\xE7" => UTF8::COMBINING_DOT_ABOVE,
201        "\xE8" => UTF8::COMBINING_DIAERESIS,
202        "\xE9" => UTF8::COMBINING_CARON,
203        "\xEA" => UTF8::COMBINING_RING_ABOVE,
204        "\xEB" => UTF8::COMBINING_DOUBLE_INVERTED_BREVE, // First half of the two-part breve; the single double-width mark covers both halves.
205        "\xEC" => '', // Second half of the two-part breve; mapped to nothing because EB already produced the whole mark.
206        "\xED" => UTF8::COMBINING_COMMA_ABOVE_RIGHT,
207        "\xEE" => UTF8::COMBINING_DOUBLE_ACUTE_ACCENT,
208        "\xEF" => UTF8::COMBINING_CANDRABINDU,
209        "\xF0" => UTF8::COMBINING_CEDILLA,
210        "\xF1" => UTF8::COMBINING_OGONEK,
211        "\xF2" => UTF8::COMBINING_DOT_BELOW,
212        "\xF3" => UTF8::COMBINING_DIAERESIS_BELOW,
213        "\xF4" => UTF8::COMBINING_RING_BELOW,
214        "\xF5" => UTF8::COMBINING_DOUBLE_LOW_LINE,
215        "\xF6" => UTF8::COMBINING_LOW_LINE,
216        "\xF7" => UTF8::COMBINING_COMMA_BELOW,
217        "\xF8" => UTF8::COMBINING_LEFT_HALF_RING_BELOW,
218        "\xF9" => UTF8::COMBINING_BREVE_BELOW,
219        "\xFA" => UTF8::COMBINING_DOUBLE_TILDE, // First half of the two-part tilde; the single double-width mark covers both halves.
220        "\xFB" => '', // Second half of the two-part tilde; mapped to nothing because FA already produced the whole mark.
221        "\xFC" => UTF8::REPLACEMENT_CHARACTER,
222        "\xFD" => UTF8::REPLACEMENT_CHARACTER,
223        "\xFE" => UTF8::COMBINING_COMMA_ABOVE,
224        "\xFF" => UTF8::COMBINING_LONG_SOLIDUS_OVERLAY,
225    ];
226
227    // The subset of pre-composed UTF8 characters that can be made from ANSEL characters.
228    private const PRECOMPOSED_CHARACTERS = [
229        'A' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_A_WITH_ACUTE,
230        'A' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_CAPITAL_LETTER_A_WITH_BREVE,
231        'A' . UTF8::COMBINING_BREVE . UTF8::COMBINING_ACUTE_ACCENT              => UTF8::LATIN_CAPITAL_LETTER_A_WITH_BREVE_AND_ACUTE,
232        'A' . UTF8::COMBINING_BREVE . UTF8::COMBINING_DOT_BELOW                 => UTF8::LATIN_CAPITAL_LETTER_A_WITH_BREVE_AND_DOT_BELOW,
233        'A' . UTF8::COMBINING_BREVE . UTF8::COMBINING_GRAVE_ACCENT              => UTF8::LATIN_CAPITAL_LETTER_A_WITH_BREVE_AND_GRAVE,
234        'A' . UTF8::COMBINING_BREVE . UTF8::COMBINING_HOOK_ABOVE                => UTF8::LATIN_CAPITAL_LETTER_A_WITH_BREVE_AND_HOOK_ABOVE,
235        'A' . UTF8::COMBINING_BREVE . UTF8::COMBINING_TILDE                     => UTF8::LATIN_CAPITAL_LETTER_A_WITH_BREVE_AND_TILDE,
236        'A' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_A_WITH_CARON,
237        'A' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX,
238        'A' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_ACUTE_ACCENT  => UTF8::LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX_AND_ACUTE,
239        'A' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_DOT_BELOW     => UTF8::LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX_AND_DOT_BELOW,
240        'A' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_GRAVE_ACCENT  => UTF8::LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX_AND_GRAVE,
241        'A' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_HOOK_ABOVE    => UTF8::LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX_AND_HOOK_ABOVE,
242        'A' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_TILDE         => UTF8::LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX_AND_TILDE,
243        'A' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS,
244        'A' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_MACRON                => UTF8::LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS_AND_MACRON,
245        'A' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_A_WITH_DOT_ABOVE,
246        'A' . UTF8::COMBINING_DOT_ABOVE . UTF8::COMBINING_MACRON                => UTF8::LATIN_CAPITAL_LETTER_A_WITH_DOT_ABOVE_AND_MACRON,
247        'A' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_A_WITH_DOT_BELOW,
248        'A' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_A_WITH_GRAVE,
249        'A' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_CAPITAL_LETTER_A_WITH_HOOK_ABOVE,
250        'A' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_CAPITAL_LETTER_A_WITH_MACRON,
251        'A' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_CAPITAL_LETTER_A_WITH_OGONEK,
252        'A' . UTF8::COMBINING_RING_ABOVE                                        => UTF8::LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE,
253        'A' . UTF8::COMBINING_RING_ABOVE . UTF8::COMBINING_ACUTE_ACCENT         => UTF8::LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE_AND_ACUTE,
254        'A' . UTF8::COMBINING_RING_BELOW                                        => UTF8::LATIN_CAPITAL_LETTER_A_WITH_RING_BELOW,
255        'A' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_CAPITAL_LETTER_A_WITH_TILDE,
256        'B' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_B_WITH_DOT_ABOVE,
257        'B' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_B_WITH_DOT_BELOW,
258        'C' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_C_WITH_ACUTE,
259        'C' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_C_WITH_CARON,
260        'C' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_C_WITH_CEDILLA,
261        'C' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_C_WITH_CIRCUMFLEX,
262        'C' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_C_WITH_DOT_ABOVE,
263        'C' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_C_WITH_CEDILLA_AND_ACUTE,
264        'D' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_D_WITH_CARON,
265        'D' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_D_WITH_CEDILLA,
266        'D' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_D_WITH_DOT_ABOVE,
267        'D' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_D_WITH_DOT_BELOW,
268        'E' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_E_WITH_ACUTE,
269        'E' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_CAPITAL_LETTER_E_WITH_BREVE,
270        'E' . UTF8::COMBINING_BREVE . UTF8::COMBINING_CEDILLA                   => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CEDILLA_AND_BREVE,
271        'E' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CARON,
272        'E' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CEDILLA,
273        'E' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX,
274        'E' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_ACUTE_ACCENT  => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX_AND_ACUTE,
275        'E' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_DOT_BELOW     => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX_AND_DOT_BELOW,
276        'E' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_GRAVE_ACCENT  => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX_AND_GRAVE,
277        'E' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_HOOK_ABOVE    => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX_AND_HOOK_ABOVE,
278        'E' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_TILDE         => UTF8::LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX_AND_TILDE,
279        'E' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS,
280        'E' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_E_WITH_DOT_ABOVE,
281        'E' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_E_WITH_DOT_BELOW,
282        'E' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_E_WITH_GRAVE,
283        'E' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_CAPITAL_LETTER_E_WITH_HOOK_ABOVE,
284        'E' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_CAPITAL_LETTER_E_WITH_MACRON,
285        'E' . UTF8::COMBINING_MACRON . UTF8::COMBINING_ACUTE_ACCENT             => UTF8::LATIN_CAPITAL_LETTER_E_WITH_MACRON_AND_ACUTE,
286        'E' . UTF8::COMBINING_MACRON . UTF8::COMBINING_GRAVE_ACCENT             => UTF8::LATIN_CAPITAL_LETTER_E_WITH_MACRON_AND_GRAVE,
287        'E' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_CAPITAL_LETTER_E_WITH_OGONEK,
288        'E' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_CAPITAL_LETTER_E_WITH_TILDE,
289        'F' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_F_WITH_DOT_ABOVE,
290        'G' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_G_WITH_ACUTE,
291        'G' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_CAPITAL_LETTER_G_WITH_BREVE,
292        'G' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_G_WITH_CARON,
293        'G' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_G_WITH_CEDILLA,
294        'G' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_G_WITH_CIRCUMFLEX,
295        'G' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_G_WITH_DOT_ABOVE,
296        'G' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_CAPITAL_LETTER_G_WITH_MACRON,
297        'H' . UTF8::COMBINING_BREVE_BELOW                                       => UTF8::LATIN_CAPITAL_LETTER_H_WITH_BREVE_BELOW,
298        'H' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_H_WITH_CARON,
299        'H' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_H_WITH_CEDILLA,
300        'H' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_H_WITH_CIRCUMFLEX,
301        'H' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_H_WITH_DIAERESIS,
302        'H' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_H_WITH_DOT_ABOVE,
303        'H' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_H_WITH_DOT_BELOW,
304        'I' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_I_WITH_ACUTE,
305        'I' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_CAPITAL_LETTER_I_WITH_BREVE,
306        'I' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_I_WITH_CARON,
307        'I' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX,
308        'I' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS,
309        'I' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_ACUTE_ACCENT          => UTF8::LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS_AND_ACUTE,
310        'I' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
311        'I' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_I_WITH_DOT_BELOW,
312        'I' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_I_WITH_GRAVE,
313        'I' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_CAPITAL_LETTER_I_WITH_HOOK_ABOVE,
314        'I' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_CAPITAL_LETTER_I_WITH_MACRON,
315        'I' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_CAPITAL_LETTER_I_WITH_OGONEK,
316        'I' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_CAPITAL_LETTER_I_WITH_TILDE,
317        'J' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_J_WITH_CIRCUMFLEX,
318        'K' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_K_WITH_CARON,
319        'K' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_K_WITH_CEDILLA,
320        'K' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_K_WITH_ACUTE,
321        'K' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_K_WITH_DOT_BELOW,
322        'L' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_L_WITH_ACUTE,
323        'L' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_L_WITH_CARON,
324        'L' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_L_WITH_CEDILLA,
325        'L' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_L_WITH_DOT_BELOW,
326        'L' . UTF8::COMBINING_DOT_BELOW . UTF8::COMBINING_MACRON                => UTF8::LATIN_CAPITAL_LETTER_L_WITH_DOT_BELOW_AND_MACRON,
327        'M' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_M_WITH_ACUTE,
328        'M' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_M_WITH_DOT_ABOVE,
329        'M' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_M_WITH_DOT_BELOW,
330        'N' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_N_WITH_ACUTE,
331        'N' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_N_WITH_CARON,
332        'N' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_N_WITH_CEDILLA,
333        'N' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_N_WITH_DOT_ABOVE,
334        'N' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_N_WITH_DOT_BELOW,
335        'N' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_N_WITH_GRAVE,
336        'N' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_CAPITAL_LETTER_N_WITH_TILDE,
337        'O' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_O_WITH_ACUTE,
338        'O' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_CAPITAL_LETTER_O_WITH_BREVE,
339        'O' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_O_WITH_CARON,
340        'O' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX,
341        'O' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_ACUTE_ACCENT  => UTF8::LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX_AND_ACUTE,
342        'O' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_DOT_BELOW     => UTF8::LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX_AND_DOT_BELOW,
343        'O' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_GRAVE_ACCENT  => UTF8::LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX_AND_GRAVE,
344        'O' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_HOOK_ABOVE    => UTF8::LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX_AND_HOOK_ABOVE,
345        'O' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_TILDE         => UTF8::LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX_AND_TILDE,
346        'O' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS,
347        'O' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_MACRON                => UTF8::LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS_AND_MACRON,
348        'O' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_O_WITH_DOT_ABOVE,
349        'O' . UTF8::COMBINING_DOT_ABOVE . UTF8::COMBINING_MACRON                => UTF8::LATIN_CAPITAL_LETTER_O_WITH_DOT_ABOVE_AND_MACRON,
350        'O' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_O_WITH_DOT_BELOW,
351        'O' . UTF8::COMBINING_DOUBLE_ACUTE_ACCENT                               => UTF8::LATIN_CAPITAL_LETTER_O_WITH_DOUBLE_ACUTE,
352        'O' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_O_WITH_GRAVE,
353        'O' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_CAPITAL_LETTER_O_WITH_HOOK_ABOVE,
354        'O' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_CAPITAL_LETTER_O_WITH_MACRON,
355        'O' . UTF8::COMBINING_MACRON . UTF8::COMBINING_ACUTE_ACCENT             => UTF8::LATIN_CAPITAL_LETTER_O_WITH_MACRON_AND_ACUTE,
356        'O' . UTF8::COMBINING_MACRON . UTF8::COMBINING_GRAVE_ACCENT             => UTF8::LATIN_CAPITAL_LETTER_O_WITH_MACRON_AND_GRAVE,
357        'O' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_CAPITAL_LETTER_O_WITH_OGONEK,
358        'O' . UTF8::COMBINING_OGONEK . UTF8::COMBINING_MACRON                   => UTF8::LATIN_CAPITAL_LETTER_O_WITH_OGONEK_AND_MACRON,
359        'O' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_CAPITAL_LETTER_O_WITH_TILDE,
360        'O' . UTF8::COMBINING_TILDE . UTF8::COMBINING_ACUTE_ACCENT              => UTF8::LATIN_CAPITAL_LETTER_O_WITH_TILDE_AND_ACUTE,
361        'O' . UTF8::COMBINING_TILDE . UTF8::COMBINING_DIAERESIS                 => UTF8::LATIN_CAPITAL_LETTER_O_WITH_TILDE_AND_DIAERESIS,
362        'O' . UTF8::COMBINING_TILDE . UTF8::COMBINING_MACRON                    => UTF8::LATIN_CAPITAL_LETTER_O_WITH_TILDE_AND_MACRON,
363        'P' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_P_WITH_ACUTE,
364        'P' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_P_WITH_DOT_ABOVE,
365        'R' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_R_WITH_ACUTE,
366        'R' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_R_WITH_CARON,
367        'R' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_R_WITH_CEDILLA,
368        'R' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_R_WITH_DOT_ABOVE,
369        'R' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_R_WITH_DOT_BELOW,
370        'R' . UTF8::COMBINING_DOT_BELOW . UTF8::COMBINING_MACRON                => UTF8::LATIN_CAPITAL_LETTER_R_WITH_DOT_BELOW_AND_MACRON,
371        'S' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_S_WITH_ACUTE,
372        'S' . UTF8::COMBINING_ACUTE_ACCENT . UTF8::COMBINING_DOT_ABOVE          => UTF8::LATIN_CAPITAL_LETTER_S_WITH_ACUTE_AND_DOT_ABOVE,
373        'S' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_S_WITH_CARON,
374        'S' . UTF8::COMBINING_CARON . UTF8::COMBINING_DOT_ABOVE                 => UTF8::LATIN_CAPITAL_LETTER_S_WITH_CARON_AND_DOT_ABOVE,
375        'S' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_S_WITH_CEDILLA,
376        'S' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX,
377        'S' . UTF8::COMBINING_COMMA_BELOW                                       => UTF8::LATIN_CAPITAL_LETTER_S_WITH_COMMA_BELOW,
378        'S' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_S_WITH_DOT_ABOVE,
379        'S' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_S_WITH_DOT_BELOW,
380        'S' . UTF8::COMBINING_DOT_BELOW . UTF8::COMBINING_DOT_ABOVE             => UTF8::LATIN_CAPITAL_LETTER_S_WITH_DOT_BELOW_AND_DOT_ABOVE,
381        'T' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_T_WITH_CARON,
382        'T' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_CAPITAL_LETTER_T_WITH_CEDILLA,
383        'T' . UTF8::COMBINING_COMMA_BELOW                                       => UTF8::LATIN_CAPITAL_LETTER_T_WITH_COMMA_BELOW,
384        'T' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_T_WITH_DOT_ABOVE,
385        'T' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_T_WITH_DOT_BELOW,
386        'U' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_U_WITH_ACUTE,
387        'U' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_CAPITAL_LETTER_U_WITH_BREVE,
388        'U' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_U_WITH_CARON,
389        'U' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX,
390        'U' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS,
391        'U' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_ACUTE_ACCENT          => UTF8::LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_AND_ACUTE,
392        'U' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_CARON                 => UTF8::LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_AND_CARON,
393        'U' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_GRAVE_ACCENT          => UTF8::LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_AND_GRAVE,
394        'U' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_MACRON                => UTF8::LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_AND_MACRON,
395        'U' . UTF8::COMBINING_DIAERESIS_BELOW                                   => UTF8::LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_BELOW,
396        'U' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_U_WITH_DOT_BELOW,
397        'U' . UTF8::COMBINING_DOUBLE_ACUTE_ACCENT                               => UTF8::LATIN_CAPITAL_LETTER_U_WITH_DOUBLE_ACUTE,
398        'U' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_U_WITH_GRAVE,
399        'U' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_CAPITAL_LETTER_U_WITH_HOOK_ABOVE,
400        'U' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_CAPITAL_LETTER_U_WITH_MACRON,
401        'U' . UTF8::COMBINING_MACRON . UTF8::COMBINING_DIAERESIS                => UTF8::LATIN_CAPITAL_LETTER_U_WITH_MACRON_AND_DIAERESIS,
402        'U' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_CAPITAL_LETTER_U_WITH_OGONEK,
403        'U' . UTF8::COMBINING_RING_ABOVE                                        => UTF8::LATIN_CAPITAL_LETTER_U_WITH_RING_ABOVE,
404        'U' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_CAPITAL_LETTER_U_WITH_TILDE,
405        'U' . UTF8::COMBINING_TILDE . UTF8::COMBINING_ACUTE_ACCENT              => UTF8::LATIN_CAPITAL_LETTER_U_WITH_TILDE_AND_ACUTE,
406        'V' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_V_WITH_DOT_BELOW,
407        'V' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_CAPITAL_LETTER_V_WITH_TILDE,
408        'W' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_W_WITH_ACUTE,
409        'W' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_W_WITH_CIRCUMFLEX,
410        'W' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_W_WITH_DIAERESIS,
411        'W' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_W_WITH_DOT_ABOVE,
412        'W' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_W_WITH_DOT_BELOW,
413        'W' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_W_WITH_GRAVE,
414        'X' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_X_WITH_DIAERESIS,
415        'X' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_X_WITH_DOT_ABOVE,
416        'Y' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_ACUTE,
417        'Y' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_CIRCUMFLEX,
418        'Y' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS,
419        'Y' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_DOT_ABOVE,
420        'Y' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_DOT_BELOW,
421        'Y' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_GRAVE,
422        'Y' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_HOOK_ABOVE,
423        'Y' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_MACRON,
424        'Y' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_CAPITAL_LETTER_Y_WITH_TILDE,
425        'Z' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_CAPITAL_LETTER_Z_WITH_ACUTE,
426        'Z' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_CAPITAL_LETTER_Z_WITH_CARON,
427        'Z' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_CAPITAL_LETTER_Z_WITH_CIRCUMFLEX,
428        'Z' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_CAPITAL_LETTER_Z_WITH_DOT_ABOVE,
429        'Z' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_CAPITAL_LETTER_Z_WITH_DOT_BELOW,
430        'a' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_A_WITH_ACUTE,
431        'a' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_SMALL_LETTER_A_WITH_BREVE,
432        'a' . UTF8::COMBINING_BREVE . UTF8::COMBINING_ACUTE_ACCENT              => UTF8::LATIN_SMALL_LETTER_A_WITH_BREVE_AND_ACUTE,
433        'a' . UTF8::COMBINING_BREVE . UTF8::COMBINING_DOT_BELOW                 => UTF8::LATIN_SMALL_LETTER_A_WITH_BREVE_AND_DOT_BELOW,
434        'a' . UTF8::COMBINING_BREVE . UTF8::COMBINING_GRAVE_ACCENT              => UTF8::LATIN_SMALL_LETTER_A_WITH_BREVE_AND_GRAVE,
435        'a' . UTF8::COMBINING_BREVE . UTF8::COMBINING_HOOK_ABOVE                => UTF8::LATIN_SMALL_LETTER_A_WITH_BREVE_AND_HOOK_ABOVE,
436        'a' . UTF8::COMBINING_BREVE . UTF8::COMBINING_TILDE                     => UTF8::LATIN_SMALL_LETTER_A_WITH_BREVE_AND_TILDE,
437        'a' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_A_WITH_CARON,
438        'a' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX,
439        'a' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_ACUTE_ACCENT  => UTF8::LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX_AND_ACUTE,
440        'a' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_DOT_BELOW     => UTF8::LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX_AND_DOT_BELOW,
441        'a' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_GRAVE_ACCENT  => UTF8::LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX_AND_GRAVE,
442        'a' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_HOOK_ABOVE    => UTF8::LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX_AND_HOOK_ABOVE,
443        'a' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_TILDE         => UTF8::LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX_AND_TILDE,
444        'a' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_A_WITH_DIAERESIS,
445        'a' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_MACRON                => UTF8::LATIN_SMALL_LETTER_A_WITH_DIAERESIS_AND_MACRON,
446        'a' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_A_WITH_DOT_ABOVE,
447        'a' . UTF8::COMBINING_DOT_ABOVE . UTF8::COMBINING_MACRON                => UTF8::LATIN_SMALL_LETTER_A_WITH_DOT_ABOVE_AND_MACRON,
448        'a' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_A_WITH_DOT_BELOW,
449        'a' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_A_WITH_GRAVE,
450        'a' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_A_WITH_HOOK_ABOVE,
451        'a' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_SMALL_LETTER_A_WITH_MACRON,
452        'a' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_SMALL_LETTER_A_WITH_OGONEK,
453        'a' . UTF8::COMBINING_RING_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_A_WITH_RING_ABOVE,
454        'a' . UTF8::COMBINING_RING_ABOVE . UTF8::COMBINING_ACUTE_ACCENT         => UTF8::LATIN_SMALL_LETTER_A_WITH_RING_ABOVE_AND_ACUTE,
455        'a' . UTF8::COMBINING_RING_BELOW                                        => UTF8::LATIN_SMALL_LETTER_A_WITH_RING_BELOW,
456        'a' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_SMALL_LETTER_A_WITH_TILDE,
457        'b' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_B_WITH_DOT_ABOVE,
458        'b' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_B_WITH_DOT_BELOW,
459        'c' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_C_WITH_ACUTE,
460        'c' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_C_WITH_CARON,
461        'c' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_C_WITH_CEDILLA,
462        'c' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_C_WITH_CIRCUMFLEX,
463        'c' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_C_WITH_DOT_ABOVE,
464        'c' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_C_WITH_CEDILLA_AND_ACUTE,
465        'd' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_D_WITH_CARON,
466        'd' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_D_WITH_CEDILLA,
467        'd' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_D_WITH_DOT_ABOVE,
468        'd' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_D_WITH_DOT_BELOW,
469        'e' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_E_WITH_ACUTE,
470        'e' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_SMALL_LETTER_E_WITH_BREVE,
471        'e' . UTF8::COMBINING_BREVE . UTF8::COMBINING_CEDILLA                   => UTF8::LATIN_SMALL_LETTER_E_WITH_CEDILLA_AND_BREVE,
472        'e' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_E_WITH_CARON,
473        'e' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_E_WITH_CEDILLA,
474        'e' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX,
475        'e' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_ACUTE_ACCENT  => UTF8::LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX_AND_ACUTE,
476        'e' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_DOT_BELOW     => UTF8::LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX_AND_DOT_BELOW,
477        'e' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_GRAVE_ACCENT  => UTF8::LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX_AND_GRAVE,
478        'e' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_HOOK_ABOVE    => UTF8::LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX_AND_HOOK_ABOVE,
479        'e' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_TILDE         => UTF8::LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX_AND_TILDE,
480        'e' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_E_WITH_DIAERESIS,
481        'e' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_E_WITH_DOT_ABOVE,
482        'e' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_E_WITH_DOT_BELOW,
483        'e' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_E_WITH_GRAVE,
484        'e' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_E_WITH_HOOK_ABOVE,
485        'e' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_SMALL_LETTER_E_WITH_MACRON,
486        'e' . UTF8::COMBINING_MACRON . UTF8::COMBINING_ACUTE_ACCENT             => UTF8::LATIN_SMALL_LETTER_E_WITH_MACRON_AND_ACUTE,
487        'e' . UTF8::COMBINING_MACRON . UTF8::COMBINING_GRAVE_ACCENT             => UTF8::LATIN_SMALL_LETTER_E_WITH_MACRON_AND_GRAVE,
488        'e' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_SMALL_LETTER_E_WITH_OGONEK,
489        'e' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_SMALL_LETTER_E_WITH_TILDE,
490        'f' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_F_WITH_DOT_ABOVE,
491        'g' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_G_WITH_ACUTE,
492        'g' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_SMALL_LETTER_G_WITH_BREVE,
493        'g' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_G_WITH_CARON,
494        'g' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_G_WITH_CEDILLA,
495        'g' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_G_WITH_CIRCUMFLEX,
496        'g' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_G_WITH_DOT_ABOVE,
497        'g' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_SMALL_LETTER_G_WITH_MACRON,
498        'h' . UTF8::COMBINING_BREVE_BELOW                                       => UTF8::LATIN_SMALL_LETTER_H_WITH_BREVE_BELOW,
499        'h' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_H_WITH_CARON,
500        'h' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_H_WITH_CEDILLA,
501        'h' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_H_WITH_CIRCUMFLEX,
502        'h' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_H_WITH_DIAERESIS,
503        'h' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_H_WITH_DOT_ABOVE,
504        'h' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_H_WITH_DOT_BELOW,
505        'i' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_I_WITH_ACUTE,
506        'i' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_SMALL_LETTER_I_WITH_BREVE,
507        'i' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_I_WITH_CARON,
508        'i' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX,
509        'i' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_I_WITH_DIAERESIS,
510        'i' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_ACUTE_ACCENT          => UTF8::LATIN_SMALL_LETTER_I_WITH_DIAERESIS_AND_ACUTE,
511        'i' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_I_WITH_DOT_BELOW,
512        'i' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_I_WITH_GRAVE,
513        'i' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_I_WITH_HOOK_ABOVE,
514        'i' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_SMALL_LETTER_I_WITH_MACRON,
515        'i' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_SMALL_LETTER_I_WITH_OGONEK,
516        'i' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_SMALL_LETTER_I_WITH_TILDE,
517        'j' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_J_WITH_CARON,
518        'j' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_J_WITH_CIRCUMFLEX,
519        'k' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_K_WITH_CARON,
520        'k' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_K_WITH_CEDILLA,
521        'k' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_K_WITH_ACUTE,
522        'k' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_K_WITH_DOT_BELOW,
523        'l' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_L_WITH_ACUTE,
524        'l' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_L_WITH_CARON,
525        'l' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_L_WITH_CEDILLA,
526        'l' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_L_WITH_DOT_BELOW,
527        'l' . UTF8::COMBINING_DOT_BELOW . UTF8::COMBINING_MACRON                => UTF8::LATIN_SMALL_LETTER_L_WITH_DOT_BELOW_AND_MACRON,
528        'm' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_M_WITH_ACUTE,
529        'm' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_M_WITH_DOT_ABOVE,
530        'm' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_M_WITH_DOT_BELOW,
531        'n' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_N_WITH_ACUTE,
532        'n' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_N_WITH_CARON,
533        'n' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_N_WITH_CEDILLA,
534        'n' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_N_WITH_DOT_ABOVE,
535        'n' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_N_WITH_DOT_BELOW,
536        'n' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_N_WITH_GRAVE,
537        'n' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_SMALL_LETTER_N_WITH_TILDE,
538        'o' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_O_WITH_ACUTE,
539        'o' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_SMALL_LETTER_O_WITH_BREVE,
540        'o' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_O_WITH_CARON,
541        'o' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX,
542        'o' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_ACUTE_ACCENT  => UTF8::LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX_AND_ACUTE,
543        'o' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_DOT_BELOW     => UTF8::LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX_AND_DOT_BELOW,
544        'o' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_GRAVE_ACCENT  => UTF8::LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX_AND_GRAVE,
545        'o' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_HOOK_ABOVE    => UTF8::LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX_AND_HOOK_ABOVE,
546        'o' . UTF8::COMBINING_CIRCUMFLEX_ACCENT . UTF8::COMBINING_TILDE         => UTF8::LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX_AND_TILDE,
547        'o' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_O_WITH_DIAERESIS,
548        'o' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_MACRON                => UTF8::LATIN_SMALL_LETTER_O_WITH_DIAERESIS_AND_MACRON,
549        'o' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_O_WITH_DOT_ABOVE,
550        'o' . UTF8::COMBINING_DOT_ABOVE . UTF8::COMBINING_MACRON                => UTF8::LATIN_SMALL_LETTER_O_WITH_DOT_ABOVE_AND_MACRON,
551        'o' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_O_WITH_DOT_BELOW,
552        'o' . UTF8::COMBINING_DOUBLE_ACUTE_ACCENT                               => UTF8::LATIN_SMALL_LETTER_O_WITH_DOUBLE_ACUTE,
553        'o' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_O_WITH_GRAVE,
554        'o' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_O_WITH_HOOK_ABOVE,
555        'o' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_SMALL_LETTER_O_WITH_MACRON,
556        'o' . UTF8::COMBINING_MACRON . UTF8::COMBINING_ACUTE_ACCENT             => UTF8::LATIN_SMALL_LETTER_O_WITH_MACRON_AND_ACUTE,
557        'o' . UTF8::COMBINING_MACRON . UTF8::COMBINING_GRAVE_ACCENT             => UTF8::LATIN_SMALL_LETTER_O_WITH_MACRON_AND_GRAVE,
558        'o' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_SMALL_LETTER_O_WITH_OGONEK,
559        'o' . UTF8::COMBINING_OGONEK . UTF8::COMBINING_MACRON                   => UTF8::LATIN_SMALL_LETTER_O_WITH_OGONEK_AND_MACRON,
560        'o' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_SMALL_LETTER_O_WITH_TILDE,
561        'o' . UTF8::COMBINING_TILDE . UTF8::COMBINING_ACUTE_ACCENT              => UTF8::LATIN_SMALL_LETTER_O_WITH_TILDE_AND_ACUTE,
562        'o' . UTF8::COMBINING_TILDE . UTF8::COMBINING_DIAERESIS                 => UTF8::LATIN_SMALL_LETTER_O_WITH_TILDE_AND_DIAERESIS,
563        'o' . UTF8::COMBINING_TILDE . UTF8::COMBINING_MACRON                    => UTF8::LATIN_SMALL_LETTER_O_WITH_TILDE_AND_MACRON,
564        'p' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_P_WITH_ACUTE,
565        'p' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_P_WITH_DOT_ABOVE,
566        'r' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_R_WITH_ACUTE,
567        'r' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_R_WITH_CARON,
568        'r' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_R_WITH_CEDILLA,
569        'r' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_R_WITH_DOT_ABOVE,
570        'r' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_R_WITH_DOT_BELOW,
571        'r' . UTF8::COMBINING_DOT_BELOW . UTF8::COMBINING_MACRON                => UTF8::LATIN_SMALL_LETTER_R_WITH_DOT_BELOW_AND_MACRON,
572        's' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_S_WITH_ACUTE,
573        's' . UTF8::COMBINING_ACUTE_ACCENT . UTF8::COMBINING_DOT_ABOVE          => UTF8::LATIN_SMALL_LETTER_S_WITH_ACUTE_AND_DOT_ABOVE,
574        's' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_S_WITH_CARON,
575        's' . UTF8::COMBINING_CARON . UTF8::COMBINING_DOT_ABOVE                 => UTF8::LATIN_SMALL_LETTER_S_WITH_CARON_AND_DOT_ABOVE,
576        's' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_S_WITH_CEDILLA,
577        's' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX,
578        's' . UTF8::COMBINING_COMMA_BELOW                                       => UTF8::LATIN_SMALL_LETTER_S_WITH_COMMA_BELOW,
579        's' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_S_WITH_DOT_ABOVE,
580        's' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_S_WITH_DOT_BELOW,
581        's' . UTF8::COMBINING_DOT_BELOW . UTF8::COMBINING_DOT_ABOVE             => UTF8::LATIN_SMALL_LETTER_S_WITH_DOT_BELOW_AND_DOT_ABOVE,
582        't' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_T_WITH_CARON,
583        't' . UTF8::COMBINING_CEDILLA                                           => UTF8::LATIN_SMALL_LETTER_T_WITH_CEDILLA,
584        't' . UTF8::COMBINING_COMMA_BELOW                                       => UTF8::LATIN_SMALL_LETTER_T_WITH_COMMA_BELOW,
585        't' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_T_WITH_DIAERESIS,
586        't' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_T_WITH_DOT_ABOVE,
587        't' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_T_WITH_DOT_BELOW,
588        'u' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_U_WITH_ACUTE,
589        'u' . UTF8::COMBINING_BREVE                                             => UTF8::LATIN_SMALL_LETTER_U_WITH_BREVE,
590        'u' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_U_WITH_CARON,
591        'u' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX,
592        'u' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_U_WITH_DIAERESIS,
593        'u' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_ACUTE_ACCENT          => UTF8::LATIN_SMALL_LETTER_U_WITH_DIAERESIS_AND_ACUTE,
594        'u' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_CARON                 => UTF8::LATIN_SMALL_LETTER_U_WITH_DIAERESIS_AND_CARON,
595        'u' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_GRAVE_ACCENT          => UTF8::LATIN_SMALL_LETTER_U_WITH_DIAERESIS_AND_GRAVE,
596        'u' . UTF8::COMBINING_DIAERESIS . UTF8::COMBINING_MACRON                => UTF8::LATIN_SMALL_LETTER_U_WITH_DIAERESIS_AND_MACRON,
597        'u' . UTF8::COMBINING_DIAERESIS_BELOW                                   => UTF8::LATIN_SMALL_LETTER_U_WITH_DIAERESIS_BELOW,
598        'u' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_U_WITH_DOT_BELOW,
599        'u' . UTF8::COMBINING_DOUBLE_ACUTE_ACCENT                               => UTF8::LATIN_SMALL_LETTER_U_WITH_DOUBLE_ACUTE,
600        'u' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_U_WITH_GRAVE,
601        'u' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_U_WITH_HOOK_ABOVE,
602        'u' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_SMALL_LETTER_U_WITH_MACRON,
603        'u' . UTF8::COMBINING_MACRON . UTF8::COMBINING_DIAERESIS                => UTF8::LATIN_SMALL_LETTER_U_WITH_MACRON_AND_DIAERESIS,
604        'u' . UTF8::COMBINING_OGONEK                                            => UTF8::LATIN_SMALL_LETTER_U_WITH_OGONEK,
605        'u' . UTF8::COMBINING_RING_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_U_WITH_RING_ABOVE,
606        'u' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_SMALL_LETTER_U_WITH_TILDE,
607        'u' . UTF8::COMBINING_TILDE . UTF8::COMBINING_ACUTE_ACCENT              => UTF8::LATIN_SMALL_LETTER_U_WITH_TILDE_AND_ACUTE,
608        'v' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_V_WITH_DOT_BELOW,
609        'v' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_SMALL_LETTER_V_WITH_TILDE,
610        'w' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_W_WITH_ACUTE,
611        'w' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_W_WITH_CIRCUMFLEX,
612        'w' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_W_WITH_DIAERESIS,
613        'w' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_W_WITH_DOT_ABOVE,
614        'w' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_W_WITH_DOT_BELOW,
615        'w' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_W_WITH_GRAVE,
616        'w' . UTF8::COMBINING_RING_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_W_WITH_RING_ABOVE,
617        'x' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_X_WITH_DIAERESIS,
618        'x' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_X_WITH_DOT_ABOVE,
619        'y' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_Y_WITH_ACUTE,
620        'y' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_Y_WITH_CIRCUMFLEX,
621        'y' . UTF8::COMBINING_DIAERESIS                                         => UTF8::LATIN_SMALL_LETTER_Y_WITH_DIAERESIS,
622        'y' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_Y_WITH_DOT_ABOVE,
623        'y' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_Y_WITH_DOT_BELOW,
624        'y' . UTF8::COMBINING_GRAVE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_Y_WITH_GRAVE,
625        'y' . UTF8::COMBINING_HOOK_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_Y_WITH_HOOK_ABOVE,
626        'y' . UTF8::COMBINING_MACRON                                            => UTF8::LATIN_SMALL_LETTER_Y_WITH_MACRON,
627        'y' . UTF8::COMBINING_RING_ABOVE                                        => UTF8::LATIN_SMALL_LETTER_Y_WITH_RING_ABOVE,
628        'y' . UTF8::COMBINING_TILDE                                             => UTF8::LATIN_SMALL_LETTER_Y_WITH_TILDE,
629        'z' . UTF8::COMBINING_ACUTE_ACCENT                                      => UTF8::LATIN_SMALL_LETTER_Z_WITH_ACUTE,
630        'z' . UTF8::COMBINING_CARON                                             => UTF8::LATIN_SMALL_LETTER_Z_WITH_CARON,
631        'z' . UTF8::COMBINING_CIRCUMFLEX_ACCENT                                 => UTF8::LATIN_SMALL_LETTER_Z_WITH_CIRCUMFLEX,
632        'z' . UTF8::COMBINING_DOT_ABOVE                                         => UTF8::LATIN_SMALL_LETTER_Z_WITH_DOT_ABOVE,
633        'z' . UTF8::COMBINING_DOT_BELOW                                         => UTF8::LATIN_SMALL_LETTER_Z_WITH_DOT_BELOW,
634        UTF8::LATIN_CAPITAL_LETTER_AE . UTF8::COMBINING_ACUTE_ACCENT            => UTF8::LATIN_CAPITAL_LETTER_AE_WITH_ACUTE,
635        UTF8::LATIN_CAPITAL_LETTER_AE . UTF8::COMBINING_MACRON                  => UTF8::LATIN_CAPITAL_LETTER_AE_WITH_MACRON,
636        UTF8::LATIN_CAPITAL_LETTER_O_WITH_STROKE . UTF8::COMBINING_ACUTE_ACCENT => UTF8::LATIN_CAPITAL_LETTER_O_WITH_STROKE_AND_ACUTE,
637        UTF8::LATIN_SMALL_LETTER_AE . UTF8::COMBINING_ACUTE_ACCENT              => UTF8::LATIN_SMALL_LETTER_AE_WITH_ACUTE,
638        UTF8::LATIN_SMALL_LETTER_AE . UTF8::COMBINING_MACRON                    => UTF8::LATIN_SMALL_LETTER_AE_WITH_MACRON,
639        UTF8::LATIN_SMALL_LETTER_O_WITH_STROKE . UTF8::COMBINING_ACUTE_ACCENT   => UTF8::LATIN_SMALL_LETTER_O_WITH_STROKE_AND_ACUTE,
640    ];
641
642    // ANSEL supports O and U with a horn diacritic, but not the combining diacritic.
643    private const HORN_CONVERT_STEP_1 = [
644        'O' . UTF8::COMBINING_HORN => "\x00O_WITH_HORN\x00",
645        'U' . UTF8::COMBINING_HORN => "\x00U_WITH_HORN\x00",
646        'o' . UTF8::COMBINING_HORN => "\x00o_WITH_HORN\x00",
647        'u' . UTF8::COMBINING_HORN => "\x00u_WITH_HORN\x00",
648    ];
649    private const HORN_CONVERT_STEP_2 = [
650        "\x00O_WITH_HORN\x00" => "\xAC",
651        "\x00U_WITH_HORN\x00" => "\xAD",
652        "\x00o_WITH_HORN\x00" => "\xBC",
653        "\x00u_WITH_HORN\x00" => "\xBD",
654    ];
655
656    /**
657     * Convert a string from another encoding to UTF-8.
658     *
659     * @param string $text
660     *
661     * @return string
662     */
663    public function toUtf8(string $text): string
664    {
665        // ANSEL diacritics are prefixes.  UTF-8 diacritics are suffixes.
666        $text = preg_replace('/([\xE0-\xFF]+)(.)/', '$2$1', $text);
667
668        // Simple substitution creates denormalized UTF-8.
669        $text = strtr($text, self::TO_UTF8);
670
671        // Convert combining diacritics into pre-composed characters.
672        return strtr($text, self::PRECOMPOSED_CHARACTERS);
673    }
674
675    /**
676     * Convert a string from UTF-8 to another encoding.
677     *
678     * @param string $text
679     *
680     * @return string
681     */
682    public function fromUtf8(string $text): string
683    {
684        // Convert pre-composed characters into combining diacritics.
685        $text = strtr($text, array_flip(self::PRECOMPOSED_CHARACTERS));
686
687        // ANSEL supports letters with horns, but not the combining horn.
688        $text = strtr($text, self::HORN_CONVERT_STEP_1);
689
690        // Convert characters and combining diacritics separately.
691        $text = parent::fromUtf8($text);
692
693        // ANSEL supports two letters with horns, but not the combining horn.
694        $text = strtr($text, self::HORN_CONVERT_STEP_2);
695
696        // ANSEL diacritics are prefixes.  UTF-8 diacritics are suffixes.
697        $text = preg_replace('/([^\xE0-\xFF])([\xE0-\xFF]+)/', '$2$1', $text);
698
699        return $text;
700    }
701}
702