xref: /webtrees/tests/app/Encodings/AnselTest.php (revision 5a8afed46297e8105e3e5a33ce37e6a8e88bc79d)
11c6adce8SGreg Roach<?php
21c6adce8SGreg Roach
31c6adce8SGreg Roach/**
41c6adce8SGreg Roach * webtrees: online genealogy
5d11be702SGreg Roach * Copyright (C) 2023 webtrees development team
61c6adce8SGreg Roach * This program is free software: you can redistribute it and/or modify
71c6adce8SGreg Roach * it under the terms of the GNU General Public License as published by
81c6adce8SGreg Roach * the Free Software Foundation, either version 3 of the License, or
91c6adce8SGreg Roach * (at your option) any later version.
101c6adce8SGreg Roach * This program is distributed in the hope that it will be useful,
111c6adce8SGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
121c6adce8SGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
131c6adce8SGreg Roach * GNU General Public License for more details.
141c6adce8SGreg Roach * You should have received a copy of the GNU General Public License
151c6adce8SGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
161c6adce8SGreg Roach */
171c6adce8SGreg Roach
181c6adce8SGreg Roachdeclare(strict_types=1);
191c6adce8SGreg Roach
201c6adce8SGreg Roachnamespace Fisharebest\Webtrees\Tests\Encodings;
211c6adce8SGreg Roach
22*202c018bSGreg Roachuse Fisharebest\Webtrees\Encodings\AbstractEncoding;
231c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ANSEL;
241c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\UTF8;
251c6adce8SGreg Roachuse Normalizer;
26*202c018bSGreg Roachuse PHPUnit\Framework\Attributes\CoversClass;
271c6adce8SGreg Roachuse PHPUnit\Framework\TestCase;
281c6adce8SGreg Roach
291c6adce8SGreg Roachuse function bin2hex;
3010e06497SGreg Roachuse function count;
311c6adce8SGreg Roachuse function ctype_alpha;
321c6adce8SGreg Roachuse function dechex;
331c6adce8SGreg Roachuse function in_array;
341c6adce8SGreg Roachuse function preg_split;
351c6adce8SGreg Roachuse function range;
3610e06497SGreg Roachuse function strlen;
371c6adce8SGreg Roach
381c6adce8SGreg Roachuse const PREG_SPLIT_NO_EMPTY;
391c6adce8SGreg Roach
/**
 * Tests for the ANSEL encoding: conversion to and from UTF-8.
 */
#[CoversClass(AbstractEncoding::class)]
#[CoversClass(ANSEL::class)]
#[CoversClass(UTF8::class)]
class AnselTest extends TestCase
{
    /**
     * UTF-8 text (key) paired with the ANSEL byte sequence (value) it is expected to convert to.
     * The same table is used in both directions by testToUtf8() and testFromUtf8().
     */
    private const TEST_DATA = [
        "\x00\x01\x02\x03\x04\x05\x06\x07"         => "\x00\x01\x02\x03\x04\x05\x06\x07",
        "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"         => "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
        "\x10\x11\x12\x13\x14\x15\x16\x17"         => "\x10\x11\x12\x13\x14\x15\x16\x17",
        "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"         => "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
        ' !"#$%&\''                                => "\x20\x21\x22\x23\x24\x25\x26\x27",
        '()*+,-./'                                 => "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F",
        '01234567'                                 => "\x30\x31\x32\x33\x34\x35\x36\x37",
        '89:;<=>?'                                 => "\x38\x39\x3A\x3B\x3C\x3D\x3E\x3F",
        '@ABCDEFG'                                 => "\x40\x41\x42\x43\x44\x45\x46\x47",
        'HIJKLMNO'                                 => "\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F",
        'PQRSTUVW'                                 => "\x50\x51\x52\x53\x54\x55\x56\x57",
        'XYZ[\\]^_'                                => "\x58\x59\x5A\x5B\x5C\x5D\x5E\x5F",
        '`abcdefg'                                 => "\x60\x61\x62\x63\x64\x65\x66\x67",
        'hijklmno'                                 => "\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F",
        'pqrstuvw'                                 => "\x70\x71\x72\x73\x74\x75\x76\x77",
        "xyz{|}~\x7F"                              => "\x78\x79\x7A\x7B\x7C\x7D\x7E\x7F",
        "\xC2\x98\xC2\x9C\xE2\x80\x8D\xE2\x80\x8C" => "\x88\x89\x8D\x8E",
        'ŁØĐÞÆŒʹ'                                  => "\xA1\xA2\xA3\xA4\xA5\xA6\xA7",
        '·♭®±ƠƯʼ'                                  => "\xA8\xA9\xAA\xAB\xAC\xAD\xAE",
        'ʻłøđþæœʺ'                                 => "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7",
        'ı£ðơư'                                    => "\xB8\xB9\xBA\xBC\xBD",
        '°ℓ℗©♯¿¡ẞ€'                                => "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8",
        // Combining diacritics
        'ảàáâãāăȧ'                                 => "\xE0a\xE1a\xE2a\xE3a\xE4a\xE5a\xE6a\xE7a",
        'äǎåa͡a̕a̋a̐'                              => "\xE8a\xE9a\xEAa\xEBa\xEDa\xEEa\xEFa",
        'a̧ąạa̤ḁa̳a̲a̦'                            => "\xF0a\xF1a\xF2a\xF3a\xF4a\xF5a\xF6a\xF7a",
        'a̜a̮a͠a̓a̸'                               => "\xF8a\xF9a\xFAa\xFEa\xFFa",
        // Diacritics with non-ascii
        'ǣ'                                        => "\xE5\xB5",
        // LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON
        'Ō̈'                                       => "\xE5\xE8O",
        // LATIN CAPITAL LETTER O WITH MACRON AND DIAERESIS
        'Ȫ'                                        => "\xE8\xE5O",
    ];

    /**
     * ANSEL byte strings for which toUtf8() is expected to produce one
     * UTF-8 replacement character per input byte (see testUnprintable()).
     */
    private const UNPRINTABLE = [
        "\x80\x81\x82\x83\x84\x85\x86\x87",
        "\x8A\x8B\x8C\x8F",
        "\x90\x91\x92\x93\x94\x95\x96\x97",
        "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F",
        "\xA0",
        "\xAF",
        "\xBB",
        "\xC9\xCA\xCB\xCC\xCD\xCE",
        "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7",
        "\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF",
        "\xFC\xFD",
    ];

    /**
     * ANSEL bytes for which toUtf8() is expected to produce an empty string
     * (see testMultiPartDiacritic()).
     */
    private const MULTIPART_DIACRITIC = ["\xEC", "\xFB"];

    /**
     * Pre-composed Latin characters must survive a UTF-8 -> ANSEL -> UTF-8 round trip.
     *
     * Only code points whose NFD form differs from the character itself, and whose
     * decomposition is accepted by isRoundTripCandidate(), are checked.
     */
    public function testPreComposedCharacters(): void
    {
        $latin_code_blocks = [
            range(0x80, 0xFF),
            range(0x100, 0x17F),
            range(0x180, 0x24F),
            range(0x1E00, 0x1EFF),
        ];

        $ansel_combining_characters = [
            UTF8::COMBINING_HOOK_ABOVE,
            UTF8::COMBINING_GRAVE_ACCENT,
            UTF8::COMBINING_ACUTE_ACCENT,
            UTF8::COMBINING_CIRCUMFLEX_ACCENT,
            UTF8::COMBINING_TILDE,
            UTF8::COMBINING_MACRON,
            UTF8::COMBINING_BREVE,
            UTF8::COMBINING_DOT_ABOVE,
            UTF8::COMBINING_DIAERESIS,
            UTF8::COMBINING_CARON,
            UTF8::COMBINING_RING_ABOVE,
            UTF8::COMBINING_DOUBLE_INVERTED_BREVE,
            UTF8::COMBINING_COMMA_ABOVE_RIGHT,
            UTF8::COMBINING_DOUBLE_ACUTE_ACCENT,
            UTF8::COMBINING_CANDRABINDU,
            UTF8::COMBINING_CEDILLA,
            UTF8::COMBINING_OGONEK,
            UTF8::COMBINING_DOT_BELOW,
            UTF8::COMBINING_DIAERESIS_BELOW,
            UTF8::COMBINING_RING_BELOW,
            UTF8::COMBINING_DOUBLE_LOW_LINE,
            UTF8::COMBINING_LOW_LINE,
            UTF8::COMBINING_COMMA_BELOW,
            UTF8::COMBINING_LEFT_HALF_RING_BELOW,
            UTF8::COMBINING_BREVE_BELOW,
            UTF8::COMBINING_DOUBLE_TILDE,
            // NOTE(review): the two replacement characters look like placeholders for
            // the ANSEL codes 0xFC and 0xFD listed in UNPRINTABLE - confirm.
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::COMBINING_COMMA_ABOVE,
            UTF8::COMBINING_LONG_SOLIDUS_OVERLAY,
        ];

        $encoding = new ANSEL();

        foreach ($latin_code_blocks as $codes) {
            foreach ($codes as $code) {
                $utf8 = UTF8::chr($code);
                $norm = Normalizer::normalize($utf8, Normalizer::FORM_D);

                // Characters without a canonical decomposition are not pre-composed.
                if ($norm === $utf8) {
                    continue;
                }

                // Split the decomposed form into individual code points.
                $chars = preg_split('//u', $norm, -1, PREG_SPLIT_NO_EMPTY);

                if (!self::isRoundTripCandidate($chars, $ansel_combining_characters)) {
                    continue;
                }

                self::assertSame($utf8, $encoding->toUtf8($encoding->fromUtf8($utf8)), 'U+' . dechex($code));
            }
        }
    }

    /**
     * Every ANSEL sequence in TEST_DATA must convert to its paired UTF-8 text.
     */
    public function testToUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $ansel) {
            // Convert once; the result feeds both the assertion and the failure message.
            $actual = $encoding->toUtf8($ansel);

            self::assertSame($utf8, $actual, bin2hex($utf8) . ' ' . bin2hex($actual));
        }
    }

    /**
     * Every UTF-8 text in TEST_DATA must convert to its paired ANSEL sequence.
     */
    public function testFromUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $ansel) {
            // The hex dump of the input identifies the failing row of TEST_DATA.
            self::assertSame($ansel, $encoding->fromUtf8($utf8), bin2hex($utf8));
        }
    }

    /**
     * Each byte of an UNPRINTABLE sequence must become one UTF-8 replacement character.
     */
    public function testUnprintable(): void
    {
        $encoding = new ANSEL();

        foreach (self::UNPRINTABLE as $chars) {
            $expected = str_repeat(UTF8::REPLACEMENT_CHARACTER, strlen($chars));
            self::assertSame($expected, $encoding->toUtf8($chars));
        }
    }

    /**
     * Each MULTIPART_DIACRITIC byte must convert to an empty string.
     */
    public function testMultiPartDiacritic(): void
    {
        $encoding = new ANSEL();

        foreach (self::MULTIPART_DIACRITIC as $chars) {
            self::assertSame('', $encoding->toUtf8($chars));
        }
    }

    /**
     * Decide whether a decomposed character should be round-trip tested.
     *
     * The decomposition qualifies when its first element passes ctype_alpha() and
     * every following element is one of the accepted combining characters.
     *
     * @param array<int,string> $chars     code points of the NFD form, base character first
     * @param array<int,string> $combining combining characters accepted after the base
     *
     * @return bool true when the character should be tested
     */
    private static function isRoundTripCandidate(array $chars, array $combining): bool
    {
        // A base character alone carries no diacritic to convert.
        if (count($chars) < 2 || !ctype_alpha($chars[0])) {
            return false;
        }

        // Every element after the base must be an accepted combining character,
        // however many of them the decomposition contains.
        foreach ($chars as $index => $char) {
            if ($index > 0 && !in_array($char, $combining, true)) {
                return false;
            }
        }

        return true;
    }
}
201