xref: /webtrees/tests/app/Encodings/AnselTest.php (revision f01ab4ac305e1fac9efbeef65f5be51ced21e7a7)
11c6adce8SGreg Roach<?php
21c6adce8SGreg Roach
31c6adce8SGreg Roach/**
41c6adce8SGreg Roach * webtrees: online genealogy
51c6adce8SGreg Roach * Copyright (C) 2021 webtrees development team
61c6adce8SGreg Roach * This program is free software: you can redistribute it and/or modify
71c6adce8SGreg Roach * it under the terms of the GNU General Public License as published by
81c6adce8SGreg Roach * the Free Software Foundation, either version 3 of the License, or
91c6adce8SGreg Roach * (at your option) any later version.
101c6adce8SGreg Roach * This program is distributed in the hope that it will be useful,
111c6adce8SGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
121c6adce8SGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
131c6adce8SGreg Roach * GNU General Public License for more details.
141c6adce8SGreg Roach * You should have received a copy of the GNU General Public License
151c6adce8SGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
161c6adce8SGreg Roach */
171c6adce8SGreg Roach
181c6adce8SGreg Roachdeclare(strict_types=1);
191c6adce8SGreg Roach
201c6adce8SGreg Roachnamespace Fisharebest\Webtrees\Tests\Encodings;
211c6adce8SGreg Roach
221c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ANSEL;
231c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\UTF8;
241c6adce8SGreg Roachuse Normalizer;
251c6adce8SGreg Roachuse PHPUnit\Framework\TestCase;
261c6adce8SGreg Roach
271c6adce8SGreg Roachuse function bin2hex;
281c6adce8SGreg Roachuse function ctype_alpha;
291c6adce8SGreg Roachuse function dechex;
301c6adce8SGreg Roachuse function in_array;
311c6adce8SGreg Roachuse function preg_split;
321c6adce8SGreg Roachuse function range;
331c6adce8SGreg Roach
341c6adce8SGreg Roachuse const PREG_SPLIT_NO_EMPTY;
351c6adce8SGreg Roach
/**
 * Tests for class ANSEL.
 *
 * Checks conversion in both directions between ANSEL byte strings and UTF-8,
 * using fixed fixtures plus a generated sweep over pre-composed Latin letters.
 */
class AnselTest extends TestCase
{
    /**
     * Conversion fixtures: UTF-8 text (key) => ANSEL bytes (value).
     *
     * The same table is used by testToUtf8() and testFromUtf8(), so every pair
     * must convert in both directions.
     *
     * Keys must stay strings. PHP casts array keys that look like canonical
     * decimal integers to int, which would make the strict assertSame()
     * comparison in testToUtf8() fail; '01234567' is safe only because of its
     * leading zero.
     */
    private const TEST_DATA = [
        "\x00\x01\x02\x03\x04\x05\x06\x07"         => "\x00\x01\x02\x03\x04\x05\x06\x07",
        "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"         => "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
        "\x10\x11\x12\x13\x14\x15\x16\x17"         => "\x10\x11\x12\x13\x14\x15\x16\x17",
        "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"         => "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
        ' !"#$%&\''                                => "\x20\x21\x22\x23\x24\x25\x26\x27",
        '()*+,-./'                                 => "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F",
        '01234567'                                 => "\x30\x31\x32\x33\x34\x35\x36\x37",
        '89:;<=>?'                                 => "\x38\x39\x3A\x3B\x3C\x3D\x3E\x3F",
        '@ABCDEFG'                                 => "\x40\x41\x42\x43\x44\x45\x46\x47",
        'HIJKLMNO'                                 => "\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F",
        'PQRSTUVW'                                 => "\x50\x51\x52\x53\x54\x55\x56\x57",
        'XYZ[\\]^_'                                => "\x58\x59\x5A\x5B\x5C\x5D\x5E\x5F",
        '`abcdefg'                                 => "\x60\x61\x62\x63\x64\x65\x66\x67",
        'hijklmno'                                 => "\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F",
        'pqrstuvw'                                 => "\x70\x71\x72\x73\x74\x75\x76\x77",
        "xyz{|}~\x7F"                              => "\x78\x79\x7A\x7B\x7C\x7D\x7E\x7F",
        "\xC2\x98\xC2\x9C\xE2\x80\x8D\xE2\x80\x8C" => "\x88\x89\x8D\x8E",
        'ŁØĐÞÆŒʹ'                                  => "\xA1\xA2\xA3\xA4\xA5\xA6\xA7",
        '·♭®±ƠƯʼ'                                  => "\xA8\xA9\xAA\xAB\xAC\xAD\xAE",
        'ʻłøđþæœʺ'                                 => "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7",
        'ı£ðơư'                                    => "\xB8\xB9\xBA\xBC\xBD",
        '°ℓ℗©♯¿¡ẞ€'                                => "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8",
        // Combining diacritics: in ANSEL the diacritic byte precedes its base letter
        'ảàáâãāăȧ'                                 => "\xE0a\xE1a\xE2a\xE3a\xE4a\xE5a\xE6a\xE7a",
        'äǎåa͡a̕a̋a̐'                              => "\xE8a\xE9a\xEAa\xEBa\xEDa\xEEa\xEFa",
        'a̧ąạa̤ḁa̳a̲a̦'                            => "\xF0a\xF1a\xF2a\xF3a\xF4a\xF5a\xF6a\xF7a",
        'a̜a̮a͠a̓a̸'                               => "\xF8a\xF9a\xFAa\xFEa\xFFa",
        // Diacritics with non-ascii
        'ǣ'                                        => "\xE5\xB5",
        // O with macron, then diaeresis: ANSEL macron (\xE5), diaeresis (\xE8), base letter
        'Ō̈'                                       => "\xE5\xE8O",
        // U+022A LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON: ANSEL diaeresis (\xE8), macron (\xE5), base letter
        'Ȫ'                                        => "\xE8\xE5O",
    ];

    /**
     * ANSEL byte strings that testUnprintable() expects to decode to one
     * UTF8::REPLACEMENT_CHARACTER per input byte.
     */
    private const UNPRINTABLE = [
        "\x80\x81\x82\x83\x84\x85\x86\x87",
        "\x8A\x8B\x8C\x8F",
        "\x90\x91\x92\x93\x94\x95\x96\x97",
        "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F",
        "\xA0",
        "\xAF",
        "\xBB",
        "\xC9\xCA\xCB\xCC\xCD\xCE",
        "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7",
        "\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF",
        "\xFC\xFD",
    ];

    /**
     * ANSEL bytes that testMultiPartDiacritic() expects to decode to an empty string.
     *
     * NOTE(review): presumably the second halves of two-part diacritics
     * (ligature, double tilde) - confirm against the ANSEL code table.
     */
    private const MULTIPART_DIACRITIC = ["\xEC", "\xFB"];

    /**
     * Every pre-composed Latin letter that decomposes into an ASCII letter plus
     * diacritics known to this test must survive a UTF-8 -> ANSEL -> UTF-8 round trip.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     * @covers \Fisharebest\Webtrees\Encodings\UTF8::chr
     *
     * @return void
     */
    public function testPreComposedCharacters(): void
    {
        // Code point ranges to sweep.
        $latin_code_blocks = [
            range(0x80, 0xFF),
            range(0x100, 0x17F),
            range(0x180, 0x24F),
            range(0x1E00, 0x1EFF),
        ];

        // Combining marks accepted by the filter below.
        // NOTE(review): the REPLACEMENT_CHARACTER entries look like placeholders for
        // codes without a combining mark; they can never match NFD output - confirm intent.
        $ansel_combining_characters = [
            UTF8::COMBINING_HOOK_ABOVE,
            UTF8::COMBINING_GRAVE_ACCENT,
            UTF8::COMBINING_ACUTE_ACCENT,
            UTF8::COMBINING_CIRCUMFLEX_ACCENT,
            UTF8::COMBINING_TILDE,
            UTF8::COMBINING_MACRON,
            UTF8::COMBINING_BREVE,
            UTF8::COMBINING_DOT_ABOVE,
            UTF8::COMBINING_DIAERESIS,
            UTF8::COMBINING_CARON,
            UTF8::COMBINING_RING_ABOVE,
            UTF8::COMBINING_DOUBLE_INVERTED_BREVE,
            UTF8::COMBINING_COMMA_ABOVE_RIGHT,
            UTF8::COMBINING_DOUBLE_ACUTE_ACCENT,
            UTF8::COMBINING_CANDRABINDU,
            UTF8::COMBINING_CEDILLA,
            UTF8::COMBINING_OGONEK,
            UTF8::COMBINING_DOT_BELOW,
            UTF8::COMBINING_DIAERESIS_BELOW,
            UTF8::COMBINING_RING_BELOW,
            UTF8::COMBINING_DOUBLE_LOW_LINE,
            UTF8::COMBINING_LOW_LINE,
            UTF8::COMBINING_COMMA_BELOW,
            UTF8::COMBINING_LEFT_HALF_RING_BELOW,
            UTF8::COMBINING_BREVE_BELOW,
            UTF8::COMBINING_DOUBLE_TILDE,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::COMBINING_COMMA_ABOVE,
            UTF8::COMBINING_LONG_SOLIDUS_OVERLAY,
        ];

        $encoding = new ANSEL();

        foreach ($latin_code_blocks as $codes) {
            foreach ($codes as $code) {
                $utf8 = UTF8::chr($code);
                $norm = Normalizer::normalize($utf8, Normalizer::FORM_D);

                // Characters without a canonical decomposition are not pre-composed.
                if ($norm === $utf8) {
                    continue;
                }

                // Split the decomposed form into code points: base letter first, then marks.
                $chars = preg_split('//u', $norm, -1, PREG_SPLIT_NO_EMPTY);

                // Skip decompositions whose base is not a plain alphabetic character.
                if (!ctype_alpha($chars[0])) {
                    continue;
                }

                // Skip when the first mark is not in the accepted list.
                if (!in_array($chars[1], $ansel_combining_characters, true)) {
                    continue;
                }

                // Same check for a second mark, when there is one.
                if (count($chars) >= 3 && !in_array($chars[2], $ansel_combining_characters, true)) {
                    continue;
                }

                static::assertSame($utf8, $encoding->toUtf8($encoding->fromUtf8($utf8)), 'U+' . dechex($code));
            }
        }
    }

    /**
     * Each ANSEL fixture must decode to its UTF-8 counterpart.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testToUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $ansel) {
            // Convert once; the result feeds both the assertion and the failure message.
            $actual = $encoding->toUtf8($ansel);

            self::assertSame($utf8, $actual, bin2hex($utf8) . ' ' . bin2hex($actual));
        }
    }

    /**
     * Each UTF-8 fixture must encode to its ANSEL counterpart.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testFromUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $other) {
            $actual = $encoding->fromUtf8($utf8);

            // Hex dump of expected and actual bytes identifies the failing fixture.
            self::assertSame($other, $actual, bin2hex($other) . ' ' . bin2hex($actual));
        }
    }

    /**
     * Each byte listed in UNPRINTABLE must decode to one replacement character.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testUnprintable(): void
    {
        $encoding = new ANSEL();

        foreach (self::UNPRINTABLE as $chars) {
            // strlen() counts bytes, which is intended: one replacement per ANSEL byte.
            $expected = str_repeat(UTF8::REPLACEMENT_CHARACTER, strlen($chars));

            self::assertSame($expected, $encoding->toUtf8($chars), 'ANSEL bytes: ' . bin2hex($chars));
        }
    }

    /**
     * Each byte listed in MULTIPART_DIACRITIC must decode to an empty string.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testMultiPartDiacritic(): void
    {
        $encoding = new ANSEL();

        foreach (self::MULTIPART_DIACRITIC as $chars) {
            self::assertSame('', $encoding->toUtf8($chars), 'ANSEL bytes: ' . bin2hex($chars));
        }
    }
}
228