<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Tests\Encodings;

use Fisharebest\Webtrees\Encodings\AbstractEncoding;
use Fisharebest\Webtrees\Encodings\ANSEL;
use Fisharebest\Webtrees\Encodings\UTF8;
use Normalizer;
use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\TestCase;

use function bin2hex;
use function count;
use function ctype_alpha;
use function dechex;
use function in_array;
use function preg_split;
use function range;
use function str_repeat;
use function strlen;

use const PREG_SPLIT_NO_EMPTY;

/**
 * Tests for the ANSEL <-> UTF-8 conversion.
 */
#[CoversClass(AbstractEncoding::class)]
#[CoversClass(ANSEL::class)]
#[CoversClass(UTF8::class)]
class AnselTest extends TestCase
{
    /**
     * Conversion table used in both directions: UTF-8 text (key) => ANSEL bytes (value).
     *
     * Keys must never be canonical decimal strings (e.g. '123'): PHP would cast
     * them to integer keys, and the tests pass the keys on as strings under
     * strict_types. '01234567' is safe because of its leading zero.
     */
    private const TEST_DATA = [
        "\x00\x01\x02\x03\x04\x05\x06\x07" => "\x00\x01\x02\x03\x04\x05\x06\x07",
        "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" => "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
        "\x10\x11\x12\x13\x14\x15\x16\x17" => "\x10\x11\x12\x13\x14\x15\x16\x17",
        "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F" => "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
        ' !"#$%&\'' => "\x20\x21\x22\x23\x24\x25\x26\x27",
        '()*+,-./' => "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F",
        '01234567' => "\x30\x31\x32\x33\x34\x35\x36\x37",
        '89:;<=>?' => "\x38\x39\x3A\x3B\x3C\x3D\x3E\x3F",
        '@ABCDEFG' => "\x40\x41\x42\x43\x44\x45\x46\x47",
        'HIJKLMNO' => "\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F",
        'PQRSTUVW' => "\x50\x51\x52\x53\x54\x55\x56\x57",
        'XYZ[\\]^_' => "\x58\x59\x5A\x5B\x5C\x5D\x5E\x5F",
        '`abcdefg' => "\x60\x61\x62\x63\x64\x65\x66\x67",
        'hijklmno' => "\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F",
        'pqrstuvw' => "\x70\x71\x72\x73\x74\x75\x76\x77",
        "xyz{|}~\x7F" => "\x78\x79\x7A\x7B\x7C\x7D\x7E\x7F",
        "\xC2\x98\xC2\x9C\xE2\x80\x8D\xE2\x80\x8C" => "\x88\x89\x8D\x8E",
        'ŁØĐÞÆŒʹ' => "\xA1\xA2\xA3\xA4\xA5\xA6\xA7",
        '·♭®±ƠƯʼ' => "\xA8\xA9\xAA\xAB\xAC\xAD\xAE",
        'ʻłøđþæœʺ' => "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7",
        'ı£ðơư' => "\xB8\xB9\xBA\xBC\xBD",
        '°ℓ℗©♯¿¡ẞ€' => "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8",
        // Combining diacritics: in ANSEL the diacritic byte precedes the base letter.
        'ảàáâãāăȧ' => "\xE0a\xE1a\xE2a\xE3a\xE4a\xE5a\xE6a\xE7a",
        'äǎåa͡a̕a̋a̐' => "\xE8a\xE9a\xEAa\xEBa\xEDa\xEEa\xEFa",
        'a̧ąạa̤ḁa̳a̲a̦' => "\xF0a\xF1a\xF2a\xF3a\xF4a\xF5a\xF6a\xF7a",
        'a̜a̮a͠a̓a̸' => "\xF8a\xF9a\xFAa\xFEa\xFFa",
        // Diacritics with non-ascii
        'ǣ' => "\xE5\xB5",
        // Stacked diacritics: ANSEL macron (\xE5), then diaeresis (\xE8), then the base letter O.
        'Ō̈' => "\xE5\xE8O",
        // Stacked diacritics: ANSEL diaeresis (\xE8), then macron (\xE5), then the base letter O.
        'Ȫ' => "\xE8\xE5O",
    ];

    /**
     * ANSEL byte strings in which every byte is expected to convert to the
     * UTF-8 replacement character.
     */
    private const UNPRINTABLE = [
        "\x80\x81\x82\x83\x84\x85\x86\x87",
        "\x8A\x8B\x8C\x8F",
        "\x90\x91\x92\x93\x94\x95\x96\x97",
        "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F",
        "\xA0",
        "\xAF",
        "\xBB",
        "\xC9\xCA\xCB\xCC\xCD\xCE",
        "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7",
        "\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF",
        "\xFC\xFD",
    ];

    /**
     * ANSEL bytes that are expected to convert to an empty string when they
     * appear on their own.
     */
    private const MULTIPART_DIACRITIC = ["\xEC", "\xFB"];

    /**
     * Round-trip (UTF-8 -> ANSEL -> UTF-8) every pre-composed character in the
     * listed Latin code blocks whose canonical decomposition is a base
     * character accepted by ctype_alpha() followed by combining marks from
     * the list below.
     */
    public function testPreComposedCharacters(): void
    {
        $latin_code_blocks = [
            range(0x80, 0xFF),
            range(0x100, 0x17F),
            range(0x180, 0x24F),
            range(0x1E00, 0x1EFF),
        ];

        // NOTE(review): the two REPLACEMENT_CHARACTER entries look like placeholders
        // for unassigned ANSEL positions - confirm. They are harmless to the
        // membership checks below.
        $ansel_combining_characters = [
            UTF8::COMBINING_HOOK_ABOVE,
            UTF8::COMBINING_GRAVE_ACCENT,
            UTF8::COMBINING_ACUTE_ACCENT,
            UTF8::COMBINING_CIRCUMFLEX_ACCENT,
            UTF8::COMBINING_TILDE,
            UTF8::COMBINING_MACRON,
            UTF8::COMBINING_BREVE,
            UTF8::COMBINING_DOT_ABOVE,
            UTF8::COMBINING_DIAERESIS,
            UTF8::COMBINING_CARON,
            UTF8::COMBINING_RING_ABOVE,
            UTF8::COMBINING_DOUBLE_INVERTED_BREVE,
            UTF8::COMBINING_COMMA_ABOVE_RIGHT,
            UTF8::COMBINING_DOUBLE_ACUTE_ACCENT,
            UTF8::COMBINING_CANDRABINDU,
            UTF8::COMBINING_CEDILLA,
            UTF8::COMBINING_OGONEK,
            UTF8::COMBINING_DOT_BELOW,
            UTF8::COMBINING_DIAERESIS_BELOW,
            UTF8::COMBINING_RING_BELOW,
            UTF8::COMBINING_DOUBLE_LOW_LINE,
            UTF8::COMBINING_LOW_LINE,
            UTF8::COMBINING_COMMA_BELOW,
            UTF8::COMBINING_LEFT_HALF_RING_BELOW,
            UTF8::COMBINING_BREVE_BELOW,
            UTF8::COMBINING_DOUBLE_TILDE,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::COMBINING_COMMA_ABOVE,
            UTF8::COMBINING_LONG_SOLIDUS_OVERLAY,
        ];

        $encoding = new ANSEL();

        foreach ($latin_code_blocks as $codes) {
            foreach ($codes as $code) {
                $utf8 = UTF8::chr($code);
                $norm = Normalizer::normalize($utf8, Normalizer::FORM_D);

                // Only characters that actually decompose are of interest.
                if ($norm !== $utf8) {
                    // Split the decomposed form into individual code points.
                    $chars = preg_split('//u', $norm, -1, PREG_SPLIT_NO_EMPTY);

                    // Skip characters whose base is not accepted by ctype_alpha().
                    if (!ctype_alpha($chars[0])) {
                        continue;
                    }

                    // Skip characters whose first combining mark is not in the list.
                    if (!in_array($chars[1], $ansel_combining_characters, true)) {
                        continue;
                    }

                    // Likewise for a second combining mark, when there is one.
                    if (count($chars) >= 3 && !in_array($chars[2], $ansel_combining_characters, true)) {
                        continue;
                    }

                    self::assertSame($utf8, $encoding->toUtf8($encoding->fromUtf8($utf8)), 'U+' . dechex($code));
                }
            }
        }
    }

    /**
     * Every ANSEL value in TEST_DATA must convert to its UTF-8 key.
     */
    public function testToUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $ansel) {
            // Convert once, and reuse the result for the hex dump in the failure message.
            $actual = $encoding->toUtf8($ansel);

            self::assertSame($utf8, $actual, bin2hex($utf8) . ' ' . bin2hex($actual));
        }
    }

    /**
     * Every UTF-8 key in TEST_DATA must convert to its ANSEL value.
     */
    public function testFromUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $other) {
            self::assertSame($other, $encoding->fromUtf8($utf8));
        }
    }

    /**
     * Each byte of an UNPRINTABLE sequence must become one replacement character.
     */
    public function testUnprintable(): void
    {
        $encoding = new ANSEL();

        foreach (self::UNPRINTABLE as $chars) {
            // strlen() counts bytes, which is what we want: one replacement per ANSEL byte.
            $expected = str_repeat(UTF8::REPLACEMENT_CHARACTER, strlen($chars));
            self::assertSame($expected, $encoding->toUtf8($chars));
        }
    }

    /**
     * A multi-part diacritic byte on its own must convert to an empty string.
     */
    public function testMultiPartDiacritic(): void
    {
        $encoding = new ANSEL();

        foreach (self::MULTIPART_DIACRITIC as $chars) {
            self::assertSame('', $encoding->toUtf8($chars));
        }
    }
}