<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Tests\Encodings;

use Fisharebest\Webtrees\Encodings\ANSEL;
use Fisharebest\Webtrees\Encodings\UTF8;
use Normalizer;
use PHPUnit\Framework\TestCase;

use function bin2hex;
use function count;
use function ctype_alpha;
use function dechex;
use function in_array;
use function preg_split;
use function range;
use function str_repeat;
use function strlen;

use const PREG_SPLIT_NO_EMPTY;

/**
 * Tests for class ANSEL.
 */
class AnselTest extends TestCase
{
    /**
     * Conversion fixtures: UTF-8 text (key) => the ANSEL byte sequence (value)
     * that the tests expect it to convert to, and back from.
     *
     * In the ANSEL values a diacritic byte (0xE0-0xFF) is written before the
     * base letter it modifies.
     */
    private const TEST_DATA = [
        "\x00\x01\x02\x03\x04\x05\x06\x07" => "\x00\x01\x02\x03\x04\x05\x06\x07",
        "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" => "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
        "\x10\x11\x12\x13\x14\x15\x16\x17" => "\x10\x11\x12\x13\x14\x15\x16\x17",
        "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F" => "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
        ' !"#$%&\'' => "\x20\x21\x22\x23\x24\x25\x26\x27",
        '()*+,-./' => "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F",
        '01234567' => "\x30\x31\x32\x33\x34\x35\x36\x37",
        '89:;<=>?' => "\x38\x39\x3A\x3B\x3C\x3D\x3E\x3F",
        '@ABCDEFG' => "\x40\x41\x42\x43\x44\x45\x46\x47",
        'HIJKLMNO' => "\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F",
        'PQRSTUVW' => "\x50\x51\x52\x53\x54\x55\x56\x57",
        'XYZ[\\]^_' => "\x58\x59\x5A\x5B\x5C\x5D\x5E\x5F",
        '`abcdefg' => "\x60\x61\x62\x63\x64\x65\x66\x67",
        'hijklmno' => "\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F",
        'pqrstuvw' => "\x70\x71\x72\x73\x74\x75\x76\x77",
        "xyz{|}~\x7F" => "\x78\x79\x7A\x7B\x7C\x7D\x7E\x7F",
        "\xC2\x98\xC2\x9C\xE2\x80\x8D\xE2\x80\x8C" => "\x88\x89\x8D\x8E",
        'ŁØĐÞÆŒʹ' => "\xA1\xA2\xA3\xA4\xA5\xA6\xA7",
        '·♭®±ƠƯʼ' => "\xA8\xA9\xAA\xAB\xAC\xAD\xAE",
        'ʻłøđþæœʺ' => "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7",
        'ı£ðơư' => "\xB8\xB9\xBA\xBC\xBD",
        '°ℓ℗©♯¿¡ẞ€' => "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8",
        // Combining diacritics
        'ảàáâãāăȧ' => "\xE0a\xE1a\xE2a\xE3a\xE4a\xE5a\xE6a\xE7a",
        'äǎåa͡a̕a̋a̐' => "\xE8a\xE9a\xEAa\xEBa\xEDa\xEEa\xEFa",
        'a̧ąạa̤ḁa̳a̲a̦' => "\xF0a\xF1a\xF2a\xF3a\xF4a\xF5a\xF6a\xF7a",
        'a̜a̮a͠a̓a̸' => "\xF8a\xF9a\xFAa\xFEa\xFFa",
        // Diacritics with non-ascii
        'ǣ' => "\xE5\xB5",
        // Capital O with macron (0xE5) and then diaeresis (0xE8)
        'Ō̈' => "\xE5\xE8O",
        // U+022A LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON: diaeresis (0xE8), then macron (0xE5)
        'Ȫ' => "\xE8\xE5O",
    ];

    /**
     * Byte sequences for which every byte is expected to convert to
     * UTF8::REPLACEMENT_CHARACTER.
     */
    private const UNPRINTABLE = [
        "\x80\x81\x82\x83\x84\x85\x86\x87",
        "\x8A\x8B\x8C\x8F",
        "\x90\x91\x92\x93\x94\x95\x96\x97",
        "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F",
        "\xA0",
        "\xAF",
        "\xBB",
        "\xC9\xCA\xCB\xCC\xCD\xCE",
        "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7",
        "\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF",
        "\xFC\xFD",
    ];

    /**
     * Bytes that are expected to convert to an empty string when they appear on their own.
     */
    private const MULTIPART_DIACRITIC = ["\xEC", "\xFB"];

    /**
     * Round-trip pre-composed Latin letters through ANSEL and back.
     *
     * Only code points whose canonical decomposition (NFD) is an ASCII letter
     * followed by combining marks from the list below are checked; all other
     * code points are skipped.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     * @covers \Fisharebest\Webtrees\Encodings\UTF8::chr
     *
     * @return void
     */
    public function testPreComposedCharacters(): void
    {
        $latin_code_blocks = [
            range(0x80, 0xFF),
            range(0x100, 0x17F),
            range(0x180, 0x24F),
            range(0x1E00, 0x1EFF),
        ];

        // Combining marks accepted by this test when filtering decompositions.
        $ansel_combining_characters = [
            UTF8::COMBINING_HOOK_ABOVE,
            UTF8::COMBINING_GRAVE_ACCENT,
            UTF8::COMBINING_ACUTE_ACCENT,
            UTF8::COMBINING_CIRCUMFLEX_ACCENT,
            UTF8::COMBINING_TILDE,
            UTF8::COMBINING_MACRON,
            UTF8::COMBINING_BREVE,
            UTF8::COMBINING_DOT_ABOVE,
            UTF8::COMBINING_DIAERESIS,
            UTF8::COMBINING_CARON,
            UTF8::COMBINING_RING_ABOVE,
            UTF8::COMBINING_DOUBLE_INVERTED_BREVE,
            UTF8::COMBINING_COMMA_ABOVE_RIGHT,
            UTF8::COMBINING_DOUBLE_ACUTE_ACCENT,
            UTF8::COMBINING_CANDRABINDU,
            UTF8::COMBINING_CEDILLA,
            UTF8::COMBINING_OGONEK,
            UTF8::COMBINING_DOT_BELOW,
            UTF8::COMBINING_DIAERESIS_BELOW,
            UTF8::COMBINING_RING_BELOW,
            UTF8::COMBINING_DOUBLE_LOW_LINE,
            UTF8::COMBINING_LOW_LINE,
            UTF8::COMBINING_COMMA_BELOW,
            UTF8::COMBINING_LEFT_HALF_RING_BELOW,
            UTF8::COMBINING_BREVE_BELOW,
            UTF8::COMBINING_DOUBLE_TILDE,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::COMBINING_COMMA_ABOVE,
            UTF8::COMBINING_LONG_SOLIDUS_OVERLAY,
        ];

        $encoding = new ANSEL();

        foreach ($latin_code_blocks as $codes) {
            foreach ($codes as $code) {
                $utf8 = UTF8::chr($code);
                $norm = Normalizer::normalize($utf8, Normalizer::FORM_D);

                // Characters without a canonical decomposition are not pre-composed.
                if ($norm === $utf8) {
                    continue;
                }

                // Split the decomposed form into individual code points.
                $chars = preg_split('//u', $norm, -1, PREG_SPLIT_NO_EMPTY);

                // The base character must be an ASCII letter.
                if (!ctype_alpha($chars[0])) {
                    continue;
                }

                // The first (and, when present, second) combining mark must be in the accepted list.
                if (!in_array($chars[1], $ansel_combining_characters, true)) {
                    continue;
                }

                if (count($chars) >= 3 && !in_array($chars[2], $ansel_combining_characters, true)) {
                    continue;
                }

                self::assertSame($utf8, $encoding->toUtf8($encoding->fromUtf8($utf8)), 'U+' . dechex($code));
            }
        }
    }

    /**
     * Each ANSEL fixture must convert to its UTF-8 counterpart.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testToUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $ansel) {
            // Convert once; the result is used both for the assertion and for the failure message.
            $actual = $encoding->toUtf8($ansel);

            self::assertSame($utf8, $actual, bin2hex($utf8) . ' ' . bin2hex($actual));
        }
    }

    /**
     * Each UTF-8 fixture must convert to its ANSEL counterpart.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testFromUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $other) {
            self::assertSame($other, $encoding->fromUtf8($utf8));
        }
    }

    /**
     * Every byte listed in UNPRINTABLE must convert to one replacement character.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testUnprintable(): void
    {
        $encoding = new ANSEL();

        foreach (self::UNPRINTABLE as $chars) {
            // strlen() is deliberate: the fixtures are raw single-byte ANSEL data.
            $expected = str_repeat(UTF8::REPLACEMENT_CHARACTER, strlen($chars));
            self::assertSame($expected, $encoding->toUtf8($chars));
        }
    }

    /**
     * A lone multi-part diacritic byte must convert to an empty string.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testMultiPartDiacritic(): void
    {
        $encoding = new ANSEL();

        foreach (self::MULTIPART_DIACRITIC as $chars) {
            self::assertSame('', $encoding->toUtf8($chars));
        }
    }
}