<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Tests\Encodings;

use Fisharebest\Webtrees\Encodings\ANSEL;
use Fisharebest\Webtrees\Encodings\UTF8;
use Normalizer;
use PHPUnit\Framework\TestCase;

use function bin2hex;
use function count;
use function ctype_alpha;
use function dechex;
use function in_array;
use function preg_split;
use function range;
use function str_repeat;
use function strlen;

use const PREG_SPLIT_NO_EMPTY;

/**
 * Tests for class ANSEL.
 */
class AnselTest extends TestCase
{
    /**
     * Conversion fixtures: UTF-8 text (key) => the ANSEL byte string (value).
     *
     * The same table is used in both directions, by testToUtf8() and testFromUtf8(),
     * so every entry must round-trip exactly.
     */
    private const TEST_DATA = [
        "\x00\x01\x02\x03\x04\x05\x06\x07" => "\x00\x01\x02\x03\x04\x05\x06\x07",
        "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" => "\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
        "\x10\x11\x12\x13\x14\x15\x16\x17" => "\x10\x11\x12\x13\x14\x15\x16\x17",
        "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F" => "\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
        ' !"#$%&\''                        => "\x20\x21\x22\x23\x24\x25\x26\x27",
        '()*+,-./'                         => "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F",
        '01234567'                         => "\x30\x31\x32\x33\x34\x35\x36\x37",
        '89:;<=>?'                         => "\x38\x39\x3A\x3B\x3C\x3D\x3E\x3F",
        '@ABCDEFG'                         => "\x40\x41\x42\x43\x44\x45\x46\x47",
        'HIJKLMNO'                         => "\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F",
        'PQRSTUVW'                         => "\x50\x51\x52\x53\x54\x55\x56\x57",
        'XYZ[\\]^_'                        => "\x58\x59\x5A\x5B\x5C\x5D\x5E\x5F",
        '`abcdefg'                         => "\x60\x61\x62\x63\x64\x65\x66\x67",
        'hijklmno'                         => "\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F",
        'pqrstuvw'                         => "\x70\x71\x72\x73\x74\x75\x76\x77",
        "xyz{|}~\x7F"                      => "\x78\x79\x7A\x7B\x7C\x7D\x7E\x7F",
        "\xC2\x98\xC2\x9C\xE2\x80\x8D\xE2\x80\x8C" => "\x88\x89\x8D\x8E",
        'ŁØĐÞÆŒʹ'                          => "\xA1\xA2\xA3\xA4\xA5\xA6\xA7",
        '·♭®±ƠƯʼ'                          => "\xA8\xA9\xAA\xAB\xAC\xAD\xAE",
        'ʻłøđþæœʺ'                         => "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7",
        'ı£ðơư'                            => "\xB8\xB9\xBA\xBC\xBD",
        '°ℓ℗©♯¿¡ẞ€'                        => "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8",
        // Combining diacritics
        'ảàáâãāăȧ'                         => "\xE0a\xE1a\xE2a\xE3a\xE4a\xE5a\xE6a\xE7a",
        'äǎåa͡a̕a̋a̐'                          => "\xE8a\xE9a\xEAa\xEBa\xEDa\xEEa\xEFa",
        'a̧ąạa̤ḁa̳a̲a̦'                         => "\xF0a\xF1a\xF2a\xF3a\xF4a\xF5a\xF6a\xF7a",
        'a̜a̮a͠a̓a̸'                            => "\xF8a\xF9a\xFAa\xFEa\xFFa",
        // Diacritics with non-ascii
        'ǣ'                                => "\xE5\xB5",
        // LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON
        'Ō̈'                                => "\xE5\xE8O",
        // LATIN CAPITAL LETTER O WITH MACRON AND DIAERESIS
        'Ȫ'                                => "\xE8\xE5O",
    ];

    /**
     * ANSEL byte strings in which every byte is expected to convert to
     * UTF8::REPLACEMENT_CHARACTER (see testUnprintable()).
     */
    private const UNPRINTABLE = [
        "\x80\x81\x82\x83\x84\x85\x86\x87",
        "\x8A\x8B\x8C\x8F",
        "\x90\x91\x92\x93\x94\x95\x96\x97",
        "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F",
        "\xA0",
        "\xAF",
        "\xBB",
        "\xC9\xCA\xCB\xCC\xCD\xCE",
        "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7",
        "\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF",
        "\xFC\xFD",
    ];

    /**
     * ANSEL bytes that are expected to convert to an empty string when they
     * appear on their own (see testMultiPartDiacritic()).
     */
    private const MULTIPART_DIACRITIC = ["\xEC", "\xFB"];

    /**
     * Round-trip pre-composed Latin letters through ANSEL.
     *
     * Every code point in the listed Latin blocks is decomposed (NFD). Those
     * that decompose to an ASCII letter followed by one or two of the listed
     * combining marks must survive UTF-8 -> ANSEL -> UTF-8 unchanged.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     * @covers \Fisharebest\Webtrees\Encodings\UTF8::chr
     *
     * @return void
     */
    public function testPreComposedCharacters(): void
    {
        $latin_code_blocks = [
            range(0x80, 0xFF),
            range(0x100, 0x17F),
            range(0x180, 0x24F),
            range(0x1E00, 0x1EFF),
        ];

        // Only used for membership tests, so the order and the repeated
        // REPLACEMENT_CHARACTER entries do not affect the result.
        $ansel_combining_characters = [
            UTF8::COMBINING_HOOK_ABOVE,
            UTF8::COMBINING_GRAVE_ACCENT,
            UTF8::COMBINING_ACUTE_ACCENT,
            UTF8::COMBINING_CIRCUMFLEX_ACCENT,
            UTF8::COMBINING_TILDE,
            UTF8::COMBINING_MACRON,
            UTF8::COMBINING_BREVE,
            UTF8::COMBINING_DOT_ABOVE,
            UTF8::COMBINING_DIAERESIS,
            UTF8::COMBINING_CARON,
            UTF8::COMBINING_RING_ABOVE,
            UTF8::COMBINING_DOUBLE_INVERTED_BREVE,
            UTF8::COMBINING_COMMA_ABOVE_RIGHT,
            UTF8::COMBINING_DOUBLE_ACUTE_ACCENT,
            UTF8::COMBINING_CANDRABINDU,
            UTF8::COMBINING_CEDILLA,
            UTF8::COMBINING_OGONEK,
            UTF8::COMBINING_DOT_BELOW,
            UTF8::COMBINING_DIAERESIS_BELOW,
            UTF8::COMBINING_RING_BELOW,
            UTF8::COMBINING_DOUBLE_LOW_LINE,
            UTF8::COMBINING_LOW_LINE,
            UTF8::COMBINING_COMMA_BELOW,
            UTF8::COMBINING_LEFT_HALF_RING_BELOW,
            UTF8::COMBINING_BREVE_BELOW,
            UTF8::COMBINING_DOUBLE_TILDE,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::REPLACEMENT_CHARACTER,
            UTF8::COMBINING_COMMA_ABOVE,
            UTF8::COMBINING_LONG_SOLIDUS_OVERLAY,
        ];

        $encoding = new ANSEL();

        foreach ($latin_code_blocks as $codes) {
            foreach ($codes as $code) {
                $utf8 = UTF8::chr($code);
                $norm = Normalizer::normalize($utf8, Normalizer::FORM_D);

                // Unchanged by NFD: not a pre-composed character.
                if ($norm === $utf8) {
                    continue;
                }

                $chars = preg_split('//u', $norm, -1, PREG_SPLIT_NO_EMPTY);

                // preg_split() returns false on failure; fail loudly rather than
                // indexing into a non-array below.
                self::assertIsArray($chars, 'U+' . dechex($code));

                // A decomposition without a second code point has no combining
                // mark to check, and $chars[1] would be an undefined offset.
                if (count($chars) < 2) {
                    continue;
                }

                // The base character must be an ASCII letter.
                if (!ctype_alpha($chars[0])) {
                    continue;
                }

                // The first (and, if present, second) mark must be in the list above.
                if (!in_array($chars[1], $ansel_combining_characters, true)) {
                    continue;
                }

                if (count($chars) >= 3 && !in_array($chars[2], $ansel_combining_characters, true)) {
                    continue;
                }

                self::assertSame($utf8, $encoding->toUtf8($encoding->fromUtf8($utf8)), 'U+' . dechex($code));
            }
        }
    }

    /**
     * Every ANSEL value in TEST_DATA must convert to its UTF-8 key.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testToUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $ansel) {
            // Convert once; the hex dump of both strings makes byte-level
            // differences visible in the failure message.
            $actual = $encoding->toUtf8($ansel);

            self::assertSame($utf8, $actual, bin2hex($utf8) . ' ' . bin2hex($actual));
        }
    }

    /**
     * Every UTF-8 key in TEST_DATA must convert to its ANSEL value.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testFromUtf8(): void
    {
        $encoding = new ANSEL();

        foreach (self::TEST_DATA as $utf8 => $other) {
            self::assertSame($other, $encoding->fromUtf8($utf8));
        }
    }

    /**
     * Each byte listed in UNPRINTABLE must convert to one replacement character.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testUnprintable(): void
    {
        $encoding = new ANSEL();

        foreach (self::UNPRINTABLE as $chars) {
            // strlen() counts bytes, which is what we want: one replacement per input byte.
            $expected = str_repeat(UTF8::REPLACEMENT_CHARACTER, strlen($chars));
            self::assertSame($expected, $encoding->toUtf8($chars));
        }
    }

    /**
     * Each byte listed in MULTIPART_DIACRITIC must convert to an empty string.
     *
     * @covers \Fisharebest\Webtrees\Encodings\AbstractEncoding
     * @covers \Fisharebest\Webtrees\Encodings\ANSEL
     *
     * @return void
     */
    public function testMultiPartDiacritic(): void
    {
        $encoding = new ANSEL();

        foreach (self::MULTIPART_DIACRITIC as $chars) {
            self::assertSame('', $encoding->toUtf8($chars));
        }
    }
}