<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Encodings;

use function chr;
use function intdiv;
use function ord;
use function str_split;
use function strlen;

/**
 * Convert between an encoding and UTF-16.
 *
 * Only single 16-bit code units are handled: code points above U+FFFF
 * (UTF-8 four-byte sequences / UTF-16 surrogate pairs) are converted to
 * a replacement character in both directions.
 */
abstract class AbstractUTF16Encoding implements EncodingInterface
{
    // Concrete classes should override this with their own encoded replacement character.
    public const string REPLACEMENT_CHARACTER = '';

    /**
     * Convert a string from UTF-8 to another encoding.
     *
     * Malformed input (stray continuation bytes, truncated or overlong
     * sequences, encoded surrogates, four-byte sequences) is converted
     * to static::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function fromUtf8(string $text): string
    {
        $out = '';
        $len = strlen($text);

        for ($n = 0; $n < $len; ++$n) {
            $byte1 = ord($text[$n]);

            if ($byte1 <= 0x7F) {
                // 1 byte => 7 bits
                $out .= $this->codePointToCharacter($byte1);
            } elseif ($byte1 <= 0xBF) {
                // Invalid - a continuation byte without a lead byte
                $out .= static::REPLACEMENT_CHARACTER;
            } elseif ($byte1 <= 0xDF) {
                // 2 bytes => 11 bits (5,6)
                // A missing byte is read as 0, which fails the continuation-byte check below.
                $byte2 = ++$n < $len ? ord($text[$n]) : 0;

                // Parentheses are essential: "+" binds tighter than "<<", which binds tighter than "&".
                $code_point = (($byte1 & 0x1F) << 6) | ($byte2 & 0x3F);

                if (($byte2 & 0xC0) !== 0x80 || $code_point < 0x80) {
                    // Invalid - truncated, bad continuation byte, or overlong encoding
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $out .= $this->codePointToCharacter($code_point);
                }
            } elseif ($byte1 <= 0xEF) {
                // 3 bytes => 16 bits (4,6,6)
                $byte2 = ++$n < $len ? ord($text[$n]) : 0;
                $byte3 = ++$n < $len ? ord($text[$n]) : 0;

                $code_point = (($byte1 & 0x0F) << 12) | (($byte2 & 0x3F) << 6) | ($byte3 & 0x3F);

                if (($byte2 & 0xC0) !== 0x80 || ($byte3 & 0xC0) !== 0x80) {
                    // Invalid - truncated or bad continuation byte
                    $out .= static::REPLACEMENT_CHARACTER;
                } elseif ($code_point < 0x800 || ($code_point >= 0xD800 && $code_point <= 0xDFFF)) {
                    // Invalid - overlong encoding, or a surrogate (U+D800 - U+DFFF)
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $out .= $this->codePointToCharacter($code_point);
                }
            } else {
                // Invalid - lead bytes 0xF0 and above encode more than 16 bits
                $out .= static::REPLACEMENT_CHARACTER;
            }
        }

        return $out;
    }

    /**
     * Convert a string from another encoding to UTF-8.
     *
     * The input is processed two bytes at a time. Surrogates and an
     * incomplete trailing character are converted to UTF8::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function toUtf8(string $text): string
    {
        $utf8 = '';

        if ($text === '') {
            return $utf8;
        }

        foreach (str_split($text, 2) as $character) {
            if (strlen($character) < 2) {
                // Odd-length input - the final character is incomplete
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
                continue;
            }

            $code_point = $this->characterToCodePoint($character);

            if ($code_point <= 0x7F) {
                // 7 bits => 1 byte
                $utf8 .= chr($code_point);
            } elseif ($code_point <= 0x7FF) {
                // 11 bits (5,6) => 2 bytes.  This includes U+0080 - U+00FF (Latin-1 supplement).
                $utf8 .= chr(0xC0 | ($code_point >> 6));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } elseif ($code_point <= 0xD7FF || $code_point >= 0xE000) {
                // 16 bits (4,6,6) => 3 bytes
                $utf8 .= chr(0xE0 | ($code_point >> 12));
                $utf8 .= chr(0x80 | (($code_point >> 6) & 0x3F));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } else {
                // U+D800 - U+DFFF are invalid
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            }
        }

        return $utf8;
    }

    /**
     * When reading multi-byte encodings using a stream, we must avoid incomplete characters.
     *
     * Returns the length of $text rounded down to a multiple of two bytes.
     *
     * @param string $text
     *
     * @return int
     */
    public function convertibleBytes(string $text): int
    {
        return 2 * intdiv(strlen($text), 2);
    }

    /**
     * Convert two bytes to a code-point, taking care of byte-order.
     *
     * @param string $character
     *
     * @return int
     */
    abstract protected function characterToCodePoint(string $character): int;

    /**
     * Convert a code-point to two bytes, taking care of byte-order.
     *
     * @param int $code_point
     *
     * @return string
     */
    abstract protected function codePointToCharacter(int $code_point): string;
}