<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2022 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Encodings;

use function chr;
use function intdiv;
use function ord;
use function str_split;
use function strlen;

/**
 * Convert between UTF-8 and a two-byte-per-character UTF-16 encoding.
 *
 * The byte order is not decided here: concrete classes provide it through
 * characterToCodePoint() and codePointToCharacter(), and they also supply
 * the encoded form of REPLACEMENT_CHARACTER.
 */
abstract class AbstractUTF16Encoding implements EncodingInterface
{
    // Concrete classes should implement this.
    public const REPLACEMENT_CHARACTER = '';

    /**
     * Convert a string from UTF-8 to another encoding.
     *
     * One- to three-byte UTF-8 sequences are decoded to a code-point and passed
     * to codePointToCharacter().  Anything else (stray continuation bytes,
     * truncated or malformed sequences, overlong forms, encoded surrogates and
     * lead bytes of four-byte sequences) produces static::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function fromUtf8(string $text): string
    {
        $out = '';
        $len = strlen($text);

        for ($n = 0; $n < $len; ++$n) {
            $code_point = ord($text[$n]);

            if ($code_point <= 0x7F) {
                // 1 byte => 7 bits
                $out .= $this->codePointToCharacter($code_point);
            } elseif ($code_point <= 0xBF) {
                // Invalid - a continuation byte without a lead byte
                $out .= static::REPLACEMENT_CHARACTER;
            } elseif ($code_point <= 0xDF) {
                // 2 bytes => 11 bits (5,6)
                // A missing byte is read as 0, which can never be a continuation byte.
                $byte2 = ++$n < $len ? ord($text[$n]) : 0;

                if (($byte2 & 0xC0) !== 0x80) {
                    // Invalid
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    // The parentheses matter: "+" binds tighter than "<<", which binds tighter than "&".
                    $decoded = (($code_point & 0x1F) << 6) | ($byte2 & 0x3F);

                    if ($decoded < 0x80) {
                        // Invalid - overlong encoding of a 7 bit character
                        $out .= static::REPLACEMENT_CHARACTER;
                    } else {
                        $out .= $this->codePointToCharacter($decoded);
                    }
                }
            } elseif ($code_point <= 0xEF) {
                // 3 bytes => 16 bits (4,6,6)
                $byte2 = ++$n < $len ? ord($text[$n]) : 0;
                $byte3 = ++$n < $len ? ord($text[$n]) : 0;

                if (($byte2 & 0xC0) !== 0x80 || ($byte3 & 0xC0) !== 0x80) {
                    // Invalid
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $decoded = (($code_point & 0x0F) << 12) | (($byte2 & 0x3F) << 6) | ($byte3 & 0x3F);

                    if ($decoded < 0x800 || ($decoded >= 0xD800 && $decoded <= 0xDFFF)) {
                        // Invalid - overlong encoding, or a surrogate (U+D800 - U+DFFF)
                        $out .= static::REPLACEMENT_CHARACTER;
                    } else {
                        $out .= $this->codePointToCharacter($decoded);
                    }
                }
            } else {
                // Invalid - code-points above U+FFFF do not fit in a single two-byte character
                $out .= static::REPLACEMENT_CHARACTER;
            }
        }

        return $out;
    }

    /**
     * Convert a string from another encoding to UTF-8.
     *
     * The input is read two bytes at a time; each pair is turned into a
     * code-point by characterToCodePoint() and then written as UTF-8.
     *
     * @param string $text
     *
     * @return string
     */
    public function toUtf8(string $text): string
    {
        // Before PHP 8.2, str_split('') gives [''] rather than [].
        if ($text === '') {
            return '';
        }

        $utf8 = '';

        foreach (str_split($text, 2) as $character) {
            if (strlen($character) !== 2) {
                // A trailing odd byte is not a complete character
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
                continue;
            }

            $code_point = $this->characterToCodePoint($character);

            if ($code_point <= 0x7F) {
                // 7 bits => 1 byte
                $utf8 .= chr($code_point);
            } elseif ($code_point <= 0xFF) {
                // U+80 - U+FF are invalid
                // NOTE(review): these are valid Unicode code-points; existing behaviour kept - confirm intent.
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            } elseif ($code_point <= 0x7FF) {
                // 11 bits (5,6) => 2 bytes
                $utf8 .= chr(0xC0 | ($code_point >> 6));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } elseif ($code_point <= 0xD7FF || $code_point >= 0xE000) {
                // 16 bits (4,6,6) => 3 bytes
                $utf8 .= chr(0xE0 | ($code_point >> 12));
                $utf8 .= chr(0x80 | (($code_point >> 6) & 0x3F));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } else {
                // U+D800 - U+DFFF are invalid
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            }
        }

        return $utf8;
    }

    /**
     * When reading multi-byte encodings using a stream, we must avoid incomplete characters.
     *
     * Returns the length of $text rounded down to a whole number of two-byte characters.
     *
     * @param string $text
     *
     * @return int
     */
    public function convertibleBytes(string $text): int
    {
        return 2 * intdiv(strlen($text), 2);
    }

    /**
     * Convert two bytes to a code-point, taking care of byte-order.
     *
     * @param string $character
     *
     * @return int
     */
    abstract protected function characterToCodePoint(string $character): int;

    /**
     * Convert a code-point to two bytes, taking care of byte-order.
     *
     * @param int $code_point
     *
     * @return string
     */
    abstract protected function codePointToCharacter(int $code_point): string;
}