xref: /webtrees/app/Encodings/AbstractUTF16Encoding.php (revision 1ff45046fabc22237b5d0d8e489c96f031fc598d)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2023 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Encodings;
21
22use function chr;
23use function intdiv;
24use function ord;
25use function str_split;
26use function strlen;
27
/**
 * Base class for the UTF-16 encodings: converts text between UTF-8 and UTF-16.
 *
 * Every UTF-16 character is treated as a single two-byte code unit. Byte order
 * is delegated to the concrete class through characterToCodePoint() and
 * codePointToCharacter(). Surrogates (and therefore characters outside the
 * Basic Multilingual Plane) are not converted; they become replacement characters.
 */
abstract class AbstractUTF16Encoding implements EncodingInterface
{
    // Concrete classes should implement this.
    public const REPLACEMENT_CHARACTER = '';

    /**
     * Convert a string from UTF-8 to another encoding.
     *
     * Valid one-, two- and three-byte UTF-8 sequences are decoded to a code-point
     * and passed to codePointToCharacter(). Anything else (stray continuation
     * bytes, truncated sequences, overlong encodings, encoded surrogates and
     * lead bytes of four-byte sequences) produces static::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function fromUtf8(string $text): string
    {
        $out = '';
        $len = strlen($text);

        for ($n = 0; $n < $len; ++$n) {
            $lead = ord($text[$n]);

            if ($lead <= 0x7F) {
                // 1 byte: 0xxxxxxx
                $out .= $this->codePointToCharacter($lead);
            } elseif ($lead <= 0xBF) {
                // Invalid: a continuation byte without a lead byte.
                $out .= static::REPLACEMENT_CHARACTER;
            } elseif ($lead <= 0xDF) {
                // 2 bytes: 110xxxxx 10xxxxxx
                // A missing byte is read as 0, which fails the continuation-byte test below.
                $byte2 = ++$n < $len ? ord($text[$n]) : 0;

                // Explicit parentheses are essential: "+" binds tighter than "<<", which binds tighter than "&".
                $code_point = (($lead & 0x1F) << 6) | ($byte2 & 0x3F);

                if (($byte2 & 0xC0) !== 0x80 || $code_point < 0x80) {
                    // Invalid: truncated/malformed sequence, or an overlong encoding.
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $out .= $this->codePointToCharacter($code_point);
                }
            } elseif ($lead <= 0xEF) {
                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                // Missing bytes are read as 0, which fails the continuation-byte test below.
                $byte2 = ++$n < $len ? ord($text[$n]) : 0;
                $byte3 = ++$n < $len ? ord($text[$n]) : 0;

                $code_point = (($lead & 0x0F) << 12) | (($byte2 & 0x3F) << 6) | ($byte3 & 0x3F);

                if (($byte2 & 0xC0) !== 0x80 || ($byte3 & 0xC0) !== 0x80) {
                    // Invalid: truncated or malformed sequence.
                    $out .= static::REPLACEMENT_CHARACTER;
                } elseif ($code_point < 0x800 || ($code_point >= 0xD800 && $code_point <= 0xDFFF)) {
                    // Invalid: overlong encoding, or a surrogate encoded as UTF-8.
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $out .= $this->codePointToCharacter($code_point);
                }
            } else {
                // Invalid: lead bytes 0xF0 and above are not converted. Any continuation
                // bytes that follow are each handled by the "<= 0xBF" branch above.
                $out .= static::REPLACEMENT_CHARACTER;
            }
        }

        return $out;
    }

    /**
     * Convert a string from another encoding to UTF-8.
     *
     * The input is read two bytes at a time; each pair is turned into a code-point
     * by characterToCodePoint(). Surrogates and an incomplete trailing byte
     * produce UTF8::REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function toUtf8(string $text): string
    {
        $utf8 = '';

        // Before PHP 8.2, str_split('') returns [''] rather than [], which would
        // hand an empty "character" to characterToCodePoint().
        if ($text === '') {
            return $utf8;
        }

        foreach (str_split($text, 2) as $character) {
            if (strlen($character) !== 2) {
                // Odd-length input: the final byte is not a complete code unit.
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
                continue;
            }

            $code_point = $this->characterToCodePoint($character);

            if ($code_point <= 0x7F) {
                // 7 bits => 1 byte
                $utf8 .= chr($code_point);
            } elseif ($code_point <= 0xFF) {
                // U+80 - U+FF are treated as invalid.
                // NOTE(review): these are valid Unicode code-points (Latin-1 Supplement);
                // confirm that rejecting them is intentional.
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            } elseif ($code_point <= 0x7FF) {
                // 11 bits (5,6) => 2 bytes
                $utf8 .= chr(0xC0 | ($code_point >> 6));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } elseif ($code_point <= 0xD7FF || $code_point >= 0xE000) {
                // 16 bits (4,6,6) => 3 bytes
                $utf8 .= chr(0xE0 | ($code_point >> 12));
                $utf8 .= chr(0x80 | (($code_point >> 6) & 0x3F));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } else {
                // U+D800 - U+DFFF (surrogates) are invalid
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            }
        }

        return $utf8;
    }

    /**
     * When reading multi-byte encodings using a stream, we must avoid incomplete characters.
     *
     * Returns the length of $text rounded down to a whole number of two-byte units.
     *
     * @param string $text
     *
     * @return int
     */
    public function convertibleBytes(string $text): int
    {
        return 2 * intdiv(strlen($text), 2);
    }

    /**
     * Convert two bytes to a code-point, taking care of byte-order.
     *
     * @param string $character
     *
     * @return int
     */
    abstract protected function characterToCodePoint(string $character): int;

    /**
     * Convert a code-point to two bytes, taking care of byte-order.
     *
     * @param int $code_point
     *
     * @return string
     */
    abstract protected function codePointToCharacter(int $code_point): string;
}
152