xref: /webtrees/app/Encodings/AbstractUTF16Encoding.php (revision e873f434551745f888937263ff89e80db3b0f785)
11c6adce8SGreg Roach<?php
21c6adce8SGreg Roach
31c6adce8SGreg Roach/**
41c6adce8SGreg Roach * webtrees: online genealogy
5d11be702SGreg Roach * Copyright (C) 2023 webtrees development team
61c6adce8SGreg Roach * This program is free software: you can redistribute it and/or modify
71c6adce8SGreg Roach * it under the terms of the GNU General Public License as published by
81c6adce8SGreg Roach * the Free Software Foundation, either version 3 of the License, or
91c6adce8SGreg Roach * (at your option) any later version.
101c6adce8SGreg Roach * This program is distributed in the hope that it will be useful,
111c6adce8SGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
121c6adce8SGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
131c6adce8SGreg Roach * GNU General Public License for more details.
141c6adce8SGreg Roach * You should have received a copy of the GNU General Public License
151c6adce8SGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
161c6adce8SGreg Roach */
171c6adce8SGreg Roach
181c6adce8SGreg Roachdeclare(strict_types=1);
191c6adce8SGreg Roach
201c6adce8SGreg Roachnamespace Fisharebest\Webtrees\Encodings;
211c6adce8SGreg Roach
221c6adce8SGreg Roachuse function chr;
231c6adce8SGreg Roachuse function intdiv;
241c6adce8SGreg Roachuse function ord;
251c6adce8SGreg Roachuse function str_split;
261c6adce8SGreg Roachuse function strlen;
271c6adce8SGreg Roach
281c6adce8SGreg Roach/**
291c6adce8SGreg Roach * Convert between an encoding and UTF-16.
301c6adce8SGreg Roach */
abstract class AbstractUTF16Encoding implements EncodingInterface
{
    // The two-byte sequence emitted by fromUtf8() for input that cannot be converted.
    // Concrete classes should override this (the byte order differs between them).
    public const string REPLACEMENT_CHARACTER = '';

    /**
     * Convert a string from UTF-8 to another encoding.
     *
     * The input is decoded one UTF-8 sequence at a time.  One and two and three
     * byte sequences are passed to codePointToCharacter().  Stray continuation
     * bytes, malformed or truncated sequences, and lead bytes of four-byte (or
     * longer) sequences each produce one REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function fromUtf8(string $text): string
    {
        $out = '';
        $len = strlen($text);

        for ($n = 0; $n < $len; ++$n) {
            $byte1 = ord($text[$n]);

            if ($byte1 <= 0x7F) {
                // 0xxxxxxx => 7 bits
                $out .= $this->codePointToCharacter($byte1);
            } elseif ($byte1 <= 0xBF) {
                // 10xxxxxx => a continuation byte without a lead byte is invalid
                $out .= static::REPLACEMENT_CHARACTER;
            } elseif ($byte1 <= 0xDF) {
                // 110xxxxx 10xxxxxx => 11 bits (5,6)
                // A missing byte (truncated input) is read as 0, which fails the
                // continuation-byte test below, without reading past the end of the string.
                $byte2 = ++$n < $len ? ord($text[$n]) : 0;

                if (($byte2 & 0xC0) !== 0x80) {
                    // Invalid
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    // Strip the marker bits from each byte before combining.  The explicit
                    // parentheses matter: in PHP "+" binds tighter than "<<", and "<<" binds
                    // tighter than "&".
                    $out .= $this->codePointToCharacter((($byte1 & 0x1F) << 6) | ($byte2 & 0x3F));
                }
            } elseif ($byte1 <= 0xEF) {
                // 1110xxxx 10xxxxxx 10xxxxxx => 16 bits (4,6,6)
                $byte2 = ++$n < $len ? ord($text[$n]) : 0;
                $byte3 = ++$n < $len ? ord($text[$n]) : 0;

                if (($byte2 & 0xC0) !== 0x80 || ($byte3 & 0xC0) !== 0x80) {
                    // Invalid
                    $out .= static::REPLACEMENT_CHARACTER;
                } else {
                    $code_point = (($byte1 & 0x0F) << 12) | (($byte2 & 0x3F) << 6) | ($byte3 & 0x3F);

                    $out .= $this->codePointToCharacter($code_point);
                }
            } else {
                // 11110xxx and above => code-points that do not fit in a single two-byte unit
                $out .= static::REPLACEMENT_CHARACTER;
            }
        }

        return $out;
    }

    /**
     * Convert a string from another encoding to UTF-8.
     *
     * The input is split into two-byte units, and each unit is converted
     * independently using characterToCodePoint().
     *
     * @param string $text
     *
     * @return string
     */
    public function toUtf8(string $text): string
    {
        $utf8 = '';

        foreach (str_split($text, 2) as $character) {
            $code_point = $this->characterToCodePoint($character);

            if ($code_point <= 0x7F) {
                // 7 bits => 1 byte
                $utf8 .= chr($code_point);
            } elseif ($code_point <= 0xFF) {
                // U+80 - U+FF are invalid
                // NOTE(review): these are ordinary code-points in UTF-16 (e.g. U+00E9);
                // confirm that rejecting them here is intentional.
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            } elseif ($code_point <= 0x7FF) {
                // 11 bits (5,6) => 2 bytes
                $utf8 .= chr(0xC0 | ($code_point >> 6));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } elseif ($code_point <= 0xD7FF || $code_point >= 0xE000) {
                // 16 bits (4,6,6) => 3 bytes
                $utf8 .= chr(0xE0 | ($code_point >> 12));
                $utf8 .= chr(0x80 | (($code_point >> 6) & 0x3F));
                $utf8 .= chr(0x80 | ($code_point & 0x3F));
            } else {
                // U+D800 - U+DFFF are invalid
                $utf8 .= UTF8::REPLACEMENT_CHARACTER;
            }
        }

        return $utf8;
    }

    /**
     * When reading multi-byte encodings using a stream, we must avoid incomplete characters.
     *
     * Every character is two bytes long, so this is the length of the text,
     * rounded down to an even number.
     *
     * @param string $text
     *
     * @return int
     */
    public function convertibleBytes(string $text): int
    {
        return 2 * intdiv(strlen($text), 2);
    }

    /**
     * Convert two bytes to a code-point, taking care of byte-order.
     *
     * @param string $character
     *
     * @return int
     */
    abstract protected function characterToCodePoint(string $character): int;

    /**
     * Convert a code-point to two bytes, taking care of byte-order.
     *
     * @param int $code_point
     *
     * @return string
     */
    abstract protected function codePointToCharacter(int $code_point): string;
}
152