xref: /webtrees/app/Encodings/AbstractEncoding.php (revision d11be7027e34e3121be11cc025421873364403f9)
11c6adce8SGreg Roach<?php
21c6adce8SGreg Roach
31c6adce8SGreg Roach/**
41c6adce8SGreg Roach * webtrees: online genealogy
5*d11be702SGreg Roach * Copyright (C) 2023 webtrees development team
61c6adce8SGreg Roach * This program is free software: you can redistribute it and/or modify
71c6adce8SGreg Roach * it under the terms of the GNU General Public License as published by
81c6adce8SGreg Roach * the Free Software Foundation, either version 3 of the License, or
91c6adce8SGreg Roach * (at your option) any later version.
101c6adce8SGreg Roach * This program is distributed in the hope that it will be useful,
111c6adce8SGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
121c6adce8SGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
131c6adce8SGreg Roach * GNU General Public License for more details.
141c6adce8SGreg Roach * You should have received a copy of the GNU General Public License
151c6adce8SGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
161c6adce8SGreg Roach */
171c6adce8SGreg Roach
181c6adce8SGreg Roachdeclare(strict_types=1);
191c6adce8SGreg Roach
201c6adce8SGreg Roachnamespace Fisharebest\Webtrees\Encodings;
211c6adce8SGreg Roach
221c6adce8SGreg Roachuse function array_flip;
231c6adce8SGreg Roachuse function array_map;
241c6adce8SGreg Roachuse function implode;
251c6adce8SGreg Roachuse function ord;
261c6adce8SGreg Roachuse function preg_split;
271c6adce8SGreg Roachuse function strlen;
281c6adce8SGreg Roachuse function strrpos;
291c6adce8SGreg Roachuse function strtr;
301c6adce8SGreg Roach
311c6adce8SGreg Roachuse const PREG_SPLIT_NO_EMPTY;
321c6adce8SGreg Roach
/**
 * Convert between an encoding and UTF-8.
 *
 * Concrete encodings provide their translation table by overriding TO_UTF8
 * (and, optionally, REPLACEMENT_CHARACTER). Both constants are read through
 * late static binding, so the values of the concrete class are used.
 */
abstract class AbstractEncoding implements EncodingInterface
{
    /** Emitted by fromUtf8() for any character that has no entry in the translation table. */
    protected const REPLACEMENT_CHARACTER = '?';

    /** @var array<string,string> Encoded character => utf8 character */
    protected const TO_UTF8 = [];

    /**
     * Convert a string from UTF-8 to another encoding.
     *
     * Bytes below 128 are copied unchanged. Every other character is looked up
     * in the inverted TO_UTF8 table; characters without an entry - including
     * malformed UTF-8 sequences - become REPLACEMENT_CHARACTER.
     *
     * @param string $text
     *
     * @return string
     */
    public function fromUtf8(string $text): string
    {
        // Invert the table: utf8 character => encoded character.
        $utf8  = array_flip(static::TO_UTF8);
        $utf8[UTF8::REPLACEMENT_CHARACTER] = static::REPLACEMENT_CHARACTER;

        $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($chars === false) {
            // With the "u" modifier, preg_split() fails when $text is not valid UTF-8.
            // Rather than passing false on to array_map() (a TypeError), split in byte
            // mode: break before every byte that is not a UTF-8 continuation byte and
            // after every ASCII byte.  Well-formed multi-byte characters stay intact,
            // while stray or truncated sequences become separate pieces that are not
            // in the lookup table and are therefore replaced below.
            $chars = preg_split('/(?=[\x00-\x7F\xC0-\xFF])|(?<=[\x00-\x7F])/', $text, -1, PREG_SPLIT_NO_EMPTY);
        }

        $chars = array_map(static function (string $char) use ($utf8): string {
            // ord() inspects the first byte only; below 128 means a plain ASCII character.
            if (ord($char) < 128) {
                return $char;
            }

            return $utf8[$char] ?? static::REPLACEMENT_CHARACTER;
        }, $chars);

        return implode('', $chars);
    }

    /**
     * Convert a string from another encoding to UTF-8.
     *
     * Each key of TO_UTF8 found in the text is replaced by its UTF-8 value;
     * anything not listed in the table is left as it is.
     *
     * @param string $text
     *
     * @return string
     */
    public function toUtf8(string $text): string
    {
        return strtr($text, static::TO_UTF8);
    }

    /**
     * When reading multi-byte encodings using a stream, we must avoid incomplete characters.
     *
     * The safe characters are tried in a fixed order: line-feed, carriage-return,
     * then space. For the first one that occurs in the text, the result is the
     * offset just past its last occurrence. If none occurs, the result is zero.
     *
     * @param string $text
     *
     * @return int The number of leading bytes that end on a safe character.
     */
    public function convertibleBytes(string $text): int
    {
        // Express the safe characters in this encoding, so they can be searched for byte-wise.
        $safe_chars = [
            $this->fromUtf8("\n"),
            $this->fromUtf8("\r"),
            $this->fromUtf8(' '),
        ];

        foreach ($safe_chars as $char) {
            $pos = strrpos($text, $char);

            if ($pos !== false) {
                return $pos + strlen($char);
            }
        }

        return 0;
    }
}
105