xref: /webtrees/app/Factories/EncodingFactory.php (revision 1ff45046fabc22237b5d0d8e489c96f031fc598d)
11c6adce8SGreg Roach<?php
21c6adce8SGreg Roach
31c6adce8SGreg Roach/**
41c6adce8SGreg Roach * webtrees: online genealogy
5d11be702SGreg Roach * Copyright (C) 2023 webtrees development team
61c6adce8SGreg Roach * This program is free software: you can redistribute it and/or modify
71c6adce8SGreg Roach * it under the terms of the GNU General Public License as published by
81c6adce8SGreg Roach * the Free Software Foundation, either version 3 of the License, or
91c6adce8SGreg Roach * (at your option) any later version.
101c6adce8SGreg Roach * This program is distributed in the hope that it will be useful,
111c6adce8SGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
121c6adce8SGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
131c6adce8SGreg Roach * GNU General Public License for more details.
141c6adce8SGreg Roach * You should have received a copy of the GNU General Public License
151c6adce8SGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
161c6adce8SGreg Roach */
171c6adce8SGreg Roach
181c6adce8SGreg Roachdeclare(strict_types=1);
191c6adce8SGreg Roach
201c6adce8SGreg Roachnamespace Fisharebest\Webtrees\Factories;
211c6adce8SGreg Roach
221c6adce8SGreg Roachuse DomainException;
231c6adce8SGreg Roachuse Fisharebest\Webtrees\Contracts\EncodingFactoryInterface;
241c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ANSEL;
251c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ASCII;
261c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\CP437;
271c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\CP850;
281c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\EncodingInterface;
291c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ISO88591;
301c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ISO88592;
311c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\MacRoman;
321c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\UTF16BE;
331c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\UTF16LE;
341c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\UTF8;
351c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\Windows1250;
361c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\Windows1251;
371c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\Windows1252;
381c6adce8SGreg Roachuse Fisharebest\Webtrees\Exceptions\InvalidGedcomEncodingException;
391c6adce8SGreg Roach
401c6adce8SGreg Roachuse function explode;
411c6adce8SGreg Roachuse function ltrim;
421c6adce8SGreg Roachuse function preg_match;
431c6adce8SGreg Roachuse function str_contains;
441c6adce8SGreg Roachuse function str_starts_with;
451c6adce8SGreg Roachuse function strstr;
461c6adce8SGreg Roach
/**
 * Create encoding objects, either by name or by inspecting a GEDCOM header.
 */
class EncodingFactory implements EncodingFactoryInterface
{
    /**
     * Detect an encoding from a GEDCOM header record.
     *
     * Checks are made in this order: a byte-order mark, a leading "0" stored as a
     * two-byte (UTF-16) character, and finally the CHAR/CHARACTER line (plus the
     * VERS line for some products) of the header record.
     *
     * @param string $header The start of a GEDCOM file, including the header record
     *
     * @return EncodingInterface|null The detected encoding (UTF-8 when the header has no CHAR line),
     *                                or null when $header does not contain a complete header record
     *
     * @throws InvalidGedcomEncodingException When the header names a character set that is not recognised
     */
    public function detect(string $header): EncodingInterface|null
    {
        // A byte-order mark is the most reliable indicator, so test for it first.
        $utf_bom = [
            '/^' . UTF8::BYTE_ORDER_MARK . '/'    => UTF8::NAME,
            '/^' . UTF16BE::BYTE_ORDER_MARK . '/' => UTF16BE::NAME,
            '/^' . UTF16LE::BYTE_ORDER_MARK . '/' => UTF16LE::NAME,
        ];

        foreach ($utf_bom as $regex => $encoding) {
            if (preg_match($regex, $header) === 1) {
                return $this->make($encoding);
            }
        }

        // No BOM: a leading "0" padded with a null byte reveals UTF-16 and its byte order.
        $utf16 = [
            "\x000" => UTF16BE::NAME,
            "0\x00" => UTF16LE::NAME,
        ];

        foreach ($utf16 as $start => $encoding) {
            if (str_starts_with($header, $start)) {
                return $this->make($encoding);
            }
        }

        // Standardize whitespace to simplify matching.
        $header = strtr(ltrim($header), ["\r\n" => "\n", "\n\r" => "\n", "\r" => "\n"]);

        // Repeat until stable: each pass can expose new runs of spaces next to a line break.
        while (str_contains($header, "\n ") || str_contains($header, " \n") || str_contains($header, '  ')) {
            $header = strtr($header, ["\n " => "\n", " \n" => "\n", '  ' => ' ']);
        }

        // We need a complete header record, i.e. everything before the next level-0 line.
        $header = strstr($header, "\n0", true);

        if ($header === false) {
            return null;
        }

        // Some of these come from Tamura Jones, the rest from webtrees users.
        //
        // A key containing "/" is a CHAR value followed by a VERS value.
        //
        // ORDER MATTERS: the patterns are tried in turn and match as case-insensitive
        // prefixes of the CHAR value, so a name must be listed BEFORE any shorter
        // name that it starts with (e.g. "WINDOWS-1251" before "WINDOWS").
        $character_sets = [
            'ASCII/MacOS Roman' => MacRoman::NAME, // GEDitCOM
            'ASCII/MACINTOSH'   => MacRoman::NAME, // MacFamilyTree < 8.3.5
            'ASCII'             => ASCII::NAME,
            'ANSEL'             => ANSEL::NAME,
            'UTF-8'             => UTF8::NAME,
            'UNICODE'           => UTF8::NAME, // If the null byte test failed, this can't be UTF16
            'MACINTOSH'         => MacRoman::NAME, // MacFamilyTree >= 8.3.5
            'CP437'             => CP437::NAME,
            'IBMPC'             => CP437::NAME,
            'IBM-PC'            => CP437::NAME, // CumberlandFamilyTree
            'IBM-DOS'           => CP850::NAME, // Reunion, EasyTree
            'IBM WINDOWS'       => CP850::NAME, // EasyTree, Généalogie, Reunion, TribalPages
            'IBM_WINDOWS'       => CP850::NAME, // EasyTree
            'IBM'               => CP437::NAME, // Reunion
            'OEM'               => CP437::NAME, // Généatique
            'CP850'             => CP850::NAME,
            'MSDOS'             => CP850::NAME,
            'MS-DOS'            => CP850::NAME, // AbrEdit FTM for Windows
            'ANSI'              => CP850::NAME,
            'CP1250'            => Windows1250::NAME,
            'windows-1250'      => Windows1250::NAME, // GenoPro, Rodokmen Pro
            'CP1251'            => Windows1251::NAME,
            'WINDOWS-1251'      => Windows1251::NAME, // Rodovid
            'WINDOWS'           => CP850::NAME, // Parentele
            'CP1252'            => Windows1252::NAME, // Lifelines
            'ISO-8859-1'        => ISO88591::NAME, // Cumberland Family Tree, Lifelines
            'ISO8859-1'         => ISO88591::NAME, // Scion Genealogist
            'ISO-8859-2'        => ISO88592::NAME,
            'ISO8859-2'         => ISO88592::NAME,
            'ISO8859'           => ISO88591::NAME, // Genealogica Grafica
            'LATIN-1'           => ISO88591::NAME,
            'LATIN1'            => ISO88591::NAME, // GenealogyJ
            'LATIN-2'           => ISO88592::NAME,
            'LATIN2'            => ISO88592::NAME,
        ];

        foreach ($character_sets as $pattern => $encoding) {
            if (str_contains($pattern, '/')) {
                [$char, $vers] = explode('/', $pattern);
                $regex = "\n1 CHAR " . $char . "\n2 VERS " . $vers;
            } else {
                $regex = "\n1 CHAR(?:ACTER)? " . $pattern;
            }

            if (preg_match('/' . $regex . '/i', $header) === 1) {
                return $this->make($encoding);
            }
        }

        // A character set was specified, but it is not one that we recognise.
        // Use the same line-anchored, case-insensitive form as the matching above,
        // so that "1 CHARACTER xyz" is reported rather than silently read as UTF-8.
        if (preg_match('/\n1 CHAR(?:ACTER)? (.+)/i', $header, $match) === 1) {
            throw new InvalidGedcomEncodingException($match[1]);
        }

        // No character set was specified.
        return $this->make(UTF8::NAME);
    }

    /**
     * Create a named encoding.
     *
     * @param string $name One of the NAME constants of the supported encoding classes
     *
     * @return EncodingInterface
     * @throws DomainException When $name is not a supported encoding
     */
    public function make(string $name): EncodingInterface
    {
        return match ($name) {
            UTF8::NAME        => new UTF8(),
            UTF16BE::NAME     => new UTF16BE(),
            UTF16LE::NAME     => new UTF16LE(),
            ANSEL::NAME       => new ANSEL(),
            ASCII::NAME       => new ASCII(),
            CP437::NAME       => new CP437(),
            CP850::NAME       => new CP850(),
            Windows1250::NAME => new Windows1250(),
            Windows1251::NAME => new Windows1251(),
            Windows1252::NAME => new Windows1252(),
            MacRoman::NAME    => new MacRoman(),
            ISO88591::NAME    => new ISO88591(),
            ISO88592::NAME    => new ISO88592(),
            default           => throw new DomainException('Invalid encoding: ' . $name),
        };
    }

    /**
     * A list of supported encodings and their names.
     *
     * The keys are the values accepted by make(); the values are display labels.
     *
     * @return array<string,string>
     */
    public function list(): array
    {
        return [
            UTF8::NAME        => 'UTF-8',
            UTF16BE::NAME     => 'UTF-16BE',
            UTF16LE::NAME     => 'UTF-16LE',
            ANSEL::NAME       => 'ANSEL',
            ASCII::NAME       => 'ASCII',
            ISO88591::NAME    => 'ISO-8859-1',
            ISO88592::NAME    => 'ISO-8859-2',
            Windows1250::NAME => 'Windows 1250',
            Windows1251::NAME => 'Windows 1251',
            Windows1252::NAME => 'Windows 1252',
            CP437::NAME       => 'CP437',
            CP850::NAME       => 'CP850',
            MacRoman::NAME    => 'MacOS Roman',
        ];
    }
}
233