xref: /webtrees/app/Factories/EncodingFactory.php (revision 5dd4472a208c74e50dcc9e48e06a5bf2deaf36f6)
11c6adce8SGreg Roach<?php
21c6adce8SGreg Roach
31c6adce8SGreg Roach/**
41c6adce8SGreg Roach * webtrees: online genealogy
55bfc6897SGreg Roach * Copyright (C) 2022 webtrees development team
61c6adce8SGreg Roach * This program is free software: you can redistribute it and/or modify
71c6adce8SGreg Roach * it under the terms of the GNU General Public License as published by
81c6adce8SGreg Roach * the Free Software Foundation, either version 3 of the License, or
91c6adce8SGreg Roach * (at your option) any later version.
101c6adce8SGreg Roach * This program is distributed in the hope that it will be useful,
111c6adce8SGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
121c6adce8SGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
131c6adce8SGreg Roach * GNU General Public License for more details.
141c6adce8SGreg Roach * You should have received a copy of the GNU General Public License
151c6adce8SGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
161c6adce8SGreg Roach */
171c6adce8SGreg Roach
181c6adce8SGreg Roachdeclare(strict_types=1);
191c6adce8SGreg Roach
201c6adce8SGreg Roachnamespace Fisharebest\Webtrees\Factories;
211c6adce8SGreg Roach
221c6adce8SGreg Roachuse DomainException;
231c6adce8SGreg Roachuse Fisharebest\Webtrees\Contracts\EncodingFactoryInterface;
241c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ANSEL;
251c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ASCII;
261c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\CP437;
271c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\CP850;
281c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\EncodingInterface;
291c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ISO88591;
301c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\ISO88592;
311c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\MacRoman;
321c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\UTF16BE;
331c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\UTF16LE;
341c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\UTF8;
351c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\Windows1250;
361c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\Windows1251;
371c6adce8SGreg Roachuse Fisharebest\Webtrees\Encodings\Windows1252;
381c6adce8SGreg Roachuse Fisharebest\Webtrees\Exceptions\InvalidGedcomEncodingException;
391c6adce8SGreg Roach
401c6adce8SGreg Roachuse function explode;
411c6adce8SGreg Roachuse function ltrim;
421c6adce8SGreg Roachuse function preg_match;
431c6adce8SGreg Roachuse function str_contains;
441c6adce8SGreg Roachuse function str_starts_with;
451c6adce8SGreg Roachuse function strstr;
461c6adce8SGreg Roach
471c6adce8SGreg Roach/**
481c6adce8SGreg Roach * Create an encoding object.
491c6adce8SGreg Roach */
class EncodingFactory implements EncodingFactoryInterface
{
    /**
     * Detect an encoding from a GEDCOM header record.
     *
     * Detection proceeds in this order:
     *  1. a UTF-8/UTF-16 byte-order-mark, or a BOM-less UTF-16 "0" at the very start;
     *  2. a recognised value on the "1 CHAR" (or "1 CHARACTER") line of the header;
     *  3. an unrecognised "1 CHAR" value, which is reported as an error;
     *  4. otherwise UTF-8 is assumed.
     *
     * @param string $header The first bytes of a GEDCOM file.
     *
     * @return EncodingInterface|null The detected encoding, or null when $header does not
     *                                yet contain a complete header record (no following
     *                                level 0 line was found).
     * @throws InvalidGedcomEncodingException When the header names an unknown character set.
     */
    public function detect(string $header): ?EncodingInterface
    {
        // Byte sequences that identify a Unicode encoding when found at the very start.
        // Plain prefix comparison is used, so the raw bytes never become part of a regex.
        $prefixes = [
            UTF8::BYTE_ORDER_MARK    => UTF8::NAME,
            UTF16BE::BYTE_ORDER_MARK => UTF16BE::NAME,
            UTF16LE::BYTE_ORDER_MARK => UTF16LE::NAME,
            // No BOM: the leading "0" of "0 HEAD" stored as a two-byte code unit.
            "\x000"                  => UTF16BE::NAME, // NUL byte followed by "0"
            "0\x00"                  => UTF16LE::NAME, // "0" followed by NUL byte
        ];

        foreach ($prefixes as $prefix => $encoding) {
            if (str_starts_with($header, $prefix)) {
                return $this->make($encoding);
            }
        }

        // Standardize whitespace to simplify matching.
        $header = strtr(ltrim($header), ["\r\n" => "\n", "\n\r" => "\n", "\r" => "\n"]);

        // Repeat until stable: one pass may leave new runs of spaces behind.
        while (str_contains($header, "\n ") || str_contains($header, " \n") || str_contains($header, '  ')) {
            $header = strtr($header, ["\n " => "\n", " \n" => "\n", '  ' => ' ']);
        }

        // We need a complete header record, i.e. everything before the next level 0 line.
        $header = strstr($header, "\n0", true);

        if ($header === false) {
            return null;
        }

        // Some of these come from Tamura Jones, the rest from webtrees users.
        //
        // Keys are matched in this order as case-insensitive PREFIXES of the CHAR value,
        // and the first match wins. A name must therefore be listed AFTER every longer
        // name that begins with it, otherwise the longer name can never be selected.
        // A key of the form "CHAR/VERS" matches a CHAR line followed by a VERS line.
        $character_sets = [
            'ASCII/MacOS Roman' => MacRoman::NAME, // GEDitCOM
            'ASCII/MACINTOSH'   => MacRoman::NAME, // MacFamilyTree < 8.3.5
            'ASCII'             => ASCII::NAME, // After the ASCII/... pairs
            'ANSEL'             => ANSEL::NAME,
            'UTF-8'             => UTF8::NAME,
            'UNICODE'           => UTF8::NAME, // If the null byte test failed, this can't be UTF16
            'MACINTOSH'         => MacRoman::NAME, // MacFamilyTree >= 8.3.5
            'CP437'             => CP437::NAME,
            'IBMPC'             => CP437::NAME,
            'IBM-PC'            => CP437::NAME, // CumberlandFamilyTree
            'OEM'               => CP437::NAME, // Généatique
            'CP850'             => CP850::NAME,
            'MSDOS'             => CP850::NAME,
            'IBM-DOS'           => CP850::NAME, // Reunion, EasyTree
            'MS-DOS'            => CP850::NAME, // AbrEdit FTM for Windows
            'ANSI'              => CP850::NAME,
            'IBM WINDOWS'       => CP850::NAME, // EasyTree, Généalogie, Reunion, TribalPages
            'IBM_WINDOWS'       => CP850::NAME, // EasyTree
            'IBM'               => CP437::NAME, // Reunion. After every longer IBM... name
            'CP1250'            => Windows1250::NAME,
            'windows-1250'      => Windows1250::NAME, // GenoPro, Rodokmen Pro
            'CP1251'            => Windows1251::NAME,
            'WINDOWS-1251'      => Windows1251::NAME, // Rodovid
            'WINDOWS'           => CP850::NAME, // Parentele. After WINDOWS-125x
            'CP1252'            => Windows1252::NAME, // Lifelines
            'ISO-8859-1'        => ISO88591::NAME, // Cumberland Family Tree, Lifelines
            'ISO8859-1'         => ISO88591::NAME, // Scion Genealogist
            'LATIN-1'           => ISO88591::NAME,
            'LATIN1'            => ISO88591::NAME, // GenealogyJ
            'ISO-8859-2'        => ISO88592::NAME,
            'ISO8859-2'         => ISO88592::NAME,
            'LATIN-2'           => ISO88592::NAME,
            'LATIN2'            => ISO88592::NAME,
            'ISO8859'           => ISO88591::NAME, // Genealogica Grafica. After ISO8859-n
        ];

        foreach ($character_sets as $pattern => $encoding) {
            if (str_contains($pattern, '/')) {
                // Two-line form: the CHAR value is qualified by the VERS line below it.
                [$char, $vers] = explode('/', $pattern);
                $regex = "\n1 CHAR " . $char . "\n2 VERS " . $vers;
            } else {
                // Some applications write the tag in full as CHARACTER.
                $regex = "\n1 CHAR(?:ACTER)? " . $pattern;
            }

            if (preg_match('/' . $regex . '/i', $header) === 1) {
                return $this->make($encoding);
            }
        }

        // A character set was specified, but it is not one that we recognise.
        if (preg_match('/1 CHAR (.+)/', $header, $match) === 1) {
            throw new InvalidGedcomEncodingException($match[1]);
        }

        // No character set was specified.
        return $this->make(UTF8::NAME);
    }

    /**
     * Create a named encoding.
     *
     * @param string $name One of the NAME constants of the supported encoding classes.
     *
     * @return EncodingInterface
     * @throws DomainException When $name is not a supported encoding.
     */
    public function make(string $name): EncodingInterface
    {
        switch ($name) {
            case UTF8::NAME:
                return new UTF8();

            case UTF16BE::NAME:
                return new UTF16BE();

            case UTF16LE::NAME:
                return new UTF16LE();

            case ANSEL::NAME:
                return new ANSEL();

            case ASCII::NAME:
                return new ASCII();

            case CP437::NAME:
                return new CP437();

            case CP850::NAME:
                return new CP850();

            case Windows1250::NAME:
                return new Windows1250();

            case Windows1251::NAME:
                return new Windows1251();

            case Windows1252::NAME:
                return new Windows1252();

            case MacRoman::NAME:
                return new MacRoman();

            case ISO88591::NAME:
                return new ISO88591();

            case ISO88592::NAME:
                return new ISO88592();

            default:
                throw new DomainException('Invalid encoding: ' . $name);
        }
    }

    /**
     * A list of supported encodings and their names.
     *
     * Every key is accepted by make(); the values are display labels.
     *
     * @return array<string,string> Encoding name => label
     */
    public function list(): array
    {
        return [
            UTF8::NAME        => 'UTF-8',
            UTF16BE::NAME     => 'UTF-16BE',
            UTF16LE::NAME     => 'UTF-16LE',
            ANSEL::NAME       => 'ANSEL',
            ASCII::NAME       => 'ASCII',
            ISO88591::NAME    => 'ISO-8859-1',
            ISO88592::NAME    => 'ISO-8859-2',
            Windows1250::NAME => 'Windows 1250',
            Windows1251::NAME => 'Windows 1251',
            Windows1252::NAME => 'Windows 1252',
            CP437::NAME       => 'CP437',
            CP850::NAME       => 'CP850',
            MacRoman::NAME    => 'MacOS Roman',
        ];
    }
}
236