xref: /webtrees/app/Factories/EncodingFactory.php (revision bbc7031e8ae32e1f33bf006b05c0ac45770d10c6)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2021 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Factories;
21
22use DomainException;
23use Fisharebest\Webtrees\Contracts\EncodingFactoryInterface;
24use Fisharebest\Webtrees\Encodings\ANSEL;
25use Fisharebest\Webtrees\Encodings\ASCII;
26use Fisharebest\Webtrees\Encodings\CP437;
27use Fisharebest\Webtrees\Encodings\CP850;
28use Fisharebest\Webtrees\Encodings\EncodingInterface;
29use Fisharebest\Webtrees\Encodings\ISO88591;
30use Fisharebest\Webtrees\Encodings\ISO88592;
31use Fisharebest\Webtrees\Encodings\MacRoman;
32use Fisharebest\Webtrees\Encodings\UTF16BE;
33use Fisharebest\Webtrees\Encodings\UTF16LE;
34use Fisharebest\Webtrees\Encodings\UTF8;
35use Fisharebest\Webtrees\Encodings\Windows1250;
36use Fisharebest\Webtrees\Encodings\Windows1251;
37use Fisharebest\Webtrees\Encodings\Windows1252;
38use Fisharebest\Webtrees\Exceptions\InvalidGedcomEncodingException;
39
use function explode;
use function ltrim;
use function preg_match;
use function preg_quote;
use function str_contains;
use function str_starts_with;
use function strlen;
use function strstr;
use function strtr;
use function uksort;
46
/**
 * Create encoding objects, and detect the encoding used by GEDCOM data.
 */
class EncodingFactory implements EncodingFactoryInterface
{
    /**
     * Detect an encoding from a GEDCOM header record.
     *
     * Detection is attempted in this order:
     *  1. a UTF-8 / UTF-16 byte-order-mark at the very start of the data;
     *  2. a "0" encoded as two bytes (one of them null) at the very start of the data;
     *  3. the CHAR (or CHARACTER) line of the header record, optionally qualified by the VERS line that follows it.
     *
     * @param string $header The start of the GEDCOM data, beginning with the header record.
     *
     * @return EncodingInterface|null The detected encoding, or null if $header does not contain a complete header record.
     * @throws InvalidGedcomEncodingException If a complete header record does not declare a recognised character set.
     */
    public function detect(string $header): ?EncodingInterface
    {
        $utf_bom = [
            '/^' . UTF8::BYTE_ORDER_MARK . '/'    => UTF8::NAME,
            '/^' . UTF16BE::BYTE_ORDER_MARK . '/' => UTF16BE::NAME,
            '/^' . UTF16LE::BYTE_ORDER_MARK . '/' => UTF16LE::NAME,
        ];

        foreach ($utf_bom as $regex => $encoding) {
            if (preg_match($regex, $header) === 1) {
                return $this->make($encoding);
            }
        }

        // No byte-order-mark: look for a leading "0" stored as a null byte plus the digit, in either order.
        $utf16 = [
            "\x000" => UTF16BE::NAME,
            "0\x00" => UTF16LE::NAME,
        ];

        foreach ($utf16 as $start => $encoding) {
            if (str_starts_with($header, $start)) {
                return $this->make($encoding);
            }
        }

        // Standardize whitespace to simplify matching.
        $header = strtr(ltrim($header), ["\r\n" => "\n", "\n\r" => "\n", "\r" => "\n"]);

        // Remove spaces next to line breaks and collapse runs of spaces. Repeat until nothing changes.
        while (str_contains($header, "\n ") || str_contains($header, " \n") || str_contains($header, '  ')) {
            $header = strtr($header, ["\n " => "\n", " \n" => "\n", '  ' => ' ']);
        }

        // We need a complete header record - i.e. everything before the next level 0 line.
        $header = strstr($header, "\n0", true);

        if ($header === false) {
            return null;
        }

        // Some of these come from Tamura Jones, the rest from webtrees users.
        // A key of the form "X/Y" means "1 CHAR X" followed immediately by "2 VERS Y".
        $character_sets = [
            'ASCII'             => ASCII::NAME,
            'ANSEL'             => ANSEL::NAME,
            'UTF-8'             => UTF8::NAME,
            'UNICODE'           => UTF8::NAME, // If the null byte test failed, this can't be UTF16
            'ASCII/MacOS Roman' => MacRoman::NAME, // GEDitCOM
            'ASCII/MACINTOSH'   => MacRoman::NAME, // MacFamilyTree < 8.3.5
            'MACINTOSH'         => MacRoman::NAME, // MacFamilyTree >= 8.3.5
            'CP437'             => CP437::NAME,
            'IBMPC'             => CP437::NAME,
            'IBM'               => CP437::NAME, // Reunion
            'IBM-PC'            => CP437::NAME, // CumberlandFamilyTree
            'OEM'               => CP437::NAME, // Généatique
            'CP850'             => CP850::NAME,
            'MSDOS'             => CP850::NAME,
            'IBM-DOS'           => CP850::NAME, // Reunion, EasyTree
            'MS-DOS'            => CP850::NAME, // AbrEdit FTM for Windows
            'ANSI'              => CP850::NAME,
            'WINDOWS'           => CP850::NAME, // Parentele
            'IBM WINDOWS'       => CP850::NAME, // EasyTree, Généalogie, Reunion, TribalPages
            'IBM_WINDOWS'       => CP850::NAME, // EasyTree
            'CP1250'            => Windows1250::NAME,
            'windows-1250'      => Windows1250::NAME, // GenoPro, Rodokmen Pro
            'CP1251'            => Windows1251::NAME,
            'WINDOWS-1251'      => Windows1251::NAME, // Rodovid
            'CP1252'            => Windows1252::NAME, // Lifelines
            'ISO-8859-1'        => ISO88591::NAME, // Cumberland Family Tree, Lifelines
            'ISO8859-1'         => ISO88591::NAME, // Scion Genealogist
            'ISO8859'           => ISO88591::NAME, // Genealogica Grafica
            'LATIN-1'           => ISO88591::NAME,
            'LATIN1'            => ISO88591::NAME, // GenealogyJ
            'ISO-8859-2'        => ISO88592::NAME,
            'ISO8859-2'         => ISO88592::NAME,
            'LATIN-2'           => ISO88592::NAME,
            'LATIN2'            => ISO88592::NAME,
        ];

        // Names are matched as case-insensitive prefixes, so a short name ("ASCII", "IBM", "WINDOWS",
        // "ISO8859") would shadow every longer name that begins with it ("ASCII/MACINTOSH", "IBM-DOS",
        // "WINDOWS-1251", "ISO8859-2") if it were tested first. Test the longest names first, so that
        // the most specific entry always wins.
        uksort($character_sets, static function (string $x, string $y): int {
            return strlen($y) <=> strlen($x);
        });

        foreach ($character_sets as $pattern => $encoding) {
            if (str_contains($pattern, '/')) {
                [$char, $vers] = explode('/', $pattern);
                $regex = "\n1 CHAR " . preg_quote($char, '/') . "\n2 VERS " . preg_quote($vers, '/');
            } else {
                $regex = "\n1 CHAR(?:ACTER)? " . preg_quote($pattern, '/');
            }

            if (preg_match('/' . $regex . '/i', $header) === 1) {
                return $this->make($encoding);
            }
        }

        // Report the unrecognised name, accepting the same tag spellings as the detection above.
        if (preg_match('/^1 CHAR(?:ACTER)? (.+)/mi', $header, $match) === 1) {
            $charset = $match[1];
        } else {
            $charset = '???';
        }

        throw new InvalidGedcomEncodingException($charset);
    }

    /**
     * Create a named encoding.
     *
     * @param string $name One of the NAME constants of the supported encoding classes.
     *
     * @return EncodingInterface
     * @throws DomainException If $name is not the name of a supported encoding.
     */
    public function make(string $name): EncodingInterface
    {
        // Strict (exact string) lookup of the class that implements each encoding.
        $classes = [
            UTF8::NAME        => UTF8::class,
            UTF16BE::NAME     => UTF16BE::class,
            UTF16LE::NAME     => UTF16LE::class,
            ANSEL::NAME       => ANSEL::class,
            ASCII::NAME       => ASCII::class,
            CP437::NAME       => CP437::class,
            CP850::NAME       => CP850::class,
            Windows1250::NAME => Windows1250::class,
            Windows1251::NAME => Windows1251::class,
            Windows1252::NAME => Windows1252::class,
            MacRoman::NAME    => MacRoman::class,
            ISO88591::NAME    => ISO88591::class,
            ISO88592::NAME    => ISO88592::class,
        ];

        if (!isset($classes[$name])) {
            throw new DomainException('Invalid encoding: ' . $name);
        }

        $class = $classes[$name];

        return new $class();
    }

    /**
     * A list of supported encodings and their names.
     *
     * The keys are the values accepted by make(); the values are display names.
     *
     * @return array<string,string>
     */
    public function list(): array
    {
        return [
            UTF8::NAME        => 'UTF-8',
            UTF16BE::NAME     => 'UTF-16BE',
            UTF16LE::NAME     => 'UTF-16LE',
            ANSEL::NAME       => 'ANSEL',
            ASCII::NAME       => 'ASCII',
            ISO88591::NAME    => 'ISO-8859-1',
            ISO88592::NAME    => 'ISO-8859-2',
            Windows1250::NAME => 'Windows 1250',
            Windows1251::NAME => 'Windows 1251',
            Windows1252::NAME => 'Windows 1252',
            CP437::NAME       => 'CP437',
            CP850::NAME       => 'CP850',
            MacRoman::NAME    => 'MacOS Roman',
        ];
    }
}
238