<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Factories;

use DomainException;
use Fisharebest\Webtrees\Contracts\EncodingFactoryInterface;
use Fisharebest\Webtrees\Encodings\ANSEL;
use Fisharebest\Webtrees\Encodings\ASCII;
use Fisharebest\Webtrees\Encodings\CP437;
use Fisharebest\Webtrees\Encodings\CP850;
use Fisharebest\Webtrees\Encodings\EncodingInterface;
use Fisharebest\Webtrees\Encodings\ISO88591;
use Fisharebest\Webtrees\Encodings\ISO88592;
use Fisharebest\Webtrees\Encodings\MacRoman;
use Fisharebest\Webtrees\Encodings\UTF16BE;
use Fisharebest\Webtrees\Encodings\UTF16LE;
use Fisharebest\Webtrees\Encodings\UTF8;
use Fisharebest\Webtrees\Encodings\Windows1250;
use Fisharebest\Webtrees\Encodings\Windows1251;
use Fisharebest\Webtrees\Encodings\Windows1252;
use Fisharebest\Webtrees\Exceptions\InvalidGedcomEncodingException;

use function explode;
use function ltrim;
use function preg_match;
use function str_contains;
use function str_starts_with;
use function strlen;
use function strstr;
use function strtr;
use function uksort;

/**
 * Create an encoding object.
 */
class EncodingFactory implements EncodingFactoryInterface
{
    /**
     * Detect an encoding from a GEDCOM header record.
     *
     * Checks are made in this order: a leading byte-order mark, a leading "0"
     * stored as a two-byte UTF-16 code unit, and finally the CHAR (and optional
     * VERS) lines of the header record.  A complete header without any CHAR line
     * is treated as UTF-8.
     *
     * @param string $header The first bytes of a GEDCOM file
     *
     * @return EncodingInterface|null The encoding, or null when $header does not
     *                                contain a complete header record
     *
     * @throws InvalidGedcomEncodingException When the CHAR line names an unknown character set
     */
    public function detect(string $header): EncodingInterface|null
    {
        // Byte sequences that identify an encoding when found at the very start of the data.
        $prefixes = [
            [UTF8::BYTE_ORDER_MARK, UTF8::NAME],
            [UTF16BE::BYTE_ORDER_MARK, UTF16BE::NAME],
            [UTF16LE::BYTE_ORDER_MARK, UTF16LE::NAME],
            // No byte-order mark: a null byte next to the leading "0" indicates UTF-16.
            ["\x000", UTF16BE::NAME],
            ["0\x00", UTF16LE::NAME],
        ];

        foreach ($prefixes as [$prefix, $encoding]) {
            if (str_starts_with($header, $prefix)) {
                return $this->make($encoding);
            }
        }

        // Standardize whitespace to simplify matching.
        $header = strtr(ltrim($header), ["\r\n" => "\n", "\n\r" => "\n", "\r" => "\n"]);

        // Every replacement shortens the string, so this loop always terminates.
        while (str_contains($header, "\n ") || str_contains($header, " \n") || str_contains($header, '  ')) {
            $header = strtr($header, ["\n " => "\n", " \n" => "\n", '  ' => ' ']);
        }

        // We need a complete header record
        $header = strstr($header, "\n0", true);

        if ($header === false) {
            return null;
        }

        // Some of these come from Tamura Jones, the rest from webtrees users.
        // A key containing "/" is a CHAR value followed by a VERS value.
        $character_sets = [
            'ASCII'             => ASCII::NAME,
            'ANSEL'             => ANSEL::NAME,
            'UTF-8'             => UTF8::NAME,
            'UNICODE'           => UTF8::NAME, // If the null byte test failed, this can't be UTF16
            'ASCII/MacOS Roman' => MacRoman::NAME, // GEDitCOM
            'ASCII/MACINTOSH'   => MacRoman::NAME, // MacFamilyTree < 8.3.5
            'MACINTOSH'         => MacRoman::NAME, // MacFamilyTree >= 8.3.5
            'CP437'             => CP437::NAME,
            'IBMPC'             => CP437::NAME,
            'IBM'               => CP437::NAME, // Reunion
            'IBM-PC'            => CP437::NAME, // CumberlandFamilyTree
            'OEM'               => CP437::NAME, // Généatique
            'CP850'             => CP850::NAME,
            'MSDOS'             => CP850::NAME,
            'IBM-DOS'           => CP850::NAME, // Reunion, EasyTree
            'MS-DOS'            => CP850::NAME, // AbrEdit FTM for Windows
            'ANSI'              => CP850::NAME,
            'WINDOWS'           => CP850::NAME, // Parentele
            'IBM WINDOWS'       => CP850::NAME, // EasyTree, Généalogie, Reunion, TribalPages
            'IBM_WINDOWS'       => CP850::NAME, // EasyTree
            'CP1250'            => Windows1250::NAME,
            'windows-1250'      => Windows1250::NAME, // GenoPro, Rodokmen Pro
            'CP1251'            => Windows1251::NAME,
            'WINDOWS-1251'      => Windows1251::NAME, // Rodovid
            'CP1252'            => Windows1252::NAME, // Lifelines
            'ISO-8859-1'        => ISO88591::NAME, // Cumberland Family Tree, Lifelines
            'ISO8859-1'         => ISO88591::NAME, // Scion Genealogist
            'ISO8859'           => ISO88591::NAME, // Genealogica Grafica
            'LATIN-1'           => ISO88591::NAME,
            'LATIN1'            => ISO88591::NAME, // GenealogyJ
            'ISO-8859-2'        => ISO88592::NAME,
            'ISO8859-2'         => ISO88592::NAME,
            'LATIN-2'           => ISO88592::NAME,
            'LATIN2'            => ISO88592::NAME,
        ];

        // Names are matched as prefixes of the CHAR value, so try the longest names first.
        // Otherwise "WINDOWS" would hide "WINDOWS-1251", "IBM" would hide "IBM WINDOWS",
        // "ISO8859" would hide "ISO8859-2" and "ASCII" would hide "ASCII/MACINTOSH".
        uksort($character_sets, static fn (string $a, string $b): int => strlen($b) <=> strlen($a));

        foreach ($character_sets as $pattern => $encoding) {
            if (str_contains($pattern, '/')) {
                [$char, $vers] = explode('/', $pattern);
                $regex = "\n1 CHAR " . $char . "\n2 VERS " . $vers;
            } else {
                $regex = "\n1 CHAR(?:ACTER)? " . $pattern;
            }

            if (preg_match('/' . $regex . '/i', $header) === 1) {
                return $this->make($encoding);
            }
        }

        // There is a CHAR line, but it does not name a character set that we recognise.
        if (preg_match('/1 CHAR (.+)/', $header, $match) === 1) {
            throw new InvalidGedcomEncodingException($match[1]);
        }

        // No CHAR line at all.
        return $this->make(UTF8::NAME);
    }

    /**
     * Create a named encoding.
     *
     * @param string $name One of the NAME constants of the encoding classes
     *
     * @return EncodingInterface
     * @throws DomainException When $name is not a supported encoding
     */
    public function make(string $name): EncodingInterface
    {
        return match ($name) {
            UTF8::NAME        => new UTF8(),
            UTF16BE::NAME     => new UTF16BE(),
            UTF16LE::NAME     => new UTF16LE(),
            ANSEL::NAME       => new ANSEL(),
            ASCII::NAME       => new ASCII(),
            CP437::NAME       => new CP437(),
            CP850::NAME       => new CP850(),
            Windows1250::NAME => new Windows1250(),
            Windows1251::NAME => new Windows1251(),
            Windows1252::NAME => new Windows1252(),
            MacRoman::NAME    => new MacRoman(),
            ISO88591::NAME    => new ISO88591(),
            ISO88592::NAME    => new ISO88592(),
            default           => throw new DomainException('Invalid encoding: ' . $name),
        };
    }

    /**
     * A list of supported encodings and their names.
     *
     * The keys are the NAME constants accepted by make(); the values are display labels.
     *
     * @return array<string,string>
     */
    public function list(): array
    {
        return [
            UTF8::NAME        => 'UTF-8',
            UTF16BE::NAME     => 'UTF-16BE',
            UTF16LE::NAME     => 'UTF-16LE',
            ANSEL::NAME       => 'ANSEL',
            ASCII::NAME       => 'ASCII',
            ISO88591::NAME    => 'ISO-8859-1',
            ISO88592::NAME    => 'ISO-8859-2',
            Windows1250::NAME => 'Windows 1250',
            Windows1251::NAME => 'Windows 1251',
            Windows1252::NAME => 'Windows 1252',
            CP437::NAME       => 'CP437',
            CP850::NAME       => 'CP850',
            MacRoman::NAME    => 'MacOS Roman',
        ];
    }
}