<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Factories;

use DomainException;
use Fisharebest\Webtrees\Contracts\EncodingFactoryInterface;
use Fisharebest\Webtrees\Encodings\ANSEL;
use Fisharebest\Webtrees\Encodings\ASCII;
use Fisharebest\Webtrees\Encodings\CP437;
use Fisharebest\Webtrees\Encodings\CP850;
use Fisharebest\Webtrees\Encodings\EncodingInterface;
use Fisharebest\Webtrees\Encodings\ISO88591;
use Fisharebest\Webtrees\Encodings\ISO88592;
use Fisharebest\Webtrees\Encodings\MacRoman;
use Fisharebest\Webtrees\Encodings\UTF16BE;
use Fisharebest\Webtrees\Encodings\UTF16LE;
use Fisharebest\Webtrees\Encodings\UTF8;
use Fisharebest\Webtrees\Encodings\Windows1250;
use Fisharebest\Webtrees\Encodings\Windows1251;
use Fisharebest\Webtrees\Encodings\Windows1252;
use Fisharebest\Webtrees\Exceptions\InvalidGedcomEncodingException;

use function explode;
use function ltrim;
use function preg_match;
use function preg_quote;
use function str_contains;
use function str_starts_with;
use function strlen;
use function strstr;
use function strtr;
use function uksort;

/**
 * Create an encoding object.
 */
class EncodingFactory implements EncodingFactoryInterface
{
    /**
     * Detect an encoding from a GEDCOM header record.
     *
     * Detection is attempted in this order:
     *  1. a byte-order mark at the very start of the data;
     *  2. a null byte next to the leading "0", which indicates UTF-16;
     *  3. the "1 CHAR" (or "1 CHARACTER") line of the header, optionally
     *     combined with the "2 VERS" line that follows it.
     *
     * @param string $header The first bytes of a GEDCOM file.
     *
     * @return EncodingInterface|null The detected encoding, UTF-8 when the header has no
     *                                "1 CHAR" line, or null when $header does not contain a
     *                                complete header record (no following level 0 record).
     *
     * @throws InvalidGedcomEncodingException When a "1 CHAR" line names an unrecognised character set.
     */
    public function detect(string $header): EncodingInterface|null
    {
        // Byte signatures, tested in order. The byte-order marks come first; the
        // last two entries are the character "0" encoded as UTF-16 without a mark.
        $signatures = [
            [UTF8::BYTE_ORDER_MARK, UTF8::NAME],
            [UTF16BE::BYTE_ORDER_MARK, UTF16BE::NAME],
            [UTF16LE::BYTE_ORDER_MARK, UTF16LE::NAME],
            ["\x000", UTF16BE::NAME],
            ["0\x00", UTF16LE::NAME],
        ];

        foreach ($signatures as [$prefix, $encoding]) {
            // A literal prefix test: the signature bytes are never interpreted as a regex.
            if (str_starts_with($header, $prefix)) {
                return $this->make($encoding);
            }
        }

        // Standardize whitespace to simplify matching.
        $header = strtr(ltrim($header), ["\r\n" => "\n", "\n\r" => "\n", "\r" => "\n"]);

        // Two consecutive spaces. Written with escapes so that the literal cannot be
        // collapsed to a single space, which would make the loop below run forever.
        $double_space = "\x20\x20";

        // Repeat until no space touches a line break and no run of spaces remains.
        while (
            str_contains($header, "\n ") ||
            str_contains($header, " \n") ||
            str_contains($header, $double_space)
        ) {
            $header = strtr($header, ["\n " => "\n", " \n" => "\n", $double_space => ' ']);
        }

        // We need a complete header record
        $header = strstr($header, "\n0", true);

        if ($header === false) {
            return null;
        }

        // Some of these come from Tamura Jones, the rest from webtrees users.
        // A key containing "/" is a CHAR value followed by a VERS value.
        $character_sets = [
            'ASCII'             => ASCII::NAME,
            'ANSEL'             => ANSEL::NAME,
            'UTF-8'             => UTF8::NAME,
            'UNICODE'           => UTF8::NAME, // If the null byte test failed, this can't be UTF16
            'ASCII/MacOS Roman' => MacRoman::NAME, // GEDitCOM
            'ASCII/MACINTOSH'   => MacRoman::NAME, // MacFamilyTree < 8.3.5
            'MACINTOSH'         => MacRoman::NAME, // MacFamilyTree >= 8.3.5
            'CP437'             => CP437::NAME,
            'IBMPC'             => CP437::NAME,
            'IBM'               => CP437::NAME, // Reunion
            'IBM-PC'            => CP437::NAME, // CumberlandFamilyTree
            'OEM'               => CP437::NAME, // Généatique
            'CP850'             => CP850::NAME,
            'MSDOS'             => CP850::NAME,
            'IBM-DOS'           => CP850::NAME, // Reunion, EasyTree
            'MS-DOS'            => CP850::NAME, // AbrEdit FTM for Windows
            'ANSI'              => CP850::NAME,
            'WINDOWS'           => CP850::NAME, // Parentele
            'IBM WINDOWS'       => CP850::NAME, // EasyTree, Généalogie, Reunion, TribalPages
            'IBM_WINDOWS'       => CP850::NAME, // EasyTree
            'CP1250'            => Windows1250::NAME,
            'windows-1250'      => Windows1250::NAME, // GenoPro, Rodokmen Pro
            'CP1251'            => Windows1251::NAME,
            'WINDOWS-1251'      => Windows1251::NAME, // Rodovid
            'CP1252'            => Windows1252::NAME, // Lifelines
            'ISO-8859-1'        => ISO88591::NAME, // Cumberland Family Tree, Lifelines
            'ISO8859-1'         => ISO88591::NAME, // Scion Genealogist
            'ISO8859'           => ISO88591::NAME, // Genealogica Grafica
            'LATIN-1'           => ISO88591::NAME,
            'LATIN1'            => ISO88591::NAME, // GenealogyJ
            'ISO-8859-2'        => ISO88592::NAME,
            'ISO8859-2'         => ISO88592::NAME,
            'LATIN-2'           => ISO88592::NAME,
            'LATIN2'            => ISO88592::NAME,
        ];

        // The names are matched as prefixes of the CHAR value, so try the longest
        // (most specific) first. Otherwise "IBM" would claim "IBM WINDOWS", "WINDOWS"
        // would claim "WINDOWS-1251", "ISO8859" would claim "ISO8859-2", and "ASCII"
        // would claim the ASCII + VERS combinations.
        uksort(
            $character_sets,
            static fn (string $first, string $second): int => strlen($second) <=> strlen($first),
        );

        foreach ($character_sets as $pattern => $encoding) {
            if (str_contains($pattern, '/')) {
                [$char, $vers] = explode('/', $pattern, 2);
                $regex = "\n1 CHAR " . preg_quote($char, '/') . "\n2 VERS " . preg_quote($vers, '/');
            } else {
                $regex = "\n1 CHAR(?:ACTER)? " . preg_quote($pattern, '/');
            }

            if (preg_match('/' . $regex . '/i', $header) === 1) {
                return $this->make($encoding);
            }
        }

        // A character set was declared, but it is not one that we recognise.
        if (preg_match('/1 CHAR (.+)/', $header, $match) === 1) {
            throw new InvalidGedcomEncodingException($match[1]);
        }

        // No character set was declared.
        return $this->make(UTF8::NAME);
    }

    /**
     * Create a named encoding.
     *
     * @param string $name One of the NAME constants of the supported encoding classes.
     *
     * @return EncodingInterface
     * @throws DomainException When $name is not the name of a supported encoding.
     */
    public function make(string $name): EncodingInterface
    {
        return match ($name) {
            UTF8::NAME        => new UTF8(),
            UTF16BE::NAME     => new UTF16BE(),
            UTF16LE::NAME     => new UTF16LE(),
            ANSEL::NAME       => new ANSEL(),
            ASCII::NAME       => new ASCII(),
            CP437::NAME       => new CP437(),
            CP850::NAME       => new CP850(),
            Windows1250::NAME => new Windows1250(),
            Windows1251::NAME => new Windows1251(),
            Windows1252::NAME => new Windows1252(),
            MacRoman::NAME    => new MacRoman(),
            ISO88591::NAME    => new ISO88591(),
            ISO88592::NAME    => new ISO88592(),
            default           => throw new DomainException('Invalid encoding: ' . $name),
        };
    }

    /**
     * A list of supported encodings and their names.
     *
     * The keys are the encoding names accepted by make(); the values are labels.
     *
     * @return array<string,string>
     */
    public function list(): array
    {
        return [
            UTF8::NAME        => 'UTF-8',
            UTF16BE::NAME     => 'UTF-16BE',
            UTF16LE::NAME     => 'UTF-16LE',
            ANSEL::NAME       => 'ANSEL',
            ASCII::NAME       => 'ASCII',
            ISO88591::NAME    => 'ISO-8859-1',
            ISO88592::NAME    => 'ISO-8859-2',
            Windows1250::NAME => 'Windows 1250',
            Windows1251::NAME => 'Windows 1251',
            Windows1252::NAME => 'Windows 1252',
            CP437::NAME       => 'CP437',
            CP850::NAME       => 'CP850',
            MacRoman::NAME    => 'MacOS Roman',
        ];
    }
}