<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2022 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Factories;

use DomainException;
use Fisharebest\Webtrees\Contracts\EncodingFactoryInterface;
use Fisharebest\Webtrees\Encodings\ANSEL;
use Fisharebest\Webtrees\Encodings\ASCII;
use Fisharebest\Webtrees\Encodings\CP437;
use Fisharebest\Webtrees\Encodings\CP850;
use Fisharebest\Webtrees\Encodings\EncodingInterface;
use Fisharebest\Webtrees\Encodings\ISO88591;
use Fisharebest\Webtrees\Encodings\ISO88592;
use Fisharebest\Webtrees\Encodings\MacRoman;
use Fisharebest\Webtrees\Encodings\UTF16BE;
use Fisharebest\Webtrees\Encodings\UTF16LE;
use Fisharebest\Webtrees\Encodings\UTF8;
use Fisharebest\Webtrees\Encodings\Windows1250;
use Fisharebest\Webtrees\Encodings\Windows1251;
use Fisharebest\Webtrees\Encodings\Windows1252;
use Fisharebest\Webtrees\Exceptions\InvalidGedcomEncodingException;

use function explode;
use function ltrim;
use function preg_match;
use function preg_quote;
use function str_contains;
use function str_starts_with;
use function strstr;
use function strtr;

/**
 * Create an encoding object.
 */
class EncodingFactory implements EncodingFactoryInterface
{
    /**
     * Detect an encoding from a GEDCOM header record.
     *
     * Detection is attempted in this order:
     *  1. a UTF-8 / UTF-16 byte-order-mark at the start of the data;
     *  2. a leading "0" encoded as two bytes (UTF-16 without a byte-order-mark);
     *  3. the value of the "1 CHAR" / "1 CHARACTER" line (optionally combined
     *     with its "2 VERS" line) looked up in a table of known labels.
     *
     * If the header has no CHAR line at all, ASCII is assumed.
     *
     * @param string $header The first bytes of a GEDCOM file.
     *
     * @return EncodingInterface|null The encoding, or null when $header does not
     *                                yet contain a complete header record (i.e.
     *                                no following level-0 record was found).
     * @throws InvalidGedcomEncodingException When a CHAR line is present but its
     *                                        value is not recognised.
     */
    public function detect(string $header): ?EncodingInterface
    {
        $utf_bom = [
            '/^' . UTF8::BYTE_ORDER_MARK . '/'    => UTF8::NAME,
            '/^' . UTF16BE::BYTE_ORDER_MARK . '/' => UTF16BE::NAME,
            '/^' . UTF16LE::BYTE_ORDER_MARK . '/' => UTF16LE::NAME,
        ];

        foreach ($utf_bom as $regex => $encoding) {
            if (preg_match($regex, $header) === 1) {
                return $this->make($encoding);
            }
        }

        // A GEDCOM file starts with the digit "0". In UTF-16 this is a null
        // byte plus "0" - the order of the two bytes gives the endianness.
        $utf16 = [
            "\x000" => UTF16BE::NAME,
            "0\x00" => UTF16LE::NAME,
        ];

        foreach ($utf16 as $start => $encoding) {
            if (str_starts_with($header, $start)) {
                return $this->make($encoding);
            }
        }

        // Standardize whitespace to simplify matching.
        $header = strtr(ltrim($header), ["\r\n" => "\n", "\n\r" => "\n", "\r" => "\n"]);

        // Remove leading/trailing spaces on each line and collapse runs of
        // spaces. Note: the last needle is TWO spaces - testing for a single
        // space would never terminate, as a single space is always left behind.
        while (str_contains($header, "\n ") || str_contains($header, " \n") || str_contains($header, '  ')) {
            $header = strtr($header, ["\n " => "\n", " \n" => "\n", '  ' => ' ']);
        }

        // We need a complete header record
        $header = strstr($header, "\n0", true);

        if ($header === false) {
            return null;
        }

        // Some of these come from Tamura Jones, the rest from webtrees users.
        //
        // ORDER MATTERS: patterns are matched as case-insensitive prefixes of
        // the CHAR value and the first match wins. A pattern must therefore be
        // listed BEFORE any other pattern that is a prefix of it, e.g.
        // "IBM WINDOWS" before "IBM", and "ASCII/..." before "ASCII".
        $character_sets = [
            'ASCII/MacOS Roman' => MacRoman::NAME,    // GEDitCOM
            'ASCII/MACINTOSH'   => MacRoman::NAME,    // MacFamilyTree < 8.3.5
            'ASCII'             => ASCII::NAME,
            'ANSEL'             => ANSEL::NAME,
            'UTF-8'             => UTF8::NAME,
            'UNICODE'           => UTF8::NAME,        // If the null byte test failed, this can't be UTF16
            'MACINTOSH'         => MacRoman::NAME,    // MacFamilyTree >= 8.3.5
            'CP437'             => CP437::NAME,
            'IBMPC'             => CP437::NAME,
            'IBM-PC'            => CP437::NAME,       // CumberlandFamilyTree
            'OEM'               => CP437::NAME,       // Généatique
            'CP850'             => CP850::NAME,
            'MSDOS'             => CP850::NAME,
            'IBM-DOS'           => CP850::NAME,       // Reunion, EasyTree
            'MS-DOS'            => CP850::NAME,       // AbrEdit FTM for Windows
            'ANSI'              => CP850::NAME,
            'IBM WINDOWS'       => CP850::NAME,       // EasyTree, Généalogie, Reunion, TribalPages
            'IBM_WINDOWS'       => CP850::NAME,       // EasyTree
            'IBM'               => CP437::NAME,       // Reunion (must follow every other IBM* entry)
            'CP1250'            => Windows1250::NAME,
            'windows-1250'      => Windows1250::NAME, // GenoPro, Rodokmen Pro
            'CP1251'            => Windows1251::NAME,
            'WINDOWS-1251'      => Windows1251::NAME, // Rodovid
            'WINDOWS'           => CP850::NAME,       // Parentele (must follow the WINDOWS-125x entries)
            'CP1252'            => Windows1252::NAME, // Lifelines
            'ISO-8859-1'        => ISO88591::NAME,    // Cumberland Family Tree, Lifelines
            'ISO8859-1'         => ISO88591::NAME,    // Scion Genealogist
            'LATIN-1'           => ISO88591::NAME,
            'LATIN1'            => ISO88591::NAME,    // GenealogyJ
            'ISO-8859-2'        => ISO88592::NAME,
            'ISO8859-2'         => ISO88592::NAME,
            'LATIN-2'           => ISO88592::NAME,
            'LATIN2'            => ISO88592::NAME,
            'ISO8859'           => ISO88591::NAME,    // Genealogica Grafica (must follow the ISO8859-N entries)
        ];

        foreach ($character_sets as $pattern => $encoding) {
            if (str_contains($pattern, '/')) {
                // "CHAR/VERS" - the encoding is identified by a CHAR line
                // immediately followed by a specific VERS line.
                [$char, $vers] = explode('/', $pattern);
                $regex = "\n1 CHAR " . preg_quote($char, '/') . "\n2 VERS " . preg_quote($vers, '/');
            } else {
                $regex = "\n1 CHAR(?:ACTER)? " . preg_quote($pattern, '/');
            }

            if (preg_match('/' . $regex . '/i', $header) === 1) {
                return $this->make($encoding);
            }
        }

        // A CHAR line exists, but its value is not one that we recognise.
        // Accept the same tag spellings as the lookup above, and only at the
        // start of a line, so that text inside other records cannot match.
        if (preg_match('/(?:^|\n)1 CHAR(?:ACTER)? (.+)/', $header, $match) === 1) {
            throw new InvalidGedcomEncodingException($match[1]);
        }

        // No CHAR line at all.
        return $this->make(ASCII::NAME);
    }

    /**
     * Create a named encoding.
     *
     * @param string $name One of the NAME constants of the supported encoding classes.
     *
     * @return EncodingInterface
     * @throws DomainException When $name is not a supported encoding.
     */
    public function make(string $name): EncodingInterface
    {
        switch ($name) {
            case UTF8::NAME:
                return new UTF8();

            case UTF16BE::NAME:
                return new UTF16BE();

            case UTF16LE::NAME:
                return new UTF16LE();

            case ANSEL::NAME:
                return new ANSEL();

            case ASCII::NAME:
                return new ASCII();

            case CP437::NAME:
                return new CP437();

            case CP850::NAME:
                return new CP850();

            case Windows1250::NAME:
                return new Windows1250();

            case Windows1251::NAME:
                return new Windows1251();

            case Windows1252::NAME:
                return new Windows1252();

            case MacRoman::NAME:
                return new MacRoman();

            case ISO88591::NAME:
                return new ISO88591();

            case ISO88592::NAME:
                return new ISO88592();

            default:
                throw new DomainException('Invalid encoding: ' . $name);
        }
    }

    /**
     * A list of supported encodings and their names.
     *
     * The keys are the encoding NAME constants accepted by make(); the values
     * are the corresponding display labels.
     *
     * @return array<string,string>
     */
    public function list(): array
    {
        return [
            UTF8::NAME        => 'UTF-8',
            UTF16BE::NAME     => 'UTF-16BE',
            UTF16LE::NAME     => 'UTF-16LE',
            ANSEL::NAME       => 'ANSEL',
            ASCII::NAME       => 'ASCII',
            ISO88591::NAME    => 'ISO-8859-1',
            ISO88592::NAME    => 'ISO-8859-2',
            Windows1250::NAME => 'Windows 1250',
            Windows1251::NAME => 'Windows 1251',
            Windows1252::NAME => 'Windows 1252',
            CP437::NAME       => 'CP437',
            CP850::NAME       => 'CP850',
            MacRoman::NAME    => 'MacOS Roman',
        ];
    }
}