. */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Factories;

use DomainException;
use Fisharebest\Webtrees\Contracts\EncodingFactoryInterface;
use Fisharebest\Webtrees\Encodings\ANSEL;
use Fisharebest\Webtrees\Encodings\ASCII;
use Fisharebest\Webtrees\Encodings\CP437;
use Fisharebest\Webtrees\Encodings\CP850;
use Fisharebest\Webtrees\Encodings\EncodingInterface;
use Fisharebest\Webtrees\Encodings\ISO88591;
use Fisharebest\Webtrees\Encodings\ISO88592;
use Fisharebest\Webtrees\Encodings\MacRoman;
use Fisharebest\Webtrees\Encodings\UTF16BE;
use Fisharebest\Webtrees\Encodings\UTF16LE;
use Fisharebest\Webtrees\Encodings\UTF8;
use Fisharebest\Webtrees\Encodings\Windows1250;
use Fisharebest\Webtrees\Encodings\Windows1251;
use Fisharebest\Webtrees\Encodings\Windows1252;
use Fisharebest\Webtrees\Exceptions\InvalidGedcomEncodingException;

use function explode;
use function ltrim;
use function preg_match;
use function str_contains;
use function str_starts_with;
use function strlen;
use function strstr;
use function strtr;
use function uksort;

/**
 * Create an encoding object.
 */
class EncodingFactory implements EncodingFactoryInterface
{
    /**
     * Detect an encoding from a GEDCOM header record.
     *
     * Detection order:
     *  1. a UTF-8 / UTF-16 byte-order mark at the start of the data;
     *  2. a NUL byte before/after a leading "0" (UTF-16 without a BOM);
     *  3. the "1 CHAR" / "1 CHARACTER" line (optionally qualified by the
     *     following "2 VERS" line) of the header record.
     *
     * @param string $header The first bytes of a GEDCOM file.
     *
     * @return EncodingInterface|null The detected encoding (UTF-8 when the header has
     *                                no "1 CHAR" line), or null when $header does not
     *                                contain a complete header record, i.e. no
     *                                following level-0 record was found.
     *
     * @throws InvalidGedcomEncodingException When the header has a "1 CHAR" line whose
     *                                        value matches none of the known names.
     */
    public function detect(string $header): EncodingInterface|null
    {
        $utf_bom = [
            '/^' . UTF8::BYTE_ORDER_MARK . '/'    => UTF8::NAME,
            '/^' . UTF16BE::BYTE_ORDER_MARK . '/' => UTF16BE::NAME,
            '/^' . UTF16LE::BYTE_ORDER_MARK . '/' => UTF16LE::NAME,
        ];

        foreach ($utf_bom as $regex => $encoding) {
            if (preg_match($regex, $header) === 1) {
                return $this->make($encoding);
            }
        }

        // No BOM: a NUL byte next to the leading "0" still identifies UTF-16.
        $utf16 = [
            "\x000" => UTF16BE::NAME,
            "0\x00" => UTF16LE::NAME,
        ];

        foreach ($utf16 as $start => $encoding) {
            if (str_starts_with($header, $start)) {
                return $this->make($encoding);
            }
        }

        // Standardize whitespace to simplify matching.
        $header = strtr(ltrim($header), ["\r\n" => "\n", "\n\r" => "\n", "\r" => "\n"]);

        // Strip spaces around line breaks and collapse runs of spaces to one.
        // The third needle is TWO spaces: a single space would never disappear
        // from the header and this loop would not terminate.
        while (str_contains($header, "\n ") || str_contains($header, " \n") || str_contains($header, '  ')) {
            $header = strtr($header, ["\n " => "\n", " \n" => "\n", '  ' => ' ']);
        }

        // We need a complete header record
        $header = strstr($header, "\n0", true);

        if ($header === false) {
            return null;
        }

        // Some of these come from Tamura Jones, the rest from webtrees users.
        // A key of the form "CHAR/VERS" must match both the CHAR and the VERS line.
        $character_sets = [
            'ASCII'             => ASCII::NAME,
            'ANSEL'             => ANSEL::NAME,
            'UTF-8'             => UTF8::NAME,
            'UNICODE'           => UTF8::NAME, // If the null byte test failed, this can't be UTF16
            'ASCII/MacOS Roman' => MacRoman::NAME, // GEDitCOM
            'ASCII/MACINTOSH'   => MacRoman::NAME, // MacFamilyTree < 8.3.5
            'MACINTOSH'         => MacRoman::NAME, // MacFamilyTree >= 8.3.5
            'CP437'             => CP437::NAME,
            'IBMPC'             => CP437::NAME,
            'IBM'               => CP437::NAME, // Reunion
            'IBM-PC'            => CP437::NAME, // CumberlandFamilyTree
            'OEM'               => CP437::NAME, // Généatique
            'CP850'             => CP850::NAME,
            'MSDOS'             => CP850::NAME,
            'IBM-DOS'           => CP850::NAME, // Reunion, EasyTree
            'MS-DOS'            => CP850::NAME, // AbrEdit FTM for Windows
            'ANSI'              => CP850::NAME,
            'WINDOWS'           => CP850::NAME, // Parentele
            'IBM WINDOWS'       => CP850::NAME, // EasyTree, Généalogie, Reunion, TribalPages
            'IBM_WINDOWS'       => CP850::NAME, // EasyTree
            'CP1250'            => Windows1250::NAME,
            'windows-1250'      => Windows1250::NAME, // GenoPro, Rodokmen Pro
            'CP1251'            => Windows1251::NAME,
            'WINDOWS-1251'      => Windows1251::NAME, // Rodovid
            'CP1252'            => Windows1252::NAME, // Lifelines
            'ISO-8859-1'        => ISO88591::NAME, // Cumberland Family Tree, Lifelines
            'ISO8859-1'         => ISO88591::NAME, // Scion Genealogist
            'ISO8859'           => ISO88591::NAME, // Genealogica Grafica
            'LATIN-1'           => ISO88591::NAME,
            'LATIN1'            => ISO88591::NAME, // GenealogyJ
            'ISO-8859-2'        => ISO88592::NAME,
            'ISO8859-2'         => ISO88592::NAME,
            'LATIN-2'           => ISO88592::NAME,
            'LATIN2'            => ISO88592::NAME,
        ];

        // The patterns below are not anchored at the end of the line, so a short
        // name also matches every longer name that starts with it ("IBM" matches
        // "IBM-DOS", "WINDOWS" matches "windows-1250", "ASCII" matches the CHAR
        // line of "ASCII/MacOS Roman", ...). Try the longest names first so the
        // most specific entry wins. Sorting is stable (PHP >= 8.0), so names of
        // equal length keep the order in which they are declared above.
        uksort($character_sets, static fn (string $a, string $b): int => strlen($b) <=> strlen($a));

        foreach ($character_sets as $pattern => $encoding) {
            if (str_contains($pattern, '/')) {
                [$char, $vers] = explode('/', $pattern);
                $regex = "\n1 CHAR " . $char . "\n2 VERS " . $vers;
            } else {
                $regex = "\n1 CHAR(?:ACTER)? " . $pattern;
            }

            if (preg_match('/' . $regex . '/i', $header) === 1) {
                return $this->make($encoding);
            }
        }

        // A character set was declared, but it is not one that we recognise.
        if (preg_match('/1 CHAR (.+)/', $header, $match) === 1) {
            throw new InvalidGedcomEncodingException($match[1]);
        }

        // Nothing was declared: fall back to UTF-8.
        return $this->make(UTF8::NAME);
    }

    /**
     * Create a named encoding.
     *
     * @param string $name One of the NAME constants of the supported encoding classes.
     *
     * @return EncodingInterface
     * @throws DomainException When $name is not the name of a supported encoding.
     */
    public function make(string $name): EncodingInterface
    {
        return match ($name) {
            UTF8::NAME        => new UTF8(),
            UTF16BE::NAME     => new UTF16BE(),
            UTF16LE::NAME     => new UTF16LE(),
            ANSEL::NAME       => new ANSEL(),
            ASCII::NAME       => new ASCII(),
            CP437::NAME       => new CP437(),
            CP850::NAME       => new CP850(),
            Windows1250::NAME => new Windows1250(),
            Windows1251::NAME => new Windows1251(),
            Windows1252::NAME => new Windows1252(),
            MacRoman::NAME    => new MacRoman(),
            ISO88591::NAME    => new ISO88591(),
            ISO88592::NAME    => new ISO88592(),
            default           => throw new DomainException('Invalid encoding: ' . $name),
        };
    }

    /**
     * A list of supported encodings and their names.
     *
     * @return array<string,string> Display name of each encoding, keyed by its NAME constant.
     */
    public function list(): array
    {
        return [
            UTF8::NAME        => 'UTF-8',
            UTF16BE::NAME     => 'UTF-16BE',
            UTF16LE::NAME     => 'UTF-16LE',
            ANSEL::NAME       => 'ANSEL',
            ASCII::NAME       => 'ASCII',
            ISO88591::NAME    => 'ISO-8859-1',
            ISO88592::NAME    => 'ISO-8859-2',
            Windows1250::NAME => 'Windows 1250',
            Windows1251::NAME => 'Windows 1251',
            Windows1252::NAME => 'Windows 1252',
            CP437::NAME       => 'CP437',
            CP850::NAME       => 'CP850',
            MacRoman::NAME    => 'MacOS Roman',
        ];
    }
}