<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\RequestHandlers;

use Exception;
use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
use Fisharebest\Webtrees\Functions\FunctionsImport;
use Fisharebest\Webtrees\Gedcom;
use Fisharebest\Webtrees\Http\ViewResponseTrait;
use Fisharebest\Webtrees\I18N;
use Fisharebest\Webtrees\Services\TimeoutService;
use Fisharebest\Webtrees\Services\TreeService;
use Fisharebest\Webtrees\Tree;
use Illuminate\Database\Capsule\Manager as DB;
use Illuminate\Database\Query\Expression;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;

use function assert;
use function preg_match;
use function preg_split;
use function response;
use function str_replace;
use function str_starts_with;
use function strlen;
use function strtoupper;
use function substr;
use function trim;
use function view;

/**
 * Load a chunk of GEDCOM data.
 *
 * Each request imports as many pending rows of the gedcom_chunk table as fit
 * within the time limit, then renders a progress, completion or failure view.
 */
class GedcomLoad implements RequestHandlerInterface
{
    use ViewResponseTrait;

    /**
     * GEDCOM character set name (upper case) => MySQL character set to convert from.
     * An empty string means the data is already UTF-8, so no conversion is needed.
     * Names that are not listed here (e.g. ANSEL) are not supported.
     *
     * MySQL supports a wide range of collation conversions. These are ones that
     * have been encountered "in the wild".
     */
    private const MYSQL_CHARSETS = [
        'ASCII'       => 'ascii',
        // IBMPC, IBM WINDOWS and MS-DOS could be anything. Mostly it means CP850.
        'IBMPC'       => 'cp850',
        'IBM WINDOWS' => 'cp850',
        'MS-DOS'      => 'cp850',
        // CP850 has extra letters with diacritics to replace box-drawing chars in CP437.
        'CP437'       => 'cp850',
        'CP850'       => 'cp850',
        // ANSI could be anything. Most applications seem to treat it as latin1.
        'ANSI'        => 'latin1',
        'WINDOWS'     => 'latin1',
        'CP1252'      => 'latin1',
        // ISO-8859-1 (western european).
        'ISO8859-1'   => 'latin1',
        'ISO-8859-1'  => 'latin1',
        'LATIN1'      => 'latin1',
        'LATIN-1'     => 'latin1',
        // ISO-8859-2 (eastern european).
        'CP1250'      => 'latin2',
        'ISO8859-2'   => 'latin2',
        'ISO-8859-2'  => 'latin2',
        'LATIN2'      => 'latin2',
        'LATIN-2'     => 'latin2',
        // MAC Roman.
        'MACINTOSH'   => 'macroman',
        // Already UTF-8 so nothing to do!
        'UTF8'        => '',
        'UTF-8'       => '',
    ];

    /** @var TimeoutService */
    private $timeout_service;

    /** @var TreeService */
    private $tree_service;

    /**
     * GedcomLoad constructor.
     *
     * @param TimeoutService $timeout_service
     * @param TreeService    $tree_service
     */
    public function __construct(TimeoutService $timeout_service, TreeService $tree_service)
    {
        $this->timeout_service = $timeout_service;
        $this->tree_service    = $tree_service;
    }

    /**
     * Import pending GEDCOM chunks for the tree attached to the request.
     *
     * On the first call for an import (no chunk marked as imported yet) the
     * existing genealogy data is deleted, any UTF-8 byte-order-mark is removed,
     * the header is validated and all chunks are converted to UTF-8.
     *
     * @param ServerRequestInterface $request Must carry a Tree in its "tree" attribute.
     *
     * @return ResponseInterface The import-complete, import-progress or import-fail view.
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $this->layout = 'layouts/ajax';

        $tree = $request->getAttribute('tree');
        assert($tree instanceof Tree);

        try {
            // Only allow one process to import each gedcom at a time.
            // Only the key column is selected: the query exists for its lock, and
            // fetching every chunk's data on each request would waste memory.
            DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->select(['gedcom_chunk_id'])
                ->lockForUpdate()
                ->get();

            // What is the current import status?
            $import_offset = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->where('imported', '=', '1')
                ->count();

            $import_total = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->count();

            // Finished? (This also covers a tree with no chunks at all, so the
            // division below can never be by zero.)
            if ($import_offset === $import_total) {
                $tree->setPreference('imported', '1');

                $html = view('admin/import-complete', ['tree' => $tree]);

                return response($html);
            }

            // Calculate progress so far
            $progress = $import_offset / $import_total;

            $first_time = ($import_offset === 0);

            // Collect up any errors, and show them later.
            $errors = '';

            // Run for a short period of time. This keeps the resource requirements low.
            do {
                $data = DB::table('gedcom_chunk')
                    ->where('gedcom_id', '=', $tree->id())
                    ->where('imported', '=', '0')
                    ->orderBy('gedcom_chunk_id')
                    ->select(['gedcom_chunk_id', 'chunk_data'])
                    ->first();

                // Nothing left to import. This must be checked before the header
                // handling below, which reads properties of $data.
                if ($data === null) {
                    break;
                }

                // If we are loading the first (header) record, make sure the encoding is UTF-8.
                if ($first_time) {
                    $this->tree_service->deleteGenealogyData($tree, (bool) $tree->getPreference('keep_media'));

                    // Remove any byte-order-mark
                    if (str_starts_with($data->chunk_data, Gedcom::UTF8_BOM)) {
                        $data->chunk_data = substr($data->chunk_data, strlen(Gedcom::UTF8_BOM));
                        // Put it back in the database, so we can do character conversion
                        DB::table('gedcom_chunk')
                            ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                            ->update(['chunk_data' => $data->chunk_data]);
                    }

                    if (!str_starts_with($data->chunk_data, '0 HEAD')) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Invalid GEDCOM file - no header record found.'),
                            'tree'  => $tree,
                        ]);
                    }

                    // What character set is this? Need to convert it to UTF8
                    if (preg_match('/[\r\n][ \t]*1 CHAR(?:ACTER)? ([^\r\n]+)/', $data->chunk_data, $match)) {
                        $charset = strtoupper(trim($match[1]));
                    } else {
                        $charset = 'ASCII';
                    }

                    $source_charset = self::MYSQL_CHARSETS[$charset] ?? null;

                    if ($source_charset === null) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Error: converting GEDCOM files from %s encoding to UTF-8 encoding not currently supported.', $charset),
                            'tree'  => $tree,
                        ]);
                    }

                    if ($source_charset !== '') {
                        // The character set name comes from the constant table above,
                        // never from the uploaded file, so it is safe to embed in SQL.
                        DB::table('gedcom_chunk')
                            ->where('gedcom_id', '=', $tree->id())
                            ->update(['chunk_data' => new Expression('CONVERT(CONVERT(chunk_data USING ' . $source_charset . ') USING utf8)')]);
                    }

                    $first_time = false;

                    // Re-fetch the data, now that we have performed character set conversion.
                    $data = DB::table('gedcom_chunk')
                        ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                        ->select(['gedcom_chunk_id', 'chunk_data'])
                        ->first();

                    if ($data === null) {
                        break;
                    }
                }

                // Normalise line endings, so that records can be split on "\n".
                $data->chunk_data = str_replace("\r", "\n", $data->chunk_data);

                // Import all the records in this chunk of data
                foreach (preg_split('/\n+(?=0)/', $data->chunk_data) as $rec) {
                    try {
                        FunctionsImport::importRecord($rec, $tree, false);
                    } catch (GedcomErrorException $exception) {
                        // A bad record is reported to the user, but does not stop the import.
                        $errors .= $exception->getMessage();
                    }
                }

                // Mark the chunk as imported
                DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->update(['imported' => 1]);
            } while (!$this->timeout_service->isTimeLimitUp());

            // $progress was calculated before this batch was imported.
            return $this->viewResponse('admin/import-progress', [
                'errors'   => $errors,
                'progress' => $progress,
                'tree'     => $tree,
            ]);
        } catch (Exception $ex) {
            // NOTE(review): presumably undoes a transaction opened outside this
            // handler (none is started here) - confirm against the middleware.
            DB::connection()->rollBack();

            return $this->viewResponse('admin/import-fail', [
                'error' => $ex->getMessage(),
                'tree'  => $tree,
            ]);
        }
    }
}