xref: /webtrees/app/Http/RequestHandlers/GedcomLoad.php (revision 5cd281f4f76e660b2e033b96db47543fa16f7748)
16fd01894SGreg Roach<?php
26fd01894SGreg Roach
36fd01894SGreg Roach/**
46fd01894SGreg Roach * webtrees: online genealogy
589f7189bSGreg Roach * Copyright (C) 2021 webtrees development team
66fd01894SGreg Roach * This program is free software: you can redistribute it and/or modify
76fd01894SGreg Roach * it under the terms of the GNU General Public License as published by
86fd01894SGreg Roach * the Free Software Foundation, either version 3 of the License, or
96fd01894SGreg Roach * (at your option) any later version.
106fd01894SGreg Roach * This program is distributed in the hope that it will be useful,
116fd01894SGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
126fd01894SGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
136fd01894SGreg Roach * GNU General Public License for more details.
146fd01894SGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
166fd01894SGreg Roach */
176fd01894SGreg Roach
186fd01894SGreg Roachdeclare(strict_types=1);
196fd01894SGreg Roach
206fd01894SGreg Roachnamespace Fisharebest\Webtrees\Http\RequestHandlers;
216fd01894SGreg Roach
226fd01894SGreg Roachuse Exception;
236fd01894SGreg Roachuse Fisharebest\Webtrees\Exceptions\GedcomErrorException;
246fd01894SGreg Roachuse Fisharebest\Webtrees\Functions\FunctionsImport;
256fd01894SGreg Roachuse Fisharebest\Webtrees\Gedcom;
266fd01894SGreg Roachuse Fisharebest\Webtrees\Http\ViewResponseTrait;
276fd01894SGreg Roachuse Fisharebest\Webtrees\I18N;
286fd01894SGreg Roachuse Fisharebest\Webtrees\Services\TimeoutService;
29*5cd281f4SGreg Roachuse Fisharebest\Webtrees\Services\TreeService;
306fd01894SGreg Roachuse Fisharebest\Webtrees\Tree;
316fd01894SGreg Roachuse Illuminate\Database\Capsule\Manager as DB;
326fd01894SGreg Roachuse Illuminate\Database\Query\Expression;
336fd01894SGreg Roachuse Psr\Http\Message\ResponseInterface;
346fd01894SGreg Roachuse Psr\Http\Message\ServerRequestInterface;
356fd01894SGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
366fd01894SGreg Roach
376fd01894SGreg Roachuse function assert;
386fd01894SGreg Roachuse function preg_match;
396fd01894SGreg Roachuse function preg_split;
406fd01894SGreg Roachuse function response;
416fd01894SGreg Roachuse function str_replace;
426fd01894SGreg Roachuse function str_starts_with;
436fd01894SGreg Roachuse function strlen;
446fd01894SGreg Roachuse function strtoupper;
456fd01894SGreg Roachuse function substr;
466fd01894SGreg Roachuse function trim;
476fd01894SGreg Roachuse function view;
486fd01894SGreg Roach
/**
 * Load a chunk of GEDCOM data.
 *
 * Each request imports pending rows of the gedcom_chunk table for one tree
 * until the timeout service reports that the time limit is up, then renders
 * a progress (or completion / failure) view.
 */
class GedcomLoad implements RequestHandlerInterface
{
    use ViewResponseTrait;

    /** @var TimeoutService Decides when this request should stop importing. */
    private $timeout_service;

    /** @var TreeService Used to delete the existing genealogy data before the first chunk is imported. */
    private $tree_service;

    /**
     * GedcomLoad constructor.
     *
     * @param TimeoutService $timeout_service
     * @param TreeService    $tree_service
     */
    public function __construct(TimeoutService $timeout_service, TreeService $tree_service)
    {
        $this->timeout_service = $timeout_service;
        $this->tree_service    = $tree_service;
    }

    /**
     * Import as many pending chunks as the time limit allows.
     *
     * Renders "admin/import-complete" when every chunk is already imported,
     * "admin/import-fail" when the header is invalid, the character set is
     * unsupported or an exception is thrown, and "admin/import-progress"
     * otherwise.
     *
     * @param ServerRequestInterface $request Must carry a Tree in its "tree" attribute
     *
     * @return ResponseInterface
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $this->layout = 'layouts/ajax';

        $tree = $request->getAttribute('tree');
        assert($tree instanceof Tree);

        try {
            // Only allow one process to import each gedcom at a time
            DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->lockForUpdate()
                ->get();

            // What is the current import status?
            $import_offset = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->where('imported', '=', '1')
                ->count();

            $import_total = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->count();

            // Finished?
            if ($import_offset === $import_total) {
                $tree->setPreference('imported', '1');

                $html = view('admin/import-complete', ['tree' => $tree]);

                return response($html);
            }

            // Calculate progress so far (measured before this request imports anything).
            $progress = $import_offset / $import_total;

            // Nothing imported yet means the next chunk is the one holding the header.
            $first_time = ($import_offset === 0);

            // Collect up any errors, and show them later.
            $errors = '';

            // Run for a short period of time. This keeps the resource requirements low.
            do {
                $data = DB::table('gedcom_chunk')
                    ->where('gedcom_id', '=', $tree->id())
                    ->where('imported', '=', '0')
                    ->orderBy('gedcom_chunk_id')
                    ->select(['gedcom_chunk_id', 'chunk_data'])
                    ->first();

                // Nothing left to import. Check this before touching $data, so that
                // the header handling below never dereferences a missing row.
                if ($data === null) {
                    break;
                }

                // If we are loading the first (header) record, make sure the encoding is UTF-8.
                if ($first_time) {
                    // NOTE(review): the old data is deleted before the header and character
                    // set are validated - confirm that this ordering is intended.
                    $this->tree_service->deleteGenealogyData($tree, (bool) $tree->getPreference('keep_media'));

                    // Remove any byte-order-mark
                    if (str_starts_with($data->chunk_data, Gedcom::UTF8_BOM)) {
                        $data->chunk_data = substr($data->chunk_data, strlen(Gedcom::UTF8_BOM));
                        // Put it back in the database, so we can do character conversion
                        DB::table('gedcom_chunk')
                            ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                            ->update(['chunk_data' => $data->chunk_data]);
                    }

                    if (!str_starts_with($data->chunk_data, '0 HEAD')) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Invalid GEDCOM file - no header record found.'),
                            'tree'  => $tree,
                        ]);
                    }

                    // What character set is this? Need to convert it to UTF8
                    if (preg_match('/[\r\n][ \t]*1 CHAR(?:ACTER)? ([^\r\n]+)/', $data->chunk_data, $match)) {
                        $charset = strtoupper(trim($match[1]));
                    } else {
                        $charset = 'ASCII';
                    }

                    if (!$this->convertChunksToUtf8($tree, $charset)) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Error: converting GEDCOM files from %s encoding to UTF-8 encoding not currently supported.', $charset),
                            'tree'  => $tree,
                        ]);
                    }

                    $first_time = false;

                    // Re-fetch the data, now that we have performed character set conversion.
                    $data = DB::table('gedcom_chunk')
                        ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                        ->select(['gedcom_chunk_id', 'chunk_data'])
                        ->first();

                    if ($data === null) {
                        break;
                    }
                }

                // Normalise line endings, so that records can be split on "\n".
                $data->chunk_data = str_replace("\r", "\n", $data->chunk_data);

                // Import all the records in this chunk of data
                foreach (preg_split('/\n+(?=0)/', $data->chunk_data) as $rec) {
                    try {
                        FunctionsImport::importRecord($rec, $tree, false);
                    } catch (GedcomErrorException $exception) {
                        // A bad record is reported, but does not stop the import.
                        $errors .= $exception->getMessage();
                    }
                }

                // Mark the chunk as imported
                DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->update(['imported' => 1]);
            } while (!$this->timeout_service->isTimeLimitUp());

            return $this->viewResponse('admin/import-progress', [
                'errors'   => $errors,
                'progress' => $progress,
                'tree'     => $tree,
            ]);
        } catch (Exception $ex) {
            DB::connection()->rollBack();

            return $this->viewResponse('admin/import-fail', [
                'error' => $ex->getMessage(),
                'tree'  => $tree,
            ]);
        }
    }

    /**
     * Convert every chunk of a tree from the given GEDCOM character set to UTF-8.
     *
     * MySQL supports a wide range of collation conversions. These are ones that
     * have been encountered "in the wild".
     *
     * @param Tree   $tree
     * @param string $charset Upper-case, trimmed value of the header's CHAR tag
     *
     * @return bool false when the character set is not supported; true when the
     *              data was converted or was already UTF-8
     */
    private function convertChunksToUtf8(Tree $tree, string $charset): bool
    {
        switch ($charset) {
            case 'UTF8':
            case 'UTF-8':
                // Already UTF-8 so nothing to do!
                return true;
            case 'ASCII':
                $mysql_charset = 'ascii';
                break;
            case 'IBMPC':   // IBMPC, IBM WINDOWS and MS-DOS could be anything. Mostly it means CP850.
            case 'IBM WINDOWS':
            case 'MS-DOS':
            case 'CP437':
            case 'CP850':
                // CP850 has extra letters with diacritics to replace box-drawing chars in CP437.
                $mysql_charset = 'cp850';
                break;
            case 'ANSI': // ANSI could be anything. Most applications seem to treat it as latin1.
            case 'WINDOWS':
            case 'CP1252':
            case 'ISO8859-1':
            case 'ISO-8859-1':
            case 'LATIN1':
            case 'LATIN-1':
                // ISO-8859-1 (western european).
                $mysql_charset = 'latin1';
                break;
            case 'CP1250':
            case 'ISO8859-2':
            case 'ISO-8859-2':
            case 'LATIN2':
            case 'LATIN-2':
                // ISO-8859-2 (eastern european).
                $mysql_charset = 'latin2';
                break;
            case 'MACINTOSH':
                // MAC Roman.
                $mysql_charset = 'macroman';
                break;
            case 'ANSEL':
            default:
                return false;
        }

        // $mysql_charset is always one of the literals above, never user input,
        // so it is safe to place it in the raw SQL expression.
        DB::table('gedcom_chunk')
            ->where('gedcom_id', '=', $tree->id())
            ->update(['chunk_data' => new Expression('CONVERT(CONVERT(chunk_data USING ' . $mysql_charset . ') USING utf8)')]);

        return true;
    }
}
257