xref: /webtrees/app/Http/RequestHandlers/GedcomLoad.php (revision 5cd281f4f76e660b2e033b96db47543fa16f7748)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2021 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Http\RequestHandlers;
21
22use Exception;
23use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
24use Fisharebest\Webtrees\Functions\FunctionsImport;
25use Fisharebest\Webtrees\Gedcom;
26use Fisharebest\Webtrees\Http\ViewResponseTrait;
27use Fisharebest\Webtrees\I18N;
28use Fisharebest\Webtrees\Services\TimeoutService;
29use Fisharebest\Webtrees\Services\TreeService;
30use Fisharebest\Webtrees\Tree;
31use Illuminate\Database\Capsule\Manager as DB;
32use Illuminate\Database\Query\Expression;
33use Psr\Http\Message\ResponseInterface;
34use Psr\Http\Message\ServerRequestInterface;
35use Psr\Http\Server\RequestHandlerInterface;
36
37use function assert;
38use function preg_match;
39use function preg_split;
40use function response;
41use function str_replace;
42use function str_starts_with;
43use function strlen;
44use function strtoupper;
45use function substr;
46use function trim;
47use function view;
48
/**
 * Load a chunk of GEDCOM data.
 *
 * Each request imports pending rows of the gedcom_chunk table for one tree,
 * until either no rows remain or the TimeoutService reports that the time
 * limit is up, and then renders a progress (or completion/failure) view.
 */
class GedcomLoad implements RequestHandlerInterface
{
    use ViewResponseTrait;

    /**
     * Maps the upper-cased value of the GEDCOM "1 CHAR" header line to the MySQL
     * character set used to decode it. An empty string means that the data is
     * already UTF-8 and needs no conversion. Labels that are absent from this
     * list (including ANSEL) are rejected as unsupported.
     *
     * MySQL supports a wide range of collation conversions. These are ones that
     * have been encountered "in the wild".
     */
    private const MYSQL_CHARSETS = [
        'ASCII'       => 'ascii',
        // IBMPC, IBM WINDOWS and MS-DOS could be anything. Mostly it means CP850.
        'IBMPC'       => 'cp850',
        'IBM WINDOWS' => 'cp850',
        'MS-DOS'      => 'cp850',
        // CP850 has extra letters with diacritics to replace box-drawing chars in CP437.
        'CP437'       => 'cp850',
        'CP850'       => 'cp850',
        // ANSI could be anything. Most applications seem to treat it as latin1.
        'ANSI'        => 'latin1',
        'WINDOWS'     => 'latin1',
        'CP1252'      => 'latin1',
        // ISO-8859-1 (western european).
        'ISO8859-1'   => 'latin1',
        'ISO-8859-1'  => 'latin1',
        'LATIN1'      => 'latin1',
        'LATIN-1'     => 'latin1',
        // ISO-8859-2 (eastern european).
        'CP1250'      => 'latin2',
        'ISO8859-2'   => 'latin2',
        'ISO-8859-2'  => 'latin2',
        'LATIN2'      => 'latin2',
        'LATIN-2'     => 'latin2',
        // MAC Roman.
        'MACINTOSH'   => 'macroman',
        // Already UTF-8 so nothing to do!
        'UTF8'        => '',
        'UTF-8'       => '',
    ];

    /** @var TimeoutService */
    private $timeout_service;

    /** @var TreeService */
    private $tree_service;

    /**
     * GedcomLoad constructor.
     *
     * @param TimeoutService $timeout_service
     * @param TreeService    $tree_service
     */
    public function __construct(TimeoutService $timeout_service, TreeService $tree_service)
    {
        $this->timeout_service = $timeout_service;
        $this->tree_service    = $tree_service;
    }

    /**
     * Import the next pending chunk(s) of the tree given in the "tree" request attribute.
     *
     * @param ServerRequestInterface $request
     *
     * @return ResponseInterface
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $this->layout = 'layouts/ajax';

        $tree = $request->getAttribute('tree');
        assert($tree instanceof Tree);

        try {
            // Only allow one process to import each gedcom at a time.
            // We only need the row locks, not the data, so select just the ID column
            // rather than loading the chunk_data of every chunk into memory.
            DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->lockForUpdate()
                ->select(['gedcom_chunk_id'])
                ->get();

            // What is the current import status?
            $import_offset = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->where('imported', '=', '1')
                ->count();

            $import_total = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->count();

            // Finished? (This also covers a tree with no chunks, so the division below is safe.)
            if ($import_offset === $import_total) {
                $tree->setPreference('imported', '1');

                $html = view('admin/import-complete', ['tree' => $tree]);

                return response($html);
            }

            // Calculate progress so far - i.e. as it was at the start of this request.
            $progress = $import_offset / $import_total;

            $first_time = ($import_offset === 0);

            // Collect up any errors, and show them later.
            $errors = '';

            // Run for a short period of time. This keeps the resource requirements low.
            do {
                $data = DB::table('gedcom_chunk')
                    ->where('gedcom_id', '=', $tree->id())
                    ->where('imported', '=', '0')
                    ->orderBy('gedcom_chunk_id')
                    ->select(['gedcom_chunk_id', 'chunk_data'])
                    ->first();

                // Nothing left to import. Check this before touching $data.
                if ($data === null) {
                    break;
                }

                // If we are loading the first (header) record, make sure the encoding is UTF-8.
                if ($first_time) {
                    $this->tree_service->deleteGenealogyData($tree, (bool) $tree->getPreference('keep_media'));

                    $error = $this->prepareHeaderChunk($tree, $data);

                    if ($error !== null) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => $error,
                            'tree'  => $tree,
                        ]);
                    }

                    $first_time = false;

                    // Re-fetch the data, now that we have performed character set conversion.
                    $data = DB::table('gedcom_chunk')
                        ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                        ->select(['gedcom_chunk_id', 'chunk_data'])
                        ->first();

                    if ($data === null) {
                        break;
                    }
                }

                $data->chunk_data = str_replace("\r", "\n", $data->chunk_data);

                // Import all the records in this chunk of data.
                // Records are split at line breaks that are followed by a level "0" line.
                foreach (preg_split('/\n+(?=0)/', $data->chunk_data) as $rec) {
                    try {
                        FunctionsImport::importRecord($rec, $tree, false);
                    } catch (GedcomErrorException $exception) {
                        $errors .= $exception->getMessage();
                    }
                }

                // Mark the chunk as imported
                DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->update(['imported' => 1]);
            } while (!$this->timeout_service->isTimeLimitUp());

            return $this->viewResponse('admin/import-progress', [
                'errors'   => $errors,
                'progress' => $progress,
                'tree'     => $tree,
            ]);
        } catch (Exception $ex) {
            DB::connection()->rollBack();

            return $this->viewResponse('admin/import-fail', [
                'error' => $ex->getMessage(),
                'tree'  => $tree,
            ]);
        }
    }

    /**
     * Validate the first (header) chunk of a GEDCOM file, and convert all of
     * the tree's chunks to UTF-8.
     *
     * Any byte-order-mark is stripped from the chunk (both in $data and in the
     * database) before the header and the character set are examined.
     *
     * @param Tree   $tree
     * @param object $data A gedcom_chunk row, with gedcom_chunk_id and chunk_data properties
     *
     * @return string|null A translated error message, or null on success
     */
    private function prepareHeaderChunk(Tree $tree, $data): ?string
    {
        // Remove any byte-order-mark
        if (str_starts_with($data->chunk_data, Gedcom::UTF8_BOM)) {
            $data->chunk_data = substr($data->chunk_data, strlen(Gedcom::UTF8_BOM));
            // Put it back in the database, so we can do character conversion
            DB::table('gedcom_chunk')
                ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                ->update(['chunk_data' => $data->chunk_data]);
        }

        if (!str_starts_with($data->chunk_data, '0 HEAD')) {
            return I18N::translate('Invalid GEDCOM file - no header record found.');
        }

        // What character set is this? Need to convert it to UTF8.
        // A header without a CHAR(ACTER) line is treated as ASCII.
        if (preg_match('/[\r\n][ \t]*1 CHAR(?:ACTER)? ([^\r\n]+)/', $data->chunk_data, $match)) {
            $charset = strtoupper(trim($match[1]));
        } else {
            $charset = 'ASCII';
        }

        if (!isset(self::MYSQL_CHARSETS[$charset])) {
            return I18N::translate('Error: converting GEDCOM files from %s encoding to UTF-8 encoding not currently supported.', $charset);
        }

        $mysql_charset = self::MYSQL_CHARSETS[$charset];

        if ($mysql_charset !== '') {
            // The character set name comes from the constant list above, never from
            // the GEDCOM file itself, so it is safe to embed in a raw SQL expression.
            DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->update(['chunk_data' => new Expression('CONVERT(CONVERT(chunk_data USING ' . $mysql_charset . ') USING utf8)')]);
        }

        return null;
    }
}
257