xref: /webtrees/app/Http/RequestHandlers/GedcomLoad.php (revision 89f7189b61a494347591c99bdb92afb7d8b66e1b)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2021 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Http\RequestHandlers;
21
22use Exception;
23use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
24use Fisharebest\Webtrees\Functions\FunctionsImport;
25use Fisharebest\Webtrees\Gedcom;
26use Fisharebest\Webtrees\Http\ViewResponseTrait;
27use Fisharebest\Webtrees\I18N;
28use Fisharebest\Webtrees\Services\TimeoutService;
29use Fisharebest\Webtrees\Tree;
30use Illuminate\Database\Capsule\Manager as DB;
31use Illuminate\Database\Query\Expression;
32use Psr\Http\Message\ResponseInterface;
33use Psr\Http\Message\ServerRequestInterface;
34use Psr\Http\Server\RequestHandlerInterface;
35
36use function assert;
37use function preg_match;
38use function preg_split;
39use function response;
40use function str_replace;
41use function str_starts_with;
42use function strlen;
43use function strtoupper;
44use function substr;
45use function trim;
46use function view;
47
48/**
49 * Load a chunk of GEDCOM data.
50 */
51class GedcomLoad implements RequestHandlerInterface
52{
53    use ViewResponseTrait;
54
55    /** @var TimeoutService */
56    private $timeout_service;
57
58    /**
59     * GedcomLoad constructor.
60     *
61     * @param TimeoutService $timeout_service
62     */
63    public function __construct(TimeoutService $timeout_service)
64    {
65        $this->timeout_service = $timeout_service;
66    }
67
68    /**
69     * @param ServerRequestInterface $request
70     *
71     * @return ResponseInterface
72     */
73    public function handle(ServerRequestInterface $request): ResponseInterface
74    {
75        $this->layout = 'layouts/ajax';
76
77        $tree = $request->getAttribute('tree');
78        assert($tree instanceof Tree);
79
80        try {
81            // Only allow one process to import each gedcom at a time
82            DB::table('gedcom_chunk')
83                ->where('gedcom_id', '=', $tree->id())
84                ->lockForUpdate()
85                ->get();
86
87            // What is the current import status?
88            $import_offset = DB::table('gedcom_chunk')
89                ->where('gedcom_id', '=', $tree->id())
90                ->where('imported', '=', '1')
91                ->count();
92
93            $import_total = DB::table('gedcom_chunk')
94                ->where('gedcom_id', '=', $tree->id())
95                ->count();
96
97            // Finished?
98            if ($import_offset === $import_total) {
99                $tree->setPreference('imported', '1');
100
101                $html = view('admin/import-complete', ['tree' => $tree]);
102
103                return response($html);
104            }
105
106            // Calculate progress so far
107            $progress = $import_offset / $import_total;
108
109            $first_time = ($import_offset === 0);
110
111            // Collect up any errors, and show them later.
112            $errors = '';
113
114            // Run for a short period of time. This keeps the resource requirements low.
115            do {
116                $data = DB::table('gedcom_chunk')
117                    ->where('gedcom_id', '=', $tree->id())
118                    ->where('imported', '=', '0')
119                    ->orderBy('gedcom_chunk_id')
120                    ->select(['gedcom_chunk_id', 'chunk_data'])
121                    ->first();
122
123                // If we are loading the first (header) record, make sure the encoding is UTF-8.
124                if ($first_time) {
125                    // Remove any byte-order-mark
126                    if (str_starts_with($data->chunk_data, Gedcom::UTF8_BOM)) {
127                        $data->chunk_data = substr($data->chunk_data, strlen(Gedcom::UTF8_BOM));
128                        // Put it back in the database, so we can do character conversion
129                        DB::table('gedcom_chunk')
130                            ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
131                            ->update(['chunk_data' => $data->chunk_data]);
132                    }
133
134                    if (!str_starts_with($data->chunk_data, '0 HEAD')) {
135                        return $this->viewResponse('admin/import-fail', [
136                            'error' => I18N::translate('Invalid GEDCOM file - no header record found.'),
137                            'tree'  => $tree,
138                        ]);
139                    }
140
141                    // What character set is this? Need to convert it to UTF8
142                    if (preg_match('/[\r\n][ \t]*1 CHAR(?:ACTER)? ([^\r\n]+)/', $data->chunk_data, $match)) {
143                        $charset = strtoupper(trim($match[1]));
144                    } else {
145                        $charset = 'ASCII';
146                    }
147                    // MySQL supports a wide range of collation conversions. These are ones that
148                    // have been encountered "in the wild".
149                    switch ($charset) {
150                        case 'ASCII':
151                            DB::table('gedcom_chunk')
152                                ->where('gedcom_id', '=', $tree->id())
153                                ->update(['chunk_data' => new Expression('CONVERT(CONVERT(chunk_data USING ascii) USING utf8)')]);
154                            break;
155                        case 'IBMPC':   // IBMPC, IBM WINDOWS and MS-DOS could be anything. Mostly it means CP850.
156                        case 'IBM WINDOWS':
157                        case 'MS-DOS':
158                        case 'CP437':
159                        case 'CP850':
160                            // CP850 has extra letters with diacritics to replace box-drawing chars in CP437.
161                            DB::table('gedcom_chunk')
162                                ->where('gedcom_id', '=', $tree->id())
163                                ->update(['chunk_data' => new Expression('CONVERT(CONVERT(chunk_data USING cp850) USING utf8)')]);
164                            break;
165                        case 'ANSI': // ANSI could be anything. Most applications seem to treat it as latin1.
166                        case 'WINDOWS':
167                        case 'CP1252':
168                        case 'ISO8859-1':
169                        case 'ISO-8859-1':
170                        case 'LATIN1':
171                        case 'LATIN-1':
172                            // Convert from ISO-8859-1 (western european) to UTF8.
173                            DB::table('gedcom_chunk')
174                                ->where('gedcom_id', '=', $tree->id())
175                                ->update(['chunk_data' => new Expression('CONVERT(CONVERT(chunk_data USING latin1) USING utf8)')]);
176                            break;
177                        case 'CP1250':
178                        case 'ISO8859-2':
179                        case 'ISO-8859-2':
180                        case 'LATIN2':
181                        case 'LATIN-2':
182                            // Convert from ISO-8859-2 (eastern european) to UTF8.
183                            DB::table('gedcom_chunk')
184                                ->where('gedcom_id', '=', $tree->id())
185                                ->update(['chunk_data' => new Expression('CONVERT(CONVERT(chunk_data USING latin2) USING utf8)')]);
186                            break;
187                        case 'MACINTOSH':
188                            // Convert from MAC Roman to UTF8.
189                            DB::table('gedcom_chunk')
190                                ->where('gedcom_id', '=', $tree->id())
191                                ->update(['chunk_data' => new Expression('CONVERT(CONVERT(chunk_data USING macroman) USING utf8)')]);
192                            break;
193                        case 'UTF8':
194                        case 'UTF-8':
195                            // Already UTF-8 so nothing to do!
196                            break;
197                        case 'ANSEL':
198                        default:
199                            return $this->viewResponse('admin/import-fail', [
200                                'error' => I18N::translate('Error: converting GEDCOM files from %s encoding to UTF-8 encoding not currently supported.', $charset),
201                                'tree'  => $tree,
202                            ]);
203                    }
204                    $first_time = false;
205
206                    // Re-fetch the data, now that we have performed character set conversion.
207                    $data = DB::table('gedcom_chunk')
208                        ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
209                        ->select(['gedcom_chunk_id', 'chunk_data'])
210                        ->first();
211                }
212
213                if (!$data) {
214                    break;
215                }
216
217                $data->chunk_data = str_replace("\r", "\n", $data->chunk_data);
218
219                // Import all the records in this chunk of data
220                foreach (preg_split('/\n+(?=0)/', $data->chunk_data) as $rec) {
221                    try {
222                        FunctionsImport::importRecord($rec, $tree, false);
223                    } catch (GedcomErrorException $exception) {
224                        $errors .= $exception->getMessage();
225                    }
226                }
227
228                // Mark the chunk as imported
229                DB::table('gedcom_chunk')
230                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
231                    ->update(['imported' => 1]);
232            } while (!$this->timeout_service->isTimeLimitUp());
233
234            return $this->viewResponse('admin/import-progress', [
235                'errors'   => $errors,
236                'progress' => $progress,
237                'tree'     => $tree,
238            ]);
239        } catch (Exception $ex) {
240            DB::connection()->rollBack();
241
242            return $this->viewResponse('admin/import-fail', [
243                'error' => $ex->getMessage(),
244                'tree'  => $tree,
245            ]);
246        }
247    }
248}
249