<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\RequestHandlers;

use Exception;
use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
use Fisharebest\Webtrees\Functions\FunctionsImport;
use Fisharebest\Webtrees\Gedcom;
use Fisharebest\Webtrees\Http\ViewResponseTrait;
use Fisharebest\Webtrees\I18N;
use Fisharebest\Webtrees\Services\TimeoutService;
use Fisharebest\Webtrees\Services\TreeService;
use Fisharebest\Webtrees\Tree;
use Illuminate\Database\Capsule\Manager as DB;
use Illuminate\Database\DetectsDeadlocks;
use Illuminate\Database\Query\Expression;
use Illuminate\Support\Str;
use PDOException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;

use function assert;
use function preg_last_error;
use function preg_match;
use function preg_split;
use function response;
use function str_replace;
use function str_starts_with;
use function strlen;
use function strtoupper;
use function substr;
use function trim;
use function view;

/**
 * Load a chunk of GEDCOM data.
 *
 * Each request imports as many pending rows of the gedcom_chunk table as fit
 * in the time allowed by the TimeoutService, then renders a progress view.
 */
class GedcomLoad implements RequestHandlerInterface
{
    use ViewResponseTrait;
    use DetectsDeadlocks;

    /**
     * GEDCOM character set names (upper case) and the MySQL character set used
     * to decode them. MySQL supports a wide range of conversions. These are
     * the ones that have been encountered "in the wild".
     *
     * The value 'utf8' means the data is already UTF-8 and is left untouched.
     * Names that are not listed (including ANSEL) are not supported.
     */
    private const MYSQL_CHARSETS = [
        'ASCII'       => 'ascii',
        // IBMPC, IBM WINDOWS and MS-DOS could be anything. Mostly it means CP850.
        // CP850 has extra letters with diacritics to replace box-drawing chars in CP437.
        'IBMPC'       => 'cp850',
        'IBM WINDOWS' => 'cp850',
        'MS-DOS'      => 'cp850',
        'CP437'       => 'cp850',
        'CP850'       => 'cp850',
        // ANSI could be anything. Most applications seem to treat it as latin1.
        // ISO-8859-1 (western european).
        'ANSI'        => 'latin1',
        'WINDOWS'     => 'latin1',
        'CP1252'      => 'latin1',
        'ISO8859-1'   => 'latin1',
        'ISO-8859-1'  => 'latin1',
        'LATIN1'      => 'latin1',
        'LATIN-1'     => 'latin1',
        // ISO-8859-2 (eastern european).
        'CP1250'      => 'latin2',
        'ISO8859-2'   => 'latin2',
        'ISO-8859-2'  => 'latin2',
        'LATIN2'      => 'latin2',
        'LATIN-2'     => 'latin2',
        // MAC Roman.
        'MACINTOSH'   => 'macroman',
        // Already UTF-8 so nothing to do!
        'UTF8'        => 'utf8',
        'UTF-8'       => 'utf8',
    ];

    /** @var TimeoutService */
    private $timeout_service;

    /** @var TreeService */
    private $tree_service;

    /**
     * GedcomLoad constructor.
     *
     * @param TimeoutService $timeout_service
     * @param TreeService    $tree_service
     */
    public function __construct(TimeoutService $timeout_service, TreeService $tree_service)
    {
        $this->timeout_service = $timeout_service;
        $this->tree_service    = $tree_service;
    }

    /**
     * Import pending chunks for the tree found in the request's 'tree' attribute.
     *
     * Renders 'admin/import-complete' when every chunk is already imported,
     * 'admin/import-progress' after importing some chunks (or after a deadlock),
     * and 'admin/import-fail' for a bad header, an unsupported character set,
     * or any other exception.
     *
     * @param ServerRequestInterface $request
     *
     * @return ResponseInterface
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $this->layout = 'layouts/ajax';

        $tree = $request->getAttribute('tree');
        assert($tree instanceof Tree);

        try {
            // What is the current import status?
            $import_offset = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->where('imported', '=', '1')
                ->count();

            $import_total = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->count();

            // Finished? (This also covers a tree with no chunks, so the division below is safe.)
            if ($import_offset === $import_total) {
                $tree->setPreference('imported', '1');

                $html = view('admin/import-complete', ['tree' => $tree]);

                return response($html);
            }

            // Calculate progress so far. This is the progress at the start of the request.
            $progress = $import_offset / $import_total;

            // Nothing imported yet, so the next chunk is the one holding the header record.
            $first_time = ($import_offset === 0);

            // Collect up any errors, and show them later.
            $errors = '';

            // Run for a short period of time. This keeps the resource requirements low.
            do {
                $data = DB::table('gedcom_chunk')
                    ->where('gedcom_id', '=', $tree->id())
                    ->where('imported', '=', '0')
                    ->orderBy('gedcom_chunk_id')
                    ->select(['gedcom_chunk_id', 'chunk_data'])
                    ->first();

                if ($data === null) {
                    break;
                }

                // Mark the chunk as imported. This will create a row-lock, to prevent other
                // processes from reading it until we have finished.
                $n = DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->where('imported', '=', '0')
                    ->update(['imported' => 1]);

                // Another process has already imported this data?
                if ($n === 0) {
                    break;
                }

                // If we are loading the first (header) record, then delete old data and convert to UTF-8.
                if ($first_time) {
                    $this->tree_service->deleteGenealogyData($tree, (bool) $tree->getPreference('keep_media'));

                    // Remove any byte-order-mark
                    if (str_starts_with($data->chunk_data, Gedcom::UTF8_BOM)) {
                        $data->chunk_data = substr($data->chunk_data, strlen(Gedcom::UTF8_BOM));
                        // Put it back in the database, so we can do character conversion
                        DB::table('gedcom_chunk')
                            ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                            ->update(['chunk_data' => $data->chunk_data]);
                    }

                    if (!str_starts_with($data->chunk_data, '0 HEAD')) {
                        return $this->importFailed(
                            $tree,
                            I18N::translate('Invalid GEDCOM file - no header record found.')
                        );
                    }

                    // What character set is this? Need to convert it to UTF8
                    $charset = $this->detectCharset($data->chunk_data);

                    if (!$this->convertChunksToUtf8($tree, $charset)) {
                        return $this->importFailed(
                            $tree,
                            I18N::translate('Error: converting GEDCOM files from %s encoding to UTF-8 encoding not currently supported.', $charset)
                        );
                    }

                    $first_time = false;

                    // Re-fetch the data, now that we have performed character set conversion.
                    $data = DB::table('gedcom_chunk')
                        ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                        ->select(['gedcom_chunk_id', 'chunk_data'])
                        ->first();
                }

                // Import all the records in this chunk of data
                $errors .= $this->importChunk($tree, $data->chunk_data);

                // Do not need the data any more.
                DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->update(['chunk_data' => '']);
            } while (!$this->timeout_service->isTimeLimitUp());

            return $this->viewResponse('admin/import-progress', [
                'errors'   => $errors,
                'progress' => $progress,
                'tree'     => $tree,
            ]);
        } catch (Exception $ex) {
            // NOTE(review): no transaction is opened in this class; presumably the
            // caller (e.g. middleware) has started one - confirm.
            DB::connection()->rollBack();

            // Deadlock? Try again.
            if ($this->causedByDeadlock($ex)) {
                return $this->viewResponse('admin/import-progress', [
                    'errors'   => '',
                    // The exception may have been thrown before progress was calculated.
                    'progress' => $progress ?? 0.0,
                    'tree'     => $tree,
                ]);
            }

            return $this->importFailed($tree, $ex->getMessage());
        }
    }

    /**
     * Read the character set declared by the "1 CHAR" / "1 CHARACTER" line of a header chunk.
     *
     * @param string $header_chunk
     *
     * @return string The declared name, trimmed and upper-cased, or 'ASCII' when none is declared
     */
    private function detectCharset(string $header_chunk): string
    {
        if (preg_match('/[\r\n][ \t]*1 CHAR(?:ACTER)? ([^\r\n]+)/', $header_chunk, $match)) {
            return strtoupper(trim($match[1]));
        }

        return 'ASCII';
    }

    /**
     * Convert every chunk of a tree from the given GEDCOM character set to UTF-8.
     *
     * @param Tree   $tree
     * @param string $charset Upper-case GEDCOM character set name
     *
     * @return bool False when the character set is not supported (nothing is changed)
     */
    private function convertChunksToUtf8(Tree $tree, string $charset): bool
    {
        $mysql_charset = self::MYSQL_CHARSETS[$charset] ?? null;

        if ($mysql_charset === null) {
            return false;
        }

        if ($mysql_charset === 'utf8') {
            // Already UTF-8 so nothing to do!
            return true;
        }

        // $mysql_charset can only be a value from the constant table above, never
        // user input, so it is safe to place it in the raw SQL expression.
        $sql = 'CONVERT(CONVERT(chunk_data USING ' . $mysql_charset . ') USING utf8)';

        DB::table('gedcom_chunk')
            ->where('gedcom_id', '=', $tree->id())
            ->update(['chunk_data' => new Expression($sql)]);

        return true;
    }

    /**
     * Import every record in one chunk of GEDCOM data.
     *
     * Carriage returns are treated as line feeds, and the chunk is split before
     * each line that starts with "0".
     *
     * @param Tree   $tree
     * @param string $chunk_data
     *
     * @return string The concatenated messages of any GedcomErrorException raised by individual records
     *
     * @throws Exception If the chunk cannot be split into records
     */
    private function importChunk(Tree $tree, string $chunk_data): string
    {
        $records = preg_split('/\n+(?=0)/', str_replace("\r", "\n", $chunk_data));

        // Do not silently skip a chunk: the caller blanks its data after this call.
        if ($records === false) {
            throw new Exception('Unable to split GEDCOM data into records (PCRE error ' . preg_last_error() . ')');
        }

        $errors = '';

        foreach ($records as $record) {
            try {
                FunctionsImport::importRecord($record, $tree, false);
            } catch (GedcomErrorException $exception) {
                $errors .= $exception->getMessage();
            }
        }

        return $errors;
    }

    /**
     * Render the 'admin/import-fail' view with an error message.
     *
     * @param Tree   $tree
     * @param string $error
     *
     * @return ResponseInterface
     */
    private function importFailed(Tree $tree, string $error): ResponseInterface
    {
        return $this->viewResponse('admin/import-fail', [
            'error' => $error,
            'tree'  => $tree,
        ]);
    }
}