<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\RequestHandlers;

use Exception;
use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
use Fisharebest\Webtrees\Functions\FunctionsImport;
use Fisharebest\Webtrees\Gedcom;
use Fisharebest\Webtrees\Http\ViewResponseTrait;
use Fisharebest\Webtrees\I18N;
use Fisharebest\Webtrees\Services\TimeoutService;
use Fisharebest\Webtrees\Services\TreeService;
use Fisharebest\Webtrees\Tree;
use Illuminate\Database\Capsule\Manager as DB;
use Illuminate\Database\Query\Expression;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;

use function array_key_exists;
use function assert;
use function preg_match;
use function preg_split;
use function response;
use function str_replace;
use function str_starts_with;
use function strlen;
use function strtoupper;
use function substr;
use function trim;
use function view;

/**
 * Load a chunk of GEDCOM data.
 *
 * Each request imports pending rows of the gedcom_chunk table, one at a time,
 * until the timeout service reports that the time limit is up. It then returns
 * a progress view. Once every chunk is flagged as imported, the tree is marked
 * as imported and the "import complete" view is returned instead.
 */
class GedcomLoad implements RequestHandlerInterface
{
    use ViewResponseTrait;

    /**
     * GEDCOM character set names (upper case) mapped to the MySQL character set
     * used to decode them. A null value means the data is already UTF-8, so no
     * conversion is performed. Any name that is not listed here (for example
     * ANSEL) is rejected as unsupported.
     *
     * MySQL supports a wide range of collation conversions. These are ones that
     * have been encountered "in the wild".
     */
    private const CHARSET_CONVERSIONS = [
        'ASCII'       => 'ascii',
        // IBMPC, IBM WINDOWS and MS-DOS could be anything. Mostly it means CP850.
        // CP850 has extra letters with diacritics to replace box-drawing chars in CP437.
        'IBMPC'       => 'cp850',
        'IBM WINDOWS' => 'cp850',
        'MS-DOS'      => 'cp850',
        'CP437'       => 'cp850',
        'CP850'       => 'cp850',
        // ANSI could be anything. Most applications seem to treat it as latin1.
        // ISO-8859-1 (western european).
        'ANSI'        => 'latin1',
        'WINDOWS'     => 'latin1',
        'CP1252'      => 'latin1',
        'ISO8859-1'   => 'latin1',
        'ISO-8859-1'  => 'latin1',
        'LATIN1'      => 'latin1',
        'LATIN-1'     => 'latin1',
        // ISO-8859-2 (eastern european).
        'CP1250'      => 'latin2',
        'ISO8859-2'   => 'latin2',
        'ISO-8859-2'  => 'latin2',
        'LATIN2'      => 'latin2',
        'LATIN-2'     => 'latin2',
        // MAC Roman.
        'MACINTOSH'   => 'macroman',
        // Already UTF-8 so nothing to do!
        'UTF8'        => null,
        'UTF-8'       => null,
    ];

    /** @var TimeoutService */
    private $timeout_service;

    /** @var TreeService */
    private $tree_service;

    /**
     * GedcomLoad constructor.
     *
     * @param TimeoutService $timeout_service
     * @param TreeService    $tree_service
     */
    public function __construct(TimeoutService $timeout_service, TreeService $tree_service)
    {
        $this->timeout_service = $timeout_service;
        $this->tree_service    = $tree_service;
    }

    /**
     * Import the next pending chunk(s) of GEDCOM data for the requested tree.
     *
     * @param ServerRequestInterface $request Must carry a Tree in its "tree" attribute.
     *
     * @return ResponseInterface The "complete", "progress" or "fail" view.
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $this->layout = 'layouts/ajax';

        $tree = $request->getAttribute('tree');
        assert($tree instanceof Tree);

        try {
            // Only allow one process to import each gedcom at a time.
            // The rows are fetched purely to take the lock and the result is
            // discarded, so select only the key column rather than pulling
            // every chunk's data into memory.
            DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->select(['gedcom_chunk_id'])
                ->lockForUpdate()
                ->get();

            // What is the current import status?
            $import_offset = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->where('imported', '=', '1')
                ->count();

            $import_total = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->count();

            // Finished?
            if ($import_offset === $import_total) {
                $tree->setPreference('imported', '1');

                $html = view('admin/import-complete', ['tree' => $tree]);

                return response($html);
            }

            // Calculate progress so far. The "finished" test above guarantees
            // that $import_total is non-zero here.
            $progress = $import_offset / $import_total;

            $first_time = ($import_offset === 0);

            // Collect up any errors, and show them later.
            $errors = '';

            // Run for a short period of time. This keeps the resource requirements low.
            do {
                $data = DB::table('gedcom_chunk')
                    ->where('gedcom_id', '=', $tree->id())
                    ->where('imported', '=', '0')
                    ->orderBy('gedcom_chunk_id')
                    ->select(['gedcom_chunk_id', 'chunk_data'])
                    ->first();

                // Nothing left to import. Check this before touching $data, so
                // that the header handling below never dereferences null.
                if ($data === null) {
                    break;
                }

                // If we are loading the first (header) record, make sure the encoding is UTF-8.
                if ($first_time) {
                    $this->tree_service->deleteGenealogyData($tree, (bool) $tree->getPreference('keep_media'));

                    // Remove any byte-order-mark
                    if (str_starts_with($data->chunk_data, Gedcom::UTF8_BOM)) {
                        $data->chunk_data = substr($data->chunk_data, strlen(Gedcom::UTF8_BOM));
                        // Put it back in the database, so we can do character conversion
                        DB::table('gedcom_chunk')
                            ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                            ->update(['chunk_data' => $data->chunk_data]);
                    }

                    if (!str_starts_with($data->chunk_data, '0 HEAD')) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Invalid GEDCOM file - no header record found.'),
                            'tree'  => $tree,
                        ]);
                    }

                    // What character set is this? Need to convert it to UTF8
                    $charset = $this->detectCharset($data->chunk_data);

                    if (!array_key_exists($charset, self::CHARSET_CONVERSIONS)) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Error: converting GEDCOM files from %s encoding to UTF-8 encoding not currently supported.', $charset),
                            'tree'  => $tree,
                        ]);
                    }

                    $mysql_charset = self::CHARSET_CONVERSIONS[$charset];

                    if ($mysql_charset !== null) {
                        $this->convertChunksToUtf8($tree, $mysql_charset);
                    }

                    $first_time = false;

                    // Re-fetch the data, now that we have performed character set conversion.
                    $data = DB::table('gedcom_chunk')
                        ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                        ->select(['gedcom_chunk_id', 'chunk_data'])
                        ->first();

                    if ($data === null) {
                        break;
                    }
                }

                // Normalise line endings, so that records can be split on "\n".
                $data->chunk_data = str_replace("\r", "\n", $data->chunk_data);

                // Import all the records in this chunk of data.
                // A record starts at each line that begins with "0".
                foreach (preg_split('/\n+(?=0)/', $data->chunk_data) as $rec) {
                    try {
                        FunctionsImport::importRecord($rec, $tree, false);
                    } catch (GedcomErrorException $exception) {
                        $errors .= $exception->getMessage();
                    }
                }

                // Mark the chunk as imported
                DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->update(['imported' => 1]);
            } while (!$this->timeout_service->isTimeLimitUp());

            return $this->viewResponse('admin/import-progress', [
                'errors'   => $errors,
                'progress' => $progress,
                'tree'     => $tree,
            ]);
        } catch (Exception $ex) {
            DB::connection()->rollBack();

            return $this->viewResponse('admin/import-fail', [
                'error' => $ex->getMessage(),
                'tree'  => $tree,
            ]);
        }
    }

    /**
     * Read the character set declared by a "1 CHAR" / "1 CHARACTER" line.
     *
     * @param string $header The chunk of GEDCOM data containing the header record.
     *
     * @return string The declared name, trimmed and upper-cased, or "ASCII" if none is declared.
     */
    private function detectCharset(string $header): string
    {
        if (preg_match('/[\r\n][ \t]*1 CHAR(?:ACTER)? ([^\r\n]+)/', $header, $match) === 1) {
            return strtoupper(trim($match[1]));
        }

        return 'ASCII';
    }

    /**
     * Convert every chunk of a tree from the given MySQL character set to UTF-8.
     *
     * @param Tree   $tree
     * @param string $mysql_charset A value from self::CHARSET_CONVERSIONS - never user input,
     *                              as it is concatenated into the SQL expression.
     *
     * @return void
     */
    private function convertChunksToUtf8(Tree $tree, string $mysql_charset): void
    {
        $conversion = new Expression('CONVERT(CONVERT(chunk_data USING ' . $mysql_charset . ') USING utf8)');

        DB::table('gedcom_chunk')
            ->where('gedcom_id', '=', $tree->id())
            ->update(['chunk_data' => $conversion]);
    }
}