<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\RequestHandlers;

use Exception;
use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
use Fisharebest\Webtrees\Functions\FunctionsImport;
use Fisharebest\Webtrees\Gedcom;
use Fisharebest\Webtrees\Http\ViewResponseTrait;
use Fisharebest\Webtrees\I18N;
use Fisharebest\Webtrees\Services\TimeoutService;
use Fisharebest\Webtrees\Tree;
use Illuminate\Database\Capsule\Manager as DB;
use Illuminate\Database\Query\Expression;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\RequestHandlerInterface;

use function assert;
use function preg_match;
use function preg_split;
use function response;
use function str_replace;
use function str_starts_with;
use function strlen;
use function strtoupper;
use function substr;
use function trim;
use function view;

/**
 * Load a chunk of GEDCOM data.
 */
class GedcomLoad implements RequestHandlerInterface
{
    use ViewResponseTrait;

    /**
     * GEDCOM character set names (upper case) mapped to the MySQL character set
     * used to decode the stored chunks. MySQL supports a wide range of collation
     * conversions. These are ones that have been encountered "in the wild".
     *
     * An entry of "utf8" means the data is already UTF-8 and needs no conversion.
     * Any name missing from this table (including ANSEL) is rejected as unsupported.
     */
    private const MYSQL_CHARSETS = [
        // Also used when the header has no CHAR record at all.
        'ASCII'       => 'ascii',
        // IBMPC, IBM WINDOWS and MS-DOS could be anything. Mostly it means CP850.
        // CP850 has extra letters with diacritics to replace box-drawing chars in CP437.
        'IBMPC'       => 'cp850',
        'IBM WINDOWS' => 'cp850',
        'MS-DOS'      => 'cp850',
        'CP437'       => 'cp850',
        'CP850'       => 'cp850',
        // ANSI could be anything. Most applications seem to treat it as latin1.
        // ISO-8859-1 (western european).
        'ANSI'        => 'latin1',
        'WINDOWS'     => 'latin1',
        'CP1252'      => 'latin1',
        'ISO8859-1'   => 'latin1',
        'ISO-8859-1'  => 'latin1',
        'LATIN1'      => 'latin1',
        'LATIN-1'     => 'latin1',
        // ISO-8859-2 (eastern european).
        'CP1250'      => 'latin2',
        'ISO8859-2'   => 'latin2',
        'ISO-8859-2'  => 'latin2',
        'LATIN2'      => 'latin2',
        'LATIN-2'     => 'latin2',
        // MAC Roman.
        'MACINTOSH'   => 'macroman',
        // Already UTF-8 so nothing to do!
        'UTF8'        => 'utf8',
        'UTF-8'       => 'utf8',
    ];

    /** @var TimeoutService */
    private $timeout_service;

    /**
     * GedcomLoad constructor.
     *
     * @param TimeoutService $timeout_service
     */
    public function __construct(TimeoutService $timeout_service)
    {
        $this->timeout_service = $timeout_service;
    }

    /**
     * Import as many pending GEDCOM chunks for the tree as the time limit allows.
     *
     * Responds with the "import complete" view when every chunk has been imported,
     * the "import fail" view when the header is missing, the character set is not
     * supported or an exception is thrown, and the "import progress" view otherwise.
     *
     * @param ServerRequestInterface $request
     *
     * @return ResponseInterface
     */
    public function handle(ServerRequestInterface $request): ResponseInterface
    {
        $this->layout = 'layouts/ajax';

        $tree = $request->getAttribute('tree');
        assert($tree instanceof Tree);

        try {
            // Only allow one process to import each gedcom at a time
            DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->lockForUpdate()
                ->get();

            // What is the current import status?
            $import_offset = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->where('imported', '=', '1')
                ->count();

            $import_total = DB::table('gedcom_chunk')
                ->where('gedcom_id', '=', $tree->id())
                ->count();

            // Finished? (This also covers a tree with no chunks, so the division below is safe.)
            if ($import_offset === $import_total) {
                $tree->setPreference('imported', '1');

                $html = view('admin/import-complete', ['tree' => $tree]);

                return response($html);
            }

            // Calculate progress so far. This is the state before the current batch is imported.
            $progress = $import_offset / $import_total;

            $first_time = ($import_offset === 0);

            // Collect up any errors, and show them later.
            $errors = '';

            // Run for a short period of time. This keeps the resource requirements low.
            do {
                $data = DB::table('gedcom_chunk')
                    ->where('gedcom_id', '=', $tree->id())
                    ->where('imported', '=', '0')
                    ->orderBy('gedcom_chunk_id')
                    ->select(['gedcom_chunk_id', 'chunk_data'])
                    ->first();

                // No more chunks to import. Check this before touching $data, as the
                // header processing below dereferences it.
                if ($data === null) {
                    break;
                }

                // If we are loading the first (header) record, make sure the encoding is UTF-8.
                if ($first_time) {
                    // Remove any byte-order-mark
                    if (str_starts_with($data->chunk_data, Gedcom::UTF8_BOM)) {
                        $data->chunk_data = substr($data->chunk_data, strlen(Gedcom::UTF8_BOM));
                        // Put it back in the database, so we can do character conversion
                        DB::table('gedcom_chunk')
                            ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                            ->update(['chunk_data' => $data->chunk_data]);
                    }

                    if (!str_starts_with($data->chunk_data, '0 HEAD')) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Invalid GEDCOM file - no header record found.'),
                            'tree'  => $tree,
                        ]);
                    }

                    // What character set is this? Need to convert it to UTF8
                    if (preg_match('/[\r\n][ \t]*1 CHAR(?:ACTER)? ([^\r\n]+)/', $data->chunk_data, $match)) {
                        $charset = strtoupper(trim($match[1]));
                    } else {
                        $charset = 'ASCII';
                    }

                    $mysql_charset = self::MYSQL_CHARSETS[$charset] ?? null;

                    if ($mysql_charset === null) {
                        return $this->viewResponse('admin/import-fail', [
                            'error' => I18N::translate('Error: converting GEDCOM files from %s encoding to UTF-8 encoding not currently supported.', $charset),
                            'tree'  => $tree,
                        ]);
                    }

                    if ($mysql_charset !== 'utf8') {
                        $this->convertChunksToUtf8($tree, $mysql_charset);
                    }

                    $first_time = false;

                    // Re-fetch the data, now that we have performed character set conversion.
                    $data = DB::table('gedcom_chunk')
                        ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                        ->select(['gedcom_chunk_id', 'chunk_data'])
                        ->first();

                    if ($data === null) {
                        break;
                    }
                }

                $data->chunk_data = str_replace("\r", "\n", $data->chunk_data);

                // Import all the records in this chunk of data.
                // Records are split on line breaks that precede a level "0" line.
                foreach (preg_split('/\n+(?=0)/', $data->chunk_data) as $rec) {
                    try {
                        FunctionsImport::importRecord($rec, $tree, false);
                    } catch (GedcomErrorException $exception) {
                        $errors .= $exception->getMessage();
                    }
                }

                // Mark the chunk as imported
                DB::table('gedcom_chunk')
                    ->where('gedcom_chunk_id', '=', $data->gedcom_chunk_id)
                    ->update(['imported' => 1]);
            } while (!$this->timeout_service->isTimeLimitUp());

            return $this->viewResponse('admin/import-progress', [
                'errors'   => $errors,
                'progress' => $progress,
                'tree'     => $tree,
            ]);
        } catch (Exception $ex) {
            // NOTE(review): no transaction is started in this class - presumably one is
            // opened by the caller/middleware before this handler runs. Confirm.
            DB::connection()->rollBack();

            return $this->viewResponse('admin/import-fail', [
                'error' => $ex->getMessage(),
                'tree'  => $tree,
            ]);
        }
    }

    /**
     * Convert every stored chunk of a tree from the given MySQL character set to UTF-8.
     *
     * @param Tree   $tree
     * @param string $mysql_charset A value from self::MYSQL_CHARSETS - never user input.
     *
     * @return void
     */
    private function convertChunksToUtf8(Tree $tree, string $mysql_charset): void
    {
        $conversion = 'CONVERT(CONVERT(chunk_data USING ' . $mysql_charset . ') USING utf8)';

        DB::table('gedcom_chunk')
            ->where('gedcom_id', '=', $tree->id())
            ->update(['chunk_data' => new Expression($conversion)]);
    }
}