xref: /webtrees/app/Services/GedcomImportService.php (revision 5a8afed46297e8105e3e5a33ce37e6a8e88bc79d)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2023 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Services;
21
22use Fisharebest\Webtrees\Date;
23use Fisharebest\Webtrees\DB;
24use Fisharebest\Webtrees\Elements\UnknownElement;
25use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
26use Fisharebest\Webtrees\Family;
27use Fisharebest\Webtrees\Gedcom;
28use Fisharebest\Webtrees\Header;
29use Fisharebest\Webtrees\Individual;
30use Fisharebest\Webtrees\Location;
31use Fisharebest\Webtrees\Media;
32use Fisharebest\Webtrees\Note;
33use Fisharebest\Webtrees\Place;
34use Fisharebest\Webtrees\PlaceLocation;
35use Fisharebest\Webtrees\Registry;
36use Fisharebest\Webtrees\Repository;
37use Fisharebest\Webtrees\Soundex;
38use Fisharebest\Webtrees\Source;
39use Fisharebest\Webtrees\Submission;
40use Fisharebest\Webtrees\Submitter;
41use Fisharebest\Webtrees\Tree;
42use Illuminate\Database\Query\JoinClause;
43
44use function array_chunk;
45use function array_intersect_key;
46use function array_map;
47use function array_unique;
48use function date;
49use function explode;
50use function max;
51use function mb_substr;
52use function preg_match;
53use function preg_match_all;
54use function preg_replace;
55use function round;
56use function str_contains;
57use function str_replace;
58use function str_starts_with;
59use function strlen;
60use function strtoupper;
61use function strtr;
62use function substr;
63use function trim;
64
65use const PREG_SET_ORDER;
66
67/**
68 * Class GedcomImportService - import GEDCOM data
69 */
70class GedcomImportService
71{
72    /**
73     * Tidy up a gedcom record on import, so that we can access it consistently/efficiently.
74     *
75     * @param string $rec
76     * @param Tree   $tree
77     *
78     * @return string
79     */
80    private function reformatRecord(string $rec, Tree $tree): string
81    {
82        $gedcom_service = Registry::container()->get(GedcomService::class);
83
84        // Strip out mac/msdos line endings
85        $rec = preg_replace("/[\r\n]+/", "\n", $rec);
86
87        // Extract lines from the record; lines consist of: level + optional xref + tag + optional data
88        $num_matches = preg_match_all('/^[ \t]*(\d+)[ \t]*(@[^@]*@)?[ \t]*(\w+)[ \t]?(.*)$/m', $rec, $matches, PREG_SET_ORDER);
89
90        // Process the record line-by-line
91        $newrec = '';
92        foreach ($matches as $n => $match) {
93            [, $level, $xref, $tag, $data] = $match;
94
95            $tag = $gedcom_service->canonicalTag($tag);
96
97            switch ($tag) {
98                case 'DATE':
99                    // Preserve text from INT dates
100                    if (str_contains($data, '(')) {
101                        [$date, $text] = explode('(', $data, 2);
102                        $text = ' (' . $text;
103                    } else {
104                        $date = $data;
105                        $text = '';
106                    }
107                    // Capitals
108                    $date = strtoupper($date);
109                    // Temporarily add leading/trailing spaces, to allow efficient matching below
110                    $date = ' ' . $date . ' ';
111                    // Ensure space digits and letters
112                    $date = preg_replace('/([A-Z])(\d)/', '$1 $2', $date);
113                    $date = preg_replace('/(\d)([A-Z])/', '$1 $2', $date);
114                    // Ensure space before/after calendar escapes
115                    $date = preg_replace('/@#[^@]+@/', ' $0 ', $date);
116                    // "BET." => "BET"
117                    $date = preg_replace('/(\w\w)\./', '$1', $date);
118                    // "CIR" => "ABT"
119                    $date = str_replace(' CIR ', ' ABT ', $date);
120                    $date = str_replace(' APX ', ' ABT ', $date);
121                    // B.C. => BC (temporarily, to allow easier handling of ".")
122                    $date = str_replace(' B.C. ', ' BC ', $date);
123                    // TMG uses "EITHER X OR Y"
124                    $date = preg_replace('/^ EITHER (.+) OR (.+)/', ' BET $1 AND $2', $date);
125                    // "BET X - Y " => "BET X AND Y"
126                    $date = preg_replace('/^(.* BET .+) - (.+)/', '$1 AND $2', $date);
127                    $date = preg_replace('/^(.* FROM .+) - (.+)/', '$1 TO $2', $date);
128                    // "@#ESC@ FROM X TO Y" => "FROM @#ESC@ X TO @#ESC@ Y"
129                    $date = preg_replace('/^ +(@#[^@]+@) +FROM +(.+) +TO +(.+)/', ' FROM $1 $2 TO $1 $3', $date);
130                    $date = preg_replace('/^ +(@#[^@]+@) +BET +(.+) +AND +(.+)/', ' BET $1 $2 AND $1 $3', $date);
131                    // "@#ESC@ AFT X" => "AFT @#ESC@ X"
132                    $date = preg_replace('/^ +(@#[^@]+@) +(FROM|BET|TO|AND|BEF|AFT|CAL|EST|INT|ABT) +(.+)/', ' $2 $1 $3', $date);
133                    // Ignore any remaining punctuation, e.g. "14-MAY, 1900" => "14 MAY 1900"
134                    // (don't change "/" - it is used in NS/OS dates)
135                    $date = preg_replace('/[.,:;-]/', ' ', $date);
136                    // BC => B.C.
137                    $date = str_replace(' BC ', ' B.C. ', $date);
138                    // Append the "INT" text
139                    $data = $date . $text;
140                    break;
141                case 'HEAD':
142                case 'TRLR':
143                    // HEAD and TRLR records do not have an XREF or DATA
144                    if ($level === '0') {
145                        $xref = '';
146                        $data = '';
147                    }
148                    break;
149                case 'NAME':
150                    // Tidy up non-printing characters
151                    $data = preg_replace('/  +/', ' ', trim($data));
152                    break;
153                case 'PLAC':
154                    // Consistent commas
155                    $data = preg_replace('/ *[,,،] */u', ', ', $data);
156                    // The Master Genealogist stores LAT/LONG data in the PLAC field, e.g. Pennsylvania, USA, 395945N0751013W
157                    if (preg_match('/(.*), (\d\d)(\d\d)(\d\d)([NS])(\d\d\d)(\d\d)(\d\d)([EW])$/', $data, $match) === 1) {
158                        $degns = (int) $match[2];
159                        $minns = (int) $match[3];
160                        $secns = (int) $match[4];
161                        $degew = (int) $match[6];
162                        $minew = (int) $match[7];
163                        $secew = (int) $match[8];
164                        $data =
165                            $match[1] . "\n" .
166                            (1 + (int) $level) . " MAP\n" .
167                            (2 + (int) $level) . ' LATI ' . ($match[5] . round($degns + $minns / 60 + $secns / 3600, 4)) . "\n" .
168                            (2 + (int) $level) . ' LONG ' . ($match[9] . round($degew + $minew / 60 + $secew / 3600, 4));
169                    }
170                    break;
171                case 'SEX':
172                    $data = strtoupper($data);
173                    break;
174            }
175            // Suppress "Y", for facts/events with a DATE or PLAC
176            if ($data === 'y') {
177                $data = 'Y';
178            }
179            if ($level === '1' && $data === 'Y') {
180                for ($i = $n + 1; $i < $num_matches - 1 && $matches[$i][1] !== '1'; ++$i) {
181                    if ($matches[$i][3] === 'DATE' || $matches[$i][3] === 'PLAC') {
182                        $data = '';
183                        break;
184                    }
185                }
186            }
187            // Reassemble components back into a single line
188            switch ($tag) {
189                default:
190                    // Remove tabs and multiple/leading/trailing spaces
191                    $data = strtr($data, ["\t" => ' ']);
192                    $data = trim($data, ' ');
193                    while (str_contains($data, '  ')) {
194                        $data = strtr($data, ['  ' => ' ']);
195                    }
196                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
197                    break;
198                case 'NOTE':
199                case 'TEXT':
200                case 'DATA':
201                case 'CONT':
202                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
203                    break;
204                case 'FILE':
205                    // Strip off the user-defined path prefix
206                    $GEDCOM_MEDIA_PATH = $tree->getPreference('GEDCOM_MEDIA_PATH');
207                    if ($GEDCOM_MEDIA_PATH !== '' && str_starts_with($data, $GEDCOM_MEDIA_PATH)) {
208                        $data = substr($data, strlen($GEDCOM_MEDIA_PATH));
209                    }
210                    // convert backslashes in filenames to forward slashes
211                    $data = preg_replace("/\\\\/", '/', $data);
212
213                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
214                    break;
215                case 'CONC':
216                    // Merge CONC lines, to simplify access later on.
217                    $newrec .= ($tree->getPreference('WORD_WRAPPED_NOTES') ? ' ' : '') . $data;
218                    break;
219            }
220        }
221
222        return $newrec;
223    }
224
225    /**
226     * import record into database
227     * this function will parse the given gedcom record and add it to the database
228     *
229     * @param string $gedrec the raw gedcom record to parse
230     * @param Tree   $tree   import the record into this tree
231     * @param bool   $update whether this is an updated record that has been accepted
232     *
233     * @return void
234     * @throws GedcomErrorException
235     */
236    public function importRecord(string $gedrec, Tree $tree, bool $update): void
237    {
238        $tree_id = $tree->id();
239
240        // Escaped @ signs (only if importing from file)
241        if (!$update) {
242            $gedrec = str_replace('@@', '@', $gedrec);
243        }
244
245        // Standardise gedcom format
246        $gedrec = $this->reformatRecord($gedrec, $tree);
247
248        // import different types of records
249        if (preg_match('/^0 @(' . Gedcom::REGEX_XREF . ')@ (' . Gedcom::REGEX_TAG . ')/', $gedrec, $match)) {
250            [, $xref, $type] = $match;
251        } elseif (str_starts_with($gedrec, '0 HEAD')) {
252            $type = 'HEAD';
253            $xref = 'HEAD'; // For records without an XREF, use the type as a pseudo XREF.
254        } elseif (str_starts_with($gedrec, '0 TRLR')) {
255            $tree->setPreference('imported', '1');
256            $type = 'TRLR';
257            $xref = 'TRLR'; // For records without an XREF, use the type as a pseudo XREF.
258        } elseif (str_starts_with($gedrec, '0 _PLAC_DEFN')) {
259            $this->importLegacyPlacDefn($gedrec);
260
261            return;
262        } elseif (str_starts_with($gedrec, '0 _PLAC ')) {
263            $this->importTNGPlac($gedrec);
264
265            return;
266        } else {
267            foreach (Gedcom::CUSTOM_RECORDS_WITHOUT_XREFS as $record_type) {
268                if (preg_match('/^0 ' . $record_type . '\b/', $gedrec) === 1) {
269                    return;
270                }
271            }
272
273            throw new GedcomErrorException($gedrec);
274        }
275
276        // Add a _UID
277        if ($tree->getPreference('GENERATE_UIDS') === '1' && !str_contains($gedrec, "\n1 _UID ")) {
278            $element = Registry::elementFactory()->make($type . ':_UID');
279            if (!$element instanceof UnknownElement) {
280                $gedrec .= "\n1 _UID " . $element->default($tree);
281            }
282        }
283
284        // If the user has downloaded their GEDCOM data (containing media objects) and edited it
285        // using an application which does not support (and deletes) media objects, then add them
286        // back in.
287        if ($tree->getPreference('keep_media') === '1') {
288            $old_linked_media = DB::table('link')
289                ->where('l_from', '=', $xref)
290                ->where('l_file', '=', $tree_id)
291                ->where('l_type', '=', 'OBJE')
292                ->pluck('l_to');
293
294            // Delete these links - so that we do not insert them again in updateLinks()
295            DB::table('link')
296                ->where('l_from', '=', $xref)
297                ->where('l_file', '=', $tree_id)
298                ->where('l_type', '=', 'OBJE')
299                ->delete();
300
301            foreach ($old_linked_media as $media_id) {
302                $gedrec .= "\n1 OBJE @" . $media_id . '@';
303            }
304        }
305
306        // Convert inline media into media objects
307        $gedrec = $this->convertInlineMedia($tree, $gedrec);
308
309        switch ($type) {
310            case Individual::RECORD_TYPE:
311                $record = Registry::individualFactory()->new($xref, $gedrec, null, $tree);
312
313                if (preg_match('/\n1 RIN (.+)/', $gedrec, $match)) {
314                    $rin = $match[1];
315                } else {
316                    $rin = $xref;
317                }
318
319                // The database can only store MFU, and many of the stats queries assume this.
320                $sex = $record->sex();
321                $sex = $sex === 'M' || $sex === 'F' ? $sex : 'U';
322
323                DB::table('individuals')->insert([
324                    'i_id'     => $xref,
325                    'i_file'   => $tree_id,
326                    'i_rin'    => $rin,
327                    'i_sex'    => $sex,
328                    'i_gedcom' => $gedrec,
329                ]);
330
331                // Update the cross-reference/index tables.
332                $this->updatePlaces($xref, $tree, $gedrec);
333                $this->updateDates($xref, $tree_id, $gedrec);
334                $this->updateNames($xref, $tree_id, $record);
335                break;
336
337            case Family::RECORD_TYPE:
338                if (preg_match('/\n1 HUSB @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match)) {
339                    $husb = $match[1];
340                } else {
341                    $husb = '';
342                }
343                if (preg_match('/\n1 WIFE @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match)) {
344                    $wife = $match[1];
345                } else {
346                    $wife = '';
347                }
348                $nchi = preg_match_all('/\n1 CHIL @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match);
349                if (preg_match('/\n1 NCHI (\d+)/', $gedrec, $match)) {
350                    $nchi = max($nchi, $match[1]);
351                }
352
353                DB::table('families')->insert([
354                    'f_id'      => $xref,
355                    'f_file'    => $tree_id,
356                    'f_husb'    => $husb,
357                    'f_wife'    => $wife,
358                    'f_gedcom'  => $gedrec,
359                    'f_numchil' => $nchi,
360                ]);
361
362                // Update the cross-reference/index tables.
363                $this->updatePlaces($xref, $tree, $gedrec);
364                $this->updateDates($xref, $tree_id, $gedrec);
365                break;
366
367            case Source::RECORD_TYPE:
368                if (preg_match('/\n1 TITL (.+)/', $gedrec, $match)) {
369                    $name = $match[1];
370                } elseif (preg_match('/\n1 ABBR (.+)/', $gedrec, $match)) {
371                    $name = $match[1];
372                } else {
373                    $name = $xref;
374                }
375
376                DB::table('sources')->insert([
377                    's_id'     => $xref,
378                    's_file'   => $tree_id,
379                    's_name'   => mb_substr($name, 0, 255),
380                    's_gedcom' => $gedrec,
381                ]);
382                break;
383
384            case Repository::RECORD_TYPE:
385            case Note::RECORD_TYPE:
386            case Submission::RECORD_TYPE:
387            case Submitter::RECORD_TYPE:
388            case Location::RECORD_TYPE:
389                DB::table('other')->insert([
390                    'o_id'     => $xref,
391                    'o_file'   => $tree_id,
392                    'o_type'   => $type,
393                    'o_gedcom' => $gedrec,
394                ]);
395                break;
396
397            case Header::RECORD_TYPE:
398                // Force HEAD records to have a creation date.
399                if (!str_contains($gedrec, "\n1 DATE ")) {
400                    $today = strtoupper(date('d M Y'));
401                    $gedrec .= "\n1 DATE " . $today;
402                }
403
404                DB::table('other')->insert([
405                    'o_id'     => $xref,
406                    'o_file'   => $tree_id,
407                    'o_type'   => Header::RECORD_TYPE,
408                    'o_gedcom' => $gedrec,
409                ]);
410                break;
411
412            case Media::RECORD_TYPE:
413                $record = Registry::mediaFactory()->new($xref, $gedrec, null, $tree);
414
415                DB::table('media')->insert([
416                    'm_id'     => $xref,
417                    'm_file'   => $tree_id,
418                    'm_gedcom' => $gedrec,
419                ]);
420
421                foreach ($record->mediaFiles() as $media_file) {
422                    DB::table('media_file')->insert([
423                        'm_id'                 => $xref,
424                        'm_file'               => $tree_id,
425                        'multimedia_file_refn' => mb_substr($media_file->filename(), 0, 248),
426                        'multimedia_format'    => mb_substr($media_file->format(), 0, 4),
427                        'source_media_type'    => mb_substr($media_file->type(), 0, 15),
428                        'descriptive_title'    => mb_substr($media_file->title(), 0, 248),
429                    ]);
430                }
431                break;
432
433            default: // Custom record types.
434                DB::table('other')->insert([
435                    'o_id'     => $xref,
436                    'o_file'   => $tree_id,
437                    'o_type'   => mb_substr($type, 0, 15),
438                    'o_gedcom' => $gedrec,
439                ]);
440                break;
441        }
442
443        // Update the cross-reference/index tables.
444        $this->updateLinks($xref, $tree_id, $gedrec);
445    }
446
447    /**
448     * Legacy Family Tree software generates _PLAC_DEFN records containing LAT/LONG values
449     *
450     * @param string $gedcom
451     */
452    private function importLegacyPlacDefn(string $gedcom): void
453    {
454        $gedcom_service = new GedcomService();
455
456        if (preg_match('/\n1 PLAC (.+)/', $gedcom, $match)) {
457            $place_name = $match[1];
458        } else {
459            return;
460        }
461
462        if (preg_match('/\n3 LATI ([NS].+)/', $gedcom, $match)) {
463            $latitude = $gedcom_service->readLatitude($match[1]);
464        } else {
465            return;
466        }
467
468        if (preg_match('/\n3 LONG ([EW].+)/', $gedcom, $match)) {
469            $longitude = $gedcom_service->readLongitude($match[1]);
470        } else {
471            return;
472        }
473
474        $location = new PlaceLocation($place_name);
475
476        if ($location->latitude() === null && $location->longitude() === null) {
477            DB::table('place_location')
478                ->where('id', '=', $location->id())
479                ->update([
480                    'latitude'  => $latitude,
481                    'longitude' => $longitude,
482                ]);
483        }
484    }
485
486    /**
487     * TNG software generates _PLAC records containing LAT/LONG values
488     *
489     * @param string $gedcom
490     */
491    private function importTNGPlac(string $gedcom): void
492    {
493        if (preg_match('/^0 _PLAC (.+)/', $gedcom, $match)) {
494            $place_name = $match[1];
495        } else {
496            return;
497        }
498
499        if (preg_match('/\n2 LATI (.+)/', $gedcom, $match)) {
500            $latitude = (float) $match[1];
501        } else {
502            return;
503        }
504
505        if (preg_match('/\n2 LONG (.+)/', $gedcom, $match)) {
506            $longitude = (float) $match[1];
507        } else {
508            return;
509        }
510
511        $location = new PlaceLocation($place_name);
512
513        if ($location->latitude() === null && $location->longitude() === null) {
514            DB::table('place_location')
515                ->where('id', '=', $location->id())
516                ->update([
517                    'latitude'  => $latitude,
518                    'longitude' => $longitude,
519                ]);
520        }
521    }
522
523    /**
524     * Extract all level 2 places from the given record and insert them into the places table
525     *
526     * @param string $xref
527     * @param Tree   $tree
528     * @param string $gedrec
529     *
530     * @return void
531     */
532    public function updatePlaces(string $xref, Tree $tree, string $gedrec): void
533    {
534        // Insert all new rows together
535        $rows = [];
536
537        preg_match_all('/\n2 PLAC (.+)/', $gedrec, $matches);
538
539        $places = array_unique($matches[1]);
540
541        foreach ($places as $place_name) {
542            $place = new Place($place_name, $tree);
543
544            // Calling Place::id() will create the entry in the database, if it doesn't already exist.
545            while ($place->id() !== 0) {
546                $rows[] = [
547                    'pl_p_id' => $place->id(),
548                    'pl_gid'  => $xref,
549                    'pl_file' => $tree->id(),
550                ];
551
552                $place = $place->parent();
553            }
554        }
555
556        // array_unique doesn't work with arrays of arrays
557        $rows = array_intersect_key($rows, array_unique(array_map('serialize', $rows)));
558
559        // PDO has a limit of 65535 placeholders, and each row requires 3 placeholders.
560        foreach (array_chunk($rows, 20000) as $chunk) {
561            DB::table('placelinks')->insert($chunk);
562        }
563    }
564
565    /**
566     * Extract all the dates from the given record and insert them into the database.
567     *
568     * @param string $xref
569     * @param int    $ged_id
570     * @param string $gedrec
571     *
572     * @return void
573     */
574    private function updateDates(string $xref, int $ged_id, string $gedrec): void
575    {
576        // Insert all new rows together
577        $rows = [];
578
579        preg_match_all("/\n1 (\w+).*(?:\n[2-9].*)*\n2 DATE (.+)(?:\n[2-9].*)*/", $gedrec, $matches, PREG_SET_ORDER);
580
581        foreach ($matches as $match) {
582            $fact = $match[1];
583            $date = new Date($match[2]);
584            $rows[] = [
585                'd_day'        => $date->minimumDate()->day,
586                'd_month'      => $date->minimumDate()->format('%O'),
587                'd_mon'        => $date->minimumDate()->month,
588                'd_year'       => $date->minimumDate()->year,
589                'd_julianday1' => $date->minimumDate()->minimumJulianDay(),
590                'd_julianday2' => $date->minimumDate()->maximumJulianDay(),
591                'd_fact'       => $fact,
592                'd_gid'        => $xref,
593                'd_file'       => $ged_id,
594                'd_type'       => $date->minimumDate()->format('%@'),
595            ];
596
597            $rows[] = [
598                'd_day'        => $date->maximumDate()->day,
599                'd_month'      => $date->maximumDate()->format('%O'),
600                'd_mon'        => $date->maximumDate()->month,
601                'd_year'       => $date->maximumDate()->year,
602                'd_julianday1' => $date->maximumDate()->minimumJulianDay(),
603                'd_julianday2' => $date->maximumDate()->maximumJulianDay(),
604                'd_fact'       => $fact,
605                'd_gid'        => $xref,
606                'd_file'       => $ged_id,
607                'd_type'       => $date->minimumDate()->format('%@'),
608            ];
609        }
610
611        // array_unique doesn't work with arrays of arrays
612        $rows = array_intersect_key($rows, array_unique(array_map('serialize', $rows)));
613
614        DB::table('dates')->insert($rows);
615    }
616
617    /**
618     * Extract all the links from the given record and insert them into the database
619     *
620     * @param string $xref
621     * @param int    $ged_id
622     * @param string $gedrec
623     *
624     * @return void
625     */
626    private function updateLinks(string $xref, int $ged_id, string $gedrec): void
627    {
628        // Insert all new rows together
629        $rows = [];
630
631        preg_match_all('/\n\d+ (' . Gedcom::REGEX_TAG . ') @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $matches, PREG_SET_ORDER);
632
633        foreach ($matches as $match) {
634            // Some applications (e.g. GenoPro) create links longer than 15 characters.
635            $link = mb_substr($match[1], 0, 15);
636
637            // Take care of "duplicates" that differ on case/collation, e.g. "SOUR @S1@" and "SOUR @s1@"
638            $rows[$link . strtoupper($match[2])] = [
639                'l_from' => $xref,
640                'l_to'   => $match[2],
641                'l_type' => $link,
642                'l_file' => $ged_id,
643            ];
644        }
645
646        DB::table('link')->insert($rows);
647    }
648
649    /**
650     * Extract all the names from the given record and insert them into the database.
651     *
652     * @param string     $xref
653     * @param int        $ged_id
654     * @param Individual $record
655     *
656     * @return void
657     */
658    private function updateNames(string $xref, int $ged_id, Individual $record): void
659    {
660        // Insert all new rows together
661        $rows = [];
662
663        foreach ($record->getAllNames() as $n => $name) {
664            if ($name['givn'] === Individual::PRAENOMEN_NESCIO) {
665                $soundex_givn_std = null;
666                $soundex_givn_dm  = null;
667            } else {
668                $soundex_givn_std = Soundex::russell($name['givn']);
669                $soundex_givn_dm  = Soundex::daitchMokotoff($name['givn']);
670            }
671
672            if ($name['surn'] === Individual::NOMEN_NESCIO) {
673                $soundex_surn_std = null;
674                $soundex_surn_dm  = null;
675            } else {
676                $soundex_surn_std = Soundex::russell($name['surname']);
677                $soundex_surn_dm  = Soundex::daitchMokotoff($name['surname']);
678            }
679
680            $rows[] = [
681                'n_file'             => $ged_id,
682                'n_id'               => $xref,
683                'n_num'              => $n,
684                'n_type'             => $name['type'],
685                'n_sort'             => mb_substr($name['sort'], 0, 255),
686                'n_full'             => mb_substr($name['fullNN'], 0, 255),
687                'n_surname'          => mb_substr($name['surname'], 0, 255),
688                'n_surn'             => mb_substr($name['surn'], 0, 255),
689                'n_givn'             => mb_substr($name['givn'], 0, 255),
690                'n_soundex_givn_std' => $soundex_givn_std,
691                'n_soundex_surn_std' => $soundex_surn_std,
692                'n_soundex_givn_dm'  => $soundex_givn_dm,
693                'n_soundex_surn_dm'  => $soundex_surn_dm,
694            ];
695        }
696
697        DB::table('name')->insert($rows);
698    }
699
700    /**
701     * Extract inline media data, and convert to media objects.
702     *
703     * @param Tree   $tree
704     * @param string $gedcom
705     *
706     * @return string
707     */
708    private function convertInlineMedia(Tree $tree, string $gedcom): string
709    {
710        while (preg_match('/\n1 OBJE(?:\n[2-9].+)+/', $gedcom, $match)) {
711            $xref   = $this->createMediaObject($match[0], $tree);
712            $gedcom = strtr($gedcom, [$match[0] =>  "\n1 OBJE @" . $xref . '@']);
713        }
714        while (preg_match('/\n2 OBJE(?:\n[3-9].+)+/', $gedcom, $match)) {
715            $xref   = $this->createMediaObject($match[0], $tree);
716            $gedcom = strtr($gedcom, [$match[0] =>  "\n2 OBJE @" . $xref . '@']);
717        }
718        while (preg_match('/\n3 OBJE(?:\n[4-9].+)+/', $gedcom, $match)) {
719            $xref   = $this->createMediaObject($match[0], $tree);
720            $gedcom = strtr($gedcom, [$match[0] =>  "\n3 OBJE @" . $xref . '@']);
721        }
722
723        return $gedcom;
724    }
725
726    /**
727     * Create a new media object, from inline media data.
728     *
729     * GEDCOM 5.5.1 specifies: +1 FILE / +2 FORM / +3 MEDI / +1 TITL
730     * GEDCOM 5.5 specifies: +1 FILE / +1 FORM / +1 TITL
731     * GEDCOM 5.5.1 says that GEDCOM 5.5 specifies:  +1 FILE / +1 FORM / +2 MEDI
732     *
733     * Legacy generates: +1 FORM / +1 FILE / +1 TITL / +1 _SCBK / +1 _PRIM / +1 _TYPE / +1 NOTE
734     * RootsMagic generates: +1 FILE / +1 FORM / +1 TITL
735     *
736     * @param string $gedcom
737     * @param Tree   $tree
738     *
739     * @return string
740     */
741    private function createMediaObject(string $gedcom, Tree $tree): string
742    {
743        preg_match('/\n\d FILE (.+)/', $gedcom, $match);
744        $file = $match[1] ?? '';
745
746        preg_match('/\n\d TITL (.+)/', $gedcom, $match);
747        $title = $match[1] ?? '';
748
749        preg_match('/\n\d FORM (.+)/', $gedcom, $match);
750        $format = $match[1] ?? '';
751
752        preg_match('/\n\d MEDI (.+)/', $gedcom, $match);
753        $media = $match[1] ?? '';
754
755        preg_match('/\n\d _SCBK (.+)/', $gedcom, $match);
756        $scrapbook = $match[1] ?? '';
757
758        preg_match('/\n\d _PRIM (.+)/', $gedcom, $match);
759        $primary = $match[1] ?? '';
760
761        preg_match('/\n\d _TYPE (.+)/', $gedcom, $match);
762        if ($media === '') {
763            // Legacy uses _TYPE instead of MEDI
764            $media = $match[1] ?? '';
765            $type  = '';
766        } else {
767            $type = $match[1] ?? '';
768        }
769
770        preg_match_all('/\n\d NOTE (.+(?:\n\d CONT.*)*)/', $gedcom, $matches);
771        $notes = $matches[1] ?? [];
772
773        // Have we already created a media object with the same title/filename?
774        $xref = DB::table('media_file')
775            ->where('m_file', '=', $tree->id())
776            ->where('descriptive_title', '=', mb_substr($title, 0, 248))
777            ->where('multimedia_file_refn', '=', mb_substr($file, 0, 248))
778            ->value('m_id');
779
780        if ($xref === null) {
781            $xref = Registry::xrefFactory()->make(Media::RECORD_TYPE);
782
783            // convert to a media-object
784            $gedcom = '0 @' . $xref . "@ OBJE\n1 FILE " . $file;
785
786            if ($format !== '') {
787                $gedcom .= "\n2 FORM " . $format;
788
789                if ($media !== '') {
790                    $gedcom .= "\n3 TYPE " . $media;
791                }
792            }
793
794            if ($title !== '') {
795                $gedcom .= "\n2 TITL " . $title;
796            }
797
798            if ($scrapbook !== '') {
799                $gedcom .= "\n1 _SCBK " . $scrapbook;
800            }
801
802            if ($primary !== '') {
803                $gedcom .= "\n1 _PRIM " . $primary;
804            }
805
806            if ($type !== '') {
807                $gedcom .= "\n1 _TYPE " . $type;
808            }
809
810            foreach ($notes as $note) {
811                $gedcom .= "\n1 NOTE " . strtr($note, ["\n3" => "\n2", "\n4" => "\n2", "\n5" => "\n2"]);
812            }
813
814            DB::table('media')->insert([
815                'm_id'     => $xref,
816                'm_file'   => $tree->id(),
817                'm_gedcom' => $gedcom,
818            ]);
819
820            DB::table('media_file')->insert([
821                'm_id'                 => $xref,
822                'm_file'               => $tree->id(),
823                'multimedia_file_refn' => mb_substr($file, 0, 248),
824                'multimedia_format'    => mb_substr($format, 0, 4),
825                'source_media_type'    => mb_substr($media, 0, 15),
826                'descriptive_title'    => mb_substr($title, 0, 248),
827            ]);
828        }
829
830        return $xref;
831    }
832
833    /**
834     * update a record in the database
835     *
836     * @param string $gedrec
837     * @param Tree   $tree
838     * @param bool   $delete
839     *
840     * @return void
841     * @throws GedcomErrorException
842     */
843    public function updateRecord(string $gedrec, Tree $tree, bool $delete): void
844    {
845        if (preg_match('/^0 @(' . Gedcom::REGEX_XREF . ')@ (' . Gedcom::REGEX_TAG . ')/', $gedrec, $match)) {
846            [, $gid, $type] = $match;
847        } elseif (preg_match('/^0 (HEAD)(?:\n|$)/', $gedrec, $match)) {
848            // The HEAD record has no XREF.  Any others?
849            $gid  = $match[1];
850            $type = $match[1];
851        } else {
852            throw new GedcomErrorException($gedrec);
853        }
854
855        // Place links
856        DB::table('placelinks')
857            ->where('pl_gid', '=', $gid)
858            ->where('pl_file', '=', $tree->id())
859            ->delete();
860
861        // Orphaned places.  If we're deleting  "Westminster, London, England",
862        // then we may also need to delete "London, England" and "England".
863        do {
864            $affected = DB::table('places')
865                ->leftJoin('placelinks', function (JoinClause $join): void {
866                    $join
867                        ->on('p_id', '=', 'pl_p_id')
868                        ->on('p_file', '=', 'pl_file');
869                })
870                ->whereNull('pl_p_id')
871                ->delete();
872        } while ($affected > 0);
873
874        DB::table('dates')
875            ->where('d_gid', '=', $gid)
876            ->where('d_file', '=', $tree->id())
877            ->delete();
878
879        DB::table('name')
880            ->where('n_id', '=', $gid)
881            ->where('n_file', '=', $tree->id())
882            ->delete();
883
884        DB::table('link')
885            ->where('l_from', '=', $gid)
886            ->where('l_file', '=', $tree->id())
887            ->delete();
888
889        switch ($type) {
890            case Individual::RECORD_TYPE:
891                DB::table('individuals')
892                    ->where('i_id', '=', $gid)
893                    ->where('i_file', '=', $tree->id())
894                    ->delete();
895                break;
896
897            case Family::RECORD_TYPE:
898                DB::table('families')
899                    ->where('f_id', '=', $gid)
900                    ->where('f_file', '=', $tree->id())
901                    ->delete();
902                break;
903
904            case Source::RECORD_TYPE:
905                DB::table('sources')
906                    ->where('s_id', '=', $gid)
907                    ->where('s_file', '=', $tree->id())
908                    ->delete();
909                break;
910
911            case Media::RECORD_TYPE:
912                DB::table('media_file')
913                    ->where('m_id', '=', $gid)
914                    ->where('m_file', '=', $tree->id())
915                    ->delete();
916
917                DB::table('media')
918                    ->where('m_id', '=', $gid)
919                    ->where('m_file', '=', $tree->id())
920                    ->delete();
921                break;
922
923            default:
924                DB::table('other')
925                    ->where('o_id', '=', $gid)
926                    ->where('o_file', '=', $tree->id())
927                    ->delete();
928                break;
929        }
930
931        if (!$delete) {
932            $this->importRecord($gedrec, $tree, true);
933        }
934    }
935}
936