xref: /webtrees/app/Services/GedcomImportService.php (revision 0e7e67a63333ea4fb57ff0664b5a1445655d9f62)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2022 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Services;
21
22use Fisharebest\Webtrees\Date;
23use Fisharebest\Webtrees\Elements\UnknownElement;
24use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
25use Fisharebest\Webtrees\Family;
26use Fisharebest\Webtrees\Gedcom;
27use Fisharebest\Webtrees\Header;
28use Fisharebest\Webtrees\Individual;
29use Fisharebest\Webtrees\Location;
30use Fisharebest\Webtrees\Media;
31use Fisharebest\Webtrees\Note;
32use Fisharebest\Webtrees\Place;
33use Fisharebest\Webtrees\PlaceLocation;
34use Fisharebest\Webtrees\Registry;
35use Fisharebest\Webtrees\Repository;
36use Fisharebest\Webtrees\Soundex;
37use Fisharebest\Webtrees\Source;
38use Fisharebest\Webtrees\Submission;
39use Fisharebest\Webtrees\Submitter;
40use Fisharebest\Webtrees\Tree;
41use Illuminate\Database\Capsule\Manager as DB;
42use Illuminate\Database\Query\JoinClause;
43
44use function app;
45use function array_chunk;
46use function array_intersect_key;
47use function array_map;
48use function array_unique;
49use function assert;
50use function date;
51use function explode;
52use function max;
53use function mb_substr;
54use function preg_match;
55use function preg_match_all;
56use function preg_replace;
57use function round;
58use function str_contains;
59use function str_replace;
60use function str_starts_with;
61use function strlen;
62use function strtolower;
63use function strtoupper;
64use function strtr;
65use function substr;
66use function trim;
67
68use const PREG_SET_ORDER;
69
70/**
71 * Class GedcomImportService - import GEDCOM data
72 */
73class GedcomImportService
74{
75    /**
76     * Tidy up a gedcom record on import, so that we can access it consistently/efficiently.
77     *
78     * @param string $rec
79     * @param Tree   $tree
80     *
81     * @return string
82     */
83    private function reformatRecord(string $rec, Tree $tree): string
84    {
85        $gedcom_service = app(GedcomService::class);
86        assert($gedcom_service instanceof GedcomService);
87
88        // Strip out mac/msdos line endings
89        $rec = preg_replace("/[\r\n]+/", "\n", $rec);
90
91        // Extract lines from the record; lines consist of: level + optional xref + tag + optional data
92        $num_matches = preg_match_all('/^[ \t]*(\d+)[ \t]*(@[^@]*@)?[ \t]*(\w+)[ \t]?(.*)$/m', $rec, $matches, PREG_SET_ORDER);
93
94        // Process the record line-by-line
95        $newrec = '';
96        foreach ($matches as $n => $match) {
97            [, $level, $xref, $tag, $data] = $match;
98
99            $tag = $gedcom_service->canonicalTag($tag);
100
101            switch ($tag) {
102                case 'DATE':
103                    // Preserve text from INT dates
104                    if (str_contains($data, '(')) {
105                        [$date, $text] = explode('(', $data, 2);
106                        $text = ' (' . $text;
107                    } else {
108                        $date = $data;
109                        $text = '';
110                    }
111                    // Capitals
112                    $date = strtoupper($date);
113                    // Temporarily add leading/trailing spaces, to allow efficient matching below
114                    $date = ' ' . $date . ' ';
115                    // Ensure space digits and letters
116                    $date = preg_replace('/([A-Z])(\d)/', '$1 $2', $date);
117                    $date = preg_replace('/(\d)([A-Z])/', '$1 $2', $date);
118                    // Ensure space before/after calendar escapes
119                    $date = preg_replace('/@#[^@]+@/', ' $0 ', $date);
120                    // "BET." => "BET"
121                    $date = preg_replace('/(\w\w)\./', '$1', $date);
122                    // "CIR" => "ABT"
123                    $date = str_replace(' CIR ', ' ABT ', $date);
124                    $date = str_replace(' APX ', ' ABT ', $date);
125                    // B.C. => BC (temporarily, to allow easier handling of ".")
126                    $date = str_replace(' B.C. ', ' BC ', $date);
127                    // TMG uses "EITHER X OR Y"
128                    $date = preg_replace('/^ EITHER (.+) OR (.+)/', ' BET $1 AND $2', $date);
129                    // "BET X - Y " => "BET X AND Y"
130                    $date = preg_replace('/^(.* BET .+) - (.+)/', '$1 AND $2', $date);
131                    $date = preg_replace('/^(.* FROM .+) - (.+)/', '$1 TO $2', $date);
132                    // "@#ESC@ FROM X TO Y" => "FROM @#ESC@ X TO @#ESC@ Y"
133                    $date = preg_replace('/^ +(@#[^@]+@) +FROM +(.+) +TO +(.+)/', ' FROM $1 $2 TO $1 $3', $date);
134                    $date = preg_replace('/^ +(@#[^@]+@) +BET +(.+) +AND +(.+)/', ' BET $1 $2 AND $1 $3', $date);
135                    // "@#ESC@ AFT X" => "AFT @#ESC@ X"
136                    $date = preg_replace('/^ +(@#[^@]+@) +(FROM|BET|TO|AND|BEF|AFT|CAL|EST|INT|ABT) +(.+)/', ' $2 $1 $3', $date);
137                    // Ignore any remaining punctuation, e.g. "14-MAY, 1900" => "14 MAY 1900"
138                    // (don't change "/" - it is used in NS/OS dates)
139                    $date = preg_replace('/[.,:;-]/', ' ', $date);
140                    // BC => B.C.
141                    $date = str_replace(' BC ', ' B.C. ', $date);
142                    // Append the "INT" text
143                    $data = $date . $text;
144                    break;
145                case 'FORM':
146                    // Consistent commas
147                    $data = preg_replace('/ *, */', ', ', $data);
148                    break;
149                case 'HEAD':
150                    // HEAD records don't have an XREF or DATA
151                    if ($level === '0') {
152                        $xref = '';
153                        $data = '';
154                    }
155                    break;
156                case 'NAME':
157                    // Tidy up non-printing characters
158                    $data = preg_replace('/  +/', ' ', trim($data));
159                    break;
160                case 'PLAC':
161                    // Consistent commas
162                    $data = preg_replace('/ *[,,،] */u', ', ', $data);
163                    // The Master Genealogist stores LAT/LONG data in the PLAC field, e.g. Pennsylvania, USA, 395945N0751013W
164                    if (preg_match('/(.*), (\d\d)(\d\d)(\d\d)([NS])(\d\d\d)(\d\d)(\d\d)([EW])$/', $data, $match)) {
165                        $data =
166                            $match[1] . "\n" .
167                            ($level + 1) . " MAP\n" .
168                            ($level + 2) . ' LATI ' . ($match[5] . round($match[2] + ($match[3] / 60) + ($match[4] / 3600), 4)) . "\n" .
169                            ($level + 2) . ' LONG ' . ($match[9] . round($match[6] + ($match[7] / 60) + ($match[8] / 3600), 4));
170                    }
171                    break;
172                case 'RESN':
173                    // RESN values are lower case (confidential, privacy, locked, none)
174                    $data = strtolower($data);
175                    if ($data === 'invisible') {
176                        $data = 'confidential'; // From old versions of Legacy.
177                    }
178                    break;
179                case 'SEX':
180                    $data = strtoupper($data);
181                    break;
182                case 'TRLR':
183                    // TRLR records don't have an XREF or DATA
184                    if ($level === '0') {
185                        $xref = '';
186                        $data = '';
187                    }
188                    break;
189            }
190            // Suppress "Y", for facts/events with a DATE or PLAC
191            if ($data === 'y') {
192                $data = 'Y';
193            }
194            if ($level === '1' && $data === 'Y') {
195                for ($i = $n + 1; $i < $num_matches - 1 && $matches[$i][1] !== '1'; ++$i) {
196                    if ($matches[$i][3] === 'DATE' || $matches[$i][3] === 'PLAC') {
197                        $data = '';
198                        break;
199                    }
200                }
201            }
202            // Reassemble components back into a single line
203            switch ($tag) {
204                default:
205                    // Remove tabs and multiple/leading/trailing spaces
206                    $data = strtr($data, ["\t" => ' ']);
207                    $data = trim($data, ' ');
208                    while (str_contains($data, '  ')) {
209                        $data = strtr($data, ['  ' => ' ']);
210                    }
211                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
212                    break;
213                case 'NOTE':
214                case 'TEXT':
215                case 'DATA':
216                case 'CONT':
217                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
218                    break;
219                case 'FILE':
220                    // Strip off the user-defined path prefix
221                    $GEDCOM_MEDIA_PATH = $tree->getPreference('GEDCOM_MEDIA_PATH');
222                    if ($GEDCOM_MEDIA_PATH !== '' && str_starts_with($data, $GEDCOM_MEDIA_PATH)) {
223                        $data = substr($data, strlen($GEDCOM_MEDIA_PATH));
224                    }
225                    // convert backslashes in filenames to forward slashes
226                    $data = preg_replace("/\\\\/", '/', $data);
227
228                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
229                    break;
230                case 'CONC':
231                    // Merge CONC lines, to simplify access later on.
232                    $newrec .= ($tree->getPreference('WORD_WRAPPED_NOTES') ? ' ' : '') . $data;
233                    break;
234            }
235        }
236
237        return $newrec;
238    }
239
240    /**
241     * import record into database
242     * this function will parse the given gedcom record and add it to the database
243     *
244     * @param string $gedrec the raw gedcom record to parse
245     * @param Tree   $tree   import the record into this tree
246     * @param bool   $update whether this is an updated record that has been accepted
247     *
248     * @return void
249     * @throws GedcomErrorException
250     */
251    public function importRecord(string $gedrec, Tree $tree, bool $update): void
252    {
253        $tree_id = $tree->id();
254
255        // Escaped @ signs (only if importing from file)
256        if (!$update) {
257            $gedrec = str_replace('@@', '@', $gedrec);
258        }
259
260        // Standardise gedcom format
261        $gedrec = $this->reformatRecord($gedrec, $tree);
262
263        // import different types of records
264        if (preg_match('/^0 @(' . Gedcom::REGEX_XREF . ')@ (' . Gedcom::REGEX_TAG . ')/', $gedrec, $match)) {
265            [, $xref, $type] = $match;
266        } elseif (str_starts_with($gedrec, '0 HEAD')) {
267            $type = 'HEAD';
268            $xref = 'HEAD'; // For records without an XREF, use the type as a pseudo XREF.
269        } elseif (str_starts_with($gedrec, '0 TRLR')) {
270            $tree->setPreference('imported', '1');
271            $type = 'TRLR';
272            $xref = 'TRLR'; // For records without an XREF, use the type as a pseudo XREF.
273        } elseif (str_starts_with($gedrec, '0 _PLAC_DEFN')) {
274            $this->importLegacyPlacDefn($gedrec);
275
276            return;
277        } elseif (str_starts_with($gedrec, '0 _PLAC ')) {
278            $this->importTNGPlac($gedrec);
279
280            return;
281        } else {
282            foreach (Gedcom::CUSTOM_RECORDS_WITHOUT_XREFS as $record_type) {
283                if (preg_match('/^0 ' . $record_type . '\b/', $gedrec) === 1) {
284                    return;
285                }
286            }
287
288            throw new GedcomErrorException($gedrec);
289        }
290
291        // Add a _UID
292        if ($tree->getPreference('GENERATE_UIDS') === '1' && !str_contains($gedrec, "\n1 _UID ")) {
293            $element = Registry::elementFactory()->make($type . ':_UID');
294            if (!$element instanceof UnknownElement) {
295                $gedrec .= "\n1 _UID " . $element->default($tree);
296            }
297        }
298
299        // If the user has downloaded their GEDCOM data (containing media objects) and edited it
300        // using an application which does not support (and deletes) media objects, then add them
301        // back in.
302        if ($tree->getPreference('keep_media')) {
303            $old_linked_media = DB::table('link')
304                ->where('l_from', '=', $xref)
305                ->where('l_file', '=', $tree_id)
306                ->where('l_type', '=', 'OBJE')
307                ->pluck('l_to');
308
309            // Delete these links - so that we do not insert them again in updateLinks()
310            DB::table('link')
311                ->where('l_from', '=', $xref)
312                ->where('l_file', '=', $tree_id)
313                ->where('l_type', '=', 'OBJE')
314                ->delete();
315
316            foreach ($old_linked_media as $media_id) {
317                $gedrec .= "\n1 OBJE @" . $media_id . '@';
318            }
319        }
320
321        // Convert inline media into media objects
322        $gedrec = $this->convertInlineMedia($tree, $gedrec);
323
324        switch ($type) {
325            case Individual::RECORD_TYPE:
326                $record = Registry::individualFactory()->new($xref, $gedrec, null, $tree);
327
328                if (preg_match('/\n1 RIN (.+)/', $gedrec, $match)) {
329                    $rin = $match[1];
330                } else {
331                    $rin = $xref;
332                }
333
334                // The database can only store MFU, and many of the stats queries assume this.
335                $sex = $record->sex();
336                $sex = $sex === 'M' || $sex === 'F' ? $sex : 'U';
337
338                DB::table('individuals')->insert([
339                    'i_id'     => $xref,
340                    'i_file'   => $tree_id,
341                    'i_rin'    => $rin,
342                    'i_sex'    => $sex,
343                    'i_gedcom' => $gedrec,
344                ]);
345
346                // Update the cross-reference/index tables.
347                $this->updatePlaces($xref, $tree, $gedrec);
348                $this->updateDates($xref, $tree_id, $gedrec);
349                $this->updateNames($xref, $tree_id, $record);
350                break;
351
352            case Family::RECORD_TYPE:
353                if (preg_match('/\n1 HUSB @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match)) {
354                    $husb = $match[1];
355                } else {
356                    $husb = '';
357                }
358                if (preg_match('/\n1 WIFE @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match)) {
359                    $wife = $match[1];
360                } else {
361                    $wife = '';
362                }
363                $nchi = preg_match_all('/\n1 CHIL @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match);
364                if (preg_match('/\n1 NCHI (\d+)/', $gedrec, $match)) {
365                    $nchi = max($nchi, $match[1]);
366                }
367
368                DB::table('families')->insert([
369                    'f_id'      => $xref,
370                    'f_file'    => $tree_id,
371                    'f_husb'    => $husb,
372                    'f_wife'    => $wife,
373                    'f_gedcom'  => $gedrec,
374                    'f_numchil' => $nchi,
375                ]);
376
377                // Update the cross-reference/index tables.
378                $this->updatePlaces($xref, $tree, $gedrec);
379                $this->updateDates($xref, $tree_id, $gedrec);
380                break;
381
382            case Source::RECORD_TYPE:
383                if (preg_match('/\n1 TITL (.+)/', $gedrec, $match)) {
384                    $name = $match[1];
385                } elseif (preg_match('/\n1 ABBR (.+)/', $gedrec, $match)) {
386                    $name = $match[1];
387                } else {
388                    $name = $xref;
389                }
390
391                DB::table('sources')->insert([
392                    's_id'     => $xref,
393                    's_file'   => $tree_id,
394                    's_name'   => mb_substr($name, 0, 255),
395                    's_gedcom' => $gedrec,
396                ]);
397                break;
398
399            case Repository::RECORD_TYPE:
400            case Note::RECORD_TYPE:
401            case Submission::RECORD_TYPE:
402            case Submitter::RECORD_TYPE:
403            case Location::RECORD_TYPE:
404                DB::table('other')->insert([
405                    'o_id'     => $xref,
406                    'o_file'   => $tree_id,
407                    'o_type'   => $type,
408                    'o_gedcom' => $gedrec,
409                ]);
410                break;
411
412            case Header::RECORD_TYPE:
413                // Force HEAD records to have a creation date.
414                if (!str_contains($gedrec, "\n1 DATE ")) {
415                    $today = strtoupper(date('d M Y'));
416                    $gedrec .= "\n1 DATE " . $today;
417                }
418
419                DB::table('other')->insert([
420                    'o_id'     => $xref,
421                    'o_file'   => $tree_id,
422                    'o_type'   => Header::RECORD_TYPE,
423                    'o_gedcom' => $gedrec,
424                ]);
425                break;
426
427
428            case Media::RECORD_TYPE:
429                $record = Registry::mediaFactory()->new($xref, $gedrec, null, $tree);
430
431                DB::table('media')->insert([
432                    'm_id'     => $xref,
433                    'm_file'   => $tree_id,
434                    'm_gedcom' => $gedrec,
435                ]);
436
437                foreach ($record->mediaFiles() as $media_file) {
438                    DB::table('media_file')->insert([
439                        'm_id'                 => $xref,
440                        'm_file'               => $tree_id,
441                        'multimedia_file_refn' => mb_substr($media_file->filename(), 0, 248),
442                        'multimedia_format'    => mb_substr($media_file->format(), 0, 4),
443                        'source_media_type'    => mb_substr($media_file->type(), 0, 15),
444                        'descriptive_title'    => mb_substr($media_file->title(), 0, 248),
445                    ]);
446                }
447                break;
448
449            default: // Custom record types.
450                DB::table('other')->insert([
451                    'o_id'     => $xref,
452                    'o_file'   => $tree_id,
453                    'o_type'   => mb_substr($type, 0, 15),
454                    'o_gedcom' => $gedrec,
455                ]);
456                break;
457        }
458
459        // Update the cross-reference/index tables.
460        $this->updateLinks($xref, $tree_id, $gedrec);
461    }
462
463    /**
464     * Legacy Family Tree software generates _PLAC_DEFN records containing LAT/LONG values
465     *
466     * @param string $gedcom
467     */
468    private function importLegacyPlacDefn(string $gedcom): void
469    {
470        $gedcom_service = new GedcomService();
471
472        if (preg_match('/\n1 PLAC (.+)/', $gedcom, $match)) {
473            $place_name = $match[1];
474        } else {
475            return;
476        }
477
478        if (preg_match('/\n3 LATI ([NS].+)/', $gedcom, $match)) {
479            $latitude = $gedcom_service->readLatitude($match[1]);
480        } else {
481            return;
482        }
483
484        if (preg_match('/\n3 LONG ([EW].+)/', $gedcom, $match)) {
485            $longitude = $gedcom_service->readLongitude($match[1]);
486        } else {
487            return;
488        }
489
490        $location = new PlaceLocation($place_name);
491
492        if ($location->latitude() === null && $location->longitude() === null) {
493            DB::table('place_location')
494                ->where('id', '=', $location->id())
495                ->update([
496                    'latitude'  => $latitude,
497                    'longitude' => $longitude,
498                ]);
499        }
500    }
501
502    /**
503     * Legacy Family Tree software generates _PLAC records containing LAT/LONG values
504     *
505     * @param string $gedcom
506     */
507    private function importTNGPlac(string $gedcom): void
508    {
509        if (preg_match('/^0 _PLAC (.+)/', $gedcom, $match)) {
510            $place_name = $match[1];
511        } else {
512            return;
513        }
514
515        if (preg_match('/\n2 LATI (.+)/', $gedcom, $match)) {
516            $latitude = (float) $match[1];
517        } else {
518            return;
519        }
520
521        if (preg_match('/\n2 LONG (.+)/', $gedcom, $match)) {
522            $longitude = (float) $match[1];
523        } else {
524            return;
525        }
526
527        $location = new PlaceLocation($place_name);
528
529        if ($location->latitude() === null && $location->longitude() === null) {
530            DB::table('place_location')
531                ->where('id', '=', $location->id())
532                ->update([
533                    'latitude'  => $latitude,
534                    'longitude' => $longitude,
535                ]);
536        }
537    }
538
539    /**
540     * Extract all level 2 places from the given record and insert them into the places table
541     *
542     * @param string $xref
543     * @param Tree   $tree
544     * @param string $gedrec
545     *
546     * @return void
547     */
548    public function updatePlaces(string $xref, Tree $tree, string $gedrec): void
549    {
550        // Insert all new rows together
551        $rows = [];
552
553        preg_match_all('/\n2 PLAC (.+)/', $gedrec, $matches);
554
555        $places = array_unique($matches[1]);
556
557        foreach ($places as $place_name) {
558            $place = new Place($place_name, $tree);
559
560            // Calling Place::id() will create the entry in the database, if it doesn't already exist.
561            while ($place->id() !== 0) {
562                $rows[] = [
563                    'pl_p_id' => $place->id(),
564                    'pl_gid'  => $xref,
565                    'pl_file' => $tree->id(),
566                ];
567
568                $place = $place->parent();
569            }
570        }
571
572        // array_unique doesn't work with arrays of arrays
573        $rows = array_intersect_key($rows, array_unique(array_map('serialize', $rows)));
574
575        // PDO has a limit of 65535 placeholders, and each row requires 3 placeholders.
576        foreach (array_chunk($rows, 20000) as $chunk) {
577            DB::table('placelinks')->insert($chunk);
578        }
579    }
580
581    /**
582     * Extract all the dates from the given record and insert them into the database.
583     *
584     * @param string $xref
585     * @param int    $ged_id
586     * @param string $gedrec
587     *
588     * @return void
589     */
590    private function updateDates(string $xref, int $ged_id, string $gedrec): void
591    {
592        // Insert all new rows together
593        $rows = [];
594
595        preg_match_all("/\n1 (\w+).*(?:\n[2-9].*)*\n2 DATE (.+)(?:\n[2-9].*)*/", $gedrec, $matches, PREG_SET_ORDER);
596
597        foreach ($matches as $match) {
598            $fact = $match[1];
599            $date = new Date($match[2]);
600            $rows[] = [
601                'd_day'        => $date->minimumDate()->day,
602                'd_month'      => $date->minimumDate()->format('%O'),
603                'd_mon'        => $date->minimumDate()->month,
604                'd_year'       => $date->minimumDate()->year,
605                'd_julianday1' => $date->minimumDate()->minimumJulianDay(),
606                'd_julianday2' => $date->minimumDate()->maximumJulianDay(),
607                'd_fact'       => $fact,
608                'd_gid'        => $xref,
609                'd_file'       => $ged_id,
610                'd_type'       => $date->minimumDate()->format('%@'),
611            ];
612
613            $rows[] = [
614                'd_day'        => $date->maximumDate()->day,
615                'd_month'      => $date->maximumDate()->format('%O'),
616                'd_mon'        => $date->maximumDate()->month,
617                'd_year'       => $date->maximumDate()->year,
618                'd_julianday1' => $date->maximumDate()->minimumJulianDay(),
619                'd_julianday2' => $date->maximumDate()->maximumJulianDay(),
620                'd_fact'       => $fact,
621                'd_gid'        => $xref,
622                'd_file'       => $ged_id,
623                'd_type'       => $date->minimumDate()->format('%@'),
624            ];
625        }
626
627        // array_unique doesn't work with arrays of arrays
628        $rows = array_intersect_key($rows, array_unique(array_map('serialize', $rows)));
629
630        DB::table('dates')->insert($rows);
631    }
632
633    /**
634     * Extract all the links from the given record and insert them into the database
635     *
636     * @param string $xref
637     * @param int    $ged_id
638     * @param string $gedrec
639     *
640     * @return void
641     */
642    private function updateLinks(string $xref, int $ged_id, string $gedrec): void
643    {
644        // Insert all new rows together
645        $rows = [];
646
647        preg_match_all('/\n\d+ (' . Gedcom::REGEX_TAG . ') @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $matches, PREG_SET_ORDER);
648
649        foreach ($matches as $match) {
650            // Some applications (e.g. GenoPro) create links longer than 15 characters.
651            $link = mb_substr($match[1], 0, 15);
652
653            // Take care of "duplicates" that differ on case/collation, e.g. "SOUR @S1@" and "SOUR @s1@"
654            $rows[$link . strtoupper($match[2])] = [
655                'l_from' => $xref,
656                'l_to'   => $match[2],
657                'l_type' => $link,
658                'l_file' => $ged_id,
659            ];
660        }
661
662        DB::table('link')->insert($rows);
663    }
664
665    /**
666     * Extract all the names from the given record and insert them into the database.
667     *
668     * @param string     $xref
669     * @param int        $ged_id
670     * @param Individual $record
671     *
672     * @return void
673     */
674    private function updateNames(string $xref, int $ged_id, Individual $record): void
675    {
676        // Insert all new rows together
677        $rows = [];
678
679        foreach ($record->getAllNames() as $n => $name) {
680            if ($name['givn'] === Individual::PRAENOMEN_NESCIO) {
681                $soundex_givn_std = null;
682                $soundex_givn_dm  = null;
683            } else {
684                $soundex_givn_std = Soundex::russell($name['givn']);
685                $soundex_givn_dm  = Soundex::daitchMokotoff($name['givn']);
686            }
687
688            if ($name['surn'] === Individual::NOMEN_NESCIO) {
689                $soundex_surn_std = null;
690                $soundex_surn_dm  = null;
691            } else {
692                $soundex_surn_std = Soundex::russell($name['surname']);
693                $soundex_surn_dm  = Soundex::daitchMokotoff($name['surname']);
694            }
695
696            $rows[] = [
697                'n_file'             => $ged_id,
698                'n_id'               => $xref,
699                'n_num'              => $n,
700                'n_type'             => $name['type'],
701                'n_sort'             => mb_substr($name['sort'], 0, 255),
702                'n_full'             => mb_substr($name['fullNN'], 0, 255),
703                'n_surname'          => mb_substr($name['surname'], 0, 255),
704                'n_surn'             => mb_substr($name['surn'], 0, 255),
705                'n_givn'             => mb_substr($name['givn'], 0, 255),
706                'n_soundex_givn_std' => $soundex_givn_std,
707                'n_soundex_surn_std' => $soundex_surn_std,
708                'n_soundex_givn_dm'  => $soundex_givn_dm,
709                'n_soundex_surn_dm'  => $soundex_surn_dm,
710            ];
711        }
712
713        DB::table('name')->insert($rows);
714    }
715
716    /**
717     * Extract inline media data, and convert to media objects.
718     *
719     * @param Tree   $tree
720     * @param string $gedcom
721     *
722     * @return string
723     */
724    private function convertInlineMedia(Tree $tree, string $gedcom): string
725    {
726        while (preg_match('/\n1 OBJE(?:\n[2-9].+)+/', $gedcom, $match)) {
727            $xref   = $this->createMediaObject($match[0], $tree);
728            $gedcom = strtr($gedcom, [$match[0] =>  "\n1 OBJE @" . $xref . '@']);
729        }
730        while (preg_match('/\n2 OBJE(?:\n[3-9].+)+/', $gedcom, $match)) {
731            $xref   = $this->createMediaObject($match[0], $tree);
732            $gedcom = strtr($gedcom, [$match[0] =>  "\n2 OBJE @" . $xref . '@']);
733        }
734        while (preg_match('/\n3 OBJE(?:\n[4-9].+)+/', $gedcom, $match)) {
735            $xref   = $this->createMediaObject($match[0], $tree);
736            $gedcom = strtr($gedcom, [$match[0] =>  "\n3 OBJE @" . $xref . '@']);
737        }
738
739        return $gedcom;
740    }
741
742    /**
743     * Create a new media object, from inline media data.
744     *
745     * GEDCOM 5.5.1 specifies: +1 FILE / +2 FORM / +3 MEDI / +1 TITL
746     * GEDCOM 5.5 specifies: +1 FILE / +1 FORM / +1 TITL
747     * GEDCOM 5.5.1 says that GEDCOM 5.5 specifies:  +1 FILE / +1 FORM / +2 MEDI
748     *
749     * Legacy generates: +1 FORM / +1 FILE / +1 TITL / +1 _SCBK / +1 _PRIM / +1 _TYPE / +1 NOTE
750     * RootsMagic generates: +1 FILE / +1 FORM / +1 TITL
751     *
752     * @param string $gedcom
753     * @param Tree   $tree
754     *
755     * @return string
756     */
757    private function createMediaObject(string $gedcom, Tree $tree): string
758    {
759        preg_match('/\n\d FILE (.+)/', $gedcom, $match);
760        $file = $match[1] ?? '';
761
762        preg_match('/\n\d TITL (.+)/', $gedcom, $match);
763        $title = $match[1] ?? '';
764
765        preg_match('/\n\d FORM (.+)/', $gedcom, $match);
766        $format = $match[1] ?? '';
767
768        preg_match('/\n\d MEDI (.+)/', $gedcom, $match);
769        $media = $match[1] ?? '';
770
771        preg_match('/\n\d _SCBK (.+)/', $gedcom, $match);
772        $scrapbook = $match[1] ?? '';
773
774        preg_match('/\n\d _PRIM (.+)/', $gedcom, $match);
775        $primary = $match[1] ?? '';
776
777        preg_match('/\n\d _TYPE (.+)/', $gedcom, $match);
778        if ($media === '') {
779            // Legacy uses _TYPE instead of MEDI
780            $media = $match[1] ?? '';
781            $type  = '';
782        } else {
783            $type = $match[1] ?? '';
784        }
785
786        preg_match_all('/\n\d NOTE (.+(?:\n\d CONT.*)*)/', $gedcom, $matches);
787        $notes = $matches[1] ?? [];
788
789        // Have we already created a media object with the same title/filename?
790        $xref = DB::table('media_file')
791            ->where('m_file', '=', $tree->id())
792            ->where('descriptive_title', '=', mb_substr($title, 0, 248))
793            ->where('multimedia_file_refn', '=', mb_substr($file, 0, 248))
794            ->value('m_id');
795
796        if ($xref === null) {
797            $xref = Registry::xrefFactory()->make(Media::RECORD_TYPE);
798
799            // convert to a media-object
800            $gedcom = '0 @' . $xref . "@ OBJE\n1 FILE " . $file;
801
802            if ($format !== '') {
803                $gedcom .= "\n2 FORM " . $format;
804
805                if ($media !== '') {
806                    $gedcom .= "\n3 TYPE " . $media;
807                }
808            }
809
810            if ($title !== '') {
811                $gedcom .= "\n3 TITL " . $title;
812            }
813
814            if ($scrapbook !== '') {
815                $gedcom .= "\n1 _SCBK " . $scrapbook;
816            }
817
818            if ($primary !== '') {
819                $gedcom .= "\n1 _PRIM " . $primary;
820            }
821
822            if ($type !== '') {
823                $gedcom .= "\n1 _TYPE " . $type;
824            }
825
826            foreach ($notes as $note) {
827                $gedcom .= "\n1 NOTE " . strtr($note, ["\n3" => "\n2", "\n4" => "\n2", "\n5" => "\n2"]);
828            }
829
830            DB::table('media')->insert([
831                'm_id'     => $xref,
832                'm_file'   => $tree->id(),
833                'm_gedcom' => $gedcom,
834            ]);
835
836            DB::table('media_file')->insert([
837                'm_id'                 => $xref,
838                'm_file'               => $tree->id(),
839                'multimedia_file_refn' => mb_substr($file, 0, 248),
840                'multimedia_format'    => mb_substr($format, 0, 4),
841                'source_media_type'    => mb_substr($media, 0, 15),
842                'descriptive_title'    => mb_substr($title, 0, 248),
843            ]);
844        }
845
846        return $xref;
847    }
848
849    /**
850     * update a record in the database
851     *
852     * @param string $gedrec
853     * @param Tree   $tree
854     * @param bool   $delete
855     *
856     * @return void
857     * @throws GedcomErrorException
858     */
859    public function updateRecord(string $gedrec, Tree $tree, bool $delete): void
860    {
861        if (preg_match('/^0 @(' . Gedcom::REGEX_XREF . ')@ (' . Gedcom::REGEX_TAG . ')/', $gedrec, $match)) {
862            [, $gid, $type] = $match;
863        } elseif (preg_match('/^0 (HEAD)(?:\n|$)/', $gedrec, $match)) {
864            // The HEAD record has no XREF.  Any others?
865            $gid  = $match[1];
866            $type = $match[1];
867        } else {
868            throw new GedcomErrorException($gedrec);
869        }
870
871        // Place links
872        DB::table('placelinks')
873            ->where('pl_gid', '=', $gid)
874            ->where('pl_file', '=', $tree->id())
875            ->delete();
876
877        // Orphaned places.  If we're deleting  "Westminster, London, England",
878        // then we may also need to delete "London, England" and "England".
879        do {
880            $affected = DB::table('places')
881                ->leftJoin('placelinks', function (JoinClause $join): void {
882                    $join
883                        ->on('p_id', '=', 'pl_p_id')
884                        ->on('p_file', '=', 'pl_file');
885                })
886                ->whereNull('pl_p_id')
887                ->delete();
888        } while ($affected > 0);
889
890        DB::table('dates')
891            ->where('d_gid', '=', $gid)
892            ->where('d_file', '=', $tree->id())
893            ->delete();
894
895        DB::table('name')
896            ->where('n_id', '=', $gid)
897            ->where('n_file', '=', $tree->id())
898            ->delete();
899
900        DB::table('link')
901            ->where('l_from', '=', $gid)
902            ->where('l_file', '=', $tree->id())
903            ->delete();
904
905        switch ($type) {
906            case Individual::RECORD_TYPE:
907                DB::table('individuals')
908                    ->where('i_id', '=', $gid)
909                    ->where('i_file', '=', $tree->id())
910                    ->delete();
911                break;
912
913            case Family::RECORD_TYPE:
914                DB::table('families')
915                    ->where('f_id', '=', $gid)
916                    ->where('f_file', '=', $tree->id())
917                    ->delete();
918                break;
919
920            case Source::RECORD_TYPE:
921                DB::table('sources')
922                    ->where('s_id', '=', $gid)
923                    ->where('s_file', '=', $tree->id())
924                    ->delete();
925                break;
926
927            case Media::RECORD_TYPE:
928                DB::table('media_file')
929                    ->where('m_id', '=', $gid)
930                    ->where('m_file', '=', $tree->id())
931                    ->delete();
932
933                DB::table('media')
934                    ->where('m_id', '=', $gid)
935                    ->where('m_file', '=', $tree->id())
936                    ->delete();
937                break;
938
939            default:
940                DB::table('other')
941                    ->where('o_id', '=', $gid)
942                    ->where('o_file', '=', $tree->id())
943                    ->delete();
944                break;
945        }
946
947        if (!$delete) {
948            $this->importRecord($gedrec, $tree, true);
949        }
950    }
951}
952