xref: /webtrees/app/Services/GedcomImportService.php (revision b5505f697291435abadf92d9c68555144f816161)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2023 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Services;
21
22use Fisharebest\Webtrees\Date;
23use Fisharebest\Webtrees\Elements\UnknownElement;
24use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
25use Fisharebest\Webtrees\Family;
26use Fisharebest\Webtrees\Gedcom;
27use Fisharebest\Webtrees\Header;
28use Fisharebest\Webtrees\Individual;
29use Fisharebest\Webtrees\Location;
30use Fisharebest\Webtrees\Media;
31use Fisharebest\Webtrees\Note;
32use Fisharebest\Webtrees\Place;
33use Fisharebest\Webtrees\PlaceLocation;
34use Fisharebest\Webtrees\Registry;
35use Fisharebest\Webtrees\Repository;
36use Fisharebest\Webtrees\Soundex;
37use Fisharebest\Webtrees\Source;
38use Fisharebest\Webtrees\Submission;
39use Fisharebest\Webtrees\Submitter;
40use Fisharebest\Webtrees\Tree;
41use Illuminate\Database\Capsule\Manager as DB;
42use Illuminate\Database\Query\JoinClause;
43
44use function app;
45use function array_chunk;
46use function array_intersect_key;
47use function array_map;
48use function array_unique;
49use function assert;
50use function date;
51use function explode;
52use function max;
53use function mb_substr;
54use function preg_match;
55use function preg_match_all;
56use function preg_replace;
57use function round;
58use function str_contains;
59use function str_replace;
60use function str_starts_with;
61use function strlen;
62use function strtoupper;
63use function strtr;
64use function substr;
65use function trim;
66
67use const PREG_SET_ORDER;
68
69/**
70 * Class GedcomImportService - import GEDCOM data
71 */
72class GedcomImportService
73{
74    /**
75     * Tidy up a gedcom record on import, so that we can access it consistently/efficiently.
76     *
77     * @param string $rec
78     * @param Tree   $tree
79     *
80     * @return string
81     */
82    private function reformatRecord(string $rec, Tree $tree): string
83    {
84        $gedcom_service = app(GedcomService::class);
85        assert($gedcom_service instanceof GedcomService);
86
87        // Strip out mac/msdos line endings
88        $rec = preg_replace("/[\r\n]+/", "\n", $rec);
89
90        // Extract lines from the record; lines consist of: level + optional xref + tag + optional data
91        $num_matches = preg_match_all('/^[ \t]*(\d+)[ \t]*(@[^@]*@)?[ \t]*(\w+)[ \t]?(.*)$/m', $rec, $matches, PREG_SET_ORDER);
92
93        // Process the record line-by-line
94        $newrec = '';
95        foreach ($matches as $n => $match) {
96            [, $level, $xref, $tag, $data] = $match;
97
98            $tag = $gedcom_service->canonicalTag($tag);
99
100            switch ($tag) {
101                case 'DATE':
102                    // Preserve text from INT dates
103                    if (str_contains($data, '(')) {
104                        [$date, $text] = explode('(', $data, 2);
105                        $text = ' (' . $text;
106                    } else {
107                        $date = $data;
108                        $text = '';
109                    }
110                    // Capitals
111                    $date = strtoupper($date);
112                    // Temporarily add leading/trailing spaces, to allow efficient matching below
113                    $date = ' ' . $date . ' ';
114                    // Ensure space digits and letters
115                    $date = preg_replace('/([A-Z])(\d)/', '$1 $2', $date);
116                    $date = preg_replace('/(\d)([A-Z])/', '$1 $2', $date);
117                    // Ensure space before/after calendar escapes
118                    $date = preg_replace('/@#[^@]+@/', ' $0 ', $date);
119                    // "BET." => "BET"
120                    $date = preg_replace('/(\w\w)\./', '$1', $date);
121                    // "CIR" => "ABT"
122                    $date = str_replace(' CIR ', ' ABT ', $date);
123                    $date = str_replace(' APX ', ' ABT ', $date);
124                    // B.C. => BC (temporarily, to allow easier handling of ".")
125                    $date = str_replace(' B.C. ', ' BC ', $date);
126                    // TMG uses "EITHER X OR Y"
127                    $date = preg_replace('/^ EITHER (.+) OR (.+)/', ' BET $1 AND $2', $date);
128                    // "BET X - Y " => "BET X AND Y"
129                    $date = preg_replace('/^(.* BET .+) - (.+)/', '$1 AND $2', $date);
130                    $date = preg_replace('/^(.* FROM .+) - (.+)/', '$1 TO $2', $date);
131                    // "@#ESC@ FROM X TO Y" => "FROM @#ESC@ X TO @#ESC@ Y"
132                    $date = preg_replace('/^ +(@#[^@]+@) +FROM +(.+) +TO +(.+)/', ' FROM $1 $2 TO $1 $3', $date);
133                    $date = preg_replace('/^ +(@#[^@]+@) +BET +(.+) +AND +(.+)/', ' BET $1 $2 AND $1 $3', $date);
134                    // "@#ESC@ AFT X" => "AFT @#ESC@ X"
135                    $date = preg_replace('/^ +(@#[^@]+@) +(FROM|BET|TO|AND|BEF|AFT|CAL|EST|INT|ABT) +(.+)/', ' $2 $1 $3', $date);
136                    // Ignore any remaining punctuation, e.g. "14-MAY, 1900" => "14 MAY 1900"
137                    // (don't change "/" - it is used in NS/OS dates)
138                    $date = preg_replace('/[.,:;-]/', ' ', $date);
139                    // BC => B.C.
140                    $date = str_replace(' BC ', ' B.C. ', $date);
141                    // Append the "INT" text
142                    $data = $date . $text;
143                    break;
144                case 'HEAD':
145                case 'TRLR':
146                    // HEAD and TRLR records do not have an XREF or DATA
147                    if ($level === '0') {
148                        $xref = '';
149                        $data = '';
150                    }
151                    break;
152                case 'NAME':
153                    // Tidy up non-printing characters
154                    $data = preg_replace('/  +/', ' ', trim($data));
155                    break;
156                case 'PLAC':
157                    // Consistent commas
158                    $data = preg_replace('/ *[,,،] */u', ', ', $data);
159                    // The Master Genealogist stores LAT/LONG data in the PLAC field, e.g. Pennsylvania, USA, 395945N0751013W
160                    if (preg_match('/(.*), (\d\d)(\d\d)(\d\d)([NS])(\d\d\d)(\d\d)(\d\d)([EW])$/', $data, $match) === 1) {
161                        $degns = (int) $match[2];
162                        $minns = (int) $match[3];
163                        $secns = (int) $match[4];
164                        $degew = (int) $match[6];
165                        $minew = (int) $match[7];
166                        $secew = (int) $match[8];
167                        $data =
168                            $match[1] . "\n" .
169                            (1 + (int) $level) . " MAP\n" .
170                            (2 + (int) $level) . ' LATI ' . ($match[5] . round($degns + $minns / 60 + $secns / 3600, 4)) . "\n" .
171                            (2 + (int) $level) . ' LONG ' . ($match[9] . round($degew + $minew / 60 + $secew / 3600, 4));
172                    }
173                    break;
174                case 'SEX':
175                    $data = strtoupper($data);
176                    break;
177            }
178            // Suppress "Y", for facts/events with a DATE or PLAC
179            if ($data === 'y') {
180                $data = 'Y';
181            }
182            if ($level === '1' && $data === 'Y') {
183                for ($i = $n + 1; $i < $num_matches - 1 && $matches[$i][1] !== '1'; ++$i) {
184                    if ($matches[$i][3] === 'DATE' || $matches[$i][3] === 'PLAC') {
185                        $data = '';
186                        break;
187                    }
188                }
189            }
190            // Reassemble components back into a single line
191            switch ($tag) {
192                default:
193                    // Remove tabs and multiple/leading/trailing spaces
194                    $data = strtr($data, ["\t" => ' ']);
195                    $data = trim($data, ' ');
196                    while (str_contains($data, '  ')) {
197                        $data = strtr($data, ['  ' => ' ']);
198                    }
199                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
200                    break;
201                case 'NOTE':
202                case 'TEXT':
203                case 'DATA':
204                case 'CONT':
205                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
206                    break;
207                case 'FILE':
208                    // Strip off the user-defined path prefix
209                    $GEDCOM_MEDIA_PATH = $tree->getPreference('GEDCOM_MEDIA_PATH');
210                    if ($GEDCOM_MEDIA_PATH !== '' && str_starts_with($data, $GEDCOM_MEDIA_PATH)) {
211                        $data = substr($data, strlen($GEDCOM_MEDIA_PATH));
212                    }
213                    // convert backslashes in filenames to forward slashes
214                    $data = preg_replace("/\\\\/", '/', $data);
215
216                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
217                    break;
218                case 'CONC':
219                    // Merge CONC lines, to simplify access later on.
220                    $newrec .= ($tree->getPreference('WORD_WRAPPED_NOTES') ? ' ' : '') . $data;
221                    break;
222            }
223        }
224
225        return $newrec;
226    }
227
228    /**
229     * import record into database
230     * this function will parse the given gedcom record and add it to the database
231     *
232     * @param string $gedrec the raw gedcom record to parse
233     * @param Tree   $tree   import the record into this tree
234     * @param bool   $update whether this is an updated record that has been accepted
235     *
236     * @return void
237     * @throws GedcomErrorException
238     */
239    public function importRecord(string $gedrec, Tree $tree, bool $update): void
240    {
241        $tree_id = $tree->id();
242
243        // Escaped @ signs (only if importing from file)
244        if (!$update) {
245            $gedrec = str_replace('@@', '@', $gedrec);
246        }
247
248        // Standardise gedcom format
249        $gedrec = $this->reformatRecord($gedrec, $tree);
250
251        // import different types of records
252        if (preg_match('/^0 @(' . Gedcom::REGEX_XREF . ')@ (' . Gedcom::REGEX_TAG . ')/', $gedrec, $match)) {
253            [, $xref, $type] = $match;
254        } elseif (str_starts_with($gedrec, '0 HEAD')) {
255            $type = 'HEAD';
256            $xref = 'HEAD'; // For records without an XREF, use the type as a pseudo XREF.
257        } elseif (str_starts_with($gedrec, '0 TRLR')) {
258            $tree->setPreference('imported', '1');
259            $type = 'TRLR';
260            $xref = 'TRLR'; // For records without an XREF, use the type as a pseudo XREF.
261        } elseif (str_starts_with($gedrec, '0 _PLAC_DEFN')) {
262            $this->importLegacyPlacDefn($gedrec);
263
264            return;
265        } elseif (str_starts_with($gedrec, '0 _PLAC ')) {
266            $this->importTNGPlac($gedrec);
267
268            return;
269        } else {
270            foreach (Gedcom::CUSTOM_RECORDS_WITHOUT_XREFS as $record_type) {
271                if (preg_match('/^0 ' . $record_type . '\b/', $gedrec) === 1) {
272                    return;
273                }
274            }
275
276            throw new GedcomErrorException($gedrec);
277        }
278
279        // Add a _UID
280        if ($tree->getPreference('GENERATE_UIDS') === '1' && !str_contains($gedrec, "\n1 _UID ")) {
281            $element = Registry::elementFactory()->make($type . ':_UID');
282            if (!$element instanceof UnknownElement) {
283                $gedrec .= "\n1 _UID " . $element->default($tree);
284            }
285        }
286
287        // If the user has downloaded their GEDCOM data (containing media objects) and edited it
288        // using an application which does not support (and deletes) media objects, then add them
289        // back in.
290        if ($tree->getPreference('keep_media')) {
291            $old_linked_media = DB::table('link')
292                ->where('l_from', '=', $xref)
293                ->where('l_file', '=', $tree_id)
294                ->where('l_type', '=', 'OBJE')
295                ->pluck('l_to');
296
297            // Delete these links - so that we do not insert them again in updateLinks()
298            DB::table('link')
299                ->where('l_from', '=', $xref)
300                ->where('l_file', '=', $tree_id)
301                ->where('l_type', '=', 'OBJE')
302                ->delete();
303
304            foreach ($old_linked_media as $media_id) {
305                $gedrec .= "\n1 OBJE @" . $media_id . '@';
306            }
307        }
308
309        // Convert inline media into media objects
310        $gedrec = $this->convertInlineMedia($tree, $gedrec);
311
312        switch ($type) {
313            case Individual::RECORD_TYPE:
314                $record = Registry::individualFactory()->new($xref, $gedrec, null, $tree);
315
316                if (preg_match('/\n1 RIN (.+)/', $gedrec, $match)) {
317                    $rin = $match[1];
318                } else {
319                    $rin = $xref;
320                }
321
322                // The database can only store MFU, and many of the stats queries assume this.
323                $sex = $record->sex();
324                $sex = $sex === 'M' || $sex === 'F' ? $sex : 'U';
325
326                DB::table('individuals')->insert([
327                    'i_id'     => $xref,
328                    'i_file'   => $tree_id,
329                    'i_rin'    => $rin,
330                    'i_sex'    => $sex,
331                    'i_gedcom' => $gedrec,
332                ]);
333
334                // Update the cross-reference/index tables.
335                $this->updatePlaces($xref, $tree, $gedrec);
336                $this->updateDates($xref, $tree_id, $gedrec);
337                $this->updateNames($xref, $tree_id, $record);
338                break;
339
340            case Family::RECORD_TYPE:
341                if (preg_match('/\n1 HUSB @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match)) {
342                    $husb = $match[1];
343                } else {
344                    $husb = '';
345                }
346                if (preg_match('/\n1 WIFE @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match)) {
347                    $wife = $match[1];
348                } else {
349                    $wife = '';
350                }
351                $nchi = preg_match_all('/\n1 CHIL @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match);
352                if (preg_match('/\n1 NCHI (\d+)/', $gedrec, $match)) {
353                    $nchi = max($nchi, $match[1]);
354                }
355
356                DB::table('families')->insert([
357                    'f_id'      => $xref,
358                    'f_file'    => $tree_id,
359                    'f_husb'    => $husb,
360                    'f_wife'    => $wife,
361                    'f_gedcom'  => $gedrec,
362                    'f_numchil' => $nchi,
363                ]);
364
365                // Update the cross-reference/index tables.
366                $this->updatePlaces($xref, $tree, $gedrec);
367                $this->updateDates($xref, $tree_id, $gedrec);
368                break;
369
370            case Source::RECORD_TYPE:
371                if (preg_match('/\n1 TITL (.+)/', $gedrec, $match)) {
372                    $name = $match[1];
373                } elseif (preg_match('/\n1 ABBR (.+)/', $gedrec, $match)) {
374                    $name = $match[1];
375                } else {
376                    $name = $xref;
377                }
378
379                DB::table('sources')->insert([
380                    's_id'     => $xref,
381                    's_file'   => $tree_id,
382                    's_name'   => mb_substr($name, 0, 255),
383                    's_gedcom' => $gedrec,
384                ]);
385                break;
386
387            case Repository::RECORD_TYPE:
388            case Note::RECORD_TYPE:
389            case Submission::RECORD_TYPE:
390            case Submitter::RECORD_TYPE:
391            case Location::RECORD_TYPE:
392                DB::table('other')->insert([
393                    'o_id'     => $xref,
394                    'o_file'   => $tree_id,
395                    'o_type'   => $type,
396                    'o_gedcom' => $gedrec,
397                ]);
398                break;
399
400            case Header::RECORD_TYPE:
401                // Force HEAD records to have a creation date.
402                if (!str_contains($gedrec, "\n1 DATE ")) {
403                    $today = strtoupper(date('d M Y'));
404                    $gedrec .= "\n1 DATE " . $today;
405                }
406
407                DB::table('other')->insert([
408                    'o_id'     => $xref,
409                    'o_file'   => $tree_id,
410                    'o_type'   => Header::RECORD_TYPE,
411                    'o_gedcom' => $gedrec,
412                ]);
413                break;
414
415
416            case Media::RECORD_TYPE:
417                $record = Registry::mediaFactory()->new($xref, $gedrec, null, $tree);
418
419                DB::table('media')->insert([
420                    'm_id'     => $xref,
421                    'm_file'   => $tree_id,
422                    'm_gedcom' => $gedrec,
423                ]);
424
425                foreach ($record->mediaFiles() as $media_file) {
426                    DB::table('media_file')->insert([
427                        'm_id'                 => $xref,
428                        'm_file'               => $tree_id,
429                        'multimedia_file_refn' => mb_substr($media_file->filename(), 0, 248),
430                        'multimedia_format'    => mb_substr($media_file->format(), 0, 4),
431                        'source_media_type'    => mb_substr($media_file->type(), 0, 15),
432                        'descriptive_title'    => mb_substr($media_file->title(), 0, 248),
433                    ]);
434                }
435                break;
436
437            default: // Custom record types.
438                DB::table('other')->insert([
439                    'o_id'     => $xref,
440                    'o_file'   => $tree_id,
441                    'o_type'   => mb_substr($type, 0, 15),
442                    'o_gedcom' => $gedrec,
443                ]);
444                break;
445        }
446
447        // Update the cross-reference/index tables.
448        $this->updateLinks($xref, $tree_id, $gedrec);
449    }
450
451    /**
452     * Legacy Family Tree software generates _PLAC_DEFN records containing LAT/LONG values
453     *
454     * @param string $gedcom
455     */
456    private function importLegacyPlacDefn(string $gedcom): void
457    {
458        $gedcom_service = new GedcomService();
459
460        if (preg_match('/\n1 PLAC (.+)/', $gedcom, $match)) {
461            $place_name = $match[1];
462        } else {
463            return;
464        }
465
466        if (preg_match('/\n3 LATI ([NS].+)/', $gedcom, $match)) {
467            $latitude = $gedcom_service->readLatitude($match[1]);
468        } else {
469            return;
470        }
471
472        if (preg_match('/\n3 LONG ([EW].+)/', $gedcom, $match)) {
473            $longitude = $gedcom_service->readLongitude($match[1]);
474        } else {
475            return;
476        }
477
478        $location = new PlaceLocation($place_name);
479
480        if ($location->latitude() === null && $location->longitude() === null) {
481            DB::table('place_location')
482                ->where('id', '=', $location->id())
483                ->update([
484                    'latitude'  => $latitude,
485                    'longitude' => $longitude,
486                ]);
487        }
488    }
489
490    /**
491     * Legacy Family Tree software generates _PLAC records containing LAT/LONG values
492     *
493     * @param string $gedcom
494     */
495    private function importTNGPlac(string $gedcom): void
496    {
497        if (preg_match('/^0 _PLAC (.+)/', $gedcom, $match)) {
498            $place_name = $match[1];
499        } else {
500            return;
501        }
502
503        if (preg_match('/\n2 LATI (.+)/', $gedcom, $match)) {
504            $latitude = (float) $match[1];
505        } else {
506            return;
507        }
508
509        if (preg_match('/\n2 LONG (.+)/', $gedcom, $match)) {
510            $longitude = (float) $match[1];
511        } else {
512            return;
513        }
514
515        $location = new PlaceLocation($place_name);
516
517        if ($location->latitude() === null && $location->longitude() === null) {
518            DB::table('place_location')
519                ->where('id', '=', $location->id())
520                ->update([
521                    'latitude'  => $latitude,
522                    'longitude' => $longitude,
523                ]);
524        }
525    }
526
527    /**
528     * Extract all level 2 places from the given record and insert them into the places table
529     *
530     * @param string $xref
531     * @param Tree   $tree
532     * @param string $gedrec
533     *
534     * @return void
535     */
536    public function updatePlaces(string $xref, Tree $tree, string $gedrec): void
537    {
538        // Insert all new rows together
539        $rows = [];
540
541        preg_match_all('/\n2 PLAC (.+)/', $gedrec, $matches);
542
543        $places = array_unique($matches[1]);
544
545        foreach ($places as $place_name) {
546            $place = new Place($place_name, $tree);
547
548            // Calling Place::id() will create the entry in the database, if it doesn't already exist.
549            while ($place->id() !== 0) {
550                $rows[] = [
551                    'pl_p_id' => $place->id(),
552                    'pl_gid'  => $xref,
553                    'pl_file' => $tree->id(),
554                ];
555
556                $place = $place->parent();
557            }
558        }
559
560        // array_unique doesn't work with arrays of arrays
561        $rows = array_intersect_key($rows, array_unique(array_map('serialize', $rows)));
562
563        // PDO has a limit of 65535 placeholders, and each row requires 3 placeholders.
564        foreach (array_chunk($rows, 20000) as $chunk) {
565            DB::table('placelinks')->insert($chunk);
566        }
567    }
568
569    /**
570     * Extract all the dates from the given record and insert them into the database.
571     *
572     * @param string $xref
573     * @param int    $ged_id
574     * @param string $gedrec
575     *
576     * @return void
577     */
578    private function updateDates(string $xref, int $ged_id, string $gedrec): void
579    {
580        // Insert all new rows together
581        $rows = [];
582
583        preg_match_all("/\n1 (\w+).*(?:\n[2-9].*)*\n2 DATE (.+)(?:\n[2-9].*)*/", $gedrec, $matches, PREG_SET_ORDER);
584
585        foreach ($matches as $match) {
586            $fact = $match[1];
587            $date = new Date($match[2]);
588            $rows[] = [
589                'd_day'        => $date->minimumDate()->day,
590                'd_month'      => $date->minimumDate()->format('%O'),
591                'd_mon'        => $date->minimumDate()->month,
592                'd_year'       => $date->minimumDate()->year,
593                'd_julianday1' => $date->minimumDate()->minimumJulianDay(),
594                'd_julianday2' => $date->minimumDate()->maximumJulianDay(),
595                'd_fact'       => $fact,
596                'd_gid'        => $xref,
597                'd_file'       => $ged_id,
598                'd_type'       => $date->minimumDate()->format('%@'),
599            ];
600
601            $rows[] = [
602                'd_day'        => $date->maximumDate()->day,
603                'd_month'      => $date->maximumDate()->format('%O'),
604                'd_mon'        => $date->maximumDate()->month,
605                'd_year'       => $date->maximumDate()->year,
606                'd_julianday1' => $date->maximumDate()->minimumJulianDay(),
607                'd_julianday2' => $date->maximumDate()->maximumJulianDay(),
608                'd_fact'       => $fact,
609                'd_gid'        => $xref,
610                'd_file'       => $ged_id,
611                'd_type'       => $date->minimumDate()->format('%@'),
612            ];
613        }
614
615        // array_unique doesn't work with arrays of arrays
616        $rows = array_intersect_key($rows, array_unique(array_map('serialize', $rows)));
617
618        DB::table('dates')->insert($rows);
619    }
620
621    /**
622     * Extract all the links from the given record and insert them into the database
623     *
624     * @param string $xref
625     * @param int    $ged_id
626     * @param string $gedrec
627     *
628     * @return void
629     */
630    private function updateLinks(string $xref, int $ged_id, string $gedrec): void
631    {
632        // Insert all new rows together
633        $rows = [];
634
635        preg_match_all('/\n\d+ (' . Gedcom::REGEX_TAG . ') @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $matches, PREG_SET_ORDER);
636
637        foreach ($matches as $match) {
638            // Some applications (e.g. GenoPro) create links longer than 15 characters.
639            $link = mb_substr($match[1], 0, 15);
640
641            // Take care of "duplicates" that differ on case/collation, e.g. "SOUR @S1@" and "SOUR @s1@"
642            $rows[$link . strtoupper($match[2])] = [
643                'l_from' => $xref,
644                'l_to'   => $match[2],
645                'l_type' => $link,
646                'l_file' => $ged_id,
647            ];
648        }
649
650        DB::table('link')->insert($rows);
651    }
652
653    /**
654     * Extract all the names from the given record and insert them into the database.
655     *
656     * @param string     $xref
657     * @param int        $ged_id
658     * @param Individual $record
659     *
660     * @return void
661     */
662    private function updateNames(string $xref, int $ged_id, Individual $record): void
663    {
664        // Insert all new rows together
665        $rows = [];
666
667        foreach ($record->getAllNames() as $n => $name) {
668            if ($name['givn'] === Individual::PRAENOMEN_NESCIO) {
669                $soundex_givn_std = null;
670                $soundex_givn_dm  = null;
671            } else {
672                $soundex_givn_std = Soundex::russell($name['givn']);
673                $soundex_givn_dm  = Soundex::daitchMokotoff($name['givn']);
674            }
675
676            if ($name['surn'] === Individual::NOMEN_NESCIO) {
677                $soundex_surn_std = null;
678                $soundex_surn_dm  = null;
679            } else {
680                $soundex_surn_std = Soundex::russell($name['surname']);
681                $soundex_surn_dm  = Soundex::daitchMokotoff($name['surname']);
682            }
683
684            $rows[] = [
685                'n_file'             => $ged_id,
686                'n_id'               => $xref,
687                'n_num'              => $n,
688                'n_type'             => $name['type'],
689                'n_sort'             => mb_substr($name['sort'], 0, 255),
690                'n_full'             => mb_substr($name['fullNN'], 0, 255),
691                'n_surname'          => mb_substr($name['surname'], 0, 255),
692                'n_surn'             => mb_substr($name['surn'], 0, 255),
693                'n_givn'             => mb_substr($name['givn'], 0, 255),
694                'n_soundex_givn_std' => $soundex_givn_std,
695                'n_soundex_surn_std' => $soundex_surn_std,
696                'n_soundex_givn_dm'  => $soundex_givn_dm,
697                'n_soundex_surn_dm'  => $soundex_surn_dm,
698            ];
699        }
700
701        DB::table('name')->insert($rows);
702    }
703
704    /**
705     * Extract inline media data, and convert to media objects.
706     *
707     * @param Tree   $tree
708     * @param string $gedcom
709     *
710     * @return string
711     */
712    private function convertInlineMedia(Tree $tree, string $gedcom): string
713    {
714        while (preg_match('/\n1 OBJE(?:\n[2-9].+)+/', $gedcom, $match)) {
715            $xref   = $this->createMediaObject($match[0], $tree);
716            $gedcom = strtr($gedcom, [$match[0] =>  "\n1 OBJE @" . $xref . '@']);
717        }
718        while (preg_match('/\n2 OBJE(?:\n[3-9].+)+/', $gedcom, $match)) {
719            $xref   = $this->createMediaObject($match[0], $tree);
720            $gedcom = strtr($gedcom, [$match[0] =>  "\n2 OBJE @" . $xref . '@']);
721        }
722        while (preg_match('/\n3 OBJE(?:\n[4-9].+)+/', $gedcom, $match)) {
723            $xref   = $this->createMediaObject($match[0], $tree);
724            $gedcom = strtr($gedcom, [$match[0] =>  "\n3 OBJE @" . $xref . '@']);
725        }
726
727        return $gedcom;
728    }
729
730    /**
731     * Create a new media object, from inline media data.
732     *
733     * GEDCOM 5.5.1 specifies: +1 FILE / +2 FORM / +3 MEDI / +1 TITL
734     * GEDCOM 5.5 specifies: +1 FILE / +1 FORM / +1 TITL
735     * GEDCOM 5.5.1 says that GEDCOM 5.5 specifies:  +1 FILE / +1 FORM / +2 MEDI
736     *
737     * Legacy generates: +1 FORM / +1 FILE / +1 TITL / +1 _SCBK / +1 _PRIM / +1 _TYPE / +1 NOTE
738     * RootsMagic generates: +1 FILE / +1 FORM / +1 TITL
739     *
740     * @param string $gedcom
741     * @param Tree   $tree
742     *
743     * @return string
744     */
745    private function createMediaObject(string $gedcom, Tree $tree): string
746    {
747        preg_match('/\n\d FILE (.+)/', $gedcom, $match);
748        $file = $match[1] ?? '';
749
750        preg_match('/\n\d TITL (.+)/', $gedcom, $match);
751        $title = $match[1] ?? '';
752
753        preg_match('/\n\d FORM (.+)/', $gedcom, $match);
754        $format = $match[1] ?? '';
755
756        preg_match('/\n\d MEDI (.+)/', $gedcom, $match);
757        $media = $match[1] ?? '';
758
759        preg_match('/\n\d _SCBK (.+)/', $gedcom, $match);
760        $scrapbook = $match[1] ?? '';
761
762        preg_match('/\n\d _PRIM (.+)/', $gedcom, $match);
763        $primary = $match[1] ?? '';
764
765        preg_match('/\n\d _TYPE (.+)/', $gedcom, $match);
766        if ($media === '') {
767            // Legacy uses _TYPE instead of MEDI
768            $media = $match[1] ?? '';
769            $type  = '';
770        } else {
771            $type = $match[1] ?? '';
772        }
773
774        preg_match_all('/\n\d NOTE (.+(?:\n\d CONT.*)*)/', $gedcom, $matches);
775        $notes = $matches[1] ?? [];
776
777        // Have we already created a media object with the same title/filename?
778        $xref = DB::table('media_file')
779            ->where('m_file', '=', $tree->id())
780            ->where('descriptive_title', '=', mb_substr($title, 0, 248))
781            ->where('multimedia_file_refn', '=', mb_substr($file, 0, 248))
782            ->value('m_id');
783
784        if ($xref === null) {
785            $xref = Registry::xrefFactory()->make(Media::RECORD_TYPE);
786
787            // convert to a media-object
788            $gedcom = '0 @' . $xref . "@ OBJE\n1 FILE " . $file;
789
790            if ($format !== '') {
791                $gedcom .= "\n2 FORM " . $format;
792
793                if ($media !== '') {
794                    $gedcom .= "\n3 TYPE " . $media;
795                }
796            }
797
798            if ($title !== '') {
799                $gedcom .= "\n2 TITL " . $title;
800            }
801
802            if ($scrapbook !== '') {
803                $gedcom .= "\n1 _SCBK " . $scrapbook;
804            }
805
806            if ($primary !== '') {
807                $gedcom .= "\n1 _PRIM " . $primary;
808            }
809
810            if ($type !== '') {
811                $gedcom .= "\n1 _TYPE " . $type;
812            }
813
814            foreach ($notes as $note) {
815                $gedcom .= "\n1 NOTE " . strtr($note, ["\n3" => "\n2", "\n4" => "\n2", "\n5" => "\n2"]);
816            }
817
818            DB::table('media')->insert([
819                'm_id'     => $xref,
820                'm_file'   => $tree->id(),
821                'm_gedcom' => $gedcom,
822            ]);
823
824            DB::table('media_file')->insert([
825                'm_id'                 => $xref,
826                'm_file'               => $tree->id(),
827                'multimedia_file_refn' => mb_substr($file, 0, 248),
828                'multimedia_format'    => mb_substr($format, 0, 4),
829                'source_media_type'    => mb_substr($media, 0, 15),
830                'descriptive_title'    => mb_substr($title, 0, 248),
831            ]);
832        }
833
834        return $xref;
835    }
836
837    /**
838     * update a record in the database
839     *
840     * @param string $gedrec
841     * @param Tree   $tree
842     * @param bool   $delete
843     *
844     * @return void
845     * @throws GedcomErrorException
846     */
847    public function updateRecord(string $gedrec, Tree $tree, bool $delete): void
848    {
849        if (preg_match('/^0 @(' . Gedcom::REGEX_XREF . ')@ (' . Gedcom::REGEX_TAG . ')/', $gedrec, $match)) {
850            [, $gid, $type] = $match;
851        } elseif (preg_match('/^0 (HEAD)(?:\n|$)/', $gedrec, $match)) {
852            // The HEAD record has no XREF.  Any others?
853            $gid  = $match[1];
854            $type = $match[1];
855        } else {
856            throw new GedcomErrorException($gedrec);
857        }
858
859        // Place links
860        DB::table('placelinks')
861            ->where('pl_gid', '=', $gid)
862            ->where('pl_file', '=', $tree->id())
863            ->delete();
864
865        // Orphaned places.  If we're deleting  "Westminster, London, England",
866        // then we may also need to delete "London, England" and "England".
867        do {
868            $affected = DB::table('places')
869                ->leftJoin('placelinks', function (JoinClause $join): void {
870                    $join
871                        ->on('p_id', '=', 'pl_p_id')
872                        ->on('p_file', '=', 'pl_file');
873                })
874                ->whereNull('pl_p_id')
875                ->delete();
876        } while ($affected > 0);
877
878        DB::table('dates')
879            ->where('d_gid', '=', $gid)
880            ->where('d_file', '=', $tree->id())
881            ->delete();
882
883        DB::table('name')
884            ->where('n_id', '=', $gid)
885            ->where('n_file', '=', $tree->id())
886            ->delete();
887
888        DB::table('link')
889            ->where('l_from', '=', $gid)
890            ->where('l_file', '=', $tree->id())
891            ->delete();
892
893        switch ($type) {
894            case Individual::RECORD_TYPE:
895                DB::table('individuals')
896                    ->where('i_id', '=', $gid)
897                    ->where('i_file', '=', $tree->id())
898                    ->delete();
899                break;
900
901            case Family::RECORD_TYPE:
902                DB::table('families')
903                    ->where('f_id', '=', $gid)
904                    ->where('f_file', '=', $tree->id())
905                    ->delete();
906                break;
907
908            case Source::RECORD_TYPE:
909                DB::table('sources')
910                    ->where('s_id', '=', $gid)
911                    ->where('s_file', '=', $tree->id())
912                    ->delete();
913                break;
914
915            case Media::RECORD_TYPE:
916                DB::table('media_file')
917                    ->where('m_id', '=', $gid)
918                    ->where('m_file', '=', $tree->id())
919                    ->delete();
920
921                DB::table('media')
922                    ->where('m_id', '=', $gid)
923                    ->where('m_file', '=', $tree->id())
924                    ->delete();
925                break;
926
927            default:
928                DB::table('other')
929                    ->where('o_id', '=', $gid)
930                    ->where('o_file', '=', $tree->id())
931                    ->delete();
932                break;
933        }
934
935        if (!$delete) {
936            $this->importRecord($gedrec, $tree, true);
937        }
938    }
939}
940