xref: /webtrees/app/Services/GedcomImportService.php (revision d68b8af46973dcf347dbb45279dbfe0c9c299082)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2022 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Services;
21
22use Fisharebest\Webtrees\Date;
23use Fisharebest\Webtrees\Elements\UnknownElement;
24use Fisharebest\Webtrees\Exceptions\GedcomErrorException;
25use Fisharebest\Webtrees\Family;
26use Fisharebest\Webtrees\Gedcom;
27use Fisharebest\Webtrees\Header;
28use Fisharebest\Webtrees\Individual;
29use Fisharebest\Webtrees\Location;
30use Fisharebest\Webtrees\Media;
31use Fisharebest\Webtrees\Note;
32use Fisharebest\Webtrees\Place;
33use Fisharebest\Webtrees\PlaceLocation;
34use Fisharebest\Webtrees\Registry;
35use Fisharebest\Webtrees\Repository;
36use Fisharebest\Webtrees\Soundex;
37use Fisharebest\Webtrees\Source;
38use Fisharebest\Webtrees\Submission;
39use Fisharebest\Webtrees\Submitter;
40use Fisharebest\Webtrees\Tree;
41use Illuminate\Database\Capsule\Manager as DB;
42use Illuminate\Database\Query\JoinClause;
43
44use function app;
45use function array_chunk;
46use function array_intersect_key;
47use function array_map;
48use function array_unique;
49use function assert;
50use function date;
51use function explode;
52use function max;
53use function mb_substr;
54use function preg_match;
55use function preg_match_all;
56use function preg_replace;
57use function round;
58use function str_contains;
59use function str_replace;
60use function str_starts_with;
61use function strlen;
62use function strtolower;
63use function strtoupper;
64use function strtr;
65use function substr;
66use function trim;
67
68use const PREG_SET_ORDER;
69
70/**
71 * Class GedcomImportService - import GEDCOM data
72 */
73class GedcomImportService
74{
75    /**
76     * Tidy up a gedcom record on import, so that we can access it consistently/efficiently.
77     *
78     * @param string $rec
79     * @param Tree   $tree
80     *
81     * @return string
82     */
83    private function reformatRecord(string $rec, Tree $tree): string
84    {
85        $gedcom_service = app(GedcomService::class);
86        assert($gedcom_service instanceof GedcomService);
87
88        // Strip out mac/msdos line endings
89        $rec = preg_replace("/[\r\n]+/", "\n", $rec);
90
91        // Extract lines from the record; lines consist of: level + optional xref + tag + optional data
92        $num_matches = preg_match_all('/^[ \t]*(\d+)[ \t]*(@[^@]*@)?[ \t]*(\w+)[ \t]?(.*)$/m', $rec, $matches, PREG_SET_ORDER);
93
94        // Process the record line-by-line
95        $newrec = '';
96        foreach ($matches as $n => $match) {
97            [, $level, $xref, $tag, $data] = $match;
98
99            $tag = $gedcom_service->canonicalTag($tag);
100
101            switch ($tag) {
102                case 'AFN':
103                    // AFN values are upper case
104                    $data = strtoupper($data);
105                    break;
106                case 'DATE':
107                    // Preserve text from INT dates
108                    if (str_contains($data, '(')) {
109                        [$date, $text] = explode('(', $data, 2);
110                        $text = ' (' . $text;
111                    } else {
112                        $date = $data;
113                        $text = '';
114                    }
115                    // Capitals
116                    $date = strtoupper($date);
117                    // Temporarily add leading/trailing spaces, to allow efficient matching below
118                    $date = ' ' . $date . ' ';
119                    // Ensure space digits and letters
120                    $date = preg_replace('/([A-Z])(\d)/', '$1 $2', $date);
121                    $date = preg_replace('/(\d)([A-Z])/', '$1 $2', $date);
122                    // Ensure space before/after calendar escapes
123                    $date = preg_replace('/@#[^@]+@/', ' $0 ', $date);
124                    // "BET." => "BET"
125                    $date = preg_replace('/(\w\w)\./', '$1', $date);
126                    // "CIR" => "ABT"
127                    $date = str_replace(' CIR ', ' ABT ', $date);
128                    $date = str_replace(' APX ', ' ABT ', $date);
129                    // B.C. => BC (temporarily, to allow easier handling of ".")
130                    $date = str_replace(' B.C. ', ' BC ', $date);
131                    // TMG uses "EITHER X OR Y"
132                    $date = preg_replace('/^ EITHER (.+) OR (.+)/', ' BET $1 AND $2', $date);
133                    // "BET X - Y " => "BET X AND Y"
134                    $date = preg_replace('/^(.* BET .+) - (.+)/', '$1 AND $2', $date);
135                    $date = preg_replace('/^(.* FROM .+) - (.+)/', '$1 TO $2', $date);
136                    // "@#ESC@ FROM X TO Y" => "FROM @#ESC@ X TO @#ESC@ Y"
137                    $date = preg_replace('/^ +(@#[^@]+@) +FROM +(.+) +TO +(.+)/', ' FROM $1 $2 TO $1 $3', $date);
138                    $date = preg_replace('/^ +(@#[^@]+@) +BET +(.+) +AND +(.+)/', ' BET $1 $2 AND $1 $3', $date);
139                    // "@#ESC@ AFT X" => "AFT @#ESC@ X"
140                    $date = preg_replace('/^ +(@#[^@]+@) +(FROM|BET|TO|AND|BEF|AFT|CAL|EST|INT|ABT) +(.+)/', ' $2 $1 $3', $date);
141                    // Ignore any remaining punctuation, e.g. "14-MAY, 1900" => "14 MAY 1900"
142                    // (don't change "/" - it is used in NS/OS dates)
143                    $date = preg_replace('/[.,:;-]/', ' ', $date);
144                    // BC => B.C.
145                    $date = str_replace(' BC ', ' B.C. ', $date);
146                    // Append the "INT" text
147                    $data = $date . $text;
148                    break;
149                case '_FILE':
150                    $tag = 'FILE';
151                    break;
152                case 'FORM':
153                    // Consistent commas
154                    $data = preg_replace('/ *, */', ', ', $data);
155                    break;
156                case 'HEAD':
157                    // HEAD records don't have an XREF or DATA
158                    if ($level === '0') {
159                        $xref = '';
160                        $data = '';
161                    }
162                    break;
163                case 'NAME':
164                    // Tidy up non-printing characters
165                    $data = preg_replace('/  +/', ' ', trim($data));
166                    break;
167                case 'PEDI':
168                    // PEDI values are lower case
169                    $data = strtolower($data);
170                    break;
171                case 'PLAC':
172                    // Consistent commas
173                    $data = preg_replace('/ *[,,،] */u', ', ', $data);
174                    // The Master Genealogist stores LAT/LONG data in the PLAC field, e.g. Pennsylvania, USA, 395945N0751013W
175                    if (preg_match('/(.*), (\d\d)(\d\d)(\d\d)([NS])(\d\d\d)(\d\d)(\d\d)([EW])$/', $data, $match)) {
176                        $data =
177                            $match[1] . "\n" .
178                            ($level + 1) . " MAP\n" .
179                            ($level + 2) . ' LATI ' . ($match[5] . round($match[2] + ($match[3] / 60) + ($match[4] / 3600), 4)) . "\n" .
180                            ($level + 2) . ' LONG ' . ($match[9] . round($match[6] + ($match[7] / 60) + ($match[8] / 3600), 4));
181                    }
182                    break;
183                case 'RESN':
184                    // RESN values are lower case (confidential, privacy, locked, none)
185                    $data = strtolower($data);
186                    if ($data === 'invisible') {
187                        $data = 'confidential'; // From old versions of Legacy.
188                    }
189                    break;
190                case 'SEX':
191                    $data = strtoupper($data);
192                    break;
193                case 'STAT':
194                    if ($data === 'CANCELLED') {
195                        // PhpGedView mis-spells this tag - correct it.
196                        $data = 'CANCELED';
197                    }
198                    break;
199                case 'TEMP':
200                    // Temple codes are upper case
201                    $data = strtoupper($data);
202                    break;
203                case 'TRLR':
204                    // TRLR records don't have an XREF or DATA
205                    if ($level === '0') {
206                        $xref = '';
207                        $data = '';
208                    }
209                    break;
210            }
211            // Suppress "Y", for facts/events with a DATE or PLAC
212            if ($data === 'y') {
213                $data = 'Y';
214            }
215            if ($level === '1' && $data === 'Y') {
216                for ($i = $n + 1; $i < $num_matches - 1 && $matches[$i][1] !== '1'; ++$i) {
217                    if ($matches[$i][3] === 'DATE' || $matches[$i][3] === 'PLAC') {
218                        $data = '';
219                        break;
220                    }
221                }
222            }
223            // Reassemble components back into a single line
224            switch ($tag) {
225                default:
226                    // Remove tabs and multiple/leading/trailing spaces
227                    $data = strtr($data, ["\t" => ' ']);
228                    $data = trim($data, ' ');
229                    while (str_contains($data, '  ')) {
230                        $data = strtr($data, ['  ' => ' ']);
231                    }
232                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
233                    break;
234                case 'NOTE':
235                case 'TEXT':
236                case 'DATA':
237                case 'CONT':
238                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
239                    break;
240                case 'FILE':
241                    // Strip off the user-defined path prefix
242                    $GEDCOM_MEDIA_PATH = $tree->getPreference('GEDCOM_MEDIA_PATH');
243                    if ($GEDCOM_MEDIA_PATH !== '' && str_starts_with($data, $GEDCOM_MEDIA_PATH)) {
244                        $data = substr($data, strlen($GEDCOM_MEDIA_PATH));
245                    }
246                    // convert backslashes in filenames to forward slashes
247                    $data = preg_replace("/\\\\/", '/', $data);
248
249                    $newrec .= ($newrec ? "\n" : '') . $level . ' ' . ($level === '0' && $xref ? $xref . ' ' : '') . $tag . ($data === '' && $tag !== 'NOTE' ? '' : ' ' . $data);
250                    break;
251                case 'CONC':
252                    // Merge CONC lines, to simplify access later on.
253                    $newrec .= ($tree->getPreference('WORD_WRAPPED_NOTES') ? ' ' : '') . $data;
254                    break;
255            }
256        }
257
258        return $newrec;
259    }
260
261    /**
262     * import record into database
263     * this function will parse the given gedcom record and add it to the database
264     *
265     * @param string $gedrec the raw gedcom record to parse
266     * @param Tree   $tree   import the record into this tree
267     * @param bool   $update whether this is an updated record that has been accepted
268     *
269     * @return void
270     * @throws GedcomErrorException
271     */
272    public function importRecord(string $gedrec, Tree $tree, bool $update): void
273    {
274        $tree_id = $tree->id();
275
276        // Escaped @ signs (only if importing from file)
277        if (!$update) {
278            $gedrec = str_replace('@@', '@', $gedrec);
279        }
280
281        // Standardise gedcom format
282        $gedrec = $this->reformatRecord($gedrec, $tree);
283
284        // import different types of records
285        if (preg_match('/^0 @(' . Gedcom::REGEX_XREF . ')@ (' . Gedcom::REGEX_TAG . ')/', $gedrec, $match)) {
286            [, $xref, $type] = $match;
287        } elseif (str_starts_with($gedrec, '0 HEAD')) {
288            $type = 'HEAD';
289            $xref = 'HEAD'; // For records without an XREF, use the type as a pseudo XREF.
290        } elseif (str_starts_with($gedrec, '0 TRLR')) {
291            $tree->setPreference('imported', '1');
292            $type = 'TRLR';
293            $xref = 'TRLR'; // For records without an XREF, use the type as a pseudo XREF.
294        } elseif (str_starts_with($gedrec, '0 _PLAC_DEFN')) {
295            $this->importLegacyPlacDefn($gedrec);
296
297            return;
298        } elseif (str_starts_with($gedrec, '0 _PLAC ')) {
299            $this->importTNGPlac($gedrec);
300
301            return;
302        } elseif (str_starts_with($gedrec, '0 _EVDEF')) {
303            // Created by RootsMagic.  We cannot process these records without an XREF.
304            return;
305        } elseif (str_starts_with($gedrec, '0 _HASHTAG_DEFN')) {
306            // Created by Legacy.  We cannot process these records without an XREF.
307            return;
308        } elseif (str_starts_with($gedrec, '0 _EVENT_DEFN')) {
309            // Created by PAF and Legacy.  We cannot process these records without an XREF.
310            return;
311        } elseif (str_starts_with($gedrec, '0 _TODO')) {
312            // Created by Legacy.  We cannot process these records without an XREF.
313            return;
314        } elseif (str_starts_with($gedrec, '0 PEDIGREELINK')) {
315            // Created by GenoPro.  We cannot process these records without an XREF.
316            return;
317        } elseif (str_starts_with($gedrec, '0 GLOBAL')) {
318            // Created by GenoPro.  We cannot process these records without an XREF.
319            return;
320        } elseif (str_starts_with($gedrec, '0 GENOMAP')) {
321            // Created by GenoPro.  We cannot process these records without an XREF.
322            return;
323        } elseif (str_starts_with($gedrec, '0 EMOTIONALRELATIONSHIP')) {
324            // Created by GenoPro.  We cannot process these records without an XREF.
325            return;
326        } elseif (str_starts_with($gedrec, '0 SOCIALRELATIONSHIP')) {
327            // Created by GenoPro.  We cannot process these records without an XREF.
328            return;
329        } elseif (str_starts_with($gedrec, '0 LABEL')) {
330            // Created by GenoPro.  We cannot process these records without an XREF.
331            return;
332        } else {
333            throw new GedcomErrorException($gedrec);
334        }
335
336        // Add a _UID
337        if ($tree->getPreference('GENERATE_UIDS') === '1' && !str_contains($gedrec, "\n1 _UID ")) {
338            $element = Registry::elementFactory()->make($type . ':_UID');
339            if (!$element instanceof UnknownElement) {
340                $gedrec .= "\n1 _UID " . $element->default($tree);
341            }
342        }
343
344        // If the user has downloaded their GEDCOM data (containing media objects) and edited it
345        // using an application which does not support (and deletes) media objects, then add them
346        // back in.
347        if ($tree->getPreference('keep_media')) {
348            $old_linked_media = DB::table('link')
349                ->where('l_from', '=', $xref)
350                ->where('l_file', '=', $tree_id)
351                ->where('l_type', '=', 'OBJE')
352                ->pluck('l_to');
353
354            // Delete these links - so that we do not insert them again in updateLinks()
355            DB::table('link')
356                ->where('l_from', '=', $xref)
357                ->where('l_file', '=', $tree_id)
358                ->where('l_type', '=', 'OBJE')
359                ->delete();
360
361            foreach ($old_linked_media as $media_id) {
362                $gedrec .= "\n1 OBJE @" . $media_id . '@';
363            }
364        }
365
366        // Convert inline media into media objects
367        $gedrec = $this->convertInlineMedia($tree, $gedrec);
368
369        switch ($type) {
370            case Individual::RECORD_TYPE:
371                $record = Registry::individualFactory()->new($xref, $gedrec, null, $tree);
372
373                if (preg_match('/\n1 RIN (.+)/', $gedrec, $match)) {
374                    $rin = $match[1];
375                } else {
376                    $rin = $xref;
377                }
378
379                // The database can only store MFU, and many of the stats queries assume this.
380                $sex = $record->sex();
381                $sex = $sex === 'M' || $sex === 'F' ? $sex : 'U';
382
383                DB::table('individuals')->insert([
384                    'i_id'     => $xref,
385                    'i_file'   => $tree_id,
386                    'i_rin'    => $rin,
387                    'i_sex'    => $sex,
388                    'i_gedcom' => $gedrec,
389                ]);
390
391                // Update the cross-reference/index tables.
392                $this->updatePlaces($xref, $tree, $gedrec);
393                $this->updateDates($xref, $tree_id, $gedrec);
394                $this->updateNames($xref, $tree_id, $record);
395                break;
396
397            case Family::RECORD_TYPE:
398                if (preg_match('/\n1 HUSB @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match)) {
399                    $husb = $match[1];
400                } else {
401                    $husb = '';
402                }
403                if (preg_match('/\n1 WIFE @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match)) {
404                    $wife = $match[1];
405                } else {
406                    $wife = '';
407                }
408                $nchi = preg_match_all('/\n1 CHIL @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $match);
409                if (preg_match('/\n1 NCHI (\d+)/', $gedrec, $match)) {
410                    $nchi = max($nchi, $match[1]);
411                }
412
413                DB::table('families')->insert([
414                    'f_id'      => $xref,
415                    'f_file'    => $tree_id,
416                    'f_husb'    => $husb,
417                    'f_wife'    => $wife,
418                    'f_gedcom'  => $gedrec,
419                    'f_numchil' => $nchi,
420                ]);
421
422                // Update the cross-reference/index tables.
423                $this->updatePlaces($xref, $tree, $gedrec);
424                $this->updateDates($xref, $tree_id, $gedrec);
425                break;
426
427            case Source::RECORD_TYPE:
428                if (preg_match('/\n1 TITL (.+)/', $gedrec, $match)) {
429                    $name = $match[1];
430                } elseif (preg_match('/\n1 ABBR (.+)/', $gedrec, $match)) {
431                    $name = $match[1];
432                } else {
433                    $name = $xref;
434                }
435
436                DB::table('sources')->insert([
437                    's_id'     => $xref,
438                    's_file'   => $tree_id,
439                    's_name'   => mb_substr($name, 0, 255),
440                    's_gedcom' => $gedrec,
441                ]);
442                break;
443
444            case Repository::RECORD_TYPE:
445            case Note::RECORD_TYPE:
446            case Submission::RECORD_TYPE:
447            case Submitter::RECORD_TYPE:
448            case Location::RECORD_TYPE:
449                DB::table('other')->insert([
450                    'o_id'     => $xref,
451                    'o_file'   => $tree_id,
452                    'o_type'   => $type,
453                    'o_gedcom' => $gedrec,
454                ]);
455                break;
456
457            case Header::RECORD_TYPE:
458                // Force HEAD records to have a creation date.
459                if (!str_contains($gedrec, "\n1 DATE ")) {
460                    $today = strtoupper(date('d M Y'));
461                    $gedrec .= "\n1 DATE " . $today;
462                }
463
464                DB::table('other')->insert([
465                    'o_id'     => $xref,
466                    'o_file'   => $tree_id,
467                    'o_type'   => Header::RECORD_TYPE,
468                    'o_gedcom' => $gedrec,
469                ]);
470                break;
471
472
473            case Media::RECORD_TYPE:
474                $record = Registry::mediaFactory()->new($xref, $gedrec, null, $tree);
475
476                DB::table('media')->insert([
477                    'm_id'     => $xref,
478                    'm_file'   => $tree_id,
479                    'm_gedcom' => $gedrec,
480                ]);
481
482                foreach ($record->mediaFiles() as $media_file) {
483                    DB::table('media_file')->insert([
484                        'm_id'                 => $xref,
485                        'm_file'               => $tree_id,
486                        'multimedia_file_refn' => mb_substr($media_file->filename(), 0, 248),
487                        'multimedia_format'    => mb_substr($media_file->format(), 0, 4),
488                        'source_media_type'    => mb_substr($media_file->type(), 0, 15),
489                        'descriptive_title'    => mb_substr($media_file->title(), 0, 248),
490                    ]);
491                }
492                break;
493
494            default: // Custom record types.
495                DB::table('other')->insert([
496                    'o_id'     => $xref,
497                    'o_file'   => $tree_id,
498                    'o_type'   => mb_substr($type, 0, 15),
499                    'o_gedcom' => $gedrec,
500                ]);
501                break;
502        }
503
504        // Update the cross-reference/index tables.
505        $this->updateLinks($xref, $tree_id, $gedrec);
506    }
507
508    /**
509     * Legacy Family Tree software generates _PLAC_DEFN records containing LAT/LONG values
510     *
511     * @param string $gedcom
512     */
513    private function importLegacyPlacDefn(string $gedcom): void
514    {
515        $gedcom_service = new GedcomService();
516
517        if (preg_match('/\n1 PLAC (.+)/', $gedcom, $match)) {
518            $place_name = $match[1];
519        } else {
520            return;
521        }
522
523        if (preg_match('/\n3 LATI ([NS].+)/', $gedcom, $match)) {
524            $latitude = $gedcom_service->readLatitude($match[1]);
525        } else {
526            return;
527        }
528
529        if (preg_match('/\n3 LONG ([EW].+)/', $gedcom, $match)) {
530            $longitude = $gedcom_service->readLongitude($match[1]);
531        } else {
532            return;
533        }
534
535        $location = new PlaceLocation($place_name);
536
537        if ($location->latitude() === null && $location->longitude() === null) {
538            DB::table('place_location')
539                ->where('id', '=', $location->id())
540                ->update([
541                    'latitude'  => $latitude,
542                    'longitude' => $longitude,
543                ]);
544        }
545    }
546
547    /**
548     * Legacy Family Tree software generates _PLAC records containing LAT/LONG values
549     *
550     * @param string $gedcom
551     */
552    private function importTNGPlac(string $gedcom): void
553    {
554        if (preg_match('/^0 _PLAC (.+)/', $gedcom, $match)) {
555            $place_name = $match[1];
556        } else {
557            return;
558        }
559
560        if (preg_match('/\n2 LATI (.+)/', $gedcom, $match)) {
561            $latitude = (float) $match[1];
562        } else {
563            return;
564        }
565
566        if (preg_match('/\n2 LONG (.+)/', $gedcom, $match)) {
567            $longitude = (float) $match[1];
568        } else {
569            return;
570        }
571
572        $location = new PlaceLocation($place_name);
573
574        if ($location->latitude() === null && $location->longitude() === null) {
575            DB::table('place_location')
576                ->where('id', '=', $location->id())
577                ->update([
578                    'latitude'  => $latitude,
579                    'longitude' => $longitude,
580                ]);
581        }
582    }
583
584    /**
585     * Extract all level 2 places from the given record and insert them into the places table
586     *
587     * @param string $xref
588     * @param Tree   $tree
589     * @param string $gedrec
590     *
591     * @return void
592     */
593    public function updatePlaces(string $xref, Tree $tree, string $gedrec): void
594    {
595        // Insert all new rows together
596        $rows = [];
597
598        preg_match_all('/\n2 PLAC (.+)/', $gedrec, $matches);
599
600        $places = array_unique($matches[1]);
601
602        foreach ($places as $place_name) {
603            $place = new Place($place_name, $tree);
604
605            // Calling Place::id() will create the entry in the database, if it doesn't already exist.
606            while ($place->id() !== 0) {
607                $rows[] = [
608                    'pl_p_id' => $place->id(),
609                    'pl_gid'  => $xref,
610                    'pl_file' => $tree->id(),
611                ];
612
613                $place = $place->parent();
614            }
615        }
616
617        // array_unique doesn't work with arrays of arrays
618        $rows = array_intersect_key($rows, array_unique(array_map('serialize', $rows)));
619
620        // PDO has a limit of 65535 placeholders, and each row requires 3 placeholders.
621        foreach (array_chunk($rows, 20000) as $chunk) {
622            DB::table('placelinks')->insert($chunk);
623        }
624    }
625
626    /**
627     * Extract all the dates from the given record and insert them into the database.
628     *
629     * @param string $xref
630     * @param int    $ged_id
631     * @param string $gedrec
632     *
633     * @return void
634     */
635    private function updateDates(string $xref, int $ged_id, string $gedrec): void
636    {
637        // Insert all new rows together
638        $rows = [];
639
640        preg_match_all("/\n1 (\w+).*(?:\n[2-9].*)*\n2 DATE (.+)(?:\n[2-9].*)*/", $gedrec, $matches, PREG_SET_ORDER);
641
642        foreach ($matches as $match) {
643            $fact = $match[1];
644            $date = new Date($match[2]);
645            $rows[] = [
646                'd_day'        => $date->minimumDate()->day,
647                'd_month'      => $date->minimumDate()->format('%O'),
648                'd_mon'        => $date->minimumDate()->month,
649                'd_year'       => $date->minimumDate()->year,
650                'd_julianday1' => $date->minimumDate()->minimumJulianDay(),
651                'd_julianday2' => $date->minimumDate()->maximumJulianDay(),
652                'd_fact'       => $fact,
653                'd_gid'        => $xref,
654                'd_file'       => $ged_id,
655                'd_type'       => $date->minimumDate()->format('%@'),
656            ];
657
658            $rows[] = [
659                'd_day'        => $date->maximumDate()->day,
660                'd_month'      => $date->maximumDate()->format('%O'),
661                'd_mon'        => $date->maximumDate()->month,
662                'd_year'       => $date->maximumDate()->year,
663                'd_julianday1' => $date->maximumDate()->minimumJulianDay(),
664                'd_julianday2' => $date->maximumDate()->maximumJulianDay(),
665                'd_fact'       => $fact,
666                'd_gid'        => $xref,
667                'd_file'       => $ged_id,
668                'd_type'       => $date->minimumDate()->format('%@'),
669            ];
670        }
671
672        // array_unique doesn't work with arrays of arrays
673        $rows = array_intersect_key($rows, array_unique(array_map('serialize', $rows)));
674
675        DB::table('dates')->insert($rows);
676    }
677
678    /**
679     * Extract all the links from the given record and insert them into the database
680     *
681     * @param string $xref
682     * @param int    $ged_id
683     * @param string $gedrec
684     *
685     * @return void
686     */
687    private function updateLinks(string $xref, int $ged_id, string $gedrec): void
688    {
689        // Insert all new rows together
690        $rows = [];
691
692        preg_match_all('/\n\d+ (' . Gedcom::REGEX_TAG . ') @(' . Gedcom::REGEX_XREF . ')@/', $gedrec, $matches, PREG_SET_ORDER);
693
694        foreach ($matches as $match) {
695            // Some applications (e.g. GenoPro) create links longer than 15 characters.
696            $link = mb_substr($match[1], 0, 15);
697
698            // Take care of "duplicates" that differ on case/collation, e.g. "SOUR @S1@" and "SOUR @s1@"
699            $rows[$link . strtoupper($match[2])] = [
700                'l_from' => $xref,
701                'l_to'   => $match[2],
702                'l_type' => $link,
703                'l_file' => $ged_id,
704            ];
705        }
706
707        DB::table('link')->insert($rows);
708    }
709
710    /**
711     * Extract all the names from the given record and insert them into the database.
712     *
713     * @param string     $xref
714     * @param int        $ged_id
715     * @param Individual $record
716     *
717     * @return void
718     */
719    private function updateNames(string $xref, int $ged_id, Individual $record): void
720    {
721        // Insert all new rows together
722        $rows = [];
723
724        foreach ($record->getAllNames() as $n => $name) {
725            if ($name['givn'] === Individual::PRAENOMEN_NESCIO) {
726                $soundex_givn_std = null;
727                $soundex_givn_dm  = null;
728            } else {
729                $soundex_givn_std = Soundex::russell($name['givn']);
730                $soundex_givn_dm  = Soundex::daitchMokotoff($name['givn']);
731            }
732
733            if ($name['surn'] === Individual::NOMEN_NESCIO) {
734                $soundex_surn_std = null;
735                $soundex_surn_dm  = null;
736            } else {
737                $soundex_surn_std = Soundex::russell($name['surname']);
738                $soundex_surn_dm  = Soundex::daitchMokotoff($name['surname']);
739            }
740
741            $rows[] = [
742                'n_file'             => $ged_id,
743                'n_id'               => $xref,
744                'n_num'              => $n,
745                'n_type'             => $name['type'],
746                'n_sort'             => mb_substr($name['sort'], 0, 255),
747                'n_full'             => mb_substr($name['fullNN'], 0, 255),
748                'n_surname'          => mb_substr($name['surname'], 0, 255),
749                'n_surn'             => mb_substr($name['surn'], 0, 255),
750                'n_givn'             => mb_substr($name['givn'], 0, 255),
751                'n_soundex_givn_std' => $soundex_givn_std,
752                'n_soundex_surn_std' => $soundex_surn_std,
753                'n_soundex_givn_dm'  => $soundex_givn_dm,
754                'n_soundex_surn_dm'  => $soundex_surn_dm,
755            ];
756        }
757
758        DB::table('name')->insert($rows);
759    }
760
761    /**
762     * Extract inline media data, and convert to media objects.
763     *
764     * @param Tree   $tree
765     * @param string $gedcom
766     *
767     * @return string
768     */
769    private function convertInlineMedia(Tree $tree, string $gedcom): string
770    {
771        while (preg_match('/\n1 OBJE(?:\n[2-9].+)+/', $gedcom, $match)) {
772            $xref   = $this->createMediaObject($match[0], $tree);
773            $gedcom = strtr($gedcom, [$match[0] =>  "\n1 OBJE @" . $xref . '@']);
774        }
775        while (preg_match('/\n2 OBJE(?:\n[3-9].+)+/', $gedcom, $match)) {
776            $xref   = $this->createMediaObject($match[0], $tree);
777            $gedcom = strtr($gedcom, [$match[0] =>  "\n2 OBJE @" . $xref . '@']);
778        }
779        while (preg_match('/\n3 OBJE(?:\n[4-9].+)+/', $gedcom, $match)) {
780            $xref   = $this->createMediaObject($match[0], $tree);
781            $gedcom = strtr($gedcom, [$match[0] =>  "\n3 OBJE @" . $xref . '@']);
782        }
783
784        return $gedcom;
785    }
786
787    /**
788     * Create a new media object, from inline media data.
789     *
790     * GEDCOM 5.5.1 specifies: +1 FILE / +2 FORM / +3 MEDI / +1 TITL
791     * GEDCOM 5.5 specifies: +1 FILE / +1 FORM / +1 TITL
792     * GEDCOM 5.5.1 says that GEDCOM 5.5 specifies:  +1 FILE / +1 FORM / +2 MEDI
793     *
794     * Legacy generates: +1 FORM / +1 FILE / +1 TITL / +1 _SCBK / +1 _PRIM / +1 _TYPE / +1 NOTE
795     * RootsMagic generates: +1 FILE / +1 FORM / +1 TITL
796     *
797     * @param string $gedcom
798     * @param Tree   $tree
799     *
800     * @return string
801     */
802    private function createMediaObject(string $gedcom, Tree $tree): string
803    {
804        preg_match('/\n\d FILE (.+)/', $gedcom, $match);
805        $file = $match[1] ?? '';
806
807        preg_match('/\n\d TITL (.+)/', $gedcom, $match);
808        $title = $match[1] ?? '';
809
810        preg_match('/\n\d FORM (.+)/', $gedcom, $match);
811        $format = $match[1] ?? '';
812
813        preg_match('/\n\d MEDI (.+)/', $gedcom, $match);
814        $media = $match[1] ?? '';
815
816        preg_match('/\n\d _SCBK (.+)/', $gedcom, $match);
817        $scrapbook = $match[1] ?? '';
818
819        preg_match('/\n\d _PRIM (.+)/', $gedcom, $match);
820        $primary = $match[1] ?? '';
821
822        preg_match('/\n\d _TYPE (.+)/', $gedcom, $match);
823        if ($media === '') {
824            // Legacy uses _TYPE instead of MEDI
825            $media = $match[1] ?? '';
826            $type  = '';
827        } else {
828            $type = $match[1] ?? '';
829        }
830
831        preg_match_all('/\n\d NOTE (.+(?:\n\d CONT.*)*)/', $gedcom, $matches);
832        $notes = $matches[1] ?? [];
833
834        // Have we already created a media object with the same title/filename?
835        $xref = DB::table('media_file')
836            ->where('m_file', '=', $tree->id())
837            ->where('descriptive_title', '=', mb_substr($title, 0, 248))
838            ->where('multimedia_file_refn', '=', mb_substr($file, 0, 248))
839            ->value('m_id');
840
841        if ($xref === null) {
842            $xref = Registry::xrefFactory()->make(Media::RECORD_TYPE);
843
844            // convert to a media-object
845            $gedcom = '0 @' . $xref . "@ OBJE\n1 FILE " . $file;
846
847            if ($format !== '') {
848                $gedcom .= "\n2 FORM " . $format;
849
850                if ($media !== '') {
851                    $gedcom .= "\n3 TYPE " . $media;
852                }
853            }
854
855            if ($title !== '') {
856                $gedcom .= "\n3 TITL " . $title;
857            }
858
859            if ($scrapbook !== '') {
860                $gedcom .= "\n1 _SCBK " . $scrapbook;
861            }
862
863            if ($primary !== '') {
864                $gedcom .= "\n1 _PRIM " . $primary;
865            }
866
867            if ($type !== '') {
868                $gedcom .= "\n1 _TYPE " . $type;
869            }
870
871            foreach ($notes as $note) {
872                $gedcom .= "\n1 NOTE " . strtr($note, ["\n3" => "\n2", "\n4" => "\n2", "\n5" => "\n2"]);
873            }
874
875            DB::table('media')->insert([
876                'm_id'     => $xref,
877                'm_file'   => $tree->id(),
878                'm_gedcom' => $gedcom,
879            ]);
880
881            DB::table('media_file')->insert([
882                'm_id'                 => $xref,
883                'm_file'               => $tree->id(),
884                'multimedia_file_refn' => mb_substr($file, 0, 248),
885                'multimedia_format'    => mb_substr($format, 0, 4),
886                'source_media_type'    => mb_substr($media, 0, 15),
887                'descriptive_title'    => mb_substr($title, 0, 248),
888            ]);
889        }
890
891        return $xref;
892    }
893
894    /**
895     * update a record in the database
896     *
897     * @param string $gedrec
898     * @param Tree   $tree
899     * @param bool   $delete
900     *
901     * @return void
902     * @throws GedcomErrorException
903     */
904    public function updateRecord(string $gedrec, Tree $tree, bool $delete): void
905    {
906        if (preg_match('/^0 @(' . Gedcom::REGEX_XREF . ')@ (' . Gedcom::REGEX_TAG . ')/', $gedrec, $match)) {
907            [, $gid, $type] = $match;
908        } elseif (preg_match('/^0 (HEAD)(?:\n|$)/', $gedrec, $match)) {
909            // The HEAD record has no XREF.  Any others?
910            $gid  = $match[1];
911            $type = $match[1];
912        } else {
913            throw new GedcomErrorException($gedrec);
914        }
915
916        // Place links
917        DB::table('placelinks')
918            ->where('pl_gid', '=', $gid)
919            ->where('pl_file', '=', $tree->id())
920            ->delete();
921
922        // Orphaned places.  If we're deleting  "Westminster, London, England",
923        // then we may also need to delete "London, England" and "England".
924        do {
925            $affected = DB::table('places')
926                ->leftJoin('placelinks', function (JoinClause $join): void {
927                    $join
928                        ->on('p_id', '=', 'pl_p_id')
929                        ->on('p_file', '=', 'pl_file');
930                })
931                ->whereNull('pl_p_id')
932                ->delete();
933        } while ($affected > 0);
934
935        DB::table('dates')
936            ->where('d_gid', '=', $gid)
937            ->where('d_file', '=', $tree->id())
938            ->delete();
939
940        DB::table('name')
941            ->where('n_id', '=', $gid)
942            ->where('n_file', '=', $tree->id())
943            ->delete();
944
945        DB::table('link')
946            ->where('l_from', '=', $gid)
947            ->where('l_file', '=', $tree->id())
948            ->delete();
949
950        switch ($type) {
951            case Individual::RECORD_TYPE:
952                DB::table('individuals')
953                    ->where('i_id', '=', $gid)
954                    ->where('i_file', '=', $tree->id())
955                    ->delete();
956                break;
957
958            case Family::RECORD_TYPE:
959                DB::table('families')
960                    ->where('f_id', '=', $gid)
961                    ->where('f_file', '=', $tree->id())
962                    ->delete();
963                break;
964
965            case Source::RECORD_TYPE:
966                DB::table('sources')
967                    ->where('s_id', '=', $gid)
968                    ->where('s_file', '=', $tree->id())
969                    ->delete();
970                break;
971
972            case Media::RECORD_TYPE:
973                DB::table('media_file')
974                    ->where('m_id', '=', $gid)
975                    ->where('m_file', '=', $tree->id())
976                    ->delete();
977
978                DB::table('media')
979                    ->where('m_id', '=', $gid)
980                    ->where('m_file', '=', $tree->id())
981                    ->delete();
982                break;
983
984            default:
985                DB::table('other')
986                    ->where('o_id', '=', $gid)
987                    ->where('o_file', '=', $tree->id())
988                    ->delete();
989                break;
990        }
991
992        if (!$delete) {
993            $this->importRecord($gedrec, $tree, true);
994        }
995    }
996}
997