1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
21
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25 #endif
26
27 #define _GNU_SOURCE
28
29 /* We need this for `regex.h', and perhaps for the Emacs include files. */
30 #include <sys/types.h>
31
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35
36 /* The `emacs' switch turns on certain matching commands
37 that make sense only in Emacs. */
38 #ifdef emacs
39
40 #include "lisp.h"
41 #include "buffer.h"
42 #include "syntax.h"
43
44 /* Emacs uses `NULL' as a predicate. */
45 #undef NULL
46
47 #else /* not emacs */
48
49 /* We used to test for `BSTRING' here, but only GCC and Emacs define
50 `BSTRING', as far as I know, and neither of them use this code. */
51 #if HAVE_STRING_H || STDC_HEADERS
52 #include <string.h>
53 #ifndef bcmp
54 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
55 #endif
56 #ifndef bcopy
57 #define bcopy(s, d, n) memcpy ((d), (s), (n))
58 #endif
59 #ifndef bzero
60 #define bzero(s, n) memset ((s), 0, (n))
61 #endif
62 #else
63 #include <strings.h>
64 #endif
65
66 #include <stdlib.h>
67
68 /* Define the syntax stuff for \<, \>, etc. */
69
70 /* This must be nonzero for the wordchar and notwordchar pattern
71 commands in re_match_2. */
72 #ifndef Sword
73 #define Sword 1
74 #endif
75
76 #ifdef SYNTAX_TABLE
77
78 extern char *re_syntax_table;
79
80 #else /* not SYNTAX_TABLE */
81
82 /* How many characters in the character set. */
83 #define CHAR_SET_SIZE 256
84
85 static char re_syntax_table[CHAR_SET_SIZE];
86
87 static void
init_syntax_once()88 init_syntax_once ()
89 {
90 register int c;
91 static int done = 0;
92
93 if (done)
94 return;
95
96 bzero (re_syntax_table, sizeof re_syntax_table);
97
98 for (c = 'a'; c <= 'z'; c++)
99 re_syntax_table[c] = Sword;
100
101 for (c = 'A'; c <= 'Z'; c++)
102 re_syntax_table[c] = Sword;
103
104 for (c = '0'; c <= '9'; c++)
105 re_syntax_table[c] = Sword;
106
107 re_syntax_table['_'] = Sword;
108
109 done = 1;
110 }
111
112 #endif /* not SYNTAX_TABLE */
113
114 #define SYNTAX(c) re_syntax_table[c]
115
116 #endif /* not emacs */
117
118 /* Get the interface, including the syntax bits. */
119 #include "regex.h"
120
121 /* isalpha etc. are used for the character classes. */
122 #include <ctype.h>
123
124 #ifndef isascii
125 #define isascii(c) 1
126 #endif
127
128 #ifdef isblank
129 #define ISBLANK(c) (isascii (c) && isblank (c))
130 #else
131 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
132 #endif
133 #ifdef isgraph
134 #define ISGRAPH(c) (isascii (c) && isgraph (c))
135 #else
136 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
137 #endif
138
139 #define ISPRINT(c) (isascii (c) && isprint (c))
140 #define ISDIGIT(c) (isascii (c) && isdigit (c))
141 #define ISALNUM(c) (isascii (c) && isalnum (c))
142 #define ISALPHA(c) (isascii (c) && isalpha (c))
143 #define ISCNTRL(c) (isascii (c) && iscntrl (c))
144 #define ISLOWER(c) (isascii (c) && islower (c))
145 #define ISPUNCT(c) (isascii (c) && ispunct (c))
146 #define ISSPACE(c) (isascii (c) && isspace (c))
147 #define ISUPPER(c) (isascii (c) && isupper (c))
148 #define ISXDIGIT(c) (isascii (c) && isxdigit (c))
149
150 #ifndef NULL
151 #define NULL 0
152 #endif
153
154 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
155 since ours (we hope) works properly with all combinations of
156 machines, compilers, `char' and `unsigned char' argument types.
157 (Per Bothner suggested the basic approach.) */
158 #undef SIGN_EXTEND_CHAR
159 #if __STDC__
160 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
161 #else /* not __STDC__ */
162 /* As in Harbison and Steele. */
163 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
164 #endif
165
166 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
167 use `alloca' instead of `malloc'. This is because using malloc in
168 re_search* or re_match* could cause memory leaks when C-g is used in
169 Emacs; also, malloc is slower and causes storage fragmentation. On
170 the other hand, malloc is more portable, and easier to debug.
171
172 Because we sometimes use alloca, some routines have to be macros,
173 not functions -- `alloca'-allocated space disappears at the end of the
174 function it is called in. */
175
176 #ifdef REGEX_MALLOC
177
178 #define REGEX_ALLOCATE malloc
179 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
180
181 #else /* not REGEX_MALLOC */
182
183 /* Emacs already defines alloca, sometimes. */
184 #ifndef alloca
185
186 /* Make alloca work the best possible way. */
187 #ifdef __GNUC__
188 #define alloca __builtin_alloca
189 #else /* not __GNUC__ */
190 #if HAVE_ALLOCA_H
191 #include <alloca.h>
192 #else /* not __GNUC__ or HAVE_ALLOCA_H */
193 #ifndef _AIX /* Already did AIX, up at the top. */
194 char *alloca ();
195 #endif /* not _AIX */
196 #endif /* not HAVE_ALLOCA_H */
197 #endif /* not __GNUC__ */
198
199 #endif /* not alloca */
200
201 #define REGEX_ALLOCATE alloca
202
203 /* Assumes a `char *destination' variable. */
204 #define REGEX_REALLOCATE(source, osize, nsize) \
205 (destination = (char *) alloca (nsize), \
206 bcopy (source, destination, osize), \
207 destination)
208
209 #endif /* not REGEX_MALLOC */
210
211
212 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
213 `string1' or just past its end. This works if PTR is NULL, which is
214 a good thing. */
215 #define FIRST_STRING_P(ptr) \
216 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
217
218 /* (Re)Allocate N items of type T using malloc, or fail. */
219 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
220 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
221 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
222
223 #define BYTEWIDTH 8 /* In bits. */
224
225 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
226
227 #define MAX(a, b) ((a) > (b) ? (a) : (b))
228 #define MIN(a, b) ((a) < (b) ? (a) : (b))
229
230 typedef char boolean;
231 #define false 0
232 #define true 1
233
234 /* These are the command codes that appear in compiled regular
235 expressions. Some opcodes are followed by argument bytes. A
236 command code can specify any interpretation whatsoever for its
237 arguments. Zero bytes may appear in the compiled regular expression.
238
239 The value of `exactn' is needed in search.c (search_buffer) in Emacs.
240 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
241 `exactn' we use here must also be 1. */
242
243 typedef enum
244 {
245 no_op = 0,
246
247 /* Followed by one byte giving n, then by n literal bytes. */
248 exactn = 1,
249
250 /* Matches any (more or less) character. */
251 anychar,
252
253 /* Matches any one char belonging to specified set. First
254 following byte is number of bitmap bytes. Then come bytes
255 for a bitmap saying which chars are in. Bits in each byte
256 are ordered low-bit-first. A character is in the set if its
257 bit is 1. A character too large to have a bit in the map is
258 automatically not in the set. */
259 charset,
260
261 /* Same parameters as charset, but match any character that is
262 not one of those specified. */
263 charset_not,
264
265 /* Start remembering the text that is matched, for storing in a
266 register. Followed by one byte with the register number, in
267 the range 0 to one less than the pattern buffer's re_nsub
268 field. Then followed by one byte with the number of groups
269 inner to this one. (This last has to be part of the
270 start_memory only because we need it in the on_failure_jump
271 of re_match_2.) */
272 start_memory,
273
274 /* Stop remembering the text that is matched and store it in a
275 memory register. Followed by one byte with the register
276 number, in the range 0 to one less than `re_nsub' in the
277 pattern buffer, and one byte with the number of inner groups,
278 just like `start_memory'. (We need the number of inner
279 groups here because we don't have any easy way of finding the
280 corresponding start_memory when we're at a stop_memory.) */
281 stop_memory,
282
283 /* Match a duplicate of something remembered. Followed by one
284 byte containing the register number. */
285 duplicate,
286
287 /* Fail unless at beginning of line. */
288 begline,
289
290 /* Fail unless at end of line. */
291 endline,
292
293 /* Succeeds if at beginning of buffer (if emacs) or at beginning
294 of string to be matched (if not). */
295 begbuf,
296
297 /* Analogously, for end of buffer/string. */
298 endbuf,
299
300 /* Followed by two byte relative address to which to jump. */
301 jump,
302
303 /* Same as jump, but marks the end of an alternative. */
304 jump_past_alt,
305
306 /* Followed by two-byte relative address of place to resume at
307 in case of failure. */
308 on_failure_jump,
309
310 /* Like on_failure_jump, but pushes a placeholder instead of the
311 current string position when executed. */
312 on_failure_keep_string_jump,
313
314 /* Throw away latest failure point and then jump to following
315 two-byte relative address. */
316 pop_failure_jump,
317
318 /* Change to pop_failure_jump if know won't have to backtrack to
319 match; otherwise change to jump. This is used to jump
320 back to the beginning of a repeat. If what follows this jump
321 clearly won't match what the repeat does, such that we can be
322 sure that there is no use backtracking out of repetitions
323 already matched, then we change it to a pop_failure_jump.
324 Followed by two-byte address. */
325 maybe_pop_jump,
326
327 /* Jump to following two-byte address, and push a dummy failure
328 point. This failure point will be thrown away if an attempt
329 is made to use it for a failure. A `+' construct makes this
330 before the first repeat. Also used as an intermediary kind
331 of jump when compiling an alternative. */
332 dummy_failure_jump,
333
334 /* Push a dummy failure point and continue. Used at the end of
335 alternatives. */
336 push_dummy_failure,
337
338 /* Followed by two-byte relative address and two-byte number n.
339 After matching N times, jump to the address upon failure. */
340 succeed_n,
341
342 /* Followed by two-byte relative address, and two-byte number n.
343 Jump to the address N times, then fail. */
344 jump_n,
345
346 /* Set the following two-byte relative address to the
347 subsequent two-byte number. The address *includes* the two
348 bytes of number. */
349 set_number_at,
350
351 wordchar, /* Matches any word-constituent character. */
352 notwordchar, /* Matches any char that is not a word-constituent. */
353
354 wordbeg, /* Succeeds if at word beginning. */
355 wordend, /* Succeeds if at word end. */
356
357 wordbound, /* Succeeds if at a word boundary. */
358 notwordbound /* Succeeds if not at a word boundary. */
359
360 #ifdef emacs
361 ,before_dot, /* Succeeds if before point. */
362 at_dot, /* Succeeds if at point. */
363 after_dot, /* Succeeds if after point. */
364
365 /* Matches any character whose syntax is specified. Followed by
366 a byte which contains a syntax code, e.g., Sword. */
367 syntaxspec,
368
369 /* Matches any character whose syntax is not that specified. */
370 notsyntaxspec
371 #endif /* emacs */
372 } re_opcode_t;
373
374 /* Common operations on the compiled pattern. */
375
376 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
377
378 #define STORE_NUMBER(destination, number) \
379 do { \
380 (destination)[0] = (number) & 0377; \
381 (destination)[1] = (number) >> 8; \
382 } while (0)
383
384 /* Same as STORE_NUMBER, except increment DESTINATION to
385 the byte after where the number is stored. Therefore, DESTINATION
386 must be an lvalue. */
387
388 #define STORE_NUMBER_AND_INCR(destination, number) \
389 do { \
390 STORE_NUMBER (destination, number); \
391 (destination) += 2; \
392 } while (0)
393
394 /* Put into DESTINATION a number stored in two contiguous bytes starting
395 at SOURCE. */
396
397 #define EXTRACT_NUMBER(destination, source) \
398 do { \
399 (destination) = *(source) & 0377; \
400 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
401 } while (0)
402
403 #ifdef DEBUG
404 static void
extract_number(dest,source)405 extract_number (dest, source)
406 int *dest;
407 unsigned char *source;
408 {
409 int temp = SIGN_EXTEND_CHAR (*(source + 1));
410 *dest = *source & 0377;
411 *dest += temp << 8;
412 }
413
414 #ifndef EXTRACT_MACROS /* To debug the macros. */
415 #undef EXTRACT_NUMBER
416 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
417 #endif /* not EXTRACT_MACROS */
418
419 #endif /* DEBUG */
420
421 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
422 SOURCE must be an lvalue. */
423
424 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
425 do { \
426 EXTRACT_NUMBER (destination, source); \
427 (source) += 2; \
428 } while (0)
429
430 #ifdef DEBUG
431 static void
extract_number_and_incr(destination,source)432 extract_number_and_incr (destination, source)
433 int *destination;
434 unsigned char **source;
435 {
436 extract_number (destination, *source);
437 *source += 2;
438 }
439
440 #ifndef EXTRACT_MACROS
441 #undef EXTRACT_NUMBER_AND_INCR
442 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
443 extract_number_and_incr (&dest, &src)
444 #endif /* not EXTRACT_MACROS */
445
446 #endif /* DEBUG */
447
448 /* If DEBUG is defined, Regex prints many voluminous messages about what
449 it is doing (if the variable `debug' is nonzero). If linked with the
450 main program in `iregex.c', you can enter patterns and strings
451 interactively. And if linked with the main program in `main.c' and
452 the other test files, you can run the already-written tests. */
453
454 #ifdef DEBUG
455
456 /* We use standard I/O for debugging. */
457 #include <stdio.h>
458
459 /* It is useful to test things that ``must'' be true when debugging. */
460 #include <assert.h>
461
462 static int debug = 0;
463
464 #define DEBUG_STATEMENT(e) e
465 #define DEBUG_PRINT1(x) if (debug) printf (x)
466 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
467 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
468 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
469 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
470 if (debug) print_partial_compiled_pattern (s, e)
471 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
472 if (debug) print_double_string (w, s1, sz1, s2, sz2)
473
474
475 extern void printchar ();
476
477 /* Print the fastmap in human-readable form. */
478
479 void
print_fastmap(fastmap)480 print_fastmap (fastmap)
481 char *fastmap;
482 {
483 unsigned was_a_range = 0;
484 unsigned i = 0;
485
486 while (i < (1 << BYTEWIDTH))
487 {
488 if (fastmap[i++])
489 {
490 was_a_range = 0;
491 printchar (i - 1);
492 while (i < (1 << BYTEWIDTH) && fastmap[i])
493 {
494 was_a_range = 1;
495 i++;
496 }
497 if (was_a_range)
498 {
499 printf ("-");
500 printchar (i - 1);
501 }
502 }
503 }
504 putchar ('\n');
505 }
506
507
508 /* Print a compiled pattern string in human-readable form, starting at
509 the START pointer into it and ending just before the pointer END. */
510
511 void
print_partial_compiled_pattern(start,end)512 print_partial_compiled_pattern (start, end)
513 unsigned char *start;
514 unsigned char *end;
515 {
516 int mcnt, mcnt2;
517 unsigned char *p = start;
518 unsigned char *pend = end;
519
520 if (start == NULL)
521 {
522 printf ("(null)\n");
523 return;
524 }
525
526 /* Loop over pattern commands. */
527 while (p < pend)
528 {
529 switch ((re_opcode_t) *p++)
530 {
531 case no_op:
532 printf ("/no_op");
533 break;
534
535 case exactn:
536 mcnt = *p++;
537 printf ("/exactn/%d", mcnt);
538 do
539 {
540 putchar ('/');
541 printchar (*p++);
542 }
543 while (--mcnt);
544 break;
545
546 case start_memory:
547 mcnt = *p++;
548 printf ("/start_memory/%d/%d", mcnt, *p++);
549 break;
550
551 case stop_memory:
552 mcnt = *p++;
553 printf ("/stop_memory/%d/%d", mcnt, *p++);
554 break;
555
556 case duplicate:
557 printf ("/duplicate/%d", *p++);
558 break;
559
560 case anychar:
561 printf ("/anychar");
562 break;
563
564 case charset:
565 case charset_not:
566 {
567 register int c;
568
569 printf ("/charset%s",
570 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
571
572 assert (p + *p < pend);
573
574 for (c = 0; c < *p; c++)
575 {
576 unsigned bit;
577 unsigned char map_byte = p[1 + c];
578
579 putchar ('/');
580
581 for (bit = 0; bit < BYTEWIDTH; bit++)
582 if (map_byte & (1 << bit))
583 printchar (c * BYTEWIDTH + bit);
584 }
585 p += 1 + *p;
586 break;
587 }
588
589 case begline:
590 printf ("/begline");
591 break;
592
593 case endline:
594 printf ("/endline");
595 break;
596
597 case on_failure_jump:
598 extract_number_and_incr (&mcnt, &p);
599 printf ("/on_failure_jump/0/%d", mcnt);
600 break;
601
602 case on_failure_keep_string_jump:
603 extract_number_and_incr (&mcnt, &p);
604 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
605 break;
606
607 case dummy_failure_jump:
608 extract_number_and_incr (&mcnt, &p);
609 printf ("/dummy_failure_jump/0/%d", mcnt);
610 break;
611
612 case push_dummy_failure:
613 printf ("/push_dummy_failure");
614 break;
615
616 case maybe_pop_jump:
617 extract_number_and_incr (&mcnt, &p);
618 printf ("/maybe_pop_jump/0/%d", mcnt);
619 break;
620
621 case pop_failure_jump:
622 extract_number_and_incr (&mcnt, &p);
623 printf ("/pop_failure_jump/0/%d", mcnt);
624 break;
625
626 case jump_past_alt:
627 extract_number_and_incr (&mcnt, &p);
628 printf ("/jump_past_alt/0/%d", mcnt);
629 break;
630
631 case jump:
632 extract_number_and_incr (&mcnt, &p);
633 printf ("/jump/0/%d", mcnt);
634 break;
635
636 case succeed_n:
637 extract_number_and_incr (&mcnt, &p);
638 extract_number_and_incr (&mcnt2, &p);
639 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
640 break;
641
642 case jump_n:
643 extract_number_and_incr (&mcnt, &p);
644 extract_number_and_incr (&mcnt2, &p);
645 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
646 break;
647
648 case set_number_at:
649 extract_number_and_incr (&mcnt, &p);
650 extract_number_and_incr (&mcnt2, &p);
651 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
652 break;
653
654 case wordbound:
655 printf ("/wordbound");
656 break;
657
658 case notwordbound:
659 printf ("/notwordbound");
660 break;
661
662 case wordbeg:
663 printf ("/wordbeg");
664 break;
665
666 case wordend:
667 printf ("/wordend");
668
669 #ifdef emacs
670 case before_dot:
671 printf ("/before_dot");
672 break;
673
674 case at_dot:
675 printf ("/at_dot");
676 break;
677
678 case after_dot:
679 printf ("/after_dot");
680 break;
681
682 case syntaxspec:
683 printf ("/syntaxspec");
684 mcnt = *p++;
685 printf ("/%d", mcnt);
686 break;
687
688 case notsyntaxspec:
689 printf ("/notsyntaxspec");
690 mcnt = *p++;
691 printf ("/%d", mcnt);
692 break;
693 #endif /* emacs */
694
695 case wordchar:
696 printf ("/wordchar");
697 break;
698
699 case notwordchar:
700 printf ("/notwordchar");
701 break;
702
703 case begbuf:
704 printf ("/begbuf");
705 break;
706
707 case endbuf:
708 printf ("/endbuf");
709 break;
710
711 default:
712 printf ("?%d", *(p-1));
713 }
714 }
715 printf ("/\n");
716 }
717
718
719 void
print_compiled_pattern(bufp)720 print_compiled_pattern (bufp)
721 struct re_pattern_buffer *bufp;
722 {
723 unsigned char *buffer = bufp->buffer;
724
725 print_partial_compiled_pattern (buffer, buffer + bufp->used);
726 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
727
728 if (bufp->fastmap_accurate && bufp->fastmap)
729 {
730 printf ("fastmap: ");
731 print_fastmap (bufp->fastmap);
732 }
733
734 printf ("re_nsub: %d\t", bufp->re_nsub);
735 printf ("regs_alloc: %d\t", bufp->regs_allocated);
736 printf ("can_be_null: %d\t", bufp->can_be_null);
737 printf ("newline_anchor: %d\n", bufp->newline_anchor);
738 printf ("no_sub: %d\t", bufp->no_sub);
739 printf ("not_bol: %d\t", bufp->not_bol);
740 printf ("not_eol: %d\t", bufp->not_eol);
741 printf ("syntax: %d\n", bufp->syntax);
742 /* Perhaps we should print the translate table? */
743 }
744
745
746 void
print_double_string(where,string1,size1,string2,size2)747 print_double_string (where, string1, size1, string2, size2)
748 const char *where;
749 const char *string1;
750 const char *string2;
751 int size1;
752 int size2;
753 {
754 unsigned this_char;
755
756 if (where == NULL)
757 printf ("(null)");
758 else
759 {
760 if (FIRST_STRING_P (where))
761 {
762 for (this_char = where - string1; this_char < size1; this_char++)
763 printchar (string1[this_char]);
764
765 where = string2;
766 }
767
768 for (this_char = where - string2; this_char < size2; this_char++)
769 printchar (string2[this_char]);
770 }
771 }
772
773 #else /* not DEBUG */
774
775 #undef assert
776 #define assert(e)
777
778 #define DEBUG_STATEMENT(e)
779 #define DEBUG_PRINT1(x)
780 #define DEBUG_PRINT2(x1, x2)
781 #define DEBUG_PRINT3(x1, x2, x3)
782 #define DEBUG_PRINT4(x1, x2, x3, x4)
783 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
784 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
785
786 #endif /* not DEBUG */
787
788 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
789 also be assigned to arbitrarily: each pattern buffer stores its own
790 syntax, so it can be changed between regex compilations. */
791 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
792
793
794 /* Specify the precise syntax of regexps for compilation. This provides
795 for compatibility for various utilities which historically have
796 different, incompatible syntaxes.
797
798 The argument SYNTAX is a bit mask comprised of the various bits
799 defined in regex.h. We return the old syntax. */
800
801 reg_syntax_t
re_set_syntax(syntax)802 re_set_syntax (syntax)
803 reg_syntax_t syntax;
804 {
805 reg_syntax_t ret = re_syntax_options;
806
807 re_syntax_options = syntax;
808 return ret;
809 }
810
811 /* This table gives an error message for each of the error codes listed
812 in regex.h. Obviously the order here has to be same as there. */
813
814 static const char *re_error_msg[] =
815 { NULL, /* REG_NOERROR */
816 "No match", /* REG_NOMATCH */
817 "Invalid regular expression", /* REG_BADPAT */
818 "Invalid collation character", /* REG_ECOLLATE */
819 "Invalid character class name", /* REG_ECTYPE */
820 "Trailing backslash", /* REG_EESCAPE */
821 "Invalid back reference", /* REG_ESUBREG */
822 "Unmatched [ or [^", /* REG_EBRACK */
823 "Unmatched ( or \\(", /* REG_EPAREN */
824 "Unmatched \\{", /* REG_EBRACE */
825 "Invalid content of \\{\\}", /* REG_BADBR */
826 "Invalid range end", /* REG_ERANGE */
827 "Memory exhausted", /* REG_ESPACE */
828 "Invalid preceding regular expression", /* REG_BADRPT */
829 "Premature end of regular expression", /* REG_EEND */
830 "Regular expression too big", /* REG_ESIZE */
831 "Unmatched ) or \\)", /* REG_ERPAREN */
832 };
833
834 /* Subroutine declarations and macros for regex_compile. */
835
836 static void store_op1 (), store_op2 ();
837 static void insert_op1 (), insert_op2 ();
838 static boolean at_begline_loc_p (), at_endline_loc_p ();
839 static boolean group_in_compile_stack ();
840 static reg_errcode_t compile_range ();
841
842 /* Fetch the next character in the uncompiled pattern---translating it
843 if necessary. Also cast from a signed character in the constant
844 string passed to us by the user to an unsigned char that we can use
845 as an array index (in, e.g., `translate'). */
846 #define PATFETCH(c) \
847 do {if (p == pend) return REG_EEND; \
848 c = (unsigned char) *p++; \
849 if (translate) c = translate[c]; \
850 } while (0)
851
852 /* Fetch the next character in the uncompiled pattern, with no
853 translation. */
854 #define PATFETCH_RAW(c) \
855 do {if (p == pend) return REG_EEND; \
856 c = (unsigned char) *p++; \
857 } while (0)
858
859 /* Go backwards one character in the pattern. */
860 #define PATUNFETCH p--
861
862
863 /* If `translate' is non-null, return translate[D], else just D. We
864 cast the subscript to translate because some data is declared as
865 `char *', to avoid warnings when a string constant is passed. But
866 when we use a character as a subscript we must make it unsigned. */
867 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
868
869
870 /* Macros for outputting the compiled pattern into `buffer'. */
871
872 /* If the buffer isn't allocated when it comes in, use this. */
873 #define INIT_BUF_SIZE 32
874
875 /* Make sure we have at least N more bytes of space in buffer. */
876 #define GET_BUFFER_SPACE(n) \
877 while (b - bufp->buffer + (n) > bufp->allocated) \
878 EXTEND_BUFFER ()
879
880 /* Make sure we have one more byte of buffer space and then add C to it. */
881 #define BUF_PUSH(c) \
882 do { \
883 GET_BUFFER_SPACE (1); \
884 *b++ = (unsigned char) (c); \
885 } while (0)
886
887
888 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
889 #define BUF_PUSH_2(c1, c2) \
890 do { \
891 GET_BUFFER_SPACE (2); \
892 *b++ = (unsigned char) (c1); \
893 *b++ = (unsigned char) (c2); \
894 } while (0)
895
896
897 /* As with BUF_PUSH_2, except for three bytes. */
898 #define BUF_PUSH_3(c1, c2, c3) \
899 do { \
900 GET_BUFFER_SPACE (3); \
901 *b++ = (unsigned char) (c1); \
902 *b++ = (unsigned char) (c2); \
903 *b++ = (unsigned char) (c3); \
904 } while (0)
905
906
907 /* Store a jump with opcode OP at LOC to location TO. We store a
908 relative address offset by the three bytes the jump itself occupies. */
909 #define STORE_JUMP(op, loc, to) \
910 store_op1 (op, loc, (to) - (loc) - 3)
911
912 /* Likewise, for a two-argument jump. */
913 #define STORE_JUMP2(op, loc, to, arg) \
914 store_op2 (op, loc, (to) - (loc) - 3, arg)
915
916 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
917 #define INSERT_JUMP(op, loc, to) \
918 insert_op1 (op, loc, (to) - (loc) - 3, b)
919
920 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
921 #define INSERT_JUMP2(op, loc, to, arg) \
922 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
923
924
925 /* This is not an arbitrary limit: the arguments which represent offsets
926 into the pattern are two bytes long. So if 2^16 bytes turns out to
927 be too small, many things would have to change. */
928 #define MAX_BUF_SIZE (1L << 16)
929
930
931 /* Extend the buffer by twice its current size via realloc and
932 reset the pointers that pointed into the old block to point to the
933 correct places in the new one. If extending the buffer results in it
934 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
935 #define EXTEND_BUFFER() \
936 do { \
937 unsigned char *old_buffer = bufp->buffer; \
938 if (bufp->allocated == MAX_BUF_SIZE) \
939 return REG_ESIZE; \
940 bufp->allocated <<= 1; \
941 if (bufp->allocated > MAX_BUF_SIZE) \
942 bufp->allocated = MAX_BUF_SIZE; \
943 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
944 if (bufp->buffer == NULL) \
945 return REG_ESPACE; \
946 /* If the buffer moved, move all the pointers into it. */ \
947 if (old_buffer != bufp->buffer) \
948 { \
949 b = (b - old_buffer) + bufp->buffer; \
950 begalt = (begalt - old_buffer) + bufp->buffer; \
951 if (fixup_alt_jump) \
952 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
953 if (laststart) \
954 laststart = (laststart - old_buffer) + bufp->buffer; \
955 if (pending_exact) \
956 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
957 } \
958 } while (0)
959
960
961 /* Since we have one byte reserved for the register number argument to
962 {start,stop}_memory, the maximum number of groups we can report
963 things about is what fits in that byte. */
964 #define MAX_REGNUM 255
965
966 /* But patterns can have more than `MAX_REGNUM' registers. We just
967 ignore the excess. */
968 typedef unsigned regnum_t;
969
970
971 /* Macros for the compile stack. */
972
973 /* Since offsets can go either forwards or backwards, this type needs to
974 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
975 typedef int pattern_offset_t;
976
977 typedef struct
978 {
979 pattern_offset_t begalt_offset;
980 pattern_offset_t fixup_alt_jump;
981 pattern_offset_t inner_group_offset;
982 pattern_offset_t laststart_offset;
983 regnum_t regnum;
984 } compile_stack_elt_t;
985
986
987 typedef struct
988 {
989 compile_stack_elt_t *stack;
990 unsigned size;
991 unsigned avail; /* Offset of next open position. */
992 } compile_stack_type;
993
994
995 #define INIT_COMPILE_STACK_SIZE 32
996
997 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
998 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
999
1000 /* The next available element. */
1001 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1002
1003
1004 /* Set the bit for character C in a list. */
1005 #define SET_LIST_BIT(c) \
1006 (b[((unsigned char) (c)) / BYTEWIDTH] \
1007 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1008
1009
1010 /* Get the next unsigned number in the uncompiled pattern. */
1011 #define GET_UNSIGNED_NUMBER(num) \
1012 { if (p != pend) \
1013 { \
1014 PATFETCH (c); \
1015 while (ISDIGIT (c)) \
1016 { \
1017 if (num < 0) \
1018 num = 0; \
1019 num = num * 10 + c - '0'; \
1020 if (p == pend) \
1021 break; \
1022 PATFETCH (c); \
1023 } \
1024 } \
1025 }
1026
1027 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1028
1029 #define IS_CHAR_CLASS(string) \
1030 (STREQ (string, "alpha") || STREQ (string, "upper") \
1031 || STREQ (string, "lower") || STREQ (string, "digit") \
1032 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1033 || STREQ (string, "space") || STREQ (string, "print") \
1034 || STREQ (string, "punct") || STREQ (string, "graph") \
1035 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1036
1037 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1038 Returns one of error codes defined in `regex.h', or zero for success.
1039
1040 Assumes the `allocated' (and perhaps `buffer') and `translate'
1041 fields are set in BUFP on entry.
1042
1043 If it succeeds, results are put in BUFP (if it returns an error, the
1044 contents of BUFP are undefined):
1045 `buffer' is the compiled pattern;
1046 `syntax' is set to SYNTAX;
1047 `used' is set to the length of the compiled pattern;
1048 `fastmap_accurate' is zero;
1049 `re_nsub' is the number of subexpressions in PATTERN;
1050 `not_bol' and `not_eol' are zero;
1051
1052 The `fastmap' and `newline_anchor' fields are neither
1053 examined nor set. */
1054
1055 static reg_errcode_t
regex_compile(pattern,size,syntax,bufp)1056 regex_compile (pattern, size, syntax, bufp)
1057 const char *pattern;
1058 int size;
1059 reg_syntax_t syntax;
1060 struct re_pattern_buffer *bufp;
1061 {
1062 /* We fetch characters from PATTERN here. Even though PATTERN is
1063 `char *' (i.e., signed), we declare these variables as unsigned, so
1064 they can be reliably used as array indices. */
1065 register unsigned char c, c1;
1066
1067 /* A random tempory spot in PATTERN. */
1068 const char *p1;
1069
1070 /* Points to the end of the buffer, where we should append. */
1071 register unsigned char *b;
1072
1073 /* Keeps track of unclosed groups. */
1074 compile_stack_type compile_stack;
1075
1076 /* Points to the current (ending) position in the pattern. */
1077 const char *p = pattern;
1078 const char *pend = pattern + size;
1079
1080 /* How to translate the characters in the pattern. */
1081 char *translate = bufp->translate;
1082
1083 /* Address of the count-byte of the most recently inserted `exactn'
1084 command. This makes it possible to tell if a new exact-match
1085 character can be added to that command or if the character requires
1086 a new `exactn' command. */
1087 unsigned char *pending_exact = 0;
1088
1089 /* Address of start of the most recently finished expression.
1090 This tells, e.g., postfix * where to find the start of its
1091 operand. Reset at the beginning of groups and alternatives. */
1092 unsigned char *laststart = 0;
1093
1094 /* Address of beginning of regexp, or inside of last group. */
1095 unsigned char *begalt;
1096
1097 /* Place in the uncompiled pattern (i.e., the {) to
1098 which to go back if the interval is invalid. */
1099 const char *beg_interval;
1100
1101 /* Address of the place where a forward jump should go to the end of
1102 the containing expression. Each alternative of an `or' -- except the
1103 last -- ends with a forward jump of this sort. */
1104 unsigned char *fixup_alt_jump = 0;
1105
1106 /* Counts open-groups as they are encountered. Remembered for the
1107 matching close-group on the compile stack, so the same register
1108 number is put in the stop_memory as the start_memory. */
1109 regnum_t regnum = 0;
1110
1111 #ifdef DEBUG
1112 DEBUG_PRINT1 ("\nCompiling pattern: ");
1113 if (debug)
1114 {
1115 unsigned debug_count;
1116
1117 for (debug_count = 0; debug_count < size; debug_count++)
1118 printchar (pattern[debug_count]);
1119 putchar ('\n');
1120 }
1121 #endif /* DEBUG */
1122
1123 /* Initialize the compile stack. */
1124 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1125 if (compile_stack.stack == NULL)
1126 return REG_ESPACE;
1127
1128 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1129 compile_stack.avail = 0;
1130
1131 /* Initialize the pattern buffer. */
1132 bufp->syntax = syntax;
1133 bufp->fastmap_accurate = 0;
1134 bufp->not_bol = bufp->not_eol = 0;
1135
1136 /* Set `used' to zero, so that if we return an error, the pattern
1137 printer (for debugging) will think there's no pattern. We reset it
1138 at the end. */
1139 bufp->used = 0;
1140
1141 /* Always count groups, whether or not bufp->no_sub is set. */
1142 bufp->re_nsub = 0;
1143
1144 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1145 /* Initialize the syntax table. */
1146 init_syntax_once ();
1147 #endif
1148
1149 if (bufp->allocated == 0)
1150 {
1151 if (bufp->buffer)
1152 { /* If zero allocated, but buffer is non-null, try to realloc
1153 enough space. This loses if buffer's address is bogus, but
1154 that is the user's responsibility. */
1155 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1156 }
1157 else
1158 { /* Caller did not allocate a buffer. Do it for them. */
1159 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1160 }
1161 if (!bufp->buffer) return REG_ESPACE;
1162
1163 bufp->allocated = INIT_BUF_SIZE;
1164 }
1165
1166 begalt = b = bufp->buffer;
1167
1168 /* Loop through the uncompiled pattern until we're at the end. */
1169 while (p != pend)
1170 {
1171 PATFETCH (c);
1172
1173 switch (c)
1174 {
1175 case '^':
1176 {
1177 if ( /* If at start of pattern, it's an operator. */
1178 p == pattern + 1
1179 /* If context independent, it's an operator. */
1180 || syntax & RE_CONTEXT_INDEP_ANCHORS
1181 /* Otherwise, depends on what's come before. */
1182 || at_begline_loc_p (pattern, p, syntax))
1183 BUF_PUSH (begline);
1184 else
1185 goto normal_char;
1186 }
1187 break;
1188
1189
1190 case '$':
1191 {
1192 if ( /* If at end of pattern, it's an operator. */
1193 p == pend
1194 /* If context independent, it's an operator. */
1195 || syntax & RE_CONTEXT_INDEP_ANCHORS
1196 /* Otherwise, depends on what's next. */
1197 || at_endline_loc_p (p, pend, syntax))
1198 BUF_PUSH (endline);
1199 else
1200 goto normal_char;
1201 }
1202 break;
1203
1204
1205 case '+':
1206 case '?':
1207 if ((syntax & RE_BK_PLUS_QM)
1208 || (syntax & RE_LIMITED_OPS))
1209 goto normal_char;
1210 handle_plus:
1211 case '*':
1212 /* If there is no previous pattern... */
1213 if (!laststart)
1214 {
1215 if (syntax & RE_CONTEXT_INVALID_OPS)
1216 return REG_BADRPT;
1217 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1218 goto normal_char;
1219 }
1220
1221 {
1222 /* Are we optimizing this jump? */
1223 boolean keep_string_p = false;
1224
1225 /* 1 means zero (many) matches is allowed. */
1226 char zero_times_ok = 0, many_times_ok = 0;
1227
1228 /* If there is a sequence of repetition chars, collapse it
1229 down to just one (the right one). We can't combine
1230 interval operators with these because of, e.g., `a{2}*',
1231 which should only match an even number of `a's. */
1232
1233 for (;;)
1234 {
1235 zero_times_ok |= c != '+';
1236 many_times_ok |= c != '?';
1237
1238 if (p == pend)
1239 break;
1240
1241 PATFETCH (c);
1242
1243 if (c == '*'
1244 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1245 ;
1246
1247 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1248 {
1249 if (p == pend) return REG_EESCAPE;
1250
1251 PATFETCH (c1);
1252 if (!(c1 == '+' || c1 == '?'))
1253 {
1254 PATUNFETCH;
1255 PATUNFETCH;
1256 break;
1257 }
1258
1259 c = c1;
1260 }
1261 else
1262 {
1263 PATUNFETCH;
1264 break;
1265 }
1266
1267 /* If we get here, we found another repeat character. */
1268 }
1269
1270 /* Star, etc. applied to an empty pattern is equivalent
1271 to an empty pattern. */
1272 if (!laststart)
1273 break;
1274
1275 /* Now we know whether or not zero matches is allowed
1276 and also whether or not two or more matches is allowed. */
1277 if (many_times_ok)
1278 { /* More than one repetition is allowed, so put in at the
1279 end a backward relative jump from `b' to before the next
1280 jump we're going to put in below (which jumps from
1281 laststart to after this jump).
1282
1283 But if we are at the `*' in the exact sequence `.*\n',
1284 insert an unconditional jump backwards to the .,
1285 instead of the beginning of the loop. This way we only
1286 push a failure point once, instead of every time
1287 through the loop. */
1288 assert (p - 1 > pattern);
1289
1290 /* Allocate the space for the jump. */
1291 GET_BUFFER_SPACE (3);
1292
1293 /* We know we are not at the first character of the pattern,
1294 because laststart was nonzero. And we've already
1295 incremented `p', by the way, to be the character after
1296 the `*'. Do we have to do something analogous here
1297 for null bytes, because of RE_DOT_NOT_NULL? */
1298 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1299 && zero_times_ok
1300 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1301 && !(syntax & RE_DOT_NEWLINE))
1302 { /* We have .*\n. */
1303 STORE_JUMP (jump, b, laststart);
1304 keep_string_p = true;
1305 }
1306 else
1307 /* Anything else. */
1308 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1309
1310 /* We've added more stuff to the buffer. */
1311 b += 3;
1312 }
1313
1314 /* On failure, jump from laststart to b + 3, which will be the
1315 end of the buffer after this jump is inserted. */
1316 GET_BUFFER_SPACE (3);
1317 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1318 : on_failure_jump,
1319 laststart, b + 3);
1320 pending_exact = 0;
1321 b += 3;
1322
1323 if (!zero_times_ok)
1324 {
1325 /* At least one repetition is required, so insert a
1326 `dummy_failure_jump' before the initial
1327 `on_failure_jump' instruction of the loop. This
1328 effects a skip over that instruction the first time
1329 we hit that loop. */
1330 GET_BUFFER_SPACE (3);
1331 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1332 b += 3;
1333 }
1334 }
1335 break;
1336
1337
1338 case '.':
1339 laststart = b;
1340 BUF_PUSH (anychar);
1341 break;
1342
1343
1344 case '[':
1345 {
1346 boolean had_char_class = false;
1347
1348 if (p == pend) return REG_EBRACK;
1349
1350 /* Ensure that we have enough space to push a charset: the
1351 opcode, the length count, and the bitset; 34 bytes in all. */
1352 GET_BUFFER_SPACE (34);
1353
1354 laststart = b;
1355
1356 /* We test `*p == '^' twice, instead of using an if
1357 statement, so we only need one BUF_PUSH. */
1358 BUF_PUSH (*p == '^' ? charset_not : charset);
1359 if (*p == '^')
1360 p++;
1361
1362 /* Remember the first position in the bracket expression. */
1363 p1 = p;
1364
1365 /* Push the number of bytes in the bitmap. */
1366 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1367
1368 /* Clear the whole map. */
1369 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1370
1371 /* charset_not matches newline according to a syntax bit. */
1372 if ((re_opcode_t) b[-2] == charset_not
1373 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1374 SET_LIST_BIT ('\n');
1375
1376 /* Read in characters and ranges, setting map bits. */
1377 for (;;)
1378 {
1379 if (p == pend) return REG_EBRACK;
1380
1381 PATFETCH (c);
1382
1383 /* \ might escape characters inside [...] and [^...]. */
1384 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1385 {
1386 if (p == pend) return REG_EESCAPE;
1387
1388 PATFETCH (c1);
1389 SET_LIST_BIT (c1);
1390 continue;
1391 }
1392
1393 /* Could be the end of the bracket expression. If it's
1394 not (i.e., when the bracket expression is `[]' so
1395 far), the ']' character bit gets set way below. */
1396 if (c == ']' && p != p1 + 1)
1397 break;
1398
1399 /* Look ahead to see if it's a range when the last thing
1400 was a character class. */
1401 if (had_char_class && c == '-' && *p != ']')
1402 return REG_ERANGE;
1403
1404 /* Look ahead to see if it's a range when the last thing
1405 was a character: if this is a hyphen not at the
1406 beginning or the end of a list, then it's the range
1407 operator. */
1408 if (c == '-'
1409 && !(p - 2 >= pattern && p[-2] == '[')
1410 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1411 && *p != ']')
1412 {
1413 reg_errcode_t ret
1414 = compile_range (&p, pend, translate, syntax, b);
1415 if (ret != REG_NOERROR) return ret;
1416 }
1417
1418 else if (p[0] == '-' && p[1] != ']')
1419 { /* This handles ranges made up of characters only. */
1420 reg_errcode_t ret;
1421
1422 /* Move past the `-'. */
1423 PATFETCH (c1);
1424
1425 ret = compile_range (&p, pend, translate, syntax, b);
1426 if (ret != REG_NOERROR) return ret;
1427 }
1428
1429 /* See if we're at the beginning of a possible character
1430 class. */
1431
1432 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1433 { /* Leave room for the null. */
1434 char str[CHAR_CLASS_MAX_LENGTH + 1];
1435
1436 PATFETCH (c);
1437 c1 = 0;
1438
1439 /* If pattern is `[[:'. */
1440 if (p == pend) return REG_EBRACK;
1441
1442 for (;;)
1443 {
1444 PATFETCH (c);
1445 if (c == ':' || c == ']' || p == pend
1446 || c1 == CHAR_CLASS_MAX_LENGTH)
1447 break;
1448 str[c1++] = c;
1449 }
1450 str[c1] = '\0';
1451
1452 /* If isn't a word bracketed by `[:' and:`]':
1453 undo the ending character, the letters, and leave
1454 the leading `:' and `[' (but set bits for them). */
1455 if (c == ':' && *p == ']')
1456 {
1457 int ch;
1458 boolean is_alnum = STREQ (str, "alnum");
1459 boolean is_alpha = STREQ (str, "alpha");
1460 boolean is_blank = STREQ (str, "blank");
1461 boolean is_cntrl = STREQ (str, "cntrl");
1462 boolean is_digit = STREQ (str, "digit");
1463 boolean is_graph = STREQ (str, "graph");
1464 boolean is_lower = STREQ (str, "lower");
1465 boolean is_print = STREQ (str, "print");
1466 boolean is_punct = STREQ (str, "punct");
1467 boolean is_space = STREQ (str, "space");
1468 boolean is_upper = STREQ (str, "upper");
1469 boolean is_xdigit = STREQ (str, "xdigit");
1470
1471 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
1472
1473 /* Throw away the ] at the end of the character
1474 class. */
1475 PATFETCH (c);
1476
1477 if (p == pend) return REG_EBRACK;
1478
1479 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1480 {
1481 if ( (is_alnum && ISALNUM (ch))
1482 || (is_alpha && ISALPHA (ch))
1483 || (is_blank && ISBLANK (ch))
1484 || (is_cntrl && ISCNTRL (ch))
1485 || (is_digit && ISDIGIT (ch))
1486 || (is_graph && ISGRAPH (ch))
1487 || (is_lower && ISLOWER (ch))
1488 || (is_print && ISPRINT (ch))
1489 || (is_punct && ISPUNCT (ch))
1490 || (is_space && ISSPACE (ch))
1491 || (is_upper && ISUPPER (ch))
1492 || (is_xdigit && ISXDIGIT (ch)))
1493 SET_LIST_BIT (ch);
1494 }
1495 had_char_class = true;
1496 }
1497 else
1498 {
1499 c1++;
1500 while (c1--)
1501 PATUNFETCH;
1502 SET_LIST_BIT ('[');
1503 SET_LIST_BIT (':');
1504 had_char_class = false;
1505 }
1506 }
1507 else
1508 {
1509 had_char_class = false;
1510 SET_LIST_BIT (c);
1511 }
1512 }
1513
1514 /* Discard any (non)matching list bytes that are all 0 at the
1515 end of the map. Decrease the map-length byte too. */
1516 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1517 b[-1]--;
1518 b += b[-1];
1519 }
1520 break;
1521
1522
1523 case '(':
1524 if (syntax & RE_NO_BK_PARENS)
1525 goto handle_open;
1526 else
1527 goto normal_char;
1528
1529
1530 case ')':
1531 if (syntax & RE_NO_BK_PARENS)
1532 goto handle_close;
1533 else
1534 goto normal_char;
1535
1536
1537 case '\n':
1538 if (syntax & RE_NEWLINE_ALT)
1539 goto handle_alt;
1540 else
1541 goto normal_char;
1542
1543
1544 case '|':
1545 if (syntax & RE_NO_BK_VBAR)
1546 goto handle_alt;
1547 else
1548 goto normal_char;
1549
1550
1551 case '{':
1552 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1553 goto handle_interval;
1554 else
1555 goto normal_char;
1556
1557
1558 case '\\':
1559 if (p == pend) return REG_EESCAPE;
1560
1561 /* Do not translate the character after the \, so that we can
1562 distinguish, e.g., \B from \b, even if we normally would
1563 translate, e.g., B to b. */
1564 PATFETCH_RAW (c);
1565
1566 switch (c)
1567 {
1568 case '(':
1569 if (syntax & RE_NO_BK_PARENS)
1570 goto normal_backslash;
1571
1572 handle_open:
1573 bufp->re_nsub++;
1574 regnum++;
1575
1576 if (COMPILE_STACK_FULL)
1577 {
1578 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1579 compile_stack_elt_t);
1580 if (compile_stack.stack == NULL) return REG_ESPACE;
1581
1582 compile_stack.size <<= 1;
1583 }
1584
1585 /* These are the values to restore when we hit end of this
1586 group. They are all relative offsets, so that if the
1587 whole pattern moves because of realloc, they will still
1588 be valid. */
1589 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1590 COMPILE_STACK_TOP.fixup_alt_jump
1591 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1592 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1593 COMPILE_STACK_TOP.regnum = regnum;
1594
1595 /* We will eventually replace the 0 with the number of
1596 groups inner to this one. But do not push a
1597 start_memory for groups beyond the last one we can
1598 represent in the compiled pattern. */
1599 if (regnum <= MAX_REGNUM)
1600 {
1601 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1602 BUF_PUSH_3 (start_memory, regnum, 0);
1603 }
1604
1605 compile_stack.avail++;
1606
1607 fixup_alt_jump = 0;
1608 laststart = 0;
1609 begalt = b;
1610 /* If we've reached MAX_REGNUM groups, then this open
1611 won't actually generate any code, so we'll have to
1612 clear pending_exact explicitly. */
1613 pending_exact = 0;
1614 break;
1615
1616
1617 case ')':
1618 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1619
1620 if (COMPILE_STACK_EMPTY)
1621 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1622 goto normal_backslash;
1623 else
1624 return REG_ERPAREN;
1625
1626 handle_close:
1627 if (fixup_alt_jump)
1628 { /* Push a dummy failure point at the end of the
1629 alternative for a possible future
1630 `pop_failure_jump' to pop. See comments at
1631 `push_dummy_failure' in `re_match_2'. */
1632 BUF_PUSH (push_dummy_failure);
1633
1634 /* We allocated space for this jump when we assigned
1635 to `fixup_alt_jump', in the `handle_alt' case below. */
1636 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1637 }
1638
1639 /* See similar code for backslashed left paren above. */
1640 if (COMPILE_STACK_EMPTY)
1641 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1642 goto normal_char;
1643 else
1644 return REG_ERPAREN;
1645
1646 /* Since we just checked for an empty stack above, this
1647 ``can't happen''. */
1648 assert (compile_stack.avail != 0);
1649 {
1650 /* We don't just want to restore into `regnum', because
1651 later groups should continue to be numbered higher,
1652 as in `(ab)c(de)' -- the second group is #2. */
1653 regnum_t this_group_regnum;
1654
1655 compile_stack.avail--;
1656 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1657 fixup_alt_jump
1658 = COMPILE_STACK_TOP.fixup_alt_jump
1659 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1660 : 0;
1661 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1662 this_group_regnum = COMPILE_STACK_TOP.regnum;
1663 /* If we've reached MAX_REGNUM groups, then this open
1664 won't actually generate any code, so we'll have to
1665 clear pending_exact explicitly. */
1666 pending_exact = 0;
1667
1668 /* We're at the end of the group, so now we know how many
1669 groups were inside this one. */
1670 if (this_group_regnum <= MAX_REGNUM)
1671 {
1672 unsigned char *inner_group_loc
1673 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1674
1675 *inner_group_loc = regnum - this_group_regnum;
1676 BUF_PUSH_3 (stop_memory, this_group_regnum,
1677 regnum - this_group_regnum);
1678 }
1679 }
1680 break;
1681
1682
1683 case '|': /* `\|'. */
1684 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1685 goto normal_backslash;
1686 handle_alt:
1687 if (syntax & RE_LIMITED_OPS)
1688 goto normal_char;
1689
1690 /* Insert before the previous alternative a jump which
1691 jumps to this alternative if the former fails. */
1692 GET_BUFFER_SPACE (3);
1693 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1694 pending_exact = 0;
1695 b += 3;
1696
1697 /* The alternative before this one has a jump after it
1698 which gets executed if it gets matched. Adjust that
1699 jump so it will jump to this alternative's analogous
1700 jump (put in below, which in turn will jump to the next
1701 (if any) alternative's such jump, etc.). The last such
1702 jump jumps to the correct final destination. A picture:
1703 _____ _____
1704 | | | |
1705 | v | v
1706 a | b | c
1707
1708 If we are at `b', then fixup_alt_jump right now points to a
1709 three-byte space after `a'. We'll put in the jump, set
1710 fixup_alt_jump to right after `b', and leave behind three
1711 bytes which we'll fill in when we get to after `c'. */
1712
1713 if (fixup_alt_jump)
1714 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1715
1716 /* Mark and leave space for a jump after this alternative,
1717 to be filled in later either by next alternative or
1718 when know we're at the end of a series of alternatives. */
1719 fixup_alt_jump = b;
1720 GET_BUFFER_SPACE (3);
1721 b += 3;
1722
1723 laststart = 0;
1724 begalt = b;
1725 break;
1726
1727
1728 case '{':
1729 /* If \{ is a literal. */
1730 if (!(syntax & RE_INTERVALS)
1731 /* If we're at `\{' and it's not the open-interval
1732 operator. */
1733 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1734 || (p - 2 == pattern && p == pend))
1735 goto normal_backslash;
1736
1737 handle_interval:
1738 {
1739 /* If got here, then the syntax allows intervals. */
1740
1741 /* At least (most) this many matches must be made. */
1742 int lower_bound = -1, upper_bound = -1;
1743
1744 beg_interval = p - 1;
1745
1746 if (p == pend)
1747 {
1748 if (syntax & RE_NO_BK_BRACES)
1749 goto unfetch_interval;
1750 else
1751 return REG_EBRACE;
1752 }
1753
1754 GET_UNSIGNED_NUMBER (lower_bound);
1755
1756 if (c == ',')
1757 {
1758 GET_UNSIGNED_NUMBER (upper_bound);
1759 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1760 }
1761 else
1762 /* Interval such as `{1}' => match exactly once. */
1763 upper_bound = lower_bound;
1764
1765 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1766 || lower_bound > upper_bound)
1767 {
1768 if (syntax & RE_NO_BK_BRACES)
1769 goto unfetch_interval;
1770 else
1771 return REG_BADBR;
1772 }
1773
1774 if (!(syntax & RE_NO_BK_BRACES))
1775 {
1776 if (c != '\\') return REG_EBRACE;
1777
1778 PATFETCH (c);
1779 }
1780
1781 if (c != '}')
1782 {
1783 if (syntax & RE_NO_BK_BRACES)
1784 goto unfetch_interval;
1785 else
1786 return REG_BADBR;
1787 }
1788
1789 /* We just parsed a valid interval. */
1790
1791 /* If it's invalid to have no preceding re. */
1792 if (!laststart)
1793 {
1794 if (syntax & RE_CONTEXT_INVALID_OPS)
1795 return REG_BADRPT;
1796 else if (syntax & RE_CONTEXT_INDEP_OPS)
1797 laststart = b;
1798 else
1799 goto unfetch_interval;
1800 }
1801
1802 /* If the upper bound is zero, don't want to succeed at
1803 all; jump from `laststart' to `b + 3', which will be
1804 the end of the buffer after we insert the jump. */
1805 if (upper_bound == 0)
1806 {
1807 GET_BUFFER_SPACE (3);
1808 INSERT_JUMP (jump, laststart, b + 3);
1809 b += 3;
1810 }
1811
1812 /* Otherwise, we have a nontrivial interval. When
1813 we're all done, the pattern will look like:
1814 set_number_at <jump count> <upper bound>
1815 set_number_at <succeed_n count> <lower bound>
1816 succeed_n <after jump addr> <succed_n count>
1817 <body of loop>
1818 jump_n <succeed_n addr> <jump count>
1819 (The upper bound and `jump_n' are omitted if
1820 `upper_bound' is 1, though.) */
1821 else
1822 { /* If the upper bound is > 1, we need to insert
1823 more at the end of the loop. */
1824 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1825
1826 GET_BUFFER_SPACE (nbytes);
1827
1828 /* Initialize lower bound of the `succeed_n', even
1829 though it will be set during matching by its
1830 attendant `set_number_at' (inserted next),
1831 because `re_compile_fastmap' needs to know.
1832 Jump to the `jump_n' we might insert below. */
1833 INSERT_JUMP2 (succeed_n, laststart,
1834 b + 5 + (upper_bound > 1) * 5,
1835 lower_bound);
1836 b += 5;
1837
1838 /* Code to initialize the lower bound. Insert
1839 before the `succeed_n'. The `5' is the last two
1840 bytes of this `set_number_at', plus 3 bytes of
1841 the following `succeed_n'. */
1842 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1843 b += 5;
1844
1845 if (upper_bound > 1)
1846 { /* More than one repetition is allowed, so
1847 append a backward jump to the `succeed_n'
1848 that starts this interval.
1849
1850 When we've reached this during matching,
1851 we'll have matched the interval once, so
1852 jump back only `upper_bound - 1' times. */
1853 STORE_JUMP2 (jump_n, b, laststart + 5,
1854 upper_bound - 1);
1855 b += 5;
1856
1857 /* The location we want to set is the second
1858 parameter of the `jump_n'; that is `b-2' as
1859 an absolute address. `laststart' will be
1860 the `set_number_at' we're about to insert;
1861 `laststart+3' the number to set, the source
1862 for the relative address. But we are
1863 inserting into the middle of the pattern --
1864 so everything is getting moved up by 5.
1865 Conclusion: (b - 2) - (laststart + 3) + 5,
1866 i.e., b - laststart.
1867
1868 We insert this at the beginning of the loop
1869 so that if we fail during matching, we'll
1870 reinitialize the bounds. */
1871 insert_op2 (set_number_at, laststart, b - laststart,
1872 upper_bound - 1, b);
1873 b += 5;
1874 }
1875 }
1876 pending_exact = 0;
1877 beg_interval = NULL;
1878 }
1879 break;
1880
1881 unfetch_interval:
1882 /* If an invalid interval, match the characters as literals. */
1883 assert (beg_interval);
1884 p = beg_interval;
1885 beg_interval = NULL;
1886
1887 /* normal_char and normal_backslash need `c'. */
1888 PATFETCH (c);
1889
1890 if (!(syntax & RE_NO_BK_BRACES))
1891 {
1892 if (p > pattern && p[-1] == '\\')
1893 goto normal_backslash;
1894 }
1895 goto normal_char;
1896
1897 #ifdef emacs
1898 /* There is no way to specify the before_dot and after_dot
1899 operators. rms says this is ok. --karl */
1900 case '=':
1901 BUF_PUSH (at_dot);
1902 break;
1903
1904 case 's':
1905 laststart = b;
1906 PATFETCH (c);
1907 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
1908 break;
1909
1910 case 'S':
1911 laststart = b;
1912 PATFETCH (c);
1913 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
1914 break;
1915 #endif /* emacs */
1916
1917
1918 case 'w':
1919 laststart = b;
1920 BUF_PUSH (wordchar);
1921 break;
1922
1923
1924 case 'W':
1925 laststart = b;
1926 BUF_PUSH (notwordchar);
1927 break;
1928
1929
1930 case '<':
1931 BUF_PUSH (wordbeg);
1932 break;
1933
1934 case '>':
1935 BUF_PUSH (wordend);
1936 break;
1937
1938 case 'b':
1939 BUF_PUSH (wordbound);
1940 break;
1941
1942 case 'B':
1943 BUF_PUSH (notwordbound);
1944 break;
1945
1946 case '`':
1947 BUF_PUSH (begbuf);
1948 break;
1949
1950 case '\'':
1951 BUF_PUSH (endbuf);
1952 break;
1953
1954 case '1': case '2': case '3': case '4': case '5':
1955 case '6': case '7': case '8': case '9':
1956 if (syntax & RE_NO_BK_REFS)
1957 goto normal_char;
1958
1959 c1 = c - '0';
1960
1961 if (c1 > regnum)
1962 return REG_ESUBREG;
1963
1964 /* Can't back reference to a subexpression if inside of it. */
1965 if (group_in_compile_stack (compile_stack, c1))
1966 goto normal_char;
1967
1968 laststart = b;
1969 BUF_PUSH_2 (duplicate, c1);
1970 break;
1971
1972
1973 case '+':
1974 case '?':
1975 if (syntax & RE_BK_PLUS_QM)
1976 goto handle_plus;
1977 else
1978 goto normal_backslash;
1979
1980 default:
1981 normal_backslash:
1982 /* You might think it would be useful for \ to mean
1983 not to translate; but if we don't translate it
1984 it will never match anything. */
1985 c = TRANSLATE (c);
1986 goto normal_char;
1987 }
1988 break;
1989
1990
1991 default:
1992 /* Expects the character in `c'. */
1993 normal_char:
1994 /* If no exactn currently being built. */
1995 if (!pending_exact
1996
1997 /* If last exactn not at current position. */
1998 || pending_exact + *pending_exact + 1 != b
1999
2000 /* We have only one byte following the exactn for the count. */
2001 || *pending_exact == (1 << BYTEWIDTH) - 1
2002
2003 /* If followed by a repetition operator. */
2004 || *p == '*' || *p == '^'
2005 || ((syntax & RE_BK_PLUS_QM)
2006 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2007 : (*p == '+' || *p == '?'))
2008 || ((syntax & RE_INTERVALS)
2009 && ((syntax & RE_NO_BK_BRACES)
2010 ? *p == '{'
2011 : (p[0] == '\\' && p[1] == '{'))))
2012 {
2013 /* Start building a new exactn. */
2014
2015 laststart = b;
2016
2017 BUF_PUSH_2 (exactn, 0);
2018 pending_exact = b - 1;
2019 }
2020
2021 BUF_PUSH (c);
2022 (*pending_exact)++;
2023 break;
2024 } /* switch (c) */
2025 } /* while p != pend */
2026
2027
2028 /* Through the pattern now. */
2029
2030 if (fixup_alt_jump)
2031 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2032
2033 if (!COMPILE_STACK_EMPTY)
2034 return REG_EPAREN;
2035
2036 free (compile_stack.stack);
2037
2038 /* We have succeeded; set the length of the buffer. */
2039 bufp->used = b - bufp->buffer;
2040
2041 #ifdef DEBUG
2042 if (debug)
2043 {
2044 DEBUG_PRINT1 ("\nCompiled pattern: ");
2045 print_compiled_pattern (bufp);
2046 }
2047 #endif /* DEBUG */
2048
2049 return REG_NOERROR;
2050 } /* regex_compile */
2051
2052 /* Subroutines for `regex_compile'. */
2053
2054 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2055
2056 static void
store_op1(op,loc,arg)2057 store_op1 (op, loc, arg)
2058 re_opcode_t op;
2059 unsigned char *loc;
2060 int arg;
2061 {
2062 *loc = (unsigned char) op;
2063 STORE_NUMBER (loc + 1, arg);
2064 }
2065
2066
2067 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2068
2069 static void
store_op2(op,loc,arg1,arg2)2070 store_op2 (op, loc, arg1, arg2)
2071 re_opcode_t op;
2072 unsigned char *loc;
2073 int arg1, arg2;
2074 {
2075 *loc = (unsigned char) op;
2076 STORE_NUMBER (loc + 1, arg1);
2077 STORE_NUMBER (loc + 3, arg2);
2078 }
2079
2080
2081 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2082 for OP followed by two-byte integer parameter ARG. */
2083
2084 static void
insert_op1(op,loc,arg,end)2085 insert_op1 (op, loc, arg, end)
2086 re_opcode_t op;
2087 unsigned char *loc;
2088 int arg;
2089 unsigned char *end;
2090 {
2091 register unsigned char *pfrom = end;
2092 register unsigned char *pto = end + 3;
2093
2094 while (pfrom != loc)
2095 *--pto = *--pfrom;
2096
2097 store_op1 (op, loc, arg);
2098 }
2099
2100
2101 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2102
2103 static void
insert_op2(op,loc,arg1,arg2,end)2104 insert_op2 (op, loc, arg1, arg2, end)
2105 re_opcode_t op;
2106 unsigned char *loc;
2107 int arg1, arg2;
2108 unsigned char *end;
2109 {
2110 register unsigned char *pfrom = end;
2111 register unsigned char *pto = end + 5;
2112
2113 while (pfrom != loc)
2114 *--pto = *--pfrom;
2115
2116 store_op2 (op, loc, arg1, arg2);
2117 }
2118
2119
2120 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2121 after an alternative or a begin-subexpression. We assume there is at
2122 least one character before the ^. */
2123
2124 static boolean
at_begline_loc_p(pattern,p,syntax)2125 at_begline_loc_p (pattern, p, syntax)
2126 const char *pattern, *p;
2127 reg_syntax_t syntax;
2128 {
2129 const char *prev = p - 2;
2130 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2131
2132 return
2133 /* After a subexpression? */
2134 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2135 /* After an alternative? */
2136 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2137 }
2138
2139
2140 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2141 at least one character after the $, i.e., `P < PEND'. */
2142
2143 static boolean
at_endline_loc_p(p,pend,syntax)2144 at_endline_loc_p (p, pend, syntax)
2145 const char *p, *pend;
2146 int syntax;
2147 {
2148 const char *next = p;
2149 boolean next_backslash = *next == '\\';
2150 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2151
2152 return
2153 /* Before a subexpression? */
2154 (syntax & RE_NO_BK_PARENS ? *next == ')'
2155 : next_backslash && next_next && *next_next == ')')
2156 /* Before an alternative? */
2157 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2158 : next_backslash && next_next && *next_next == '|');
2159 }
2160
2161
2162 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2163 false if it's not. */
2164
2165 static boolean
group_in_compile_stack(compile_stack,regnum)2166 group_in_compile_stack (compile_stack, regnum)
2167 compile_stack_type compile_stack;
2168 regnum_t regnum;
2169 {
2170 int this_element;
2171
2172 for (this_element = compile_stack.avail - 1;
2173 this_element >= 0;
2174 this_element--)
2175 if (compile_stack.stack[this_element].regnum == regnum)
2176 return true;
2177
2178 return false;
2179 }
2180
2181
2182 /* Read the ending character of a range (in a bracket expression) from the
2183 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2184 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2185 Then we set the translation of all bits between the starting and
2186 ending characters (inclusive) in the compiled pattern B.
2187
2188 Return an error code.
2189
2190 We use these short variable names so we can use the same macros as
2191 `regex_compile' itself. */
2192
2193 static reg_errcode_t
compile_range(p_ptr,pend,translate,syntax,b)2194 compile_range (p_ptr, pend, translate, syntax, b)
2195 const char **p_ptr, *pend;
2196 char *translate;
2197 reg_syntax_t syntax;
2198 unsigned char *b;
2199 {
2200 unsigned this_char;
2201
2202 const char *p = *p_ptr;
2203 int range_start, range_end;
2204
2205 if (p == pend)
2206 return REG_ERANGE;
2207
2208 /* Even though the pattern is a signed `char *', we need to fetch
2209 with unsigned char *'s; if the high bit of the pattern character
2210 is set, the range endpoints will be negative if we fetch using a
2211 signed char *.
2212
2213 We also want to fetch the endpoints without translating them; the
2214 appropriate translation is done in the bit-setting loop below. */
2215 range_start = ((unsigned char *) p)[-2];
2216 range_end = ((unsigned char *) p)[0];
2217
2218 /* Have to increment the pointer into the pattern string, so the
2219 caller isn't still at the ending character. */
2220 (*p_ptr)++;
2221
2222 /* If the start is after the end, the range is empty. */
2223 if (range_start > range_end)
2224 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2225
2226 /* Here we see why `this_char' has to be larger than an `unsigned
2227 char' -- the range is inclusive, so if `range_end' == 0xff
2228 (assuming 8-bit characters), we would otherwise go into an infinite
2229 loop, since all characters <= 0xff. */
2230 for (this_char = range_start; this_char <= range_end; this_char++)
2231 {
2232 SET_LIST_BIT (TRANSLATE (this_char));
2233 }
2234
2235 return REG_NOERROR;
2236 }
2237
2238 /* Failure stack declarations and macros; both re_compile_fastmap and
2239 re_match_2 use a failure stack. These have to be macros because of
2240 REGEX_ALLOCATE. */
2241
2242
2243 /* Number of failure points for which to initially allocate space
2244 when matching. If this number is exceeded, we allocate more
2245 space, so it is not a hard limit. */
2246 #ifndef INIT_FAILURE_ALLOC
2247 #define INIT_FAILURE_ALLOC 5
2248 #endif
2249
2250 /* Roughly the maximum number of failure points on the stack. Would be
2251 exactly that if always used MAX_FAILURE_SPACE each time we failed.
2252 This is a variable only so users of regex can assign to it; we never
2253 change it ourselves. */
2254 int re_max_failures = 2000;
2255
2256 typedef const unsigned char *fail_stack_elt_t;
2257
2258 typedef struct
2259 {
2260 fail_stack_elt_t *stack;
2261 unsigned size;
2262 unsigned avail; /* Offset of next open position. */
2263 } fail_stack_type;
2264
2265 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2266 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2267 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
2268 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2269
2270
2271 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */
2272
2273 #define INIT_FAIL_STACK() \
2274 do { \
2275 fail_stack.stack = (fail_stack_elt_t *) \
2276 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2277 \
2278 if (fail_stack.stack == NULL) \
2279 return -2; \
2280 \
2281 fail_stack.size = INIT_FAILURE_ALLOC; \
2282 fail_stack.avail = 0; \
2283 } while (0)
2284
2285
2286 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2287
2288 Return 1 if succeeds, and 0 if either ran out of memory
2289 allocating space for it or it was already too large.
2290
2291 REGEX_REALLOCATE requires `destination' be declared. */
2292
2293 #define DOUBLE_FAIL_STACK(fail_stack) \
2294 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
2295 ? 0 \
2296 : ((fail_stack).stack = (fail_stack_elt_t *) \
2297 REGEX_REALLOCATE ((fail_stack).stack, \
2298 (fail_stack).size * sizeof (fail_stack_elt_t), \
2299 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2300 \
2301 (fail_stack).stack == NULL \
2302 ? 0 \
2303 : ((fail_stack).size <<= 1, \
2304 1)))
2305
2306
2307 /* Push PATTERN_OP on FAIL_STACK.
2308
2309 Return 1 if was able to do so and 0 if ran out of memory allocating
2310 space to do so. */
2311 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2312 ((FAIL_STACK_FULL () \
2313 && !DOUBLE_FAIL_STACK (fail_stack)) \
2314 ? 0 \
2315 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2316 1))
2317
2318 /* This pushes an item onto the failure stack. Must be a four-byte
2319 value. Assumes the variable `fail_stack'. Probably should only
2320 be called from within `PUSH_FAILURE_POINT'. */
2321 #define PUSH_FAILURE_ITEM(item) \
2322 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2323
2324 /* The complement operation. Assumes `fail_stack' is nonempty. */
2325 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2326
2327 /* Used to omit pushing failure point id's when we're not debugging. */
2328 #ifdef DEBUG
2329 #define DEBUG_PUSH PUSH_FAILURE_ITEM
2330 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2331 #else
2332 #define DEBUG_PUSH(item)
2333 #define DEBUG_POP(item_addr)
2334 #endif
2335
2336
2337 /* Push the information about the state we will need
2338 if we ever fail back to it.
2339
2340 Requires variables fail_stack, regstart, regend, reg_info, and
2341 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
2342 declared.
2343
2344 Does `return FAILURE_CODE' if runs out of memory. */
2345
2346 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2347 do { \
2348 char *destination; \
2349 /* Must be int, so when we don't save any registers, the arithmetic \
2350 of 0 + -1 isn't done as unsigned. */ \
2351 int this_reg; \
2352 \
2353 DEBUG_STATEMENT (failure_id++); \
2354 DEBUG_STATEMENT (nfailure_points_pushed++); \
2355 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2356 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2357 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2358 \
2359 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2360 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2361 \
2362 /* Ensure we have enough space allocated for what we will push. */ \
2363 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2364 { \
2365 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2366 return failure_code; \
2367 \
2368 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2369 (fail_stack).size); \
2370 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2371 } \
2372 \
2373 /* Push the info, starting with the registers. */ \
2374 DEBUG_PRINT1 ("\n"); \
2375 \
2376 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
2377 this_reg++) \
2378 { \
2379 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2380 DEBUG_STATEMENT (num_regs_pushed++); \
2381 \
2382 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2383 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2384 \
2385 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2386 PUSH_FAILURE_ITEM (regend[this_reg]); \
2387 \
2388 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2389 DEBUG_PRINT2 (" match_null=%d", \
2390 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2391 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2392 DEBUG_PRINT2 (" matched_something=%d", \
2393 MATCHED_SOMETHING (reg_info[this_reg])); \
2394 DEBUG_PRINT2 (" ever_matched=%d", \
2395 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2396 DEBUG_PRINT1 ("\n"); \
2397 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2398 } \
2399 \
2400 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2401 PUSH_FAILURE_ITEM (lowest_active_reg); \
2402 \
2403 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2404 PUSH_FAILURE_ITEM (highest_active_reg); \
2405 \
2406 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2407 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2408 PUSH_FAILURE_ITEM (pattern_place); \
2409 \
2410 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2411 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2412 size2); \
2413 DEBUG_PRINT1 ("'\n"); \
2414 PUSH_FAILURE_ITEM (string_place); \
2415 \
2416 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2417 DEBUG_PUSH (failure_id); \
2418 } while (0)
2419
2420 /* This is the number of items that are pushed and popped on the stack
2421 for each register. */
2422 #define NUM_REG_ITEMS 3
2423
2424 /* Individual items aside from the registers. */
2425 #ifdef DEBUG
2426 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2427 #else
2428 #define NUM_NONREG_ITEMS 4
2429 #endif
2430
2431 /* We push at most this many items on the stack. */
2432 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2433
2434 /* We actually push this many items. */
2435 #define NUM_FAILURE_ITEMS \
2436 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2437 + NUM_NONREG_ITEMS)
2438
2439 /* How many items can still be added to the stack without overflowing it. */
2440 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2441
2442
2443 /* Pops what PUSH_FAIL_STACK pushes.
2444
2445 We restore into the parameters, all of which should be lvalues:
2446 STR -- the saved data position.
2447 PAT -- the saved pattern position.
2448 LOW_REG, HIGH_REG -- the highest and lowest active registers.
2449 REGSTART, REGEND -- arrays of string positions.
2450 REG_INFO -- array of information about each subexpression.
2451
2452 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2453 `pend', `string1', `size1', `string2', and `size2'. */
2454
2455 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2456 { \
2457 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2458 int this_reg; \
2459 const unsigned char *string_temp; \
2460 \
2461 assert (!FAIL_STACK_EMPTY ()); \
2462 \
2463 /* Remove failure points and point to how many regs pushed. */ \
2464 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2465 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2466 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2467 \
2468 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2469 \
2470 DEBUG_POP (&failure_id); \
2471 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2472 \
2473 /* If the saved string location is NULL, it came from an \
2474 on_failure_keep_string_jump opcode, and we want to throw away the \
2475 saved NULL, thus retaining our current position in the string. */ \
2476 string_temp = POP_FAILURE_ITEM (); \
2477 if (string_temp != NULL) \
2478 str = (const char *) string_temp; \
2479 \
2480 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2481 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2482 DEBUG_PRINT1 ("'\n"); \
2483 \
2484 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2485 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2486 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2487 \
2488 /* Restore register info. */ \
2489 high_reg = (unsigned) POP_FAILURE_ITEM (); \
2490 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2491 \
2492 low_reg = (unsigned) POP_FAILURE_ITEM (); \
2493 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2494 \
2495 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
2496 { \
2497 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2498 \
2499 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2500 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2501 \
2502 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2503 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2504 \
2505 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2506 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2507 } \
2508 \
2509 DEBUG_STATEMENT (nfailure_points_popped++); \
2510 } /* POP_FAILURE_POINT */
2511
2512 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2513 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2514 characters can start a string that matches the pattern. This fastmap
2515 is used by re_search to skip quickly over impossible starting points.
2516
2517 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2518 area as BUFP->fastmap.
2519
2520 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2521 the pattern buffer.
2522
2523 Returns 0 if we succeed, -2 if an internal error. */
2524
2525 int
re_compile_fastmap(bufp)2526 re_compile_fastmap (bufp)
2527 struct re_pattern_buffer *bufp;
2528 {
2529 int j, k;
2530 fail_stack_type fail_stack;
2531 #ifndef REGEX_MALLOC
2532 char *destination;
2533 #endif
2534 /* We don't push any register information onto the failure stack. */
2535 unsigned num_regs = 0;
2536
2537 register char *fastmap = bufp->fastmap;
2538 unsigned char *pattern = bufp->buffer;
2539 unsigned long size = bufp->used;
2540 const unsigned char *p = pattern;
2541 register unsigned char *pend = pattern + size;
2542
2543 /* Assume that each path through the pattern can be null until
2544 proven otherwise. We set this false at the bottom of switch
2545 statement, to which we get only if a particular path doesn't
2546 match the empty string. */
2547 boolean path_can_be_null = true;
2548
2549 /* We aren't doing a `succeed_n' to begin with. */
2550 boolean succeed_n_p = false;
2551
2552 assert (fastmap != NULL && p != NULL);
2553
2554 INIT_FAIL_STACK ();
2555 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2556 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2557 bufp->can_be_null = 0;
2558
2559 while (p != pend || !FAIL_STACK_EMPTY ())
2560 {
2561 if (p == pend)
2562 {
2563 bufp->can_be_null |= path_can_be_null;
2564
2565 /* Reset for next path. */
2566 path_can_be_null = true;
2567
2568 p = fail_stack.stack[--fail_stack.avail];
2569 }
2570
2571 /* We should never be about to go beyond the end of the pattern. */
2572 assert (p < pend);
2573
2574 #ifdef SWITCH_ENUM_BUG
2575 switch ((int) ((re_opcode_t) *p++))
2576 #else
2577 switch ((re_opcode_t) *p++)
2578 #endif
2579 {
2580
2581 /* I guess the idea here is to simply not bother with a fastmap
2582 if a backreference is used, since it's too hard to figure out
2583 the fastmap for the corresponding group. Setting
2584 `can_be_null' stops `re_search_2' from using the fastmap, so
2585 that is all we do. */
2586 case duplicate:
2587 bufp->can_be_null = 1;
2588 return 0;
2589
2590
2591 /* Following are the cases which match a character. These end
2592 with `break'. */
2593
2594 case exactn:
2595 fastmap[p[1]] = 1;
2596 break;
2597
2598
2599 case charset:
2600 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2601 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2602 fastmap[j] = 1;
2603 break;
2604
2605
2606 case charset_not:
2607 /* Chars beyond end of map must be allowed. */
2608 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2609 fastmap[j] = 1;
2610
2611 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2612 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2613 fastmap[j] = 1;
2614 break;
2615
2616
2617 case wordchar:
2618 for (j = 0; j < (1 << BYTEWIDTH); j++)
2619 if (SYNTAX (j) == Sword)
2620 fastmap[j] = 1;
2621 break;
2622
2623
2624 case notwordchar:
2625 for (j = 0; j < (1 << BYTEWIDTH); j++)
2626 if (SYNTAX (j) != Sword)
2627 fastmap[j] = 1;
2628 break;
2629
2630
2631 case anychar:
2632 /* `.' matches anything ... */
2633 for (j = 0; j < (1 << BYTEWIDTH); j++)
2634 fastmap[j] = 1;
2635
2636 /* ... except perhaps newline. */
2637 if (!(bufp->syntax & RE_DOT_NEWLINE))
2638 fastmap['\n'] = 0;
2639
2640 /* Return if we have already set `can_be_null'; if we have,
2641 then the fastmap is irrelevant. Something's wrong here. */
2642 else if (bufp->can_be_null)
2643 return 0;
2644
2645 /* Otherwise, have to check alternative paths. */
2646 break;
2647
2648
2649 #ifdef emacs
2650 case syntaxspec:
2651 k = *p++;
2652 for (j = 0; j < (1 << BYTEWIDTH); j++)
2653 if (SYNTAX (j) == (enum syntaxcode) k)
2654 fastmap[j] = 1;
2655 break;
2656
2657
2658 case notsyntaxspec:
2659 k = *p++;
2660 for (j = 0; j < (1 << BYTEWIDTH); j++)
2661 if (SYNTAX (j) != (enum syntaxcode) k)
2662 fastmap[j] = 1;
2663 break;
2664
2665
2666 /* All cases after this match the empty string. These end with
2667 `continue'. */
2668
2669
2670 case before_dot:
2671 case at_dot:
2672 case after_dot:
2673 continue;
2674 #endif /* not emacs */
2675
2676
2677 case no_op:
2678 case begline:
2679 case endline:
2680 case begbuf:
2681 case endbuf:
2682 case wordbound:
2683 case notwordbound:
2684 case wordbeg:
2685 case wordend:
2686 case push_dummy_failure:
2687 continue;
2688
2689
2690 case jump_n:
2691 case pop_failure_jump:
2692 case maybe_pop_jump:
2693 case jump:
2694 case jump_past_alt:
2695 case dummy_failure_jump:
2696 EXTRACT_NUMBER_AND_INCR (j, p);
2697 p += j;
2698 if (j > 0)
2699 continue;
2700
2701 /* Jump backward implies we just went through the body of a
2702 loop and matched nothing. Opcode jumped to should be
2703 `on_failure_jump' or `succeed_n'. Just treat it like an
2704 ordinary jump. For a * loop, it has pushed its failure
2705 point already; if so, discard that as redundant. */
2706 if ((re_opcode_t) *p != on_failure_jump
2707 && (re_opcode_t) *p != succeed_n)
2708 continue;
2709
2710 p++;
2711 EXTRACT_NUMBER_AND_INCR (j, p);
2712 p += j;
2713
2714 /* If what's on the stack is where we are now, pop it. */
2715 if (!FAIL_STACK_EMPTY ()
2716 && fail_stack.stack[fail_stack.avail - 1] == p)
2717 fail_stack.avail--;
2718
2719 continue;
2720
2721
2722 case on_failure_jump:
2723 case on_failure_keep_string_jump:
2724 handle_on_failure_jump:
2725 EXTRACT_NUMBER_AND_INCR (j, p);
2726
2727 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2728 end of the pattern. We don't want to push such a point,
2729 since when we restore it above, entering the switch will
2730 increment `p' past the end of the pattern. We don't need
2731 to push such a point since we obviously won't find any more
2732 fastmap entries beyond `pend'. Such a pattern can match
2733 the null string, though. */
2734 if (p + j < pend)
2735 {
2736 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2737 return -2;
2738 }
2739 else
2740 bufp->can_be_null = 1;
2741
2742 if (succeed_n_p)
2743 {
2744 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2745 succeed_n_p = false;
2746 }
2747
2748 continue;
2749
2750
2751 case succeed_n:
2752 /* Get to the number of times to succeed. */
2753 p += 2;
2754
2755 /* Increment p past the n for when k != 0. */
2756 EXTRACT_NUMBER_AND_INCR (k, p);
2757 if (k == 0)
2758 {
2759 p -= 4;
2760 succeed_n_p = true; /* Spaghetti code alert. */
2761 goto handle_on_failure_jump;
2762 }
2763 continue;
2764
2765
2766 case set_number_at:
2767 p += 4;
2768 continue;
2769
2770
2771 case start_memory:
2772 case stop_memory:
2773 p += 2;
2774 continue;
2775
2776
2777 default:
2778 abort (); /* We have listed all the cases. */
2779 } /* switch *p++ */
2780
2781 /* Getting here means we have found the possible starting
2782 characters for one path of the pattern -- and that the empty
2783 string does not match. We need not follow this path further.
2784 Instead, look at the next alternative (remembered on the
2785 stack), or quit if no more. The test at the top of the loop
2786 does these things. */
2787 path_can_be_null = false;
2788 p = pend;
2789 } /* while p */
2790
2791 /* Set `can_be_null' for the last path (also the first path, if the
2792 pattern is empty). */
2793 bufp->can_be_null |= path_can_be_null;
2794 return 0;
2795 } /* re_compile_fastmap */
2796
2797 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2798 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2799 this memory for recording register information. STARTS and ENDS
2800 must be allocated using the malloc library routine, and must each
2801 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2802
2803 If NUM_REGS == 0, then subsequent matches should allocate their own
2804 register data.
2805
2806 Unless this function is called, the first search or match using
2807 PATTERN_BUFFER will allocate its own register data, without
2808 freeing the old data. */
2809
2810 void
re_set_registers(bufp,regs,num_regs,starts,ends)2811 re_set_registers (bufp, regs, num_regs, starts, ends)
2812 struct re_pattern_buffer *bufp;
2813 struct re_registers *regs;
2814 unsigned num_regs;
2815 regoff_t *starts, *ends;
2816 {
2817 if (num_regs)
2818 {
2819 bufp->regs_allocated = REGS_REALLOCATE;
2820 regs->num_regs = num_regs;
2821 regs->start = starts;
2822 regs->end = ends;
2823 }
2824 else
2825 {
2826 bufp->regs_allocated = REGS_UNALLOCATED;
2827 regs->num_regs = 0;
2828 regs->start = regs->end = (regoff_t) 0;
2829 }
2830 }
2831
2832 /* Searching routines. */
2833
2834 /* Like re_search_2, below, but only one string is specified, and
2835 doesn't let you say where to stop matching. */
2836
2837 int
re_search(bufp,string,size,startpos,range,regs)2838 re_search (bufp, string, size, startpos, range, regs)
2839 struct re_pattern_buffer *bufp;
2840 const char *string;
2841 int size, startpos, range;
2842 struct re_registers *regs;
2843 {
2844 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
2845 regs, size);
2846 }
2847
2848
2849 /* Using the compiled pattern in BUFP->buffer, first tries to match the
2850 virtual concatenation of STRING1 and STRING2, starting first at index
2851 STARTPOS, then at STARTPOS + 1, and so on.
2852
2853 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2854
2855 RANGE is how far to scan while trying to match. RANGE = 0 means try
2856 only at STARTPOS; in general, the last start tried is STARTPOS +
2857 RANGE.
2858
2859 In REGS, return the indices of the virtual concatenation of STRING1
2860 and STRING2 that matched the entire BUFP->buffer and its contained
2861 subexpressions.
2862
2863 Do not consider matching one past the index STOP in the virtual
2864 concatenation of STRING1 and STRING2.
2865
2866 We return either the position in the strings at which the match was
2867 found, -1 if no match, or -2 if error (such as failure
2868 stack overflow). */
2869
2870 int
re_search_2(bufp,string1,size1,string2,size2,startpos,range,regs,stop)2871 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
2872 struct re_pattern_buffer *bufp;
2873 const char *string1, *string2;
2874 int size1, size2;
2875 int startpos;
2876 int range;
2877 struct re_registers *regs;
2878 int stop;
2879 {
2880 int val;
2881 register char *fastmap = bufp->fastmap;
2882 register char *translate = bufp->translate;
2883 int total_size = size1 + size2;
2884 int endpos = startpos + range;
2885
2886 /* Check for out-of-range STARTPOS. */
2887 if (startpos < 0 || startpos > total_size)
2888 return -1;
2889
2890 /* Fix up RANGE if it might eventually take us outside
2891 the virtual concatenation of STRING1 and STRING2. */
2892 if (endpos < -1)
2893 range = -1 - startpos;
2894 else if (endpos > total_size)
2895 range = total_size - startpos;
2896
2897 /* If the search isn't to be a backwards one, don't waste time in a
2898 search for a pattern that must be anchored. */
2899 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2900 {
2901 if (startpos > 0)
2902 return -1;
2903 else
2904 range = 1;
2905 }
2906
2907 /* Update the fastmap now if not correct already. */
2908 if (fastmap && !bufp->fastmap_accurate)
2909 if (re_compile_fastmap (bufp) == -2)
2910 return -2;
2911
2912 /* Loop through the string, looking for a place to start matching. */
2913 for (;;)
2914 {
2915 /* If a fastmap is supplied, skip quickly over characters that
2916 cannot be the start of a match. If the pattern can match the
2917 null string, however, we don't need to skip characters; we want
2918 the first null string. */
2919 if (fastmap && startpos < total_size && !bufp->can_be_null)
2920 {
2921 if (range > 0) /* Searching forwards. */
2922 {
2923 register const char *d;
2924 register int lim = 0;
2925 int irange = range;
2926
2927 if (startpos < size1 && startpos + range >= size1)
2928 lim = range - (size1 - startpos);
2929
2930 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
2931
2932 /* Written out as an if-else to avoid testing `translate'
2933 inside the loop. */
2934 if (translate)
2935 while (range > lim
2936 && !fastmap[(unsigned char)
2937 translate[(unsigned char) *d++]])
2938 range--;
2939 else
2940 while (range > lim && !fastmap[(unsigned char) *d++])
2941 range--;
2942
2943 startpos += irange - range;
2944 }
2945 else /* Searching backwards. */
2946 {
2947 register char c = (size1 == 0 || startpos >= size1
2948 ? string2[startpos - size1]
2949 : string1[startpos]);
2950
2951 if (!fastmap[(unsigned char) TRANSLATE (c)])
2952 goto advance;
2953 }
2954 }
2955
2956 /* If can't match the null string, and that's all we have left, fail. */
2957 if (range >= 0 && startpos == total_size && fastmap
2958 && !bufp->can_be_null)
2959 return -1;
2960
2961 val = re_match_2 (bufp, string1, size1, string2, size2,
2962 startpos, regs, stop);
2963 if (val >= 0)
2964 return startpos;
2965
2966 if (val == -2)
2967 return -2;
2968
2969 advance:
2970 if (!range)
2971 break;
2972 else if (range > 0)
2973 {
2974 range--;
2975 startpos++;
2976 }
2977 else
2978 {
2979 range++;
2980 startpos--;
2981 }
2982 }
2983 return -1;
2984 } /* re_search_2 */
2985
2986 /* Declarations and macros for re_match_2. */
2987
2988 static int bcmp_translate ();
2989 static boolean alt_match_null_string_p (),
2990 common_op_match_null_string_p (),
2991 group_match_null_string_p ();
2992
2993 /* Structure for per-register (a.k.a. per-group) information.
2994 This must not be longer than one word, because we push this value
2995 onto the failure stack. Other register information, such as the
2996 starting and ending positions (which are addresses), and the list of
2997 inner groups (which is a bits list) are maintained in separate
2998 variables.
2999
3000 We are making a (strictly speaking) nonportable assumption here: that
3001 the compiler will pack our bit fields into something that fits into
3002 the type of `word', i.e., is something that fits into one item on the
3003 failure stack. */
3004 typedef union
3005 {
3006 fail_stack_elt_t word;
3007 struct
3008 {
3009 /* This field is one if this group can match the empty string,
3010 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
3011 #define MATCH_NULL_UNSET_VALUE 3
3012 unsigned match_null_string_p : 2;
3013 unsigned is_active : 1;
3014 unsigned matched_something : 1;
3015 unsigned ever_matched_something : 1;
3016 } bits;
3017 } register_info_type;
3018
3019 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
3020 #define IS_ACTIVE(R) ((R).bits.is_active)
3021 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3022 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
3023
3024
3025 /* Call this when have matched a real character; it sets `matched' flags
3026 for the subexpressions which we are currently inside. Also records
3027 that those subexprs have matched. */
3028 #define SET_REGS_MATCHED() \
3029 do \
3030 { \
3031 unsigned r; \
3032 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3033 { \
3034 MATCHED_SOMETHING (reg_info[r]) \
3035 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3036 = 1; \
3037 } \
3038 } \
3039 while (0)
3040
3041
3042 /* This converts PTR, a pointer into one of the search strings `string1'
3043 and `string2' into an offset from the beginning of that string. */
3044 #define POINTER_TO_OFFSET(ptr) \
3045 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3046
3047 /* Registers are set to a sentinel when they haven't yet matched. */
3048 #define REG_UNSET_VALUE ((char *) -1)
3049 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3050
3051
3052 /* Macros for dealing with the split strings in re_match_2. */
3053
3054 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3055
3056 /* Call before fetching a character with *d. This switches over to
3057 string2 if necessary. */
3058 #define PREFETCH() \
3059 while (d == dend) \
3060 { \
3061 /* End of string2 => fail. */ \
3062 if (dend == end_match_2) \
3063 goto fail; \
3064 /* End of string1 => advance to string2. */ \
3065 d = string2; \
3066 dend = end_match_2; \
3067 }
3068
3069
3070 /* Test if at very beginning or at very end of the virtual concatenation
3071 of `string1' and `string2'. If only one string, it's `string2'. */
3072 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3073 #define AT_STRINGS_END(d) ((d) == end2)
3074
3075
3076 /* Test if D points to a character which is word-constituent. We have
3077 two special cases to check for: if past the end of string1, look at
3078 the first character in string2; and if before the beginning of
3079 string2, look at the last character in string1. */
3080 #define WORDCHAR_P(d) \
3081 (SYNTAX ((d) == end1 ? *string2 \
3082 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3083 == Sword)
3084
3085 /* Test if the character before D and the one at D differ with respect
3086 to being word-constituent. */
3087 #define AT_WORD_BOUNDARY(d) \
3088 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3089 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3090
3091
3092 /* Free everything we malloc. */
3093 #ifdef REGEX_MALLOC
3094 #define FREE_VAR(var) if (var) free (var); var = NULL
3095 #define FREE_VARIABLES() \
3096 do { \
3097 FREE_VAR (fail_stack.stack); \
3098 FREE_VAR (regstart); \
3099 FREE_VAR (regend); \
3100 FREE_VAR (old_regstart); \
3101 FREE_VAR (old_regend); \
3102 FREE_VAR (best_regstart); \
3103 FREE_VAR (best_regend); \
3104 FREE_VAR (reg_info); \
3105 FREE_VAR (reg_dummy); \
3106 FREE_VAR (reg_info_dummy); \
3107 } while (0)
3108 #else /* not REGEX_MALLOC */
3109 /* Some MIPS systems (at least) want this to free alloca'd storage. */
3110 #define FREE_VARIABLES() alloca (0)
3111 #endif /* not REGEX_MALLOC */
3112
3113
3114 /* These values must meet several constraints. They must not be valid
3115 register values; since we have a limit of 255 registers (because
3116 we use only one byte in the pattern for the register number), we can
3117 use numbers larger than 255. They must differ by 1, because of
3118 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3119 be larger than the value for the highest register, so we do not try
3120 to actually save any registers when none are active. */
3121 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3122 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3123
3124 /* Matching routines. */
3125
3126 #ifndef emacs /* Emacs never uses this. */
3127 /* re_match is like re_match_2 except it takes only a single string. */
3128
3129 int
re_match(bufp,string,size,pos,regs)3130 re_match (bufp, string, size, pos, regs)
3131 struct re_pattern_buffer *bufp;
3132 const char *string;
3133 int size, pos;
3134 struct re_registers *regs;
3135 {
3136 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
3137 }
3138 #endif /* not emacs */
3139
3140
3141 /* re_match_2 matches the compiled pattern in BUFP against the
3142 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3143 and SIZE2, respectively). We start matching at POS, and stop
3144 matching at STOP.
3145
3146 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3147 store offsets for the substring each group matched in REGS. See the
3148 documentation for exactly how many groups we fill.
3149
3150 We return -1 if no match, -2 if an internal error (such as the
3151 failure stack overflowing). Otherwise, we return the length of the
3152 matched substring. */
3153
3154 int
re_match_2(bufp,string1,size1,string2,size2,pos,regs,stop)3155 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3156 struct re_pattern_buffer *bufp;
3157 const char *string1, *string2;
3158 int size1, size2;
3159 int pos;
3160 struct re_registers *regs;
3161 int stop;
3162 {
3163 /* General temporaries. */
3164 int mcnt;
3165 unsigned char *p1;
3166
3167 /* Just past the end of the corresponding string. */
3168 const char *end1, *end2;
3169
3170 /* Pointers into string1 and string2, just past the last characters in
3171 each to consider matching. */
3172 const char *end_match_1, *end_match_2;
3173
3174 /* Where we are in the data, and the end of the current string. */
3175 const char *d, *dend;
3176
3177 /* Where we are in the pattern, and the end of the pattern. */
3178 unsigned char *p = bufp->buffer;
3179 register unsigned char *pend = p + bufp->used;
3180
3181 /* We use this to map every character in the string. */
3182 char *translate = bufp->translate;
3183
3184 /* Failure point stack. Each place that can handle a failure further
3185 down the line pushes a failure point on this stack. It consists of
3186 restart, regend, and reg_info for all registers corresponding to
3187 the subexpressions we're currently inside, plus the number of such
3188 registers, and, finally, two char *'s. The first char * is where
3189 to resume scanning the pattern; the second one is where to resume
3190 scanning the strings. If the latter is zero, the failure point is
3191 a ``dummy''; if a failure happens and the failure point is a dummy,
3192 it gets discarded and the next next one is tried. */
3193 fail_stack_type fail_stack;
3194 #ifdef DEBUG
3195 static unsigned failure_id = 0;
3196 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3197 #endif
3198
3199 /* We fill all the registers internally, independent of what we
3200 return, for use in backreferences. The number here includes
3201 an element for register zero. */
3202 unsigned num_regs = bufp->re_nsub + 1;
3203
3204 /* The currently active registers. */
3205 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3206 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3207
3208 /* Information on the contents of registers. These are pointers into
3209 the input strings; they record just what was matched (on this
3210 attempt) by a subexpression part of the pattern, that is, the
3211 regnum-th regstart pointer points to where in the pattern we began
3212 matching and the regnum-th regend points to right after where we
3213 stopped matching the regnum-th subexpression. (The zeroth register
3214 keeps track of what the whole pattern matches.) */
3215 const char **regstart, **regend;
3216
3217 /* If a group that's operated upon by a repetition operator fails to
3218 match anything, then the register for its start will need to be
3219 restored because it will have been set to wherever in the string we
3220 are when we last see its open-group operator. Similarly for a
3221 register's end. */
3222 const char **old_regstart, **old_regend;
3223
3224 /* The is_active field of reg_info helps us keep track of which (possibly
3225 nested) subexpressions we are currently in. The matched_something
3226 field of reg_info[reg_num] helps us tell whether or not we have
3227 matched any of the pattern so far this time through the reg_num-th
3228 subexpression. These two fields get reset each time through any
3229 loop their register is in. */
3230 register_info_type *reg_info;
3231
3232 /* The following record the register info as found in the above
3233 variables when we find a match better than any we've seen before.
3234 This happens as we backtrack through the failure points, which in
3235 turn happens only if we have not yet matched the entire string. */
3236 unsigned best_regs_set = false;
3237 const char **best_regstart, **best_regend;
3238
3239 /* Logically, this is `best_regend[0]'. But we don't want to have to
3240 allocate space for that if we're not allocating space for anything
3241 else (see below). Also, we never need info about register 0 for
3242 any of the other register vectors, and it seems rather a kludge to
3243 treat `best_regend' differently than the rest. So we keep track of
3244 the end of the best match so far in a separate variable. We
3245 initialize this to NULL so that when we backtrack the first time
3246 and need to test it, it's not garbage. */
3247 const char *match_end = NULL;
3248
3249 /* Used when we pop values we don't care about. */
3250 const char **reg_dummy;
3251 register_info_type *reg_info_dummy;
3252
3253 #ifdef DEBUG
3254 /* Counts the total number of registers pushed. */
3255 unsigned num_regs_pushed = 0;
3256 #endif
3257
3258 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3259
3260 INIT_FAIL_STACK ();
3261
3262 /* Do not bother to initialize all the register variables if there are
3263 no groups in the pattern, as it takes a fair amount of time. If
3264 there are groups, we include space for register 0 (the whole
3265 pattern), even though we never use it, since it simplifies the
3266 array indexing. We should fix this. */
3267 if (bufp->re_nsub)
3268 {
3269 regstart = REGEX_TALLOC (num_regs, const char *);
3270 regend = REGEX_TALLOC (num_regs, const char *);
3271 old_regstart = REGEX_TALLOC (num_regs, const char *);
3272 old_regend = REGEX_TALLOC (num_regs, const char *);
3273 best_regstart = REGEX_TALLOC (num_regs, const char *);
3274 best_regend = REGEX_TALLOC (num_regs, const char *);
3275 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3276 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3277 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3278
3279 if (!(regstart && regend && old_regstart && old_regend && reg_info
3280 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3281 {
3282 FREE_VARIABLES ();
3283 return -2;
3284 }
3285 }
3286 #ifdef REGEX_MALLOC
3287 else
3288 {
3289 /* We must initialize all our variables to NULL, so that
3290 `FREE_VARIABLES' doesn't try to free them. */
3291 regstart = regend = old_regstart = old_regend = best_regstart
3292 = best_regend = reg_dummy = NULL;
3293 reg_info = reg_info_dummy = (register_info_type *) NULL;
3294 }
3295 #endif /* REGEX_MALLOC */
3296
3297 /* The starting position is bogus. */
3298 if (pos < 0 || pos > size1 + size2)
3299 {
3300 FREE_VARIABLES ();
3301 return -1;
3302 }
3303
3304 /* Initialize subexpression text positions to -1 to mark ones that no
3305 start_memory/stop_memory has been seen for. Also initialize the
3306 register information struct. */
3307 for (mcnt = 1; mcnt < num_regs; mcnt++)
3308 {
3309 regstart[mcnt] = regend[mcnt]
3310 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3311
3312 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3313 IS_ACTIVE (reg_info[mcnt]) = 0;
3314 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3315 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3316 }
3317
3318 /* We move `string1' into `string2' if the latter's empty -- but not if
3319 `string1' is null. */
3320 if (size2 == 0 && string1 != NULL)
3321 {
3322 string2 = string1;
3323 size2 = size1;
3324 string1 = 0;
3325 size1 = 0;
3326 }
3327 end1 = string1 + size1;
3328 end2 = string2 + size2;
3329
3330 /* Compute where to stop matching, within the two strings. */
3331 if (stop <= size1)
3332 {
3333 end_match_1 = string1 + stop;
3334 end_match_2 = string2;
3335 }
3336 else
3337 {
3338 end_match_1 = end1;
3339 end_match_2 = string2 + stop - size1;
3340 }
3341
3342 /* `p' scans through the pattern as `d' scans through the data.
3343 `dend' is the end of the input string that `d' points within. `d'
3344 is advanced into the following input string whenever necessary, but
3345 this happens before fetching; therefore, at the beginning of the
3346 loop, `d' can be pointing at the end of a string, but it cannot
3347 equal `string2'. */
3348 if (size1 > 0 && pos <= size1)
3349 {
3350 d = string1 + pos;
3351 dend = end_match_1;
3352 }
3353 else
3354 {
3355 d = string2 + pos - size1;
3356 dend = end_match_2;
3357 }
3358
3359 DEBUG_PRINT1 ("The compiled pattern is: ");
3360 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3361 DEBUG_PRINT1 ("The string to match is: `");
3362 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3363 DEBUG_PRINT1 ("'\n");
3364
3365 /* This loops over pattern commands. It exits by returning from the
3366 function if the match is complete, or it drops through if the match
3367 fails at this starting point in the input data. */
3368 for (;;)
3369 {
3370 DEBUG_PRINT2 ("\n0x%x: ", p);
3371
3372 if (p == pend)
3373 { /* End of pattern means we might have succeeded. */
3374 DEBUG_PRINT1 ("end of pattern ... ");
3375
3376 /* If we haven't matched the entire string, and we want the
3377 longest match, try backtracking. */
3378 if (d != end_match_2)
3379 {
3380 DEBUG_PRINT1 ("backtracking.\n");
3381
3382 if (!FAIL_STACK_EMPTY ())
3383 { /* More failure points to try. */
3384 boolean same_str_p = (FIRST_STRING_P (match_end)
3385 == MATCHING_IN_FIRST_STRING);
3386
3387 /* If exceeds best match so far, save it. */
3388 if (!best_regs_set
3389 || (same_str_p && d > match_end)
3390 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3391 {
3392 best_regs_set = true;
3393 match_end = d;
3394
3395 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3396
3397 for (mcnt = 1; mcnt < num_regs; mcnt++)
3398 {
3399 best_regstart[mcnt] = regstart[mcnt];
3400 best_regend[mcnt] = regend[mcnt];
3401 }
3402 }
3403 goto fail;
3404 }
3405
3406 /* If no failure points, don't restore garbage. */
3407 else if (best_regs_set)
3408 {
3409 restore_best_regs:
3410 /* Restore best match. It may happen that `dend ==
3411 end_match_1' while the restored d is in string2.
3412 For example, the pattern `x.*y.*z' against the
3413 strings `x-' and `y-z-', if the two strings are
3414 not consecutive in memory. */
3415 DEBUG_PRINT1 ("Restoring best registers.\n");
3416
3417 d = match_end;
3418 dend = ((d >= string1 && d <= end1)
3419 ? end_match_1 : end_match_2);
3420
3421 for (mcnt = 1; mcnt < num_regs; mcnt++)
3422 {
3423 regstart[mcnt] = best_regstart[mcnt];
3424 regend[mcnt] = best_regend[mcnt];
3425 }
3426 }
3427 } /* d != end_match_2 */
3428
3429 DEBUG_PRINT1 ("Accepting match.\n");
3430
3431 /* If caller wants register contents data back, do it. */
3432 if (regs && !bufp->no_sub)
3433 {
3434 /* Have the register data arrays been allocated? */
3435 if (bufp->regs_allocated == REGS_UNALLOCATED)
3436 { /* No. So allocate them with malloc. We need one
3437 extra element beyond `num_regs' for the `-1' marker
3438 GNU code uses. */
3439 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3440 regs->start = TALLOC (regs->num_regs, regoff_t);
3441 regs->end = TALLOC (regs->num_regs, regoff_t);
3442 if (regs->start == NULL || regs->end == NULL)
3443 return -2;
3444 bufp->regs_allocated = REGS_REALLOCATE;
3445 }
3446 else if (bufp->regs_allocated == REGS_REALLOCATE)
3447 { /* Yes. If we need more elements than were already
3448 allocated, reallocate them. If we need fewer, just
3449 leave it alone. */
3450 if (regs->num_regs < num_regs + 1)
3451 {
3452 regs->num_regs = num_regs + 1;
3453 RETALLOC (regs->start, regs->num_regs, regoff_t);
3454 RETALLOC (regs->end, regs->num_regs, regoff_t);
3455 if (regs->start == NULL || regs->end == NULL)
3456 return -2;
3457 }
3458 }
3459 else
3460 assert (bufp->regs_allocated == REGS_FIXED);
3461
3462 /* Convert the pointer data in `regstart' and `regend' to
3463 indices. Register zero has to be set differently,
3464 since we haven't kept track of any info for it. */
3465 if (regs->num_regs > 0)
3466 {
3467 regs->start[0] = pos;
3468 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1
3469 : d - string2 + size1);
3470 }
3471
3472 /* Go through the first `min (num_regs, regs->num_regs)'
3473 registers, since that is all we initialized. */
3474 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3475 {
3476 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3477 regs->start[mcnt] = regs->end[mcnt] = -1;
3478 else
3479 {
3480 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]);
3481 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]);
3482 }
3483 }
3484
3485 /* If the regs structure we return has more elements than
3486 were in the pattern, set the extra elements to -1. If
3487 we (re)allocated the registers, this is the case,
3488 because we always allocate enough to have at least one
3489 -1 at the end. */
3490 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3491 regs->start[mcnt] = regs->end[mcnt] = -1;
3492 } /* regs && !bufp->no_sub */
3493
3494 FREE_VARIABLES ();
3495 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3496 nfailure_points_pushed, nfailure_points_popped,
3497 nfailure_points_pushed - nfailure_points_popped);
3498 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3499
3500 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3501 ? string1
3502 : string2 - size1);
3503
3504 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3505
3506 return mcnt;
3507 }
3508
3509 /* Otherwise match next pattern command. */
3510 #ifdef SWITCH_ENUM_BUG
3511 switch ((int) ((re_opcode_t) *p++))
3512 #else
3513 switch ((re_opcode_t) *p++)
3514 #endif
3515 {
3516 /* Ignore these. Used to ignore the n of succeed_n's which
3517 currently have n == 0. */
3518 case no_op:
3519 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3520 break;
3521
3522
3523 /* Match the next n pattern characters exactly. The following
3524 byte in the pattern defines n, and the n bytes after that
3525 are the characters to match. */
3526 case exactn:
3527 mcnt = *p++;
3528 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3529
3530 /* This is written out as an if-else so we don't waste time
3531 testing `translate' inside the loop. */
3532 if (translate)
3533 {
3534 do
3535 {
3536 PREFETCH ();
3537 if (translate[(unsigned char) *d++] != (char) *p++)
3538 goto fail;
3539 }
3540 while (--mcnt);
3541 }
3542 else
3543 {
3544 do
3545 {
3546 PREFETCH ();
3547 if (*d++ != (char) *p++) goto fail;
3548 }
3549 while (--mcnt);
3550 }
3551 SET_REGS_MATCHED ();
3552 break;
3553
3554
3555 /* Match any character except possibly a newline or a null. */
3556 case anychar:
3557 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3558
3559 PREFETCH ();
3560
3561 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3562 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3563 goto fail;
3564
3565 SET_REGS_MATCHED ();
3566 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3567 d++;
3568 break;
3569
3570
3571 case charset:
3572 case charset_not:
3573 {
3574 register unsigned char c;
3575 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3576
3577 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3578
3579 PREFETCH ();
3580 c = TRANSLATE (*d); /* The character to match. */
3581
3582 /* Cast to `unsigned' instead of `unsigned char' in case the
3583 bit list is a full 32 bytes long. */
3584 if (c < (unsigned) (*p * BYTEWIDTH)
3585 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3586 not = !not;
3587
3588 p += 1 + *p;
3589
3590 if (!not) goto fail;
3591
3592 SET_REGS_MATCHED ();
3593 d++;
3594 break;
3595 }
3596
3597
3598 /* The beginning of a group is represented by start_memory.
3599 The arguments are the register number in the next byte, and the
3600 number of groups inner to this one in the next. The text
3601 matched within the group is recorded (in the internal
3602 registers data structure) under the register number. */
3603 case start_memory:
3604 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3605
3606 /* Find out if this group can match the empty string. */
3607 p1 = p; /* To send to group_match_null_string_p. */
3608
3609 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3610 REG_MATCH_NULL_STRING_P (reg_info[*p])
3611 = group_match_null_string_p (&p1, pend, reg_info);
3612
3613 /* Save the position in the string where we were the last time
3614 we were at this open-group operator in case the group is
3615 operated upon by a repetition operator, e.g., with `(a*)*b'
3616 against `ab'; then we want to ignore where we are now in
3617 the string in case this attempt to match fails. */
3618 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3619 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3620 : regstart[*p];
3621 DEBUG_PRINT2 (" old_regstart: %d\n",
3622 POINTER_TO_OFFSET (old_regstart[*p]));
3623
3624 regstart[*p] = d;
3625 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3626
3627 IS_ACTIVE (reg_info[*p]) = 1;
3628 MATCHED_SOMETHING (reg_info[*p]) = 0;
3629
3630 /* This is the new highest active register. */
3631 highest_active_reg = *p;
3632
3633 /* If nothing was active before, this is the new lowest active
3634 register. */
3635 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3636 lowest_active_reg = *p;
3637
3638 /* Move past the register number and inner group count. */
3639 p += 2;
3640 break;
3641
3642
3643 /* The stop_memory opcode represents the end of a group. Its
3644 arguments are the same as start_memory's: the register
3645 number, and the number of inner groups. */
3646 case stop_memory:
3647 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3648
3649 /* We need to save the string position the last time we were at
3650 this close-group operator in case the group is operated
3651 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3652 against `aba'; then we want to ignore where we are now in
3653 the string in case this attempt to match fails. */
3654 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3655 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3656 : regend[*p];
3657 DEBUG_PRINT2 (" old_regend: %d\n",
3658 POINTER_TO_OFFSET (old_regend[*p]));
3659
3660 regend[*p] = d;
3661 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3662
3663 /* This register isn't active anymore. */
3664 IS_ACTIVE (reg_info[*p]) = 0;
3665
3666 /* If this was the only register active, nothing is active
3667 anymore. */
3668 if (lowest_active_reg == highest_active_reg)
3669 {
3670 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3671 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3672 }
3673 else
3674 { /* We must scan for the new highest active register, since
3675 it isn't necessarily one less than now: consider
3676 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3677 new highest active register is 1. */
3678 unsigned char r = *p - 1;
3679 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3680 r--;
3681
3682 /* If we end up at register zero, that means that we saved
3683 the registers as the result of an `on_failure_jump', not
3684 a `start_memory', and we jumped to past the innermost
3685 `stop_memory'. For example, in ((.)*) we save
3686 registers 1 and 2 as a result of the *, but when we pop
3687 back to the second ), we are at the stop_memory 1.
3688 Thus, nothing is active. */
3689 if (r == 0)
3690 {
3691 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3692 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3693 }
3694 else
3695 highest_active_reg = r;
3696 }
3697
3698 /* If just failed to match something this time around with a
3699 group that's operated on by a repetition operator, try to
3700 force exit from the ``loop'', and restore the register
3701 information for this group that we had before trying this
3702 last match. */
3703 if ((!MATCHED_SOMETHING (reg_info[*p])
3704 || (re_opcode_t) p[-3] == start_memory)
3705 && (p + 2) < pend)
3706 {
3707 boolean is_a_jump_n = false;
3708
3709 p1 = p + 2;
3710 mcnt = 0;
3711 switch ((re_opcode_t) *p1++)
3712 {
3713 case jump_n:
3714 is_a_jump_n = true;
3715 case pop_failure_jump:
3716 case maybe_pop_jump:
3717 case jump:
3718 case dummy_failure_jump:
3719 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3720 if (is_a_jump_n)
3721 p1 += 2;
3722 break;
3723
3724 default:
3725 /* do nothing */ ;
3726 }
3727 p1 += mcnt;
3728
3729 /* If the next operation is a jump backwards in the pattern
3730 to an on_failure_jump right before the start_memory
3731 corresponding to this stop_memory, exit from the loop
3732 by forcing a failure after pushing on the stack the
3733 on_failure_jump's jump in the pattern, and d. */
3734 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3735 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3736 {
3737 /* If this group ever matched anything, then restore
3738 what its registers were before trying this last
3739 failed match, e.g., with `(a*)*b' against `ab' for
3740 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3741 against `aba' for regend[3].
3742
3743 Also restore the registers for inner groups for,
3744 e.g., `((a*)(b*))*' against `aba' (register 3 would
3745 otherwise get trashed). */
3746
3747 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3748 {
3749 unsigned r;
3750
3751 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3752
3753 /* Restore this and inner groups' (if any) registers. */
3754 for (r = *p; r < *p + *(p + 1); r++)
3755 {
3756 regstart[r] = old_regstart[r];
3757
3758 /* xx why this test? */
3759 if ((int) old_regend[r] >= (int) regstart[r])
3760 regend[r] = old_regend[r];
3761 }
3762 }
3763 p1++;
3764 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3765 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3766
3767 goto fail;
3768 }
3769 }
3770
3771 /* Move past the register number and the inner group count. */
3772 p += 2;
3773 break;
3774
3775
3776 /* \<digit> has been turned into a `duplicate' command which is
3777 followed by the numeric value of <digit> as the register number. */
3778 case duplicate:
3779 {
3780 register const char *d2, *dend2;
3781 int regno = *p++; /* Get which register to match against. */
3782 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3783
3784 /* Can't back reference a group which we've never matched. */
3785 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3786 goto fail;
3787
3788 /* Where in input to try to start matching. */
3789 d2 = regstart[regno];
3790
3791 /* Where to stop matching; if both the place to start and
3792 the place to stop matching are in the same string, then
3793 set to the place to stop, otherwise, for now have to use
3794 the end of the first string. */
3795
3796 dend2 = ((FIRST_STRING_P (regstart[regno])
3797 == FIRST_STRING_P (regend[regno]))
3798 ? regend[regno] : end_match_1);
3799 for (;;)
3800 {
3801 /* If necessary, advance to next segment in register
3802 contents. */
3803 while (d2 == dend2)
3804 {
3805 if (dend2 == end_match_2) break;
3806 if (dend2 == regend[regno]) break;
3807
3808 /* End of string1 => advance to string2. */
3809 d2 = string2;
3810 dend2 = regend[regno];
3811 }
3812 /* At end of register contents => success */
3813 if (d2 == dend2) break;
3814
3815 /* If necessary, advance to next segment in data. */
3816 PREFETCH ();
3817
3818 /* How many characters left in this segment to match. */
3819 mcnt = dend - d;
3820
3821 /* Want how many consecutive characters we can match in
3822 one shot, so, if necessary, adjust the count. */
3823 if (mcnt > dend2 - d2)
3824 mcnt = dend2 - d2;
3825
3826 /* Compare that many; failure if mismatch, else move
3827 past them. */
3828 if (translate
3829 ? bcmp_translate (d, d2, mcnt, translate)
3830 : bcmp (d, d2, mcnt))
3831 goto fail;
3832 d += mcnt, d2 += mcnt;
3833 }
3834 }
3835 break;
3836
3837
3838 /* begline matches the empty string at the beginning of the string
3839 (unless `not_bol' is set in `bufp'), and, if
3840 `newline_anchor' is set, after newlines. */
3841 case begline:
3842 DEBUG_PRINT1 ("EXECUTING begline.\n");
3843
3844 if (AT_STRINGS_BEG (d))
3845 {
3846 if (!bufp->not_bol) break;
3847 }
3848 else if (d[-1] == '\n' && bufp->newline_anchor)
3849 {
3850 break;
3851 }
3852 /* In all other cases, we fail. */
3853 goto fail;
3854
3855
3856 /* endline is the dual of begline. */
3857 case endline:
3858 DEBUG_PRINT1 ("EXECUTING endline.\n");
3859
3860 if (AT_STRINGS_END (d))
3861 {
3862 if (!bufp->not_eol) break;
3863 }
3864
3865 /* We have to ``prefetch'' the next character. */
3866 else if ((d == end1 ? *string2 : *d) == '\n'
3867 && bufp->newline_anchor)
3868 {
3869 break;
3870 }
3871 goto fail;
3872
3873
3874 /* Match at the very beginning of the data. */
3875 case begbuf:
3876 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3877 if (AT_STRINGS_BEG (d))
3878 break;
3879 goto fail;
3880
3881
3882 /* Match at the very end of the data. */
3883 case endbuf:
3884 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3885 if (AT_STRINGS_END (d))
3886 break;
3887 goto fail;
3888
3889
3890 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3891 pushes NULL as the value for the string on the stack. Then
3892 `pop_failure_point' will keep the current value for the
3893 string, instead of restoring it. To see why, consider
3894 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3895 then the . fails against the \n. But the next thing we want
3896 to do is match the \n against the \n; if we restored the
3897 string value, we would be back at the foo.
3898
3899 Because this is used only in specific cases, we don't need to
3900 check all the things that `on_failure_jump' does, to make
3901 sure the right things get saved on the stack. Hence we don't
3902 share its code. The only reason to push anything on the
3903 stack at all is that otherwise we would have to change
3904 `anychar's code to do something besides goto fail in this
3905 case; that seems worse than this. */
3906 case on_failure_keep_string_jump:
3907 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
3908
3909 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3910 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
3911
3912 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
3913 break;
3914
3915
3916 /* Uses of on_failure_jump:
3917
3918 Each alternative starts with an on_failure_jump that points
3919 to the beginning of the next alternative. Each alternative
3920 except the last ends with a jump that in effect jumps past
3921 the rest of the alternatives. (They really jump to the
3922 ending jump of the following alternative, because tensioning
3923 these jumps is a hassle.)
3924
3925 Repeats start with an on_failure_jump that points past both
3926 the repetition text and either the following jump or
3927 pop_failure_jump back to this on_failure_jump. */
3928 case on_failure_jump:
3929 on_failure:
3930 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
3931
3932 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3933 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
3934
3935 /* If this on_failure_jump comes right before a group (i.e.,
3936 the original * applied to a group), save the information
3937 for that group and all inner ones, so that if we fail back
3938 to this point, the group's information will be correct.
3939 For example, in \(a*\)*\1, we need the preceding group,
3940 and in \(\(a*\)b*\)\2, we need the inner group. */
3941
3942 /* We can't use `p' to check ahead because we push
3943 a failure point to `p + mcnt' after we do this. */
3944 p1 = p;
3945
3946 /* We need to skip no_op's before we look for the
3947 start_memory in case this on_failure_jump is happening as
3948 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
3949 against aba. */
3950 while (p1 < pend && (re_opcode_t) *p1 == no_op)
3951 p1++;
3952
3953 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
3954 {
3955 /* We have a new highest active register now. This will
3956 get reset at the start_memory we are about to get to,
3957 but we will have saved all the registers relevant to
3958 this repetition op, as described above. */
3959 highest_active_reg = *(p1 + 1) + *(p1 + 2);
3960 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3961 lowest_active_reg = *(p1 + 1);
3962 }
3963
3964 DEBUG_PRINT1 (":\n");
3965 PUSH_FAILURE_POINT (p + mcnt, d, -2);
3966 break;
3967
3968
3969 /* A smart repeat ends with `maybe_pop_jump'.
3970 We change it to either `pop_failure_jump' or `jump'. */
3971 case maybe_pop_jump:
3972 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3973 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
3974 {
3975 register unsigned char *p2 = p;
3976
3977 /* Compare the beginning of the repeat with what in the
3978 pattern follows its end. If we can establish that there
3979 is nothing that they would both match, i.e., that we
3980 would have to backtrack because of (as in, e.g., `a*a')
3981 then we can change to pop_failure_jump, because we'll
3982 never have to backtrack.
3983
3984 This is not true in the case of alternatives: in
3985 `(a|ab)*' we do need to backtrack to the `ab' alternative
3986 (e.g., if the string was `ab'). But instead of trying to
3987 detect that here, the alternative has put on a dummy
3988 failure point which is what we will end up popping. */
3989
3990 /* Skip over open/close-group commands. */
3991 while (p2 + 2 < pend
3992 && ((re_opcode_t) *p2 == stop_memory
3993 || (re_opcode_t) *p2 == start_memory))
3994 p2 += 3; /* Skip over args, too. */
3995
3996 /* If we're at the end of the pattern, we can change. */
3997 if (p2 == pend)
3998 {
3999 /* Consider what happens when matching ":\(.*\)"
4000 against ":/". I don't really understand this code
4001 yet. */
4002 p[-3] = (unsigned char) pop_failure_jump;
4003 DEBUG_PRINT1
4004 (" End of pattern: change to `pop_failure_jump'.\n");
4005 }
4006
4007 else if ((re_opcode_t) *p2 == exactn
4008 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4009 {
4010 register unsigned char c
4011 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4012 p1 = p + mcnt;
4013
4014 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4015 to the `maybe_finalize_jump' of this case. Examine what
4016 follows. */
4017 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4018 {
4019 p[-3] = (unsigned char) pop_failure_jump;
4020 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4021 c, p1[5]);
4022 }
4023
4024 else if ((re_opcode_t) p1[3] == charset
4025 || (re_opcode_t) p1[3] == charset_not)
4026 {
4027 int not = (re_opcode_t) p1[3] == charset_not;
4028
4029 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4030 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4031 not = !not;
4032
4033 /* `not' is equal to 1 if c would match, which means
4034 that we can't change to pop_failure_jump. */
4035 if (!not)
4036 {
4037 p[-3] = (unsigned char) pop_failure_jump;
4038 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4039 }
4040 }
4041 }
4042 }
4043 p -= 2; /* Point at relative address again. */
4044 if ((re_opcode_t) p[-1] != pop_failure_jump)
4045 {
4046 p[-1] = (unsigned char) jump;
4047 DEBUG_PRINT1 (" Match => jump.\n");
4048 goto unconditional_jump;
4049 }
4050 /* Note fall through. */
4051
4052
4053 /* The end of a simple repeat has a pop_failure_jump back to
4054 its matching on_failure_jump, where the latter will push a
4055 failure point. The pop_failure_jump takes off failure
4056 points put on by this pop_failure_jump's matching
4057 on_failure_jump; we got through the pattern to here from the
4058 matching on_failure_jump, so didn't fail. */
4059 case pop_failure_jump:
4060 {
4061 /* We need to pass separate storage for the lowest and
4062 highest registers, even though we don't care about the
4063 actual values. Otherwise, we will restore only one
4064 register from the stack, since lowest will == highest in
4065 `pop_failure_point'. */
4066 unsigned dummy_low_reg, dummy_high_reg;
4067 unsigned char *pdummy;
4068 const char *sdummy;
4069
4070 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4071 POP_FAILURE_POINT (sdummy, pdummy,
4072 dummy_low_reg, dummy_high_reg,
4073 reg_dummy, reg_dummy, reg_info_dummy);
4074 }
4075 /* Note fall through. */
4076
4077
4078 /* Unconditionally jump (without popping any failure points). */
4079 case jump:
4080 unconditional_jump:
4081 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4082 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4083 p += mcnt; /* Do the jump. */
4084 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4085 break;
4086
4087
4088 /* We need this opcode so we can detect where alternatives end
4089 in `group_match_null_string_p' et al. */
4090 case jump_past_alt:
4091 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4092 goto unconditional_jump;
4093
4094
4095 /* Normally, the on_failure_jump pushes a failure point, which
4096 then gets popped at pop_failure_jump. We will end up at
4097 pop_failure_jump, also, and with a pattern of, say, `a+', we
4098 are skipping over the on_failure_jump, so we have to push
4099 something meaningless for pop_failure_jump to pop. */
4100 case dummy_failure_jump:
4101 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4102 /* It doesn't matter what we push for the string here. What
4103 the code at `fail' tests is the value for the pattern. */
4104 PUSH_FAILURE_POINT (0, 0, -2);
4105 goto unconditional_jump;
4106
4107
4108 /* At the end of an alternative, we need to push a dummy failure
4109 point in case we are followed by a `pop_failure_jump', because
4110 we don't want the failure point for the alternative to be
4111 popped. For example, matching `(a|ab)*' against `aab'
4112 requires that we match the `ab' alternative. */
4113 case push_dummy_failure:
4114 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4115 /* See comments just above at `dummy_failure_jump' about the
4116 two zeroes. */
4117 PUSH_FAILURE_POINT (0, 0, -2);
4118 break;
4119
4120 /* Have to succeed matching what follows at least n times.
4121 After that, handle like `on_failure_jump'. */
4122 case succeed_n:
4123 EXTRACT_NUMBER (mcnt, p + 2);
4124 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4125
4126 assert (mcnt >= 0);
4127 /* Originally, this is how many times we HAVE to succeed. */
4128 if (mcnt > 0)
4129 {
4130 mcnt--;
4131 p += 2;
4132 STORE_NUMBER_AND_INCR (p, mcnt);
4133 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4134 }
4135 else if (mcnt == 0)
4136 {
4137 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4138 p[2] = (unsigned char) no_op;
4139 p[3] = (unsigned char) no_op;
4140 goto on_failure;
4141 }
4142 break;
4143
4144 case jump_n:
4145 EXTRACT_NUMBER (mcnt, p + 2);
4146 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4147
4148 /* Originally, this is how many times we CAN jump. */
4149 if (mcnt)
4150 {
4151 mcnt--;
4152 STORE_NUMBER (p + 2, mcnt);
4153 goto unconditional_jump;
4154 }
4155 /* If don't have to jump any more, skip over the rest of command. */
4156 else
4157 p += 4;
4158 break;
4159
4160 case set_number_at:
4161 {
4162 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4163
4164 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4165 p1 = p + mcnt;
4166 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4167 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4168 STORE_NUMBER (p1, mcnt);
4169 break;
4170 }
4171
4172 case wordbound:
4173 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4174 if (AT_WORD_BOUNDARY (d))
4175 break;
4176 goto fail;
4177
4178 case notwordbound:
4179 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4180 if (AT_WORD_BOUNDARY (d))
4181 goto fail;
4182 break;
4183
4184 case wordbeg:
4185 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4186 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4187 break;
4188 goto fail;
4189
4190 case wordend:
4191 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4192 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4193 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4194 break;
4195 goto fail;
4196
4197 #ifdef emacs
4198 #ifdef emacs19
4199 case before_dot:
4200 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4201 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4202 goto fail;
4203 break;
4204
4205 case at_dot:
4206 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4207 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4208 goto fail;
4209 break;
4210
4211 case after_dot:
4212 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4213 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4214 goto fail;
4215 break;
4216 #else /* not emacs19 */
4217 case at_dot:
4218 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4219 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4220 goto fail;
4221 break;
4222 #endif /* not emacs19 */
4223
4224 case syntaxspec:
4225 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4226 mcnt = *p++;
4227 goto matchsyntax;
4228
4229 case wordchar:
4230 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4231 mcnt = (int) Sword;
4232 matchsyntax:
4233 PREFETCH ();
4234 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4235 goto fail;
4236 SET_REGS_MATCHED ();
4237 break;
4238
4239 case notsyntaxspec:
4240 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4241 mcnt = *p++;
4242 goto matchnotsyntax;
4243
4244 case notwordchar:
4245 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4246 mcnt = (int) Sword;
4247 matchnotsyntax:
4248 PREFETCH ();
4249 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4250 goto fail;
4251 SET_REGS_MATCHED ();
4252 break;
4253
4254 #else /* not emacs */
4255 case wordchar:
4256 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4257 PREFETCH ();
4258 if (!WORDCHAR_P (d))
4259 goto fail;
4260 SET_REGS_MATCHED ();
4261 d++;
4262 break;
4263
4264 case notwordchar:
4265 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4266 PREFETCH ();
4267 if (WORDCHAR_P (d))
4268 goto fail;
4269 SET_REGS_MATCHED ();
4270 d++;
4271 break;
4272 #endif /* not emacs */
4273
4274 default:
4275 abort ();
4276 }
4277 continue; /* Successfully executed one pattern command; keep going. */
4278
4279
4280 /* We goto here if a matching operation fails. */
4281 fail:
4282 if (!FAIL_STACK_EMPTY ())
4283 { /* A restart point is known. Restore to that state. */
4284 DEBUG_PRINT1 ("\nFAIL:\n");
4285 POP_FAILURE_POINT (d, p,
4286 lowest_active_reg, highest_active_reg,
4287 regstart, regend, reg_info);
4288
4289 /* If this failure point is a dummy, try the next one. */
4290 if (!p)
4291 goto fail;
4292
4293 /* If we failed to the end of the pattern, don't examine *p. */
4294 assert (p <= pend);
4295 if (p < pend)
4296 {
4297 boolean is_a_jump_n = false;
4298
4299 /* If failed to a backwards jump that's part of a repetition
4300 loop, need to pop this failure point and use the next one. */
4301 switch ((re_opcode_t) *p)
4302 {
4303 case jump_n:
4304 is_a_jump_n = true;
4305 case maybe_pop_jump:
4306 case pop_failure_jump:
4307 case jump:
4308 p1 = p + 1;
4309 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4310 p1 += mcnt;
4311
4312 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4313 || (!is_a_jump_n
4314 && (re_opcode_t) *p1 == on_failure_jump))
4315 goto fail;
4316 break;
4317 default:
4318 /* do nothing */ ;
4319 }
4320 }
4321
4322 if (d >= string1 && d <= end1)
4323 dend = end_match_1;
4324 }
4325 else
4326 break; /* Matching at this starting point really fails. */
4327 } /* for (;;) */
4328
4329 if (best_regs_set)
4330 goto restore_best_regs;
4331
4332 FREE_VARIABLES ();
4333
4334 return -1; /* Failure to match. */
4335 } /* re_match_2 */
4336
4337 /* Subroutine definitions for re_match_2. */
4338
4339
4340 /* We are passed P pointing to a register number after a start_memory.
4341
4342 Return true if the pattern up to the corresponding stop_memory can
4343 match the empty string, and false otherwise.
4344
4345 If we find the matching stop_memory, sets P to point to one past its number.
4346 Otherwise, sets P to an undefined byte less than or equal to END.
4347
4348 We don't handle duplicates properly (yet). */
4349
4350 static boolean
group_match_null_string_p(p,end,reg_info)4351 group_match_null_string_p (p, end, reg_info)
4352 unsigned char **p, *end;
4353 register_info_type *reg_info;
4354 {
4355 int mcnt;
4356 /* Point to after the args to the start_memory. */
4357 unsigned char *p1 = *p + 2;
4358
4359 while (p1 < end)
4360 {
4361 /* Skip over opcodes that can match nothing, and return true or
4362 false, as appropriate, when we get to one that can't, or to the
4363 matching stop_memory. */
4364
4365 switch ((re_opcode_t) *p1)
4366 {
4367 /* Could be either a loop or a series of alternatives. */
4368 case on_failure_jump:
4369 p1++;
4370 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4371
4372 /* If the next operation is not a jump backwards in the
4373 pattern. */
4374
4375 if (mcnt >= 0)
4376 {
4377 /* Go through the on_failure_jumps of the alternatives,
4378 seeing if any of the alternatives cannot match nothing.
4379 The last alternative starts with only a jump,
4380 whereas the rest start with on_failure_jump and end
4381 with a jump, e.g., here is the pattern for `a|b|c':
4382
4383 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4384 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4385 /exactn/1/c
4386
4387 So, we have to first go through the first (n-1)
4388 alternatives and then deal with the last one separately. */
4389
4390
4391 /* Deal with the first (n-1) alternatives, which start
4392 with an on_failure_jump (see above) that jumps to right
4393 past a jump_past_alt. */
4394
4395 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4396 {
4397 /* `mcnt' holds how many bytes long the alternative
4398 is, including the ending `jump_past_alt' and
4399 its number. */
4400
4401 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4402 reg_info))
4403 return false;
4404
4405 /* Move to right after this alternative, including the
4406 jump_past_alt. */
4407 p1 += mcnt;
4408
4409 /* Break if it's the beginning of an n-th alternative
4410 that doesn't begin with an on_failure_jump. */
4411 if ((re_opcode_t) *p1 != on_failure_jump)
4412 break;
4413
4414 /* Still have to check that it's not an n-th
4415 alternative that starts with an on_failure_jump. */
4416 p1++;
4417 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4418 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4419 {
4420 /* Get to the beginning of the n-th alternative. */
4421 p1 -= 3;
4422 break;
4423 }
4424 }
4425
4426 /* Deal with the last alternative: go back and get number
4427 of the `jump_past_alt' just before it. `mcnt' contains
4428 the length of the alternative. */
4429 EXTRACT_NUMBER (mcnt, p1 - 2);
4430
4431 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4432 return false;
4433
4434 p1 += mcnt; /* Get past the n-th alternative. */
4435 } /* if mcnt > 0 */
4436 break;
4437
4438
4439 case stop_memory:
4440 assert (p1[1] == **p);
4441 *p = p1 + 2;
4442 return true;
4443
4444
4445 default:
4446 if (!common_op_match_null_string_p (&p1, end, reg_info))
4447 return false;
4448 }
4449 } /* while p1 < end */
4450
4451 return false;
4452 } /* group_match_null_string_p */
4453
4454
4455 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4456 It expects P to be the first byte of a single alternative and END one
4457 byte past the last. The alternative can contain groups. */
4458
4459 static boolean
alt_match_null_string_p(p,end,reg_info)4460 alt_match_null_string_p (p, end, reg_info)
4461 unsigned char *p, *end;
4462 register_info_type *reg_info;
4463 {
4464 int mcnt;
4465 unsigned char *p1 = p;
4466
4467 while (p1 < end)
4468 {
4469 /* Skip over opcodes that can match nothing, and break when we get
4470 to one that can't. */
4471
4472 switch ((re_opcode_t) *p1)
4473 {
4474 /* It's a loop. */
4475 case on_failure_jump:
4476 p1++;
4477 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4478 p1 += mcnt;
4479 break;
4480
4481 default:
4482 if (!common_op_match_null_string_p (&p1, end, reg_info))
4483 return false;
4484 }
4485 } /* while p1 < end */
4486
4487 return true;
4488 } /* alt_match_null_string_p */
4489
4490
4491 /* Deals with the ops common to group_match_null_string_p and
4492 alt_match_null_string_p.
4493
4494 Sets P to one after the op and its arguments, if any. */
4495
4496 static boolean
common_op_match_null_string_p(p,end,reg_info)4497 common_op_match_null_string_p (p, end, reg_info)
4498 unsigned char **p, *end;
4499 register_info_type *reg_info;
4500 {
4501 int mcnt;
4502 boolean ret;
4503 int reg_no;
4504 unsigned char *p1 = *p;
4505
4506 switch ((re_opcode_t) *p1++)
4507 {
4508 case no_op:
4509 case begline:
4510 case endline:
4511 case begbuf:
4512 case endbuf:
4513 case wordbeg:
4514 case wordend:
4515 case wordbound:
4516 case notwordbound:
4517 #ifdef emacs
4518 case before_dot:
4519 case at_dot:
4520 case after_dot:
4521 #endif
4522 break;
4523
4524 case start_memory:
4525 reg_no = *p1;
4526 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4527 ret = group_match_null_string_p (&p1, end, reg_info);
4528
4529 /* Have to set this here in case we're checking a group which
4530 contains a group and a back reference to it. */
4531
4532 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4533 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4534
4535 if (!ret)
4536 return false;
4537 break;
4538
4539 /* If this is an optimized succeed_n for zero times, make the jump. */
4540 case jump:
4541 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4542 if (mcnt >= 0)
4543 p1 += mcnt;
4544 else
4545 return false;
4546 break;
4547
4548 case succeed_n:
4549 /* Get to the number of times to succeed. */
4550 p1 += 2;
4551 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4552
4553 if (mcnt == 0)
4554 {
4555 p1 -= 4;
4556 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4557 p1 += mcnt;
4558 }
4559 else
4560 return false;
4561 break;
4562
4563 case duplicate:
4564 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4565 return false;
4566 break;
4567
4568 case set_number_at:
4569 p1 += 4;
4570
4571 default:
4572 /* All other opcodes mean we cannot match the empty string. */
4573 return false;
4574 }
4575
4576 *p = p1;
4577 return true;
4578 } /* common_op_match_null_string_p */
4579
4580
4581 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
4582 bytes; nonzero otherwise. */
4583
4584 static int
bcmp_translate(s1,s2,len,translate)4585 bcmp_translate (s1, s2, len, translate)
4586 unsigned char *s1, *s2;
4587 register int len;
4588 char *translate;
4589 {
4590 register unsigned char *p1 = s1, *p2 = s2;
4591 while (len)
4592 {
4593 if (translate[*p1++] != translate[*p2++]) return 1;
4594 len--;
4595 }
4596 return 0;
4597 }
4598
4599 /* Entry points for GNU code. */
4600
4601 /* re_compile_pattern is the GNU regular expression compiler: it
4602 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4603 Returns 0 if the pattern was valid, otherwise an error string.
4604
4605 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4606 are set in BUFP on entry.
4607
4608 We call regex_compile to do the actual compilation. */
4609
4610 const char *
re_compile_pattern(pattern,length,bufp)4611 re_compile_pattern (pattern, length, bufp)
4612 const char *pattern;
4613 int length;
4614 struct re_pattern_buffer *bufp;
4615 {
4616 reg_errcode_t ret;
4617
4618 /* GNU code is written to assume at least RE_NREGS registers will be set
4619 (and at least one extra will be -1). */
4620 bufp->regs_allocated = REGS_UNALLOCATED;
4621
4622 /* And GNU code determines whether or not to get register information
4623 by passing null for the REGS argument to re_match, etc., not by
4624 setting no_sub. */
4625 bufp->no_sub = 0;
4626
4627 /* Match anchors at newline. */
4628 bufp->newline_anchor = 1;
4629
4630 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4631
4632 return re_error_msg[(int) ret];
4633 }
4634
4635 /* Entry points compatible with 4.2 BSD regex library. We don't define
4636 them if this is an Emacs or POSIX compilation. */
4637
4638 #if !defined (emacs) && !defined (_POSIX_SOURCE)
4639
4640 /* BSD has one and only one pattern buffer. */
4641 static struct re_pattern_buffer re_comp_buf;
4642
4643 char *
re_comp(s)4644 re_comp (s)
4645 const char *s;
4646 {
4647 reg_errcode_t ret;
4648
4649 if (!s)
4650 {
4651 if (!re_comp_buf.buffer)
4652 return "No previous regular expression";
4653 return 0;
4654 }
4655
4656 if (!re_comp_buf.buffer)
4657 {
4658 re_comp_buf.buffer = (unsigned char *) malloc (200);
4659 if (re_comp_buf.buffer == NULL)
4660 return "Memory exhausted";
4661 re_comp_buf.allocated = 200;
4662
4663 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4664 if (re_comp_buf.fastmap == NULL)
4665 return "Memory exhausted";
4666 }
4667
4668 /* Since `re_exec' always passes NULL for the `regs' argument, we
4669 don't need to initialize the pattern buffer fields which affect it. */
4670
4671 /* Match anchors at newlines. */
4672 re_comp_buf.newline_anchor = 1;
4673
4674 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4675
4676 /* Yes, we're discarding `const' here. */
4677 return (char *) re_error_msg[(int) ret];
4678 }
4679
4680
4681 int
re_exec(s)4682 re_exec (s)
4683 const char *s;
4684 {
4685 const int len = strlen (s);
4686 return
4687 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4688 }
4689 #endif /* not emacs and not _POSIX_SOURCE */
4690
4691 /* POSIX.2 functions. Don't define these for Emacs. */
4692
4693 #ifndef emacs
4694
4695 /* regcomp takes a regular expression as a string and compiles it.
4696
4697 PREG is a regex_t *. We do not expect any fields to be initialized,
4698 since POSIX says we shouldn't. Thus, we set
4699
4700 `buffer' to the compiled pattern;
4701 `used' to the length of the compiled pattern;
4702 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4703 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4704 RE_SYNTAX_POSIX_BASIC;
4705 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4706 `fastmap' and `fastmap_accurate' to zero;
4707 `re_nsub' to the number of subexpressions in PATTERN.
4708
4709 PATTERN is the address of the pattern string.
4710
4711 CFLAGS is a series of bits which affect compilation.
4712
4713 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4714 use POSIX basic syntax.
4715
4716 If REG_NEWLINE is set, then . and [^...] don't match newline.
4717 Also, regexec will try a match beginning after every newline.
4718
4719 If REG_ICASE is set, then we considers upper- and lowercase
4720 versions of letters to be equivalent when matching.
4721
4722 If REG_NOSUB is set, then when PREG is passed to regexec, that
4723 routine will report only success or failure, and nothing about the
4724 registers.
4725
4726 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
4727 the return codes and their meanings.) */
4728
4729 int
regcomp(preg,pattern,cflags)4730 regcomp (preg, pattern, cflags)
4731 regex_t *preg;
4732 const char *pattern;
4733 int cflags;
4734 {
4735 reg_errcode_t ret;
4736 unsigned syntax
4737 = (cflags & REG_EXTENDED) ?
4738 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4739
4740 /* regex_compile will allocate the space for the compiled pattern. */
4741 preg->buffer = 0;
4742 preg->allocated = 0;
4743
4744 /* Don't bother to use a fastmap when searching. This simplifies the
4745 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4746 characters after newlines into the fastmap. This way, we just try
4747 every character. */
4748 preg->fastmap = 0;
4749
4750 if (cflags & REG_ICASE)
4751 {
4752 unsigned i;
4753
4754 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4755 if (preg->translate == NULL)
4756 return (int) REG_ESPACE;
4757
4758 /* Map uppercase characters to corresponding lowercase ones. */
4759 for (i = 0; i < CHAR_SET_SIZE; i++)
4760 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
4761 }
4762 else
4763 preg->translate = NULL;
4764
4765 /* If REG_NEWLINE is set, newlines are treated differently. */
4766 if (cflags & REG_NEWLINE)
4767 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4768 syntax &= ~RE_DOT_NEWLINE;
4769 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4770 /* It also changes the matching behavior. */
4771 preg->newline_anchor = 1;
4772 }
4773 else
4774 preg->newline_anchor = 0;
4775
4776 preg->no_sub = !!(cflags & REG_NOSUB);
4777
4778 /* POSIX says a null character in the pattern terminates it, so we
4779 can use strlen here in compiling the pattern. */
4780 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4781
4782 /* POSIX doesn't distinguish between an unmatched open-group and an
4783 unmatched close-group: both are REG_EPAREN. */
4784 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4785
4786 return (int) ret;
4787 }
4788
4789
4790 /* regexec searches for a given pattern, specified by PREG, in the
4791 string STRING.
4792
4793 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4794 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4795 least NMATCH elements, and we set them to the offsets of the
4796 corresponding matched substrings.
4797
4798 EFLAGS specifies `execution flags' which affect matching: if
4799 REG_NOTBOL is set, then ^ does not match at the beginning of the
4800 string; if REG_NOTEOL is set, then $ does not match at the end.
4801
4802 We return 0 if we find a match and REG_NOMATCH if not. */
4803
4804 int
regexec(preg,string,nmatch,pmatch,eflags)4805 regexec (preg, string, nmatch, pmatch, eflags)
4806 const regex_t *preg;
4807 const char *string;
4808 size_t nmatch;
4809 regmatch_t pmatch[];
4810 int eflags;
4811 {
4812 int ret;
4813 struct re_registers regs;
4814 regex_t private_preg;
4815 int len = strlen (string);
4816 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4817
4818 private_preg = *preg;
4819
4820 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4821 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4822
4823 /* The user has told us exactly how many registers to return
4824 information about, via `nmatch'. We have to pass that on to the
4825 matching routines. */
4826 private_preg.regs_allocated = REGS_FIXED;
4827
4828 if (want_reg_info)
4829 {
4830 regs.num_regs = nmatch;
4831 regs.start = TALLOC (nmatch, regoff_t);
4832 regs.end = TALLOC (nmatch, regoff_t);
4833 if (regs.start == NULL || regs.end == NULL)
4834 return (int) REG_NOMATCH;
4835 }
4836
4837 /* Perform the searching operation. */
4838 ret = re_search (&private_preg, string, len,
4839 /* start: */ 0, /* range: */ len,
4840 want_reg_info ? ®s : (struct re_registers *) 0);
4841
4842 /* Copy the register information to the POSIX structure. */
4843 if (want_reg_info)
4844 {
4845 if (ret >= 0)
4846 {
4847 unsigned r;
4848
4849 for (r = 0; r < nmatch; r++)
4850 {
4851 pmatch[r].rm_so = regs.start[r];
4852 pmatch[r].rm_eo = regs.end[r];
4853 }
4854 }
4855
4856 /* If we needed the temporary register info, free the space now. */
4857 free (regs.start);
4858 free (regs.end);
4859 }
4860
4861 /* We want zero return to mean success, unlike `re_search'. */
4862 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4863 }
4864
4865
4866 /* Returns a message corresponding to an error code, ERRCODE, returned
4867 from either regcomp or regexec. We don't use PREG here. */
4868
4869 size_t
regerror(errcode,preg,errbuf,errbuf_size)4870 regerror (errcode, preg, errbuf, errbuf_size)
4871 int errcode;
4872 const regex_t *preg;
4873 char *errbuf;
4874 size_t errbuf_size;
4875 {
4876 const char *msg;
4877 size_t msg_size;
4878
4879 if (errcode < 0
4880 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0])))
4881 /* Only error codes returned by the rest of the code should be passed
4882 to this routine. If we are given anything else, or if other regex
4883 code generates an invalid error code, then the program has a bug.
4884 Dump core so we can fix it. */
4885 abort ();
4886
4887 msg = re_error_msg[errcode];
4888
4889 /* POSIX doesn't require that we do anything in this case, but why
4890 not be nice. */
4891 if (! msg)
4892 msg = "Success";
4893
4894 msg_size = strlen (msg) + 1; /* Includes the null. */
4895
4896 if (errbuf_size != 0)
4897 {
4898 if (msg_size > errbuf_size)
4899 {
4900 strncpy (errbuf, msg, errbuf_size - 1);
4901 errbuf[errbuf_size - 1] = 0;
4902 }
4903 else
4904 strcpy (errbuf, msg);
4905 }
4906
4907 return msg_size;
4908 }
4909
4910
4911 /* Free dynamically allocated space used by PREG. */
4912
4913 void
regfree(preg)4914 regfree (preg)
4915 regex_t *preg;
4916 {
4917 if (preg->buffer != NULL)
4918 free (preg->buffer);
4919 preg->buffer = NULL;
4920
4921 preg->allocated = 0;
4922 preg->used = 0;
4923
4924 if (preg->fastmap != NULL)
4925 free (preg->fastmap);
4926 preg->fastmap = NULL;
4927 preg->fastmap_accurate = 0;
4928
4929 if (preg->translate != NULL)
4930 free (preg->translate);
4931 preg->translate = NULL;
4932 }
4933
4934 #endif /* not emacs */
4935
4936 /*
4937 Local variables:
4938 make-backup-files: t
4939 version-control: t
4940 trim-versions-without-asking: nil
4941 End:
4942 */
4943