1 /* Extended regular expression matching and search library, 2 version 0.12. 3 (Implements POSIX draft P10003.2/D11.2, except for 4 internationalization features.) 5 6 Copyright (C) 1993 Free Software Foundation, Inc. 7 8 This program is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, write to the Free Software 20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ 21 22 /* AIX requires this to be the first thing in the file. */ 23 #if defined (_AIX) && !defined (REGEX_MALLOC) 24 #pragma alloca 25 #endif 26 27 #define _GNU_SOURCE 28 29 /* We need this for `regex.h', and perhaps for the Emacs include files. */ 30 #include <sys/types.h> 31 32 #ifdef HAVE_CONFIG_H 33 #include "config.h" 34 #endif 35 36 /* The `emacs' switch turns on certain matching commands 37 that make sense only in Emacs. */ 38 #ifdef emacs 39 40 #include "lisp.h" 41 #include "buffer.h" 42 #include "syntax.h" 43 44 /* Emacs uses `NULL' as a predicate. */ 45 #undef NULL 46 47 #else /* not emacs */ 48 49 /* We used to test for `BSTRING' here, but only GCC and Emacs define 50 `BSTRING', as far as I know, and neither of them use this code. */ 51 #if HAVE_STRING_H || STDC_HEADERS 52 #include <string.h> 53 #ifndef bcmp 54 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 55 #endif 56 #ifndef bcopy 57 #define bcopy(s, d, n) memcpy ((d), (s), (n)) 58 #endif 59 #ifndef bzero 60 #define bzero(s, n) memset ((s), 0, (n)) 61 #endif 62 #else 63 #include <strings.h> 64 #endif 65 66 #include <stdlib.h> 67 68 /* Define the syntax stuff for \<, \>, etc. */ 69 70 /* This must be nonzero for the wordchar and notwordchar pattern 71 commands in re_match_2. */ 72 #ifndef Sword 73 #define Sword 1 74 #endif 75 76 #ifdef SYNTAX_TABLE 77 78 extern char *re_syntax_table; 79 80 #else /* not SYNTAX_TABLE */ 81 82 /* How many characters in the character set. */ 83 #define CHAR_SET_SIZE 256 84 85 static char re_syntax_table[CHAR_SET_SIZE]; 86 87 static void 88 init_syntax_once () 89 { 90 register int c; 91 static int done = 0; 92 93 if (done) 94 return; 95 96 bzero (re_syntax_table, sizeof re_syntax_table); 97 98 for (c = 'a'; c <= 'z'; c++) 99 re_syntax_table[c] = Sword; 100 101 for (c = 'A'; c <= 'Z'; c++) 102 re_syntax_table[c] = Sword; 103 104 for (c = '0'; c <= '9'; c++) 105 re_syntax_table[c] = Sword; 106 107 re_syntax_table['_'] = Sword; 108 109 done = 1; 110 } 111 112 #endif /* not SYNTAX_TABLE */ 113 114 #define SYNTAX(c) re_syntax_table[c] 115 116 #endif /* not emacs */ 117 118 /* Get the interface, including the syntax bits. */ 119 #include "regex.h" 120 121 /* isalpha etc. are used for the character classes. */ 122 #include <ctype.h> 123 124 #ifndef isascii 125 #define isascii(c) 1 126 #endif 127 128 #ifdef isblank 129 #define ISBLANK(c) (isascii (c) && isblank (c)) 130 #else 131 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') 132 #endif 133 #ifdef isgraph 134 #define ISGRAPH(c) (isascii (c) && isgraph (c)) 135 #else 136 #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) 137 #endif 138 139 #define ISPRINT(c) (isascii (c) && isprint (c)) 140 #define ISDIGIT(c) (isascii (c) && isdigit (c)) 141 #define ISALNUM(c) (isascii (c) && isalnum (c)) 142 #define ISALPHA(c) (isascii (c) && isalpha (c)) 143 #define ISCNTRL(c) (isascii (c) && iscntrl (c)) 144 #define ISLOWER(c) (isascii (c) && islower (c)) 145 #define ISPUNCT(c) (isascii (c) && ispunct (c)) 146 #define ISSPACE(c) (isascii (c) && isspace (c)) 147 #define ISUPPER(c) (isascii (c) && isupper (c)) 148 #define ISXDIGIT(c) (isascii (c) && isxdigit (c)) 149 150 #ifndef NULL 151 #define NULL 0 152 #endif 153 154 /* We remove any previous definition of `SIGN_EXTEND_CHAR', 155 since ours (we hope) works properly with all combinations of 156 machines, compilers, `char' and `unsigned char' argument types. 157 (Per Bothner suggested the basic approach.) */ 158 #undef SIGN_EXTEND_CHAR 159 #if __STDC__ 160 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 161 #else /* not __STDC__ */ 162 /* As in Harbison and Steele. */ 163 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 164 #endif 165 166 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 167 use `alloca' instead of `malloc'. This is because using malloc in 168 re_search* or re_match* could cause memory leaks when C-g is used in 169 Emacs; also, malloc is slower and causes storage fragmentation. On 170 the other hand, malloc is more portable, and easier to debug. 171 172 Because we sometimes use alloca, some routines have to be macros, 173 not functions -- `alloca'-allocated space disappears at the end of the 174 function it is called in. */ 175 176 #ifdef REGEX_MALLOC 177 178 #define REGEX_ALLOCATE malloc 179 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 180 181 #else /* not REGEX_MALLOC */ 182 183 /* Emacs already defines alloca, sometimes. */ 184 #ifndef alloca 185 186 /* Make alloca work the best possible way. */ 187 #ifdef __GNUC__ 188 #define alloca __builtin_alloca 189 #else /* not __GNUC__ */ 190 #if HAVE_ALLOCA_H 191 #include <alloca.h> 192 #else /* not __GNUC__ or HAVE_ALLOCA_H */ 193 #ifndef _AIX /* Already did AIX, up at the top. */ 194 char *alloca (); 195 #endif /* not _AIX */ 196 #endif /* not HAVE_ALLOCA_H */ 197 #endif /* not __GNUC__ */ 198 199 #endif /* not alloca */ 200 201 #define REGEX_ALLOCATE alloca 202 203 /* Assumes a `char *destination' variable. */ 204 #define REGEX_REALLOCATE(source, osize, nsize) \ 205 (destination = (char *) alloca (nsize), \ 206 bcopy (source, destination, osize), \ 207 destination) 208 209 #endif /* not REGEX_MALLOC */ 210 211 212 /* True if `size1' is non-NULL and PTR is pointing anywhere inside 213 `string1' or just past its end. This works if PTR is NULL, which is 214 a good thing. */ 215 #define FIRST_STRING_P(ptr) \ 216 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 217 218 /* (Re)Allocate N items of type T using malloc, or fail. */ 219 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 220 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 221 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 222 223 #define BYTEWIDTH 8 /* In bits. */ 224 225 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 226 227 #define MAX(a, b) ((a) > (b) ? (a) : (b)) 228 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 229 230 typedef char boolean; 231 #define false 0 232 #define true 1 233 234 /* These are the command codes that appear in compiled regular 235 expressions. Some opcodes are followed by argument bytes. A 236 command code can specify any interpretation whatsoever for its 237 arguments. Zero bytes may appear in the compiled regular expression. 238 239 The value of `exactn' is needed in search.c (search_buffer) in Emacs. 240 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of 241 `exactn' we use here must also be 1. */ 242 243 typedef enum 244 { 245 no_op = 0, 246 247 /* Followed by one byte giving n, then by n literal bytes. */ 248 exactn = 1, 249 250 /* Matches any (more or less) character. */ 251 anychar, 252 253 /* Matches any one char belonging to specified set. First 254 following byte is number of bitmap bytes. Then come bytes 255 for a bitmap saying which chars are in. Bits in each byte 256 are ordered low-bit-first. A character is in the set if its 257 bit is 1. A character too large to have a bit in the map is 258 automatically not in the set. */ 259 charset, 260 261 /* Same parameters as charset, but match any character that is 262 not one of those specified. */ 263 charset_not, 264 265 /* Start remembering the text that is matched, for storing in a 266 register. Followed by one byte with the register number, in 267 the range 0 to one less than the pattern buffer's re_nsub 268 field. Then followed by one byte with the number of groups 269 inner to this one. (This last has to be part of the 270 start_memory only because we need it in the on_failure_jump 271 of re_match_2.) */ 272 start_memory, 273 274 /* Stop remembering the text that is matched and store it in a 275 memory register. Followed by one byte with the register 276 number, in the range 0 to one less than `re_nsub' in the 277 pattern buffer, and one byte with the number of inner groups, 278 just like `start_memory'. (We need the number of inner 279 groups here because we don't have any easy way of finding the 280 corresponding start_memory when we're at a stop_memory.) */ 281 stop_memory, 282 283 /* Match a duplicate of something remembered. Followed by one 284 byte containing the register number. */ 285 duplicate, 286 287 /* Fail unless at beginning of line. */ 288 begline, 289 290 /* Fail unless at end of line. */ 291 endline, 292 293 /* Succeeds if at beginning of buffer (if emacs) or at beginning 294 of string to be matched (if not). */ 295 begbuf, 296 297 /* Analogously, for end of buffer/string. */ 298 endbuf, 299 300 /* Followed by two byte relative address to which to jump. */ 301 jump, 302 303 /* Same as jump, but marks the end of an alternative. */ 304 jump_past_alt, 305 306 /* Followed by two-byte relative address of place to resume at 307 in case of failure. */ 308 on_failure_jump, 309 310 /* Like on_failure_jump, but pushes a placeholder instead of the 311 current string position when executed. */ 312 on_failure_keep_string_jump, 313 314 /* Throw away latest failure point and then jump to following 315 two-byte relative address. */ 316 pop_failure_jump, 317 318 /* Change to pop_failure_jump if know won't have to backtrack to 319 match; otherwise change to jump. This is used to jump 320 back to the beginning of a repeat. If what follows this jump 321 clearly won't match what the repeat does, such that we can be 322 sure that there is no use backtracking out of repetitions 323 already matched, then we change it to a pop_failure_jump. 324 Followed by two-byte address. */ 325 maybe_pop_jump, 326 327 /* Jump to following two-byte address, and push a dummy failure 328 point. This failure point will be thrown away if an attempt 329 is made to use it for a failure. A `+' construct makes this 330 before the first repeat. Also used as an intermediary kind 331 of jump when compiling an alternative. */ 332 dummy_failure_jump, 333 334 /* Push a dummy failure point and continue. Used at the end of 335 alternatives. */ 336 push_dummy_failure, 337 338 /* Followed by two-byte relative address and two-byte number n. 339 After matching N times, jump to the address upon failure. */ 340 succeed_n, 341 342 /* Followed by two-byte relative address, and two-byte number n. 343 Jump to the address N times, then fail. */ 344 jump_n, 345 346 /* Set the following two-byte relative address to the 347 subsequent two-byte number. The address *includes* the two 348 bytes of number. */ 349 set_number_at, 350 351 wordchar, /* Matches any word-constituent character. */ 352 notwordchar, /* Matches any char that is not a word-constituent. */ 353 354 wordbeg, /* Succeeds if at word beginning. */ 355 wordend, /* Succeeds if at word end. */ 356 357 wordbound, /* Succeeds if at a word boundary. */ 358 notwordbound /* Succeeds if not at a word boundary. */ 359 360 #ifdef emacs 361 ,before_dot, /* Succeeds if before point. */ 362 at_dot, /* Succeeds if at point. */ 363 after_dot, /* Succeeds if after point. */ 364 365 /* Matches any character whose syntax is specified. Followed by 366 a byte which contains a syntax code, e.g., Sword. */ 367 syntaxspec, 368 369 /* Matches any character whose syntax is not that specified. */ 370 notsyntaxspec 371 #endif /* emacs */ 372 } re_opcode_t; 373 374 /* Common operations on the compiled pattern. */ 375 376 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 377 378 #define STORE_NUMBER(destination, number) \ 379 do { \ 380 (destination)[0] = (number) & 0377; \ 381 (destination)[1] = (number) >> 8; \ 382 } while (0) 383 384 /* Same as STORE_NUMBER, except increment DESTINATION to 385 the byte after where the number is stored. Therefore, DESTINATION 386 must be an lvalue. */ 387 388 #define STORE_NUMBER_AND_INCR(destination, number) \ 389 do { \ 390 STORE_NUMBER (destination, number); \ 391 (destination) += 2; \ 392 } while (0) 393 394 /* Put into DESTINATION a number stored in two contiguous bytes starting 395 at SOURCE. */ 396 397 #define EXTRACT_NUMBER(destination, source) \ 398 do { \ 399 (destination) = *(source) & 0377; \ 400 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 401 } while (0) 402 403 #ifdef DEBUG 404 static void 405 extract_number (dest, source) 406 int *dest; 407 unsigned char *source; 408 { 409 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 410 *dest = *source & 0377; 411 *dest += temp << 8; 412 } 413 414 #ifndef EXTRACT_MACROS /* To debug the macros. */ 415 #undef EXTRACT_NUMBER 416 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 417 #endif /* not EXTRACT_MACROS */ 418 419 #endif /* DEBUG */ 420 421 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 422 SOURCE must be an lvalue. */ 423 424 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ 425 do { \ 426 EXTRACT_NUMBER (destination, source); \ 427 (source) += 2; \ 428 } while (0) 429 430 #ifdef DEBUG 431 static void 432 extract_number_and_incr (destination, source) 433 int *destination; 434 unsigned char **source; 435 { 436 extract_number (destination, *source); 437 *source += 2; 438 } 439 440 #ifndef EXTRACT_MACROS 441 #undef EXTRACT_NUMBER_AND_INCR 442 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ 443 extract_number_and_incr (&dest, &src) 444 #endif /* not EXTRACT_MACROS */ 445 446 #endif /* DEBUG */ 447 448 /* If DEBUG is defined, Regex prints many voluminous messages about what 449 it is doing (if the variable `debug' is nonzero). If linked with the 450 main program in `iregex.c', you can enter patterns and strings 451 interactively. And if linked with the main program in `main.c' and 452 the other test files, you can run the already-written tests. */ 453 454 #ifdef DEBUG 455 456 /* We use standard I/O for debugging. */ 457 #include <stdio.h> 458 459 /* It is useful to test things that ``must'' be true when debugging. */ 460 #include <assert.h> 461 462 static int debug = 0; 463 464 #define DEBUG_STATEMENT(e) e 465 #define DEBUG_PRINT1(x) if (debug) printf (x) 466 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 467 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 468 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 469 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 470 if (debug) print_partial_compiled_pattern (s, e) 471 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 472 if (debug) print_double_string (w, s1, sz1, s2, sz2) 473 474 475 extern void printchar (); 476 477 /* Print the fastmap in human-readable form. */ 478 479 void 480 print_fastmap (fastmap) 481 char *fastmap; 482 { 483 unsigned was_a_range = 0; 484 unsigned i = 0; 485 486 while (i < (1 << BYTEWIDTH)) 487 { 488 if (fastmap[i++]) 489 { 490 was_a_range = 0; 491 printchar (i - 1); 492 while (i < (1 << BYTEWIDTH) && fastmap[i]) 493 { 494 was_a_range = 1; 495 i++; 496 } 497 if (was_a_range) 498 { 499 printf ("-"); 500 printchar (i - 1); 501 } 502 } 503 } 504 putchar ('\n'); 505 } 506 507 508 /* Print a compiled pattern string in human-readable form, starting at 509 the START pointer into it and ending just before the pointer END. */ 510 511 void 512 print_partial_compiled_pattern (start, end) 513 unsigned char *start; 514 unsigned char *end; 515 { 516 int mcnt, mcnt2; 517 unsigned char *p = start; 518 unsigned char *pend = end; 519 520 if (start == NULL) 521 { 522 printf ("(null)\n"); 523 return; 524 } 525 526 /* Loop over pattern commands. */ 527 while (p < pend) 528 { 529 switch ((re_opcode_t) *p++) 530 { 531 case no_op: 532 printf ("/no_op"); 533 break; 534 535 case exactn: 536 mcnt = *p++; 537 printf ("/exactn/%d", mcnt); 538 do 539 { 540 putchar ('/'); 541 printchar (*p++); 542 } 543 while (--mcnt); 544 break; 545 546 case start_memory: 547 mcnt = *p++; 548 printf ("/start_memory/%d/%d", mcnt, *p++); 549 break; 550 551 case stop_memory: 552 mcnt = *p++; 553 printf ("/stop_memory/%d/%d", mcnt, *p++); 554 break; 555 556 case duplicate: 557 printf ("/duplicate/%d", *p++); 558 break; 559 560 case anychar: 561 printf ("/anychar"); 562 break; 563 564 case charset: 565 case charset_not: 566 { 567 register int c; 568 569 printf ("/charset%s", 570 (re_opcode_t) *(p - 1) == charset_not ? "_not" : ""); 571 572 assert (p + *p < pend); 573 574 for (c = 0; c < *p; c++) 575 { 576 unsigned bit; 577 unsigned char map_byte = p[1 + c]; 578 579 putchar ('/'); 580 581 for (bit = 0; bit < BYTEWIDTH; bit++) 582 if (map_byte & (1 << bit)) 583 printchar (c * BYTEWIDTH + bit); 584 } 585 p += 1 + *p; 586 break; 587 } 588 589 case begline: 590 printf ("/begline"); 591 break; 592 593 case endline: 594 printf ("/endline"); 595 break; 596 597 case on_failure_jump: 598 extract_number_and_incr (&mcnt, &p); 599 printf ("/on_failure_jump/0/%d", mcnt); 600 break; 601 602 case on_failure_keep_string_jump: 603 extract_number_and_incr (&mcnt, &p); 604 printf ("/on_failure_keep_string_jump/0/%d", mcnt); 605 break; 606 607 case dummy_failure_jump: 608 extract_number_and_incr (&mcnt, &p); 609 printf ("/dummy_failure_jump/0/%d", mcnt); 610 break; 611 612 case push_dummy_failure: 613 printf ("/push_dummy_failure"); 614 break; 615 616 case maybe_pop_jump: 617 extract_number_and_incr (&mcnt, &p); 618 printf ("/maybe_pop_jump/0/%d", mcnt); 619 break; 620 621 case pop_failure_jump: 622 extract_number_and_incr (&mcnt, &p); 623 printf ("/pop_failure_jump/0/%d", mcnt); 624 break; 625 626 case jump_past_alt: 627 extract_number_and_incr (&mcnt, &p); 628 printf ("/jump_past_alt/0/%d", mcnt); 629 break; 630 631 case jump: 632 extract_number_and_incr (&mcnt, &p); 633 printf ("/jump/0/%d", mcnt); 634 break; 635 636 case succeed_n: 637 extract_number_and_incr (&mcnt, &p); 638 extract_number_and_incr (&mcnt2, &p); 639 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); 640 break; 641 642 case jump_n: 643 extract_number_and_incr (&mcnt, &p); 644 extract_number_and_incr (&mcnt2, &p); 645 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); 646 break; 647 648 case set_number_at: 649 extract_number_and_incr (&mcnt, &p); 650 extract_number_and_incr (&mcnt2, &p); 651 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); 652 break; 653 654 case wordbound: 655 printf ("/wordbound"); 656 break; 657 658 case notwordbound: 659 printf ("/notwordbound"); 660 break; 661 662 case wordbeg: 663 printf ("/wordbeg"); 664 break; 665 666 case wordend: 667 printf ("/wordend"); 668 669 #ifdef emacs 670 case before_dot: 671 printf ("/before_dot"); 672 break; 673 674 case at_dot: 675 printf ("/at_dot"); 676 break; 677 678 case after_dot: 679 printf ("/after_dot"); 680 break; 681 682 case syntaxspec: 683 printf ("/syntaxspec"); 684 mcnt = *p++; 685 printf ("/%d", mcnt); 686 break; 687 688 case notsyntaxspec: 689 printf ("/notsyntaxspec"); 690 mcnt = *p++; 691 printf ("/%d", mcnt); 692 break; 693 #endif /* emacs */ 694 695 case wordchar: 696 printf ("/wordchar"); 697 break; 698 699 case notwordchar: 700 printf ("/notwordchar"); 701 break; 702 703 case begbuf: 704 printf ("/begbuf"); 705 break; 706 707 case endbuf: 708 printf ("/endbuf"); 709 break; 710 711 default: 712 printf ("?%d", *(p-1)); 713 } 714 } 715 printf ("/\n"); 716 } 717 718 719 void 720 print_compiled_pattern (bufp) 721 struct re_pattern_buffer *bufp; 722 { 723 unsigned char *buffer = bufp->buffer; 724 725 print_partial_compiled_pattern (buffer, buffer + bufp->used); 726 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); 727 728 if (bufp->fastmap_accurate && bufp->fastmap) 729 { 730 printf ("fastmap: "); 731 print_fastmap (bufp->fastmap); 732 } 733 734 printf ("re_nsub: %d\t", bufp->re_nsub); 735 printf ("regs_alloc: %d\t", bufp->regs_allocated); 736 printf ("can_be_null: %d\t", bufp->can_be_null); 737 printf ("newline_anchor: %d\n", bufp->newline_anchor); 738 printf ("no_sub: %d\t", bufp->no_sub); 739 printf ("not_bol: %d\t", bufp->not_bol); 740 printf ("not_eol: %d\t", bufp->not_eol); 741 printf ("syntax: %d\n", bufp->syntax); 742 /* Perhaps we should print the translate table? */ 743 } 744 745 746 void 747 print_double_string (where, string1, size1, string2, size2) 748 const char *where; 749 const char *string1; 750 const char *string2; 751 int size1; 752 int size2; 753 { 754 unsigned this_char; 755 756 if (where == NULL) 757 printf ("(null)"); 758 else 759 { 760 if (FIRST_STRING_P (where)) 761 { 762 for (this_char = where - string1; this_char < size1; this_char++) 763 printchar (string1[this_char]); 764 765 where = string2; 766 } 767 768 for (this_char = where - string2; this_char < size2; this_char++) 769 printchar (string2[this_char]); 770 } 771 } 772 773 #else /* not DEBUG */ 774 775 #undef assert 776 #define assert(e) 777 778 #define DEBUG_STATEMENT(e) 779 #define DEBUG_PRINT1(x) 780 #define DEBUG_PRINT2(x1, x2) 781 #define DEBUG_PRINT3(x1, x2, x3) 782 #define DEBUG_PRINT4(x1, x2, x3, x4) 783 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 784 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 785 786 #endif /* not DEBUG */ 787 788 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 789 also be assigned to arbitrarily: each pattern buffer stores its own 790 syntax, so it can be changed between regex compilations. */ 791 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; 792 793 794 /* Specify the precise syntax of regexps for compilation. This provides 795 for compatibility for various utilities which historically have 796 different, incompatible syntaxes. 797 798 The argument SYNTAX is a bit mask comprised of the various bits 799 defined in regex.h. We return the old syntax. */ 800 801 reg_syntax_t 802 re_set_syntax (syntax) 803 reg_syntax_t syntax; 804 { 805 reg_syntax_t ret = re_syntax_options; 806 807 re_syntax_options = syntax; 808 return ret; 809 } 810 811 /* This table gives an error message for each of the error codes listed 812 in regex.h. Obviously the order here has to be same as there. */ 813 814 static const char *re_error_msg[] = 815 { NULL, /* REG_NOERROR */ 816 "No match", /* REG_NOMATCH */ 817 "Invalid regular expression", /* REG_BADPAT */ 818 "Invalid collation character", /* REG_ECOLLATE */ 819 "Invalid character class name", /* REG_ECTYPE */ 820 "Trailing backslash", /* REG_EESCAPE */ 821 "Invalid back reference", /* REG_ESUBREG */ 822 "Unmatched [ or [^", /* REG_EBRACK */ 823 "Unmatched ( or \\(", /* REG_EPAREN */ 824 "Unmatched \\{", /* REG_EBRACE */ 825 "Invalid content of \\{\\}", /* REG_BADBR */ 826 "Invalid range end", /* REG_ERANGE */ 827 "Memory exhausted", /* REG_ESPACE */ 828 "Invalid preceding regular expression", /* REG_BADRPT */ 829 "Premature end of regular expression", /* REG_EEND */ 830 "Regular expression too big", /* REG_ESIZE */ 831 "Unmatched ) or \\)", /* REG_ERPAREN */ 832 }; 833 834 /* Subroutine declarations and macros for regex_compile. */ 835 836 static void store_op1 (), store_op2 (); 837 static void insert_op1 (), insert_op2 (); 838 static boolean at_begline_loc_p (), at_endline_loc_p (); 839 static boolean group_in_compile_stack (); 840 static reg_errcode_t compile_range (); 841 842 /* Fetch the next character in the uncompiled pattern---translating it 843 if necessary. Also cast from a signed character in the constant 844 string passed to us by the user to an unsigned char that we can use 845 as an array index (in, e.g., `translate'). */ 846 #define PATFETCH(c) \ 847 do {if (p == pend) return REG_EEND; \ 848 c = (unsigned char) *p++; \ 849 if (translate) c = translate[c]; \ 850 } while (0) 851 852 /* Fetch the next character in the uncompiled pattern, with no 853 translation. */ 854 #define PATFETCH_RAW(c) \ 855 do {if (p == pend) return REG_EEND; \ 856 c = (unsigned char) *p++; \ 857 } while (0) 858 859 /* Go backwards one character in the pattern. */ 860 #define PATUNFETCH p-- 861 862 863 /* If `translate' is non-null, return translate[D], else just D. We 864 cast the subscript to translate because some data is declared as 865 `char *', to avoid warnings when a string constant is passed. But 866 when we use a character as a subscript we must make it unsigned. */ 867 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) 868 869 870 /* Macros for outputting the compiled pattern into `buffer'. */ 871 872 /* If the buffer isn't allocated when it comes in, use this. */ 873 #define INIT_BUF_SIZE 32 874 875 /* Make sure we have at least N more bytes of space in buffer. */ 876 #define GET_BUFFER_SPACE(n) \ 877 while (b - bufp->buffer + (n) > bufp->allocated) \ 878 EXTEND_BUFFER () 879 880 /* Make sure we have one more byte of buffer space and then add C to it. */ 881 #define BUF_PUSH(c) \ 882 do { \ 883 GET_BUFFER_SPACE (1); \ 884 *b++ = (unsigned char) (c); \ 885 } while (0) 886 887 888 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 889 #define BUF_PUSH_2(c1, c2) \ 890 do { \ 891 GET_BUFFER_SPACE (2); \ 892 *b++ = (unsigned char) (c1); \ 893 *b++ = (unsigned char) (c2); \ 894 } while (0) 895 896 897 /* As with BUF_PUSH_2, except for three bytes. */ 898 #define BUF_PUSH_3(c1, c2, c3) \ 899 do { \ 900 GET_BUFFER_SPACE (3); \ 901 *b++ = (unsigned char) (c1); \ 902 *b++ = (unsigned char) (c2); \ 903 *b++ = (unsigned char) (c3); \ 904 } while (0) 905 906 907 /* Store a jump with opcode OP at LOC to location TO. We store a 908 relative address offset by the three bytes the jump itself occupies. */ 909 #define STORE_JUMP(op, loc, to) \ 910 store_op1 (op, loc, (to) - (loc) - 3) 911 912 /* Likewise, for a two-argument jump. */ 913 #define STORE_JUMP2(op, loc, to, arg) \ 914 store_op2 (op, loc, (to) - (loc) - 3, arg) 915 916 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 917 #define INSERT_JUMP(op, loc, to) \ 918 insert_op1 (op, loc, (to) - (loc) - 3, b) 919 920 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 921 #define INSERT_JUMP2(op, loc, to, arg) \ 922 insert_op2 (op, loc, (to) - (loc) - 3, arg, b) 923 924 925 /* This is not an arbitrary limit: the arguments which represent offsets 926 into the pattern are two bytes long. So if 2^16 bytes turns out to 927 be too small, many things would have to change. */ 928 #define MAX_BUF_SIZE (1L << 16) 929 930 931 /* Extend the buffer by twice its current size via realloc and 932 reset the pointers that pointed into the old block to point to the 933 correct places in the new one. If extending the buffer results in it 934 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ 935 #define EXTEND_BUFFER() \ 936 do { \ 937 unsigned char *old_buffer = bufp->buffer; \ 938 if (bufp->allocated == MAX_BUF_SIZE) \ 939 return REG_ESIZE; \ 940 bufp->allocated <<= 1; \ 941 if (bufp->allocated > MAX_BUF_SIZE) \ 942 bufp->allocated = MAX_BUF_SIZE; \ 943 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ 944 if (bufp->buffer == NULL) \ 945 return REG_ESPACE; \ 946 /* If the buffer moved, move all the pointers into it. */ \ 947 if (old_buffer != bufp->buffer) \ 948 { \ 949 b = (b - old_buffer) + bufp->buffer; \ 950 begalt = (begalt - old_buffer) + bufp->buffer; \ 951 if (fixup_alt_jump) \ 952 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ 953 if (laststart) \ 954 laststart = (laststart - old_buffer) + bufp->buffer; \ 955 if (pending_exact) \ 956 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ 957 } \ 958 } while (0) 959 960 961 /* Since we have one byte reserved for the register number argument to 962 {start,stop}_memory, the maximum number of groups we can report 963 things about is what fits in that byte. */ 964 #define MAX_REGNUM 255 965 966 /* But patterns can have more than `MAX_REGNUM' registers. We just 967 ignore the excess. */ 968 typedef unsigned regnum_t; 969 970 971 /* Macros for the compile stack. */ 972 973 /* Since offsets can go either forwards or backwards, this type needs to 974 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 975 typedef int pattern_offset_t; 976 977 typedef struct 978 { 979 pattern_offset_t begalt_offset; 980 pattern_offset_t fixup_alt_jump; 981 pattern_offset_t inner_group_offset; 982 pattern_offset_t laststart_offset; 983 regnum_t regnum; 984 } compile_stack_elt_t; 985 986 987 typedef struct 988 { 989 compile_stack_elt_t *stack; 990 unsigned size; 991 unsigned avail; /* Offset of next open position. */ 992 } compile_stack_type; 993 994 995 #define INIT_COMPILE_STACK_SIZE 32 996 997 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 998 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 999 1000 /* The next available element. */ 1001 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 1002 1003 1004 /* Set the bit for character C in a list. */ 1005 #define SET_LIST_BIT(c) \ 1006 (b[((unsigned char) (c)) / BYTEWIDTH] \ 1007 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 1008 1009 1010 /* Get the next unsigned number in the uncompiled pattern. */ 1011 #define GET_UNSIGNED_NUMBER(num) \ 1012 { if (p != pend) \ 1013 { \ 1014 PATFETCH (c); \ 1015 while (ISDIGIT (c)) \ 1016 { \ 1017 if (num < 0) \ 1018 num = 0; \ 1019 num = num * 10 + c - '0'; \ 1020 if (p == pend) \ 1021 break; \ 1022 PATFETCH (c); \ 1023 } \ 1024 } \ 1025 } 1026 1027 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 1028 1029 #define IS_CHAR_CLASS(string) \ 1030 (STREQ (string, "alpha") || STREQ (string, "upper") \ 1031 || STREQ (string, "lower") || STREQ (string, "digit") \ 1032 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 1033 || STREQ (string, "space") || STREQ (string, "print") \ 1034 || STREQ (string, "punct") || STREQ (string, "graph") \ 1035 || STREQ (string, "cntrl") || STREQ (string, "blank")) 1036 1037 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 1038 Returns one of error codes defined in `regex.h', or zero for success. 1039 1040 Assumes the `allocated' (and perhaps `buffer') and `translate' 1041 fields are set in BUFP on entry. 1042 1043 If it succeeds, results are put in BUFP (if it returns an error, the 1044 contents of BUFP are undefined): 1045 `buffer' is the compiled pattern; 1046 `syntax' is set to SYNTAX; 1047 `used' is set to the length of the compiled pattern; 1048 `fastmap_accurate' is zero; 1049 `re_nsub' is the number of subexpressions in PATTERN; 1050 `not_bol' and `not_eol' are zero; 1051 1052 The `fastmap' and `newline_anchor' fields are neither 1053 examined nor set. */ 1054 1055 static reg_errcode_t 1056 regex_compile (pattern, size, syntax, bufp) 1057 const char *pattern; 1058 int size; 1059 reg_syntax_t syntax; 1060 struct re_pattern_buffer *bufp; 1061 { 1062 /* We fetch characters from PATTERN here. Even though PATTERN is 1063 `char *' (i.e., signed), we declare these variables as unsigned, so 1064 they can be reliably used as array indices. */ 1065 register unsigned char c, c1; 1066 1067 /* A random tempory spot in PATTERN. */ 1068 const char *p1; 1069 1070 /* Points to the end of the buffer, where we should append. */ 1071 register unsigned char *b; 1072 1073 /* Keeps track of unclosed groups. */ 1074 compile_stack_type compile_stack; 1075 1076 /* Points to the current (ending) position in the pattern. */ 1077 const char *p = pattern; 1078 const char *pend = pattern + size; 1079 1080 /* How to translate the characters in the pattern. */ 1081 char *translate = bufp->translate; 1082 1083 /* Address of the count-byte of the most recently inserted `exactn' 1084 command. This makes it possible to tell if a new exact-match 1085 character can be added to that command or if the character requires 1086 a new `exactn' command. */ 1087 unsigned char *pending_exact = 0; 1088 1089 /* Address of start of the most recently finished expression. 1090 This tells, e.g., postfix * where to find the start of its 1091 operand. Reset at the beginning of groups and alternatives. */ 1092 unsigned char *laststart = 0; 1093 1094 /* Address of beginning of regexp, or inside of last group. */ 1095 unsigned char *begalt; 1096 1097 /* Place in the uncompiled pattern (i.e., the {) to 1098 which to go back if the interval is invalid. */ 1099 const char *beg_interval; 1100 1101 /* Address of the place where a forward jump should go to the end of 1102 the containing expression. Each alternative of an `or' -- except the 1103 last -- ends with a forward jump of this sort. */ 1104 unsigned char *fixup_alt_jump = 0; 1105 1106 /* Counts open-groups as they are encountered. Remembered for the 1107 matching close-group on the compile stack, so the same register 1108 number is put in the stop_memory as the start_memory. */ 1109 regnum_t regnum = 0; 1110 1111 #ifdef DEBUG 1112 DEBUG_PRINT1 ("\nCompiling pattern: "); 1113 if (debug) 1114 { 1115 unsigned debug_count; 1116 1117 for (debug_count = 0; debug_count < size; debug_count++) 1118 printchar (pattern[debug_count]); 1119 putchar ('\n'); 1120 } 1121 #endif /* DEBUG */ 1122 1123 /* Initialize the compile stack. */ 1124 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 1125 if (compile_stack.stack == NULL) 1126 return REG_ESPACE; 1127 1128 compile_stack.size = INIT_COMPILE_STACK_SIZE; 1129 compile_stack.avail = 0; 1130 1131 /* Initialize the pattern buffer. */ 1132 bufp->syntax = syntax; 1133 bufp->fastmap_accurate = 0; 1134 bufp->not_bol = bufp->not_eol = 0; 1135 1136 /* Set `used' to zero, so that if we return an error, the pattern 1137 printer (for debugging) will think there's no pattern. We reset it 1138 at the end. */ 1139 bufp->used = 0; 1140 1141 /* Always count groups, whether or not bufp->no_sub is set. */ 1142 bufp->re_nsub = 0; 1143 1144 #if !defined (emacs) && !defined (SYNTAX_TABLE) 1145 /* Initialize the syntax table. */ 1146 init_syntax_once (); 1147 #endif 1148 1149 if (bufp->allocated == 0) 1150 { 1151 if (bufp->buffer) 1152 { /* If zero allocated, but buffer is non-null, try to realloc 1153 enough space. This loses if buffer's address is bogus, but 1154 that is the user's responsibility. */ 1155 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); 1156 } 1157 else 1158 { /* Caller did not allocate a buffer. Do it for them. */ 1159 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); 1160 } 1161 if (!bufp->buffer) return REG_ESPACE; 1162 1163 bufp->allocated = INIT_BUF_SIZE; 1164 } 1165 1166 begalt = b = bufp->buffer; 1167 1168 /* Loop through the uncompiled pattern until we're at the end. */ 1169 while (p != pend) 1170 { 1171 PATFETCH (c); 1172 1173 switch (c) 1174 { 1175 case '^': 1176 { 1177 if ( /* If at start of pattern, it's an operator. */ 1178 p == pattern + 1 1179 /* If context independent, it's an operator. */ 1180 || syntax & RE_CONTEXT_INDEP_ANCHORS 1181 /* Otherwise, depends on what's come before. */ 1182 || at_begline_loc_p (pattern, p, syntax)) 1183 BUF_PUSH (begline); 1184 else 1185 goto normal_char; 1186 } 1187 break; 1188 1189 1190 case '$': 1191 { 1192 if ( /* If at end of pattern, it's an operator. */ 1193 p == pend 1194 /* If context independent, it's an operator. */ 1195 || syntax & RE_CONTEXT_INDEP_ANCHORS 1196 /* Otherwise, depends on what's next. */ 1197 || at_endline_loc_p (p, pend, syntax)) 1198 BUF_PUSH (endline); 1199 else 1200 goto normal_char; 1201 } 1202 break; 1203 1204 1205 case '+': 1206 case '?': 1207 if ((syntax & RE_BK_PLUS_QM) 1208 || (syntax & RE_LIMITED_OPS)) 1209 goto normal_char; 1210 handle_plus: 1211 case '*': 1212 /* If there is no previous pattern... */ 1213 if (!laststart) 1214 { 1215 if (syntax & RE_CONTEXT_INVALID_OPS) 1216 return REG_BADRPT; 1217 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 1218 goto normal_char; 1219 } 1220 1221 { 1222 /* Are we optimizing this jump? */ 1223 boolean keep_string_p = false; 1224 1225 /* 1 means zero (many) matches is allowed. */ 1226 char zero_times_ok = 0, many_times_ok = 0; 1227 1228 /* If there is a sequence of repetition chars, collapse it 1229 down to just one (the right one). We can't combine 1230 interval operators with these because of, e.g., `a{2}*', 1231 which should only match an even number of `a's. */ 1232 1233 for (;;) 1234 { 1235 zero_times_ok |= c != '+'; 1236 many_times_ok |= c != '?'; 1237 1238 if (p == pend) 1239 break; 1240 1241 PATFETCH (c); 1242 1243 if (c == '*' 1244 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 1245 ; 1246 1247 else if (syntax & RE_BK_PLUS_QM && c == '\\') 1248 { 1249 if (p == pend) return REG_EESCAPE; 1250 1251 PATFETCH (c1); 1252 if (!(c1 == '+' || c1 == '?')) 1253 { 1254 PATUNFETCH; 1255 PATUNFETCH; 1256 break; 1257 } 1258 1259 c = c1; 1260 } 1261 else 1262 { 1263 PATUNFETCH; 1264 break; 1265 } 1266 1267 /* If we get here, we found another repeat character. */ 1268 } 1269 1270 /* Star, etc. applied to an empty pattern is equivalent 1271 to an empty pattern. */ 1272 if (!laststart) 1273 break; 1274 1275 /* Now we know whether or not zero matches is allowed 1276 and also whether or not two or more matches is allowed. */ 1277 if (many_times_ok) 1278 { /* More than one repetition is allowed, so put in at the 1279 end a backward relative jump from `b' to before the next 1280 jump we're going to put in below (which jumps from 1281 laststart to after this jump). 1282 1283 But if we are at the `*' in the exact sequence `.*\n', 1284 insert an unconditional jump backwards to the ., 1285 instead of the beginning of the loop. This way we only 1286 push a failure point once, instead of every time 1287 through the loop. */ 1288 assert (p - 1 > pattern); 1289 1290 /* Allocate the space for the jump. */ 1291 GET_BUFFER_SPACE (3); 1292 1293 /* We know we are not at the first character of the pattern, 1294 because laststart was nonzero. And we've already 1295 incremented `p', by the way, to be the character after 1296 the `*'. Do we have to do something analogous here 1297 for null bytes, because of RE_DOT_NOT_NULL? */ 1298 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 1299 && zero_times_ok 1300 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 1301 && !(syntax & RE_DOT_NEWLINE)) 1302 { /* We have .*\n. */ 1303 STORE_JUMP (jump, b, laststart); 1304 keep_string_p = true; 1305 } 1306 else 1307 /* Anything else. */ 1308 STORE_JUMP (maybe_pop_jump, b, laststart - 3); 1309 1310 /* We've added more stuff to the buffer. */ 1311 b += 3; 1312 } 1313 1314 /* On failure, jump from laststart to b + 3, which will be the 1315 end of the buffer after this jump is inserted. */ 1316 GET_BUFFER_SPACE (3); 1317 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 1318 : on_failure_jump, 1319 laststart, b + 3); 1320 pending_exact = 0; 1321 b += 3; 1322 1323 if (!zero_times_ok) 1324 { 1325 /* At least one repetition is required, so insert a 1326 `dummy_failure_jump' before the initial 1327 `on_failure_jump' instruction of the loop. This 1328 effects a skip over that instruction the first time 1329 we hit that loop. */ 1330 GET_BUFFER_SPACE (3); 1331 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); 1332 b += 3; 1333 } 1334 } 1335 break; 1336 1337 1338 case '.': 1339 laststart = b; 1340 BUF_PUSH (anychar); 1341 break; 1342 1343 1344 case '[': 1345 { 1346 boolean had_char_class = false; 1347 1348 if (p == pend) return REG_EBRACK; 1349 1350 /* Ensure that we have enough space to push a charset: the 1351 opcode, the length count, and the bitset; 34 bytes in all. */ 1352 GET_BUFFER_SPACE (34); 1353 1354 laststart = b; 1355 1356 /* We test `*p == '^' twice, instead of using an if 1357 statement, so we only need one BUF_PUSH. */ 1358 BUF_PUSH (*p == '^' ? charset_not : charset); 1359 if (*p == '^') 1360 p++; 1361 1362 /* Remember the first position in the bracket expression. */ 1363 p1 = p; 1364 1365 /* Push the number of bytes in the bitmap. */ 1366 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 1367 1368 /* Clear the whole map. */ 1369 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 1370 1371 /* charset_not matches newline according to a syntax bit. */ 1372 if ((re_opcode_t) b[-2] == charset_not 1373 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 1374 SET_LIST_BIT ('\n'); 1375 1376 /* Read in characters and ranges, setting map bits. */ 1377 for (;;) 1378 { 1379 if (p == pend) return REG_EBRACK; 1380 1381 PATFETCH (c); 1382 1383 /* \ might escape characters inside [...] and [^...]. */ 1384 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 1385 { 1386 if (p == pend) return REG_EESCAPE; 1387 1388 PATFETCH (c1); 1389 SET_LIST_BIT (c1); 1390 continue; 1391 } 1392 1393 /* Could be the end of the bracket expression. If it's 1394 not (i.e., when the bracket expression is `[]' so 1395 far), the ']' character bit gets set way below. */ 1396 if (c == ']' && p != p1 + 1) 1397 break; 1398 1399 /* Look ahead to see if it's a range when the last thing 1400 was a character class. */ 1401 if (had_char_class && c == '-' && *p != ']') 1402 return REG_ERANGE; 1403 1404 /* Look ahead to see if it's a range when the last thing 1405 was a character: if this is a hyphen not at the 1406 beginning or the end of a list, then it's the range 1407 operator. */ 1408 if (c == '-' 1409 && !(p - 2 >= pattern && p[-2] == '[') 1410 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 1411 && *p != ']') 1412 { 1413 reg_errcode_t ret 1414 = compile_range (&p, pend, translate, syntax, b); 1415 if (ret != REG_NOERROR) return ret; 1416 } 1417 1418 else if (p[0] == '-' && p[1] != ']') 1419 { /* This handles ranges made up of characters only. */ 1420 reg_errcode_t ret; 1421 1422 /* Move past the `-'. */ 1423 PATFETCH (c1); 1424 1425 ret = compile_range (&p, pend, translate, syntax, b); 1426 if (ret != REG_NOERROR) return ret; 1427 } 1428 1429 /* See if we're at the beginning of a possible character 1430 class. */ 1431 1432 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 1433 { /* Leave room for the null. */ 1434 char str[CHAR_CLASS_MAX_LENGTH + 1]; 1435 1436 PATFETCH (c); 1437 c1 = 0; 1438 1439 /* If pattern is `[[:'. */ 1440 if (p == pend) return REG_EBRACK; 1441 1442 for (;;) 1443 { 1444 PATFETCH (c); 1445 if (c == ':' || c == ']' || p == pend 1446 || c1 == CHAR_CLASS_MAX_LENGTH) 1447 break; 1448 str[c1++] = c; 1449 } 1450 str[c1] = '\0'; 1451 1452 /* If isn't a word bracketed by `[:' and:`]': 1453 undo the ending character, the letters, and leave 1454 the leading `:' and `[' (but set bits for them). */ 1455 if (c == ':' && *p == ']') 1456 { 1457 int ch; 1458 boolean is_alnum = STREQ (str, "alnum"); 1459 boolean is_alpha = STREQ (str, "alpha"); 1460 boolean is_blank = STREQ (str, "blank"); 1461 boolean is_cntrl = STREQ (str, "cntrl"); 1462 boolean is_digit = STREQ (str, "digit"); 1463 boolean is_graph = STREQ (str, "graph"); 1464 boolean is_lower = STREQ (str, "lower"); 1465 boolean is_print = STREQ (str, "print"); 1466 boolean is_punct = STREQ (str, "punct"); 1467 boolean is_space = STREQ (str, "space"); 1468 boolean is_upper = STREQ (str, "upper"); 1469 boolean is_xdigit = STREQ (str, "xdigit"); 1470 1471 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; 1472 1473 /* Throw away the ] at the end of the character 1474 class. */ 1475 PATFETCH (c); 1476 1477 if (p == pend) return REG_EBRACK; 1478 1479 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 1480 { 1481 if ( (is_alnum && ISALNUM (ch)) 1482 || (is_alpha && ISALPHA (ch)) 1483 || (is_blank && ISBLANK (ch)) 1484 || (is_cntrl && ISCNTRL (ch)) 1485 || (is_digit && ISDIGIT (ch)) 1486 || (is_graph && ISGRAPH (ch)) 1487 || (is_lower && ISLOWER (ch)) 1488 || (is_print && ISPRINT (ch)) 1489 || (is_punct && ISPUNCT (ch)) 1490 || (is_space && ISSPACE (ch)) 1491 || (is_upper && ISUPPER (ch)) 1492 || (is_xdigit && ISXDIGIT (ch))) 1493 SET_LIST_BIT (ch); 1494 } 1495 had_char_class = true; 1496 } 1497 else 1498 { 1499 c1++; 1500 while (c1--) 1501 PATUNFETCH; 1502 SET_LIST_BIT ('['); 1503 SET_LIST_BIT (':'); 1504 had_char_class = false; 1505 } 1506 } 1507 else 1508 { 1509 had_char_class = false; 1510 SET_LIST_BIT (c); 1511 } 1512 } 1513 1514 /* Discard any (non)matching list bytes that are all 0 at the 1515 end of the map. Decrease the map-length byte too. */ 1516 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 1517 b[-1]--; 1518 b += b[-1]; 1519 } 1520 break; 1521 1522 1523 case '(': 1524 if (syntax & RE_NO_BK_PARENS) 1525 goto handle_open; 1526 else 1527 goto normal_char; 1528 1529 1530 case ')': 1531 if (syntax & RE_NO_BK_PARENS) 1532 goto handle_close; 1533 else 1534 goto normal_char; 1535 1536 1537 case '\n': 1538 if (syntax & RE_NEWLINE_ALT) 1539 goto handle_alt; 1540 else 1541 goto normal_char; 1542 1543 1544 case '|': 1545 if (syntax & RE_NO_BK_VBAR) 1546 goto handle_alt; 1547 else 1548 goto normal_char; 1549 1550 1551 case '{': 1552 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 1553 goto handle_interval; 1554 else 1555 goto normal_char; 1556 1557 1558 case '\\': 1559 if (p == pend) return REG_EESCAPE; 1560 1561 /* Do not translate the character after the \, so that we can 1562 distinguish, e.g., \B from \b, even if we normally would 1563 translate, e.g., B to b. */ 1564 PATFETCH_RAW (c); 1565 1566 switch (c) 1567 { 1568 case '(': 1569 if (syntax & RE_NO_BK_PARENS) 1570 goto normal_backslash; 1571 1572 handle_open: 1573 bufp->re_nsub++; 1574 regnum++; 1575 1576 if (COMPILE_STACK_FULL) 1577 { 1578 RETALLOC (compile_stack.stack, compile_stack.size << 1, 1579 compile_stack_elt_t); 1580 if (compile_stack.stack == NULL) return REG_ESPACE; 1581 1582 compile_stack.size <<= 1; 1583 } 1584 1585 /* These are the values to restore when we hit end of this 1586 group. They are all relative offsets, so that if the 1587 whole pattern moves because of realloc, they will still 1588 be valid. */ 1589 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; 1590 COMPILE_STACK_TOP.fixup_alt_jump 1591 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; 1592 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; 1593 COMPILE_STACK_TOP.regnum = regnum; 1594 1595 /* We will eventually replace the 0 with the number of 1596 groups inner to this one. But do not push a 1597 start_memory for groups beyond the last one we can 1598 represent in the compiled pattern. */ 1599 if (regnum <= MAX_REGNUM) 1600 { 1601 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; 1602 BUF_PUSH_3 (start_memory, regnum, 0); 1603 } 1604 1605 compile_stack.avail++; 1606 1607 fixup_alt_jump = 0; 1608 laststart = 0; 1609 begalt = b; 1610 /* If we've reached MAX_REGNUM groups, then this open 1611 won't actually generate any code, so we'll have to 1612 clear pending_exact explicitly. */ 1613 pending_exact = 0; 1614 break; 1615 1616 1617 case ')': 1618 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 1619 1620 if (COMPILE_STACK_EMPTY) 1621 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 1622 goto normal_backslash; 1623 else 1624 return REG_ERPAREN; 1625 1626 handle_close: 1627 if (fixup_alt_jump) 1628 { /* Push a dummy failure point at the end of the 1629 alternative for a possible future 1630 `pop_failure_jump' to pop. See comments at 1631 `push_dummy_failure' in `re_match_2'. */ 1632 BUF_PUSH (push_dummy_failure); 1633 1634 /* We allocated space for this jump when we assigned 1635 to `fixup_alt_jump', in the `handle_alt' case below. */ 1636 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 1637 } 1638 1639 /* See similar code for backslashed left paren above. */ 1640 if (COMPILE_STACK_EMPTY) 1641 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 1642 goto normal_char; 1643 else 1644 return REG_ERPAREN; 1645 1646 /* Since we just checked for an empty stack above, this 1647 ``can't happen''. */ 1648 assert (compile_stack.avail != 0); 1649 { 1650 /* We don't just want to restore into `regnum', because 1651 later groups should continue to be numbered higher, 1652 as in `(ab)c(de)' -- the second group is #2. */ 1653 regnum_t this_group_regnum; 1654 1655 compile_stack.avail--; 1656 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; 1657 fixup_alt_jump 1658 = COMPILE_STACK_TOP.fixup_alt_jump 1659 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 1660 : 0; 1661 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; 1662 this_group_regnum = COMPILE_STACK_TOP.regnum; 1663 /* If we've reached MAX_REGNUM groups, then this open 1664 won't actually generate any code, so we'll have to 1665 clear pending_exact explicitly. */ 1666 pending_exact = 0; 1667 1668 /* We're at the end of the group, so now we know how many 1669 groups were inside this one. */ 1670 if (this_group_regnum <= MAX_REGNUM) 1671 { 1672 unsigned char *inner_group_loc 1673 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; 1674 1675 *inner_group_loc = regnum - this_group_regnum; 1676 BUF_PUSH_3 (stop_memory, this_group_regnum, 1677 regnum - this_group_regnum); 1678 } 1679 } 1680 break; 1681 1682 1683 case '|': /* `\|'. */ 1684 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 1685 goto normal_backslash; 1686 handle_alt: 1687 if (syntax & RE_LIMITED_OPS) 1688 goto normal_char; 1689 1690 /* Insert before the previous alternative a jump which 1691 jumps to this alternative if the former fails. */ 1692 GET_BUFFER_SPACE (3); 1693 INSERT_JUMP (on_failure_jump, begalt, b + 6); 1694 pending_exact = 0; 1695 b += 3; 1696 1697 /* The alternative before this one has a jump after it 1698 which gets executed if it gets matched. Adjust that 1699 jump so it will jump to this alternative's analogous 1700 jump (put in below, which in turn will jump to the next 1701 (if any) alternative's such jump, etc.). The last such 1702 jump jumps to the correct final destination. A picture: 1703 _____ _____ 1704 | | | | 1705 | v | v 1706 a | b | c 1707 1708 If we are at `b', then fixup_alt_jump right now points to a 1709 three-byte space after `a'. We'll put in the jump, set 1710 fixup_alt_jump to right after `b', and leave behind three 1711 bytes which we'll fill in when we get to after `c'. */ 1712 1713 if (fixup_alt_jump) 1714 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 1715 1716 /* Mark and leave space for a jump after this alternative, 1717 to be filled in later either by next alternative or 1718 when know we're at the end of a series of alternatives. */ 1719 fixup_alt_jump = b; 1720 GET_BUFFER_SPACE (3); 1721 b += 3; 1722 1723 laststart = 0; 1724 begalt = b; 1725 break; 1726 1727 1728 case '{': 1729 /* If \{ is a literal. */ 1730 if (!(syntax & RE_INTERVALS) 1731 /* If we're at `\{' and it's not the open-interval 1732 operator. */ 1733 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1734 || (p - 2 == pattern && p == pend)) 1735 goto normal_backslash; 1736 1737 handle_interval: 1738 { 1739 /* If got here, then the syntax allows intervals. */ 1740 1741 /* At least (most) this many matches must be made. */ 1742 int lower_bound = -1, upper_bound = -1; 1743 1744 beg_interval = p - 1; 1745 1746 if (p == pend) 1747 { 1748 if (syntax & RE_NO_BK_BRACES) 1749 goto unfetch_interval; 1750 else 1751 return REG_EBRACE; 1752 } 1753 1754 GET_UNSIGNED_NUMBER (lower_bound); 1755 1756 if (c == ',') 1757 { 1758 GET_UNSIGNED_NUMBER (upper_bound); 1759 if (upper_bound < 0) upper_bound = RE_DUP_MAX; 1760 } 1761 else 1762 /* Interval such as `{1}' => match exactly once. */ 1763 upper_bound = lower_bound; 1764 1765 if (lower_bound < 0 || upper_bound > RE_DUP_MAX 1766 || lower_bound > upper_bound) 1767 { 1768 if (syntax & RE_NO_BK_BRACES) 1769 goto unfetch_interval; 1770 else 1771 return REG_BADBR; 1772 } 1773 1774 if (!(syntax & RE_NO_BK_BRACES)) 1775 { 1776 if (c != '\\') return REG_EBRACE; 1777 1778 PATFETCH (c); 1779 } 1780 1781 if (c != '}') 1782 { 1783 if (syntax & RE_NO_BK_BRACES) 1784 goto unfetch_interval; 1785 else 1786 return REG_BADBR; 1787 } 1788 1789 /* We just parsed a valid interval. */ 1790 1791 /* If it's invalid to have no preceding re. */ 1792 if (!laststart) 1793 { 1794 if (syntax & RE_CONTEXT_INVALID_OPS) 1795 return REG_BADRPT; 1796 else if (syntax & RE_CONTEXT_INDEP_OPS) 1797 laststart = b; 1798 else 1799 goto unfetch_interval; 1800 } 1801 1802 /* If the upper bound is zero, don't want to succeed at 1803 all; jump from `laststart' to `b + 3', which will be 1804 the end of the buffer after we insert the jump. */ 1805 if (upper_bound == 0) 1806 { 1807 GET_BUFFER_SPACE (3); 1808 INSERT_JUMP (jump, laststart, b + 3); 1809 b += 3; 1810 } 1811 1812 /* Otherwise, we have a nontrivial interval. When 1813 we're all done, the pattern will look like: 1814 set_number_at <jump count> <upper bound> 1815 set_number_at <succeed_n count> <lower bound> 1816 succeed_n <after jump addr> <succed_n count> 1817 <body of loop> 1818 jump_n <succeed_n addr> <jump count> 1819 (The upper bound and `jump_n' are omitted if 1820 `upper_bound' is 1, though.) */ 1821 else 1822 { /* If the upper bound is > 1, we need to insert 1823 more at the end of the loop. */ 1824 unsigned nbytes = 10 + (upper_bound > 1) * 10; 1825 1826 GET_BUFFER_SPACE (nbytes); 1827 1828 /* Initialize lower bound of the `succeed_n', even 1829 though it will be set during matching by its 1830 attendant `set_number_at' (inserted next), 1831 because `re_compile_fastmap' needs to know. 1832 Jump to the `jump_n' we might insert below. */ 1833 INSERT_JUMP2 (succeed_n, laststart, 1834 b + 5 + (upper_bound > 1) * 5, 1835 lower_bound); 1836 b += 5; 1837 1838 /* Code to initialize the lower bound. Insert 1839 before the `succeed_n'. The `5' is the last two 1840 bytes of this `set_number_at', plus 3 bytes of 1841 the following `succeed_n'. */ 1842 insert_op2 (set_number_at, laststart, 5, lower_bound, b); 1843 b += 5; 1844 1845 if (upper_bound > 1) 1846 { /* More than one repetition is allowed, so 1847 append a backward jump to the `succeed_n' 1848 that starts this interval. 1849 1850 When we've reached this during matching, 1851 we'll have matched the interval once, so 1852 jump back only `upper_bound - 1' times. */ 1853 STORE_JUMP2 (jump_n, b, laststart + 5, 1854 upper_bound - 1); 1855 b += 5; 1856 1857 /* The location we want to set is the second 1858 parameter of the `jump_n'; that is `b-2' as 1859 an absolute address. `laststart' will be 1860 the `set_number_at' we're about to insert; 1861 `laststart+3' the number to set, the source 1862 for the relative address. But we are 1863 inserting into the middle of the pattern -- 1864 so everything is getting moved up by 5. 1865 Conclusion: (b - 2) - (laststart + 3) + 5, 1866 i.e., b - laststart. 1867 1868 We insert this at the beginning of the loop 1869 so that if we fail during matching, we'll 1870 reinitialize the bounds. */ 1871 insert_op2 (set_number_at, laststart, b - laststart, 1872 upper_bound - 1, b); 1873 b += 5; 1874 } 1875 } 1876 pending_exact = 0; 1877 beg_interval = NULL; 1878 } 1879 break; 1880 1881 unfetch_interval: 1882 /* If an invalid interval, match the characters as literals. */ 1883 assert (beg_interval); 1884 p = beg_interval; 1885 beg_interval = NULL; 1886 1887 /* normal_char and normal_backslash need `c'. */ 1888 PATFETCH (c); 1889 1890 if (!(syntax & RE_NO_BK_BRACES)) 1891 { 1892 if (p > pattern && p[-1] == '\\') 1893 goto normal_backslash; 1894 } 1895 goto normal_char; 1896 1897 #ifdef emacs 1898 /* There is no way to specify the before_dot and after_dot 1899 operators. rms says this is ok. --karl */ 1900 case '=': 1901 BUF_PUSH (at_dot); 1902 break; 1903 1904 case 's': 1905 laststart = b; 1906 PATFETCH (c); 1907 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 1908 break; 1909 1910 case 'S': 1911 laststart = b; 1912 PATFETCH (c); 1913 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 1914 break; 1915 #endif /* emacs */ 1916 1917 1918 case 'w': 1919 laststart = b; 1920 BUF_PUSH (wordchar); 1921 break; 1922 1923 1924 case 'W': 1925 laststart = b; 1926 BUF_PUSH (notwordchar); 1927 break; 1928 1929 1930 case '<': 1931 BUF_PUSH (wordbeg); 1932 break; 1933 1934 case '>': 1935 BUF_PUSH (wordend); 1936 break; 1937 1938 case 'b': 1939 BUF_PUSH (wordbound); 1940 break; 1941 1942 case 'B': 1943 BUF_PUSH (notwordbound); 1944 break; 1945 1946 case '`': 1947 BUF_PUSH (begbuf); 1948 break; 1949 1950 case '\'': 1951 BUF_PUSH (endbuf); 1952 break; 1953 1954 case '1': case '2': case '3': case '4': case '5': 1955 case '6': case '7': case '8': case '9': 1956 if (syntax & RE_NO_BK_REFS) 1957 goto normal_char; 1958 1959 c1 = c - '0'; 1960 1961 if (c1 > regnum) 1962 return REG_ESUBREG; 1963 1964 /* Can't back reference to a subexpression if inside of it. */ 1965 if (group_in_compile_stack (compile_stack, c1)) 1966 goto normal_char; 1967 1968 laststart = b; 1969 BUF_PUSH_2 (duplicate, c1); 1970 break; 1971 1972 1973 case '+': 1974 case '?': 1975 if (syntax & RE_BK_PLUS_QM) 1976 goto handle_plus; 1977 else 1978 goto normal_backslash; 1979 1980 default: 1981 normal_backslash: 1982 /* You might think it would be useful for \ to mean 1983 not to translate; but if we don't translate it 1984 it will never match anything. */ 1985 c = TRANSLATE (c); 1986 goto normal_char; 1987 } 1988 break; 1989 1990 1991 default: 1992 /* Expects the character in `c'. */ 1993 normal_char: 1994 /* If no exactn currently being built. */ 1995 if (!pending_exact 1996 1997 /* If last exactn not at current position. */ 1998 || pending_exact + *pending_exact + 1 != b 1999 2000 /* We have only one byte following the exactn for the count. */ 2001 || *pending_exact == (1 << BYTEWIDTH) - 1 2002 2003 /* If followed by a repetition operator. */ 2004 || *p == '*' || *p == '^' 2005 || ((syntax & RE_BK_PLUS_QM) 2006 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 2007 : (*p == '+' || *p == '?')) 2008 || ((syntax & RE_INTERVALS) 2009 && ((syntax & RE_NO_BK_BRACES) 2010 ? *p == '{' 2011 : (p[0] == '\\' && p[1] == '{')))) 2012 { 2013 /* Start building a new exactn. */ 2014 2015 laststart = b; 2016 2017 BUF_PUSH_2 (exactn, 0); 2018 pending_exact = b - 1; 2019 } 2020 2021 BUF_PUSH (c); 2022 (*pending_exact)++; 2023 break; 2024 } /* switch (c) */ 2025 } /* while p != pend */ 2026 2027 2028 /* Through the pattern now. */ 2029 2030 if (fixup_alt_jump) 2031 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 2032 2033 if (!COMPILE_STACK_EMPTY) 2034 return REG_EPAREN; 2035 2036 free (compile_stack.stack); 2037 2038 /* We have succeeded; set the length of the buffer. */ 2039 bufp->used = b - bufp->buffer; 2040 2041 #ifdef DEBUG 2042 if (debug) 2043 { 2044 DEBUG_PRINT1 ("\nCompiled pattern: "); 2045 print_compiled_pattern (bufp); 2046 } 2047 #endif /* DEBUG */ 2048 2049 return REG_NOERROR; 2050 } /* regex_compile */ 2051 2052 /* Subroutines for `regex_compile'. */ 2053 2054 /* Store OP at LOC followed by two-byte integer parameter ARG. */ 2055 2056 static void 2057 store_op1 (op, loc, arg) 2058 re_opcode_t op; 2059 unsigned char *loc; 2060 int arg; 2061 { 2062 *loc = (unsigned char) op; 2063 STORE_NUMBER (loc + 1, arg); 2064 } 2065 2066 2067 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 2068 2069 static void 2070 store_op2 (op, loc, arg1, arg2) 2071 re_opcode_t op; 2072 unsigned char *loc; 2073 int arg1, arg2; 2074 { 2075 *loc = (unsigned char) op; 2076 STORE_NUMBER (loc + 1, arg1); 2077 STORE_NUMBER (loc + 3, arg2); 2078 } 2079 2080 2081 /* Copy the bytes from LOC to END to open up three bytes of space at LOC 2082 for OP followed by two-byte integer parameter ARG. */ 2083 2084 static void 2085 insert_op1 (op, loc, arg, end) 2086 re_opcode_t op; 2087 unsigned char *loc; 2088 int arg; 2089 unsigned char *end; 2090 { 2091 register unsigned char *pfrom = end; 2092 register unsigned char *pto = end + 3; 2093 2094 while (pfrom != loc) 2095 *--pto = *--pfrom; 2096 2097 store_op1 (op, loc, arg); 2098 } 2099 2100 2101 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 2102 2103 static void 2104 insert_op2 (op, loc, arg1, arg2, end) 2105 re_opcode_t op; 2106 unsigned char *loc; 2107 int arg1, arg2; 2108 unsigned char *end; 2109 { 2110 register unsigned char *pfrom = end; 2111 register unsigned char *pto = end + 5; 2112 2113 while (pfrom != loc) 2114 *--pto = *--pfrom; 2115 2116 store_op2 (op, loc, arg1, arg2); 2117 } 2118 2119 2120 /* P points to just after a ^ in PATTERN. Return true if that ^ comes 2121 after an alternative or a begin-subexpression. We assume there is at 2122 least one character before the ^. */ 2123 2124 static boolean 2125 at_begline_loc_p (pattern, p, syntax) 2126 const char *pattern, *p; 2127 reg_syntax_t syntax; 2128 { 2129 const char *prev = p - 2; 2130 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 2131 2132 return 2133 /* After a subexpression? */ 2134 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 2135 /* After an alternative? */ 2136 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 2137 } 2138 2139 2140 /* The dual of at_begline_loc_p. This one is for $. We assume there is 2141 at least one character after the $, i.e., `P < PEND'. */ 2142 2143 static boolean 2144 at_endline_loc_p (p, pend, syntax) 2145 const char *p, *pend; 2146 int syntax; 2147 { 2148 const char *next = p; 2149 boolean next_backslash = *next == '\\'; 2150 const char *next_next = p + 1 < pend ? p + 1 : NULL; 2151 2152 return 2153 /* Before a subexpression? */ 2154 (syntax & RE_NO_BK_PARENS ? *next == ')' 2155 : next_backslash && next_next && *next_next == ')') 2156 /* Before an alternative? */ 2157 || (syntax & RE_NO_BK_VBAR ? *next == '|' 2158 : next_backslash && next_next && *next_next == '|'); 2159 } 2160 2161 2162 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and 2163 false if it's not. */ 2164 2165 static boolean 2166 group_in_compile_stack (compile_stack, regnum) 2167 compile_stack_type compile_stack; 2168 regnum_t regnum; 2169 { 2170 int this_element; 2171 2172 for (this_element = compile_stack.avail - 1; 2173 this_element >= 0; 2174 this_element--) 2175 if (compile_stack.stack[this_element].regnum == regnum) 2176 return true; 2177 2178 return false; 2179 } 2180 2181 2182 /* Read the ending character of a range (in a bracket expression) from the 2183 uncompiled pattern *P_PTR (which ends at PEND). We assume the 2184 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 2185 Then we set the translation of all bits between the starting and 2186 ending characters (inclusive) in the compiled pattern B. 2187 2188 Return an error code. 2189 2190 We use these short variable names so we can use the same macros as 2191 `regex_compile' itself. */ 2192 2193 static reg_errcode_t 2194 compile_range (p_ptr, pend, translate, syntax, b) 2195 const char **p_ptr, *pend; 2196 char *translate; 2197 reg_syntax_t syntax; 2198 unsigned char *b; 2199 { 2200 unsigned this_char; 2201 2202 const char *p = *p_ptr; 2203 int range_start, range_end; 2204 2205 if (p == pend) 2206 return REG_ERANGE; 2207 2208 /* Even though the pattern is a signed `char *', we need to fetch 2209 with unsigned char *'s; if the high bit of the pattern character 2210 is set, the range endpoints will be negative if we fetch using a 2211 signed char *. 2212 2213 We also want to fetch the endpoints without translating them; the 2214 appropriate translation is done in the bit-setting loop below. */ 2215 range_start = ((unsigned char *) p)[-2]; 2216 range_end = ((unsigned char *) p)[0]; 2217 2218 /* Have to increment the pointer into the pattern string, so the 2219 caller isn't still at the ending character. */ 2220 (*p_ptr)++; 2221 2222 /* If the start is after the end, the range is empty. */ 2223 if (range_start > range_end) 2224 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 2225 2226 /* Here we see why `this_char' has to be larger than an `unsigned 2227 char' -- the range is inclusive, so if `range_end' == 0xff 2228 (assuming 8-bit characters), we would otherwise go into an infinite 2229 loop, since all characters <= 0xff. */ 2230 for (this_char = range_start; this_char <= range_end; this_char++) 2231 { 2232 SET_LIST_BIT (TRANSLATE (this_char)); 2233 } 2234 2235 return REG_NOERROR; 2236 } 2237 2238 /* Failure stack declarations and macros; both re_compile_fastmap and 2239 re_match_2 use a failure stack. These have to be macros because of 2240 REGEX_ALLOCATE. */ 2241 2242 2243 /* Number of failure points for which to initially allocate space 2244 when matching. If this number is exceeded, we allocate more 2245 space, so it is not a hard limit. */ 2246 #ifndef INIT_FAILURE_ALLOC 2247 #define INIT_FAILURE_ALLOC 5 2248 #endif 2249 2250 /* Roughly the maximum number of failure points on the stack. Would be 2251 exactly that if always used MAX_FAILURE_SPACE each time we failed. 2252 This is a variable only so users of regex can assign to it; we never 2253 change it ourselves. */ 2254 int re_max_failures = 2000; 2255 2256 typedef const unsigned char *fail_stack_elt_t; 2257 2258 typedef struct 2259 { 2260 fail_stack_elt_t *stack; 2261 unsigned size; 2262 unsigned avail; /* Offset of next open position. */ 2263 } fail_stack_type; 2264 2265 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 2266 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 2267 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 2268 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail]) 2269 2270 2271 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */ 2272 2273 #define INIT_FAIL_STACK() \ 2274 do { \ 2275 fail_stack.stack = (fail_stack_elt_t *) \ 2276 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ 2277 \ 2278 if (fail_stack.stack == NULL) \ 2279 return -2; \ 2280 \ 2281 fail_stack.size = INIT_FAILURE_ALLOC; \ 2282 fail_stack.avail = 0; \ 2283 } while (0) 2284 2285 2286 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 2287 2288 Return 1 if succeeds, and 0 if either ran out of memory 2289 allocating space for it or it was already too large. 2290 2291 REGEX_REALLOCATE requires `destination' be declared. */ 2292 2293 #define DOUBLE_FAIL_STACK(fail_stack) \ 2294 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ 2295 ? 0 \ 2296 : ((fail_stack).stack = (fail_stack_elt_t *) \ 2297 REGEX_REALLOCATE ((fail_stack).stack, \ 2298 (fail_stack).size * sizeof (fail_stack_elt_t), \ 2299 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ 2300 \ 2301 (fail_stack).stack == NULL \ 2302 ? 0 \ 2303 : ((fail_stack).size <<= 1, \ 2304 1))) 2305 2306 2307 /* Push PATTERN_OP on FAIL_STACK. 2308 2309 Return 1 if was able to do so and 0 if ran out of memory allocating 2310 space to do so. */ 2311 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \ 2312 ((FAIL_STACK_FULL () \ 2313 && !DOUBLE_FAIL_STACK (fail_stack)) \ 2314 ? 0 \ 2315 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \ 2316 1)) 2317 2318 /* This pushes an item onto the failure stack. Must be a four-byte 2319 value. Assumes the variable `fail_stack'. Probably should only 2320 be called from within `PUSH_FAILURE_POINT'. */ 2321 #define PUSH_FAILURE_ITEM(item) \ 2322 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item 2323 2324 /* The complement operation. Assumes `fail_stack' is nonempty. */ 2325 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail] 2326 2327 /* Used to omit pushing failure point id's when we're not debugging. */ 2328 #ifdef DEBUG 2329 #define DEBUG_PUSH PUSH_FAILURE_ITEM 2330 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM () 2331 #else 2332 #define DEBUG_PUSH(item) 2333 #define DEBUG_POP(item_addr) 2334 #endif 2335 2336 2337 /* Push the information about the state we will need 2338 if we ever fail back to it. 2339 2340 Requires variables fail_stack, regstart, regend, reg_info, and 2341 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be 2342 declared. 2343 2344 Does `return FAILURE_CODE' if runs out of memory. */ 2345 2346 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 2347 do { \ 2348 char *destination; \ 2349 /* Must be int, so when we don't save any registers, the arithmetic \ 2350 of 0 + -1 isn't done as unsigned. */ \ 2351 int this_reg; \ 2352 \ 2353 DEBUG_STATEMENT (failure_id++); \ 2354 DEBUG_STATEMENT (nfailure_points_pushed++); \ 2355 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 2356 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 2357 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 2358 \ 2359 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ 2360 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 2361 \ 2362 /* Ensure we have enough space allocated for what we will push. */ \ 2363 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 2364 { \ 2365 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 2366 return failure_code; \ 2367 \ 2368 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 2369 (fail_stack).size); \ 2370 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 2371 } \ 2372 \ 2373 /* Push the info, starting with the registers. */ \ 2374 DEBUG_PRINT1 ("\n"); \ 2375 \ 2376 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 2377 this_reg++) \ 2378 { \ 2379 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ 2380 DEBUG_STATEMENT (num_regs_pushed++); \ 2381 \ 2382 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 2383 PUSH_FAILURE_ITEM (regstart[this_reg]); \ 2384 \ 2385 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 2386 PUSH_FAILURE_ITEM (regend[this_reg]); \ 2387 \ 2388 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ 2389 DEBUG_PRINT2 (" match_null=%d", \ 2390 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 2391 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 2392 DEBUG_PRINT2 (" matched_something=%d", \ 2393 MATCHED_SOMETHING (reg_info[this_reg])); \ 2394 DEBUG_PRINT2 (" ever_matched=%d", \ 2395 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 2396 DEBUG_PRINT1 ("\n"); \ 2397 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \ 2398 } \ 2399 \ 2400 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ 2401 PUSH_FAILURE_ITEM (lowest_active_reg); \ 2402 \ 2403 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ 2404 PUSH_FAILURE_ITEM (highest_active_reg); \ 2405 \ 2406 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ 2407 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 2408 PUSH_FAILURE_ITEM (pattern_place); \ 2409 \ 2410 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ 2411 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 2412 size2); \ 2413 DEBUG_PRINT1 ("'\n"); \ 2414 PUSH_FAILURE_ITEM (string_place); \ 2415 \ 2416 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 2417 DEBUG_PUSH (failure_id); \ 2418 } while (0) 2419 2420 /* This is the number of items that are pushed and popped on the stack 2421 for each register. */ 2422 #define NUM_REG_ITEMS 3 2423 2424 /* Individual items aside from the registers. */ 2425 #ifdef DEBUG 2426 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 2427 #else 2428 #define NUM_NONREG_ITEMS 4 2429 #endif 2430 2431 /* We push at most this many items on the stack. */ 2432 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 2433 2434 /* We actually push this many items. */ 2435 #define NUM_FAILURE_ITEMS \ 2436 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ 2437 + NUM_NONREG_ITEMS) 2438 2439 /* How many items can still be added to the stack without overflowing it. */ 2440 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 2441 2442 2443 /* Pops what PUSH_FAIL_STACK pushes. 2444 2445 We restore into the parameters, all of which should be lvalues: 2446 STR -- the saved data position. 2447 PAT -- the saved pattern position. 2448 LOW_REG, HIGH_REG -- the highest and lowest active registers. 2449 REGSTART, REGEND -- arrays of string positions. 2450 REG_INFO -- array of information about each subexpression. 2451 2452 Also assumes the variables `fail_stack' and (if debugging), `bufp', 2453 `pend', `string1', `size1', `string2', and `size2'. */ 2454 2455 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 2456 { \ 2457 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ 2458 int this_reg; \ 2459 const unsigned char *string_temp; \ 2460 \ 2461 assert (!FAIL_STACK_EMPTY ()); \ 2462 \ 2463 /* Remove failure points and point to how many regs pushed. */ \ 2464 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 2465 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 2466 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 2467 \ 2468 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 2469 \ 2470 DEBUG_POP (&failure_id); \ 2471 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 2472 \ 2473 /* If the saved string location is NULL, it came from an \ 2474 on_failure_keep_string_jump opcode, and we want to throw away the \ 2475 saved NULL, thus retaining our current position in the string. */ \ 2476 string_temp = POP_FAILURE_ITEM (); \ 2477 if (string_temp != NULL) \ 2478 str = (const char *) string_temp; \ 2479 \ 2480 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ 2481 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 2482 DEBUG_PRINT1 ("'\n"); \ 2483 \ 2484 pat = (unsigned char *) POP_FAILURE_ITEM (); \ 2485 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ 2486 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 2487 \ 2488 /* Restore register info. */ \ 2489 high_reg = (unsigned) POP_FAILURE_ITEM (); \ 2490 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ 2491 \ 2492 low_reg = (unsigned) POP_FAILURE_ITEM (); \ 2493 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ 2494 \ 2495 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 2496 { \ 2497 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ 2498 \ 2499 reg_info[this_reg].word = POP_FAILURE_ITEM (); \ 2500 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ 2501 \ 2502 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \ 2503 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 2504 \ 2505 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \ 2506 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 2507 } \ 2508 \ 2509 DEBUG_STATEMENT (nfailure_points_popped++); \ 2510 } /* POP_FAILURE_POINT */ 2511 2512 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 2513 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 2514 characters can start a string that matches the pattern. This fastmap 2515 is used by re_search to skip quickly over impossible starting points. 2516 2517 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 2518 area as BUFP->fastmap. 2519 2520 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 2521 the pattern buffer. 2522 2523 Returns 0 if we succeed, -2 if an internal error. */ 2524 2525 int 2526 re_compile_fastmap (bufp) 2527 struct re_pattern_buffer *bufp; 2528 { 2529 int j, k; 2530 fail_stack_type fail_stack; 2531 #ifndef REGEX_MALLOC 2532 char *destination; 2533 #endif 2534 /* We don't push any register information onto the failure stack. */ 2535 unsigned num_regs = 0; 2536 2537 register char *fastmap = bufp->fastmap; 2538 unsigned char *pattern = bufp->buffer; 2539 unsigned long size = bufp->used; 2540 const unsigned char *p = pattern; 2541 register unsigned char *pend = pattern + size; 2542 2543 /* Assume that each path through the pattern can be null until 2544 proven otherwise. We set this false at the bottom of switch 2545 statement, to which we get only if a particular path doesn't 2546 match the empty string. */ 2547 boolean path_can_be_null = true; 2548 2549 /* We aren't doing a `succeed_n' to begin with. */ 2550 boolean succeed_n_p = false; 2551 2552 assert (fastmap != NULL && p != NULL); 2553 2554 INIT_FAIL_STACK (); 2555 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 2556 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 2557 bufp->can_be_null = 0; 2558 2559 while (p != pend || !FAIL_STACK_EMPTY ()) 2560 { 2561 if (p == pend) 2562 { 2563 bufp->can_be_null |= path_can_be_null; 2564 2565 /* Reset for next path. */ 2566 path_can_be_null = true; 2567 2568 p = fail_stack.stack[--fail_stack.avail]; 2569 } 2570 2571 /* We should never be about to go beyond the end of the pattern. */ 2572 assert (p < pend); 2573 2574 #ifdef SWITCH_ENUM_BUG 2575 switch ((int) ((re_opcode_t) *p++)) 2576 #else 2577 switch ((re_opcode_t) *p++) 2578 #endif 2579 { 2580 2581 /* I guess the idea here is to simply not bother with a fastmap 2582 if a backreference is used, since it's too hard to figure out 2583 the fastmap for the corresponding group. Setting 2584 `can_be_null' stops `re_search_2' from using the fastmap, so 2585 that is all we do. */ 2586 case duplicate: 2587 bufp->can_be_null = 1; 2588 return 0; 2589 2590 2591 /* Following are the cases which match a character. These end 2592 with `break'. */ 2593 2594 case exactn: 2595 fastmap[p[1]] = 1; 2596 break; 2597 2598 2599 case charset: 2600 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 2601 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 2602 fastmap[j] = 1; 2603 break; 2604 2605 2606 case charset_not: 2607 /* Chars beyond end of map must be allowed. */ 2608 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 2609 fastmap[j] = 1; 2610 2611 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 2612 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 2613 fastmap[j] = 1; 2614 break; 2615 2616 2617 case wordchar: 2618 for (j = 0; j < (1 << BYTEWIDTH); j++) 2619 if (SYNTAX (j) == Sword) 2620 fastmap[j] = 1; 2621 break; 2622 2623 2624 case notwordchar: 2625 for (j = 0; j < (1 << BYTEWIDTH); j++) 2626 if (SYNTAX (j) != Sword) 2627 fastmap[j] = 1; 2628 break; 2629 2630 2631 case anychar: 2632 /* `.' matches anything ... */ 2633 for (j = 0; j < (1 << BYTEWIDTH); j++) 2634 fastmap[j] = 1; 2635 2636 /* ... except perhaps newline. */ 2637 if (!(bufp->syntax & RE_DOT_NEWLINE)) 2638 fastmap['\n'] = 0; 2639 2640 /* Return if we have already set `can_be_null'; if we have, 2641 then the fastmap is irrelevant. Something's wrong here. */ 2642 else if (bufp->can_be_null) 2643 return 0; 2644 2645 /* Otherwise, have to check alternative paths. */ 2646 break; 2647 2648 2649 #ifdef emacs 2650 case syntaxspec: 2651 k = *p++; 2652 for (j = 0; j < (1 << BYTEWIDTH); j++) 2653 if (SYNTAX (j) == (enum syntaxcode) k) 2654 fastmap[j] = 1; 2655 break; 2656 2657 2658 case notsyntaxspec: 2659 k = *p++; 2660 for (j = 0; j < (1 << BYTEWIDTH); j++) 2661 if (SYNTAX (j) != (enum syntaxcode) k) 2662 fastmap[j] = 1; 2663 break; 2664 2665 2666 /* All cases after this match the empty string. These end with 2667 `continue'. */ 2668 2669 2670 case before_dot: 2671 case at_dot: 2672 case after_dot: 2673 continue; 2674 #endif /* not emacs */ 2675 2676 2677 case no_op: 2678 case begline: 2679 case endline: 2680 case begbuf: 2681 case endbuf: 2682 case wordbound: 2683 case notwordbound: 2684 case wordbeg: 2685 case wordend: 2686 case push_dummy_failure: 2687 continue; 2688 2689 2690 case jump_n: 2691 case pop_failure_jump: 2692 case maybe_pop_jump: 2693 case jump: 2694 case jump_past_alt: 2695 case dummy_failure_jump: 2696 EXTRACT_NUMBER_AND_INCR (j, p); 2697 p += j; 2698 if (j > 0) 2699 continue; 2700 2701 /* Jump backward implies we just went through the body of a 2702 loop and matched nothing. Opcode jumped to should be 2703 `on_failure_jump' or `succeed_n'. Just treat it like an 2704 ordinary jump. For a * loop, it has pushed its failure 2705 point already; if so, discard that as redundant. */ 2706 if ((re_opcode_t) *p != on_failure_jump 2707 && (re_opcode_t) *p != succeed_n) 2708 continue; 2709 2710 p++; 2711 EXTRACT_NUMBER_AND_INCR (j, p); 2712 p += j; 2713 2714 /* If what's on the stack is where we are now, pop it. */ 2715 if (!FAIL_STACK_EMPTY () 2716 && fail_stack.stack[fail_stack.avail - 1] == p) 2717 fail_stack.avail--; 2718 2719 continue; 2720 2721 2722 case on_failure_jump: 2723 case on_failure_keep_string_jump: 2724 handle_on_failure_jump: 2725 EXTRACT_NUMBER_AND_INCR (j, p); 2726 2727 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 2728 end of the pattern. We don't want to push such a point, 2729 since when we restore it above, entering the switch will 2730 increment `p' past the end of the pattern. We don't need 2731 to push such a point since we obviously won't find any more 2732 fastmap entries beyond `pend'. Such a pattern can match 2733 the null string, though. */ 2734 if (p + j < pend) 2735 { 2736 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 2737 return -2; 2738 } 2739 else 2740 bufp->can_be_null = 1; 2741 2742 if (succeed_n_p) 2743 { 2744 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 2745 succeed_n_p = false; 2746 } 2747 2748 continue; 2749 2750 2751 case succeed_n: 2752 /* Get to the number of times to succeed. */ 2753 p += 2; 2754 2755 /* Increment p past the n for when k != 0. */ 2756 EXTRACT_NUMBER_AND_INCR (k, p); 2757 if (k == 0) 2758 { 2759 p -= 4; 2760 succeed_n_p = true; /* Spaghetti code alert. */ 2761 goto handle_on_failure_jump; 2762 } 2763 continue; 2764 2765 2766 case set_number_at: 2767 p += 4; 2768 continue; 2769 2770 2771 case start_memory: 2772 case stop_memory: 2773 p += 2; 2774 continue; 2775 2776 2777 default: 2778 abort (); /* We have listed all the cases. */ 2779 } /* switch *p++ */ 2780 2781 /* Getting here means we have found the possible starting 2782 characters for one path of the pattern -- and that the empty 2783 string does not match. We need not follow this path further. 2784 Instead, look at the next alternative (remembered on the 2785 stack), or quit if no more. The test at the top of the loop 2786 does these things. */ 2787 path_can_be_null = false; 2788 p = pend; 2789 } /* while p */ 2790 2791 /* Set `can_be_null' for the last path (also the first path, if the 2792 pattern is empty). */ 2793 bufp->can_be_null |= path_can_be_null; 2794 return 0; 2795 } /* re_compile_fastmap */ 2796 2797 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 2798 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 2799 this memory for recording register information. STARTS and ENDS 2800 must be allocated using the malloc library routine, and must each 2801 be at least NUM_REGS * sizeof (regoff_t) bytes long. 2802 2803 If NUM_REGS == 0, then subsequent matches should allocate their own 2804 register data. 2805 2806 Unless this function is called, the first search or match using 2807 PATTERN_BUFFER will allocate its own register data, without 2808 freeing the old data. */ 2809 2810 void 2811 re_set_registers (bufp, regs, num_regs, starts, ends) 2812 struct re_pattern_buffer *bufp; 2813 struct re_registers *regs; 2814 unsigned num_regs; 2815 regoff_t *starts, *ends; 2816 { 2817 if (num_regs) 2818 { 2819 bufp->regs_allocated = REGS_REALLOCATE; 2820 regs->num_regs = num_regs; 2821 regs->start = starts; 2822 regs->end = ends; 2823 } 2824 else 2825 { 2826 bufp->regs_allocated = REGS_UNALLOCATED; 2827 regs->num_regs = 0; 2828 regs->start = regs->end = (regoff_t) 0; 2829 } 2830 } 2831 2832 /* Searching routines. */ 2833 2834 /* Like re_search_2, below, but only one string is specified, and 2835 doesn't let you say where to stop matching. */ 2836 2837 int 2838 re_search (bufp, string, size, startpos, range, regs) 2839 struct re_pattern_buffer *bufp; 2840 const char *string; 2841 int size, startpos, range; 2842 struct re_registers *regs; 2843 { 2844 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 2845 regs, size); 2846 } 2847 2848 2849 /* Using the compiled pattern in BUFP->buffer, first tries to match the 2850 virtual concatenation of STRING1 and STRING2, starting first at index 2851 STARTPOS, then at STARTPOS + 1, and so on. 2852 2853 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 2854 2855 RANGE is how far to scan while trying to match. RANGE = 0 means try 2856 only at STARTPOS; in general, the last start tried is STARTPOS + 2857 RANGE. 2858 2859 In REGS, return the indices of the virtual concatenation of STRING1 2860 and STRING2 that matched the entire BUFP->buffer and its contained 2861 subexpressions. 2862 2863 Do not consider matching one past the index STOP in the virtual 2864 concatenation of STRING1 and STRING2. 2865 2866 We return either the position in the strings at which the match was 2867 found, -1 if no match, or -2 if error (such as failure 2868 stack overflow). */ 2869 2870 int 2871 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) 2872 struct re_pattern_buffer *bufp; 2873 const char *string1, *string2; 2874 int size1, size2; 2875 int startpos; 2876 int range; 2877 struct re_registers *regs; 2878 int stop; 2879 { 2880 int val; 2881 register char *fastmap = bufp->fastmap; 2882 register char *translate = bufp->translate; 2883 int total_size = size1 + size2; 2884 int endpos = startpos + range; 2885 2886 /* Check for out-of-range STARTPOS. */ 2887 if (startpos < 0 || startpos > total_size) 2888 return -1; 2889 2890 /* Fix up RANGE if it might eventually take us outside 2891 the virtual concatenation of STRING1 and STRING2. */ 2892 if (endpos < -1) 2893 range = -1 - startpos; 2894 else if (endpos > total_size) 2895 range = total_size - startpos; 2896 2897 /* If the search isn't to be a backwards one, don't waste time in a 2898 search for a pattern that must be anchored. */ 2899 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) 2900 { 2901 if (startpos > 0) 2902 return -1; 2903 else 2904 range = 1; 2905 } 2906 2907 /* Update the fastmap now if not correct already. */ 2908 if (fastmap && !bufp->fastmap_accurate) 2909 if (re_compile_fastmap (bufp) == -2) 2910 return -2; 2911 2912 /* Loop through the string, looking for a place to start matching. */ 2913 for (;;) 2914 { 2915 /* If a fastmap is supplied, skip quickly over characters that 2916 cannot be the start of a match. If the pattern can match the 2917 null string, however, we don't need to skip characters; we want 2918 the first null string. */ 2919 if (fastmap && startpos < total_size && !bufp->can_be_null) 2920 { 2921 if (range > 0) /* Searching forwards. */ 2922 { 2923 register const char *d; 2924 register int lim = 0; 2925 int irange = range; 2926 2927 if (startpos < size1 && startpos + range >= size1) 2928 lim = range - (size1 - startpos); 2929 2930 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 2931 2932 /* Written out as an if-else to avoid testing `translate' 2933 inside the loop. */ 2934 if (translate) 2935 while (range > lim 2936 && !fastmap[(unsigned char) 2937 translate[(unsigned char) *d++]]) 2938 range--; 2939 else 2940 while (range > lim && !fastmap[(unsigned char) *d++]) 2941 range--; 2942 2943 startpos += irange - range; 2944 } 2945 else /* Searching backwards. */ 2946 { 2947 register char c = (size1 == 0 || startpos >= size1 2948 ? string2[startpos - size1] 2949 : string1[startpos]); 2950 2951 if (!fastmap[(unsigned char) TRANSLATE (c)]) 2952 goto advance; 2953 } 2954 } 2955 2956 /* If can't match the null string, and that's all we have left, fail. */ 2957 if (range >= 0 && startpos == total_size && fastmap 2958 && !bufp->can_be_null) 2959 return -1; 2960 2961 val = re_match_2 (bufp, string1, size1, string2, size2, 2962 startpos, regs, stop); 2963 if (val >= 0) 2964 return startpos; 2965 2966 if (val == -2) 2967 return -2; 2968 2969 advance: 2970 if (!range) 2971 break; 2972 else if (range > 0) 2973 { 2974 range--; 2975 startpos++; 2976 } 2977 else 2978 { 2979 range++; 2980 startpos--; 2981 } 2982 } 2983 return -1; 2984 } /* re_search_2 */ 2985 2986 /* Declarations and macros for re_match_2. */ 2987 2988 static int bcmp_translate (); 2989 static boolean alt_match_null_string_p (), 2990 common_op_match_null_string_p (), 2991 group_match_null_string_p (); 2992 2993 /* Structure for per-register (a.k.a. per-group) information. 2994 This must not be longer than one word, because we push this value 2995 onto the failure stack. Other register information, such as the 2996 starting and ending positions (which are addresses), and the list of 2997 inner groups (which is a bits list) are maintained in separate 2998 variables. 2999 3000 We are making a (strictly speaking) nonportable assumption here: that 3001 the compiler will pack our bit fields into something that fits into 3002 the type of `word', i.e., is something that fits into one item on the 3003 failure stack. */ 3004 typedef union 3005 { 3006 fail_stack_elt_t word; 3007 struct 3008 { 3009 /* This field is one if this group can match the empty string, 3010 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 3011 #define MATCH_NULL_UNSET_VALUE 3 3012 unsigned match_null_string_p : 2; 3013 unsigned is_active : 1; 3014 unsigned matched_something : 1; 3015 unsigned ever_matched_something : 1; 3016 } bits; 3017 } register_info_type; 3018 3019 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 3020 #define IS_ACTIVE(R) ((R).bits.is_active) 3021 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) 3022 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 3023 3024 3025 /* Call this when have matched a real character; it sets `matched' flags 3026 for the subexpressions which we are currently inside. Also records 3027 that those subexprs have matched. */ 3028 #define SET_REGS_MATCHED() \ 3029 do \ 3030 { \ 3031 unsigned r; \ 3032 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 3033 { \ 3034 MATCHED_SOMETHING (reg_info[r]) \ 3035 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 3036 = 1; \ 3037 } \ 3038 } \ 3039 while (0) 3040 3041 3042 /* This converts PTR, a pointer into one of the search strings `string1' 3043 and `string2' into an offset from the beginning of that string. */ 3044 #define POINTER_TO_OFFSET(ptr) \ 3045 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1) 3046 3047 /* Registers are set to a sentinel when they haven't yet matched. */ 3048 #define REG_UNSET_VALUE ((char *) -1) 3049 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 3050 3051 3052 /* Macros for dealing with the split strings in re_match_2. */ 3053 3054 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 3055 3056 /* Call before fetching a character with *d. This switches over to 3057 string2 if necessary. */ 3058 #define PREFETCH() \ 3059 while (d == dend) \ 3060 { \ 3061 /* End of string2 => fail. */ \ 3062 if (dend == end_match_2) \ 3063 goto fail; \ 3064 /* End of string1 => advance to string2. */ \ 3065 d = string2; \ 3066 dend = end_match_2; \ 3067 } 3068 3069 3070 /* Test if at very beginning or at very end of the virtual concatenation 3071 of `string1' and `string2'. If only one string, it's `string2'. */ 3072 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 3073 #define AT_STRINGS_END(d) ((d) == end2) 3074 3075 3076 /* Test if D points to a character which is word-constituent. We have 3077 two special cases to check for: if past the end of string1, look at 3078 the first character in string2; and if before the beginning of 3079 string2, look at the last character in string1. */ 3080 #define WORDCHAR_P(d) \ 3081 (SYNTAX ((d) == end1 ? *string2 \ 3082 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 3083 == Sword) 3084 3085 /* Test if the character before D and the one at D differ with respect 3086 to being word-constituent. */ 3087 #define AT_WORD_BOUNDARY(d) \ 3088 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 3089 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 3090 3091 3092 /* Free everything we malloc. */ 3093 #ifdef REGEX_MALLOC 3094 #define FREE_VAR(var) if (var) free (var); var = NULL 3095 #define FREE_VARIABLES() \ 3096 do { \ 3097 FREE_VAR (fail_stack.stack); \ 3098 FREE_VAR (regstart); \ 3099 FREE_VAR (regend); \ 3100 FREE_VAR (old_regstart); \ 3101 FREE_VAR (old_regend); \ 3102 FREE_VAR (best_regstart); \ 3103 FREE_VAR (best_regend); \ 3104 FREE_VAR (reg_info); \ 3105 FREE_VAR (reg_dummy); \ 3106 FREE_VAR (reg_info_dummy); \ 3107 } while (0) 3108 #else /* not REGEX_MALLOC */ 3109 /* Some MIPS systems (at least) want this to free alloca'd storage. */ 3110 #define FREE_VARIABLES() alloca (0) 3111 #endif /* not REGEX_MALLOC */ 3112 3113 3114 /* These values must meet several constraints. They must not be valid 3115 register values; since we have a limit of 255 registers (because 3116 we use only one byte in the pattern for the register number), we can 3117 use numbers larger than 255. They must differ by 1, because of 3118 NUM_FAILURE_ITEMS above. And the value for the lowest register must 3119 be larger than the value for the highest register, so we do not try 3120 to actually save any registers when none are active. */ 3121 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 3122 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 3123 3124 /* Matching routines. */ 3125 3126 #ifndef emacs /* Emacs never uses this. */ 3127 /* re_match is like re_match_2 except it takes only a single string. */ 3128 3129 int 3130 re_match (bufp, string, size, pos, regs) 3131 struct re_pattern_buffer *bufp; 3132 const char *string; 3133 int size, pos; 3134 struct re_registers *regs; 3135 { 3136 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); 3137 } 3138 #endif /* not emacs */ 3139 3140 3141 /* re_match_2 matches the compiled pattern in BUFP against the 3142 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 3143 and SIZE2, respectively). We start matching at POS, and stop 3144 matching at STOP. 3145 3146 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 3147 store offsets for the substring each group matched in REGS. See the 3148 documentation for exactly how many groups we fill. 3149 3150 We return -1 if no match, -2 if an internal error (such as the 3151 failure stack overflowing). Otherwise, we return the length of the 3152 matched substring. */ 3153 3154 int 3155 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 3156 struct re_pattern_buffer *bufp; 3157 const char *string1, *string2; 3158 int size1, size2; 3159 int pos; 3160 struct re_registers *regs; 3161 int stop; 3162 { 3163 /* General temporaries. */ 3164 int mcnt; 3165 unsigned char *p1; 3166 3167 /* Just past the end of the corresponding string. */ 3168 const char *end1, *end2; 3169 3170 /* Pointers into string1 and string2, just past the last characters in 3171 each to consider matching. */ 3172 const char *end_match_1, *end_match_2; 3173 3174 /* Where we are in the data, and the end of the current string. */ 3175 const char *d, *dend; 3176 3177 /* Where we are in the pattern, and the end of the pattern. */ 3178 unsigned char *p = bufp->buffer; 3179 register unsigned char *pend = p + bufp->used; 3180 3181 /* We use this to map every character in the string. */ 3182 char *translate = bufp->translate; 3183 3184 /* Failure point stack. Each place that can handle a failure further 3185 down the line pushes a failure point on this stack. It consists of 3186 restart, regend, and reg_info for all registers corresponding to 3187 the subexpressions we're currently inside, plus the number of such 3188 registers, and, finally, two char *'s. The first char * is where 3189 to resume scanning the pattern; the second one is where to resume 3190 scanning the strings. If the latter is zero, the failure point is 3191 a ``dummy''; if a failure happens and the failure point is a dummy, 3192 it gets discarded and the next next one is tried. */ 3193 fail_stack_type fail_stack; 3194 #ifdef DEBUG 3195 static unsigned failure_id = 0; 3196 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 3197 #endif 3198 3199 /* We fill all the registers internally, independent of what we 3200 return, for use in backreferences. The number here includes 3201 an element for register zero. */ 3202 unsigned num_regs = bufp->re_nsub + 1; 3203 3204 /* The currently active registers. */ 3205 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3206 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3207 3208 /* Information on the contents of registers. These are pointers into 3209 the input strings; they record just what was matched (on this 3210 attempt) by a subexpression part of the pattern, that is, the 3211 regnum-th regstart pointer points to where in the pattern we began 3212 matching and the regnum-th regend points to right after where we 3213 stopped matching the regnum-th subexpression. (The zeroth register 3214 keeps track of what the whole pattern matches.) */ 3215 const char **regstart, **regend; 3216 3217 /* If a group that's operated upon by a repetition operator fails to 3218 match anything, then the register for its start will need to be 3219 restored because it will have been set to wherever in the string we 3220 are when we last see its open-group operator. Similarly for a 3221 register's end. */ 3222 const char **old_regstart, **old_regend; 3223 3224 /* The is_active field of reg_info helps us keep track of which (possibly 3225 nested) subexpressions we are currently in. The matched_something 3226 field of reg_info[reg_num] helps us tell whether or not we have 3227 matched any of the pattern so far this time through the reg_num-th 3228 subexpression. These two fields get reset each time through any 3229 loop their register is in. */ 3230 register_info_type *reg_info; 3231 3232 /* The following record the register info as found in the above 3233 variables when we find a match better than any we've seen before. 3234 This happens as we backtrack through the failure points, which in 3235 turn happens only if we have not yet matched the entire string. */ 3236 unsigned best_regs_set = false; 3237 const char **best_regstart, **best_regend; 3238 3239 /* Logically, this is `best_regend[0]'. But we don't want to have to 3240 allocate space for that if we're not allocating space for anything 3241 else (see below). Also, we never need info about register 0 for 3242 any of the other register vectors, and it seems rather a kludge to 3243 treat `best_regend' differently than the rest. So we keep track of 3244 the end of the best match so far in a separate variable. We 3245 initialize this to NULL so that when we backtrack the first time 3246 and need to test it, it's not garbage. */ 3247 const char *match_end = NULL; 3248 3249 /* Used when we pop values we don't care about. */ 3250 const char **reg_dummy; 3251 register_info_type *reg_info_dummy; 3252 3253 #ifdef DEBUG 3254 /* Counts the total number of registers pushed. */ 3255 unsigned num_regs_pushed = 0; 3256 #endif 3257 3258 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 3259 3260 INIT_FAIL_STACK (); 3261 3262 /* Do not bother to initialize all the register variables if there are 3263 no groups in the pattern, as it takes a fair amount of time. If 3264 there are groups, we include space for register 0 (the whole 3265 pattern), even though we never use it, since it simplifies the 3266 array indexing. We should fix this. */ 3267 if (bufp->re_nsub) 3268 { 3269 regstart = REGEX_TALLOC (num_regs, const char *); 3270 regend = REGEX_TALLOC (num_regs, const char *); 3271 old_regstart = REGEX_TALLOC (num_regs, const char *); 3272 old_regend = REGEX_TALLOC (num_regs, const char *); 3273 best_regstart = REGEX_TALLOC (num_regs, const char *); 3274 best_regend = REGEX_TALLOC (num_regs, const char *); 3275 reg_info = REGEX_TALLOC (num_regs, register_info_type); 3276 reg_dummy = REGEX_TALLOC (num_regs, const char *); 3277 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); 3278 3279 if (!(regstart && regend && old_regstart && old_regend && reg_info 3280 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 3281 { 3282 FREE_VARIABLES (); 3283 return -2; 3284 } 3285 } 3286 #ifdef REGEX_MALLOC 3287 else 3288 { 3289 /* We must initialize all our variables to NULL, so that 3290 `FREE_VARIABLES' doesn't try to free them. */ 3291 regstart = regend = old_regstart = old_regend = best_regstart 3292 = best_regend = reg_dummy = NULL; 3293 reg_info = reg_info_dummy = (register_info_type *) NULL; 3294 } 3295 #endif /* REGEX_MALLOC */ 3296 3297 /* The starting position is bogus. */ 3298 if (pos < 0 || pos > size1 + size2) 3299 { 3300 FREE_VARIABLES (); 3301 return -1; 3302 } 3303 3304 /* Initialize subexpression text positions to -1 to mark ones that no 3305 start_memory/stop_memory has been seen for. Also initialize the 3306 register information struct. */ 3307 for (mcnt = 1; mcnt < num_regs; mcnt++) 3308 { 3309 regstart[mcnt] = regend[mcnt] 3310 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 3311 3312 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 3313 IS_ACTIVE (reg_info[mcnt]) = 0; 3314 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3315 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3316 } 3317 3318 /* We move `string1' into `string2' if the latter's empty -- but not if 3319 `string1' is null. */ 3320 if (size2 == 0 && string1 != NULL) 3321 { 3322 string2 = string1; 3323 size2 = size1; 3324 string1 = 0; 3325 size1 = 0; 3326 } 3327 end1 = string1 + size1; 3328 end2 = string2 + size2; 3329 3330 /* Compute where to stop matching, within the two strings. */ 3331 if (stop <= size1) 3332 { 3333 end_match_1 = string1 + stop; 3334 end_match_2 = string2; 3335 } 3336 else 3337 { 3338 end_match_1 = end1; 3339 end_match_2 = string2 + stop - size1; 3340 } 3341 3342 /* `p' scans through the pattern as `d' scans through the data. 3343 `dend' is the end of the input string that `d' points within. `d' 3344 is advanced into the following input string whenever necessary, but 3345 this happens before fetching; therefore, at the beginning of the 3346 loop, `d' can be pointing at the end of a string, but it cannot 3347 equal `string2'. */ 3348 if (size1 > 0 && pos <= size1) 3349 { 3350 d = string1 + pos; 3351 dend = end_match_1; 3352 } 3353 else 3354 { 3355 d = string2 + pos - size1; 3356 dend = end_match_2; 3357 } 3358 3359 DEBUG_PRINT1 ("The compiled pattern is: "); 3360 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 3361 DEBUG_PRINT1 ("The string to match is: `"); 3362 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 3363 DEBUG_PRINT1 ("'\n"); 3364 3365 /* This loops over pattern commands. It exits by returning from the 3366 function if the match is complete, or it drops through if the match 3367 fails at this starting point in the input data. */ 3368 for (;;) 3369 { 3370 DEBUG_PRINT2 ("\n0x%x: ", p); 3371 3372 if (p == pend) 3373 { /* End of pattern means we might have succeeded. */ 3374 DEBUG_PRINT1 ("end of pattern ... "); 3375 3376 /* If we haven't matched the entire string, and we want the 3377 longest match, try backtracking. */ 3378 if (d != end_match_2) 3379 { 3380 DEBUG_PRINT1 ("backtracking.\n"); 3381 3382 if (!FAIL_STACK_EMPTY ()) 3383 { /* More failure points to try. */ 3384 boolean same_str_p = (FIRST_STRING_P (match_end) 3385 == MATCHING_IN_FIRST_STRING); 3386 3387 /* If exceeds best match so far, save it. */ 3388 if (!best_regs_set 3389 || (same_str_p && d > match_end) 3390 || (!same_str_p && !MATCHING_IN_FIRST_STRING)) 3391 { 3392 best_regs_set = true; 3393 match_end = d; 3394 3395 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 3396 3397 for (mcnt = 1; mcnt < num_regs; mcnt++) 3398 { 3399 best_regstart[mcnt] = regstart[mcnt]; 3400 best_regend[mcnt] = regend[mcnt]; 3401 } 3402 } 3403 goto fail; 3404 } 3405 3406 /* If no failure points, don't restore garbage. */ 3407 else if (best_regs_set) 3408 { 3409 restore_best_regs: 3410 /* Restore best match. It may happen that `dend == 3411 end_match_1' while the restored d is in string2. 3412 For example, the pattern `x.*y.*z' against the 3413 strings `x-' and `y-z-', if the two strings are 3414 not consecutive in memory. */ 3415 DEBUG_PRINT1 ("Restoring best registers.\n"); 3416 3417 d = match_end; 3418 dend = ((d >= string1 && d <= end1) 3419 ? end_match_1 : end_match_2); 3420 3421 for (mcnt = 1; mcnt < num_regs; mcnt++) 3422 { 3423 regstart[mcnt] = best_regstart[mcnt]; 3424 regend[mcnt] = best_regend[mcnt]; 3425 } 3426 } 3427 } /* d != end_match_2 */ 3428 3429 DEBUG_PRINT1 ("Accepting match.\n"); 3430 3431 /* If caller wants register contents data back, do it. */ 3432 if (regs && !bufp->no_sub) 3433 { 3434 /* Have the register data arrays been allocated? */ 3435 if (bufp->regs_allocated == REGS_UNALLOCATED) 3436 { /* No. So allocate them with malloc. We need one 3437 extra element beyond `num_regs' for the `-1' marker 3438 GNU code uses. */ 3439 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 3440 regs->start = TALLOC (regs->num_regs, regoff_t); 3441 regs->end = TALLOC (regs->num_regs, regoff_t); 3442 if (regs->start == NULL || regs->end == NULL) 3443 return -2; 3444 bufp->regs_allocated = REGS_REALLOCATE; 3445 } 3446 else if (bufp->regs_allocated == REGS_REALLOCATE) 3447 { /* Yes. If we need more elements than were already 3448 allocated, reallocate them. If we need fewer, just 3449 leave it alone. */ 3450 if (regs->num_regs < num_regs + 1) 3451 { 3452 regs->num_regs = num_regs + 1; 3453 RETALLOC (regs->start, regs->num_regs, regoff_t); 3454 RETALLOC (regs->end, regs->num_regs, regoff_t); 3455 if (regs->start == NULL || regs->end == NULL) 3456 return -2; 3457 } 3458 } 3459 else 3460 assert (bufp->regs_allocated == REGS_FIXED); 3461 3462 /* Convert the pointer data in `regstart' and `regend' to 3463 indices. Register zero has to be set differently, 3464 since we haven't kept track of any info for it. */ 3465 if (regs->num_regs > 0) 3466 { 3467 regs->start[0] = pos; 3468 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 3469 : d - string2 + size1); 3470 } 3471 3472 /* Go through the first `min (num_regs, regs->num_regs)' 3473 registers, since that is all we initialized. */ 3474 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) 3475 { 3476 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 3477 regs->start[mcnt] = regs->end[mcnt] = -1; 3478 else 3479 { 3480 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); 3481 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); 3482 } 3483 } 3484 3485 /* If the regs structure we return has more elements than 3486 were in the pattern, set the extra elements to -1. If 3487 we (re)allocated the registers, this is the case, 3488 because we always allocate enough to have at least one 3489 -1 at the end. */ 3490 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) 3491 regs->start[mcnt] = regs->end[mcnt] = -1; 3492 } /* regs && !bufp->no_sub */ 3493 3494 FREE_VARIABLES (); 3495 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 3496 nfailure_points_pushed, nfailure_points_popped, 3497 nfailure_points_pushed - nfailure_points_popped); 3498 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 3499 3500 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 3501 ? string1 3502 : string2 - size1); 3503 3504 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 3505 3506 return mcnt; 3507 } 3508 3509 /* Otherwise match next pattern command. */ 3510 #ifdef SWITCH_ENUM_BUG 3511 switch ((int) ((re_opcode_t) *p++)) 3512 #else 3513 switch ((re_opcode_t) *p++) 3514 #endif 3515 { 3516 /* Ignore these. Used to ignore the n of succeed_n's which 3517 currently have n == 0. */ 3518 case no_op: 3519 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 3520 break; 3521 3522 3523 /* Match the next n pattern characters exactly. The following 3524 byte in the pattern defines n, and the n bytes after that 3525 are the characters to match. */ 3526 case exactn: 3527 mcnt = *p++; 3528 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 3529 3530 /* This is written out as an if-else so we don't waste time 3531 testing `translate' inside the loop. */ 3532 if (translate) 3533 { 3534 do 3535 { 3536 PREFETCH (); 3537 if (translate[(unsigned char) *d++] != (char) *p++) 3538 goto fail; 3539 } 3540 while (--mcnt); 3541 } 3542 else 3543 { 3544 do 3545 { 3546 PREFETCH (); 3547 if (*d++ != (char) *p++) goto fail; 3548 } 3549 while (--mcnt); 3550 } 3551 SET_REGS_MATCHED (); 3552 break; 3553 3554 3555 /* Match any character except possibly a newline or a null. */ 3556 case anychar: 3557 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 3558 3559 PREFETCH (); 3560 3561 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 3562 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 3563 goto fail; 3564 3565 SET_REGS_MATCHED (); 3566 DEBUG_PRINT2 (" Matched `%d'.\n", *d); 3567 d++; 3568 break; 3569 3570 3571 case charset: 3572 case charset_not: 3573 { 3574 register unsigned char c; 3575 boolean not = (re_opcode_t) *(p - 1) == charset_not; 3576 3577 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 3578 3579 PREFETCH (); 3580 c = TRANSLATE (*d); /* The character to match. */ 3581 3582 /* Cast to `unsigned' instead of `unsigned char' in case the 3583 bit list is a full 32 bytes long. */ 3584 if (c < (unsigned) (*p * BYTEWIDTH) 3585 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 3586 not = !not; 3587 3588 p += 1 + *p; 3589 3590 if (!not) goto fail; 3591 3592 SET_REGS_MATCHED (); 3593 d++; 3594 break; 3595 } 3596 3597 3598 /* The beginning of a group is represented by start_memory. 3599 The arguments are the register number in the next byte, and the 3600 number of groups inner to this one in the next. The text 3601 matched within the group is recorded (in the internal 3602 registers data structure) under the register number. */ 3603 case start_memory: 3604 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); 3605 3606 /* Find out if this group can match the empty string. */ 3607 p1 = p; /* To send to group_match_null_string_p. */ 3608 3609 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 3610 REG_MATCH_NULL_STRING_P (reg_info[*p]) 3611 = group_match_null_string_p (&p1, pend, reg_info); 3612 3613 /* Save the position in the string where we were the last time 3614 we were at this open-group operator in case the group is 3615 operated upon by a repetition operator, e.g., with `(a*)*b' 3616 against `ab'; then we want to ignore where we are now in 3617 the string in case this attempt to match fails. */ 3618 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 3619 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 3620 : regstart[*p]; 3621 DEBUG_PRINT2 (" old_regstart: %d\n", 3622 POINTER_TO_OFFSET (old_regstart[*p])); 3623 3624 regstart[*p] = d; 3625 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 3626 3627 IS_ACTIVE (reg_info[*p]) = 1; 3628 MATCHED_SOMETHING (reg_info[*p]) = 0; 3629 3630 /* This is the new highest active register. */ 3631 highest_active_reg = *p; 3632 3633 /* If nothing was active before, this is the new lowest active 3634 register. */ 3635 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 3636 lowest_active_reg = *p; 3637 3638 /* Move past the register number and inner group count. */ 3639 p += 2; 3640 break; 3641 3642 3643 /* The stop_memory opcode represents the end of a group. Its 3644 arguments are the same as start_memory's: the register 3645 number, and the number of inner groups. */ 3646 case stop_memory: 3647 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); 3648 3649 /* We need to save the string position the last time we were at 3650 this close-group operator in case the group is operated 3651 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 3652 against `aba'; then we want to ignore where we are now in 3653 the string in case this attempt to match fails. */ 3654 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 3655 ? REG_UNSET (regend[*p]) ? d : regend[*p] 3656 : regend[*p]; 3657 DEBUG_PRINT2 (" old_regend: %d\n", 3658 POINTER_TO_OFFSET (old_regend[*p])); 3659 3660 regend[*p] = d; 3661 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 3662 3663 /* This register isn't active anymore. */ 3664 IS_ACTIVE (reg_info[*p]) = 0; 3665 3666 /* If this was the only register active, nothing is active 3667 anymore. */ 3668 if (lowest_active_reg == highest_active_reg) 3669 { 3670 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3671 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3672 } 3673 else 3674 { /* We must scan for the new highest active register, since 3675 it isn't necessarily one less than now: consider 3676 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 3677 new highest active register is 1. */ 3678 unsigned char r = *p - 1; 3679 while (r > 0 && !IS_ACTIVE (reg_info[r])) 3680 r--; 3681 3682 /* If we end up at register zero, that means that we saved 3683 the registers as the result of an `on_failure_jump', not 3684 a `start_memory', and we jumped to past the innermost 3685 `stop_memory'. For example, in ((.)*) we save 3686 registers 1 and 2 as a result of the *, but when we pop 3687 back to the second ), we are at the stop_memory 1. 3688 Thus, nothing is active. */ 3689 if (r == 0) 3690 { 3691 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3692 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3693 } 3694 else 3695 highest_active_reg = r; 3696 } 3697 3698 /* If just failed to match something this time around with a 3699 group that's operated on by a repetition operator, try to 3700 force exit from the ``loop'', and restore the register 3701 information for this group that we had before trying this 3702 last match. */ 3703 if ((!MATCHED_SOMETHING (reg_info[*p]) 3704 || (re_opcode_t) p[-3] == start_memory) 3705 && (p + 2) < pend) 3706 { 3707 boolean is_a_jump_n = false; 3708 3709 p1 = p + 2; 3710 mcnt = 0; 3711 switch ((re_opcode_t) *p1++) 3712 { 3713 case jump_n: 3714 is_a_jump_n = true; 3715 case pop_failure_jump: 3716 case maybe_pop_jump: 3717 case jump: 3718 case dummy_failure_jump: 3719 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 3720 if (is_a_jump_n) 3721 p1 += 2; 3722 break; 3723 3724 default: 3725 /* do nothing */ ; 3726 } 3727 p1 += mcnt; 3728 3729 /* If the next operation is a jump backwards in the pattern 3730 to an on_failure_jump right before the start_memory 3731 corresponding to this stop_memory, exit from the loop 3732 by forcing a failure after pushing on the stack the 3733 on_failure_jump's jump in the pattern, and d. */ 3734 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 3735 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) 3736 { 3737 /* If this group ever matched anything, then restore 3738 what its registers were before trying this last 3739 failed match, e.g., with `(a*)*b' against `ab' for 3740 regstart[1], and, e.g., with `((a*)*(b*)*)*' 3741 against `aba' for regend[3]. 3742 3743 Also restore the registers for inner groups for, 3744 e.g., `((a*)(b*))*' against `aba' (register 3 would 3745 otherwise get trashed). */ 3746 3747 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 3748 { 3749 unsigned r; 3750 3751 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 3752 3753 /* Restore this and inner groups' (if any) registers. */ 3754 for (r = *p; r < *p + *(p + 1); r++) 3755 { 3756 regstart[r] = old_regstart[r]; 3757 3758 /* xx why this test? */ 3759 if ((int) old_regend[r] >= (int) regstart[r]) 3760 regend[r] = old_regend[r]; 3761 } 3762 } 3763 p1++; 3764 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 3765 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 3766 3767 goto fail; 3768 } 3769 } 3770 3771 /* Move past the register number and the inner group count. */ 3772 p += 2; 3773 break; 3774 3775 3776 /* \<digit> has been turned into a `duplicate' command which is 3777 followed by the numeric value of <digit> as the register number. */ 3778 case duplicate: 3779 { 3780 register const char *d2, *dend2; 3781 int regno = *p++; /* Get which register to match against. */ 3782 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 3783 3784 /* Can't back reference a group which we've never matched. */ 3785 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 3786 goto fail; 3787 3788 /* Where in input to try to start matching. */ 3789 d2 = regstart[regno]; 3790 3791 /* Where to stop matching; if both the place to start and 3792 the place to stop matching are in the same string, then 3793 set to the place to stop, otherwise, for now have to use 3794 the end of the first string. */ 3795 3796 dend2 = ((FIRST_STRING_P (regstart[regno]) 3797 == FIRST_STRING_P (regend[regno])) 3798 ? regend[regno] : end_match_1); 3799 for (;;) 3800 { 3801 /* If necessary, advance to next segment in register 3802 contents. */ 3803 while (d2 == dend2) 3804 { 3805 if (dend2 == end_match_2) break; 3806 if (dend2 == regend[regno]) break; 3807 3808 /* End of string1 => advance to string2. */ 3809 d2 = string2; 3810 dend2 = regend[regno]; 3811 } 3812 /* At end of register contents => success */ 3813 if (d2 == dend2) break; 3814 3815 /* If necessary, advance to next segment in data. */ 3816 PREFETCH (); 3817 3818 /* How many characters left in this segment to match. */ 3819 mcnt = dend - d; 3820 3821 /* Want how many consecutive characters we can match in 3822 one shot, so, if necessary, adjust the count. */ 3823 if (mcnt > dend2 - d2) 3824 mcnt = dend2 - d2; 3825 3826 /* Compare that many; failure if mismatch, else move 3827 past them. */ 3828 if (translate 3829 ? bcmp_translate (d, d2, mcnt, translate) 3830 : bcmp (d, d2, mcnt)) 3831 goto fail; 3832 d += mcnt, d2 += mcnt; 3833 } 3834 } 3835 break; 3836 3837 3838 /* begline matches the empty string at the beginning of the string 3839 (unless `not_bol' is set in `bufp'), and, if 3840 `newline_anchor' is set, after newlines. */ 3841 case begline: 3842 DEBUG_PRINT1 ("EXECUTING begline.\n"); 3843 3844 if (AT_STRINGS_BEG (d)) 3845 { 3846 if (!bufp->not_bol) break; 3847 } 3848 else if (d[-1] == '\n' && bufp->newline_anchor) 3849 { 3850 break; 3851 } 3852 /* In all other cases, we fail. */ 3853 goto fail; 3854 3855 3856 /* endline is the dual of begline. */ 3857 case endline: 3858 DEBUG_PRINT1 ("EXECUTING endline.\n"); 3859 3860 if (AT_STRINGS_END (d)) 3861 { 3862 if (!bufp->not_eol) break; 3863 } 3864 3865 /* We have to ``prefetch'' the next character. */ 3866 else if ((d == end1 ? *string2 : *d) == '\n' 3867 && bufp->newline_anchor) 3868 { 3869 break; 3870 } 3871 goto fail; 3872 3873 3874 /* Match at the very beginning of the data. */ 3875 case begbuf: 3876 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 3877 if (AT_STRINGS_BEG (d)) 3878 break; 3879 goto fail; 3880 3881 3882 /* Match at the very end of the data. */ 3883 case endbuf: 3884 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 3885 if (AT_STRINGS_END (d)) 3886 break; 3887 goto fail; 3888 3889 3890 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 3891 pushes NULL as the value for the string on the stack. Then 3892 `pop_failure_point' will keep the current value for the 3893 string, instead of restoring it. To see why, consider 3894 matching `foo\nbar' against `.*\n'. The .* matches the foo; 3895 then the . fails against the \n. But the next thing we want 3896 to do is match the \n against the \n; if we restored the 3897 string value, we would be back at the foo. 3898 3899 Because this is used only in specific cases, we don't need to 3900 check all the things that `on_failure_jump' does, to make 3901 sure the right things get saved on the stack. Hence we don't 3902 share its code. The only reason to push anything on the 3903 stack at all is that otherwise we would have to change 3904 `anychar's code to do something besides goto fail in this 3905 case; that seems worse than this. */ 3906 case on_failure_keep_string_jump: 3907 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 3908 3909 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3910 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 3911 3912 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 3913 break; 3914 3915 3916 /* Uses of on_failure_jump: 3917 3918 Each alternative starts with an on_failure_jump that points 3919 to the beginning of the next alternative. Each alternative 3920 except the last ends with a jump that in effect jumps past 3921 the rest of the alternatives. (They really jump to the 3922 ending jump of the following alternative, because tensioning 3923 these jumps is a hassle.) 3924 3925 Repeats start with an on_failure_jump that points past both 3926 the repetition text and either the following jump or 3927 pop_failure_jump back to this on_failure_jump. */ 3928 case on_failure_jump: 3929 on_failure: 3930 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 3931 3932 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3933 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 3934 3935 /* If this on_failure_jump comes right before a group (i.e., 3936 the original * applied to a group), save the information 3937 for that group and all inner ones, so that if we fail back 3938 to this point, the group's information will be correct. 3939 For example, in \(a*\)*\1, we need the preceding group, 3940 and in \(\(a*\)b*\)\2, we need the inner group. */ 3941 3942 /* We can't use `p' to check ahead because we push 3943 a failure point to `p + mcnt' after we do this. */ 3944 p1 = p; 3945 3946 /* We need to skip no_op's before we look for the 3947 start_memory in case this on_failure_jump is happening as 3948 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 3949 against aba. */ 3950 while (p1 < pend && (re_opcode_t) *p1 == no_op) 3951 p1++; 3952 3953 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 3954 { 3955 /* We have a new highest active register now. This will 3956 get reset at the start_memory we are about to get to, 3957 but we will have saved all the registers relevant to 3958 this repetition op, as described above. */ 3959 highest_active_reg = *(p1 + 1) + *(p1 + 2); 3960 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 3961 lowest_active_reg = *(p1 + 1); 3962 } 3963 3964 DEBUG_PRINT1 (":\n"); 3965 PUSH_FAILURE_POINT (p + mcnt, d, -2); 3966 break; 3967 3968 3969 /* A smart repeat ends with `maybe_pop_jump'. 3970 We change it to either `pop_failure_jump' or `jump'. */ 3971 case maybe_pop_jump: 3972 EXTRACT_NUMBER_AND_INCR (mcnt, p); 3973 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 3974 { 3975 register unsigned char *p2 = p; 3976 3977 /* Compare the beginning of the repeat with what in the 3978 pattern follows its end. If we can establish that there 3979 is nothing that they would both match, i.e., that we 3980 would have to backtrack because of (as in, e.g., `a*a') 3981 then we can change to pop_failure_jump, because we'll 3982 never have to backtrack. 3983 3984 This is not true in the case of alternatives: in 3985 `(a|ab)*' we do need to backtrack to the `ab' alternative 3986 (e.g., if the string was `ab'). But instead of trying to 3987 detect that here, the alternative has put on a dummy 3988 failure point which is what we will end up popping. */ 3989 3990 /* Skip over open/close-group commands. */ 3991 while (p2 + 2 < pend 3992 && ((re_opcode_t) *p2 == stop_memory 3993 || (re_opcode_t) *p2 == start_memory)) 3994 p2 += 3; /* Skip over args, too. */ 3995 3996 /* If we're at the end of the pattern, we can change. */ 3997 if (p2 == pend) 3998 { 3999 /* Consider what happens when matching ":\(.*\)" 4000 against ":/". I don't really understand this code 4001 yet. */ 4002 p[-3] = (unsigned char) pop_failure_jump; 4003 DEBUG_PRINT1 4004 (" End of pattern: change to `pop_failure_jump'.\n"); 4005 } 4006 4007 else if ((re_opcode_t) *p2 == exactn 4008 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 4009 { 4010 register unsigned char c 4011 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 4012 p1 = p + mcnt; 4013 4014 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 4015 to the `maybe_finalize_jump' of this case. Examine what 4016 follows. */ 4017 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) 4018 { 4019 p[-3] = (unsigned char) pop_failure_jump; 4020 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 4021 c, p1[5]); 4022 } 4023 4024 else if ((re_opcode_t) p1[3] == charset 4025 || (re_opcode_t) p1[3] == charset_not) 4026 { 4027 int not = (re_opcode_t) p1[3] == charset_not; 4028 4029 if (c < (unsigned char) (p1[4] * BYTEWIDTH) 4030 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4031 not = !not; 4032 4033 /* `not' is equal to 1 if c would match, which means 4034 that we can't change to pop_failure_jump. */ 4035 if (!not) 4036 { 4037 p[-3] = (unsigned char) pop_failure_jump; 4038 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4039 } 4040 } 4041 } 4042 } 4043 p -= 2; /* Point at relative address again. */ 4044 if ((re_opcode_t) p[-1] != pop_failure_jump) 4045 { 4046 p[-1] = (unsigned char) jump; 4047 DEBUG_PRINT1 (" Match => jump.\n"); 4048 goto unconditional_jump; 4049 } 4050 /* Note fall through. */ 4051 4052 4053 /* The end of a simple repeat has a pop_failure_jump back to 4054 its matching on_failure_jump, where the latter will push a 4055 failure point. The pop_failure_jump takes off failure 4056 points put on by this pop_failure_jump's matching 4057 on_failure_jump; we got through the pattern to here from the 4058 matching on_failure_jump, so didn't fail. */ 4059 case pop_failure_jump: 4060 { 4061 /* We need to pass separate storage for the lowest and 4062 highest registers, even though we don't care about the 4063 actual values. Otherwise, we will restore only one 4064 register from the stack, since lowest will == highest in 4065 `pop_failure_point'. */ 4066 unsigned dummy_low_reg, dummy_high_reg; 4067 unsigned char *pdummy; 4068 const char *sdummy; 4069 4070 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 4071 POP_FAILURE_POINT (sdummy, pdummy, 4072 dummy_low_reg, dummy_high_reg, 4073 reg_dummy, reg_dummy, reg_info_dummy); 4074 } 4075 /* Note fall through. */ 4076 4077 4078 /* Unconditionally jump (without popping any failure points). */ 4079 case jump: 4080 unconditional_jump: 4081 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 4082 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 4083 p += mcnt; /* Do the jump. */ 4084 DEBUG_PRINT2 ("(to 0x%x).\n", p); 4085 break; 4086 4087 4088 /* We need this opcode so we can detect where alternatives end 4089 in `group_match_null_string_p' et al. */ 4090 case jump_past_alt: 4091 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 4092 goto unconditional_jump; 4093 4094 4095 /* Normally, the on_failure_jump pushes a failure point, which 4096 then gets popped at pop_failure_jump. We will end up at 4097 pop_failure_jump, also, and with a pattern of, say, `a+', we 4098 are skipping over the on_failure_jump, so we have to push 4099 something meaningless for pop_failure_jump to pop. */ 4100 case dummy_failure_jump: 4101 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 4102 /* It doesn't matter what we push for the string here. What 4103 the code at `fail' tests is the value for the pattern. */ 4104 PUSH_FAILURE_POINT (0, 0, -2); 4105 goto unconditional_jump; 4106 4107 4108 /* At the end of an alternative, we need to push a dummy failure 4109 point in case we are followed by a `pop_failure_jump', because 4110 we don't want the failure point for the alternative to be 4111 popped. For example, matching `(a|ab)*' against `aab' 4112 requires that we match the `ab' alternative. */ 4113 case push_dummy_failure: 4114 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 4115 /* See comments just above at `dummy_failure_jump' about the 4116 two zeroes. */ 4117 PUSH_FAILURE_POINT (0, 0, -2); 4118 break; 4119 4120 /* Have to succeed matching what follows at least n times. 4121 After that, handle like `on_failure_jump'. */ 4122 case succeed_n: 4123 EXTRACT_NUMBER (mcnt, p + 2); 4124 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 4125 4126 assert (mcnt >= 0); 4127 /* Originally, this is how many times we HAVE to succeed. */ 4128 if (mcnt > 0) 4129 { 4130 mcnt--; 4131 p += 2; 4132 STORE_NUMBER_AND_INCR (p, mcnt); 4133 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); 4134 } 4135 else if (mcnt == 0) 4136 { 4137 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); 4138 p[2] = (unsigned char) no_op; 4139 p[3] = (unsigned char) no_op; 4140 goto on_failure; 4141 } 4142 break; 4143 4144 case jump_n: 4145 EXTRACT_NUMBER (mcnt, p + 2); 4146 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 4147 4148 /* Originally, this is how many times we CAN jump. */ 4149 if (mcnt) 4150 { 4151 mcnt--; 4152 STORE_NUMBER (p + 2, mcnt); 4153 goto unconditional_jump; 4154 } 4155 /* If don't have to jump any more, skip over the rest of command. */ 4156 else 4157 p += 4; 4158 break; 4159 4160 case set_number_at: 4161 { 4162 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 4163 4164 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4165 p1 = p + mcnt; 4166 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4167 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 4168 STORE_NUMBER (p1, mcnt); 4169 break; 4170 } 4171 4172 case wordbound: 4173 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 4174 if (AT_WORD_BOUNDARY (d)) 4175 break; 4176 goto fail; 4177 4178 case notwordbound: 4179 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 4180 if (AT_WORD_BOUNDARY (d)) 4181 goto fail; 4182 break; 4183 4184 case wordbeg: 4185 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 4186 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 4187 break; 4188 goto fail; 4189 4190 case wordend: 4191 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 4192 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 4193 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) 4194 break; 4195 goto fail; 4196 4197 #ifdef emacs 4198 #ifdef emacs19 4199 case before_dot: 4200 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 4201 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 4202 goto fail; 4203 break; 4204 4205 case at_dot: 4206 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 4207 if (PTR_CHAR_POS ((unsigned char *) d) != point) 4208 goto fail; 4209 break; 4210 4211 case after_dot: 4212 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 4213 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 4214 goto fail; 4215 break; 4216 #else /* not emacs19 */ 4217 case at_dot: 4218 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 4219 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) 4220 goto fail; 4221 break; 4222 #endif /* not emacs19 */ 4223 4224 case syntaxspec: 4225 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 4226 mcnt = *p++; 4227 goto matchsyntax; 4228 4229 case wordchar: 4230 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 4231 mcnt = (int) Sword; 4232 matchsyntax: 4233 PREFETCH (); 4234 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) 4235 goto fail; 4236 SET_REGS_MATCHED (); 4237 break; 4238 4239 case notsyntaxspec: 4240 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 4241 mcnt = *p++; 4242 goto matchnotsyntax; 4243 4244 case notwordchar: 4245 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 4246 mcnt = (int) Sword; 4247 matchnotsyntax: 4248 PREFETCH (); 4249 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) 4250 goto fail; 4251 SET_REGS_MATCHED (); 4252 break; 4253 4254 #else /* not emacs */ 4255 case wordchar: 4256 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 4257 PREFETCH (); 4258 if (!WORDCHAR_P (d)) 4259 goto fail; 4260 SET_REGS_MATCHED (); 4261 d++; 4262 break; 4263 4264 case notwordchar: 4265 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 4266 PREFETCH (); 4267 if (WORDCHAR_P (d)) 4268 goto fail; 4269 SET_REGS_MATCHED (); 4270 d++; 4271 break; 4272 #endif /* not emacs */ 4273 4274 default: 4275 abort (); 4276 } 4277 continue; /* Successfully executed one pattern command; keep going. */ 4278 4279 4280 /* We goto here if a matching operation fails. */ 4281 fail: 4282 if (!FAIL_STACK_EMPTY ()) 4283 { /* A restart point is known. Restore to that state. */ 4284 DEBUG_PRINT1 ("\nFAIL:\n"); 4285 POP_FAILURE_POINT (d, p, 4286 lowest_active_reg, highest_active_reg, 4287 regstart, regend, reg_info); 4288 4289 /* If this failure point is a dummy, try the next one. */ 4290 if (!p) 4291 goto fail; 4292 4293 /* If we failed to the end of the pattern, don't examine *p. */ 4294 assert (p <= pend); 4295 if (p < pend) 4296 { 4297 boolean is_a_jump_n = false; 4298 4299 /* If failed to a backwards jump that's part of a repetition 4300 loop, need to pop this failure point and use the next one. */ 4301 switch ((re_opcode_t) *p) 4302 { 4303 case jump_n: 4304 is_a_jump_n = true; 4305 case maybe_pop_jump: 4306 case pop_failure_jump: 4307 case jump: 4308 p1 = p + 1; 4309 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4310 p1 += mcnt; 4311 4312 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 4313 || (!is_a_jump_n 4314 && (re_opcode_t) *p1 == on_failure_jump)) 4315 goto fail; 4316 break; 4317 default: 4318 /* do nothing */ ; 4319 } 4320 } 4321 4322 if (d >= string1 && d <= end1) 4323 dend = end_match_1; 4324 } 4325 else 4326 break; /* Matching at this starting point really fails. */ 4327 } /* for (;;) */ 4328 4329 if (best_regs_set) 4330 goto restore_best_regs; 4331 4332 FREE_VARIABLES (); 4333 4334 return -1; /* Failure to match. */ 4335 } /* re_match_2 */ 4336 4337 /* Subroutine definitions for re_match_2. */ 4338 4339 4340 /* We are passed P pointing to a register number after a start_memory. 4341 4342 Return true if the pattern up to the corresponding stop_memory can 4343 match the empty string, and false otherwise. 4344 4345 If we find the matching stop_memory, sets P to point to one past its number. 4346 Otherwise, sets P to an undefined byte less than or equal to END. 4347 4348 We don't handle duplicates properly (yet). */ 4349 4350 static boolean 4351 group_match_null_string_p (p, end, reg_info) 4352 unsigned char **p, *end; 4353 register_info_type *reg_info; 4354 { 4355 int mcnt; 4356 /* Point to after the args to the start_memory. */ 4357 unsigned char *p1 = *p + 2; 4358 4359 while (p1 < end) 4360 { 4361 /* Skip over opcodes that can match nothing, and return true or 4362 false, as appropriate, when we get to one that can't, or to the 4363 matching stop_memory. */ 4364 4365 switch ((re_opcode_t) *p1) 4366 { 4367 /* Could be either a loop or a series of alternatives. */ 4368 case on_failure_jump: 4369 p1++; 4370 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4371 4372 /* If the next operation is not a jump backwards in the 4373 pattern. */ 4374 4375 if (mcnt >= 0) 4376 { 4377 /* Go through the on_failure_jumps of the alternatives, 4378 seeing if any of the alternatives cannot match nothing. 4379 The last alternative starts with only a jump, 4380 whereas the rest start with on_failure_jump and end 4381 with a jump, e.g., here is the pattern for `a|b|c': 4382 4383 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 4384 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 4385 /exactn/1/c 4386 4387 So, we have to first go through the first (n-1) 4388 alternatives and then deal with the last one separately. */ 4389 4390 4391 /* Deal with the first (n-1) alternatives, which start 4392 with an on_failure_jump (see above) that jumps to right 4393 past a jump_past_alt. */ 4394 4395 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) 4396 { 4397 /* `mcnt' holds how many bytes long the alternative 4398 is, including the ending `jump_past_alt' and 4399 its number. */ 4400 4401 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, 4402 reg_info)) 4403 return false; 4404 4405 /* Move to right after this alternative, including the 4406 jump_past_alt. */ 4407 p1 += mcnt; 4408 4409 /* Break if it's the beginning of an n-th alternative 4410 that doesn't begin with an on_failure_jump. */ 4411 if ((re_opcode_t) *p1 != on_failure_jump) 4412 break; 4413 4414 /* Still have to check that it's not an n-th 4415 alternative that starts with an on_failure_jump. */ 4416 p1++; 4417 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4418 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) 4419 { 4420 /* Get to the beginning of the n-th alternative. */ 4421 p1 -= 3; 4422 break; 4423 } 4424 } 4425 4426 /* Deal with the last alternative: go back and get number 4427 of the `jump_past_alt' just before it. `mcnt' contains 4428 the length of the alternative. */ 4429 EXTRACT_NUMBER (mcnt, p1 - 2); 4430 4431 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) 4432 return false; 4433 4434 p1 += mcnt; /* Get past the n-th alternative. */ 4435 } /* if mcnt > 0 */ 4436 break; 4437 4438 4439 case stop_memory: 4440 assert (p1[1] == **p); 4441 *p = p1 + 2; 4442 return true; 4443 4444 4445 default: 4446 if (!common_op_match_null_string_p (&p1, end, reg_info)) 4447 return false; 4448 } 4449 } /* while p1 < end */ 4450 4451 return false; 4452 } /* group_match_null_string_p */ 4453 4454 4455 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: 4456 It expects P to be the first byte of a single alternative and END one 4457 byte past the last. The alternative can contain groups. */ 4458 4459 static boolean 4460 alt_match_null_string_p (p, end, reg_info) 4461 unsigned char *p, *end; 4462 register_info_type *reg_info; 4463 { 4464 int mcnt; 4465 unsigned char *p1 = p; 4466 4467 while (p1 < end) 4468 { 4469 /* Skip over opcodes that can match nothing, and break when we get 4470 to one that can't. */ 4471 4472 switch ((re_opcode_t) *p1) 4473 { 4474 /* It's a loop. */ 4475 case on_failure_jump: 4476 p1++; 4477 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4478 p1 += mcnt; 4479 break; 4480 4481 default: 4482 if (!common_op_match_null_string_p (&p1, end, reg_info)) 4483 return false; 4484 } 4485 } /* while p1 < end */ 4486 4487 return true; 4488 } /* alt_match_null_string_p */ 4489 4490 4491 /* Deals with the ops common to group_match_null_string_p and 4492 alt_match_null_string_p. 4493 4494 Sets P to one after the op and its arguments, if any. */ 4495 4496 static boolean 4497 common_op_match_null_string_p (p, end, reg_info) 4498 unsigned char **p, *end; 4499 register_info_type *reg_info; 4500 { 4501 int mcnt; 4502 boolean ret; 4503 int reg_no; 4504 unsigned char *p1 = *p; 4505 4506 switch ((re_opcode_t) *p1++) 4507 { 4508 case no_op: 4509 case begline: 4510 case endline: 4511 case begbuf: 4512 case endbuf: 4513 case wordbeg: 4514 case wordend: 4515 case wordbound: 4516 case notwordbound: 4517 #ifdef emacs 4518 case before_dot: 4519 case at_dot: 4520 case after_dot: 4521 #endif 4522 break; 4523 4524 case start_memory: 4525 reg_no = *p1; 4526 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 4527 ret = group_match_null_string_p (&p1, end, reg_info); 4528 4529 /* Have to set this here in case we're checking a group which 4530 contains a group and a back reference to it. */ 4531 4532 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 4533 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 4534 4535 if (!ret) 4536 return false; 4537 break; 4538 4539 /* If this is an optimized succeed_n for zero times, make the jump. */ 4540 case jump: 4541 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4542 if (mcnt >= 0) 4543 p1 += mcnt; 4544 else 4545 return false; 4546 break; 4547 4548 case succeed_n: 4549 /* Get to the number of times to succeed. */ 4550 p1 += 2; 4551 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4552 4553 if (mcnt == 0) 4554 { 4555 p1 -= 4; 4556 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4557 p1 += mcnt; 4558 } 4559 else 4560 return false; 4561 break; 4562 4563 case duplicate: 4564 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 4565 return false; 4566 break; 4567 4568 case set_number_at: 4569 p1 += 4; 4570 4571 default: 4572 /* All other opcodes mean we cannot match the empty string. */ 4573 return false; 4574 } 4575 4576 *p = p1; 4577 return true; 4578 } /* common_op_match_null_string_p */ 4579 4580 4581 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 4582 bytes; nonzero otherwise. */ 4583 4584 static int 4585 bcmp_translate (s1, s2, len, translate) 4586 unsigned char *s1, *s2; 4587 register int len; 4588 char *translate; 4589 { 4590 register unsigned char *p1 = s1, *p2 = s2; 4591 while (len) 4592 { 4593 if (translate[*p1++] != translate[*p2++]) return 1; 4594 len--; 4595 } 4596 return 0; 4597 } 4598 4599 /* Entry points for GNU code. */ 4600 4601 /* re_compile_pattern is the GNU regular expression compiler: it 4602 compiles PATTERN (of length SIZE) and puts the result in BUFP. 4603 Returns 0 if the pattern was valid, otherwise an error string. 4604 4605 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 4606 are set in BUFP on entry. 4607 4608 We call regex_compile to do the actual compilation. */ 4609 4610 const char * 4611 re_compile_pattern (pattern, length, bufp) 4612 const char *pattern; 4613 int length; 4614 struct re_pattern_buffer *bufp; 4615 { 4616 reg_errcode_t ret; 4617 4618 /* GNU code is written to assume at least RE_NREGS registers will be set 4619 (and at least one extra will be -1). */ 4620 bufp->regs_allocated = REGS_UNALLOCATED; 4621 4622 /* And GNU code determines whether or not to get register information 4623 by passing null for the REGS argument to re_match, etc., not by 4624 setting no_sub. */ 4625 bufp->no_sub = 0; 4626 4627 /* Match anchors at newline. */ 4628 bufp->newline_anchor = 1; 4629 4630 ret = regex_compile (pattern, length, re_syntax_options, bufp); 4631 4632 return re_error_msg[(int) ret]; 4633 } 4634 4635 /* Entry points compatible with 4.2 BSD regex library. We don't define 4636 them if this is an Emacs or POSIX compilation. */ 4637 4638 #if !defined (emacs) && !defined (_POSIX_SOURCE) 4639 4640 /* BSD has one and only one pattern buffer. */ 4641 static struct re_pattern_buffer re_comp_buf; 4642 4643 char * 4644 re_comp (s) 4645 const char *s; 4646 { 4647 reg_errcode_t ret; 4648 4649 if (!s) 4650 { 4651 if (!re_comp_buf.buffer) 4652 return "No previous regular expression"; 4653 return 0; 4654 } 4655 4656 if (!re_comp_buf.buffer) 4657 { 4658 re_comp_buf.buffer = (unsigned char *) malloc (200); 4659 if (re_comp_buf.buffer == NULL) 4660 return "Memory exhausted"; 4661 re_comp_buf.allocated = 200; 4662 4663 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 4664 if (re_comp_buf.fastmap == NULL) 4665 return "Memory exhausted"; 4666 } 4667 4668 /* Since `re_exec' always passes NULL for the `regs' argument, we 4669 don't need to initialize the pattern buffer fields which affect it. */ 4670 4671 /* Match anchors at newlines. */ 4672 re_comp_buf.newline_anchor = 1; 4673 4674 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 4675 4676 /* Yes, we're discarding `const' here. */ 4677 return (char *) re_error_msg[(int) ret]; 4678 } 4679 4680 4681 int 4682 re_exec (s) 4683 const char *s; 4684 { 4685 const int len = strlen (s); 4686 return 4687 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 4688 } 4689 #endif /* not emacs and not _POSIX_SOURCE */ 4690 4691 /* POSIX.2 functions. Don't define these for Emacs. */ 4692 4693 #ifndef emacs 4694 4695 /* regcomp takes a regular expression as a string and compiles it. 4696 4697 PREG is a regex_t *. We do not expect any fields to be initialized, 4698 since POSIX says we shouldn't. Thus, we set 4699 4700 `buffer' to the compiled pattern; 4701 `used' to the length of the compiled pattern; 4702 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 4703 REG_EXTENDED bit in CFLAGS is set; otherwise, to 4704 RE_SYNTAX_POSIX_BASIC; 4705 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 4706 `fastmap' and `fastmap_accurate' to zero; 4707 `re_nsub' to the number of subexpressions in PATTERN. 4708 4709 PATTERN is the address of the pattern string. 4710 4711 CFLAGS is a series of bits which affect compilation. 4712 4713 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 4714 use POSIX basic syntax. 4715 4716 If REG_NEWLINE is set, then . and [^...] don't match newline. 4717 Also, regexec will try a match beginning after every newline. 4718 4719 If REG_ICASE is set, then we considers upper- and lowercase 4720 versions of letters to be equivalent when matching. 4721 4722 If REG_NOSUB is set, then when PREG is passed to regexec, that 4723 routine will report only success or failure, and nothing about the 4724 registers. 4725 4726 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 4727 the return codes and their meanings.) */ 4728 4729 int 4730 regcomp (preg, pattern, cflags) 4731 regex_t *preg; 4732 const char *pattern; 4733 int cflags; 4734 { 4735 reg_errcode_t ret; 4736 unsigned syntax 4737 = (cflags & REG_EXTENDED) ? 4738 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 4739 4740 /* regex_compile will allocate the space for the compiled pattern. */ 4741 preg->buffer = 0; 4742 preg->allocated = 0; 4743 4744 /* Don't bother to use a fastmap when searching. This simplifies the 4745 REG_NEWLINE case: if we used a fastmap, we'd have to put all the 4746 characters after newlines into the fastmap. This way, we just try 4747 every character. */ 4748 preg->fastmap = 0; 4749 4750 if (cflags & REG_ICASE) 4751 { 4752 unsigned i; 4753 4754 preg->translate = (char *) malloc (CHAR_SET_SIZE); 4755 if (preg->translate == NULL) 4756 return (int) REG_ESPACE; 4757 4758 /* Map uppercase characters to corresponding lowercase ones. */ 4759 for (i = 0; i < CHAR_SET_SIZE; i++) 4760 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; 4761 } 4762 else 4763 preg->translate = NULL; 4764 4765 /* If REG_NEWLINE is set, newlines are treated differently. */ 4766 if (cflags & REG_NEWLINE) 4767 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 4768 syntax &= ~RE_DOT_NEWLINE; 4769 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 4770 /* It also changes the matching behavior. */ 4771 preg->newline_anchor = 1; 4772 } 4773 else 4774 preg->newline_anchor = 0; 4775 4776 preg->no_sub = !!(cflags & REG_NOSUB); 4777 4778 /* POSIX says a null character in the pattern terminates it, so we 4779 can use strlen here in compiling the pattern. */ 4780 ret = regex_compile (pattern, strlen (pattern), syntax, preg); 4781 4782 /* POSIX doesn't distinguish between an unmatched open-group and an 4783 unmatched close-group: both are REG_EPAREN. */ 4784 if (ret == REG_ERPAREN) ret = REG_EPAREN; 4785 4786 return (int) ret; 4787 } 4788 4789 4790 /* regexec searches for a given pattern, specified by PREG, in the 4791 string STRING. 4792 4793 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 4794 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 4795 least NMATCH elements, and we set them to the offsets of the 4796 corresponding matched substrings. 4797 4798 EFLAGS specifies `execution flags' which affect matching: if 4799 REG_NOTBOL is set, then ^ does not match at the beginning of the 4800 string; if REG_NOTEOL is set, then $ does not match at the end. 4801 4802 We return 0 if we find a match and REG_NOMATCH if not. */ 4803 4804 int 4805 regexec (preg, string, nmatch, pmatch, eflags) 4806 const regex_t *preg; 4807 const char *string; 4808 size_t nmatch; 4809 regmatch_t pmatch[]; 4810 int eflags; 4811 { 4812 int ret; 4813 struct re_registers regs; 4814 regex_t private_preg; 4815 int len = strlen (string); 4816 boolean want_reg_info = !preg->no_sub && nmatch > 0; 4817 4818 private_preg = *preg; 4819 4820 private_preg.not_bol = !!(eflags & REG_NOTBOL); 4821 private_preg.not_eol = !!(eflags & REG_NOTEOL); 4822 4823 /* The user has told us exactly how many registers to return 4824 information about, via `nmatch'. We have to pass that on to the 4825 matching routines. */ 4826 private_preg.regs_allocated = REGS_FIXED; 4827 4828 if (want_reg_info) 4829 { 4830 regs.num_regs = nmatch; 4831 regs.start = TALLOC (nmatch, regoff_t); 4832 regs.end = TALLOC (nmatch, regoff_t); 4833 if (regs.start == NULL || regs.end == NULL) 4834 return (int) REG_NOMATCH; 4835 } 4836 4837 /* Perform the searching operation. */ 4838 ret = re_search (&private_preg, string, len, 4839 /* start: */ 0, /* range: */ len, 4840 want_reg_info ? ®s : (struct re_registers *) 0); 4841 4842 /* Copy the register information to the POSIX structure. */ 4843 if (want_reg_info) 4844 { 4845 if (ret >= 0) 4846 { 4847 unsigned r; 4848 4849 for (r = 0; r < nmatch; r++) 4850 { 4851 pmatch[r].rm_so = regs.start[r]; 4852 pmatch[r].rm_eo = regs.end[r]; 4853 } 4854 } 4855 4856 /* If we needed the temporary register info, free the space now. */ 4857 free (regs.start); 4858 free (regs.end); 4859 } 4860 4861 /* We want zero return to mean success, unlike `re_search'. */ 4862 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 4863 } 4864 4865 4866 /* Returns a message corresponding to an error code, ERRCODE, returned 4867 from either regcomp or regexec. We don't use PREG here. */ 4868 4869 size_t 4870 regerror (errcode, preg, errbuf, errbuf_size) 4871 int errcode; 4872 const regex_t *preg; 4873 char *errbuf; 4874 size_t errbuf_size; 4875 { 4876 const char *msg; 4877 size_t msg_size; 4878 4879 if (errcode < 0 4880 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) 4881 /* Only error codes returned by the rest of the code should be passed 4882 to this routine. If we are given anything else, or if other regex 4883 code generates an invalid error code, then the program has a bug. 4884 Dump core so we can fix it. */ 4885 abort (); 4886 4887 msg = re_error_msg[errcode]; 4888 4889 /* POSIX doesn't require that we do anything in this case, but why 4890 not be nice. */ 4891 if (! msg) 4892 msg = "Success"; 4893 4894 msg_size = strlen (msg) + 1; /* Includes the null. */ 4895 4896 if (errbuf_size != 0) 4897 { 4898 if (msg_size > errbuf_size) 4899 { 4900 strncpy (errbuf, msg, errbuf_size - 1); 4901 errbuf[errbuf_size - 1] = 0; 4902 } 4903 else 4904 strcpy (errbuf, msg); 4905 } 4906 4907 return msg_size; 4908 } 4909 4910 4911 /* Free dynamically allocated space used by PREG. */ 4912 4913 void 4914 regfree (preg) 4915 regex_t *preg; 4916 { 4917 if (preg->buffer != NULL) 4918 free (preg->buffer); 4919 preg->buffer = NULL; 4920 4921 preg->allocated = 0; 4922 preg->used = 0; 4923 4924 if (preg->fastmap != NULL) 4925 free (preg->fastmap); 4926 preg->fastmap = NULL; 4927 preg->fastmap_accurate = 0; 4928 4929 if (preg->translate != NULL) 4930 free (preg->translate); 4931 preg->translate = NULL; 4932 } 4933 4934 #endif /* not emacs */ 4935 4936 /* 4937 Local variables: 4938 make-backup-files: t 4939 version-control: t 4940 trim-versions-without-asking: nil 4941 End: 4942 */ 4943