1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002,2003,2004,2005,2006,2007,2009 3 Free Software Foundation, Inc. 4 This file is part of the GNU C Library. 5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 6 7 The GNU C Library is free software; you can redistribute it and/or 8 modify it under the terms of the GNU Lesser General Public 9 License as published by the Free Software Foundation; either 10 version 2.1 of the License, or (at your option) any later version. 11 12 The GNU C Library is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 Lesser General Public License for more details. 16 17 You should have received a copy of the GNU Lesser General Public 18 License along with the GNU C Library; if not, write to the Free 19 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 20 02111-1307 USA. */ 21 22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 23 size_t length, reg_syntax_t syntax); 24 static void re_compile_fastmap_iter (regex_t *bufp, 25 const re_dfastate_t *init_state, 26 char *fastmap); 27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 28 #ifdef RE_ENABLE_I18N 29 static void free_charset (re_charset_t *cset); 30 #endif /* RE_ENABLE_I18N */ 31 static void free_workarea_compile (regex_t *preg); 32 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 33 #ifdef RE_ENABLE_I18N 34 static void optimize_utf8 (re_dfa_t *dfa); 35 #endif 36 static reg_errcode_t analyze (regex_t *preg); 37 static reg_errcode_t preorder (bin_tree_t *root, 38 reg_errcode_t (fn (void *, bin_tree_t *)), 39 void *extra); 40 static reg_errcode_t postorder (bin_tree_t *root, 41 reg_errcode_t (fn (void *, bin_tree_t *)), 42 void *extra); 43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 46 bin_tree_t *node); 47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 50 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); 51 static int search_duplicated_node (const re_dfa_t *dfa, int org_node, 52 unsigned int constraint); 53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 55 int node, int root); 56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 57 static int fetch_number (re_string_t *input, re_token_t *token, 58 reg_syntax_t syntax); 59 static int peek_token (re_token_t *token, re_string_t *input, 60 reg_syntax_t syntax) internal_function; 61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 62 reg_syntax_t syntax, reg_errcode_t *err); 63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 64 re_token_t *token, reg_syntax_t syntax, 65 int nest, reg_errcode_t *err); 66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 67 re_token_t *token, reg_syntax_t syntax, 68 int nest, reg_errcode_t *err); 69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 70 re_token_t *token, reg_syntax_t syntax, 71 int nest, reg_errcode_t *err); 72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 73 re_token_t *token, reg_syntax_t syntax, 74 int nest, reg_errcode_t *err); 75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 76 re_dfa_t *dfa, re_token_t *token, 77 reg_syntax_t syntax, reg_errcode_t *err); 78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 79 re_token_t *token, reg_syntax_t syntax, 80 reg_errcode_t *err); 81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 82 re_string_t *regexp, 83 re_token_t *token, int token_len, 84 re_dfa_t *dfa, 85 reg_syntax_t syntax, 86 int accept_hyphen); 87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 88 re_string_t *regexp, 89 re_token_t *token); 90 #ifdef RE_ENABLE_I18N 91 static reg_errcode_t build_equiv_class (bitset_t sbcset, 92 re_charset_t *mbcset, 93 int *equiv_class_alloc, 94 const unsigned char *name); 95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 96 bitset_t sbcset, 97 re_charset_t *mbcset, 98 int *char_class_alloc, 99 const unsigned char *class_name, 100 reg_syntax_t syntax); 101 #else /* not RE_ENABLE_I18N */ 102 static reg_errcode_t build_equiv_class (bitset_t sbcset, 103 const unsigned char *name); 104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 105 bitset_t sbcset, 106 const unsigned char *class_name, 107 reg_syntax_t syntax); 108 #endif /* not RE_ENABLE_I18N */ 109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 110 RE_TRANSLATE_TYPE trans, 111 const unsigned char *class_name, 112 const unsigned char *extra, 113 int non_match, reg_errcode_t *err); 114 static bin_tree_t *create_tree (re_dfa_t *dfa, 115 bin_tree_t *left, bin_tree_t *right, 116 re_token_type_t type); 117 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 118 bin_tree_t *left, bin_tree_t *right, 119 const re_token_t *token); 120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 121 static void free_token (re_token_t *node); 122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 124 125 /* This table gives an error message for each of the error codes listed 126 in regex.h. Obviously the order here has to be same as there. 127 POSIX doesn't require that we do anything for REG_NOERROR, 128 but why not be nice? */ 129 130 const char __re_error_msgid[] attribute_hidden = 131 { 132 #define REG_NOERROR_IDX 0 133 gettext_noop ("Success") /* REG_NOERROR */ 134 "\0" 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 136 gettext_noop ("No match") /* REG_NOMATCH */ 137 "\0" 138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 140 "\0" 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 143 "\0" 144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 146 "\0" 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 149 "\0" 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 152 "\0" 153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 155 "\0" 156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 158 "\0" 159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 161 "\0" 162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 164 "\0" 165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 166 gettext_noop ("Invalid range end") /* REG_ERANGE */ 167 "\0" 168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 169 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 170 "\0" 171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 173 "\0" 174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 175 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 176 "\0" 177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 178 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 179 "\0" 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 182 }; 183 184 const size_t __re_error_msgid_idx[] attribute_hidden = 185 { 186 REG_NOERROR_IDX, 187 REG_NOMATCH_IDX, 188 REG_BADPAT_IDX, 189 REG_ECOLLATE_IDX, 190 REG_ECTYPE_IDX, 191 REG_EESCAPE_IDX, 192 REG_ESUBREG_IDX, 193 REG_EBRACK_IDX, 194 REG_EPAREN_IDX, 195 REG_EBRACE_IDX, 196 REG_BADBR_IDX, 197 REG_ERANGE_IDX, 198 REG_ESPACE_IDX, 199 REG_BADRPT_IDX, 200 REG_EEND_IDX, 201 REG_ESIZE_IDX, 202 REG_ERPAREN_IDX 203 }; 204 205 /* Entry points for GNU code. */ 206 207 /* re_compile_pattern is the GNU regular expression compiler: it 208 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 209 Returns 0 if the pattern was valid, otherwise an error string. 210 211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 212 are set in BUFP on entry. */ 213 214 const char * 215 re_compile_pattern (pattern, length, bufp) 216 const char *pattern; 217 size_t length; 218 struct re_pattern_buffer *bufp; 219 { 220 reg_errcode_t ret; 221 222 /* And GNU code determines whether or not to get register information 223 by passing null for the REGS argument to re_match, etc., not by 224 setting no_sub, unless RE_NO_SUB is set. */ 225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 226 227 /* Match anchors at newline. */ 228 bufp->newline_anchor = 1; 229 230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 231 232 if (!ret) 233 return NULL; 234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 235 } 236 #ifdef _LIBC 237 weak_alias (__re_compile_pattern, re_compile_pattern) 238 #endif 239 240 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 241 also be assigned to arbitrarily: each pattern buffer stores its own 242 syntax, so it can be changed between regex compilations. */ 243 /* This has no initializer because initialized variables in Emacs 244 become read-only after dumping. */ 245 reg_syntax_t re_syntax_options; 246 247 248 /* Specify the precise syntax of regexps for compilation. This provides 249 for compatibility for various utilities which historically have 250 different, incompatible syntaxes. 251 252 The argument SYNTAX is a bit mask comprised of the various bits 253 defined in regex.h. We return the old syntax. */ 254 255 reg_syntax_t 256 re_set_syntax (syntax) 257 reg_syntax_t syntax; 258 { 259 reg_syntax_t ret = re_syntax_options; 260 261 re_syntax_options = syntax; 262 return ret; 263 } 264 #ifdef _LIBC 265 weak_alias (__re_set_syntax, re_set_syntax) 266 #endif 267 268 int 269 re_compile_fastmap (bufp) 270 struct re_pattern_buffer *bufp; 271 { 272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 273 char *fastmap = bufp->fastmap; 274 275 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 277 if (dfa->init_state != dfa->init_state_word) 278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 279 if (dfa->init_state != dfa->init_state_nl) 280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 281 if (dfa->init_state != dfa->init_state_begbuf) 282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 283 bufp->fastmap_accurate = 1; 284 return 0; 285 } 286 #ifdef _LIBC 287 weak_alias (__re_compile_fastmap, re_compile_fastmap) 288 #endif 289 290 static inline void 291 __attribute ((always_inline)) 292 re_set_fastmap (char *fastmap, int icase, int ch) 293 { 294 fastmap[ch] = 1; 295 if (icase) 296 fastmap[tolower (ch)] = 1; 297 } 298 299 /* Helper function for re_compile_fastmap. 300 Compile fastmap for the initial_state INIT_STATE. */ 301 302 static void 303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 304 char *fastmap) 305 { 306 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 307 int node_cnt; 308 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 309 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 310 { 311 int node = init_state->nodes.elems[node_cnt]; 312 re_token_type_t type = dfa->nodes[node].type; 313 314 if (type == CHARACTER) 315 { 316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 317 #ifdef RE_ENABLE_I18N 318 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 319 { 320 unsigned char *buf = alloca (dfa->mb_cur_max), *p; 321 wchar_t wc; 322 mbstate_t state; 323 324 p = buf; 325 *p++ = dfa->nodes[node].opr.c; 326 while (++node < dfa->nodes_len 327 && dfa->nodes[node].type == CHARACTER 328 && dfa->nodes[node].mb_partial) 329 *p++ = dfa->nodes[node].opr.c; 330 memset (&state, '\0', sizeof (state)); 331 if (__mbrtowc (&wc, (const char *) buf, p - buf, 332 &state) == p - buf 333 && (__wcrtomb ((char *) buf, towlower (wc), &state) 334 != (size_t) -1)) 335 re_set_fastmap (fastmap, 0, buf[0]); 336 } 337 #endif 338 } 339 else if (type == SIMPLE_BRACKET) 340 { 341 int i, ch; 342 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 343 { 344 int j; 345 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 346 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 347 if (w & ((bitset_word_t) 1 << j)) 348 re_set_fastmap (fastmap, icase, ch); 349 } 350 } 351 #ifdef RE_ENABLE_I18N 352 else if (type == COMPLEX_BRACKET) 353 { 354 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 355 int i; 356 357 # ifdef _LIBC 358 /* See if we have to try all bytes which start multiple collation 359 elements. 360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 361 collation element, and don't catch 'b' since 'b' is 362 the only collation element which starts from 'b' (and 363 it is caught by SIMPLE_BRACKET). */ 364 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 365 && (cset->ncoll_syms || cset->nranges)) 366 { 367 const int32_t *table = (const int32_t *) 368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 369 for (i = 0; i < SBC_MAX; ++i) 370 if (table[i] < 0) 371 re_set_fastmap (fastmap, icase, i); 372 } 373 # endif /* _LIBC */ 374 375 /* See if we have to start the match at all multibyte characters, 376 i.e. where we would not find an invalid sequence. This only 377 applies to multibyte character sets; for single byte character 378 sets, the SIMPLE_BRACKET again suffices. */ 379 if (dfa->mb_cur_max > 1 380 && (cset->nchar_classes || cset->non_match 381 # ifdef _LIBC 382 || cset->nequiv_classes 383 # endif /* _LIBC */ 384 )) 385 { 386 unsigned char c = 0; 387 do 388 { 389 mbstate_t mbs; 390 memset (&mbs, 0, sizeof (mbs)); 391 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 392 re_set_fastmap (fastmap, false, (int) c); 393 } 394 while (++c != 0); 395 } 396 397 else 398 { 399 /* ... Else catch all bytes which can start the mbchars. */ 400 for (i = 0; i < cset->nmbchars; ++i) 401 { 402 char buf[256]; 403 mbstate_t state; 404 memset (&state, '\0', sizeof (state)); 405 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 406 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 407 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 408 { 409 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 410 != (size_t) -1) 411 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 412 } 413 } 414 } 415 } 416 #endif /* RE_ENABLE_I18N */ 417 else if (type == OP_PERIOD 418 #ifdef RE_ENABLE_I18N 419 || type == OP_UTF8_PERIOD 420 #endif /* RE_ENABLE_I18N */ 421 || type == END_OF_RE) 422 { 423 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 424 if (type == END_OF_RE) 425 bufp->can_be_null = 1; 426 return; 427 } 428 } 429 } 430 431 /* Entry point for POSIX code. */ 432 /* regcomp takes a regular expression as a string and compiles it. 433 434 PREG is a regex_t *. We do not expect any fields to be initialized, 435 since POSIX says we shouldn't. Thus, we set 436 437 `buffer' to the compiled pattern; 438 `used' to the length of the compiled pattern; 439 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 440 REG_EXTENDED bit in CFLAGS is set; otherwise, to 441 RE_SYNTAX_POSIX_BASIC; 442 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 443 `fastmap' to an allocated space for the fastmap; 444 `fastmap_accurate' to zero; 445 `re_nsub' to the number of subexpressions in PATTERN. 446 447 PATTERN is the address of the pattern string. 448 449 CFLAGS is a series of bits which affect compilation. 450 451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 452 use POSIX basic syntax. 453 454 If REG_NEWLINE is set, then . and [^...] don't match newline. 455 Also, regexec will try a match beginning after every newline. 456 457 If REG_ICASE is set, then we considers upper- and lowercase 458 versions of letters to be equivalent when matching. 459 460 If REG_NOSUB is set, then when PREG is passed to regexec, that 461 routine will report only success or failure, and nothing about the 462 registers. 463 464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 465 the return codes and their meanings.) */ 466 467 int 468 regcomp (preg, pattern, cflags) 469 regex_t *__restrict preg; 470 const char *__restrict pattern; 471 int cflags; 472 { 473 reg_errcode_t ret; 474 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 475 : RE_SYNTAX_POSIX_BASIC); 476 477 preg->buffer = NULL; 478 preg->allocated = 0; 479 preg->used = 0; 480 481 /* Try to allocate space for the fastmap. */ 482 preg->fastmap = re_malloc (char, SBC_MAX); 483 if (BE (preg->fastmap == NULL, 0)) 484 return REG_ESPACE; 485 486 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 487 488 /* If REG_NEWLINE is set, newlines are treated differently. */ 489 if (cflags & REG_NEWLINE) 490 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 491 syntax &= ~RE_DOT_NEWLINE; 492 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 493 /* It also changes the matching behavior. */ 494 preg->newline_anchor = 1; 495 } 496 else 497 preg->newline_anchor = 0; 498 preg->no_sub = !!(cflags & REG_NOSUB); 499 preg->translate = NULL; 500 501 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 502 503 /* POSIX doesn't distinguish between an unmatched open-group and an 504 unmatched close-group: both are REG_EPAREN. */ 505 if (ret == REG_ERPAREN) 506 ret = REG_EPAREN; 507 508 /* We have already checked preg->fastmap != NULL. */ 509 if (BE (ret == REG_NOERROR, 1)) 510 /* Compute the fastmap now, since regexec cannot modify the pattern 511 buffer. This function never fails in this implementation. */ 512 (void) re_compile_fastmap (preg); 513 else 514 { 515 /* Some error occurred while compiling the expression. */ 516 re_free (preg->fastmap); 517 preg->fastmap = NULL; 518 } 519 520 return (int) ret; 521 } 522 #ifdef _LIBC 523 weak_alias (__regcomp, regcomp) 524 #endif 525 526 /* Returns a message corresponding to an error code, ERRCODE, returned 527 from either regcomp or regexec. We don't use PREG here. */ 528 529 size_t 530 regerror (errcode, preg, errbuf, errbuf_size) 531 int errcode; 532 const regex_t *__restrict preg; 533 char *__restrict errbuf; 534 size_t errbuf_size; 535 { 536 const char *msg; 537 size_t msg_size; 538 539 if (BE (errcode < 0 540 || errcode >= (int) (sizeof (__re_error_msgid_idx) 541 / sizeof (__re_error_msgid_idx[0])), 0)) 542 /* Only error codes returned by the rest of the code should be passed 543 to this routine. If we are given anything else, or if other regex 544 code generates an invalid error code, then the program has a bug. 545 Dump core so we can fix it. */ 546 abort (); 547 548 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 549 550 msg_size = strlen (msg) + 1; /* Includes the null. */ 551 552 if (BE (errbuf_size != 0, 1)) 553 { 554 if (BE (msg_size > errbuf_size, 0)) 555 { 556 #if defined HAVE_MEMPCPY || defined _LIBC 557 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 558 #else 559 memcpy (errbuf, msg, errbuf_size - 1); 560 errbuf[errbuf_size - 1] = 0; 561 #endif 562 } 563 else 564 memcpy (errbuf, msg, msg_size); 565 } 566 567 return msg_size; 568 } 569 #ifdef _LIBC 570 weak_alias (__regerror, regerror) 571 #endif 572 573 574 #ifdef RE_ENABLE_I18N 575 /* This static array is used for the map to single-byte characters when 576 UTF-8 is used. Otherwise we would allocate memory just to initialize 577 it the same all the time. UTF-8 is the preferred encoding so this is 578 a worthwhile optimization. */ 579 static const bitset_t utf8_sb_map = 580 { 581 /* Set the first 128 bits. */ 582 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 583 }; 584 #endif 585 586 587 static void 588 free_dfa_content (re_dfa_t *dfa) 589 { 590 int i, j; 591 592 if (dfa->nodes) 593 for (i = 0; i < dfa->nodes_len; ++i) 594 free_token (dfa->nodes + i); 595 re_free (dfa->nexts); 596 for (i = 0; i < dfa->nodes_len; ++i) 597 { 598 if (dfa->eclosures != NULL) 599 re_node_set_free (dfa->eclosures + i); 600 if (dfa->inveclosures != NULL) 601 re_node_set_free (dfa->inveclosures + i); 602 if (dfa->edests != NULL) 603 re_node_set_free (dfa->edests + i); 604 } 605 re_free (dfa->edests); 606 re_free (dfa->eclosures); 607 re_free (dfa->inveclosures); 608 re_free (dfa->nodes); 609 610 if (dfa->state_table) 611 for (i = 0; i <= dfa->state_hash_mask; ++i) 612 { 613 struct re_state_table_entry *entry = dfa->state_table + i; 614 for (j = 0; j < entry->num; ++j) 615 { 616 re_dfastate_t *state = entry->array[j]; 617 free_state (state); 618 } 619 re_free (entry->array); 620 } 621 re_free (dfa->state_table); 622 #ifdef RE_ENABLE_I18N 623 if (dfa->sb_char != utf8_sb_map) 624 re_free (dfa->sb_char); 625 #endif 626 re_free (dfa->subexp_map); 627 #ifdef DEBUG 628 re_free (dfa->re_str); 629 #endif 630 631 re_free (dfa); 632 } 633 634 635 /* Free dynamically allocated space used by PREG. */ 636 637 void 638 regfree (preg) 639 regex_t *preg; 640 { 641 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 642 if (BE (dfa != NULL, 1)) 643 free_dfa_content (dfa); 644 preg->buffer = NULL; 645 preg->allocated = 0; 646 647 re_free (preg->fastmap); 648 preg->fastmap = NULL; 649 650 re_free (preg->translate); 651 preg->translate = NULL; 652 } 653 #ifdef _LIBC 654 weak_alias (__regfree, regfree) 655 #endif 656 657 /* Entry points compatible with 4.2 BSD regex library. We don't define 658 them unless specifically requested. */ 659 660 #if defined _REGEX_RE_COMP || defined _LIBC 661 662 /* BSD has one and only one pattern buffer. */ 663 static struct re_pattern_buffer re_comp_buf; 664 665 char * 666 # ifdef _LIBC 667 /* Make these definitions weak in libc, so POSIX programs can redefine 668 these names if they don't use our functions, and still use 669 regcomp/regexec above without link errors. */ 670 weak_function 671 # endif 672 re_comp (s) 673 const char *s; 674 { 675 reg_errcode_t ret; 676 char *fastmap; 677 678 if (!s) 679 { 680 if (!re_comp_buf.buffer) 681 return gettext ("No previous regular expression"); 682 return 0; 683 } 684 685 if (re_comp_buf.buffer) 686 { 687 fastmap = re_comp_buf.fastmap; 688 re_comp_buf.fastmap = NULL; 689 __regfree (&re_comp_buf); 690 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 691 re_comp_buf.fastmap = fastmap; 692 } 693 694 if (re_comp_buf.fastmap == NULL) 695 { 696 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 697 if (re_comp_buf.fastmap == NULL) 698 return (char *) gettext (__re_error_msgid 699 + __re_error_msgid_idx[(int) REG_ESPACE]); 700 } 701 702 /* Since `re_exec' always passes NULL for the `regs' argument, we 703 don't need to initialize the pattern buffer fields which affect it. */ 704 705 /* Match anchors at newlines. */ 706 re_comp_buf.newline_anchor = 1; 707 708 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 709 710 if (!ret) 711 return NULL; 712 713 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 714 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 715 } 716 717 #ifdef _LIBC 718 libc_freeres_fn (free_mem) 719 { 720 __regfree (&re_comp_buf); 721 } 722 #endif 723 724 #endif /* _REGEX_RE_COMP */ 725 726 /* Internal entry point. 727 Compile the regular expression PATTERN, whose length is LENGTH. 728 SYNTAX indicate regular expression's syntax. */ 729 730 static reg_errcode_t 731 re_compile_internal (regex_t *preg, const char * pattern, size_t length, 732 reg_syntax_t syntax) 733 { 734 reg_errcode_t err = REG_NOERROR; 735 re_dfa_t *dfa; 736 re_string_t regexp; 737 738 /* Initialize the pattern buffer. */ 739 preg->fastmap_accurate = 0; 740 preg->syntax = syntax; 741 preg->not_bol = preg->not_eol = 0; 742 preg->used = 0; 743 preg->re_nsub = 0; 744 preg->can_be_null = 0; 745 preg->regs_allocated = REGS_UNALLOCATED; 746 747 /* Initialize the dfa. */ 748 dfa = (re_dfa_t *) preg->buffer; 749 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 750 { 751 /* If zero allocated, but buffer is non-null, try to realloc 752 enough space. This loses if buffer's address is bogus, but 753 that is the user's responsibility. If ->buffer is NULL this 754 is a simple allocation. */ 755 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 756 if (dfa == NULL) 757 return REG_ESPACE; 758 preg->allocated = sizeof (re_dfa_t); 759 preg->buffer = (unsigned char *) dfa; 760 } 761 preg->used = sizeof (re_dfa_t); 762 763 err = init_dfa (dfa, length); 764 if (BE (err != REG_NOERROR, 0)) 765 { 766 free_dfa_content (dfa); 767 preg->buffer = NULL; 768 preg->allocated = 0; 769 return err; 770 } 771 #ifdef DEBUG 772 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 773 dfa->re_str = re_malloc (char, length + 1); 774 strncpy (dfa->re_str, pattern, length + 1); 775 #endif 776 777 __libc_lock_init (dfa->lock); 778 779 err = re_string_construct (®exp, pattern, length, preg->translate, 780 syntax & RE_ICASE, dfa); 781 if (BE (err != REG_NOERROR, 0)) 782 { 783 re_compile_internal_free_return: 784 free_workarea_compile (preg); 785 re_string_destruct (®exp); 786 free_dfa_content (dfa); 787 preg->buffer = NULL; 788 preg->allocated = 0; 789 return err; 790 } 791 792 /* Parse the regular expression, and build a structure tree. */ 793 preg->re_nsub = 0; 794 dfa->str_tree = parse (®exp, preg, syntax, &err); 795 if (BE (dfa->str_tree == NULL, 0)) 796 goto re_compile_internal_free_return; 797 798 /* Analyze the tree and create the nfa. */ 799 err = analyze (preg); 800 if (BE (err != REG_NOERROR, 0)) 801 goto re_compile_internal_free_return; 802 803 #ifdef RE_ENABLE_I18N 804 /* If possible, do searching in single byte encoding to speed things up. */ 805 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 806 optimize_utf8 (dfa); 807 #endif 808 809 /* Then create the initial state of the dfa. */ 810 err = create_initial_state (dfa); 811 812 /* Release work areas. */ 813 free_workarea_compile (preg); 814 re_string_destruct (®exp); 815 816 if (BE (err != REG_NOERROR, 0)) 817 { 818 free_dfa_content (dfa); 819 preg->buffer = NULL; 820 preg->allocated = 0; 821 } 822 823 return err; 824 } 825 826 /* Initialize DFA. We use the length of the regular expression PAT_LEN 827 as the initial length of some arrays. */ 828 829 static reg_errcode_t 830 init_dfa (re_dfa_t *dfa, size_t pat_len) 831 { 832 unsigned int table_size; 833 #ifndef _LIBC 834 char *codeset_name; 835 #endif 836 837 memset (dfa, '\0', sizeof (re_dfa_t)); 838 839 /* Force allocation of str_tree_storage the first time. */ 840 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 841 842 /* Avoid overflows. */ 843 if (pat_len == SIZE_MAX) 844 return REG_ESPACE; 845 846 dfa->nodes_alloc = pat_len + 1; 847 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 848 849 /* table_size = 2 ^ ceil(log pat_len) */ 850 for (table_size = 1; ; table_size <<= 1) 851 if (table_size > pat_len) 852 break; 853 854 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 855 dfa->state_hash_mask = table_size - 1; 856 857 dfa->mb_cur_max = MB_CUR_MAX; 858 #ifdef _LIBC 859 if (dfa->mb_cur_max == 6 860 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 861 dfa->is_utf8 = 1; 862 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 863 != 0); 864 #else 865 # ifdef HAVE_LANGINFO_CODESET 866 codeset_name = nl_langinfo (CODESET); 867 # else 868 codeset_name = getenv ("LC_ALL"); 869 if (codeset_name == NULL || codeset_name[0] == '\0') 870 codeset_name = getenv ("LC_CTYPE"); 871 if (codeset_name == NULL || codeset_name[0] == '\0') 872 codeset_name = getenv ("LANG"); 873 if (codeset_name == NULL) 874 codeset_name = ""; 875 else if (strchr (codeset_name, '.') != NULL) 876 codeset_name = strchr (codeset_name, '.') + 1; 877 # endif 878 879 if (strcasecmp (codeset_name, "UTF-8") == 0 880 || strcasecmp (codeset_name, "UTF8") == 0) 881 dfa->is_utf8 = 1; 882 883 /* We check exhaustively in the loop below if this charset is a 884 superset of ASCII. */ 885 dfa->map_notascii = 0; 886 #endif 887 888 #ifdef RE_ENABLE_I18N 889 if (dfa->mb_cur_max > 1) 890 { 891 if (dfa->is_utf8) 892 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 893 else 894 { 895 int i, j, ch; 896 897 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 898 if (BE (dfa->sb_char == NULL, 0)) 899 return REG_ESPACE; 900 901 /* Set the bits corresponding to single byte chars. */ 902 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 903 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 904 { 905 wint_t wch = __btowc (ch); 906 if (wch != WEOF) 907 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 908 # ifndef _LIBC 909 if (isascii (ch) && wch != ch) 910 dfa->map_notascii = 1; 911 # endif 912 } 913 } 914 } 915 #endif 916 917 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 918 return REG_ESPACE; 919 return REG_NOERROR; 920 } 921 922 /* Initialize WORD_CHAR table, which indicate which character is 923 "word". In this case "word" means that it is the word construction 924 character used by some operators like "\<", "\>", etc. */ 925 926 static void 927 internal_function 928 init_word_char (re_dfa_t *dfa) 929 { 930 int i, j, ch; 931 dfa->word_ops_used = 1; 932 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 933 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 934 if (isalnum (ch) || ch == '_') 935 dfa->word_char[i] |= (bitset_word_t) 1 << j; 936 } 937 938 /* Free the work area which are only used while compiling. */ 939 940 static void 941 free_workarea_compile (regex_t *preg) 942 { 943 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 944 bin_tree_storage_t *storage, *next; 945 for (storage = dfa->str_tree_storage; storage; storage = next) 946 { 947 next = storage->next; 948 re_free (storage); 949 } 950 dfa->str_tree_storage = NULL; 951 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 952 dfa->str_tree = NULL; 953 re_free (dfa->org_indices); 954 dfa->org_indices = NULL; 955 } 956 957 /* Create initial states for all contexts. */ 958 959 static reg_errcode_t 960 create_initial_state (re_dfa_t *dfa) 961 { 962 int first, i; 963 reg_errcode_t err; 964 re_node_set init_nodes; 965 966 /* Initial states have the epsilon closure of the node which is 967 the first node of the regular expression. */ 968 first = dfa->str_tree->first->node_idx; 969 dfa->init_node = first; 970 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 971 if (BE (err != REG_NOERROR, 0)) 972 return err; 973 974 /* The back-references which are in initial states can epsilon transit, 975 since in this case all of the subexpressions can be null. 976 Then we add epsilon closures of the nodes which are the next nodes of 977 the back-references. */ 978 if (dfa->nbackref > 0) 979 for (i = 0; i < init_nodes.nelem; ++i) 980 { 981 int node_idx = init_nodes.elems[i]; 982 re_token_type_t type = dfa->nodes[node_idx].type; 983 984 int clexp_idx; 985 if (type != OP_BACK_REF) 986 continue; 987 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 988 { 989 re_token_t *clexp_node; 990 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 991 if (clexp_node->type == OP_CLOSE_SUBEXP 992 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 993 break; 994 } 995 if (clexp_idx == init_nodes.nelem) 996 continue; 997 998 if (type == OP_BACK_REF) 999 { 1000 int dest_idx = dfa->edests[node_idx].elems[0]; 1001 if (!re_node_set_contains (&init_nodes, dest_idx)) 1002 { 1003 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1004 i = 0; 1005 } 1006 } 1007 } 1008 1009 /* It must be the first time to invoke acquire_state. */ 1010 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1011 /* We don't check ERR here, since the initial state must not be NULL. */ 1012 if (BE (dfa->init_state == NULL, 0)) 1013 return err; 1014 if (dfa->init_state->has_constraint) 1015 { 1016 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1017 CONTEXT_WORD); 1018 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1019 CONTEXT_NEWLINE); 1020 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1021 &init_nodes, 1022 CONTEXT_NEWLINE 1023 | CONTEXT_BEGBUF); 1024 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1025 || dfa->init_state_begbuf == NULL, 0)) 1026 return err; 1027 } 1028 else 1029 dfa->init_state_word = dfa->init_state_nl 1030 = dfa->init_state_begbuf = dfa->init_state; 1031 1032 re_node_set_free (&init_nodes); 1033 return REG_NOERROR; 1034 } 1035 1036 #ifdef RE_ENABLE_I18N 1037 /* If it is possible to do searching in single byte encoding instead of UTF-8 1038 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1039 DFA nodes where needed. */ 1040 1041 static void 1042 optimize_utf8 (re_dfa_t *dfa) 1043 { 1044 int node, i, mb_chars = 0, has_period = 0; 1045 1046 for (node = 0; node < dfa->nodes_len; ++node) 1047 switch (dfa->nodes[node].type) 1048 { 1049 case CHARACTER: 1050 if (dfa->nodes[node].opr.c >= 0x80) 1051 mb_chars = 1; 1052 break; 1053 case ANCHOR: 1054 switch (dfa->nodes[node].opr.ctx_type) 1055 { 1056 case LINE_FIRST: 1057 case LINE_LAST: 1058 case BUF_FIRST: 1059 case BUF_LAST: 1060 break; 1061 default: 1062 /* Word anchors etc. cannot be handled. It's okay to test 1063 opr.ctx_type since constraints (for all DFA nodes) are 1064 created by ORing one or more opr.ctx_type values. */ 1065 return; 1066 } 1067 break; 1068 case OP_PERIOD: 1069 has_period = 1; 1070 break; 1071 case OP_BACK_REF: 1072 case OP_ALT: 1073 case END_OF_RE: 1074 case OP_DUP_ASTERISK: 1075 case OP_OPEN_SUBEXP: 1076 case OP_CLOSE_SUBEXP: 1077 break; 1078 case COMPLEX_BRACKET: 1079 return; 1080 case SIMPLE_BRACKET: 1081 /* Just double check. The non-ASCII range starts at 0x80. */ 1082 assert (0x80 % BITSET_WORD_BITS == 0); 1083 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1084 if (dfa->nodes[node].opr.sbcset[i]) 1085 return; 1086 break; 1087 default: 1088 abort (); 1089 } 1090 1091 if (mb_chars || has_period) 1092 for (node = 0; node < dfa->nodes_len; ++node) 1093 { 1094 if (dfa->nodes[node].type == CHARACTER 1095 && dfa->nodes[node].opr.c >= 0x80) 1096 dfa->nodes[node].mb_partial = 0; 1097 else if (dfa->nodes[node].type == OP_PERIOD) 1098 dfa->nodes[node].type = OP_UTF8_PERIOD; 1099 } 1100 1101 /* The search can be in single byte locale. */ 1102 dfa->mb_cur_max = 1; 1103 dfa->is_utf8 = 0; 1104 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1105 } 1106 #endif 1107 1108 /* Analyze the structure tree, and calculate "first", "next", "edest", 1109 "eclosure", and "inveclosure". */ 1110 1111 static reg_errcode_t 1112 analyze (regex_t *preg) 1113 { 1114 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1115 reg_errcode_t ret; 1116 1117 /* Allocate arrays. */ 1118 dfa->nexts = re_malloc (int, dfa->nodes_alloc); 1119 dfa->org_indices = re_malloc (int, dfa->nodes_alloc); 1120 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1121 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1122 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1123 || dfa->eclosures == NULL, 0)) 1124 return REG_ESPACE; 1125 1126 dfa->subexp_map = re_malloc (int, preg->re_nsub); 1127 if (dfa->subexp_map != NULL) 1128 { 1129 int i; 1130 for (i = 0; i < preg->re_nsub; i++) 1131 dfa->subexp_map[i] = i; 1132 preorder (dfa->str_tree, optimize_subexps, dfa); 1133 for (i = 0; i < preg->re_nsub; i++) 1134 if (dfa->subexp_map[i] != i) 1135 break; 1136 if (i == preg->re_nsub) 1137 { 1138 free (dfa->subexp_map); 1139 dfa->subexp_map = NULL; 1140 } 1141 } 1142 1143 ret = postorder (dfa->str_tree, lower_subexps, preg); 1144 if (BE (ret != REG_NOERROR, 0)) 1145 return ret; 1146 ret = postorder (dfa->str_tree, calc_first, dfa); 1147 if (BE (ret != REG_NOERROR, 0)) 1148 return ret; 1149 preorder (dfa->str_tree, calc_next, dfa); 1150 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1151 if (BE (ret != REG_NOERROR, 0)) 1152 return ret; 1153 ret = calc_eclosure (dfa); 1154 if (BE (ret != REG_NOERROR, 0)) 1155 return ret; 1156 1157 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1158 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1159 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1160 || dfa->nbackref) 1161 { 1162 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1163 if (BE (dfa->inveclosures == NULL, 0)) 1164 return REG_ESPACE; 1165 ret = calc_inveclosure (dfa); 1166 } 1167 1168 return ret; 1169 } 1170 1171 /* Our parse trees are very unbalanced, so we cannot use a stack to 1172 implement parse tree visits. Instead, we use parent pointers and 1173 some hairy code in these two functions. */ 1174 static reg_errcode_t 1175 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1176 void *extra) 1177 { 1178 bin_tree_t *node, *prev; 1179 1180 for (node = root; ; ) 1181 { 1182 /* Descend down the tree, preferably to the left (or to the right 1183 if that's the only child). */ 1184 while (node->left || node->right) 1185 if (node->left) 1186 node = node->left; 1187 else 1188 node = node->right; 1189 1190 do 1191 { 1192 reg_errcode_t err = fn (extra, node); 1193 if (BE (err != REG_NOERROR, 0)) 1194 return err; 1195 if (node->parent == NULL) 1196 return REG_NOERROR; 1197 prev = node; 1198 node = node->parent; 1199 } 1200 /* Go up while we have a node that is reached from the right. */ 1201 while (node->right == prev || node->right == NULL); 1202 node = node->right; 1203 } 1204 } 1205 1206 static reg_errcode_t 1207 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1208 void *extra) 1209 { 1210 bin_tree_t *node; 1211 1212 for (node = root; ; ) 1213 { 1214 reg_errcode_t err = fn (extra, node); 1215 if (BE (err != REG_NOERROR, 0)) 1216 return err; 1217 1218 /* Go to the left node, or up and to the right. */ 1219 if (node->left) 1220 node = node->left; 1221 else 1222 { 1223 bin_tree_t *prev = NULL; 1224 while (node->right == prev || node->right == NULL) 1225 { 1226 prev = node; 1227 node = node->parent; 1228 if (!node) 1229 return REG_NOERROR; 1230 } 1231 node = node->right; 1232 } 1233 } 1234 } 1235 1236 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1237 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1238 backreferences as well. Requires a preorder visit. */ 1239 static reg_errcode_t 1240 optimize_subexps (void *extra, bin_tree_t *node) 1241 { 1242 re_dfa_t *dfa = (re_dfa_t *) extra; 1243 1244 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1245 { 1246 int idx = node->token.opr.idx; 1247 node->token.opr.idx = dfa->subexp_map[idx]; 1248 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1249 } 1250 1251 else if (node->token.type == SUBEXP 1252 && node->left && node->left->token.type == SUBEXP) 1253 { 1254 int other_idx = node->left->token.opr.idx; 1255 1256 node->left = node->left->left; 1257 if (node->left) 1258 node->left->parent = node; 1259 1260 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1261 if (other_idx < BITSET_WORD_BITS) 1262 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1263 } 1264 1265 return REG_NOERROR; 1266 } 1267 1268 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1269 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1270 static reg_errcode_t 1271 lower_subexps (void *extra, bin_tree_t *node) 1272 { 1273 regex_t *preg = (regex_t *) extra; 1274 reg_errcode_t err = REG_NOERROR; 1275 1276 if (node->left && node->left->token.type == SUBEXP) 1277 { 1278 node->left = lower_subexp (&err, preg, node->left); 1279 if (node->left) 1280 node->left->parent = node; 1281 } 1282 if (node->right && node->right->token.type == SUBEXP) 1283 { 1284 node->right = lower_subexp (&err, preg, node->right); 1285 if (node->right) 1286 node->right->parent = node; 1287 } 1288 1289 return err; 1290 } 1291 1292 static bin_tree_t * 1293 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1294 { 1295 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1296 bin_tree_t *body = node->left; 1297 bin_tree_t *op, *cls, *tree1, *tree; 1298 1299 if (preg->no_sub 1300 /* We do not optimize empty subexpressions, because otherwise we may 1301 have bad CONCAT nodes with NULL children. This is obviously not 1302 very common, so we do not lose much. An example that triggers 1303 this case is the sed "script" /\(\)/x. */ 1304 && node->left != NULL 1305 && (node->token.opr.idx >= BITSET_WORD_BITS 1306 || !(dfa->used_bkref_map 1307 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1308 return node->left; 1309 1310 /* Convert the SUBEXP node to the concatenation of an 1311 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1312 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1313 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1314 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1315 tree = create_tree (dfa, op, tree1, CONCAT); 1316 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1317 { 1318 *err = REG_ESPACE; 1319 return NULL; 1320 } 1321 1322 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1323 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1324 return tree; 1325 } 1326 1327 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1328 nodes. Requires a postorder visit. */ 1329 static reg_errcode_t 1330 calc_first (void *extra, bin_tree_t *node) 1331 { 1332 re_dfa_t *dfa = (re_dfa_t *) extra; 1333 if (node->token.type == CONCAT) 1334 { 1335 node->first = node->left->first; 1336 node->node_idx = node->left->node_idx; 1337 } 1338 else 1339 { 1340 node->first = node; 1341 node->node_idx = re_dfa_add_node (dfa, node->token); 1342 if (BE (node->node_idx == -1, 0)) 1343 return REG_ESPACE; 1344 if (node->token.type == ANCHOR) 1345 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1346 } 1347 return REG_NOERROR; 1348 } 1349 1350 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1351 static reg_errcode_t 1352 calc_next (void *extra, bin_tree_t *node) 1353 { 1354 switch (node->token.type) 1355 { 1356 case OP_DUP_ASTERISK: 1357 node->left->next = node; 1358 break; 1359 case CONCAT: 1360 node->left->next = node->right->first; 1361 node->right->next = node->next; 1362 break; 1363 default: 1364 if (node->left) 1365 node->left->next = node->next; 1366 if (node->right) 1367 node->right->next = node->next; 1368 break; 1369 } 1370 return REG_NOERROR; 1371 } 1372 1373 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1374 static reg_errcode_t 1375 link_nfa_nodes (void *extra, bin_tree_t *node) 1376 { 1377 re_dfa_t *dfa = (re_dfa_t *) extra; 1378 int idx = node->node_idx; 1379 reg_errcode_t err = REG_NOERROR; 1380 1381 switch (node->token.type) 1382 { 1383 case CONCAT: 1384 break; 1385 1386 case END_OF_RE: 1387 assert (node->next == NULL); 1388 break; 1389 1390 case OP_DUP_ASTERISK: 1391 case OP_ALT: 1392 { 1393 int left, right; 1394 dfa->has_plural_match = 1; 1395 if (node->left != NULL) 1396 left = node->left->first->node_idx; 1397 else 1398 left = node->next->node_idx; 1399 if (node->right != NULL) 1400 right = node->right->first->node_idx; 1401 else 1402 right = node->next->node_idx; 1403 assert (left > -1); 1404 assert (right > -1); 1405 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1406 } 1407 break; 1408 1409 case ANCHOR: 1410 case OP_OPEN_SUBEXP: 1411 case OP_CLOSE_SUBEXP: 1412 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1413 break; 1414 1415 case OP_BACK_REF: 1416 dfa->nexts[idx] = node->next->node_idx; 1417 if (node->token.type == OP_BACK_REF) 1418 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1419 break; 1420 1421 default: 1422 assert (!IS_EPSILON_NODE (node->token.type)); 1423 dfa->nexts[idx] = node->next->node_idx; 1424 break; 1425 } 1426 1427 return err; 1428 } 1429 1430 /* Duplicate the epsilon closure of the node ROOT_NODE. 1431 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1432 to their own constraint. */ 1433 1434 static reg_errcode_t 1435 internal_function 1436 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, 1437 int root_node, unsigned int init_constraint) 1438 { 1439 int org_node, clone_node, ret; 1440 unsigned int constraint = init_constraint; 1441 for (org_node = top_org_node, clone_node = top_clone_node;;) 1442 { 1443 int org_dest, clone_dest; 1444 if (dfa->nodes[org_node].type == OP_BACK_REF) 1445 { 1446 /* If the back reference epsilon-transit, its destination must 1447 also have the constraint. Then duplicate the epsilon closure 1448 of the destination of the back reference, and store it in 1449 edests of the back reference. */ 1450 org_dest = dfa->nexts[org_node]; 1451 re_node_set_empty (dfa->edests + clone_node); 1452 clone_dest = duplicate_node (dfa, org_dest, constraint); 1453 if (BE (clone_dest == -1, 0)) 1454 return REG_ESPACE; 1455 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1456 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1457 if (BE (ret < 0, 0)) 1458 return REG_ESPACE; 1459 } 1460 else if (dfa->edests[org_node].nelem == 0) 1461 { 1462 /* In case of the node can't epsilon-transit, don't duplicate the 1463 destination and store the original destination as the 1464 destination of the node. */ 1465 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1466 break; 1467 } 1468 else if (dfa->edests[org_node].nelem == 1) 1469 { 1470 /* In case of the node can epsilon-transit, and it has only one 1471 destination. */ 1472 org_dest = dfa->edests[org_node].elems[0]; 1473 re_node_set_empty (dfa->edests + clone_node); 1474 /* If the node is root_node itself, it means the epsilon clsoure 1475 has a loop. Then tie it to the destination of the root_node. */ 1476 if (org_node == root_node && clone_node != org_node) 1477 { 1478 ret = re_node_set_insert (dfa->edests + clone_node, org_dest); 1479 if (BE (ret < 0, 0)) 1480 return REG_ESPACE; 1481 break; 1482 } 1483 /* In case of the node has another constraint, add it. */ 1484 constraint |= dfa->nodes[org_node].constraint; 1485 clone_dest = duplicate_node (dfa, org_dest, constraint); 1486 if (BE (clone_dest == -1, 0)) 1487 return REG_ESPACE; 1488 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1489 if (BE (ret < 0, 0)) 1490 return REG_ESPACE; 1491 } 1492 else /* dfa->edests[org_node].nelem == 2 */ 1493 { 1494 /* In case of the node can epsilon-transit, and it has two 1495 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1496 org_dest = dfa->edests[org_node].elems[0]; 1497 re_node_set_empty (dfa->edests + clone_node); 1498 /* Search for a duplicated node which satisfies the constraint. */ 1499 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1500 if (clone_dest == -1) 1501 { 1502 /* There is no such duplicated node, create a new one. */ 1503 reg_errcode_t err; 1504 clone_dest = duplicate_node (dfa, org_dest, constraint); 1505 if (BE (clone_dest == -1, 0)) 1506 return REG_ESPACE; 1507 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1508 if (BE (ret < 0, 0)) 1509 return REG_ESPACE; 1510 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1511 root_node, constraint); 1512 if (BE (err != REG_NOERROR, 0)) 1513 return err; 1514 } 1515 else 1516 { 1517 /* There is a duplicated node which satisfies the constraint, 1518 use it to avoid infinite loop. */ 1519 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1520 if (BE (ret < 0, 0)) 1521 return REG_ESPACE; 1522 } 1523 1524 org_dest = dfa->edests[org_node].elems[1]; 1525 clone_dest = duplicate_node (dfa, org_dest, constraint); 1526 if (BE (clone_dest == -1, 0)) 1527 return REG_ESPACE; 1528 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1529 if (BE (ret < 0, 0)) 1530 return REG_ESPACE; 1531 } 1532 org_node = org_dest; 1533 clone_node = clone_dest; 1534 } 1535 return REG_NOERROR; 1536 } 1537 1538 /* Search for a node which is duplicated from the node ORG_NODE, and 1539 satisfies the constraint CONSTRAINT. */ 1540 1541 static int 1542 search_duplicated_node (const re_dfa_t *dfa, int org_node, 1543 unsigned int constraint) 1544 { 1545 int idx; 1546 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1547 { 1548 if (org_node == dfa->org_indices[idx] 1549 && constraint == dfa->nodes[idx].constraint) 1550 return idx; /* Found. */ 1551 } 1552 return -1; /* Not found. */ 1553 } 1554 1555 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1556 Return the index of the new node, or -1 if insufficient storage is 1557 available. */ 1558 1559 static int 1560 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint) 1561 { 1562 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1563 if (BE (dup_idx != -1, 1)) 1564 { 1565 dfa->nodes[dup_idx].constraint = constraint; 1566 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1567 dfa->nodes[dup_idx].duplicated = 1; 1568 1569 /* Store the index of the original node. */ 1570 dfa->org_indices[dup_idx] = org_idx; 1571 } 1572 return dup_idx; 1573 } 1574 1575 static reg_errcode_t 1576 calc_inveclosure (re_dfa_t *dfa) 1577 { 1578 int src, idx, ret; 1579 for (idx = 0; idx < dfa->nodes_len; ++idx) 1580 re_node_set_init_empty (dfa->inveclosures + idx); 1581 1582 for (src = 0; src < dfa->nodes_len; ++src) 1583 { 1584 int *elems = dfa->eclosures[src].elems; 1585 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1586 { 1587 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1588 if (BE (ret == -1, 0)) 1589 return REG_ESPACE; 1590 } 1591 } 1592 1593 return REG_NOERROR; 1594 } 1595 1596 /* Calculate "eclosure" for all the node in DFA. */ 1597 1598 static reg_errcode_t 1599 calc_eclosure (re_dfa_t *dfa) 1600 { 1601 int node_idx, incomplete; 1602 #ifdef DEBUG 1603 assert (dfa->nodes_len > 0); 1604 #endif 1605 incomplete = 0; 1606 /* For each nodes, calculate epsilon closure. */ 1607 for (node_idx = 0; ; ++node_idx) 1608 { 1609 reg_errcode_t err; 1610 re_node_set eclosure_elem; 1611 if (node_idx == dfa->nodes_len) 1612 { 1613 if (!incomplete) 1614 break; 1615 incomplete = 0; 1616 node_idx = 0; 1617 } 1618 1619 #ifdef DEBUG 1620 assert (dfa->eclosures[node_idx].nelem != -1); 1621 #endif 1622 1623 /* If we have already calculated, skip it. */ 1624 if (dfa->eclosures[node_idx].nelem != 0) 1625 continue; 1626 /* Calculate epsilon closure of `node_idx'. */ 1627 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); 1628 if (BE (err != REG_NOERROR, 0)) 1629 return err; 1630 1631 if (dfa->eclosures[node_idx].nelem == 0) 1632 { 1633 incomplete = 1; 1634 re_node_set_free (&eclosure_elem); 1635 } 1636 } 1637 return REG_NOERROR; 1638 } 1639 1640 /* Calculate epsilon closure of NODE. */ 1641 1642 static reg_errcode_t 1643 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root) 1644 { 1645 reg_errcode_t err; 1646 int i, incomplete; 1647 re_node_set eclosure; 1648 incomplete = 0; 1649 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1650 if (BE (err != REG_NOERROR, 0)) 1651 return err; 1652 1653 /* This indicates that we are calculating this node now. 1654 We reference this value to avoid infinite loop. */ 1655 dfa->eclosures[node].nelem = -1; 1656 1657 /* If the current node has constraints, duplicate all nodes 1658 since they must inherit the constraints. */ 1659 if (dfa->nodes[node].constraint 1660 && dfa->edests[node].nelem 1661 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1662 { 1663 err = duplicate_node_closure (dfa, node, node, node, 1664 dfa->nodes[node].constraint); 1665 if (BE (err != REG_NOERROR, 0)) 1666 return err; 1667 } 1668 1669 /* Expand each epsilon destination nodes. */ 1670 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1671 for (i = 0; i < dfa->edests[node].nelem; ++i) 1672 { 1673 re_node_set eclosure_elem; 1674 int edest = dfa->edests[node].elems[i]; 1675 /* If calculating the epsilon closure of `edest' is in progress, 1676 return intermediate result. */ 1677 if (dfa->eclosures[edest].nelem == -1) 1678 { 1679 incomplete = 1; 1680 continue; 1681 } 1682 /* If we haven't calculated the epsilon closure of `edest' yet, 1683 calculate now. Otherwise use calculated epsilon closure. */ 1684 if (dfa->eclosures[edest].nelem == 0) 1685 { 1686 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); 1687 if (BE (err != REG_NOERROR, 0)) 1688 return err; 1689 } 1690 else 1691 eclosure_elem = dfa->eclosures[edest]; 1692 /* Merge the epsilon closure of `edest'. */ 1693 re_node_set_merge (&eclosure, &eclosure_elem); 1694 /* If the epsilon closure of `edest' is incomplete, 1695 the epsilon closure of this node is also incomplete. */ 1696 if (dfa->eclosures[edest].nelem == 0) 1697 { 1698 incomplete = 1; 1699 re_node_set_free (&eclosure_elem); 1700 } 1701 } 1702 1703 /* Epsilon closures include itself. */ 1704 re_node_set_insert (&eclosure, node); 1705 if (incomplete && !root) 1706 dfa->eclosures[node].nelem = 0; 1707 else 1708 dfa->eclosures[node] = eclosure; 1709 *new_set = eclosure; 1710 return REG_NOERROR; 1711 } 1712 1713 /* Functions for token which are used in the parser. */ 1714 1715 /* Fetch a token from INPUT. 1716 We must not use this function inside bracket expressions. */ 1717 1718 static void 1719 internal_function 1720 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1721 { 1722 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1723 } 1724 1725 /* Peek a token from INPUT, and return the length of the token. 1726 We must not use this function inside bracket expressions. */ 1727 1728 static int 1729 internal_function 1730 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1731 { 1732 unsigned char c; 1733 1734 if (re_string_eoi (input)) 1735 { 1736 token->type = END_OF_RE; 1737 return 0; 1738 } 1739 1740 c = re_string_peek_byte (input, 0); 1741 token->opr.c = c; 1742 1743 token->word_char = 0; 1744 #ifdef RE_ENABLE_I18N 1745 token->mb_partial = 0; 1746 if (input->mb_cur_max > 1 && 1747 !re_string_first_byte (input, re_string_cur_idx (input))) 1748 { 1749 token->type = CHARACTER; 1750 token->mb_partial = 1; 1751 return 1; 1752 } 1753 #endif 1754 if (c == '\\') 1755 { 1756 unsigned char c2; 1757 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1758 { 1759 token->type = BACK_SLASH; 1760 return 1; 1761 } 1762 1763 c2 = re_string_peek_byte_case (input, 1); 1764 token->opr.c = c2; 1765 token->type = CHARACTER; 1766 #ifdef RE_ENABLE_I18N 1767 if (input->mb_cur_max > 1) 1768 { 1769 wint_t wc = re_string_wchar_at (input, 1770 re_string_cur_idx (input) + 1); 1771 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1772 } 1773 else 1774 #endif 1775 token->word_char = IS_WORD_CHAR (c2) != 0; 1776 1777 switch (c2) 1778 { 1779 case '|': 1780 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1781 token->type = OP_ALT; 1782 break; 1783 case '1': case '2': case '3': case '4': case '5': 1784 case '6': case '7': case '8': case '9': 1785 if (!(syntax & RE_NO_BK_REFS)) 1786 { 1787 token->type = OP_BACK_REF; 1788 token->opr.idx = c2 - '1'; 1789 } 1790 break; 1791 case '<': 1792 if (!(syntax & RE_NO_GNU_OPS)) 1793 { 1794 token->type = ANCHOR; 1795 token->opr.ctx_type = WORD_FIRST; 1796 } 1797 break; 1798 case '>': 1799 if (!(syntax & RE_NO_GNU_OPS)) 1800 { 1801 token->type = ANCHOR; 1802 token->opr.ctx_type = WORD_LAST; 1803 } 1804 break; 1805 case 'b': 1806 if (!(syntax & RE_NO_GNU_OPS)) 1807 { 1808 token->type = ANCHOR; 1809 token->opr.ctx_type = WORD_DELIM; 1810 } 1811 break; 1812 case 'B': 1813 if (!(syntax & RE_NO_GNU_OPS)) 1814 { 1815 token->type = ANCHOR; 1816 token->opr.ctx_type = NOT_WORD_DELIM; 1817 } 1818 break; 1819 case 'w': 1820 if (!(syntax & RE_NO_GNU_OPS)) 1821 token->type = OP_WORD; 1822 break; 1823 case 'W': 1824 if (!(syntax & RE_NO_GNU_OPS)) 1825 token->type = OP_NOTWORD; 1826 break; 1827 case 's': 1828 if (!(syntax & RE_NO_GNU_OPS)) 1829 token->type = OP_SPACE; 1830 break; 1831 case 'S': 1832 if (!(syntax & RE_NO_GNU_OPS)) 1833 token->type = OP_NOTSPACE; 1834 break; 1835 case '`': 1836 if (!(syntax & RE_NO_GNU_OPS)) 1837 { 1838 token->type = ANCHOR; 1839 token->opr.ctx_type = BUF_FIRST; 1840 } 1841 break; 1842 case '\'': 1843 if (!(syntax & RE_NO_GNU_OPS)) 1844 { 1845 token->type = ANCHOR; 1846 token->opr.ctx_type = BUF_LAST; 1847 } 1848 break; 1849 case '(': 1850 if (!(syntax & RE_NO_BK_PARENS)) 1851 token->type = OP_OPEN_SUBEXP; 1852 break; 1853 case ')': 1854 if (!(syntax & RE_NO_BK_PARENS)) 1855 token->type = OP_CLOSE_SUBEXP; 1856 break; 1857 case '+': 1858 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1859 token->type = OP_DUP_PLUS; 1860 break; 1861 case '?': 1862 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1863 token->type = OP_DUP_QUESTION; 1864 break; 1865 case '{': 1866 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1867 token->type = OP_OPEN_DUP_NUM; 1868 break; 1869 case '}': 1870 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1871 token->type = OP_CLOSE_DUP_NUM; 1872 break; 1873 default: 1874 break; 1875 } 1876 return 2; 1877 } 1878 1879 token->type = CHARACTER; 1880 #ifdef RE_ENABLE_I18N 1881 if (input->mb_cur_max > 1) 1882 { 1883 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1884 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1885 } 1886 else 1887 #endif 1888 token->word_char = IS_WORD_CHAR (token->opr.c); 1889 1890 switch (c) 1891 { 1892 case '\n': 1893 if (syntax & RE_NEWLINE_ALT) 1894 token->type = OP_ALT; 1895 break; 1896 case '|': 1897 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1898 token->type = OP_ALT; 1899 break; 1900 case '*': 1901 token->type = OP_DUP_ASTERISK; 1902 break; 1903 case '+': 1904 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1905 token->type = OP_DUP_PLUS; 1906 break; 1907 case '?': 1908 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1909 token->type = OP_DUP_QUESTION; 1910 break; 1911 case '{': 1912 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1913 token->type = OP_OPEN_DUP_NUM; 1914 break; 1915 case '}': 1916 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1917 token->type = OP_CLOSE_DUP_NUM; 1918 break; 1919 case '(': 1920 if (syntax & RE_NO_BK_PARENS) 1921 token->type = OP_OPEN_SUBEXP; 1922 break; 1923 case ')': 1924 if (syntax & RE_NO_BK_PARENS) 1925 token->type = OP_CLOSE_SUBEXP; 1926 break; 1927 case '[': 1928 token->type = OP_OPEN_BRACKET; 1929 break; 1930 case '.': 1931 token->type = OP_PERIOD; 1932 break; 1933 case '^': 1934 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1935 re_string_cur_idx (input) != 0) 1936 { 1937 char prev = re_string_peek_byte (input, -1); 1938 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1939 break; 1940 } 1941 token->type = ANCHOR; 1942 token->opr.ctx_type = LINE_FIRST; 1943 break; 1944 case '$': 1945 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1946 re_string_cur_idx (input) + 1 != re_string_length (input)) 1947 { 1948 re_token_t next; 1949 re_string_skip_bytes (input, 1); 1950 peek_token (&next, input, syntax); 1951 re_string_skip_bytes (input, -1); 1952 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1953 break; 1954 } 1955 token->type = ANCHOR; 1956 token->opr.ctx_type = LINE_LAST; 1957 break; 1958 default: 1959 break; 1960 } 1961 return 1; 1962 } 1963 1964 /* Peek a token from INPUT, and return the length of the token. 1965 We must not use this function out of bracket expressions. */ 1966 1967 static int 1968 internal_function 1969 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1970 { 1971 unsigned char c; 1972 if (re_string_eoi (input)) 1973 { 1974 token->type = END_OF_RE; 1975 return 0; 1976 } 1977 c = re_string_peek_byte (input, 0); 1978 token->opr.c = c; 1979 1980 #ifdef RE_ENABLE_I18N 1981 if (input->mb_cur_max > 1 && 1982 !re_string_first_byte (input, re_string_cur_idx (input))) 1983 { 1984 token->type = CHARACTER; 1985 return 1; 1986 } 1987 #endif /* RE_ENABLE_I18N */ 1988 1989 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 1990 && re_string_cur_idx (input) + 1 < re_string_length (input)) 1991 { 1992 /* In this case, '\' escape a character. */ 1993 unsigned char c2; 1994 re_string_skip_bytes (input, 1); 1995 c2 = re_string_peek_byte (input, 0); 1996 token->opr.c = c2; 1997 token->type = CHARACTER; 1998 return 1; 1999 } 2000 if (c == '[') /* '[' is a special char in a bracket exps. */ 2001 { 2002 unsigned char c2; 2003 int token_len; 2004 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2005 c2 = re_string_peek_byte (input, 1); 2006 else 2007 c2 = 0; 2008 token->opr.c = c2; 2009 token_len = 2; 2010 switch (c2) 2011 { 2012 case '.': 2013 token->type = OP_OPEN_COLL_ELEM; 2014 break; 2015 case '=': 2016 token->type = OP_OPEN_EQUIV_CLASS; 2017 break; 2018 case ':': 2019 if (syntax & RE_CHAR_CLASSES) 2020 { 2021 token->type = OP_OPEN_CHAR_CLASS; 2022 break; 2023 } 2024 /* else fall through. */ 2025 default: 2026 token->type = CHARACTER; 2027 token->opr.c = c; 2028 token_len = 1; 2029 break; 2030 } 2031 return token_len; 2032 } 2033 switch (c) 2034 { 2035 case '-': 2036 token->type = OP_CHARSET_RANGE; 2037 break; 2038 case ']': 2039 token->type = OP_CLOSE_BRACKET; 2040 break; 2041 case '^': 2042 token->type = OP_NON_MATCH_LIST; 2043 break; 2044 default: 2045 token->type = CHARACTER; 2046 } 2047 return 1; 2048 } 2049 2050 /* Functions for parser. */ 2051 2052 /* Entry point of the parser. 2053 Parse the regular expression REGEXP and return the structure tree. 2054 If an error is occured, ERR is set by error code, and return NULL. 2055 This function build the following tree, from regular expression <reg_exp>: 2056 CAT 2057 / \ 2058 / \ 2059 <reg_exp> EOR 2060 2061 CAT means concatenation. 2062 EOR means end of regular expression. */ 2063 2064 static bin_tree_t * 2065 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2066 reg_errcode_t *err) 2067 { 2068 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2069 bin_tree_t *tree, *eor, *root; 2070 re_token_t current_token; 2071 dfa->syntax = syntax; 2072 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2073 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2074 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2075 return NULL; 2076 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2077 if (tree != NULL) 2078 root = create_tree (dfa, tree, eor, CONCAT); 2079 else 2080 root = eor; 2081 if (BE (eor == NULL || root == NULL, 0)) 2082 { 2083 *err = REG_ESPACE; 2084 return NULL; 2085 } 2086 return root; 2087 } 2088 2089 /* This function build the following tree, from regular expression 2090 <branch1>|<branch2>: 2091 ALT 2092 / \ 2093 / \ 2094 <branch1> <branch2> 2095 2096 ALT means alternative, which represents the operator `|'. */ 2097 2098 static bin_tree_t * 2099 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2100 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2101 { 2102 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2103 bin_tree_t *tree, *branch = NULL; 2104 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2105 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2106 return NULL; 2107 2108 while (token->type == OP_ALT) 2109 { 2110 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2111 if (token->type != OP_ALT && token->type != END_OF_RE 2112 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2113 { 2114 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2115 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2116 return NULL; 2117 } 2118 else 2119 branch = NULL; 2120 tree = create_tree (dfa, tree, branch, OP_ALT); 2121 if (BE (tree == NULL, 0)) 2122 { 2123 *err = REG_ESPACE; 2124 return NULL; 2125 } 2126 } 2127 return tree; 2128 } 2129 2130 /* This function build the following tree, from regular expression 2131 <exp1><exp2>: 2132 CAT 2133 / \ 2134 / \ 2135 <exp1> <exp2> 2136 2137 CAT means concatenation. */ 2138 2139 static bin_tree_t * 2140 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2141 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2142 { 2143 bin_tree_t *tree, *exp; 2144 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2145 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2146 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2147 return NULL; 2148 2149 while (token->type != OP_ALT && token->type != END_OF_RE 2150 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2151 { 2152 exp = parse_expression (regexp, preg, token, syntax, nest, err); 2153 if (BE (*err != REG_NOERROR && exp == NULL, 0)) 2154 { 2155 return NULL; 2156 } 2157 if (tree != NULL && exp != NULL) 2158 { 2159 tree = create_tree (dfa, tree, exp, CONCAT); 2160 if (tree == NULL) 2161 { 2162 *err = REG_ESPACE; 2163 return NULL; 2164 } 2165 } 2166 else if (tree == NULL) 2167 tree = exp; 2168 /* Otherwise exp == NULL, we don't need to create new tree. */ 2169 } 2170 return tree; 2171 } 2172 2173 /* This function build the following tree, from regular expression a*: 2174 * 2175 | 2176 a 2177 */ 2178 2179 static bin_tree_t * 2180 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2181 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2182 { 2183 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2184 bin_tree_t *tree; 2185 switch (token->type) 2186 { 2187 case CHARACTER: 2188 tree = create_token_tree (dfa, NULL, NULL, token); 2189 if (BE (tree == NULL, 0)) 2190 { 2191 *err = REG_ESPACE; 2192 return NULL; 2193 } 2194 #ifdef RE_ENABLE_I18N 2195 if (dfa->mb_cur_max > 1) 2196 { 2197 while (!re_string_eoi (regexp) 2198 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2199 { 2200 bin_tree_t *mbc_remain; 2201 fetch_token (token, regexp, syntax); 2202 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2203 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2204 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2205 { 2206 *err = REG_ESPACE; 2207 return NULL; 2208 } 2209 } 2210 } 2211 #endif 2212 break; 2213 case OP_OPEN_SUBEXP: 2214 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2215 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2216 return NULL; 2217 break; 2218 case OP_OPEN_BRACKET: 2219 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2220 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2221 return NULL; 2222 break; 2223 case OP_BACK_REF: 2224 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2225 { 2226 *err = REG_ESUBREG; 2227 return NULL; 2228 } 2229 dfa->used_bkref_map |= 1 << token->opr.idx; 2230 tree = create_token_tree (dfa, NULL, NULL, token); 2231 if (BE (tree == NULL, 0)) 2232 { 2233 *err = REG_ESPACE; 2234 return NULL; 2235 } 2236 ++dfa->nbackref; 2237 dfa->has_mb_node = 1; 2238 break; 2239 case OP_OPEN_DUP_NUM: 2240 if (syntax & RE_CONTEXT_INVALID_DUP) 2241 { 2242 *err = REG_BADRPT; 2243 return NULL; 2244 } 2245 /* FALLTHROUGH */ 2246 case OP_DUP_ASTERISK: 2247 case OP_DUP_PLUS: 2248 case OP_DUP_QUESTION: 2249 if (syntax & RE_CONTEXT_INVALID_OPS) 2250 { 2251 *err = REG_BADRPT; 2252 return NULL; 2253 } 2254 else if (syntax & RE_CONTEXT_INDEP_OPS) 2255 { 2256 fetch_token (token, regexp, syntax); 2257 return parse_expression (regexp, preg, token, syntax, nest, err); 2258 } 2259 /* else fall through */ 2260 case OP_CLOSE_SUBEXP: 2261 if ((token->type == OP_CLOSE_SUBEXP) && 2262 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2263 { 2264 *err = REG_ERPAREN; 2265 return NULL; 2266 } 2267 /* else fall through */ 2268 case OP_CLOSE_DUP_NUM: 2269 /* We treat it as a normal character. */ 2270 2271 /* Then we can these characters as normal characters. */ 2272 token->type = CHARACTER; 2273 /* mb_partial and word_char bits should be initialized already 2274 by peek_token. */ 2275 tree = create_token_tree (dfa, NULL, NULL, token); 2276 if (BE (tree == NULL, 0)) 2277 { 2278 *err = REG_ESPACE; 2279 return NULL; 2280 } 2281 break; 2282 case ANCHOR: 2283 if ((token->opr.ctx_type 2284 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2285 && dfa->word_ops_used == 0) 2286 init_word_char (dfa); 2287 if (token->opr.ctx_type == WORD_DELIM 2288 || token->opr.ctx_type == NOT_WORD_DELIM) 2289 { 2290 bin_tree_t *tree_first, *tree_last; 2291 if (token->opr.ctx_type == WORD_DELIM) 2292 { 2293 token->opr.ctx_type = WORD_FIRST; 2294 tree_first = create_token_tree (dfa, NULL, NULL, token); 2295 token->opr.ctx_type = WORD_LAST; 2296 } 2297 else 2298 { 2299 token->opr.ctx_type = INSIDE_WORD; 2300 tree_first = create_token_tree (dfa, NULL, NULL, token); 2301 token->opr.ctx_type = INSIDE_NOTWORD; 2302 } 2303 tree_last = create_token_tree (dfa, NULL, NULL, token); 2304 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2305 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2306 { 2307 *err = REG_ESPACE; 2308 return NULL; 2309 } 2310 } 2311 else 2312 { 2313 tree = create_token_tree (dfa, NULL, NULL, token); 2314 if (BE (tree == NULL, 0)) 2315 { 2316 *err = REG_ESPACE; 2317 return NULL; 2318 } 2319 } 2320 /* We must return here, since ANCHORs can't be followed 2321 by repetition operators. 2322 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2323 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2324 fetch_token (token, regexp, syntax); 2325 return tree; 2326 case OP_PERIOD: 2327 tree = create_token_tree (dfa, NULL, NULL, token); 2328 if (BE (tree == NULL, 0)) 2329 { 2330 *err = REG_ESPACE; 2331 return NULL; 2332 } 2333 if (dfa->mb_cur_max > 1) 2334 dfa->has_mb_node = 1; 2335 break; 2336 case OP_WORD: 2337 case OP_NOTWORD: 2338 tree = build_charclass_op (dfa, regexp->trans, 2339 (const unsigned char *) "alnum", 2340 (const unsigned char *) "_", 2341 token->type == OP_NOTWORD, err); 2342 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2343 return NULL; 2344 break; 2345 case OP_SPACE: 2346 case OP_NOTSPACE: 2347 tree = build_charclass_op (dfa, regexp->trans, 2348 (const unsigned char *) "space", 2349 (const unsigned char *) "", 2350 token->type == OP_NOTSPACE, err); 2351 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2352 return NULL; 2353 break; 2354 case OP_ALT: 2355 case END_OF_RE: 2356 return NULL; 2357 case BACK_SLASH: 2358 *err = REG_EESCAPE; 2359 return NULL; 2360 default: 2361 /* Must not happen? */ 2362 #ifdef DEBUG 2363 assert (0); 2364 #endif 2365 return NULL; 2366 } 2367 fetch_token (token, regexp, syntax); 2368 2369 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2370 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2371 { 2372 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2373 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2374 return NULL; 2375 /* In BRE consecutive duplications are not allowed. */ 2376 if ((syntax & RE_CONTEXT_INVALID_DUP) 2377 && (token->type == OP_DUP_ASTERISK 2378 || token->type == OP_OPEN_DUP_NUM)) 2379 { 2380 *err = REG_BADRPT; 2381 return NULL; 2382 } 2383 } 2384 2385 return tree; 2386 } 2387 2388 /* This function build the following tree, from regular expression 2389 (<reg_exp>): 2390 SUBEXP 2391 | 2392 <reg_exp> 2393 */ 2394 2395 static bin_tree_t * 2396 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2397 reg_syntax_t syntax, int nest, reg_errcode_t *err) 2398 { 2399 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2400 bin_tree_t *tree; 2401 size_t cur_nsub; 2402 cur_nsub = preg->re_nsub++; 2403 2404 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2405 2406 /* The subexpression may be a null string. */ 2407 if (token->type == OP_CLOSE_SUBEXP) 2408 tree = NULL; 2409 else 2410 { 2411 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2412 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2413 *err = REG_EPAREN; 2414 if (BE (*err != REG_NOERROR, 0)) 2415 return NULL; 2416 } 2417 2418 if (cur_nsub <= '9' - '1') 2419 dfa->completed_bkref_map |= 1 << cur_nsub; 2420 2421 tree = create_tree (dfa, tree, NULL, SUBEXP); 2422 if (BE (tree == NULL, 0)) 2423 { 2424 *err = REG_ESPACE; 2425 return NULL; 2426 } 2427 tree->token.opr.idx = cur_nsub; 2428 return tree; 2429 } 2430 2431 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2432 2433 static bin_tree_t * 2434 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2435 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2436 { 2437 bin_tree_t *tree = NULL, *old_tree = NULL; 2438 int i, start, end, start_idx = re_string_cur_idx (regexp); 2439 re_token_t start_token = *token; 2440 2441 if (token->type == OP_OPEN_DUP_NUM) 2442 { 2443 end = 0; 2444 start = fetch_number (regexp, token, syntax); 2445 if (start == -1) 2446 { 2447 if (token->type == CHARACTER && token->opr.c == ',') 2448 start = 0; /* We treat "{,m}" as "{0,m}". */ 2449 else 2450 { 2451 *err = REG_BADBR; /* <re>{} is invalid. */ 2452 return NULL; 2453 } 2454 } 2455 if (BE (start != -2, 1)) 2456 { 2457 /* We treat "{n}" as "{n,n}". */ 2458 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2459 : ((token->type == CHARACTER && token->opr.c == ',') 2460 ? fetch_number (regexp, token, syntax) : -2)); 2461 } 2462 if (BE (start == -2 || end == -2, 0)) 2463 { 2464 /* Invalid sequence. */ 2465 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2466 { 2467 if (token->type == END_OF_RE) 2468 *err = REG_EBRACE; 2469 else 2470 *err = REG_BADBR; 2471 2472 return NULL; 2473 } 2474 2475 /* If the syntax bit is set, rollback. */ 2476 re_string_set_index (regexp, start_idx); 2477 *token = start_token; 2478 token->type = CHARACTER; 2479 /* mb_partial and word_char bits should be already initialized by 2480 peek_token. */ 2481 return elem; 2482 } 2483 2484 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0)) 2485 { 2486 /* First number greater than second. */ 2487 *err = REG_BADBR; 2488 return NULL; 2489 } 2490 } 2491 else 2492 { 2493 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2494 end = (token->type == OP_DUP_QUESTION) ? 1 : -1; 2495 } 2496 2497 fetch_token (token, regexp, syntax); 2498 2499 if (BE (elem == NULL, 0)) 2500 return NULL; 2501 if (BE (start == 0 && end == 0, 0)) 2502 { 2503 postorder (elem, free_tree, NULL); 2504 return NULL; 2505 } 2506 2507 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2508 if (BE (start > 0, 0)) 2509 { 2510 tree = elem; 2511 for (i = 2; i <= start; ++i) 2512 { 2513 elem = duplicate_tree (elem, dfa); 2514 tree = create_tree (dfa, tree, elem, CONCAT); 2515 if (BE (elem == NULL || tree == NULL, 0)) 2516 goto parse_dup_op_espace; 2517 } 2518 2519 if (start == end) 2520 return tree; 2521 2522 /* Duplicate ELEM before it is marked optional. */ 2523 elem = duplicate_tree (elem, dfa); 2524 old_tree = tree; 2525 } 2526 else 2527 old_tree = NULL; 2528 2529 if (elem->token.type == SUBEXP) 2530 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2531 2532 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT)); 2533 if (BE (tree == NULL, 0)) 2534 goto parse_dup_op_espace; 2535 2536 /* This loop is actually executed only when end != -1, 2537 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2538 already created the start+1-th copy. */ 2539 for (i = start + 2; i <= end; ++i) 2540 { 2541 elem = duplicate_tree (elem, dfa); 2542 tree = create_tree (dfa, tree, elem, CONCAT); 2543 if (BE (elem == NULL || tree == NULL, 0)) 2544 goto parse_dup_op_espace; 2545 2546 tree = create_tree (dfa, tree, NULL, OP_ALT); 2547 if (BE (tree == NULL, 0)) 2548 goto parse_dup_op_espace; 2549 } 2550 2551 if (old_tree) 2552 tree = create_tree (dfa, old_tree, tree, CONCAT); 2553 2554 return tree; 2555 2556 parse_dup_op_espace: 2557 *err = REG_ESPACE; 2558 return NULL; 2559 } 2560 2561 /* Size of the names for collating symbol/equivalence_class/character_class. 2562 I'm not sure, but maybe enough. */ 2563 #define BRACKET_NAME_BUF_SIZE 32 2564 2565 #ifndef _LIBC 2566 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2567 Build the range expression which starts from START_ELEM, and ends 2568 at END_ELEM. The result are written to MBCSET and SBCSET. 2569 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2570 mbcset->range_ends, is a pointer argument sinse we may 2571 update it. */ 2572 2573 static reg_errcode_t 2574 internal_function 2575 # ifdef RE_ENABLE_I18N 2576 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2577 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2578 # else /* not RE_ENABLE_I18N */ 2579 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, 2580 bracket_elem_t *end_elem) 2581 # endif /* not RE_ENABLE_I18N */ 2582 { 2583 unsigned int start_ch, end_ch; 2584 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2585 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2586 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2587 0)) 2588 return REG_ERANGE; 2589 2590 /* We can handle no multi character collating elements without libc 2591 support. */ 2592 if (BE ((start_elem->type == COLL_SYM 2593 && strlen ((char *) start_elem->opr.name) > 1) 2594 || (end_elem->type == COLL_SYM 2595 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2596 return REG_ECOLLATE; 2597 2598 # ifdef RE_ENABLE_I18N 2599 { 2600 wchar_t wc; 2601 wint_t start_wc; 2602 wint_t end_wc; 2603 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2604 2605 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2606 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2607 : 0)); 2608 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2609 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2610 : 0)); 2611 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2612 ? __btowc (start_ch) : start_elem->opr.wch); 2613 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2614 ? __btowc (end_ch) : end_elem->opr.wch); 2615 if (start_wc == WEOF || end_wc == WEOF) 2616 return REG_ECOLLATE; 2617 cmp_buf[0] = start_wc; 2618 cmp_buf[4] = end_wc; 2619 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2620 return REG_ERANGE; 2621 2622 /* Got valid collation sequence values, add them as a new entry. 2623 However, for !_LIBC we have no collation elements: if the 2624 character set is single byte, the single byte character set 2625 that we build below suffices. parse_bracket_exp passes 2626 no MBCSET if dfa->mb_cur_max == 1. */ 2627 if (mbcset) 2628 { 2629 /* Check the space of the arrays. */ 2630 if (BE (*range_alloc == mbcset->nranges, 0)) 2631 { 2632 /* There is not enough space, need realloc. */ 2633 wchar_t *new_array_start, *new_array_end; 2634 int new_nranges; 2635 2636 /* +1 in case of mbcset->nranges is 0. */ 2637 new_nranges = 2 * mbcset->nranges + 1; 2638 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2639 are NULL if *range_alloc == 0. */ 2640 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2641 new_nranges); 2642 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2643 new_nranges); 2644 2645 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2646 return REG_ESPACE; 2647 2648 mbcset->range_starts = new_array_start; 2649 mbcset->range_ends = new_array_end; 2650 *range_alloc = new_nranges; 2651 } 2652 2653 mbcset->range_starts[mbcset->nranges] = start_wc; 2654 mbcset->range_ends[mbcset->nranges++] = end_wc; 2655 } 2656 2657 /* Build the table for single byte characters. */ 2658 for (wc = 0; wc < SBC_MAX; ++wc) 2659 { 2660 cmp_buf[2] = wc; 2661 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2662 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2663 bitset_set (sbcset, wc); 2664 } 2665 } 2666 # else /* not RE_ENABLE_I18N */ 2667 { 2668 unsigned int ch; 2669 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2670 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2671 : 0)); 2672 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2673 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2674 : 0)); 2675 if (start_ch > end_ch) 2676 return REG_ERANGE; 2677 /* Build the table for single byte characters. */ 2678 for (ch = 0; ch < SBC_MAX; ++ch) 2679 if (start_ch <= ch && ch <= end_ch) 2680 bitset_set (sbcset, ch); 2681 } 2682 # endif /* not RE_ENABLE_I18N */ 2683 return REG_NOERROR; 2684 } 2685 #endif /* not _LIBC */ 2686 2687 #ifndef _LIBC 2688 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2689 Build the collating element which is represented by NAME. 2690 The result are written to MBCSET and SBCSET. 2691 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2692 pointer argument since we may update it. */ 2693 2694 static reg_errcode_t 2695 internal_function 2696 # ifdef RE_ENABLE_I18N 2697 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2698 int *coll_sym_alloc, const unsigned char *name) 2699 # else /* not RE_ENABLE_I18N */ 2700 build_collating_symbol (bitset_t sbcset, const unsigned char *name) 2701 # endif /* not RE_ENABLE_I18N */ 2702 { 2703 size_t name_len = strlen ((const char *) name); 2704 if (BE (name_len != 1, 0)) 2705 return REG_ECOLLATE; 2706 else 2707 { 2708 bitset_set (sbcset, name[0]); 2709 return REG_NOERROR; 2710 } 2711 } 2712 #endif /* not _LIBC */ 2713 2714 /* This function parse bracket expression like "[abc]", "[a-c]", 2715 "[[.a-a.]]" etc. */ 2716 2717 static bin_tree_t * 2718 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2719 reg_syntax_t syntax, reg_errcode_t *err) 2720 { 2721 #ifdef _LIBC 2722 const unsigned char *collseqmb; 2723 const char *collseqwc; 2724 uint32_t nrules; 2725 int32_t table_size; 2726 const int32_t *symb_table; 2727 const unsigned char *extra; 2728 2729 /* Local function for parse_bracket_exp used in _LIBC environement. 2730 Seek the collating symbol entry correspondings to NAME. 2731 Return the index of the symbol in the SYMB_TABLE. */ 2732 2733 auto inline int32_t 2734 __attribute ((always_inline)) 2735 seek_collating_symbol_entry (name, name_len) 2736 const unsigned char *name; 2737 size_t name_len; 2738 { 2739 int32_t hash = elem_hash ((const char *) name, name_len); 2740 int32_t elem = hash % table_size; 2741 if (symb_table[2 * elem] != 0) 2742 { 2743 int32_t second = hash % (table_size - 2) + 1; 2744 2745 do 2746 { 2747 /* First compare the hashing value. */ 2748 if (symb_table[2 * elem] == hash 2749 /* Compare the length of the name. */ 2750 && name_len == extra[symb_table[2 * elem + 1]] 2751 /* Compare the name. */ 2752 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2753 name_len) == 0) 2754 { 2755 /* Yep, this is the entry. */ 2756 break; 2757 } 2758 2759 /* Next entry. */ 2760 elem += second; 2761 } 2762 while (symb_table[2 * elem] != 0); 2763 } 2764 return elem; 2765 } 2766 2767 /* Local function for parse_bracket_exp used in _LIBC environment. 2768 Look up the collation sequence value of BR_ELEM. 2769 Return the value if succeeded, UINT_MAX otherwise. */ 2770 2771 auto inline unsigned int 2772 __attribute ((always_inline)) 2773 lookup_collation_sequence_value (br_elem) 2774 bracket_elem_t *br_elem; 2775 { 2776 if (br_elem->type == SB_CHAR) 2777 { 2778 /* 2779 if (MB_CUR_MAX == 1) 2780 */ 2781 if (nrules == 0) 2782 return collseqmb[br_elem->opr.ch]; 2783 else 2784 { 2785 wint_t wc = __btowc (br_elem->opr.ch); 2786 return __collseq_table_lookup (collseqwc, wc); 2787 } 2788 } 2789 else if (br_elem->type == MB_CHAR) 2790 { 2791 if (nrules != 0) 2792 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2793 } 2794 else if (br_elem->type == COLL_SYM) 2795 { 2796 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2797 if (nrules != 0) 2798 { 2799 int32_t elem, idx; 2800 elem = seek_collating_symbol_entry (br_elem->opr.name, 2801 sym_name_len); 2802 if (symb_table[2 * elem] != 0) 2803 { 2804 /* We found the entry. */ 2805 idx = symb_table[2 * elem + 1]; 2806 /* Skip the name of collating element name. */ 2807 idx += 1 + extra[idx]; 2808 /* Skip the byte sequence of the collating element. */ 2809 idx += 1 + extra[idx]; 2810 /* Adjust for the alignment. */ 2811 idx = (idx + 3) & ~3; 2812 /* Skip the multibyte collation sequence value. */ 2813 idx += sizeof (unsigned int); 2814 /* Skip the wide char sequence of the collating element. */ 2815 idx += sizeof (unsigned int) * 2816 (1 + *(unsigned int *) (extra + idx)); 2817 /* Return the collation sequence value. */ 2818 return *(unsigned int *) (extra + idx); 2819 } 2820 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2821 { 2822 /* No valid character. Match it as a single byte 2823 character. */ 2824 return collseqmb[br_elem->opr.name[0]]; 2825 } 2826 } 2827 else if (sym_name_len == 1) 2828 return collseqmb[br_elem->opr.name[0]]; 2829 } 2830 return UINT_MAX; 2831 } 2832 2833 /* Local function for parse_bracket_exp used in _LIBC environement. 2834 Build the range expression which starts from START_ELEM, and ends 2835 at END_ELEM. The result are written to MBCSET and SBCSET. 2836 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2837 mbcset->range_ends, is a pointer argument sinse we may 2838 update it. */ 2839 2840 auto inline reg_errcode_t 2841 __attribute ((always_inline)) 2842 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2843 re_charset_t *mbcset; 2844 int *range_alloc; 2845 bitset_t sbcset; 2846 bracket_elem_t *start_elem, *end_elem; 2847 { 2848 unsigned int ch; 2849 uint32_t start_collseq; 2850 uint32_t end_collseq; 2851 2852 /* Equivalence Classes and Character Classes can't be a range 2853 start/end. */ 2854 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2855 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2856 0)) 2857 return REG_ERANGE; 2858 2859 start_collseq = lookup_collation_sequence_value (start_elem); 2860 end_collseq = lookup_collation_sequence_value (end_elem); 2861 /* Check start/end collation sequence values. */ 2862 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2863 return REG_ECOLLATE; 2864 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2865 return REG_ERANGE; 2866 2867 /* Got valid collation sequence values, add them as a new entry. 2868 However, if we have no collation elements, and the character set 2869 is single byte, the single byte character set that we 2870 build below suffices. */ 2871 if (nrules > 0 || dfa->mb_cur_max > 1) 2872 { 2873 /* Check the space of the arrays. */ 2874 if (BE (*range_alloc == mbcset->nranges, 0)) 2875 { 2876 /* There is not enough space, need realloc. */ 2877 uint32_t *new_array_start; 2878 uint32_t *new_array_end; 2879 int new_nranges; 2880 2881 /* +1 in case of mbcset->nranges is 0. */ 2882 new_nranges = 2 * mbcset->nranges + 1; 2883 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2884 new_nranges); 2885 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2886 new_nranges); 2887 2888 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2889 return REG_ESPACE; 2890 2891 mbcset->range_starts = new_array_start; 2892 mbcset->range_ends = new_array_end; 2893 *range_alloc = new_nranges; 2894 } 2895 2896 mbcset->range_starts[mbcset->nranges] = start_collseq; 2897 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2898 } 2899 2900 /* Build the table for single byte characters. */ 2901 for (ch = 0; ch < SBC_MAX; ch++) 2902 { 2903 uint32_t ch_collseq; 2904 /* 2905 if (MB_CUR_MAX == 1) 2906 */ 2907 if (nrules == 0) 2908 ch_collseq = collseqmb[ch]; 2909 else 2910 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2911 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2912 bitset_set (sbcset, ch); 2913 } 2914 return REG_NOERROR; 2915 } 2916 2917 /* Local function for parse_bracket_exp used in _LIBC environement. 2918 Build the collating element which is represented by NAME. 2919 The result are written to MBCSET and SBCSET. 2920 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2921 pointer argument sinse we may update it. */ 2922 2923 auto inline reg_errcode_t 2924 __attribute ((always_inline)) 2925 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2926 re_charset_t *mbcset; 2927 int *coll_sym_alloc; 2928 bitset_t sbcset; 2929 const unsigned char *name; 2930 { 2931 int32_t elem, idx; 2932 size_t name_len = strlen ((const char *) name); 2933 if (nrules != 0) 2934 { 2935 elem = seek_collating_symbol_entry (name, name_len); 2936 if (symb_table[2 * elem] != 0) 2937 { 2938 /* We found the entry. */ 2939 idx = symb_table[2 * elem + 1]; 2940 /* Skip the name of collating element name. */ 2941 idx += 1 + extra[idx]; 2942 } 2943 else if (symb_table[2 * elem] == 0 && name_len == 1) 2944 { 2945 /* No valid character, treat it as a normal 2946 character. */ 2947 bitset_set (sbcset, name[0]); 2948 return REG_NOERROR; 2949 } 2950 else 2951 return REG_ECOLLATE; 2952 2953 /* Got valid collation sequence, add it as a new entry. */ 2954 /* Check the space of the arrays. */ 2955 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 2956 { 2957 /* Not enough, realloc it. */ 2958 /* +1 in case of mbcset->ncoll_syms is 0. */ 2959 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 2960 /* Use realloc since mbcset->coll_syms is NULL 2961 if *alloc == 0. */ 2962 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 2963 new_coll_sym_alloc); 2964 if (BE (new_coll_syms == NULL, 0)) 2965 return REG_ESPACE; 2966 mbcset->coll_syms = new_coll_syms; 2967 *coll_sym_alloc = new_coll_sym_alloc; 2968 } 2969 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 2970 return REG_NOERROR; 2971 } 2972 else 2973 { 2974 if (BE (name_len != 1, 0)) 2975 return REG_ECOLLATE; 2976 else 2977 { 2978 bitset_set (sbcset, name[0]); 2979 return REG_NOERROR; 2980 } 2981 } 2982 } 2983 #endif 2984 2985 re_token_t br_token; 2986 re_bitset_ptr_t sbcset; 2987 #ifdef RE_ENABLE_I18N 2988 re_charset_t *mbcset; 2989 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 2990 int equiv_class_alloc = 0, char_class_alloc = 0; 2991 #endif /* not RE_ENABLE_I18N */ 2992 int non_match = 0; 2993 bin_tree_t *work_tree; 2994 int token_len; 2995 int first_round = 1; 2996 #ifdef _LIBC 2997 collseqmb = (const unsigned char *) 2998 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 2999 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3000 if (nrules) 3001 { 3002 /* 3003 if (MB_CUR_MAX > 1) 3004 */ 3005 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3006 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3007 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3008 _NL_COLLATE_SYMB_TABLEMB); 3009 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3010 _NL_COLLATE_SYMB_EXTRAMB); 3011 } 3012 #endif 3013 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3014 #ifdef RE_ENABLE_I18N 3015 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3016 #endif /* RE_ENABLE_I18N */ 3017 #ifdef RE_ENABLE_I18N 3018 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3019 #else 3020 if (BE (sbcset == NULL, 0)) 3021 #endif /* RE_ENABLE_I18N */ 3022 { 3023 *err = REG_ESPACE; 3024 return NULL; 3025 } 3026 3027 token_len = peek_token_bracket (token, regexp, syntax); 3028 if (BE (token->type == END_OF_RE, 0)) 3029 { 3030 *err = REG_BADPAT; 3031 goto parse_bracket_exp_free_return; 3032 } 3033 if (token->type == OP_NON_MATCH_LIST) 3034 { 3035 #ifdef RE_ENABLE_I18N 3036 mbcset->non_match = 1; 3037 #endif /* not RE_ENABLE_I18N */ 3038 non_match = 1; 3039 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3040 bitset_set (sbcset, '\n'); 3041 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3042 token_len = peek_token_bracket (token, regexp, syntax); 3043 if (BE (token->type == END_OF_RE, 0)) 3044 { 3045 *err = REG_BADPAT; 3046 goto parse_bracket_exp_free_return; 3047 } 3048 } 3049 3050 /* We treat the first ']' as a normal character. */ 3051 if (token->type == OP_CLOSE_BRACKET) 3052 token->type = CHARACTER; 3053 3054 while (1) 3055 { 3056 bracket_elem_t start_elem, end_elem; 3057 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3058 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3059 reg_errcode_t ret; 3060 int token_len2 = 0, is_range_exp = 0; 3061 re_token_t token2; 3062 3063 start_elem.opr.name = start_name_buf; 3064 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3065 syntax, first_round); 3066 if (BE (ret != REG_NOERROR, 0)) 3067 { 3068 *err = ret; 3069 goto parse_bracket_exp_free_return; 3070 } 3071 first_round = 0; 3072 3073 /* Get information about the next token. We need it in any case. */ 3074 token_len = peek_token_bracket (token, regexp, syntax); 3075 3076 /* Do not check for ranges if we know they are not allowed. */ 3077 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3078 { 3079 if (BE (token->type == END_OF_RE, 0)) 3080 { 3081 *err = REG_EBRACK; 3082 goto parse_bracket_exp_free_return; 3083 } 3084 if (token->type == OP_CHARSET_RANGE) 3085 { 3086 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3087 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3088 if (BE (token2.type == END_OF_RE, 0)) 3089 { 3090 *err = REG_EBRACK; 3091 goto parse_bracket_exp_free_return; 3092 } 3093 if (token2.type == OP_CLOSE_BRACKET) 3094 { 3095 /* We treat the last '-' as a normal character. */ 3096 re_string_skip_bytes (regexp, -token_len); 3097 token->type = CHARACTER; 3098 } 3099 else 3100 is_range_exp = 1; 3101 } 3102 } 3103 3104 if (is_range_exp == 1) 3105 { 3106 end_elem.opr.name = end_name_buf; 3107 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3108 dfa, syntax, 1); 3109 if (BE (ret != REG_NOERROR, 0)) 3110 { 3111 *err = ret; 3112 goto parse_bracket_exp_free_return; 3113 } 3114 3115 token_len = peek_token_bracket (token, regexp, syntax); 3116 3117 #ifdef _LIBC 3118 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3119 &start_elem, &end_elem); 3120 #else 3121 # ifdef RE_ENABLE_I18N 3122 *err = build_range_exp (sbcset, 3123 dfa->mb_cur_max > 1 ? mbcset : NULL, 3124 &range_alloc, &start_elem, &end_elem); 3125 # else 3126 *err = build_range_exp (sbcset, &start_elem, &end_elem); 3127 # endif 3128 #endif /* RE_ENABLE_I18N */ 3129 if (BE (*err != REG_NOERROR, 0)) 3130 goto parse_bracket_exp_free_return; 3131 } 3132 else 3133 { 3134 switch (start_elem.type) 3135 { 3136 case SB_CHAR: 3137 bitset_set (sbcset, start_elem.opr.ch); 3138 break; 3139 #ifdef RE_ENABLE_I18N 3140 case MB_CHAR: 3141 /* Check whether the array has enough space. */ 3142 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3143 { 3144 wchar_t *new_mbchars; 3145 /* Not enough, realloc it. */ 3146 /* +1 in case of mbcset->nmbchars is 0. */ 3147 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3148 /* Use realloc since array is NULL if *alloc == 0. */ 3149 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3150 mbchar_alloc); 3151 if (BE (new_mbchars == NULL, 0)) 3152 goto parse_bracket_exp_espace; 3153 mbcset->mbchars = new_mbchars; 3154 } 3155 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3156 break; 3157 #endif /* RE_ENABLE_I18N */ 3158 case EQUIV_CLASS: 3159 *err = build_equiv_class (sbcset, 3160 #ifdef RE_ENABLE_I18N 3161 mbcset, &equiv_class_alloc, 3162 #endif /* RE_ENABLE_I18N */ 3163 start_elem.opr.name); 3164 if (BE (*err != REG_NOERROR, 0)) 3165 goto parse_bracket_exp_free_return; 3166 break; 3167 case COLL_SYM: 3168 *err = build_collating_symbol (sbcset, 3169 #ifdef RE_ENABLE_I18N 3170 mbcset, &coll_sym_alloc, 3171 #endif /* RE_ENABLE_I18N */ 3172 start_elem.opr.name); 3173 if (BE (*err != REG_NOERROR, 0)) 3174 goto parse_bracket_exp_free_return; 3175 break; 3176 case CHAR_CLASS: 3177 *err = build_charclass (regexp->trans, sbcset, 3178 #ifdef RE_ENABLE_I18N 3179 mbcset, &char_class_alloc, 3180 #endif /* RE_ENABLE_I18N */ 3181 start_elem.opr.name, syntax); 3182 if (BE (*err != REG_NOERROR, 0)) 3183 goto parse_bracket_exp_free_return; 3184 break; 3185 default: 3186 assert (0); 3187 break; 3188 } 3189 } 3190 if (BE (token->type == END_OF_RE, 0)) 3191 { 3192 *err = REG_EBRACK; 3193 goto parse_bracket_exp_free_return; 3194 } 3195 if (token->type == OP_CLOSE_BRACKET) 3196 break; 3197 } 3198 3199 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3200 3201 /* If it is non-matching list. */ 3202 if (non_match) 3203 bitset_not (sbcset); 3204 3205 #ifdef RE_ENABLE_I18N 3206 /* Ensure only single byte characters are set. */ 3207 if (dfa->mb_cur_max > 1) 3208 bitset_mask (sbcset, dfa->sb_char); 3209 3210 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3211 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3212 || mbcset->non_match))) 3213 { 3214 bin_tree_t *mbc_tree; 3215 int sbc_idx; 3216 /* Build a tree for complex bracket. */ 3217 dfa->has_mb_node = 1; 3218 br_token.type = COMPLEX_BRACKET; 3219 br_token.opr.mbcset = mbcset; 3220 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3221 if (BE (mbc_tree == NULL, 0)) 3222 goto parse_bracket_exp_espace; 3223 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3224 if (sbcset[sbc_idx]) 3225 break; 3226 /* If there are no bits set in sbcset, there is no point 3227 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3228 if (sbc_idx < BITSET_WORDS) 3229 { 3230 /* Build a tree for simple bracket. */ 3231 br_token.type = SIMPLE_BRACKET; 3232 br_token.opr.sbcset = sbcset; 3233 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3234 if (BE (work_tree == NULL, 0)) 3235 goto parse_bracket_exp_espace; 3236 3237 /* Then join them by ALT node. */ 3238 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3239 if (BE (work_tree == NULL, 0)) 3240 goto parse_bracket_exp_espace; 3241 } 3242 else 3243 { 3244 re_free (sbcset); 3245 work_tree = mbc_tree; 3246 } 3247 } 3248 else 3249 #endif /* not RE_ENABLE_I18N */ 3250 { 3251 #ifdef RE_ENABLE_I18N 3252 free_charset (mbcset); 3253 #endif 3254 /* Build a tree for simple bracket. */ 3255 br_token.type = SIMPLE_BRACKET; 3256 br_token.opr.sbcset = sbcset; 3257 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3258 if (BE (work_tree == NULL, 0)) 3259 goto parse_bracket_exp_espace; 3260 } 3261 return work_tree; 3262 3263 parse_bracket_exp_espace: 3264 *err = REG_ESPACE; 3265 parse_bracket_exp_free_return: 3266 re_free (sbcset); 3267 #ifdef RE_ENABLE_I18N 3268 free_charset (mbcset); 3269 #endif /* RE_ENABLE_I18N */ 3270 return NULL; 3271 } 3272 3273 /* Parse an element in the bracket expression. */ 3274 3275 static reg_errcode_t 3276 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3277 re_token_t *token, int token_len, re_dfa_t *dfa, 3278 reg_syntax_t syntax, int accept_hyphen) 3279 { 3280 #ifdef RE_ENABLE_I18N 3281 int cur_char_size; 3282 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3283 if (cur_char_size > 1) 3284 { 3285 elem->type = MB_CHAR; 3286 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3287 re_string_skip_bytes (regexp, cur_char_size); 3288 return REG_NOERROR; 3289 } 3290 #endif /* RE_ENABLE_I18N */ 3291 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3292 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3293 || token->type == OP_OPEN_EQUIV_CLASS) 3294 return parse_bracket_symbol (elem, regexp, token); 3295 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3296 { 3297 /* A '-' must only appear as anything but a range indicator before 3298 the closing bracket. Everything else is an error. */ 3299 re_token_t token2; 3300 (void) peek_token_bracket (&token2, regexp, syntax); 3301 if (token2.type != OP_CLOSE_BRACKET) 3302 /* The actual error value is not standardized since this whole 3303 case is undefined. But ERANGE makes good sense. */ 3304 return REG_ERANGE; 3305 } 3306 elem->type = SB_CHAR; 3307 elem->opr.ch = token->opr.c; 3308 return REG_NOERROR; 3309 } 3310 3311 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3312 such as [:<character_class>:], [.<collating_element>.], and 3313 [=<equivalent_class>=]. */ 3314 3315 static reg_errcode_t 3316 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3317 re_token_t *token) 3318 { 3319 unsigned char ch, delim = token->opr.c; 3320 int i = 0; 3321 if (re_string_eoi(regexp)) 3322 return REG_EBRACK; 3323 for (;; ++i) 3324 { 3325 if (i >= BRACKET_NAME_BUF_SIZE) 3326 return REG_EBRACK; 3327 if (token->type == OP_OPEN_CHAR_CLASS) 3328 ch = re_string_fetch_byte_case (regexp); 3329 else 3330 ch = re_string_fetch_byte (regexp); 3331 if (re_string_eoi(regexp)) 3332 return REG_EBRACK; 3333 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3334 break; 3335 elem->opr.name[i] = ch; 3336 } 3337 re_string_skip_bytes (regexp, 1); 3338 elem->opr.name[i] = '\0'; 3339 switch (token->type) 3340 { 3341 case OP_OPEN_COLL_ELEM: 3342 elem->type = COLL_SYM; 3343 break; 3344 case OP_OPEN_EQUIV_CLASS: 3345 elem->type = EQUIV_CLASS; 3346 break; 3347 case OP_OPEN_CHAR_CLASS: 3348 elem->type = CHAR_CLASS; 3349 break; 3350 default: 3351 break; 3352 } 3353 return REG_NOERROR; 3354 } 3355 3356 /* Helper function for parse_bracket_exp. 3357 Build the equivalence class which is represented by NAME. 3358 The result are written to MBCSET and SBCSET. 3359 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3360 is a pointer argument sinse we may update it. */ 3361 3362 static reg_errcode_t 3363 #ifdef RE_ENABLE_I18N 3364 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, 3365 int *equiv_class_alloc, const unsigned char *name) 3366 #else /* not RE_ENABLE_I18N */ 3367 build_equiv_class (bitset_t sbcset, const unsigned char *name) 3368 #endif /* not RE_ENABLE_I18N */ 3369 { 3370 #ifdef _LIBC 3371 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3372 if (nrules != 0) 3373 { 3374 const int32_t *table, *indirect; 3375 const unsigned char *weights, *extra, *cp; 3376 unsigned char char_buf[2]; 3377 int32_t idx1, idx2; 3378 unsigned int ch; 3379 size_t len; 3380 /* This #include defines a local function! */ 3381 # include <locale/weight.h> 3382 /* Calculate the index for equivalence class. */ 3383 cp = name; 3384 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3385 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3386 _NL_COLLATE_WEIGHTMB); 3387 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3388 _NL_COLLATE_EXTRAMB); 3389 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3390 _NL_COLLATE_INDIRECTMB); 3391 idx1 = findidx (&cp); 3392 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3393 /* This isn't a valid character. */ 3394 return REG_ECOLLATE; 3395 3396 /* Build single byte matcing table for this equivalence class. */ 3397 char_buf[1] = (unsigned char) '\0'; 3398 len = weights[idx1 & 0xffffff]; 3399 for (ch = 0; ch < SBC_MAX; ++ch) 3400 { 3401 char_buf[0] = ch; 3402 cp = char_buf; 3403 idx2 = findidx (&cp); 3404 /* 3405 idx2 = table[ch]; 3406 */ 3407 if (idx2 == 0) 3408 /* This isn't a valid character. */ 3409 continue; 3410 /* Compare only if the length matches and the collation rule 3411 index is the same. */ 3412 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3413 { 3414 int cnt = 0; 3415 3416 while (cnt <= len && 3417 weights[(idx1 & 0xffffff) + 1 + cnt] 3418 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3419 ++cnt; 3420 3421 if (cnt > len) 3422 bitset_set (sbcset, ch); 3423 } 3424 } 3425 /* Check whether the array has enough space. */ 3426 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3427 { 3428 /* Not enough, realloc it. */ 3429 /* +1 in case of mbcset->nequiv_classes is 0. */ 3430 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3431 /* Use realloc since the array is NULL if *alloc == 0. */ 3432 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3433 int32_t, 3434 new_equiv_class_alloc); 3435 if (BE (new_equiv_classes == NULL, 0)) 3436 return REG_ESPACE; 3437 mbcset->equiv_classes = new_equiv_classes; 3438 *equiv_class_alloc = new_equiv_class_alloc; 3439 } 3440 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3441 } 3442 else 3443 #endif /* _LIBC */ 3444 { 3445 if (BE (strlen ((const char *) name) != 1, 0)) 3446 return REG_ECOLLATE; 3447 bitset_set (sbcset, *name); 3448 } 3449 return REG_NOERROR; 3450 } 3451 3452 /* Helper function for parse_bracket_exp. 3453 Build the character class which is represented by NAME. 3454 The result are written to MBCSET and SBCSET. 3455 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3456 is a pointer argument sinse we may update it. */ 3457 3458 static reg_errcode_t 3459 #ifdef RE_ENABLE_I18N 3460 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3461 re_charset_t *mbcset, int *char_class_alloc, 3462 const unsigned char *class_name, reg_syntax_t syntax) 3463 #else /* not RE_ENABLE_I18N */ 3464 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3465 const unsigned char *class_name, reg_syntax_t syntax) 3466 #endif /* not RE_ENABLE_I18N */ 3467 { 3468 int i; 3469 const char *name = (const char *) class_name; 3470 3471 /* In case of REG_ICASE "upper" and "lower" match the both of 3472 upper and lower cases. */ 3473 if ((syntax & RE_ICASE) 3474 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3475 name = "alpha"; 3476 3477 #ifdef RE_ENABLE_I18N 3478 /* Check the space of the arrays. */ 3479 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3480 { 3481 /* Not enough, realloc it. */ 3482 /* +1 in case of mbcset->nchar_classes is 0. */ 3483 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3484 /* Use realloc since array is NULL if *alloc == 0. */ 3485 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3486 new_char_class_alloc); 3487 if (BE (new_char_classes == NULL, 0)) 3488 return REG_ESPACE; 3489 mbcset->char_classes = new_char_classes; 3490 *char_class_alloc = new_char_class_alloc; 3491 } 3492 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3493 #endif /* RE_ENABLE_I18N */ 3494 3495 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3496 do { \ 3497 if (BE (trans != NULL, 0)) \ 3498 { \ 3499 for (i = 0; i < SBC_MAX; ++i) \ 3500 if (ctype_func (i)) \ 3501 bitset_set (sbcset, trans[i]); \ 3502 } \ 3503 else \ 3504 { \ 3505 for (i = 0; i < SBC_MAX; ++i) \ 3506 if (ctype_func (i)) \ 3507 bitset_set (sbcset, i); \ 3508 } \ 3509 } while (0) 3510 3511 if (strcmp (name, "alnum") == 0) 3512 BUILD_CHARCLASS_LOOP (isalnum); 3513 else if (strcmp (name, "cntrl") == 0) 3514 BUILD_CHARCLASS_LOOP (iscntrl); 3515 else if (strcmp (name, "lower") == 0) 3516 BUILD_CHARCLASS_LOOP (islower); 3517 else if (strcmp (name, "space") == 0) 3518 BUILD_CHARCLASS_LOOP (isspace); 3519 else if (strcmp (name, "alpha") == 0) 3520 BUILD_CHARCLASS_LOOP (isalpha); 3521 else if (strcmp (name, "digit") == 0) 3522 BUILD_CHARCLASS_LOOP (isdigit); 3523 else if (strcmp (name, "print") == 0) 3524 BUILD_CHARCLASS_LOOP (isprint); 3525 else if (strcmp (name, "upper") == 0) 3526 BUILD_CHARCLASS_LOOP (isupper); 3527 else if (strcmp (name, "blank") == 0) 3528 BUILD_CHARCLASS_LOOP (isblank); 3529 else if (strcmp (name, "graph") == 0) 3530 BUILD_CHARCLASS_LOOP (isgraph); 3531 else if (strcmp (name, "punct") == 0) 3532 BUILD_CHARCLASS_LOOP (ispunct); 3533 else if (strcmp (name, "xdigit") == 0) 3534 BUILD_CHARCLASS_LOOP (isxdigit); 3535 else 3536 return REG_ECTYPE; 3537 3538 return REG_NOERROR; 3539 } 3540 3541 static bin_tree_t * 3542 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3543 const unsigned char *class_name, 3544 const unsigned char *extra, int non_match, 3545 reg_errcode_t *err) 3546 { 3547 re_bitset_ptr_t sbcset; 3548 #ifdef RE_ENABLE_I18N 3549 re_charset_t *mbcset; 3550 int alloc = 0; 3551 #endif /* not RE_ENABLE_I18N */ 3552 reg_errcode_t ret; 3553 re_token_t br_token; 3554 bin_tree_t *tree; 3555 3556 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3557 #ifdef RE_ENABLE_I18N 3558 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3559 #endif /* RE_ENABLE_I18N */ 3560 3561 #ifdef RE_ENABLE_I18N 3562 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3563 #else /* not RE_ENABLE_I18N */ 3564 if (BE (sbcset == NULL, 0)) 3565 #endif /* not RE_ENABLE_I18N */ 3566 { 3567 *err = REG_ESPACE; 3568 return NULL; 3569 } 3570 3571 if (non_match) 3572 { 3573 #ifdef RE_ENABLE_I18N 3574 mbcset->non_match = 1; 3575 #endif /* not RE_ENABLE_I18N */ 3576 } 3577 3578 /* We don't care the syntax in this case. */ 3579 ret = build_charclass (trans, sbcset, 3580 #ifdef RE_ENABLE_I18N 3581 mbcset, &alloc, 3582 #endif /* RE_ENABLE_I18N */ 3583 class_name, 0); 3584 3585 if (BE (ret != REG_NOERROR, 0)) 3586 { 3587 re_free (sbcset); 3588 #ifdef RE_ENABLE_I18N 3589 free_charset (mbcset); 3590 #endif /* RE_ENABLE_I18N */ 3591 *err = ret; 3592 return NULL; 3593 } 3594 /* \w match '_' also. */ 3595 for (; *extra; extra++) 3596 bitset_set (sbcset, *extra); 3597 3598 /* If it is non-matching list. */ 3599 if (non_match) 3600 bitset_not (sbcset); 3601 3602 #ifdef RE_ENABLE_I18N 3603 /* Ensure only single byte characters are set. */ 3604 if (dfa->mb_cur_max > 1) 3605 bitset_mask (sbcset, dfa->sb_char); 3606 #endif 3607 3608 /* Build a tree for simple bracket. */ 3609 br_token.type = SIMPLE_BRACKET; 3610 br_token.opr.sbcset = sbcset; 3611 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3612 if (BE (tree == NULL, 0)) 3613 goto build_word_op_espace; 3614 3615 #ifdef RE_ENABLE_I18N 3616 if (dfa->mb_cur_max > 1) 3617 { 3618 bin_tree_t *mbc_tree; 3619 /* Build a tree for complex bracket. */ 3620 br_token.type = COMPLEX_BRACKET; 3621 br_token.opr.mbcset = mbcset; 3622 dfa->has_mb_node = 1; 3623 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3624 if (BE (mbc_tree == NULL, 0)) 3625 goto build_word_op_espace; 3626 /* Then join them by ALT node. */ 3627 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3628 if (BE (mbc_tree != NULL, 1)) 3629 return tree; 3630 } 3631 else 3632 { 3633 free_charset (mbcset); 3634 return tree; 3635 } 3636 #else /* not RE_ENABLE_I18N */ 3637 return tree; 3638 #endif /* not RE_ENABLE_I18N */ 3639 3640 build_word_op_espace: 3641 re_free (sbcset); 3642 #ifdef RE_ENABLE_I18N 3643 free_charset (mbcset); 3644 #endif /* RE_ENABLE_I18N */ 3645 *err = REG_ESPACE; 3646 return NULL; 3647 } 3648 3649 /* This is intended for the expressions like "a{1,3}". 3650 Fetch a number from `input', and return the number. 3651 Return -1, if the number field is empty like "{,1}". 3652 Return -2, If an error is occured. */ 3653 3654 static int 3655 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3656 { 3657 int num = -1; 3658 unsigned char c; 3659 while (1) 3660 { 3661 fetch_token (token, input, syntax); 3662 c = token->opr.c; 3663 if (BE (token->type == END_OF_RE, 0)) 3664 return -2; 3665 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3666 break; 3667 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3668 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); 3669 num = (num > RE_DUP_MAX) ? -2 : num; 3670 } 3671 return num; 3672 } 3673 3674 #ifdef RE_ENABLE_I18N 3675 static void 3676 free_charset (re_charset_t *cset) 3677 { 3678 re_free (cset->mbchars); 3679 # ifdef _LIBC 3680 re_free (cset->coll_syms); 3681 re_free (cset->equiv_classes); 3682 re_free (cset->range_starts); 3683 re_free (cset->range_ends); 3684 # endif 3685 re_free (cset->char_classes); 3686 re_free (cset); 3687 } 3688 #endif /* RE_ENABLE_I18N */ 3689 3690 /* Functions for binary tree operation. */ 3691 3692 /* Create a tree node. */ 3693 3694 static bin_tree_t * 3695 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3696 re_token_type_t type) 3697 { 3698 re_token_t t; 3699 t.type = type; 3700 return create_token_tree (dfa, left, right, &t); 3701 } 3702 3703 static bin_tree_t * 3704 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3705 const re_token_t *token) 3706 { 3707 bin_tree_t *tree; 3708 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3709 { 3710 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3711 3712 if (storage == NULL) 3713 return NULL; 3714 storage->next = dfa->str_tree_storage; 3715 dfa->str_tree_storage = storage; 3716 dfa->str_tree_storage_idx = 0; 3717 } 3718 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3719 3720 tree->parent = NULL; 3721 tree->left = left; 3722 tree->right = right; 3723 tree->token = *token; 3724 tree->token.duplicated = 0; 3725 tree->token.opt_subexp = 0; 3726 tree->first = NULL; 3727 tree->next = NULL; 3728 tree->node_idx = -1; 3729 3730 if (left != NULL) 3731 left->parent = tree; 3732 if (right != NULL) 3733 right->parent = tree; 3734 return tree; 3735 } 3736 3737 /* Mark the tree SRC as an optional subexpression. 3738 To be called from preorder or postorder. */ 3739 3740 static reg_errcode_t 3741 mark_opt_subexp (void *extra, bin_tree_t *node) 3742 { 3743 int idx = (int) (long) extra; 3744 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3745 node->token.opt_subexp = 1; 3746 3747 return REG_NOERROR; 3748 } 3749 3750 /* Free the allocated memory inside NODE. */ 3751 3752 static void 3753 free_token (re_token_t *node) 3754 { 3755 #ifdef RE_ENABLE_I18N 3756 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3757 free_charset (node->opr.mbcset); 3758 else 3759 #endif /* RE_ENABLE_I18N */ 3760 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3761 re_free (node->opr.sbcset); 3762 } 3763 3764 /* Worker function for tree walking. Free the allocated memory inside NODE 3765 and its children. */ 3766 3767 static reg_errcode_t 3768 free_tree (void *extra, bin_tree_t *node) 3769 { 3770 free_token (&node->token); 3771 return REG_NOERROR; 3772 } 3773 3774 3775 /* Duplicate the node SRC, and return new node. This is a preorder 3776 visit similar to the one implemented by the generic visitor, but 3777 we need more infrastructure to maintain two parallel trees --- so, 3778 it's easier to duplicate. */ 3779 3780 static bin_tree_t * 3781 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3782 { 3783 const bin_tree_t *node; 3784 bin_tree_t *dup_root; 3785 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3786 3787 for (node = root; ; ) 3788 { 3789 /* Create a new tree and link it back to the current parent. */ 3790 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3791 if (*p_new == NULL) 3792 return NULL; 3793 (*p_new)->parent = dup_node; 3794 (*p_new)->token.duplicated = 1; 3795 dup_node = *p_new; 3796 3797 /* Go to the left node, or up and to the right. */ 3798 if (node->left) 3799 { 3800 node = node->left; 3801 p_new = &dup_node->left; 3802 } 3803 else 3804 { 3805 const bin_tree_t *prev = NULL; 3806 while (node->right == prev || node->right == NULL) 3807 { 3808 prev = node; 3809 node = node->parent; 3810 dup_node = dup_node->parent; 3811 if (!node) 3812 return dup_root; 3813 } 3814 node = node->right; 3815 p_new = &dup_node->right; 3816 } 3817 } 3818 } 3819