1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005, 2007, 2009 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 22 int n) internal_function; 23 static void match_ctx_clean (re_match_context_t *mctx) internal_function; 24 static void match_ctx_free (re_match_context_t *cache) internal_function; 25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, 26 int str_idx, int from, int to) 27 internal_function; 28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) 29 internal_function; 30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, 31 int str_idx) internal_function; 32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 33 int node, int str_idx) 34 internal_function; 35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 36 re_dfastate_t **limited_sts, int last_node, 37 int last_str_idx) 38 internal_function; 39 static reg_errcode_t re_search_internal (const regex_t *preg, 40 const char *string, int length, 41 int start, int range, int stop, 42 size_t nmatch, regmatch_t pmatch[], 43 int eflags) internal_function; 44 static int re_search_2_stub (struct re_pattern_buffer *bufp, 45 const char *string1, int length1, 46 const char *string2, int length2, 47 int start, int range, struct re_registers *regs, 48 int stop, int ret_len) internal_function; 49 static int re_search_stub (struct re_pattern_buffer *bufp, 50 const char *string, int length, int start, 51 int range, int stop, struct re_registers *regs, 52 int ret_len) internal_function; 53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 54 int nregs, int regs_allocated) internal_function; 55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) 56 internal_function; 57 static int check_matching (re_match_context_t *mctx, int fl_longest_match, 58 int *p_match_first) internal_function; 59 static int check_halt_state_context (const re_match_context_t *mctx, 60 const re_dfastate_t *state, int idx) 61 internal_function; 62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 63 regmatch_t *prev_idx_match, int cur_node, 64 int cur_idx, int nmatch) internal_function; 65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 66 int str_idx, int dest_node, int nregs, 67 regmatch_t *regs, 68 re_node_set *eps_via_nodes) 69 internal_function; 70 static reg_errcode_t set_regs (const regex_t *preg, 71 const re_match_context_t *mctx, 72 size_t nmatch, regmatch_t *pmatch, 73 int fl_backtrack) internal_function; 74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) 75 internal_function; 76 77 #ifdef RE_ENABLE_I18N 78 static int sift_states_iter_mb (const re_match_context_t *mctx, 79 re_sift_context_t *sctx, 80 int node_idx, int str_idx, int max_str_idx) 81 internal_function; 82 #endif /* RE_ENABLE_I18N */ 83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, 84 re_sift_context_t *sctx) 85 internal_function; 86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, 87 re_sift_context_t *sctx, int str_idx, 88 re_node_set *cur_dest) 89 internal_function; 90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx, 91 re_sift_context_t *sctx, 92 int str_idx, 93 re_node_set *dest_nodes) 94 internal_function; 95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa, 96 re_node_set *dest_nodes, 97 const re_node_set *candidates) 98 internal_function; 99 static int check_dst_limits (const re_match_context_t *mctx, 100 re_node_set *limits, 101 int dst_node, int dst_idx, int src_node, 102 int src_idx) internal_function; 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, 104 int boundaries, int subexp_idx, 105 int from_node, int bkref_idx) 106 internal_function; 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx, 108 int limit, int subexp_idx, 109 int node, int str_idx, 110 int bkref_idx) internal_function; 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa, 112 re_node_set *dest_nodes, 113 const re_node_set *candidates, 114 re_node_set *limits, 115 struct re_backref_cache_entry *bkref_ents, 116 int str_idx) internal_function; 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx, 118 re_sift_context_t *sctx, 119 int str_idx, const re_node_set *candidates) 120 internal_function; 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa, 122 re_dfastate_t **dst, 123 re_dfastate_t **src, int num) 124 internal_function; 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err, 126 re_match_context_t *mctx) internal_function; 127 static re_dfastate_t *transit_state (reg_errcode_t *err, 128 re_match_context_t *mctx, 129 re_dfastate_t *state) internal_function; 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, 131 re_match_context_t *mctx, 132 re_dfastate_t *next_state) 133 internal_function; 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, 135 re_node_set *cur_nodes, 136 int str_idx) internal_function; 137 #if 0 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, 139 re_match_context_t *mctx, 140 re_dfastate_t *pstate) 141 internal_function; 142 #endif 143 #ifdef RE_ENABLE_I18N 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx, 145 re_dfastate_t *pstate) 146 internal_function; 147 #endif /* RE_ENABLE_I18N */ 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, 149 const re_node_set *nodes) 150 internal_function; 151 static reg_errcode_t get_subexp (re_match_context_t *mctx, 152 int bkref_node, int bkref_str_idx) 153 internal_function; 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, 155 const re_sub_match_top_t *sub_top, 156 re_sub_match_last_t *sub_last, 157 int bkref_node, int bkref_str) 158 internal_function; 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 160 int subexp_idx, int type) internal_function; 161 static reg_errcode_t check_arrival (re_match_context_t *mctx, 162 state_array_t *path, int top_node, 163 int top_str, int last_node, int last_str, 164 int type) internal_function; 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, 166 int str_idx, 167 re_node_set *cur_nodes, 168 re_node_set *next_nodes) 169 internal_function; 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa, 171 re_node_set *cur_nodes, 172 int ex_subexp, int type) 173 internal_function; 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa, 175 re_node_set *dst_nodes, 176 int target, int ex_subexp, 177 int type) internal_function; 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, 179 re_node_set *cur_nodes, int cur_str, 180 int subexp_num, int type) 181 internal_function; 182 static int build_trtable (const re_dfa_t *dfa, 183 re_dfastate_t *state) internal_function; 184 #ifdef RE_ENABLE_I18N 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, 186 const re_string_t *input, int idx) 187 internal_function; 188 # ifdef _LIBC 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs, 190 size_t name_len) 191 internal_function; 192 # endif /* _LIBC */ 193 #endif /* RE_ENABLE_I18N */ 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa, 195 const re_dfastate_t *state, 196 re_node_set *states_node, 197 bitset_t *states_ch) internal_function; 198 static int check_node_accept (const re_match_context_t *mctx, 199 const re_token_t *node, int idx) 200 internal_function; 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx) 202 internal_function; 203 204 /* Entry point for POSIX code. */ 205 206 /* regexec searches for a given pattern, specified by PREG, in the 207 string STRING. 208 209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 211 least NMATCH elements, and we set them to the offsets of the 212 corresponding matched substrings. 213 214 EFLAGS specifies `execution flags' which affect matching: if 215 REG_NOTBOL is set, then ^ does not match at the beginning of the 216 string; if REG_NOTEOL is set, then $ does not match at the end. 217 218 We return 0 if we find a match and REG_NOMATCH if not. */ 219 220 int 221 regexec (preg, string, nmatch, pmatch, eflags) 222 const regex_t *__restrict preg; 223 const char *__restrict string; 224 size_t nmatch; 225 regmatch_t pmatch[]; 226 int eflags; 227 { 228 reg_errcode_t err; 229 int start, length; 230 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 231 232 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) 233 return REG_BADPAT; 234 235 if (eflags & REG_STARTEND) 236 { 237 start = pmatch[0].rm_so; 238 length = pmatch[0].rm_eo; 239 } 240 else 241 { 242 start = 0; 243 length = strlen (string); 244 } 245 246 __libc_lock_lock (dfa->lock); 247 if (preg->no_sub) 248 err = re_search_internal (preg, string, length, start, length - start, 249 length, 0, NULL, eflags); 250 else 251 err = re_search_internal (preg, string, length, start, length - start, 252 length, nmatch, pmatch, eflags); 253 __libc_lock_unlock (dfa->lock); 254 return err != REG_NOERROR; 255 } 256 257 #ifdef _LIBC 258 # include <shlib-compat.h> 259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); 260 261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) 262 __typeof__ (__regexec) __compat_regexec; 263 264 int 265 attribute_compat_text_section 266 __compat_regexec (const regex_t *__restrict preg, 267 const char *__restrict string, size_t nmatch, 268 regmatch_t pmatch[], int eflags) 269 { 270 return regexec (preg, string, nmatch, pmatch, 271 eflags & (REG_NOTBOL | REG_NOTEOL)); 272 } 273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); 274 # endif 275 #endif 276 277 /* Entry points for GNU code. */ 278 279 /* re_match, re_search, re_match_2, re_search_2 280 281 The former two functions operate on STRING with length LENGTH, 282 while the later two operate on concatenation of STRING1 and STRING2 283 with lengths LENGTH1 and LENGTH2, respectively. 284 285 re_match() matches the compiled pattern in BUFP against the string, 286 starting at index START. 287 288 re_search() first tries matching at index START, then it tries to match 289 starting from index START + 1, and so on. The last start position tried 290 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 291 way as re_match().) 292 293 The parameter STOP of re_{match,search}_2 specifies that no match exceeding 294 the first STOP characters of the concatenation of the strings should be 295 concerned. 296 297 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 298 and all groups is stroed in REGS. (For the "_2" variants, the offsets are 299 computed relative to the concatenation, not relative to the individual 300 strings.) 301 302 On success, re_match* functions return the length of the match, re_search* 303 return the position of the start of the match. Return value -1 means no 304 match was found and -2 indicates an internal error. */ 305 306 int 307 re_match (bufp, string, length, start, regs) 308 struct re_pattern_buffer *bufp; 309 const char *string; 310 int length, start; 311 struct re_registers *regs; 312 { 313 return re_search_stub (bufp, string, length, start, 0, length, regs, 1); 314 } 315 #ifdef _LIBC 316 weak_alias (__re_match, re_match) 317 #endif 318 319 int 320 re_search (bufp, string, length, start, range, regs) 321 struct re_pattern_buffer *bufp; 322 const char *string; 323 int length, start, range; 324 struct re_registers *regs; 325 { 326 return re_search_stub (bufp, string, length, start, range, length, regs, 0); 327 } 328 #ifdef _LIBC 329 weak_alias (__re_search, re_search) 330 #endif 331 332 int 333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 334 struct re_pattern_buffer *bufp; 335 const char *string1, *string2; 336 int length1, length2, start, stop; 337 struct re_registers *regs; 338 { 339 return re_search_2_stub (bufp, string1, length1, string2, length2, 340 start, 0, regs, stop, 1); 341 } 342 #ifdef _LIBC 343 weak_alias (__re_match_2, re_match_2) 344 #endif 345 346 int 347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 348 struct re_pattern_buffer *bufp; 349 const char *string1, *string2; 350 int length1, length2, start, range, stop; 351 struct re_registers *regs; 352 { 353 return re_search_2_stub (bufp, string1, length1, string2, length2, 354 start, range, regs, stop, 0); 355 } 356 #ifdef _LIBC 357 weak_alias (__re_search_2, re_search_2) 358 #endif 359 360 static int 361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, 362 stop, ret_len) 363 struct re_pattern_buffer *bufp; 364 const char *string1, *string2; 365 int length1, length2, start, range, stop, ret_len; 366 struct re_registers *regs; 367 { 368 const char *str; 369 int rval; 370 int len = length1 + length2; 371 int free_str = 0; 372 373 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) 374 return -2; 375 376 /* Concatenate the strings. */ 377 if (length2 > 0) 378 if (length1 > 0) 379 { 380 char *s = re_malloc (char, len); 381 382 if (BE (s == NULL, 0)) 383 return -2; 384 #ifdef _LIBC 385 memcpy (__mempcpy (s, string1, length1), string2, length2); 386 #else 387 memcpy (s, string1, length1); 388 memcpy (s + length1, string2, length2); 389 #endif 390 str = s; 391 free_str = 1; 392 } 393 else 394 str = string2; 395 else 396 str = string1; 397 398 rval = re_search_stub (bufp, str, len, start, range, stop, regs, 399 ret_len); 400 if (free_str) 401 re_free ((char *) str); 402 return rval; 403 } 404 405 /* The parameters have the same meaning as those of re_search. 406 Additional parameters: 407 If RET_LEN is nonzero the length of the match is returned (re_match style); 408 otherwise the position of the match is returned. */ 409 410 static int 411 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) 412 struct re_pattern_buffer *bufp; 413 const char *string; 414 int length, start, range, stop, ret_len; 415 struct re_registers *regs; 416 { 417 reg_errcode_t result; 418 regmatch_t *pmatch; 419 int nregs, rval; 420 int eflags = 0; 421 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 422 423 /* Check for out-of-range. */ 424 if (BE (start < 0 || start > length, 0)) 425 return -1; 426 if (BE (start + range > length, 0)) 427 range = length - start; 428 else if (BE (start + range < 0, 0)) 429 range = -start; 430 431 __libc_lock_lock (dfa->lock); 432 433 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 434 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 435 436 /* Compile fastmap if we haven't yet. */ 437 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) 438 re_compile_fastmap (bufp); 439 440 if (BE (bufp->no_sub, 0)) 441 regs = NULL; 442 443 /* We need at least 1 register. */ 444 if (regs == NULL) 445 nregs = 1; 446 else if (BE (bufp->regs_allocated == REGS_FIXED && 447 regs->num_regs < bufp->re_nsub + 1, 0)) 448 { 449 nregs = regs->num_regs; 450 if (BE (nregs < 1, 0)) 451 { 452 /* Nothing can be copied to regs. */ 453 regs = NULL; 454 nregs = 1; 455 } 456 } 457 else 458 nregs = bufp->re_nsub + 1; 459 pmatch = re_malloc (regmatch_t, nregs); 460 if (BE (pmatch == NULL, 0)) 461 { 462 rval = -2; 463 goto out; 464 } 465 466 result = re_search_internal (bufp, string, length, start, range, stop, 467 nregs, pmatch, eflags); 468 469 rval = 0; 470 471 /* I hope we needn't fill ther regs with -1's when no match was found. */ 472 if (result != REG_NOERROR) 473 rval = -1; 474 else if (regs != NULL) 475 { 476 /* If caller wants register contents data back, copy them. */ 477 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 478 bufp->regs_allocated); 479 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 480 rval = -2; 481 } 482 483 if (BE (rval == 0, 1)) 484 { 485 if (ret_len) 486 { 487 assert (pmatch[0].rm_so == start); 488 rval = pmatch[0].rm_eo - start; 489 } 490 else 491 rval = pmatch[0].rm_so; 492 } 493 re_free (pmatch); 494 out: 495 __libc_lock_unlock (dfa->lock); 496 return rval; 497 } 498 499 static unsigned 500 re_copy_regs (regs, pmatch, nregs, regs_allocated) 501 struct re_registers *regs; 502 regmatch_t *pmatch; 503 int nregs, regs_allocated; 504 { 505 int rval = REGS_REALLOCATE; 506 int i; 507 int need_regs = nregs + 1; 508 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 509 uses. */ 510 511 /* Have the register data arrays been allocated? */ 512 if (regs_allocated == REGS_UNALLOCATED) 513 { /* No. So allocate them with malloc. */ 514 regs->start = re_malloc (regoff_t, need_regs); 515 regs->end = re_malloc (regoff_t, need_regs); 516 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0)) 517 return REGS_UNALLOCATED; 518 regs->num_regs = need_regs; 519 } 520 else if (regs_allocated == REGS_REALLOCATE) 521 { /* Yes. If we need more elements than were already 522 allocated, reallocate them. If we need fewer, just 523 leave it alone. */ 524 if (BE (need_regs > regs->num_regs, 0)) 525 { 526 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); 527 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs); 528 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0)) 529 return REGS_UNALLOCATED; 530 regs->start = new_start; 531 regs->end = new_end; 532 regs->num_regs = need_regs; 533 } 534 } 535 else 536 { 537 assert (regs_allocated == REGS_FIXED); 538 /* This function may not be called with REGS_FIXED and nregs too big. */ 539 assert (regs->num_regs >= nregs); 540 rval = REGS_FIXED; 541 } 542 543 /* Copy the regs. */ 544 for (i = 0; i < nregs; ++i) 545 { 546 regs->start[i] = pmatch[i].rm_so; 547 regs->end[i] = pmatch[i].rm_eo; 548 } 549 for ( ; i < regs->num_regs; ++i) 550 regs->start[i] = regs->end[i] = -1; 551 552 return rval; 553 } 554 555 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 556 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 557 this memory for recording register information. STARTS and ENDS 558 must be allocated using the malloc library routine, and must each 559 be at least NUM_REGS * sizeof (regoff_t) bytes long. 560 561 If NUM_REGS == 0, then subsequent matches should allocate their own 562 register data. 563 564 Unless this function is called, the first search or match using 565 PATTERN_BUFFER will allocate its own register data, without 566 freeing the old data. */ 567 568 void 569 re_set_registers (bufp, regs, num_regs, starts, ends) 570 struct re_pattern_buffer *bufp; 571 struct re_registers *regs; 572 unsigned num_regs; 573 regoff_t *starts, *ends; 574 { 575 if (num_regs) 576 { 577 bufp->regs_allocated = REGS_REALLOCATE; 578 regs->num_regs = num_regs; 579 regs->start = starts; 580 regs->end = ends; 581 } 582 else 583 { 584 bufp->regs_allocated = REGS_UNALLOCATED; 585 regs->num_regs = 0; 586 regs->start = regs->end = (regoff_t *) 0; 587 } 588 } 589 #ifdef _LIBC 590 weak_alias (__re_set_registers, re_set_registers) 591 #endif 592 593 /* Entry points compatible with 4.2 BSD regex library. We don't define 594 them unless specifically requested. */ 595 596 #if defined _REGEX_RE_COMP || defined _LIBC 597 int 598 # ifdef _LIBC 599 weak_function 600 # endif 601 re_exec (s) 602 const char *s; 603 { 604 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 605 } 606 #endif /* _REGEX_RE_COMP */ 607 608 /* Internal entry point. */ 609 610 /* Searches for a compiled pattern PREG in the string STRING, whose 611 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 612 mingings with regexec. START, and RANGE have the same meanings 613 with re_search. 614 Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 615 otherwise return the error code. 616 Note: We assume front end functions already check ranges. 617 (START + RANGE >= 0 && START + RANGE <= LENGTH) */ 618 619 static reg_errcode_t 620 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, 621 eflags) 622 const regex_t *preg; 623 const char *string; 624 int length, start, range, stop, eflags; 625 size_t nmatch; 626 regmatch_t pmatch[]; 627 { 628 reg_errcode_t err; 629 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 630 int left_lim, right_lim, incr; 631 int fl_longest_match, match_first, match_kind, match_last = -1; 632 int extra_nmatch; 633 int sb, ch; 634 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 635 re_match_context_t mctx = { .dfa = dfa }; 636 #else 637 re_match_context_t mctx; 638 #endif 639 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate 640 && range && !preg->can_be_null) ? preg->fastmap : NULL; 641 RE_TRANSLATE_TYPE t = preg->translate; 642 643 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) 644 memset (&mctx, '\0', sizeof (re_match_context_t)); 645 mctx.dfa = dfa; 646 #endif 647 648 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; 649 nmatch -= extra_nmatch; 650 651 /* Check if the DFA haven't been compiled. */ 652 if (BE (preg->used == 0 || dfa->init_state == NULL 653 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 654 || dfa->init_state_begbuf == NULL, 0)) 655 return REG_NOMATCH; 656 657 #ifdef DEBUG 658 /* We assume front-end functions already check them. */ 659 assert (start + range >= 0 && start + range <= length); 660 #endif 661 662 /* If initial states with non-begbuf contexts have no elements, 663 the regex must be anchored. If preg->newline_anchor is set, 664 we'll never use init_state_nl, so do not check it. */ 665 if (dfa->init_state->nodes.nelem == 0 666 && dfa->init_state_word->nodes.nelem == 0 667 && (dfa->init_state_nl->nodes.nelem == 0 668 || !preg->newline_anchor)) 669 { 670 if (start != 0 && start + range != 0) 671 return REG_NOMATCH; 672 start = range = 0; 673 } 674 675 /* We must check the longest matching, if nmatch > 0. */ 676 fl_longest_match = (nmatch != 0 || dfa->nbackref); 677 678 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, 679 preg->translate, preg->syntax & RE_ICASE, dfa); 680 if (BE (err != REG_NOERROR, 0)) 681 goto free_return; 682 mctx.input.stop = stop; 683 mctx.input.raw_stop = stop; 684 mctx.input.newline_anchor = preg->newline_anchor; 685 686 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); 687 if (BE (err != REG_NOERROR, 0)) 688 goto free_return; 689 690 /* We will log all the DFA states through which the dfa pass, 691 if nmatch > 1, or this dfa has "multibyte node", which is a 692 back-reference or a node which can accept multibyte character or 693 multi character collating element. */ 694 if (nmatch > 1 || dfa->has_mb_node) 695 { 696 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); 697 if (BE (mctx.state_log == NULL, 0)) 698 { 699 err = REG_ESPACE; 700 goto free_return; 701 } 702 } 703 else 704 mctx.state_log = NULL; 705 706 match_first = start; 707 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 708 : CONTEXT_NEWLINE | CONTEXT_BEGBUF; 709 710 /* Check incrementally whether of not the input string match. */ 711 incr = (range < 0) ? -1 : 1; 712 left_lim = (range < 0) ? start + range : start; 713 right_lim = (range < 0) ? start : start + range; 714 sb = dfa->mb_cur_max == 1; 715 match_kind = 716 (fastmap 717 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) 718 | (range >= 0 ? 2 : 0) 719 | (t != NULL ? 1 : 0)) 720 : 8); 721 722 for (;; match_first += incr) 723 { 724 err = REG_NOMATCH; 725 if (match_first < left_lim || right_lim < match_first) 726 goto free_return; 727 728 /* Advance as rapidly as possible through the string, until we 729 find a plausible place to start matching. This may be done 730 with varying efficiency, so there are various possibilities: 731 only the most common of them are specialized, in order to 732 save on code size. We use a switch statement for speed. */ 733 switch (match_kind) 734 { 735 case 8: 736 /* No fastmap. */ 737 break; 738 739 case 7: 740 /* Fastmap with single-byte translation, match forward. */ 741 while (BE (match_first < right_lim, 1) 742 && !fastmap[t[(unsigned char) string[match_first]]]) 743 ++match_first; 744 goto forward_match_found_start_or_reached_end; 745 746 case 6: 747 /* Fastmap without translation, match forward. */ 748 while (BE (match_first < right_lim, 1) 749 && !fastmap[(unsigned char) string[match_first]]) 750 ++match_first; 751 752 forward_match_found_start_or_reached_end: 753 if (BE (match_first == right_lim, 0)) 754 { 755 ch = match_first >= length 756 ? 0 : (unsigned char) string[match_first]; 757 if (!fastmap[t ? t[ch] : ch]) 758 goto free_return; 759 } 760 break; 761 762 case 4: 763 case 5: 764 /* Fastmap without multi-byte translation, match backwards. */ 765 while (match_first >= left_lim) 766 { 767 ch = match_first >= length 768 ? 0 : (unsigned char) string[match_first]; 769 if (fastmap[t ? t[ch] : ch]) 770 break; 771 --match_first; 772 } 773 if (match_first < left_lim) 774 goto free_return; 775 break; 776 777 default: 778 /* In this case, we can't determine easily the current byte, 779 since it might be a component byte of a multibyte 780 character. Then we use the constructed buffer instead. */ 781 for (;;) 782 { 783 /* If MATCH_FIRST is out of the valid range, reconstruct the 784 buffers. */ 785 unsigned int offset = match_first - mctx.input.raw_mbs_idx; 786 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0)) 787 { 788 err = re_string_reconstruct (&mctx.input, match_first, 789 eflags); 790 if (BE (err != REG_NOERROR, 0)) 791 goto free_return; 792 793 offset = match_first - mctx.input.raw_mbs_idx; 794 } 795 /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 796 Note that MATCH_FIRST must not be smaller than 0. */ 797 ch = (match_first >= length 798 ? 0 : re_string_byte_at (&mctx.input, offset)); 799 if (fastmap[ch]) 800 break; 801 match_first += incr; 802 if (match_first < left_lim || match_first > right_lim) 803 { 804 err = REG_NOMATCH; 805 goto free_return; 806 } 807 } 808 break; 809 } 810 811 /* Reconstruct the buffers so that the matcher can assume that 812 the matching starts from the beginning of the buffer. */ 813 err = re_string_reconstruct (&mctx.input, match_first, eflags); 814 if (BE (err != REG_NOERROR, 0)) 815 goto free_return; 816 817 #ifdef RE_ENABLE_I18N 818 /* Don't consider this char as a possible match start if it part, 819 yet isn't the head, of a multibyte character. */ 820 if (!sb && !re_string_first_byte (&mctx.input, 0)) 821 continue; 822 #endif 823 824 /* It seems to be appropriate one, then use the matcher. */ 825 /* We assume that the matching starts from 0. */ 826 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 827 match_last = check_matching (&mctx, fl_longest_match, 828 range >= 0 ? &match_first : NULL); 829 if (match_last != -1) 830 { 831 if (BE (match_last == -2, 0)) 832 { 833 err = REG_ESPACE; 834 goto free_return; 835 } 836 else 837 { 838 mctx.match_last = match_last; 839 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 840 { 841 re_dfastate_t *pstate = mctx.state_log[match_last]; 842 mctx.last_node = check_halt_state_context (&mctx, pstate, 843 match_last); 844 } 845 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 846 || dfa->nbackref) 847 { 848 err = prune_impossible_nodes (&mctx); 849 if (err == REG_NOERROR) 850 break; 851 if (BE (err != REG_NOMATCH, 0)) 852 goto free_return; 853 match_last = -1; 854 } 855 else 856 break; /* We found a match. */ 857 } 858 } 859 860 match_ctx_clean (&mctx); 861 } 862 863 #ifdef DEBUG 864 assert (match_last != -1); 865 assert (err == REG_NOERROR); 866 #endif 867 868 /* Set pmatch[] if we need. */ 869 if (nmatch > 0) 870 { 871 int reg_idx; 872 873 /* Initialize registers. */ 874 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) 875 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 876 877 /* Set the points where matching start/end. */ 878 pmatch[0].rm_so = 0; 879 pmatch[0].rm_eo = mctx.match_last; 880 881 if (!preg->no_sub && nmatch > 1) 882 { 883 err = set_regs (preg, &mctx, nmatch, pmatch, 884 dfa->has_plural_match && dfa->nbackref > 0); 885 if (BE (err != REG_NOERROR, 0)) 886 goto free_return; 887 } 888 889 /* At last, add the offset to the each registers, since we slided 890 the buffers so that we could assume that the matching starts 891 from 0. */ 892 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 893 if (pmatch[reg_idx].rm_so != -1) 894 { 895 #ifdef RE_ENABLE_I18N 896 if (BE (mctx.input.offsets_needed != 0, 0)) 897 { 898 pmatch[reg_idx].rm_so = 899 (pmatch[reg_idx].rm_so == mctx.input.valid_len 900 ? mctx.input.valid_raw_len 901 : mctx.input.offsets[pmatch[reg_idx].rm_so]); 902 pmatch[reg_idx].rm_eo = 903 (pmatch[reg_idx].rm_eo == mctx.input.valid_len 904 ? mctx.input.valid_raw_len 905 : mctx.input.offsets[pmatch[reg_idx].rm_eo]); 906 } 907 #else 908 assert (mctx.input.offsets_needed == 0); 909 #endif 910 pmatch[reg_idx].rm_so += match_first; 911 pmatch[reg_idx].rm_eo += match_first; 912 } 913 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) 914 { 915 pmatch[nmatch + reg_idx].rm_so = -1; 916 pmatch[nmatch + reg_idx].rm_eo = -1; 917 } 918 919 if (dfa->subexp_map) 920 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) 921 if (dfa->subexp_map[reg_idx] != reg_idx) 922 { 923 pmatch[reg_idx + 1].rm_so 924 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; 925 pmatch[reg_idx + 1].rm_eo 926 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; 927 } 928 } 929 930 free_return: 931 re_free (mctx.state_log); 932 if (dfa->nbackref) 933 match_ctx_free (&mctx); 934 re_string_destruct (&mctx.input); 935 return err; 936 } 937 938 static reg_errcode_t 939 prune_impossible_nodes (mctx) 940 re_match_context_t *mctx; 941 { 942 const re_dfa_t *const dfa = mctx->dfa; 943 int halt_node, match_last; 944 reg_errcode_t ret; 945 re_dfastate_t **sifted_states; 946 re_dfastate_t **lim_states = NULL; 947 re_sift_context_t sctx; 948 #ifdef DEBUG 949 assert (mctx->state_log != NULL); 950 #endif 951 match_last = mctx->match_last; 952 halt_node = mctx->last_node; 953 sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 954 if (BE (sifted_states == NULL, 0)) 955 { 956 ret = REG_ESPACE; 957 goto free_return; 958 } 959 if (dfa->nbackref) 960 { 961 lim_states = re_malloc (re_dfastate_t *, match_last + 1); 962 if (BE (lim_states == NULL, 0)) 963 { 964 ret = REG_ESPACE; 965 goto free_return; 966 } 967 while (1) 968 { 969 memset (lim_states, '\0', 970 sizeof (re_dfastate_t *) * (match_last + 1)); 971 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 972 match_last); 973 ret = sift_states_backward (mctx, &sctx); 974 re_node_set_free (&sctx.limits); 975 if (BE (ret != REG_NOERROR, 0)) 976 goto free_return; 977 if (sifted_states[0] != NULL || lim_states[0] != NULL) 978 break; 979 do 980 { 981 --match_last; 982 if (match_last < 0) 983 { 984 ret = REG_NOMATCH; 985 goto free_return; 986 } 987 } while (mctx->state_log[match_last] == NULL 988 || !mctx->state_log[match_last]->halt); 989 halt_node = check_halt_state_context (mctx, 990 mctx->state_log[match_last], 991 match_last); 992 } 993 ret = merge_state_array (dfa, sifted_states, lim_states, 994 match_last + 1); 995 re_free (lim_states); 996 lim_states = NULL; 997 if (BE (ret != REG_NOERROR, 0)) 998 goto free_return; 999 } 1000 else 1001 { 1002 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); 1003 ret = sift_states_backward (mctx, &sctx); 1004 re_node_set_free (&sctx.limits); 1005 if (BE (ret != REG_NOERROR, 0)) 1006 goto free_return; 1007 if (sifted_states[0] == NULL) 1008 { 1009 ret = REG_NOMATCH; 1010 goto free_return; 1011 } 1012 } 1013 re_free (mctx->state_log); 1014 mctx->state_log = sifted_states; 1015 sifted_states = NULL; 1016 mctx->last_node = halt_node; 1017 mctx->match_last = match_last; 1018 ret = REG_NOERROR; 1019 free_return: 1020 re_free (sifted_states); 1021 re_free (lim_states); 1022 return ret; 1023 } 1024 1025 /* Acquire an initial state and return it. 1026 We must select appropriate initial state depending on the context, 1027 since initial states may have constraints like "\<", "^", etc.. */ 1028 1029 static inline re_dfastate_t * 1030 __attribute ((always_inline)) internal_function 1031 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, 1032 int idx) 1033 { 1034 const re_dfa_t *const dfa = mctx->dfa; 1035 if (dfa->init_state->has_constraint) 1036 { 1037 unsigned int context; 1038 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); 1039 if (IS_WORD_CONTEXT (context)) 1040 return dfa->init_state_word; 1041 else if (IS_ORDINARY_CONTEXT (context)) 1042 return dfa->init_state; 1043 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 1044 return dfa->init_state_begbuf; 1045 else if (IS_NEWLINE_CONTEXT (context)) 1046 return dfa->init_state_nl; 1047 else if (IS_BEGBUF_CONTEXT (context)) 1048 { 1049 /* It is relatively rare case, then calculate on demand. */ 1050 return re_acquire_state_context (err, dfa, 1051 dfa->init_state->entrance_nodes, 1052 context); 1053 } 1054 else 1055 /* Must not happen? */ 1056 return dfa->init_state; 1057 } 1058 else 1059 return dfa->init_state; 1060 } 1061 1062 /* Check whether the regular expression match input string INPUT or not, 1063 and return the index where the matching end, return -1 if not match, 1064 or return -2 in case of an error. 1065 FL_LONGEST_MATCH means we want the POSIX longest matching. 1066 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the 1067 next place where we may want to try matching. 1068 Note that the matcher assume that the maching starts from the current 1069 index of the buffer. */ 1070 1071 static int 1072 internal_function 1073 check_matching (re_match_context_t *mctx, int fl_longest_match, 1074 int *p_match_first) 1075 { 1076 const re_dfa_t *const dfa = mctx->dfa; 1077 reg_errcode_t err; 1078 int match = 0; 1079 int match_last = -1; 1080 int cur_str_idx = re_string_cur_idx (&mctx->input); 1081 re_dfastate_t *cur_state; 1082 int at_init_state = p_match_first != NULL; 1083 int next_start_idx = cur_str_idx; 1084 1085 err = REG_NOERROR; 1086 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx); 1087 /* An initial state must not be NULL (invalid). */ 1088 if (BE (cur_state == NULL, 0)) 1089 { 1090 assert (err == REG_ESPACE); 1091 return -2; 1092 } 1093 1094 if (mctx->state_log != NULL) 1095 { 1096 mctx->state_log[cur_str_idx] = cur_state; 1097 1098 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 1099 later. E.g. Processing back references. */ 1100 if (BE (dfa->nbackref, 0)) 1101 { 1102 at_init_state = 0; 1103 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0); 1104 if (BE (err != REG_NOERROR, 0)) 1105 return err; 1106 1107 if (cur_state->has_backref) 1108 { 1109 err = transit_state_bkref (mctx, &cur_state->nodes); 1110 if (BE (err != REG_NOERROR, 0)) 1111 return err; 1112 } 1113 } 1114 } 1115 1116 /* If the RE accepts NULL string. */ 1117 if (BE (cur_state->halt, 0)) 1118 { 1119 if (!cur_state->has_constraint 1120 || check_halt_state_context (mctx, cur_state, cur_str_idx)) 1121 { 1122 if (!fl_longest_match) 1123 return cur_str_idx; 1124 else 1125 { 1126 match_last = cur_str_idx; 1127 match = 1; 1128 } 1129 } 1130 } 1131 1132 while (!re_string_eoi (&mctx->input)) 1133 { 1134 re_dfastate_t *old_state = cur_state; 1135 int next_char_idx = re_string_cur_idx (&mctx->input) + 1; 1136 1137 if (BE (next_char_idx >= mctx->input.bufs_len, 0) 1138 || (BE (next_char_idx >= mctx->input.valid_len, 0) 1139 && mctx->input.valid_len < mctx->input.len)) 1140 { 1141 err = extend_buffers (mctx); 1142 if (BE (err != REG_NOERROR, 0)) 1143 { 1144 assert (err == REG_ESPACE); 1145 return -2; 1146 } 1147 } 1148 1149 cur_state = transit_state (&err, mctx, cur_state); 1150 if (mctx->state_log != NULL) 1151 cur_state = merge_state_with_log (&err, mctx, cur_state); 1152 1153 if (cur_state == NULL) 1154 { 1155 /* Reached the invalid state or an error. Try to recover a valid 1156 state using the state log, if available and if we have not 1157 already found a valid (even if not the longest) match. */ 1158 if (BE (err != REG_NOERROR, 0)) 1159 return -2; 1160 1161 if (mctx->state_log == NULL 1162 || (match && !fl_longest_match) 1163 || (cur_state = find_recover_state (&err, mctx)) == NULL) 1164 break; 1165 } 1166 1167 if (BE (at_init_state, 0)) 1168 { 1169 if (old_state == cur_state) 1170 next_start_idx = next_char_idx; 1171 else 1172 at_init_state = 0; 1173 } 1174 1175 if (cur_state->halt) 1176 { 1177 /* Reached a halt state. 1178 Check the halt state can satisfy the current context. */ 1179 if (!cur_state->has_constraint 1180 || check_halt_state_context (mctx, cur_state, 1181 re_string_cur_idx (&mctx->input))) 1182 { 1183 /* We found an appropriate halt state. */ 1184 match_last = re_string_cur_idx (&mctx->input); 1185 match = 1; 1186 1187 /* We found a match, do not modify match_first below. */ 1188 p_match_first = NULL; 1189 if (!fl_longest_match) 1190 break; 1191 } 1192 } 1193 } 1194 1195 if (p_match_first) 1196 *p_match_first += next_start_idx; 1197 1198 return match_last; 1199 } 1200 1201 /* Check NODE match the current context. */ 1202 1203 static int 1204 internal_function 1205 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context) 1206 { 1207 re_token_type_t type = dfa->nodes[node].type; 1208 unsigned int constraint = dfa->nodes[node].constraint; 1209 if (type != END_OF_RE) 1210 return 0; 1211 if (!constraint) 1212 return 1; 1213 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1214 return 0; 1215 return 1; 1216 } 1217 1218 /* Check the halt state STATE match the current context. 1219 Return 0 if not match, if the node, STATE has, is a halt node and 1220 match the context, return the node. */ 1221 1222 static int 1223 internal_function 1224 check_halt_state_context (const re_match_context_t *mctx, 1225 const re_dfastate_t *state, int idx) 1226 { 1227 int i; 1228 unsigned int context; 1229 #ifdef DEBUG 1230 assert (state->halt); 1231 #endif 1232 context = re_string_context_at (&mctx->input, idx, mctx->eflags); 1233 for (i = 0; i < state->nodes.nelem; ++i) 1234 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context)) 1235 return state->nodes.elems[i]; 1236 return 0; 1237 } 1238 1239 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1240 corresponding to the DFA). 1241 Return the destination node, and update EPS_VIA_NODES, return -1 in case 1242 of errors. */ 1243 1244 static int 1245 internal_function 1246 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs, 1247 int *pidx, int node, re_node_set *eps_via_nodes, 1248 struct re_fail_stack_t *fs) 1249 { 1250 const re_dfa_t *const dfa = mctx->dfa; 1251 int i, err; 1252 if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1253 { 1254 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1255 re_node_set *edests = &dfa->edests[node]; 1256 int dest_node; 1257 err = re_node_set_insert (eps_via_nodes, node); 1258 if (BE (err < 0, 0)) 1259 return -2; 1260 /* Pick up a valid destination, or return -1 if none is found. */ 1261 for (dest_node = -1, i = 0; i < edests->nelem; ++i) 1262 { 1263 int candidate = edests->elems[i]; 1264 if (!re_node_set_contains (cur_nodes, candidate)) 1265 continue; 1266 if (dest_node == -1) 1267 dest_node = candidate; 1268 1269 else 1270 { 1271 /* In order to avoid infinite loop like "(a*)*", return the second 1272 epsilon-transition if the first was already considered. */ 1273 if (re_node_set_contains (eps_via_nodes, dest_node)) 1274 return candidate; 1275 1276 /* Otherwise, push the second epsilon-transition on the fail stack. */ 1277 else if (fs != NULL 1278 && push_fail_stack (fs, *pidx, candidate, nregs, regs, 1279 eps_via_nodes)) 1280 return -2; 1281 1282 /* We know we are going to exit. */ 1283 break; 1284 } 1285 } 1286 return dest_node; 1287 } 1288 else 1289 { 1290 int naccepted = 0; 1291 re_token_type_t type = dfa->nodes[node].type; 1292 1293 #ifdef RE_ENABLE_I18N 1294 if (dfa->nodes[node].accept_mb) 1295 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); 1296 else 1297 #endif /* RE_ENABLE_I18N */ 1298 if (type == OP_BACK_REF) 1299 { 1300 int subexp_idx = dfa->nodes[node].opr.idx + 1; 1301 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1302 if (fs != NULL) 1303 { 1304 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1305 return -1; 1306 else if (naccepted) 1307 { 1308 char *buf = (char *) re_string_get_buffer (&mctx->input); 1309 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1310 naccepted) != 0) 1311 return -1; 1312 } 1313 } 1314 1315 if (naccepted == 0) 1316 { 1317 int dest_node; 1318 err = re_node_set_insert (eps_via_nodes, node); 1319 if (BE (err < 0, 0)) 1320 return -2; 1321 dest_node = dfa->edests[node].elems[0]; 1322 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1323 dest_node)) 1324 return dest_node; 1325 } 1326 } 1327 1328 if (naccepted != 0 1329 || check_node_accept (mctx, dfa->nodes + node, *pidx)) 1330 { 1331 int dest_node = dfa->nexts[node]; 1332 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1333 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1334 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1335 dest_node))) 1336 return -1; 1337 re_node_set_empty (eps_via_nodes); 1338 return dest_node; 1339 } 1340 } 1341 return -1; 1342 } 1343 1344 static reg_errcode_t 1345 internal_function 1346 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node, 1347 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes) 1348 { 1349 reg_errcode_t err; 1350 int num = fs->num++; 1351 if (fs->num == fs->alloc) 1352 { 1353 struct re_fail_stack_ent_t *new_array; 1354 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1355 * fs->alloc * 2)); 1356 if (new_array == NULL) 1357 return REG_ESPACE; 1358 fs->alloc *= 2; 1359 fs->stack = new_array; 1360 } 1361 fs->stack[num].idx = str_idx; 1362 fs->stack[num].node = dest_node; 1363 fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1364 if (fs->stack[num].regs == NULL) 1365 return REG_ESPACE; 1366 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1367 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1368 return err; 1369 } 1370 1371 static int 1372 internal_function 1373 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, 1374 regmatch_t *regs, re_node_set *eps_via_nodes) 1375 { 1376 int num = --fs->num; 1377 assert (num >= 0); 1378 *pidx = fs->stack[num].idx; 1379 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1380 re_node_set_free (eps_via_nodes); 1381 re_free (fs->stack[num].regs); 1382 *eps_via_nodes = fs->stack[num].eps_via_nodes; 1383 return fs->stack[num].node; 1384 } 1385 1386 /* Set the positions where the subexpressions are starts/ends to registers 1387 PMATCH. 1388 Note: We assume that pmatch[0] is already set, and 1389 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ 1390 1391 static reg_errcode_t 1392 internal_function 1393 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, 1394 regmatch_t *pmatch, int fl_backtrack) 1395 { 1396 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 1397 int idx, cur_node; 1398 re_node_set eps_via_nodes; 1399 struct re_fail_stack_t *fs; 1400 struct re_fail_stack_t fs_body = { 0, 2, NULL }; 1401 regmatch_t *prev_idx_match; 1402 int prev_idx_match_malloced = 0; 1403 1404 #ifdef DEBUG 1405 assert (nmatch > 1); 1406 assert (mctx->state_log != NULL); 1407 #endif 1408 if (fl_backtrack) 1409 { 1410 fs = &fs_body; 1411 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1412 if (fs->stack == NULL) 1413 return REG_ESPACE; 1414 } 1415 else 1416 fs = NULL; 1417 1418 cur_node = dfa->init_node; 1419 re_node_set_init_empty (&eps_via_nodes); 1420 1421 if (__libc_use_alloca (nmatch * sizeof (regmatch_t))) 1422 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t)); 1423 else 1424 { 1425 prev_idx_match = re_malloc (regmatch_t, nmatch); 1426 if (prev_idx_match == NULL) 1427 { 1428 free_fail_stack_return (fs); 1429 return REG_ESPACE; 1430 } 1431 prev_idx_match_malloced = 1; 1432 } 1433 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1434 1435 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1436 { 1437 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); 1438 1439 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1440 { 1441 int reg_idx; 1442 if (fs) 1443 { 1444 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1445 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1446 break; 1447 if (reg_idx == nmatch) 1448 { 1449 re_node_set_free (&eps_via_nodes); 1450 if (prev_idx_match_malloced) 1451 re_free (prev_idx_match); 1452 return free_fail_stack_return (fs); 1453 } 1454 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1455 &eps_via_nodes); 1456 } 1457 else 1458 { 1459 re_node_set_free (&eps_via_nodes); 1460 if (prev_idx_match_malloced) 1461 re_free (prev_idx_match); 1462 return REG_NOERROR; 1463 } 1464 } 1465 1466 /* Proceed to next node. */ 1467 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, 1468 &eps_via_nodes, fs); 1469 1470 if (BE (cur_node < 0, 0)) 1471 { 1472 if (BE (cur_node == -2, 0)) 1473 { 1474 re_node_set_free (&eps_via_nodes); 1475 if (prev_idx_match_malloced) 1476 re_free (prev_idx_match); 1477 free_fail_stack_return (fs); 1478 return REG_ESPACE; 1479 } 1480 if (fs) 1481 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1482 &eps_via_nodes); 1483 else 1484 { 1485 re_node_set_free (&eps_via_nodes); 1486 if (prev_idx_match_malloced) 1487 re_free (prev_idx_match); 1488 return REG_NOMATCH; 1489 } 1490 } 1491 } 1492 re_node_set_free (&eps_via_nodes); 1493 if (prev_idx_match_malloced) 1494 re_free (prev_idx_match); 1495 return free_fail_stack_return (fs); 1496 } 1497 1498 static reg_errcode_t 1499 internal_function 1500 free_fail_stack_return (struct re_fail_stack_t *fs) 1501 { 1502 if (fs) 1503 { 1504 int fs_idx; 1505 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1506 { 1507 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1508 re_free (fs->stack[fs_idx].regs); 1509 } 1510 re_free (fs->stack); 1511 } 1512 return REG_NOERROR; 1513 } 1514 1515 static void 1516 internal_function 1517 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 1518 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch) 1519 { 1520 int type = dfa->nodes[cur_node].type; 1521 if (type == OP_OPEN_SUBEXP) 1522 { 1523 int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1524 1525 /* We are at the first node of this sub expression. */ 1526 if (reg_num < nmatch) 1527 { 1528 pmatch[reg_num].rm_so = cur_idx; 1529 pmatch[reg_num].rm_eo = -1; 1530 } 1531 } 1532 else if (type == OP_CLOSE_SUBEXP) 1533 { 1534 int reg_num = dfa->nodes[cur_node].opr.idx + 1; 1535 if (reg_num < nmatch) 1536 { 1537 /* We are at the last node of this sub expression. */ 1538 if (pmatch[reg_num].rm_so < cur_idx) 1539 { 1540 pmatch[reg_num].rm_eo = cur_idx; 1541 /* This is a non-empty match or we are not inside an optional 1542 subexpression. Accept this right away. */ 1543 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1544 } 1545 else 1546 { 1547 if (dfa->nodes[cur_node].opt_subexp 1548 && prev_idx_match[reg_num].rm_so != -1) 1549 /* We transited through an empty match for an optional 1550 subexpression, like (a?)*, and this is not the subexp's 1551 first match. Copy back the old content of the registers 1552 so that matches of an inner subexpression are undone as 1553 well, like in ((a?))*. */ 1554 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); 1555 else 1556 /* We completed a subexpression, but it may be part of 1557 an optional one, so do not update PREV_IDX_MATCH. */ 1558 pmatch[reg_num].rm_eo = cur_idx; 1559 } 1560 } 1561 } 1562 } 1563 1564 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1565 and sift the nodes in each states according to the following rules. 1566 Updated state_log will be wrote to STATE_LOG. 1567 1568 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1569 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1570 If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1571 the LAST_NODE, we throw away the node `a'. 1572 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1573 string `s' and transit to `b': 1574 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1575 away the node `a'. 1576 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1577 thrown away, we throw away the node `a'. 1578 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b': 1579 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1580 node `a'. 1581 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away, 1582 we throw away the node `a'. */ 1583 1584 #define STATE_NODE_CONTAINS(state,node) \ 1585 ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1586 1587 static reg_errcode_t 1588 internal_function 1589 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx) 1590 { 1591 reg_errcode_t err; 1592 int null_cnt = 0; 1593 int str_idx = sctx->last_str_idx; 1594 re_node_set cur_dest; 1595 1596 #ifdef DEBUG 1597 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1598 #endif 1599 1600 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1601 transit to the last_node and the last_node itself. */ 1602 err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1603 if (BE (err != REG_NOERROR, 0)) 1604 return err; 1605 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1606 if (BE (err != REG_NOERROR, 0)) 1607 goto free_return; 1608 1609 /* Then check each states in the state_log. */ 1610 while (str_idx > 0) 1611 { 1612 /* Update counters. */ 1613 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1614 if (null_cnt > mctx->max_mb_elem_len) 1615 { 1616 memset (sctx->sifted_states, '\0', 1617 sizeof (re_dfastate_t *) * str_idx); 1618 re_node_set_free (&cur_dest); 1619 return REG_NOERROR; 1620 } 1621 re_node_set_empty (&cur_dest); 1622 --str_idx; 1623 1624 if (mctx->state_log[str_idx]) 1625 { 1626 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); 1627 if (BE (err != REG_NOERROR, 0)) 1628 goto free_return; 1629 } 1630 1631 /* Add all the nodes which satisfy the following conditions: 1632 - It can epsilon transit to a node in CUR_DEST. 1633 - It is in CUR_SRC. 1634 And update state_log. */ 1635 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1636 if (BE (err != REG_NOERROR, 0)) 1637 goto free_return; 1638 } 1639 err = REG_NOERROR; 1640 free_return: 1641 re_node_set_free (&cur_dest); 1642 return err; 1643 } 1644 1645 static reg_errcode_t 1646 internal_function 1647 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, 1648 int str_idx, re_node_set *cur_dest) 1649 { 1650 const re_dfa_t *const dfa = mctx->dfa; 1651 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; 1652 int i; 1653 1654 /* Then build the next sifted state. 1655 We build the next sifted state on `cur_dest', and update 1656 `sifted_states[str_idx]' with `cur_dest'. 1657 Note: 1658 `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1659 `cur_src' points the node_set of the old `state_log[str_idx]' 1660 (with the epsilon nodes pre-filtered out). */ 1661 for (i = 0; i < cur_src->nelem; i++) 1662 { 1663 int prev_node = cur_src->elems[i]; 1664 int naccepted = 0; 1665 int ret; 1666 1667 #ifdef DEBUG 1668 re_token_type_t type = dfa->nodes[prev_node].type; 1669 assert (!IS_EPSILON_NODE (type)); 1670 #endif 1671 #ifdef RE_ENABLE_I18N 1672 /* If the node may accept `multi byte'. */ 1673 if (dfa->nodes[prev_node].accept_mb) 1674 naccepted = sift_states_iter_mb (mctx, sctx, prev_node, 1675 str_idx, sctx->last_str_idx); 1676 #endif /* RE_ENABLE_I18N */ 1677 1678 /* We don't check backreferences here. 1679 See update_cur_sifted_state(). */ 1680 if (!naccepted 1681 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx) 1682 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1683 dfa->nexts[prev_node])) 1684 naccepted = 1; 1685 1686 if (naccepted == 0) 1687 continue; 1688 1689 if (sctx->limits.nelem) 1690 { 1691 int to_idx = str_idx + naccepted; 1692 if (check_dst_limits (mctx, &sctx->limits, 1693 dfa->nexts[prev_node], to_idx, 1694 prev_node, str_idx)) 1695 continue; 1696 } 1697 ret = re_node_set_insert (cur_dest, prev_node); 1698 if (BE (ret == -1, 0)) 1699 return REG_ESPACE; 1700 } 1701 1702 return REG_NOERROR; 1703 } 1704 1705 /* Helper functions. */ 1706 1707 static reg_errcode_t 1708 internal_function 1709 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx) 1710 { 1711 int top = mctx->state_log_top; 1712 1713 if (next_state_log_idx >= mctx->input.bufs_len 1714 || (next_state_log_idx >= mctx->input.valid_len 1715 && mctx->input.valid_len < mctx->input.len)) 1716 { 1717 reg_errcode_t err; 1718 err = extend_buffers (mctx); 1719 if (BE (err != REG_NOERROR, 0)) 1720 return err; 1721 } 1722 1723 if (top < next_state_log_idx) 1724 { 1725 memset (mctx->state_log + top + 1, '\0', 1726 sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1727 mctx->state_log_top = next_state_log_idx; 1728 } 1729 return REG_NOERROR; 1730 } 1731 1732 static reg_errcode_t 1733 internal_function 1734 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst, 1735 re_dfastate_t **src, int num) 1736 { 1737 int st_idx; 1738 reg_errcode_t err; 1739 for (st_idx = 0; st_idx < num; ++st_idx) 1740 { 1741 if (dst[st_idx] == NULL) 1742 dst[st_idx] = src[st_idx]; 1743 else if (src[st_idx] != NULL) 1744 { 1745 re_node_set merged_set; 1746 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1747 &src[st_idx]->nodes); 1748 if (BE (err != REG_NOERROR, 0)) 1749 return err; 1750 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1751 re_node_set_free (&merged_set); 1752 if (BE (err != REG_NOERROR, 0)) 1753 return err; 1754 } 1755 } 1756 return REG_NOERROR; 1757 } 1758 1759 static reg_errcode_t 1760 internal_function 1761 update_cur_sifted_state (const re_match_context_t *mctx, 1762 re_sift_context_t *sctx, int str_idx, 1763 re_node_set *dest_nodes) 1764 { 1765 const re_dfa_t *const dfa = mctx->dfa; 1766 reg_errcode_t err = REG_NOERROR; 1767 const re_node_set *candidates; 1768 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL 1769 : &mctx->state_log[str_idx]->nodes); 1770 1771 if (dest_nodes->nelem == 0) 1772 sctx->sifted_states[str_idx] = NULL; 1773 else 1774 { 1775 if (candidates) 1776 { 1777 /* At first, add the nodes which can epsilon transit to a node in 1778 DEST_NODE. */ 1779 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1780 if (BE (err != REG_NOERROR, 0)) 1781 return err; 1782 1783 /* Then, check the limitations in the current sift_context. */ 1784 if (sctx->limits.nelem) 1785 { 1786 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1787 mctx->bkref_ents, str_idx); 1788 if (BE (err != REG_NOERROR, 0)) 1789 return err; 1790 } 1791 } 1792 1793 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1794 if (BE (err != REG_NOERROR, 0)) 1795 return err; 1796 } 1797 1798 if (candidates && mctx->state_log[str_idx]->has_backref) 1799 { 1800 err = sift_states_bkref (mctx, sctx, str_idx, candidates); 1801 if (BE (err != REG_NOERROR, 0)) 1802 return err; 1803 } 1804 return REG_NOERROR; 1805 } 1806 1807 static reg_errcode_t 1808 internal_function 1809 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes, 1810 const re_node_set *candidates) 1811 { 1812 reg_errcode_t err = REG_NOERROR; 1813 int i; 1814 1815 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes); 1816 if (BE (err != REG_NOERROR, 0)) 1817 return err; 1818 1819 if (!state->inveclosure.alloc) 1820 { 1821 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); 1822 if (BE (err != REG_NOERROR, 0)) 1823 return REG_ESPACE; 1824 for (i = 0; i < dest_nodes->nelem; i++) 1825 re_node_set_merge (&state->inveclosure, 1826 dfa->inveclosures + dest_nodes->elems[i]); 1827 } 1828 return re_node_set_add_intersect (dest_nodes, candidates, 1829 &state->inveclosure); 1830 } 1831 1832 static reg_errcode_t 1833 internal_function 1834 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes, 1835 const re_node_set *candidates) 1836 { 1837 int ecl_idx; 1838 reg_errcode_t err; 1839 re_node_set *inv_eclosure = dfa->inveclosures + node; 1840 re_node_set except_nodes; 1841 re_node_set_init_empty (&except_nodes); 1842 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1843 { 1844 int cur_node = inv_eclosure->elems[ecl_idx]; 1845 if (cur_node == node) 1846 continue; 1847 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1848 { 1849 int edst1 = dfa->edests[cur_node].elems[0]; 1850 int edst2 = ((dfa->edests[cur_node].nelem > 1) 1851 ? dfa->edests[cur_node].elems[1] : -1); 1852 if ((!re_node_set_contains (inv_eclosure, edst1) 1853 && re_node_set_contains (dest_nodes, edst1)) 1854 || (edst2 > 0 1855 && !re_node_set_contains (inv_eclosure, edst2) 1856 && re_node_set_contains (dest_nodes, edst2))) 1857 { 1858 err = re_node_set_add_intersect (&except_nodes, candidates, 1859 dfa->inveclosures + cur_node); 1860 if (BE (err != REG_NOERROR, 0)) 1861 { 1862 re_node_set_free (&except_nodes); 1863 return err; 1864 } 1865 } 1866 } 1867 } 1868 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1869 { 1870 int cur_node = inv_eclosure->elems[ecl_idx]; 1871 if (!re_node_set_contains (&except_nodes, cur_node)) 1872 { 1873 int idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1874 re_node_set_remove_at (dest_nodes, idx); 1875 } 1876 } 1877 re_node_set_free (&except_nodes); 1878 return REG_NOERROR; 1879 } 1880 1881 static int 1882 internal_function 1883 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits, 1884 int dst_node, int dst_idx, int src_node, int src_idx) 1885 { 1886 const re_dfa_t *const dfa = mctx->dfa; 1887 int lim_idx, src_pos, dst_pos; 1888 1889 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); 1890 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx); 1891 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1892 { 1893 int subexp_idx; 1894 struct re_backref_cache_entry *ent; 1895 ent = mctx->bkref_ents + limits->elems[lim_idx]; 1896 subexp_idx = dfa->nodes[ent->node].opr.idx; 1897 1898 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1899 subexp_idx, dst_node, dst_idx, 1900 dst_bkref_idx); 1901 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1902 subexp_idx, src_node, src_idx, 1903 src_bkref_idx); 1904 1905 /* In case of: 1906 <src> <dst> ( <subexp> ) 1907 ( <subexp> ) <src> <dst> 1908 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1909 if (src_pos == dst_pos) 1910 continue; /* This is unrelated limitation. */ 1911 else 1912 return 1; 1913 } 1914 return 0; 1915 } 1916 1917 static int 1918 internal_function 1919 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, 1920 int subexp_idx, int from_node, int bkref_idx) 1921 { 1922 const re_dfa_t *const dfa = mctx->dfa; 1923 const re_node_set *eclosures = dfa->eclosures + from_node; 1924 int node_idx; 1925 1926 /* Else, we are on the boundary: examine the nodes on the epsilon 1927 closure. */ 1928 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1929 { 1930 int node = eclosures->elems[node_idx]; 1931 switch (dfa->nodes[node].type) 1932 { 1933 case OP_BACK_REF: 1934 if (bkref_idx != -1) 1935 { 1936 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; 1937 do 1938 { 1939 int dst, cpos; 1940 1941 if (ent->node != node) 1942 continue; 1943 1944 if (subexp_idx < BITSET_WORD_BITS 1945 && !(ent->eps_reachable_subexps_map 1946 & ((bitset_word_t) 1 << subexp_idx))) 1947 continue; 1948 1949 /* Recurse trying to reach the OP_OPEN_SUBEXP and 1950 OP_CLOSE_SUBEXP cases below. But, if the 1951 destination node is the same node as the source 1952 node, don't recurse because it would cause an 1953 infinite loop: a regex that exhibits this behavior 1954 is ()\1*\1* */ 1955 dst = dfa->edests[node].elems[0]; 1956 if (dst == from_node) 1957 { 1958 if (boundaries & 1) 1959 return -1; 1960 else /* if (boundaries & 2) */ 1961 return 0; 1962 } 1963 1964 cpos = 1965 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 1966 dst, bkref_idx); 1967 if (cpos == -1 /* && (boundaries & 1) */) 1968 return -1; 1969 if (cpos == 0 && (boundaries & 2)) 1970 return 0; 1971 1972 if (subexp_idx < BITSET_WORD_BITS) 1973 ent->eps_reachable_subexps_map 1974 &= ~((bitset_word_t) 1 << subexp_idx); 1975 } 1976 while (ent++->more); 1977 } 1978 break; 1979 1980 case OP_OPEN_SUBEXP: 1981 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx) 1982 return -1; 1983 break; 1984 1985 case OP_CLOSE_SUBEXP: 1986 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx) 1987 return 0; 1988 break; 1989 1990 default: 1991 break; 1992 } 1993 } 1994 1995 return (boundaries & 2) ? 1 : 0; 1996 } 1997 1998 static int 1999 internal_function 2000 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit, 2001 int subexp_idx, int from_node, int str_idx, 2002 int bkref_idx) 2003 { 2004 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 2005 int boundaries; 2006 2007 /* If we are outside the range of the subexpression, return -1 or 1. */ 2008 if (str_idx < lim->subexp_from) 2009 return -1; 2010 2011 if (lim->subexp_to < str_idx) 2012 return 1; 2013 2014 /* If we are within the subexpression, return 0. */ 2015 boundaries = (str_idx == lim->subexp_from); 2016 boundaries |= (str_idx == lim->subexp_to) << 1; 2017 if (boundaries == 0) 2018 return 0; 2019 2020 /* Else, examine epsilon closure. */ 2021 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2022 from_node, bkref_idx); 2023 } 2024 2025 /* Check the limitations of sub expressions LIMITS, and remove the nodes 2026 which are against limitations from DEST_NODES. */ 2027 2028 static reg_errcode_t 2029 internal_function 2030 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes, 2031 const re_node_set *candidates, re_node_set *limits, 2032 struct re_backref_cache_entry *bkref_ents, int str_idx) 2033 { 2034 reg_errcode_t err; 2035 int node_idx, lim_idx; 2036 2037 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 2038 { 2039 int subexp_idx; 2040 struct re_backref_cache_entry *ent; 2041 ent = bkref_ents + limits->elems[lim_idx]; 2042 2043 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 2044 continue; /* This is unrelated limitation. */ 2045 2046 subexp_idx = dfa->nodes[ent->node].opr.idx; 2047 if (ent->subexp_to == str_idx) 2048 { 2049 int ops_node = -1; 2050 int cls_node = -1; 2051 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2052 { 2053 int node = dest_nodes->elems[node_idx]; 2054 re_token_type_t type = dfa->nodes[node].type; 2055 if (type == OP_OPEN_SUBEXP 2056 && subexp_idx == dfa->nodes[node].opr.idx) 2057 ops_node = node; 2058 else if (type == OP_CLOSE_SUBEXP 2059 && subexp_idx == dfa->nodes[node].opr.idx) 2060 cls_node = node; 2061 } 2062 2063 /* Check the limitation of the open subexpression. */ 2064 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 2065 if (ops_node >= 0) 2066 { 2067 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes, 2068 candidates); 2069 if (BE (err != REG_NOERROR, 0)) 2070 return err; 2071 } 2072 2073 /* Check the limitation of the close subexpression. */ 2074 if (cls_node >= 0) 2075 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2076 { 2077 int node = dest_nodes->elems[node_idx]; 2078 if (!re_node_set_contains (dfa->inveclosures + node, 2079 cls_node) 2080 && !re_node_set_contains (dfa->eclosures + node, 2081 cls_node)) 2082 { 2083 /* It is against this limitation. 2084 Remove it form the current sifted state. */ 2085 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2086 candidates); 2087 if (BE (err != REG_NOERROR, 0)) 2088 return err; 2089 --node_idx; 2090 } 2091 } 2092 } 2093 else /* (ent->subexp_to != str_idx) */ 2094 { 2095 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2096 { 2097 int node = dest_nodes->elems[node_idx]; 2098 re_token_type_t type = dfa->nodes[node].type; 2099 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 2100 { 2101 if (subexp_idx != dfa->nodes[node].opr.idx) 2102 continue; 2103 /* It is against this limitation. 2104 Remove it form the current sifted state. */ 2105 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2106 candidates); 2107 if (BE (err != REG_NOERROR, 0)) 2108 return err; 2109 } 2110 } 2111 } 2112 } 2113 return REG_NOERROR; 2114 } 2115 2116 static reg_errcode_t 2117 internal_function 2118 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, 2119 int str_idx, const re_node_set *candidates) 2120 { 2121 const re_dfa_t *const dfa = mctx->dfa; 2122 reg_errcode_t err; 2123 int node_idx, node; 2124 re_sift_context_t local_sctx; 2125 int first_idx = search_cur_bkref_entry (mctx, str_idx); 2126 2127 if (first_idx == -1) 2128 return REG_NOERROR; 2129 2130 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 2131 2132 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 2133 { 2134 int enabled_idx; 2135 re_token_type_t type; 2136 struct re_backref_cache_entry *entry; 2137 node = candidates->elems[node_idx]; 2138 type = dfa->nodes[node].type; 2139 /* Avoid infinite loop for the REs like "()\1+". */ 2140 if (node == sctx->last_node && str_idx == sctx->last_str_idx) 2141 continue; 2142 if (type != OP_BACK_REF) 2143 continue; 2144 2145 entry = mctx->bkref_ents + first_idx; 2146 enabled_idx = first_idx; 2147 do 2148 { 2149 int subexp_len; 2150 int to_idx; 2151 int dst_node; 2152 int ret; 2153 re_dfastate_t *cur_state; 2154 2155 if (entry->node != node) 2156 continue; 2157 subexp_len = entry->subexp_to - entry->subexp_from; 2158 to_idx = str_idx + subexp_len; 2159 dst_node = (subexp_len ? dfa->nexts[node] 2160 : dfa->edests[node].elems[0]); 2161 2162 if (to_idx > sctx->last_str_idx 2163 || sctx->sifted_states[to_idx] == NULL 2164 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node) 2165 || check_dst_limits (mctx, &sctx->limits, node, 2166 str_idx, dst_node, to_idx)) 2167 continue; 2168 2169 if (local_sctx.sifted_states == NULL) 2170 { 2171 local_sctx = *sctx; 2172 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits); 2173 if (BE (err != REG_NOERROR, 0)) 2174 goto free_return; 2175 } 2176 local_sctx.last_node = node; 2177 local_sctx.last_str_idx = str_idx; 2178 ret = re_node_set_insert (&local_sctx.limits, enabled_idx); 2179 if (BE (ret < 0, 0)) 2180 { 2181 err = REG_ESPACE; 2182 goto free_return; 2183 } 2184 cur_state = local_sctx.sifted_states[str_idx]; 2185 err = sift_states_backward (mctx, &local_sctx); 2186 if (BE (err != REG_NOERROR, 0)) 2187 goto free_return; 2188 if (sctx->limited_states != NULL) 2189 { 2190 err = merge_state_array (dfa, sctx->limited_states, 2191 local_sctx.sifted_states, 2192 str_idx + 1); 2193 if (BE (err != REG_NOERROR, 0)) 2194 goto free_return; 2195 } 2196 local_sctx.sifted_states[str_idx] = cur_state; 2197 re_node_set_remove (&local_sctx.limits, enabled_idx); 2198 2199 /* mctx->bkref_ents may have changed, reload the pointer. */ 2200 entry = mctx->bkref_ents + enabled_idx; 2201 } 2202 while (enabled_idx++, entry++->more); 2203 } 2204 err = REG_NOERROR; 2205 free_return: 2206 if (local_sctx.sifted_states != NULL) 2207 { 2208 re_node_set_free (&local_sctx.limits); 2209 } 2210 2211 return err; 2212 } 2213 2214 2215 #ifdef RE_ENABLE_I18N 2216 static int 2217 internal_function 2218 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, 2219 int node_idx, int str_idx, int max_str_idx) 2220 { 2221 const re_dfa_t *const dfa = mctx->dfa; 2222 int naccepted; 2223 /* Check the node can accept `multi byte'. */ 2224 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); 2225 if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2226 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2227 dfa->nexts[node_idx])) 2228 /* The node can't accept the `multi byte', or the 2229 destination was already thrown away, then the node 2230 could't accept the current input `multi byte'. */ 2231 naccepted = 0; 2232 /* Otherwise, it is sure that the node could accept 2233 `naccepted' bytes input. */ 2234 return naccepted; 2235 } 2236 #endif /* RE_ENABLE_I18N */ 2237 2238 2239 /* Functions for state transition. */ 2240 2241 /* Return the next state to which the current state STATE will transit by 2242 accepting the current input byte, and update STATE_LOG if necessary. 2243 If STATE can accept a multibyte char/collating element/back reference 2244 update the destination of STATE_LOG. */ 2245 2246 static re_dfastate_t * 2247 internal_function 2248 transit_state (reg_errcode_t *err, re_match_context_t *mctx, 2249 re_dfastate_t *state) 2250 { 2251 re_dfastate_t **trtable; 2252 unsigned char ch; 2253 2254 #ifdef RE_ENABLE_I18N 2255 /* If the current state can accept multibyte. */ 2256 if (BE (state->accept_mb, 0)) 2257 { 2258 *err = transit_state_mb (mctx, state); 2259 if (BE (*err != REG_NOERROR, 0)) 2260 return NULL; 2261 } 2262 #endif /* RE_ENABLE_I18N */ 2263 2264 /* Then decide the next state with the single byte. */ 2265 #if 0 2266 if (0) 2267 /* don't use transition table */ 2268 return transit_state_sb (err, mctx, state); 2269 #endif 2270 2271 /* Use transition table */ 2272 ch = re_string_fetch_byte (&mctx->input); 2273 for (;;) 2274 { 2275 trtable = state->trtable; 2276 if (BE (trtable != NULL, 1)) 2277 return trtable[ch]; 2278 2279 trtable = state->word_trtable; 2280 if (BE (trtable != NULL, 1)) 2281 { 2282 unsigned int context; 2283 context 2284 = re_string_context_at (&mctx->input, 2285 re_string_cur_idx (&mctx->input) - 1, 2286 mctx->eflags); 2287 if (IS_WORD_CONTEXT (context)) 2288 return trtable[ch + SBC_MAX]; 2289 else 2290 return trtable[ch]; 2291 } 2292 2293 if (!build_trtable (mctx->dfa, state)) 2294 { 2295 *err = REG_ESPACE; 2296 return NULL; 2297 } 2298 2299 /* Retry, we now have a transition table. */ 2300 } 2301 } 2302 2303 /* Update the state_log if we need */ 2304 re_dfastate_t * 2305 internal_function 2306 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx, 2307 re_dfastate_t *next_state) 2308 { 2309 const re_dfa_t *const dfa = mctx->dfa; 2310 int cur_idx = re_string_cur_idx (&mctx->input); 2311 2312 if (cur_idx > mctx->state_log_top) 2313 { 2314 mctx->state_log[cur_idx] = next_state; 2315 mctx->state_log_top = cur_idx; 2316 } 2317 else if (mctx->state_log[cur_idx] == 0) 2318 { 2319 mctx->state_log[cur_idx] = next_state; 2320 } 2321 else 2322 { 2323 re_dfastate_t *pstate; 2324 unsigned int context; 2325 re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2326 /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2327 the destination of a multibyte char/collating element/ 2328 back reference. Then the next state is the union set of 2329 these destinations and the results of the transition table. */ 2330 pstate = mctx->state_log[cur_idx]; 2331 log_nodes = pstate->entrance_nodes; 2332 if (next_state != NULL) 2333 { 2334 table_nodes = next_state->entrance_nodes; 2335 *err = re_node_set_init_union (&next_nodes, table_nodes, 2336 log_nodes); 2337 if (BE (*err != REG_NOERROR, 0)) 2338 return NULL; 2339 } 2340 else 2341 next_nodes = *log_nodes; 2342 /* Note: We already add the nodes of the initial state, 2343 then we don't need to add them here. */ 2344 2345 context = re_string_context_at (&mctx->input, 2346 re_string_cur_idx (&mctx->input) - 1, 2347 mctx->eflags); 2348 next_state = mctx->state_log[cur_idx] 2349 = re_acquire_state_context (err, dfa, &next_nodes, context); 2350 /* We don't need to check errors here, since the return value of 2351 this function is next_state and ERR is already set. */ 2352 2353 if (table_nodes != NULL) 2354 re_node_set_free (&next_nodes); 2355 } 2356 2357 if (BE (dfa->nbackref, 0) && next_state != NULL) 2358 { 2359 /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2360 later. We must check them here, since the back references in the 2361 next state might use them. */ 2362 *err = check_subexp_matching_top (mctx, &next_state->nodes, 2363 cur_idx); 2364 if (BE (*err != REG_NOERROR, 0)) 2365 return NULL; 2366 2367 /* If the next state has back references. */ 2368 if (next_state->has_backref) 2369 { 2370 *err = transit_state_bkref (mctx, &next_state->nodes); 2371 if (BE (*err != REG_NOERROR, 0)) 2372 return NULL; 2373 next_state = mctx->state_log[cur_idx]; 2374 } 2375 } 2376 2377 return next_state; 2378 } 2379 2380 /* Skip bytes in the input that correspond to part of a 2381 multi-byte match, then look in the log for a state 2382 from which to restart matching. */ 2383 re_dfastate_t * 2384 internal_function 2385 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx) 2386 { 2387 re_dfastate_t *cur_state; 2388 do 2389 { 2390 int max = mctx->state_log_top; 2391 int cur_str_idx = re_string_cur_idx (&mctx->input); 2392 2393 do 2394 { 2395 if (++cur_str_idx > max) 2396 return NULL; 2397 re_string_skip_bytes (&mctx->input, 1); 2398 } 2399 while (mctx->state_log[cur_str_idx] == NULL); 2400 2401 cur_state = merge_state_with_log (err, mctx, NULL); 2402 } 2403 while (*err == REG_NOERROR && cur_state == NULL); 2404 return cur_state; 2405 } 2406 2407 /* Helper functions for transit_state. */ 2408 2409 /* From the node set CUR_NODES, pick up the nodes whose types are 2410 OP_OPEN_SUBEXP and which have corresponding back references in the regular 2411 expression. And register them to use them later for evaluating the 2412 correspoding back references. */ 2413 2414 static reg_errcode_t 2415 internal_function 2416 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, 2417 int str_idx) 2418 { 2419 const re_dfa_t *const dfa = mctx->dfa; 2420 int node_idx; 2421 reg_errcode_t err; 2422 2423 /* TODO: This isn't efficient. 2424 Because there might be more than one nodes whose types are 2425 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2426 nodes. 2427 E.g. RE: (a){2} */ 2428 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2429 { 2430 int node = cur_nodes->elems[node_idx]; 2431 if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2432 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS 2433 && (dfa->used_bkref_map 2434 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx))) 2435 { 2436 err = match_ctx_add_subtop (mctx, node, str_idx); 2437 if (BE (err != REG_NOERROR, 0)) 2438 return err; 2439 } 2440 } 2441 return REG_NOERROR; 2442 } 2443 2444 #if 0 2445 /* Return the next state to which the current state STATE will transit by 2446 accepting the current input byte. */ 2447 2448 static re_dfastate_t * 2449 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, 2450 re_dfastate_t *state) 2451 { 2452 const re_dfa_t *const dfa = mctx->dfa; 2453 re_node_set next_nodes; 2454 re_dfastate_t *next_state; 2455 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); 2456 unsigned int context; 2457 2458 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2459 if (BE (*err != REG_NOERROR, 0)) 2460 return NULL; 2461 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2462 { 2463 int cur_node = state->nodes.elems[node_cnt]; 2464 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx)) 2465 { 2466 *err = re_node_set_merge (&next_nodes, 2467 dfa->eclosures + dfa->nexts[cur_node]); 2468 if (BE (*err != REG_NOERROR, 0)) 2469 { 2470 re_node_set_free (&next_nodes); 2471 return NULL; 2472 } 2473 } 2474 } 2475 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags); 2476 next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2477 /* We don't need to check errors here, since the return value of 2478 this function is next_state and ERR is already set. */ 2479 2480 re_node_set_free (&next_nodes); 2481 re_string_skip_bytes (&mctx->input, 1); 2482 return next_state; 2483 } 2484 #endif 2485 2486 #ifdef RE_ENABLE_I18N 2487 static reg_errcode_t 2488 internal_function 2489 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) 2490 { 2491 const re_dfa_t *const dfa = mctx->dfa; 2492 reg_errcode_t err; 2493 int i; 2494 2495 for (i = 0; i < pstate->nodes.nelem; ++i) 2496 { 2497 re_node_set dest_nodes, *new_nodes; 2498 int cur_node_idx = pstate->nodes.elems[i]; 2499 int naccepted, dest_idx; 2500 unsigned int context; 2501 re_dfastate_t *dest_state; 2502 2503 if (!dfa->nodes[cur_node_idx].accept_mb) 2504 continue; 2505 2506 if (dfa->nodes[cur_node_idx].constraint) 2507 { 2508 context = re_string_context_at (&mctx->input, 2509 re_string_cur_idx (&mctx->input), 2510 mctx->eflags); 2511 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2512 context)) 2513 continue; 2514 } 2515 2516 /* How many bytes the node can accept? */ 2517 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input, 2518 re_string_cur_idx (&mctx->input)); 2519 if (naccepted == 0) 2520 continue; 2521 2522 /* The node can accepts `naccepted' bytes. */ 2523 dest_idx = re_string_cur_idx (&mctx->input) + naccepted; 2524 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2525 : mctx->max_mb_elem_len); 2526 err = clean_state_log_if_needed (mctx, dest_idx); 2527 if (BE (err != REG_NOERROR, 0)) 2528 return err; 2529 #ifdef DEBUG 2530 assert (dfa->nexts[cur_node_idx] != -1); 2531 #endif 2532 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx]; 2533 2534 dest_state = mctx->state_log[dest_idx]; 2535 if (dest_state == NULL) 2536 dest_nodes = *new_nodes; 2537 else 2538 { 2539 err = re_node_set_init_union (&dest_nodes, 2540 dest_state->entrance_nodes, new_nodes); 2541 if (BE (err != REG_NOERROR, 0)) 2542 return err; 2543 } 2544 context = re_string_context_at (&mctx->input, dest_idx - 1, 2545 mctx->eflags); 2546 mctx->state_log[dest_idx] 2547 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2548 if (dest_state != NULL) 2549 re_node_set_free (&dest_nodes); 2550 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2551 return err; 2552 } 2553 return REG_NOERROR; 2554 } 2555 #endif /* RE_ENABLE_I18N */ 2556 2557 static reg_errcode_t 2558 internal_function 2559 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes) 2560 { 2561 const re_dfa_t *const dfa = mctx->dfa; 2562 reg_errcode_t err; 2563 int i; 2564 int cur_str_idx = re_string_cur_idx (&mctx->input); 2565 2566 for (i = 0; i < nodes->nelem; ++i) 2567 { 2568 int dest_str_idx, prev_nelem, bkc_idx; 2569 int node_idx = nodes->elems[i]; 2570 unsigned int context; 2571 const re_token_t *node = dfa->nodes + node_idx; 2572 re_node_set *new_dest_nodes; 2573 2574 /* Check whether `node' is a backreference or not. */ 2575 if (node->type != OP_BACK_REF) 2576 continue; 2577 2578 if (node->constraint) 2579 { 2580 context = re_string_context_at (&mctx->input, cur_str_idx, 2581 mctx->eflags); 2582 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2583 continue; 2584 } 2585 2586 /* `node' is a backreference. 2587 Check the substring which the substring matched. */ 2588 bkc_idx = mctx->nbkref_ents; 2589 err = get_subexp (mctx, node_idx, cur_str_idx); 2590 if (BE (err != REG_NOERROR, 0)) 2591 goto free_return; 2592 2593 /* And add the epsilon closures (which is `new_dest_nodes') of 2594 the backreference to appropriate state_log. */ 2595 #ifdef DEBUG 2596 assert (dfa->nexts[node_idx] != -1); 2597 #endif 2598 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2599 { 2600 int subexp_len; 2601 re_dfastate_t *dest_state; 2602 struct re_backref_cache_entry *bkref_ent; 2603 bkref_ent = mctx->bkref_ents + bkc_idx; 2604 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2605 continue; 2606 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2607 new_dest_nodes = (subexp_len == 0 2608 ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2609 : dfa->eclosures + dfa->nexts[node_idx]); 2610 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2611 - bkref_ent->subexp_from); 2612 context = re_string_context_at (&mctx->input, dest_str_idx - 1, 2613 mctx->eflags); 2614 dest_state = mctx->state_log[dest_str_idx]; 2615 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2616 : mctx->state_log[cur_str_idx]->nodes.nelem); 2617 /* Add `new_dest_node' to state_log. */ 2618 if (dest_state == NULL) 2619 { 2620 mctx->state_log[dest_str_idx] 2621 = re_acquire_state_context (&err, dfa, new_dest_nodes, 2622 context); 2623 if (BE (mctx->state_log[dest_str_idx] == NULL 2624 && err != REG_NOERROR, 0)) 2625 goto free_return; 2626 } 2627 else 2628 { 2629 re_node_set dest_nodes; 2630 err = re_node_set_init_union (&dest_nodes, 2631 dest_state->entrance_nodes, 2632 new_dest_nodes); 2633 if (BE (err != REG_NOERROR, 0)) 2634 { 2635 re_node_set_free (&dest_nodes); 2636 goto free_return; 2637 } 2638 mctx->state_log[dest_str_idx] 2639 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2640 re_node_set_free (&dest_nodes); 2641 if (BE (mctx->state_log[dest_str_idx] == NULL 2642 && err != REG_NOERROR, 0)) 2643 goto free_return; 2644 } 2645 /* We need to check recursively if the backreference can epsilon 2646 transit. */ 2647 if (subexp_len == 0 2648 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2649 { 2650 err = check_subexp_matching_top (mctx, new_dest_nodes, 2651 cur_str_idx); 2652 if (BE (err != REG_NOERROR, 0)) 2653 goto free_return; 2654 err = transit_state_bkref (mctx, new_dest_nodes); 2655 if (BE (err != REG_NOERROR, 0)) 2656 goto free_return; 2657 } 2658 } 2659 } 2660 err = REG_NOERROR; 2661 free_return: 2662 return err; 2663 } 2664 2665 /* Enumerate all the candidates which the backreference BKREF_NODE can match 2666 at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2667 Note that we might collect inappropriate candidates here. 2668 However, the cost of checking them strictly here is too high, then we 2669 delay these checking for prune_impossible_nodes(). */ 2670 2671 static reg_errcode_t 2672 internal_function 2673 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx) 2674 { 2675 const re_dfa_t *const dfa = mctx->dfa; 2676 int subexp_num, sub_top_idx; 2677 const char *buf = (const char *) re_string_get_buffer (&mctx->input); 2678 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2679 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2680 if (cache_idx != -1) 2681 { 2682 const struct re_backref_cache_entry *entry 2683 = mctx->bkref_ents + cache_idx; 2684 do 2685 if (entry->node == bkref_node) 2686 return REG_NOERROR; /* We already checked it. */ 2687 while (entry++->more); 2688 } 2689 2690 subexp_num = dfa->nodes[bkref_node].opr.idx; 2691 2692 /* For each sub expression */ 2693 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2694 { 2695 reg_errcode_t err; 2696 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2697 re_sub_match_last_t *sub_last; 2698 int sub_last_idx, sl_str, bkref_str_off; 2699 2700 if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2701 continue; /* It isn't related. */ 2702 2703 sl_str = sub_top->str_idx; 2704 bkref_str_off = bkref_str_idx; 2705 /* At first, check the last node of sub expressions we already 2706 evaluated. */ 2707 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2708 { 2709 int sl_str_diff; 2710 sub_last = sub_top->lasts[sub_last_idx]; 2711 sl_str_diff = sub_last->str_idx - sl_str; 2712 /* The matched string by the sub expression match with the substring 2713 at the back reference? */ 2714 if (sl_str_diff > 0) 2715 { 2716 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0)) 2717 { 2718 /* Not enough chars for a successful match. */ 2719 if (bkref_str_off + sl_str_diff > mctx->input.len) 2720 break; 2721 2722 err = clean_state_log_if_needed (mctx, 2723 bkref_str_off 2724 + sl_str_diff); 2725 if (BE (err != REG_NOERROR, 0)) 2726 return err; 2727 buf = (const char *) re_string_get_buffer (&mctx->input); 2728 } 2729 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) 2730 /* We don't need to search this sub expression any more. */ 2731 break; 2732 } 2733 bkref_str_off += sl_str_diff; 2734 sl_str += sl_str_diff; 2735 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2736 bkref_str_idx); 2737 2738 /* Reload buf, since the preceding call might have reallocated 2739 the buffer. */ 2740 buf = (const char *) re_string_get_buffer (&mctx->input); 2741 2742 if (err == REG_NOMATCH) 2743 continue; 2744 if (BE (err != REG_NOERROR, 0)) 2745 return err; 2746 } 2747 2748 if (sub_last_idx < sub_top->nlasts) 2749 continue; 2750 if (sub_last_idx > 0) 2751 ++sl_str; 2752 /* Then, search for the other last nodes of the sub expression. */ 2753 for (; sl_str <= bkref_str_idx; ++sl_str) 2754 { 2755 int cls_node, sl_str_off; 2756 const re_node_set *nodes; 2757 sl_str_off = sl_str - sub_top->str_idx; 2758 /* The matched string by the sub expression match with the substring 2759 at the back reference? */ 2760 if (sl_str_off > 0) 2761 { 2762 if (BE (bkref_str_off >= mctx->input.valid_len, 0)) 2763 { 2764 /* If we are at the end of the input, we cannot match. */ 2765 if (bkref_str_off >= mctx->input.len) 2766 break; 2767 2768 err = extend_buffers (mctx); 2769 if (BE (err != REG_NOERROR, 0)) 2770 return err; 2771 2772 buf = (const char *) re_string_get_buffer (&mctx->input); 2773 } 2774 if (buf [bkref_str_off++] != buf[sl_str - 1]) 2775 break; /* We don't need to search this sub expression 2776 any more. */ 2777 } 2778 if (mctx->state_log[sl_str] == NULL) 2779 continue; 2780 /* Does this state have a ')' of the sub expression? */ 2781 nodes = &mctx->state_log[sl_str]->nodes; 2782 cls_node = find_subexp_node (dfa, nodes, subexp_num, 2783 OP_CLOSE_SUBEXP); 2784 if (cls_node == -1) 2785 continue; /* No. */ 2786 if (sub_top->path == NULL) 2787 { 2788 sub_top->path = calloc (sizeof (state_array_t), 2789 sl_str - sub_top->str_idx + 1); 2790 if (sub_top->path == NULL) 2791 return REG_ESPACE; 2792 } 2793 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2794 in the current context? */ 2795 err = check_arrival (mctx, sub_top->path, sub_top->node, 2796 sub_top->str_idx, cls_node, sl_str, 2797 OP_CLOSE_SUBEXP); 2798 if (err == REG_NOMATCH) 2799 continue; 2800 if (BE (err != REG_NOERROR, 0)) 2801 return err; 2802 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2803 if (BE (sub_last == NULL, 0)) 2804 return REG_ESPACE; 2805 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2806 bkref_str_idx); 2807 if (err == REG_NOMATCH) 2808 continue; 2809 } 2810 } 2811 return REG_NOERROR; 2812 } 2813 2814 /* Helper functions for get_subexp(). */ 2815 2816 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2817 If it can arrive, register the sub expression expressed with SUB_TOP 2818 and SUB_LAST. */ 2819 2820 static reg_errcode_t 2821 internal_function 2822 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top, 2823 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str) 2824 { 2825 reg_errcode_t err; 2826 int to_idx; 2827 /* Can the subexpression arrive the back reference? */ 2828 err = check_arrival (mctx, &sub_last->path, sub_last->node, 2829 sub_last->str_idx, bkref_node, bkref_str, 2830 OP_OPEN_SUBEXP); 2831 if (err != REG_NOERROR) 2832 return err; 2833 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2834 sub_last->str_idx); 2835 if (BE (err != REG_NOERROR, 0)) 2836 return err; 2837 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2838 return clean_state_log_if_needed (mctx, to_idx); 2839 } 2840 2841 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2842 Search '(' if FL_OPEN, or search ')' otherwise. 2843 TODO: This function isn't efficient... 2844 Because there might be more than one nodes whose types are 2845 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2846 nodes. 2847 E.g. RE: (a){2} */ 2848 2849 static int 2850 internal_function 2851 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 2852 int subexp_idx, int type) 2853 { 2854 int cls_idx; 2855 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2856 { 2857 int cls_node = nodes->elems[cls_idx]; 2858 const re_token_t *node = dfa->nodes + cls_node; 2859 if (node->type == type 2860 && node->opr.idx == subexp_idx) 2861 return cls_node; 2862 } 2863 return -1; 2864 } 2865 2866 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2867 LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2868 heavily reused. 2869 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2870 2871 static reg_errcode_t 2872 internal_function 2873 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node, 2874 int top_str, int last_node, int last_str, int type) 2875 { 2876 const re_dfa_t *const dfa = mctx->dfa; 2877 reg_errcode_t err = REG_NOERROR; 2878 int subexp_num, backup_cur_idx, str_idx, null_cnt; 2879 re_dfastate_t *cur_state = NULL; 2880 re_node_set *cur_nodes, next_nodes; 2881 re_dfastate_t **backup_state_log; 2882 unsigned int context; 2883 2884 subexp_num = dfa->nodes[top_node].opr.idx; 2885 /* Extend the buffer if we need. */ 2886 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0)) 2887 { 2888 re_dfastate_t **new_array; 2889 int old_alloc = path->alloc; 2890 path->alloc += last_str + mctx->max_mb_elem_len + 1; 2891 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc); 2892 if (BE (new_array == NULL, 0)) 2893 { 2894 path->alloc = old_alloc; 2895 return REG_ESPACE; 2896 } 2897 path->array = new_array; 2898 memset (new_array + old_alloc, '\0', 2899 sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2900 } 2901 2902 str_idx = path->next_idx ?: top_str; 2903 2904 /* Temporary modify MCTX. */ 2905 backup_state_log = mctx->state_log; 2906 backup_cur_idx = mctx->input.cur_idx; 2907 mctx->state_log = path->array; 2908 mctx->input.cur_idx = str_idx; 2909 2910 /* Setup initial node set. */ 2911 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2912 if (str_idx == top_str) 2913 { 2914 err = re_node_set_init_1 (&next_nodes, top_node); 2915 if (BE (err != REG_NOERROR, 0)) 2916 return err; 2917 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2918 if (BE (err != REG_NOERROR, 0)) 2919 { 2920 re_node_set_free (&next_nodes); 2921 return err; 2922 } 2923 } 2924 else 2925 { 2926 cur_state = mctx->state_log[str_idx]; 2927 if (cur_state && cur_state->has_backref) 2928 { 2929 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2930 if (BE (err != REG_NOERROR, 0)) 2931 return err; 2932 } 2933 else 2934 re_node_set_init_empty (&next_nodes); 2935 } 2936 if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2937 { 2938 if (next_nodes.nelem) 2939 { 2940 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2941 subexp_num, type); 2942 if (BE (err != REG_NOERROR, 0)) 2943 { 2944 re_node_set_free (&next_nodes); 2945 return err; 2946 } 2947 } 2948 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2949 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2950 { 2951 re_node_set_free (&next_nodes); 2952 return err; 2953 } 2954 mctx->state_log[str_idx] = cur_state; 2955 } 2956 2957 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 2958 { 2959 re_node_set_empty (&next_nodes); 2960 if (mctx->state_log[str_idx + 1]) 2961 { 2962 err = re_node_set_merge (&next_nodes, 2963 &mctx->state_log[str_idx + 1]->nodes); 2964 if (BE (err != REG_NOERROR, 0)) 2965 { 2966 re_node_set_free (&next_nodes); 2967 return err; 2968 } 2969 } 2970 if (cur_state) 2971 { 2972 err = check_arrival_add_next_nodes (mctx, str_idx, 2973 &cur_state->non_eps_nodes, 2974 &next_nodes); 2975 if (BE (err != REG_NOERROR, 0)) 2976 { 2977 re_node_set_free (&next_nodes); 2978 return err; 2979 } 2980 } 2981 ++str_idx; 2982 if (next_nodes.nelem) 2983 { 2984 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2985 if (BE (err != REG_NOERROR, 0)) 2986 { 2987 re_node_set_free (&next_nodes); 2988 return err; 2989 } 2990 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 2991 subexp_num, type); 2992 if (BE (err != REG_NOERROR, 0)) 2993 { 2994 re_node_set_free (&next_nodes); 2995 return err; 2996 } 2997 } 2998 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2999 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 3000 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 3001 { 3002 re_node_set_free (&next_nodes); 3003 return err; 3004 } 3005 mctx->state_log[str_idx] = cur_state; 3006 null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 3007 } 3008 re_node_set_free (&next_nodes); 3009 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 3010 : &mctx->state_log[last_str]->nodes); 3011 path->next_idx = str_idx; 3012 3013 /* Fix MCTX. */ 3014 mctx->state_log = backup_state_log; 3015 mctx->input.cur_idx = backup_cur_idx; 3016 3017 /* Then check the current node set has the node LAST_NODE. */ 3018 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node)) 3019 return REG_NOERROR; 3020 3021 return REG_NOMATCH; 3022 } 3023 3024 /* Helper functions for check_arrival. */ 3025 3026 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 3027 to NEXT_NODES. 3028 TODO: This function is similar to the functions transit_state*(), 3029 however this function has many additional works. 3030 Can't we unify them? */ 3031 3032 static reg_errcode_t 3033 internal_function 3034 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx, 3035 re_node_set *cur_nodes, re_node_set *next_nodes) 3036 { 3037 const re_dfa_t *const dfa = mctx->dfa; 3038 int result; 3039 int cur_idx; 3040 reg_errcode_t err = REG_NOERROR; 3041 re_node_set union_set; 3042 re_node_set_init_empty (&union_set); 3043 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 3044 { 3045 int naccepted = 0; 3046 int cur_node = cur_nodes->elems[cur_idx]; 3047 #ifdef DEBUG 3048 re_token_type_t type = dfa->nodes[cur_node].type; 3049 assert (!IS_EPSILON_NODE (type)); 3050 #endif 3051 #ifdef RE_ENABLE_I18N 3052 /* If the node may accept `multi byte'. */ 3053 if (dfa->nodes[cur_node].accept_mb) 3054 { 3055 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input, 3056 str_idx); 3057 if (naccepted > 1) 3058 { 3059 re_dfastate_t *dest_state; 3060 int next_node = dfa->nexts[cur_node]; 3061 int next_idx = str_idx + naccepted; 3062 dest_state = mctx->state_log[next_idx]; 3063 re_node_set_empty (&union_set); 3064 if (dest_state) 3065 { 3066 err = re_node_set_merge (&union_set, &dest_state->nodes); 3067 if (BE (err != REG_NOERROR, 0)) 3068 { 3069 re_node_set_free (&union_set); 3070 return err; 3071 } 3072 } 3073 result = re_node_set_insert (&union_set, next_node); 3074 if (BE (result < 0, 0)) 3075 { 3076 re_node_set_free (&union_set); 3077 return REG_ESPACE; 3078 } 3079 mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 3080 &union_set); 3081 if (BE (mctx->state_log[next_idx] == NULL 3082 && err != REG_NOERROR, 0)) 3083 { 3084 re_node_set_free (&union_set); 3085 return err; 3086 } 3087 } 3088 } 3089 #endif /* RE_ENABLE_I18N */ 3090 if (naccepted 3091 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) 3092 { 3093 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 3094 if (BE (result < 0, 0)) 3095 { 3096 re_node_set_free (&union_set); 3097 return REG_ESPACE; 3098 } 3099 } 3100 } 3101 re_node_set_free (&union_set); 3102 return REG_NOERROR; 3103 } 3104 3105 /* For all the nodes in CUR_NODES, add the epsilon closures of them to 3106 CUR_NODES, however exclude the nodes which are: 3107 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 3108 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 3109 */ 3110 3111 static reg_errcode_t 3112 internal_function 3113 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes, 3114 int ex_subexp, int type) 3115 { 3116 reg_errcode_t err; 3117 int idx, outside_node; 3118 re_node_set new_nodes; 3119 #ifdef DEBUG 3120 assert (cur_nodes->nelem); 3121 #endif 3122 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 3123 if (BE (err != REG_NOERROR, 0)) 3124 return err; 3125 /* Create a new node set NEW_NODES with the nodes which are epsilon 3126 closures of the node in CUR_NODES. */ 3127 3128 for (idx = 0; idx < cur_nodes->nelem; ++idx) 3129 { 3130 int cur_node = cur_nodes->elems[idx]; 3131 const re_node_set *eclosure = dfa->eclosures + cur_node; 3132 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); 3133 if (outside_node == -1) 3134 { 3135 /* There are no problematic nodes, just merge them. */ 3136 err = re_node_set_merge (&new_nodes, eclosure); 3137 if (BE (err != REG_NOERROR, 0)) 3138 { 3139 re_node_set_free (&new_nodes); 3140 return err; 3141 } 3142 } 3143 else 3144 { 3145 /* There are problematic nodes, re-calculate incrementally. */ 3146 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 3147 ex_subexp, type); 3148 if (BE (err != REG_NOERROR, 0)) 3149 { 3150 re_node_set_free (&new_nodes); 3151 return err; 3152 } 3153 } 3154 } 3155 re_node_set_free (cur_nodes); 3156 *cur_nodes = new_nodes; 3157 return REG_NOERROR; 3158 } 3159 3160 /* Helper function for check_arrival_expand_ecl. 3161 Check incrementally the epsilon closure of TARGET, and if it isn't 3162 problematic append it to DST_NODES. */ 3163 3164 static reg_errcode_t 3165 internal_function 3166 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes, 3167 int target, int ex_subexp, int type) 3168 { 3169 int cur_node; 3170 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 3171 { 3172 int err; 3173 3174 if (dfa->nodes[cur_node].type == type 3175 && dfa->nodes[cur_node].opr.idx == ex_subexp) 3176 { 3177 if (type == OP_CLOSE_SUBEXP) 3178 { 3179 err = re_node_set_insert (dst_nodes, cur_node); 3180 if (BE (err == -1, 0)) 3181 return REG_ESPACE; 3182 } 3183 break; 3184 } 3185 err = re_node_set_insert (dst_nodes, cur_node); 3186 if (BE (err == -1, 0)) 3187 return REG_ESPACE; 3188 if (dfa->edests[cur_node].nelem == 0) 3189 break; 3190 if (dfa->edests[cur_node].nelem == 2) 3191 { 3192 err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 3193 dfa->edests[cur_node].elems[1], 3194 ex_subexp, type); 3195 if (BE (err != REG_NOERROR, 0)) 3196 return err; 3197 } 3198 cur_node = dfa->edests[cur_node].elems[0]; 3199 } 3200 return REG_NOERROR; 3201 } 3202 3203 3204 /* For all the back references in the current state, calculate the 3205 destination of the back references by the appropriate entry 3206 in MCTX->BKREF_ENTS. */ 3207 3208 static reg_errcode_t 3209 internal_function 3210 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, 3211 int cur_str, int subexp_num, int type) 3212 { 3213 const re_dfa_t *const dfa = mctx->dfa; 3214 reg_errcode_t err; 3215 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3216 struct re_backref_cache_entry *ent; 3217 3218 if (cache_idx_start == -1) 3219 return REG_NOERROR; 3220 3221 restart: 3222 ent = mctx->bkref_ents + cache_idx_start; 3223 do 3224 { 3225 int to_idx, next_node; 3226 3227 /* Is this entry ENT is appropriate? */ 3228 if (!re_node_set_contains (cur_nodes, ent->node)) 3229 continue; /* No. */ 3230 3231 to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3232 /* Calculate the destination of the back reference, and append it 3233 to MCTX->STATE_LOG. */ 3234 if (to_idx == cur_str) 3235 { 3236 /* The backreference did epsilon transit, we must re-check all the 3237 node in the current state. */ 3238 re_node_set new_dests; 3239 reg_errcode_t err2, err3; 3240 next_node = dfa->edests[ent->node].elems[0]; 3241 if (re_node_set_contains (cur_nodes, next_node)) 3242 continue; 3243 err = re_node_set_init_1 (&new_dests, next_node); 3244 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type); 3245 err3 = re_node_set_merge (cur_nodes, &new_dests); 3246 re_node_set_free (&new_dests); 3247 if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3248 || err3 != REG_NOERROR, 0)) 3249 { 3250 err = (err != REG_NOERROR ? err 3251 : (err2 != REG_NOERROR ? err2 : err3)); 3252 return err; 3253 } 3254 /* TODO: It is still inefficient... */ 3255 goto restart; 3256 } 3257 else 3258 { 3259 re_node_set union_set; 3260 next_node = dfa->nexts[ent->node]; 3261 if (mctx->state_log[to_idx]) 3262 { 3263 int ret; 3264 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3265 next_node)) 3266 continue; 3267 err = re_node_set_init_copy (&union_set, 3268 &mctx->state_log[to_idx]->nodes); 3269 ret = re_node_set_insert (&union_set, next_node); 3270 if (BE (err != REG_NOERROR || ret < 0, 0)) 3271 { 3272 re_node_set_free (&union_set); 3273 err = err != REG_NOERROR ? err : REG_ESPACE; 3274 return err; 3275 } 3276 } 3277 else 3278 { 3279 err = re_node_set_init_1 (&union_set, next_node); 3280 if (BE (err != REG_NOERROR, 0)) 3281 return err; 3282 } 3283 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3284 re_node_set_free (&union_set); 3285 if (BE (mctx->state_log[to_idx] == NULL 3286 && err != REG_NOERROR, 0)) 3287 return err; 3288 } 3289 } 3290 while (ent++->more); 3291 return REG_NOERROR; 3292 } 3293 3294 /* Build transition table for the state. 3295 Return 1 if succeeded, otherwise return NULL. */ 3296 3297 static int 3298 internal_function 3299 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) 3300 { 3301 reg_errcode_t err; 3302 int i, j, ch, need_word_trtable = 0; 3303 bitset_word_t elem, mask; 3304 bool dests_node_malloced = false; 3305 bool dest_states_malloced = false; 3306 int ndests; /* Number of the destination states from `state'. */ 3307 re_dfastate_t **trtable; 3308 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3309 re_node_set follows, *dests_node; 3310 bitset_t *dests_ch; 3311 bitset_t acceptable; 3312 3313 struct dests_alloc 3314 { 3315 re_node_set dests_node[SBC_MAX]; 3316 bitset_t dests_ch[SBC_MAX]; 3317 } *dests_alloc; 3318 3319 /* We build DFA states which corresponds to the destination nodes 3320 from `state'. `dests_node[i]' represents the nodes which i-th 3321 destination state contains, and `dests_ch[i]' represents the 3322 characters which i-th destination state accepts. */ 3323 if (__libc_use_alloca (sizeof (struct dests_alloc))) 3324 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc)); 3325 else 3326 { 3327 dests_alloc = re_malloc (struct dests_alloc, 1); 3328 if (BE (dests_alloc == NULL, 0)) 3329 return 0; 3330 dests_node_malloced = true; 3331 } 3332 dests_node = dests_alloc->dests_node; 3333 dests_ch = dests_alloc->dests_ch; 3334 3335 /* Initialize transiton table. */ 3336 state->word_trtable = state->trtable = NULL; 3337 3338 /* At first, group all nodes belonging to `state' into several 3339 destinations. */ 3340 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch); 3341 if (BE (ndests <= 0, 0)) 3342 { 3343 if (dests_node_malloced) 3344 free (dests_alloc); 3345 /* Return 0 in case of an error, 1 otherwise. */ 3346 if (ndests == 0) 3347 { 3348 state->trtable = (re_dfastate_t **) 3349 calloc (sizeof (re_dfastate_t *), SBC_MAX); 3350 return 1; 3351 } 3352 return 0; 3353 } 3354 3355 err = re_node_set_alloc (&follows, ndests + 1); 3356 if (BE (err != REG_NOERROR, 0)) 3357 goto out_free; 3358 3359 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX 3360 + ndests * 3 * sizeof (re_dfastate_t *))) 3361 dest_states = (re_dfastate_t **) 3362 alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3363 else 3364 { 3365 dest_states = (re_dfastate_t **) 3366 malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3367 if (BE (dest_states == NULL, 0)) 3368 { 3369 out_free: 3370 if (dest_states_malloced) 3371 free (dest_states); 3372 re_node_set_free (&follows); 3373 for (i = 0; i < ndests; ++i) 3374 re_node_set_free (dests_node + i); 3375 if (dests_node_malloced) 3376 free (dests_alloc); 3377 return 0; 3378 } 3379 dest_states_malloced = true; 3380 } 3381 dest_states_word = dest_states + ndests; 3382 dest_states_nl = dest_states_word + ndests; 3383 bitset_empty (acceptable); 3384 3385 /* Then build the states for all destinations. */ 3386 for (i = 0; i < ndests; ++i) 3387 { 3388 int next_node; 3389 re_node_set_empty (&follows); 3390 /* Merge the follows of this destination states. */ 3391 for (j = 0; j < dests_node[i].nelem; ++j) 3392 { 3393 next_node = dfa->nexts[dests_node[i].elems[j]]; 3394 if (next_node != -1) 3395 { 3396 err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3397 if (BE (err != REG_NOERROR, 0)) 3398 goto out_free; 3399 } 3400 } 3401 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3402 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3403 goto out_free; 3404 /* If the new state has context constraint, 3405 build appropriate states for these contexts. */ 3406 if (dest_states[i]->has_constraint) 3407 { 3408 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3409 CONTEXT_WORD); 3410 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3411 goto out_free; 3412 3413 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) 3414 need_word_trtable = 1; 3415 3416 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3417 CONTEXT_NEWLINE); 3418 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3419 goto out_free; 3420 } 3421 else 3422 { 3423 dest_states_word[i] = dest_states[i]; 3424 dest_states_nl[i] = dest_states[i]; 3425 } 3426 bitset_merge (acceptable, dests_ch[i]); 3427 } 3428 3429 if (!BE (need_word_trtable, 0)) 3430 { 3431 /* We don't care about whether the following character is a word 3432 character, or we are in a single-byte character set so we can 3433 discern by looking at the character code: allocate a 3434 256-entry transition table. */ 3435 trtable = state->trtable = 3436 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3437 if (BE (trtable == NULL, 0)) 3438 goto out_free; 3439 3440 /* For all characters ch...: */ 3441 for (i = 0; i < BITSET_WORDS; ++i) 3442 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3443 elem; 3444 mask <<= 1, elem >>= 1, ++ch) 3445 if (BE (elem & 1, 0)) 3446 { 3447 /* There must be exactly one destination which accepts 3448 character ch. See group_nodes_into_DFAstates. */ 3449 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3450 ; 3451 3452 /* j-th destination accepts the word character ch. */ 3453 if (dfa->word_char[i] & mask) 3454 trtable[ch] = dest_states_word[j]; 3455 else 3456 trtable[ch] = dest_states[j]; 3457 } 3458 } 3459 else 3460 { 3461 /* We care about whether the following character is a word 3462 character, and we are in a multi-byte character set: discern 3463 by looking at the character code: build two 256-entry 3464 transition tables, one starting at trtable[0] and one 3465 starting at trtable[SBC_MAX]. */ 3466 trtable = state->word_trtable = 3467 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX); 3468 if (BE (trtable == NULL, 0)) 3469 goto out_free; 3470 3471 /* For all characters ch...: */ 3472 for (i = 0; i < BITSET_WORDS; ++i) 3473 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3474 elem; 3475 mask <<= 1, elem >>= 1, ++ch) 3476 if (BE (elem & 1, 0)) 3477 { 3478 /* There must be exactly one destination which accepts 3479 character ch. See group_nodes_into_DFAstates. */ 3480 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3481 ; 3482 3483 /* j-th destination accepts the word character ch. */ 3484 trtable[ch] = dest_states[j]; 3485 trtable[ch + SBC_MAX] = dest_states_word[j]; 3486 } 3487 } 3488 3489 /* new line */ 3490 if (bitset_contain (acceptable, NEWLINE_CHAR)) 3491 { 3492 /* The current state accepts newline character. */ 3493 for (j = 0; j < ndests; ++j) 3494 if (bitset_contain (dests_ch[j], NEWLINE_CHAR)) 3495 { 3496 /* k-th destination accepts newline character. */ 3497 trtable[NEWLINE_CHAR] = dest_states_nl[j]; 3498 if (need_word_trtable) 3499 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; 3500 /* There must be only one destination which accepts 3501 newline. See group_nodes_into_DFAstates. */ 3502 break; 3503 } 3504 } 3505 3506 if (dest_states_malloced) 3507 free (dest_states); 3508 3509 re_node_set_free (&follows); 3510 for (i = 0; i < ndests; ++i) 3511 re_node_set_free (dests_node + i); 3512 3513 if (dests_node_malloced) 3514 free (dests_alloc); 3515 3516 return 1; 3517 } 3518 3519 /* Group all nodes belonging to STATE into several destinations. 3520 Then for all destinations, set the nodes belonging to the destination 3521 to DESTS_NODE[i] and set the characters accepted by the destination 3522 to DEST_CH[i]. This function return the number of destinations. */ 3523 3524 static int 3525 internal_function 3526 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, 3527 re_node_set *dests_node, bitset_t *dests_ch) 3528 { 3529 reg_errcode_t err; 3530 int result; 3531 int i, j, k; 3532 int ndests; /* Number of the destinations from `state'. */ 3533 bitset_t accepts; /* Characters a node can accept. */ 3534 const re_node_set *cur_nodes = &state->nodes; 3535 bitset_empty (accepts); 3536 ndests = 0; 3537 3538 /* For all the nodes belonging to `state', */ 3539 for (i = 0; i < cur_nodes->nelem; ++i) 3540 { 3541 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3542 re_token_type_t type = node->type; 3543 unsigned int constraint = node->constraint; 3544 3545 /* Enumerate all single byte character this node can accept. */ 3546 if (type == CHARACTER) 3547 bitset_set (accepts, node->opr.c); 3548 else if (type == SIMPLE_BRACKET) 3549 { 3550 bitset_merge (accepts, node->opr.sbcset); 3551 } 3552 else if (type == OP_PERIOD) 3553 { 3554 #ifdef RE_ENABLE_I18N 3555 if (dfa->mb_cur_max > 1) 3556 bitset_merge (accepts, dfa->sb_char); 3557 else 3558 #endif 3559 bitset_set_all (accepts); 3560 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3561 bitset_clear (accepts, '\n'); 3562 if (dfa->syntax & RE_DOT_NOT_NULL) 3563 bitset_clear (accepts, '\0'); 3564 } 3565 #ifdef RE_ENABLE_I18N 3566 else if (type == OP_UTF8_PERIOD) 3567 { 3568 memset (accepts, '\xff', sizeof (bitset_t) / 2); 3569 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3570 bitset_clear (accepts, '\n'); 3571 if (dfa->syntax & RE_DOT_NOT_NULL) 3572 bitset_clear (accepts, '\0'); 3573 } 3574 #endif 3575 else 3576 continue; 3577 3578 /* Check the `accepts' and sift the characters which are not 3579 match it the context. */ 3580 if (constraint) 3581 { 3582 if (constraint & NEXT_NEWLINE_CONSTRAINT) 3583 { 3584 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3585 bitset_empty (accepts); 3586 if (accepts_newline) 3587 bitset_set (accepts, NEWLINE_CHAR); 3588 else 3589 continue; 3590 } 3591 if (constraint & NEXT_ENDBUF_CONSTRAINT) 3592 { 3593 bitset_empty (accepts); 3594 continue; 3595 } 3596 3597 if (constraint & NEXT_WORD_CONSTRAINT) 3598 { 3599 bitset_word_t any_set = 0; 3600 if (type == CHARACTER && !node->word_char) 3601 { 3602 bitset_empty (accepts); 3603 continue; 3604 } 3605 #ifdef RE_ENABLE_I18N 3606 if (dfa->mb_cur_max > 1) 3607 for (j = 0; j < BITSET_WORDS; ++j) 3608 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); 3609 else 3610 #endif 3611 for (j = 0; j < BITSET_WORDS; ++j) 3612 any_set |= (accepts[j] &= dfa->word_char[j]); 3613 if (!any_set) 3614 continue; 3615 } 3616 if (constraint & NEXT_NOTWORD_CONSTRAINT) 3617 { 3618 bitset_word_t any_set = 0; 3619 if (type == CHARACTER && node->word_char) 3620 { 3621 bitset_empty (accepts); 3622 continue; 3623 } 3624 #ifdef RE_ENABLE_I18N 3625 if (dfa->mb_cur_max > 1) 3626 for (j = 0; j < BITSET_WORDS; ++j) 3627 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); 3628 else 3629 #endif 3630 for (j = 0; j < BITSET_WORDS; ++j) 3631 any_set |= (accepts[j] &= ~dfa->word_char[j]); 3632 if (!any_set) 3633 continue; 3634 } 3635 } 3636 3637 /* Then divide `accepts' into DFA states, or create a new 3638 state. Above, we make sure that accepts is not empty. */ 3639 for (j = 0; j < ndests; ++j) 3640 { 3641 bitset_t intersec; /* Intersection sets, see below. */ 3642 bitset_t remains; 3643 /* Flags, see below. */ 3644 bitset_word_t has_intersec, not_subset, not_consumed; 3645 3646 /* Optimization, skip if this state doesn't accept the character. */ 3647 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3648 continue; 3649 3650 /* Enumerate the intersection set of this state and `accepts'. */ 3651 has_intersec = 0; 3652 for (k = 0; k < BITSET_WORDS; ++k) 3653 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3654 /* And skip if the intersection set is empty. */ 3655 if (!has_intersec) 3656 continue; 3657 3658 /* Then check if this state is a subset of `accepts'. */ 3659 not_subset = not_consumed = 0; 3660 for (k = 0; k < BITSET_WORDS; ++k) 3661 { 3662 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3663 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3664 } 3665 3666 /* If this state isn't a subset of `accepts', create a 3667 new group state, which has the `remains'. */ 3668 if (not_subset) 3669 { 3670 bitset_copy (dests_ch[ndests], remains); 3671 bitset_copy (dests_ch[j], intersec); 3672 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3673 if (BE (err != REG_NOERROR, 0)) 3674 goto error_return; 3675 ++ndests; 3676 } 3677 3678 /* Put the position in the current group. */ 3679 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3680 if (BE (result < 0, 0)) 3681 goto error_return; 3682 3683 /* If all characters are consumed, go to next node. */ 3684 if (!not_consumed) 3685 break; 3686 } 3687 /* Some characters remain, create a new group. */ 3688 if (j == ndests) 3689 { 3690 bitset_copy (dests_ch[ndests], accepts); 3691 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3692 if (BE (err != REG_NOERROR, 0)) 3693 goto error_return; 3694 ++ndests; 3695 bitset_empty (accepts); 3696 } 3697 } 3698 return ndests; 3699 error_return: 3700 for (j = 0; j < ndests; ++j) 3701 re_node_set_free (dests_node + j); 3702 return -1; 3703 } 3704 3705 #ifdef RE_ENABLE_I18N 3706 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3707 Return the number of the bytes the node accepts. 3708 STR_IDX is the current index of the input string. 3709 3710 This function handles the nodes which can accept one character, or 3711 one collating element like '.', '[a-z]', opposite to the other nodes 3712 can only accept one byte. */ 3713 3714 static int 3715 internal_function 3716 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, 3717 const re_string_t *input, int str_idx) 3718 { 3719 const re_token_t *node = dfa->nodes + node_idx; 3720 int char_len, elem_len; 3721 int i; 3722 3723 if (BE (node->type == OP_UTF8_PERIOD, 0)) 3724 { 3725 unsigned char c = re_string_byte_at (input, str_idx), d; 3726 if (BE (c < 0xc2, 1)) 3727 return 0; 3728 3729 if (str_idx + 2 > input->len) 3730 return 0; 3731 3732 d = re_string_byte_at (input, str_idx + 1); 3733 if (c < 0xe0) 3734 return (d < 0x80 || d > 0xbf) ? 0 : 2; 3735 else if (c < 0xf0) 3736 { 3737 char_len = 3; 3738 if (c == 0xe0 && d < 0xa0) 3739 return 0; 3740 } 3741 else if (c < 0xf8) 3742 { 3743 char_len = 4; 3744 if (c == 0xf0 && d < 0x90) 3745 return 0; 3746 } 3747 else if (c < 0xfc) 3748 { 3749 char_len = 5; 3750 if (c == 0xf8 && d < 0x88) 3751 return 0; 3752 } 3753 else if (c < 0xfe) 3754 { 3755 char_len = 6; 3756 if (c == 0xfc && d < 0x84) 3757 return 0; 3758 } 3759 else 3760 return 0; 3761 3762 if (str_idx + char_len > input->len) 3763 return 0; 3764 3765 for (i = 1; i < char_len; ++i) 3766 { 3767 d = re_string_byte_at (input, str_idx + i); 3768 if (d < 0x80 || d > 0xbf) 3769 return 0; 3770 } 3771 return char_len; 3772 } 3773 3774 char_len = re_string_char_size_at (input, str_idx); 3775 if (node->type == OP_PERIOD) 3776 { 3777 if (char_len <= 1) 3778 return 0; 3779 /* FIXME: I don't think this if is needed, as both '\n' 3780 and '\0' are char_len == 1. */ 3781 /* '.' accepts any one character except the following two cases. */ 3782 if ((!(dfa->syntax & RE_DOT_NEWLINE) && 3783 re_string_byte_at (input, str_idx) == '\n') || 3784 ((dfa->syntax & RE_DOT_NOT_NULL) && 3785 re_string_byte_at (input, str_idx) == '\0')) 3786 return 0; 3787 return char_len; 3788 } 3789 3790 elem_len = re_string_elem_size_at (input, str_idx); 3791 if ((elem_len <= 1 && char_len <= 1) || char_len == 0) 3792 return 0; 3793 3794 if (node->type == COMPLEX_BRACKET) 3795 { 3796 const re_charset_t *cset = node->opr.mbcset; 3797 # ifdef _LIBC 3798 const unsigned char *pin 3799 = ((const unsigned char *) re_string_get_buffer (input) + str_idx); 3800 int j; 3801 uint32_t nrules; 3802 # endif /* _LIBC */ 3803 int match_len = 0; 3804 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3805 ? re_string_wchar_at (input, str_idx) : 0); 3806 3807 /* match with multibyte character? */ 3808 for (i = 0; i < cset->nmbchars; ++i) 3809 if (wc == cset->mbchars[i]) 3810 { 3811 match_len = char_len; 3812 goto check_node_accept_bytes_match; 3813 } 3814 /* match with character_class? */ 3815 for (i = 0; i < cset->nchar_classes; ++i) 3816 { 3817 wctype_t wt = cset->char_classes[i]; 3818 if (__iswctype (wc, wt)) 3819 { 3820 match_len = char_len; 3821 goto check_node_accept_bytes_match; 3822 } 3823 } 3824 3825 # ifdef _LIBC 3826 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3827 if (nrules != 0) 3828 { 3829 unsigned int in_collseq = 0; 3830 const int32_t *table, *indirect; 3831 const unsigned char *weights, *extra; 3832 const char *collseqwc; 3833 /* This #include defines a local function! */ 3834 # include <locale/weight.h> 3835 3836 /* match with collating_symbol? */ 3837 if (cset->ncoll_syms) 3838 extra = (const unsigned char *) 3839 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3840 for (i = 0; i < cset->ncoll_syms; ++i) 3841 { 3842 const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3843 /* Compare the length of input collating element and 3844 the length of current collating element. */ 3845 if (*coll_sym != elem_len) 3846 continue; 3847 /* Compare each bytes. */ 3848 for (j = 0; j < *coll_sym; j++) 3849 if (pin[j] != coll_sym[1 + j]) 3850 break; 3851 if (j == *coll_sym) 3852 { 3853 /* Match if every bytes is equal. */ 3854 match_len = j; 3855 goto check_node_accept_bytes_match; 3856 } 3857 } 3858 3859 if (cset->nranges) 3860 { 3861 if (elem_len <= char_len) 3862 { 3863 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3864 in_collseq = __collseq_table_lookup (collseqwc, wc); 3865 } 3866 else 3867 in_collseq = find_collation_sequence_value (pin, elem_len); 3868 } 3869 /* match with range expression? */ 3870 for (i = 0; i < cset->nranges; ++i) 3871 if (cset->range_starts[i] <= in_collseq 3872 && in_collseq <= cset->range_ends[i]) 3873 { 3874 match_len = elem_len; 3875 goto check_node_accept_bytes_match; 3876 } 3877 3878 /* match with equivalence_class? */ 3879 if (cset->nequiv_classes) 3880 { 3881 const unsigned char *cp = pin; 3882 table = (const int32_t *) 3883 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3884 weights = (const unsigned char *) 3885 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3886 extra = (const unsigned char *) 3887 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3888 indirect = (const int32_t *) 3889 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3890 int32_t idx = findidx (&cp); 3891 if (idx > 0) 3892 for (i = 0; i < cset->nequiv_classes; ++i) 3893 { 3894 int32_t equiv_class_idx = cset->equiv_classes[i]; 3895 size_t weight_len = weights[idx & 0xffffff]; 3896 if (weight_len == weights[equiv_class_idx & 0xffffff] 3897 && (idx >> 24) == (equiv_class_idx >> 24)) 3898 { 3899 int cnt = 0; 3900 3901 idx &= 0xffffff; 3902 equiv_class_idx &= 0xffffff; 3903 3904 while (cnt <= weight_len 3905 && (weights[equiv_class_idx + 1 + cnt] 3906 == weights[idx + 1 + cnt])) 3907 ++cnt; 3908 if (cnt > weight_len) 3909 { 3910 match_len = elem_len; 3911 goto check_node_accept_bytes_match; 3912 } 3913 } 3914 } 3915 } 3916 } 3917 else 3918 # endif /* _LIBC */ 3919 { 3920 /* match with range expression? */ 3921 #if __GNUC__ >= 2 3922 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3923 #else 3924 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 3925 cmp_buf[2] = wc; 3926 #endif 3927 for (i = 0; i < cset->nranges; ++i) 3928 { 3929 cmp_buf[0] = cset->range_starts[i]; 3930 cmp_buf[4] = cset->range_ends[i]; 3931 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 3932 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 3933 { 3934 match_len = char_len; 3935 goto check_node_accept_bytes_match; 3936 } 3937 } 3938 } 3939 check_node_accept_bytes_match: 3940 if (!cset->non_match) 3941 return match_len; 3942 else 3943 { 3944 if (match_len > 0) 3945 return 0; 3946 else 3947 return (elem_len > char_len) ? elem_len : char_len; 3948 } 3949 } 3950 return 0; 3951 } 3952 3953 # ifdef _LIBC 3954 static unsigned int 3955 internal_function 3956 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) 3957 { 3958 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3959 if (nrules == 0) 3960 { 3961 if (mbs_len == 1) 3962 { 3963 /* No valid character. Match it as a single byte character. */ 3964 const unsigned char *collseq = (const unsigned char *) 3965 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3966 return collseq[mbs[0]]; 3967 } 3968 return UINT_MAX; 3969 } 3970 else 3971 { 3972 int32_t idx; 3973 const unsigned char *extra = (const unsigned char *) 3974 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3975 int32_t extrasize = (const unsigned char *) 3976 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra; 3977 3978 for (idx = 0; idx < extrasize;) 3979 { 3980 int mbs_cnt, found = 0; 3981 int32_t elem_mbs_len; 3982 /* Skip the name of collating element name. */ 3983 idx = idx + extra[idx] + 1; 3984 elem_mbs_len = extra[idx++]; 3985 if (mbs_len == elem_mbs_len) 3986 { 3987 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 3988 if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 3989 break; 3990 if (mbs_cnt == elem_mbs_len) 3991 /* Found the entry. */ 3992 found = 1; 3993 } 3994 /* Skip the byte sequence of the collating element. */ 3995 idx += elem_mbs_len; 3996 /* Adjust for the alignment. */ 3997 idx = (idx + 3) & ~3; 3998 /* Skip the collation sequence value. */ 3999 idx += sizeof (uint32_t); 4000 /* Skip the wide char sequence of the collating element. */ 4001 idx = idx + sizeof (uint32_t) * (extra[idx] + 1); 4002 /* If we found the entry, return the sequence value. */ 4003 if (found) 4004 return *(uint32_t *) (extra + idx); 4005 /* Skip the collation sequence value. */ 4006 idx += sizeof (uint32_t); 4007 } 4008 return UINT_MAX; 4009 } 4010 } 4011 # endif /* _LIBC */ 4012 #endif /* RE_ENABLE_I18N */ 4013 4014 /* Check whether the node accepts the byte which is IDX-th 4015 byte of the INPUT. */ 4016 4017 static int 4018 internal_function 4019 check_node_accept (const re_match_context_t *mctx, const re_token_t *node, 4020 int idx) 4021 { 4022 unsigned char ch; 4023 ch = re_string_byte_at (&mctx->input, idx); 4024 switch (node->type) 4025 { 4026 case CHARACTER: 4027 if (node->opr.c != ch) 4028 return 0; 4029 break; 4030 4031 case SIMPLE_BRACKET: 4032 if (!bitset_contain (node->opr.sbcset, ch)) 4033 return 0; 4034 break; 4035 4036 #ifdef RE_ENABLE_I18N 4037 case OP_UTF8_PERIOD: 4038 if (ch >= 0x80) 4039 return 0; 4040 /* FALLTHROUGH */ 4041 #endif 4042 case OP_PERIOD: 4043 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) 4044 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) 4045 return 0; 4046 break; 4047 4048 default: 4049 return 0; 4050 } 4051 4052 if (node->constraint) 4053 { 4054 /* The node has constraints. Check whether the current context 4055 satisfies the constraints. */ 4056 unsigned int context = re_string_context_at (&mctx->input, idx, 4057 mctx->eflags); 4058 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 4059 return 0; 4060 } 4061 4062 return 1; 4063 } 4064 4065 /* Extend the buffers, if the buffers have run out. */ 4066 4067 static reg_errcode_t 4068 internal_function 4069 extend_buffers (re_match_context_t *mctx) 4070 { 4071 reg_errcode_t ret; 4072 re_string_t *pstr = &mctx->input; 4073 4074 /* Double the lengthes of the buffers. */ 4075 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 4076 if (BE (ret != REG_NOERROR, 0)) 4077 return ret; 4078 4079 if (mctx->state_log != NULL) 4080 { 4081 /* And double the length of state_log. */ 4082 /* XXX We have no indication of the size of this buffer. If this 4083 allocation fail we have no indication that the state_log array 4084 does not have the right size. */ 4085 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *, 4086 pstr->bufs_len + 1); 4087 if (BE (new_array == NULL, 0)) 4088 return REG_ESPACE; 4089 mctx->state_log = new_array; 4090 } 4091 4092 /* Then reconstruct the buffers. */ 4093 if (pstr->icase) 4094 { 4095 #ifdef RE_ENABLE_I18N 4096 if (pstr->mb_cur_max > 1) 4097 { 4098 ret = build_wcs_upper_buffer (pstr); 4099 if (BE (ret != REG_NOERROR, 0)) 4100 return ret; 4101 } 4102 else 4103 #endif /* RE_ENABLE_I18N */ 4104 build_upper_buffer (pstr); 4105 } 4106 else 4107 { 4108 #ifdef RE_ENABLE_I18N 4109 if (pstr->mb_cur_max > 1) 4110 build_wcs_buffer (pstr); 4111 else 4112 #endif /* RE_ENABLE_I18N */ 4113 { 4114 if (pstr->trans != NULL) 4115 re_string_translate_buffer (pstr); 4116 } 4117 } 4118 return REG_NOERROR; 4119 } 4120 4121 4122 /* Functions for matching context. */ 4123 4124 /* Initialize MCTX. */ 4125 4126 static reg_errcode_t 4127 internal_function 4128 match_ctx_init (re_match_context_t *mctx, int eflags, int n) 4129 { 4130 mctx->eflags = eflags; 4131 mctx->match_last = -1; 4132 if (n > 0) 4133 { 4134 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 4135 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 4136 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 4137 return REG_ESPACE; 4138 } 4139 /* Already zero-ed by the caller. 4140 else 4141 mctx->bkref_ents = NULL; 4142 mctx->nbkref_ents = 0; 4143 mctx->nsub_tops = 0; */ 4144 mctx->abkref_ents = n; 4145 mctx->max_mb_elem_len = 1; 4146 mctx->asub_tops = n; 4147 return REG_NOERROR; 4148 } 4149 4150 /* Clean the entries which depend on the current input in MCTX. 4151 This function must be invoked when the matcher changes the start index 4152 of the input, or changes the input string. */ 4153 4154 static void 4155 internal_function 4156 match_ctx_clean (re_match_context_t *mctx) 4157 { 4158 int st_idx; 4159 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 4160 { 4161 int sl_idx; 4162 re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 4163 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 4164 { 4165 re_sub_match_last_t *last = top->lasts[sl_idx]; 4166 re_free (last->path.array); 4167 re_free (last); 4168 } 4169 re_free (top->lasts); 4170 if (top->path) 4171 { 4172 re_free (top->path->array); 4173 re_free (top->path); 4174 } 4175 free (top); 4176 } 4177 4178 mctx->nsub_tops = 0; 4179 mctx->nbkref_ents = 0; 4180 } 4181 4182 /* Free all the memory associated with MCTX. */ 4183 4184 static void 4185 internal_function 4186 match_ctx_free (re_match_context_t *mctx) 4187 { 4188 /* First, free all the memory associated with MCTX->SUB_TOPS. */ 4189 match_ctx_clean (mctx); 4190 re_free (mctx->sub_tops); 4191 re_free (mctx->bkref_ents); 4192 } 4193 4194 /* Add a new backreference entry to MCTX. 4195 Note that we assume that caller never call this function with duplicate 4196 entry, and call with STR_IDX which isn't smaller than any existing entry. 4197 */ 4198 4199 static reg_errcode_t 4200 internal_function 4201 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from, 4202 int to) 4203 { 4204 if (mctx->nbkref_ents >= mctx->abkref_ents) 4205 { 4206 struct re_backref_cache_entry* new_entry; 4207 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 4208 mctx->abkref_ents * 2); 4209 if (BE (new_entry == NULL, 0)) 4210 { 4211 re_free (mctx->bkref_ents); 4212 return REG_ESPACE; 4213 } 4214 mctx->bkref_ents = new_entry; 4215 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 4216 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 4217 mctx->abkref_ents *= 2; 4218 } 4219 if (mctx->nbkref_ents > 0 4220 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx) 4221 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1; 4222 4223 mctx->bkref_ents[mctx->nbkref_ents].node = node; 4224 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 4225 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 4226 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 4227 4228 /* This is a cache that saves negative results of check_dst_limits_calc_pos. 4229 If bit N is clear, means that this entry won't epsilon-transition to 4230 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If 4231 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one 4232 such node. 4233 4234 A backreference does not epsilon-transition unless it is empty, so set 4235 to all zeros if FROM != TO. */ 4236 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map 4237 = (from == to ? ~0 : 0); 4238 4239 mctx->bkref_ents[mctx->nbkref_ents++].more = 0; 4240 if (mctx->max_mb_elem_len < to - from) 4241 mctx->max_mb_elem_len = to - from; 4242 return REG_NOERROR; 4243 } 4244 4245 /* Search for the first entry which has the same str_idx, or -1 if none is 4246 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 4247 4248 static int 4249 internal_function 4250 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) 4251 { 4252 int left, right, mid, last; 4253 last = right = mctx->nbkref_ents; 4254 for (left = 0; left < right;) 4255 { 4256 mid = (left + right) / 2; 4257 if (mctx->bkref_ents[mid].str_idx < str_idx) 4258 left = mid + 1; 4259 else 4260 right = mid; 4261 } 4262 if (left < last && mctx->bkref_ents[left].str_idx == str_idx) 4263 return left; 4264 else 4265 return -1; 4266 } 4267 4268 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 4269 at STR_IDX. */ 4270 4271 static reg_errcode_t 4272 internal_function 4273 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx) 4274 { 4275 #ifdef DEBUG 4276 assert (mctx->sub_tops != NULL); 4277 assert (mctx->asub_tops > 0); 4278 #endif 4279 if (BE (mctx->nsub_tops == mctx->asub_tops, 0)) 4280 { 4281 int new_asub_tops = mctx->asub_tops * 2; 4282 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops, 4283 re_sub_match_top_t *, 4284 new_asub_tops); 4285 if (BE (new_array == NULL, 0)) 4286 return REG_ESPACE; 4287 mctx->sub_tops = new_array; 4288 mctx->asub_tops = new_asub_tops; 4289 } 4290 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 4291 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0)) 4292 return REG_ESPACE; 4293 mctx->sub_tops[mctx->nsub_tops]->node = node; 4294 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 4295 return REG_NOERROR; 4296 } 4297 4298 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 4299 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 4300 4301 static re_sub_match_last_t * 4302 internal_function 4303 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx) 4304 { 4305 re_sub_match_last_t *new_entry; 4306 if (BE (subtop->nlasts == subtop->alasts, 0)) 4307 { 4308 int new_alasts = 2 * subtop->alasts + 1; 4309 re_sub_match_last_t **new_array = re_realloc (subtop->lasts, 4310 re_sub_match_last_t *, 4311 new_alasts); 4312 if (BE (new_array == NULL, 0)) 4313 return NULL; 4314 subtop->lasts = new_array; 4315 subtop->alasts = new_alasts; 4316 } 4317 new_entry = calloc (1, sizeof (re_sub_match_last_t)); 4318 if (BE (new_entry != NULL, 1)) 4319 { 4320 subtop->lasts[subtop->nlasts] = new_entry; 4321 new_entry->node = node; 4322 new_entry->str_idx = str_idx; 4323 ++subtop->nlasts; 4324 } 4325 return new_entry; 4326 } 4327 4328 static void 4329 internal_function 4330 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 4331 re_dfastate_t **limited_sts, int last_node, int last_str_idx) 4332 { 4333 sctx->sifted_states = sifted_sts; 4334 sctx->limited_states = limited_sts; 4335 sctx->last_node = last_node; 4336 sctx->last_str_idx = last_str_idx; 4337 re_node_set_init_empty (&sctx->limits); 4338 } 4339