1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21 static void re_string_construct_common (const char *str, int len, 22 re_string_t *pstr, 23 RE_TRANSLATE_TYPE trans, int icase, 24 const re_dfa_t *dfa) internal_function; 25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 26 const re_node_set *nodes, 27 unsigned int hash) internal_function; 28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 29 const re_node_set *nodes, 30 unsigned int context, 31 unsigned int hash) internal_function; 32 33 /* Functions for string operation. */ 34 35 /* This function allocate the buffers. It is necessary to call 36 re_string_reconstruct before using the object. */ 37 38 static reg_errcode_t 39 internal_function 40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len, 41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) 42 { 43 reg_errcode_t ret; 44 int init_buf_len; 45 46 /* Ensure at least one character fits into the buffers. */ 47 if (init_len < dfa->mb_cur_max) 48 init_len = dfa->mb_cur_max; 49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 50 re_string_construct_common (str, len, pstr, trans, icase, dfa); 51 52 ret = re_string_realloc_buffers (pstr, init_buf_len); 53 if (BE (ret != REG_NOERROR, 0)) 54 return ret; 55 56 pstr->word_char = dfa->word_char; 57 pstr->word_ops_used = dfa->word_ops_used; 58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 60 pstr->valid_raw_len = pstr->valid_len; 61 return REG_NOERROR; 62 } 63 64 /* This function allocate the buffers, and initialize them. */ 65 66 static reg_errcode_t 67 internal_function 68 re_string_construct (re_string_t *pstr, const char *str, int len, 69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa) 70 { 71 reg_errcode_t ret; 72 memset (pstr, '\0', sizeof (re_string_t)); 73 re_string_construct_common (str, len, pstr, trans, icase, dfa); 74 75 if (len > 0) 76 { 77 ret = re_string_realloc_buffers (pstr, len + 1); 78 if (BE (ret != REG_NOERROR, 0)) 79 return ret; 80 } 81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 82 83 if (icase) 84 { 85 #ifdef RE_ENABLE_I18N 86 if (dfa->mb_cur_max > 1) 87 { 88 while (1) 89 { 90 ret = build_wcs_upper_buffer (pstr); 91 if (BE (ret != REG_NOERROR, 0)) 92 return ret; 93 if (pstr->valid_raw_len >= len) 94 break; 95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 96 break; 97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 98 if (BE (ret != REG_NOERROR, 0)) 99 return ret; 100 } 101 } 102 else 103 #endif /* RE_ENABLE_I18N */ 104 build_upper_buffer (pstr); 105 } 106 else 107 { 108 #ifdef RE_ENABLE_I18N 109 if (dfa->mb_cur_max > 1) 110 build_wcs_buffer (pstr); 111 else 112 #endif /* RE_ENABLE_I18N */ 113 { 114 if (trans != NULL) 115 re_string_translate_buffer (pstr); 116 else 117 { 118 pstr->valid_len = pstr->bufs_len; 119 pstr->valid_raw_len = pstr->bufs_len; 120 } 121 } 122 } 123 124 return REG_NOERROR; 125 } 126 127 /* Helper functions for re_string_allocate, and re_string_construct. */ 128 129 static reg_errcode_t 130 internal_function 131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len) 132 { 133 #ifdef RE_ENABLE_I18N 134 if (pstr->mb_cur_max > 1) 135 { 136 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 137 if (BE (new_wcs == NULL, 0)) 138 return REG_ESPACE; 139 pstr->wcs = new_wcs; 140 if (pstr->offsets != NULL) 141 { 142 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len); 143 if (BE (new_offsets == NULL, 0)) 144 return REG_ESPACE; 145 pstr->offsets = new_offsets; 146 } 147 } 148 #endif /* RE_ENABLE_I18N */ 149 if (pstr->mbs_allocated) 150 { 151 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 152 new_buf_len); 153 if (BE (new_mbs == NULL, 0)) 154 return REG_ESPACE; 155 pstr->mbs = new_mbs; 156 } 157 pstr->bufs_len = new_buf_len; 158 return REG_NOERROR; 159 } 160 161 162 static void 163 internal_function 164 re_string_construct_common (const char *str, int len, re_string_t *pstr, 165 RE_TRANSLATE_TYPE trans, int icase, 166 const re_dfa_t *dfa) 167 { 168 pstr->raw_mbs = (const unsigned char *) str; 169 pstr->len = len; 170 pstr->raw_len = len; 171 pstr->trans = trans; 172 pstr->icase = icase ? 1 : 0; 173 pstr->mbs_allocated = (trans != NULL || icase); 174 pstr->mb_cur_max = dfa->mb_cur_max; 175 pstr->is_utf8 = dfa->is_utf8; 176 pstr->map_notascii = dfa->map_notascii; 177 pstr->stop = pstr->len; 178 pstr->raw_stop = pstr->stop; 179 } 180 181 #ifdef RE_ENABLE_I18N 182 183 /* Build wide character buffer PSTR->WCS. 184 If the byte sequence of the string are: 185 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 186 Then wide character buffer will be: 187 <wc1> , WEOF , <wc2> , WEOF , <wc3> 188 We use WEOF for padding, they indicate that the position isn't 189 a first byte of a multibyte character. 190 191 Note that this function assumes PSTR->VALID_LEN elements are already 192 built and starts from PSTR->VALID_LEN. */ 193 194 static void 195 internal_function 196 build_wcs_buffer (re_string_t *pstr) 197 { 198 #ifdef _LIBC 199 unsigned char buf[MB_LEN_MAX]; 200 assert (MB_LEN_MAX >= pstr->mb_cur_max); 201 #else 202 unsigned char buf[64]; 203 #endif 204 mbstate_t prev_st; 205 int byte_idx, end_idx, remain_len; 206 size_t mbclen; 207 208 /* Build the buffers from pstr->valid_len to either pstr->len or 209 pstr->bufs_len. */ 210 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 211 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 212 { 213 wchar_t wc; 214 const char *p; 215 216 remain_len = end_idx - byte_idx; 217 prev_st = pstr->cur_state; 218 /* Apply the translation if we need. */ 219 if (BE (pstr->trans != NULL, 0)) 220 { 221 int i, ch; 222 223 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 224 { 225 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 226 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 227 } 228 p = (const char *) buf; 229 } 230 else 231 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 232 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 233 if (BE (mbclen == (size_t) -2, 0)) 234 { 235 /* The buffer doesn't have enough space, finish to build. */ 236 pstr->cur_state = prev_st; 237 break; 238 } 239 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 240 { 241 /* We treat these cases as a singlebyte character. */ 242 mbclen = 1; 243 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 244 if (BE (pstr->trans != NULL, 0)) 245 wc = pstr->trans[wc]; 246 pstr->cur_state = prev_st; 247 } 248 249 /* Write wide character and padding. */ 250 pstr->wcs[byte_idx++] = wc; 251 /* Write paddings. */ 252 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 253 pstr->wcs[byte_idx++] = WEOF; 254 } 255 pstr->valid_len = byte_idx; 256 pstr->valid_raw_len = byte_idx; 257 } 258 259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 260 but for REG_ICASE. */ 261 262 static reg_errcode_t 263 internal_function 264 build_wcs_upper_buffer (re_string_t *pstr) 265 { 266 mbstate_t prev_st; 267 int src_idx, byte_idx, end_idx, remain_len; 268 size_t mbclen; 269 #ifdef _LIBC 270 char buf[MB_LEN_MAX]; 271 assert (MB_LEN_MAX >= pstr->mb_cur_max); 272 #else 273 char buf[64]; 274 #endif 275 276 byte_idx = pstr->valid_len; 277 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 278 279 /* The following optimization assumes that ASCII characters can be 280 mapped to wide characters with a simple cast. */ 281 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 282 { 283 while (byte_idx < end_idx) 284 { 285 wchar_t wc; 286 287 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 288 && mbsinit (&pstr->cur_state)) 289 { 290 /* In case of a singlebyte character. */ 291 pstr->mbs[byte_idx] 292 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 293 /* The next step uses the assumption that wchar_t is encoded 294 ASCII-safe: all ASCII values can be converted like this. */ 295 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 296 ++byte_idx; 297 continue; 298 } 299 300 remain_len = end_idx - byte_idx; 301 prev_st = pstr->cur_state; 302 mbclen = __mbrtowc (&wc, 303 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 304 + byte_idx), remain_len, &pstr->cur_state); 305 if (BE (mbclen + 2 > 2, 1)) 306 { 307 wchar_t wcu = wc; 308 if (iswlower (wc)) 309 { 310 size_t mbcdlen; 311 312 wcu = towupper (wc); 313 mbcdlen = wcrtomb (buf, wcu, &prev_st); 314 if (BE (mbclen == mbcdlen, 1)) 315 memcpy (pstr->mbs + byte_idx, buf, mbclen); 316 else 317 { 318 src_idx = byte_idx; 319 goto offsets_needed; 320 } 321 } 322 else 323 memcpy (pstr->mbs + byte_idx, 324 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 325 pstr->wcs[byte_idx++] = wcu; 326 /* Write paddings. */ 327 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 328 pstr->wcs[byte_idx++] = WEOF; 329 } 330 else if (mbclen == (size_t) -1 || mbclen == 0) 331 { 332 /* It is an invalid character or '\0'. Just use the byte. */ 333 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 334 pstr->mbs[byte_idx] = ch; 335 /* And also cast it to wide char. */ 336 pstr->wcs[byte_idx++] = (wchar_t) ch; 337 if (BE (mbclen == (size_t) -1, 0)) 338 pstr->cur_state = prev_st; 339 } 340 else 341 { 342 /* The buffer doesn't have enough space, finish to build. */ 343 pstr->cur_state = prev_st; 344 break; 345 } 346 } 347 pstr->valid_len = byte_idx; 348 pstr->valid_raw_len = byte_idx; 349 return REG_NOERROR; 350 } 351 else 352 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 353 { 354 wchar_t wc; 355 const char *p; 356 offsets_needed: 357 remain_len = end_idx - byte_idx; 358 prev_st = pstr->cur_state; 359 if (BE (pstr->trans != NULL, 0)) 360 { 361 int i, ch; 362 363 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 364 { 365 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 366 buf[i] = pstr->trans[ch]; 367 } 368 p = (const char *) buf; 369 } 370 else 371 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 372 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 373 if (BE (mbclen + 2 > 2, 1)) 374 { 375 wchar_t wcu = wc; 376 if (iswlower (wc)) 377 { 378 size_t mbcdlen; 379 380 wcu = towupper (wc); 381 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 382 if (BE (mbclen == mbcdlen, 1)) 383 memcpy (pstr->mbs + byte_idx, buf, mbclen); 384 else if (mbcdlen != (size_t) -1) 385 { 386 size_t i; 387 388 if (byte_idx + mbcdlen > pstr->bufs_len) 389 { 390 pstr->cur_state = prev_st; 391 break; 392 } 393 394 if (pstr->offsets == NULL) 395 { 396 pstr->offsets = re_malloc (int, pstr->bufs_len); 397 398 if (pstr->offsets == NULL) 399 return REG_ESPACE; 400 } 401 if (!pstr->offsets_needed) 402 { 403 for (i = 0; i < (size_t) byte_idx; ++i) 404 pstr->offsets[i] = i; 405 pstr->offsets_needed = 1; 406 } 407 408 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 409 pstr->wcs[byte_idx] = wcu; 410 pstr->offsets[byte_idx] = src_idx; 411 for (i = 1; i < mbcdlen; ++i) 412 { 413 pstr->offsets[byte_idx + i] 414 = src_idx + (i < mbclen ? i : mbclen - 1); 415 pstr->wcs[byte_idx + i] = WEOF; 416 } 417 pstr->len += mbcdlen - mbclen; 418 if (pstr->raw_stop > src_idx) 419 pstr->stop += mbcdlen - mbclen; 420 end_idx = (pstr->bufs_len > pstr->len) 421 ? pstr->len : pstr->bufs_len; 422 byte_idx += mbcdlen; 423 src_idx += mbclen; 424 continue; 425 } 426 else 427 memcpy (pstr->mbs + byte_idx, p, mbclen); 428 } 429 else 430 memcpy (pstr->mbs + byte_idx, p, mbclen); 431 432 if (BE (pstr->offsets_needed != 0, 0)) 433 { 434 size_t i; 435 for (i = 0; i < mbclen; ++i) 436 pstr->offsets[byte_idx + i] = src_idx + i; 437 } 438 src_idx += mbclen; 439 440 pstr->wcs[byte_idx++] = wcu; 441 /* Write paddings. */ 442 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 443 pstr->wcs[byte_idx++] = WEOF; 444 } 445 else if (mbclen == (size_t) -1 || mbclen == 0) 446 { 447 /* It is an invalid character or '\0'. Just use the byte. */ 448 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 449 450 if (BE (pstr->trans != NULL, 0)) 451 ch = pstr->trans [ch]; 452 pstr->mbs[byte_idx] = ch; 453 454 if (BE (pstr->offsets_needed != 0, 0)) 455 pstr->offsets[byte_idx] = src_idx; 456 ++src_idx; 457 458 /* And also cast it to wide char. */ 459 pstr->wcs[byte_idx++] = (wchar_t) ch; 460 if (BE (mbclen == (size_t) -1, 0)) 461 pstr->cur_state = prev_st; 462 } 463 else 464 { 465 /* The buffer doesn't have enough space, finish to build. */ 466 pstr->cur_state = prev_st; 467 break; 468 } 469 } 470 pstr->valid_len = byte_idx; 471 pstr->valid_raw_len = src_idx; 472 return REG_NOERROR; 473 } 474 475 /* Skip characters until the index becomes greater than NEW_RAW_IDX. 476 Return the index. */ 477 478 static int 479 internal_function 480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc) 481 { 482 mbstate_t prev_st; 483 int rawbuf_idx; 484 size_t mbclen; 485 wchar_t wc = WEOF; 486 487 /* Skip the characters which are not necessary to check. */ 488 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 489 rawbuf_idx < new_raw_idx;) 490 { 491 int remain_len; 492 remain_len = pstr->len - rawbuf_idx; 493 prev_st = pstr->cur_state; 494 mbclen = __mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx, 495 remain_len, &pstr->cur_state); 496 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 497 { 498 /* We treat these cases as a single byte character. */ 499 if (mbclen == 0 || remain_len == 0) 500 wc = L'\0'; 501 else 502 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 503 mbclen = 1; 504 pstr->cur_state = prev_st; 505 } 506 /* Then proceed the next character. */ 507 rawbuf_idx += mbclen; 508 } 509 *last_wc = (wint_t) wc; 510 return rawbuf_idx; 511 } 512 #endif /* RE_ENABLE_I18N */ 513 514 /* Build the buffer PSTR->MBS, and apply the translation if we need. 515 This function is used in case of REG_ICASE. */ 516 517 static void 518 internal_function 519 build_upper_buffer (re_string_t *pstr) 520 { 521 int char_idx, end_idx; 522 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 523 524 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 525 { 526 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 527 if (BE (pstr->trans != NULL, 0)) 528 ch = pstr->trans[ch]; 529 if (islower (ch)) 530 pstr->mbs[char_idx] = toupper (ch); 531 else 532 pstr->mbs[char_idx] = ch; 533 } 534 pstr->valid_len = char_idx; 535 pstr->valid_raw_len = char_idx; 536 } 537 538 /* Apply TRANS to the buffer in PSTR. */ 539 540 static void 541 internal_function 542 re_string_translate_buffer (re_string_t *pstr) 543 { 544 int buf_idx, end_idx; 545 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 546 547 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 548 { 549 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 550 pstr->mbs[buf_idx] = pstr->trans[ch]; 551 } 552 553 pstr->valid_len = buf_idx; 554 pstr->valid_raw_len = buf_idx; 555 } 556 557 /* This function re-construct the buffers. 558 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 559 convert to upper case in case of REG_ICASE, apply translation. */ 560 561 static reg_errcode_t 562 internal_function 563 re_string_reconstruct (re_string_t *pstr, int idx, int eflags) 564 { 565 int offset = idx - pstr->raw_mbs_idx; 566 if (BE (offset < 0, 0)) 567 { 568 /* Reset buffer. */ 569 #ifdef RE_ENABLE_I18N 570 if (pstr->mb_cur_max > 1) 571 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 572 #endif /* RE_ENABLE_I18N */ 573 pstr->len = pstr->raw_len; 574 pstr->stop = pstr->raw_stop; 575 pstr->valid_len = 0; 576 pstr->raw_mbs_idx = 0; 577 pstr->valid_raw_len = 0; 578 pstr->offsets_needed = 0; 579 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 580 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 581 if (!pstr->mbs_allocated) 582 pstr->mbs = (unsigned char *) pstr->raw_mbs; 583 offset = idx; 584 } 585 586 if (BE (offset != 0, 1)) 587 { 588 /* Should the already checked characters be kept? */ 589 if (BE (offset < pstr->valid_raw_len, 1)) 590 { 591 /* Yes, move them to the front of the buffer. */ 592 #ifdef RE_ENABLE_I18N 593 if (BE (pstr->offsets_needed, 0)) 594 { 595 int low = 0, high = pstr->valid_len, mid; 596 do 597 { 598 mid = (high + low) / 2; 599 if (pstr->offsets[mid] > offset) 600 high = mid; 601 else if (pstr->offsets[mid] < offset) 602 low = mid + 1; 603 else 604 break; 605 } 606 while (low < high); 607 if (pstr->offsets[mid] < offset) 608 ++mid; 609 pstr->tip_context = re_string_context_at (pstr, mid - 1, 610 eflags); 611 /* This can be quite complicated, so handle specially 612 only the common and easy case where the character with 613 different length representation of lower and upper 614 case is present at or after offset. */ 615 if (pstr->valid_len > offset 616 && mid == offset && pstr->offsets[mid] == offset) 617 { 618 memmove (pstr->wcs, pstr->wcs + offset, 619 (pstr->valid_len - offset) * sizeof (wint_t)); 620 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 621 pstr->valid_len -= offset; 622 pstr->valid_raw_len -= offset; 623 for (low = 0; low < pstr->valid_len; low++) 624 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 625 } 626 else 627 { 628 /* Otherwise, just find out how long the partial multibyte 629 character at offset is and fill it with WEOF/255. */ 630 pstr->len = pstr->raw_len - idx + offset; 631 pstr->stop = pstr->raw_stop - idx + offset; 632 pstr->offsets_needed = 0; 633 while (mid > 0 && pstr->offsets[mid - 1] == offset) 634 --mid; 635 while (mid < pstr->valid_len) 636 if (pstr->wcs[mid] != WEOF) 637 break; 638 else 639 ++mid; 640 if (mid == pstr->valid_len) 641 pstr->valid_len = 0; 642 else 643 { 644 pstr->valid_len = pstr->offsets[mid] - offset; 645 if (pstr->valid_len) 646 { 647 for (low = 0; low < pstr->valid_len; ++low) 648 pstr->wcs[low] = WEOF; 649 memset (pstr->mbs, 255, pstr->valid_len); 650 } 651 } 652 pstr->valid_raw_len = pstr->valid_len; 653 } 654 } 655 else 656 #endif 657 { 658 pstr->tip_context = re_string_context_at (pstr, offset - 1, 659 eflags); 660 #ifdef RE_ENABLE_I18N 661 if (pstr->mb_cur_max > 1) 662 memmove (pstr->wcs, pstr->wcs + offset, 663 (pstr->valid_len - offset) * sizeof (wint_t)); 664 #endif /* RE_ENABLE_I18N */ 665 if (BE (pstr->mbs_allocated, 0)) 666 memmove (pstr->mbs, pstr->mbs + offset, 667 pstr->valid_len - offset); 668 pstr->valid_len -= offset; 669 pstr->valid_raw_len -= offset; 670 #if DEBUG 671 assert (pstr->valid_len > 0); 672 #endif 673 } 674 } 675 else 676 { 677 /* No, skip all characters until IDX. */ 678 int prev_valid_len = pstr->valid_len; 679 680 #ifdef RE_ENABLE_I18N 681 if (BE (pstr->offsets_needed, 0)) 682 { 683 pstr->len = pstr->raw_len - idx + offset; 684 pstr->stop = pstr->raw_stop - idx + offset; 685 pstr->offsets_needed = 0; 686 } 687 #endif 688 pstr->valid_len = 0; 689 #ifdef RE_ENABLE_I18N 690 if (pstr->mb_cur_max > 1) 691 { 692 int wcs_idx; 693 wint_t wc = WEOF; 694 695 if (pstr->is_utf8) 696 { 697 const unsigned char *raw, *p, *q, *end; 698 699 /* Special case UTF-8. Multi-byte chars start with any 700 byte other than 0x80 - 0xbf. */ 701 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 702 end = raw + (offset - pstr->mb_cur_max); 703 if (end < pstr->raw_mbs) 704 end = pstr->raw_mbs; 705 p = raw + offset - 1; 706 #ifdef _LIBC 707 /* We know the wchar_t encoding is UCS4, so for the simple 708 case, ASCII characters, skip the conversion step. */ 709 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 710 { 711 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 712 /* pstr->valid_len = 0; */ 713 wc = (wchar_t) *p; 714 } 715 else 716 #endif 717 for (; p >= end; --p) 718 if ((*p & 0xc0) != 0x80) 719 { 720 mbstate_t cur_state; 721 wchar_t wc2; 722 int mlen = raw + pstr->len - p; 723 unsigned char buf[6]; 724 size_t mbclen; 725 726 q = p; 727 if (BE (pstr->trans != NULL, 0)) 728 { 729 int i = mlen < 6 ? mlen : 6; 730 while (--i >= 0) 731 buf[i] = pstr->trans[p[i]]; 732 q = buf; 733 } 734 /* XXX Don't use mbrtowc, we know which conversion 735 to use (UTF-8 -> UCS4). */ 736 memset (&cur_state, 0, sizeof (cur_state)); 737 mbclen = __mbrtowc (&wc2, (const char *) p, mlen, 738 &cur_state); 739 if (raw + offset - p <= mbclen 740 && mbclen < (size_t) -2) 741 { 742 memset (&pstr->cur_state, '\0', 743 sizeof (mbstate_t)); 744 pstr->valid_len = mbclen - (raw + offset - p); 745 wc = wc2; 746 } 747 break; 748 } 749 } 750 751 if (wc == WEOF) 752 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 753 if (wc == WEOF) 754 pstr->tip_context 755 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 756 else 757 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 758 && IS_WIDE_WORD_CHAR (wc)) 759 ? CONTEXT_WORD 760 : ((IS_WIDE_NEWLINE (wc) 761 && pstr->newline_anchor) 762 ? CONTEXT_NEWLINE : 0)); 763 if (BE (pstr->valid_len, 0)) 764 { 765 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 766 pstr->wcs[wcs_idx] = WEOF; 767 if (pstr->mbs_allocated) 768 memset (pstr->mbs, 255, pstr->valid_len); 769 } 770 pstr->valid_raw_len = pstr->valid_len; 771 } 772 else 773 #endif /* RE_ENABLE_I18N */ 774 { 775 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 776 pstr->valid_raw_len = 0; 777 if (pstr->trans) 778 c = pstr->trans[c]; 779 pstr->tip_context = (bitset_contain (pstr->word_char, c) 780 ? CONTEXT_WORD 781 : ((IS_NEWLINE (c) && pstr->newline_anchor) 782 ? CONTEXT_NEWLINE : 0)); 783 } 784 } 785 if (!BE (pstr->mbs_allocated, 0)) 786 pstr->mbs += offset; 787 } 788 pstr->raw_mbs_idx = idx; 789 pstr->len -= offset; 790 pstr->stop -= offset; 791 792 /* Then build the buffers. */ 793 #ifdef RE_ENABLE_I18N 794 if (pstr->mb_cur_max > 1) 795 { 796 if (pstr->icase) 797 { 798 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 799 if (BE (ret != REG_NOERROR, 0)) 800 return ret; 801 } 802 else 803 build_wcs_buffer (pstr); 804 } 805 else 806 #endif /* RE_ENABLE_I18N */ 807 if (BE (pstr->mbs_allocated, 0)) 808 { 809 if (pstr->icase) 810 build_upper_buffer (pstr); 811 else if (pstr->trans != NULL) 812 re_string_translate_buffer (pstr); 813 } 814 else 815 pstr->valid_len = pstr->len; 816 817 pstr->cur_idx = 0; 818 return REG_NOERROR; 819 } 820 821 static unsigned char 822 internal_function __attribute ((pure)) 823 re_string_peek_byte_case (const re_string_t *pstr, int idx) 824 { 825 int ch, off; 826 827 /* Handle the common (easiest) cases first. */ 828 if (BE (!pstr->mbs_allocated, 1)) 829 return re_string_peek_byte (pstr, idx); 830 831 #ifdef RE_ENABLE_I18N 832 if (pstr->mb_cur_max > 1 833 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 834 return re_string_peek_byte (pstr, idx); 835 #endif 836 837 off = pstr->cur_idx + idx; 838 #ifdef RE_ENABLE_I18N 839 if (pstr->offsets_needed) 840 off = pstr->offsets[off]; 841 #endif 842 843 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 844 845 #ifdef RE_ENABLE_I18N 846 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 847 this function returns CAPITAL LETTER I instead of first byte of 848 DOTLESS SMALL LETTER I. The latter would confuse the parser, 849 since peek_byte_case doesn't advance cur_idx in any way. */ 850 if (pstr->offsets_needed && !isascii (ch)) 851 return re_string_peek_byte (pstr, idx); 852 #endif 853 854 return ch; 855 } 856 857 static unsigned char 858 internal_function __attribute ((pure)) 859 re_string_fetch_byte_case (re_string_t *pstr) 860 { 861 if (BE (!pstr->mbs_allocated, 1)) 862 return re_string_fetch_byte (pstr); 863 864 #ifdef RE_ENABLE_I18N 865 if (pstr->offsets_needed) 866 { 867 int off, ch; 868 869 /* For tr_TR.UTF-8 [[:islower:]] there is 870 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 871 in that case the whole multi-byte character and return 872 the original letter. On the other side, with 873 [[: DOTLESS SMALL LETTER I return [[:I, as doing 874 anything else would complicate things too much. */ 875 876 if (!re_string_first_byte (pstr, pstr->cur_idx)) 877 return re_string_fetch_byte (pstr); 878 879 off = pstr->offsets[pstr->cur_idx]; 880 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 881 882 if (! isascii (ch)) 883 return re_string_fetch_byte (pstr); 884 885 re_string_skip_bytes (pstr, 886 re_string_char_size_at (pstr, pstr->cur_idx)); 887 return ch; 888 } 889 #endif 890 891 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 892 } 893 894 static void 895 internal_function 896 re_string_destruct (re_string_t *pstr) 897 { 898 #ifdef RE_ENABLE_I18N 899 re_free (pstr->wcs); 900 re_free (pstr->offsets); 901 #endif /* RE_ENABLE_I18N */ 902 if (pstr->mbs_allocated) 903 re_free (pstr->mbs); 904 } 905 906 /* Return the context at IDX in INPUT. */ 907 908 static unsigned int 909 internal_function 910 re_string_context_at (const re_string_t *input, int idx, int eflags) 911 { 912 int c; 913 if (BE (idx < 0, 0)) 914 /* In this case, we use the value stored in input->tip_context, 915 since we can't know the character in input->mbs[-1] here. */ 916 return input->tip_context; 917 if (BE (idx == input->len, 0)) 918 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 919 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 920 #ifdef RE_ENABLE_I18N 921 if (input->mb_cur_max > 1) 922 { 923 wint_t wc; 924 int wc_idx = idx; 925 while(input->wcs[wc_idx] == WEOF) 926 { 927 #ifdef DEBUG 928 /* It must not happen. */ 929 assert (wc_idx >= 0); 930 #endif 931 --wc_idx; 932 if (wc_idx < 0) 933 return input->tip_context; 934 } 935 wc = input->wcs[wc_idx]; 936 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 937 return CONTEXT_WORD; 938 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 939 ? CONTEXT_NEWLINE : 0); 940 } 941 else 942 #endif 943 { 944 c = re_string_byte_at (input, idx); 945 if (bitset_contain (input->word_char, c)) 946 return CONTEXT_WORD; 947 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 948 } 949 } 950 951 /* Functions for set operation. */ 952 953 static reg_errcode_t 954 internal_function 955 re_node_set_alloc (re_node_set *set, int size) 956 { 957 set->alloc = size; 958 set->nelem = 0; 959 set->elems = re_malloc (int, size); 960 if (BE (set->elems == NULL, 0)) 961 return REG_ESPACE; 962 return REG_NOERROR; 963 } 964 965 static reg_errcode_t 966 internal_function 967 re_node_set_init_1 (re_node_set *set, int elem) 968 { 969 set->alloc = 1; 970 set->nelem = 1; 971 set->elems = re_malloc (int, 1); 972 if (BE (set->elems == NULL, 0)) 973 { 974 set->alloc = set->nelem = 0; 975 return REG_ESPACE; 976 } 977 set->elems[0] = elem; 978 return REG_NOERROR; 979 } 980 981 static reg_errcode_t 982 internal_function 983 re_node_set_init_2 (re_node_set *set, int elem1, int elem2) 984 { 985 set->alloc = 2; 986 set->elems = re_malloc (int, 2); 987 if (BE (set->elems == NULL, 0)) 988 return REG_ESPACE; 989 if (elem1 == elem2) 990 { 991 set->nelem = 1; 992 set->elems[0] = elem1; 993 } 994 else 995 { 996 set->nelem = 2; 997 if (elem1 < elem2) 998 { 999 set->elems[0] = elem1; 1000 set->elems[1] = elem2; 1001 } 1002 else 1003 { 1004 set->elems[0] = elem2; 1005 set->elems[1] = elem1; 1006 } 1007 } 1008 return REG_NOERROR; 1009 } 1010 1011 static reg_errcode_t 1012 internal_function 1013 re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1014 { 1015 dest->nelem = src->nelem; 1016 if (src->nelem > 0) 1017 { 1018 dest->alloc = dest->nelem; 1019 dest->elems = re_malloc (int, dest->alloc); 1020 if (BE (dest->elems == NULL, 0)) 1021 { 1022 dest->alloc = dest->nelem = 0; 1023 return REG_ESPACE; 1024 } 1025 memcpy (dest->elems, src->elems, src->nelem * sizeof (int)); 1026 } 1027 else 1028 re_node_set_init_empty (dest); 1029 return REG_NOERROR; 1030 } 1031 1032 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1033 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1034 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1035 1036 static reg_errcode_t 1037 internal_function 1038 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1039 const re_node_set *src2) 1040 { 1041 int i1, i2, is, id, delta, sbase; 1042 if (src1->nelem == 0 || src2->nelem == 0) 1043 return REG_NOERROR; 1044 1045 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1046 conservative estimate. */ 1047 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1048 { 1049 int new_alloc = src1->nelem + src2->nelem + dest->alloc; 1050 int *new_elems = re_realloc (dest->elems, int, new_alloc); 1051 if (BE (new_elems == NULL, 0)) 1052 return REG_ESPACE; 1053 dest->elems = new_elems; 1054 dest->alloc = new_alloc; 1055 } 1056 1057 /* Find the items in the intersection of SRC1 and SRC2, and copy 1058 into the top of DEST those that are not already in DEST itself. */ 1059 sbase = dest->nelem + src1->nelem + src2->nelem; 1060 i1 = src1->nelem - 1; 1061 i2 = src2->nelem - 1; 1062 id = dest->nelem - 1; 1063 for (;;) 1064 { 1065 if (src1->elems[i1] == src2->elems[i2]) 1066 { 1067 /* Try to find the item in DEST. Maybe we could binary search? */ 1068 while (id >= 0 && dest->elems[id] > src1->elems[i1]) 1069 --id; 1070 1071 if (id < 0 || dest->elems[id] != src1->elems[i1]) 1072 dest->elems[--sbase] = src1->elems[i1]; 1073 1074 if (--i1 < 0 || --i2 < 0) 1075 break; 1076 } 1077 1078 /* Lower the highest of the two items. */ 1079 else if (src1->elems[i1] < src2->elems[i2]) 1080 { 1081 if (--i2 < 0) 1082 break; 1083 } 1084 else 1085 { 1086 if (--i1 < 0) 1087 break; 1088 } 1089 } 1090 1091 id = dest->nelem - 1; 1092 is = dest->nelem + src1->nelem + src2->nelem - 1; 1093 delta = is - sbase + 1; 1094 1095 /* Now copy. When DELTA becomes zero, the remaining 1096 DEST elements are already in place; this is more or 1097 less the same loop that is in re_node_set_merge. */ 1098 dest->nelem += delta; 1099 if (delta > 0 && id >= 0) 1100 for (;;) 1101 { 1102 if (dest->elems[is] > dest->elems[id]) 1103 { 1104 /* Copy from the top. */ 1105 dest->elems[id + delta--] = dest->elems[is--]; 1106 if (delta == 0) 1107 break; 1108 } 1109 else 1110 { 1111 /* Slide from the bottom. */ 1112 dest->elems[id + delta] = dest->elems[id]; 1113 if (--id < 0) 1114 break; 1115 } 1116 } 1117 1118 /* Copy remaining SRC elements. */ 1119 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int)); 1120 1121 return REG_NOERROR; 1122 } 1123 1124 /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1125 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1126 1127 static reg_errcode_t 1128 internal_function 1129 re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1130 const re_node_set *src2) 1131 { 1132 int i1, i2, id; 1133 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1134 { 1135 dest->alloc = src1->nelem + src2->nelem; 1136 dest->elems = re_malloc (int, dest->alloc); 1137 if (BE (dest->elems == NULL, 0)) 1138 return REG_ESPACE; 1139 } 1140 else 1141 { 1142 if (src1 != NULL && src1->nelem > 0) 1143 return re_node_set_init_copy (dest, src1); 1144 else if (src2 != NULL && src2->nelem > 0) 1145 return re_node_set_init_copy (dest, src2); 1146 else 1147 re_node_set_init_empty (dest); 1148 return REG_NOERROR; 1149 } 1150 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1151 { 1152 if (src1->elems[i1] > src2->elems[i2]) 1153 { 1154 dest->elems[id++] = src2->elems[i2++]; 1155 continue; 1156 } 1157 if (src1->elems[i1] == src2->elems[i2]) 1158 ++i2; 1159 dest->elems[id++] = src1->elems[i1++]; 1160 } 1161 if (i1 < src1->nelem) 1162 { 1163 memcpy (dest->elems + id, src1->elems + i1, 1164 (src1->nelem - i1) * sizeof (int)); 1165 id += src1->nelem - i1; 1166 } 1167 else if (i2 < src2->nelem) 1168 { 1169 memcpy (dest->elems + id, src2->elems + i2, 1170 (src2->nelem - i2) * sizeof (int)); 1171 id += src2->nelem - i2; 1172 } 1173 dest->nelem = id; 1174 return REG_NOERROR; 1175 } 1176 1177 /* Calculate the union set of the sets DEST and SRC. And store it to 1178 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1179 1180 static reg_errcode_t 1181 internal_function 1182 re_node_set_merge (re_node_set *dest, const re_node_set *src) 1183 { 1184 int is, id, sbase, delta; 1185 if (src == NULL || src->nelem == 0) 1186 return REG_NOERROR; 1187 if (dest->alloc < 2 * src->nelem + dest->nelem) 1188 { 1189 int new_alloc = 2 * (src->nelem + dest->alloc); 1190 int *new_buffer = re_realloc (dest->elems, int, new_alloc); 1191 if (BE (new_buffer == NULL, 0)) 1192 return REG_ESPACE; 1193 dest->elems = new_buffer; 1194 dest->alloc = new_alloc; 1195 } 1196 1197 if (BE (dest->nelem == 0, 0)) 1198 { 1199 dest->nelem = src->nelem; 1200 memcpy (dest->elems, src->elems, src->nelem * sizeof (int)); 1201 return REG_NOERROR; 1202 } 1203 1204 /* Copy into the top of DEST the items of SRC that are not 1205 found in DEST. Maybe we could binary search in DEST? */ 1206 for (sbase = dest->nelem + 2 * src->nelem, 1207 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; ) 1208 { 1209 if (dest->elems[id] == src->elems[is]) 1210 is--, id--; 1211 else if (dest->elems[id] < src->elems[is]) 1212 dest->elems[--sbase] = src->elems[is--]; 1213 else /* if (dest->elems[id] > src->elems[is]) */ 1214 --id; 1215 } 1216 1217 if (is >= 0) 1218 { 1219 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1220 sbase -= is + 1; 1221 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int)); 1222 } 1223 1224 id = dest->nelem - 1; 1225 is = dest->nelem + 2 * src->nelem - 1; 1226 delta = is - sbase + 1; 1227 if (delta == 0) 1228 return REG_NOERROR; 1229 1230 /* Now copy. When DELTA becomes zero, the remaining 1231 DEST elements are already in place. */ 1232 dest->nelem += delta; 1233 for (;;) 1234 { 1235 if (dest->elems[is] > dest->elems[id]) 1236 { 1237 /* Copy from the top. */ 1238 dest->elems[id + delta--] = dest->elems[is--]; 1239 if (delta == 0) 1240 break; 1241 } 1242 else 1243 { 1244 /* Slide from the bottom. */ 1245 dest->elems[id + delta] = dest->elems[id]; 1246 if (--id < 0) 1247 { 1248 /* Copy remaining SRC elements. */ 1249 memcpy (dest->elems, dest->elems + sbase, 1250 delta * sizeof (int)); 1251 break; 1252 } 1253 } 1254 } 1255 1256 return REG_NOERROR; 1257 } 1258 1259 /* Insert the new element ELEM to the re_node_set* SET. 1260 SET should not already have ELEM. 1261 return -1 if an error is occured, return 1 otherwise. */ 1262 1263 static int 1264 internal_function 1265 re_node_set_insert (re_node_set *set, int elem) 1266 { 1267 int idx; 1268 /* In case the set is empty. */ 1269 if (set->alloc == 0) 1270 { 1271 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1)) 1272 return 1; 1273 else 1274 return -1; 1275 } 1276 1277 if (BE (set->nelem, 0) == 0) 1278 { 1279 /* We already guaranteed above that set->alloc != 0. */ 1280 set->elems[0] = elem; 1281 ++set->nelem; 1282 return 1; 1283 } 1284 1285 /* Realloc if we need. */ 1286 if (set->alloc == set->nelem) 1287 { 1288 int *new_elems; 1289 set->alloc = set->alloc * 2; 1290 new_elems = re_realloc (set->elems, int, set->alloc); 1291 if (BE (new_elems == NULL, 0)) 1292 return -1; 1293 set->elems = new_elems; 1294 } 1295 1296 /* Move the elements which follows the new element. Test the 1297 first element separately to skip a check in the inner loop. */ 1298 if (elem < set->elems[0]) 1299 { 1300 idx = 0; 1301 for (idx = set->nelem; idx > 0; idx--) 1302 set->elems[idx] = set->elems[idx - 1]; 1303 } 1304 else 1305 { 1306 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1307 set->elems[idx] = set->elems[idx - 1]; 1308 } 1309 1310 /* Insert the new element. */ 1311 set->elems[idx] = elem; 1312 ++set->nelem; 1313 return 1; 1314 } 1315 1316 /* Insert the new element ELEM to the re_node_set* SET. 1317 SET should not already have any element greater than or equal to ELEM. 1318 Return -1 if an error is occured, return 1 otherwise. */ 1319 1320 static int 1321 internal_function 1322 re_node_set_insert_last (re_node_set *set, int elem) 1323 { 1324 /* Realloc if we need. */ 1325 if (set->alloc == set->nelem) 1326 { 1327 int *new_elems; 1328 set->alloc = (set->alloc + 1) * 2; 1329 new_elems = re_realloc (set->elems, int, set->alloc); 1330 if (BE (new_elems == NULL, 0)) 1331 return -1; 1332 set->elems = new_elems; 1333 } 1334 1335 /* Insert the new element. */ 1336 set->elems[set->nelem++] = elem; 1337 return 1; 1338 } 1339 1340 /* Compare two node sets SET1 and SET2. 1341 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */ 1342 1343 static int 1344 internal_function __attribute ((pure)) 1345 re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1346 { 1347 int i; 1348 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1349 return 0; 1350 for (i = set1->nelem ; --i >= 0 ; ) 1351 if (set1->elems[i] != set2->elems[i]) 1352 return 0; 1353 return 1; 1354 } 1355 1356 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1357 1358 static int 1359 internal_function __attribute ((pure)) 1360 re_node_set_contains (const re_node_set *set, int elem) 1361 { 1362 unsigned int idx, right, mid; 1363 if (set->nelem <= 0) 1364 return 0; 1365 1366 /* Binary search the element. */ 1367 idx = 0; 1368 right = set->nelem - 1; 1369 while (idx < right) 1370 { 1371 mid = (idx + right) / 2; 1372 if (set->elems[mid] < elem) 1373 idx = mid + 1; 1374 else 1375 right = mid; 1376 } 1377 return set->elems[idx] == elem ? idx + 1 : 0; 1378 } 1379 1380 static void 1381 internal_function 1382 re_node_set_remove_at (re_node_set *set, int idx) 1383 { 1384 if (idx < 0 || idx >= set->nelem) 1385 return; 1386 --set->nelem; 1387 for (; idx < set->nelem; idx++) 1388 set->elems[idx] = set->elems[idx + 1]; 1389 } 1390 1391 1392 /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1393 Or return -1, if an error will be occured. */ 1394 1395 static int 1396 internal_function 1397 re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1398 { 1399 int type = token.type; 1400 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1401 { 1402 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1403 int *new_nexts, *new_indices; 1404 re_node_set *new_edests, *new_eclosures; 1405 re_token_t *new_nodes; 1406 1407 /* Avoid overflows. */ 1408 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0)) 1409 return -1; 1410 1411 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1412 if (BE (new_nodes == NULL, 0)) 1413 return -1; 1414 dfa->nodes = new_nodes; 1415 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc); 1416 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc); 1417 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1418 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1419 if (BE (new_nexts == NULL || new_indices == NULL 1420 || new_edests == NULL || new_eclosures == NULL, 0)) 1421 return -1; 1422 dfa->nexts = new_nexts; 1423 dfa->org_indices = new_indices; 1424 dfa->edests = new_edests; 1425 dfa->eclosures = new_eclosures; 1426 dfa->nodes_alloc = new_nodes_alloc; 1427 } 1428 dfa->nodes[dfa->nodes_len] = token; 1429 dfa->nodes[dfa->nodes_len].constraint = 0; 1430 #ifdef RE_ENABLE_I18N 1431 dfa->nodes[dfa->nodes_len].accept_mb = 1432 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1433 #endif 1434 dfa->nexts[dfa->nodes_len] = -1; 1435 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1436 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1437 return dfa->nodes_len++; 1438 } 1439 1440 static inline unsigned int 1441 internal_function 1442 calc_state_hash (const re_node_set *nodes, unsigned int context) 1443 { 1444 unsigned int hash = nodes->nelem + context; 1445 int i; 1446 for (i = 0 ; i < nodes->nelem ; i++) 1447 hash += nodes->elems[i]; 1448 return hash; 1449 } 1450 1451 /* Search for the state whose node_set is equivalent to NODES. 1452 Return the pointer to the state, if we found it in the DFA. 1453 Otherwise create the new one and return it. In case of an error 1454 return NULL and set the error code in ERR. 1455 Note: - We assume NULL as the invalid state, then it is possible that 1456 return value is NULL and ERR is REG_NOERROR. 1457 - We never return non-NULL value in case of any errors, it is for 1458 optimization. */ 1459 1460 static re_dfastate_t * 1461 internal_function 1462 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1463 const re_node_set *nodes) 1464 { 1465 unsigned int hash; 1466 re_dfastate_t *new_state; 1467 struct re_state_table_entry *spot; 1468 int i; 1469 if (BE (nodes->nelem == 0, 0)) 1470 { 1471 *err = REG_NOERROR; 1472 return NULL; 1473 } 1474 hash = calc_state_hash (nodes, 0); 1475 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1476 1477 for (i = 0 ; i < spot->num ; i++) 1478 { 1479 re_dfastate_t *state = spot->array[i]; 1480 if (hash != state->hash) 1481 continue; 1482 if (re_node_set_compare (&state->nodes, nodes)) 1483 return state; 1484 } 1485 1486 /* There are no appropriate state in the dfa, create the new one. */ 1487 new_state = create_ci_newstate (dfa, nodes, hash); 1488 if (BE (new_state == NULL, 0)) 1489 *err = REG_ESPACE; 1490 1491 return new_state; 1492 } 1493 1494 /* Search for the state whose node_set is equivalent to NODES and 1495 whose context is equivalent to CONTEXT. 1496 Return the pointer to the state, if we found it in the DFA. 1497 Otherwise create the new one and return it. In case of an error 1498 return NULL and set the error code in ERR. 1499 Note: - We assume NULL as the invalid state, then it is possible that 1500 return value is NULL and ERR is REG_NOERROR. 1501 - We never return non-NULL value in case of any errors, it is for 1502 optimization. */ 1503 1504 static re_dfastate_t * 1505 internal_function 1506 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1507 const re_node_set *nodes, unsigned int context) 1508 { 1509 unsigned int hash; 1510 re_dfastate_t *new_state; 1511 struct re_state_table_entry *spot; 1512 int i; 1513 if (nodes->nelem == 0) 1514 { 1515 *err = REG_NOERROR; 1516 return NULL; 1517 } 1518 hash = calc_state_hash (nodes, context); 1519 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1520 1521 for (i = 0 ; i < spot->num ; i++) 1522 { 1523 re_dfastate_t *state = spot->array[i]; 1524 if (state->hash == hash 1525 && state->context == context 1526 && re_node_set_compare (state->entrance_nodes, nodes)) 1527 return state; 1528 } 1529 /* There are no appropriate state in `dfa', create the new one. */ 1530 new_state = create_cd_newstate (dfa, nodes, context, hash); 1531 if (BE (new_state == NULL, 0)) 1532 *err = REG_ESPACE; 1533 1534 return new_state; 1535 } 1536 1537 /* Finish initialization of the new state NEWSTATE, and using its hash value 1538 HASH put in the appropriate bucket of DFA's state table. Return value 1539 indicates the error code if failed. */ 1540 1541 static reg_errcode_t 1542 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1543 unsigned int hash) 1544 { 1545 struct re_state_table_entry *spot; 1546 reg_errcode_t err; 1547 int i; 1548 1549 newstate->hash = hash; 1550 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1551 if (BE (err != REG_NOERROR, 0)) 1552 return REG_ESPACE; 1553 for (i = 0; i < newstate->nodes.nelem; i++) 1554 { 1555 int elem = newstate->nodes.elems[i]; 1556 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1557 re_node_set_insert_last (&newstate->non_eps_nodes, elem); 1558 } 1559 1560 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1561 if (BE (spot->alloc <= spot->num, 0)) 1562 { 1563 int new_alloc = 2 * spot->num + 2; 1564 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1565 new_alloc); 1566 if (BE (new_array == NULL, 0)) 1567 return REG_ESPACE; 1568 spot->array = new_array; 1569 spot->alloc = new_alloc; 1570 } 1571 spot->array[spot->num++] = newstate; 1572 return REG_NOERROR; 1573 } 1574 1575 static void 1576 free_state (re_dfastate_t *state) 1577 { 1578 re_node_set_free (&state->non_eps_nodes); 1579 re_node_set_free (&state->inveclosure); 1580 if (state->entrance_nodes != &state->nodes) 1581 { 1582 re_node_set_free (state->entrance_nodes); 1583 re_free (state->entrance_nodes); 1584 } 1585 re_node_set_free (&state->nodes); 1586 re_free (state->word_trtable); 1587 re_free (state->trtable); 1588 re_free (state); 1589 } 1590 1591 /* Create the new state which is independ of contexts. 1592 Return the new state if succeeded, otherwise return NULL. */ 1593 1594 static re_dfastate_t * 1595 internal_function 1596 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1597 unsigned int hash) 1598 { 1599 int i; 1600 reg_errcode_t err; 1601 re_dfastate_t *newstate; 1602 1603 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1604 if (BE (newstate == NULL, 0)) 1605 return NULL; 1606 err = re_node_set_init_copy (&newstate->nodes, nodes); 1607 if (BE (err != REG_NOERROR, 0)) 1608 { 1609 re_free (newstate); 1610 return NULL; 1611 } 1612 1613 newstate->entrance_nodes = &newstate->nodes; 1614 for (i = 0 ; i < nodes->nelem ; i++) 1615 { 1616 re_token_t *node = dfa->nodes + nodes->elems[i]; 1617 re_token_type_t type = node->type; 1618 if (type == CHARACTER && !node->constraint) 1619 continue; 1620 #ifdef RE_ENABLE_I18N 1621 newstate->accept_mb |= node->accept_mb; 1622 #endif /* RE_ENABLE_I18N */ 1623 1624 /* If the state has the halt node, the state is a halt state. */ 1625 if (type == END_OF_RE) 1626 newstate->halt = 1; 1627 else if (type == OP_BACK_REF) 1628 newstate->has_backref = 1; 1629 else if (type == ANCHOR || node->constraint) 1630 newstate->has_constraint = 1; 1631 } 1632 err = register_state (dfa, newstate, hash); 1633 if (BE (err != REG_NOERROR, 0)) 1634 { 1635 free_state (newstate); 1636 newstate = NULL; 1637 } 1638 return newstate; 1639 } 1640 1641 /* Create the new state which is depend on the context CONTEXT. 1642 Return the new state if succeeded, otherwise return NULL. */ 1643 1644 static re_dfastate_t * 1645 internal_function 1646 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1647 unsigned int context, unsigned int hash) 1648 { 1649 int i, nctx_nodes = 0; 1650 reg_errcode_t err; 1651 re_dfastate_t *newstate; 1652 1653 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1654 if (BE (newstate == NULL, 0)) 1655 return NULL; 1656 err = re_node_set_init_copy (&newstate->nodes, nodes); 1657 if (BE (err != REG_NOERROR, 0)) 1658 { 1659 re_free (newstate); 1660 return NULL; 1661 } 1662 1663 newstate->context = context; 1664 newstate->entrance_nodes = &newstate->nodes; 1665 1666 for (i = 0 ; i < nodes->nelem ; i++) 1667 { 1668 re_token_t *node = dfa->nodes + nodes->elems[i]; 1669 re_token_type_t type = node->type; 1670 unsigned int constraint = node->constraint; 1671 1672 if (type == CHARACTER && !constraint) 1673 continue; 1674 #ifdef RE_ENABLE_I18N 1675 newstate->accept_mb |= node->accept_mb; 1676 #endif /* RE_ENABLE_I18N */ 1677 1678 /* If the state has the halt node, the state is a halt state. */ 1679 if (type == END_OF_RE) 1680 newstate->halt = 1; 1681 else if (type == OP_BACK_REF) 1682 newstate->has_backref = 1; 1683 1684 if (constraint) 1685 { 1686 if (newstate->entrance_nodes == &newstate->nodes) 1687 { 1688 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1689 if (BE (newstate->entrance_nodes == NULL, 0)) 1690 { 1691 free_state (newstate); 1692 return NULL; 1693 } 1694 re_node_set_init_copy (newstate->entrance_nodes, nodes); 1695 nctx_nodes = 0; 1696 newstate->has_constraint = 1; 1697 } 1698 1699 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1700 { 1701 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1702 ++nctx_nodes; 1703 } 1704 } 1705 } 1706 err = register_state (dfa, newstate, hash); 1707 if (BE (err != REG_NOERROR, 0)) 1708 { 1709 free_state (newstate); 1710 newstate = NULL; 1711 } 1712 return newstate; 1713 } 1714