1 /* 2 Open Tracker License 3 4 Terms and Conditions 5 6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved. 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy of 9 this software and associated documentation files (the "Software"), to deal in 10 the Software without restriction, including without limitation the rights to 11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 of the Software, and to permit persons to whom the Software is furnished to do 13 so, subject to the following conditions: 14 15 The above copyright notice and this permission notice applies to all licensees 16 and shall be included in all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION 23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 25 Except as contained in this notice, the name of Be Incorporated shall not be 26 used in advertising or otherwise to promote the sale, use or other dealings in 27 this Software without prior written authorization from Be Incorporated. 28 29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks 30 of Be Incorporated in the United States and other countries. Other brand product 31 names are registered trademarks or trademarks of their respective holders. 32 All rights reserved. 33 */ 34 35 // This code is based on regexp.c, v.1.3 by Henry Spencer: 36 37 // @(#)regexp.c 1.3 of 18 April 87 38 // 39 // Copyright (c) 1986 by University of Toronto. 40 // Written by Henry Spencer. Not derived from licensed software. 41 // 42 // Permission is granted to anyone to use this software for any 43 // purpose on any computer system, and to redistribute it freely, 44 // subject to the following restrictions: 45 // 46 // 1. The author is not responsible for the consequences of use of 47 // this software, no matter how awful, even if they arise 48 // from defects in it. 49 // 50 // 2. The origin of this software must not be misrepresented, either 51 // by explicit claim or by omission. 52 // 53 // 3. Altered versions must be plainly marked as such, and must not 54 // be misrepresented as being the original software. 55 // 56 // Beware that some of this code is subtly aware of the way operator 57 // precedence is structured in regular expressions. Serious changes in 58 // regular-expression syntax might require a total rethink. 59 // 60 61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker 62 // project (www.opentracker.org), Jul 11, 2000. 63 64 65 #include <stdlib.h> 66 #include <stdio.h> 67 #include <string.h> 68 69 #include <Catalog.h> 70 #include <Errors.h> 71 72 #include "RegExp.h" 73 74 75 #undef B_TRANSLATION_CONTEXT 76 #define B_TRANSLATION_CONTEXT "libtracker" 77 78 79 // The first byte of the regexp internal "program" is actually this magic 80 // number; the start node begins in the second byte. 81 82 const uint8 kRegExpMagic = 0234; 83 84 // The "internal use only" fields in RegExp.h are present to pass info from 85 // compile to execute that permits the execute phase to run lots faster on 86 // simple cases. They are: 87 // 88 // regstart char that must begin a match; '\0' if none obvious 89 // reganch is the match anchored (at beginning-of-line only)? 90 // regmust string (pointer into program) that match must include, or NULL 91 // regmlen length of regmust string 92 // 93 // Regstart and reganch permit very fast decisions on suitable starting points 94 // for a match, cutting down the work a lot. Regmust permits fast rejection 95 // of lines that cannot possibly match. The regmust tests are costly enough 96 // that Compile() supplies a regmust only if the r.e. contains something 97 // potentially expensive (at present, the only such thing detected is * or + 98 // at the start of the r.e., which can involve a lot of backup). Regmlen is 99 // supplied because the test in RunMatcher() needs it and Compile() is 100 // computing it anyway. 101 // 102 // 103 // 104 // Structure for regexp "program". This is essentially a linear encoding 105 // of a nondeterministic finite-state machine (aka syntax charts or 106 // "railroad normal form" in parsing technology). Each node is an opcode 107 // plus a "next" pointer, possibly plus an operand. "Next" pointers of 108 // all nodes except kRegExpBranch implement concatenation; a "next" pointer 109 // with a kRegExpBranch on both ends of it is connecting two alternatives. 110 // (Here we have one of the subtle syntax dependencies: an individual 111 // kRegExpBranch (as opposed to a collection of them) is never concatenated 112 // with anything because of operator precedence.) The operand of some types 113 // of node is a literal string; for others, it is a node leading into a 114 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first 115 // node of the branch. (NB this is* not* a tree structure: the tail of the 116 // branch connects to the thing following the set of kRegExpBranches). 117 // The opcodes are: 118 // 119 120 // definition number opnd? meaning 121 enum { 122 kRegExpEnd = 0, // no End of program. 123 kRegExpBol = 1, // no Match "" at beginning of line. 124 kRegExpEol = 2, // no Match "" at end of line. 125 kRegExpAny = 3, // no Match any one character. 126 kRegExpAnyOf = 4, // str Match any character in this string. 127 kRegExpAnyBut = 5, // str Match any character not in this string. 128 kRegExpBranch = 6, // node Match this alternative, or the next... 129 kRegExpBack = 7, // no Match "", "next" ptr points backward. 130 kRegExpExactly = 8, // str Match this string. 131 kRegExpNothing = 9, // no Match empty string. 132 kRegExpStar = 10, // node Match this (simple) thing 0 or more times. 133 kRegExpPlus = 11, // node Match this (simple) thing 1 or more times. 134 kRegExpOpen = 20, // no Mark this point in input as start of #n. 135 // kRegExpOpen + 1 is number 1, etc. 136 kRegExpClose = 30 // no Analogous to kRegExpOpen. 137 }; 138 139 // 140 // Opcode notes: 141 // 142 // kRegExpBranch The set of branches constituting a single choice are 143 // hooked together with their "next" pointers, since precedence prevents 144 // anything being concatenated to any individual branch. The 145 // "next" pointer of the last kRegExpBranch in a choice points to the 146 // thing following the whole choice. This is also where the 147 // final "next" pointer of each individual branch points; each 148 // branch starts with the operand node of a kRegExpBranch node. 149 // 150 // kRegExpBack Normal "next" pointers all implicitly point forward; 151 // kRegExpBack exists to make loop structures possible. 152 // 153 // kRegExpStar,kRegExpPlus '?', and complex '*' and '+', are implemented as 154 // circular kRegExpBranch structures using kRegExpBack. Simple cases 155 // (one character per match) are implemented with kRegExpStar and 156 // kRegExpPlus for speed and to minimize recursive plunges. 157 // 158 // kRegExpOpen,kRegExpClose ...are numbered at compile time. 159 // 160 // 161 // 162 // A node is one char of opcode followed by two chars of "next" pointer. 163 // "Next" pointers are stored as two 8-bit pieces, high order first. The 164 // value is a positive offset from the opcode of the node containing it. 165 // An operand, if any, simply follows the node. (Note that much of the 166 // code generation knows about this implicit relationship.) 167 // 168 // Using two bytes for the "next" pointer is vast overkill for most things, 169 // but allows patterns to get big without disasters. 170 // 171 172 const char* kMeta = "^$.[()|?+*\\"; 173 const int32 kMaxSize = 32767L; 174 // Probably could be 65535L. 175 176 177 // Flags to be passed up and down: 178 enum { 179 kHasWidth = 01, // Known never to match null string. 180 kSimple = 02, // Simple enough to be kRegExpStar/kRegExpPlus operand. 181 kSPStart = 04, // Starts with * or +. 182 kWorst = 0 // Worst case. 183 }; 184 185 186 const char* kRegExpErrorStringArray[] = { 187 B_TRANSLATE_MARK("Unmatched parenthesis."), 188 B_TRANSLATE_MARK("Expression too long."), 189 B_TRANSLATE_MARK("Too many parenthesis."), 190 B_TRANSLATE_MARK("Junk on end."), 191 B_TRANSLATE_MARK("*+? operand may be empty."), 192 B_TRANSLATE_MARK("Nested *?+."), 193 B_TRANSLATE_MARK("Invalid bracket range."), 194 B_TRANSLATE_MARK("Unmatched brackets."), 195 B_TRANSLATE_MARK("Internal error."), 196 B_TRANSLATE_MARK("?+* follows nothing."), 197 B_TRANSLATE_MARK("Trailing \\."), 198 B_TRANSLATE_MARK("Corrupted expression."), 199 B_TRANSLATE_MARK("Memory corruption."), 200 B_TRANSLATE_MARK("Corrupted pointers."), 201 B_TRANSLATE_MARK("Corrupted opcode.") 202 }; 203 204 205 #ifdef DEBUG 206 int32 regnarrate = 0; 207 #endif 208 209 210 // #pragma mark - RegExp 211 212 213 RegExp::RegExp() 214 : 215 fError(B_OK), 216 fRegExp(NULL), 217 fInputScanPointer(NULL), 218 fParenthesisCount(0), 219 fDummy('\0'), 220 fCodeEmitPointer(NULL), 221 fCodeSize(0), 222 fStringInputPointer(NULL), 223 fRegBol(NULL), 224 fStartPArrayPointer(NULL), 225 fEndPArrayPointer(NULL) 226 { 227 } 228 229 230 RegExp::RegExp(const char* pattern) 231 : 232 fError(B_OK), 233 fRegExp(NULL) 234 { 235 fRegExp = Compile(pattern); 236 } 237 238 239 RegExp::RegExp(const BString& pattern) 240 : 241 fError(B_OK), 242 fRegExp(NULL) 243 { 244 fRegExp = Compile(pattern.String()); 245 } 246 247 248 RegExp::~RegExp() 249 { 250 free(fRegExp); 251 } 252 253 254 status_t 255 RegExp::InitCheck() const 256 { 257 return fError; 258 } 259 260 261 status_t 262 RegExp::SetTo(const char* pattern) 263 { 264 fError = B_OK; 265 free(fRegExp); 266 fRegExp = Compile(pattern); 267 return fError; 268 } 269 270 271 status_t 272 RegExp::SetTo(const BString& pattern) 273 { 274 fError = B_OK; 275 free(fRegExp); 276 fRegExp = Compile(pattern.String()); 277 return fError; 278 } 279 280 281 bool 282 RegExp::Matches(const char* string) const 283 { 284 if (fRegExp == NULL || string == NULL) 285 return false; 286 287 return RunMatcher(fRegExp, string) == 1; 288 } 289 290 291 bool 292 RegExp::Matches(const BString& string) const 293 { 294 if (fRegExp == NULL) 295 return false; 296 297 return RunMatcher(fRegExp, string.String()) == 1; 298 } 299 300 301 // 302 // - Compile - compile a regular expression into internal code 303 // 304 // We can't allocate space until we know how big the compiled form will be, 305 // but we can't compile it (and thus know how big it is) until we've got a 306 // place to put the code. So we cheat: we compile it twice, once with code 307 // generation turned off and size counting turned on, and once "for real". 308 // This also means that we don't allocate space until we are sure that the 309 // thing really will compile successfully, and we never have to move the 310 // code and thus invalidate pointers into it. (Note that it has to be in 311 // one piece because free() must be able to free it all.) 312 // 313 // Beware that the optimization-preparation code in here knows about some 314 // of the structure of the compiled regexp. 315 regexp* 316 RegExp::Compile(const char* exp) 317 { 318 regexp* r; 319 const char* scan; 320 const char* longest; 321 int32 len; 322 int32 flags; 323 324 if (exp == NULL) { 325 SetError(B_BAD_VALUE); 326 return NULL; 327 } 328 329 // First pass: determine size, legality. 330 fInputScanPointer = exp; 331 fParenthesisCount = 1; 332 fCodeSize = 0L; 333 fCodeEmitPointer = &fDummy; 334 Char(kRegExpMagic); 335 if (Reg(0, &flags) == NULL) 336 return NULL; 337 338 // Small enough for pointer-storage convention? 339 if (fCodeSize >= kMaxSize) { 340 SetError(REGEXP_TOO_BIG); 341 return NULL; 342 } 343 344 r = (regexp*)malloc(sizeof(regexp) + fCodeSize); 345 // Allocate space 346 347 if (r == NULL) { 348 SetError(B_NO_MEMORY); 349 return NULL; 350 } 351 352 // Second pass: emit code. 353 fInputScanPointer = exp; 354 fParenthesisCount = 1; 355 fCodeEmitPointer = r->program; 356 Char(kRegExpMagic); 357 if (Reg(0, &flags) == NULL) { 358 free(r); 359 return NULL; 360 } 361 362 // Dig out information for optimizations. 363 r->regstart = '\0'; 364 // Worst-case defaults. 365 r->reganch = 0; 366 r->regmust = NULL; 367 r->regmlen = 0; 368 scan = r->program + 1; 369 // First kRegExpBranch. 370 if (*Next((char*)scan) == kRegExpEnd) { 371 // Only one top-level choice. 372 scan = Operand(scan); 373 374 // Starting-point info. 375 if (*scan == kRegExpExactly) 376 r->regstart = *Operand(scan); 377 else if (*scan == kRegExpBol) 378 r->reganch++; 379 380 // If there's something expensive in the r.e., find the 381 // longest literal string that must appear and make it the 382 // regmust. Resolve ties in favor of later strings, since 383 // the regstart check works with the beginning of the r.e. 384 // and avoiding duplication strengthens checking. Not a 385 // strong reason, but sufficient in the absence of others. 386 if ((flags & kSPStart) != 0) { 387 longest = NULL; 388 len = 0; 389 for (; scan != NULL; scan = Next((char*)scan)) { 390 if (*scan == kRegExpExactly 391 && (int32)strlen(Operand(scan)) >= len) { 392 longest = Operand(scan); 393 len = (int32)strlen(Operand(scan)); 394 } 395 } 396 r->regmust = longest; 397 r->regmlen = len; 398 } 399 } 400 401 return r; 402 } 403 404 405 regexp* 406 RegExp::Expression() const 407 { 408 return fRegExp; 409 } 410 411 412 const char* 413 RegExp::ErrorString() const 414 { 415 if (fError >= REGEXP_UNMATCHED_PARENTHESIS 416 && fError <= REGEXP_CORRUPTED_OPCODE) { 417 return B_TRANSLATE_NOCOLLECT( 418 kRegExpErrorStringArray[fError - B_ERRORS_END]); 419 } 420 421 return strerror(fError); 422 } 423 424 425 void 426 RegExp::SetError(status_t error) const 427 { 428 fError = error; 429 } 430 431 432 // 433 // - Reg - regular expression, i.e. main body or parenthesized thing 434 // 435 // Caller must absorb opening parenthesis. 436 // 437 // Combining parenthesis handling with the base level of regular expression 438 // is a trifle forced, but the need to tie the tails of the branches to what 439 // follows makes it hard to avoid. 440 // 441 char* 442 RegExp::Reg(int32 paren, int32* flagp) 443 { 444 char* ret; 445 char* br; 446 char* ender; 447 int32 parno = 0; 448 int32 flags; 449 450 *flagp = kHasWidth; 451 // Tentatively. 452 453 // Make an kRegExpOpen node, if parenthesized. 454 if (paren) { 455 if (fParenthesisCount >= kSubExpressionMax) { 456 SetError(REGEXP_TOO_MANY_PARENTHESIS); 457 return NULL; 458 } 459 parno = fParenthesisCount; 460 fParenthesisCount++; 461 ret = Node((char)(kRegExpOpen + parno)); 462 } else 463 ret = NULL; 464 465 // Pick up the branches, linking them together. 466 br = Branch(&flags); 467 if (br == NULL) 468 return NULL; 469 470 if (ret != NULL) { 471 Tail(ret, br); 472 // kRegExpOpen -> first 473 } else 474 ret = br; 475 476 if (!(flags & kHasWidth)) 477 *flagp &= ~kHasWidth; 478 479 *flagp |= flags&kSPStart; 480 while (*fInputScanPointer == '|') { 481 fInputScanPointer++; 482 br = Branch(&flags); 483 if (br == NULL) 484 return NULL; 485 486 Tail(ret, br); 487 // kRegExpBranch -> kRegExpBranch. 488 if (!(flags & kHasWidth)) 489 *flagp &= ~kHasWidth; 490 491 *flagp |= flags&kSPStart; 492 } 493 494 // Make a closing node, and hook it on the end. 495 ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd); 496 Tail(ret, ender); 497 498 // Hook the tails of the branches to the closing node. 499 for (br = ret; br != NULL; br = Next(br)) 500 OpTail(br, ender); 501 502 // Check for proper termination. 503 if (paren && *fInputScanPointer++ != ')') { 504 SetError(REGEXP_UNMATCHED_PARENTHESIS); 505 return NULL; 506 } else if (!paren && *fInputScanPointer != '\0') { 507 if (*fInputScanPointer == ')') { 508 SetError(REGEXP_UNMATCHED_PARENTHESIS); 509 return NULL; 510 } else { 511 SetError(REGEXP_JUNK_ON_END); 512 return NULL; // "Can't happen". 513 } 514 // NOTREACHED 515 } 516 517 return ret; 518 } 519 520 521 // 522 // - Branch - one alternative of an | operator 523 // 524 // Implements the concatenation operator. 525 // 526 char* 527 RegExp::Branch(int32* flagp) 528 { 529 char* ret; 530 char* chain; 531 char* latest; 532 int32 flags; 533 534 *flagp = kWorst; 535 // Tentatively. 536 537 ret = Node(kRegExpBranch); 538 chain = NULL; 539 while (*fInputScanPointer != '\0' 540 && *fInputScanPointer != '|' 541 && *fInputScanPointer != ')') { 542 latest = Piece(&flags); 543 if (latest == NULL) 544 return NULL; 545 546 *flagp |= flags & kHasWidth; 547 if (chain == NULL) { 548 // First piece. 549 *flagp |= flags & kSPStart; 550 } else 551 Tail(chain, latest); 552 553 chain = latest; 554 } 555 556 if (chain == NULL) { 557 // Loop ran zero times. 558 Node(kRegExpNothing); 559 } 560 561 return ret; 562 } 563 564 565 // 566 // - Piece - something followed by possible [*+?] 567 // 568 // Note that the branching code sequences used for ? and the general cases 569 // of * and + are somewhat optimized: they use the same kRegExpNothing node 570 // as both the endmarker for their branch list and the body of the last 571 // branch. It might seem that this node could be dispensed with entirely, 572 // but the endmarker role is not redundant. 573 // 574 char* 575 RegExp::Piece(int32* flagp) 576 { 577 char* ret; 578 char op; 579 char* next; 580 int32 flags; 581 582 ret = Atom(&flags); 583 if (ret == NULL) 584 return NULL; 585 586 op = *fInputScanPointer; 587 if (!IsMult(op)) { 588 *flagp = flags; 589 return ret; 590 } 591 592 if (!(flags & kHasWidth) && op != '?') { 593 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY); 594 return NULL; 595 } 596 *flagp = op != '+' ? kWorst | kSPStart : kWorst | kHasWidth; 597 598 if (op == '*' && (flags & kSimple)) 599 Insert(kRegExpStar, ret); 600 else if (op == '*') { 601 // Emit x* as (x&|), where & means "self". 602 Insert(kRegExpBranch, ret); 603 // Either x 604 OpTail(ret, Node(kRegExpBack)); 605 // and loop 606 OpTail(ret, ret); 607 // back 608 Tail(ret, Node(kRegExpBranch)); 609 // or 610 Tail(ret, Node(kRegExpNothing)); 611 // null. 612 } else if (op == '+' && (flags & kSimple)) 613 Insert(kRegExpPlus, ret); 614 else if (op == '+') { 615 // Emit x+ as x(&|), where & means "self". 616 next = Node(kRegExpBranch); 617 // Either 618 Tail(ret, next); 619 Tail(Node(kRegExpBack), ret); 620 // loop back 621 Tail(next, Node(kRegExpBranch)); 622 // or 623 Tail(ret, Node(kRegExpNothing)); 624 // null. 625 } else if (op == '?') { 626 // Emit x? as (x|) 627 Insert(kRegExpBranch, ret); 628 // Either x 629 Tail(ret, Node(kRegExpBranch)); 630 // or 631 next = Node(kRegExpNothing); 632 // null. 633 Tail(ret, next); 634 OpTail(ret, next); 635 } 636 637 fInputScanPointer++; 638 if (IsMult(*fInputScanPointer)) { 639 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS); 640 return NULL; 641 } 642 643 return ret; 644 } 645 646 647 // 648 // - Atom - the lowest level 649 // 650 // Optimization: gobbles an entire sequence of ordinary characters so that 651 // it can turn them into a single node, which is smaller to store and 652 // faster to run. Backslashed characters are exceptions, each becoming a 653 // separate node; the code is simpler that way and it's not worth fixing. 654 // 655 char* 656 RegExp::Atom(int32* flagp) 657 { 658 char* ret; 659 int32 flags; 660 661 *flagp = kWorst; 662 // tentatively 663 664 switch (*fInputScanPointer++) { 665 case '^': 666 ret = Node(kRegExpBol); 667 break; 668 669 case '$': 670 ret = Node(kRegExpEol); 671 break; 672 673 case '.': 674 ret = Node(kRegExpAny); 675 *flagp |= kHasWidth|kSimple; 676 break; 677 678 case '[': 679 { 680 int32 cclass; 681 int32 classend; 682 683 if (*fInputScanPointer == '^') { 684 // complement of range 685 ret = Node(kRegExpAnyBut); 686 fInputScanPointer++; 687 } else 688 ret = Node(kRegExpAnyOf); 689 if (*fInputScanPointer == ']' || *fInputScanPointer == '-') 690 Char(*fInputScanPointer++); 691 while (*fInputScanPointer != '\0' 692 && *fInputScanPointer != ']') { 693 if (*fInputScanPointer == '-') { 694 fInputScanPointer++; 695 if (*fInputScanPointer == ']' 696 || *fInputScanPointer == '\0') { 697 Char('-'); 698 } else { 699 cclass = UCharAt(fInputScanPointer - 2) + 1; 700 classend = UCharAt(fInputScanPointer); 701 if (cclass > classend + 1) { 702 SetError(REGEXP_INVALID_BRACKET_RANGE); 703 return NULL; 704 } 705 for (; cclass <= classend; cclass++) 706 Char((char)cclass); 707 fInputScanPointer++; 708 } 709 } else 710 Char(*fInputScanPointer++); 711 } 712 Char('\0'); 713 if (*fInputScanPointer != ']') { 714 SetError(REGEXP_UNMATCHED_BRACKET); 715 return NULL; 716 } 717 fInputScanPointer++; 718 *flagp |= kHasWidth | kSimple; 719 break; 720 } 721 722 case '(': 723 ret = Reg(1, &flags); 724 if (ret == NULL) 725 return NULL; 726 *flagp |= flags & (kHasWidth | kSPStart); 727 break; 728 729 case '\0': 730 case '|': 731 case ')': 732 SetError(REGEXP_INTERNAL_ERROR); 733 return NULL; 734 // supposed to be caught earlier 735 736 case '?': 737 case '+': 738 case '*': 739 SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING); 740 return NULL; 741 742 case '\\': 743 if (*fInputScanPointer == '\0') { 744 SetError(REGEXP_TRAILING_BACKSLASH); 745 return NULL; 746 } 747 ret = Node(kRegExpExactly); 748 Char(*fInputScanPointer++); 749 Char('\0'); 750 *flagp |= kHasWidth|kSimple; 751 break; 752 753 default: 754 { 755 int32 len; 756 char ender; 757 758 fInputScanPointer--; 759 len = (int32)strcspn(fInputScanPointer, kMeta); 760 if (len <= 0) { 761 SetError(REGEXP_INTERNAL_ERROR); 762 return NULL; 763 } 764 765 ender = *(fInputScanPointer + len); 766 if (len > 1 && IsMult(ender)) { 767 // Back off clear of ?+* operand. 768 len--; 769 } 770 771 *flagp |= kHasWidth; 772 if (len == 1) 773 *flagp |= kSimple; 774 775 ret = Node(kRegExpExactly); 776 while (len > 0) { 777 Char(*fInputScanPointer++); 778 len--; 779 } 780 781 Char('\0'); 782 break; 783 } 784 } 785 786 return ret; 787 } 788 789 790 // 791 // - Node - emit a node 792 // 793 // Returns location. 794 // 795 char* 796 RegExp::Node(char op) 797 { 798 char* ret; 799 char* ptr; 800 801 ret = fCodeEmitPointer; 802 if (ret == &fDummy) { 803 fCodeSize += 3; 804 return ret; 805 } 806 807 ptr = ret; 808 *ptr++ = op; 809 *ptr++ = '\0'; 810 // Null "next" pointer. 811 *ptr++ = '\0'; 812 fCodeEmitPointer = ptr; 813 814 return ret; 815 } 816 817 818 // 819 // - Char - emit (if appropriate) a byte of code 820 // 821 void 822 RegExp::Char(char b) 823 { 824 if (fCodeEmitPointer != &fDummy) 825 *fCodeEmitPointer++ = b; 826 else 827 fCodeSize++; 828 } 829 830 831 // 832 // - Insert - insert an operator in front of already-emitted operand 833 // 834 // Means relocating the operand. 835 // 836 void 837 RegExp::Insert(char op, char* opnd) 838 { 839 char* src; 840 char* dst; 841 char* place; 842 843 if (fCodeEmitPointer == &fDummy) { 844 fCodeSize += 3; 845 return; 846 } 847 848 src = fCodeEmitPointer; 849 fCodeEmitPointer += 3; 850 dst = fCodeEmitPointer; 851 while (src > opnd) 852 *--dst = *--src; 853 854 place = opnd; 855 // Op node, where operand used to be. 856 *place++ = op; 857 *place++ = '\0'; 858 *place++ = '\0'; 859 } 860 861 862 // 863 // - Tail - set the next-pointer at the end of a node chain 864 // 865 void 866 RegExp::Tail(char* p, char* val) 867 { 868 char* scan; 869 char* temp; 870 int32 offset; 871 872 if (p == &fDummy) 873 return; 874 875 // Find last node. 876 scan = p; 877 for (;;) { 878 temp = Next(scan); 879 if (temp == NULL) 880 break; 881 scan = temp; 882 } 883 884 if (scan[0] == kRegExpBack) 885 offset = scan - val; 886 else 887 offset = val - scan; 888 889 scan[1] = (char)((offset >> 8) & 0377); 890 scan[2] = (char)(offset & 0377); 891 } 892 893 894 // 895 // - OpTail - Tail on operand of first argument; nop if operandless 896 // 897 void 898 RegExp::OpTail(char* p, char* val) 899 { 900 // "Operandless" and "op != kRegExpBranch" are synonymous in practice. 901 if (p == NULL || p == &fDummy || *p != kRegExpBranch) 902 return; 903 904 Tail(Operand(p), val); 905 } 906 907 // 908 // RunMatcher and friends 909 // 910 911 912 // 913 // - RunMatcher - match a regexp against a string 914 // 915 int32 916 RegExp::RunMatcher(regexp* prog, const char* string) const 917 { 918 const char* s; 919 920 // be paranoid... 921 if (prog == NULL || string == NULL) { 922 SetError(B_BAD_VALUE); 923 return 0; 924 } 925 926 // check validity of program 927 if (UCharAt(prog->program) != kRegExpMagic) { 928 SetError(REGEXP_CORRUPTED_PROGRAM); 929 return 0; 930 } 931 932 // if there is a "must appear" string, look for it 933 if (prog->regmust != NULL) { 934 s = string; 935 while ((s = strchr(s, prog->regmust[0])) != NULL) { 936 if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) { 937 // found it 938 break; 939 } 940 s++; 941 } 942 if (s == NULL) { 943 // not present 944 return 0; 945 } 946 } 947 948 // mark beginning of line for ^ 949 fRegBol = string; 950 951 // simplest case: anchored match need be tried only once 952 if (prog->reganch) 953 return Try(prog, (char*)string); 954 955 // messy cases: unanchored match 956 s = string; 957 if (prog->regstart != '\0') { 958 // we know what char it must start with 959 while ((s = strchr(s, prog->regstart)) != NULL) { 960 if (Try(prog, (char*)s)) 961 return 1; 962 s++; 963 } 964 } else { 965 // we don't -- general case. 966 do { 967 if (Try(prog, (char*)s)) 968 return 1; 969 } while (*s++ != '\0'); 970 } 971 972 // failure 973 return 0; 974 } 975 976 977 // 978 // - Try - try match at specific point 979 // 980 // Returns 0 on failure, 1 on success. 981 // 982 int32 983 RegExp::Try(regexp* prog, const char* string) const 984 { 985 int32 i; 986 const char **sp; 987 const char **ep; 988 989 fStringInputPointer = string; 990 fStartPArrayPointer = prog->startp; 991 fEndPArrayPointer = prog->endp; 992 993 sp = prog->startp; 994 ep = prog->endp; 995 for (i = kSubExpressionMax; i > 0; i--) { 996 *sp++ = NULL; 997 *ep++ = NULL; 998 } 999 if (Match(prog->program + 1)) { 1000 prog->startp[0] = string; 1001 prog->endp[0] = fStringInputPointer; 1002 return 1; 1003 } else 1004 return 0; 1005 } 1006 1007 1008 // 1009 // - Match - main matching routine 1010 // 1011 // Conceptually the strategy is simple: check to see whether the current 1012 // node matches, call self recursively to see whether the rest matches, 1013 // and then act accordingly. In practice we make some effort to avoid 1014 // recursion, in particular by going through "ordinary" nodes (that don't 1015 // need to know whether the rest of the match failed) by a loop instead of 1016 // by recursion. 1017 // 1018 // Returns 0 on failure, 1 on success. 1019 // 1020 int32 1021 RegExp::Match(const char* prog) const 1022 { 1023 const char* scan; // Current node. 1024 const char* next; // Next node. 1025 1026 scan = prog; 1027 #ifdef DEBUG 1028 if (scan != NULL && regnarrate) 1029 fprintf(stderr, "%s(\n", Prop(scan)); 1030 #endif 1031 while (scan != NULL) { 1032 #ifdef DEBUG 1033 if (regnarrate) 1034 fprintf(stderr, "%s...\n", Prop(scan)); 1035 #endif 1036 next = Next(scan); 1037 1038 switch (*scan) { 1039 case kRegExpBol: 1040 if (fStringInputPointer != fRegBol) 1041 return 0; 1042 break; 1043 1044 case kRegExpEol: 1045 if (*fStringInputPointer != '\0') 1046 return 0; 1047 break; 1048 1049 case kRegExpAny: 1050 if (*fStringInputPointer == '\0') 1051 return 0; 1052 fStringInputPointer++; 1053 break; 1054 1055 case kRegExpExactly: 1056 { 1057 const char* opnd = Operand(scan); 1058 // Inline the first character, for speed. 1059 if (*opnd != *fStringInputPointer) 1060 return 0; 1061 1062 uint32 len = strlen(opnd); 1063 if (len > 1 1064 && strncmp(opnd, fStringInputPointer, len) != 0) { 1065 return 0; 1066 } 1067 1068 fStringInputPointer += len; 1069 } 1070 break; 1071 1072 case kRegExpAnyOf: 1073 if (*fStringInputPointer == '\0' 1074 || strchr(Operand(scan), *fStringInputPointer) == NULL) { 1075 return 0; 1076 } 1077 fStringInputPointer++; 1078 break; 1079 1080 case kRegExpAnyBut: 1081 if (*fStringInputPointer == '\0' 1082 || strchr(Operand(scan), *fStringInputPointer) != NULL) { 1083 return 0; 1084 } 1085 fStringInputPointer++; 1086 break; 1087 1088 case kRegExpNothing: 1089 break; 1090 1091 case kRegExpBack: 1092 break; 1093 1094 case kRegExpOpen + 1: 1095 case kRegExpOpen + 2: 1096 case kRegExpOpen + 3: 1097 case kRegExpOpen + 4: 1098 case kRegExpOpen + 5: 1099 case kRegExpOpen + 6: 1100 case kRegExpOpen + 7: 1101 case kRegExpOpen + 8: 1102 case kRegExpOpen + 9: 1103 { 1104 int32 no; 1105 const char* save; 1106 1107 no = *scan - kRegExpOpen; 1108 save = fStringInputPointer; 1109 1110 if (Match(next)) { 1111 // 1112 // Don't set startp if some later 1113 // invocation of the same parentheses 1114 // already has. 1115 // 1116 if (fStartPArrayPointer[no] == NULL) 1117 fStartPArrayPointer[no] = save; 1118 return 1; 1119 } else 1120 return 0; 1121 } 1122 break; 1123 1124 case kRegExpClose + 1: 1125 case kRegExpClose + 2: 1126 case kRegExpClose + 3: 1127 case kRegExpClose + 4: 1128 case kRegExpClose + 5: 1129 case kRegExpClose + 6: 1130 case kRegExpClose + 7: 1131 case kRegExpClose + 8: 1132 case kRegExpClose + 9: 1133 { 1134 int32 no; 1135 const char* save; 1136 1137 no = *scan - kRegExpClose; 1138 save = fStringInputPointer; 1139 1140 if (Match(next)) { 1141 // 1142 // Don't set endp if some later 1143 // invocation of the same parentheses 1144 // already has. 1145 // 1146 if (fEndPArrayPointer[no] == NULL) 1147 fEndPArrayPointer[no] = save; 1148 return 1; 1149 } else 1150 return 0; 1151 } 1152 break; 1153 1154 case kRegExpBranch: 1155 { 1156 const char* save; 1157 1158 if (*next != kRegExpBranch) { 1159 // no choice 1160 next = Operand(scan); 1161 // avoid recursion 1162 } else { 1163 do { 1164 save = fStringInputPointer; 1165 if (Match(Operand(scan))) 1166 return 1; 1167 fStringInputPointer = save; 1168 scan = Next(scan); 1169 } while (scan != NULL && *scan == kRegExpBranch); 1170 return 0; 1171 // NOTREACHED 1172 } 1173 } 1174 break; 1175 1176 case kRegExpStar: 1177 case kRegExpPlus: 1178 { 1179 char nextch; 1180 int32 no; 1181 const char* save; 1182 int32 min; 1183 1184 // 1185 // Lookahead to avoid useless match attempts 1186 // when we know what character comes next. 1187 // 1188 nextch = '\0'; 1189 if (*next == kRegExpExactly) 1190 nextch = *Operand(next); 1191 min = (*scan == kRegExpStar) ? 0 : 1; 1192 save = fStringInputPointer; 1193 no = Repeat(Operand(scan)); 1194 while (no >= min) { 1195 // if it could work, try it 1196 if (nextch == '\0' || *fStringInputPointer == nextch) { 1197 if (Match(next)) 1198 return 1; 1199 } 1200 // couldn't or didn't, back up 1201 no--; 1202 fStringInputPointer = save + no; 1203 } 1204 return 0; 1205 } 1206 break; 1207 1208 case kRegExpEnd: 1209 return 1; 1210 // Success! 1211 1212 default: 1213 SetError(REGEXP_MEMORY_CORRUPTION); 1214 return 0; 1215 } 1216 1217 scan = next; 1218 } 1219 1220 // 1221 // We get here only if there's trouble -- normally "case kRegExpEnd" is 1222 // the terminating point. 1223 // 1224 SetError(REGEXP_CORRUPTED_POINTERS); 1225 1226 return 0; 1227 } 1228 1229 1230 // 1231 // - Repeat - repeatedly match something simple, report how many 1232 // 1233 int32 1234 RegExp::Repeat(const char* p) const 1235 { 1236 int32 count = 0; 1237 const char* scan; 1238 const char* opnd; 1239 1240 scan = fStringInputPointer; 1241 opnd = Operand(p); 1242 switch (*p) { 1243 case kRegExpAny: 1244 count = (int32)strlen(scan); 1245 scan += count; 1246 break; 1247 1248 case kRegExpExactly: 1249 while (*opnd == *scan) { 1250 count++; 1251 scan++; 1252 } 1253 break; 1254 1255 case kRegExpAnyOf: 1256 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1257 count++; 1258 scan++; 1259 } 1260 break; 1261 1262 case kRegExpAnyBut: 1263 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1264 count++; 1265 scan++; 1266 } 1267 break; 1268 1269 default: 1270 // oh dear, called inappropriately 1271 SetError(REGEXP_INTERNAL_ERROR); 1272 count = 0; 1273 // best compromise 1274 break; 1275 } 1276 fStringInputPointer = scan; 1277 1278 return count; 1279 } 1280 1281 1282 // 1283 // - Next - dig the "next" pointer out of a node 1284 // 1285 char* 1286 RegExp::Next(char* p) 1287 { 1288 int32 offset; 1289 1290 if (p == &fDummy) 1291 return NULL; 1292 1293 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1294 if (offset == 0) 1295 return NULL; 1296 1297 if (*p == kRegExpBack) 1298 return p - offset; 1299 else 1300 return p + offset; 1301 } 1302 1303 1304 const char* 1305 RegExp::Next(const char* p) const 1306 { 1307 int32 offset; 1308 1309 if (p == &fDummy) 1310 return NULL; 1311 1312 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1313 if (offset == 0) 1314 return NULL; 1315 1316 if (*p == kRegExpBack) 1317 return p - offset; 1318 else 1319 return p + offset; 1320 } 1321 1322 1323 inline int32 1324 RegExp::UCharAt(const char* p) const 1325 { 1326 return (int32)*(unsigned char *)p; 1327 } 1328 1329 1330 inline char* 1331 RegExp::Operand(char* p) const 1332 { 1333 return p + 3; 1334 } 1335 1336 1337 inline const char* 1338 RegExp::Operand(const char* p) const 1339 { 1340 return p + 3; 1341 } 1342 1343 1344 inline bool 1345 RegExp::IsMult(char c) const 1346 { 1347 return c == '*' || c == '+' || c == '?'; 1348 } 1349 1350 1351 #ifdef DEBUG 1352 1353 1354 // 1355 // - Dump - dump a regexp onto stdout in vaguely comprehensible form 1356 // 1357 void 1358 RegExp::Dump() 1359 { 1360 const char* s; 1361 char op = kRegExpExactly; 1362 // Arbitrary non-kRegExpEnd op. 1363 const char* next; 1364 1365 s = fRegExp->program + 1; 1366 while (op != kRegExpEnd) { 1367 // While that wasn't kRegExpEnd last time... 1368 op = *s; 1369 printf("%2ld%s", s - fRegExp->program, Prop(s)); 1370 // Where, what. 1371 next = Next(s); 1372 if (next == NULL) { 1373 // next ptr 1374 printf("(0)"); 1375 } else 1376 printf("(%ld)", (s - fRegExp->program) + (next - s)); 1377 1378 s += 3; 1379 if (op == kRegExpAnyOf || op == kRegExpAnyBut 1380 || op == kRegExpExactly) { 1381 // literal string, where present 1382 while (*s != '\0') { 1383 putchar(*s); 1384 s++; 1385 } 1386 s++; 1387 } 1388 putchar('\n'); 1389 } 1390 1391 // header fields of interest 1392 if (fRegExp->regstart != '\0') 1393 printf("start `%c' ", fRegExp->regstart); 1394 1395 if (fRegExp->reganch) 1396 printf("anchored "); 1397 1398 if (fRegExp->regmust != NULL) 1399 printf("must have \"%s\"", fRegExp->regmust); 1400 1401 printf("\n"); 1402 } 1403 1404 1405 // 1406 // - Prop - printable representation of opcode 1407 // 1408 char* 1409 RegExp::Prop(const char* op) const 1410 { 1411 const char* p = NULL; 1412 static char buf[50]; 1413 1414 strcpy(buf, ":"); 1415 1416 switch (*op) { 1417 case kRegExpBol: 1418 p = "kRegExpBol"; 1419 break; 1420 1421 case kRegExpEol: 1422 p = "kRegExpEol"; 1423 break; 1424 1425 case kRegExpAny: 1426 p = "kRegExpAny"; 1427 break; 1428 1429 case kRegExpAnyOf: 1430 p = "kRegExpAnyOf"; 1431 break; 1432 1433 case kRegExpAnyBut: 1434 p = "kRegExpAnyBut"; 1435 break; 1436 1437 case kRegExpBranch: 1438 p = "kRegExpBranch"; 1439 break; 1440 1441 case kRegExpExactly: 1442 p = "kRegExpExactly"; 1443 break; 1444 1445 case kRegExpNothing: 1446 p = "kRegExpNothing"; 1447 break; 1448 1449 case kRegExpBack: 1450 p = "kRegExpBack"; 1451 break; 1452 1453 case kRegExpEnd: 1454 p = "kRegExpEnd"; 1455 break; 1456 1457 case kRegExpOpen + 1: 1458 case kRegExpOpen + 2: 1459 case kRegExpOpen + 3: 1460 case kRegExpOpen + 4: 1461 case kRegExpOpen + 5: 1462 case kRegExpOpen + 6: 1463 case kRegExpOpen + 7: 1464 case kRegExpOpen + 8: 1465 case kRegExpOpen + 9: 1466 sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen); 1467 p = NULL; 1468 break; 1469 1470 case kRegExpClose + 1: 1471 case kRegExpClose + 2: 1472 case kRegExpClose + 3: 1473 case kRegExpClose + 4: 1474 case kRegExpClose + 5: 1475 case kRegExpClose + 6: 1476 case kRegExpClose + 7: 1477 case kRegExpClose + 8: 1478 case kRegExpClose + 9: 1479 sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose); 1480 p = NULL; 1481 break; 1482 1483 case kRegExpStar: 1484 p = "kRegExpStar"; 1485 break; 1486 1487 case kRegExpPlus: 1488 p = "kRegExpPlus"; 1489 break; 1490 1491 default: 1492 RegExpError("corrupted opcode"); 1493 break; 1494 } 1495 1496 if (p != NULL) 1497 strcat(buf, p); 1498 1499 return buf; 1500 } 1501 1502 1503 void 1504 RegExp::RegExpError(const char*) const 1505 { 1506 // does nothing now, perhaps it should printf? 1507 } 1508 1509 #endif // DEBUG 1510