1 /* 2 Open Tracker License 3 4 Terms and Conditions 5 6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved. 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy of 9 this software and associated documentation files (the "Software"), to deal in 10 the Software without restriction, including without limitation the rights to 11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 of the Software, and to permit persons to whom the Software is furnished to do 13 so, subject to the following conditions: 14 15 The above copyright notice and this permission notice applies to all licensees 16 and shall be included in all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION 23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 25 Except as contained in this notice, the name of Be Incorporated shall not be 26 used in advertising or otherwise to promote the sale, use or other dealings in 27 this Software without prior written authorization from Be Incorporated. 28 29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks 30 of Be Incorporated in the United States and other countries. Other brand product 31 names are registered trademarks or trademarks of their respective holders. 32 All rights reserved. 33 */ 34 35 // This code is based on regexp.c, v.1.3 by Henry Spencer: 36 37 // @(#)regexp.c 1.3 of 18 April 87 38 // 39 // Copyright (c) 1986 by University of Toronto. 40 // Written by Henry Spencer. Not derived from licensed software. 41 // 42 // Permission is granted to anyone to use this software for any 43 // purpose on any computer system, and to redistribute it freely, 44 // subject to the following restrictions: 45 // 46 // 1. The author is not responsible for the consequences of use of 47 // this software, no matter how awful, even if they arise 48 // from defects in it. 49 // 50 // 2. The origin of this software must not be misrepresented, either 51 // by explicit claim or by omission. 52 // 53 // 3. Altered versions must be plainly marked as such, and must not 54 // be misrepresented as being the original software. 55 // 56 // Beware that some of this code is subtly aware of the way operator 57 // precedence is structured in regular expressions. Serious changes in 58 // regular-expression syntax might require a total rethink. 59 // 60 61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker 62 // project (www.opentracker.org), Jul 11, 2000. 63 64 65 #include <stdlib.h> 66 #include <stdio.h> 67 #include <string.h> 68 69 #include <Errors.h> 70 71 #include "RegExp.h" 72 73 74 // The first byte of the regexp internal "program" is actually this magic 75 // number; the start node begins in the second byte. 76 77 const uint8 kRegExpMagic = 0234; 78 79 // The "internal use only" fields in RegExp.h are present to pass info from 80 // compile to execute that permits the execute phase to run lots faster on 81 // simple cases. They are: 82 // 83 // regstart char that must begin a match; '\0' if none obvious 84 // reganch is the match anchored (at beginning-of-line only)? 85 // regmust string (pointer into program) that match must include, or NULL 86 // regmlen length of regmust string 87 // 88 // Regstart and reganch permit very fast decisions on suitable starting points 89 // for a match, cutting down the work a lot. Regmust permits fast rejection 90 // of lines that cannot possibly match. The regmust tests are costly enough 91 // that Compile() supplies a regmust only if the r.e. contains something 92 // potentially expensive (at present, the only such thing detected is * or + 93 // at the start of the r.e., which can involve a lot of backup). Regmlen is 94 // supplied because the test in RunMatcher() needs it and Compile() is 95 // computing it anyway. 96 // 97 // 98 // 99 // Structure for regexp "program". This is essentially a linear encoding 100 // of a nondeterministic finite-state machine (aka syntax charts or 101 // "railroad normal form" in parsing technology). Each node is an opcode 102 // plus a "next" pointer, possibly plus an operand. "Next" pointers of 103 // all nodes except kRegExpBranch implement concatenation; a "next" pointer 104 // with a kRegExpBranch on both ends of it is connecting two alternatives. 105 // (Here we have one of the subtle syntax dependencies: an individual 106 // kRegExpBranch (as opposed to a collection of them) is never concatenated 107 // with anything because of operator precedence.) The operand of some types 108 // of node is a literal string; for others, it is a node leading into a 109 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first 110 // node of the branch. (NB this is* not* a tree structure: the tail of the 111 // branch connects to the thing following the set of kRegExpBranches). 112 // The opcodes are: 113 // 114 115 // definition number opnd? meaning 116 enum { 117 kRegExpEnd = 0, // no End of program. 118 kRegExpBol = 1, // no Match "" at beginning of line. 119 kRegExpEol = 2, // no Match "" at end of line. 120 kRegExpAny = 3, // no Match any one character. 121 kRegExpAnyOf = 4, // str Match any character in this string. 122 kRegExpAnyBut = 5, // str Match any character not in this string. 123 kRegExpBranch = 6, // node Match this alternative, or the next... 124 kRegExpBack = 7, // no Match "", "next" ptr points backward. 125 kRegExpExactly = 8, // str Match this string. 126 kRegExpNothing = 9, // no Match empty string. 127 kRegExpStar = 10, // node Match this (simple) thing 0 or more times. 128 kRegExpPlus = 11, // node Match this (simple) thing 1 or more times. 129 kRegExpOpen = 20, // no Mark this point in input as start of #n. 130 // kRegExpOpen + 1 is number 1, etc. 131 kRegExpClose = 30 // no Analogous to kRegExpOpen. 132 }; 133 134 // 135 // Opcode notes: 136 // 137 // kRegExpBranch The set of branches constituting a single choice are 138 // hooked together with their "next" pointers, since precedence prevents 139 // anything being concatenated to any individual branch. The 140 // "next" pointer of the last kRegExpBranch in a choice points to the 141 // thing following the whole choice. This is also where the 142 // final "next" pointer of each individual branch points; each 143 // branch starts with the operand node of a kRegExpBranch node. 144 // 145 // kRegExpBack Normal "next" pointers all implicitly point forward; 146 // kRegExpBack exists to make loop structures possible. 147 // 148 // kRegExpStar,kRegExpPlus '?', and complex '*' and '+', are implemented as 149 // circular kRegExpBranch structures using kRegExpBack. Simple cases 150 // (one character per match) are implemented with kRegExpStar and 151 // kRegExpPlus for speed and to minimize recursive plunges. 152 // 153 // kRegExpOpen,kRegExpClose ...are numbered at compile time. 154 // 155 // 156 // 157 // A node is one char of opcode followed by two chars of "next" pointer. 158 // "Next" pointers are stored as two 8-bit pieces, high order first. The 159 // value is a positive offset from the opcode of the node containing it. 160 // An operand, if any, simply follows the node. (Note that much of the 161 // code generation knows about this implicit relationship.) 162 // 163 // Using two bytes for the "next" pointer is vast overkill for most things, 164 // but allows patterns to get big without disasters. 165 // 166 167 const char* kMeta = "^$.[()|?+*\\"; 168 const int32 kMaxSize = 32767L; 169 // Probably could be 65535L. 170 171 172 // Flags to be passed up and down: 173 enum { 174 kHasWidth = 01, // Known never to match null string. 175 kSimple = 02, // Simple enough to be kRegExpStar/kRegExpPlus operand. 176 kSPStart = 04, // Starts with * or +. 177 kWorst = 0 // Worst case. 178 }; 179 180 181 const char* kRegExpErrorStringArray[] = { 182 "Unmatched parenthesis.", 183 "Expression too long.", 184 "Too many parenthesis.", 185 "Junk on end.", 186 "*+? operand may be empty.", 187 "Nested *?+.", 188 "Invalid bracket range.", 189 "Unmatched brackets.", 190 "Internal error.", 191 "?+* follows nothing.", 192 "Trailing \\.", 193 "Corrupted expression.", 194 "Memory corruption.", 195 "Corrupted pointers.", 196 "Corrupted opcode." 197 }; 198 199 200 #ifdef DEBUG 201 int32 regnarrate = 0; 202 #endif 203 204 205 // #pragma mark - RegExp 206 207 208 RegExp::RegExp() 209 : 210 fError(B_OK), 211 fRegExp(NULL), 212 fInputScanPointer(NULL), 213 fParenthesisCount(0), 214 fDummy('\0'), 215 fCodeEmitPointer(NULL), 216 fCodeSize(0), 217 fStringInputPointer(NULL), 218 fRegBol(NULL), 219 fStartPArrayPointer(NULL), 220 fEndPArrayPointer(NULL) 221 { 222 } 223 224 225 RegExp::RegExp(const char* pattern) 226 : 227 fError(B_OK), 228 fRegExp(NULL) 229 { 230 fRegExp = Compile(pattern); 231 } 232 233 234 RegExp::RegExp(const BString& pattern) 235 : 236 fError(B_OK), 237 fRegExp(NULL) 238 { 239 fRegExp = Compile(pattern.String()); 240 } 241 242 243 RegExp::~RegExp() 244 { 245 free(fRegExp); 246 } 247 248 249 status_t 250 RegExp::InitCheck() const 251 { 252 return fError; 253 } 254 255 256 status_t 257 RegExp::SetTo(const char* pattern) 258 { 259 fError = B_OK; 260 free(fRegExp); 261 fRegExp = Compile(pattern); 262 return fError; 263 } 264 265 266 status_t 267 RegExp::SetTo(const BString& pattern) 268 { 269 fError = B_OK; 270 free(fRegExp); 271 fRegExp = Compile(pattern.String()); 272 return fError; 273 } 274 275 276 bool 277 RegExp::Matches(const char* string) const 278 { 279 if (fRegExp == NULL || string == NULL) 280 return false; 281 282 return RunMatcher(fRegExp, string) == 1; 283 } 284 285 286 bool 287 RegExp::Matches(const BString& string) const 288 { 289 if (fRegExp == NULL) 290 return false; 291 292 return RunMatcher(fRegExp, string.String()) == 1; 293 } 294 295 296 // 297 // - Compile - compile a regular expression into internal code 298 // 299 // We can't allocate space until we know how big the compiled form will be, 300 // but we can't compile it (and thus know how big it is) until we've got a 301 // place to put the code. So we cheat: we compile it twice, once with code 302 // generation turned off and size counting turned on, and once "for real". 303 // This also means that we don't allocate space until we are sure that the 304 // thing really will compile successfully, and we never have to move the 305 // code and thus invalidate pointers into it. (Note that it has to be in 306 // one piece because free() must be able to free it all.) 307 // 308 // Beware that the optimization-preparation code in here knows about some 309 // of the structure of the compiled regexp. 310 regexp* 311 RegExp::Compile(const char* exp) 312 { 313 regexp* r; 314 const char* scan; 315 const char* longest; 316 int32 len; 317 int32 flags; 318 319 if (exp == NULL) { 320 SetError(B_BAD_VALUE); 321 return NULL; 322 } 323 324 // First pass: determine size, legality. 325 fInputScanPointer = exp; 326 fParenthesisCount = 1; 327 fCodeSize = 0L; 328 fCodeEmitPointer = &fDummy; 329 Char(kRegExpMagic); 330 if (Reg(0, &flags) == NULL) 331 return NULL; 332 333 // Small enough for pointer-storage convention? 334 if (fCodeSize >= kMaxSize) { 335 SetError(REGEXP_TOO_BIG); 336 return NULL; 337 } 338 339 r = (regexp*)malloc(sizeof(regexp) + fCodeSize); 340 // Allocate space 341 342 if (r == NULL) { 343 SetError(B_NO_MEMORY); 344 return NULL; 345 } 346 347 // Second pass: emit code. 348 fInputScanPointer = exp; 349 fParenthesisCount = 1; 350 fCodeEmitPointer = r->program; 351 Char(kRegExpMagic); 352 if (Reg(0, &flags) == NULL) { 353 free(r); 354 return NULL; 355 } 356 357 // Dig out information for optimizations. 358 r->regstart = '\0'; 359 // Worst-case defaults. 360 r->reganch = 0; 361 r->regmust = NULL; 362 r->regmlen = 0; 363 scan = r->program + 1; 364 // First kRegExpBranch. 365 if (*Next((char*)scan) == kRegExpEnd) { 366 // Only one top-level choice. 367 scan = Operand(scan); 368 369 // Starting-point info. 370 if (*scan == kRegExpExactly) 371 r->regstart = *Operand(scan); 372 else if (*scan == kRegExpBol) 373 r->reganch++; 374 375 // If there's something expensive in the r.e., find the 376 // longest literal string that must appear and make it the 377 // regmust. Resolve ties in favor of later strings, since 378 // the regstart check works with the beginning of the r.e. 379 // and avoiding duplication strengthens checking. Not a 380 // strong reason, but sufficient in the absence of others. 381 if ((flags & kSPStart) != 0) { 382 longest = NULL; 383 len = 0; 384 for (; scan != NULL; scan = Next((char*)scan)) { 385 if (*scan == kRegExpExactly 386 && (int32)strlen(Operand(scan)) >= len) { 387 longest = Operand(scan); 388 len = (int32)strlen(Operand(scan)); 389 } 390 } 391 r->regmust = longest; 392 r->regmlen = len; 393 } 394 } 395 396 return r; 397 } 398 399 400 regexp* 401 RegExp::Expression() const 402 { 403 return fRegExp; 404 } 405 406 407 const char* 408 RegExp::ErrorString() const 409 { 410 if (fError >= REGEXP_UNMATCHED_PARENTHESIS 411 && fError <= REGEXP_CORRUPTED_OPCODE) { 412 return kRegExpErrorStringArray[fError - B_ERRORS_END]; 413 } 414 415 return strerror(fError); 416 } 417 418 419 void 420 RegExp::SetError(status_t error) const 421 { 422 fError = error; 423 } 424 425 426 // 427 // - Reg - regular expression, i.e. main body or parenthesized thing 428 // 429 // Caller must absorb opening parenthesis. 430 // 431 // Combining parenthesis handling with the base level of regular expression 432 // is a trifle forced, but the need to tie the tails of the branches to what 433 // follows makes it hard to avoid. 434 // 435 char* 436 RegExp::Reg(int32 paren, int32* flagp) 437 { 438 char* ret; 439 char* br; 440 char* ender; 441 int32 parno = 0; 442 int32 flags; 443 444 *flagp = kHasWidth; 445 // Tentatively. 446 447 // Make an kRegExpOpen node, if parenthesized. 448 if (paren) { 449 if (fParenthesisCount >= kSubExpressionMax) { 450 SetError(REGEXP_TOO_MANY_PARENTHESIS); 451 return NULL; 452 } 453 parno = fParenthesisCount; 454 fParenthesisCount++; 455 ret = Node((char)(kRegExpOpen + parno)); 456 } else 457 ret = NULL; 458 459 // Pick up the branches, linking them together. 460 br = Branch(&flags); 461 if (br == NULL) 462 return NULL; 463 464 if (ret != NULL) { 465 Tail(ret, br); 466 // kRegExpOpen -> first 467 } else 468 ret = br; 469 470 if (!(flags & kHasWidth)) 471 *flagp &= ~kHasWidth; 472 473 *flagp |= flags&kSPStart; 474 while (*fInputScanPointer == '|') { 475 fInputScanPointer++; 476 br = Branch(&flags); 477 if (br == NULL) 478 return NULL; 479 480 Tail(ret, br); 481 // kRegExpBranch -> kRegExpBranch. 482 if (!(flags & kHasWidth)) 483 *flagp &= ~kHasWidth; 484 485 *flagp |= flags&kSPStart; 486 } 487 488 // Make a closing node, and hook it on the end. 489 ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd); 490 Tail(ret, ender); 491 492 // Hook the tails of the branches to the closing node. 493 for (br = ret; br != NULL; br = Next(br)) 494 OpTail(br, ender); 495 496 // Check for proper termination. 497 if (paren && *fInputScanPointer++ != ')') { 498 SetError(REGEXP_UNMATCHED_PARENTHESIS); 499 return NULL; 500 } else if (!paren && *fInputScanPointer != '\0') { 501 if (*fInputScanPointer == ')') { 502 SetError(REGEXP_UNMATCHED_PARENTHESIS); 503 return NULL; 504 } else { 505 SetError(REGEXP_JUNK_ON_END); 506 return NULL; // "Can't happen". 507 } 508 // NOTREACHED 509 } 510 511 return ret; 512 } 513 514 515 // 516 // - Branch - one alternative of an | operator 517 // 518 // Implements the concatenation operator. 519 // 520 char* 521 RegExp::Branch(int32* flagp) 522 { 523 char* ret; 524 char* chain; 525 char* latest; 526 int32 flags; 527 528 *flagp = kWorst; 529 // Tentatively. 530 531 ret = Node(kRegExpBranch); 532 chain = NULL; 533 while (*fInputScanPointer != '\0' 534 && *fInputScanPointer != '|' 535 && *fInputScanPointer != ')') { 536 latest = Piece(&flags); 537 if (latest == NULL) 538 return NULL; 539 540 *flagp |= flags & kHasWidth; 541 if (chain == NULL) { 542 // First piece. 543 *flagp |= flags & kSPStart; 544 } else 545 Tail(chain, latest); 546 547 chain = latest; 548 } 549 550 if (chain == NULL) { 551 // Loop ran zero times. 552 Node(kRegExpNothing); 553 } 554 555 return ret; 556 } 557 558 559 // 560 // - Piece - something followed by possible [*+?] 561 // 562 // Note that the branching code sequences used for ? and the general cases 563 // of * and + are somewhat optimized: they use the same kRegExpNothing node 564 // as both the endmarker for their branch list and the body of the last 565 // branch. It might seem that this node could be dispensed with entirely, 566 // but the endmarker role is not redundant. 567 // 568 char* 569 RegExp::Piece(int32* flagp) 570 { 571 char* ret; 572 char op; 573 char* next; 574 int32 flags; 575 576 ret = Atom(&flags); 577 if (ret == NULL) 578 return NULL; 579 580 op = *fInputScanPointer; 581 if (!IsMult(op)) { 582 *flagp = flags; 583 return ret; 584 } 585 586 if (!(flags & kHasWidth) && op != '?') { 587 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY); 588 return NULL; 589 } 590 *flagp = op != '+' ? kWorst | kSPStart : kWorst | kHasWidth; 591 592 if (op == '*' && (flags & kSimple)) 593 Insert(kRegExpStar, ret); 594 else if (op == '*') { 595 // Emit x* as (x&|), where & means "self". 596 Insert(kRegExpBranch, ret); 597 // Either x 598 OpTail(ret, Node(kRegExpBack)); 599 // and loop 600 OpTail(ret, ret); 601 // back 602 Tail(ret, Node(kRegExpBranch)); 603 // or 604 Tail(ret, Node(kRegExpNothing)); 605 // null. 606 } else if (op == '+' && (flags & kSimple)) 607 Insert(kRegExpPlus, ret); 608 else if (op == '+') { 609 // Emit x+ as x(&|), where & means "self". 610 next = Node(kRegExpBranch); 611 // Either 612 Tail(ret, next); 613 Tail(Node(kRegExpBack), ret); 614 // loop back 615 Tail(next, Node(kRegExpBranch)); 616 // or 617 Tail(ret, Node(kRegExpNothing)); 618 // null. 619 } else if (op == '?') { 620 // Emit x? as (x|) 621 Insert(kRegExpBranch, ret); 622 // Either x 623 Tail(ret, Node(kRegExpBranch)); 624 // or 625 next = Node(kRegExpNothing); 626 // null. 627 Tail(ret, next); 628 OpTail(ret, next); 629 } 630 631 fInputScanPointer++; 632 if (IsMult(*fInputScanPointer)) { 633 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS); 634 return NULL; 635 } 636 637 return ret; 638 } 639 640 641 // 642 // - Atom - the lowest level 643 // 644 // Optimization: gobbles an entire sequence of ordinary characters so that 645 // it can turn them into a single node, which is smaller to store and 646 // faster to run. Backslashed characters are exceptions, each becoming a 647 // separate node; the code is simpler that way and it's not worth fixing. 648 // 649 char* 650 RegExp::Atom(int32* flagp) 651 { 652 char* ret; 653 int32 flags; 654 655 *flagp = kWorst; 656 // tentatively 657 658 switch (*fInputScanPointer++) { 659 case '^': 660 ret = Node(kRegExpBol); 661 break; 662 663 case '$': 664 ret = Node(kRegExpEol); 665 break; 666 667 case '.': 668 ret = Node(kRegExpAny); 669 *flagp |= kHasWidth|kSimple; 670 break; 671 672 case '[': 673 { 674 int32 cclass; 675 int32 classend; 676 677 if (*fInputScanPointer == '^') { 678 // complement of range 679 ret = Node(kRegExpAnyBut); 680 fInputScanPointer++; 681 } else 682 ret = Node(kRegExpAnyOf); 683 if (*fInputScanPointer == ']' || *fInputScanPointer == '-') 684 Char(*fInputScanPointer++); 685 while (*fInputScanPointer != '\0' 686 && *fInputScanPointer != ']') { 687 if (*fInputScanPointer == '-') { 688 fInputScanPointer++; 689 if (*fInputScanPointer == ']' 690 || *fInputScanPointer == '\0') { 691 Char('-'); 692 } else { 693 cclass = UCharAt(fInputScanPointer - 2) + 1; 694 classend = UCharAt(fInputScanPointer); 695 if (cclass > classend + 1) { 696 SetError(REGEXP_INVALID_BRACKET_RANGE); 697 return NULL; 698 } 699 for (; cclass <= classend; cclass++) 700 Char((char)cclass); 701 fInputScanPointer++; 702 } 703 } else 704 Char(*fInputScanPointer++); 705 } 706 Char('\0'); 707 if (*fInputScanPointer != ']') { 708 SetError(REGEXP_UNMATCHED_BRACKET); 709 return NULL; 710 } 711 fInputScanPointer++; 712 *flagp |= kHasWidth | kSimple; 713 break; 714 } 715 716 case '(': 717 ret = Reg(1, &flags); 718 if (ret == NULL) 719 return NULL; 720 *flagp |= flags & (kHasWidth | kSPStart); 721 break; 722 723 case '\0': 724 case '|': 725 case ')': 726 SetError(REGEXP_INTERNAL_ERROR); 727 return NULL; 728 // supposed to be caught earlier 729 730 case '?': 731 case '+': 732 case '*': 733 SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING); 734 return NULL; 735 736 case '\\': 737 if (*fInputScanPointer == '\0') { 738 SetError(REGEXP_TRAILING_BACKSLASH); 739 return NULL; 740 } 741 ret = Node(kRegExpExactly); 742 Char(*fInputScanPointer++); 743 Char('\0'); 744 *flagp |= kHasWidth|kSimple; 745 break; 746 747 default: 748 { 749 int32 len; 750 char ender; 751 752 fInputScanPointer--; 753 len = (int32)strcspn(fInputScanPointer, kMeta); 754 if (len <= 0) { 755 SetError(REGEXP_INTERNAL_ERROR); 756 return NULL; 757 } 758 759 ender = *(fInputScanPointer + len); 760 if (len > 1 && IsMult(ender)) { 761 // Back off clear of ?+* operand. 762 len--; 763 } 764 765 *flagp |= kHasWidth; 766 if (len == 1) 767 *flagp |= kSimple; 768 769 ret = Node(kRegExpExactly); 770 while (len > 0) { 771 Char(*fInputScanPointer++); 772 len--; 773 } 774 775 Char('\0'); 776 break; 777 } 778 } 779 780 return ret; 781 } 782 783 784 // 785 // - Node - emit a node 786 // 787 // Returns location. 788 // 789 char* 790 RegExp::Node(char op) 791 { 792 char* ret; 793 char* ptr; 794 795 ret = fCodeEmitPointer; 796 if (ret == &fDummy) { 797 fCodeSize += 3; 798 return ret; 799 } 800 801 ptr = ret; 802 *ptr++ = op; 803 *ptr++ = '\0'; 804 // Null "next" pointer. 805 *ptr++ = '\0'; 806 fCodeEmitPointer = ptr; 807 808 return ret; 809 } 810 811 812 // 813 // - Char - emit (if appropriate) a byte of code 814 // 815 void 816 RegExp::Char(char b) 817 { 818 if (fCodeEmitPointer != &fDummy) 819 *fCodeEmitPointer++ = b; 820 else 821 fCodeSize++; 822 } 823 824 825 // 826 // - Insert - insert an operator in front of already-emitted operand 827 // 828 // Means relocating the operand. 829 // 830 void 831 RegExp::Insert(char op, char* opnd) 832 { 833 char* src; 834 char* dst; 835 char* place; 836 837 if (fCodeEmitPointer == &fDummy) { 838 fCodeSize += 3; 839 return; 840 } 841 842 src = fCodeEmitPointer; 843 fCodeEmitPointer += 3; 844 dst = fCodeEmitPointer; 845 while (src > opnd) 846 *--dst = *--src; 847 848 place = opnd; 849 // Op node, where operand used to be. 850 *place++ = op; 851 *place++ = '\0'; 852 *place++ = '\0'; 853 } 854 855 856 // 857 // - Tail - set the next-pointer at the end of a node chain 858 // 859 void 860 RegExp::Tail(char* p, char* val) 861 { 862 char* scan; 863 char* temp; 864 int32 offset; 865 866 if (p == &fDummy) 867 return; 868 869 // Find last node. 870 scan = p; 871 for (;;) { 872 temp = Next(scan); 873 if (temp == NULL) 874 break; 875 scan = temp; 876 } 877 878 if (scan[0] == kRegExpBack) 879 offset = scan - val; 880 else 881 offset = val - scan; 882 883 scan[1] = (char)((offset >> 8) & 0377); 884 scan[2] = (char)(offset & 0377); 885 } 886 887 888 // 889 // - OpTail - Tail on operand of first argument; nop if operandless 890 // 891 void 892 RegExp::OpTail(char* p, char* val) 893 { 894 // "Operandless" and "op != kRegExpBranch" are synonymous in practice. 895 if (p == NULL || p == &fDummy || *p != kRegExpBranch) 896 return; 897 898 Tail(Operand(p), val); 899 } 900 901 // 902 // RunMatcher and friends 903 // 904 905 906 // 907 // - RunMatcher - match a regexp against a string 908 // 909 int32 910 RegExp::RunMatcher(regexp* prog, const char* string) const 911 { 912 const char* s; 913 914 // be paranoid... 915 if (prog == NULL || string == NULL) { 916 SetError(B_BAD_VALUE); 917 return 0; 918 } 919 920 // check validity of program 921 if (UCharAt(prog->program) != kRegExpMagic) { 922 SetError(REGEXP_CORRUPTED_PROGRAM); 923 return 0; 924 } 925 926 // if there is a "must appear" string, look for it 927 if (prog->regmust != NULL) { 928 s = string; 929 while ((s = strchr(s, prog->regmust[0])) != NULL) { 930 if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) { 931 // found it 932 break; 933 } 934 s++; 935 } 936 if (s == NULL) { 937 // not present 938 return 0; 939 } 940 } 941 942 // mark beginning of line for ^ 943 fRegBol = string; 944 945 // simplest case: anchored match need be tried only once 946 if (prog->reganch) 947 return Try(prog, (char*)string); 948 949 // messy cases: unanchored match 950 s = string; 951 if (prog->regstart != '\0') { 952 // we know what char it must start with 953 while ((s = strchr(s, prog->regstart)) != NULL) { 954 if (Try(prog, (char*)s)) 955 return 1; 956 s++; 957 } 958 } else { 959 // we don't -- general case. 960 do { 961 if (Try(prog, (char*)s)) 962 return 1; 963 } while (*s++ != '\0'); 964 } 965 966 // failure 967 return 0; 968 } 969 970 971 // 972 // - Try - try match at specific point 973 // 974 // Returns 0 on failure, 1 on success. 975 // 976 int32 977 RegExp::Try(regexp* prog, const char* string) const 978 { 979 int32 i; 980 const char **sp; 981 const char **ep; 982 983 fStringInputPointer = string; 984 fStartPArrayPointer = prog->startp; 985 fEndPArrayPointer = prog->endp; 986 987 sp = prog->startp; 988 ep = prog->endp; 989 for (i = kSubExpressionMax; i > 0; i--) { 990 *sp++ = NULL; 991 *ep++ = NULL; 992 } 993 if (Match(prog->program + 1)) { 994 prog->startp[0] = string; 995 prog->endp[0] = fStringInputPointer; 996 return 1; 997 } else 998 return 0; 999 } 1000 1001 1002 // 1003 // - Match - main matching routine 1004 // 1005 // Conceptually the strategy is simple: check to see whether the current 1006 // node matches, call self recursively to see whether the rest matches, 1007 // and then act accordingly. In practice we make some effort to avoid 1008 // recursion, in particular by going through "ordinary" nodes (that don't 1009 // need to know whether the rest of the match failed) by a loop instead of 1010 // by recursion. 1011 // 1012 // Returns 0 on failure, 1 on success. 1013 // 1014 int32 1015 RegExp::Match(const char* prog) const 1016 { 1017 const char* scan; // Current node. 1018 const char* next; // Next node. 1019 1020 scan = prog; 1021 #ifdef DEBUG 1022 if (scan != NULL && regnarrate) 1023 fprintf(stderr, "%s(\n", Prop(scan)); 1024 #endif 1025 while (scan != NULL) { 1026 #ifdef DEBUG 1027 if (regnarrate) 1028 fprintf(stderr, "%s...\n", Prop(scan)); 1029 #endif 1030 next = Next(scan); 1031 1032 switch (*scan) { 1033 case kRegExpBol: 1034 if (fStringInputPointer != fRegBol) 1035 return 0; 1036 break; 1037 1038 case kRegExpEol: 1039 if (*fStringInputPointer != '\0') 1040 return 0; 1041 break; 1042 1043 case kRegExpAny: 1044 if (*fStringInputPointer == '\0') 1045 return 0; 1046 fStringInputPointer++; 1047 break; 1048 1049 case kRegExpExactly: 1050 { 1051 const char* opnd = Operand(scan); 1052 // Inline the first character, for speed. 1053 if (*opnd != *fStringInputPointer) 1054 return 0; 1055 1056 uint32 len = strlen(opnd); 1057 if (len > 1 1058 && strncmp(opnd, fStringInputPointer, len) != 0) { 1059 return 0; 1060 } 1061 1062 fStringInputPointer += len; 1063 } 1064 break; 1065 1066 case kRegExpAnyOf: 1067 if (*fStringInputPointer == '\0' 1068 || strchr(Operand(scan), *fStringInputPointer) == NULL) { 1069 return 0; 1070 } 1071 fStringInputPointer++; 1072 break; 1073 1074 case kRegExpAnyBut: 1075 if (*fStringInputPointer == '\0' 1076 || strchr(Operand(scan), *fStringInputPointer) != NULL) { 1077 return 0; 1078 } 1079 fStringInputPointer++; 1080 break; 1081 1082 case kRegExpNothing: 1083 break; 1084 1085 case kRegExpBack: 1086 break; 1087 1088 case kRegExpOpen + 1: 1089 case kRegExpOpen + 2: 1090 case kRegExpOpen + 3: 1091 case kRegExpOpen + 4: 1092 case kRegExpOpen + 5: 1093 case kRegExpOpen + 6: 1094 case kRegExpOpen + 7: 1095 case kRegExpOpen + 8: 1096 case kRegExpOpen + 9: 1097 { 1098 int32 no; 1099 const char* save; 1100 1101 no = *scan - kRegExpOpen; 1102 save = fStringInputPointer; 1103 1104 if (Match(next)) { 1105 // 1106 // Don't set startp if some later 1107 // invocation of the same parentheses 1108 // already has. 1109 // 1110 if (fStartPArrayPointer[no] == NULL) 1111 fStartPArrayPointer[no] = save; 1112 return 1; 1113 } else 1114 return 0; 1115 } 1116 break; 1117 1118 case kRegExpClose + 1: 1119 case kRegExpClose + 2: 1120 case kRegExpClose + 3: 1121 case kRegExpClose + 4: 1122 case kRegExpClose + 5: 1123 case kRegExpClose + 6: 1124 case kRegExpClose + 7: 1125 case kRegExpClose + 8: 1126 case kRegExpClose + 9: 1127 { 1128 int32 no; 1129 const char* save; 1130 1131 no = *scan - kRegExpClose; 1132 save = fStringInputPointer; 1133 1134 if (Match(next)) { 1135 // 1136 // Don't set endp if some later 1137 // invocation of the same parentheses 1138 // already has. 1139 // 1140 if (fEndPArrayPointer[no] == NULL) 1141 fEndPArrayPointer[no] = save; 1142 return 1; 1143 } else 1144 return 0; 1145 } 1146 break; 1147 1148 case kRegExpBranch: 1149 { 1150 const char* save; 1151 1152 if (*next != kRegExpBranch) { 1153 // no choice 1154 next = Operand(scan); 1155 // avoid recursion 1156 } else { 1157 do { 1158 save = fStringInputPointer; 1159 if (Match(Operand(scan))) 1160 return 1; 1161 fStringInputPointer = save; 1162 scan = Next(scan); 1163 } while (scan != NULL && *scan == kRegExpBranch); 1164 return 0; 1165 // NOTREACHED 1166 } 1167 } 1168 break; 1169 1170 case kRegExpStar: 1171 case kRegExpPlus: 1172 { 1173 char nextch; 1174 int32 no; 1175 const char* save; 1176 int32 min; 1177 1178 // 1179 // Lookahead to avoid useless match attempts 1180 // when we know what character comes next. 1181 // 1182 nextch = '\0'; 1183 if (*next == kRegExpExactly) 1184 nextch = *Operand(next); 1185 min = (*scan == kRegExpStar) ? 0 : 1; 1186 save = fStringInputPointer; 1187 no = Repeat(Operand(scan)); 1188 while (no >= min) { 1189 // if it could work, try it 1190 if (nextch == '\0' || *fStringInputPointer == nextch) { 1191 if (Match(next)) 1192 return 1; 1193 } 1194 // couldn't or didn't, back up 1195 no--; 1196 fStringInputPointer = save + no; 1197 } 1198 return 0; 1199 } 1200 break; 1201 1202 case kRegExpEnd: 1203 return 1; 1204 // Success! 1205 1206 default: 1207 SetError(REGEXP_MEMORY_CORRUPTION); 1208 return 0; 1209 } 1210 1211 scan = next; 1212 } 1213 1214 // 1215 // We get here only if there's trouble -- normally "case kRegExpEnd" is 1216 // the terminating point. 1217 // 1218 SetError(REGEXP_CORRUPTED_POINTERS); 1219 1220 return 0; 1221 } 1222 1223 1224 // 1225 // - Repeat - repeatedly match something simple, report how many 1226 // 1227 int32 1228 RegExp::Repeat(const char* p) const 1229 { 1230 int32 count = 0; 1231 const char* scan; 1232 const char* opnd; 1233 1234 scan = fStringInputPointer; 1235 opnd = Operand(p); 1236 switch (*p) { 1237 case kRegExpAny: 1238 count = (int32)strlen(scan); 1239 scan += count; 1240 break; 1241 1242 case kRegExpExactly: 1243 while (*opnd == *scan) { 1244 count++; 1245 scan++; 1246 } 1247 break; 1248 1249 case kRegExpAnyOf: 1250 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1251 count++; 1252 scan++; 1253 } 1254 break; 1255 1256 case kRegExpAnyBut: 1257 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1258 count++; 1259 scan++; 1260 } 1261 break; 1262 1263 default: 1264 // oh dear, called inappropriately 1265 SetError(REGEXP_INTERNAL_ERROR); 1266 count = 0; 1267 // best compromise 1268 break; 1269 } 1270 fStringInputPointer = scan; 1271 1272 return count; 1273 } 1274 1275 1276 // 1277 // - Next - dig the "next" pointer out of a node 1278 // 1279 char* 1280 RegExp::Next(char* p) 1281 { 1282 int32 offset; 1283 1284 if (p == &fDummy) 1285 return NULL; 1286 1287 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1288 if (offset == 0) 1289 return NULL; 1290 1291 if (*p == kRegExpBack) 1292 return p - offset; 1293 else 1294 return p + offset; 1295 } 1296 1297 1298 const char* 1299 RegExp::Next(const char* p) const 1300 { 1301 int32 offset; 1302 1303 if (p == &fDummy) 1304 return NULL; 1305 1306 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1307 if (offset == 0) 1308 return NULL; 1309 1310 if (*p == kRegExpBack) 1311 return p - offset; 1312 else 1313 return p + offset; 1314 } 1315 1316 1317 inline int32 1318 RegExp::UCharAt(const char* p) const 1319 { 1320 return (int32)*(unsigned char *)p; 1321 } 1322 1323 1324 inline char* 1325 RegExp::Operand(char* p) const 1326 { 1327 return p + 3; 1328 } 1329 1330 1331 inline const char* 1332 RegExp::Operand(const char* p) const 1333 { 1334 return p + 3; 1335 } 1336 1337 1338 inline bool 1339 RegExp::IsMult(char c) const 1340 { 1341 return c == '*' || c == '+' || c == '?'; 1342 } 1343 1344 1345 #ifdef DEBUG 1346 1347 1348 // 1349 // - Dump - dump a regexp onto stdout in vaguely comprehensible form 1350 // 1351 void 1352 RegExp::Dump() 1353 { 1354 const char* s; 1355 char op = kRegExpExactly; 1356 // Arbitrary non-kRegExpEnd op. 1357 const char* next; 1358 1359 s = fRegExp->program + 1; 1360 while (op != kRegExpEnd) { 1361 // While that wasn't kRegExpEnd last time... 1362 op = *s; 1363 printf("%2ld%s", s - fRegExp->program, Prop(s)); 1364 // Where, what. 1365 next = Next(s); 1366 if (next == NULL) { 1367 // next ptr 1368 printf("(0)"); 1369 } else 1370 printf("(%ld)", (s - fRegExp->program) + (next - s)); 1371 1372 s += 3; 1373 if (op == kRegExpAnyOf || op == kRegExpAnyBut 1374 || op == kRegExpExactly) { 1375 // literal string, where present 1376 while (*s != '\0') { 1377 putchar(*s); 1378 s++; 1379 } 1380 s++; 1381 } 1382 putchar('\n'); 1383 } 1384 1385 // header fields of interest 1386 if (fRegExp->regstart != '\0') 1387 printf("start `%c' ", fRegExp->regstart); 1388 1389 if (fRegExp->reganch) 1390 printf("anchored "); 1391 1392 if (fRegExp->regmust != NULL) 1393 printf("must have \"%s\"", fRegExp->regmust); 1394 1395 printf("\n"); 1396 } 1397 1398 1399 // 1400 // - Prop - printable representation of opcode 1401 // 1402 char* 1403 RegExp::Prop(const char* op) const 1404 { 1405 const char* p = NULL; 1406 static char buf[50]; 1407 1408 strcpy(buf, ":"); 1409 1410 switch (*op) { 1411 case kRegExpBol: 1412 p = "kRegExpBol"; 1413 break; 1414 1415 case kRegExpEol: 1416 p = "kRegExpEol"; 1417 break; 1418 1419 case kRegExpAny: 1420 p = "kRegExpAny"; 1421 break; 1422 1423 case kRegExpAnyOf: 1424 p = "kRegExpAnyOf"; 1425 break; 1426 1427 case kRegExpAnyBut: 1428 p = "kRegExpAnyBut"; 1429 break; 1430 1431 case kRegExpBranch: 1432 p = "kRegExpBranch"; 1433 break; 1434 1435 case kRegExpExactly: 1436 p = "kRegExpExactly"; 1437 break; 1438 1439 case kRegExpNothing: 1440 p = "kRegExpNothing"; 1441 break; 1442 1443 case kRegExpBack: 1444 p = "kRegExpBack"; 1445 break; 1446 1447 case kRegExpEnd: 1448 p = "kRegExpEnd"; 1449 break; 1450 1451 case kRegExpOpen + 1: 1452 case kRegExpOpen + 2: 1453 case kRegExpOpen + 3: 1454 case kRegExpOpen + 4: 1455 case kRegExpOpen + 5: 1456 case kRegExpOpen + 6: 1457 case kRegExpOpen + 7: 1458 case kRegExpOpen + 8: 1459 case kRegExpOpen + 9: 1460 sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen); 1461 p = NULL; 1462 break; 1463 1464 case kRegExpClose + 1: 1465 case kRegExpClose + 2: 1466 case kRegExpClose + 3: 1467 case kRegExpClose + 4: 1468 case kRegExpClose + 5: 1469 case kRegExpClose + 6: 1470 case kRegExpClose + 7: 1471 case kRegExpClose + 8: 1472 case kRegExpClose + 9: 1473 sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose); 1474 p = NULL; 1475 break; 1476 1477 case kRegExpStar: 1478 p = "kRegExpStar"; 1479 break; 1480 1481 case kRegExpPlus: 1482 p = "kRegExpPlus"; 1483 break; 1484 1485 default: 1486 RegExpError("corrupted opcode"); 1487 break; 1488 } 1489 1490 if (p != NULL) 1491 strcat(buf, p); 1492 1493 return buf; 1494 } 1495 1496 1497 void 1498 RegExp::RegExpError(const char*) const 1499 { 1500 // does nothing now, perhaps it should printf? 1501 } 1502 1503 #endif // DEBUG 1504