1 /* 2 Open Tracker License 3 4 Terms and Conditions 5 6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved. 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy of 9 this software and associated documentation files (the "Software"), to deal in 10 the Software without restriction, including without limitation the rights to 11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 of the Software, and to permit persons to whom the Software is furnished to do 13 so, subject to the following conditions: 14 15 The above copyright notice and this permission notice applies to all licensees 16 and shall be included in all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION 23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 25 Except as contained in this notice, the name of Be Incorporated shall not be 26 used in advertising or otherwise to promote the sale, use or other dealings in 27 this Software without prior written authorization from Be Incorporated. 28 29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks 30 of Be Incorporated in the United States and other countries. Other brand product 31 names are registered trademarks or trademarks of their respective holders. 32 All rights reserved. 33 */ 34 35 // This code is based on regexp.c, v.1.3 by Henry Spencer: 36 37 // @(#)regexp.c 1.3 of 18 April 87 38 // 39 // Copyright (c) 1986 by University of Toronto. 40 // Written by Henry Spencer. Not derived from licensed software. 41 // 42 // Permission is granted to anyone to use this software for any 43 // purpose on any computer system, and to redistribute it freely, 44 // subject to the following restrictions: 45 // 46 // 1. The author is not responsible for the consequences of use of 47 // this software, no matter how awful, even if they arise 48 // from defects in it. 49 // 50 // 2. The origin of this software must not be misrepresented, either 51 // by explicit claim or by omission. 52 // 53 // 3. Altered versions must be plainly marked as such, and must not 54 // be misrepresented as being the original software. 55 // 56 // Beware that some of this code is subtly aware of the way operator 57 // precedence is structured in regular expressions. Serious changes in 58 // regular-expression syntax might require a total rethink. 59 // 60 61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker 62 // project (www.opentracker.org), Jul 11, 2000. 63 64 65 #include <stdlib.h> 66 #include <stdio.h> 67 #include <string.h> 68 69 #include <Errors.h> 70 71 #include "RegExp.h" 72 73 74 // The first byte of the regexp internal "program" is actually this magic 75 // number; the start node begins in the second byte. 76 77 const uint8 kRegExpMagic = 0234; 78 79 // The "internal use only" fields in RegExp.h are present to pass info from 80 // compile to execute that permits the execute phase to run lots faster on 81 // simple cases. They are: 82 // 83 // regstart char that must begin a match; '\0' if none obvious 84 // reganch is the match anchored (at beginning-of-line only)? 85 // regmust string (pointer into program) that match must include, or NULL 86 // regmlen length of regmust string 87 // 88 // Regstart and reganch permit very fast decisions on suitable starting points 89 // for a match, cutting down the work a lot. Regmust permits fast rejection 90 // of lines that cannot possibly match. The regmust tests are costly enough 91 // that Compile() supplies a regmust only if the r.e. contains something 92 // potentially expensive (at present, the only such thing detected is * or + 93 // at the start of the r.e., which can involve a lot of backup). Regmlen is 94 // supplied because the test in RunMatcher() needs it and Compile() is 95 // computing it anyway. 96 // 97 // 98 // 99 // Structure for regexp "program". This is essentially a linear encoding 100 // of a nondeterministic finite-state machine (aka syntax charts or 101 // "railroad normal form" in parsing technology). Each node is an opcode 102 // plus a "next" pointer, possibly plus an operand. "Next" pointers of 103 // all nodes except kRegExpBranch implement concatenation; a "next" pointer 104 // with a kRegExpBranch on both ends of it is connecting two alternatives. 105 // (Here we have one of the subtle syntax dependencies: an individual 106 // kRegExpBranch (as opposed to a collection of them) is never concatenated 107 // with anything because of operator precedence.) The operand of some types 108 // of node is a literal string; for others, it is a node leading into a 109 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first 110 // node of the branch. (NB this is* not* a tree structure: the tail of the 111 // branch connects to the thing following the set of kRegExpBranches). 112 // The opcodes are: 113 // 114 115 // definition number opnd? meaning 116 enum { 117 kRegExpEnd = 0, // no End of program. 118 kRegExpBol = 1, // no Match "" at beginning of line. 119 kRegExpEol = 2, // no Match "" at end of line. 120 kRegExpAny = 3, // no Match any one character. 121 kRegExpAnyOf = 4, // str Match any character in this string. 122 kRegExpAnyBut = 5, // str Match any character not in this string. 123 kRegExpBranch = 6, // node Match this alternative, or the next... 124 kRegExpBack = 7, // no Match "", "next" ptr points backward. 125 kRegExpExactly = 8, // str Match this string. 126 kRegExpNothing = 9, // no Match empty string. 127 kRegExpStar = 10, // node Match this (simple) thing 0 or more times. 128 kRegExpPlus = 11, // node Match this (simple) thing 1 or more times. 129 kRegExpOpen = 20, // no Mark this point in input as start of #n. 130 // kRegExpOpen + 1 is number 1, etc. 131 kRegExpClose = 30 // no Analogous to kRegExpOpen. 132 }; 133 134 // 135 // Opcode notes: 136 // 137 // kRegExpBranch The set of branches constituting a single choice are 138 // hooked together with their "next" pointers, since precedence prevents 139 // anything being concatenated to any individual branch. The 140 // "next" pointer of the last kRegExpBranch in a choice points to the 141 // thing following the whole choice. This is also where the 142 // final "next" pointer of each individual branch points; each 143 // branch starts with the operand node of a kRegExpBranch node. 144 // 145 // kRegExpBack Normal "next" pointers all implicitly point forward; 146 // kRegExpBack exists to make loop structures possible. 147 // 148 // kRegExpStar,kRegExpPlus '?', and complex '*' and '+', are implemented as 149 // circular kRegExpBranch structures using kRegExpBack. Simple cases 150 // (one character per match) are implemented with kRegExpStar and 151 // kRegExpPlus for speed and to minimize recursive plunges. 152 // 153 // kRegExpOpen,kRegExpClose ...are numbered at compile time. 154 // 155 // 156 // 157 // A node is one char of opcode followed by two chars of "next" pointer. 158 // "Next" pointers are stored as two 8-bit pieces, high order first. The 159 // value is a positive offset from the opcode of the node containing it. 160 // An operand, if any, simply follows the node. (Note that much of the 161 // code generation knows about this implicit relationship.) 162 // 163 // Using two bytes for the "next" pointer is vast overkill for most things, 164 // but allows patterns to get big without disasters. 165 // 166 167 const char* kMeta = "^$.[()|?+*\\"; 168 const int32 kMaxSize = 32767L; 169 // Probably could be 65535L. 170 171 172 // Flags to be passed up and down: 173 enum { 174 kHasWidth = 01, // Known never to match null string. 175 kSimple = 02, // Simple enough to be kRegExpStar/kRegExpPlus operand. 176 kSPStart = 04, // Starts with * or +. 177 kWorst = 0 // Worst case. 178 }; 179 180 181 const char* kRegExpErrorStringArray[] = { 182 "Unmatched parenthesis.", 183 "Expression too long.", 184 "Too many parenthesis.", 185 "Junk on end.", 186 "*+? operand may be empty.", 187 "Nested *?+.", 188 "Invalid bracket range.", 189 "Unmatched brackets.", 190 "Internal error.", 191 "?+* follows nothing.", 192 "Trailing \\.", 193 "Corrupted expression.", 194 "Memory corruption.", 195 "Corrupted pointers.", 196 "Corrupted opcode." 197 }; 198 199 200 #ifdef DEBUG 201 int32 regnarrate = 0; 202 #endif 203 204 205 // #pragma mark - RegExp 206 207 208 RegExp::RegExp() 209 : 210 fError(B_OK), 211 fRegExp(NULL) 212 { 213 } 214 215 216 RegExp::RegExp(const char* pattern) 217 : 218 fError(B_OK), 219 fRegExp(NULL) 220 { 221 fRegExp = Compile(pattern); 222 } 223 224 225 RegExp::RegExp(const BString &pattern) 226 : 227 fError(B_OK), 228 fRegExp(NULL) 229 { 230 fRegExp = Compile(pattern.String()); 231 } 232 233 234 RegExp::~RegExp() 235 { 236 free(fRegExp); 237 } 238 239 240 status_t 241 RegExp::InitCheck() const 242 { 243 return fError; 244 } 245 246 247 status_t 248 RegExp::SetTo(const char* pattern) 249 { 250 fError = B_OK; 251 free(fRegExp); 252 fRegExp = Compile(pattern); 253 return fError; 254 } 255 256 257 status_t 258 RegExp::SetTo(const BString &pattern) 259 { 260 fError = B_OK; 261 free(fRegExp); 262 fRegExp = Compile(pattern.String()); 263 return fError; 264 } 265 266 267 bool 268 RegExp::Matches(const char* string) const 269 { 270 if (fRegExp == NULL || string == NULL) 271 return false; 272 273 return RunMatcher(fRegExp, string) == 1; 274 } 275 276 277 bool 278 RegExp::Matches(const BString &string) const 279 { 280 if (fRegExp == NULL) 281 return false; 282 283 return RunMatcher(fRegExp, string.String()) == 1; 284 } 285 286 287 // 288 // - Compile - compile a regular expression into internal code 289 // 290 // We can't allocate space until we know how big the compiled form will be, 291 // but we can't compile it (and thus know how big it is) until we've got a 292 // place to put the code. So we cheat: we compile it twice, once with code 293 // generation turned off and size counting turned on, and once "for real". 294 // This also means that we don't allocate space until we are sure that the 295 // thing really will compile successfully, and we never have to move the 296 // code and thus invalidate pointers into it. (Note that it has to be in 297 // one piece because free() must be able to free it all.) 298 // 299 // Beware that the optimization-preparation code in here knows about some 300 // of the structure of the compiled regexp. 301 regexp* 302 RegExp::Compile(const char* exp) 303 { 304 regexp* r; 305 const char* scan; 306 const char* longest; 307 int32 len; 308 int32 flags; 309 310 if (exp == NULL) { 311 SetError(B_BAD_VALUE); 312 return NULL; 313 } 314 315 // First pass: determine size, legality. 316 fInputScanPointer = exp; 317 fParenthesisCount = 1; 318 fCodeSize = 0L; 319 fCodeEmitPointer = &fDummy; 320 Char(kRegExpMagic); 321 if (Reg(0, &flags) == NULL) 322 return NULL; 323 324 // Small enough for pointer-storage convention? 325 if (fCodeSize >= kMaxSize) { 326 SetError(REGEXP_TOO_BIG); 327 return NULL; 328 } 329 330 r = (regexp*)malloc(sizeof(regexp) + fCodeSize); 331 // Allocate space 332 333 if (r == NULL) { 334 SetError(B_NO_MEMORY); 335 return NULL; 336 } 337 338 // Second pass: emit code. 339 fInputScanPointer = exp; 340 fParenthesisCount = 1; 341 fCodeEmitPointer = r->program; 342 Char(kRegExpMagic); 343 if (Reg(0, &flags) == NULL) { 344 free(r); 345 return NULL; 346 } 347 348 // Dig out information for optimizations. 349 r->regstart = '\0'; 350 // Worst-case defaults. 351 r->reganch = 0; 352 r->regmust = NULL; 353 r->regmlen = 0; 354 scan = r->program + 1; 355 // First kRegExpBranch. 356 if (*Next((char*)scan) == kRegExpEnd) { 357 // Only one top-level choice. 358 scan = Operand(scan); 359 360 // Starting-point info. 361 if (*scan == kRegExpExactly) 362 r->regstart = *Operand(scan); 363 else if (*scan == kRegExpBol) 364 r->reganch++; 365 366 // If there's something expensive in the r.e., find the 367 // longest literal string that must appear and make it the 368 // regmust. Resolve ties in favor of later strings, since 369 // the regstart check works with the beginning of the r.e. 370 // and avoiding duplication strengthens checking. Not a 371 // strong reason, but sufficient in the absence of others. 372 if ((flags & kSPStart) != 0) { 373 longest = NULL; 374 len = 0; 375 for (; scan != NULL; scan = Next((char*)scan)) { 376 if (*scan == kRegExpExactly 377 && (int32)strlen(Operand(scan)) >= len) { 378 longest = Operand(scan); 379 len = (int32)strlen(Operand(scan)); 380 } 381 } 382 r->regmust = longest; 383 r->regmlen = len; 384 } 385 } 386 387 return r; 388 } 389 390 391 regexp* 392 RegExp::Expression() const 393 { 394 return fRegExp; 395 } 396 397 398 const char* 399 RegExp::ErrorString() const 400 { 401 if (fError >= REGEXP_UNMATCHED_PARENTHESIS 402 && fError <= REGEXP_CORRUPTED_OPCODE) { 403 return kRegExpErrorStringArray[fError - B_ERRORS_END]; 404 } 405 406 return strerror(fError); 407 } 408 409 410 void 411 RegExp::SetError(status_t error) const 412 { 413 fError = error; 414 } 415 416 417 // 418 // - Reg - regular expression, i.e. main body or parenthesized thing 419 // 420 // Caller must absorb opening parenthesis. 421 // 422 // Combining parenthesis handling with the base level of regular expression 423 // is a trifle forced, but the need to tie the tails of the branches to what 424 // follows makes it hard to avoid. 425 // 426 char* 427 RegExp::Reg(int32 paren, int32* flagp) 428 { 429 char* ret; 430 char* br; 431 char* ender; 432 int32 parno = 0; 433 int32 flags; 434 435 *flagp = kHasWidth; 436 // Tentatively. 437 438 // Make an kRegExpOpen node, if parenthesized. 439 if (paren) { 440 if (fParenthesisCount >= kSubExpressionMax) { 441 SetError(REGEXP_TOO_MANY_PARENTHESIS); 442 return NULL; 443 } 444 parno = fParenthesisCount; 445 fParenthesisCount++; 446 ret = Node((char)(kRegExpOpen + parno)); 447 } else 448 ret = NULL; 449 450 // Pick up the branches, linking them together. 451 br = Branch(&flags); 452 if (br == NULL) 453 return NULL; 454 455 if (ret != NULL) { 456 Tail(ret, br); 457 // kRegExpOpen -> first 458 } else 459 ret = br; 460 461 if (!(flags & kHasWidth)) 462 *flagp &= ~kHasWidth; 463 464 *flagp |= flags&kSPStart; 465 while (*fInputScanPointer == '|') { 466 fInputScanPointer++; 467 br = Branch(&flags); 468 if (br == NULL) 469 return NULL; 470 471 Tail(ret, br); 472 // kRegExpBranch -> kRegExpBranch. 473 if (!(flags & kHasWidth)) 474 *flagp &= ~kHasWidth; 475 476 *flagp |= flags&kSPStart; 477 } 478 479 // Make a closing node, and hook it on the end. 480 ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd); 481 Tail(ret, ender); 482 483 // Hook the tails of the branches to the closing node. 484 for (br = ret; br != NULL; br = Next(br)) 485 OpTail(br, ender); 486 487 // Check for proper termination. 488 if (paren && *fInputScanPointer++ != ')') { 489 SetError(REGEXP_UNMATCHED_PARENTHESIS); 490 return NULL; 491 } else if (!paren && *fInputScanPointer != '\0') { 492 if (*fInputScanPointer == ')') { 493 SetError(REGEXP_UNMATCHED_PARENTHESIS); 494 return NULL; 495 } else { 496 SetError(REGEXP_JUNK_ON_END); 497 return NULL; // "Can't happen". 498 } 499 // NOTREACHED 500 } 501 502 return ret; 503 } 504 505 506 // 507 // - Branch - one alternative of an | operator 508 // 509 // Implements the concatenation operator. 510 // 511 char* 512 RegExp::Branch(int32* flagp) 513 { 514 char* ret; 515 char* chain; 516 char* latest; 517 int32 flags; 518 519 *flagp = kWorst; 520 // Tentatively. 521 522 ret = Node(kRegExpBranch); 523 chain = NULL; 524 while (*fInputScanPointer != '\0' 525 && *fInputScanPointer != '|' 526 && *fInputScanPointer != ')') { 527 latest = Piece(&flags); 528 if (latest == NULL) 529 return NULL; 530 531 *flagp |= flags & kHasWidth; 532 if (chain == NULL) { 533 // First piece. 534 *flagp |= flags & kSPStart; 535 } else 536 Tail(chain, latest); 537 538 chain = latest; 539 } 540 541 if (chain == NULL) { 542 // Loop ran zero times. 543 Node(kRegExpNothing); 544 } 545 546 return ret; 547 } 548 549 550 // 551 // - Piece - something followed by possible [*+?] 552 // 553 // Note that the branching code sequences used for ? and the general cases 554 // of * and + are somewhat optimized: they use the same kRegExpNothing node 555 // as both the endmarker for their branch list and the body of the last 556 // branch. It might seem that this node could be dispensed with entirely, 557 // but the endmarker role is not redundant. 558 // 559 char* 560 RegExp::Piece(int32* flagp) 561 { 562 char* ret; 563 char op; 564 char* next; 565 int32 flags; 566 567 ret = Atom(&flags); 568 if (ret == NULL) 569 return NULL; 570 571 op = *fInputScanPointer; 572 if (!IsMult(op)) { 573 *flagp = flags; 574 return ret; 575 } 576 577 if (!(flags & kHasWidth) && op != '?') { 578 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY); 579 return NULL; 580 } 581 *flagp = op != '+' ? kWorst | kSPStart : kWorst | kHasWidth; 582 583 if (op == '*' && (flags & kSimple)) 584 Insert(kRegExpStar, ret); 585 else if (op == '*') { 586 // Emit x* as (x&|), where & means "self". 587 Insert(kRegExpBranch, ret); 588 // Either x 589 OpTail(ret, Node(kRegExpBack)); 590 // and loop 591 OpTail(ret, ret); 592 // back 593 Tail(ret, Node(kRegExpBranch)); 594 // or 595 Tail(ret, Node(kRegExpNothing)); 596 // null. 597 } else if (op == '+' && (flags & kSimple)) 598 Insert(kRegExpPlus, ret); 599 else if (op == '+') { 600 // Emit x+ as x(&|), where & means "self". 601 next = Node(kRegExpBranch); 602 // Either 603 Tail(ret, next); 604 Tail(Node(kRegExpBack), ret); 605 // loop back 606 Tail(next, Node(kRegExpBranch)); 607 // or 608 Tail(ret, Node(kRegExpNothing)); 609 // null. 610 } else if (op == '?') { 611 // Emit x? as (x|) 612 Insert(kRegExpBranch, ret); 613 // Either x 614 Tail(ret, Node(kRegExpBranch)); 615 // or 616 next = Node(kRegExpNothing); 617 // null. 618 Tail(ret, next); 619 OpTail(ret, next); 620 } 621 622 fInputScanPointer++; 623 if (IsMult(*fInputScanPointer)) { 624 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS); 625 return NULL; 626 } 627 628 return ret; 629 } 630 631 632 // 633 // - Atom - the lowest level 634 // 635 // Optimization: gobbles an entire sequence of ordinary characters so that 636 // it can turn them into a single node, which is smaller to store and 637 // faster to run. Backslashed characters are exceptions, each becoming a 638 // separate node; the code is simpler that way and it's not worth fixing. 639 // 640 char* 641 RegExp::Atom(int32* flagp) 642 { 643 char* ret; 644 int32 flags; 645 646 *flagp = kWorst; 647 // tentatively 648 649 switch (*fInputScanPointer++) { 650 case '^': 651 ret = Node(kRegExpBol); 652 break; 653 654 case '$': 655 ret = Node(kRegExpEol); 656 break; 657 658 case '.': 659 ret = Node(kRegExpAny); 660 *flagp |= kHasWidth|kSimple; 661 break; 662 663 case '[': 664 { 665 int32 cclass; 666 int32 classend; 667 668 if (*fInputScanPointer == '^') { 669 // complement of range 670 ret = Node(kRegExpAnyBut); 671 fInputScanPointer++; 672 } else 673 ret = Node(kRegExpAnyOf); 674 if (*fInputScanPointer == ']' || *fInputScanPointer == '-') 675 Char(*fInputScanPointer++); 676 while (*fInputScanPointer != '\0' 677 && *fInputScanPointer != ']') { 678 if (*fInputScanPointer == '-') { 679 fInputScanPointer++; 680 if (*fInputScanPointer == ']' 681 || *fInputScanPointer == '\0') { 682 Char('-'); 683 } else { 684 cclass = UCharAt(fInputScanPointer - 2) + 1; 685 classend = UCharAt(fInputScanPointer); 686 if (cclass > classend + 1) { 687 SetError(REGEXP_INVALID_BRACKET_RANGE); 688 return NULL; 689 } 690 for (; cclass <= classend; cclass++) 691 Char((char)cclass); 692 fInputScanPointer++; 693 } 694 } else 695 Char(*fInputScanPointer++); 696 } 697 Char('\0'); 698 if (*fInputScanPointer != ']') { 699 SetError(REGEXP_UNMATCHED_BRACKET); 700 return NULL; 701 } 702 fInputScanPointer++; 703 *flagp |= kHasWidth | kSimple; 704 break; 705 } 706 707 case '(': 708 ret = Reg(1, &flags); 709 if (ret == NULL) 710 return NULL; 711 *flagp |= flags & (kHasWidth | kSPStart); 712 break; 713 714 case '\0': 715 case '|': 716 case ')': 717 SetError(REGEXP_INTERNAL_ERROR); 718 return NULL; 719 // supposed to be caught earlier 720 721 case '?': 722 case '+': 723 case '*': 724 SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING); 725 return NULL; 726 727 case '\\': 728 if (*fInputScanPointer == '\0') { 729 SetError(REGEXP_TRAILING_BACKSLASH); 730 return NULL; 731 } 732 ret = Node(kRegExpExactly); 733 Char(*fInputScanPointer++); 734 Char('\0'); 735 *flagp |= kHasWidth|kSimple; 736 break; 737 738 default: 739 { 740 int32 len; 741 char ender; 742 743 fInputScanPointer--; 744 len = (int32)strcspn(fInputScanPointer, kMeta); 745 if (len <= 0) { 746 SetError(REGEXP_INTERNAL_ERROR); 747 return NULL; 748 } 749 750 ender = *(fInputScanPointer + len); 751 if (len > 1 && IsMult(ender)) { 752 // Back off clear of ?+* operand. 753 len--; 754 } 755 756 *flagp |= kHasWidth; 757 if (len == 1) 758 *flagp |= kSimple; 759 760 ret = Node(kRegExpExactly); 761 while (len > 0) { 762 Char(*fInputScanPointer++); 763 len--; 764 } 765 766 Char('\0'); 767 break; 768 } 769 } 770 771 return ret; 772 } 773 774 775 // 776 // - Node - emit a node 777 // 778 // Returns location. 779 // 780 char* 781 RegExp::Node(char op) 782 { 783 char* ret; 784 char* ptr; 785 786 ret = fCodeEmitPointer; 787 if (ret == &fDummy) { 788 fCodeSize += 3; 789 return ret; 790 } 791 792 ptr = ret; 793 *ptr++ = op; 794 *ptr++ = '\0'; 795 // Null "next" pointer. 796 *ptr++ = '\0'; 797 fCodeEmitPointer = ptr; 798 799 return ret; 800 } 801 802 803 // 804 // - Char - emit (if appropriate) a byte of code 805 // 806 void 807 RegExp::Char(char b) 808 { 809 if (fCodeEmitPointer != &fDummy) 810 *fCodeEmitPointer++ = b; 811 else 812 fCodeSize++; 813 } 814 815 816 // 817 // - Insert - insert an operator in front of already-emitted operand 818 // 819 // Means relocating the operand. 820 // 821 void 822 RegExp::Insert(char op, char* opnd) 823 { 824 char* src; 825 char* dst; 826 char* place; 827 828 if (fCodeEmitPointer == &fDummy) { 829 fCodeSize += 3; 830 return; 831 } 832 833 src = fCodeEmitPointer; 834 fCodeEmitPointer += 3; 835 dst = fCodeEmitPointer; 836 while (src > opnd) 837 *--dst = *--src; 838 839 place = opnd; 840 // Op node, where operand used to be. 841 *place++ = op; 842 *place++ = '\0'; 843 *place++ = '\0'; 844 } 845 846 847 // 848 // - Tail - set the next-pointer at the end of a node chain 849 // 850 void 851 RegExp::Tail(char* p, char* val) 852 { 853 char* scan; 854 char* temp; 855 int32 offset; 856 857 if (p == &fDummy) 858 return; 859 860 // Find last node. 861 scan = p; 862 for (;;) { 863 temp = Next(scan); 864 if (temp == NULL) 865 break; 866 scan = temp; 867 } 868 869 if (scan[0] == kRegExpBack) 870 offset = scan - val; 871 else 872 offset = val - scan; 873 874 scan[1] = (char)((offset >> 8) & 0377); 875 scan[2] = (char)(offset & 0377); 876 } 877 878 879 // 880 // - OpTail - Tail on operand of first argument; nop if operandless 881 // 882 void 883 RegExp::OpTail(char* p, char* val) 884 { 885 // "Operandless" and "op != kRegExpBranch" are synonymous in practice. 886 if (p == NULL || p == &fDummy || *p != kRegExpBranch) 887 return; 888 889 Tail(Operand(p), val); 890 } 891 892 // 893 // RunMatcher and friends 894 // 895 896 897 // 898 // - RunMatcher - match a regexp against a string 899 // 900 int32 901 RegExp::RunMatcher(regexp* prog, const char* string) const 902 { 903 const char* s; 904 905 // be paranoid... 906 if (prog == NULL || string == NULL) { 907 SetError(B_BAD_VALUE); 908 return 0; 909 } 910 911 // check validity of program 912 if (UCharAt(prog->program) != kRegExpMagic) { 913 SetError(REGEXP_CORRUPTED_PROGRAM); 914 return 0; 915 } 916 917 // if there is a "must appear" string, look for it 918 if (prog->regmust != NULL) { 919 s = string; 920 while ((s = strchr(s, prog->regmust[0])) != NULL) { 921 if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) { 922 // found it 923 break; 924 } 925 s++; 926 } 927 if (s == NULL) { 928 // not present 929 return 0; 930 } 931 } 932 933 // mark beginning of line for ^ 934 fRegBol = string; 935 936 // simplest case: anchored match need be tried only once 937 if (prog->reganch) 938 return Try(prog, (char*)string); 939 940 // messy cases: unanchored match 941 s = string; 942 if (prog->regstart != '\0') { 943 // we know what char it must start with 944 while ((s = strchr(s, prog->regstart)) != NULL) { 945 if (Try(prog, (char*)s)) 946 return 1; 947 s++; 948 } 949 } else { 950 // we don't -- general case. 951 do { 952 if (Try(prog, (char*)s)) 953 return 1; 954 } while (*s++ != '\0'); 955 } 956 957 // failure 958 return 0; 959 } 960 961 962 // 963 // - Try - try match at specific point 964 // 965 // Returns 0 on failure, 1 on success. 966 // 967 int32 968 RegExp::Try(regexp* prog, const char* string) const 969 { 970 int32 i; 971 const char **sp; 972 const char **ep; 973 974 fStringInputPointer = string; 975 fStartPArrayPointer = prog->startp; 976 fEndPArrayPointer = prog->endp; 977 978 sp = prog->startp; 979 ep = prog->endp; 980 for (i = kSubExpressionMax; i > 0; i--) { 981 *sp++ = NULL; 982 *ep++ = NULL; 983 } 984 if (Match(prog->program + 1)) { 985 prog->startp[0] = string; 986 prog->endp[0] = fStringInputPointer; 987 return 1; 988 } else 989 return 0; 990 } 991 992 993 // 994 // - Match - main matching routine 995 // 996 // Conceptually the strategy is simple: check to see whether the current 997 // node matches, call self recursively to see whether the rest matches, 998 // and then act accordingly. In practice we make some effort to avoid 999 // recursion, in particular by going through "ordinary" nodes (that don't 1000 // need to know whether the rest of the match failed) by a loop instead of 1001 // by recursion. 1002 // 1003 // Returns 0 on failure, 1 on success. 1004 // 1005 int32 1006 RegExp::Match(const char* prog) const 1007 { 1008 const char* scan; // Current node. 1009 const char* next; // Next node. 1010 1011 scan = prog; 1012 #ifdef DEBUG 1013 if (scan != NULL && regnarrate) 1014 fprintf(stderr, "%s(\n", Prop(scan)); 1015 #endif 1016 while (scan != NULL) { 1017 #ifdef DEBUG 1018 if (regnarrate) 1019 fprintf(stderr, "%s...\n", Prop(scan)); 1020 #endif 1021 next = Next(scan); 1022 1023 switch (*scan) { 1024 case kRegExpBol: 1025 if (fStringInputPointer != fRegBol) 1026 return 0; 1027 break; 1028 1029 case kRegExpEol: 1030 if (*fStringInputPointer != '\0') 1031 return 0; 1032 break; 1033 1034 case kRegExpAny: 1035 if (*fStringInputPointer == '\0') 1036 return 0; 1037 fStringInputPointer++; 1038 break; 1039 1040 case kRegExpExactly: 1041 { 1042 const char* opnd = Operand(scan); 1043 // Inline the first character, for speed. 1044 if (*opnd != *fStringInputPointer) 1045 return 0; 1046 1047 uint32 len = strlen(opnd); 1048 if (len > 1 1049 && strncmp(opnd, fStringInputPointer, len) != 0) { 1050 return 0; 1051 } 1052 1053 fStringInputPointer += len; 1054 } 1055 break; 1056 1057 case kRegExpAnyOf: 1058 if (*fStringInputPointer == '\0' 1059 || strchr(Operand(scan), *fStringInputPointer) == NULL) { 1060 return 0; 1061 } 1062 fStringInputPointer++; 1063 break; 1064 1065 case kRegExpAnyBut: 1066 if (*fStringInputPointer == '\0' 1067 || strchr(Operand(scan), *fStringInputPointer) != NULL) { 1068 return 0; 1069 } 1070 fStringInputPointer++; 1071 break; 1072 1073 case kRegExpNothing: 1074 break; 1075 1076 case kRegExpBack: 1077 break; 1078 1079 case kRegExpOpen + 1: 1080 case kRegExpOpen + 2: 1081 case kRegExpOpen + 3: 1082 case kRegExpOpen + 4: 1083 case kRegExpOpen + 5: 1084 case kRegExpOpen + 6: 1085 case kRegExpOpen + 7: 1086 case kRegExpOpen + 8: 1087 case kRegExpOpen + 9: 1088 { 1089 int32 no; 1090 const char* save; 1091 1092 no = *scan - kRegExpOpen; 1093 save = fStringInputPointer; 1094 1095 if (Match(next)) { 1096 // 1097 // Don't set startp if some later 1098 // invocation of the same parentheses 1099 // already has. 1100 // 1101 if (fStartPArrayPointer[no] == NULL) 1102 fStartPArrayPointer[no] = save; 1103 return 1; 1104 } else 1105 return 0; 1106 } 1107 break; 1108 1109 case kRegExpClose + 1: 1110 case kRegExpClose + 2: 1111 case kRegExpClose + 3: 1112 case kRegExpClose + 4: 1113 case kRegExpClose + 5: 1114 case kRegExpClose + 6: 1115 case kRegExpClose + 7: 1116 case kRegExpClose + 8: 1117 case kRegExpClose + 9: 1118 { 1119 int32 no; 1120 const char* save; 1121 1122 no = *scan - kRegExpClose; 1123 save = fStringInputPointer; 1124 1125 if (Match(next)) { 1126 // 1127 // Don't set endp if some later 1128 // invocation of the same parentheses 1129 // already has. 1130 // 1131 if (fEndPArrayPointer[no] == NULL) 1132 fEndPArrayPointer[no] = save; 1133 return 1; 1134 } else 1135 return 0; 1136 } 1137 break; 1138 1139 case kRegExpBranch: 1140 { 1141 const char* save; 1142 1143 if (*next != kRegExpBranch) { 1144 // no choice 1145 next = Operand(scan); 1146 // avoid recursion 1147 } else { 1148 do { 1149 save = fStringInputPointer; 1150 if (Match(Operand(scan))) 1151 return 1; 1152 fStringInputPointer = save; 1153 scan = Next(scan); 1154 } while (scan != NULL && *scan == kRegExpBranch); 1155 return 0; 1156 // NOTREACHED 1157 } 1158 } 1159 break; 1160 1161 case kRegExpStar: 1162 case kRegExpPlus: 1163 { 1164 char nextch; 1165 int32 no; 1166 const char* save; 1167 int32 min; 1168 1169 // 1170 // Lookahead to avoid useless match attempts 1171 // when we know what character comes next. 1172 // 1173 nextch = '\0'; 1174 if (*next == kRegExpExactly) 1175 nextch = *Operand(next); 1176 min = (*scan == kRegExpStar) ? 0 : 1; 1177 save = fStringInputPointer; 1178 no = Repeat(Operand(scan)); 1179 while (no >= min) { 1180 // if it could work, try it 1181 if (nextch == '\0' || *fStringInputPointer == nextch) { 1182 if (Match(next)) 1183 return 1; 1184 } 1185 // couldn't or didn't, back up 1186 no--; 1187 fStringInputPointer = save + no; 1188 } 1189 return 0; 1190 } 1191 break; 1192 1193 case kRegExpEnd: 1194 return 1; 1195 // Success! 1196 1197 default: 1198 SetError(REGEXP_MEMORY_CORRUPTION); 1199 return 0; 1200 } 1201 1202 scan = next; 1203 } 1204 1205 // 1206 // We get here only if there's trouble -- normally "case kRegExpEnd" is 1207 // the terminating point. 1208 // 1209 SetError(REGEXP_CORRUPTED_POINTERS); 1210 1211 return 0; 1212 } 1213 1214 1215 // 1216 // - Repeat - repeatedly match something simple, report how many 1217 // 1218 int32 1219 RegExp::Repeat(const char* p) const 1220 { 1221 int32 count = 0; 1222 const char* scan; 1223 const char* opnd; 1224 1225 scan = fStringInputPointer; 1226 opnd = Operand(p); 1227 switch (*p) { 1228 case kRegExpAny: 1229 count = (int32)strlen(scan); 1230 scan += count; 1231 break; 1232 1233 case kRegExpExactly: 1234 while (*opnd == *scan) { 1235 count++; 1236 scan++; 1237 } 1238 break; 1239 1240 case kRegExpAnyOf: 1241 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1242 count++; 1243 scan++; 1244 } 1245 break; 1246 1247 case kRegExpAnyBut: 1248 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1249 count++; 1250 scan++; 1251 } 1252 break; 1253 1254 default: 1255 // oh dear, called inappropriately 1256 SetError(REGEXP_INTERNAL_ERROR); 1257 count = 0; 1258 // best compromise 1259 break; 1260 } 1261 fStringInputPointer = scan; 1262 1263 return count; 1264 } 1265 1266 1267 // 1268 // - Next - dig the "next" pointer out of a node 1269 // 1270 char* 1271 RegExp::Next(char* p) 1272 { 1273 int32 offset; 1274 1275 if (p == &fDummy) 1276 return NULL; 1277 1278 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1279 if (offset == 0) 1280 return NULL; 1281 1282 if (*p == kRegExpBack) 1283 return p - offset; 1284 else 1285 return p + offset; 1286 } 1287 1288 1289 const char* 1290 RegExp::Next(const char* p) const 1291 { 1292 int32 offset; 1293 1294 if (p == &fDummy) 1295 return NULL; 1296 1297 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1298 if (offset == 0) 1299 return NULL; 1300 1301 if (*p == kRegExpBack) 1302 return p - offset; 1303 else 1304 return p + offset; 1305 } 1306 1307 1308 inline int32 1309 RegExp::UCharAt(const char* p) const 1310 { 1311 return (int32)*(unsigned char *)p; 1312 } 1313 1314 1315 inline char* 1316 RegExp::Operand(char* p) const 1317 { 1318 return p + 3; 1319 } 1320 1321 1322 inline const char* 1323 RegExp::Operand(const char* p) const 1324 { 1325 return p + 3; 1326 } 1327 1328 1329 inline bool 1330 RegExp::IsMult(char c) const 1331 { 1332 return c == '*' || c == '+' || c == '?'; 1333 } 1334 1335 1336 #ifdef DEBUG 1337 1338 1339 // 1340 // - Dump - dump a regexp onto stdout in vaguely comprehensible form 1341 // 1342 void 1343 RegExp::Dump() 1344 { 1345 const char* s; 1346 char op = kRegExpExactly; 1347 // Arbitrary non-kRegExpEnd op. 1348 const char* next; 1349 1350 s = fRegExp->program + 1; 1351 while (op != kRegExpEnd) { 1352 // While that wasn't kRegExpEnd last time... 1353 op = *s; 1354 printf("%2ld%s", s - fRegExp->program, Prop(s)); 1355 // Where, what. 1356 next = Next(s); 1357 if (next == NULL) { 1358 // next ptr 1359 printf("(0)"); 1360 } else 1361 printf("(%ld)", (s - fRegExp->program) + (next - s)); 1362 1363 s += 3; 1364 if (op == kRegExpAnyOf || op == kRegExpAnyBut 1365 || op == kRegExpExactly) { 1366 // literal string, where present 1367 while (*s != '\0') { 1368 putchar(*s); 1369 s++; 1370 } 1371 s++; 1372 } 1373 putchar('\n'); 1374 } 1375 1376 // header fields of interest 1377 if (fRegExp->regstart != '\0') 1378 printf("start `%c' ", fRegExp->regstart); 1379 1380 if (fRegExp->reganch) 1381 printf("anchored "); 1382 1383 if (fRegExp->regmust != NULL) 1384 printf("must have \"%s\"", fRegExp->regmust); 1385 1386 printf("\n"); 1387 } 1388 1389 1390 // 1391 // - Prop - printable representation of opcode 1392 // 1393 char* 1394 RegExp::Prop(const char* op) const 1395 { 1396 const char* p = NULL; 1397 static char buf[50]; 1398 1399 strcpy(buf, ":"); 1400 1401 switch (*op) { 1402 case kRegExpBol: 1403 p = "kRegExpBol"; 1404 break; 1405 1406 case kRegExpEol: 1407 p = "kRegExpEol"; 1408 break; 1409 1410 case kRegExpAny: 1411 p = "kRegExpAny"; 1412 break; 1413 1414 case kRegExpAnyOf: 1415 p = "kRegExpAnyOf"; 1416 break; 1417 1418 case kRegExpAnyBut: 1419 p = "kRegExpAnyBut"; 1420 break; 1421 1422 case kRegExpBranch: 1423 p = "kRegExpBranch"; 1424 break; 1425 1426 case kRegExpExactly: 1427 p = "kRegExpExactly"; 1428 break; 1429 1430 case kRegExpNothing: 1431 p = "kRegExpNothing"; 1432 break; 1433 1434 case kRegExpBack: 1435 p = "kRegExpBack"; 1436 break; 1437 1438 case kRegExpEnd: 1439 p = "kRegExpEnd"; 1440 break; 1441 1442 case kRegExpOpen + 1: 1443 case kRegExpOpen + 2: 1444 case kRegExpOpen + 3: 1445 case kRegExpOpen + 4: 1446 case kRegExpOpen + 5: 1447 case kRegExpOpen + 6: 1448 case kRegExpOpen + 7: 1449 case kRegExpOpen + 8: 1450 case kRegExpOpen + 9: 1451 sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen); 1452 p = NULL; 1453 break; 1454 1455 case kRegExpClose + 1: 1456 case kRegExpClose + 2: 1457 case kRegExpClose + 3: 1458 case kRegExpClose + 4: 1459 case kRegExpClose + 5: 1460 case kRegExpClose + 6: 1461 case kRegExpClose + 7: 1462 case kRegExpClose + 8: 1463 case kRegExpClose + 9: 1464 sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose); 1465 p = NULL; 1466 break; 1467 1468 case kRegExpStar: 1469 p = "kRegExpStar"; 1470 break; 1471 1472 case kRegExpPlus: 1473 p = "kRegExpPlus"; 1474 break; 1475 1476 default: 1477 RegExpError("corrupted opcode"); 1478 break; 1479 } 1480 1481 if (p != NULL) 1482 strcat(buf, p); 1483 1484 return buf; 1485 } 1486 1487 1488 void 1489 RegExp::RegExpError(const char*) const 1490 { 1491 // does nothing now, perhaps it should printf? 1492 } 1493 1494 #endif // DEBUG 1495