1 /* 2 Open Tracker License 3 4 Terms and Conditions 5 6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved. 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy of 9 this software and associated documentation files (the "Software"), to deal in 10 the Software without restriction, including without limitation the rights to 11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 of the Software, and to permit persons to whom the Software is furnished to do 13 so, subject to the following conditions: 14 15 The above copyright notice and this permission notice applies to all licensees 16 and shall be included in all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION 23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 25 Except as contained in this notice, the name of Be Incorporated shall not be 26 used in advertising or otherwise to promote the sale, use or other dealings in 27 this Software without prior written authorization from Be Incorporated. 28 29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks 30 of Be Incorporated in the United States and other countries. Other brand product 31 names are registered trademarks or trademarks of their respective holders. 32 All rights reserved. 33 */ 34 35 // This code is based on regexp.c, v.1.3 by Henry Spencer: 36 37 // @(#)regexp.c 1.3 of 18 April 87 38 // 39 // Copyright (c) 1986 by University of Toronto. 40 // Written by Henry Spencer. Not derived from licensed software. 41 // 42 // Permission is granted to anyone to use this software for any 43 // purpose on any computer system, and to redistribute it freely, 44 // subject to the following restrictions: 45 // 46 // 1. The author is not responsible for the consequences of use of 47 // this software, no matter how awful, even if they arise 48 // from defects in it. 49 // 50 // 2. The origin of this software must not be misrepresented, either 51 // by explicit claim or by omission. 52 // 53 // 3. Altered versions must be plainly marked as such, and must not 54 // be misrepresented as being the original software. 55 // 56 // Beware that some of this code is subtly aware of the way operator 57 // precedence is structured in regular expressions. Serious changes in 58 // regular-expression syntax might require a total rethink. 59 // 60 61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker 62 // project (www.opentracker.org), Jul 11, 2000. 63 64 65 #include <stdlib.h> 66 #include <stdio.h> 67 #include <string.h> 68 69 #include <Errors.h> 70 71 #include "RegExp.h" 72 73 74 // The first byte of the regexp internal "program" is actually this magic 75 // number; the start node begins in the second byte. 76 77 const uint8 kRegExpMagic = 0234; 78 79 // The "internal use only" fields in RegExp.h are present to pass info from 80 // compile to execute that permits the execute phase to run lots faster on 81 // simple cases. They are: 82 // 83 // regstart char that must begin a match; '\0' if none obvious 84 // reganch is the match anchored (at beginning-of-line only)? 85 // regmust string (pointer into program) that match must include, or NULL 86 // regmlen length of regmust string 87 // 88 // Regstart and reganch permit very fast decisions on suitable starting points 89 // for a match, cutting down the work a lot. Regmust permits fast rejection 90 // of lines that cannot possibly match. The regmust tests are costly enough 91 // that Compile() supplies a regmust only if the r.e. contains something 92 // potentially expensive (at present, the only such thing detected is * or + 93 // at the start of the r.e., which can involve a lot of backup). Regmlen is 94 // supplied because the test in RunMatcher() needs it and Compile() is 95 // computing it anyway. 96 // 97 // 98 // 99 // Structure for regexp "program". This is essentially a linear encoding 100 // of a nondeterministic finite-state machine (aka syntax charts or 101 // "railroad normal form" in parsing technology). Each node is an opcode 102 // plus a "next" pointer, possibly plus an operand. "Next" pointers of 103 // all nodes except kRegExpBranch implement concatenation; a "next" pointer 104 // with a kRegExpBranch on both ends of it is connecting two alternatives. 105 // (Here we have one of the subtle syntax dependencies: an individual 106 // kRegExpBranch (as opposed to a collection of them) is never concatenated 107 // with anything because of operator precedence.) The operand of some types 108 // of node is a literal string; for others, it is a node leading into a 109 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first 110 // node of the branch. (NB this is* not* a tree structure: the tail of the 111 // branch connects to the thing following the set of kRegExpBranches). 112 // The opcodes are: 113 // 114 115 // definition number opnd? meaning 116 enum { 117 kRegExpEnd = 0, // no End of program. 118 kRegExpBol = 1, // no Match "" at beginning of line. 119 kRegExpEol = 2, // no Match "" at end of line. 120 kRegExpAny = 3, // no Match any one character. 121 kRegExpAnyOf = 4, // str Match any character in this string. 122 kRegExpAnyBut = 5, // str Match any character not in this string. 123 kRegExpBranch = 6, // node Match this alternative, or the next... 124 kRegExpBack = 7, // no Match "", "next" ptr points backward. 125 kRegExpExactly = 8, // str Match this string. 126 kRegExpNothing = 9, // no Match empty string. 127 kRegExpStar = 10, // node Match this (simple) thing 0 or more times. 128 kRegExpPlus = 11, // node Match this (simple) thing 1 or more times. 129 kRegExpOpen = 20, // no Mark this point in input as start of #n. 130 // kRegExpOpen + 1 is number 1, etc. 131 kRegExpClose = 30 // no Analogous to kRegExpOpen. 132 }; 133 134 // 135 // Opcode notes: 136 // 137 // kRegExpBranch The set of branches constituting a single choice are 138 // hooked together with their "next" pointers, since precedence prevents 139 // anything being concatenated to any individual branch. The 140 // "next" pointer of the last kRegExpBranch in a choice points to the 141 // thing following the whole choice. This is also where the 142 // final "next" pointer of each individual branch points; each 143 // branch starts with the operand node of a kRegExpBranch node. 144 // 145 // kRegExpBack Normal "next" pointers all implicitly point forward; 146 // kRegExpBack exists to make loop structures possible. 147 // 148 // kRegExpStar,kRegExpPlus '?', and complex '*' and '+', are implemented as 149 // circular kRegExpBranch structures using kRegExpBack. Simple cases 150 // (one character per match) are implemented with kRegExpStar and 151 // kRegExpPlus for speed and to minimize recursive plunges. 152 // 153 // kRegExpOpen,kRegExpClose ...are numbered at compile time. 154 // 155 // 156 // 157 // A node is one char of opcode followed by two chars of "next" pointer. 158 // "Next" pointers are stored as two 8-bit pieces, high order first. The 159 // value is a positive offset from the opcode of the node containing it. 160 // An operand, if any, simply follows the node. (Note that much of the 161 // code generation knows about this implicit relationship.) 162 // 163 // Using two bytes for the "next" pointer is vast overkill for most things, 164 // but allows patterns to get big without disasters. 165 // 166 167 const char* kMeta = "^$.[()|?+*\\"; 168 const int32 kMaxSize = 32767L; 169 // Probably could be 65535L. 170 171 // Flags to be passed up and down: 172 enum { 173 kHasWidth = 01, // Known never to match null string. 174 kSimple = 02, // Simple enough to be kRegExpStar/kRegExpPlus operand. 175 kSPStart = 04, // Starts with * or +. 176 kWorst = 0 // Worst case. 177 }; 178 179 const char* kRegExpErrorStringArray[] = { 180 "Unmatched parenthesis.", 181 "Expression too long.", 182 "Too many parenthesis.", 183 "Junk on end.", 184 "*+? operand may be empty.", 185 "Nested *?+.", 186 "Invalid bracket range.", 187 "Unmatched brackets.", 188 "Internal error.", 189 "?+* follows nothing.", 190 "Trailing \\.", 191 "Corrupted expression.", 192 "Memory corruption.", 193 "Corrupted pointers.", 194 "Corrupted opcode." 195 }; 196 197 #ifdef DEBUG 198 int32 regnarrate = 0; 199 #endif 200 201 RegExp::RegExp() 202 : fError(B_OK), 203 fRegExp(NULL) 204 { 205 } 206 207 208 RegExp::RegExp(const char* pattern) 209 : fError(B_OK), 210 fRegExp(NULL) 211 { 212 fRegExp = Compile(pattern); 213 } 214 215 216 RegExp::RegExp(const BString &pattern) 217 : fError(B_OK), 218 fRegExp(NULL) 219 { 220 fRegExp = Compile(pattern.String()); 221 } 222 223 224 RegExp::~RegExp() 225 { 226 free(fRegExp); 227 } 228 229 230 status_t 231 RegExp::InitCheck() const 232 { 233 return fError; 234 } 235 236 237 status_t 238 RegExp::SetTo(const char* pattern) 239 { 240 fError = B_OK; 241 free(fRegExp); 242 fRegExp = Compile(pattern); 243 return fError; 244 } 245 246 247 status_t 248 RegExp::SetTo(const BString &pattern) 249 { 250 fError = B_OK; 251 free(fRegExp); 252 fRegExp = Compile(pattern.String()); 253 return fError; 254 } 255 256 257 bool 258 RegExp::Matches(const char* string) const 259 { 260 if (!fRegExp || !string) 261 return false; 262 263 return RunMatcher(fRegExp, string) == 1; 264 } 265 266 267 bool 268 RegExp::Matches(const BString &string) const 269 { 270 if (!fRegExp) 271 return false; 272 273 return RunMatcher(fRegExp, string.String()) == 1; 274 } 275 276 277 // 278 // - Compile - compile a regular expression into internal code 279 // 280 // We can't allocate space until we know how big the compiled form will be, 281 // but we can't compile it (and thus know how big it is) until we've got a 282 // place to put the code. So we cheat: we compile it twice, once with code 283 // generation turned off and size counting turned on, and once "for real". 284 // This also means that we don't allocate space until we are sure that the 285 // thing really will compile successfully, and we never have to move the 286 // code and thus invalidate pointers into it. (Note that it has to be in 287 // one piece because free() must be able to free it all.) 288 // 289 // Beware that the optimization-preparation code in here knows about some 290 // of the structure of the compiled regexp. 291 regexp* 292 RegExp::Compile(const char* exp) 293 { 294 regexp* r; 295 const char* scan; 296 const char* longest; 297 int32 len; 298 int32 flags; 299 300 if (exp == NULL) { 301 SetError(B_BAD_VALUE); 302 return NULL; 303 } 304 305 // First pass: determine size, legality. 306 fInputScanPointer = exp; 307 fParenthesisCount = 1; 308 fCodeSize = 0L; 309 fCodeEmitPointer = &fDummy; 310 Char(kRegExpMagic); 311 if (Reg(0, &flags) == NULL) 312 return NULL; 313 314 // Small enough for pointer-storage convention? 315 if (fCodeSize >= kMaxSize) { 316 SetError(REGEXP_TOO_BIG); 317 return NULL; 318 } 319 320 r = (regexp*)malloc(sizeof(regexp) + fCodeSize); 321 // Allocate space 322 323 if (!r) { 324 SetError(B_NO_MEMORY); 325 return NULL; 326 } 327 328 // Second pass: emit code. 329 fInputScanPointer = exp; 330 fParenthesisCount = 1; 331 fCodeEmitPointer = r->program; 332 Char(kRegExpMagic); 333 if (Reg(0, &flags) == NULL) { 334 free(r); 335 return NULL; 336 } 337 338 // Dig out information for optimizations. 339 r->regstart = '\0'; // Worst-case defaults. 340 r->reganch = 0; 341 r->regmust = NULL; 342 r->regmlen = 0; 343 scan = r->program + 1; 344 // First kRegExpBranch. 345 if (*Next((char*)scan) == kRegExpEnd) { 346 // Only one top-level choice. 347 scan = Operand(scan); 348 349 // Starting-point info. 350 if (*scan == kRegExpExactly) 351 r->regstart = *Operand(scan); 352 else if (*scan == kRegExpBol) 353 r->reganch++; 354 355 // If there's something expensive in the r.e., find the 356 // longest literal string that must appear and make it the 357 // regmust. Resolve ties in favor of later strings, since 358 // the regstart check works with the beginning of the r.e. 359 // and avoiding duplication strengthens checking. Not a 360 // strong reason, but sufficient in the absence of others. 361 if (flags&kSPStart) { 362 longest = NULL; 363 len = 0; 364 for (; scan != NULL; scan = Next((char*)scan)) 365 if (*scan == kRegExpExactly 366 && (int32)strlen(Operand(scan)) >= len) { 367 longest = Operand(scan); 368 len = (int32)strlen(Operand(scan)); 369 } 370 r->regmust = longest; 371 r->regmlen = len; 372 } 373 } 374 375 return r; 376 } 377 378 379 regexp* 380 RegExp::Expression() const 381 { 382 return fRegExp; 383 } 384 385 386 const char* 387 RegExp::ErrorString() const 388 { 389 if (fError >= REGEXP_UNMATCHED_PARENTHESIS 390 && fError <= REGEXP_CORRUPTED_OPCODE) 391 return kRegExpErrorStringArray[fError - B_ERRORS_END]; 392 393 return strerror(fError); 394 } 395 396 397 void 398 RegExp::SetError(status_t error) const 399 { 400 fError = error; 401 } 402 403 404 // 405 // - Reg - regular expression, i.e. main body or parenthesized thing 406 // 407 // Caller must absorb opening parenthesis. 408 // 409 // Combining parenthesis handling with the base level of regular expression 410 // is a trifle forced, but the need to tie the tails of the branches to what 411 // follows makes it hard to avoid. 412 // 413 char* 414 RegExp::Reg(int32 paren, int32* flagp) 415 { 416 char* ret; 417 char* br; 418 char* ender; 419 int32 parno = 0; 420 int32 flags; 421 422 *flagp = kHasWidth; // Tentatively. 423 424 // Make an kRegExpOpen node, if parenthesized. 425 if (paren) { 426 if (fParenthesisCount >= kSubExpressionMax) { 427 SetError(REGEXP_TOO_MANY_PARENTHESIS); 428 return NULL; 429 } 430 parno = fParenthesisCount; 431 fParenthesisCount++; 432 ret = Node((char)(kRegExpOpen + parno)); 433 } else 434 ret = NULL; 435 436 // Pick up the branches, linking them together. 437 br = Branch(&flags); 438 if (br == NULL) 439 return NULL; 440 if (ret != NULL) 441 Tail(ret, br); // kRegExpOpen -> first 442 else 443 ret = br; 444 if (!(flags & kHasWidth)) 445 *flagp &= ~kHasWidth; 446 *flagp |= flags&kSPStart; 447 while (*fInputScanPointer == '|') { 448 fInputScanPointer++; 449 br = Branch(&flags); 450 if (br == NULL) 451 return NULL; 452 Tail(ret, br); // kRegExpBranch -> kRegExpBranch. 453 if (!(flags & kHasWidth)) 454 *flagp &= ~kHasWidth; 455 *flagp |= flags&kSPStart; 456 } 457 458 // Make a closing node, and hook it on the end. 459 ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd); 460 Tail(ret, ender); 461 462 // Hook the tails of the branches to the closing node. 463 for (br = ret; br != NULL; br = Next(br)) 464 OpTail(br, ender); 465 466 // Check for proper termination. 467 if (paren && *fInputScanPointer++ != ')') { 468 SetError(REGEXP_UNMATCHED_PARENTHESIS); 469 return NULL; 470 } else if (!paren && *fInputScanPointer != '\0') { 471 if (*fInputScanPointer == ')') { 472 SetError(REGEXP_UNMATCHED_PARENTHESIS); 473 return NULL; 474 } else { 475 SetError(REGEXP_JUNK_ON_END); 476 return NULL; // "Can't happen". 477 } 478 // NOTREACHED 479 } 480 481 return ret; 482 } 483 484 485 // 486 // - Branch - one alternative of an | operator 487 // 488 // Implements the concatenation operator. 489 // 490 char* 491 RegExp::Branch(int32* flagp) 492 { 493 char* ret; 494 char* chain; 495 char* latest; 496 int32 flags; 497 498 *flagp = kWorst; // Tentatively. 499 500 ret = Node(kRegExpBranch); 501 chain = NULL; 502 while (*fInputScanPointer != '\0' 503 && *fInputScanPointer != '|' 504 && *fInputScanPointer != ')') { 505 latest = Piece(&flags); 506 if (latest == NULL) 507 return NULL; 508 *flagp |= flags & kHasWidth; 509 if (chain == NULL) // First piece. 510 *flagp |= flags & kSPStart; 511 else 512 Tail(chain, latest); 513 chain = latest; 514 } 515 if (chain == NULL) // Loop ran zero times. 516 Node(kRegExpNothing); 517 518 return ret; 519 } 520 521 522 // 523 // - Piece - something followed by possible [*+?] 524 // 525 // Note that the branching code sequences used for ? and the general cases 526 // of * and + are somewhat optimized: they use the same kRegExpNothing node 527 // as both the endmarker for their branch list and the body of the last 528 // branch. It might seem that this node could be dispensed with entirely, 529 // but the endmarker role is not redundant. 530 // 531 char* 532 RegExp::Piece(int32* flagp) 533 { 534 char* ret; 535 char op; 536 char* next; 537 int32 flags; 538 539 ret = Atom(&flags); 540 if (ret == NULL) 541 return NULL; 542 543 op = *fInputScanPointer; 544 if (!IsMult(op)) { 545 *flagp = flags; 546 return ret; 547 } 548 549 if (!(flags & kHasWidth) && op != '?') { 550 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY); 551 return NULL; 552 } 553 *flagp = op != '+' ? kWorst | kSPStart : kWorst | kHasWidth; 554 555 if (op == '*' && (flags & kSimple)) 556 Insert(kRegExpStar, ret); 557 else if (op == '*') { 558 // Emit x* as (x&|), where & means "self". 559 Insert(kRegExpBranch, ret); // Either x 560 OpTail(ret, Node(kRegExpBack)); // and loop 561 OpTail(ret, ret); // back 562 Tail(ret, Node(kRegExpBranch)); // or 563 Tail(ret, Node(kRegExpNothing)); // null. 564 } else if (op == '+' && (flags & kSimple)) 565 Insert(kRegExpPlus, ret); 566 else if (op == '+') { 567 // Emit x+ as x(&|), where & means "self". 568 next = Node(kRegExpBranch); // Either 569 Tail(ret, next); 570 Tail(Node(kRegExpBack), ret); // loop back 571 Tail(next, Node(kRegExpBranch)); // or 572 Tail(ret, Node(kRegExpNothing)); // null. 573 } else if (op == '?') { 574 // Emit x? as (x|) 575 Insert(kRegExpBranch, ret); // Either x 576 Tail(ret, Node(kRegExpBranch)); // or 577 next = Node(kRegExpNothing); // null. 578 Tail(ret, next); 579 OpTail(ret, next); 580 } 581 fInputScanPointer++; 582 if (IsMult(*fInputScanPointer)) { 583 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS); 584 return NULL; 585 } 586 return ret; 587 } 588 589 590 // 591 // - Atom - the lowest level 592 // 593 // Optimization: gobbles an entire sequence of ordinary characters so that 594 // it can turn them into a single node, which is smaller to store and 595 // faster to run. Backslashed characters are exceptions, each becoming a 596 // separate node; the code is simpler that way and it's not worth fixing. 597 // 598 char* 599 RegExp::Atom(int32* flagp) 600 { 601 char* ret; 602 int32 flags; 603 604 *flagp = kWorst; // Tentatively. 605 606 switch (*fInputScanPointer++) { 607 case '^': 608 ret = Node(kRegExpBol); 609 break; 610 case '$': 611 ret = Node(kRegExpEol); 612 break; 613 case '.': 614 ret = Node(kRegExpAny); 615 *flagp |= kHasWidth|kSimple; 616 break; 617 case '[': 618 { 619 int32 cclass; 620 int32 classend; 621 622 if (*fInputScanPointer == '^') { // Complement of range. 623 ret = Node(kRegExpAnyBut); 624 fInputScanPointer++; 625 } else 626 ret = Node(kRegExpAnyOf); 627 if (*fInputScanPointer == ']' || *fInputScanPointer == '-') 628 Char(*fInputScanPointer++); 629 while (*fInputScanPointer != '\0' 630 && *fInputScanPointer != ']') { 631 if (*fInputScanPointer == '-') { 632 fInputScanPointer++; 633 if (*fInputScanPointer == ']' 634 || *fInputScanPointer == '\0') { 635 Char('-'); 636 } else { 637 cclass = UCharAt(fInputScanPointer - 2) + 1; 638 classend = UCharAt(fInputScanPointer); 639 if (cclass > classend + 1) { 640 SetError(REGEXP_INVALID_BRACKET_RANGE); 641 return NULL; 642 } 643 for (; cclass <= classend; cclass++) 644 Char((char)cclass); 645 fInputScanPointer++; 646 } 647 } else 648 Char(*fInputScanPointer++); 649 } 650 Char('\0'); 651 if (*fInputScanPointer != ']') { 652 SetError(REGEXP_UNMATCHED_BRACKET); 653 return NULL; 654 } 655 fInputScanPointer++; 656 *flagp |= kHasWidth | kSimple; 657 } 658 break; 659 case '(': 660 ret = Reg(1, &flags); 661 if (ret == NULL) 662 return NULL; 663 *flagp |= flags & (kHasWidth | kSPStart); 664 break; 665 case '\0': 666 case '|': 667 case ')': 668 SetError(REGEXP_INTERNAL_ERROR); 669 return NULL; // Supposed to be caught earlier. 670 case '?': 671 case '+': 672 case '*': 673 SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING); 674 return NULL; 675 case '\\': 676 if (*fInputScanPointer == '\0') { 677 SetError(REGEXP_TRAILING_BACKSLASH); 678 return NULL; 679 } 680 ret = Node(kRegExpExactly); 681 Char(*fInputScanPointer++); 682 Char('\0'); 683 *flagp |= kHasWidth|kSimple; 684 break; 685 default: 686 { 687 int32 len; 688 char ender; 689 690 fInputScanPointer--; 691 len = (int32)strcspn(fInputScanPointer, kMeta); 692 if (len <= 0) { 693 SetError(REGEXP_INTERNAL_ERROR); 694 return NULL; 695 } 696 697 ender = *(fInputScanPointer + len); 698 if (len > 1 && IsMult(ender)) 699 len--; // Back off clear of ?+* operand. 700 701 *flagp |= kHasWidth; 702 if (len == 1) 703 *flagp |= kSimple; 704 705 ret = Node(kRegExpExactly); 706 while (len > 0) { 707 Char(*fInputScanPointer++); 708 len--; 709 } 710 711 Char('\0'); 712 break; 713 } 714 } 715 716 return ret; 717 } 718 719 720 // 721 // - Node - emit a node 722 // 723 char* // Location. 724 RegExp::Node(char op) 725 { 726 char* ret; 727 char* ptr; 728 729 ret = fCodeEmitPointer; 730 if (ret == &fDummy) { 731 fCodeSize += 3; 732 return ret; 733 } 734 735 ptr = ret; 736 *ptr++ = op; 737 *ptr++ = '\0'; // Null "next" pointer. 738 *ptr++ = '\0'; 739 fCodeEmitPointer = ptr; 740 741 return ret; 742 } 743 744 745 // 746 // - Char - emit (if appropriate) a byte of code 747 // 748 void 749 RegExp::Char(char b) 750 { 751 if (fCodeEmitPointer != &fDummy) 752 *fCodeEmitPointer++ = b; 753 else 754 fCodeSize++; 755 } 756 757 758 // 759 // - Insert - insert an operator in front of already-emitted operand 760 // 761 // Means relocating the operand. 762 // 763 void 764 RegExp::Insert(char op, char* opnd) 765 { 766 char* src; 767 char* dst; 768 char* place; 769 770 if (fCodeEmitPointer == &fDummy) { 771 fCodeSize += 3; 772 return; 773 } 774 775 src = fCodeEmitPointer; 776 fCodeEmitPointer += 3; 777 dst = fCodeEmitPointer; 778 while (src > opnd) 779 *--dst = *--src; 780 781 place = opnd; // Op node, where operand used to be. 782 *place++ = op; 783 *place++ = '\0'; 784 *place++ = '\0'; 785 } 786 787 788 // 789 // - Tail - set the next-pointer at the end of a node chain 790 // 791 void 792 RegExp::Tail(char* p, char* val) 793 { 794 char* scan; 795 char* temp; 796 int32 offset; 797 798 if (p == &fDummy) 799 return; 800 801 // Find last node. 802 scan = p; 803 for (;;) { 804 temp = Next(scan); 805 if (temp == NULL) 806 break; 807 scan = temp; 808 } 809 810 if (scan[0] == kRegExpBack) 811 offset = scan - val; 812 else 813 offset = val - scan; 814 815 scan[1] = (char)((offset >> 8) & 0377); 816 scan[2] = (char)(offset & 0377); 817 } 818 819 820 // 821 // - OpTail - Tail on operand of first argument; nop if operandless 822 // 823 void 824 RegExp::OpTail(char* p, char* val) 825 { 826 // "Operandless" and "op != kRegExpBranch" are synonymous in practice. 827 if (p == NULL || p == &fDummy || *p != kRegExpBranch) 828 return; 829 Tail(Operand(p), val); 830 } 831 832 // 833 // RunMatcher and friends 834 // 835 836 837 // 838 // - RunMatcher - match a regexp against a string 839 // 840 int32 841 RegExp::RunMatcher(regexp* prog, const char* string) const 842 { 843 const char* s; 844 845 // Be paranoid... 846 if (prog == NULL || string == NULL) { 847 SetError(B_BAD_VALUE); 848 return 0; 849 } 850 851 // Check validity of program. 852 if (UCharAt(prog->program) != kRegExpMagic) { 853 SetError(REGEXP_CORRUPTED_PROGRAM); 854 return 0; 855 } 856 857 // If there is a "must appear" string, look for it. 858 if (prog->regmust != NULL) { 859 s = string; 860 while ((s = strchr(s, prog->regmust[0])) != NULL) { 861 if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) 862 break; // Found it. 863 s++; 864 } 865 if (s == NULL) // Not present. 866 return 0; 867 } 868 869 // Mark beginning of line for ^ . 870 fRegBol = string; 871 872 // Simplest case: anchored match need be tried only once. 873 if (prog->reganch) 874 return Try(prog, (char*)string); 875 876 // Messy cases: unanchored match. 877 s = string; 878 if (prog->regstart != '\0') 879 // We know what char it must start with. 880 while ((s = strchr(s, prog->regstart)) != NULL) { 881 if (Try(prog, (char*)s)) 882 return 1; 883 s++; 884 } 885 else 886 // We don't -- general case. 887 do { 888 if (Try(prog, (char*)s)) 889 return 1; 890 } while (*s++ != '\0'); 891 892 // Failure. 893 return 0; 894 } 895 896 897 // 898 // - Try - try match at specific point 899 // 900 int32 // 0 failure, 1 success 901 RegExp::Try(regexp* prog, const char* string) const 902 { 903 int32 i; 904 const char **sp; 905 const char **ep; 906 907 fStringInputPointer = string; 908 fStartPArrayPointer = prog->startp; 909 fEndPArrayPointer = prog->endp; 910 911 sp = prog->startp; 912 ep = prog->endp; 913 for (i = kSubExpressionMax; i > 0; i--) { 914 *sp++ = NULL; 915 *ep++ = NULL; 916 } 917 if (Match(prog->program + 1)) { 918 prog->startp[0] = string; 919 prog->endp[0] = fStringInputPointer; 920 return 1; 921 } else 922 return 0; 923 } 924 925 926 // 927 // - Match - main matching routine 928 // 929 // Conceptually the strategy is simple: check to see whether the current 930 // node matches, call self recursively to see whether the rest matches, 931 // and then act accordingly. In practice we make some effort to avoid 932 // recursion, in particular by going through "ordinary" nodes (that don't 933 // need to know whether the rest of the match failed) by a loop instead of 934 // by recursion. 935 /// 936 int32 // 0 failure, 1 success 937 RegExp::Match(const char* prog) const 938 { 939 const char* scan; // Current node. 940 const char* next; // Next node. 941 942 scan = prog; 943 #ifdef DEBUG 944 if (scan != NULL && regnarrate) 945 fprintf(stderr, "%s(\n", Prop(scan)); 946 #endif 947 while (scan != NULL) { 948 #ifdef DEBUG 949 if (regnarrate) 950 fprintf(stderr, "%s...\n", Prop(scan)); 951 #endif 952 next = Next(scan); 953 954 switch (*scan) { 955 case kRegExpBol: 956 if (fStringInputPointer != fRegBol) 957 return 0; 958 break; 959 case kRegExpEol: 960 if (*fStringInputPointer != '\0') 961 return 0; 962 break; 963 case kRegExpAny: 964 if (*fStringInputPointer == '\0') 965 return 0; 966 fStringInputPointer++; 967 break; 968 case kRegExpExactly: 969 { 970 const char* opnd = Operand(scan); 971 // Inline the first character, for speed. 972 if (*opnd != *fStringInputPointer) 973 return 0; 974 975 uint32 len = strlen(opnd); 976 if (len > 1 977 && strncmp(opnd, fStringInputPointer, len) != 0) { 978 return 0; 979 } 980 981 fStringInputPointer += len; 982 } 983 break; 984 case kRegExpAnyOf: 985 if (*fStringInputPointer == '\0' 986 || strchr(Operand(scan), *fStringInputPointer) == NULL) 987 return 0; 988 fStringInputPointer++; 989 break; 990 case kRegExpAnyBut: 991 if (*fStringInputPointer == '\0' 992 || strchr(Operand(scan), *fStringInputPointer) != NULL) 993 return 0; 994 fStringInputPointer++; 995 break; 996 case kRegExpNothing: 997 break; 998 case kRegExpBack: 999 break; 1000 case kRegExpOpen + 1: 1001 case kRegExpOpen + 2: 1002 case kRegExpOpen + 3: 1003 case kRegExpOpen + 4: 1004 case kRegExpOpen + 5: 1005 case kRegExpOpen + 6: 1006 case kRegExpOpen + 7: 1007 case kRegExpOpen + 8: 1008 case kRegExpOpen + 9: 1009 { 1010 int32 no; 1011 const char* save; 1012 1013 no = *scan - kRegExpOpen; 1014 save = fStringInputPointer; 1015 1016 if (Match(next)) { 1017 // 1018 // Don't set startp if some later 1019 // invocation of the same parentheses 1020 // already has. 1021 // 1022 if (fStartPArrayPointer[no] == NULL) 1023 fStartPArrayPointer[no] = save; 1024 return 1; 1025 } else 1026 return 0; 1027 } 1028 break; 1029 case kRegExpClose + 1: 1030 case kRegExpClose + 2: 1031 case kRegExpClose + 3: 1032 case kRegExpClose + 4: 1033 case kRegExpClose + 5: 1034 case kRegExpClose + 6: 1035 case kRegExpClose + 7: 1036 case kRegExpClose + 8: 1037 case kRegExpClose + 9: 1038 { 1039 int32 no; 1040 const char* save; 1041 1042 no = *scan - kRegExpClose; 1043 save = fStringInputPointer; 1044 1045 if (Match(next)) { 1046 // 1047 // Don't set endp if some later 1048 // invocation of the same parentheses 1049 // already has. 1050 // 1051 if (fEndPArrayPointer[no] == NULL) 1052 fEndPArrayPointer[no] = save; 1053 return 1; 1054 } else 1055 return 0; 1056 } 1057 break; 1058 case kRegExpBranch: 1059 { 1060 const char* save; 1061 1062 if (*next != kRegExpBranch) // No choice. 1063 next = Operand(scan); // Avoid recursion. 1064 else { 1065 do { 1066 save = fStringInputPointer; 1067 if (Match(Operand(scan))) 1068 return 1; 1069 fStringInputPointer = save; 1070 scan = Next(scan); 1071 } while (scan != NULL && *scan == kRegExpBranch); 1072 return 0; 1073 // NOTREACHED/ 1074 } 1075 } 1076 break; 1077 case kRegExpStar: 1078 case kRegExpPlus: 1079 { 1080 char nextch; 1081 int32 no; 1082 const char* save; 1083 int32 min; 1084 1085 // 1086 //Lookahead to avoid useless match attempts 1087 // when we know what character comes next. 1088 // 1089 nextch = '\0'; 1090 if (*next == kRegExpExactly) 1091 nextch = *Operand(next); 1092 min = (*scan == kRegExpStar) ? 0 : 1; 1093 save = fStringInputPointer; 1094 no = Repeat(Operand(scan)); 1095 while (no >= min) { 1096 // If it could work, try it. 1097 if (nextch == '\0' || *fStringInputPointer == nextch) 1098 if (Match(next)) 1099 return 1; 1100 // Couldn't or didn't -- back up. 1101 no--; 1102 fStringInputPointer = save + no; 1103 } 1104 return 0; 1105 } 1106 break; 1107 case kRegExpEnd: 1108 return 1; // Success! 1109 1110 default: 1111 SetError(REGEXP_MEMORY_CORRUPTION); 1112 return 0; 1113 } 1114 1115 scan = next; 1116 } 1117 1118 // 1119 // We get here only if there's trouble -- normally "case kRegExpEnd" is 1120 // the terminating point. 1121 // 1122 SetError(REGEXP_CORRUPTED_POINTERS); 1123 return 0; 1124 } 1125 1126 1127 // 1128 // - Repeat - repeatedly match something simple, report how many 1129 // 1130 int32 1131 RegExp::Repeat(const char* p) const 1132 { 1133 int32 count = 0; 1134 const char* scan; 1135 const char* opnd; 1136 1137 scan = fStringInputPointer; 1138 opnd = Operand(p); 1139 switch (*p) { 1140 case kRegExpAny: 1141 count = (int32)strlen(scan); 1142 scan += count; 1143 break; 1144 1145 case kRegExpExactly: 1146 while (*opnd == *scan) { 1147 count++; 1148 scan++; 1149 } 1150 break; 1151 1152 case kRegExpAnyOf: 1153 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1154 count++; 1155 scan++; 1156 } 1157 break; 1158 1159 case kRegExpAnyBut: 1160 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1161 count++; 1162 scan++; 1163 } 1164 break; 1165 1166 default: // Oh dear. Called inappropriately. 1167 SetError(REGEXP_INTERNAL_ERROR); 1168 count = 0; // Best compromise. 1169 break; 1170 } 1171 fStringInputPointer = scan; 1172 1173 return count; 1174 } 1175 1176 1177 // 1178 // - Next - dig the "next" pointer out of a node 1179 // 1180 char* 1181 RegExp::Next(char* p) 1182 { 1183 int32 offset; 1184 1185 if (p == &fDummy) 1186 return NULL; 1187 1188 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1189 if (offset == 0) 1190 return NULL; 1191 1192 if (*p == kRegExpBack) 1193 return p - offset; 1194 else 1195 return p + offset; 1196 } 1197 1198 1199 const char* 1200 RegExp::Next(const char* p) const 1201 { 1202 int32 offset; 1203 1204 if (p == &fDummy) 1205 return NULL; 1206 1207 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1208 if (offset == 0) 1209 return NULL; 1210 1211 if (*p == kRegExpBack) 1212 return p - offset; 1213 else 1214 return p + offset; 1215 } 1216 1217 1218 inline int32 1219 RegExp::UCharAt(const char* p) const 1220 { 1221 return (int32)*(unsigned char *)p; 1222 } 1223 1224 1225 inline char* 1226 RegExp::Operand(char* p) const 1227 { 1228 return p + 3; 1229 } 1230 1231 1232 inline const char* 1233 RegExp::Operand(const char* p) const 1234 { 1235 return p + 3; 1236 } 1237 1238 1239 inline bool 1240 RegExp::IsMult(char c) const 1241 { 1242 return c == '*' || c == '+' || c == '?'; 1243 } 1244 1245 1246 #ifdef DEBUG 1247 1248 1249 // 1250 // - Dump - dump a regexp onto stdout in vaguely comprehensible form 1251 // 1252 void 1253 RegExp::Dump() 1254 { 1255 const char* s; 1256 char op = kRegExpExactly; // Arbitrary non-kRegExpEnd op. 1257 const char* next; 1258 1259 s = fRegExp->program + 1; 1260 while (op != kRegExpEnd) { // While that wasn't kRegExpEnd last time... 1261 op = *s; 1262 printf("%2ld%s", s - fRegExp->program, Prop(s)); // Where, what. 1263 next = Next(s); 1264 if (next == NULL) // Next ptr. 1265 printf("(0)"); 1266 else 1267 printf("(%ld)", (s - fRegExp->program) + (next - s)); 1268 s += 3; 1269 if (op == kRegExpAnyOf || op == kRegExpAnyBut 1270 || op == kRegExpExactly) { 1271 // Literal string, where present. 1272 while (*s != '\0') { 1273 putchar(*s); 1274 s++; 1275 } 1276 s++; 1277 } 1278 putchar('\n'); 1279 } 1280 1281 // Header fields of interest. 1282 if (fRegExp->regstart != '\0') 1283 printf("start `%c' ", fRegExp->regstart); 1284 if (fRegExp->reganch) 1285 printf("anchored "); 1286 if (fRegExp->regmust != NULL) 1287 printf("must have \"%s\"", fRegExp->regmust); 1288 printf("\n"); 1289 } 1290 1291 1292 // 1293 // - Prop - printable representation of opcode 1294 // 1295 char* 1296 RegExp::Prop(const char* op) const 1297 { 1298 const char* p = NULL; 1299 static char buf[50]; 1300 1301 (void) strcpy(buf, ":"); 1302 1303 switch (*op) { 1304 case kRegExpBol: 1305 p = "kRegExpBol"; 1306 break; 1307 case kRegExpEol: 1308 p = "kRegExpEol"; 1309 break; 1310 case kRegExpAny: 1311 p = "kRegExpAny"; 1312 break; 1313 case kRegExpAnyOf: 1314 p = "kRegExpAnyOf"; 1315 break; 1316 case kRegExpAnyBut: 1317 p = "kRegExpAnyBut"; 1318 break; 1319 case kRegExpBranch: 1320 p = "kRegExpBranch"; 1321 break; 1322 case kRegExpExactly: 1323 p = "kRegExpExactly"; 1324 break; 1325 case kRegExpNothing: 1326 p = "kRegExpNothing"; 1327 break; 1328 case kRegExpBack: 1329 p = "kRegExpBack"; 1330 break; 1331 case kRegExpEnd: 1332 p = "kRegExpEnd"; 1333 break; 1334 case kRegExpOpen + 1: 1335 case kRegExpOpen + 2: 1336 case kRegExpOpen + 3: 1337 case kRegExpOpen + 4: 1338 case kRegExpOpen + 5: 1339 case kRegExpOpen + 6: 1340 case kRegExpOpen + 7: 1341 case kRegExpOpen + 8: 1342 case kRegExpOpen + 9: 1343 sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen); 1344 p = NULL; 1345 break; 1346 case kRegExpClose + 1: 1347 case kRegExpClose + 2: 1348 case kRegExpClose + 3: 1349 case kRegExpClose + 4: 1350 case kRegExpClose + 5: 1351 case kRegExpClose + 6: 1352 case kRegExpClose + 7: 1353 case kRegExpClose + 8: 1354 case kRegExpClose + 9: 1355 sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose); 1356 p = NULL; 1357 break; 1358 case kRegExpStar: 1359 p = "kRegExpStar"; 1360 break; 1361 case kRegExpPlus: 1362 p = "kRegExpPlus"; 1363 break; 1364 default: 1365 RegExpError("corrupted opcode"); 1366 break; 1367 } 1368 1369 if (p != NULL) 1370 strcat(buf, p); 1371 1372 return buf; 1373 } 1374 1375 1376 void 1377 RegExp::RegExpError(const char*) const 1378 { 1379 // does nothing now, perhaps it should printf? 1380 } 1381 1382 #endif 1383