1 /* 2 Open Tracker License 3 4 Terms and Conditions 5 6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved. 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy of 9 this software and associated documentation files (the "Software"), to deal in 10 the Software without restriction, including without limitation the rights to 11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 of the Software, and to permit persons to whom the Software is furnished to do 13 so, subject to the following conditions: 14 15 The above copyright notice and this permission notice applies to all licensees 16 and shall be included in all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN 22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION 23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 25 Except as contained in this notice, the name of Be Incorporated shall not be 26 used in advertising or otherwise to promote the sale, use or other dealings in 27 this Software without prior written authorization from Be Incorporated. 28 29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks 30 of Be Incorporated in the United States and other countries. Other brand product 31 names are registered trademarks or trademarks of their respective holders. 32 All rights reserved. 33 */ 34 35 // This code is based on regexp.c, v.1.3 by Henry Spencer: 36 37 // @(#)regexp.c 1.3 of 18 April 87 38 // 39 // Copyright (c) 1986 by University of Toronto. 40 // Written by Henry Spencer. Not derived from licensed software. 41 // 42 // Permission is granted to anyone to use this software for any 43 // purpose on any computer system, and to redistribute it freely, 44 // subject to the following restrictions: 45 // 46 // 1. The author is not responsible for the consequences of use of 47 // this software, no matter how awful, even if they arise 48 // from defects in it. 49 // 50 // 2. The origin of this software must not be misrepresented, either 51 // by explicit claim or by omission. 52 // 53 // 3. Altered versions must be plainly marked as such, and must not 54 // be misrepresented as being the original software. 55 // 56 // Beware that some of this code is subtly aware of the way operator 57 // precedence is structured in regular expressions. Serious changes in 58 // regular-expression syntax might require a total rethink. 59 // 60 61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker 62 // project (www.opentracker.org), Jul 11, 2000. 63 64 #include <stdlib.h> 65 #include <stdio.h> 66 #include <string.h> 67 68 #include <Errors.h> 69 70 #include "RegExp.h" 71 72 // The first byte of the regexp internal "program" is actually this magic 73 // number; the start node begins in the second byte. 74 75 const uint8 kRegExpMagic = 0234; 76 77 // The "internal use only" fields in RegExp.h are present to pass info from 78 // compile to execute that permits the execute phase to run lots faster on 79 // simple cases. They are: 80 // 81 // regstart char that must begin a match; '\0' if none obvious 82 // reganch is the match anchored (at beginning-of-line only)? 83 // regmust string (pointer into program) that match must include, or NULL 84 // regmlen length of regmust string 85 // 86 // Regstart and reganch permit very fast decisions on suitable starting points 87 // for a match, cutting down the work a lot. Regmust permits fast rejection 88 // of lines that cannot possibly match. The regmust tests are costly enough 89 // that Compile() supplies a regmust only if the r.e. contains something 90 // potentially expensive (at present, the only such thing detected is * or + 91 // at the start of the r.e., which can involve a lot of backup). Regmlen is 92 // supplied because the test in RunMatcher() needs it and Compile() is computing 93 // it anyway. 94 // 95 // 96 // 97 // Structure for regexp "program". This is essentially a linear encoding 98 // of a nondeterministic finite-state machine (aka syntax charts or 99 // "railroad normal form" in parsing technology). Each node is an opcode 100 // plus a "next" pointer, possibly plus an operand. "Next" pointers of 101 // all nodes except kRegExpBranch implement concatenation; a "next" pointer with 102 // a kRegExpBranch on both ends of it is connecting two alternatives. (Here we 103 // have one of the subtle syntax dependencies: an individual kRegExpBranch (as 104 // opposed to a collection of them) is never concatenated with anything 105 // because of operator precedence.) The operand of some types of node is 106 // a literal string; for others, it is a node leading into a sub-FSM. In 107 // particular, the operand of a kRegExpBranch node is the first node of the branch. 108 // (NB this is *not* a tree structure: the tail of the branch connects 109 // to the thing following the set of kRegExpBranches.) The opcodes are: 110 // 111 112 // definition number opnd? meaning 113 enum { 114 kRegExpEnd = 0, // no End of program. 115 kRegExpBol = 1, // no Match "" at beginning of line. 116 kRegExpEol = 2, // no Match "" at end of line. 117 kRegExpAny = 3, // no Match any one character. 118 kRegExpAnyOf = 4, // str Match any character in this string. 119 kRegExpAnyBut = 5, // str Match any character not in this string. 120 kRegExpBranch = 6, // node Match this alternative, or the next... 121 kRegExpBack = 7, // no Match "", "next" ptr points backward. 122 kRegExpExactly = 8, // str Match this string. 123 kRegExpNothing = 9, // no Match empty string. 124 kRegExpStar = 10, // node Match this (simple) thing 0 or more times. 125 kRegExpPlus = 11, // node Match this (simple) thing 1 or more times. 126 kRegExpOpen = 20, // no Mark this point in input as start of #n. 127 // kRegExpOpen + 1 is number 1, etc. 128 kRegExpClose = 30 // no Analogous to kRegExpOpen. 129 }; 130 131 // 132 // Opcode notes: 133 // 134 // kRegExpBranch The set of branches constituting a single choice are hooked 135 // together with their "next" pointers, since precedence prevents 136 // anything being concatenated to any individual branch. The 137 // "next" pointer of the last kRegExpBranch in a choice points to the 138 // thing following the whole choice. This is also where the 139 // final "next" pointer of each individual branch points; each 140 // branch starts with the operand node of a kRegExpBranch node. 141 // 142 // kRegExpBack Normal "next" pointers all implicitly point forward; kRegExpBack 143 // exists to make loop structures possible. 144 // 145 // kRegExpStar,kRegExpPlus '?', and complex '*' and '+', are implemented as circular 146 // kRegExpBranch structures using kRegExpBack. Simple cases (one character 147 // per match) are implemented with kRegExpStar and kRegExpPlus for speed 148 // and to minimize recursive plunges. 149 // 150 // kRegExpOpen,kRegExpClose ...are numbered at compile time. 151 // 152 // 153 // 154 // A node is one char of opcode followed by two chars of "next" pointer. 155 // "Next" pointers are stored as two 8-bit pieces, high order first. The 156 // value is a positive offset from the opcode of the node containing it. 157 // An operand, if any, simply follows the node. (Note that much of the 158 // code generation knows about this implicit relationship.) 159 // 160 // Using two bytes for the "next" pointer is vast overkill for most things, 161 // but allows patterns to get big without disasters. 162 // 163 164 const char *kMeta = "^$.[()|?+*\\"; 165 const int32 kMaxSize = 32767L; // Probably could be 65535L. 166 167 // Flags to be passed up and down: 168 enum { 169 kHasWidth = 01, // Known never to match null string. 170 kSimple = 02, // Simple enough to be kRegExpStar/kRegExpPlus operand. 171 kSPStart = 04, // Starts with * or +. 172 kWorst = 0 // Worst case. 173 }; 174 175 const char *kRegExpErrorStringArray[] = { 176 "Unmatched parenthesis.", 177 "Expression too long.", 178 "Too many parenthesis.", 179 "Junk on end.", 180 "*+? operand may be empty.", 181 "Nested *?+.", 182 "Invalid bracket range.", 183 "Unmatched brackets.", 184 "Internal error.", 185 "?+* follows nothing.", 186 "Trailing \\.", 187 "Corrupted expression.", 188 "Memory corruption.", 189 "Corrupted pointers.", 190 "Corrupted opcode." 191 }; 192 193 #ifdef DEBUG 194 int32 regnarrate = 0; 195 #endif 196 197 RegExp::RegExp() 198 : fError(B_OK), 199 fRegExp(NULL) 200 { 201 } 202 203 RegExp::RegExp(const char *pattern) 204 : fError(B_OK), 205 fRegExp(NULL) 206 { 207 fRegExp = Compile(pattern); 208 } 209 210 RegExp::RegExp(const BString &pattern) 211 : fError(B_OK), 212 fRegExp(NULL) 213 { 214 fRegExp = Compile(pattern.String()); 215 } 216 217 RegExp::~RegExp() 218 { 219 free(fRegExp); 220 } 221 222 223 224 status_t 225 RegExp::InitCheck() const 226 { 227 return fError; 228 } 229 230 status_t 231 RegExp::SetTo(const char *pattern) 232 { 233 fError = B_OK; 234 free(fRegExp); 235 fRegExp = Compile(pattern); 236 return fError; 237 } 238 239 status_t 240 RegExp::SetTo(const BString &pattern) 241 { 242 fError = B_OK; 243 free(fRegExp); 244 fRegExp = Compile(pattern.String()); 245 return fError; 246 } 247 248 bool 249 RegExp::Matches(const char *string) const 250 { 251 if (!fRegExp || !string) 252 return false; 253 254 return RunMatcher(fRegExp, string) == 1; 255 } 256 257 bool 258 RegExp::Matches(const BString &string) const 259 { 260 if (!fRegExp) 261 return false; 262 263 return RunMatcher(fRegExp, string.String()) == 1; 264 } 265 266 267 // 268 // - Compile - compile a regular expression into internal code 269 // 270 // We can't allocate space until we know how big the compiled form will be, 271 // but we can't compile it (and thus know how big it is) until we've got a 272 // place to put the code. So we cheat: we compile it twice, once with code 273 // generation turned off and size counting turned on, and once "for real". 274 // This also means that we don't allocate space until we are sure that the 275 // thing really will compile successfully, and we never have to move the 276 // code and thus invalidate pointers into it. (Note that it has to be in 277 // one piece because free() must be able to free it all.) 278 // 279 // Beware that the optimization-preparation code in here knows about some 280 // of the structure of the compiled regexp. 281 282 regexp * 283 RegExp::Compile(const char *exp) 284 { 285 regexp *r; 286 const char *scan; 287 const char *longest; 288 int32 len; 289 int32 flags; 290 291 if (exp == NULL) { 292 SetError(B_BAD_VALUE); 293 return NULL; 294 } 295 296 // First pass: determine size, legality. 297 fInputScanPointer = exp; 298 fParenthesisCount = 1; 299 fCodeSize = 0L; 300 fCodeEmitPointer = &fDummy; 301 Char(kRegExpMagic); 302 if (Reg(0, &flags) == NULL) 303 return NULL; 304 305 // Small enough for pointer-storage convention? 306 if (fCodeSize >= kMaxSize) { 307 SetError(REGEXP_TOO_BIG); 308 return NULL; 309 } 310 311 // Allocate space. 312 r = (regexp *)malloc(sizeof(regexp) + fCodeSize); 313 314 if (!r) { 315 SetError(B_NO_MEMORY); 316 return NULL; 317 } 318 319 // Second pass: emit code. 320 fInputScanPointer = exp; 321 fParenthesisCount = 1; 322 fCodeEmitPointer = r->program; 323 Char(kRegExpMagic); 324 if (Reg(0, &flags) == NULL) { 325 free(r); 326 return NULL; 327 } 328 329 // Dig out information for optimizations. 330 r->regstart = '\0'; // Worst-case defaults. 331 r->reganch = 0; 332 r->regmust = NULL; 333 r->regmlen = 0; 334 scan = r->program + 1; // First kRegExpBranch. 335 if (*Next((char *)scan) == kRegExpEnd) { // Only one top-level choice. 336 scan = Operand(scan); 337 338 // Starting-point info. 339 if (*scan == kRegExpExactly) 340 r->regstart = *Operand(scan); 341 else if (*scan == kRegExpBol) 342 r->reganch++; 343 344 // 345 // If there's something expensive in the r.e., find the 346 // longest literal string that must appear and make it the 347 // regmust. Resolve ties in favor of later strings, since 348 // the regstart check works with the beginning of the r.e. 349 // and avoiding duplication strengthens checking. Not a 350 // strong reason, but sufficient in the absence of others. 351 // 352 if (flags&kSPStart) { 353 longest = NULL; 354 len = 0; 355 for (; scan != NULL; scan = Next((char *)scan)) 356 if (*scan == kRegExpExactly && (int32)strlen(Operand(scan)) >= len) { 357 longest = Operand(scan); 358 len = (int32)strlen(Operand(scan)); 359 } 360 r->regmust = longest; 361 r->regmlen = len; 362 } 363 } 364 365 return r; 366 } 367 368 regexp * 369 RegExp::Expression() const 370 { 371 return fRegExp; 372 } 373 374 const char * 375 RegExp::ErrorString() const 376 { 377 if (fError >= REGEXP_UNMATCHED_PARENTHESIS 378 && fError <= REGEXP_CORRUPTED_OPCODE) 379 return kRegExpErrorStringArray[fError - B_ERRORS_END]; 380 381 return strerror(fError); 382 } 383 384 385 void 386 RegExp::SetError(status_t error) const 387 { 388 fError = error; 389 } 390 391 392 // 393 // - Reg - regular expression, i.e. main body or parenthesized thing 394 // 395 // Caller must absorb opening parenthesis. 396 // 397 // Combining parenthesis handling with the base level of regular expression 398 // is a trifle forced, but the need to tie the tails of the branches to what 399 // follows makes it hard to avoid. 400 // 401 char * 402 RegExp::Reg(int32 paren, int32 *flagp) 403 { 404 char *ret; 405 char *br; 406 char *ender; 407 int32 parno = 0; 408 int32 flags; 409 410 *flagp = kHasWidth; // Tentatively. 411 412 // Make an kRegExpOpen node, if parenthesized. 413 if (paren) { 414 if (fParenthesisCount >= kSubExpressionMax) { 415 SetError(REGEXP_TOO_MANY_PARENTHESIS); 416 return NULL; 417 } 418 parno = fParenthesisCount; 419 fParenthesisCount++; 420 ret = Node((char)(kRegExpOpen + parno)); 421 } else 422 ret = NULL; 423 424 // Pick up the branches, linking them together. 425 br = Branch(&flags); 426 if (br == NULL) 427 return NULL; 428 if (ret != NULL) 429 Tail(ret, br); // kRegExpOpen -> first 430 else 431 ret = br; 432 if (!(flags & kHasWidth)) 433 *flagp &= ~kHasWidth; 434 *flagp |= flags&kSPStart; 435 while (*fInputScanPointer == '|') { 436 fInputScanPointer++; 437 br = Branch(&flags); 438 if (br == NULL) 439 return NULL; 440 Tail(ret, br); // kRegExpBranch -> kRegExpBranch. 441 if (!(flags & kHasWidth)) 442 *flagp &= ~kHasWidth; 443 *flagp |= flags&kSPStart; 444 } 445 446 // Make a closing node, and hook it on the end. 447 ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd); 448 Tail(ret, ender); 449 450 // Hook the tails of the branches to the closing node. 451 for (br = ret; br != NULL; br = Next(br)) 452 OpTail(br, ender); 453 454 // Check for proper termination. 455 if (paren && *fInputScanPointer++ != ')') { 456 SetError(REGEXP_UNMATCHED_PARENTHESIS); 457 return NULL; 458 } else if (!paren && *fInputScanPointer != '\0') { 459 if (*fInputScanPointer == ')') { 460 SetError(REGEXP_UNMATCHED_PARENTHESIS); 461 return NULL; 462 } else { 463 SetError(REGEXP_JUNK_ON_END); 464 return NULL; // "Can't happen". 465 } 466 // NOTREACHED 467 } 468 469 return ret; 470 } 471 472 // 473 // - Branch - one alternative of an | operator 474 // 475 // Implements the concatenation operator. 476 // 477 char * 478 RegExp::Branch(int32 *flagp) 479 { 480 char *ret; 481 char *chain; 482 char *latest; 483 int32 flags; 484 485 *flagp = kWorst; // Tentatively. 486 487 ret = Node(kRegExpBranch); 488 chain = NULL; 489 while (*fInputScanPointer != '\0' 490 && *fInputScanPointer != '|' 491 && *fInputScanPointer != ')') { 492 latest = Piece(&flags); 493 if (latest == NULL) 494 return NULL; 495 *flagp |= flags & kHasWidth; 496 if (chain == NULL) // First piece. 497 *flagp |= flags & kSPStart; 498 else 499 Tail(chain, latest); 500 chain = latest; 501 } 502 if (chain == NULL) // Loop ran zero times. 503 Node(kRegExpNothing); 504 505 return ret; 506 } 507 508 // 509 // - Piece - something followed by possible [*+?] 510 // 511 // Note that the branching code sequences used for ? and the general cases 512 // of * and + are somewhat optimized: they use the same kRegExpNothing node as 513 // both the endmarker for their branch list and the body of the last branch. 514 // It might seem that this node could be dispensed with entirely, but the 515 // endmarker role is not redundant. 516 // 517 char * 518 RegExp::Piece(int32 *flagp) 519 { 520 char *ret; 521 char op; 522 char *next; 523 int32 flags; 524 525 ret = Atom(&flags); 526 if (ret == NULL) 527 return NULL; 528 529 op = *fInputScanPointer; 530 if (!IsMult(op)) { 531 *flagp = flags; 532 return ret; 533 } 534 535 if (!(flags & kHasWidth) && op != '?') { 536 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY); 537 return NULL; 538 } 539 *flagp = op != '+' ? kWorst | kSPStart : kWorst | kHasWidth; 540 541 if (op == '*' && (flags & kSimple)) 542 Insert(kRegExpStar, ret); 543 else if (op == '*') { 544 // Emit x* as (x&|), where & means "self". 545 Insert(kRegExpBranch, ret); // Either x 546 OpTail(ret, Node(kRegExpBack)); // and loop 547 OpTail(ret, ret); // back 548 Tail(ret, Node(kRegExpBranch)); // or 549 Tail(ret, Node(kRegExpNothing)); // null. 550 } else if (op == '+' && (flags & kSimple)) 551 Insert(kRegExpPlus, ret); 552 else if (op == '+') { 553 // Emit x+ as x(&|), where & means "self". 554 next = Node(kRegExpBranch); // Either 555 Tail(ret, next); 556 Tail(Node(kRegExpBack), ret); // loop back 557 Tail(next, Node(kRegExpBranch)); // or 558 Tail(ret, Node(kRegExpNothing)); // null. 559 } else if (op == '?') { 560 // Emit x? as (x|) 561 Insert(kRegExpBranch, ret); // Either x 562 Tail(ret, Node(kRegExpBranch)); // or 563 next = Node(kRegExpNothing); // null. 564 Tail(ret, next); 565 OpTail(ret, next); 566 } 567 fInputScanPointer++; 568 if (IsMult(*fInputScanPointer)) { 569 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS); 570 return NULL; 571 } 572 return ret; 573 } 574 575 // 576 // - Atom - the lowest level 577 // 578 // Optimization: gobbles an entire sequence of ordinary characters so that 579 // it can turn them into a single node, which is smaller to store and 580 // faster to run. Backslashed characters are exceptions, each becoming a 581 // separate node; the code is simpler that way and it's not worth fixing. 582 // 583 char * 584 RegExp::Atom(int32 *flagp) 585 { 586 char *ret; 587 int32 flags; 588 589 *flagp = kWorst; // Tentatively. 590 591 switch (*fInputScanPointer++) { 592 case '^': 593 ret = Node(kRegExpBol); 594 break; 595 case '$': 596 ret = Node(kRegExpEol); 597 break; 598 case '.': 599 ret = Node(kRegExpAny); 600 *flagp |= kHasWidth|kSimple; 601 break; 602 case '[': 603 { 604 int32 cclass; 605 int32 classend; 606 607 if (*fInputScanPointer == '^') { // Complement of range. 608 ret = Node(kRegExpAnyBut); 609 fInputScanPointer++; 610 } else 611 ret = Node(kRegExpAnyOf); 612 if (*fInputScanPointer == ']' || *fInputScanPointer == '-') 613 Char(*fInputScanPointer++); 614 while (*fInputScanPointer != '\0' && *fInputScanPointer != ']') { 615 if (*fInputScanPointer == '-') { 616 fInputScanPointer++; 617 if (*fInputScanPointer == ']' || *fInputScanPointer == '\0') 618 Char('-'); 619 else { 620 cclass = UCharAt(fInputScanPointer - 2) + 1; 621 classend = UCharAt(fInputScanPointer); 622 if (cclass > classend + 1) { 623 SetError(REGEXP_INVALID_BRACKET_RANGE); 624 return NULL; 625 } 626 for (; cclass <= classend; cclass++) 627 Char((char)cclass); 628 fInputScanPointer++; 629 } 630 } else 631 Char(*fInputScanPointer++); 632 } 633 Char('\0'); 634 if (*fInputScanPointer != ']') { 635 SetError(REGEXP_UNMATCHED_BRACKET); 636 return NULL; 637 } 638 fInputScanPointer++; 639 *flagp |= kHasWidth | kSimple; 640 } 641 break; 642 case '(': 643 ret = Reg(1, &flags); 644 if (ret == NULL) 645 return NULL; 646 *flagp |= flags & (kHasWidth | kSPStart); 647 break; 648 case '\0': 649 case '|': 650 case ')': 651 SetError(REGEXP_INTERNAL_ERROR); 652 return NULL; // Supposed to be caught earlier. 653 case '?': 654 case '+': 655 case '*': 656 SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING); 657 return NULL; 658 case '\\': 659 if (*fInputScanPointer == '\0') { 660 SetError(REGEXP_TRAILING_BACKSLASH); 661 return NULL; 662 } 663 ret = Node(kRegExpExactly); 664 Char(*fInputScanPointer++); 665 Char('\0'); 666 *flagp |= kHasWidth|kSimple; 667 break; 668 default: 669 { 670 int32 len; 671 char ender; 672 673 fInputScanPointer--; 674 len = (int32)strcspn(fInputScanPointer, kMeta); 675 if (len <= 0) { 676 SetError(REGEXP_INTERNAL_ERROR); 677 return NULL; 678 } 679 ender = *(fInputScanPointer + len); 680 if (len > 1 && IsMult(ender)) 681 len--; // Back off clear of ?+* operand. 682 *flagp |= kHasWidth; 683 if (len == 1) 684 *flagp |= kSimple; 685 ret = Node(kRegExpExactly); 686 while (len > 0) { 687 Char(*fInputScanPointer++); 688 len--; 689 } 690 Char('\0'); 691 } 692 break; 693 } 694 695 return ret; 696 } 697 698 // 699 // - Node - emit a node 700 // 701 char * // Location. 702 RegExp::Node(char op) 703 { 704 char *ret; 705 char *ptr; 706 707 ret = fCodeEmitPointer; 708 if (ret == &fDummy) { 709 fCodeSize += 3; 710 return ret; 711 } 712 713 ptr = ret; 714 *ptr++ = op; 715 *ptr++ = '\0'; // Null "next" pointer. 716 *ptr++ = '\0'; 717 fCodeEmitPointer = ptr; 718 719 return ret; 720 } 721 722 // 723 // - Char - emit (if appropriate) a byte of code 724 // 725 void 726 RegExp::Char(char b) 727 { 728 if (fCodeEmitPointer != &fDummy) 729 *fCodeEmitPointer++ = b; 730 else 731 fCodeSize++; 732 } 733 734 // 735 // - Insert - insert an operator in front of already-emitted operand 736 // 737 // Means relocating the operand. 738 // 739 void 740 RegExp::Insert(char op, char *opnd) 741 { 742 char *src; 743 char *dst; 744 char *place; 745 746 if (fCodeEmitPointer == &fDummy) { 747 fCodeSize += 3; 748 return; 749 } 750 751 src = fCodeEmitPointer; 752 fCodeEmitPointer += 3; 753 dst = fCodeEmitPointer; 754 while (src > opnd) 755 *--dst = *--src; 756 757 place = opnd; // Op node, where operand used to be. 758 *place++ = op; 759 *place++ = '\0'; 760 *place++ = '\0'; 761 } 762 763 // 764 // - Tail - set the next-pointer at the end of a node chain 765 // 766 void 767 RegExp::Tail(char *p, char *val) 768 { 769 char *scan; 770 char *temp; 771 int32 offset; 772 773 if (p == &fDummy) 774 return; 775 776 // Find last node. 777 scan = p; 778 for (;;) { 779 temp = Next(scan); 780 if (temp == NULL) 781 break; 782 scan = temp; 783 } 784 785 if (scan[0] == kRegExpBack) 786 offset = scan - val; 787 else 788 offset = val - scan; 789 790 scan[1] = (char)((offset >> 8) & 0377); 791 scan[2] = (char)(offset & 0377); 792 } 793 794 // 795 // - OpTail - Tail on operand of first argument; nop if operandless 796 // 797 void 798 RegExp::OpTail(char *p, char *val) 799 { 800 // "Operandless" and "op != kRegExpBranch" are synonymous in practice. 801 if (p == NULL || p == &fDummy || *p != kRegExpBranch) 802 return; 803 Tail(Operand(p), val); 804 } 805 806 // 807 // RunMatcher and friends 808 // 809 810 // 811 // - RunMatcher - match a regexp against a string 812 // 813 int32 814 RegExp::RunMatcher(regexp *prog, const char *string) const 815 { 816 const char *s; 817 818 // Be paranoid... 819 if (prog == NULL || string == NULL) { 820 SetError(B_BAD_VALUE); 821 return 0; 822 } 823 824 // Check validity of program. 825 if (UCharAt(prog->program) != kRegExpMagic) { 826 SetError(REGEXP_CORRUPTED_PROGRAM); 827 return 0; 828 } 829 830 // If there is a "must appear" string, look for it. 831 if (prog->regmust != NULL) { 832 s = string; 833 while ((s = strchr(s, prog->regmust[0])) != NULL) { 834 if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) 835 break; // Found it. 836 s++; 837 } 838 if (s == NULL) // Not present. 839 return 0; 840 } 841 842 // Mark beginning of line for ^ . 843 fRegBol = string; 844 845 // Simplest case: anchored match need be tried only once. 846 if (prog->reganch) 847 return Try(prog, (char*)string); 848 849 // Messy cases: unanchored match. 850 s = string; 851 if (prog->regstart != '\0') 852 // We know what char it must start with. 853 while ((s = strchr(s, prog->regstart)) != NULL) { 854 if (Try(prog, (char*)s)) 855 return 1; 856 s++; 857 } 858 else 859 // We don't -- general case. 860 do { 861 if (Try(prog, (char*)s)) 862 return 1; 863 } while (*s++ != '\0'); 864 865 // Failure. 866 return 0; 867 } 868 869 // 870 // - Try - try match at specific point 871 // 872 int32 // 0 failure, 1 success 873 RegExp::Try(regexp *prog, const char *string) const 874 { 875 int32 i; 876 const char **sp; 877 const char **ep; 878 879 fStringInputPointer = string; 880 fStartPArrayPointer = prog->startp; 881 fEndPArrayPointer = prog->endp; 882 883 sp = prog->startp; 884 ep = prog->endp; 885 for (i = kSubExpressionMax; i > 0; i--) { 886 *sp++ = NULL; 887 *ep++ = NULL; 888 } 889 if (Match(prog->program + 1)) { 890 prog->startp[0] = string; 891 prog->endp[0] = fStringInputPointer; 892 return 1; 893 } else 894 return 0; 895 } 896 897 // 898 // - Match - main matching routine 899 // 900 // Conceptually the strategy is simple: check to see whether the current 901 // node matches, call self recursively to see whether the rest matches, 902 // and then act accordingly. In practice we make some effort to avoid 903 // recursion, in particular by going through "ordinary" nodes (that don't 904 // need to know whether the rest of the match failed) by a loop instead of 905 // by recursion. 906 /// 907 int32 // 0 failure, 1 success 908 RegExp::Match(const char *prog) const 909 { 910 const char *scan; // Current node. 911 const char *next; // Next node. 912 913 scan = prog; 914 #ifdef DEBUG 915 if (scan != NULL && regnarrate) 916 fprintf(stderr, "%s(\n", Prop(scan)); 917 #endif 918 while (scan != NULL) { 919 #ifdef DEBUG 920 if (regnarrate) 921 fprintf(stderr, "%s...\n", Prop(scan)); 922 #endif 923 next = Next(scan); 924 925 switch (*scan) { 926 case kRegExpBol: 927 if (fStringInputPointer != fRegBol) 928 return 0; 929 break; 930 case kRegExpEol: 931 if (*fStringInputPointer != '\0') 932 return 0; 933 break; 934 case kRegExpAny: 935 if (*fStringInputPointer == '\0') 936 return 0; 937 fStringInputPointer++; 938 break; 939 case kRegExpExactly: 940 { 941 const char *opnd = Operand(scan); 942 // Inline the first character, for speed. 943 if (*opnd != *fStringInputPointer) 944 return 0; 945 946 uint32 len = strlen(opnd); 947 if (len > 1 && strncmp(opnd, fStringInputPointer, len) != 0) 948 return 0; 949 950 fStringInputPointer += len; 951 } 952 break; 953 case kRegExpAnyOf: 954 if (*fStringInputPointer == '\0' 955 || strchr(Operand(scan), *fStringInputPointer) == NULL) 956 return 0; 957 fStringInputPointer++; 958 break; 959 case kRegExpAnyBut: 960 if (*fStringInputPointer == '\0' 961 || strchr(Operand(scan), *fStringInputPointer) != NULL) 962 return 0; 963 fStringInputPointer++; 964 break; 965 case kRegExpNothing: 966 break; 967 case kRegExpBack: 968 break; 969 case kRegExpOpen + 1: 970 case kRegExpOpen + 2: 971 case kRegExpOpen + 3: 972 case kRegExpOpen + 4: 973 case kRegExpOpen + 5: 974 case kRegExpOpen + 6: 975 case kRegExpOpen + 7: 976 case kRegExpOpen + 8: 977 case kRegExpOpen + 9: 978 { 979 int32 no; 980 const char *save; 981 982 no = *scan - kRegExpOpen; 983 save = fStringInputPointer; 984 985 if (Match(next)) { 986 // 987 // Don't set startp if some later 988 // invocation of the same parentheses 989 // already has. 990 // 991 if (fStartPArrayPointer[no] == NULL) 992 fStartPArrayPointer[no] = save; 993 return 1; 994 } else 995 return 0; 996 } 997 break; 998 case kRegExpClose + 1: 999 case kRegExpClose + 2: 1000 case kRegExpClose + 3: 1001 case kRegExpClose + 4: 1002 case kRegExpClose + 5: 1003 case kRegExpClose + 6: 1004 case kRegExpClose + 7: 1005 case kRegExpClose + 8: 1006 case kRegExpClose + 9: 1007 { 1008 int32 no; 1009 const char *save; 1010 1011 no = *scan - kRegExpClose; 1012 save = fStringInputPointer; 1013 1014 if (Match(next)) { 1015 // 1016 // Don't set endp if some later 1017 // invocation of the same parentheses 1018 // already has. 1019 // 1020 if (fEndPArrayPointer[no] == NULL) 1021 fEndPArrayPointer[no] = save; 1022 return 1; 1023 } else 1024 return 0; 1025 } 1026 break; 1027 case kRegExpBranch: 1028 { 1029 const char *save; 1030 1031 if (*next != kRegExpBranch) // No choice. 1032 next = Operand(scan); // Avoid recursion. 1033 else { 1034 do { 1035 save = fStringInputPointer; 1036 if (Match(Operand(scan))) 1037 return 1; 1038 fStringInputPointer = save; 1039 scan = Next(scan); 1040 } while (scan != NULL && *scan == kRegExpBranch); 1041 return 0; 1042 // NOTREACHED/ 1043 } 1044 } 1045 break; 1046 case kRegExpStar: 1047 case kRegExpPlus: 1048 { 1049 char nextch; 1050 int32 no; 1051 const char *save; 1052 int32 min; 1053 1054 // 1055 //Lookahead to avoid useless match attempts 1056 // when we know what character comes next. 1057 // 1058 nextch = '\0'; 1059 if (*next == kRegExpExactly) 1060 nextch = *Operand(next); 1061 min = (*scan == kRegExpStar) ? 0 : 1; 1062 save = fStringInputPointer; 1063 no = Repeat(Operand(scan)); 1064 while (no >= min) { 1065 // If it could work, try it. 1066 if (nextch == '\0' || *fStringInputPointer == nextch) 1067 if (Match(next)) 1068 return 1; 1069 // Couldn't or didn't -- back up. 1070 no--; 1071 fStringInputPointer = save + no; 1072 } 1073 return 0; 1074 } 1075 break; 1076 case kRegExpEnd: 1077 return 1; // Success! 1078 1079 default: 1080 SetError(REGEXP_MEMORY_CORRUPTION); 1081 return 0; 1082 } 1083 1084 scan = next; 1085 } 1086 1087 // 1088 // We get here only if there's trouble -- normally "case kRegExpEnd" is 1089 // the terminating point. 1090 // 1091 SetError(REGEXP_CORRUPTED_POINTERS); 1092 return 0; 1093 } 1094 1095 // 1096 // - Repeat - repeatedly match something simple, report how many 1097 // 1098 int32 1099 RegExp::Repeat(const char *p) const 1100 { 1101 int32 count = 0; 1102 const char *scan; 1103 const char *opnd; 1104 1105 scan = fStringInputPointer; 1106 opnd = Operand(p); 1107 switch (*p) { 1108 case kRegExpAny: 1109 count = (int32)strlen(scan); 1110 scan += count; 1111 break; 1112 1113 case kRegExpExactly: 1114 while (*opnd == *scan) { 1115 count++; 1116 scan++; 1117 } 1118 break; 1119 1120 case kRegExpAnyOf: 1121 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1122 count++; 1123 scan++; 1124 } 1125 break; 1126 1127 case kRegExpAnyBut: 1128 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1129 count++; 1130 scan++; 1131 } 1132 break; 1133 1134 default: // Oh dear. Called inappropriately. 1135 SetError(REGEXP_INTERNAL_ERROR); 1136 count = 0; // Best compromise. 1137 break; 1138 } 1139 fStringInputPointer = scan; 1140 1141 return count; 1142 } 1143 1144 // 1145 // - Next - dig the "next" pointer out of a node 1146 // 1147 char * 1148 RegExp::Next(char *p) 1149 { 1150 int32 offset; 1151 1152 if (p == &fDummy) 1153 return NULL; 1154 1155 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1156 if (offset == 0) 1157 return NULL; 1158 1159 if (*p == kRegExpBack) 1160 return p - offset; 1161 else 1162 return p + offset; 1163 } 1164 1165 const char * 1166 RegExp::Next(const char *p) const 1167 { 1168 int32 offset; 1169 1170 if (p == &fDummy) 1171 return NULL; 1172 1173 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377); 1174 if (offset == 0) 1175 return NULL; 1176 1177 if (*p == kRegExpBack) 1178 return p - offset; 1179 else 1180 return p + offset; 1181 } 1182 1183 inline int32 1184 RegExp::UCharAt(const char *p) const 1185 { 1186 return (int32)*(unsigned char *)p; 1187 } 1188 1189 inline char * 1190 RegExp::Operand(char* p) const 1191 { 1192 return p + 3; 1193 } 1194 1195 inline const char * 1196 RegExp::Operand(const char* p) const 1197 { 1198 return p + 3; 1199 } 1200 1201 inline bool 1202 RegExp::IsMult(char c) const 1203 { 1204 return c == '*' || c == '+' || c == '?'; 1205 } 1206 1207 1208 #ifdef DEBUG 1209 1210 // 1211 // - Dump - dump a regexp onto stdout in vaguely comprehensible form 1212 // 1213 void 1214 RegExp::Dump() 1215 { 1216 const char *s; 1217 char op = kRegExpExactly; // Arbitrary non-kRegExpEnd op. 1218 const char *next; 1219 1220 s = fRegExp->program + 1; 1221 while (op != kRegExpEnd) { // While that wasn't kRegExpEnd last time... 1222 op = *s; 1223 printf("%2ld%s", s - fRegExp->program, Prop(s)); // Where, what. 1224 next = Next(s); 1225 if (next == NULL) // Next ptr. 1226 printf("(0)"); 1227 else 1228 printf("(%ld)", (s - fRegExp->program) + (next - s)); 1229 s += 3; 1230 if (op == kRegExpAnyOf || op == kRegExpAnyBut || op == kRegExpExactly) { 1231 // Literal string, where present. 1232 while (*s != '\0') { 1233 putchar(*s); 1234 s++; 1235 } 1236 s++; 1237 } 1238 putchar('\n'); 1239 } 1240 1241 // Header fields of interest. 1242 if (fRegExp->regstart != '\0') 1243 printf("start `%c' ", fRegExp->regstart); 1244 if (fRegExp->reganch) 1245 printf("anchored "); 1246 if (fRegExp->regmust != NULL) 1247 printf("must have \"%s\"", fRegExp->regmust); 1248 printf("\n"); 1249 } 1250 1251 // 1252 // - Prop - printable representation of opcode 1253 // 1254 char * 1255 RegExp::Prop(const char *op) const 1256 { 1257 const char *p = NULL; 1258 static char buf[50]; 1259 1260 (void) strcpy(buf, ":"); 1261 1262 switch (*op) { 1263 case kRegExpBol: 1264 p = "kRegExpBol"; 1265 break; 1266 case kRegExpEol: 1267 p = "kRegExpEol"; 1268 break; 1269 case kRegExpAny: 1270 p = "kRegExpAny"; 1271 break; 1272 case kRegExpAnyOf: 1273 p = "kRegExpAnyOf"; 1274 break; 1275 case kRegExpAnyBut: 1276 p = "kRegExpAnyBut"; 1277 break; 1278 case kRegExpBranch: 1279 p = "kRegExpBranch"; 1280 break; 1281 case kRegExpExactly: 1282 p = "kRegExpExactly"; 1283 break; 1284 case kRegExpNothing: 1285 p = "kRegExpNothing"; 1286 break; 1287 case kRegExpBack: 1288 p = "kRegExpBack"; 1289 break; 1290 case kRegExpEnd: 1291 p = "kRegExpEnd"; 1292 break; 1293 case kRegExpOpen + 1: 1294 case kRegExpOpen + 2: 1295 case kRegExpOpen + 3: 1296 case kRegExpOpen + 4: 1297 case kRegExpOpen + 5: 1298 case kRegExpOpen + 6: 1299 case kRegExpOpen + 7: 1300 case kRegExpOpen + 8: 1301 case kRegExpOpen + 9: 1302 sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen); 1303 p = NULL; 1304 break; 1305 case kRegExpClose + 1: 1306 case kRegExpClose + 2: 1307 case kRegExpClose + 3: 1308 case kRegExpClose + 4: 1309 case kRegExpClose + 5: 1310 case kRegExpClose + 6: 1311 case kRegExpClose + 7: 1312 case kRegExpClose + 8: 1313 case kRegExpClose + 9: 1314 sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose); 1315 p = NULL; 1316 break; 1317 case kRegExpStar: 1318 p = "kRegExpStar"; 1319 break; 1320 case kRegExpPlus: 1321 p = "kRegExpPlus"; 1322 break; 1323 default: 1324 RegExpError("corrupted opcode"); 1325 break; 1326 } 1327 1328 if (p != NULL) 1329 strcat(buf, p); 1330 1331 return buf; 1332 } 1333 1334 void 1335 RegExp::RegExpError(const char *) const 1336 { 1337 // does nothing now, perhaps it should printf? 1338 } 1339 1340 #endif 1341