1 /*
2 Open Tracker License
3
4 Terms and Conditions
5
6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
14
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
28
29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
33 */
34
35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
36
37 // @(#)regexp.c 1.3 of 18 April 87
38 //
39 // Copyright (c) 1986 by University of Toronto.
40 // Written by Henry Spencer. Not derived from licensed software.
41 //
42 // Permission is granted to anyone to use this software for any
43 // purpose on any computer system, and to redistribute it freely,
44 // subject to the following restrictions:
45 //
46 // 1. The author is not responsible for the consequences of use of
47 // this software, no matter how awful, even if they arise
48 // from defects in it.
49 //
50 // 2. The origin of this software must not be misrepresented, either
51 // by explicit claim or by omission.
52 //
53 // 3. Altered versions must be plainly marked as such, and must not
54 // be misrepresented as being the original software.
55 //
56 // Beware that some of this code is subtly aware of the way operator
57 // precedence is structured in regular expressions. Serious changes in
58 // regular-expression syntax might require a total rethink.
59 //
60
61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62 // project (www.opentracker.org), Jul 11, 2000.
63
64
65 #include <stdlib.h>
66 #include <stdio.h>
67 #include <string.h>
68
69 #include <Catalog.h>
70 #include <Errors.h>
71
72 #include "RegExp.h"
73
74
75 #undef B_TRANSLATION_CONTEXT
76 #define B_TRANSLATION_CONTEXT "libtracker"
77
78
79 // The first byte of the regexp internal "program" is actually this magic
80 // number; the start node begins in the second byte.
81
82 const uint8 kRegExpMagic = 0234;
83
84 // The "internal use only" fields in RegExp.h are present to pass info from
85 // compile to execute that permits the execute phase to run lots faster on
86 // simple cases. They are:
87 //
88 // regstart char that must begin a match; '\0' if none obvious
89 // reganch is the match anchored (at beginning-of-line only)?
90 // regmust string (pointer into program) that match must include, or NULL
91 // regmlen length of regmust string
92 //
93 // Regstart and reganch permit very fast decisions on suitable starting points
94 // for a match, cutting down the work a lot. Regmust permits fast rejection
95 // of lines that cannot possibly match. The regmust tests are costly enough
96 // that Compile() supplies a regmust only if the r.e. contains something
97 // potentially expensive (at present, the only such thing detected is * or +
98 // at the start of the r.e., which can involve a lot of backup). Regmlen is
99 // supplied because the test in RunMatcher() needs it and Compile() is
100 // computing it anyway.
101 //
102 //
103 //
104 // Structure for regexp "program". This is essentially a linear encoding
105 // of a nondeterministic finite-state machine (aka syntax charts or
106 // "railroad normal form" in parsing technology). Each node is an opcode
107 // plus a "next" pointer, possibly plus an operand. "Next" pointers of
108 // all nodes except kRegExpBranch implement concatenation; a "next" pointer
109 // with a kRegExpBranch on both ends of it is connecting two alternatives.
110 // (Here we have one of the subtle syntax dependencies: an individual
111 // kRegExpBranch (as opposed to a collection of them) is never concatenated
112 // with anything because of operator precedence.) The operand of some types
113 // of node is a literal string; for others, it is a node leading into a
114 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first
115 // node of the branch. (NB this is* not* a tree structure: the tail of the
116 // branch connects to the thing following the set of kRegExpBranches).
117 // The opcodes are:
118 //
119
120 // definition number opnd? meaning
121 enum {
122 kRegExpEnd = 0, // no End of program.
123 kRegExpBol = 1, // no Match "" at beginning of line.
124 kRegExpEol = 2, // no Match "" at end of line.
125 kRegExpAny = 3, // no Match any one character.
126 kRegExpAnyOf = 4, // str Match any character in this string.
127 kRegExpAnyBut = 5, // str Match any character not in this string.
128 kRegExpBranch = 6, // node Match this alternative, or the next...
129 kRegExpBack = 7, // no Match "", "next" ptr points backward.
130 kRegExpExactly = 8, // str Match this string.
131 kRegExpNothing = 9, // no Match empty string.
132 kRegExpStar = 10, // node Match this (simple) thing 0 or more times.
133 kRegExpPlus = 11, // node Match this (simple) thing 1 or more times.
134 kRegExpOpen = 20, // no Mark this point in input as start of #n.
135 // kRegExpOpen + 1 is number 1, etc.
136 kRegExpClose = 30 // no Analogous to kRegExpOpen.
137 };
138
139 //
140 // Opcode notes:
141 //
142 // kRegExpBranch The set of branches constituting a single choice are
143 // hooked together with their "next" pointers, since precedence prevents
144 // anything being concatenated to any individual branch. The
145 // "next" pointer of the last kRegExpBranch in a choice points to the
146 // thing following the whole choice. This is also where the
147 // final "next" pointer of each individual branch points; each
148 // branch starts with the operand node of a kRegExpBranch node.
149 //
150 // kRegExpBack Normal "next" pointers all implicitly point forward;
151 // kRegExpBack exists to make loop structures possible.
152 //
153 // kRegExpStar,kRegExpPlus '?', and complex '*' and '+', are implemented as
154 // circular kRegExpBranch structures using kRegExpBack. Simple cases
155 // (one character per match) are implemented with kRegExpStar and
156 // kRegExpPlus for speed and to minimize recursive plunges.
157 //
158 // kRegExpOpen,kRegExpClose ...are numbered at compile time.
159 //
160 //
161 //
162 // A node is one char of opcode followed by two chars of "next" pointer.
163 // "Next" pointers are stored as two 8-bit pieces, high order first. The
164 // value is a positive offset from the opcode of the node containing it.
165 // An operand, if any, simply follows the node. (Note that much of the
166 // code generation knows about this implicit relationship.)
167 //
168 // Using two bytes for the "next" pointer is vast overkill for most things,
169 // but allows patterns to get big without disasters.
170 //
171
172 const char* kMeta = "^$.[()|?+*\\";
173 const int32 kMaxSize = 32767L;
174 // Probably could be 65535L.
175
176
177 // Flags to be passed up and down:
178 enum {
179 kHasWidth = 01, // Known never to match null string.
180 kSimple = 02, // Simple enough to be kRegExpStar/kRegExpPlus operand.
181 kSPStart = 04, // Starts with * or +.
182 kWorst = 0 // Worst case.
183 };
184
185
186 const char* kRegExpErrorStringArray[] = {
187 B_TRANSLATE_MARK("Unmatched parenthesis."),
188 B_TRANSLATE_MARK("Expression too long."),
189 B_TRANSLATE_MARK("Too many parenthesis."),
190 B_TRANSLATE_MARK("Junk on end."),
191 B_TRANSLATE_MARK("*+? operand may be empty."),
192 B_TRANSLATE_MARK("Nested *?+."),
193 B_TRANSLATE_MARK("Invalid bracket range."),
194 B_TRANSLATE_MARK("Unmatched brackets."),
195 B_TRANSLATE_MARK("Internal error."),
196 B_TRANSLATE_MARK("?+* follows nothing."),
197 B_TRANSLATE_MARK("Trailing \\."),
198 B_TRANSLATE_MARK("Corrupted expression."),
199 B_TRANSLATE_MARK("Memory corruption."),
200 B_TRANSLATE_MARK("Corrupted pointers."),
201 B_TRANSLATE_MARK("Corrupted opcode.")
202 };
203
204
205 #ifdef DEBUG
206 int32 regnarrate = 0;
207 #endif
208
209
210 // #pragma mark - RegExp
211
212
RegExp()213 RegExp::RegExp()
214 :
215 fError(B_OK),
216 fRegExp(NULL),
217 fInputScanPointer(NULL),
218 fParenthesisCount(0),
219 fDummy('\0'),
220 fCodeEmitPointer(NULL),
221 fCodeSize(0),
222 fStringInputPointer(NULL),
223 fRegBol(NULL),
224 fStartPArrayPointer(NULL),
225 fEndPArrayPointer(NULL)
226 {
227 }
228
229
RegExp(const char * pattern)230 RegExp::RegExp(const char* pattern)
231 :
232 fError(B_OK),
233 fRegExp(NULL)
234 {
235 fRegExp = Compile(pattern);
236 }
237
238
RegExp(const BString & pattern)239 RegExp::RegExp(const BString& pattern)
240 :
241 fError(B_OK),
242 fRegExp(NULL)
243 {
244 fRegExp = Compile(pattern.String());
245 }
246
247
~RegExp()248 RegExp::~RegExp()
249 {
250 free(fRegExp);
251 }
252
253
254 status_t
InitCheck() const255 RegExp::InitCheck() const
256 {
257 return fError;
258 }
259
260
261 status_t
SetTo(const char * pattern)262 RegExp::SetTo(const char* pattern)
263 {
264 fError = B_OK;
265 free(fRegExp);
266 fRegExp = Compile(pattern);
267 return fError;
268 }
269
270
271 status_t
SetTo(const BString & pattern)272 RegExp::SetTo(const BString& pattern)
273 {
274 fError = B_OK;
275 free(fRegExp);
276 fRegExp = Compile(pattern.String());
277 return fError;
278 }
279
280
281 bool
Matches(const char * string) const282 RegExp::Matches(const char* string) const
283 {
284 if (fRegExp == NULL || string == NULL)
285 return false;
286
287 return RunMatcher(fRegExp, string) == 1;
288 }
289
290
291 bool
Matches(const BString & string) const292 RegExp::Matches(const BString& string) const
293 {
294 if (fRegExp == NULL)
295 return false;
296
297 return RunMatcher(fRegExp, string.String()) == 1;
298 }
299
300
301 //
302 // - Compile - compile a regular expression into internal code
303 //
304 // We can't allocate space until we know how big the compiled form will be,
305 // but we can't compile it (and thus know how big it is) until we've got a
306 // place to put the code. So we cheat: we compile it twice, once with code
307 // generation turned off and size counting turned on, and once "for real".
308 // This also means that we don't allocate space until we are sure that the
309 // thing really will compile successfully, and we never have to move the
310 // code and thus invalidate pointers into it. (Note that it has to be in
311 // one piece because free() must be able to free it all.)
312 //
313 // Beware that the optimization-preparation code in here knows about some
314 // of the structure of the compiled regexp.
315 regexp*
Compile(const char * exp)316 RegExp::Compile(const char* exp)
317 {
318 regexp* r;
319 const char* scan;
320 const char* longest;
321 int32 len;
322 int32 flags;
323
324 if (exp == NULL) {
325 SetError(B_BAD_VALUE);
326 return NULL;
327 }
328
329 // First pass: determine size, legality.
330 fInputScanPointer = exp;
331 fParenthesisCount = 1;
332 fCodeSize = 0L;
333 fCodeEmitPointer = &fDummy;
334 Char(kRegExpMagic);
335 if (Reg(0, &flags) == NULL)
336 return NULL;
337
338 // Small enough for pointer-storage convention?
339 if (fCodeSize >= kMaxSize) {
340 SetError(REGEXP_TOO_BIG);
341 return NULL;
342 }
343
344 r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
345 // Allocate space
346
347 if (r == NULL) {
348 SetError(B_NO_MEMORY);
349 return NULL;
350 }
351
352 // Second pass: emit code.
353 fInputScanPointer = exp;
354 fParenthesisCount = 1;
355 fCodeEmitPointer = r->program;
356 Char(kRegExpMagic);
357 if (Reg(0, &flags) == NULL) {
358 free(r);
359 return NULL;
360 }
361
362 // Dig out information for optimizations.
363 r->regstart = '\0';
364 // Worst-case defaults.
365 r->reganch = 0;
366 r->regmust = NULL;
367 r->regmlen = 0;
368 scan = r->program + 1;
369 // First kRegExpBranch.
370 if (*Next((char*)scan) == kRegExpEnd) {
371 // Only one top-level choice.
372 scan = Operand(scan);
373
374 // Starting-point info.
375 if (*scan == kRegExpExactly)
376 r->regstart = *Operand(scan);
377 else if (*scan == kRegExpBol)
378 r->reganch++;
379
380 // If there's something expensive in the r.e., find the
381 // longest literal string that must appear and make it the
382 // regmust. Resolve ties in favor of later strings, since
383 // the regstart check works with the beginning of the r.e.
384 // and avoiding duplication strengthens checking. Not a
385 // strong reason, but sufficient in the absence of others.
386 if ((flags & kSPStart) != 0) {
387 longest = NULL;
388 len = 0;
389 for (; scan != NULL; scan = Next((char*)scan)) {
390 if (*scan == kRegExpExactly
391 && (int32)strlen(Operand(scan)) >= len) {
392 longest = Operand(scan);
393 len = (int32)strlen(Operand(scan));
394 }
395 }
396 r->regmust = longest;
397 r->regmlen = len;
398 }
399 }
400
401 return r;
402 }
403
404
405 regexp*
Expression() const406 RegExp::Expression() const
407 {
408 return fRegExp;
409 }
410
411
412 const char*
ErrorString() const413 RegExp::ErrorString() const
414 {
415 if (fError >= REGEXP_UNMATCHED_PARENTHESIS
416 && fError <= REGEXP_CORRUPTED_OPCODE) {
417 return B_TRANSLATE_NOCOLLECT(
418 kRegExpErrorStringArray[fError - B_ERRORS_END]);
419 }
420
421 return strerror(fError);
422 }
423
424
425 void
SetError(status_t error) const426 RegExp::SetError(status_t error) const
427 {
428 fError = error;
429 }
430
431
432 //
433 // - Reg - regular expression, i.e. main body or parenthesized thing
434 //
435 // Caller must absorb opening parenthesis.
436 //
437 // Combining parenthesis handling with the base level of regular expression
438 // is a trifle forced, but the need to tie the tails of the branches to what
439 // follows makes it hard to avoid.
440 //
441 char*
Reg(int32 paren,int32 * flagp)442 RegExp::Reg(int32 paren, int32* flagp)
443 {
444 char* ret;
445 char* br;
446 char* ender;
447 int32 parno = 0;
448 int32 flags;
449
450 *flagp = kHasWidth;
451 // Tentatively.
452
453 // Make an kRegExpOpen node, if parenthesized.
454 if (paren) {
455 if (fParenthesisCount >= kSubExpressionMax) {
456 SetError(REGEXP_TOO_MANY_PARENTHESIS);
457 return NULL;
458 }
459 parno = fParenthesisCount;
460 fParenthesisCount++;
461 ret = Node((char)(kRegExpOpen + parno));
462 } else
463 ret = NULL;
464
465 // Pick up the branches, linking them together.
466 br = Branch(&flags);
467 if (br == NULL)
468 return NULL;
469
470 if (ret != NULL) {
471 Tail(ret, br);
472 // kRegExpOpen -> first
473 } else
474 ret = br;
475
476 if (!(flags & kHasWidth))
477 *flagp &= ~kHasWidth;
478
479 *flagp |= flags&kSPStart;
480 while (*fInputScanPointer == '|') {
481 fInputScanPointer++;
482 br = Branch(&flags);
483 if (br == NULL)
484 return NULL;
485
486 Tail(ret, br);
487 // kRegExpBranch -> kRegExpBranch.
488 if (!(flags & kHasWidth))
489 *flagp &= ~kHasWidth;
490
491 *flagp |= flags&kSPStart;
492 }
493
494 // Make a closing node, and hook it on the end.
495 ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
496 Tail(ret, ender);
497
498 // Hook the tails of the branches to the closing node.
499 for (br = ret; br != NULL; br = Next(br))
500 OpTail(br, ender);
501
502 // Check for proper termination.
503 if (paren && *fInputScanPointer++ != ')') {
504 SetError(REGEXP_UNMATCHED_PARENTHESIS);
505 return NULL;
506 } else if (!paren && *fInputScanPointer != '\0') {
507 if (*fInputScanPointer == ')') {
508 SetError(REGEXP_UNMATCHED_PARENTHESIS);
509 return NULL;
510 } else {
511 SetError(REGEXP_JUNK_ON_END);
512 return NULL; // "Can't happen".
513 }
514 // NOTREACHED
515 }
516
517 return ret;
518 }
519
520
521 //
522 // - Branch - one alternative of an | operator
523 //
524 // Implements the concatenation operator.
525 //
526 char*
Branch(int32 * flagp)527 RegExp::Branch(int32* flagp)
528 {
529 char* ret;
530 char* chain;
531 char* latest;
532 int32 flags;
533
534 *flagp = kWorst;
535 // Tentatively.
536
537 ret = Node(kRegExpBranch);
538 chain = NULL;
539 while (*fInputScanPointer != '\0'
540 && *fInputScanPointer != '|'
541 && *fInputScanPointer != ')') {
542 latest = Piece(&flags);
543 if (latest == NULL)
544 return NULL;
545
546 *flagp |= flags & kHasWidth;
547 if (chain == NULL) {
548 // First piece.
549 *flagp |= flags & kSPStart;
550 } else
551 Tail(chain, latest);
552
553 chain = latest;
554 }
555
556 if (chain == NULL) {
557 // Loop ran zero times.
558 Node(kRegExpNothing);
559 }
560
561 return ret;
562 }
563
564
565 //
566 // - Piece - something followed by possible [*+?]
567 //
568 // Note that the branching code sequences used for ? and the general cases
569 // of * and + are somewhat optimized: they use the same kRegExpNothing node
570 // as both the endmarker for their branch list and the body of the last
571 // branch. It might seem that this node could be dispensed with entirely,
572 // but the endmarker role is not redundant.
573 //
574 char*
Piece(int32 * flagp)575 RegExp::Piece(int32* flagp)
576 {
577 char* ret;
578 char op;
579 char* next;
580 int32 flags;
581
582 ret = Atom(&flags);
583 if (ret == NULL)
584 return NULL;
585
586 op = *fInputScanPointer;
587 if (!IsMult(op)) {
588 *flagp = flags;
589 return ret;
590 }
591
592 if (!(flags & kHasWidth) && op != '?') {
593 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
594 return NULL;
595 }
596 *flagp = op != '+' ? kWorst | kSPStart : kWorst | kHasWidth;
597
598 if (op == '*' && (flags & kSimple))
599 Insert(kRegExpStar, ret);
600 else if (op == '*') {
601 // Emit x* as (x&|), where & means "self".
602 Insert(kRegExpBranch, ret);
603 // Either x
604 OpTail(ret, Node(kRegExpBack));
605 // and loop
606 OpTail(ret, ret);
607 // back
608 Tail(ret, Node(kRegExpBranch));
609 // or
610 Tail(ret, Node(kRegExpNothing));
611 // null.
612 } else if (op == '+' && (flags & kSimple))
613 Insert(kRegExpPlus, ret);
614 else if (op == '+') {
615 // Emit x+ as x(&|), where & means "self".
616 next = Node(kRegExpBranch);
617 // Either
618 Tail(ret, next);
619 Tail(Node(kRegExpBack), ret);
620 // loop back
621 Tail(next, Node(kRegExpBranch));
622 // or
623 Tail(ret, Node(kRegExpNothing));
624 // null.
625 } else if (op == '?') {
626 // Emit x? as (x|)
627 Insert(kRegExpBranch, ret);
628 // Either x
629 Tail(ret, Node(kRegExpBranch));
630 // or
631 next = Node(kRegExpNothing);
632 // null.
633 Tail(ret, next);
634 OpTail(ret, next);
635 }
636
637 fInputScanPointer++;
638 if (IsMult(*fInputScanPointer)) {
639 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
640 return NULL;
641 }
642
643 return ret;
644 }
645
646
647 //
648 // - Atom - the lowest level
649 //
650 // Optimization: gobbles an entire sequence of ordinary characters so that
651 // it can turn them into a single node, which is smaller to store and
652 // faster to run. Backslashed characters are exceptions, each becoming a
653 // separate node; the code is simpler that way and it's not worth fixing.
654 //
655 char*
Atom(int32 * flagp)656 RegExp::Atom(int32* flagp)
657 {
658 char* ret;
659 int32 flags;
660
661 *flagp = kWorst;
662 // tentatively
663
664 switch (*fInputScanPointer++) {
665 case '^':
666 ret = Node(kRegExpBol);
667 break;
668
669 case '$':
670 ret = Node(kRegExpEol);
671 break;
672
673 case '.':
674 ret = Node(kRegExpAny);
675 *flagp |= kHasWidth|kSimple;
676 break;
677
678 case '[':
679 {
680 int32 cclass;
681 int32 classend;
682
683 if (*fInputScanPointer == '^') {
684 // complement of range
685 ret = Node(kRegExpAnyBut);
686 fInputScanPointer++;
687 } else
688 ret = Node(kRegExpAnyOf);
689 if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
690 Char(*fInputScanPointer++);
691 while (*fInputScanPointer != '\0'
692 && *fInputScanPointer != ']') {
693 if (*fInputScanPointer == '-') {
694 fInputScanPointer++;
695 if (*fInputScanPointer == ']'
696 || *fInputScanPointer == '\0') {
697 Char('-');
698 } else {
699 cclass = UCharAt(fInputScanPointer - 2) + 1;
700 classend = UCharAt(fInputScanPointer);
701 if (cclass > classend + 1) {
702 SetError(REGEXP_INVALID_BRACKET_RANGE);
703 return NULL;
704 }
705 for (; cclass <= classend; cclass++)
706 Char((char)cclass);
707 fInputScanPointer++;
708 }
709 } else
710 Char(*fInputScanPointer++);
711 }
712 Char('\0');
713 if (*fInputScanPointer != ']') {
714 SetError(REGEXP_UNMATCHED_BRACKET);
715 return NULL;
716 }
717 fInputScanPointer++;
718 *flagp |= kHasWidth | kSimple;
719 break;
720 }
721
722 case '(':
723 ret = Reg(1, &flags);
724 if (ret == NULL)
725 return NULL;
726 *flagp |= flags & (kHasWidth | kSPStart);
727 break;
728
729 case '\0':
730 case '|':
731 case ')':
732 SetError(REGEXP_INTERNAL_ERROR);
733 return NULL;
734 // supposed to be caught earlier
735
736 case '?':
737 case '+':
738 case '*':
739 SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
740 return NULL;
741
742 case '\\':
743 if (*fInputScanPointer == '\0') {
744 SetError(REGEXP_TRAILING_BACKSLASH);
745 return NULL;
746 }
747 ret = Node(kRegExpExactly);
748 Char(*fInputScanPointer++);
749 Char('\0');
750 *flagp |= kHasWidth|kSimple;
751 break;
752
753 default:
754 {
755 int32 len;
756 char ender;
757
758 fInputScanPointer--;
759 len = (int32)strcspn(fInputScanPointer, kMeta);
760 if (len <= 0) {
761 SetError(REGEXP_INTERNAL_ERROR);
762 return NULL;
763 }
764
765 ender = *(fInputScanPointer + len);
766 if (len > 1 && IsMult(ender)) {
767 // Back off clear of ?+* operand.
768 len--;
769 }
770
771 *flagp |= kHasWidth;
772 if (len == 1)
773 *flagp |= kSimple;
774
775 ret = Node(kRegExpExactly);
776 while (len > 0) {
777 Char(*fInputScanPointer++);
778 len--;
779 }
780
781 Char('\0');
782 break;
783 }
784 }
785
786 return ret;
787 }
788
789
790 //
791 // - Node - emit a node
792 //
793 // Returns location.
794 //
795 char*
Node(char op)796 RegExp::Node(char op)
797 {
798 char* ret;
799 char* ptr;
800
801 ret = fCodeEmitPointer;
802 if (ret == &fDummy) {
803 fCodeSize += 3;
804 return ret;
805 }
806
807 ptr = ret;
808 *ptr++ = op;
809 *ptr++ = '\0';
810 // Null "next" pointer.
811 *ptr++ = '\0';
812 fCodeEmitPointer = ptr;
813
814 return ret;
815 }
816
817
818 //
819 // - Char - emit (if appropriate) a byte of code
820 //
821 void
Char(char b)822 RegExp::Char(char b)
823 {
824 if (fCodeEmitPointer != &fDummy)
825 *fCodeEmitPointer++ = b;
826 else
827 fCodeSize++;
828 }
829
830
831 //
832 // - Insert - insert an operator in front of already-emitted operand
833 //
834 // Means relocating the operand.
835 //
836 void
Insert(char op,char * opnd)837 RegExp::Insert(char op, char* opnd)
838 {
839 char* src;
840 char* dst;
841 char* place;
842
843 if (fCodeEmitPointer == &fDummy) {
844 fCodeSize += 3;
845 return;
846 }
847
848 src = fCodeEmitPointer;
849 fCodeEmitPointer += 3;
850 dst = fCodeEmitPointer;
851 while (src > opnd)
852 *--dst = *--src;
853
854 place = opnd;
855 // Op node, where operand used to be.
856 *place++ = op;
857 *place++ = '\0';
858 *place++ = '\0';
859 }
860
861
862 //
863 // - Tail - set the next-pointer at the end of a node chain
864 //
865 void
Tail(char * p,char * val)866 RegExp::Tail(char* p, char* val)
867 {
868 char* scan;
869 char* temp;
870 int32 offset;
871
872 if (p == &fDummy)
873 return;
874
875 // Find last node.
876 scan = p;
877 for (;;) {
878 temp = Next(scan);
879 if (temp == NULL)
880 break;
881 scan = temp;
882 }
883
884 if (scan[0] == kRegExpBack)
885 offset = scan - val;
886 else
887 offset = val - scan;
888
889 scan[1] = (char)((offset >> 8) & 0377);
890 scan[2] = (char)(offset & 0377);
891 }
892
893
894 //
895 // - OpTail - Tail on operand of first argument; nop if operandless
896 //
897 void
OpTail(char * p,char * val)898 RegExp::OpTail(char* p, char* val)
899 {
900 // "Operandless" and "op != kRegExpBranch" are synonymous in practice.
901 if (p == NULL || p == &fDummy || *p != kRegExpBranch)
902 return;
903
904 Tail(Operand(p), val);
905 }
906
907 //
908 // RunMatcher and friends
909 //
910
911
912 //
913 // - RunMatcher - match a regexp against a string
914 //
915 int32
RunMatcher(regexp * prog,const char * string) const916 RegExp::RunMatcher(regexp* prog, const char* string) const
917 {
918 const char* s;
919
920 // be paranoid...
921 if (prog == NULL || string == NULL) {
922 SetError(B_BAD_VALUE);
923 return 0;
924 }
925
926 // check validity of program
927 if (UCharAt(prog->program) != kRegExpMagic) {
928 SetError(REGEXP_CORRUPTED_PROGRAM);
929 return 0;
930 }
931
932 // if there is a "must appear" string, look for it
933 if (prog->regmust != NULL) {
934 s = string;
935 while ((s = strchr(s, prog->regmust[0])) != NULL) {
936 if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) {
937 // found it
938 break;
939 }
940 s++;
941 }
942 if (s == NULL) {
943 // not present
944 return 0;
945 }
946 }
947
948 // mark beginning of line for ^
949 fRegBol = string;
950
951 // simplest case: anchored match need be tried only once
952 if (prog->reganch)
953 return Try(prog, (char*)string);
954
955 // messy cases: unanchored match
956 s = string;
957 if (prog->regstart != '\0') {
958 // we know what char it must start with
959 while ((s = strchr(s, prog->regstart)) != NULL) {
960 if (Try(prog, (char*)s))
961 return 1;
962 s++;
963 }
964 } else {
965 // we don't -- general case.
966 do {
967 if (Try(prog, (char*)s))
968 return 1;
969 } while (*s++ != '\0');
970 }
971
972 // failure
973 return 0;
974 }
975
976
977 //
978 // - Try - try match at specific point
979 //
980 // Returns 0 on failure, 1 on success.
981 //
982 int32
Try(regexp * prog,const char * string) const983 RegExp::Try(regexp* prog, const char* string) const
984 {
985 int32 i;
986 const char **sp;
987 const char **ep;
988
989 fStringInputPointer = string;
990 fStartPArrayPointer = prog->startp;
991 fEndPArrayPointer = prog->endp;
992
993 sp = prog->startp;
994 ep = prog->endp;
995 for (i = kSubExpressionMax; i > 0; i--) {
996 *sp++ = NULL;
997 *ep++ = NULL;
998 }
999 if (Match(prog->program + 1)) {
1000 prog->startp[0] = string;
1001 prog->endp[0] = fStringInputPointer;
1002 return 1;
1003 } else
1004 return 0;
1005 }
1006
1007
1008 //
1009 // - Match - main matching routine
1010 //
1011 // Conceptually the strategy is simple: check to see whether the current
1012 // node matches, call self recursively to see whether the rest matches,
1013 // and then act accordingly. In practice we make some effort to avoid
1014 // recursion, in particular by going through "ordinary" nodes (that don't
1015 // need to know whether the rest of the match failed) by a loop instead of
1016 // by recursion.
1017 //
1018 // Returns 0 on failure, 1 on success.
1019 //
1020 int32
Match(const char * prog) const1021 RegExp::Match(const char* prog) const
1022 {
1023 const char* scan; // Current node.
1024 const char* next; // Next node.
1025
1026 scan = prog;
1027 #ifdef DEBUG
1028 if (scan != NULL && regnarrate)
1029 fprintf(stderr, "%s(\n", Prop(scan));
1030 #endif
1031 while (scan != NULL) {
1032 #ifdef DEBUG
1033 if (regnarrate)
1034 fprintf(stderr, "%s...\n", Prop(scan));
1035 #endif
1036 next = Next(scan);
1037
1038 switch (*scan) {
1039 case kRegExpBol:
1040 if (fStringInputPointer != fRegBol)
1041 return 0;
1042 break;
1043
1044 case kRegExpEol:
1045 if (*fStringInputPointer != '\0')
1046 return 0;
1047 break;
1048
1049 case kRegExpAny:
1050 if (*fStringInputPointer == '\0')
1051 return 0;
1052 fStringInputPointer++;
1053 break;
1054
1055 case kRegExpExactly:
1056 {
1057 const char* opnd = Operand(scan);
1058 // Inline the first character, for speed.
1059 if (*opnd != *fStringInputPointer)
1060 return 0;
1061
1062 uint32 len = strlen(opnd);
1063 if (len > 1
1064 && strncmp(opnd, fStringInputPointer, len) != 0) {
1065 return 0;
1066 }
1067
1068 fStringInputPointer += len;
1069 }
1070 break;
1071
1072 case kRegExpAnyOf:
1073 if (*fStringInputPointer == '\0'
1074 || strchr(Operand(scan), *fStringInputPointer) == NULL) {
1075 return 0;
1076 }
1077 fStringInputPointer++;
1078 break;
1079
1080 case kRegExpAnyBut:
1081 if (*fStringInputPointer == '\0'
1082 || strchr(Operand(scan), *fStringInputPointer) != NULL) {
1083 return 0;
1084 }
1085 fStringInputPointer++;
1086 break;
1087
1088 case kRegExpNothing:
1089 break;
1090
1091 case kRegExpBack:
1092 break;
1093
1094 case kRegExpOpen + 1:
1095 case kRegExpOpen + 2:
1096 case kRegExpOpen + 3:
1097 case kRegExpOpen + 4:
1098 case kRegExpOpen + 5:
1099 case kRegExpOpen + 6:
1100 case kRegExpOpen + 7:
1101 case kRegExpOpen + 8:
1102 case kRegExpOpen + 9:
1103 {
1104 int32 no;
1105 const char* save;
1106
1107 no = *scan - kRegExpOpen;
1108 save = fStringInputPointer;
1109
1110 if (Match(next)) {
1111 //
1112 // Don't set startp if some later
1113 // invocation of the same parentheses
1114 // already has.
1115 //
1116 if (fStartPArrayPointer[no] == NULL)
1117 fStartPArrayPointer[no] = save;
1118 return 1;
1119 } else
1120 return 0;
1121 }
1122 break;
1123
1124 case kRegExpClose + 1:
1125 case kRegExpClose + 2:
1126 case kRegExpClose + 3:
1127 case kRegExpClose + 4:
1128 case kRegExpClose + 5:
1129 case kRegExpClose + 6:
1130 case kRegExpClose + 7:
1131 case kRegExpClose + 8:
1132 case kRegExpClose + 9:
1133 {
1134 int32 no;
1135 const char* save;
1136
1137 no = *scan - kRegExpClose;
1138 save = fStringInputPointer;
1139
1140 if (Match(next)) {
1141 //
1142 // Don't set endp if some later
1143 // invocation of the same parentheses
1144 // already has.
1145 //
1146 if (fEndPArrayPointer[no] == NULL)
1147 fEndPArrayPointer[no] = save;
1148 return 1;
1149 } else
1150 return 0;
1151 }
1152 break;
1153
1154 case kRegExpBranch:
1155 {
1156 const char* save;
1157
1158 if (*next != kRegExpBranch) {
1159 // no choice
1160 next = Operand(scan);
1161 // avoid recursion
1162 } else {
1163 do {
1164 save = fStringInputPointer;
1165 if (Match(Operand(scan)))
1166 return 1;
1167 fStringInputPointer = save;
1168 scan = Next(scan);
1169 } while (scan != NULL && *scan == kRegExpBranch);
1170 return 0;
1171 // NOTREACHED
1172 }
1173 }
1174 break;
1175
1176 case kRegExpStar:
1177 case kRegExpPlus:
1178 {
1179 char nextch;
1180 int32 no;
1181 const char* save;
1182 int32 min;
1183
1184 //
1185 // Lookahead to avoid useless match attempts
1186 // when we know what character comes next.
1187 //
1188 nextch = '\0';
1189 if (*next == kRegExpExactly)
1190 nextch = *Operand(next);
1191 min = (*scan == kRegExpStar) ? 0 : 1;
1192 save = fStringInputPointer;
1193 no = Repeat(Operand(scan));
1194 while (no >= min) {
1195 // if it could work, try it
1196 if (nextch == '\0' || *fStringInputPointer == nextch) {
1197 if (Match(next))
1198 return 1;
1199 }
1200 // couldn't or didn't, back up
1201 no--;
1202 fStringInputPointer = save + no;
1203 }
1204 return 0;
1205 }
1206 break;
1207
1208 case kRegExpEnd:
1209 return 1;
1210 // Success!
1211
1212 default:
1213 SetError(REGEXP_MEMORY_CORRUPTION);
1214 return 0;
1215 }
1216
1217 scan = next;
1218 }
1219
1220 //
1221 // We get here only if there's trouble -- normally "case kRegExpEnd" is
1222 // the terminating point.
1223 //
1224 SetError(REGEXP_CORRUPTED_POINTERS);
1225
1226 return 0;
1227 }
1228
1229
1230 //
1231 // - Repeat - repeatedly match something simple, report how many
1232 //
1233 int32
Repeat(const char * p) const1234 RegExp::Repeat(const char* p) const
1235 {
1236 int32 count = 0;
1237 const char* scan;
1238 const char* opnd;
1239
1240 scan = fStringInputPointer;
1241 opnd = Operand(p);
1242 switch (*p) {
1243 case kRegExpAny:
1244 count = (int32)strlen(scan);
1245 scan += count;
1246 break;
1247
1248 case kRegExpExactly:
1249 while (*opnd == *scan) {
1250 count++;
1251 scan++;
1252 }
1253 break;
1254
1255 case kRegExpAnyOf:
1256 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1257 count++;
1258 scan++;
1259 }
1260 break;
1261
1262 case kRegExpAnyBut:
1263 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1264 count++;
1265 scan++;
1266 }
1267 break;
1268
1269 default:
1270 // oh dear, called inappropriately
1271 SetError(REGEXP_INTERNAL_ERROR);
1272 count = 0;
1273 // best compromise
1274 break;
1275 }
1276 fStringInputPointer = scan;
1277
1278 return count;
1279 }
1280
1281
1282 //
1283 // - Next - dig the "next" pointer out of a node
1284 //
1285 char*
Next(char * p)1286 RegExp::Next(char* p)
1287 {
1288 int32 offset;
1289
1290 if (p == &fDummy)
1291 return NULL;
1292
1293 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1294 if (offset == 0)
1295 return NULL;
1296
1297 if (*p == kRegExpBack)
1298 return p - offset;
1299 else
1300 return p + offset;
1301 }
1302
1303
1304 const char*
Next(const char * p) const1305 RegExp::Next(const char* p) const
1306 {
1307 int32 offset;
1308
1309 if (p == &fDummy)
1310 return NULL;
1311
1312 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1313 if (offset == 0)
1314 return NULL;
1315
1316 if (*p == kRegExpBack)
1317 return p - offset;
1318 else
1319 return p + offset;
1320 }
1321
1322
1323 inline int32
UCharAt(const char * p) const1324 RegExp::UCharAt(const char* p) const
1325 {
1326 return (int32)*(unsigned char *)p;
1327 }
1328
1329
1330 inline char*
Operand(char * p) const1331 RegExp::Operand(char* p) const
1332 {
1333 return p + 3;
1334 }
1335
1336
1337 inline const char*
Operand(const char * p) const1338 RegExp::Operand(const char* p) const
1339 {
1340 return p + 3;
1341 }
1342
1343
1344 inline bool
IsMult(char c) const1345 RegExp::IsMult(char c) const
1346 {
1347 return c == '*' || c == '+' || c == '?';
1348 }
1349
1350
1351 #ifdef DEBUG
1352
1353
1354 //
1355 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1356 //
1357 void
Dump()1358 RegExp::Dump()
1359 {
1360 const char* s;
1361 char op = kRegExpExactly;
1362 // Arbitrary non-kRegExpEnd op.
1363 const char* next;
1364
1365 s = fRegExp->program + 1;
1366 while (op != kRegExpEnd) {
1367 // While that wasn't kRegExpEnd last time...
1368 op = *s;
1369 printf("%2ld%s", s - fRegExp->program, Prop(s));
1370 // Where, what.
1371 next = Next(s);
1372 if (next == NULL) {
1373 // next ptr
1374 printf("(0)");
1375 } else
1376 printf("(%ld)", (s - fRegExp->program) + (next - s));
1377
1378 s += 3;
1379 if (op == kRegExpAnyOf || op == kRegExpAnyBut
1380 || op == kRegExpExactly) {
1381 // literal string, where present
1382 while (*s != '\0') {
1383 putchar(*s);
1384 s++;
1385 }
1386 s++;
1387 }
1388 putchar('\n');
1389 }
1390
1391 // header fields of interest
1392 if (fRegExp->regstart != '\0')
1393 printf("start `%c' ", fRegExp->regstart);
1394
1395 if (fRegExp->reganch)
1396 printf("anchored ");
1397
1398 if (fRegExp->regmust != NULL)
1399 printf("must have \"%s\"", fRegExp->regmust);
1400
1401 printf("\n");
1402 }
1403
1404
1405 //
1406 // - Prop - printable representation of opcode
1407 //
1408 char*
Prop(const char * op) const1409 RegExp::Prop(const char* op) const
1410 {
1411 const char* p = NULL;
1412 static char buf[50];
1413
1414 strcpy(buf, ":");
1415
1416 switch (*op) {
1417 case kRegExpBol:
1418 p = "kRegExpBol";
1419 break;
1420
1421 case kRegExpEol:
1422 p = "kRegExpEol";
1423 break;
1424
1425 case kRegExpAny:
1426 p = "kRegExpAny";
1427 break;
1428
1429 case kRegExpAnyOf:
1430 p = "kRegExpAnyOf";
1431 break;
1432
1433 case kRegExpAnyBut:
1434 p = "kRegExpAnyBut";
1435 break;
1436
1437 case kRegExpBranch:
1438 p = "kRegExpBranch";
1439 break;
1440
1441 case kRegExpExactly:
1442 p = "kRegExpExactly";
1443 break;
1444
1445 case kRegExpNothing:
1446 p = "kRegExpNothing";
1447 break;
1448
1449 case kRegExpBack:
1450 p = "kRegExpBack";
1451 break;
1452
1453 case kRegExpEnd:
1454 p = "kRegExpEnd";
1455 break;
1456
1457 case kRegExpOpen + 1:
1458 case kRegExpOpen + 2:
1459 case kRegExpOpen + 3:
1460 case kRegExpOpen + 4:
1461 case kRegExpOpen + 5:
1462 case kRegExpOpen + 6:
1463 case kRegExpOpen + 7:
1464 case kRegExpOpen + 8:
1465 case kRegExpOpen + 9:
1466 sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1467 p = NULL;
1468 break;
1469
1470 case kRegExpClose + 1:
1471 case kRegExpClose + 2:
1472 case kRegExpClose + 3:
1473 case kRegExpClose + 4:
1474 case kRegExpClose + 5:
1475 case kRegExpClose + 6:
1476 case kRegExpClose + 7:
1477 case kRegExpClose + 8:
1478 case kRegExpClose + 9:
1479 sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1480 p = NULL;
1481 break;
1482
1483 case kRegExpStar:
1484 p = "kRegExpStar";
1485 break;
1486
1487 case kRegExpPlus:
1488 p = "kRegExpPlus";
1489 break;
1490
1491 default:
1492 RegExpError("corrupted opcode");
1493 break;
1494 }
1495
1496 if (p != NULL)
1497 strcat(buf, p);
1498
1499 return buf;
1500 }
1501
1502
1503 void
RegExpError(const char *) const1504 RegExp::RegExpError(const char*) const
1505 {
1506 // does nothing now, perhaps it should printf?
1507 }
1508
1509 #endif // DEBUG
1510