xref: /haiku/src/kits/tracker/RegExp.cpp (revision 09bb8269ac18ccf92315ff2a8b785fd06f021499)
1 /*
2 Open Tracker License
3 
4 Terms and Conditions
5 
6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
28 
29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
33 */
34 
35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
36 
37 // @(#)regexp.c	1.3 of 18 April 87
38 //
39 //	Copyright (c) 1986 by University of Toronto.
40 //	Written by Henry Spencer.  Not derived from licensed software.
41 //
42 //	Permission is granted to anyone to use this software for any
43 //	purpose on any computer system, and to redistribute it freely,
44 //	subject to the following restrictions:
45 //
46 //	1. The author is not responsible for the consequences of use of
47 //		this software, no matter how awful, even if they arise
48 //		from defects in it.
49 //
50 //	2. The origin of this software must not be misrepresented, either
51 //		by explicit claim or by omission.
52 //
53 //	3. Altered versions must be plainly marked as such, and must not
54 //		be misrepresented as being the original software.
55 //
56 // Beware that some of this code is subtly aware of the way operator
57 // precedence is structured in regular expressions.  Serious changes in
58 // regular-expression syntax might require a total rethink.
59 //
60 
61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62 // project (www.opentracker.org), Jul 11, 2000.
63 
64 
65 #include <stdlib.h>
66 #include <stdio.h>
67 #include <string.h>
68 
69 #include <Catalog.h>
70 #include <Errors.h>
71 
72 #include "RegExp.h"
73 
74 
75 #undef B_TRANSLATION_CONTEXT
76 #define B_TRANSLATION_CONTEXT "libtracker"
77 
78 
79 // The first byte of the regexp internal "program" is actually this magic
80 // number; the start node begins in the second byte.
81 
82 const uint8 kRegExpMagic = 0234;
83 
84 // The "internal use only" fields in RegExp.h are present to pass info from
85 // compile to execute that permits the execute phase to run lots faster on
86 // simple cases.  They are:
87 //
88 // regstart	char that must begin a match; '\0' if none obvious
89 // reganch	is the match anchored (at beginning-of-line only)?
90 // regmust	string (pointer into program) that match must include, or NULL
91 // regmlen	length of regmust string
92 //
93 // Regstart and reganch permit very fast decisions on suitable starting points
94 // for a match, cutting down the work a lot.  Regmust permits fast rejection
95 // of lines that cannot possibly match.  The regmust tests are costly enough
96 // that Compile() supplies a regmust only if the r.e. contains something
97 // potentially expensive (at present, the only such thing detected is * or +
98 // at the start of the r.e., which can involve a lot of backup). Regmlen is
99 // supplied because the test in RunMatcher() needs it and Compile() is
100 // computing it anyway.
101 //
102 //
103 //
104 // Structure for regexp "program".  This is essentially a linear encoding
105 // of a nondeterministic finite-state machine (aka syntax charts or
106 // "railroad normal form" in parsing technology).  Each node is an opcode
107 // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
108 // all nodes except kRegExpBranch implement concatenation; a "next" pointer
109 // with a kRegExpBranch on both ends of it is connecting two alternatives.
110 // (Here we have one of the subtle syntax dependencies:  an individual
111 // kRegExpBranch (as opposed to a collection of them) is never concatenated
112 // with anything because of operator precedence.)  The operand of some types
113 // of node is a literal string; for others, it is a node leading into a
114 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first
115 // node of the branch. (NB this is* not* a tree structure:  the tail of the
116 // branch connects to the thing following the set of kRegExpBranches).
117 // The opcodes are:
118 //
119 
120 // definition	number	opnd?	meaning
121 enum {
122 	kRegExpEnd = 0,		// no	End of program.
123 	kRegExpBol = 1,		// no	Match "" at beginning of line.
124 	kRegExpEol = 2,		// no	Match "" at end of line.
125 	kRegExpAny = 3,		// no	Match any one character.
126 	kRegExpAnyOf = 4,	// str	Match any character in this string.
127 	kRegExpAnyBut =	5,	// str	Match any character not in this string.
128 	kRegExpBranch =	6,	// node	Match this alternative, or the next...
129 	kRegExpBack = 7,	// no	Match "", "next" ptr points backward.
130 	kRegExpExactly = 8,	// str	Match this string.
131 	kRegExpNothing = 9,	// no	Match empty string.
132 	kRegExpStar = 10,	// node	Match this (simple) thing 0 or more times.
133 	kRegExpPlus = 11,	// node	Match this (simple) thing 1 or more times.
134 	kRegExpOpen	= 20,	// no	Mark this point in input as start of #n.
135 							//	kRegExpOpen + 1 is number 1, etc.
136 	kRegExpClose = 30	// no	Analogous to kRegExpOpen.
137 };
138 
139 //
140 // Opcode notes:
141 //
142 // kRegExpBranch	The set of branches constituting a single choice are
143 //		hooked together with their "next" pointers, since precedence prevents
144 //		anything being concatenated to any individual branch.  The
145 //		"next" pointer of the last kRegExpBranch in a choice points to the
146 //		thing following the whole choice.  This is also where the
147 //		final "next" pointer of each individual branch points; each
148 //		branch starts with the operand node of a kRegExpBranch node.
149 //
150 // kRegExpBack		Normal "next" pointers all implicitly point forward;
151 //		kRegExpBack exists to make loop structures possible.
152 //
153 // kRegExpStar,kRegExpPlus	'?', and complex '*' and '+', are implemented as
154 //		circular kRegExpBranch structures using kRegExpBack.  Simple cases
155 //		(one character per match) are implemented with kRegExpStar and
156 //		kRegExpPlus for speed and to minimize recursive plunges.
157 //
158 // kRegExpOpen,kRegExpClose	...are numbered at compile time.
159 //
160 //
161 //
162 // A node is one char of opcode followed by two chars of "next" pointer.
163 // "Next" pointers are stored as two 8-bit pieces, high order first.  The
164 // value is a positive offset from the opcode of the node containing it.
165 // An operand, if any, simply follows the node.  (Note that much of the
166 // code generation knows about this implicit relationship.)
167 //
168 // Using two bytes for the "next" pointer is vast overkill for most things,
169 // but allows patterns to get big without disasters.
170 //
171 
172 const char* kMeta = "^$.[()|?+*\\";
173 const int32 kMaxSize = 32767L;
174 	// Probably could be 65535L.
175 
176 
177 // Flags to be passed up and down:
178 enum {
179 	kHasWidth =	01,	// Known never to match null string.
180 	kSimple = 02,	// Simple enough to be kRegExpStar/kRegExpPlus operand.
181 	kSPStart = 04,	// Starts with * or +.
182 	kWorst = 0	// Worst case.
183 };
184 
185 
186 const char* kRegExpErrorStringArray[] = {
187 	B_TRANSLATE_MARK("Unmatched parenthesis."),
188 	B_TRANSLATE_MARK("Expression too long."),
189 	B_TRANSLATE_MARK("Too many parenthesis."),
190 	B_TRANSLATE_MARK("Junk on end."),
191 	B_TRANSLATE_MARK("*+? operand may be empty."),
192 	B_TRANSLATE_MARK("Nested *?+."),
193 	B_TRANSLATE_MARK("Invalid bracket range."),
194 	B_TRANSLATE_MARK("Unmatched brackets."),
195 	B_TRANSLATE_MARK("Internal error."),
196 	B_TRANSLATE_MARK("?+* follows nothing."),
197 	B_TRANSLATE_MARK("Trailing \\."),
198 	B_TRANSLATE_MARK("Corrupted expression."),
199 	B_TRANSLATE_MARK("Memory corruption."),
200 	B_TRANSLATE_MARK("Corrupted pointers."),
201 	B_TRANSLATE_MARK("Corrupted opcode.")
202 };
203 
204 
205 #ifdef DEBUG
206 int32 regnarrate = 0;
207 #endif
208 
209 
210 //	#pragma mark - RegExp
211 
212 
RegExp()213 RegExp::RegExp()
214 	:
215 	fError(B_OK),
216 	fRegExp(NULL),
217 	fInputScanPointer(NULL),
218 	fParenthesisCount(0),
219 	fDummy('\0'),
220 	fCodeEmitPointer(NULL),
221 	fCodeSize(0),
222 	fStringInputPointer(NULL),
223 	fRegBol(NULL),
224 	fStartPArrayPointer(NULL),
225 	fEndPArrayPointer(NULL)
226 {
227 }
228 
229 
RegExp(const char * pattern)230 RegExp::RegExp(const char* pattern)
231 	:
232 	fError(B_OK),
233 	fRegExp(NULL)
234 {
235 	fRegExp = Compile(pattern);
236 }
237 
238 
RegExp(const BString & pattern)239 RegExp::RegExp(const BString& pattern)
240 	:
241 	fError(B_OK),
242 	fRegExp(NULL)
243 {
244 	fRegExp = Compile(pattern.String());
245 }
246 
247 
~RegExp()248 RegExp::~RegExp()
249 {
250 	free(fRegExp);
251 }
252 
253 
254 status_t
InitCheck() const255 RegExp::InitCheck() const
256 {
257 	return fError;
258 }
259 
260 
261 status_t
SetTo(const char * pattern)262 RegExp::SetTo(const char* pattern)
263 {
264 	fError = B_OK;
265 	free(fRegExp);
266 	fRegExp = Compile(pattern);
267 	return fError;
268 }
269 
270 
271 status_t
SetTo(const BString & pattern)272 RegExp::SetTo(const BString& pattern)
273 {
274 	fError = B_OK;
275 	free(fRegExp);
276 	fRegExp = Compile(pattern.String());
277 	return fError;
278 }
279 
280 
281 bool
Matches(const char * string) const282 RegExp::Matches(const char* string) const
283 {
284 	if (fRegExp == NULL || string == NULL)
285 		return false;
286 
287 	return RunMatcher(fRegExp, string) == 1;
288 }
289 
290 
291 bool
Matches(const BString & string) const292 RegExp::Matches(const BString& string) const
293 {
294 	if (fRegExp == NULL)
295 		return false;
296 
297 	return RunMatcher(fRegExp, string.String()) == 1;
298 }
299 
300 
301 //
302 // - Compile - compile a regular expression into internal code
303 //
304 // We can't allocate space until we know how big the compiled form will be,
305 // but we can't compile it (and thus know how big it is) until we've got a
306 // place to put the code.  So we cheat:  we compile it twice, once with code
307 // generation turned off and size counting turned on, and once "for real".
308 // This also means that we don't allocate space until we are sure that the
309 // thing really will compile successfully, and we never have to move the
310 // code and thus invalidate pointers into it.  (Note that it has to be in
311 // one piece because free() must be able to free it all.)
312 //
313 // Beware that the optimization-preparation code in here knows about some
314 // of the structure of the compiled regexp.
315 regexp*
Compile(const char * exp)316 RegExp::Compile(const char* exp)
317 {
318 	regexp* r;
319 	const char* scan;
320 	const char* longest;
321 	int32 len;
322 	int32 flags;
323 
324 	if (exp == NULL) {
325 		SetError(B_BAD_VALUE);
326 		return NULL;
327 	}
328 
329 	// First pass: determine size, legality.
330 	fInputScanPointer = exp;
331 	fParenthesisCount = 1;
332 	fCodeSize = 0L;
333 	fCodeEmitPointer = &fDummy;
334 	Char(kRegExpMagic);
335 	if (Reg(0, &flags) == NULL)
336 		return NULL;
337 
338 	// Small enough for pointer-storage convention?
339 	if (fCodeSize >= kMaxSize) {
340 		SetError(REGEXP_TOO_BIG);
341 		return NULL;
342 	}
343 
344 	r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
345 		// Allocate space
346 
347 	if (r == NULL) {
348 		SetError(B_NO_MEMORY);
349 		return NULL;
350 	}
351 
352 	// Second pass: emit code.
353 	fInputScanPointer = exp;
354 	fParenthesisCount = 1;
355 	fCodeEmitPointer = r->program;
356 	Char(kRegExpMagic);
357 	if (Reg(0, &flags) == NULL) {
358 		free(r);
359 		return NULL;
360 	}
361 
362 	// Dig out information for optimizations.
363 	r->regstart = '\0';
364 		// Worst-case defaults.
365 	r->reganch = 0;
366 	r->regmust = NULL;
367 	r->regmlen = 0;
368 	scan = r->program + 1;
369 		// First kRegExpBranch.
370 	if (*Next((char*)scan) == kRegExpEnd) {
371 		// Only one top-level choice.
372 		scan = Operand(scan);
373 
374 		// Starting-point info.
375 		if (*scan == kRegExpExactly)
376 			r->regstart = *Operand(scan);
377 		else if (*scan == kRegExpBol)
378 			r->reganch++;
379 
380 		// If there's something expensive in the r.e., find the
381 		// longest literal string that must appear and make it the
382 		// regmust.  Resolve ties in favor of later strings, since
383 		// the regstart check works with the beginning of the r.e.
384 		// and avoiding duplication strengthens checking.  Not a
385 		// strong reason, but sufficient in the absence of others.
386 		if ((flags & kSPStart) != 0) {
387 			longest = NULL;
388 			len = 0;
389 			for (; scan != NULL; scan = Next((char*)scan)) {
390 				if (*scan == kRegExpExactly
391 					&& (int32)strlen(Operand(scan)) >= len) {
392 					longest = Operand(scan);
393 					len = (int32)strlen(Operand(scan));
394 				}
395 			}
396 			r->regmust = longest;
397 			r->regmlen = len;
398 		}
399 	}
400 
401 	return r;
402 }
403 
404 
405 regexp*
Expression() const406 RegExp::Expression() const
407 {
408 	return fRegExp;
409 }
410 
411 
412 const char*
ErrorString() const413 RegExp::ErrorString() const
414 {
415 	if (fError >= REGEXP_UNMATCHED_PARENTHESIS
416 		&& fError <= REGEXP_CORRUPTED_OPCODE) {
417 		return B_TRANSLATE_NOCOLLECT(
418 			kRegExpErrorStringArray[fError - B_ERRORS_END]);
419 	}
420 
421 	return strerror(fError);
422 }
423 
424 
425 void
SetError(status_t error) const426 RegExp::SetError(status_t error) const
427 {
428 	fError = error;
429 }
430 
431 
432 //
433 // - Reg - regular expression, i.e. main body or parenthesized thing
434 //
435 // Caller must absorb opening parenthesis.
436 //
437 // Combining parenthesis handling with the base level of regular expression
438 // is a trifle forced, but the need to tie the tails of the branches to what
439 // follows makes it hard to avoid.
440 //
441 char*
Reg(int32 paren,int32 * flagp)442 RegExp::Reg(int32 paren, int32* flagp)
443 {
444 	char* ret;
445 	char* br;
446 	char* ender;
447 	int32 parno = 0;
448 	int32 flags;
449 
450 	*flagp = kHasWidth;
451 		// Tentatively.
452 
453 	// Make an kRegExpOpen node, if parenthesized.
454 	if (paren) {
455 		if (fParenthesisCount >= kSubExpressionMax) {
456 			SetError(REGEXP_TOO_MANY_PARENTHESIS);
457 			return NULL;
458 		}
459 		parno = fParenthesisCount;
460 		fParenthesisCount++;
461 		ret = Node((char)(kRegExpOpen + parno));
462 	} else
463 		ret = NULL;
464 
465 	// Pick up the branches, linking them together.
466 	br = Branch(&flags);
467 	if (br == NULL)
468 		return NULL;
469 
470 	if (ret != NULL) {
471 		Tail(ret, br);
472 			// kRegExpOpen -> first
473 	} else
474 		ret = br;
475 
476 	if (!(flags & kHasWidth))
477 		*flagp &= ~kHasWidth;
478 
479 	*flagp |= flags&kSPStart;
480 	while (*fInputScanPointer == '|') {
481 		fInputScanPointer++;
482 		br = Branch(&flags);
483 		if (br == NULL)
484 			return NULL;
485 
486 		Tail(ret, br);
487 			// kRegExpBranch -> kRegExpBranch.
488 		if (!(flags & kHasWidth))
489 			*flagp &= ~kHasWidth;
490 
491 		*flagp |= flags&kSPStart;
492 	}
493 
494 	// Make a closing node, and hook it on the end.
495 	ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
496 	Tail(ret, ender);
497 
498 	// Hook the tails of the branches to the closing node.
499 	for (br = ret; br != NULL; br = Next(br))
500 		OpTail(br, ender);
501 
502 	// Check for proper termination.
503 	if (paren && *fInputScanPointer++ != ')') {
504 		SetError(REGEXP_UNMATCHED_PARENTHESIS);
505 		return NULL;
506 	} else if (!paren && *fInputScanPointer != '\0') {
507 		if (*fInputScanPointer == ')') {
508 			SetError(REGEXP_UNMATCHED_PARENTHESIS);
509 			return NULL;
510 		} else {
511 			SetError(REGEXP_JUNK_ON_END);
512 			return NULL;	//  "Can't happen".
513 		}
514 		// NOTREACHED
515 	}
516 
517 	return ret;
518 }
519 
520 
521 //
522 // - Branch - one alternative of an | operator
523 //
524 // Implements the concatenation operator.
525 //
526 char*
Branch(int32 * flagp)527 RegExp::Branch(int32* flagp)
528 {
529 	char* ret;
530 	char* chain;
531 	char* latest;
532 	int32 flags;
533 
534 	*flagp = kWorst;
535 		// Tentatively.
536 
537 	ret = Node(kRegExpBranch);
538 	chain = NULL;
539 	while (*fInputScanPointer != '\0'
540 		&& *fInputScanPointer != '|'
541 		&& *fInputScanPointer != ')') {
542 		latest = Piece(&flags);
543 		if (latest == NULL)
544 			return NULL;
545 
546 		*flagp |= flags & kHasWidth;
547 		if (chain == NULL) {
548 			// First piece.
549 			*flagp |= flags & kSPStart;
550 		} else
551 			Tail(chain, latest);
552 
553 		chain = latest;
554 	}
555 
556 	if (chain == NULL) {
557 		// Loop ran zero times.
558 		Node(kRegExpNothing);
559 	}
560 
561 	return ret;
562 }
563 
564 
565 //
566 // - Piece - something followed by possible [*+?]
567 //
568 // Note that the branching code sequences used for ? and the general cases
569 // of * and + are somewhat optimized:  they use the same kRegExpNothing node
570 // as both the endmarker for their branch list and the body of the last
571 // branch. It might seem that this node could be dispensed with entirely,
572 // but the endmarker role is not redundant.
573 //
574 char*
Piece(int32 * flagp)575 RegExp::Piece(int32* flagp)
576 {
577 	char* ret;
578 	char op;
579 	char* next;
580 	int32 flags;
581 
582 	ret = Atom(&flags);
583 	if (ret == NULL)
584 		return NULL;
585 
586 	op = *fInputScanPointer;
587 	if (!IsMult(op)) {
588 		*flagp = flags;
589 		return ret;
590 	}
591 
592 	if (!(flags & kHasWidth) && op != '?') {
593 		SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
594 		return NULL;
595 	}
596 	*flagp = op != '+' ? kWorst | kSPStart :  kWorst | kHasWidth;
597 
598 	if (op == '*' && (flags & kSimple))
599 		Insert(kRegExpStar, ret);
600 	else if (op == '*') {
601 		// Emit x* as (x&|), where & means "self".
602 		Insert(kRegExpBranch, ret);
603 			// Either x
604 		OpTail(ret, Node(kRegExpBack));
605 			// and loop
606 		OpTail(ret, ret);
607 			// back
608 		Tail(ret, Node(kRegExpBranch));
609 			// or
610 		Tail(ret, Node(kRegExpNothing));
611 			// null.
612 	} else if (op == '+' && (flags & kSimple))
613 		Insert(kRegExpPlus, ret);
614 	else if (op == '+') {
615 		// Emit x+ as x(&|), where & means "self".
616 		next = Node(kRegExpBranch);
617 			// Either
618 		Tail(ret, next);
619 		Tail(Node(kRegExpBack), ret);
620 			// loop back
621 		Tail(next, Node(kRegExpBranch));
622 			// or
623 		Tail(ret, Node(kRegExpNothing));
624 			// null.
625 	} else if (op == '?') {
626 		// Emit x? as (x|)
627 		Insert(kRegExpBranch, ret);
628 			// Either x
629 		Tail(ret, Node(kRegExpBranch));
630 			// or
631 		next = Node(kRegExpNothing);
632 			// null.
633 		Tail(ret, next);
634 		OpTail(ret, next);
635 	}
636 
637 	fInputScanPointer++;
638 	if (IsMult(*fInputScanPointer)) {
639 		SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
640 		return NULL;
641 	}
642 
643 	return ret;
644 }
645 
646 
647 //
648 // - Atom - the lowest level
649 //
650 // Optimization:  gobbles an entire sequence of ordinary characters so that
651 // it can turn them into a single node, which is smaller to store and
652 // faster to run.  Backslashed characters are exceptions, each becoming a
653 // separate node; the code is simpler that way and it's not worth fixing.
654 //
655 char*
Atom(int32 * flagp)656 RegExp::Atom(int32* flagp)
657 {
658 	char* ret;
659 	int32 flags;
660 
661 	*flagp = kWorst;
662 		// tentatively
663 
664 	switch (*fInputScanPointer++) {
665 		case '^':
666 			ret = Node(kRegExpBol);
667 			break;
668 
669 		case '$':
670 			ret = Node(kRegExpEol);
671 			break;
672 
673 		case '.':
674 			ret = Node(kRegExpAny);
675 			*flagp |= kHasWidth|kSimple;
676 			break;
677 
678 		case '[':
679 		{
680 			int32 cclass;
681 			int32 classend;
682 
683 			if (*fInputScanPointer == '^') {
684 				// complement of range
685 				ret = Node(kRegExpAnyBut);
686 				fInputScanPointer++;
687 			} else
688 				ret = Node(kRegExpAnyOf);
689 			if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
690 				Char(*fInputScanPointer++);
691 			while (*fInputScanPointer != '\0'
692 				&& *fInputScanPointer != ']') {
693 				if (*fInputScanPointer == '-') {
694 					fInputScanPointer++;
695 					if (*fInputScanPointer == ']'
696 						|| *fInputScanPointer == '\0') {
697 						Char('-');
698 					} else {
699 						cclass = UCharAt(fInputScanPointer - 2) + 1;
700 						classend = UCharAt(fInputScanPointer);
701 						if (cclass > classend + 1) {
702 							SetError(REGEXP_INVALID_BRACKET_RANGE);
703 							return NULL;
704 						}
705 						for (; cclass <= classend; cclass++)
706 							Char((char)cclass);
707 						fInputScanPointer++;
708 					}
709 				} else
710 					Char(*fInputScanPointer++);
711 			}
712 			Char('\0');
713 			if (*fInputScanPointer != ']') {
714 				SetError(REGEXP_UNMATCHED_BRACKET);
715 				return NULL;
716 			}
717 			fInputScanPointer++;
718 			*flagp |= kHasWidth | kSimple;
719 			break;
720 		}
721 
722 		case '(':
723 			ret = Reg(1, &flags);
724 			if (ret == NULL)
725 				return NULL;
726 			*flagp |= flags & (kHasWidth | kSPStart);
727 			break;
728 
729 		case '\0':
730 		case '|':
731 		case ')':
732 			SetError(REGEXP_INTERNAL_ERROR);
733 			return NULL;
734 				//  supposed to be caught earlier
735 
736 		case '?':
737 		case '+':
738 		case '*':
739 			SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
740 			return NULL;
741 
742 		case '\\':
743 			if (*fInputScanPointer == '\0') {
744 				SetError(REGEXP_TRAILING_BACKSLASH);
745 				return NULL;
746 			}
747 			ret = Node(kRegExpExactly);
748 			Char(*fInputScanPointer++);
749 			Char('\0');
750 			*flagp |= kHasWidth|kSimple;
751 			break;
752 
753 		default:
754 		{
755 			int32 len;
756 			char ender;
757 
758 			fInputScanPointer--;
759 			len = (int32)strcspn(fInputScanPointer, kMeta);
760 			if (len <= 0) {
761 				SetError(REGEXP_INTERNAL_ERROR);
762 				return NULL;
763 			}
764 
765 			ender = *(fInputScanPointer + len);
766 			if (len > 1 && IsMult(ender)) {
767 				// Back off clear of ?+* operand.
768 				len--;
769 			}
770 
771 			*flagp |= kHasWidth;
772 			if (len == 1)
773 				*flagp |= kSimple;
774 
775 			ret = Node(kRegExpExactly);
776 			while (len > 0) {
777 				Char(*fInputScanPointer++);
778 				len--;
779 			}
780 
781 			Char('\0');
782 			break;
783 		}
784 	}
785 
786 	return ret;
787 }
788 
789 
790 //
791 // - Node - emit a node
792 //
793 // Returns location.
794 //
795 char*
Node(char op)796 RegExp::Node(char op)
797 {
798 	char* ret;
799 	char* ptr;
800 
801 	ret = fCodeEmitPointer;
802 	if (ret == &fDummy) {
803 		fCodeSize += 3;
804 		return ret;
805 	}
806 
807 	ptr = ret;
808 	*ptr++ = op;
809 	*ptr++ = '\0';
810 		// Null "next" pointer.
811 	*ptr++ = '\0';
812 	fCodeEmitPointer = ptr;
813 
814 	return ret;
815 }
816 
817 
818 //
819 // - Char - emit (if appropriate) a byte of code
820 //
821 void
Char(char b)822 RegExp::Char(char b)
823 {
824 	if (fCodeEmitPointer != &fDummy)
825 		*fCodeEmitPointer++ = b;
826 	else
827 		fCodeSize++;
828 }
829 
830 
831 //
832 // - Insert - insert an operator in front of already-emitted operand
833 //
834 // Means relocating the operand.
835 //
836 void
Insert(char op,char * opnd)837 RegExp::Insert(char op, char* opnd)
838 {
839 	char* src;
840 	char* dst;
841 	char* place;
842 
843 	if (fCodeEmitPointer == &fDummy) {
844 		fCodeSize += 3;
845 		return;
846 	}
847 
848 	src = fCodeEmitPointer;
849 	fCodeEmitPointer += 3;
850 	dst = fCodeEmitPointer;
851 	while (src > opnd)
852 		*--dst = *--src;
853 
854 	place = opnd;
855 		// Op node, where operand used to be.
856 	*place++ = op;
857 	*place++ = '\0';
858 	*place++ = '\0';
859 }
860 
861 
862 //
863 // - Tail - set the next-pointer at the end of a node chain
864 //
865 void
Tail(char * p,char * val)866 RegExp::Tail(char* p, char* val)
867 {
868 	char* scan;
869 	char* temp;
870 	int32 offset;
871 
872 	if (p == &fDummy)
873 		return;
874 
875 	// Find last node.
876 	scan = p;
877 	for (;;) {
878 		temp = Next(scan);
879 		if (temp == NULL)
880 			break;
881 		scan = temp;
882 	}
883 
884 	if (scan[0] == kRegExpBack)
885 		offset = scan - val;
886 	else
887 		offset = val - scan;
888 
889 	scan[1] = (char)((offset >> 8) & 0377);
890 	scan[2] = (char)(offset & 0377);
891 }
892 
893 
894 //
895 // - OpTail - Tail on operand of first argument; nop if operandless
896 //
897 void
OpTail(char * p,char * val)898 RegExp::OpTail(char* p, char* val)
899 {
900 	// "Operandless" and "op != kRegExpBranch" are synonymous in practice.
901 	if (p == NULL || p == &fDummy || *p != kRegExpBranch)
902 		return;
903 
904 	Tail(Operand(p), val);
905 }
906 
907 //
908 // RunMatcher and friends
909 //
910 
911 
912 //
913 // - RunMatcher - match a regexp against a string
914 //
915 int32
RunMatcher(regexp * prog,const char * string) const916 RegExp::RunMatcher(regexp* prog, const char* string) const
917 {
918 	const char* s;
919 
920 	// be paranoid...
921 	if (prog == NULL || string == NULL) {
922 		SetError(B_BAD_VALUE);
923 		return 0;
924 	}
925 
926 	// check validity of program
927 	if (UCharAt(prog->program) != kRegExpMagic) {
928 		SetError(REGEXP_CORRUPTED_PROGRAM);
929 		return 0;
930 	}
931 
932 	// if there is a "must appear" string, look for it
933 	if (prog->regmust != NULL) {
934 		s = string;
935 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
936 			if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) {
937 				// found it
938 				break;
939 			}
940 			s++;
941 		}
942 		if (s == NULL) {
943 			// not present
944 			return 0;
945 		}
946 	}
947 
948 	// mark beginning of line for ^
949 	fRegBol = string;
950 
951 	// simplest case:  anchored match need be tried only once
952 	if (prog->reganch)
953 		return Try(prog, (char*)string);
954 
955 	// messy cases: unanchored match
956 	s = string;
957 	if (prog->regstart != '\0') {
958 		// we know what char it must start with
959 		while ((s = strchr(s, prog->regstart)) != NULL) {
960 			if (Try(prog, (char*)s))
961 				return 1;
962 			s++;
963 		}
964 	} else {
965 		// we don't -- general case.
966 		do {
967 			if (Try(prog, (char*)s))
968 				return 1;
969 		} while (*s++ != '\0');
970 	}
971 
972 	// failure
973 	return 0;
974 }
975 
976 
977 //
978 // - Try - try match at specific point
979 //
980 // Returns 0 on failure, 1 on success.
981 //
982 int32
Try(regexp * prog,const char * string) const983 RegExp::Try(regexp* prog, const char* string) const
984 {
985 	int32 i;
986 	const char **sp;
987 	const char **ep;
988 
989 	fStringInputPointer = string;
990 	fStartPArrayPointer = prog->startp;
991 	fEndPArrayPointer = prog->endp;
992 
993 	sp = prog->startp;
994 	ep = prog->endp;
995 	for (i = kSubExpressionMax; i > 0; i--) {
996 		*sp++ = NULL;
997 		*ep++ = NULL;
998 	}
999 	if (Match(prog->program + 1)) {
1000 		prog->startp[0] = string;
1001 		prog->endp[0] = fStringInputPointer;
1002 		return 1;
1003 	} else
1004 		return 0;
1005 }
1006 
1007 
1008 //
1009 // - Match - main matching routine
1010 //
1011 // Conceptually the strategy is simple:  check to see whether the current
1012 // node matches, call self recursively to see whether the rest matches,
1013 // and then act accordingly.  In practice we make some effort to avoid
1014 // recursion, in particular by going through "ordinary" nodes (that don't
1015 // need to know whether the rest of the match failed) by a loop instead of
1016 // by recursion.
1017 //
1018 // Returns 0 on failure, 1 on success.
1019 //
1020 int32
Match(const char * prog) const1021 RegExp::Match(const char* prog) const
1022 {
1023 	const char* scan;	// Current node.
1024 	const char* next;		// Next node.
1025 
1026 	scan = prog;
1027 #ifdef DEBUG
1028 	if (scan != NULL && regnarrate)
1029 		fprintf(stderr, "%s(\n", Prop(scan));
1030 #endif
1031 	while (scan != NULL) {
1032 #ifdef DEBUG
1033 		if (regnarrate)
1034 			fprintf(stderr, "%s...\n", Prop(scan));
1035 #endif
1036 		next = Next(scan);
1037 
1038 		switch (*scan) {
1039 			case kRegExpBol:
1040 				if (fStringInputPointer != fRegBol)
1041 					return 0;
1042 				break;
1043 
1044 			case kRegExpEol:
1045 				if (*fStringInputPointer != '\0')
1046 					return 0;
1047 				break;
1048 
1049 			case kRegExpAny:
1050 				if (*fStringInputPointer == '\0')
1051 					return 0;
1052 				fStringInputPointer++;
1053 				break;
1054 
1055 			case kRegExpExactly:
1056 			{
1057 				const char* opnd = Operand(scan);
1058 				// Inline the first character, for speed.
1059 				if (*opnd != *fStringInputPointer)
1060 					return 0;
1061 
1062 				uint32 len = strlen(opnd);
1063 				if (len > 1
1064 					&& strncmp(opnd, fStringInputPointer, len) != 0) {
1065 					return 0;
1066 				}
1067 
1068 				fStringInputPointer += len;
1069 			}
1070 			break;
1071 
1072 			case kRegExpAnyOf:
1073 				if (*fStringInputPointer == '\0'
1074 					|| strchr(Operand(scan), *fStringInputPointer) == NULL) {
1075 					return 0;
1076 				}
1077 				fStringInputPointer++;
1078 				break;
1079 
1080 			case kRegExpAnyBut:
1081 				if (*fStringInputPointer == '\0'
1082 					|| strchr(Operand(scan), *fStringInputPointer) != NULL) {
1083 					return 0;
1084 				}
1085 				fStringInputPointer++;
1086 				break;
1087 
1088 			case kRegExpNothing:
1089 				break;
1090 
1091 			case kRegExpBack:
1092 				break;
1093 
1094 			case kRegExpOpen + 1:
1095 			case kRegExpOpen + 2:
1096 			case kRegExpOpen + 3:
1097 			case kRegExpOpen + 4:
1098 			case kRegExpOpen + 5:
1099 			case kRegExpOpen + 6:
1100 			case kRegExpOpen + 7:
1101 			case kRegExpOpen + 8:
1102 			case kRegExpOpen + 9:
1103 			{
1104 				int32 no;
1105 				const char* save;
1106 
1107 				no = *scan - kRegExpOpen;
1108 				save = fStringInputPointer;
1109 
1110 				if (Match(next)) {
1111 					//
1112 					// Don't set startp if some later
1113 					// invocation of the same parentheses
1114 					// already has.
1115 					//
1116 					if (fStartPArrayPointer[no] == NULL)
1117 						fStartPArrayPointer[no] = save;
1118 					return 1;
1119 				} else
1120 					return 0;
1121 			}
1122 			break;
1123 
1124 			case kRegExpClose + 1:
1125 			case kRegExpClose + 2:
1126 			case kRegExpClose + 3:
1127 			case kRegExpClose + 4:
1128 			case kRegExpClose + 5:
1129 			case kRegExpClose + 6:
1130 			case kRegExpClose + 7:
1131 			case kRegExpClose + 8:
1132 			case kRegExpClose + 9:
1133 			{
1134 				int32 no;
1135 				const char* save;
1136 
1137 				no = *scan - kRegExpClose;
1138 				save = fStringInputPointer;
1139 
1140 				if (Match(next)) {
1141 					//
1142 					// Don't set endp if some later
1143 					// invocation of the same parentheses
1144 					// already has.
1145 					//
1146 					if (fEndPArrayPointer[no] == NULL)
1147 						fEndPArrayPointer[no] = save;
1148 					return 1;
1149 				} else
1150 					return 0;
1151 			}
1152 			break;
1153 
1154 			case kRegExpBranch:
1155 			{
1156 				const char* save;
1157 
1158 				if (*next != kRegExpBranch) {
1159 					// no choice
1160 					next = Operand(scan);
1161 						// avoid recursion
1162 				} else {
1163 					do {
1164 						save = fStringInputPointer;
1165 						if (Match(Operand(scan)))
1166 							return 1;
1167 						fStringInputPointer = save;
1168 						scan = Next(scan);
1169 					} while (scan != NULL && *scan == kRegExpBranch);
1170 					return 0;
1171 					// NOTREACHED
1172 				}
1173 			}
1174 			break;
1175 
1176 			case kRegExpStar:
1177 			case kRegExpPlus:
1178 				{
1179 					char nextch;
1180 					int32 no;
1181 					const char* save;
1182 					int32 min;
1183 
1184 					//
1185 					// Lookahead to avoid useless match attempts
1186 					// when we know what character comes next.
1187 					//
1188 					nextch = '\0';
1189 					if (*next == kRegExpExactly)
1190 						nextch = *Operand(next);
1191 					min = (*scan == kRegExpStar) ? 0 : 1;
1192 					save = fStringInputPointer;
1193 					no = Repeat(Operand(scan));
1194 					while (no >= min) {
1195 						// if it could work, try it
1196 						if (nextch == '\0' || *fStringInputPointer == nextch) {
1197 							if (Match(next))
1198 								return 1;
1199 						}
1200 						// couldn't or didn't, back up
1201 						no--;
1202 						fStringInputPointer = save + no;
1203 					}
1204 					return 0;
1205 				}
1206 				break;
1207 
1208 			case kRegExpEnd:
1209 				return 1;
1210 					// Success!
1211 
1212 			default:
1213 				SetError(REGEXP_MEMORY_CORRUPTION);
1214 				return 0;
1215 			}
1216 
1217 		scan = next;
1218 	}
1219 
1220 	//
1221 	// We get here only if there's trouble -- normally "case kRegExpEnd" is
1222 	// the terminating point.
1223 	//
1224 	SetError(REGEXP_CORRUPTED_POINTERS);
1225 
1226 	return 0;
1227 }
1228 
1229 
1230 //
1231 // - Repeat - repeatedly match something simple, report how many
1232 //
1233 int32
Repeat(const char * p) const1234 RegExp::Repeat(const char* p) const
1235 {
1236 	int32 count = 0;
1237 	const char* scan;
1238 	const char* opnd;
1239 
1240 	scan = fStringInputPointer;
1241 	opnd = Operand(p);
1242 	switch (*p) {
1243 		case kRegExpAny:
1244 			count = (int32)strlen(scan);
1245 			scan += count;
1246 			break;
1247 
1248 		case kRegExpExactly:
1249 			while (*opnd == *scan) {
1250 				count++;
1251 				scan++;
1252 			}
1253 			break;
1254 
1255 		case kRegExpAnyOf:
1256 			while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1257 				count++;
1258 				scan++;
1259 			}
1260 			break;
1261 
1262 		case kRegExpAnyBut:
1263 			while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1264 				count++;
1265 				scan++;
1266 			}
1267 			break;
1268 
1269 		default:
1270 			// oh dear, called inappropriately
1271 			SetError(REGEXP_INTERNAL_ERROR);
1272 			count = 0;
1273 				// best compromise
1274 			break;
1275 	}
1276 	fStringInputPointer = scan;
1277 
1278 	return count;
1279 }
1280 
1281 
1282 //
1283 // - Next - dig the "next" pointer out of a node
1284 //
1285 char*
Next(char * p)1286 RegExp::Next(char* p)
1287 {
1288 	int32 offset;
1289 
1290 	if (p == &fDummy)
1291 		return NULL;
1292 
1293 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1294 	if (offset == 0)
1295 		return NULL;
1296 
1297 	if (*p == kRegExpBack)
1298 		return p - offset;
1299 	else
1300 		return p + offset;
1301 }
1302 
1303 
1304 const char*
Next(const char * p) const1305 RegExp::Next(const char* p) const
1306 {
1307 	int32 offset;
1308 
1309 	if (p == &fDummy)
1310 		return NULL;
1311 
1312 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1313 	if (offset == 0)
1314 		return NULL;
1315 
1316 	if (*p == kRegExpBack)
1317 		return p - offset;
1318 	else
1319 		return p + offset;
1320 }
1321 
1322 
1323 inline int32
UCharAt(const char * p) const1324 RegExp::UCharAt(const char* p) const
1325 {
1326 	return (int32)*(unsigned char *)p;
1327 }
1328 
1329 
1330 inline char*
Operand(char * p) const1331 RegExp::Operand(char* p) const
1332 {
1333 	return p + 3;
1334 }
1335 
1336 
1337 inline const char*
Operand(const char * p) const1338 RegExp::Operand(const char* p) const
1339 {
1340 	return p + 3;
1341 }
1342 
1343 
1344 inline bool
IsMult(char c) const1345 RegExp::IsMult(char c) const
1346 {
1347 	return c == '*' || c == '+' || c == '?';
1348 }
1349 
1350 
1351 #ifdef DEBUG
1352 
1353 
1354 //
1355 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1356 //
1357 void
Dump()1358 RegExp::Dump()
1359 {
1360 	const char* s;
1361 	char op = kRegExpExactly;
1362 		// Arbitrary non-kRegExpEnd op.
1363 	const char* next;
1364 
1365 	s = fRegExp->program + 1;
1366 	while (op != kRegExpEnd) {
1367 		// While that wasn't kRegExpEnd last time...
1368 		op = *s;
1369 		printf("%2ld%s", s - fRegExp->program, Prop(s));
1370 			// Where, what.
1371 		next = Next(s);
1372 		if (next == NULL) {
1373 			// next ptr
1374 			printf("(0)");
1375 		} else
1376 			printf("(%ld)", (s - fRegExp->program) + (next - s));
1377 
1378 		s += 3;
1379 		if (op == kRegExpAnyOf || op == kRegExpAnyBut
1380 			|| op == kRegExpExactly) {
1381 			// literal string, where present
1382 			while (*s != '\0') {
1383 				putchar(*s);
1384 				s++;
1385 			}
1386 			s++;
1387 		}
1388 		putchar('\n');
1389 	}
1390 
1391 	// header fields of interest
1392 	if (fRegExp->regstart != '\0')
1393 		printf("start `%c' ", fRegExp->regstart);
1394 
1395 	if (fRegExp->reganch)
1396 		printf("anchored ");
1397 
1398 	if (fRegExp->regmust != NULL)
1399 		printf("must have \"%s\"", fRegExp->regmust);
1400 
1401 	printf("\n");
1402 }
1403 
1404 
1405 //
1406 // - Prop - printable representation of opcode
1407 //
1408 char*
Prop(const char * op) const1409 RegExp::Prop(const char* op) const
1410 {
1411 	const char* p = NULL;
1412 	static char buf[50];
1413 
1414 	strcpy(buf, ":");
1415 
1416 	switch (*op) {
1417 		case kRegExpBol:
1418 			p = "kRegExpBol";
1419 			break;
1420 
1421 		case kRegExpEol:
1422 			p = "kRegExpEol";
1423 			break;
1424 
1425 		case kRegExpAny:
1426 			p = "kRegExpAny";
1427 			break;
1428 
1429 		case kRegExpAnyOf:
1430 			p = "kRegExpAnyOf";
1431 			break;
1432 
1433 		case kRegExpAnyBut:
1434 			p = "kRegExpAnyBut";
1435 			break;
1436 
1437 		case kRegExpBranch:
1438 			p = "kRegExpBranch";
1439 			break;
1440 
1441 		case kRegExpExactly:
1442 			p = "kRegExpExactly";
1443 			break;
1444 
1445 		case kRegExpNothing:
1446 			p = "kRegExpNothing";
1447 			break;
1448 
1449 		case kRegExpBack:
1450 			p = "kRegExpBack";
1451 			break;
1452 
1453 		case kRegExpEnd:
1454 			p = "kRegExpEnd";
1455 			break;
1456 
1457 		case kRegExpOpen + 1:
1458 		case kRegExpOpen + 2:
1459 		case kRegExpOpen + 3:
1460 		case kRegExpOpen + 4:
1461 		case kRegExpOpen + 5:
1462 		case kRegExpOpen + 6:
1463 		case kRegExpOpen + 7:
1464 		case kRegExpOpen + 8:
1465 		case kRegExpOpen + 9:
1466 			sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1467 			p = NULL;
1468 			break;
1469 
1470 		case kRegExpClose + 1:
1471 		case kRegExpClose + 2:
1472 		case kRegExpClose + 3:
1473 		case kRegExpClose + 4:
1474 		case kRegExpClose + 5:
1475 		case kRegExpClose + 6:
1476 		case kRegExpClose + 7:
1477 		case kRegExpClose + 8:
1478 		case kRegExpClose + 9:
1479 			sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1480 			p = NULL;
1481 			break;
1482 
1483 		case kRegExpStar:
1484 			p = "kRegExpStar";
1485 			break;
1486 
1487 		case kRegExpPlus:
1488 			p = "kRegExpPlus";
1489 			break;
1490 
1491 		default:
1492 			RegExpError("corrupted opcode");
1493 			break;
1494 	}
1495 
1496 	if (p != NULL)
1497 		strcat(buf, p);
1498 
1499 	return buf;
1500 }
1501 
1502 
1503 void
RegExpError(const char *) const1504 RegExp::RegExpError(const char*) const
1505 {
1506 	// does nothing now, perhaps it should printf?
1507 }
1508 
1509 #endif	// DEBUG
1510