xref: /haiku/src/kits/tracker/RegExp.cpp (revision d374a27286b8a52974a97dba0d5966ea026a665d)
1 /*
2 Open Tracker License
3 
4 Terms and Conditions
5 
6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
28 
29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
33 */
34 
35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
36 
37 // @(#)regexp.c	1.3 of 18 April 87
38 //
39 //	Copyright (c) 1986 by University of Toronto.
40 //	Written by Henry Spencer.  Not derived from licensed software.
41 //
42 //	Permission is granted to anyone to use this software for any
43 //	purpose on any computer system, and to redistribute it freely,
44 //	subject to the following restrictions:
45 //
46 //	1. The author is not responsible for the consequences of use of
47 //		this software, no matter how awful, even if they arise
48 //		from defects in it.
49 //
50 //	2. The origin of this software must not be misrepresented, either
51 //		by explicit claim or by omission.
52 //
53 //	3. Altered versions must be plainly marked as such, and must not
54 //		be misrepresented as being the original software.
55 //
56 // Beware that some of this code is subtly aware of the way operator
57 // precedence is structured in regular expressions.  Serious changes in
58 // regular-expression syntax might require a total rethink.
59 //
60 
61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62 // project (www.opentracker.org), Jul 11, 2000.
63 
64 #include <stdlib.h>
65 #include <stdio.h>
66 #include <string.h>
67 
68 #include <Errors.h>
69 
70 #include "RegExp.h"
71 
72 // The first byte of the regexp internal "program" is actually this magic
73 // number; the start node begins in the second byte.
74 
75 const uint8 kRegExpMagic = 0234;
76 
77 // The "internal use only" fields in RegExp.h are present to pass info from
78 // compile to execute that permits the execute phase to run lots faster on
79 // simple cases.  They are:
80 //
81 // regstart	char that must begin a match; '\0' if none obvious
82 // reganch	is the match anchored (at beginning-of-line only)?
83 // regmust	string (pointer into program) that match must include, or NULL
84 // regmlen	length of regmust string
85 //
86 // Regstart and reganch permit very fast decisions on suitable starting points
87 // for a match, cutting down the work a lot.  Regmust permits fast rejection
88 // of lines that cannot possibly match.  The regmust tests are costly enough
89 // that Compile() supplies a regmust only if the r.e. contains something
90 // potentially expensive (at present, the only such thing detected is * or +
91 // at the start of the r.e., which can involve a lot of backup).  Regmlen is
92 // supplied because the test in RunMatcher() needs it and Compile() is computing
93 // it anyway.
94 //
95 //
96 //
97 // Structure for regexp "program".  This is essentially a linear encoding
98 // of a nondeterministic finite-state machine (aka syntax charts or
99 // "railroad normal form" in parsing technology).  Each node is an opcode
100 // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
101 // all nodes except kRegExpBranch implement concatenation; a "next" pointer with
102 // a kRegExpBranch on both ends of it is connecting two alternatives.  (Here we
103 // have one of the subtle syntax dependencies:  an individual kRegExpBranch (as
104 // opposed to a collection of them) is never concatenated with anything
105 // because of operator precedence.)  The operand of some types of node is
106 // a literal string; for others, it is a node leading into a sub-FSM.  In
107 // particular, the operand of a kRegExpBranch node is the first node of the branch.
108 // (NB this is *not* a tree structure:  the tail of the branch connects
109 // to the thing following the set of kRegExpBranches.)  The opcodes are:
110 //
111 
112 // definition	number	opnd?	meaning
113 enum {
114 	kRegExpEnd = 0,		// no	End of program.
115 	kRegExpBol = 1,		// no	Match "" at beginning of line.
116 	kRegExpEol = 2,		// no	Match "" at end of line.
117 	kRegExpAny = 3,		// no	Match any one character.
118 	kRegExpAnyOf = 4,	// str	Match any character in this string.
119 	kRegExpAnyBut =	5,	// str	Match any character not in this string.
120 	kRegExpBranch =	6,	// node	Match this alternative, or the next...
121 	kRegExpBack = 7,	// no	Match "", "next" ptr points backward.
122 	kRegExpExactly = 8,	// str	Match this string.
123 	kRegExpNothing = 9,	// no	Match empty string.
124 	kRegExpStar = 10,	// node	Match this (simple) thing 0 or more times.
125 	kRegExpPlus = 11,	// node	Match this (simple) thing 1 or more times.
126 	kRegExpOpen	= 20,	// no	Mark this point in input as start of #n.
127 							//	kRegExpOpen + 1 is number 1, etc.
128 	kRegExpClose = 30	// no	Analogous to kRegExpOpen.
129 };
130 
131 //
132 // Opcode notes:
133 //
134 // kRegExpBranch	The set of branches constituting a single choice are hooked
135 //		together with their "next" pointers, since precedence prevents
136 //		anything being concatenated to any individual branch.  The
137 //		"next" pointer of the last kRegExpBranch in a choice points to the
138 //		thing following the whole choice.  This is also where the
139 //		final "next" pointer of each individual branch points; each
140 //		branch starts with the operand node of a kRegExpBranch node.
141 //
142 // kRegExpBack		Normal "next" pointers all implicitly point forward; kRegExpBack
143 //		exists to make loop structures possible.
144 //
145 // kRegExpStar,kRegExpPlus	'?', and complex '*' and '+', are implemented as circular
146 //		kRegExpBranch structures using kRegExpBack.  Simple cases (one character
147 //		per match) are implemented with kRegExpStar and kRegExpPlus for speed
148 //		and to minimize recursive plunges.
149 //
150 // kRegExpOpen,kRegExpClose	...are numbered at compile time.
151 //
152 //
153 //
154 // A node is one char of opcode followed by two chars of "next" pointer.
155 // "Next" pointers are stored as two 8-bit pieces, high order first.  The
156 // value is a positive offset from the opcode of the node containing it.
157 // An operand, if any, simply follows the node.  (Note that much of the
158 // code generation knows about this implicit relationship.)
159 //
160 // Using two bytes for the "next" pointer is vast overkill for most things,
161 // but allows patterns to get big without disasters.
162 //
163 
164 const char *kMeta = "^$.[()|?+*\\";
165 const int32 kMaxSize = 32767L;		// Probably could be 65535L.
166 
167 // Flags to be passed up and down:
168 enum {
169 	kHasWidth =	01,	// Known never to match null string.
170 	kSimple = 02,	// Simple enough to be kRegExpStar/kRegExpPlus operand.
171 	kSPStart = 04,	// Starts with * or +.
172 	kWorst = 0	// Worst case.
173 };
174 
175 const char *kRegExpErrorStringArray[] = {
176 	"Unmatched parenthesis.",
177 	"Expression too long.",
178 	"Too many parenthesis.",
179 	"Junk on end.",
180 	"*+? operand may be empty.",
181 	"Nested *?+.",
182 	"Invalid bracket range.",
183 	"Unmatched brackets.",
184 	"Internal error.",
185 	"?+* follows nothing.",
186 	"Trailing \\.",
187 	"Corrupted expression.",
188 	"Memory corruption.",
189 	"Corrupted pointers.",
190 	"Corrupted opcode."
191 };
192 
193 #ifdef DEBUG
194 int32 regnarrate = 0;
195 #endif
196 
197 RegExp::RegExp()
198 	:	fError(B_OK),
199 		fRegExp(NULL)
200 {
201 }
202 
203 RegExp::RegExp(const char *pattern)
204 	:	fError(B_OK),
205 		fRegExp(NULL)
206 {
207 	fRegExp = Compile(pattern);
208 }
209 
210 RegExp::RegExp(const BString &pattern)
211 	:	fError(B_OK),
212 		fRegExp(NULL)
213 {
214 	fRegExp = Compile(pattern.String());
215 }
216 
217 RegExp::~RegExp()
218 {
219 	free(fRegExp);
220 }
221 
222 
223 
224 status_t
225 RegExp::InitCheck() const
226 {
227 	return fError;
228 }
229 
230 status_t
231 RegExp::SetTo(const char *pattern)
232 {
233 	fError = B_OK;
234 	free(fRegExp);
235 	fRegExp = Compile(pattern);
236 	return fError;
237 }
238 
239 status_t
240 RegExp::SetTo(const BString &pattern)
241 {
242 	fError = B_OK;
243 	free(fRegExp);
244 	fRegExp = Compile(pattern.String());
245 	return fError;
246 }
247 
248 bool
249 RegExp::Matches(const char *string) const
250 {
251 	if (!fRegExp || !string)
252 		return false;
253 
254 	return RunMatcher(fRegExp, string) == 1;
255 }
256 
257 bool
258 RegExp::Matches(const BString &string) const
259 {
260 	if (!fRegExp)
261 		return false;
262 
263 	return RunMatcher(fRegExp, string.String()) == 1;
264 }
265 
266 
267 //
268 // - Compile - compile a regular expression into internal code
269 //
270 // We can't allocate space until we know how big the compiled form will be,
271 // but we can't compile it (and thus know how big it is) until we've got a
272 // place to put the code.  So we cheat:  we compile it twice, once with code
273 // generation turned off and size counting turned on, and once "for real".
274 // This also means that we don't allocate space until we are sure that the
275 // thing really will compile successfully, and we never have to move the
276 // code and thus invalidate pointers into it.  (Note that it has to be in
277 // one piece because free() must be able to free it all.)
278 //
279 // Beware that the optimization-preparation code in here knows about some
280 // of the structure of the compiled regexp.
281 
282 regexp *
283 RegExp::Compile(const char *exp)
284 {
285 	regexp *r;
286 	const char *scan;
287 	const char *longest;
288 	int32 len;
289 	int32 flags;
290 
291 	if (exp == NULL) {
292 		SetError(B_BAD_VALUE);
293 		return NULL;
294 	}
295 
296 	// First pass: determine size, legality.
297 	fInputScanPointer = exp;
298 	fParenthesisCount = 1;
299 	fCodeSize = 0L;
300 	fCodeEmitPointer = &fDummy;
301 	Char(kRegExpMagic);
302 	if (Reg(0, &flags) == NULL)
303 		return NULL;
304 
305 	// Small enough for pointer-storage convention?
306 	if (fCodeSize >= kMaxSize) {
307 		SetError(REGEXP_TOO_BIG);
308 		return NULL;
309 	}
310 
311 	// Allocate space.
312 	r = (regexp *)malloc(sizeof(regexp) + fCodeSize);
313 
314 	if (!r) {
315 		SetError(B_NO_MEMORY);
316 		return NULL;
317 	}
318 
319 	// Second pass: emit code.
320 	fInputScanPointer = exp;
321 	fParenthesisCount = 1;
322 	fCodeEmitPointer = r->program;
323 	Char(kRegExpMagic);
324 	if (Reg(0, &flags) == NULL) {
325 		free(r);
326 		return NULL;
327 	}
328 
329 	// Dig out information for optimizations.
330 	r->regstart = '\0';	// Worst-case defaults.
331 	r->reganch = 0;
332 	r->regmust = NULL;
333 	r->regmlen = 0;
334 	scan = r->program + 1;			// First kRegExpBranch.
335 	if (*Next((char *)scan) == kRegExpEnd) {		// Only one top-level choice.
336 		scan = Operand(scan);
337 
338 		// Starting-point info.
339 		if (*scan == kRegExpExactly)
340 			r->regstart = *Operand(scan);
341 		else if (*scan == kRegExpBol)
342 			r->reganch++;
343 
344 		//
345 		// If there's something expensive in the r.e., find the
346 		// longest literal string that must appear and make it the
347 		// regmust.  Resolve ties in favor of later strings, since
348 		// the regstart check works with the beginning of the r.e.
349 		// and avoiding duplication strengthens checking.  Not a
350 		// strong reason, but sufficient in the absence of others.
351 		//
352 		if (flags&kSPStart) {
353 			longest = NULL;
354 			len = 0;
355 			for (; scan != NULL; scan = Next((char *)scan))
356 				if (*scan == kRegExpExactly && (int32)strlen(Operand(scan)) >= len) {
357 					longest = Operand(scan);
358 					len = (int32)strlen(Operand(scan));
359 				}
360 			r->regmust = longest;
361 			r->regmlen = len;
362 		}
363 	}
364 
365 	return r;
366 }
367 
368 regexp *
369 RegExp::Expression() const
370 {
371 	return fRegExp;
372 }
373 
374 const char *
375 RegExp::ErrorString() const
376 {
377 	if (fError >= REGEXP_UNMATCHED_PARENTHESIS
378 		&& fError <= REGEXP_CORRUPTED_OPCODE)
379 		return kRegExpErrorStringArray[fError - B_ERRORS_END];
380 
381 	return strerror(fError);
382 }
383 
384 
385 void
386 RegExp::SetError(status_t error) const
387 {
388 	fError = error;
389 }
390 
391 
392 //
393 // - Reg - regular expression, i.e. main body or parenthesized thing
394 //
395 // Caller must absorb opening parenthesis.
396 //
397 // Combining parenthesis handling with the base level of regular expression
398 // is a trifle forced, but the need to tie the tails of the branches to what
399 // follows makes it hard to avoid.
400 //
401 char *
402 RegExp::Reg(int32 paren, int32 *flagp)
403 {
404 	char *ret;
405 	char *br;
406 	char *ender;
407 	int32 parno = 0;
408 	int32 flags;
409 
410 	*flagp = kHasWidth;	// Tentatively.
411 
412 	// Make an kRegExpOpen node, if parenthesized.
413 	if (paren) {
414 		if (fParenthesisCount >= kSubExpressionMax) {
415 			SetError(REGEXP_TOO_MANY_PARENTHESIS);
416 			return NULL;
417 		}
418 		parno = fParenthesisCount;
419 		fParenthesisCount++;
420 		ret = Node((char)(kRegExpOpen + parno));
421 	} else
422 		ret = NULL;
423 
424 	// Pick up the branches, linking them together.
425 	br = Branch(&flags);
426 	if (br == NULL)
427 		return NULL;
428 	if (ret != NULL)
429 		Tail(ret, br);	// kRegExpOpen -> first
430 	else
431 		ret = br;
432 	if (!(flags & kHasWidth))
433 		*flagp &= ~kHasWidth;
434 	*flagp |= flags&kSPStart;
435 	while (*fInputScanPointer == '|') {
436 		fInputScanPointer++;
437 		br = Branch(&flags);
438 		if (br == NULL)
439 			return NULL;
440 		Tail(ret, br);	// kRegExpBranch -> kRegExpBranch.
441 		if (!(flags & kHasWidth))
442 			*flagp &= ~kHasWidth;
443 		*flagp |= flags&kSPStart;
444 	}
445 
446 	// Make a closing node, and hook it on the end.
447 	ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
448 	Tail(ret, ender);
449 
450 	// Hook the tails of the branches to the closing node.
451 	for (br = ret; br != NULL; br = Next(br))
452 		OpTail(br, ender);
453 
454 	// Check for proper termination.
455 	if (paren && *fInputScanPointer++ != ')') {
456 		SetError(REGEXP_UNMATCHED_PARENTHESIS);
457 		return NULL;
458 	} else if (!paren && *fInputScanPointer != '\0') {
459 		if (*fInputScanPointer == ')') {
460 			SetError(REGEXP_UNMATCHED_PARENTHESIS);
461 			return NULL;
462 		} else {
463 			SetError(REGEXP_JUNK_ON_END);
464 			return NULL;	//  "Can't happen".
465 		}
466 		// NOTREACHED
467 	}
468 
469 	return ret;
470 }
471 
472 //
473 // - Branch - one alternative of an | operator
474 //
475 // Implements the concatenation operator.
476 //
477 char *
478 RegExp::Branch(int32 *flagp)
479 {
480 	char *ret;
481 	char *chain;
482 	char *latest;
483 	int32 flags;
484 
485 	*flagp = kWorst;		// Tentatively.
486 
487 	ret = Node(kRegExpBranch);
488 	chain = NULL;
489 	while (*fInputScanPointer != '\0'
490 		&& *fInputScanPointer != '|'
491 		&& *fInputScanPointer != ')') {
492 		latest = Piece(&flags);
493 		if (latest == NULL)
494 			return NULL;
495 		*flagp |= flags & kHasWidth;
496 		if (chain == NULL)	// First piece.
497 			*flagp |= flags & kSPStart;
498 		else
499 			Tail(chain, latest);
500 		chain = latest;
501 	}
502 	if (chain == NULL)	// Loop ran zero times.
503 		Node(kRegExpNothing);
504 
505 	return ret;
506 }
507 
508 //
509 // - Piece - something followed by possible [*+?]
510 //
511 // Note that the branching code sequences used for ? and the general cases
512 // of * and + are somewhat optimized:  they use the same kRegExpNothing node as
513 // both the endmarker for their branch list and the body of the last branch.
514 // It might seem that this node could be dispensed with entirely, but the
515 // endmarker role is not redundant.
516 //
517 char *
518 RegExp::Piece(int32 *flagp)
519 {
520 	char *ret;
521 	char op;
522 	char *next;
523 	int32 flags;
524 
525 	ret = Atom(&flags);
526 	if (ret == NULL)
527 		return NULL;
528 
529 	op = *fInputScanPointer;
530 	if (!IsMult(op)) {
531 		*flagp = flags;
532 		return ret;
533 	}
534 
535 	if (!(flags & kHasWidth) && op != '?') {
536 		SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
537 		return NULL;
538 	}
539 	*flagp = op != '+' ? kWorst | kSPStart :  kWorst | kHasWidth;
540 
541 	if (op == '*' && (flags & kSimple))
542 		Insert(kRegExpStar, ret);
543 	else if (op == '*') {
544 		// Emit x* as (x&|), where & means "self".
545 		Insert(kRegExpBranch, ret);				// Either x
546 		OpTail(ret, Node(kRegExpBack));		// and loop
547 		OpTail(ret, ret);				// back
548 		Tail(ret, Node(kRegExpBranch));		// or
549 		Tail(ret, Node(kRegExpNothing));		// null.
550 	} else if (op == '+' && (flags & kSimple))
551 		Insert(kRegExpPlus, ret);
552 	else if (op == '+') {
553 		// Emit x+ as x(&|), where & means "self".
554 		next = Node(kRegExpBranch);				// Either
555 		Tail(ret, next);
556 		Tail(Node(kRegExpBack), ret);		// loop back
557 		Tail(next, Node(kRegExpBranch));		// or
558 		Tail(ret, Node(kRegExpNothing));		// null.
559 	} else if (op == '?') {
560 		// Emit x? as (x|)
561 		Insert(kRegExpBranch, ret);			// Either x
562 		Tail(ret, Node(kRegExpBranch));	// or
563 		next = Node(kRegExpNothing);		// null.
564 		Tail(ret, next);
565 		OpTail(ret, next);
566 	}
567 	fInputScanPointer++;
568 	if (IsMult(*fInputScanPointer)) {
569 		SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
570 		return NULL;
571 	}
572 	return ret;
573 }
574 
575 //
576 // - Atom - the lowest level
577 //
578 // Optimization:  gobbles an entire sequence of ordinary characters so that
579 // it can turn them into a single node, which is smaller to store and
580 // faster to run.  Backslashed characters are exceptions, each becoming a
581 // separate node; the code is simpler that way and it's not worth fixing.
582 //
583 char *
584 RegExp::Atom(int32 *flagp)
585 {
586 	char *ret;
587 	int32 flags;
588 
589 	*flagp = kWorst;		// Tentatively.
590 
591 	switch (*fInputScanPointer++) {
592 		case '^':
593 			ret = Node(kRegExpBol);
594 			break;
595 		case '$':
596 			ret = Node(kRegExpEol);
597 			break;
598 		case '.':
599 			ret = Node(kRegExpAny);
600 			*flagp |= kHasWidth|kSimple;
601 			break;
602 		case '[':
603 			{
604 				int32 cclass;
605 				int32 classend;
606 
607 				if (*fInputScanPointer == '^') {	// Complement of range.
608 					ret = Node(kRegExpAnyBut);
609 					fInputScanPointer++;
610 				} else
611 					ret = Node(kRegExpAnyOf);
612 				if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
613 					Char(*fInputScanPointer++);
614 				while (*fInputScanPointer != '\0' && *fInputScanPointer != ']') {
615 					if (*fInputScanPointer == '-') {
616 						fInputScanPointer++;
617 						if (*fInputScanPointer == ']' || *fInputScanPointer == '\0')
618 							Char('-');
619 						else {
620 							cclass = UCharAt(fInputScanPointer - 2) + 1;
621 							classend = UCharAt(fInputScanPointer);
622 							if (cclass > classend + 1) {
623 								SetError(REGEXP_INVALID_BRACKET_RANGE);
624 								return NULL;
625 							}
626 							for (; cclass <= classend; cclass++)
627 								Char((char)cclass);
628 							fInputScanPointer++;
629 						}
630 					} else
631 						Char(*fInputScanPointer++);
632 				}
633 				Char('\0');
634 				if (*fInputScanPointer != ']') {
635 					SetError(REGEXP_UNMATCHED_BRACKET);
636 					return NULL;
637 				}
638 				fInputScanPointer++;
639 				*flagp |= kHasWidth | kSimple;
640 			}
641 			break;
642 		case '(':
643 			ret = Reg(1, &flags);
644 			if (ret == NULL)
645 				return NULL;
646 			*flagp |= flags & (kHasWidth | kSPStart);
647 			break;
648 		case '\0':
649 		case '|':
650 		case ')':
651 			SetError(REGEXP_INTERNAL_ERROR);
652 			return NULL; //  Supposed to be caught earlier.
653 		case '?':
654 		case '+':
655 		case '*':
656 			SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
657 			return NULL;
658 		case '\\':
659 			if (*fInputScanPointer == '\0') {
660 				SetError(REGEXP_TRAILING_BACKSLASH);
661 				return NULL;
662 			}
663 			ret = Node(kRegExpExactly);
664 			Char(*fInputScanPointer++);
665 			Char('\0');
666 			*flagp |= kHasWidth|kSimple;
667 			break;
668 		default:
669 			{
670 				int32 len;
671 				char ender;
672 
673 				fInputScanPointer--;
674 				len = (int32)strcspn(fInputScanPointer, kMeta);
675 				if (len <= 0) {
676 					SetError(REGEXP_INTERNAL_ERROR);
677 					return NULL;
678 				}
679 				ender = *(fInputScanPointer + len);
680 				if (len > 1 && IsMult(ender))
681 					len--;		// Back off clear of ?+* operand.
682 				*flagp |= kHasWidth;
683 				if (len == 1)
684 					*flagp |= kSimple;
685 				ret = Node(kRegExpExactly);
686 				while (len > 0) {
687 					Char(*fInputScanPointer++);
688 					len--;
689 				}
690 				Char('\0');
691 			}
692 			break;
693 	}
694 
695 	return ret;
696 }
697 
698 //
699 // - Node - emit a node
700 //
701 char *			// Location.
702 RegExp::Node(char op)
703 {
704 	char *ret;
705 	char *ptr;
706 
707 	ret = fCodeEmitPointer;
708 	if (ret == &fDummy) {
709 		fCodeSize += 3;
710 		return ret;
711 	}
712 
713 	ptr = ret;
714 	*ptr++ = op;
715 	*ptr++ = '\0';		// Null "next" pointer.
716 	*ptr++ = '\0';
717 	fCodeEmitPointer = ptr;
718 
719 	return ret;
720 }
721 
722 //
723 // - Char - emit (if appropriate) a byte of code
724 //
725 void
726 RegExp::Char(char b)
727 {
728 	if (fCodeEmitPointer != &fDummy)
729 		*fCodeEmitPointer++ = b;
730 	else
731 		fCodeSize++;
732 }
733 
734 //
735 // - Insert - insert an operator in front of already-emitted operand
736 //
737 // Means relocating the operand.
738 //
739 void
740 RegExp::Insert(char op, char *opnd)
741 {
742 	char *src;
743 	char *dst;
744 	char *place;
745 
746 	if (fCodeEmitPointer == &fDummy) {
747 		fCodeSize += 3;
748 		return;
749 	}
750 
751 	src = fCodeEmitPointer;
752 	fCodeEmitPointer += 3;
753 	dst = fCodeEmitPointer;
754 	while (src > opnd)
755 		*--dst = *--src;
756 
757 	place = opnd;		// Op node, where operand used to be.
758 	*place++ = op;
759 	*place++ = '\0';
760 	*place++ = '\0';
761 }
762 
763 //
764 // - Tail - set the next-pointer at the end of a node chain
765 //
766 void
767 RegExp::Tail(char *p, char *val)
768 {
769 	char *scan;
770 	char *temp;
771 	int32 offset;
772 
773 	if (p == &fDummy)
774 		return;
775 
776 	// Find last node.
777 	scan = p;
778 	for (;;) {
779 		temp = Next(scan);
780 		if (temp == NULL)
781 			break;
782 		scan = temp;
783 	}
784 
785 	if (scan[0] == kRegExpBack)
786 		offset = scan - val;
787 	else
788 		offset = val - scan;
789 
790 	scan[1] = (char)((offset >> 8) & 0377);
791 	scan[2] = (char)(offset & 0377);
792 }
793 
794 //
795 // - OpTail - Tail on operand of first argument; nop if operandless
796 //
797 void
798 RegExp::OpTail(char *p, char *val)
799 {
800 	// "Operandless" and "op != kRegExpBranch" are synonymous in practice.
801 	if (p == NULL || p == &fDummy || *p != kRegExpBranch)
802 		return;
803 	Tail(Operand(p), val);
804 }
805 
806 //
807 // RunMatcher and friends
808 //
809 
810 //
811 // - RunMatcher - match a regexp against a string
812 //
813 int32
814 RegExp::RunMatcher(regexp *prog, const char *string) const
815 {
816 	const char *s;
817 
818 	// Be paranoid...
819 	if (prog == NULL || string == NULL) {
820 		SetError(B_BAD_VALUE);
821 		return 0;
822 	}
823 
824 	// Check validity of program.
825 	if (UCharAt(prog->program) != kRegExpMagic) {
826 		SetError(REGEXP_CORRUPTED_PROGRAM);
827 		return 0;
828 	}
829 
830 	// If there is a "must appear" string, look for it.
831 	if (prog->regmust != NULL) {
832 		s = string;
833 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
834 			if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0)
835 				break;	// Found it.
836 			s++;
837 		}
838 		if (s == NULL)	// Not present.
839 			return 0;
840 	}
841 
842 	// Mark beginning of line for ^ .
843 	fRegBol = string;
844 
845 	// Simplest case:  anchored match need be tried only once.
846 	if (prog->reganch)
847 		return Try(prog, (char*)string);
848 
849 	// Messy cases:  unanchored match.
850 	s = string;
851 	if (prog->regstart != '\0')
852 		// We know what char it must start with.
853 		while ((s = strchr(s, prog->regstart)) != NULL) {
854 			if (Try(prog, (char*)s))
855 				return 1;
856 			s++;
857 		}
858 	else
859 		// We don't -- general case.
860 		do {
861 			if (Try(prog, (char*)s))
862 				return 1;
863 		} while (*s++ != '\0');
864 
865 	// Failure.
866 	return 0;
867 }
868 
869 //
870 // - Try - try match at specific point
871 //
872 int32			// 0 failure, 1 success
873 RegExp::Try(regexp *prog, const char *string) const
874 {
875 	int32 i;
876 	const char **sp;
877 	const char **ep;
878 
879 	fStringInputPointer = string;
880 	fStartPArrayPointer = prog->startp;
881 	fEndPArrayPointer = prog->endp;
882 
883 	sp = prog->startp;
884 	ep = prog->endp;
885 	for (i = kSubExpressionMax; i > 0; i--) {
886 		*sp++ = NULL;
887 		*ep++ = NULL;
888 	}
889 	if (Match(prog->program + 1)) {
890 		prog->startp[0] = string;
891 		prog->endp[0] = fStringInputPointer;
892 		return 1;
893 	} else
894 		return 0;
895 }
896 
897 //
898 // - Match - main matching routine
899 //
900 // Conceptually the strategy is simple:  check to see whether the current
901 // node matches, call self recursively to see whether the rest matches,
902 // and then act accordingly.  In practice we make some effort to avoid
903 // recursion, in particular by going through "ordinary" nodes (that don't
904 // need to know whether the rest of the match failed) by a loop instead of
905 // by recursion.
906 ///
907 int32			// 0 failure, 1 success
908 RegExp::Match(const char *prog) const
909 {
910 	const char *scan;	// Current node.
911 	const char *next;		// Next node.
912 
913 	scan = prog;
914 #ifdef DEBUG
915 	if (scan != NULL && regnarrate)
916 		fprintf(stderr, "%s(\n", Prop(scan));
917 #endif
918 	while (scan != NULL) {
919 #ifdef DEBUG
920 		if (regnarrate)
921 			fprintf(stderr, "%s...\n", Prop(scan));
922 #endif
923 		next = Next(scan);
924 
925 		switch (*scan) {
926 			case kRegExpBol:
927 				if (fStringInputPointer != fRegBol)
928 					return 0;
929 				break;
930 			case kRegExpEol:
931 				if (*fStringInputPointer != '\0')
932 					return 0;
933 				break;
934 			case kRegExpAny:
935 				if (*fStringInputPointer == '\0')
936 					return 0;
937 				fStringInputPointer++;
938 				break;
939 			case kRegExpExactly:
940 				{
941 					const char *opnd = Operand(scan);
942 					// Inline the first character, for speed.
943 					if (*opnd != *fStringInputPointer)
944 						return 0;
945 
946 					uint32 len = strlen(opnd);
947 					if (len > 1 && strncmp(opnd, fStringInputPointer, len) != 0)
948 						return 0;
949 
950 					fStringInputPointer += len;
951 				}
952 				break;
953 			case kRegExpAnyOf:
954 				if (*fStringInputPointer == '\0'
955 					|| strchr(Operand(scan), *fStringInputPointer) == NULL)
956 					return 0;
957 				fStringInputPointer++;
958 				break;
959 			case kRegExpAnyBut:
960 				if (*fStringInputPointer == '\0'
961 					|| strchr(Operand(scan), *fStringInputPointer) != NULL)
962 					return 0;
963 				fStringInputPointer++;
964 				break;
965 			case kRegExpNothing:
966 				break;
967 			case kRegExpBack:
968 				break;
969 			case kRegExpOpen + 1:
970 			case kRegExpOpen + 2:
971 			case kRegExpOpen + 3:
972 			case kRegExpOpen + 4:
973 			case kRegExpOpen + 5:
974 			case kRegExpOpen + 6:
975 			case kRegExpOpen + 7:
976 			case kRegExpOpen + 8:
977 			case kRegExpOpen + 9:
978 				{
979 					int32 no;
980 					const char *save;
981 
982 					no = *scan - kRegExpOpen;
983 					save = fStringInputPointer;
984 
985 					if (Match(next)) {
986 						//
987 						// Don't set startp if some later
988 						// invocation of the same parentheses
989 						// already has.
990 						//
991 						if (fStartPArrayPointer[no] == NULL)
992 							fStartPArrayPointer[no] = save;
993 						return 1;
994 					} else
995 						return 0;
996 				}
997 				break;
998 			case kRegExpClose + 1:
999 			case kRegExpClose + 2:
1000 			case kRegExpClose + 3:
1001 			case kRegExpClose + 4:
1002 			case kRegExpClose + 5:
1003 			case kRegExpClose + 6:
1004 			case kRegExpClose + 7:
1005 			case kRegExpClose + 8:
1006 			case kRegExpClose + 9:
1007 				{
1008 					int32 no;
1009 					const char *save;
1010 
1011 					no = *scan - kRegExpClose;
1012 					save = fStringInputPointer;
1013 
1014 					if (Match(next)) {
1015 						//
1016 						// Don't set endp if some later
1017 						// invocation of the same parentheses
1018 						// already has.
1019 						//
1020 						if (fEndPArrayPointer[no] == NULL)
1021 							fEndPArrayPointer[no] = save;
1022 						return 1;
1023 					} else
1024 						return 0;
1025 				}
1026 				break;
1027 			case kRegExpBranch:
1028 				{
1029 					const char *save;
1030 
1031 					if (*next != kRegExpBranch)		// No choice.
1032 						next = Operand(scan);	// Avoid recursion.
1033 					else {
1034 						do {
1035 							save = fStringInputPointer;
1036 							if (Match(Operand(scan)))
1037 								return 1;
1038 							fStringInputPointer = save;
1039 							scan = Next(scan);
1040 						} while (scan != NULL && *scan == kRegExpBranch);
1041 						return 0;
1042 						// NOTREACHED/
1043 					}
1044 				}
1045 				break;
1046 			case kRegExpStar:
1047 			case kRegExpPlus:
1048 				{
1049 					char nextch;
1050 					int32 no;
1051 					const char *save;
1052 					int32 min;
1053 
1054 					//
1055 					//Lookahead to avoid useless match attempts
1056 					// when we know what character comes next.
1057 					//
1058 					nextch = '\0';
1059 					if (*next == kRegExpExactly)
1060 						nextch = *Operand(next);
1061 					min = (*scan == kRegExpStar) ? 0 : 1;
1062 					save = fStringInputPointer;
1063 					no = Repeat(Operand(scan));
1064 					while (no >= min) {
1065 						// If it could work, try it.
1066 						if (nextch == '\0' || *fStringInputPointer == nextch)
1067 							if (Match(next))
1068 								return 1;
1069 						// Couldn't or didn't -- back up.
1070 						no--;
1071 						fStringInputPointer = save + no;
1072 					}
1073 					return 0;
1074 				}
1075 				break;
1076 			case kRegExpEnd:
1077 				return 1;	// Success!
1078 
1079 			default:
1080 				SetError(REGEXP_MEMORY_CORRUPTION);
1081 				return 0;
1082 			}
1083 
1084 		scan = next;
1085 	}
1086 
1087 	//
1088 	// We get here only if there's trouble -- normally "case kRegExpEnd" is
1089 	// the terminating point.
1090 	//
1091 	SetError(REGEXP_CORRUPTED_POINTERS);
1092 	return 0;
1093 }
1094 
1095 //
1096 // - Repeat - repeatedly match something simple, report how many
1097 //
1098 int32
1099 RegExp::Repeat(const char *p) const
1100 {
1101 	int32 count = 0;
1102 	const char *scan;
1103 	const char *opnd;
1104 
1105 	scan = fStringInputPointer;
1106 	opnd = Operand(p);
1107 	switch (*p) {
1108 		case kRegExpAny:
1109 			count = (int32)strlen(scan);
1110 			scan += count;
1111 			break;
1112 
1113 		case kRegExpExactly:
1114 			while (*opnd == *scan) {
1115 				count++;
1116 				scan++;
1117 			}
1118 			break;
1119 
1120 		case kRegExpAnyOf:
1121 			while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1122 				count++;
1123 				scan++;
1124 			}
1125 			break;
1126 
1127 		case kRegExpAnyBut:
1128 			while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1129 				count++;
1130 				scan++;
1131 			}
1132 			break;
1133 
1134 		default:		// Oh dear.  Called inappropriately.
1135 			SetError(REGEXP_INTERNAL_ERROR);
1136 			count = 0;	// Best compromise.
1137 			break;
1138 	}
1139 	fStringInputPointer = scan;
1140 
1141 	return count;
1142 }
1143 
1144 //
1145 // - Next - dig the "next" pointer out of a node
1146 //
1147 char *
1148 RegExp::Next(char *p)
1149 {
1150 	int32 offset;
1151 
1152 	if (p == &fDummy)
1153 		return NULL;
1154 
1155 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1156 	if (offset == 0)
1157 		return NULL;
1158 
1159 	if (*p == kRegExpBack)
1160 		return p - offset;
1161 	else
1162 		return p + offset;
1163 }
1164 
1165 const char *
1166 RegExp::Next(const char *p) const
1167 {
1168 	int32 offset;
1169 
1170 	if (p == &fDummy)
1171 		return NULL;
1172 
1173 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1174 	if (offset == 0)
1175 		return NULL;
1176 
1177 	if (*p == kRegExpBack)
1178 		return p - offset;
1179 	else
1180 		return p + offset;
1181 }
1182 
1183 inline int32
1184 RegExp::UCharAt(const char *p) const
1185 {
1186 	return (int32)*(unsigned char *)p;
1187 }
1188 
1189 inline char *
1190 RegExp::Operand(char* p) const
1191 {
1192 	return p + 3;
1193 }
1194 
1195 inline const char *
1196 RegExp::Operand(const char* p) const
1197 {
1198 	return p + 3;
1199 }
1200 
1201 inline bool
1202 RegExp::IsMult(char c) const
1203 {
1204 	return c == '*' || c == '+' || c == '?';
1205 }
1206 
1207 
1208 #ifdef DEBUG
1209 
1210 //
1211 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1212 //
1213 void
1214 RegExp::Dump()
1215 {
1216 	const char *s;
1217 	char op = kRegExpExactly;	// Arbitrary non-kRegExpEnd op.
1218 	const char *next;
1219 
1220 	s = fRegExp->program + 1;
1221 	while (op != kRegExpEnd) {	// While that wasn't kRegExpEnd last time...
1222 		op = *s;
1223 		printf("%2ld%s", s - fRegExp->program, Prop(s));	// Where, what.
1224 		next = Next(s);
1225 		if (next == NULL)		// Next ptr.
1226 			printf("(0)");
1227 		else
1228 			printf("(%ld)", (s - fRegExp->program) + (next - s));
1229 		s += 3;
1230 		if (op == kRegExpAnyOf || op == kRegExpAnyBut || op == kRegExpExactly) {
1231 			// Literal string, where present.
1232 			while (*s != '\0') {
1233 				putchar(*s);
1234 				s++;
1235 			}
1236 			s++;
1237 		}
1238 		putchar('\n');
1239 	}
1240 
1241 	// Header fields of interest.
1242 	if (fRegExp->regstart != '\0')
1243 		printf("start `%c' ", fRegExp->regstart);
1244 	if (fRegExp->reganch)
1245 		printf("anchored ");
1246 	if (fRegExp->regmust != NULL)
1247 		printf("must have \"%s\"", fRegExp->regmust);
1248 	printf("\n");
1249 }
1250 
1251 //
1252 // - Prop - printable representation of opcode
1253 //
1254 char *
1255 RegExp::Prop(const char *op) const
1256 {
1257 	const char *p = NULL;
1258 	static char buf[50];
1259 
1260 	(void) strcpy(buf, ":");
1261 
1262 	switch (*op) {
1263 		case kRegExpBol:
1264 			p = "kRegExpBol";
1265 			break;
1266 		case kRegExpEol:
1267 			p = "kRegExpEol";
1268 			break;
1269 		case kRegExpAny:
1270 			p = "kRegExpAny";
1271 			break;
1272 		case kRegExpAnyOf:
1273 			p = "kRegExpAnyOf";
1274 			break;
1275 		case kRegExpAnyBut:
1276 			p = "kRegExpAnyBut";
1277 			break;
1278 		case kRegExpBranch:
1279 			p = "kRegExpBranch";
1280 			break;
1281 		case kRegExpExactly:
1282 			p = "kRegExpExactly";
1283 			break;
1284 		case kRegExpNothing:
1285 			p = "kRegExpNothing";
1286 			break;
1287 		case kRegExpBack:
1288 			p = "kRegExpBack";
1289 			break;
1290 		case kRegExpEnd:
1291 			p = "kRegExpEnd";
1292 			break;
1293 		case kRegExpOpen + 1:
1294 		case kRegExpOpen + 2:
1295 		case kRegExpOpen + 3:
1296 		case kRegExpOpen + 4:
1297 		case kRegExpOpen + 5:
1298 		case kRegExpOpen + 6:
1299 		case kRegExpOpen + 7:
1300 		case kRegExpOpen + 8:
1301 		case kRegExpOpen + 9:
1302 			sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1303 			p = NULL;
1304 			break;
1305 		case kRegExpClose + 1:
1306 		case kRegExpClose + 2:
1307 		case kRegExpClose + 3:
1308 		case kRegExpClose + 4:
1309 		case kRegExpClose + 5:
1310 		case kRegExpClose + 6:
1311 		case kRegExpClose + 7:
1312 		case kRegExpClose + 8:
1313 		case kRegExpClose + 9:
1314 			sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1315 			p = NULL;
1316 			break;
1317 		case kRegExpStar:
1318 			p = "kRegExpStar";
1319 			break;
1320 		case kRegExpPlus:
1321 			p = "kRegExpPlus";
1322 			break;
1323 		default:
1324 			RegExpError("corrupted opcode");
1325 			break;
1326 	}
1327 
1328 	if (p != NULL)
1329 		strcat(buf, p);
1330 
1331 	return buf;
1332 }
1333 
1334 void
1335 RegExp::RegExpError(const char *) const
1336 {
1337 	// does nothing now, perhaps it should printf?
1338 }
1339 
1340 #endif
1341