xref: /haiku/src/kits/tracker/RegExp.cpp (revision 83b1a68c52ba3e0e8796282759f694b7fdddf06d)
1 /*
2 Open Tracker License
3 
4 Terms and Conditions
5 
6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
28 
29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
33 */
34 
35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
36 
37 // @(#)regexp.c	1.3 of 18 April 87
38 //
39 //	Copyright (c) 1986 by University of Toronto.
40 //	Written by Henry Spencer.  Not derived from licensed software.
41 //
42 //	Permission is granted to anyone to use this software for any
43 //	purpose on any computer system, and to redistribute it freely,
44 //	subject to the following restrictions:
45 //
46 //	1. The author is not responsible for the consequences of use of
47 //		this software, no matter how awful, even if they arise
48 //		from defects in it.
49 //
50 //	2. The origin of this software must not be misrepresented, either
51 //		by explicit claim or by omission.
52 //
53 //	3. Altered versions must be plainly marked as such, and must not
54 //		be misrepresented as being the original software.
55 //
56 // Beware that some of this code is subtly aware of the way operator
57 // precedence is structured in regular expressions.  Serious changes in
58 // regular-expression syntax might require a total rethink.
59 //
60 
61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62 // project (www.opentracker.org), Jul 11, 2000.
63 
64 
65 #include <stdlib.h>
66 #include <stdio.h>
67 #include <string.h>
68 
69 #include <Errors.h>
70 
71 #include "RegExp.h"
72 
73 
74 // The first byte of the regexp internal "program" is actually this magic
75 // number; the start node begins in the second byte.
76 
77 const uint8 kRegExpMagic = 0234;
78 
79 // The "internal use only" fields in RegExp.h are present to pass info from
80 // compile to execute that permits the execute phase to run lots faster on
81 // simple cases.  They are:
82 //
83 // regstart	char that must begin a match; '\0' if none obvious
84 // reganch	is the match anchored (at beginning-of-line only)?
85 // regmust	string (pointer into program) that match must include, or NULL
86 // regmlen	length of regmust string
87 //
88 // Regstart and reganch permit very fast decisions on suitable starting points
89 // for a match, cutting down the work a lot.  Regmust permits fast rejection
90 // of lines that cannot possibly match.  The regmust tests are costly enough
91 // that Compile() supplies a regmust only if the r.e. contains something
92 // potentially expensive (at present, the only such thing detected is * or +
93 // at the start of the r.e., which can involve a lot of backup). Regmlen is
94 // supplied because the test in RunMatcher() needs it and Compile() is
95 // computing it anyway.
96 //
97 //
98 //
99 // Structure for regexp "program".  This is essentially a linear encoding
100 // of a nondeterministic finite-state machine (aka syntax charts or
101 // "railroad normal form" in parsing technology).  Each node is an opcode
102 // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
103 // all nodes except kRegExpBranch implement concatenation; a "next" pointer
104 // with a kRegExpBranch on both ends of it is connecting two alternatives.
105 // (Here we have one of the subtle syntax dependencies:  an individual
106 // kRegExpBranch (as opposed to a collection of them) is never concatenated
107 // with anything because of operator precedence.)  The operand of some types
108 // of node is a literal string; for others, it is a node leading into a
109 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first
110 // node of the branch. (NB this is* not* a tree structure:  the tail of the
111 // branch connects to the thing following the set of kRegExpBranches).
112 // The opcodes are:
113 //
114 
115 // definition	number	opnd?	meaning
116 enum {
117 	kRegExpEnd = 0,		// no	End of program.
118 	kRegExpBol = 1,		// no	Match "" at beginning of line.
119 	kRegExpEol = 2,		// no	Match "" at end of line.
120 	kRegExpAny = 3,		// no	Match any one character.
121 	kRegExpAnyOf = 4,	// str	Match any character in this string.
122 	kRegExpAnyBut =	5,	// str	Match any character not in this string.
123 	kRegExpBranch =	6,	// node	Match this alternative, or the next...
124 	kRegExpBack = 7,	// no	Match "", "next" ptr points backward.
125 	kRegExpExactly = 8,	// str	Match this string.
126 	kRegExpNothing = 9,	// no	Match empty string.
127 	kRegExpStar = 10,	// node	Match this (simple) thing 0 or more times.
128 	kRegExpPlus = 11,	// node	Match this (simple) thing 1 or more times.
129 	kRegExpOpen	= 20,	// no	Mark this point in input as start of #n.
130 							//	kRegExpOpen + 1 is number 1, etc.
131 	kRegExpClose = 30	// no	Analogous to kRegExpOpen.
132 };
133 
134 //
135 // Opcode notes:
136 //
137 // kRegExpBranch	The set of branches constituting a single choice are
138 //		hooked together with their "next" pointers, since precedence prevents
139 //		anything being concatenated to any individual branch.  The
140 //		"next" pointer of the last kRegExpBranch in a choice points to the
141 //		thing following the whole choice.  This is also where the
142 //		final "next" pointer of each individual branch points; each
143 //		branch starts with the operand node of a kRegExpBranch node.
144 //
145 // kRegExpBack		Normal "next" pointers all implicitly point forward;
146 //		kRegExpBack exists to make loop structures possible.
147 //
148 // kRegExpStar,kRegExpPlus	'?', and complex '*' and '+', are implemented as
149 //		circular kRegExpBranch structures using kRegExpBack.  Simple cases
150 //		(one character per match) are implemented with kRegExpStar and
151 //		kRegExpPlus for speed and to minimize recursive plunges.
152 //
153 // kRegExpOpen,kRegExpClose	...are numbered at compile time.
154 //
155 //
156 //
157 // A node is one char of opcode followed by two chars of "next" pointer.
158 // "Next" pointers are stored as two 8-bit pieces, high order first.  The
159 // value is a positive offset from the opcode of the node containing it.
160 // An operand, if any, simply follows the node.  (Note that much of the
161 // code generation knows about this implicit relationship.)
162 //
163 // Using two bytes for the "next" pointer is vast overkill for most things,
164 // but allows patterns to get big without disasters.
165 //
166 
167 const char* kMeta = "^$.[()|?+*\\";
168 const int32 kMaxSize = 32767L;
169 	// Probably could be 65535L.
170 
171 
172 // Flags to be passed up and down:
173 enum {
174 	kHasWidth =	01,	// Known never to match null string.
175 	kSimple = 02,	// Simple enough to be kRegExpStar/kRegExpPlus operand.
176 	kSPStart = 04,	// Starts with * or +.
177 	kWorst = 0	// Worst case.
178 };
179 
180 
181 const char* kRegExpErrorStringArray[] = {
182 	"Unmatched parenthesis.",
183 	"Expression too long.",
184 	"Too many parenthesis.",
185 	"Junk on end.",
186 	"*+? operand may be empty.",
187 	"Nested *?+.",
188 	"Invalid bracket range.",
189 	"Unmatched brackets.",
190 	"Internal error.",
191 	"?+* follows nothing.",
192 	"Trailing \\.",
193 	"Corrupted expression.",
194 	"Memory corruption.",
195 	"Corrupted pointers.",
196 	"Corrupted opcode."
197 };
198 
199 
200 #ifdef DEBUG
201 int32 regnarrate = 0;
202 #endif
203 
204 
205 //	#pragma mark - RegExp
206 
207 
208 RegExp::RegExp()
209 	:
210 	fError(B_OK),
211 	fRegExp(NULL),
212 	fInputScanPointer(NULL),
213 	fParenthesisCount(0),
214 	fDummy('\0'),
215 	fCodeEmitPointer(NULL),
216 	fCodeSize(0),
217 	fStringInputPointer(NULL),
218 	fRegBol(NULL),
219 	fStartPArrayPointer(NULL),
220 	fEndPArrayPointer(NULL)
221 {
222 }
223 
224 
225 RegExp::RegExp(const char* pattern)
226 	:
227 	fError(B_OK),
228 	fRegExp(NULL)
229 {
230 	fRegExp = Compile(pattern);
231 }
232 
233 
234 RegExp::RegExp(const BString& pattern)
235 	:
236 	fError(B_OK),
237 	fRegExp(NULL)
238 {
239 	fRegExp = Compile(pattern.String());
240 }
241 
242 
243 RegExp::~RegExp()
244 {
245 	free(fRegExp);
246 }
247 
248 
249 status_t
250 RegExp::InitCheck() const
251 {
252 	return fError;
253 }
254 
255 
256 status_t
257 RegExp::SetTo(const char* pattern)
258 {
259 	fError = B_OK;
260 	free(fRegExp);
261 	fRegExp = Compile(pattern);
262 	return fError;
263 }
264 
265 
266 status_t
267 RegExp::SetTo(const BString& pattern)
268 {
269 	fError = B_OK;
270 	free(fRegExp);
271 	fRegExp = Compile(pattern.String());
272 	return fError;
273 }
274 
275 
276 bool
277 RegExp::Matches(const char* string) const
278 {
279 	if (fRegExp == NULL || string == NULL)
280 		return false;
281 
282 	return RunMatcher(fRegExp, string) == 1;
283 }
284 
285 
286 bool
287 RegExp::Matches(const BString& string) const
288 {
289 	if (fRegExp == NULL)
290 		return false;
291 
292 	return RunMatcher(fRegExp, string.String()) == 1;
293 }
294 
295 
296 //
297 // - Compile - compile a regular expression into internal code
298 //
299 // We can't allocate space until we know how big the compiled form will be,
300 // but we can't compile it (and thus know how big it is) until we've got a
301 // place to put the code.  So we cheat:  we compile it twice, once with code
302 // generation turned off and size counting turned on, and once "for real".
303 // This also means that we don't allocate space until we are sure that the
304 // thing really will compile successfully, and we never have to move the
305 // code and thus invalidate pointers into it.  (Note that it has to be in
306 // one piece because free() must be able to free it all.)
307 //
308 // Beware that the optimization-preparation code in here knows about some
309 // of the structure of the compiled regexp.
310 regexp*
311 RegExp::Compile(const char* exp)
312 {
313 	regexp* r;
314 	const char* scan;
315 	const char* longest;
316 	int32 len;
317 	int32 flags;
318 
319 	if (exp == NULL) {
320 		SetError(B_BAD_VALUE);
321 		return NULL;
322 	}
323 
324 	// First pass: determine size, legality.
325 	fInputScanPointer = exp;
326 	fParenthesisCount = 1;
327 	fCodeSize = 0L;
328 	fCodeEmitPointer = &fDummy;
329 	Char(kRegExpMagic);
330 	if (Reg(0, &flags) == NULL)
331 		return NULL;
332 
333 	// Small enough for pointer-storage convention?
334 	if (fCodeSize >= kMaxSize) {
335 		SetError(REGEXP_TOO_BIG);
336 		return NULL;
337 	}
338 
339 	r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
340 		// Allocate space
341 
342 	if (r == NULL) {
343 		SetError(B_NO_MEMORY);
344 		return NULL;
345 	}
346 
347 	// Second pass: emit code.
348 	fInputScanPointer = exp;
349 	fParenthesisCount = 1;
350 	fCodeEmitPointer = r->program;
351 	Char(kRegExpMagic);
352 	if (Reg(0, &flags) == NULL) {
353 		free(r);
354 		return NULL;
355 	}
356 
357 	// Dig out information for optimizations.
358 	r->regstart = '\0';
359 		// Worst-case defaults.
360 	r->reganch = 0;
361 	r->regmust = NULL;
362 	r->regmlen = 0;
363 	scan = r->program + 1;
364 		// First kRegExpBranch.
365 	if (*Next((char*)scan) == kRegExpEnd) {
366 		// Only one top-level choice.
367 		scan = Operand(scan);
368 
369 		// Starting-point info.
370 		if (*scan == kRegExpExactly)
371 			r->regstart = *Operand(scan);
372 		else if (*scan == kRegExpBol)
373 			r->reganch++;
374 
375 		// If there's something expensive in the r.e., find the
376 		// longest literal string that must appear and make it the
377 		// regmust.  Resolve ties in favor of later strings, since
378 		// the regstart check works with the beginning of the r.e.
379 		// and avoiding duplication strengthens checking.  Not a
380 		// strong reason, but sufficient in the absence of others.
381 		if ((flags & kSPStart) != 0) {
382 			longest = NULL;
383 			len = 0;
384 			for (; scan != NULL; scan = Next((char*)scan)) {
385 				if (*scan == kRegExpExactly
386 					&& (int32)strlen(Operand(scan)) >= len) {
387 					longest = Operand(scan);
388 					len = (int32)strlen(Operand(scan));
389 				}
390 			}
391 			r->regmust = longest;
392 			r->regmlen = len;
393 		}
394 	}
395 
396 	return r;
397 }
398 
399 
400 regexp*
401 RegExp::Expression() const
402 {
403 	return fRegExp;
404 }
405 
406 
407 const char*
408 RegExp::ErrorString() const
409 {
410 	if (fError >= REGEXP_UNMATCHED_PARENTHESIS
411 		&& fError <= REGEXP_CORRUPTED_OPCODE) {
412 		return kRegExpErrorStringArray[fError - B_ERRORS_END];
413 	}
414 
415 	return strerror(fError);
416 }
417 
418 
419 void
420 RegExp::SetError(status_t error) const
421 {
422 	fError = error;
423 }
424 
425 
426 //
427 // - Reg - regular expression, i.e. main body or parenthesized thing
428 //
429 // Caller must absorb opening parenthesis.
430 //
431 // Combining parenthesis handling with the base level of regular expression
432 // is a trifle forced, but the need to tie the tails of the branches to what
433 // follows makes it hard to avoid.
434 //
435 char*
436 RegExp::Reg(int32 paren, int32* flagp)
437 {
438 	char* ret;
439 	char* br;
440 	char* ender;
441 	int32 parno = 0;
442 	int32 flags;
443 
444 	*flagp = kHasWidth;
445 		// Tentatively.
446 
447 	// Make an kRegExpOpen node, if parenthesized.
448 	if (paren) {
449 		if (fParenthesisCount >= kSubExpressionMax) {
450 			SetError(REGEXP_TOO_MANY_PARENTHESIS);
451 			return NULL;
452 		}
453 		parno = fParenthesisCount;
454 		fParenthesisCount++;
455 		ret = Node((char)(kRegExpOpen + parno));
456 	} else
457 		ret = NULL;
458 
459 	// Pick up the branches, linking them together.
460 	br = Branch(&flags);
461 	if (br == NULL)
462 		return NULL;
463 
464 	if (ret != NULL) {
465 		Tail(ret, br);
466 			// kRegExpOpen -> first
467 	} else
468 		ret = br;
469 
470 	if (!(flags & kHasWidth))
471 		*flagp &= ~kHasWidth;
472 
473 	*flagp |= flags&kSPStart;
474 	while (*fInputScanPointer == '|') {
475 		fInputScanPointer++;
476 		br = Branch(&flags);
477 		if (br == NULL)
478 			return NULL;
479 
480 		Tail(ret, br);
481 			// kRegExpBranch -> kRegExpBranch.
482 		if (!(flags & kHasWidth))
483 			*flagp &= ~kHasWidth;
484 
485 		*flagp |= flags&kSPStart;
486 	}
487 
488 	// Make a closing node, and hook it on the end.
489 	ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
490 	Tail(ret, ender);
491 
492 	// Hook the tails of the branches to the closing node.
493 	for (br = ret; br != NULL; br = Next(br))
494 		OpTail(br, ender);
495 
496 	// Check for proper termination.
497 	if (paren && *fInputScanPointer++ != ')') {
498 		SetError(REGEXP_UNMATCHED_PARENTHESIS);
499 		return NULL;
500 	} else if (!paren && *fInputScanPointer != '\0') {
501 		if (*fInputScanPointer == ')') {
502 			SetError(REGEXP_UNMATCHED_PARENTHESIS);
503 			return NULL;
504 		} else {
505 			SetError(REGEXP_JUNK_ON_END);
506 			return NULL;	//  "Can't happen".
507 		}
508 		// NOTREACHED
509 	}
510 
511 	return ret;
512 }
513 
514 
515 //
516 // - Branch - one alternative of an | operator
517 //
518 // Implements the concatenation operator.
519 //
520 char*
521 RegExp::Branch(int32* flagp)
522 {
523 	char* ret;
524 	char* chain;
525 	char* latest;
526 	int32 flags;
527 
528 	*flagp = kWorst;
529 		// Tentatively.
530 
531 	ret = Node(kRegExpBranch);
532 	chain = NULL;
533 	while (*fInputScanPointer != '\0'
534 		&& *fInputScanPointer != '|'
535 		&& *fInputScanPointer != ')') {
536 		latest = Piece(&flags);
537 		if (latest == NULL)
538 			return NULL;
539 
540 		*flagp |= flags & kHasWidth;
541 		if (chain == NULL) {
542 			// First piece.
543 			*flagp |= flags & kSPStart;
544 		} else
545 			Tail(chain, latest);
546 
547 		chain = latest;
548 	}
549 
550 	if (chain == NULL) {
551 		// Loop ran zero times.
552 		Node(kRegExpNothing);
553 	}
554 
555 	return ret;
556 }
557 
558 
559 //
560 // - Piece - something followed by possible [*+?]
561 //
562 // Note that the branching code sequences used for ? and the general cases
563 // of * and + are somewhat optimized:  they use the same kRegExpNothing node
564 // as both the endmarker for their branch list and the body of the last
565 // branch. It might seem that this node could be dispensed with entirely,
566 // but the endmarker role is not redundant.
567 //
568 char*
569 RegExp::Piece(int32* flagp)
570 {
571 	char* ret;
572 	char op;
573 	char* next;
574 	int32 flags;
575 
576 	ret = Atom(&flags);
577 	if (ret == NULL)
578 		return NULL;
579 
580 	op = *fInputScanPointer;
581 	if (!IsMult(op)) {
582 		*flagp = flags;
583 		return ret;
584 	}
585 
586 	if (!(flags & kHasWidth) && op != '?') {
587 		SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
588 		return NULL;
589 	}
590 	*flagp = op != '+' ? kWorst | kSPStart :  kWorst | kHasWidth;
591 
592 	if (op == '*' && (flags & kSimple))
593 		Insert(kRegExpStar, ret);
594 	else if (op == '*') {
595 		// Emit x* as (x&|), where & means "self".
596 		Insert(kRegExpBranch, ret);
597 			// Either x
598 		OpTail(ret, Node(kRegExpBack));
599 			// and loop
600 		OpTail(ret, ret);
601 			// back
602 		Tail(ret, Node(kRegExpBranch));
603 			// or
604 		Tail(ret, Node(kRegExpNothing));
605 			// null.
606 	} else if (op == '+' && (flags & kSimple))
607 		Insert(kRegExpPlus, ret);
608 	else if (op == '+') {
609 		// Emit x+ as x(&|), where & means "self".
610 		next = Node(kRegExpBranch);
611 			// Either
612 		Tail(ret, next);
613 		Tail(Node(kRegExpBack), ret);
614 			// loop back
615 		Tail(next, Node(kRegExpBranch));
616 			// or
617 		Tail(ret, Node(kRegExpNothing));
618 			// null.
619 	} else if (op == '?') {
620 		// Emit x? as (x|)
621 		Insert(kRegExpBranch, ret);
622 			// Either x
623 		Tail(ret, Node(kRegExpBranch));
624 			// or
625 		next = Node(kRegExpNothing);
626 			// null.
627 		Tail(ret, next);
628 		OpTail(ret, next);
629 	}
630 
631 	fInputScanPointer++;
632 	if (IsMult(*fInputScanPointer)) {
633 		SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
634 		return NULL;
635 	}
636 
637 	return ret;
638 }
639 
640 
641 //
642 // - Atom - the lowest level
643 //
644 // Optimization:  gobbles an entire sequence of ordinary characters so that
645 // it can turn them into a single node, which is smaller to store and
646 // faster to run.  Backslashed characters are exceptions, each becoming a
647 // separate node; the code is simpler that way and it's not worth fixing.
648 //
649 char*
650 RegExp::Atom(int32* flagp)
651 {
652 	char* ret;
653 	int32 flags;
654 
655 	*flagp = kWorst;
656 		// tentatively
657 
658 	switch (*fInputScanPointer++) {
659 		case '^':
660 			ret = Node(kRegExpBol);
661 			break;
662 
663 		case '$':
664 			ret = Node(kRegExpEol);
665 			break;
666 
667 		case '.':
668 			ret = Node(kRegExpAny);
669 			*flagp |= kHasWidth|kSimple;
670 			break;
671 
672 		case '[':
673 		{
674 			int32 cclass;
675 			int32 classend;
676 
677 			if (*fInputScanPointer == '^') {
678 				// complement of range
679 				ret = Node(kRegExpAnyBut);
680 				fInputScanPointer++;
681 			} else
682 				ret = Node(kRegExpAnyOf);
683 			if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
684 				Char(*fInputScanPointer++);
685 			while (*fInputScanPointer != '\0'
686 				&& *fInputScanPointer != ']') {
687 				if (*fInputScanPointer == '-') {
688 					fInputScanPointer++;
689 					if (*fInputScanPointer == ']'
690 						|| *fInputScanPointer == '\0') {
691 						Char('-');
692 					} else {
693 						cclass = UCharAt(fInputScanPointer - 2) + 1;
694 						classend = UCharAt(fInputScanPointer);
695 						if (cclass > classend + 1) {
696 							SetError(REGEXP_INVALID_BRACKET_RANGE);
697 							return NULL;
698 						}
699 						for (; cclass <= classend; cclass++)
700 							Char((char)cclass);
701 						fInputScanPointer++;
702 					}
703 				} else
704 					Char(*fInputScanPointer++);
705 			}
706 			Char('\0');
707 			if (*fInputScanPointer != ']') {
708 				SetError(REGEXP_UNMATCHED_BRACKET);
709 				return NULL;
710 			}
711 			fInputScanPointer++;
712 			*flagp |= kHasWidth | kSimple;
713 			break;
714 		}
715 
716 		case '(':
717 			ret = Reg(1, &flags);
718 			if (ret == NULL)
719 				return NULL;
720 			*flagp |= flags & (kHasWidth | kSPStart);
721 			break;
722 
723 		case '\0':
724 		case '|':
725 		case ')':
726 			SetError(REGEXP_INTERNAL_ERROR);
727 			return NULL;
728 				//  supposed to be caught earlier
729 
730 		case '?':
731 		case '+':
732 		case '*':
733 			SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
734 			return NULL;
735 
736 		case '\\':
737 			if (*fInputScanPointer == '\0') {
738 				SetError(REGEXP_TRAILING_BACKSLASH);
739 				return NULL;
740 			}
741 			ret = Node(kRegExpExactly);
742 			Char(*fInputScanPointer++);
743 			Char('\0');
744 			*flagp |= kHasWidth|kSimple;
745 			break;
746 
747 		default:
748 		{
749 			int32 len;
750 			char ender;
751 
752 			fInputScanPointer--;
753 			len = (int32)strcspn(fInputScanPointer, kMeta);
754 			if (len <= 0) {
755 				SetError(REGEXP_INTERNAL_ERROR);
756 				return NULL;
757 			}
758 
759 			ender = *(fInputScanPointer + len);
760 			if (len > 1 && IsMult(ender)) {
761 				// Back off clear of ?+* operand.
762 				len--;
763 			}
764 
765 			*flagp |= kHasWidth;
766 			if (len == 1)
767 				*flagp |= kSimple;
768 
769 			ret = Node(kRegExpExactly);
770 			while (len > 0) {
771 				Char(*fInputScanPointer++);
772 				len--;
773 			}
774 
775 			Char('\0');
776 			break;
777 		}
778 	}
779 
780 	return ret;
781 }
782 
783 
784 //
785 // - Node - emit a node
786 //
787 // Returns location.
788 //
789 char*
790 RegExp::Node(char op)
791 {
792 	char* ret;
793 	char* ptr;
794 
795 	ret = fCodeEmitPointer;
796 	if (ret == &fDummy) {
797 		fCodeSize += 3;
798 		return ret;
799 	}
800 
801 	ptr = ret;
802 	*ptr++ = op;
803 	*ptr++ = '\0';
804 		// Null "next" pointer.
805 	*ptr++ = '\0';
806 	fCodeEmitPointer = ptr;
807 
808 	return ret;
809 }
810 
811 
812 //
813 // - Char - emit (if appropriate) a byte of code
814 //
815 void
816 RegExp::Char(char b)
817 {
818 	if (fCodeEmitPointer != &fDummy)
819 		*fCodeEmitPointer++ = b;
820 	else
821 		fCodeSize++;
822 }
823 
824 
825 //
826 // - Insert - insert an operator in front of already-emitted operand
827 //
828 // Means relocating the operand.
829 //
830 void
831 RegExp::Insert(char op, char* opnd)
832 {
833 	char* src;
834 	char* dst;
835 	char* place;
836 
837 	if (fCodeEmitPointer == &fDummy) {
838 		fCodeSize += 3;
839 		return;
840 	}
841 
842 	src = fCodeEmitPointer;
843 	fCodeEmitPointer += 3;
844 	dst = fCodeEmitPointer;
845 	while (src > opnd)
846 		*--dst = *--src;
847 
848 	place = opnd;
849 		// Op node, where operand used to be.
850 	*place++ = op;
851 	*place++ = '\0';
852 	*place++ = '\0';
853 }
854 
855 
856 //
857 // - Tail - set the next-pointer at the end of a node chain
858 //
859 void
860 RegExp::Tail(char* p, char* val)
861 {
862 	char* scan;
863 	char* temp;
864 	int32 offset;
865 
866 	if (p == &fDummy)
867 		return;
868 
869 	// Find last node.
870 	scan = p;
871 	for (;;) {
872 		temp = Next(scan);
873 		if (temp == NULL)
874 			break;
875 		scan = temp;
876 	}
877 
878 	if (scan[0] == kRegExpBack)
879 		offset = scan - val;
880 	else
881 		offset = val - scan;
882 
883 	scan[1] = (char)((offset >> 8) & 0377);
884 	scan[2] = (char)(offset & 0377);
885 }
886 
887 
888 //
889 // - OpTail - Tail on operand of first argument; nop if operandless
890 //
891 void
892 RegExp::OpTail(char* p, char* val)
893 {
894 	// "Operandless" and "op != kRegExpBranch" are synonymous in practice.
895 	if (p == NULL || p == &fDummy || *p != kRegExpBranch)
896 		return;
897 
898 	Tail(Operand(p), val);
899 }
900 
901 //
902 // RunMatcher and friends
903 //
904 
905 
906 //
907 // - RunMatcher - match a regexp against a string
908 //
909 int32
910 RegExp::RunMatcher(regexp* prog, const char* string) const
911 {
912 	const char* s;
913 
914 	// be paranoid...
915 	if (prog == NULL || string == NULL) {
916 		SetError(B_BAD_VALUE);
917 		return 0;
918 	}
919 
920 	// check validity of program
921 	if (UCharAt(prog->program) != kRegExpMagic) {
922 		SetError(REGEXP_CORRUPTED_PROGRAM);
923 		return 0;
924 	}
925 
926 	// if there is a "must appear" string, look for it
927 	if (prog->regmust != NULL) {
928 		s = string;
929 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
930 			if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) {
931 				// found it
932 				break;
933 			}
934 			s++;
935 		}
936 		if (s == NULL) {
937 			// not present
938 			return 0;
939 		}
940 	}
941 
942 	// mark beginning of line for ^
943 	fRegBol = string;
944 
945 	// simplest case:  anchored match need be tried only once
946 	if (prog->reganch)
947 		return Try(prog, (char*)string);
948 
949 	// messy cases: unanchored match
950 	s = string;
951 	if (prog->regstart != '\0') {
952 		// we know what char it must start with
953 		while ((s = strchr(s, prog->regstart)) != NULL) {
954 			if (Try(prog, (char*)s))
955 				return 1;
956 			s++;
957 		}
958 	} else {
959 		// we don't -- general case.
960 		do {
961 			if (Try(prog, (char*)s))
962 				return 1;
963 		} while (*s++ != '\0');
964 	}
965 
966 	// failure
967 	return 0;
968 }
969 
970 
971 //
972 // - Try - try match at specific point
973 //
974 // Returns 0 on failure, 1 on success.
975 //
976 int32
977 RegExp::Try(regexp* prog, const char* string) const
978 {
979 	int32 i;
980 	const char **sp;
981 	const char **ep;
982 
983 	fStringInputPointer = string;
984 	fStartPArrayPointer = prog->startp;
985 	fEndPArrayPointer = prog->endp;
986 
987 	sp = prog->startp;
988 	ep = prog->endp;
989 	for (i = kSubExpressionMax; i > 0; i--) {
990 		*sp++ = NULL;
991 		*ep++ = NULL;
992 	}
993 	if (Match(prog->program + 1)) {
994 		prog->startp[0] = string;
995 		prog->endp[0] = fStringInputPointer;
996 		return 1;
997 	} else
998 		return 0;
999 }
1000 
1001 
1002 //
1003 // - Match - main matching routine
1004 //
1005 // Conceptually the strategy is simple:  check to see whether the current
1006 // node matches, call self recursively to see whether the rest matches,
1007 // and then act accordingly.  In practice we make some effort to avoid
1008 // recursion, in particular by going through "ordinary" nodes (that don't
1009 // need to know whether the rest of the match failed) by a loop instead of
1010 // by recursion.
1011 //
1012 // Returns 0 on failure, 1 on success.
1013 //
1014 int32
1015 RegExp::Match(const char* prog) const
1016 {
1017 	const char* scan;	// Current node.
1018 	const char* next;		// Next node.
1019 
1020 	scan = prog;
1021 #ifdef DEBUG
1022 	if (scan != NULL && regnarrate)
1023 		fprintf(stderr, "%s(\n", Prop(scan));
1024 #endif
1025 	while (scan != NULL) {
1026 #ifdef DEBUG
1027 		if (regnarrate)
1028 			fprintf(stderr, "%s...\n", Prop(scan));
1029 #endif
1030 		next = Next(scan);
1031 
1032 		switch (*scan) {
1033 			case kRegExpBol:
1034 				if (fStringInputPointer != fRegBol)
1035 					return 0;
1036 				break;
1037 
1038 			case kRegExpEol:
1039 				if (*fStringInputPointer != '\0')
1040 					return 0;
1041 				break;
1042 
1043 			case kRegExpAny:
1044 				if (*fStringInputPointer == '\0')
1045 					return 0;
1046 				fStringInputPointer++;
1047 				break;
1048 
1049 			case kRegExpExactly:
1050 			{
1051 				const char* opnd = Operand(scan);
1052 				// Inline the first character, for speed.
1053 				if (*opnd != *fStringInputPointer)
1054 					return 0;
1055 
1056 				uint32 len = strlen(opnd);
1057 				if (len > 1
1058 					&& strncmp(opnd, fStringInputPointer, len) != 0) {
1059 					return 0;
1060 				}
1061 
1062 				fStringInputPointer += len;
1063 			}
1064 			break;
1065 
1066 			case kRegExpAnyOf:
1067 				if (*fStringInputPointer == '\0'
1068 					|| strchr(Operand(scan), *fStringInputPointer) == NULL) {
1069 					return 0;
1070 				}
1071 				fStringInputPointer++;
1072 				break;
1073 
1074 			case kRegExpAnyBut:
1075 				if (*fStringInputPointer == '\0'
1076 					|| strchr(Operand(scan), *fStringInputPointer) != NULL) {
1077 					return 0;
1078 				}
1079 				fStringInputPointer++;
1080 				break;
1081 
1082 			case kRegExpNothing:
1083 				break;
1084 
1085 			case kRegExpBack:
1086 				break;
1087 
1088 			case kRegExpOpen + 1:
1089 			case kRegExpOpen + 2:
1090 			case kRegExpOpen + 3:
1091 			case kRegExpOpen + 4:
1092 			case kRegExpOpen + 5:
1093 			case kRegExpOpen + 6:
1094 			case kRegExpOpen + 7:
1095 			case kRegExpOpen + 8:
1096 			case kRegExpOpen + 9:
1097 			{
1098 				int32 no;
1099 				const char* save;
1100 
1101 				no = *scan - kRegExpOpen;
1102 				save = fStringInputPointer;
1103 
1104 				if (Match(next)) {
1105 					//
1106 					// Don't set startp if some later
1107 					// invocation of the same parentheses
1108 					// already has.
1109 					//
1110 					if (fStartPArrayPointer[no] == NULL)
1111 						fStartPArrayPointer[no] = save;
1112 					return 1;
1113 				} else
1114 					return 0;
1115 			}
1116 			break;
1117 
1118 			case kRegExpClose + 1:
1119 			case kRegExpClose + 2:
1120 			case kRegExpClose + 3:
1121 			case kRegExpClose + 4:
1122 			case kRegExpClose + 5:
1123 			case kRegExpClose + 6:
1124 			case kRegExpClose + 7:
1125 			case kRegExpClose + 8:
1126 			case kRegExpClose + 9:
1127 			{
1128 				int32 no;
1129 				const char* save;
1130 
1131 				no = *scan - kRegExpClose;
1132 				save = fStringInputPointer;
1133 
1134 				if (Match(next)) {
1135 					//
1136 					// Don't set endp if some later
1137 					// invocation of the same parentheses
1138 					// already has.
1139 					//
1140 					if (fEndPArrayPointer[no] == NULL)
1141 						fEndPArrayPointer[no] = save;
1142 					return 1;
1143 				} else
1144 					return 0;
1145 			}
1146 			break;
1147 
1148 			case kRegExpBranch:
1149 			{
1150 				const char* save;
1151 
1152 				if (*next != kRegExpBranch) {
1153 					// no choice
1154 					next = Operand(scan);
1155 						// avoid recursion
1156 				} else {
1157 					do {
1158 						save = fStringInputPointer;
1159 						if (Match(Operand(scan)))
1160 							return 1;
1161 						fStringInputPointer = save;
1162 						scan = Next(scan);
1163 					} while (scan != NULL && *scan == kRegExpBranch);
1164 					return 0;
1165 					// NOTREACHED
1166 				}
1167 			}
1168 			break;
1169 
1170 			case kRegExpStar:
1171 			case kRegExpPlus:
1172 				{
1173 					char nextch;
1174 					int32 no;
1175 					const char* save;
1176 					int32 min;
1177 
1178 					//
1179 					// Lookahead to avoid useless match attempts
1180 					// when we know what character comes next.
1181 					//
1182 					nextch = '\0';
1183 					if (*next == kRegExpExactly)
1184 						nextch = *Operand(next);
1185 					min = (*scan == kRegExpStar) ? 0 : 1;
1186 					save = fStringInputPointer;
1187 					no = Repeat(Operand(scan));
1188 					while (no >= min) {
1189 						// if it could work, try it
1190 						if (nextch == '\0' || *fStringInputPointer == nextch) {
1191 							if (Match(next))
1192 								return 1;
1193 						}
1194 						// couldn't or didn't, back up
1195 						no--;
1196 						fStringInputPointer = save + no;
1197 					}
1198 					return 0;
1199 				}
1200 				break;
1201 
1202 			case kRegExpEnd:
1203 				return 1;
1204 					// Success!
1205 
1206 			default:
1207 				SetError(REGEXP_MEMORY_CORRUPTION);
1208 				return 0;
1209 			}
1210 
1211 		scan = next;
1212 	}
1213 
1214 	//
1215 	// We get here only if there's trouble -- normally "case kRegExpEnd" is
1216 	// the terminating point.
1217 	//
1218 	SetError(REGEXP_CORRUPTED_POINTERS);
1219 
1220 	return 0;
1221 }
1222 
1223 
1224 //
1225 // - Repeat - repeatedly match something simple, report how many
1226 //
1227 int32
1228 RegExp::Repeat(const char* p) const
1229 {
1230 	int32 count = 0;
1231 	const char* scan;
1232 	const char* opnd;
1233 
1234 	scan = fStringInputPointer;
1235 	opnd = Operand(p);
1236 	switch (*p) {
1237 		case kRegExpAny:
1238 			count = (int32)strlen(scan);
1239 			scan += count;
1240 			break;
1241 
1242 		case kRegExpExactly:
1243 			while (*opnd == *scan) {
1244 				count++;
1245 				scan++;
1246 			}
1247 			break;
1248 
1249 		case kRegExpAnyOf:
1250 			while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1251 				count++;
1252 				scan++;
1253 			}
1254 			break;
1255 
1256 		case kRegExpAnyBut:
1257 			while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1258 				count++;
1259 				scan++;
1260 			}
1261 			break;
1262 
1263 		default:
1264 			// oh dear, called inappropriately
1265 			SetError(REGEXP_INTERNAL_ERROR);
1266 			count = 0;
1267 				// best compromise
1268 			break;
1269 	}
1270 	fStringInputPointer = scan;
1271 
1272 	return count;
1273 }
1274 
1275 
1276 //
1277 // - Next - dig the "next" pointer out of a node
1278 //
1279 char*
1280 RegExp::Next(char* p)
1281 {
1282 	int32 offset;
1283 
1284 	if (p == &fDummy)
1285 		return NULL;
1286 
1287 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1288 	if (offset == 0)
1289 		return NULL;
1290 
1291 	if (*p == kRegExpBack)
1292 		return p - offset;
1293 	else
1294 		return p + offset;
1295 }
1296 
1297 
1298 const char*
1299 RegExp::Next(const char* p) const
1300 {
1301 	int32 offset;
1302 
1303 	if (p == &fDummy)
1304 		return NULL;
1305 
1306 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1307 	if (offset == 0)
1308 		return NULL;
1309 
1310 	if (*p == kRegExpBack)
1311 		return p - offset;
1312 	else
1313 		return p + offset;
1314 }
1315 
1316 
1317 inline int32
1318 RegExp::UCharAt(const char* p) const
1319 {
1320 	return (int32)*(unsigned char *)p;
1321 }
1322 
1323 
1324 inline char*
1325 RegExp::Operand(char* p) const
1326 {
1327 	return p + 3;
1328 }
1329 
1330 
1331 inline const char*
1332 RegExp::Operand(const char* p) const
1333 {
1334 	return p + 3;
1335 }
1336 
1337 
1338 inline bool
1339 RegExp::IsMult(char c) const
1340 {
1341 	return c == '*' || c == '+' || c == '?';
1342 }
1343 
1344 
1345 #ifdef DEBUG
1346 
1347 
1348 //
1349 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1350 //
1351 void
1352 RegExp::Dump()
1353 {
1354 	const char* s;
1355 	char op = kRegExpExactly;
1356 		// Arbitrary non-kRegExpEnd op.
1357 	const char* next;
1358 
1359 	s = fRegExp->program + 1;
1360 	while (op != kRegExpEnd) {
1361 		// While that wasn't kRegExpEnd last time...
1362 		op = *s;
1363 		printf("%2ld%s", s - fRegExp->program, Prop(s));
1364 			// Where, what.
1365 		next = Next(s);
1366 		if (next == NULL) {
1367 			// next ptr
1368 			printf("(0)");
1369 		} else
1370 			printf("(%ld)", (s - fRegExp->program) + (next - s));
1371 
1372 		s += 3;
1373 		if (op == kRegExpAnyOf || op == kRegExpAnyBut
1374 			|| op == kRegExpExactly) {
1375 			// literal string, where present
1376 			while (*s != '\0') {
1377 				putchar(*s);
1378 				s++;
1379 			}
1380 			s++;
1381 		}
1382 		putchar('\n');
1383 	}
1384 
1385 	// header fields of interest
1386 	if (fRegExp->regstart != '\0')
1387 		printf("start `%c' ", fRegExp->regstart);
1388 
1389 	if (fRegExp->reganch)
1390 		printf("anchored ");
1391 
1392 	if (fRegExp->regmust != NULL)
1393 		printf("must have \"%s\"", fRegExp->regmust);
1394 
1395 	printf("\n");
1396 }
1397 
1398 
1399 //
1400 // - Prop - printable representation of opcode
1401 //
1402 char*
1403 RegExp::Prop(const char* op) const
1404 {
1405 	const char* p = NULL;
1406 	static char buf[50];
1407 
1408 	strcpy(buf, ":");
1409 
1410 	switch (*op) {
1411 		case kRegExpBol:
1412 			p = "kRegExpBol";
1413 			break;
1414 
1415 		case kRegExpEol:
1416 			p = "kRegExpEol";
1417 			break;
1418 
1419 		case kRegExpAny:
1420 			p = "kRegExpAny";
1421 			break;
1422 
1423 		case kRegExpAnyOf:
1424 			p = "kRegExpAnyOf";
1425 			break;
1426 
1427 		case kRegExpAnyBut:
1428 			p = "kRegExpAnyBut";
1429 			break;
1430 
1431 		case kRegExpBranch:
1432 			p = "kRegExpBranch";
1433 			break;
1434 
1435 		case kRegExpExactly:
1436 			p = "kRegExpExactly";
1437 			break;
1438 
1439 		case kRegExpNothing:
1440 			p = "kRegExpNothing";
1441 			break;
1442 
1443 		case kRegExpBack:
1444 			p = "kRegExpBack";
1445 			break;
1446 
1447 		case kRegExpEnd:
1448 			p = "kRegExpEnd";
1449 			break;
1450 
1451 		case kRegExpOpen + 1:
1452 		case kRegExpOpen + 2:
1453 		case kRegExpOpen + 3:
1454 		case kRegExpOpen + 4:
1455 		case kRegExpOpen + 5:
1456 		case kRegExpOpen + 6:
1457 		case kRegExpOpen + 7:
1458 		case kRegExpOpen + 8:
1459 		case kRegExpOpen + 9:
1460 			sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1461 			p = NULL;
1462 			break;
1463 
1464 		case kRegExpClose + 1:
1465 		case kRegExpClose + 2:
1466 		case kRegExpClose + 3:
1467 		case kRegExpClose + 4:
1468 		case kRegExpClose + 5:
1469 		case kRegExpClose + 6:
1470 		case kRegExpClose + 7:
1471 		case kRegExpClose + 8:
1472 		case kRegExpClose + 9:
1473 			sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1474 			p = NULL;
1475 			break;
1476 
1477 		case kRegExpStar:
1478 			p = "kRegExpStar";
1479 			break;
1480 
1481 		case kRegExpPlus:
1482 			p = "kRegExpPlus";
1483 			break;
1484 
1485 		default:
1486 			RegExpError("corrupted opcode");
1487 			break;
1488 	}
1489 
1490 	if (p != NULL)
1491 		strcat(buf, p);
1492 
1493 	return buf;
1494 }
1495 
1496 
1497 void
1498 RegExp::RegExpError(const char*) const
1499 {
1500 	// does nothing now, perhaps it should printf?
1501 }
1502 
1503 #endif	// DEBUG
1504