xref: /haiku/src/kits/tracker/RegExp.cpp (revision 5e96d7d537fbec23bad4ae9b4c8e7b02e769f0c6)
1 /*
2 Open Tracker License
3 
4 Terms and Conditions
5 
6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
28 
29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
33 */
34 
35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
36 
37 // @(#)regexp.c	1.3 of 18 April 87
38 //
39 //	Copyright (c) 1986 by University of Toronto.
40 //	Written by Henry Spencer.  Not derived from licensed software.
41 //
42 //	Permission is granted to anyone to use this software for any
43 //	purpose on any computer system, and to redistribute it freely,
44 //	subject to the following restrictions:
45 //
46 //	1. The author is not responsible for the consequences of use of
47 //		this software, no matter how awful, even if they arise
48 //		from defects in it.
49 //
50 //	2. The origin of this software must not be misrepresented, either
51 //		by explicit claim or by omission.
52 //
53 //	3. Altered versions must be plainly marked as such, and must not
54 //		be misrepresented as being the original software.
55 //
56 // Beware that some of this code is subtly aware of the way operator
57 // precedence is structured in regular expressions.  Serious changes in
58 // regular-expression syntax might require a total rethink.
59 //
60 
61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62 // project (www.opentracker.org), Jul 11, 2000.
63 
64 
65 #include <stdlib.h>
66 #include <stdio.h>
67 #include <string.h>
68 
69 #include <Errors.h>
70 
71 #include "RegExp.h"
72 
73 
74 // The first byte of the regexp internal "program" is actually this magic
75 // number; the start node begins in the second byte.
76 
77 const uint8 kRegExpMagic = 0234;
78 
79 // The "internal use only" fields in RegExp.h are present to pass info from
80 // compile to execute that permits the execute phase to run lots faster on
81 // simple cases.  They are:
82 //
83 // regstart	char that must begin a match; '\0' if none obvious
84 // reganch	is the match anchored (at beginning-of-line only)?
85 // regmust	string (pointer into program) that match must include, or NULL
86 // regmlen	length of regmust string
87 //
88 // Regstart and reganch permit very fast decisions on suitable starting points
89 // for a match, cutting down the work a lot.  Regmust permits fast rejection
90 // of lines that cannot possibly match.  The regmust tests are costly enough
91 // that Compile() supplies a regmust only if the r.e. contains something
92 // potentially expensive (at present, the only such thing detected is * or +
93 // at the start of the r.e., which can involve a lot of backup). Regmlen is
94 // supplied because the test in RunMatcher() needs it and Compile() is
95 // computing it anyway.
96 //
97 //
98 //
99 // Structure for regexp "program".  This is essentially a linear encoding
100 // of a nondeterministic finite-state machine (aka syntax charts or
101 // "railroad normal form" in parsing technology).  Each node is an opcode
102 // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
103 // all nodes except kRegExpBranch implement concatenation; a "next" pointer
104 // with a kRegExpBranch on both ends of it is connecting two alternatives.
105 // (Here we have one of the subtle syntax dependencies:  an individual
106 // kRegExpBranch (as opposed to a collection of them) is never concatenated
107 // with anything because of operator precedence.)  The operand of some types
108 // of node is a literal string; for others, it is a node leading into a
109 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first
110 // node of the branch. (NB this is* not* a tree structure:  the tail of the
111 // branch connects to the thing following the set of kRegExpBranches).
112 // The opcodes are:
113 //
114 
115 // definition	number	opnd?	meaning
116 enum {
117 	kRegExpEnd = 0,		// no	End of program.
118 	kRegExpBol = 1,		// no	Match "" at beginning of line.
119 	kRegExpEol = 2,		// no	Match "" at end of line.
120 	kRegExpAny = 3,		// no	Match any one character.
121 	kRegExpAnyOf = 4,	// str	Match any character in this string.
122 	kRegExpAnyBut =	5,	// str	Match any character not in this string.
123 	kRegExpBranch =	6,	// node	Match this alternative, or the next...
124 	kRegExpBack = 7,	// no	Match "", "next" ptr points backward.
125 	kRegExpExactly = 8,	// str	Match this string.
126 	kRegExpNothing = 9,	// no	Match empty string.
127 	kRegExpStar = 10,	// node	Match this (simple) thing 0 or more times.
128 	kRegExpPlus = 11,	// node	Match this (simple) thing 1 or more times.
129 	kRegExpOpen	= 20,	// no	Mark this point in input as start of #n.
130 							//	kRegExpOpen + 1 is number 1, etc.
131 	kRegExpClose = 30	// no	Analogous to kRegExpOpen.
132 };
133 
134 //
135 // Opcode notes:
136 //
137 // kRegExpBranch	The set of branches constituting a single choice are
138 //		hooked together with their "next" pointers, since precedence prevents
139 //		anything being concatenated to any individual branch.  The
140 //		"next" pointer of the last kRegExpBranch in a choice points to the
141 //		thing following the whole choice.  This is also where the
142 //		final "next" pointer of each individual branch points; each
143 //		branch starts with the operand node of a kRegExpBranch node.
144 //
145 // kRegExpBack		Normal "next" pointers all implicitly point forward;
146 //		kRegExpBack exists to make loop structures possible.
147 //
148 // kRegExpStar,kRegExpPlus	'?', and complex '*' and '+', are implemented as
149 //		circular kRegExpBranch structures using kRegExpBack.  Simple cases
150 //		(one character per match) are implemented with kRegExpStar and
151 //		kRegExpPlus for speed and to minimize recursive plunges.
152 //
153 // kRegExpOpen,kRegExpClose	...are numbered at compile time.
154 //
155 //
156 //
157 // A node is one char of opcode followed by two chars of "next" pointer.
158 // "Next" pointers are stored as two 8-bit pieces, high order first.  The
159 // value is a positive offset from the opcode of the node containing it.
160 // An operand, if any, simply follows the node.  (Note that much of the
161 // code generation knows about this implicit relationship.)
162 //
163 // Using two bytes for the "next" pointer is vast overkill for most things,
164 // but allows patterns to get big without disasters.
165 //
166 
167 const char* kMeta = "^$.[()|?+*\\";
168 const int32 kMaxSize = 32767L;
169 	// Probably could be 65535L.
170 
171 // Flags to be passed up and down:
172 enum {
173 	kHasWidth =	01,	// Known never to match null string.
174 	kSimple = 02,	// Simple enough to be kRegExpStar/kRegExpPlus operand.
175 	kSPStart = 04,	// Starts with * or +.
176 	kWorst = 0	// Worst case.
177 };
178 
179 const char* kRegExpErrorStringArray[] = {
180 	"Unmatched parenthesis.",
181 	"Expression too long.",
182 	"Too many parenthesis.",
183 	"Junk on end.",
184 	"*+? operand may be empty.",
185 	"Nested *?+.",
186 	"Invalid bracket range.",
187 	"Unmatched brackets.",
188 	"Internal error.",
189 	"?+* follows nothing.",
190 	"Trailing \\.",
191 	"Corrupted expression.",
192 	"Memory corruption.",
193 	"Corrupted pointers.",
194 	"Corrupted opcode."
195 };
196 
197 #ifdef DEBUG
198 int32 regnarrate = 0;
199 #endif
200 
201 RegExp::RegExp()
202 	:	fError(B_OK),
203 		fRegExp(NULL)
204 {
205 }
206 
207 
208 RegExp::RegExp(const char* pattern)
209 	:	fError(B_OK),
210 		fRegExp(NULL)
211 {
212 	fRegExp = Compile(pattern);
213 }
214 
215 
216 RegExp::RegExp(const BString &pattern)
217 	:	fError(B_OK),
218 		fRegExp(NULL)
219 {
220 	fRegExp = Compile(pattern.String());
221 }
222 
223 
224 RegExp::~RegExp()
225 {
226 	free(fRegExp);
227 }
228 
229 
230 status_t
231 RegExp::InitCheck() const
232 {
233 	return fError;
234 }
235 
236 
237 status_t
238 RegExp::SetTo(const char* pattern)
239 {
240 	fError = B_OK;
241 	free(fRegExp);
242 	fRegExp = Compile(pattern);
243 	return fError;
244 }
245 
246 
247 status_t
248 RegExp::SetTo(const BString &pattern)
249 {
250 	fError = B_OK;
251 	free(fRegExp);
252 	fRegExp = Compile(pattern.String());
253 	return fError;
254 }
255 
256 
257 bool
258 RegExp::Matches(const char* string) const
259 {
260 	if (!fRegExp || !string)
261 		return false;
262 
263 	return RunMatcher(fRegExp, string) == 1;
264 }
265 
266 
267 bool
268 RegExp::Matches(const BString &string) const
269 {
270 	if (!fRegExp)
271 		return false;
272 
273 	return RunMatcher(fRegExp, string.String()) == 1;
274 }
275 
276 
277 //
278 // - Compile - compile a regular expression into internal code
279 //
280 // We can't allocate space until we know how big the compiled form will be,
281 // but we can't compile it (and thus know how big it is) until we've got a
282 // place to put the code.  So we cheat:  we compile it twice, once with code
283 // generation turned off and size counting turned on, and once "for real".
284 // This also means that we don't allocate space until we are sure that the
285 // thing really will compile successfully, and we never have to move the
286 // code and thus invalidate pointers into it.  (Note that it has to be in
287 // one piece because free() must be able to free it all.)
288 //
289 // Beware that the optimization-preparation code in here knows about some
290 // of the structure of the compiled regexp.
291 regexp*
292 RegExp::Compile(const char* exp)
293 {
294 	regexp* r;
295 	const char* scan;
296 	const char* longest;
297 	int32 len;
298 	int32 flags;
299 
300 	if (exp == NULL) {
301 		SetError(B_BAD_VALUE);
302 		return NULL;
303 	}
304 
305 	// First pass: determine size, legality.
306 	fInputScanPointer = exp;
307 	fParenthesisCount = 1;
308 	fCodeSize = 0L;
309 	fCodeEmitPointer = &fDummy;
310 	Char(kRegExpMagic);
311 	if (Reg(0, &flags) == NULL)
312 		return NULL;
313 
314 	// Small enough for pointer-storage convention?
315 	if (fCodeSize >= kMaxSize) {
316 		SetError(REGEXP_TOO_BIG);
317 		return NULL;
318 	}
319 
320 	r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
321 		// Allocate space
322 
323 	if (!r) {
324 		SetError(B_NO_MEMORY);
325 		return NULL;
326 	}
327 
328 	// Second pass: emit code.
329 	fInputScanPointer = exp;
330 	fParenthesisCount = 1;
331 	fCodeEmitPointer = r->program;
332 	Char(kRegExpMagic);
333 	if (Reg(0, &flags) == NULL) {
334 		free(r);
335 		return NULL;
336 	}
337 
338 	// Dig out information for optimizations.
339 	r->regstart = '\0';	// Worst-case defaults.
340 	r->reganch = 0;
341 	r->regmust = NULL;
342 	r->regmlen = 0;
343 	scan = r->program + 1;
344 		// First kRegExpBranch.
345 	if (*Next((char*)scan) == kRegExpEnd) {
346 		// Only one top-level choice.
347 		scan = Operand(scan);
348 
349 		// Starting-point info.
350 		if (*scan == kRegExpExactly)
351 			r->regstart = *Operand(scan);
352 		else if (*scan == kRegExpBol)
353 			r->reganch++;
354 
355 		// If there's something expensive in the r.e., find the
356 		// longest literal string that must appear and make it the
357 		// regmust.  Resolve ties in favor of later strings, since
358 		// the regstart check works with the beginning of the r.e.
359 		// and avoiding duplication strengthens checking.  Not a
360 		// strong reason, but sufficient in the absence of others.
361 		if (flags&kSPStart) {
362 			longest = NULL;
363 			len = 0;
364 			for (; scan != NULL; scan = Next((char*)scan))
365 				if (*scan == kRegExpExactly
366 					&& (int32)strlen(Operand(scan)) >= len) {
367 					longest = Operand(scan);
368 					len = (int32)strlen(Operand(scan));
369 				}
370 			r->regmust = longest;
371 			r->regmlen = len;
372 		}
373 	}
374 
375 	return r;
376 }
377 
378 
379 regexp*
380 RegExp::Expression() const
381 {
382 	return fRegExp;
383 }
384 
385 
386 const char*
387 RegExp::ErrorString() const
388 {
389 	if (fError >= REGEXP_UNMATCHED_PARENTHESIS
390 		&& fError <= REGEXP_CORRUPTED_OPCODE)
391 		return kRegExpErrorStringArray[fError - B_ERRORS_END];
392 
393 	return strerror(fError);
394 }
395 
396 
397 void
398 RegExp::SetError(status_t error) const
399 {
400 	fError = error;
401 }
402 
403 
404 //
405 // - Reg - regular expression, i.e. main body or parenthesized thing
406 //
407 // Caller must absorb opening parenthesis.
408 //
409 // Combining parenthesis handling with the base level of regular expression
410 // is a trifle forced, but the need to tie the tails of the branches to what
411 // follows makes it hard to avoid.
412 //
413 char*
414 RegExp::Reg(int32 paren, int32* flagp)
415 {
416 	char* ret;
417 	char* br;
418 	char* ender;
419 	int32 parno = 0;
420 	int32 flags;
421 
422 	*flagp = kHasWidth;	// Tentatively.
423 
424 	// Make an kRegExpOpen node, if parenthesized.
425 	if (paren) {
426 		if (fParenthesisCount >= kSubExpressionMax) {
427 			SetError(REGEXP_TOO_MANY_PARENTHESIS);
428 			return NULL;
429 		}
430 		parno = fParenthesisCount;
431 		fParenthesisCount++;
432 		ret = Node((char)(kRegExpOpen + parno));
433 	} else
434 		ret = NULL;
435 
436 	// Pick up the branches, linking them together.
437 	br = Branch(&flags);
438 	if (br == NULL)
439 		return NULL;
440 	if (ret != NULL)
441 		Tail(ret, br);	// kRegExpOpen -> first
442 	else
443 		ret = br;
444 	if (!(flags & kHasWidth))
445 		*flagp &= ~kHasWidth;
446 	*flagp |= flags&kSPStart;
447 	while (*fInputScanPointer == '|') {
448 		fInputScanPointer++;
449 		br = Branch(&flags);
450 		if (br == NULL)
451 			return NULL;
452 		Tail(ret, br);	// kRegExpBranch -> kRegExpBranch.
453 		if (!(flags & kHasWidth))
454 			*flagp &= ~kHasWidth;
455 		*flagp |= flags&kSPStart;
456 	}
457 
458 	// Make a closing node, and hook it on the end.
459 	ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
460 	Tail(ret, ender);
461 
462 	// Hook the tails of the branches to the closing node.
463 	for (br = ret; br != NULL; br = Next(br))
464 		OpTail(br, ender);
465 
466 	// Check for proper termination.
467 	if (paren && *fInputScanPointer++ != ')') {
468 		SetError(REGEXP_UNMATCHED_PARENTHESIS);
469 		return NULL;
470 	} else if (!paren && *fInputScanPointer != '\0') {
471 		if (*fInputScanPointer == ')') {
472 			SetError(REGEXP_UNMATCHED_PARENTHESIS);
473 			return NULL;
474 		} else {
475 			SetError(REGEXP_JUNK_ON_END);
476 			return NULL;	//  "Can't happen".
477 		}
478 		// NOTREACHED
479 	}
480 
481 	return ret;
482 }
483 
484 
485 //
486 // - Branch - one alternative of an | operator
487 //
488 // Implements the concatenation operator.
489 //
490 char*
491 RegExp::Branch(int32* flagp)
492 {
493 	char* ret;
494 	char* chain;
495 	char* latest;
496 	int32 flags;
497 
498 	*flagp = kWorst;		// Tentatively.
499 
500 	ret = Node(kRegExpBranch);
501 	chain = NULL;
502 	while (*fInputScanPointer != '\0'
503 		&& *fInputScanPointer != '|'
504 		&& *fInputScanPointer != ')') {
505 		latest = Piece(&flags);
506 		if (latest == NULL)
507 			return NULL;
508 		*flagp |= flags & kHasWidth;
509 		if (chain == NULL)	// First piece.
510 			*flagp |= flags & kSPStart;
511 		else
512 			Tail(chain, latest);
513 		chain = latest;
514 	}
515 	if (chain == NULL)	// Loop ran zero times.
516 		Node(kRegExpNothing);
517 
518 	return ret;
519 }
520 
521 
522 //
523 // - Piece - something followed by possible [*+?]
524 //
525 // Note that the branching code sequences used for ? and the general cases
526 // of * and + are somewhat optimized:  they use the same kRegExpNothing node
527 // as both the endmarker for their branch list and the body of the last
528 // branch. It might seem that this node could be dispensed with entirely,
529 // but the endmarker role is not redundant.
530 //
531 char*
532 RegExp::Piece(int32* flagp)
533 {
534 	char* ret;
535 	char op;
536 	char* next;
537 	int32 flags;
538 
539 	ret = Atom(&flags);
540 	if (ret == NULL)
541 		return NULL;
542 
543 	op = *fInputScanPointer;
544 	if (!IsMult(op)) {
545 		*flagp = flags;
546 		return ret;
547 	}
548 
549 	if (!(flags & kHasWidth) && op != '?') {
550 		SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
551 		return NULL;
552 	}
553 	*flagp = op != '+' ? kWorst | kSPStart :  kWorst | kHasWidth;
554 
555 	if (op == '*' && (flags & kSimple))
556 		Insert(kRegExpStar, ret);
557 	else if (op == '*') {
558 		// Emit x* as (x&|), where & means "self".
559 		Insert(kRegExpBranch, ret);				// Either x
560 		OpTail(ret, Node(kRegExpBack));		// and loop
561 		OpTail(ret, ret);				// back
562 		Tail(ret, Node(kRegExpBranch));		// or
563 		Tail(ret, Node(kRegExpNothing));		// null.
564 	} else if (op == '+' && (flags & kSimple))
565 		Insert(kRegExpPlus, ret);
566 	else if (op == '+') {
567 		// Emit x+ as x(&|), where & means "self".
568 		next = Node(kRegExpBranch);				// Either
569 		Tail(ret, next);
570 		Tail(Node(kRegExpBack), ret);		// loop back
571 		Tail(next, Node(kRegExpBranch));		// or
572 		Tail(ret, Node(kRegExpNothing));		// null.
573 	} else if (op == '?') {
574 		// Emit x? as (x|)
575 		Insert(kRegExpBranch, ret);			// Either x
576 		Tail(ret, Node(kRegExpBranch));	// or
577 		next = Node(kRegExpNothing);		// null.
578 		Tail(ret, next);
579 		OpTail(ret, next);
580 	}
581 	fInputScanPointer++;
582 	if (IsMult(*fInputScanPointer)) {
583 		SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
584 		return NULL;
585 	}
586 	return ret;
587 }
588 
589 
590 //
591 // - Atom - the lowest level
592 //
593 // Optimization:  gobbles an entire sequence of ordinary characters so that
594 // it can turn them into a single node, which is smaller to store and
595 // faster to run.  Backslashed characters are exceptions, each becoming a
596 // separate node; the code is simpler that way and it's not worth fixing.
597 //
598 char*
599 RegExp::Atom(int32* flagp)
600 {
601 	char* ret;
602 	int32 flags;
603 
604 	*flagp = kWorst;		// Tentatively.
605 
606 	switch (*fInputScanPointer++) {
607 		case '^':
608 			ret = Node(kRegExpBol);
609 			break;
610 		case '$':
611 			ret = Node(kRegExpEol);
612 			break;
613 		case '.':
614 			ret = Node(kRegExpAny);
615 			*flagp |= kHasWidth|kSimple;
616 			break;
617 		case '[':
618 			{
619 				int32 cclass;
620 				int32 classend;
621 
622 				if (*fInputScanPointer == '^') {	// Complement of range.
623 					ret = Node(kRegExpAnyBut);
624 					fInputScanPointer++;
625 				} else
626 					ret = Node(kRegExpAnyOf);
627 				if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
628 					Char(*fInputScanPointer++);
629 				while (*fInputScanPointer != '\0'
630 					&& *fInputScanPointer != ']') {
631 					if (*fInputScanPointer == '-') {
632 						fInputScanPointer++;
633 						if (*fInputScanPointer == ']'
634 							|| *fInputScanPointer == '\0') {
635 							Char('-');
636 						} else {
637 							cclass = UCharAt(fInputScanPointer - 2) + 1;
638 							classend = UCharAt(fInputScanPointer);
639 							if (cclass > classend + 1) {
640 								SetError(REGEXP_INVALID_BRACKET_RANGE);
641 								return NULL;
642 							}
643 							for (; cclass <= classend; cclass++)
644 								Char((char)cclass);
645 							fInputScanPointer++;
646 						}
647 					} else
648 						Char(*fInputScanPointer++);
649 				}
650 				Char('\0');
651 				if (*fInputScanPointer != ']') {
652 					SetError(REGEXP_UNMATCHED_BRACKET);
653 					return NULL;
654 				}
655 				fInputScanPointer++;
656 				*flagp |= kHasWidth | kSimple;
657 			}
658 			break;
659 		case '(':
660 			ret = Reg(1, &flags);
661 			if (ret == NULL)
662 				return NULL;
663 			*flagp |= flags & (kHasWidth | kSPStart);
664 			break;
665 		case '\0':
666 		case '|':
667 		case ')':
668 			SetError(REGEXP_INTERNAL_ERROR);
669 			return NULL; //  Supposed to be caught earlier.
670 		case '?':
671 		case '+':
672 		case '*':
673 			SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
674 			return NULL;
675 		case '\\':
676 			if (*fInputScanPointer == '\0') {
677 				SetError(REGEXP_TRAILING_BACKSLASH);
678 				return NULL;
679 			}
680 			ret = Node(kRegExpExactly);
681 			Char(*fInputScanPointer++);
682 			Char('\0');
683 			*flagp |= kHasWidth|kSimple;
684 			break;
685 		default:
686 		{
687 			int32 len;
688 			char ender;
689 
690 			fInputScanPointer--;
691 			len = (int32)strcspn(fInputScanPointer, kMeta);
692 			if (len <= 0) {
693 				SetError(REGEXP_INTERNAL_ERROR);
694 				return NULL;
695 			}
696 
697 			ender = *(fInputScanPointer + len);
698 			if (len > 1 && IsMult(ender))
699 				len--;		// Back off clear of ?+* operand.
700 
701 			*flagp |= kHasWidth;
702 			if (len == 1)
703 				*flagp |= kSimple;
704 
705 			ret = Node(kRegExpExactly);
706 			while (len > 0) {
707 				Char(*fInputScanPointer++);
708 				len--;
709 			}
710 
711 			Char('\0');
712 			break;
713 		}
714 	}
715 
716 	return ret;
717 }
718 
719 
720 //
721 // - Node - emit a node
722 //
723 char*			// Location.
724 RegExp::Node(char op)
725 {
726 	char* ret;
727 	char* ptr;
728 
729 	ret = fCodeEmitPointer;
730 	if (ret == &fDummy) {
731 		fCodeSize += 3;
732 		return ret;
733 	}
734 
735 	ptr = ret;
736 	*ptr++ = op;
737 	*ptr++ = '\0';		// Null "next" pointer.
738 	*ptr++ = '\0';
739 	fCodeEmitPointer = ptr;
740 
741 	return ret;
742 }
743 
744 
745 //
746 // - Char - emit (if appropriate) a byte of code
747 //
748 void
749 RegExp::Char(char b)
750 {
751 	if (fCodeEmitPointer != &fDummy)
752 		*fCodeEmitPointer++ = b;
753 	else
754 		fCodeSize++;
755 }
756 
757 
758 //
759 // - Insert - insert an operator in front of already-emitted operand
760 //
761 // Means relocating the operand.
762 //
763 void
764 RegExp::Insert(char op, char* opnd)
765 {
766 	char* src;
767 	char* dst;
768 	char* place;
769 
770 	if (fCodeEmitPointer == &fDummy) {
771 		fCodeSize += 3;
772 		return;
773 	}
774 
775 	src = fCodeEmitPointer;
776 	fCodeEmitPointer += 3;
777 	dst = fCodeEmitPointer;
778 	while (src > opnd)
779 		*--dst = *--src;
780 
781 	place = opnd;		// Op node, where operand used to be.
782 	*place++ = op;
783 	*place++ = '\0';
784 	*place++ = '\0';
785 }
786 
787 
788 //
789 // - Tail - set the next-pointer at the end of a node chain
790 //
791 void
792 RegExp::Tail(char* p, char* val)
793 {
794 	char* scan;
795 	char* temp;
796 	int32 offset;
797 
798 	if (p == &fDummy)
799 		return;
800 
801 	// Find last node.
802 	scan = p;
803 	for (;;) {
804 		temp = Next(scan);
805 		if (temp == NULL)
806 			break;
807 		scan = temp;
808 	}
809 
810 	if (scan[0] == kRegExpBack)
811 		offset = scan - val;
812 	else
813 		offset = val - scan;
814 
815 	scan[1] = (char)((offset >> 8) & 0377);
816 	scan[2] = (char)(offset & 0377);
817 }
818 
819 
820 //
821 // - OpTail - Tail on operand of first argument; nop if operandless
822 //
823 void
824 RegExp::OpTail(char* p, char* val)
825 {
826 	// "Operandless" and "op != kRegExpBranch" are synonymous in practice.
827 	if (p == NULL || p == &fDummy || *p != kRegExpBranch)
828 		return;
829 	Tail(Operand(p), val);
830 }
831 
832 //
833 // RunMatcher and friends
834 //
835 
836 
837 //
838 // - RunMatcher - match a regexp against a string
839 //
840 int32
841 RegExp::RunMatcher(regexp* prog, const char* string) const
842 {
843 	const char* s;
844 
845 	// Be paranoid...
846 	if (prog == NULL || string == NULL) {
847 		SetError(B_BAD_VALUE);
848 		return 0;
849 	}
850 
851 	// Check validity of program.
852 	if (UCharAt(prog->program) != kRegExpMagic) {
853 		SetError(REGEXP_CORRUPTED_PROGRAM);
854 		return 0;
855 	}
856 
857 	// If there is a "must appear" string, look for it.
858 	if (prog->regmust != NULL) {
859 		s = string;
860 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
861 			if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0)
862 				break;	// Found it.
863 			s++;
864 		}
865 		if (s == NULL)	// Not present.
866 			return 0;
867 	}
868 
869 	// Mark beginning of line for ^ .
870 	fRegBol = string;
871 
872 	// Simplest case:  anchored match need be tried only once.
873 	if (prog->reganch)
874 		return Try(prog, (char*)string);
875 
876 	// Messy cases:  unanchored match.
877 	s = string;
878 	if (prog->regstart != '\0')
879 		// We know what char it must start with.
880 		while ((s = strchr(s, prog->regstart)) != NULL) {
881 			if (Try(prog, (char*)s))
882 				return 1;
883 			s++;
884 		}
885 	else
886 		// We don't -- general case.
887 		do {
888 			if (Try(prog, (char*)s))
889 				return 1;
890 		} while (*s++ != '\0');
891 
892 	// Failure.
893 	return 0;
894 }
895 
896 
897 //
898 // - Try - try match at specific point
899 //
900 int32			// 0 failure, 1 success
901 RegExp::Try(regexp* prog, const char* string) const
902 {
903 	int32 i;
904 	const char **sp;
905 	const char **ep;
906 
907 	fStringInputPointer = string;
908 	fStartPArrayPointer = prog->startp;
909 	fEndPArrayPointer = prog->endp;
910 
911 	sp = prog->startp;
912 	ep = prog->endp;
913 	for (i = kSubExpressionMax; i > 0; i--) {
914 		*sp++ = NULL;
915 		*ep++ = NULL;
916 	}
917 	if (Match(prog->program + 1)) {
918 		prog->startp[0] = string;
919 		prog->endp[0] = fStringInputPointer;
920 		return 1;
921 	} else
922 		return 0;
923 }
924 
925 
926 //
927 // - Match - main matching routine
928 //
929 // Conceptually the strategy is simple:  check to see whether the current
930 // node matches, call self recursively to see whether the rest matches,
931 // and then act accordingly.  In practice we make some effort to avoid
932 // recursion, in particular by going through "ordinary" nodes (that don't
933 // need to know whether the rest of the match failed) by a loop instead of
934 // by recursion.
935 ///
936 int32			// 0 failure, 1 success
937 RegExp::Match(const char* prog) const
938 {
939 	const char* scan;	// Current node.
940 	const char* next;		// Next node.
941 
942 	scan = prog;
943 #ifdef DEBUG
944 	if (scan != NULL && regnarrate)
945 		fprintf(stderr, "%s(\n", Prop(scan));
946 #endif
947 	while (scan != NULL) {
948 #ifdef DEBUG
949 		if (regnarrate)
950 			fprintf(stderr, "%s...\n", Prop(scan));
951 #endif
952 		next = Next(scan);
953 
954 		switch (*scan) {
955 			case kRegExpBol:
956 				if (fStringInputPointer != fRegBol)
957 					return 0;
958 				break;
959 			case kRegExpEol:
960 				if (*fStringInputPointer != '\0')
961 					return 0;
962 				break;
963 			case kRegExpAny:
964 				if (*fStringInputPointer == '\0')
965 					return 0;
966 				fStringInputPointer++;
967 				break;
968 			case kRegExpExactly:
969 				{
970 					const char* opnd = Operand(scan);
971 					// Inline the first character, for speed.
972 					if (*opnd != *fStringInputPointer)
973 						return 0;
974 
975 					uint32 len = strlen(opnd);
976 					if (len > 1
977 						&& strncmp(opnd, fStringInputPointer, len) != 0) {
978 						return 0;
979 					}
980 
981 					fStringInputPointer += len;
982 				}
983 				break;
984 			case kRegExpAnyOf:
985 				if (*fStringInputPointer == '\0'
986 					|| strchr(Operand(scan), *fStringInputPointer) == NULL)
987 					return 0;
988 				fStringInputPointer++;
989 				break;
990 			case kRegExpAnyBut:
991 				if (*fStringInputPointer == '\0'
992 					|| strchr(Operand(scan), *fStringInputPointer) != NULL)
993 					return 0;
994 				fStringInputPointer++;
995 				break;
996 			case kRegExpNothing:
997 				break;
998 			case kRegExpBack:
999 				break;
1000 			case kRegExpOpen + 1:
1001 			case kRegExpOpen + 2:
1002 			case kRegExpOpen + 3:
1003 			case kRegExpOpen + 4:
1004 			case kRegExpOpen + 5:
1005 			case kRegExpOpen + 6:
1006 			case kRegExpOpen + 7:
1007 			case kRegExpOpen + 8:
1008 			case kRegExpOpen + 9:
1009 				{
1010 					int32 no;
1011 					const char* save;
1012 
1013 					no = *scan - kRegExpOpen;
1014 					save = fStringInputPointer;
1015 
1016 					if (Match(next)) {
1017 						//
1018 						// Don't set startp if some later
1019 						// invocation of the same parentheses
1020 						// already has.
1021 						//
1022 						if (fStartPArrayPointer[no] == NULL)
1023 							fStartPArrayPointer[no] = save;
1024 						return 1;
1025 					} else
1026 						return 0;
1027 				}
1028 				break;
1029 			case kRegExpClose + 1:
1030 			case kRegExpClose + 2:
1031 			case kRegExpClose + 3:
1032 			case kRegExpClose + 4:
1033 			case kRegExpClose + 5:
1034 			case kRegExpClose + 6:
1035 			case kRegExpClose + 7:
1036 			case kRegExpClose + 8:
1037 			case kRegExpClose + 9:
1038 				{
1039 					int32 no;
1040 					const char* save;
1041 
1042 					no = *scan - kRegExpClose;
1043 					save = fStringInputPointer;
1044 
1045 					if (Match(next)) {
1046 						//
1047 						// Don't set endp if some later
1048 						// invocation of the same parentheses
1049 						// already has.
1050 						//
1051 						if (fEndPArrayPointer[no] == NULL)
1052 							fEndPArrayPointer[no] = save;
1053 						return 1;
1054 					} else
1055 						return 0;
1056 				}
1057 				break;
1058 			case kRegExpBranch:
1059 				{
1060 					const char* save;
1061 
1062 					if (*next != kRegExpBranch)		// No choice.
1063 						next = Operand(scan);	// Avoid recursion.
1064 					else {
1065 						do {
1066 							save = fStringInputPointer;
1067 							if (Match(Operand(scan)))
1068 								return 1;
1069 							fStringInputPointer = save;
1070 							scan = Next(scan);
1071 						} while (scan != NULL && *scan == kRegExpBranch);
1072 						return 0;
1073 						// NOTREACHED/
1074 					}
1075 				}
1076 				break;
1077 			case kRegExpStar:
1078 			case kRegExpPlus:
1079 				{
1080 					char nextch;
1081 					int32 no;
1082 					const char* save;
1083 					int32 min;
1084 
1085 					//
1086 					//Lookahead to avoid useless match attempts
1087 					// when we know what character comes next.
1088 					//
1089 					nextch = '\0';
1090 					if (*next == kRegExpExactly)
1091 						nextch = *Operand(next);
1092 					min = (*scan == kRegExpStar) ? 0 : 1;
1093 					save = fStringInputPointer;
1094 					no = Repeat(Operand(scan));
1095 					while (no >= min) {
1096 						// If it could work, try it.
1097 						if (nextch == '\0' || *fStringInputPointer == nextch)
1098 							if (Match(next))
1099 								return 1;
1100 						// Couldn't or didn't -- back up.
1101 						no--;
1102 						fStringInputPointer = save + no;
1103 					}
1104 					return 0;
1105 				}
1106 				break;
1107 			case kRegExpEnd:
1108 				return 1;	// Success!
1109 
1110 			default:
1111 				SetError(REGEXP_MEMORY_CORRUPTION);
1112 				return 0;
1113 			}
1114 
1115 		scan = next;
1116 	}
1117 
1118 	//
1119 	// We get here only if there's trouble -- normally "case kRegExpEnd" is
1120 	// the terminating point.
1121 	//
1122 	SetError(REGEXP_CORRUPTED_POINTERS);
1123 	return 0;
1124 }
1125 
1126 
1127 //
1128 // - Repeat - repeatedly match something simple, report how many
1129 //
1130 int32
1131 RegExp::Repeat(const char* p) const
1132 {
1133 	int32 count = 0;
1134 	const char* scan;
1135 	const char* opnd;
1136 
1137 	scan = fStringInputPointer;
1138 	opnd = Operand(p);
1139 	switch (*p) {
1140 		case kRegExpAny:
1141 			count = (int32)strlen(scan);
1142 			scan += count;
1143 			break;
1144 
1145 		case kRegExpExactly:
1146 			while (*opnd == *scan) {
1147 				count++;
1148 				scan++;
1149 			}
1150 			break;
1151 
1152 		case kRegExpAnyOf:
1153 			while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1154 				count++;
1155 				scan++;
1156 			}
1157 			break;
1158 
1159 		case kRegExpAnyBut:
1160 			while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1161 				count++;
1162 				scan++;
1163 			}
1164 			break;
1165 
1166 		default:		// Oh dear.  Called inappropriately.
1167 			SetError(REGEXP_INTERNAL_ERROR);
1168 			count = 0;	// Best compromise.
1169 			break;
1170 	}
1171 	fStringInputPointer = scan;
1172 
1173 	return count;
1174 }
1175 
1176 
1177 //
1178 // - Next - dig the "next" pointer out of a node
1179 //
1180 char*
1181 RegExp::Next(char* p)
1182 {
1183 	int32 offset;
1184 
1185 	if (p == &fDummy)
1186 		return NULL;
1187 
1188 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1189 	if (offset == 0)
1190 		return NULL;
1191 
1192 	if (*p == kRegExpBack)
1193 		return p - offset;
1194 	else
1195 		return p + offset;
1196 }
1197 
1198 
1199 const char*
1200 RegExp::Next(const char* p) const
1201 {
1202 	int32 offset;
1203 
1204 	if (p == &fDummy)
1205 		return NULL;
1206 
1207 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1208 	if (offset == 0)
1209 		return NULL;
1210 
1211 	if (*p == kRegExpBack)
1212 		return p - offset;
1213 	else
1214 		return p + offset;
1215 }
1216 
1217 
1218 inline int32
1219 RegExp::UCharAt(const char* p) const
1220 {
1221 	return (int32)*(unsigned char *)p;
1222 }
1223 
1224 
1225 inline char*
1226 RegExp::Operand(char* p) const
1227 {
1228 	return p + 3;
1229 }
1230 
1231 
1232 inline const char*
1233 RegExp::Operand(const char* p) const
1234 {
1235 	return p + 3;
1236 }
1237 
1238 
1239 inline bool
1240 RegExp::IsMult(char c) const
1241 {
1242 	return c == '*' || c == '+' || c == '?';
1243 }
1244 
1245 
1246 #ifdef DEBUG
1247 
1248 
1249 //
1250 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1251 //
1252 void
1253 RegExp::Dump()
1254 {
1255 	const char* s;
1256 	char op = kRegExpExactly;	// Arbitrary non-kRegExpEnd op.
1257 	const char* next;
1258 
1259 	s = fRegExp->program + 1;
1260 	while (op != kRegExpEnd) {	// While that wasn't kRegExpEnd last time...
1261 		op = *s;
1262 		printf("%2ld%s", s - fRegExp->program, Prop(s));	// Where, what.
1263 		next = Next(s);
1264 		if (next == NULL)		// Next ptr.
1265 			printf("(0)");
1266 		else
1267 			printf("(%ld)", (s - fRegExp->program) + (next - s));
1268 		s += 3;
1269 		if (op == kRegExpAnyOf || op == kRegExpAnyBut
1270 			|| op == kRegExpExactly) {
1271 			// Literal string, where present.
1272 			while (*s != '\0') {
1273 				putchar(*s);
1274 				s++;
1275 			}
1276 			s++;
1277 		}
1278 		putchar('\n');
1279 	}
1280 
1281 	// Header fields of interest.
1282 	if (fRegExp->regstart != '\0')
1283 		printf("start `%c' ", fRegExp->regstart);
1284 	if (fRegExp->reganch)
1285 		printf("anchored ");
1286 	if (fRegExp->regmust != NULL)
1287 		printf("must have \"%s\"", fRegExp->regmust);
1288 	printf("\n");
1289 }
1290 
1291 
1292 //
1293 // - Prop - printable representation of opcode
1294 //
1295 char*
1296 RegExp::Prop(const char* op) const
1297 {
1298 	const char* p = NULL;
1299 	static char buf[50];
1300 
1301 	(void) strcpy(buf, ":");
1302 
1303 	switch (*op) {
1304 		case kRegExpBol:
1305 			p = "kRegExpBol";
1306 			break;
1307 		case kRegExpEol:
1308 			p = "kRegExpEol";
1309 			break;
1310 		case kRegExpAny:
1311 			p = "kRegExpAny";
1312 			break;
1313 		case kRegExpAnyOf:
1314 			p = "kRegExpAnyOf";
1315 			break;
1316 		case kRegExpAnyBut:
1317 			p = "kRegExpAnyBut";
1318 			break;
1319 		case kRegExpBranch:
1320 			p = "kRegExpBranch";
1321 			break;
1322 		case kRegExpExactly:
1323 			p = "kRegExpExactly";
1324 			break;
1325 		case kRegExpNothing:
1326 			p = "kRegExpNothing";
1327 			break;
1328 		case kRegExpBack:
1329 			p = "kRegExpBack";
1330 			break;
1331 		case kRegExpEnd:
1332 			p = "kRegExpEnd";
1333 			break;
1334 		case kRegExpOpen + 1:
1335 		case kRegExpOpen + 2:
1336 		case kRegExpOpen + 3:
1337 		case kRegExpOpen + 4:
1338 		case kRegExpOpen + 5:
1339 		case kRegExpOpen + 6:
1340 		case kRegExpOpen + 7:
1341 		case kRegExpOpen + 8:
1342 		case kRegExpOpen + 9:
1343 			sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1344 			p = NULL;
1345 			break;
1346 		case kRegExpClose + 1:
1347 		case kRegExpClose + 2:
1348 		case kRegExpClose + 3:
1349 		case kRegExpClose + 4:
1350 		case kRegExpClose + 5:
1351 		case kRegExpClose + 6:
1352 		case kRegExpClose + 7:
1353 		case kRegExpClose + 8:
1354 		case kRegExpClose + 9:
1355 			sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1356 			p = NULL;
1357 			break;
1358 		case kRegExpStar:
1359 			p = "kRegExpStar";
1360 			break;
1361 		case kRegExpPlus:
1362 			p = "kRegExpPlus";
1363 			break;
1364 		default:
1365 			RegExpError("corrupted opcode");
1366 			break;
1367 	}
1368 
1369 	if (p != NULL)
1370 		strcat(buf, p);
1371 
1372 	return buf;
1373 }
1374 
1375 
1376 void
1377 RegExp::RegExpError(const char*) const
1378 {
1379 	// does nothing now, perhaps it should printf?
1380 }
1381 
1382 #endif
1383