xref: /haiku/src/kits/tracker/RegExp.cpp (revision 9522c1e4f21b0292084467c927e4169770db1329)
1 /*
2 Open Tracker License
3 
4 Terms and Conditions
5 
6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
28 
29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
33 */
34 
35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
36 
37 // @(#)regexp.c	1.3 of 18 April 87
38 //
39 //	Copyright (c) 1986 by University of Toronto.
40 //	Written by Henry Spencer.  Not derived from licensed software.
41 //
42 //	Permission is granted to anyone to use this software for any
43 //	purpose on any computer system, and to redistribute it freely,
44 //	subject to the following restrictions:
45 //
46 //	1. The author is not responsible for the consequences of use of
47 //		this software, no matter how awful, even if they arise
48 //		from defects in it.
49 //
50 //	2. The origin of this software must not be misrepresented, either
51 //		by explicit claim or by omission.
52 //
53 //	3. Altered versions must be plainly marked as such, and must not
54 //		be misrepresented as being the original software.
55 //
56 // Beware that some of this code is subtly aware of the way operator
57 // precedence is structured in regular expressions.  Serious changes in
58 // regular-expression syntax might require a total rethink.
59 //
60 
61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62 // project (www.opentracker.org), Jul 11, 2000.
63 
64 
65 #include <stdlib.h>
66 #include <stdio.h>
67 #include <string.h>
68 
69 #include <Errors.h>
70 
71 #include "RegExp.h"
72 
73 
74 // The first byte of the regexp internal "program" is actually this magic
75 // number; the start node begins in the second byte.
76 
77 const uint8 kRegExpMagic = 0234;
78 
79 // The "internal use only" fields in RegExp.h are present to pass info from
80 // compile to execute that permits the execute phase to run lots faster on
81 // simple cases.  They are:
82 //
83 // regstart	char that must begin a match; '\0' if none obvious
84 // reganch	is the match anchored (at beginning-of-line only)?
85 // regmust	string (pointer into program) that match must include, or NULL
86 // regmlen	length of regmust string
87 //
88 // Regstart and reganch permit very fast decisions on suitable starting points
89 // for a match, cutting down the work a lot.  Regmust permits fast rejection
90 // of lines that cannot possibly match.  The regmust tests are costly enough
91 // that Compile() supplies a regmust only if the r.e. contains something
92 // potentially expensive (at present, the only such thing detected is * or +
93 // at the start of the r.e., which can involve a lot of backup). Regmlen is
94 // supplied because the test in RunMatcher() needs it and Compile() is
95 // computing it anyway.
96 //
97 //
98 //
99 // Structure for regexp "program".  This is essentially a linear encoding
100 // of a nondeterministic finite-state machine (aka syntax charts or
101 // "railroad normal form" in parsing technology).  Each node is an opcode
102 // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
103 // all nodes except kRegExpBranch implement concatenation; a "next" pointer
104 // with a kRegExpBranch on both ends of it is connecting two alternatives.
105 // (Here we have one of the subtle syntax dependencies:  an individual
106 // kRegExpBranch (as opposed to a collection of them) is never concatenated
107 // with anything because of operator precedence.)  The operand of some types
108 // of node is a literal string; for others, it is a node leading into a
109 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first
110 // node of the branch. (NB this is* not* a tree structure:  the tail of the
111 // branch connects to the thing following the set of kRegExpBranches).
112 // The opcodes are:
113 //
114 
115 // definition	number	opnd?	meaning
116 enum {
117 	kRegExpEnd = 0,		// no	End of program.
118 	kRegExpBol = 1,		// no	Match "" at beginning of line.
119 	kRegExpEol = 2,		// no	Match "" at end of line.
120 	kRegExpAny = 3,		// no	Match any one character.
121 	kRegExpAnyOf = 4,	// str	Match any character in this string.
122 	kRegExpAnyBut =	5,	// str	Match any character not in this string.
123 	kRegExpBranch =	6,	// node	Match this alternative, or the next...
124 	kRegExpBack = 7,	// no	Match "", "next" ptr points backward.
125 	kRegExpExactly = 8,	// str	Match this string.
126 	kRegExpNothing = 9,	// no	Match empty string.
127 	kRegExpStar = 10,	// node	Match this (simple) thing 0 or more times.
128 	kRegExpPlus = 11,	// node	Match this (simple) thing 1 or more times.
129 	kRegExpOpen	= 20,	// no	Mark this point in input as start of #n.
130 							//	kRegExpOpen + 1 is number 1, etc.
131 	kRegExpClose = 30	// no	Analogous to kRegExpOpen.
132 };
133 
134 //
135 // Opcode notes:
136 //
137 // kRegExpBranch	The set of branches constituting a single choice are
138 //		hooked together with their "next" pointers, since precedence prevents
139 //		anything being concatenated to any individual branch.  The
140 //		"next" pointer of the last kRegExpBranch in a choice points to the
141 //		thing following the whole choice.  This is also where the
142 //		final "next" pointer of each individual branch points; each
143 //		branch starts with the operand node of a kRegExpBranch node.
144 //
145 // kRegExpBack		Normal "next" pointers all implicitly point forward;
146 //		kRegExpBack exists to make loop structures possible.
147 //
148 // kRegExpStar,kRegExpPlus	'?', and complex '*' and '+', are implemented as
149 //		circular kRegExpBranch structures using kRegExpBack.  Simple cases
150 //		(one character per match) are implemented with kRegExpStar and
151 //		kRegExpPlus for speed and to minimize recursive plunges.
152 //
153 // kRegExpOpen,kRegExpClose	...are numbered at compile time.
154 //
155 //
156 //
157 // A node is one char of opcode followed by two chars of "next" pointer.
158 // "Next" pointers are stored as two 8-bit pieces, high order first.  The
159 // value is a positive offset from the opcode of the node containing it.
160 // An operand, if any, simply follows the node.  (Note that much of the
161 // code generation knows about this implicit relationship.)
162 //
163 // Using two bytes for the "next" pointer is vast overkill for most things,
164 // but allows patterns to get big without disasters.
165 //
166 
167 const char* kMeta = "^$.[()|?+*\\";
168 const int32 kMaxSize = 32767L;
169 	// Probably could be 65535L.
170 
171 
172 // Flags to be passed up and down:
173 enum {
174 	kHasWidth =	01,	// Known never to match null string.
175 	kSimple = 02,	// Simple enough to be kRegExpStar/kRegExpPlus operand.
176 	kSPStart = 04,	// Starts with * or +.
177 	kWorst = 0	// Worst case.
178 };
179 
180 
181 const char* kRegExpErrorStringArray[] = {
182 	"Unmatched parenthesis.",
183 	"Expression too long.",
184 	"Too many parenthesis.",
185 	"Junk on end.",
186 	"*+? operand may be empty.",
187 	"Nested *?+.",
188 	"Invalid bracket range.",
189 	"Unmatched brackets.",
190 	"Internal error.",
191 	"?+* follows nothing.",
192 	"Trailing \\.",
193 	"Corrupted expression.",
194 	"Memory corruption.",
195 	"Corrupted pointers.",
196 	"Corrupted opcode."
197 };
198 
199 
200 #ifdef DEBUG
201 int32 regnarrate = 0;
202 #endif
203 
204 
205 //	#pragma mark - RegExp
206 
207 
208 RegExp::RegExp()
209 	:
210 	fError(B_OK),
211 	fRegExp(NULL)
212 {
213 }
214 
215 
216 RegExp::RegExp(const char* pattern)
217 	:
218 	fError(B_OK),
219 	fRegExp(NULL)
220 {
221 	fRegExp = Compile(pattern);
222 }
223 
224 
225 RegExp::RegExp(const BString &pattern)
226 	:
227 	fError(B_OK),
228 	fRegExp(NULL)
229 {
230 	fRegExp = Compile(pattern.String());
231 }
232 
233 
234 RegExp::~RegExp()
235 {
236 	free(fRegExp);
237 }
238 
239 
240 status_t
241 RegExp::InitCheck() const
242 {
243 	return fError;
244 }
245 
246 
247 status_t
248 RegExp::SetTo(const char* pattern)
249 {
250 	fError = B_OK;
251 	free(fRegExp);
252 	fRegExp = Compile(pattern);
253 	return fError;
254 }
255 
256 
257 status_t
258 RegExp::SetTo(const BString &pattern)
259 {
260 	fError = B_OK;
261 	free(fRegExp);
262 	fRegExp = Compile(pattern.String());
263 	return fError;
264 }
265 
266 
267 bool
268 RegExp::Matches(const char* string) const
269 {
270 	if (fRegExp == NULL || string == NULL)
271 		return false;
272 
273 	return RunMatcher(fRegExp, string) == 1;
274 }
275 
276 
277 bool
278 RegExp::Matches(const BString &string) const
279 {
280 	if (fRegExp == NULL)
281 		return false;
282 
283 	return RunMatcher(fRegExp, string.String()) == 1;
284 }
285 
286 
287 //
288 // - Compile - compile a regular expression into internal code
289 //
290 // We can't allocate space until we know how big the compiled form will be,
291 // but we can't compile it (and thus know how big it is) until we've got a
292 // place to put the code.  So we cheat:  we compile it twice, once with code
293 // generation turned off and size counting turned on, and once "for real".
294 // This also means that we don't allocate space until we are sure that the
295 // thing really will compile successfully, and we never have to move the
296 // code and thus invalidate pointers into it.  (Note that it has to be in
297 // one piece because free() must be able to free it all.)
298 //
299 // Beware that the optimization-preparation code in here knows about some
300 // of the structure of the compiled regexp.
301 regexp*
302 RegExp::Compile(const char* exp)
303 {
304 	regexp* r;
305 	const char* scan;
306 	const char* longest;
307 	int32 len;
308 	int32 flags;
309 
310 	if (exp == NULL) {
311 		SetError(B_BAD_VALUE);
312 		return NULL;
313 	}
314 
315 	// First pass: determine size, legality.
316 	fInputScanPointer = exp;
317 	fParenthesisCount = 1;
318 	fCodeSize = 0L;
319 	fCodeEmitPointer = &fDummy;
320 	Char(kRegExpMagic);
321 	if (Reg(0, &flags) == NULL)
322 		return NULL;
323 
324 	// Small enough for pointer-storage convention?
325 	if (fCodeSize >= kMaxSize) {
326 		SetError(REGEXP_TOO_BIG);
327 		return NULL;
328 	}
329 
330 	r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
331 		// Allocate space
332 
333 	if (r == NULL) {
334 		SetError(B_NO_MEMORY);
335 		return NULL;
336 	}
337 
338 	// Second pass: emit code.
339 	fInputScanPointer = exp;
340 	fParenthesisCount = 1;
341 	fCodeEmitPointer = r->program;
342 	Char(kRegExpMagic);
343 	if (Reg(0, &flags) == NULL) {
344 		free(r);
345 		return NULL;
346 	}
347 
348 	// Dig out information for optimizations.
349 	r->regstart = '\0';
350 		// Worst-case defaults.
351 	r->reganch = 0;
352 	r->regmust = NULL;
353 	r->regmlen = 0;
354 	scan = r->program + 1;
355 		// First kRegExpBranch.
356 	if (*Next((char*)scan) == kRegExpEnd) {
357 		// Only one top-level choice.
358 		scan = Operand(scan);
359 
360 		// Starting-point info.
361 		if (*scan == kRegExpExactly)
362 			r->regstart = *Operand(scan);
363 		else if (*scan == kRegExpBol)
364 			r->reganch++;
365 
366 		// If there's something expensive in the r.e., find the
367 		// longest literal string that must appear and make it the
368 		// regmust.  Resolve ties in favor of later strings, since
369 		// the regstart check works with the beginning of the r.e.
370 		// and avoiding duplication strengthens checking.  Not a
371 		// strong reason, but sufficient in the absence of others.
372 		if ((flags & kSPStart) != 0) {
373 			longest = NULL;
374 			len = 0;
375 			for (; scan != NULL; scan = Next((char*)scan)) {
376 				if (*scan == kRegExpExactly
377 					&& (int32)strlen(Operand(scan)) >= len) {
378 					longest = Operand(scan);
379 					len = (int32)strlen(Operand(scan));
380 				}
381 			}
382 			r->regmust = longest;
383 			r->regmlen = len;
384 		}
385 	}
386 
387 	return r;
388 }
389 
390 
391 regexp*
392 RegExp::Expression() const
393 {
394 	return fRegExp;
395 }
396 
397 
398 const char*
399 RegExp::ErrorString() const
400 {
401 	if (fError >= REGEXP_UNMATCHED_PARENTHESIS
402 		&& fError <= REGEXP_CORRUPTED_OPCODE) {
403 		return kRegExpErrorStringArray[fError - B_ERRORS_END];
404 	}
405 
406 	return strerror(fError);
407 }
408 
409 
410 void
411 RegExp::SetError(status_t error) const
412 {
413 	fError = error;
414 }
415 
416 
417 //
418 // - Reg - regular expression, i.e. main body or parenthesized thing
419 //
420 // Caller must absorb opening parenthesis.
421 //
422 // Combining parenthesis handling with the base level of regular expression
423 // is a trifle forced, but the need to tie the tails of the branches to what
424 // follows makes it hard to avoid.
425 //
426 char*
427 RegExp::Reg(int32 paren, int32* flagp)
428 {
429 	char* ret;
430 	char* br;
431 	char* ender;
432 	int32 parno = 0;
433 	int32 flags;
434 
435 	*flagp = kHasWidth;
436 		// Tentatively.
437 
438 	// Make an kRegExpOpen node, if parenthesized.
439 	if (paren) {
440 		if (fParenthesisCount >= kSubExpressionMax) {
441 			SetError(REGEXP_TOO_MANY_PARENTHESIS);
442 			return NULL;
443 		}
444 		parno = fParenthesisCount;
445 		fParenthesisCount++;
446 		ret = Node((char)(kRegExpOpen + parno));
447 	} else
448 		ret = NULL;
449 
450 	// Pick up the branches, linking them together.
451 	br = Branch(&flags);
452 	if (br == NULL)
453 		return NULL;
454 
455 	if (ret != NULL) {
456 		Tail(ret, br);
457 			// kRegExpOpen -> first
458 	} else
459 		ret = br;
460 
461 	if (!(flags & kHasWidth))
462 		*flagp &= ~kHasWidth;
463 
464 	*flagp |= flags&kSPStart;
465 	while (*fInputScanPointer == '|') {
466 		fInputScanPointer++;
467 		br = Branch(&flags);
468 		if (br == NULL)
469 			return NULL;
470 
471 		Tail(ret, br);
472 			// kRegExpBranch -> kRegExpBranch.
473 		if (!(flags & kHasWidth))
474 			*flagp &= ~kHasWidth;
475 
476 		*flagp |= flags&kSPStart;
477 	}
478 
479 	// Make a closing node, and hook it on the end.
480 	ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
481 	Tail(ret, ender);
482 
483 	// Hook the tails of the branches to the closing node.
484 	for (br = ret; br != NULL; br = Next(br))
485 		OpTail(br, ender);
486 
487 	// Check for proper termination.
488 	if (paren && *fInputScanPointer++ != ')') {
489 		SetError(REGEXP_UNMATCHED_PARENTHESIS);
490 		return NULL;
491 	} else if (!paren && *fInputScanPointer != '\0') {
492 		if (*fInputScanPointer == ')') {
493 			SetError(REGEXP_UNMATCHED_PARENTHESIS);
494 			return NULL;
495 		} else {
496 			SetError(REGEXP_JUNK_ON_END);
497 			return NULL;	//  "Can't happen".
498 		}
499 		// NOTREACHED
500 	}
501 
502 	return ret;
503 }
504 
505 
506 //
507 // - Branch - one alternative of an | operator
508 //
509 // Implements the concatenation operator.
510 //
511 char*
512 RegExp::Branch(int32* flagp)
513 {
514 	char* ret;
515 	char* chain;
516 	char* latest;
517 	int32 flags;
518 
519 	*flagp = kWorst;
520 		// Tentatively.
521 
522 	ret = Node(kRegExpBranch);
523 	chain = NULL;
524 	while (*fInputScanPointer != '\0'
525 		&& *fInputScanPointer != '|'
526 		&& *fInputScanPointer != ')') {
527 		latest = Piece(&flags);
528 		if (latest == NULL)
529 			return NULL;
530 
531 		*flagp |= flags & kHasWidth;
532 		if (chain == NULL) {
533 			// First piece.
534 			*flagp |= flags & kSPStart;
535 		} else
536 			Tail(chain, latest);
537 
538 		chain = latest;
539 	}
540 
541 	if (chain == NULL) {
542 		// Loop ran zero times.
543 		Node(kRegExpNothing);
544 	}
545 
546 	return ret;
547 }
548 
549 
550 //
551 // - Piece - something followed by possible [*+?]
552 //
553 // Note that the branching code sequences used for ? and the general cases
554 // of * and + are somewhat optimized:  they use the same kRegExpNothing node
555 // as both the endmarker for their branch list and the body of the last
556 // branch. It might seem that this node could be dispensed with entirely,
557 // but the endmarker role is not redundant.
558 //
559 char*
560 RegExp::Piece(int32* flagp)
561 {
562 	char* ret;
563 	char op;
564 	char* next;
565 	int32 flags;
566 
567 	ret = Atom(&flags);
568 	if (ret == NULL)
569 		return NULL;
570 
571 	op = *fInputScanPointer;
572 	if (!IsMult(op)) {
573 		*flagp = flags;
574 		return ret;
575 	}
576 
577 	if (!(flags & kHasWidth) && op != '?') {
578 		SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
579 		return NULL;
580 	}
581 	*flagp = op != '+' ? kWorst | kSPStart :  kWorst | kHasWidth;
582 
583 	if (op == '*' && (flags & kSimple))
584 		Insert(kRegExpStar, ret);
585 	else if (op == '*') {
586 		// Emit x* as (x&|), where & means "self".
587 		Insert(kRegExpBranch, ret);
588 			// Either x
589 		OpTail(ret, Node(kRegExpBack));
590 			// and loop
591 		OpTail(ret, ret);
592 			// back
593 		Tail(ret, Node(kRegExpBranch));
594 			// or
595 		Tail(ret, Node(kRegExpNothing));
596 			// null.
597 	} else if (op == '+' && (flags & kSimple))
598 		Insert(kRegExpPlus, ret);
599 	else if (op == '+') {
600 		// Emit x+ as x(&|), where & means "self".
601 		next = Node(kRegExpBranch);
602 			// Either
603 		Tail(ret, next);
604 		Tail(Node(kRegExpBack), ret);
605 			// loop back
606 		Tail(next, Node(kRegExpBranch));
607 			// or
608 		Tail(ret, Node(kRegExpNothing));
609 			// null.
610 	} else if (op == '?') {
611 		// Emit x? as (x|)
612 		Insert(kRegExpBranch, ret);
613 			// Either x
614 		Tail(ret, Node(kRegExpBranch));
615 			// or
616 		next = Node(kRegExpNothing);
617 			// null.
618 		Tail(ret, next);
619 		OpTail(ret, next);
620 	}
621 
622 	fInputScanPointer++;
623 	if (IsMult(*fInputScanPointer)) {
624 		SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
625 		return NULL;
626 	}
627 
628 	return ret;
629 }
630 
631 
632 //
633 // - Atom - the lowest level
634 //
635 // Optimization:  gobbles an entire sequence of ordinary characters so that
636 // it can turn them into a single node, which is smaller to store and
637 // faster to run.  Backslashed characters are exceptions, each becoming a
638 // separate node; the code is simpler that way and it's not worth fixing.
639 //
640 char*
641 RegExp::Atom(int32* flagp)
642 {
643 	char* ret;
644 	int32 flags;
645 
646 	*flagp = kWorst;
647 		// tentatively
648 
649 	switch (*fInputScanPointer++) {
650 		case '^':
651 			ret = Node(kRegExpBol);
652 			break;
653 
654 		case '$':
655 			ret = Node(kRegExpEol);
656 			break;
657 
658 		case '.':
659 			ret = Node(kRegExpAny);
660 			*flagp |= kHasWidth|kSimple;
661 			break;
662 
663 		case '[':
664 		{
665 			int32 cclass;
666 			int32 classend;
667 
668 			if (*fInputScanPointer == '^') {
669 				// complement of range
670 				ret = Node(kRegExpAnyBut);
671 				fInputScanPointer++;
672 			} else
673 				ret = Node(kRegExpAnyOf);
674 			if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
675 				Char(*fInputScanPointer++);
676 			while (*fInputScanPointer != '\0'
677 				&& *fInputScanPointer != ']') {
678 				if (*fInputScanPointer == '-') {
679 					fInputScanPointer++;
680 					if (*fInputScanPointer == ']'
681 						|| *fInputScanPointer == '\0') {
682 						Char('-');
683 					} else {
684 						cclass = UCharAt(fInputScanPointer - 2) + 1;
685 						classend = UCharAt(fInputScanPointer);
686 						if (cclass > classend + 1) {
687 							SetError(REGEXP_INVALID_BRACKET_RANGE);
688 							return NULL;
689 						}
690 						for (; cclass <= classend; cclass++)
691 							Char((char)cclass);
692 						fInputScanPointer++;
693 					}
694 				} else
695 					Char(*fInputScanPointer++);
696 			}
697 			Char('\0');
698 			if (*fInputScanPointer != ']') {
699 				SetError(REGEXP_UNMATCHED_BRACKET);
700 				return NULL;
701 			}
702 			fInputScanPointer++;
703 			*flagp |= kHasWidth | kSimple;
704 			break;
705 		}
706 
707 		case '(':
708 			ret = Reg(1, &flags);
709 			if (ret == NULL)
710 				return NULL;
711 			*flagp |= flags & (kHasWidth | kSPStart);
712 			break;
713 
714 		case '\0':
715 		case '|':
716 		case ')':
717 			SetError(REGEXP_INTERNAL_ERROR);
718 			return NULL;
719 				//  supposed to be caught earlier
720 
721 		case '?':
722 		case '+':
723 		case '*':
724 			SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
725 			return NULL;
726 
727 		case '\\':
728 			if (*fInputScanPointer == '\0') {
729 				SetError(REGEXP_TRAILING_BACKSLASH);
730 				return NULL;
731 			}
732 			ret = Node(kRegExpExactly);
733 			Char(*fInputScanPointer++);
734 			Char('\0');
735 			*flagp |= kHasWidth|kSimple;
736 			break;
737 
738 		default:
739 		{
740 			int32 len;
741 			char ender;
742 
743 			fInputScanPointer--;
744 			len = (int32)strcspn(fInputScanPointer, kMeta);
745 			if (len <= 0) {
746 				SetError(REGEXP_INTERNAL_ERROR);
747 				return NULL;
748 			}
749 
750 			ender = *(fInputScanPointer + len);
751 			if (len > 1 && IsMult(ender)) {
752 				// Back off clear of ?+* operand.
753 				len--;
754 			}
755 
756 			*flagp |= kHasWidth;
757 			if (len == 1)
758 				*flagp |= kSimple;
759 
760 			ret = Node(kRegExpExactly);
761 			while (len > 0) {
762 				Char(*fInputScanPointer++);
763 				len--;
764 			}
765 
766 			Char('\0');
767 			break;
768 		}
769 	}
770 
771 	return ret;
772 }
773 
774 
775 //
776 // - Node - emit a node
777 //
778 // Returns location.
779 //
780 char*
781 RegExp::Node(char op)
782 {
783 	char* ret;
784 	char* ptr;
785 
786 	ret = fCodeEmitPointer;
787 	if (ret == &fDummy) {
788 		fCodeSize += 3;
789 		return ret;
790 	}
791 
792 	ptr = ret;
793 	*ptr++ = op;
794 	*ptr++ = '\0';
795 		// Null "next" pointer.
796 	*ptr++ = '\0';
797 	fCodeEmitPointer = ptr;
798 
799 	return ret;
800 }
801 
802 
803 //
804 // - Char - emit (if appropriate) a byte of code
805 //
806 void
807 RegExp::Char(char b)
808 {
809 	if (fCodeEmitPointer != &fDummy)
810 		*fCodeEmitPointer++ = b;
811 	else
812 		fCodeSize++;
813 }
814 
815 
816 //
817 // - Insert - insert an operator in front of already-emitted operand
818 //
819 // Means relocating the operand.
820 //
821 void
822 RegExp::Insert(char op, char* opnd)
823 {
824 	char* src;
825 	char* dst;
826 	char* place;
827 
828 	if (fCodeEmitPointer == &fDummy) {
829 		fCodeSize += 3;
830 		return;
831 	}
832 
833 	src = fCodeEmitPointer;
834 	fCodeEmitPointer += 3;
835 	dst = fCodeEmitPointer;
836 	while (src > opnd)
837 		*--dst = *--src;
838 
839 	place = opnd;
840 		// Op node, where operand used to be.
841 	*place++ = op;
842 	*place++ = '\0';
843 	*place++ = '\0';
844 }
845 
846 
847 //
848 // - Tail - set the next-pointer at the end of a node chain
849 //
850 void
851 RegExp::Tail(char* p, char* val)
852 {
853 	char* scan;
854 	char* temp;
855 	int32 offset;
856 
857 	if (p == &fDummy)
858 		return;
859 
860 	// Find last node.
861 	scan = p;
862 	for (;;) {
863 		temp = Next(scan);
864 		if (temp == NULL)
865 			break;
866 		scan = temp;
867 	}
868 
869 	if (scan[0] == kRegExpBack)
870 		offset = scan - val;
871 	else
872 		offset = val - scan;
873 
874 	scan[1] = (char)((offset >> 8) & 0377);
875 	scan[2] = (char)(offset & 0377);
876 }
877 
878 
879 //
880 // - OpTail - Tail on operand of first argument; nop if operandless
881 //
882 void
883 RegExp::OpTail(char* p, char* val)
884 {
885 	// "Operandless" and "op != kRegExpBranch" are synonymous in practice.
886 	if (p == NULL || p == &fDummy || *p != kRegExpBranch)
887 		return;
888 
889 	Tail(Operand(p), val);
890 }
891 
892 //
893 // RunMatcher and friends
894 //
895 
896 
897 //
898 // - RunMatcher - match a regexp against a string
899 //
900 int32
901 RegExp::RunMatcher(regexp* prog, const char* string) const
902 {
903 	const char* s;
904 
905 	// be paranoid...
906 	if (prog == NULL || string == NULL) {
907 		SetError(B_BAD_VALUE);
908 		return 0;
909 	}
910 
911 	// check validity of program
912 	if (UCharAt(prog->program) != kRegExpMagic) {
913 		SetError(REGEXP_CORRUPTED_PROGRAM);
914 		return 0;
915 	}
916 
917 	// if there is a "must appear" string, look for it
918 	if (prog->regmust != NULL) {
919 		s = string;
920 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
921 			if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) {
922 				// found it
923 				break;
924 			}
925 			s++;
926 		}
927 		if (s == NULL) {
928 			// not present
929 			return 0;
930 		}
931 	}
932 
933 	// mark beginning of line for ^
934 	fRegBol = string;
935 
936 	// simplest case:  anchored match need be tried only once
937 	if (prog->reganch)
938 		return Try(prog, (char*)string);
939 
940 	// messy cases: unanchored match
941 	s = string;
942 	if (prog->regstart != '\0') {
943 		// we know what char it must start with
944 		while ((s = strchr(s, prog->regstart)) != NULL) {
945 			if (Try(prog, (char*)s))
946 				return 1;
947 			s++;
948 		}
949 	} else {
950 		// we don't -- general case.
951 		do {
952 			if (Try(prog, (char*)s))
953 				return 1;
954 		} while (*s++ != '\0');
955 	}
956 
957 	// failure
958 	return 0;
959 }
960 
961 
962 //
963 // - Try - try match at specific point
964 //
965 // Returns 0 on failure, 1 on success.
966 //
967 int32
968 RegExp::Try(regexp* prog, const char* string) const
969 {
970 	int32 i;
971 	const char **sp;
972 	const char **ep;
973 
974 	fStringInputPointer = string;
975 	fStartPArrayPointer = prog->startp;
976 	fEndPArrayPointer = prog->endp;
977 
978 	sp = prog->startp;
979 	ep = prog->endp;
980 	for (i = kSubExpressionMax; i > 0; i--) {
981 		*sp++ = NULL;
982 		*ep++ = NULL;
983 	}
984 	if (Match(prog->program + 1)) {
985 		prog->startp[0] = string;
986 		prog->endp[0] = fStringInputPointer;
987 		return 1;
988 	} else
989 		return 0;
990 }
991 
992 
993 //
994 // - Match - main matching routine
995 //
996 // Conceptually the strategy is simple:  check to see whether the current
997 // node matches, call self recursively to see whether the rest matches,
998 // and then act accordingly.  In practice we make some effort to avoid
999 // recursion, in particular by going through "ordinary" nodes (that don't
1000 // need to know whether the rest of the match failed) by a loop instead of
1001 // by recursion.
1002 //
1003 // Returns 0 on failure, 1 on success.
1004 //
1005 int32
1006 RegExp::Match(const char* prog) const
1007 {
1008 	const char* scan;	// Current node.
1009 	const char* next;		// Next node.
1010 
1011 	scan = prog;
1012 #ifdef DEBUG
1013 	if (scan != NULL && regnarrate)
1014 		fprintf(stderr, "%s(\n", Prop(scan));
1015 #endif
1016 	while (scan != NULL) {
1017 #ifdef DEBUG
1018 		if (regnarrate)
1019 			fprintf(stderr, "%s...\n", Prop(scan));
1020 #endif
1021 		next = Next(scan);
1022 
1023 		switch (*scan) {
1024 			case kRegExpBol:
1025 				if (fStringInputPointer != fRegBol)
1026 					return 0;
1027 				break;
1028 
1029 			case kRegExpEol:
1030 				if (*fStringInputPointer != '\0')
1031 					return 0;
1032 				break;
1033 
1034 			case kRegExpAny:
1035 				if (*fStringInputPointer == '\0')
1036 					return 0;
1037 				fStringInputPointer++;
1038 				break;
1039 
1040 			case kRegExpExactly:
1041 			{
1042 				const char* opnd = Operand(scan);
1043 				// Inline the first character, for speed.
1044 				if (*opnd != *fStringInputPointer)
1045 					return 0;
1046 
1047 				uint32 len = strlen(opnd);
1048 				if (len > 1
1049 					&& strncmp(opnd, fStringInputPointer, len) != 0) {
1050 					return 0;
1051 				}
1052 
1053 				fStringInputPointer += len;
1054 			}
1055 			break;
1056 
1057 			case kRegExpAnyOf:
1058 				if (*fStringInputPointer == '\0'
1059 					|| strchr(Operand(scan), *fStringInputPointer) == NULL) {
1060 					return 0;
1061 				}
1062 				fStringInputPointer++;
1063 				break;
1064 
1065 			case kRegExpAnyBut:
1066 				if (*fStringInputPointer == '\0'
1067 					|| strchr(Operand(scan), *fStringInputPointer) != NULL) {
1068 					return 0;
1069 				}
1070 				fStringInputPointer++;
1071 				break;
1072 
1073 			case kRegExpNothing:
1074 				break;
1075 
1076 			case kRegExpBack:
1077 				break;
1078 
1079 			case kRegExpOpen + 1:
1080 			case kRegExpOpen + 2:
1081 			case kRegExpOpen + 3:
1082 			case kRegExpOpen + 4:
1083 			case kRegExpOpen + 5:
1084 			case kRegExpOpen + 6:
1085 			case kRegExpOpen + 7:
1086 			case kRegExpOpen + 8:
1087 			case kRegExpOpen + 9:
1088 			{
1089 				int32 no;
1090 				const char* save;
1091 
1092 				no = *scan - kRegExpOpen;
1093 				save = fStringInputPointer;
1094 
1095 				if (Match(next)) {
1096 					//
1097 					// Don't set startp if some later
1098 					// invocation of the same parentheses
1099 					// already has.
1100 					//
1101 					if (fStartPArrayPointer[no] == NULL)
1102 						fStartPArrayPointer[no] = save;
1103 					return 1;
1104 				} else
1105 					return 0;
1106 			}
1107 			break;
1108 
1109 			case kRegExpClose + 1:
1110 			case kRegExpClose + 2:
1111 			case kRegExpClose + 3:
1112 			case kRegExpClose + 4:
1113 			case kRegExpClose + 5:
1114 			case kRegExpClose + 6:
1115 			case kRegExpClose + 7:
1116 			case kRegExpClose + 8:
1117 			case kRegExpClose + 9:
1118 			{
1119 				int32 no;
1120 				const char* save;
1121 
1122 				no = *scan - kRegExpClose;
1123 				save = fStringInputPointer;
1124 
1125 				if (Match(next)) {
1126 					//
1127 					// Don't set endp if some later
1128 					// invocation of the same parentheses
1129 					// already has.
1130 					//
1131 					if (fEndPArrayPointer[no] == NULL)
1132 						fEndPArrayPointer[no] = save;
1133 					return 1;
1134 				} else
1135 					return 0;
1136 			}
1137 			break;
1138 
1139 			case kRegExpBranch:
1140 			{
1141 				const char* save;
1142 
1143 				if (*next != kRegExpBranch) {
1144 					// no choice
1145 					next = Operand(scan);
1146 						// avoid recursion
1147 				} else {
1148 					do {
1149 						save = fStringInputPointer;
1150 						if (Match(Operand(scan)))
1151 							return 1;
1152 						fStringInputPointer = save;
1153 						scan = Next(scan);
1154 					} while (scan != NULL && *scan == kRegExpBranch);
1155 					return 0;
1156 					// NOTREACHED
1157 				}
1158 			}
1159 			break;
1160 
1161 			case kRegExpStar:
1162 			case kRegExpPlus:
1163 				{
1164 					char nextch;
1165 					int32 no;
1166 					const char* save;
1167 					int32 min;
1168 
1169 					//
1170 					// Lookahead to avoid useless match attempts
1171 					// when we know what character comes next.
1172 					//
1173 					nextch = '\0';
1174 					if (*next == kRegExpExactly)
1175 						nextch = *Operand(next);
1176 					min = (*scan == kRegExpStar) ? 0 : 1;
1177 					save = fStringInputPointer;
1178 					no = Repeat(Operand(scan));
1179 					while (no >= min) {
1180 						// if it could work, try it
1181 						if (nextch == '\0' || *fStringInputPointer == nextch) {
1182 							if (Match(next))
1183 								return 1;
1184 						}
1185 						// couldn't or didn't, back up
1186 						no--;
1187 						fStringInputPointer = save + no;
1188 					}
1189 					return 0;
1190 				}
1191 				break;
1192 
1193 			case kRegExpEnd:
1194 				return 1;
1195 					// Success!
1196 
1197 			default:
1198 				SetError(REGEXP_MEMORY_CORRUPTION);
1199 				return 0;
1200 			}
1201 
1202 		scan = next;
1203 	}
1204 
1205 	//
1206 	// We get here only if there's trouble -- normally "case kRegExpEnd" is
1207 	// the terminating point.
1208 	//
1209 	SetError(REGEXP_CORRUPTED_POINTERS);
1210 
1211 	return 0;
1212 }
1213 
1214 
1215 //
1216 // - Repeat - repeatedly match something simple, report how many
1217 //
1218 int32
1219 RegExp::Repeat(const char* p) const
1220 {
1221 	int32 count = 0;
1222 	const char* scan;
1223 	const char* opnd;
1224 
1225 	scan = fStringInputPointer;
1226 	opnd = Operand(p);
1227 	switch (*p) {
1228 		case kRegExpAny:
1229 			count = (int32)strlen(scan);
1230 			scan += count;
1231 			break;
1232 
1233 		case kRegExpExactly:
1234 			while (*opnd == *scan) {
1235 				count++;
1236 				scan++;
1237 			}
1238 			break;
1239 
1240 		case kRegExpAnyOf:
1241 			while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1242 				count++;
1243 				scan++;
1244 			}
1245 			break;
1246 
1247 		case kRegExpAnyBut:
1248 			while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1249 				count++;
1250 				scan++;
1251 			}
1252 			break;
1253 
1254 		default:
1255 			// oh dear, called inappropriately
1256 			SetError(REGEXP_INTERNAL_ERROR);
1257 			count = 0;
1258 				// best compromise
1259 			break;
1260 	}
1261 	fStringInputPointer = scan;
1262 
1263 	return count;
1264 }
1265 
1266 
1267 //
1268 // - Next - dig the "next" pointer out of a node
1269 //
1270 char*
1271 RegExp::Next(char* p)
1272 {
1273 	int32 offset;
1274 
1275 	if (p == &fDummy)
1276 		return NULL;
1277 
1278 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1279 	if (offset == 0)
1280 		return NULL;
1281 
1282 	if (*p == kRegExpBack)
1283 		return p - offset;
1284 	else
1285 		return p + offset;
1286 }
1287 
1288 
1289 const char*
1290 RegExp::Next(const char* p) const
1291 {
1292 	int32 offset;
1293 
1294 	if (p == &fDummy)
1295 		return NULL;
1296 
1297 	offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1298 	if (offset == 0)
1299 		return NULL;
1300 
1301 	if (*p == kRegExpBack)
1302 		return p - offset;
1303 	else
1304 		return p + offset;
1305 }
1306 
1307 
1308 inline int32
1309 RegExp::UCharAt(const char* p) const
1310 {
1311 	return (int32)*(unsigned char *)p;
1312 }
1313 
1314 
1315 inline char*
1316 RegExp::Operand(char* p) const
1317 {
1318 	return p + 3;
1319 }
1320 
1321 
1322 inline const char*
1323 RegExp::Operand(const char* p) const
1324 {
1325 	return p + 3;
1326 }
1327 
1328 
1329 inline bool
1330 RegExp::IsMult(char c) const
1331 {
1332 	return c == '*' || c == '+' || c == '?';
1333 }
1334 
1335 
1336 #ifdef DEBUG
1337 
1338 
1339 //
1340 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1341 //
1342 void
1343 RegExp::Dump()
1344 {
1345 	const char* s;
1346 	char op = kRegExpExactly;
1347 		// Arbitrary non-kRegExpEnd op.
1348 	const char* next;
1349 
1350 	s = fRegExp->program + 1;
1351 	while (op != kRegExpEnd) {
1352 		// While that wasn't kRegExpEnd last time...
1353 		op = *s;
1354 		printf("%2ld%s", s - fRegExp->program, Prop(s));
1355 			// Where, what.
1356 		next = Next(s);
1357 		if (next == NULL) {
1358 			// next ptr
1359 			printf("(0)");
1360 		} else
1361 			printf("(%ld)", (s - fRegExp->program) + (next - s));
1362 
1363 		s += 3;
1364 		if (op == kRegExpAnyOf || op == kRegExpAnyBut
1365 			|| op == kRegExpExactly) {
1366 			// literal string, where present
1367 			while (*s != '\0') {
1368 				putchar(*s);
1369 				s++;
1370 			}
1371 			s++;
1372 		}
1373 		putchar('\n');
1374 	}
1375 
1376 	// header fields of interest
1377 	if (fRegExp->regstart != '\0')
1378 		printf("start `%c' ", fRegExp->regstart);
1379 
1380 	if (fRegExp->reganch)
1381 		printf("anchored ");
1382 
1383 	if (fRegExp->regmust != NULL)
1384 		printf("must have \"%s\"", fRegExp->regmust);
1385 
1386 	printf("\n");
1387 }
1388 
1389 
1390 //
1391 // - Prop - printable representation of opcode
1392 //
1393 char*
1394 RegExp::Prop(const char* op) const
1395 {
1396 	const char* p = NULL;
1397 	static char buf[50];
1398 
1399 	strcpy(buf, ":");
1400 
1401 	switch (*op) {
1402 		case kRegExpBol:
1403 			p = "kRegExpBol";
1404 			break;
1405 
1406 		case kRegExpEol:
1407 			p = "kRegExpEol";
1408 			break;
1409 
1410 		case kRegExpAny:
1411 			p = "kRegExpAny";
1412 			break;
1413 
1414 		case kRegExpAnyOf:
1415 			p = "kRegExpAnyOf";
1416 			break;
1417 
1418 		case kRegExpAnyBut:
1419 			p = "kRegExpAnyBut";
1420 			break;
1421 
1422 		case kRegExpBranch:
1423 			p = "kRegExpBranch";
1424 			break;
1425 
1426 		case kRegExpExactly:
1427 			p = "kRegExpExactly";
1428 			break;
1429 
1430 		case kRegExpNothing:
1431 			p = "kRegExpNothing";
1432 			break;
1433 
1434 		case kRegExpBack:
1435 			p = "kRegExpBack";
1436 			break;
1437 
1438 		case kRegExpEnd:
1439 			p = "kRegExpEnd";
1440 			break;
1441 
1442 		case kRegExpOpen + 1:
1443 		case kRegExpOpen + 2:
1444 		case kRegExpOpen + 3:
1445 		case kRegExpOpen + 4:
1446 		case kRegExpOpen + 5:
1447 		case kRegExpOpen + 6:
1448 		case kRegExpOpen + 7:
1449 		case kRegExpOpen + 8:
1450 		case kRegExpOpen + 9:
1451 			sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1452 			p = NULL;
1453 			break;
1454 
1455 		case kRegExpClose + 1:
1456 		case kRegExpClose + 2:
1457 		case kRegExpClose + 3:
1458 		case kRegExpClose + 4:
1459 		case kRegExpClose + 5:
1460 		case kRegExpClose + 6:
1461 		case kRegExpClose + 7:
1462 		case kRegExpClose + 8:
1463 		case kRegExpClose + 9:
1464 			sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1465 			p = NULL;
1466 			break;
1467 
1468 		case kRegExpStar:
1469 			p = "kRegExpStar";
1470 			break;
1471 
1472 		case kRegExpPlus:
1473 			p = "kRegExpPlus";
1474 			break;
1475 
1476 		default:
1477 			RegExpError("corrupted opcode");
1478 			break;
1479 	}
1480 
1481 	if (p != NULL)
1482 		strcat(buf, p);
1483 
1484 	return buf;
1485 }
1486 
1487 
1488 void
1489 RegExp::RegExpError(const char*) const
1490 {
1491 	// does nothing now, perhaps it should printf?
1492 }
1493 
1494 #endif	// DEBUG
1495