xref: /haiku/src/kits/storage/sniffer/Parser.cpp (revision 02fd0582dd25cf21fba489c6c767716619a1f574)
1 //----------------------------------------------------------------------
2 //  This software is part of the OpenBeOS distribution and is covered
3 //  by the OpenBeOS license.
4 //----------------------------------------------------------------------
5 /*!
6 	\file sniffer/Parser.cpp
7 	MIME sniffer rule parser implementation
8 */
9 
10 #include <sniffer/Parser.h>
11 #include <sniffer/Pattern.h>
12 #include <sniffer/PatternList.h>
13 #include <sniffer/Range.h>
14 #include <sniffer/RPattern.h>
15 #include <sniffer/RPatternList.h>
16 #include <sniffer/Rule.h>
17 
18 #include <new.h>
19 #include <stdio.h>
20 #include <stdlib.h>	// For atol(), atof()
21 #include <string.h>
22 #include <String.h>	// BString
23 
24 using namespace BPrivate::Storage::Sniffer;
25 
// Forward declarations of the miscellaneous helper functions that are
// defined later in this file.

// Character classification
bool isWhiteSpace(char ch);
bool isPunctuation(char ch);
bool isDecimalChar(char ch);
bool isOctalChar(char ch);
bool isHexChar(char ch);

// Escape sequence and numeric literal conversion
char escapeChar(char ch);
char hexToChar(char hex);
char hexToChar(char hi, char low);
char octalToChar(char octal);
char octalToChar(char hi, char low);
char octalToChar(char hi, char mid, char low);
38 
39 //! Parses the given rule.
40 /*! The resulting parsed Rule structure is stored in \c rule, which
41 	must be pre-allocated. If parsing fails, a descriptive error message (meant
42 	to be viewed in a monospaced font) is placed in the pre-allocated \c BString
43 	pointed to by \c parseError (which may be \c NULL if you don't care about
44 	the error message).
45 
46 	\param rule Pointer to a NULL-terminated string containing the sniffer
47 	            rule to be parsed
48 	\param result Pointer to a pre-allocated \c Rule object into which the result
49 	              of parsing is placed upon success.
50 	\param parseError Point to pre-allocated \c BString object into which
51 	                  a descriptive error message is stored upon failure.
52 
53 	\return
54 	- B_OK: Success
55 	- B_BAD_MIME_SNIFFER_RULE: Failure
56 */
57 status_t
58 BPrivate::Storage::Sniffer::parse(const char *rule, Rule *result, BString *parseError) {
59 	Parser parser;
60 	return parser.Parse(rule, result, parseError);
61 }
62 
63 //------------------------------------------------------------------------------
64 // CharStream
65 //------------------------------------------------------------------------------
66 
67 CharStream::CharStream(const std::string &string)
68 	: fString(string)
69 	, fPos(0)
70 	, fCStatus(B_OK)
71 {
72 }
73 
74 CharStream::CharStream()
75 	: fString("")
76 	, fPos(0)
77 	, fCStatus(B_NO_INIT)
78 {
79 }
80 
81 CharStream::~CharStream() {
82 	Unset();
83 }
84 
85 status_t
86 CharStream::SetTo(const std::string &string) {
87 	fString = string;
88 	fPos = 0;
89 	fCStatus = B_OK;
90 	return fCStatus;
91 }
92 
93 void
94 CharStream::Unset() {
95 	fString = "";
96 	fPos = 0;
97 	fCStatus = B_NO_INIT;
98 }
99 
100 status_t
101 CharStream::InitCheck() const {
102 	return fCStatus;
103 }
104 
105 bool
106 CharStream::IsEmpty() const {
107 	return fPos >= fString.length();
108 }
109 
110 size_t
111 CharStream::Pos() const {
112 	return fPos;
113 }
114 
115 const std::string&
116 CharStream::String() const {
117 	return fString;
118 }
119 
120 char
121 CharStream::Get() {
122 	if (fCStatus != B_OK)
123 		throw new Err("Sniffer parser error: CharStream::Get() called on uninitialized CharStream object", -1);
124 	if (fPos < fString.length())
125 		return fString[fPos++];
126 	else {
127 		fPos++;		// Increment fPos to keep Unget()s consistent
128 		return 0x3;	// Return End-Of-Text char
129 	}
130 }
131 
132 void
133 CharStream::Unget() {
134 	if (fCStatus != B_OK)
135 		throw new Err("Sniffer parser error: CharStream::Unget() called on uninitialized CharStream object", -1);
136 	if (fPos > 0)
137 		fPos--;
138 	else
139 		throw new Err("Sniffer parser error: CharStream::Unget() called at beginning of character stream", -1);
140 }
141 
142 //------------------------------------------------------------------------------
143 // Token
144 //------------------------------------------------------------------------------
145 
146 Token::Token(TokenType type, const ssize_t pos)
147 	: fType(type)
148 	, fPos(pos)
149 {
150 //	if (type != EmptyToken)
151 //		cout << "New Token, fType == " << tokenTypeToString(fType) << endl;
152 }
153 
154 Token::~Token() {
155 }
156 
157 TokenType
158 Token::Type() const {
159 	return fType;
160 }
161 
162 const std::string&
163 Token::String() const {
164 	throw new Err("Sniffer scanner error: Token::String() called on non-string token", fPos);
165 }
166 
167 int32
168 Token::Int() const {
169 	throw new Err("Sniffer scanner error: Token::Int() called on non-integer token", fPos);
170 }
171 
172 double
173 Token::Float() const {
174 	throw new Err("Sniffer scanner error: Token::Float() called on non-float token", fPos);
175 }
176 
177 ssize_t
178 Token::Pos() const {
179 	return fPos;
180 }
181 
182 bool
183 Token::operator==(Token &ref) const {
184 	// Compare types, then data if necessary
185 	if (Type() == ref.Type()) {
186 		switch (Type()) {
187 			case CharacterString:
188 //				printf(" str1 == '%s'\n", String());
189 //				printf(" str2 == '%s'\n", ref.String());
190 //				printf(" strcmp() == %d\n", strcmp(String(), ref.String()));
191 			{
192 				return String() == ref.String();
193 
194 /*
195 				// strcmp() seems to choke on certain, non-normal ASCII chars
196 				// (i.e. chars outside the usual alphabets, but still valid
197 				// as far as ASCII is concerned), so we'll just compare the
198 				// strings by hand to be safe.
199 				const char *str1 = String();
200 				const char *str2 = ref.String();
201 				int len1 = strlen(str1);
202 				int len2 = strlen(str2);
203 //				printf("len1 == %d\n", len1);
204 //				printf("len2 == %d\n", len2);
205 				if (len1 == len2) {
206 					for (int i = 0; i < len1; i++) {
207 //						printf("i == %d, str1[%d] == %x, str2[%d] == %x\n", i, i, str1[i], i, str2[i]);
208 						if (str1[i] != str2[i])
209 							return false;
210 					}
211 				}
212 				return true;
213 */
214 			}
215 //				return strcmp(String(), ref.String()) == 0;
216 
217 			case Integer:
218 				return Int() == ref.Int();
219 
220 			case FloatingPoint:
221 				return Float() == ref.Float();
222 
223 			default:
224 				return true;
225 		}
226 	} else
227 		return false;
228 }
229 
230 //------------------------------------------------------------------------------
231 // StringToken
232 //------------------------------------------------------------------------------
233 
234 StringToken::StringToken(const std::string &str, const ssize_t pos)
235 	: Token(CharacterString, pos)
236 	, fString(str)
237 {
238 }
239 
240 StringToken::~StringToken() {
241 }
242 
243 const std::string&
244 StringToken::String() const {
245 	return fString;
246 }
247 
248 //------------------------------------------------------------------------------
249 // IntToken
250 //------------------------------------------------------------------------------
251 
252 IntToken::IntToken(const int32 value, const ssize_t pos)
253 	: Token(Integer, pos)
254 	, fValue(value)
255 {
256 }
257 
258 IntToken::~IntToken() {
259 }
260 
261 int32
262 IntToken::Int() const {
263 	return fValue;
264 }
265 
266 double
267 IntToken::Float() const {
268 	return (double)fValue;
269 }
270 
271 //------------------------------------------------------------------------------
272 // FloatToken
273 //------------------------------------------------------------------------------
274 
275 FloatToken::FloatToken(const double value, const ssize_t pos)
276 	: Token(FloatingPoint, pos)
277 	, fValue(value)
278 {
279 }
280 
281 FloatToken::~FloatToken() {
282 }
283 
284 
285 double
286 FloatToken::Float() const {
287 	return fValue;
288 }
289 
290 //------------------------------------------------------------------------------
291 // TokenStream
292 //------------------------------------------------------------------------------
293 
294 TokenStream::TokenStream(const std::string &string)
295 	: fCStatus(B_NO_INIT)
296 	, fPos(-1)
297 	, fStrLen(-1)
298 {
299 	SetTo(string);
300 }
301 
302 TokenStream::TokenStream()
303 	: fCStatus(B_NO_INIT)
304 	, fPos(-1)
305 	, fStrLen(-1)
306 {
307 }
308 
309 TokenStream::~TokenStream() {
310 	Unset();
311 }
312 
313 status_t
314 TokenStream::SetTo(const std::string &string) {
315 	Unset();
316 	fStrLen = string.length();
317 	CharStream stream(string);
318 	if (stream.InitCheck() != B_OK)
319 		throw new Err("Sniffer scanner error: Unable to intialize character stream", -1);
320 
321 	typedef enum TokenStreamScannerState {
322 		tsssStart,
323 		tsssOneSingle,
324 		tsssOneDouble,
325 		tsssOneZero,
326 		tsssZeroX,
327 		tsssOneHex,
328 		tsssTwoHex,
329 		tsssIntOrFloat,
330 		tsssFloat,
331 		tsssLonelyDecimalPoint,
332 		tsssLonelyMinusOrPlus,
333 		tsssLonelyFloatExtension,
334 		tsssLonelyFloatExtensionWithSign,
335 		tsssExtendedFloat,
336 		tsssUnquoted,
337 		tsssEscape,
338 		tsssEscapeX,
339 		tsssEscapeOneOctal,
340 		tsssEscapeTwoOctal,
341 		tsssEscapeOneHex,
342 	};
343 
344 	TokenStreamScannerState state = tsssStart;
345 	TokenStreamScannerState escapedState = tsssStart;
346 		// Used to remember which state to return to from an escape sequence
347 
348 	std::string charStr = "";	// Used to build up character strings
349 	char lastChar = 0;			// For two char lookahead
350 	char lastLastChar = 0;		// For three char lookahead (have I mentioned I hate octal?)
351 	bool keepLooping = true;
352 	ssize_t startPos = 0;
353 	while (keepLooping) {
354 		ssize_t pos = stream.Pos();
355 		char ch = stream.Get();
356 		switch (state) {
357 			case tsssStart:
358 				startPos = pos;
359 				switch (ch) {
360 					case 0x3:	// End-Of-Text
361 						if (stream.IsEmpty())
362 							keepLooping = false;
363 						else
364 							throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
365 						break;
366 
367 					case '\t':
368 					case '\n':
369 					case ' ':
370 						// Whitespace, so ignore it.
371 						break;
372 
373 					case '"':
374 						charStr = "";
375 						state = tsssOneDouble;
376 						break;
377 
378 					case '\'':
379 						charStr = "";
380 						state = tsssOneSingle;
381 						break;
382 
383 					case '+':
384 					case '-':
385 						charStr = ch;
386 						lastChar = ch;
387 						state = tsssLonelyMinusOrPlus;
388 						break;
389 
390 					case '.':
391 						charStr = ch;
392 						state = tsssLonelyDecimalPoint;
393 						break;
394 
395 					case '0':
396 						charStr = ch;
397 						state = tsssOneZero;
398 						break;
399 
400 					case '1':
401 					case '2':
402 					case '3':
403 					case '4':
404 					case '5':
405 					case '6':
406 					case '7':
407 					case '8':
408 					case '9':
409 						charStr = ch;
410 						state = tsssIntOrFloat;
411 						break;
412 
413 					case '&':	AddToken(Ampersand, pos);		break;
414 					case '(':	AddToken(LeftParen, pos);		break;
415 					case ')':	AddToken(RightParen, pos);		break;
416 					case ':':	AddToken(Colon, pos);			break;
417 					case '[':	AddToken(LeftBracket, pos);		break;
418 
419 					case '\\':
420 						charStr = "";					// Clear our string
421 						state = tsssEscape;
422 						escapedState = tsssUnquoted;	// Unquoted strings begin with an escaped character
423 						break;
424 
425 					case ']':	AddToken(RightBracket, pos);		break;
426 					case '|':	AddToken(Divider, pos);			break;
427 
428 					default:
429 						throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
430 				}
431 				break;
432 
433 			case tsssOneSingle:
434 				switch (ch) {
435 					case '\\':
436 						escapedState = state;		// Save our state
437 						state = tsssEscape;			// Handle the escape sequence
438 						break;
439 					case '\'':
440 						AddString(charStr, startPos);
441 						state = tsssStart;
442 						break;
443 					case 0x3:
444 						if (stream.IsEmpty())
445 							throw new Err(std::string("Sniffer pattern error: unterminated single-quoted string"), pos);
446 						else
447 							charStr += ch;
448 						break;
449 					default:
450 						charStr += ch;
451 						break;
452 				}
453 				break;
454 
455 			case tsssOneDouble:
456 				switch (ch) {
457 					case '\\':
458 						escapedState = state;		// Save our state
459 						state = tsssEscape;			// Handle the escape sequence
460 						break;
461 					case '"':
462 						AddString(charStr, startPos);
463 						state = tsssStart;
464 						break;
465 					case 0x3:
466 						if (stream.IsEmpty())
467 							throw new Err(std::string("Sniffer pattern error: unterminated double-quoted string"), pos);
468 						else
469 							charStr += ch;
470 						break;
471 					default:
472 						charStr += ch;
473 						break;
474 				}
475 				break;
476 
477 			case tsssOneZero:
478 				if (ch == 'x') {
479 					charStr = "";	// Reinit, since we actually have a hex string
480 					state = tsssZeroX;
481 				} else if ('0' <= ch && ch <= '9') {
482 					charStr += ch;
483 					state = tsssIntOrFloat;
484 				} else if (ch == '.') {
485 					charStr += ch;
486 					state = tsssFloat;
487 				} else if (ch == 'e' || ch == 'E') {
488 					charStr += ch;
489 					state = tsssLonelyFloatExtension;
490 				} else {
491 					// Terminate the number
492 					AddInt(charStr.c_str(), startPos);
493 
494 					// Push the last char back on and try again
495 					stream.Unget();
496 					state = tsssStart;
497 				}
498 				break;
499 
500 			case tsssZeroX:
501 				if (isHexChar(ch)) {
502 					lastChar = ch;
503 					state = tsssOneHex;
504 				} else
505 					throw new Err(std::string("Sniffer pattern error: incomplete hex code"), pos);
506 				break;
507 
508 			case tsssOneHex:
509 				if (isHexChar(ch)) {
510 					try {
511 						charStr += hexToChar(lastChar, ch);
512 					} catch (Err *err) {
513 						if (err)
514 							err->SetPos(pos);
515 						throw err;
516 					}
517 					state = tsssTwoHex;
518 				} else
519 					throw new Err(std::string("Sniffer pattern error: bad hex literal"), pos);	// Same as R5
520 				break;
521 
522 			case tsssTwoHex:
523 				if (isHexChar(ch)) {
524 					lastChar = ch;
525 					state = tsssOneHex;
526 				} else {
527 					AddString(charStr, startPos);
528 					stream.Unget();		// So punctuation gets handled properly
529 					state = tsssStart;
530 				}
531 				break;
532 
533 			case tsssIntOrFloat:
534 				if (isDecimalChar(ch))
535 					charStr += ch;
536 				else if (ch == '.') {
537 					charStr += ch;
538 					state = tsssFloat;
539 				} else if (ch == 'e' || ch == 'E') {
540 					charStr += ch;
541 					state = tsssLonelyFloatExtension;
542 				} else {
543 					// Terminate the number
544 					AddInt(charStr.c_str(), startPos);
545 
546 					// Push the last char back on and try again
547 					stream.Unget();
548 					state = tsssStart;
549 				}
550 				break;
551 
552 			case tsssFloat:
553 				if (isDecimalChar(ch))
554 					charStr += ch;
555 				else if (ch == 'e' || ch == 'E') {
556 					charStr += ch;
557 					state = tsssLonelyFloatExtension;
558 				} else {
559 					// Terminate the number
560 					AddFloat(charStr.c_str(), startPos);
561 
562 					// Push the last char back on and try again
563 					stream.Unget();
564 					state = tsssStart;
565 				}
566 				break;
567 
568 			case tsssLonelyDecimalPoint:
569 				if (isDecimalChar(ch)) {
570 					charStr += ch;
571 					state = tsssFloat;
572 				} else
573 					throw new Err(std::string("Sniffer pattern error: incomplete floating point number"), pos);
574 				break;
575 
576 			case tsssLonelyMinusOrPlus:
577 				if (isDecimalChar(ch)) {
578 					charStr += ch;
579 					state = tsssIntOrFloat;
580 				} else if (ch == '.') {
581 					charStr += ch;
582 					state = tsssLonelyDecimalPoint;
583 				} else if (ch == 'i' && lastChar == '-') {
584 					AddToken(CaseInsensitiveFlag, startPos);
585 					state = tsssStart;
586 				} else
587 					throw new Err(std::string("Sniffer pattern error: incomplete signed number"), pos);
588 				break;
589 
590 			case tsssLonelyFloatExtension:
591 				if (ch == '+' || ch == '-') {
592 					charStr += ch;
593 					state = tsssLonelyFloatExtensionWithSign;
594 				} else if (isDecimalChar(ch)) {
595 					charStr += ch;
596 					state = tsssExtendedFloat;
597 				} else
598 					throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
599 				break;
600 
601 			case tsssLonelyFloatExtensionWithSign:
602 				if (isDecimalChar(ch)) {
603 					charStr += ch;
604 					state = tsssExtendedFloat;
605 				} else
606 					throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
607 				break;
608 
609 			case tsssExtendedFloat:
610 				if (isDecimalChar(ch)) {
611 					charStr += ch;
612 					state = tsssExtendedFloat;
613 				} else {
614 					// Terminate the number
615 					AddFloat(charStr.c_str(), startPos);
616 
617 					// Push the last char back on and try again
618 					stream.Unget();
619 					state = tsssStart;
620 				}
621 				break;
622 
623 			case tsssUnquoted:
624 				if (ch == '\\') {
625 					escapedState = state;		// Save our state
626 					state = tsssEscape;			// Handle the escape sequence
627 				} else if (isWhiteSpace(ch) || isPunctuation(ch)) {
628 					AddString(charStr, startPos);
629 					stream.Unget();				// In case it's punctuation, let tsssStart handle it
630 					state = tsssStart;
631 				} else if (ch == 0x3 && stream.IsEmpty()) {
632 					AddString(charStr, startPos);
633 					keepLooping = false;
634 				} else {
635 					charStr += ch;
636 				}
637 				break;
638 
639 			case tsssEscape:
640 				if (isOctalChar(ch)) {
641 					lastChar = ch;
642 					state = tsssEscapeOneOctal;
643 				} else if (ch == 'x') {
644 					state = tsssEscapeX;
645 				} else {
646 					// Check for a true end-of-text marker
647 					if (ch == 0x3 && stream.IsEmpty())
648 						throw new Err(std::string("Sniffer pattern error: incomplete escape sequence"), pos);
649 					else {
650 						charStr += escapeChar(ch);
651 						state = escapedState;	// Return to the state we were in before the escape
652 					}
653 				}
654 				break;
655 
656 			case tsssEscapeX:
657 				if (isHexChar(ch)) {
658 					lastChar = ch;
659 					state = tsssEscapeOneHex;
660 				} else
661 					throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
662 				break;
663 
664 			case tsssEscapeOneOctal:
665 				if (isOctalChar(ch)) {
666 					lastLastChar = lastChar;
667 					lastChar = ch;
668 					state = tsssEscapeTwoOctal;
669 				} else {
670 					// First handle the octal
671 					try {
672 						charStr += octalToChar(lastChar);
673 					} catch (Err *err) {
674 						if (err)
675 							err->SetPos(startPos);
676 						throw err;
677 					}
678 
679 					// Push the new char back on and let the state we
680 					// were in when the escape sequence was hit handle it.
681 					stream.Unget();
682 					state = escapedState;
683 				}
684 				break;
685 
686 			case tsssEscapeTwoOctal:
687 				if (isOctalChar(ch)) {
688 					try {
689 						charStr += octalToChar(lastLastChar, lastChar, ch);
690 					} catch (Err *err) {
691 						if (err)
692 							err->SetPos(startPos);
693 						throw err;
694 					}
695 					state = escapedState;
696 				} else {
697 					// First handle the octal
698 					try {
699 						charStr += octalToChar(lastLastChar, lastChar);
700 					} catch (Err *err) {
701 						if (err)
702 							err->SetPos(startPos);
703 						throw err;
704 					}
705 
706 					// Push the new char back on and let the state we
707 					// were in when the escape sequence was hit handle it.
708 					stream.Unget();
709 					state = escapedState;
710 				}
711 				break;
712 
713 			case tsssEscapeOneHex:
714 				if (isHexChar(ch)) {
715 					try {
716 						charStr += hexToChar(lastChar, ch);
717 					} catch (Err *err) {
718 						if (err)
719 							err->SetPos(pos);
720 						throw err;
721 					}
722 					state = escapedState;
723 				} else
724 					throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
725 				break;
726 
727 		}
728 	}
729 	if (state == tsssStart)	{
730 		fCStatus = B_OK;
731 		fPos = 0;
732 	} else {
733 		throw new Err("Sniffer pattern error: unterminated rule", stream.Pos());
734 	}
735 
736 	return fCStatus;
737 }
738 
739 void
740 TokenStream::Unset() {
741 	std::vector<Token*>::iterator i;
742 	for (i = fTokenList.begin(); i != fTokenList.end(); i++)
743 		delete *i;
744 	fTokenList.clear();
745 	fCStatus = B_NO_INIT;
746 	fStrLen = -1;
747 }
748 
749 status_t
750 TokenStream::InitCheck() const {
751 	return fCStatus;
752 }
753 
754 //! Returns a pointer to the next token in the stream.
755 /*! The TokenStream object retains owner ship of the Token object returned by Get().
756     If Get() is called at the end of the stream, a pointer to a Err object is thrown.
757 */
758 const Token*
759 TokenStream::Get() {
760 	if (fCStatus != B_OK)
761 		throw new Err("Sniffer parser error: TokenStream::Get() called on uninitialized TokenStream object", -1);
762 	if (fPos < (ssize_t)fTokenList.size())
763 		return fTokenList[fPos++];
764 	else {
765 		throw new Err("Sniffer pattern error: unterminated rule", EndPos());
766 //		fPos++;			// Increment fPos to keep Unget()s consistent
767 //		return NULL;	// Return NULL to signal end of list
768 	}
769 }
770 
771 //! Places token returned by the most recent call to Get() back on the head of the stream.
772 /*! If Unget() is called at the beginning of the stream, a pointer to a Err object is thrown.
773 */
774 void
775 TokenStream::Unget() {
776 	if (fCStatus != B_OK)
777 		throw new Err("Sniffer parser error: TokenStream::Unget() called on uninitialized TokenStream object", -1);
778 	if (fPos > 0)
779 		fPos--;
780 	else
781 		throw new Err("Sniffer parser error: TokenStream::Unget() called at beginning of token stream", -1);
782 }
783 
784 
785 /*! \brief Reads the next token in the stream and verifies it is of the given type,
786 	throwing a pointer to a Err object if it is not.
787 */
788 void
789 TokenStream::Read(TokenType type) {
790 	const Token *t = Get();
791 	if (t->Type() != type) {
792 		throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(type)
793 	                + ", found " + tokenTypeToString(t->Type())).c_str(), t->Pos());
794 	}
795 }
796 
797 //! Conditionally reads the next token in the stream.
798 /*! CondRead() peeks at the next token in the stream. If it is of the given type, the
799 	token is removed from the stream and \c true is returned. If it is not of the
800 	given type, false is returned and the token remains at the head of the stream.
801 */
802 bool
803 TokenStream::CondRead(TokenType type) {
804 	const Token *t = Get();
805 	if (t->Type() == type) {
806 		return true;
807 	} else {
808 		Unget();
809 		return false;
810 	}
811 }
812 
813 ssize_t
814 TokenStream::Pos() const {
815 	return fPos < (ssize_t)fTokenList.size() ? fTokenList[fPos]->Pos() : fStrLen;
816 }
817 
818 ssize_t
819 TokenStream::EndPos() const {
820 	return fStrLen;
821 }
822 
823 bool
824 TokenStream::IsEmpty() const {
825 	return fCStatus != B_OK || fPos >= (ssize_t)fTokenList.size();
826 }
827 
828 void
829 TokenStream::AddToken(TokenType type, ssize_t pos) {
830 	Token *token = new Token(type, pos);
831 	fTokenList.push_back(token);
832 }
833 
834 void
835 TokenStream::AddString(const std::string &str, ssize_t pos) {
836 	Token *token = new StringToken(str, pos);
837 	fTokenList.push_back(token);
838 }
839 
840 void
841 TokenStream::AddInt(const char *str, ssize_t pos) {
842 	// Convert the string to an int
843 	int32 value = atol(str);
844 	Token *token = new IntToken(value, pos);
845 	fTokenList.push_back(token);
846 }
847 
848 void
849 TokenStream::AddFloat(const char *str, ssize_t pos) {
850 	// Convert the string to a float
851 	double value = atof(str);
852 	Token *token = new FloatToken(value, pos);
853 	fTokenList.push_back(token);
854 }
855 
856 //------------------------------------------------------------------------------
857 // Helper functions
858 //------------------------------------------------------------------------------
859 
//! Maps the character following a backslash to the character it denotes.
/*! Only the letter escapes listed in the table below are translated; any
	other character is returned unchanged. Hex and octal escapes are
	handled elsewhere, so 'x' and digits are simply returned as they are.
*/
char
escapeChar(char ch) {
	static const char kEscapes[][2] = {
		{ 'a', '\a' },
		{ 'b', '\b' },
		{ 'f', '\f' },
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'v', '\v' },
	};
	const unsigned int count = sizeof(kEscapes) / sizeof(kEscapes[0]);
	for (unsigned int i = 0; i < count; i++) {
		if (kEscapes[i][0] == ch)
			return kEscapes[i][1];
	}
	return ch;
}
886 
887 // Converts 0x|hi|low| to a single char
888 char
889 hexToChar(char hi, char low) {
890 	return (hexToChar(hi) << 4)	| hexToChar(low);
891 }
892 
893 // Converts 0x|ch| to a single char
894 char
895 hexToChar(char hex) {
896 	if ('0' <= hex && hex <= '9')
897 		return hex-'0';
898 	else if ('a' <= hex && hex <= 'f')
899 		return hex-'a'+10;
900 	else if ('A' <= hex && hex <= 'F')
901 		return hex-'A'+10;
902 	else
903 		throw new Err(std::string("Sniffer parser error: invalid hex digit '") + hex + "' passed to hexToChar()", -1);
904 }
905 
906 char
907 octalToChar(char octal) {
908 	return octalToChar('0', '0', octal);
909 }
910 
911 char
912 octalToChar(char hi, char low) {
913 	return octalToChar('0', hi, low);
914 }
915 
916 char
917 octalToChar(char hi, char mid, char low) {
918 	if (isOctalChar(hi) && isOctalChar(mid) && isOctalChar(low)) {
919 		// Check for octals >= decimal 256
920 		if ((hi-'0') <= 3)
921 			return ((hi-'0') << 6) | ((mid-'0') << 3) | (low-'0');
922 		else
923 			throw new Err("Sniffer pattern error: invalid octal literal (octals must be between octal 0 and octal 377 inclusive)", -1);
924 	} else
925 		throw new Err(std::string("Sniffer parser error: invalid octal digit passed to hexToChar()"), -1);
926 }
927 
// Returns true if ch is a hexadecimal digit (0-9, a-f or A-F).
bool
isHexChar(char ch) {
	if (ch >= '0' && ch <= '9')
		return true;
	if (ch >= 'a' && ch <= 'f')
		return true;
	return ch >= 'A' && ch <= 'F';
}
934 
// Returns true if ch is a space, a newline or a tab.
bool
isWhiteSpace(char ch) {
	switch (ch) {
		case ' ':
		case '\n':
		case '\t':
			return true;
		default:
			return false;
	}
}
939 
// Returns true if ch is an octal digit (0-7).
bool
isOctalChar(char ch) {
	const bool tooLow = ch < '0';
	const bool tooHigh = ch > '7';
	return !tooLow && !tooHigh;
}
944 
// Returns true if ch is a decimal digit (0-9).
bool
isDecimalChar(char ch) {
	const bool tooLow = ch < '0';
	const bool tooHigh = ch > '9';
	return !tooLow && !tooHigh;
}
949 
// Returns true if ch is one of the punctuation characters of the
// sniffer rule syntax: & ( ) : [ ] |
bool
isPunctuation(char ch) {
	return ch == '&'
		|| ch == '(' || ch == ')'
		|| ch == ':'
		|| ch == '[' || ch == ']'
		|| ch == '|';
}
965 
966 const char*
967 BPrivate::Storage::Sniffer::tokenTypeToString(TokenType type) {
968 	switch (type) {
969 		case LeftParen:
970 			return "LeftParen";
971 			break;
972 		case RightParen:
973 			return "RightParen";
974 			break;
975 		case LeftBracket:
976 			return "LeftBracket";
977 			break;
978 		case RightBracket:
979 			return "RightBracket";
980 			break;
981 		case Colon:
982 			return "Colon";
983 			break;
984 		case Divider:
985 			return "Divider";
986 			break;
987 		case Ampersand:
988 			return "Ampersand";
989 			break;
990 		case CaseInsensitiveFlag:
991 			return "CaseInsensitiveFlag";
992 			break;
993 		case CharacterString:
994 			return "CharacterString";
995 			break;
996 		case Integer:
997 			return "Integer";
998 			break;
999 		case FloatingPoint:
1000 			return "FloatingPoint";
1001 			break;
1002 		default:
1003 			return "UNKNOWN TOKEN TYPE";
1004 			break;
1005 	}
1006 }
1007 
1008 //------------------------------------------------------------------------------
1009 // Parser
1010 //------------------------------------------------------------------------------
1011 
1012 Parser::Parser()
1013 	: fOutOfMemErr(new(nothrow) Err("Sniffer parser error: out of memory", -1))
1014 {
1015 }
1016 
1017 Parser::~Parser() {
1018 	delete fOutOfMemErr;
1019 }
1020 
1021 status_t
1022 Parser::Parse(const char *rule, Rule *result, BString *parseError) {
1023 	try {
1024 		if (!rule)
1025 			throw new Err("Sniffer pattern error: NULL pattern", -1);
1026 		if (!result)
1027 			return B_BAD_VALUE;
1028 		if (stream.SetTo(rule) != B_OK)
1029 			throw new Err("Sniffer parser error: Unable to intialize token stream", -1);
1030 
1031 		ParseRule(result);
1032 
1033 		return B_OK;
1034 
1035 	} catch (Err *err) {
1036 //		cout << "Caught error" << endl;
1037 		if (parseError)
1038 			parseError->SetTo(ErrorMessage(err, rule).c_str());
1039 		delete err;
1040 		return B_BAD_MIME_SNIFFER_RULE;
1041 	}
1042 }
1043 
1044 std::string
1045 Parser::ErrorMessage(Err *err, const char *rule) {
1046 	const char* msg = (err && err->Msg())
1047     	                ? err->Msg()
1048     	                  : "Sniffer parser error: Unexpected error with no supplied error message";
1049     ssize_t pos = err && (err->Pos() >= 0) ? err->Pos() : 0;
1050     std::string str = std::string(rule ? rule : "") + "\n";
1051     for (int i = 0; i < pos; i++)
1052     	str += " ";
1053     str += "^    ";
1054     str += msg;
1055     return str;
1056 }
1057 
1058 void
1059 Parser::ParseRule(Rule *result) {
1060 	if (!result)
1061 		throw new Err("Sniffer parser error: NULL Rule object passed to Parser::ParseRule()", -1);
1062 
1063 	// Priority
1064 	double priority = ParsePriority();
1065 	// Conjunction List
1066 	std::vector<DisjList*>* list = ParseConjList();
1067 
1068 	result->SetTo(priority, list);
1069 }
1070 
1071 double
1072 Parser::ParsePriority() {
1073 	const Token *t = stream.Get();
1074 	if (t->Type() == FloatingPoint || t->Type() == Integer) {
1075 		double result = t->Float();
1076 		if (0.0 <= result && result <= 1.0)
1077 			return result;
1078 		else {
1079 //			cout << "(priority == " << result << ")" << endl;
1080 			throw new Err("Sniffer pattern error: invalid priority", t->Pos());
1081 		}
1082 	} else
1083 		throw new Err("Sniffer pattern error: match level expected", t->Pos());	// Same as R5
1084 }
1085 
1086 std::vector<DisjList*>*
1087 Parser::ParseConjList() {
1088 	std::vector<DisjList*> *list = new(nothrow) std::vector<DisjList*>;
1089 	if (!list)
1090 		ThrowOutOfMemError(stream.Pos());
1091 	try {
1092 		// DisjList+
1093 		int count = 0;
1094 		while (true) {
1095 			DisjList* expr = ParseDisjList();
1096 			if (!expr)
1097 				break;
1098 			else {
1099 				list->push_back(expr);
1100 				count++;
1101 			}
1102 		}
1103 		if (count == 0)
1104 			throw new Err("Sniffer pattern error: missing expression", -1);
1105 	} catch (...) {
1106 		delete list;
1107 		throw;
1108 	}
1109 	return list;
1110 }
1111 
1112 DisjList*
1113 Parser::ParseDisjList() {
1114 	// If we've run out of tokens right now, it's okay, but
1115 	// we need to let ParseConjList() know what's up
1116 	if (stream.IsEmpty())
1117 		return NULL;
1118 
1119 	// Peek ahead, then let the appropriate Parse*List()
1120 	// functions handle things
1121 	const Token *t1 = stream.Get();
1122 
1123 	// PatternList | RangeList
1124 	if (t1->Type() == LeftParen) {
1125 		const Token *t2 = stream.Get();
1126 		// Skip the case-insensitive flag, if there is one
1127 		const Token *tokenOfInterest = (t2->Type() == CaseInsensitiveFlag) ? stream.Get() : t2;
1128 		if (t2 != tokenOfInterest)
1129 			stream.Unget();	// We called Get() three times
1130 		stream.Unget();
1131 		stream.Unget();
1132 		// RangeList
1133 		if (tokenOfInterest->Type() == LeftBracket) {
1134 			return ParseRPatternList();
1135 		// PatternList
1136 		} else {
1137 			return ParsePatternList(Range(0,0));
1138 		}
1139 	// Range, PatternList
1140 	} else if (t1->Type() == LeftBracket) {
1141 		stream.Unget();
1142 		return ParsePatternList(ParseRange());
1143 	} else {
1144 		throw new Err("Sniffer pattern error: missing pattern", t1->Pos());	// Same as R5
1145 	}
1146 
1147 	// PatternList
1148 	// RangeList
1149 	// Range + PatternList
1150 }
1151 
1152 Range
1153 Parser::ParseRange() {
1154 	int32 start, end;
1155 	// LeftBracket
1156 	stream.Read(LeftBracket);
1157 	// Integer
1158 	{
1159 		const Token *t = stream.Get();
1160 		if (t->Type() == Integer) {
1161 			start = t->Int();
1162 			end = start;	// In case we aren't given an explicit end
1163 		} else
1164 			throw new Err("Sniffer pattern error: pattern offset expected", t->Pos());
1165 	}
1166 	// [Colon, Integer] RightBracket
1167 	{
1168 		const Token *t = stream.Get();
1169 		// Colon, Integer, RightBracket
1170 		if (t->Type() == Colon) {
1171 			// Integer
1172 			{
1173 				const Token *t = stream.Get();
1174 				if (t->Type() == Integer) {
1175 					end = t->Int();
1176 				} else
1177 					ThrowUnexpectedTokenError(Integer, t);
1178 			}
1179 			// RightBracket
1180 			stream.Read(RightBracket);
1181 		// !(Colon, Integer) RightBracket
1182 		} else if (t->Type() == RightBracket) {
1183 			// Nothing to do here...
1184 
1185 		// Something else...
1186 		} else
1187 			ThrowUnexpectedTokenError(Colon, Integer, t);
1188 	}
1189 	Range range(start, end);
1190 	if (range.InitCheck() == B_OK)
1191 		return range;
1192 	else
1193 		throw range.GetErr();
1194 }
1195 
1196 DisjList*
1197 Parser::ParsePatternList(Range range) {
1198 	PatternList *list = new(nothrow) PatternList(range);
1199 	if (!list)
1200 		ThrowOutOfMemError(stream.Pos());
1201 	try {
1202 		// LeftParen
1203 		stream.Read(LeftParen);
1204 		// [Flag] Pattern, (Divider, [Flag] Pattern)*
1205 		while (true) {
1206 			// [Flag]
1207 			if (stream.CondRead(CaseInsensitiveFlag))
1208 				list->SetCaseInsensitive(true);
1209 			// Pattern
1210 			list->Add(ParsePattern());
1211 			// [Divider]
1212 			if (!stream.CondRead(Divider))
1213 				break;
1214 		}
1215 		// RightParen
1216 		const Token *t = stream.Get();
1217 		if (t->Type() != RightParen)
1218 			throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1219 	} catch (...) {
1220 		delete list;
1221 		throw;
1222 	}
1223 	return list;
1224 }
1225 
1226 DisjList*
1227 Parser::ParseRPatternList() {
1228 	RPatternList *list = new(nothrow) RPatternList();
1229 	if (!list)
1230 		ThrowOutOfMemError(stream.Pos());
1231 	try {
1232 		// LeftParen
1233 		stream.Read(LeftParen);
1234 		// [Flag] RPattern, (Divider, [Flag] RPattern)*
1235 		while (true) {
1236 			// [Flag]
1237 			if (stream.CondRead(CaseInsensitiveFlag))
1238 				list->SetCaseInsensitive(true);
1239 			// RPattern
1240 			list->Add(ParseRPattern());
1241 			// [Divider]
1242 			if (!stream.CondRead(Divider))
1243 				break;
1244 		}
1245 		// RightParen
1246 		const Token *t = stream.Get();
1247 		if (t->Type() != RightParen)
1248 			throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1249 	} catch (...) {
1250 		delete list;
1251 		throw;
1252 	}
1253 	return list;
1254 }
1255 
1256 RPattern*
1257 Parser::ParseRPattern() {
1258 	// Range
1259 	Range range = ParseRange();
1260 	// Pattern
1261 	Pattern *pattern = ParsePattern();
1262 
1263 	RPattern *result = new(nothrow) RPattern(range, pattern);
1264 	if (result) {
1265 		if (result->InitCheck() == B_OK)
1266 			return result;
1267 		else {
1268 			Err *err = result->GetErr();
1269 			delete result;
1270 			throw err;
1271 		}
1272 	} else
1273 		ThrowOutOfMemError(stream.Pos());
1274 }
1275 
1276 Pattern*
1277 Parser::ParsePattern() {
1278 	std::string str;
1279 	// String
1280 	{
1281 		const Token *t = stream.Get();
1282 		if (t->Type() == CharacterString)
1283 			str = t->String();
1284 		else
1285 			throw new Err("Sniffer pattern error: missing pattern", t->Pos());
1286 	}
1287 	// [Ampersand, String]
1288 	if (stream.CondRead(Ampersand)) {
1289 		// String (i.e. Mask)
1290 		const Token *t = stream.Get();
1291 		if (t->Type() == CharacterString) {
1292 			Pattern *result = new(nothrow) Pattern(str, t->String());
1293 			if (!result)
1294 				ThrowOutOfMemError(t->Pos());
1295 			if (result->InitCheck() == B_OK) {
1296 				return result;
1297 			} else {
1298 				Err *err = result->GetErr();
1299 				delete result;
1300 				if (err) {
1301 					err->SetPos(t->Pos());
1302 				}
1303 				throw err;
1304 			}
1305 		} else
1306 			ThrowUnexpectedTokenError(CharacterString, t);
1307 	} else {
1308 		// No mask specified.
1309 		Pattern *result = new(nothrow) Pattern(str);
1310 		if (result) {
1311 			if (result->InitCheck() == B_OK)
1312 				return result;
1313 			else {
1314 				Err *err = result->GetErr();
1315 				delete result;
1316 				throw err;
1317 			}
1318 		} else
1319 			ThrowOutOfMemError(stream.Pos());
1320 	}
1321 }
1322 
1323 void
1324 Parser::ThrowEndOfStreamError() {
1325 	throw new Err("Sniffer pattern error: unterminated rule", stream.EndPos());
1326 }
1327 
1328 inline
1329 void
1330 Parser::ThrowOutOfMemError(ssize_t pos) {
1331 	if (fOutOfMemErr)
1332 		fOutOfMemErr->SetPos(pos);
1333 	Err *err = fOutOfMemErr;
1334 	fOutOfMemErr = NULL;
1335 	throw err;
1336 }
1337 
1338 void
1339 Parser::ThrowUnexpectedTokenError(TokenType expected, const Token *found) {
1340 	throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected)
1341 	                + ", found " + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1342 	                , (found ? found->Pos() : stream.EndPos()));
1343 }
1344 
1345 void
1346 Parser::ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found) {
1347 	throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected1)
1348 	                + " or " + tokenTypeToString(expected2) + ", found "
1349 	                + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1350 	                , (found ? found->Pos() : stream.EndPos()));
1351 }
1352 
1353 
1354 
1355