xref: /haiku/src/kits/storage/sniffer/Parser.cpp (revision 60ee71d302aad04031b745b4505944dfa9c881a2)
1 //----------------------------------------------------------------------
2 //  This software is part of the OpenBeOS distribution and is covered
3 //  by the OpenBeOS license.
4 //----------------------------------------------------------------------
5 /*!
6 	\file sniffer/Parser.cpp
7 	MIME sniffer rule parser implementation
8 */
9 
10 //#include <sniffer/Expr.h>
11 #include <sniffer/Parser.h>
12 #include <sniffer/Pattern.h>
13 #include <sniffer/PatternList.h>
14 #include <sniffer/Range.h>
15 #include <sniffer/RPattern.h>
16 #include <sniffer/RPatternList.h>
17 #include <sniffer/Rule.h>
18 
19 #include <new.h>
20 #include <stdio.h>
21 #include <stdlib.h>	// For atol(), atof()
22 #include <string.h>
23 #include <String.h>	// BString
24 
25 using namespace Sniffer;
26 
// Miscellaneous helper functions (defined further down in this file)

// Escape sequence and numeric literal conversion
char escapeChar(char ch);
char hexToChar(char hex);
char hexToChar(char hi, char low);
char octalToChar(char octal);
char octalToChar(char hi, char low);
char octalToChar(char hi, char mid, char low);

// Character classification
bool isWhiteSpace(char ch);
bool isPunctuation(char ch);
bool isDecimalChar(char ch);
bool isOctalChar(char ch);
bool isHexChar(char ch);
39 
40 //! Parses the given rule.
41 /*! The resulting parsed Sniffer::Rule structure is stored in \c rule, which
42 	must be pre-allocated. If parsing fails, a descriptive error message (meant
43 	to be viewed in a monospaced font) is placed in the pre-allocated \c BString
44 	pointed to by \c parseError (which may be \c NULL if you don't care about
45 	the error message).
46 
47 	\param rule Pointer to a NULL-terminated string containing the sniffer
48 	            rule to be parsed
49 	\param result Pointer to a pre-allocated \c Sniffer::Rule object into which the result
50 	              of parsing is placed upon success.
51 	\param parseError Point to pre-allocated \c BString object into which
52 	                  a descriptive error message is stored upon failure.
53 
54 	\return
55 	- B_OK: Success
56 	- B_BAD_MIME_SNIFFER_RULE: Failure
57 */
58 status_t
59 Sniffer::parse(const char *rule, Rule *result, BString *parseError) {
60 	Parser parser;
61 	return parser.Parse(rule, result, parseError);
62 }
63 
64 //------------------------------------------------------------------------------
65 // CharStream
66 //------------------------------------------------------------------------------
67 
68 CharStream::CharStream(const char *string)
69 	: fString(NULL)
70 	, fPos(0)
71 	, fLen(-1)
72 	, fCStatus(B_NO_INIT)
73 {
74 	SetTo(string);
75 }
76 
77 CharStream::~CharStream() {
78 	Unset();
79 }
80 
81 status_t
82 CharStream::SetTo(const char *string) {
83 	Unset();
84 	if (string) {
85 		fString = new(nothrow) char[strlen(string)+1];
86 		if (!fString)
87 			fCStatus = B_NO_MEMORY;
88 		else {
89 			strcpy(fString, string);
90 			fLen = strlen(fString);
91 			fCStatus = B_OK;
92 		}
93 	}
94 	return fCStatus;
95 }
96 
97 void
98 CharStream::Unset() {
99 	delete fString;
100 	fCStatus = B_NO_INIT;
101 	fPos = 0;
102 	fLen = -1;
103 }
104 
105 status_t
106 CharStream::InitCheck() const {
107 	return fCStatus;
108 }
109 
110 bool
111 CharStream::IsEmpty() const {
112 	return fPos >= fLen;
113 }
114 
115 ssize_t
116 CharStream::Pos() const {
117 	return fPos;
118 }
119 
120 const char*
121 CharStream::String() const {
122 	return fString;
123 }
124 
125 char
126 CharStream::Get() {
127 	if (fCStatus != B_OK)
128 		throw new Err("Sniffer parser error: CharStream::Get() called on uninitialized CharStream object", -1);
129 	if (fPos < fLen)
130 		return fString[fPos++];
131 	else {
132 		fPos++;		// Increment fPos to keep Unget()s consistent
133 		return 0x3;	// Return End-Of-Text char
134 	}
135 }
136 
137 void
138 CharStream::Unget() {
139 	if (fCStatus != B_OK)
140 		throw new Err("Sniffer parser error: CharStream::Unget() called on uninitialized CharStream object", -1);
141 	if (fPos > 0)
142 		fPos--;
143 	else
144 		throw new Err("Sniffer parser error: CharStream::Unget() called at beginning of character stream", -1);
145 }
146 
147 //------------------------------------------------------------------------------
148 // Token
149 //------------------------------------------------------------------------------
150 
151 Token::Token(TokenType type, const ssize_t pos)
152 	: fType(type)
153 	, fPos(pos)
154 {
155 //	if (type != EmptyToken)
156 //		cout << "New Token, fType == " << tokenTypeToString(fType) << endl;
157 }
158 
159 Token::~Token() {
160 }
161 
162 TokenType
163 Token::Type() const {
164 	return fType;
165 }
166 
167 const char*
168 Token::String() const {
169 	throw new Err("Sniffer scanner error: Token::String() called on non-string token", fPos);
170 }
171 
172 int32
173 Token::Int() const {
174 	throw new Err("Sniffer scanner error: Token::Int() called on non-integer token", fPos);
175 }
176 
177 double
178 Token::Float() const {
179 	throw new Err("Sniffer scanner error: Token::Float() called on non-float token", fPos);
180 }
181 
182 ssize_t
183 Token::Pos() const {
184 	return fPos;
185 }
186 
187 bool
188 Token::operator==(Token &ref) const {
189 	// Compare types, then data if necessary
190 	if (Type() == ref.Type()) {
191 		switch (Type()) {
192 			case CharacterString:
193 //				printf(" str1 == '%s'\n", String());
194 //				printf(" str2 == '%s'\n", ref.String());
195 //				printf(" strcmp() == %d\n", strcmp(String(), ref.String()));
196 			{
197 				// strcmp() seems to choke on certain, non-normal ASCII chars
198 				// (i.e. chars outside the usual alphabets, but still valid
199 				// as far as ASCII is concerned), so we'll just compare the
200 				// strings by hand to be safe.
201 				const char *str1 = String();
202 				const char *str2 = ref.String();
203 				int len1 = strlen(str1);
204 				int len2 = strlen(str2);
205 //				printf("len1 == %d\n", len1);
206 //				printf("len2 == %d\n", len2);
207 				if (len1 == len2) {
208 					for (int i = 0; i < len1; i++) {
209 //						printf("i == %d, str1[%d] == %x, str2[%d] == %x\n", i, i, str1[i], i, str2[i]);
210 						if (str1[i] != str2[i])
211 							return false;
212 					}
213 				}
214 				return true;
215 			}
216 //				return strcmp(String(), ref.String()) == 0;
217 
218 			case Integer:
219 				return Int() == ref.Int();
220 
221 			case FloatingPoint:
222 				return Float() == ref.Float();
223 
224 			default:
225 				return true;
226 		}
227 	} else
228 		return false;
229 }
230 
231 //------------------------------------------------------------------------------
232 // StringToken
233 //------------------------------------------------------------------------------
234 
235 StringToken::StringToken(const char *string, const ssize_t pos)
236 	: Token(CharacterString, pos)
237 	, fString(NULL)
238 {
239 	if (string) {
240 		fString = new(nothrow) char[strlen(string)+1];
241 		if (fString)
242 			strcpy(fString, string);
243 	}
244 }
245 
246 StringToken::~StringToken() {
247 	delete fString;
248 }
249 
250 const char*
251 StringToken::String() const {
252 	return fString;
253 }
254 
255 //------------------------------------------------------------------------------
256 // IntToken
257 //------------------------------------------------------------------------------
258 
259 IntToken::IntToken(const int32 value, const ssize_t pos)
260 	: Token(Integer, pos)
261 	, fValue(value)
262 {
263 }
264 
265 int32
266 IntToken::Int() const {
267 	return fValue;
268 }
269 
270 double
271 IntToken::Float() const {
272 	return (double)fValue;
273 }
274 
275 //------------------------------------------------------------------------------
276 // FloatToken
277 //------------------------------------------------------------------------------
278 
279 FloatToken::FloatToken(const double value, const ssize_t pos)
280 	: Token(FloatingPoint, pos)
281 	, fValue(value)
282 {
283 }
284 
285 double
286 FloatToken::Float() const {
287 	return fValue;
288 }
289 
290 //------------------------------------------------------------------------------
291 // TokenStream
292 //------------------------------------------------------------------------------
293 
294 TokenStream::TokenStream(const char *string = NULL)
295 	: fCStatus(B_NO_INIT)
296 	, fPos(-1)
297 	, fStrLen(-1)
298 {
299 	SetTo(string);
300 }
301 
302 TokenStream::~TokenStream() {
303 	Unset();
304 }
305 
306 status_t
307 TokenStream::SetTo(const char *string) {
308 int q = 0;
309 	Unset();
310 	if (string) {
311 		fStrLen = strlen(string);
312 		CharStream stream(string);
313 		if (stream.InitCheck() != B_OK)
314 			throw new Err("Sniffer scanner error: Unable to intialize character stream", -1);
315 
316 		typedef enum TokenStreamScannerState {
317 			tsssStart,
318 			tsssOneSingle,
319 			tsssOneDouble,
320 			tsssOneZero,
321 			tsssZeroX,
322 			tsssOneHex,
323 			tsssTwoHex,
324 			tsssHexStringEnd,
325 			tsssIntOrFloat,
326 			tsssFloat,
327 			tsssLonelyDecimalPoint,
328 			tsssLonelyMinusOrPlus,
329 			tsssLonelyFloatExtension,
330 			tsssLonelyFloatExtensionWithSign,
331 			tsssExtendedFloat,
332 			tsssUnquoted,
333 			tsssEscape,
334 			tsssEscapeX,
335 			tsssEscapeOneOctal,
336 			tsssEscapeTwoOctal,
337 			tsssEscapeOneHex,
338 			tsssEscapeTwoHex
339 		};
340 
341 		TokenStreamScannerState state = tsssStart;
342 		TokenStreamScannerState escapedState;
343 			// Used to remember which state to return to from an escape sequence
344 
345 		std::string charStr;	// Used to build up character strings
346 		char lastChar;			// For two char lookahead
347 		char lastLastChar;		// For three char lookahead
348 		bool keepLooping = true;
349 		ssize_t startPos;
350 		while (keepLooping) {
351 			ssize_t pos = stream.Pos();
352 			char ch = stream.Get();
353 			switch (state) {
354 				case tsssStart:
355 					startPos = pos;
356 					switch (ch) {
357 						case 0x3:	// End-Of-Text
358 							if (stream.IsEmpty())
359 								keepLooping = false;
360 							else
361 								throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
362 							break;
363 
364 						case '\t':
365 						case '\n':
366 						case ' ':
367 							// Whitespace, so ignore it.
368 							break;
369 
370 						case '"':
371 							charStr = "";
372 							state = tsssOneDouble;
373 							break;
374 
375 						case '\'':
376 							charStr = "";
377 							state = tsssOneSingle;
378 							break;
379 
380 						case '+':
381 						case '-':
382 							charStr = ch;
383 							state = tsssLonelyMinusOrPlus;
384 							break;
385 
386 						case '.':
387 							charStr = ch;
388 							state = tsssLonelyDecimalPoint;
389 							break;
390 
391 						case '0':
392 							charStr = ch;
393 							state = tsssOneZero;
394 							break;
395 
396 						case '1':
397 						case '2':
398 						case '3':
399 						case '4':
400 						case '5':
401 						case '6':
402 						case '7':
403 						case '8':
404 						case '9':
405 							charStr = ch;
406 							state = tsssIntOrFloat;
407 							break;
408 
409 						case '&':	AddToken(Ampersand, pos);		break;
410 						case '(':	AddToken(LeftParen, pos);		break;
411 						case ')':	AddToken(RightParen, pos);		break;
412 						case ':':	AddToken(Colon, pos);			break;
413 						case '[':	AddToken(LeftBracket, pos);		break;
414 
415 						case '\\':
416 							charStr = "";					// Clear our string
417 							state = tsssEscape;
418 							escapedState = tsssUnquoted;	// Unquoted strings begin with an escaped character
419 							break;
420 
421 						case ']':	AddToken(RightBracket, pos);		break;
422 						case '|':	AddToken(Divider, pos);			break;
423 
424 						default:
425 							throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
426 					}
427 					break;
428 
429 				case tsssOneSingle:
430 					switch (ch) {
431 						case '\\':
432 							escapedState = state;		// Save our state
433 							state = tsssEscape;			// Handle the escape sequence
434 							break;
435 						case '\'':
436 							AddString(charStr.c_str(), startPos);
437 							state = tsssStart;
438 							break;
439 						case 0x3:
440 							if (stream.IsEmpty())
441 								throw new Err(std::string("Sniffer pattern error: unterminated single-quoted string"), pos);
442 							else
443 								charStr += ch;
444 							break;
445 						default:
446 							charStr += ch;
447 							break;
448 					}
449 					break;
450 
451 				case tsssOneDouble:
452 					switch (ch) {
453 						case '\\':
454 							escapedState = state;		// Save our state
455 							state = tsssEscape;			// Handle the escape sequence
456 							break;
457 						case '"':
458 							AddString(charStr.c_str(), startPos);
459 							state = tsssStart;
460 							break;
461 						case 0x3:
462 							if (stream.IsEmpty())
463 								throw new Err(std::string("Sniffer pattern error: unterminated double-quoted string"), pos);
464 							else
465 								charStr += ch;
466 							break;
467 						default:
468 							charStr += ch;
469 							break;
470 					}
471 					break;
472 
473 				case tsssOneZero:
474 					if (ch == 'x') {
475 						charStr = "";	// Reinit, since we actually have a hex string
476 						state = tsssZeroX;
477 					} else if ('0' <= ch && ch <= '9') {
478 						charStr += ch;
479 						state = tsssIntOrFloat;
480 					} else if (ch == '.') {
481 						charStr += ch;
482 						state = tsssFloat;
483 					} else if (ch == 'e' || ch == 'E') {
484 						charStr += ch;
485 						state = tsssLonelyFloatExtension;
486 					} else {
487 						// Terminate the number
488 						AddInt(charStr.c_str(), startPos);
489 
490 						// Push the last char back on and try again
491 						stream.Unget();
492 						state = tsssStart;
493 					}
494 					break;
495 
496 				case tsssZeroX:
497 					if (isHexChar(ch)) {
498 						lastChar = ch;
499 						state = tsssOneHex;
500 					} else
501 						throw new Err(std::string("Sniffer pattern error: incomplete hex code"), pos);
502 					break;
503 
504 				case tsssOneHex:
505 					if (isHexChar(ch)) {
506 						try {
507 							charStr += hexToChar(lastChar, ch);
508 						} catch (Err *err) {
509 							if (err)
510 								err->SetPos(pos);
511 							throw err;
512 						}
513 						state = tsssTwoHex;
514 					} else
515 						throw new Err(std::string("Sniffer pattern error: bad hex literal"), pos);	// Same as R5
516 					break;
517 
518 				case tsssTwoHex:
519 					if (isHexChar(ch)) {
520 						lastChar = ch;
521 						state = tsssOneHex;
522 					} else {
523 						AddString(charStr.c_str(), startPos);
524 						stream.Unget();		// So punctuation gets handled properly
525 						state = tsssStart;
526 					}
527 					break;
528 
529 				case tsssIntOrFloat:
530 					if (isDecimalChar(ch))
531 						charStr += ch;
532 					else if (ch == '.') {
533 						charStr += ch;
534 						state = tsssFloat;
535 					} else if (ch == 'e' || ch == 'E') {
536 						charStr += ch;
537 						state = tsssLonelyFloatExtension;
538 					} else {
539 						// Terminate the number
540 						AddInt(charStr.c_str(), startPos);
541 
542 						// Push the last char back on and try again
543 						stream.Unget();
544 						state = tsssStart;
545 					}
546 					break;
547 
548 				case tsssFloat:
549 					if (isDecimalChar(ch))
550 						charStr += ch;
551 					else if (ch == 'e' || ch == 'E') {
552 						charStr += ch;
553 						state = tsssLonelyFloatExtension;
554 					} else {
555 						// Terminate the number
556 						AddFloat(charStr.c_str(), startPos);
557 
558 						// Push the last char back on and try again
559 						stream.Unget();
560 						state = tsssStart;
561 					}
562 					break;
563 
564 				case tsssLonelyDecimalPoint:
565 					if (isDecimalChar(ch)) {
566 						charStr += ch;
567 						state = tsssFloat;
568 					} else
569 						throw new Err(std::string("Sniffer pattern error: incomplete floating point number"), pos);
570 					break;
571 
572 				case tsssLonelyMinusOrPlus:
573 					if (isDecimalChar(ch)) {
574 						charStr += ch;
575 						state = tsssIntOrFloat;
576 					} else if (ch == '.') {
577 						charStr += ch;
578 						state = tsssLonelyDecimalPoint;
579 					} else
580 						throw new Err(std::string("Sniffer pattern error: incomplete signed number"), pos);
581 					break;
582 
583 				case tsssLonelyFloatExtension:
584 					if (ch == '+' || ch == '-') {
585 						charStr += ch;
586 						state = tsssLonelyFloatExtensionWithSign;
587 					} else if (isDecimalChar(ch)) {
588 						charStr += ch;
589 						state = tsssExtendedFloat;
590 					} else
591 						throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
592 					break;
593 
594 				case tsssLonelyFloatExtensionWithSign:
595 					if (isDecimalChar(ch)) {
596 						charStr += ch;
597 						state = tsssExtendedFloat;
598 					} else
599 						throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
600 					break;
601 
602 				case tsssExtendedFloat:
603 					if (isDecimalChar(ch)) {
604 						charStr += ch;
605 						state = tsssExtendedFloat;
606 					} else {
607 						// Terminate the number
608 						AddFloat(charStr.c_str(), startPos);
609 
610 						// Push the last char back on and try again
611 						stream.Unget();
612 						state = tsssStart;
613 					}
614 					break;
615 
616 				case tsssUnquoted:
617 					if (ch == '\\') {
618 						escapedState = state;		// Save our state
619 						state = tsssEscape;			// Handle the escape sequence
620 					} else if (isWhiteSpace(ch) || isPunctuation(ch)) {
621 						AddString(charStr.c_str(), startPos);
622 						stream.Unget();				// In case it's punctuation, let tsssStart handle it
623 						state = tsssStart;
624 					} else if (ch == '\'' || ch == '"') {
625 						throw new Err(std::string("Sniffer pattern error: illegal unquoted character '") + ch + "'", pos);
626 					} else if (ch == 0x3 && stream.IsEmpty()) {
627 						AddString(charStr.c_str(), startPos);
628 						keepLooping = false;
629 					} else {
630 						charStr += ch;
631 					}
632 					break;
633 
634 				case tsssEscape:
635 					if (isOctalChar(ch)) {
636 						lastChar = ch;
637 						state = tsssEscapeOneOctal;
638 					} else if (ch == 'x') {
639 						state = tsssEscapeX;
640 					} else {
641 						// Check for a true end-of-text marker
642 						if (ch == 0x3 && stream.IsEmpty())
643 							throw new Err(std::string("Sniffer pattern error: incomplete escape sequence"), pos);
644 						else {
645 							charStr += escapeChar(ch);
646 							state = escapedState;	// Return to the state we were in before the escape
647 						}
648 					}
649 					break;
650 
651 				case tsssEscapeX:
652 					if (isHexChar(ch)) {
653 						lastChar = ch;
654 						state = tsssEscapeOneHex;
655 					} else
656 						throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
657 					break;
658 
659 				case tsssEscapeOneOctal:
660 					if (isOctalChar(ch)) {
661 						lastLastChar = lastChar;
662 						lastChar = ch;
663 						state = tsssEscapeTwoOctal;
664 					} else {
665 						// First handle the octal
666 						try {
667 							charStr += octalToChar(lastChar);
668 						} catch (Err *err) {
669 							if (err)
670 								err->SetPos(startPos);
671 							throw err;
672 						}
673 
674 						// Push the new char back on and let the state we
675 						// were in when the escape sequence was hit handle it.
676 						stream.Unget();
677 						state = escapedState;
678 					}
679 					break;
680 
681 				case tsssEscapeTwoOctal:
682 					if (isOctalChar(ch)) {
683 						try {
684 							charStr += octalToChar(lastLastChar, lastChar, ch);
685 						} catch (Err *err) {
686 							if (err)
687 								err->SetPos(startPos);
688 							throw err;
689 						}
690 						state = escapedState;
691 					} else {
692 						// First handle the octal
693 						try {
694 							charStr += octalToChar(lastLastChar, lastChar);
695 						} catch (Err *err) {
696 							if (err)
697 								err->SetPos(startPos);
698 							throw err;
699 						}
700 
701 						// Push the new char back on and let the state we
702 						// were in when the escape sequence was hit handle it.
703 						stream.Unget();
704 						state = escapedState;
705 					}
706 					break;
707 
708 				case tsssEscapeOneHex:
709 					if (isHexChar(ch)) {
710 						try {
711 							charStr += hexToChar(lastChar, ch);
712 						} catch (Err *err) {
713 							if (err)
714 								err->SetPos(pos);
715 							throw err;
716 						}
717 						state = escapedState;
718 					} else
719 						throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
720 					break;
721 
722 			}
723 		}
724 		if (state == tsssStart)	{
725 			fCStatus = B_OK;
726 			fPos = 0;
727 		} else {
728 			throw new Err("Sniffer pattern error: unterminated rule", stream.Pos());
729 		}
730 	}
731 
732 	return fCStatus;
733 }
734 
735 void
736 TokenStream::Unset() {
737 	std::vector<Token*>::iterator i;
738 	for (i = fTokenList.begin(); i != fTokenList.end(); i++)
739 		delete *i;
740 	fTokenList.clear();
741 	fCStatus = B_NO_INIT;
742 	fStrLen = -1;
743 }
744 
745 status_t
746 TokenStream::InitCheck() const {
747 	return fCStatus;
748 }
749 
750 //! Returns a pointer to the next token in the stream.
751 /*! The TokenStream object retains owner ship of the Token object returned by Get().
752     If Get() is called at the end of the stream, a pointer to a Sniffer::Err object is thrown.
753 */
754 const Token*
755 TokenStream::Get() {
756 	if (fCStatus != B_OK)
757 		throw new Err("Sniffer parser error: TokenStream::Get() called on uninitialized TokenStream object", -1);
758 	if (fPos < fTokenList.size())
759 		return fTokenList[fPos++];
760 	else {
761 		throw new Err("Sniffer pattern error: unterminated rule", EndPos());
762 //		fPos++;			// Increment fPos to keep Unget()s consistent
763 //		return NULL;	// Return NULL to signal end of list
764 	}
765 }
766 
767 //! Places token returned by the most recent call to Get() back on the head of the stream.
768 /*! If Unget() is called at the beginning of the stream, a pointer to a Sniffer::Err object is thrown.
769 */
770 void
771 TokenStream::Unget() {
772 	if (fCStatus != B_OK)
773 		throw new Err("Sniffer parser error: TokenStream::Unget() called on uninitialized TokenStream object", -1);
774 	if (fPos > 0)
775 		fPos--;
776 	else
777 		throw new Err("Sniffer parser error: TokenStream::Unget() called at beginning of token stream", -1);
778 }
779 
780 
781 /*! \brief Reads the next token in the stream and verifies it is of the given type,
782 	throwing a pointer to a Sniffer::Err object if it is not.
783 */
784 void
785 TokenStream::Read(TokenType type) {
786 	const Token *t = Get();
787 	if (t->Type() != type) {
788 		throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(type)
789 	                + ", found " + tokenTypeToString(t->Type())).c_str(), t->Pos());
790 	}
791 }
792 
793 //! Conditionally reads the next token in the stream.
794 /*! CondRead() peeks at the next token in the stream. If it is of the given type, the
795 	token is removed from the stream and \c true is returned. If it is not of the
796 	given type, false is returned and the token remains at the head of the stream.
797 */
798 bool
799 TokenStream::CondRead(TokenType type) {
800 	const Token *t = Get();
801 	if (t->Type() == type) {
802 		return true;
803 	} else {
804 		Unget();
805 		return false;
806 	}
807 }
808 
809 ssize_t
810 TokenStream::Pos() const {
811 	return fPos < fTokenList.size() ? fTokenList[fPos]->Pos() : fStrLen;
812 }
813 
814 ssize_t
815 TokenStream::EndPos() const {
816 	return fStrLen;
817 }
818 
819 bool
820 TokenStream::IsEmpty() const {
821 	return fCStatus != B_OK || fPos >= fTokenList.size();
822 }
823 
824 void
825 TokenStream::AddToken(TokenType type, ssize_t pos) {
826 	Token *token = new Token(type, pos);
827 	fTokenList.push_back(token);
828 }
829 
830 void
831 TokenStream::AddString(const char *str, ssize_t pos) {
832 	Token *token = new StringToken(str, pos);
833 	fTokenList.push_back(token);
834 }
835 
836 void
837 TokenStream::AddInt(const char *str, ssize_t pos) {
838 	// Convert the string to an int
839 	int32 value = atol(str);
840 	Token *token = new IntToken(value, pos);
841 	fTokenList.push_back(token);
842 }
843 
844 void
845 TokenStream::AddFloat(const char *str, ssize_t pos) {
846 	// Convert the string to a float
847 	double value = atof(str);
848 	Token *token = new FloatToken(value, pos);
849 	fTokenList.push_back(token);
850 }
851 
852 //------------------------------------------------------------------------------
853 // Helper functions
854 //------------------------------------------------------------------------------
855 
//! Maps the character following a backslash to the character it stands for.
/*! Only the single-letter escapes \\a, \\b, \\f, \\n, \\r, \\t and \\v are
	translated; any other character is returned unchanged. Hex and octal
	escapes are handled elsewhere, so 'x' and the digits come back as
	themselves.
*/
char
escapeChar(char ch)
{
	// kLetters[i] is the escape letter, kValues[i] the character it yields
	static const char kLetters[] = "abfnrtv";
	static const char kValues[] = "\a\b\f\n\r\t\v";

	for (int i = 0; kLetters[i] != '\0'; i++) {
		if (kLetters[i] == ch)
			return kValues[i];
	}
	return ch;
}
882 
883 // Converts 0x|hi|low| to a single char
884 char
885 hexToChar(char hi, char low) {
886 	return (hexToChar(hi) << 4)	| hexToChar(low);
887 }
888 
889 // Converts 0x|ch| to a single char
890 char
891 hexToChar(char hex) {
892 	if ('0' <= hex && hex <= '9')
893 		return hex-'0';
894 	else if ('a' <= hex && hex <= 'f')
895 		return hex-'a'+10;
896 	else if ('A' <= hex && hex <= 'F')
897 		return hex-'A'+10;
898 	else
899 		throw new Err(std::string("Sniffer parser error: invalid hex digit '") + hex + "' passed to hexToChar()", -1);
900 }
901 
902 char
903 octalToChar(char octal) {
904 	return octalToChar('0', '0', octal);
905 }
906 
907 char
908 octalToChar(char hi, char low) {
909 	return octalToChar('0', hi, low);
910 }
911 
912 char
913 octalToChar(char hi, char mid, char low) {
914 	if (isOctalChar(hi) && isOctalChar(mid) && isOctalChar(low)) {
915 		// Check for octals >= decimal 256
916 		if ((hi-'0') <= 3)
917 			return ((hi-'0') << 6) | ((mid-'0') << 3) | (low-'0');
918 		else
919 			throw new Err("Sniffer pattern error: invalid octal literal (octals must be between octal 0 and octal 377 inclusive)", -1);
920 	} else
921 		throw new Err(std::string("Sniffer parser error: invalid octal digit passed to hexToChar()"), -1);
922 }
923 
// Returns true if ch is a hexadecimal digit (0-9, a-f or A-F).
bool
isHexChar(char ch)
{
	const bool isDigit = ch >= '0' && ch <= '9';
	const bool isLower = ch >= 'a' && ch <= 'f';
	const bool isUpper = ch >= 'A' && ch <= 'F';
	return isDigit || isLower || isUpper;
}
930 
// Returns true if ch is one of the whitespace characters the scanner
// recognizes: space, newline or tab.
bool
isWhiteSpace(char ch)
{
	switch (ch) {
		case ' ':
		case '\n':
		case '\t':
			return true;
		default:
			return false;
	}
}
935 
// Returns true if ch is an octal digit (0-7).
bool
isOctalChar(char ch)
{
	return ch >= '0' && ch <= '7';
}
940 
// Returns true if ch is a decimal digit (0-9).
bool
isDecimalChar(char ch)
{
	if (ch < '0' || ch > '9')
		return false;
	return true;
}
945 
// Returns true if ch is one of the single-character punctuation tokens of
// the rule syntax: & ( ) : [ ] |
bool
isPunctuation(char ch)
{
	return ch == '&'
		|| ch == '(' || ch == ')'
		|| ch == ':'
		|| ch == '[' || ch == ']'
		|| ch == '|';
}
961 
962 const char*
963 Sniffer::tokenTypeToString(TokenType type) {
964 	switch (type) {
965 		case LeftParen:
966 			return "LeftParen";
967 			break;
968 		case RightParen:
969 			return "RightParen";
970 			break;
971 		case LeftBracket:
972 			return "LeftBracket";
973 			break;
974 		case RightBracket:
975 			return "RightBracket";
976 			break;
977 		case Colon:
978 			return "Colon";
979 			break;
980 		case Divider:
981 			return "Divider";
982 			break;
983 		case Ampersand:
984 			return "Ampersand";
985 			break;
986 		case CharacterString:
987 			return "CharacterString";
988 			break;
989 		case Integer:
990 			return "Integer";
991 			break;
992 		case FloatingPoint:
993 			return "FloatingPoint";
994 			break;
995 		default:
996 			return "UNKNOWN TOKEN TYPE";
997 			break;
998 	}
999 }
1000 
1001 //------------------------------------------------------------------------------
1002 // Parser
1003 //------------------------------------------------------------------------------
1004 
1005 Parser::Parser()
1006 	: fOutOfMemErr(new(nothrow) Err("Sniffer parser error: out of memory", -1))
1007 {
1008 }
1009 
1010 Parser::~Parser() {
1011 	delete fOutOfMemErr;
1012 }
1013 
1014 status_t
1015 Parser::Parse(const char *rule, Rule *result, BString *parseError) {
1016 	try {
1017 		if (!rule)
1018 			throw new Err("Sniffer pattern error: NULL pattern", -1);
1019 		if (!result)
1020 			return B_BAD_VALUE;
1021 		if (stream.SetTo(rule) != B_OK)
1022 			throw new Err("Sniffer parser error: Unable to intialize token stream", -1);
1023 
1024 		ParseRule(result);
1025 
1026 		return B_OK;
1027 
1028 	} catch (Err *err) {
1029 //		cout << "Caught error" << endl;
1030 		if (parseError)
1031 			parseError->SetTo(ErrorMessage(err, rule).c_str());
1032 		delete err;
1033 		return B_BAD_MIME_SNIFFER_RULE;
1034 	}
1035 }
1036 
1037 std::string
1038 Parser::ErrorMessage(Err *err, const char *rule) {
1039 	const char* msg = (err && err->Msg())
1040     	                ? err->Msg()
1041     	                  : "Sniffer parser error: Unexpected error with no supplied error message";
1042     size_t pos = err && (err->Pos() >= 0) ? err->Pos() : 0;
1043     std::string str = std::string(rule ? rule : "") + "\n";
1044     for (int i = 0; i < pos; i++)
1045     	str += " ";
1046     str += "^    ";
1047     str += msg;
1048     return str;
1049 }
1050 
1051 void
1052 Parser::ParseRule(Rule *result) {
1053 	if (!result)
1054 		throw new Err("Sniffer parser error: NULL Rule object passed to Parser::ParseRule()", -1);
1055 
1056 	// Priority
1057 	double priority = ParsePriority();
1058 	// Expression List
1059 	std::vector<Expr*>* list = ParseExprList();
1060 
1061 	result->SetTo(priority, list);
1062 }
1063 
1064 double
1065 Parser::ParsePriority() {
1066 	const Token *t = stream.Get();
1067 	if (t->Type() == FloatingPoint || t->Type() == Integer) {
1068 		double result = t->Float();
1069 		if (0.0 <= result && result <= 1.0)
1070 			return result;
1071 		else {
1072 //			cout << "(priority == " << result << ")" << endl;
1073 			throw new Err("Sniffer pattern error: invalid priority", t->Pos());
1074 		}
1075 	} else
1076 		throw new Err("Sniffer pattern error: match level expected", t->Pos());	// Same as R5
1077 }
1078 
1079 std::vector<Expr*>*
1080 Parser::ParseExprList() {
1081 	std::vector<Expr*> *list = new(nothrow) std::vector<Expr*>;
1082 	if (!list)
1083 		ThrowOutOfMemError(stream.Pos());
1084 	try {
1085 		// Expr+
1086 		int count = 0;
1087 		while (true) {
1088 			Expr* expr = ParseExpr();
1089 			if (!expr)
1090 				break;
1091 			else {
1092 				list->push_back(expr);
1093 				count++;
1094 			}
1095 		}
1096 		if (count == 0)
1097 			throw new Err("Sniffer pattern error: missing expression", -1);
1098 	} catch (...) {
1099 		delete list;
1100 		throw;
1101 	}
1102 	return list;
1103 }
1104 
1105 Expr*
1106 Parser::ParseExpr() {
1107 	// If we've run out of tokens right now, it's okay, but
1108 	// we need to let ParseExprList() know what's up
1109 	if (stream.IsEmpty())
1110 		return NULL;
1111 
1112 	// Peek ahead, then let the appropriate Parse*List()
1113 	// functions handle things
1114 	const Token *t1 = stream.Get();
1115 
1116 	// PatternList | RangeList
1117 	if (t1->Type() == LeftParen) {
1118 		const Token *t2 = stream.Get();
1119 		stream.Unget();
1120 		stream.Unget();
1121 		// RangeList
1122 		if (t2->Type() == LeftBracket) {
1123 			return ParseRPatternList();
1124 		// PatternList
1125 		} else {
1126 			return ParsePatternList(Range(0,0));
1127 		}
1128 	// Range, PatternList
1129 	} else if (t1->Type() == LeftBracket) {
1130 		stream.Unget();
1131 		return ParsePatternList(ParseRange());
1132 	} else {
1133 		throw new Err("Sniffer pattern error: missing pattern", t1->Pos());	// Same as R5
1134 	}
1135 
1136 	// PatternList
1137 	// RangeList
1138 	// Range + PatternList
1139 }
1140 
1141 Range
1142 Parser::ParseRange() {
1143 	int32 start, end;
1144 	// LeftBracket
1145 	stream.Read(LeftBracket);
1146 	// Integer
1147 	{
1148 		const Token *t = stream.Get();
1149 		if (t->Type() == Integer) {
1150 			start = t->Int();
1151 			end = start;	// In case we aren't given an explicit end
1152 		} else
1153 			throw new Err("Sniffer pattern error: pattern offset expected", t->Pos());
1154 	}
1155 	// [Colon, Integer] RightBracket
1156 	{
1157 		const Token *t = stream.Get();
1158 		// Colon, Integer, RightBracket
1159 		if (t->Type() == Colon) {
1160 			// Integer
1161 			{
1162 				const Token *t = stream.Get();
1163 				if (t->Type() == Integer) {
1164 					end = t->Int();
1165 				} else
1166 					ThrowUnexpectedTokenError(Integer, t);
1167 			}
1168 			// RightBracket
1169 			stream.Read(RightBracket);
1170 		// !(Colon, Integer) RightBracket
1171 		} else if (t->Type() == RightBracket) {
1172 			// Nothing to do here...
1173 
1174 		// Something else...
1175 		} else
1176 			ThrowUnexpectedTokenError(Colon, Integer, t);
1177 	}
1178 	Range range(start, end);
1179 	if (range.InitCheck() == B_OK)
1180 		return range;
1181 	else
1182 		throw range.GetErr();
1183 }
1184 
1185 Expr*
1186 Parser::ParsePatternList(Range range) {
1187 	PatternList *list = new(nothrow) PatternList(range);
1188 	if (!list)
1189 		ThrowOutOfMemError(stream.Pos());
1190 	try {
1191 		// LeftParen
1192 		stream.Read(LeftParen);
1193 		// Pattern, (Divider, Pattern)*
1194 		bool keepLooping = true;
1195 		while (true) {
1196 			// Pattern
1197 			list->Add(ParsePattern());
1198 			// [Divider]
1199 			if (!stream.CondRead(Divider))
1200 				break;
1201 		}
1202 		// RightParen
1203 		const Token *t = stream.Get();
1204 		if (t->Type() != RightParen)
1205 			throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1206 	} catch (...) {
1207 		delete list;
1208 		throw;
1209 	}
1210 	return list;
1211 }
1212 
1213 Expr*
1214 Parser::ParseRPatternList() {
1215 	RPatternList *list = new(nothrow) RPatternList();
1216 	if (!list)
1217 		ThrowOutOfMemError(stream.Pos());
1218 	try {
1219 		// LeftParen
1220 		stream.Read(LeftParen);
1221 		// RPattern, (Divider, RPattern)*
1222 		bool keepLooping = true;
1223 		while (true) {
1224 			// RPattern
1225 			list->Add(ParseRPattern());
1226 			// [Divider]
1227 			if (!stream.CondRead(Divider))
1228 				break;
1229 		}
1230 		// RightParen
1231 		const Token *t = stream.Get();
1232 		if (t->Type() != RightParen)
1233 			throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1234 	} catch (...) {
1235 		delete list;
1236 		throw;
1237 	}
1238 	return list;
1239 }
1240 
1241 RPattern*
1242 Parser::ParseRPattern() {
1243 	// Range
1244 	Range range = ParseRange();
1245 	// Pattern
1246 	Pattern *pattern = ParsePattern();
1247 
1248 	RPattern *result = new(nothrow) RPattern(range, pattern);
1249 	if (result) {
1250 		if (result->InitCheck() == B_OK)
1251 			return result;
1252 		else {
1253 			Err *err = result->GetErr();
1254 			delete result;
1255 			throw err;
1256 		}
1257 	} else
1258 		ThrowOutOfMemError(stream.Pos());
1259 }
1260 
1261 Pattern*
1262 Parser::ParsePattern() {
1263 	std::string str;
1264 	// String
1265 	{
1266 		const Token *t = stream.Get();
1267 		if (t->Type() == CharacterString)
1268 			str = t->String();
1269 		else
1270 			throw new Err("Sniffer pattern error: missing pattern", t->Pos());
1271 	}
1272 	// [Ampersand, String]
1273 	if (stream.CondRead(Ampersand)) {
1274 		// String (i.e. Mask)
1275 		const Token *t = stream.Get();
1276 		if (t->Type() == CharacterString) {
1277 			Pattern *result = new(nothrow) Pattern(str.c_str(), t->String());
1278 			if (!result)
1279 				ThrowOutOfMemError(t->Pos());
1280 			if (result->InitCheck() == B_OK) {
1281 				return result;
1282 			} else {
1283 				Err *err = result->GetErr();
1284 				delete result;
1285 				if (err) {
1286 					err->SetPos(t->Pos());
1287 				}
1288 				throw err;
1289 			}
1290 		} else
1291 			ThrowUnexpectedTokenError(CharacterString, t);
1292 	} else {
1293 		// No mask specified.
1294 		Pattern *result = new(nothrow) Pattern(str.c_str());
1295 		if (result) {
1296 			if (result->InitCheck() == B_OK)
1297 				return result;
1298 			else {
1299 				Err *err = result->GetErr();
1300 				delete result;
1301 				throw err;
1302 			}
1303 		} else
1304 			ThrowOutOfMemError(stream.Pos());
1305 	}
1306 }
1307 
1308 void
1309 Parser::ThrowEndOfStreamError() {
1310 	throw new Err("Sniffer pattern error: unterminated rule", stream.EndPos());
1311 }
1312 
1313 inline
1314 void
1315 Parser::ThrowOutOfMemError(ssize_t pos) {
1316 	if (fOutOfMemErr)
1317 		fOutOfMemErr->SetPos(pos);
1318 	Err *err = fOutOfMemErr;
1319 	fOutOfMemErr = NULL;
1320 	throw err;
1321 }
1322 
1323 void
1324 Parser::ThrowUnexpectedTokenError(TokenType expected, const Token *found) {
1325 	throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected)
1326 	                + ", found " + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1327 	                , (found ? found->Pos() : stream.EndPos()));
1328 }
1329 
1330 void
1331 Parser::ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found) {
1332 	throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected1)
1333 	                + " or " + tokenTypeToString(expected2) + ", found "
1334 	                + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1335 	                , (found ? found->Pos() : stream.EndPos()));
1336 }
1337