xref: /haiku/src/kits/storage/sniffer/Parser.cpp (revision 268f99dd7dc4bd7474a8bd2742d3f1ec1de6752a)
1 //----------------------------------------------------------------------
2 //  This software is part of the Haiku distribution and is covered
3 //  by the MIT License.
4 //----------------------------------------------------------------------
5 /*!
6 	\file sniffer/Parser.cpp
7 	MIME sniffer rule parser implementation
8 */
9 
10 #include <sniffer/Parser.h>
11 #include <sniffer/Pattern.h>
12 #include <sniffer/PatternList.h>
13 #include <sniffer/Range.h>
14 #include <sniffer/RPattern.h>
15 #include <sniffer/RPatternList.h>
16 #include <sniffer/Rule.h>
17 
18 #include <new>
19 #include <stdio.h>
20 #include <stdlib.h>	// For atol(), atof()
21 #include <string.h>
22 #include <String.h>	// BString
23 
24 using namespace BPrivate::Storage::Sniffer;
25 
// Miscellaneous helper functions (defined near the end of this file)

// Character classification
bool isWhiteSpace(char ch);
bool isPunctuation(char ch);
bool isDecimalChar(char ch);
bool isOctalChar(char ch);
bool isHexChar(char ch);

// Escape sequence and numeric literal conversion
char escapeChar(char ch);
char hexToChar(char hex);
char hexToChar(char hi, char low);
char octalToChar(char octal);
char octalToChar(char hi, char low);
char octalToChar(char hi, char mid, char low);
38 
39 //! Parses the given rule.
40 /*! The resulting parsed Rule structure is stored in \c rule, which
41 	must be pre-allocated. If parsing fails, a descriptive error message (meant
42 	to be viewed in a monospaced font) is placed in the pre-allocated \c BString
43 	pointed to by \c parseError (which may be \c NULL if you don't care about
44 	the error message).
45 
46 	\param rule Pointer to a NULL-terminated string containing the sniffer
47 	            rule to be parsed
48 	\param result Pointer to a pre-allocated \c Rule object into which the result
49 	              of parsing is placed upon success.
50 	\param parseError Point to pre-allocated \c BString object into which
51 	                  a descriptive error message is stored upon failure.
52 
53 	\return
54 	- B_OK: Success
55 	- B_BAD_MIME_SNIFFER_RULE: Failure
56 */
57 status_t
parse(const char * rule,Rule * result,BString * parseError)58 BPrivate::Storage::Sniffer::parse(const char *rule, Rule *result, BString *parseError) {
59 	Parser parser;
60 	return parser.Parse(rule, result, parseError);
61 }
62 
63 //------------------------------------------------------------------------------
64 // Token
65 //------------------------------------------------------------------------------
66 
Token(TokenType type,const ssize_t pos)67 Token::Token(TokenType type, const ssize_t pos)
68 	: fType(type)
69 	, fPos(pos)
70 {
71 //	if (type != EmptyToken)
72 //		cout << "New Token, fType == " << tokenTypeToString(fType) << endl;
73 }
74 
~Token()75 Token::~Token() {
76 }
77 
78 TokenType
Type() const79 Token::Type() const {
80 	return fType;
81 }
82 
83 const std::string&
String() const84 Token::String() const {
85 	throw new Err("Sniffer scanner error: Token::String() called on non-string token", fPos);
86 }
87 
88 int32
Int() const89 Token::Int() const {
90 	throw new Err("Sniffer scanner error: Token::Int() called on non-integer token", fPos);
91 }
92 
93 double
Float() const94 Token::Float() const {
95 	throw new Err("Sniffer scanner error: Token::Float() called on non-float token", fPos);
96 }
97 
98 ssize_t
Pos() const99 Token::Pos() const {
100 	return fPos;
101 }
102 
103 bool
operator ==(Token & ref) const104 Token::operator==(Token &ref) const {
105 	// Compare types, then data if necessary
106 	if (Type() == ref.Type()) {
107 		switch (Type()) {
108 			case CharacterString:
109 //				printf(" str1 == '%s'\n", String());
110 //				printf(" str2 == '%s'\n", ref.String());
111 //				printf(" strcmp() == %d\n", strcmp(String(), ref.String()));
112 			{
113 				return String() == ref.String();
114 
115 /*
116 				// strcmp() seems to choke on certain, non-normal ASCII chars
117 				// (i.e. chars outside the usual alphabets, but still valid
118 				// as far as ASCII is concerned), so we'll just compare the
119 				// strings by hand to be safe.
120 				const char *str1 = String();
121 				const char *str2 = ref.String();
122 				int len1 = strlen(str1);
123 				int len2 = strlen(str2);
124 //				printf("len1 == %d\n", len1);
125 //				printf("len2 == %d\n", len2);
126 				if (len1 == len2) {
127 					for (int i = 0; i < len1; i++) {
128 //						printf("i == %d, str1[%d] == %x, str2[%d] == %x\n", i, i, str1[i], i, str2[i]);
129 						if (str1[i] != str2[i])
130 							return false;
131 					}
132 				}
133 				return true;
134 */
135 			}
136 //				return strcmp(String(), ref.String()) == 0;
137 
138 			case Integer:
139 				return Int() == ref.Int();
140 
141 			case FloatingPoint:
142 				return Float() == ref.Float();
143 
144 			default:
145 				return true;
146 		}
147 	} else
148 		return false;
149 }
150 
151 //------------------------------------------------------------------------------
152 // StringToken
153 //------------------------------------------------------------------------------
154 
StringToken(const std::string & str,const ssize_t pos)155 StringToken::StringToken(const std::string &str, const ssize_t pos)
156 	: Token(CharacterString, pos)
157 	, fString(str)
158 {
159 }
160 
~StringToken()161 StringToken::~StringToken() {
162 }
163 
164 const std::string&
String() const165 StringToken::String() const {
166 	return fString;
167 }
168 
169 //------------------------------------------------------------------------------
170 // IntToken
171 //------------------------------------------------------------------------------
172 
IntToken(const int32 value,const ssize_t pos)173 IntToken::IntToken(const int32 value, const ssize_t pos)
174 	: Token(Integer, pos)
175 	, fValue(value)
176 {
177 }
178 
~IntToken()179 IntToken::~IntToken() {
180 }
181 
182 int32
Int() const183 IntToken::Int() const {
184 	return fValue;
185 }
186 
187 double
Float() const188 IntToken::Float() const {
189 	return (double)fValue;
190 }
191 
192 //------------------------------------------------------------------------------
193 // FloatToken
194 //------------------------------------------------------------------------------
195 
FloatToken(const double value,const ssize_t pos)196 FloatToken::FloatToken(const double value, const ssize_t pos)
197 	: Token(FloatingPoint, pos)
198 	, fValue(value)
199 {
200 }
201 
~FloatToken()202 FloatToken::~FloatToken() {
203 }
204 
205 
206 double
Float() const207 FloatToken::Float() const {
208 	return fValue;
209 }
210 
211 //------------------------------------------------------------------------------
212 // TokenStream
213 //------------------------------------------------------------------------------
214 
TokenStream(const std::string & string)215 TokenStream::TokenStream(const std::string &string)
216 	: fCStatus(B_NO_INIT)
217 	, fPos(-1)
218 	, fStrLen(-1)
219 {
220 	SetTo(string);
221 }
222 
TokenStream()223 TokenStream::TokenStream()
224 	: fCStatus(B_NO_INIT)
225 	, fPos(-1)
226 	, fStrLen(-1)
227 {
228 }
229 
~TokenStream()230 TokenStream::~TokenStream() {
231 	Unset();
232 }
233 
234 status_t
SetTo(const std::string & string)235 TokenStream::SetTo(const std::string &string) {
236 	Unset();
237 	fStrLen = string.length();
238 	CharStream stream(string);
239 	if (stream.InitCheck() != B_OK)
240 		throw new Err("Sniffer scanner error: Unable to intialize character stream", -1);
241 
242 	typedef enum TokenStreamScannerState {
243 		tsssStart,
244 		tsssOneSingle,
245 		tsssOneDouble,
246 		tsssOneZero,
247 		tsssZeroX,
248 		tsssOneHex,
249 		tsssTwoHex,
250 		tsssIntOrFloat,
251 		tsssFloat,
252 		tsssLonelyDecimalPoint,
253 		tsssLonelyMinusOrPlus,
254 		tsssLonelyFloatExtension,
255 		tsssLonelyFloatExtensionWithSign,
256 		tsssExtendedFloat,
257 		tsssUnquoted,
258 		tsssEscape,
259 		tsssEscapeX,
260 		tsssEscapeOneOctal,
261 		tsssEscapeTwoOctal,
262 		tsssEscapeOneHex,
263 	} TokenStreamScannerState;
264 
265 	TokenStreamScannerState state = tsssStart;
266 	TokenStreamScannerState escapedState = tsssStart;
267 		// Used to remember which state to return to from an escape sequence
268 
269 	std::string charStr = "";	// Used to build up character strings
270 	char lastChar = 0;			// For two char lookahead
271 	char lastLastChar = 0;		// For three char lookahead (have I mentioned I hate octal?)
272 	bool keepLooping = true;
273 	ssize_t startPos = 0;
274 	while (keepLooping) {
275 		ssize_t pos = stream.Pos();
276 		char ch = stream.Get();
277 		switch (state) {
278 			case tsssStart:
279 				startPos = pos;
280 				switch (ch) {
281 					case 0x3:	// End-Of-Text
282 						if (stream.IsEmpty())
283 							keepLooping = false;
284 						else
285 							throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
286 						break;
287 
288 					case '\t':
289 					case '\n':
290 					case ' ':
291 						// Whitespace, so ignore it.
292 						break;
293 
294 					case '"':
295 						charStr = "";
296 						state = tsssOneDouble;
297 						break;
298 
299 					case '\'':
300 						charStr = "";
301 						state = tsssOneSingle;
302 						break;
303 
304 					case '+':
305 					case '-':
306 						charStr = ch;
307 						lastChar = ch;
308 						state = tsssLonelyMinusOrPlus;
309 						break;
310 
311 					case '.':
312 						charStr = ch;
313 						state = tsssLonelyDecimalPoint;
314 						break;
315 
316 					case '0':
317 						charStr = ch;
318 						state = tsssOneZero;
319 						break;
320 
321 					case '1':
322 					case '2':
323 					case '3':
324 					case '4':
325 					case '5':
326 					case '6':
327 					case '7':
328 					case '8':
329 					case '9':
330 						charStr = ch;
331 						state = tsssIntOrFloat;
332 						break;
333 
334 					case '&':	AddToken(Ampersand, pos);		break;
335 					case '(':	AddToken(LeftParen, pos);		break;
336 					case ')':	AddToken(RightParen, pos);		break;
337 					case ':':	AddToken(Colon, pos);			break;
338 					case '[':	AddToken(LeftBracket, pos);		break;
339 
340 					case '\\':
341 						charStr = "";					// Clear our string
342 						state = tsssEscape;
343 						escapedState = tsssUnquoted;	// Unquoted strings begin with an escaped character
344 						break;
345 
346 					case ']':	AddToken(RightBracket, pos);		break;
347 					case '|':	AddToken(Divider, pos);			break;
348 
349 					default:
350 						throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
351 				}
352 				break;
353 
354 			case tsssOneSingle:
355 				switch (ch) {
356 					case '\\':
357 						escapedState = state;		// Save our state
358 						state = tsssEscape;			// Handle the escape sequence
359 						break;
360 					case '\'':
361 						AddString(charStr, startPos);
362 						state = tsssStart;
363 						break;
364 					case 0x3:
365 						if (stream.IsEmpty())
366 							throw new Err(std::string("Sniffer pattern error: unterminated single-quoted string"), pos);
367 						else
368 							charStr += ch;
369 						break;
370 					default:
371 						charStr += ch;
372 						break;
373 				}
374 				break;
375 
376 			case tsssOneDouble:
377 				switch (ch) {
378 					case '\\':
379 						escapedState = state;		// Save our state
380 						state = tsssEscape;			// Handle the escape sequence
381 						break;
382 					case '"':
383 						AddString(charStr, startPos);
384 						state = tsssStart;
385 						break;
386 					case 0x3:
387 						if (stream.IsEmpty())
388 							throw new Err(std::string("Sniffer pattern error: unterminated double-quoted string"), pos);
389 						else
390 							charStr += ch;
391 						break;
392 					default:
393 						charStr += ch;
394 						break;
395 				}
396 				break;
397 
398 			case tsssOneZero:
399 				if (ch == 'x') {
400 					charStr = "";	// Reinit, since we actually have a hex string
401 					state = tsssZeroX;
402 				} else if ('0' <= ch && ch <= '9') {
403 					charStr += ch;
404 					state = tsssIntOrFloat;
405 				} else if (ch == '.') {
406 					charStr += ch;
407 					state = tsssFloat;
408 				} else if (ch == 'e' || ch == 'E') {
409 					charStr += ch;
410 					state = tsssLonelyFloatExtension;
411 				} else {
412 					// Terminate the number
413 					AddInt(charStr.c_str(), startPos);
414 
415 					// Push the last char back on and try again
416 					stream.Unget();
417 					state = tsssStart;
418 				}
419 				break;
420 
421 			case tsssZeroX:
422 				if (isHexChar(ch)) {
423 					lastChar = ch;
424 					state = tsssOneHex;
425 				} else
426 					throw new Err(std::string("Sniffer pattern error: incomplete hex code"), pos);
427 				break;
428 
429 			case tsssOneHex:
430 				if (isHexChar(ch)) {
431 					try {
432 						charStr += hexToChar(lastChar, ch);
433 					} catch (Err *err) {
434 						if (err)
435 							err->SetPos(pos);
436 						throw err;
437 					}
438 					state = tsssTwoHex;
439 				} else
440 					throw new Err(std::string("Sniffer pattern error: bad hex literal"), pos);	// Same as R5
441 				break;
442 
443 			case tsssTwoHex:
444 				if (isHexChar(ch)) {
445 					lastChar = ch;
446 					state = tsssOneHex;
447 				} else {
448 					AddString(charStr, startPos);
449 					stream.Unget();		// So punctuation gets handled properly
450 					state = tsssStart;
451 				}
452 				break;
453 
454 			case tsssIntOrFloat:
455 				if (isDecimalChar(ch))
456 					charStr += ch;
457 				else if (ch == '.') {
458 					charStr += ch;
459 					state = tsssFloat;
460 				} else if (ch == 'e' || ch == 'E') {
461 					charStr += ch;
462 					state = tsssLonelyFloatExtension;
463 				} else {
464 					// Terminate the number
465 					AddInt(charStr.c_str(), startPos);
466 
467 					// Push the last char back on and try again
468 					stream.Unget();
469 					state = tsssStart;
470 				}
471 				break;
472 
473 			case tsssFloat:
474 				if (isDecimalChar(ch))
475 					charStr += ch;
476 				else if (ch == 'e' || ch == 'E') {
477 					charStr += ch;
478 					state = tsssLonelyFloatExtension;
479 				} else {
480 					// Terminate the number
481 					AddFloat(charStr.c_str(), startPos);
482 
483 					// Push the last char back on and try again
484 					stream.Unget();
485 					state = tsssStart;
486 				}
487 				break;
488 
489 			case tsssLonelyDecimalPoint:
490 				if (isDecimalChar(ch)) {
491 					charStr += ch;
492 					state = tsssFloat;
493 				} else
494 					throw new Err(std::string("Sniffer pattern error: incomplete floating point number"), pos);
495 				break;
496 
497 			case tsssLonelyMinusOrPlus:
498 				if (isDecimalChar(ch)) {
499 					charStr += ch;
500 					state = tsssIntOrFloat;
501 				} else if (ch == '.') {
502 					charStr += ch;
503 					state = tsssLonelyDecimalPoint;
504 				} else if (ch == 'i' && lastChar == '-') {
505 					AddToken(CaseInsensitiveFlag, startPos);
506 					state = tsssStart;
507 				} else
508 					throw new Err(std::string("Sniffer pattern error: incomplete signed number or invalid flag"), pos);
509 				break;
510 
511 			case tsssLonelyFloatExtension:
512 				if (ch == '+' || ch == '-') {
513 					charStr += ch;
514 					state = tsssLonelyFloatExtensionWithSign;
515 				} else if (isDecimalChar(ch)) {
516 					charStr += ch;
517 					state = tsssExtendedFloat;
518 				} else
519 					throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
520 				break;
521 
522 			case tsssLonelyFloatExtensionWithSign:
523 				if (isDecimalChar(ch)) {
524 					charStr += ch;
525 					state = tsssExtendedFloat;
526 				} else
527 					throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
528 				break;
529 
530 			case tsssExtendedFloat:
531 				if (isDecimalChar(ch)) {
532 					charStr += ch;
533 					state = tsssExtendedFloat;
534 				} else {
535 					// Terminate the number
536 					AddFloat(charStr.c_str(), startPos);
537 
538 					// Push the last char back on and try again
539 					stream.Unget();
540 					state = tsssStart;
541 				}
542 				break;
543 
544 			case tsssUnquoted:
545 				if (ch == '\\') {
546 					escapedState = state;		// Save our state
547 					state = tsssEscape;			// Handle the escape sequence
548 				} else if (isWhiteSpace(ch) || isPunctuation(ch)) {
549 					AddString(charStr, startPos);
550 					stream.Unget();				// In case it's punctuation, let tsssStart handle it
551 					state = tsssStart;
552 				} else if (ch == 0x3 && stream.IsEmpty()) {
553 					AddString(charStr, startPos);
554 					keepLooping = false;
555 				} else {
556 					charStr += ch;
557 				}
558 				break;
559 
560 			case tsssEscape:
561 				if (isOctalChar(ch)) {
562 					lastChar = ch;
563 					state = tsssEscapeOneOctal;
564 				} else if (ch == 'x') {
565 					state = tsssEscapeX;
566 				} else {
567 					// Check for a true end-of-text marker
568 					if (ch == 0x3 && stream.IsEmpty())
569 						throw new Err(std::string("Sniffer pattern error: incomplete escape sequence"), pos);
570 					else {
571 						charStr += escapeChar(ch);
572 						state = escapedState;	// Return to the state we were in before the escape
573 					}
574 				}
575 				break;
576 
577 			case tsssEscapeX:
578 				if (isHexChar(ch)) {
579 					lastChar = ch;
580 					state = tsssEscapeOneHex;
581 				} else
582 					throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
583 				break;
584 
585 			case tsssEscapeOneOctal:
586 				if (isOctalChar(ch)) {
587 					lastLastChar = lastChar;
588 					lastChar = ch;
589 					state = tsssEscapeTwoOctal;
590 				} else {
591 					// First handle the octal
592 					try {
593 						charStr += octalToChar(lastChar);
594 					} catch (Err *err) {
595 						if (err)
596 							err->SetPos(startPos);
597 						throw err;
598 					}
599 
600 					// Push the new char back on and let the state we
601 					// were in when the escape sequence was hit handle it.
602 					stream.Unget();
603 					state = escapedState;
604 				}
605 				break;
606 
607 			case tsssEscapeTwoOctal:
608 				if (isOctalChar(ch)) {
609 					try {
610 						charStr += octalToChar(lastLastChar, lastChar, ch);
611 					} catch (Err *err) {
612 						if (err)
613 							err->SetPos(startPos);
614 						throw err;
615 					}
616 					state = escapedState;
617 				} else {
618 					// First handle the octal
619 					try {
620 						charStr += octalToChar(lastLastChar, lastChar);
621 					} catch (Err *err) {
622 						if (err)
623 							err->SetPos(startPos);
624 						throw err;
625 					}
626 
627 					// Push the new char back on and let the state we
628 					// were in when the escape sequence was hit handle it.
629 					stream.Unget();
630 					state = escapedState;
631 				}
632 				break;
633 
634 			case tsssEscapeOneHex:
635 				if (isHexChar(ch)) {
636 					try {
637 						charStr += hexToChar(lastChar, ch);
638 					} catch (Err *err) {
639 						if (err)
640 							err->SetPos(pos);
641 						throw err;
642 					}
643 					state = escapedState;
644 				} else
645 					throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
646 				break;
647 
648 		}
649 	}
650 	if (state == tsssStart)	{
651 		fCStatus = B_OK;
652 		fPos = 0;
653 	} else {
654 		throw new Err("Sniffer pattern error: unterminated rule", stream.Pos());
655 	}
656 
657 	return fCStatus;
658 }
659 
660 void
Unset()661 TokenStream::Unset() {
662 	std::vector<Token*>::iterator i;
663 	for (i = fTokenList.begin(); i != fTokenList.end(); i++)
664 		delete *i;
665 	fTokenList.clear();
666 	fCStatus = B_NO_INIT;
667 	fStrLen = -1;
668 }
669 
670 status_t
InitCheck() const671 TokenStream::InitCheck() const {
672 	return fCStatus;
673 }
674 
675 //! Returns a pointer to the next token in the stream.
676 /*! The TokenStream object retains owner ship of the Token object returned by Get().
677     If Get() is called at the end of the stream, a pointer to a Err object is thrown.
678 */
679 const Token*
Get()680 TokenStream::Get() {
681 	if (fCStatus != B_OK)
682 		throw new Err("Sniffer parser error: TokenStream::Get() called on uninitialized TokenStream object", -1);
683 	if (fPos < (ssize_t)fTokenList.size())
684 		return fTokenList[fPos++];
685 	else {
686 		throw new Err("Sniffer pattern error: unterminated rule", EndPos());
687 //		fPos++;			// Increment fPos to keep Unget()s consistent
688 //		return NULL;	// Return NULL to signal end of list
689 	}
690 }
691 
692 //! Places token returned by the most recent call to Get() back on the head of the stream.
693 /*! If Unget() is called at the beginning of the stream, a pointer to a Err object is thrown.
694 */
695 void
Unget()696 TokenStream::Unget() {
697 	if (fCStatus != B_OK)
698 		throw new Err("Sniffer parser error: TokenStream::Unget() called on uninitialized TokenStream object", -1);
699 	if (fPos > 0)
700 		fPos--;
701 	else
702 		throw new Err("Sniffer parser error: TokenStream::Unget() called at beginning of token stream", -1);
703 }
704 
705 
706 /*! \brief Reads the next token in the stream and verifies it is of the given type,
707 	throwing a pointer to a Err object if it is not.
708 */
709 void
Read(TokenType type)710 TokenStream::Read(TokenType type) {
711 	const Token *t = Get();
712 	if (t->Type() != type) {
713 		throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(type)
714 	                + ", found " + tokenTypeToString(t->Type())).c_str(), t->Pos());
715 	}
716 }
717 
718 //! Conditionally reads the next token in the stream.
719 /*! CondRead() peeks at the next token in the stream. If it is of the given type, the
720 	token is removed from the stream and \c true is returned. If it is not of the
721 	given type, false is returned and the token remains at the head of the stream.
722 */
723 bool
CondRead(TokenType type)724 TokenStream::CondRead(TokenType type) {
725 	const Token *t = Get();
726 	if (t->Type() == type) {
727 		return true;
728 	} else {
729 		Unget();
730 		return false;
731 	}
732 }
733 
734 ssize_t
Pos() const735 TokenStream::Pos() const {
736 	return fPos < (ssize_t)fTokenList.size() ? fTokenList[fPos]->Pos() : fStrLen;
737 }
738 
739 ssize_t
EndPos() const740 TokenStream::EndPos() const {
741 	return fStrLen;
742 }
743 
744 bool
IsEmpty() const745 TokenStream::IsEmpty() const {
746 	return fCStatus != B_OK || fPos >= (ssize_t)fTokenList.size();
747 }
748 
749 void
AddToken(TokenType type,ssize_t pos)750 TokenStream::AddToken(TokenType type, ssize_t pos) {
751 	Token *token = new Token(type, pos);
752 	fTokenList.push_back(token);
753 }
754 
755 void
AddString(const std::string & str,ssize_t pos)756 TokenStream::AddString(const std::string &str, ssize_t pos) {
757 	Token *token = new StringToken(str, pos);
758 	fTokenList.push_back(token);
759 }
760 
761 void
AddInt(const char * str,ssize_t pos)762 TokenStream::AddInt(const char *str, ssize_t pos) {
763 	// Convert the string to an int
764 	int32 value = atol(str);
765 	Token *token = new IntToken(value, pos);
766 	fTokenList.push_back(token);
767 }
768 
769 void
AddFloat(const char * str,ssize_t pos)770 TokenStream::AddFloat(const char *str, ssize_t pos) {
771 	// Convert the string to a float
772 	double value = atof(str);
773 	Token *token = new FloatToken(value, pos);
774 	fTokenList.push_back(token);
775 }
776 
777 //------------------------------------------------------------------------------
778 // Helper functions
779 //------------------------------------------------------------------------------
780 
//! Maps the character following a backslash to the character it stands for.
/*! Only the letter escapes \\a, \\b, \\f, \\n, \\r, \\t and \\v are
	translated. Every other character is returned unchanged, which covers
	escaped quotes and backslashes. Hex and octal escapes are handled by the
	scanner itself, so 'x' and the digits come back as they are.
*/
char
escapeChar(char ch)
{
	if (ch == 'a')
		return '\a';
	if (ch == 'b')
		return '\b';
	if (ch == 'f')
		return '\f';
	if (ch == 'n')
		return '\n';
	if (ch == 'r')
		return '\r';
	if (ch == 't')
		return '\t';
	if (ch == 'v')
		return '\v';
	return ch;
}
807 
808 // Converts 0x|hi|low| to a single char
809 char
hexToChar(char hi,char low)810 hexToChar(char hi, char low) {
811 	return (hexToChar(hi) << 4)	| hexToChar(low);
812 }
813 
814 // Converts 0x|ch| to a single char
815 char
hexToChar(char hex)816 hexToChar(char hex) {
817 	if ('0' <= hex && hex <= '9')
818 		return hex-'0';
819 	else if ('a' <= hex && hex <= 'f')
820 		return hex-'a'+10;
821 	else if ('A' <= hex && hex <= 'F')
822 		return hex-'A'+10;
823 	else
824 		throw new Err(std::string("Sniffer parser error: invalid hex digit '") + hex + "' passed to hexToChar()", -1);
825 }
826 
827 char
octalToChar(char octal)828 octalToChar(char octal) {
829 	return octalToChar('0', '0', octal);
830 }
831 
832 char
octalToChar(char hi,char low)833 octalToChar(char hi, char low) {
834 	return octalToChar('0', hi, low);
835 }
836 
837 char
octalToChar(char hi,char mid,char low)838 octalToChar(char hi, char mid, char low) {
839 	if (isOctalChar(hi) && isOctalChar(mid) && isOctalChar(low)) {
840 		// Check for octals >= decimal 256
841 		if ((hi-'0') <= 3)
842 			return ((hi-'0') << 6) | ((mid-'0') << 3) | (low-'0');
843 		else
844 			throw new Err("Sniffer pattern error: invalid octal literal (octals must be between octal 0 and octal 377 inclusive)", -1);
845 	} else
846 		throw new Err(std::string("Sniffer parser error: invalid octal digit passed to hexToChar()"), -1);
847 }
848 
// Returns true for the hex digits 0-9, a-f and A-F.
bool
isHexChar(char ch)
{
	if (ch >= '0' && ch <= '9')
		return true;
	if (ch >= 'a' && ch <= 'f')
		return true;
	return ch >= 'A' && ch <= 'F';
}
855 
// Returns true for space, newline and tab; no other character counts.
bool
isWhiteSpace(char ch)
{
	switch (ch) {
		case ' ':
		case '\n':
		case '\t':
			return true;
		default:
			return false;
	}
}
860 
// Returns true for the octal digits 0-7.
bool
isOctalChar(char ch)
{
	return ch >= '0' && ch <= '7';
}
865 
// Returns true for the decimal digits 0-9.
bool
isDecimalChar(char ch)
{
	return ch >= '0' && ch <= '9';
}
870 
// Returns true for the single-character punctuation of the rule syntax:
// & ( ) : [ ] |
bool
isPunctuation(char ch)
{
	return ch == '&'
		|| ch == '(' || ch == ')'
		|| ch == ':'
		|| ch == '[' || ch == ']'
		|| ch == '|';
}
886 
887 const char*
tokenTypeToString(TokenType type)888 BPrivate::Storage::Sniffer::tokenTypeToString(TokenType type) {
889 	switch (type) {
890 		case LeftParen:
891 			return "LeftParen";
892 			break;
893 		case RightParen:
894 			return "RightParen";
895 			break;
896 		case LeftBracket:
897 			return "LeftBracket";
898 			break;
899 		case RightBracket:
900 			return "RightBracket";
901 			break;
902 		case Colon:
903 			return "Colon";
904 			break;
905 		case Divider:
906 			return "Divider";
907 			break;
908 		case Ampersand:
909 			return "Ampersand";
910 			break;
911 		case CaseInsensitiveFlag:
912 			return "CaseInsensitiveFlag";
913 			break;
914 		case CharacterString:
915 			return "CharacterString";
916 			break;
917 		case Integer:
918 			return "Integer";
919 			break;
920 		case FloatingPoint:
921 			return "FloatingPoint";
922 			break;
923 		default:
924 			return "UNKNOWN TOKEN TYPE";
925 			break;
926 	}
927 }
928 
929 //------------------------------------------------------------------------------
930 // Parser
931 //------------------------------------------------------------------------------
932 
Parser()933 Parser::Parser()
934 	: fOutOfMemErr(new(std::nothrow) Err("Sniffer parser error: out of memory", -1))
935 {
936 }
937 
~Parser()938 Parser::~Parser() {
939 	delete fOutOfMemErr;
940 }
941 
942 status_t
Parse(const char * rule,Rule * result,BString * parseError)943 Parser::Parse(const char *rule, Rule *result, BString *parseError) {
944 	try {
945 		if (!rule)
946 			throw new Err("Sniffer pattern error: NULL pattern", -1);
947 		if (!result)
948 			return B_BAD_VALUE;
949 		if (stream.SetTo(rule) != B_OK)
950 			throw new Err("Sniffer parser error: Unable to intialize token stream", -1);
951 
952 		ParseRule(result);
953 
954 		return B_OK;
955 
956 	} catch (Err *err) {
957 //		cout << "Caught error" << endl;
958 		if (parseError)
959 			parseError->SetTo(ErrorMessage(err, rule).c_str());
960 		delete err;
961 		return rule ? (status_t)B_BAD_MIME_SNIFFER_RULE : (status_t)B_BAD_VALUE;
962 	}
963 }
964 
965 std::string
ErrorMessage(Err * err,const char * rule)966 Parser::ErrorMessage(Err *err, const char *rule) {
967 	const char* msg = (err && err->Msg())
968     	                ? err->Msg()
969     	                  : "Sniffer parser error: Unexpected error with no supplied error message";
970     ssize_t pos = err && (err->Pos() >= 0) ? err->Pos() : 0;
971     std::string str = std::string(rule ? rule : "") + "\n";
972     for (int i = 0; i < pos; i++)
973     	str += " ";
974     str += "^    ";
975     str += msg;
976     return str;
977 }
978 
979 void
ParseRule(Rule * result)980 Parser::ParseRule(Rule *result) {
981 	if (!result)
982 		throw new Err("Sniffer parser error: NULL Rule object passed to Parser::ParseRule()", -1);
983 
984 	// Priority
985 	double priority = ParsePriority();
986 	// Conjunction List
987 	std::vector<DisjList*>* list = ParseConjList();
988 
989 	result->SetTo(priority, list);
990 }
991 
992 double
ParsePriority()993 Parser::ParsePriority() {
994 	const Token *t = stream.Get();
995 	if (t->Type() == FloatingPoint || t->Type() == Integer) {
996 		double result = t->Float();
997 		if (0.0 <= result && result <= 1.0)
998 			return result;
999 		else {
1000 //			cout << "(priority == " << result << ")" << endl;
1001 			throw new Err("Sniffer pattern error: invalid priority", t->Pos());
1002 		}
1003 	} else
1004 		throw new Err("Sniffer pattern error: match level expected", t->Pos());	// Same as R5
1005 }
1006 
1007 std::vector<DisjList*>*
ParseConjList()1008 Parser::ParseConjList() {
1009 	std::vector<DisjList*> *list = new(std::nothrow) std::vector<DisjList*>;
1010 	if (!list)
1011 		ThrowOutOfMemError(stream.Pos());
1012 	try {
1013 		// DisjList+
1014 		int count = 0;
1015 		while (true) {
1016 			DisjList* expr = ParseDisjList();
1017 			if (!expr)
1018 				break;
1019 			else {
1020 				list->push_back(expr);
1021 				count++;
1022 			}
1023 		}
1024 		if (count == 0)
1025 			throw new Err("Sniffer pattern error: missing expression", -1);
1026 	} catch (...) {
1027 		delete list;
1028 		throw;
1029 	}
1030 	return list;
1031 }
1032 
1033 DisjList*
ParseDisjList()1034 Parser::ParseDisjList() {
1035 	// If we've run out of tokens right now, it's okay, but
1036 	// we need to let ParseConjList() know what's up
1037 	if (stream.IsEmpty())
1038 		return NULL;
1039 
1040 	// Peek ahead, then let the appropriate Parse*List()
1041 	// functions handle things
1042 	const Token *t1 = stream.Get();
1043 
1044 	// PatternList | RangeList
1045 	if (t1->Type() == LeftParen) {
1046 		const Token *t2 = stream.Get();
1047 		// Skip the case-insensitive flag, if there is one
1048 		const Token *tokenOfInterest = (t2->Type() == CaseInsensitiveFlag) ? stream.Get() : t2;
1049 		if (t2 != tokenOfInterest)
1050 			stream.Unget();	// We called Get() three times
1051 		stream.Unget();
1052 		stream.Unget();
1053 		// RangeList
1054 		if (tokenOfInterest->Type() == LeftBracket) {
1055 			return ParseRPatternList();
1056 		// PatternList
1057 		} else {
1058 			return ParsePatternList(Range(0,0));
1059 		}
1060 	// Range, PatternList
1061 	} else if (t1->Type() == LeftBracket) {
1062 		stream.Unget();
1063 		return ParsePatternList(ParseRange());
1064 	} else {
1065 		throw new Err("Sniffer pattern error: missing pattern", t1->Pos());	// Same as R5
1066 	}
1067 
1068 	// PatternList
1069 	// RangeList
1070 	// Range + PatternList
1071 }
1072 
1073 Range
ParseRange()1074 Parser::ParseRange() {
1075 	int32 start, end;
1076 	// LeftBracket
1077 	stream.Read(LeftBracket);
1078 	// Integer
1079 	{
1080 		const Token *t = stream.Get();
1081 		if (t->Type() == Integer) {
1082 			start = t->Int();
1083 			end = start;	// In case we aren't given an explicit end
1084 		} else
1085 			throw new Err("Sniffer pattern error: pattern offset expected", t->Pos());
1086 	}
1087 	// [Colon, Integer] RightBracket
1088 	{
1089 		const Token *t = stream.Get();
1090 		// Colon, Integer, RightBracket
1091 		if (t->Type() == Colon) {
1092 			// Integer
1093 			{
1094 				const Token *t = stream.Get();
1095 				if (t->Type() == Integer) {
1096 					end = t->Int();
1097 				} else
1098 					ThrowUnexpectedTokenError(Integer, t);
1099 			}
1100 			// RightBracket
1101 			stream.Read(RightBracket);
1102 		// !(Colon, Integer) RightBracket
1103 		} else if (t->Type() == RightBracket) {
1104 			// Nothing to do here...
1105 
1106 		// Something else...
1107 		} else
1108 			ThrowUnexpectedTokenError(Colon, Integer, t);
1109 	}
1110 	Range range(start, end);
1111 	if (range.InitCheck() == B_OK)
1112 		return range;
1113 	else
1114 		throw range.GetErr();
1115 }
1116 
1117 DisjList*
ParsePatternList(Range range)1118 Parser::ParsePatternList(Range range) {
1119 	PatternList *list = new(std::nothrow) PatternList(range);
1120 	if (!list)
1121 		ThrowOutOfMemError(stream.Pos());
1122 	try {
1123 		// LeftParen
1124 		stream.Read(LeftParen);
1125 		// [Flag] Pattern, (Divider, [Flag] Pattern)*
1126 		while (true) {
1127 			// [Flag]
1128 			if (stream.CondRead(CaseInsensitiveFlag))
1129 				list->SetCaseInsensitive(true);
1130 			// Pattern
1131 			list->Add(ParsePattern());
1132 			// [Divider]
1133 			if (!stream.CondRead(Divider))
1134 				break;
1135 		}
1136 		// RightParen
1137 		const Token *t = stream.Get();
1138 		if (t->Type() != RightParen)
1139 			throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1140 	} catch (...) {
1141 		delete list;
1142 		throw;
1143 	}
1144 	return list;
1145 }
1146 
1147 DisjList*
ParseRPatternList()1148 Parser::ParseRPatternList() {
1149 	RPatternList *list = new(std::nothrow) RPatternList();
1150 	if (!list)
1151 		ThrowOutOfMemError(stream.Pos());
1152 	try {
1153 		// LeftParen
1154 		stream.Read(LeftParen);
1155 		// [Flag] RPattern, (Divider, [Flag] RPattern)*
1156 		while (true) {
1157 			// [Flag]
1158 			if (stream.CondRead(CaseInsensitiveFlag))
1159 				list->SetCaseInsensitive(true);
1160 			// RPattern
1161 			list->Add(ParseRPattern());
1162 			// [Divider]
1163 			if (!stream.CondRead(Divider))
1164 				break;
1165 		}
1166 		// RightParen
1167 		const Token *t = stream.Get();
1168 		if (t->Type() != RightParen)
1169 			throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1170 	} catch (...) {
1171 		delete list;
1172 		throw;
1173 	}
1174 	return list;
1175 }
1176 
1177 RPattern*
ParseRPattern()1178 Parser::ParseRPattern() {
1179 	// Range
1180 	Range range = ParseRange();
1181 	// Pattern
1182 	Pattern *pattern = ParsePattern();
1183 
1184 	RPattern *result = new(std::nothrow) RPattern(range, pattern);
1185 	if (result) {
1186 		if (result->InitCheck() == B_OK)
1187 			return result;
1188 		else {
1189 			Err *err = result->GetErr();
1190 			delete result;
1191 			throw err;
1192 		}
1193 	} else
1194 		ThrowOutOfMemError(stream.Pos());
1195 	return NULL;
1196 }
1197 
1198 Pattern*
ParsePattern()1199 Parser::ParsePattern() {
1200 	std::string str;
1201 	// String
1202 	{
1203 		const Token *t = stream.Get();
1204 		if (t->Type() == CharacterString)
1205 			str = t->String();
1206 		else
1207 			throw new Err("Sniffer pattern error: missing pattern", t->Pos());
1208 	}
1209 	// [Ampersand, String]
1210 	if (stream.CondRead(Ampersand)) {
1211 		// String (i.e. Mask)
1212 		const Token *t = stream.Get();
1213 		if (t->Type() == CharacterString) {
1214 			Pattern *result = new(std::nothrow) Pattern(str, t->String());
1215 			if (!result)
1216 				ThrowOutOfMemError(t->Pos());
1217 			if (result->InitCheck() == B_OK) {
1218 				return result;
1219 			} else {
1220 				Err *err = result->GetErr();
1221 				delete result;
1222 				if (err) {
1223 					err->SetPos(t->Pos());
1224 				}
1225 				throw err;
1226 			}
1227 		} else
1228 			ThrowUnexpectedTokenError(CharacterString, t);
1229 	} else {
1230 		// No mask specified.
1231 		Pattern *result = new(std::nothrow) Pattern(str);
1232 		if (result) {
1233 			if (result->InitCheck() == B_OK)
1234 				return result;
1235 			else {
1236 				Err *err = result->GetErr();
1237 				delete result;
1238 				throw err;
1239 			}
1240 		} else
1241 			ThrowOutOfMemError(stream.Pos());
1242 	}
1243 	return NULL;
1244 }
1245 
1246 void
ThrowEndOfStreamError()1247 Parser::ThrowEndOfStreamError() {
1248 	throw new Err("Sniffer pattern error: unterminated rule", stream.EndPos());
1249 }
1250 
1251 inline
1252 void
ThrowOutOfMemError(ssize_t pos)1253 Parser::ThrowOutOfMemError(ssize_t pos) {
1254 	if (fOutOfMemErr)
1255 		fOutOfMemErr->SetPos(pos);
1256 	Err *err = fOutOfMemErr;
1257 	fOutOfMemErr = NULL;
1258 	throw err;
1259 }
1260 
1261 void
ThrowUnexpectedTokenError(TokenType expected,const Token * found)1262 Parser::ThrowUnexpectedTokenError(TokenType expected, const Token *found) {
1263 	throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected)
1264 	                + ", found " + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1265 	                , (found ? found->Pos() : stream.EndPos()));
1266 }
1267 
1268 void
ThrowUnexpectedTokenError(TokenType expected1,TokenType expected2,const Token * found)1269 Parser::ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found) {
1270 	throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected1)
1271 	                + " or " + tokenTypeToString(expected2) + ", found "
1272 	                + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1273 	                , (found ? found->Pos() : stream.EndPos()));
1274 }
1275 
1276 
1277 
1278