1 //----------------------------------------------------------------------
2 // This software is part of the Haiku distribution and is covered
3 // by the MIT License.
4 //----------------------------------------------------------------------
5 /*!
6 \file sniffer/Parser.cpp
7 MIME sniffer rule parser implementation
8 */
9
10 #include <sniffer/Parser.h>
11 #include <sniffer/Pattern.h>
12 #include <sniffer/PatternList.h>
13 #include <sniffer/Range.h>
14 #include <sniffer/RPattern.h>
15 #include <sniffer/RPatternList.h>
16 #include <sniffer/Rule.h>
17
18 #include <new>
19 #include <stdio.h>
20 #include <stdlib.h> // For atol(), atof()
21 #include <string.h>
22 #include <String.h> // BString
23
24 using namespace BPrivate::Storage::Sniffer;
25
26 // Miscellaneous helper functions
27 char escapeChar(char ch);
28 char hexToChar(char hi, char low);
29 char hexToChar(char hex);
30 char octalToChar(char octal);
31 char octalToChar(char hi, char low);
32 char octalToChar(char hi, char mid, char low);
33 bool isHexChar(char ch);
34 bool isWhiteSpace(char ch);
35 bool isOctalChar(char ch);
36 bool isDecimalChar(char ch);
37 bool isPunctuation(char ch);
38
39 //! Parses the given rule.
40 /*! The resulting parsed Rule structure is stored in \c rule, which
41 must be pre-allocated. If parsing fails, a descriptive error message (meant
42 to be viewed in a monospaced font) is placed in the pre-allocated \c BString
43 pointed to by \c parseError (which may be \c NULL if you don't care about
44 the error message).
45
46 \param rule Pointer to a NULL-terminated string containing the sniffer
47 rule to be parsed
48 \param result Pointer to a pre-allocated \c Rule object into which the result
49 of parsing is placed upon success.
50 \param parseError Point to pre-allocated \c BString object into which
51 a descriptive error message is stored upon failure.
52
53 \return
54 - B_OK: Success
55 - B_BAD_MIME_SNIFFER_RULE: Failure
56 */
57 status_t
parse(const char * rule,Rule * result,BString * parseError)58 BPrivate::Storage::Sniffer::parse(const char *rule, Rule *result, BString *parseError) {
59 Parser parser;
60 return parser.Parse(rule, result, parseError);
61 }
62
63 //------------------------------------------------------------------------------
64 // Token
65 //------------------------------------------------------------------------------
66
Token(TokenType type,const ssize_t pos)67 Token::Token(TokenType type, const ssize_t pos)
68 : fType(type)
69 , fPos(pos)
70 {
71 // if (type != EmptyToken)
72 // cout << "New Token, fType == " << tokenTypeToString(fType) << endl;
73 }
74
~Token()75 Token::~Token() {
76 }
77
78 TokenType
Type() const79 Token::Type() const {
80 return fType;
81 }
82
83 const std::string&
String() const84 Token::String() const {
85 throw new Err("Sniffer scanner error: Token::String() called on non-string token", fPos);
86 }
87
88 int32
Int() const89 Token::Int() const {
90 throw new Err("Sniffer scanner error: Token::Int() called on non-integer token", fPos);
91 }
92
93 double
Float() const94 Token::Float() const {
95 throw new Err("Sniffer scanner error: Token::Float() called on non-float token", fPos);
96 }
97
98 ssize_t
Pos() const99 Token::Pos() const {
100 return fPos;
101 }
102
103 bool
operator ==(Token & ref) const104 Token::operator==(Token &ref) const {
105 // Compare types, then data if necessary
106 if (Type() == ref.Type()) {
107 switch (Type()) {
108 case CharacterString:
109 // printf(" str1 == '%s'\n", String());
110 // printf(" str2 == '%s'\n", ref.String());
111 // printf(" strcmp() == %d\n", strcmp(String(), ref.String()));
112 {
113 return String() == ref.String();
114
115 /*
116 // strcmp() seems to choke on certain, non-normal ASCII chars
117 // (i.e. chars outside the usual alphabets, but still valid
118 // as far as ASCII is concerned), so we'll just compare the
119 // strings by hand to be safe.
120 const char *str1 = String();
121 const char *str2 = ref.String();
122 int len1 = strlen(str1);
123 int len2 = strlen(str2);
124 // printf("len1 == %d\n", len1);
125 // printf("len2 == %d\n", len2);
126 if (len1 == len2) {
127 for (int i = 0; i < len1; i++) {
128 // printf("i == %d, str1[%d] == %x, str2[%d] == %x\n", i, i, str1[i], i, str2[i]);
129 if (str1[i] != str2[i])
130 return false;
131 }
132 }
133 return true;
134 */
135 }
136 // return strcmp(String(), ref.String()) == 0;
137
138 case Integer:
139 return Int() == ref.Int();
140
141 case FloatingPoint:
142 return Float() == ref.Float();
143
144 default:
145 return true;
146 }
147 } else
148 return false;
149 }
150
151 //------------------------------------------------------------------------------
152 // StringToken
153 //------------------------------------------------------------------------------
154
StringToken(const std::string & str,const ssize_t pos)155 StringToken::StringToken(const std::string &str, const ssize_t pos)
156 : Token(CharacterString, pos)
157 , fString(str)
158 {
159 }
160
~StringToken()161 StringToken::~StringToken() {
162 }
163
164 const std::string&
String() const165 StringToken::String() const {
166 return fString;
167 }
168
169 //------------------------------------------------------------------------------
170 // IntToken
171 //------------------------------------------------------------------------------
172
IntToken(const int32 value,const ssize_t pos)173 IntToken::IntToken(const int32 value, const ssize_t pos)
174 : Token(Integer, pos)
175 , fValue(value)
176 {
177 }
178
~IntToken()179 IntToken::~IntToken() {
180 }
181
182 int32
Int() const183 IntToken::Int() const {
184 return fValue;
185 }
186
187 double
Float() const188 IntToken::Float() const {
189 return (double)fValue;
190 }
191
192 //------------------------------------------------------------------------------
193 // FloatToken
194 //------------------------------------------------------------------------------
195
FloatToken(const double value,const ssize_t pos)196 FloatToken::FloatToken(const double value, const ssize_t pos)
197 : Token(FloatingPoint, pos)
198 , fValue(value)
199 {
200 }
201
~FloatToken()202 FloatToken::~FloatToken() {
203 }
204
205
206 double
Float() const207 FloatToken::Float() const {
208 return fValue;
209 }
210
211 //------------------------------------------------------------------------------
212 // TokenStream
213 //------------------------------------------------------------------------------
214
TokenStream(const std::string & string)215 TokenStream::TokenStream(const std::string &string)
216 : fCStatus(B_NO_INIT)
217 , fPos(-1)
218 , fStrLen(-1)
219 {
220 SetTo(string);
221 }
222
TokenStream()223 TokenStream::TokenStream()
224 : fCStatus(B_NO_INIT)
225 , fPos(-1)
226 , fStrLen(-1)
227 {
228 }
229
~TokenStream()230 TokenStream::~TokenStream() {
231 Unset();
232 }
233
234 status_t
SetTo(const std::string & string)235 TokenStream::SetTo(const std::string &string) {
236 Unset();
237 fStrLen = string.length();
238 CharStream stream(string);
239 if (stream.InitCheck() != B_OK)
240 throw new Err("Sniffer scanner error: Unable to intialize character stream", -1);
241
242 typedef enum TokenStreamScannerState {
243 tsssStart,
244 tsssOneSingle,
245 tsssOneDouble,
246 tsssOneZero,
247 tsssZeroX,
248 tsssOneHex,
249 tsssTwoHex,
250 tsssIntOrFloat,
251 tsssFloat,
252 tsssLonelyDecimalPoint,
253 tsssLonelyMinusOrPlus,
254 tsssLonelyFloatExtension,
255 tsssLonelyFloatExtensionWithSign,
256 tsssExtendedFloat,
257 tsssUnquoted,
258 tsssEscape,
259 tsssEscapeX,
260 tsssEscapeOneOctal,
261 tsssEscapeTwoOctal,
262 tsssEscapeOneHex,
263 } TokenStreamScannerState;
264
265 TokenStreamScannerState state = tsssStart;
266 TokenStreamScannerState escapedState = tsssStart;
267 // Used to remember which state to return to from an escape sequence
268
269 std::string charStr = ""; // Used to build up character strings
270 char lastChar = 0; // For two char lookahead
271 char lastLastChar = 0; // For three char lookahead (have I mentioned I hate octal?)
272 bool keepLooping = true;
273 ssize_t startPos = 0;
274 while (keepLooping) {
275 ssize_t pos = stream.Pos();
276 char ch = stream.Get();
277 switch (state) {
278 case tsssStart:
279 startPos = pos;
280 switch (ch) {
281 case 0x3: // End-Of-Text
282 if (stream.IsEmpty())
283 keepLooping = false;
284 else
285 throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
286 break;
287
288 case '\t':
289 case '\n':
290 case ' ':
291 // Whitespace, so ignore it.
292 break;
293
294 case '"':
295 charStr = "";
296 state = tsssOneDouble;
297 break;
298
299 case '\'':
300 charStr = "";
301 state = tsssOneSingle;
302 break;
303
304 case '+':
305 case '-':
306 charStr = ch;
307 lastChar = ch;
308 state = tsssLonelyMinusOrPlus;
309 break;
310
311 case '.':
312 charStr = ch;
313 state = tsssLonelyDecimalPoint;
314 break;
315
316 case '0':
317 charStr = ch;
318 state = tsssOneZero;
319 break;
320
321 case '1':
322 case '2':
323 case '3':
324 case '4':
325 case '5':
326 case '6':
327 case '7':
328 case '8':
329 case '9':
330 charStr = ch;
331 state = tsssIntOrFloat;
332 break;
333
334 case '&': AddToken(Ampersand, pos); break;
335 case '(': AddToken(LeftParen, pos); break;
336 case ')': AddToken(RightParen, pos); break;
337 case ':': AddToken(Colon, pos); break;
338 case '[': AddToken(LeftBracket, pos); break;
339
340 case '\\':
341 charStr = ""; // Clear our string
342 state = tsssEscape;
343 escapedState = tsssUnquoted; // Unquoted strings begin with an escaped character
344 break;
345
346 case ']': AddToken(RightBracket, pos); break;
347 case '|': AddToken(Divider, pos); break;
348
349 default:
350 throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos);
351 }
352 break;
353
354 case tsssOneSingle:
355 switch (ch) {
356 case '\\':
357 escapedState = state; // Save our state
358 state = tsssEscape; // Handle the escape sequence
359 break;
360 case '\'':
361 AddString(charStr, startPos);
362 state = tsssStart;
363 break;
364 case 0x3:
365 if (stream.IsEmpty())
366 throw new Err(std::string("Sniffer pattern error: unterminated single-quoted string"), pos);
367 else
368 charStr += ch;
369 break;
370 default:
371 charStr += ch;
372 break;
373 }
374 break;
375
376 case tsssOneDouble:
377 switch (ch) {
378 case '\\':
379 escapedState = state; // Save our state
380 state = tsssEscape; // Handle the escape sequence
381 break;
382 case '"':
383 AddString(charStr, startPos);
384 state = tsssStart;
385 break;
386 case 0x3:
387 if (stream.IsEmpty())
388 throw new Err(std::string("Sniffer pattern error: unterminated double-quoted string"), pos);
389 else
390 charStr += ch;
391 break;
392 default:
393 charStr += ch;
394 break;
395 }
396 break;
397
398 case tsssOneZero:
399 if (ch == 'x') {
400 charStr = ""; // Reinit, since we actually have a hex string
401 state = tsssZeroX;
402 } else if ('0' <= ch && ch <= '9') {
403 charStr += ch;
404 state = tsssIntOrFloat;
405 } else if (ch == '.') {
406 charStr += ch;
407 state = tsssFloat;
408 } else if (ch == 'e' || ch == 'E') {
409 charStr += ch;
410 state = tsssLonelyFloatExtension;
411 } else {
412 // Terminate the number
413 AddInt(charStr.c_str(), startPos);
414
415 // Push the last char back on and try again
416 stream.Unget();
417 state = tsssStart;
418 }
419 break;
420
421 case tsssZeroX:
422 if (isHexChar(ch)) {
423 lastChar = ch;
424 state = tsssOneHex;
425 } else
426 throw new Err(std::string("Sniffer pattern error: incomplete hex code"), pos);
427 break;
428
429 case tsssOneHex:
430 if (isHexChar(ch)) {
431 try {
432 charStr += hexToChar(lastChar, ch);
433 } catch (Err *err) {
434 if (err)
435 err->SetPos(pos);
436 throw err;
437 }
438 state = tsssTwoHex;
439 } else
440 throw new Err(std::string("Sniffer pattern error: bad hex literal"), pos); // Same as R5
441 break;
442
443 case tsssTwoHex:
444 if (isHexChar(ch)) {
445 lastChar = ch;
446 state = tsssOneHex;
447 } else {
448 AddString(charStr, startPos);
449 stream.Unget(); // So punctuation gets handled properly
450 state = tsssStart;
451 }
452 break;
453
454 case tsssIntOrFloat:
455 if (isDecimalChar(ch))
456 charStr += ch;
457 else if (ch == '.') {
458 charStr += ch;
459 state = tsssFloat;
460 } else if (ch == 'e' || ch == 'E') {
461 charStr += ch;
462 state = tsssLonelyFloatExtension;
463 } else {
464 // Terminate the number
465 AddInt(charStr.c_str(), startPos);
466
467 // Push the last char back on and try again
468 stream.Unget();
469 state = tsssStart;
470 }
471 break;
472
473 case tsssFloat:
474 if (isDecimalChar(ch))
475 charStr += ch;
476 else if (ch == 'e' || ch == 'E') {
477 charStr += ch;
478 state = tsssLonelyFloatExtension;
479 } else {
480 // Terminate the number
481 AddFloat(charStr.c_str(), startPos);
482
483 // Push the last char back on and try again
484 stream.Unget();
485 state = tsssStart;
486 }
487 break;
488
489 case tsssLonelyDecimalPoint:
490 if (isDecimalChar(ch)) {
491 charStr += ch;
492 state = tsssFloat;
493 } else
494 throw new Err(std::string("Sniffer pattern error: incomplete floating point number"), pos);
495 break;
496
497 case tsssLonelyMinusOrPlus:
498 if (isDecimalChar(ch)) {
499 charStr += ch;
500 state = tsssIntOrFloat;
501 } else if (ch == '.') {
502 charStr += ch;
503 state = tsssLonelyDecimalPoint;
504 } else if (ch == 'i' && lastChar == '-') {
505 AddToken(CaseInsensitiveFlag, startPos);
506 state = tsssStart;
507 } else
508 throw new Err(std::string("Sniffer pattern error: incomplete signed number or invalid flag"), pos);
509 break;
510
511 case tsssLonelyFloatExtension:
512 if (ch == '+' || ch == '-') {
513 charStr += ch;
514 state = tsssLonelyFloatExtensionWithSign;
515 } else if (isDecimalChar(ch)) {
516 charStr += ch;
517 state = tsssExtendedFloat;
518 } else
519 throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
520 break;
521
522 case tsssLonelyFloatExtensionWithSign:
523 if (isDecimalChar(ch)) {
524 charStr += ch;
525 state = tsssExtendedFloat;
526 } else
527 throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos);
528 break;
529
530 case tsssExtendedFloat:
531 if (isDecimalChar(ch)) {
532 charStr += ch;
533 state = tsssExtendedFloat;
534 } else {
535 // Terminate the number
536 AddFloat(charStr.c_str(), startPos);
537
538 // Push the last char back on and try again
539 stream.Unget();
540 state = tsssStart;
541 }
542 break;
543
544 case tsssUnquoted:
545 if (ch == '\\') {
546 escapedState = state; // Save our state
547 state = tsssEscape; // Handle the escape sequence
548 } else if (isWhiteSpace(ch) || isPunctuation(ch)) {
549 AddString(charStr, startPos);
550 stream.Unget(); // In case it's punctuation, let tsssStart handle it
551 state = tsssStart;
552 } else if (ch == 0x3 && stream.IsEmpty()) {
553 AddString(charStr, startPos);
554 keepLooping = false;
555 } else {
556 charStr += ch;
557 }
558 break;
559
560 case tsssEscape:
561 if (isOctalChar(ch)) {
562 lastChar = ch;
563 state = tsssEscapeOneOctal;
564 } else if (ch == 'x') {
565 state = tsssEscapeX;
566 } else {
567 // Check for a true end-of-text marker
568 if (ch == 0x3 && stream.IsEmpty())
569 throw new Err(std::string("Sniffer pattern error: incomplete escape sequence"), pos);
570 else {
571 charStr += escapeChar(ch);
572 state = escapedState; // Return to the state we were in before the escape
573 }
574 }
575 break;
576
577 case tsssEscapeX:
578 if (isHexChar(ch)) {
579 lastChar = ch;
580 state = tsssEscapeOneHex;
581 } else
582 throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
583 break;
584
585 case tsssEscapeOneOctal:
586 if (isOctalChar(ch)) {
587 lastLastChar = lastChar;
588 lastChar = ch;
589 state = tsssEscapeTwoOctal;
590 } else {
591 // First handle the octal
592 try {
593 charStr += octalToChar(lastChar);
594 } catch (Err *err) {
595 if (err)
596 err->SetPos(startPos);
597 throw err;
598 }
599
600 // Push the new char back on and let the state we
601 // were in when the escape sequence was hit handle it.
602 stream.Unget();
603 state = escapedState;
604 }
605 break;
606
607 case tsssEscapeTwoOctal:
608 if (isOctalChar(ch)) {
609 try {
610 charStr += octalToChar(lastLastChar, lastChar, ch);
611 } catch (Err *err) {
612 if (err)
613 err->SetPos(startPos);
614 throw err;
615 }
616 state = escapedState;
617 } else {
618 // First handle the octal
619 try {
620 charStr += octalToChar(lastLastChar, lastChar);
621 } catch (Err *err) {
622 if (err)
623 err->SetPos(startPos);
624 throw err;
625 }
626
627 // Push the new char back on and let the state we
628 // were in when the escape sequence was hit handle it.
629 stream.Unget();
630 state = escapedState;
631 }
632 break;
633
634 case tsssEscapeOneHex:
635 if (isHexChar(ch)) {
636 try {
637 charStr += hexToChar(lastChar, ch);
638 } catch (Err *err) {
639 if (err)
640 err->SetPos(pos);
641 throw err;
642 }
643 state = escapedState;
644 } else
645 throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos);
646 break;
647
648 }
649 }
650 if (state == tsssStart) {
651 fCStatus = B_OK;
652 fPos = 0;
653 } else {
654 throw new Err("Sniffer pattern error: unterminated rule", stream.Pos());
655 }
656
657 return fCStatus;
658 }
659
660 void
Unset()661 TokenStream::Unset() {
662 std::vector<Token*>::iterator i;
663 for (i = fTokenList.begin(); i != fTokenList.end(); i++)
664 delete *i;
665 fTokenList.clear();
666 fCStatus = B_NO_INIT;
667 fStrLen = -1;
668 }
669
670 status_t
InitCheck() const671 TokenStream::InitCheck() const {
672 return fCStatus;
673 }
674
675 //! Returns a pointer to the next token in the stream.
676 /*! The TokenStream object retains owner ship of the Token object returned by Get().
677 If Get() is called at the end of the stream, a pointer to a Err object is thrown.
678 */
679 const Token*
Get()680 TokenStream::Get() {
681 if (fCStatus != B_OK)
682 throw new Err("Sniffer parser error: TokenStream::Get() called on uninitialized TokenStream object", -1);
683 if (fPos < (ssize_t)fTokenList.size())
684 return fTokenList[fPos++];
685 else {
686 throw new Err("Sniffer pattern error: unterminated rule", EndPos());
687 // fPos++; // Increment fPos to keep Unget()s consistent
688 // return NULL; // Return NULL to signal end of list
689 }
690 }
691
692 //! Places token returned by the most recent call to Get() back on the head of the stream.
693 /*! If Unget() is called at the beginning of the stream, a pointer to a Err object is thrown.
694 */
695 void
Unget()696 TokenStream::Unget() {
697 if (fCStatus != B_OK)
698 throw new Err("Sniffer parser error: TokenStream::Unget() called on uninitialized TokenStream object", -1);
699 if (fPos > 0)
700 fPos--;
701 else
702 throw new Err("Sniffer parser error: TokenStream::Unget() called at beginning of token stream", -1);
703 }
704
705
706 /*! \brief Reads the next token in the stream and verifies it is of the given type,
707 throwing a pointer to a Err object if it is not.
708 */
709 void
Read(TokenType type)710 TokenStream::Read(TokenType type) {
711 const Token *t = Get();
712 if (t->Type() != type) {
713 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(type)
714 + ", found " + tokenTypeToString(t->Type())).c_str(), t->Pos());
715 }
716 }
717
718 //! Conditionally reads the next token in the stream.
719 /*! CondRead() peeks at the next token in the stream. If it is of the given type, the
720 token is removed from the stream and \c true is returned. If it is not of the
721 given type, false is returned and the token remains at the head of the stream.
722 */
723 bool
CondRead(TokenType type)724 TokenStream::CondRead(TokenType type) {
725 const Token *t = Get();
726 if (t->Type() == type) {
727 return true;
728 } else {
729 Unget();
730 return false;
731 }
732 }
733
734 ssize_t
Pos() const735 TokenStream::Pos() const {
736 return fPos < (ssize_t)fTokenList.size() ? fTokenList[fPos]->Pos() : fStrLen;
737 }
738
739 ssize_t
EndPos() const740 TokenStream::EndPos() const {
741 return fStrLen;
742 }
743
744 bool
IsEmpty() const745 TokenStream::IsEmpty() const {
746 return fCStatus != B_OK || fPos >= (ssize_t)fTokenList.size();
747 }
748
749 void
AddToken(TokenType type,ssize_t pos)750 TokenStream::AddToken(TokenType type, ssize_t pos) {
751 Token *token = new Token(type, pos);
752 fTokenList.push_back(token);
753 }
754
755 void
AddString(const std::string & str,ssize_t pos)756 TokenStream::AddString(const std::string &str, ssize_t pos) {
757 Token *token = new StringToken(str, pos);
758 fTokenList.push_back(token);
759 }
760
761 void
AddInt(const char * str,ssize_t pos)762 TokenStream::AddInt(const char *str, ssize_t pos) {
763 // Convert the string to an int
764 int32 value = atol(str);
765 Token *token = new IntToken(value, pos);
766 fTokenList.push_back(token);
767 }
768
769 void
AddFloat(const char * str,ssize_t pos)770 TokenStream::AddFloat(const char *str, ssize_t pos) {
771 // Convert the string to a float
772 double value = atof(str);
773 Token *token = new FloatToken(value, pos);
774 fTokenList.push_back(token);
775 }
776
777 //------------------------------------------------------------------------------
778 // Helper functions
779 //------------------------------------------------------------------------------
780
// Maps the character following a backslash to the character the escape
// sequence stands for. Only the letter escapes a, b, f, n, r, t and v are
// translated; every other character is returned as is. Hex and octal
// escapes are handled by the scanner itself, so 'x' and digits also come
// back unchanged.
char
escapeChar(char ch)
{
	static const char kLetters[] = "abfnrtv";
	static const char kControls[] = "\a\b\f\n\r\t\v";

	for (int i = 0; kLetters[i] != '\0'; i++) {
		if (kLetters[i] == ch)
			return kControls[i];
	}
	return ch;
}
807
808 // Converts 0x|hi|low| to a single char
809 char
hexToChar(char hi,char low)810 hexToChar(char hi, char low) {
811 return (hexToChar(hi) << 4) | hexToChar(low);
812 }
813
814 // Converts 0x|ch| to a single char
815 char
hexToChar(char hex)816 hexToChar(char hex) {
817 if ('0' <= hex && hex <= '9')
818 return hex-'0';
819 else if ('a' <= hex && hex <= 'f')
820 return hex-'a'+10;
821 else if ('A' <= hex && hex <= 'F')
822 return hex-'A'+10;
823 else
824 throw new Err(std::string("Sniffer parser error: invalid hex digit '") + hex + "' passed to hexToChar()", -1);
825 }
826
827 char
octalToChar(char octal)828 octalToChar(char octal) {
829 return octalToChar('0', '0', octal);
830 }
831
832 char
octalToChar(char hi,char low)833 octalToChar(char hi, char low) {
834 return octalToChar('0', hi, low);
835 }
836
837 char
octalToChar(char hi,char mid,char low)838 octalToChar(char hi, char mid, char low) {
839 if (isOctalChar(hi) && isOctalChar(mid) && isOctalChar(low)) {
840 // Check for octals >= decimal 256
841 if ((hi-'0') <= 3)
842 return ((hi-'0') << 6) | ((mid-'0') << 3) | (low-'0');
843 else
844 throw new Err("Sniffer pattern error: invalid octal literal (octals must be between octal 0 and octal 377 inclusive)", -1);
845 } else
846 throw new Err(std::string("Sniffer parser error: invalid octal digit passed to hexToChar()"), -1);
847 }
848
// Returns true for the hex digits 0-9, a-f and A-F.
bool
isHexChar(char ch)
{
	if (ch >= '0' && ch <= '9')
		return true;
	if (ch >= 'a' && ch <= 'f')
		return true;
	return ch >= 'A' && ch <= 'F';
}
855
// Returns true for the characters the scanner treats as whitespace:
// space, newline and tab.
bool
isWhiteSpace(char ch)
{
	switch (ch) {
		case ' ':
		case '\n':
		case '\t':
			return true;
		default:
			return false;
	}
}
860
// Returns true for the octal digits 0-7.
bool
isOctalChar(char ch)
{
	return ch >= '0' && ch <= '7';
}
865
// Returns true for the decimal digits 0-9.
bool
isDecimalChar(char ch)
{
	return ch >= '0' && ch <= '9';
}
870
// Returns true for the single character punctuation tokens of the sniffer
// rule syntax: & ( ) : [ ] |
bool
isPunctuation(char ch)
{
	// strchr() would also "find" the terminating NUL, so rule it out first
	if (ch == '\0')
		return false;
	return strchr("&():[]|", ch) != NULL;
}
886
887 const char*
tokenTypeToString(TokenType type)888 BPrivate::Storage::Sniffer::tokenTypeToString(TokenType type) {
889 switch (type) {
890 case LeftParen:
891 return "LeftParen";
892 break;
893 case RightParen:
894 return "RightParen";
895 break;
896 case LeftBracket:
897 return "LeftBracket";
898 break;
899 case RightBracket:
900 return "RightBracket";
901 break;
902 case Colon:
903 return "Colon";
904 break;
905 case Divider:
906 return "Divider";
907 break;
908 case Ampersand:
909 return "Ampersand";
910 break;
911 case CaseInsensitiveFlag:
912 return "CaseInsensitiveFlag";
913 break;
914 case CharacterString:
915 return "CharacterString";
916 break;
917 case Integer:
918 return "Integer";
919 break;
920 case FloatingPoint:
921 return "FloatingPoint";
922 break;
923 default:
924 return "UNKNOWN TOKEN TYPE";
925 break;
926 }
927 }
928
929 //------------------------------------------------------------------------------
930 // Parser
931 //------------------------------------------------------------------------------
932
Parser()933 Parser::Parser()
934 : fOutOfMemErr(new(std::nothrow) Err("Sniffer parser error: out of memory", -1))
935 {
936 }
937
~Parser()938 Parser::~Parser() {
939 delete fOutOfMemErr;
940 }
941
942 status_t
Parse(const char * rule,Rule * result,BString * parseError)943 Parser::Parse(const char *rule, Rule *result, BString *parseError) {
944 try {
945 if (!rule)
946 throw new Err("Sniffer pattern error: NULL pattern", -1);
947 if (!result)
948 return B_BAD_VALUE;
949 if (stream.SetTo(rule) != B_OK)
950 throw new Err("Sniffer parser error: Unable to intialize token stream", -1);
951
952 ParseRule(result);
953
954 return B_OK;
955
956 } catch (Err *err) {
957 // cout << "Caught error" << endl;
958 if (parseError)
959 parseError->SetTo(ErrorMessage(err, rule).c_str());
960 delete err;
961 return rule ? (status_t)B_BAD_MIME_SNIFFER_RULE : (status_t)B_BAD_VALUE;
962 }
963 }
964
965 std::string
ErrorMessage(Err * err,const char * rule)966 Parser::ErrorMessage(Err *err, const char *rule) {
967 const char* msg = (err && err->Msg())
968 ? err->Msg()
969 : "Sniffer parser error: Unexpected error with no supplied error message";
970 ssize_t pos = err && (err->Pos() >= 0) ? err->Pos() : 0;
971 std::string str = std::string(rule ? rule : "") + "\n";
972 for (int i = 0; i < pos; i++)
973 str += " ";
974 str += "^ ";
975 str += msg;
976 return str;
977 }
978
979 void
ParseRule(Rule * result)980 Parser::ParseRule(Rule *result) {
981 if (!result)
982 throw new Err("Sniffer parser error: NULL Rule object passed to Parser::ParseRule()", -1);
983
984 // Priority
985 double priority = ParsePriority();
986 // Conjunction List
987 std::vector<DisjList*>* list = ParseConjList();
988
989 result->SetTo(priority, list);
990 }
991
992 double
ParsePriority()993 Parser::ParsePriority() {
994 const Token *t = stream.Get();
995 if (t->Type() == FloatingPoint || t->Type() == Integer) {
996 double result = t->Float();
997 if (0.0 <= result && result <= 1.0)
998 return result;
999 else {
1000 // cout << "(priority == " << result << ")" << endl;
1001 throw new Err("Sniffer pattern error: invalid priority", t->Pos());
1002 }
1003 } else
1004 throw new Err("Sniffer pattern error: match level expected", t->Pos()); // Same as R5
1005 }
1006
1007 std::vector<DisjList*>*
ParseConjList()1008 Parser::ParseConjList() {
1009 std::vector<DisjList*> *list = new(std::nothrow) std::vector<DisjList*>;
1010 if (!list)
1011 ThrowOutOfMemError(stream.Pos());
1012 try {
1013 // DisjList+
1014 int count = 0;
1015 while (true) {
1016 DisjList* expr = ParseDisjList();
1017 if (!expr)
1018 break;
1019 else {
1020 list->push_back(expr);
1021 count++;
1022 }
1023 }
1024 if (count == 0)
1025 throw new Err("Sniffer pattern error: missing expression", -1);
1026 } catch (...) {
1027 delete list;
1028 throw;
1029 }
1030 return list;
1031 }
1032
1033 DisjList*
ParseDisjList()1034 Parser::ParseDisjList() {
1035 // If we've run out of tokens right now, it's okay, but
1036 // we need to let ParseConjList() know what's up
1037 if (stream.IsEmpty())
1038 return NULL;
1039
1040 // Peek ahead, then let the appropriate Parse*List()
1041 // functions handle things
1042 const Token *t1 = stream.Get();
1043
1044 // PatternList | RangeList
1045 if (t1->Type() == LeftParen) {
1046 const Token *t2 = stream.Get();
1047 // Skip the case-insensitive flag, if there is one
1048 const Token *tokenOfInterest = (t2->Type() == CaseInsensitiveFlag) ? stream.Get() : t2;
1049 if (t2 != tokenOfInterest)
1050 stream.Unget(); // We called Get() three times
1051 stream.Unget();
1052 stream.Unget();
1053 // RangeList
1054 if (tokenOfInterest->Type() == LeftBracket) {
1055 return ParseRPatternList();
1056 // PatternList
1057 } else {
1058 return ParsePatternList(Range(0,0));
1059 }
1060 // Range, PatternList
1061 } else if (t1->Type() == LeftBracket) {
1062 stream.Unget();
1063 return ParsePatternList(ParseRange());
1064 } else {
1065 throw new Err("Sniffer pattern error: missing pattern", t1->Pos()); // Same as R5
1066 }
1067
1068 // PatternList
1069 // RangeList
1070 // Range + PatternList
1071 }
1072
1073 Range
ParseRange()1074 Parser::ParseRange() {
1075 int32 start, end;
1076 // LeftBracket
1077 stream.Read(LeftBracket);
1078 // Integer
1079 {
1080 const Token *t = stream.Get();
1081 if (t->Type() == Integer) {
1082 start = t->Int();
1083 end = start; // In case we aren't given an explicit end
1084 } else
1085 throw new Err("Sniffer pattern error: pattern offset expected", t->Pos());
1086 }
1087 // [Colon, Integer] RightBracket
1088 {
1089 const Token *t = stream.Get();
1090 // Colon, Integer, RightBracket
1091 if (t->Type() == Colon) {
1092 // Integer
1093 {
1094 const Token *t = stream.Get();
1095 if (t->Type() == Integer) {
1096 end = t->Int();
1097 } else
1098 ThrowUnexpectedTokenError(Integer, t);
1099 }
1100 // RightBracket
1101 stream.Read(RightBracket);
1102 // !(Colon, Integer) RightBracket
1103 } else if (t->Type() == RightBracket) {
1104 // Nothing to do here...
1105
1106 // Something else...
1107 } else
1108 ThrowUnexpectedTokenError(Colon, Integer, t);
1109 }
1110 Range range(start, end);
1111 if (range.InitCheck() == B_OK)
1112 return range;
1113 else
1114 throw range.GetErr();
1115 }
1116
1117 DisjList*
ParsePatternList(Range range)1118 Parser::ParsePatternList(Range range) {
1119 PatternList *list = new(std::nothrow) PatternList(range);
1120 if (!list)
1121 ThrowOutOfMemError(stream.Pos());
1122 try {
1123 // LeftParen
1124 stream.Read(LeftParen);
1125 // [Flag] Pattern, (Divider, [Flag] Pattern)*
1126 while (true) {
1127 // [Flag]
1128 if (stream.CondRead(CaseInsensitiveFlag))
1129 list->SetCaseInsensitive(true);
1130 // Pattern
1131 list->Add(ParsePattern());
1132 // [Divider]
1133 if (!stream.CondRead(Divider))
1134 break;
1135 }
1136 // RightParen
1137 const Token *t = stream.Get();
1138 if (t->Type() != RightParen)
1139 throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1140 } catch (...) {
1141 delete list;
1142 throw;
1143 }
1144 return list;
1145 }
1146
1147 DisjList*
ParseRPatternList()1148 Parser::ParseRPatternList() {
1149 RPatternList *list = new(std::nothrow) RPatternList();
1150 if (!list)
1151 ThrowOutOfMemError(stream.Pos());
1152 try {
1153 // LeftParen
1154 stream.Read(LeftParen);
1155 // [Flag] RPattern, (Divider, [Flag] RPattern)*
1156 while (true) {
1157 // [Flag]
1158 if (stream.CondRead(CaseInsensitiveFlag))
1159 list->SetCaseInsensitive(true);
1160 // RPattern
1161 list->Add(ParseRPattern());
1162 // [Divider]
1163 if (!stream.CondRead(Divider))
1164 break;
1165 }
1166 // RightParen
1167 const Token *t = stream.Get();
1168 if (t->Type() != RightParen)
1169 throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos());
1170 } catch (...) {
1171 delete list;
1172 throw;
1173 }
1174 return list;
1175 }
1176
1177 RPattern*
ParseRPattern()1178 Parser::ParseRPattern() {
1179 // Range
1180 Range range = ParseRange();
1181 // Pattern
1182 Pattern *pattern = ParsePattern();
1183
1184 RPattern *result = new(std::nothrow) RPattern(range, pattern);
1185 if (result) {
1186 if (result->InitCheck() == B_OK)
1187 return result;
1188 else {
1189 Err *err = result->GetErr();
1190 delete result;
1191 throw err;
1192 }
1193 } else
1194 ThrowOutOfMemError(stream.Pos());
1195 return NULL;
1196 }
1197
1198 Pattern*
ParsePattern()1199 Parser::ParsePattern() {
1200 std::string str;
1201 // String
1202 {
1203 const Token *t = stream.Get();
1204 if (t->Type() == CharacterString)
1205 str = t->String();
1206 else
1207 throw new Err("Sniffer pattern error: missing pattern", t->Pos());
1208 }
1209 // [Ampersand, String]
1210 if (stream.CondRead(Ampersand)) {
1211 // String (i.e. Mask)
1212 const Token *t = stream.Get();
1213 if (t->Type() == CharacterString) {
1214 Pattern *result = new(std::nothrow) Pattern(str, t->String());
1215 if (!result)
1216 ThrowOutOfMemError(t->Pos());
1217 if (result->InitCheck() == B_OK) {
1218 return result;
1219 } else {
1220 Err *err = result->GetErr();
1221 delete result;
1222 if (err) {
1223 err->SetPos(t->Pos());
1224 }
1225 throw err;
1226 }
1227 } else
1228 ThrowUnexpectedTokenError(CharacterString, t);
1229 } else {
1230 // No mask specified.
1231 Pattern *result = new(std::nothrow) Pattern(str);
1232 if (result) {
1233 if (result->InitCheck() == B_OK)
1234 return result;
1235 else {
1236 Err *err = result->GetErr();
1237 delete result;
1238 throw err;
1239 }
1240 } else
1241 ThrowOutOfMemError(stream.Pos());
1242 }
1243 return NULL;
1244 }
1245
1246 void
ThrowEndOfStreamError()1247 Parser::ThrowEndOfStreamError() {
1248 throw new Err("Sniffer pattern error: unterminated rule", stream.EndPos());
1249 }
1250
1251 inline
1252 void
ThrowOutOfMemError(ssize_t pos)1253 Parser::ThrowOutOfMemError(ssize_t pos) {
1254 if (fOutOfMemErr)
1255 fOutOfMemErr->SetPos(pos);
1256 Err *err = fOutOfMemErr;
1257 fOutOfMemErr = NULL;
1258 throw err;
1259 }
1260
1261 void
ThrowUnexpectedTokenError(TokenType expected,const Token * found)1262 Parser::ThrowUnexpectedTokenError(TokenType expected, const Token *found) {
1263 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected)
1264 + ", found " + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1265 , (found ? found->Pos() : stream.EndPos()));
1266 }
1267
1268 void
ThrowUnexpectedTokenError(TokenType expected1,TokenType expected2,const Token * found)1269 Parser::ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found) {
1270 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected1)
1271 + " or " + tokenTypeToString(expected2) + ", found "
1272 + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str()
1273 , (found ? found->Pos() : stream.EndPos()));
1274 }
1275
1276
1277
1278