1 //---------------------------------------------------------------------- 2 // This software is part of the OpenBeOS distribution and is covered 3 // by the OpenBeOS license. 4 //---------------------------------------------------------------------- 5 /*! 6 \file sniffer/Parser.cpp 7 MIME sniffer rule parser implementation 8 */ 9 10 //#include <sniffer/Expr.h> 11 #include <sniffer/Parser.h> 12 #include <sniffer/Pattern.h> 13 #include <sniffer/PatternList.h> 14 #include <sniffer/Range.h> 15 #include <sniffer/RPattern.h> 16 #include <sniffer/RPatternList.h> 17 #include <sniffer/Rule.h> 18 19 #include <new.h> 20 #include <stdio.h> 21 #include <stdlib.h> // For atol(), atof() 22 #include <string.h> 23 #include <String.h> // BString 24 25 using namespace Sniffer; 26 27 // Miscellaneous helper functions 28 char escapeChar(char ch); 29 char hexToChar(char hi, char low); 30 char hexToChar(char hex); 31 char octalToChar(char octal); 32 char octalToChar(char hi, char low); 33 char octalToChar(char hi, char mid, char low); 34 bool isHexChar(char ch); 35 bool isWhiteSpace(char ch); 36 bool isOctalChar(char ch); 37 bool isDecimalChar(char ch); 38 bool isPunctuation(char ch); 39 40 //! Parses the given rule. 41 /*! The resulting parsed Sniffer::Rule structure is stored in \c rule, which 42 must be pre-allocated. If parsing fails, a descriptive error message (meant 43 to be viewed in a monospaced font) is placed in the pre-allocated \c BString 44 pointed to by \c parseError (which may be \c NULL if you don't care about 45 the error message). 46 47 \param rule Pointer to a NULL-terminated string containing the sniffer 48 rule to be parsed 49 \param result Pointer to a pre-allocated \c Sniffer::Rule object into which the result 50 of parsing is placed upon success. 51 \param parseError Point to pre-allocated \c BString object into which 52 a descriptive error message is stored upon failure. 53 54 \return 55 - B_OK: Success 56 - B_BAD_MIME_SNIFFER_RULE: Failure 57 */ 58 status_t 59 Sniffer::parse(const char *rule, Rule *result, BString *parseError) { 60 Parser parser; 61 return parser.Parse(rule, result, parseError); 62 } 63 64 //------------------------------------------------------------------------------ 65 // CharStream 66 //------------------------------------------------------------------------------ 67 68 CharStream::CharStream(const char *string) 69 : fString(NULL) 70 , fPos(0) 71 , fLen(-1) 72 , fCStatus(B_NO_INIT) 73 { 74 SetTo(string); 75 } 76 77 CharStream::~CharStream() { 78 Unset(); 79 } 80 81 status_t 82 CharStream::SetTo(const char *string) { 83 Unset(); 84 if (string) { 85 fString = new(nothrow) char[strlen(string)+1]; 86 if (!fString) 87 fCStatus = B_NO_MEMORY; 88 else { 89 strcpy(fString, string); 90 fLen = strlen(fString); 91 fCStatus = B_OK; 92 } 93 } 94 return fCStatus; 95 } 96 97 void 98 CharStream::Unset() { 99 delete fString; 100 fCStatus = B_NO_INIT; 101 fPos = 0; 102 fLen = -1; 103 } 104 105 status_t 106 CharStream::InitCheck() const { 107 return fCStatus; 108 } 109 110 bool 111 CharStream::IsEmpty() const { 112 return fPos >= fLen; 113 } 114 115 ssize_t 116 CharStream::Pos() const { 117 return fPos; 118 } 119 120 const char* 121 CharStream::String() const { 122 return fString; 123 } 124 125 char 126 CharStream::Get() { 127 if (fCStatus != B_OK) 128 throw new Err("Sniffer parser error: CharStream::Get() called on uninitialized CharStream object", -1); 129 if (fPos < fLen) 130 return fString[fPos++]; 131 else { 132 fPos++; // Increment fPos to keep Unget()s consistent 133 return 0x3; // Return End-Of-Text char 134 } 135 } 136 137 void 138 CharStream::Unget() { 139 if (fCStatus != B_OK) 140 throw new Err("Sniffer parser error: CharStream::Unget() called on uninitialized CharStream object", -1); 141 if (fPos > 0) 142 fPos--; 143 else 144 throw new Err("Sniffer parser error: CharStream::Unget() called at beginning of character stream", -1); 145 } 146 147 //------------------------------------------------------------------------------ 148 // Token 149 //------------------------------------------------------------------------------ 150 151 Token::Token(TokenType type, const ssize_t pos) 152 : fType(type) 153 , fPos(pos) 154 { 155 // if (type != EmptyToken) 156 // cout << "New Token, fType == " << tokenTypeToString(fType) << endl; 157 } 158 159 Token::~Token() { 160 } 161 162 TokenType 163 Token::Type() const { 164 return fType; 165 } 166 167 const char* 168 Token::String() const { 169 throw new Err("Sniffer scanner error: Token::String() called on non-string token", fPos); 170 } 171 172 int32 173 Token::Int() const { 174 throw new Err("Sniffer scanner error: Token::Int() called on non-integer token", fPos); 175 } 176 177 double 178 Token::Float() const { 179 throw new Err("Sniffer scanner error: Token::Float() called on non-float token", fPos); 180 } 181 182 ssize_t 183 Token::Pos() const { 184 return fPos; 185 } 186 187 bool 188 Token::operator==(Token &ref) const { 189 // Compare types, then data if necessary 190 if (Type() == ref.Type()) { 191 switch (Type()) { 192 case CharacterString: 193 // printf(" str1 == '%s'\n", String()); 194 // printf(" str2 == '%s'\n", ref.String()); 195 // printf(" strcmp() == %d\n", strcmp(String(), ref.String())); 196 { 197 // strcmp() seems to choke on certain, non-normal ASCII chars 198 // (i.e. chars outside the usual alphabets, but still valid 199 // as far as ASCII is concerned), so we'll just compare the 200 // strings by hand to be safe. 201 const char *str1 = String(); 202 const char *str2 = ref.String(); 203 int len1 = strlen(str1); 204 int len2 = strlen(str2); 205 // printf("len1 == %d\n", len1); 206 // printf("len2 == %d\n", len2); 207 if (len1 == len2) { 208 for (int i = 0; i < len1; i++) { 209 // printf("i == %d, str1[%d] == %x, str2[%d] == %x\n", i, i, str1[i], i, str2[i]); 210 if (str1[i] != str2[i]) 211 return false; 212 } 213 } 214 return true; 215 } 216 // return strcmp(String(), ref.String()) == 0; 217 218 case Integer: 219 return Int() == ref.Int(); 220 221 case FloatingPoint: 222 return Float() == ref.Float(); 223 224 default: 225 return true; 226 } 227 } else 228 return false; 229 } 230 231 //------------------------------------------------------------------------------ 232 // StringToken 233 //------------------------------------------------------------------------------ 234 235 StringToken::StringToken(const char *string, const ssize_t pos) 236 : Token(CharacterString, pos) 237 , fString(NULL) 238 { 239 if (string) { 240 fString = new(nothrow) char[strlen(string)+1]; 241 if (fString) 242 strcpy(fString, string); 243 } 244 } 245 246 StringToken::~StringToken() { 247 delete fString; 248 } 249 250 const char* 251 StringToken::String() const { 252 return fString; 253 } 254 255 //------------------------------------------------------------------------------ 256 // IntToken 257 //------------------------------------------------------------------------------ 258 259 IntToken::IntToken(const int32 value, const ssize_t pos) 260 : Token(Integer, pos) 261 , fValue(value) 262 { 263 } 264 265 int32 266 IntToken::Int() const { 267 return fValue; 268 } 269 270 double 271 IntToken::Float() const { 272 return (double)fValue; 273 } 274 275 //------------------------------------------------------------------------------ 276 // FloatToken 277 //------------------------------------------------------------------------------ 278 279 FloatToken::FloatToken(const double value, const ssize_t pos) 280 : Token(FloatingPoint, pos) 281 , fValue(value) 282 { 283 } 284 285 double 286 FloatToken::Float() const { 287 return fValue; 288 } 289 290 //------------------------------------------------------------------------------ 291 // TokenStream 292 //------------------------------------------------------------------------------ 293 294 TokenStream::TokenStream(const char *string = NULL) 295 : fCStatus(B_NO_INIT) 296 , fPos(-1) 297 , fStrLen(-1) 298 { 299 SetTo(string); 300 } 301 302 TokenStream::~TokenStream() { 303 Unset(); 304 } 305 306 status_t 307 TokenStream::SetTo(const char *string) { 308 int q = 0; 309 Unset(); 310 if (string) { 311 fStrLen = strlen(string); 312 CharStream stream(string); 313 if (stream.InitCheck() != B_OK) 314 throw new Err("Sniffer scanner error: Unable to intialize character stream", -1); 315 316 typedef enum TokenStreamScannerState { 317 tsssStart, 318 tsssOneSingle, 319 tsssOneDouble, 320 tsssOneZero, 321 tsssZeroX, 322 tsssOneHex, 323 tsssTwoHex, 324 tsssHexStringEnd, 325 tsssIntOrFloat, 326 tsssFloat, 327 tsssLonelyDecimalPoint, 328 tsssLonelyMinusOrPlus, 329 tsssLonelyFloatExtension, 330 tsssLonelyFloatExtensionWithSign, 331 tsssExtendedFloat, 332 tsssUnquoted, 333 tsssEscape, 334 tsssEscapeX, 335 tsssEscapeOneOctal, 336 tsssEscapeTwoOctal, 337 tsssEscapeOneHex, 338 tsssEscapeTwoHex 339 }; 340 341 TokenStreamScannerState state = tsssStart; 342 TokenStreamScannerState escapedState; 343 // Used to remember which state to return to from an escape sequence 344 345 std::string charStr; // Used to build up character strings 346 char lastChar; // For two char lookahead 347 char lastLastChar; // For three char lookahead 348 bool keepLooping = true; 349 ssize_t startPos; 350 while (keepLooping) { 351 ssize_t pos = stream.Pos(); 352 char ch = stream.Get(); 353 switch (state) { 354 case tsssStart: 355 startPos = pos; 356 switch (ch) { 357 case 0x3: // End-Of-Text 358 if (stream.IsEmpty()) 359 keepLooping = false; 360 else 361 throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos); 362 break; 363 364 case '\t': 365 case '\n': 366 case ' ': 367 // Whitespace, so ignore it. 368 break; 369 370 case '"': 371 charStr = ""; 372 state = tsssOneDouble; 373 break; 374 375 case '\'': 376 charStr = ""; 377 state = tsssOneSingle; 378 break; 379 380 case '+': 381 case '-': 382 charStr = ch; 383 state = tsssLonelyMinusOrPlus; 384 break; 385 386 case '.': 387 charStr = ch; 388 state = tsssLonelyDecimalPoint; 389 break; 390 391 case '0': 392 charStr = ch; 393 state = tsssOneZero; 394 break; 395 396 case '1': 397 case '2': 398 case '3': 399 case '4': 400 case '5': 401 case '6': 402 case '7': 403 case '8': 404 case '9': 405 charStr = ch; 406 state = tsssIntOrFloat; 407 break; 408 409 case '&': AddToken(Ampersand, pos); break; 410 case '(': AddToken(LeftParen, pos); break; 411 case ')': AddToken(RightParen, pos); break; 412 case ':': AddToken(Colon, pos); break; 413 case '[': AddToken(LeftBracket, pos); break; 414 415 case '\\': 416 charStr = ""; // Clear our string 417 state = tsssEscape; 418 escapedState = tsssUnquoted; // Unquoted strings begin with an escaped character 419 break; 420 421 case ']': AddToken(RightBracket, pos); break; 422 case '|': AddToken(Divider, pos); break; 423 424 default: 425 throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos); 426 } 427 break; 428 429 case tsssOneSingle: 430 switch (ch) { 431 case '\\': 432 escapedState = state; // Save our state 433 state = tsssEscape; // Handle the escape sequence 434 break; 435 case '\'': 436 AddString(charStr.c_str(), startPos); 437 state = tsssStart; 438 break; 439 case 0x3: 440 if (stream.IsEmpty()) 441 throw new Err(std::string("Sniffer pattern error: unterminated single-quoted string"), pos); 442 else 443 charStr += ch; 444 break; 445 default: 446 charStr += ch; 447 break; 448 } 449 break; 450 451 case tsssOneDouble: 452 switch (ch) { 453 case '\\': 454 escapedState = state; // Save our state 455 state = tsssEscape; // Handle the escape sequence 456 break; 457 case '"': 458 AddString(charStr.c_str(), startPos); 459 state = tsssStart; 460 break; 461 case 0x3: 462 if (stream.IsEmpty()) 463 throw new Err(std::string("Sniffer pattern error: unterminated double-quoted string"), pos); 464 else 465 charStr += ch; 466 break; 467 default: 468 charStr += ch; 469 break; 470 } 471 break; 472 473 case tsssOneZero: 474 if (ch == 'x') { 475 charStr = ""; // Reinit, since we actually have a hex string 476 state = tsssZeroX; 477 } else if ('0' <= ch && ch <= '9') { 478 charStr += ch; 479 state = tsssIntOrFloat; 480 } else if (ch == '.') { 481 charStr += ch; 482 state = tsssFloat; 483 } else if (ch == 'e' || ch == 'E') { 484 charStr += ch; 485 state = tsssLonelyFloatExtension; 486 } else { 487 // Terminate the number 488 AddInt(charStr.c_str(), startPos); 489 490 // Push the last char back on and try again 491 stream.Unget(); 492 state = tsssStart; 493 } 494 break; 495 496 case tsssZeroX: 497 if (isHexChar(ch)) { 498 lastChar = ch; 499 state = tsssOneHex; 500 } else 501 throw new Err(std::string("Sniffer pattern error: incomplete hex code"), pos); 502 break; 503 504 case tsssOneHex: 505 if (isHexChar(ch)) { 506 try { 507 charStr += hexToChar(lastChar, ch); 508 } catch (Err *err) { 509 if (err) 510 err->SetPos(pos); 511 throw err; 512 } 513 state = tsssTwoHex; 514 } else 515 throw new Err(std::string("Sniffer pattern error: bad hex literal"), pos); // Same as R5 516 break; 517 518 case tsssTwoHex: 519 if (isHexChar(ch)) { 520 lastChar = ch; 521 state = tsssOneHex; 522 } else { 523 AddString(charStr.c_str(), startPos); 524 stream.Unget(); // So punctuation gets handled properly 525 state = tsssStart; 526 } 527 break; 528 529 case tsssIntOrFloat: 530 if (isDecimalChar(ch)) 531 charStr += ch; 532 else if (ch == '.') { 533 charStr += ch; 534 state = tsssFloat; 535 } else if (ch == 'e' || ch == 'E') { 536 charStr += ch; 537 state = tsssLonelyFloatExtension; 538 } else { 539 // Terminate the number 540 AddInt(charStr.c_str(), startPos); 541 542 // Push the last char back on and try again 543 stream.Unget(); 544 state = tsssStart; 545 } 546 break; 547 548 case tsssFloat: 549 if (isDecimalChar(ch)) 550 charStr += ch; 551 else if (ch == 'e' || ch == 'E') { 552 charStr += ch; 553 state = tsssLonelyFloatExtension; 554 } else { 555 // Terminate the number 556 AddFloat(charStr.c_str(), startPos); 557 558 // Push the last char back on and try again 559 stream.Unget(); 560 state = tsssStart; 561 } 562 break; 563 564 case tsssLonelyDecimalPoint: 565 if (isDecimalChar(ch)) { 566 charStr += ch; 567 state = tsssFloat; 568 } else 569 throw new Err(std::string("Sniffer pattern error: incomplete floating point number"), pos); 570 break; 571 572 case tsssLonelyMinusOrPlus: 573 if (isDecimalChar(ch)) { 574 charStr += ch; 575 state = tsssIntOrFloat; 576 } else if (ch == '.') { 577 charStr += ch; 578 state = tsssLonelyDecimalPoint; 579 } else 580 throw new Err(std::string("Sniffer pattern error: incomplete signed number"), pos); 581 break; 582 583 case tsssLonelyFloatExtension: 584 if (ch == '+' || ch == '-') { 585 charStr += ch; 586 state = tsssLonelyFloatExtensionWithSign; 587 } else if (isDecimalChar(ch)) { 588 charStr += ch; 589 state = tsssExtendedFloat; 590 } else 591 throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos); 592 break; 593 594 case tsssLonelyFloatExtensionWithSign: 595 if (isDecimalChar(ch)) { 596 charStr += ch; 597 state = tsssExtendedFloat; 598 } else 599 throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos); 600 break; 601 602 case tsssExtendedFloat: 603 if (isDecimalChar(ch)) { 604 charStr += ch; 605 state = tsssExtendedFloat; 606 } else { 607 // Terminate the number 608 AddFloat(charStr.c_str(), startPos); 609 610 // Push the last char back on and try again 611 stream.Unget(); 612 state = tsssStart; 613 } 614 break; 615 616 case tsssUnquoted: 617 if (ch == '\\') { 618 escapedState = state; // Save our state 619 state = tsssEscape; // Handle the escape sequence 620 } else if (isWhiteSpace(ch) || isPunctuation(ch)) { 621 AddString(charStr.c_str(), startPos); 622 stream.Unget(); // In case it's punctuation, let tsssStart handle it 623 state = tsssStart; 624 } else if (ch == '\'' || ch == '"') { 625 throw new Err(std::string("Sniffer pattern error: illegal unquoted character '") + ch + "'", pos); 626 } else if (ch == 0x3 && stream.IsEmpty()) { 627 AddString(charStr.c_str(), startPos); 628 keepLooping = false; 629 } else { 630 charStr += ch; 631 } 632 break; 633 634 case tsssEscape: 635 if (isOctalChar(ch)) { 636 lastChar = ch; 637 state = tsssEscapeOneOctal; 638 } else if (ch == 'x') { 639 state = tsssEscapeX; 640 } else { 641 // Check for a true end-of-text marker 642 if (ch == 0x3 && stream.IsEmpty()) 643 throw new Err(std::string("Sniffer pattern error: incomplete escape sequence"), pos); 644 else { 645 charStr += escapeChar(ch); 646 state = escapedState; // Return to the state we were in before the escape 647 } 648 } 649 break; 650 651 case tsssEscapeX: 652 if (isHexChar(ch)) { 653 lastChar = ch; 654 state = tsssEscapeOneHex; 655 } else 656 throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos); 657 break; 658 659 case tsssEscapeOneOctal: 660 if (isOctalChar(ch)) { 661 lastLastChar = lastChar; 662 lastChar = ch; 663 state = tsssEscapeTwoOctal; 664 } else { 665 // First handle the octal 666 try { 667 charStr += octalToChar(lastChar); 668 } catch (Err *err) { 669 if (err) 670 err->SetPos(startPos); 671 throw err; 672 } 673 674 // Push the new char back on and let the state we 675 // were in when the escape sequence was hit handle it. 676 stream.Unget(); 677 state = escapedState; 678 } 679 break; 680 681 case tsssEscapeTwoOctal: 682 if (isOctalChar(ch)) { 683 try { 684 charStr += octalToChar(lastLastChar, lastChar, ch); 685 } catch (Err *err) { 686 if (err) 687 err->SetPos(startPos); 688 throw err; 689 } 690 state = escapedState; 691 } else { 692 // First handle the octal 693 try { 694 charStr += octalToChar(lastLastChar, lastChar); 695 } catch (Err *err) { 696 if (err) 697 err->SetPos(startPos); 698 throw err; 699 } 700 701 // Push the new char back on and let the state we 702 // were in when the escape sequence was hit handle it. 703 stream.Unget(); 704 state = escapedState; 705 } 706 break; 707 708 case tsssEscapeOneHex: 709 if (isHexChar(ch)) { 710 try { 711 charStr += hexToChar(lastChar, ch); 712 } catch (Err *err) { 713 if (err) 714 err->SetPos(pos); 715 throw err; 716 } 717 state = escapedState; 718 } else 719 throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos); 720 break; 721 722 } 723 } 724 if (state == tsssStart) { 725 fCStatus = B_OK; 726 fPos = 0; 727 } else { 728 throw new Err("Sniffer pattern error: unterminated rule", stream.Pos()); 729 } 730 } 731 732 return fCStatus; 733 } 734 735 void 736 TokenStream::Unset() { 737 std::vector<Token*>::iterator i; 738 for (i = fTokenList.begin(); i != fTokenList.end(); i++) 739 delete *i; 740 fTokenList.clear(); 741 fCStatus = B_NO_INIT; 742 fStrLen = -1; 743 } 744 745 status_t 746 TokenStream::InitCheck() const { 747 return fCStatus; 748 } 749 750 //! Returns a pointer to the next token in the stream. 751 /*! The TokenStream object retains owner ship of the Token object returned by Get(). 752 If Get() is called at the end of the stream, a pointer to a Sniffer::Err object is thrown. 753 */ 754 const Token* 755 TokenStream::Get() { 756 if (fCStatus != B_OK) 757 throw new Err("Sniffer parser error: TokenStream::Get() called on uninitialized TokenStream object", -1); 758 if (fPos < fTokenList.size()) 759 return fTokenList[fPos++]; 760 else { 761 throw new Err("Sniffer pattern error: unterminated rule", EndPos()); 762 // fPos++; // Increment fPos to keep Unget()s consistent 763 // return NULL; // Return NULL to signal end of list 764 } 765 } 766 767 //! Places token returned by the most recent call to Get() back on the head of the stream. 768 /*! If Unget() is called at the beginning of the stream, a pointer to a Sniffer::Err object is thrown. 769 */ 770 void 771 TokenStream::Unget() { 772 if (fCStatus != B_OK) 773 throw new Err("Sniffer parser error: TokenStream::Unget() called on uninitialized TokenStream object", -1); 774 if (fPos > 0) 775 fPos--; 776 else 777 throw new Err("Sniffer parser error: TokenStream::Unget() called at beginning of token stream", -1); 778 } 779 780 781 /*! \brief Reads the next token in the stream and verifies it is of the given type, 782 throwing a pointer to a Sniffer::Err object if it is not. 783 */ 784 void 785 TokenStream::Read(TokenType type) { 786 const Token *t = Get(); 787 if (t->Type() != type) { 788 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(type) 789 + ", found " + tokenTypeToString(t->Type())).c_str(), t->Pos()); 790 } 791 } 792 793 //! Conditionally reads the next token in the stream. 794 /*! CondRead() peeks at the next token in the stream. If it is of the given type, the 795 token is removed from the stream and \c true is returned. If it is not of the 796 given type, false is returned and the token remains at the head of the stream. 797 */ 798 bool 799 TokenStream::CondRead(TokenType type) { 800 const Token *t = Get(); 801 if (t->Type() == type) { 802 return true; 803 } else { 804 Unget(); 805 return false; 806 } 807 } 808 809 ssize_t 810 TokenStream::Pos() const { 811 return fPos < fTokenList.size() ? fTokenList[fPos]->Pos() : fStrLen; 812 } 813 814 ssize_t 815 TokenStream::EndPos() const { 816 return fStrLen; 817 } 818 819 bool 820 TokenStream::IsEmpty() const { 821 return fCStatus != B_OK || fPos >= fTokenList.size(); 822 } 823 824 void 825 TokenStream::AddToken(TokenType type, ssize_t pos) { 826 Token *token = new Token(type, pos); 827 fTokenList.push_back(token); 828 } 829 830 void 831 TokenStream::AddString(const char *str, ssize_t pos) { 832 Token *token = new StringToken(str, pos); 833 fTokenList.push_back(token); 834 } 835 836 void 837 TokenStream::AddInt(const char *str, ssize_t pos) { 838 // Convert the string to an int 839 int32 value = atol(str); 840 Token *token = new IntToken(value, pos); 841 fTokenList.push_back(token); 842 } 843 844 void 845 TokenStream::AddFloat(const char *str, ssize_t pos) { 846 // Convert the string to a float 847 double value = atof(str); 848 Token *token = new FloatToken(value, pos); 849 fTokenList.push_back(token); 850 } 851 852 //------------------------------------------------------------------------------ 853 // Helper functions 854 //------------------------------------------------------------------------------ 855 856 char 857 escapeChar(char ch) { 858 // I've manually handled all the escape sequences I could come 859 // up with, and for anything else I just return the character 860 // passed in. Hex escapes are handled elsewhere, so \x just 861 // returns 'x'. Similarly, octals are handled elsewhere, so \0 862 // through \9 just return '0' through '9'. 863 switch (ch) { 864 case 'a': 865 return '\a'; 866 case 'b': 867 return '\b'; 868 case 'f': 869 return '\f'; 870 case 'n': 871 return '\n'; 872 case 'r': 873 return '\r'; 874 case 't': 875 return '\t'; 876 case 'v': 877 return '\v'; 878 default: 879 return ch; 880 } 881 } 882 883 // Converts 0x|hi|low| to a single char 884 char 885 hexToChar(char hi, char low) { 886 return (hexToChar(hi) << 4) | hexToChar(low); 887 } 888 889 // Converts 0x|ch| to a single char 890 char 891 hexToChar(char hex) { 892 if ('0' <= hex && hex <= '9') 893 return hex-'0'; 894 else if ('a' <= hex && hex <= 'f') 895 return hex-'a'+10; 896 else if ('A' <= hex && hex <= 'F') 897 return hex-'A'+10; 898 else 899 throw new Err(std::string("Sniffer parser error: invalid hex digit '") + hex + "' passed to hexToChar()", -1); 900 } 901 902 char 903 octalToChar(char octal) { 904 return octalToChar('0', '0', octal); 905 } 906 907 char 908 octalToChar(char hi, char low) { 909 return octalToChar('0', hi, low); 910 } 911 912 char 913 octalToChar(char hi, char mid, char low) { 914 if (isOctalChar(hi) && isOctalChar(mid) && isOctalChar(low)) { 915 // Check for octals >= decimal 256 916 if ((hi-'0') <= 3) 917 return ((hi-'0') << 6) | ((mid-'0') << 3) | (low-'0'); 918 else 919 throw new Err("Sniffer pattern error: invalid octal literal (octals must be between octal 0 and octal 377 inclusive)", -1); 920 } else 921 throw new Err(std::string("Sniffer parser error: invalid octal digit passed to hexToChar()"), -1); 922 } 923 924 bool 925 isHexChar(char ch) { 926 return ('0' <= ch && ch <= '9') 927 || ('a' <= ch && ch <= 'f') 928 || ('A' <= ch && ch <= 'F'); 929 } 930 931 bool 932 isWhiteSpace(char ch) { 933 return ch == ' ' || ch == '\n' || ch == '\t'; 934 } 935 936 bool 937 isOctalChar(char ch) { 938 return ('0' <= ch && ch <= '7'); 939 } 940 941 bool 942 isDecimalChar(char ch) { 943 return ('0' <= ch && ch <= '9'); 944 } 945 946 bool 947 isPunctuation(char ch) { 948 switch (ch) { 949 case '&': 950 case '(': 951 case ')': 952 case ':': 953 case '[': 954 case ']': 955 case '|': 956 return true; 957 default: 958 return false; 959 } 960 } 961 962 const char* 963 Sniffer::tokenTypeToString(TokenType type) { 964 switch (type) { 965 case LeftParen: 966 return "LeftParen"; 967 break; 968 case RightParen: 969 return "RightParen"; 970 break; 971 case LeftBracket: 972 return "LeftBracket"; 973 break; 974 case RightBracket: 975 return "RightBracket"; 976 break; 977 case Colon: 978 return "Colon"; 979 break; 980 case Divider: 981 return "Divider"; 982 break; 983 case Ampersand: 984 return "Ampersand"; 985 break; 986 case CharacterString: 987 return "CharacterString"; 988 break; 989 case Integer: 990 return "Integer"; 991 break; 992 case FloatingPoint: 993 return "FloatingPoint"; 994 break; 995 default: 996 return "UNKNOWN TOKEN TYPE"; 997 break; 998 } 999 } 1000 1001 //------------------------------------------------------------------------------ 1002 // Parser 1003 //------------------------------------------------------------------------------ 1004 1005 Parser::Parser() 1006 : fOutOfMemErr(new(nothrow) Err("Sniffer parser error: out of memory", -1)) 1007 { 1008 } 1009 1010 Parser::~Parser() { 1011 delete fOutOfMemErr; 1012 } 1013 1014 status_t 1015 Parser::Parse(const char *rule, Rule *result, BString *parseError) { 1016 try { 1017 if (!rule) 1018 throw new Err("Sniffer pattern error: NULL pattern", -1); 1019 if (!result) 1020 return B_BAD_VALUE; 1021 if (stream.SetTo(rule) != B_OK) 1022 throw new Err("Sniffer parser error: Unable to intialize token stream", -1); 1023 1024 ParseRule(result); 1025 1026 return B_OK; 1027 1028 } catch (Err *err) { 1029 // cout << "Caught error" << endl; 1030 if (parseError) 1031 parseError->SetTo(ErrorMessage(err, rule).c_str()); 1032 delete err; 1033 return B_BAD_MIME_SNIFFER_RULE; 1034 } 1035 } 1036 1037 std::string 1038 Parser::ErrorMessage(Err *err, const char *rule) { 1039 const char* msg = (err && err->Msg()) 1040 ? err->Msg() 1041 : "Sniffer parser error: Unexpected error with no supplied error message"; 1042 size_t pos = err && (err->Pos() >= 0) ? err->Pos() : 0; 1043 std::string str = std::string(rule ? rule : "") + "\n"; 1044 for (int i = 0; i < pos; i++) 1045 str += " "; 1046 str += "^ "; 1047 str += msg; 1048 return str; 1049 } 1050 1051 void 1052 Parser::ParseRule(Rule *result) { 1053 if (!result) 1054 throw new Err("Sniffer parser error: NULL Rule object passed to Parser::ParseRule()", -1); 1055 1056 // Priority 1057 double priority = ParsePriority(); 1058 // Expression List 1059 std::vector<Expr*>* list = ParseExprList(); 1060 1061 result->SetTo(priority, list); 1062 } 1063 1064 double 1065 Parser::ParsePriority() { 1066 const Token *t = stream.Get(); 1067 if (t->Type() == FloatingPoint || t->Type() == Integer) { 1068 double result = t->Float(); 1069 if (0.0 <= result && result <= 1.0) 1070 return result; 1071 else { 1072 // cout << "(priority == " << result << ")" << endl; 1073 throw new Err("Sniffer pattern error: invalid priority", t->Pos()); 1074 } 1075 } else 1076 throw new Err("Sniffer pattern error: match level expected", t->Pos()); // Same as R5 1077 } 1078 1079 std::vector<Expr*>* 1080 Parser::ParseExprList() { 1081 std::vector<Expr*> *list = new(nothrow) std::vector<Expr*>; 1082 if (!list) 1083 ThrowOutOfMemError(stream.Pos()); 1084 try { 1085 // Expr+ 1086 int count = 0; 1087 while (true) { 1088 Expr* expr = ParseExpr(); 1089 if (!expr) 1090 break; 1091 else { 1092 list->push_back(expr); 1093 count++; 1094 } 1095 } 1096 if (count == 0) 1097 throw new Err("Sniffer pattern error: missing expression", -1); 1098 } catch (...) { 1099 delete list; 1100 throw; 1101 } 1102 return list; 1103 } 1104 1105 Expr* 1106 Parser::ParseExpr() { 1107 // If we've run out of tokens right now, it's okay, but 1108 // we need to let ParseExprList() know what's up 1109 if (stream.IsEmpty()) 1110 return NULL; 1111 1112 // Peek ahead, then let the appropriate Parse*List() 1113 // functions handle things 1114 const Token *t1 = stream.Get(); 1115 1116 // PatternList | RangeList 1117 if (t1->Type() == LeftParen) { 1118 const Token *t2 = stream.Get(); 1119 stream.Unget(); 1120 stream.Unget(); 1121 // RangeList 1122 if (t2->Type() == LeftBracket) { 1123 return ParseRPatternList(); 1124 // PatternList 1125 } else { 1126 return ParsePatternList(Range(0,0)); 1127 } 1128 // Range, PatternList 1129 } else if (t1->Type() == LeftBracket) { 1130 stream.Unget(); 1131 return ParsePatternList(ParseRange()); 1132 } else { 1133 throw new Err("Sniffer pattern error: missing pattern", t1->Pos()); // Same as R5 1134 } 1135 1136 // PatternList 1137 // RangeList 1138 // Range + PatternList 1139 } 1140 1141 Range 1142 Parser::ParseRange() { 1143 int32 start, end; 1144 // LeftBracket 1145 stream.Read(LeftBracket); 1146 // Integer 1147 { 1148 const Token *t = stream.Get(); 1149 if (t->Type() == Integer) { 1150 start = t->Int(); 1151 end = start; // In case we aren't given an explicit end 1152 } else 1153 throw new Err("Sniffer pattern error: pattern offset expected", t->Pos()); 1154 } 1155 // [Colon, Integer] RightBracket 1156 { 1157 const Token *t = stream.Get(); 1158 // Colon, Integer, RightBracket 1159 if (t->Type() == Colon) { 1160 // Integer 1161 { 1162 const Token *t = stream.Get(); 1163 if (t->Type() == Integer) { 1164 end = t->Int(); 1165 } else 1166 ThrowUnexpectedTokenError(Integer, t); 1167 } 1168 // RightBracket 1169 stream.Read(RightBracket); 1170 // !(Colon, Integer) RightBracket 1171 } else if (t->Type() == RightBracket) { 1172 // Nothing to do here... 1173 1174 // Something else... 1175 } else 1176 ThrowUnexpectedTokenError(Colon, Integer, t); 1177 } 1178 Range range(start, end); 1179 if (range.InitCheck() == B_OK) 1180 return range; 1181 else 1182 throw range.GetErr(); 1183 } 1184 1185 Expr* 1186 Parser::ParsePatternList(Range range) { 1187 PatternList *list = new(nothrow) PatternList(range); 1188 if (!list) 1189 ThrowOutOfMemError(stream.Pos()); 1190 try { 1191 // LeftParen 1192 stream.Read(LeftParen); 1193 // Pattern, (Divider, Pattern)* 1194 bool keepLooping = true; 1195 while (true) { 1196 // Pattern 1197 list->Add(ParsePattern()); 1198 // [Divider] 1199 if (!stream.CondRead(Divider)) 1200 break; 1201 } 1202 // RightParen 1203 const Token *t = stream.Get(); 1204 if (t->Type() != RightParen) 1205 throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos()); 1206 } catch (...) { 1207 delete list; 1208 throw; 1209 } 1210 return list; 1211 } 1212 1213 Expr* 1214 Parser::ParseRPatternList() { 1215 RPatternList *list = new(nothrow) RPatternList(); 1216 if (!list) 1217 ThrowOutOfMemError(stream.Pos()); 1218 try { 1219 // LeftParen 1220 stream.Read(LeftParen); 1221 // RPattern, (Divider, RPattern)* 1222 bool keepLooping = true; 1223 while (true) { 1224 // RPattern 1225 list->Add(ParseRPattern()); 1226 // [Divider] 1227 if (!stream.CondRead(Divider)) 1228 break; 1229 } 1230 // RightParen 1231 const Token *t = stream.Get(); 1232 if (t->Type() != RightParen) 1233 throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos()); 1234 } catch (...) { 1235 delete list; 1236 throw; 1237 } 1238 return list; 1239 } 1240 1241 RPattern* 1242 Parser::ParseRPattern() { 1243 // Range 1244 Range range = ParseRange(); 1245 // Pattern 1246 Pattern *pattern = ParsePattern(); 1247 1248 RPattern *result = new(nothrow) RPattern(range, pattern); 1249 if (result) { 1250 if (result->InitCheck() == B_OK) 1251 return result; 1252 else { 1253 Err *err = result->GetErr(); 1254 delete result; 1255 throw err; 1256 } 1257 } else 1258 ThrowOutOfMemError(stream.Pos()); 1259 } 1260 1261 Pattern* 1262 Parser::ParsePattern() { 1263 std::string str; 1264 // String 1265 { 1266 const Token *t = stream.Get(); 1267 if (t->Type() == CharacterString) 1268 str = t->String(); 1269 else 1270 throw new Err("Sniffer pattern error: missing pattern", t->Pos()); 1271 } 1272 // [Ampersand, String] 1273 if (stream.CondRead(Ampersand)) { 1274 // String (i.e. Mask) 1275 const Token *t = stream.Get(); 1276 if (t->Type() == CharacterString) { 1277 Pattern *result = new(nothrow) Pattern(str.c_str(), t->String()); 1278 if (!result) 1279 ThrowOutOfMemError(t->Pos()); 1280 if (result->InitCheck() == B_OK) { 1281 return result; 1282 } else { 1283 Err *err = result->GetErr(); 1284 delete result; 1285 if (err) { 1286 err->SetPos(t->Pos()); 1287 } 1288 throw err; 1289 } 1290 } else 1291 ThrowUnexpectedTokenError(CharacterString, t); 1292 } else { 1293 // No mask specified. 1294 Pattern *result = new(nothrow) Pattern(str.c_str()); 1295 if (result) { 1296 if (result->InitCheck() == B_OK) 1297 return result; 1298 else { 1299 Err *err = result->GetErr(); 1300 delete result; 1301 throw err; 1302 } 1303 } else 1304 ThrowOutOfMemError(stream.Pos()); 1305 } 1306 } 1307 1308 void 1309 Parser::ThrowEndOfStreamError() { 1310 throw new Err("Sniffer pattern error: unterminated rule", stream.EndPos()); 1311 } 1312 1313 inline 1314 void 1315 Parser::ThrowOutOfMemError(ssize_t pos) { 1316 if (fOutOfMemErr) 1317 fOutOfMemErr->SetPos(pos); 1318 Err *err = fOutOfMemErr; 1319 fOutOfMemErr = NULL; 1320 throw err; 1321 } 1322 1323 void 1324 Parser::ThrowUnexpectedTokenError(TokenType expected, const Token *found) { 1325 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected) 1326 + ", found " + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str() 1327 , (found ? found->Pos() : stream.EndPos())); 1328 } 1329 1330 void 1331 Parser::ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found) { 1332 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected1) 1333 + " or " + tokenTypeToString(expected2) + ", found " 1334 + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str() 1335 , (found ? found->Pos() : stream.EndPos())); 1336 } 1337