1 //---------------------------------------------------------------------- 2 // This software is part of the OpenBeOS distribution and is covered 3 // by the OpenBeOS license. 4 //---------------------------------------------------------------------- 5 /*! 6 \file sniffer/Parser.cpp 7 MIME sniffer rule parser implementation 8 */ 9 10 #include <sniffer/Parser.h> 11 #include <sniffer/Pattern.h> 12 #include <sniffer/PatternList.h> 13 #include <sniffer/Range.h> 14 #include <sniffer/RPattern.h> 15 #include <sniffer/RPatternList.h> 16 #include <sniffer/Rule.h> 17 18 #include <new.h> 19 #include <stdio.h> 20 #include <stdlib.h> // For atol(), atof() 21 #include <string.h> 22 #include <String.h> // BString 23 24 using namespace BPrivate::Storage::Sniffer; 25 26 // Miscellaneous helper functions 27 char escapeChar(char ch); 28 char hexToChar(char hi, char low); 29 char hexToChar(char hex); 30 char octalToChar(char octal); 31 char octalToChar(char hi, char low); 32 char octalToChar(char hi, char mid, char low); 33 bool isHexChar(char ch); 34 bool isWhiteSpace(char ch); 35 bool isOctalChar(char ch); 36 bool isDecimalChar(char ch); 37 bool isPunctuation(char ch); 38 39 //! Parses the given rule. 40 /*! The resulting parsed Rule structure is stored in \c rule, which 41 must be pre-allocated. If parsing fails, a descriptive error message (meant 42 to be viewed in a monospaced font) is placed in the pre-allocated \c BString 43 pointed to by \c parseError (which may be \c NULL if you don't care about 44 the error message). 45 46 \param rule Pointer to a NULL-terminated string containing the sniffer 47 rule to be parsed 48 \param result Pointer to a pre-allocated \c Rule object into which the result 49 of parsing is placed upon success. 50 \param parseError Point to pre-allocated \c BString object into which 51 a descriptive error message is stored upon failure. 52 53 \return 54 - B_OK: Success 55 - B_BAD_MIME_SNIFFER_RULE: Failure 56 */ 57 status_t 58 BPrivate::Storage::Sniffer::parse(const char *rule, Rule *result, BString *parseError) { 59 Parser parser; 60 return parser.Parse(rule, result, parseError); 61 } 62 63 //------------------------------------------------------------------------------ 64 // CharStream 65 //------------------------------------------------------------------------------ 66 67 CharStream::CharStream(const std::string &string) 68 : fString(string) 69 , fPos(0) 70 , fCStatus(B_OK) 71 { 72 } 73 74 CharStream::CharStream() 75 : fString("") 76 , fPos(0) 77 , fCStatus(B_NO_INIT) 78 { 79 } 80 81 CharStream::~CharStream() { 82 Unset(); 83 } 84 85 status_t 86 CharStream::SetTo(const std::string &string) { 87 fString = string; 88 fPos = 0; 89 fCStatus = B_OK; 90 return fCStatus; 91 } 92 93 void 94 CharStream::Unset() { 95 fString = ""; 96 fPos = 0; 97 fCStatus = B_NO_INIT; 98 } 99 100 status_t 101 CharStream::InitCheck() const { 102 return fCStatus; 103 } 104 105 bool 106 CharStream::IsEmpty() const { 107 return fPos >= fString.length(); 108 } 109 110 size_t 111 CharStream::Pos() const { 112 return fPos; 113 } 114 115 const std::string& 116 CharStream::String() const { 117 return fString; 118 } 119 120 char 121 CharStream::Get() { 122 if (fCStatus != B_OK) 123 throw new Err("Sniffer parser error: CharStream::Get() called on uninitialized CharStream object", -1); 124 if (fPos < fString.length()) 125 return fString[fPos++]; 126 else { 127 fPos++; // Increment fPos to keep Unget()s consistent 128 return 0x3; // Return End-Of-Text char 129 } 130 } 131 132 void 133 CharStream::Unget() { 134 if (fCStatus != B_OK) 135 throw new Err("Sniffer parser error: CharStream::Unget() called on uninitialized CharStream object", -1); 136 if (fPos > 0) 137 fPos--; 138 else 139 throw new Err("Sniffer parser error: CharStream::Unget() called at beginning of character stream", -1); 140 } 141 142 //------------------------------------------------------------------------------ 143 // Token 144 //------------------------------------------------------------------------------ 145 146 Token::Token(TokenType type, const ssize_t pos) 147 : fType(type) 148 , fPos(pos) 149 { 150 // if (type != EmptyToken) 151 // cout << "New Token, fType == " << tokenTypeToString(fType) << endl; 152 } 153 154 Token::~Token() { 155 } 156 157 TokenType 158 Token::Type() const { 159 return fType; 160 } 161 162 const std::string& 163 Token::String() const { 164 throw new Err("Sniffer scanner error: Token::String() called on non-string token", fPos); 165 } 166 167 int32 168 Token::Int() const { 169 throw new Err("Sniffer scanner error: Token::Int() called on non-integer token", fPos); 170 } 171 172 double 173 Token::Float() const { 174 throw new Err("Sniffer scanner error: Token::Float() called on non-float token", fPos); 175 } 176 177 ssize_t 178 Token::Pos() const { 179 return fPos; 180 } 181 182 bool 183 Token::operator==(Token &ref) const { 184 // Compare types, then data if necessary 185 if (Type() == ref.Type()) { 186 switch (Type()) { 187 case CharacterString: 188 // printf(" str1 == '%s'\n", String()); 189 // printf(" str2 == '%s'\n", ref.String()); 190 // printf(" strcmp() == %d\n", strcmp(String(), ref.String())); 191 { 192 return String() == ref.String(); 193 194 /* 195 // strcmp() seems to choke on certain, non-normal ASCII chars 196 // (i.e. chars outside the usual alphabets, but still valid 197 // as far as ASCII is concerned), so we'll just compare the 198 // strings by hand to be safe. 199 const char *str1 = String(); 200 const char *str2 = ref.String(); 201 int len1 = strlen(str1); 202 int len2 = strlen(str2); 203 // printf("len1 == %d\n", len1); 204 // printf("len2 == %d\n", len2); 205 if (len1 == len2) { 206 for (int i = 0; i < len1; i++) { 207 // printf("i == %d, str1[%d] == %x, str2[%d] == %x\n", i, i, str1[i], i, str2[i]); 208 if (str1[i] != str2[i]) 209 return false; 210 } 211 } 212 return true; 213 */ 214 } 215 // return strcmp(String(), ref.String()) == 0; 216 217 case Integer: 218 return Int() == ref.Int(); 219 220 case FloatingPoint: 221 return Float() == ref.Float(); 222 223 default: 224 return true; 225 } 226 } else 227 return false; 228 } 229 230 //------------------------------------------------------------------------------ 231 // StringToken 232 //------------------------------------------------------------------------------ 233 234 StringToken::StringToken(const std::string &str, const ssize_t pos) 235 : Token(CharacterString, pos) 236 , fString(str) 237 { 238 } 239 240 StringToken::~StringToken() { 241 } 242 243 const std::string& 244 StringToken::String() const { 245 return fString; 246 } 247 248 //------------------------------------------------------------------------------ 249 // IntToken 250 //------------------------------------------------------------------------------ 251 252 IntToken::IntToken(const int32 value, const ssize_t pos) 253 : Token(Integer, pos) 254 , fValue(value) 255 { 256 } 257 258 IntToken::~IntToken() { 259 } 260 261 int32 262 IntToken::Int() const { 263 return fValue; 264 } 265 266 double 267 IntToken::Float() const { 268 return (double)fValue; 269 } 270 271 //------------------------------------------------------------------------------ 272 // FloatToken 273 //------------------------------------------------------------------------------ 274 275 FloatToken::FloatToken(const double value, const ssize_t pos) 276 : Token(FloatingPoint, pos) 277 , fValue(value) 278 { 279 } 280 281 FloatToken::~FloatToken() { 282 } 283 284 285 double 286 FloatToken::Float() const { 287 return fValue; 288 } 289 290 //------------------------------------------------------------------------------ 291 // TokenStream 292 //------------------------------------------------------------------------------ 293 294 TokenStream::TokenStream(const std::string &string) 295 : fCStatus(B_NO_INIT) 296 , fPos(-1) 297 , fStrLen(-1) 298 { 299 SetTo(string); 300 } 301 302 TokenStream::TokenStream() 303 : fCStatus(B_NO_INIT) 304 , fPos(-1) 305 , fStrLen(-1) 306 { 307 } 308 309 TokenStream::~TokenStream() { 310 Unset(); 311 } 312 313 status_t 314 TokenStream::SetTo(const std::string &string) { 315 Unset(); 316 fStrLen = string.length(); 317 CharStream stream(string); 318 if (stream.InitCheck() != B_OK) 319 throw new Err("Sniffer scanner error: Unable to intialize character stream", -1); 320 321 typedef enum TokenStreamScannerState { 322 tsssStart, 323 tsssOneSingle, 324 tsssOneDouble, 325 tsssOneZero, 326 tsssZeroX, 327 tsssOneHex, 328 tsssTwoHex, 329 tsssIntOrFloat, 330 tsssFloat, 331 tsssLonelyDecimalPoint, 332 tsssLonelyMinusOrPlus, 333 tsssLonelyFloatExtension, 334 tsssLonelyFloatExtensionWithSign, 335 tsssExtendedFloat, 336 tsssUnquoted, 337 tsssEscape, 338 tsssEscapeX, 339 tsssEscapeOneOctal, 340 tsssEscapeTwoOctal, 341 tsssEscapeOneHex, 342 }; 343 344 TokenStreamScannerState state = tsssStart; 345 TokenStreamScannerState escapedState = tsssStart; 346 // Used to remember which state to return to from an escape sequence 347 348 std::string charStr = ""; // Used to build up character strings 349 char lastChar = 0; // For two char lookahead 350 char lastLastChar = 0; // For three char lookahead (have I mentioned I hate octal?) 351 bool keepLooping = true; 352 ssize_t startPos = 0; 353 while (keepLooping) { 354 ssize_t pos = stream.Pos(); 355 char ch = stream.Get(); 356 switch (state) { 357 case tsssStart: 358 startPos = pos; 359 switch (ch) { 360 case 0x3: // End-Of-Text 361 if (stream.IsEmpty()) 362 keepLooping = false; 363 else 364 throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos); 365 break; 366 367 case '\t': 368 case '\n': 369 case ' ': 370 // Whitespace, so ignore it. 371 break; 372 373 case '"': 374 charStr = ""; 375 state = tsssOneDouble; 376 break; 377 378 case '\'': 379 charStr = ""; 380 state = tsssOneSingle; 381 break; 382 383 case '+': 384 case '-': 385 charStr = ch; 386 lastChar = ch; 387 state = tsssLonelyMinusOrPlus; 388 break; 389 390 case '.': 391 charStr = ch; 392 state = tsssLonelyDecimalPoint; 393 break; 394 395 case '0': 396 charStr = ch; 397 state = tsssOneZero; 398 break; 399 400 case '1': 401 case '2': 402 case '3': 403 case '4': 404 case '5': 405 case '6': 406 case '7': 407 case '8': 408 case '9': 409 charStr = ch; 410 state = tsssIntOrFloat; 411 break; 412 413 case '&': AddToken(Ampersand, pos); break; 414 case '(': AddToken(LeftParen, pos); break; 415 case ')': AddToken(RightParen, pos); break; 416 case ':': AddToken(Colon, pos); break; 417 case '[': AddToken(LeftBracket, pos); break; 418 419 case '\\': 420 charStr = ""; // Clear our string 421 state = tsssEscape; 422 escapedState = tsssUnquoted; // Unquoted strings begin with an escaped character 423 break; 424 425 case ']': AddToken(RightBracket, pos); break; 426 case '|': AddToken(Divider, pos); break; 427 428 default: 429 throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos); 430 } 431 break; 432 433 case tsssOneSingle: 434 switch (ch) { 435 case '\\': 436 escapedState = state; // Save our state 437 state = tsssEscape; // Handle the escape sequence 438 break; 439 case '\'': 440 AddString(charStr, startPos); 441 state = tsssStart; 442 break; 443 case 0x3: 444 if (stream.IsEmpty()) 445 throw new Err(std::string("Sniffer pattern error: unterminated single-quoted string"), pos); 446 else 447 charStr += ch; 448 break; 449 default: 450 charStr += ch; 451 break; 452 } 453 break; 454 455 case tsssOneDouble: 456 switch (ch) { 457 case '\\': 458 escapedState = state; // Save our state 459 state = tsssEscape; // Handle the escape sequence 460 break; 461 case '"': 462 AddString(charStr, startPos); 463 state = tsssStart; 464 break; 465 case 0x3: 466 if (stream.IsEmpty()) 467 throw new Err(std::string("Sniffer pattern error: unterminated double-quoted string"), pos); 468 else 469 charStr += ch; 470 break; 471 default: 472 charStr += ch; 473 break; 474 } 475 break; 476 477 case tsssOneZero: 478 if (ch == 'x') { 479 charStr = ""; // Reinit, since we actually have a hex string 480 state = tsssZeroX; 481 } else if ('0' <= ch && ch <= '9') { 482 charStr += ch; 483 state = tsssIntOrFloat; 484 } else if (ch == '.') { 485 charStr += ch; 486 state = tsssFloat; 487 } else if (ch == 'e' || ch == 'E') { 488 charStr += ch; 489 state = tsssLonelyFloatExtension; 490 } else { 491 // Terminate the number 492 AddInt(charStr.c_str(), startPos); 493 494 // Push the last char back on and try again 495 stream.Unget(); 496 state = tsssStart; 497 } 498 break; 499 500 case tsssZeroX: 501 if (isHexChar(ch)) { 502 lastChar = ch; 503 state = tsssOneHex; 504 } else 505 throw new Err(std::string("Sniffer pattern error: incomplete hex code"), pos); 506 break; 507 508 case tsssOneHex: 509 if (isHexChar(ch)) { 510 try { 511 charStr += hexToChar(lastChar, ch); 512 } catch (Err *err) { 513 if (err) 514 err->SetPos(pos); 515 throw err; 516 } 517 state = tsssTwoHex; 518 } else 519 throw new Err(std::string("Sniffer pattern error: bad hex literal"), pos); // Same as R5 520 break; 521 522 case tsssTwoHex: 523 if (isHexChar(ch)) { 524 lastChar = ch; 525 state = tsssOneHex; 526 } else { 527 AddString(charStr, startPos); 528 stream.Unget(); // So punctuation gets handled properly 529 state = tsssStart; 530 } 531 break; 532 533 case tsssIntOrFloat: 534 if (isDecimalChar(ch)) 535 charStr += ch; 536 else if (ch == '.') { 537 charStr += ch; 538 state = tsssFloat; 539 } else if (ch == 'e' || ch == 'E') { 540 charStr += ch; 541 state = tsssLonelyFloatExtension; 542 } else { 543 // Terminate the number 544 AddInt(charStr.c_str(), startPos); 545 546 // Push the last char back on and try again 547 stream.Unget(); 548 state = tsssStart; 549 } 550 break; 551 552 case tsssFloat: 553 if (isDecimalChar(ch)) 554 charStr += ch; 555 else if (ch == 'e' || ch == 'E') { 556 charStr += ch; 557 state = tsssLonelyFloatExtension; 558 } else { 559 // Terminate the number 560 AddFloat(charStr.c_str(), startPos); 561 562 // Push the last char back on and try again 563 stream.Unget(); 564 state = tsssStart; 565 } 566 break; 567 568 case tsssLonelyDecimalPoint: 569 if (isDecimalChar(ch)) { 570 charStr += ch; 571 state = tsssFloat; 572 } else 573 throw new Err(std::string("Sniffer pattern error: incomplete floating point number"), pos); 574 break; 575 576 case tsssLonelyMinusOrPlus: 577 if (isDecimalChar(ch)) { 578 charStr += ch; 579 state = tsssIntOrFloat; 580 } else if (ch == '.') { 581 charStr += ch; 582 state = tsssLonelyDecimalPoint; 583 } else if (ch == 'i' && lastChar == '-') { 584 AddToken(CaseInsensitiveFlag, startPos); 585 state = tsssStart; 586 } else 587 throw new Err(std::string("Sniffer pattern error: incomplete signed number"), pos); 588 break; 589 590 case tsssLonelyFloatExtension: 591 if (ch == '+' || ch == '-') { 592 charStr += ch; 593 state = tsssLonelyFloatExtensionWithSign; 594 } else if (isDecimalChar(ch)) { 595 charStr += ch; 596 state = tsssExtendedFloat; 597 } else 598 throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos); 599 break; 600 601 case tsssLonelyFloatExtensionWithSign: 602 if (isDecimalChar(ch)) { 603 charStr += ch; 604 state = tsssExtendedFloat; 605 } else 606 throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos); 607 break; 608 609 case tsssExtendedFloat: 610 if (isDecimalChar(ch)) { 611 charStr += ch; 612 state = tsssExtendedFloat; 613 } else { 614 // Terminate the number 615 AddFloat(charStr.c_str(), startPos); 616 617 // Push the last char back on and try again 618 stream.Unget(); 619 state = tsssStart; 620 } 621 break; 622 623 case tsssUnquoted: 624 if (ch == '\\') { 625 escapedState = state; // Save our state 626 state = tsssEscape; // Handle the escape sequence 627 } else if (isWhiteSpace(ch) || isPunctuation(ch)) { 628 AddString(charStr, startPos); 629 stream.Unget(); // In case it's punctuation, let tsssStart handle it 630 state = tsssStart; 631 } else if (ch == 0x3 && stream.IsEmpty()) { 632 AddString(charStr, startPos); 633 keepLooping = false; 634 } else { 635 charStr += ch; 636 } 637 break; 638 639 case tsssEscape: 640 if (isOctalChar(ch)) { 641 lastChar = ch; 642 state = tsssEscapeOneOctal; 643 } else if (ch == 'x') { 644 state = tsssEscapeX; 645 } else { 646 // Check for a true end-of-text marker 647 if (ch == 0x3 && stream.IsEmpty()) 648 throw new Err(std::string("Sniffer pattern error: incomplete escape sequence"), pos); 649 else { 650 charStr += escapeChar(ch); 651 state = escapedState; // Return to the state we were in before the escape 652 } 653 } 654 break; 655 656 case tsssEscapeX: 657 if (isHexChar(ch)) { 658 lastChar = ch; 659 state = tsssEscapeOneHex; 660 } else 661 throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos); 662 break; 663 664 case tsssEscapeOneOctal: 665 if (isOctalChar(ch)) { 666 lastLastChar = lastChar; 667 lastChar = ch; 668 state = tsssEscapeTwoOctal; 669 } else { 670 // First handle the octal 671 try { 672 charStr += octalToChar(lastChar); 673 } catch (Err *err) { 674 if (err) 675 err->SetPos(startPos); 676 throw err; 677 } 678 679 // Push the new char back on and let the state we 680 // were in when the escape sequence was hit handle it. 681 stream.Unget(); 682 state = escapedState; 683 } 684 break; 685 686 case tsssEscapeTwoOctal: 687 if (isOctalChar(ch)) { 688 try { 689 charStr += octalToChar(lastLastChar, lastChar, ch); 690 } catch (Err *err) { 691 if (err) 692 err->SetPos(startPos); 693 throw err; 694 } 695 state = escapedState; 696 } else { 697 // First handle the octal 698 try { 699 charStr += octalToChar(lastLastChar, lastChar); 700 } catch (Err *err) { 701 if (err) 702 err->SetPos(startPos); 703 throw err; 704 } 705 706 // Push the new char back on and let the state we 707 // were in when the escape sequence was hit handle it. 708 stream.Unget(); 709 state = escapedState; 710 } 711 break; 712 713 case tsssEscapeOneHex: 714 if (isHexChar(ch)) { 715 try { 716 charStr += hexToChar(lastChar, ch); 717 } catch (Err *err) { 718 if (err) 719 err->SetPos(pos); 720 throw err; 721 } 722 state = escapedState; 723 } else 724 throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos); 725 break; 726 727 } 728 } 729 if (state == tsssStart) { 730 fCStatus = B_OK; 731 fPos = 0; 732 } else { 733 throw new Err("Sniffer pattern error: unterminated rule", stream.Pos()); 734 } 735 736 return fCStatus; 737 } 738 739 void 740 TokenStream::Unset() { 741 std::vector<Token*>::iterator i; 742 for (i = fTokenList.begin(); i != fTokenList.end(); i++) 743 delete *i; 744 fTokenList.clear(); 745 fCStatus = B_NO_INIT; 746 fStrLen = -1; 747 } 748 749 status_t 750 TokenStream::InitCheck() const { 751 return fCStatus; 752 } 753 754 //! Returns a pointer to the next token in the stream. 755 /*! The TokenStream object retains owner ship of the Token object returned by Get(). 756 If Get() is called at the end of the stream, a pointer to a Err object is thrown. 757 */ 758 const Token* 759 TokenStream::Get() { 760 if (fCStatus != B_OK) 761 throw new Err("Sniffer parser error: TokenStream::Get() called on uninitialized TokenStream object", -1); 762 if (fPos < (ssize_t)fTokenList.size()) 763 return fTokenList[fPos++]; 764 else { 765 throw new Err("Sniffer pattern error: unterminated rule", EndPos()); 766 // fPos++; // Increment fPos to keep Unget()s consistent 767 // return NULL; // Return NULL to signal end of list 768 } 769 } 770 771 //! Places token returned by the most recent call to Get() back on the head of the stream. 772 /*! If Unget() is called at the beginning of the stream, a pointer to a Err object is thrown. 773 */ 774 void 775 TokenStream::Unget() { 776 if (fCStatus != B_OK) 777 throw new Err("Sniffer parser error: TokenStream::Unget() called on uninitialized TokenStream object", -1); 778 if (fPos > 0) 779 fPos--; 780 else 781 throw new Err("Sniffer parser error: TokenStream::Unget() called at beginning of token stream", -1); 782 } 783 784 785 /*! \brief Reads the next token in the stream and verifies it is of the given type, 786 throwing a pointer to a Err object if it is not. 787 */ 788 void 789 TokenStream::Read(TokenType type) { 790 const Token *t = Get(); 791 if (t->Type() != type) { 792 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(type) 793 + ", found " + tokenTypeToString(t->Type())).c_str(), t->Pos()); 794 } 795 } 796 797 //! Conditionally reads the next token in the stream. 798 /*! CondRead() peeks at the next token in the stream. If it is of the given type, the 799 token is removed from the stream and \c true is returned. If it is not of the 800 given type, false is returned and the token remains at the head of the stream. 801 */ 802 bool 803 TokenStream::CondRead(TokenType type) { 804 const Token *t = Get(); 805 if (t->Type() == type) { 806 return true; 807 } else { 808 Unget(); 809 return false; 810 } 811 } 812 813 ssize_t 814 TokenStream::Pos() const { 815 return fPos < (ssize_t)fTokenList.size() ? fTokenList[fPos]->Pos() : fStrLen; 816 } 817 818 ssize_t 819 TokenStream::EndPos() const { 820 return fStrLen; 821 } 822 823 bool 824 TokenStream::IsEmpty() const { 825 return fCStatus != B_OK || fPos >= (ssize_t)fTokenList.size(); 826 } 827 828 void 829 TokenStream::AddToken(TokenType type, ssize_t pos) { 830 Token *token = new Token(type, pos); 831 fTokenList.push_back(token); 832 } 833 834 void 835 TokenStream::AddString(const std::string &str, ssize_t pos) { 836 Token *token = new StringToken(str, pos); 837 fTokenList.push_back(token); 838 } 839 840 void 841 TokenStream::AddInt(const char *str, ssize_t pos) { 842 // Convert the string to an int 843 int32 value = atol(str); 844 Token *token = new IntToken(value, pos); 845 fTokenList.push_back(token); 846 } 847 848 void 849 TokenStream::AddFloat(const char *str, ssize_t pos) { 850 // Convert the string to a float 851 double value = atof(str); 852 Token *token = new FloatToken(value, pos); 853 fTokenList.push_back(token); 854 } 855 856 //------------------------------------------------------------------------------ 857 // Helper functions 858 //------------------------------------------------------------------------------ 859 860 char 861 escapeChar(char ch) { 862 // I've manually handled all the escape sequences I could come 863 // up with, and for anything else I just return the character 864 // passed in. Hex escapes are handled elsewhere, so \x just 865 // returns 'x'. Similarly, octals are handled elsewhere, so \0 866 // through \9 just return '0' through '9'. 867 switch (ch) { 868 case 'a': 869 return '\a'; 870 case 'b': 871 return '\b'; 872 case 'f': 873 return '\f'; 874 case 'n': 875 return '\n'; 876 case 'r': 877 return '\r'; 878 case 't': 879 return '\t'; 880 case 'v': 881 return '\v'; 882 default: 883 return ch; 884 } 885 } 886 887 // Converts 0x|hi|low| to a single char 888 char 889 hexToChar(char hi, char low) { 890 return (hexToChar(hi) << 4) | hexToChar(low); 891 } 892 893 // Converts 0x|ch| to a single char 894 char 895 hexToChar(char hex) { 896 if ('0' <= hex && hex <= '9') 897 return hex-'0'; 898 else if ('a' <= hex && hex <= 'f') 899 return hex-'a'+10; 900 else if ('A' <= hex && hex <= 'F') 901 return hex-'A'+10; 902 else 903 throw new Err(std::string("Sniffer parser error: invalid hex digit '") + hex + "' passed to hexToChar()", -1); 904 } 905 906 char 907 octalToChar(char octal) { 908 return octalToChar('0', '0', octal); 909 } 910 911 char 912 octalToChar(char hi, char low) { 913 return octalToChar('0', hi, low); 914 } 915 916 char 917 octalToChar(char hi, char mid, char low) { 918 if (isOctalChar(hi) && isOctalChar(mid) && isOctalChar(low)) { 919 // Check for octals >= decimal 256 920 if ((hi-'0') <= 3) 921 return ((hi-'0') << 6) | ((mid-'0') << 3) | (low-'0'); 922 else 923 throw new Err("Sniffer pattern error: invalid octal literal (octals must be between octal 0 and octal 377 inclusive)", -1); 924 } else 925 throw new Err(std::string("Sniffer parser error: invalid octal digit passed to hexToChar()"), -1); 926 } 927 928 bool 929 isHexChar(char ch) { 930 return ('0' <= ch && ch <= '9') 931 || ('a' <= ch && ch <= 'f') 932 || ('A' <= ch && ch <= 'F'); 933 } 934 935 bool 936 isWhiteSpace(char ch) { 937 return ch == ' ' || ch == '\n' || ch == '\t'; 938 } 939 940 bool 941 isOctalChar(char ch) { 942 return ('0' <= ch && ch <= '7'); 943 } 944 945 bool 946 isDecimalChar(char ch) { 947 return ('0' <= ch && ch <= '9'); 948 } 949 950 bool 951 isPunctuation(char ch) { 952 switch (ch) { 953 case '&': 954 case '(': 955 case ')': 956 case ':': 957 case '[': 958 case ']': 959 case '|': 960 return true; 961 default: 962 return false; 963 } 964 } 965 966 const char* 967 BPrivate::Storage::Sniffer::tokenTypeToString(TokenType type) { 968 switch (type) { 969 case LeftParen: 970 return "LeftParen"; 971 break; 972 case RightParen: 973 return "RightParen"; 974 break; 975 case LeftBracket: 976 return "LeftBracket"; 977 break; 978 case RightBracket: 979 return "RightBracket"; 980 break; 981 case Colon: 982 return "Colon"; 983 break; 984 case Divider: 985 return "Divider"; 986 break; 987 case Ampersand: 988 return "Ampersand"; 989 break; 990 case CaseInsensitiveFlag: 991 return "CaseInsensitiveFlag"; 992 break; 993 case CharacterString: 994 return "CharacterString"; 995 break; 996 case Integer: 997 return "Integer"; 998 break; 999 case FloatingPoint: 1000 return "FloatingPoint"; 1001 break; 1002 default: 1003 return "UNKNOWN TOKEN TYPE"; 1004 break; 1005 } 1006 } 1007 1008 //------------------------------------------------------------------------------ 1009 // Parser 1010 //------------------------------------------------------------------------------ 1011 1012 Parser::Parser() 1013 : fOutOfMemErr(new(nothrow) Err("Sniffer parser error: out of memory", -1)) 1014 { 1015 } 1016 1017 Parser::~Parser() { 1018 delete fOutOfMemErr; 1019 } 1020 1021 status_t 1022 Parser::Parse(const char *rule, Rule *result, BString *parseError) { 1023 try { 1024 if (!rule) 1025 throw new Err("Sniffer pattern error: NULL pattern", -1); 1026 if (!result) 1027 return B_BAD_VALUE; 1028 if (stream.SetTo(rule) != B_OK) 1029 throw new Err("Sniffer parser error: Unable to intialize token stream", -1); 1030 1031 ParseRule(result); 1032 1033 return B_OK; 1034 1035 } catch (Err *err) { 1036 // cout << "Caught error" << endl; 1037 if (parseError) 1038 parseError->SetTo(ErrorMessage(err, rule).c_str()); 1039 delete err; 1040 return B_BAD_MIME_SNIFFER_RULE; 1041 } 1042 } 1043 1044 std::string 1045 Parser::ErrorMessage(Err *err, const char *rule) { 1046 const char* msg = (err && err->Msg()) 1047 ? err->Msg() 1048 : "Sniffer parser error: Unexpected error with no supplied error message"; 1049 ssize_t pos = err && (err->Pos() >= 0) ? err->Pos() : 0; 1050 std::string str = std::string(rule ? rule : "") + "\n"; 1051 for (int i = 0; i < pos; i++) 1052 str += " "; 1053 str += "^ "; 1054 str += msg; 1055 return str; 1056 } 1057 1058 void 1059 Parser::ParseRule(Rule *result) { 1060 if (!result) 1061 throw new Err("Sniffer parser error: NULL Rule object passed to Parser::ParseRule()", -1); 1062 1063 // Priority 1064 double priority = ParsePriority(); 1065 // Conjunction List 1066 std::vector<DisjList*>* list = ParseConjList(); 1067 1068 result->SetTo(priority, list); 1069 } 1070 1071 double 1072 Parser::ParsePriority() { 1073 const Token *t = stream.Get(); 1074 if (t->Type() == FloatingPoint || t->Type() == Integer) { 1075 double result = t->Float(); 1076 if (0.0 <= result && result <= 1.0) 1077 return result; 1078 else { 1079 // cout << "(priority == " << result << ")" << endl; 1080 throw new Err("Sniffer pattern error: invalid priority", t->Pos()); 1081 } 1082 } else 1083 throw new Err("Sniffer pattern error: match level expected", t->Pos()); // Same as R5 1084 } 1085 1086 std::vector<DisjList*>* 1087 Parser::ParseConjList() { 1088 std::vector<DisjList*> *list = new(nothrow) std::vector<DisjList*>; 1089 if (!list) 1090 ThrowOutOfMemError(stream.Pos()); 1091 try { 1092 // DisjList+ 1093 int count = 0; 1094 while (true) { 1095 DisjList* expr = ParseDisjList(); 1096 if (!expr) 1097 break; 1098 else { 1099 list->push_back(expr); 1100 count++; 1101 } 1102 } 1103 if (count == 0) 1104 throw new Err("Sniffer pattern error: missing expression", -1); 1105 } catch (...) { 1106 delete list; 1107 throw; 1108 } 1109 return list; 1110 } 1111 1112 DisjList* 1113 Parser::ParseDisjList() { 1114 // If we've run out of tokens right now, it's okay, but 1115 // we need to let ParseConjList() know what's up 1116 if (stream.IsEmpty()) 1117 return NULL; 1118 1119 // Peek ahead, then let the appropriate Parse*List() 1120 // functions handle things 1121 const Token *t1 = stream.Get(); 1122 1123 // PatternList | RangeList 1124 if (t1->Type() == LeftParen) { 1125 const Token *t2 = stream.Get(); 1126 // Skip the case-insensitive flag, if there is one 1127 const Token *tokenOfInterest = (t2->Type() == CaseInsensitiveFlag) ? stream.Get() : t2; 1128 if (t2 != tokenOfInterest) 1129 stream.Unget(); // We called Get() three times 1130 stream.Unget(); 1131 stream.Unget(); 1132 // RangeList 1133 if (tokenOfInterest->Type() == LeftBracket) { 1134 return ParseRPatternList(); 1135 // PatternList 1136 } else { 1137 return ParsePatternList(Range(0,0)); 1138 } 1139 // Range, PatternList 1140 } else if (t1->Type() == LeftBracket) { 1141 stream.Unget(); 1142 return ParsePatternList(ParseRange()); 1143 } else { 1144 throw new Err("Sniffer pattern error: missing pattern", t1->Pos()); // Same as R5 1145 } 1146 1147 // PatternList 1148 // RangeList 1149 // Range + PatternList 1150 } 1151 1152 Range 1153 Parser::ParseRange() { 1154 int32 start, end; 1155 // LeftBracket 1156 stream.Read(LeftBracket); 1157 // Integer 1158 { 1159 const Token *t = stream.Get(); 1160 if (t->Type() == Integer) { 1161 start = t->Int(); 1162 end = start; // In case we aren't given an explicit end 1163 } else 1164 throw new Err("Sniffer pattern error: pattern offset expected", t->Pos()); 1165 } 1166 // [Colon, Integer] RightBracket 1167 { 1168 const Token *t = stream.Get(); 1169 // Colon, Integer, RightBracket 1170 if (t->Type() == Colon) { 1171 // Integer 1172 { 1173 const Token *t = stream.Get(); 1174 if (t->Type() == Integer) { 1175 end = t->Int(); 1176 } else 1177 ThrowUnexpectedTokenError(Integer, t); 1178 } 1179 // RightBracket 1180 stream.Read(RightBracket); 1181 // !(Colon, Integer) RightBracket 1182 } else if (t->Type() == RightBracket) { 1183 // Nothing to do here... 1184 1185 // Something else... 1186 } else 1187 ThrowUnexpectedTokenError(Colon, Integer, t); 1188 } 1189 Range range(start, end); 1190 if (range.InitCheck() == B_OK) 1191 return range; 1192 else 1193 throw range.GetErr(); 1194 } 1195 1196 DisjList* 1197 Parser::ParsePatternList(Range range) { 1198 PatternList *list = new(nothrow) PatternList(range); 1199 if (!list) 1200 ThrowOutOfMemError(stream.Pos()); 1201 try { 1202 // LeftParen 1203 stream.Read(LeftParen); 1204 // [Flag] Pattern, (Divider, [Flag] Pattern)* 1205 while (true) { 1206 // [Flag] 1207 if (stream.CondRead(CaseInsensitiveFlag)) 1208 list->SetCaseInsensitive(true); 1209 // Pattern 1210 list->Add(ParsePattern()); 1211 // [Divider] 1212 if (!stream.CondRead(Divider)) 1213 break; 1214 } 1215 // RightParen 1216 const Token *t = stream.Get(); 1217 if (t->Type() != RightParen) 1218 throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos()); 1219 } catch (...) { 1220 delete list; 1221 throw; 1222 } 1223 return list; 1224 } 1225 1226 DisjList* 1227 Parser::ParseRPatternList() { 1228 RPatternList *list = new(nothrow) RPatternList(); 1229 if (!list) 1230 ThrowOutOfMemError(stream.Pos()); 1231 try { 1232 // LeftParen 1233 stream.Read(LeftParen); 1234 // [Flag] RPattern, (Divider, [Flag] RPattern)* 1235 while (true) { 1236 // [Flag] 1237 if (stream.CondRead(CaseInsensitiveFlag)) 1238 list->SetCaseInsensitive(true); 1239 // RPattern 1240 list->Add(ParseRPattern()); 1241 // [Divider] 1242 if (!stream.CondRead(Divider)) 1243 break; 1244 } 1245 // RightParen 1246 const Token *t = stream.Get(); 1247 if (t->Type() != RightParen) 1248 throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos()); 1249 } catch (...) { 1250 delete list; 1251 throw; 1252 } 1253 return list; 1254 } 1255 1256 RPattern* 1257 Parser::ParseRPattern() { 1258 // Range 1259 Range range = ParseRange(); 1260 // Pattern 1261 Pattern *pattern = ParsePattern(); 1262 1263 RPattern *result = new(nothrow) RPattern(range, pattern); 1264 if (result) { 1265 if (result->InitCheck() == B_OK) 1266 return result; 1267 else { 1268 Err *err = result->GetErr(); 1269 delete result; 1270 throw err; 1271 } 1272 } else 1273 ThrowOutOfMemError(stream.Pos()); 1274 } 1275 1276 Pattern* 1277 Parser::ParsePattern() { 1278 std::string str; 1279 // String 1280 { 1281 const Token *t = stream.Get(); 1282 if (t->Type() == CharacterString) 1283 str = t->String(); 1284 else 1285 throw new Err("Sniffer pattern error: missing pattern", t->Pos()); 1286 } 1287 // [Ampersand, String] 1288 if (stream.CondRead(Ampersand)) { 1289 // String (i.e. Mask) 1290 const Token *t = stream.Get(); 1291 if (t->Type() == CharacterString) { 1292 Pattern *result = new(nothrow) Pattern(str, t->String()); 1293 if (!result) 1294 ThrowOutOfMemError(t->Pos()); 1295 if (result->InitCheck() == B_OK) { 1296 return result; 1297 } else { 1298 Err *err = result->GetErr(); 1299 delete result; 1300 if (err) { 1301 err->SetPos(t->Pos()); 1302 } 1303 throw err; 1304 } 1305 } else 1306 ThrowUnexpectedTokenError(CharacterString, t); 1307 } else { 1308 // No mask specified. 1309 Pattern *result = new(nothrow) Pattern(str); 1310 if (result) { 1311 if (result->InitCheck() == B_OK) 1312 return result; 1313 else { 1314 Err *err = result->GetErr(); 1315 delete result; 1316 throw err; 1317 } 1318 } else 1319 ThrowOutOfMemError(stream.Pos()); 1320 } 1321 } 1322 1323 void 1324 Parser::ThrowEndOfStreamError() { 1325 throw new Err("Sniffer pattern error: unterminated rule", stream.EndPos()); 1326 } 1327 1328 inline 1329 void 1330 Parser::ThrowOutOfMemError(ssize_t pos) { 1331 if (fOutOfMemErr) 1332 fOutOfMemErr->SetPos(pos); 1333 Err *err = fOutOfMemErr; 1334 fOutOfMemErr = NULL; 1335 throw err; 1336 } 1337 1338 void 1339 Parser::ThrowUnexpectedTokenError(TokenType expected, const Token *found) { 1340 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected) 1341 + ", found " + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str() 1342 , (found ? found->Pos() : stream.EndPos())); 1343 } 1344 1345 void 1346 Parser::ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found) { 1347 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected1) 1348 + " or " + tokenTypeToString(expected2) + ", found " 1349 + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str() 1350 , (found ? found->Pos() : stream.EndPos())); 1351 } 1352 1353 1354 1355