1 //---------------------------------------------------------------------- 2 // This software is part of the Haiku distribution and is covered 3 // by the MIT License. 4 //---------------------------------------------------------------------- 5 /*! 6 \file sniffer/Parser.cpp 7 MIME sniffer rule parser implementation 8 */ 9 10 #include <sniffer/Parser.h> 11 #include <sniffer/Pattern.h> 12 #include <sniffer/PatternList.h> 13 #include <sniffer/Range.h> 14 #include <sniffer/RPattern.h> 15 #include <sniffer/RPatternList.h> 16 #include <sniffer/Rule.h> 17 18 #include <new> 19 #include <stdio.h> 20 #include <stdlib.h> // For atol(), atof() 21 #include <string.h> 22 #include <String.h> // BString 23 24 using namespace BPrivate::Storage::Sniffer; 25 26 // Miscellaneous helper functions 27 char escapeChar(char ch); 28 char hexToChar(char hi, char low); 29 char hexToChar(char hex); 30 char octalToChar(char octal); 31 char octalToChar(char hi, char low); 32 char octalToChar(char hi, char mid, char low); 33 bool isHexChar(char ch); 34 bool isWhiteSpace(char ch); 35 bool isOctalChar(char ch); 36 bool isDecimalChar(char ch); 37 bool isPunctuation(char ch); 38 39 //! Parses the given rule. 40 /*! The resulting parsed Rule structure is stored in \c rule, which 41 must be pre-allocated. If parsing fails, a descriptive error message (meant 42 to be viewed in a monospaced font) is placed in the pre-allocated \c BString 43 pointed to by \c parseError (which may be \c NULL if you don't care about 44 the error message). 45 46 \param rule Pointer to a NULL-terminated string containing the sniffer 47 rule to be parsed 48 \param result Pointer to a pre-allocated \c Rule object into which the result 49 of parsing is placed upon success. 50 \param parseError Point to pre-allocated \c BString object into which 51 a descriptive error message is stored upon failure. 52 53 \return 54 - B_OK: Success 55 - B_BAD_MIME_SNIFFER_RULE: Failure 56 */ 57 status_t 58 BPrivate::Storage::Sniffer::parse(const char *rule, Rule *result, BString *parseError) { 59 Parser parser; 60 return parser.Parse(rule, result, parseError); 61 } 62 63 //------------------------------------------------------------------------------ 64 // Token 65 //------------------------------------------------------------------------------ 66 67 Token::Token(TokenType type, const ssize_t pos) 68 : fType(type) 69 , fPos(pos) 70 { 71 // if (type != EmptyToken) 72 // cout << "New Token, fType == " << tokenTypeToString(fType) << endl; 73 } 74 75 Token::~Token() { 76 } 77 78 TokenType 79 Token::Type() const { 80 return fType; 81 } 82 83 const std::string& 84 Token::String() const { 85 throw new Err("Sniffer scanner error: Token::String() called on non-string token", fPos); 86 } 87 88 int32 89 Token::Int() const { 90 throw new Err("Sniffer scanner error: Token::Int() called on non-integer token", fPos); 91 } 92 93 double 94 Token::Float() const { 95 throw new Err("Sniffer scanner error: Token::Float() called on non-float token", fPos); 96 } 97 98 ssize_t 99 Token::Pos() const { 100 return fPos; 101 } 102 103 bool 104 Token::operator==(Token &ref) const { 105 // Compare types, then data if necessary 106 if (Type() == ref.Type()) { 107 switch (Type()) { 108 case CharacterString: 109 // printf(" str1 == '%s'\n", String()); 110 // printf(" str2 == '%s'\n", ref.String()); 111 // printf(" strcmp() == %d\n", strcmp(String(), ref.String())); 112 { 113 return String() == ref.String(); 114 115 /* 116 // strcmp() seems to choke on certain, non-normal ASCII chars 117 // (i.e. chars outside the usual alphabets, but still valid 118 // as far as ASCII is concerned), so we'll just compare the 119 // strings by hand to be safe. 120 const char *str1 = String(); 121 const char *str2 = ref.String(); 122 int len1 = strlen(str1); 123 int len2 = strlen(str2); 124 // printf("len1 == %d\n", len1); 125 // printf("len2 == %d\n", len2); 126 if (len1 == len2) { 127 for (int i = 0; i < len1; i++) { 128 // printf("i == %d, str1[%d] == %x, str2[%d] == %x\n", i, i, str1[i], i, str2[i]); 129 if (str1[i] != str2[i]) 130 return false; 131 } 132 } 133 return true; 134 */ 135 } 136 // return strcmp(String(), ref.String()) == 0; 137 138 case Integer: 139 return Int() == ref.Int(); 140 141 case FloatingPoint: 142 return Float() == ref.Float(); 143 144 default: 145 return true; 146 } 147 } else 148 return false; 149 } 150 151 //------------------------------------------------------------------------------ 152 // StringToken 153 //------------------------------------------------------------------------------ 154 155 StringToken::StringToken(const std::string &str, const ssize_t pos) 156 : Token(CharacterString, pos) 157 , fString(str) 158 { 159 } 160 161 StringToken::~StringToken() { 162 } 163 164 const std::string& 165 StringToken::String() const { 166 return fString; 167 } 168 169 //------------------------------------------------------------------------------ 170 // IntToken 171 //------------------------------------------------------------------------------ 172 173 IntToken::IntToken(const int32 value, const ssize_t pos) 174 : Token(Integer, pos) 175 , fValue(value) 176 { 177 } 178 179 IntToken::~IntToken() { 180 } 181 182 int32 183 IntToken::Int() const { 184 return fValue; 185 } 186 187 double 188 IntToken::Float() const { 189 return (double)fValue; 190 } 191 192 //------------------------------------------------------------------------------ 193 // FloatToken 194 //------------------------------------------------------------------------------ 195 196 FloatToken::FloatToken(const double value, const ssize_t pos) 197 : Token(FloatingPoint, pos) 198 , fValue(value) 199 { 200 } 201 202 FloatToken::~FloatToken() { 203 } 204 205 206 double 207 FloatToken::Float() const { 208 return fValue; 209 } 210 211 //------------------------------------------------------------------------------ 212 // TokenStream 213 //------------------------------------------------------------------------------ 214 215 TokenStream::TokenStream(const std::string &string) 216 : fCStatus(B_NO_INIT) 217 , fPos(-1) 218 , fStrLen(-1) 219 { 220 SetTo(string); 221 } 222 223 TokenStream::TokenStream() 224 : fCStatus(B_NO_INIT) 225 , fPos(-1) 226 , fStrLen(-1) 227 { 228 } 229 230 TokenStream::~TokenStream() { 231 Unset(); 232 } 233 234 status_t 235 TokenStream::SetTo(const std::string &string) { 236 Unset(); 237 fStrLen = string.length(); 238 CharStream stream(string); 239 if (stream.InitCheck() != B_OK) 240 throw new Err("Sniffer scanner error: Unable to intialize character stream", -1); 241 242 typedef enum TokenStreamScannerState { 243 tsssStart, 244 tsssOneSingle, 245 tsssOneDouble, 246 tsssOneZero, 247 tsssZeroX, 248 tsssOneHex, 249 tsssTwoHex, 250 tsssIntOrFloat, 251 tsssFloat, 252 tsssLonelyDecimalPoint, 253 tsssLonelyMinusOrPlus, 254 tsssLonelyFloatExtension, 255 tsssLonelyFloatExtensionWithSign, 256 tsssExtendedFloat, 257 tsssUnquoted, 258 tsssEscape, 259 tsssEscapeX, 260 tsssEscapeOneOctal, 261 tsssEscapeTwoOctal, 262 tsssEscapeOneHex, 263 } TokenStreamScannerState; 264 265 TokenStreamScannerState state = tsssStart; 266 TokenStreamScannerState escapedState = tsssStart; 267 // Used to remember which state to return to from an escape sequence 268 269 std::string charStr = ""; // Used to build up character strings 270 char lastChar = 0; // For two char lookahead 271 char lastLastChar = 0; // For three char lookahead (have I mentioned I hate octal?) 272 bool keepLooping = true; 273 ssize_t startPos = 0; 274 while (keepLooping) { 275 ssize_t pos = stream.Pos(); 276 char ch = stream.Get(); 277 switch (state) { 278 case tsssStart: 279 startPos = pos; 280 switch (ch) { 281 case 0x3: // End-Of-Text 282 if (stream.IsEmpty()) 283 keepLooping = false; 284 else 285 throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos); 286 break; 287 288 case '\t': 289 case '\n': 290 case ' ': 291 // Whitespace, so ignore it. 292 break; 293 294 case '"': 295 charStr = ""; 296 state = tsssOneDouble; 297 break; 298 299 case '\'': 300 charStr = ""; 301 state = tsssOneSingle; 302 break; 303 304 case '+': 305 case '-': 306 charStr = ch; 307 lastChar = ch; 308 state = tsssLonelyMinusOrPlus; 309 break; 310 311 case '.': 312 charStr = ch; 313 state = tsssLonelyDecimalPoint; 314 break; 315 316 case '0': 317 charStr = ch; 318 state = tsssOneZero; 319 break; 320 321 case '1': 322 case '2': 323 case '3': 324 case '4': 325 case '5': 326 case '6': 327 case '7': 328 case '8': 329 case '9': 330 charStr = ch; 331 state = tsssIntOrFloat; 332 break; 333 334 case '&': AddToken(Ampersand, pos); break; 335 case '(': AddToken(LeftParen, pos); break; 336 case ')': AddToken(RightParen, pos); break; 337 case ':': AddToken(Colon, pos); break; 338 case '[': AddToken(LeftBracket, pos); break; 339 340 case '\\': 341 charStr = ""; // Clear our string 342 state = tsssEscape; 343 escapedState = tsssUnquoted; // Unquoted strings begin with an escaped character 344 break; 345 346 case ']': AddToken(RightBracket, pos); break; 347 case '|': AddToken(Divider, pos); break; 348 349 default: 350 throw new Err(std::string("Sniffer pattern error: invalid character '") + ch + "'", pos); 351 } 352 break; 353 354 case tsssOneSingle: 355 switch (ch) { 356 case '\\': 357 escapedState = state; // Save our state 358 state = tsssEscape; // Handle the escape sequence 359 break; 360 case '\'': 361 AddString(charStr, startPos); 362 state = tsssStart; 363 break; 364 case 0x3: 365 if (stream.IsEmpty()) 366 throw new Err(std::string("Sniffer pattern error: unterminated single-quoted string"), pos); 367 else 368 charStr += ch; 369 break; 370 default: 371 charStr += ch; 372 break; 373 } 374 break; 375 376 case tsssOneDouble: 377 switch (ch) { 378 case '\\': 379 escapedState = state; // Save our state 380 state = tsssEscape; // Handle the escape sequence 381 break; 382 case '"': 383 AddString(charStr, startPos); 384 state = tsssStart; 385 break; 386 case 0x3: 387 if (stream.IsEmpty()) 388 throw new Err(std::string("Sniffer pattern error: unterminated double-quoted string"), pos); 389 else 390 charStr += ch; 391 break; 392 default: 393 charStr += ch; 394 break; 395 } 396 break; 397 398 case tsssOneZero: 399 if (ch == 'x') { 400 charStr = ""; // Reinit, since we actually have a hex string 401 state = tsssZeroX; 402 } else if ('0' <= ch && ch <= '9') { 403 charStr += ch; 404 state = tsssIntOrFloat; 405 } else if (ch == '.') { 406 charStr += ch; 407 state = tsssFloat; 408 } else if (ch == 'e' || ch == 'E') { 409 charStr += ch; 410 state = tsssLonelyFloatExtension; 411 } else { 412 // Terminate the number 413 AddInt(charStr.c_str(), startPos); 414 415 // Push the last char back on and try again 416 stream.Unget(); 417 state = tsssStart; 418 } 419 break; 420 421 case tsssZeroX: 422 if (isHexChar(ch)) { 423 lastChar = ch; 424 state = tsssOneHex; 425 } else 426 throw new Err(std::string("Sniffer pattern error: incomplete hex code"), pos); 427 break; 428 429 case tsssOneHex: 430 if (isHexChar(ch)) { 431 try { 432 charStr += hexToChar(lastChar, ch); 433 } catch (Err *err) { 434 if (err) 435 err->SetPos(pos); 436 throw err; 437 } 438 state = tsssTwoHex; 439 } else 440 throw new Err(std::string("Sniffer pattern error: bad hex literal"), pos); // Same as R5 441 break; 442 443 case tsssTwoHex: 444 if (isHexChar(ch)) { 445 lastChar = ch; 446 state = tsssOneHex; 447 } else { 448 AddString(charStr, startPos); 449 stream.Unget(); // So punctuation gets handled properly 450 state = tsssStart; 451 } 452 break; 453 454 case tsssIntOrFloat: 455 if (isDecimalChar(ch)) 456 charStr += ch; 457 else if (ch == '.') { 458 charStr += ch; 459 state = tsssFloat; 460 } else if (ch == 'e' || ch == 'E') { 461 charStr += ch; 462 state = tsssLonelyFloatExtension; 463 } else { 464 // Terminate the number 465 AddInt(charStr.c_str(), startPos); 466 467 // Push the last char back on and try again 468 stream.Unget(); 469 state = tsssStart; 470 } 471 break; 472 473 case tsssFloat: 474 if (isDecimalChar(ch)) 475 charStr += ch; 476 else if (ch == 'e' || ch == 'E') { 477 charStr += ch; 478 state = tsssLonelyFloatExtension; 479 } else { 480 // Terminate the number 481 AddFloat(charStr.c_str(), startPos); 482 483 // Push the last char back on and try again 484 stream.Unget(); 485 state = tsssStart; 486 } 487 break; 488 489 case tsssLonelyDecimalPoint: 490 if (isDecimalChar(ch)) { 491 charStr += ch; 492 state = tsssFloat; 493 } else 494 throw new Err(std::string("Sniffer pattern error: incomplete floating point number"), pos); 495 break; 496 497 case tsssLonelyMinusOrPlus: 498 if (isDecimalChar(ch)) { 499 charStr += ch; 500 state = tsssIntOrFloat; 501 } else if (ch == '.') { 502 charStr += ch; 503 state = tsssLonelyDecimalPoint; 504 } else if (ch == 'i' && lastChar == '-') { 505 AddToken(CaseInsensitiveFlag, startPos); 506 state = tsssStart; 507 } else 508 throw new Err(std::string("Sniffer pattern error: incomplete signed number or invalid flag"), pos); 509 break; 510 511 case tsssLonelyFloatExtension: 512 if (ch == '+' || ch == '-') { 513 charStr += ch; 514 state = tsssLonelyFloatExtensionWithSign; 515 } else if (isDecimalChar(ch)) { 516 charStr += ch; 517 state = tsssExtendedFloat; 518 } else 519 throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos); 520 break; 521 522 case tsssLonelyFloatExtensionWithSign: 523 if (isDecimalChar(ch)) { 524 charStr += ch; 525 state = tsssExtendedFloat; 526 } else 527 throw new Err(std::string("Sniffer pattern error: incomplete extended-notation floating point number"), pos); 528 break; 529 530 case tsssExtendedFloat: 531 if (isDecimalChar(ch)) { 532 charStr += ch; 533 state = tsssExtendedFloat; 534 } else { 535 // Terminate the number 536 AddFloat(charStr.c_str(), startPos); 537 538 // Push the last char back on and try again 539 stream.Unget(); 540 state = tsssStart; 541 } 542 break; 543 544 case tsssUnquoted: 545 if (ch == '\\') { 546 escapedState = state; // Save our state 547 state = tsssEscape; // Handle the escape sequence 548 } else if (isWhiteSpace(ch) || isPunctuation(ch)) { 549 AddString(charStr, startPos); 550 stream.Unget(); // In case it's punctuation, let tsssStart handle it 551 state = tsssStart; 552 } else if (ch == 0x3 && stream.IsEmpty()) { 553 AddString(charStr, startPos); 554 keepLooping = false; 555 } else { 556 charStr += ch; 557 } 558 break; 559 560 case tsssEscape: 561 if (isOctalChar(ch)) { 562 lastChar = ch; 563 state = tsssEscapeOneOctal; 564 } else if (ch == 'x') { 565 state = tsssEscapeX; 566 } else { 567 // Check for a true end-of-text marker 568 if (ch == 0x3 && stream.IsEmpty()) 569 throw new Err(std::string("Sniffer pattern error: incomplete escape sequence"), pos); 570 else { 571 charStr += escapeChar(ch); 572 state = escapedState; // Return to the state we were in before the escape 573 } 574 } 575 break; 576 577 case tsssEscapeX: 578 if (isHexChar(ch)) { 579 lastChar = ch; 580 state = tsssEscapeOneHex; 581 } else 582 throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos); 583 break; 584 585 case tsssEscapeOneOctal: 586 if (isOctalChar(ch)) { 587 lastLastChar = lastChar; 588 lastChar = ch; 589 state = tsssEscapeTwoOctal; 590 } else { 591 // First handle the octal 592 try { 593 charStr += octalToChar(lastChar); 594 } catch (Err *err) { 595 if (err) 596 err->SetPos(startPos); 597 throw err; 598 } 599 600 // Push the new char back on and let the state we 601 // were in when the escape sequence was hit handle it. 602 stream.Unget(); 603 state = escapedState; 604 } 605 break; 606 607 case tsssEscapeTwoOctal: 608 if (isOctalChar(ch)) { 609 try { 610 charStr += octalToChar(lastLastChar, lastChar, ch); 611 } catch (Err *err) { 612 if (err) 613 err->SetPos(startPos); 614 throw err; 615 } 616 state = escapedState; 617 } else { 618 // First handle the octal 619 try { 620 charStr += octalToChar(lastLastChar, lastChar); 621 } catch (Err *err) { 622 if (err) 623 err->SetPos(startPos); 624 throw err; 625 } 626 627 // Push the new char back on and let the state we 628 // were in when the escape sequence was hit handle it. 629 stream.Unget(); 630 state = escapedState; 631 } 632 break; 633 634 case tsssEscapeOneHex: 635 if (isHexChar(ch)) { 636 try { 637 charStr += hexToChar(lastChar, ch); 638 } catch (Err *err) { 639 if (err) 640 err->SetPos(pos); 641 throw err; 642 } 643 state = escapedState; 644 } else 645 throw new Err(std::string("Sniffer pattern error: incomplete escaped hex code"), pos); 646 break; 647 648 } 649 } 650 if (state == tsssStart) { 651 fCStatus = B_OK; 652 fPos = 0; 653 } else { 654 throw new Err("Sniffer pattern error: unterminated rule", stream.Pos()); 655 } 656 657 return fCStatus; 658 } 659 660 void 661 TokenStream::Unset() { 662 std::vector<Token*>::iterator i; 663 for (i = fTokenList.begin(); i != fTokenList.end(); i++) 664 delete *i; 665 fTokenList.clear(); 666 fCStatus = B_NO_INIT; 667 fStrLen = -1; 668 } 669 670 status_t 671 TokenStream::InitCheck() const { 672 return fCStatus; 673 } 674 675 //! Returns a pointer to the next token in the stream. 676 /*! The TokenStream object retains owner ship of the Token object returned by Get(). 677 If Get() is called at the end of the stream, a pointer to a Err object is thrown. 678 */ 679 const Token* 680 TokenStream::Get() { 681 if (fCStatus != B_OK) 682 throw new Err("Sniffer parser error: TokenStream::Get() called on uninitialized TokenStream object", -1); 683 if (fPos < (ssize_t)fTokenList.size()) 684 return fTokenList[fPos++]; 685 else { 686 throw new Err("Sniffer pattern error: unterminated rule", EndPos()); 687 // fPos++; // Increment fPos to keep Unget()s consistent 688 // return NULL; // Return NULL to signal end of list 689 } 690 } 691 692 //! Places token returned by the most recent call to Get() back on the head of the stream. 693 /*! If Unget() is called at the beginning of the stream, a pointer to a Err object is thrown. 694 */ 695 void 696 TokenStream::Unget() { 697 if (fCStatus != B_OK) 698 throw new Err("Sniffer parser error: TokenStream::Unget() called on uninitialized TokenStream object", -1); 699 if (fPos > 0) 700 fPos--; 701 else 702 throw new Err("Sniffer parser error: TokenStream::Unget() called at beginning of token stream", -1); 703 } 704 705 706 /*! \brief Reads the next token in the stream and verifies it is of the given type, 707 throwing a pointer to a Err object if it is not. 708 */ 709 void 710 TokenStream::Read(TokenType type) { 711 const Token *t = Get(); 712 if (t->Type() != type) { 713 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(type) 714 + ", found " + tokenTypeToString(t->Type())).c_str(), t->Pos()); 715 } 716 } 717 718 //! Conditionally reads the next token in the stream. 719 /*! CondRead() peeks at the next token in the stream. If it is of the given type, the 720 token is removed from the stream and \c true is returned. If it is not of the 721 given type, false is returned and the token remains at the head of the stream. 722 */ 723 bool 724 TokenStream::CondRead(TokenType type) { 725 const Token *t = Get(); 726 if (t->Type() == type) { 727 return true; 728 } else { 729 Unget(); 730 return false; 731 } 732 } 733 734 ssize_t 735 TokenStream::Pos() const { 736 return fPos < (ssize_t)fTokenList.size() ? fTokenList[fPos]->Pos() : fStrLen; 737 } 738 739 ssize_t 740 TokenStream::EndPos() const { 741 return fStrLen; 742 } 743 744 bool 745 TokenStream::IsEmpty() const { 746 return fCStatus != B_OK || fPos >= (ssize_t)fTokenList.size(); 747 } 748 749 void 750 TokenStream::AddToken(TokenType type, ssize_t pos) { 751 Token *token = new Token(type, pos); 752 fTokenList.push_back(token); 753 } 754 755 void 756 TokenStream::AddString(const std::string &str, ssize_t pos) { 757 Token *token = new StringToken(str, pos); 758 fTokenList.push_back(token); 759 } 760 761 void 762 TokenStream::AddInt(const char *str, ssize_t pos) { 763 // Convert the string to an int 764 int32 value = atol(str); 765 Token *token = new IntToken(value, pos); 766 fTokenList.push_back(token); 767 } 768 769 void 770 TokenStream::AddFloat(const char *str, ssize_t pos) { 771 // Convert the string to a float 772 double value = atof(str); 773 Token *token = new FloatToken(value, pos); 774 fTokenList.push_back(token); 775 } 776 777 //------------------------------------------------------------------------------ 778 // Helper functions 779 //------------------------------------------------------------------------------ 780 781 char 782 escapeChar(char ch) { 783 // I've manually handled all the escape sequences I could come 784 // up with, and for anything else I just return the character 785 // passed in. Hex escapes are handled elsewhere, so \x just 786 // returns 'x'. Similarly, octals are handled elsewhere, so \0 787 // through \9 just return '0' through '9'. 788 switch (ch) { 789 case 'a': 790 return '\a'; 791 case 'b': 792 return '\b'; 793 case 'f': 794 return '\f'; 795 case 'n': 796 return '\n'; 797 case 'r': 798 return '\r'; 799 case 't': 800 return '\t'; 801 case 'v': 802 return '\v'; 803 default: 804 return ch; 805 } 806 } 807 808 // Converts 0x|hi|low| to a single char 809 char 810 hexToChar(char hi, char low) { 811 return (hexToChar(hi) << 4) | hexToChar(low); 812 } 813 814 // Converts 0x|ch| to a single char 815 char 816 hexToChar(char hex) { 817 if ('0' <= hex && hex <= '9') 818 return hex-'0'; 819 else if ('a' <= hex && hex <= 'f') 820 return hex-'a'+10; 821 else if ('A' <= hex && hex <= 'F') 822 return hex-'A'+10; 823 else 824 throw new Err(std::string("Sniffer parser error: invalid hex digit '") + hex + "' passed to hexToChar()", -1); 825 } 826 827 char 828 octalToChar(char octal) { 829 return octalToChar('0', '0', octal); 830 } 831 832 char 833 octalToChar(char hi, char low) { 834 return octalToChar('0', hi, low); 835 } 836 837 char 838 octalToChar(char hi, char mid, char low) { 839 if (isOctalChar(hi) && isOctalChar(mid) && isOctalChar(low)) { 840 // Check for octals >= decimal 256 841 if ((hi-'0') <= 3) 842 return ((hi-'0') << 6) | ((mid-'0') << 3) | (low-'0'); 843 else 844 throw new Err("Sniffer pattern error: invalid octal literal (octals must be between octal 0 and octal 377 inclusive)", -1); 845 } else 846 throw new Err(std::string("Sniffer parser error: invalid octal digit passed to hexToChar()"), -1); 847 } 848 849 bool 850 isHexChar(char ch) { 851 return ('0' <= ch && ch <= '9') 852 || ('a' <= ch && ch <= 'f') 853 || ('A' <= ch && ch <= 'F'); 854 } 855 856 bool 857 isWhiteSpace(char ch) { 858 return ch == ' ' || ch == '\n' || ch == '\t'; 859 } 860 861 bool 862 isOctalChar(char ch) { 863 return ('0' <= ch && ch <= '7'); 864 } 865 866 bool 867 isDecimalChar(char ch) { 868 return ('0' <= ch && ch <= '9'); 869 } 870 871 bool 872 isPunctuation(char ch) { 873 switch (ch) { 874 case '&': 875 case '(': 876 case ')': 877 case ':': 878 case '[': 879 case ']': 880 case '|': 881 return true; 882 default: 883 return false; 884 } 885 } 886 887 const char* 888 BPrivate::Storage::Sniffer::tokenTypeToString(TokenType type) { 889 switch (type) { 890 case LeftParen: 891 return "LeftParen"; 892 break; 893 case RightParen: 894 return "RightParen"; 895 break; 896 case LeftBracket: 897 return "LeftBracket"; 898 break; 899 case RightBracket: 900 return "RightBracket"; 901 break; 902 case Colon: 903 return "Colon"; 904 break; 905 case Divider: 906 return "Divider"; 907 break; 908 case Ampersand: 909 return "Ampersand"; 910 break; 911 case CaseInsensitiveFlag: 912 return "CaseInsensitiveFlag"; 913 break; 914 case CharacterString: 915 return "CharacterString"; 916 break; 917 case Integer: 918 return "Integer"; 919 break; 920 case FloatingPoint: 921 return "FloatingPoint"; 922 break; 923 default: 924 return "UNKNOWN TOKEN TYPE"; 925 break; 926 } 927 } 928 929 //------------------------------------------------------------------------------ 930 // Parser 931 //------------------------------------------------------------------------------ 932 933 Parser::Parser() 934 : fOutOfMemErr(new(std::nothrow) Err("Sniffer parser error: out of memory", -1)) 935 { 936 } 937 938 Parser::~Parser() { 939 delete fOutOfMemErr; 940 } 941 942 status_t 943 Parser::Parse(const char *rule, Rule *result, BString *parseError) { 944 try { 945 if (!rule) 946 throw new Err("Sniffer pattern error: NULL pattern", -1); 947 if (!result) 948 return B_BAD_VALUE; 949 if (stream.SetTo(rule) != B_OK) 950 throw new Err("Sniffer parser error: Unable to intialize token stream", -1); 951 952 ParseRule(result); 953 954 return B_OK; 955 956 } catch (Err *err) { 957 // cout << "Caught error" << endl; 958 if (parseError) 959 parseError->SetTo(ErrorMessage(err, rule).c_str()); 960 delete err; 961 return rule ? (status_t)B_BAD_MIME_SNIFFER_RULE : (status_t)B_BAD_VALUE; 962 } 963 } 964 965 std::string 966 Parser::ErrorMessage(Err *err, const char *rule) { 967 const char* msg = (err && err->Msg()) 968 ? err->Msg() 969 : "Sniffer parser error: Unexpected error with no supplied error message"; 970 ssize_t pos = err && (err->Pos() >= 0) ? err->Pos() : 0; 971 std::string str = std::string(rule ? rule : "") + "\n"; 972 for (int i = 0; i < pos; i++) 973 str += " "; 974 str += "^ "; 975 str += msg; 976 return str; 977 } 978 979 void 980 Parser::ParseRule(Rule *result) { 981 if (!result) 982 throw new Err("Sniffer parser error: NULL Rule object passed to Parser::ParseRule()", -1); 983 984 // Priority 985 double priority = ParsePriority(); 986 // Conjunction List 987 std::vector<DisjList*>* list = ParseConjList(); 988 989 result->SetTo(priority, list); 990 } 991 992 double 993 Parser::ParsePriority() { 994 const Token *t = stream.Get(); 995 if (t->Type() == FloatingPoint || t->Type() == Integer) { 996 double result = t->Float(); 997 if (0.0 <= result && result <= 1.0) 998 return result; 999 else { 1000 // cout << "(priority == " << result << ")" << endl; 1001 throw new Err("Sniffer pattern error: invalid priority", t->Pos()); 1002 } 1003 } else 1004 throw new Err("Sniffer pattern error: match level expected", t->Pos()); // Same as R5 1005 } 1006 1007 std::vector<DisjList*>* 1008 Parser::ParseConjList() { 1009 std::vector<DisjList*> *list = new(std::nothrow) std::vector<DisjList*>; 1010 if (!list) 1011 ThrowOutOfMemError(stream.Pos()); 1012 try { 1013 // DisjList+ 1014 int count = 0; 1015 while (true) { 1016 DisjList* expr = ParseDisjList(); 1017 if (!expr) 1018 break; 1019 else { 1020 list->push_back(expr); 1021 count++; 1022 } 1023 } 1024 if (count == 0) 1025 throw new Err("Sniffer pattern error: missing expression", -1); 1026 } catch (...) { 1027 delete list; 1028 throw; 1029 } 1030 return list; 1031 } 1032 1033 DisjList* 1034 Parser::ParseDisjList() { 1035 // If we've run out of tokens right now, it's okay, but 1036 // we need to let ParseConjList() know what's up 1037 if (stream.IsEmpty()) 1038 return NULL; 1039 1040 // Peek ahead, then let the appropriate Parse*List() 1041 // functions handle things 1042 const Token *t1 = stream.Get(); 1043 1044 // PatternList | RangeList 1045 if (t1->Type() == LeftParen) { 1046 const Token *t2 = stream.Get(); 1047 // Skip the case-insensitive flag, if there is one 1048 const Token *tokenOfInterest = (t2->Type() == CaseInsensitiveFlag) ? stream.Get() : t2; 1049 if (t2 != tokenOfInterest) 1050 stream.Unget(); // We called Get() three times 1051 stream.Unget(); 1052 stream.Unget(); 1053 // RangeList 1054 if (tokenOfInterest->Type() == LeftBracket) { 1055 return ParseRPatternList(); 1056 // PatternList 1057 } else { 1058 return ParsePatternList(Range(0,0)); 1059 } 1060 // Range, PatternList 1061 } else if (t1->Type() == LeftBracket) { 1062 stream.Unget(); 1063 return ParsePatternList(ParseRange()); 1064 } else { 1065 throw new Err("Sniffer pattern error: missing pattern", t1->Pos()); // Same as R5 1066 } 1067 1068 // PatternList 1069 // RangeList 1070 // Range + PatternList 1071 } 1072 1073 Range 1074 Parser::ParseRange() { 1075 int32 start, end; 1076 // LeftBracket 1077 stream.Read(LeftBracket); 1078 // Integer 1079 { 1080 const Token *t = stream.Get(); 1081 if (t->Type() == Integer) { 1082 start = t->Int(); 1083 end = start; // In case we aren't given an explicit end 1084 } else 1085 throw new Err("Sniffer pattern error: pattern offset expected", t->Pos()); 1086 } 1087 // [Colon, Integer] RightBracket 1088 { 1089 const Token *t = stream.Get(); 1090 // Colon, Integer, RightBracket 1091 if (t->Type() == Colon) { 1092 // Integer 1093 { 1094 const Token *t = stream.Get(); 1095 if (t->Type() == Integer) { 1096 end = t->Int(); 1097 } else 1098 ThrowUnexpectedTokenError(Integer, t); 1099 } 1100 // RightBracket 1101 stream.Read(RightBracket); 1102 // !(Colon, Integer) RightBracket 1103 } else if (t->Type() == RightBracket) { 1104 // Nothing to do here... 1105 1106 // Something else... 1107 } else 1108 ThrowUnexpectedTokenError(Colon, Integer, t); 1109 } 1110 Range range(start, end); 1111 if (range.InitCheck() == B_OK) 1112 return range; 1113 else 1114 throw range.GetErr(); 1115 } 1116 1117 DisjList* 1118 Parser::ParsePatternList(Range range) { 1119 PatternList *list = new(std::nothrow) PatternList(range); 1120 if (!list) 1121 ThrowOutOfMemError(stream.Pos()); 1122 try { 1123 // LeftParen 1124 stream.Read(LeftParen); 1125 // [Flag] Pattern, (Divider, [Flag] Pattern)* 1126 while (true) { 1127 // [Flag] 1128 if (stream.CondRead(CaseInsensitiveFlag)) 1129 list->SetCaseInsensitive(true); 1130 // Pattern 1131 list->Add(ParsePattern()); 1132 // [Divider] 1133 if (!stream.CondRead(Divider)) 1134 break; 1135 } 1136 // RightParen 1137 const Token *t = stream.Get(); 1138 if (t->Type() != RightParen) 1139 throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos()); 1140 } catch (...) { 1141 delete list; 1142 throw; 1143 } 1144 return list; 1145 } 1146 1147 DisjList* 1148 Parser::ParseRPatternList() { 1149 RPatternList *list = new(std::nothrow) RPatternList(); 1150 if (!list) 1151 ThrowOutOfMemError(stream.Pos()); 1152 try { 1153 // LeftParen 1154 stream.Read(LeftParen); 1155 // [Flag] RPattern, (Divider, [Flag] RPattern)* 1156 while (true) { 1157 // [Flag] 1158 if (stream.CondRead(CaseInsensitiveFlag)) 1159 list->SetCaseInsensitive(true); 1160 // RPattern 1161 list->Add(ParseRPattern()); 1162 // [Divider] 1163 if (!stream.CondRead(Divider)) 1164 break; 1165 } 1166 // RightParen 1167 const Token *t = stream.Get(); 1168 if (t->Type() != RightParen) 1169 throw new Err("Sniffer pattern error: expecting '|', ')', or possibly '&'", t->Pos()); 1170 } catch (...) { 1171 delete list; 1172 throw; 1173 } 1174 return list; 1175 } 1176 1177 RPattern* 1178 Parser::ParseRPattern() { 1179 // Range 1180 Range range = ParseRange(); 1181 // Pattern 1182 Pattern *pattern = ParsePattern(); 1183 1184 RPattern *result = new(std::nothrow) RPattern(range, pattern); 1185 if (result) { 1186 if (result->InitCheck() == B_OK) 1187 return result; 1188 else { 1189 Err *err = result->GetErr(); 1190 delete result; 1191 throw err; 1192 } 1193 } else 1194 ThrowOutOfMemError(stream.Pos()); 1195 return NULL; 1196 } 1197 1198 Pattern* 1199 Parser::ParsePattern() { 1200 std::string str; 1201 // String 1202 { 1203 const Token *t = stream.Get(); 1204 if (t->Type() == CharacterString) 1205 str = t->String(); 1206 else 1207 throw new Err("Sniffer pattern error: missing pattern", t->Pos()); 1208 } 1209 // [Ampersand, String] 1210 if (stream.CondRead(Ampersand)) { 1211 // String (i.e. Mask) 1212 const Token *t = stream.Get(); 1213 if (t->Type() == CharacterString) { 1214 Pattern *result = new(std::nothrow) Pattern(str, t->String()); 1215 if (!result) 1216 ThrowOutOfMemError(t->Pos()); 1217 if (result->InitCheck() == B_OK) { 1218 return result; 1219 } else { 1220 Err *err = result->GetErr(); 1221 delete result; 1222 if (err) { 1223 err->SetPos(t->Pos()); 1224 } 1225 throw err; 1226 } 1227 } else 1228 ThrowUnexpectedTokenError(CharacterString, t); 1229 } else { 1230 // No mask specified. 1231 Pattern *result = new(std::nothrow) Pattern(str); 1232 if (result) { 1233 if (result->InitCheck() == B_OK) 1234 return result; 1235 else { 1236 Err *err = result->GetErr(); 1237 delete result; 1238 throw err; 1239 } 1240 } else 1241 ThrowOutOfMemError(stream.Pos()); 1242 } 1243 return NULL; 1244 } 1245 1246 void 1247 Parser::ThrowEndOfStreamError() { 1248 throw new Err("Sniffer pattern error: unterminated rule", stream.EndPos()); 1249 } 1250 1251 inline 1252 void 1253 Parser::ThrowOutOfMemError(ssize_t pos) { 1254 if (fOutOfMemErr) 1255 fOutOfMemErr->SetPos(pos); 1256 Err *err = fOutOfMemErr; 1257 fOutOfMemErr = NULL; 1258 throw err; 1259 } 1260 1261 void 1262 Parser::ThrowUnexpectedTokenError(TokenType expected, const Token *found) { 1263 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected) 1264 + ", found " + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str() 1265 , (found ? found->Pos() : stream.EndPos())); 1266 } 1267 1268 void 1269 Parser::ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found) { 1270 throw new Err((std::string("Sniffer pattern error: expected ") + tokenTypeToString(expected1) 1271 + " or " + tokenTypeToString(expected2) + ", found " 1272 + (found ? tokenTypeToString(found->Type()) : "NULL token")).c_str() 1273 , (found ? found->Pos() : stream.EndPos())); 1274 } 1275 1276 1277 1278