1674e18fbSTyler Dauwalder //---------------------------------------------------------------------- 2*2ca13760SColdfirex // This software is part of the Haiku distribution and is covered 3b6f76ebeSAugustin Cavalier // by the MIT License. 4674e18fbSTyler Dauwalder //--------------------------------------------------------------------- 5674e18fbSTyler Dauwalder /*! 6674e18fbSTyler Dauwalder \file sniffer/Parser.h 7674e18fbSTyler Dauwalder MIME sniffer rule parser declarations 8674e18fbSTyler Dauwalder */ 982b75665STyler Dauwalder #ifndef _SNIFFER_PARSER_H 1082b75665STyler Dauwalder #define _SNIFFER_PARSER_H 11674e18fbSTyler Dauwalder 12674e18fbSTyler Dauwalder #include <SupportDefs.h> 1389ec8a81STyler Dauwalder #include <sniffer/CharStream.h> 14390dce8dSTyler Dauwalder #include <sniffer/Err.h> 154574a75fSTyler Dauwalder #include <sniffer/Range.h> 164574a75fSTyler Dauwalder #include <sniffer/Rule.h> 175da54924STyler Dauwalder #include <List.h> 185da54924STyler Dauwalder #include <string> 194574a75fSTyler Dauwalder #include <vector> 205da54924STyler Dauwalder 21674e18fbSTyler Dauwalder class BString; 22674e18fbSTyler Dauwalder 2360ee71d3STyler Dauwalder //! MIME Sniffer related classes 2409d84e61STyler Dauwalder namespace BPrivate { 2509d84e61STyler Dauwalder namespace Storage { 26674e18fbSTyler Dauwalder namespace Sniffer { 274574a75fSTyler Dauwalder 28674e18fbSTyler Dauwalder class Rule; 2993d145bbSTyler Dauwalder class DisjList; 304574a75fSTyler Dauwalder class RPattern; 314574a75fSTyler Dauwalder class Pattern; 325da54924STyler Dauwalder 335da54924STyler Dauwalder //------------------------------------------------------------------------------ 345da54924STyler Dauwalder // The mighty parsing function ;-) 355da54924STyler Dauwalder //------------------------------------------------------------------------------ 365da54924STyler Dauwalder 37674e18fbSTyler Dauwalder status_t parse(const char *rule, Rule *result, BString *parseError = NULL); 385da54924STyler Dauwalder 395da54924STyler Dauwalder //------------------------------------------------------------------------------ 405da54924STyler Dauwalder // Classes used internally by the parser 415da54924STyler Dauwalder //------------------------------------------------------------------------------ 425da54924STyler Dauwalder 4360ee71d3STyler Dauwalder //! Types of tokens 445da54924STyler Dauwalder typedef enum TokenType { 455da54924STyler Dauwalder EmptyToken, 465da54924STyler Dauwalder LeftParen, 475da54924STyler Dauwalder RightParen, 485da54924STyler Dauwalder LeftBracket, 495da54924STyler Dauwalder RightBracket, 505da54924STyler Dauwalder Colon, 515da54924STyler Dauwalder Divider, 525da54924STyler Dauwalder Ampersand, 5393d145bbSTyler Dauwalder CaseInsensitiveFlag, 545da54924STyler Dauwalder CharacterString, 555da54924STyler Dauwalder Integer, 565da54924STyler Dauwalder FloatingPoint 576eb09230SMichael Lotz } TokenType; 585da54924STyler Dauwalder 5960ee71d3STyler Dauwalder /*! \brief Returns a NULL-terminated string contating the 6060ee71d3STyler Dauwalder name of the given token type 6160ee71d3STyler Dauwalder */ 625da54924STyler Dauwalder const char* tokenTypeToString(TokenType type); 635da54924STyler Dauwalder 6460ee71d3STyler Dauwalder //! Base token class returned by TokenStream 6560ee71d3STyler Dauwalder /*! Each token represents a single chunk of relevant information 6660ee71d3STyler Dauwalder in a given rule. For example, the floating point number "1.2e-35", 6760ee71d3STyler Dauwalder originally represented as a 7-character string, is added to the 6860ee71d3STyler Dauwalder token stream as a single FloatToken object. 6960ee71d3STyler Dauwalder */ 705da54924STyler Dauwalder class Token { 715da54924STyler Dauwalder public: 724574a75fSTyler Dauwalder Token(TokenType type = EmptyToken, const ssize_t pos = -1); 7302fd0582STyler Dauwalder virtual ~Token(); 745da54924STyler Dauwalder TokenType Type() const; 75fe70cd16STyler Dauwalder virtual const std::string& String() const; 765da54924STyler Dauwalder virtual int32 Int() const; 775da54924STyler Dauwalder virtual double Float() const; 784574a75fSTyler Dauwalder ssize_t Pos() const; 79390dce8dSTyler Dauwalder bool operator==(Token &ref) const; 805da54924STyler Dauwalder protected: 815da54924STyler Dauwalder TokenType fType; 824574a75fSTyler Dauwalder ssize_t fPos; 835da54924STyler Dauwalder }; 845da54924STyler Dauwalder 8560ee71d3STyler Dauwalder //! String token class 8660ee71d3STyler Dauwalder /*! Single-quoted strings, double-quoted strings, unquoted strings, and 8760ee71d3STyler Dauwalder hex literals are all converted to StringToken objects by the scanner 8860ee71d3STyler Dauwalder and from then on treated uniformly. 8960ee71d3STyler Dauwalder */ 905da54924STyler Dauwalder class StringToken : public Token { 915da54924STyler Dauwalder public: 92fe70cd16STyler Dauwalder StringToken(const std::string &str, const ssize_t pos); 9302fd0582STyler Dauwalder virtual ~StringToken(); 94fe70cd16STyler Dauwalder virtual const std::string& String() const; 955da54924STyler Dauwalder protected: 96fe70cd16STyler Dauwalder std::string fString; 975da54924STyler Dauwalder }; 985da54924STyler Dauwalder 9960ee71d3STyler Dauwalder //! Integer token class 10060ee71d3STyler Dauwalder /*! Signed or unsigned integer literals are coverted to IntToken objects, 10160ee71d3STyler Dauwalder which may then be treated as either ints or floats (since a priority 10260ee71d3STyler Dauwalder of "1" would be valid, but scanned as an int instead of a float). 10360ee71d3STyler Dauwalder */ 1045da54924STyler Dauwalder class IntToken : public Token { 1055da54924STyler Dauwalder public: 1064574a75fSTyler Dauwalder IntToken(const int32 value, const ssize_t pos); 10702fd0582STyler Dauwalder virtual ~IntToken(); 1085da54924STyler Dauwalder virtual int32 Int() const; 1095da54924STyler Dauwalder virtual double Float() const; 1105da54924STyler Dauwalder protected: 1115da54924STyler Dauwalder int32 fValue; 1125da54924STyler Dauwalder }; 1135da54924STyler Dauwalder 11460ee71d3STyler Dauwalder //! Floating point token class 11560ee71d3STyler Dauwalder /*! Signed or unsigned, extended or non-extended notation floating point 11660ee71d3STyler Dauwalder numbers are converted to FloatToken objects by the scanner. 11760ee71d3STyler Dauwalder */ 1185da54924STyler Dauwalder class FloatToken : public Token { 1195da54924STyler Dauwalder public: 1204574a75fSTyler Dauwalder FloatToken(const double value, const ssize_t pos); 12102fd0582STyler Dauwalder virtual ~FloatToken(); 1225da54924STyler Dauwalder virtual double Float() const; 1235da54924STyler Dauwalder protected: 1245da54924STyler Dauwalder double fValue; 1255da54924STyler Dauwalder }; 1265da54924STyler Dauwalder 12760ee71d3STyler Dauwalder //! Manages a stream of Token objects 12860ee71d3STyler Dauwalder /*! Provides Get() and Unget() operations, some handy shortcut operations (Read() 12960ee71d3STyler Dauwalder and CondRead()), and handles memory management with respect to all the 13060ee71d3STyler Dauwalder Token objects in the stream (i.e. never delete a Token object returned by Get()). 13160ee71d3STyler Dauwalder 13260ee71d3STyler Dauwalder Also, the scanner portion of the parser is implemented in the TokenStream's 13360ee71d3STyler Dauwalder SetTo() function. 13460ee71d3STyler Dauwalder */ 1355da54924STyler Dauwalder class TokenStream { 1365da54924STyler Dauwalder public: 137fe70cd16STyler Dauwalder TokenStream(const std::string &string); 138fe70cd16STyler Dauwalder TokenStream(); 1395da54924STyler Dauwalder ~TokenStream(); 1405da54924STyler Dauwalder 141fe70cd16STyler Dauwalder status_t SetTo(const std::string &string); 1425da54924STyler Dauwalder void Unset(); 1435da54924STyler Dauwalder status_t InitCheck() const; 1445da54924STyler Dauwalder 145390dce8dSTyler Dauwalder const Token* Get(); 146390dce8dSTyler Dauwalder void Unget(); 1475da54924STyler Dauwalder 148390dce8dSTyler Dauwalder void Read(TokenType type); 149390dce8dSTyler Dauwalder bool CondRead(TokenType type); 150390dce8dSTyler Dauwalder 151390dce8dSTyler Dauwalder ssize_t Pos() const; 152390dce8dSTyler Dauwalder ssize_t EndPos() const; 153390dce8dSTyler Dauwalder 154390dce8dSTyler Dauwalder bool IsEmpty() const; 1555da54924STyler Dauwalder 1565da54924STyler Dauwalder private: 1574574a75fSTyler Dauwalder void AddToken(TokenType type, ssize_t pos); 158fe70cd16STyler Dauwalder void AddString(const std::string &str, ssize_t pos); 1594574a75fSTyler Dauwalder void AddInt(const char *str, ssize_t pos); 1604574a75fSTyler Dauwalder void AddFloat(const char *str, ssize_t pos); 1615da54924STyler Dauwalder 162390dce8dSTyler Dauwalder std::vector<Token*> fTokenList; 16302fd0582STyler Dauwalder status_t fCStatus; 164390dce8dSTyler Dauwalder int fPos; 165390dce8dSTyler Dauwalder int fStrLen; 1665da54924STyler Dauwalder 167390dce8dSTyler Dauwalder 1685da54924STyler Dauwalder TokenStream(const TokenStream &ref); 1695da54924STyler Dauwalder TokenStream& operator=(const TokenStream &ref); 1705da54924STyler Dauwalder }; 1715da54924STyler Dauwalder 17260ee71d3STyler Dauwalder //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message. 17393d145bbSTyler Dauwalder /*! A MIME sniffer rule is valid if it is well-formed with respect to the 17493d145bbSTyler Dauwalder following grammar and fulfills some further conditions listed thereafter: 17593d145bbSTyler Dauwalder 17693d145bbSTyler Dauwalder <code> 17793d145bbSTyler Dauwalder Rule ::= LWS Priority LWS ConjList LWS 17893d145bbSTyler Dauwalder ConjList ::= DisjList (LWS DisjList)* 17993d145bbSTyler Dauwalder DisjList ::= "(" LWS PatternList LWS ")" 18093d145bbSTyler Dauwalder | "(" LWS RPatternList LWS ")" 18193d145bbSTyler Dauwalder | Range LWS "(" LWS PatternList LWS ")" 18293d145bbSTyler Dauwalder RPatternList ::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)* 18393d145bbSTyler Dauwalder PatternList ::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)* 18493d145bbSTyler Dauwalder 18593d145bbSTyler Dauwalder RPattern ::= LWS Range LWS Pattern 18693d145bbSTyler Dauwalder Pattern ::= PString [ LWS "&" LWS Mask ] 18793d145bbSTyler Dauwalder Range ::= "[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]" 18893d145bbSTyler Dauwalder 18993d145bbSTyler Dauwalder Priority ::= Float 19093d145bbSTyler Dauwalder Mask ::= PString 19193d145bbSTyler Dauwalder PString ::= HexLiteral | QuotedString | UnquotedString 19293d145bbSTyler Dauwalder 19393d145bbSTyler Dauwalder HexLiteral ::= "0x" HexPair HexPair* 19493d145bbSTyler Dauwalder HexPair ::= HexChar HexChar 19593d145bbSTyler Dauwalder 19693d145bbSTyler Dauwalder QuotedString ::= SingleQuotedString | DoubleQuotedString 19793d145bbSTyler Dauwalder SQuotedString := "'" SQChar+ "'" 19893d145bbSTyler Dauwalder DQuotedString := '"' DQChar+ '"' 19993d145bbSTyler Dauwalder 20093d145bbSTyler Dauwalder UnquotedString ::= EscapedChar UChar* 20193d145bbSTyler Dauwalder EscapedChar ::= OctalEscape | HexEscape | "\" Char 20293d145bbSTyler Dauwalder OctalEscape ::= "\" [[OctHiChar] OctChar] OctChar 20393d145bbSTyler Dauwalder HexEscape ::= "\x" HexPair 20493d145bbSTyler Dauwalder 20593d145bbSTyler Dauwalder Flag ::= "-i" 20693d145bbSTyler Dauwalder 20793d145bbSTyler Dauwalder SDecimal ::= [Sign] Decimal 20893d145bbSTyler Dauwalder Decimal ::= DecChar DecChar* 20993d145bbSTyler Dauwalder Float ::= Fixed [("E" | "e") SDecimal] 21093d145bbSTyler Dauwalder Fixed ::= SDecimal ["." [Decimal]] | [Sign] "." Decimal 21193d145bbSTyler Dauwalder Sign ::= "+" | "-" 21293d145bbSTyler Dauwalder 21393d145bbSTyler Dauwalder PunctuationChar ::= "(" | ")" | "[" | "]" | "|" | "&" | ":" 21493d145bbSTyler Dauwalder OctHiChar ::= "0" | "1" | "2" | "3" 21593d145bbSTyler Dauwalder OctChar ::= OctHiChar | "4" | "5" | "6" | "7" 21693d145bbSTyler Dauwalder DecChar ::= OctChar | "8" | "9" 21793d145bbSTyler Dauwalder HexChar ::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C" 21893d145bbSTyler Dauwalder | "D" | "E" | "F" 21993d145bbSTyler Dauwalder 22093d145bbSTyler Dauwalder Char :: <any character> 22193d145bbSTyler Dauwalder SQChar ::= <Char except "\", "'"> | EscapedChar 22293d145bbSTyler Dauwalder DQChar ::= <Char except "\", '"'> | EscapedChar 22393d145bbSTyler Dauwalder UChar ::= <Char except "\", LWSChar, and PunctuationChar> | EscapedChar 22493d145bbSTyler Dauwalder 22593d145bbSTyler Dauwalder LWS ::= LWSChar* 22693d145bbSTyler Dauwalder LWSChar ::= " " | TAB | LF 22793d145bbSTyler Dauwalder </code> 22893d145bbSTyler Dauwalder 22993d145bbSTyler Dauwalder Conditions: 23093d145bbSTyler Dauwalder - If a mask is specified for a pattern, this mask must have the same 23193d145bbSTyler Dauwalder length as the pattern string. 23293d145bbSTyler Dauwalder - 0.0 <= Priority <= 1.0 23393d145bbSTyler Dauwalder - 0 <= Range begin <= Range end 23493d145bbSTyler Dauwalder 23593d145bbSTyler Dauwalder Notes: 23693d145bbSTyler Dauwalder - If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern 23793d145bbSTyler Dauwalder in a DisjList, case-insensitivity is applied to the entire DisjList. 23893d145bbSTyler Dauwalder 23993d145bbSTyler Dauwalder Examples: 24093d145bbSTyler Dauwalder - 1.0 ('ABCD') 24193d145bbSTyler Dauwalder The file must start with the string "ABCD". The priority of the rule 24293d145bbSTyler Dauwalder is 1.0 (maximal). 24393d145bbSTyler Dauwalder - 0.8 [0:3] ('ABCD' | 'abcd') 24493d145bbSTyler Dauwalder The file must contain the string "ABCD" or "abcd" starting somewhere in 24593d145bbSTyler Dauwalder the first four bytes. The rule priority is 0.8. 24693d145bbSTyler Dauwalder - 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH') 24793d145bbSTyler Dauwalder The file must contain the string "ABCD" or "abcd" starting somewhere in 24893d145bbSTyler Dauwalder the first four bytes or the string "EFGH" at position 13. The rule 24993d145bbSTyler Dauwalder priority is 0.5. 25093d145bbSTyler Dauwalder - 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff) 25193d145bbSTyler Dauwalder The file must contain the string "A.CD" or "ab.d" (whereas "." is an 25293d145bbSTyler Dauwalder arbitrary character) starting somewhere in the first four bytes. The 25393d145bbSTyler Dauwalder rule priority is 0.8. 25493d145bbSTyler Dauwalder - 0.3 [10] ('mnop') ('abc') [20] ('xyz') 25593d145bbSTyler Dauwalder The file must contain the string 'abc' at the beginning of the file, 25693d145bbSTyler Dauwalder the string 'mnop' starting at position 10, and the string 'xyz' 25793d145bbSTyler Dauwalder starting at position 20. The rule priority is 0.3. 25893d145bbSTyler Dauwalder - 200e-3 (-i 'ab') 25993d145bbSTyler Dauwalder The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the 26093d145bbSTyler Dauwalder beginning of the file. The rule priority is 0.2. 26193d145bbSTyler Dauwalder 26293d145bbSTyler Dauwalder Real examples: 26393d145bbSTyler Dauwalder - 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef" 26493d145bbSTyler Dauwalder | [0:32]"#ifdef") 26593d145bbSTyler Dauwalder text/x-source-code 26693d145bbSTyler Dauwalder - 0.70 ("8BPS \000\000\000\000" & 0xffffffff0000ffffffff ) 26793d145bbSTyler Dauwalder image/x-photoshop 26893d145bbSTyler Dauwalder - 0.40 [0:64]( -i "<HTML" | "<HEAD" | "<TITLE" | "<BODY" 26993d145bbSTyler Dauwalder | "<TABLE" | "<!--" | "<META" | "<CENTER") 27093d145bbSTyler Dauwalder text/html 27193d145bbSTyler Dauwalder 27293d145bbSTyler Dauwalder */ 2734574a75fSTyler Dauwalder class Parser { 2744574a75fSTyler Dauwalder public: 2754574a75fSTyler Dauwalder Parser(); 276390dce8dSTyler Dauwalder ~Parser(); 2774574a75fSTyler Dauwalder status_t Parse(const char *rule, Rule *result, BString *parseError = NULL); 2784574a75fSTyler Dauwalder private: 2794574a75fSTyler Dauwalder std::string ErrorMessage(Err *err, const char *rule); 2804574a75fSTyler Dauwalder 281390dce8dSTyler Dauwalder // Things that get done a lot :-) 282390dce8dSTyler Dauwalder void ThrowEndOfStreamError(); 283390dce8dSTyler Dauwalder inline void ThrowOutOfMemError(ssize_t pos); 284390dce8dSTyler Dauwalder void ThrowUnexpectedTokenError(TokenType expected, const Token *found); 285390dce8dSTyler Dauwalder void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found); 286390dce8dSTyler Dauwalder 2874574a75fSTyler Dauwalder // Parsing functions 2884574a75fSTyler Dauwalder void ParseRule(Rule *result); 2894574a75fSTyler Dauwalder double ParsePriority(); 29093d145bbSTyler Dauwalder std::vector<DisjList*>* ParseConjList(); 29193d145bbSTyler Dauwalder DisjList* ParseDisjList(); 2924574a75fSTyler Dauwalder Range ParseRange(); 29393d145bbSTyler Dauwalder DisjList* ParsePatternList(Range range); 29493d145bbSTyler Dauwalder DisjList* ParseRPatternList(); 2954574a75fSTyler Dauwalder RPattern* ParseRPattern(); 2964574a75fSTyler Dauwalder Pattern* ParsePattern(); 2974574a75fSTyler Dauwalder 2984574a75fSTyler Dauwalder TokenStream stream; 299390dce8dSTyler Dauwalder 300390dce8dSTyler Dauwalder Err *fOutOfMemErr; 3014574a75fSTyler Dauwalder }; 3024574a75fSTyler Dauwalder 30309d84e61STyler Dauwalder }; // namespace Sniffer 30409d84e61STyler Dauwalder }; // namespace Storage 30509d84e61STyler Dauwalder }; // namespace BPrivate 306674e18fbSTyler Dauwalder 30782b75665STyler Dauwalder #endif // _SNIFFER_PARSER_H 308674e18fbSTyler Dauwalder 30909d84e61STyler Dauwalder 31009d84e61STyler Dauwalder 31109d84e61STyler Dauwalder 312