1674e18fbSTyler Dauwalder //---------------------------------------------------------------------- 2674e18fbSTyler Dauwalder // This software is part of the OpenBeOS distribution and is covered 3674e18fbSTyler Dauwalder // by the OpenBeOS license. 4674e18fbSTyler Dauwalder //--------------------------------------------------------------------- 5674e18fbSTyler Dauwalder /*! 6674e18fbSTyler Dauwalder \file sniffer/Parser.h 7674e18fbSTyler Dauwalder MIME sniffer rule parser declarations 8674e18fbSTyler Dauwalder */ 9674e18fbSTyler Dauwalder #ifndef _sk_sniffer_parser_h_ 10674e18fbSTyler Dauwalder #define _sk_sniffer_parser_h_ 11674e18fbSTyler Dauwalder 12674e18fbSTyler Dauwalder #include <SupportDefs.h> 13390dce8dSTyler Dauwalder #include <sniffer/Err.h> 144574a75fSTyler Dauwalder #include <sniffer/Range.h> 154574a75fSTyler Dauwalder #include <sniffer/Rule.h> 165da54924STyler Dauwalder #include <List.h> 175da54924STyler Dauwalder #include <string> 184574a75fSTyler Dauwalder #include <vector> 195da54924STyler Dauwalder 20674e18fbSTyler Dauwalder class BString; 21674e18fbSTyler Dauwalder 2260ee71d3STyler Dauwalder //! MIME Sniffer related classes 23674e18fbSTyler Dauwalder namespace Sniffer { 244574a75fSTyler Dauwalder 25674e18fbSTyler Dauwalder class Rule; 26*93d145bbSTyler Dauwalder class DisjList; 274574a75fSTyler Dauwalder class RPattern; 284574a75fSTyler Dauwalder class Pattern; 295da54924STyler Dauwalder 305da54924STyler Dauwalder //------------------------------------------------------------------------------ 315da54924STyler Dauwalder // The mighty parsing function ;-) 325da54924STyler Dauwalder //------------------------------------------------------------------------------ 335da54924STyler Dauwalder 34674e18fbSTyler Dauwalder status_t parse(const char *rule, Rule *result, BString *parseError = NULL); 355da54924STyler Dauwalder 365da54924STyler Dauwalder //------------------------------------------------------------------------------ 375da54924STyler Dauwalder // Classes used internally by the parser 385da54924STyler Dauwalder //------------------------------------------------------------------------------ 395da54924STyler Dauwalder 4060ee71d3STyler Dauwalder //! Manages a stream of characters 4160ee71d3STyler Dauwalder /*! CharStream is used by the scanner portion of the parser, which is implemented 4260ee71d3STyler Dauwalder in TokenStream::SetTo(). 4360ee71d3STyler Dauwalder */ 445da54924STyler Dauwalder class CharStream { 455da54924STyler Dauwalder public: 46fe70cd16STyler Dauwalder CharStream(const std::string &string); 47fe70cd16STyler Dauwalder CharStream(); 485da54924STyler Dauwalder ~CharStream(); 495da54924STyler Dauwalder 50fe70cd16STyler Dauwalder status_t SetTo(const std::string &string); 515da54924STyler Dauwalder void Unset(); 525da54924STyler Dauwalder status_t InitCheck() const; 535da54924STyler Dauwalder bool IsEmpty() const; 544574a75fSTyler Dauwalder ssize_t Pos() const; 55fe70cd16STyler Dauwalder const std::string& String() const; 565da54924STyler Dauwalder 575da54924STyler Dauwalder char Get(); 585da54924STyler Dauwalder void Unget(); 595da54924STyler Dauwalder 605da54924STyler Dauwalder private: 61fe70cd16STyler Dauwalder std::string fString; 624574a75fSTyler Dauwalder ssize_t fPos; 63fe70cd16STyler Dauwalder // ssize_t fLen; 645da54924STyler Dauwalder status_t fCStatus; 655da54924STyler Dauwalder 665da54924STyler Dauwalder CharStream(const CharStream &ref); 675da54924STyler Dauwalder CharStream& operator=(const CharStream &ref); 685da54924STyler Dauwalder }; 695da54924STyler Dauwalder 7060ee71d3STyler Dauwalder //! Types of tokens 715da54924STyler Dauwalder typedef enum TokenType { 725da54924STyler Dauwalder EmptyToken, 735da54924STyler Dauwalder LeftParen, 745da54924STyler Dauwalder RightParen, 755da54924STyler Dauwalder LeftBracket, 765da54924STyler Dauwalder RightBracket, 775da54924STyler Dauwalder Colon, 785da54924STyler Dauwalder Divider, 795da54924STyler Dauwalder Ampersand, 80*93d145bbSTyler Dauwalder CaseInsensitiveFlag, 815da54924STyler Dauwalder CharacterString, 825da54924STyler Dauwalder Integer, 835da54924STyler Dauwalder FloatingPoint 845da54924STyler Dauwalder }; 855da54924STyler Dauwalder 8660ee71d3STyler Dauwalder /*! \brief Returns a NULL-terminated string contating the 8760ee71d3STyler Dauwalder name of the given token type 8860ee71d3STyler Dauwalder */ 895da54924STyler Dauwalder const char* tokenTypeToString(TokenType type); 905da54924STyler Dauwalder 9160ee71d3STyler Dauwalder //! Base token class returned by TokenStream 9260ee71d3STyler Dauwalder /*! Each token represents a single chunk of relevant information 9360ee71d3STyler Dauwalder in a given rule. For example, the floating point number "1.2e-35", 9460ee71d3STyler Dauwalder originally represented as a 7-character string, is added to the 9560ee71d3STyler Dauwalder token stream as a single FloatToken object. 9660ee71d3STyler Dauwalder */ 975da54924STyler Dauwalder class Token { 985da54924STyler Dauwalder public: 994574a75fSTyler Dauwalder Token(TokenType type = EmptyToken, const ssize_t pos = -1); 1005da54924STyler Dauwalder TokenType Type() const; 101fe70cd16STyler Dauwalder virtual const std::string& String() const; 1025da54924STyler Dauwalder virtual int32 Int() const; 1035da54924STyler Dauwalder virtual double Float() const; 1044574a75fSTyler Dauwalder ssize_t Pos() const; 105390dce8dSTyler Dauwalder bool operator==(Token &ref) const; 1065da54924STyler Dauwalder protected: 1075da54924STyler Dauwalder TokenType fType; 1084574a75fSTyler Dauwalder ssize_t fPos; 1095da54924STyler Dauwalder }; 1105da54924STyler Dauwalder 11160ee71d3STyler Dauwalder //! String token class 11260ee71d3STyler Dauwalder /*! Single-quoted strings, double-quoted strings, unquoted strings, and 11360ee71d3STyler Dauwalder hex literals are all converted to StringToken objects by the scanner 11460ee71d3STyler Dauwalder and from then on treated uniformly. 11560ee71d3STyler Dauwalder */ 1165da54924STyler Dauwalder class StringToken : public Token { 1175da54924STyler Dauwalder public: 118fe70cd16STyler Dauwalder StringToken(const std::string &str, const ssize_t pos); 119fe70cd16STyler Dauwalder virtual const std::string& String() const; 1205da54924STyler Dauwalder protected: 121fe70cd16STyler Dauwalder std::string fString; 1225da54924STyler Dauwalder }; 1235da54924STyler Dauwalder 12460ee71d3STyler Dauwalder //! Integer token class 12560ee71d3STyler Dauwalder /*! Signed or unsigned integer literals are coverted to IntToken objects, 12660ee71d3STyler Dauwalder which may then be treated as either ints or floats (since a priority 12760ee71d3STyler Dauwalder of "1" would be valid, but scanned as an int instead of a float). 12860ee71d3STyler Dauwalder */ 1295da54924STyler Dauwalder class IntToken : public Token { 1305da54924STyler Dauwalder public: 1314574a75fSTyler Dauwalder IntToken(const int32 value, const ssize_t pos); 1325da54924STyler Dauwalder virtual int32 Int() const; 1335da54924STyler Dauwalder virtual double Float() const; 1345da54924STyler Dauwalder protected: 1355da54924STyler Dauwalder int32 fValue; 1365da54924STyler Dauwalder }; 1375da54924STyler Dauwalder 13860ee71d3STyler Dauwalder //! Floating point token class 13960ee71d3STyler Dauwalder /*! Signed or unsigned, extended or non-extended notation floating point 14060ee71d3STyler Dauwalder numbers are converted to FloatToken objects by the scanner. 14160ee71d3STyler Dauwalder */ 1425da54924STyler Dauwalder class FloatToken : public Token { 1435da54924STyler Dauwalder public: 1444574a75fSTyler Dauwalder FloatToken(const double value, const ssize_t pos); 1455da54924STyler Dauwalder virtual double Float() const; 1465da54924STyler Dauwalder protected: 1475da54924STyler Dauwalder double fValue; 1485da54924STyler Dauwalder }; 1495da54924STyler Dauwalder 15060ee71d3STyler Dauwalder //! Manages a stream of Token objects 15160ee71d3STyler Dauwalder /*! Provides Get() and Unget() operations, some handy shortcut operations (Read() 15260ee71d3STyler Dauwalder and CondRead()), and handles memory management with respect to all the 15360ee71d3STyler Dauwalder Token objects in the stream (i.e. never delete a Token object returned by Get()). 15460ee71d3STyler Dauwalder 15560ee71d3STyler Dauwalder Also, the scanner portion of the parser is implemented in the TokenStream's 15660ee71d3STyler Dauwalder SetTo() function. 15760ee71d3STyler Dauwalder */ 1585da54924STyler Dauwalder class TokenStream { 1595da54924STyler Dauwalder public: 160fe70cd16STyler Dauwalder TokenStream(const std::string &string); 161fe70cd16STyler Dauwalder TokenStream(); 1625da54924STyler Dauwalder ~TokenStream(); 1635da54924STyler Dauwalder 164fe70cd16STyler Dauwalder status_t SetTo(const std::string &string); 1655da54924STyler Dauwalder void Unset(); 1665da54924STyler Dauwalder status_t InitCheck() const; 1675da54924STyler Dauwalder 168390dce8dSTyler Dauwalder const Token* Get(); 169390dce8dSTyler Dauwalder void Unget(); 1705da54924STyler Dauwalder 171390dce8dSTyler Dauwalder void Read(TokenType type); 172390dce8dSTyler Dauwalder bool CondRead(TokenType type); 173390dce8dSTyler Dauwalder 174390dce8dSTyler Dauwalder ssize_t Pos() const; 175390dce8dSTyler Dauwalder ssize_t EndPos() const; 176390dce8dSTyler Dauwalder 177390dce8dSTyler Dauwalder bool IsEmpty() const; 1785da54924STyler Dauwalder 1795da54924STyler Dauwalder private: 1804574a75fSTyler Dauwalder void AddToken(TokenType type, ssize_t pos); 181fe70cd16STyler Dauwalder void AddString(const std::string &str, ssize_t pos); 1824574a75fSTyler Dauwalder void AddInt(const char *str, ssize_t pos); 1834574a75fSTyler Dauwalder void AddFloat(const char *str, ssize_t pos); 1845da54924STyler Dauwalder 185390dce8dSTyler Dauwalder std::vector<Token*> fTokenList; 186390dce8dSTyler Dauwalder int fPos; 187390dce8dSTyler Dauwalder int fStrLen; 1885da54924STyler Dauwalder status_t fCStatus; 1895da54924STyler Dauwalder 190390dce8dSTyler Dauwalder 1915da54924STyler Dauwalder TokenStream(const TokenStream &ref); 1925da54924STyler Dauwalder TokenStream& operator=(const TokenStream &ref); 1935da54924STyler Dauwalder }; 1945da54924STyler Dauwalder 19560ee71d3STyler Dauwalder //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message. 196*93d145bbSTyler Dauwalder /*! A MIME sniffer rule is valid if it is well-formed with respect to the 197*93d145bbSTyler Dauwalder following grammar and fulfills some further conditions listed thereafter: 198*93d145bbSTyler Dauwalder 199*93d145bbSTyler Dauwalder <code> 200*93d145bbSTyler Dauwalder Rule ::= LWS Priority LWS ConjList LWS 201*93d145bbSTyler Dauwalder ConjList ::= DisjList (LWS DisjList)* 202*93d145bbSTyler Dauwalder DisjList ::= "(" LWS PatternList LWS ")" 203*93d145bbSTyler Dauwalder | "(" LWS RPatternList LWS ")" 204*93d145bbSTyler Dauwalder | Range LWS "(" LWS PatternList LWS ")" 205*93d145bbSTyler Dauwalder RPatternList ::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)* 206*93d145bbSTyler Dauwalder PatternList ::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)* 207*93d145bbSTyler Dauwalder 208*93d145bbSTyler Dauwalder RPattern ::= LWS Range LWS Pattern 209*93d145bbSTyler Dauwalder Pattern ::= PString [ LWS "&" LWS Mask ] 210*93d145bbSTyler Dauwalder Range ::= "[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]" 211*93d145bbSTyler Dauwalder 212*93d145bbSTyler Dauwalder Priority ::= Float 213*93d145bbSTyler Dauwalder Mask ::= PString 214*93d145bbSTyler Dauwalder PString ::= HexLiteral | QuotedString | UnquotedString 215*93d145bbSTyler Dauwalder 216*93d145bbSTyler Dauwalder HexLiteral ::= "0x" HexPair HexPair* 217*93d145bbSTyler Dauwalder HexPair ::= HexChar HexChar 218*93d145bbSTyler Dauwalder 219*93d145bbSTyler Dauwalder QuotedString ::= SingleQuotedString | DoubleQuotedString 220*93d145bbSTyler Dauwalder SQuotedString := "'" SQChar+ "'" 221*93d145bbSTyler Dauwalder DQuotedString := '"' DQChar+ '"' 222*93d145bbSTyler Dauwalder 223*93d145bbSTyler Dauwalder UnquotedString ::= EscapedChar UChar* 224*93d145bbSTyler Dauwalder EscapedChar ::= OctalEscape | HexEscape | "\" Char 225*93d145bbSTyler Dauwalder OctalEscape ::= "\" [[OctHiChar] OctChar] OctChar 226*93d145bbSTyler Dauwalder HexEscape ::= "\x" HexPair 227*93d145bbSTyler Dauwalder 228*93d145bbSTyler Dauwalder Flag ::= "-i" 229*93d145bbSTyler Dauwalder 230*93d145bbSTyler Dauwalder SDecimal ::= [Sign] Decimal 231*93d145bbSTyler Dauwalder Decimal ::= DecChar DecChar* 232*93d145bbSTyler Dauwalder Float ::= Fixed [("E" | "e") SDecimal] 233*93d145bbSTyler Dauwalder Fixed ::= SDecimal ["." [Decimal]] | [Sign] "." Decimal 234*93d145bbSTyler Dauwalder Sign ::= "+" | "-" 235*93d145bbSTyler Dauwalder 236*93d145bbSTyler Dauwalder PunctuationChar ::= "(" | ")" | "[" | "]" | "|" | "&" | ":" 237*93d145bbSTyler Dauwalder OctHiChar ::= "0" | "1" | "2" | "3" 238*93d145bbSTyler Dauwalder OctChar ::= OctHiChar | "4" | "5" | "6" | "7" 239*93d145bbSTyler Dauwalder DecChar ::= OctChar | "8" | "9" 240*93d145bbSTyler Dauwalder HexChar ::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C" 241*93d145bbSTyler Dauwalder | "D" | "E" | "F" 242*93d145bbSTyler Dauwalder 243*93d145bbSTyler Dauwalder Char :: <any character> 244*93d145bbSTyler Dauwalder SQChar ::= <Char except "\", "'"> | EscapedChar 245*93d145bbSTyler Dauwalder DQChar ::= <Char except "\", '"'> | EscapedChar 246*93d145bbSTyler Dauwalder UChar ::= <Char except "\", LWSChar, and PunctuationChar> | EscapedChar 247*93d145bbSTyler Dauwalder 248*93d145bbSTyler Dauwalder LWS ::= LWSChar* 249*93d145bbSTyler Dauwalder LWSChar ::= " " | TAB | LF 250*93d145bbSTyler Dauwalder </code> 251*93d145bbSTyler Dauwalder 252*93d145bbSTyler Dauwalder Conditions: 253*93d145bbSTyler Dauwalder - If a mask is specified for a pattern, this mask must have the same 254*93d145bbSTyler Dauwalder length as the pattern string. 255*93d145bbSTyler Dauwalder - 0.0 <= Priority <= 1.0 256*93d145bbSTyler Dauwalder - 0 <= Range begin <= Range end 257*93d145bbSTyler Dauwalder 258*93d145bbSTyler Dauwalder Notes: 259*93d145bbSTyler Dauwalder - If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern 260*93d145bbSTyler Dauwalder in a DisjList, case-insensitivity is applied to the entire DisjList. 261*93d145bbSTyler Dauwalder 262*93d145bbSTyler Dauwalder Examples: 263*93d145bbSTyler Dauwalder - 1.0 ('ABCD') 264*93d145bbSTyler Dauwalder The file must start with the string "ABCD". The priority of the rule 265*93d145bbSTyler Dauwalder is 1.0 (maximal). 266*93d145bbSTyler Dauwalder - 0.8 [0:3] ('ABCD' | 'abcd') 267*93d145bbSTyler Dauwalder The file must contain the string "ABCD" or "abcd" starting somewhere in 268*93d145bbSTyler Dauwalder the first four bytes. The rule priority is 0.8. 269*93d145bbSTyler Dauwalder - 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH') 270*93d145bbSTyler Dauwalder The file must contain the string "ABCD" or "abcd" starting somewhere in 271*93d145bbSTyler Dauwalder the first four bytes or the string "EFGH" at position 13. The rule 272*93d145bbSTyler Dauwalder priority is 0.5. 273*93d145bbSTyler Dauwalder - 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff) 274*93d145bbSTyler Dauwalder The file must contain the string "A.CD" or "ab.d" (whereas "." is an 275*93d145bbSTyler Dauwalder arbitrary character) starting somewhere in the first four bytes. The 276*93d145bbSTyler Dauwalder rule priority is 0.8. 277*93d145bbSTyler Dauwalder - 0.3 [10] ('mnop') ('abc') [20] ('xyz') 278*93d145bbSTyler Dauwalder The file must contain the string 'abc' at the beginning of the file, 279*93d145bbSTyler Dauwalder the string 'mnop' starting at position 10, and the string 'xyz' 280*93d145bbSTyler Dauwalder starting at position 20. The rule priority is 0.3. 281*93d145bbSTyler Dauwalder - 200e-3 (-i 'ab') 282*93d145bbSTyler Dauwalder The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the 283*93d145bbSTyler Dauwalder beginning of the file. The rule priority is 0.2. 284*93d145bbSTyler Dauwalder 285*93d145bbSTyler Dauwalder Real examples: 286*93d145bbSTyler Dauwalder - 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef" 287*93d145bbSTyler Dauwalder | [0:32]"#ifdef") 288*93d145bbSTyler Dauwalder text/x-source-code 289*93d145bbSTyler Dauwalder - 0.70 ("8BPS \000\000\000\000" & 0xffffffff0000ffffffff ) 290*93d145bbSTyler Dauwalder image/x-photoshop 291*93d145bbSTyler Dauwalder - 0.40 [0:64]( -i "<HTML" | "<HEAD" | "<TITLE" | "<BODY" 292*93d145bbSTyler Dauwalder | "<TABLE" | "<!--" | "<META" | "<CENTER") 293*93d145bbSTyler Dauwalder text/html 294*93d145bbSTyler Dauwalder 295*93d145bbSTyler Dauwalder */ 2964574a75fSTyler Dauwalder class Parser { 2974574a75fSTyler Dauwalder public: 2984574a75fSTyler Dauwalder Parser(); 299390dce8dSTyler Dauwalder ~Parser(); 3004574a75fSTyler Dauwalder status_t Parse(const char *rule, Rule *result, BString *parseError = NULL); 3014574a75fSTyler Dauwalder private: 3024574a75fSTyler Dauwalder std::string ErrorMessage(Err *err, const char *rule); 3034574a75fSTyler Dauwalder 304390dce8dSTyler Dauwalder // Things that get done a lot :-) 305390dce8dSTyler Dauwalder void ThrowEndOfStreamError(); 306390dce8dSTyler Dauwalder inline void ThrowOutOfMemError(ssize_t pos); 307390dce8dSTyler Dauwalder void ThrowUnexpectedTokenError(TokenType expected, const Token *found); 308390dce8dSTyler Dauwalder void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found); 309390dce8dSTyler Dauwalder 3104574a75fSTyler Dauwalder // Parsing functions 3114574a75fSTyler Dauwalder void ParseRule(Rule *result); 3124574a75fSTyler Dauwalder double ParsePriority(); 313*93d145bbSTyler Dauwalder std::vector<DisjList*>* ParseConjList(); 314*93d145bbSTyler Dauwalder DisjList* ParseDisjList(); 3154574a75fSTyler Dauwalder Range ParseRange(); 316*93d145bbSTyler Dauwalder DisjList* ParsePatternList(Range range); 317*93d145bbSTyler Dauwalder DisjList* ParseRPatternList(); 3184574a75fSTyler Dauwalder RPattern* ParseRPattern(); 3194574a75fSTyler Dauwalder Pattern* ParsePattern(); 3204574a75fSTyler Dauwalder 3214574a75fSTyler Dauwalder TokenStream stream; 322390dce8dSTyler Dauwalder 323390dce8dSTyler Dauwalder Err *fOutOfMemErr; 3244574a75fSTyler Dauwalder }; 3254574a75fSTyler Dauwalder 3265da54924STyler Dauwalder } // namespace Sniffer 3275da54924STyler Dauwalder 328674e18fbSTyler Dauwalder 329674e18fbSTyler Dauwalder #endif // _sk_sniffer_parser_h_ 330674e18fbSTyler Dauwalder 331