//----------------------------------------------------------------------
//  This software is part of the Haiku distribution and is covered
//  by the MIT License.
//---------------------------------------------------------------------
/*!
	\file sniffer/Parser.h
	MIME sniffer rule parser declarations
*/
#ifndef _SNIFFER_PARSER_H
#define _SNIFFER_PARSER_H

#include <SupportDefs.h>
#include <sniffer/CharStream.h>
#include <sniffer/Err.h>
#include <sniffer/Range.h>
#include <sniffer/Rule.h>
#include <List.h>
#include <string>
#include <vector>

class BString;

//! MIME Sniffer related classes
namespace BPrivate {
namespace Storage {
namespace Sniffer {

// Forward declarations of the types that appear in the parser's signatures
// below (see Parser::ParseDisjList(), Parser::ParseRPattern(), ...).
class Rule;
class DisjList;
class RPattern;
class Pattern;

//------------------------------------------------------------------------------
// The mighty parsing function ;-)
//------------------------------------------------------------------------------

/*! \brief Parses the MIME sniffer rule given in \a rule.

	\param rule C string holding the text of the rule to be parsed.
	\param result Rule object that is to receive the parsed rule.
	\param parseError Optional BString for a description of a parse failure;
		   defaults to \c NULL.
	\return A status_t; the concrete codes are determined by the
			implementation, which is not part of this header.

	NOTE(review): presumably a convenience wrapper around Parser::Parse(),
	which has the same parameter list -- confirm in Parser.cpp.
*/
status_t parse(const char *rule, Rule *result, BString *parseError = NULL);

//------------------------------------------------------------------------------
// Classes used internally by the parser
//------------------------------------------------------------------------------

//! Types of tokens
/*! EmptyToken is the default type of a default-constructed Token.

	NOTE(review): the remaining enumerators appear to correspond to the
	punctuation characters, the "-i" flag, and the string/number literals of
	the rule grammar documented on Parser; the actual mapping is done by the
	scanner (TokenStream::SetTo()), which is not visible here -- confirm there.
*/
typedef enum TokenType {
	EmptyToken,
	LeftParen,
	RightParen,
	LeftBracket,
	RightBracket,
	Colon,
	Divider,
	Ampersand,
	CaseInsensitiveFlag,
	CharacterString,
	Integer,
	FloatingPoint
} TokenType;

/*! \brief Returns a NULL-terminated string containing the
	name of the given token type
*/
const char* tokenTypeToString(TokenType type);

//! Base token class returned by TokenStream
/*! Each token represents a single chunk of relevant information
	in a given rule.
	For example, the floating point number "1.2e-35",
	originally represented as a 7-character string, is added to the
	token stream as a single FloatToken object.

	The value accessors String(), Int() and Float() are virtual; the
	subclasses below override the ones that make sense for their payload
	(StringToken: String(); IntToken: Int() and Float(); FloatToken: Float()).
	NOTE(review): what the base-class versions do when called on a token
	without such a payload is defined in Parser.cpp -- confirm there.
*/
class Token {
public:
	// By default constructs an EmptyToken with position -1.
	Token(TokenType type = EmptyToken, const ssize_t pos = -1);
	virtual ~Token();
	// Type tag this token was constructed with.
	TokenType Type() const;
	virtual const std::string& String() const;
	virtual int32 Int() const;
	virtual double Float() const;
	// Position this token was constructed with (presumably its offset in the
	// rule string -- confirm against the scanner).
	ssize_t Pos() const;
	// NOTE(review): takes a non-const Token&, so a const Token (e.g. one
	// obtained through the const Token* returned by TokenStream::Get())
	// cannot be used as the right-hand operand. Changing this requires
	// changing the out-of-line definition as well.
	bool operator==(Token &ref) const;
protected:
	TokenType fType;
	ssize_t fPos;
};

//! String token class
/*! Single-quoted strings, double-quoted strings, unquoted strings, and
	hex literals are all converted to StringToken objects by the scanner
	and from then on treated uniformly.
*/
class StringToken : public Token {
public:
	StringToken(const std::string &str, const ssize_t pos);
	virtual ~StringToken();
	// Overrides Token::String().
	virtual const std::string& String() const;
protected:
	// String payload of the token (stored by value).
	std::string fString;
};

//! Integer token class
/*! Signed or unsigned integer literals are converted to IntToken objects,
	which may then be treated as either ints or floats (since a priority
	of "1" would be valid, but scanned as an int instead of a float).
*/
class IntToken : public Token {
public:
	IntToken(const int32 value, const ssize_t pos);
	virtual ~IntToken();
	// Both accessors are overridden so that an integer literal can also be
	// read as a floating point value (see the class description).
	virtual int32 Int() const;
	virtual double Float() const;
protected:
	int32 fValue;
};

//! Floating point token class
/*! Signed or unsigned, extended or non-extended notation floating point
	numbers are converted to FloatToken objects by the scanner.
*/
class FloatToken : public Token {
public:
	FloatToken(const double value, const ssize_t pos);
	virtual ~FloatToken();
	// Overrides Token::Float(); Int() is deliberately not overridden here.
	virtual double Float() const;
protected:
	double fValue;
};

//! Manages a stream of Token objects
/*!
	Provides Get() and Unget() operations, some handy shortcut operations (Read()
	and CondRead()), and handles memory management with respect to all the
	Token objects in the stream (i.e. never delete a Token object returned by Get()).

	Also, the scanner portion of the parser is implemented in the TokenStream's
	SetTo() function.
*/
class TokenStream {
public:
	// Two ways to construct: directly from a rule string, or empty (to be
	// initialized later through SetTo()).
	TokenStream(const std::string &string);
	TokenStream();
	~TokenStream();

	// SetTo() runs the scanner over the given string (see class description).
	// NOTE(review): Unset() presumably releases the current tokens and
	// InitCheck() presumably reports the stored fCStatus -- confirm in
	// Parser.cpp.
	status_t SetTo(const std::string &string);
	void Unset();
	status_t InitCheck() const;

	// The returned Token remains owned by the stream; callers must not
	// delete it.
	const Token* Get();
	void Unget();

	// Shortcut operations built on top of Get()/Unget().
	// NOTE(review): judging by the signatures, Read() expects a token of the
	// given type (reporting failure by some means other than its return
	// value), while CondRead() reports through its bool result whether such
	// a token was read -- confirm the exact semantics in Parser.cpp.
	void Read(TokenType type);
	bool CondRead(TokenType type);

	ssize_t Pos() const;
	ssize_t EndPos() const;

	bool IsEmpty() const;

private:
	// Helpers for appending tokens of the various kinds; note that AddInt()
	// and AddFloat() receive the literal as text, not as a parsed number.
	void AddToken(TokenType type, ssize_t pos);
	void AddString(const std::string &str, ssize_t pos);
	void AddInt(const char *str, ssize_t pos);
	void AddFloat(const char *str, ssize_t pos);

	// Token objects held (and, per the class description, owned) by the stream.
	std::vector<Token*> fTokenList;
	status_t fCStatus;
	// NOTE(review): these are plain ints although Pos() and EndPos() return
	// ssize_t -- verify that the conversion is intended.
	int fPos;
	int fStrLen;


	// Copy construction and assignment are declared private, so code outside
	// the class cannot copy a TokenStream (which holds raw Token pointers).
	TokenStream(const TokenStream &ref);
	TokenStream& operator=(const TokenStream &ref);
};

//! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
/*!
	A MIME sniffer rule is valid if it is well-formed with respect to the
	following grammar and fulfills some further conditions listed thereafter:

	<code>
	Rule			::= LWS Priority LWS ConjList LWS
	ConjList		::= DisjList (LWS DisjList)*
	DisjList		::= "(" LWS PatternList LWS ")"
						| "(" LWS RPatternList LWS ")"
						| Range LWS "(" LWS PatternList LWS ")"
	RPatternList	::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
	PatternList		::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*

	RPattern		::= LWS Range LWS Pattern
	Pattern			::= PString [ LWS "&" LWS Mask ]
	Range			::=	"[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"

	Priority		::= Float
	Mask			::= PString
	PString			::= HexLiteral | QuotedString | UnquotedString

	HexLiteral		::= "0x" HexPair HexPair*
	HexPair			::= HexChar HexChar

	QuotedString	::= SingleQuotedString | DoubleQuotedString
	SingleQuotedString	::= "'" SQChar+ "'"
	DoubleQuotedString	::= '"' DQChar+ '"'

	UnquotedString	::= EscapedChar UChar*
	EscapedChar		::= OctalEscape | HexEscape | "\" Char
	OctalEscape		::= "\" [[OctHiChar] OctChar] OctChar
	HexEscape		::= "\x" HexPair

	Flag			::= "-i"

	SDecimal		::= [Sign] Decimal
	Decimal			::= DecChar DecChar*
	Float			::= Fixed [("E" | "e") SDecimal]
	Fixed			::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
	Sign			::= "+" | "-"

	PunctuationChar	::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
	OctHiChar		::= "0" | "1" | "2" | "3"
	OctChar			::= OctHiChar | "4" | "5" | "6" | "7"
	DecChar			::= OctChar | "8" | "9"
	HexChar			::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
						| "D" | "E" | "F"

	Char			::= <any character>
	SQChar			::= <Char except "\", "'"> | EscapedChar
	DQChar			::= <Char except "\", '"'> | EscapedChar
	UChar			::= <Char except "\", LWSChar, and PunctuationChar> | EscapedChar

	LWS				::= LWSChar*
	LWSChar			::= " " | TAB | LF
	</code>

	Conditions:
	- If a mask is specified for a pattern, this mask must have the same
	  length as the pattern string.
	- 0.0 <= Priority <= 1.0
	- 0 <= Range begin <= Range end

	Notes:
	- If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
	  in a DisjList, case-insensitivity is applied to the entire DisjList.

	Examples:
	- 1.0 ('ABCD')
	  The file must start with the string "ABCD". The priority of the rule
	  is 1.0 (maximal).
	- 0.8 [0:3] ('ABCD' | 'abcd')
	  The file must contain the string "ABCD" or "abcd" starting somewhere in
	  the first four bytes. The rule priority is 0.8.
	- 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
	  The file must contain the string "ABCD" or "abcd" starting somewhere in
	  the first four bytes or the string "EFGH" at position 13. The rule
	  priority is 0.5.
	- 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
	  The file must contain the string "A.CD" or "ab.d" (whereas "." is an
	  arbitrary character) starting somewhere in the first four bytes. The
	  rule priority is 0.8.
	- 0.3 [10] ('mnop') ('abc') [20] ('xyz')
	  The file must contain the string 'abc' at the beginning of the file,
	  the string 'mnop' starting at position 10, and the string 'xyz'
	  starting at position 20. The rule priority is 0.3.
	- 200e-3 (-i 'ab')
	  The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
	  beginning of the file. The rule priority is 0.2.

	Real examples:
	- 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
	        | [0:32]"#ifdef")
	  text/x-source-code
	- 0.70 ("8BPS  \000\000\000\000" & 0xffffffff0000ffffffff )
	  image/x-photoshop
	- 0.40 [0:64]( -i "<HTML" | "<HEAD" | "<TITLE" | "<BODY"
			| "<TABLE" | "<!--" | "<META" | "<CENTER")
	  text/html

*/
class Parser {
public:
	Parser();
	~Parser();
	// Entry point: parses \a rule into \a result. \a parseError is optional
	// (defaults to NULL) and is meant for the descriptive error message
	// mentioned in the class description.
	status_t Parse(const char *rule, Rule *result, BString *parseError = NULL);
private:
	// Builds a message string from an Err object and the rule text.
	std::string ErrorMessage(Err *err, const char *rule);

	// Things that get done a lot :-)
	void ThrowEndOfStreamError();
	// NOTE(review): declared inline but not defined in this header; the
	// definition must be visible wherever it is called (it is private, so
	// presumably only inside Parser.cpp -- confirm).
	inline void ThrowOutOfMemError(ssize_t pos);
	// Overloads for one or two acceptable token types.
	void ThrowUnexpectedTokenError(TokenType expected, const Token *found);
	void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found);

	// Parsing functions
	// Each is named after a production of the grammar given in the class
	// description (Rule, Priority, ConjList, DisjList, Range, PatternList,
	// RPatternList, RPattern, Pattern).
	// NOTE(review): several of these return raw pointers; who owns and
	// frees the returned objects is decided in Parser.cpp -- confirm there.
	void ParseRule(Rule *result);
	double ParsePriority();
	std::vector<DisjList*>* ParseConjList();
	DisjList* ParseDisjList();
	Range ParseRange();
	DisjList* ParsePatternList(Range range);
	DisjList* ParseRPatternList();
	RPattern* ParseRPattern();
	Pattern* ParsePattern();

	// Token stream the parsing functions operate on. (Unlike the other data
	// members in this header it carries no 'f' prefix.)
	TokenStream stream;

	// NOTE(review): presumably an Err object kept ready for
	// ThrowOutOfMemError() -- confirm in Parser.cpp.
	Err *fOutOfMemErr;
};

};	// namespace Sniffer
};	// namespace Storage
};	// namespace BPrivate

#endif	// _SNIFFER_PARSER_H