xref: /haiku/headers/private/storage/sniffer/Parser.h (revision c237c4ce593ee823d9867fd997e51e4c447f5623)
1 //----------------------------------------------------------------------
2 //  This software is part of the Haiku distribution and is covered
3 //  by the MIT License.
4 //---------------------------------------------------------------------
5 /*!
6 	\file sniffer/Parser.h
7 	MIME sniffer rule parser declarations
8 */
9 #ifndef _SNIFFER_PARSER_H
10 #define _SNIFFER_PARSER_H
11 
12 #include <SupportDefs.h>
13 #include <sniffer/CharStream.h>
14 #include <sniffer/Err.h>
15 #include <sniffer/Range.h>
16 #include <sniffer/Rule.h>
17 #include <List.h>
18 #include <string>
19 #include <vector>
20 
21 class BString;
22 
23 //! MIME Sniffer related classes
24 namespace BPrivate {
25 namespace Storage {
26 namespace Sniffer {
27 
28 class Rule;
29 class DisjList;
30 class RPattern;
31 class Pattern;
32 
33 //------------------------------------------------------------------------------
34 // The mighty parsing function ;-)
35 //------------------------------------------------------------------------------
36 
37 status_t parse(const char *rule, Rule *result, BString *parseError = NULL);
38 
39 //------------------------------------------------------------------------------
40 // Classes used internally by the parser
41 //------------------------------------------------------------------------------
42 
43 //! Types of tokens
// The scanner that hands out these types is not part of this header, so the
// grammar symbols given below are inferred from the enumerator names and the
// rule grammar documented for the Parser class -- confirm against the
// implementation.
enum TokenType {
	EmptyToken = 0,			// default type of a Token constructed without arguments
	LeftParen,				// presumably "("
	RightParen,				// presumably ")"
	LeftBracket,			// presumably "["
	RightBracket,			// presumably "]"
	Colon,					// presumably ":"
	Divider,				// presumably "|"
	Ampersand,				// presumably "&"
	CaseInsensitiveFlag,	// presumably "-i"
	CharacterString,		// string value (see StringToken)
	Integer,				// integer value (see IntToken)
	FloatingPoint			// floating point value (see FloatToken)
};
58 
59 /*! \brief Returns a NULL-terminated string contating the
60 		   name of the given token type
61 */
62 const char* tokenTypeToString(TokenType type);
63 
64 //! Base token class returned by TokenStream
65 /*! Each token represents a single chunk of relevant information
66     in a given rule. For example, the floating point number "1.2e-35",
67     originally represented as a 7-character string, is added to the
68     token stream as a single FloatToken object.
69 */
70 class Token {
71 public:
72 	Token(TokenType type = EmptyToken, const ssize_t pos = -1);
73 	virtual ~Token();
74 	TokenType Type() const;
75 	virtual const std::string& String() const;
76 	virtual int32 Int() const;
77 	virtual double Float() const;
78 	ssize_t Pos() const;
79 	bool operator==(Token &ref) const;
80 protected:
81 	TokenType fType;
82 	ssize_t fPos;
83 };
84 
85 //! String token class
86 /*! Single-quoted strings, double-quoted strings, unquoted strings, and
87 	hex literals are all converted to StringToken objects by the scanner
88 	and from then on treated uniformly.
89 */
90 class StringToken : public Token {
91 public:
92 	StringToken(const std::string &str, const ssize_t pos);
93 	virtual ~StringToken();
94 	virtual const std::string& String() const;
95 protected:
96 	std::string fString;
97 };
98 
//! Integer token class
/*! Signed or unsigned integer literals are converted to IntToken objects,
    which may then be treated as either ints or floats (since a priority
    of "1" would be valid, but scanned as an int instead of a float).
*/
104 class IntToken : public Token {
105 public:
106 	IntToken(const int32 value, const ssize_t pos);
107 	virtual ~IntToken();
108 	virtual int32 Int() const;
109 	virtual double Float() const;
110 protected:
111 	int32 fValue;
112 };
113 
114 //! Floating point token class
115 /*! Signed or unsigned, extended or non-extended notation floating point
116     numbers are converted to FloatToken objects by the scanner.
117 */
118 class FloatToken : public Token {
119 public:
120 	FloatToken(const double value, const ssize_t pos);
121 	virtual ~FloatToken();
122 	virtual double Float() const;
123 protected:
124 	double fValue;
125 };
126 
127 //! Manages a stream of Token objects
128 /*! Provides Get() and Unget() operations, some handy shortcut operations (Read()
129     and CondRead()), and handles memory management with respect to all the
130     Token objects in the stream (i.e. never delete a Token object returned by Get()).
131 
132     Also, the scanner portion of the parser is implemented in the TokenStream's
133     SetTo() function.
134 */
135 class TokenStream {
136 public:
137 	TokenStream(const std::string &string);
138 	TokenStream();
139 	~TokenStream();
140 
141 	status_t SetTo(const std::string &string);
142 	void Unset();
143 	status_t InitCheck() const;
144 
145 	const Token* Get();
146 	void Unget();
147 
148 	void Read(TokenType type);
149 	bool CondRead(TokenType type);
150 
151 	ssize_t Pos() const;
152 	ssize_t EndPos() const;
153 
154 	bool IsEmpty() const;
155 
156 private:
157 	void AddToken(TokenType type, ssize_t pos);
158 	void AddString(const std::string &str, ssize_t pos);
159 	void AddInt(const char *str, ssize_t pos);
160 	void AddFloat(const char *str, ssize_t pos);
161 
162 	std::vector<Token*> fTokenList;
163 	status_t fCStatus;
164 	int fPos;
165 	int fStrLen;
166 
167 
168 	TokenStream(const TokenStream &ref);
169 	TokenStream& operator=(const TokenStream &ref);
170 };
171 
172 //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
173 /*! A MIME sniffer rule is valid if it is well-formed with respect to the
174 	following grammar and fulfills some further conditions listed thereafter:
175 
176 	<code>
177 	Rule			::= LWS Priority LWS ConjList LWS
178 	ConjList		::= DisjList (LWS DisjList)*
179 	DisjList		::= "(" LWS PatternList LWS ")"
180 						| "(" LWS RPatternList LWS ")"
181 						| Range LWS "(" LWS PatternList LWS ")"
182 	RPatternList	::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
183 	PatternList		::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
184 
185 	RPattern		::= LWS Range LWS Pattern
186 	Pattern			::= PString [ LWS "&" LWS Mask ]
187 	Range			::=	"[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"
188 
189 	Priority		::= Float
190 	Mask			::= PString
191 	PString			::= HexLiteral | QuotedString | UnquotedString
192 
193 	HexLiteral		::= "0x" HexPair HexPair*
194 	HexPair			::= HexChar HexChar
195 
	QuotedString	::= SQuotedString | DQuotedString
	SQuotedString	::= "'" SQChar+ "'"
	DQuotedString	::= '"' DQChar+ '"'
199 
200 	UnquotedString	::= EscapedChar UChar*
201 	EscapedChar		::= OctalEscape | HexEscape | "\" Char
202 	OctalEscape		::= "\" [[OctHiChar] OctChar] OctChar
203 	HexEscape		::= "\x" HexPair
204 
205 	Flag			::= "-i"
206 
207 	SDecimal		::= [Sign] Decimal
208 	Decimal			::= DecChar DecChar*
209 	Float			::= Fixed [("E" | "e") SDecimal]
210 	Fixed			::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
211 	Sign			::= "+" | "-"
212 
213 	PunctuationChar	::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
214 	OctHiChar		::= "0" | "1" | "2" | "3"
215 	OctChar			::= OctHiChar | "4" | "5" | "6" | "7"
216 	DecChar			::= OctChar | "8" | "9"
217 	HexChar			::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
218 						| "D" | "E" | "F"
219 
	Char			::= <any character>
	SQChar			::= <Char except "\", "'"> | EscapedChar
	DQChar			::= <Char except "\", '"'> | EscapedChar
	UChar			::= <Char except "\", LWSChar, and PunctuationChar> | EscapedChar
224 
225 	LWS				::= LWSChar*
226 	LWSChar			::= " " | TAB | LF
227 	</code>
228 
229 	Conditions:
230 	- If a mask is specified for a pattern, this mask must have the same
231 	  length as the pattern string.
232 	- 0.0 <= Priority <= 1.0
233 	- 0 <= Range begin <= Range end
234 
235 	Notes:
236 	- If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
237 	  in a DisjList, case-insensitivity is applied to the entire DisjList.
238 
239 	Examples:
240 	- 1.0 ('ABCD')
241 	  The file must start with the string "ABCD". The priority of the rule
242 	  is 1.0 (maximal).
243 	- 0.8 [0:3] ('ABCD' | 'abcd')
244 	  The file must contain the string "ABCD" or "abcd" starting somewhere in
245 	  the first four bytes. The rule priority is 0.8.
246 	- 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
247 	  The file must contain the string "ABCD" or "abcd" starting somewhere in
248 	  the first four bytes or the string "EFGH" at position 13. The rule
249 	  priority is 0.5.
	- 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
	  The file must contain the string "A.CD" or "ab.d" (where "." is an
	  arbitrary character) starting somewhere in the first four bytes. The
	  rule priority is 0.8.
254 	- 0.3 [10] ('mnop') ('abc') [20] ('xyz')
255 	  The file must contain the string 'abc' at the beginning of the file,
256 	  the string 'mnop' starting at position 10, and the string 'xyz'
257 	  starting at position 20. The rule priority is 0.3.
258 	- 200e-3 (-i 'ab')
259 	  The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
260 	  beginning of the file. The rule priority is 0.2.
261 
262 	Real examples:
263 	- 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
264 	        | [0:32]"#ifdef")
265 	  text/x-source-code
266 	- 0.70 ("8BPS  \000\000\000\000" & 0xffffffff0000ffffffff )
267 	  image/x-photoshop
268 	- 0.40 [0:64]( -i "&lt;HTML" | "&lt;HEAD" | "&lt;TITLE" | "&lt;BODY"
269 			| "&lt;TABLE" | "&lt;!--" | "&lt;META" | "&lt;CENTER")
270 	  text/html
271 
272 */
273 class Parser {
274 public:
275 	Parser();
276 	~Parser();
277 	status_t Parse(const char *rule, Rule *result, BString *parseError = NULL);
278 private:
279 	std::string ErrorMessage(Err *err, const char *rule);
280 
281 	// Things that get done a lot :-)
282 	void ThrowEndOfStreamError();
283 	inline void ThrowOutOfMemError(ssize_t pos);
284 	void ThrowUnexpectedTokenError(TokenType expected, const Token *found);
285 	void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found);
286 
287 	// Parsing functions
288 	void ParseRule(Rule *result);
289 	double ParsePriority();
290 	std::vector<DisjList*>* ParseConjList();
291 	DisjList* ParseDisjList();
292 	Range ParseRange();
293 	DisjList* ParsePatternList(Range range);
294 	DisjList* ParseRPatternList();
295 	RPattern* ParseRPattern();
296 	Pattern* ParsePattern();
297 
298 	TokenStream stream;
299 
300 	Err *fOutOfMemErr;
301 };
302 
303 };	// namespace Sniffer
304 };	// namespace Storage
305 };	// namespace BPrivate
306 
307 #endif	// _SNIFFER_PARSER_H
308 
309 
310 
311 
312