// Source: /haiku/headers/private/storage/sniffer/Parser.h (revision 93d145bb01fcc5d92405cb9af11b91362ce0b090)
//----------------------------------------------------------------------
//  This software is part of the OpenBeOS distribution and is covered
//  by the OpenBeOS license.
//---------------------------------------------------------------------
/*!
	\file sniffer/Parser.h
	MIME sniffer rule parser declarations
*/
9674e18fbSTyler Dauwalder #ifndef _sk_sniffer_parser_h_
10674e18fbSTyler Dauwalder #define _sk_sniffer_parser_h_
11674e18fbSTyler Dauwalder 
12674e18fbSTyler Dauwalder #include <SupportDefs.h>
13390dce8dSTyler Dauwalder #include <sniffer/Err.h>
144574a75fSTyler Dauwalder #include <sniffer/Range.h>
154574a75fSTyler Dauwalder #include <sniffer/Rule.h>
165da54924STyler Dauwalder #include <List.h>
175da54924STyler Dauwalder #include <string>
184574a75fSTyler Dauwalder #include <vector>
195da54924STyler Dauwalder 
// Forward declaration: this header only refers to BString through pointers.
class BString;
21674e18fbSTyler Dauwalder 
2260ee71d3STyler Dauwalder //! MIME Sniffer related classes
23674e18fbSTyler Dauwalder namespace Sniffer {
244574a75fSTyler Dauwalder 
// Forward declarations for types that the declarations below refer to
// only through pointers.
class Rule;
class DisjList;
class RPattern;
class Pattern;
295da54924STyler Dauwalder 
305da54924STyler Dauwalder //------------------------------------------------------------------------------
315da54924STyler Dauwalder // The mighty parsing function ;-)
325da54924STyler Dauwalder //------------------------------------------------------------------------------
335da54924STyler Dauwalder 
34674e18fbSTyler Dauwalder status_t parse(const char *rule, Rule *result, BString *parseError = NULL);
355da54924STyler Dauwalder 
//------------------------------------------------------------------------------
// Classes used internally by the parser
//------------------------------------------------------------------------------
395da54924STyler Dauwalder 
4060ee71d3STyler Dauwalder //! Manages a stream of characters
4160ee71d3STyler Dauwalder /*! CharStream is used by the scanner portion of the parser, which is implemented
4260ee71d3STyler Dauwalder 	in TokenStream::SetTo().
4360ee71d3STyler Dauwalder */
445da54924STyler Dauwalder class CharStream {
455da54924STyler Dauwalder public:
46fe70cd16STyler Dauwalder 	CharStream(const std::string &string);
47fe70cd16STyler Dauwalder 	CharStream();
485da54924STyler Dauwalder 	~CharStream();
495da54924STyler Dauwalder 
50fe70cd16STyler Dauwalder 	status_t SetTo(const std::string &string);
515da54924STyler Dauwalder 	void Unset();
525da54924STyler Dauwalder 	status_t InitCheck() const;
535da54924STyler Dauwalder 	bool IsEmpty() const;
544574a75fSTyler Dauwalder 	ssize_t Pos() const;
55fe70cd16STyler Dauwalder 	const std::string& String() const;
565da54924STyler Dauwalder 
575da54924STyler Dauwalder 	char Get();
585da54924STyler Dauwalder 	void Unget();
595da54924STyler Dauwalder 
605da54924STyler Dauwalder private:
61fe70cd16STyler Dauwalder 	std::string fString;
624574a75fSTyler Dauwalder 	ssize_t fPos;
63fe70cd16STyler Dauwalder //	ssize_t fLen;
645da54924STyler Dauwalder 	status_t fCStatus;
655da54924STyler Dauwalder 
665da54924STyler Dauwalder 	CharStream(const CharStream &ref);
675da54924STyler Dauwalder 	CharStream& operator=(const CharStream &ref);
685da54924STyler Dauwalder };
695da54924STyler Dauwalder 
//! Types of tokens
/*! A plain (unscoped) enumeration: the enumerators are visible directly in the
	enclosing scope and are numbered consecutively starting at EmptyToken == 0.

	Declared without a leading \c typedef. The former
	<code>typedef enum TokenType { ... };</code> named no typedef alias, so the
	keyword was ignored by the compiler and only produced a warning.
*/
enum TokenType {
	EmptyToken,
	LeftParen,
	RightParen,
	LeftBracket,
	RightBracket,
	Colon,
	Divider,
	Ampersand,
	CaseInsensitiveFlag,
	CharacterString,
	Integer,
	FloatingPoint
};
855da54924STyler Dauwalder 
8660ee71d3STyler Dauwalder /*! \brief Returns a NULL-terminated string contating the
8760ee71d3STyler Dauwalder 		   name of the given token type
8860ee71d3STyler Dauwalder */
895da54924STyler Dauwalder const char* tokenTypeToString(TokenType type);
905da54924STyler Dauwalder 
9160ee71d3STyler Dauwalder //! Base token class returned by TokenStream
9260ee71d3STyler Dauwalder /*! Each token represents a single chunk of relevant information
9360ee71d3STyler Dauwalder     in a given rule. For example, the floating point number "1.2e-35",
9460ee71d3STyler Dauwalder     originally represented as a 7-character string, is added to the
9560ee71d3STyler Dauwalder     token stream as a single FloatToken object.
9660ee71d3STyler Dauwalder */
975da54924STyler Dauwalder class Token {
985da54924STyler Dauwalder public:
994574a75fSTyler Dauwalder 	Token(TokenType type = EmptyToken, const ssize_t pos = -1);
1005da54924STyler Dauwalder 	TokenType Type() const;
101fe70cd16STyler Dauwalder 	virtual const std::string& String() const;
1025da54924STyler Dauwalder 	virtual int32 Int() const;
1035da54924STyler Dauwalder 	virtual double Float() const;
1044574a75fSTyler Dauwalder 	ssize_t Pos() const;
105390dce8dSTyler Dauwalder 	bool operator==(Token &ref) const;
1065da54924STyler Dauwalder protected:
1075da54924STyler Dauwalder 	TokenType fType;
1084574a75fSTyler Dauwalder 	ssize_t fPos;
1095da54924STyler Dauwalder };
1105da54924STyler Dauwalder 
11160ee71d3STyler Dauwalder //! String token class
11260ee71d3STyler Dauwalder /*! Single-quoted strings, double-quoted strings, unquoted strings, and
11360ee71d3STyler Dauwalder 	hex literals are all converted to StringToken objects by the scanner
11460ee71d3STyler Dauwalder 	and from then on treated uniformly.
11560ee71d3STyler Dauwalder */
1165da54924STyler Dauwalder class StringToken : public Token {
1175da54924STyler Dauwalder public:
118fe70cd16STyler Dauwalder 	StringToken(const std::string &str, const ssize_t pos);
119fe70cd16STyler Dauwalder 	virtual const std::string& String() const;
1205da54924STyler Dauwalder protected:
121fe70cd16STyler Dauwalder 	std::string fString;
1225da54924STyler Dauwalder };
1235da54924STyler Dauwalder 
12460ee71d3STyler Dauwalder //! Integer token class
12560ee71d3STyler Dauwalder /*! Signed or unsigned integer literals are coverted to IntToken objects,
12660ee71d3STyler Dauwalder     which may then be treated as either ints or floats (since a priority
12760ee71d3STyler Dauwalder     of "1" would be valid, but scanned as an int instead of a float).
12860ee71d3STyler Dauwalder */
1295da54924STyler Dauwalder class IntToken : public Token {
1305da54924STyler Dauwalder public:
1314574a75fSTyler Dauwalder 	IntToken(const int32 value, const ssize_t pos);
1325da54924STyler Dauwalder 	virtual int32 Int() const;
1335da54924STyler Dauwalder 	virtual double Float() const;
1345da54924STyler Dauwalder protected:
1355da54924STyler Dauwalder 	int32 fValue;
1365da54924STyler Dauwalder };
1375da54924STyler Dauwalder 
13860ee71d3STyler Dauwalder //! Floating point token class
13960ee71d3STyler Dauwalder /*! Signed or unsigned, extended or non-extended notation floating point
14060ee71d3STyler Dauwalder     numbers are converted to FloatToken objects by the scanner.
14160ee71d3STyler Dauwalder */
1425da54924STyler Dauwalder class FloatToken : public Token {
1435da54924STyler Dauwalder public:
1444574a75fSTyler Dauwalder 	FloatToken(const double value, const ssize_t pos);
1455da54924STyler Dauwalder 	virtual double Float() const;
1465da54924STyler Dauwalder protected:
1475da54924STyler Dauwalder 	double fValue;
1485da54924STyler Dauwalder };
1495da54924STyler Dauwalder 
15060ee71d3STyler Dauwalder //! Manages a stream of Token objects
15160ee71d3STyler Dauwalder /*! Provides Get() and Unget() operations, some handy shortcut operations (Read()
15260ee71d3STyler Dauwalder     and CondRead()), and handles memory management with respect to all the
15360ee71d3STyler Dauwalder     Token objects in the stream (i.e. never delete a Token object returned by Get()).
15460ee71d3STyler Dauwalder 
15560ee71d3STyler Dauwalder     Also, the scanner portion of the parser is implemented in the TokenStream's
15660ee71d3STyler Dauwalder     SetTo() function.
15760ee71d3STyler Dauwalder */
1585da54924STyler Dauwalder class TokenStream {
1595da54924STyler Dauwalder public:
160fe70cd16STyler Dauwalder 	TokenStream(const std::string &string);
161fe70cd16STyler Dauwalder 	TokenStream();
1625da54924STyler Dauwalder 	~TokenStream();
1635da54924STyler Dauwalder 
164fe70cd16STyler Dauwalder 	status_t SetTo(const std::string &string);
1655da54924STyler Dauwalder 	void Unset();
1665da54924STyler Dauwalder 	status_t InitCheck() const;
1675da54924STyler Dauwalder 
168390dce8dSTyler Dauwalder 	const Token* Get();
169390dce8dSTyler Dauwalder 	void Unget();
1705da54924STyler Dauwalder 
171390dce8dSTyler Dauwalder 	void Read(TokenType type);
172390dce8dSTyler Dauwalder 	bool CondRead(TokenType type);
173390dce8dSTyler Dauwalder 
174390dce8dSTyler Dauwalder 	ssize_t Pos() const;
175390dce8dSTyler Dauwalder 	ssize_t EndPos() const;
176390dce8dSTyler Dauwalder 
177390dce8dSTyler Dauwalder 	bool IsEmpty() const;
1785da54924STyler Dauwalder 
1795da54924STyler Dauwalder private:
1804574a75fSTyler Dauwalder 	void AddToken(TokenType type, ssize_t pos);
181fe70cd16STyler Dauwalder 	void AddString(const std::string &str, ssize_t pos);
1824574a75fSTyler Dauwalder 	void AddInt(const char *str, ssize_t pos);
1834574a75fSTyler Dauwalder 	void AddFloat(const char *str, ssize_t pos);
1845da54924STyler Dauwalder 
185390dce8dSTyler Dauwalder 	std::vector<Token*> fTokenList;
186390dce8dSTyler Dauwalder 	int fPos;
187390dce8dSTyler Dauwalder 	int fStrLen;
1885da54924STyler Dauwalder 	status_t fCStatus;
1895da54924STyler Dauwalder 
190390dce8dSTyler Dauwalder 
1915da54924STyler Dauwalder 	TokenStream(const TokenStream &ref);
1925da54924STyler Dauwalder 	TokenStream& operator=(const TokenStream &ref);
1935da54924STyler Dauwalder };
1945da54924STyler Dauwalder 
19560ee71d3STyler Dauwalder //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
196*93d145bbSTyler Dauwalder /*! A MIME sniffer rule is valid if it is well-formed with respect to the
197*93d145bbSTyler Dauwalder 	following grammar and fulfills some further conditions listed thereafter:
198*93d145bbSTyler Dauwalder 
199*93d145bbSTyler Dauwalder 	<code>
200*93d145bbSTyler Dauwalder 	Rule			::= LWS Priority LWS ConjList LWS
201*93d145bbSTyler Dauwalder 	ConjList		::= DisjList (LWS DisjList)*
202*93d145bbSTyler Dauwalder 	DisjList		::= "(" LWS PatternList LWS ")"
203*93d145bbSTyler Dauwalder 						| "(" LWS RPatternList LWS ")"
204*93d145bbSTyler Dauwalder 						| Range LWS "(" LWS PatternList LWS ")"
205*93d145bbSTyler Dauwalder 	RPatternList	::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
206*93d145bbSTyler Dauwalder 	PatternList		::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
207*93d145bbSTyler Dauwalder 
208*93d145bbSTyler Dauwalder 	RPattern		::= LWS Range LWS Pattern
209*93d145bbSTyler Dauwalder 	Pattern			::= PString [ LWS "&" LWS Mask ]
210*93d145bbSTyler Dauwalder 	Range			::=	"[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"
211*93d145bbSTyler Dauwalder 
212*93d145bbSTyler Dauwalder 	Priority		::= Float
213*93d145bbSTyler Dauwalder 	Mask			::= PString
214*93d145bbSTyler Dauwalder 	PString			::= HexLiteral | QuotedString | UnquotedString
215*93d145bbSTyler Dauwalder 
216*93d145bbSTyler Dauwalder 	HexLiteral		::= "0x" HexPair HexPair*
217*93d145bbSTyler Dauwalder 	HexPair			::= HexChar HexChar
218*93d145bbSTyler Dauwalder 
219*93d145bbSTyler Dauwalder 	QuotedString	::= SingleQuotedString | DoubleQuotedString
220*93d145bbSTyler Dauwalder 	SQuotedString	:= "'" SQChar+ "'"
221*93d145bbSTyler Dauwalder 	DQuotedString	:= '"' DQChar+ '"'
222*93d145bbSTyler Dauwalder 
223*93d145bbSTyler Dauwalder 	UnquotedString	::= EscapedChar UChar*
224*93d145bbSTyler Dauwalder 	EscapedChar		::= OctalEscape | HexEscape | "\" Char
225*93d145bbSTyler Dauwalder 	OctalEscape		::= "\" [[OctHiChar] OctChar] OctChar
226*93d145bbSTyler Dauwalder 	HexEscape		::= "\x" HexPair
227*93d145bbSTyler Dauwalder 
228*93d145bbSTyler Dauwalder 	Flag			::= "-i"
229*93d145bbSTyler Dauwalder 
230*93d145bbSTyler Dauwalder 	SDecimal		::= [Sign] Decimal
231*93d145bbSTyler Dauwalder 	Decimal			::= DecChar DecChar*
232*93d145bbSTyler Dauwalder 	Float			::= Fixed [("E" | "e") SDecimal]
233*93d145bbSTyler Dauwalder 	Fixed			::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
234*93d145bbSTyler Dauwalder 	Sign			::= "+" | "-"
235*93d145bbSTyler Dauwalder 
236*93d145bbSTyler Dauwalder 	PunctuationChar	::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
237*93d145bbSTyler Dauwalder 	OctHiChar		::= "0" | "1" | "2" | "3"
238*93d145bbSTyler Dauwalder 	OctChar			::= OctHiChar | "4" | "5" | "6" | "7"
239*93d145bbSTyler Dauwalder 	DecChar			::= OctChar | "8" | "9"
240*93d145bbSTyler Dauwalder 	HexChar			::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
241*93d145bbSTyler Dauwalder 						| "D" | "E" | "F"
242*93d145bbSTyler Dauwalder 
243*93d145bbSTyler Dauwalder 	Char			:: <any character>
244*93d145bbSTyler Dauwalder 	SQChar			::= <Char except "\", "'"> | EscapedChar
245*93d145bbSTyler Dauwalder 	DQChar			::= <Char except "\", '"'> | EscapedChar
246*93d145bbSTyler Dauwalder 	UChar			::= <Char except "\", LWSChar,  and PunctuationChar> | EscapedChar
247*93d145bbSTyler Dauwalder 
248*93d145bbSTyler Dauwalder 	LWS				::= LWSChar*
249*93d145bbSTyler Dauwalder 	LWSChar			::= " " | TAB | LF
250*93d145bbSTyler Dauwalder 	</code>
251*93d145bbSTyler Dauwalder 
252*93d145bbSTyler Dauwalder 	Conditions:
253*93d145bbSTyler Dauwalder 	- If a mask is specified for a pattern, this mask must have the same
254*93d145bbSTyler Dauwalder 	  length as the pattern string.
255*93d145bbSTyler Dauwalder 	- 0.0 <= Priority <= 1.0
256*93d145bbSTyler Dauwalder 	- 0 <= Range begin <= Range end
257*93d145bbSTyler Dauwalder 
258*93d145bbSTyler Dauwalder 	Notes:
259*93d145bbSTyler Dauwalder 	- If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
260*93d145bbSTyler Dauwalder 	  in a DisjList, case-insensitivity is applied to the entire DisjList.
261*93d145bbSTyler Dauwalder 
262*93d145bbSTyler Dauwalder 	Examples:
263*93d145bbSTyler Dauwalder 	- 1.0 ('ABCD')
264*93d145bbSTyler Dauwalder 	  The file must start with the string "ABCD". The priority of the rule
265*93d145bbSTyler Dauwalder 	  is 1.0 (maximal).
266*93d145bbSTyler Dauwalder 	- 0.8 [0:3] ('ABCD' | 'abcd')
267*93d145bbSTyler Dauwalder 	  The file must contain the string "ABCD" or "abcd" starting somewhere in
268*93d145bbSTyler Dauwalder 	  the first four bytes. The rule priority is 0.8.
269*93d145bbSTyler Dauwalder 	- 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
270*93d145bbSTyler Dauwalder 	  The file must contain the string "ABCD" or "abcd" starting somewhere in
271*93d145bbSTyler Dauwalder 	  the first four bytes or the string "EFGH" at position 13. The rule
272*93d145bbSTyler Dauwalder 	  priority is 0.5.
273*93d145bbSTyler Dauwalder 	- 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
274*93d145bbSTyler Dauwalder 	  The file must contain the string "A.CD" or "ab.d" (whereas "." is an
275*93d145bbSTyler Dauwalder 	  arbitrary character) starting somewhere in the first four bytes. The
276*93d145bbSTyler Dauwalder 	  rule priority is 0.8.
277*93d145bbSTyler Dauwalder 	- 0.3 [10] ('mnop') ('abc') [20] ('xyz')
278*93d145bbSTyler Dauwalder 	  The file must contain the string 'abc' at the beginning of the file,
279*93d145bbSTyler Dauwalder 	  the string 'mnop' starting at position 10, and the string 'xyz'
280*93d145bbSTyler Dauwalder 	  starting at position 20. The rule priority is 0.3.
281*93d145bbSTyler Dauwalder 	- 200e-3 (-i 'ab')
282*93d145bbSTyler Dauwalder 	  The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
283*93d145bbSTyler Dauwalder 	  beginning of the file. The rule priority is 0.2.
284*93d145bbSTyler Dauwalder 
285*93d145bbSTyler Dauwalder 	Real examples:
286*93d145bbSTyler Dauwalder 	- 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
287*93d145bbSTyler Dauwalder 	        | [0:32]"#ifdef")
288*93d145bbSTyler Dauwalder 	  text/x-source-code
289*93d145bbSTyler Dauwalder 	- 0.70 ("8BPS  \000\000\000\000" & 0xffffffff0000ffffffff )
290*93d145bbSTyler Dauwalder 	  image/x-photoshop
291*93d145bbSTyler Dauwalder 	- 0.40 [0:64]( -i "&lt;HTML" | "&lt;HEAD" | "&lt;TITLE" | "&lt;BODY"
292*93d145bbSTyler Dauwalder 			| "&lt;TABLE" | "&lt;!--" | "&lt;META" | "&lt;CENTER")
293*93d145bbSTyler Dauwalder 	  text/html
294*93d145bbSTyler Dauwalder 
295*93d145bbSTyler Dauwalder */
2964574a75fSTyler Dauwalder class Parser {
2974574a75fSTyler Dauwalder public:
2984574a75fSTyler Dauwalder 	Parser();
299390dce8dSTyler Dauwalder 	~Parser();
3004574a75fSTyler Dauwalder 	status_t Parse(const char *rule, Rule *result, BString *parseError = NULL);
3014574a75fSTyler Dauwalder private:
3024574a75fSTyler Dauwalder 	std::string ErrorMessage(Err *err, const char *rule);
3034574a75fSTyler Dauwalder 
304390dce8dSTyler Dauwalder 	// Things that get done a lot :-)
305390dce8dSTyler Dauwalder 	void ThrowEndOfStreamError();
306390dce8dSTyler Dauwalder 	inline void ThrowOutOfMemError(ssize_t pos);
307390dce8dSTyler Dauwalder 	void ThrowUnexpectedTokenError(TokenType expected, const Token *found);
308390dce8dSTyler Dauwalder 	void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found);
309390dce8dSTyler Dauwalder 
3104574a75fSTyler Dauwalder 	// Parsing functions
3114574a75fSTyler Dauwalder 	void ParseRule(Rule *result);
3124574a75fSTyler Dauwalder 	double ParsePriority();
313*93d145bbSTyler Dauwalder 	std::vector<DisjList*>* ParseConjList();
314*93d145bbSTyler Dauwalder 	DisjList* ParseDisjList();
3154574a75fSTyler Dauwalder 	Range ParseRange();
316*93d145bbSTyler Dauwalder 	DisjList* ParsePatternList(Range range);
317*93d145bbSTyler Dauwalder 	DisjList* ParseRPatternList();
3184574a75fSTyler Dauwalder 	RPattern* ParseRPattern();
3194574a75fSTyler Dauwalder 	Pattern* ParsePattern();
3204574a75fSTyler Dauwalder 
3214574a75fSTyler Dauwalder 	TokenStream stream;
322390dce8dSTyler Dauwalder 
323390dce8dSTyler Dauwalder 	Err *fOutOfMemErr;
3244574a75fSTyler Dauwalder };
3254574a75fSTyler Dauwalder 
3265da54924STyler Dauwalder }	// namespace Sniffer
3275da54924STyler Dauwalder 
328674e18fbSTyler Dauwalder 
329674e18fbSTyler Dauwalder #endif	// _sk_sniffer_parser_h_
330674e18fbSTyler Dauwalder 
331