xref: /haiku/headers/private/storage/sniffer/Parser.h (revision 268f99dd7dc4bd7474a8bd2742d3f1ec1de6752a)
//----------------------------------------------------------------------
//  This software is part of the Haiku distribution and is covered
//  by the MIT License.
//----------------------------------------------------------------------
/*!
	\file sniffer/Parser.h
	MIME sniffer rule parser declarations
*/
982b75665STyler Dauwalder #ifndef _SNIFFER_PARSER_H
1082b75665STyler Dauwalder #define _SNIFFER_PARSER_H
11674e18fbSTyler Dauwalder 
12674e18fbSTyler Dauwalder #include <SupportDefs.h>
1389ec8a81STyler Dauwalder #include <sniffer/CharStream.h>
14390dce8dSTyler Dauwalder #include <sniffer/Err.h>
154574a75fSTyler Dauwalder #include <sniffer/Range.h>
164574a75fSTyler Dauwalder #include <sniffer/Rule.h>
175da54924STyler Dauwalder #include <List.h>
185da54924STyler Dauwalder #include <string>
194574a75fSTyler Dauwalder #include <vector>
205da54924STyler Dauwalder 
// Forward declaration: only pointers to BString appear in this header.
class BString;
22674e18fbSTyler Dauwalder 
//! MIME Sniffer related classes
2409d84e61STyler Dauwalder namespace BPrivate {
2509d84e61STyler Dauwalder namespace Storage {
26674e18fbSTyler Dauwalder namespace Sniffer {
274574a75fSTyler Dauwalder 
// Forward declarations of the rule-model classes that the parser
// declarations below refer to by pointer.
class Rule;
class DisjList;
class RPattern;
class Pattern;
325da54924STyler Dauwalder 
335da54924STyler Dauwalder //------------------------------------------------------------------------------
345da54924STyler Dauwalder // The mighty parsing function ;-)
355da54924STyler Dauwalder //------------------------------------------------------------------------------
365da54924STyler Dauwalder 
37674e18fbSTyler Dauwalder status_t parse(const char *rule, Rule *result, BString *parseError = NULL);
385da54924STyler Dauwalder 
//------------------------------------------------------------------------------
// Classes used internally by the parser
//------------------------------------------------------------------------------
425da54924STyler Dauwalder 
//! Types of tokens
/*! The punctuation annotations below follow the rule grammar documented on
    the Parser class; the authoritative mapping is the scanner in Parser.cpp.
    The enumeration is deliberately left unscoped, since the enumerators are
    used unqualified elsewhere (e.g. as the default argument of Token's
    constructor).
*/
enum TokenType {
	EmptyToken,				// default type of a Token
	LeftParen,				// "("
	RightParen,				// ")"
	LeftBracket,			// "["
	RightBracket,			// "]"
	Colon,					// ":"
	Divider,				// "|"
	Ampersand,				// "&"
	CaseInsensitiveFlag,	// "-i"
	CharacterString,		// carried by StringToken
	Integer,				// carried by IntToken
	FloatingPoint			// carried by FloatToken
};
585da54924STyler Dauwalder 
5960ee71d3STyler Dauwalder /*! \brief Returns a NULL-terminated string contating the
6060ee71d3STyler Dauwalder 		   name of the given token type
6160ee71d3STyler Dauwalder */
625da54924STyler Dauwalder const char* tokenTypeToString(TokenType type);
635da54924STyler Dauwalder 
6460ee71d3STyler Dauwalder //! Base token class returned by TokenStream
6560ee71d3STyler Dauwalder /*! Each token represents a single chunk of relevant information
6660ee71d3STyler Dauwalder     in a given rule. For example, the floating point number "1.2e-35",
6760ee71d3STyler Dauwalder     originally represented as a 7-character string, is added to the
6860ee71d3STyler Dauwalder     token stream as a single FloatToken object.
6960ee71d3STyler Dauwalder */
705da54924STyler Dauwalder class Token {
715da54924STyler Dauwalder public:
724574a75fSTyler Dauwalder 	Token(TokenType type = EmptyToken, const ssize_t pos = -1);
7302fd0582STyler Dauwalder 	virtual ~Token();
745da54924STyler Dauwalder 	TokenType Type() const;
75fe70cd16STyler Dauwalder 	virtual const std::string& String() const;
765da54924STyler Dauwalder 	virtual int32 Int() const;
775da54924STyler Dauwalder 	virtual double Float() const;
784574a75fSTyler Dauwalder 	ssize_t Pos() const;
79390dce8dSTyler Dauwalder 	bool operator==(Token &ref) const;
805da54924STyler Dauwalder protected:
815da54924STyler Dauwalder 	TokenType fType;
824574a75fSTyler Dauwalder 	ssize_t fPos;
835da54924STyler Dauwalder };
845da54924STyler Dauwalder 
8560ee71d3STyler Dauwalder //! String token class
8660ee71d3STyler Dauwalder /*! Single-quoted strings, double-quoted strings, unquoted strings, and
8760ee71d3STyler Dauwalder 	hex literals are all converted to StringToken objects by the scanner
8860ee71d3STyler Dauwalder 	and from then on treated uniformly.
8960ee71d3STyler Dauwalder */
905da54924STyler Dauwalder class StringToken : public Token {
915da54924STyler Dauwalder public:
92fe70cd16STyler Dauwalder 	StringToken(const std::string &str, const ssize_t pos);
9302fd0582STyler Dauwalder 	virtual ~StringToken();
94fe70cd16STyler Dauwalder 	virtual const std::string& String() const;
955da54924STyler Dauwalder protected:
96fe70cd16STyler Dauwalder 	std::string fString;
975da54924STyler Dauwalder };
985da54924STyler Dauwalder 
9960ee71d3STyler Dauwalder //! Integer token class
10060ee71d3STyler Dauwalder /*! Signed or unsigned integer literals are coverted to IntToken objects,
10160ee71d3STyler Dauwalder     which may then be treated as either ints or floats (since a priority
10260ee71d3STyler Dauwalder     of "1" would be valid, but scanned as an int instead of a float).
10360ee71d3STyler Dauwalder */
1045da54924STyler Dauwalder class IntToken : public Token {
1055da54924STyler Dauwalder public:
1064574a75fSTyler Dauwalder 	IntToken(const int32 value, const ssize_t pos);
10702fd0582STyler Dauwalder 	virtual ~IntToken();
1085da54924STyler Dauwalder 	virtual int32 Int() const;
1095da54924STyler Dauwalder 	virtual double Float() const;
1105da54924STyler Dauwalder protected:
1115da54924STyler Dauwalder 	int32 fValue;
1125da54924STyler Dauwalder };
1135da54924STyler Dauwalder 
11460ee71d3STyler Dauwalder //! Floating point token class
11560ee71d3STyler Dauwalder /*! Signed or unsigned, extended or non-extended notation floating point
11660ee71d3STyler Dauwalder     numbers are converted to FloatToken objects by the scanner.
11760ee71d3STyler Dauwalder */
1185da54924STyler Dauwalder class FloatToken : public Token {
1195da54924STyler Dauwalder public:
1204574a75fSTyler Dauwalder 	FloatToken(const double value, const ssize_t pos);
12102fd0582STyler Dauwalder 	virtual ~FloatToken();
1225da54924STyler Dauwalder 	virtual double Float() const;
1235da54924STyler Dauwalder protected:
1245da54924STyler Dauwalder 	double fValue;
1255da54924STyler Dauwalder };
1265da54924STyler Dauwalder 
12760ee71d3STyler Dauwalder //! Manages a stream of Token objects
12860ee71d3STyler Dauwalder /*! Provides Get() and Unget() operations, some handy shortcut operations (Read()
12960ee71d3STyler Dauwalder     and CondRead()), and handles memory management with respect to all the
13060ee71d3STyler Dauwalder     Token objects in the stream (i.e. never delete a Token object returned by Get()).
13160ee71d3STyler Dauwalder 
13260ee71d3STyler Dauwalder     Also, the scanner portion of the parser is implemented in the TokenStream's
13360ee71d3STyler Dauwalder     SetTo() function.
13460ee71d3STyler Dauwalder */
1355da54924STyler Dauwalder class TokenStream {
1365da54924STyler Dauwalder public:
137fe70cd16STyler Dauwalder 	TokenStream(const std::string &string);
138fe70cd16STyler Dauwalder 	TokenStream();
1395da54924STyler Dauwalder 	~TokenStream();
1405da54924STyler Dauwalder 
141fe70cd16STyler Dauwalder 	status_t SetTo(const std::string &string);
1425da54924STyler Dauwalder 	void Unset();
1435da54924STyler Dauwalder 	status_t InitCheck() const;
1445da54924STyler Dauwalder 
145390dce8dSTyler Dauwalder 	const Token* Get();
146390dce8dSTyler Dauwalder 	void Unget();
1475da54924STyler Dauwalder 
148390dce8dSTyler Dauwalder 	void Read(TokenType type);
149390dce8dSTyler Dauwalder 	bool CondRead(TokenType type);
150390dce8dSTyler Dauwalder 
151390dce8dSTyler Dauwalder 	ssize_t Pos() const;
152390dce8dSTyler Dauwalder 	ssize_t EndPos() const;
153390dce8dSTyler Dauwalder 
154390dce8dSTyler Dauwalder 	bool IsEmpty() const;
1555da54924STyler Dauwalder 
1565da54924STyler Dauwalder private:
1574574a75fSTyler Dauwalder 	void AddToken(TokenType type, ssize_t pos);
158fe70cd16STyler Dauwalder 	void AddString(const std::string &str, ssize_t pos);
1594574a75fSTyler Dauwalder 	void AddInt(const char *str, ssize_t pos);
1604574a75fSTyler Dauwalder 	void AddFloat(const char *str, ssize_t pos);
1615da54924STyler Dauwalder 
162390dce8dSTyler Dauwalder 	std::vector<Token*> fTokenList;
16302fd0582STyler Dauwalder 	status_t fCStatus;
164390dce8dSTyler Dauwalder 	int fPos;
165390dce8dSTyler Dauwalder 	int fStrLen;
1665da54924STyler Dauwalder 
167390dce8dSTyler Dauwalder 
1685da54924STyler Dauwalder 	TokenStream(const TokenStream &ref);
1695da54924STyler Dauwalder 	TokenStream& operator=(const TokenStream &ref);
1705da54924STyler Dauwalder };
1715da54924STyler Dauwalder 
17260ee71d3STyler Dauwalder //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
17393d145bbSTyler Dauwalder /*! A MIME sniffer rule is valid if it is well-formed with respect to the
17493d145bbSTyler Dauwalder 	following grammar and fulfills some further conditions listed thereafter:
17593d145bbSTyler Dauwalder 
17693d145bbSTyler Dauwalder 	<code>
17793d145bbSTyler Dauwalder 	Rule			::= LWS Priority LWS ConjList LWS
17893d145bbSTyler Dauwalder 	ConjList		::= DisjList (LWS DisjList)*
17993d145bbSTyler Dauwalder 	DisjList		::= "(" LWS PatternList LWS ")"
18093d145bbSTyler Dauwalder 						| "(" LWS RPatternList LWS ")"
18193d145bbSTyler Dauwalder 						| Range LWS "(" LWS PatternList LWS ")"
18293d145bbSTyler Dauwalder 	RPatternList	::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
18393d145bbSTyler Dauwalder 	PatternList		::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
18493d145bbSTyler Dauwalder 
18593d145bbSTyler Dauwalder 	RPattern		::= LWS Range LWS Pattern
18693d145bbSTyler Dauwalder 	Pattern			::= PString [ LWS "&" LWS Mask ]
18793d145bbSTyler Dauwalder 	Range			::=	"[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"
18893d145bbSTyler Dauwalder 
18993d145bbSTyler Dauwalder 	Priority		::= Float
19093d145bbSTyler Dauwalder 	Mask			::= PString
19193d145bbSTyler Dauwalder 	PString			::= HexLiteral | QuotedString | UnquotedString
19293d145bbSTyler Dauwalder 
19393d145bbSTyler Dauwalder 	HexLiteral		::= "0x" HexPair HexPair*
19493d145bbSTyler Dauwalder 	HexPair			::= HexChar HexChar
19593d145bbSTyler Dauwalder 
19693d145bbSTyler Dauwalder 	QuotedString	::= SingleQuotedString | DoubleQuotedString
19793d145bbSTyler Dauwalder 	SQuotedString	:= "'" SQChar+ "'"
19893d145bbSTyler Dauwalder 	DQuotedString	:= '"' DQChar+ '"'
19993d145bbSTyler Dauwalder 
20093d145bbSTyler Dauwalder 	UnquotedString	::= EscapedChar UChar*
20193d145bbSTyler Dauwalder 	EscapedChar		::= OctalEscape | HexEscape | "\" Char
20293d145bbSTyler Dauwalder 	OctalEscape		::= "\" [[OctHiChar] OctChar] OctChar
20393d145bbSTyler Dauwalder 	HexEscape		::= "\x" HexPair
20493d145bbSTyler Dauwalder 
20593d145bbSTyler Dauwalder 	Flag			::= "-i"
20693d145bbSTyler Dauwalder 
20793d145bbSTyler Dauwalder 	SDecimal		::= [Sign] Decimal
20893d145bbSTyler Dauwalder 	Decimal			::= DecChar DecChar*
20993d145bbSTyler Dauwalder 	Float			::= Fixed [("E" | "e") SDecimal]
21093d145bbSTyler Dauwalder 	Fixed			::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
21193d145bbSTyler Dauwalder 	Sign			::= "+" | "-"
21293d145bbSTyler Dauwalder 
21393d145bbSTyler Dauwalder 	PunctuationChar	::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
21493d145bbSTyler Dauwalder 	OctHiChar		::= "0" | "1" | "2" | "3"
21593d145bbSTyler Dauwalder 	OctChar			::= OctHiChar | "4" | "5" | "6" | "7"
21693d145bbSTyler Dauwalder 	DecChar			::= OctChar | "8" | "9"
21793d145bbSTyler Dauwalder 	HexChar			::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
21893d145bbSTyler Dauwalder 						| "D" | "E" | "F"
21993d145bbSTyler Dauwalder 
22093d145bbSTyler Dauwalder 	Char			:: <any character>
22193d145bbSTyler Dauwalder 	SQChar			::= <Char except "\", "'"> | EscapedChar
22293d145bbSTyler Dauwalder 	DQChar			::= <Char except "\", '"'> | EscapedChar
22393d145bbSTyler Dauwalder 	UChar			::= <Char except "\", LWSChar,  and PunctuationChar> | EscapedChar
22493d145bbSTyler Dauwalder 
22593d145bbSTyler Dauwalder 	LWS				::= LWSChar*
22693d145bbSTyler Dauwalder 	LWSChar			::= " " | TAB | LF
22793d145bbSTyler Dauwalder 	</code>
22893d145bbSTyler Dauwalder 
22993d145bbSTyler Dauwalder 	Conditions:
23093d145bbSTyler Dauwalder 	- If a mask is specified for a pattern, this mask must have the same
23193d145bbSTyler Dauwalder 	  length as the pattern string.
23293d145bbSTyler Dauwalder 	- 0.0 <= Priority <= 1.0
23393d145bbSTyler Dauwalder 	- 0 <= Range begin <= Range end
23493d145bbSTyler Dauwalder 
23593d145bbSTyler Dauwalder 	Notes:
23693d145bbSTyler Dauwalder 	- If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
23793d145bbSTyler Dauwalder 	  in a DisjList, case-insensitivity is applied to the entire DisjList.
23893d145bbSTyler Dauwalder 
23993d145bbSTyler Dauwalder 	Examples:
24093d145bbSTyler Dauwalder 	- 1.0 ('ABCD')
24193d145bbSTyler Dauwalder 	  The file must start with the string "ABCD". The priority of the rule
24293d145bbSTyler Dauwalder 	  is 1.0 (maximal).
24393d145bbSTyler Dauwalder 	- 0.8 [0:3] ('ABCD' | 'abcd')
24493d145bbSTyler Dauwalder 	  The file must contain the string "ABCD" or "abcd" starting somewhere in
24593d145bbSTyler Dauwalder 	  the first four bytes. The rule priority is 0.8.
24693d145bbSTyler Dauwalder 	- 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
24793d145bbSTyler Dauwalder 	  The file must contain the string "ABCD" or "abcd" starting somewhere in
24893d145bbSTyler Dauwalder 	  the first four bytes or the string "EFGH" at position 13. The rule
24993d145bbSTyler Dauwalder 	  priority is 0.5.
25093d145bbSTyler Dauwalder 	- 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
25193d145bbSTyler Dauwalder 	  The file must contain the string "A.CD" or "ab.d" (whereas "." is an
25293d145bbSTyler Dauwalder 	  arbitrary character) starting somewhere in the first four bytes. The
25393d145bbSTyler Dauwalder 	  rule priority is 0.8.
25493d145bbSTyler Dauwalder 	- 0.3 [10] ('mnop') ('abc') [20] ('xyz')
25593d145bbSTyler Dauwalder 	  The file must contain the string 'abc' at the beginning of the file,
25693d145bbSTyler Dauwalder 	  the string 'mnop' starting at position 10, and the string 'xyz'
25793d145bbSTyler Dauwalder 	  starting at position 20. The rule priority is 0.3.
25893d145bbSTyler Dauwalder 	- 200e-3 (-i 'ab')
25993d145bbSTyler Dauwalder 	  The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
26093d145bbSTyler Dauwalder 	  beginning of the file. The rule priority is 0.2.
26193d145bbSTyler Dauwalder 
26293d145bbSTyler Dauwalder 	Real examples:
26393d145bbSTyler Dauwalder 	- 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
26493d145bbSTyler Dauwalder 	        | [0:32]"#ifdef")
26593d145bbSTyler Dauwalder 	  text/x-source-code
26693d145bbSTyler Dauwalder 	- 0.70 ("8BPS  \000\000\000\000" & 0xffffffff0000ffffffff )
26793d145bbSTyler Dauwalder 	  image/x-photoshop
26893d145bbSTyler Dauwalder 	- 0.40 [0:64]( -i "&lt;HTML" | "&lt;HEAD" | "&lt;TITLE" | "&lt;BODY"
26993d145bbSTyler Dauwalder 			| "&lt;TABLE" | "&lt;!--" | "&lt;META" | "&lt;CENTER")
27093d145bbSTyler Dauwalder 	  text/html
27193d145bbSTyler Dauwalder 
27293d145bbSTyler Dauwalder */
2734574a75fSTyler Dauwalder class Parser {
2744574a75fSTyler Dauwalder public:
2754574a75fSTyler Dauwalder 	Parser();
276390dce8dSTyler Dauwalder 	~Parser();
2774574a75fSTyler Dauwalder 	status_t Parse(const char *rule, Rule *result, BString *parseError = NULL);
2784574a75fSTyler Dauwalder private:
2794574a75fSTyler Dauwalder 	std::string ErrorMessage(Err *err, const char *rule);
2804574a75fSTyler Dauwalder 
281390dce8dSTyler Dauwalder 	// Things that get done a lot :-)
282390dce8dSTyler Dauwalder 	void ThrowEndOfStreamError();
283390dce8dSTyler Dauwalder 	inline void ThrowOutOfMemError(ssize_t pos);
284390dce8dSTyler Dauwalder 	void ThrowUnexpectedTokenError(TokenType expected, const Token *found);
285390dce8dSTyler Dauwalder 	void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found);
286390dce8dSTyler Dauwalder 
2874574a75fSTyler Dauwalder 	// Parsing functions
2884574a75fSTyler Dauwalder 	void ParseRule(Rule *result);
2894574a75fSTyler Dauwalder 	double ParsePriority();
29093d145bbSTyler Dauwalder 	std::vector<DisjList*>* ParseConjList();
29193d145bbSTyler Dauwalder 	DisjList* ParseDisjList();
2924574a75fSTyler Dauwalder 	Range ParseRange();
29393d145bbSTyler Dauwalder 	DisjList* ParsePatternList(Range range);
29493d145bbSTyler Dauwalder 	DisjList* ParseRPatternList();
2954574a75fSTyler Dauwalder 	RPattern* ParseRPattern();
2964574a75fSTyler Dauwalder 	Pattern* ParsePattern();
2974574a75fSTyler Dauwalder 
2984574a75fSTyler Dauwalder 	TokenStream stream;
299390dce8dSTyler Dauwalder 
300390dce8dSTyler Dauwalder 	Err *fOutOfMemErr;
3014574a75fSTyler Dauwalder };
3024574a75fSTyler Dauwalder 
30309d84e61STyler Dauwalder };	// namespace Sniffer
30409d84e61STyler Dauwalder };	// namespace Storage
30509d84e61STyler Dauwalder };	// namespace BPrivate
306674e18fbSTyler Dauwalder 
30782b75665STyler Dauwalder #endif	// _SNIFFER_PARSER_H
308674e18fbSTyler Dauwalder 
30909d84e61STyler Dauwalder 
31009d84e61STyler Dauwalder 
31109d84e61STyler Dauwalder 
312