xref: /haiku/src/add-ons/kernel/file_systems/shared/QueryParserUtils.cpp (revision b8a45b3a2df2379b4301bf3bd5949b9a105be4ba)
1 /*
2  * Copyright 2001-2014, Axel Dörfler, axeld@pinc-software.de.
3  * Copyright 2010, Clemens Zeidler <haiku@clemens-zeidler.de>
4  * Copyright 2011, Ingo Weinhold, ingo_weinhold@gmx.de.
5  * This file may be used under the terms of the MIT License.
6  */
7 
8 
9 // This needs to be the first include because of the fs shell API wrapper
10 #include <algorithm>
11 
12 #include <file_systems/QueryParserUtils.h>
13 
14 #ifndef FS_SHELL
15 #	include <string.h>
16 
17 #	include <TypeConstants.h>
18 #endif
19 
20 
21 namespace QueryParser {
22 
23 
24 template<typename Key>
25 static inline int
26 compare_integral(const Key& a, const Key& b)
27 {
28 	if (a < b)
29 		return -1;
30 	if (a > b)
31 		return 1;
32 	return 0;
33 }
34 
35 
36 // #pragma mark -
37 
38 
39 void
40 skipWhitespace(const char** expr, int32 skip)
41 {
42 	const char* string = (*expr) + skip;
43 	while (*string == ' ' || *string == '\t') string++;
44 	*expr = string;
45 }
46 
47 
48 void
49 skipWhitespaceReverse(const char** expr, const char* stop)
50 {
51 	const char* string = *expr;
52 	while (string > stop && (*string == ' ' || *string == '\t'))
53 		string--;
54 	*expr = string;
55 }
56 
57 
58 int
59 compareKeys(uint32 type, const void* key1, size_t length1, const void* key2,
60 	size_t length2)
61 {
62 	switch (type) {
63 		case B_INT32_TYPE:
64 			return compare_integral(*(int32*)key1, *(int32*)key2);
65 		case B_UINT32_TYPE:
66 			return compare_integral(*(uint32*)key1, *(uint32*)key2);
67 		case B_INT64_TYPE:
68 			return compare_integral(*(int64*)key1, *(int64*)key2);
69 		case B_UINT64_TYPE:
70 			return compare_integral(*(uint64*)key1, *(uint64*)key2);
71 		case B_FLOAT_TYPE:
72 			return compare_integral(*(float*)key1, *(float*)key2);
73 		case B_DOUBLE_TYPE:
74 			return compare_integral(*(double*)key1, *(double*)key2);
75 		case B_STRING_TYPE:
76 		case B_MIME_STRING_TYPE:
77 		{
78 			int result = strncmp((const char*)key1, (const char*)key2,
79 				std::min(length1, length2));
80 			if (result == 0) {
81 				result = compare_integral(strnlen((const char*)key1, length1),
82 					strnlen((const char*)key2, length2));
83 			}
84 			return result;
85 		}
86 	}
87 	return -1;
88 }
89 
90 
91 //	#pragma mark -
92 
93 
94 uint32
95 utf8ToUnicode(const char** string)
96 {
97 	uint8* bytes = (uint8*)*string;
98 	int32 length;
99 	uint8 mask = 0x1f;
100 
101 	switch (bytes[0] & 0xf0) {
102 		case 0xc0:
103 		case 0xd0:
104 			length = 2;
105 			break;
106 		case 0xe0:
107 			length = 3;
108 			break;
109 		case 0xf0:
110 			mask = 0x0f;
111 			length = 4;
112 			break;
113 		default:
114 			// valid 1-byte character
115 			// and invalid characters
116 			(*string)++;
117 			return bytes[0];
118 	}
119 	uint32 c = bytes[0] & mask;
120 	int32 i = 1;
121 	for (; i < length && (bytes[i] & 0x80) > 0; i++)
122 		c = (c << 6) | (bytes[i] & 0x3f);
123 
124 	if (i < length) {
125 		// invalid character
126 		(*string)++;
127 		return (uint32)bytes[0];
128 	}
129 	*string += length;
130 	return c;
131 }
132 
133 
134 int32
135 getFirstPatternSymbol(const char* string)
136 {
137 	char c;
138 
139 	for (int32 index = 0; (c = *string++); index++) {
140 		if (c == '*' || c == '?' || c == '[')
141 			return index;
142 	}
143 	return -1;
144 }
145 
146 
147 status_t
148 isValidPattern(const char* pattern)
149 {
150 	while (*pattern) {
151 		switch (*pattern++) {
152 			case '\\':
153 				// the escape character must not be at the end of the pattern
154 				if (!*pattern++)
155 					return PATTERN_INVALID_ESCAPE;
156 				break;
157 
158 			case '[':
159 				if (pattern[0] == ']' || !pattern[0])
160 					return PATTERN_INVALID_SET;
161 
162 				while (*pattern != ']') {
163 					if (*pattern == '\\' && !*++pattern)
164 						return PATTERN_INVALID_ESCAPE;
165 
166 					if (!*pattern)
167 						return PATTERN_INVALID_SET;
168 
169 					if (pattern[0] == '-' && pattern[1] == '-')
170 						return PATTERN_INVALID_RANGE;
171 
172 					pattern++;
173 				}
174 				break;
175 		}
176 	}
177 	return B_OK;
178 }
179 
180 
181 /*!	Matches the string against the given wildcard pattern.
182 	Returns either MATCH_OK, or NO_MATCH when everything went fine, or
183 	values < 0 (see enum at the top of Query.cpp) if an error occurs.
184 
185 	The pattern matching is roughly based on code originally written
186 	by J. Kercheval, and on code written by Kenneth Almquist, though
187 	it shares no code.
188 */
189 status_t
190 matchString(const char* pattern, const char* string)
191 {
192 	while (*pattern) {
193 		// end of string == valid end of pattern?
194 		if (!string[0]) {
195 			while (pattern[0] == '*')
196 				pattern++;
197 			return !pattern[0] ? MATCH_OK : NO_MATCH;
198 		}
199 
200 		switch (*pattern++) {
201 			case '?':
202 			{
203 				// match exactly one UTF-8 character; we are
204 				// not interested in the result
205 				utf8ToUnicode(&string);
206 				break;
207 			}
208 
209 			case '*':
210 			{
211 				// compact pattern
212 				while (true) {
213 					if (pattern[0] == '?') {
214 						if (!*++string)
215 							return NO_MATCH;
216 					} else if (pattern[0] != '*')
217 						break;
218 
219 					pattern++;
220 				}
221 
222 				// if the pattern is done, we have matched the string
223 				if (!pattern[0])
224 					return MATCH_OK;
225 
226 				while(true) {
227 					// we have removed all occurences of '*' and '?'
228 					if (pattern[0] == string[0]
229 						|| pattern[0] == '['
230 						|| pattern[0] == '\\') {
231 						status_t status = matchString(pattern, string);
232 						if (status < B_OK || status == MATCH_OK)
233 							return status;
234 					}
235 
236 					// we could be nice here and just jump to the next
237 					// UTF-8 character - but we wouldn't gain that much
238 					// and it'd be slower (since we're checking for
239 					// equality before entering the recursion)
240 					if (!*++string)
241 						return NO_MATCH;
242 				}
243 				break;
244 			}
245 
246 			case '[':
247 			{
248 				bool invert = false;
249 				if (pattern[0] == '^' || pattern[0] == '!') {
250 					invert = true;
251 					pattern++;
252 				}
253 
254 				if (!pattern[0] || pattern[0] == ']')
255 					return MATCH_BAD_PATTERN;
256 
257 				uint32 c = utf8ToUnicode(&string);
258 				bool matched = false;
259 
260 				while (pattern[0] != ']') {
261 					if (!pattern[0])
262 						return MATCH_BAD_PATTERN;
263 
264 					if (pattern[0] == '\\')
265 						pattern++;
266 
267 					uint32 first = utf8ToUnicode(&pattern);
268 
269 					// Does this character match, or is this a range?
270 					if (first == c) {
271 						matched = true;
272 						break;
273 					} else if (pattern[0] == '-' && pattern[1] != ']'
274 							&& pattern[1]) {
275 						pattern++;
276 
277 						if (pattern[0] == '\\') {
278 							pattern++;
279 							if (!pattern[0])
280 								return MATCH_BAD_PATTERN;
281 						}
282 						uint32 last = utf8ToUnicode(&pattern);
283 
284 						if (c >= first && c <= last) {
285 							matched = true;
286 							break;
287 						}
288 					}
289 				}
290 
291 				if (invert)
292 					matched = !matched;
293 
294 				if (matched) {
295 					while (pattern[0] != ']') {
296 						if (!pattern[0])
297 							return MATCH_BAD_PATTERN;
298 						pattern++;
299 					}
300 					pattern++;
301 					break;
302 				}
303 				return NO_MATCH;
304 			}
305 
306             case '\\':
307 				if (!pattern[0])
308 					return MATCH_BAD_PATTERN;
309 				// supposed to fall through
310 			default:
311 				if (pattern[-1] != string[0])
312 					return NO_MATCH;
313 				string++;
314 				break;
315 		}
316 	}
317 
318 	if (string[0])
319 		return NO_MATCH;
320 
321 	return MATCH_OK;
322 }
323 
324 
325 }	// namespace QueryParser
326