xref: /haiku/src/add-ons/kernel/file_systems/shared/QueryParserUtils.cpp (revision 21258e2674226d6aa732321b6f8494841895af5f)
1 /*
2  * Copyright 2001-2014, Axel Dörfler, axeld@pinc-software.de.
3  * Copyright 2010, Clemens Zeidler <haiku@clemens-zeidler.de>
4  * Copyright 2011, Ingo Weinhold, ingo_weinhold@gmx.de.
5  * This file may be used under the terms of the MIT License.
6  */
7 
8 
9 // This needs to be the first include because of the fs shell API wrapper
10 #include <algorithm>
11 
12 #include <file_systems/QueryParserUtils.h>
13 
14 #ifndef FS_SHELL
15 #	include <string.h>
16 
17 #	include <TypeConstants.h>
18 #endif
19 
20 
21 namespace QueryParser {
22 
23 
24 template<typename Key>
25 static inline int
26 compare_integral(const Key& a, const Key& b)
27 {
28 	if (a < b)
29 		return -1;
30 	if (a > b)
31 		return 1;
32 	return 0;
33 }
34 
35 
36 // #pragma mark -
37 
38 
39 void
40 skipWhitespace(char** expr, int32 skip)
41 {
42 	char* string = (*expr) + skip;
43 	while (*string == ' ' || *string == '\t') string++;
44 	*expr = string;
45 }
46 
47 
48 void
49 skipWhitespaceReverse(char** expr, char* stop)
50 {
51 	char* string = *expr;
52 	while (string > stop && (*string == ' ' || *string == '\t'))
53 		string--;
54 	*expr = string;
55 }
56 
57 
58 int
59 compareKeys(uint32 type, const void* key1, size_t length1, const void* key2,
60 	size_t length2)
61 {
62 	switch (type) {
63 		case B_INT32_TYPE:
64 			return compare_integral(*(int32*)key1, *(int32*)key2);
65 		case B_UINT32_TYPE:
66 			return compare_integral(*(uint32*)key1, *(uint32*)key2);
67 		case B_INT64_TYPE:
68 			return compare_integral(*(int64*)key1, *(int64*)key2);
69 		case B_UINT64_TYPE:
70 			return compare_integral(*(uint64*)key1, *(uint64*)key2);
71 		case B_FLOAT_TYPE:
72 			return compare_integral(*(float*)key1, *(float*)key2);
73 		case B_DOUBLE_TYPE:
74 			return compare_integral(*(double*)key1, *(double*)key2);
75 		case B_STRING_TYPE:
76 		case B_MIME_STRING_TYPE:
77 		{
78 			int result = strncmp((const char*)key1, (const char*)key2,
79 				std::min(length1, length2));
80 			if (result == 0) {
81 				result = compare_integral(strnlen((const char*)key1, length1),
82 					strnlen((const char*)key2, length2));
83 			}
84 			return result;
85 		}
86 	}
87 	return -1;
88 }
89 
90 
91 //	#pragma mark -
92 
93 
94 uint32
95 utf8ToUnicode(char** string)
96 {
97 	uint8* bytes = (uint8*)*string;
98 	int32 length;
99 	uint8 mask = 0x1f;
100 
101 	switch (bytes[0] & 0xf0) {
102 		case 0xc0:
103 		case 0xd0:
104 			length = 2;
105 			break;
106 		case 0xe0:
107 			length = 3;
108 			break;
109 		case 0xf0:
110 			mask = 0x0f;
111 			length = 4;
112 			break;
113 		default:
114 			// valid 1-byte character
115 			// and invalid characters
116 			(*string)++;
117 			return bytes[0];
118 	}
119 	uint32 c = bytes[0] & mask;
120 	int32 i = 1;
121 	for (; i < length && (bytes[i] & 0x80) > 0; i++)
122 		c = (c << 6) | (bytes[i] & 0x3f);
123 
124 	if (i < length) {
125 		// invalid character
126 		(*string)++;
127 		return (uint32)bytes[0];
128 	}
129 	*string += length;
130 	return c;
131 }
132 
133 
134 int32
135 getFirstPatternSymbol(char* string)
136 {
137 	char c;
138 
139 	for (int32 index = 0; (c = *string++); index++) {
140 		if (c == '*' || c == '?' || c == '[')
141 			return index;
142 	}
143 	return -1;
144 }
145 
146 
147 status_t
148 isValidPattern(char* pattern)
149 {
150 	while (*pattern) {
151 		switch (*pattern++) {
152 			case '\\':
153 				// the escape character must not be at the end of the pattern
154 				if (!*pattern++)
155 					return PATTERN_INVALID_ESCAPE;
156 				break;
157 
158 			case '[':
159 				if (pattern[0] == ']' || !pattern[0])
160 					return PATTERN_INVALID_SET;
161 
162 				while (*pattern != ']') {
163 					if (*pattern == '\\' && !*++pattern)
164 						return PATTERN_INVALID_ESCAPE;
165 
166 					if (!*pattern)
167 						return PATTERN_INVALID_SET;
168 
169 					if (pattern[0] == '-' && pattern[1] == '-')
170 						return PATTERN_INVALID_RANGE;
171 
172 					pattern++;
173 				}
174 				break;
175 		}
176 	}
177 	return B_OK;
178 }
179 
180 
181 /*!	Matches the string against the given wildcard pattern.
182 	Returns either MATCH_OK, or NO_MATCH when everything went fine, or
183 	values < 0 (see enum at the top of Query.cpp) if an error occurs.
184 */
185 status_t
186 matchString(char* pattern, char* string)
187 {
188 	while (*pattern) {
189 		// end of string == valid end of pattern?
190 		if (!string[0]) {
191 			while (pattern[0] == '*')
192 				pattern++;
193 			return !pattern[0] ? MATCH_OK : NO_MATCH;
194 		}
195 
196 		switch (*pattern++) {
197 			case '?':
198 			{
199 				// match exactly one UTF-8 character; we are
200 				// not interested in the result
201 				utf8ToUnicode(&string);
202 				break;
203 			}
204 
205 			case '*':
206 			{
207 				// compact pattern
208 				while (true) {
209 					if (pattern[0] == '?') {
210 						if (!*++string)
211 							return NO_MATCH;
212 					} else if (pattern[0] != '*')
213 						break;
214 
215 					pattern++;
216 				}
217 
218 				// if the pattern is done, we have matched the string
219 				if (!pattern[0])
220 					return MATCH_OK;
221 
222 				while(true) {
223 					// we have removed all occurences of '*' and '?'
224 					if (pattern[0] == string[0]
225 						|| pattern[0] == '['
226 						|| pattern[0] == '\\') {
227 						status_t status = matchString(pattern, string);
228 						if (status < B_OK || status == MATCH_OK)
229 							return status;
230 					}
231 
232 					// we could be nice here and just jump to the next
233 					// UTF-8 character - but we wouldn't gain that much
234 					// and it'd be slower (since we're checking for
235 					// equality before entering the recursion)
236 					if (!*++string)
237 						return NO_MATCH;
238 				}
239 				break;
240 			}
241 
242 			case '[':
243 			{
244 				bool invert = false;
245 				if (pattern[0] == '^' || pattern[0] == '!') {
246 					invert = true;
247 					pattern++;
248 				}
249 
250 				if (!pattern[0] || pattern[0] == ']')
251 					return MATCH_BAD_PATTERN;
252 
253 				uint32 c = utf8ToUnicode(&string);
254 				bool matched = false;
255 
256 				while (pattern[0] != ']') {
257 					if (!pattern[0])
258 						return MATCH_BAD_PATTERN;
259 
260 					if (pattern[0] == '\\')
261 						pattern++;
262 
263 					uint32 first = utf8ToUnicode(&pattern);
264 
265 					// Does this character match, or is this a range?
266 					if (first == c) {
267 						matched = true;
268 						break;
269 					} else if (pattern[0] == '-' && pattern[1] != ']'
270 							&& pattern[1]) {
271 						pattern++;
272 
273 						if (pattern[0] == '\\') {
274 							pattern++;
275 							if (!pattern[0])
276 								return MATCH_BAD_PATTERN;
277 						}
278 						uint32 last = utf8ToUnicode(&pattern);
279 
280 						if (c >= first && c <= last) {
281 							matched = true;
282 							break;
283 						}
284 					}
285 				}
286 
287 				if (invert)
288 					matched = !matched;
289 
290 				if (matched) {
291 					while (pattern[0] != ']') {
292 						if (!pattern[0])
293 							return MATCH_BAD_PATTERN;
294 						pattern++;
295 					}
296 					pattern++;
297 					break;
298 				}
299 				return NO_MATCH;
300 			}
301 
302             case '\\':
303 				if (!pattern[0])
304 					return MATCH_BAD_PATTERN;
305 				// supposed to fall through
306 			default:
307 				if (pattern[-1] != string[0])
308 					return NO_MATCH;
309 				string++;
310 				break;
311 		}
312 	}
313 
314 	if (string[0])
315 		return NO_MATCH;
316 
317 	return MATCH_OK;
318 }
319 
320 
321 }	// namespace QueryParser
322