xref: /haiku/src/add-ons/kernel/file_systems/shared/QueryParserUtils.cpp (revision 19ae20e67e91fc09cc9fc5c0e60e21e24e7a53eb)
1 /*
2  * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
3  * Copyright 2010, Clemens Zeidler <haiku@clemens-zeidler.de>
4  * Copyright 2011, Ingo Weinhold, ingo_weinhold@gmx.de.
5  * This file may be used under the terms of the MIT License.
6  */
7 
8 
9 #include <file_systems/QueryParserUtils.h>
10 
11 #include <string.h>
12 
13 #include <algorithm>
14 
15 #include <TypeConstants.h>
16 
17 
18 namespace QueryParser {
19 
20 
21 template<typename Key>
22 static inline int
23 compare_integral(const Key& a, const Key& b)
24 {
25 	if (a < b)
26 		return -1;
27 	else if (a > b)
28 		return 1;
29 	return 0;
30 }
31 
32 
33 // #pragma mark -
34 
35 
36 void
37 skipWhitespace(char** expr, int32 skip)
38 {
39 	char* string = (*expr) + skip;
40 	while (*string == ' ' || *string == '\t') string++;
41 	*expr = string;
42 }
43 
44 
45 void
46 skipWhitespaceReverse(char** expr, char* stop)
47 {
48 	char* string = *expr;
49 	while (string > stop && (*string == ' ' || *string == '\t'))
50 		string--;
51 	*expr = string;
52 }
53 
54 
55 int
56 compareKeys(uint32 type, const void* key1, size_t length1, const void* key2,
57 	size_t length2)
58 {
59 	switch (type) {
60 		case B_INT32_TYPE:
61 			return compare_integral(*(int32*)key1, *(int32*)key2);
62 		case B_UINT32_TYPE:
63 			return compare_integral(*(uint32*)key1, *(uint32*)key2);
64 		case B_INT64_TYPE:
65 			return compare_integral(*(int64*)key1, *(int64*)key2);
66 		case B_UINT64_TYPE:
67 			return compare_integral(*(uint64*)key1, *(uint64*)key2);
68 		case B_FLOAT_TYPE:
69 			return compare_integral(*(float*)key1, *(float*)key2);
70 		case B_DOUBLE_TYPE:
71 			return compare_integral(*(double*)key1, *(double*)key2);
72 		case B_STRING_TYPE:
73 		case B_MIME_STRING_TYPE:
74 		{
75 			int result = strncmp((const char*)key1, (const char*)key2,
76 				std::min(length1, length2));
77 			if (result == 0) {
78 				result = compare_integral(strnlen((const char*)key1, length1),
79 					strnlen((const char*)key2, length2));
80 			}
81 			return result;
82 		}
83 	}
84 	return -1;
85 }
86 
87 
88 //	#pragma mark -
89 
90 
91 uint32
92 utf8ToUnicode(char** string)
93 {
94 	uint8* bytes = (uint8*)*string;
95 	int32 length;
96 	uint8 mask = 0x1f;
97 
98 	switch (bytes[0] & 0xf0) {
99 		case 0xc0:
100 		case 0xd0:
101 			length = 2;
102 			break;
103 		case 0xe0:
104 			length = 3;
105 			break;
106 		case 0xf0:
107 			mask = 0x0f;
108 			length = 4;
109 			break;
110 		default:
111 			// valid 1-byte character
112 			// and invalid characters
113 			(*string)++;
114 			return bytes[0];
115 	}
116 	uint32 c = bytes[0] & mask;
117 	int32 i = 1;
118 	for (; i < length && (bytes[i] & 0x80) > 0; i++)
119 		c = (c << 6) | (bytes[i] & 0x3f);
120 
121 	if (i < length) {
122 		// invalid character
123 		(*string)++;
124 		return (uint32)bytes[0];
125 	}
126 	*string += length;
127 	return c;
128 }
129 
130 
131 int32
132 getFirstPatternSymbol(char* string)
133 {
134 	char c;
135 
136 	for (int32 index = 0; (c = *string++); index++) {
137 		if (c == '*' || c == '?' || c == '[')
138 			return index;
139 	}
140 	return -1;
141 }
142 
143 
144 status_t
145 isValidPattern(char* pattern)
146 {
147 	while (*pattern) {
148 		switch (*pattern++) {
149 			case '\\':
150 				// the escape character must not be at the end of the pattern
151 				if (!*pattern++)
152 					return PATTERN_INVALID_ESCAPE;
153 				break;
154 
155 			case '[':
156 				if (pattern[0] == ']' || !pattern[0])
157 					return PATTERN_INVALID_SET;
158 
159 				while (*pattern != ']') {
160 					if (*pattern == '\\' && !*++pattern)
161 						return PATTERN_INVALID_ESCAPE;
162 
163 					if (!*pattern)
164 						return PATTERN_INVALID_SET;
165 
166 					if (pattern[0] == '-' && pattern[1] == '-')
167 						return PATTERN_INVALID_RANGE;
168 
169 					pattern++;
170 				}
171 				break;
172 		}
173 	}
174 	return B_OK;
175 }
176 
177 
178 /*!	Matches the string against the given wildcard pattern.
179 	Returns either MATCH_OK, or NO_MATCH when everything went fine, or
180 	values < 0 (see enum at the top of Query.cpp) if an error occurs.
181 */
182 status_t
183 matchString(char* pattern, char* string)
184 {
185 	while (*pattern) {
186 		// end of string == valid end of pattern?
187 		if (!string[0]) {
188 			while (pattern[0] == '*')
189 				pattern++;
190 			return !pattern[0] ? MATCH_OK : NO_MATCH;
191 		}
192 
193 		switch (*pattern++) {
194 			case '?':
195 			{
196 				// match exactly one UTF-8 character; we are
197 				// not interested in the result
198 				utf8ToUnicode(&string);
199 				break;
200 			}
201 
202 			case '*':
203 			{
204 				// compact pattern
205 				while (true) {
206 					if (pattern[0] == '?') {
207 						if (!*++string)
208 							return NO_MATCH;
209 					} else if (pattern[0] != '*')
210 						break;
211 
212 					pattern++;
213 				}
214 
215 				// if the pattern is done, we have matched the string
216 				if (!pattern[0])
217 					return MATCH_OK;
218 
219 				while(true) {
220 					// we have removed all occurences of '*' and '?'
221 					if (pattern[0] == string[0]
222 						|| pattern[0] == '['
223 						|| pattern[0] == '\\') {
224 						status_t status = matchString(pattern, string);
225 						if (status < B_OK || status == MATCH_OK)
226 							return status;
227 					}
228 
229 					// we could be nice here and just jump to the next
230 					// UTF-8 character - but we wouldn't gain that much
231 					// and it'd be slower (since we're checking for
232 					// equality before entering the recursion)
233 					if (!*++string)
234 						return NO_MATCH;
235 				}
236 				break;
237 			}
238 
239 			case '[':
240 			{
241 				bool invert = false;
242 				if (pattern[0] == '^' || pattern[0] == '!') {
243 					invert = true;
244 					pattern++;
245 				}
246 
247 				if (!pattern[0] || pattern[0] == ']')
248 					return MATCH_BAD_PATTERN;
249 
250 				uint32 c = utf8ToUnicode(&string);
251 				bool matched = false;
252 
253 				while (pattern[0] != ']') {
254 					if (!pattern[0])
255 						return MATCH_BAD_PATTERN;
256 
257 					if (pattern[0] == '\\')
258 						pattern++;
259 
260 					uint32 first = utf8ToUnicode(&pattern);
261 
262 					// Does this character match, or is this a range?
263 					if (first == c) {
264 						matched = true;
265 						break;
266 					} else if (pattern[0] == '-' && pattern[1] != ']'
267 							&& pattern[1]) {
268 						pattern++;
269 
270 						if (pattern[0] == '\\') {
271 							pattern++;
272 							if (!pattern[0])
273 								return MATCH_BAD_PATTERN;
274 						}
275 						uint32 last = utf8ToUnicode(&pattern);
276 
277 						if (c >= first && c <= last) {
278 							matched = true;
279 							break;
280 						}
281 					}
282 				}
283 
284 				if (invert)
285 					matched = !matched;
286 
287 				if (matched) {
288 					while (pattern[0] != ']') {
289 						if (!pattern[0])
290 							return MATCH_BAD_PATTERN;
291 						pattern++;
292 					}
293 					pattern++;
294 					break;
295 				}
296 				return NO_MATCH;
297 			}
298 
299             case '\\':
300 				if (!pattern[0])
301 					return MATCH_BAD_PATTERN;
302 				// supposed to fall through
303 			default:
304 				if (pattern[-1] != string[0])
305 					return NO_MATCH;
306 				string++;
307 				break;
308 		}
309 	}
310 
311 	if (string[0])
312 		return NO_MATCH;
313 
314 	return MATCH_OK;
315 }
316 
317 
318 }	// namespace QueryParser
319