xref: /haiku/src/system/libroot/posix/musl/regex/fnmatch.c (revision 657f041aee41527c4f33e5194aeb4ebcc1d210dc)
1 /*
2  * An implementation of what I call the "Sea of Stars" algorithm for
3  * POSIX fnmatch(). The basic idea is that we factor the pattern into
4  * a head component (which we match first and can reject without ever
5  * measuring the length of the string), an optional tail component
6  * (which only exists if the pattern contains at least one star), and
7  * an optional "sea of stars", a set of star-separated components
8  * between the head and tail. After the head and tail matches have
9  * been removed from the input string, the components in the "sea of
10  * stars" are matched sequentially by searching for their first
11  * occurrence past the end of the previous match.
12  *
13  * - Rich Felker, April 2012
14  */
15 
16 #include <string.h>
17 #include <fnmatch.h>
18 #include <stdlib.h>
19 #include <wchar.h>
20 #include <wctype.h>
21 
22 #define END 0
23 #define UNMATCHABLE -2
24 #define BRACKET -3
25 #define QUESTION -4
26 #define STAR -5
27 
str_next(const char * str,size_t n,size_t * step)28 static int str_next(const char *str, size_t n, size_t *step)
29 {
30 	if (!n) {
31 		*step = 0;
32 		return 0;
33 	}
34 	if (str[0] >= 128U) {
35 		wchar_t wc;
36 		int k = mbtowc(&wc, str, n);
37 		if (k<0) {
38 			*step = 1;
39 			return -1;
40 		}
41 		*step = k;
42 		return wc;
43 	}
44 	*step = 1;
45 	return str[0];
46 }
47 
pat_next(const char * pat,size_t m,size_t * step,int flags)48 static int pat_next(const char *pat, size_t m, size_t *step, int flags)
49 {
50 	int esc = 0;
51 	if (!m || !*pat) {
52 		*step = 0;
53 		return END;
54 	}
55 	*step = 1;
56 	if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) {
57 		*step = 2;
58 		pat++;
59 		esc = 1;
60 		goto escaped;
61 	}
62 	if (pat[0]=='[') {
63 		size_t k = 1;
64 		if (k<m) if (pat[k] == '^' || pat[k] == '!') k++;
65 		if (k<m) if (pat[k] == ']') k++;
66 		for (; k<m && pat[k] && pat[k]!=']'; k++) {
67 			if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) {
68 				int z = pat[k+1];
69 				k+=2;
70 				if (k<m && pat[k]) k++;
71 				while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++;
72 				if (k==m || !pat[k]) break;
73 			}
74 		}
75 		if (k==m || !pat[k]) {
76 			*step = 1;
77 			return '[';
78 		}
79 		*step = k+1;
80 		return BRACKET;
81 	}
82 	if (pat[0] == '*')
83 		return STAR;
84 	if (pat[0] == '?')
85 		return QUESTION;
86 escaped:
87 	if (pat[0] >= 128U) {
88 		wchar_t wc;
89 		int k = mbtowc(&wc, pat, m);
90 		if (k<0) {
91 			*step = 0;
92 			return UNMATCHABLE;
93 		}
94 		*step = k + esc;
95 		return wc;
96 	}
97 	return pat[0];
98 }
99 
casefold(int k)100 static int casefold(int k)
101 {
102 	int c = towupper(k);
103 	return c == k ? towlower(k) : c;
104 }
105 
match_bracket(const char * p,int k,int kfold)106 static int match_bracket(const char *p, int k, int kfold)
107 {
108 	wchar_t wc;
109 	int inv = 0;
110 	p++;
111 	if (*p=='^' || *p=='!') {
112 		inv = 1;
113 		p++;
114 	}
115 	if (*p==']') {
116 		if (k==']') return !inv;
117 		p++;
118 	} else if (*p=='-') {
119 		if (k=='-') return !inv;
120 		p++;
121 	}
122 	wc = p[-1];
123 	for (; *p != ']'; p++) {
124 		if (p[0]=='-' && p[1]!=']') {
125 			wchar_t wc2;
126 			int l = mbtowc(&wc2, p+1, 4);
127 			if (l < 0) return 0;
128 			if (wc <= wc2)
129 				if ((unsigned)k-wc <= wc2-wc ||
130 				    (unsigned)kfold-wc <= wc2-wc)
131 					return !inv;
132 			p += l-1;
133 			continue;
134 		}
135 		if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) {
136 			const char *p0 = p+2;
137 			int z = p[1];
138 			p+=3;
139 			while (p[-1]!=z || p[0]!=']') p++;
140 			if (z == ':' && p-1-p0 < 16) {
141 				char buf[16];
142 				memcpy(buf, p0, p-1-p0);
143 				buf[p-1-p0] = 0;
144 				if (iswctype(k, wctype(buf)) ||
145 				    iswctype(kfold, wctype(buf)))
146 					return !inv;
147 			}
148 			continue;
149 		}
150 		if (*p < 128U) {
151 			wc = (unsigned char)*p;
152 		} else {
153 			int l = mbtowc(&wc, p, 4);
154 			if (l < 0) return 0;
155 			p += l-1;
156 		}
157 		if (wc==k || wc==kfold) return !inv;
158 	}
159 	return inv;
160 }
161 
fnmatch_internal(const char * pat,size_t m,const char * str,size_t n,int flags)162 static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags)
163 {
164 	const char *p, *ptail, *endpat;
165 	const char *s, *stail, *endstr;
166 	size_t pinc, sinc, tailcnt=0;
167 	int c, k, kfold;
168 
169 	if (flags & FNM_PERIOD) {
170 		if (*str == '.' && *pat != '.')
171 			return FNM_NOMATCH;
172 	}
173 	for (;;) {
174 		switch ((c = pat_next(pat, m, &pinc, flags))) {
175 		case UNMATCHABLE:
176 			return FNM_NOMATCH;
177 		case STAR:
178 			pat++;
179 			m--;
180 			break;
181 		default:
182 			k = str_next(str, n, &sinc);
183 			if (k <= 0)
184 				return (c==END) ? 0 : FNM_NOMATCH;
185 			str += sinc;
186 			n -= sinc;
187 			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
188 			if (c == BRACKET) {
189 				if (!match_bracket(pat, k, kfold))
190 					return FNM_NOMATCH;
191 			} else if (c != QUESTION && k != c && kfold != c) {
192 				return FNM_NOMATCH;
193 			}
194 			pat+=pinc;
195 			m-=pinc;
196 			continue;
197 		}
198 		break;
199 	}
200 
201 	/* Compute real pat length if it was initially unknown/-1 */
202 	m = strnlen(pat, m);
203 	endpat = pat + m;
204 
205 	/* Find the last * in pat and count chars needed after it */
206 	for (p=ptail=pat; p<endpat; p+=pinc) {
207 		switch (pat_next(p, endpat-p, &pinc, flags)) {
208 		case UNMATCHABLE:
209 			return FNM_NOMATCH;
210 		case STAR:
211 			tailcnt=0;
212 			ptail = p+1;
213 			break;
214 		default:
215 			tailcnt++;
216 			break;
217 		}
218 	}
219 
220 	/* Past this point we need not check for UNMATCHABLE in pat,
221 	 * because all of pat has already been parsed once. */
222 
223 	/* Compute real str length if it was initially unknown/-1 */
224 	n = strnlen(str, n);
225 	endstr = str + n;
226 	if (n < tailcnt) return FNM_NOMATCH;
227 
228 	/* Find the final tailcnt chars of str, accounting for UTF-8.
229 	 * On illegal sequences we may get it wrong, but in that case
230 	 * we necessarily have a matching failure anyway. */
231 	for (s=endstr; s>str && tailcnt; tailcnt--) {
232 		if (s[-1] < 128U) s--;
233 		else while ((unsigned char)*--s-0x80U<0x40 && s>str);
234 	}
235 	if (tailcnt) return FNM_NOMATCH;
236 	stail = s;
237 
238 	/* Check that the pat and str tails match */
239 	p = ptail;
240 	for (;;) {
241 		c = pat_next(p, endpat-p, &pinc, flags);
242 		p += pinc;
243 		if ((k = str_next(s, endstr-s, &sinc)) <= 0) {
244 			if (c != END) return FNM_NOMATCH;
245 			break;
246 		}
247 		s += sinc;
248 		kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
249 		if (c == BRACKET) {
250 			if (!match_bracket(p-pinc, k, kfold))
251 				return FNM_NOMATCH;
252 		} else if (c != QUESTION && k != c && kfold != c) {
253 			return FNM_NOMATCH;
254 		}
255 	}
256 
257 	/* We're all done with the tails now, so throw them out */
258 	endstr = stail;
259 	endpat = ptail;
260 
261 	/* Match pattern components until there are none left */
262 	while (pat<endpat) {
263 		p = pat;
264 		s = str;
265 		for (;;) {
266 			c = pat_next(p, endpat-p, &pinc, flags);
267 			p += pinc;
268 			/* Encountering * completes/commits a component */
269 			if (c == STAR) {
270 				pat = p;
271 				str = s;
272 				break;
273 			}
274 			k = str_next(s, endstr-s, &sinc);
275 			if (!k)
276 				return FNM_NOMATCH;
277 			kfold = flags & FNM_CASEFOLD ? casefold(k) : k;
278 			if (c == BRACKET) {
279 				if (!match_bracket(p-pinc, k, kfold))
280 					break;
281 			} else if (c != QUESTION && k != c && kfold != c) {
282 				break;
283 			}
284 			s += sinc;
285 		}
286 		if (c == STAR) continue;
287 		/* If we failed, advance str, by 1 char if it's a valid
288 		 * char, or past all invalid bytes otherwise. */
289 		k = str_next(str, endstr-str, &sinc);
290 		if (k > 0) str += sinc;
291 		else for (str++; str_next(str, endstr-str, &sinc)<0; str++);
292 	}
293 
294 	return 0;
295 }
296 
fnmatch(const char * pat,const char * str,int flags)297 int fnmatch(const char *pat, const char *str, int flags)
298 {
299 	const char *s, *p;
300 	size_t inc;
301 	int c;
302 	if (flags & FNM_PATHNAME) for (;;) {
303 		for (s=str; *s && *s!='/'; s++);
304 		for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc);
305 		if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR)))
306 			return FNM_NOMATCH;
307 		if (fnmatch_internal(pat, p-pat, str, s-str, flags))
308 			return FNM_NOMATCH;
309 		if (!c) return 0;
310 		str = s+1;
311 		pat = p+inc;
312 	} else if (flags & FNM_LEADING_DIR) {
313 		for (s=str; *s; s++) {
314 			if (*s != '/') continue;
315 			if (!fnmatch_internal(pat, -1, str, s-str, flags))
316 				return 0;
317 		}
318 	}
319 	return fnmatch_internal(pat, -1, str, -1, flags);
320 }
321