1 /* 2 * An implementation of what I call the "Sea of Stars" algorithm for 3 * POSIX fnmatch(). The basic idea is that we factor the pattern into 4 * a head component (which we match first and can reject without ever 5 * measuring the length of the string), an optional tail component 6 * (which only exists if the pattern contains at least one star), and 7 * an optional "sea of stars", a set of star-separated components 8 * between the head and tail. After the head and tail matches have 9 * been removed from the input string, the components in the "sea of 10 * stars" are matched sequentially by searching for their first 11 * occurrence past the end of the previous match. 12 * 13 * - Rich Felker, April 2012 14 */ 15 16 #include <string.h> 17 #include <fnmatch.h> 18 #include <stdlib.h> 19 #include <wchar.h> 20 #include <wctype.h> 21 22 #define END 0 23 #define UNMATCHABLE -2 24 #define BRACKET -3 25 #define QUESTION -4 26 #define STAR -5 27 28 static int str_next(const char *str, size_t n, size_t *step) 29 { 30 if (!n) { 31 *step = 0; 32 return 0; 33 } 34 if (str[0] >= 128U) { 35 wchar_t wc; 36 int k = mbtowc(&wc, str, n); 37 if (k<0) { 38 *step = 1; 39 return -1; 40 } 41 *step = k; 42 return wc; 43 } 44 *step = 1; 45 return str[0]; 46 } 47 48 static int pat_next(const char *pat, size_t m, size_t *step, int flags) 49 { 50 int esc = 0; 51 if (!m || !*pat) { 52 *step = 0; 53 return END; 54 } 55 *step = 1; 56 if (pat[0]=='\\' && pat[1] && !(flags & FNM_NOESCAPE)) { 57 *step = 2; 58 pat++; 59 esc = 1; 60 goto escaped; 61 } 62 if (pat[0]=='[') { 63 size_t k = 1; 64 if (k<m) if (pat[k] == '^' || pat[k] == '!') k++; 65 if (k<m) if (pat[k] == ']') k++; 66 for (; k<m && pat[k] && pat[k]!=']'; k++) { 67 if (k+1<m && pat[k+1] && pat[k]=='[' && (pat[k+1]==':' || pat[k+1]=='.' || pat[k+1]=='=')) { 68 int z = pat[k+1]; 69 k+=2; 70 if (k<m && pat[k]) k++; 71 while (k<m && pat[k] && (pat[k-1]!=z || pat[k]!=']')) k++; 72 if (k==m || !pat[k]) break; 73 } 74 } 75 if (k==m || !pat[k]) { 76 *step = 1; 77 return '['; 78 } 79 *step = k+1; 80 return BRACKET; 81 } 82 if (pat[0] == '*') 83 return STAR; 84 if (pat[0] == '?') 85 return QUESTION; 86 escaped: 87 if (pat[0] >= 128U) { 88 wchar_t wc; 89 int k = mbtowc(&wc, pat, m); 90 if (k<0) { 91 *step = 0; 92 return UNMATCHABLE; 93 } 94 *step = k + esc; 95 return wc; 96 } 97 return pat[0]; 98 } 99 100 static int casefold(int k) 101 { 102 int c = towupper(k); 103 return c == k ? towlower(k) : c; 104 } 105 106 static int match_bracket(const char *p, int k, int kfold) 107 { 108 wchar_t wc; 109 int inv = 0; 110 p++; 111 if (*p=='^' || *p=='!') { 112 inv = 1; 113 p++; 114 } 115 if (*p==']') { 116 if (k==']') return !inv; 117 p++; 118 } else if (*p=='-') { 119 if (k=='-') return !inv; 120 p++; 121 } 122 wc = p[-1]; 123 for (; *p != ']'; p++) { 124 if (p[0]=='-' && p[1]!=']') { 125 wchar_t wc2; 126 int l = mbtowc(&wc2, p+1, 4); 127 if (l < 0) return 0; 128 if (wc <= wc2) 129 if ((unsigned)k-wc <= wc2-wc || 130 (unsigned)kfold-wc <= wc2-wc) 131 return !inv; 132 p += l-1; 133 continue; 134 } 135 if (p[0]=='[' && (p[1]==':' || p[1]=='.' || p[1]=='=')) { 136 const char *p0 = p+2; 137 int z = p[1]; 138 p+=3; 139 while (p[-1]!=z || p[0]!=']') p++; 140 if (z == ':' && p-1-p0 < 16) { 141 char buf[16]; 142 memcpy(buf, p0, p-1-p0); 143 buf[p-1-p0] = 0; 144 if (iswctype(k, wctype(buf)) || 145 iswctype(kfold, wctype(buf))) 146 return !inv; 147 } 148 continue; 149 } 150 if (*p < 128U) { 151 wc = (unsigned char)*p; 152 } else { 153 int l = mbtowc(&wc, p, 4); 154 if (l < 0) return 0; 155 p += l-1; 156 } 157 if (wc==k || wc==kfold) return !inv; 158 } 159 return inv; 160 } 161 162 static int fnmatch_internal(const char *pat, size_t m, const char *str, size_t n, int flags) 163 { 164 const char *p, *ptail, *endpat; 165 const char *s, *stail, *endstr; 166 size_t pinc, sinc, tailcnt=0; 167 int c, k, kfold; 168 169 if (flags & FNM_PERIOD) { 170 if (*str == '.' && *pat != '.') 171 return FNM_NOMATCH; 172 } 173 for (;;) { 174 switch ((c = pat_next(pat, m, &pinc, flags))) { 175 case UNMATCHABLE: 176 return FNM_NOMATCH; 177 case STAR: 178 pat++; 179 m--; 180 break; 181 default: 182 k = str_next(str, n, &sinc); 183 if (k <= 0) 184 return (c==END) ? 0 : FNM_NOMATCH; 185 str += sinc; 186 n -= sinc; 187 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; 188 if (c == BRACKET) { 189 if (!match_bracket(pat, k, kfold)) 190 return FNM_NOMATCH; 191 } else if (c != QUESTION && k != c && kfold != c) { 192 return FNM_NOMATCH; 193 } 194 pat+=pinc; 195 m-=pinc; 196 continue; 197 } 198 break; 199 } 200 201 /* Compute real pat length if it was initially unknown/-1 */ 202 m = strnlen(pat, m); 203 endpat = pat + m; 204 205 /* Find the last * in pat and count chars needed after it */ 206 for (p=ptail=pat; p<endpat; p+=pinc) { 207 switch (pat_next(p, endpat-p, &pinc, flags)) { 208 case UNMATCHABLE: 209 return FNM_NOMATCH; 210 case STAR: 211 tailcnt=0; 212 ptail = p+1; 213 break; 214 default: 215 tailcnt++; 216 break; 217 } 218 } 219 220 /* Past this point we need not check for UNMATCHABLE in pat, 221 * because all of pat has already been parsed once. */ 222 223 /* Compute real str length if it was initially unknown/-1 */ 224 n = strnlen(str, n); 225 endstr = str + n; 226 if (n < tailcnt) return FNM_NOMATCH; 227 228 /* Find the final tailcnt chars of str, accounting for UTF-8. 229 * On illegal sequences we may get it wrong, but in that case 230 * we necessarily have a matching failure anyway. */ 231 for (s=endstr; s>str && tailcnt; tailcnt--) { 232 if (s[-1] < 128U) s--; 233 else while ((unsigned char)*--s-0x80U<0x40 && s>str); 234 } 235 if (tailcnt) return FNM_NOMATCH; 236 stail = s; 237 238 /* Check that the pat and str tails match */ 239 p = ptail; 240 for (;;) { 241 c = pat_next(p, endpat-p, &pinc, flags); 242 p += pinc; 243 if ((k = str_next(s, endstr-s, &sinc)) <= 0) { 244 if (c != END) return FNM_NOMATCH; 245 break; 246 } 247 s += sinc; 248 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; 249 if (c == BRACKET) { 250 if (!match_bracket(p-pinc, k, kfold)) 251 return FNM_NOMATCH; 252 } else if (c != QUESTION && k != c && kfold != c) { 253 return FNM_NOMATCH; 254 } 255 } 256 257 /* We're all done with the tails now, so throw them out */ 258 endstr = stail; 259 endpat = ptail; 260 261 /* Match pattern components until there are none left */ 262 while (pat<endpat) { 263 p = pat; 264 s = str; 265 for (;;) { 266 c = pat_next(p, endpat-p, &pinc, flags); 267 p += pinc; 268 /* Encountering * completes/commits a component */ 269 if (c == STAR) { 270 pat = p; 271 str = s; 272 break; 273 } 274 k = str_next(s, endstr-s, &sinc); 275 if (!k) 276 return FNM_NOMATCH; 277 kfold = flags & FNM_CASEFOLD ? casefold(k) : k; 278 if (c == BRACKET) { 279 if (!match_bracket(p-pinc, k, kfold)) 280 break; 281 } else if (c != QUESTION && k != c && kfold != c) { 282 break; 283 } 284 s += sinc; 285 } 286 if (c == STAR) continue; 287 /* If we failed, advance str, by 1 char if it's a valid 288 * char, or past all invalid bytes otherwise. */ 289 k = str_next(str, endstr-str, &sinc); 290 if (k > 0) str += sinc; 291 else for (str++; str_next(str, endstr-str, &sinc)<0; str++); 292 } 293 294 return 0; 295 } 296 297 int fnmatch(const char *pat, const char *str, int flags) 298 { 299 const char *s, *p; 300 size_t inc; 301 int c; 302 if (flags & FNM_PATHNAME) for (;;) { 303 for (s=str; *s && *s!='/'; s++); 304 for (p=pat; (c=pat_next(p, -1, &inc, flags))!=END && c!='/'; p+=inc); 305 if (c!=*s && (!*s || !(flags & FNM_LEADING_DIR))) 306 return FNM_NOMATCH; 307 if (fnmatch_internal(pat, p-pat, str, s-str, flags)) 308 return FNM_NOMATCH; 309 if (!c) return 0; 310 str = s+1; 311 pat = p+inc; 312 } else if (flags & FNM_LEADING_DIR) { 313 for (s=str; *s; s++) { 314 if (*s != '/') continue; 315 if (!fnmatch_internal(pat, -1, str, s-str, flags)) 316 return 0; 317 } 318 } 319 return fnmatch_internal(pat, -1, str, -1, flags); 320 } 321