1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
131
132 static const char __re_error_msgid[] =
133 {
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 };
185
186 static const size_t __re_error_msgid_idx[] =
187 {
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
205 };
206
207 /* Entry points for GNU code. */
208
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
212
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
215
216 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
219 {
220 reg_errcode_t ret;
221
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
229
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
239
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
246
247
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
251
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
254
255 reg_syntax_t
256 re_set_syntax (reg_syntax_t syntax)
257 {
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262 }
263 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266
267 int
268 re_compile_fastmap (struct re_pattern_buffer *bufp)
269 {
270 re_dfa_t *dfa = bufp->buffer;
271 char *fastmap = bufp->fastmap;
272
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
283 }
284 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
287
288 static inline void
289 __attribute__ ((always_inline))
290 re_set_fastmap (char *fastmap, bool icase, int ch)
291 {
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
295 }
296
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
299
300 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
303 {
304 re_dfa_t *dfa = bufp->buffer;
305 Idx node_cnt;
306 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308 {
309 Idx node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
311
312 if (type == CHARACTER)
313 {
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317 {
318 unsigned char buf[MB_LEN_MAX];
319 unsigned char *p;
320 wchar_t wc;
321 mbstate_t state;
322
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, false, buf[0]);
335 }
336 #endif
337 }
338 else if (type == SIMPLE_BRACKET)
339 {
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 {
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
348 }
349 }
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
352 {
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 Idx i;
355
356 # ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
365 {
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
371 }
372 # endif /* _LIBC */
373
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 || cset->nequiv_classes
382 # endif /* _LIBC */
383 ))
384 {
385 unsigned char c = 0;
386 do
387 {
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
392 }
393 while (++c != 0);
394 }
395
396 else
397 {
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
400 {
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 {
408 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 }
412 }
413 }
414 }
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
421 {
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
426 }
427 }
428 }
429
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
432
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
435
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
445
446 PATTERN is the address of the pattern string.
447
448 CFLAGS is a series of bits which affect compilation.
449
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
452
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
455
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
458
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
462
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
465
466 int
regcomp(regex_t * _Restrict_ preg,const char * _Restrict_ pattern,int cflags)467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
468 {
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
472
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
476
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (BE (preg->fastmap == NULL, 0))
480 return REG_ESPACE;
481
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
491 }
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
496
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
503
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret == REG_NOERROR, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg);
509 else
510 {
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
514 }
515
516 return (int) ret;
517 }
518 #ifdef _LIBC
519 libc_hidden_def (__regcomp)
weak_alias(__regcomp,regcomp)520 weak_alias (__regcomp, regcomp)
521 #endif
522
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
525
526 size_t
527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
528 size_t errbuf_size)
529 {
530 const char *msg;
531 size_t msg_size;
532
533 if (BE (errcode < 0
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
541
542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543
544 msg_size = strlen (msg) + 1; /* Includes the null. */
545
546 if (BE (errbuf_size != 0, 1))
547 {
548 size_t cpy_size = msg_size;
549 if (BE (msg_size > errbuf_size, 0))
550 {
551 cpy_size = errbuf_size - 1;
552 errbuf[cpy_size] = '\0';
553 }
554 memcpy (errbuf, msg, cpy_size);
555 }
556
557 return msg_size;
558 }
559 #ifdef _LIBC
560 weak_alias (__regerror, regerror)
561 #endif
562
563
564 #ifdef RE_ENABLE_I18N
565 /* This static array is used for the map to single-byte characters when
566 UTF-8 is used. Otherwise we would allocate memory just to initialize
567 it the same all the time. UTF-8 is the preferred encoding so this is
568 a worthwhile optimization. */
569 static const bitset_t utf8_sb_map =
570 {
571 /* Set the first 128 bits. */
572 # if defined __GNUC__ && !defined __STRICT_ANSI__
573 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
574 # else
575 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
576 # error "bitset_word_t is narrower than 32 bits"
577 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
578 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
579 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
580 BITSET_WORD_MAX, BITSET_WORD_MAX,
581 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
582 BITSET_WORD_MAX,
583 # endif
584 (BITSET_WORD_MAX
585 >> (SBC_MAX % BITSET_WORD_BITS == 0
586 ? 0
587 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
588 # endif
589 };
590 #endif
591
592
593 static void
free_dfa_content(re_dfa_t * dfa)594 free_dfa_content (re_dfa_t *dfa)
595 {
596 Idx i, j;
597
598 if (dfa->nodes)
599 for (i = 0; i < dfa->nodes_len; ++i)
600 free_token (dfa->nodes + i);
601 re_free (dfa->nexts);
602 for (i = 0; i < dfa->nodes_len; ++i)
603 {
604 if (dfa->eclosures != NULL)
605 re_node_set_free (dfa->eclosures + i);
606 if (dfa->inveclosures != NULL)
607 re_node_set_free (dfa->inveclosures + i);
608 if (dfa->edests != NULL)
609 re_node_set_free (dfa->edests + i);
610 }
611 re_free (dfa->edests);
612 re_free (dfa->eclosures);
613 re_free (dfa->inveclosures);
614 re_free (dfa->nodes);
615
616 if (dfa->state_table)
617 for (i = 0; i <= dfa->state_hash_mask; ++i)
618 {
619 struct re_state_table_entry *entry = dfa->state_table + i;
620 for (j = 0; j < entry->num; ++j)
621 {
622 re_dfastate_t *state = entry->array[j];
623 free_state (state);
624 }
625 re_free (entry->array);
626 }
627 re_free (dfa->state_table);
628 #ifdef RE_ENABLE_I18N
629 if (dfa->sb_char != utf8_sb_map)
630 re_free (dfa->sb_char);
631 #endif
632 re_free (dfa->subexp_map);
633 #ifdef DEBUG
634 re_free (dfa->re_str);
635 #endif
636
637 re_free (dfa);
638 }
639
640
641 /* Free dynamically allocated space used by PREG. */
642
643 void
regfree(regex_t * preg)644 regfree (regex_t *preg)
645 {
646 re_dfa_t *dfa = preg->buffer;
647 if (BE (dfa != NULL, 1))
648 {
649 lock_fini (dfa->lock);
650 free_dfa_content (dfa);
651 }
652 preg->buffer = NULL;
653 preg->allocated = 0;
654
655 re_free (preg->fastmap);
656 preg->fastmap = NULL;
657
658 re_free (preg->translate);
659 preg->translate = NULL;
660 }
661 #ifdef _LIBC
662 libc_hidden_def (__regfree)
663 weak_alias (__regfree, regfree)
664 #endif
665
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
668
669 #if defined _REGEX_RE_COMP || defined _LIBC
670
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
673
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679 weak_function
680 # endif
re_comp(const char * s)681 re_comp (const char *s)
682 {
683 reg_errcode_t ret;
684 char *fastmap;
685
686 if (!s)
687 {
688 if (!re_comp_buf.buffer)
689 return gettext ("No previous regular expression");
690 return 0;
691 }
692
693 if (re_comp_buf.buffer)
694 {
695 fastmap = re_comp_buf.fastmap;
696 re_comp_buf.fastmap = NULL;
697 __regfree (&re_comp_buf);
698 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
699 re_comp_buf.fastmap = fastmap;
700 }
701
702 if (re_comp_buf.fastmap == NULL)
703 {
704 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
705 if (re_comp_buf.fastmap == NULL)
706 return (char *) gettext (__re_error_msgid
707 + __re_error_msgid_idx[(int) REG_ESPACE]);
708 }
709
710 /* Since 're_exec' always passes NULL for the 'regs' argument, we
711 don't need to initialize the pattern buffer fields which affect it. */
712
713 /* Match anchors at newlines. */
714 re_comp_buf.newline_anchor = 1;
715
716 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
717
718 if (!ret)
719 return NULL;
720
721 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
722 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
723 }
724
725 #ifdef _LIBC
libc_freeres_fn(free_mem)726 libc_freeres_fn (free_mem)
727 {
728 __regfree (&re_comp_buf);
729 }
730 #endif
731
732 #endif /* _REGEX_RE_COMP */
733
734 /* Internal entry point.
735 Compile the regular expression PATTERN, whose length is LENGTH.
736 SYNTAX indicate regular expression's syntax. */
737
738 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)739 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
740 reg_syntax_t syntax)
741 {
742 reg_errcode_t err = REG_NOERROR;
743 re_dfa_t *dfa;
744 re_string_t regexp;
745
746 /* Initialize the pattern buffer. */
747 preg->fastmap_accurate = 0;
748 preg->syntax = syntax;
749 preg->not_bol = preg->not_eol = 0;
750 preg->used = 0;
751 preg->re_nsub = 0;
752 preg->can_be_null = 0;
753 preg->regs_allocated = REGS_UNALLOCATED;
754
755 /* Initialize the dfa. */
756 dfa = preg->buffer;
757 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
758 {
759 /* If zero allocated, but buffer is non-null, try to realloc
760 enough space. This loses if buffer's address is bogus, but
761 that is the user's responsibility. If ->buffer is NULL this
762 is a simple allocation. */
763 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
764 if (dfa == NULL)
765 return REG_ESPACE;
766 preg->allocated = sizeof (re_dfa_t);
767 preg->buffer = dfa;
768 }
769 preg->used = sizeof (re_dfa_t);
770
771 err = init_dfa (dfa, length);
772 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
773 err = REG_ESPACE;
774 if (BE (err != REG_NOERROR, 0))
775 {
776 free_dfa_content (dfa);
777 preg->buffer = NULL;
778 preg->allocated = 0;
779 return err;
780 }
781 #ifdef DEBUG
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
785 #endif
786
787 err = re_string_construct (®exp, pattern, length, preg->translate,
788 (syntax & RE_ICASE) != 0, dfa);
789 if (BE (err != REG_NOERROR, 0))
790 {
791 re_compile_internal_free_return:
792 free_workarea_compile (preg);
793 re_string_destruct (®exp);
794 lock_fini (dfa->lock);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
799 }
800
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (®exp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
806
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
811
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815 optimize_utf8 (dfa);
816 #endif
817
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
820
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (®exp);
824
825 if (BE (err != REG_NOERROR, 0))
826 {
827 lock_fini (dfa->lock);
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
830 preg->allocated = 0;
831 }
832
833 return err;
834 }
835
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
838
839 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)840 init_dfa (re_dfa_t *dfa, size_t pat_len)
841 {
842 __re_size_t table_size;
843 #ifndef _LIBC
844 const char *codeset_name;
845 #endif
846 #ifdef RE_ENABLE_I18N
847 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
848 #else
849 size_t max_i18n_object_size = 0;
850 #endif
851 size_t max_object_size =
852 MAX (sizeof (struct re_state_table_entry),
853 MAX (sizeof (re_token_t),
854 MAX (sizeof (re_node_set),
855 MAX (sizeof (regmatch_t),
856 max_i18n_object_size))));
857
858 memset (dfa, '\0', sizeof (re_dfa_t));
859
860 /* Force allocation of str_tree_storage the first time. */
861 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
862
863 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
864 calculation below, and for similar doubling calculations
865 elsewhere. And it's <= rather than <, because some of the
866 doubling calculations add 1 afterwards. */
867 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
868 return REG_ESPACE;
869
870 dfa->nodes_alloc = pat_len + 1;
871 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
872
873 /* table_size = 2 ^ ceil(log pat_len) */
874 for (table_size = 1; ; table_size <<= 1)
875 if (table_size > pat_len)
876 break;
877
878 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
879 dfa->state_hash_mask = table_size - 1;
880
881 dfa->mb_cur_max = MB_CUR_MAX;
882 #ifdef _LIBC
883 if (dfa->mb_cur_max == 6
884 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
885 dfa->is_utf8 = 1;
886 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
887 != 0);
888 #else
889 codeset_name = nl_langinfo (CODESET);
890 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
891 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
892 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
893 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
894 dfa->is_utf8 = 1;
895
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
899 #endif
900
901 #ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
903 {
904 if (dfa->is_utf8)
905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
906 else
907 {
908 int i, j, ch;
909
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
911 if (BE (dfa->sb_char == NULL, 0))
912 return REG_ESPACE;
913
914 /* Set the bits corresponding to single byte chars. */
915 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
916 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
917 {
918 wint_t wch = __btowc (ch);
919 if (wch != WEOF)
920 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
921 # ifndef _LIBC
922 if (isascii (ch) && wch != ch)
923 dfa->map_notascii = 1;
924 # endif
925 }
926 }
927 }
928 #endif
929
930 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
931 return REG_ESPACE;
932 return REG_NOERROR;
933 }
934
935 /* Initialize WORD_CHAR table, which indicate which character is
936 "word". In this case "word" means that it is the word construction
937 character used by some operators like "\<", "\>", etc. */
938
939 static void
init_word_char(re_dfa_t * dfa)940 init_word_char (re_dfa_t *dfa)
941 {
942 int i = 0;
943 int j;
944 int ch = 0;
945 dfa->word_ops_used = 1;
946 if (BE (dfa->map_notascii == 0, 1))
947 {
948 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
949 them, an issue when this code is used in Gnulib. */
950 bitset_word_t bits0 = 0x00000000;
951 bitset_word_t bits1 = 0x03ff0000;
952 bitset_word_t bits2 = 0x87fffffe;
953 bitset_word_t bits3 = 0x07fffffe;
954 if (BITSET_WORD_BITS == 64)
955 {
956 /* Pacify gcc -Woverflow on 32-bit platformns. */
957 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
958 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
959 i = 2;
960 }
961 else if (BITSET_WORD_BITS == 32)
962 {
963 dfa->word_char[0] = bits0;
964 dfa->word_char[1] = bits1;
965 dfa->word_char[2] = bits2;
966 dfa->word_char[3] = bits3;
967 i = 4;
968 }
969 else
970 goto general_case;
971 ch = 128;
972
973 if (BE (dfa->is_utf8, 1))
974 {
975 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
976 return;
977 }
978 }
979
980 general_case:
981 for (; i < BITSET_WORDS; ++i)
982 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
983 if (isalnum (ch) || ch == '_')
984 dfa->word_char[i] |= (bitset_word_t) 1 << j;
985 }
986
987 /* Free the work area which are only used while compiling. */
988
989 static void
free_workarea_compile(regex_t * preg)990 free_workarea_compile (regex_t *preg)
991 {
992 re_dfa_t *dfa = preg->buffer;
993 bin_tree_storage_t *storage, *next;
994 for (storage = dfa->str_tree_storage; storage; storage = next)
995 {
996 next = storage->next;
997 re_free (storage);
998 }
999 dfa->str_tree_storage = NULL;
1000 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1001 dfa->str_tree = NULL;
1002 re_free (dfa->org_indices);
1003 dfa->org_indices = NULL;
1004 }
1005
1006 /* Create initial states for all contexts. */
1007
1008 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)1009 create_initial_state (re_dfa_t *dfa)
1010 {
1011 Idx first, i;
1012 reg_errcode_t err;
1013 re_node_set init_nodes;
1014
1015 /* Initial states have the epsilon closure of the node which is
1016 the first node of the regular expression. */
1017 first = dfa->str_tree->first->node_idx;
1018 dfa->init_node = first;
1019 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1020 if (BE (err != REG_NOERROR, 0))
1021 return err;
1022
1023 /* The back-references which are in initial states can epsilon transit,
1024 since in this case all of the subexpressions can be null.
1025 Then we add epsilon closures of the nodes which are the next nodes of
1026 the back-references. */
1027 if (dfa->nbackref > 0)
1028 for (i = 0; i < init_nodes.nelem; ++i)
1029 {
1030 Idx node_idx = init_nodes.elems[i];
1031 re_token_type_t type = dfa->nodes[node_idx].type;
1032
1033 Idx clexp_idx;
1034 if (type != OP_BACK_REF)
1035 continue;
1036 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1037 {
1038 re_token_t *clexp_node;
1039 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1040 if (clexp_node->type == OP_CLOSE_SUBEXP
1041 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1042 break;
1043 }
1044 if (clexp_idx == init_nodes.nelem)
1045 continue;
1046
1047 if (type == OP_BACK_REF)
1048 {
1049 Idx dest_idx = dfa->edests[node_idx].elems[0];
1050 if (!re_node_set_contains (&init_nodes, dest_idx))
1051 {
1052 reg_errcode_t merge_err
1053 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1054 if (merge_err != REG_NOERROR)
1055 return merge_err;
1056 i = 0;
1057 }
1058 }
1059 }
1060
1061 /* It must be the first time to invoke acquire_state. */
1062 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1063 /* We don't check ERR here, since the initial state must not be NULL. */
1064 if (BE (dfa->init_state == NULL, 0))
1065 return err;
1066 if (dfa->init_state->has_constraint)
1067 {
1068 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1069 CONTEXT_WORD);
1070 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1071 CONTEXT_NEWLINE);
1072 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1073 &init_nodes,
1074 CONTEXT_NEWLINE
1075 | CONTEXT_BEGBUF);
1076 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1077 || dfa->init_state_begbuf == NULL, 0))
1078 return err;
1079 }
1080 else
1081 dfa->init_state_word = dfa->init_state_nl
1082 = dfa->init_state_begbuf = dfa->init_state;
1083
1084 re_node_set_free (&init_nodes);
1085 return REG_NOERROR;
1086 }
1087
1088 #ifdef RE_ENABLE_I18N
1089 /* If it is possible to do searching in single byte encoding instead of UTF-8
1090 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1091 DFA nodes where needed. */
1092
1093 static void
optimize_utf8(re_dfa_t * dfa)1094 optimize_utf8 (re_dfa_t *dfa)
1095 {
1096 Idx node;
1097 int i;
1098 bool mb_chars = false;
1099 bool has_period = false;
1100
1101 for (node = 0; node < dfa->nodes_len; ++node)
1102 switch (dfa->nodes[node].type)
1103 {
1104 case CHARACTER:
1105 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1106 mb_chars = true;
1107 break;
1108 case ANCHOR:
1109 switch (dfa->nodes[node].opr.ctx_type)
1110 {
1111 case LINE_FIRST:
1112 case LINE_LAST:
1113 case BUF_FIRST:
1114 case BUF_LAST:
1115 break;
1116 default:
1117 /* Word anchors etc. cannot be handled. It's okay to test
1118 opr.ctx_type since constraints (for all DFA nodes) are
1119 created by ORing one or more opr.ctx_type values. */
1120 return;
1121 }
1122 break;
1123 case OP_PERIOD:
1124 has_period = true;
1125 break;
1126 case OP_BACK_REF:
1127 case OP_ALT:
1128 case END_OF_RE:
1129 case OP_DUP_ASTERISK:
1130 case OP_OPEN_SUBEXP:
1131 case OP_CLOSE_SUBEXP:
1132 break;
1133 case COMPLEX_BRACKET:
1134 return;
1135 case SIMPLE_BRACKET:
1136 /* Just double check. */
1137 {
1138 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1139 ? 0
1140 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1141 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1142 {
1143 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1144 return;
1145 rshift = 0;
1146 }
1147 }
1148 break;
1149 default:
1150 abort ();
1151 }
1152
1153 if (mb_chars || has_period)
1154 for (node = 0; node < dfa->nodes_len; ++node)
1155 {
1156 if (dfa->nodes[node].type == CHARACTER
1157 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1158 dfa->nodes[node].mb_partial = 0;
1159 else if (dfa->nodes[node].type == OP_PERIOD)
1160 dfa->nodes[node].type = OP_UTF8_PERIOD;
1161 }
1162
1163 /* The search can be in single byte locale. */
1164 dfa->mb_cur_max = 1;
1165 dfa->is_utf8 = 0;
1166 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1167 }
1168 #endif
1169
1170 /* Analyze the structure tree, and calculate "first", "next", "edest",
1171 "eclosure", and "inveclosure". */
1172
1173 static reg_errcode_t
analyze(regex_t * preg)1174 analyze (regex_t *preg)
1175 {
1176 re_dfa_t *dfa = preg->buffer;
1177 reg_errcode_t ret;
1178
1179 /* Allocate arrays. */
1180 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1181 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1182 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1183 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1184 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1185 || dfa->eclosures == NULL, 0))
1186 return REG_ESPACE;
1187
1188 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1189 if (dfa->subexp_map != NULL)
1190 {
1191 Idx i;
1192 for (i = 0; i < preg->re_nsub; i++)
1193 dfa->subexp_map[i] = i;
1194 preorder (dfa->str_tree, optimize_subexps, dfa);
1195 for (i = 0; i < preg->re_nsub; i++)
1196 if (dfa->subexp_map[i] != i)
1197 break;
1198 if (i == preg->re_nsub)
1199 {
1200 re_free (dfa->subexp_map);
1201 dfa->subexp_map = NULL;
1202 }
1203 }
1204
1205 ret = postorder (dfa->str_tree, lower_subexps, preg);
1206 if (BE (ret != REG_NOERROR, 0))
1207 return ret;
1208 ret = postorder (dfa->str_tree, calc_first, dfa);
1209 if (BE (ret != REG_NOERROR, 0))
1210 return ret;
1211 preorder (dfa->str_tree, calc_next, dfa);
1212 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1213 if (BE (ret != REG_NOERROR, 0))
1214 return ret;
1215 ret = calc_eclosure (dfa);
1216 if (BE (ret != REG_NOERROR, 0))
1217 return ret;
1218
1219 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1220 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1221 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1222 || dfa->nbackref)
1223 {
1224 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1225 if (BE (dfa->inveclosures == NULL, 0))
1226 return REG_ESPACE;
1227 ret = calc_inveclosure (dfa);
1228 }
1229
1230 return ret;
1231 }
1232
1233 /* Our parse trees are very unbalanced, so we cannot use a stack to
1234 implement parse tree visits. Instead, we use parent pointers and
1235 some hairy code in these two functions. */
1236 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1237 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238 void *extra)
1239 {
1240 bin_tree_t *node, *prev;
1241
1242 for (node = root; ; )
1243 {
1244 /* Descend down the tree, preferably to the left (or to the right
1245 if that's the only child). */
1246 while (node->left || node->right)
1247 if (node->left)
1248 node = node->left;
1249 else
1250 node = node->right;
1251
1252 do
1253 {
1254 reg_errcode_t err = fn (extra, node);
1255 if (BE (err != REG_NOERROR, 0))
1256 return err;
1257 if (node->parent == NULL)
1258 return REG_NOERROR;
1259 prev = node;
1260 node = node->parent;
1261 }
1262 /* Go up while we have a node that is reached from the right. */
1263 while (node->right == prev || node->right == NULL);
1264 node = node->right;
1265 }
1266 }
1267
1268 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1269 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1270 void *extra)
1271 {
1272 bin_tree_t *node;
1273
1274 for (node = root; ; )
1275 {
1276 reg_errcode_t err = fn (extra, node);
1277 if (BE (err != REG_NOERROR, 0))
1278 return err;
1279
1280 /* Go to the left node, or up and to the right. */
1281 if (node->left)
1282 node = node->left;
1283 else
1284 {
1285 bin_tree_t *prev = NULL;
1286 while (node->right == prev || node->right == NULL)
1287 {
1288 prev = node;
1289 node = node->parent;
1290 if (!node)
1291 return REG_NOERROR;
1292 }
1293 node = node->right;
1294 }
1295 }
1296 }
1297
1298 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1299 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1300 backreferences as well. Requires a preorder visit. */
1301 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1302 optimize_subexps (void *extra, bin_tree_t *node)
1303 {
1304 re_dfa_t *dfa = (re_dfa_t *) extra;
1305
1306 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1307 {
1308 int idx = node->token.opr.idx;
1309 node->token.opr.idx = dfa->subexp_map[idx];
1310 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1311 }
1312
1313 else if (node->token.type == SUBEXP
1314 && node->left && node->left->token.type == SUBEXP)
1315 {
1316 Idx other_idx = node->left->token.opr.idx;
1317
1318 node->left = node->left->left;
1319 if (node->left)
1320 node->left->parent = node;
1321
1322 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1323 if (other_idx < BITSET_WORD_BITS)
1324 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1325 }
1326
1327 return REG_NOERROR;
1328 }
1329
1330 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1331 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1332 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1333 lower_subexps (void *extra, bin_tree_t *node)
1334 {
1335 regex_t *preg = (regex_t *) extra;
1336 reg_errcode_t err = REG_NOERROR;
1337
1338 if (node->left && node->left->token.type == SUBEXP)
1339 {
1340 node->left = lower_subexp (&err, preg, node->left);
1341 if (node->left)
1342 node->left->parent = node;
1343 }
1344 if (node->right && node->right->token.type == SUBEXP)
1345 {
1346 node->right = lower_subexp (&err, preg, node->right);
1347 if (node->right)
1348 node->right->parent = node;
1349 }
1350
1351 return err;
1352 }
1353
1354 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1355 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1356 {
1357 re_dfa_t *dfa = preg->buffer;
1358 bin_tree_t *body = node->left;
1359 bin_tree_t *op, *cls, *tree1, *tree;
1360
1361 if (preg->no_sub
1362 /* We do not optimize empty subexpressions, because otherwise we may
1363 have bad CONCAT nodes with NULL children. This is obviously not
1364 very common, so we do not lose much. An example that triggers
1365 this case is the sed "script" /\(\)/x. */
1366 && node->left != NULL
1367 && (node->token.opr.idx >= BITSET_WORD_BITS
1368 || !(dfa->used_bkref_map
1369 & ((bitset_word_t) 1 << node->token.opr.idx))))
1370 return node->left;
1371
1372 /* Convert the SUBEXP node to the concatenation of an
1373 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1374 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1375 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1376 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1377 tree = create_tree (dfa, op, tree1, CONCAT);
1378 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1379 {
1380 *err = REG_ESPACE;
1381 return NULL;
1382 }
1383
1384 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1385 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1386 return tree;
1387 }
1388
1389 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1390 nodes. Requires a postorder visit. */
1391 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1392 calc_first (void *extra, bin_tree_t *node)
1393 {
1394 re_dfa_t *dfa = (re_dfa_t *) extra;
1395 if (node->token.type == CONCAT)
1396 {
1397 node->first = node->left->first;
1398 node->node_idx = node->left->node_idx;
1399 }
1400 else
1401 {
1402 node->first = node;
1403 node->node_idx = re_dfa_add_node (dfa, node->token);
1404 if (BE (node->node_idx == -1, 0))
1405 return REG_ESPACE;
1406 if (node->token.type == ANCHOR)
1407 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1408 }
1409 return REG_NOERROR;
1410 }
1411
1412 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1413 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1414 calc_next (void *extra, bin_tree_t *node)
1415 {
1416 switch (node->token.type)
1417 {
1418 case OP_DUP_ASTERISK:
1419 node->left->next = node;
1420 break;
1421 case CONCAT:
1422 node->left->next = node->right->first;
1423 node->right->next = node->next;
1424 break;
1425 default:
1426 if (node->left)
1427 node->left->next = node->next;
1428 if (node->right)
1429 node->right->next = node->next;
1430 break;
1431 }
1432 return REG_NOERROR;
1433 }
1434
1435 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1436 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1437 link_nfa_nodes (void *extra, bin_tree_t *node)
1438 {
1439 re_dfa_t *dfa = (re_dfa_t *) extra;
1440 Idx idx = node->node_idx;
1441 reg_errcode_t err = REG_NOERROR;
1442
1443 switch (node->token.type)
1444 {
1445 case CONCAT:
1446 break;
1447
1448 case END_OF_RE:
1449 assert (node->next == NULL);
1450 break;
1451
1452 case OP_DUP_ASTERISK:
1453 case OP_ALT:
1454 {
1455 Idx left, right;
1456 dfa->has_plural_match = 1;
1457 if (node->left != NULL)
1458 left = node->left->first->node_idx;
1459 else
1460 left = node->next->node_idx;
1461 if (node->right != NULL)
1462 right = node->right->first->node_idx;
1463 else
1464 right = node->next->node_idx;
1465 assert (left > -1);
1466 assert (right > -1);
1467 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1468 }
1469 break;
1470
1471 case ANCHOR:
1472 case OP_OPEN_SUBEXP:
1473 case OP_CLOSE_SUBEXP:
1474 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1475 break;
1476
1477 case OP_BACK_REF:
1478 dfa->nexts[idx] = node->next->node_idx;
1479 if (node->token.type == OP_BACK_REF)
1480 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1481 break;
1482
1483 default:
1484 assert (!IS_EPSILON_NODE (node->token.type));
1485 dfa->nexts[idx] = node->next->node_idx;
1486 break;
1487 }
1488
1489 return err;
1490 }
1491
1492 /* Duplicate the epsilon closure of the node ROOT_NODE.
1493 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1494 to their own constraint. */
1495
1496 static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1497 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1498 Idx root_node, unsigned int init_constraint)
1499 {
1500 Idx org_node, clone_node;
1501 bool ok;
1502 unsigned int constraint = init_constraint;
1503 for (org_node = top_org_node, clone_node = top_clone_node;;)
1504 {
1505 Idx org_dest, clone_dest;
1506 if (dfa->nodes[org_node].type == OP_BACK_REF)
1507 {
1508 /* If the back reference epsilon-transit, its destination must
1509 also have the constraint. Then duplicate the epsilon closure
1510 of the destination of the back reference, and store it in
1511 edests of the back reference. */
1512 org_dest = dfa->nexts[org_node];
1513 re_node_set_empty (dfa->edests + clone_node);
1514 clone_dest = duplicate_node (dfa, org_dest, constraint);
1515 if (BE (clone_dest == -1, 0))
1516 return REG_ESPACE;
1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (! ok, 0))
1520 return REG_ESPACE;
1521 }
1522 else if (dfa->edests[org_node].nelem == 0)
1523 {
1524 /* In case of the node can't epsilon-transit, don't duplicate the
1525 destination and store the original destination as the
1526 destination of the node. */
1527 dfa->nexts[clone_node] = dfa->nexts[org_node];
1528 break;
1529 }
1530 else if (dfa->edests[org_node].nelem == 1)
1531 {
1532 /* In case of the node can epsilon-transit, and it has only one
1533 destination. */
1534 org_dest = dfa->edests[org_node].elems[0];
1535 re_node_set_empty (dfa->edests + clone_node);
1536 /* If the node is root_node itself, it means the epsilon closure
1537 has a loop. Then tie it to the destination of the root_node. */
1538 if (org_node == root_node && clone_node != org_node)
1539 {
1540 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1541 if (BE (! ok, 0))
1542 return REG_ESPACE;
1543 break;
1544 }
1545 /* In case the node has another constraint, append it. */
1546 constraint |= dfa->nodes[org_node].constraint;
1547 clone_dest = duplicate_node (dfa, org_dest, constraint);
1548 if (BE (clone_dest == -1, 0))
1549 return REG_ESPACE;
1550 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1551 if (BE (! ok, 0))
1552 return REG_ESPACE;
1553 }
1554 else /* dfa->edests[org_node].nelem == 2 */
1555 {
1556 /* In case of the node can epsilon-transit, and it has two
1557 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1558 org_dest = dfa->edests[org_node].elems[0];
1559 re_node_set_empty (dfa->edests + clone_node);
1560 /* Search for a duplicated node which satisfies the constraint. */
1561 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1562 if (clone_dest == -1)
1563 {
1564 /* There is no such duplicated node, create a new one. */
1565 reg_errcode_t err;
1566 clone_dest = duplicate_node (dfa, org_dest, constraint);
1567 if (BE (clone_dest == -1, 0))
1568 return REG_ESPACE;
1569 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570 if (BE (! ok, 0))
1571 return REG_ESPACE;
1572 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1573 root_node, constraint);
1574 if (BE (err != REG_NOERROR, 0))
1575 return err;
1576 }
1577 else
1578 {
1579 /* There is a duplicated node which satisfies the constraint,
1580 use it to avoid infinite loop. */
1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 if (BE (! ok, 0))
1583 return REG_ESPACE;
1584 }
1585
1586 org_dest = dfa->edests[org_node].elems[1];
1587 clone_dest = duplicate_node (dfa, org_dest, constraint);
1588 if (BE (clone_dest == -1, 0))
1589 return REG_ESPACE;
1590 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 if (BE (! ok, 0))
1592 return REG_ESPACE;
1593 }
1594 org_node = org_dest;
1595 clone_node = clone_dest;
1596 }
1597 return REG_NOERROR;
1598 }
1599
1600 /* Search for a node which is duplicated from the node ORG_NODE, and
1601 satisfies the constraint CONSTRAINT. */
1602
1603 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1604 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1605 unsigned int constraint)
1606 {
1607 Idx idx;
1608 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1609 {
1610 if (org_node == dfa->org_indices[idx]
1611 && constraint == dfa->nodes[idx].constraint)
1612 return idx; /* Found. */
1613 }
1614 return -1; /* Not found. */
1615 }
1616
1617 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1618 Return the index of the new node, or -1 if insufficient storage is
1619 available. */
1620
1621 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1622 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1623 {
1624 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1625 if (BE (dup_idx != -1, 1))
1626 {
1627 dfa->nodes[dup_idx].constraint = constraint;
1628 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1629 dfa->nodes[dup_idx].duplicated = 1;
1630
1631 /* Store the index of the original node. */
1632 dfa->org_indices[dup_idx] = org_idx;
1633 }
1634 return dup_idx;
1635 }
1636
1637 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1638 calc_inveclosure (re_dfa_t *dfa)
1639 {
1640 Idx src, idx;
1641 bool ok;
1642 for (idx = 0; idx < dfa->nodes_len; ++idx)
1643 re_node_set_init_empty (dfa->inveclosures + idx);
1644
1645 for (src = 0; src < dfa->nodes_len; ++src)
1646 {
1647 Idx *elems = dfa->eclosures[src].elems;
1648 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1649 {
1650 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1651 if (BE (! ok, 0))
1652 return REG_ESPACE;
1653 }
1654 }
1655
1656 return REG_NOERROR;
1657 }
1658
1659 /* Calculate "eclosure" for all the node in DFA. */
1660
1661 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1662 calc_eclosure (re_dfa_t *dfa)
1663 {
1664 Idx node_idx;
1665 bool incomplete;
1666 #ifdef DEBUG
1667 assert (dfa->nodes_len > 0);
1668 #endif
1669 incomplete = false;
1670 /* For each nodes, calculate epsilon closure. */
1671 for (node_idx = 0; ; ++node_idx)
1672 {
1673 reg_errcode_t err;
1674 re_node_set eclosure_elem;
1675 if (node_idx == dfa->nodes_len)
1676 {
1677 if (!incomplete)
1678 break;
1679 incomplete = false;
1680 node_idx = 0;
1681 }
1682
1683 #ifdef DEBUG
1684 assert (dfa->eclosures[node_idx].nelem != -1);
1685 #endif
1686
1687 /* If we have already calculated, skip it. */
1688 if (dfa->eclosures[node_idx].nelem != 0)
1689 continue;
1690 /* Calculate epsilon closure of 'node_idx'. */
1691 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1692 if (BE (err != REG_NOERROR, 0))
1693 return err;
1694
1695 if (dfa->eclosures[node_idx].nelem == 0)
1696 {
1697 incomplete = true;
1698 re_node_set_free (&eclosure_elem);
1699 }
1700 }
1701 return REG_NOERROR;
1702 }
1703
1704 /* Calculate epsilon closure of NODE. */
1705
1706 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1707 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1708 {
1709 reg_errcode_t err;
1710 Idx i;
1711 re_node_set eclosure;
1712 bool ok;
1713 bool incomplete = false;
1714 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1715 if (BE (err != REG_NOERROR, 0))
1716 return err;
1717
1718 /* This indicates that we are calculating this node now.
1719 We reference this value to avoid infinite loop. */
1720 dfa->eclosures[node].nelem = -1;
1721
1722 /* If the current node has constraints, duplicate all nodes
1723 since they must inherit the constraints. */
1724 if (dfa->nodes[node].constraint
1725 && dfa->edests[node].nelem
1726 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1727 {
1728 err = duplicate_node_closure (dfa, node, node, node,
1729 dfa->nodes[node].constraint);
1730 if (BE (err != REG_NOERROR, 0))
1731 return err;
1732 }
1733
1734 /* Expand each epsilon destination nodes. */
1735 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1736 for (i = 0; i < dfa->edests[node].nelem; ++i)
1737 {
1738 re_node_set eclosure_elem;
1739 Idx edest = dfa->edests[node].elems[i];
1740 /* If calculating the epsilon closure of 'edest' is in progress,
1741 return intermediate result. */
1742 if (dfa->eclosures[edest].nelem == -1)
1743 {
1744 incomplete = true;
1745 continue;
1746 }
1747 /* If we haven't calculated the epsilon closure of 'edest' yet,
1748 calculate now. Otherwise use calculated epsilon closure. */
1749 if (dfa->eclosures[edest].nelem == 0)
1750 {
1751 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1752 if (BE (err != REG_NOERROR, 0))
1753 return err;
1754 }
1755 else
1756 eclosure_elem = dfa->eclosures[edest];
1757 /* Merge the epsilon closure of 'edest'. */
1758 err = re_node_set_merge (&eclosure, &eclosure_elem);
1759 if (BE (err != REG_NOERROR, 0))
1760 return err;
1761 /* If the epsilon closure of 'edest' is incomplete,
1762 the epsilon closure of this node is also incomplete. */
1763 if (dfa->eclosures[edest].nelem == 0)
1764 {
1765 incomplete = true;
1766 re_node_set_free (&eclosure_elem);
1767 }
1768 }
1769
1770 /* An epsilon closure includes itself. */
1771 ok = re_node_set_insert (&eclosure, node);
1772 if (BE (! ok, 0))
1773 return REG_ESPACE;
1774 if (incomplete && !root)
1775 dfa->eclosures[node].nelem = 0;
1776 else
1777 dfa->eclosures[node] = eclosure;
1778 *new_set = eclosure;
1779 return REG_NOERROR;
1780 }
1781
1782 /* Functions for token which are used in the parser. */
1783
1784 /* Fetch a token from INPUT.
1785 We must not use this function inside bracket expressions. */
1786
1787 static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1788 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1789 {
1790 re_string_skip_bytes (input, peek_token (result, input, syntax));
1791 }
1792
1793 /* Peek a token from INPUT, and return the length of the token.
1794 We must not use this function inside bracket expressions. */
1795
1796 static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1797 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1798 {
1799 unsigned char c;
1800
1801 if (re_string_eoi (input))
1802 {
1803 token->type = END_OF_RE;
1804 return 0;
1805 }
1806
1807 c = re_string_peek_byte (input, 0);
1808 token->opr.c = c;
1809
1810 token->word_char = 0;
1811 #ifdef RE_ENABLE_I18N
1812 token->mb_partial = 0;
1813 if (input->mb_cur_max > 1 &&
1814 !re_string_first_byte (input, re_string_cur_idx (input)))
1815 {
1816 token->type = CHARACTER;
1817 token->mb_partial = 1;
1818 return 1;
1819 }
1820 #endif
1821 if (c == '\\')
1822 {
1823 unsigned char c2;
1824 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1825 {
1826 token->type = BACK_SLASH;
1827 return 1;
1828 }
1829
1830 c2 = re_string_peek_byte_case (input, 1);
1831 token->opr.c = c2;
1832 token->type = CHARACTER;
1833 #ifdef RE_ENABLE_I18N
1834 if (input->mb_cur_max > 1)
1835 {
1836 wint_t wc = re_string_wchar_at (input,
1837 re_string_cur_idx (input) + 1);
1838 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1839 }
1840 else
1841 #endif
1842 token->word_char = IS_WORD_CHAR (c2) != 0;
1843
1844 switch (c2)
1845 {
1846 case '|':
1847 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1848 token->type = OP_ALT;
1849 break;
1850 case '1': case '2': case '3': case '4': case '5':
1851 case '6': case '7': case '8': case '9':
1852 if (!(syntax & RE_NO_BK_REFS))
1853 {
1854 token->type = OP_BACK_REF;
1855 token->opr.idx = c2 - '1';
1856 }
1857 break;
1858 case '<':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 {
1861 token->type = ANCHOR;
1862 token->opr.ctx_type = WORD_FIRST;
1863 }
1864 break;
1865 case '>':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 {
1868 token->type = ANCHOR;
1869 token->opr.ctx_type = WORD_LAST;
1870 }
1871 break;
1872 case 'b':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 {
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = WORD_DELIM;
1877 }
1878 break;
1879 case 'B':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 {
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = NOT_WORD_DELIM;
1884 }
1885 break;
1886 case 'w':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 token->type = OP_WORD;
1889 break;
1890 case 'W':
1891 if (!(syntax & RE_NO_GNU_OPS))
1892 token->type = OP_NOTWORD;
1893 break;
1894 case 's':
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = OP_SPACE;
1897 break;
1898 case 'S':
1899 if (!(syntax & RE_NO_GNU_OPS))
1900 token->type = OP_NOTSPACE;
1901 break;
1902 case '`':
1903 if (!(syntax & RE_NO_GNU_OPS))
1904 {
1905 token->type = ANCHOR;
1906 token->opr.ctx_type = BUF_FIRST;
1907 }
1908 break;
1909 case '\'':
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 {
1912 token->type = ANCHOR;
1913 token->opr.ctx_type = BUF_LAST;
1914 }
1915 break;
1916 case '(':
1917 if (!(syntax & RE_NO_BK_PARENS))
1918 token->type = OP_OPEN_SUBEXP;
1919 break;
1920 case ')':
1921 if (!(syntax & RE_NO_BK_PARENS))
1922 token->type = OP_CLOSE_SUBEXP;
1923 break;
1924 case '+':
1925 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1926 token->type = OP_DUP_PLUS;
1927 break;
1928 case '?':
1929 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1930 token->type = OP_DUP_QUESTION;
1931 break;
1932 case '{':
1933 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1934 token->type = OP_OPEN_DUP_NUM;
1935 break;
1936 case '}':
1937 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1938 token->type = OP_CLOSE_DUP_NUM;
1939 break;
1940 default:
1941 break;
1942 }
1943 return 2;
1944 }
1945
1946 token->type = CHARACTER;
1947 #ifdef RE_ENABLE_I18N
1948 if (input->mb_cur_max > 1)
1949 {
1950 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1951 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1952 }
1953 else
1954 #endif
1955 token->word_char = IS_WORD_CHAR (token->opr.c);
1956
1957 switch (c)
1958 {
1959 case '\n':
1960 if (syntax & RE_NEWLINE_ALT)
1961 token->type = OP_ALT;
1962 break;
1963 case '|':
1964 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1965 token->type = OP_ALT;
1966 break;
1967 case '*':
1968 token->type = OP_DUP_ASTERISK;
1969 break;
1970 case '+':
1971 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1972 token->type = OP_DUP_PLUS;
1973 break;
1974 case '?':
1975 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1976 token->type = OP_DUP_QUESTION;
1977 break;
1978 case '{':
1979 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1980 token->type = OP_OPEN_DUP_NUM;
1981 break;
1982 case '}':
1983 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1984 token->type = OP_CLOSE_DUP_NUM;
1985 break;
1986 case '(':
1987 if (syntax & RE_NO_BK_PARENS)
1988 token->type = OP_OPEN_SUBEXP;
1989 break;
1990 case ')':
1991 if (syntax & RE_NO_BK_PARENS)
1992 token->type = OP_CLOSE_SUBEXP;
1993 break;
1994 case '[':
1995 token->type = OP_OPEN_BRACKET;
1996 break;
1997 case '.':
1998 token->type = OP_PERIOD;
1999 break;
2000 case '^':
2001 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2002 re_string_cur_idx (input) != 0)
2003 {
2004 char prev = re_string_peek_byte (input, -1);
2005 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2006 break;
2007 }
2008 token->type = ANCHOR;
2009 token->opr.ctx_type = LINE_FIRST;
2010 break;
2011 case '$':
2012 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2013 re_string_cur_idx (input) + 1 != re_string_length (input))
2014 {
2015 re_token_t next;
2016 re_string_skip_bytes (input, 1);
2017 peek_token (&next, input, syntax);
2018 re_string_skip_bytes (input, -1);
2019 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2020 break;
2021 }
2022 token->type = ANCHOR;
2023 token->opr.ctx_type = LINE_LAST;
2024 break;
2025 default:
2026 break;
2027 }
2028 return 1;
2029 }
2030
2031 /* Peek a token from INPUT, and return the length of the token.
2032 We must not use this function out of bracket expressions. */
2033
2034 static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2035 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2036 {
2037 unsigned char c;
2038 if (re_string_eoi (input))
2039 {
2040 token->type = END_OF_RE;
2041 return 0;
2042 }
2043 c = re_string_peek_byte (input, 0);
2044 token->opr.c = c;
2045
2046 #ifdef RE_ENABLE_I18N
2047 if (input->mb_cur_max > 1 &&
2048 !re_string_first_byte (input, re_string_cur_idx (input)))
2049 {
2050 token->type = CHARACTER;
2051 return 1;
2052 }
2053 #endif /* RE_ENABLE_I18N */
2054
2055 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056 && re_string_cur_idx (input) + 1 < re_string_length (input))
2057 {
2058 /* In this case, '\' escape a character. */
2059 unsigned char c2;
2060 re_string_skip_bytes (input, 1);
2061 c2 = re_string_peek_byte (input, 0);
2062 token->opr.c = c2;
2063 token->type = CHARACTER;
2064 return 1;
2065 }
2066 if (c == '[') /* '[' is a special char in a bracket exps. */
2067 {
2068 unsigned char c2;
2069 int token_len;
2070 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071 c2 = re_string_peek_byte (input, 1);
2072 else
2073 c2 = 0;
2074 token->opr.c = c2;
2075 token_len = 2;
2076 switch (c2)
2077 {
2078 case '.':
2079 token->type = OP_OPEN_COLL_ELEM;
2080 break;
2081
2082 case '=':
2083 token->type = OP_OPEN_EQUIV_CLASS;
2084 break;
2085
2086 case ':':
2087 if (syntax & RE_CHAR_CLASSES)
2088 {
2089 token->type = OP_OPEN_CHAR_CLASS;
2090 break;
2091 }
2092 FALLTHROUGH;
2093 default:
2094 token->type = CHARACTER;
2095 token->opr.c = c;
2096 token_len = 1;
2097 break;
2098 }
2099 return token_len;
2100 }
2101 switch (c)
2102 {
2103 case '-':
2104 token->type = OP_CHARSET_RANGE;
2105 break;
2106 case ']':
2107 token->type = OP_CLOSE_BRACKET;
2108 break;
2109 case '^':
2110 token->type = OP_NON_MATCH_LIST;
2111 break;
2112 default:
2113 token->type = CHARACTER;
2114 }
2115 return 1;
2116 }
2117
2118 /* Functions for parser. */
2119
2120 /* Entry point of the parser.
2121 Parse the regular expression REGEXP and return the structure tree.
2122 If an error occurs, ERR is set by error code, and return NULL.
2123 This function build the following tree, from regular expression <reg_exp>:
2124 CAT
2125 / \
2126 / \
2127 <reg_exp> EOR
2128
2129 CAT means concatenation.
2130 EOR means end of regular expression. */
2131
2132 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2133 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2134 reg_errcode_t *err)
2135 {
2136 re_dfa_t *dfa = preg->buffer;
2137 bin_tree_t *tree, *eor, *root;
2138 re_token_t current_token;
2139 dfa->syntax = syntax;
2140 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2141 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2142 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143 return NULL;
2144 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2145 if (tree != NULL)
2146 root = create_tree (dfa, tree, eor, CONCAT);
2147 else
2148 root = eor;
2149 if (BE (eor == NULL || root == NULL, 0))
2150 {
2151 *err = REG_ESPACE;
2152 return NULL;
2153 }
2154 return root;
2155 }
2156
2157 /* This function build the following tree, from regular expression
2158 <branch1>|<branch2>:
2159 ALT
2160 / \
2161 / \
2162 <branch1> <branch2>
2163
2164 ALT means alternative, which represents the operator '|'. */
2165
2166 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2167 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2168 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2169 {
2170 re_dfa_t *dfa = preg->buffer;
2171 bin_tree_t *tree, *branch = NULL;
2172 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2173 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2174 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2175 return NULL;
2176
2177 while (token->type == OP_ALT)
2178 {
2179 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2180 if (token->type != OP_ALT && token->type != END_OF_RE
2181 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2182 {
2183 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2184 dfa->completed_bkref_map = initial_bkref_map;
2185 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2186 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2187 {
2188 if (tree != NULL)
2189 postorder (tree, free_tree, NULL);
2190 return NULL;
2191 }
2192 dfa->completed_bkref_map |= accumulated_bkref_map;
2193 }
2194 else
2195 branch = NULL;
2196 tree = create_tree (dfa, tree, branch, OP_ALT);
2197 if (BE (tree == NULL, 0))
2198 {
2199 *err = REG_ESPACE;
2200 return NULL;
2201 }
2202 }
2203 return tree;
2204 }
2205
2206 /* This function build the following tree, from regular expression
2207 <exp1><exp2>:
2208 CAT
2209 / \
2210 / \
2211 <exp1> <exp2>
2212
2213 CAT means concatenation. */
2214
2215 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2216 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2217 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218 {
2219 bin_tree_t *tree, *expr;
2220 re_dfa_t *dfa = preg->buffer;
2221 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2222 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223 return NULL;
2224
2225 while (token->type != OP_ALT && token->type != END_OF_RE
2226 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2227 {
2228 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2229 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2230 {
2231 if (tree != NULL)
2232 postorder (tree, free_tree, NULL);
2233 return NULL;
2234 }
2235 if (tree != NULL && expr != NULL)
2236 {
2237 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2238 if (newtree == NULL)
2239 {
2240 postorder (expr, free_tree, NULL);
2241 postorder (tree, free_tree, NULL);
2242 *err = REG_ESPACE;
2243 return NULL;
2244 }
2245 tree = newtree;
2246 }
2247 else if (tree == NULL)
2248 tree = expr;
2249 /* Otherwise expr == NULL, we don't need to create new tree. */
2250 }
2251 return tree;
2252 }
2253
2254 /* This function build the following tree, from regular expression a*:
2255 *
2256 |
2257 a
2258 */
2259
2260 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2261 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2262 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2263 {
2264 re_dfa_t *dfa = preg->buffer;
2265 bin_tree_t *tree;
2266 switch (token->type)
2267 {
2268 case CHARACTER:
2269 tree = create_token_tree (dfa, NULL, NULL, token);
2270 if (BE (tree == NULL, 0))
2271 {
2272 *err = REG_ESPACE;
2273 return NULL;
2274 }
2275 #ifdef RE_ENABLE_I18N
2276 if (dfa->mb_cur_max > 1)
2277 {
2278 while (!re_string_eoi (regexp)
2279 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2280 {
2281 bin_tree_t *mbc_remain;
2282 fetch_token (token, regexp, syntax);
2283 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2284 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2285 if (BE (mbc_remain == NULL || tree == NULL, 0))
2286 {
2287 *err = REG_ESPACE;
2288 return NULL;
2289 }
2290 }
2291 }
2292 #endif
2293 break;
2294
2295 case OP_OPEN_SUBEXP:
2296 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2297 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2298 return NULL;
2299 break;
2300
2301 case OP_OPEN_BRACKET:
2302 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304 return NULL;
2305 break;
2306
2307 case OP_BACK_REF:
2308 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2309 {
2310 *err = REG_ESUBREG;
2311 return NULL;
2312 }
2313 dfa->used_bkref_map |= 1 << token->opr.idx;
2314 tree = create_token_tree (dfa, NULL, NULL, token);
2315 if (BE (tree == NULL, 0))
2316 {
2317 *err = REG_ESPACE;
2318 return NULL;
2319 }
2320 ++dfa->nbackref;
2321 dfa->has_mb_node = 1;
2322 break;
2323
2324 case OP_OPEN_DUP_NUM:
2325 if (syntax & RE_CONTEXT_INVALID_DUP)
2326 {
2327 *err = REG_BADRPT;
2328 return NULL;
2329 }
2330 FALLTHROUGH;
2331 case OP_DUP_ASTERISK:
2332 case OP_DUP_PLUS:
2333 case OP_DUP_QUESTION:
2334 if (syntax & RE_CONTEXT_INVALID_OPS)
2335 {
2336 *err = REG_BADRPT;
2337 return NULL;
2338 }
2339 else if (syntax & RE_CONTEXT_INDEP_OPS)
2340 {
2341 fetch_token (token, regexp, syntax);
2342 return parse_expression (regexp, preg, token, syntax, nest, err);
2343 }
2344 FALLTHROUGH;
2345 case OP_CLOSE_SUBEXP:
2346 if ((token->type == OP_CLOSE_SUBEXP) &&
2347 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348 {
2349 *err = REG_ERPAREN;
2350 return NULL;
2351 }
2352 FALLTHROUGH;
2353 case OP_CLOSE_DUP_NUM:
2354 /* We treat it as a normal character. */
2355
2356 /* Then we can these characters as normal characters. */
2357 token->type = CHARACTER;
2358 /* mb_partial and word_char bits should be initialized already
2359 by peek_token. */
2360 tree = create_token_tree (dfa, NULL, NULL, token);
2361 if (BE (tree == NULL, 0))
2362 {
2363 *err = REG_ESPACE;
2364 return NULL;
2365 }
2366 break;
2367
2368 case ANCHOR:
2369 if ((token->opr.ctx_type
2370 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371 && dfa->word_ops_used == 0)
2372 init_word_char (dfa);
2373 if (token->opr.ctx_type == WORD_DELIM
2374 || token->opr.ctx_type == NOT_WORD_DELIM)
2375 {
2376 bin_tree_t *tree_first, *tree_last;
2377 if (token->opr.ctx_type == WORD_DELIM)
2378 {
2379 token->opr.ctx_type = WORD_FIRST;
2380 tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 token->opr.ctx_type = WORD_LAST;
2382 }
2383 else
2384 {
2385 token->opr.ctx_type = INSIDE_WORD;
2386 tree_first = create_token_tree (dfa, NULL, NULL, token);
2387 token->opr.ctx_type = INSIDE_NOTWORD;
2388 }
2389 tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392 {
2393 *err = REG_ESPACE;
2394 return NULL;
2395 }
2396 }
2397 else
2398 {
2399 tree = create_token_tree (dfa, NULL, NULL, token);
2400 if (BE (tree == NULL, 0))
2401 {
2402 *err = REG_ESPACE;
2403 return NULL;
2404 }
2405 }
2406 /* We must return here, since ANCHORs can't be followed
2407 by repetition operators.
2408 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2410 fetch_token (token, regexp, syntax);
2411 return tree;
2412
2413 case OP_PERIOD:
2414 tree = create_token_tree (dfa, NULL, NULL, token);
2415 if (BE (tree == NULL, 0))
2416 {
2417 *err = REG_ESPACE;
2418 return NULL;
2419 }
2420 if (dfa->mb_cur_max > 1)
2421 dfa->has_mb_node = 1;
2422 break;
2423
2424 case OP_WORD:
2425 case OP_NOTWORD:
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 "alnum",
2428 "_",
2429 token->type == OP_NOTWORD, err);
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 return NULL;
2432 break;
2433
2434 case OP_SPACE:
2435 case OP_NOTSPACE:
2436 tree = build_charclass_op (dfa, regexp->trans,
2437 "space",
2438 "",
2439 token->type == OP_NOTSPACE, err);
2440 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2441 return NULL;
2442 break;
2443
2444 case OP_ALT:
2445 case END_OF_RE:
2446 return NULL;
2447
2448 case BACK_SLASH:
2449 *err = REG_EESCAPE;
2450 return NULL;
2451
2452 default:
2453 /* Must not happen? */
2454 #ifdef DEBUG
2455 assert (0);
2456 #endif
2457 return NULL;
2458 }
2459 fetch_token (token, regexp, syntax);
2460
2461 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2462 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2463 {
2464 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2465 syntax, err);
2466 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2467 {
2468 if (tree != NULL)
2469 postorder (tree, free_tree, NULL);
2470 return NULL;
2471 }
2472 tree = dup_tree;
2473 /* In BRE consecutive duplications are not allowed. */
2474 if ((syntax & RE_CONTEXT_INVALID_DUP)
2475 && (token->type == OP_DUP_ASTERISK
2476 || token->type == OP_OPEN_DUP_NUM))
2477 {
2478 if (tree != NULL)
2479 postorder (tree, free_tree, NULL);
2480 *err = REG_BADRPT;
2481 return NULL;
2482 }
2483 }
2484
2485 return tree;
2486 }
2487
2488 /* This function build the following tree, from regular expression
2489 (<reg_exp>):
2490 SUBEXP
2491 |
2492 <reg_exp>
2493 */
2494
2495 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2496 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2497 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2498 {
2499 re_dfa_t *dfa = preg->buffer;
2500 bin_tree_t *tree;
2501 size_t cur_nsub;
2502 cur_nsub = preg->re_nsub++;
2503
2504 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2505
2506 /* The subexpression may be a null string. */
2507 if (token->type == OP_CLOSE_SUBEXP)
2508 tree = NULL;
2509 else
2510 {
2511 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2512 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2513 {
2514 if (tree != NULL)
2515 postorder (tree, free_tree, NULL);
2516 *err = REG_EPAREN;
2517 }
2518 if (BE (*err != REG_NOERROR, 0))
2519 return NULL;
2520 }
2521
2522 if (cur_nsub <= '9' - '1')
2523 dfa->completed_bkref_map |= 1 << cur_nsub;
2524
2525 tree = create_tree (dfa, tree, NULL, SUBEXP);
2526 if (BE (tree == NULL, 0))
2527 {
2528 *err = REG_ESPACE;
2529 return NULL;
2530 }
2531 tree->token.opr.idx = cur_nsub;
2532 return tree;
2533 }
2534
2535 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2536
2537 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2538 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2539 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2540 {
2541 bin_tree_t *tree = NULL, *old_tree = NULL;
2542 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2543 re_token_t start_token = *token;
2544
2545 if (token->type == OP_OPEN_DUP_NUM)
2546 {
2547 end = 0;
2548 start = fetch_number (regexp, token, syntax);
2549 if (start == -1)
2550 {
2551 if (token->type == CHARACTER && token->opr.c == ',')
2552 start = 0; /* We treat "{,m}" as "{0,m}". */
2553 else
2554 {
2555 *err = REG_BADBR; /* <re>{} is invalid. */
2556 return NULL;
2557 }
2558 }
2559 if (BE (start != -2, 1))
2560 {
2561 /* We treat "{n}" as "{n,n}". */
2562 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2563 : ((token->type == CHARACTER && token->opr.c == ',')
2564 ? fetch_number (regexp, token, syntax) : -2));
2565 }
2566 if (BE (start == -2 || end == -2, 0))
2567 {
2568 /* Invalid sequence. */
2569 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2570 {
2571 if (token->type == END_OF_RE)
2572 *err = REG_EBRACE;
2573 else
2574 *err = REG_BADBR;
2575
2576 return NULL;
2577 }
2578
2579 /* If the syntax bit is set, rollback. */
2580 re_string_set_index (regexp, start_idx);
2581 *token = start_token;
2582 token->type = CHARACTER;
2583 /* mb_partial and word_char bits should be already initialized by
2584 peek_token. */
2585 return elem;
2586 }
2587
2588 if (BE ((end != -1 && start > end)
2589 || token->type != OP_CLOSE_DUP_NUM, 0))
2590 {
2591 /* First number greater than second. */
2592 *err = REG_BADBR;
2593 return NULL;
2594 }
2595
2596 if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2597 {
2598 *err = REG_ESIZE;
2599 return NULL;
2600 }
2601 }
2602 else
2603 {
2604 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2606 }
2607
2608 fetch_token (token, regexp, syntax);
2609
2610 if (BE (elem == NULL, 0))
2611 return NULL;
2612 if (BE (start == 0 && end == 0, 0))
2613 {
2614 postorder (elem, free_tree, NULL);
2615 return NULL;
2616 }
2617
2618 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2619 if (BE (start > 0, 0))
2620 {
2621 tree = elem;
2622 for (i = 2; i <= start; ++i)
2623 {
2624 elem = duplicate_tree (elem, dfa);
2625 tree = create_tree (dfa, tree, elem, CONCAT);
2626 if (BE (elem == NULL || tree == NULL, 0))
2627 goto parse_dup_op_espace;
2628 }
2629
2630 if (start == end)
2631 return tree;
2632
2633 /* Duplicate ELEM before it is marked optional. */
2634 elem = duplicate_tree (elem, dfa);
2635 if (BE (elem == NULL, 0))
2636 goto parse_dup_op_espace;
2637 old_tree = tree;
2638 }
2639 else
2640 old_tree = NULL;
2641
2642 if (elem->token.type == SUBEXP)
2643 {
2644 uintptr_t subidx = elem->token.opr.idx;
2645 postorder (elem, mark_opt_subexp, (void *) subidx);
2646 }
2647
2648 tree = create_tree (dfa, elem, NULL,
2649 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2650 if (BE (tree == NULL, 0))
2651 goto parse_dup_op_espace;
2652
2653 /* This loop is actually executed only when end != -1,
2654 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2655 already created the start+1-th copy. */
2656 if (TYPE_SIGNED (Idx) || end != -1)
2657 for (i = start + 2; i <= end; ++i)
2658 {
2659 elem = duplicate_tree (elem, dfa);
2660 tree = create_tree (dfa, tree, elem, CONCAT);
2661 if (BE (elem == NULL || tree == NULL, 0))
2662 goto parse_dup_op_espace;
2663
2664 tree = create_tree (dfa, tree, NULL, OP_ALT);
2665 if (BE (tree == NULL, 0))
2666 goto parse_dup_op_espace;
2667 }
2668
2669 if (old_tree)
2670 tree = create_tree (dfa, old_tree, tree, CONCAT);
2671
2672 return tree;
2673
2674 parse_dup_op_espace:
2675 *err = REG_ESPACE;
2676 return NULL;
2677 }
2678
2679 /* Size of the names for collating symbol/equivalence_class/character_class.
2680 I'm not sure, but maybe enough. */
2681 #define BRACKET_NAME_BUF_SIZE 32
2682
2683 #ifndef _LIBC
2684
2685 # ifdef RE_ENABLE_I18N
2686 /* Convert the byte B to the corresponding wide character. In a
2687 unibyte locale, treat B as itself if it is an encoding error.
2688 In a multibyte locale, return WEOF if B is an encoding error. */
2689 static wint_t
parse_byte(unsigned char b,re_charset_t * mbcset)2690 parse_byte (unsigned char b, re_charset_t *mbcset)
2691 {
2692 wint_t wc = __btowc (b);
2693 return wc == WEOF && !mbcset ? b : wc;
2694 }
2695 #endif
2696
2697 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2698 Build the range expression which starts from START_ELEM, and ends
2699 at END_ELEM. The result are written to MBCSET and SBCSET.
2700 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2701 mbcset->range_ends, is a pointer argument since we may
2702 update it. */
2703
2704 static reg_errcode_t
2705 # ifdef RE_ENABLE_I18N
build_range_exp(const reg_syntax_t syntax,bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,const bracket_elem_t * start_elem,const bracket_elem_t * end_elem)2706 build_range_exp (const reg_syntax_t syntax,
2707 bitset_t sbcset,
2708 re_charset_t *mbcset,
2709 Idx *range_alloc,
2710 const bracket_elem_t *start_elem,
2711 const bracket_elem_t *end_elem)
2712 # else /* not RE_ENABLE_I18N */
2713 build_range_exp (const reg_syntax_t syntax,
2714 bitset_t sbcset,
2715 const bracket_elem_t *start_elem,
2716 const bracket_elem_t *end_elem)
2717 # endif /* not RE_ENABLE_I18N */
2718 {
2719 unsigned int start_ch, end_ch;
2720 /* Equivalence Classes and Character Classes can't be a range start/end. */
2721 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2722 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2723 0))
2724 return REG_ERANGE;
2725
2726 /* We can handle no multi character collating elements without libc
2727 support. */
2728 if (BE ((start_elem->type == COLL_SYM
2729 && strlen ((char *) start_elem->opr.name) > 1)
2730 || (end_elem->type == COLL_SYM
2731 && strlen ((char *) end_elem->opr.name) > 1), 0))
2732 return REG_ECOLLATE;
2733
2734 # ifdef RE_ENABLE_I18N
2735 {
2736 wchar_t wc;
2737 wint_t start_wc;
2738 wint_t end_wc;
2739
2740 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2741 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2742 : 0));
2743 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2744 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2745 : 0));
2746 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2747 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2748 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2749 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2750 if (start_wc == WEOF || end_wc == WEOF)
2751 return REG_ECOLLATE;
2752 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2753 return REG_ERANGE;
2754
2755 /* Got valid collation sequence values, add them as a new entry.
2756 However, for !_LIBC we have no collation elements: if the
2757 character set is single byte, the single byte character set
2758 that we build below suffices. parse_bracket_exp passes
2759 no MBCSET if dfa->mb_cur_max == 1. */
2760 if (mbcset)
2761 {
2762 /* Check the space of the arrays. */
2763 if (BE (*range_alloc == mbcset->nranges, 0))
2764 {
2765 /* There is not enough space, need realloc. */
2766 wchar_t *new_array_start, *new_array_end;
2767 Idx new_nranges;
2768
2769 /* +1 in case of mbcset->nranges is 0. */
2770 new_nranges = 2 * mbcset->nranges + 1;
2771 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2772 are NULL if *range_alloc == 0. */
2773 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2774 new_nranges);
2775 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2776 new_nranges);
2777
2778 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2779 {
2780 re_free (new_array_start);
2781 re_free (new_array_end);
2782 return REG_ESPACE;
2783 }
2784
2785 mbcset->range_starts = new_array_start;
2786 mbcset->range_ends = new_array_end;
2787 *range_alloc = new_nranges;
2788 }
2789
2790 mbcset->range_starts[mbcset->nranges] = start_wc;
2791 mbcset->range_ends[mbcset->nranges++] = end_wc;
2792 }
2793
2794 /* Build the table for single byte characters. */
2795 for (wc = 0; wc < SBC_MAX; ++wc)
2796 {
2797 if (start_wc <= wc && wc <= end_wc)
2798 bitset_set (sbcset, wc);
2799 }
2800 }
2801 # else /* not RE_ENABLE_I18N */
2802 {
2803 unsigned int ch;
2804 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2805 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2806 : 0));
2807 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2808 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2809 : 0));
2810 if (start_ch > end_ch)
2811 return REG_ERANGE;
2812 /* Build the table for single byte characters. */
2813 for (ch = 0; ch < SBC_MAX; ++ch)
2814 if (start_ch <= ch && ch <= end_ch)
2815 bitset_set (sbcset, ch);
2816 }
2817 # endif /* not RE_ENABLE_I18N */
2818 return REG_NOERROR;
2819 }
2820 #endif /* not _LIBC */
2821
2822 #ifndef _LIBC
2823 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2824 Build the collating element which is represented by NAME.
2825 The result are written to MBCSET and SBCSET.
2826 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2827 pointer argument since we may update it. */
2828
2829 static reg_errcode_t
2830 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2831 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2832 Idx *coll_sym_alloc, const unsigned char *name)
2833 # else /* not RE_ENABLE_I18N */
2834 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2835 # endif /* not RE_ENABLE_I18N */
2836 {
2837 size_t name_len = strlen ((const char *) name);
2838 if (BE (name_len != 1, 0))
2839 return REG_ECOLLATE;
2840 else
2841 {
2842 bitset_set (sbcset, name[0]);
2843 return REG_NOERROR;
2844 }
2845 }
2846 #endif /* not _LIBC */
2847
2848 /* This function parse bracket expression like "[abc]", "[a-c]",
2849 "[[.a-a.]]" etc. */
2850
2851 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2852 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2853 reg_syntax_t syntax, reg_errcode_t *err)
2854 {
2855 #ifdef _LIBC
2856 const unsigned char *collseqmb;
2857 const char *collseqwc;
2858 uint32_t nrules;
2859 int32_t table_size;
2860 const int32_t *symb_table;
2861 const unsigned char *extra;
2862
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864 Seek the collating symbol entry corresponding to NAME.
2865 Return the index of the symbol in the SYMB_TABLE,
2866 or -1 if not found. */
2867
2868 auto inline int32_t
2869 __attribute__ ((always_inline))
2870 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2871 {
2872 int32_t elem;
2873
2874 for (elem = 0; elem < table_size; elem++)
2875 if (symb_table[2 * elem] != 0)
2876 {
2877 int32_t idx = symb_table[2 * elem + 1];
2878 /* Skip the name of collating element name. */
2879 idx += 1 + extra[idx];
2880 if (/* Compare the length of the name. */
2881 name_len == extra[idx]
2882 /* Compare the name. */
2883 && memcmp (name, &extra[idx + 1], name_len) == 0)
2884 /* Yep, this is the entry. */
2885 return elem;
2886 }
2887 return -1;
2888 }
2889
2890 /* Local function for parse_bracket_exp used in _LIBC environment.
2891 Look up the collation sequence value of BR_ELEM.
2892 Return the value if succeeded, UINT_MAX otherwise. */
2893
2894 auto inline unsigned int
2895 __attribute__ ((always_inline))
2896 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2897 {
2898 if (br_elem->type == SB_CHAR)
2899 {
2900 /*
2901 if (MB_CUR_MAX == 1)
2902 */
2903 if (nrules == 0)
2904 return collseqmb[br_elem->opr.ch];
2905 else
2906 {
2907 wint_t wc = __btowc (br_elem->opr.ch);
2908 return __collseq_table_lookup (collseqwc, wc);
2909 }
2910 }
2911 else if (br_elem->type == MB_CHAR)
2912 {
2913 if (nrules != 0)
2914 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2915 }
2916 else if (br_elem->type == COLL_SYM)
2917 {
2918 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2919 if (nrules != 0)
2920 {
2921 int32_t elem, idx;
2922 elem = seek_collating_symbol_entry (br_elem->opr.name,
2923 sym_name_len);
2924 if (elem != -1)
2925 {
2926 /* We found the entry. */
2927 idx = symb_table[2 * elem + 1];
2928 /* Skip the name of collating element name. */
2929 idx += 1 + extra[idx];
2930 /* Skip the byte sequence of the collating element. */
2931 idx += 1 + extra[idx];
2932 /* Adjust for the alignment. */
2933 idx = (idx + 3) & ~3;
2934 /* Skip the multibyte collation sequence value. */
2935 idx += sizeof (unsigned int);
2936 /* Skip the wide char sequence of the collating element. */
2937 idx += sizeof (unsigned int) *
2938 (1 + *(unsigned int *) (extra + idx));
2939 /* Return the collation sequence value. */
2940 return *(unsigned int *) (extra + idx);
2941 }
2942 else if (sym_name_len == 1)
2943 {
2944 /* No valid character. Match it as a single byte
2945 character. */
2946 return collseqmb[br_elem->opr.name[0]];
2947 }
2948 }
2949 else if (sym_name_len == 1)
2950 return collseqmb[br_elem->opr.name[0]];
2951 }
2952 return UINT_MAX;
2953 }
2954
2955 /* Local function for parse_bracket_exp used in _LIBC environment.
2956 Build the range expression which starts from START_ELEM, and ends
2957 at END_ELEM. The result are written to MBCSET and SBCSET.
2958 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2959 mbcset->range_ends, is a pointer argument since we may
2960 update it. */
2961
2962 auto inline reg_errcode_t
2963 __attribute__ ((always_inline))
2964 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2965 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2966 {
2967 unsigned int ch;
2968 uint32_t start_collseq;
2969 uint32_t end_collseq;
2970
2971 /* Equivalence Classes and Character Classes can't be a range
2972 start/end. */
2973 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2974 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2975 0))
2976 return REG_ERANGE;
2977
2978 /* FIXME: Implement rational ranges here, too. */
2979 start_collseq = lookup_collation_sequence_value (start_elem);
2980 end_collseq = lookup_collation_sequence_value (end_elem);
2981 /* Check start/end collation sequence values. */
2982 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2983 return REG_ECOLLATE;
2984 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2985 return REG_ERANGE;
2986
2987 /* Got valid collation sequence values, add them as a new entry.
2988 However, if we have no collation elements, and the character set
2989 is single byte, the single byte character set that we
2990 build below suffices. */
2991 if (nrules > 0 || dfa->mb_cur_max > 1)
2992 {
2993 /* Check the space of the arrays. */
2994 if (BE (*range_alloc == mbcset->nranges, 0))
2995 {
2996 /* There is not enough space, need realloc. */
2997 uint32_t *new_array_start;
2998 uint32_t *new_array_end;
2999 Idx new_nranges;
3000
3001 /* +1 in case of mbcset->nranges is 0. */
3002 new_nranges = 2 * mbcset->nranges + 1;
3003 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3004 new_nranges);
3005 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3006 new_nranges);
3007
3008 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3009 return REG_ESPACE;
3010
3011 mbcset->range_starts = new_array_start;
3012 mbcset->range_ends = new_array_end;
3013 *range_alloc = new_nranges;
3014 }
3015
3016 mbcset->range_starts[mbcset->nranges] = start_collseq;
3017 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3018 }
3019
3020 /* Build the table for single byte characters. */
3021 for (ch = 0; ch < SBC_MAX; ch++)
3022 {
3023 uint32_t ch_collseq;
3024 /*
3025 if (MB_CUR_MAX == 1)
3026 */
3027 if (nrules == 0)
3028 ch_collseq = collseqmb[ch];
3029 else
3030 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3031 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3032 bitset_set (sbcset, ch);
3033 }
3034 return REG_NOERROR;
3035 }
3036
3037 /* Local function for parse_bracket_exp used in _LIBC environment.
3038 Build the collating element which is represented by NAME.
3039 The result are written to MBCSET and SBCSET.
3040 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3041 pointer argument since we may update it. */
3042
3043 auto inline reg_errcode_t
3044 __attribute__ ((always_inline))
3045 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3046 Idx *coll_sym_alloc, const unsigned char *name)
3047 {
3048 int32_t elem, idx;
3049 size_t name_len = strlen ((const char *) name);
3050 if (nrules != 0)
3051 {
3052 elem = seek_collating_symbol_entry (name, name_len);
3053 if (elem != -1)
3054 {
3055 /* We found the entry. */
3056 idx = symb_table[2 * elem + 1];
3057 /* Skip the name of collating element name. */
3058 idx += 1 + extra[idx];
3059 }
3060 else if (name_len == 1)
3061 {
3062 /* No valid character, treat it as a normal
3063 character. */
3064 bitset_set (sbcset, name[0]);
3065 return REG_NOERROR;
3066 }
3067 else
3068 return REG_ECOLLATE;
3069
3070 /* Got valid collation sequence, add it as a new entry. */
3071 /* Check the space of the arrays. */
3072 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3073 {
3074 /* Not enough, realloc it. */
3075 /* +1 in case of mbcset->ncoll_syms is 0. */
3076 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3077 /* Use realloc since mbcset->coll_syms is NULL
3078 if *alloc == 0. */
3079 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3080 new_coll_sym_alloc);
3081 if (BE (new_coll_syms == NULL, 0))
3082 return REG_ESPACE;
3083 mbcset->coll_syms = new_coll_syms;
3084 *coll_sym_alloc = new_coll_sym_alloc;
3085 }
3086 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3087 return REG_NOERROR;
3088 }
3089 else
3090 {
3091 if (BE (name_len != 1, 0))
3092 return REG_ECOLLATE;
3093 else
3094 {
3095 bitset_set (sbcset, name[0]);
3096 return REG_NOERROR;
3097 }
3098 }
3099 }
3100 #endif
3101
3102 re_token_t br_token;
3103 re_bitset_ptr_t sbcset;
3104 #ifdef RE_ENABLE_I18N
3105 re_charset_t *mbcset;
3106 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3107 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3108 #endif /* not RE_ENABLE_I18N */
3109 bool non_match = false;
3110 bin_tree_t *work_tree;
3111 int token_len;
3112 bool first_round = true;
3113 #ifdef _LIBC
3114 collseqmb = (const unsigned char *)
3115 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3116 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3117 if (nrules)
3118 {
3119 /*
3120 if (MB_CUR_MAX > 1)
3121 */
3122 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3123 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3124 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3125 _NL_COLLATE_SYMB_TABLEMB);
3126 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3127 _NL_COLLATE_SYMB_EXTRAMB);
3128 }
3129 #endif
3130 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3131 #ifdef RE_ENABLE_I18N
3132 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3133 #endif /* RE_ENABLE_I18N */
3134 #ifdef RE_ENABLE_I18N
3135 if (BE (sbcset == NULL || mbcset == NULL, 0))
3136 #else
3137 if (BE (sbcset == NULL, 0))
3138 #endif /* RE_ENABLE_I18N */
3139 {
3140 re_free (sbcset);
3141 #ifdef RE_ENABLE_I18N
3142 re_free (mbcset);
3143 #endif
3144 *err = REG_ESPACE;
3145 return NULL;
3146 }
3147
3148 token_len = peek_token_bracket (token, regexp, syntax);
3149 if (BE (token->type == END_OF_RE, 0))
3150 {
3151 *err = REG_BADPAT;
3152 goto parse_bracket_exp_free_return;
3153 }
3154 if (token->type == OP_NON_MATCH_LIST)
3155 {
3156 #ifdef RE_ENABLE_I18N
3157 mbcset->non_match = 1;
3158 #endif /* not RE_ENABLE_I18N */
3159 non_match = true;
3160 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3161 bitset_set (sbcset, '\n');
3162 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3163 token_len = peek_token_bracket (token, regexp, syntax);
3164 if (BE (token->type == END_OF_RE, 0))
3165 {
3166 *err = REG_BADPAT;
3167 goto parse_bracket_exp_free_return;
3168 }
3169 }
3170
3171 /* We treat the first ']' as a normal character. */
3172 if (token->type == OP_CLOSE_BRACKET)
3173 token->type = CHARACTER;
3174
3175 while (1)
3176 {
3177 bracket_elem_t start_elem, end_elem;
3178 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3179 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3180 reg_errcode_t ret;
3181 int token_len2 = 0;
3182 bool is_range_exp = false;
3183 re_token_t token2;
3184
3185 start_elem.opr.name = start_name_buf;
3186 start_elem.type = COLL_SYM;
3187 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3188 syntax, first_round);
3189 if (BE (ret != REG_NOERROR, 0))
3190 {
3191 *err = ret;
3192 goto parse_bracket_exp_free_return;
3193 }
3194 first_round = false;
3195
3196 /* Get information about the next token. We need it in any case. */
3197 token_len = peek_token_bracket (token, regexp, syntax);
3198
3199 /* Do not check for ranges if we know they are not allowed. */
3200 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3201 {
3202 if (BE (token->type == END_OF_RE, 0))
3203 {
3204 *err = REG_EBRACK;
3205 goto parse_bracket_exp_free_return;
3206 }
3207 if (token->type == OP_CHARSET_RANGE)
3208 {
3209 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3210 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3211 if (BE (token2.type == END_OF_RE, 0))
3212 {
3213 *err = REG_EBRACK;
3214 goto parse_bracket_exp_free_return;
3215 }
3216 if (token2.type == OP_CLOSE_BRACKET)
3217 {
3218 /* We treat the last '-' as a normal character. */
3219 re_string_skip_bytes (regexp, -token_len);
3220 token->type = CHARACTER;
3221 }
3222 else
3223 is_range_exp = true;
3224 }
3225 }
3226
3227 if (is_range_exp == true)
3228 {
3229 end_elem.opr.name = end_name_buf;
3230 end_elem.type = COLL_SYM;
3231 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3232 dfa, syntax, true);
3233 if (BE (ret != REG_NOERROR, 0))
3234 {
3235 *err = ret;
3236 goto parse_bracket_exp_free_return;
3237 }
3238
3239 token_len = peek_token_bracket (token, regexp, syntax);
3240
3241 #ifdef _LIBC
3242 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3243 &start_elem, &end_elem);
3244 #else
3245 # ifdef RE_ENABLE_I18N
3246 *err = build_range_exp (syntax, sbcset,
3247 dfa->mb_cur_max > 1 ? mbcset : NULL,
3248 &range_alloc, &start_elem, &end_elem);
3249 # else
3250 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3251 # endif
3252 #endif /* RE_ENABLE_I18N */
3253 if (BE (*err != REG_NOERROR, 0))
3254 goto parse_bracket_exp_free_return;
3255 }
3256 else
3257 {
3258 switch (start_elem.type)
3259 {
3260 case SB_CHAR:
3261 bitset_set (sbcset, start_elem.opr.ch);
3262 break;
3263 #ifdef RE_ENABLE_I18N
3264 case MB_CHAR:
3265 /* Check whether the array has enough space. */
3266 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3267 {
3268 wchar_t *new_mbchars;
3269 /* Not enough, realloc it. */
3270 /* +1 in case of mbcset->nmbchars is 0. */
3271 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3272 /* Use realloc since array is NULL if *alloc == 0. */
3273 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3274 mbchar_alloc);
3275 if (BE (new_mbchars == NULL, 0))
3276 goto parse_bracket_exp_espace;
3277 mbcset->mbchars = new_mbchars;
3278 }
3279 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3280 break;
3281 #endif /* RE_ENABLE_I18N */
3282 case EQUIV_CLASS:
3283 *err = build_equiv_class (sbcset,
3284 #ifdef RE_ENABLE_I18N
3285 mbcset, &equiv_class_alloc,
3286 #endif /* RE_ENABLE_I18N */
3287 start_elem.opr.name);
3288 if (BE (*err != REG_NOERROR, 0))
3289 goto parse_bracket_exp_free_return;
3290 break;
3291 case COLL_SYM:
3292 *err = build_collating_symbol (sbcset,
3293 #ifdef RE_ENABLE_I18N
3294 mbcset, &coll_sym_alloc,
3295 #endif /* RE_ENABLE_I18N */
3296 start_elem.opr.name);
3297 if (BE (*err != REG_NOERROR, 0))
3298 goto parse_bracket_exp_free_return;
3299 break;
3300 case CHAR_CLASS:
3301 *err = build_charclass (regexp->trans, sbcset,
3302 #ifdef RE_ENABLE_I18N
3303 mbcset, &char_class_alloc,
3304 #endif /* RE_ENABLE_I18N */
3305 (const char *) start_elem.opr.name,
3306 syntax);
3307 if (BE (*err != REG_NOERROR, 0))
3308 goto parse_bracket_exp_free_return;
3309 break;
3310 default:
3311 assert (0);
3312 break;
3313 }
3314 }
3315 if (BE (token->type == END_OF_RE, 0))
3316 {
3317 *err = REG_EBRACK;
3318 goto parse_bracket_exp_free_return;
3319 }
3320 if (token->type == OP_CLOSE_BRACKET)
3321 break;
3322 }
3323
3324 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3325
3326 /* If it is non-matching list. */
3327 if (non_match)
3328 bitset_not (sbcset);
3329
3330 #ifdef RE_ENABLE_I18N
3331 /* Ensure only single byte characters are set. */
3332 if (dfa->mb_cur_max > 1)
3333 bitset_mask (sbcset, dfa->sb_char);
3334
3335 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3336 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3337 || mbcset->non_match)))
3338 {
3339 bin_tree_t *mbc_tree;
3340 int sbc_idx;
3341 /* Build a tree for complex bracket. */
3342 dfa->has_mb_node = 1;
3343 br_token.type = COMPLEX_BRACKET;
3344 br_token.opr.mbcset = mbcset;
3345 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3346 if (BE (mbc_tree == NULL, 0))
3347 goto parse_bracket_exp_espace;
3348 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3349 if (sbcset[sbc_idx])
3350 break;
3351 /* If there are no bits set in sbcset, there is no point
3352 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3353 if (sbc_idx < BITSET_WORDS)
3354 {
3355 /* Build a tree for simple bracket. */
3356 br_token.type = SIMPLE_BRACKET;
3357 br_token.opr.sbcset = sbcset;
3358 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359 if (BE (work_tree == NULL, 0))
3360 goto parse_bracket_exp_espace;
3361
3362 /* Then join them by ALT node. */
3363 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3364 if (BE (work_tree == NULL, 0))
3365 goto parse_bracket_exp_espace;
3366 }
3367 else
3368 {
3369 re_free (sbcset);
3370 work_tree = mbc_tree;
3371 }
3372 }
3373 else
3374 #endif /* not RE_ENABLE_I18N */
3375 {
3376 #ifdef RE_ENABLE_I18N
3377 free_charset (mbcset);
3378 #endif
3379 /* Build a tree for simple bracket. */
3380 br_token.type = SIMPLE_BRACKET;
3381 br_token.opr.sbcset = sbcset;
3382 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3383 if (BE (work_tree == NULL, 0))
3384 goto parse_bracket_exp_espace;
3385 }
3386 return work_tree;
3387
3388 parse_bracket_exp_espace:
3389 *err = REG_ESPACE;
3390 parse_bracket_exp_free_return:
3391 re_free (sbcset);
3392 #ifdef RE_ENABLE_I18N
3393 free_charset (mbcset);
3394 #endif /* RE_ENABLE_I18N */
3395 return NULL;
3396 }
3397
3398 /* Parse an element in the bracket expression. */
3399
3400 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3401 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3402 re_token_t *token, int token_len, re_dfa_t *dfa,
3403 reg_syntax_t syntax, bool accept_hyphen)
3404 {
3405 #ifdef RE_ENABLE_I18N
3406 int cur_char_size;
3407 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3408 if (cur_char_size > 1)
3409 {
3410 elem->type = MB_CHAR;
3411 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3412 re_string_skip_bytes (regexp, cur_char_size);
3413 return REG_NOERROR;
3414 }
3415 #endif /* RE_ENABLE_I18N */
3416 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3417 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3418 || token->type == OP_OPEN_EQUIV_CLASS)
3419 return parse_bracket_symbol (elem, regexp, token);
3420 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3421 {
3422 /* A '-' must only appear as anything but a range indicator before
3423 the closing bracket. Everything else is an error. */
3424 re_token_t token2;
3425 (void) peek_token_bracket (&token2, regexp, syntax);
3426 if (token2.type != OP_CLOSE_BRACKET)
3427 /* The actual error value is not standardized since this whole
3428 case is undefined. But ERANGE makes good sense. */
3429 return REG_ERANGE;
3430 }
3431 elem->type = SB_CHAR;
3432 elem->opr.ch = token->opr.c;
3433 return REG_NOERROR;
3434 }
3435
3436 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3437 such as [:<character_class>:], [.<collating_element>.], and
3438 [=<equivalent_class>=]. */
3439
3440 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3441 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3442 re_token_t *token)
3443 {
3444 unsigned char ch, delim = token->opr.c;
3445 int i = 0;
3446 if (re_string_eoi(regexp))
3447 return REG_EBRACK;
3448 for (;; ++i)
3449 {
3450 if (i >= BRACKET_NAME_BUF_SIZE)
3451 return REG_EBRACK;
3452 if (token->type == OP_OPEN_CHAR_CLASS)
3453 ch = re_string_fetch_byte_case (regexp);
3454 else
3455 ch = re_string_fetch_byte (regexp);
3456 if (re_string_eoi(regexp))
3457 return REG_EBRACK;
3458 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3459 break;
3460 elem->opr.name[i] = ch;
3461 }
3462 re_string_skip_bytes (regexp, 1);
3463 elem->opr.name[i] = '\0';
3464 switch (token->type)
3465 {
3466 case OP_OPEN_COLL_ELEM:
3467 elem->type = COLL_SYM;
3468 break;
3469 case OP_OPEN_EQUIV_CLASS:
3470 elem->type = EQUIV_CLASS;
3471 break;
3472 case OP_OPEN_CHAR_CLASS:
3473 elem->type = CHAR_CLASS;
3474 break;
3475 default:
3476 break;
3477 }
3478 return REG_NOERROR;
3479 }
3480
3481 /* Helper function for parse_bracket_exp.
3482 Build the equivalence class which is represented by NAME.
3483 The result are written to MBCSET and SBCSET.
3484 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3485 is a pointer argument since we may update it. */
3486
3487 static reg_errcode_t
3488 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3489 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3490 Idx *equiv_class_alloc, const unsigned char *name)
3491 #else /* not RE_ENABLE_I18N */
3492 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3493 #endif /* not RE_ENABLE_I18N */
3494 {
3495 #ifdef _LIBC
3496 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3497 if (nrules != 0)
3498 {
3499 const int32_t *table, *indirect;
3500 const unsigned char *weights, *extra, *cp;
3501 unsigned char char_buf[2];
3502 int32_t idx1, idx2;
3503 unsigned int ch;
3504 size_t len;
3505 /* Calculate the index for equivalence class. */
3506 cp = name;
3507 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 _NL_COLLATE_WEIGHTMB);
3510 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511 _NL_COLLATE_EXTRAMB);
3512 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3513 _NL_COLLATE_INDIRECTMB);
3514 idx1 = findidx (table, indirect, extra, &cp, -1);
3515 if (BE (idx1 == 0 || *cp != '\0', 0))
3516 /* This isn't a valid character. */
3517 return REG_ECOLLATE;
3518
3519 /* Build single byte matching table for this equivalence class. */
3520 len = weights[idx1 & 0xffffff];
3521 for (ch = 0; ch < SBC_MAX; ++ch)
3522 {
3523 char_buf[0] = ch;
3524 cp = char_buf;
3525 idx2 = findidx (table, indirect, extra, &cp, 1);
3526 /*
3527 idx2 = table[ch];
3528 */
3529 if (idx2 == 0)
3530 /* This isn't a valid character. */
3531 continue;
3532 /* Compare only if the length matches and the collation rule
3533 index is the same. */
3534 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3535 && memcmp (weights + (idx1 & 0xffffff) + 1,
3536 weights + (idx2 & 0xffffff) + 1, len) == 0)
3537 bitset_set (sbcset, ch);
3538 }
3539 /* Check whether the array has enough space. */
3540 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3541 {
3542 /* Not enough, realloc it. */
3543 /* +1 in case of mbcset->nequiv_classes is 0. */
3544 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3545 /* Use realloc since the array is NULL if *alloc == 0. */
3546 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3547 int32_t,
3548 new_equiv_class_alloc);
3549 if (BE (new_equiv_classes == NULL, 0))
3550 return REG_ESPACE;
3551 mbcset->equiv_classes = new_equiv_classes;
3552 *equiv_class_alloc = new_equiv_class_alloc;
3553 }
3554 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3555 }
3556 else
3557 #endif /* _LIBC */
3558 {
3559 if (BE (strlen ((const char *) name) != 1, 0))
3560 return REG_ECOLLATE;
3561 bitset_set (sbcset, *name);
3562 }
3563 return REG_NOERROR;
3564 }
3565
3566 /* Helper function for parse_bracket_exp.
3567 Build the character class which is represented by NAME.
3568 The result are written to MBCSET and SBCSET.
3569 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3570 is a pointer argument since we may update it. */
3571
3572 static reg_errcode_t
3573 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const char * class_name,reg_syntax_t syntax)3574 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3575 re_charset_t *mbcset, Idx *char_class_alloc,
3576 const char *class_name, reg_syntax_t syntax)
3577 #else /* not RE_ENABLE_I18N */
3578 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3579 const char *class_name, reg_syntax_t syntax)
3580 #endif /* not RE_ENABLE_I18N */
3581 {
3582 int i;
3583 const char *name = class_name;
3584
3585 /* In case of REG_ICASE "upper" and "lower" match the both of
3586 upper and lower cases. */
3587 if ((syntax & RE_ICASE)
3588 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3589 name = "alpha";
3590
3591 #ifdef RE_ENABLE_I18N
3592 /* Check the space of the arrays. */
3593 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3594 {
3595 /* Not enough, realloc it. */
3596 /* +1 in case of mbcset->nchar_classes is 0. */
3597 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3598 /* Use realloc since array is NULL if *alloc == 0. */
3599 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3600 new_char_class_alloc);
3601 if (BE (new_char_classes == NULL, 0))
3602 return REG_ESPACE;
3603 mbcset->char_classes = new_char_classes;
3604 *char_class_alloc = new_char_class_alloc;
3605 }
3606 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3607 #endif /* RE_ENABLE_I18N */
3608
3609 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3610 do { \
3611 if (BE (trans != NULL, 0)) \
3612 { \
3613 for (i = 0; i < SBC_MAX; ++i) \
3614 if (ctype_func (i)) \
3615 bitset_set (sbcset, trans[i]); \
3616 } \
3617 else \
3618 { \
3619 for (i = 0; i < SBC_MAX; ++i) \
3620 if (ctype_func (i)) \
3621 bitset_set (sbcset, i); \
3622 } \
3623 } while (0)
3624
3625 if (strcmp (name, "alnum") == 0)
3626 BUILD_CHARCLASS_LOOP (isalnum);
3627 else if (strcmp (name, "cntrl") == 0)
3628 BUILD_CHARCLASS_LOOP (iscntrl);
3629 else if (strcmp (name, "lower") == 0)
3630 BUILD_CHARCLASS_LOOP (islower);
3631 else if (strcmp (name, "space") == 0)
3632 BUILD_CHARCLASS_LOOP (isspace);
3633 else if (strcmp (name, "alpha") == 0)
3634 BUILD_CHARCLASS_LOOP (isalpha);
3635 else if (strcmp (name, "digit") == 0)
3636 BUILD_CHARCLASS_LOOP (isdigit);
3637 else if (strcmp (name, "print") == 0)
3638 BUILD_CHARCLASS_LOOP (isprint);
3639 else if (strcmp (name, "upper") == 0)
3640 BUILD_CHARCLASS_LOOP (isupper);
3641 else if (strcmp (name, "blank") == 0)
3642 BUILD_CHARCLASS_LOOP (isblank);
3643 else if (strcmp (name, "graph") == 0)
3644 BUILD_CHARCLASS_LOOP (isgraph);
3645 else if (strcmp (name, "punct") == 0)
3646 BUILD_CHARCLASS_LOOP (ispunct);
3647 else if (strcmp (name, "xdigit") == 0)
3648 BUILD_CHARCLASS_LOOP (isxdigit);
3649 else
3650 return REG_ECTYPE;
3651
3652 return REG_NOERROR;
3653 }
3654
3655 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const char * class_name,const char * extra,bool non_match,reg_errcode_t * err)3656 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3657 const char *class_name,
3658 const char *extra, bool non_match,
3659 reg_errcode_t *err)
3660 {
3661 re_bitset_ptr_t sbcset;
3662 #ifdef RE_ENABLE_I18N
3663 re_charset_t *mbcset;
3664 Idx alloc = 0;
3665 #endif /* not RE_ENABLE_I18N */
3666 reg_errcode_t ret;
3667 re_token_t br_token;
3668 bin_tree_t *tree;
3669
3670 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3671 if (BE (sbcset == NULL, 0))
3672 {
3673 *err = REG_ESPACE;
3674 return NULL;
3675 }
3676 #ifdef RE_ENABLE_I18N
3677 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3678 if (BE (mbcset == NULL, 0))
3679 {
3680 re_free (sbcset);
3681 *err = REG_ESPACE;
3682 return NULL;
3683 }
3684 mbcset->non_match = non_match;
3685 #endif /* RE_ENABLE_I18N */
3686
3687 /* We don't care the syntax in this case. */
3688 ret = build_charclass (trans, sbcset,
3689 #ifdef RE_ENABLE_I18N
3690 mbcset, &alloc,
3691 #endif /* RE_ENABLE_I18N */
3692 class_name, 0);
3693
3694 if (BE (ret != REG_NOERROR, 0))
3695 {
3696 re_free (sbcset);
3697 #ifdef RE_ENABLE_I18N
3698 free_charset (mbcset);
3699 #endif /* RE_ENABLE_I18N */
3700 *err = ret;
3701 return NULL;
3702 }
3703 /* \w match '_' also. */
3704 for (; *extra; extra++)
3705 bitset_set (sbcset, *extra);
3706
3707 /* If it is non-matching list. */
3708 if (non_match)
3709 bitset_not (sbcset);
3710
3711 #ifdef RE_ENABLE_I18N
3712 /* Ensure only single byte characters are set. */
3713 if (dfa->mb_cur_max > 1)
3714 bitset_mask (sbcset, dfa->sb_char);
3715 #endif
3716
3717 /* Build a tree for simple bracket. */
3718 #if defined GCC_LINT || defined lint
3719 memset (&br_token, 0, sizeof br_token);
3720 #endif
3721 br_token.type = SIMPLE_BRACKET;
3722 br_token.opr.sbcset = sbcset;
3723 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724 if (BE (tree == NULL, 0))
3725 goto build_word_op_espace;
3726
3727 #ifdef RE_ENABLE_I18N
3728 if (dfa->mb_cur_max > 1)
3729 {
3730 bin_tree_t *mbc_tree;
3731 /* Build a tree for complex bracket. */
3732 br_token.type = COMPLEX_BRACKET;
3733 br_token.opr.mbcset = mbcset;
3734 dfa->has_mb_node = 1;
3735 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3736 if (BE (mbc_tree == NULL, 0))
3737 goto build_word_op_espace;
3738 /* Then join them by ALT node. */
3739 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3740 if (BE (mbc_tree != NULL, 1))
3741 return tree;
3742 }
3743 else
3744 {
3745 free_charset (mbcset);
3746 return tree;
3747 }
3748 #else /* not RE_ENABLE_I18N */
3749 return tree;
3750 #endif /* not RE_ENABLE_I18N */
3751
3752 build_word_op_espace:
3753 re_free (sbcset);
3754 #ifdef RE_ENABLE_I18N
3755 free_charset (mbcset);
3756 #endif /* RE_ENABLE_I18N */
3757 *err = REG_ESPACE;
3758 return NULL;
3759 }
3760
3761 /* This is intended for the expressions like "a{1,3}".
3762 Fetch a number from 'input', and return the number.
3763 Return -1 if the number field is empty like "{,1}".
3764 Return RE_DUP_MAX + 1 if the number field is too large.
3765 Return -2 if an error occurred. */
3766
3767 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3768 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3769 {
3770 Idx num = -1;
3771 unsigned char c;
3772 while (1)
3773 {
3774 fetch_token (token, input, syntax);
3775 c = token->opr.c;
3776 if (BE (token->type == END_OF_RE, 0))
3777 return -2;
3778 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3779 break;
3780 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3781 ? -2
3782 : num == -1
3783 ? c - '0'
3784 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3785 }
3786 return num;
3787 }
3788
3789 #ifdef RE_ENABLE_I18N
3790 static void
free_charset(re_charset_t * cset)3791 free_charset (re_charset_t *cset)
3792 {
3793 re_free (cset->mbchars);
3794 # ifdef _LIBC
3795 re_free (cset->coll_syms);
3796 re_free (cset->equiv_classes);
3797 re_free (cset->range_starts);
3798 re_free (cset->range_ends);
3799 # endif
3800 re_free (cset->char_classes);
3801 re_free (cset);
3802 }
3803 #endif /* RE_ENABLE_I18N */
3804
3805 /* Functions for binary tree operation. */
3806
3807 /* Create a tree node. */
3808
3809 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3810 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3811 re_token_type_t type)
3812 {
3813 re_token_t t;
3814 #if defined GCC_LINT || defined lint
3815 memset (&t, 0, sizeof t);
3816 #endif
3817 t.type = type;
3818 return create_token_tree (dfa, left, right, &t);
3819 }
3820
3821 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3822 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3823 const re_token_t *token)
3824 {
3825 bin_tree_t *tree;
3826 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3827 {
3828 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829
3830 if (storage == NULL)
3831 return NULL;
3832 storage->next = dfa->str_tree_storage;
3833 dfa->str_tree_storage = storage;
3834 dfa->str_tree_storage_idx = 0;
3835 }
3836 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837
3838 tree->parent = NULL;
3839 tree->left = left;
3840 tree->right = right;
3841 tree->token = *token;
3842 tree->token.duplicated = 0;
3843 tree->token.opt_subexp = 0;
3844 tree->first = NULL;
3845 tree->next = NULL;
3846 tree->node_idx = -1;
3847
3848 if (left != NULL)
3849 left->parent = tree;
3850 if (right != NULL)
3851 right->parent = tree;
3852 return tree;
3853 }
3854
3855 /* Mark the tree SRC as an optional subexpression.
3856 To be called from preorder or postorder. */
3857
3858 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3859 mark_opt_subexp (void *extra, bin_tree_t *node)
3860 {
3861 Idx idx = (uintptr_t) extra;
3862 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863 node->token.opt_subexp = 1;
3864
3865 return REG_NOERROR;
3866 }
3867
3868 /* Free the allocated memory inside NODE. */
3869
3870 static void
free_token(re_token_t * node)3871 free_token (re_token_t *node)
3872 {
3873 #ifdef RE_ENABLE_I18N
3874 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3875 free_charset (node->opr.mbcset);
3876 else
3877 #endif /* RE_ENABLE_I18N */
3878 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3879 re_free (node->opr.sbcset);
3880 }
3881
3882 /* Worker function for tree walking. Free the allocated memory inside NODE
3883 and its children. */
3884
3885 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3886 free_tree (void *extra, bin_tree_t *node)
3887 {
3888 free_token (&node->token);
3889 return REG_NOERROR;
3890 }
3891
3892
3893 /* Duplicate the node SRC, and return new node. This is a preorder
3894 visit similar to the one implemented by the generic visitor, but
3895 we need more infrastructure to maintain two parallel trees --- so,
3896 it's easier to duplicate. */
3897
3898 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3899 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3900 {
3901 const bin_tree_t *node;
3902 bin_tree_t *dup_root;
3903 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3904
3905 for (node = root; ; )
3906 {
3907 /* Create a new tree and link it back to the current parent. */
3908 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909 if (*p_new == NULL)
3910 return NULL;
3911 (*p_new)->parent = dup_node;
3912 (*p_new)->token.duplicated = 1;
3913 dup_node = *p_new;
3914
3915 /* Go to the left node, or up and to the right. */
3916 if (node->left)
3917 {
3918 node = node->left;
3919 p_new = &dup_node->left;
3920 }
3921 else
3922 {
3923 const bin_tree_t *prev = NULL;
3924 while (node->right == prev || node->right == NULL)
3925 {
3926 prev = node;
3927 node = node->parent;
3928 dup_node = dup_node->parent;
3929 if (!node)
3930 return dup_root;
3931 }
3932 node = node->right;
3933 p_new = &dup_node->right;
3934 }
3935 }
3936 }
3937