xref: /haiku/src/system/libroot/posix/glibc/regex/regcomp.c (revision bda66ab7c7bfee38e845cea952a0a632613af99d)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23 
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 					  size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 				     const re_dfastate_t *init_state,
28 				     char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 			       reg_errcode_t (fn (void *, bin_tree_t *)),
41 			       void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 				reg_errcode_t (fn (void *, bin_tree_t *)),
44 				void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 				 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 				   unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 					 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 			 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 			reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 			  reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 				  re_token_t *token, reg_syntax_t syntax,
67 				  Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 				 re_token_t *token, reg_syntax_t syntax,
70 				 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 				     re_token_t *token, reg_syntax_t syntax,
73 				     Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 				  re_token_t *token, reg_syntax_t syntax,
76 				  Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 				 re_dfa_t *dfa, re_token_t *token,
79 				 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 				      re_token_t *token, reg_syntax_t syntax,
82 				      reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 					    re_string_t *regexp,
85 					    re_token_t *token, int token_len,
86 					    re_dfa_t *dfa,
87 					    reg_syntax_t syntax,
88 					    bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 					  re_string_t *regexp,
91 					  re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 					re_charset_t *mbcset,
95 					Idx *equiv_class_alloc,
96 					const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 				      bitset_t sbcset,
99 				      re_charset_t *mbcset,
100 				      Idx *char_class_alloc,
101 				      const char *class_name,
102 				      reg_syntax_t syntax);
103 #else  /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 					const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 				      bitset_t sbcset,
108 				      const char *class_name,
109 				      reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 				       RE_TRANSLATE_TYPE trans,
113 				       const char *class_name,
114 				       const char *extra,
115 				       bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 				bin_tree_t *left, bin_tree_t *right,
118 				re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 				      bin_tree_t *left, bin_tree_t *right,
121 				      const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126 
127 /* This table gives an error message for each of the error codes listed
128    in regex.h.  Obviously the order here has to be same as there.
129    POSIX doesn't require that we do anything for REG_NOERROR,
130    but why not be nice?  */
131 
132 static const char __re_error_msgid[] =
133   {
134 #define REG_NOERROR_IDX	0
135     gettext_noop ("Success")	/* REG_NOERROR */
136     "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138     gettext_noop ("No match")	/* REG_NOMATCH */
139     "\0"
140 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142     "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145     "\0"
146 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148     "\0"
149 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151     "\0"
152 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154     "\0"
155 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
156     gettext_noop ("Unmatched [, [^, [:, [., or [=")	/* REG_EBRACK */
157     "\0"
158 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160     "\0"
161 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163     "\0"
164 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166     "\0"
167 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168     gettext_noop ("Invalid range end")	/* REG_ERANGE */
169     "\0"
170 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
172     "\0"
173 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175     "\0"
176 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
178     "\0"
179 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
181     "\0"
182 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184   };
185 
186 static const size_t __re_error_msgid_idx[] =
187   {
188     REG_NOERROR_IDX,
189     REG_NOMATCH_IDX,
190     REG_BADPAT_IDX,
191     REG_ECOLLATE_IDX,
192     REG_ECTYPE_IDX,
193     REG_EESCAPE_IDX,
194     REG_ESUBREG_IDX,
195     REG_EBRACK_IDX,
196     REG_EPAREN_IDX,
197     REG_EBRACE_IDX,
198     REG_BADBR_IDX,
199     REG_ERANGE_IDX,
200     REG_ESPACE_IDX,
201     REG_BADRPT_IDX,
202     REG_EEND_IDX,
203     REG_ESIZE_IDX,
204     REG_ERPAREN_IDX
205   };
206 
207 /* Entry points for GNU code.  */
208 
209 /* re_compile_pattern is the GNU regular expression compiler: it
210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211    Returns 0 if the pattern was valid, otherwise an error string.
212 
213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214    are set in BUFP on entry.  */
215 
216 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)217 re_compile_pattern (const char *pattern, size_t length,
218 		    struct re_pattern_buffer *bufp)
219 {
220   reg_errcode_t ret;
221 
222   /* And GNU code determines whether or not to get register information
223      by passing null for the REGS argument to re_match, etc., not by
224      setting no_sub, unless RE_NO_SUB is set.  */
225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226 
227   /* Match anchors at newline.  */
228   bufp->newline_anchor = 1;
229 
230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231 
232   if (!ret)
233     return NULL;
234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
239 
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
241    also be assigned to arbitrarily: each pattern buffer stores its own
242    syntax, so it can be changed between regex compilations.  */
243 /* This has no initializer because initialized variables in Emacs
244    become read-only after dumping.  */
245 reg_syntax_t re_syntax_options;
246 
247 
248 /* Specify the precise syntax of regexps for compilation.  This provides
249    for compatibility for various utilities which historically have
250    different, incompatible syntaxes.
251 
252    The argument SYNTAX is a bit mask comprised of the various bits
253    defined in regex.h.  We return the old syntax.  */
254 
255 reg_syntax_t
256 re_set_syntax (reg_syntax_t syntax)
257 {
258   reg_syntax_t ret = re_syntax_options;
259 
260   re_syntax_options = syntax;
261   return ret;
262 }
263 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266 
267 int
268 re_compile_fastmap (struct re_pattern_buffer *bufp)
269 {
270   re_dfa_t *dfa = bufp->buffer;
271   char *fastmap = bufp->fastmap;
272 
273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275   if (dfa->init_state != dfa->init_state_word)
276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277   if (dfa->init_state != dfa->init_state_nl)
278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279   if (dfa->init_state != dfa->init_state_begbuf)
280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281   bufp->fastmap_accurate = 1;
282   return 0;
283 }
284 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
287 
288 static inline void
289 __attribute__ ((always_inline))
290 re_set_fastmap (char *fastmap, bool icase, int ch)
291 {
292   fastmap[ch] = 1;
293   if (icase)
294     fastmap[tolower (ch)] = 1;
295 }
296 
297 /* Helper function for re_compile_fastmap.
298    Compile fastmap for the initial_state INIT_STATE.  */
299 
300 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 			 char *fastmap)
303 {
304   re_dfa_t *dfa = bufp->buffer;
305   Idx node_cnt;
306   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308     {
309       Idx node = init_state->nodes.elems[node_cnt];
310       re_token_type_t type = dfa->nodes[node].type;
311 
312       if (type == CHARACTER)
313 	{
314 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317 	    {
318 	      unsigned char buf[MB_LEN_MAX];
319 	      unsigned char *p;
320 	      wchar_t wc;
321 	      mbstate_t state;
322 
323 	      p = buf;
324 	      *p++ = dfa->nodes[node].opr.c;
325 	      while (++node < dfa->nodes_len
326 		     &&	dfa->nodes[node].type == CHARACTER
327 		     && dfa->nodes[node].mb_partial)
328 		*p++ = dfa->nodes[node].opr.c;
329 	      memset (&state, '\0', sizeof (state));
330 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 			     &state) == p - buf
332 		  && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 		      != (size_t) -1))
334 		re_set_fastmap (fastmap, false, buf[0]);
335 	    }
336 #endif
337 	}
338       else if (type == SIMPLE_BRACKET)
339 	{
340 	  int i, ch;
341 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 	    {
343 	      int j;
344 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 		if (w & ((bitset_word_t) 1 << j))
347 		  re_set_fastmap (fastmap, icase, ch);
348 	    }
349 	}
350 #ifdef RE_ENABLE_I18N
351       else if (type == COMPLEX_BRACKET)
352 	{
353 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 	  Idx i;
355 
356 # ifdef _LIBC
357 	  /* See if we have to try all bytes which start multiple collation
358 	     elements.
359 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 		  collation element, and don't catch 'b' since 'b' is
361 		  the only collation element which starts from 'b' (and
362 		  it is caught by SIMPLE_BRACKET).  */
363 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 		  && (cset->ncoll_syms || cset->nranges))
365 		{
366 		  const int32_t *table = (const int32_t *)
367 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 		  for (i = 0; i < SBC_MAX; ++i)
369 		    if (table[i] < 0)
370 		      re_set_fastmap (fastmap, icase, i);
371 		}
372 # endif /* _LIBC */
373 
374 	  /* See if we have to start the match at all multibyte characters,
375 	     i.e. where we would not find an invalid sequence.  This only
376 	     applies to multibyte character sets; for single byte character
377 	     sets, the SIMPLE_BRACKET again suffices.  */
378 	  if (dfa->mb_cur_max > 1
379 	      && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 		  || cset->nequiv_classes
382 # endif /* _LIBC */
383 		 ))
384 	    {
385 	      unsigned char c = 0;
386 	      do
387 		{
388 		  mbstate_t mbs;
389 		  memset (&mbs, 0, sizeof (mbs));
390 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 		    re_set_fastmap (fastmap, false, (int) c);
392 		}
393 	      while (++c != 0);
394 	    }
395 
396 	  else
397 	    {
398 	      /* ... Else catch all bytes which can start the mbchars.  */
399 	      for (i = 0; i < cset->nmbchars; ++i)
400 		{
401 		  char buf[256];
402 		  mbstate_t state;
403 		  memset (&state, '\0', sizeof (state));
404 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 		    {
408 		      if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 			  != (size_t) -1)
410 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 		    }
412 		}
413 	    }
414 	}
415 #endif /* RE_ENABLE_I18N */
416       else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 	       || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 	       || type == END_OF_RE)
421 	{
422 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 	  if (type == END_OF_RE)
424 	    bufp->can_be_null = 1;
425 	  return;
426 	}
427     }
428 }
429 
430 /* Entry point for POSIX code.  */
431 /* regcomp takes a regular expression as a string and compiles it.
432 
433    PREG is a regex_t *.  We do not expect any fields to be initialized,
434    since POSIX says we shouldn't.  Thus, we set
435 
436      'buffer' to the compiled pattern;
437      'used' to the length of the compiled pattern;
438      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
440        RE_SYNTAX_POSIX_BASIC;
441      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442      'fastmap' to an allocated space for the fastmap;
443      'fastmap_accurate' to zero;
444      're_nsub' to the number of subexpressions in PATTERN.
445 
446    PATTERN is the address of the pattern string.
447 
448    CFLAGS is a series of bits which affect compilation.
449 
450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451      use POSIX basic syntax.
452 
453      If REG_NEWLINE is set, then . and [^...] don't match newline.
454      Also, regexec will try a match beginning after every newline.
455 
456      If REG_ICASE is set, then we considers upper- and lowercase
457      versions of letters to be equivalent when matching.
458 
459      If REG_NOSUB is set, then when PREG is passed to regexec, that
460      routine will report only success or failure, and nothing about the
461      registers.
462 
463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
464    the return codes and their meanings.)  */
465 
466 int
regcomp(regex_t * _Restrict_ preg,const char * _Restrict_ pattern,int cflags)467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
468 {
469   reg_errcode_t ret;
470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 			 : RE_SYNTAX_POSIX_BASIC);
472 
473   preg->buffer = NULL;
474   preg->allocated = 0;
475   preg->used = 0;
476 
477   /* Try to allocate space for the fastmap.  */
478   preg->fastmap = re_malloc (char, SBC_MAX);
479   if (BE (preg->fastmap == NULL, 0))
480     return REG_ESPACE;
481 
482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483 
484   /* If REG_NEWLINE is set, newlines are treated differently.  */
485   if (cflags & REG_NEWLINE)
486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
487       syntax &= ~RE_DOT_NEWLINE;
488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489       /* It also changes the matching behavior.  */
490       preg->newline_anchor = 1;
491     }
492   else
493     preg->newline_anchor = 0;
494   preg->no_sub = !!(cflags & REG_NOSUB);
495   preg->translate = NULL;
496 
497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498 
499   /* POSIX doesn't distinguish between an unmatched open-group and an
500      unmatched close-group: both are REG_EPAREN.  */
501   if (ret == REG_ERPAREN)
502     ret = REG_EPAREN;
503 
504   /* We have already checked preg->fastmap != NULL.  */
505   if (BE (ret == REG_NOERROR, 1))
506     /* Compute the fastmap now, since regexec cannot modify the pattern
507        buffer.  This function never fails in this implementation.  */
508     (void) re_compile_fastmap (preg);
509   else
510     {
511       /* Some error occurred while compiling the expression.  */
512       re_free (preg->fastmap);
513       preg->fastmap = NULL;
514     }
515 
516   return (int) ret;
517 }
518 #ifdef _LIBC
519 libc_hidden_def (__regcomp)
weak_alias(__regcomp,regcomp)520 weak_alias (__regcomp, regcomp)
521 #endif
522 
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524    from either regcomp or regexec.   We don't use PREG here.  */
525 
526 size_t
527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
528 	  size_t errbuf_size)
529 {
530   const char *msg;
531   size_t msg_size;
532 
533   if (BE (errcode < 0
534 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 			       / sizeof (__re_error_msgid_idx[0])), 0))
536     /* Only error codes returned by the rest of the code should be passed
537        to this routine.  If we are given anything else, or if other regex
538        code generates an invalid error code, then the program has a bug.
539        Dump core so we can fix it.  */
540     abort ();
541 
542   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543 
544   msg_size = strlen (msg) + 1; /* Includes the null.  */
545 
546   if (BE (errbuf_size != 0, 1))
547     {
548       size_t cpy_size = msg_size;
549       if (BE (msg_size > errbuf_size, 0))
550 	{
551 	  cpy_size = errbuf_size - 1;
552 	  errbuf[cpy_size] = '\0';
553 	}
554       memcpy (errbuf, msg, cpy_size);
555     }
556 
557   return msg_size;
558 }
559 #ifdef _LIBC
560 weak_alias (__regerror, regerror)
561 #endif
562 
563 
564 #ifdef RE_ENABLE_I18N
565 /* This static array is used for the map to single-byte characters when
566    UTF-8 is used.  Otherwise we would allocate memory just to initialize
567    it the same all the time.  UTF-8 is the preferred encoding so this is
568    a worthwhile optimization.  */
569 static const bitset_t utf8_sb_map =
570 {
571   /* Set the first 128 bits.  */
572 # if defined __GNUC__ && !defined __STRICT_ANSI__
573   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
574 # else
575 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
576 #   error "bitset_word_t is narrower than 32 bits"
577 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
578   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
579 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
580   BITSET_WORD_MAX, BITSET_WORD_MAX,
581 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
582   BITSET_WORD_MAX,
583 #  endif
584   (BITSET_WORD_MAX
585    >> (SBC_MAX % BITSET_WORD_BITS == 0
586        ? 0
587        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
588 # endif
589 };
590 #endif
591 
592 
593 static void
free_dfa_content(re_dfa_t * dfa)594 free_dfa_content (re_dfa_t *dfa)
595 {
596   Idx i, j;
597 
598   if (dfa->nodes)
599     for (i = 0; i < dfa->nodes_len; ++i)
600       free_token (dfa->nodes + i);
601   re_free (dfa->nexts);
602   for (i = 0; i < dfa->nodes_len; ++i)
603     {
604       if (dfa->eclosures != NULL)
605 	re_node_set_free (dfa->eclosures + i);
606       if (dfa->inveclosures != NULL)
607 	re_node_set_free (dfa->inveclosures + i);
608       if (dfa->edests != NULL)
609 	re_node_set_free (dfa->edests + i);
610     }
611   re_free (dfa->edests);
612   re_free (dfa->eclosures);
613   re_free (dfa->inveclosures);
614   re_free (dfa->nodes);
615 
616   if (dfa->state_table)
617     for (i = 0; i <= dfa->state_hash_mask; ++i)
618       {
619 	struct re_state_table_entry *entry = dfa->state_table + i;
620 	for (j = 0; j < entry->num; ++j)
621 	  {
622 	    re_dfastate_t *state = entry->array[j];
623 	    free_state (state);
624 	  }
625 	re_free (entry->array);
626       }
627   re_free (dfa->state_table);
628 #ifdef RE_ENABLE_I18N
629   if (dfa->sb_char != utf8_sb_map)
630     re_free (dfa->sb_char);
631 #endif
632   re_free (dfa->subexp_map);
633 #ifdef DEBUG
634   re_free (dfa->re_str);
635 #endif
636 
637   re_free (dfa);
638 }
639 
640 
641 /* Free dynamically allocated space used by PREG.  */
642 
643 void
regfree(regex_t * preg)644 regfree (regex_t *preg)
645 {
646   re_dfa_t *dfa = preg->buffer;
647   if (BE (dfa != NULL, 1))
648     {
649       lock_fini (dfa->lock);
650       free_dfa_content (dfa);
651     }
652   preg->buffer = NULL;
653   preg->allocated = 0;
654 
655   re_free (preg->fastmap);
656   preg->fastmap = NULL;
657 
658   re_free (preg->translate);
659   preg->translate = NULL;
660 }
661 #ifdef _LIBC
662 libc_hidden_def (__regfree)
663 weak_alias (__regfree, regfree)
664 #endif
665 
666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
667    them unless specifically requested.  */
668 
669 #if defined _REGEX_RE_COMP || defined _LIBC
670 
671 /* BSD has one and only one pattern buffer.  */
672 static struct re_pattern_buffer re_comp_buf;
673 
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677    these names if they don't use our functions, and still use
678    regcomp/regexec above without link errors.  */
679 weak_function
680 # endif
re_comp(const char * s)681 re_comp (const char *s)
682 {
683   reg_errcode_t ret;
684   char *fastmap;
685 
686   if (!s)
687     {
688       if (!re_comp_buf.buffer)
689 	return gettext ("No previous regular expression");
690       return 0;
691     }
692 
693   if (re_comp_buf.buffer)
694     {
695       fastmap = re_comp_buf.fastmap;
696       re_comp_buf.fastmap = NULL;
697       __regfree (&re_comp_buf);
698       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
699       re_comp_buf.fastmap = fastmap;
700     }
701 
702   if (re_comp_buf.fastmap == NULL)
703     {
704       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
705       if (re_comp_buf.fastmap == NULL)
706 	return (char *) gettext (__re_error_msgid
707 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
708     }
709 
710   /* Since 're_exec' always passes NULL for the 'regs' argument, we
711      don't need to initialize the pattern buffer fields which affect it.  */
712 
713   /* Match anchors at newlines.  */
714   re_comp_buf.newline_anchor = 1;
715 
716   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
717 
718   if (!ret)
719     return NULL;
720 
721   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
722   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
723 }
724 
725 #ifdef _LIBC
libc_freeres_fn(free_mem)726 libc_freeres_fn (free_mem)
727 {
728   __regfree (&re_comp_buf);
729 }
730 #endif
731 
732 #endif /* _REGEX_RE_COMP */
733 
734 /* Internal entry point.
735    Compile the regular expression PATTERN, whose length is LENGTH.
736    SYNTAX indicate regular expression's syntax.  */
737 
738 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)739 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
740 		     reg_syntax_t syntax)
741 {
742   reg_errcode_t err = REG_NOERROR;
743   re_dfa_t *dfa;
744   re_string_t regexp;
745 
746   /* Initialize the pattern buffer.  */
747   preg->fastmap_accurate = 0;
748   preg->syntax = syntax;
749   preg->not_bol = preg->not_eol = 0;
750   preg->used = 0;
751   preg->re_nsub = 0;
752   preg->can_be_null = 0;
753   preg->regs_allocated = REGS_UNALLOCATED;
754 
755   /* Initialize the dfa.  */
756   dfa = preg->buffer;
757   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
758     {
759       /* If zero allocated, but buffer is non-null, try to realloc
760 	 enough space.  This loses if buffer's address is bogus, but
761 	 that is the user's responsibility.  If ->buffer is NULL this
762 	 is a simple allocation.  */
763       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
764       if (dfa == NULL)
765 	return REG_ESPACE;
766       preg->allocated = sizeof (re_dfa_t);
767       preg->buffer = dfa;
768     }
769   preg->used = sizeof (re_dfa_t);
770 
771   err = init_dfa (dfa, length);
772   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
773     err = REG_ESPACE;
774   if (BE (err != REG_NOERROR, 0))
775     {
776       free_dfa_content (dfa);
777       preg->buffer = NULL;
778       preg->allocated = 0;
779       return err;
780     }
781 #ifdef DEBUG
782   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
783   dfa->re_str = re_malloc (char, length + 1);
784   strncpy (dfa->re_str, pattern, length + 1);
785 #endif
786 
787   err = re_string_construct (&regexp, pattern, length, preg->translate,
788 			     (syntax & RE_ICASE) != 0, dfa);
789   if (BE (err != REG_NOERROR, 0))
790     {
791     re_compile_internal_free_return:
792       free_workarea_compile (preg);
793       re_string_destruct (&regexp);
794       lock_fini (dfa->lock);
795       free_dfa_content (dfa);
796       preg->buffer = NULL;
797       preg->allocated = 0;
798       return err;
799     }
800 
801   /* Parse the regular expression, and build a structure tree.  */
802   preg->re_nsub = 0;
803   dfa->str_tree = parse (&regexp, preg, syntax, &err);
804   if (BE (dfa->str_tree == NULL, 0))
805     goto re_compile_internal_free_return;
806 
807   /* Analyze the tree and create the nfa.  */
808   err = analyze (preg);
809   if (BE (err != REG_NOERROR, 0))
810     goto re_compile_internal_free_return;
811 
812 #ifdef RE_ENABLE_I18N
813   /* If possible, do searching in single byte encoding to speed things up.  */
814   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815     optimize_utf8 (dfa);
816 #endif
817 
818   /* Then create the initial state of the dfa.  */
819   err = create_initial_state (dfa);
820 
821   /* Release work areas.  */
822   free_workarea_compile (preg);
823   re_string_destruct (&regexp);
824 
825   if (BE (err != REG_NOERROR, 0))
826     {
827       lock_fini (dfa->lock);
828       free_dfa_content (dfa);
829       preg->buffer = NULL;
830       preg->allocated = 0;
831     }
832 
833   return err;
834 }
835 
836 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
837    as the initial length of some arrays.  */
838 
839 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)840 init_dfa (re_dfa_t *dfa, size_t pat_len)
841 {
842   __re_size_t table_size;
843 #ifndef _LIBC
844   const char *codeset_name;
845 #endif
846 #ifdef RE_ENABLE_I18N
847   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
848 #else
849   size_t max_i18n_object_size = 0;
850 #endif
851   size_t max_object_size =
852     MAX (sizeof (struct re_state_table_entry),
853 	 MAX (sizeof (re_token_t),
854 	      MAX (sizeof (re_node_set),
855 		   MAX (sizeof (regmatch_t),
856 			max_i18n_object_size))));
857 
858   memset (dfa, '\0', sizeof (re_dfa_t));
859 
860   /* Force allocation of str_tree_storage the first time.  */
861   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
862 
863   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
864      calculation below, and for similar doubling calculations
865      elsewhere.  And it's <= rather than <, because some of the
866      doubling calculations add 1 afterwards.  */
867   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
868     return REG_ESPACE;
869 
870   dfa->nodes_alloc = pat_len + 1;
871   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
872 
873   /*  table_size = 2 ^ ceil(log pat_len) */
874   for (table_size = 1; ; table_size <<= 1)
875     if (table_size > pat_len)
876       break;
877 
878   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
879   dfa->state_hash_mask = table_size - 1;
880 
881   dfa->mb_cur_max = MB_CUR_MAX;
882 #ifdef _LIBC
883   if (dfa->mb_cur_max == 6
884       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
885     dfa->is_utf8 = 1;
886   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
887 		       != 0);
888 #else
889   codeset_name = nl_langinfo (CODESET);
890   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
891       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
892       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
893       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
894     dfa->is_utf8 = 1;
895 
896   /* We check exhaustively in the loop below if this charset is a
897      superset of ASCII.  */
898   dfa->map_notascii = 0;
899 #endif
900 
901 #ifdef RE_ENABLE_I18N
902   if (dfa->mb_cur_max > 1)
903     {
904       if (dfa->is_utf8)
905 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
906       else
907 	{
908 	  int i, j, ch;
909 
910 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
911 	  if (BE (dfa->sb_char == NULL, 0))
912 	    return REG_ESPACE;
913 
914 	  /* Set the bits corresponding to single byte chars.  */
915 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
916 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
917 	      {
918 		wint_t wch = __btowc (ch);
919 		if (wch != WEOF)
920 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
921 # ifndef _LIBC
922 		if (isascii (ch) && wch != ch)
923 		  dfa->map_notascii = 1;
924 # endif
925 	      }
926 	}
927     }
928 #endif
929 
930   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
931     return REG_ESPACE;
932   return REG_NOERROR;
933 }
934 
935 /* Initialize WORD_CHAR table, which indicate which character is
936    "word".  In this case "word" means that it is the word construction
937    character used by some operators like "\<", "\>", etc.  */
938 
939 static void
init_word_char(re_dfa_t * dfa)940 init_word_char (re_dfa_t *dfa)
941 {
942   int i = 0;
943   int j;
944   int ch = 0;
945   dfa->word_ops_used = 1;
946   if (BE (dfa->map_notascii == 0, 1))
947     {
948       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
949 	 them, an issue when this code is used in Gnulib.  */
950       bitset_word_t bits0 = 0x00000000;
951       bitset_word_t bits1 = 0x03ff0000;
952       bitset_word_t bits2 = 0x87fffffe;
953       bitset_word_t bits3 = 0x07fffffe;
954       if (BITSET_WORD_BITS == 64)
955 	{
956 	  /* Pacify gcc -Woverflow on 32-bit platformns.  */
957 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
958 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
959 	  i = 2;
960 	}
961       else if (BITSET_WORD_BITS == 32)
962 	{
963 	  dfa->word_char[0] = bits0;
964 	  dfa->word_char[1] = bits1;
965 	  dfa->word_char[2] = bits2;
966 	  dfa->word_char[3] = bits3;
967 	  i = 4;
968 	}
969       else
970         goto general_case;
971       ch = 128;
972 
973       if (BE (dfa->is_utf8, 1))
974 	{
975 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
976 	  return;
977 	}
978     }
979 
980  general_case:
981   for (; i < BITSET_WORDS; ++i)
982     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
983       if (isalnum (ch) || ch == '_')
984 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
985 }
986 
987 /* Free the work area which are only used while compiling.  */
988 
989 static void
free_workarea_compile(regex_t * preg)990 free_workarea_compile (regex_t *preg)
991 {
992   re_dfa_t *dfa = preg->buffer;
993   bin_tree_storage_t *storage, *next;
994   for (storage = dfa->str_tree_storage; storage; storage = next)
995     {
996       next = storage->next;
997       re_free (storage);
998     }
999   dfa->str_tree_storage = NULL;
1000   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1001   dfa->str_tree = NULL;
1002   re_free (dfa->org_indices);
1003   dfa->org_indices = NULL;
1004 }
1005 
1006 /* Create initial states for all contexts.  */
1007 
1008 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)1009 create_initial_state (re_dfa_t *dfa)
1010 {
1011   Idx first, i;
1012   reg_errcode_t err;
1013   re_node_set init_nodes;
1014 
1015   /* Initial states have the epsilon closure of the node which is
1016      the first node of the regular expression.  */
1017   first = dfa->str_tree->first->node_idx;
1018   dfa->init_node = first;
1019   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1020   if (BE (err != REG_NOERROR, 0))
1021     return err;
1022 
1023   /* The back-references which are in initial states can epsilon transit,
1024      since in this case all of the subexpressions can be null.
1025      Then we add epsilon closures of the nodes which are the next nodes of
1026      the back-references.  */
1027   if (dfa->nbackref > 0)
1028     for (i = 0; i < init_nodes.nelem; ++i)
1029       {
1030 	Idx node_idx = init_nodes.elems[i];
1031 	re_token_type_t type = dfa->nodes[node_idx].type;
1032 
1033 	Idx clexp_idx;
1034 	if (type != OP_BACK_REF)
1035 	  continue;
1036 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1037 	  {
1038 	    re_token_t *clexp_node;
1039 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1040 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1041 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1042 	      break;
1043 	  }
1044 	if (clexp_idx == init_nodes.nelem)
1045 	  continue;
1046 
1047 	if (type == OP_BACK_REF)
1048 	  {
1049 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1050 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1051 	      {
1052 		reg_errcode_t merge_err
1053                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1054 		if (merge_err != REG_NOERROR)
1055 		  return merge_err;
1056 		i = 0;
1057 	      }
1058 	  }
1059       }
1060 
1061   /* It must be the first time to invoke acquire_state.  */
1062   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1063   /* We don't check ERR here, since the initial state must not be NULL.  */
1064   if (BE (dfa->init_state == NULL, 0))
1065     return err;
1066   if (dfa->init_state->has_constraint)
1067     {
1068       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1069 						       CONTEXT_WORD);
1070       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1071 						     CONTEXT_NEWLINE);
1072       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1073 							 &init_nodes,
1074 							 CONTEXT_NEWLINE
1075 							 | CONTEXT_BEGBUF);
1076       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1077 	      || dfa->init_state_begbuf == NULL, 0))
1078 	return err;
1079     }
1080   else
1081     dfa->init_state_word = dfa->init_state_nl
1082       = dfa->init_state_begbuf = dfa->init_state;
1083 
1084   re_node_set_free (&init_nodes);
1085   return REG_NOERROR;
1086 }
1087 
1088 #ifdef RE_ENABLE_I18N
1089 /* If it is possible to do searching in single byte encoding instead of UTF-8
1090    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1091    DFA nodes where needed.  */
1092 
1093 static void
optimize_utf8(re_dfa_t * dfa)1094 optimize_utf8 (re_dfa_t *dfa)
1095 {
1096   Idx node;
1097   int i;
1098   bool mb_chars = false;
1099   bool has_period = false;
1100 
1101   for (node = 0; node < dfa->nodes_len; ++node)
1102     switch (dfa->nodes[node].type)
1103       {
1104       case CHARACTER:
1105 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1106 	  mb_chars = true;
1107 	break;
1108       case ANCHOR:
1109 	switch (dfa->nodes[node].opr.ctx_type)
1110 	  {
1111 	  case LINE_FIRST:
1112 	  case LINE_LAST:
1113 	  case BUF_FIRST:
1114 	  case BUF_LAST:
1115 	    break;
1116 	  default:
1117 	    /* Word anchors etc. cannot be handled.  It's okay to test
1118 	       opr.ctx_type since constraints (for all DFA nodes) are
1119 	       created by ORing one or more opr.ctx_type values.  */
1120 	    return;
1121 	  }
1122 	break;
1123       case OP_PERIOD:
1124 	has_period = true;
1125 	break;
1126       case OP_BACK_REF:
1127       case OP_ALT:
1128       case END_OF_RE:
1129       case OP_DUP_ASTERISK:
1130       case OP_OPEN_SUBEXP:
1131       case OP_CLOSE_SUBEXP:
1132 	break;
1133       case COMPLEX_BRACKET:
1134 	return;
1135       case SIMPLE_BRACKET:
1136 	/* Just double check.  */
1137 	{
1138 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1139 			? 0
1140 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1141 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1142 	    {
1143 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1144 		return;
1145 	      rshift = 0;
1146 	    }
1147 	}
1148 	break;
1149       default:
1150 	abort ();
1151       }
1152 
1153   if (mb_chars || has_period)
1154     for (node = 0; node < dfa->nodes_len; ++node)
1155       {
1156 	if (dfa->nodes[node].type == CHARACTER
1157 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1158 	  dfa->nodes[node].mb_partial = 0;
1159 	else if (dfa->nodes[node].type == OP_PERIOD)
1160 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1161       }
1162 
1163   /* The search can be in single byte locale.  */
1164   dfa->mb_cur_max = 1;
1165   dfa->is_utf8 = 0;
1166   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1167 }
1168 #endif
1169 
1170 /* Analyze the structure tree, and calculate "first", "next", "edest",
1171    "eclosure", and "inveclosure".  */
1172 
1173 static reg_errcode_t
analyze(regex_t * preg)1174 analyze (regex_t *preg)
1175 {
1176   re_dfa_t *dfa = preg->buffer;
1177   reg_errcode_t ret;
1178 
1179   /* Allocate arrays.  */
1180   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1181   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1182   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1183   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1184   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1185 	  || dfa->eclosures == NULL, 0))
1186     return REG_ESPACE;
1187 
1188   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1189   if (dfa->subexp_map != NULL)
1190     {
1191       Idx i;
1192       for (i = 0; i < preg->re_nsub; i++)
1193 	dfa->subexp_map[i] = i;
1194       preorder (dfa->str_tree, optimize_subexps, dfa);
1195       for (i = 0; i < preg->re_nsub; i++)
1196 	if (dfa->subexp_map[i] != i)
1197 	  break;
1198       if (i == preg->re_nsub)
1199 	{
1200 	  re_free (dfa->subexp_map);
1201 	  dfa->subexp_map = NULL;
1202 	}
1203     }
1204 
1205   ret = postorder (dfa->str_tree, lower_subexps, preg);
1206   if (BE (ret != REG_NOERROR, 0))
1207     return ret;
1208   ret = postorder (dfa->str_tree, calc_first, dfa);
1209   if (BE (ret != REG_NOERROR, 0))
1210     return ret;
1211   preorder (dfa->str_tree, calc_next, dfa);
1212   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1213   if (BE (ret != REG_NOERROR, 0))
1214     return ret;
1215   ret = calc_eclosure (dfa);
1216   if (BE (ret != REG_NOERROR, 0))
1217     return ret;
1218 
1219   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1220      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1221   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1222       || dfa->nbackref)
1223     {
1224       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1225       if (BE (dfa->inveclosures == NULL, 0))
1226 	return REG_ESPACE;
1227       ret = calc_inveclosure (dfa);
1228     }
1229 
1230   return ret;
1231 }
1232 
1233 /* Our parse trees are very unbalanced, so we cannot use a stack to
1234    implement parse tree visits.  Instead, we use parent pointers and
1235    some hairy code in these two functions.  */
1236 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1237 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238 	   void *extra)
1239 {
1240   bin_tree_t *node, *prev;
1241 
1242   for (node = root; ; )
1243     {
1244       /* Descend down the tree, preferably to the left (or to the right
1245 	 if that's the only child).  */
1246       while (node->left || node->right)
1247 	if (node->left)
1248 	  node = node->left;
1249 	else
1250 	  node = node->right;
1251 
1252       do
1253 	{
1254 	  reg_errcode_t err = fn (extra, node);
1255 	  if (BE (err != REG_NOERROR, 0))
1256 	    return err;
1257 	  if (node->parent == NULL)
1258 	    return REG_NOERROR;
1259 	  prev = node;
1260 	  node = node->parent;
1261 	}
1262       /* Go up while we have a node that is reached from the right.  */
1263       while (node->right == prev || node->right == NULL);
1264       node = node->right;
1265     }
1266 }
1267 
1268 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1269 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1270 	  void *extra)
1271 {
1272   bin_tree_t *node;
1273 
1274   for (node = root; ; )
1275     {
1276       reg_errcode_t err = fn (extra, node);
1277       if (BE (err != REG_NOERROR, 0))
1278 	return err;
1279 
1280       /* Go to the left node, or up and to the right.  */
1281       if (node->left)
1282 	node = node->left;
1283       else
1284 	{
1285 	  bin_tree_t *prev = NULL;
1286 	  while (node->right == prev || node->right == NULL)
1287 	    {
1288 	      prev = node;
1289 	      node = node->parent;
1290 	      if (!node)
1291 		return REG_NOERROR;
1292 	    }
1293 	  node = node->right;
1294 	}
1295     }
1296 }
1297 
1298 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1299    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1300    backreferences as well.  Requires a preorder visit.  */
1301 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1302 optimize_subexps (void *extra, bin_tree_t *node)
1303 {
1304   re_dfa_t *dfa = (re_dfa_t *) extra;
1305 
1306   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1307     {
1308       int idx = node->token.opr.idx;
1309       node->token.opr.idx = dfa->subexp_map[idx];
1310       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1311     }
1312 
1313   else if (node->token.type == SUBEXP
1314 	   && node->left && node->left->token.type == SUBEXP)
1315     {
1316       Idx other_idx = node->left->token.opr.idx;
1317 
1318       node->left = node->left->left;
1319       if (node->left)
1320 	node->left->parent = node;
1321 
1322       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1323       if (other_idx < BITSET_WORD_BITS)
1324 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1325     }
1326 
1327   return REG_NOERROR;
1328 }
1329 
1330 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1331    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1332 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1333 lower_subexps (void *extra, bin_tree_t *node)
1334 {
1335   regex_t *preg = (regex_t *) extra;
1336   reg_errcode_t err = REG_NOERROR;
1337 
1338   if (node->left && node->left->token.type == SUBEXP)
1339     {
1340       node->left = lower_subexp (&err, preg, node->left);
1341       if (node->left)
1342 	node->left->parent = node;
1343     }
1344   if (node->right && node->right->token.type == SUBEXP)
1345     {
1346       node->right = lower_subexp (&err, preg, node->right);
1347       if (node->right)
1348 	node->right->parent = node;
1349     }
1350 
1351   return err;
1352 }
1353 
1354 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1355 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1356 {
1357   re_dfa_t *dfa = preg->buffer;
1358   bin_tree_t *body = node->left;
1359   bin_tree_t *op, *cls, *tree1, *tree;
1360 
1361   if (preg->no_sub
1362       /* We do not optimize empty subexpressions, because otherwise we may
1363 	 have bad CONCAT nodes with NULL children.  This is obviously not
1364 	 very common, so we do not lose much.  An example that triggers
1365 	 this case is the sed "script" /\(\)/x.  */
1366       && node->left != NULL
1367       && (node->token.opr.idx >= BITSET_WORD_BITS
1368 	  || !(dfa->used_bkref_map
1369 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1370     return node->left;
1371 
1372   /* Convert the SUBEXP node to the concatenation of an
1373      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1374   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1375   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1376   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1377   tree = create_tree (dfa, op, tree1, CONCAT);
1378   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1379     {
1380       *err = REG_ESPACE;
1381       return NULL;
1382     }
1383 
1384   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1385   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1386   return tree;
1387 }
1388 
1389 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1390    nodes.  Requires a postorder visit.  */
1391 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1392 calc_first (void *extra, bin_tree_t *node)
1393 {
1394   re_dfa_t *dfa = (re_dfa_t *) extra;
1395   if (node->token.type == CONCAT)
1396     {
1397       node->first = node->left->first;
1398       node->node_idx = node->left->node_idx;
1399     }
1400   else
1401     {
1402       node->first = node;
1403       node->node_idx = re_dfa_add_node (dfa, node->token);
1404       if (BE (node->node_idx == -1, 0))
1405 	return REG_ESPACE;
1406       if (node->token.type == ANCHOR)
1407 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1408     }
1409   return REG_NOERROR;
1410 }
1411 
1412 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1413 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1414 calc_next (void *extra, bin_tree_t *node)
1415 {
1416   switch (node->token.type)
1417     {
1418     case OP_DUP_ASTERISK:
1419       node->left->next = node;
1420       break;
1421     case CONCAT:
1422       node->left->next = node->right->first;
1423       node->right->next = node->next;
1424       break;
1425     default:
1426       if (node->left)
1427 	node->left->next = node->next;
1428       if (node->right)
1429 	node->right->next = node->next;
1430       break;
1431     }
1432   return REG_NOERROR;
1433 }
1434 
1435 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1436 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1437 link_nfa_nodes (void *extra, bin_tree_t *node)
1438 {
1439   re_dfa_t *dfa = (re_dfa_t *) extra;
1440   Idx idx = node->node_idx;
1441   reg_errcode_t err = REG_NOERROR;
1442 
1443   switch (node->token.type)
1444     {
1445     case CONCAT:
1446       break;
1447 
1448     case END_OF_RE:
1449       assert (node->next == NULL);
1450       break;
1451 
1452     case OP_DUP_ASTERISK:
1453     case OP_ALT:
1454       {
1455 	Idx left, right;
1456 	dfa->has_plural_match = 1;
1457 	if (node->left != NULL)
1458 	  left = node->left->first->node_idx;
1459 	else
1460 	  left = node->next->node_idx;
1461 	if (node->right != NULL)
1462 	  right = node->right->first->node_idx;
1463 	else
1464 	  right = node->next->node_idx;
1465 	assert (left > -1);
1466 	assert (right > -1);
1467 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1468       }
1469       break;
1470 
1471     case ANCHOR:
1472     case OP_OPEN_SUBEXP:
1473     case OP_CLOSE_SUBEXP:
1474       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1475       break;
1476 
1477     case OP_BACK_REF:
1478       dfa->nexts[idx] = node->next->node_idx;
1479       if (node->token.type == OP_BACK_REF)
1480 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1481       break;
1482 
1483     default:
1484       assert (!IS_EPSILON_NODE (node->token.type));
1485       dfa->nexts[idx] = node->next->node_idx;
1486       break;
1487     }
1488 
1489   return err;
1490 }
1491 
1492 /* Duplicate the epsilon closure of the node ROOT_NODE.
1493    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1494    to their own constraint.  */
1495 
1496 static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1497 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1498 			Idx root_node, unsigned int init_constraint)
1499 {
1500   Idx org_node, clone_node;
1501   bool ok;
1502   unsigned int constraint = init_constraint;
1503   for (org_node = top_org_node, clone_node = top_clone_node;;)
1504     {
1505       Idx org_dest, clone_dest;
1506       if (dfa->nodes[org_node].type == OP_BACK_REF)
1507 	{
1508 	  /* If the back reference epsilon-transit, its destination must
1509 	     also have the constraint.  Then duplicate the epsilon closure
1510 	     of the destination of the back reference, and store it in
1511 	     edests of the back reference.  */
1512 	  org_dest = dfa->nexts[org_node];
1513 	  re_node_set_empty (dfa->edests + clone_node);
1514 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1515 	  if (BE (clone_dest == -1, 0))
1516 	    return REG_ESPACE;
1517 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 	  if (BE (! ok, 0))
1520 	    return REG_ESPACE;
1521 	}
1522       else if (dfa->edests[org_node].nelem == 0)
1523 	{
1524 	  /* In case of the node can't epsilon-transit, don't duplicate the
1525 	     destination and store the original destination as the
1526 	     destination of the node.  */
1527 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1528 	  break;
1529 	}
1530       else if (dfa->edests[org_node].nelem == 1)
1531 	{
1532 	  /* In case of the node can epsilon-transit, and it has only one
1533 	     destination.  */
1534 	  org_dest = dfa->edests[org_node].elems[0];
1535 	  re_node_set_empty (dfa->edests + clone_node);
1536 	  /* If the node is root_node itself, it means the epsilon closure
1537 	     has a loop.  Then tie it to the destination of the root_node.  */
1538 	  if (org_node == root_node && clone_node != org_node)
1539 	    {
1540 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1541 	      if (BE (! ok, 0))
1542 	        return REG_ESPACE;
1543 	      break;
1544 	    }
1545 	  /* In case the node has another constraint, append it.  */
1546 	  constraint |= dfa->nodes[org_node].constraint;
1547 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1548 	  if (BE (clone_dest == -1, 0))
1549 	    return REG_ESPACE;
1550 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1551 	  if (BE (! ok, 0))
1552 	    return REG_ESPACE;
1553 	}
1554       else /* dfa->edests[org_node].nelem == 2 */
1555 	{
1556 	  /* In case of the node can epsilon-transit, and it has two
1557 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1558 	  org_dest = dfa->edests[org_node].elems[0];
1559 	  re_node_set_empty (dfa->edests + clone_node);
1560 	  /* Search for a duplicated node which satisfies the constraint.  */
1561 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1562 	  if (clone_dest == -1)
1563 	    {
1564 	      /* There is no such duplicated node, create a new one.  */
1565 	      reg_errcode_t err;
1566 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1567 	      if (BE (clone_dest == -1, 0))
1568 		return REG_ESPACE;
1569 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570 	      if (BE (! ok, 0))
1571 		return REG_ESPACE;
1572 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1573 					    root_node, constraint);
1574 	      if (BE (err != REG_NOERROR, 0))
1575 		return err;
1576 	    }
1577 	  else
1578 	    {
1579 	      /* There is a duplicated node which satisfies the constraint,
1580 		 use it to avoid infinite loop.  */
1581 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 	      if (BE (! ok, 0))
1583 		return REG_ESPACE;
1584 	    }
1585 
1586 	  org_dest = dfa->edests[org_node].elems[1];
1587 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1588 	  if (BE (clone_dest == -1, 0))
1589 	    return REG_ESPACE;
1590 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 	  if (BE (! ok, 0))
1592 	    return REG_ESPACE;
1593 	}
1594       org_node = org_dest;
1595       clone_node = clone_dest;
1596     }
1597   return REG_NOERROR;
1598 }
1599 
1600 /* Search for a node which is duplicated from the node ORG_NODE, and
1601    satisfies the constraint CONSTRAINT.  */
1602 
1603 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1604 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1605 			unsigned int constraint)
1606 {
1607   Idx idx;
1608   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1609     {
1610       if (org_node == dfa->org_indices[idx]
1611 	  && constraint == dfa->nodes[idx].constraint)
1612 	return idx; /* Found.  */
1613     }
1614   return -1; /* Not found.  */
1615 }
1616 
1617 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1618    Return the index of the new node, or -1 if insufficient storage is
1619    available.  */
1620 
1621 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1622 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1623 {
1624   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1625   if (BE (dup_idx != -1, 1))
1626     {
1627       dfa->nodes[dup_idx].constraint = constraint;
1628       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1629       dfa->nodes[dup_idx].duplicated = 1;
1630 
1631       /* Store the index of the original node.  */
1632       dfa->org_indices[dup_idx] = org_idx;
1633     }
1634   return dup_idx;
1635 }
1636 
1637 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1638 calc_inveclosure (re_dfa_t *dfa)
1639 {
1640   Idx src, idx;
1641   bool ok;
1642   for (idx = 0; idx < dfa->nodes_len; ++idx)
1643     re_node_set_init_empty (dfa->inveclosures + idx);
1644 
1645   for (src = 0; src < dfa->nodes_len; ++src)
1646     {
1647       Idx *elems = dfa->eclosures[src].elems;
1648       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1649 	{
1650 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1651 	  if (BE (! ok, 0))
1652 	    return REG_ESPACE;
1653 	}
1654     }
1655 
1656   return REG_NOERROR;
1657 }
1658 
1659 /* Calculate "eclosure" for all the node in DFA.  */
1660 
1661 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1662 calc_eclosure (re_dfa_t *dfa)
1663 {
1664   Idx node_idx;
1665   bool incomplete;
1666 #ifdef DEBUG
1667   assert (dfa->nodes_len > 0);
1668 #endif
1669   incomplete = false;
1670   /* For each nodes, calculate epsilon closure.  */
1671   for (node_idx = 0; ; ++node_idx)
1672     {
1673       reg_errcode_t err;
1674       re_node_set eclosure_elem;
1675       if (node_idx == dfa->nodes_len)
1676 	{
1677 	  if (!incomplete)
1678 	    break;
1679 	  incomplete = false;
1680 	  node_idx = 0;
1681 	}
1682 
1683 #ifdef DEBUG
1684       assert (dfa->eclosures[node_idx].nelem != -1);
1685 #endif
1686 
1687       /* If we have already calculated, skip it.  */
1688       if (dfa->eclosures[node_idx].nelem != 0)
1689 	continue;
1690       /* Calculate epsilon closure of 'node_idx'.  */
1691       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1692       if (BE (err != REG_NOERROR, 0))
1693 	return err;
1694 
1695       if (dfa->eclosures[node_idx].nelem == 0)
1696 	{
1697 	  incomplete = true;
1698 	  re_node_set_free (&eclosure_elem);
1699 	}
1700     }
1701   return REG_NOERROR;
1702 }
1703 
1704 /* Calculate epsilon closure of NODE.  */
1705 
1706 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1707 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1708 {
1709   reg_errcode_t err;
1710   Idx i;
1711   re_node_set eclosure;
1712   bool ok;
1713   bool incomplete = false;
1714   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1715   if (BE (err != REG_NOERROR, 0))
1716     return err;
1717 
1718   /* This indicates that we are calculating this node now.
1719      We reference this value to avoid infinite loop.  */
1720   dfa->eclosures[node].nelem = -1;
1721 
1722   /* If the current node has constraints, duplicate all nodes
1723      since they must inherit the constraints.  */
1724   if (dfa->nodes[node].constraint
1725       && dfa->edests[node].nelem
1726       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1727     {
1728       err = duplicate_node_closure (dfa, node, node, node,
1729 				    dfa->nodes[node].constraint);
1730       if (BE (err != REG_NOERROR, 0))
1731 	return err;
1732     }
1733 
1734   /* Expand each epsilon destination nodes.  */
1735   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1736     for (i = 0; i < dfa->edests[node].nelem; ++i)
1737       {
1738 	re_node_set eclosure_elem;
1739 	Idx edest = dfa->edests[node].elems[i];
1740 	/* If calculating the epsilon closure of 'edest' is in progress,
1741 	   return intermediate result.  */
1742 	if (dfa->eclosures[edest].nelem == -1)
1743 	  {
1744 	    incomplete = true;
1745 	    continue;
1746 	  }
1747 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1748 	   calculate now. Otherwise use calculated epsilon closure.  */
1749 	if (dfa->eclosures[edest].nelem == 0)
1750 	  {
1751 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1752 	    if (BE (err != REG_NOERROR, 0))
1753 	      return err;
1754 	  }
1755 	else
1756 	  eclosure_elem = dfa->eclosures[edest];
1757 	/* Merge the epsilon closure of 'edest'.  */
1758 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1759 	if (BE (err != REG_NOERROR, 0))
1760 	  return err;
1761 	/* If the epsilon closure of 'edest' is incomplete,
1762 	   the epsilon closure of this node is also incomplete.  */
1763 	if (dfa->eclosures[edest].nelem == 0)
1764 	  {
1765 	    incomplete = true;
1766 	    re_node_set_free (&eclosure_elem);
1767 	  }
1768       }
1769 
1770   /* An epsilon closure includes itself.  */
1771   ok = re_node_set_insert (&eclosure, node);
1772   if (BE (! ok, 0))
1773     return REG_ESPACE;
1774   if (incomplete && !root)
1775     dfa->eclosures[node].nelem = 0;
1776   else
1777     dfa->eclosures[node] = eclosure;
1778   *new_set = eclosure;
1779   return REG_NOERROR;
1780 }
1781 
1782 /* Functions for token which are used in the parser.  */
1783 
1784 /* Fetch a token from INPUT.
1785    We must not use this function inside bracket expressions.  */
1786 
1787 static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1788 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1789 {
1790   re_string_skip_bytes (input, peek_token (result, input, syntax));
1791 }
1792 
1793 /* Peek a token from INPUT, and return the length of the token.
1794    We must not use this function inside bracket expressions.  */
1795 
1796 static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1797 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1798 {
1799   unsigned char c;
1800 
1801   if (re_string_eoi (input))
1802     {
1803       token->type = END_OF_RE;
1804       return 0;
1805     }
1806 
1807   c = re_string_peek_byte (input, 0);
1808   token->opr.c = c;
1809 
1810   token->word_char = 0;
1811 #ifdef RE_ENABLE_I18N
1812   token->mb_partial = 0;
1813   if (input->mb_cur_max > 1 &&
1814       !re_string_first_byte (input, re_string_cur_idx (input)))
1815     {
1816       token->type = CHARACTER;
1817       token->mb_partial = 1;
1818       return 1;
1819     }
1820 #endif
1821   if (c == '\\')
1822     {
1823       unsigned char c2;
1824       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1825 	{
1826 	  token->type = BACK_SLASH;
1827 	  return 1;
1828 	}
1829 
1830       c2 = re_string_peek_byte_case (input, 1);
1831       token->opr.c = c2;
1832       token->type = CHARACTER;
1833 #ifdef RE_ENABLE_I18N
1834       if (input->mb_cur_max > 1)
1835 	{
1836 	  wint_t wc = re_string_wchar_at (input,
1837 					  re_string_cur_idx (input) + 1);
1838 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1839 	}
1840       else
1841 #endif
1842 	token->word_char = IS_WORD_CHAR (c2) != 0;
1843 
1844       switch (c2)
1845 	{
1846 	case '|':
1847 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1848 	    token->type = OP_ALT;
1849 	  break;
1850 	case '1': case '2': case '3': case '4': case '5':
1851 	case '6': case '7': case '8': case '9':
1852 	  if (!(syntax & RE_NO_BK_REFS))
1853 	    {
1854 	      token->type = OP_BACK_REF;
1855 	      token->opr.idx = c2 - '1';
1856 	    }
1857 	  break;
1858 	case '<':
1859 	  if (!(syntax & RE_NO_GNU_OPS))
1860 	    {
1861 	      token->type = ANCHOR;
1862 	      token->opr.ctx_type = WORD_FIRST;
1863 	    }
1864 	  break;
1865 	case '>':
1866 	  if (!(syntax & RE_NO_GNU_OPS))
1867 	    {
1868 	      token->type = ANCHOR;
1869 	      token->opr.ctx_type = WORD_LAST;
1870 	    }
1871 	  break;
1872 	case 'b':
1873 	  if (!(syntax & RE_NO_GNU_OPS))
1874 	    {
1875 	      token->type = ANCHOR;
1876 	      token->opr.ctx_type = WORD_DELIM;
1877 	    }
1878 	  break;
1879 	case 'B':
1880 	  if (!(syntax & RE_NO_GNU_OPS))
1881 	    {
1882 	      token->type = ANCHOR;
1883 	      token->opr.ctx_type = NOT_WORD_DELIM;
1884 	    }
1885 	  break;
1886 	case 'w':
1887 	  if (!(syntax & RE_NO_GNU_OPS))
1888 	    token->type = OP_WORD;
1889 	  break;
1890 	case 'W':
1891 	  if (!(syntax & RE_NO_GNU_OPS))
1892 	    token->type = OP_NOTWORD;
1893 	  break;
1894 	case 's':
1895 	  if (!(syntax & RE_NO_GNU_OPS))
1896 	    token->type = OP_SPACE;
1897 	  break;
1898 	case 'S':
1899 	  if (!(syntax & RE_NO_GNU_OPS))
1900 	    token->type = OP_NOTSPACE;
1901 	  break;
1902 	case '`':
1903 	  if (!(syntax & RE_NO_GNU_OPS))
1904 	    {
1905 	      token->type = ANCHOR;
1906 	      token->opr.ctx_type = BUF_FIRST;
1907 	    }
1908 	  break;
1909 	case '\'':
1910 	  if (!(syntax & RE_NO_GNU_OPS))
1911 	    {
1912 	      token->type = ANCHOR;
1913 	      token->opr.ctx_type = BUF_LAST;
1914 	    }
1915 	  break;
1916 	case '(':
1917 	  if (!(syntax & RE_NO_BK_PARENS))
1918 	    token->type = OP_OPEN_SUBEXP;
1919 	  break;
1920 	case ')':
1921 	  if (!(syntax & RE_NO_BK_PARENS))
1922 	    token->type = OP_CLOSE_SUBEXP;
1923 	  break;
1924 	case '+':
1925 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1926 	    token->type = OP_DUP_PLUS;
1927 	  break;
1928 	case '?':
1929 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1930 	    token->type = OP_DUP_QUESTION;
1931 	  break;
1932 	case '{':
1933 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1934 	    token->type = OP_OPEN_DUP_NUM;
1935 	  break;
1936 	case '}':
1937 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1938 	    token->type = OP_CLOSE_DUP_NUM;
1939 	  break;
1940 	default:
1941 	  break;
1942 	}
1943       return 2;
1944     }
1945 
1946   token->type = CHARACTER;
1947 #ifdef RE_ENABLE_I18N
1948   if (input->mb_cur_max > 1)
1949     {
1950       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1951       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1952     }
1953   else
1954 #endif
1955     token->word_char = IS_WORD_CHAR (token->opr.c);
1956 
1957   switch (c)
1958     {
1959     case '\n':
1960       if (syntax & RE_NEWLINE_ALT)
1961 	token->type = OP_ALT;
1962       break;
1963     case '|':
1964       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1965 	token->type = OP_ALT;
1966       break;
1967     case '*':
1968       token->type = OP_DUP_ASTERISK;
1969       break;
1970     case '+':
1971       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1972 	token->type = OP_DUP_PLUS;
1973       break;
1974     case '?':
1975       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1976 	token->type = OP_DUP_QUESTION;
1977       break;
1978     case '{':
1979       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1980 	token->type = OP_OPEN_DUP_NUM;
1981       break;
1982     case '}':
1983       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1984 	token->type = OP_CLOSE_DUP_NUM;
1985       break;
1986     case '(':
1987       if (syntax & RE_NO_BK_PARENS)
1988 	token->type = OP_OPEN_SUBEXP;
1989       break;
1990     case ')':
1991       if (syntax & RE_NO_BK_PARENS)
1992 	token->type = OP_CLOSE_SUBEXP;
1993       break;
1994     case '[':
1995       token->type = OP_OPEN_BRACKET;
1996       break;
1997     case '.':
1998       token->type = OP_PERIOD;
1999       break;
2000     case '^':
2001       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2002 	  re_string_cur_idx (input) != 0)
2003 	{
2004 	  char prev = re_string_peek_byte (input, -1);
2005 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2006 	    break;
2007 	}
2008       token->type = ANCHOR;
2009       token->opr.ctx_type = LINE_FIRST;
2010       break;
2011     case '$':
2012       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2013 	  re_string_cur_idx (input) + 1 != re_string_length (input))
2014 	{
2015 	  re_token_t next;
2016 	  re_string_skip_bytes (input, 1);
2017 	  peek_token (&next, input, syntax);
2018 	  re_string_skip_bytes (input, -1);
2019 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2020 	    break;
2021 	}
2022       token->type = ANCHOR;
2023       token->opr.ctx_type = LINE_LAST;
2024       break;
2025     default:
2026       break;
2027     }
2028   return 1;
2029 }
2030 
2031 /* Peek a token from INPUT, and return the length of the token.
2032    We must not use this function out of bracket expressions.  */
2033 
2034 static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2035 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2036 {
2037   unsigned char c;
2038   if (re_string_eoi (input))
2039     {
2040       token->type = END_OF_RE;
2041       return 0;
2042     }
2043   c = re_string_peek_byte (input, 0);
2044   token->opr.c = c;
2045 
2046 #ifdef RE_ENABLE_I18N
2047   if (input->mb_cur_max > 1 &&
2048       !re_string_first_byte (input, re_string_cur_idx (input)))
2049     {
2050       token->type = CHARACTER;
2051       return 1;
2052     }
2053 #endif /* RE_ENABLE_I18N */
2054 
2055   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056       && re_string_cur_idx (input) + 1 < re_string_length (input))
2057     {
2058       /* In this case, '\' escape a character.  */
2059       unsigned char c2;
2060       re_string_skip_bytes (input, 1);
2061       c2 = re_string_peek_byte (input, 0);
2062       token->opr.c = c2;
2063       token->type = CHARACTER;
2064       return 1;
2065     }
2066   if (c == '[') /* '[' is a special char in a bracket exps.  */
2067     {
2068       unsigned char c2;
2069       int token_len;
2070       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071 	c2 = re_string_peek_byte (input, 1);
2072       else
2073 	c2 = 0;
2074       token->opr.c = c2;
2075       token_len = 2;
2076       switch (c2)
2077 	{
2078 	case '.':
2079 	  token->type = OP_OPEN_COLL_ELEM;
2080 	  break;
2081 
2082 	case '=':
2083 	  token->type = OP_OPEN_EQUIV_CLASS;
2084 	  break;
2085 
2086 	case ':':
2087 	  if (syntax & RE_CHAR_CLASSES)
2088 	    {
2089 	      token->type = OP_OPEN_CHAR_CLASS;
2090 	      break;
2091 	    }
2092 	  FALLTHROUGH;
2093 	default:
2094 	  token->type = CHARACTER;
2095 	  token->opr.c = c;
2096 	  token_len = 1;
2097 	  break;
2098 	}
2099       return token_len;
2100     }
2101   switch (c)
2102     {
2103     case '-':
2104       token->type = OP_CHARSET_RANGE;
2105       break;
2106     case ']':
2107       token->type = OP_CLOSE_BRACKET;
2108       break;
2109     case '^':
2110       token->type = OP_NON_MATCH_LIST;
2111       break;
2112     default:
2113       token->type = CHARACTER;
2114     }
2115   return 1;
2116 }
2117 
2118 /* Functions for parser.  */
2119 
2120 /* Entry point of the parser.
2121    Parse the regular expression REGEXP and return the structure tree.
2122    If an error occurs, ERR is set by error code, and return NULL.
2123    This function build the following tree, from regular expression <reg_exp>:
2124 	   CAT
2125 	   / \
2126 	  /   \
2127    <reg_exp>  EOR
2128 
2129    CAT means concatenation.
2130    EOR means end of regular expression.  */
2131 
2132 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2133 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2134        reg_errcode_t *err)
2135 {
2136   re_dfa_t *dfa = preg->buffer;
2137   bin_tree_t *tree, *eor, *root;
2138   re_token_t current_token;
2139   dfa->syntax = syntax;
2140   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2141   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2142   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143     return NULL;
2144   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2145   if (tree != NULL)
2146     root = create_tree (dfa, tree, eor, CONCAT);
2147   else
2148     root = eor;
2149   if (BE (eor == NULL || root == NULL, 0))
2150     {
2151       *err = REG_ESPACE;
2152       return NULL;
2153     }
2154   return root;
2155 }
2156 
2157 /* This function build the following tree, from regular expression
2158    <branch1>|<branch2>:
2159 	   ALT
2160 	   / \
2161 	  /   \
2162    <branch1> <branch2>
2163 
2164    ALT means alternative, which represents the operator '|'.  */
2165 
2166 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2167 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2168 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2169 {
2170   re_dfa_t *dfa = preg->buffer;
2171   bin_tree_t *tree, *branch = NULL;
2172   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2173   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2174   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2175     return NULL;
2176 
2177   while (token->type == OP_ALT)
2178     {
2179       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2180       if (token->type != OP_ALT && token->type != END_OF_RE
2181 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2182 	{
2183 	  bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2184 	  dfa->completed_bkref_map = initial_bkref_map;
2185 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2186 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2187 	    {
2188 	      if (tree != NULL)
2189 		postorder (tree, free_tree, NULL);
2190 	      return NULL;
2191 	    }
2192 	  dfa->completed_bkref_map |= accumulated_bkref_map;
2193 	}
2194       else
2195 	branch = NULL;
2196       tree = create_tree (dfa, tree, branch, OP_ALT);
2197       if (BE (tree == NULL, 0))
2198 	{
2199 	  *err = REG_ESPACE;
2200 	  return NULL;
2201 	}
2202     }
2203   return tree;
2204 }
2205 
2206 /* This function build the following tree, from regular expression
2207    <exp1><exp2>:
2208 	CAT
2209 	/ \
2210        /   \
2211    <exp1> <exp2>
2212 
2213    CAT means concatenation.  */
2214 
2215 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2216 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2217 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2218 {
2219   bin_tree_t *tree, *expr;
2220   re_dfa_t *dfa = preg->buffer;
2221   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2222   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223     return NULL;
2224 
2225   while (token->type != OP_ALT && token->type != END_OF_RE
2226 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2227     {
2228       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2229       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2230 	{
2231 	  if (tree != NULL)
2232 	    postorder (tree, free_tree, NULL);
2233 	  return NULL;
2234 	}
2235       if (tree != NULL && expr != NULL)
2236 	{
2237 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2238 	  if (newtree == NULL)
2239 	    {
2240 	      postorder (expr, free_tree, NULL);
2241 	      postorder (tree, free_tree, NULL);
2242 	      *err = REG_ESPACE;
2243 	      return NULL;
2244 	    }
2245 	  tree = newtree;
2246 	}
2247       else if (tree == NULL)
2248 	tree = expr;
2249       /* Otherwise expr == NULL, we don't need to create new tree.  */
2250     }
2251   return tree;
2252 }
2253 
2254 /* This function build the following tree, from regular expression a*:
2255 	 *
2256 	 |
2257 	 a
2258 */
2259 
2260 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2261 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2262 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2263 {
2264   re_dfa_t *dfa = preg->buffer;
2265   bin_tree_t *tree;
2266   switch (token->type)
2267     {
2268     case CHARACTER:
2269       tree = create_token_tree (dfa, NULL, NULL, token);
2270       if (BE (tree == NULL, 0))
2271 	{
2272 	  *err = REG_ESPACE;
2273 	  return NULL;
2274 	}
2275 #ifdef RE_ENABLE_I18N
2276       if (dfa->mb_cur_max > 1)
2277 	{
2278 	  while (!re_string_eoi (regexp)
2279 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2280 	    {
2281 	      bin_tree_t *mbc_remain;
2282 	      fetch_token (token, regexp, syntax);
2283 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2284 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2285 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2286 		{
2287 		  *err = REG_ESPACE;
2288 		  return NULL;
2289 		}
2290 	    }
2291 	}
2292 #endif
2293       break;
2294 
2295     case OP_OPEN_SUBEXP:
2296       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2297       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2298 	return NULL;
2299       break;
2300 
2301     case OP_OPEN_BRACKET:
2302       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304 	return NULL;
2305       break;
2306 
2307     case OP_BACK_REF:
2308       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2309 	{
2310 	  *err = REG_ESUBREG;
2311 	  return NULL;
2312 	}
2313       dfa->used_bkref_map |= 1 << token->opr.idx;
2314       tree = create_token_tree (dfa, NULL, NULL, token);
2315       if (BE (tree == NULL, 0))
2316 	{
2317 	  *err = REG_ESPACE;
2318 	  return NULL;
2319 	}
2320       ++dfa->nbackref;
2321       dfa->has_mb_node = 1;
2322       break;
2323 
2324     case OP_OPEN_DUP_NUM:
2325       if (syntax & RE_CONTEXT_INVALID_DUP)
2326 	{
2327 	  *err = REG_BADRPT;
2328 	  return NULL;
2329 	}
2330       FALLTHROUGH;
2331     case OP_DUP_ASTERISK:
2332     case OP_DUP_PLUS:
2333     case OP_DUP_QUESTION:
2334       if (syntax & RE_CONTEXT_INVALID_OPS)
2335 	{
2336 	  *err = REG_BADRPT;
2337 	  return NULL;
2338 	}
2339       else if (syntax & RE_CONTEXT_INDEP_OPS)
2340 	{
2341 	  fetch_token (token, regexp, syntax);
2342 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2343 	}
2344       FALLTHROUGH;
2345     case OP_CLOSE_SUBEXP:
2346       if ((token->type == OP_CLOSE_SUBEXP) &&
2347 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348 	{
2349 	  *err = REG_ERPAREN;
2350 	  return NULL;
2351 	}
2352       FALLTHROUGH;
2353     case OP_CLOSE_DUP_NUM:
2354       /* We treat it as a normal character.  */
2355 
2356       /* Then we can these characters as normal characters.  */
2357       token->type = CHARACTER;
2358       /* mb_partial and word_char bits should be initialized already
2359 	 by peek_token.  */
2360       tree = create_token_tree (dfa, NULL, NULL, token);
2361       if (BE (tree == NULL, 0))
2362 	{
2363 	  *err = REG_ESPACE;
2364 	  return NULL;
2365 	}
2366       break;
2367 
2368     case ANCHOR:
2369       if ((token->opr.ctx_type
2370 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371 	  && dfa->word_ops_used == 0)
2372 	init_word_char (dfa);
2373       if (token->opr.ctx_type == WORD_DELIM
2374 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2375 	{
2376 	  bin_tree_t *tree_first, *tree_last;
2377 	  if (token->opr.ctx_type == WORD_DELIM)
2378 	    {
2379 	      token->opr.ctx_type = WORD_FIRST;
2380 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 	      token->opr.ctx_type = WORD_LAST;
2382 	    }
2383 	  else
2384 	    {
2385 	      token->opr.ctx_type = INSIDE_WORD;
2386 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2387 	      token->opr.ctx_type = INSIDE_NOTWORD;
2388 	    }
2389 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392 	    {
2393 	      *err = REG_ESPACE;
2394 	      return NULL;
2395 	    }
2396 	}
2397       else
2398 	{
2399 	  tree = create_token_tree (dfa, NULL, NULL, token);
2400 	  if (BE (tree == NULL, 0))
2401 	    {
2402 	      *err = REG_ESPACE;
2403 	      return NULL;
2404 	    }
2405 	}
2406       /* We must return here, since ANCHORs can't be followed
2407 	 by repetition operators.
2408 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2410       fetch_token (token, regexp, syntax);
2411       return tree;
2412 
2413     case OP_PERIOD:
2414       tree = create_token_tree (dfa, NULL, NULL, token);
2415       if (BE (tree == NULL, 0))
2416 	{
2417 	  *err = REG_ESPACE;
2418 	  return NULL;
2419 	}
2420       if (dfa->mb_cur_max > 1)
2421 	dfa->has_mb_node = 1;
2422       break;
2423 
2424     case OP_WORD:
2425     case OP_NOTWORD:
2426       tree = build_charclass_op (dfa, regexp->trans,
2427 				 "alnum",
2428 				 "_",
2429 				 token->type == OP_NOTWORD, err);
2430       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 	return NULL;
2432       break;
2433 
2434     case OP_SPACE:
2435     case OP_NOTSPACE:
2436       tree = build_charclass_op (dfa, regexp->trans,
2437 				 "space",
2438 				 "",
2439 				 token->type == OP_NOTSPACE, err);
2440       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2441 	return NULL;
2442       break;
2443 
2444     case OP_ALT:
2445     case END_OF_RE:
2446       return NULL;
2447 
2448     case BACK_SLASH:
2449       *err = REG_EESCAPE;
2450       return NULL;
2451 
2452     default:
2453       /* Must not happen?  */
2454 #ifdef DEBUG
2455       assert (0);
2456 #endif
2457       return NULL;
2458     }
2459   fetch_token (token, regexp, syntax);
2460 
2461   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2462 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2463     {
2464       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2465 					   syntax, err);
2466       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2467 	{
2468 	  if (tree != NULL)
2469 	    postorder (tree, free_tree, NULL);
2470 	  return NULL;
2471 	}
2472       tree = dup_tree;
2473       /* In BRE consecutive duplications are not allowed.  */
2474       if ((syntax & RE_CONTEXT_INVALID_DUP)
2475 	  && (token->type == OP_DUP_ASTERISK
2476 	      || token->type == OP_OPEN_DUP_NUM))
2477 	{
2478 	  if (tree != NULL)
2479 	    postorder (tree, free_tree, NULL);
2480 	  *err = REG_BADRPT;
2481 	  return NULL;
2482 	}
2483     }
2484 
2485   return tree;
2486 }
2487 
2488 /* This function build the following tree, from regular expression
2489    (<reg_exp>):
2490 	 SUBEXP
2491 	    |
2492 	<reg_exp>
2493 */
2494 
2495 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2496 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2497 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2498 {
2499   re_dfa_t *dfa = preg->buffer;
2500   bin_tree_t *tree;
2501   size_t cur_nsub;
2502   cur_nsub = preg->re_nsub++;
2503 
2504   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2505 
2506   /* The subexpression may be a null string.  */
2507   if (token->type == OP_CLOSE_SUBEXP)
2508     tree = NULL;
2509   else
2510     {
2511       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2512       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2513 	{
2514 	  if (tree != NULL)
2515 	    postorder (tree, free_tree, NULL);
2516 	  *err = REG_EPAREN;
2517 	}
2518       if (BE (*err != REG_NOERROR, 0))
2519 	return NULL;
2520     }
2521 
2522   if (cur_nsub <= '9' - '1')
2523     dfa->completed_bkref_map |= 1 << cur_nsub;
2524 
2525   tree = create_tree (dfa, tree, NULL, SUBEXP);
2526   if (BE (tree == NULL, 0))
2527     {
2528       *err = REG_ESPACE;
2529       return NULL;
2530     }
2531   tree->token.opr.idx = cur_nsub;
2532   return tree;
2533 }
2534 
2535 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2536 
2537 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2538 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2539 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2540 {
2541   bin_tree_t *tree = NULL, *old_tree = NULL;
2542   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2543   re_token_t start_token = *token;
2544 
2545   if (token->type == OP_OPEN_DUP_NUM)
2546     {
2547       end = 0;
2548       start = fetch_number (regexp, token, syntax);
2549       if (start == -1)
2550 	{
2551 	  if (token->type == CHARACTER && token->opr.c == ',')
2552 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2553 	  else
2554 	    {
2555 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2556 	      return NULL;
2557 	    }
2558 	}
2559       if (BE (start != -2, 1))
2560 	{
2561 	  /* We treat "{n}" as "{n,n}".  */
2562 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2563 		 : ((token->type == CHARACTER && token->opr.c == ',')
2564 		    ? fetch_number (regexp, token, syntax) : -2));
2565 	}
2566       if (BE (start == -2 || end == -2, 0))
2567 	{
2568 	  /* Invalid sequence.  */
2569 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2570 	    {
2571 	      if (token->type == END_OF_RE)
2572 		*err = REG_EBRACE;
2573 	      else
2574 		*err = REG_BADBR;
2575 
2576 	      return NULL;
2577 	    }
2578 
2579 	  /* If the syntax bit is set, rollback.  */
2580 	  re_string_set_index (regexp, start_idx);
2581 	  *token = start_token;
2582 	  token->type = CHARACTER;
2583 	  /* mb_partial and word_char bits should be already initialized by
2584 	     peek_token.  */
2585 	  return elem;
2586 	}
2587 
2588       if (BE ((end != -1 && start > end)
2589 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2590 	{
2591 	  /* First number greater than second.  */
2592 	  *err = REG_BADBR;
2593 	  return NULL;
2594 	}
2595 
2596       if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2597 	{
2598 	  *err = REG_ESIZE;
2599 	  return NULL;
2600 	}
2601     }
2602   else
2603     {
2604       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2606     }
2607 
2608   fetch_token (token, regexp, syntax);
2609 
2610   if (BE (elem == NULL, 0))
2611     return NULL;
2612   if (BE (start == 0 && end == 0, 0))
2613     {
2614       postorder (elem, free_tree, NULL);
2615       return NULL;
2616     }
2617 
2618   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2619   if (BE (start > 0, 0))
2620     {
2621       tree = elem;
2622       for (i = 2; i <= start; ++i)
2623 	{
2624 	  elem = duplicate_tree (elem, dfa);
2625 	  tree = create_tree (dfa, tree, elem, CONCAT);
2626 	  if (BE (elem == NULL || tree == NULL, 0))
2627 	    goto parse_dup_op_espace;
2628 	}
2629 
2630       if (start == end)
2631 	return tree;
2632 
2633       /* Duplicate ELEM before it is marked optional.  */
2634       elem = duplicate_tree (elem, dfa);
2635       if (BE (elem == NULL, 0))
2636         goto parse_dup_op_espace;
2637       old_tree = tree;
2638     }
2639   else
2640     old_tree = NULL;
2641 
2642   if (elem->token.type == SUBEXP)
2643     {
2644       uintptr_t subidx = elem->token.opr.idx;
2645       postorder (elem, mark_opt_subexp, (void *) subidx);
2646     }
2647 
2648   tree = create_tree (dfa, elem, NULL,
2649 		      (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2650   if (BE (tree == NULL, 0))
2651     goto parse_dup_op_espace;
2652 
2653   /* This loop is actually executed only when end != -1,
2654      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2655      already created the start+1-th copy.  */
2656   if (TYPE_SIGNED (Idx) || end != -1)
2657     for (i = start + 2; i <= end; ++i)
2658       {
2659 	elem = duplicate_tree (elem, dfa);
2660 	tree = create_tree (dfa, tree, elem, CONCAT);
2661 	if (BE (elem == NULL || tree == NULL, 0))
2662 	  goto parse_dup_op_espace;
2663 
2664 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2665 	if (BE (tree == NULL, 0))
2666 	  goto parse_dup_op_espace;
2667       }
2668 
2669   if (old_tree)
2670     tree = create_tree (dfa, old_tree, tree, CONCAT);
2671 
2672   return tree;
2673 
2674  parse_dup_op_espace:
2675   *err = REG_ESPACE;
2676   return NULL;
2677 }
2678 
2679 /* Size of the names for collating symbol/equivalence_class/character_class.
2680    I'm not sure, but maybe enough.  */
2681 #define BRACKET_NAME_BUF_SIZE 32
2682 
2683 #ifndef _LIBC
2684 
2685 # ifdef RE_ENABLE_I18N
2686 /* Convert the byte B to the corresponding wide character.  In a
2687    unibyte locale, treat B as itself if it is an encoding error.
2688    In a multibyte locale, return WEOF if B is an encoding error.  */
2689 static wint_t
parse_byte(unsigned char b,re_charset_t * mbcset)2690 parse_byte (unsigned char b, re_charset_t *mbcset)
2691 {
2692   wint_t wc = __btowc (b);
2693   return wc == WEOF && !mbcset ? b : wc;
2694 }
2695 #endif
2696 
2697   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2698      Build the range expression which starts from START_ELEM, and ends
2699      at END_ELEM.  The result are written to MBCSET and SBCSET.
2700      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2701      mbcset->range_ends, is a pointer argument since we may
2702      update it.  */
2703 
2704 static reg_errcode_t
2705 # ifdef RE_ENABLE_I18N
build_range_exp(const reg_syntax_t syntax,bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,const bracket_elem_t * start_elem,const bracket_elem_t * end_elem)2706 build_range_exp (const reg_syntax_t syntax,
2707                  bitset_t sbcset,
2708                  re_charset_t *mbcset,
2709                  Idx *range_alloc,
2710                  const bracket_elem_t *start_elem,
2711                  const bracket_elem_t *end_elem)
2712 # else /* not RE_ENABLE_I18N */
2713 build_range_exp (const reg_syntax_t syntax,
2714                  bitset_t sbcset,
2715                  const bracket_elem_t *start_elem,
2716                  const bracket_elem_t *end_elem)
2717 # endif /* not RE_ENABLE_I18N */
2718 {
2719   unsigned int start_ch, end_ch;
2720   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2721   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2722 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2723 	  0))
2724     return REG_ERANGE;
2725 
2726   /* We can handle no multi character collating elements without libc
2727      support.  */
2728   if (BE ((start_elem->type == COLL_SYM
2729 	   && strlen ((char *) start_elem->opr.name) > 1)
2730 	  || (end_elem->type == COLL_SYM
2731 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2732     return REG_ECOLLATE;
2733 
2734 # ifdef RE_ENABLE_I18N
2735   {
2736     wchar_t wc;
2737     wint_t start_wc;
2738     wint_t end_wc;
2739 
2740     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2741 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2742 		   : 0));
2743     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2744 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2745 		 : 0));
2746     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2747 		? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2748     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2749 	      ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2750     if (start_wc == WEOF || end_wc == WEOF)
2751       return REG_ECOLLATE;
2752     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2753       return REG_ERANGE;
2754 
2755     /* Got valid collation sequence values, add them as a new entry.
2756        However, for !_LIBC we have no collation elements: if the
2757        character set is single byte, the single byte character set
2758        that we build below suffices.  parse_bracket_exp passes
2759        no MBCSET if dfa->mb_cur_max == 1.  */
2760     if (mbcset)
2761       {
2762 	/* Check the space of the arrays.  */
2763 	if (BE (*range_alloc == mbcset->nranges, 0))
2764 	  {
2765 	    /* There is not enough space, need realloc.  */
2766 	    wchar_t *new_array_start, *new_array_end;
2767 	    Idx new_nranges;
2768 
2769 	    /* +1 in case of mbcset->nranges is 0.  */
2770 	    new_nranges = 2 * mbcset->nranges + 1;
2771 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2772 	       are NULL if *range_alloc == 0.  */
2773 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2774 					  new_nranges);
2775 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2776 					new_nranges);
2777 
2778 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2779 	      {
2780 		re_free (new_array_start);
2781 		re_free (new_array_end);
2782 		return REG_ESPACE;
2783 	      }
2784 
2785 	    mbcset->range_starts = new_array_start;
2786 	    mbcset->range_ends = new_array_end;
2787 	    *range_alloc = new_nranges;
2788 	  }
2789 
2790 	mbcset->range_starts[mbcset->nranges] = start_wc;
2791 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2792       }
2793 
2794     /* Build the table for single byte characters.  */
2795     for (wc = 0; wc < SBC_MAX; ++wc)
2796       {
2797 	if (start_wc <= wc && wc <= end_wc)
2798 	  bitset_set (sbcset, wc);
2799       }
2800   }
2801 # else /* not RE_ENABLE_I18N */
2802   {
2803     unsigned int ch;
2804     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2805 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2806 		   : 0));
2807     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2808 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2809 		 : 0));
2810     if (start_ch > end_ch)
2811       return REG_ERANGE;
2812     /* Build the table for single byte characters.  */
2813     for (ch = 0; ch < SBC_MAX; ++ch)
2814       if (start_ch <= ch  && ch <= end_ch)
2815 	bitset_set (sbcset, ch);
2816   }
2817 # endif /* not RE_ENABLE_I18N */
2818   return REG_NOERROR;
2819 }
2820 #endif /* not _LIBC */
2821 
2822 #ifndef _LIBC
2823 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2824    Build the collating element which is represented by NAME.
2825    The result are written to MBCSET and SBCSET.
2826    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2827    pointer argument since we may update it.  */
2828 
2829 static reg_errcode_t
2830 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2831 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2832 			Idx *coll_sym_alloc, const unsigned char *name)
2833 # else /* not RE_ENABLE_I18N */
2834 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2835 # endif /* not RE_ENABLE_I18N */
2836 {
2837   size_t name_len = strlen ((const char *) name);
2838   if (BE (name_len != 1, 0))
2839     return REG_ECOLLATE;
2840   else
2841     {
2842       bitset_set (sbcset, name[0]);
2843       return REG_NOERROR;
2844     }
2845 }
2846 #endif /* not _LIBC */
2847 
2848 /* This function parse bracket expression like "[abc]", "[a-c]",
2849    "[[.a-a.]]" etc.  */
2850 
2851 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2852 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2853 		   reg_syntax_t syntax, reg_errcode_t *err)
2854 {
2855 #ifdef _LIBC
2856   const unsigned char *collseqmb;
2857   const char *collseqwc;
2858   uint32_t nrules;
2859   int32_t table_size;
2860   const int32_t *symb_table;
2861   const unsigned char *extra;
2862 
2863   /* Local function for parse_bracket_exp used in _LIBC environment.
2864      Seek the collating symbol entry corresponding to NAME.
2865      Return the index of the symbol in the SYMB_TABLE,
2866      or -1 if not found.  */
2867 
2868   auto inline int32_t
2869   __attribute__ ((always_inline))
2870   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2871     {
2872       int32_t elem;
2873 
2874       for (elem = 0; elem < table_size; elem++)
2875 	if (symb_table[2 * elem] != 0)
2876 	  {
2877 	    int32_t idx = symb_table[2 * elem + 1];
2878 	    /* Skip the name of collating element name.  */
2879 	    idx += 1 + extra[idx];
2880 	    if (/* Compare the length of the name.  */
2881 		name_len == extra[idx]
2882 		/* Compare the name.  */
2883 		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2884 	      /* Yep, this is the entry.  */
2885 	      return elem;
2886 	  }
2887       return -1;
2888     }
2889 
2890   /* Local function for parse_bracket_exp used in _LIBC environment.
2891      Look up the collation sequence value of BR_ELEM.
2892      Return the value if succeeded, UINT_MAX otherwise.  */
2893 
2894   auto inline unsigned int
2895   __attribute__ ((always_inline))
2896   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2897     {
2898       if (br_elem->type == SB_CHAR)
2899 	{
2900 	  /*
2901 	  if (MB_CUR_MAX == 1)
2902 	  */
2903 	  if (nrules == 0)
2904 	    return collseqmb[br_elem->opr.ch];
2905 	  else
2906 	    {
2907 	      wint_t wc = __btowc (br_elem->opr.ch);
2908 	      return __collseq_table_lookup (collseqwc, wc);
2909 	    }
2910 	}
2911       else if (br_elem->type == MB_CHAR)
2912 	{
2913 	  if (nrules != 0)
2914 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2915 	}
2916       else if (br_elem->type == COLL_SYM)
2917 	{
2918 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2919 	  if (nrules != 0)
2920 	    {
2921 	      int32_t elem, idx;
2922 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2923 						  sym_name_len);
2924 	      if (elem != -1)
2925 		{
2926 		  /* We found the entry.  */
2927 		  idx = symb_table[2 * elem + 1];
2928 		  /* Skip the name of collating element name.  */
2929 		  idx += 1 + extra[idx];
2930 		  /* Skip the byte sequence of the collating element.  */
2931 		  idx += 1 + extra[idx];
2932 		  /* Adjust for the alignment.  */
2933 		  idx = (idx + 3) & ~3;
2934 		  /* Skip the multibyte collation sequence value.  */
2935 		  idx += sizeof (unsigned int);
2936 		  /* Skip the wide char sequence of the collating element.  */
2937 		  idx += sizeof (unsigned int) *
2938 		    (1 + *(unsigned int *) (extra + idx));
2939 		  /* Return the collation sequence value.  */
2940 		  return *(unsigned int *) (extra + idx);
2941 		}
2942 	      else if (sym_name_len == 1)
2943 		{
2944 		  /* No valid character.  Match it as a single byte
2945 		     character.  */
2946 		  return collseqmb[br_elem->opr.name[0]];
2947 		}
2948 	    }
2949 	  else if (sym_name_len == 1)
2950 	    return collseqmb[br_elem->opr.name[0]];
2951 	}
2952       return UINT_MAX;
2953     }
2954 
2955   /* Local function for parse_bracket_exp used in _LIBC environment.
2956      Build the range expression which starts from START_ELEM, and ends
2957      at END_ELEM.  The result are written to MBCSET and SBCSET.
2958      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2959      mbcset->range_ends, is a pointer argument since we may
2960      update it.  */
2961 
2962   auto inline reg_errcode_t
2963   __attribute__ ((always_inline))
2964   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2965 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2966     {
2967       unsigned int ch;
2968       uint32_t start_collseq;
2969       uint32_t end_collseq;
2970 
2971       /* Equivalence Classes and Character Classes can't be a range
2972 	 start/end.  */
2973       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2974 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2975 	      0))
2976 	return REG_ERANGE;
2977 
2978       /* FIXME: Implement rational ranges here, too.  */
2979       start_collseq = lookup_collation_sequence_value (start_elem);
2980       end_collseq = lookup_collation_sequence_value (end_elem);
2981       /* Check start/end collation sequence values.  */
2982       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2983 	return REG_ECOLLATE;
2984       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2985 	return REG_ERANGE;
2986 
2987       /* Got valid collation sequence values, add them as a new entry.
2988 	 However, if we have no collation elements, and the character set
2989 	 is single byte, the single byte character set that we
2990 	 build below suffices. */
2991       if (nrules > 0 || dfa->mb_cur_max > 1)
2992 	{
2993 	  /* Check the space of the arrays.  */
2994 	  if (BE (*range_alloc == mbcset->nranges, 0))
2995 	    {
2996 	      /* There is not enough space, need realloc.  */
2997 	      uint32_t *new_array_start;
2998 	      uint32_t *new_array_end;
2999 	      Idx new_nranges;
3000 
3001 	      /* +1 in case of mbcset->nranges is 0.  */
3002 	      new_nranges = 2 * mbcset->nranges + 1;
3003 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3004 					    new_nranges);
3005 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3006 					  new_nranges);
3007 
3008 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3009 		return REG_ESPACE;
3010 
3011 	      mbcset->range_starts = new_array_start;
3012 	      mbcset->range_ends = new_array_end;
3013 	      *range_alloc = new_nranges;
3014 	    }
3015 
3016 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
3017 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
3018 	}
3019 
3020       /* Build the table for single byte characters.  */
3021       for (ch = 0; ch < SBC_MAX; ch++)
3022 	{
3023 	  uint32_t ch_collseq;
3024 	  /*
3025 	  if (MB_CUR_MAX == 1)
3026 	  */
3027 	  if (nrules == 0)
3028 	    ch_collseq = collseqmb[ch];
3029 	  else
3030 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3031 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3032 	    bitset_set (sbcset, ch);
3033 	}
3034       return REG_NOERROR;
3035     }
3036 
3037   /* Local function for parse_bracket_exp used in _LIBC environment.
3038      Build the collating element which is represented by NAME.
3039      The result are written to MBCSET and SBCSET.
3040      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3041      pointer argument since we may update it.  */
3042 
3043   auto inline reg_errcode_t
3044   __attribute__ ((always_inline))
3045   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3046 			  Idx *coll_sym_alloc, const unsigned char *name)
3047     {
3048       int32_t elem, idx;
3049       size_t name_len = strlen ((const char *) name);
3050       if (nrules != 0)
3051 	{
3052 	  elem = seek_collating_symbol_entry (name, name_len);
3053 	  if (elem != -1)
3054 	    {
3055 	      /* We found the entry.  */
3056 	      idx = symb_table[2 * elem + 1];
3057 	      /* Skip the name of collating element name.  */
3058 	      idx += 1 + extra[idx];
3059 	    }
3060 	  else if (name_len == 1)
3061 	    {
3062 	      /* No valid character, treat it as a normal
3063 		 character.  */
3064 	      bitset_set (sbcset, name[0]);
3065 	      return REG_NOERROR;
3066 	    }
3067 	  else
3068 	    return REG_ECOLLATE;
3069 
3070 	  /* Got valid collation sequence, add it as a new entry.  */
3071 	  /* Check the space of the arrays.  */
3072 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3073 	    {
3074 	      /* Not enough, realloc it.  */
3075 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3076 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3077 	      /* Use realloc since mbcset->coll_syms is NULL
3078 		 if *alloc == 0.  */
3079 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3080 						   new_coll_sym_alloc);
3081 	      if (BE (new_coll_syms == NULL, 0))
3082 		return REG_ESPACE;
3083 	      mbcset->coll_syms = new_coll_syms;
3084 	      *coll_sym_alloc = new_coll_sym_alloc;
3085 	    }
3086 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3087 	  return REG_NOERROR;
3088 	}
3089       else
3090 	{
3091 	  if (BE (name_len != 1, 0))
3092 	    return REG_ECOLLATE;
3093 	  else
3094 	    {
3095 	      bitset_set (sbcset, name[0]);
3096 	      return REG_NOERROR;
3097 	    }
3098 	}
3099     }
3100 #endif
3101 
3102   re_token_t br_token;
3103   re_bitset_ptr_t sbcset;
3104 #ifdef RE_ENABLE_I18N
3105   re_charset_t *mbcset;
3106   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3107   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3108 #endif /* not RE_ENABLE_I18N */
3109   bool non_match = false;
3110   bin_tree_t *work_tree;
3111   int token_len;
3112   bool first_round = true;
3113 #ifdef _LIBC
3114   collseqmb = (const unsigned char *)
3115     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3116   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3117   if (nrules)
3118     {
3119       /*
3120       if (MB_CUR_MAX > 1)
3121       */
3122       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3123       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3124       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3125 						  _NL_COLLATE_SYMB_TABLEMB);
3126       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3127 						   _NL_COLLATE_SYMB_EXTRAMB);
3128     }
3129 #endif
3130   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3131 #ifdef RE_ENABLE_I18N
3132   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3133 #endif /* RE_ENABLE_I18N */
3134 #ifdef RE_ENABLE_I18N
3135   if (BE (sbcset == NULL || mbcset == NULL, 0))
3136 #else
3137   if (BE (sbcset == NULL, 0))
3138 #endif /* RE_ENABLE_I18N */
3139     {
3140       re_free (sbcset);
3141 #ifdef RE_ENABLE_I18N
3142       re_free (mbcset);
3143 #endif
3144       *err = REG_ESPACE;
3145       return NULL;
3146     }
3147 
3148   token_len = peek_token_bracket (token, regexp, syntax);
3149   if (BE (token->type == END_OF_RE, 0))
3150     {
3151       *err = REG_BADPAT;
3152       goto parse_bracket_exp_free_return;
3153     }
3154   if (token->type == OP_NON_MATCH_LIST)
3155     {
3156 #ifdef RE_ENABLE_I18N
3157       mbcset->non_match = 1;
3158 #endif /* not RE_ENABLE_I18N */
3159       non_match = true;
3160       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3161 	bitset_set (sbcset, '\n');
3162       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3163       token_len = peek_token_bracket (token, regexp, syntax);
3164       if (BE (token->type == END_OF_RE, 0))
3165 	{
3166 	  *err = REG_BADPAT;
3167 	  goto parse_bracket_exp_free_return;
3168 	}
3169     }
3170 
3171   /* We treat the first ']' as a normal character.  */
3172   if (token->type == OP_CLOSE_BRACKET)
3173     token->type = CHARACTER;
3174 
3175   while (1)
3176     {
3177       bracket_elem_t start_elem, end_elem;
3178       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3179       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3180       reg_errcode_t ret;
3181       int token_len2 = 0;
3182       bool is_range_exp = false;
3183       re_token_t token2;
3184 
3185       start_elem.opr.name = start_name_buf;
3186       start_elem.type = COLL_SYM;
3187       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3188 				   syntax, first_round);
3189       if (BE (ret != REG_NOERROR, 0))
3190 	{
3191 	  *err = ret;
3192 	  goto parse_bracket_exp_free_return;
3193 	}
3194       first_round = false;
3195 
3196       /* Get information about the next token.  We need it in any case.  */
3197       token_len = peek_token_bracket (token, regexp, syntax);
3198 
3199       /* Do not check for ranges if we know they are not allowed.  */
3200       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3201 	{
3202 	  if (BE (token->type == END_OF_RE, 0))
3203 	    {
3204 	      *err = REG_EBRACK;
3205 	      goto parse_bracket_exp_free_return;
3206 	    }
3207 	  if (token->type == OP_CHARSET_RANGE)
3208 	    {
3209 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3210 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3211 	      if (BE (token2.type == END_OF_RE, 0))
3212 		{
3213 		  *err = REG_EBRACK;
3214 		  goto parse_bracket_exp_free_return;
3215 		}
3216 	      if (token2.type == OP_CLOSE_BRACKET)
3217 		{
3218 		  /* We treat the last '-' as a normal character.  */
3219 		  re_string_skip_bytes (regexp, -token_len);
3220 		  token->type = CHARACTER;
3221 		}
3222 	      else
3223 		is_range_exp = true;
3224 	    }
3225 	}
3226 
3227       if (is_range_exp == true)
3228 	{
3229 	  end_elem.opr.name = end_name_buf;
3230 	  end_elem.type = COLL_SYM;
3231 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3232 				       dfa, syntax, true);
3233 	  if (BE (ret != REG_NOERROR, 0))
3234 	    {
3235 	      *err = ret;
3236 	      goto parse_bracket_exp_free_return;
3237 	    }
3238 
3239 	  token_len = peek_token_bracket (token, regexp, syntax);
3240 
3241 #ifdef _LIBC
3242 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3243 				  &start_elem, &end_elem);
3244 #else
3245 # ifdef RE_ENABLE_I18N
3246 	  *err = build_range_exp (syntax, sbcset,
3247 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3248 				  &range_alloc, &start_elem, &end_elem);
3249 # else
3250 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3251 # endif
3252 #endif /* RE_ENABLE_I18N */
3253 	  if (BE (*err != REG_NOERROR, 0))
3254 	    goto parse_bracket_exp_free_return;
3255 	}
3256       else
3257 	{
3258 	  switch (start_elem.type)
3259 	    {
3260 	    case SB_CHAR:
3261 	      bitset_set (sbcset, start_elem.opr.ch);
3262 	      break;
3263 #ifdef RE_ENABLE_I18N
3264 	    case MB_CHAR:
3265 	      /* Check whether the array has enough space.  */
3266 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3267 		{
3268 		  wchar_t *new_mbchars;
3269 		  /* Not enough, realloc it.  */
3270 		  /* +1 in case of mbcset->nmbchars is 0.  */
3271 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3272 		  /* Use realloc since array is NULL if *alloc == 0.  */
3273 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3274 					    mbchar_alloc);
3275 		  if (BE (new_mbchars == NULL, 0))
3276 		    goto parse_bracket_exp_espace;
3277 		  mbcset->mbchars = new_mbchars;
3278 		}
3279 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3280 	      break;
3281 #endif /* RE_ENABLE_I18N */
3282 	    case EQUIV_CLASS:
3283 	      *err = build_equiv_class (sbcset,
3284 #ifdef RE_ENABLE_I18N
3285 					mbcset, &equiv_class_alloc,
3286 #endif /* RE_ENABLE_I18N */
3287 					start_elem.opr.name);
3288 	      if (BE (*err != REG_NOERROR, 0))
3289 		goto parse_bracket_exp_free_return;
3290 	      break;
3291 	    case COLL_SYM:
3292 	      *err = build_collating_symbol (sbcset,
3293 #ifdef RE_ENABLE_I18N
3294 					     mbcset, &coll_sym_alloc,
3295 #endif /* RE_ENABLE_I18N */
3296 					     start_elem.opr.name);
3297 	      if (BE (*err != REG_NOERROR, 0))
3298 		goto parse_bracket_exp_free_return;
3299 	      break;
3300 	    case CHAR_CLASS:
3301 	      *err = build_charclass (regexp->trans, sbcset,
3302 #ifdef RE_ENABLE_I18N
3303 				      mbcset, &char_class_alloc,
3304 #endif /* RE_ENABLE_I18N */
3305 				      (const char *) start_elem.opr.name,
3306 				      syntax);
3307 	      if (BE (*err != REG_NOERROR, 0))
3308 	       goto parse_bracket_exp_free_return;
3309 	      break;
3310 	    default:
3311 	      assert (0);
3312 	      break;
3313 	    }
3314 	}
3315       if (BE (token->type == END_OF_RE, 0))
3316 	{
3317 	  *err = REG_EBRACK;
3318 	  goto parse_bracket_exp_free_return;
3319 	}
3320       if (token->type == OP_CLOSE_BRACKET)
3321 	break;
3322     }
3323 
3324   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3325 
3326   /* If it is non-matching list.  */
3327   if (non_match)
3328     bitset_not (sbcset);
3329 
3330 #ifdef RE_ENABLE_I18N
3331   /* Ensure only single byte characters are set.  */
3332   if (dfa->mb_cur_max > 1)
3333     bitset_mask (sbcset, dfa->sb_char);
3334 
3335   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3336       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3337 						     || mbcset->non_match)))
3338     {
3339       bin_tree_t *mbc_tree;
3340       int sbc_idx;
3341       /* Build a tree for complex bracket.  */
3342       dfa->has_mb_node = 1;
3343       br_token.type = COMPLEX_BRACKET;
3344       br_token.opr.mbcset = mbcset;
3345       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3346       if (BE (mbc_tree == NULL, 0))
3347 	goto parse_bracket_exp_espace;
3348       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3349 	if (sbcset[sbc_idx])
3350 	  break;
3351       /* If there are no bits set in sbcset, there is no point
3352 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3353       if (sbc_idx < BITSET_WORDS)
3354 	{
3355 	  /* Build a tree for simple bracket.  */
3356 	  br_token.type = SIMPLE_BRACKET;
3357 	  br_token.opr.sbcset = sbcset;
3358 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359 	  if (BE (work_tree == NULL, 0))
3360 	    goto parse_bracket_exp_espace;
3361 
3362 	  /* Then join them by ALT node.  */
3363 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3364 	  if (BE (work_tree == NULL, 0))
3365 	    goto parse_bracket_exp_espace;
3366 	}
3367       else
3368 	{
3369 	  re_free (sbcset);
3370 	  work_tree = mbc_tree;
3371 	}
3372     }
3373   else
3374 #endif /* not RE_ENABLE_I18N */
3375     {
3376 #ifdef RE_ENABLE_I18N
3377       free_charset (mbcset);
3378 #endif
3379       /* Build a tree for simple bracket.  */
3380       br_token.type = SIMPLE_BRACKET;
3381       br_token.opr.sbcset = sbcset;
3382       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3383       if (BE (work_tree == NULL, 0))
3384 	goto parse_bracket_exp_espace;
3385     }
3386   return work_tree;
3387 
3388  parse_bracket_exp_espace:
3389   *err = REG_ESPACE;
3390  parse_bracket_exp_free_return:
3391   re_free (sbcset);
3392 #ifdef RE_ENABLE_I18N
3393   free_charset (mbcset);
3394 #endif /* RE_ENABLE_I18N */
3395   return NULL;
3396 }
3397 
3398 /* Parse an element in the bracket expression.  */
3399 
3400 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3401 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3402 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3403 		       reg_syntax_t syntax, bool accept_hyphen)
3404 {
3405 #ifdef RE_ENABLE_I18N
3406   int cur_char_size;
3407   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3408   if (cur_char_size > 1)
3409     {
3410       elem->type = MB_CHAR;
3411       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3412       re_string_skip_bytes (regexp, cur_char_size);
3413       return REG_NOERROR;
3414     }
3415 #endif /* RE_ENABLE_I18N */
3416   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3417   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3418       || token->type == OP_OPEN_EQUIV_CLASS)
3419     return parse_bracket_symbol (elem, regexp, token);
3420   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3421     {
3422       /* A '-' must only appear as anything but a range indicator before
3423 	 the closing bracket.  Everything else is an error.  */
3424       re_token_t token2;
3425       (void) peek_token_bracket (&token2, regexp, syntax);
3426       if (token2.type != OP_CLOSE_BRACKET)
3427 	/* The actual error value is not standardized since this whole
3428 	   case is undefined.  But ERANGE makes good sense.  */
3429 	return REG_ERANGE;
3430     }
3431   elem->type = SB_CHAR;
3432   elem->opr.ch = token->opr.c;
3433   return REG_NOERROR;
3434 }
3435 
3436 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3437    such as [:<character_class>:], [.<collating_element>.], and
3438    [=<equivalent_class>=].  */
3439 
3440 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3441 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3442 		      re_token_t *token)
3443 {
3444   unsigned char ch, delim = token->opr.c;
3445   int i = 0;
3446   if (re_string_eoi(regexp))
3447     return REG_EBRACK;
3448   for (;; ++i)
3449     {
3450       if (i >= BRACKET_NAME_BUF_SIZE)
3451 	return REG_EBRACK;
3452       if (token->type == OP_OPEN_CHAR_CLASS)
3453 	ch = re_string_fetch_byte_case (regexp);
3454       else
3455 	ch = re_string_fetch_byte (regexp);
3456       if (re_string_eoi(regexp))
3457 	return REG_EBRACK;
3458       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3459 	break;
3460       elem->opr.name[i] = ch;
3461     }
3462   re_string_skip_bytes (regexp, 1);
3463   elem->opr.name[i] = '\0';
3464   switch (token->type)
3465     {
3466     case OP_OPEN_COLL_ELEM:
3467       elem->type = COLL_SYM;
3468       break;
3469     case OP_OPEN_EQUIV_CLASS:
3470       elem->type = EQUIV_CLASS;
3471       break;
3472     case OP_OPEN_CHAR_CLASS:
3473       elem->type = CHAR_CLASS;
3474       break;
3475     default:
3476       break;
3477     }
3478   return REG_NOERROR;
3479 }
3480 
3481   /* Helper function for parse_bracket_exp.
3482      Build the equivalence class which is represented by NAME.
3483      The result are written to MBCSET and SBCSET.
3484      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3485      is a pointer argument since we may update it.  */
3486 
3487 static reg_errcode_t
3488 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3489 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3490 		   Idx *equiv_class_alloc, const unsigned char *name)
3491 #else /* not RE_ENABLE_I18N */
3492 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3493 #endif /* not RE_ENABLE_I18N */
3494 {
3495 #ifdef _LIBC
3496   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3497   if (nrules != 0)
3498     {
3499       const int32_t *table, *indirect;
3500       const unsigned char *weights, *extra, *cp;
3501       unsigned char char_buf[2];
3502       int32_t idx1, idx2;
3503       unsigned int ch;
3504       size_t len;
3505       /* Calculate the index for equivalence class.  */
3506       cp = name;
3507       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 					       _NL_COLLATE_WEIGHTMB);
3510       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511 						   _NL_COLLATE_EXTRAMB);
3512       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3513 						_NL_COLLATE_INDIRECTMB);
3514       idx1 = findidx (table, indirect, extra, &cp, -1);
3515       if (BE (idx1 == 0 || *cp != '\0', 0))
3516 	/* This isn't a valid character.  */
3517 	return REG_ECOLLATE;
3518 
3519       /* Build single byte matching table for this equivalence class.  */
3520       len = weights[idx1 & 0xffffff];
3521       for (ch = 0; ch < SBC_MAX; ++ch)
3522 	{
3523 	  char_buf[0] = ch;
3524 	  cp = char_buf;
3525 	  idx2 = findidx (table, indirect, extra, &cp, 1);
3526 /*
3527 	  idx2 = table[ch];
3528 */
3529 	  if (idx2 == 0)
3530 	    /* This isn't a valid character.  */
3531 	    continue;
3532 	  /* Compare only if the length matches and the collation rule
3533 	     index is the same.  */
3534 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3535 	      && memcmp (weights + (idx1 & 0xffffff) + 1,
3536 			 weights + (idx2 & 0xffffff) + 1, len) == 0)
3537 	    bitset_set (sbcset, ch);
3538 	}
3539       /* Check whether the array has enough space.  */
3540       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3541 	{
3542 	  /* Not enough, realloc it.  */
3543 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3544 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3545 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3546 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3547 						   int32_t,
3548 						   new_equiv_class_alloc);
3549 	  if (BE (new_equiv_classes == NULL, 0))
3550 	    return REG_ESPACE;
3551 	  mbcset->equiv_classes = new_equiv_classes;
3552 	  *equiv_class_alloc = new_equiv_class_alloc;
3553 	}
3554       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3555     }
3556   else
3557 #endif /* _LIBC */
3558     {
3559       if (BE (strlen ((const char *) name) != 1, 0))
3560 	return REG_ECOLLATE;
3561       bitset_set (sbcset, *name);
3562     }
3563   return REG_NOERROR;
3564 }
3565 
3566   /* Helper function for parse_bracket_exp.
3567      Build the character class which is represented by NAME.
3568      The result are written to MBCSET and SBCSET.
3569      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3570      is a pointer argument since we may update it.  */
3571 
3572 static reg_errcode_t
3573 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const char * class_name,reg_syntax_t syntax)3574 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3575 		 re_charset_t *mbcset, Idx *char_class_alloc,
3576 		 const char *class_name, reg_syntax_t syntax)
3577 #else /* not RE_ENABLE_I18N */
3578 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3579 		 const char *class_name, reg_syntax_t syntax)
3580 #endif /* not RE_ENABLE_I18N */
3581 {
3582   int i;
3583   const char *name = class_name;
3584 
3585   /* In case of REG_ICASE "upper" and "lower" match the both of
3586      upper and lower cases.  */
3587   if ((syntax & RE_ICASE)
3588       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3589     name = "alpha";
3590 
3591 #ifdef RE_ENABLE_I18N
3592   /* Check the space of the arrays.  */
3593   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3594     {
3595       /* Not enough, realloc it.  */
3596       /* +1 in case of mbcset->nchar_classes is 0.  */
3597       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3598       /* Use realloc since array is NULL if *alloc == 0.  */
3599       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3600 					       new_char_class_alloc);
3601       if (BE (new_char_classes == NULL, 0))
3602 	return REG_ESPACE;
3603       mbcset->char_classes = new_char_classes;
3604       *char_class_alloc = new_char_class_alloc;
3605     }
3606   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3607 #endif /* RE_ENABLE_I18N */
3608 
3609 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3610   do {						\
3611     if (BE (trans != NULL, 0))			\
3612       {						\
3613 	for (i = 0; i < SBC_MAX; ++i)		\
3614 	  if (ctype_func (i))			\
3615 	    bitset_set (sbcset, trans[i]);	\
3616       }						\
3617     else					\
3618       {						\
3619 	for (i = 0; i < SBC_MAX; ++i)		\
3620 	  if (ctype_func (i))			\
3621 	    bitset_set (sbcset, i);		\
3622       }						\
3623   } while (0)
3624 
3625   if (strcmp (name, "alnum") == 0)
3626     BUILD_CHARCLASS_LOOP (isalnum);
3627   else if (strcmp (name, "cntrl") == 0)
3628     BUILD_CHARCLASS_LOOP (iscntrl);
3629   else if (strcmp (name, "lower") == 0)
3630     BUILD_CHARCLASS_LOOP (islower);
3631   else if (strcmp (name, "space") == 0)
3632     BUILD_CHARCLASS_LOOP (isspace);
3633   else if (strcmp (name, "alpha") == 0)
3634     BUILD_CHARCLASS_LOOP (isalpha);
3635   else if (strcmp (name, "digit") == 0)
3636     BUILD_CHARCLASS_LOOP (isdigit);
3637   else if (strcmp (name, "print") == 0)
3638     BUILD_CHARCLASS_LOOP (isprint);
3639   else if (strcmp (name, "upper") == 0)
3640     BUILD_CHARCLASS_LOOP (isupper);
3641   else if (strcmp (name, "blank") == 0)
3642     BUILD_CHARCLASS_LOOP (isblank);
3643   else if (strcmp (name, "graph") == 0)
3644     BUILD_CHARCLASS_LOOP (isgraph);
3645   else if (strcmp (name, "punct") == 0)
3646     BUILD_CHARCLASS_LOOP (ispunct);
3647   else if (strcmp (name, "xdigit") == 0)
3648     BUILD_CHARCLASS_LOOP (isxdigit);
3649   else
3650     return REG_ECTYPE;
3651 
3652   return REG_NOERROR;
3653 }
3654 
3655 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const char * class_name,const char * extra,bool non_match,reg_errcode_t * err)3656 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3657 		    const char *class_name,
3658 		    const char *extra, bool non_match,
3659 		    reg_errcode_t *err)
3660 {
3661   re_bitset_ptr_t sbcset;
3662 #ifdef RE_ENABLE_I18N
3663   re_charset_t *mbcset;
3664   Idx alloc = 0;
3665 #endif /* not RE_ENABLE_I18N */
3666   reg_errcode_t ret;
3667   re_token_t br_token;
3668   bin_tree_t *tree;
3669 
3670   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3671   if (BE (sbcset == NULL, 0))
3672     {
3673       *err = REG_ESPACE;
3674       return NULL;
3675     }
3676 #ifdef RE_ENABLE_I18N
3677   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3678   if (BE (mbcset == NULL, 0))
3679     {
3680       re_free (sbcset);
3681       *err = REG_ESPACE;
3682       return NULL;
3683     }
3684   mbcset->non_match = non_match;
3685 #endif /* RE_ENABLE_I18N */
3686 
3687   /* We don't care the syntax in this case.  */
3688   ret = build_charclass (trans, sbcset,
3689 #ifdef RE_ENABLE_I18N
3690 			 mbcset, &alloc,
3691 #endif /* RE_ENABLE_I18N */
3692 			 class_name, 0);
3693 
3694   if (BE (ret != REG_NOERROR, 0))
3695     {
3696       re_free (sbcset);
3697 #ifdef RE_ENABLE_I18N
3698       free_charset (mbcset);
3699 #endif /* RE_ENABLE_I18N */
3700       *err = ret;
3701       return NULL;
3702     }
3703   /* \w match '_' also.  */
3704   for (; *extra; extra++)
3705     bitset_set (sbcset, *extra);
3706 
3707   /* If it is non-matching list.  */
3708   if (non_match)
3709     bitset_not (sbcset);
3710 
3711 #ifdef RE_ENABLE_I18N
3712   /* Ensure only single byte characters are set.  */
3713   if (dfa->mb_cur_max > 1)
3714     bitset_mask (sbcset, dfa->sb_char);
3715 #endif
3716 
3717   /* Build a tree for simple bracket.  */
3718 #if defined GCC_LINT || defined lint
3719   memset (&br_token, 0, sizeof br_token);
3720 #endif
3721   br_token.type = SIMPLE_BRACKET;
3722   br_token.opr.sbcset = sbcset;
3723   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724   if (BE (tree == NULL, 0))
3725     goto build_word_op_espace;
3726 
3727 #ifdef RE_ENABLE_I18N
3728   if (dfa->mb_cur_max > 1)
3729     {
3730       bin_tree_t *mbc_tree;
3731       /* Build a tree for complex bracket.  */
3732       br_token.type = COMPLEX_BRACKET;
3733       br_token.opr.mbcset = mbcset;
3734       dfa->has_mb_node = 1;
3735       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3736       if (BE (mbc_tree == NULL, 0))
3737 	goto build_word_op_espace;
3738       /* Then join them by ALT node.  */
3739       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3740       if (BE (mbc_tree != NULL, 1))
3741 	return tree;
3742     }
3743   else
3744     {
3745       free_charset (mbcset);
3746       return tree;
3747     }
3748 #else /* not RE_ENABLE_I18N */
3749   return tree;
3750 #endif /* not RE_ENABLE_I18N */
3751 
3752  build_word_op_espace:
3753   re_free (sbcset);
3754 #ifdef RE_ENABLE_I18N
3755   free_charset (mbcset);
3756 #endif /* RE_ENABLE_I18N */
3757   *err = REG_ESPACE;
3758   return NULL;
3759 }
3760 
3761 /* This is intended for the expressions like "a{1,3}".
3762    Fetch a number from 'input', and return the number.
3763    Return -1 if the number field is empty like "{,1}".
3764    Return RE_DUP_MAX + 1 if the number field is too large.
3765    Return -2 if an error occurred.  */
3766 
3767 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3768 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3769 {
3770   Idx num = -1;
3771   unsigned char c;
3772   while (1)
3773     {
3774       fetch_token (token, input, syntax);
3775       c = token->opr.c;
3776       if (BE (token->type == END_OF_RE, 0))
3777 	return -2;
3778       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3779 	break;
3780       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3781 	     ? -2
3782 	     : num == -1
3783 	     ? c - '0'
3784 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3785     }
3786   return num;
3787 }
3788 
3789 #ifdef RE_ENABLE_I18N
3790 static void
free_charset(re_charset_t * cset)3791 free_charset (re_charset_t *cset)
3792 {
3793   re_free (cset->mbchars);
3794 # ifdef _LIBC
3795   re_free (cset->coll_syms);
3796   re_free (cset->equiv_classes);
3797   re_free (cset->range_starts);
3798   re_free (cset->range_ends);
3799 # endif
3800   re_free (cset->char_classes);
3801   re_free (cset);
3802 }
3803 #endif /* RE_ENABLE_I18N */
3804 
3805 /* Functions for binary tree operation.  */
3806 
3807 /* Create a tree node.  */
3808 
3809 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3810 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3811 	     re_token_type_t type)
3812 {
3813   re_token_t t;
3814 #if defined GCC_LINT || defined lint
3815   memset (&t, 0, sizeof t);
3816 #endif
3817   t.type = type;
3818   return create_token_tree (dfa, left, right, &t);
3819 }
3820 
3821 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3822 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3823 		   const re_token_t *token)
3824 {
3825   bin_tree_t *tree;
3826   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3827     {
3828       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829 
3830       if (storage == NULL)
3831 	return NULL;
3832       storage->next = dfa->str_tree_storage;
3833       dfa->str_tree_storage = storage;
3834       dfa->str_tree_storage_idx = 0;
3835     }
3836   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837 
3838   tree->parent = NULL;
3839   tree->left = left;
3840   tree->right = right;
3841   tree->token = *token;
3842   tree->token.duplicated = 0;
3843   tree->token.opt_subexp = 0;
3844   tree->first = NULL;
3845   tree->next = NULL;
3846   tree->node_idx = -1;
3847 
3848   if (left != NULL)
3849     left->parent = tree;
3850   if (right != NULL)
3851     right->parent = tree;
3852   return tree;
3853 }
3854 
3855 /* Mark the tree SRC as an optional subexpression.
3856    To be called from preorder or postorder.  */
3857 
3858 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3859 mark_opt_subexp (void *extra, bin_tree_t *node)
3860 {
3861   Idx idx = (uintptr_t) extra;
3862   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863     node->token.opt_subexp = 1;
3864 
3865   return REG_NOERROR;
3866 }
3867 
3868 /* Free the allocated memory inside NODE. */
3869 
3870 static void
free_token(re_token_t * node)3871 free_token (re_token_t *node)
3872 {
3873 #ifdef RE_ENABLE_I18N
3874   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3875     free_charset (node->opr.mbcset);
3876   else
3877 #endif /* RE_ENABLE_I18N */
3878     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3879       re_free (node->opr.sbcset);
3880 }
3881 
3882 /* Worker function for tree walking.  Free the allocated memory inside NODE
3883    and its children. */
3884 
3885 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3886 free_tree (void *extra, bin_tree_t *node)
3887 {
3888   free_token (&node->token);
3889   return REG_NOERROR;
3890 }
3891 
3892 
3893 /* Duplicate the node SRC, and return new node.  This is a preorder
3894    visit similar to the one implemented by the generic visitor, but
3895    we need more infrastructure to maintain two parallel trees --- so,
3896    it's easier to duplicate.  */
3897 
3898 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3899 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3900 {
3901   const bin_tree_t *node;
3902   bin_tree_t *dup_root;
3903   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3904 
3905   for (node = root; ; )
3906     {
3907       /* Create a new tree and link it back to the current parent.  */
3908       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909       if (*p_new == NULL)
3910 	return NULL;
3911       (*p_new)->parent = dup_node;
3912       (*p_new)->token.duplicated = 1;
3913       dup_node = *p_new;
3914 
3915       /* Go to the left node, or up and to the right.  */
3916       if (node->left)
3917 	{
3918 	  node = node->left;
3919 	  p_new = &dup_node->left;
3920 	}
3921       else
3922 	{
3923 	  const bin_tree_t *prev = NULL;
3924 	  while (node->right == prev || node->right == NULL)
3925 	    {
3926 	      prev = node;
3927 	      node = node->parent;
3928 	      dup_node = dup_node->parent;
3929 	      if (!node)
3930 		return dup_root;
3931 	    }
3932 	  node = node->right;
3933 	  p_new = &dup_node->right;
3934 	}
3935     }
3936 }
3937