xref: /haiku/src/system/libroot/posix/glibc/regex/regcomp.c (revision 820dca4df6c7bf955c46e8f6521b9408f50b2900)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002,2003,2004,2005,2006,2007,2009
3    Free Software Foundation, Inc.
4    This file is part of the GNU C Library.
5    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 
7    The GNU C Library is free software; you can redistribute it and/or
8    modify it under the terms of the GNU Lesser General Public
9    License as published by the Free Software Foundation; either
10    version 2.1 of the License, or (at your option) any later version.
11 
12    The GNU C Library is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    Lesser General Public License for more details.
16 
17    You should have received a copy of the GNU Lesser General Public
18    License along with the GNU C Library; if not, write to the Free
19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20    02111-1307 USA.  */
21 
22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23 					  size_t length, reg_syntax_t syntax);
24 static void re_compile_fastmap_iter (regex_t *bufp,
25 				     const re_dfastate_t *init_state,
26 				     char *fastmap);
27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 			       reg_errcode_t (fn (void *, bin_tree_t *)),
39 			       void *extra);
40 static reg_errcode_t postorder (bin_tree_t *root,
41 				reg_errcode_t (fn (void *, bin_tree_t *)),
42 				void *extra);
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 				 bin_tree_t *node);
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
51 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
52 				   unsigned int constraint);
53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 					 int node, int root);
56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57 static int fetch_number (re_string_t *input, re_token_t *token,
58 			 reg_syntax_t syntax);
59 static int peek_token (re_token_t *token, re_string_t *input,
60 			reg_syntax_t syntax) internal_function;
61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62 			  reg_syntax_t syntax, reg_errcode_t *err);
63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64 				  re_token_t *token, reg_syntax_t syntax,
65 				  int nest, reg_errcode_t *err);
66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67 				 re_token_t *token, reg_syntax_t syntax,
68 				 int nest, reg_errcode_t *err);
69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70 				     re_token_t *token, reg_syntax_t syntax,
71 				     int nest, reg_errcode_t *err);
72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73 				  re_token_t *token, reg_syntax_t syntax,
74 				  int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76 				 re_dfa_t *dfa, re_token_t *token,
77 				 reg_syntax_t syntax, reg_errcode_t *err);
78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79 				      re_token_t *token, reg_syntax_t syntax,
80 				      reg_errcode_t *err);
81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 					    re_string_t *regexp,
83 					    re_token_t *token, int token_len,
84 					    re_dfa_t *dfa,
85 					    reg_syntax_t syntax,
86 					    int accept_hyphen);
87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88 					  re_string_t *regexp,
89 					  re_token_t *token);
90 #ifdef RE_ENABLE_I18N
91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 					re_charset_t *mbcset,
93 					int *equiv_class_alloc,
94 					const unsigned char *name);
95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 				      bitset_t sbcset,
97 				      re_charset_t *mbcset,
98 				      int *char_class_alloc,
99 				      const unsigned char *class_name,
100 				      reg_syntax_t syntax);
101 #else  /* not RE_ENABLE_I18N */
102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
103 					const unsigned char *name);
104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 				      bitset_t sbcset,
106 				      const unsigned char *class_name,
107 				      reg_syntax_t syntax);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110 				       RE_TRANSLATE_TYPE trans,
111 				       const unsigned char *class_name,
112 				       const unsigned char *extra,
113 				       int non_match, reg_errcode_t *err);
114 static bin_tree_t *create_tree (re_dfa_t *dfa,
115 				bin_tree_t *left, bin_tree_t *right,
116 				re_token_type_t type);
117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118 				      bin_tree_t *left, bin_tree_t *right,
119 				      const re_token_t *token);
120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121 static void free_token (re_token_t *node);
122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 
125 /* This table gives an error message for each of the error codes listed
126    in regex.h.  Obviously the order here has to be same as there.
127    POSIX doesn't require that we do anything for REG_NOERROR,
128    but why not be nice?  */
129 
130 const char __re_error_msgid[] attribute_hidden =
131   {
132 #define REG_NOERROR_IDX	0
133     gettext_noop ("Success")	/* REG_NOERROR */
134     "\0"
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136     gettext_noop ("No match")	/* REG_NOMATCH */
137     "\0"
138 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140     "\0"
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143     "\0"
144 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146     "\0"
147 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149     "\0"
150 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152     "\0"
153 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
154     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
155     "\0"
156 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158     "\0"
159 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161     "\0"
162 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164     "\0"
165 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166     gettext_noop ("Invalid range end")	/* REG_ERANGE */
167     "\0"
168 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
170     "\0"
171 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173     "\0"
174 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
176     "\0"
177 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
179     "\0"
180 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182   };
183 
184 const size_t __re_error_msgid_idx[] attribute_hidden =
185   {
186     REG_NOERROR_IDX,
187     REG_NOMATCH_IDX,
188     REG_BADPAT_IDX,
189     REG_ECOLLATE_IDX,
190     REG_ECTYPE_IDX,
191     REG_EESCAPE_IDX,
192     REG_ESUBREG_IDX,
193     REG_EBRACK_IDX,
194     REG_EPAREN_IDX,
195     REG_EBRACE_IDX,
196     REG_BADBR_IDX,
197     REG_ERANGE_IDX,
198     REG_ESPACE_IDX,
199     REG_BADRPT_IDX,
200     REG_EEND_IDX,
201     REG_ESIZE_IDX,
202     REG_ERPAREN_IDX
203   };
204 
205 /* Entry points for GNU code.  */
206 
207 /* re_compile_pattern is the GNU regular expression compiler: it
208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209    Returns 0 if the pattern was valid, otherwise an error string.
210 
211    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212    are set in BUFP on entry.  */
213 
214 const char *
215 re_compile_pattern (pattern, length, bufp)
216     const char *pattern;
217     size_t length;
218     struct re_pattern_buffer *bufp;
219 {
220   reg_errcode_t ret;
221 
222   /* And GNU code determines whether or not to get register information
223      by passing null for the REGS argument to re_match, etc., not by
224      setting no_sub, unless RE_NO_SUB is set.  */
225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226 
227   /* Match anchors at newline.  */
228   bufp->newline_anchor = 1;
229 
230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231 
232   if (!ret)
233     return NULL;
234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
239 
240 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
241    also be assigned to arbitrarily: each pattern buffer stores its own
242    syntax, so it can be changed between regex compilations.  */
243 /* This has no initializer because initialized variables in Emacs
244    become read-only after dumping.  */
245 reg_syntax_t re_syntax_options;
246 
247 
248 /* Specify the precise syntax of regexps for compilation.  This provides
249    for compatibility for various utilities which historically have
250    different, incompatible syntaxes.
251 
252    The argument SYNTAX is a bit mask comprised of the various bits
253    defined in regex.h.  We return the old syntax.  */
254 
255 reg_syntax_t
256 re_set_syntax (syntax)
257     reg_syntax_t syntax;
258 {
259   reg_syntax_t ret = re_syntax_options;
260 
261   re_syntax_options = syntax;
262   return ret;
263 }
264 #ifdef _LIBC
265 weak_alias (__re_set_syntax, re_set_syntax)
266 #endif
267 
268 int
269 re_compile_fastmap (bufp)
270     struct re_pattern_buffer *bufp;
271 {
272   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273   char *fastmap = bufp->fastmap;
274 
275   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277   if (dfa->init_state != dfa->init_state_word)
278     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279   if (dfa->init_state != dfa->init_state_nl)
280     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281   if (dfa->init_state != dfa->init_state_begbuf)
282     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283   bufp->fastmap_accurate = 1;
284   return 0;
285 }
286 #ifdef _LIBC
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
288 #endif
289 
290 static inline void
291 __attribute ((always_inline))
292 re_set_fastmap (char *fastmap, int icase, int ch)
293 {
294   fastmap[ch] = 1;
295   if (icase)
296     fastmap[tolower (ch)] = 1;
297 }
298 
299 /* Helper function for re_compile_fastmap.
300    Compile fastmap for the initial_state INIT_STATE.  */
301 
302 static void
303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304 			 char *fastmap)
305 {
306   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307   int node_cnt;
308   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
310     {
311       int node = init_state->nodes.elems[node_cnt];
312       re_token_type_t type = dfa->nodes[node].type;
313 
314       if (type == CHARACTER)
315 	{
316 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 #ifdef RE_ENABLE_I18N
318 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
319 	    {
320 	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321 	      wchar_t wc;
322 	      mbstate_t state;
323 
324 	      p = buf;
325 	      *p++ = dfa->nodes[node].opr.c;
326 	      while (++node < dfa->nodes_len
327 		     &&	dfa->nodes[node].type == CHARACTER
328 		     && dfa->nodes[node].mb_partial)
329 		*p++ = dfa->nodes[node].opr.c;
330 	      memset (&state, '\0', sizeof (state));
331 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
332 			     &state) == p - buf
333 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
334 		      != (size_t) -1))
335 		re_set_fastmap (fastmap, 0, buf[0]);
336 	    }
337 #endif
338 	}
339       else if (type == SIMPLE_BRACKET)
340 	{
341 	  int i, ch;
342 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343 	    {
344 	      int j;
345 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347 		if (w & ((bitset_word_t) 1 << j))
348 		  re_set_fastmap (fastmap, icase, ch);
349 	    }
350 	}
351 #ifdef RE_ENABLE_I18N
352       else if (type == COMPLEX_BRACKET)
353 	{
354 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 	  int i;
356 
357 # ifdef _LIBC
358 	  /* See if we have to try all bytes which start multiple collation
359 	     elements.
360 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 		  collation element, and don't catch 'b' since 'b' is
362 		  the only collation element which starts from 'b' (and
363 		  it is caught by SIMPLE_BRACKET).  */
364 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365 		  && (cset->ncoll_syms || cset->nranges))
366 		{
367 		  const int32_t *table = (const int32_t *)
368 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 		  for (i = 0; i < SBC_MAX; ++i)
370 		    if (table[i] < 0)
371 		      re_set_fastmap (fastmap, icase, i);
372 		}
373 # endif /* _LIBC */
374 
375 	  /* See if we have to start the match at all multibyte characters,
376 	     i.e. where we would not find an invalid sequence.  This only
377 	     applies to multibyte character sets; for single byte character
378 	     sets, the SIMPLE_BRACKET again suffices.  */
379 	  if (dfa->mb_cur_max > 1
380 	      && (cset->nchar_classes || cset->non_match
381 # ifdef _LIBC
382 		  || cset->nequiv_classes
383 # endif /* _LIBC */
384 		 ))
385 	    {
386 	      unsigned char c = 0;
387 	      do
388 		{
389 		  mbstate_t mbs;
390 		  memset (&mbs, 0, sizeof (mbs));
391 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392 		    re_set_fastmap (fastmap, false, (int) c);
393 		}
394 	      while (++c != 0);
395 	    }
396 
397 	  else
398 	    {
399 	      /* ... Else catch all bytes which can start the mbchars.  */
400 	      for (i = 0; i < cset->nmbchars; ++i)
401 		{
402 		  char buf[256];
403 		  mbstate_t state;
404 		  memset (&state, '\0', sizeof (state));
405 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
408 		    {
409 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
410 			  != (size_t) -1)
411 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
412 		    }
413  		}
414  	    }
415 	}
416 #endif /* RE_ENABLE_I18N */
417       else if (type == OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 	       || type == OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 	       || type == END_OF_RE)
422 	{
423 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424 	  if (type == END_OF_RE)
425 	    bufp->can_be_null = 1;
426 	  return;
427 	}
428     }
429 }
430 
431 /* Entry point for POSIX code.  */
432 /* regcomp takes a regular expression as a string and compiles it.
433 
434    PREG is a regex_t *.  We do not expect any fields to be initialized,
435    since POSIX says we shouldn't.  Thus, we set
436 
437      `buffer' to the compiled pattern;
438      `used' to the length of the compiled pattern;
439      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440        REG_EXTENDED bit in CFLAGS is set; otherwise, to
441        RE_SYNTAX_POSIX_BASIC;
442      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
443      `fastmap' to an allocated space for the fastmap;
444      `fastmap_accurate' to zero;
445      `re_nsub' to the number of subexpressions in PATTERN.
446 
447    PATTERN is the address of the pattern string.
448 
449    CFLAGS is a series of bits which affect compilation.
450 
451      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452      use POSIX basic syntax.
453 
454      If REG_NEWLINE is set, then . and [^...] don't match newline.
455      Also, regexec will try a match beginning after every newline.
456 
457      If REG_ICASE is set, then we considers upper- and lowercase
458      versions of letters to be equivalent when matching.
459 
460      If REG_NOSUB is set, then when PREG is passed to regexec, that
461      routine will report only success or failure, and nothing about the
462      registers.
463 
464    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
465    the return codes and their meanings.)  */
466 
467 int
468 regcomp (preg, pattern, cflags)
469     regex_t *__restrict preg;
470     const char *__restrict pattern;
471     int cflags;
472 {
473   reg_errcode_t ret;
474   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
475 			 : RE_SYNTAX_POSIX_BASIC);
476 
477   preg->buffer = NULL;
478   preg->allocated = 0;
479   preg->used = 0;
480 
481   /* Try to allocate space for the fastmap.  */
482   preg->fastmap = re_malloc (char, SBC_MAX);
483   if (BE (preg->fastmap == NULL, 0))
484     return REG_ESPACE;
485 
486   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
487 
488   /* If REG_NEWLINE is set, newlines are treated differently.  */
489   if (cflags & REG_NEWLINE)
490     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
491       syntax &= ~RE_DOT_NEWLINE;
492       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
493       /* It also changes the matching behavior.  */
494       preg->newline_anchor = 1;
495     }
496   else
497     preg->newline_anchor = 0;
498   preg->no_sub = !!(cflags & REG_NOSUB);
499   preg->translate = NULL;
500 
501   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
502 
503   /* POSIX doesn't distinguish between an unmatched open-group and an
504      unmatched close-group: both are REG_EPAREN.  */
505   if (ret == REG_ERPAREN)
506     ret = REG_EPAREN;
507 
508   /* We have already checked preg->fastmap != NULL.  */
509   if (BE (ret == REG_NOERROR, 1))
510     /* Compute the fastmap now, since regexec cannot modify the pattern
511        buffer.  This function never fails in this implementation.  */
512     (void) re_compile_fastmap (preg);
513   else
514     {
515       /* Some error occurred while compiling the expression.  */
516       re_free (preg->fastmap);
517       preg->fastmap = NULL;
518     }
519 
520   return (int) ret;
521 }
522 #ifdef _LIBC
523 weak_alias (__regcomp, regcomp)
524 #endif
525 
526 /* Returns a message corresponding to an error code, ERRCODE, returned
527    from either regcomp or regexec.   We don't use PREG here.  */
528 
529 size_t
530 regerror (errcode, preg, errbuf, errbuf_size)
531     int errcode;
532     const regex_t *__restrict preg;
533     char *__restrict errbuf;
534     size_t errbuf_size;
535 {
536   const char *msg;
537   size_t msg_size;
538 
539   if (BE (errcode < 0
540 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
541 			       / sizeof (__re_error_msgid_idx[0])), 0))
542     /* Only error codes returned by the rest of the code should be passed
543        to this routine.  If we are given anything else, or if other regex
544        code generates an invalid error code, then the program has a bug.
545        Dump core so we can fix it.  */
546     abort ();
547 
548   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
549 
550   msg_size = strlen (msg) + 1; /* Includes the null.  */
551 
552   if (BE (errbuf_size != 0, 1))
553     {
554       if (BE (msg_size > errbuf_size, 0))
555 	{
556 #if defined HAVE_MEMPCPY || defined _LIBC
557 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
558 #else
559 	  memcpy (errbuf, msg, errbuf_size - 1);
560 	  errbuf[errbuf_size - 1] = 0;
561 #endif
562 	}
563       else
564 	memcpy (errbuf, msg, msg_size);
565     }
566 
567   return msg_size;
568 }
569 #ifdef _LIBC
570 weak_alias (__regerror, regerror)
571 #endif
572 
573 
574 #ifdef RE_ENABLE_I18N
575 /* This static array is used for the map to single-byte characters when
576    UTF-8 is used.  Otherwise we would allocate memory just to initialize
577    it the same all the time.  UTF-8 is the preferred encoding so this is
578    a worthwhile optimization.  */
579 static const bitset_t utf8_sb_map =
580 {
581   /* Set the first 128 bits.  */
582   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
583 };
584 #endif
585 
586 
587 static void
588 free_dfa_content (re_dfa_t *dfa)
589 {
590   int i, j;
591 
592   if (dfa->nodes)
593     for (i = 0; i < dfa->nodes_len; ++i)
594       free_token (dfa->nodes + i);
595   re_free (dfa->nexts);
596   for (i = 0; i < dfa->nodes_len; ++i)
597     {
598       if (dfa->eclosures != NULL)
599 	re_node_set_free (dfa->eclosures + i);
600       if (dfa->inveclosures != NULL)
601 	re_node_set_free (dfa->inveclosures + i);
602       if (dfa->edests != NULL)
603 	re_node_set_free (dfa->edests + i);
604     }
605   re_free (dfa->edests);
606   re_free (dfa->eclosures);
607   re_free (dfa->inveclosures);
608   re_free (dfa->nodes);
609 
610   if (dfa->state_table)
611     for (i = 0; i <= dfa->state_hash_mask; ++i)
612       {
613 	struct re_state_table_entry *entry = dfa->state_table + i;
614 	for (j = 0; j < entry->num; ++j)
615 	  {
616 	    re_dfastate_t *state = entry->array[j];
617 	    free_state (state);
618 	  }
619         re_free (entry->array);
620       }
621   re_free (dfa->state_table);
622 #ifdef RE_ENABLE_I18N
623   if (dfa->sb_char != utf8_sb_map)
624     re_free (dfa->sb_char);
625 #endif
626   re_free (dfa->subexp_map);
627 #ifdef DEBUG
628   re_free (dfa->re_str);
629 #endif
630 
631   re_free (dfa);
632 }
633 
634 
635 /* Free dynamically allocated space used by PREG.  */
636 
637 void
638 regfree (preg)
639     regex_t *preg;
640 {
641   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
642   if (BE (dfa != NULL, 1))
643     free_dfa_content (dfa);
644   preg->buffer = NULL;
645   preg->allocated = 0;
646 
647   re_free (preg->fastmap);
648   preg->fastmap = NULL;
649 
650   re_free (preg->translate);
651   preg->translate = NULL;
652 }
653 #ifdef _LIBC
654 weak_alias (__regfree, regfree)
655 #endif
656 
657 /* Entry points compatible with 4.2 BSD regex library.  We don't define
658    them unless specifically requested.  */
659 
660 #if defined _REGEX_RE_COMP || defined _LIBC
661 
662 /* BSD has one and only one pattern buffer.  */
663 static struct re_pattern_buffer re_comp_buf;
664 
665 char *
666 # ifdef _LIBC
667 /* Make these definitions weak in libc, so POSIX programs can redefine
668    these names if they don't use our functions, and still use
669    regcomp/regexec above without link errors.  */
670 weak_function
671 # endif
672 re_comp (s)
673      const char *s;
674 {
675   reg_errcode_t ret;
676   char *fastmap;
677 
678   if (!s)
679     {
680       if (!re_comp_buf.buffer)
681 	return gettext ("No previous regular expression");
682       return 0;
683     }
684 
685   if (re_comp_buf.buffer)
686     {
687       fastmap = re_comp_buf.fastmap;
688       re_comp_buf.fastmap = NULL;
689       __regfree (&re_comp_buf);
690       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
691       re_comp_buf.fastmap = fastmap;
692     }
693 
694   if (re_comp_buf.fastmap == NULL)
695     {
696       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
697       if (re_comp_buf.fastmap == NULL)
698 	return (char *) gettext (__re_error_msgid
699 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
700     }
701 
702   /* Since `re_exec' always passes NULL for the `regs' argument, we
703      don't need to initialize the pattern buffer fields which affect it.  */
704 
705   /* Match anchors at newlines.  */
706   re_comp_buf.newline_anchor = 1;
707 
708   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
709 
710   if (!ret)
711     return NULL;
712 
713   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
714   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
715 }
716 
717 #ifdef _LIBC
718 libc_freeres_fn (free_mem)
719 {
720   __regfree (&re_comp_buf);
721 }
722 #endif
723 
724 #endif /* _REGEX_RE_COMP */
725 
726 /* Internal entry point.
727    Compile the regular expression PATTERN, whose length is LENGTH.
728    SYNTAX indicate regular expression's syntax.  */
729 
730 static reg_errcode_t
731 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
732 		     reg_syntax_t syntax)
733 {
734   reg_errcode_t err = REG_NOERROR;
735   re_dfa_t *dfa;
736   re_string_t regexp;
737 
738   /* Initialize the pattern buffer.  */
739   preg->fastmap_accurate = 0;
740   preg->syntax = syntax;
741   preg->not_bol = preg->not_eol = 0;
742   preg->used = 0;
743   preg->re_nsub = 0;
744   preg->can_be_null = 0;
745   preg->regs_allocated = REGS_UNALLOCATED;
746 
747   /* Initialize the dfa.  */
748   dfa = (re_dfa_t *) preg->buffer;
749   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
750     {
751       /* If zero allocated, but buffer is non-null, try to realloc
752 	 enough space.  This loses if buffer's address is bogus, but
753 	 that is the user's responsibility.  If ->buffer is NULL this
754 	 is a simple allocation.  */
755       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756       if (dfa == NULL)
757 	return REG_ESPACE;
758       preg->allocated = sizeof (re_dfa_t);
759       preg->buffer = (unsigned char *) dfa;
760     }
761   preg->used = sizeof (re_dfa_t);
762 
763   err = init_dfa (dfa, length);
764   if (BE (err != REG_NOERROR, 0))
765     {
766       free_dfa_content (dfa);
767       preg->buffer = NULL;
768       preg->allocated = 0;
769       return err;
770     }
771 #ifdef DEBUG
772   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
773   dfa->re_str = re_malloc (char, length + 1);
774   strncpy (dfa->re_str, pattern, length + 1);
775 #endif
776 
777   __libc_lock_init (dfa->lock);
778 
779   err = re_string_construct (&regexp, pattern, length, preg->translate,
780 			     syntax & RE_ICASE, dfa);
781   if (BE (err != REG_NOERROR, 0))
782     {
783     re_compile_internal_free_return:
784       free_workarea_compile (preg);
785       re_string_destruct (&regexp);
786       free_dfa_content (dfa);
787       preg->buffer = NULL;
788       preg->allocated = 0;
789       return err;
790     }
791 
792   /* Parse the regular expression, and build a structure tree.  */
793   preg->re_nsub = 0;
794   dfa->str_tree = parse (&regexp, preg, syntax, &err);
795   if (BE (dfa->str_tree == NULL, 0))
796     goto re_compile_internal_free_return;
797 
798   /* Analyze the tree and create the nfa.  */
799   err = analyze (preg);
800   if (BE (err != REG_NOERROR, 0))
801     goto re_compile_internal_free_return;
802 
803 #ifdef RE_ENABLE_I18N
804   /* If possible, do searching in single byte encoding to speed things up.  */
805   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
806     optimize_utf8 (dfa);
807 #endif
808 
809   /* Then create the initial state of the dfa.  */
810   err = create_initial_state (dfa);
811 
812   /* Release work areas.  */
813   free_workarea_compile (preg);
814   re_string_destruct (&regexp);
815 
816   if (BE (err != REG_NOERROR, 0))
817     {
818       free_dfa_content (dfa);
819       preg->buffer = NULL;
820       preg->allocated = 0;
821     }
822 
823   return err;
824 }
825 
826 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
827    as the initial length of some arrays.  */
828 
829 static reg_errcode_t
830 init_dfa (re_dfa_t *dfa, size_t pat_len)
831 {
832   unsigned int table_size;
833 #ifndef _LIBC
834   char *codeset_name;
835 #endif
836 
837   memset (dfa, '\0', sizeof (re_dfa_t));
838 
839   /* Force allocation of str_tree_storage the first time.  */
840   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
841 
842   /* Avoid overflows.  */
843   if (pat_len == SIZE_MAX)
844     return REG_ESPACE;
845 
846   dfa->nodes_alloc = pat_len + 1;
847   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
848 
849   /*  table_size = 2 ^ ceil(log pat_len) */
850   for (table_size = 1; ; table_size <<= 1)
851     if (table_size > pat_len)
852       break;
853 
854   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
855   dfa->state_hash_mask = table_size - 1;
856 
857   dfa->mb_cur_max = MB_CUR_MAX;
858 #ifdef _LIBC
859   if (dfa->mb_cur_max == 6
860       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
861     dfa->is_utf8 = 1;
862   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
863 		       != 0);
864 #else
865 # ifdef HAVE_LANGINFO_CODESET
866   codeset_name = nl_langinfo (CODESET);
867 # else
868   codeset_name = getenv ("LC_ALL");
869   if (codeset_name == NULL || codeset_name[0] == '\0')
870     codeset_name = getenv ("LC_CTYPE");
871   if (codeset_name == NULL || codeset_name[0] == '\0')
872     codeset_name = getenv ("LANG");
873   if (codeset_name == NULL)
874     codeset_name = "";
875   else if (strchr (codeset_name, '.') !=  NULL)
876     codeset_name = strchr (codeset_name, '.') + 1;
877 # endif
878 
879   if (strcasecmp (codeset_name, "UTF-8") == 0
880       || strcasecmp (codeset_name, "UTF8") == 0)
881     dfa->is_utf8 = 1;
882 
883   /* We check exhaustively in the loop below if this charset is a
884      superset of ASCII.  */
885   dfa->map_notascii = 0;
886 #endif
887 
888 #ifdef RE_ENABLE_I18N
889   if (dfa->mb_cur_max > 1)
890     {
891       if (dfa->is_utf8)
892 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
893       else
894 	{
895 	  int i, j, ch;
896 
897 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
898 	  if (BE (dfa->sb_char == NULL, 0))
899 	    return REG_ESPACE;
900 
901 	  /* Set the bits corresponding to single byte chars.  */
902 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
903 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
904 	      {
905 		wint_t wch = __btowc (ch);
906 		if (wch != WEOF)
907 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
908 # ifndef _LIBC
909 		if (isascii (ch) && wch != ch)
910 		  dfa->map_notascii = 1;
911 # endif
912 	      }
913 	}
914     }
915 #endif
916 
917   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
918     return REG_ESPACE;
919   return REG_NOERROR;
920 }
921 
922 /* Initialize WORD_CHAR table, which indicate which character is
923    "word".  In this case "word" means that it is the word construction
924    character used by some operators like "\<", "\>", etc.  */
925 
926 static void
927 internal_function
928 init_word_char (re_dfa_t *dfa)
929 {
930   int i, j, ch;
931   dfa->word_ops_used = 1;
932   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
933     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
934       if (isalnum (ch) || ch == '_')
935 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
936 }
937 
938 /* Free the work area which are only used while compiling.  */
939 
940 static void
941 free_workarea_compile (regex_t *preg)
942 {
943   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
944   bin_tree_storage_t *storage, *next;
945   for (storage = dfa->str_tree_storage; storage; storage = next)
946     {
947       next = storage->next;
948       re_free (storage);
949     }
950   dfa->str_tree_storage = NULL;
951   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
952   dfa->str_tree = NULL;
953   re_free (dfa->org_indices);
954   dfa->org_indices = NULL;
955 }
956 
957 /* Create initial states for all contexts.  */
958 
959 static reg_errcode_t
960 create_initial_state (re_dfa_t *dfa)
961 {
962   int first, i;
963   reg_errcode_t err;
964   re_node_set init_nodes;
965 
966   /* Initial states have the epsilon closure of the node which is
967      the first node of the regular expression.  */
968   first = dfa->str_tree->first->node_idx;
969   dfa->init_node = first;
970   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
971   if (BE (err != REG_NOERROR, 0))
972     return err;
973 
974   /* The back-references which are in initial states can epsilon transit,
975      since in this case all of the subexpressions can be null.
976      Then we add epsilon closures of the nodes which are the next nodes of
977      the back-references.  */
978   if (dfa->nbackref > 0)
979     for (i = 0; i < init_nodes.nelem; ++i)
980       {
981 	int node_idx = init_nodes.elems[i];
982 	re_token_type_t type = dfa->nodes[node_idx].type;
983 
984 	int clexp_idx;
985 	if (type != OP_BACK_REF)
986 	  continue;
987 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
988 	  {
989 	    re_token_t *clexp_node;
990 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
991 	    if (clexp_node->type == OP_CLOSE_SUBEXP
992 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
993 	      break;
994 	  }
995 	if (clexp_idx == init_nodes.nelem)
996 	  continue;
997 
998 	if (type == OP_BACK_REF)
999 	  {
1000 	    int dest_idx = dfa->edests[node_idx].elems[0];
1001 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1002 	      {
1003 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1004 		i = 0;
1005 	      }
1006 	  }
1007       }
1008 
1009   /* It must be the first time to invoke acquire_state.  */
1010   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1011   /* We don't check ERR here, since the initial state must not be NULL.  */
1012   if (BE (dfa->init_state == NULL, 0))
1013     return err;
1014   if (dfa->init_state->has_constraint)
1015     {
1016       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1017 						       CONTEXT_WORD);
1018       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1019 						     CONTEXT_NEWLINE);
1020       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1021 							 &init_nodes,
1022 							 CONTEXT_NEWLINE
1023 							 | CONTEXT_BEGBUF);
1024       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1025 	      || dfa->init_state_begbuf == NULL, 0))
1026 	return err;
1027     }
1028   else
1029     dfa->init_state_word = dfa->init_state_nl
1030       = dfa->init_state_begbuf = dfa->init_state;
1031 
1032   re_node_set_free (&init_nodes);
1033   return REG_NOERROR;
1034 }
1035 
1036 #ifdef RE_ENABLE_I18N
1037 /* If it is possible to do searching in single byte encoding instead of UTF-8
1038    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1039    DFA nodes where needed.  */
1040 
1041 static void
1042 optimize_utf8 (re_dfa_t *dfa)
1043 {
1044   int node, i, mb_chars = 0, has_period = 0;
1045 
1046   for (node = 0; node < dfa->nodes_len; ++node)
1047     switch (dfa->nodes[node].type)
1048       {
1049       case CHARACTER:
1050 	if (dfa->nodes[node].opr.c >= 0x80)
1051 	  mb_chars = 1;
1052 	break;
1053       case ANCHOR:
1054 	switch (dfa->nodes[node].opr.ctx_type)
1055 	  {
1056 	  case LINE_FIRST:
1057 	  case LINE_LAST:
1058 	  case BUF_FIRST:
1059 	  case BUF_LAST:
1060 	    break;
1061 	  default:
1062 	    /* Word anchors etc. cannot be handled.  It's okay to test
1063 	       opr.ctx_type since constraints (for all DFA nodes) are
1064 	       created by ORing one or more opr.ctx_type values.  */
1065 	    return;
1066 	  }
1067 	break;
1068       case OP_PERIOD:
1069         has_period = 1;
1070         break;
1071       case OP_BACK_REF:
1072       case OP_ALT:
1073       case END_OF_RE:
1074       case OP_DUP_ASTERISK:
1075       case OP_OPEN_SUBEXP:
1076       case OP_CLOSE_SUBEXP:
1077 	break;
1078       case COMPLEX_BRACKET:
1079 	return;
1080       case SIMPLE_BRACKET:
1081 	/* Just double check.  The non-ASCII range starts at 0x80.  */
1082 	assert (0x80 % BITSET_WORD_BITS == 0);
1083         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1084 	  if (dfa->nodes[node].opr.sbcset[i])
1085 	    return;
1086 	break;
1087       default:
1088 	abort ();
1089       }
1090 
1091   if (mb_chars || has_period)
1092     for (node = 0; node < dfa->nodes_len; ++node)
1093       {
1094 	if (dfa->nodes[node].type == CHARACTER
1095 	    && dfa->nodes[node].opr.c >= 0x80)
1096 	  dfa->nodes[node].mb_partial = 0;
1097 	else if (dfa->nodes[node].type == OP_PERIOD)
1098 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1099       }
1100 
1101   /* The search can be in single byte locale.  */
1102   dfa->mb_cur_max = 1;
1103   dfa->is_utf8 = 0;
1104   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1105 }
1106 #endif
1107 
1108 /* Analyze the structure tree, and calculate "first", "next", "edest",
1109    "eclosure", and "inveclosure".  */
1110 
1111 static reg_errcode_t
1112 analyze (regex_t *preg)
1113 {
1114   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1115   reg_errcode_t ret;
1116 
1117   /* Allocate arrays.  */
1118   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1119   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1120   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1121   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1122   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1123 	  || dfa->eclosures == NULL, 0))
1124     return REG_ESPACE;
1125 
1126   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1127   if (dfa->subexp_map != NULL)
1128     {
1129       int i;
1130       for (i = 0; i < preg->re_nsub; i++)
1131 	dfa->subexp_map[i] = i;
1132       preorder (dfa->str_tree, optimize_subexps, dfa);
1133       for (i = 0; i < preg->re_nsub; i++)
1134 	if (dfa->subexp_map[i] != i)
1135 	  break;
1136       if (i == preg->re_nsub)
1137 	{
1138 	  free (dfa->subexp_map);
1139 	  dfa->subexp_map = NULL;
1140 	}
1141     }
1142 
1143   ret = postorder (dfa->str_tree, lower_subexps, preg);
1144   if (BE (ret != REG_NOERROR, 0))
1145     return ret;
1146   ret = postorder (dfa->str_tree, calc_first, dfa);
1147   if (BE (ret != REG_NOERROR, 0))
1148     return ret;
1149   preorder (dfa->str_tree, calc_next, dfa);
1150   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1151   if (BE (ret != REG_NOERROR, 0))
1152     return ret;
1153   ret = calc_eclosure (dfa);
1154   if (BE (ret != REG_NOERROR, 0))
1155     return ret;
1156 
1157   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1158      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1159   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1160       || dfa->nbackref)
1161     {
1162       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1163       if (BE (dfa->inveclosures == NULL, 0))
1164         return REG_ESPACE;
1165       ret = calc_inveclosure (dfa);
1166     }
1167 
1168   return ret;
1169 }
1170 
1171 /* Our parse trees are very unbalanced, so we cannot use a stack to
1172    implement parse tree visits.  Instead, we use parent pointers and
1173    some hairy code in these two functions.  */
1174 static reg_errcode_t
1175 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1176 	   void *extra)
1177 {
1178   bin_tree_t *node, *prev;
1179 
1180   for (node = root; ; )
1181     {
1182       /* Descend down the tree, preferably to the left (or to the right
1183 	 if that's the only child).  */
1184       while (node->left || node->right)
1185 	if (node->left)
1186           node = node->left;
1187         else
1188           node = node->right;
1189 
1190       do
1191 	{
1192 	  reg_errcode_t err = fn (extra, node);
1193 	  if (BE (err != REG_NOERROR, 0))
1194 	    return err;
1195           if (node->parent == NULL)
1196 	    return REG_NOERROR;
1197 	  prev = node;
1198 	  node = node->parent;
1199 	}
1200       /* Go up while we have a node that is reached from the right.  */
1201       while (node->right == prev || node->right == NULL);
1202       node = node->right;
1203     }
1204 }
1205 
1206 static reg_errcode_t
1207 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1208 	  void *extra)
1209 {
1210   bin_tree_t *node;
1211 
1212   for (node = root; ; )
1213     {
1214       reg_errcode_t err = fn (extra, node);
1215       if (BE (err != REG_NOERROR, 0))
1216 	return err;
1217 
1218       /* Go to the left node, or up and to the right.  */
1219       if (node->left)
1220 	node = node->left;
1221       else
1222 	{
1223 	  bin_tree_t *prev = NULL;
1224 	  while (node->right == prev || node->right == NULL)
1225 	    {
1226 	      prev = node;
1227 	      node = node->parent;
1228 	      if (!node)
1229 	        return REG_NOERROR;
1230 	    }
1231 	  node = node->right;
1232 	}
1233     }
1234 }
1235 
1236 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1237    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1238    backreferences as well.  Requires a preorder visit.  */
1239 static reg_errcode_t
1240 optimize_subexps (void *extra, bin_tree_t *node)
1241 {
1242   re_dfa_t *dfa = (re_dfa_t *) extra;
1243 
1244   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1245     {
1246       int idx = node->token.opr.idx;
1247       node->token.opr.idx = dfa->subexp_map[idx];
1248       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1249     }
1250 
1251   else if (node->token.type == SUBEXP
1252            && node->left && node->left->token.type == SUBEXP)
1253     {
1254       int other_idx = node->left->token.opr.idx;
1255 
1256       node->left = node->left->left;
1257       if (node->left)
1258         node->left->parent = node;
1259 
1260       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1261       if (other_idx < BITSET_WORD_BITS)
1262 	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1263     }
1264 
1265   return REG_NOERROR;
1266 }
1267 
1268 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1269    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1270 static reg_errcode_t
1271 lower_subexps (void *extra, bin_tree_t *node)
1272 {
1273   regex_t *preg = (regex_t *) extra;
1274   reg_errcode_t err = REG_NOERROR;
1275 
1276   if (node->left && node->left->token.type == SUBEXP)
1277     {
1278       node->left = lower_subexp (&err, preg, node->left);
1279       if (node->left)
1280 	node->left->parent = node;
1281     }
1282   if (node->right && node->right->token.type == SUBEXP)
1283     {
1284       node->right = lower_subexp (&err, preg, node->right);
1285       if (node->right)
1286 	node->right->parent = node;
1287     }
1288 
1289   return err;
1290 }
1291 
1292 static bin_tree_t *
1293 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1294 {
1295   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1296   bin_tree_t *body = node->left;
1297   bin_tree_t *op, *cls, *tree1, *tree;
1298 
1299   if (preg->no_sub
1300       /* We do not optimize empty subexpressions, because otherwise we may
1301 	 have bad CONCAT nodes with NULL children.  This is obviously not
1302 	 very common, so we do not lose much.  An example that triggers
1303 	 this case is the sed "script" /\(\)/x.  */
1304       && node->left != NULL
1305       && (node->token.opr.idx >= BITSET_WORD_BITS
1306 	  || !(dfa->used_bkref_map
1307 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1308     return node->left;
1309 
1310   /* Convert the SUBEXP node to the concatenation of an
1311      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1312   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1313   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1314   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1315   tree = create_tree (dfa, op, tree1, CONCAT);
1316   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1317     {
1318       *err = REG_ESPACE;
1319       return NULL;
1320     }
1321 
1322   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1323   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1324   return tree;
1325 }
1326 
1327 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1328    nodes.  Requires a postorder visit.  */
1329 static reg_errcode_t
1330 calc_first (void *extra, bin_tree_t *node)
1331 {
1332   re_dfa_t *dfa = (re_dfa_t *) extra;
1333   if (node->token.type == CONCAT)
1334     {
1335       node->first = node->left->first;
1336       node->node_idx = node->left->node_idx;
1337     }
1338   else
1339     {
1340       node->first = node;
1341       node->node_idx = re_dfa_add_node (dfa, node->token);
1342       if (BE (node->node_idx == -1, 0))
1343         return REG_ESPACE;
1344       if (node->token.type == ANCHOR)
1345         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1346     }
1347   return REG_NOERROR;
1348 }
1349 
1350 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1351 static reg_errcode_t
1352 calc_next (void *extra, bin_tree_t *node)
1353 {
1354   switch (node->token.type)
1355     {
1356     case OP_DUP_ASTERISK:
1357       node->left->next = node;
1358       break;
1359     case CONCAT:
1360       node->left->next = node->right->first;
1361       node->right->next = node->next;
1362       break;
1363     default:
1364       if (node->left)
1365 	node->left->next = node->next;
1366       if (node->right)
1367         node->right->next = node->next;
1368       break;
1369     }
1370   return REG_NOERROR;
1371 }
1372 
1373 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1374 static reg_errcode_t
1375 link_nfa_nodes (void *extra, bin_tree_t *node)
1376 {
1377   re_dfa_t *dfa = (re_dfa_t *) extra;
1378   int idx = node->node_idx;
1379   reg_errcode_t err = REG_NOERROR;
1380 
1381   switch (node->token.type)
1382     {
1383     case CONCAT:
1384       break;
1385 
1386     case END_OF_RE:
1387       assert (node->next == NULL);
1388       break;
1389 
1390     case OP_DUP_ASTERISK:
1391     case OP_ALT:
1392       {
1393 	int left, right;
1394 	dfa->has_plural_match = 1;
1395 	if (node->left != NULL)
1396 	  left = node->left->first->node_idx;
1397 	else
1398 	  left = node->next->node_idx;
1399 	if (node->right != NULL)
1400 	  right = node->right->first->node_idx;
1401 	else
1402 	  right = node->next->node_idx;
1403 	assert (left > -1);
1404 	assert (right > -1);
1405 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1406       }
1407       break;
1408 
1409     case ANCHOR:
1410     case OP_OPEN_SUBEXP:
1411     case OP_CLOSE_SUBEXP:
1412       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1413       break;
1414 
1415     case OP_BACK_REF:
1416       dfa->nexts[idx] = node->next->node_idx;
1417       if (node->token.type == OP_BACK_REF)
1418 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1419       break;
1420 
1421     default:
1422       assert (!IS_EPSILON_NODE (node->token.type));
1423       dfa->nexts[idx] = node->next->node_idx;
1424       break;
1425     }
1426 
1427   return err;
1428 }
1429 
1430 /* Duplicate the epsilon closure of the node ROOT_NODE.
1431    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1432    to their own constraint.  */
1433 
1434 static reg_errcode_t
1435 internal_function
1436 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1437 			int root_node, unsigned int init_constraint)
1438 {
1439   int org_node, clone_node, ret;
1440   unsigned int constraint = init_constraint;
1441   for (org_node = top_org_node, clone_node = top_clone_node;;)
1442     {
1443       int org_dest, clone_dest;
1444       if (dfa->nodes[org_node].type == OP_BACK_REF)
1445 	{
1446 	  /* If the back reference epsilon-transit, its destination must
1447 	     also have the constraint.  Then duplicate the epsilon closure
1448 	     of the destination of the back reference, and store it in
1449 	     edests of the back reference.  */
1450 	  org_dest = dfa->nexts[org_node];
1451 	  re_node_set_empty (dfa->edests + clone_node);
1452 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1453 	  if (BE (clone_dest == -1, 0))
1454 	    return REG_ESPACE;
1455 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1456 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457 	  if (BE (ret < 0, 0))
1458 	    return REG_ESPACE;
1459 	}
1460       else if (dfa->edests[org_node].nelem == 0)
1461 	{
1462 	  /* In case of the node can't epsilon-transit, don't duplicate the
1463 	     destination and store the original destination as the
1464 	     destination of the node.  */
1465 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1466 	  break;
1467 	}
1468       else if (dfa->edests[org_node].nelem == 1)
1469 	{
1470 	  /* In case of the node can epsilon-transit, and it has only one
1471 	     destination.  */
1472 	  org_dest = dfa->edests[org_node].elems[0];
1473 	  re_node_set_empty (dfa->edests + clone_node);
1474 	  /* If the node is root_node itself, it means the epsilon clsoure
1475 	     has a loop.   Then tie it to the destination of the root_node.  */
1476 	  if (org_node == root_node && clone_node != org_node)
1477 	    {
1478 	      ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1479 	      if (BE (ret < 0, 0))
1480 		return REG_ESPACE;
1481 	      break;
1482 	    }
1483 	  /* In case of the node has another constraint, add it.  */
1484 	  constraint |= dfa->nodes[org_node].constraint;
1485 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1486 	  if (BE (clone_dest == -1, 0))
1487 	    return REG_ESPACE;
1488 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489 	  if (BE (ret < 0, 0))
1490 	    return REG_ESPACE;
1491 	}
1492       else /* dfa->edests[org_node].nelem == 2 */
1493 	{
1494 	  /* In case of the node can epsilon-transit, and it has two
1495 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1496 	  org_dest = dfa->edests[org_node].elems[0];
1497 	  re_node_set_empty (dfa->edests + clone_node);
1498 	  /* Search for a duplicated node which satisfies the constraint.  */
1499 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1500 	  if (clone_dest == -1)
1501 	    {
1502 	      /* There is no such duplicated node, create a new one.  */
1503 	      reg_errcode_t err;
1504 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1505 	      if (BE (clone_dest == -1, 0))
1506 		return REG_ESPACE;
1507 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1508 	      if (BE (ret < 0, 0))
1509 		return REG_ESPACE;
1510 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1511 					    root_node, constraint);
1512 	      if (BE (err != REG_NOERROR, 0))
1513 		return err;
1514 	    }
1515 	  else
1516 	    {
1517 	      /* There is a duplicated node which satisfies the constraint,
1518 		 use it to avoid infinite loop.  */
1519 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520 	      if (BE (ret < 0, 0))
1521 		return REG_ESPACE;
1522 	    }
1523 
1524 	  org_dest = dfa->edests[org_node].elems[1];
1525 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1526 	  if (BE (clone_dest == -1, 0))
1527 	    return REG_ESPACE;
1528 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 	  if (BE (ret < 0, 0))
1530 	    return REG_ESPACE;
1531 	}
1532       org_node = org_dest;
1533       clone_node = clone_dest;
1534     }
1535   return REG_NOERROR;
1536 }
1537 
1538 /* Search for a node which is duplicated from the node ORG_NODE, and
1539    satisfies the constraint CONSTRAINT.  */
1540 
1541 static int
1542 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1543 			unsigned int constraint)
1544 {
1545   int idx;
1546   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1547     {
1548       if (org_node == dfa->org_indices[idx]
1549 	  && constraint == dfa->nodes[idx].constraint)
1550 	return idx; /* Found.  */
1551     }
1552   return -1; /* Not found.  */
1553 }
1554 
1555 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1556    Return the index of the new node, or -1 if insufficient storage is
1557    available.  */
1558 
1559 static int
1560 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1561 {
1562   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1563   if (BE (dup_idx != -1, 1))
1564     {
1565       dfa->nodes[dup_idx].constraint = constraint;
1566       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1567       dfa->nodes[dup_idx].duplicated = 1;
1568 
1569       /* Store the index of the original node.  */
1570       dfa->org_indices[dup_idx] = org_idx;
1571     }
1572   return dup_idx;
1573 }
1574 
1575 static reg_errcode_t
1576 calc_inveclosure (re_dfa_t *dfa)
1577 {
1578   int src, idx, ret;
1579   for (idx = 0; idx < dfa->nodes_len; ++idx)
1580     re_node_set_init_empty (dfa->inveclosures + idx);
1581 
1582   for (src = 0; src < dfa->nodes_len; ++src)
1583     {
1584       int *elems = dfa->eclosures[src].elems;
1585       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1586 	{
1587 	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1588 	  if (BE (ret == -1, 0))
1589 	    return REG_ESPACE;
1590 	}
1591     }
1592 
1593   return REG_NOERROR;
1594 }
1595 
1596 /* Calculate "eclosure" for all the node in DFA.  */
1597 
1598 static reg_errcode_t
1599 calc_eclosure (re_dfa_t *dfa)
1600 {
1601   int node_idx, incomplete;
1602 #ifdef DEBUG
1603   assert (dfa->nodes_len > 0);
1604 #endif
1605   incomplete = 0;
1606   /* For each nodes, calculate epsilon closure.  */
1607   for (node_idx = 0; ; ++node_idx)
1608     {
1609       reg_errcode_t err;
1610       re_node_set eclosure_elem;
1611       if (node_idx == dfa->nodes_len)
1612 	{
1613 	  if (!incomplete)
1614 	    break;
1615 	  incomplete = 0;
1616 	  node_idx = 0;
1617 	}
1618 
1619 #ifdef DEBUG
1620       assert (dfa->eclosures[node_idx].nelem != -1);
1621 #endif
1622 
1623       /* If we have already calculated, skip it.  */
1624       if (dfa->eclosures[node_idx].nelem != 0)
1625 	continue;
1626       /* Calculate epsilon closure of `node_idx'.  */
1627       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1628       if (BE (err != REG_NOERROR, 0))
1629 	return err;
1630 
1631       if (dfa->eclosures[node_idx].nelem == 0)
1632 	{
1633 	  incomplete = 1;
1634 	  re_node_set_free (&eclosure_elem);
1635 	}
1636     }
1637   return REG_NOERROR;
1638 }
1639 
1640 /* Calculate epsilon closure of NODE.  */
1641 
1642 static reg_errcode_t
1643 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1644 {
1645   reg_errcode_t err;
1646   int i, incomplete;
1647   re_node_set eclosure;
1648   incomplete = 0;
1649   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1650   if (BE (err != REG_NOERROR, 0))
1651     return err;
1652 
1653   /* This indicates that we are calculating this node now.
1654      We reference this value to avoid infinite loop.  */
1655   dfa->eclosures[node].nelem = -1;
1656 
1657   /* If the current node has constraints, duplicate all nodes
1658      since they must inherit the constraints.  */
1659   if (dfa->nodes[node].constraint
1660       && dfa->edests[node].nelem
1661       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1662     {
1663       err = duplicate_node_closure (dfa, node, node, node,
1664 				    dfa->nodes[node].constraint);
1665       if (BE (err != REG_NOERROR, 0))
1666 	return err;
1667     }
1668 
1669   /* Expand each epsilon destination nodes.  */
1670   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1671     for (i = 0; i < dfa->edests[node].nelem; ++i)
1672       {
1673 	re_node_set eclosure_elem;
1674 	int edest = dfa->edests[node].elems[i];
1675 	/* If calculating the epsilon closure of `edest' is in progress,
1676 	   return intermediate result.  */
1677 	if (dfa->eclosures[edest].nelem == -1)
1678 	  {
1679 	    incomplete = 1;
1680 	    continue;
1681 	  }
1682 	/* If we haven't calculated the epsilon closure of `edest' yet,
1683 	   calculate now. Otherwise use calculated epsilon closure.  */
1684 	if (dfa->eclosures[edest].nelem == 0)
1685 	  {
1686 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1687 	    if (BE (err != REG_NOERROR, 0))
1688 	      return err;
1689 	  }
1690 	else
1691 	  eclosure_elem = dfa->eclosures[edest];
1692 	/* Merge the epsilon closure of `edest'.  */
1693 	re_node_set_merge (&eclosure, &eclosure_elem);
1694 	/* If the epsilon closure of `edest' is incomplete,
1695 	   the epsilon closure of this node is also incomplete.  */
1696 	if (dfa->eclosures[edest].nelem == 0)
1697 	  {
1698 	    incomplete = 1;
1699 	    re_node_set_free (&eclosure_elem);
1700 	  }
1701       }
1702 
1703   /* Epsilon closures include itself.  */
1704   re_node_set_insert (&eclosure, node);
1705   if (incomplete && !root)
1706     dfa->eclosures[node].nelem = 0;
1707   else
1708     dfa->eclosures[node] = eclosure;
1709   *new_set = eclosure;
1710   return REG_NOERROR;
1711 }
1712 
1713 /* Functions for token which are used in the parser.  */
1714 
1715 /* Fetch a token from INPUT.
1716    We must not use this function inside bracket expressions.  */
1717 
1718 static void
1719 internal_function
1720 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1721 {
1722   re_string_skip_bytes (input, peek_token (result, input, syntax));
1723 }
1724 
1725 /* Peek a token from INPUT, and return the length of the token.
1726    We must not use this function inside bracket expressions.  */
1727 
1728 static int
1729 internal_function
1730 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1731 {
1732   unsigned char c;
1733 
1734   if (re_string_eoi (input))
1735     {
1736       token->type = END_OF_RE;
1737       return 0;
1738     }
1739 
1740   c = re_string_peek_byte (input, 0);
1741   token->opr.c = c;
1742 
1743   token->word_char = 0;
1744 #ifdef RE_ENABLE_I18N
1745   token->mb_partial = 0;
1746   if (input->mb_cur_max > 1 &&
1747       !re_string_first_byte (input, re_string_cur_idx (input)))
1748     {
1749       token->type = CHARACTER;
1750       token->mb_partial = 1;
1751       return 1;
1752     }
1753 #endif
1754   if (c == '\\')
1755     {
1756       unsigned char c2;
1757       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1758 	{
1759 	  token->type = BACK_SLASH;
1760 	  return 1;
1761 	}
1762 
1763       c2 = re_string_peek_byte_case (input, 1);
1764       token->opr.c = c2;
1765       token->type = CHARACTER;
1766 #ifdef RE_ENABLE_I18N
1767       if (input->mb_cur_max > 1)
1768 	{
1769 	  wint_t wc = re_string_wchar_at (input,
1770 					  re_string_cur_idx (input) + 1);
1771 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1772 	}
1773       else
1774 #endif
1775 	token->word_char = IS_WORD_CHAR (c2) != 0;
1776 
1777       switch (c2)
1778 	{
1779 	case '|':
1780 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1781 	    token->type = OP_ALT;
1782 	  break;
1783 	case '1': case '2': case '3': case '4': case '5':
1784 	case '6': case '7': case '8': case '9':
1785 	  if (!(syntax & RE_NO_BK_REFS))
1786 	    {
1787 	      token->type = OP_BACK_REF;
1788 	      token->opr.idx = c2 - '1';
1789 	    }
1790 	  break;
1791 	case '<':
1792 	  if (!(syntax & RE_NO_GNU_OPS))
1793 	    {
1794 	      token->type = ANCHOR;
1795 	      token->opr.ctx_type = WORD_FIRST;
1796 	    }
1797 	  break;
1798 	case '>':
1799 	  if (!(syntax & RE_NO_GNU_OPS))
1800 	    {
1801 	      token->type = ANCHOR;
1802 	      token->opr.ctx_type = WORD_LAST;
1803 	    }
1804 	  break;
1805 	case 'b':
1806 	  if (!(syntax & RE_NO_GNU_OPS))
1807 	    {
1808 	      token->type = ANCHOR;
1809 	      token->opr.ctx_type = WORD_DELIM;
1810 	    }
1811 	  break;
1812 	case 'B':
1813 	  if (!(syntax & RE_NO_GNU_OPS))
1814 	    {
1815 	      token->type = ANCHOR;
1816 	      token->opr.ctx_type = NOT_WORD_DELIM;
1817 	    }
1818 	  break;
1819 	case 'w':
1820 	  if (!(syntax & RE_NO_GNU_OPS))
1821 	    token->type = OP_WORD;
1822 	  break;
1823 	case 'W':
1824 	  if (!(syntax & RE_NO_GNU_OPS))
1825 	    token->type = OP_NOTWORD;
1826 	  break;
1827 	case 's':
1828 	  if (!(syntax & RE_NO_GNU_OPS))
1829 	    token->type = OP_SPACE;
1830 	  break;
1831 	case 'S':
1832 	  if (!(syntax & RE_NO_GNU_OPS))
1833 	    token->type = OP_NOTSPACE;
1834 	  break;
1835 	case '`':
1836 	  if (!(syntax & RE_NO_GNU_OPS))
1837 	    {
1838 	      token->type = ANCHOR;
1839 	      token->opr.ctx_type = BUF_FIRST;
1840 	    }
1841 	  break;
1842 	case '\'':
1843 	  if (!(syntax & RE_NO_GNU_OPS))
1844 	    {
1845 	      token->type = ANCHOR;
1846 	      token->opr.ctx_type = BUF_LAST;
1847 	    }
1848 	  break;
1849 	case '(':
1850 	  if (!(syntax & RE_NO_BK_PARENS))
1851 	    token->type = OP_OPEN_SUBEXP;
1852 	  break;
1853 	case ')':
1854 	  if (!(syntax & RE_NO_BK_PARENS))
1855 	    token->type = OP_CLOSE_SUBEXP;
1856 	  break;
1857 	case '+':
1858 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1859 	    token->type = OP_DUP_PLUS;
1860 	  break;
1861 	case '?':
1862 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1863 	    token->type = OP_DUP_QUESTION;
1864 	  break;
1865 	case '{':
1866 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1867 	    token->type = OP_OPEN_DUP_NUM;
1868 	  break;
1869 	case '}':
1870 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1871 	    token->type = OP_CLOSE_DUP_NUM;
1872 	  break;
1873 	default:
1874 	  break;
1875 	}
1876       return 2;
1877     }
1878 
1879   token->type = CHARACTER;
1880 #ifdef RE_ENABLE_I18N
1881   if (input->mb_cur_max > 1)
1882     {
1883       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1884       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1885     }
1886   else
1887 #endif
1888     token->word_char = IS_WORD_CHAR (token->opr.c);
1889 
1890   switch (c)
1891     {
1892     case '\n':
1893       if (syntax & RE_NEWLINE_ALT)
1894 	token->type = OP_ALT;
1895       break;
1896     case '|':
1897       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1898 	token->type = OP_ALT;
1899       break;
1900     case '*':
1901       token->type = OP_DUP_ASTERISK;
1902       break;
1903     case '+':
1904       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1905 	token->type = OP_DUP_PLUS;
1906       break;
1907     case '?':
1908       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1909 	token->type = OP_DUP_QUESTION;
1910       break;
1911     case '{':
1912       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1913 	token->type = OP_OPEN_DUP_NUM;
1914       break;
1915     case '}':
1916       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1917 	token->type = OP_CLOSE_DUP_NUM;
1918       break;
1919     case '(':
1920       if (syntax & RE_NO_BK_PARENS)
1921 	token->type = OP_OPEN_SUBEXP;
1922       break;
1923     case ')':
1924       if (syntax & RE_NO_BK_PARENS)
1925 	token->type = OP_CLOSE_SUBEXP;
1926       break;
1927     case '[':
1928       token->type = OP_OPEN_BRACKET;
1929       break;
1930     case '.':
1931       token->type = OP_PERIOD;
1932       break;
1933     case '^':
1934       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1935 	  re_string_cur_idx (input) != 0)
1936 	{
1937 	  char prev = re_string_peek_byte (input, -1);
1938 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1939 	    break;
1940 	}
1941       token->type = ANCHOR;
1942       token->opr.ctx_type = LINE_FIRST;
1943       break;
1944     case '$':
1945       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1946 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1947 	{
1948 	  re_token_t next;
1949 	  re_string_skip_bytes (input, 1);
1950 	  peek_token (&next, input, syntax);
1951 	  re_string_skip_bytes (input, -1);
1952 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1953 	    break;
1954 	}
1955       token->type = ANCHOR;
1956       token->opr.ctx_type = LINE_LAST;
1957       break;
1958     default:
1959       break;
1960     }
1961   return 1;
1962 }
1963 
1964 /* Peek a token from INPUT, and return the length of the token.
1965    We must not use this function out of bracket expressions.  */
1966 
1967 static int
1968 internal_function
1969 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1970 {
1971   unsigned char c;
1972   if (re_string_eoi (input))
1973     {
1974       token->type = END_OF_RE;
1975       return 0;
1976     }
1977   c = re_string_peek_byte (input, 0);
1978   token->opr.c = c;
1979 
1980 #ifdef RE_ENABLE_I18N
1981   if (input->mb_cur_max > 1 &&
1982       !re_string_first_byte (input, re_string_cur_idx (input)))
1983     {
1984       token->type = CHARACTER;
1985       return 1;
1986     }
1987 #endif /* RE_ENABLE_I18N */
1988 
1989   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1990       && re_string_cur_idx (input) + 1 < re_string_length (input))
1991     {
1992       /* In this case, '\' escape a character.  */
1993       unsigned char c2;
1994       re_string_skip_bytes (input, 1);
1995       c2 = re_string_peek_byte (input, 0);
1996       token->opr.c = c2;
1997       token->type = CHARACTER;
1998       return 1;
1999     }
2000   if (c == '[') /* '[' is a special char in a bracket exps.  */
2001     {
2002       unsigned char c2;
2003       int token_len;
2004       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2005 	c2 = re_string_peek_byte (input, 1);
2006       else
2007 	c2 = 0;
2008       token->opr.c = c2;
2009       token_len = 2;
2010       switch (c2)
2011 	{
2012 	case '.':
2013 	  token->type = OP_OPEN_COLL_ELEM;
2014 	  break;
2015 	case '=':
2016 	  token->type = OP_OPEN_EQUIV_CLASS;
2017 	  break;
2018 	case ':':
2019 	  if (syntax & RE_CHAR_CLASSES)
2020 	    {
2021 	      token->type = OP_OPEN_CHAR_CLASS;
2022 	      break;
2023 	    }
2024 	  /* else fall through.  */
2025 	default:
2026 	  token->type = CHARACTER;
2027 	  token->opr.c = c;
2028 	  token_len = 1;
2029 	  break;
2030 	}
2031       return token_len;
2032     }
2033   switch (c)
2034     {
2035     case '-':
2036       token->type = OP_CHARSET_RANGE;
2037       break;
2038     case ']':
2039       token->type = OP_CLOSE_BRACKET;
2040       break;
2041     case '^':
2042       token->type = OP_NON_MATCH_LIST;
2043       break;
2044     default:
2045       token->type = CHARACTER;
2046     }
2047   return 1;
2048 }
2049 
2050 /* Functions for parser.  */
2051 
2052 /* Entry point of the parser.
2053    Parse the regular expression REGEXP and return the structure tree.
2054    If an error is occured, ERR is set by error code, and return NULL.
2055    This function build the following tree, from regular expression <reg_exp>:
2056 	   CAT
2057 	   / \
2058 	  /   \
2059    <reg_exp>  EOR
2060 
2061    CAT means concatenation.
2062    EOR means end of regular expression.  */
2063 
2064 static bin_tree_t *
2065 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2066        reg_errcode_t *err)
2067 {
2068   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2069   bin_tree_t *tree, *eor, *root;
2070   re_token_t current_token;
2071   dfa->syntax = syntax;
2072   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2073   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2074   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2075     return NULL;
2076   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2077   if (tree != NULL)
2078     root = create_tree (dfa, tree, eor, CONCAT);
2079   else
2080     root = eor;
2081   if (BE (eor == NULL || root == NULL, 0))
2082     {
2083       *err = REG_ESPACE;
2084       return NULL;
2085     }
2086   return root;
2087 }
2088 
2089 /* This function build the following tree, from regular expression
2090    <branch1>|<branch2>:
2091 	   ALT
2092 	   / \
2093 	  /   \
2094    <branch1> <branch2>
2095 
2096    ALT means alternative, which represents the operator `|'.  */
2097 
2098 static bin_tree_t *
2099 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2100 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2101 {
2102   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103   bin_tree_t *tree, *branch = NULL;
2104   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2105   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106     return NULL;
2107 
2108   while (token->type == OP_ALT)
2109     {
2110       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2111       if (token->type != OP_ALT && token->type != END_OF_RE
2112 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2113 	{
2114 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2115 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2116 	    return NULL;
2117 	}
2118       else
2119 	branch = NULL;
2120       tree = create_tree (dfa, tree, branch, OP_ALT);
2121       if (BE (tree == NULL, 0))
2122 	{
2123 	  *err = REG_ESPACE;
2124 	  return NULL;
2125 	}
2126     }
2127   return tree;
2128 }
2129 
2130 /* This function build the following tree, from regular expression
2131    <exp1><exp2>:
2132 	CAT
2133 	/ \
2134        /   \
2135    <exp1> <exp2>
2136 
2137    CAT means concatenation.  */
2138 
2139 static bin_tree_t *
2140 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141 	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2142 {
2143   bin_tree_t *tree, *exp;
2144   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2146   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147     return NULL;
2148 
2149   while (token->type != OP_ALT && token->type != END_OF_RE
2150 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2151     {
2152       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2153       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2154 	{
2155 	  return NULL;
2156 	}
2157       if (tree != NULL && exp != NULL)
2158 	{
2159 	  tree = create_tree (dfa, tree, exp, CONCAT);
2160 	  if (tree == NULL)
2161 	    {
2162 	      *err = REG_ESPACE;
2163 	      return NULL;
2164 	    }
2165 	}
2166       else if (tree == NULL)
2167 	tree = exp;
2168       /* Otherwise exp == NULL, we don't need to create new tree.  */
2169     }
2170   return tree;
2171 }
2172 
2173 /* This function build the following tree, from regular expression a*:
2174 	 *
2175 	 |
2176 	 a
2177 */
2178 
2179 static bin_tree_t *
2180 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181 		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2182 {
2183   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184   bin_tree_t *tree;
2185   switch (token->type)
2186     {
2187     case CHARACTER:
2188       tree = create_token_tree (dfa, NULL, NULL, token);
2189       if (BE (tree == NULL, 0))
2190 	{
2191 	  *err = REG_ESPACE;
2192 	  return NULL;
2193 	}
2194 #ifdef RE_ENABLE_I18N
2195       if (dfa->mb_cur_max > 1)
2196 	{
2197 	  while (!re_string_eoi (regexp)
2198 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2199 	    {
2200 	      bin_tree_t *mbc_remain;
2201 	      fetch_token (token, regexp, syntax);
2202 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2203 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2204 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2205 		{
2206 		  *err = REG_ESPACE;
2207 		  return NULL;
2208 		}
2209 	    }
2210 	}
2211 #endif
2212       break;
2213     case OP_OPEN_SUBEXP:
2214       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2215       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2216 	return NULL;
2217       break;
2218     case OP_OPEN_BRACKET:
2219       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2220       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221 	return NULL;
2222       break;
2223     case OP_BACK_REF:
2224       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2225 	{
2226 	  *err = REG_ESUBREG;
2227 	  return NULL;
2228 	}
2229       dfa->used_bkref_map |= 1 << token->opr.idx;
2230       tree = create_token_tree (dfa, NULL, NULL, token);
2231       if (BE (tree == NULL, 0))
2232 	{
2233 	  *err = REG_ESPACE;
2234 	  return NULL;
2235 	}
2236       ++dfa->nbackref;
2237       dfa->has_mb_node = 1;
2238       break;
2239     case OP_OPEN_DUP_NUM:
2240       if (syntax & RE_CONTEXT_INVALID_DUP)
2241 	{
2242 	  *err = REG_BADRPT;
2243 	  return NULL;
2244 	}
2245       /* FALLTHROUGH */
2246     case OP_DUP_ASTERISK:
2247     case OP_DUP_PLUS:
2248     case OP_DUP_QUESTION:
2249       if (syntax & RE_CONTEXT_INVALID_OPS)
2250 	{
2251 	  *err = REG_BADRPT;
2252 	  return NULL;
2253 	}
2254       else if (syntax & RE_CONTEXT_INDEP_OPS)
2255 	{
2256 	  fetch_token (token, regexp, syntax);
2257 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2258 	}
2259       /* else fall through  */
2260     case OP_CLOSE_SUBEXP:
2261       if ((token->type == OP_CLOSE_SUBEXP) &&
2262 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2263 	{
2264 	  *err = REG_ERPAREN;
2265 	  return NULL;
2266 	}
2267       /* else fall through  */
2268     case OP_CLOSE_DUP_NUM:
2269       /* We treat it as a normal character.  */
2270 
2271       /* Then we can these characters as normal characters.  */
2272       token->type = CHARACTER;
2273       /* mb_partial and word_char bits should be initialized already
2274 	 by peek_token.  */
2275       tree = create_token_tree (dfa, NULL, NULL, token);
2276       if (BE (tree == NULL, 0))
2277 	{
2278 	  *err = REG_ESPACE;
2279 	  return NULL;
2280 	}
2281       break;
2282     case ANCHOR:
2283       if ((token->opr.ctx_type
2284 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2285 	  && dfa->word_ops_used == 0)
2286 	init_word_char (dfa);
2287       if (token->opr.ctx_type == WORD_DELIM
2288           || token->opr.ctx_type == NOT_WORD_DELIM)
2289 	{
2290 	  bin_tree_t *tree_first, *tree_last;
2291 	  if (token->opr.ctx_type == WORD_DELIM)
2292 	    {
2293 	      token->opr.ctx_type = WORD_FIRST;
2294 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2295 	      token->opr.ctx_type = WORD_LAST;
2296             }
2297           else
2298             {
2299 	      token->opr.ctx_type = INSIDE_WORD;
2300 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2301 	      token->opr.ctx_type = INSIDE_NOTWORD;
2302             }
2303 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2304 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2305 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2306 	    {
2307 	      *err = REG_ESPACE;
2308 	      return NULL;
2309 	    }
2310 	}
2311       else
2312 	{
2313 	  tree = create_token_tree (dfa, NULL, NULL, token);
2314 	  if (BE (tree == NULL, 0))
2315 	    {
2316 	      *err = REG_ESPACE;
2317 	      return NULL;
2318 	    }
2319 	}
2320       /* We must return here, since ANCHORs can't be followed
2321 	 by repetition operators.
2322 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2323 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2324       fetch_token (token, regexp, syntax);
2325       return tree;
2326     case OP_PERIOD:
2327       tree = create_token_tree (dfa, NULL, NULL, token);
2328       if (BE (tree == NULL, 0))
2329 	{
2330 	  *err = REG_ESPACE;
2331 	  return NULL;
2332 	}
2333       if (dfa->mb_cur_max > 1)
2334 	dfa->has_mb_node = 1;
2335       break;
2336     case OP_WORD:
2337     case OP_NOTWORD:
2338       tree = build_charclass_op (dfa, regexp->trans,
2339 				 (const unsigned char *) "alnum",
2340 				 (const unsigned char *) "_",
2341 				 token->type == OP_NOTWORD, err);
2342       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2343 	return NULL;
2344       break;
2345     case OP_SPACE:
2346     case OP_NOTSPACE:
2347       tree = build_charclass_op (dfa, regexp->trans,
2348 				 (const unsigned char *) "space",
2349 				 (const unsigned char *) "",
2350 				 token->type == OP_NOTSPACE, err);
2351       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2352 	return NULL;
2353       break;
2354     case OP_ALT:
2355     case END_OF_RE:
2356       return NULL;
2357     case BACK_SLASH:
2358       *err = REG_EESCAPE;
2359       return NULL;
2360     default:
2361       /* Must not happen?  */
2362 #ifdef DEBUG
2363       assert (0);
2364 #endif
2365       return NULL;
2366     }
2367   fetch_token (token, regexp, syntax);
2368 
2369   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2370 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2371     {
2372       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2373       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2374 	return NULL;
2375       /* In BRE consecutive duplications are not allowed.  */
2376       if ((syntax & RE_CONTEXT_INVALID_DUP)
2377 	  && (token->type == OP_DUP_ASTERISK
2378 	      || token->type == OP_OPEN_DUP_NUM))
2379 	{
2380 	  *err = REG_BADRPT;
2381 	  return NULL;
2382 	}
2383     }
2384 
2385   return tree;
2386 }
2387 
2388 /* This function build the following tree, from regular expression
2389    (<reg_exp>):
2390 	 SUBEXP
2391 	    |
2392 	<reg_exp>
2393 */
2394 
2395 static bin_tree_t *
2396 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2397 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2398 {
2399   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2400   bin_tree_t *tree;
2401   size_t cur_nsub;
2402   cur_nsub = preg->re_nsub++;
2403 
2404   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2405 
2406   /* The subexpression may be a null string.  */
2407   if (token->type == OP_CLOSE_SUBEXP)
2408     tree = NULL;
2409   else
2410     {
2411       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2412       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2413         *err = REG_EPAREN;
2414       if (BE (*err != REG_NOERROR, 0))
2415 	return NULL;
2416     }
2417 
2418   if (cur_nsub <= '9' - '1')
2419     dfa->completed_bkref_map |= 1 << cur_nsub;
2420 
2421   tree = create_tree (dfa, tree, NULL, SUBEXP);
2422   if (BE (tree == NULL, 0))
2423     {
2424       *err = REG_ESPACE;
2425       return NULL;
2426     }
2427   tree->token.opr.idx = cur_nsub;
2428   return tree;
2429 }
2430 
2431 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2432 
2433 static bin_tree_t *
2434 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2435 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2436 {
2437   bin_tree_t *tree = NULL, *old_tree = NULL;
2438   int i, start, end, start_idx = re_string_cur_idx (regexp);
2439   re_token_t start_token = *token;
2440 
2441   if (token->type == OP_OPEN_DUP_NUM)
2442     {
2443       end = 0;
2444       start = fetch_number (regexp, token, syntax);
2445       if (start == -1)
2446 	{
2447 	  if (token->type == CHARACTER && token->opr.c == ',')
2448 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2449 	  else
2450 	    {
2451 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2452 	      return NULL;
2453 	    }
2454 	}
2455       if (BE (start != -2, 1))
2456 	{
2457 	  /* We treat "{n}" as "{n,n}".  */
2458 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2459 		 : ((token->type == CHARACTER && token->opr.c == ',')
2460 		    ? fetch_number (regexp, token, syntax) : -2));
2461 	}
2462       if (BE (start == -2 || end == -2, 0))
2463 	{
2464 	  /* Invalid sequence.  */
2465 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2466 	    {
2467 	      if (token->type == END_OF_RE)
2468 		*err = REG_EBRACE;
2469 	      else
2470 		*err = REG_BADBR;
2471 
2472 	      return NULL;
2473 	    }
2474 
2475 	  /* If the syntax bit is set, rollback.  */
2476 	  re_string_set_index (regexp, start_idx);
2477 	  *token = start_token;
2478 	  token->type = CHARACTER;
2479 	  /* mb_partial and word_char bits should be already initialized by
2480 	     peek_token.  */
2481 	  return elem;
2482 	}
2483 
2484       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2485 	{
2486 	  /* First number greater than second.  */
2487 	  *err = REG_BADBR;
2488 	  return NULL;
2489 	}
2490     }
2491   else
2492     {
2493       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2494       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2495     }
2496 
2497   fetch_token (token, regexp, syntax);
2498 
2499   if (BE (elem == NULL, 0))
2500     return NULL;
2501   if (BE (start == 0 && end == 0, 0))
2502     {
2503       postorder (elem, free_tree, NULL);
2504       return NULL;
2505     }
2506 
2507   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2508   if (BE (start > 0, 0))
2509     {
2510       tree = elem;
2511       for (i = 2; i <= start; ++i)
2512 	{
2513 	  elem = duplicate_tree (elem, dfa);
2514 	  tree = create_tree (dfa, tree, elem, CONCAT);
2515 	  if (BE (elem == NULL || tree == NULL, 0))
2516 	    goto parse_dup_op_espace;
2517 	}
2518 
2519       if (start == end)
2520 	return tree;
2521 
2522       /* Duplicate ELEM before it is marked optional.  */
2523       elem = duplicate_tree (elem, dfa);
2524       old_tree = tree;
2525     }
2526   else
2527     old_tree = NULL;
2528 
2529   if (elem->token.type == SUBEXP)
2530     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2531 
2532   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2533   if (BE (tree == NULL, 0))
2534     goto parse_dup_op_espace;
2535 
2536   /* This loop is actually executed only when end != -1,
2537      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2538      already created the start+1-th copy.  */
2539   for (i = start + 2; i <= end; ++i)
2540     {
2541       elem = duplicate_tree (elem, dfa);
2542       tree = create_tree (dfa, tree, elem, CONCAT);
2543       if (BE (elem == NULL || tree == NULL, 0))
2544         goto parse_dup_op_espace;
2545 
2546       tree = create_tree (dfa, tree, NULL, OP_ALT);
2547       if (BE (tree == NULL, 0))
2548         goto parse_dup_op_espace;
2549     }
2550 
2551   if (old_tree)
2552     tree = create_tree (dfa, old_tree, tree, CONCAT);
2553 
2554   return tree;
2555 
2556  parse_dup_op_espace:
2557   *err = REG_ESPACE;
2558   return NULL;
2559 }
2560 
2561 /* Size of the names for collating symbol/equivalence_class/character_class.
2562    I'm not sure, but maybe enough.  */
2563 #define BRACKET_NAME_BUF_SIZE 32
2564 
2565 #ifndef _LIBC
2566   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2567      Build the range expression which starts from START_ELEM, and ends
2568      at END_ELEM.  The result are written to MBCSET and SBCSET.
2569      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2570      mbcset->range_ends, is a pointer argument sinse we may
2571      update it.  */
2572 
2573 static reg_errcode_t
2574 internal_function
2575 # ifdef RE_ENABLE_I18N
2576 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2577 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2578 # else /* not RE_ENABLE_I18N */
2579 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2580 		 bracket_elem_t *end_elem)
2581 # endif /* not RE_ENABLE_I18N */
2582 {
2583   unsigned int start_ch, end_ch;
2584   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2585   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2586 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2587 	  0))
2588     return REG_ERANGE;
2589 
2590   /* We can handle no multi character collating elements without libc
2591      support.  */
2592   if (BE ((start_elem->type == COLL_SYM
2593 	   && strlen ((char *) start_elem->opr.name) > 1)
2594 	  || (end_elem->type == COLL_SYM
2595 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2596     return REG_ECOLLATE;
2597 
2598 # ifdef RE_ENABLE_I18N
2599   {
2600     wchar_t wc;
2601     wint_t start_wc;
2602     wint_t end_wc;
2603     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2604 
2605     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2606 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2607 		   : 0));
2608     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2609 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2610 		 : 0));
2611     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2612 		? __btowc (start_ch) : start_elem->opr.wch);
2613     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2614 	      ? __btowc (end_ch) : end_elem->opr.wch);
2615     if (start_wc == WEOF || end_wc == WEOF)
2616       return REG_ECOLLATE;
2617     cmp_buf[0] = start_wc;
2618     cmp_buf[4] = end_wc;
2619     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2620       return REG_ERANGE;
2621 
2622     /* Got valid collation sequence values, add them as a new entry.
2623        However, for !_LIBC we have no collation elements: if the
2624        character set is single byte, the single byte character set
2625        that we build below suffices.  parse_bracket_exp passes
2626        no MBCSET if dfa->mb_cur_max == 1.  */
2627     if (mbcset)
2628       {
2629         /* Check the space of the arrays.  */
2630         if (BE (*range_alloc == mbcset->nranges, 0))
2631           {
2632 	    /* There is not enough space, need realloc.  */
2633 	    wchar_t *new_array_start, *new_array_end;
2634 	    int new_nranges;
2635 
2636 	    /* +1 in case of mbcset->nranges is 0.  */
2637 	    new_nranges = 2 * mbcset->nranges + 1;
2638 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2639 	       are NULL if *range_alloc == 0.  */
2640 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2641 				          new_nranges);
2642 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2643 				        new_nranges);
2644 
2645 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2646 	      return REG_ESPACE;
2647 
2648 	    mbcset->range_starts = new_array_start;
2649 	    mbcset->range_ends = new_array_end;
2650 	    *range_alloc = new_nranges;
2651           }
2652 
2653         mbcset->range_starts[mbcset->nranges] = start_wc;
2654         mbcset->range_ends[mbcset->nranges++] = end_wc;
2655       }
2656 
2657     /* Build the table for single byte characters.  */
2658     for (wc = 0; wc < SBC_MAX; ++wc)
2659       {
2660 	cmp_buf[2] = wc;
2661 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2662 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2663 	  bitset_set (sbcset, wc);
2664       }
2665   }
2666 # else /* not RE_ENABLE_I18N */
2667   {
2668     unsigned int ch;
2669     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2670 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2671 		   : 0));
2672     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2673 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2674 		 : 0));
2675     if (start_ch > end_ch)
2676       return REG_ERANGE;
2677     /* Build the table for single byte characters.  */
2678     for (ch = 0; ch < SBC_MAX; ++ch)
2679       if (start_ch <= ch  && ch <= end_ch)
2680 	bitset_set (sbcset, ch);
2681   }
2682 # endif /* not RE_ENABLE_I18N */
2683   return REG_NOERROR;
2684 }
2685 #endif /* not _LIBC */
2686 
2687 #ifndef _LIBC
2688 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2689    Build the collating element which is represented by NAME.
2690    The result are written to MBCSET and SBCSET.
2691    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2692    pointer argument since we may update it.  */
2693 
2694 static reg_errcode_t
2695 internal_function
2696 # ifdef RE_ENABLE_I18N
2697 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2698 			int *coll_sym_alloc, const unsigned char *name)
2699 # else /* not RE_ENABLE_I18N */
2700 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2701 # endif /* not RE_ENABLE_I18N */
2702 {
2703   size_t name_len = strlen ((const char *) name);
2704   if (BE (name_len != 1, 0))
2705     return REG_ECOLLATE;
2706   else
2707     {
2708       bitset_set (sbcset, name[0]);
2709       return REG_NOERROR;
2710     }
2711 }
2712 #endif /* not _LIBC */
2713 
2714 /* This function parse bracket expression like "[abc]", "[a-c]",
2715    "[[.a-a.]]" etc.  */
2716 
2717 static bin_tree_t *
2718 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2719 		   reg_syntax_t syntax, reg_errcode_t *err)
2720 {
2721 #ifdef _LIBC
2722   const unsigned char *collseqmb;
2723   const char *collseqwc;
2724   uint32_t nrules;
2725   int32_t table_size;
2726   const int32_t *symb_table;
2727   const unsigned char *extra;
2728 
2729   /* Local function for parse_bracket_exp used in _LIBC environement.
2730      Seek the collating symbol entry correspondings to NAME.
2731      Return the index of the symbol in the SYMB_TABLE.  */
2732 
2733   auto inline int32_t
2734   __attribute ((always_inline))
2735   seek_collating_symbol_entry (name, name_len)
2736 	 const unsigned char *name;
2737 	 size_t name_len;
2738     {
2739       int32_t hash = elem_hash ((const char *) name, name_len);
2740       int32_t elem = hash % table_size;
2741       if (symb_table[2 * elem] != 0)
2742 	{
2743 	  int32_t second = hash % (table_size - 2) + 1;
2744 
2745 	  do
2746 	    {
2747 	      /* First compare the hashing value.  */
2748 	      if (symb_table[2 * elem] == hash
2749 		  /* Compare the length of the name.  */
2750 		  && name_len == extra[symb_table[2 * elem + 1]]
2751 		  /* Compare the name.  */
2752 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2753 			     name_len) == 0)
2754 		{
2755 		  /* Yep, this is the entry.  */
2756 		  break;
2757 		}
2758 
2759 	      /* Next entry.  */
2760 	      elem += second;
2761 	    }
2762 	  while (symb_table[2 * elem] != 0);
2763 	}
2764       return elem;
2765     }
2766 
2767   /* Local function for parse_bracket_exp used in _LIBC environment.
2768      Look up the collation sequence value of BR_ELEM.
2769      Return the value if succeeded, UINT_MAX otherwise.  */
2770 
2771   auto inline unsigned int
2772   __attribute ((always_inline))
2773   lookup_collation_sequence_value (br_elem)
2774 	 bracket_elem_t *br_elem;
2775     {
2776       if (br_elem->type == SB_CHAR)
2777 	{
2778 	  /*
2779 	  if (MB_CUR_MAX == 1)
2780 	  */
2781 	  if (nrules == 0)
2782 	    return collseqmb[br_elem->opr.ch];
2783 	  else
2784 	    {
2785 	      wint_t wc = __btowc (br_elem->opr.ch);
2786 	      return __collseq_table_lookup (collseqwc, wc);
2787 	    }
2788 	}
2789       else if (br_elem->type == MB_CHAR)
2790 	{
2791 	  if (nrules != 0)
2792 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2793 	}
2794       else if (br_elem->type == COLL_SYM)
2795 	{
2796 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2797 	  if (nrules != 0)
2798 	    {
2799 	      int32_t elem, idx;
2800 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2801 						  sym_name_len);
2802 	      if (symb_table[2 * elem] != 0)
2803 		{
2804 		  /* We found the entry.  */
2805 		  idx = symb_table[2 * elem + 1];
2806 		  /* Skip the name of collating element name.  */
2807 		  idx += 1 + extra[idx];
2808 		  /* Skip the byte sequence of the collating element.  */
2809 		  idx += 1 + extra[idx];
2810 		  /* Adjust for the alignment.  */
2811 		  idx = (idx + 3) & ~3;
2812 		  /* Skip the multibyte collation sequence value.  */
2813 		  idx += sizeof (unsigned int);
2814 		  /* Skip the wide char sequence of the collating element.  */
2815 		  idx += sizeof (unsigned int) *
2816 		    (1 + *(unsigned int *) (extra + idx));
2817 		  /* Return the collation sequence value.  */
2818 		  return *(unsigned int *) (extra + idx);
2819 		}
2820 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2821 		{
2822 		  /* No valid character.  Match it as a single byte
2823 		     character.  */
2824 		  return collseqmb[br_elem->opr.name[0]];
2825 		}
2826 	    }
2827 	  else if (sym_name_len == 1)
2828 	    return collseqmb[br_elem->opr.name[0]];
2829 	}
2830       return UINT_MAX;
2831     }
2832 
2833   /* Local function for parse_bracket_exp used in _LIBC environement.
2834      Build the range expression which starts from START_ELEM, and ends
2835      at END_ELEM.  The result are written to MBCSET and SBCSET.
2836      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2837      mbcset->range_ends, is a pointer argument sinse we may
2838      update it.  */
2839 
2840   auto inline reg_errcode_t
2841   __attribute ((always_inline))
2842   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2843 	 re_charset_t *mbcset;
2844 	 int *range_alloc;
2845 	 bitset_t sbcset;
2846 	 bracket_elem_t *start_elem, *end_elem;
2847     {
2848       unsigned int ch;
2849       uint32_t start_collseq;
2850       uint32_t end_collseq;
2851 
2852       /* Equivalence Classes and Character Classes can't be a range
2853 	 start/end.  */
2854       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2855 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2856 	      0))
2857 	return REG_ERANGE;
2858 
2859       start_collseq = lookup_collation_sequence_value (start_elem);
2860       end_collseq = lookup_collation_sequence_value (end_elem);
2861       /* Check start/end collation sequence values.  */
2862       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2863 	return REG_ECOLLATE;
2864       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2865 	return REG_ERANGE;
2866 
2867       /* Got valid collation sequence values, add them as a new entry.
2868 	 However, if we have no collation elements, and the character set
2869 	 is single byte, the single byte character set that we
2870 	 build below suffices. */
2871       if (nrules > 0 || dfa->mb_cur_max > 1)
2872 	{
2873           /* Check the space of the arrays.  */
2874           if (BE (*range_alloc == mbcset->nranges, 0))
2875 	    {
2876 	      /* There is not enough space, need realloc.  */
2877 	      uint32_t *new_array_start;
2878 	      uint32_t *new_array_end;
2879 	      int new_nranges;
2880 
2881 	      /* +1 in case of mbcset->nranges is 0.  */
2882 	      new_nranges = 2 * mbcset->nranges + 1;
2883 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2884 					    new_nranges);
2885 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2886 				          new_nranges);
2887 
2888 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2889 	        return REG_ESPACE;
2890 
2891 	      mbcset->range_starts = new_array_start;
2892 	      mbcset->range_ends = new_array_end;
2893 	      *range_alloc = new_nranges;
2894 	    }
2895 
2896           mbcset->range_starts[mbcset->nranges] = start_collseq;
2897           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2898 	}
2899 
2900       /* Build the table for single byte characters.  */
2901       for (ch = 0; ch < SBC_MAX; ch++)
2902 	{
2903 	  uint32_t ch_collseq;
2904 	  /*
2905 	  if (MB_CUR_MAX == 1)
2906 	  */
2907 	  if (nrules == 0)
2908 	    ch_collseq = collseqmb[ch];
2909 	  else
2910 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2911 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2912 	    bitset_set (sbcset, ch);
2913 	}
2914       return REG_NOERROR;
2915     }
2916 
2917   /* Local function for parse_bracket_exp used in _LIBC environement.
2918      Build the collating element which is represented by NAME.
2919      The result are written to MBCSET and SBCSET.
2920      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2921      pointer argument sinse we may update it.  */
2922 
2923   auto inline reg_errcode_t
2924   __attribute ((always_inline))
2925   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2926 	 re_charset_t *mbcset;
2927 	 int *coll_sym_alloc;
2928 	 bitset_t sbcset;
2929 	 const unsigned char *name;
2930     {
2931       int32_t elem, idx;
2932       size_t name_len = strlen ((const char *) name);
2933       if (nrules != 0)
2934 	{
2935 	  elem = seek_collating_symbol_entry (name, name_len);
2936 	  if (symb_table[2 * elem] != 0)
2937 	    {
2938 	      /* We found the entry.  */
2939 	      idx = symb_table[2 * elem + 1];
2940 	      /* Skip the name of collating element name.  */
2941 	      idx += 1 + extra[idx];
2942 	    }
2943 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2944 	    {
2945 	      /* No valid character, treat it as a normal
2946 		 character.  */
2947 	      bitset_set (sbcset, name[0]);
2948 	      return REG_NOERROR;
2949 	    }
2950 	  else
2951 	    return REG_ECOLLATE;
2952 
2953 	  /* Got valid collation sequence, add it as a new entry.  */
2954 	  /* Check the space of the arrays.  */
2955 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2956 	    {
2957 	      /* Not enough, realloc it.  */
2958 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2959 	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2960 	      /* Use realloc since mbcset->coll_syms is NULL
2961 		 if *alloc == 0.  */
2962 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2963 						   new_coll_sym_alloc);
2964 	      if (BE (new_coll_syms == NULL, 0))
2965 		return REG_ESPACE;
2966 	      mbcset->coll_syms = new_coll_syms;
2967 	      *coll_sym_alloc = new_coll_sym_alloc;
2968 	    }
2969 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2970 	  return REG_NOERROR;
2971 	}
2972       else
2973 	{
2974 	  if (BE (name_len != 1, 0))
2975 	    return REG_ECOLLATE;
2976 	  else
2977 	    {
2978 	      bitset_set (sbcset, name[0]);
2979 	      return REG_NOERROR;
2980 	    }
2981 	}
2982     }
2983 #endif
2984 
2985   re_token_t br_token;
2986   re_bitset_ptr_t sbcset;
2987 #ifdef RE_ENABLE_I18N
2988   re_charset_t *mbcset;
2989   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2990   int equiv_class_alloc = 0, char_class_alloc = 0;
2991 #endif /* not RE_ENABLE_I18N */
2992   int non_match = 0;
2993   bin_tree_t *work_tree;
2994   int token_len;
2995   int first_round = 1;
2996 #ifdef _LIBC
2997   collseqmb = (const unsigned char *)
2998     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2999   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3000   if (nrules)
3001     {
3002       /*
3003       if (MB_CUR_MAX > 1)
3004       */
3005       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3006       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3007       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3008 						  _NL_COLLATE_SYMB_TABLEMB);
3009       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3010 						   _NL_COLLATE_SYMB_EXTRAMB);
3011     }
3012 #endif
3013   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3014 #ifdef RE_ENABLE_I18N
3015   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3016 #endif /* RE_ENABLE_I18N */
3017 #ifdef RE_ENABLE_I18N
3018   if (BE (sbcset == NULL || mbcset == NULL, 0))
3019 #else
3020   if (BE (sbcset == NULL, 0))
3021 #endif /* RE_ENABLE_I18N */
3022     {
3023       *err = REG_ESPACE;
3024       return NULL;
3025     }
3026 
3027   token_len = peek_token_bracket (token, regexp, syntax);
3028   if (BE (token->type == END_OF_RE, 0))
3029     {
3030       *err = REG_BADPAT;
3031       goto parse_bracket_exp_free_return;
3032     }
3033   if (token->type == OP_NON_MATCH_LIST)
3034     {
3035 #ifdef RE_ENABLE_I18N
3036       mbcset->non_match = 1;
3037 #endif /* not RE_ENABLE_I18N */
3038       non_match = 1;
3039       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3040 	bitset_set (sbcset, '\n');
3041       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3042       token_len = peek_token_bracket (token, regexp, syntax);
3043       if (BE (token->type == END_OF_RE, 0))
3044 	{
3045 	  *err = REG_BADPAT;
3046 	  goto parse_bracket_exp_free_return;
3047 	}
3048     }
3049 
3050   /* We treat the first ']' as a normal character.  */
3051   if (token->type == OP_CLOSE_BRACKET)
3052     token->type = CHARACTER;
3053 
3054   while (1)
3055     {
3056       bracket_elem_t start_elem, end_elem;
3057       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3058       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3059       reg_errcode_t ret;
3060       int token_len2 = 0, is_range_exp = 0;
3061       re_token_t token2;
3062 
3063       start_elem.opr.name = start_name_buf;
3064       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3065 				   syntax, first_round);
3066       if (BE (ret != REG_NOERROR, 0))
3067 	{
3068 	  *err = ret;
3069 	  goto parse_bracket_exp_free_return;
3070 	}
3071       first_round = 0;
3072 
3073       /* Get information about the next token.  We need it in any case.  */
3074       token_len = peek_token_bracket (token, regexp, syntax);
3075 
3076       /* Do not check for ranges if we know they are not allowed.  */
3077       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3078 	{
3079 	  if (BE (token->type == END_OF_RE, 0))
3080 	    {
3081 	      *err = REG_EBRACK;
3082 	      goto parse_bracket_exp_free_return;
3083 	    }
3084 	  if (token->type == OP_CHARSET_RANGE)
3085 	    {
3086 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3087 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3088 	      if (BE (token2.type == END_OF_RE, 0))
3089 		{
3090 		  *err = REG_EBRACK;
3091 		  goto parse_bracket_exp_free_return;
3092 		}
3093 	      if (token2.type == OP_CLOSE_BRACKET)
3094 		{
3095 		  /* We treat the last '-' as a normal character.  */
3096 		  re_string_skip_bytes (regexp, -token_len);
3097 		  token->type = CHARACTER;
3098 		}
3099 	      else
3100 		is_range_exp = 1;
3101 	    }
3102 	}
3103 
3104       if (is_range_exp == 1)
3105 	{
3106 	  end_elem.opr.name = end_name_buf;
3107 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3108 				       dfa, syntax, 1);
3109 	  if (BE (ret != REG_NOERROR, 0))
3110 	    {
3111 	      *err = ret;
3112 	      goto parse_bracket_exp_free_return;
3113 	    }
3114 
3115 	  token_len = peek_token_bracket (token, regexp, syntax);
3116 
3117 #ifdef _LIBC
3118 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3119 				  &start_elem, &end_elem);
3120 #else
3121 # ifdef RE_ENABLE_I18N
3122 	  *err = build_range_exp (sbcset,
3123 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3124 				  &range_alloc, &start_elem, &end_elem);
3125 # else
3126 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3127 # endif
3128 #endif /* RE_ENABLE_I18N */
3129 	  if (BE (*err != REG_NOERROR, 0))
3130 	    goto parse_bracket_exp_free_return;
3131 	}
3132       else
3133 	{
3134 	  switch (start_elem.type)
3135 	    {
3136 	    case SB_CHAR:
3137 	      bitset_set (sbcset, start_elem.opr.ch);
3138 	      break;
3139 #ifdef RE_ENABLE_I18N
3140 	    case MB_CHAR:
3141 	      /* Check whether the array has enough space.  */
3142 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3143 		{
3144 		  wchar_t *new_mbchars;
3145 		  /* Not enough, realloc it.  */
3146 		  /* +1 in case of mbcset->nmbchars is 0.  */
3147 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3148 		  /* Use realloc since array is NULL if *alloc == 0.  */
3149 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3150 					    mbchar_alloc);
3151 		  if (BE (new_mbchars == NULL, 0))
3152 		    goto parse_bracket_exp_espace;
3153 		  mbcset->mbchars = new_mbchars;
3154 		}
3155 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3156 	      break;
3157 #endif /* RE_ENABLE_I18N */
3158 	    case EQUIV_CLASS:
3159 	      *err = build_equiv_class (sbcset,
3160 #ifdef RE_ENABLE_I18N
3161 					mbcset, &equiv_class_alloc,
3162 #endif /* RE_ENABLE_I18N */
3163 					start_elem.opr.name);
3164 	      if (BE (*err != REG_NOERROR, 0))
3165 		goto parse_bracket_exp_free_return;
3166 	      break;
3167 	    case COLL_SYM:
3168 	      *err = build_collating_symbol (sbcset,
3169 #ifdef RE_ENABLE_I18N
3170 					     mbcset, &coll_sym_alloc,
3171 #endif /* RE_ENABLE_I18N */
3172 					     start_elem.opr.name);
3173 	      if (BE (*err != REG_NOERROR, 0))
3174 		goto parse_bracket_exp_free_return;
3175 	      break;
3176 	    case CHAR_CLASS:
3177 	      *err = build_charclass (regexp->trans, sbcset,
3178 #ifdef RE_ENABLE_I18N
3179 				      mbcset, &char_class_alloc,
3180 #endif /* RE_ENABLE_I18N */
3181 				      start_elem.opr.name, syntax);
3182 	      if (BE (*err != REG_NOERROR, 0))
3183 	       goto parse_bracket_exp_free_return;
3184 	      break;
3185 	    default:
3186 	      assert (0);
3187 	      break;
3188 	    }
3189 	}
3190       if (BE (token->type == END_OF_RE, 0))
3191 	{
3192 	  *err = REG_EBRACK;
3193 	  goto parse_bracket_exp_free_return;
3194 	}
3195       if (token->type == OP_CLOSE_BRACKET)
3196 	break;
3197     }
3198 
3199   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3200 
3201   /* If it is non-matching list.  */
3202   if (non_match)
3203     bitset_not (sbcset);
3204 
3205 #ifdef RE_ENABLE_I18N
3206   /* Ensure only single byte characters are set.  */
3207   if (dfa->mb_cur_max > 1)
3208     bitset_mask (sbcset, dfa->sb_char);
3209 
3210   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3211       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3212 						     || mbcset->non_match)))
3213     {
3214       bin_tree_t *mbc_tree;
3215       int sbc_idx;
3216       /* Build a tree for complex bracket.  */
3217       dfa->has_mb_node = 1;
3218       br_token.type = COMPLEX_BRACKET;
3219       br_token.opr.mbcset = mbcset;
3220       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3221       if (BE (mbc_tree == NULL, 0))
3222 	goto parse_bracket_exp_espace;
3223       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3224 	if (sbcset[sbc_idx])
3225 	  break;
3226       /* If there are no bits set in sbcset, there is no point
3227 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3228       if (sbc_idx < BITSET_WORDS)
3229 	{
3230           /* Build a tree for simple bracket.  */
3231           br_token.type = SIMPLE_BRACKET;
3232           br_token.opr.sbcset = sbcset;
3233           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3234           if (BE (work_tree == NULL, 0))
3235             goto parse_bracket_exp_espace;
3236 
3237           /* Then join them by ALT node.  */
3238           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3239           if (BE (work_tree == NULL, 0))
3240             goto parse_bracket_exp_espace;
3241 	}
3242       else
3243 	{
3244 	  re_free (sbcset);
3245 	  work_tree = mbc_tree;
3246 	}
3247     }
3248   else
3249 #endif /* not RE_ENABLE_I18N */
3250     {
3251 #ifdef RE_ENABLE_I18N
3252       free_charset (mbcset);
3253 #endif
3254       /* Build a tree for simple bracket.  */
3255       br_token.type = SIMPLE_BRACKET;
3256       br_token.opr.sbcset = sbcset;
3257       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3258       if (BE (work_tree == NULL, 0))
3259         goto parse_bracket_exp_espace;
3260     }
3261   return work_tree;
3262 
3263  parse_bracket_exp_espace:
3264   *err = REG_ESPACE;
3265  parse_bracket_exp_free_return:
3266   re_free (sbcset);
3267 #ifdef RE_ENABLE_I18N
3268   free_charset (mbcset);
3269 #endif /* RE_ENABLE_I18N */
3270   return NULL;
3271 }
3272 
3273 /* Parse an element in the bracket expression.  */
3274 
3275 static reg_errcode_t
3276 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3277 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3278 		       reg_syntax_t syntax, int accept_hyphen)
3279 {
3280 #ifdef RE_ENABLE_I18N
3281   int cur_char_size;
3282   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3283   if (cur_char_size > 1)
3284     {
3285       elem->type = MB_CHAR;
3286       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3287       re_string_skip_bytes (regexp, cur_char_size);
3288       return REG_NOERROR;
3289     }
3290 #endif /* RE_ENABLE_I18N */
3291   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3292   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3293       || token->type == OP_OPEN_EQUIV_CLASS)
3294     return parse_bracket_symbol (elem, regexp, token);
3295   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3296     {
3297       /* A '-' must only appear as anything but a range indicator before
3298 	 the closing bracket.  Everything else is an error.  */
3299       re_token_t token2;
3300       (void) peek_token_bracket (&token2, regexp, syntax);
3301       if (token2.type != OP_CLOSE_BRACKET)
3302 	/* The actual error value is not standardized since this whole
3303 	   case is undefined.  But ERANGE makes good sense.  */
3304 	return REG_ERANGE;
3305     }
3306   elem->type = SB_CHAR;
3307   elem->opr.ch = token->opr.c;
3308   return REG_NOERROR;
3309 }
3310 
3311 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3312    such as [:<character_class>:], [.<collating_element>.], and
3313    [=<equivalent_class>=].  */
3314 
3315 static reg_errcode_t
3316 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3317 		      re_token_t *token)
3318 {
3319   unsigned char ch, delim = token->opr.c;
3320   int i = 0;
3321   if (re_string_eoi(regexp))
3322     return REG_EBRACK;
3323   for (;; ++i)
3324     {
3325       if (i >= BRACKET_NAME_BUF_SIZE)
3326 	return REG_EBRACK;
3327       if (token->type == OP_OPEN_CHAR_CLASS)
3328 	ch = re_string_fetch_byte_case (regexp);
3329       else
3330 	ch = re_string_fetch_byte (regexp);
3331       if (re_string_eoi(regexp))
3332 	return REG_EBRACK;
3333       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3334 	break;
3335       elem->opr.name[i] = ch;
3336     }
3337   re_string_skip_bytes (regexp, 1);
3338   elem->opr.name[i] = '\0';
3339   switch (token->type)
3340     {
3341     case OP_OPEN_COLL_ELEM:
3342       elem->type = COLL_SYM;
3343       break;
3344     case OP_OPEN_EQUIV_CLASS:
3345       elem->type = EQUIV_CLASS;
3346       break;
3347     case OP_OPEN_CHAR_CLASS:
3348       elem->type = CHAR_CLASS;
3349       break;
3350     default:
3351       break;
3352     }
3353   return REG_NOERROR;
3354 }
3355 
3356   /* Helper function for parse_bracket_exp.
3357      Build the equivalence class which is represented by NAME.
3358      The result are written to MBCSET and SBCSET.
3359      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3360      is a pointer argument sinse we may update it.  */
3361 
3362 static reg_errcode_t
3363 #ifdef RE_ENABLE_I18N
3364 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3365 		   int *equiv_class_alloc, const unsigned char *name)
3366 #else /* not RE_ENABLE_I18N */
3367 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3368 #endif /* not RE_ENABLE_I18N */
3369 {
3370 #ifdef _LIBC
3371   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3372   if (nrules != 0)
3373     {
3374       const int32_t *table, *indirect;
3375       const unsigned char *weights, *extra, *cp;
3376       unsigned char char_buf[2];
3377       int32_t idx1, idx2;
3378       unsigned int ch;
3379       size_t len;
3380       /* This #include defines a local function!  */
3381 # include <locale/weight.h>
3382       /* Calculate the index for equivalence class.  */
3383       cp = name;
3384       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3385       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3386 					       _NL_COLLATE_WEIGHTMB);
3387       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3388 						   _NL_COLLATE_EXTRAMB);
3389       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3390 						_NL_COLLATE_INDIRECTMB);
3391       idx1 = findidx (&cp);
3392       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3393 	/* This isn't a valid character.  */
3394 	return REG_ECOLLATE;
3395 
3396       /* Build single byte matcing table for this equivalence class.  */
3397       char_buf[1] = (unsigned char) '\0';
3398       len = weights[idx1 & 0xffffff];
3399       for (ch = 0; ch < SBC_MAX; ++ch)
3400 	{
3401 	  char_buf[0] = ch;
3402 	  cp = char_buf;
3403 	  idx2 = findidx (&cp);
3404 /*
3405 	  idx2 = table[ch];
3406 */
3407 	  if (idx2 == 0)
3408 	    /* This isn't a valid character.  */
3409 	    continue;
3410 	  /* Compare only if the length matches and the collation rule
3411 	     index is the same.  */
3412 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3413 	    {
3414 	      int cnt = 0;
3415 
3416 	      while (cnt <= len &&
3417 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3418 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3419 		++cnt;
3420 
3421 	      if (cnt > len)
3422 		bitset_set (sbcset, ch);
3423 	    }
3424 	}
3425       /* Check whether the array has enough space.  */
3426       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3427 	{
3428 	  /* Not enough, realloc it.  */
3429 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3430 	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3431 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3432 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3433 						   int32_t,
3434 						   new_equiv_class_alloc);
3435 	  if (BE (new_equiv_classes == NULL, 0))
3436 	    return REG_ESPACE;
3437 	  mbcset->equiv_classes = new_equiv_classes;
3438 	  *equiv_class_alloc = new_equiv_class_alloc;
3439 	}
3440       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3441     }
3442   else
3443 #endif /* _LIBC */
3444     {
3445       if (BE (strlen ((const char *) name) != 1, 0))
3446 	return REG_ECOLLATE;
3447       bitset_set (sbcset, *name);
3448     }
3449   return REG_NOERROR;
3450 }
3451 
3452   /* Helper function for parse_bracket_exp.
3453      Build the character class which is represented by NAME.
3454      The result are written to MBCSET and SBCSET.
3455      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3456      is a pointer argument sinse we may update it.  */
3457 
3458 static reg_errcode_t
3459 #ifdef RE_ENABLE_I18N
3460 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3461 		 re_charset_t *mbcset, int *char_class_alloc,
3462 		 const unsigned char *class_name, reg_syntax_t syntax)
3463 #else /* not RE_ENABLE_I18N */
3464 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465 		 const unsigned char *class_name, reg_syntax_t syntax)
3466 #endif /* not RE_ENABLE_I18N */
3467 {
3468   int i;
3469   const char *name = (const char *) class_name;
3470 
3471   /* In case of REG_ICASE "upper" and "lower" match the both of
3472      upper and lower cases.  */
3473   if ((syntax & RE_ICASE)
3474       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475     name = "alpha";
3476 
3477 #ifdef RE_ENABLE_I18N
3478   /* Check the space of the arrays.  */
3479   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3480     {
3481       /* Not enough, realloc it.  */
3482       /* +1 in case of mbcset->nchar_classes is 0.  */
3483       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3484       /* Use realloc since array is NULL if *alloc == 0.  */
3485       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3486 					       new_char_class_alloc);
3487       if (BE (new_char_classes == NULL, 0))
3488 	return REG_ESPACE;
3489       mbcset->char_classes = new_char_classes;
3490       *char_class_alloc = new_char_class_alloc;
3491     }
3492   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3493 #endif /* RE_ENABLE_I18N */
3494 
3495 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3496   do {						\
3497     if (BE (trans != NULL, 0))			\
3498       {						\
3499 	for (i = 0; i < SBC_MAX; ++i)		\
3500   	  if (ctype_func (i))			\
3501 	    bitset_set (sbcset, trans[i]);	\
3502       }						\
3503     else					\
3504       {						\
3505 	for (i = 0; i < SBC_MAX; ++i)		\
3506   	  if (ctype_func (i))			\
3507 	    bitset_set (sbcset, i);		\
3508       }						\
3509   } while (0)
3510 
3511   if (strcmp (name, "alnum") == 0)
3512     BUILD_CHARCLASS_LOOP (isalnum);
3513   else if (strcmp (name, "cntrl") == 0)
3514     BUILD_CHARCLASS_LOOP (iscntrl);
3515   else if (strcmp (name, "lower") == 0)
3516     BUILD_CHARCLASS_LOOP (islower);
3517   else if (strcmp (name, "space") == 0)
3518     BUILD_CHARCLASS_LOOP (isspace);
3519   else if (strcmp (name, "alpha") == 0)
3520     BUILD_CHARCLASS_LOOP (isalpha);
3521   else if (strcmp (name, "digit") == 0)
3522     BUILD_CHARCLASS_LOOP (isdigit);
3523   else if (strcmp (name, "print") == 0)
3524     BUILD_CHARCLASS_LOOP (isprint);
3525   else if (strcmp (name, "upper") == 0)
3526     BUILD_CHARCLASS_LOOP (isupper);
3527   else if (strcmp (name, "blank") == 0)
3528     BUILD_CHARCLASS_LOOP (isblank);
3529   else if (strcmp (name, "graph") == 0)
3530     BUILD_CHARCLASS_LOOP (isgraph);
3531   else if (strcmp (name, "punct") == 0)
3532     BUILD_CHARCLASS_LOOP (ispunct);
3533   else if (strcmp (name, "xdigit") == 0)
3534     BUILD_CHARCLASS_LOOP (isxdigit);
3535   else
3536     return REG_ECTYPE;
3537 
3538   return REG_NOERROR;
3539 }
3540 
3541 static bin_tree_t *
3542 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3543 		    const unsigned char *class_name,
3544 		    const unsigned char *extra, int non_match,
3545 		    reg_errcode_t *err)
3546 {
3547   re_bitset_ptr_t sbcset;
3548 #ifdef RE_ENABLE_I18N
3549   re_charset_t *mbcset;
3550   int alloc = 0;
3551 #endif /* not RE_ENABLE_I18N */
3552   reg_errcode_t ret;
3553   re_token_t br_token;
3554   bin_tree_t *tree;
3555 
3556   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3557 #ifdef RE_ENABLE_I18N
3558   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3559 #endif /* RE_ENABLE_I18N */
3560 
3561 #ifdef RE_ENABLE_I18N
3562   if (BE (sbcset == NULL || mbcset == NULL, 0))
3563 #else /* not RE_ENABLE_I18N */
3564   if (BE (sbcset == NULL, 0))
3565 #endif /* not RE_ENABLE_I18N */
3566     {
3567       *err = REG_ESPACE;
3568       return NULL;
3569     }
3570 
3571   if (non_match)
3572     {
3573 #ifdef RE_ENABLE_I18N
3574       mbcset->non_match = 1;
3575 #endif /* not RE_ENABLE_I18N */
3576     }
3577 
3578   /* We don't care the syntax in this case.  */
3579   ret = build_charclass (trans, sbcset,
3580 #ifdef RE_ENABLE_I18N
3581 			 mbcset, &alloc,
3582 #endif /* RE_ENABLE_I18N */
3583 			 class_name, 0);
3584 
3585   if (BE (ret != REG_NOERROR, 0))
3586     {
3587       re_free (sbcset);
3588 #ifdef RE_ENABLE_I18N
3589       free_charset (mbcset);
3590 #endif /* RE_ENABLE_I18N */
3591       *err = ret;
3592       return NULL;
3593     }
3594   /* \w match '_' also.  */
3595   for (; *extra; extra++)
3596     bitset_set (sbcset, *extra);
3597 
3598   /* If it is non-matching list.  */
3599   if (non_match)
3600     bitset_not (sbcset);
3601 
3602 #ifdef RE_ENABLE_I18N
3603   /* Ensure only single byte characters are set.  */
3604   if (dfa->mb_cur_max > 1)
3605     bitset_mask (sbcset, dfa->sb_char);
3606 #endif
3607 
3608   /* Build a tree for simple bracket.  */
3609   br_token.type = SIMPLE_BRACKET;
3610   br_token.opr.sbcset = sbcset;
3611   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3612   if (BE (tree == NULL, 0))
3613     goto build_word_op_espace;
3614 
3615 #ifdef RE_ENABLE_I18N
3616   if (dfa->mb_cur_max > 1)
3617     {
3618       bin_tree_t *mbc_tree;
3619       /* Build a tree for complex bracket.  */
3620       br_token.type = COMPLEX_BRACKET;
3621       br_token.opr.mbcset = mbcset;
3622       dfa->has_mb_node = 1;
3623       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624       if (BE (mbc_tree == NULL, 0))
3625 	goto build_word_op_espace;
3626       /* Then join them by ALT node.  */
3627       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3628       if (BE (mbc_tree != NULL, 1))
3629 	return tree;
3630     }
3631   else
3632     {
3633       free_charset (mbcset);
3634       return tree;
3635     }
3636 #else /* not RE_ENABLE_I18N */
3637   return tree;
3638 #endif /* not RE_ENABLE_I18N */
3639 
3640  build_word_op_espace:
3641   re_free (sbcset);
3642 #ifdef RE_ENABLE_I18N
3643   free_charset (mbcset);
3644 #endif /* RE_ENABLE_I18N */
3645   *err = REG_ESPACE;
3646   return NULL;
3647 }
3648 
3649 /* This is intended for the expressions like "a{1,3}".
3650    Fetch a number from `input', and return the number.
3651    Return -1, if the number field is empty like "{,1}".
3652    Return -2, If an error is occured.  */
3653 
3654 static int
3655 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3656 {
3657   int num = -1;
3658   unsigned char c;
3659   while (1)
3660     {
3661       fetch_token (token, input, syntax);
3662       c = token->opr.c;
3663       if (BE (token->type == END_OF_RE, 0))
3664 	return -2;
3665       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3666 	break;
3667       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3668 	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3669       num = (num > RE_DUP_MAX) ? -2 : num;
3670     }
3671   return num;
3672 }
3673 
3674 #ifdef RE_ENABLE_I18N
3675 static void
3676 free_charset (re_charset_t *cset)
3677 {
3678   re_free (cset->mbchars);
3679 # ifdef _LIBC
3680   re_free (cset->coll_syms);
3681   re_free (cset->equiv_classes);
3682   re_free (cset->range_starts);
3683   re_free (cset->range_ends);
3684 # endif
3685   re_free (cset->char_classes);
3686   re_free (cset);
3687 }
3688 #endif /* RE_ENABLE_I18N */
3689 
3690 /* Functions for binary tree operation.  */
3691 
3692 /* Create a tree node.  */
3693 
3694 static bin_tree_t *
3695 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3696 	     re_token_type_t type)
3697 {
3698   re_token_t t;
3699   t.type = type;
3700   return create_token_tree (dfa, left, right, &t);
3701 }
3702 
3703 static bin_tree_t *
3704 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3705 		   const re_token_t *token)
3706 {
3707   bin_tree_t *tree;
3708   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3709     {
3710       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3711 
3712       if (storage == NULL)
3713 	return NULL;
3714       storage->next = dfa->str_tree_storage;
3715       dfa->str_tree_storage = storage;
3716       dfa->str_tree_storage_idx = 0;
3717     }
3718   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3719 
3720   tree->parent = NULL;
3721   tree->left = left;
3722   tree->right = right;
3723   tree->token = *token;
3724   tree->token.duplicated = 0;
3725   tree->token.opt_subexp = 0;
3726   tree->first = NULL;
3727   tree->next = NULL;
3728   tree->node_idx = -1;
3729 
3730   if (left != NULL)
3731     left->parent = tree;
3732   if (right != NULL)
3733     right->parent = tree;
3734   return tree;
3735 }
3736 
3737 /* Mark the tree SRC as an optional subexpression.
3738    To be called from preorder or postorder.  */
3739 
3740 static reg_errcode_t
3741 mark_opt_subexp (void *extra, bin_tree_t *node)
3742 {
3743   int idx = (int) (long) extra;
3744   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3745     node->token.opt_subexp = 1;
3746 
3747   return REG_NOERROR;
3748 }
3749 
3750 /* Free the allocated memory inside NODE. */
3751 
3752 static void
3753 free_token (re_token_t *node)
3754 {
3755 #ifdef RE_ENABLE_I18N
3756   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3757     free_charset (node->opr.mbcset);
3758   else
3759 #endif /* RE_ENABLE_I18N */
3760     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3761       re_free (node->opr.sbcset);
3762 }
3763 
3764 /* Worker function for tree walking.  Free the allocated memory inside NODE
3765    and its children. */
3766 
3767 static reg_errcode_t
3768 free_tree (void *extra, bin_tree_t *node)
3769 {
3770   free_token (&node->token);
3771   return REG_NOERROR;
3772 }
3773 
3774 
3775 /* Duplicate the node SRC, and return new node.  This is a preorder
3776    visit similar to the one implemented by the generic visitor, but
3777    we need more infrastructure to maintain two parallel trees --- so,
3778    it's easier to duplicate.  */
3779 
3780 static bin_tree_t *
3781 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3782 {
3783   const bin_tree_t *node;
3784   bin_tree_t *dup_root;
3785   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3786 
3787   for (node = root; ; )
3788     {
3789       /* Create a new tree and link it back to the current parent.  */
3790       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3791       if (*p_new == NULL)
3792 	return NULL;
3793       (*p_new)->parent = dup_node;
3794       (*p_new)->token.duplicated = 1;
3795       dup_node = *p_new;
3796 
3797       /* Go to the left node, or up and to the right.  */
3798       if (node->left)
3799 	{
3800 	  node = node->left;
3801 	  p_new = &dup_node->left;
3802 	}
3803       else
3804 	{
3805 	  const bin_tree_t *prev = NULL;
3806 	  while (node->right == prev || node->right == NULL)
3807 	    {
3808 	      prev = node;
3809 	      node = node->parent;
3810 	      dup_node = dup_node->parent;
3811 	      if (!node)
3812 	        return dup_root;
3813 	    }
3814 	  node = node->right;
3815 	  p_new = &dup_node->right;
3816 	}
3817     }
3818 }
3819