xref: /haiku/src/system/libroot/posix/glibc/regex/regex_internal.c (revision bda66ab7c7bfee38e845cea952a0a632613af99d)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
/* Prototypes for file-local (static) helpers, so that they can be
   called before the point where they are defined.  */

/* String-object setup shared by the re_string_* constructors.  */
20 static void re_string_construct_common (const char *str, Idx len,
21 					re_string_t *pstr,
22 					RE_TRANSLATE_TYPE trans, bool icase,
23 					const re_dfa_t *dfa);
/* DFA state creation helpers.  */
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 					  const re_node_set *nodes,
26 					  re_hashval_t hash);
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  unsigned int context,
30 					  re_hashval_t hash);
/* Buffer (re)allocation and buffer-building helpers for re_string_t.  */
31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32 						Idx new_buf_len);
33 #ifdef RE_ENABLE_I18N
34 static void build_wcs_buffer (re_string_t *pstr);
35 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36 #endif /* RE_ENABLE_I18N */
37 static void build_upper_buffer (re_string_t *pstr);
38 static void re_string_translate_buffer (re_string_t *pstr);
/* Declared pure: the result is promised to depend only on the arguments
   and on memory that the function merely reads.  */
39 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40 					  int eflags) __attribute__ ((pure));
41 
42 /* Functions for string operation.  */
43 
44 /* This function allocate the buffers.  It is necessary to call
45    re_string_reconstruct before using the object.  */
46 
47 static reg_errcode_t
48 __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)49 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
50 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51 {
52   reg_errcode_t ret;
53   Idx init_buf_len;
54 
55   /* Ensure at least one character fits into the buffers.  */
56   if (init_len < dfa->mb_cur_max)
57     init_len = dfa->mb_cur_max;
58   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
59   re_string_construct_common (str, len, pstr, trans, icase, dfa);
60 
61   ret = re_string_realloc_buffers (pstr, init_buf_len);
62   if (BE (ret != REG_NOERROR, 0))
63     return ret;
64 
65   pstr->word_char = dfa->word_char;
66   pstr->word_ops_used = dfa->word_ops_used;
67   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
69   pstr->valid_raw_len = pstr->valid_len;
70   return REG_NOERROR;
71 }
72 
73 /* This function allocate the buffers, and initialize them.  */
74 
75 static reg_errcode_t
76 __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)77 re_string_construct (re_string_t *pstr, const char *str, Idx len,
78 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79 {
80   reg_errcode_t ret;
81   memset (pstr, '\0', sizeof (re_string_t));
82   re_string_construct_common (str, len, pstr, trans, icase, dfa);
83 
84   if (len > 0)
85     {
86       ret = re_string_realloc_buffers (pstr, len + 1);
87       if (BE (ret != REG_NOERROR, 0))
88 	return ret;
89     }
90   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91 
92   if (icase)
93     {
94 #ifdef RE_ENABLE_I18N
95       if (dfa->mb_cur_max > 1)
96 	{
97 	  while (1)
98 	    {
99 	      ret = build_wcs_upper_buffer (pstr);
100 	      if (BE (ret != REG_NOERROR, 0))
101 		return ret;
102 	      if (pstr->valid_raw_len >= len)
103 		break;
104 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105 		break;
106 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107 	      if (BE (ret != REG_NOERROR, 0))
108 		return ret;
109 	    }
110 	}
111       else
112 #endif /* RE_ENABLE_I18N  */
113 	build_upper_buffer (pstr);
114     }
115   else
116     {
117 #ifdef RE_ENABLE_I18N
118       if (dfa->mb_cur_max > 1)
119 	build_wcs_buffer (pstr);
120       else
121 #endif /* RE_ENABLE_I18N  */
122 	{
123 	  if (trans != NULL)
124 	    re_string_translate_buffer (pstr);
125 	  else
126 	    {
127 	      pstr->valid_len = pstr->bufs_len;
128 	      pstr->valid_raw_len = pstr->bufs_len;
129 	    }
130 	}
131     }
132 
133   return REG_NOERROR;
134 }
135 
136 /* Helper functions for re_string_allocate, and re_string_construct.  */
137 
138 static reg_errcode_t
139 __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)140 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141 {
142 #ifdef RE_ENABLE_I18N
143   if (pstr->mb_cur_max > 1)
144     {
145       wint_t *new_wcs;
146 
147       /* Avoid overflow in realloc.  */
148       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
150 	return REG_ESPACE;
151 
152       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
153       if (BE (new_wcs == NULL, 0))
154 	return REG_ESPACE;
155       pstr->wcs = new_wcs;
156       if (pstr->offsets != NULL)
157 	{
158 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
159 	  if (BE (new_offsets == NULL, 0))
160 	    return REG_ESPACE;
161 	  pstr->offsets = new_offsets;
162 	}
163     }
164 #endif /* RE_ENABLE_I18N  */
165   if (pstr->mbs_allocated)
166     {
167       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
168 					   new_buf_len);
169       if (BE (new_mbs == NULL, 0))
170 	return REG_ESPACE;
171       pstr->mbs = new_mbs;
172     }
173   pstr->bufs_len = new_buf_len;
174   return REG_NOERROR;
175 }
176 
177 
178 static void
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)179 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
180 			    RE_TRANSLATE_TYPE trans, bool icase,
181 			    const re_dfa_t *dfa)
182 {
183   pstr->raw_mbs = (const unsigned char *) str;
184   pstr->len = len;
185   pstr->raw_len = len;
186   pstr->trans = trans;
187   pstr->icase = icase;
188   pstr->mbs_allocated = (trans != NULL || icase);
189   pstr->mb_cur_max = dfa->mb_cur_max;
190   pstr->is_utf8 = dfa->is_utf8;
191   pstr->map_notascii = dfa->map_notascii;
192   pstr->stop = pstr->len;
193   pstr->raw_stop = pstr->stop;
194 }
195 
196 #ifdef RE_ENABLE_I18N
197 
198 /* Build wide character buffer PSTR->WCS.
199    If the byte sequence of the string are:
200      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201    Then wide character buffer will be:
202      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
203    We use WEOF for padding, they indicate that the position isn't
204    a first byte of a multibyte character.
205 
206    Note that this function assumes PSTR->VALID_LEN elements are already
207    built and starts from PSTR->VALID_LEN.  */
208 
209 static void
build_wcs_buffer(re_string_t * pstr)210 build_wcs_buffer (re_string_t *pstr)
211 {
212 #ifdef _LIBC
213   unsigned char buf[MB_LEN_MAX];
214   assert (MB_LEN_MAX >= pstr->mb_cur_max);
215 #else
216   unsigned char buf[64];
217 #endif
218   mbstate_t prev_st;
219   Idx byte_idx, end_idx, remain_len;
220   size_t mbclen;
221 
222   /* Build the buffers from pstr->valid_len to either pstr->len or
223      pstr->bufs_len.  */
224   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
225   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
226     {
227       wchar_t wc;
228       const char *p;
229 
230       remain_len = end_idx - byte_idx;
231       prev_st = pstr->cur_state;
232       /* Apply the translation if we need.  */
233       if (BE (pstr->trans != NULL, 0))
234 	{
235 	  int i, ch;
236 
237 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
238 	    {
239 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
240 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
241 	    }
242 	  p = (const char *) buf;
243 	}
244       else
245 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
246       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
247       if (BE (mbclen == (size_t) -1 || mbclen == 0
248 	      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
249 	{
250 	  /* We treat these cases as a singlebyte character.  */
251 	  mbclen = 1;
252 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
253 	  if (BE (pstr->trans != NULL, 0))
254 	    wc = pstr->trans[wc];
255 	  pstr->cur_state = prev_st;
256 	}
257       else if (BE (mbclen == (size_t) -2, 0))
258 	{
259 	  /* The buffer doesn't have enough space, finish to build.  */
260 	  pstr->cur_state = prev_st;
261 	  break;
262 	}
263 
264       /* Write wide character and padding.  */
265       pstr->wcs[byte_idx++] = wc;
266       /* Write paddings.  */
267       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
268 	pstr->wcs[byte_idx++] = WEOF;
269     }
270   pstr->valid_len = byte_idx;
271   pstr->valid_raw_len = byte_idx;
272 }
273 
274 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
275    but for REG_ICASE.  */
276 
/* Case-folding counterpart of build_wcs_buffer.  Starting at
   PSTR->VALID_LEN, store the upper-cased bytes in PSTR->MBS and the
   upper-cased wide characters in PSTR->WCS (WEOF on the non-leading
   bytes of a multibyte character), up to the smaller of PSTR->LEN and
   PSTR->BUFS_LEN.

   When the upper-case form of a character occupies a different number
   of bytes than the original, PSTR->OFFSETS is allocated on demand and
   records, for every buffer position, the index of the raw byte it
   came from; PSTR->LEN (and PSTR->STOP, when PSTR->RAW_STOP lies past
   the character) are adjusted by the difference.

   Return REG_ESPACE if PSTR->OFFSETS cannot be allocated, REG_NOERROR
   otherwise.  */
277 static reg_errcode_t
278 __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)279 build_wcs_upper_buffer (re_string_t *pstr)
280 {
281   mbstate_t prev_st;
282   Idx src_idx, byte_idx, end_idx, remain_len;
283   size_t mbclen;
284 #ifdef _LIBC
285   char buf[MB_LEN_MAX];
286   assert (MB_LEN_MAX >= pstr->mb_cur_max);
287 #else
288   char buf[64];
289 #endif
290 
291   byte_idx = pstr->valid_len;
292   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293 
294   /* The following optimization assumes that ASCII characters can be
295      mapped to wide characters with a simple cast.  */
296   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297     {
  /* Fast path: raw and converted indices coincide, so BYTE_IDX alone
     is used to address both the raw string and the buffers.  */
298       while (byte_idx < end_idx)
299 	{
300 	  wchar_t wc;
301 
302 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303 	      && mbsinit (&pstr->cur_state))
304 	    {
305 	      /* In case of a singlebyte character.  */
306 	      pstr->mbs[byte_idx]
307 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308 	      /* The next step uses the assumption that wchar_t is encoded
309 		 ASCII-safe: all ASCII values can be converted like this.  */
310 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311 	      ++byte_idx;
312 	      continue;
313 	    }
314 
315 	  remain_len = end_idx - byte_idx;
316 	  prev_st = pstr->cur_state;
317 	  mbclen = __mbrtowc (&wc,
318 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319 			       + byte_idx), remain_len, &pstr->cur_state);
320 	  if (BE (0 < mbclen && mbclen < (size_t) -2, 1))
321 	    {
322 	      wchar_t wcu = __towupper (wc);
323 	      if (wcu != wc)
324 		{
325 		  size_t mbcdlen;
326 
327 		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
328 		  if (BE (mbclen == mbcdlen, 1))
329 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
330 		  else
331 		    {
		      /* The upper-case form does not have the same byte
			 length (or could not be encoded): redo this
			 character in the general loop below, which
			 tracks raw and converted positions
			 separately.  */
332 		      src_idx = byte_idx;
333 		      goto offsets_needed;
334 		    }
335 		}
336 	      else
337 		memcpy (pstr->mbs + byte_idx,
338 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
339 	      pstr->wcs[byte_idx++] = wcu;
340 	      /* Write paddings.  */
341 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
342 		pstr->wcs[byte_idx++] = WEOF;
343 	    }
344 	  else if (mbclen == (size_t) -1 || mbclen == 0
345 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
346 	    {
347 	      /* It is an invalid character, an incomplete character
348 		 at the end of the string, or '\0'.  Just use the byte.  */
349 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
350 	      pstr->mbs[byte_idx] = ch;
351 	      /* And also cast it to wide char.  */
352 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
353 	      if (BE (mbclen == (size_t) -1, 0))
354 		pstr->cur_state = prev_st;
355 	    }
356 	  else
357 	    {
358 	      /* The buffer doesn't have enough space, finish to build.  */
359 	      pstr->cur_state = prev_st;
360 	      break;
361 	    }
362 	}
363       pstr->valid_len = byte_idx;
364       pstr->valid_raw_len = byte_idx;
365       return REG_NOERROR;
366     }
367   else
  /* General path: SRC_IDX walks the raw string while BYTE_IDX walks the
     converted buffers.  The fast path above jumps into the loop body at
     the offsets_needed label.  */
368     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
369       {
370 	wchar_t wc;
371 	const char *p;
372       offsets_needed:
373 	remain_len = end_idx - byte_idx;
374 	prev_st = pstr->cur_state;
375 	if (BE (pstr->trans != NULL, 0))
376 	  {
377 	    int i, ch;
378 
	    /* Translate the candidate bytes into BUF before decoding.  */
379 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
380 	      {
381 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382 		buf[i] = pstr->trans[ch];
383 	      }
384 	    p = (const char *) buf;
385 	  }
386 	else
387 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389 	if (BE (0 < mbclen && mbclen < (size_t) -2, 1))
390 	  {
391 	    wchar_t wcu = __towupper (wc);
392 	    if (wcu != wc)
393 	      {
394 		size_t mbcdlen;
395 
396 		mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
397 		if (BE (mbclen == mbcdlen, 1))
398 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
399 		else if (mbcdlen != (size_t) -1)
400 		  {
401 		    size_t i;
402 
		    /* The converted character does not fit into the
		       buffers as currently sized: stop building.  */
403 		    if (byte_idx + mbcdlen > pstr->bufs_len)
404 		      {
405 			pstr->cur_state = prev_st;
406 			break;
407 		      }
408 
409 		    if (pstr->offsets == NULL)
410 		      {
411 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
412 
413 			if (pstr->offsets == NULL)
414 			  return REG_ESPACE;
415 		      }
416 		    if (!pstr->offsets_needed)
417 		      {
			/* First length change: everything built so far
			   maps one-to-one onto the raw string.  */
418 			for (i = 0; i < (size_t) byte_idx; ++i)
419 			  pstr->offsets[i] = i;
420 			pstr->offsets_needed = 1;
421 		      }
422 
423 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
424 		    pstr->wcs[byte_idx] = wcu;
425 		    pstr->offsets[byte_idx] = src_idx;
		    /* Converted bytes beyond the source character's
		       length all map to its last source byte.  */
426 		    for (i = 1; i < mbcdlen; ++i)
427 		      {
428 			pstr->offsets[byte_idx + i]
429 			  = src_idx + (i < mbclen ? i : mbclen - 1);
430 			pstr->wcs[byte_idx + i] = WEOF;
431 		      }
		    /* The converted string grew or shrank by the length
		       difference; recompute the end of this pass.  */
432 		    pstr->len += mbcdlen - mbclen;
433 		    if (pstr->raw_stop > src_idx)
434 		      pstr->stop += mbcdlen - mbclen;
435 		    end_idx = (pstr->bufs_len > pstr->len)
436 			      ? pstr->len : pstr->bufs_len;
437 		    byte_idx += mbcdlen;
438 		    src_idx += mbclen;
439 		    continue;
440 		  }
441 		else
		  /* The upper-case form cannot be encoded: keep the
		     original bytes.  */
442 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
443 	      }
444 	    else
445 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
446 
447 	    if (BE (pstr->offsets_needed != 0, 0))
448 	      {
449 		size_t i;
450 		for (i = 0; i < mbclen; ++i)
451 		  pstr->offsets[byte_idx + i] = src_idx + i;
452 	      }
453 	    src_idx += mbclen;
454 
455 	    pstr->wcs[byte_idx++] = wcu;
456 	    /* Write paddings.  */
457 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
458 	      pstr->wcs[byte_idx++] = WEOF;
459 	  }
460 	else if (mbclen == (size_t) -1 || mbclen == 0
461 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
462 	  {
463 	    /* It is an invalid character or '\0'.  Just use the byte.  */
464 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
465 
466 	    if (BE (pstr->trans != NULL, 0))
467 	      ch = pstr->trans [ch];
468 	    pstr->mbs[byte_idx] = ch;
469 
470 	    if (BE (pstr->offsets_needed != 0, 0))
471 	      pstr->offsets[byte_idx] = src_idx;
472 	    ++src_idx;
473 
474 	    /* And also cast it to wide char.  */
475 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
476 	    if (BE (mbclen == (size_t) -1, 0))
477 	      pstr->cur_state = prev_st;
478 	  }
479 	else
480 	  {
481 	    /* The buffer doesn't have enough space, finish to build.  */
482 	    pstr->cur_state = prev_st;
483 	    break;
484 	  }
485       }
  /* In the general path the converted and raw positions may differ.  */
486   pstr->valid_len = byte_idx;
487   pstr->valid_raw_len = src_idx;
488   return REG_NOERROR;
489 }
490 
491 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
492    Return the index.  */
493 
494 static Idx
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)495 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
496 {
497   mbstate_t prev_st;
498   Idx rawbuf_idx;
499   size_t mbclen;
500   wint_t wc = WEOF;
501 
502   /* Skip the characters which are not necessary to check.  */
503   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504        rawbuf_idx < new_raw_idx;)
505     {
506       wchar_t wc2;
507       Idx remain_len = pstr->raw_len - rawbuf_idx;
508       prev_st = pstr->cur_state;
509       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
510 			  remain_len, &pstr->cur_state);
511       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
512 	{
513 	  /* We treat these cases as a single byte character.  */
514 	  if (mbclen == 0 || remain_len == 0)
515 	    wc = L'\0';
516 	  else
517 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
518 	  mbclen = 1;
519 	  pstr->cur_state = prev_st;
520 	}
521       else
522 	wc = wc2;
523       /* Then proceed the next character.  */
524       rawbuf_idx += mbclen;
525     }
526   *last_wc = wc;
527   return rawbuf_idx;
528 }
529 #endif /* RE_ENABLE_I18N  */
530 
531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
532    This function is used in case of REG_ICASE.  */
533 
534 static void
build_upper_buffer(re_string_t * pstr)535 build_upper_buffer (re_string_t *pstr)
536 {
537   Idx char_idx, end_idx;
538   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
539 
540   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
541     {
542       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
543       if (BE (pstr->trans != NULL, 0))
544 	ch = pstr->trans[ch];
545       pstr->mbs[char_idx] = toupper (ch);
546     }
547   pstr->valid_len = char_idx;
548   pstr->valid_raw_len = char_idx;
549 }
550 
551 /* Apply TRANS to the buffer in PSTR.  */
552 
553 static void
re_string_translate_buffer(re_string_t * pstr)554 re_string_translate_buffer (re_string_t *pstr)
555 {
556   Idx buf_idx, end_idx;
557   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558 
559   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560     {
561       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562       pstr->mbs[buf_idx] = pstr->trans[ch];
563     }
564 
565   pstr->valid_len = buf_idx;
566   pstr->valid_raw_len = buf_idx;
567 }
568 
569 /* This function re-construct the buffers.
570    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571    convert to upper case in case of REG_ICASE, apply translation.  */
572 
/* Reposition PSTR so that its buffers begin at raw index IDX, then
   rebuild them.

   If IDX lies before the current buffer start, the object is first
   reset to the beginning of the raw string.  OFFSET is the distance
   the buffer start has to advance.  When OFFSET falls inside the part
   that is already valid, that part is moved to the front of the
   buffers; otherwise the buffers are discarded.  In both cases
   PSTR->TIP_CONTEXT is set from the character just before the new
   start.  Finally the appropriate build function fills the buffers and
   PSTR->CUR_IDX is reset to 0.

   EFLAGS supplies REG_NOTBOL for the context at the very beginning and
   is passed on to re_string_context_at.
   Return REG_NOERROR, or the error reported by
   build_wcs_upper_buffer.  */
573 static reg_errcode_t
574 __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)575 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
576 {
577   Idx offset;
578 
  /* Moving forward (or staying): OFFSET is relative to the current
     buffer start.  */
579   if (BE (pstr->raw_mbs_idx <= idx, 0))
580     offset = idx - pstr->raw_mbs_idx;
581   else
582     {
583       /* Reset buffer.  */
584 #ifdef RE_ENABLE_I18N
585       if (pstr->mb_cur_max > 1)
586 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
587 #endif /* RE_ENABLE_I18N */
588       pstr->len = pstr->raw_len;
589       pstr->stop = pstr->raw_stop;
590       pstr->valid_len = 0;
591       pstr->raw_mbs_idx = 0;
592       pstr->valid_raw_len = 0;
593       pstr->offsets_needed = 0;
594       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
595 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
596       if (!pstr->mbs_allocated)
597 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
598       offset = idx;
599     }
600 
601   if (BE (offset != 0, 1))
602     {
603       /* Should the already checked characters be kept?  */
604       if (BE (offset < pstr->valid_raw_len, 1))
605 	{
606 	  /* Yes, move them to the front of the buffer.  */
607 #ifdef RE_ENABLE_I18N
608 	  if (BE (pstr->offsets_needed, 0))
609 	    {
	      /* Binary search in PSTR->OFFSETS for OFFSET; when the
		 last probe is still below OFFSET, MID is stepped
		 forward by one afterwards.  */
610 	      Idx low = 0, high = pstr->valid_len, mid;
611 	      do
612 		{
613 		  mid = (high + low) / 2;
614 		  if (pstr->offsets[mid] > offset)
615 		    high = mid;
616 		  else if (pstr->offsets[mid] < offset)
617 		    low = mid + 1;
618 		  else
619 		    break;
620 		}
621 	      while (low < high);
622 	      if (pstr->offsets[mid] < offset)
623 		++mid;
624 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
625 							eflags);
626 	      /* This can be quite complicated, so handle specially
627 		 only the common and easy case where the character with
628 		 different length representation of lower and upper
629 		 case is present at or after offset.  */
630 	      if (pstr->valid_len > offset
631 		  && mid == offset && pstr->offsets[mid] == offset)
632 		{
633 		  memmove (pstr->wcs, pstr->wcs + offset,
634 			   (pstr->valid_len - offset) * sizeof (wint_t));
635 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636 		  pstr->valid_len -= offset;
637 		  pstr->valid_raw_len -= offset;
		  /* Rebase the kept offsets on the new buffer start.  */
638 		  for (low = 0; low < pstr->valid_len; low++)
639 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640 		}
641 	      else
642 		{
643 		  /* Otherwise, just find out how long the partial multibyte
644 		     character at offset is and fill it with WEOF/255.  */
645 		  pstr->len = pstr->raw_len - idx + offset;
646 		  pstr->stop = pstr->raw_stop - idx + offset;
647 		  pstr->offsets_needed = 0;
648 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
649 		    --mid;
650 		  while (mid < pstr->valid_len)
651 		    if (pstr->wcs[mid] != WEOF)
652 		      break;
653 		    else
654 		      ++mid;
655 		  if (mid == pstr->valid_len)
656 		    pstr->valid_len = 0;
657 		  else
658 		    {
659 		      pstr->valid_len = pstr->offsets[mid] - offset;
660 		      if (pstr->valid_len)
661 			{
662 			  for (low = 0; low < pstr->valid_len; ++low)
663 			    pstr->wcs[low] = WEOF;
664 			  memset (pstr->mbs, 255, pstr->valid_len);
665 			}
666 		    }
667 		  pstr->valid_raw_len = pstr->valid_len;
668 		}
669 	    }
670 	  else
671 #endif
672 	    {
	      /* Raw and converted indices coincide: take the context of
		 the character before the new start, then shift the
		 valid data down by OFFSET.  */
673 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
674 							eflags);
675 #ifdef RE_ENABLE_I18N
676 	      if (pstr->mb_cur_max > 1)
677 		memmove (pstr->wcs, pstr->wcs + offset,
678 			 (pstr->valid_len - offset) * sizeof (wint_t));
679 #endif /* RE_ENABLE_I18N */
680 	      if (BE (pstr->mbs_allocated, 0))
681 		memmove (pstr->mbs, pstr->mbs + offset,
682 			 pstr->valid_len - offset);
683 	      pstr->valid_len -= offset;
684 	      pstr->valid_raw_len -= offset;
685 #if defined DEBUG && DEBUG
686 	      assert (pstr->valid_len > 0);
687 #endif
688 	    }
689 	}
690       else
691 	{
692 #ifdef RE_ENABLE_I18N
693 	  /* No, skip all characters until IDX.  */
694 	  Idx prev_valid_len = pstr->valid_len;
695 
696 	  if (BE (pstr->offsets_needed, 0))
697 	    {
698 	      pstr->len = pstr->raw_len - idx + offset;
699 	      pstr->stop = pstr->raw_stop - idx + offset;
700 	      pstr->offsets_needed = 0;
701 	    }
702 #endif
703 	  pstr->valid_len = 0;
704 #ifdef RE_ENABLE_I18N
705 	  if (pstr->mb_cur_max > 1)
706 	    {
	      /* WC will hold the character preceding the new start; it
		 stays WEOF until one of the methods below finds it.  */
707 	      Idx wcs_idx;
708 	      wint_t wc = WEOF;
709 
710 	      if (pstr->is_utf8)
711 		{
712 		  const unsigned char *raw, *p, *end;
713 
714 		  /* Special case UTF-8.  Multi-byte chars start with any
715 		     byte other than 0x80 - 0xbf.  */
716 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
717 		  end = raw + (offset - pstr->mb_cur_max);
718 		  if (end < pstr->raw_mbs)
719 		    end = pstr->raw_mbs;
720 		  p = raw + offset - 1;
721 #ifdef _LIBC
722 		  /* We know the wchar_t encoding is UCS4, so for the simple
723 		     case, ASCII characters, skip the conversion step.  */
724 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
725 		    {
726 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
727 		      /* pstr->valid_len = 0; */
728 		      wc = (wchar_t) *p;
729 		    }
730 		  else
731 #endif
		    /* Walk backwards to the nearest byte that can start
		       a character and decode from there.  */
732 		    for (; p >= end; --p)
733 		      if ((*p & 0xc0) != 0x80)
734 			{
735 			  mbstate_t cur_state;
736 			  wchar_t wc2;
737 			  Idx mlen = raw + pstr->len - p;
738 			  unsigned char buf[6];
739 			  size_t mbclen;
740 
741 			  const unsigned char *pp = p;
742 			  if (BE (pstr->trans != NULL, 0))
743 			    {
744 			      int i = mlen < 6 ? mlen : 6;
745 			      while (--i >= 0)
746 				buf[i] = pstr->trans[p[i]];
747 			      pp = buf;
748 			    }
749 			  /* XXX Don't use mbrtowc, we know which conversion
750 			     to use (UTF-8 -> UCS4).  */
751 			  memset (&cur_state, 0, sizeof (cur_state));
752 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
753 					      &cur_state);
			  /* Accept the character only if it reaches at
			     least up to the new start; the bytes of it
			     that extend past the start become the
			     initial (padding) valid length.  */
754 			  if (raw + offset - p <= mbclen
755 			      && mbclen < (size_t) -2)
756 			    {
757 			      memset (&pstr->cur_state, '\0',
758 				      sizeof (mbstate_t));
759 			      pstr->valid_len = mbclen - (raw + offset - p);
760 			      wc = wc2;
761 			    }
762 			  break;
763 			}
764 		}
765 
	      /* The shortcut above did not apply or did not succeed:
		 decode forward from the old position instead.  */
766 	      if (wc == WEOF)
767 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768 	      if (wc == WEOF)
769 		pstr->tip_context
770 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
771 	      else
772 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 				      && IS_WIDE_WORD_CHAR (wc))
774 				     ? CONTEXT_WORD
775 				     : ((IS_WIDE_NEWLINE (wc)
776 					 && pstr->newline_anchor)
777 					? CONTEXT_NEWLINE : 0));
	      /* Positions covered by the tail of a character that began
		 before the new start are filled with WEOF/255.  */
778 	      if (BE (pstr->valid_len, 0))
779 		{
780 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 		    pstr->wcs[wcs_idx] = WEOF;
782 		  if (pstr->mbs_allocated)
783 		    memset (pstr->mbs, 255, pstr->valid_len);
784 		}
785 	      pstr->valid_raw_len = pstr->valid_len;
786 	    }
787 	  else
788 #endif /* RE_ENABLE_I18N */
789 	    {
	      /* Single-byte case: classify the (translated) byte just
		 before the new start.  */
790 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
791 	      pstr->valid_raw_len = 0;
792 	      if (pstr->trans)
793 		c = pstr->trans[c];
794 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 				   ? CONTEXT_WORD
796 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
797 				      ? CONTEXT_NEWLINE : 0));
798 	    }
799 	}
      /* When MBS is not a private copy it points into the raw string,
	 so it is advanced along with the buffer start.  */
800       if (!BE (pstr->mbs_allocated, 0))
801 	pstr->mbs += offset;
802     }
803   pstr->raw_mbs_idx = idx;
804   pstr->len -= offset;
805   pstr->stop -= offset;
806 
807   /* Then build the buffers.  */
808 #ifdef RE_ENABLE_I18N
809   if (pstr->mb_cur_max > 1)
810     {
811       if (pstr->icase)
812 	{
813 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814 	  if (BE (ret != REG_NOERROR, 0))
815 	    return ret;
816 	}
817       else
818 	build_wcs_buffer (pstr);
819     }
820   else
821 #endif /* RE_ENABLE_I18N */
822     if (BE (pstr->mbs_allocated, 0))
823       {
824 	if (pstr->icase)
825 	  build_upper_buffer (pstr);
826 	else if (pstr->trans != NULL)
827 	  re_string_translate_buffer (pstr);
828       }
829     else
      /* No conversion at all: the whole remaining string is valid.  */
830       pstr->valid_len = pstr->len;
831 
832   pstr->cur_idx = 0;
833   return REG_NOERROR;
834 }
835 
836 static unsigned char
837 __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)838 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
839 {
840   int ch;
841   Idx off;
842 
843   /* Handle the common (easiest) cases first.  */
844   if (BE (!pstr->mbs_allocated, 1))
845     return re_string_peek_byte (pstr, idx);
846 
847 #ifdef RE_ENABLE_I18N
848   if (pstr->mb_cur_max > 1
849       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850     return re_string_peek_byte (pstr, idx);
851 #endif
852 
853   off = pstr->cur_idx + idx;
854 #ifdef RE_ENABLE_I18N
855   if (pstr->offsets_needed)
856     off = pstr->offsets[off];
857 #endif
858 
859   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
860 
861 #ifdef RE_ENABLE_I18N
862   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863      this function returns CAPITAL LETTER I instead of first byte of
864      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
865      since peek_byte_case doesn't advance cur_idx in any way.  */
866   if (pstr->offsets_needed && !isascii (ch))
867     return re_string_peek_byte (pstr, idx);
868 #endif
869 
870   return ch;
871 }
872 
873 static unsigned char
re_string_fetch_byte_case(re_string_t * pstr)874 re_string_fetch_byte_case (re_string_t *pstr)
875 {
876   if (BE (!pstr->mbs_allocated, 1))
877     return re_string_fetch_byte (pstr);
878 
879 #ifdef RE_ENABLE_I18N
880   if (pstr->offsets_needed)
881     {
882       Idx off;
883       int ch;
884 
885       /* For tr_TR.UTF-8 [[:islower:]] there is
886 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
887 	 in that case the whole multi-byte character and return
888 	 the original letter.  On the other side, with
889 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 	 anything else would complicate things too much.  */
891 
892       if (!re_string_first_byte (pstr, pstr->cur_idx))
893 	return re_string_fetch_byte (pstr);
894 
895       off = pstr->offsets[pstr->cur_idx];
896       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
897 
898       if (! isascii (ch))
899 	return re_string_fetch_byte (pstr);
900 
901       re_string_skip_bytes (pstr,
902 			    re_string_char_size_at (pstr, pstr->cur_idx));
903       return ch;
904     }
905 #endif
906 
907   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
908 }
909 
910 static void
re_string_destruct(re_string_t * pstr)911 re_string_destruct (re_string_t *pstr)
912 {
913 #ifdef RE_ENABLE_I18N
914   re_free (pstr->wcs);
915   re_free (pstr->offsets);
916 #endif /* RE_ENABLE_I18N  */
917   if (pstr->mbs_allocated)
918     re_free (pstr->mbs);
919 }
920 
921 /* Return the context at IDX in INPUT.  */
922 
923 static unsigned int
re_string_context_at(const re_string_t * input,Idx idx,int eflags)924 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
925 {
926   int c;
927   if (BE (idx < 0, 0))
928     /* In this case, we use the value stored in input->tip_context,
929        since we can't know the character in input->mbs[-1] here.  */
930     return input->tip_context;
931   if (BE (idx == input->len, 0))
932     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
933 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
934 #ifdef RE_ENABLE_I18N
935   if (input->mb_cur_max > 1)
936     {
937       wint_t wc;
938       Idx wc_idx = idx;
939       while(input->wcs[wc_idx] == WEOF)
940 	{
941 #if defined DEBUG && DEBUG
942 	  /* It must not happen.  */
943 	  assert (wc_idx >= 0);
944 #endif
945 	  --wc_idx;
946 	  if (wc_idx < 0)
947 	    return input->tip_context;
948 	}
949       wc = input->wcs[wc_idx];
950       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
951 	return CONTEXT_WORD;
952       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953 	      ? CONTEXT_NEWLINE : 0);
954     }
955   else
956 #endif
957     {
958       c = re_string_byte_at (input, idx);
959       if (bitset_contain (input->word_char, c))
960 	return CONTEXT_WORD;
961       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
962     }
963 }
964 
965 /* Functions for set operation.  */
966 
967 static reg_errcode_t
968 __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)969 re_node_set_alloc (re_node_set *set, Idx size)
970 {
971   set->alloc = size;
972   set->nelem = 0;
973   set->elems = re_malloc (Idx, size);
974   if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
975     return REG_ESPACE;
976   return REG_NOERROR;
977 }
978 
979 static reg_errcode_t
980 __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)981 re_node_set_init_1 (re_node_set *set, Idx elem)
982 {
983   set->alloc = 1;
984   set->nelem = 1;
985   set->elems = re_malloc (Idx, 1);
986   if (BE (set->elems == NULL, 0))
987     {
988       set->alloc = set->nelem = 0;
989       return REG_ESPACE;
990     }
991   set->elems[0] = elem;
992   return REG_NOERROR;
993 }
994 
995 static reg_errcode_t
996 __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)997 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
998 {
999   set->alloc = 2;
1000   set->elems = re_malloc (Idx, 2);
1001   if (BE (set->elems == NULL, 0))
1002     return REG_ESPACE;
1003   if (elem1 == elem2)
1004     {
1005       set->nelem = 1;
1006       set->elems[0] = elem1;
1007     }
1008   else
1009     {
1010       set->nelem = 2;
1011       if (elem1 < elem2)
1012 	{
1013 	  set->elems[0] = elem1;
1014 	  set->elems[1] = elem2;
1015 	}
1016       else
1017 	{
1018 	  set->elems[0] = elem2;
1019 	  set->elems[1] = elem1;
1020 	}
1021     }
1022   return REG_NOERROR;
1023 }
1024 
1025 static reg_errcode_t
1026 __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1027 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1028 {
1029   dest->nelem = src->nelem;
1030   if (src->nelem > 0)
1031     {
1032       dest->alloc = dest->nelem;
1033       dest->elems = re_malloc (Idx, dest->alloc);
1034       if (BE (dest->elems == NULL, 0))
1035 	{
1036 	  dest->alloc = dest->nelem = 0;
1037 	  return REG_ESPACE;
1038 	}
1039       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1040     }
1041   else
1042     re_node_set_init_empty (dest);
1043   return REG_NOERROR;
1044 }
1045 
1046 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1047    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1049 
1050 static reg_errcode_t
1051 __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1052 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1053 			   const re_node_set *src2)
1054 {
1055   Idx i1, i2, is, id, delta, sbase;
1056   if (src1->nelem == 0 || src2->nelem == 0)
1057     return REG_NOERROR;
1058 
1059   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1060      conservative estimate.  */
1061   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1062     {
1063       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065       if (BE (new_elems == NULL, 0))
1066 	return REG_ESPACE;
1067       dest->elems = new_elems;
1068       dest->alloc = new_alloc;
1069     }
1070 
1071   /* Find the items in the intersection of SRC1 and SRC2, and copy
1072      into the top of DEST those that are not already in DEST itself.  */
1073   sbase = dest->nelem + src1->nelem + src2->nelem;
1074   i1 = src1->nelem - 1;
1075   i2 = src2->nelem - 1;
1076   id = dest->nelem - 1;
1077   for (;;)
1078     {
1079       if (src1->elems[i1] == src2->elems[i2])
1080 	{
1081 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1082 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1083 	    --id;
1084 
1085 	  if (id < 0 || dest->elems[id] != src1->elems[i1])
1086             dest->elems[--sbase] = src1->elems[i1];
1087 
1088 	  if (--i1 < 0 || --i2 < 0)
1089 	    break;
1090 	}
1091 
1092       /* Lower the highest of the two items.  */
1093       else if (src1->elems[i1] < src2->elems[i2])
1094 	{
1095 	  if (--i2 < 0)
1096 	    break;
1097 	}
1098       else
1099 	{
1100 	  if (--i1 < 0)
1101 	    break;
1102 	}
1103     }
1104 
1105   id = dest->nelem - 1;
1106   is = dest->nelem + src1->nelem + src2->nelem - 1;
1107   delta = is - sbase + 1;
1108 
1109   /* Now copy.  When DELTA becomes zero, the remaining
1110      DEST elements are already in place; this is more or
1111      less the same loop that is in re_node_set_merge.  */
1112   dest->nelem += delta;
1113   if (delta > 0 && id >= 0)
1114     for (;;)
1115       {
1116 	if (dest->elems[is] > dest->elems[id])
1117 	  {
1118 	    /* Copy from the top.  */
1119 	    dest->elems[id + delta--] = dest->elems[is--];
1120 	    if (delta == 0)
1121 	      break;
1122 	  }
1123 	else
1124 	  {
1125 	    /* Slide from the bottom.  */
1126 	    dest->elems[id + delta] = dest->elems[id];
1127 	    if (--id < 0)
1128 	      break;
1129 	  }
1130       }
1131 
1132   /* Copy remaining SRC elements.  */
1133   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134 
1135   return REG_NOERROR;
1136 }
1137 
1138 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1139    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1140 
1141 static reg_errcode_t
1142 __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1143 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1144 			const re_node_set *src2)
1145 {
1146   Idx i1, i2, id;
1147   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1148     {
1149       dest->alloc = src1->nelem + src2->nelem;
1150       dest->elems = re_malloc (Idx, dest->alloc);
1151       if (BE (dest->elems == NULL, 0))
1152 	return REG_ESPACE;
1153     }
1154   else
1155     {
1156       if (src1 != NULL && src1->nelem > 0)
1157 	return re_node_set_init_copy (dest, src1);
1158       else if (src2 != NULL && src2->nelem > 0)
1159 	return re_node_set_init_copy (dest, src2);
1160       else
1161 	re_node_set_init_empty (dest);
1162       return REG_NOERROR;
1163     }
1164   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1165     {
1166       if (src1->elems[i1] > src2->elems[i2])
1167 	{
1168 	  dest->elems[id++] = src2->elems[i2++];
1169 	  continue;
1170 	}
1171       if (src1->elems[i1] == src2->elems[i2])
1172 	++i2;
1173       dest->elems[id++] = src1->elems[i1++];
1174     }
1175   if (i1 < src1->nelem)
1176     {
1177       memcpy (dest->elems + id, src1->elems + i1,
1178 	     (src1->nelem - i1) * sizeof (Idx));
1179       id += src1->nelem - i1;
1180     }
1181   else if (i2 < src2->nelem)
1182     {
1183       memcpy (dest->elems + id, src2->elems + i2,
1184 	     (src2->nelem - i2) * sizeof (Idx));
1185       id += src2->nelem - i2;
1186     }
1187   dest->nelem = id;
1188   return REG_NOERROR;
1189 }
1190 
1191 /* Calculate the union set of the sets DEST and SRC. And store it to
1192    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1193 
1194 static reg_errcode_t
1195 __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1196 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1197 {
1198   Idx is, id, sbase, delta;
1199   if (src == NULL || src->nelem == 0)
1200     return REG_NOERROR;
1201   if (dest->alloc < 2 * src->nelem + dest->nelem)
1202     {
1203       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1204       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1205       if (BE (new_buffer == NULL, 0))
1206 	return REG_ESPACE;
1207       dest->elems = new_buffer;
1208       dest->alloc = new_alloc;
1209     }
1210 
1211   if (BE (dest->nelem == 0, 0))
1212     {
1213       dest->nelem = src->nelem;
1214       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1215       return REG_NOERROR;
1216     }
1217 
1218   /* Copy into the top of DEST the items of SRC that are not
1219      found in DEST.  Maybe we could binary search in DEST?  */
1220   for (sbase = dest->nelem + 2 * src->nelem,
1221        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1222     {
1223       if (dest->elems[id] == src->elems[is])
1224 	is--, id--;
1225       else if (dest->elems[id] < src->elems[is])
1226 	dest->elems[--sbase] = src->elems[is--];
1227       else /* if (dest->elems[id] > src->elems[is]) */
1228 	--id;
1229     }
1230 
1231   if (is >= 0)
1232     {
1233       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1234       sbase -= is + 1;
1235       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1236     }
1237 
1238   id = dest->nelem - 1;
1239   is = dest->nelem + 2 * src->nelem - 1;
1240   delta = is - sbase + 1;
1241   if (delta == 0)
1242     return REG_NOERROR;
1243 
1244   /* Now copy.  When DELTA becomes zero, the remaining
1245      DEST elements are already in place.  */
1246   dest->nelem += delta;
1247   for (;;)
1248     {
1249       if (dest->elems[is] > dest->elems[id])
1250 	{
1251 	  /* Copy from the top.  */
1252 	  dest->elems[id + delta--] = dest->elems[is--];
1253 	  if (delta == 0)
1254 	    break;
1255 	}
1256       else
1257 	{
1258 	  /* Slide from the bottom.  */
1259 	  dest->elems[id + delta] = dest->elems[id];
1260 	  if (--id < 0)
1261 	    {
1262 	      /* Copy remaining SRC elements.  */
1263 	      memcpy (dest->elems, dest->elems + sbase,
1264 		      delta * sizeof (Idx));
1265 	      break;
1266 	    }
1267 	}
1268     }
1269 
1270   return REG_NOERROR;
1271 }
1272 
1273 /* Insert the new element ELEM to the re_node_set* SET.
1274    SET should not already have ELEM.
1275    Return true if successful.  */
1276 
1277 static bool
1278 __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)1279 re_node_set_insert (re_node_set *set, Idx elem)
1280 {
1281   Idx idx;
1282   /* In case the set is empty.  */
1283   if (set->alloc == 0)
1284     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1285 
1286   if (BE (set->nelem, 0) == 0)
1287     {
1288       /* We already guaranteed above that set->alloc != 0.  */
1289       set->elems[0] = elem;
1290       ++set->nelem;
1291       return true;
1292     }
1293 
1294   /* Realloc if we need.  */
1295   if (set->alloc == set->nelem)
1296     {
1297       Idx *new_elems;
1298       set->alloc = set->alloc * 2;
1299       new_elems = re_realloc (set->elems, Idx, set->alloc);
1300       if (BE (new_elems == NULL, 0))
1301 	return false;
1302       set->elems = new_elems;
1303     }
1304 
1305   /* Move the elements which follows the new element.  Test the
1306      first element separately to skip a check in the inner loop.  */
1307   if (elem < set->elems[0])
1308     {
1309       idx = 0;
1310       for (idx = set->nelem; idx > 0; idx--)
1311 	set->elems[idx] = set->elems[idx - 1];
1312     }
1313   else
1314     {
1315       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1316 	set->elems[idx] = set->elems[idx - 1];
1317     }
1318 
1319   /* Insert the new element.  */
1320   set->elems[idx] = elem;
1321   ++set->nelem;
1322   return true;
1323 }
1324 
1325 /* Insert the new element ELEM to the re_node_set* SET.
1326    SET should not already have any element greater than or equal to ELEM.
1327    Return true if successful.  */
1328 
1329 static bool
1330 __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)1331 re_node_set_insert_last (re_node_set *set, Idx elem)
1332 {
1333   /* Realloc if we need.  */
1334   if (set->alloc == set->nelem)
1335     {
1336       Idx *new_elems;
1337       set->alloc = (set->alloc + 1) * 2;
1338       new_elems = re_realloc (set->elems, Idx, set->alloc);
1339       if (BE (new_elems == NULL, 0))
1340 	return false;
1341       set->elems = new_elems;
1342     }
1343 
1344   /* Insert the new element.  */
1345   set->elems[set->nelem++] = elem;
1346   return true;
1347 }
1348 
1349 /* Compare two node sets SET1 and SET2.
1350    Return true if SET1 and SET2 are equivalent.  */
1351 
1352 static bool
1353 __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)1354 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1355 {
1356   Idx i;
1357   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1358     return false;
1359   for (i = set1->nelem ; --i >= 0 ; )
1360     if (set1->elems[i] != set2->elems[i])
1361       return false;
1362   return true;
1363 }
1364 
1365 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1366 
1367 static Idx
1368 __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)1369 re_node_set_contains (const re_node_set *set, Idx elem)
1370 {
1371   __re_size_t idx, right, mid;
1372   if (set->nelem <= 0)
1373     return 0;
1374 
1375   /* Binary search the element.  */
1376   idx = 0;
1377   right = set->nelem - 1;
1378   while (idx < right)
1379     {
1380       mid = (idx + right) / 2;
1381       if (set->elems[mid] < elem)
1382 	idx = mid + 1;
1383       else
1384 	right = mid;
1385     }
1386   return set->elems[idx] == elem ? idx + 1 : 0;
1387 }
1388 
1389 static void
re_node_set_remove_at(re_node_set * set,Idx idx)1390 re_node_set_remove_at (re_node_set *set, Idx idx)
1391 {
1392   if (idx < 0 || idx >= set->nelem)
1393     return;
1394   --set->nelem;
1395   for (; idx < set->nelem; idx++)
1396     set->elems[idx] = set->elems[idx + 1];
1397 }
1398 
1399 
1400 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1401    Or return -1 if an error occurred.  */
1402 
1403 static Idx
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1404 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405 {
1406   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1407     {
1408       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1409       Idx *new_nexts, *new_indices;
1410       re_node_set *new_edests, *new_eclosures;
1411       re_token_t *new_nodes;
1412 
1413       /* Avoid overflows in realloc.  */
1414       const size_t max_object_size = MAX (sizeof (re_token_t),
1415 					  MAX (sizeof (re_node_set),
1416 					       sizeof (Idx)));
1417       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1418 	return -1;
1419 
1420       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421       if (BE (new_nodes == NULL, 0))
1422 	return -1;
1423       dfa->nodes = new_nodes;
1424       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428       if (BE (new_nexts == NULL || new_indices == NULL
1429 	      || new_edests == NULL || new_eclosures == NULL, 0))
1430 	{
1431 	   re_free (new_nexts);
1432 	   re_free (new_indices);
1433 	   re_free (new_edests);
1434 	   re_free (new_eclosures);
1435 	   return -1;
1436 	}
1437       dfa->nexts = new_nexts;
1438       dfa->org_indices = new_indices;
1439       dfa->edests = new_edests;
1440       dfa->eclosures = new_eclosures;
1441       dfa->nodes_alloc = new_nodes_alloc;
1442     }
1443   dfa->nodes[dfa->nodes_len] = token;
1444   dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446   dfa->nodes[dfa->nodes_len].accept_mb =
1447     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448      || token.type == COMPLEX_BRACKET);
1449 #endif
1450   dfa->nexts[dfa->nodes_len] = -1;
1451   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453   return dfa->nodes_len++;
1454 }
1455 
1456 static re_hashval_t
calc_state_hash(const re_node_set * nodes,unsigned int context)1457 calc_state_hash (const re_node_set *nodes, unsigned int context)
1458 {
1459   re_hashval_t hash = nodes->nelem + context;
1460   Idx i;
1461   for (i = 0 ; i < nodes->nelem ; i++)
1462     hash += nodes->elems[i];
1463   return hash;
1464 }
1465 
1466 /* Search for the state whose node_set is equivalent to NODES.
1467    Return the pointer to the state, if we found it in the DFA.
1468    Otherwise create the new one and return it.  In case of an error
1469    return NULL and set the error code in ERR.
1470    Note: - We assume NULL as the invalid state, then it is possible that
1471 	   return value is NULL and ERR is REG_NOERROR.
1472 	 - We never return non-NULL value in case of any errors, it is for
1473 	   optimization.  */
1474 
1475 static re_dfastate_t *
1476 __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1477 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478 		  const re_node_set *nodes)
1479 {
1480   re_hashval_t hash;
1481   re_dfastate_t *new_state;
1482   struct re_state_table_entry *spot;
1483   Idx i;
1484 #if defined GCC_LINT || defined lint
1485   /* Suppress bogus uninitialized-variable warnings.  */
1486   *err = REG_NOERROR;
1487 #endif
1488   if (BE (nodes->nelem == 0, 0))
1489     {
1490       *err = REG_NOERROR;
1491       return NULL;
1492     }
1493   hash = calc_state_hash (nodes, 0);
1494   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495 
1496   for (i = 0 ; i < spot->num ; i++)
1497     {
1498       re_dfastate_t *state = spot->array[i];
1499       if (hash != state->hash)
1500 	continue;
1501       if (re_node_set_compare (&state->nodes, nodes))
1502 	return state;
1503     }
1504 
1505   /* There are no appropriate state in the dfa, create the new one.  */
1506   new_state = create_ci_newstate (dfa, nodes, hash);
1507   if (BE (new_state == NULL, 0))
1508     *err = REG_ESPACE;
1509 
1510   return new_state;
1511 }
1512 
1513 /* Search for the state whose node_set is equivalent to NODES and
1514    whose context is equivalent to CONTEXT.
1515    Return the pointer to the state, if we found it in the DFA.
1516    Otherwise create the new one and return it.  In case of an error
1517    return NULL and set the error code in ERR.
1518    Note: - We assume NULL as the invalid state, then it is possible that
1519 	   return value is NULL and ERR is REG_NOERROR.
1520 	 - We never return non-NULL value in case of any errors, it is for
1521 	   optimization.  */
1522 
1523 static re_dfastate_t *
1524 __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1525 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526 			  const re_node_set *nodes, unsigned int context)
1527 {
1528   re_hashval_t hash;
1529   re_dfastate_t *new_state;
1530   struct re_state_table_entry *spot;
1531   Idx i;
1532 #if defined GCC_LINT || defined lint
1533   /* Suppress bogus uninitialized-variable warnings.  */
1534   *err = REG_NOERROR;
1535 #endif
1536   if (nodes->nelem == 0)
1537     {
1538       *err = REG_NOERROR;
1539       return NULL;
1540     }
1541   hash = calc_state_hash (nodes, context);
1542   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543 
1544   for (i = 0 ; i < spot->num ; i++)
1545     {
1546       re_dfastate_t *state = spot->array[i];
1547       if (state->hash == hash
1548 	  && state->context == context
1549 	  && re_node_set_compare (state->entrance_nodes, nodes))
1550 	return state;
1551     }
1552   /* There are no appropriate state in 'dfa', create the new one.  */
1553   new_state = create_cd_newstate (dfa, nodes, context, hash);
1554   if (BE (new_state == NULL, 0))
1555     *err = REG_ESPACE;
1556 
1557   return new_state;
1558 }
1559 
1560 /* Finish initialization of the new state NEWSTATE, and using its hash value
1561    HASH put in the appropriate bucket of DFA's state table.  Return value
1562    indicates the error code if failed.  */
1563 
1564 static reg_errcode_t
1565 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1566 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1567 		re_hashval_t hash)
1568 {
1569   struct re_state_table_entry *spot;
1570   reg_errcode_t err;
1571   Idx i;
1572 
1573   newstate->hash = hash;
1574   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575   if (BE (err != REG_NOERROR, 0))
1576     return REG_ESPACE;
1577   for (i = 0; i < newstate->nodes.nelem; i++)
1578     {
1579       Idx elem = newstate->nodes.elems[i];
1580       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1582 	  return REG_ESPACE;
1583     }
1584 
1585   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586   if (BE (spot->alloc <= spot->num, 0))
1587     {
1588       Idx new_alloc = 2 * spot->num + 2;
1589       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590 					      new_alloc);
1591       if (BE (new_array == NULL, 0))
1592 	return REG_ESPACE;
1593       spot->array = new_array;
1594       spot->alloc = new_alloc;
1595     }
1596   spot->array[spot->num++] = newstate;
1597   return REG_NOERROR;
1598 }
1599 
1600 static void
free_state(re_dfastate_t * state)1601 free_state (re_dfastate_t *state)
1602 {
1603   re_node_set_free (&state->non_eps_nodes);
1604   re_node_set_free (&state->inveclosure);
1605   if (state->entrance_nodes != &state->nodes)
1606     {
1607       re_node_set_free (state->entrance_nodes);
1608       re_free (state->entrance_nodes);
1609     }
1610   re_node_set_free (&state->nodes);
1611   re_free (state->word_trtable);
1612   re_free (state->trtable);
1613   re_free (state);
1614 }
1615 
1616 /* Create the new state which is independent of contexts.
1617    Return the new state if succeeded, otherwise return NULL.  */
1618 
1619 static re_dfastate_t *
1620 __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1621 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1622 		    re_hashval_t hash)
1623 {
1624   Idx i;
1625   reg_errcode_t err;
1626   re_dfastate_t *newstate;
1627 
1628   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1629   if (BE (newstate == NULL, 0))
1630     return NULL;
1631   err = re_node_set_init_copy (&newstate->nodes, nodes);
1632   if (BE (err != REG_NOERROR, 0))
1633     {
1634       re_free (newstate);
1635       return NULL;
1636     }
1637 
1638   newstate->entrance_nodes = &newstate->nodes;
1639   for (i = 0 ; i < nodes->nelem ; i++)
1640     {
1641       re_token_t *node = dfa->nodes + nodes->elems[i];
1642       re_token_type_t type = node->type;
1643       if (type == CHARACTER && !node->constraint)
1644 	continue;
1645 #ifdef RE_ENABLE_I18N
1646       newstate->accept_mb |= node->accept_mb;
1647 #endif /* RE_ENABLE_I18N */
1648 
1649       /* If the state has the halt node, the state is a halt state.  */
1650       if (type == END_OF_RE)
1651 	newstate->halt = 1;
1652       else if (type == OP_BACK_REF)
1653 	newstate->has_backref = 1;
1654       else if (type == ANCHOR || node->constraint)
1655 	newstate->has_constraint = 1;
1656     }
1657   err = register_state (dfa, newstate, hash);
1658   if (BE (err != REG_NOERROR, 0))
1659     {
1660       free_state (newstate);
1661       newstate = NULL;
1662     }
1663   return newstate;
1664 }
1665 
1666 /* Create the new state which is depend on the context CONTEXT.
1667    Return the new state if succeeded, otherwise return NULL.  */
1668 
1669 static re_dfastate_t *
1670 __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1671 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1672 		    unsigned int context, re_hashval_t hash)
1673 {
1674   Idx i, nctx_nodes = 0;
1675   reg_errcode_t err;
1676   re_dfastate_t *newstate;
1677 
1678   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1679   if (BE (newstate == NULL, 0))
1680     return NULL;
1681   err = re_node_set_init_copy (&newstate->nodes, nodes);
1682   if (BE (err != REG_NOERROR, 0))
1683     {
1684       re_free (newstate);
1685       return NULL;
1686     }
1687 
1688   newstate->context = context;
1689   newstate->entrance_nodes = &newstate->nodes;
1690 
1691   for (i = 0 ; i < nodes->nelem ; i++)
1692     {
1693       re_token_t *node = dfa->nodes + nodes->elems[i];
1694       re_token_type_t type = node->type;
1695       unsigned int constraint = node->constraint;
1696 
1697       if (type == CHARACTER && !constraint)
1698 	continue;
1699 #ifdef RE_ENABLE_I18N
1700       newstate->accept_mb |= node->accept_mb;
1701 #endif /* RE_ENABLE_I18N */
1702 
1703       /* If the state has the halt node, the state is a halt state.  */
1704       if (type == END_OF_RE)
1705 	newstate->halt = 1;
1706       else if (type == OP_BACK_REF)
1707 	newstate->has_backref = 1;
1708 
1709       if (constraint)
1710 	{
1711 	  if (newstate->entrance_nodes == &newstate->nodes)
1712 	    {
1713 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1714 	      if (BE (newstate->entrance_nodes == NULL, 0))
1715 		{
1716 		  free_state (newstate);
1717 		  return NULL;
1718 		}
1719 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1720 		  != REG_NOERROR)
1721 		return NULL;
1722 	      nctx_nodes = 0;
1723 	      newstate->has_constraint = 1;
1724 	    }
1725 
1726 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1727 	    {
1728 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1729 	      ++nctx_nodes;
1730 	    }
1731 	}
1732     }
1733   err = register_state (dfa, newstate, hash);
1734   if (BE (err != REG_NOERROR, 0))
1735     {
1736       free_state (newstate);
1737       newstate = NULL;
1738     }
1739   return  newstate;
1740 }
1741