xref: /haiku/src/system/libroot/posix/glibc/iconv/loop.c (revision e81a954787e50e56a7f06f72705b7859b6ab06d1)
1 /* Conversion loop frame work.
2    Copyright (C) 1998-2002, 2003 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20 
/* This file provides a frame for the reader loop in all conversion modules.
   The actual code must (of course) be provided in the actual module source
   code but certain actions can be written down generically, with some
   customization options which are these:

     MIN_NEEDED_INPUT	minimal number of input bytes needed for the next
			conversion.
     MIN_NEEDED_OUTPUT	minimal number of bytes produced by the next round
			of conversion.

     MAX_NEEDED_INPUT	you guessed it, this is the maximal number of input
			bytes needed.  It defaults to MIN_NEEDED_INPUT.
     MAX_NEEDED_OUTPUT	likewise for output bytes.  It defaults to
			MIN_NEEDED_OUTPUT.

     LOOPFCT		name of the function created.  If not specified
			the name is `loop' but this prevents the use
			of multiple functions in the same file.

     BODY		this is supposed to expand to the body of the loop.
			The user must provide this.

     EXTRA_LOOP_DECLS	extra arguments passed from the conversion loop call.

     INIT_PARAMS	code to define and initialize variables from params.
     UPDATE_PARAMS	code to store result in params.

     ONEBYTE_BODY	body of the specialized conversion function for a
			single byte from the current character set to INTERNAL.
*/
50 
51 #include <assert.h>
52 #include <endian.h>
53 #include <gconv.h>
54 #include <stdint.h>
55 #include <string.h>
56 #include <wchar.h>
57 #include <sys/param.h>		/* For MIN.  */
58 #define __need_size_t
59 #include <stddef.h>
60 
61 
/* We have to provide support for machines which are not able to handle
   unaligned memory accesses.  Some of the character encodings have
   representations with a fixed width of 2 or 4 bytes.  But if we cannot
   access unaligned memory we still have to read byte-wise.

   get16/get32 read and put16/put32 store a 16- or 32-bit value in host
   byte order at ADDR.  Two implementations exist:

   - direct access through a casted pointer, used when the architecture
     copes with unaligned accesses (_STRING_ARCH_unaligned) or when this
     is the first, "aligned" pass over this file (DEFINE_UNALIGNED not
     defined);
   - byte-by-byte access, used for the second pass, which generates the
     functions with the `_unaligned' name suffix.

   Both variants of get16/get32 yield an unsigned value of the same width
   so that comparisons behave identically whichever variant is compiled.
   The byte-wise get macros evaluate ADDR more than once; do not pass an
   expression with side effects.  */
#undef FCTNAME2
#if defined _STRING_ARCH_unaligned || !defined DEFINE_UNALIGNED
/* We can handle unaligned memory access.  */
# define get16(addr) (*((const uint16_t *) (addr)))
# define get32(addr) (*((const uint32_t *) (addr)))

/* We need no special support for writing values either.  */
# define put16(addr, val) (*((uint16_t *) (addr)) = (val))
# define put32(addr, val) (*((uint32_t *) (addr)) = (val))

# define FCTNAME2(name) name
#else
/* Distinguish between big endian and little endian.  Each byte is widened
   to the unsigned result type before it is shifted: shifting the promoted
   `int' would move bit 7 of the top byte into the sign bit, which is
   undefined and would make get32 yield a negative value.  */
# if __BYTE_ORDER == __LITTLE_ENDIAN
#  define get16(addr) \
     ((uint16_t) (((const unsigned char *) (addr))[1] << 8 \
                  | ((const unsigned char *) (addr))[0]))
#  define get32(addr) \
     ((uint32_t) ((const unsigned char *) (addr))[3] << 24 \
      | (uint32_t) ((const unsigned char *) (addr))[2] << 16 \
      | (uint32_t) ((const unsigned char *) (addr))[1] << 8 \
      | (uint32_t) ((const unsigned char *) (addr))[0])

#  define put16(addr, val) \
     ({ uint16_t __val = (val); \
        unsigned char *__ptr = (unsigned char *) (addr); \
        __ptr[0] = __val; \
        __ptr[1] = __val >> 8; \
        (void) 0; })
#  define put32(addr, val) \
     ({ uint32_t __val = (val); \
        unsigned char *__ptr = (unsigned char *) (addr); \
        __ptr[0] = __val; \
        __ptr[1] = __val >> 8; \
        __ptr[2] = __val >> 16; \
        __ptr[3] = __val >> 24; \
        (void) 0; })
# else
#  define get16(addr) \
     ((uint16_t) (((const unsigned char *) (addr))[0] << 8 \
                  | ((const unsigned char *) (addr))[1]))
#  define get32(addr) \
     ((uint32_t) ((const unsigned char *) (addr))[0] << 24 \
      | (uint32_t) ((const unsigned char *) (addr))[1] << 16 \
      | (uint32_t) ((const unsigned char *) (addr))[2] << 8 \
      | (uint32_t) ((const unsigned char *) (addr))[3])

#  define put16(addr, val) \
     ({ uint16_t __val = (val); \
        unsigned char *__ptr = (unsigned char *) (addr); \
        __ptr[1] = __val; \
        __ptr[0] = __val >> 8; \
        (void) 0; })
#  define put32(addr, val) \
     ({ uint32_t __val = (val); \
        unsigned char *__ptr = (unsigned char *) (addr); \
        __ptr[3] = __val; \
        __ptr[2] = __val >> 8; \
        __ptr[1] = __val >> 16; \
        __ptr[0] = __val >> 24; \
        (void) 0; })
# endif

# define FCTNAME2(name) name##_unaligned
#endif
/* The extra level of indirection makes sure NAME is macro-expanded before
   the suffix is pasted on.  */
#define FCTNAME(name) FCTNAME2(name)
134 
135 
136 /* We need at least one byte for the next round.  */
137 #ifndef MIN_NEEDED_INPUT
138 # error "MIN_NEEDED_INPUT definition missing"
139 #elif MIN_NEEDED_INPUT < 1
140 # error "MIN_NEEDED_INPUT must be >= 1"
141 #endif
142 
143 /* Let's see how many bytes we produce.  */
144 #ifndef MAX_NEEDED_INPUT
145 # define MAX_NEEDED_INPUT	MIN_NEEDED_INPUT
146 #endif
147 
148 /* We produce at least one byte in the next round.  */
149 #ifndef MIN_NEEDED_OUTPUT
150 # error "MIN_NEEDED_OUTPUT definition missing"
151 #elif MIN_NEEDED_OUTPUT < 1
152 # error "MIN_NEEDED_OUTPUT must be >= 1"
153 #endif
154 
155 /* Let's see how many bytes we produce.  */
156 #ifndef MAX_NEEDED_OUTPUT
157 # define MAX_NEEDED_OUTPUT	MIN_NEEDED_OUTPUT
158 #endif
159 
160 /* Default name for the function.  */
161 #ifndef LOOPFCT
162 # define LOOPFCT		loop
163 #endif
164 
165 /* Make sure we have a loop body.  */
166 #ifndef BODY
167 # error "Definition of BODY missing for function" LOOPFCT
168 #endif
169 
170 
171 /* If no arguments have to passed to the loop function define the macro
172    as empty.  */
173 #ifndef EXTRA_LOOP_DECLS
174 # define EXTRA_LOOP_DECLS
175 #endif
176 
177 
/* Convenience predicate for the writers of the modules: nonzero when
   conversion errors have to be ignored, i.e. when an `irreversible'
   counter was supplied and __GCONV_IGNORE_ERRORS is set in `flags'.
   The macro refers to the variables `irreversible' and `flags' of the
   function it is used in; the loop functions in this file declare `flags'
   only when LOOP_NEED_FLAGS is defined.  */
#define ignore_errors_p() \
  ((irreversible) != NULL && ((flags) & __GCONV_IGNORE_ERRORS) != 0)
182 
183 
/* Error handling for the FROM_LOOP direction, with ignoring of errors.
   INCR is the number of input bytes to skip when the error is ignored.
   The macro must be expanded directly inside the loop that runs BODY: it
   leaves through `break' (error reported) or `continue' (error skipped),
   which is why the do while (0) trick cannot be used here.  */
#define STANDARD_FROM_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    if (ignore_errors_p ()) \
      { \
        /* We ignore the invalid input byte sequence and count it.  RESULT \
           stays __GCONV_ILLEGAL_INPUT, because of the constraint that \
           "iconv -c" must give the same exitcode as "iconv".  */ \
        inptr += (Incr); \
        ++*irreversible; \
        continue; \
      } \
 \
    break; \
  }
201 
/* Error handling for the TO_LOOP direction, with use of transliteration/
   transcription functions and ignoring of errors.  INCR is the number of
   input bytes to skip when the error is ignored.  The macro must be
   expanded directly inside the loop that runs BODY: it leaves through
   `break' or `continue', so the do while (0) trick cannot be used.  */
#define STANDARD_TO_LOOP_ERR_HANDLER(Incr) \
  { \
    result = __GCONV_ILLEGAL_INPUT; \
 \
    /* A null IRREVERSIBLE means we are in a call from \
       __gconv_transliterate; in this case we are not doing any error \
       recovery ourselves.  Otherwise see whether we have to ignore the \
       error.  */ \
    if (irreversible != NULL && ignore_errors_p ()) \
      { \
        /* When we come here it means we ignore the character.  RESULT \
           stays __GCONV_ILLEGAL_INPUT, because of the constraint that \
           "iconv -c" must give the same exitcode as "iconv".  */ \
        ++*irreversible; \
        inptr += (Incr); \
        continue; \
      } \
 \
    break; \
  }
226 
227 
/* Handling of Unicode 3.1 TAG characters.  Unicode recommends
   "If language codes are not relevant to the particular processing
    operation, then they should be ignored."  When CHARACTER is a TAG
   character the macro skips INCR input bytes and starts the next round
   of the enclosing loop with `continue'; otherwise it does nothing.
   It is usually called right before STANDARD_TO_LOOP_ERR_HANDLER (Incr).  */
#define UNICODE_TAG_HANDLER(Character, Incr) \
  { \
    /* TAG characters are those in the range U+E0000..U+E007F, i.e. all \
       values which agree with 0xe0000 above the low seven bits.  */ \
    if ((0xe0000 >> 7) == ((Character) >> 7)) \
      { \
        inptr += (Incr); \
        continue; \
      } \
  }
241 
242 
243 /* The function returns the status, as defined in gconv.h.  */
244 static inline int
245 FCTNAME (LOOPFCT) (struct __gconv_step *step,
246 		   struct __gconv_step_data *step_data,
247 		   const unsigned char **inptrp, const unsigned char *inend,
248 		   unsigned char **outptrp, const unsigned char *outend,
249 		   size_t *irreversible EXTRA_LOOP_DECLS)
250 {
251 #ifdef LOOP_NEED_STATE
252   mbstate_t *state = step_data->__statep;
253 #endif
254 #ifdef LOOP_NEED_FLAGS
255   int flags = step_data->__flags;
256 #endif
257 #ifdef LOOP_NEED_DATA
258   void *data = step->__data;
259 #endif
260   int result = __GCONV_EMPTY_INPUT;
261   const unsigned char *inptr = *inptrp;
262   unsigned char *outptr = *outptrp;
263 
264 #ifdef INIT_PARAMS
265   INIT_PARAMS;
266 #endif
267 
268   while (inptr != inend)
269     {
270       /* `if' cases for MIN_NEEDED_OUTPUT ==/!= 1 is made to help the
271 	 compiler generating better code.  They will be optimized away
272 	 since MIN_NEEDED_OUTPUT is always a constant.  */
273       if (MIN_NEEDED_INPUT > 1
274 	  && __builtin_expect (inptr + MIN_NEEDED_INPUT > inend, 0))
275 	{
276 	  /* We don't have enough input for another complete input
277 	     character.  */
278 	  result = __GCONV_INCOMPLETE_INPUT;
279 	  break;
280 	}
281       if ((MIN_NEEDED_OUTPUT != 1
282 	   && __builtin_expect (outptr + MIN_NEEDED_OUTPUT > outend, 0))
283 	  || (MIN_NEEDED_OUTPUT == 1
284 	      && __builtin_expect (outptr >= outend, 0)))
285 	{
286 	  /* Overflow in the output buffer.  */
287 	  result = __GCONV_FULL_OUTPUT;
288 	  break;
289 	}
290 
291       /* Here comes the body the user provides.  It can stop with
292 	 RESULT set to GCONV_INCOMPLETE_INPUT (if the size of the
293 	 input characters vary in size), GCONV_ILLEGAL_INPUT, or
294 	 GCONV_FULL_OUTPUT (if the output characters vary in size).  */
295       BODY
296     }
297 
298   /* Update the pointers pointed to by the parameters.  */
299   *inptrp = inptr;
300   *outptrp = outptr;
301 #ifdef UPDATE_PARAMS
302   UPDATE_PARAMS;
303 #endif
304 
305   return result;
306 }
307 
308 
309 /* Include the file a second time to define the function to handle
310    unaligned access.  */
311 #if !defined DEFINE_UNALIGNED && !defined _STRING_ARCH_unaligned \
312     && MIN_NEEDED_INPUT != 1 && MAX_NEEDED_INPUT % MIN_NEEDED_INPUT == 0 \
313     && MIN_NEEDED_OUTPUT != 1 && MAX_NEEDED_OUTPUT % MIN_NEEDED_OUTPUT == 0
314 # undef get16
315 # undef get32
316 # undef put16
317 # undef put32
318 # undef unaligned
319 
320 # define DEFINE_UNALIGNED
321 # include "loop.c"
322 # undef DEFINE_UNALIGNED
323 #endif
324 
325 
#if MAX_NEEDED_INPUT > 1
# define SINGLE(fct) SINGLE2 (fct)
# define SINGLE2(fct) fct##_single
/* Convert the single character whose first bytes were saved in the
   conversion state by an earlier call.  The function name is LOOPFCT
   with the suffix `_single'.

   The bytes kept in the state (unpacked by UNPACK_BYTES if the module
   defines it, otherwise the low three bits of `__count' give their
   number and `__value.__wchb' holds them) are joined in a local buffer
   with up to MAX_NEEDED_INPUT bytes in total, taking the rest from
   [*INPTRP, INEND).  BODY is then run exactly once on that buffer.

   The function returns the status, as defined in gconv.h:

     __GCONV_OK			a character was converted; *INPTRP is
				advanced by the number of new bytes which
				were used, *OUTPTRP is updated and the
				saved bytes are cleared (CLEAR_STATE or
				the default)
     __GCONV_INCOMPLETE_INPUT	still not a complete character; all
				available input bytes were consumed and
				saved in the state
     __GCONV_FULL_OUTPUT	fewer than MIN_NEEDED_OUTPUT bytes of room
				in the output buffer; nothing is consumed

   or another value stored in RESULT by BODY, in which case neither the
   pointers nor the state are changed by this function.  */
static inline int
SINGLE(LOOPFCT) (struct __gconv_step *step,
		 struct __gconv_step_data *step_data,
		 const unsigned char **inptrp, const unsigned char *inend,
		 unsigned char **outptrp, unsigned char *outend,
		 size_t *irreversible EXTRA_LOOP_DECLS)
{
  mbstate_t *state = step_data->__statep;
#ifdef LOOP_NEED_FLAGS
  int flags = step_data->__flags;
#endif
#ifdef LOOP_NEED_DATA
  void *data = step->__data;
#endif
  int result = __GCONV_OK;
  unsigned char bytebuf[MAX_NEEDED_INPUT];
  const unsigned char *inptr = *inptrp;
  unsigned char *outptr = *outptrp;
  size_t inlen;

#ifdef INIT_PARAMS
  INIT_PARAMS;
#endif

#ifdef UNPACK_BYTES
  UNPACK_BYTES
#else
  /* Add the bytes from the state to the input buffer.  */
  for (inlen = 0; inlen < (size_t) (state->__count & 7); ++inlen)
    bytebuf[inlen] = state->__value.__wchb[inlen];
#endif

  /* Are there enough bytes in the input buffer?  The test compares sizes
     instead of advancing INPTR: MIN_NEEDED_INPUT - INLEN wraps around
     when the state already holds MIN_NEEDED_INPUT or more bytes, and a
     pointer beyond the end of the buffer must not be formed.  */
  if (__builtin_expect (inlen < (size_t) MIN_NEEDED_INPUT
			&& inend - inptr < (int) (MIN_NEEDED_INPUT - inlen),
			0))
    {
      *inptrp = inend;
#ifdef STORE_REST
      /* The caller has just been told that all its input is consumed, so
	 the new bytes must join the saved ones in BYTEBUF before
	 STORE_REST sees the buffer; otherwise they would be lost.  They
	 fit since in total fewer than MIN_NEEDED_INPUT bytes exist.  */
      while (inptr < inend)
	bytebuf[inlen++] = *inptr++;

      inptr = bytebuf;
      inptrp = &inptr;
      inend = &bytebuf[inlen];

      STORE_REST
#else
      /* We don't have enough input for another complete input
	 character.  Append the new bytes to the ones in the state.
	 NOTE(review): `__count' is not updated here; presumably the
	 caller records the new length -- confirm.  */
      while (inptr < inend)
	state->__value.__wchb[inlen++] = *inptr++;
#endif

      return __GCONV_INCOMPLETE_INPUT;
    }

  /* Enough space in output buffer?  Again compare sizes rather than
     forming a pointer beyond OUTEND.  */
  if ((MIN_NEEDED_OUTPUT != 1 && outend - outptr < MIN_NEEDED_OUTPUT)
      || (MIN_NEEDED_OUTPUT == 1 && outptr >= outend))
    /* Overflow in the output buffer.  */
    return __GCONV_FULL_OUTPUT;

  /* Now add characters from the normal input buffer.  The condition is
     tested before the first byte is fetched so that nothing is read
     beyond INEND when the caller supplied no new input, and nothing is
     written beyond BYTEBUF.  */
  while (inlen < MAX_NEEDED_INPUT && inptr < inend)
    bytebuf[inlen++] = *inptr++;

  inptr = bytebuf;
  inend = &bytebuf[inlen];

  /* BODY may leave through `break' or `continue'; both end this
     one-round loop.  */
  do
    {
      BODY
    }
  while (0);

  /* Now we either have produced an output character and consumed all the
     bytes from the state and at least one more, or the character is still
     incomplete, or we have some other error (like illegal input character,
     no space in output buffer).  */
  if (__builtin_expect (inptr != bytebuf, 1))
    {
      /* We found a new character.  */
      assert (inptr - bytebuf > (state->__count & 7));

      /* Only the bytes which did not come from the state count as
	 consumed input.  */
      *inptrp += inptr - bytebuf - (state->__count & 7);
      *outptrp = outptr;

      result = __GCONV_OK;

      /* Clear the state buffer.  */
#ifdef CLEAR_STATE
      CLEAR_STATE;
#else
      state->__count &= ~7;
#endif
    }
  else if (result == __GCONV_INCOMPLETE_INPUT)
    {
      /* This can only happen if we have less than MAX_NEEDED_INPUT bytes
	 available.  */
      assert (inend != &bytebuf[MAX_NEEDED_INPUT]);

      *inptrp += inend - bytebuf - (state->__count & 7);
#ifdef STORE_REST
      inptrp = &inptr;

      STORE_REST
#else
      /* We don't have enough input for another complete input
	 character.  BYTEBUF holds the old state bytes followed by the
	 new input, so it replaces the state contents from offset zero
	 and the byte count in the low three bits of `__count' is
	 brought up to date.  */
      {
	size_t pending = inend - inptr;

	assert (pending <= sizeof (state->__value.__wchb));
	state->__count = (state->__count & ~7) | (int) pending;
	for (inlen = 0; inlen < pending; ++inlen)
	  state->__value.__wchb[inlen] = inptr[inlen];
	inptr = inend;
      }
#endif
    }

  return result;
}
# undef SINGLE
# undef SINGLE2
#endif
446 
447 
#if defined ONEBYTE_BODY
/* Shortcut function for btowc: converts the single byte C of the current
   character set.  The including module supplies the complete function
   body, braces included, as ONEBYTE_BODY.  FROM_ONEBYTE is defined to the
   function's name.  */
static wint_t
gconv_btowc (struct __gconv_step *step,
	     unsigned char c)
  ONEBYTE_BODY
# define FROM_ONEBYTE gconv_btowc
#endif
455 
456 
/* We remove the macro definitions so that we can include this file again
   for the definition of another function.  STORE_REST and FROM_ONEBYTE
   are not removed here.  */

/* Size parameters.  */
#undef MIN_NEEDED_INPUT
#undef MAX_NEEDED_INPUT
#undef MIN_NEEDED_OUTPUT
#undef MAX_NEEDED_OUTPUT

/* Name, body and parameter handling of the generated function.  */
#undef LOOPFCT
#undef BODY
#undef EXTRA_LOOP_DECLS
#undef INIT_PARAMS
#undef UPDATE_PARAMS
#undef ONEBYTE_BODY

/* State handling hooks and the requests for local variables.  */
#undef UNPACK_BYTES
#undef CLEAR_STATE
#undef LOOP_NEED_STATE
#undef LOOP_NEED_FLAGS
#undef LOOP_NEED_DATA

/* Memory access helpers.  */
#undef get16
#undef get32
#undef put16
#undef put32
#undef unaligned
480