1 /* Simple transformations functions. 2 Copyright (C) 1997-2003, 2004 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21 #include <byteswap.h> 22 #include <dlfcn.h> 23 #include <endian.h> 24 #include <errno.h> 25 #include <gconv.h> 26 #include <stdint.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <wchar.h> 30 #include <sys/param.h> 31 #include <gconv_int.h> 32 33 #define BUILTIN_ALIAS(s1, s2) /* nothing */ 34 #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \ 35 MinF, MaxF, MinT, MaxT) \ 36 extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \ 37 __const unsigned char **, __const unsigned char *, \ 38 unsigned char **, size_t *, int, int); 39 #include "gconv_builtin.h" 40 41 42 #ifndef EILSEQ 43 # define EILSEQ EINVAL 44 #endif 45 46 47 /* Specialized conversion function for a single byte to INTERNAL, recognizing 48 only ASCII characters. */ 49 wint_t 50 __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c) 51 { 52 if (c < 0x80) 53 return c; 54 else 55 return WEOF; 56 } 57 58 59 /* Transform from the internal, UCS4-like format, to UCS4. The 60 difference between the internal ucs4 format and the real UCS4 61 format is, if any, the endianess. The Unicode/ISO 10646 says that 62 unless some higher protocol specifies it differently, the byte 63 order is big endian.*/ 64 #define DEFINE_INIT 0 65 #define DEFINE_FINI 0 66 #define MIN_NEEDED_FROM 4 67 #define MIN_NEEDED_TO 4 68 #define FROM_DIRECTION 1 69 #define FROM_LOOP internal_ucs4_loop 70 #define TO_LOOP internal_ucs4_loop /* This is not used. */ 71 #define FUNCTION_NAME __gconv_transform_internal_ucs4 72 73 74 static inline int 75 __attribute ((always_inline)) 76 internal_ucs4_loop (struct __gconv_step *step, 77 struct __gconv_step_data *step_data, 78 const unsigned char **inptrp, const unsigned char *inend, 79 unsigned char **outptrp, unsigned char *outend, 80 size_t *irreversible) 81 { 82 const unsigned char *inptr = *inptrp; 83 unsigned char *outptr = *outptrp; 84 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 85 int result; 86 87 #if __BYTE_ORDER == __LITTLE_ENDIAN 88 /* Sigh, we have to do some real work. */ 89 size_t cnt; 90 uint32_t *outptr32 = (uint32_t *) outptr; 91 92 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 93 *outptr32++ = bswap_32 (*(const uint32_t *) inptr); 94 95 *inptrp = inptr; 96 *outptrp = (unsigned char *) outptr32; 97 #elif __BYTE_ORDER == __BIG_ENDIAN 98 /* Simply copy the data. */ 99 *inptrp = inptr + n_convert * 4; 100 *outptrp = __mempcpy (outptr, inptr, n_convert * 4); 101 #else 102 # error "This endianess is not supported." 103 #endif 104 105 /* Determine the status. */ 106 if (*inptrp == inend) 107 result = __GCONV_EMPTY_INPUT; 108 else if (*outptrp + 4 > outend) 109 result = __GCONV_FULL_OUTPUT; 110 else 111 result = __GCONV_INCOMPLETE_INPUT; 112 113 return result; 114 } 115 116 #ifndef _STRING_ARCH_unaligned 117 static inline int 118 __attribute ((always_inline)) 119 internal_ucs4_loop_unaligned (struct __gconv_step *step, 120 struct __gconv_step_data *step_data, 121 const unsigned char **inptrp, 122 const unsigned char *inend, 123 unsigned char **outptrp, unsigned char *outend, 124 size_t *irreversible) 125 { 126 const unsigned char *inptr = *inptrp; 127 unsigned char *outptr = *outptrp; 128 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 129 int result; 130 131 # if __BYTE_ORDER == __LITTLE_ENDIAN 132 /* Sigh, we have to do some real work. */ 133 size_t cnt; 134 135 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4) 136 { 137 outptr[0] = inptr[3]; 138 outptr[1] = inptr[2]; 139 outptr[2] = inptr[1]; 140 outptr[3] = inptr[0]; 141 } 142 143 *inptrp = inptr; 144 *outptrp = outptr; 145 # elif __BYTE_ORDER == __BIG_ENDIAN 146 /* Simply copy the data. */ 147 *inptrp = inptr + n_convert * 4; 148 *outptrp = __mempcpy (outptr, inptr, n_convert * 4); 149 # else 150 # error "This endianess is not supported." 151 # endif 152 153 /* Determine the status. */ 154 if (*inptrp == inend) 155 result = __GCONV_EMPTY_INPUT; 156 else if (*outptrp + 4 > outend) 157 result = __GCONV_FULL_OUTPUT; 158 else 159 result = __GCONV_INCOMPLETE_INPUT; 160 161 return result; 162 } 163 #endif 164 165 166 static inline int 167 __attribute ((always_inline)) 168 internal_ucs4_loop_single (struct __gconv_step *step, 169 struct __gconv_step_data *step_data, 170 const unsigned char **inptrp, 171 const unsigned char *inend, 172 unsigned char **outptrp, unsigned char *outend, 173 size_t *irreversible) 174 { 175 mbstate_t *state = step_data->__statep; 176 size_t cnt = state->__count & 7; 177 178 while (*inptrp < inend && cnt < 4) 179 state->__value.__wchb[cnt++] = *(*inptrp)++; 180 181 if (__builtin_expect (cnt < 4, 0)) 182 { 183 /* Still not enough bytes. Store the ones in the input buffer. */ 184 state->__count &= ~7; 185 state->__count |= cnt; 186 187 return __GCONV_INCOMPLETE_INPUT; 188 } 189 190 #if __BYTE_ORDER == __LITTLE_ENDIAN 191 (*outptrp)[0] = state->__value.__wchb[3]; 192 (*outptrp)[1] = state->__value.__wchb[2]; 193 (*outptrp)[2] = state->__value.__wchb[1]; 194 (*outptrp)[3] = state->__value.__wchb[0]; 195 196 #elif __BYTE_ORDER == __BIG_ENDIAN 197 /* XXX unaligned */ 198 (*outptrp)[0] = state->__value.__wchb[0]; 199 (*outptrp)[1] = state->__value.__wchb[1]; 200 (*outptrp)[2] = state->__value.__wchb[2]; 201 (*outptrp)[3] = state->__value.__wchb[3]; 202 #else 203 # error "This endianess is not supported." 204 #endif 205 *outptrp += 4; 206 207 /* Clear the state buffer. */ 208 state->__count &= ~7; 209 210 return __GCONV_OK; 211 } 212 213 #include <iconv/skeleton.c> 214 215 216 /* Transform from UCS4 to the internal, UCS4-like format. Unlike 217 for the other direction we have to check for correct values here. */ 218 #define DEFINE_INIT 0 219 #define DEFINE_FINI 0 220 #define MIN_NEEDED_FROM 4 221 #define MIN_NEEDED_TO 4 222 #define FROM_DIRECTION 1 223 #define FROM_LOOP ucs4_internal_loop 224 #define TO_LOOP ucs4_internal_loop /* This is not used. */ 225 #define FUNCTION_NAME __gconv_transform_ucs4_internal 226 227 228 static inline int 229 __attribute ((always_inline)) 230 ucs4_internal_loop (struct __gconv_step *step, 231 struct __gconv_step_data *step_data, 232 const unsigned char **inptrp, const unsigned char *inend, 233 unsigned char **outptrp, unsigned char *outend, 234 size_t *irreversible) 235 { 236 int flags = step_data->__flags; 237 const unsigned char *inptr = *inptrp; 238 unsigned char *outptr = *outptrp; 239 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 240 int result; 241 size_t cnt; 242 243 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 244 { 245 uint32_t inval; 246 247 #if __BYTE_ORDER == __LITTLE_ENDIAN 248 inval = bswap_32 (*(const uint32_t *) inptr); 249 #else 250 inval = *(const uint32_t *) inptr; 251 #endif 252 253 if (__builtin_expect (inval > 0x7fffffff, 0)) 254 { 255 /* The value is too large. We don't try transliteration here since 256 this is not an error because of the lack of possibilities to 257 represent the result. This is a genuine bug in the input since 258 UCS4 does not allow such values. */ 259 if (irreversible == NULL) 260 /* We are transliterating, don't try to correct anything. */ 261 return __GCONV_ILLEGAL_INPUT; 262 263 if (flags & __GCONV_IGNORE_ERRORS) 264 { 265 /* Just ignore this character. */ 266 ++*irreversible; 267 continue; 268 } 269 270 *inptrp = inptr; 271 *outptrp = outptr; 272 return __GCONV_ILLEGAL_INPUT; 273 } 274 275 *((uint32_t *) outptr) = inval; 276 outptr += sizeof (uint32_t); 277 } 278 279 *inptrp = inptr; 280 *outptrp = outptr; 281 282 /* Determine the status. */ 283 if (*inptrp == inend) 284 result = __GCONV_EMPTY_INPUT; 285 else if (*outptrp + 4 > outend) 286 result = __GCONV_FULL_OUTPUT; 287 else 288 result = __GCONV_INCOMPLETE_INPUT; 289 290 return result; 291 } 292 293 #ifndef _STRING_ARCH_unaligned 294 static inline int 295 __attribute ((always_inline)) 296 ucs4_internal_loop_unaligned (struct __gconv_step *step, 297 struct __gconv_step_data *step_data, 298 const unsigned char **inptrp, 299 const unsigned char *inend, 300 unsigned char **outptrp, unsigned char *outend, 301 size_t *irreversible) 302 { 303 int flags = step_data->__flags; 304 const unsigned char *inptr = *inptrp; 305 unsigned char *outptr = *outptrp; 306 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 307 int result; 308 size_t cnt; 309 310 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 311 { 312 if (__builtin_expect (inptr[0] > 0x80, 0)) 313 { 314 /* The value is too large. We don't try transliteration here since 315 this is not an error because of the lack of possibilities to 316 represent the result. This is a genuine bug in the input since 317 UCS4 does not allow such values. */ 318 if (irreversible == NULL) 319 /* We are transliterating, don't try to correct anything. */ 320 return __GCONV_ILLEGAL_INPUT; 321 322 if (flags & __GCONV_IGNORE_ERRORS) 323 { 324 /* Just ignore this character. */ 325 ++*irreversible; 326 continue; 327 } 328 329 *inptrp = inptr; 330 *outptrp = outptr; 331 return __GCONV_ILLEGAL_INPUT; 332 } 333 334 # if __BYTE_ORDER == __LITTLE_ENDIAN 335 outptr[3] = inptr[0]; 336 outptr[2] = inptr[1]; 337 outptr[1] = inptr[2]; 338 outptr[0] = inptr[3]; 339 # else 340 outptr[0] = inptr[0]; 341 outptr[1] = inptr[1]; 342 outptr[2] = inptr[2]; 343 outptr[3] = inptr[3]; 344 # endif 345 outptr += 4; 346 } 347 348 *inptrp = inptr; 349 *outptrp = outptr; 350 351 /* Determine the status. */ 352 if (*inptrp == inend) 353 result = __GCONV_EMPTY_INPUT; 354 else if (*outptrp + 4 > outend) 355 result = __GCONV_FULL_OUTPUT; 356 else 357 result = __GCONV_INCOMPLETE_INPUT; 358 359 return result; 360 } 361 #endif 362 363 364 static inline int 365 __attribute ((always_inline)) 366 ucs4_internal_loop_single (struct __gconv_step *step, 367 struct __gconv_step_data *step_data, 368 const unsigned char **inptrp, 369 const unsigned char *inend, 370 unsigned char **outptrp, unsigned char *outend, 371 size_t *irreversible) 372 { 373 mbstate_t *state = step_data->__statep; 374 int flags = step_data->__flags; 375 size_t cnt = state->__count & 7; 376 377 while (*inptrp < inend && cnt < 4) 378 state->__value.__wchb[cnt++] = *(*inptrp)++; 379 380 if (__builtin_expect (cnt < 4, 0)) 381 { 382 /* Still not enough bytes. Store the ones in the input buffer. */ 383 state->__count &= ~7; 384 state->__count |= cnt; 385 386 return __GCONV_INCOMPLETE_INPUT; 387 } 388 389 if (__builtin_expect (((unsigned char *) state->__value.__wchb)[0] > 0x80, 390 0)) 391 { 392 /* The value is too large. We don't try transliteration here since 393 this is not an error because of the lack of possibilities to 394 represent the result. This is a genuine bug in the input since 395 UCS4 does not allow such values. */ 396 if (!(flags & __GCONV_IGNORE_ERRORS)) 397 { 398 *inptrp -= cnt - (state->__count & 7); 399 return __GCONV_ILLEGAL_INPUT; 400 } 401 } 402 else 403 { 404 #if __BYTE_ORDER == __LITTLE_ENDIAN 405 (*outptrp)[0] = state->__value.__wchb[3]; 406 (*outptrp)[1] = state->__value.__wchb[2]; 407 (*outptrp)[2] = state->__value.__wchb[1]; 408 (*outptrp)[3] = state->__value.__wchb[0]; 409 #elif __BYTE_ORDER == __BIG_ENDIAN 410 (*outptrp)[0] = state->__value.__wchb[0]; 411 (*outptrp)[1] = state->__value.__wchb[1]; 412 (*outptrp)[2] = state->__value.__wchb[2]; 413 (*outptrp)[3] = state->__value.__wchb[3]; 414 #endif 415 416 *outptrp += 4; 417 } 418 419 /* Clear the state buffer. */ 420 state->__count &= ~7; 421 422 return __GCONV_OK; 423 } 424 425 #include <iconv/skeleton.c> 426 427 428 /* Similarly for the little endian form. */ 429 #define DEFINE_INIT 0 430 #define DEFINE_FINI 0 431 #define MIN_NEEDED_FROM 4 432 #define MIN_NEEDED_TO 4 433 #define FROM_DIRECTION 1 434 #define FROM_LOOP internal_ucs4le_loop 435 #define TO_LOOP internal_ucs4le_loop /* This is not used. */ 436 #define FUNCTION_NAME __gconv_transform_internal_ucs4le 437 438 439 static inline int 440 __attribute ((always_inline)) 441 internal_ucs4le_loop (struct __gconv_step *step, 442 struct __gconv_step_data *step_data, 443 const unsigned char **inptrp, const unsigned char *inend, 444 unsigned char **outptrp, unsigned char *outend, 445 size_t *irreversible) 446 { 447 const unsigned char *inptr = *inptrp; 448 unsigned char *outptr = *outptrp; 449 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 450 int result; 451 452 #if __BYTE_ORDER == __BIG_ENDIAN 453 /* Sigh, we have to do some real work. */ 454 size_t cnt; 455 uint32_t *outptr32 = (uint32_t *) outptr; 456 457 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 458 *outptr32++ = bswap_32 (*(const uint32_t *) inptr); 459 outptr = (unsigned char *) outptr32; 460 461 *inptrp = inptr; 462 *outptrp = outptr; 463 #elif __BYTE_ORDER == __LITTLE_ENDIAN 464 /* Simply copy the data. */ 465 *inptrp = inptr + n_convert * 4; 466 *outptrp = __mempcpy (outptr, inptr, n_convert * 4); 467 #else 468 # error "This endianess is not supported." 469 #endif 470 471 /* Determine the status. */ 472 if (*inptrp == inend) 473 result = __GCONV_EMPTY_INPUT; 474 else if (*outptrp + 4 > outend) 475 result = __GCONV_FULL_OUTPUT; 476 else 477 result = __GCONV_INCOMPLETE_INPUT; 478 479 return result; 480 } 481 482 #ifndef _STRING_ARCH_unaligned 483 static inline int 484 __attribute ((always_inline)) 485 internal_ucs4le_loop_unaligned (struct __gconv_step *step, 486 struct __gconv_step_data *step_data, 487 const unsigned char **inptrp, 488 const unsigned char *inend, 489 unsigned char **outptrp, unsigned char *outend, 490 size_t *irreversible) 491 { 492 const unsigned char *inptr = *inptrp; 493 unsigned char *outptr = *outptrp; 494 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 495 int result; 496 497 # if __BYTE_ORDER == __BIG_ENDIAN 498 /* Sigh, we have to do some real work. */ 499 size_t cnt; 500 501 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4) 502 { 503 outptr[0] = inptr[3]; 504 outptr[1] = inptr[2]; 505 outptr[2] = inptr[1]; 506 outptr[3] = inptr[0]; 507 } 508 509 *inptrp = inptr; 510 *outptrp = outptr; 511 # elif __BYTE_ORDER == __LITTLE_ENDIAN 512 /* Simply copy the data. */ 513 *inptrp = inptr + n_convert * 4; 514 *outptrp = __mempcpy (outptr, inptr, n_convert * 4); 515 # else 516 # error "This endianess is not supported." 517 # endif 518 519 /* Determine the status. */ 520 if (*inptrp == inend) 521 result = __GCONV_EMPTY_INPUT; 522 else if (*inptrp + 4 > inend) 523 result = __GCONV_INCOMPLETE_INPUT; 524 else 525 { 526 assert (*outptrp + 4 > outend); 527 result = __GCONV_FULL_OUTPUT; 528 } 529 530 return result; 531 } 532 #endif 533 534 535 static inline int 536 __attribute ((always_inline)) 537 internal_ucs4le_loop_single (struct __gconv_step *step, 538 struct __gconv_step_data *step_data, 539 const unsigned char **inptrp, 540 const unsigned char *inend, 541 unsigned char **outptrp, unsigned char *outend, 542 size_t *irreversible) 543 { 544 mbstate_t *state = step_data->__statep; 545 size_t cnt = state->__count & 7; 546 547 while (*inptrp < inend && cnt < 4) 548 state->__value.__wchb[cnt++] = *(*inptrp)++; 549 550 if (__builtin_expect (cnt < 4, 0)) 551 { 552 /* Still not enough bytes. Store the ones in the input buffer. */ 553 state->__count &= ~7; 554 state->__count |= cnt; 555 556 return __GCONV_INCOMPLETE_INPUT; 557 } 558 559 #if __BYTE_ORDER == __BIG_ENDIAN 560 (*outptrp)[0] = state->__value.__wchb[3]; 561 (*outptrp)[1] = state->__value.__wchb[2]; 562 (*outptrp)[2] = state->__value.__wchb[1]; 563 (*outptrp)[3] = state->__value.__wchb[0]; 564 565 #else 566 /* XXX unaligned */ 567 (*outptrp)[0] = state->__value.__wchb[0]; 568 (*outptrp)[1] = state->__value.__wchb[1]; 569 (*outptrp)[2] = state->__value.__wchb[2]; 570 (*outptrp)[3] = state->__value.__wchb[3]; 571 572 #endif 573 574 *outptrp += 4; 575 576 /* Clear the state buffer. */ 577 state->__count &= ~7; 578 579 return __GCONV_OK; 580 } 581 582 #include <iconv/skeleton.c> 583 584 585 /* And finally from UCS4-LE to the internal encoding. */ 586 #define DEFINE_INIT 0 587 #define DEFINE_FINI 0 588 #define MIN_NEEDED_FROM 4 589 #define MIN_NEEDED_TO 4 590 #define FROM_DIRECTION 1 591 #define FROM_LOOP ucs4le_internal_loop 592 #define TO_LOOP ucs4le_internal_loop /* This is not used. */ 593 #define FUNCTION_NAME __gconv_transform_ucs4le_internal 594 595 596 static inline int 597 __attribute ((always_inline)) 598 ucs4le_internal_loop (struct __gconv_step *step, 599 struct __gconv_step_data *step_data, 600 const unsigned char **inptrp, const unsigned char *inend, 601 unsigned char **outptrp, unsigned char *outend, 602 size_t *irreversible) 603 { 604 int flags = step_data->__flags; 605 const unsigned char *inptr = *inptrp; 606 unsigned char *outptr = *outptrp; 607 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 608 int result; 609 size_t cnt; 610 611 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 612 { 613 uint32_t inval; 614 615 #if __BYTE_ORDER == __BIG_ENDIAN 616 inval = bswap_32 (*(const uint32_t *) inptr); 617 #else 618 inval = *(const uint32_t *) inptr; 619 #endif 620 621 if (__builtin_expect (inval > 0x7fffffff, 0)) 622 { 623 /* The value is too large. We don't try transliteration here since 624 this is not an error because of the lack of possibilities to 625 represent the result. This is a genuine bug in the input since 626 UCS4 does not allow such values. */ 627 if (irreversible == NULL) 628 /* We are transliterating, don't try to correct anything. */ 629 return __GCONV_ILLEGAL_INPUT; 630 631 if (flags & __GCONV_IGNORE_ERRORS) 632 { 633 /* Just ignore this character. */ 634 ++*irreversible; 635 continue; 636 } 637 638 return __GCONV_ILLEGAL_INPUT; 639 } 640 641 *((uint32_t *) outptr) = inval; 642 outptr += sizeof (uint32_t); 643 } 644 645 *inptrp = inptr; 646 *outptrp = outptr; 647 648 /* Determine the status. */ 649 if (*inptrp == inend) 650 result = __GCONV_EMPTY_INPUT; 651 else if (*inptrp + 4 > inend) 652 result = __GCONV_INCOMPLETE_INPUT; 653 else 654 { 655 assert (*outptrp + 4 > outend); 656 result = __GCONV_FULL_OUTPUT; 657 } 658 659 return result; 660 } 661 662 #ifndef _STRING_ARCH_unaligned 663 static inline int 664 __attribute ((always_inline)) 665 ucs4le_internal_loop_unaligned (struct __gconv_step *step, 666 struct __gconv_step_data *step_data, 667 const unsigned char **inptrp, 668 const unsigned char *inend, 669 unsigned char **outptrp, unsigned char *outend, 670 size_t *irreversible) 671 { 672 int flags = step_data->__flags; 673 const unsigned char *inptr = *inptrp; 674 unsigned char *outptr = *outptrp; 675 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 676 int result; 677 size_t cnt; 678 679 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 680 { 681 if (__builtin_expect (inptr[3] > 0x80, 0)) 682 { 683 /* The value is too large. We don't try transliteration here since 684 this is not an error because of the lack of possibilities to 685 represent the result. This is a genuine bug in the input since 686 UCS4 does not allow such values. */ 687 if (irreversible == NULL) 688 /* We are transliterating, don't try to correct anything. */ 689 return __GCONV_ILLEGAL_INPUT; 690 691 if (flags & __GCONV_IGNORE_ERRORS) 692 { 693 /* Just ignore this character. */ 694 ++*irreversible; 695 continue; 696 } 697 698 *inptrp = inptr; 699 *outptrp = outptr; 700 return __GCONV_ILLEGAL_INPUT; 701 } 702 703 # if __BYTE_ORDER == __BIG_ENDIAN 704 outptr[3] = inptr[0]; 705 outptr[2] = inptr[1]; 706 outptr[1] = inptr[2]; 707 outptr[0] = inptr[3]; 708 # else 709 outptr[0] = inptr[0]; 710 outptr[1] = inptr[1]; 711 outptr[2] = inptr[2]; 712 outptr[3] = inptr[3]; 713 # endif 714 715 outptr += 4; 716 } 717 718 *inptrp = inptr; 719 *outptrp = outptr; 720 721 /* Determine the status. */ 722 if (*inptrp == inend) 723 result = __GCONV_EMPTY_INPUT; 724 else if (*inptrp + 4 > inend) 725 result = __GCONV_INCOMPLETE_INPUT; 726 else 727 { 728 assert (*outptrp + 4 > outend); 729 result = __GCONV_FULL_OUTPUT; 730 } 731 732 return result; 733 } 734 #endif 735 736 737 static inline int 738 __attribute ((always_inline)) 739 ucs4le_internal_loop_single (struct __gconv_step *step, 740 struct __gconv_step_data *step_data, 741 const unsigned char **inptrp, 742 const unsigned char *inend, 743 unsigned char **outptrp, unsigned char *outend, 744 size_t *irreversible) 745 { 746 mbstate_t *state = step_data->__statep; 747 int flags = step_data->__flags; 748 size_t cnt = state->__count & 7; 749 750 while (*inptrp < inend && cnt < 4) 751 state->__value.__wchb[cnt++] = *(*inptrp)++; 752 753 if (__builtin_expect (cnt < 4, 0)) 754 { 755 /* Still not enough bytes. Store the ones in the input buffer. */ 756 state->__count &= ~7; 757 state->__count |= cnt; 758 759 return __GCONV_INCOMPLETE_INPUT; 760 } 761 762 if (__builtin_expect (((unsigned char *) state->__value.__wchb)[3] > 0x80, 763 0)) 764 { 765 /* The value is too large. We don't try transliteration here since 766 this is not an error because of the lack of possibilities to 767 represent the result. This is a genuine bug in the input since 768 UCS4 does not allow such values. */ 769 if (!(flags & __GCONV_IGNORE_ERRORS)) 770 return __GCONV_ILLEGAL_INPUT; 771 } 772 else 773 { 774 #if __BYTE_ORDER == __BIG_ENDIAN 775 (*outptrp)[0] = state->__value.__wchb[3]; 776 (*outptrp)[1] = state->__value.__wchb[2]; 777 (*outptrp)[2] = state->__value.__wchb[1]; 778 (*outptrp)[3] = state->__value.__wchb[0]; 779 #else 780 (*outptrp)[0] = state->__value.__wchb[0]; 781 (*outptrp)[1] = state->__value.__wchb[1]; 782 (*outptrp)[2] = state->__value.__wchb[2]; 783 (*outptrp)[3] = state->__value.__wchb[3]; 784 #endif 785 786 *outptrp += 4; 787 } 788 789 /* Clear the state buffer. */ 790 state->__count &= ~7; 791 792 return __GCONV_OK; 793 } 794 795 #include <iconv/skeleton.c> 796 797 798 /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */ 799 #define DEFINE_INIT 0 800 #define DEFINE_FINI 0 801 #define MIN_NEEDED_FROM 1 802 #define MIN_NEEDED_TO 4 803 #define FROM_DIRECTION 1 804 #define FROM_LOOP ascii_internal_loop 805 #define TO_LOOP ascii_internal_loop /* This is not used. */ 806 #define FUNCTION_NAME __gconv_transform_ascii_internal 807 #define ONE_DIRECTION 1 808 809 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 810 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 811 #define LOOPFCT FROM_LOOP 812 #define BODY \ 813 { \ 814 if (__builtin_expect (*inptr > '\x7f', 0)) \ 815 { \ 816 /* The value is too large. We don't try transliteration here since \ 817 this is not an error because of the lack of possibilities to \ 818 represent the result. This is a genuine bug in the input since \ 819 ASCII does not allow such values. */ \ 820 STANDARD_FROM_LOOP_ERR_HANDLER (1); \ 821 } \ 822 else \ 823 /* It's an one byte sequence. */ \ 824 *((uint32_t *) outptr) = *inptr++; \ 825 outptr += sizeof (uint32_t); \ 826 } 827 #define LOOP_NEED_FLAGS 828 #include <iconv/loop.c> 829 #include <iconv/skeleton.c> 830 831 832 /* Convert from the internal (UCS4-like) format to ISO 646-IRV. */ 833 #define DEFINE_INIT 0 834 #define DEFINE_FINI 0 835 #define MIN_NEEDED_FROM 4 836 #define MIN_NEEDED_TO 1 837 #define FROM_DIRECTION 1 838 #define FROM_LOOP internal_ascii_loop 839 #define TO_LOOP internal_ascii_loop /* This is not used. */ 840 #define FUNCTION_NAME __gconv_transform_internal_ascii 841 #define ONE_DIRECTION 1 842 843 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 844 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 845 #define LOOPFCT FROM_LOOP 846 #define BODY \ 847 { \ 848 if (__builtin_expect (*((const uint32_t *) inptr) > 0x7f, 0)) \ 849 { \ 850 UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4); \ 851 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 852 } \ 853 else \ 854 /* It's an one byte sequence. */ \ 855 *outptr++ = *((const uint32_t *) inptr); \ 856 inptr += sizeof (uint32_t); \ 857 } 858 #define LOOP_NEED_FLAGS 859 #include <iconv/loop.c> 860 #include <iconv/skeleton.c> 861 862 863 /* Convert from the internal (UCS4-like) format to UTF-8. */ 864 #define DEFINE_INIT 0 865 #define DEFINE_FINI 0 866 #define MIN_NEEDED_FROM 4 867 #define MIN_NEEDED_TO 1 868 #define MAX_NEEDED_TO 6 869 #define FROM_DIRECTION 1 870 #define FROM_LOOP internal_utf8_loop 871 #define TO_LOOP internal_utf8_loop /* This is not used. */ 872 #define FUNCTION_NAME __gconv_transform_internal_utf8 873 #define ONE_DIRECTION 1 874 875 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 876 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 877 #define MAX_NEEDED_OUTPUT MAX_NEEDED_TO 878 #define LOOPFCT FROM_LOOP 879 #define BODY \ 880 { \ 881 uint32_t wc = *((const uint32_t *) inptr); \ 882 \ 883 if (wc < 0x80) \ 884 /* It's an one byte sequence. */ \ 885 *outptr++ = (unsigned char) wc; \ 886 else if (__builtin_expect (wc <= 0x7fffffff, 1)) \ 887 { \ 888 size_t step; \ 889 char *start; \ 890 \ 891 for (step = 2; step < 6; ++step) \ 892 if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \ 893 break; \ 894 \ 895 if (__builtin_expect (outptr + step > outend, 0)) \ 896 { \ 897 /* Too long. */ \ 898 result = __GCONV_FULL_OUTPUT; \ 899 break; \ 900 } \ 901 \ 902 start = outptr; \ 903 *outptr = (unsigned char) (~0xff >> step); \ 904 outptr += step; \ 905 do \ 906 { \ 907 start[--step] = 0x80 | (wc & 0x3f); \ 908 wc >>= 6; \ 909 } \ 910 while (step > 1); \ 911 start[0] |= wc; \ 912 } \ 913 else \ 914 { \ 915 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 916 } \ 917 \ 918 inptr += 4; \ 919 } 920 #define LOOP_NEED_FLAGS 921 #include <iconv/loop.c> 922 #include <iconv/skeleton.c> 923 924 925 /* Convert from UTF-8 to the internal (UCS4-like) format. */ 926 #define DEFINE_INIT 0 927 #define DEFINE_FINI 0 928 #define MIN_NEEDED_FROM 1 929 #define MAX_NEEDED_FROM 6 930 #define MIN_NEEDED_TO 4 931 #define FROM_DIRECTION 1 932 #define FROM_LOOP utf8_internal_loop 933 #define TO_LOOP utf8_internal_loop /* This is not used. */ 934 #define FUNCTION_NAME __gconv_transform_utf8_internal 935 #define ONE_DIRECTION 1 936 937 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 938 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM 939 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 940 #define LOOPFCT FROM_LOOP 941 #define BODY \ 942 { \ 943 uint32_t ch; \ 944 uint_fast32_t cnt; \ 945 uint_fast32_t i; \ 946 \ 947 /* Next input byte. */ \ 948 ch = *inptr; \ 949 \ 950 if (ch < 0x80) \ 951 { \ 952 /* One byte sequence. */ \ 953 cnt = 1; \ 954 ++inptr; \ 955 } \ 956 else \ 957 { \ 958 if (ch >= 0xc2 && ch < 0xe0) \ 959 { \ 960 /* We expect two bytes. The first byte cannot be 0xc0 or 0xc1, \ 961 otherwise the wide character could have been represented \ 962 using a single byte. */ \ 963 cnt = 2; \ 964 ch &= 0x1f; \ 965 } \ 966 else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ 967 { \ 968 /* We expect three bytes. */ \ 969 cnt = 3; \ 970 ch &= 0x0f; \ 971 } \ 972 else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ 973 { \ 974 /* We expect four bytes. */ \ 975 cnt = 4; \ 976 ch &= 0x07; \ 977 } \ 978 else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ 979 { \ 980 /* We expect five bytes. */ \ 981 cnt = 5; \ 982 ch &= 0x03; \ 983 } \ 984 else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \ 985 { \ 986 /* We expect six bytes. */ \ 987 cnt = 6; \ 988 ch &= 0x01; \ 989 } \ 990 else \ 991 { \ 992 /* Search the end of this ill-formed UTF-8 character. This \ 993 is the next byte with (x & 0xc0) != 0x80. */ \ 994 i = 0; \ 995 do \ 996 ++i; \ 997 while (inptr + i < inend \ 998 && (*(inptr + i) & 0xc0) == 0x80 \ 999 && i < 5); \ 1000 \ 1001 errout: \ 1002 STANDARD_FROM_LOOP_ERR_HANDLER (i); \ 1003 } \ 1004 \ 1005 if (__builtin_expect (inptr + cnt > inend, 0)) \ 1006 { \ 1007 /* We don't have enough input. But before we report that check \ 1008 that all the bytes are correct. */ \ 1009 for (i = 1; inptr + i < inend; ++i) \ 1010 if ((inptr[i] & 0xc0) != 0x80) \ 1011 break; \ 1012 \ 1013 if (__builtin_expect (inptr + i == inend, 1)) \ 1014 { \ 1015 result = __GCONV_INCOMPLETE_INPUT; \ 1016 break; \ 1017 } \ 1018 \ 1019 goto errout; \ 1020 } \ 1021 \ 1022 /* Read the possible remaining bytes. */ \ 1023 for (i = 1; i < cnt; ++i) \ 1024 { \ 1025 uint32_t byte = inptr[i]; \ 1026 \ 1027 if ((byte & 0xc0) != 0x80) \ 1028 /* This is an illegal encoding. */ \ 1029 break; \ 1030 \ 1031 ch <<= 6; \ 1032 ch |= byte & 0x3f; \ 1033 } \ 1034 \ 1035 /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \ 1036 If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \ 1037 have been represented with fewer than cnt bytes. */ \ 1038 if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \ 1039 { \ 1040 /* This is an illegal encoding. */ \ 1041 goto errout; \ 1042 } \ 1043 \ 1044 inptr += cnt; \ 1045 } \ 1046 \ 1047 /* Now adjust the pointers and store the result. */ \ 1048 *((uint32_t *) outptr) = ch; \ 1049 outptr += sizeof (uint32_t); \ 1050 } 1051 #define LOOP_NEED_FLAGS 1052 1053 #define STORE_REST \ 1054 { \ 1055 /* We store the remaining bytes while converting them into the UCS4 \ 1056 format. We can assume that the first byte in the buffer is \ 1057 correct and that it requires a larger number of bytes than there \ 1058 are in the input buffer. */ \ 1059 wint_t ch = **inptrp; \ 1060 size_t cnt, r; \ 1061 \ 1062 state->__count = inend - *inptrp; \ 1063 \ 1064 if (ch >= 0xc2 && ch < 0xe0) \ 1065 { \ 1066 /* We expect two bytes. The first byte cannot be 0xc0 or \ 1067 0xc1, otherwise the wide character could have been \ 1068 represented using a single byte. */ \ 1069 cnt = 2; \ 1070 ch &= 0x1f; \ 1071 } \ 1072 else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ 1073 { \ 1074 /* We expect three bytes. */ \ 1075 cnt = 3; \ 1076 ch &= 0x0f; \ 1077 } \ 1078 else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ 1079 { \ 1080 /* We expect four bytes. */ \ 1081 cnt = 4; \ 1082 ch &= 0x07; \ 1083 } \ 1084 else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ 1085 { \ 1086 /* We expect five bytes. */ \ 1087 cnt = 5; \ 1088 ch &= 0x03; \ 1089 } \ 1090 else \ 1091 { \ 1092 /* We expect six bytes. */ \ 1093 cnt = 6; \ 1094 ch &= 0x01; \ 1095 } \ 1096 \ 1097 /* The first byte is already consumed. */ \ 1098 r = cnt - 1; \ 1099 while (++(*inptrp) < inend) \ 1100 { \ 1101 ch <<= 6; \ 1102 ch |= **inptrp & 0x3f; \ 1103 --r; \ 1104 } \ 1105 \ 1106 /* Shift for the so far missing bytes. */ \ 1107 ch <<= r * 6; \ 1108 \ 1109 /* Store the number of bytes expected for the entire sequence. */ \ 1110 state->__count |= cnt << 8; \ 1111 \ 1112 /* Store the value. */ \ 1113 state->__value.__wch = ch; \ 1114 } 1115 1116 #define UNPACK_BYTES \ 1117 { \ 1118 static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \ 1119 wint_t wch = state->__value.__wch; \ 1120 size_t ntotal = state->__count >> 8; \ 1121 \ 1122 inlen = state->__count & 255; \ 1123 \ 1124 bytebuf[0] = inmask[ntotal - 2]; \ 1125 \ 1126 do \ 1127 { \ 1128 if (--ntotal < inlen) \ 1129 bytebuf[ntotal] = 0x80 | (wch & 0x3f); \ 1130 wch >>= 6; \ 1131 } \ 1132 while (ntotal > 1); \ 1133 \ 1134 bytebuf[0] |= wch; \ 1135 } 1136 1137 #define CLEAR_STATE \ 1138 state->__count = 0 1139 1140 1141 #include <iconv/loop.c> 1142 #include <iconv/skeleton.c> 1143 1144 1145 /* Convert from UCS2 to the internal (UCS4-like) format. */ 1146 #define DEFINE_INIT 0 1147 #define DEFINE_FINI 0 1148 #define MIN_NEEDED_FROM 2 1149 #define MIN_NEEDED_TO 4 1150 #define FROM_DIRECTION 1 1151 #define FROM_LOOP ucs2_internal_loop 1152 #define TO_LOOP ucs2_internal_loop /* This is not used. */ 1153 #define FUNCTION_NAME __gconv_transform_ucs2_internal 1154 #define ONE_DIRECTION 1 1155 1156 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 1157 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 1158 #define LOOPFCT FROM_LOOP 1159 #define BODY \ 1160 { \ 1161 uint16_t u1 = get16 (inptr); \ 1162 \ 1163 if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0)) \ 1164 { \ 1165 /* Surrogate characters in UCS-2 input are not valid. Reject \ 1166 them. (Catching this here is not security relevant.) */ \ 1167 STANDARD_FROM_LOOP_ERR_HANDLER (2); \ 1168 } \ 1169 \ 1170 *((uint32_t *) outptr) = u1; \ 1171 outptr += sizeof (uint32_t); \ 1172 inptr += 2; \ 1173 } 1174 #define LOOP_NEED_FLAGS 1175 #include <iconv/loop.c> 1176 #include <iconv/skeleton.c> 1177 1178 1179 /* Convert from the internal (UCS4-like) format to UCS2. */ 1180 #define DEFINE_INIT 0 1181 #define DEFINE_FINI 0 1182 #define MIN_NEEDED_FROM 4 1183 #define MIN_NEEDED_TO 2 1184 #define FROM_DIRECTION 1 1185 #define FROM_LOOP internal_ucs2_loop 1186 #define TO_LOOP internal_ucs2_loop /* This is not used. */ 1187 #define FUNCTION_NAME __gconv_transform_internal_ucs2 1188 #define ONE_DIRECTION 1 1189 1190 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 1191 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 1192 #define LOOPFCT FROM_LOOP 1193 #define BODY \ 1194 { \ 1195 uint32_t val = *((const uint32_t *) inptr); \ 1196 \ 1197 if (__builtin_expect (val >= 0x10000, 0)) \ 1198 { \ 1199 UNICODE_TAG_HANDLER (val, 4); \ 1200 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 1201 } \ 1202 else if (__builtin_expect (val >= 0xd800 && val < 0xe000, 0)) \ 1203 { \ 1204 /* Surrogate characters in UCS-4 input are not valid. \ 1205 We must catch this, because the UCS-2 output might be \ 1206 interpreted as UTF-16 by other programs. If we let \ 1207 surrogates pass through, attackers could make a security \ 1208 hole exploit by synthesizing any desired plane 1-16 \ 1209 character. */ \ 1210 result = __GCONV_ILLEGAL_INPUT; \ 1211 if (! ignore_errors_p ()) \ 1212 break; \ 1213 inptr += 4; \ 1214 ++*irreversible; \ 1215 continue; \ 1216 } \ 1217 else \ 1218 { \ 1219 put16 (outptr, val); \ 1220 outptr += sizeof (uint16_t); \ 1221 inptr += 4; \ 1222 } \ 1223 } 1224 #define LOOP_NEED_FLAGS 1225 #include <iconv/loop.c> 1226 #include <iconv/skeleton.c> 1227 1228 1229 /* Convert from UCS2 in other endianness to the internal (UCS4-like) format. */ 1230 #define DEFINE_INIT 0 1231 #define DEFINE_FINI 0 1232 #define MIN_NEEDED_FROM 2 1233 #define MIN_NEEDED_TO 4 1234 #define FROM_DIRECTION 1 1235 #define FROM_LOOP ucs2reverse_internal_loop 1236 #define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/ 1237 #define FUNCTION_NAME __gconv_transform_ucs2reverse_internal 1238 #define ONE_DIRECTION 1 1239 1240 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 1241 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 1242 #define LOOPFCT FROM_LOOP 1243 #define BODY \ 1244 { \ 1245 uint16_t u1 = bswap_16 (get16 (inptr)); \ 1246 \ 1247 if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0)) \ 1248 { \ 1249 /* Surrogate characters in UCS-2 input are not valid. Reject \ 1250 them. (Catching this here is not security relevant.) */ \ 1251 if (! ignore_errors_p ()) \ 1252 { \ 1253 result = __GCONV_ILLEGAL_INPUT; \ 1254 break; \ 1255 } \ 1256 inptr += 2; \ 1257 ++*irreversible; \ 1258 continue; \ 1259 } \ 1260 \ 1261 *((uint32_t *) outptr) = u1; \ 1262 outptr += sizeof (uint32_t); \ 1263 inptr += 2; \ 1264 } 1265 #define LOOP_NEED_FLAGS 1266 #include <iconv/loop.c> 1267 #include <iconv/skeleton.c> 1268 1269 1270 /* Convert from the internal (UCS4-like) format to UCS2 in other endianness. */ 1271 #define DEFINE_INIT 0 1272 #define DEFINE_FINI 0 1273 #define MIN_NEEDED_FROM 4 1274 #define MIN_NEEDED_TO 2 1275 #define FROM_DIRECTION 1 1276 #define FROM_LOOP internal_ucs2reverse_loop 1277 #define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/ 1278 #define FUNCTION_NAME __gconv_transform_internal_ucs2reverse 1279 #define ONE_DIRECTION 1 1280 1281 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 1282 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 1283 #define LOOPFCT FROM_LOOP 1284 #define BODY \ 1285 { \ 1286 uint32_t val = *((const uint32_t *) inptr); \ 1287 if (__builtin_expect (val >= 0x10000, 0)) \ 1288 { \ 1289 UNICODE_TAG_HANDLER (val, 4); \ 1290 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 1291 } \ 1292 else if (__builtin_expect (val >= 0xd800 && val < 0xe000, 0)) \ 1293 { \ 1294 /* Surrogate characters in UCS-4 input are not valid. \ 1295 We must catch this, because the UCS-2 output might be \ 1296 interpreted as UTF-16 by other programs. If we let \ 1297 surrogates pass through, attackers could make a security \ 1298 hole exploit by synthesizing any desired plane 1-16 \ 1299 character. */ \ 1300 if (! ignore_errors_p ()) \ 1301 { \ 1302 result = __GCONV_ILLEGAL_INPUT; \ 1303 break; \ 1304 } \ 1305 inptr += 4; \ 1306 ++*irreversible; \ 1307 continue; \ 1308 } \ 1309 else \ 1310 { \ 1311 put16 (outptr, bswap_16 (val)); \ 1312 outptr += sizeof (uint16_t); \ 1313 inptr += 4; \ 1314 } \ 1315 } 1316 #define LOOP_NEED_FLAGS 1317 #include <iconv/loop.c> 1318 #include <iconv/skeleton.c> 1319