1 /* Simple transformations functions. 2 Copyright (C) 1997-2002, 2003 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21 #include <byteswap.h> 22 #include <dlfcn.h> 23 #include <endian.h> 24 #include <errno.h> 25 #include <gconv.h> 26 #include <stdint.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <wchar.h> 30 #include <sys/param.h> 31 #include <gconv_int.h> 32 33 #define BUILTIN_ALIAS(s1, s2) /* nothing */ 34 #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \ 35 MinF, MaxF, MinT, MaxT) \ 36 extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \ 37 __const unsigned char **, __const unsigned char *, \ 38 unsigned char **, size_t *, int, int); 39 #include "gconv_builtin.h" 40 41 42 #ifndef EILSEQ 43 # define EILSEQ EINVAL 44 #endif 45 46 47 /* Specialized conversion function for a single byte to INTERNAL, recognizing 48 only ASCII characters. */ 49 wint_t 50 __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c) 51 { 52 if (c < 0x80) 53 return c; 54 else 55 return WEOF; 56 } 57 58 59 /* Transform from the internal, UCS4-like format, to UCS4. The 60 difference between the internal ucs4 format and the real UCS4 61 format is, if any, the endianess. The Unicode/ISO 10646 says that 62 unless some higher protocol specifies it differently, the byte 63 order is big endian.*/ 64 #define DEFINE_INIT 0 65 #define DEFINE_FINI 0 66 #define MIN_NEEDED_FROM 4 67 #define MIN_NEEDED_TO 4 68 #define FROM_DIRECTION 1 69 #define FROM_LOOP internal_ucs4_loop 70 #define TO_LOOP internal_ucs4_loop /* This is not used. */ 71 #define FUNCTION_NAME __gconv_transform_internal_ucs4 72 73 74 static inline int 75 internal_ucs4_loop (struct __gconv_step *step, 76 struct __gconv_step_data *step_data, 77 const unsigned char **inptrp, const unsigned char *inend, 78 unsigned char **outptrp, unsigned char *outend, 79 size_t *irreversible) 80 { 81 const unsigned char *inptr = *inptrp; 82 unsigned char *outptr = *outptrp; 83 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 84 int result; 85 86 #if __BYTE_ORDER == __LITTLE_ENDIAN 87 /* Sigh, we have to do some real work. */ 88 size_t cnt; 89 90 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 91 *((uint32_t *) outptr)++ = bswap_32 (*(const uint32_t *) inptr); 92 93 *inptrp = inptr; 94 *outptrp = outptr; 95 #elif __BYTE_ORDER == __BIG_ENDIAN 96 /* Simply copy the data. */ 97 *inptrp = inptr + n_convert * 4; 98 *outptrp = __mempcpy (outptr, inptr, n_convert * 4); 99 #else 100 # error "This endianess is not supported." 101 #endif 102 103 /* Determine the status. */ 104 if (*inptrp == inend) 105 result = __GCONV_EMPTY_INPUT; 106 else if (*outptrp + 4 > outend) 107 result = __GCONV_FULL_OUTPUT; 108 else 109 result = __GCONV_INCOMPLETE_INPUT; 110 111 return result; 112 } 113 114 #ifndef _STRING_ARCH_unaligned 115 static inline int 116 internal_ucs4_loop_unaligned (struct __gconv_step *step, 117 struct __gconv_step_data *step_data, 118 const unsigned char **inptrp, 119 const unsigned char *inend, 120 unsigned char **outptrp, unsigned char *outend, 121 size_t *irreversible) 122 { 123 const unsigned char *inptr = *inptrp; 124 unsigned char *outptr = *outptrp; 125 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 126 int result; 127 128 # if __BYTE_ORDER == __LITTLE_ENDIAN 129 /* Sigh, we have to do some real work. */ 130 size_t cnt; 131 132 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4) 133 { 134 outptr[0] = inptr[3]; 135 outptr[1] = inptr[2]; 136 outptr[2] = inptr[1]; 137 outptr[3] = inptr[0]; 138 } 139 140 *inptrp = inptr; 141 *outptrp = outptr; 142 # elif __BYTE_ORDER == __BIG_ENDIAN 143 /* Simply copy the data. */ 144 *inptrp = inptr + n_convert * 4; 145 *outptrp = __mempcpy (outptr, inptr, n_convert * 4); 146 # else 147 # error "This endianess is not supported." 148 # endif 149 150 /* Determine the status. */ 151 if (*inptrp == inend) 152 result = __GCONV_EMPTY_INPUT; 153 else if (*outptrp + 4 > outend) 154 result = __GCONV_FULL_OUTPUT; 155 else 156 result = __GCONV_INCOMPLETE_INPUT; 157 158 return result; 159 } 160 #endif 161 162 163 static inline int 164 internal_ucs4_loop_single (struct __gconv_step *step, 165 struct __gconv_step_data *step_data, 166 const unsigned char **inptrp, 167 const unsigned char *inend, 168 unsigned char **outptrp, unsigned char *outend, 169 size_t *irreversible) 170 { 171 mbstate_t *state = step_data->__statep; 172 size_t cnt = state->__count & 7; 173 174 while (*inptrp < inend && cnt < 4) 175 state->__value.__wchb[cnt++] = *(*inptrp)++; 176 177 if (__builtin_expect (cnt < 4, 0)) 178 { 179 /* Still not enough bytes. Store the ones in the input buffer. */ 180 state->__count &= ~7; 181 state->__count |= cnt; 182 183 return __GCONV_INCOMPLETE_INPUT; 184 } 185 186 #if __BYTE_ORDER == __LITTLE_ENDIAN 187 (*outptrp)[0] = state->__value.__wchb[3]; 188 (*outptrp)[1] = state->__value.__wchb[2]; 189 (*outptrp)[2] = state->__value.__wchb[1]; 190 (*outptrp)[3] = state->__value.__wchb[0]; 191 192 *outptrp += 4; 193 #elif __BYTE_ORDER == __BIG_ENDIAN 194 /* XXX unaligned */ 195 *(*((uint32_t **) outptrp)++) = state->__value.__wch; 196 #else 197 # error "This endianess is not supported." 198 #endif 199 200 /* Clear the state buffer. */ 201 state->__count &= ~7; 202 203 return __GCONV_OK; 204 } 205 206 #include <iconv/skeleton.c> 207 208 209 /* Transform from UCS4 to the internal, UCS4-like format. Unlike 210 for the other direction we have to check for correct values here. */ 211 #define DEFINE_INIT 0 212 #define DEFINE_FINI 0 213 #define MIN_NEEDED_FROM 4 214 #define MIN_NEEDED_TO 4 215 #define FROM_DIRECTION 1 216 #define FROM_LOOP ucs4_internal_loop 217 #define TO_LOOP ucs4_internal_loop /* This is not used. */ 218 #define FUNCTION_NAME __gconv_transform_ucs4_internal 219 220 221 static inline int 222 ucs4_internal_loop (struct __gconv_step *step, 223 struct __gconv_step_data *step_data, 224 const unsigned char **inptrp, const unsigned char *inend, 225 unsigned char **outptrp, unsigned char *outend, 226 size_t *irreversible) 227 { 228 int flags = step_data->__flags; 229 const unsigned char *inptr = *inptrp; 230 unsigned char *outptr = *outptrp; 231 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 232 int result; 233 size_t cnt; 234 235 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 236 { 237 uint32_t inval; 238 239 #if __BYTE_ORDER == __LITTLE_ENDIAN 240 inval = bswap_32 (*(const uint32_t *) inptr); 241 #else 242 inval = *(const uint32_t *) inptr; 243 #endif 244 245 if (__builtin_expect (inval > 0x7fffffff, 0)) 246 { 247 /* The value is too large. We don't try transliteration here since 248 this is not an error because of the lack of possibilities to 249 represent the result. This is a genuine bug in the input since 250 UCS4 does not allow such values. */ 251 if (irreversible == NULL) 252 /* We are transliterating, don't try to correct anything. */ 253 return __GCONV_ILLEGAL_INPUT; 254 255 if (flags & __GCONV_IGNORE_ERRORS) 256 { 257 /* Just ignore this character. */ 258 ++*irreversible; 259 continue; 260 } 261 262 *inptrp = inptr; 263 *outptrp = outptr; 264 return __GCONV_ILLEGAL_INPUT; 265 } 266 267 *((uint32_t *) outptr)++ = inval; 268 } 269 270 *inptrp = inptr; 271 *outptrp = outptr; 272 273 /* Determine the status. */ 274 if (*inptrp == inend) 275 result = __GCONV_EMPTY_INPUT; 276 else if (*outptrp + 4 > outend) 277 result = __GCONV_FULL_OUTPUT; 278 else 279 result = __GCONV_INCOMPLETE_INPUT; 280 281 return result; 282 } 283 284 #ifndef _STRING_ARCH_unaligned 285 static inline int 286 ucs4_internal_loop_unaligned (struct __gconv_step *step, 287 struct __gconv_step_data *step_data, 288 const unsigned char **inptrp, 289 const unsigned char *inend, 290 unsigned char **outptrp, unsigned char *outend, 291 size_t *irreversible) 292 { 293 int flags = step_data->__flags; 294 const unsigned char *inptr = *inptrp; 295 unsigned char *outptr = *outptrp; 296 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 297 int result; 298 size_t cnt; 299 300 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 301 { 302 if (__builtin_expect (inptr[0] > 0x80, 0)) 303 { 304 /* The value is too large. We don't try transliteration here since 305 this is not an error because of the lack of possibilities to 306 represent the result. This is a genuine bug in the input since 307 UCS4 does not allow such values. */ 308 if (irreversible == NULL) 309 /* We are transliterating, don't try to correct anything. */ 310 return __GCONV_ILLEGAL_INPUT; 311 312 if (flags & __GCONV_IGNORE_ERRORS) 313 { 314 /* Just ignore this character. */ 315 ++*irreversible; 316 continue; 317 } 318 319 *inptrp = inptr; 320 *outptrp = outptr; 321 return __GCONV_ILLEGAL_INPUT; 322 } 323 324 # if __BYTE_ORDER == __LITTLE_ENDIAN 325 outptr[3] = inptr[0]; 326 outptr[2] = inptr[1]; 327 outptr[1] = inptr[2]; 328 outptr[0] = inptr[3]; 329 # else 330 outptr[0] = inptr[0]; 331 outptr[1] = inptr[1]; 332 outptr[2] = inptr[2]; 333 outptr[3] = inptr[3]; 334 # endif 335 outptr += 4; 336 } 337 338 *inptrp = inptr; 339 *outptrp = outptr; 340 341 /* Determine the status. */ 342 if (*inptrp == inend) 343 result = __GCONV_EMPTY_INPUT; 344 else if (*outptrp + 4 > outend) 345 result = __GCONV_FULL_OUTPUT; 346 else 347 result = __GCONV_INCOMPLETE_INPUT; 348 349 return result; 350 } 351 #endif 352 353 354 static inline int 355 ucs4_internal_loop_single (struct __gconv_step *step, 356 struct __gconv_step_data *step_data, 357 const unsigned char **inptrp, 358 const unsigned char *inend, 359 unsigned char **outptrp, unsigned char *outend, 360 size_t *irreversible) 361 { 362 mbstate_t *state = step_data->__statep; 363 int flags = step_data->__flags; 364 size_t cnt = state->__count & 7; 365 366 while (*inptrp < inend && cnt < 4) 367 state->__value.__wchb[cnt++] = *(*inptrp)++; 368 369 if (__builtin_expect (cnt < 4, 0)) 370 { 371 /* Still not enough bytes. Store the ones in the input buffer. */ 372 state->__count &= ~7; 373 state->__count |= cnt; 374 375 return __GCONV_INCOMPLETE_INPUT; 376 } 377 378 if (__builtin_expect (((unsigned char *) state->__value.__wchb)[0] > 0x80, 379 0)) 380 { 381 /* The value is too large. We don't try transliteration here since 382 this is not an error because of the lack of possibilities to 383 represent the result. This is a genuine bug in the input since 384 UCS4 does not allow such values. */ 385 if (!(flags & __GCONV_IGNORE_ERRORS)) 386 { 387 *inptrp -= cnt - (state->__count & 7); 388 return __GCONV_ILLEGAL_INPUT; 389 } 390 } 391 else 392 { 393 #if __BYTE_ORDER == __LITTLE_ENDIAN 394 (*outptrp)[0] = state->__value.__wchb[3]; 395 (*outptrp)[1] = state->__value.__wchb[2]; 396 (*outptrp)[2] = state->__value.__wchb[1]; 397 (*outptrp)[3] = state->__value.__wchb[0]; 398 #elif __BYTE_ORDER == __BIG_ENDIAN 399 (*outptrp)[0] = state->__value.__wchb[0]; 400 (*outptrp)[1] = state->__value.__wchb[1]; 401 (*outptrp)[2] = state->__value.__wchb[2]; 402 (*outptrp)[3] = state->__value.__wchb[3]; 403 #endif 404 405 *outptrp += 4; 406 } 407 408 /* Clear the state buffer. */ 409 state->__count &= ~7; 410 411 return __GCONV_OK; 412 } 413 414 #include <iconv/skeleton.c> 415 416 417 /* Similarly for the little endian form. */ 418 #define DEFINE_INIT 0 419 #define DEFINE_FINI 0 420 #define MIN_NEEDED_FROM 4 421 #define MIN_NEEDED_TO 4 422 #define FROM_DIRECTION 1 423 #define FROM_LOOP internal_ucs4le_loop 424 #define TO_LOOP internal_ucs4le_loop /* This is not used. */ 425 #define FUNCTION_NAME __gconv_transform_internal_ucs4le 426 427 428 static inline int 429 internal_ucs4le_loop (struct __gconv_step *step, 430 struct __gconv_step_data *step_data, 431 const unsigned char **inptrp, const unsigned char *inend, 432 unsigned char **outptrp, unsigned char *outend, 433 size_t *irreversible) 434 { 435 const unsigned char *inptr = *inptrp; 436 unsigned char *outptr = *outptrp; 437 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 438 int result; 439 440 #if __BYTE_ORDER == __BIG_ENDIAN 441 /* Sigh, we have to do some real work. */ 442 size_t cnt; 443 444 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 445 *((uint32_t *) outptr)++ = bswap_32 (*(const uint32_t *) inptr); 446 447 *inptrp = inptr; 448 *outptrp = outptr; 449 #elif __BYTE_ORDER == __LITTLE_ENDIAN 450 /* Simply copy the data. */ 451 *inptrp = inptr + n_convert * 4; 452 *outptrp = __mempcpy (outptr, inptr, n_convert * 4); 453 #else 454 # error "This endianess is not supported." 455 #endif 456 457 /* Determine the status. */ 458 if (*inptrp == inend) 459 result = __GCONV_EMPTY_INPUT; 460 else if (*outptrp + 4 > outend) 461 result = __GCONV_FULL_OUTPUT; 462 else 463 result = __GCONV_INCOMPLETE_INPUT; 464 465 return result; 466 } 467 468 #ifndef _STRING_ARCH_unaligned 469 static inline int 470 internal_ucs4le_loop_unaligned (struct __gconv_step *step, 471 struct __gconv_step_data *step_data, 472 const unsigned char **inptrp, 473 const unsigned char *inend, 474 unsigned char **outptrp, unsigned char *outend, 475 size_t *irreversible) 476 { 477 const unsigned char *inptr = *inptrp; 478 unsigned char *outptr = *outptrp; 479 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 480 int result; 481 482 # if __BYTE_ORDER == __BIG_ENDIAN 483 /* Sigh, we have to do some real work. */ 484 size_t cnt; 485 486 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4) 487 { 488 outptr[0] = inptr[3]; 489 outptr[1] = inptr[2]; 490 outptr[2] = inptr[1]; 491 outptr[3] = inptr[0]; 492 } 493 494 *inptrp = inptr; 495 *outptrp = outptr; 496 # elif __BYTE_ORDER == __LITTLE_ENDIAN 497 /* Simply copy the data. */ 498 *inptrp = inptr + n_convert * 4; 499 *outptrp = __mempcpy (outptr, inptr, n_convert * 4); 500 # else 501 # error "This endianess is not supported." 502 # endif 503 504 /* Determine the status. */ 505 if (*inptrp == inend) 506 result = __GCONV_EMPTY_INPUT; 507 else if (*inptrp + 4 > inend) 508 result = __GCONV_INCOMPLETE_INPUT; 509 else 510 { 511 assert (*outptrp + 4 > outend); 512 result = __GCONV_FULL_OUTPUT; 513 } 514 515 return result; 516 } 517 #endif 518 519 520 static inline int 521 internal_ucs4le_loop_single (struct __gconv_step *step, 522 struct __gconv_step_data *step_data, 523 const unsigned char **inptrp, 524 const unsigned char *inend, 525 unsigned char **outptrp, unsigned char *outend, 526 size_t *irreversible) 527 { 528 mbstate_t *state = step_data->__statep; 529 size_t cnt = state->__count & 7; 530 531 while (*inptrp < inend && cnt < 4) 532 state->__value.__wchb[cnt++] = *(*inptrp)++; 533 534 if (__builtin_expect (cnt < 4, 0)) 535 { 536 /* Still not enough bytes. Store the ones in the input buffer. */ 537 state->__count &= ~7; 538 state->__count |= cnt; 539 540 return __GCONV_INCOMPLETE_INPUT; 541 } 542 543 #if __BYTE_ORDER == __BIG_ENDIAN 544 (*outptrp)[0] = state->__value.__wchb[3]; 545 (*outptrp)[1] = state->__value.__wchb[2]; 546 (*outptrp)[2] = state->__value.__wchb[1]; 547 (*outptrp)[3] = state->__value.__wchb[0]; 548 549 *outptrp += 4; 550 #else 551 /* XXX unaligned */ 552 *(*((uint32_t **) outptrp)++) = state->__value.__wch; 553 #endif 554 555 /* Clear the state buffer. */ 556 state->__count &= ~7; 557 558 return __GCONV_OK; 559 } 560 561 #include <iconv/skeleton.c> 562 563 564 /* And finally from UCS4-LE to the internal encoding. */ 565 #define DEFINE_INIT 0 566 #define DEFINE_FINI 0 567 #define MIN_NEEDED_FROM 4 568 #define MIN_NEEDED_TO 4 569 #define FROM_DIRECTION 1 570 #define FROM_LOOP ucs4le_internal_loop 571 #define TO_LOOP ucs4le_internal_loop /* This is not used. */ 572 #define FUNCTION_NAME __gconv_transform_ucs4le_internal 573 574 575 static inline int 576 ucs4le_internal_loop (struct __gconv_step *step, 577 struct __gconv_step_data *step_data, 578 const unsigned char **inptrp, const unsigned char *inend, 579 unsigned char **outptrp, unsigned char *outend, 580 size_t *irreversible) 581 { 582 int flags = step_data->__flags; 583 const unsigned char *inptr = *inptrp; 584 unsigned char *outptr = *outptrp; 585 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 586 int result; 587 size_t cnt; 588 589 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 590 { 591 uint32_t inval; 592 593 #if __BYTE_ORDER == __BIG_ENDIAN 594 inval = bswap_32 (*(const uint32_t *) inptr); 595 #else 596 inval = *(const uint32_t *) inptr; 597 #endif 598 599 if (__builtin_expect (inval > 0x7fffffff, 0)) 600 { 601 /* The value is too large. We don't try transliteration here since 602 this is not an error because of the lack of possibilities to 603 represent the result. This is a genuine bug in the input since 604 UCS4 does not allow such values. */ 605 if (irreversible == NULL) 606 /* We are transliterating, don't try to correct anything. */ 607 return __GCONV_ILLEGAL_INPUT; 608 609 if (flags & __GCONV_IGNORE_ERRORS) 610 { 611 /* Just ignore this character. */ 612 ++*irreversible; 613 continue; 614 } 615 616 return __GCONV_ILLEGAL_INPUT; 617 } 618 619 *((uint32_t *) outptr)++ = inval; 620 } 621 622 *inptrp = inptr; 623 *outptrp = outptr; 624 625 /* Determine the status. */ 626 if (*inptrp == inend) 627 result = __GCONV_EMPTY_INPUT; 628 else if (*inptrp + 4 > inend) 629 result = __GCONV_INCOMPLETE_INPUT; 630 else 631 { 632 assert (*outptrp + 4 > outend); 633 result = __GCONV_FULL_OUTPUT; 634 } 635 636 return result; 637 } 638 639 #ifndef _STRING_ARCH_unaligned 640 static inline int 641 ucs4le_internal_loop_unaligned (struct __gconv_step *step, 642 struct __gconv_step_data *step_data, 643 const unsigned char **inptrp, 644 const unsigned char *inend, 645 unsigned char **outptrp, unsigned char *outend, 646 size_t *irreversible) 647 { 648 int flags = step_data->__flags; 649 const unsigned char *inptr = *inptrp; 650 unsigned char *outptr = *outptrp; 651 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4; 652 int result; 653 size_t cnt; 654 655 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4) 656 { 657 if (__builtin_expect (inptr[3] > 0x80, 0)) 658 { 659 /* The value is too large. We don't try transliteration here since 660 this is not an error because of the lack of possibilities to 661 represent the result. This is a genuine bug in the input since 662 UCS4 does not allow such values. */ 663 if (irreversible == NULL) 664 /* We are transliterating, don't try to correct anything. */ 665 return __GCONV_ILLEGAL_INPUT; 666 667 if (flags & __GCONV_IGNORE_ERRORS) 668 { 669 /* Just ignore this character. */ 670 ++*irreversible; 671 continue; 672 } 673 674 *inptrp = inptr; 675 *outptrp = outptr; 676 return __GCONV_ILLEGAL_INPUT; 677 } 678 679 # if __BYTE_ORDER == __BIG_ENDIAN 680 outptr[3] = inptr[0]; 681 outptr[2] = inptr[1]; 682 outptr[1] = inptr[2]; 683 outptr[0] = inptr[3]; 684 # else 685 outptr[0] = inptr[0]; 686 outptr[1] = inptr[1]; 687 outptr[2] = inptr[2]; 688 outptr[3] = inptr[3]; 689 # endif 690 691 outptr += 4; 692 } 693 694 *inptrp = inptr; 695 *outptrp = outptr; 696 697 /* Determine the status. */ 698 if (*inptrp == inend) 699 result = __GCONV_EMPTY_INPUT; 700 else if (*inptrp + 4 > inend) 701 result = __GCONV_INCOMPLETE_INPUT; 702 else 703 { 704 assert (*outptrp + 4 > outend); 705 result = __GCONV_FULL_OUTPUT; 706 } 707 708 return result; 709 } 710 #endif 711 712 713 static inline int 714 ucs4le_internal_loop_single (struct __gconv_step *step, 715 struct __gconv_step_data *step_data, 716 const unsigned char **inptrp, 717 const unsigned char *inend, 718 unsigned char **outptrp, unsigned char *outend, 719 size_t *irreversible) 720 { 721 mbstate_t *state = step_data->__statep; 722 int flags = step_data->__flags; 723 size_t cnt = state->__count & 7; 724 725 while (*inptrp < inend && cnt < 4) 726 state->__value.__wchb[cnt++] = *(*inptrp)++; 727 728 if (__builtin_expect (cnt < 4, 0)) 729 { 730 /* Still not enough bytes. Store the ones in the input buffer. */ 731 state->__count &= ~7; 732 state->__count |= cnt; 733 734 return __GCONV_INCOMPLETE_INPUT; 735 } 736 737 if (__builtin_expect (((unsigned char *) state->__value.__wchb)[3] > 0x80, 738 0)) 739 { 740 /* The value is too large. We don't try transliteration here since 741 this is not an error because of the lack of possibilities to 742 represent the result. This is a genuine bug in the input since 743 UCS4 does not allow such values. */ 744 if (!(flags & __GCONV_IGNORE_ERRORS)) 745 return __GCONV_ILLEGAL_INPUT; 746 } 747 else 748 { 749 #if __BYTE_ORDER == __BIG_ENDIAN 750 (*outptrp)[0] = state->__value.__wchb[3]; 751 (*outptrp)[1] = state->__value.__wchb[2]; 752 (*outptrp)[2] = state->__value.__wchb[1]; 753 (*outptrp)[3] = state->__value.__wchb[0]; 754 #else 755 (*outptrp)[0] = state->__value.__wchb[0]; 756 (*outptrp)[1] = state->__value.__wchb[1]; 757 (*outptrp)[2] = state->__value.__wchb[2]; 758 (*outptrp)[3] = state->__value.__wchb[3]; 759 #endif 760 761 *outptrp += 4; 762 } 763 764 /* Clear the state buffer. */ 765 state->__count &= ~7; 766 767 return __GCONV_OK; 768 } 769 770 #include <iconv/skeleton.c> 771 772 773 /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */ 774 #define DEFINE_INIT 0 775 #define DEFINE_FINI 0 776 #define MIN_NEEDED_FROM 1 777 #define MIN_NEEDED_TO 4 778 #define FROM_DIRECTION 1 779 #define FROM_LOOP ascii_internal_loop 780 #define TO_LOOP ascii_internal_loop /* This is not used. */ 781 #define FUNCTION_NAME __gconv_transform_ascii_internal 782 #define ONE_DIRECTION 1 783 784 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 785 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 786 #define LOOPFCT FROM_LOOP 787 #define BODY \ 788 { \ 789 if (__builtin_expect (*inptr > '\x7f', 0)) \ 790 { \ 791 /* The value is too large. We don't try transliteration here since \ 792 this is not an error because of the lack of possibilities to \ 793 represent the result. This is a genuine bug in the input since \ 794 ASCII does not allow such values. */ \ 795 STANDARD_FROM_LOOP_ERR_HANDLER (1); \ 796 } \ 797 else \ 798 /* It's an one byte sequence. */ \ 799 *((uint32_t *) outptr)++ = *inptr++; \ 800 } 801 #define LOOP_NEED_FLAGS 802 #include <iconv/loop.c> 803 #include <iconv/skeleton.c> 804 805 806 /* Convert from the internal (UCS4-like) format to ISO 646-IRV. */ 807 #define DEFINE_INIT 0 808 #define DEFINE_FINI 0 809 #define MIN_NEEDED_FROM 4 810 #define MIN_NEEDED_TO 1 811 #define FROM_DIRECTION 1 812 #define FROM_LOOP internal_ascii_loop 813 #define TO_LOOP internal_ascii_loop /* This is not used. */ 814 #define FUNCTION_NAME __gconv_transform_internal_ascii 815 #define ONE_DIRECTION 1 816 817 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 818 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 819 #define LOOPFCT FROM_LOOP 820 #define BODY \ 821 { \ 822 if (__builtin_expect (*((const uint32_t *) inptr) > 0x7f, 0)) \ 823 { \ 824 UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4); \ 825 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 826 } \ 827 else \ 828 /* It's an one byte sequence. */ \ 829 *outptr++ = *((const uint32_t *) inptr)++; \ 830 } 831 #define LOOP_NEED_FLAGS 832 #include <iconv/loop.c> 833 #include <iconv/skeleton.c> 834 835 836 /* Convert from the internal (UCS4-like) format to UTF-8. */ 837 #define DEFINE_INIT 0 838 #define DEFINE_FINI 0 839 #define MIN_NEEDED_FROM 4 840 #define MIN_NEEDED_TO 1 841 #define MAX_NEEDED_TO 6 842 #define FROM_DIRECTION 1 843 #define FROM_LOOP internal_utf8_loop 844 #define TO_LOOP internal_utf8_loop /* This is not used. */ 845 #define FUNCTION_NAME __gconv_transform_internal_utf8 846 #define ONE_DIRECTION 1 847 848 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 849 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 850 #define MAX_NEEDED_OUTPUT MAX_NEEDED_TO 851 #define LOOPFCT FROM_LOOP 852 #define BODY \ 853 { \ 854 uint32_t wc = *((const uint32_t *) inptr); \ 855 \ 856 if (wc < 0x80) \ 857 /* It's an one byte sequence. */ \ 858 *outptr++ = (unsigned char) wc; \ 859 else if (__builtin_expect (wc <= 0x7fffffff, 1)) \ 860 { \ 861 size_t step; \ 862 char *start; \ 863 \ 864 for (step = 2; step < 6; ++step) \ 865 if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \ 866 break; \ 867 \ 868 if (__builtin_expect (outptr + step > outend, 0)) \ 869 { \ 870 /* Too long. */ \ 871 result = __GCONV_FULL_OUTPUT; \ 872 break; \ 873 } \ 874 \ 875 start = outptr; \ 876 *outptr = (unsigned char) (~0xff >> step); \ 877 outptr += step; \ 878 --step; \ 879 do \ 880 { \ 881 start[step] = 0x80 | (wc & 0x3f); \ 882 wc >>= 6; \ 883 } \ 884 while (--step > 0); \ 885 start[0] |= wc; \ 886 } \ 887 else \ 888 { \ 889 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 890 } \ 891 \ 892 inptr += 4; \ 893 } 894 #define LOOP_NEED_FLAGS 895 #include <iconv/loop.c> 896 #include <iconv/skeleton.c> 897 898 899 /* Convert from UTF-8 to the internal (UCS4-like) format. */ 900 #define DEFINE_INIT 0 901 #define DEFINE_FINI 0 902 #define MIN_NEEDED_FROM 1 903 #define MAX_NEEDED_FROM 6 904 #define MIN_NEEDED_TO 4 905 #define FROM_DIRECTION 1 906 #define FROM_LOOP utf8_internal_loop 907 #define TO_LOOP utf8_internal_loop /* This is not used. */ 908 #define FUNCTION_NAME __gconv_transform_utf8_internal 909 #define ONE_DIRECTION 1 910 911 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 912 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM 913 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 914 #define LOOPFCT FROM_LOOP 915 #define BODY \ 916 { \ 917 uint32_t ch; \ 918 uint_fast32_t cnt; \ 919 uint_fast32_t i; \ 920 \ 921 /* Next input byte. */ \ 922 ch = *inptr; \ 923 \ 924 if (ch < 0x80) \ 925 { \ 926 /* One byte sequence. */ \ 927 cnt = 1; \ 928 ++inptr; \ 929 } \ 930 else \ 931 { \ 932 if (ch >= 0xc2 && ch < 0xe0) \ 933 { \ 934 /* We expect two bytes. The first byte cannot be 0xc0 or 0xc1, \ 935 otherwise the wide character could have been represented \ 936 using a single byte. */ \ 937 cnt = 2; \ 938 ch &= 0x1f; \ 939 } \ 940 else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ 941 { \ 942 /* We expect three bytes. */ \ 943 cnt = 3; \ 944 ch &= 0x0f; \ 945 } \ 946 else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ 947 { \ 948 /* We expect four bytes. */ \ 949 cnt = 4; \ 950 ch &= 0x07; \ 951 } \ 952 else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ 953 { \ 954 /* We expect five bytes. */ \ 955 cnt = 5; \ 956 ch &= 0x03; \ 957 } \ 958 else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \ 959 { \ 960 /* We expect six bytes. */ \ 961 cnt = 6; \ 962 ch &= 0x01; \ 963 } \ 964 else \ 965 { \ 966 int skipped; \ 967 \ 968 /* Search the end of this ill-formed UTF-8 character. This \ 969 is the next byte with (x & 0xc0) != 0x80. */ \ 970 skipped = 0; \ 971 do \ 972 ++skipped; \ 973 while (inptr + skipped < inend \ 974 && (*(inptr + skipped) & 0xc0) == 0x80 \ 975 && skipped < 5); \ 976 \ 977 STANDARD_FROM_LOOP_ERR_HANDLER (skipped); \ 978 } \ 979 \ 980 if (__builtin_expect (inptr + cnt > inend, 0)) \ 981 { \ 982 /* We don't have enough input. But before we report that check \ 983 that all the bytes are correct. */ \ 984 for (i = 1; inptr + i < inend; ++i) \ 985 if ((inptr[i] & 0xc0) != 0x80) \ 986 break; \ 987 \ 988 if (__builtin_expect (inptr + i == inend, 1)) \ 989 { \ 990 result = __GCONV_INCOMPLETE_INPUT; \ 991 break; \ 992 } \ 993 \ 994 STANDARD_FROM_LOOP_ERR_HANDLER (i); \ 995 } \ 996 \ 997 /* Read the possible remaining bytes. */ \ 998 for (i = 1; i < cnt; ++i) \ 999 { \ 1000 uint32_t byte = inptr[i]; \ 1001 \ 1002 if ((byte & 0xc0) != 0x80) \ 1003 /* This is an illegal encoding. */ \ 1004 break; \ 1005 \ 1006 ch <<= 6; \ 1007 ch |= byte & 0x3f; \ 1008 } \ 1009 \ 1010 /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \ 1011 If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \ 1012 have been represented with fewer than cnt bytes. */ \ 1013 if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \ 1014 { \ 1015 /* This is an illegal encoding. */ \ 1016 STANDARD_FROM_LOOP_ERR_HANDLER (i); \ 1017 } \ 1018 \ 1019 inptr += cnt; \ 1020 } \ 1021 \ 1022 /* Now adjust the pointers and store the result. */ \ 1023 *((uint32_t *) outptr)++ = ch; \ 1024 } 1025 #define LOOP_NEED_FLAGS 1026 1027 #define STORE_REST \ 1028 { \ 1029 /* We store the remaining bytes while converting them into the UCS4 \ 1030 format. We can assume that the first byte in the buffer is \ 1031 correct and that it requires a larger number of bytes than there \ 1032 are in the input buffer. */ \ 1033 wint_t ch = **inptrp; \ 1034 size_t cnt, r; \ 1035 \ 1036 state->__count = inend - *inptrp; \ 1037 \ 1038 if (ch >= 0xc2 && ch < 0xe0) \ 1039 { \ 1040 /* We expect two bytes. The first byte cannot be 0xc0 or \ 1041 0xc1, otherwise the wide character could have been \ 1042 represented using a single byte. */ \ 1043 cnt = 2; \ 1044 ch &= 0x1f; \ 1045 } \ 1046 else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ 1047 { \ 1048 /* We expect three bytes. */ \ 1049 cnt = 3; \ 1050 ch &= 0x0f; \ 1051 } \ 1052 else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ 1053 { \ 1054 /* We expect four bytes. */ \ 1055 cnt = 4; \ 1056 ch &= 0x07; \ 1057 } \ 1058 else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ 1059 { \ 1060 /* We expect five bytes. */ \ 1061 cnt = 5; \ 1062 ch &= 0x03; \ 1063 } \ 1064 else \ 1065 { \ 1066 /* We expect six bytes. */ \ 1067 cnt = 6; \ 1068 ch &= 0x01; \ 1069 } \ 1070 \ 1071 /* The first byte is already consumed. */ \ 1072 r = cnt - 1; \ 1073 while (++(*inptrp) < inend) \ 1074 { \ 1075 ch <<= 6; \ 1076 ch |= **inptrp & 0x3f; \ 1077 --r; \ 1078 } \ 1079 \ 1080 /* Shift for the so far missing bytes. */ \ 1081 ch <<= r * 6; \ 1082 \ 1083 /* Store the number of bytes expected for the entire sequence. */ \ 1084 state->__count |= cnt << 8; \ 1085 \ 1086 /* Store the value. */ \ 1087 state->__value.__wch = ch; \ 1088 } 1089 1090 #define UNPACK_BYTES \ 1091 { \ 1092 static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \ 1093 wint_t wch = state->__value.__wch; \ 1094 size_t ntotal = state->__count >> 8; \ 1095 \ 1096 inlen = state->__count & 255; \ 1097 \ 1098 bytebuf[0] = inmask[ntotal - 2]; \ 1099 \ 1100 do \ 1101 { \ 1102 if (--ntotal < inlen) \ 1103 bytebuf[ntotal] = 0x80 | (wch & 0x3f); \ 1104 wch >>= 6; \ 1105 } \ 1106 while (ntotal > 1); \ 1107 \ 1108 bytebuf[0] |= wch; \ 1109 } 1110 1111 #define CLEAR_STATE \ 1112 state->__count = 0 1113 1114 1115 #include <iconv/loop.c> 1116 #include <iconv/skeleton.c> 1117 1118 1119 /* Convert from UCS2 to the internal (UCS4-like) format. */ 1120 #define DEFINE_INIT 0 1121 #define DEFINE_FINI 0 1122 #define MIN_NEEDED_FROM 2 1123 #define MIN_NEEDED_TO 4 1124 #define FROM_DIRECTION 1 1125 #define FROM_LOOP ucs2_internal_loop 1126 #define TO_LOOP ucs2_internal_loop /* This is not used. */ 1127 #define FUNCTION_NAME __gconv_transform_ucs2_internal 1128 #define ONE_DIRECTION 1 1129 1130 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 1131 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 1132 #define LOOPFCT FROM_LOOP 1133 #define BODY \ 1134 { \ 1135 uint16_t u1 = *((const uint16_t *) inptr); \ 1136 \ 1137 if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0)) \ 1138 { \ 1139 /* Surrogate characters in UCS-2 input are not valid. Reject \ 1140 them. (Catching this here is not security relevant.) */ \ 1141 STANDARD_FROM_LOOP_ERR_HANDLER (2); \ 1142 } \ 1143 \ 1144 *((uint32_t *) outptr)++ = u1; \ 1145 inptr += 2; \ 1146 } 1147 #define LOOP_NEED_FLAGS 1148 #include <iconv/loop.c> 1149 #include <iconv/skeleton.c> 1150 1151 1152 /* Convert from the internal (UCS4-like) format to UCS2. */ 1153 #define DEFINE_INIT 0 1154 #define DEFINE_FINI 0 1155 #define MIN_NEEDED_FROM 4 1156 #define MIN_NEEDED_TO 2 1157 #define FROM_DIRECTION 1 1158 #define FROM_LOOP internal_ucs2_loop 1159 #define TO_LOOP internal_ucs2_loop /* This is not used. */ 1160 #define FUNCTION_NAME __gconv_transform_internal_ucs2 1161 #define ONE_DIRECTION 1 1162 1163 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 1164 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 1165 #define LOOPFCT FROM_LOOP 1166 #define BODY \ 1167 { \ 1168 uint32_t val = *((const uint32_t *) inptr); \ 1169 \ 1170 if (__builtin_expect (val >= 0x10000, 0)) \ 1171 { \ 1172 UNICODE_TAG_HANDLER (val, 4); \ 1173 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 1174 } \ 1175 else if (__builtin_expect (val >= 0xd800 && val < 0xe000, 0)) \ 1176 { \ 1177 /* Surrogate characters in UCS-4 input are not valid. \ 1178 We must catch this, because the UCS-2 output might be \ 1179 interpreted as UTF-16 by other programs. If we let \ 1180 surrogates pass through, attackers could make a security \ 1181 hole exploit by synthesizing any desired plane 1-16 \ 1182 character. */ \ 1183 result = __GCONV_ILLEGAL_INPUT; \ 1184 if (! ignore_errors_p ()) \ 1185 break; \ 1186 inptr += 4; \ 1187 ++*irreversible; \ 1188 continue; \ 1189 } \ 1190 else \ 1191 { \ 1192 *((uint16_t *) outptr)++ = val; \ 1193 inptr += 4; \ 1194 } \ 1195 } 1196 #define LOOP_NEED_FLAGS 1197 #include <iconv/loop.c> 1198 #include <iconv/skeleton.c> 1199 1200 1201 /* Convert from UCS2 in other endianness to the internal (UCS4-like) format. */ 1202 #define DEFINE_INIT 0 1203 #define DEFINE_FINI 0 1204 #define MIN_NEEDED_FROM 2 1205 #define MIN_NEEDED_TO 4 1206 #define FROM_DIRECTION 1 1207 #define FROM_LOOP ucs2reverse_internal_loop 1208 #define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/ 1209 #define FUNCTION_NAME __gconv_transform_ucs2reverse_internal 1210 #define ONE_DIRECTION 1 1211 1212 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 1213 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 1214 #define LOOPFCT FROM_LOOP 1215 #define BODY \ 1216 { \ 1217 uint16_t u1 = bswap_16 (*((const uint16_t *) inptr)); \ 1218 \ 1219 if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0)) \ 1220 { \ 1221 /* Surrogate characters in UCS-2 input are not valid. Reject \ 1222 them. (Catching this here is not security relevant.) */ \ 1223 if (! ignore_errors_p ()) \ 1224 { \ 1225 result = __GCONV_ILLEGAL_INPUT; \ 1226 break; \ 1227 } \ 1228 inptr += 2; \ 1229 ++*irreversible; \ 1230 continue; \ 1231 } \ 1232 \ 1233 *((uint32_t *) outptr)++ = u1; \ 1234 inptr += 2; \ 1235 } 1236 #define LOOP_NEED_FLAGS 1237 #include <iconv/loop.c> 1238 #include <iconv/skeleton.c> 1239 1240 1241 /* Convert from the internal (UCS4-like) format to UCS2 in other endianness. */ 1242 #define DEFINE_INIT 0 1243 #define DEFINE_FINI 0 1244 #define MIN_NEEDED_FROM 4 1245 #define MIN_NEEDED_TO 2 1246 #define FROM_DIRECTION 1 1247 #define FROM_LOOP internal_ucs2reverse_loop 1248 #define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/ 1249 #define FUNCTION_NAME __gconv_transform_internal_ucs2reverse 1250 #define ONE_DIRECTION 1 1251 1252 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM 1253 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO 1254 #define LOOPFCT FROM_LOOP 1255 #define BODY \ 1256 { \ 1257 uint32_t val = *((const uint32_t *) inptr); \ 1258 if (__builtin_expect (val >= 0x10000, 0)) \ 1259 { \ 1260 UNICODE_TAG_HANDLER (val, 4); \ 1261 STANDARD_TO_LOOP_ERR_HANDLER (4); \ 1262 } \ 1263 else if (__builtin_expect (val >= 0xd800 && val < 0xe000, 0)) \ 1264 { \ 1265 /* Surrogate characters in UCS-4 input are not valid. \ 1266 We must catch this, because the UCS-2 output might be \ 1267 interpreted as UTF-16 by other programs. If we let \ 1268 surrogates pass through, attackers could make a security \ 1269 hole exploit by synthesizing any desired plane 1-16 \ 1270 character. */ \ 1271 if (! ignore_errors_p ()) \ 1272 { \ 1273 result = __GCONV_ILLEGAL_INPUT; \ 1274 break; \ 1275 } \ 1276 inptr += 4; \ 1277 ++*irreversible; \ 1278 continue; \ 1279 } \ 1280 else \ 1281 { \ 1282 *((uint16_t *) outptr)++ = bswap_16 (val); \ 1283 inptr += 4; \ 1284 } \ 1285 } 1286 #define LOOP_NEED_FLAGS 1287 #include <iconv/loop.c> 1288 #include <iconv/skeleton.c> 1289