1 /* 2 * Copyright (C) 1999-2003, 2005-2006 Free Software Foundation, Inc. 3 * This file is part of the GNU LIBICONV Library. 4 * 5 * The GNU LIBICONV Library is free software; you can redistribute it 6 * and/or modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either version 2 8 * of the License, or (at your option) any later version. 9 * 10 * The GNU LIBICONV Library is distributed in the hope that it will be 11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public 16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB. 17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street, 18 * Fifth Floor, Boston, MA 02110-1301, USA. 19 */ 20 21 /* This file defines the conversion loop via Unicode as a pivot encoding. */ 22 23 /* Attempt to transliterate wc. Return code as in xxx_wctomb. */ 24 static int unicode_transliterate (conv_t cd, ucs4_t wc, 25 unsigned char* outptr, size_t outleft) 26 { 27 if (cd->oflags & HAVE_HANGUL_JAMO) { 28 /* Decompose Hangul into Jamo. Use double-width Jamo (contained 29 in all Korean encodings and ISO-2022-JP-2), not half-width Jamo 30 (contained in Unicode only). */ 31 ucs4_t buf[3]; 32 int ret = johab_hangul_decompose(cd,buf,wc); 33 if (ret != RET_ILUNI) { 34 /* we know 1 <= ret <= 3 */ 35 state_t backup_state = cd->ostate; 36 unsigned char* backup_outptr = outptr; 37 size_t backup_outleft = outleft; 38 int i, sub_outcount; 39 for (i = 0; i < ret; i++) { 40 if (outleft == 0) { 41 sub_outcount = RET_TOOSMALL; 42 goto johab_hangul_failed; 43 } 44 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft); 45 if (sub_outcount <= RET_ILUNI) 46 goto johab_hangul_failed; 47 if (!(sub_outcount <= outleft)) abort(); 48 outptr += sub_outcount; outleft -= sub_outcount; 49 } 50 return outptr-backup_outptr; 51 johab_hangul_failed: 52 cd->ostate = backup_state; 53 outptr = backup_outptr; 54 outleft = backup_outleft; 55 if (sub_outcount != RET_ILUNI) 56 return RET_TOOSMALL; 57 } 58 } 59 { 60 /* Try to use a variant, but postfix it with 61 U+303E IDEOGRAPHIC VARIATION INDICATOR 62 (cf. Ken Lunde's "CJKV information processing", p. 188). */ 63 int indx = -1; 64 if (wc == 0x3006) 65 indx = 0; 66 else if (wc == 0x30f6) 67 indx = 1; 68 else if (wc >= 0x4e00 && wc < 0xa000) 69 indx = cjk_variants_indx[wc-0x4e00]; 70 if (indx >= 0) { 71 for (;; indx++) { 72 ucs4_t buf[2]; 73 unsigned short variant = cjk_variants[indx]; 74 unsigned short last = variant & 0x8000; 75 variant &= 0x7fff; 76 variant += 0x3000; 77 buf[0] = variant; buf[1] = 0x303e; 78 { 79 state_t backup_state = cd->ostate; 80 unsigned char* backup_outptr = outptr; 81 size_t backup_outleft = outleft; 82 int i, sub_outcount; 83 for (i = 0; i < 2; i++) { 84 if (outleft == 0) { 85 sub_outcount = RET_TOOSMALL; 86 goto variant_failed; 87 } 88 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft); 89 if (sub_outcount <= RET_ILUNI) 90 goto variant_failed; 91 if (!(sub_outcount <= outleft)) abort(); 92 outptr += sub_outcount; outleft -= sub_outcount; 93 } 94 return outptr-backup_outptr; 95 variant_failed: 96 cd->ostate = backup_state; 97 outptr = backup_outptr; 98 outleft = backup_outleft; 99 if (sub_outcount != RET_ILUNI) 100 return RET_TOOSMALL; 101 } 102 if (last) 103 break; 104 } 105 } 106 } 107 if (wc >= 0x2018 && wc <= 0x201a) { 108 /* Special case for quotation marks 0x2018, 0x2019, 0x201a */ 109 ucs4_t substitute = 110 (cd->oflags & HAVE_QUOTATION_MARKS 111 ? (wc == 0x201a ? 0x2018 : wc) 112 : (cd->oflags & HAVE_ACCENTS 113 ? (wc==0x2019 ? 0x00b4 : 0x0060) /* use accents */ 114 : 0x0027 /* use apostrophe */ 115 ) ); 116 int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,substitute,outleft); 117 if (outcount != RET_ILUNI) 118 return outcount; 119 } 120 { 121 /* Use the transliteration table. */ 122 int indx = translit_index(wc); 123 if (indx >= 0) { 124 const unsigned int * cp = &translit_data[indx]; 125 unsigned int num = *cp++; 126 state_t backup_state = cd->ostate; 127 unsigned char* backup_outptr = outptr; 128 size_t backup_outleft = outleft; 129 unsigned int i; 130 int sub_outcount; 131 for (i = 0; i < num; i++) { 132 if (outleft == 0) { 133 sub_outcount = RET_TOOSMALL; 134 goto translit_failed; 135 } 136 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,cp[i],outleft); 137 if (sub_outcount == RET_ILUNI) 138 /* Recursive transliteration. */ 139 sub_outcount = unicode_transliterate(cd,cp[i],outptr,outleft); 140 if (sub_outcount <= RET_ILUNI) 141 goto translit_failed; 142 if (!(sub_outcount <= outleft)) abort(); 143 outptr += sub_outcount; outleft -= sub_outcount; 144 } 145 return outptr-backup_outptr; 146 translit_failed: 147 cd->ostate = backup_state; 148 outptr = backup_outptr; 149 outleft = backup_outleft; 150 if (sub_outcount != RET_ILUNI) 151 return RET_TOOSMALL; 152 } 153 } 154 return RET_ILUNI; 155 } 156 157 #ifndef LIBICONV_PLUG 158 159 struct uc_to_mb_fallback_locals { 160 unsigned char* l_outbuf; 161 size_t l_outbytesleft; 162 int l_errno; 163 }; 164 165 static void uc_to_mb_write_replacement (const char *buf, size_t buflen, 166 void* callback_arg) 167 { 168 struct uc_to_mb_fallback_locals * plocals = 169 (struct uc_to_mb_fallback_locals *) callback_arg; 170 /* Do nothing if already encountered an error in a previous call. */ 171 if (plocals->l_errno == 0) { 172 /* Attempt to copy the passed buffer to the output buffer. */ 173 if (plocals->l_outbytesleft < buflen) 174 plocals->l_errno = E2BIG; 175 else { 176 memcpy(plocals->l_outbuf, buf, buflen); 177 plocals->l_outbuf += buflen; 178 plocals->l_outbytesleft -= buflen; 179 } 180 } 181 } 182 183 struct mb_to_uc_fallback_locals { 184 conv_t l_cd; 185 unsigned char* l_outbuf; 186 size_t l_outbytesleft; 187 int l_errno; 188 }; 189 190 static void mb_to_uc_write_replacement (const unsigned int *buf, size_t buflen, 191 void* callback_arg) 192 { 193 struct mb_to_uc_fallback_locals * plocals = 194 (struct mb_to_uc_fallback_locals *) callback_arg; 195 /* Do nothing if already encountered an error in a previous call. */ 196 if (plocals->l_errno == 0) { 197 /* Attempt to convert the passed buffer to the target encoding. */ 198 conv_t cd = plocals->l_cd; 199 unsigned char* outptr = plocals->l_outbuf; 200 size_t outleft = plocals->l_outbytesleft; 201 for (; buflen > 0; buf++, buflen--) { 202 ucs4_t wc = *buf; 203 int outcount; 204 if (outleft == 0) { 205 plocals->l_errno = E2BIG; 206 break; 207 } 208 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft); 209 if (outcount != RET_ILUNI) 210 goto outcount_ok; 211 /* Handle Unicode tag characters (range U+E0000..U+E007F). */ 212 if ((wc >> 7) == (0xe0000 >> 7)) 213 goto outcount_zero; 214 /* Try transliteration. */ 215 if (cd->transliterate) { 216 outcount = unicode_transliterate(cd,wc,outptr,outleft); 217 if (outcount != RET_ILUNI) 218 goto outcount_ok; 219 } 220 if (cd->discard_ilseq) { 221 outcount = 0; 222 goto outcount_ok; 223 } 224 #ifndef LIBICONV_PLUG 225 else if (cd->fallbacks.uc_to_mb_fallback != NULL) { 226 struct uc_to_mb_fallback_locals locals; 227 locals.l_outbuf = outptr; 228 locals.l_outbytesleft = outleft; 229 locals.l_errno = 0; 230 cd->fallbacks.uc_to_mb_fallback(wc, 231 uc_to_mb_write_replacement, 232 &locals, 233 cd->fallbacks.data); 234 if (locals.l_errno != 0) { 235 plocals->l_errno = locals.l_errno; 236 break; 237 } 238 outptr = locals.l_outbuf; 239 outleft = locals.l_outbytesleft; 240 outcount = 0; 241 goto outcount_ok; 242 } 243 #endif 244 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft); 245 if (outcount != RET_ILUNI) 246 goto outcount_ok; 247 plocals->l_errno = EILSEQ; 248 break; 249 outcount_ok: 250 if (outcount < 0) { 251 plocals->l_errno = E2BIG; 252 break; 253 } 254 #ifndef LIBICONV_PLUG 255 if (cd->hooks.uc_hook) 256 (*cd->hooks.uc_hook)(wc, cd->hooks.data); 257 #endif 258 if (!(outcount <= outleft)) abort(); 259 outptr += outcount; outleft -= outcount; 260 outcount_zero: ; 261 } 262 plocals->l_outbuf = outptr; 263 plocals->l_outbytesleft = outleft; 264 } 265 } 266 267 #endif /* !LIBICONV_PLUG */ 268 269 static size_t unicode_loop_convert (iconv_t icd, 270 const char* * inbuf, size_t *inbytesleft, 271 char* * outbuf, size_t *outbytesleft) 272 { 273 conv_t cd = (conv_t) icd; 274 size_t result = 0; 275 const unsigned char* inptr = (const unsigned char*) *inbuf; 276 size_t inleft = *inbytesleft; 277 unsigned char* outptr = (unsigned char*) *outbuf; 278 size_t outleft = *outbytesleft; 279 while (inleft > 0) { 280 state_t last_istate = cd->istate; 281 ucs4_t wc; 282 int incount; 283 int outcount; 284 incount = cd->ifuncs.xxx_mbtowc(cd,&wc,inptr,inleft); 285 if (incount < 0) { 286 if (incount == RET_ILSEQ) { 287 /* Case 1: invalid input */ 288 if (cd->discard_ilseq) { 289 switch (cd->iindex) { 290 case ei_ucs4: case ei_ucs4be: case ei_ucs4le: 291 case ei_utf32: case ei_utf32be: case ei_utf32le: 292 case ei_ucs4internal: case ei_ucs4swapped: 293 incount = 4; break; 294 case ei_ucs2: case ei_ucs2be: case ei_ucs2le: 295 case ei_utf16: case ei_utf16be: case ei_utf16le: 296 case ei_ucs2internal: case ei_ucs2swapped: 297 incount = 2; break; 298 default: 299 incount = 1; break; 300 } 301 goto outcount_zero; 302 } 303 #ifndef LIBICONV_PLUG 304 else if (cd->fallbacks.mb_to_uc_fallback != NULL) { 305 struct mb_to_uc_fallback_locals locals; 306 switch (cd->iindex) { 307 case ei_ucs4: case ei_ucs4be: case ei_ucs4le: 308 case ei_utf32: case ei_utf32be: case ei_utf32le: 309 case ei_ucs4internal: case ei_ucs4swapped: 310 incount = 4; break; 311 case ei_ucs2: case ei_ucs2be: case ei_ucs2le: 312 case ei_utf16: case ei_utf16be: case ei_utf16le: 313 case ei_ucs2internal: case ei_ucs2swapped: 314 incount = 2; break; 315 default: 316 incount = 1; break; 317 } 318 locals.l_cd = cd; 319 locals.l_outbuf = outptr; 320 locals.l_outbytesleft = outleft; 321 locals.l_errno = 0; 322 cd->fallbacks.mb_to_uc_fallback(inptr, incount, 323 mb_to_uc_write_replacement, 324 &locals, 325 cd->fallbacks.data); 326 if (locals.l_errno != 0) { 327 errno = locals.l_errno; 328 result = -1; 329 break; 330 } 331 outptr = locals.l_outbuf; 332 outleft = locals.l_outbytesleft; 333 result += 1; 334 goto outcount_zero; 335 } 336 #endif 337 errno = EILSEQ; 338 result = -1; 339 break; 340 } 341 if (incount == RET_TOOFEW(0)) { 342 /* Case 2: not enough bytes available to detect anything */ 343 errno = EINVAL; 344 result = -1; 345 break; 346 } 347 /* Case 3: k bytes read, but only a shift sequence */ 348 incount = -2-incount; 349 } else { 350 /* Case 4: k bytes read, making up a wide character */ 351 if (outleft == 0) { 352 cd->istate = last_istate; 353 errno = E2BIG; 354 result = -1; 355 break; 356 } 357 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft); 358 if (outcount != RET_ILUNI) 359 goto outcount_ok; 360 /* Handle Unicode tag characters (range U+E0000..U+E007F). */ 361 if ((wc >> 7) == (0xe0000 >> 7)) 362 goto outcount_zero; 363 /* Try transliteration. */ 364 result++; 365 if (cd->transliterate) { 366 outcount = unicode_transliterate(cd,wc,outptr,outleft); 367 if (outcount != RET_ILUNI) 368 goto outcount_ok; 369 } 370 if (cd->discard_ilseq) { 371 outcount = 0; 372 goto outcount_ok; 373 } 374 #ifndef LIBICONV_PLUG 375 else if (cd->fallbacks.uc_to_mb_fallback != NULL) { 376 struct uc_to_mb_fallback_locals locals; 377 locals.l_outbuf = outptr; 378 locals.l_outbytesleft = outleft; 379 locals.l_errno = 0; 380 cd->fallbacks.uc_to_mb_fallback(wc, 381 uc_to_mb_write_replacement, 382 &locals, 383 cd->fallbacks.data); 384 if (locals.l_errno != 0) { 385 cd->istate = last_istate; 386 errno = locals.l_errno; 387 return -1; 388 } 389 outptr = locals.l_outbuf; 390 outleft = locals.l_outbytesleft; 391 outcount = 0; 392 goto outcount_ok; 393 } 394 #endif 395 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft); 396 if (outcount != RET_ILUNI) 397 goto outcount_ok; 398 cd->istate = last_istate; 399 errno = EILSEQ; 400 result = -1; 401 break; 402 outcount_ok: 403 if (outcount < 0) { 404 cd->istate = last_istate; 405 errno = E2BIG; 406 result = -1; 407 break; 408 } 409 #ifndef LIBICONV_PLUG 410 if (cd->hooks.uc_hook) 411 (*cd->hooks.uc_hook)(wc, cd->hooks.data); 412 #endif 413 if (!(outcount <= outleft)) abort(); 414 outptr += outcount; outleft -= outcount; 415 } 416 outcount_zero: 417 if (!(incount <= inleft)) abort(); 418 inptr += incount; inleft -= incount; 419 } 420 *inbuf = (const char*) inptr; 421 *inbytesleft = inleft; 422 *outbuf = (char*) outptr; 423 *outbytesleft = outleft; 424 return result; 425 } 426 427 static size_t unicode_loop_reset (iconv_t icd, 428 char* * outbuf, size_t *outbytesleft) 429 { 430 conv_t cd = (conv_t) icd; 431 if (outbuf == NULL || *outbuf == NULL) { 432 /* Reset the states. */ 433 memset(&cd->istate,'\0',sizeof(state_t)); 434 memset(&cd->ostate,'\0',sizeof(state_t)); 435 return 0; 436 } else { 437 size_t result = 0; 438 if (cd->ifuncs.xxx_flushwc) { 439 state_t last_istate = cd->istate; 440 ucs4_t wc; 441 if (cd->ifuncs.xxx_flushwc(cd, &wc)) { 442 unsigned char* outptr = (unsigned char*) *outbuf; 443 size_t outleft = *outbytesleft; 444 int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft); 445 if (outcount != RET_ILUNI) 446 goto outcount_ok; 447 /* Handle Unicode tag characters (range U+E0000..U+E007F). */ 448 if ((wc >> 7) == (0xe0000 >> 7)) 449 goto outcount_zero; 450 /* Try transliteration. */ 451 result++; 452 if (cd->transliterate) { 453 outcount = unicode_transliterate(cd,wc,outptr,outleft); 454 if (outcount != RET_ILUNI) 455 goto outcount_ok; 456 } 457 if (cd->discard_ilseq) { 458 outcount = 0; 459 goto outcount_ok; 460 } 461 #ifndef LIBICONV_PLUG 462 else if (cd->fallbacks.uc_to_mb_fallback != NULL) { 463 struct uc_to_mb_fallback_locals locals; 464 locals.l_outbuf = outptr; 465 locals.l_outbytesleft = outleft; 466 locals.l_errno = 0; 467 cd->fallbacks.uc_to_mb_fallback(wc, 468 uc_to_mb_write_replacement, 469 &locals, 470 cd->fallbacks.data); 471 if (locals.l_errno != 0) { 472 cd->istate = last_istate; 473 errno = locals.l_errno; 474 return -1; 475 } 476 outptr = locals.l_outbuf; 477 outleft = locals.l_outbytesleft; 478 outcount = 0; 479 goto outcount_ok; 480 } 481 #endif 482 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft); 483 if (outcount != RET_ILUNI) 484 goto outcount_ok; 485 cd->istate = last_istate; 486 errno = EILSEQ; 487 return -1; 488 outcount_ok: 489 if (outcount < 0) { 490 cd->istate = last_istate; 491 errno = E2BIG; 492 return -1; 493 } 494 #ifndef LIBICONV_PLUG 495 if (cd->hooks.uc_hook) 496 (*cd->hooks.uc_hook)(wc, cd->hooks.data); 497 #endif 498 if (!(outcount <= outleft)) abort(); 499 outptr += outcount; 500 outleft -= outcount; 501 outcount_zero: 502 *outbuf = (char*) outptr; 503 *outbytesleft = outleft; 504 } 505 } 506 if (cd->ofuncs.xxx_reset) { 507 unsigned char* outptr = (unsigned char*) *outbuf; 508 size_t outleft = *outbytesleft; 509 int outcount = cd->ofuncs.xxx_reset(cd,outptr,outleft); 510 if (outcount < 0) { 511 errno = E2BIG; 512 return -1; 513 } 514 if (!(outcount <= outleft)) abort(); 515 *outbuf = (char*) (outptr + outcount); 516 *outbytesleft = outleft - outcount; 517 } 518 memset(&cd->istate,'\0',sizeof(state_t)); 519 memset(&cd->ostate,'\0',sizeof(state_t)); 520 return result; 521 } 522 } 523