1 /*
2 * Copyright (C) 1999-2003, 2005-2006 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
4 *
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18 * Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20
/* This file defines the conversion loop via Unicode as a pivot encoding. */
22
23 /* Attempt to transliterate wc. Return code as in xxx_wctomb. */
unicode_transliterate(conv_t cd,ucs4_t wc,unsigned char * outptr,size_t outleft)24 static int unicode_transliterate (conv_t cd, ucs4_t wc,
25 unsigned char* outptr, size_t outleft)
26 {
27 if (cd->oflags & HAVE_HANGUL_JAMO) {
28 /* Decompose Hangul into Jamo. Use double-width Jamo (contained
29 in all Korean encodings and ISO-2022-JP-2), not half-width Jamo
30 (contained in Unicode only). */
31 ucs4_t buf[3];
32 int ret = johab_hangul_decompose(cd,buf,wc);
33 if (ret != RET_ILUNI) {
34 /* we know 1 <= ret <= 3 */
35 state_t backup_state = cd->ostate;
36 unsigned char* backup_outptr = outptr;
37 size_t backup_outleft = outleft;
38 int i, sub_outcount;
39 for (i = 0; i < ret; i++) {
40 if (outleft == 0) {
41 sub_outcount = RET_TOOSMALL;
42 goto johab_hangul_failed;
43 }
44 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
45 if (sub_outcount <= RET_ILUNI)
46 goto johab_hangul_failed;
47 if (!(sub_outcount <= outleft)) abort();
48 outptr += sub_outcount; outleft -= sub_outcount;
49 }
50 return outptr-backup_outptr;
51 johab_hangul_failed:
52 cd->ostate = backup_state;
53 outptr = backup_outptr;
54 outleft = backup_outleft;
55 if (sub_outcount != RET_ILUNI)
56 return RET_TOOSMALL;
57 }
58 }
59 {
60 /* Try to use a variant, but postfix it with
61 U+303E IDEOGRAPHIC VARIATION INDICATOR
62 (cf. Ken Lunde's "CJKV information processing", p. 188). */
63 int indx = -1;
64 if (wc == 0x3006)
65 indx = 0;
66 else if (wc == 0x30f6)
67 indx = 1;
68 else if (wc >= 0x4e00 && wc < 0xa000)
69 indx = cjk_variants_indx[wc-0x4e00];
70 if (indx >= 0) {
71 for (;; indx++) {
72 ucs4_t buf[2];
73 unsigned short variant = cjk_variants[indx];
74 unsigned short last = variant & 0x8000;
75 variant &= 0x7fff;
76 variant += 0x3000;
77 buf[0] = variant; buf[1] = 0x303e;
78 {
79 state_t backup_state = cd->ostate;
80 unsigned char* backup_outptr = outptr;
81 size_t backup_outleft = outleft;
82 int i, sub_outcount;
83 for (i = 0; i < 2; i++) {
84 if (outleft == 0) {
85 sub_outcount = RET_TOOSMALL;
86 goto variant_failed;
87 }
88 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
89 if (sub_outcount <= RET_ILUNI)
90 goto variant_failed;
91 if (!(sub_outcount <= outleft)) abort();
92 outptr += sub_outcount; outleft -= sub_outcount;
93 }
94 return outptr-backup_outptr;
95 variant_failed:
96 cd->ostate = backup_state;
97 outptr = backup_outptr;
98 outleft = backup_outleft;
99 if (sub_outcount != RET_ILUNI)
100 return RET_TOOSMALL;
101 }
102 if (last)
103 break;
104 }
105 }
106 }
107 if (wc >= 0x2018 && wc <= 0x201a) {
108 /* Special case for quotation marks 0x2018, 0x2019, 0x201a */
109 ucs4_t substitute =
110 (cd->oflags & HAVE_QUOTATION_MARKS
111 ? (wc == 0x201a ? 0x2018 : wc)
112 : (cd->oflags & HAVE_ACCENTS
113 ? (wc==0x2019 ? 0x00b4 : 0x0060) /* use accents */
114 : 0x0027 /* use apostrophe */
115 ) );
116 int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,substitute,outleft);
117 if (outcount != RET_ILUNI)
118 return outcount;
119 }
120 {
121 /* Use the transliteration table. */
122 int indx = translit_index(wc);
123 if (indx >= 0) {
124 const unsigned int * cp = &translit_data[indx];
125 unsigned int num = *cp++;
126 state_t backup_state = cd->ostate;
127 unsigned char* backup_outptr = outptr;
128 size_t backup_outleft = outleft;
129 unsigned int i;
130 int sub_outcount;
131 for (i = 0; i < num; i++) {
132 if (outleft == 0) {
133 sub_outcount = RET_TOOSMALL;
134 goto translit_failed;
135 }
136 sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,cp[i],outleft);
137 if (sub_outcount == RET_ILUNI)
138 /* Recursive transliteration. */
139 sub_outcount = unicode_transliterate(cd,cp[i],outptr,outleft);
140 if (sub_outcount <= RET_ILUNI)
141 goto translit_failed;
142 if (!(sub_outcount <= outleft)) abort();
143 outptr += sub_outcount; outleft -= sub_outcount;
144 }
145 return outptr-backup_outptr;
146 translit_failed:
147 cd->ostate = backup_state;
148 outptr = backup_outptr;
149 outleft = backup_outleft;
150 if (sub_outcount != RET_ILUNI)
151 return RET_TOOSMALL;
152 }
153 }
154 return RET_ILUNI;
155 }
156
157 #ifndef LIBICONV_PLUG
158
/* Context passed (as callback_arg) to uc_to_mb_write_replacement while a
   uc_to_mb fallback handler is running. */
struct uc_to_mb_fallback_locals {
  unsigned char* l_outbuf;   /* next byte to be written in the output buffer */
  size_t l_outbytesleft;     /* number of bytes still free at l_outbuf */
  int l_errno;               /* 0, or the errno value of the first failure */
};

/* Callback through which a uc_to_mb fallback handler emits its replacement.
   Copies the buflen bytes at buf verbatim to the output described by
   callback_arg (a struct uc_to_mb_fallback_locals *) and advances it.
   If the bytes do not fit, nothing is copied and l_errno is set to E2BIG.
   Once l_errno is nonzero, all further calls are ignored. */
static void uc_to_mb_write_replacement (const char *buf, size_t buflen,
                                        void* callback_arg)
{
  struct uc_to_mb_fallback_locals * plocals =
    (struct uc_to_mb_fallback_locals *) callback_arg;
  /* Do nothing if already encountered an error in a previous call. */
  if (plocals->l_errno != 0)
    return;
  /* Attempt to copy the passed buffer to the output buffer. */
  if (plocals->l_outbytesleft < buflen) {
    plocals->l_errno = E2BIG;
    return;
  }
  /* An empty replacement is a no-op.  memcpy must not be called for it,
     because buf may then legitimately be NULL, which memcpy does not
     allow even for a length of 0. */
  if (buflen > 0) {
    memcpy(plocals->l_outbuf, buf, buflen);
    plocals->l_outbuf += buflen;
    plocals->l_outbytesleft -= buflen;
  }
}
182
183 struct mb_to_uc_fallback_locals {
184 conv_t l_cd;
185 unsigned char* l_outbuf;
186 size_t l_outbytesleft;
187 int l_errno;
188 };
189
mb_to_uc_write_replacement(const unsigned int * buf,size_t buflen,void * callback_arg)190 static void mb_to_uc_write_replacement (const unsigned int *buf, size_t buflen,
191 void* callback_arg)
192 {
193 struct mb_to_uc_fallback_locals * plocals =
194 (struct mb_to_uc_fallback_locals *) callback_arg;
195 /* Do nothing if already encountered an error in a previous call. */
196 if (plocals->l_errno == 0) {
197 /* Attempt to convert the passed buffer to the target encoding. */
198 conv_t cd = plocals->l_cd;
199 unsigned char* outptr = plocals->l_outbuf;
200 size_t outleft = plocals->l_outbytesleft;
201 for (; buflen > 0; buf++, buflen--) {
202 ucs4_t wc = *buf;
203 int outcount;
204 if (outleft == 0) {
205 plocals->l_errno = E2BIG;
206 break;
207 }
208 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
209 if (outcount != RET_ILUNI)
210 goto outcount_ok;
211 /* Handle Unicode tag characters (range U+E0000..U+E007F). */
212 if ((wc >> 7) == (0xe0000 >> 7))
213 goto outcount_zero;
214 /* Try transliteration. */
215 if (cd->transliterate) {
216 outcount = unicode_transliterate(cd,wc,outptr,outleft);
217 if (outcount != RET_ILUNI)
218 goto outcount_ok;
219 }
220 if (cd->discard_ilseq) {
221 outcount = 0;
222 goto outcount_ok;
223 }
224 #ifndef LIBICONV_PLUG
225 else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
226 struct uc_to_mb_fallback_locals locals;
227 locals.l_outbuf = outptr;
228 locals.l_outbytesleft = outleft;
229 locals.l_errno = 0;
230 cd->fallbacks.uc_to_mb_fallback(wc,
231 uc_to_mb_write_replacement,
232 &locals,
233 cd->fallbacks.data);
234 if (locals.l_errno != 0) {
235 plocals->l_errno = locals.l_errno;
236 break;
237 }
238 outptr = locals.l_outbuf;
239 outleft = locals.l_outbytesleft;
240 outcount = 0;
241 goto outcount_ok;
242 }
243 #endif
244 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
245 if (outcount != RET_ILUNI)
246 goto outcount_ok;
247 plocals->l_errno = EILSEQ;
248 break;
249 outcount_ok:
250 if (outcount < 0) {
251 plocals->l_errno = E2BIG;
252 break;
253 }
254 #ifndef LIBICONV_PLUG
255 if (cd->hooks.uc_hook)
256 (*cd->hooks.uc_hook)(wc, cd->hooks.data);
257 #endif
258 if (!(outcount <= outleft)) abort();
259 outptr += outcount; outleft -= outcount;
260 outcount_zero: ;
261 }
262 plocals->l_outbuf = outptr;
263 plocals->l_outbytesleft = outleft;
264 }
265 }
266
267 #endif /* !LIBICONV_PLUG */
268
unicode_loop_convert(iconv_t icd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)269 static size_t unicode_loop_convert (iconv_t icd,
270 const char* * inbuf, size_t *inbytesleft,
271 char* * outbuf, size_t *outbytesleft)
272 {
273 conv_t cd = (conv_t) icd;
274 size_t result = 0;
275 const unsigned char* inptr = (const unsigned char*) *inbuf;
276 size_t inleft = *inbytesleft;
277 unsigned char* outptr = (unsigned char*) *outbuf;
278 size_t outleft = *outbytesleft;
279 while (inleft > 0) {
280 state_t last_istate = cd->istate;
281 ucs4_t wc;
282 int incount;
283 int outcount;
284 incount = cd->ifuncs.xxx_mbtowc(cd,&wc,inptr,inleft);
285 if (incount < 0) {
286 if (incount == RET_ILSEQ) {
287 /* Case 1: invalid input */
288 if (cd->discard_ilseq) {
289 switch (cd->iindex) {
290 case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
291 case ei_utf32: case ei_utf32be: case ei_utf32le:
292 case ei_ucs4internal: case ei_ucs4swapped:
293 incount = 4; break;
294 case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
295 case ei_utf16: case ei_utf16be: case ei_utf16le:
296 case ei_ucs2internal: case ei_ucs2swapped:
297 incount = 2; break;
298 default:
299 incount = 1; break;
300 }
301 goto outcount_zero;
302 }
303 #ifndef LIBICONV_PLUG
304 else if (cd->fallbacks.mb_to_uc_fallback != NULL) {
305 struct mb_to_uc_fallback_locals locals;
306 switch (cd->iindex) {
307 case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
308 case ei_utf32: case ei_utf32be: case ei_utf32le:
309 case ei_ucs4internal: case ei_ucs4swapped:
310 incount = 4; break;
311 case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
312 case ei_utf16: case ei_utf16be: case ei_utf16le:
313 case ei_ucs2internal: case ei_ucs2swapped:
314 incount = 2; break;
315 default:
316 incount = 1; break;
317 }
318 locals.l_cd = cd;
319 locals.l_outbuf = outptr;
320 locals.l_outbytesleft = outleft;
321 locals.l_errno = 0;
322 cd->fallbacks.mb_to_uc_fallback(inptr, incount,
323 mb_to_uc_write_replacement,
324 &locals,
325 cd->fallbacks.data);
326 if (locals.l_errno != 0) {
327 errno = locals.l_errno;
328 result = -1;
329 break;
330 }
331 outptr = locals.l_outbuf;
332 outleft = locals.l_outbytesleft;
333 result += 1;
334 goto outcount_zero;
335 }
336 #endif
337 errno = EILSEQ;
338 result = -1;
339 break;
340 }
341 if (incount == RET_TOOFEW(0)) {
342 /* Case 2: not enough bytes available to detect anything */
343 errno = EINVAL;
344 result = -1;
345 break;
346 }
347 /* Case 3: k bytes read, but only a shift sequence */
348 incount = -2-incount;
349 } else {
350 /* Case 4: k bytes read, making up a wide character */
351 if (outleft == 0) {
352 cd->istate = last_istate;
353 errno = E2BIG;
354 result = -1;
355 break;
356 }
357 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
358 if (outcount != RET_ILUNI)
359 goto outcount_ok;
360 /* Handle Unicode tag characters (range U+E0000..U+E007F). */
361 if ((wc >> 7) == (0xe0000 >> 7))
362 goto outcount_zero;
363 /* Try transliteration. */
364 result++;
365 if (cd->transliterate) {
366 outcount = unicode_transliterate(cd,wc,outptr,outleft);
367 if (outcount != RET_ILUNI)
368 goto outcount_ok;
369 }
370 if (cd->discard_ilseq) {
371 outcount = 0;
372 goto outcount_ok;
373 }
374 #ifndef LIBICONV_PLUG
375 else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
376 struct uc_to_mb_fallback_locals locals;
377 locals.l_outbuf = outptr;
378 locals.l_outbytesleft = outleft;
379 locals.l_errno = 0;
380 cd->fallbacks.uc_to_mb_fallback(wc,
381 uc_to_mb_write_replacement,
382 &locals,
383 cd->fallbacks.data);
384 if (locals.l_errno != 0) {
385 cd->istate = last_istate;
386 errno = locals.l_errno;
387 return -1;
388 }
389 outptr = locals.l_outbuf;
390 outleft = locals.l_outbytesleft;
391 outcount = 0;
392 goto outcount_ok;
393 }
394 #endif
395 outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
396 if (outcount != RET_ILUNI)
397 goto outcount_ok;
398 cd->istate = last_istate;
399 errno = EILSEQ;
400 result = -1;
401 break;
402 outcount_ok:
403 if (outcount < 0) {
404 cd->istate = last_istate;
405 errno = E2BIG;
406 result = -1;
407 break;
408 }
409 #ifndef LIBICONV_PLUG
410 if (cd->hooks.uc_hook)
411 (*cd->hooks.uc_hook)(wc, cd->hooks.data);
412 #endif
413 if (!(outcount <= outleft)) abort();
414 outptr += outcount; outleft -= outcount;
415 }
416 outcount_zero:
417 if (!(incount <= inleft)) abort();
418 inptr += incount; inleft -= incount;
419 }
420 *inbuf = (const char*) inptr;
421 *inbytesleft = inleft;
422 *outbuf = (char*) outptr;
423 *outbytesleft = outleft;
424 return result;
425 }
426
/* Reset the conversion descriptor to its initial state.
   If outbuf or *outbuf is NULL, only the input and output states are
   cleared and 0 is returned.
   Otherwise a character still buffered on the input side (obtained through
   xxx_flushwc, if the input encoding has one) is converted and written
   first, then the output side's reset sequence (xxx_reset, if any) is
   written, and finally both states are cleared.  *outbuf and *outbytesleft
   are advanced past the bytes written.
   Returns the number of buffered characters that could not be converted
   directly (0 or 1), or (size_t)(-1) with errno set to E2BIG, EILSEQ, or the
   errno reported by the uc_to_mb fallback handler.  When converting the
   buffered character fails, the input state is restored so that the call
   can be repeated. */
static size_t unicode_loop_reset (iconv_t icd,
                                  char* * outbuf, size_t *outbytesleft)
{
  conv_t cd = (conv_t) icd;
  if (outbuf == NULL || *outbuf == NULL) {
    /* Reset the states. */
    memset(&cd->istate,'\0',sizeof(state_t));
    memset(&cd->ostate,'\0',sizeof(state_t));
    return 0;
  } else {
    size_t result = 0;
    if (cd->ifuncs.xxx_flushwc) {
      /* Input state before flushing, restored on every failure below. */
      state_t last_istate = cd->istate;
      ucs4_t wc;
      if (cd->ifuncs.xxx_flushwc(cd, &wc)) {
        unsigned char* outptr = (unsigned char*) *outbuf;
        size_t outleft = *outbytesleft;
        int outcount;
        /* As everywhere else in this file, never hand an empty output
           buffer to xxx_wctomb; report the lack of room directly. */
        if (outleft == 0) {
          cd->istate = last_istate;
          errno = E2BIG;
          return -1;
        }
        outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
        if (outcount != RET_ILUNI)
          goto outcount_ok;
        /* Handle Unicode tag characters (range U+E0000..U+E007F). */
        if ((wc >> 7) == (0xe0000 >> 7))
          goto outcount_zero;
        /* Try transliteration. */
        result++;
        if (cd->transliterate) {
          outcount = unicode_transliterate(cd,wc,outptr,outleft);
          if (outcount != RET_ILUNI)
            goto outcount_ok;
        }
        if (cd->discard_ilseq) {
          outcount = 0;
          goto outcount_ok;
        }
        #ifndef LIBICONV_PLUG
        else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
          struct uc_to_mb_fallback_locals locals;
          locals.l_outbuf = outptr;
          locals.l_outbytesleft = outleft;
          locals.l_errno = 0;
          cd->fallbacks.uc_to_mb_fallback(wc,
                                          uc_to_mb_write_replacement,
                                          &locals,
                                          cd->fallbacks.data);
          if (locals.l_errno != 0) {
            cd->istate = last_istate;
            errno = locals.l_errno;
            return -1;
          }
          outptr = locals.l_outbuf;
          outleft = locals.l_outbytesleft;
          outcount = 0;
          goto outcount_ok;
        }
        #endif
        outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
        if (outcount != RET_ILUNI)
          goto outcount_ok;
        cd->istate = last_istate;
        errno = EILSEQ;
        return -1;
      outcount_ok:
        /* Any remaining negative code means the output buffer is too small. */
        if (outcount < 0) {
          cd->istate = last_istate;
          errno = E2BIG;
          return -1;
        }
        #ifndef LIBICONV_PLUG
        if (cd->hooks.uc_hook)
          (*cd->hooks.uc_hook)(wc, cd->hooks.data);
        #endif
        if (!(outcount <= outleft)) abort();
        outptr += outcount;
        outleft -= outcount;
      outcount_zero:
        /* Commit the flushed character's output. */
        *outbuf = (char*) outptr;
        *outbytesleft = outleft;
      }
    }
    if (cd->ofuncs.xxx_reset) {
      /* Let the output encoding write its reset sequence. */
      unsigned char* outptr = (unsigned char*) *outbuf;
      size_t outleft = *outbytesleft;
      int outcount = cd->ofuncs.xxx_reset(cd,outptr,outleft);
      if (outcount < 0) {
        errno = E2BIG;
        return -1;
      }
      if (!(outcount <= outleft)) abort();
      *outbuf = (char*) (outptr + outcount);
      *outbytesleft = outleft - outcount;
    }
    memset(&cd->istate,'\0',sizeof(state_t));
    memset(&cd->ostate,'\0',sizeof(state_t));
    return result;
  }
}
523