xref: /haiku/src/libs/iconv/loop_unicode.h (revision aef5731f38da6f7b913e0f64acd8a40555491ce5)
/*
 * Copyright (C) 1999-2003, 2005-2006 Free Software Foundation, Inc.
 * This file is part of the GNU LIBICONV Library.
 *
 * The GNU LIBICONV Library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * The GNU LIBICONV Library is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
 * Fifth Floor, Boston, MA 02110-1301, USA.
 */

/* This file defines the conversion loop via Unicode as a pivot encoding. */
22 
23 /* Attempt to transliterate wc. Return code as in xxx_wctomb. */
unicode_transliterate(conv_t cd,ucs4_t wc,unsigned char * outptr,size_t outleft)24 static int unicode_transliterate (conv_t cd, ucs4_t wc,
25                                   unsigned char* outptr, size_t outleft)
26 {
27   if (cd->oflags & HAVE_HANGUL_JAMO) {
28     /* Decompose Hangul into Jamo. Use double-width Jamo (contained
29        in all Korean encodings and ISO-2022-JP-2), not half-width Jamo
30        (contained in Unicode only). */
31     ucs4_t buf[3];
32     int ret = johab_hangul_decompose(cd,buf,wc);
33     if (ret != RET_ILUNI) {
34       /* we know 1 <= ret <= 3 */
35       state_t backup_state = cd->ostate;
36       unsigned char* backup_outptr = outptr;
37       size_t backup_outleft = outleft;
38       int i, sub_outcount;
39       for (i = 0; i < ret; i++) {
40         if (outleft == 0) {
41           sub_outcount = RET_TOOSMALL;
42           goto johab_hangul_failed;
43         }
44         sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
45         if (sub_outcount <= RET_ILUNI)
46           goto johab_hangul_failed;
47         if (!(sub_outcount <= outleft)) abort();
48         outptr += sub_outcount; outleft -= sub_outcount;
49       }
50       return outptr-backup_outptr;
51     johab_hangul_failed:
52       cd->ostate = backup_state;
53       outptr = backup_outptr;
54       outleft = backup_outleft;
55       if (sub_outcount != RET_ILUNI)
56         return RET_TOOSMALL;
57     }
58   }
59   {
60     /* Try to use a variant, but postfix it with
61        U+303E IDEOGRAPHIC VARIATION INDICATOR
62        (cf. Ken Lunde's "CJKV information processing", p. 188). */
63     int indx = -1;
64     if (wc == 0x3006)
65       indx = 0;
66     else if (wc == 0x30f6)
67       indx = 1;
68     else if (wc >= 0x4e00 && wc < 0xa000)
69       indx = cjk_variants_indx[wc-0x4e00];
70     if (indx >= 0) {
71       for (;; indx++) {
72         ucs4_t buf[2];
73         unsigned short variant = cjk_variants[indx];
74         unsigned short last = variant & 0x8000;
75         variant &= 0x7fff;
76         variant += 0x3000;
77         buf[0] = variant; buf[1] = 0x303e;
78         {
79           state_t backup_state = cd->ostate;
80           unsigned char* backup_outptr = outptr;
81           size_t backup_outleft = outleft;
82           int i, sub_outcount;
83           for (i = 0; i < 2; i++) {
84             if (outleft == 0) {
85               sub_outcount = RET_TOOSMALL;
86               goto variant_failed;
87             }
88             sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
89             if (sub_outcount <= RET_ILUNI)
90               goto variant_failed;
91             if (!(sub_outcount <= outleft)) abort();
92             outptr += sub_outcount; outleft -= sub_outcount;
93           }
94           return outptr-backup_outptr;
95         variant_failed:
96           cd->ostate = backup_state;
97           outptr = backup_outptr;
98           outleft = backup_outleft;
99           if (sub_outcount != RET_ILUNI)
100             return RET_TOOSMALL;
101         }
102         if (last)
103           break;
104       }
105     }
106   }
107   if (wc >= 0x2018 && wc <= 0x201a) {
108     /* Special case for quotation marks 0x2018, 0x2019, 0x201a */
109     ucs4_t substitute =
110       (cd->oflags & HAVE_QUOTATION_MARKS
111        ? (wc == 0x201a ? 0x2018 : wc)
112        : (cd->oflags & HAVE_ACCENTS
113           ? (wc==0x2019 ? 0x00b4 : 0x0060) /* use accents */
114           : 0x0027 /* use apostrophe */
115       )  );
116     int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,substitute,outleft);
117     if (outcount != RET_ILUNI)
118       return outcount;
119   }
120   {
121     /* Use the transliteration table. */
122     int indx = translit_index(wc);
123     if (indx >= 0) {
124       const unsigned int * cp = &translit_data[indx];
125       unsigned int num = *cp++;
126       state_t backup_state = cd->ostate;
127       unsigned char* backup_outptr = outptr;
128       size_t backup_outleft = outleft;
129       unsigned int i;
130       int sub_outcount;
131       for (i = 0; i < num; i++) {
132         if (outleft == 0) {
133           sub_outcount = RET_TOOSMALL;
134           goto translit_failed;
135         }
136         sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,cp[i],outleft);
137         if (sub_outcount == RET_ILUNI)
138           /* Recursive transliteration. */
139           sub_outcount = unicode_transliterate(cd,cp[i],outptr,outleft);
140         if (sub_outcount <= RET_ILUNI)
141           goto translit_failed;
142         if (!(sub_outcount <= outleft)) abort();
143         outptr += sub_outcount; outleft -= sub_outcount;
144       }
145       return outptr-backup_outptr;
146     translit_failed:
147       cd->ostate = backup_state;
148       outptr = backup_outptr;
149       outleft = backup_outleft;
150       if (sub_outcount != RET_ILUNI)
151         return RET_TOOSMALL;
152     }
153   }
154   return RET_ILUNI;
155 }
156 
157 #ifndef LIBICONV_PLUG
158 
/* Bookkeeping handed to uc_to_mb_write_replacement through its opaque
   callback argument while a uc_to_mb_fallback handler is running. */
struct uc_to_mb_fallback_locals {
  unsigned char* l_outbuf;   /* next free byte of the output buffer */
  size_t l_outbytesleft;     /* number of bytes still free at l_outbuf */
  int l_errno;               /* 0, or the errno value of the first failure */
};

/* Callback given to a uc_to_mb_fallback handler: append the buflen bytes at
   buf to the output described by callback_arg, which must point to a
   struct uc_to_mb_fallback_locals.
   If the bytes do not fit, nothing is copied and l_errno is set to E2BIG.
   Once l_errno is nonzero, all further calls are ignored. */
static void uc_to_mb_write_replacement (const char *buf, size_t buflen,
                                        void* callback_arg)
{
  struct uc_to_mb_fallback_locals * plocals =
    (struct uc_to_mb_fallback_locals *) callback_arg;
  /* Do nothing if already encountered an error in a previous call. */
  if (plocals->l_errno != 0)
    return;
  if (plocals->l_outbytesleft < buflen) {
    plocals->l_errno = E2BIG;
    return;
  }
  /* Attempt to copy the passed buffer to the output buffer.  An empty
     replacement is skipped entirely: calling memcpy with a null pointer is
     undefined even for a length of 0, and a handler may legitimately pass
     (NULL, 0). */
  if (buflen > 0) {
    memcpy(plocals->l_outbuf, buf, buflen);
    plocals->l_outbuf += buflen;
    plocals->l_outbytesleft -= buflen;
  }
}
182 
183 struct mb_to_uc_fallback_locals {
184   conv_t l_cd;
185   unsigned char* l_outbuf;
186   size_t l_outbytesleft;
187   int l_errno;
188 };
189 
mb_to_uc_write_replacement(const unsigned int * buf,size_t buflen,void * callback_arg)190 static void mb_to_uc_write_replacement (const unsigned int *buf, size_t buflen,
191                                         void* callback_arg)
192 {
193   struct mb_to_uc_fallback_locals * plocals =
194     (struct mb_to_uc_fallback_locals *) callback_arg;
195   /* Do nothing if already encountered an error in a previous call. */
196   if (plocals->l_errno == 0) {
197     /* Attempt to convert the passed buffer to the target encoding. */
198     conv_t cd = plocals->l_cd;
199     unsigned char* outptr = plocals->l_outbuf;
200     size_t outleft = plocals->l_outbytesleft;
201     for (; buflen > 0; buf++, buflen--) {
202       ucs4_t wc = *buf;
203       int outcount;
204       if (outleft == 0) {
205         plocals->l_errno = E2BIG;
206         break;
207       }
208       outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
209       if (outcount != RET_ILUNI)
210         goto outcount_ok;
211       /* Handle Unicode tag characters (range U+E0000..U+E007F). */
212       if ((wc >> 7) == (0xe0000 >> 7))
213         goto outcount_zero;
214       /* Try transliteration. */
215       if (cd->transliterate) {
216         outcount = unicode_transliterate(cd,wc,outptr,outleft);
217         if (outcount != RET_ILUNI)
218           goto outcount_ok;
219       }
220       if (cd->discard_ilseq) {
221         outcount = 0;
222         goto outcount_ok;
223       }
224       #ifndef LIBICONV_PLUG
225       else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
226         struct uc_to_mb_fallback_locals locals;
227         locals.l_outbuf = outptr;
228         locals.l_outbytesleft = outleft;
229         locals.l_errno = 0;
230         cd->fallbacks.uc_to_mb_fallback(wc,
231                                         uc_to_mb_write_replacement,
232                                         &locals,
233                                         cd->fallbacks.data);
234         if (locals.l_errno != 0) {
235           plocals->l_errno = locals.l_errno;
236           break;
237         }
238         outptr = locals.l_outbuf;
239         outleft = locals.l_outbytesleft;
240         outcount = 0;
241         goto outcount_ok;
242       }
243       #endif
244       outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
245       if (outcount != RET_ILUNI)
246         goto outcount_ok;
247       plocals->l_errno = EILSEQ;
248       break;
249     outcount_ok:
250       if (outcount < 0) {
251         plocals->l_errno = E2BIG;
252         break;
253       }
254       #ifndef LIBICONV_PLUG
255       if (cd->hooks.uc_hook)
256         (*cd->hooks.uc_hook)(wc, cd->hooks.data);
257       #endif
258       if (!(outcount <= outleft)) abort();
259       outptr += outcount; outleft -= outcount;
260     outcount_zero: ;
261     }
262     plocals->l_outbuf = outptr;
263     plocals->l_outbytesleft = outleft;
264   }
265 }
266 
267 #endif /* !LIBICONV_PLUG */
268 
unicode_loop_convert(iconv_t icd,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)269 static size_t unicode_loop_convert (iconv_t icd,
270                                     const char* * inbuf, size_t *inbytesleft,
271                                     char* * outbuf, size_t *outbytesleft)
272 {
273   conv_t cd = (conv_t) icd;
274   size_t result = 0;
275   const unsigned char* inptr = (const unsigned char*) *inbuf;
276   size_t inleft = *inbytesleft;
277   unsigned char* outptr = (unsigned char*) *outbuf;
278   size_t outleft = *outbytesleft;
279   while (inleft > 0) {
280     state_t last_istate = cd->istate;
281     ucs4_t wc;
282     int incount;
283     int outcount;
284     incount = cd->ifuncs.xxx_mbtowc(cd,&wc,inptr,inleft);
285     if (incount < 0) {
286       if (incount == RET_ILSEQ) {
287         /* Case 1: invalid input */
288         if (cd->discard_ilseq) {
289           switch (cd->iindex) {
290             case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
291             case ei_utf32: case ei_utf32be: case ei_utf32le:
292             case ei_ucs4internal: case ei_ucs4swapped:
293               incount = 4; break;
294             case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
295             case ei_utf16: case ei_utf16be: case ei_utf16le:
296             case ei_ucs2internal: case ei_ucs2swapped:
297               incount = 2; break;
298             default:
299               incount = 1; break;
300           }
301           goto outcount_zero;
302         }
303         #ifndef LIBICONV_PLUG
304         else if (cd->fallbacks.mb_to_uc_fallback != NULL) {
305           struct mb_to_uc_fallback_locals locals;
306           switch (cd->iindex) {
307             case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
308             case ei_utf32: case ei_utf32be: case ei_utf32le:
309             case ei_ucs4internal: case ei_ucs4swapped:
310               incount = 4; break;
311             case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
312             case ei_utf16: case ei_utf16be: case ei_utf16le:
313             case ei_ucs2internal: case ei_ucs2swapped:
314               incount = 2; break;
315             default:
316               incount = 1; break;
317           }
318           locals.l_cd = cd;
319           locals.l_outbuf = outptr;
320           locals.l_outbytesleft = outleft;
321           locals.l_errno = 0;
322           cd->fallbacks.mb_to_uc_fallback(inptr, incount,
323                                           mb_to_uc_write_replacement,
324                                           &locals,
325                                           cd->fallbacks.data);
326           if (locals.l_errno != 0) {
327             errno = locals.l_errno;
328             result = -1;
329             break;
330           }
331           outptr = locals.l_outbuf;
332           outleft = locals.l_outbytesleft;
333           result += 1;
334           goto outcount_zero;
335         }
336         #endif
337         errno = EILSEQ;
338         result = -1;
339         break;
340       }
341       if (incount == RET_TOOFEW(0)) {
342         /* Case 2: not enough bytes available to detect anything */
343         errno = EINVAL;
344         result = -1;
345         break;
346       }
347       /* Case 3: k bytes read, but only a shift sequence */
348       incount = -2-incount;
349     } else {
350       /* Case 4: k bytes read, making up a wide character */
351       if (outleft == 0) {
352         cd->istate = last_istate;
353         errno = E2BIG;
354         result = -1;
355         break;
356       }
357       outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
358       if (outcount != RET_ILUNI)
359         goto outcount_ok;
360       /* Handle Unicode tag characters (range U+E0000..U+E007F). */
361       if ((wc >> 7) == (0xe0000 >> 7))
362         goto outcount_zero;
363       /* Try transliteration. */
364       result++;
365       if (cd->transliterate) {
366         outcount = unicode_transliterate(cd,wc,outptr,outleft);
367         if (outcount != RET_ILUNI)
368           goto outcount_ok;
369       }
370       if (cd->discard_ilseq) {
371         outcount = 0;
372         goto outcount_ok;
373       }
374       #ifndef LIBICONV_PLUG
375       else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
376         struct uc_to_mb_fallback_locals locals;
377         locals.l_outbuf = outptr;
378         locals.l_outbytesleft = outleft;
379         locals.l_errno = 0;
380         cd->fallbacks.uc_to_mb_fallback(wc,
381                                         uc_to_mb_write_replacement,
382                                         &locals,
383                                         cd->fallbacks.data);
384         if (locals.l_errno != 0) {
385           cd->istate = last_istate;
386           errno = locals.l_errno;
387           return -1;
388         }
389         outptr = locals.l_outbuf;
390         outleft = locals.l_outbytesleft;
391         outcount = 0;
392         goto outcount_ok;
393       }
394       #endif
395       outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
396       if (outcount != RET_ILUNI)
397         goto outcount_ok;
398       cd->istate = last_istate;
399       errno = EILSEQ;
400       result = -1;
401       break;
402     outcount_ok:
403       if (outcount < 0) {
404         cd->istate = last_istate;
405         errno = E2BIG;
406         result = -1;
407         break;
408       }
409       #ifndef LIBICONV_PLUG
410       if (cd->hooks.uc_hook)
411         (*cd->hooks.uc_hook)(wc, cd->hooks.data);
412       #endif
413       if (!(outcount <= outleft)) abort();
414       outptr += outcount; outleft -= outcount;
415     }
416   outcount_zero:
417     if (!(incount <= inleft)) abort();
418     inptr += incount; inleft -= incount;
419   }
420   *inbuf = (const char*) inptr;
421   *inbytesleft = inleft;
422   *outbuf = (char*) outptr;
423   *outbytesleft = outleft;
424   return result;
425 }
426 
/* Bring the conversion descriptor back to its initial state.
   If outbuf is NULL or *outbuf is NULL, the input and output states are
   simply zeroed and 0 is returned.
   Otherwise a character still held back by the input decoder
   (cd->ifuncs.xxx_flushwc) is converted and written first, then the output
   encoder's reset sequence (cd->ofuncs.xxx_reset) is written, *outbuf and
   *outbytesleft are advanced accordingly, and both states are zeroed.
   Returns the number of substitutions made for the flushed character (0 or
   1), or (size_t)(-1) with errno set to E2BIG or EILSEQ (or to the error
   reported by the uc_to_mb_fallback handler). */
static size_t unicode_loop_reset (iconv_t icd,
                                  char* * outbuf, size_t *outbytesleft)
{
  conv_t cd = (conv_t) icd;
  if (outbuf == NULL || *outbuf == NULL) {
    /* Reset the states. */
    memset(&cd->istate,'\0',sizeof(state_t));
    memset(&cd->ostate,'\0',sizeof(state_t));
    return 0;
  } else {
    size_t result = 0;
    if (cd->ifuncs.xxx_flushwc) {
      /* Saved before flushing so that a failed attempt can be retried. */
      state_t last_istate = cd->istate;
      ucs4_t wc;
      if (cd->ifuncs.xxx_flushwc(cd, &wc)) {
        unsigned char* outptr = (unsigned char*) *outbuf;
        size_t outleft = *outbytesleft;
        int outcount;
        /* Every other conversion path in this file makes sure that
           outleft > 0 before its first xxx_wctomb call; do the same here
           so that an encoder relying on that cannot write past an empty
           output buffer. */
        if (outleft == 0) {
          cd->istate = last_istate;
          errno = E2BIG;
          return -1;
        }
        outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
        if (outcount != RET_ILUNI)
          goto outcount_ok;
        /* Handle Unicode tag characters (range U+E0000..U+E007F). */
        if ((wc >> 7) == (0xe0000 >> 7))
          goto outcount_zero;
        /* Try transliteration. */
        result++;
        if (cd->transliterate) {
          outcount = unicode_transliterate(cd,wc,outptr,outleft);
          if (outcount != RET_ILUNI)
            goto outcount_ok;
        }
        if (cd->discard_ilseq) {
          outcount = 0;
          goto outcount_ok;
        }
        #ifndef LIBICONV_PLUG
        else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
          struct uc_to_mb_fallback_locals locals;
          locals.l_outbuf = outptr;
          locals.l_outbytesleft = outleft;
          locals.l_errno = 0;
          cd->fallbacks.uc_to_mb_fallback(wc,
                                          uc_to_mb_write_replacement,
                                          &locals,
                                          cd->fallbacks.data);
          if (locals.l_errno != 0) {
            cd->istate = last_istate;
            errno = locals.l_errno;
            return -1;
          }
          outptr = locals.l_outbuf;
          outleft = locals.l_outbytesleft;
          outcount = 0;
          goto outcount_ok;
        }
        #endif
        outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
        if (outcount != RET_ILUNI)
          goto outcount_ok;
        cd->istate = last_istate;
        errno = EILSEQ;
        return -1;
      outcount_ok:
        if (outcount < 0) {
          cd->istate = last_istate;
          errno = E2BIG;
          return -1;
        }
        #ifndef LIBICONV_PLUG
        if (cd->hooks.uc_hook)
          (*cd->hooks.uc_hook)(wc, cd->hooks.data);
        #endif
        if (!(outcount <= outleft)) abort();
        outptr += outcount;
        outleft -= outcount;
      outcount_zero:
        *outbuf = (char*) outptr;
        *outbytesleft = outleft;
      }
    }
    if (cd->ofuncs.xxx_reset) {
      /* Let the output encoder write its reset sequence. */
      unsigned char* outptr = (unsigned char*) *outbuf;
      size_t outleft = *outbytesleft;
      int outcount = cd->ofuncs.xxx_reset(cd,outptr,outleft);
      if (outcount < 0) {
        errno = E2BIG;
        return -1;
      }
      if (!(outcount <= outleft)) abort();
      *outbuf = (char*) (outptr + outcount);
      *outbytesleft = outleft - outcount;
    }
    memset(&cd->istate,'\0',sizeof(state_t));
    memset(&cd->ostate,'\0',sizeof(state_t));
    return result;
  }
}
523