xref: /haiku/src/libs/iconv/iso2022_jp3.h (revision b671e9bbdbd10268a042b4f4cc4317ccd03d105e)
1 /*
2  * Copyright (C) 1999-2004 Free Software Foundation, Inc.
3  * This file is part of the GNU LIBICONV Library.
4  *
5  * The GNU LIBICONV Library is free software; you can redistribute it
6  * and/or modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either version 2
8  * of the License, or (at your option) any later version.
9  *
10  * The GNU LIBICONV Library is distributed in the hope that it will be
11  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public
16  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17  * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18  * Fifth Floor, Boston, MA 02110-1301, USA.
19  */
20 
21 /*
22  * ISO-2022-JP-3
23  */
24 
25 #include "jisx0213.h"
26 
/* The escape byte that introduces every designation sequence. */
#define ESC 0x1b

/*
 * Shift states.  The low three bits of the conversion state hold exactly
 * one of these values; the trailing comment on each line shows the escape
 * sequence(s) associated with that state.
 */
#define STATE_ASCII 0             /* Esc ( B */
#define STATE_JISX0201ROMAN 1     /* Esc ( J */
#define STATE_JISX0201KATAKANA 2  /* Esc ( I */
#define STATE_JISX0208 3          /* Esc $ @ or Esc $ B */
#define STATE_JISX02131 4         /* Esc $ ( O or Esc $ ( Q */
#define STATE_JISX02132 5         /* Esc $ ( P */
38 
39 /*
40  * In the ISO-2022-JP-3 to UCS-4 direction, the state also holds the last
41  * character to be output, shifted by 3 bits.
42  */
43 
44 static int
45 iso2022_jp3_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
46 {
47   ucs4_t last_wc = conv->istate >> 3;
48   if (last_wc) {
49     /* Output the buffered character. */
50     conv->istate &= 7;
51     *pwc = last_wc;
52     return 0; /* Don't advance the input pointer. */
53   } else {
54     state_t state = conv->istate;
55     int count = 0;
56     unsigned char c;
57     for (;;) {
58       c = *s;
59       if (c == ESC) {
60         if (n < count+3)
61           goto none;
62         if (s[1] == '(') {
63           if (s[2] == 'B') {
64             state = STATE_ASCII;
65             s += 3; count += 3;
66             if (n < count+1)
67               goto none;
68             continue;
69           }
70           if (s[2] == 'J') {
71             state = STATE_JISX0201ROMAN;
72             s += 3; count += 3;
73             if (n < count+1)
74               goto none;
75             continue;
76           }
77           if (s[2] == 'I') {
78             state = STATE_JISX0201KATAKANA;
79             s += 3; count += 3;
80             if (n < count+1)
81               goto none;
82             continue;
83           }
84           return RET_ILSEQ;
85         }
86         if (s[1] == '$') {
87           if (s[2] == '@' || s[2] == 'B') {
88             /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
89             state = STATE_JISX0208;
90             s += 3; count += 3;
91             if (n < count+1)
92               goto none;
93             continue;
94           }
95           if (s[2] == '(') {
96             if (n < count+4)
97               goto none;
98             if (s[3] == 'O' || s[3] == 'Q') {
99               state = STATE_JISX02131;
100               s += 4; count += 4;
101               if (n < count+1)
102                 goto none;
103               continue;
104             }
105             if (s[3] == 'P') {
106               state = STATE_JISX02132;
107               s += 4; count += 4;
108               if (n < count+1)
109                 goto none;
110               continue;
111             }
112           }
113           return RET_ILSEQ;
114         }
115         return RET_ILSEQ;
116       }
117       break;
118     }
119     switch (state) {
120       case STATE_ASCII:
121         if (c < 0x80) {
122           int ret = ascii_mbtowc(conv,pwc,s,1);
123           if (ret == RET_ILSEQ)
124             return RET_ILSEQ;
125           if (ret != 1) abort();
126           conv->istate = state;
127           return count+1;
128         } else
129           return RET_ILSEQ;
130       case STATE_JISX0201ROMAN:
131         if (c < 0x80) {
132           int ret = jisx0201_mbtowc(conv,pwc,s,1);
133           if (ret == RET_ILSEQ)
134             return RET_ILSEQ;
135           if (ret != 1) abort();
136           conv->istate = state;
137           return count+1;
138         } else
139           return RET_ILSEQ;
140       case STATE_JISX0201KATAKANA:
141         if (c < 0x80) {
142           unsigned char buf = c+0x80;
143           int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
144           if (ret == RET_ILSEQ)
145             return RET_ILSEQ;
146           if (ret != 1) abort();
147           conv->istate = state;
148           return count+1;
149         } else
150           return RET_ILSEQ;
151       case STATE_JISX0208:
152         if (n < count+2)
153           goto none;
154         if (s[0] < 0x80 && s[1] < 0x80) {
155           int ret = jisx0208_mbtowc(conv,pwc,s,2);
156           if (ret == RET_ILSEQ)
157             return RET_ILSEQ;
158           if (ret != 2) abort();
159           conv->istate = state;
160           return count+2;
161         } else
162           return RET_ILSEQ;
163       case STATE_JISX02131:
164       case STATE_JISX02132:
165         if (n < count+2)
166           goto none;
167         if (s[0] < 0x80 && s[1] < 0x80) {
168           ucs4_t wc = jisx0213_to_ucs4(((state-STATE_JISX02131+1)<<8)+s[0],s[1]);
169           if (wc) {
170             if (wc < 0x80) {
171               /* It's a combining character. */
172               ucs4_t wc1 = jisx0213_to_ucs_combining[wc - 1][0];
173               ucs4_t wc2 = jisx0213_to_ucs_combining[wc - 1][1];
174               /* We cannot output two Unicode characters at once. So,
175                  output the first character and buffer the second one. */
176               *pwc = wc1;
177               conv->istate = (wc2 << 3) | state;
178             } else {
179               *pwc = wc;
180               conv->istate = state;
181             }
182             return count+2;
183           }
184         }
185         return RET_ILSEQ;
186       default: abort();
187     }
188   none:
189     conv->istate = state;
190     return RET_TOOFEW(count);
191   }
192 }
193 
194 static int
195 iso2022_jp3_flushwc (conv_t conv, ucs4_t *pwc)
196 {
197   ucs4_t last_wc = conv->istate >> 3;
198   if (last_wc) {
199     /* Output the buffered character. */
200     conv->istate &= 7;
201     *pwc = last_wc;
202     return 1;
203   } else
204     return 0;
205 }
206 
207 /*
208  * In the UCS-4 to ISO-2022-JP-3 direction, the state also holds the last two
209  * bytes to be output, shifted by 3 bits, and the STATE_xxxxx value that was
210  * effective before this buffered character, shifted by 19 bits.
211  */
212 
/*
 * Composition data for the combining characters U+02E5, U+02E9, U+0300,
 * U+0301 and U+309A.  All entries live in one array; for each combining
 * character the macros below give the index of its first entry and the
 * number of entries.  Each entry maps a base character to the composed
 * character, both as two-byte codes.
 */
#define iso2022_jp3_comp_table02e5_idx 0
#define iso2022_jp3_comp_table02e5_len 1
#define iso2022_jp3_comp_table02e9_idx (iso2022_jp3_comp_table02e5_idx+iso2022_jp3_comp_table02e5_len)
#define iso2022_jp3_comp_table02e9_len 1
#define iso2022_jp3_comp_table0300_idx (iso2022_jp3_comp_table02e9_idx+iso2022_jp3_comp_table02e9_len)
#define iso2022_jp3_comp_table0300_len 5
#define iso2022_jp3_comp_table0301_idx (iso2022_jp3_comp_table0300_idx+iso2022_jp3_comp_table0300_len)
#define iso2022_jp3_comp_table0301_len 4
#define iso2022_jp3_comp_table309a_idx (iso2022_jp3_comp_table0301_idx+iso2022_jp3_comp_table0301_len)
#define iso2022_jp3_comp_table309a_len 14

static const struct {
  unsigned short base;
  unsigned short composed;
} iso2022_jp3_comp_table_data[] = {
  /* U+02E5 */
  { 0x2b64, 0x2b65 }, /* 0x12B65 = 0x12B64 U+02E5 */
  /* U+02E9 */
  { 0x2b60, 0x2b66 }, /* 0x12B66 = 0x12B60 U+02E9 */
  /* U+0300 */
  { 0x295c, 0x2b44 }, /* 0x12B44 = 0x1295C U+0300 */
  { 0x2b38, 0x2b48 }, /* 0x12B48 = 0x12B38 U+0300 */
  { 0x2b37, 0x2b4a }, /* 0x12B4A = 0x12B37 U+0300 */
  { 0x2b30, 0x2b4c }, /* 0x12B4C = 0x12B30 U+0300 */
  { 0x2b43, 0x2b4e }, /* 0x12B4E = 0x12B43 U+0300 */
  /* U+0301 */
  { 0x2b38, 0x2b49 }, /* 0x12B49 = 0x12B38 U+0301 */
  { 0x2b37, 0x2b4b }, /* 0x12B4B = 0x12B37 U+0301 */
  { 0x2b30, 0x2b4d }, /* 0x12B4D = 0x12B30 U+0301 */
  { 0x2b43, 0x2b4f }, /* 0x12B4F = 0x12B43 U+0301 */
  /* U+309A */
  { 0x242b, 0x2477 }, /* 0x12477 = 0x1242B U+309A */
  { 0x242d, 0x2478 }, /* 0x12478 = 0x1242D U+309A */
  { 0x242f, 0x2479 }, /* 0x12479 = 0x1242F U+309A */
  { 0x2431, 0x247a }, /* 0x1247A = 0x12431 U+309A */
  { 0x2433, 0x247b }, /* 0x1247B = 0x12433 U+309A */
  { 0x252b, 0x2577 }, /* 0x12577 = 0x1252B U+309A */
  { 0x252d, 0x2578 }, /* 0x12578 = 0x1252D U+309A */
  { 0x252f, 0x2579 }, /* 0x12579 = 0x1252F U+309A */
  { 0x2531, 0x257a }, /* 0x1257A = 0x12531 U+309A */
  { 0x2533, 0x257b }, /* 0x1257B = 0x12533 U+309A */
  { 0x253b, 0x257c }, /* 0x1257C = 0x1253B U+309A */
  { 0x2544, 0x257d }, /* 0x1257D = 0x12544 U+309A */
  { 0x2548, 0x257e }, /* 0x1257E = 0x12548 U+309A */
  { 0x2675, 0x2678 }, /* 0x12678 = 0x12675 U+309A */
};
251 
/*
 * Helpers for the packed output state.  They operate on a local variable
 * named `state`:
 *   SPLIT_STATE    declares `lasttwo` (state >> 3, truncated to 16 bits)
 *                  and `prevstate` (state >> 19), then reduces `state` to
 *                  its low 3 bits.  It starts with declarations, so it
 *                  must be used where declarations are allowed.
 *   COMBINE_STATE  packs `prevstate` and `lasttwo` back into `state`.
 *   COMBINE_STATE_NO_LASTTWO
 *                  expands to nothing: with lasttwo == 0 the value of
 *                  prevstate is ignored, so `state` is already complete.
 */
#define SPLIT_STATE \
  unsigned short lasttwo = state >> 3; \
  state_t prevstate = state >> 19; \
  state &= 7
#define COMBINE_STATE \
  state |= (prevstate << 19) | (lasttwo << 3)
#define COMBINE_STATE_NO_LASTTWO \
  /* assume lasttwo == 0, then prevstate is ignored */
258 
259 static int
260 iso2022_jp3_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
261 {
262   int count = 0;
263   unsigned char buf[2];
264   unsigned short jch;
265   int ret;
266   state_t state = conv->ostate;
267   SPLIT_STATE;
268 
269   if (lasttwo) {
270     /* Attempt to combine the last character with this one. */
271     unsigned int idx;
272     unsigned int len;
273 
274     if (wc == 0x02e5)
275       idx = iso2022_jp3_comp_table02e5_idx,
276       len = iso2022_jp3_comp_table02e5_len;
277     else if (wc == 0x02e9)
278       idx = iso2022_jp3_comp_table02e9_idx,
279       len = iso2022_jp3_comp_table02e9_len;
280     else if (wc == 0x0300)
281       idx = iso2022_jp3_comp_table0300_idx,
282       len = iso2022_jp3_comp_table0300_len;
283     else if (wc == 0x0301)
284       idx = iso2022_jp3_comp_table0301_idx,
285       len = iso2022_jp3_comp_table0301_len;
286     else if (wc == 0x309a)
287       idx = iso2022_jp3_comp_table309a_idx,
288       len = iso2022_jp3_comp_table309a_len;
289     else
290       goto not_combining;
291 
292     do
293       if (iso2022_jp3_comp_table_data[idx].base == lasttwo)
294         break;
295     while (++idx, --len > 0);
296 
297     if (len > 0) {
298       /* Output the combined character. */
299       /* We know the combined character is in JISX0213 plane 1, but
300          the buffered character may have been in JISX0208 or in
301          JISX0213 plane 1. */
302       count = (state != STATE_JISX02131 ? 4 : 0) + 2;
303       if (n < count)
304         return RET_TOOSMALL;
305       if (state != STATE_JISX02131) {
306         r[0] = ESC;
307         r[1] = '$';
308         r[2] = '(';
309         r[3] = 'Q';
310         r += 4;
311         state = STATE_JISX02131;
312       }
313       lasttwo = iso2022_jp3_comp_table_data[idx].composed;
314       r[0] = (lasttwo >> 8) & 0xff;
315       r[1] = lasttwo & 0xff;
316       COMBINE_STATE_NO_LASTTWO;
317       conv->ostate = state;
318       return count;
319     }
320 
321   not_combining:
322     /* Output the buffered character. */
323     /* We know it is in JISX0208 or in JISX0213 plane 1. */
324     count = (prevstate != state ? 3 : 0) + 2;
325     if (n < count)
326       return RET_TOOSMALL;
327     if (prevstate != state) {
328       if (state != STATE_JISX0208) abort();
329       r[0] = ESC;
330       r[1] = '$';
331       r[2] = 'B';
332       r += 3;
333     }
334     r[0] = (lasttwo >> 8) & 0xff;
335     r[1] = lasttwo & 0xff;
336     r += 2;
337   }
338 
339   /* Try ASCII. */
340   ret = ascii_wctomb(conv,buf,wc,1);
341   if (ret != RET_ILUNI) {
342     if (ret != 1) abort();
343     if (buf[0] < 0x80) {
344       count += (state == STATE_ASCII ? 1 : 4);
345       if (n < count)
346         return RET_TOOSMALL;
347       if (state != STATE_ASCII) {
348         r[0] = ESC;
349         r[1] = '(';
350         r[2] = 'B';
351         r += 3;
352         state = STATE_ASCII;
353       }
354       r[0] = buf[0];
355       COMBINE_STATE_NO_LASTTWO;
356       conv->ostate = state;
357       return count;
358     }
359   }
360 
361   /* Try JIS X 0201-1976 Roman. */
362   ret = jisx0201_wctomb(conv,buf,wc,1);
363   if (ret != RET_ILUNI) {
364     if (ret != 1) abort();
365     if (buf[0] < 0x80) {
366       count += (state == STATE_JISX0201ROMAN ? 1 : 4);
367       if (n < count)
368         return RET_TOOSMALL;
369       if (state != STATE_JISX0201ROMAN) {
370         r[0] = ESC;
371         r[1] = '(';
372         r[2] = 'J';
373         r += 3;
374         state = STATE_JISX0201ROMAN;
375       }
376       r[0] = buf[0];
377       COMBINE_STATE_NO_LASTTWO;
378       conv->ostate = state;
379       return count;
380     }
381   }
382 
383   jch = ucs4_to_jisx0213(wc);
384 
385   /* Try JIS X 0208-1990 in place of JIS X 0208-1978 and JIS X 0208-1983. */
386   ret = jisx0208_wctomb(conv,buf,wc,2);
387   if (ret != RET_ILUNI) {
388     if (ret != 2) abort();
389     if (buf[0] < 0x80 && buf[1] < 0x80) {
390       if (jch & 0x0080) {
391         /* A possible match in comp_table_data. Buffer it. */
392         prevstate = state;
393         lasttwo = jch & 0x7f7f;
394         state = STATE_JISX0208;
395         COMBINE_STATE;
396         conv->ostate = state;
397         return count;
398       } else {
399         count += (state == STATE_JISX0208 ? 2 : 5);
400         if (n < count)
401           return RET_TOOSMALL;
402         if (state != STATE_JISX0208) {
403           r[0] = ESC;
404           r[1] = '$';
405           r[2] = 'B';
406           r += 3;
407           state = STATE_JISX0208;
408         }
409         r[0] = buf[0];
410         r[1] = buf[1];
411         COMBINE_STATE_NO_LASTTWO;
412         conv->ostate = state;
413         return count;
414       }
415     }
416   }
417 
418   /* Try JISX 0213 plane 1 and JISX 0213 plane 2. */
419   if (jch != 0) {
420     if (jch & 0x8000) {
421       /* JISX 0213 plane 2. */
422       if (state != STATE_JISX02132) {
423         count += 4;
424         if (n < count)
425           return RET_TOOSMALL;
426         r[0] = ESC;
427         r[1] = '$';
428         r[2] = '(';
429         r[3] = 'P';
430         r += 4;
431         state = STATE_JISX02132;
432       }
433     } else {
434       /* JISX 0213 plane 1. */
435       if (state != STATE_JISX02131) {
436         count += 4;
437         if (n < count)
438           return RET_TOOSMALL;
439         r[0] = ESC;
440         r[1] = '$';
441         r[2] = '(';
442         r[3] = 'Q';
443         r += 4;
444         state = STATE_JISX02131;
445       }
446     }
447     if (jch & 0x0080) {
448       /* A possible match in comp_table_data. We have to buffer it. */
449       /* We know it's a JISX 0213 plane 1 character. */
450       if (jch & 0x8000) abort();
451       prevstate = state;
452       lasttwo = jch & 0x7f7f;
453       COMBINE_STATE;
454       conv->ostate = state;
455       return count;
456     }
457     count += 2;
458     if (n < count)
459       return RET_TOOSMALL;
460     r[0] = (jch >> 8) & 0x7f;
461     r[1] = jch & 0x7f;
462     COMBINE_STATE_NO_LASTTWO;
463     conv->ostate = state;
464     return count;
465   }
466 
467   /* Try JIS X 0201-1976 Katakana. This is not officially part of
468      ISO-2022-JP-3. Therefore we try it after all other attempts. */
469   ret = jisx0201_wctomb(conv,buf,wc,1);
470   if (ret != RET_ILUNI) {
471     if (ret != 1) abort();
472     if (buf[0] >= 0x80) {
473       count += (state == STATE_JISX0201KATAKANA ? 1 : 4);
474       if (n < count)
475         return RET_TOOSMALL;
476       if (state != STATE_JISX0201KATAKANA) {
477         r[0] = ESC;
478         r[1] = '(';
479         r[2] = 'I';
480         r += 3;
481         state = STATE_JISX0201KATAKANA;
482       }
483       r[0] = buf[0]-0x80;
484       COMBINE_STATE_NO_LASTTWO;
485       conv->ostate = state;
486       return count;
487     }
488   }
489 
490   return RET_ILUNI;
491 }
492 
493 static int
494 iso2022_jp3_reset (conv_t conv, unsigned char *r, int n)
495 {
496   state_t state = conv->ostate;
497   SPLIT_STATE;
498   {
499     int count =
500       (lasttwo ? (prevstate != state ? 3 : 0) + 2 : 0)
501       + (state != STATE_ASCII ? 3 : 0);
502     if (n < count)
503       return RET_TOOSMALL;
504     if (lasttwo) {
505       if (prevstate != state) {
506         if (state != STATE_JISX0208) abort();
507         r[0] = ESC;
508         r[1] = '$';
509         r[2] = 'B';
510         r += 3;
511       }
512       r[0] = (lasttwo >> 8) & 0xff;
513       r[1] = lasttwo & 0xff;
514       r += 2;
515     }
516     if (state != STATE_ASCII) {
517       r[0] = ESC;
518       r[1] = '(';
519       r[2] = 'B';
520     }
521     /* conv->ostate = 0; will be done by the caller */
522     return count;
523   }
524 }
525 
/* Drop the helper macros defined by this file: first the shift-state
   values, then the state packing helpers. */
#undef STATE_ASCII
#undef STATE_JISX0201ROMAN
#undef STATE_JISX0201KATAKANA
#undef STATE_JISX0208
#undef STATE_JISX02131
#undef STATE_JISX02132
#undef SPLIT_STATE
#undef COMBINE_STATE
#undef COMBINE_STATE_NO_LASTTWO
535