xref: /haiku/src/libs/iconv/tcvn.h (revision 2b76973fa2401f7a5edf68e6470f3d3210cbcff3)
1 /*
2  * Copyright (C) 1999-2002, 2004 Free Software Foundation, Inc.
3  * This file is part of the GNU LIBICONV Library.
4  *
5  * The GNU LIBICONV Library is free software; you can redistribute it
6  * and/or modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either version 2
8  * of the License, or (at your option) any later version.
9  *
10  * The GNU LIBICONV Library is distributed in the hope that it will be
11  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public
16  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17  * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18  * Fifth Floor, Boston, MA 02110-1301, USA.
19  */
20 
21 /*
22  * TCVN-5712
23  */
24 
25 #include "flushwc.h"
26 #include "vietcomb.h"
27 
/* TCVN bytes of the five combining tone marks, in the order
   U+0300, U+0301, U+0303, U+0309, U+0323 (these are the bytes that
   tcvn_2uni_2 maps back to those code points).  tcvn_wctomb indexes this
   table with viet_decomp_table[i].comb1 to emit the mark after the base. */
28 static const unsigned char tcvn_comb_table[] = {
29   0xb0, 0xb3, 0xb2, 0xb1, 0xb4,
30 };
31 
32 /* The possible bases in viet_comp_table_data:
33    0x0041..0x0045, 0x0047..0x0049, 0x004B..0x0050, 0x0052..0x0057,
34    0x0059..0x005A, 0x0061..0x0065, 0x0067..0x0069, 0x006B..0x0070,
35    0x0072..0x0077, 0x0079..0x007A, 0x00A5, 0x00C2, 0x00CA, 0x00D3..0x00D6,
36    0x00DA, 0x00E2, 0x00EA, 0x00F3..0x00F6, 0x00FA, 0x0102..0x0103,
37    0x0168..0x0169, 0x01A0..0x01A1, 0x01AF..0x01B0.

      The table below is a bitmap of that set.  Word j covers the 32 code
      points starting at 0x0040 + 32*j; bit (wc & 0x1f) of word
      (wc - 0x0040) >> 5 is set when wc is one of the bases listed above.
      tcvn_mbtowc consults it only for 0x0041 <= wc <= 0x01b0, which stays
      within the 12 words. */
38 static const unsigned int tcvn_comp_bases[] = {
39   0x06fdfbbe, 0x06fdfbbe, 0x00000000, 0x00000020, 0x04780404, 0x04780404,
40   0x0000000c, 0x00000000, 0x00000000, 0x00000300, 0x00000000, 0x00018003
41 };
42 
/* Unicode values for the TCVN bytes 0x00..0x17, indexed by the byte itself.
   tcvn_mbtowc uses this table for bytes below 0x18; bytes 0x18..0x7f map to
   themselves. */
43 static const unsigned short tcvn_2uni_1[24] = {
44   /* 0x00 */
45   0x0000, 0x00da, 0x1ee4, 0x0003, 0x1eea, 0x1eec, 0x1eee, 0x0007,
46   0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
47   /* 0x10 */
48   0x0010, 0x1ee8, 0x1ef0, 0x1ef2, 0x1ef6, 0x1ef8, 0x00dd, 0x1ef4,
49 };
/* Unicode values for the TCVN bytes 0x80..0xff, indexed by (byte - 0x80).
   Bytes 0xb0..0xb4 are the combining marks U+0300, U+0309, U+0303, U+0301
   and U+0323; tcvn_mbtowc tries to merge them with a buffered base. */
50 static const unsigned short tcvn_2uni_2[128] = {
51   /* 0x80 */
52   0x00c0, 0x1ea2, 0x00c3, 0x00c1, 0x1ea0, 0x1eb6, 0x1eac, 0x00c8,
53   0x1eba, 0x1ebc, 0x00c9, 0x1eb8, 0x1ec6, 0x00cc, 0x1ec8, 0x0128,
54   /* 0x90 */
55   0x00cd, 0x1eca, 0x00d2, 0x1ece, 0x00d5, 0x00d3, 0x1ecc, 0x1ed8,
56   0x1edc, 0x1ede, 0x1ee0, 0x1eda, 0x1ee2, 0x00d9, 0x1ee6, 0x0168,
57   /* 0xa0 */
58   0x00a0, 0x0102, 0x00c2, 0x00ca, 0x00d4, 0x01a0, 0x01af, 0x0110,
59   0x0103, 0x00e2, 0x00ea, 0x00f4, 0x01a1, 0x01b0, 0x0111, 0x1eb0,
60   /* 0xb0 */
61   0x0300, 0x0309, 0x0303, 0x0301, 0x0323, 0x00e0, 0x1ea3, 0x00e3,
62   0x00e1, 0x1ea1, 0x1eb2, 0x1eb1, 0x1eb3, 0x1eb5, 0x1eaf, 0x1eb4,
63   /* 0xc0 */
64   0x1eae, 0x1ea6, 0x1ea8, 0x1eaa, 0x1ea4, 0x1ec0, 0x1eb7, 0x1ea7,
65   0x1ea9, 0x1eab, 0x1ea5, 0x1ead, 0x00e8, 0x1ec2, 0x1ebb, 0x1ebd,
66   /* 0xd0 */
67   0x00e9, 0x1eb9, 0x1ec1, 0x1ec3, 0x1ec5, 0x1ebf, 0x1ec7, 0x00ec,
68   0x1ec9, 0x1ec4, 0x1ebe, 0x1ed2, 0x0129, 0x00ed, 0x1ecb, 0x00f2,
69   /* 0xe0 */
70   0x1ed4, 0x1ecf, 0x00f5, 0x00f3, 0x1ecd, 0x1ed3, 0x1ed5, 0x1ed7,
71   0x1ed1, 0x1ed9, 0x1edd, 0x1edf, 0x1ee1, 0x1edb, 0x1ee3, 0x00f9,
72   /* 0xf0 */
73   0x1ed6, 0x1ee7, 0x0169, 0x00fa, 0x1ee5, 0x1eeb, 0x1eed, 0x1eef,
74   0x1ee9, 0x1ef1, 0x1ef3, 0x1ef7, 0x1ef9, 0x00fd, 0x1ef5, 0x1ed0,
75 };
76 
77 /* In the TCVN to Unicode direction, the state contains a buffered
78    character, or 0 if none. */
79 
80 static int
81 tcvn_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
82 {
83   unsigned char c = *s;
84   unsigned short wc;
85   unsigned short last_wc;
86   if (c < 0x18)
87     wc = tcvn_2uni_1[c];
88   else if (c < 0x80)
89     wc = c;
90   else
91     wc = tcvn_2uni_2[c-0x80];
92   last_wc = conv->istate;
93   if (last_wc) {
94     if (wc >= 0x0300 && wc < 0x0340) {
95       /* See whether last_wc and wc can be combined. */
96       unsigned int k;
97       unsigned int i1, i2;
98       switch (wc) {
99         case 0x0300: k = 0; break;
100         case 0x0301: k = 1; break;
101         case 0x0303: k = 2; break;
102         case 0x0309: k = 3; break;
103         case 0x0323: k = 4; break;
104         default: abort();
105       }
106       i1 = viet_comp_table[k].idx;
107       i2 = i1 + viet_comp_table[k].len-1;
108       if (last_wc >= viet_comp_table_data[i1].base
109           && last_wc <= viet_comp_table_data[i2].base) {
110         unsigned int i;
111         for (;;) {
112           i = (i1+i2)>>1;
113           if (last_wc == viet_comp_table_data[i].base)
114             break;
115           if (last_wc < viet_comp_table_data[i].base) {
116             if (i1 == i)
117               goto not_combining;
118             i2 = i;
119           } else {
120             if (i1 != i)
121               i1 = i;
122             else {
123               i = i2;
124               if (last_wc == viet_comp_table_data[i].base)
125                 break;
126               goto not_combining;
127             }
128           }
129         }
130         last_wc = viet_comp_table_data[i].composed;
131         /* Output the combined character. */
132         conv->istate = 0;
133         *pwc = (ucs4_t) last_wc;
134         return 1;
135       }
136     }
137   not_combining:
138     /* Output the buffered character. */
139     conv->istate = 0;
140     *pwc = (ucs4_t) last_wc;
141     return 0; /* Don't advance the input pointer. */
142   }
143   if (wc >= 0x0041 && wc <= 0x01b0
144       && ((tcvn_comp_bases[(wc - 0x0040) >> 5] >> (wc & 0x1f)) & 1)) {
145     /* wc is a possible match in viet_comp_table_data. Buffer it. */
146     conv->istate = wc;
147     return RET_TOOFEW(1);
148   } else {
149     /* Output wc immediately. */
150     *pwc = (ucs4_t) wc;
151     return 1;
152   }
153 }
154 
/* The flush step of this converter is an alias for normal_flushwc, which is
   not defined in this file (presumably supplied by "flushwc.h" - confirm). */
155 #define tcvn_flushwc normal_flushwc
156 
/* Unicode to TCVN for U+00A0..U+01B7, indexed by (wc - 0x00a0): the first
   96 entries cover U+00A0..U+00FF, the remaining 184 cover U+0100..U+01B7.
   An entry of 0x00 is treated by tcvn_wctomb as "no direct mapping". */
157 static const unsigned char tcvn_page00[96+184] = {
158   0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
159   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
160   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
161   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
162   0x80, 0x83, 0xa2, 0x82, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
163   0x87, 0x8a, 0xa3, 0x00, 0x8d, 0x90, 0x00, 0x00, /* 0xc8-0xcf */
164   0x00, 0x00, 0x92, 0x95, 0xa4, 0x94, 0x00, 0x00, /* 0xd0-0xd7 */
165   0x00, 0x9d, 0x01, 0x00, 0x00, 0x16, 0x00, 0x00, /* 0xd8-0xdf */
166   0xb5, 0xb8, 0xa9, 0xb7, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
167   0xcc, 0xd0, 0xaa, 0x00, 0xd7, 0xdd, 0x00, 0x00, /* 0xe8-0xef */
168   0x00, 0x00, 0xdf, 0xe3, 0xab, 0xe2, 0x00, 0x00, /* 0xf0-0xf7 */
169   0x00, 0xef, 0xf3, 0x00, 0x00, 0xfd, 0x00, 0x00, /* 0xf8-0xff */
170   /* 0x0100 */
171   0x00, 0x00, 0xa1, 0xa8, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
172   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
173   0xa7, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
174   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
175   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
176   0x8f, 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
177   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
178   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
179   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
180   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
181   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
182   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
183   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
184   0x9f, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
185   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
186   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
187   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
188   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
189   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
190   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
191   0xa5, 0xac, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
192   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, /* 0xa8-0xaf */
193   0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
194 };
/* Unicode to TCVN for the combining marks U+0300..U+0327, indexed by
   (wc - 0x0300); 0x00 means "no mapping".  tcvn_wctomb also reads entries 0
   and 1 for the deprecated tone marks U+0340 and U+0341. */
195 static const unsigned char tcvn_page03[40] = {
196   0xb0, 0xb3, 0x00, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
197   0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
198   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
199   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
200   0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
201 };
/* Unicode to TCVN for U+1EA0..U+1EFF, indexed by (wc - 0x1ea0); 0x00 means
   "no mapping" (the last six entries, U+1EFA..U+1EFF). */
202 static const unsigned char tcvn_page1e[96] = {
203   0x84, 0xb9, 0x81, 0xb6, 0xc4, 0xca, 0xc1, 0xc7, /* 0xa0-0xa7 */
204   0xc2, 0xc8, 0xc3, 0xc9, 0x86, 0xcb, 0xc0, 0xbe, /* 0xa8-0xaf */
205   0xaf, 0xbb, 0xba, 0xbc, 0xbf, 0xbd, 0x85, 0xc6, /* 0xb0-0xb7 */
206   0x8b, 0xd1, 0x88, 0xce, 0x89, 0xcf, 0xda, 0xd5, /* 0xb8-0xbf */
207   0xc5, 0xd2, 0xcd, 0xd3, 0xd9, 0xd4, 0x8c, 0xd6, /* 0xc0-0xc7 */
208   0x8e, 0xd8, 0x91, 0xde, 0x96, 0xe4, 0x93, 0xe1, /* 0xc8-0xcf */
209   0xff, 0xe8, 0xdb, 0xe5, 0xe0, 0xe6, 0xf0, 0xe7, /* 0xd0-0xd7 */
210   0x97, 0xe9, 0x9b, 0xed, 0x98, 0xea, 0x99, 0xeb, /* 0xd8-0xdf */
211   0x9a, 0xec, 0x9c, 0xee, 0x02, 0xf4, 0x9e, 0xf1, /* 0xe0-0xe7 */
212   0x11, 0xf8, 0x04, 0xf5, 0x05, 0xf6, 0x06, 0xf7, /* 0xe8-0xef */
213   0x12, 0xf9, 0x13, 0xfa, 0x17, 0xfe, 0x14, 0xfb, /* 0xf0-0xf7 */
214   0x15, 0xfc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
215 };
216 
217 static int
218 tcvn_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
219 {
220   unsigned char c = 0;
221   if (wc < 0x0080 && (wc >= 0x0020 || (0x00fe0076 & (1 << wc)) == 0)) {
222     *r = wc;
223     return 1;
224   }
225   else if (wc >= 0x00a0 && wc < 0x01b8)
226     c = tcvn_page00[wc-0x00a0];
227   else if (wc >= 0x0300 && wc < 0x0328)
228     c = tcvn_page03[wc-0x0300];
229   else if (wc >= 0x0340 && wc < 0x0342) /* deprecated Vietnamese tone marks */
230     c = tcvn_page03[wc-0x0340];
231   else if (wc >= 0x1ea0 && wc < 0x1f00)
232     c = tcvn_page1e[wc-0x1ea0];
233   if (c != 0) {
234     *r = c;
235     return 1;
236   }
237   /* Try compatibility or canonical decomposition. */
238   {
239     /* Binary search through viet_decomp_table. */
240     unsigned int i1 = 0;
241     unsigned int i2 = sizeof(viet_decomp_table)/sizeof(viet_decomp_table[0])-1;
242     if (wc >= viet_decomp_table[i1].composed
243         && wc <= viet_decomp_table[i2].composed) {
244       unsigned int i;
245       for (;;) {
246         /* Here i2 - i1 > 0. */
247         i = (i1+i2)>>1;
248         if (wc == viet_decomp_table[i].composed)
249           break;
250         if (wc < viet_decomp_table[i].composed) {
251           if (i1 == i)
252             return RET_ILUNI;
253           /* Here i1 < i < i2. */
254           i2 = i;
255         } else {
256           /* Here i1 <= i < i2. */
257           if (i1 != i)
258             i1 = i;
259           else {
260             /* Here i2 - i1 = 1. */
261             i = i2;
262             if (wc == viet_decomp_table[i].composed)
263               break;
264             else
265               return RET_ILUNI;
266           }
267         }
268       }
269       /* Found a compatibility or canonical decomposition. */
270       wc = viet_decomp_table[i].base;
271       /* wc is one of 0x0020, 0x0041..0x005a, 0x0061..0x007a, 0x00a5, 0x00a8,
272          0x00c2, 0x00c5..0x00c7, 0x00ca, 0x00cf, 0x00d3, 0x00d4, 0x00d6,
273          0x00d8, 0x00da, 0x00dc, 0x00e2, 0x00e5..0x00e7, 0x00ea, 0x00ef,
274          0x00f3, 0x00f4, 0x00f6, 0x00f8, 0x00fc, 0x0102, 0x0103, 0x01a0,
275          0x01a1, 0x01af, 0x01b0. */
276       if (wc < 0x0080)
277         c = wc;
278       else {
279         c = tcvn_page00[wc-0x00a0];
280         if (c == 0)
281           return RET_ILUNI;
282       }
283       if (n < 2)
284         return RET_TOOSMALL;
285       r[0] = c;
286       r[1] = tcvn_comb_table[viet_decomp_table[i].comb1];
287       return 2;
288     }
289   }
290   return RET_ILUNI;
291 }
292