1 /*
2 * Copyright (C) 1999-2001, 2005 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
4 *
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18 * Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20
21 /*
22 * CP950
23 */
24
25 /*
26 * Microsoft CP950 is a slightly extended and slightly modified version of
27 * BIG5. The differences between the EASTASIA/OTHER/BIG5.TXT and
28 * VENDORS/MICSFT/WINDOWS/CP950.TXT tables found on ftp.unicode.org are
29 * as follows:
30 *
31 * 1. Some characters in the BIG5 range are defined differently:
32 *
 *     code   BIG5.TXT                       CP950.TXT
 *    0xA145  0x2022 # BULLET                0x2027 # HYPHENATION POINT
 *    0xA14E  0xFF64 # HALFWIDTH IDEOGRAPHIC COMMA
 *                                           0xFE51 # SMALL IDEOGRAPHIC COMMA
 *    0xA15A   ---                           0x2574 # BOX DRAWINGS LIGHT LEFT
 *    0xA1C2  0x203E # OVERLINE              0x00AF # MACRON
 *    0xA1C3   ---                           0xFFE3 # FULLWIDTH MACRON
 *    0xA1C5   ---                           0x02CD # MODIFIER LETTER LOW MACRON
 *    0xA1E3  0x223C # TILDE OPERATOR        0xFF5E # FULLWIDTH TILDE
 *    0xA1F2  0x2641 # EARTH                 0x2295 # CIRCLED PLUS
 *    0xA1F3  0x2609 # SUN                   0x2299 # CIRCLED DOT OPERATOR
 *    0xA1FE   ---                           0xFF0F # FULLWIDTH SOLIDUS
 *    0xA240   ---                           0xFF3C # FULLWIDTH REVERSE SOLIDUS
 *    0xA241  0xFF0F # FULLWIDTH SOLIDUS     0x2215 # DIVISION SLASH
 *    0xA242  0xFF3C # FULLWIDTH REVERSE SOLIDUS
 *                                           0xFE68 # SMALL REVERSE SOLIDUS
 *    0xA244  0x00A5 # YEN SIGN              0xFFE5 # FULLWIDTH YEN SIGN
 *    0xA246  0x00A2 # CENT SIGN             0xFFE0 # FULLWIDTH CENT SIGN
 *    0xA247  0x00A3 # POUND SIGN            0xFFE1 # FULLWIDTH POUND SIGN
 *    0xA2CC   ---                           0x5341
 *    0xA2CE   ---                           0x5345
54 *
55 * 2. A small new row. See cp950ext.h.
56 *
57 * 3. CP950.TXT is lacking the range 0xC6A1..0xC7FC (Hiragana, Katakana,
58 * Cyrillic, circled digits, parenthesized digits).
59 *
60 * We implement this omission, because said range is marked "uncertain"
61 * in the unicode.org BIG5 table.
62 *
63 * The table found on Microsoft's website furthermore adds:
64 *
65 * 4. A single character:
66 *
67 * code CP950.TXT
68 * 0xA3E1 0x20AC # EURO SIGN
69 *
70 * Many variants of BIG5 or CP950 (in JDK, Solaris, OSF/1, Windows-2000, ICU,
71 * as well as our BIG5-2003 converter) also add:
72 *
73 * 5. Private area mappings:
74 *
75 * code Unicode
76 * 0x{81..8D}{40..7E,A1..FE} U+EEB8..U+F6B0
77 * 0x{8E..A0}{40..7E,A1..FE} U+E311..U+EEB7
78 * 0x{FA..FE}{40..7E,A1..FE} U+E000..U+E310
79 *
80 * We add them too because, although there are backward compatibility problems
81 * when a character from a private area is moved to an official Unicode code
82 * point, they are useful for some people in practice.
83 */
84
/*
 * CP950 -> Unicode table for the two lead bytes 0xa1 and 0xa2.
 *
 * Each lead byte owns 157 consecutive cells: trail bytes 0x40..0x7e fill
 * cells 0..62 and trail bytes 0xa1..0xfe fill cells 63..156.  The entry for
 * lead byte c and trail byte c2 is therefore found at
 *   157 * (c - 0xa1) + (c2 - (c2 >= 0xa1 ? 0x62 : 0x40)).
 * The comment at the end of each line gives the trail bytes on that line.
 */
static const unsigned short cp950_2uni_pagea1[314] = {
  /* lead byte 0xa1 */
  0x3000, 0xff0c, 0x3001, 0x3002, 0xff0e, 0x2027, 0xff1b, 0xff1a, /* 40..47 */
  0xff1f, 0xff01, 0xfe30, 0x2026, 0x2025, 0xfe50, 0xfe51, 0xfe52, /* 48..4f */
  0x00b7, 0xfe54, 0xfe55, 0xfe56, 0xfe57, 0xff5c, 0x2013, 0xfe31, /* 50..57 */
  0x2014, 0xfe33, 0x2574, 0xfe34, 0xfe4f, 0xff08, 0xff09, 0xfe35, /* 58..5f */
  0xfe36, 0xff5b, 0xff5d, 0xfe37, 0xfe38, 0x3014, 0x3015, 0xfe39, /* 60..67 */
  0xfe3a, 0x3010, 0x3011, 0xfe3b, 0xfe3c, 0x300a, 0x300b, 0xfe3d, /* 68..6f */
  0xfe3e, 0x3008, 0x3009, 0xfe3f, 0xfe40, 0x300c, 0x300d, 0xfe41, /* 70..77 */
  0xfe42, 0x300e, 0x300f, 0xfe43, 0xfe44, 0xfe59, 0xfe5a, 0xfe5b, /* 78..7e, a1 */
  0xfe5c, 0xfe5d, 0xfe5e, 0x2018, 0x2019, 0x201c, 0x201d, 0x301d, /* a2..a9 */
  0x301e, 0x2035, 0x2032, 0xff03, 0xff06, 0xff0a, 0x203b, 0x00a7, /* aa..b1 */
  0x3003, 0x25cb, 0x25cf, 0x25b3, 0x25b2, 0x25ce, 0x2606, 0x2605, /* b2..b9 */
  0x25c7, 0x25c6, 0x25a1, 0x25a0, 0x25bd, 0x25bc, 0x32a3, 0x2105, /* ba..c1 */
  0x00af, 0xffe3, 0xff3f, 0x02cd, 0xfe49, 0xfe4a, 0xfe4d, 0xfe4e, /* c2..c9 */
  0xfe4b, 0xfe4c, 0xfe5f, 0xfe60, 0xfe61, 0xff0b, 0xff0d, 0x00d7, /* ca..d1 */
  0x00f7, 0x00b1, 0x221a, 0xff1c, 0xff1e, 0xff1d, 0x2266, 0x2267, /* d2..d9 */
  0x2260, 0x221e, 0x2252, 0x2261, 0xfe62, 0xfe63, 0xfe64, 0xfe65, /* da..e1 */
  0xfe66, 0xff5e, 0x2229, 0x222a, 0x22a5, 0x2220, 0x221f, 0x22bf, /* e2..e9 */
  0x33d2, 0x33d1, 0x222b, 0x222e, 0x2235, 0x2234, 0x2640, 0x2642, /* ea..f1 */
  0x2295, 0x2299, 0x2191, 0x2193, 0x2190, 0x2192, 0x2196, 0x2197, /* f2..f9 */
  0x2199, 0x2198, 0x2225, 0x2223, 0xff0f,                         /* fa..fe */
  /* lead byte 0xa2 */
  0xff3c, 0x2215, 0xfe68, 0xff04, 0xffe5, 0x3012, 0xffe0, 0xffe1, /* 40..47 */
  0xff05, 0xff20, 0x2103, 0x2109, 0xfe69, 0xfe6a, 0xfe6b, 0x33d5, /* 48..4f */
  0x339c, 0x339d, 0x339e, 0x33ce, 0x33a1, 0x338e, 0x338f, 0x33c4, /* 50..57 */
  0x00b0, 0x5159, 0x515b, 0x515e, 0x515d, 0x5161, 0x5163, 0x55e7, /* 58..5f */
  0x74e9, 0x7cce, 0x2581, 0x2582, 0x2583, 0x2584, 0x2585, 0x2586, /* 60..67 */
  0x2587, 0x2588, 0x258f, 0x258e, 0x258d, 0x258c, 0x258b, 0x258a, /* 68..6f */
  0x2589, 0x253c, 0x2534, 0x252c, 0x2524, 0x251c, 0x2594, 0x2500, /* 70..77 */
  0x2502, 0x2595, 0x250c, 0x2510, 0x2514, 0x2518, 0x256d, 0x256e, /* 78..7e, a1 */
  0x2570, 0x256f, 0x2550, 0x255e, 0x256a, 0x2561, 0x25e2, 0x25e3, /* a2..a9 */
  0x25e5, 0x25e4, 0x2571, 0x2572, 0x2573, 0xff10, 0xff11, 0xff12, /* aa..b1 */
  0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0x2160, /* b2..b9 */
  0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, /* ba..c1 */
  0x2169, 0x3021, 0x3022, 0x3023, 0x3024, 0x3025, 0x3026, 0x3027, /* c2..c9 */
  0x3028, 0x3029, 0x5341, 0x5344, 0x5345, 0xff21, 0xff22, 0xff23, /* ca..d1 */
  0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, 0xff2a, 0xff2b, /* d2..d9 */
  0xff2c, 0xff2d, 0xff2e, 0xff2f, 0xff30, 0xff31, 0xff32, 0xff33, /* da..e1 */
  0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, 0xff3a, 0xff41, /* e2..e9 */
  0xff42, 0xff43, 0xff44, 0xff45, 0xff46, 0xff47, 0xff48, 0xff49, /* ea..f1 */
  0xff4a, 0xff4b, 0xff4c, 0xff4d, 0xff4e, 0xff4f, 0xff50, 0xff51, /* f2..f9 */
  0xff52, 0xff53, 0xff54, 0xff55, 0xff56,                         /* fa..fe */
};
129
130 #include "cp950ext.h"
131
132 static int
cp950_mbtowc(conv_t conv,ucs4_t * pwc,const unsigned char * s,int n)133 cp950_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
134 {
135 unsigned char c = *s;
136 /* Code set 0 (ASCII) */
137 if (c < 0x80)
138 return ascii_mbtowc(conv,pwc,s,n);
139 /* Code set 1 (BIG5 extended) */
140 if (c >= 0x81 && c < 0xff) {
141 if (n < 2)
142 return RET_TOOFEW(0);
143 {
144 unsigned char c2 = s[1];
145 if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff)) {
146 if (c >= 0xa1) {
147 if (c < 0xa3) {
148 unsigned int i = 157 * (c - 0xa1) + (c2 - (c2 >= 0xa1 ? 0x62 : 0x40));
149 unsigned short wc = cp950_2uni_pagea1[i];
150 if (wc != 0xfffd) {
151 *pwc = (ucs4_t) wc;
152 return 2;
153 }
154 }
155 if (!((c == 0xc6 && c2 >= 0xa1) || c == 0xc7)) {
156 int ret = big5_mbtowc(conv,pwc,s,2);
157 if (ret != RET_ILSEQ)
158 return ret;
159 }
160 if (c == 0xa3 && c2 == 0xe1) {
161 *pwc = 0x20ac;
162 return 2;
163 }
164 if (c >= 0xfa) {
165 /* User-defined characters */
166 *pwc = 0xe000 + 157 * (c - 0xfa) + (c2 - (c2 >= 0xa1 ? 0x62 : 0x40));
167 return 2;
168 }
169 } else {
170 /* 0x81 <= c < 0xa1. */
171 /* User-defined characters */
172 *pwc = (c >= 0x8e ? 0xdb18 : 0xeeb8) + 157 * (c - 0x81)
173 + (c2 - (c2 >= 0xa1 ? 0x62 : 0x40));
174 return 2;
175 }
176 }
177 }
178 if (c == 0xf9) {
179 int ret = cp950ext_mbtowc(conv,pwc,s,2);
180 if (ret != RET_ILSEQ)
181 return ret;
182 }
183 }
184 return RET_ILSEQ;
185 }
186
187 static int
cp950_wctomb(conv_t conv,unsigned char * r,ucs4_t wc,int n)188 cp950_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
189 {
190 unsigned char buf[2];
191 int ret;
192
193 /* Code set 0 (ASCII) */
194 ret = ascii_wctomb(conv,r,wc,n);
195 if (ret != RET_ILUNI)
196 return ret;
197
198 /* Code set 1 (BIG5 extended) */
199 switch (wc >> 8) {
200 case 0x00:
201 if (wc == 0x00af) { buf[0] = 0xa1; buf[1] = 0xc2; ret = 2; break; }
202 if (wc == 0x00a2 || wc == 0x00a3 || wc == 0x00a4)
203 return RET_ILUNI;
204 break;
205 case 0x02:
206 if (wc == 0x02cd) { buf[0] = 0xa1; buf[1] = 0xc5; ret = 2; break; }
207 break;
208 case 0x20:
209 if (wc == 0x2027) { buf[0] = 0xa1; buf[1] = 0x45; ret = 2; break; }
210 if (wc == 0x20ac) { buf[0] = 0xa3; buf[1] = 0xe1; ret = 2; break; }
211 if (wc == 0x2022 || wc == 0x203e)
212 return RET_ILUNI;
213 break;
214 case 0x22:
215 if (wc == 0x2215) { buf[0] = 0xa2; buf[1] = 0x41; ret = 2; break; }
216 if (wc == 0x2295) { buf[0] = 0xa1; buf[1] = 0xf2; ret = 2; break; }
217 if (wc == 0x2299) { buf[0] = 0xa1; buf[1] = 0xf3; ret = 2; break; }
218 if (wc == 0x223c)
219 return RET_ILUNI;
220 break;
221 case 0x25:
222 if (wc == 0x2574) { buf[0] = 0xa1; buf[1] = 0x5a; ret = 2; break; }
223 break;
224 case 0x26:
225 if (wc == 0x2609 || wc == 0x2641)
226 return RET_ILUNI;
227 break;
228 case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5:
229 case 0xe6: case 0xe7: case 0xe8: case 0xe9: case 0xea: case 0xeb:
230 case 0xec: case 0xed: case 0xee: case 0xef: case 0xf0: case 0xf1:
231 case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6:
232 {
233 /* User-defined characters */
234 unsigned int i = wc - 0xe000;
235 if (i < 5809) {
236 unsigned int c1 = i / 157;
237 unsigned int c2 = i % 157;
238 buf[0] = c1 + (c1 < 5 ? 0xfa : c1 < 24 ? 0x89 : 0x69);
239 buf[1] = c2 + (c2 < 0x3f ? 0x40 : 0x62);
240 ret = 2;
241 break;
242 }
243 }
244 break;
245 case 0xfe:
246 if (wc == 0xfe51) { buf[0] = 0xa1; buf[1] = 0x4e; ret = 2; break; }
247 if (wc == 0xfe68) { buf[0] = 0xa2; buf[1] = 0x42; ret = 2; break; }
248 break;
249 case 0xff:
250 if (wc == 0xff0f) { buf[0] = 0xa1; buf[1] = 0xfe; ret = 2; break; }
251 if (wc == 0xff3c) { buf[0] = 0xa2; buf[1] = 0x40; ret = 2; break; }
252 if (wc == 0xff5e) { buf[0] = 0xa1; buf[1] = 0xe3; ret = 2; break; }
253 if (wc == 0xffe0) { buf[0] = 0xa2; buf[1] = 0x46; ret = 2; break; }
254 if (wc == 0xffe1) { buf[0] = 0xa2; buf[1] = 0x47; ret = 2; break; }
255 if (wc == 0xffe3) { buf[0] = 0xa1; buf[1] = 0xc3; ret = 2; break; }
256 if (wc == 0xffe5) { buf[0] = 0xa2; buf[1] = 0x44; ret = 2; break; }
257 if (wc == 0xff64)
258 return RET_ILUNI;
259 break;
260 }
261 if (ret == RET_ILUNI)
262 ret = big5_wctomb(conv,buf,wc,2);
263 if (ret != RET_ILUNI) {
264 if (ret != 2) abort();
265 if (!((buf[0] == 0xc6 && buf[1] >= 0xa1) || buf[0] == 0xc7)) {
266 if (n < 2)
267 return RET_TOOSMALL;
268 r[0] = buf[0];
269 r[1] = buf[1];
270 return 2;
271 }
272 }
273 ret = cp950ext_wctomb(conv,buf,wc,2);
274 if (ret != RET_ILUNI) {
275 if (ret != 2) abort();
276 if (n < 2)
277 return RET_TOOSMALL;
278 r[0] = buf[0];
279 r[1] = buf[1];
280 return 2;
281 }
282
283 return RET_ILUNI;
284 }
285