1 /* 2 * Copyright (C) 1999-2001, 2005 Free Software Foundation, Inc. 3 * This file is part of the GNU LIBICONV Library. 4 * 5 * The GNU LIBICONV Library is free software; you can redistribute it 6 * and/or modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either version 2 8 * of the License, or (at your option) any later version. 9 * 10 * The GNU LIBICONV Library is distributed in the hope that it will be 11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public 16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB. 17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street, 18 * Fifth Floor, Boston, MA 02110-1301, USA. 19 */ 20 21 /* 22 * GBK 23 */ 24 25 /* 26 * GBK, as described in Ken Lunde's book, is an extension of GB 2312-1980 27 * (shifted by adding 0x8080 to the range 0xA1A1..0xFEFE, as used in EUC-CN). 28 * It adds the following ranges: 29 * 30 * (part of GBK/1) 0xA2A1-0xA2AA Small Roman numerals 31 * GBK/3 0x{81-A0}{40-7E,80-FE} 6080 new characters, all in Unicode 32 * GBK/4 0x{AA-FE}{40-7E,80-A0} 8160 new characters, 8080 in Unicode 33 * GBK/5 0x{A8-A9}{40-7E,80-A0} 166 new characters, 153 in Unicode 34 * 35 * Furthermore, all four tables I have looked at 36 * - the CP936 table by Microsoft, found on ftp.unicode.org in 1999, 37 * - the GBK table by Sun, investigated on a Solaris 2.7 machine, 38 * - the GBK tables by CWEX, found in the Big5+ package, 39 * - the GB18030 standard (second printing), 40 * agree in the following extensions. (Ken Lunde must have overlooked these 41 * differences between GB2312 and GBK. Also, the CWEX tables have additional 42 * differences.) 43 * 44 * 1. Some characters in the GB2312 range are defined differently: 45 * 46 * code GB2312 GBK 47 * 0xA1A4 0x30FB # KATAKANA MIDDLE DOT 0x00B7 # MIDDLE DOT 48 * 0xA1AA 0x2015 # HORIZONTAL BAR 0x2014 # EM DASH 49 * 50 * 2. 19 characters added in the range 0xA6E0-0xA6F5. 51 * 52 * 3. 4 characters added in the range 0xA8BB-0xA8C0. 53 * 54 * CP936 as of 1999 was identical to GBK. However, since 1999, Microsoft has 55 * added new mappings to CP936... 56 */ 57 58 #include "gbkext1.h" 59 #include "gbkext2.h" 60 #include "gbkext_inv.h" 61 #include "cp936ext.h" 62 63 static int 64 gbk_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) 65 { 66 unsigned char c = *s; 67 68 if (c >= 0x81 && c < 0xff) { 69 if (n < 2) 70 return RET_TOOFEW(0); 71 if (c >= 0xa1 && c <= 0xf7) { 72 unsigned char c2 = s[1]; 73 if (c == 0xa1) { 74 if (c2 == 0xa4) { 75 *pwc = 0x00b7; 76 return 2; 77 } 78 if (c2 == 0xaa) { 79 *pwc = 0x2014; 80 return 2; 81 } 82 } 83 if (c2 >= 0xa1 && c2 < 0xff) { 84 unsigned char buf[2]; 85 int ret; 86 buf[0] = c-0x80; buf[1] = c2-0x80; 87 ret = gb2312_mbtowc(conv,pwc,buf,2); 88 if (ret != RET_ILSEQ) 89 return ret; 90 buf[0] = c; buf[1] = c2; 91 ret = cp936ext_mbtowc(conv,pwc,buf,2); 92 if (ret != RET_ILSEQ) 93 return ret; 94 } 95 } 96 if (c >= 0x81 && c <= 0xa0) 97 return gbkext1_mbtowc(conv,pwc,s,2); 98 if (c >= 0xa8 && c <= 0xfe) 99 return gbkext2_mbtowc(conv,pwc,s,2); 100 if (c == 0xa2) { 101 unsigned char c2 = s[1]; 102 if (c2 >= 0xa1 && c2 <= 0xaa) { 103 *pwc = 0x2170+(c2-0xa1); 104 return 2; 105 } 106 } 107 } 108 return RET_ILSEQ; 109 } 110 111 static int 112 gbk_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) 113 { 114 unsigned char buf[2]; 115 int ret; 116 117 if (wc != 0x30fb && wc != 0x2015) { 118 ret = gb2312_wctomb(conv,buf,wc,2); 119 if (ret != RET_ILUNI) { 120 if (ret != 2) abort(); 121 if (n < 2) 122 return RET_TOOSMALL; 123 r[0] = buf[0]+0x80; 124 r[1] = buf[1]+0x80; 125 return 2; 126 } 127 } 128 ret = gbkext_inv_wctomb(conv,buf,wc,2); 129 if (ret != RET_ILUNI) { 130 if (ret != 2) abort(); 131 if (n < 2) 132 return RET_TOOSMALL; 133 r[0] = buf[0]; 134 r[1] = buf[1]; 135 return 2; 136 } 137 if (wc >= 0x2170 && wc <= 0x2179) { 138 r[0] = 0xa2; 139 r[1] = 0xa1 + (wc-0x2170); 140 return 2; 141 } 142 ret = cp936ext_wctomb(conv,buf,wc,2); 143 if (ret != RET_ILUNI) { 144 if (ret != 2) abort(); 145 if (n < 2) 146 return RET_TOOSMALL; 147 r[0] = buf[0]; 148 r[1] = buf[1]; 149 return 2; 150 } 151 if (wc == 0x00b7) { 152 if (n < 2) 153 return RET_TOOSMALL; 154 r[0] = 0xa1; 155 r[1] = 0xa4; 156 return 2; 157 } 158 if (wc == 0x2014) { 159 if (n < 2) 160 return RET_TOOSMALL; 161 r[0] = 0xa1; 162 r[1] = 0xaa; 163 return 2; 164 } 165 166 return RET_ILUNI; 167 } 168