1 /* 2 * Copyright (C) 1999-2001, 2005 Free Software Foundation, Inc. 3 * This file is part of the GNU LIBICONV Library. 4 * 5 * The GNU LIBICONV Library is free software; you can redistribute it 6 * and/or modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either version 2 8 * of the License, or (at your option) any later version. 9 * 10 * The GNU LIBICONV Library is distributed in the hope that it will be 11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public 16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB. 17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street, 18 * Fifth Floor, Boston, MA 02110-1301, USA. 19 */ 20 21 /* 22 * ISO-IR-165 23 */ 24 25 /* 26 * ISO-IR-165 is an extension of GB 2312, consisting of: 27 * 1. GB 6345.1-86 corrections: 28 * Two corrections to GB 2312, at 0x2367 and 0x6F71. 29 * 2. GB 6345.1-86 additions: 30 * - 6 new full-width pinyin characters in row 0x28. 31 * - ISO646-CN in row 0x2A. 32 * - 32 half-width pinyin characters in row 0x2B. 33 * 3. GB 8565.2-88 additions: 34 * - 50 characters in row 0x2D. 35 * - 92 characters in row 0x2E. 36 * - 93 characters in row 0x2F. 37 * - 470 characters in rows 0x7A-0x7E. 38 * 4. ISO-IR-165 additions: 39 * - 22 characters in row 0x26. 40 * - 94 characters in row 0x2C. 41 * - 44 new characters in row 0x2D. 42 * - 1 new character in row 0x2F. 43 * 44 * The conversion table was created from the following sources: 45 * Ad 1. The 0x2367 correction is already integrated in the unicode.org 46 * GB2312.TXT table. The 0x6F71 mapping is the same in the unicode.org 47 * GB2312.TXT and UNIHAN.TXT table and in Koichi Yasuoka's Uni2GB table, 48 * so we assume it's correct. 49 * The unicode.org UNIHAN.TXT table about GB 8565 is not usable: it has 50 * extraneous code points at rows 0x28, 0x2C, 0x2D. Note also that it does 51 * not list the 69 non-hanzi in row 0x2F. Moreover, it has the characters 52 * 0x2F7A-0x2F7D shifted down by one to 0x2F79-0x2F7C. 53 * Therefore we take the GB8565 and ISO-IR-165 data from Koichi Yasuoka's 54 * Uni2GB table. 55 * Ad 1. Yasuoka maps 0x2367 to U+0261 (small script g) and 0x2840 to U+FF47 56 * (full-width small normal g). While coherent with ISO-IR's 165.pdf, 57 * this disagrees with Ken Lunde's book: He says that ISO-IR-165 58 * includes the GB6345 correction, i.e. maps 0x2367 to U+FF47 or U+0067 59 * and _not_ to U+0261 (small script g). 60 * To overcome the confusion, we just map both 0x2367 and 0x2840 to 61 * U+FF47. 62 * Ad 2. Row 0x28: Add a mapping from 0x283F to U+01F9. 63 * Row 0x2A: Mapping is well-known, also present in Koichi Yasuoka's 64 * table. 65 * Row 0x2B: Typed in by hand from appendix E in Ken Lunde's book. 66 * When converting from Unicode to ISO-IR-165, prefer the half-width 67 * range 0x2B{21..40} to the full-width range 0x28{21..40}. 68 * Ad 3. Rows 0x2D, 0x2E: Both Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT 69 * data for GB 8565 agree here. 70 * Row 0x2F: Taken from Koichi Yasuoka's Uni2GB table. 71 * Rows 0x7A-0x7E: Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT 72 * data for GB 8565 agree here mostly. Differences: 73 * 0x7C38 -> U+6F26 or U+527A ? We choose U+6F26. 74 * 0x7C5A -> U+7A40 or U+6996 ? We choose U+6996. 75 * Ad 4. Row 0x26: Mapping unknown. 76 * Rows 0x2C, 0x2D: Both Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT 77 * data for GB 8565 (!) agree here. 78 * Row 0x2F: Taken from Koichi Yasuoka's Uni2GB table. 79 */ 80 81 #include "isoir165ext.h" 82 83 static int 84 isoir165_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) 85 { 86 int ret; 87 88 /* Map full-width pinyin (row 0x28) like half-width pinyin (row 0x2B). */ 89 if (s[0] == 0x28) { 90 if (n >= 2) { 91 unsigned char c2 = s[1]; 92 if (c2 >= 0x21 && c2 <= 0x40) { 93 unsigned char buf[2]; 94 buf[0] = 0x2b; 95 buf[1] = c2; 96 ret = isoir165ext_mbtowc(conv,pwc,buf,2); 97 if (ret != RET_ILSEQ) 98 return ret; 99 } 100 } 101 } 102 /* Try the GB2312 -> Unicode table. */ 103 ret = gb2312_mbtowc(conv,pwc,s,n); 104 if (ret != RET_ILSEQ) 105 return ret; 106 /* Row 0x2A is GB_1988-80. */ 107 if (s[0] == 0x2a) { 108 if (n >= 2) { 109 unsigned char c2 = s[1]; 110 if (c2 >= 0x21 && c2 < 0x7f) { 111 int ret = iso646_cn_mbtowc(conv,pwc,s+1,1); 112 if (ret != 1) abort(); 113 return 2; 114 } 115 return RET_ILSEQ; 116 } 117 return RET_TOOFEW(0); 118 } 119 /* Try the ISO-IR-165 extensions -> Unicode table. */ 120 ret = isoir165ext_mbtowc(conv,pwc,s,n); 121 return ret; 122 } 123 124 static int 125 isoir165_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) 126 { 127 unsigned char buf[2]; 128 int ret; 129 130 /* Try the Unicode -> GB2312 table. */ 131 ret = gb2312_wctomb(conv,buf,wc,2); 132 if (ret != RET_ILUNI) { 133 if (ret != 2) abort(); 134 if (!(buf[0] == 0x28 && buf[1] >= 0x21 && buf[1] <= 0x40)) { 135 if (n >= 2) { 136 r[0] = buf[0]; 137 r[1] = buf[1]; 138 return 2; 139 } 140 return RET_TOOSMALL; 141 } 142 } 143 /* Row 0x2A is GB_1988-80. */ 144 ret = iso646_cn_wctomb(conv,buf,wc,1); 145 if (ret != RET_ILUNI) { 146 if (ret != 1) abort(); 147 if (buf[0] >= 0x21 && buf[0] < 0x7f) { 148 if (n >= 2) { 149 r[0] = 0x2a; 150 r[1] = buf[0]; 151 return 2; 152 } 153 return RET_TOOSMALL; 154 } 155 } 156 /* Try the Unicode -> ISO-IR-165 extensions table. */ 157 ret = isoir165ext_wctomb(conv,r,wc,n); 158 return ret; 159 } 160