1 /* 2 * Copyright (C) 1999-2001 Free Software Foundation, Inc. 3 * This file is part of the GNU LIBICONV Library. 4 * 5 * The GNU LIBICONV Library is free software; you can redistribute it 6 * and/or modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either version 2 8 * of the License, or (at your option) any later version. 9 * 10 * The GNU LIBICONV Library is distributed in the hope that it will be 11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public 16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB. 17 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street, 18 * Fifth Floor, Boston, MA 02110-1301, USA. 19 */ 20 21 /* 22 * JOHAB Hangul 23 * 24 * Ken Lunde writes in his "CJKV Information Processing" book, p. 114: 25 * "Hangul can be composed of two or three jamo (some jamo are considered 26 * compound). Johab uses 19 initial jamo (consonants), 21 medial jamo (vowels) 27 * and 27 final jamo (consonants; 28 when you include the "fill" character 28 * for Hangul containing only two jamo). Multiplying these numbers results in 29 * 11172." 30 * 31 * Structure of the Johab encoding (see p. 181-184): 32 * bit 15 = 1 33 * bit 14..10 = initial jamo, only 19+1 out of 32 possible values are used 34 * bit 9..5 = medial jamo, only 21+1 out of 32 possible values are used 35 * bit 4..0 = final jamo, only 27+1 out of 32 possible values are used 36 * 37 * Structure of the Unicode encoding: 38 * grep '^0x\([8-C]...\|D[0-7]..\)' unicode.org-mappings/EASTASIA/KSC/JOHAB.TXT 39 * You see that all characters there are marked "HANGUL LETTER" or "HANGUL 40 * SYLLABLE". If you eliminate the "HANGUL LETTER"s, the table is sorted 41 * in ascending order according to Johab encoding and according to the Unicode 42 * encoding. Now look a little more carefully, and you see that the following 43 * formula holds: 44 * unicode == 0xAC00 45 * + 21 * 28 * (jamo_initial_index[(johab >> 10) & 31] - 1) 46 * + 28 * (jamo_medial_index[(johab >> 5) & 31] - 1) 47 * + jamo_final_index[johab & 31] 48 * where the index tables are defined as below. 49 */ 50 51 /* Tables mapping 5-bit groups to jamo letters. */ 52 /* Note that Jamo XX = UHC 0xA4A0+XX = Unicode 0x3130+XX */ 53 #define NONE 0xfd 54 #define FILL 0xff 55 static const unsigned char jamo_initial[32] = { 56 NONE, FILL, 0x01, 0x02, 0x04, 0x07, 0x08, 0x09, 57 0x11, 0x12, 0x13, 0x15, 0x16, 0x17, 0x18, 0x19, 58 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, NONE, 59 NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, 60 }; 61 static const unsigned char jamo_medial[32] = { 62 NONE, NONE, FILL, 0x1f, 0x20, 0x21, 0x22, 0x23, 63 NONE, NONE, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 64 NONE, NONE, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 65 NONE, NONE, 0x30, 0x31, 0x32, 0x33, NONE, NONE, 66 }; 67 static const unsigned char jamo_final[32] = { 68 NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 69 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 70 0x10, 0x11, NONE, 0x12, 0x14, 0x15, 0x16, 0x17, 71 0x18, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, 72 }; 73 /* Same as jamo_final, except that it excludes characters already 74 contained in jamo_initial. 11 characters instead of 27. */ 75 static const unsigned char jamo_final_notinitial[32] = { 76 NONE, NONE, NONE, NONE, 0x03, NONE, 0x05, 0x06, 77 NONE, NONE, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 78 0x10, NONE, NONE, NONE, 0x14, NONE, NONE, NONE, 79 NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, 80 }; 81 82 /* Tables mapping 5-bit groups to packed indices. */ 83 #define none -1 84 #define fill 0 85 static const signed char jamo_initial_index[32] = { 86 none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 87 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 88 0x0f, 0x10, 0x11, 0x12, 0x13, none, none, none, 89 none, none, none, none, none, none, none, none, 90 }; 91 static const signed char jamo_medial_index[32] = { 92 none, none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 93 none, none, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 94 none, none, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 95 none, none, 0x12, 0x13, 0x14, 0x15, none, none, 96 }; 97 static const signed char jamo_final_index[32] = { 98 none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 99 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 100 0x0f, 0x10, none, 0x11, 0x12, 0x13, 0x14, 0x15, 101 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, none, none, 102 }; 103 104 static int 105 johab_hangul_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n) 106 { 107 unsigned char c1 = s[0]; 108 if ((c1 >= 0x84 && c1 <= 0xd3)) { 109 if (n >= 2) { 110 unsigned char c2 = s[1]; 111 if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) { 112 unsigned int johab = (c1 << 8) | c2; 113 unsigned int bitspart1 = (johab >> 10) & 31; 114 unsigned int bitspart2 = (johab >> 5) & 31; 115 unsigned int bitspart3 = johab & 31; 116 int index1 = jamo_initial_index[bitspart1]; 117 int index2 = jamo_medial_index[bitspart2]; 118 int index3 = jamo_final_index[bitspart3]; 119 /* Exclude "none" values. */ 120 if (index1 >= 0 && index2 >= 0 && index3 >= 0) { 121 /* Deal with "fill" values in initial or medial position. */ 122 if (index1 == fill) { 123 if (index2 == fill) { 124 unsigned char jamo3 = jamo_final_notinitial[bitspart3]; 125 if (jamo3 != NONE) { 126 *pwc = (ucs4_t) 0x3130 + jamo3; 127 return 2; 128 } 129 } else if (index3 == fill) { 130 unsigned char jamo2 = jamo_medial[bitspart2]; 131 if (jamo2 != NONE && jamo2 != FILL) { 132 *pwc = (ucs4_t) 0x3130 + jamo2; 133 return 2; 134 } 135 } 136 /* Syllables composed only of medial and final don't exist. */ 137 } else if (index2 == fill) { 138 if (index3 == fill) { 139 unsigned char jamo1 = jamo_initial[bitspart1]; 140 if (jamo1 != NONE && jamo1 != FILL) { 141 *pwc = (ucs4_t) 0x3130 + jamo1; 142 return 2; 143 } 144 } 145 /* Syllables composed only of initial and final don't exist. */ 146 } else { 147 /* index1 and index2 are not fill, but index3 may be fill. */ 148 /* Nothing more to exclude. All 11172 code points are valid. */ 149 *pwc = 0xac00 + ((index1 - 1) * 21 + (index2 - 1)) * 28 + index3; 150 return 2; 151 } 152 } 153 } 154 return RET_ILSEQ; 155 } 156 return RET_TOOFEW(0); 157 } 158 return RET_ILSEQ; 159 } 160 161 /* 51 Jamo: 19 initial, 21 medial, 11 final not initial. */ 162 static const unsigned short johab_hangul_page31[51] = { 163 0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441, /*0x30-0x37*/ 164 0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, /*0x38-0x3f*/ 165 0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441, /*0x40-0x47*/ 166 0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461, /*0x48-0x4f*/ 167 0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1, /*0x50-0x57*/ 168 0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1, /*0x58-0x5f*/ 169 0x8741, 0x8761, 0x8781, 0x87a1, /*0x60-0x67*/ 170 }; 171 172 /* Tables mapping packed indices to 5-bit groups. */ 173 /* index1+1 = jamo_initial_index[bitspart1] <==> 174 bitspart1 = jamo_initial_index_inverse[index1] */ 175 static const char jamo_initial_index_inverse[19] = { 176 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 177 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 178 0x10, 0x11, 0x12, 0x13, 0x14, 179 }; 180 /* index2+1 = jamo_medial_index[bitspart2] <==> 181 bitspart2 = jamo_medial_index_inverse[index2] */ 182 static const char jamo_medial_index_inverse[21] = { 183 0x03, 0x04, 0x05, 0x06, 0x07, 184 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 185 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 186 0x1a, 0x1b, 0x1c, 0x1d, 187 }; 188 /* index3 = jamo_final_index[bitspart3] <==> 189 bitspart3 = jamo_final_index_inverse[index3] */ 190 static const char jamo_final_index_inverse[28] = { 191 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 192 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 193 0x10, 0x11, 0x13, 0x14, 0x15, 0x16, 0x17, 194 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 195 }; 196 197 static int 198 johab_hangul_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) 199 { 200 if (n >= 2) { 201 if (wc >= 0x3131 && wc < 0x3164) { 202 unsigned short c = johab_hangul_page31[wc-0x3131]; 203 r[0] = (c >> 8); r[1] = (c & 0xff); 204 return 2; 205 } else if (wc >= 0xac00 && wc < 0xd7a4) { 206 unsigned int index1; 207 unsigned int index2; 208 unsigned int index3; 209 unsigned short c; 210 unsigned int tmp = wc - 0xac00; 211 index3 = tmp % 28; tmp = tmp / 28; 212 index2 = tmp % 21; tmp = tmp / 21; 213 index1 = tmp; 214 c = (((((1 << 5) 215 | jamo_initial_index_inverse[index1]) << 5) 216 | jamo_medial_index_inverse[index2]) << 5) 217 | jamo_final_index_inverse[index3]; 218 r[0] = (c >> 8); r[1] = (c & 0xff); 219 return 2; 220 } 221 return RET_ILUNI; 222 } 223 return RET_TOOSMALL; 224 } 225 226 /* 227 * Decomposition of JOHAB Hangul in one to three Johab Jamo elements. 228 */ 229 230 /* Decompose wc into r[0..2], and return the number of resulting Jamo elements. 231 Return RET_ILUNI if decomposition is not possible. */ 232 233 static int johab_hangul_decompose (conv_t conv, ucs4_t* r, ucs4_t wc) 234 { 235 unsigned char buf[2]; 236 int ret = johab_hangul_wctomb(conv,buf,wc,2); 237 if (ret != RET_ILUNI) { 238 unsigned int hangul = (buf[0] << 8) | buf[1]; 239 unsigned char jamo1 = jamo_initial[(hangul >> 10) & 31]; 240 unsigned char jamo2 = jamo_medial[(hangul >> 5) & 31]; 241 unsigned char jamo3 = jamo_final[hangul & 31]; 242 if ((hangul >> 15) != 1) abort(); 243 if (jamo1 != NONE && jamo2 != NONE && jamo3 != NONE) { 244 /* They are not all three == FILL because that would correspond to 245 johab = 0x8441, which doesn't exist. */ 246 ucs4_t* p = r; 247 if (jamo1 != FILL) 248 *p++ = 0x3130 + jamo1; 249 if (jamo2 != FILL) 250 *p++ = 0x3130 + jamo2; 251 if (jamo3 != FILL) 252 *p++ = 0x3130 + jamo3; 253 return p-r; 254 } 255 } 256 return RET_ILUNI; 257 } 258 259 #undef fill 260 #undef none 261 #undef FILL 262 #undef NONE 263