xref: /haiku/src/libs/iconv/johab_hangul.h (revision d157bf8522d5dc449602bec43f10ecdedc9943cd)
1 /*
2  * Copyright (C) 1999-2001 Free Software Foundation, Inc.
3  * This file is part of the GNU LIBICONV Library.
4  *
5  * The GNU LIBICONV Library is free software; you can redistribute it
6  * and/or modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either version 2
8  * of the License, or (at your option) any later version.
9  *
10  * The GNU LIBICONV Library is distributed in the hope that it will be
11  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public
16  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17  * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18  * Fifth Floor, Boston, MA 02110-1301, USA.
19  */
20 
21 /*
22  * JOHAB Hangul
23  *
24  * Ken Lunde writes in his "CJKV Information Processing" book, p. 114:
25  * "Hangul can be composed of two or three jamo (some jamo are considered
26  *  compound). Johab uses 19 initial jamo (consonants), 21 medial jamo (vowels)
27  *  and 27 final jamo (consonants; 28 when you include the "fill" character
28  *  for Hangul containing only two jamo). Multiplying these numbers results in
29  *  11172."
30  *
31  * Structure of the Johab encoding (see p. 181-184):
32  *   bit 15 = 1
33  *   bit 14..10 = initial jamo, only 19+1 out of 32 possible values are used
34  *   bit 9..5 = medial jamo, only 21+1 out of 32 possible values are used
35  *   bit 4..0 = final jamo, only 27+1 out of 32 possible values are used
36  *
37  * Structure of the Unicode encoding:
38  * grep '^0x\([8-C]...\|D[0-7]..\)' unicode.org-mappings/EASTASIA/KSC/JOHAB.TXT
39  * You see that all characters there are marked "HANGUL LETTER" or "HANGUL
40  * SYLLABLE". If you eliminate the "HANGUL LETTER"s, the table is sorted
41  * in ascending order according to Johab encoding and according to the Unicode
42  * encoding. Now look a little more carefully, and you see that the following
43  * formula holds:
44  *     unicode == 0xAC00
45  *                + 21 * 28 * (jamo_initial_index[(johab >> 10) & 31] - 1)
46  *                + 28 * (jamo_medial_index[(johab >> 5) & 31] - 1)
47  *                + jamo_final_index[johab & 31]
48  * where the index tables are defined as below.
49  */
50 
/*
 * Lookup tables from a 5-bit Johab field value to a jamo letter number.
 * A jamo number XX corresponds to UHC 0xA4A0+XX and to Unicode 0x3130+XX.
 * Each table is indexed by the raw 5-bit field (0..31); the row comments
 * give the index of the first entry in that row.
 */
#define NONE 0xfd  /* field value that is not assigned in this position */
#define FILL 0xff  /* "fill" placeholder: no jamo in this position */

/* Initial consonant field (bits 14..10): 19 jamo plus the fill value. */
static const unsigned char jamo_initial[32] = {
  /* 0x00 */ NONE, FILL, 0x01, 0x02,
  /* 0x04 */ 0x04, 0x07, 0x08, 0x09,
  /* 0x08 */ 0x11, 0x12, 0x13, 0x15,
  /* 0x0c */ 0x16, 0x17, 0x18, 0x19,
  /* 0x10 */ 0x1a, 0x1b, 0x1c, 0x1d,
  /* 0x14 */ 0x1e, NONE, NONE, NONE,
  /* 0x18 */ NONE, NONE, NONE, NONE,
  /* 0x1c */ NONE, NONE, NONE, NONE,
};

/* Medial vowel field (bits 9..5): 21 jamo plus the fill value. */
static const unsigned char jamo_medial[32] = {
  /* 0x00 */ NONE, NONE, FILL, 0x1f,
  /* 0x04 */ 0x20, 0x21, 0x22, 0x23,
  /* 0x08 */ NONE, NONE, 0x24, 0x25,
  /* 0x0c */ 0x26, 0x27, 0x28, 0x29,
  /* 0x10 */ NONE, NONE, 0x2a, 0x2b,
  /* 0x14 */ 0x2c, 0x2d, 0x2e, 0x2f,
  /* 0x18 */ NONE, NONE, 0x30, 0x31,
  /* 0x1c */ 0x32, 0x33, NONE, NONE,
};

/* Final consonant field (bits 4..0): 27 jamo plus the fill value. */
static const unsigned char jamo_final[32] = {
  /* 0x00 */ NONE, FILL, 0x01, 0x02,
  /* 0x04 */ 0x03, 0x04, 0x05, 0x06,
  /* 0x08 */ 0x07, 0x09, 0x0a, 0x0b,
  /* 0x0c */ 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0x10 */ 0x10, 0x11, NONE, 0x12,
  /* 0x14 */ 0x14, 0x15, 0x16, 0x17,
  /* 0x18 */ 0x18, 0x1a, 0x1b, 0x1c,
  /* 0x1c */ 0x1d, 0x1e, NONE, NONE,
};

/* Like jamo_final, but with every jamo that also occurs in jamo_initial
   (and the fill value) replaced by NONE: 11 jamo remain out of 27. */
static const unsigned char jamo_final_notinitial[32] = {
  /* 0x00 */ NONE, NONE, NONE, NONE,
  /* 0x04 */ 0x03, NONE, 0x05, 0x06,
  /* 0x08 */ NONE, NONE, 0x0a, 0x0b,
  /* 0x0c */ 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0x10 */ 0x10, NONE, NONE, NONE,
  /* 0x14 */ 0x14, NONE, NONE, NONE,
  /* 0x18 */ NONE, NONE, NONE, NONE,
  /* 0x1c */ NONE, NONE, NONE, NONE,
};
81 
/*
 * Lookup tables from a 5-bit Johab field value to a packed ordinal.
 * Assigned jamo are numbered consecutively starting at 1, the fill value
 * maps to 0, and unassigned field values map to -1. Each row below holds
 * eight consecutive field values.
 */
#define none (-1)
#define fill 0

/* Initial consonant field: ordinals 1..19. */
static const signed char jamo_initial_index[32] = {
  /*  0.. 7 */ none, fill,    1,    2,    3,    4,    5,    6,
  /*  8..15 */    7,    8,    9,   10,   11,   12,   13,   14,
  /* 16..23 */   15,   16,   17,   18,   19, none, none, none,
  /* 24..31 */ none, none, none, none, none, none, none, none,
};

/* Medial vowel field: ordinals 1..21. */
static const signed char jamo_medial_index[32] = {
  /*  0.. 7 */ none, none, fill,    1,    2,    3,    4,    5,
  /*  8..15 */ none, none,    6,    7,    8,    9,   10,   11,
  /* 16..23 */ none, none,   12,   13,   14,   15,   16,   17,
  /* 24..31 */ none, none,   18,   19,   20,   21, none, none,
};

/* Final consonant field: ordinals 1..27. */
static const signed char jamo_final_index[32] = {
  /*  0.. 7 */ none, fill,    1,    2,    3,    4,    5,    6,
  /*  8..15 */    7,    8,    9,   10,   11,   12,   13,   14,
  /* 16..23 */   15,   16, none,   17,   18,   19,   20,   21,
  /* 24..31 */   22,   23,   24,   25,   26,   27, none, none,
};
103 
104 static int
105 johab_hangul_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
106 {
107   unsigned char c1 = s[0];
108   if ((c1 >= 0x84 && c1 <= 0xd3)) {
109     if (n >= 2) {
110       unsigned char c2 = s[1];
111       if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) {
112         unsigned int johab = (c1 << 8) | c2;
113         unsigned int bitspart1 = (johab >> 10) & 31;
114         unsigned int bitspart2 = (johab >> 5) & 31;
115         unsigned int bitspart3 = johab & 31;
116         int index1 = jamo_initial_index[bitspart1];
117         int index2 = jamo_medial_index[bitspart2];
118         int index3 = jamo_final_index[bitspart3];
119         /* Exclude "none" values. */
120         if (index1 >= 0 && index2 >= 0 && index3 >= 0) {
121           /* Deal with "fill" values in initial or medial position. */
122           if (index1 == fill) {
123             if (index2 == fill) {
124               unsigned char jamo3 = jamo_final_notinitial[bitspart3];
125               if (jamo3 != NONE) {
126                 *pwc = (ucs4_t) 0x3130 + jamo3;
127                 return 2;
128               }
129             } else if (index3 == fill) {
130               unsigned char jamo2 = jamo_medial[bitspart2];
131               if (jamo2 != NONE && jamo2 != FILL) {
132                 *pwc = (ucs4_t) 0x3130 + jamo2;
133                 return 2;
134               }
135             }
136             /* Syllables composed only of medial and final don't exist. */
137           } else if (index2 == fill) {
138             if (index3 == fill) {
139               unsigned char jamo1 = jamo_initial[bitspart1];
140               if (jamo1 != NONE && jamo1 != FILL) {
141                 *pwc = (ucs4_t) 0x3130 + jamo1;
142                 return 2;
143               }
144             }
145             /* Syllables composed only of initial and final don't exist. */
146           } else {
147              /* index1 and index2 are not fill, but index3 may be fill. */
148              /* Nothing more to exclude. All 11172 code points are valid. */
149              *pwc = 0xac00 + ((index1 - 1) * 21 + (index2 - 1)) * 28 + index3;
150              return 2;
151           }
152         }
153       }
154       return RET_ILSEQ;
155     }
156     return RET_TOOFEW(0);
157   }
158   return RET_ILSEQ;
159 }
160 
/* Johab codes of the 51 jamo letters: 19 initial, 21 medial and 11 final
   that are not also initial. Entry i holds the code for Unicode
   0x3131 + i; each row is labelled with the code points it covers. */
static const unsigned short johab_hangul_page31[51] = {
  0x8841, 0x8c41, 0x8444, 0x9041,  /* U+3131..U+3134 */
  0x8446, 0x8447, 0x9441, 0x9841,  /* U+3135..U+3138 */
  0x9c41, 0x844a, 0x844b, 0x844c,  /* U+3139..U+313C */
  0x844d, 0x844e, 0x844f, 0x8450,  /* U+313D..U+3140 */
  0xa041, 0xa441, 0xa841, 0x8454,  /* U+3141..U+3144 */
  0xac41, 0xb041, 0xb441, 0xb841,  /* U+3145..U+3148 */
  0xbc41, 0xc041, 0xc441, 0xc841,  /* U+3149..U+314C */
  0xcc41, 0xd041, 0x8461, 0x8481,  /* U+314D..U+3150 */
  0x84a1, 0x84c1, 0x84e1, 0x8541,  /* U+3151..U+3154 */
  0x8561, 0x8581, 0x85a1, 0x85c1,  /* U+3155..U+3158 */
  0x85e1, 0x8641, 0x8661, 0x8681,  /* U+3159..U+315C */
  0x86a1, 0x86c1, 0x86e1, 0x8741,  /* U+315D..U+3160 */
  0x8761, 0x8781, 0x87a1,          /* U+3161..U+3163 */
};
171 
172 /* Tables mapping packed indices to 5-bit groups. */
/* Inverse of jamo_initial_index for the 19 initial jamo:
     jamo_initial_index[bitspart1] == index1+1
       <==>  jamo_initial_index_inverse[index1] == bitspart1  */
static const char jamo_initial_index_inverse[19] = {
  0x02, 0x03, 0x04, 0x05, 0x06,
  0x07, 0x08, 0x09, 0x0a, 0x0b,
  0x0c, 0x0d, 0x0e, 0x0f, 0x10,
  0x11, 0x12, 0x13, 0x14,
};
/* Inverse of jamo_medial_index for the 21 medial jamo:
     jamo_medial_index[bitspart2] == index2+1
       <==>  jamo_medial_index_inverse[index2] == bitspart2
   Each row is one contiguous run of assigned field values. */
static const char jamo_medial_index_inverse[21] = {
  0x03, 0x04, 0x05, 0x06, 0x07,
  0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x1a, 0x1b, 0x1c, 0x1d,
};
/* Inverse of jamo_final_index, covering the fill value (index3 == 0) and
   the 27 final jamo:
     jamo_final_index[bitspart3] == index3
       <==>  jamo_final_index_inverse[index3] == bitspart3
   The field value 0x12 is skipped. */
static const char jamo_final_index_inverse[28] = {
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  0x0f, 0x10, 0x11,
  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
  0x1a, 0x1b, 0x1c, 0x1d,
};
196 
197 static int
198 johab_hangul_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
199 {
200   if (n >= 2) {
201     if (wc >= 0x3131 && wc < 0x3164) {
202       unsigned short c = johab_hangul_page31[wc-0x3131];
203       r[0] = (c >> 8); r[1] = (c & 0xff);
204       return 2;
205     } else if (wc >= 0xac00 && wc < 0xd7a4) {
206       unsigned int index1;
207       unsigned int index2;
208       unsigned int index3;
209       unsigned short c;
210       unsigned int tmp = wc - 0xac00;
211       index3 = tmp % 28; tmp = tmp / 28;
212       index2 = tmp % 21; tmp = tmp / 21;
213       index1 = tmp;
214       c = (((((1 << 5)
215               | jamo_initial_index_inverse[index1]) << 5)
216             | jamo_medial_index_inverse[index2]) << 5)
217           | jamo_final_index_inverse[index3];
218       r[0] = (c >> 8); r[1] = (c & 0xff);
219       return 2;
220     }
221     return RET_ILUNI;
222   }
223   return RET_TOOSMALL;
224 }
225 
226 /*
227  * Decomposition of JOHAB Hangul in one to three Johab Jamo elements.
228  */
229 
230 /* Decompose wc into r[0..2], and return the number of resulting Jamo elements.
231    Return RET_ILUNI if decomposition is not possible. */
232 
233 static int johab_hangul_decompose (conv_t conv, ucs4_t* r, ucs4_t wc)
234 {
235   unsigned char buf[2];
236   int ret = johab_hangul_wctomb(conv,buf,wc,2);
237   if (ret != RET_ILUNI) {
238     unsigned int hangul = (buf[0] << 8) | buf[1];
239     unsigned char jamo1 = jamo_initial[(hangul >> 10) & 31];
240     unsigned char jamo2 = jamo_medial[(hangul >> 5) & 31];
241     unsigned char jamo3 = jamo_final[hangul & 31];
242     if ((hangul >> 15) != 1) abort();
243     if (jamo1 != NONE && jamo2 != NONE && jamo3 != NONE) {
244       /* They are not all three == FILL because that would correspond to
245          johab = 0x8441, which doesn't exist. */
246       ucs4_t* p = r;
247       if (jamo1 != FILL)
248         *p++ = 0x3130 + jamo1;
249       if (jamo2 != FILL)
250         *p++ = 0x3130 + jamo2;
251       if (jamo3 != FILL)
252         *p++ = 0x3130 + jamo3;
253       return p-r;
254     }
255   }
256   return RET_ILUNI;
257 }
258 
/* Drop the table helper macros, in the order they were defined, so they
   do not leak into code that follows this file's contents. */
#undef NONE
#undef FILL
#undef none
#undef fill
263