xref: /haiku/src/kits/textencoding/character_sets.cpp (revision 95bac3fda53a4cb21880712d7b43f8c21db32a2e)
1 #include <string.h>
2 #include <CharacterSet.h>
3 #include <Debug.h>
4 #include "character_sets.h"
5 
6 namespace BPrivate {
7 
8 /**
9  * These variables are used in defining the character_sets_by_id array below.
10  * @see http://www.iana.org/assignments/character-sets
11  * @see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
12  * @see http://www.openi18n.org/subgroups/sa/locnameguide/final/CodesetAliasTable.html
13  **/
14 
15 static const char * unicodeAliases[] = {
16 	// IANA aliases
17 	// java aliases
18 	"UTF8", "unicode-1-1-utf-8",
19 	NULL
20 };
21 static const BCharacterSet unicode(0,106,"Unicode","UTF-8","UTF-8",unicodeAliases);
22 
23 static const char * isoLatin1aliases[] = {
24 	// IANA aliases
25 	"iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1", "11", "IBM819", "CP819", "csISOLatin1",
26 	// java aliases
27 	"819", "IBM-819", "ISO8859_1", "8859_1", "ISO8859-1",
28 	NULL
29 };
30 static const BCharacterSet isoLatin1(1,4,"ISO West European","ISO_8859-1:1987","ISO-8859-1",isoLatin1aliases);
31 
32 static const char * isoLatin2aliases[] = {
33 	// IANA aliases
34 	"iso-ir-101", "ISO_8859-2", "ISO-8859-2", "latin2", "12", "csISOLatin2",
35 	// java aliases
36 	"iso8859_2", "8859_2", "ISO8859-2", "ibm912", "ibm-912", "cp912", "912",
37 	NULL
38 };
39 static const BCharacterSet isoLatin2(2,5,"ISO East European","ISO_8859-2:1987","ISO-8859-2",isoLatin2aliases);
40 
41 static const char * isoLatin3aliases[] = {
42 	// IANA aliases
43 	"iso-ir-109", "ISO_8859-3", "ISO-8859-3", "latin3", "13", "csISOLatin3",
44 	// java aliases
45 	"iso8859_3", "8859_3", "iso8859-3", "ibm913", "ibm-913", "cp913", "913",
46 	NULL
47 };
48 static const BCharacterSet isoLatin3(3,6,"ISO South European","ISO_8859-3:1988","ISO-8859-3",isoLatin3aliases);
49 
50 static const char * isoLatin4aliases[] = {
51 	// IANA aliases
52 	"iso-ir-110", "ISO_8859-4", "ISO-8859-4", "latin4", "14", "csISOLatin4",
53 	// java aliases
54 	"iso8859_4", "iso8859-4", "8859_4", "ibm914", "ibm-914", "cp914", "914",
55 	NULL
56 };
57 static const BCharacterSet isoLatin4(4,7,"ISO North European","ISO_8859-4:1988","ISO-8859-4",isoLatin4aliases);
58 
59 static const char * isoLatin5aliases[] = {
60 	// IANA aliases
61 	"iso-ir-144", "ISO_8859-5", "ISO-8859-5", "cyrillic", "csISOLatinCyrillic",
62 	// java aliases
63 	"iso8859_5", "8859_5", "ISO8859-5", "ibm915", "ibm-915", "cp915", "915",
64 	NULL
65 };
66 static const BCharacterSet isoLatin5(5,8,"ISO Cyrillic","ISO_8859-5:1988","ISO-8859-5",isoLatin5aliases);
67 
68 static const char * isoLatin6aliases[] = {
69 	// IANA aliases
70 	"iso-ir-127", "ISO_8859-6", "ISO-8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic",
71 	// java aliases
72 	"iso8859_6", "8859_6", "ISO8859-6", "ibm1089", "ibm-1089", "cp1089", "1089",
73 	NULL
74 };
75 static const BCharacterSet isoLatin6(6,9,"ISO Arabic","ISO_8859-6:1987","ISO-8859-6",isoLatin6aliases);
76 
77 static const char * isoLatin7aliases[] = {
78 	// IANA aliases
79 	"iso-ir-126", "ISO_8859-7", "ISO-8859-7", "ELOT_928", "ECMA-118", "greek", "greek8", "csISOLatinGreek",
80 	// java aliases
81 	"iso8859_7", "8859_7", "iso8859-7", "sun_eu_greek", "ibm813", "ibm-813", "813", "cp813",
82 	NULL
83 };
84 static const BCharacterSet isoLatin7(7,10,"ISO Greek","ISO_8859-7:1987","ISO-8859-7",isoLatin7aliases);
85 
86 static const char * isoLatin8aliases[] = {
87 	// IANA aliases
88 	"iso-ir-138", "ISO_8859-8", "ISO-8859-8", "hebrew", "csISOLatinHebrew",
89 	// java aliases
90 	"iso8859_8", "8859_8", "ISO8859-8", "cp916", "916", "ibm916", "ibm-916",
91 	NULL
92 };
93 static const BCharacterSet isoLatin8(8,11,"ISO Hebrew","ISO_8859-8:1988","ISO-8859-8",isoLatin8aliases);
94 
95 static const char * isoLatin9aliases[] = {
96 	// IANA aliases
97 	"iso-ir-148", "ISO_8859-9", "ISO-8859-9", "latin5", "15", "csISOLatin5",
98 	// java aliases
99 	"iso8859_9", "8859_9", "ibm920", "ibm-920", "920", "cp920",
100 	NULL
101 };
102 const BCharacterSet isoLatin9(9,12,"ISO Turkish","ISO_8859-9:1989","ISO-8859-9",isoLatin9aliases);
103 
104 static const char * isoLatin10aliases[] = {
105 	// IANA aliases
106 	"iso-ir-157", "16", "ISO_8859-10:1992", "csISOLatin6", "latin6",
107 	// java aliases
108 	NULL
109 };
110 static const BCharacterSet isoLatin10(10,13,"ISO Nordic","ISO-8859-10","ISO-8859-10",isoLatin10aliases);
111 
112 static const char * macintoshAliases[] = {
113 	// IANA aliases
114 	"mac", "csMacintosh",
115 	// java aliases
116 	"MacRoman",
117 	// mail kit aliases
118 	"x-mac-roman",
119 	NULL
120 };
121 static const BCharacterSet macintosh(11,2027,"Macintosh Roman","macintosh",NULL,macintoshAliases);
122 
123 static const char * shiftJISaliases[] = {
124 	// IANA aliases
125 	"MS_Kanji", "csShiftJIS",
126 	// java aliases
127 	"sjis", "shift_jis", "shift-jis", "x-sjis",
128 	// mail kit aliases
129 	"shift_jisx0213",
130 	NULL
131 };
132 static const BCharacterSet shiftJIS(12,17,"Japanese Shift JIS","Shift_JIS","Shift_JIS",shiftJISaliases);
133 
134 static const char * EUCPackedJapaneseAliases[] = {
135 	// IANA aliases
136 	"EUC-JP", "csEUCPkdFmtJapanese",
137 	// java aliases
138 	"eucjis", "eucjp", "x-euc-jp", "x-eucjp",
139 	// mail kit aliases
140 	"euc-jisx0213",
141 	NULL
142 };
143 static const BCharacterSet packedJapanese(13,18,"Japanese EUC",
144                                    "Extended_UNIX_Code_Packed_Format_for_Japanese","EUC-JP",
145                                    EUCPackedJapaneseAliases);
146 
147 static const char * iso2022jpAliases[] = {
148 	// IANA aliases
149 	"csISO2022JP",
150 	// java aliases
151 	"iso2022jp", "jis", "jis_encoding", "csjisencoding",
152 	NULL
153 };
154 static const BCharacterSet iso2022jp(14,39,"Japanese JIS","ISO-2022-JP","ISO-2022-JP",iso2022jpAliases);
155 
156 static const char * windows1252aliases[] = {
157 	// IANA aliases
158 	// java aliases
159 	"cp1252", "cp5348",
160 	NULL
161 };
162 static const BCharacterSet windows1252(15,2252,"Windows Latin-1 (CP 1252)","windows-1252",NULL,windows1252aliases);
163 
164 static const char * unicode2aliases[] = {
165 	// IANA aliases
166 	"csUnicode",
167 	// java aliases
168 	"UTF_16BE", "X-UTF-16BE", "UnicodeBigUnmarked",
169 	NULL
170 };
171 static const BCharacterSet unicode2(16,1000,"Unicode (UTF-16)","ISO-10646-UCS-2",NULL,unicode2aliases);
172 
173 static const char * KOI8Raliases[] = {
174 	// IANA aliases
175 	"csKOI8R",
176 	// java aliases
177 	"koi8_r", "koi8", "cskoi8r",
178 	NULL
179 };
180 static const BCharacterSet KOI8R(17,2084,"KOI8-R Cyrillic","KOI8-R","KOI8-R",KOI8Raliases);
181 
182 static const char * windows1251aliases[] = {
183 	// IANA aliases
184 	// java aliases
185 	"cp1251", "cp5347", "ansi-1251",
186 	NULL
187 };
188 static const BCharacterSet windows1251(18,2251,"Windows Cyrillic (CP 1251)","windows-1251",NULL,windows1251aliases);
189 
190 static const char * IBM866aliases[] = {
191 	// IANA aliases
192 	"cp866", "866", "csIBM866",
193 	// java aliases
194 	"ibm-866",
195 	// mail kit aliases
196 	"dos-866",
197 	NULL
198 };
199 static const BCharacterSet IBM866(19,2086,"DOS Cyrillic","IBM866","IBM866",IBM866aliases);
200 
201 static const char * IBM437aliases[] = {
202 	// IANA aliases
203 	"cp437", "437", "csPC8CodePage437",
204 	// java aliases
205 	"ibm-437", "windows-437",
206 	// mail kit aliases
207 	"dos-437",
208 	NULL
209 };
210 static const BCharacterSet IBM437(20,2011,"DOS Latin-US","IBM437","IBM437",IBM437aliases);
211 
212 static const char * eucKRaliases[] = {
213 	// IANA aliases
214 	"csEUCKR",
215 	// java aliases
216 	"ksc5601", "euckr", "ks_c_5601-1987", "ksc5601-1987", "ksc5601_1987", "ksc_5601", "5601",
217 	NULL
218 };
219 static const BCharacterSet eucKR(21,38,"EUC Korean","EUC-KR","EUC-KR",eucKRaliases);
220 
221 static const char * iso13aliases[] = {
222 	// IANA aliases
223 	// java aliases
224 	"iso8859_13", "8859_13", "iso_8859-13", "ISO8859-13",
225 	NULL
226 };
227 static const BCharacterSet iso13(22,109,"ISO Baltic","ISO-8859-13","ISO-8859-13",iso13aliases);
228 
229 static const char * iso14aliases[] = {
230 	// IANA aliases
231 	"iso-ir-199", "ISO_8859-14:1998", "ISO_8859-14", "latin8", "iso-celtic", "l8",
232 	NULL
233 };
234 static const BCharacterSet iso14(23,110,"ISO Celtic","ISO-8859-14","ISO-8859-14",iso14aliases);
235 
236 static const char * iso15aliases[] = {
237 	// IANA aliases
238 	"ISO_8859-15", "Latin-9",
239 	// java aliases
240 	"8859_15", "ISO8859_15", "ISO8859-15", "IBM923", "IBM-923", "cp923", "923",
241 	"LATIN0", "LATIN9", "L9", "csISOlatin0", "csISOlatin9", "ISO8859_15_FDIS",
242 	NULL
243 };
244 static const BCharacterSet iso15(24,111,"ISO Latin 9","ISO-8859-15","ISO-8859-15",iso15aliases);
245 
246 // chinese character set testing
247 
248 static const char * big5aliases[] = {
249 	// IANA aliases
250 	"csBig5",
251 	NULL
252 };
253 static const BCharacterSet big5(25,2026,"Chinese Big5","Big5","Big5",big5aliases);
254 
255 static const char * gb18030aliases[] = {
256 	// java aliases
257 	"gb18030-2000",
258 	// mail kit aliases
259 	"gb2312",
260 	"gbk",
261 	NULL
262 };
263 static const BCharacterSet gb18030(26,114,"Chinese GB18030","GB18030",NULL,gb18030aliases);
264 
265 /**
266  * The following initializes the global character set array.
267  * It is organized by id for efficient retrieval using predefined constants in UTF8.h and Font.h.
268  * Character sets are stored contiguously and may be efficiently iterated over.
269  * To add a new character set, define the character set above -- remember to increment the id --
270  * and then add &<charSetName> to the _end_ of the following list.  That's all.
271  **/
272 
273 const BCharacterSet * character_sets_by_id[] = {
274 	&unicode,
275 	&isoLatin1, &isoLatin2, &isoLatin3,	&isoLatin4,	&isoLatin5,
276 	&isoLatin6,	&isoLatin7, &isoLatin8, &isoLatin9, &isoLatin10,
277 	&macintosh,
278 	// R5 BFont encodings end here
279 	&shiftJIS, &packedJapanese, &iso2022jp,
280 	&windows1252, &unicode2, &KOI8R, &windows1251,
281 	&IBM866, &IBM437, &eucKR, &iso13, &iso14, &iso15,
282 	// R5 convert_to/from_utf8 encodings end here
283 	&big5,&gb18030,
284 };
285 const uint32 character_sets_by_id_count = sizeof(character_sets_by_id)/sizeof(const BCharacterSet*);
286 
287 /**
288  * The following code initializes the global MIBenum array.
289  * This sparsely populated array exists as an efficient way to access character sets by MIBenum.
290  * The MIBenum array is automatically allocated, and initialized by the following class.
291  * The following class should only be instantiated once, this is assured by using an assertion.
292  * No changes are required to the following code to add a new character set.
293  **/
294 
295 const BCharacterSet ** character_sets_by_MIBenum;
296 uint32 maximum_valid_MIBenum;
297 
298 static class MIBenumArrayInitializer {
299 public:
300 	MIBenumArrayInitializer() {
301 		DEBUG_ONLY(static int onlyOneTime = 0;)
302 		ASSERT_WITH_MESSAGE(onlyOneTime++ == 0,"MIBenumArrayInitializer should be instantiated only one time.");
303 		// analyzing character_sets_by_id
304 		uint32 max_MIBenum = 0;
305 		for (uint32 index = 0 ; index < character_sets_by_id_count ; index++ ) {
306 			if (max_MIBenum < character_sets_by_id[index]->GetMIBenum()) {
307 				max_MIBenum = character_sets_by_id[index]->GetMIBenum();
308 			}
309 		}
310 		// initializing extern variables
311 		character_sets_by_MIBenum = new (const BCharacterSet*)[max_MIBenum+2];
312 		maximum_valid_MIBenum = max_MIBenum;
313 		// initializing MIBenum array
314 		memset(character_sets_by_MIBenum,0,sizeof(BCharacterSet*)*(max_MIBenum+2));
315 		for (uint32 index2 = 0 ; index2 < character_sets_by_id_count ; index2++ ) {
316 			const BCharacterSet * charset = character_sets_by_id[index2];
317 			character_sets_by_MIBenum[charset->GetMIBenum()] = charset;
318 		}
319 	}
320 	~MIBenumArrayInitializer()
321 	{
322 		delete [] character_sets_by_MIBenum;
323 	}
324 } runTheInitializer;
325 
326 }
327