xref: /haiku/src/kits/textencoding/character_sets.cpp (revision 25a7b01d15612846f332751841da3579db313082)
1 #include <string.h>
2 #include <Catalog.h>
3 #include <Locale.h>
4 #include <CharacterSet.h>
5 #include <Debug.h>
6 #include "character_sets.h"
7 
8 #undef B_TRANSLATION_CONTEXT
9 #define B_TRANSLATION_CONTEXT "textencodings"
10 
11 namespace BPrivate {
12 
13 /**
14  * These variables are used in defining the character_sets_by_id array below.
15  * @see http://www.iana.org/assignments/character-sets
16  * @see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html
17  * @see http://www.openi18n.org/subgroups/sa/locnameguide/final/CodesetAliasTable.html
18  **/
19 
20 static const char * unicodeAliases[] = {
21 	// IANA aliases
22 	// java aliases
23 	"UTF8", "unicode-1-1-utf-8",
24 	NULL
25 };
26 static const BCharacterSet unicode(0,106, B_TRANSLATE("Unicode"),
27 	"UTF-8", "UTF-8",unicodeAliases);
28 
29 static const char * isoLatin1aliases[] = {
30 	// IANA aliases
31 	"iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1", "11", "IBM819", "CP819", "csISOLatin1",
32 	// java aliases
33 	"819", "IBM-819", "ISO8859_1", "8859_1", "ISO8859-1",
34 	NULL
35 };
36 static const BCharacterSet isoLatin1(1,4, B_TRANSLATE("ISO West European"),
37 	"ISO_8859-1:1987","ISO-8859-1",isoLatin1aliases);
38 
39 static const char * isoLatin2aliases[] = {
40 	// IANA aliases
41 	"iso-ir-101", "ISO_8859-2", "ISO-8859-2", "latin2", "12", "csISOLatin2",
42 	// java aliases
43 	"iso8859_2", "8859_2", "ISO8859-2", "ibm912", "ibm-912", "cp912", "912",
44 	NULL
45 };
46 static const BCharacterSet isoLatin2(2,5, B_TRANSLATE("ISO East European"),
47 	"ISO_8859-2:1987","ISO-8859-2",isoLatin2aliases);
48 
49 static const char * isoLatin3aliases[] = {
50 	// IANA aliases
51 	"iso-ir-109", "ISO_8859-3", "ISO-8859-3", "latin3", "13", "csISOLatin3",
52 	// java aliases
53 	"iso8859_3", "8859_3", "iso8859-3", "ibm913", "ibm-913", "cp913", "913",
54 	NULL
55 };
56 static const BCharacterSet isoLatin3(3,6, B_TRANSLATE("ISO South European"),
57 	"ISO_8859-3:1988","ISO-8859-3",isoLatin3aliases);
58 
59 static const char * isoLatin4aliases[] = {
60 	// IANA aliases
61 	"iso-ir-110", "ISO_8859-4", "ISO-8859-4", "latin4", "14", "csISOLatin4",
62 	// java aliases
63 	"iso8859_4", "iso8859-4", "8859_4", "ibm914", "ibm-914", "cp914", "914",
64 	NULL
65 };
66 static const BCharacterSet isoLatin4(4,7, B_TRANSLATE("ISO North European"),
67 	"ISO_8859-4:1988","ISO-8859-4",isoLatin4aliases);
68 
69 static const char * isoLatin5aliases[] = {
70 	// IANA aliases
71 	"iso-ir-144", "ISO_8859-5", "ISO-8859-5", "cyrillic", "csISOLatinCyrillic",
72 	// java aliases
73 	"iso8859_5", "8859_5", "ISO8859-5", "ibm915", "ibm-915", "cp915", "915",
74 	NULL
75 };
76 static const BCharacterSet isoLatin5(5,8, B_TRANSLATE("ISO Cyrillic"),
77 	"ISO_8859-5:1988","ISO-8859-5",isoLatin5aliases);
78 
79 static const char * isoLatin6aliases[] = {
80 	// IANA aliases
81 	"iso-ir-127", "ISO_8859-6", "ISO-8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic",
82 	// java aliases
83 	"iso8859_6", "8859_6", "ISO8859-6", "ibm1089", "ibm-1089", "cp1089", "1089",
84 	NULL
85 };
86 static const BCharacterSet isoLatin6(6,9, B_TRANSLATE("ISO Arabic"),
87 	"ISO_8859-6:1987","ISO-8859-6",isoLatin6aliases);
88 
89 static const char * isoLatin7aliases[] = {
90 	// IANA aliases
91 	"iso-ir-126", "ISO_8859-7", "ISO-8859-7", "ELOT_928", "ECMA-118", "greek", "greek8", "csISOLatinGreek",
92 	// java aliases
93 	"iso8859_7", "8859_7", "iso8859-7", "sun_eu_greek", "ibm813", "ibm-813", "813", "cp813",
94 	NULL
95 };
96 static const BCharacterSet isoLatin7(7,10, B_TRANSLATE("ISO Greek"),
97 	"ISO_8859-7:1987","ISO-8859-7",isoLatin7aliases);
98 
99 static const char * isoLatin8aliases[] = {
100 	// IANA aliases
101 	"iso-ir-138", "ISO_8859-8", "ISO-8859-8", "hebrew", "csISOLatinHebrew",
102 	// java aliases
103 	"iso8859_8", "8859_8", "ISO8859-8", "cp916", "916", "ibm916", "ibm-916",
104 	NULL
105 };
106 static const BCharacterSet isoLatin8(8,11, B_TRANSLATE("ISO Hebrew"),
107 	"ISO_8859-8:1988","ISO-8859-8",isoLatin8aliases);
108 
109 static const char * isoLatin9aliases[] = {
110 	// IANA aliases
111 	"iso-ir-148", "ISO_8859-9", "ISO-8859-9", "latin5", "15", "csISOLatin5",
112 	// java aliases
113 	"iso8859_9", "8859_9", "ibm920", "ibm-920", "920", "cp920",
114 	NULL
115 };
116 const BCharacterSet isoLatin9(9,12, B_TRANSLATE("ISO Turkish"),
117 	"ISO_8859-9:1989","ISO-8859-9",isoLatin9aliases);
118 
119 static const char * isoLatin10aliases[] = {
120 	// IANA aliases
121 	"iso-ir-157", "16", "ISO_8859-10:1992", "csISOLatin6", "latin6",
122 	// java aliases
123 	NULL
124 };
125 static const BCharacterSet isoLatin10(10,13, B_TRANSLATE("ISO Nordic"),
126 	"ISO-8859-10","ISO-8859-10",isoLatin10aliases);
127 
128 static const char * macintoshAliases[] = {
129 	// IANA aliases
130 	"mac", "csMacintosh",
131 	// java aliases
132 	"MacRoman",
133 	// mail kit aliases
134 	"x-mac-roman",
135 	NULL
136 };
137 static const BCharacterSet macintosh(11,2027, B_TRANSLATE("Macintosh Roman"),
138 	"macintosh",NULL,macintoshAliases);
139 
140 static const char * shiftJISaliases[] = {
141 	// IANA aliases
142 	"MS_Kanji", "csShiftJIS",
143 	// java aliases
144 	"sjis", "shift_jis", "shift-jis", "x-sjis",
145 	// mail kit aliases
146 	"shift_jisx0213",
147 	NULL
148 };
149 static const BCharacterSet shiftJIS(12,17, B_TRANSLATE("Japanese Shift JIS"),
150 	"Shift_JIS","Shift_JIS",shiftJISaliases);
151 
152 static const char * EUCPackedJapaneseAliases[] = {
153 	// IANA aliases
154 	"EUC-JP", "csEUCPkdFmtJapanese",
155 	// java aliases
156 	"eucjis", "eucjp", "x-euc-jp", "x-eucjp",
157 	// mail kit aliases
158 	"euc-jisx0213",
159 	NULL
160 };
161 static const BCharacterSet packedJapanese(13,18, B_TRANSLATE("Japanese EUC"),
162                                    "Extended_UNIX_Code_Packed_Format_for_Japanese","EUC-JP",
163                                    EUCPackedJapaneseAliases);
164 
165 static const char * iso2022jpAliases[] = {
166 	// IANA aliases
167 	"csISO2022JP",
168 	// java aliases
169 	"iso2022jp", "jis", "jis_encoding", "csjisencoding",
170 	NULL
171 };
172 static const BCharacterSet iso2022jp(14,39, B_TRANSLATE("Japanese JIS"),
173 	"ISO-2022-JP","ISO-2022-JP",iso2022jpAliases);
174 
175 static const char * windows1252aliases[] = {
176 	// IANA aliases
177 	// java aliases
178 	"cp1252", "cp5348",
179 	NULL
180 };
181 static const BCharacterSet windows1252(15,2252, B_TRANSLATE("Windows Latin-1 "
182 	"(CP 1252)"),"windows-1252",NULL,windows1252aliases);
183 
184 static const char * unicode2aliases[] = {
185 	// IANA aliases
186 	"csUnicode",
187 	// java aliases
188 	"UTF-16BE", "UTF_16BE", "X-UTF-16BE", "UnicodeBigUnmarked",
189 	NULL
190 };
191 static const BCharacterSet unicode2(16,1000, B_TRANSLATE("Unicode (UCS-2)"),
192 	"ISO-10646-UCS-2",NULL,unicode2aliases);
193 
194 static const char * KOI8Raliases[] = {
195 	// IANA aliases
196 	"csKOI8R",
197 	// java aliases
198 	"koi8_r", "koi8", "cskoi8r",
199 	NULL
200 };
201 static const BCharacterSet KOI8R(17,2084, B_TRANSLATE("KOI8-R Cyrillic"),
202 	"KOI8-R","KOI8-R",KOI8Raliases);
203 
204 static const char * windows1251aliases[] = {
205 	// IANA aliases
206 	// java aliases
207 	"cp1251", "cp5347", "ansi-1251",
208 	NULL
209 };
210 static const BCharacterSet windows1251(18,2251, B_TRANSLATE("Windows Cyrillic "
211 	"(CP 1251)"), "windows-1251",NULL,windows1251aliases);
212 
213 static const char * IBM866aliases[] = {
214 	// IANA aliases
215 	"cp866", "866", "csIBM866",
216 	// java aliases
217 	"ibm-866",
218 	// mail kit aliases
219 	"dos-866",
220 	NULL
221 };
222 static const BCharacterSet IBM866(19,2086, B_TRANSLATE("DOS Cyrillic"),
223 	"IBM866","IBM866",IBM866aliases);
224 
225 static const char * IBM437aliases[] = {
226 	// IANA aliases
227 	"cp437", "437", "csPC8CodePage437",
228 	// java aliases
229 	"ibm-437", "windows-437",
230 	// mail kit aliases
231 	"dos-437",
232 	NULL
233 };
234 static const BCharacterSet IBM437(20,2011, B_TRANSLATE("DOS Latin-US"),
235 	"IBM437","IBM437",IBM437aliases);
236 
237 static const char * eucKRaliases[] = {
238 	// IANA aliases
239 	"csEUCKR",
240 	// java aliases
241 	"ksc5601", "euckr", "ks_c_5601-1987", "ksc5601-1987",
242 	"ksc5601_1987", "ksc_5601", "5601",
243 	NULL
244 };
245 static const BCharacterSet eucKR(21,38, B_TRANSLATE("EUC Korean"),
246 	"EUC-KR","EUC-KR",eucKRaliases);
247 
248 static const char * iso13aliases[] = {
249 	// IANA aliases
250 	// java aliases
251 	"iso8859_13", "8859_13", "iso_8859-13", "ISO8859-13",
252 	NULL
253 };
254 static const BCharacterSet iso13(22,109, B_TRANSLATE("ISO Baltic"),
255 	"ISO-8859-13","ISO-8859-13",iso13aliases);
256 
257 static const char * iso14aliases[] = {
258 	// IANA aliases
259 	"iso-ir-199", "ISO_8859-14:1998", "ISO_8859-14", "latin8", "iso-celtic", "l8",
260 	NULL
261 };
262 static const BCharacterSet iso14(23,110, B_TRANSLATE("ISO Celtic"),
263 	"ISO-8859-14","ISO-8859-14",iso14aliases);
264 
265 static const char * iso15aliases[] = {
266 	// IANA aliases
267 	"ISO_8859-15", "Latin-9",
268 	// java aliases
269 	"8859_15", "ISO8859_15", "ISO8859-15", "IBM923", "IBM-923", "cp923", "923",
270 	"LATIN0", "LATIN9", "L9", "csISOlatin0", "csISOlatin9", "ISO8859_15_FDIS",
271 	NULL
272 };
273 static const BCharacterSet iso15(24,111, B_TRANSLATE("ISO Latin 9"),
274 	"ISO-8859-15","ISO-8859-15",iso15aliases);
275 
276 // chinese character set testing
277 
278 static const char * big5aliases[] = {
279 	// IANA aliases
280 	"csBig5",
281 	NULL
282 };
283 static const BCharacterSet big5(25,2026, B_TRANSLATE("Chinese Big5"),
284 	"Big5","Big5",big5aliases);
285 
286 static const char * gb18030aliases[] = {
287 	// java aliases
288 	"gb18030-2000",
289 	// mail kit aliases
290 	"gb2312",
291 	"gbk",
292 	NULL
293 };
294 static const BCharacterSet gb18030(26,114, B_TRANSLATE("Chinese GB18030"),
295 	"GB18030",NULL,gb18030aliases);
296 
297 static const char* kUTF16Aliases[] = {
298 	// IANA aliases
299 	"UTF-16",
300 	// java aliases
301 	"UTF-16BE", "X-UTF-16BE", "UnicodeBigUnmarked",
302 	NULL
303 };
304 static const BCharacterSet kUTF16(27, 1000, B_TRANSLATE("Unicode"), "UTF-16", "UTF-16",
305 	kUTF16Aliases);
306 
307 static const char* kWindows1250Aliases[] = {
308 	// IANA aliases
309 	"cswindows1250",
310 	// java aliases
311 	"cp1250",
312 	"ms-ee",
313 	NULL
314 };
315 static const BCharacterSet kWindows1250(28, 2250, B_TRANSLATE("Windows Central "
316 	"European (CP 1250)"), "windows-1250", NULL, kWindows1250Aliases);
317 
318 /**
319  * The following initializes the global character set array.
320  * It is organized by id for efficient retrieval using predefined constants in UTF8.h and Font.h.
321  * Character sets are stored contiguously and may be efficiently iterated over.
322  * To add a new character set, define the character set above -- remember to increment the id --
323  * and then add &<charSetName> to the _end_ of the following list.  That's all.
324  **/
325 
326 const BCharacterSet * character_sets_by_id[] = {
327 	&unicode,
328 	&isoLatin1, &isoLatin2, &isoLatin3,	&isoLatin4,	&isoLatin5,
329 	&isoLatin6,	&isoLatin7, &isoLatin8, &isoLatin9, &isoLatin10,
330 	&macintosh,
331 	// R5 BFont encodings end here
332 	&shiftJIS, &packedJapanese, &iso2022jp,
333 	&windows1252, &unicode2, &KOI8R, &windows1251,
334 	&IBM866, &IBM437, &eucKR, &iso13, &iso14, &iso15,
335 	// R5 convert_to/from_utf8 encodings end here
336 	&big5,&gb18030,
337 	&kUTF16,
338 	&kWindows1250,
339 };
340 const uint32 character_sets_by_id_count = sizeof(character_sets_by_id)/sizeof(const BCharacterSet*);
341 
/**
 * The following code initializes the global MIBenum array.
 * This sparsely populated array exists as an efficient way to access character sets by MIBenum.
 * The MIBenum array is automatically allocated and initialized by the following class.
 * The following class should be instantiated only once; an assertion checks this.
 * No changes are required to the following code to add a new character set.
 **/
349 
350 const BCharacterSet ** character_sets_by_MIBenum;
351 uint32 maximum_valid_MIBenum;
352 
353 static class MIBenumArrayInitializer {
354 public:
MIBenumArrayInitializer()355 	MIBenumArrayInitializer() {
356 		DEBUG_ONLY(static int onlyOneTime = 0;)
357 		ASSERT_WITH_MESSAGE(onlyOneTime++ == 0,"MIBenumArrayInitializer should be instantiated only one time.");
358 		// analyzing character_sets_by_id
359 		uint32 max_MIBenum = 0;
360 		for (uint32 index = 0 ; index < character_sets_by_id_count ; index++ ) {
361 			if (max_MIBenum < character_sets_by_id[index]->GetMIBenum()) {
362 				max_MIBenum = character_sets_by_id[index]->GetMIBenum();
363 			}
364 		}
365 		// initializing extern variables
366 		character_sets_by_MIBenum = new const BCharacterSet*[max_MIBenum+2];
367 		maximum_valid_MIBenum = max_MIBenum;
368 		// initializing MIBenum array
369 		memset(character_sets_by_MIBenum,0,sizeof(BCharacterSet*)*(max_MIBenum+2));
370 		for (uint32 index2 = 0 ; index2 < character_sets_by_id_count ; index2++ ) {
371 			const BCharacterSet * charset = character_sets_by_id[index2];
372 			character_sets_by_MIBenum[charset->GetMIBenum()] = charset;
373 		}
374 	}
~MIBenumArrayInitializer()375 	~MIBenumArrayInitializer()
376 	{
377 		delete [] character_sets_by_MIBenum;
378 	}
379 } runTheInitializer;
380 
381 }
382