xref: /haiku/src/kits/textencoding/character_sets.cpp (revision 01b25646004ff628ecad0281a9795e5e90f71746)
1 #include <string.h>
2 #include <CharacterSet.h>
3 #include <Debug.h>
4 #include "character_sets.h"
5 
6 namespace BPrivate {
7 
8 /**
9  * These variables are used in defining the character_sets_by_id array below.
10  * @see http://www.iana.org/assignments/character-sets
11  **/
12 
13 const BCharacterSet unicode(0,106,"Unicode","UTF-8","UTF-8",NULL);
14 
15 const char * isoLatin1aliases[] =
16  { "iso-ir-100","ISO_8859-1","ISO-8859-1","latin1","11","IBM819","CP819","csISOLatin1",NULL };
17 const BCharacterSet isoLatin1(1,4,"ISO Latin 1","ISO_8859-1:1987","ISO_8859-1",isoLatin1aliases);
18 
19 const char * isoLatin2aliases[] =
20  { "iso-ir-101","ISO_8859-2","ISO-8859-2","latin2","12","csISOLatin2",NULL };
21 const BCharacterSet isoLatin2(2,5,"ISO Latin 2","ISO_8859-2:1987","ISO_8859-2",isoLatin2aliases);
22 
23 const char * isoLatin3aliases[] =
24  { "iso-ir-109","ISO_8859-3","ISO-8859-3","latin3","13","csISOLatin3",NULL };
25 const BCharacterSet isoLatin3(3,6,"ISO Latin 3","ISO_8859-3:1988","ISO_8859-3",isoLatin3aliases);
26 
27 const char * isoLatin4aliases[] =
28  { "iso-ir-110","ISO_8859-4","ISO-8859-4","latin4","14","csISOLatin4",NULL };
29 const BCharacterSet isoLatin4(4,7,"ISO Latin 4","ISO_8859-4:1988","ISO_8859-4",isoLatin4aliases);
30 
31 const char * isoLatin5aliases[] =
32  { "iso-ir-144","ISO_8859-5","ISO-8859-5","cyrillic","csISOLatinCyrillic",NULL };
33 const BCharacterSet isoLatin5(5,8,"ISO Cyrillic","ISO_8859-5:1988","ISO_8859-5",isoLatin5aliases);
34 
35 const char * isoLatin6aliases[] =
36  { "iso-ir-127","ISO_8859-6","ISO-8859-6","ECMA-114","ASMO-708","arabic","csISOLatinArabic",NULL };
37 const BCharacterSet isoLatin6(6,9,"ISO Arabic","ISO_8859-6:1987","ISO_8859-6",isoLatin6aliases);
38 
39 const char * isoLatin7aliases[] =
40  { "iso-ir-126","ISO_8859-7","ISO-8859-7","ELOT_928","ECMA-118","greek","greek8","csISOLatinGreek",NULL };
41 const BCharacterSet isoLatin7(7,10,"ISO Greek","ISO_8859-7:1987","ISO_8859-7",isoLatin7aliases);
42 
43 const char * isoLatin8aliases[] =
44  { "iso-ir-138","ISO_8859-8","ISO-8859-8","hebrew","csISOLatinHebrew",NULL };
45 const BCharacterSet isoLatin8(8,11,"ISO Hebrew","ISO_8859-8:1988","ISO-8859-8",isoLatin8aliases);
46 
47 const char * isoLatin9aliases[] =
48  { "iso-ir-148","ISO_8859-9","ISO-8859-9","latin5","15","csISOLatin5",NULL };
49 const BCharacterSet isoLatin9(9,12,"ISO Latin 5","ISO_8859-9:1989","ISO-8859-9",isoLatin9aliases);
50 
51 const char * isoLatin10aliases[] =
52  { "iso-ir-157","16","ISO_8859-10:1992","csISOLatin6","latin6",NULL };
53 const BCharacterSet isoLatin10(10,13,"ISO Latin 6","ISO-8859-10","ISO-8859-10",isoLatin10aliases);
54 
55 const char * macintoshAliases[] =
56  { "mac","csMacintosh",NULL };
57 const BCharacterSet macintosh(11,2027,"Macintosh Roman","macintosh",NULL,macintoshAliases);
58 
59 const char * shiftJISaliases[] =
60  { "MS_Kanji","csShiftJIS",NULL };
61 const BCharacterSet shiftJIS(12,17,"Shift JIS","Shift_JIS","Shift_JIS",shiftJISaliases);
62 
63 const char * EUCPackedJapaneseAliases[] =
64  { "EUC-JP","csEUCPkdFmtJapanese",NULL };
65 const BCharacterSet packedJapanese(13,18,"EUC Packed Format Japanese",
66                                    "Extended_UNIX_Code_Packed_Format_for_Japanese","EUC-JP",
67                                    EUCPackedJapaneseAliases);
68 
69 const char * JIS0208aliases[] =
70  { "iso-ir-87","x0208","JIS_X0208-1983","csISO87JISX0208",NULL };
71 const BCharacterSet JIS0208(14,63,"JIS 0208","JIS_C6226-1983",NULL,JIS0208aliases);
72 
73 const BCharacterSet windows1252(15,2252,"MS-Windows Codepage 1252","windows-1252",NULL,NULL);
74 
75 const char * unicode2aliases[] =
76  { "csUnicode",NULL };
77 const BCharacterSet unicode2(16,1000,"Unicode 2.0","ISO-10646-UCS-2",NULL,unicode2aliases);
78 
79 const char * KOI8Raliases[] =
80  { "csKOI8R",NULL };
81 const BCharacterSet KOI8R(17,2084,"KOI8-R Cyrillic","KOI8-R","KOI8-R",KOI8Raliases);
82 
83 const BCharacterSet windows1251(18,2251,"MS-Windows Codepage 1251","windows-1251",NULL,NULL);
84 
85 const char * IBM866aliases[] =
86  { "cp866","866","csIBM866",NULL };
87 const BCharacterSet IBM866(19,2086,"IBM Codepage 866","IBM866","IBM866",IBM866aliases);
88 
89 const char * IBM437aliases[] =
90  { "cp437","437","csPC8CodePage437",NULL };
91 const BCharacterSet IBM437(20,2011,"IBM Codepage 437","IBM437","IBM437",IBM437aliases);
92 
93 const char * eucKRaliases[] =
94  { "csEUCKR",NULL };
95 const BCharacterSet eucKR(21,38,"EUC Korean","EUC-KR","EUC-KR",eucKRaliases);
96 
97 const BCharacterSet iso13(22,109,"ISO 8859-13","ISO-8859-13","ISO-8859-13",NULL);
98 
99 const char * iso14aliases[] =
100  { "iso-ir-199","ISO_8859-14:1998","ISO_8859-14","latin8","iso-celtic","l8",NULL };
101 const BCharacterSet iso14(23,110,"ISO 8859-14","ISO-8859-14","ISO-8859-14",iso14aliases);
102 
103 const char * iso15aliases[] =
104  { "ISO_8859-14","Latin-9",NULL };
105 const BCharacterSet iso15(24,111,"ISO 8859-15","ISO-8859-15","ISO-8859-15",iso15aliases);
106 
// Chinese character set testing
108 
109 const char * big5aliases[] =
110  { "csBig5",NULL };
111 const BCharacterSet big5(25,2026,"Big5","Big5","Big5",big5aliases);
112 
113 const BCharacterSet gb18030(26,114,"GB18030","GB18030",NULL,NULL);
114 
115 /**
116  * The following initializes the global character set array.
117  * It is organized by id for efficient retrieval using predefined constants in UTF8.h and Font.h.
118  * Character sets are stored contiguously and may be efficiently iterated over.
119  * To add a new character set, define the character set above -- remember to increment the id --
120  * and then add &<charSetName> to the _end_ of the following list.  That's all.
121  **/
122 
123 const BCharacterSet * character_sets_by_id[] = {
124 	&unicode,
125 	&isoLatin1, &isoLatin2, &isoLatin3,	&isoLatin4,	&isoLatin5,
126 	&isoLatin6,	&isoLatin7, &isoLatin8, &isoLatin9, &isoLatin10,
127 	&macintosh,
128 	// R5 BFont encodings end here
129 	&shiftJIS, &packedJapanese, &JIS0208,
130 	&windows1252, &unicode2, &KOI8R, &windows1251,
131 	&IBM866, &IBM437, &eucKR, &iso13, &iso14, &iso15,
132 	// R5 convert_to/from_utf8 encodings end here
133 	&big5,&gb18030,
134 };
135 const uint32 character_sets_by_id_count = sizeof(character_sets_by_id)/sizeof(const BCharacterSet*);
136 
/**
 * The following code initializes the global MIBenum array.
 * This sparsely populated array exists as an efficient way to access character sets by MIBenum.
 * The MIBenum array is allocated and initialized automatically by the following class.
 * The class should be instantiated only once; this is checked with an assertion.
 * No changes to the following code are required to add a new character set.
 **/
144 
145 const BCharacterSet ** character_sets_by_MIBenum;
146 uint32 maximum_valid_MIBenum;
147 
148 class MIBenumArrayInitializer {
149 public:
150 	MIBenumArrayInitializer() {
151 		DEBUG_ONLY(static int onlyOneTime = 0;)
152 		ASSERT_WITH_MESSAGE(onlyOneTime++ == 0,"MIBenumArrayInitializer should be instantiated only one time.");
153 		// analyzing character_sets_by_id
154 		uint32 max_MIBenum = 0;
155 		for (uint32 index = 0 ; index < character_sets_by_id_count ; index++ ) {
156 			if (max_MIBenum < character_sets_by_id[index]->GetMIBenum()) {
157 				max_MIBenum = character_sets_by_id[index]->GetMIBenum();
158 			}
159 		}
160 		// initializing extern variables
161 		character_sets_by_MIBenum = new (const BCharacterSet*)[max_MIBenum+2];
162 		maximum_valid_MIBenum = max_MIBenum;
163 		// initializing MIBenum array
164 		memset(character_sets_by_MIBenum,0,sizeof(BCharacterSet*)*(max_MIBenum+2));
165 		for (uint32 index2 = 0 ; index2 < character_sets_by_id_count ; index2++ ) {
166 			const BCharacterSet * charset = character_sets_by_id[index2];
167 			character_sets_by_MIBenum[charset->GetMIBenum()] = charset;
168 		}
169 	}
170 	~MIBenumArrayInitializer()
171 	{
172 		delete [] character_sets_by_MIBenum;
173 	}
174 } runTheInitializer;
175 
176 }
177 
178