1 #include <string.h> 2 #include <CharacterSet.h> 3 #include <Debug.h> 4 #include "character_sets.h" 5 6 namespace BPrivate { 7 8 /** 9 * These variables are used in defining the character_sets_by_id array below. 10 * @see http://www.iana.org/assignments/character-sets 11 * @see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html 12 * @see http://www.openi18n.org/subgroups/sa/locnameguide/final/CodesetAliasTable.html 13 **/ 14 15 static const char * unicodeAliases[] = { 16 // IANA aliases 17 // java aliases 18 "UTF8", "unicode-1-1-utf-8", 19 NULL 20 }; 21 static const BCharacterSet unicode(0,106,"Unicode","UTF-8","UTF-8",unicodeAliases); 22 23 static const char * isoLatin1aliases[] = { 24 // IANA aliases 25 "iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1", "11", "IBM819", "CP819", "csISOLatin1", 26 // java aliases 27 "819", "IBM-819", "ISO8859_1", "8859_1", "ISO8859-1", 28 NULL 29 }; 30 static const BCharacterSet isoLatin1(1,4,"ISO West European","ISO_8859-1:1987","ISO-8859-1",isoLatin1aliases); 31 32 static const char * isoLatin2aliases[] = { 33 // IANA aliases 34 "iso-ir-101", "ISO_8859-2", "ISO-8859-2", "latin2", "12", "csISOLatin2", 35 // java aliases 36 "iso8859_2", "8859_2", "ISO8859-2", "ibm912", "ibm-912", "cp912", "912", 37 NULL 38 }; 39 static const BCharacterSet isoLatin2(2,5,"ISO East European","ISO_8859-2:1987","ISO-8859-2",isoLatin2aliases); 40 41 static const char * isoLatin3aliases[] = { 42 // IANA aliases 43 "iso-ir-109", "ISO_8859-3", "ISO-8859-3", "latin3", "13", "csISOLatin3", 44 // java aliases 45 "iso8859_3", "8859_3", "iso8859-3", "ibm913", "ibm-913", "cp913", "913", 46 NULL 47 }; 48 static const BCharacterSet isoLatin3(3,6,"ISO South European","ISO_8859-3:1988","ISO-8859-3",isoLatin3aliases); 49 50 static const char * isoLatin4aliases[] = { 51 // IANA aliases 52 "iso-ir-110", "ISO_8859-4", "ISO-8859-4", "latin4", "14", "csISOLatin4", 53 // java aliases 54 "iso8859_4", "iso8859-4", "8859_4", "ibm914", "ibm-914", "cp914", "914", 55 NULL 56 }; 57 static const BCharacterSet isoLatin4(4,7,"ISO North European","ISO_8859-4:1988","ISO-8859-4",isoLatin4aliases); 58 59 static const char * isoLatin5aliases[] = { 60 // IANA aliases 61 "iso-ir-144", "ISO_8859-5", "ISO-8859-5", "cyrillic", "csISOLatinCyrillic", 62 // java aliases 63 "iso8859_5", "8859_5", "ISO8859-5", "ibm915", "ibm-915", "cp915", "915", 64 NULL 65 }; 66 static const BCharacterSet isoLatin5(5,8,"ISO Cyrillic","ISO_8859-5:1988","ISO-8859-5",isoLatin5aliases); 67 68 static const char * isoLatin6aliases[] = { 69 // IANA aliases 70 "iso-ir-127", "ISO_8859-6", "ISO-8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic", 71 // java aliases 72 "iso8859_6", "8859_6", "ISO8859-6", "ibm1089", "ibm-1089", "cp1089", "1089", 73 NULL 74 }; 75 static const BCharacterSet isoLatin6(6,9,"ISO Arabic","ISO_8859-6:1987","ISO-8859-6",isoLatin6aliases); 76 77 static const char * isoLatin7aliases[] = { 78 // IANA aliases 79 "iso-ir-126", "ISO_8859-7", "ISO-8859-7", "ELOT_928", "ECMA-118", "greek", "greek8", "csISOLatinGreek", 80 // java aliases 81 "iso8859_7", "8859_7", "iso8859-7", "sun_eu_greek", "ibm813", "ibm-813", "813", "cp813", 82 NULL 83 }; 84 static const BCharacterSet isoLatin7(7,10,"ISO Greek","ISO_8859-7:1987","ISO-8859-7",isoLatin7aliases); 85 86 static const char * isoLatin8aliases[] = { 87 // IANA aliases 88 "iso-ir-138", "ISO_8859-8", "ISO-8859-8", "hebrew", "csISOLatinHebrew", 89 // java aliases 90 "iso8859_8", "8859_8", "ISO8859-8", "cp916", "916", "ibm916", "ibm-916", 91 NULL 92 }; 93 static const BCharacterSet isoLatin8(8,11,"ISO Hebrew","ISO_8859-8:1988","ISO-8859-8",isoLatin8aliases); 94 95 static const char * isoLatin9aliases[] = { 96 // IANA aliases 97 "iso-ir-148", "ISO_8859-9", "ISO-8859-9", "latin5", "15", "csISOLatin5", 98 // java aliases 99 "iso8859_9", "8859_9", "ibm920", "ibm-920", "920", "cp920", 100 NULL 101 }; 102 const BCharacterSet isoLatin9(9,12,"ISO Turkish","ISO_8859-9:1989","ISO-8859-9",isoLatin9aliases); 103 104 static const char * isoLatin10aliases[] = { 105 // IANA aliases 106 "iso-ir-157", "16", "ISO_8859-10:1992", "csISOLatin6", "latin6", 107 // java aliases 108 NULL 109 }; 110 static const BCharacterSet isoLatin10(10,13,"ISO Nordic","ISO-8859-10","ISO-8859-10",isoLatin10aliases); 111 112 static const char * macintoshAliases[] = { 113 // IANA aliases 114 "mac", "csMacintosh", 115 // java aliases 116 "MacRoman", 117 // mail kit aliases 118 "x-mac-roman", 119 NULL 120 }; 121 static const BCharacterSet macintosh(11,2027,"Macintosh Roman","macintosh",NULL,macintoshAliases); 122 123 static const char * shiftJISaliases[] = { 124 // IANA aliases 125 "MS_Kanji", "csShiftJIS", 126 // java aliases 127 "sjis", "shift_jis", "shift-jis", "x-sjis", 128 // mail kit aliases 129 "shift_jisx0213", 130 NULL 131 }; 132 static const BCharacterSet shiftJIS(12,17,"Japanese Shift JIS","Shift_JIS","Shift_JIS",shiftJISaliases); 133 134 static const char * EUCPackedJapaneseAliases[] = { 135 // IANA aliases 136 "EUC-JP", "csEUCPkdFmtJapanese", 137 // java aliases 138 "eucjis", "eucjp", "x-euc-jp", "x-eucjp", 139 // mail kit aliases 140 "euc-jisx0213", 141 NULL 142 }; 143 static const BCharacterSet packedJapanese(13,18,"Japanese EUC", 144 "Extended_UNIX_Code_Packed_Format_for_Japanese","EUC-JP", 145 EUCPackedJapaneseAliases); 146 147 static const char * iso2022jpAliases[] = { 148 // IANA aliases 149 "csISO2022JP", 150 // java aliases 151 "iso2022jp", "jis", "jis_encoding", "csjisencoding", 152 NULL 153 }; 154 static const BCharacterSet iso2022jp(14,39,"Japanese JIS","ISO-2022-JP","ISO-2022-JP",iso2022jpAliases); 155 156 static const char * windows1252aliases[] = { 157 // IANA aliases 158 // java aliases 159 "cp1252", "cp5348", 160 NULL 161 }; 162 static const BCharacterSet windows1252(15,2252,"Windows Latin-1 (CP 1252)","windows-1252",NULL,windows1252aliases); 163 164 static const char * unicode2aliases[] = { 165 // IANA aliases 166 "csUnicode", 167 // java aliases 168 "UTF_16BE", "X-UTF-16BE", "UnicodeBigUnmarked", 169 NULL 170 }; 171 static const BCharacterSet unicode2(16,1000,"Unicode (UTF-16)","ISO-10646-UCS-2",NULL,unicode2aliases); 172 173 static const char * KOI8Raliases[] = { 174 // IANA aliases 175 "csKOI8R", 176 // java aliases 177 "koi8_r", "koi8", "cskoi8r", 178 NULL 179 }; 180 static const BCharacterSet KOI8R(17,2084,"KOI8-R Cyrillic","KOI8-R","KOI8-R",KOI8Raliases); 181 182 static const char * windows1251aliases[] = { 183 // IANA aliases 184 // java aliases 185 "cp1251", "cp5347", "ansi-1251", 186 NULL 187 }; 188 static const BCharacterSet windows1251(18,2251,"Windows Cyrillic (CP 1251)","windows-1251",NULL,windows1251aliases); 189 190 static const char * IBM866aliases[] = { 191 // IANA aliases 192 "cp866", "866", "csIBM866", 193 // java aliases 194 "ibm-866", 195 // mail kit aliases 196 "dos-866", 197 NULL 198 }; 199 static const BCharacterSet IBM866(19,2086,"DOS Cyrillic","IBM866","IBM866",IBM866aliases); 200 201 static const char * IBM437aliases[] = { 202 // IANA aliases 203 "cp437", "437", "csPC8CodePage437", 204 // java aliases 205 "ibm-437", "windows-437", 206 // mail kit aliases 207 "dos-437", 208 NULL 209 }; 210 static const BCharacterSet IBM437(20,2011,"DOS Latin-US","IBM437","IBM437",IBM437aliases); 211 212 static const char * eucKRaliases[] = { 213 // IANA aliases 214 "csEUCKR", 215 // java aliases 216 "ksc5601", "euckr", "ks_c_5601-1987", "ksc5601-1987", "ksc5601_1987", "ksc_5601", "5601", 217 NULL 218 }; 219 static const BCharacterSet eucKR(21,38,"EUC Korean","EUC-KR","EUC-KR",eucKRaliases); 220 221 static const char * iso13aliases[] = { 222 // IANA aliases 223 // java aliases 224 "iso8859_13", "8859_13", "iso_8859-13", "ISO8859-13", 225 NULL 226 }; 227 static const BCharacterSet iso13(22,109,"ISO Baltic","ISO-8859-13","ISO-8859-13",iso13aliases); 228 229 static const char * iso14aliases[] = { 230 // IANA aliases 231 "iso-ir-199", "ISO_8859-14:1998", "ISO_8859-14", "latin8", "iso-celtic", "l8", 232 NULL 233 }; 234 static const BCharacterSet iso14(23,110,"ISO Celtic","ISO-8859-14","ISO-8859-14",iso14aliases); 235 236 static const char * iso15aliases[] = { 237 // IANA aliases 238 "ISO_8859-15", "Latin-9", 239 // java aliases 240 "8859_15", "ISO8859_15", "ISO8859-15", "IBM923", "IBM-923", "cp923", "923", 241 "LATIN0", "LATIN9", "L9", "csISOlatin0", "csISOlatin9", "ISO8859_15_FDIS", 242 NULL 243 }; 244 static const BCharacterSet iso15(24,111,"ISO Latin 9","ISO-8859-15","ISO-8859-15",iso15aliases); 245 246 // chinese character set testing 247 248 static const char * big5aliases[] = { 249 // IANA aliases 250 "csBig5", 251 NULL 252 }; 253 static const BCharacterSet big5(25,2026,"Chinese Big5","Big5","Big5",big5aliases); 254 255 static const char * gb18030aliases[] = { 256 // java aliases 257 "gb18030-2000", 258 // mail kit aliases 259 "gb2312", 260 "gbk", 261 NULL 262 }; 263 static const BCharacterSet gb18030(26,114,"Chinese GB18030","GB18030",NULL,gb18030aliases); 264 265 /** 266 * The following initializes the global character set array. 267 * It is organized by id for efficient retrieval using predefined constants in UTF8.h and Font.h. 268 * Character sets are stored contiguously and may be efficiently iterated over. 269 * To add a new character set, define the character set above -- remember to increment the id -- 270 * and then add &<charSetName> to the _end_ of the following list. That's all. 271 **/ 272 273 const BCharacterSet * character_sets_by_id[] = { 274 &unicode, 275 &isoLatin1, &isoLatin2, &isoLatin3, &isoLatin4, &isoLatin5, 276 &isoLatin6, &isoLatin7, &isoLatin8, &isoLatin9, &isoLatin10, 277 &macintosh, 278 // R5 BFont encodings end here 279 &shiftJIS, &packedJapanese, &iso2022jp, 280 &windows1252, &unicode2, &KOI8R, &windows1251, 281 &IBM866, &IBM437, &eucKR, &iso13, &iso14, &iso15, 282 // R5 convert_to/from_utf8 encodings end here 283 &big5,&gb18030, 284 }; 285 const uint32 character_sets_by_id_count = sizeof(character_sets_by_id)/sizeof(const BCharacterSet*); 286 287 /** 288 * The following code initializes the global MIBenum array. 289 * This sparsely populated array exists as an efficient way to access character sets by MIBenum. 290 * The MIBenum array is automatically allocated, and initialized by the following class. 291 * The following class should only be instantiated once, this is assured by using an assertion. 292 * No changes are required to the following code to add a new character set. 293 **/ 294 295 const BCharacterSet ** character_sets_by_MIBenum; 296 uint32 maximum_valid_MIBenum; 297 298 static class MIBenumArrayInitializer { 299 public: 300 MIBenumArrayInitializer() { 301 DEBUG_ONLY(static int onlyOneTime = 0;) 302 ASSERT_WITH_MESSAGE(onlyOneTime++ == 0,"MIBenumArrayInitializer should be instantiated only one time."); 303 // analyzing character_sets_by_id 304 uint32 max_MIBenum = 0; 305 for (uint32 index = 0 ; index < character_sets_by_id_count ; index++ ) { 306 if (max_MIBenum < character_sets_by_id[index]->GetMIBenum()) { 307 max_MIBenum = character_sets_by_id[index]->GetMIBenum(); 308 } 309 } 310 // initializing extern variables 311 character_sets_by_MIBenum = new (const BCharacterSet*)[max_MIBenum+2]; 312 maximum_valid_MIBenum = max_MIBenum; 313 // initializing MIBenum array 314 memset(character_sets_by_MIBenum,0,sizeof(BCharacterSet*)*(max_MIBenum+2)); 315 for (uint32 index2 = 0 ; index2 < character_sets_by_id_count ; index2++ ) { 316 const BCharacterSet * charset = character_sets_by_id[index2]; 317 character_sets_by_MIBenum[charset->GetMIBenum()] = charset; 318 } 319 } 320 ~MIBenumArrayInitializer() 321 { 322 delete [] character_sets_by_MIBenum; 323 } 324 } runTheInitializer; 325 326 } 327