1 #include <string.h> 2 #include <Catalog.h> 3 #include <Locale.h> 4 #include <CharacterSet.h> 5 #include <Debug.h> 6 #include "character_sets.h" 7 8 #undef B_TRANSLATION_CONTEXT 9 #define B_TRANSLATION_CONTEXT "textencodings" 10 11 namespace BPrivate { 12 13 /** 14 * These variables are used in defining the character_sets_by_id array below. 15 * @see http://www.iana.org/assignments/character-sets 16 * @see http://java.sun.com/j2se/1.5.0/docs/guide/intl/encoding.doc.html 17 * @see http://www.openi18n.org/subgroups/sa/locnameguide/final/CodesetAliasTable.html 18 **/ 19 20 static const char * unicodeAliases[] = { 21 // IANA aliases 22 // java aliases 23 "UTF8", "unicode-1-1-utf-8", 24 NULL 25 }; 26 static const BCharacterSet unicode(0,106, B_TRANSLATE("Unicode"), 27 "UTF-8", "UTF-8",unicodeAliases); 28 29 static const char * isoLatin1aliases[] = { 30 // IANA aliases 31 "iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1", "11", "IBM819", "CP819", "csISOLatin1", 32 // java aliases 33 "819", "IBM-819", "ISO8859_1", "8859_1", "ISO8859-1", 34 NULL 35 }; 36 static const BCharacterSet isoLatin1(1,4, B_TRANSLATE("ISO West European"), 37 "ISO_8859-1:1987","ISO-8859-1",isoLatin1aliases); 38 39 static const char * isoLatin2aliases[] = { 40 // IANA aliases 41 "iso-ir-101", "ISO_8859-2", "ISO-8859-2", "latin2", "12", "csISOLatin2", 42 // java aliases 43 "iso8859_2", "8859_2", "ISO8859-2", "ibm912", "ibm-912", "cp912", "912", 44 NULL 45 }; 46 static const BCharacterSet isoLatin2(2,5, B_TRANSLATE("ISO East European"), 47 "ISO_8859-2:1987","ISO-8859-2",isoLatin2aliases); 48 49 static const char * isoLatin3aliases[] = { 50 // IANA aliases 51 "iso-ir-109", "ISO_8859-3", "ISO-8859-3", "latin3", "13", "csISOLatin3", 52 // java aliases 53 "iso8859_3", "8859_3", "iso8859-3", "ibm913", "ibm-913", "cp913", "913", 54 NULL 55 }; 56 static const BCharacterSet isoLatin3(3,6, B_TRANSLATE("ISO South European"), 57 "ISO_8859-3:1988","ISO-8859-3",isoLatin3aliases); 58 59 static const char * isoLatin4aliases[] = { 60 // IANA aliases 61 "iso-ir-110", "ISO_8859-4", "ISO-8859-4", "latin4", "14", "csISOLatin4", 62 // java aliases 63 "iso8859_4", "iso8859-4", "8859_4", "ibm914", "ibm-914", "cp914", "914", 64 NULL 65 }; 66 static const BCharacterSet isoLatin4(4,7, B_TRANSLATE("ISO North European"), 67 "ISO_8859-4:1988","ISO-8859-4",isoLatin4aliases); 68 69 static const char * isoLatin5aliases[] = { 70 // IANA aliases 71 "iso-ir-144", "ISO_8859-5", "ISO-8859-5", "cyrillic", "csISOLatinCyrillic", 72 // java aliases 73 "iso8859_5", "8859_5", "ISO8859-5", "ibm915", "ibm-915", "cp915", "915", 74 NULL 75 }; 76 static const BCharacterSet isoLatin5(5,8, B_TRANSLATE("ISO Cyrillic"), 77 "ISO_8859-5:1988","ISO-8859-5",isoLatin5aliases); 78 79 static const char * isoLatin6aliases[] = { 80 // IANA aliases 81 "iso-ir-127", "ISO_8859-6", "ISO-8859-6", "ECMA-114", "ASMO-708", "arabic", "csISOLatinArabic", 82 // java aliases 83 "iso8859_6", "8859_6", "ISO8859-6", "ibm1089", "ibm-1089", "cp1089", "1089", 84 NULL 85 }; 86 static const BCharacterSet isoLatin6(6,9, B_TRANSLATE("ISO Arabic"), 87 "ISO_8859-6:1987","ISO-8859-6",isoLatin6aliases); 88 89 static const char * isoLatin7aliases[] = { 90 // IANA aliases 91 "iso-ir-126", "ISO_8859-7", "ISO-8859-7", "ELOT_928", "ECMA-118", "greek", "greek8", "csISOLatinGreek", 92 // java aliases 93 "iso8859_7", "8859_7", "iso8859-7", "sun_eu_greek", "ibm813", "ibm-813", "813", "cp813", 94 NULL 95 }; 96 static const BCharacterSet isoLatin7(7,10, B_TRANSLATE("ISO Greek"), 97 "ISO_8859-7:1987","ISO-8859-7",isoLatin7aliases); 98 99 static const char * isoLatin8aliases[] = { 100 // IANA aliases 101 "iso-ir-138", "ISO_8859-8", "ISO-8859-8", "hebrew", "csISOLatinHebrew", 102 // java aliases 103 "iso8859_8", "8859_8", "ISO8859-8", "cp916", "916", "ibm916", "ibm-916", 104 NULL 105 }; 106 static const BCharacterSet isoLatin8(8,11, B_TRANSLATE("ISO Hebrew"), 107 "ISO_8859-8:1988","ISO-8859-8",isoLatin8aliases); 108 109 static const char * isoLatin9aliases[] = { 110 // IANA aliases 111 "iso-ir-148", "ISO_8859-9", "ISO-8859-9", "latin5", "15", "csISOLatin5", 112 // java aliases 113 "iso8859_9", "8859_9", "ibm920", "ibm-920", "920", "cp920", 114 NULL 115 }; 116 const BCharacterSet isoLatin9(9,12, B_TRANSLATE("ISO Turkish"), 117 "ISO_8859-9:1989","ISO-8859-9",isoLatin9aliases); 118 119 static const char * isoLatin10aliases[] = { 120 // IANA aliases 121 "iso-ir-157", "16", "ISO_8859-10:1992", "csISOLatin6", "latin6", 122 // java aliases 123 NULL 124 }; 125 static const BCharacterSet isoLatin10(10,13, B_TRANSLATE("ISO Nordic"), 126 "ISO-8859-10","ISO-8859-10",isoLatin10aliases); 127 128 static const char * macintoshAliases[] = { 129 // IANA aliases 130 "mac", "csMacintosh", 131 // java aliases 132 "MacRoman", 133 // mail kit aliases 134 "x-mac-roman", 135 NULL 136 }; 137 static const BCharacterSet macintosh(11,2027, B_TRANSLATE("Macintosh Roman"), 138 "macintosh",NULL,macintoshAliases); 139 140 static const char * shiftJISaliases[] = { 141 // IANA aliases 142 "MS_Kanji", "csShiftJIS", 143 // java aliases 144 "sjis", "shift_jis", "shift-jis", "x-sjis", 145 // mail kit aliases 146 "shift_jisx0213", 147 NULL 148 }; 149 static const BCharacterSet shiftJIS(12,17, B_TRANSLATE("Japanese Shift JIS"), 150 "Shift_JIS","Shift_JIS",shiftJISaliases); 151 152 static const char * EUCPackedJapaneseAliases[] = { 153 // IANA aliases 154 "EUC-JP", "csEUCPkdFmtJapanese", 155 // java aliases 156 "eucjis", "eucjp", "x-euc-jp", "x-eucjp", 157 // mail kit aliases 158 "euc-jisx0213", 159 NULL 160 }; 161 static const BCharacterSet packedJapanese(13,18, B_TRANSLATE("Japanese EUC"), 162 "Extended_UNIX_Code_Packed_Format_for_Japanese","EUC-JP", 163 EUCPackedJapaneseAliases); 164 165 static const char * iso2022jpAliases[] = { 166 // IANA aliases 167 "csISO2022JP", 168 // java aliases 169 "iso2022jp", "jis", "jis_encoding", "csjisencoding", 170 NULL 171 }; 172 static const BCharacterSet iso2022jp(14,39, B_TRANSLATE("Japanese JIS"), 173 "ISO-2022-JP","ISO-2022-JP",iso2022jpAliases); 174 175 static const char * windows1252aliases[] = { 176 // IANA aliases 177 // java aliases 178 "cp1252", "cp5348", 179 NULL 180 }; 181 static const BCharacterSet windows1252(15,2252, B_TRANSLATE("Windows Latin-1 " 182 "(CP 1252)"),"windows-1252",NULL,windows1252aliases); 183 184 static const char * unicode2aliases[] = { 185 // IANA aliases 186 "csUnicode", 187 // java aliases 188 "UTF-16BE", "UTF_16BE", "X-UTF-16BE", "UnicodeBigUnmarked", 189 NULL 190 }; 191 static const BCharacterSet unicode2(16,1000, B_TRANSLATE("Unicode (UCS-2)"), 192 "ISO-10646-UCS-2",NULL,unicode2aliases); 193 194 static const char * KOI8Raliases[] = { 195 // IANA aliases 196 "csKOI8R", 197 // java aliases 198 "koi8_r", "koi8", "cskoi8r", 199 NULL 200 }; 201 static const BCharacterSet KOI8R(17,2084, B_TRANSLATE("KOI8-R Cyrillic"), 202 "KOI8-R","KOI8-R",KOI8Raliases); 203 204 static const char * windows1251aliases[] = { 205 // IANA aliases 206 // java aliases 207 "cp1251", "cp5347", "ansi-1251", 208 NULL 209 }; 210 static const BCharacterSet windows1251(18,2251, B_TRANSLATE("Windows Cyrillic " 211 "(CP 1251)"), "windows-1251",NULL,windows1251aliases); 212 213 static const char * IBM866aliases[] = { 214 // IANA aliases 215 "cp866", "866", "csIBM866", 216 // java aliases 217 "ibm-866", 218 // mail kit aliases 219 "dos-866", 220 NULL 221 }; 222 static const BCharacterSet IBM866(19,2086, B_TRANSLATE("DOS Cyrillic"), 223 "IBM866","IBM866",IBM866aliases); 224 225 static const char * IBM437aliases[] = { 226 // IANA aliases 227 "cp437", "437", "csPC8CodePage437", 228 // java aliases 229 "ibm-437", "windows-437", 230 // mail kit aliases 231 "dos-437", 232 NULL 233 }; 234 static const BCharacterSet IBM437(20,2011, B_TRANSLATE("DOS Latin-US"), 235 "IBM437","IBM437",IBM437aliases); 236 237 static const char * eucKRaliases[] = { 238 // IANA aliases 239 "csEUCKR", 240 // java aliases 241 "ksc5601", "euckr", "ks_c_5601-1987", "ksc5601-1987", 242 "ksc5601_1987", "ksc_5601", "5601", 243 NULL 244 }; 245 static const BCharacterSet eucKR(21,38, B_TRANSLATE("EUC Korean"), 246 "EUC-KR","EUC-KR",eucKRaliases); 247 248 static const char * iso13aliases[] = { 249 // IANA aliases 250 // java aliases 251 "iso8859_13", "8859_13", "iso_8859-13", "ISO8859-13", 252 NULL 253 }; 254 static const BCharacterSet iso13(22,109, B_TRANSLATE("ISO Baltic"), 255 "ISO-8859-13","ISO-8859-13",iso13aliases); 256 257 static const char * iso14aliases[] = { 258 // IANA aliases 259 "iso-ir-199", "ISO_8859-14:1998", "ISO_8859-14", "latin8", "iso-celtic", "l8", 260 NULL 261 }; 262 static const BCharacterSet iso14(23,110, B_TRANSLATE("ISO Celtic"), 263 "ISO-8859-14","ISO-8859-14",iso14aliases); 264 265 static const char * iso15aliases[] = { 266 // IANA aliases 267 "ISO_8859-15", "Latin-9", 268 // java aliases 269 "8859_15", "ISO8859_15", "ISO8859-15", "IBM923", "IBM-923", "cp923", "923", 270 "LATIN0", "LATIN9", "L9", "csISOlatin0", "csISOlatin9", "ISO8859_15_FDIS", 271 NULL 272 }; 273 static const BCharacterSet iso15(24,111, B_TRANSLATE("ISO Latin 9"), 274 "ISO-8859-15","ISO-8859-15",iso15aliases); 275 276 // chinese character set testing 277 278 static const char * big5aliases[] = { 279 // IANA aliases 280 "csBig5", 281 NULL 282 }; 283 static const BCharacterSet big5(25,2026, B_TRANSLATE("Chinese Big5"), 284 "Big5","Big5",big5aliases); 285 286 static const char * gb18030aliases[] = { 287 // java aliases 288 "gb18030-2000", 289 // mail kit aliases 290 "gb2312", 291 "gbk", 292 NULL 293 }; 294 static const BCharacterSet gb18030(26,114, B_TRANSLATE("Chinese GB18030"), 295 "GB18030",NULL,gb18030aliases); 296 297 static const char* kUTF16Aliases[] = { 298 // IANA aliases 299 "UTF-16", 300 // java aliases 301 "UTF-16BE", "X-UTF-16BE", "UnicodeBigUnmarked", 302 NULL 303 }; 304 static const BCharacterSet kUTF16(27, 1000, B_TRANSLATE("Unicode"), "UTF-16", "UTF-16", 305 kUTF16Aliases); 306 307 static const char* kWindows1250Aliases[] = { 308 // IANA aliases 309 "cswindows1250", 310 // java aliases 311 "cp1250", 312 "ms-ee", 313 NULL 314 }; 315 static const BCharacterSet kWindows1250(28, 2250, B_TRANSLATE("Windows Central " 316 "European (CP 1250)"), "windows-1250", NULL, kWindows1250Aliases); 317 318 /** 319 * The following initializes the global character set array. 320 * It is organized by id for efficient retrieval using predefined constants in UTF8.h and Font.h. 321 * Character sets are stored contiguously and may be efficiently iterated over. 322 * To add a new character set, define the character set above -- remember to increment the id -- 323 * and then add &<charSetName> to the _end_ of the following list. That's all. 324 **/ 325 326 const BCharacterSet * character_sets_by_id[] = { 327 &unicode, 328 &isoLatin1, &isoLatin2, &isoLatin3, &isoLatin4, &isoLatin5, 329 &isoLatin6, &isoLatin7, &isoLatin8, &isoLatin9, &isoLatin10, 330 &macintosh, 331 // R5 BFont encodings end here 332 &shiftJIS, &packedJapanese, &iso2022jp, 333 &windows1252, &unicode2, &KOI8R, &windows1251, 334 &IBM866, &IBM437, &eucKR, &iso13, &iso14, &iso15, 335 // R5 convert_to/from_utf8 encodings end here 336 &big5,&gb18030, 337 &kUTF16, 338 &kWindows1250, 339 }; 340 const uint32 character_sets_by_id_count = sizeof(character_sets_by_id)/sizeof(const BCharacterSet*); 341 342 /** 343 * The following code initializes the global MIBenum array. 344 * This sparsely populated array exists as an efficient way to access character sets by MIBenum. 345 * The MIBenum array is automatically allocated, and initialized by the following class. 346 * The following class should only be instantiated once, this is assured by using an assertion. 347 * No changes are required to the following code to add a new character set. 348 **/ 349 350 const BCharacterSet ** character_sets_by_MIBenum; 351 uint32 maximum_valid_MIBenum; 352 353 static class MIBenumArrayInitializer { 354 public: MIBenumArrayInitializer()355 MIBenumArrayInitializer() { 356 DEBUG_ONLY(static int onlyOneTime = 0;) 357 ASSERT_WITH_MESSAGE(onlyOneTime++ == 0,"MIBenumArrayInitializer should be instantiated only one time."); 358 // analyzing character_sets_by_id 359 uint32 max_MIBenum = 0; 360 for (uint32 index = 0 ; index < character_sets_by_id_count ; index++ ) { 361 if (max_MIBenum < character_sets_by_id[index]->GetMIBenum()) { 362 max_MIBenum = character_sets_by_id[index]->GetMIBenum(); 363 } 364 } 365 // initializing extern variables 366 character_sets_by_MIBenum = new const BCharacterSet*[max_MIBenum+2]; 367 maximum_valid_MIBenum = max_MIBenum; 368 // initializing MIBenum array 369 memset(character_sets_by_MIBenum,0,sizeof(BCharacterSet*)*(max_MIBenum+2)); 370 for (uint32 index2 = 0 ; index2 < character_sets_by_id_count ; index2++ ) { 371 const BCharacterSet * charset = character_sets_by_id[index2]; 372 character_sets_by_MIBenum[charset->GetMIBenum()] = charset; 373 } 374 } ~MIBenumArrayInitializer()375 ~MIBenumArrayInitializer() 376 { 377 delete [] character_sets_by_MIBenum; 378 } 379 } runTheInitializer; 380 381 } 382