1 /* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2006 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify it 6 under the terms of the GNU Library General Public License as published 7 by the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Library General Public License for more details. 14 15 You should have received a copy of the GNU Library General Public 16 License along with this program; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 18 USA. */ 19 20 /* Written by Bruno Haible <bruno@clisp.org>. */ 21 22 #include "config.h" 23 24 /* Specification. */ 25 #include "localcharset.h" 26 27 #include <stddef.h> 28 #include <stdio.h> 29 #include <string.h> 30 #include <stdlib.h> 31 32 #if defined _WIN32 || defined __WIN32__ 33 # define WIN32_NATIVE 34 #endif 35 36 #if defined __EMX__ 37 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 38 # define OS2 39 #endif 40 41 #if !defined WIN32_NATIVE 42 # if HAVE_LANGINFO_CODESET 43 # include <langinfo.h> 44 # else 45 # if 0 /* see comment below */ 46 # include <locale.h> 47 # endif 48 # endif 49 # ifdef __CYGWIN__ 50 # define WIN32_LEAN_AND_MEAN 51 # include <windows.h> 52 # endif 53 #elif defined WIN32_NATIVE 54 # define WIN32_LEAN_AND_MEAN 55 # include <windows.h> 56 #endif 57 #if defined OS2 58 # define INCL_DOS 59 # include <os2.h> 60 #endif 61 62 #if ENABLE_RELOCATABLE 63 # include "relocatable.h" 64 #else 65 # define relocate(pathname) (pathname) 66 #endif 67 68 /* Get LIBDIR. */ 69 #ifndef LIBDIR 70 # include "configmake.h" 71 #endif 72 73 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 74 /* Win32, Cygwin, OS/2, DOS */ 75 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 76 #endif 77 78 #ifndef DIRECTORY_SEPARATOR 79 # define DIRECTORY_SEPARATOR '/' 80 #endif 81 82 #ifndef ISSLASH 83 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 84 #endif 85 86 #if HAVE_DECL_GETC_UNLOCKED 87 # undef getc 88 # define getc getc_unlocked 89 #endif 90 91 /* The following static variable is declared 'volatile' to avoid a 92 possible multithread problem in the function get_charset_aliases. If we 93 are running in a threaded environment, and if two threads initialize 94 'charset_aliases' simultaneously, both will produce the same value, 95 and everything will be ok if the two assignments to 'charset_aliases' 96 are atomic. But I don't know what will happen if the two assignments mix. */ 97 #if __STDC__ != 1 98 # define volatile /* empty */ 99 #endif 100 /* Pointer to the contents of the charset.alias file, if it has already been 101 read, else NULL. Its format is: 102 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 103 static const char * volatile charset_aliases; 104 105 /* Return a pointer to the contents of the charset.alias file. */ 106 static const char * 107 get_charset_aliases (void) 108 { 109 const char *cp; 110 111 cp = charset_aliases; 112 if (cp == NULL) 113 { 114 #if !(defined VMS || defined WIN32_NATIVE || defined __CYGWIN__) 115 FILE *fp; 116 const char *dir; 117 const char *base = "charset.alias"; 118 char *file_name; 119 120 /* Make it possible to override the charset.alias location. This is 121 necessary for running the testsuite before "make install". */ 122 dir = getenv ("CHARSETALIASDIR"); 123 if (dir == NULL || dir[0] == '\0') 124 dir = relocate (LIBDIR); 125 126 /* Concatenate dir and base into freshly allocated file_name. */ 127 { 128 size_t dir_len = strlen (dir); 129 size_t base_len = strlen (base); 130 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 131 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 132 if (file_name != NULL) 133 { 134 memcpy (file_name, dir, dir_len); 135 if (add_slash) 136 file_name[dir_len] = DIRECTORY_SEPARATOR; 137 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 138 } 139 } 140 141 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 142 /* Out of memory or file not found, treat it as empty. */ 143 cp = ""; 144 else 145 { 146 /* Parse the file's contents. */ 147 char *res_ptr = NULL; 148 size_t res_size = 0; 149 150 for (;;) 151 { 152 int c; 153 char buf1[50+1]; 154 char buf2[50+1]; 155 size_t l1, l2; 156 char *old_res_ptr; 157 158 c = getc (fp); 159 if (c == EOF) 160 break; 161 if (c == '\n' || c == ' ' || c == '\t') 162 continue; 163 if (c == '#') 164 { 165 /* Skip comment, to end of line. */ 166 do 167 c = getc (fp); 168 while (!(c == EOF || c == '\n')); 169 if (c == EOF) 170 break; 171 continue; 172 } 173 ungetc (c, fp); 174 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 175 break; 176 l1 = strlen (buf1); 177 l2 = strlen (buf2); 178 old_res_ptr = res_ptr; 179 if (res_size == 0) 180 { 181 res_size = l1 + 1 + l2 + 1; 182 res_ptr = (char *) malloc (res_size + 1); 183 } 184 else 185 { 186 res_size += l1 + 1 + l2 + 1; 187 res_ptr = (char *) realloc (res_ptr, res_size + 1); 188 } 189 if (res_ptr == NULL) 190 { 191 /* Out of memory. */ 192 res_size = 0; 193 if (old_res_ptr != NULL) 194 free (old_res_ptr); 195 break; 196 } 197 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 198 strcpy (res_ptr + res_size - (l2 + 1), buf2); 199 } 200 fclose (fp); 201 if (res_size == 0) 202 cp = ""; 203 else 204 { 205 *(res_ptr + res_size) = '\0'; 206 cp = res_ptr; 207 } 208 } 209 210 if (file_name != NULL) 211 free (file_name); 212 213 #else 214 215 # if defined VMS 216 /* To avoid the troubles of an extra file charset.alias_vms in the 217 sources of many GNU packages, simply inline the aliases here. */ 218 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 219 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 220 section 10.7 "Handling Different Character Sets". */ 221 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 222 "ISO8859-2" "\0" "ISO-8859-2" "\0" 223 "ISO8859-5" "\0" "ISO-8859-5" "\0" 224 "ISO8859-7" "\0" "ISO-8859-7" "\0" 225 "ISO8859-8" "\0" "ISO-8859-8" "\0" 226 "ISO8859-9" "\0" "ISO-8859-9" "\0" 227 /* Japanese */ 228 "eucJP" "\0" "EUC-JP" "\0" 229 "SJIS" "\0" "SHIFT_JIS" "\0" 230 "DECKANJI" "\0" "DEC-KANJI" "\0" 231 "SDECKANJI" "\0" "EUC-JP" "\0" 232 /* Chinese */ 233 "eucTW" "\0" "EUC-TW" "\0" 234 "DECHANYU" "\0" "DEC-HANYU" "\0" 235 "DECHANZI" "\0" "GB2312" "\0" 236 /* Korean */ 237 "DECKOREAN" "\0" "EUC-KR" "\0"; 238 # endif 239 240 # if defined WIN32_NATIVE || defined __CYGWIN__ 241 /* To avoid the troubles of installing a separate file in the same 242 directory as the DLL and of retrieving the DLL's directory at 243 runtime, simply inline the aliases here. */ 244 245 cp = "CP936" "\0" "GBK" "\0" 246 "CP1361" "\0" "JOHAB" "\0" 247 "CP20127" "\0" "ASCII" "\0" 248 "CP20866" "\0" "KOI8-R" "\0" 249 "CP20936" "\0" "GB2312" "\0" 250 "CP21866" "\0" "KOI8-RU" "\0" 251 "CP28591" "\0" "ISO-8859-1" "\0" 252 "CP28592" "\0" "ISO-8859-2" "\0" 253 "CP28593" "\0" "ISO-8859-3" "\0" 254 "CP28594" "\0" "ISO-8859-4" "\0" 255 "CP28595" "\0" "ISO-8859-5" "\0" 256 "CP28596" "\0" "ISO-8859-6" "\0" 257 "CP28597" "\0" "ISO-8859-7" "\0" 258 "CP28598" "\0" "ISO-8859-8" "\0" 259 "CP28599" "\0" "ISO-8859-9" "\0" 260 "CP28605" "\0" "ISO-8859-15" "\0" 261 "CP38598" "\0" "ISO-8859-8" "\0" 262 "CP51932" "\0" "EUC-JP" "\0" 263 "CP51936" "\0" "GB2312" "\0" 264 "CP51949" "\0" "EUC-KR" "\0" 265 "CP51950" "\0" "EUC-TW" "\0" 266 "CP54936" "\0" "GB18030" "\0" 267 "CP65001" "\0" "UTF-8" "\0"; 268 # endif 269 #endif 270 271 charset_aliases = cp; 272 } 273 274 return cp; 275 } 276 277 /* Determine the current locale's character encoding, and canonicalize it 278 into one of the canonical names listed in config.charset. 279 The result must not be freed; it is statically allocated. 280 If the canonical name cannot be determined, the result is a non-canonical 281 name. */ 282 283 #ifdef STATIC 284 STATIC 285 #endif 286 const char * 287 locale_charset (void) 288 { 289 const char *codeset; 290 const char *aliases; 291 292 #if !(defined WIN32_NATIVE || defined OS2) 293 294 # if HAVE_LANGINFO_CODESET 295 296 /* Most systems support nl_langinfo (CODESET) nowadays. */ 297 codeset = nl_langinfo (CODESET); 298 299 # ifdef __CYGWIN__ 300 /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always 301 returns "US-ASCII". As long as this is not fixed, return the suffix 302 of the locale name from the environment variables (if present) or 303 the codepage as a number. */ 304 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 305 { 306 const char *locale; 307 static char buf[2 + 10 + 1]; 308 309 locale = getenv ("LC_ALL"); 310 if (locale == NULL || locale[0] == '\0') 311 { 312 locale = getenv ("LC_CTYPE"); 313 if (locale == NULL || locale[0] == '\0') 314 locale = getenv ("LANG"); 315 } 316 if (locale != NULL && locale[0] != '\0') 317 { 318 /* If the locale name contains an encoding after the dot, return 319 it. */ 320 const char *dot = strchr (locale, '.'); 321 322 if (dot != NULL) 323 { 324 const char *modifier; 325 326 dot++; 327 /* Look for the possible @... trailer and remove it, if any. */ 328 modifier = strchr (dot, '@'); 329 if (modifier == NULL) 330 return dot; 331 if (modifier - dot < sizeof (buf)) 332 { 333 memcpy (buf, dot, modifier - dot); 334 buf [modifier - dot] = '\0'; 335 return buf; 336 } 337 } 338 } 339 340 /* Woe32 has a function returning the locale's codepage as a number. */ 341 sprintf (buf, "CP%u", GetACP ()); 342 codeset = buf; 343 } 344 # endif 345 346 # else 347 348 /* On old systems which lack it, use setlocale or getenv. */ 349 const char *locale = NULL; 350 351 /* But most old systems don't have a complete set of locales. Some 352 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 353 use setlocale here; it would return "C" when it doesn't support the 354 locale name the user has set. */ 355 # if 0 356 locale = setlocale (LC_CTYPE, NULL); 357 # endif 358 if (locale == NULL || locale[0] == '\0') 359 { 360 locale = getenv ("LC_ALL"); 361 if (locale == NULL || locale[0] == '\0') 362 { 363 locale = getenv ("LC_CTYPE"); 364 if (locale == NULL || locale[0] == '\0') 365 locale = getenv ("LANG"); 366 } 367 } 368 369 /* On some old systems, one used to set locale = "iso8859_1". On others, 370 you set it to "language_COUNTRY.charset". In any case, we resolve it 371 through the charset.alias file. */ 372 codeset = locale; 373 374 # endif 375 376 #elif defined WIN32_NATIVE 377 378 static char buf[2 + 10 + 1]; 379 380 /* Woe32 has a function returning the locale's codepage as a number. */ 381 sprintf (buf, "CP%u", GetACP ()); 382 codeset = buf; 383 384 #elif defined OS2 385 386 const char *locale; 387 static char buf[2 + 10 + 1]; 388 ULONG cp[3]; 389 ULONG cplen; 390 391 /* Allow user to override the codeset, as set in the operating system, 392 with standard language environment variables. */ 393 locale = getenv ("LC_ALL"); 394 if (locale == NULL || locale[0] == '\0') 395 { 396 locale = getenv ("LC_CTYPE"); 397 if (locale == NULL || locale[0] == '\0') 398 locale = getenv ("LANG"); 399 } 400 if (locale != NULL && locale[0] != '\0') 401 { 402 /* If the locale name contains an encoding after the dot, return it. */ 403 const char *dot = strchr (locale, '.'); 404 405 if (dot != NULL) 406 { 407 const char *modifier; 408 409 dot++; 410 /* Look for the possible @... trailer and remove it, if any. */ 411 modifier = strchr (dot, '@'); 412 if (modifier == NULL) 413 return dot; 414 if (modifier - dot < sizeof (buf)) 415 { 416 memcpy (buf, dot, modifier - dot); 417 buf [modifier - dot] = '\0'; 418 return buf; 419 } 420 } 421 422 /* Resolve through the charset.alias file. */ 423 codeset = locale; 424 } 425 else 426 { 427 /* OS/2 has a function returning the locale's codepage as a number. */ 428 if (DosQueryCp (sizeof (cp), cp, &cplen)) 429 codeset = ""; 430 else 431 { 432 sprintf (buf, "CP%u", cp[0]); 433 codeset = buf; 434 } 435 } 436 437 #endif 438 439 if (codeset == NULL) 440 /* The canonical name cannot be determined. */ 441 codeset = ""; 442 443 /* Resolve alias. */ 444 for (aliases = get_charset_aliases (); 445 *aliases != '\0'; 446 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 447 if (strcmp (codeset, aliases) == 0 448 || (aliases[0] == '*' && aliases[1] == '\0')) 449 { 450 codeset = aliases + strlen (aliases) + 1; 451 break; 452 } 453 454 /* Don't return an empty string. GNU libc and GNU libiconv interpret 455 the empty string as denoting "the locale's character encoding", 456 thus GNU libiconv would call this function a second time. */ 457 if (codeset[0] == '\0') 458 codeset = "ASCII"; 459 460 return codeset; 461 } 462