1 /** 2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project. 3 * 4 * Copyright (c) 2000-2004 Anton Altaparmakov 5 * Copyright (c) 2002-2009 Szabolcs Szakacsits 6 * Copyright (c) 2008-2009 Jean-Pierre Andre 7 * Copyright (c) 2008 Bernhard Kaindl 8 * 9 * This program/include file is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License as published 11 * by the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * This program/include file is distributed in the hope that it will be 15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty 16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 * 19 * You should have received a copy of the GNU General Public License 20 * along with this program (in the main directory of the NTFS-3G 21 * distribution in the file COPYING); if not, write to the Free Software 22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 */ 24 25 #ifdef HAVE_CONFIG_H 26 #include "config.h" 27 #endif 28 29 #ifdef HAVE_STDIO_H 30 #include <stdio.h> 31 #endif 32 #ifdef HAVE_STDLIB_H 33 #include <stdlib.h> 34 #endif 35 #ifdef HAVE_WCHAR_H 36 #include <wchar.h> 37 #endif 38 #ifdef HAVE_STRING_H 39 #include <string.h> 40 #endif 41 #ifdef HAVE_ERRNO_H 42 #include <errno.h> 43 #endif 44 #ifdef HAVE_LOCALE_H 45 #include <locale.h> 46 #endif 47 48 #if defined(__APPLE__) || defined(__DARWIN__) 49 #ifdef ENABLE_NFCONV 50 #include <CoreFoundation/CoreFoundation.h> 51 #endif /* ENABLE_NFCONV */ 52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 53 54 #include "compat.h" 55 #include "attrib.h" 56 #include "types.h" 57 #include "unistr.h" 58 #include "debug.h" 59 #include "logging.h" 60 #include "misc.h" 61 62 #define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */ 63 64 // no wchar support in the Haiku kernel 65 #if defined(__HAIKU__) && defined(_KERNEL_MODE) 66 # include <KernelExport.h> 67 # define mbstowcs(a, b, c) (panic("mbstowcs"), 0) 68 # define wctomb(a, b) (panic("wctomb"), 0) 69 # define mbtowc(a, b, c) (panic("mbtowc"), 0) 70 # define setlocale(a, b) (panic("setlocale"), 0) 71 #endif 72 73 /* 74 * IMPORTANT 75 * ========= 76 * 77 * All these routines assume that the Unicode characters are in little endian 78 * encoding inside the strings!!! 79 */ 80 81 static int use_utf8 = 1; /* use UTF-8 encoding for file names */ 82 83 #if defined(__APPLE__) || defined(__DARWIN__) 84 #ifdef ENABLE_NFCONV 85 /** 86 * This variable controls whether or not automatic normalization form conversion 87 * should be performed when translating NTFS unicode file names to UTF-8. 88 * Defaults to on, but can be controlled from the outside using the function 89 * int ntfs_macosx_normalize_filenames(int normalize); 90 */ 91 static int nfconvert_utf8 = 1; 92 #endif /* ENABLE_NFCONV */ 93 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 94 95 /* 96 * This is used by the name collation functions to quickly determine what 97 * characters are (in)valid. 98 */ 99 #if 0 100 static const u8 legal_ansi_char_array[0x40] = { 101 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 102 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 103 104 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 105 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 106 107 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, 108 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, 109 110 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 111 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, 112 }; 113 #endif 114 115 /** 116 * ntfs_names_are_equal - compare two Unicode names for equality 117 * @s1: name to compare to @s2 118 * @s1_len: length in Unicode characters of @s1 119 * @s2: name to compare to @s1 120 * @s2_len: length in Unicode characters of @s2 121 * @ic: ignore case bool 122 * @upcase: upcase table (only if @ic == IGNORE_CASE) 123 * @upcase_size: length in Unicode characters of @upcase (if present) 124 * 125 * Compare the names @s1 and @s2 and return TRUE (1) if the names are 126 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE, 127 * the @upcase table is used to perform a case insensitive comparison. 128 */ 129 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len, 130 const ntfschar *s2, size_t s2_len, 131 const IGNORE_CASE_BOOL ic, 132 const ntfschar *upcase, const u32 upcase_size) 133 { 134 if (s1_len != s2_len) 135 return FALSE; 136 if (!s1_len) 137 return TRUE; 138 if (ic == CASE_SENSITIVE) 139 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE; 140 return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE: 141 TRUE; 142 } 143 144 /* 145 * ntfs_names_full_collate() fully collate two Unicode names 146 * 147 * @name1: first Unicode name to compare 148 * @name1_len: length of first Unicode name to compare 149 * @name2: second Unicode name to compare 150 * @name2_len: length of second Unicode name to compare 151 * @ic: either CASE_SENSITIVE or IGNORE_CASE 152 * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) 153 * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) 154 * 155 * -1 if the first name collates before the second one, 156 * 0 if the names match, 157 * 1 if the second name collates before the first one, or 158 * 159 */ 160 int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len, 161 const ntfschar *name2, const u32 name2_len, 162 const IGNORE_CASE_BOOL ic, const ntfschar *upcase, 163 const u32 upcase_len) 164 { 165 u32 cnt; 166 u16 c1, c2; 167 u16 u1, u2; 168 169 #ifdef DEBUG 170 if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) { 171 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n"); 172 exit(1); 173 } 174 #endif 175 cnt = min(name1_len, name2_len); 176 if (cnt > 0) { 177 if (ic == CASE_SENSITIVE) { 178 do { 179 c1 = le16_to_cpu(*name1); 180 name1++; 181 c2 = le16_to_cpu(*name2); 182 name2++; 183 } while (--cnt && (c1 == c2)); 184 u1 = c1; 185 u2 = c2; 186 if (u1 < upcase_len) 187 u1 = le16_to_cpu(upcase[u1]); 188 if (u2 < upcase_len) 189 u2 = le16_to_cpu(upcase[u2]); 190 if ((u1 == u2) && cnt) 191 do { 192 u1 = le16_to_cpu(*name1); 193 name1++; 194 u2 = le16_to_cpu(*name2); 195 name2++; 196 if (u1 < upcase_len) 197 u1 = le16_to_cpu(upcase[u1]); 198 if (u2 < upcase_len) 199 u2 = le16_to_cpu(upcase[u2]); 200 } while ((u1 == u2) && --cnt); 201 if (u1 < u2) 202 return -1; 203 if (u1 > u2) 204 return 1; 205 if (name1_len < name2_len) 206 return -1; 207 if (name1_len > name2_len) 208 return 1; 209 if (c1 < c2) 210 return -1; 211 if (c1 > c2) 212 return 1; 213 } else { 214 do { 215 u1 = c1 = le16_to_cpu(*name1); 216 name1++; 217 u2 = c2 = le16_to_cpu(*name2); 218 name2++; 219 if (u1 < upcase_len) 220 u1 = le16_to_cpu(upcase[u1]); 221 if (u2 < upcase_len) 222 u2 = le16_to_cpu(upcase[u2]); 223 } while ((u1 == u2) && --cnt); 224 if (u1 < u2) 225 return -1; 226 if (u1 > u2) 227 return 1; 228 if (name1_len < name2_len) 229 return -1; 230 if (name1_len > name2_len) 231 return 1; 232 } 233 } else { 234 if (name1_len < name2_len) 235 return -1; 236 if (name1_len > name2_len) 237 return 1; 238 } 239 return 0; 240 } 241 242 /** 243 * ntfs_ucsncmp - compare two little endian Unicode strings 244 * @s1: first string 245 * @s2: second string 246 * @n: maximum unicode characters to compare 247 * 248 * Compare the first @n characters of the Unicode strings @s1 and @s2, 249 * The strings in little endian format and appropriate le16_to_cpu() 250 * conversion is performed on non-little endian machines. 251 * 252 * The function returns an integer less than, equal to, or greater than zero 253 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 254 * to be less than, to match, or be greater than @s2. 255 */ 256 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) 257 { 258 ntfschar c1, c2; 259 size_t i; 260 261 #ifdef DEBUG 262 if (!s1 || !s2) { 263 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n"); 264 exit(1); 265 } 266 #endif 267 for (i = 0; i < n; ++i) { 268 c1 = le16_to_cpu(s1[i]); 269 c2 = le16_to_cpu(s2[i]); 270 if (c1 < c2) 271 return -1; 272 if (c1 > c2) 273 return 1; 274 if (!c1) 275 break; 276 } 277 return 0; 278 } 279 280 /** 281 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case 282 * @s1: first string 283 * @s2: second string 284 * @n: maximum unicode characters to compare 285 * @upcase: upcase table 286 * @upcase_size: upcase table size in Unicode characters 287 * 288 * Compare the first @n characters of the Unicode strings @s1 and @s2, 289 * ignoring case. The strings in little endian format and appropriate 290 * le16_to_cpu() conversion is performed on non-little endian machines. 291 * 292 * Each character is uppercased using the @upcase table before the comparison. 293 * 294 * The function returns an integer less than, equal to, or greater than zero 295 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 296 * to be less than, to match, or be greater than @s2. 297 */ 298 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, 299 const ntfschar *upcase, const u32 upcase_size) 300 { 301 u16 c1, c2; 302 size_t i; 303 304 #ifdef DEBUG 305 if (!s1 || !s2 || !upcase) { 306 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n"); 307 exit(1); 308 } 309 #endif 310 for (i = 0; i < n; ++i) { 311 if ((c1 = le16_to_cpu(s1[i])) < upcase_size) 312 c1 = le16_to_cpu(upcase[c1]); 313 if ((c2 = le16_to_cpu(s2[i])) < upcase_size) 314 c2 = le16_to_cpu(upcase[c2]); 315 if (c1 < c2) 316 return -1; 317 if (c1 > c2) 318 return 1; 319 if (!c1) 320 break; 321 } 322 return 0; 323 } 324 325 /** 326 * ntfs_ucsnlen - determine the length of a little endian Unicode string 327 * @s: pointer to Unicode string 328 * @maxlen: maximum length of string @s 329 * 330 * Return the number of Unicode characters in the little endian Unicode 331 * string @s up to a maximum of maxlen Unicode characters, not including 332 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s 333 * and @s + @maxlen, @maxlen is returned. 334 * 335 * This function never looks beyond @s + @maxlen. 336 */ 337 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen) 338 { 339 u32 i; 340 341 for (i = 0; i < maxlen; i++) { 342 if (!le16_to_cpu(s[i])) 343 break; 344 } 345 return i; 346 } 347 348 /** 349 * ntfs_ucsndup - duplicate little endian Unicode string 350 * @s: pointer to Unicode string 351 * @maxlen: maximum length of string @s 352 * 353 * Return a pointer to a new little endian Unicode string which is a duplicate 354 * of the string s. Memory for the new string is obtained with ntfs_malloc(3), 355 * and can be freed with free(3). 356 * 357 * A maximum of @maxlen Unicode characters are copied and a terminating 358 * (ntfschar)'\0' little endian Unicode character is added. 359 * 360 * This function never looks beyond @s + @maxlen. 361 * 362 * Return a pointer to the new little endian Unicode string on success and NULL 363 * on failure with errno set to the error code. 364 */ 365 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen) 366 { 367 ntfschar *dst; 368 u32 len; 369 370 len = ntfs_ucsnlen(s, maxlen); 371 dst = ntfs_malloc((len + 1) * sizeof(ntfschar)); 372 if (dst) { 373 memcpy(dst, s, len * sizeof(ntfschar)); 374 dst[len] = cpu_to_le16(L'\0'); 375 } 376 return dst; 377 } 378 379 /** 380 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent 381 * @name: 382 * @name_len: 383 * @upcase: 384 * @upcase_len: 385 * 386 * Description... 387 * 388 * Returns: 389 */ 390 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase, 391 const u32 upcase_len) 392 { 393 u32 i; 394 u16 u; 395 396 for (i = 0; i < name_len; i++) 397 if ((u = le16_to_cpu(name[i])) < upcase_len) 398 name[i] = upcase[u]; 399 } 400 401 /** 402 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent 403 */ 404 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase, 405 const u32 locase_len) 406 { 407 u32 i; 408 u16 u; 409 410 if (locase) 411 for (i = 0; i < name_len; i++) 412 if ((u = le16_to_cpu(name[i])) < locase_len) 413 name[i] = locase[u]; 414 } 415 416 /** 417 * ntfs_file_value_upcase - Convert a filename to upper case 418 * @file_name_attr: 419 * @upcase: 420 * @upcase_len: 421 * 422 * Description... 423 * 424 * Returns: 425 */ 426 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr, 427 const ntfschar *upcase, const u32 upcase_len) 428 { 429 ntfs_name_upcase((ntfschar*)&file_name_attr->file_name, 430 file_name_attr->file_name_length, upcase, upcase_len); 431 } 432 433 /* 434 NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough 435 for now]) for path names, but the Unicode code points need to be 436 converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI, 437 glibc does this even without a locale in a hard-coded fashion as that 438 appears to be is easy because the low 7-bit ASCII range appears to be 439 available in all charsets but it does not convert anything if 440 there was some error with the locale setup or none set up like 441 when mount is called during early boot where he (by policy) do 442 not use locales (and may be not available if /usr is not yet mounted), 443 so this patch fixes the resulting issues for systems which use 444 UTF-8 and for others, specifying the locale in fstab brings them 445 the encoding which they want. 446 447 If no locale is defined or there was a problem with setting one 448 up and whenever nl_langinfo(CODESET) returns a sting starting with 449 "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix 450 the bug where NTFS-3G does not show any path names which include 451 international characters!!! (and also fails on creating them) as result. 452 453 Author: Bernhard Kaindl <bk@suse.de> 454 Jean-Pierre Andre made it compliant with RFC3629/RFC2781. 455 */ 456 457 /* 458 * Return the amount of 8-bit elements in UTF-8 needed (without the terminating 459 * null) to store a given UTF-16LE string. 460 * 461 * Return -1 with errno set if string has invalid byte sequence or too long. 462 */ 463 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len) 464 { 465 int i, ret = -1; 466 int count = 0; 467 BOOL surrog; 468 469 surrog = FALSE; 470 for (i = 0; i < ins_len && ins[i]; i++) { 471 unsigned short c = le16_to_cpu(ins[i]); 472 if (surrog) { 473 if ((c >= 0xdc00) && (c < 0xe000)) { 474 surrog = FALSE; 475 count += 4; 476 } else 477 goto fail; 478 } else 479 if (c < 0x80) 480 count++; 481 else if (c < 0x800) 482 count += 2; 483 else if (c < 0xd800) 484 count += 3; 485 else if (c < 0xdc00) 486 surrog = TRUE; 487 #if NOREVBOM 488 else if ((c >= 0xe000) && (c < 0xfffe)) 489 #else 490 else if (c >= 0xe000) 491 #endif 492 count += 3; 493 else 494 goto fail; 495 if (count > outs_len) { 496 errno = ENAMETOOLONG; 497 goto out; 498 } 499 } 500 if (surrog) 501 goto fail; 502 503 ret = count; 504 out: 505 return ret; 506 fail: 507 errno = EILSEQ; 508 goto out; 509 } 510 511 /* 512 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string 513 * @ins: input utf16 string buffer 514 * @ins_len: length of input string in utf16 characters 515 * @outs: on return contains the (allocated) output multibyte string 516 * @outs_len: length of output buffer in bytes 517 * 518 * Return -1 with errno set if string has invalid byte sequence or too long. 519 */ 520 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, 521 char **outs, int outs_len) 522 { 523 #if defined(__APPLE__) || defined(__DARWIN__) 524 #ifdef ENABLE_NFCONV 525 char *original_outs_value = *outs; 526 int original_outs_len = outs_len; 527 #endif /* ENABLE_NFCONV */ 528 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 529 530 char *t; 531 int i, size, ret = -1; 532 int halfpair; 533 534 halfpair = 0; 535 if (!*outs) 536 outs_len = PATH_MAX; 537 538 size = utf16_to_utf8_size(ins, ins_len, outs_len); 539 540 if (size < 0) 541 goto out; 542 543 if (!*outs) { 544 outs_len = size + 1; 545 *outs = ntfs_malloc(outs_len); 546 if (!*outs) 547 goto out; 548 } 549 550 t = *outs; 551 552 for (i = 0; i < ins_len && ins[i]; i++) { 553 unsigned short c = le16_to_cpu(ins[i]); 554 /* size not double-checked */ 555 if (halfpair) { 556 if ((c >= 0xdc00) && (c < 0xe000)) { 557 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7); 558 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63); 559 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4); 560 *t++ = 0x80 + (c & 63); 561 halfpair = 0; 562 } else 563 goto fail; 564 } else if (c < 0x80) { 565 *t++ = c; 566 } else { 567 if (c < 0x800) { 568 *t++ = (0xc0 | ((c >> 6) & 0x3f)); 569 *t++ = 0x80 | (c & 0x3f); 570 } else if (c < 0xd800) { 571 *t++ = 0xe0 | (c >> 12); 572 *t++ = 0x80 | ((c >> 6) & 0x3f); 573 *t++ = 0x80 | (c & 0x3f); 574 } else if (c < 0xdc00) 575 halfpair = c; 576 else if (c >= 0xe000) { 577 *t++ = 0xe0 | (c >> 12); 578 *t++ = 0x80 | ((c >> 6) & 0x3f); 579 *t++ = 0x80 | (c & 0x3f); 580 } else 581 goto fail; 582 } 583 } 584 *t = '\0'; 585 586 #if defined(__APPLE__) || defined(__DARWIN__) 587 #ifdef ENABLE_NFCONV 588 if(nfconvert_utf8 && (t - *outs) > 0) { 589 char *new_outs = NULL; 590 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form 591 if(new_outs_len >= 0 && new_outs != NULL) { 592 if(original_outs_value != *outs) { 593 // We have allocated outs ourselves. 594 free(*outs); 595 *outs = new_outs; 596 t = *outs + new_outs_len; 597 } 598 else { 599 // We need to copy new_outs into the fixed outs buffer. 600 memset(*outs, 0, original_outs_len); 601 strncpy(*outs, new_outs, original_outs_len-1); 602 t = *outs + original_outs_len; 603 free(new_outs); 604 } 605 } 606 else { 607 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs); 608 ntfs_log_error(" new_outs=0x%p\n", new_outs); 609 ntfs_log_error(" new_outs_len=%d\n", new_outs_len); 610 } 611 } 612 #endif /* ENABLE_NFCONV */ 613 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 614 615 ret = t - *outs; 616 out: 617 return ret; 618 fail: 619 errno = EILSEQ; 620 goto out; 621 } 622 623 /* 624 * Return the amount of 16-bit elements in UTF-16LE needed 625 * (without the terminating null) to store given UTF-8 string. 626 * 627 * Return -1 with errno set if it's longer than PATH_MAX or string is invalid. 628 * 629 * Note: This does not check whether the input sequence is a valid utf8 string, 630 * and should be used only in context where such check is made! 631 */ 632 static int utf8_to_utf16_size(const char *s) 633 { 634 int ret = -1; 635 unsigned int byte; 636 size_t count = 0; 637 638 while ((byte = *((const unsigned char *)s++))) { 639 if (++count >= PATH_MAX) 640 goto fail; 641 if (byte >= 0xc0) { 642 if (byte >= 0xF5) { 643 errno = EILSEQ; 644 goto out; 645 } 646 if (!*s) 647 break; 648 if (byte >= 0xC0) 649 s++; 650 if (!*s) 651 break; 652 if (byte >= 0xE0) 653 s++; 654 if (!*s) 655 break; 656 if (byte >= 0xF0) { 657 s++; 658 if (++count >= PATH_MAX) 659 goto fail; 660 } 661 } 662 } 663 ret = count; 664 out: 665 return ret; 666 fail: 667 errno = ENAMETOOLONG; 668 goto out; 669 } 670 /* 671 * This converts one UTF-8 sequence to cpu-endian Unicode value 672 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF 673 * 674 * Return the number of used utf8 bytes or -1 with errno set 675 * if sequence is invalid. 676 */ 677 static int utf8_to_unicode(u32 *wc, const char *s) 678 { 679 unsigned int byte = *((const unsigned char *)s); 680 681 /* single byte */ 682 if (byte == 0) { 683 *wc = (u32) 0; 684 return 0; 685 } else if (byte < 0x80) { 686 *wc = (u32) byte; 687 return 1; 688 /* double byte */ 689 } else if (byte < 0xc2) { 690 goto fail; 691 } else if (byte < 0xE0) { 692 if ((s[1] & 0xC0) == 0x80) { 693 *wc = ((u32)(byte & 0x1F) << 6) 694 | ((u32)(s[1] & 0x3F)); 695 return 2; 696 } else 697 goto fail; 698 /* three-byte */ 699 } else if (byte < 0xF0) { 700 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) { 701 *wc = ((u32)(byte & 0x0F) << 12) 702 | ((u32)(s[1] & 0x3F) << 6) 703 | ((u32)(s[2] & 0x3F)); 704 /* Check valid ranges */ 705 #if NOREVBOM 706 if (((*wc >= 0x800) && (*wc <= 0xD7FF)) 707 || ((*wc >= 0xe000) && (*wc <= 0xFFFD))) 708 return 3; 709 #else 710 if (((*wc >= 0x800) && (*wc <= 0xD7FF)) 711 || ((*wc >= 0xe000) && (*wc <= 0xFFFF))) 712 return 3; 713 #endif 714 } 715 goto fail; 716 /* four-byte */ 717 } else if (byte < 0xF5) { 718 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) 719 && ((s[3] & 0xC0) == 0x80)) { 720 *wc = ((u32)(byte & 0x07) << 18) 721 | ((u32)(s[1] & 0x3F) << 12) 722 | ((u32)(s[2] & 0x3F) << 6) 723 | ((u32)(s[3] & 0x3F)); 724 /* Check valid ranges */ 725 if ((*wc <= 0x10ffff) && (*wc >= 0x10000)) 726 return 4; 727 } 728 goto fail; 729 } 730 fail: 731 errno = EILSEQ; 732 return -1; 733 } 734 735 /** 736 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string 737 * @ins: input multibyte string buffer 738 * @outs: on return contains the (allocated) output utf16 string 739 * @outs_len: length of output buffer in utf16 characters 740 * 741 * Return -1 with errno set. 742 */ 743 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs) 744 { 745 #if defined(__APPLE__) || defined(__DARWIN__) 746 #ifdef ENABLE_NFCONV 747 char *new_ins = NULL; 748 if(nfconvert_utf8) { 749 int new_ins_len; 750 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form 751 if(new_ins_len >= 0) 752 ins = new_ins; 753 else 754 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins); 755 } 756 #endif /* ENABLE_NFCONV */ 757 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 758 const char *t = ins; 759 u32 wc; 760 BOOL allocated; 761 ntfschar *outpos; 762 int shorts, ret = -1; 763 764 shorts = utf8_to_utf16_size(ins); 765 if (shorts < 0) 766 goto fail; 767 768 allocated = FALSE; 769 if (!*outs) { 770 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar)); 771 if (!*outs) 772 goto fail; 773 allocated = TRUE; 774 } 775 776 outpos = *outs; 777 778 while(1) { 779 int m = utf8_to_unicode(&wc, t); 780 if (m <= 0) { 781 if (m < 0) { 782 /* do not leave space allocated if failed */ 783 if (allocated) { 784 free(*outs); 785 *outs = (ntfschar*)NULL; 786 } 787 goto fail; 788 } 789 *outpos++ = const_cpu_to_le16(0); 790 break; 791 } 792 if (wc < 0x10000) 793 *outpos++ = cpu_to_le16(wc); 794 else { 795 wc -= 0x10000; 796 *outpos++ = cpu_to_le16((wc >> 10) + 0xd800); 797 *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00); 798 } 799 t += m; 800 } 801 802 ret = --outpos - *outs; 803 fail: 804 #if defined(__APPLE__) || defined(__DARWIN__) 805 #ifdef ENABLE_NFCONV 806 if(new_ins != NULL) 807 free(new_ins); 808 #endif /* ENABLE_NFCONV */ 809 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 810 return ret; 811 } 812 813 /** 814 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string 815 * @ins: input Unicode string buffer 816 * @ins_len: length of input string in Unicode characters 817 * @outs: on return contains the (allocated) output multibyte string 818 * @outs_len: length of output buffer in bytes 819 * 820 * Convert the input little endian, 2-byte Unicode string @ins, of length 821 * @ins_len into the multibyte string format dictated by the current locale. 822 * 823 * If *@outs is NULL, the function allocates the string and the caller is 824 * responsible for calling free(*@outs); when finished with it. 825 * 826 * On success the function returns the number of bytes written to the output 827 * string *@outs (>= 0), not counting the terminating NULL byte. If the output 828 * string buffer was allocated, *@outs is set to it. 829 * 830 * On error, -1 is returned, and errno is set to the error code. The following 831 * error codes can be expected: 832 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 833 * EILSEQ The input string cannot be represented as a multibyte 834 * sequence according to the current locale. 835 * ENAMETOOLONG Destination buffer is too small for input string. 836 * ENOMEM Not enough memory to allocate destination buffer. 837 */ 838 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs, 839 int outs_len) 840 { 841 char *mbs; 842 int mbs_len; 843 #ifdef MB_CUR_MAX 844 wchar_t wc; 845 int i, o; 846 int cnt = 0; 847 #ifdef HAVE_MBSINIT 848 mbstate_t mbstate; 849 #endif 850 #endif /* MB_CUR_MAX */ 851 852 if (!ins || !outs) { 853 errno = EINVAL; 854 return -1; 855 } 856 mbs = *outs; 857 mbs_len = outs_len; 858 if (mbs && !mbs_len) { 859 errno = ENAMETOOLONG; 860 return -1; 861 } 862 if (use_utf8) 863 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len); 864 #ifdef MB_CUR_MAX 865 if (!mbs) { 866 mbs_len = (ins_len + 1) * MB_CUR_MAX; 867 mbs = ntfs_malloc(mbs_len); 868 if (!mbs) 869 return -1; 870 } 871 #ifdef HAVE_MBSINIT 872 memset(&mbstate, 0, sizeof(mbstate)); 873 #else 874 wctomb(NULL, 0); 875 #endif 876 for (i = o = 0; i < ins_len; i++) { 877 /* Reallocate memory if necessary or abort. */ 878 if ((int)(o + MB_CUR_MAX) > mbs_len) { 879 char *tc; 880 if (mbs == *outs) { 881 errno = ENAMETOOLONG; 882 return -1; 883 } 884 tc = ntfs_malloc((mbs_len + 64) & ~63); 885 if (!tc) 886 goto err_out; 887 memcpy(tc, mbs, mbs_len); 888 mbs_len = (mbs_len + 64) & ~63; 889 free(mbs); 890 mbs = tc; 891 } 892 /* Convert the LE Unicode character to a CPU wide character. */ 893 wc = (wchar_t)le16_to_cpu(ins[i]); 894 if (!wc) 895 break; 896 /* Convert the CPU endian wide character to multibyte. */ 897 #ifdef HAVE_MBSINIT 898 cnt = wcrtomb(mbs + o, wc, &mbstate); 899 #else 900 cnt = wctomb(mbs + o, wc); 901 #endif 902 if (cnt == -1) 903 goto err_out; 904 if (cnt <= 0) { 905 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt); 906 errno = EINVAL; 907 goto err_out; 908 } 909 o += cnt; 910 } 911 #ifdef HAVE_MBSINIT 912 /* Make sure we are back in the initial state. */ 913 if (!mbsinit(&mbstate)) { 914 ntfs_log_debug("Eeek. mbstate not in initial state!\n"); 915 errno = EILSEQ; 916 goto err_out; 917 } 918 #endif 919 /* Now write the NULL character. */ 920 mbs[o] = '\0'; 921 if (*outs != mbs) 922 *outs = mbs; 923 return o; 924 err_out: 925 if (mbs != *outs) { 926 int eo = errno; 927 free(mbs); 928 errno = eo; 929 } 930 #else /* MB_CUR_MAX */ 931 errno = EILSEQ; 932 #endif /* MB_CUR_MAX */ 933 return -1; 934 } 935 936 /** 937 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string 938 * @ins: input multibyte string buffer 939 * @outs: on return contains the (allocated) output Unicode string 940 * 941 * Convert the input multibyte string @ins, from the current locale into the 942 * corresponding little endian, 2-byte Unicode string. 943 * 944 * The function allocates the string and the caller is responsible for calling 945 * free(*@outs); when finished with it. 946 * 947 * On success the function returns the number of Unicode characters written to 948 * the output string *@outs (>= 0), not counting the terminating Unicode NULL 949 * character. 950 * 951 * On error, -1 is returned, and errno is set to the error code. The following 952 * error codes can be expected: 953 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 954 * EILSEQ The input string cannot be represented as a Unicode 955 * string according to the current locale. 956 * ENAMETOOLONG Destination buffer is too small for input string. 957 * ENOMEM Not enough memory to allocate destination buffer. 958 */ 959 int ntfs_mbstoucs(const char *ins, ntfschar **outs) 960 { 961 #ifdef MB_CUR_MAX 962 ntfschar *ucs; 963 const char *s; 964 wchar_t wc; 965 int i, o, cnt, ins_len, ucs_len, ins_size; 966 #ifdef HAVE_MBSINIT 967 mbstate_t mbstate; 968 #endif 969 #endif /* MB_CUR_MAX */ 970 971 if (!ins || !outs) { 972 errno = EINVAL; 973 return -1; 974 } 975 976 if (use_utf8) 977 return ntfs_utf8_to_utf16(ins, outs); 978 979 #ifdef MB_CUR_MAX 980 /* Determine the size of the multi-byte string in bytes. */ 981 ins_size = strlen(ins); 982 /* Determine the length of the multi-byte string. */ 983 s = ins; 984 #if defined(HAVE_MBSINIT) 985 memset(&mbstate, 0, sizeof(mbstate)); 986 ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate); 987 #ifdef __CYGWIN32__ 988 if (!ins_len && *ins) { 989 /* Older Cygwin had broken mbsrtowcs() implementation. */ 990 ins_len = strlen(ins); 991 } 992 #endif 993 #elif !defined(DJGPP) 994 ins_len = mbstowcs(NULL, s, 0); 995 #else 996 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */ 997 ins_len = strlen(ins); 998 #endif 999 if (ins_len == -1) 1000 return ins_len; 1001 #ifdef HAVE_MBSINIT 1002 if ((s != ins) || !mbsinit(&mbstate)) { 1003 #else 1004 if (s != ins) { 1005 #endif 1006 errno = EILSEQ; 1007 return -1; 1008 } 1009 /* Add the NULL terminator. */ 1010 ins_len++; 1011 ucs_len = ins_len; 1012 ucs = ntfs_malloc(ucs_len * sizeof(ntfschar)); 1013 if (!ucs) 1014 return -1; 1015 #ifdef HAVE_MBSINIT 1016 memset(&mbstate, 0, sizeof(mbstate)); 1017 #else 1018 mbtowc(NULL, NULL, 0); 1019 #endif 1020 for (i = o = cnt = 0; i < ins_size; i += cnt, o++) { 1021 /* Reallocate memory if necessary. */ 1022 if (o >= ucs_len) { 1023 ntfschar *tc; 1024 ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63; 1025 tc = realloc(ucs, ucs_len); 1026 if (!tc) 1027 goto err_out; 1028 ucs = tc; 1029 ucs_len /= sizeof(ntfschar); 1030 } 1031 /* Convert the multibyte character to a wide character. */ 1032 #ifdef HAVE_MBSINIT 1033 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate); 1034 #else 1035 cnt = mbtowc(&wc, ins + i, ins_size - i); 1036 #endif 1037 if (!cnt) 1038 break; 1039 if (cnt == -1) 1040 goto err_out; 1041 if (cnt < -1) { 1042 ntfs_log_trace("Eeek. cnt = %i\n", cnt); 1043 errno = EINVAL; 1044 goto err_out; 1045 } 1046 /* Make sure we are not overflowing the NTFS Unicode set. */ 1047 if ((unsigned long)wc >= (unsigned long)(1 << 1048 (8 * sizeof(ntfschar)))) { 1049 errno = EILSEQ; 1050 goto err_out; 1051 } 1052 /* Convert the CPU wide character to a LE Unicode character. */ 1053 ucs[o] = cpu_to_le16(wc); 1054 } 1055 #ifdef HAVE_MBSINIT 1056 /* Make sure we are back in the initial state. */ 1057 if (!mbsinit(&mbstate)) { 1058 ntfs_log_trace("Eeek. mbstate not in initial state!\n"); 1059 errno = EILSEQ; 1060 goto err_out; 1061 } 1062 #endif 1063 /* Now write the NULL character. */ 1064 ucs[o] = cpu_to_le16(L'\0'); 1065 *outs = ucs; 1066 return o; 1067 err_out: 1068 free(ucs); 1069 #else /* MB_CUR_MAX */ 1070 errno = EILSEQ; 1071 #endif /* MB_CUR_MAX */ 1072 return -1; 1073 } 1074 1075 /* 1076 * Turn a UTF8 name uppercase 1077 * 1078 * Returns an allocated uppercase name which has to be freed by caller 1079 * or NULL if there is an error (described by errno) 1080 */ 1081 1082 char *ntfs_uppercase_mbs(const char *low, 1083 const ntfschar *upcase, u32 upcase_size) 1084 { 1085 int size; 1086 char *upp; 1087 u32 wc; 1088 int n; 1089 const char *s; 1090 char *t; 1091 1092 size = strlen(low); 1093 upp = (char*)ntfs_malloc(3*size + 1); 1094 if (upp) { 1095 s = low; 1096 t = upp; 1097 do { 1098 n = utf8_to_unicode(&wc, s); 1099 if (n > 0) { 1100 if (wc < upcase_size) 1101 wc = le16_to_cpu(upcase[wc]); 1102 if (wc < 0x80) 1103 *t++ = wc; 1104 else if (wc < 0x800) { 1105 *t++ = (0xc0 | ((wc >> 6) & 0x3f)); 1106 *t++ = 0x80 | (wc & 0x3f); 1107 } else if (wc < 0x10000) { 1108 *t++ = 0xe0 | (wc >> 12); 1109 *t++ = 0x80 | ((wc >> 6) & 0x3f); 1110 *t++ = 0x80 | (wc & 0x3f); 1111 } else { 1112 *t++ = 0xf0 | ((wc >> 18) & 7); 1113 *t++ = 0x80 | ((wc >> 12) & 63); 1114 *t++ = 0x80 | ((wc >> 6) & 0x3f); 1115 *t++ = 0x80 | (wc & 0x3f); 1116 } 1117 s += n; 1118 } 1119 } while (n > 0); 1120 if (n < 0) { 1121 free(upp); 1122 upp = (char*)NULL; 1123 errno = EILSEQ; 1124 } 1125 *t = 0; 1126 } 1127 return (upp); 1128 } 1129 1130 /** 1131 * ntfs_upcase_table_build - build the default upcase table for NTFS 1132 * @uc: destination buffer where to store the built table 1133 * @uc_len: size of destination buffer in bytes 1134 * 1135 * ntfs_upcase_table_build() builds the default upcase table for NTFS and 1136 * stores it in the caller supplied buffer @uc of size @uc_len. 1137 * 1138 * Note, @uc_len must be at least 128kiB in size or bad things will happen! 1139 */ 1140 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len) 1141 { 1142 static int uc_run_table[][3] = { /* Start, End, Add */ 1143 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, 1144 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, 1145 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, 1146 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, 1147 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, 1148 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, 1149 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, 1150 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, 1151 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, 1152 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, 1153 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, 1154 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, 1155 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, 1156 {0} 1157 }; 1158 static int uc_dup_table[][2] = { /* Start, End */ 1159 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, 1160 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, 1161 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, 1162 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, 1163 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, 1164 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, 1165 {0} 1166 }; 1167 static int uc_byte_table[][2] = { /* Offset, Value */ 1168 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, 1169 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, 1170 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, 1171 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, 1172 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, 1173 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, 1174 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, 1175 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, 1176 {0} 1177 }; 1178 int i, r; 1179 int k, off; 1180 1181 memset((char*)uc, 0, uc_len); 1182 uc_len >>= 1; 1183 if (uc_len > 65536) 1184 uc_len = 65536; 1185 for (i = 0; (u32)i < uc_len; i++) 1186 uc[i] = cpu_to_le16(i); 1187 for (r = 0; uc_run_table[r][0]; r++) { 1188 off = uc_run_table[r][2]; 1189 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) 1190 uc[i] = cpu_to_le16(i + off); 1191 } 1192 for (r = 0; uc_dup_table[r][0]; r++) 1193 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) 1194 uc[i + 1] = cpu_to_le16(i); 1195 for (r = 0; uc_byte_table[r][0]; r++) { 1196 k = uc_byte_table[r][1]; 1197 uc[uc_byte_table[r][0]] = cpu_to_le16(k); 1198 } 1199 } 1200 1201 /* 1202 * Build a table for converting to lower case 1203 * 1204 * This is only meaningful when there is a single lower case 1205 * character leading to an upper case one, and currently the 1206 * only exception is the greek letter sigma which has a single 1207 * upper case glyph (code U+03A3), but two lower case glyphs 1208 * (code U+03C3 and U+03C2, the latter to be used at the end 1209 * of a word). In the following implementation the upper case 1210 * sigma will be lowercased as U+03C3. 1211 */ 1212 1213 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt) 1214 { 1215 ntfschar *lc; 1216 u32 upp; 1217 u32 i; 1218 1219 lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar)); 1220 if (lc) { 1221 for (i=0; i<uc_cnt; i++) 1222 lc[i] = cpu_to_le16(i); 1223 for (i=0; i<uc_cnt; i++) { 1224 upp = le16_to_cpu(uc[i]); 1225 if ((upp != i) && (upp < uc_cnt)) 1226 lc[upp] = cpu_to_le16(i); 1227 } 1228 } else 1229 ntfs_log_error("Could not build the locase table\n"); 1230 return (lc); 1231 } 1232 1233 /** 1234 * ntfs_str2ucs - convert a string to a valid NTFS file name 1235 * @s: input string 1236 * @len: length of output buffer in Unicode characters 1237 * 1238 * Convert the input @s string into the corresponding little endian, 1239 * 2-byte Unicode string. The length of the converted string is less 1240 * or equal to the maximum length allowed by the NTFS format (255). 1241 * 1242 * If @s is NULL then return AT_UNNAMED. 1243 * 1244 * On success the function returns the Unicode string in an allocated 1245 * buffer and the caller is responsible to free it when it's not needed 1246 * anymore. 1247 * 1248 * On error NULL is returned and errno is set to the error code. 1249 */ 1250 ntfschar *ntfs_str2ucs(const char *s, int *len) 1251 { 1252 ntfschar *ucs = NULL; 1253 1254 if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) { 1255 ntfs_log_perror("Couldn't convert '%s' to Unicode", s); 1256 return NULL; 1257 } 1258 if (*len > NTFS_MAX_NAME_LEN) { 1259 free(ucs); 1260 errno = ENAMETOOLONG; 1261 return NULL; 1262 } 1263 if (!ucs || !*len) { 1264 ucs = AT_UNNAMED; 1265 *len = 0; 1266 } 1267 return ucs; 1268 } 1269 1270 /** 1271 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs() 1272 * @ucs input string to be freed 1273 * 1274 * Free memory at @ucs and which was allocated by ntfs_str2ucs. 1275 * 1276 * Return value: none. 1277 */ 1278 void ntfs_ucsfree(ntfschar *ucs) 1279 { 1280 if (ucs && (ucs != AT_UNNAMED)) 1281 free(ucs); 1282 } 1283 1284 /* 1285 * Check whether a name contains no chars forbidden 1286 * for DOS or Win32 use 1287 * 1288 * If there is a bad char, errno is set to EINVAL 1289 */ 1290 1291 BOOL ntfs_forbidden_chars(const ntfschar *name, int len) 1292 { 1293 BOOL forbidden; 1294 int ch; 1295 int i; 1296 u32 mainset = (1L << ('\"' - 0x20)) 1297 | (1L << ('*' - 0x20)) 1298 | (1L << ('/' - 0x20)) 1299 | (1L << (':' - 0x20)) 1300 | (1L << ('<' - 0x20)) 1301 | (1L << ('>' - 0x20)) 1302 | (1L << ('?' - 0x20)); 1303 1304 forbidden = (len == 0) 1305 || (le16_to_cpu(name[len-1]) == ' ') 1306 || (le16_to_cpu(name[len-1]) == '.'); 1307 for (i=0; i<len; i++) { 1308 ch = le16_to_cpu(name[i]); 1309 if ((ch < 0x20) 1310 || ((ch < 0x40) 1311 && ((1L << (ch - 0x20)) & mainset)) 1312 || (ch == '\\') 1313 || (ch == '|')) 1314 forbidden = TRUE; 1315 } 1316 if (forbidden) 1317 errno = EINVAL; 1318 return (forbidden); 1319 } 1320 1321 /* 1322 * Check whether the same name can be used as a DOS and 1323 * a Win32 name 1324 * 1325 * The names must be the same, or the short name the uppercase 1326 * variant of the long name 1327 */ 1328 1329 BOOL ntfs_collapsible_chars(ntfs_volume *vol, 1330 const ntfschar *shortname, int shortlen, 1331 const ntfschar *longname, int longlen) 1332 { 1333 BOOL collapsible; 1334 unsigned int ch; 1335 int i; 1336 1337 collapsible = shortlen == longlen; 1338 if (collapsible) 1339 for (i=0; i<shortlen; i++) { 1340 ch = le16_to_cpu(longname[i]); 1341 if ((ch >= vol->upcase_len) 1342 || ((shortname[i] != longname[i]) 1343 && (shortname[i] != vol->upcase[ch]))) 1344 collapsible = FALSE; 1345 } 1346 return (collapsible); 1347 } 1348 1349 /* 1350 * Define the character encoding to be used. 1351 * Use UTF-8 unless specified otherwise. 1352 */ 1353 1354 int ntfs_set_char_encoding(const char *locale) 1355 { 1356 use_utf8 = 0; 1357 if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8") 1358 || strstr(locale,"utf-8") || strstr(locale,"UTF-8")) 1359 use_utf8 = 1; 1360 else 1361 if (setlocale(LC_ALL, locale)) 1362 use_utf8 = 0; 1363 else { 1364 ntfs_log_error("Invalid locale, encoding to UTF-8\n"); 1365 use_utf8 = 1; 1366 } 1367 return 0; /* always successful */ 1368 } 1369 1370 #if defined(__APPLE__) || defined(__DARWIN__) 1371 1372 int ntfs_macosx_normalize_filenames(int normalize) { 1373 #ifdef ENABLE_NFCONV 1374 if(normalize == 0 || normalize == 1) { 1375 nfconvert_utf8 = normalize; 1376 return 0; 1377 } 1378 else 1379 return -1; 1380 #else 1381 return -1; 1382 #endif /* ENABLE_NFCONV */ 1383 } 1384 1385 int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target, 1386 int composed) { 1387 #ifdef ENABLE_NFCONV 1388 /* For this code to compile, the CoreFoundation framework must be fed to the linker. */ 1389 CFStringRef cfSourceString; 1390 CFMutableStringRef cfMutableString; 1391 CFRange rangeToProcess; 1392 CFIndex requiredBufferLength; 1393 char *result = NULL; 1394 int resultLength = -1; 1395 1396 /* Convert the UTF-8 string to a CFString. */ 1397 cfSourceString = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8); 1398 if(cfSourceString == NULL) { 1399 ntfs_log_error("CFStringCreateWithCString failed!\n"); 1400 return -2; 1401 } 1402 1403 /* Create a mutable string from cfSourceString that we are free to modify. */ 1404 cfMutableString = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, cfSourceString); 1405 CFRelease(cfSourceString); /* End-of-life. */ 1406 if(cfMutableString == NULL) { 1407 ntfs_log_error("CFStringCreateMutableCopy failed!\n"); 1408 return -3; 1409 } 1410 1411 /* Normalize the mutable string to the desired normalization form. */ 1412 CFStringNormalize(cfMutableString, (composed != 0 ? kCFStringNormalizationFormC : kCFStringNormalizationFormD)); 1413 1414 /* Store the resulting string in a '\0'-terminated UTF-8 encoded char* buffer. */ 1415 rangeToProcess = CFRangeMake(0, CFStringGetLength(cfMutableString)); 1416 if(CFStringGetBytes(cfMutableString, rangeToProcess, kCFStringEncodingUTF8, 0, false, NULL, 0, &requiredBufferLength) > 0) { 1417 resultLength = sizeof(char)*(requiredBufferLength + 1); 1418 result = ntfs_calloc(resultLength); 1419 1420 if(result != NULL) { 1421 if(CFStringGetBytes(cfMutableString, rangeToProcess, kCFStringEncodingUTF8, 1422 0, false, (UInt8*)result, resultLength-1, &requiredBufferLength) <= 0) { 1423 ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n"); 1424 free(result); 1425 result = NULL; 1426 } 1427 } 1428 else 1429 ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", resultLength); 1430 } 1431 else 1432 ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n"); 1433 1434 1435 CFRelease(cfMutableString); 1436 1437 if(result != NULL) { 1438 *target = result; 1439 return resultLength - 1; 1440 } 1441 else 1442 return -1; 1443 #else 1444 return -1; 1445 #endif /* ENABLE_NFCONV */ 1446 } 1447 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 1448