1 /** 2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project. 3 * 4 * Copyright (c) 2000-2004 Anton Altaparmakov 5 * Copyright (c) 2002-2009 Szabolcs Szakacsits 6 * Copyright (c) 2008-2011 Jean-Pierre Andre 7 * Copyright (c) 2008 Bernhard Kaindl 8 * 9 * This program/include file is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License as published 11 * by the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * This program/include file is distributed in the hope that it will be 15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty 16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 * 19 * You should have received a copy of the GNU General Public License 20 * along with this program (in the main directory of the NTFS-3G 21 * distribution in the file COPYING); if not, write to the Free Software 22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 */ 24 25 #ifdef HAVE_CONFIG_H 26 #include "config.h" 27 #endif 28 29 #ifdef HAVE_STDIO_H 30 #include <stdio.h> 31 #endif 32 #ifdef HAVE_STDLIB_H 33 #include <stdlib.h> 34 #endif 35 #ifdef HAVE_WCHAR_H 36 #include <wchar.h> 37 #endif 38 #ifdef HAVE_STRING_H 39 #include <string.h> 40 #endif 41 #ifdef HAVE_ERRNO_H 42 #include <errno.h> 43 #endif 44 #ifdef HAVE_LOCALE_H 45 #include <locale.h> 46 #endif 47 48 #if defined(__APPLE__) || defined(__DARWIN__) 49 #ifdef ENABLE_NFCONV 50 #include <CoreFoundation/CoreFoundation.h> 51 #endif /* ENABLE_NFCONV */ 52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 53 54 #include "compat.h" 55 #include "attrib.h" 56 #include "types.h" 57 #include "unistr.h" 58 #include "debug.h" 59 #include "logging.h" 60 #include "misc.h" 61 62 #define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */ 63 64 // no wchar support in the Haiku kernel 65 #if defined(__HAIKU__) && defined(_KERNEL_MODE) 66 # include <KernelExport.h> 67 # define mbstowcs(a, b, c) (panic("mbstowcs"), 0) 68 # define wctomb(a, b) (panic("wctomb"), 0) 69 # define mbtowc(a, b, c) (panic("mbtowc"), 0) 70 # define setlocale(a, b) (panic("setlocale"), 0) 71 #endif 72 73 /* 74 * IMPORTANT 75 * ========= 76 * 77 * All these routines assume that the Unicode characters are in little endian 78 * encoding inside the strings!!! 79 */ 80 81 static int use_utf8 = 1; /* use UTF-8 encoding for file names */ 82 83 #if defined(__APPLE__) || defined(__DARWIN__) 84 #ifdef ENABLE_NFCONV 85 /** 86 * This variable controls whether or not automatic normalization form conversion 87 * should be performed when translating NTFS unicode file names to UTF-8. 88 * Defaults to on, but can be controlled from the outside using the function 89 * int ntfs_macosx_normalize_filenames(int normalize); 90 */ 91 static int nfconvert_utf8 = 1; 92 #endif /* ENABLE_NFCONV */ 93 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 94 95 /* 96 * This is used by the name collation functions to quickly determine what 97 * characters are (in)valid. 98 */ 99 #if 0 100 static const u8 legal_ansi_char_array[0x40] = { 101 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 102 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 103 104 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 105 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 106 107 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, 108 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, 109 110 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 111 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, 112 }; 113 #endif 114 115 /** 116 * ntfs_names_are_equal - compare two Unicode names for equality 117 * @s1: name to compare to @s2 118 * @s1_len: length in Unicode characters of @s1 119 * @s2: name to compare to @s1 120 * @s2_len: length in Unicode characters of @s2 121 * @ic: ignore case bool 122 * @upcase: upcase table (only if @ic == IGNORE_CASE) 123 * @upcase_size: length in Unicode characters of @upcase (if present) 124 * 125 * Compare the names @s1 and @s2 and return TRUE (1) if the names are 126 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE, 127 * the @upcase table is used to perform a case insensitive comparison. 128 */ 129 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len, 130 const ntfschar *s2, size_t s2_len, 131 const IGNORE_CASE_BOOL ic, 132 const ntfschar *upcase, const u32 upcase_size) 133 { 134 if (s1_len != s2_len) 135 return FALSE; 136 if (!s1_len) 137 return TRUE; 138 if (ic == CASE_SENSITIVE) 139 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE; 140 return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE: 141 TRUE; 142 } 143 144 /* 145 * ntfs_names_full_collate() fully collate two Unicode names 146 * 147 * @name1: first Unicode name to compare 148 * @name1_len: length of first Unicode name to compare 149 * @name2: second Unicode name to compare 150 * @name2_len: length of second Unicode name to compare 151 * @ic: either CASE_SENSITIVE or IGNORE_CASE 152 * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) 153 * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) 154 * 155 * -1 if the first name collates before the second one, 156 * 0 if the names match, 157 * 1 if the second name collates before the first one, or 158 * 159 */ 160 int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len, 161 const ntfschar *name2, const u32 name2_len, 162 const IGNORE_CASE_BOOL ic, const ntfschar *upcase, 163 const u32 upcase_len) 164 { 165 u32 cnt; 166 u16 c1, c2; 167 u16 u1, u2; 168 169 #ifdef DEBUG 170 if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) { 171 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n"); 172 exit(1); 173 } 174 #endif 175 cnt = min(name1_len, name2_len); 176 if (cnt > 0) { 177 if (ic == CASE_SENSITIVE) { 178 while (--cnt && (*name1 == *name2)) { 179 name1++; 180 name2++; 181 } 182 u1 = c1 = le16_to_cpu(*name1); 183 u2 = c2 = le16_to_cpu(*name2); 184 if (u1 < upcase_len) 185 u1 = le16_to_cpu(upcase[u1]); 186 if (u2 < upcase_len) 187 u2 = le16_to_cpu(upcase[u2]); 188 if ((u1 == u2) && cnt) 189 do { 190 name1++; 191 u1 = le16_to_cpu(*name1); 192 name2++; 193 u2 = le16_to_cpu(*name2); 194 if (u1 < upcase_len) 195 u1 = le16_to_cpu(upcase[u1]); 196 if (u2 < upcase_len) 197 u2 = le16_to_cpu(upcase[u2]); 198 } while ((u1 == u2) && --cnt); 199 if (u1 < u2) 200 return -1; 201 if (u1 > u2) 202 return 1; 203 if (name1_len < name2_len) 204 return -1; 205 if (name1_len > name2_len) 206 return 1; 207 if (c1 < c2) 208 return -1; 209 if (c1 > c2) 210 return 1; 211 } else { 212 do { 213 u1 = c1 = le16_to_cpu(*name1); 214 name1++; 215 u2 = c2 = le16_to_cpu(*name2); 216 name2++; 217 if (u1 < upcase_len) 218 u1 = le16_to_cpu(upcase[u1]); 219 if (u2 < upcase_len) 220 u2 = le16_to_cpu(upcase[u2]); 221 } while ((u1 == u2) && --cnt); 222 if (u1 < u2) 223 return -1; 224 if (u1 > u2) 225 return 1; 226 if (name1_len < name2_len) 227 return -1; 228 if (name1_len > name2_len) 229 return 1; 230 } 231 } else { 232 if (name1_len < name2_len) 233 return -1; 234 if (name1_len > name2_len) 235 return 1; 236 } 237 return 0; 238 } 239 240 /** 241 * ntfs_ucsncmp - compare two little endian Unicode strings 242 * @s1: first string 243 * @s2: second string 244 * @n: maximum unicode characters to compare 245 * 246 * Compare the first @n characters of the Unicode strings @s1 and @s2, 247 * The strings in little endian format and appropriate le16_to_cpu() 248 * conversion is performed on non-little endian machines. 249 * 250 * The function returns an integer less than, equal to, or greater than zero 251 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 252 * to be less than, to match, or be greater than @s2. 253 */ 254 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) 255 { 256 ntfschar c1, c2; 257 size_t i; 258 259 #ifdef DEBUG 260 if (!s1 || !s2) { 261 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n"); 262 exit(1); 263 } 264 #endif 265 for (i = 0; i < n; ++i) { 266 c1 = le16_to_cpu(s1[i]); 267 c2 = le16_to_cpu(s2[i]); 268 if (c1 < c2) 269 return -1; 270 if (c1 > c2) 271 return 1; 272 if (!c1) 273 break; 274 } 275 return 0; 276 } 277 278 /** 279 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case 280 * @s1: first string 281 * @s2: second string 282 * @n: maximum unicode characters to compare 283 * @upcase: upcase table 284 * @upcase_size: upcase table size in Unicode characters 285 * 286 * Compare the first @n characters of the Unicode strings @s1 and @s2, 287 * ignoring case. The strings in little endian format and appropriate 288 * le16_to_cpu() conversion is performed on non-little endian machines. 289 * 290 * Each character is uppercased using the @upcase table before the comparison. 291 * 292 * The function returns an integer less than, equal to, or greater than zero 293 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 294 * to be less than, to match, or be greater than @s2. 295 */ 296 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, 297 const ntfschar *upcase, const u32 upcase_size) 298 { 299 u16 c1, c2; 300 size_t i; 301 302 #ifdef DEBUG 303 if (!s1 || !s2 || !upcase) { 304 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n"); 305 exit(1); 306 } 307 #endif 308 for (i = 0; i < n; ++i) { 309 if ((c1 = le16_to_cpu(s1[i])) < upcase_size) 310 c1 = le16_to_cpu(upcase[c1]); 311 if ((c2 = le16_to_cpu(s2[i])) < upcase_size) 312 c2 = le16_to_cpu(upcase[c2]); 313 if (c1 < c2) 314 return -1; 315 if (c1 > c2) 316 return 1; 317 if (!c1) 318 break; 319 } 320 return 0; 321 } 322 323 /** 324 * ntfs_ucsnlen - determine the length of a little endian Unicode string 325 * @s: pointer to Unicode string 326 * @maxlen: maximum length of string @s 327 * 328 * Return the number of Unicode characters in the little endian Unicode 329 * string @s up to a maximum of maxlen Unicode characters, not including 330 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s 331 * and @s + @maxlen, @maxlen is returned. 332 * 333 * This function never looks beyond @s + @maxlen. 334 */ 335 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen) 336 { 337 u32 i; 338 339 for (i = 0; i < maxlen; i++) { 340 if (!le16_to_cpu(s[i])) 341 break; 342 } 343 return i; 344 } 345 346 /** 347 * ntfs_ucsndup - duplicate little endian Unicode string 348 * @s: pointer to Unicode string 349 * @maxlen: maximum length of string @s 350 * 351 * Return a pointer to a new little endian Unicode string which is a duplicate 352 * of the string s. Memory for the new string is obtained with ntfs_malloc(3), 353 * and can be freed with free(3). 354 * 355 * A maximum of @maxlen Unicode characters are copied and a terminating 356 * (ntfschar)'\0' little endian Unicode character is added. 357 * 358 * This function never looks beyond @s + @maxlen. 359 * 360 * Return a pointer to the new little endian Unicode string on success and NULL 361 * on failure with errno set to the error code. 362 */ 363 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen) 364 { 365 ntfschar *dst; 366 u32 len; 367 368 len = ntfs_ucsnlen(s, maxlen); 369 dst = ntfs_malloc((len + 1) * sizeof(ntfschar)); 370 if (dst) { 371 memcpy(dst, s, len * sizeof(ntfschar)); 372 dst[len] = cpu_to_le16(L'\0'); 373 } 374 return dst; 375 } 376 377 /** 378 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent 379 * @name: 380 * @name_len: 381 * @upcase: 382 * @upcase_len: 383 * 384 * Description... 385 * 386 * Returns: 387 */ 388 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase, 389 const u32 upcase_len) 390 { 391 u32 i; 392 u16 u; 393 394 for (i = 0; i < name_len; i++) 395 if ((u = le16_to_cpu(name[i])) < upcase_len) 396 name[i] = upcase[u]; 397 } 398 399 /** 400 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent 401 */ 402 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase, 403 const u32 locase_len) 404 { 405 u32 i; 406 u16 u; 407 408 if (locase) 409 for (i = 0; i < name_len; i++) 410 if ((u = le16_to_cpu(name[i])) < locase_len) 411 name[i] = locase[u]; 412 } 413 414 /** 415 * ntfs_file_value_upcase - Convert a filename to upper case 416 * @file_name_attr: 417 * @upcase: 418 * @upcase_len: 419 * 420 * Description... 421 * 422 * Returns: 423 */ 424 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr, 425 const ntfschar *upcase, const u32 upcase_len) 426 { 427 ntfs_name_upcase((ntfschar*)&file_name_attr->file_name, 428 file_name_attr->file_name_length, upcase, upcase_len); 429 } 430 431 /* 432 NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough 433 for now]) for path names, but the Unicode code points need to be 434 converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI, 435 glibc does this even without a locale in a hard-coded fashion as that 436 appears to be is easy because the low 7-bit ASCII range appears to be 437 available in all charsets but it does not convert anything if 438 there was some error with the locale setup or none set up like 439 when mount is called during early boot where he (by policy) do 440 not use locales (and may be not available if /usr is not yet mounted), 441 so this patch fixes the resulting issues for systems which use 442 UTF-8 and for others, specifying the locale in fstab brings them 443 the encoding which they want. 444 445 If no locale is defined or there was a problem with setting one 446 up and whenever nl_langinfo(CODESET) returns a sting starting with 447 "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix 448 the bug where NTFS-3G does not show any path names which include 449 international characters!!! (and also fails on creating them) as result. 450 451 Author: Bernhard Kaindl <bk@suse.de> 452 Jean-Pierre Andre made it compliant with RFC3629/RFC2781. 453 */ 454 455 /* 456 * Return the amount of 8-bit elements in UTF-8 needed (without the terminating 457 * null) to store a given UTF-16LE string. 458 * 459 * Return -1 with errno set if string has invalid byte sequence or too long. 460 */ 461 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len) 462 { 463 int i, ret = -1; 464 int count = 0; 465 BOOL surrog; 466 467 surrog = FALSE; 468 for (i = 0; i < ins_len && ins[i]; i++) { 469 unsigned short c = le16_to_cpu(ins[i]); 470 if (surrog) { 471 if ((c >= 0xdc00) && (c < 0xe000)) { 472 surrog = FALSE; 473 count += 4; 474 } else 475 goto fail; 476 } else 477 if (c < 0x80) 478 count++; 479 else if (c < 0x800) 480 count += 2; 481 else if (c < 0xd800) 482 count += 3; 483 else if (c < 0xdc00) 484 surrog = TRUE; 485 #if NOREVBOM 486 else if ((c >= 0xe000) && (c < 0xfffe)) 487 #else 488 else if (c >= 0xe000) 489 #endif 490 count += 3; 491 else 492 goto fail; 493 if (count > outs_len) { 494 errno = ENAMETOOLONG; 495 goto out; 496 } 497 } 498 if (surrog) 499 goto fail; 500 501 ret = count; 502 out: 503 return ret; 504 fail: 505 errno = EILSEQ; 506 goto out; 507 } 508 509 /* 510 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string 511 * @ins: input utf16 string buffer 512 * @ins_len: length of input string in utf16 characters 513 * @outs: on return contains the (allocated) output multibyte string 514 * @outs_len: length of output buffer in bytes 515 * 516 * Return -1 with errno set if string has invalid byte sequence or too long. 517 */ 518 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, 519 char **outs, int outs_len) 520 { 521 #if defined(__APPLE__) || defined(__DARWIN__) 522 #ifdef ENABLE_NFCONV 523 char *original_outs_value = *outs; 524 int original_outs_len = outs_len; 525 #endif /* ENABLE_NFCONV */ 526 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 527 528 char *t; 529 int i, size, ret = -1; 530 int halfpair; 531 532 halfpair = 0; 533 if (!*outs) 534 outs_len = PATH_MAX; 535 536 size = utf16_to_utf8_size(ins, ins_len, outs_len); 537 538 if (size < 0) 539 goto out; 540 541 if (!*outs) { 542 outs_len = size + 1; 543 *outs = ntfs_malloc(outs_len); 544 if (!*outs) 545 goto out; 546 } 547 548 t = *outs; 549 550 for (i = 0; i < ins_len && ins[i]; i++) { 551 unsigned short c = le16_to_cpu(ins[i]); 552 /* size not double-checked */ 553 if (halfpair) { 554 if ((c >= 0xdc00) && (c < 0xe000)) { 555 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7); 556 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63); 557 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4); 558 *t++ = 0x80 + (c & 63); 559 halfpair = 0; 560 } else 561 goto fail; 562 } else if (c < 0x80) { 563 *t++ = c; 564 } else { 565 if (c < 0x800) { 566 *t++ = (0xc0 | ((c >> 6) & 0x3f)); 567 *t++ = 0x80 | (c & 0x3f); 568 } else if (c < 0xd800) { 569 *t++ = 0xe0 | (c >> 12); 570 *t++ = 0x80 | ((c >> 6) & 0x3f); 571 *t++ = 0x80 | (c & 0x3f); 572 } else if (c < 0xdc00) 573 halfpair = c; 574 else if (c >= 0xe000) { 575 *t++ = 0xe0 | (c >> 12); 576 *t++ = 0x80 | ((c >> 6) & 0x3f); 577 *t++ = 0x80 | (c & 0x3f); 578 } else 579 goto fail; 580 } 581 } 582 *t = '\0'; 583 584 #if defined(__APPLE__) || defined(__DARWIN__) 585 #ifdef ENABLE_NFCONV 586 if(nfconvert_utf8 && (t - *outs) > 0) { 587 char *new_outs = NULL; 588 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form 589 if(new_outs_len >= 0 && new_outs != NULL) { 590 if(original_outs_value != *outs) { 591 // We have allocated outs ourselves. 592 free(*outs); 593 *outs = new_outs; 594 t = *outs + new_outs_len; 595 } 596 else { 597 // We need to copy new_outs into the fixed outs buffer. 598 memset(*outs, 0, original_outs_len); 599 strncpy(*outs, new_outs, original_outs_len-1); 600 t = *outs + original_outs_len; 601 free(new_outs); 602 } 603 } 604 else { 605 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs); 606 ntfs_log_error(" new_outs=0x%p\n", new_outs); 607 ntfs_log_error(" new_outs_len=%d\n", new_outs_len); 608 } 609 } 610 #endif /* ENABLE_NFCONV */ 611 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 612 613 ret = t - *outs; 614 out: 615 return ret; 616 fail: 617 errno = EILSEQ; 618 goto out; 619 } 620 621 /* 622 * Return the amount of 16-bit elements in UTF-16LE needed 623 * (without the terminating null) to store given UTF-8 string. 624 * 625 * Return -1 with errno set if it's longer than PATH_MAX or string is invalid. 626 * 627 * Note: This does not check whether the input sequence is a valid utf8 string, 628 * and should be used only in context where such check is made! 629 */ 630 static int utf8_to_utf16_size(const char *s) 631 { 632 int ret = -1; 633 unsigned int byte; 634 size_t count = 0; 635 636 while ((byte = *((const unsigned char *)s++))) { 637 if (++count >= PATH_MAX) 638 goto fail; 639 if (byte >= 0xc0) { 640 if (byte >= 0xF5) { 641 errno = EILSEQ; 642 goto out; 643 } 644 if (!*s) 645 break; 646 if (byte >= 0xC0) 647 s++; 648 if (!*s) 649 break; 650 if (byte >= 0xE0) 651 s++; 652 if (!*s) 653 break; 654 if (byte >= 0xF0) { 655 s++; 656 if (++count >= PATH_MAX) 657 goto fail; 658 } 659 } 660 } 661 ret = count; 662 out: 663 return ret; 664 fail: 665 errno = ENAMETOOLONG; 666 goto out; 667 } 668 /* 669 * This converts one UTF-8 sequence to cpu-endian Unicode value 670 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF 671 * 672 * Return the number of used utf8 bytes or -1 with errno set 673 * if sequence is invalid. 674 */ 675 static int utf8_to_unicode(u32 *wc, const char *s) 676 { 677 unsigned int byte = *((const unsigned char *)s); 678 679 /* single byte */ 680 if (byte == 0) { 681 *wc = (u32) 0; 682 return 0; 683 } else if (byte < 0x80) { 684 *wc = (u32) byte; 685 return 1; 686 /* double byte */ 687 } else if (byte < 0xc2) { 688 goto fail; 689 } else if (byte < 0xE0) { 690 if ((s[1] & 0xC0) == 0x80) { 691 *wc = ((u32)(byte & 0x1F) << 6) 692 | ((u32)(s[1] & 0x3F)); 693 return 2; 694 } else 695 goto fail; 696 /* three-byte */ 697 } else if (byte < 0xF0) { 698 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) { 699 *wc = ((u32)(byte & 0x0F) << 12) 700 | ((u32)(s[1] & 0x3F) << 6) 701 | ((u32)(s[2] & 0x3F)); 702 /* Check valid ranges */ 703 #if NOREVBOM 704 if (((*wc >= 0x800) && (*wc <= 0xD7FF)) 705 || ((*wc >= 0xe000) && (*wc <= 0xFFFD))) 706 return 3; 707 #else 708 if (((*wc >= 0x800) && (*wc <= 0xD7FF)) 709 || ((*wc >= 0xe000) && (*wc <= 0xFFFF))) 710 return 3; 711 #endif 712 } 713 goto fail; 714 /* four-byte */ 715 } else if (byte < 0xF5) { 716 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) 717 && ((s[3] & 0xC0) == 0x80)) { 718 *wc = ((u32)(byte & 0x07) << 18) 719 | ((u32)(s[1] & 0x3F) << 12) 720 | ((u32)(s[2] & 0x3F) << 6) 721 | ((u32)(s[3] & 0x3F)); 722 /* Check valid ranges */ 723 if ((*wc <= 0x10ffff) && (*wc >= 0x10000)) 724 return 4; 725 } 726 goto fail; 727 } 728 fail: 729 errno = EILSEQ; 730 return -1; 731 } 732 733 /** 734 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string 735 * @ins: input multibyte string buffer 736 * @outs: on return contains the (allocated) output utf16 string 737 * @outs_len: length of output buffer in utf16 characters 738 * 739 * Return -1 with errno set. 740 */ 741 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs) 742 { 743 #if defined(__APPLE__) || defined(__DARWIN__) 744 #ifdef ENABLE_NFCONV 745 char *new_ins = NULL; 746 if(nfconvert_utf8) { 747 int new_ins_len; 748 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form 749 if(new_ins_len >= 0) 750 ins = new_ins; 751 else 752 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins); 753 } 754 #endif /* ENABLE_NFCONV */ 755 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 756 const char *t = ins; 757 u32 wc; 758 BOOL allocated; 759 ntfschar *outpos; 760 int shorts, ret = -1; 761 762 shorts = utf8_to_utf16_size(ins); 763 if (shorts < 0) 764 goto fail; 765 766 allocated = FALSE; 767 if (!*outs) { 768 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar)); 769 if (!*outs) 770 goto fail; 771 allocated = TRUE; 772 } 773 774 outpos = *outs; 775 776 while(1) { 777 int m = utf8_to_unicode(&wc, t); 778 if (m <= 0) { 779 if (m < 0) { 780 /* do not leave space allocated if failed */ 781 if (allocated) { 782 free(*outs); 783 *outs = (ntfschar*)NULL; 784 } 785 goto fail; 786 } 787 *outpos++ = const_cpu_to_le16(0); 788 break; 789 } 790 if (wc < 0x10000) 791 *outpos++ = cpu_to_le16(wc); 792 else { 793 wc -= 0x10000; 794 *outpos++ = cpu_to_le16((wc >> 10) + 0xd800); 795 *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00); 796 } 797 t += m; 798 } 799 800 ret = --outpos - *outs; 801 fail: 802 #if defined(__APPLE__) || defined(__DARWIN__) 803 #ifdef ENABLE_NFCONV 804 if(new_ins != NULL) 805 free(new_ins); 806 #endif /* ENABLE_NFCONV */ 807 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 808 return ret; 809 } 810 811 /** 812 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string 813 * @ins: input Unicode string buffer 814 * @ins_len: length of input string in Unicode characters 815 * @outs: on return contains the (allocated) output multibyte string 816 * @outs_len: length of output buffer in bytes 817 * 818 * Convert the input little endian, 2-byte Unicode string @ins, of length 819 * @ins_len into the multibyte string format dictated by the current locale. 820 * 821 * If *@outs is NULL, the function allocates the string and the caller is 822 * responsible for calling free(*@outs); when finished with it. 823 * 824 * On success the function returns the number of bytes written to the output 825 * string *@outs (>= 0), not counting the terminating NULL byte. If the output 826 * string buffer was allocated, *@outs is set to it. 827 * 828 * On error, -1 is returned, and errno is set to the error code. The following 829 * error codes can be expected: 830 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 831 * EILSEQ The input string cannot be represented as a multibyte 832 * sequence according to the current locale. 833 * ENAMETOOLONG Destination buffer is too small for input string. 834 * ENOMEM Not enough memory to allocate destination buffer. 835 */ 836 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs, 837 int outs_len) 838 { 839 char *mbs; 840 int mbs_len; 841 #ifdef MB_CUR_MAX 842 wchar_t wc; 843 int i, o; 844 int cnt = 0; 845 #ifdef HAVE_MBSINIT 846 mbstate_t mbstate; 847 #endif 848 #endif /* MB_CUR_MAX */ 849 850 if (!ins || !outs) { 851 errno = EINVAL; 852 return -1; 853 } 854 mbs = *outs; 855 mbs_len = outs_len; 856 if (mbs && !mbs_len) { 857 errno = ENAMETOOLONG; 858 return -1; 859 } 860 if (use_utf8) 861 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len); 862 #ifdef MB_CUR_MAX 863 if (!mbs) { 864 mbs_len = (ins_len + 1) * MB_CUR_MAX; 865 mbs = ntfs_malloc(mbs_len); 866 if (!mbs) 867 return -1; 868 } 869 #ifdef HAVE_MBSINIT 870 memset(&mbstate, 0, sizeof(mbstate)); 871 #else 872 wctomb(NULL, 0); 873 #endif 874 for (i = o = 0; i < ins_len; i++) { 875 /* Reallocate memory if necessary or abort. */ 876 if ((int)(o + MB_CUR_MAX) > mbs_len) { 877 char *tc; 878 if (mbs == *outs) { 879 errno = ENAMETOOLONG; 880 return -1; 881 } 882 tc = ntfs_malloc((mbs_len + 64) & ~63); 883 if (!tc) 884 goto err_out; 885 memcpy(tc, mbs, mbs_len); 886 mbs_len = (mbs_len + 64) & ~63; 887 free(mbs); 888 mbs = tc; 889 } 890 /* Convert the LE Unicode character to a CPU wide character. */ 891 wc = (wchar_t)le16_to_cpu(ins[i]); 892 if (!wc) 893 break; 894 /* Convert the CPU endian wide character to multibyte. */ 895 #ifdef HAVE_MBSINIT 896 cnt = wcrtomb(mbs + o, wc, &mbstate); 897 #else 898 cnt = wctomb(mbs + o, wc); 899 #endif 900 if (cnt == -1) 901 goto err_out; 902 if (cnt <= 0) { 903 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt); 904 errno = EINVAL; 905 goto err_out; 906 } 907 o += cnt; 908 } 909 #ifdef HAVE_MBSINIT 910 /* Make sure we are back in the initial state. */ 911 if (!mbsinit(&mbstate)) { 912 ntfs_log_debug("Eeek. mbstate not in initial state!\n"); 913 errno = EILSEQ; 914 goto err_out; 915 } 916 #endif 917 /* Now write the NULL character. */ 918 mbs[o] = '\0'; 919 if (*outs != mbs) 920 *outs = mbs; 921 return o; 922 err_out: 923 if (mbs != *outs) { 924 int eo = errno; 925 free(mbs); 926 errno = eo; 927 } 928 #else /* MB_CUR_MAX */ 929 errno = EILSEQ; 930 #endif /* MB_CUR_MAX */ 931 return -1; 932 } 933 934 /** 935 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string 936 * @ins: input multibyte string buffer 937 * @outs: on return contains the (allocated) output Unicode string 938 * 939 * Convert the input multibyte string @ins, from the current locale into the 940 * corresponding little endian, 2-byte Unicode string. 941 * 942 * The function allocates the string and the caller is responsible for calling 943 * free(*@outs); when finished with it. 944 * 945 * On success the function returns the number of Unicode characters written to 946 * the output string *@outs (>= 0), not counting the terminating Unicode NULL 947 * character. 948 * 949 * On error, -1 is returned, and errno is set to the error code. The following 950 * error codes can be expected: 951 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 952 * EILSEQ The input string cannot be represented as a Unicode 953 * string according to the current locale. 954 * ENAMETOOLONG Destination buffer is too small for input string. 955 * ENOMEM Not enough memory to allocate destination buffer. 956 */ 957 int ntfs_mbstoucs(const char *ins, ntfschar **outs) 958 { 959 #ifdef MB_CUR_MAX 960 ntfschar *ucs; 961 const char *s; 962 wchar_t wc; 963 int i, o, cnt, ins_len, ucs_len, ins_size; 964 #ifdef HAVE_MBSINIT 965 mbstate_t mbstate; 966 #endif 967 #endif /* MB_CUR_MAX */ 968 969 if (!ins || !outs) { 970 errno = EINVAL; 971 return -1; 972 } 973 974 if (use_utf8) 975 return ntfs_utf8_to_utf16(ins, outs); 976 977 #ifdef MB_CUR_MAX 978 /* Determine the size of the multi-byte string in bytes. */ 979 ins_size = strlen(ins); 980 /* Determine the length of the multi-byte string. */ 981 s = ins; 982 #if defined(HAVE_MBSINIT) 983 memset(&mbstate, 0, sizeof(mbstate)); 984 ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate); 985 #ifdef __CYGWIN32__ 986 if (!ins_len && *ins) { 987 /* Older Cygwin had broken mbsrtowcs() implementation. */ 988 ins_len = strlen(ins); 989 } 990 #endif 991 #elif !defined(DJGPP) 992 ins_len = mbstowcs(NULL, s, 0); 993 #else 994 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */ 995 ins_len = strlen(ins); 996 #endif 997 if (ins_len == -1) 998 return ins_len; 999 #ifdef HAVE_MBSINIT 1000 if ((s != ins) || !mbsinit(&mbstate)) { 1001 #else 1002 if (s != ins) { 1003 #endif 1004 errno = EILSEQ; 1005 return -1; 1006 } 1007 /* Add the NULL terminator. */ 1008 ins_len++; 1009 ucs_len = ins_len; 1010 ucs = ntfs_malloc(ucs_len * sizeof(ntfschar)); 1011 if (!ucs) 1012 return -1; 1013 #ifdef HAVE_MBSINIT 1014 memset(&mbstate, 0, sizeof(mbstate)); 1015 #else 1016 mbtowc(NULL, NULL, 0); 1017 #endif 1018 for (i = o = cnt = 0; i < ins_size; i += cnt, o++) { 1019 /* Reallocate memory if necessary. */ 1020 if (o >= ucs_len) { 1021 ntfschar *tc; 1022 ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63; 1023 tc = realloc(ucs, ucs_len); 1024 if (!tc) 1025 goto err_out; 1026 ucs = tc; 1027 ucs_len /= sizeof(ntfschar); 1028 } 1029 /* Convert the multibyte character to a wide character. */ 1030 #ifdef HAVE_MBSINIT 1031 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate); 1032 #else 1033 cnt = mbtowc(&wc, ins + i, ins_size - i); 1034 #endif 1035 if (!cnt) 1036 break; 1037 if (cnt == -1) 1038 goto err_out; 1039 if (cnt < -1) { 1040 ntfs_log_trace("Eeek. cnt = %i\n", cnt); 1041 errno = EINVAL; 1042 goto err_out; 1043 } 1044 /* Make sure we are not overflowing the NTFS Unicode set. */ 1045 if ((unsigned long)wc >= (unsigned long)(1 << 1046 (8 * sizeof(ntfschar)))) { 1047 errno = EILSEQ; 1048 goto err_out; 1049 } 1050 /* Convert the CPU wide character to a LE Unicode character. */ 1051 ucs[o] = cpu_to_le16(wc); 1052 } 1053 #ifdef HAVE_MBSINIT 1054 /* Make sure we are back in the initial state. */ 1055 if (!mbsinit(&mbstate)) { 1056 ntfs_log_trace("Eeek. mbstate not in initial state!\n"); 1057 errno = EILSEQ; 1058 goto err_out; 1059 } 1060 #endif 1061 /* Now write the NULL character. */ 1062 ucs[o] = cpu_to_le16(L'\0'); 1063 *outs = ucs; 1064 return o; 1065 err_out: 1066 free(ucs); 1067 #else /* MB_CUR_MAX */ 1068 errno = EILSEQ; 1069 #endif /* MB_CUR_MAX */ 1070 return -1; 1071 } 1072 1073 /* 1074 * Turn a UTF8 name uppercase 1075 * 1076 * Returns an allocated uppercase name which has to be freed by caller 1077 * or NULL if there is an error (described by errno) 1078 */ 1079 1080 char *ntfs_uppercase_mbs(const char *low, 1081 const ntfschar *upcase, u32 upcase_size) 1082 { 1083 int size; 1084 char *upp; 1085 u32 wc; 1086 int n; 1087 const char *s; 1088 char *t; 1089 1090 size = strlen(low); 1091 upp = (char*)ntfs_malloc(3*size + 1); 1092 if (upp) { 1093 s = low; 1094 t = upp; 1095 do { 1096 n = utf8_to_unicode(&wc, s); 1097 if (n > 0) { 1098 if (wc < upcase_size) 1099 wc = le16_to_cpu(upcase[wc]); 1100 if (wc < 0x80) 1101 *t++ = wc; 1102 else if (wc < 0x800) { 1103 *t++ = (0xc0 | ((wc >> 6) & 0x3f)); 1104 *t++ = 0x80 | (wc & 0x3f); 1105 } else if (wc < 0x10000) { 1106 *t++ = 0xe0 | (wc >> 12); 1107 *t++ = 0x80 | ((wc >> 6) & 0x3f); 1108 *t++ = 0x80 | (wc & 0x3f); 1109 } else { 1110 *t++ = 0xf0 | ((wc >> 18) & 7); 1111 *t++ = 0x80 | ((wc >> 12) & 63); 1112 *t++ = 0x80 | ((wc >> 6) & 0x3f); 1113 *t++ = 0x80 | (wc & 0x3f); 1114 } 1115 s += n; 1116 } 1117 } while (n > 0); 1118 if (n < 0) { 1119 free(upp); 1120 upp = (char*)NULL; 1121 errno = EILSEQ; 1122 } 1123 *t = 0; 1124 } 1125 return (upp); 1126 } 1127 1128 /** 1129 * ntfs_upcase_table_build - build the default upcase table for NTFS 1130 * @uc: destination buffer where to store the built table 1131 * @uc_len: size of destination buffer in bytes 1132 * 1133 * ntfs_upcase_table_build() builds the default upcase table for NTFS and 1134 * stores it in the caller supplied buffer @uc of size @uc_len. 1135 * 1136 * Note, @uc_len must be at least 128kiB in size or bad things will happen! 1137 */ 1138 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len) 1139 { 1140 #if 1 /* Vista */ 1141 /* 1142 * This is the table as defined by Vista 1143 */ 1144 /* 1145 * "Start" is inclusive and "End" is exclusive, every value has the 1146 * value of "Add" added to it. 1147 */ 1148 static int uc_run_table[][3] = { /* Start, End, Add */ 1149 {0x0061, 0x007b, -32}, {0x00e0, 0x00f7, -32}, {0x00f8, 0x00ff, -32}, 1150 {0x0256, 0x0258, -205}, {0x028a, 0x028c, -217}, {0x037b, 0x037e, 130}, 1151 {0x03ac, 0x03ad, -38}, {0x03ad, 0x03b0, -37}, {0x03b1, 0x03c2, -32}, 1152 {0x03c2, 0x03c3, -31}, {0x03c3, 0x03cc, -32}, {0x03cc, 0x03cd, -64}, 1153 {0x03cd, 0x03cf, -63}, {0x0430, 0x0450, -32}, {0x0450, 0x0460, -80}, 1154 {0x0561, 0x0587, -48}, {0x1f00, 0x1f08, 8}, {0x1f10, 0x1f16, 8}, 1155 {0x1f20, 0x1f28, 8}, {0x1f30, 0x1f38, 8}, {0x1f40, 0x1f46, 8}, 1156 {0x1f51, 0x1f52, 8}, {0x1f53, 0x1f54, 8}, {0x1f55, 0x1f56, 8}, 1157 {0x1f57, 0x1f58, 8}, {0x1f60, 0x1f68, 8}, {0x1f70, 0x1f72, 74}, 1158 {0x1f72, 0x1f76, 86}, {0x1f76, 0x1f78, 100}, {0x1f78, 0x1f7a, 128}, 1159 {0x1f7a, 0x1f7c, 112}, {0x1f7c, 0x1f7e, 126}, {0x1f80, 0x1f88, 8}, 1160 {0x1f90, 0x1f98, 8}, {0x1fa0, 0x1fa8, 8}, {0x1fb0, 0x1fb2, 8}, 1161 {0x1fb3, 0x1fb4, 9}, {0x1fcc, 0x1fcd, -9}, {0x1fd0, 0x1fd2, 8}, 1162 {0x1fe0, 0x1fe2, 8}, {0x1fe5, 0x1fe6, 7}, {0x1ffc, 0x1ffd, -9}, 1163 {0x2170, 0x2180, -16}, {0x24d0, 0x24ea, -26}, {0x2c30, 0x2c5f, -48}, 1164 {0x2d00, 0x2d26, -7264}, {0xff41, 0xff5b, -32}, {0} 1165 }; 1166 /* 1167 * "Start" is exclusive and "End" is inclusive, every second value is 1168 * decremented by one. 1169 */ 1170 static int uc_dup_table[][2] = { /* Start, End */ 1171 {0x0100, 0x012f}, {0x0132, 0x0137}, {0x0139, 0x0149}, {0x014a, 0x0178}, 1172 {0x0179, 0x017e}, {0x01a0, 0x01a6}, {0x01b3, 0x01b7}, {0x01cd, 0x01dd}, 1173 {0x01de, 0x01ef}, {0x01f4, 0x01f5}, {0x01f8, 0x01f9}, {0x01fa, 0x0220}, 1174 {0x0222, 0x0234}, {0x023b, 0x023c}, {0x0241, 0x0242}, {0x0246, 0x024f}, 1175 {0x03d8, 0x03ef}, {0x03f7, 0x03f8}, {0x03fa, 0x03fb}, {0x0460, 0x0481}, 1176 {0x048a, 0x04bf}, {0x04c1, 0x04c4}, {0x04c5, 0x04c8}, {0x04c9, 0x04ce}, 1177 {0x04ec, 0x04ed}, {0x04d0, 0x04eb}, {0x04ee, 0x04f5}, {0x04f6, 0x0513}, 1178 {0x1e00, 0x1e95}, {0x1ea0, 0x1ef9}, {0x2183, 0x2184}, {0x2c60, 0x2c61}, 1179 {0x2c67, 0x2c6c}, {0x2c75, 0x2c76}, {0x2c80, 0x2ce3}, {0} 1180 }; 1181 /* 1182 * Set the Unicode character at offset "Offset" to "Value". Note, 1183 * "Value" is host endian. 1184 */ 1185 static int uc_byte_table[][2] = { /* Offset, Value */ 1186 {0x00ff, 0x0178}, {0x0180, 0x0243}, {0x0183, 0x0182}, {0x0185, 0x0184}, 1187 {0x0188, 0x0187}, {0x018c, 0x018b}, {0x0192, 0x0191}, {0x0195, 0x01f6}, 1188 {0x0199, 0x0198}, {0x019a, 0x023d}, {0x019e, 0x0220}, {0x01a8, 0x01a7}, 1189 {0x01ad, 0x01ac}, {0x01b0, 0x01af}, {0x01b9, 0x01b8}, {0x01bd, 0x01bc}, 1190 {0x01bf, 0x01f7}, {0x01c6, 0x01c4}, {0x01c9, 0x01c7}, {0x01cc, 0x01ca}, 1191 {0x01dd, 0x018e}, {0x01f3, 0x01f1}, {0x023a, 0x2c65}, {0x023e, 0x2c66}, 1192 {0x0253, 0x0181}, {0x0254, 0x0186}, {0x0259, 0x018f}, {0x025b, 0x0190}, 1193 {0x0260, 0x0193}, {0x0263, 0x0194}, {0x0268, 0x0197}, {0x0269, 0x0196}, 1194 {0x026b, 0x2c62}, {0x026f, 0x019c}, {0x0272, 0x019d}, {0x0275, 0x019f}, 1195 {0x027d, 0x2c64}, {0x0280, 0x01a6}, {0x0283, 0x01a9}, {0x0288, 0x01ae}, 1196 {0x0289, 0x0244}, {0x028c, 0x0245}, {0x0292, 0x01b7}, {0x03f2, 0x03f9}, 1197 {0x04cf, 0x04c0}, {0x1d7d, 0x2c63}, {0x214e, 0x2132}, {0} 1198 }; 1199 #else /* Vista */ 1200 /* 1201 * This is the table as defined by Windows XP 1202 */ 1203 static int uc_run_table[][3] = { /* Start, End, Add */ 1204 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, 1205 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, 1206 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, 1207 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, 1208 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, 1209 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, 1210 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, 1211 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, 1212 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, 1213 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, 1214 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, 1215 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, 1216 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, 1217 {0} 1218 }; 1219 static int uc_dup_table[][2] = { /* Start, End */ 1220 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, 1221 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, 1222 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, 1223 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, 1224 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, 1225 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, 1226 {0} 1227 }; 1228 static int uc_byte_table[][2] = { /* Offset, Value */ 1229 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, 1230 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, 1231 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, 1232 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, 1233 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, 1234 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, 1235 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, 1236 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, 1237 {0} 1238 }; 1239 #endif /* Vista */ 1240 int i, r; 1241 int k, off; 1242 1243 memset((char*)uc, 0, uc_len); 1244 uc_len >>= 1; 1245 if (uc_len > 65536) 1246 uc_len = 65536; 1247 for (i = 0; (u32)i < uc_len; i++) 1248 uc[i] = cpu_to_le16(i); 1249 for (r = 0; uc_run_table[r][0]; r++) { 1250 off = uc_run_table[r][2]; 1251 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) 1252 uc[i] = cpu_to_le16(i + off); 1253 } 1254 for (r = 0; uc_dup_table[r][0]; r++) 1255 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) 1256 uc[i + 1] = cpu_to_le16(i); 1257 for (r = 0; uc_byte_table[r][0]; r++) { 1258 k = uc_byte_table[r][1]; 1259 uc[uc_byte_table[r][0]] = cpu_to_le16(k); 1260 } 1261 } 1262 1263 /* 1264 * Allocate and build the default upcase table 1265 * 1266 * Returns the number of entries 1267 * 0 if failed 1268 */ 1269 1270 #define UPCASE_LEN 65536 /* default number of entries in upcase */ 1271 1272 u32 ntfs_upcase_build_default(ntfschar **upcase) 1273 { 1274 u32 upcase_len = 0; 1275 1276 *upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2); 1277 if (*upcase) { 1278 ntfs_upcase_table_build(*upcase, UPCASE_LEN*2); 1279 upcase_len = UPCASE_LEN; 1280 } 1281 return (upcase_len); 1282 } 1283 1284 /* 1285 * Build a table for converting to lower case 1286 * 1287 * This is only meaningful when there is a single lower case 1288 * character leading to an upper case one, and currently the 1289 * only exception is the greek letter sigma which has a single 1290 * upper case glyph (code U+03A3), but two lower case glyphs 1291 * (code U+03C3 and U+03C2, the latter to be used at the end 1292 * of a word). In the following implementation the upper case 1293 * sigma will be lowercased as U+03C3. 1294 */ 1295 1296 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt) 1297 { 1298 ntfschar *lc; 1299 u32 upp; 1300 u32 i; 1301 1302 lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar)); 1303 if (lc) { 1304 for (i=0; i<uc_cnt; i++) 1305 lc[i] = cpu_to_le16(i); 1306 for (i=0; i<uc_cnt; i++) { 1307 upp = le16_to_cpu(uc[i]); 1308 if ((upp != i) && (upp < uc_cnt)) 1309 lc[upp] = cpu_to_le16(i); 1310 } 1311 } else 1312 ntfs_log_error("Could not build the locase table\n"); 1313 return (lc); 1314 } 1315 1316 /** 1317 * ntfs_str2ucs - convert a string to a valid NTFS file name 1318 * @s: input string 1319 * @len: length of output buffer in Unicode characters 1320 * 1321 * Convert the input @s string into the corresponding little endian, 1322 * 2-byte Unicode string. The length of the converted string is less 1323 * or equal to the maximum length allowed by the NTFS format (255). 1324 * 1325 * If @s is NULL then return AT_UNNAMED. 1326 * 1327 * On success the function returns the Unicode string in an allocated 1328 * buffer and the caller is responsible to free it when it's not needed 1329 * anymore. 1330 * 1331 * On error NULL is returned and errno is set to the error code. 1332 */ 1333 ntfschar *ntfs_str2ucs(const char *s, int *len) 1334 { 1335 ntfschar *ucs = NULL; 1336 1337 if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) { 1338 ntfs_log_perror("Couldn't convert '%s' to Unicode", s); 1339 return NULL; 1340 } 1341 if (*len > NTFS_MAX_NAME_LEN) { 1342 free(ucs); 1343 errno = ENAMETOOLONG; 1344 return NULL; 1345 } 1346 if (!ucs || !*len) { 1347 ucs = AT_UNNAMED; 1348 *len = 0; 1349 } 1350 return ucs; 1351 } 1352 1353 /** 1354 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs() 1355 * @ucs input string to be freed 1356 * 1357 * Free memory at @ucs and which was allocated by ntfs_str2ucs. 1358 * 1359 * Return value: none. 1360 */ 1361 void ntfs_ucsfree(ntfschar *ucs) 1362 { 1363 if (ucs && (ucs != AT_UNNAMED)) 1364 free(ucs); 1365 } 1366 1367 /* 1368 * Check whether a name contains no chars forbidden 1369 * for DOS or Win32 use 1370 * 1371 * If there is a bad char, errno is set to EINVAL 1372 */ 1373 1374 BOOL ntfs_forbidden_chars(const ntfschar *name, int len) 1375 { 1376 BOOL forbidden; 1377 int ch; 1378 int i; 1379 u32 mainset = (1L << ('\"' - 0x20)) 1380 | (1L << ('*' - 0x20)) 1381 | (1L << ('/' - 0x20)) 1382 | (1L << (':' - 0x20)) 1383 | (1L << ('<' - 0x20)) 1384 | (1L << ('>' - 0x20)) 1385 | (1L << ('?' - 0x20)); 1386 1387 forbidden = (len == 0) 1388 || (le16_to_cpu(name[len-1]) == ' ') 1389 || (le16_to_cpu(name[len-1]) == '.'); 1390 for (i=0; i<len; i++) { 1391 ch = le16_to_cpu(name[i]); 1392 if ((ch < 0x20) 1393 || ((ch < 0x40) 1394 && ((1L << (ch - 0x20)) & mainset)) 1395 || (ch == '\\') 1396 || (ch == '|')) 1397 forbidden = TRUE; 1398 } 1399 if (forbidden) 1400 errno = EINVAL; 1401 return (forbidden); 1402 } 1403 1404 /* 1405 * Check whether the same name can be used as a DOS and 1406 * a Win32 name 1407 * 1408 * The names must be the same, or the short name the uppercase 1409 * variant of the long name 1410 */ 1411 1412 BOOL ntfs_collapsible_chars(ntfs_volume *vol, 1413 const ntfschar *shortname, int shortlen, 1414 const ntfschar *longname, int longlen) 1415 { 1416 BOOL collapsible; 1417 unsigned int ch; 1418 unsigned int cs; 1419 int i; 1420 1421 collapsible = shortlen == longlen; 1422 for (i=0; collapsible && (i<shortlen); i++) { 1423 ch = le16_to_cpu(longname[i]); 1424 cs = le16_to_cpu(shortname[i]); 1425 if ((cs != ch) 1426 && ((ch >= vol->upcase_len) 1427 || (cs >= vol->upcase_len) 1428 || (vol->upcase[cs] != vol->upcase[ch]))) 1429 collapsible = FALSE; 1430 } 1431 return (collapsible); 1432 } 1433 1434 /* 1435 * Define the character encoding to be used. 1436 * Use UTF-8 unless specified otherwise. 1437 */ 1438 1439 int ntfs_set_char_encoding(const char *locale) 1440 { 1441 use_utf8 = 0; 1442 if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8") 1443 || strstr(locale,"utf-8") || strstr(locale,"UTF-8")) 1444 use_utf8 = 1; 1445 else 1446 if (setlocale(LC_ALL, locale)) 1447 use_utf8 = 0; 1448 else { 1449 ntfs_log_error("Invalid locale, encoding to UTF-8\n"); 1450 use_utf8 = 1; 1451 } 1452 return 0; /* always successful */ 1453 } 1454 1455 #if defined(__APPLE__) || defined(__DARWIN__) 1456 1457 int ntfs_macosx_normalize_filenames(int normalize) { 1458 #ifdef ENABLE_NFCONV 1459 if(normalize == 0 || normalize == 1) { 1460 nfconvert_utf8 = normalize; 1461 return 0; 1462 } 1463 else 1464 return -1; 1465 #else 1466 return -1; 1467 #endif /* ENABLE_NFCONV */ 1468 } 1469 1470 int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target, 1471 int composed) { 1472 #ifdef ENABLE_NFCONV 1473 /* For this code to compile, the CoreFoundation framework must be fed to the linker. */ 1474 CFStringRef cfSourceString; 1475 CFMutableStringRef cfMutableString; 1476 CFRange rangeToProcess; 1477 CFIndex requiredBufferLength; 1478 char *result = NULL; 1479 int resultLength = -1; 1480 1481 /* Convert the UTF-8 string to a CFString. */ 1482 cfSourceString = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8); 1483 if(cfSourceString == NULL) { 1484 ntfs_log_error("CFStringCreateWithCString failed!\n"); 1485 return -2; 1486 } 1487 1488 /* Create a mutable string from cfSourceString that we are free to modify. */ 1489 cfMutableString = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, cfSourceString); 1490 CFRelease(cfSourceString); /* End-of-life. */ 1491 if(cfMutableString == NULL) { 1492 ntfs_log_error("CFStringCreateMutableCopy failed!\n"); 1493 return -3; 1494 } 1495 1496 /* Normalize the mutable string to the desired normalization form. */ 1497 CFStringNormalize(cfMutableString, (composed != 0 ? kCFStringNormalizationFormC : kCFStringNormalizationFormD)); 1498 1499 /* Store the resulting string in a '\0'-terminated UTF-8 encoded char* buffer. */ 1500 rangeToProcess = CFRangeMake(0, CFStringGetLength(cfMutableString)); 1501 if(CFStringGetBytes(cfMutableString, rangeToProcess, kCFStringEncodingUTF8, 0, false, NULL, 0, &requiredBufferLength) > 0) { 1502 resultLength = sizeof(char)*(requiredBufferLength + 1); 1503 result = ntfs_calloc(resultLength); 1504 1505 if(result != NULL) { 1506 if(CFStringGetBytes(cfMutableString, rangeToProcess, kCFStringEncodingUTF8, 1507 0, false, (UInt8*)result, resultLength-1, &requiredBufferLength) <= 0) { 1508 ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n"); 1509 free(result); 1510 result = NULL; 1511 } 1512 } 1513 else 1514 ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", resultLength); 1515 } 1516 else 1517 ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n"); 1518 1519 1520 CFRelease(cfMutableString); 1521 1522 if(result != NULL) { 1523 *target = result; 1524 return resultLength - 1; 1525 } 1526 else 1527 return -1; 1528 #else 1529 return -1; 1530 #endif /* ENABLE_NFCONV */ 1531 } 1532 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 1533