1 /** 2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project. 3 * 4 * Copyright (c) 2000-2004 Anton Altaparmakov 5 * Copyright (c) 2002-2009 Szabolcs Szakacsits 6 * Copyright (c) 2008-2015 Jean-Pierre Andre 7 * Copyright (c) 2008 Bernhard Kaindl 8 * 9 * This program/include file is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License as published 11 * by the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * This program/include file is distributed in the hope that it will be 15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty 16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 * 19 * You should have received a copy of the GNU General Public License 20 * along with this program (in the main directory of the NTFS-3G 21 * distribution in the file COPYING); if not, write to the Free Software 22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 */ 24 25 #ifdef HAVE_CONFIG_H 26 #include "config.h" 27 #endif 28 29 #ifdef HAVE_STDIO_H 30 #include <stdio.h> 31 #endif 32 #ifdef HAVE_STDLIB_H 33 #include <stdlib.h> 34 #endif 35 #ifdef HAVE_WCHAR_H 36 #include <wchar.h> 37 #endif 38 #ifdef HAVE_STRING_H 39 #include <string.h> 40 #endif 41 #ifdef HAVE_ERRNO_H 42 #include <errno.h> 43 #endif 44 #ifdef HAVE_LOCALE_H 45 #include <locale.h> 46 #endif 47 48 #if defined(__APPLE__) || defined(__DARWIN__) 49 #ifdef ENABLE_NFCONV 50 #include <CoreFoundation/CoreFoundation.h> 51 #endif /* ENABLE_NFCONV */ 52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 53 54 #include "compat.h" 55 #include "attrib.h" 56 #include "types.h" 57 #include "unistr.h" 58 #include "debug.h" 59 #include "logging.h" 60 #include "misc.h" 61 62 #ifndef ALLOW_BROKEN_UNICODE 63 /* Erik allowing broken UTF-16 surrogate pairs and U+FFFE and U+FFFF by 
default,
 * open to debate. */
#define ALLOW_BROKEN_UNICODE 1
#endif /* !defined(ALLOW_BROKEN_UNICODE) */

/*
 * When ALLOW_BROKEN_UNICODE is non-zero, the UTF-16 <-> UTF-8 converters in
 * this file encode unpaired surrogates and U+FFFE/U+FFFF as ordinary
 * three-byte sequences instead of failing with EILSEQ.
 */

/*
 * IMPORTANT
 * =========
 *
 * All these routines assume that the Unicode characters are in little endian
 * encoding inside the strings!!!
 */

/*
 * When non-zero, ntfs_ucstombs() and ntfs_mbstoucs() skip the locale based
 * conversion and use the built-in UTF-16LE <-> UTF-8 converters.
 */
static int use_utf8 = 1; /* use UTF-8 encoding for file names */

#if defined(__APPLE__) || defined(__DARWIN__)
#ifdef ENABLE_NFCONV
/**
 * This variable controls whether or not automatic normalization form conversion
 * should be performed when translating NTFS unicode file names to UTF-8.
 * Defaults to on, but can be controlled from the outside using the function
 *   int ntfs_macosx_normalize_filenames(int normalize);
 */
static int nfconvert_utf8 = 1;
#endif /* ENABLE_NFCONV */
#endif /* defined(__APPLE__) || defined(__DARWIN__) */

/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
 *
 * NOTE: the table is compiled out by the "#if 0" guard below.
 */
#if 0
static const u8 legal_ansi_char_array[0x40] = {
	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,

	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
};
#endif

/**
 * ntfs_names_are_equal - compare two Unicode names for equality
 * @s1:			name to compare to @s2
 * @s1_len:		length in Unicode characters of @s1
 * @s2:			name to compare to @s1
 * @s2_len:		length in Unicode characters of @s2
 * @ic:			ignore case bool
 * @upcase:		upcase table (only if @ic == IGNORE_CASE)
 * @upcase_size:	length in Unicode characters of @upcase (if present)
 *
 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
 * identical, or FALSE (0)
if they are not identical. If @ic is IGNORE_CASE,
 * the @upcase table is used to perform a case insensitive comparison.
 */
BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
		const ntfschar *s2, size_t s2_len,
		const IGNORE_CASE_BOOL ic,
		const ntfschar *upcase, const u32 upcase_size)
{
	int diff;

	/* Names of different length can never be equal. */
	if (s1_len != s2_len)
		return FALSE;
	/* Two empty names are trivially equal. */
	if (s1_len == 0)
		return TRUE;

	if (ic == CASE_SENSITIVE)
		diff = ntfs_ucsncmp(s1, s2, s1_len);
	else
		diff = ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);

	return (diff == 0) ? TRUE : FALSE;
}

/*
 * ntfs_names_full_collate() fully collate two Unicode names
 *
 * @name1:	first Unicode name to compare
 * @name1_len:	length of first Unicode name to compare
 * @name2:	second Unicode name to compare
 * @name2_len:	length of second Unicode name to compare
 * @ic:		either CASE_SENSITIVE or IGNORE_CASE (see below)
 * @upcase:	upcase table
 * @upcase_len:	upcase table size
 *
 * If @ic is CASE_SENSITIVE, then the names are compared primarily ignoring
 * case, but if the names are equal ignoring case, then they are compared
 * case-sensitively.  As an example, "abc" would collate before "BCD" (since
 * "abc" and "BCD" differ ignoring case and 'A' < 'B') but after "ABC" (since
 * "ABC" and "abc" are equal ignoring case and 'A' < 'a').  This matches the
 * collation order of filenames as indexed in NTFS directories.
 *
 * If @ic is IGNORE_CASE, then the names are only compared case-insensitively
 * and are considered to match if and only if they are equal ignoring case.
159 * 160 * Returns: 161 * -1 if the first name collates before the second one, 162 * 0 if the names match, or 163 * 1 if the second name collates before the first one 164 */ 165 int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len, 166 const ntfschar *name2, const u32 name2_len, 167 const IGNORE_CASE_BOOL ic, const ntfschar *upcase, 168 const u32 upcase_len) 169 { 170 u32 cnt; 171 u16 c1, c2; 172 u16 u1, u2; 173 174 #ifdef DEBUG 175 if (!name1 || !name2 || !upcase || !upcase_len) { 176 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n"); 177 exit(1); 178 } 179 #endif 180 cnt = min(name1_len, name2_len); 181 if (cnt > 0) { 182 if (ic == CASE_SENSITIVE) { 183 while (--cnt && (*name1 == *name2)) { 184 name1++; 185 name2++; 186 } 187 u1 = c1 = le16_to_cpu(*name1); 188 u2 = c2 = le16_to_cpu(*name2); 189 if (u1 < upcase_len) 190 u1 = le16_to_cpu(upcase[u1]); 191 if (u2 < upcase_len) 192 u2 = le16_to_cpu(upcase[u2]); 193 if ((u1 == u2) && cnt) 194 do { 195 name1++; 196 u1 = le16_to_cpu(*name1); 197 name2++; 198 u2 = le16_to_cpu(*name2); 199 if (u1 < upcase_len) 200 u1 = le16_to_cpu(upcase[u1]); 201 if (u2 < upcase_len) 202 u2 = le16_to_cpu(upcase[u2]); 203 } while ((u1 == u2) && --cnt); 204 if (u1 < u2) 205 return -1; 206 if (u1 > u2) 207 return 1; 208 if (name1_len < name2_len) 209 return -1; 210 if (name1_len > name2_len) 211 return 1; 212 if (c1 < c2) 213 return -1; 214 if (c1 > c2) 215 return 1; 216 } else { 217 do { 218 u1 = le16_to_cpu(*name1); 219 name1++; 220 u2 = le16_to_cpu(*name2); 221 name2++; 222 if (u1 < upcase_len) 223 u1 = le16_to_cpu(upcase[u1]); 224 if (u2 < upcase_len) 225 u2 = le16_to_cpu(upcase[u2]); 226 } while ((u1 == u2) && --cnt); 227 if (u1 < u2) 228 return -1; 229 if (u1 > u2) 230 return 1; 231 if (name1_len < name2_len) 232 return -1; 233 if (name1_len > name2_len) 234 return 1; 235 } 236 } else { 237 if (name1_len < name2_len) 238 return -1; 239 if (name1_len > name2_len) 240 return 1; 241 } 242 return 0; 243 } 244 
245 /** 246 * ntfs_ucsncmp - compare two little endian Unicode strings 247 * @s1: first string 248 * @s2: second string 249 * @n: maximum unicode characters to compare 250 * 251 * Compare the first @n characters of the Unicode strings @s1 and @s2, 252 * The strings in little endian format and appropriate le16_to_cpu() 253 * conversion is performed on non-little endian machines. 254 * 255 * The function returns an integer less than, equal to, or greater than zero 256 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 257 * to be less than, to match, or be greater than @s2. 258 */ 259 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) 260 { 261 u16 c1, c2; 262 size_t i; 263 264 #ifdef DEBUG 265 if (!s1 || !s2) { 266 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n"); 267 exit(1); 268 } 269 #endif 270 for (i = 0; i < n; ++i) { 271 c1 = le16_to_cpu(s1[i]); 272 c2 = le16_to_cpu(s2[i]); 273 if (c1 < c2) 274 return -1; 275 if (c1 > c2) 276 return 1; 277 if (!c1) 278 break; 279 } 280 return 0; 281 } 282 283 /** 284 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case 285 * @s1: first string 286 * @s2: second string 287 * @n: maximum unicode characters to compare 288 * @upcase: upcase table 289 * @upcase_size: upcase table size in Unicode characters 290 * 291 * Compare the first @n characters of the Unicode strings @s1 and @s2, 292 * ignoring case. The strings in little endian format and appropriate 293 * le16_to_cpu() conversion is performed on non-little endian machines. 294 * 295 * Each character is uppercased using the @upcase table before the comparison. 296 * 297 * The function returns an integer less than, equal to, or greater than zero 298 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 299 * to be less than, to match, or be greater than @s2. 
300 */ 301 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, 302 const ntfschar *upcase, const u32 upcase_size) 303 { 304 u16 c1, c2; 305 size_t i; 306 307 #ifdef DEBUG 308 if (!s1 || !s2 || !upcase) { 309 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n"); 310 exit(1); 311 } 312 #endif 313 for (i = 0; i < n; ++i) { 314 if ((c1 = le16_to_cpu(s1[i])) < upcase_size) 315 c1 = le16_to_cpu(upcase[c1]); 316 if ((c2 = le16_to_cpu(s2[i])) < upcase_size) 317 c2 = le16_to_cpu(upcase[c2]); 318 if (c1 < c2) 319 return -1; 320 if (c1 > c2) 321 return 1; 322 if (!c1) 323 break; 324 } 325 return 0; 326 } 327 328 /** 329 * ntfs_ucsnlen - determine the length of a little endian Unicode string 330 * @s: pointer to Unicode string 331 * @maxlen: maximum length of string @s 332 * 333 * Return the number of Unicode characters in the little endian Unicode 334 * string @s up to a maximum of maxlen Unicode characters, not including 335 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s 336 * and @s + @maxlen, @maxlen is returned. 337 * 338 * This function never looks beyond @s + @maxlen. 339 */ 340 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen) 341 { 342 u32 i; 343 344 for (i = 0; i < maxlen; i++) { 345 if (!le16_to_cpu(s[i])) 346 break; 347 } 348 return i; 349 } 350 351 /** 352 * ntfs_ucsndup - duplicate little endian Unicode string 353 * @s: pointer to Unicode string 354 * @maxlen: maximum length of string @s 355 * 356 * Return a pointer to a new little endian Unicode string which is a duplicate 357 * of the string s. Memory for the new string is obtained with ntfs_malloc(3), 358 * and can be freed with free(3). 359 * 360 * A maximum of @maxlen Unicode characters are copied and a terminating 361 * (ntfschar)'\0' little endian Unicode character is added. 362 * 363 * This function never looks beyond @s + @maxlen. 
364 * 365 * Return a pointer to the new little endian Unicode string on success and NULL 366 * on failure with errno set to the error code. 367 */ 368 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen) 369 { 370 ntfschar *dst; 371 u32 len; 372 373 len = ntfs_ucsnlen(s, maxlen); 374 dst = ntfs_malloc((len + 1) * sizeof(ntfschar)); 375 if (dst) { 376 memcpy(dst, s, len * sizeof(ntfschar)); 377 dst[len] = const_cpu_to_le16(L'\0'); 378 } 379 return dst; 380 } 381 382 /** 383 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent 384 * @name: 385 * @name_len: 386 * @upcase: 387 * @upcase_len: 388 * 389 * Description... 390 * 391 * Returns: 392 */ 393 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase, 394 const u32 upcase_len) 395 { 396 u32 i; 397 u16 u; 398 399 for (i = 0; i < name_len; i++) 400 if ((u = le16_to_cpu(name[i])) < upcase_len) 401 name[i] = upcase[u]; 402 } 403 404 /** 405 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent 406 */ 407 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase, 408 const u32 locase_len) 409 { 410 u32 i; 411 u16 u; 412 413 if (locase) 414 for (i = 0; i < name_len; i++) 415 if ((u = le16_to_cpu(name[i])) < locase_len) 416 name[i] = locase[u]; 417 } 418 419 /** 420 * ntfs_file_value_upcase - Convert a filename to upper case 421 * @file_name_attr: 422 * @upcase: 423 * @upcase_len: 424 * 425 * Description... 426 * 427 * Returns: 428 */ 429 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr, 430 const ntfschar *upcase, const u32 upcase_len) 431 { 432 ntfs_name_upcase((ntfschar*)&file_name_attr->file_name, 433 file_name_attr->file_name_length, upcase, upcase_len); 434 } 435 436 /* 437 NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough 438 for now]) for path names, but the Unicode code points need to be 439 converted before a path can be accessed under NTFS. 
For 7 bit ASCII/ANSI,
glibc does this even without a locale in a hard-coded fashion as that
appears to be easy because the low 7-bit ASCII range appears to be
available in all charsets, but it does not convert anything if
there was some error with the locale setup or none was set up, like
when mount is called during early boot where, by policy, locales are
not used (and may not be available if /usr is not yet mounted),
so this patch fixes the resulting issues for systems which use
UTF-8 and for others, specifying the locale in fstab brings them
the encoding which they want.

If no locale is defined or there was a problem with setting one
up and whenever nl_langinfo(CODESET) returns a string starting with
"ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
the bug where NTFS-3G does not show any path names which include
international characters!!! (and also fails on creating them) as a result.

Author: Bernhard Kaindl <bk@suse.de>
Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/

/*
 * Return the number of bytes in UTF-8 needed (without the terminating null) to
 * store the given UTF-16LE string.
 *
 * On error, -1 is returned, and errno is set to the error code. The following
 * error codes can be expected:
 *	EILSEQ		The input string is not valid UTF-16LE (only possible
 *			if compiled without ALLOW_BROKEN_UNICODE).
 *	ENAMETOOLONG	The length of the UTF-8 string in bytes (without the
 *			terminating null) would exceed @outs_len.
470 */ 471 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len) 472 { 473 int i, ret = -1; 474 int count = 0; 475 BOOL surrog; 476 477 surrog = FALSE; 478 for (i = 0; i < ins_len && ins[i] && count <= outs_len; i++) { 479 unsigned short c = le16_to_cpu(ins[i]); 480 if (surrog) { 481 if ((c >= 0xdc00) && (c < 0xe000)) { 482 surrog = FALSE; 483 count += 4; 484 } else { 485 #if ALLOW_BROKEN_UNICODE 486 /* The first UTF-16 unit of a surrogate pair has 487 * a value between 0xd800 and 0xdc00. It can be 488 * encoded as an individual UTF-8 sequence if we 489 * cannot combine it with the next UTF-16 unit 490 * unit as a surrogate pair. */ 491 surrog = FALSE; 492 count += 3; 493 494 --i; 495 continue; 496 #else 497 goto fail; 498 #endif /* ALLOW_BROKEN_UNICODE */ 499 } 500 } else 501 if (c < 0x80) 502 count++; 503 else if (c < 0x800) 504 count += 2; 505 else if (c < 0xd800) 506 count += 3; 507 else if (c < 0xdc00) 508 surrog = TRUE; 509 #if ALLOW_BROKEN_UNICODE 510 else if (c < 0xe000) 511 count += 3; 512 else if (c >= 0xe000) 513 #else 514 else if ((c >= 0xe000) && (c < 0xfffe)) 515 #endif /* ALLOW_BROKEN_UNICODE */ 516 count += 3; 517 else 518 goto fail; 519 } 520 521 if (surrog && count <= outs_len) { 522 #if ALLOW_BROKEN_UNICODE 523 count += 3; /* ending with a single surrogate */ 524 #else 525 goto fail; 526 #endif /* ALLOW_BROKEN_UNICODE */ 527 } 528 529 if (count > outs_len) { 530 errno = ENAMETOOLONG; 531 goto out; 532 } 533 534 ret = count; 535 out: 536 return ret; 537 fail: 538 errno = EILSEQ; 539 goto out; 540 } 541 542 /* 543 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string 544 * @ins: input utf16 string buffer 545 * @ins_len: length of input string in utf16 characters 546 * @outs: on return contains the (allocated) output multibyte string 547 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL) 548 * 549 * Return -1 with errno set if string has invalid byte sequence or too 
long. 550 */ 551 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, 552 char **outs, int outs_len) 553 { 554 #if defined(__APPLE__) || defined(__DARWIN__) 555 #ifdef ENABLE_NFCONV 556 char *original_outs_value = *outs; 557 int original_outs_len = outs_len; 558 #endif /* ENABLE_NFCONV */ 559 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 560 561 char *t; 562 int i, size, ret = -1; 563 int halfpair; 564 565 halfpair = 0; 566 if (!*outs) { 567 /* If no output buffer was provided, we will allocate one and 568 * limit its length to PATH_MAX. Note: we follow the standard 569 * convention of PATH_MAX including the terminating null. */ 570 outs_len = PATH_MAX; 571 } 572 573 /* The size *with* the terminating null is limited to @outs_len, 574 * so the size *without* the terminating null is limited to one less. */ 575 size = utf16_to_utf8_size(ins, ins_len, outs_len - 1); 576 577 if (size < 0) 578 goto out; 579 580 if (!*outs) { 581 outs_len = size + 1; 582 *outs = ntfs_malloc(outs_len); 583 if (!*outs) 584 goto out; 585 } 586 587 t = *outs; 588 589 for (i = 0; i < ins_len && ins[i]; i++) { 590 unsigned short c = le16_to_cpu(ins[i]); 591 /* size not double-checked */ 592 if (halfpair) { 593 if ((c >= 0xdc00) && (c < 0xe000)) { 594 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7); 595 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63); 596 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4); 597 *t++ = 0x80 + (c & 63); 598 halfpair = 0; 599 } else { 600 #if ALLOW_BROKEN_UNICODE 601 /* The first UTF-16 unit of a surrogate pair has 602 * a value between 0xd800 and 0xdc00. It can be 603 * encoded as an individual UTF-8 sequence if we 604 * cannot combine it with the next UTF-16 unit 605 * unit as a surrogate pair. 
*/ 606 *t++ = 0xe0 | (halfpair >> 12); 607 *t++ = 0x80 | ((halfpair >> 6) & 0x3f); 608 *t++ = 0x80 | (halfpair & 0x3f); 609 halfpair = 0; 610 611 --i; 612 continue; 613 #else 614 goto fail; 615 #endif /* ALLOW_BROKEN_UNICODE */ 616 } 617 } else if (c < 0x80) { 618 *t++ = c; 619 } else { 620 if (c < 0x800) { 621 *t++ = (0xc0 | ((c >> 6) & 0x3f)); 622 *t++ = 0x80 | (c & 0x3f); 623 } else if (c < 0xd800) { 624 *t++ = 0xe0 | (c >> 12); 625 *t++ = 0x80 | ((c >> 6) & 0x3f); 626 *t++ = 0x80 | (c & 0x3f); 627 } else if (c < 0xdc00) 628 halfpair = c; 629 #if ALLOW_BROKEN_UNICODE 630 else if (c < 0xe000) { 631 *t++ = 0xe0 | (c >> 12); 632 *t++ = 0x80 | ((c >> 6) & 0x3f); 633 *t++ = 0x80 | (c & 0x3f); 634 } 635 #endif /* ALLOW_BROKEN_UNICODE */ 636 else if (c >= 0xe000) { 637 *t++ = 0xe0 | (c >> 12); 638 *t++ = 0x80 | ((c >> 6) & 0x3f); 639 *t++ = 0x80 | (c & 0x3f); 640 } else 641 goto fail; 642 } 643 } 644 #if ALLOW_BROKEN_UNICODE 645 if (halfpair) { /* ending with a single surrogate */ 646 *t++ = 0xe0 | (halfpair >> 12); 647 *t++ = 0x80 | ((halfpair >> 6) & 0x3f); 648 *t++ = 0x80 | (halfpair & 0x3f); 649 } 650 #endif /* ALLOW_BROKEN_UNICODE */ 651 *t = '\0'; 652 653 #if defined(__APPLE__) || defined(__DARWIN__) 654 #ifdef ENABLE_NFCONV 655 if(nfconvert_utf8 && (t - *outs) > 0) { 656 char *new_outs = NULL; 657 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form 658 if(new_outs_len >= 0 && new_outs != NULL) { 659 if(original_outs_value != *outs) { 660 // We have allocated outs ourselves. 661 free(*outs); 662 *outs = new_outs; 663 t = *outs + new_outs_len; 664 } 665 else { 666 // We need to copy new_outs into the fixed outs buffer. 
667 memset(*outs, 0, original_outs_len); 668 strncpy(*outs, new_outs, original_outs_len-1); 669 t = *outs + original_outs_len; 670 free(new_outs); 671 } 672 } 673 else { 674 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs); 675 ntfs_log_error(" new_outs=0x%p\n", new_outs); 676 ntfs_log_error(" new_outs_len=%d\n", new_outs_len); 677 } 678 } 679 #endif /* ENABLE_NFCONV */ 680 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 681 682 ret = t - *outs; 683 out: 684 return ret; 685 fail: 686 errno = EILSEQ; 687 goto out; 688 } 689 690 /* 691 * Return the amount of 16-bit elements in UTF-16LE needed 692 * (without the terminating null) to store given UTF-8 string. 693 * 694 * Return -1 with errno set if it's longer than PATH_MAX or string is invalid. 695 * 696 * Note: This does not check whether the input sequence is a valid utf8 string, 697 * and should be used only in context where such check is made! 698 */ 699 static int utf8_to_utf16_size(const char *s) 700 { 701 int ret = -1; 702 unsigned int byte; 703 size_t count = 0; 704 705 while ((byte = *((const unsigned char *)s++))) { 706 if (++count >= PATH_MAX) 707 goto fail; 708 if (byte >= 0xc0) { 709 if (byte >= 0xF5) { 710 errno = EILSEQ; 711 goto out; 712 } 713 if (!*s) 714 break; 715 if (byte >= 0xC0) 716 s++; 717 if (!*s) 718 break; 719 if (byte >= 0xE0) 720 s++; 721 if (!*s) 722 break; 723 if (byte >= 0xF0) { 724 s++; 725 if (++count >= PATH_MAX) 726 goto fail; 727 } 728 } 729 } 730 ret = count; 731 out: 732 return ret; 733 fail: 734 errno = ENAMETOOLONG; 735 goto out; 736 } 737 /* 738 * This converts one UTF-8 sequence to cpu-endian Unicode value 739 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF 740 * 741 * Return the number of used utf8 bytes or -1 with errno set 742 * if sequence is invalid. 
743 */ 744 static int utf8_to_unicode(u32 *wc, const char *s) 745 { 746 unsigned int byte = *((const unsigned char *)s); 747 748 /* single byte */ 749 if (byte == 0) { 750 *wc = (u32) 0; 751 return 0; 752 } else if (byte < 0x80) { 753 *wc = (u32) byte; 754 return 1; 755 /* double byte */ 756 } else if (byte < 0xc2) { 757 goto fail; 758 } else if (byte < 0xE0) { 759 if ((s[1] & 0xC0) == 0x80) { 760 *wc = ((u32)(byte & 0x1F) << 6) 761 | ((u32)(s[1] & 0x3F)); 762 return 2; 763 } else 764 goto fail; 765 /* three-byte */ 766 } else if (byte < 0xF0) { 767 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) { 768 *wc = ((u32)(byte & 0x0F) << 12) 769 | ((u32)(s[1] & 0x3F) << 6) 770 | ((u32)(s[2] & 0x3F)); 771 /* Check valid ranges */ 772 #if ALLOW_BROKEN_UNICODE 773 if (((*wc >= 0x800) && (*wc <= 0xD7FF)) 774 || ((*wc >= 0xD800) && (*wc <= 0xDFFF)) 775 || ((*wc >= 0xe000) && (*wc <= 0xFFFF))) 776 return 3; 777 #else 778 if (((*wc >= 0x800) && (*wc <= 0xD7FF)) 779 || ((*wc >= 0xe000) && (*wc <= 0xFFFD))) 780 return 3; 781 #endif /* ALLOW_BROKEN_UNICODE */ 782 } 783 goto fail; 784 /* four-byte */ 785 } else if (byte < 0xF5) { 786 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) 787 && ((s[3] & 0xC0) == 0x80)) { 788 *wc = ((u32)(byte & 0x07) << 18) 789 | ((u32)(s[1] & 0x3F) << 12) 790 | ((u32)(s[2] & 0x3F) << 6) 791 | ((u32)(s[3] & 0x3F)); 792 /* Check valid ranges */ 793 if ((*wc <= 0x10ffff) && (*wc >= 0x10000)) 794 return 4; 795 } 796 goto fail; 797 } 798 fail: 799 errno = EILSEQ; 800 return -1; 801 } 802 803 /** 804 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string 805 * @ins: input multibyte string buffer 806 * @outs: on return contains the (allocated) output utf16 string 807 * @outs_len: length of output buffer in utf16 characters 808 * 809 * Return -1 with errno set. 
810 */ 811 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs) 812 { 813 #if defined(__APPLE__) || defined(__DARWIN__) 814 #ifdef ENABLE_NFCONV 815 char *new_ins = NULL; 816 if(nfconvert_utf8) { 817 int new_ins_len; 818 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form 819 if(new_ins_len >= 0) 820 ins = new_ins; 821 else 822 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins); 823 } 824 #endif /* ENABLE_NFCONV */ 825 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 826 const char *t = ins; 827 u32 wc; 828 BOOL allocated; 829 ntfschar *outpos; 830 int shorts, ret = -1; 831 832 shorts = utf8_to_utf16_size(ins); 833 if (shorts < 0) 834 goto fail; 835 836 allocated = FALSE; 837 if (!*outs) { 838 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar)); 839 if (!*outs) 840 goto fail; 841 allocated = TRUE; 842 } 843 844 outpos = *outs; 845 846 while(1) { 847 int m = utf8_to_unicode(&wc, t); 848 if (m <= 0) { 849 if (m < 0) { 850 /* do not leave space allocated if failed */ 851 if (allocated) { 852 free(*outs); 853 *outs = (ntfschar*)NULL; 854 } 855 goto fail; 856 } 857 *outpos++ = const_cpu_to_le16(0); 858 break; 859 } 860 if (wc < 0x10000) 861 *outpos++ = cpu_to_le16(wc); 862 else { 863 wc -= 0x10000; 864 *outpos++ = cpu_to_le16((wc >> 10) + 0xd800); 865 *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00); 866 } 867 t += m; 868 } 869 870 ret = --outpos - *outs; 871 fail: 872 #if defined(__APPLE__) || defined(__DARWIN__) 873 #ifdef ENABLE_NFCONV 874 if(new_ins != NULL) 875 free(new_ins); 876 #endif /* ENABLE_NFCONV */ 877 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 878 return ret; 879 } 880 881 /** 882 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string 883 * @ins: input Unicode string buffer 884 * @ins_len: length of input string in Unicode characters 885 * @outs: on return contains the (allocated) output multibyte string 886 * @outs_len: length of output buffer 
in bytes (ignored if *@outs is NULL) 887 * 888 * Convert the input little endian, 2-byte Unicode string @ins, of length 889 * @ins_len into the multibyte string format dictated by the current locale. 890 * 891 * If *@outs is NULL, the function allocates the string and the caller is 892 * responsible for calling free(*@outs); when finished with it. 893 * 894 * On success the function returns the number of bytes written to the output 895 * string *@outs (>= 0), not counting the terminating NULL byte. If the output 896 * string buffer was allocated, *@outs is set to it. 897 * 898 * On error, -1 is returned, and errno is set to the error code. The following 899 * error codes can be expected: 900 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 901 * EILSEQ The input string cannot be represented as a multibyte 902 * sequence according to the current locale. 903 * ENAMETOOLONG Destination buffer is too small for input string. 904 * ENOMEM Not enough memory to allocate destination buffer. 905 */ 906 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs, 907 int outs_len) 908 { 909 char *mbs; 910 int mbs_len; 911 #ifdef MB_CUR_MAX 912 wchar_t wc; 913 int i, o; 914 int cnt = 0; 915 #ifdef HAVE_MBSINIT 916 mbstate_t mbstate; 917 #endif 918 #endif /* MB_CUR_MAX */ 919 920 if (!ins || !outs) { 921 errno = EINVAL; 922 return -1; 923 } 924 mbs = *outs; 925 mbs_len = outs_len; 926 if (mbs && !mbs_len) { 927 errno = ENAMETOOLONG; 928 return -1; 929 } 930 if (use_utf8) 931 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len); 932 #ifdef MB_CUR_MAX 933 if (!mbs) { 934 mbs_len = (ins_len + 1) * MB_CUR_MAX; 935 mbs = ntfs_malloc(mbs_len); 936 if (!mbs) 937 return -1; 938 } 939 #ifdef HAVE_MBSINIT 940 memset(&mbstate, 0, sizeof(mbstate)); 941 #else 942 #ifndef __HAIKU__ 943 wctomb(NULL, 0); 944 #endif 945 #endif 946 for (i = o = 0; i < ins_len; i++) { 947 /* Reallocate memory if necessary or abort. 
*/ 948 if ((int)(o + MB_CUR_MAX) > mbs_len) { 949 char *tc; 950 if (mbs == *outs) { 951 errno = ENAMETOOLONG; 952 return -1; 953 } 954 tc = ntfs_malloc((mbs_len + 64) & ~63); 955 if (!tc) 956 goto err_out; 957 memcpy(tc, mbs, mbs_len); 958 mbs_len = (mbs_len + 64) & ~63; 959 free(mbs); 960 mbs = tc; 961 } 962 /* Convert the LE Unicode character to a CPU wide character. */ 963 wc = (wchar_t)le16_to_cpu(ins[i]); 964 if (!wc) 965 break; 966 /* Convert the CPU endian wide character to multibyte. */ 967 #ifdef HAVE_MBSINIT 968 cnt = wcrtomb(mbs + o, wc, &mbstate); 969 #elif defined(__HAIKU__) 970 cnt = -1; 971 #else 972 cnt = wctomb(mbs + o, wc); 973 #endif 974 if (cnt == -1) 975 goto err_out; 976 if (cnt <= 0) { 977 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt); 978 errno = EINVAL; 979 goto err_out; 980 } 981 o += cnt; 982 } 983 #ifdef HAVE_MBSINIT 984 /* Make sure we are back in the initial state. */ 985 if (!mbsinit(&mbstate)) { 986 ntfs_log_debug("Eeek. mbstate not in initial state!\n"); 987 errno = EILSEQ; 988 goto err_out; 989 } 990 #endif 991 /* Now write the NULL character. */ 992 mbs[o] = '\0'; 993 if (*outs != mbs) 994 *outs = mbs; 995 return o; 996 err_out: 997 if (mbs != *outs) { 998 int eo = errno; 999 free(mbs); 1000 errno = eo; 1001 } 1002 #else /* MB_CUR_MAX */ 1003 errno = EILSEQ; 1004 #endif /* MB_CUR_MAX */ 1005 return -1; 1006 } 1007 1008 /** 1009 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string 1010 * @ins: input multibyte string buffer 1011 * @outs: on return contains the (allocated) output Unicode string 1012 * 1013 * Convert the input multibyte string @ins, from the current locale into the 1014 * corresponding little endian, 2-byte Unicode string. 1015 * 1016 * The function allocates the string and the caller is responsible for calling 1017 * free(*@outs); when finished with it. 
1018 * 1019 * On success the function returns the number of Unicode characters written to 1020 * the output string *@outs (>= 0), not counting the terminating Unicode NULL 1021 * character. 1022 * 1023 * On error, -1 is returned, and errno is set to the error code. The following 1024 * error codes can be expected: 1025 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 1026 * EILSEQ The input string cannot be represented as a Unicode 1027 * string according to the current locale. 1028 * ENAMETOOLONG Destination buffer is too small for input string. 1029 * ENOMEM Not enough memory to allocate destination buffer. 1030 */ 1031 int ntfs_mbstoucs(const char *ins, ntfschar **outs) 1032 { 1033 #ifdef MB_CUR_MAX 1034 ntfschar *ucs; 1035 const char *s; 1036 wchar_t wc; 1037 int i, o, cnt, ins_len, ucs_len, ins_size; 1038 #ifdef HAVE_MBSINIT 1039 mbstate_t mbstate; 1040 #endif 1041 #endif /* MB_CUR_MAX */ 1042 1043 if (!ins || !outs) { 1044 errno = EINVAL; 1045 return -1; 1046 } 1047 1048 if (use_utf8) 1049 return ntfs_utf8_to_utf16(ins, outs); 1050 1051 #ifdef MB_CUR_MAX 1052 /* Determine the size of the multi-byte string in bytes. */ 1053 ins_size = strlen(ins); 1054 /* Determine the length of the multi-byte string. */ 1055 s = ins; 1056 #if defined(HAVE_MBSINIT) 1057 memset(&mbstate, 0, sizeof(mbstate)); 1058 ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate); 1059 #ifdef __CYGWIN32__ 1060 if (!ins_len && *ins) { 1061 /* Older Cygwin had broken mbsrtowcs() implementation. */ 1062 ins_len = strlen(ins); 1063 } 1064 #endif 1065 #elif !defined(DJGPP) && !defined(__HAIKU__) 1066 ins_len = mbstowcs(NULL, s, 0); 1067 #else 1068 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */ 1069 ins_len = strlen(ins); 1070 #endif 1071 if (ins_len == -1) 1072 return ins_len; 1073 #ifdef HAVE_MBSINIT 1074 if ((s != ins) || !mbsinit(&mbstate)) { 1075 #else 1076 if (s != ins) { 1077 #endif 1078 errno = EILSEQ; 1079 return -1; 1080 } 1081 /* Add the NULL terminator. 
*/ 1082 ins_len++; 1083 ucs_len = ins_len; 1084 ucs = ntfs_malloc(ucs_len * sizeof(ntfschar)); 1085 if (!ucs) 1086 return -1; 1087 #ifdef HAVE_MBSINIT 1088 memset(&mbstate, 0, sizeof(mbstate)); 1089 #else 1090 #ifndef __HAIKU__ 1091 mbtowc(NULL, NULL, 0); 1092 #endif 1093 #endif 1094 for (i = o = cnt = 0; i < ins_size; i += cnt, o++) { 1095 /* Reallocate memory if necessary. */ 1096 if (o >= ucs_len) { 1097 ntfschar *tc; 1098 ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63; 1099 tc = realloc(ucs, ucs_len); 1100 if (!tc) 1101 goto err_out; 1102 ucs = tc; 1103 ucs_len /= sizeof(ntfschar); 1104 } 1105 /* Convert the multibyte character to a wide character. */ 1106 #ifdef HAVE_MBSINIT 1107 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate); 1108 #elif defined(__HAIKU__) 1109 cnt = -1; 1110 #else 1111 cnt = mbtowc(&wc, ins + i, ins_size - i); 1112 #endif 1113 if (!cnt) 1114 break; 1115 if (cnt == -1) 1116 goto err_out; 1117 if (cnt < -1) { 1118 ntfs_log_trace("Eeek. cnt = %i\n", cnt); 1119 errno = EINVAL; 1120 goto err_out; 1121 } 1122 /* Make sure we are not overflowing the NTFS Unicode set. */ 1123 if ((unsigned long)wc >= (unsigned long)(1 << 1124 (8 * sizeof(ntfschar)))) { 1125 errno = EILSEQ; 1126 goto err_out; 1127 } 1128 /* Convert the CPU wide character to a LE Unicode character. */ 1129 ucs[o] = cpu_to_le16(wc); 1130 } 1131 #ifdef HAVE_MBSINIT 1132 /* Make sure we are back in the initial state. */ 1133 if (!mbsinit(&mbstate)) { 1134 ntfs_log_trace("Eeek. mbstate not in initial state!\n"); 1135 errno = EILSEQ; 1136 goto err_out; 1137 } 1138 #endif 1139 /* Now write the NULL character. 
*/ 1140 ucs[o] = const_cpu_to_le16(L'\0'); 1141 *outs = ucs; 1142 return o; 1143 err_out: 1144 free(ucs); 1145 #else /* MB_CUR_MAX */ 1146 errno = EILSEQ; 1147 #endif /* MB_CUR_MAX */ 1148 return -1; 1149 } 1150 1151 /* 1152 * Turn a UTF8 name uppercase 1153 * 1154 * Returns an allocated uppercase name which has to be freed by caller 1155 * or NULL if there is an error (described by errno) 1156 */ 1157 1158 char *ntfs_uppercase_mbs(const char *low, 1159 const ntfschar *upcase, u32 upcase_size) 1160 { 1161 int size; 1162 char *upp; 1163 u32 wc; 1164 int n; 1165 const char *s; 1166 char *t; 1167 1168 size = strlen(low); 1169 upp = (char*)ntfs_malloc(3*size + 1); 1170 if (upp) { 1171 s = low; 1172 t = upp; 1173 do { 1174 n = utf8_to_unicode(&wc, s); 1175 if (n > 0) { 1176 if (wc < upcase_size) 1177 wc = le16_to_cpu(upcase[wc]); 1178 if (wc < 0x80) 1179 *t++ = wc; 1180 else if (wc < 0x800) { 1181 *t++ = (0xc0 | ((wc >> 6) & 0x3f)); 1182 *t++ = 0x80 | (wc & 0x3f); 1183 } else if (wc < 0x10000) { 1184 *t++ = 0xe0 | (wc >> 12); 1185 *t++ = 0x80 | ((wc >> 6) & 0x3f); 1186 *t++ = 0x80 | (wc & 0x3f); 1187 } else { 1188 *t++ = 0xf0 | ((wc >> 18) & 7); 1189 *t++ = 0x80 | ((wc >> 12) & 63); 1190 *t++ = 0x80 | ((wc >> 6) & 0x3f); 1191 *t++ = 0x80 | (wc & 0x3f); 1192 } 1193 s += n; 1194 } 1195 } while (n > 0); 1196 if (n < 0) { 1197 free(upp); 1198 upp = (char*)NULL; 1199 errno = EILSEQ; 1200 } 1201 *t = 0; 1202 } 1203 return (upp); 1204 } 1205 1206 /** 1207 * ntfs_upcase_table_build - build the default upcase table for NTFS 1208 * @uc: destination buffer where to store the built table 1209 * @uc_len: size of destination buffer in bytes 1210 * 1211 * ntfs_upcase_table_build() builds the default upcase table for NTFS and 1212 * stores it in the caller supplied buffer @uc of size @uc_len. 1213 * 1214 * Note, @uc_len must be at least 128kiB in size or bad things will happen! 
1215 */ 1216 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len) 1217 { 1218 struct NEWUPPERCASE { 1219 unsigned short first; 1220 unsigned short last; 1221 short diff; 1222 unsigned char step; 1223 unsigned char osmajor; 1224 unsigned char osminor; 1225 } ; 1226 1227 /* 1228 * This is the table as defined by Windows XP 1229 */ 1230 static int uc_run_table[][3] = { /* Start, End, Add */ 1231 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, 1232 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, 1233 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, 1234 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, 1235 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, 1236 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, 1237 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, 1238 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, 1239 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, 1240 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, 1241 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, 1242 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, 1243 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, 1244 {0} 1245 }; 1246 static int uc_dup_table[][2] = { /* Start, End */ 1247 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, 1248 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, 1249 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, 1250 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, 1251 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, 1252 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, 1253 {0} 1254 }; 1255 static int uc_byte_table[][2] = { /* Offset, Value */ 1256 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, 
{0x0269, 0x0196}, 1257 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, 1258 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, 1259 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, 1260 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, 1261 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, 1262 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, 1263 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, 1264 {0} 1265 }; 1266 1267 /* 1268 * Changes which were applied to later Windows versions 1269 * 1270 * md5 for $UpCase from Winxp : 6fa3db2468275286210751e869d36373 1271 * Vista : 2f03b5a69d486ff3864cecbd07f24440 1272 * Win8 : 7ff498a44e45e77374cc7c962b1b92f2 1273 */ 1274 static const struct NEWUPPERCASE newuppercase[] = { 1275 /* from Windows 6.0 (Vista) */ 1276 { 0x37b, 0x37d, 0x82, 1, 6, 0 }, 1277 { 0x1f80, 0x1f87, 0x8, 1, 6, 0 }, 1278 { 0x1f90, 0x1f97, 0x8, 1, 6, 0 }, 1279 { 0x1fa0, 0x1fa7, 0x8, 1, 6, 0 }, 1280 { 0x2c30, 0x2c5e, -0x30, 1, 6, 0 }, 1281 { 0x2d00, 0x2d25, -0x1c60, 1, 6, 0 }, 1282 { 0x2c68, 0x2c6c, -0x1, 2, 6, 0 }, 1283 { 0x219, 0x21f, -0x1, 2, 6, 0 }, 1284 { 0x223, 0x233, -0x1, 2, 6, 0 }, 1285 { 0x247, 0x24f, -0x1, 2, 6, 0 }, 1286 { 0x3d9, 0x3e1, -0x1, 2, 6, 0 }, 1287 { 0x48b, 0x48f, -0x1, 2, 6, 0 }, 1288 { 0x4fb, 0x513, -0x1, 2, 6, 0 }, 1289 { 0x2c81, 0x2ce3, -0x1, 2, 6, 0 }, 1290 { 0x3f8, 0x3fb, -0x1, 3, 6, 0 }, 1291 { 0x4c6, 0x4ce, -0x1, 4, 6, 0 }, 1292 { 0x23c, 0x242, -0x1, 6, 6, 0 }, 1293 { 0x4ed, 0x4f7, -0x1, 10, 6, 0 }, 1294 { 0x450, 0x45d, -0x50, 13, 6, 0 }, 1295 { 0x2c61, 0x2c76, -0x1, 21, 6, 0 }, 1296 { 0x1fcc, 0x1ffc, -0x9, 48, 6, 0 }, 1297 { 0x180, 0x180, 0xc3, 1, 6, 0 }, 1298 { 0x195, 0x195, 0x61, 1, 6, 0 }, 1299 { 0x19a, 0x19a, 0xa3, 1, 6, 0 }, 1300 { 0x19e, 0x19e, 0x82, 1, 6, 0 }, 1301 { 0x1bf, 0x1bf, 0x38, 1, 6, 0 }, 1302 { 0x1f9, 0x1f9, -0x1, 1, 6, 0 }, 1303 { 0x23a, 0x23a, 0x2a2b, 1, 6, 0 }, 1304 { 
0x23e, 0x23e, 0x2a28, 1, 6, 0 }, 1305 { 0x26b, 0x26b, 0x29f7, 1, 6, 0 }, 1306 { 0x27d, 0x27d, 0x29e7, 1, 6, 0 }, 1307 { 0x280, 0x280, -0xda, 1, 6, 0 }, 1308 { 0x289, 0x289, -0x45, 1, 6, 0 }, 1309 { 0x28c, 0x28c, -0x47, 1, 6, 0 }, 1310 { 0x3f2, 0x3f2, 0x7, 1, 6, 0 }, 1311 { 0x4cf, 0x4cf, -0xf, 1, 6, 0 }, 1312 { 0x1d7d, 0x1d7d, 0xee6, 1, 6, 0 }, 1313 { 0x1fb3, 0x1fb3, 0x9, 1, 6, 0 }, 1314 { 0x214e, 0x214e, -0x1c, 1, 6, 0 }, 1315 { 0x2184, 0x2184, -0x1, 1, 6, 0 }, 1316 /* from Windows 6.1 (Win7) */ 1317 { 0x23a, 0x23e, 0x0, 4, 6, 1 }, 1318 { 0x250, 0x250, 0x2a1f, 2, 6, 1 }, 1319 { 0x251, 0x251, 0x2a1c, 2, 6, 1 }, 1320 { 0x271, 0x271, 0x29fd, 2, 6, 1 }, 1321 { 0x371, 0x373, -0x1, 2, 6, 1 }, 1322 { 0x377, 0x377, -0x1, 2, 6, 1 }, 1323 { 0x3c2, 0x3c2, 0x0, 2, 6, 1 }, 1324 { 0x3d7, 0x3d7, -0x8, 2, 6, 1 }, 1325 { 0x515, 0x523, -0x1, 2, 6, 1 }, 1326 /* below, -0x75fc stands for 0x8a04 and truncation */ 1327 { 0x1d79, 0x1d79, -0x75fc, 2, 6, 1 }, 1328 { 0x1efb, 0x1eff, -0x1, 2, 6, 1 }, 1329 { 0x1fc3, 0x1ff3, 0x9, 48, 6, 1 }, 1330 { 0x1fcc, 0x1ffc, 0x0, 48, 6, 1 }, 1331 { 0x2c65, 0x2c65, -0x2a2b, 2, 6, 1 }, 1332 { 0x2c66, 0x2c66, -0x2a28, 2, 6, 1 }, 1333 { 0x2c73, 0x2c73, -0x1, 2, 6, 1 }, 1334 { 0xa641, 0xa65f, -0x1, 2, 6, 1 }, 1335 { 0xa663, 0xa66d, -0x1, 2, 6, 1 }, 1336 { 0xa681, 0xa697, -0x1, 2, 6, 1 }, 1337 { 0xa723, 0xa72f, -0x1, 2, 6, 1 }, 1338 { 0xa733, 0xa76f, -0x1, 2, 6, 1 }, 1339 { 0xa77a, 0xa77c, -0x1, 2, 6, 1 }, 1340 { 0xa77f, 0xa787, -0x1, 2, 6, 1 }, 1341 { 0xa78c, 0xa78c, -0x1, 2, 6, 1 }, 1342 /* end mark */ 1343 { 0 } 1344 } ; 1345 1346 int i, r; 1347 int k, off; 1348 const struct NEWUPPERCASE *puc; 1349 1350 memset((char*)uc, 0, uc_len); 1351 uc_len >>= 1; 1352 if (uc_len > 65536) 1353 uc_len = 65536; 1354 for (i = 0; (u32)i < uc_len; i++) 1355 uc[i] = cpu_to_le16(i); 1356 for (r = 0; uc_run_table[r][0]; r++) { 1357 off = uc_run_table[r][2]; 1358 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) 1359 uc[i] = cpu_to_le16(i + off); 1360 } 1361 for (r = 0; 
uc_dup_table[r][0]; r++) 1362 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) 1363 uc[i + 1] = cpu_to_le16(i); 1364 for (r = 0; uc_byte_table[r][0]; r++) { 1365 k = uc_byte_table[r][1]; 1366 uc[uc_byte_table[r][0]] = cpu_to_le16(k); 1367 } 1368 for (r=0; newuppercase[r].first; r++) { 1369 puc = &newuppercase[r]; 1370 if ((puc->osmajor < UPCASE_MAJOR) 1371 || ((puc->osmajor == UPCASE_MAJOR) 1372 && (puc->osminor <= UPCASE_MINOR))) { 1373 off = puc->diff; 1374 for (i = puc->first; i <= puc->last; i += puc->step) 1375 uc[i] = cpu_to_le16(i + off); 1376 } 1377 } 1378 } 1379 1380 /* 1381 * Allocate and build the default upcase table 1382 * 1383 * Returns the number of entries 1384 * 0 if failed 1385 */ 1386 1387 #define UPCASE_LEN 65536 /* default number of entries in upcase */ 1388 1389 u32 ntfs_upcase_build_default(ntfschar **upcase) 1390 { 1391 u32 upcase_len = 0; 1392 1393 *upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2); 1394 if (*upcase) { 1395 ntfs_upcase_table_build(*upcase, UPCASE_LEN*2); 1396 upcase_len = UPCASE_LEN; 1397 } 1398 return (upcase_len); 1399 } 1400 1401 /* 1402 * Build a table for converting to lower case 1403 * 1404 * This is only meaningful when there is a single lower case 1405 * character leading to an upper case one, and currently the 1406 * only exception is the greek letter sigma which has a single 1407 * upper case glyph (code U+03A3), but two lower case glyphs 1408 * (code U+03C3 and U+03C2, the latter to be used at the end 1409 * of a word). In the following implementation the upper case 1410 * sigma will be lowercased as U+03C3. 
1411 */ 1412 1413 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt) 1414 { 1415 ntfschar *lc; 1416 u32 upp; 1417 u32 i; 1418 1419 lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar)); 1420 if (lc) { 1421 for (i=0; i<uc_cnt; i++) 1422 lc[i] = cpu_to_le16(i); 1423 for (i=0; i<uc_cnt; i++) { 1424 upp = le16_to_cpu(uc[i]); 1425 if ((upp != i) && (upp < uc_cnt)) 1426 lc[upp] = cpu_to_le16(i); 1427 } 1428 } else 1429 ntfs_log_error("Could not build the locase table\n"); 1430 return (lc); 1431 } 1432 1433 /** 1434 * ntfs_str2ucs - convert a string to a valid NTFS file name 1435 * @s: input string 1436 * @len: length of output buffer in Unicode characters 1437 * 1438 * Convert the input @s string into the corresponding little endian, 1439 * 2-byte Unicode string. The length of the converted string is less 1440 * or equal to the maximum length allowed by the NTFS format (255). 1441 * 1442 * If @s is NULL then return AT_UNNAMED. 1443 * 1444 * On success the function returns the Unicode string in an allocated 1445 * buffer and the caller is responsible to free it when it's not needed 1446 * anymore. 1447 * 1448 * On error NULL is returned and errno is set to the error code. 1449 */ 1450 ntfschar *ntfs_str2ucs(const char *s, int *len) 1451 { 1452 ntfschar *ucs = NULL; 1453 1454 if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) { 1455 ntfs_log_perror("Couldn't convert '%s' to Unicode", s); 1456 return NULL; 1457 } 1458 if (*len > NTFS_MAX_NAME_LEN) { 1459 free(ucs); 1460 errno = ENAMETOOLONG; 1461 return NULL; 1462 } 1463 if (!ucs || !*len) { 1464 ucs = AT_UNNAMED; 1465 *len = 0; 1466 } 1467 return ucs; 1468 } 1469 1470 /** 1471 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs() 1472 * @ucs input string to be freed 1473 * 1474 * Free memory at @ucs and which was allocated by ntfs_str2ucs. 1475 * 1476 * Return value: none. 
1477 */ 1478 void ntfs_ucsfree(ntfschar *ucs) 1479 { 1480 if (ucs && (ucs != AT_UNNAMED)) 1481 free(ucs); 1482 } 1483 1484 /* 1485 * Check whether a name contains no chars forbidden 1486 * for DOS or Win32 use 1487 * 1488 * If @strict is TRUE, then trailing dots and spaces are forbidden. 1489 * These names are technically allowed in the Win32 namespace, but 1490 * they can be problematic. See comment for FILE_NAME_WIN32. 1491 * 1492 * If there is a bad char, errno is set to EINVAL 1493 */ 1494 1495 BOOL ntfs_forbidden_chars(const ntfschar *name, int len, BOOL strict) 1496 { 1497 BOOL forbidden; 1498 int ch; 1499 int i; 1500 static const u32 mainset = (1L << ('\"' - 0x20)) 1501 | (1L << ('*' - 0x20)) 1502 | (1L << ('/' - 0x20)) 1503 | (1L << (':' - 0x20)) 1504 | (1L << ('<' - 0x20)) 1505 | (1L << ('>' - 0x20)) 1506 | (1L << ('?' - 0x20)); 1507 1508 forbidden = (len == 0) || 1509 (strict && (name[len-1] == const_cpu_to_le16(' ') || 1510 name[len-1] == const_cpu_to_le16('.'))); 1511 for (i=0; i<len; i++) { 1512 ch = le16_to_cpu(name[i]); 1513 if ((ch < 0x20) 1514 || ((ch < 0x40) 1515 && ((1L << (ch - 0x20)) & mainset)) 1516 || (ch == '\\') 1517 || (ch == '|')) 1518 forbidden = TRUE; 1519 } 1520 if (forbidden) 1521 errno = EINVAL; 1522 return (forbidden); 1523 } 1524 1525 /* 1526 * Check whether a name contains no forbidden chars and 1527 * is not a reserved name for DOS or Win32 use 1528 * 1529 * The reserved names are CON, PRN, AUX, NUL, COM1..COM9, LPT1..LPT9 1530 * with no suffix or any suffix. 1531 * 1532 * If @strict is TRUE, then trailing dots and spaces are forbidden. 1533 * These names are technically allowed in the Win32 namespace, but 1534 * they can be problematic. See comment for FILE_NAME_WIN32. 
1535 * 1536 * If the name is forbidden, errno is set to EINVAL 1537 */ 1538 1539 BOOL ntfs_forbidden_names(ntfs_volume *vol, const ntfschar *name, int len, 1540 BOOL strict) 1541 { 1542 BOOL forbidden; 1543 int h; 1544 static const ntfschar dot = const_cpu_to_le16('.'); 1545 static const ntfschar con[] = { const_cpu_to_le16('c'), 1546 const_cpu_to_le16('o'), const_cpu_to_le16('n') }; 1547 static const ntfschar prn[] = { const_cpu_to_le16('p'), 1548 const_cpu_to_le16('r'), const_cpu_to_le16('n') }; 1549 static const ntfschar aux[] = { const_cpu_to_le16('a'), 1550 const_cpu_to_le16('u'), const_cpu_to_le16('x') }; 1551 static const ntfschar nul[] = { const_cpu_to_le16('n'), 1552 const_cpu_to_le16('u'), const_cpu_to_le16('l') }; 1553 static const ntfschar com[] = { const_cpu_to_le16('c'), 1554 const_cpu_to_le16('o'), const_cpu_to_le16('m') }; 1555 static const ntfschar lpt[] = { const_cpu_to_le16('l'), 1556 const_cpu_to_le16('p'), const_cpu_to_le16('t') }; 1557 1558 forbidden = ntfs_forbidden_chars(name, len, strict); 1559 if (!forbidden && (len >= 3)) { 1560 /* 1561 * Rough hash check to tell whether the first couple of chars 1562 * may be one of CO PR AU NU LP or lowercase variants. 
1563 */ 1564 h = ((le16_to_cpu(name[0]) & 31)*48) 1565 ^ ((le16_to_cpu(name[1]) & 31)*165); 1566 if ((h % 23) == 17) { 1567 /* do a full check, depending on the third char */ 1568 switch (le16_to_cpu(name[2]) & ~0x20) { 1569 case 'N' : 1570 if (((len == 3) || (name[3] == dot)) 1571 && (!ntfs_ucsncasecmp(name, con, 3, 1572 vol->upcase, vol->upcase_len) 1573 || !ntfs_ucsncasecmp(name, prn, 3, 1574 vol->upcase, vol->upcase_len))) 1575 forbidden = TRUE; 1576 break; 1577 case 'X' : 1578 if (((len == 3) || (name[3] == dot)) 1579 && !ntfs_ucsncasecmp(name, aux, 3, 1580 vol->upcase, vol->upcase_len)) 1581 forbidden = TRUE; 1582 break; 1583 case 'L' : 1584 if (((len == 3) || (name[3] == dot)) 1585 && !ntfs_ucsncasecmp(name, nul, 3, 1586 vol->upcase, vol->upcase_len)) 1587 forbidden = TRUE; 1588 break; 1589 case 'M' : 1590 if ((len > 3) 1591 && (le16_to_cpu(name[3]) >= '1') 1592 && (le16_to_cpu(name[3]) <= '9') 1593 && ((len == 4) || (name[4] == dot)) 1594 && !ntfs_ucsncasecmp(name, com, 3, 1595 vol->upcase, vol->upcase_len)) 1596 forbidden = TRUE; 1597 break; 1598 case 'T' : 1599 if ((len > 3) 1600 && (le16_to_cpu(name[3]) >= '1') 1601 && (le16_to_cpu(name[3]) <= '9') 1602 && ((len == 4) || (name[4] == dot)) 1603 && !ntfs_ucsncasecmp(name, lpt, 3, 1604 vol->upcase, vol->upcase_len)) 1605 forbidden = TRUE; 1606 break; 1607 } 1608 } 1609 } 1610 1611 if (forbidden) 1612 errno = EINVAL; 1613 return (forbidden); 1614 } 1615 1616 /* 1617 * Check whether the same name can be used as a DOS and 1618 * a Win32 name 1619 * 1620 * The names must be the same, or the short name the uppercase 1621 * variant of the long name 1622 */ 1623 1624 BOOL ntfs_collapsible_chars(ntfs_volume *vol, 1625 const ntfschar *shortname, int shortlen, 1626 const ntfschar *longname, int longlen) 1627 { 1628 BOOL collapsible; 1629 unsigned int ch; 1630 unsigned int cs; 1631 int i; 1632 1633 collapsible = shortlen == longlen; 1634 for (i=0; collapsible && (i<shortlen); i++) { 1635 ch = le16_to_cpu(longname[i]); 
1636 cs = le16_to_cpu(shortname[i]); 1637 if ((cs != ch) 1638 && ((ch >= vol->upcase_len) 1639 || (cs >= vol->upcase_len) 1640 || (vol->upcase[cs] != vol->upcase[ch]))) 1641 collapsible = FALSE; 1642 } 1643 return (collapsible); 1644 } 1645 1646 /* 1647 * Define the character encoding to be used. 1648 * Use UTF-8 unless specified otherwise. 1649 */ 1650 1651 int ntfs_set_char_encoding(const char *locale) 1652 { 1653 use_utf8 = 0; 1654 if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8") 1655 || strstr(locale,"utf-8") || strstr(locale,"UTF-8")) 1656 use_utf8 = 1; 1657 else 1658 #ifndef __HAIKU__ 1659 if (setlocale(LC_ALL, locale)) 1660 use_utf8 = 0; 1661 else 1662 #endif 1663 { 1664 ntfs_log_error("Invalid locale, encoding to UTF-8\n"); 1665 use_utf8 = 1; 1666 } 1667 return 0; /* always successful */ 1668 } 1669 1670 #if defined(__APPLE__) || defined(__DARWIN__) 1671 1672 int ntfs_macosx_normalize_filenames(int normalize) { 1673 #ifdef ENABLE_NFCONV 1674 if (normalize == 0 || normalize == 1) { 1675 nfconvert_utf8 = normalize; 1676 return 0; 1677 } 1678 else { 1679 return -1; 1680 } 1681 #else 1682 return -1; 1683 #endif /* ENABLE_NFCONV */ 1684 } 1685 1686 int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target, 1687 int composed) 1688 { 1689 #ifdef ENABLE_NFCONV 1690 /* For this code to compile, the CoreFoundation framework must be fed to 1691 * the linker. */ 1692 CFStringRef cfSourceString; 1693 CFMutableStringRef cfMutableString; 1694 CFRange rangeToProcess; 1695 CFIndex requiredBufferLength; 1696 char *result = NULL; 1697 int resultLength = -1; 1698 1699 /* Convert the UTF-8 string to a CFString. */ 1700 cfSourceString = CFStringCreateWithCString(kCFAllocatorDefault, 1701 utf8_string, kCFStringEncodingUTF8); 1702 if (cfSourceString == NULL) { 1703 ntfs_log_error("CFStringCreateWithCString failed!\n"); 1704 return -2; 1705 } 1706 1707 /* Create a mutable string from cfSourceString that we are free to 1708 * modify. 
*/ 1709 cfMutableString = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, 1710 cfSourceString); 1711 CFRelease(cfSourceString); /* End-of-life. */ 1712 if (cfMutableString == NULL) { 1713 ntfs_log_error("CFStringCreateMutableCopy failed!\n"); 1714 return -3; 1715 } 1716 1717 /* Normalize the mutable string to the desired normalization form. */ 1718 CFStringNormalize(cfMutableString, (composed != 0 ? 1719 kCFStringNormalizationFormC : kCFStringNormalizationFormD)); 1720 1721 /* Store the resulting string in a '\0'-terminated UTF-8 encoded char* 1722 * buffer. */ 1723 rangeToProcess = CFRangeMake(0, CFStringGetLength(cfMutableString)); 1724 if (CFStringGetBytes(cfMutableString, rangeToProcess, 1725 kCFStringEncodingUTF8, 0, false, NULL, 0, 1726 &requiredBufferLength) > 0) 1727 { 1728 resultLength = sizeof(char) * (requiredBufferLength + 1); 1729 result = ntfs_calloc(resultLength); 1730 1731 if (result != NULL) { 1732 if (CFStringGetBytes(cfMutableString, rangeToProcess, 1733 kCFStringEncodingUTF8, 0, false, 1734 (UInt8*) result, resultLength - 1, 1735 &requiredBufferLength) <= 0) 1736 { 1737 ntfs_log_error("Could not perform UTF-8 " 1738 "conversion of normalized " 1739 "CFMutableString.\n"); 1740 free(result); 1741 result = NULL; 1742 } 1743 } 1744 else { 1745 ntfs_log_error("Could not perform a ntfs_calloc of %d " 1746 "bytes for char *result.\n", resultLength); 1747 } 1748 } 1749 else { 1750 ntfs_log_error("Could not perform check for required length of " 1751 "UTF-8 conversion of normalized CFMutableString.\n"); 1752 } 1753 1754 CFRelease(cfMutableString); 1755 1756 if (result != NULL) { 1757 *target = result; 1758 return resultLength - 1; 1759 } 1760 else { 1761 return -1; 1762 } 1763 #else 1764 return -1; 1765 #endif /* ENABLE_NFCONV */ 1766 } 1767 #endif /* defined(__APPLE__) || defined(__DARWIN__) */ 1768