1 /** 2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project. 3 * 4 * Copyright (c) 2000-2004 Anton Altaparmakov 5 * Copyright (c) 2002-2006 Szabolcs Szakacsits 6 * 7 * This program/include file is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License as published 9 * by the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program/include file is distributed in the hope that it will be 13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty 14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program (in the main directory of the NTFS-3G 19 * distribution in the file COPYING); if not, write to the Free Software 20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 */ 22 23 #ifdef HAVE_CONFIG_H 24 #include "config.h" 25 #endif 26 27 #ifdef HAVE_STDIO_H 28 #include <stdio.h> 29 #endif 30 #ifdef HAVE_STDLIB_H 31 #include <stdlib.h> 32 #endif 33 #ifdef HAVE_WCHAR_H 34 #include <wchar.h> 35 #endif 36 #ifdef HAVE_STRING_H 37 #include <string.h> 38 #endif 39 #ifdef HAVE_ERRNO_H 40 #include <errno.h> 41 #endif 42 43 #include "attrib.h" 44 #include "types.h" 45 #include "unistr.h" 46 #include "debug.h" 47 #include "logging.h" 48 #include "misc.h" 49 50 /* 51 * IMPORTANT 52 * ========= 53 * 54 * All these routines assume that the Unicode characters are in little endian 55 * encoding inside the strings!!! 56 */ 57 58 /* 59 * This is used by the name collation functions to quickly determine what 60 * characters are (in)valid. 61 */ 62 #if 0 63 static const u8 legal_ansi_char_array[0x40] = { 64 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 65 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 66 67 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 68 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 69 70 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, 71 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, 72 73 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 74 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, 75 }; 76 #endif 77 78 /** 79 * ntfs_names_are_equal - compare two Unicode names for equality 80 * @s1: name to compare to @s2 81 * @s1_len: length in Unicode characters of @s1 82 * @s2: name to compare to @s1 83 * @s2_len: length in Unicode characters of @s2 84 * @ic: ignore case bool 85 * @upcase: upcase table (only if @ic == IGNORE_CASE) 86 * @upcase_size: length in Unicode characters of @upcase (if present) 87 * 88 * Compare the names @s1 and @s2 and return TRUE (1) if the names are 89 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE, 90 * the @upcase table is used to perform a case insensitive comparison. 91 */ 92 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len, 93 const ntfschar *s2, size_t s2_len, 94 const IGNORE_CASE_BOOL ic, 95 const ntfschar *upcase, const u32 upcase_size) 96 { 97 if (s1_len != s2_len) 98 return FALSE; 99 if (!s1_len) 100 return TRUE; 101 if (ic == CASE_SENSITIVE) 102 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE; 103 return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE: 104 TRUE; 105 } 106 107 /** 108 * ntfs_names_collate - collate two Unicode names 109 * @name1: first Unicode name to compare 110 * @name1_len: length of first Unicode name to compare 111 * @name2: second Unicode name to compare 112 * @name2_len: length of second Unicode name to compare 113 * @err_val: if @name1 contains an invalid character return this value 114 * @ic: either CASE_SENSITIVE or IGNORE_CASE 115 * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) 116 * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) 117 * 118 * ntfs_names_collate() collates two Unicode names and returns: 119 * 120 * -1 if the first name collates before the second one, 121 * 0 if the names match, 122 * 1 if the second name collates before the first one, or 123 * @err_val if an invalid character is found in @name1 during the comparison. 124 * 125 * The following characters are considered invalid: '"', '*', '<', '>' and '?'. 126 */ 127 int ntfs_names_collate(const ntfschar *name1, const u32 name1_len, 128 const ntfschar *name2, const u32 name2_len, 129 const int err_val __attribute__((unused)), 130 const IGNORE_CASE_BOOL ic, const ntfschar *upcase, 131 const u32 upcase_len) 132 { 133 u32 cnt; 134 ntfschar c1, c2; 135 136 #ifdef DEBUG 137 if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) { 138 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n"); 139 exit(1); 140 } 141 #endif 142 for (cnt = 0; cnt < min(name1_len, name2_len); ++cnt) { 143 c1 = le16_to_cpu(*name1); 144 name1++; 145 c2 = le16_to_cpu(*name2); 146 name2++; 147 if (ic) { 148 if (c1 < upcase_len) 149 c1 = le16_to_cpu(upcase[c1]); 150 if (c2 < upcase_len) 151 c2 = le16_to_cpu(upcase[c2]); 152 } 153 #if 0 154 if (c1 < 64 && legal_ansi_char_array[c1] & 8) 155 return err_val; 156 #endif 157 if (c1 < c2) 158 return -1; 159 if (c1 > c2) 160 return 1; 161 } 162 if (name1_len < name2_len) 163 return -1; 164 if (name1_len == name2_len) 165 return 0; 166 /* name1_len > name2_len */ 167 #if 0 168 c1 = le16_to_cpu(*name1); 169 if (c1 < 64 && legal_ansi_char_array[c1] & 8) 170 return err_val; 171 #endif 172 return 1; 173 } 174 175 /** 176 * ntfs_ucsncmp - compare two little endian Unicode strings 177 * @s1: first string 178 * @s2: second string 179 * @n: maximum unicode characters to compare 180 * 181 * Compare the first @n characters of the Unicode strings @s1 and @s2, 182 * The strings in little endian format and appropriate le16_to_cpu() 183 * conversion is performed on non-little endian machines. 184 * 185 * The function returns an integer less than, equal to, or greater than zero 186 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 187 * to be less than, to match, or be greater than @s2. 188 */ 189 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) 190 { 191 ntfschar c1, c2; 192 size_t i; 193 194 #ifdef DEBUG 195 if (!s1 || !s2) { 196 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n"); 197 exit(1); 198 } 199 #endif 200 for (i = 0; i < n; ++i) { 201 c1 = le16_to_cpu(s1[i]); 202 c2 = le16_to_cpu(s2[i]); 203 if (c1 < c2) 204 return -1; 205 if (c1 > c2) 206 return 1; 207 if (!c1) 208 break; 209 } 210 return 0; 211 } 212 213 /** 214 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case 215 * @s1: first string 216 * @s2: second string 217 * @n: maximum unicode characters to compare 218 * @upcase: upcase table 219 * @upcase_size: upcase table size in Unicode characters 220 * 221 * Compare the first @n characters of the Unicode strings @s1 and @s2, 222 * ignoring case. The strings in little endian format and appropriate 223 * le16_to_cpu() conversion is performed on non-little endian machines. 224 * 225 * Each character is uppercased using the @upcase table before the comparison. 226 * 227 * The function returns an integer less than, equal to, or greater than zero 228 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 229 * to be less than, to match, or be greater than @s2. 230 */ 231 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, 232 const ntfschar *upcase, const u32 upcase_size) 233 { 234 ntfschar c1, c2; 235 size_t i; 236 237 #ifdef DEBUG 238 if (!s1 || !s2 || !upcase) { 239 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n"); 240 exit(1); 241 } 242 #endif 243 for (i = 0; i < n; ++i) { 244 if ((c1 = le16_to_cpu(s1[i])) < upcase_size) 245 c1 = le16_to_cpu(upcase[c1]); 246 if ((c2 = le16_to_cpu(s2[i])) < upcase_size) 247 c2 = le16_to_cpu(upcase[c2]); 248 if (c1 < c2) 249 return -1; 250 if (c1 > c2) 251 return 1; 252 if (!c1) 253 break; 254 } 255 return 0; 256 } 257 258 /** 259 * ntfs_ucsnlen - determine the length of a little endian Unicode string 260 * @s: pointer to Unicode string 261 * @maxlen: maximum length of string @s 262 * 263 * Return the number of Unicode characters in the little endian Unicode 264 * string @s up to a maximum of maxlen Unicode characters, not including 265 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s 266 * and @s + @maxlen, @maxlen is returned. 267 * 268 * This function never looks beyond @s + @maxlen. 269 */ 270 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen) 271 { 272 u32 i; 273 274 for (i = 0; i < maxlen; i++) { 275 if (!le16_to_cpu(s[i])) 276 break; 277 } 278 return i; 279 } 280 281 /** 282 * ntfs_ucsndup - duplicate little endian Unicode string 283 * @s: pointer to Unicode string 284 * @maxlen: maximum length of string @s 285 * 286 * Return a pointer to a new little endian Unicode string which is a duplicate 287 * of the string s. Memory for the new string is obtained with ntfs_malloc(3), 288 * and can be freed with free(3). 289 * 290 * A maximum of @maxlen Unicode characters are copied and a terminating 291 * (ntfschar)'\0' little endian Unicode character is added. 292 * 293 * This function never looks beyond @s + @maxlen. 294 * 295 * Return a pointer to the new little endian Unicode string on success and NULL 296 * on failure with errno set to the error code. 297 */ 298 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen) 299 { 300 ntfschar *dst; 301 u32 len; 302 303 len = ntfs_ucsnlen(s, maxlen); 304 dst = ntfs_malloc((len + 1) * sizeof(ntfschar)); 305 if (dst) { 306 memcpy(dst, s, len * sizeof(ntfschar)); 307 dst[len] = cpu_to_le16(L'\0'); 308 } 309 return dst; 310 } 311 312 /** 313 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent 314 * @name: 315 * @name_len: 316 * @upcase: 317 * @upcase_len: 318 * 319 * Description... 320 * 321 * Returns: 322 */ 323 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase, 324 const u32 upcase_len) 325 { 326 u32 i; 327 ntfschar u; 328 329 for (i = 0; i < name_len; i++) 330 if ((u = le16_to_cpu(name[i])) < upcase_len) 331 name[i] = upcase[u]; 332 } 333 334 /** 335 * ntfs_file_value_upcase - Convert a filename to upper case 336 * @file_name_attr: 337 * @upcase: 338 * @upcase_len: 339 * 340 * Description... 341 * 342 * Returns: 343 */ 344 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr, 345 const ntfschar *upcase, const u32 upcase_len) 346 { 347 ntfs_name_upcase((ntfschar*)&file_name_attr->file_name, 348 file_name_attr->file_name_length, upcase, upcase_len); 349 } 350 351 /** 352 * ntfs_file_values_compare - Which of two filenames should be listed first 353 * @file_name_attr1: 354 * @file_name_attr2: 355 * @err_val: 356 * @ic: 357 * @upcase: 358 * @upcase_len: 359 * 360 * Description... 361 * 362 * Returns: 363 */ 364 int ntfs_file_values_compare(const FILE_NAME_ATTR *file_name_attr1, 365 const FILE_NAME_ATTR *file_name_attr2, 366 const int err_val, const IGNORE_CASE_BOOL ic, 367 const ntfschar *upcase, const u32 upcase_len) 368 { 369 return ntfs_names_collate((ntfschar*)&file_name_attr1->file_name, 370 file_name_attr1->file_name_length, 371 (ntfschar*)&file_name_attr2->file_name, 372 file_name_attr2->file_name_length, 373 err_val, ic, upcase, upcase_len); 374 } 375 376 #if defined(__BEOS__) || defined(__HAIKU__) 377 /* Encode a single wide character into a sequence of utf8 bytes. 378 * Returns the number of bytes consumed, or 0 on error. 379 */ 380 static int 381 ntfs_wc_to_utf8(wchar_t c,unsigned char* buf) 382 { 383 if(c==0) 384 return 0; /* No support for embedded 0 runes */ 385 if(c<0x80) { 386 if(buf)buf[0]=(unsigned char)c; 387 return 1; 388 } 389 if(c<0x800) { 390 if(buf) { 391 buf[0] = 0xc0 | (c>>6); 392 buf[1] = 0x80 | (c & 0x3f); 393 } 394 return 2; 395 } 396 if(c<0x10000) { 397 if(buf) { 398 buf[0] = 0xe0 | (c>>12); 399 buf[1] = 0x80 | ((c>>6) & 0x3f); 400 buf[2] = 0x80 | (c & 0x3f); 401 } 402 return 3; 403 } 404 /* We don't support characters above 0xFFFF in NTFS */ 405 return 0; 406 } 407 408 409 /* Decodes a sequence of utf8 bytes into a single wide character. 410 * The character is returned in host byte order. 411 * Returns the number of bytes consumed, or 0 on error. 412 */ 413 static int 414 ntfs_wc_from_utf8(const unsigned char* str,wchar_t *c) 415 { 416 int l=0,i; 417 418 if(*str<0x80) { 419 *c = *str; 420 return 1; 421 } 422 if(*str<0xc0) /* lead byte must not be 10xxxxxx */ 423 return 0; /* is c0 a possible lead byte? */ 424 if(*str<0xe0) { /* 110xxxxx */ 425 *c = *str & 0x1f; 426 l=2; 427 } else if(*str<0xf0) { /* 1110xxxx */ 428 *c = *str & 0xf; 429 l=3; 430 } else if(*str<0xf8) { /* 11110xxx */ 431 *c = *str & 7; 432 l=4; 433 } else /* We don't support characters above 0xFFFF in NTFS */ 434 return 0; 435 436 437 for(i=1;i<l;i++) { 438 /* all other bytes must be 10xxxxxx */ 439 if((str[i] & 0xc0) != 0x80) 440 return 0; 441 *c <<= 6; 442 *c |= str[i] & 0x3f; 443 } 444 return l; 445 } 446 447 448 /* Converts wide string to UTF-8. Expects two in- and two out-parameters. 449 * Returns 0 on success, or error code. 450 * The caller has to free the result string. 451 * There is no support for UTF-16, yet 452 */ 453 static inline int ntfs_dupuni2utf8(wchar_t* in, int in_len,char **out,int *out_len) 454 { 455 int i,tmp; 456 int len8; 457 unsigned char *result; 458 459 /* count the length of the resulting UTF-8 */ 460 for(i=len8=0;i<in_len;i++) { 461 tmp=ntfs_wc_to_utf8(le16_to_cpu( *(in+i) ),0); 462 if(!tmp) 463 /* invalid character */ 464 return EILSEQ; 465 len8+=tmp; 466 } 467 *out=result=ntfs_malloc(len8+1); /* allow for zero-termination */ 468 469 if(!result) 470 return ENOMEM; 471 result[len8]='\0'; 472 *out_len=len8; 473 for(i=len8=0;i<in_len;i++) 474 len8+=ntfs_wc_to_utf8(le16_to_cpu( *(in+i) ),result+len8); 475 return 0; 476 } 477 478 /* Converts an UTF-8 sequence to a wide string. Same conventions as the 479 * previous function 480 */ 481 static inline int ntfs_duputf82uni(unsigned char* in, int in_len,wchar_t** out,int *out_len) 482 { 483 int i,tmp; 484 int len16; 485 486 wchar_t* result; 487 wchar_t wtmp; 488 for(i=len16=0;i<in_len;i+=tmp,len16++) { 489 tmp=ntfs_wc_from_utf8(in+i,&wtmp); 490 if(!tmp) 491 return EILSEQ; 492 } 493 *out=result=ntfs_malloc(2*(len16+1)); 494 if(!result) 495 return ENOMEM; 496 result[len16]=0; 497 *out_len=len16; 498 for(i=len16=0;i<in_len;i+=tmp,len16++) 499 { 500 tmp=ntfs_wc_from_utf8(in+i, &wtmp); 501 *(result+len16) = cpu_to_le16(wtmp); 502 } 503 return 0; 504 } 505 506 507 /** 508 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string 509 * @ins: input Unicode string buffer 510 * @ins_len: length of input string in Unicode characters 511 * @outs: on return contains the (allocated) output multibyte string 512 * @outs_len: length of output buffer in bytes 513 * 514 * Convert the input little endian, 2-byte Unicode string @ins, of length 515 * @ins_len into the multibyte string format dictated by the current locale. 516 * 517 * If *@outs is NULL, the function allocates the string and the caller is 518 * responsible for calling free(*@outs); when finished with it. 519 * 520 * On success the function returns the number of bytes written to the output 521 * string *@outs (>= 0), not counting the terminating NULL byte. If the output 522 * string buffer was allocated, *@outs is set to it. 523 * 524 * On error, -1 is returned, and errno is set to the error code. The following 525 * error codes can be expected: 526 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 527 * EILSEQ The input string cannot be represented as a multibyte 528 * sequence according to the current locale. 529 * ENAMETOOLONG Destination buffer is too small for input string. 530 * ENOMEM Not enough memory to allocate destination buffer. 531 */ 532 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs, int outs_len) 533 { 534 int out_len = outs_len; 535 if(ntfs_dupuni2utf8((wchar_t*)ins,ins_len,outs,&out_len)==0) 536 return out_len; 537 else 538 return EINVAL; 539 } 540 541 /** 542 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string 543 * @ins: input multibyte string buffer 544 * @outs: on return contains the (allocated) output Unicode string 545 * @outs_len: length of output buffer in Unicode characters 546 * 547 * Convert the input multibyte string @ins, from the current locale into the 548 * corresponding little endian, 2-byte Unicode string. 549 * 550 * If *@outs is NULL, the function allocates the string and the caller is 551 * responsible for calling free(*@outs); when finished with it. 552 * 553 * On success the function returns the number of Unicode characters written to 554 * the output string *@outs (>= 0), not counting the terminating Unicode NULL 555 * character. If the output string buffer was allocated, *@outs is set to it. 556 * 557 * On error, -1 is returned, and errno is set to the error code. The following 558 * error codes can be expected: 559 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 560 * EILSEQ The input string cannot be represented as a Unicode 561 * string according to the current locale. 562 * ENAMETOOLONG Destination buffer is too small for input string. 563 * ENOMEM Not enough memory to allocate destination buffer. 564 */ 565 int ntfs_mbstoucs(const char *ins, ntfschar **outs, int outs_len) 566 { 567 int in_len = strlen(ins); 568 int out_len = outs_len; 569 if(ntfs_duputf82uni((unsigned char*)ins,in_len,outs,&out_len)==0) 570 return out_len; 571 else 572 return EILSEQ; 573 } 574 575 #else 576 577 /** 578 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string 579 * @ins: input Unicode string buffer 580 * @ins_len: length of input string in Unicode characters 581 * @outs: on return contains the (allocated) output multibyte string 582 * @outs_len: length of output buffer in bytes 583 * 584 * Convert the input little endian, 2-byte Unicode string @ins, of length 585 * @ins_len into the multibyte string format dictated by the current locale. 586 * 587 * If *@outs is NULL, the function allocates the string and the caller is 588 * responsible for calling free(*@outs); when finished with it. 589 * 590 * On success the function returns the number of bytes written to the output 591 * string *@outs (>= 0), not counting the terminating NULL byte. If the output 592 * string buffer was allocated, *@outs is set to it. 593 * 594 * On error, -1 is returned, and errno is set to the error code. The following 595 * error codes can be expected: 596 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 597 * EILSEQ The input string cannot be represented as a multibyte 598 * sequence according to the current locale. 599 * ENAMETOOLONG Destination buffer is too small for input string. 600 * ENOMEM Not enough memory to allocate destination buffer. 601 */ 602 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs, 603 int outs_len) 604 { 605 char *mbs; 606 wchar_t wc; 607 int i, o, mbs_len; 608 int cnt = 0; 609 #ifdef HAVE_MBSINIT 610 mbstate_t mbstate; 611 #endif 612 613 if (!ins || !outs) { 614 errno = EINVAL; 615 return -1; 616 } 617 mbs = *outs; 618 mbs_len = outs_len; 619 if (mbs && !mbs_len) { 620 errno = ENAMETOOLONG; 621 return -1; 622 } 623 if (!mbs) { 624 mbs_len = (ins_len + 1) * MB_CUR_MAX; 625 mbs = ntfs_malloc(mbs_len); 626 if (!mbs) 627 return -1; 628 } 629 #ifdef HAVE_MBSINIT 630 memset(&mbstate, 0, sizeof(mbstate)); 631 #else 632 wctomb(NULL, 0); 633 #endif 634 for (i = o = 0; i < ins_len; i++) { 635 /* Reallocate memory if necessary or abort. */ 636 if ((int)(o + MB_CUR_MAX) > mbs_len) { 637 char *tc; 638 if (mbs == *outs) { 639 errno = ENAMETOOLONG; 640 return -1; 641 } 642 tc = ntfs_malloc((mbs_len + 64) & ~63); 643 if (!tc) 644 goto err_out; 645 memcpy(tc, mbs, mbs_len); 646 mbs_len = (mbs_len + 64) & ~63; 647 free(mbs); 648 mbs = tc; 649 } 650 /* Convert the LE Unicode character to a CPU wide character. */ 651 wc = (wchar_t)le16_to_cpu(ins[i]); 652 if (!wc) 653 break; 654 /* Convert the CPU endian wide character to multibyte. */ 655 #ifdef HAVE_MBSINIT 656 cnt = wcrtomb(mbs + o, wc, &mbstate); 657 #else 658 cnt = wctomb(mbs + o, wc); 659 #endif 660 if (cnt == -1) 661 goto err_out; 662 if (cnt <= 0) { 663 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt); 664 errno = EINVAL; 665 goto err_out; 666 } 667 o += cnt; 668 } 669 #ifdef HAVE_MBSINIT 670 /* Make sure we are back in the initial state. */ 671 if (!mbsinit(&mbstate)) { 672 ntfs_log_debug("Eeek. mbstate not in initial state!\n"); 673 errno = EILSEQ; 674 goto err_out; 675 } 676 #endif 677 /* Now write the NULL character. */ 678 mbs[o] = '\0'; 679 if (*outs != mbs) 680 *outs = mbs; 681 return o; 682 err_out: 683 if (mbs != *outs) { 684 int eo = errno; 685 free(mbs); 686 errno = eo; 687 } 688 return -1; 689 } 690 691 /** 692 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string 693 * @ins: input multibyte string buffer 694 * @outs: on return contains the (allocated) output Unicode string 695 * @outs_len: length of output buffer in Unicode characters 696 * 697 * Convert the input multibyte string @ins, from the current locale into the 698 * corresponding little endian, 2-byte Unicode string. 699 * 700 * If *@outs is NULL, the function allocates the string and the caller is 701 * responsible for calling free(*@outs); when finished with it. 702 * 703 * On success the function returns the number of Unicode characters written to 704 * the output string *@outs (>= 0), not counting the terminating Unicode NULL 705 * character. If the output string buffer was allocated, *@outs is set to it. 706 * 707 * On error, -1 is returned, and errno is set to the error code. The following 708 * error codes can be expected: 709 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL). 710 * EILSEQ The input string cannot be represented as a Unicode 711 * string according to the current locale. 712 * ENAMETOOLONG Destination buffer is too small for input string. 713 * ENOMEM Not enough memory to allocate destination buffer. 714 */ 715 int ntfs_mbstoucs(const char *ins, ntfschar **outs, int outs_len) 716 { 717 ntfschar *ucs; 718 const char *s; 719 wchar_t wc; 720 int i, o, cnt, ins_len, ucs_len, ins_size; 721 #ifdef HAVE_MBSINIT 722 mbstate_t mbstate; 723 #endif 724 725 if (!ins || !outs) { 726 errno = EINVAL; 727 return -1; 728 } 729 ucs = *outs; 730 ucs_len = outs_len; 731 if (ucs && !ucs_len) { 732 errno = ENAMETOOLONG; 733 return -1; 734 } 735 /* Determine the size of the multi-byte string in bytes. */ 736 ins_size = strlen(ins); 737 /* Determine the length of the multi-byte string. */ 738 s = ins; 739 #if defined(HAVE_MBSINIT) 740 memset(&mbstate, 0, sizeof(mbstate)); 741 ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate); 742 #ifdef __CYGWIN32__ 743 if (!ins_len && *ins) { 744 /* Older Cygwin had broken mbsrtowcs() implementation. */ 745 ins_len = strlen(ins); 746 } 747 #endif 748 #elif !defined(DJGPP) 749 ins_len = mbstowcs(NULL, s, 0); 750 #else 751 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */ 752 ins_len = strlen(ins); 753 #endif 754 if (ins_len == -1) 755 return ins_len; 756 #ifdef HAVE_MBSINIT 757 if ((s != ins) || !mbsinit(&mbstate)) { 758 #else 759 if (s != ins) { 760 #endif 761 errno = EILSEQ; 762 return -1; 763 } 764 /* Add the NULL terminator. */ 765 ins_len++; 766 if (!ucs) { 767 ucs_len = ins_len; 768 ucs = ntfs_malloc(ucs_len * sizeof(ntfschar)); 769 if (!ucs) 770 return -1; 771 } 772 #ifdef HAVE_MBSINIT 773 memset(&mbstate, 0, sizeof(mbstate)); 774 #else 775 mbtowc(NULL, NULL, 0); 776 #endif 777 for (i = o = cnt = 0; i < ins_size; i += cnt, o++) { 778 /* Reallocate memory if necessary or abort. */ 779 if (o >= ucs_len) { 780 ntfschar *tc; 781 if (ucs == *outs) { 782 errno = ENAMETOOLONG; 783 return -1; 784 } 785 /* 786 * We will never get here but hey, it's only a bit of 787 * extra code... 788 */ 789 ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63; 790 tc = (ntfschar*)realloc(ucs, ucs_len); 791 if (!tc) 792 goto err_out; 793 ucs = tc; 794 ucs_len /= sizeof(ntfschar); 795 } 796 /* Convert the multibyte character to a wide character. */ 797 #ifdef HAVE_MBSINIT 798 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate); 799 #else 800 cnt = mbtowc(&wc, ins + i, ins_size - i); 801 #endif 802 if (!cnt) 803 break; 804 if (cnt == -1) 805 goto err_out; 806 if (cnt < -1) { 807 ntfs_log_trace("Eeek. cnt = %i\n", cnt); 808 errno = EINVAL; 809 goto err_out; 810 } 811 /* Make sure we are not overflowing the NTFS Unicode set. */ 812 if ((unsigned long)wc >= (unsigned long)(1 << 813 (8 * sizeof(ntfschar)))) { 814 errno = EILSEQ; 815 goto err_out; 816 } 817 /* Convert the CPU wide character to a LE Unicode character. */ 818 ucs[o] = cpu_to_le16(wc); 819 } 820 #ifdef HAVE_MBSINIT 821 /* Make sure we are back in the initial state. */ 822 if (!mbsinit(&mbstate)) { 823 ntfs_log_trace("Eeek. mbstate not in initial state!\n"); 824 errno = EILSEQ; 825 goto err_out; 826 } 827 #endif 828 /* Now write the NULL character. */ 829 ucs[o] = cpu_to_le16(L'\0'); 830 if (*outs != ucs) 831 *outs = ucs; 832 return o; 833 err_out: 834 if (ucs != *outs) { 835 int eo = errno; 836 free(ucs); 837 errno = eo; 838 } 839 return -1; 840 } 841 842 #endif // defined(__BEOS__) || defined(__HAIKU__) 843 844 /** 845 * ntfs_upcase_table_build - build the default upcase table for NTFS 846 * @uc: destination buffer where to store the built table 847 * @uc_len: size of destination buffer in bytes 848 * 849 * ntfs_upcase_table_build() builds the default upcase table for NTFS and 850 * stores it in the caller supplied buffer @uc of size @uc_len. 851 * 852 * Note, @uc_len must be at least 128kiB in size or bad things will happen! 853 */ 854 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len) 855 { 856 static int uc_run_table[][3] = { /* Start, End, Add */ 857 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, 858 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, 859 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, 860 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, 861 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, 862 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, 863 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, 864 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, 865 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, 866 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, 867 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, 868 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, 869 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, 870 {0} 871 }; 872 static int uc_dup_table[][2] = { /* Start, End */ 873 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, 874 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, 875 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, 876 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, 877 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, 878 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, 879 {0} 880 }; 881 static int uc_byte_table[][2] = { /* Offset, Value */ 882 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, 883 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, 884 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, 885 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, 886 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, 887 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, 888 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, 889 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, 890 {0} 891 }; 892 int i, r; 893 894 memset((char*)uc, 0, uc_len); 895 uc_len >>= 1; 896 if (uc_len > 65536) 897 uc_len = 65536; 898 for (i = 0; (u32)i < uc_len; i++) 899 uc[i] = i; 900 for (r = 0; uc_run_table[r][0]; r++) 901 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) 902 uc[i] += uc_run_table[r][2]; 903 for (r = 0; uc_dup_table[r][0]; r++) 904 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) 905 uc[i + 1]--; 906 for (r = 0; uc_byte_table[r][0]; r++) 907 uc[uc_byte_table[r][0]] = uc_byte_table[r][1]; 908 } 909 910 /** 911 * ntfs_str2ucs - convert a string to a valid NTFS file name 912 * @s: input string 913 * @len: length of output buffer in Unicode characters 914 * 915 * Convert the input @s string into the corresponding little endian, 916 * 2-byte Unicode string. The length of the converted string is less 917 * or equal to the maximum length allowed by the NTFS format (255). 918 * 919 * If @s is NULL then return AT_UNNAMED. 920 * 921 * On success the function returns the Unicode string in an allocated 922 * buffer and the caller is responsible to free it when it's not needed 923 * anymore. 924 * 925 * On error NULL is returned and errno is set to the error code. 926 */ 927 ntfschar *ntfs_str2ucs(const char *s, int *len) 928 { 929 ntfschar *ucs = NULL; 930 931 if (s && ((*len = ntfs_mbstoucs(s, &ucs, 0)) == -1)) { 932 ntfs_log_perror("Couldn't convert '%s' to Unicode", s); 933 return NULL; 934 } 935 if (*len > NTFS_MAX_NAME_LEN) { 936 free(ucs); 937 errno = ENAMETOOLONG; 938 return NULL; 939 } 940 if (!ucs || !*len) { 941 ucs = AT_UNNAMED; 942 *len = 0; 943 } 944 return ucs; 945 } 946 947 /** 948 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs() 949 * @ucs input string to be freed 950 * 951 * Free memory at @ucs and which was allocated by ntfs_str2ucs. 952 * 953 * Return value: none. 954 */ 955 void ntfs_ucsfree(ntfschar *ucs) 956 { 957 if (ucs && (ucs != AT_UNNAMED)) 958 free(ucs); 959 } 960 961