1 /**
2 * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3 *
4 * Copyright (c) 2000-2004 Anton Altaparmakov
5 * Copyright (c) 2002-2009 Szabolcs Szakacsits
6 * Copyright (c) 2008-2015 Jean-Pierre Andre
7 * Copyright (c) 2008 Bernhard Kaindl
8 *
9 * This program/include file is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as published
11 * by the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program/include file is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program (in the main directory of the NTFS-3G
21 * distribution in the file COPYING); if not, write to the Free Software
22 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #ifdef HAVE_CONFIG_H
26 #include "config.h"
27 #endif
28
29 #ifdef HAVE_STDIO_H
30 #include <stdio.h>
31 #endif
32 #ifdef HAVE_STDLIB_H
33 #include <stdlib.h>
34 #endif
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif
38 #ifdef HAVE_STRING_H
39 #include <string.h>
40 #endif
41 #ifdef HAVE_ERRNO_H
42 #include <errno.h>
43 #endif
44 #ifdef HAVE_LOCALE_H
45 #include <locale.h>
46 #endif
47
48 #if defined(__APPLE__) || defined(__DARWIN__)
49 #ifdef ENABLE_NFCONV
50 #include <CoreFoundation/CoreFoundation.h>
51 #endif /* ENABLE_NFCONV */
52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
53
54 #include "compat.h"
55 #include "attrib.h"
56 #include "types.h"
57 #include "unistr.h"
58 #include "debug.h"
59 #include "logging.h"
60 #include "misc.h"
61
#ifndef ALLOW_BROKEN_UNICODE
/*
 * Build-time switch which defaults to 1 (enabled) unless the build has
 * already defined it. The UTF-8 <-> UTF-16 converters in this file test it
 * with #if: when non-zero they accept unpaired UTF-16 surrogates as well as
 * U+FFFE and U+FFFF, when zero such input is rejected with EILSEQ.
 *
 * Original note: Erik allowing broken UTF-16 surrogate pairs and U+FFFE and
 * U+FFFF by default, open to debate.
 */
#define ALLOW_BROKEN_UNICODE 1
#endif /* !defined(ALLOW_BROKEN_UNICODE) */

/*
 * IMPORTANT
 * =========
 *
 * All these routines assume that the Unicode characters are in little endian
 * encoding inside the strings!!!
 */

/*
 * Non-zero makes ntfs_ucstombs() and ntfs_mbstoucs() use the built-in
 * UTF-8 <-> UTF-16LE converters; zero makes them use the multibyte
 * conversion functions of the C library instead.
 */
static int use_utf8 = 1; /* use UTF-8 encoding for file names */

#if defined(__APPLE__) || defined(__DARWIN__)
#ifdef ENABLE_NFCONV
/**
 * This variable controls whether or not automatic normalization form conversion
 * should be performed when translating NTFS unicode file names to UTF-8.
 * Defaults to on, but can be controlled from the outside using the function
 *   int ntfs_macosx_normalize_filenames(int normalize);
 *
 * It only exists on Apple/Darwin builds which define ENABLE_NFCONV.
 */
static int nfconvert_utf8 = 1;
#endif /* ENABLE_NFCONV */
#endif /* defined(__APPLE__) || defined(__DARWIN__) */

/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
 *
 * NOTE: the table is currently compiled out by the "#if 0" below, so it is
 * kept for reference only.
 */
#if 0
static const u8 legal_ansi_char_array[0x40] = {
	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,

	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
};
#endif
109
110 /**
111 * ntfs_names_are_equal - compare two Unicode names for equality
112 * @s1: name to compare to @s2
113 * @s1_len: length in Unicode characters of @s1
114 * @s2: name to compare to @s1
115 * @s2_len: length in Unicode characters of @s2
116 * @ic: ignore case bool
117 * @upcase: upcase table (only if @ic == IGNORE_CASE)
118 * @upcase_size: length in Unicode characters of @upcase (if present)
119 *
120 * Compare the names @s1 and @s2 and return TRUE (1) if the names are
121 * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
122 * the @upcase table is used to perform a case insensitive comparison.
123 */
ntfs_names_are_equal(const ntfschar * s1,size_t s1_len,const ntfschar * s2,size_t s2_len,const IGNORE_CASE_BOOL ic,const ntfschar * upcase,const u32 upcase_size)124 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
125 const ntfschar *s2, size_t s2_len,
126 const IGNORE_CASE_BOOL ic,
127 const ntfschar *upcase, const u32 upcase_size)
128 {
129 if (s1_len != s2_len)
130 return FALSE;
131 if (!s1_len)
132 return TRUE;
133 if (ic == CASE_SENSITIVE)
134 return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
135 return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
136 TRUE;
137 }
138
139 /*
140 * ntfs_names_full_collate() fully collate two Unicode names
141 *
142 * @name1: first Unicode name to compare
143 * @name1_len: length of first Unicode name to compare
144 * @name2: second Unicode name to compare
145 * @name2_len: length of second Unicode name to compare
146 * @ic: either CASE_SENSITIVE or IGNORE_CASE (see below)
147 * @upcase: upcase table
148 * @upcase_len: upcase table size
149 *
150 * If @ic is CASE_SENSITIVE, then the names are compared primarily ignoring
151 * case, but if the names are equal ignoring case, then they are compared
152 * case-sensitively. As an example, "abc" would collate before "BCD" (since
153 * "abc" and "BCD" differ ignoring case and 'A' < 'B') but after "ABC" (since
154 * "ABC" and "abc" are equal ignoring case and 'A' < 'a'). This matches the
155 * collation order of filenames as indexed in NTFS directories.
156 *
157 * If @ic is IGNORE_CASE, then the names are only compared case-insensitively
158 * and are considered to match if and only if they are equal ignoring case.
159 *
160 * Returns:
161 * -1 if the first name collates before the second one,
162 * 0 if the names match, or
163 * 1 if the second name collates before the first one
164 */
ntfs_names_full_collate(const ntfschar * name1,const u32 name1_len,const ntfschar * name2,const u32 name2_len,const IGNORE_CASE_BOOL ic,const ntfschar * upcase,const u32 upcase_len)165 int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
166 const ntfschar *name2, const u32 name2_len,
167 const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
168 const u32 upcase_len)
169 {
170 u32 cnt;
171 u16 c1, c2;
172 u16 u1, u2;
173
174 #ifdef DEBUG
175 if (!name1 || !name2 || !upcase || !upcase_len) {
176 ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
177 exit(1);
178 }
179 #endif
180 cnt = min(name1_len, name2_len);
181 if (cnt > 0) {
182 if (ic == CASE_SENSITIVE) {
183 while (--cnt && (*name1 == *name2)) {
184 name1++;
185 name2++;
186 }
187 u1 = c1 = le16_to_cpu(*name1);
188 u2 = c2 = le16_to_cpu(*name2);
189 if (u1 < upcase_len)
190 u1 = le16_to_cpu(upcase[u1]);
191 if (u2 < upcase_len)
192 u2 = le16_to_cpu(upcase[u2]);
193 if ((u1 == u2) && cnt)
194 do {
195 name1++;
196 u1 = le16_to_cpu(*name1);
197 name2++;
198 u2 = le16_to_cpu(*name2);
199 if (u1 < upcase_len)
200 u1 = le16_to_cpu(upcase[u1]);
201 if (u2 < upcase_len)
202 u2 = le16_to_cpu(upcase[u2]);
203 } while ((u1 == u2) && --cnt);
204 if (u1 < u2)
205 return -1;
206 if (u1 > u2)
207 return 1;
208 if (name1_len < name2_len)
209 return -1;
210 if (name1_len > name2_len)
211 return 1;
212 if (c1 < c2)
213 return -1;
214 if (c1 > c2)
215 return 1;
216 } else {
217 do {
218 u1 = le16_to_cpu(*name1);
219 name1++;
220 u2 = le16_to_cpu(*name2);
221 name2++;
222 if (u1 < upcase_len)
223 u1 = le16_to_cpu(upcase[u1]);
224 if (u2 < upcase_len)
225 u2 = le16_to_cpu(upcase[u2]);
226 } while ((u1 == u2) && --cnt);
227 if (u1 < u2)
228 return -1;
229 if (u1 > u2)
230 return 1;
231 if (name1_len < name2_len)
232 return -1;
233 if (name1_len > name2_len)
234 return 1;
235 }
236 } else {
237 if (name1_len < name2_len)
238 return -1;
239 if (name1_len > name2_len)
240 return 1;
241 }
242 return 0;
243 }
244
245 /**
246 * ntfs_ucsncmp - compare two little endian Unicode strings
247 * @s1: first string
248 * @s2: second string
249 * @n: maximum unicode characters to compare
250 *
251 * Compare the first @n characters of the Unicode strings @s1 and @s2,
252 * The strings in little endian format and appropriate le16_to_cpu()
253 * conversion is performed on non-little endian machines.
254 *
255 * The function returns an integer less than, equal to, or greater than zero
256 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
257 * to be less than, to match, or be greater than @s2.
258 */
ntfs_ucsncmp(const ntfschar * s1,const ntfschar * s2,size_t n)259 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
260 {
261 u16 c1, c2;
262 size_t i;
263
264 #ifdef DEBUG
265 if (!s1 || !s2) {
266 ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
267 exit(1);
268 }
269 #endif
270 for (i = 0; i < n; ++i) {
271 c1 = le16_to_cpu(s1[i]);
272 c2 = le16_to_cpu(s2[i]);
273 if (c1 < c2)
274 return -1;
275 if (c1 > c2)
276 return 1;
277 if (!c1)
278 break;
279 }
280 return 0;
281 }
282
283 /**
284 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
285 * @s1: first string
286 * @s2: second string
287 * @n: maximum unicode characters to compare
288 * @upcase: upcase table
289 * @upcase_size: upcase table size in Unicode characters
290 *
291 * Compare the first @n characters of the Unicode strings @s1 and @s2,
292 * ignoring case. The strings in little endian format and appropriate
293 * le16_to_cpu() conversion is performed on non-little endian machines.
294 *
295 * Each character is uppercased using the @upcase table before the comparison.
296 *
297 * The function returns an integer less than, equal to, or greater than zero
298 * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
299 * to be less than, to match, or be greater than @s2.
300 */
ntfs_ucsncasecmp(const ntfschar * s1,const ntfschar * s2,size_t n,const ntfschar * upcase,const u32 upcase_size)301 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
302 const ntfschar *upcase, const u32 upcase_size)
303 {
304 u16 c1, c2;
305 size_t i;
306
307 #ifdef DEBUG
308 if (!s1 || !s2 || !upcase) {
309 ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
310 exit(1);
311 }
312 #endif
313 for (i = 0; i < n; ++i) {
314 if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
315 c1 = le16_to_cpu(upcase[c1]);
316 if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
317 c2 = le16_to_cpu(upcase[c2]);
318 if (c1 < c2)
319 return -1;
320 if (c1 > c2)
321 return 1;
322 if (!c1)
323 break;
324 }
325 return 0;
326 }
327
328 /**
329 * ntfs_ucsnlen - determine the length of a little endian Unicode string
330 * @s: pointer to Unicode string
331 * @maxlen: maximum length of string @s
332 *
333 * Return the number of Unicode characters in the little endian Unicode
334 * string @s up to a maximum of maxlen Unicode characters, not including
335 * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
336 * and @s + @maxlen, @maxlen is returned.
337 *
338 * This function never looks beyond @s + @maxlen.
339 */
ntfs_ucsnlen(const ntfschar * s,u32 maxlen)340 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
341 {
342 u32 i;
343
344 for (i = 0; i < maxlen; i++) {
345 if (!le16_to_cpu(s[i]))
346 break;
347 }
348 return i;
349 }
350
351 /**
352 * ntfs_ucsndup - duplicate little endian Unicode string
353 * @s: pointer to Unicode string
354 * @maxlen: maximum length of string @s
355 *
356 * Return a pointer to a new little endian Unicode string which is a duplicate
357 * of the string s. Memory for the new string is obtained with ntfs_malloc(3),
358 * and can be freed with free(3).
359 *
360 * A maximum of @maxlen Unicode characters are copied and a terminating
361 * (ntfschar)'\0' little endian Unicode character is added.
362 *
363 * This function never looks beyond @s + @maxlen.
364 *
365 * Return a pointer to the new little endian Unicode string on success and NULL
366 * on failure with errno set to the error code.
367 */
ntfs_ucsndup(const ntfschar * s,u32 maxlen)368 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
369 {
370 ntfschar *dst;
371 u32 len;
372
373 len = ntfs_ucsnlen(s, maxlen);
374 dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
375 if (dst) {
376 memcpy(dst, s, len * sizeof(ntfschar));
377 dst[len] = const_cpu_to_le16(L'\0');
378 }
379 return dst;
380 }
381
382 /**
383 * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
384 * @name:
385 * @name_len:
386 * @upcase:
387 * @upcase_len:
388 *
389 * Description...
390 *
391 * Returns:
392 */
ntfs_name_upcase(ntfschar * name,u32 name_len,const ntfschar * upcase,const u32 upcase_len)393 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
394 const u32 upcase_len)
395 {
396 u32 i;
397 u16 u;
398
399 for (i = 0; i < name_len; i++)
400 if ((u = le16_to_cpu(name[i])) < upcase_len)
401 name[i] = upcase[u];
402 }
403
404 /**
405 * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
406 */
ntfs_name_locase(ntfschar * name,u32 name_len,const ntfschar * locase,const u32 locase_len)407 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
408 const u32 locase_len)
409 {
410 u32 i;
411 u16 u;
412
413 if (locase)
414 for (i = 0; i < name_len; i++)
415 if ((u = le16_to_cpu(name[i])) < locase_len)
416 name[i] = locase[u];
417 }
418
419 /**
420 * ntfs_file_value_upcase - Convert a filename to upper case
421 * @file_name_attr:
422 * @upcase:
423 * @upcase_len:
424 *
425 * Description...
426 *
427 * Returns:
428 */
ntfs_file_value_upcase(FILE_NAME_ATTR * file_name_attr,const ntfschar * upcase,const u32 upcase_len)429 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
430 const ntfschar *upcase, const u32 upcase_len)
431 {
432 ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
433 file_name_attr->file_name_length, upcase, upcase_len);
434 }
435
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets. However, it does not convert anything if
   there was some error with the locale setup, or if no locale was set up,
   as when mount is called during early boot where, by policy, locales are
   not used (and may not be available if /usr is not yet mounted).
   This patch fixes the resulting issues for systems which use UTF-8;
   for others, specifying the locale in fstab brings them the encoding
   which they want.

   If no locale is defined or there was a problem with setting one
   up, and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them) as a result.

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
459
460 /*
461 * Return the number of bytes in UTF-8 needed (without the terminating null) to
462 * store the given UTF-16LE string.
463 *
464 * On error, -1 is returned, and errno is set to the error code. The following
465 * error codes can be expected:
466 * EILSEQ The input string is not valid UTF-16LE (only possible
467 * if compiled without ALLOW_BROKEN_UNICODE).
468 * ENAMETOOLONG The length of the UTF-8 string in bytes (without the
469 * terminating null) would exceed @outs_len.
470 */
utf16_to_utf8_size(const ntfschar * ins,const int ins_len,int outs_len)471 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
472 {
473 int i, ret = -1;
474 int count = 0;
475 BOOL surrog;
476
477 surrog = FALSE;
478 for (i = 0; i < ins_len && ins[i] && count <= outs_len; i++) {
479 unsigned short c = le16_to_cpu(ins[i]);
480 if (surrog) {
481 if ((c >= 0xdc00) && (c < 0xe000)) {
482 surrog = FALSE;
483 count += 4;
484 } else {
485 #if ALLOW_BROKEN_UNICODE
486 /* The first UTF-16 unit of a surrogate pair has
487 * a value between 0xd800 and 0xdc00. It can be
488 * encoded as an individual UTF-8 sequence if we
489 * cannot combine it with the next UTF-16 unit
490 * unit as a surrogate pair. */
491 surrog = FALSE;
492 count += 3;
493
494 --i;
495 continue;
496 #else
497 goto fail;
498 #endif /* ALLOW_BROKEN_UNICODE */
499 }
500 } else
501 if (c < 0x80)
502 count++;
503 else if (c < 0x800)
504 count += 2;
505 else if (c < 0xd800)
506 count += 3;
507 else if (c < 0xdc00)
508 surrog = TRUE;
509 #if ALLOW_BROKEN_UNICODE
510 else if (c < 0xe000)
511 count += 3;
512 else if (c >= 0xe000)
513 #else
514 else if ((c >= 0xe000) && (c < 0xfffe))
515 #endif /* ALLOW_BROKEN_UNICODE */
516 count += 3;
517 else
518 goto fail;
519 }
520
521 if (surrog && count <= outs_len) {
522 #if ALLOW_BROKEN_UNICODE
523 count += 3; /* ending with a single surrogate */
524 #else
525 goto fail;
526 #endif /* ALLOW_BROKEN_UNICODE */
527 }
528
529 if (count > outs_len) {
530 errno = ENAMETOOLONG;
531 goto out;
532 }
533
534 ret = count;
535 out:
536 return ret;
537 fail:
538 errno = EILSEQ;
539 goto out;
540 }
541
542 /*
543 * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
544 * @ins: input utf16 string buffer
545 * @ins_len: length of input string in utf16 characters
546 * @outs: on return contains the (allocated) output multibyte string
547 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL)
548 *
549 * Return -1 with errno set if string has invalid byte sequence or too long.
550 */
ntfs_utf16_to_utf8(const ntfschar * ins,const int ins_len,char ** outs,int outs_len)551 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
552 char **outs, int outs_len)
553 {
554 #if defined(__APPLE__) || defined(__DARWIN__)
555 #ifdef ENABLE_NFCONV
556 char *original_outs_value = *outs;
557 int original_outs_len = outs_len;
558 #endif /* ENABLE_NFCONV */
559 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
560
561 char *t;
562 int i, size, ret = -1;
563 int halfpair;
564
565 halfpair = 0;
566 if (!*outs) {
567 /* If no output buffer was provided, we will allocate one and
568 * limit its length to PATH_MAX. Note: we follow the standard
569 * convention of PATH_MAX including the terminating null. */
570 outs_len = PATH_MAX;
571 }
572
573 /* The size *with* the terminating null is limited to @outs_len,
574 * so the size *without* the terminating null is limited to one less. */
575 size = utf16_to_utf8_size(ins, ins_len, outs_len - 1);
576
577 if (size < 0)
578 goto out;
579
580 if (!*outs) {
581 outs_len = size + 1;
582 *outs = ntfs_malloc(outs_len);
583 if (!*outs)
584 goto out;
585 }
586
587 t = *outs;
588
589 for (i = 0; i < ins_len && ins[i]; i++) {
590 unsigned short c = le16_to_cpu(ins[i]);
591 /* size not double-checked */
592 if (halfpair) {
593 if ((c >= 0xdc00) && (c < 0xe000)) {
594 *t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
595 *t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
596 *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
597 *t++ = 0x80 + (c & 63);
598 halfpair = 0;
599 } else {
600 #if ALLOW_BROKEN_UNICODE
601 /* The first UTF-16 unit of a surrogate pair has
602 * a value between 0xd800 and 0xdc00. It can be
603 * encoded as an individual UTF-8 sequence if we
604 * cannot combine it with the next UTF-16 unit
605 * unit as a surrogate pair. */
606 *t++ = 0xe0 | (halfpair >> 12);
607 *t++ = 0x80 | ((halfpair >> 6) & 0x3f);
608 *t++ = 0x80 | (halfpair & 0x3f);
609 halfpair = 0;
610
611 --i;
612 continue;
613 #else
614 goto fail;
615 #endif /* ALLOW_BROKEN_UNICODE */
616 }
617 } else if (c < 0x80) {
618 *t++ = c;
619 } else {
620 if (c < 0x800) {
621 *t++ = (0xc0 | ((c >> 6) & 0x3f));
622 *t++ = 0x80 | (c & 0x3f);
623 } else if (c < 0xd800) {
624 *t++ = 0xe0 | (c >> 12);
625 *t++ = 0x80 | ((c >> 6) & 0x3f);
626 *t++ = 0x80 | (c & 0x3f);
627 } else if (c < 0xdc00)
628 halfpair = c;
629 #if ALLOW_BROKEN_UNICODE
630 else if (c < 0xe000) {
631 *t++ = 0xe0 | (c >> 12);
632 *t++ = 0x80 | ((c >> 6) & 0x3f);
633 *t++ = 0x80 | (c & 0x3f);
634 }
635 #endif /* ALLOW_BROKEN_UNICODE */
636 else if (c >= 0xe000) {
637 *t++ = 0xe0 | (c >> 12);
638 *t++ = 0x80 | ((c >> 6) & 0x3f);
639 *t++ = 0x80 | (c & 0x3f);
640 } else
641 goto fail;
642 }
643 }
644 #if ALLOW_BROKEN_UNICODE
645 if (halfpair) { /* ending with a single surrogate */
646 *t++ = 0xe0 | (halfpair >> 12);
647 *t++ = 0x80 | ((halfpair >> 6) & 0x3f);
648 *t++ = 0x80 | (halfpair & 0x3f);
649 }
650 #endif /* ALLOW_BROKEN_UNICODE */
651 *t = '\0';
652
653 #if defined(__APPLE__) || defined(__DARWIN__)
654 #ifdef ENABLE_NFCONV
655 if(nfconvert_utf8 && (t - *outs) > 0) {
656 char *new_outs = NULL;
657 int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
658 if(new_outs_len >= 0 && new_outs != NULL) {
659 if(original_outs_value != *outs) {
660 // We have allocated outs ourselves.
661 free(*outs);
662 *outs = new_outs;
663 t = *outs + new_outs_len;
664 }
665 else {
666 // We need to copy new_outs into the fixed outs buffer.
667 memset(*outs, 0, original_outs_len);
668 strncpy(*outs, new_outs, original_outs_len-1);
669 t = *outs + original_outs_len;
670 free(new_outs);
671 }
672 }
673 else {
674 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
675 ntfs_log_error(" new_outs=0x%p\n", new_outs);
676 ntfs_log_error(" new_outs_len=%d\n", new_outs_len);
677 }
678 }
679 #endif /* ENABLE_NFCONV */
680 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
681
682 ret = t - *outs;
683 out:
684 return ret;
685 fail:
686 errno = EILSEQ;
687 goto out;
688 }
689
/*
 * Compute how many 16-bit elements of UTF-16LE (terminating null excluded)
 * are needed to store the UTF-8 string @s.
 *
 * Every sequence counts for one element, except those led by a byte from
 * 0xF0 up, which count for two (a surrogate pair). The bytes following a
 * lead byte are skipped without being examined, apart from stopping at the
 * terminating null.
 *
 * Return the count, or -1 with errno set to ENAMETOOLONG if the count
 * reaches PATH_MAX, or to EILSEQ if a lead byte is 0xF5 or above.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;

	for (;;) {
		lead = *p++;
		if (!lead)
			break;
		if (++units >= PATH_MAX)
			goto too_long;
		if (lead < 0xc0)
			continue;	/* no following byte to skip */
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/* second byte, present in all multi-byte sequences */
		if (!*p)
			break;
		p++;
		/* third byte */
		if (!*p)
			break;
		if (lead >= 0xE0)
			p++;
		/* fourth byte, which implies a surrogate pair */
		if (!*p)
			break;
		if (lead >= 0xF0) {
			p++;
			if (++units >= PATH_MAX)
				goto too_long;
		}
	}
	return (int)units;

too_long:
	errno = ENAMETOOLONG;
	return -1;
}
737 /*
738 * This converts one UTF-8 sequence to cpu-endian Unicode value
739 * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
740 *
741 * Return the number of used utf8 bytes or -1 with errno set
742 * if sequence is invalid.
743 */
utf8_to_unicode(u32 * wc,const char * s)744 static int utf8_to_unicode(u32 *wc, const char *s)
745 {
746 unsigned int byte = *((const unsigned char *)s);
747
748 /* single byte */
749 if (byte == 0) {
750 *wc = (u32) 0;
751 return 0;
752 } else if (byte < 0x80) {
753 *wc = (u32) byte;
754 return 1;
755 /* double byte */
756 } else if (byte < 0xc2) {
757 goto fail;
758 } else if (byte < 0xE0) {
759 if ((s[1] & 0xC0) == 0x80) {
760 *wc = ((u32)(byte & 0x1F) << 6)
761 | ((u32)(s[1] & 0x3F));
762 return 2;
763 } else
764 goto fail;
765 /* three-byte */
766 } else if (byte < 0xF0) {
767 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
768 *wc = ((u32)(byte & 0x0F) << 12)
769 | ((u32)(s[1] & 0x3F) << 6)
770 | ((u32)(s[2] & 0x3F));
771 /* Check valid ranges */
772 #if ALLOW_BROKEN_UNICODE
773 if (((*wc >= 0x800) && (*wc <= 0xD7FF))
774 || ((*wc >= 0xD800) && (*wc <= 0xDFFF))
775 || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
776 return 3;
777 #else
778 if (((*wc >= 0x800) && (*wc <= 0xD7FF))
779 || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
780 return 3;
781 #endif /* ALLOW_BROKEN_UNICODE */
782 }
783 goto fail;
784 /* four-byte */
785 } else if (byte < 0xF5) {
786 if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
787 && ((s[3] & 0xC0) == 0x80)) {
788 *wc = ((u32)(byte & 0x07) << 18)
789 | ((u32)(s[1] & 0x3F) << 12)
790 | ((u32)(s[2] & 0x3F) << 6)
791 | ((u32)(s[3] & 0x3F));
792 /* Check valid ranges */
793 if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
794 return 4;
795 }
796 goto fail;
797 }
798 fail:
799 errno = EILSEQ;
800 return -1;
801 }
802
803 /**
804 * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
805 * @ins: input multibyte string buffer
806 * @outs: on return contains the (allocated) output utf16 string
807 * @outs_len: length of output buffer in utf16 characters
808 *
809 * Return -1 with errno set.
810 */
ntfs_utf8_to_utf16(const char * ins,ntfschar ** outs)811 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
812 {
813 #if defined(__APPLE__) || defined(__DARWIN__)
814 #ifdef ENABLE_NFCONV
815 char *new_ins = NULL;
816 if(nfconvert_utf8) {
817 int new_ins_len;
818 new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
819 if(new_ins_len >= 0)
820 ins = new_ins;
821 else
822 ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
823 }
824 #endif /* ENABLE_NFCONV */
825 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
826 const char *t = ins;
827 u32 wc;
828 BOOL allocated;
829 ntfschar *outpos;
830 int shorts, ret = -1;
831
832 shorts = utf8_to_utf16_size(ins);
833 if (shorts < 0)
834 goto fail;
835
836 allocated = FALSE;
837 if (!*outs) {
838 *outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
839 if (!*outs)
840 goto fail;
841 allocated = TRUE;
842 }
843
844 outpos = *outs;
845
846 while(1) {
847 int m = utf8_to_unicode(&wc, t);
848 if (m <= 0) {
849 if (m < 0) {
850 /* do not leave space allocated if failed */
851 if (allocated) {
852 free(*outs);
853 *outs = (ntfschar*)NULL;
854 }
855 goto fail;
856 }
857 *outpos++ = const_cpu_to_le16(0);
858 break;
859 }
860 if (wc < 0x10000)
861 *outpos++ = cpu_to_le16(wc);
862 else {
863 wc -= 0x10000;
864 *outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
865 *outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
866 }
867 t += m;
868 }
869
870 ret = --outpos - *outs;
871 fail:
872 #if defined(__APPLE__) || defined(__DARWIN__)
873 #ifdef ENABLE_NFCONV
874 if(new_ins != NULL)
875 free(new_ins);
876 #endif /* ENABLE_NFCONV */
877 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
878 return ret;
879 }
880
881 /**
882 * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
883 * @ins: input Unicode string buffer
884 * @ins_len: length of input string in Unicode characters
885 * @outs: on return contains the (allocated) output multibyte string
886 * @outs_len: length of output buffer in bytes (ignored if *@outs is NULL)
887 *
888 * Convert the input little endian, 2-byte Unicode string @ins, of length
889 * @ins_len into the multibyte string format dictated by the current locale.
890 *
891 * If *@outs is NULL, the function allocates the string and the caller is
892 * responsible for calling free(*@outs); when finished with it.
893 *
894 * On success the function returns the number of bytes written to the output
895 * string *@outs (>= 0), not counting the terminating NULL byte. If the output
896 * string buffer was allocated, *@outs is set to it.
897 *
898 * On error, -1 is returned, and errno is set to the error code. The following
899 * error codes can be expected:
900 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
901 * EILSEQ The input string cannot be represented as a multibyte
902 * sequence according to the current locale.
903 * ENAMETOOLONG Destination buffer is too small for input string.
904 * ENOMEM Not enough memory to allocate destination buffer.
905 */
ntfs_ucstombs(const ntfschar * ins,const int ins_len,char ** outs,int outs_len)906 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
907 int outs_len)
908 {
909 char *mbs;
910 int mbs_len;
911 #ifdef MB_CUR_MAX
912 wchar_t wc;
913 int i, o;
914 int cnt = 0;
915 #ifdef HAVE_MBSINIT
916 mbstate_t mbstate;
917 #endif
918 #endif /* MB_CUR_MAX */
919
920 if (!ins || !outs) {
921 errno = EINVAL;
922 return -1;
923 }
924 mbs = *outs;
925 mbs_len = outs_len;
926 if (mbs && !mbs_len) {
927 errno = ENAMETOOLONG;
928 return -1;
929 }
930 if (use_utf8)
931 return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
932 #ifdef MB_CUR_MAX
933 if (!mbs) {
934 mbs_len = (ins_len + 1) * MB_CUR_MAX;
935 mbs = ntfs_malloc(mbs_len);
936 if (!mbs)
937 return -1;
938 }
939 #ifdef HAVE_MBSINIT
940 memset(&mbstate, 0, sizeof(mbstate));
941 #else
942 #ifndef __HAIKU__
943 wctomb(NULL, 0);
944 #endif
945 #endif
946 for (i = o = 0; i < ins_len; i++) {
947 /* Reallocate memory if necessary or abort. */
948 if ((int)(o + MB_CUR_MAX) > mbs_len) {
949 char *tc;
950 if (mbs == *outs) {
951 errno = ENAMETOOLONG;
952 return -1;
953 }
954 tc = ntfs_malloc((mbs_len + 64) & ~63);
955 if (!tc)
956 goto err_out;
957 memcpy(tc, mbs, mbs_len);
958 mbs_len = (mbs_len + 64) & ~63;
959 free(mbs);
960 mbs = tc;
961 }
962 /* Convert the LE Unicode character to a CPU wide character. */
963 wc = (wchar_t)le16_to_cpu(ins[i]);
964 if (!wc)
965 break;
966 /* Convert the CPU endian wide character to multibyte. */
967 #ifdef HAVE_MBSINIT
968 cnt = wcrtomb(mbs + o, wc, &mbstate);
969 #elif defined(__HAIKU__)
970 cnt = -1;
971 #else
972 cnt = wctomb(mbs + o, wc);
973 #endif
974 if (cnt == -1)
975 goto err_out;
976 if (cnt <= 0) {
977 ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
978 errno = EINVAL;
979 goto err_out;
980 }
981 o += cnt;
982 }
983 #ifdef HAVE_MBSINIT
984 /* Make sure we are back in the initial state. */
985 if (!mbsinit(&mbstate)) {
986 ntfs_log_debug("Eeek. mbstate not in initial state!\n");
987 errno = EILSEQ;
988 goto err_out;
989 }
990 #endif
991 /* Now write the NULL character. */
992 mbs[o] = '\0';
993 if (*outs != mbs)
994 *outs = mbs;
995 return o;
996 err_out:
997 if (mbs != *outs) {
998 int eo = errno;
999 free(mbs);
1000 errno = eo;
1001 }
1002 #else /* MB_CUR_MAX */
1003 errno = EILSEQ;
1004 #endif /* MB_CUR_MAX */
1005 return -1;
1006 }
1007
1008 /**
1009 * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
1010 * @ins: input multibyte string buffer
1011 * @outs: on return contains the (allocated) output Unicode string
1012 *
1013 * Convert the input multibyte string @ins, from the current locale into the
1014 * corresponding little endian, 2-byte Unicode string.
1015 *
1016 * The function allocates the string and the caller is responsible for calling
1017 * free(*@outs); when finished with it.
1018 *
1019 * On success the function returns the number of Unicode characters written to
1020 * the output string *@outs (>= 0), not counting the terminating Unicode NULL
1021 * character.
1022 *
1023 * On error, -1 is returned, and errno is set to the error code. The following
1024 * error codes can be expected:
1025 * EINVAL Invalid arguments (e.g. @ins or @outs is NULL).
1026 * EILSEQ The input string cannot be represented as a Unicode
1027 * string according to the current locale.
1028 * ENAMETOOLONG Destination buffer is too small for input string.
1029 * ENOMEM Not enough memory to allocate destination buffer.
1030 */
ntfs_mbstoucs(const char * ins,ntfschar ** outs)1031 int ntfs_mbstoucs(const char *ins, ntfschar **outs)
1032 {
1033 #ifdef MB_CUR_MAX
1034 ntfschar *ucs;
1035 const char *s;
1036 wchar_t wc;
1037 int i, o, cnt, ins_len, ucs_len, ins_size;
1038 #ifdef HAVE_MBSINIT
1039 mbstate_t mbstate;
1040 #endif
1041 #endif /* MB_CUR_MAX */
1042
1043 if (!ins || !outs) {
1044 errno = EINVAL;
1045 return -1;
1046 }
1047
1048 if (use_utf8)
1049 return ntfs_utf8_to_utf16(ins, outs);
1050
1051 #ifdef MB_CUR_MAX
1052 /* Determine the size of the multi-byte string in bytes. */
1053 ins_size = strlen(ins);
1054 /* Determine the length of the multi-byte string. */
1055 s = ins;
1056 #if defined(HAVE_MBSINIT)
1057 memset(&mbstate, 0, sizeof(mbstate));
1058 ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
1059 #ifdef __CYGWIN32__
1060 if (!ins_len && *ins) {
1061 /* Older Cygwin had broken mbsrtowcs() implementation. */
1062 ins_len = strlen(ins);
1063 }
1064 #endif
1065 #elif !defined(DJGPP) && !defined(__HAIKU__)
1066 ins_len = mbstowcs(NULL, s, 0);
1067 #else
1068 /* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
1069 ins_len = strlen(ins);
1070 #endif
1071 if (ins_len == -1)
1072 return ins_len;
1073 #ifdef HAVE_MBSINIT
1074 if ((s != ins) || !mbsinit(&mbstate)) {
1075 #else
1076 if (s != ins) {
1077 #endif
1078 errno = EILSEQ;
1079 return -1;
1080 }
1081 /* Add the NULL terminator. */
1082 ins_len++;
1083 ucs_len = ins_len;
1084 ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
1085 if (!ucs)
1086 return -1;
1087 #ifdef HAVE_MBSINIT
1088 memset(&mbstate, 0, sizeof(mbstate));
1089 #else
1090 #ifndef __HAIKU__
1091 mbtowc(NULL, NULL, 0);
1092 #endif
1093 #endif
1094 for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
1095 /* Reallocate memory if necessary. */
1096 if (o >= ucs_len) {
1097 ntfschar *tc;
1098 ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
1099 tc = realloc(ucs, ucs_len);
1100 if (!tc)
1101 goto err_out;
1102 ucs = tc;
1103 ucs_len /= sizeof(ntfschar);
1104 }
1105 /* Convert the multibyte character to a wide character. */
1106 #ifdef HAVE_MBSINIT
1107 cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
1108 #elif defined(__HAIKU__)
1109 cnt = -1;
1110 #else
1111 cnt = mbtowc(&wc, ins + i, ins_size - i);
1112 #endif
1113 if (!cnt)
1114 break;
1115 if (cnt == -1)
1116 goto err_out;
1117 if (cnt < -1) {
1118 ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1119 errno = EINVAL;
1120 goto err_out;
1121 }
1122 /* Make sure we are not overflowing the NTFS Unicode set. */
1123 if ((unsigned long)wc >= (unsigned long)(1 <<
1124 (8 * sizeof(ntfschar)))) {
1125 errno = EILSEQ;
1126 goto err_out;
1127 }
1128 /* Convert the CPU wide character to a LE Unicode character. */
1129 ucs[o] = cpu_to_le16(wc);
1130 }
1131 #ifdef HAVE_MBSINIT
1132 /* Make sure we are back in the initial state. */
1133 if (!mbsinit(&mbstate)) {
1134 ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1135 errno = EILSEQ;
1136 goto err_out;
1137 }
1138 #endif
1139 /* Now write the NULL character. */
1140 ucs[o] = const_cpu_to_le16(L'\0');
1141 *outs = ucs;
1142 return o;
1143 err_out:
1144 free(ucs);
1145 #else /* MB_CUR_MAX */
1146 errno = EILSEQ;
1147 #endif /* MB_CUR_MAX */
1148 return -1;
1149 }
1150
1151 /*
1152 * Turn a UTF8 name uppercase
1153 *
1154 * Returns an allocated uppercase name which has to be freed by caller
1155 * or NULL if there is an error (described by errno)
1156 */
1157
1158 char *ntfs_uppercase_mbs(const char *low,
1159 const ntfschar *upcase, u32 upcase_size)
1160 {
1161 int size;
1162 char *upp;
1163 u32 wc;
1164 int n;
1165 const char *s;
1166 char *t;
1167
1168 size = strlen(low);
1169 upp = (char*)ntfs_malloc(3*size + 1);
1170 if (upp) {
1171 s = low;
1172 t = upp;
1173 do {
1174 n = utf8_to_unicode(&wc, s);
1175 if (n > 0) {
1176 if (wc < upcase_size)
1177 wc = le16_to_cpu(upcase[wc]);
1178 if (wc < 0x80)
1179 *t++ = wc;
1180 else if (wc < 0x800) {
1181 *t++ = (0xc0 | ((wc >> 6) & 0x3f));
1182 *t++ = 0x80 | (wc & 0x3f);
1183 } else if (wc < 0x10000) {
1184 *t++ = 0xe0 | (wc >> 12);
1185 *t++ = 0x80 | ((wc >> 6) & 0x3f);
1186 *t++ = 0x80 | (wc & 0x3f);
1187 } else {
1188 *t++ = 0xf0 | ((wc >> 18) & 7);
1189 *t++ = 0x80 | ((wc >> 12) & 63);
1190 *t++ = 0x80 | ((wc >> 6) & 0x3f);
1191 *t++ = 0x80 | (wc & 0x3f);
1192 }
1193 s += n;
1194 }
1195 } while (n > 0);
1196 if (n < 0) {
1197 free(upp);
1198 upp = (char*)NULL;
1199 errno = EILSEQ;
1200 }
1201 *t = 0;
1202 }
1203 return (upp);
1204 }
1205
1206 /**
1207 * ntfs_upcase_table_build - build the default upcase table for NTFS
1208 * @uc: destination buffer where to store the built table
1209 * @uc_len: size of destination buffer in bytes
1210 *
1211 * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1212 * stores it in the caller supplied buffer @uc of size @uc_len.
1213 *
1214 * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1215 */
1216 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1217 {
1218 struct NEWUPPERCASE {
1219 unsigned short first;
1220 unsigned short last;
1221 short diff;
1222 unsigned char step;
1223 unsigned char osmajor;
1224 unsigned char osminor;
1225 } ;
1226
1227 /*
1228 * This is the table as defined by Windows XP
1229 */
1230 static int uc_run_table[][3] = { /* Start, End, Add */
1231 {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
1232 {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
1233 {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1234 {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
1235 {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
1236 {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
1237 {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
1238 {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
1239 {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
1240 {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
1241 {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
1242 {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
1243 {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
1244 {0}
1245 };
1246 static int uc_dup_table[][2] = { /* Start, End */
1247 {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1248 {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1249 {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1250 {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1251 {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1252 {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1253 {0}
1254 };
1255 static int uc_byte_table[][2] = { /* Offset, Value */
1256 {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1257 {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1258 {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1259 {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1260 {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1261 {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1262 {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1263 {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1264 {0}
1265 };
1266
1267 /*
1268 * Changes which were applied to later Windows versions
1269 *
1270 * md5 for $UpCase from Winxp : 6fa3db2468275286210751e869d36373
1271 * Vista : 2f03b5a69d486ff3864cecbd07f24440
1272 * Win8 : 7ff498a44e45e77374cc7c962b1b92f2
1273 */
1274 static const struct NEWUPPERCASE newuppercase[] = {
1275 /* from Windows 6.0 (Vista) */
1276 { 0x37b, 0x37d, 0x82, 1, 6, 0 },
1277 { 0x1f80, 0x1f87, 0x8, 1, 6, 0 },
1278 { 0x1f90, 0x1f97, 0x8, 1, 6, 0 },
1279 { 0x1fa0, 0x1fa7, 0x8, 1, 6, 0 },
1280 { 0x2c30, 0x2c5e, -0x30, 1, 6, 0 },
1281 { 0x2d00, 0x2d25, -0x1c60, 1, 6, 0 },
1282 { 0x2c68, 0x2c6c, -0x1, 2, 6, 0 },
1283 { 0x219, 0x21f, -0x1, 2, 6, 0 },
1284 { 0x223, 0x233, -0x1, 2, 6, 0 },
1285 { 0x247, 0x24f, -0x1, 2, 6, 0 },
1286 { 0x3d9, 0x3e1, -0x1, 2, 6, 0 },
1287 { 0x48b, 0x48f, -0x1, 2, 6, 0 },
1288 { 0x4fb, 0x513, -0x1, 2, 6, 0 },
1289 { 0x2c81, 0x2ce3, -0x1, 2, 6, 0 },
1290 { 0x3f8, 0x3fb, -0x1, 3, 6, 0 },
1291 { 0x4c6, 0x4ce, -0x1, 4, 6, 0 },
1292 { 0x23c, 0x242, -0x1, 6, 6, 0 },
1293 { 0x4ed, 0x4f7, -0x1, 10, 6, 0 },
1294 { 0x450, 0x45d, -0x50, 13, 6, 0 },
1295 { 0x2c61, 0x2c76, -0x1, 21, 6, 0 },
1296 { 0x1fcc, 0x1ffc, -0x9, 48, 6, 0 },
1297 { 0x180, 0x180, 0xc3, 1, 6, 0 },
1298 { 0x195, 0x195, 0x61, 1, 6, 0 },
1299 { 0x19a, 0x19a, 0xa3, 1, 6, 0 },
1300 { 0x19e, 0x19e, 0x82, 1, 6, 0 },
1301 { 0x1bf, 0x1bf, 0x38, 1, 6, 0 },
1302 { 0x1f9, 0x1f9, -0x1, 1, 6, 0 },
1303 { 0x23a, 0x23a, 0x2a2b, 1, 6, 0 },
1304 { 0x23e, 0x23e, 0x2a28, 1, 6, 0 },
1305 { 0x26b, 0x26b, 0x29f7, 1, 6, 0 },
1306 { 0x27d, 0x27d, 0x29e7, 1, 6, 0 },
1307 { 0x280, 0x280, -0xda, 1, 6, 0 },
1308 { 0x289, 0x289, -0x45, 1, 6, 0 },
1309 { 0x28c, 0x28c, -0x47, 1, 6, 0 },
1310 { 0x3f2, 0x3f2, 0x7, 1, 6, 0 },
1311 { 0x4cf, 0x4cf, -0xf, 1, 6, 0 },
1312 { 0x1d7d, 0x1d7d, 0xee6, 1, 6, 0 },
1313 { 0x1fb3, 0x1fb3, 0x9, 1, 6, 0 },
1314 { 0x214e, 0x214e, -0x1c, 1, 6, 0 },
1315 { 0x2184, 0x2184, -0x1, 1, 6, 0 },
1316 /* from Windows 6.1 (Win7) */
1317 { 0x23a, 0x23e, 0x0, 4, 6, 1 },
1318 { 0x250, 0x250, 0x2a1f, 2, 6, 1 },
1319 { 0x251, 0x251, 0x2a1c, 2, 6, 1 },
1320 { 0x271, 0x271, 0x29fd, 2, 6, 1 },
1321 { 0x371, 0x373, -0x1, 2, 6, 1 },
1322 { 0x377, 0x377, -0x1, 2, 6, 1 },
1323 { 0x3c2, 0x3c2, 0x0, 2, 6, 1 },
1324 { 0x3d7, 0x3d7, -0x8, 2, 6, 1 },
1325 { 0x515, 0x523, -0x1, 2, 6, 1 },
1326 /* below, -0x75fc stands for 0x8a04 and truncation */
1327 { 0x1d79, 0x1d79, -0x75fc, 2, 6, 1 },
1328 { 0x1efb, 0x1eff, -0x1, 2, 6, 1 },
1329 { 0x1fc3, 0x1ff3, 0x9, 48, 6, 1 },
1330 { 0x1fcc, 0x1ffc, 0x0, 48, 6, 1 },
1331 { 0x2c65, 0x2c65, -0x2a2b, 2, 6, 1 },
1332 { 0x2c66, 0x2c66, -0x2a28, 2, 6, 1 },
1333 { 0x2c73, 0x2c73, -0x1, 2, 6, 1 },
1334 { 0xa641, 0xa65f, -0x1, 2, 6, 1 },
1335 { 0xa663, 0xa66d, -0x1, 2, 6, 1 },
1336 { 0xa681, 0xa697, -0x1, 2, 6, 1 },
1337 { 0xa723, 0xa72f, -0x1, 2, 6, 1 },
1338 { 0xa733, 0xa76f, -0x1, 2, 6, 1 },
1339 { 0xa77a, 0xa77c, -0x1, 2, 6, 1 },
1340 { 0xa77f, 0xa787, -0x1, 2, 6, 1 },
1341 { 0xa78c, 0xa78c, -0x1, 2, 6, 1 },
1342 /* end mark */
1343 { 0 }
1344 } ;
1345
1346 int i, r;
1347 int k, off;
1348 const struct NEWUPPERCASE *puc;
1349
1350 memset((char*)uc, 0, uc_len);
1351 uc_len >>= 1;
1352 if (uc_len > 65536)
1353 uc_len = 65536;
1354 for (i = 0; (u32)i < uc_len; i++)
1355 uc[i] = cpu_to_le16(i);
1356 for (r = 0; uc_run_table[r][0]; r++) {
1357 off = uc_run_table[r][2];
1358 for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1359 uc[i] = cpu_to_le16(i + off);
1360 }
1361 for (r = 0; uc_dup_table[r][0]; r++)
1362 for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1363 uc[i + 1] = cpu_to_le16(i);
1364 for (r = 0; uc_byte_table[r][0]; r++) {
1365 k = uc_byte_table[r][1];
1366 uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1367 }
1368 for (r=0; newuppercase[r].first; r++) {
1369 puc = &newuppercase[r];
1370 if ((puc->osmajor < UPCASE_MAJOR)
1371 || ((puc->osmajor == UPCASE_MAJOR)
1372 && (puc->osminor <= UPCASE_MINOR))) {
1373 off = puc->diff;
1374 for (i = puc->first; i <= puc->last; i += puc->step)
1375 uc[i] = cpu_to_le16(i + off);
1376 }
1377 }
1378 }
1379
1380 /*
1381 * Allocate and build the default upcase table
1382 *
1383 * Returns the number of entries
1384 * 0 if failed
1385 */
1386
1387 #define UPCASE_LEN 65536 /* default number of entries in upcase */
1388
1389 u32 ntfs_upcase_build_default(ntfschar **upcase)
1390 {
1391 u32 upcase_len = 0;
1392
1393 *upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2);
1394 if (*upcase) {
1395 ntfs_upcase_table_build(*upcase, UPCASE_LEN*2);
1396 upcase_len = UPCASE_LEN;
1397 }
1398 return (upcase_len);
1399 }
1400
1401 /*
1402 * Build a table for converting to lower case
1403 *
1404 * This is only meaningful when there is a single lower case
1405 * character leading to an upper case one, and currently the
1406 * only exception is the greek letter sigma which has a single
1407 * upper case glyph (code U+03A3), but two lower case glyphs
1408 * (code U+03C3 and U+03C2, the latter to be used at the end
1409 * of a word). In the following implementation the upper case
1410 * sigma will be lowercased as U+03C3.
1411 */
1412
1413 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1414 {
1415 ntfschar *lc;
1416 u32 upp;
1417 u32 i;
1418
1419 lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1420 if (lc) {
1421 for (i=0; i<uc_cnt; i++)
1422 lc[i] = cpu_to_le16(i);
1423 for (i=0; i<uc_cnt; i++) {
1424 upp = le16_to_cpu(uc[i]);
1425 if ((upp != i) && (upp < uc_cnt))
1426 lc[upp] = cpu_to_le16(i);
1427 }
1428 } else
1429 ntfs_log_error("Could not build the locase table\n");
1430 return (lc);
1431 }
1432
1433 /**
1434 * ntfs_str2ucs - convert a string to a valid NTFS file name
1435 * @s: input string
1436 * @len: length of output buffer in Unicode characters
1437 *
1438 * Convert the input @s string into the corresponding little endian,
1439 * 2-byte Unicode string. The length of the converted string is less
1440 * or equal to the maximum length allowed by the NTFS format (255).
1441 *
1442 * If @s is NULL then return AT_UNNAMED.
1443 *
1444 * On success the function returns the Unicode string in an allocated
1445 * buffer and the caller is responsible to free it when it's not needed
1446 * anymore.
1447 *
1448 * On error NULL is returned and errno is set to the error code.
1449 */
1450 ntfschar *ntfs_str2ucs(const char *s, int *len)
1451 {
1452 ntfschar *ucs = NULL;
1453
1454 if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1455 ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1456 return NULL;
1457 }
1458 if (*len > NTFS_MAX_NAME_LEN) {
1459 free(ucs);
1460 errno = ENAMETOOLONG;
1461 return NULL;
1462 }
1463 if (!ucs || !*len) {
1464 ucs = AT_UNNAMED;
1465 *len = 0;
1466 }
1467 return ucs;
1468 }
1469
1470 /**
1471 * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1472 * @ucs input string to be freed
1473 *
1474 * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1475 *
1476 * Return value: none.
1477 */
1478 void ntfs_ucsfree(ntfschar *ucs)
1479 {
1480 if (ucs && (ucs != AT_UNNAMED))
1481 free(ucs);
1482 }
1483
1484 /*
1485 * Check whether a name contains no chars forbidden
1486 * for DOS or Win32 use
1487 *
1488 * If @strict is TRUE, then trailing dots and spaces are forbidden.
1489 * These names are technically allowed in the Win32 namespace, but
1490 * they can be problematic. See comment for FILE_NAME_WIN32.
1491 *
1492 * If there is a bad char, errno is set to EINVAL
1493 */
1494
1495 BOOL ntfs_forbidden_chars(const ntfschar *name, int len, BOOL strict)
1496 {
1497 BOOL forbidden;
1498 int ch;
1499 int i;
1500 static const u32 mainset = (1L << ('\"' - 0x20))
1501 | (1L << ('*' - 0x20))
1502 | (1L << ('/' - 0x20))
1503 | (1L << (':' - 0x20))
1504 | (1L << ('<' - 0x20))
1505 | (1L << ('>' - 0x20))
1506 | (1L << ('?' - 0x20));
1507
1508 forbidden = (len == 0) ||
1509 (strict && (name[len-1] == const_cpu_to_le16(' ') ||
1510 name[len-1] == const_cpu_to_le16('.')));
1511 for (i=0; i<len; i++) {
1512 ch = le16_to_cpu(name[i]);
1513 if ((ch < 0x20)
1514 || ((ch < 0x40)
1515 && ((1L << (ch - 0x20)) & mainset))
1516 || (ch == '\\')
1517 || (ch == '|'))
1518 forbidden = TRUE;
1519 }
1520 if (forbidden)
1521 errno = EINVAL;
1522 return (forbidden);
1523 }
1524
1525 /*
1526 * Check whether a name contains no forbidden chars and
1527 * is not a reserved name for DOS or Win32 use
1528 *
1529 * The reserved names are CON, PRN, AUX, NUL, COM1..COM9, LPT1..LPT9
1530 * with no suffix or any suffix.
1531 *
1532 * If @strict is TRUE, then trailing dots and spaces are forbidden.
1533 * These names are technically allowed in the Win32 namespace, but
1534 * they can be problematic. See comment for FILE_NAME_WIN32.
1535 *
1536 * If the name is forbidden, errno is set to EINVAL
1537 */
1538
1539 BOOL ntfs_forbidden_names(ntfs_volume *vol, const ntfschar *name, int len,
1540 BOOL strict)
1541 {
1542 BOOL forbidden;
1543 int h;
1544 static const ntfschar dot = const_cpu_to_le16('.');
1545 static const ntfschar con[] = { const_cpu_to_le16('c'),
1546 const_cpu_to_le16('o'), const_cpu_to_le16('n') };
1547 static const ntfschar prn[] = { const_cpu_to_le16('p'),
1548 const_cpu_to_le16('r'), const_cpu_to_le16('n') };
1549 static const ntfschar aux[] = { const_cpu_to_le16('a'),
1550 const_cpu_to_le16('u'), const_cpu_to_le16('x') };
1551 static const ntfschar nul[] = { const_cpu_to_le16('n'),
1552 const_cpu_to_le16('u'), const_cpu_to_le16('l') };
1553 static const ntfschar com[] = { const_cpu_to_le16('c'),
1554 const_cpu_to_le16('o'), const_cpu_to_le16('m') };
1555 static const ntfschar lpt[] = { const_cpu_to_le16('l'),
1556 const_cpu_to_le16('p'), const_cpu_to_le16('t') };
1557
1558 forbidden = ntfs_forbidden_chars(name, len, strict);
1559 if (!forbidden && (len >= 3)) {
1560 /*
1561 * Rough hash check to tell whether the first couple of chars
1562 * may be one of CO PR AU NU LP or lowercase variants.
1563 */
1564 h = ((le16_to_cpu(name[0]) & 31)*48)
1565 ^ ((le16_to_cpu(name[1]) & 31)*165);
1566 if ((h % 23) == 17) {
1567 /* do a full check, depending on the third char */
1568 switch (le16_to_cpu(name[2]) & ~0x20) {
1569 case 'N' :
1570 if (((len == 3) || (name[3] == dot))
1571 && (!ntfs_ucsncasecmp(name, con, 3,
1572 vol->upcase, vol->upcase_len)
1573 || !ntfs_ucsncasecmp(name, prn, 3,
1574 vol->upcase, vol->upcase_len)))
1575 forbidden = TRUE;
1576 break;
1577 case 'X' :
1578 if (((len == 3) || (name[3] == dot))
1579 && !ntfs_ucsncasecmp(name, aux, 3,
1580 vol->upcase, vol->upcase_len))
1581 forbidden = TRUE;
1582 break;
1583 case 'L' :
1584 if (((len == 3) || (name[3] == dot))
1585 && !ntfs_ucsncasecmp(name, nul, 3,
1586 vol->upcase, vol->upcase_len))
1587 forbidden = TRUE;
1588 break;
1589 case 'M' :
1590 if ((len > 3)
1591 && (le16_to_cpu(name[3]) >= '1')
1592 && (le16_to_cpu(name[3]) <= '9')
1593 && ((len == 4) || (name[4] == dot))
1594 && !ntfs_ucsncasecmp(name, com, 3,
1595 vol->upcase, vol->upcase_len))
1596 forbidden = TRUE;
1597 break;
1598 case 'T' :
1599 if ((len > 3)
1600 && (le16_to_cpu(name[3]) >= '1')
1601 && (le16_to_cpu(name[3]) <= '9')
1602 && ((len == 4) || (name[4] == dot))
1603 && !ntfs_ucsncasecmp(name, lpt, 3,
1604 vol->upcase, vol->upcase_len))
1605 forbidden = TRUE;
1606 break;
1607 }
1608 }
1609 }
1610
1611 if (forbidden)
1612 errno = EINVAL;
1613 return (forbidden);
1614 }
1615
1616 /*
1617 * Check whether the same name can be used as a DOS and
1618 * a Win32 name
1619 *
1620 * The names must be the same, or the short name the uppercase
1621 * variant of the long name
1622 */
1623
1624 BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1625 const ntfschar *shortname, int shortlen,
1626 const ntfschar *longname, int longlen)
1627 {
1628 BOOL collapsible;
1629 unsigned int ch;
1630 unsigned int cs;
1631 int i;
1632
1633 collapsible = shortlen == longlen;
1634 for (i=0; collapsible && (i<shortlen); i++) {
1635 ch = le16_to_cpu(longname[i]);
1636 cs = le16_to_cpu(shortname[i]);
1637 if ((cs != ch)
1638 && ((ch >= vol->upcase_len)
1639 || (cs >= vol->upcase_len)
1640 || (vol->upcase[cs] != vol->upcase[ch])))
1641 collapsible = FALSE;
1642 }
1643 return (collapsible);
1644 }
1645
1646 /*
1647 * Define the character encoding to be used.
1648 * Use UTF-8 unless specified otherwise.
1649 */
1650
1651 int ntfs_set_char_encoding(const char *locale)
1652 {
1653 use_utf8 = 0;
1654 if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1655 || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1656 use_utf8 = 1;
1657 else
1658 #ifndef __HAIKU__
1659 if (setlocale(LC_ALL, locale))
1660 use_utf8 = 0;
1661 else
1662 #endif
1663 {
1664 ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1665 use_utf8 = 1;
1666 }
1667 return 0; /* always successful */
1668 }
1669
1670 #if defined(__APPLE__) || defined(__DARWIN__)
1671
/*
 *		Enable or disable the normalization of file names
 *
 *	@normalize must be 0 or 1, it is recorded into nfconvert_utf8.
 *
 *	Returns 0 if the setting was recorded
 *		-1 if @normalize is invalid or if the normalization
 *		support (ENABLE_NFCONV) was not compiled in
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if ((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1685
/*
 *		Normalize a UTF-8 string, using CoreFoundation
 *
 *	@utf8_string is converted to the composed form (NFC) if @composed
 *	is non-zero, to the decomposed form (NFD) otherwise. The result
 *	is returned in *target as an allocated, '\0'-terminated string
 *	which has to be freed by the caller. *target is only set when
 *	the normalization succeeded.
 *
 *	Returns the size of the result in bytes, not counting the
 *			terminating '\0'
 *		-1 if the result could not be sized, allocated or
 *			converted, or if the normalization support
 *			(ENABLE_NFCONV) was not compiled in
 *		-2 if the input could not be made a CFString
 *		-3 if the mutable copy could not be created
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
		int composed)
{
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to
	 * the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buf = NULL;
	int buf_size = -1;

	/* Convert the UTF-8 string to a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault,
			utf8_string, kCFStringEncodingUTF8);
	if (source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Only a mutable copy can be normalized, the source is not
	 * needed any more once it has been copied. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source);
	if (work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Normalize the mutable string to the desired normalization form. */
	CFStringNormalize(work, (composed != 0 ?
			kCFStringNormalizationFormC :
			kCFStringNormalizationFormD));

	/* First pass with no buffer, to get the required size. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false,
			NULL, 0, &needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of "
			"UTF-8 conversion of normalized CFMutableString.\n");
	} else {
		/* One more byte for the terminating '\0'. */
		buf_size = sizeof(char) * (needed + 1);
		buf = ntfs_calloc(buf_size);
		if (buf == NULL) {
			ntfs_log_error("Could not perform a ntfs_calloc of %d "
				"bytes for char *result.\n", buf_size);
		} else if (CFStringGetBytes(work, whole,
				kCFStringEncodingUTF8, 0, false,
				(UInt8*) buf, buf_size - 1,
				&needed) <= 0) {
			/* Second pass failed, there is no result. */
			ntfs_log_error("Could not perform UTF-8 "
				"conversion of normalized "
				"CFMutableString.\n");
			free(buf);
			buf = NULL;
		}
	}

	CFRelease(work);

	if (buf == NULL)
		return -1;
	*target = buf;
	return buf_size - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1767 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
1768