xref: /haiku/src/add-ons/kernel/file_systems/ntfs/libntfs/unistr.c (revision 820dca4df6c7bf955c46e8f6521b9408f50b2900)
1 /**
2  * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3  *
4  * Copyright (c) 2000-2004 Anton Altaparmakov
5  * Copyright (c) 2002-2009 Szabolcs Szakacsits
6  * Copyright (c) 2008-2011 Jean-Pierre Andre
7  * Copyright (c) 2008      Bernhard Kaindl
8  *
9  * This program/include file is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as published
11  * by the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program/include file is distributed in the hope that it will be
15  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program (in the main directory of the NTFS-3G
21  * distribution in the file COPYING); if not, write to the Free Software
22  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23  */
24 
25 #ifdef HAVE_CONFIG_H
26 #include "config.h"
27 #endif
28 
29 #ifdef HAVE_STDIO_H
30 #include <stdio.h>
31 #endif
32 #ifdef HAVE_STDLIB_H
33 #include <stdlib.h>
34 #endif
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif
38 #ifdef HAVE_STRING_H
39 #include <string.h>
40 #endif
41 #ifdef HAVE_ERRNO_H
42 #include <errno.h>
43 #endif
44 #ifdef HAVE_LOCALE_H
45 #include <locale.h>
46 #endif
47 
48 #if defined(__APPLE__) || defined(__DARWIN__)
49 #ifdef ENABLE_NFCONV
50 #include <CoreFoundation/CoreFoundation.h>
51 #endif /* ENABLE_NFCONV */
52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
53 
54 #include "compat.h"
55 #include "attrib.h"
56 #include "types.h"
57 #include "unistr.h"
58 #include "debug.h"
59 #include "logging.h"
60 #include "misc.h"
61 
/*
 * Compile-time switch for the treatment of 0xFFFE and 0xFFFF.
 * When non-zero, utf16_to_utf8_size() rejects these two code units and
 * utf8_to_unicode() only accepts three-byte sequences up to U+FFFD.
 * With the value 0 used here, both are accepted.
 */
#define NOREVBOM 0  /* JPA rejecting U+FFFE and U+FFFF, open to debate */
63 
// no wchar support in the Haiku kernel
// The locale and wide-character libc calls used further down are replaced by
// macros which call panic() with the name of the function; the trailing
// ", 0" gives each macro a value so it can still be used in an expression.
#if defined(__HAIKU__) && defined(_KERNEL_MODE)
#	include <KernelExport.h>
#	define mbstowcs(a, b, c)	(panic("mbstowcs"), 0)
#	define wctomb(a, b)			(panic("wctomb"), 0)
#	define mbtowc(a, b, c)		(panic("mbtowc"), 0)
#	define setlocale(a, b)		(panic("setlocale"), 0)
#endif
72 
73 /*
74  * IMPORTANT
75  * =========
76  *
77  * All these routines assume that the Unicode characters are in little endian
78  * encoding inside the strings!!!
79  */
80 
/*
 * When non-zero, ntfs_ucstombs() and ntfs_mbstoucs() use the built-in
 * UTF-16LE <-> UTF-8 converters of this file instead of the locale
 * dependent multibyte functions.
 */
static int use_utf8 = 1; /* use UTF-8 encoding for file names */
82 
#if defined(__APPLE__) || defined(__DARWIN__)
#ifdef ENABLE_NFCONV
/**
 * This variable controls whether or not automatic normalization form conversion
 * should be performed when translating NTFS unicode file names to UTF-8.
 * Defaults to on, but can be controlled from the outside using the function
 *   int ntfs_macosx_normalize_filenames(int normalize);
 *
 * It is tested in both directions: ntfs_utf16_to_utf8() normalizes its result
 * to decomposed form and ntfs_utf8_to_utf16() normalizes its input to composed
 * form, each only when this flag is non-zero.
 */
static int nfconvert_utf8 = 1;
#endif /* ENABLE_NFCONV */
#endif /* defined(__APPLE__) || defined(__DARWIN__) */
94 
/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
 *
 * NOTE: the table is compiled out by the surrounding "#if 0". It holds 0x40
 * flag bytes, one per character code 0x00..0x3f; the meaning of the individual
 * flag bits is not defined in this part of the file.
 */
#if 0
static const u8 legal_ansi_char_array[0x40] = {
	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,

	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,

	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
};
#endif
114 
115 /**
116  * ntfs_names_are_equal - compare two Unicode names for equality
117  * @s1:			name to compare to @s2
118  * @s1_len:		length in Unicode characters of @s1
119  * @s2:			name to compare to @s1
120  * @s2_len:		length in Unicode characters of @s2
121  * @ic:			ignore case bool
122  * @upcase:		upcase table (only if @ic == IGNORE_CASE)
123  * @upcase_size:	length in Unicode characters of @upcase (if present)
124  *
125  * Compare the names @s1 and @s2 and return TRUE (1) if the names are
126  * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
127  * the @upcase table is used to perform a case insensitive comparison.
128  */
129 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
130 		const ntfschar *s2, size_t s2_len,
131 		const IGNORE_CASE_BOOL ic,
132 		const ntfschar *upcase, const u32 upcase_size)
133 {
134 	if (s1_len != s2_len)
135 		return FALSE;
136 	if (!s1_len)
137 		return TRUE;
138 	if (ic == CASE_SENSITIVE)
139 		return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
140 	return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
141 								       TRUE;
142 }
143 
144 /*
145  * ntfs_names_full_collate() fully collate two Unicode names
146  *
147  * @name1:	first Unicode name to compare
148  * @name1_len:	length of first Unicode name to compare
149  * @name2:	second Unicode name to compare
150  * @name2_len:	length of second Unicode name to compare
151  * @ic:		either CASE_SENSITIVE or IGNORE_CASE
152  * @upcase:	upcase table (ignored if @ic is CASE_SENSITIVE)
153  * @upcase_len:	upcase table size (ignored if @ic is CASE_SENSITIVE)
154  *
155  *  -1 if the first name collates before the second one,
156  *   0 if the names match,
157  *   1 if the second name collates before the first one, or
158  *
159  */
160 int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
161 		const ntfschar *name2, const u32 name2_len,
162 		const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
163 		const u32 upcase_len)
164 {
165 	u32 cnt;
166 	u16 c1, c2;
167 	u16 u1, u2;
168 
169 #ifdef DEBUG
170 	if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) {
171 		ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
172 		exit(1);
173 	}
174 #endif
175 	cnt = min(name1_len, name2_len);
176 	if (cnt > 0) {
177 		if (ic == CASE_SENSITIVE) {
178 			while (--cnt && (*name1 == *name2)) {
179 				name1++;
180 				name2++;
181 			}
182 			u1 = c1 = le16_to_cpu(*name1);
183 			u2 = c2 = le16_to_cpu(*name2);
184 			if (u1 < upcase_len)
185 				u1 = le16_to_cpu(upcase[u1]);
186 			if (u2 < upcase_len)
187 				u2 = le16_to_cpu(upcase[u2]);
188 			if ((u1 == u2) && cnt)
189 				do {
190 					name1++;
191 					u1 = le16_to_cpu(*name1);
192 					name2++;
193 					u2 = le16_to_cpu(*name2);
194 					if (u1 < upcase_len)
195 						u1 = le16_to_cpu(upcase[u1]);
196 					if (u2 < upcase_len)
197 						u2 = le16_to_cpu(upcase[u2]);
198 				} while ((u1 == u2) && --cnt);
199 			if (u1 < u2)
200 				return -1;
201 			if (u1 > u2)
202 				return 1;
203 			if (name1_len < name2_len)
204 				return -1;
205 			if (name1_len > name2_len)
206 				return 1;
207 			if (c1 < c2)
208 				return -1;
209 			if (c1 > c2)
210 				return 1;
211 		} else {
212 			do {
213 				u1 = c1 = le16_to_cpu(*name1);
214 				name1++;
215 				u2 = c2 = le16_to_cpu(*name2);
216 				name2++;
217 				if (u1 < upcase_len)
218 					u1 = le16_to_cpu(upcase[u1]);
219 				if (u2 < upcase_len)
220 					u2 = le16_to_cpu(upcase[u2]);
221 			} while ((u1 == u2) && --cnt);
222 			if (u1 < u2)
223 				return -1;
224 			if (u1 > u2)
225 				return 1;
226 			if (name1_len < name2_len)
227 				return -1;
228 			if (name1_len > name2_len)
229 				return 1;
230 		}
231 	} else {
232 		if (name1_len < name2_len)
233 			return -1;
234 		if (name1_len > name2_len)
235 			return 1;
236 	}
237 	return 0;
238 }
239 
240 /**
241  * ntfs_ucsncmp - compare two little endian Unicode strings
242  * @s1:		first string
243  * @s2:		second string
244  * @n:		maximum unicode characters to compare
245  *
246  * Compare the first @n characters of the Unicode strings @s1 and @s2,
247  * The strings in little endian format and appropriate le16_to_cpu()
248  * conversion is performed on non-little endian machines.
249  *
250  * The function returns an integer less than, equal to, or greater than zero
251  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
252  * to be less than, to match, or be greater than @s2.
253  */
254 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
255 {
256 	ntfschar c1, c2;
257 	size_t i;
258 
259 #ifdef DEBUG
260 	if (!s1 || !s2) {
261 		ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
262 		exit(1);
263 	}
264 #endif
265 	for (i = 0; i < n; ++i) {
266 		c1 = le16_to_cpu(s1[i]);
267 		c2 = le16_to_cpu(s2[i]);
268 		if (c1 < c2)
269 			return -1;
270 		if (c1 > c2)
271 			return 1;
272 		if (!c1)
273 			break;
274 	}
275 	return 0;
276 }
277 
278 /**
279  * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
280  * @s1:			first string
281  * @s2:			second string
282  * @n:			maximum unicode characters to compare
283  * @upcase:		upcase table
284  * @upcase_size:	upcase table size in Unicode characters
285  *
286  * Compare the first @n characters of the Unicode strings @s1 and @s2,
287  * ignoring case. The strings in little endian format and appropriate
288  * le16_to_cpu() conversion is performed on non-little endian machines.
289  *
290  * Each character is uppercased using the @upcase table before the comparison.
291  *
292  * The function returns an integer less than, equal to, or greater than zero
293  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
294  * to be less than, to match, or be greater than @s2.
295  */
296 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
297 		const ntfschar *upcase, const u32 upcase_size)
298 {
299 	u16 c1, c2;
300 	size_t i;
301 
302 #ifdef DEBUG
303 	if (!s1 || !s2 || !upcase) {
304 		ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
305 		exit(1);
306 	}
307 #endif
308 	for (i = 0; i < n; ++i) {
309 		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
310 			c1 = le16_to_cpu(upcase[c1]);
311 		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
312 			c2 = le16_to_cpu(upcase[c2]);
313 		if (c1 < c2)
314 			return -1;
315 		if (c1 > c2)
316 			return 1;
317 		if (!c1)
318 			break;
319 	}
320 	return 0;
321 }
322 
323 /**
324  * ntfs_ucsnlen - determine the length of a little endian Unicode string
325  * @s:		pointer to Unicode string
326  * @maxlen:	maximum length of string @s
327  *
328  * Return the number of Unicode characters in the little endian Unicode
329  * string @s up to a maximum of maxlen Unicode characters, not including
330  * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
331  * and @s + @maxlen, @maxlen is returned.
332  *
333  * This function never looks beyond @s + @maxlen.
334  */
335 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
336 {
337 	u32 i;
338 
339 	for (i = 0; i < maxlen; i++) {
340 		if (!le16_to_cpu(s[i]))
341 			break;
342 	}
343 	return i;
344 }
345 
346 /**
347  * ntfs_ucsndup - duplicate little endian Unicode string
348  * @s:		pointer to Unicode string
349  * @maxlen:	maximum length of string @s
350  *
351  * Return a pointer to a new little endian Unicode string which is a duplicate
352  * of the string s.  Memory for the new string is obtained with ntfs_malloc(3),
353  * and can be freed with free(3).
354  *
355  * A maximum of @maxlen Unicode characters are copied and a terminating
356  * (ntfschar)'\0' little endian Unicode character is added.
357  *
358  * This function never looks beyond @s + @maxlen.
359  *
360  * Return a pointer to the new little endian Unicode string on success and NULL
361  * on failure with errno set to the error code.
362  */
363 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
364 {
365 	ntfschar *dst;
366 	u32 len;
367 
368 	len = ntfs_ucsnlen(s, maxlen);
369 	dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
370 	if (dst) {
371 		memcpy(dst, s, len * sizeof(ntfschar));
372 		dst[len] = cpu_to_le16(L'\0');
373 	}
374 	return dst;
375 }
376 
377 /**
378  * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
379  * @name:
380  * @name_len:
381  * @upcase:
382  * @upcase_len:
383  *
384  * Description...
385  *
386  * Returns:
387  */
388 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
389 		const u32 upcase_len)
390 {
391 	u32 i;
392 	u16 u;
393 
394 	for (i = 0; i < name_len; i++)
395 		if ((u = le16_to_cpu(name[i])) < upcase_len)
396 			name[i] = upcase[u];
397 }
398 
399 /**
400  * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
401  */
402 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
403 		const u32 locase_len)
404 {
405 	u32 i;
406 	u16 u;
407 
408 	if (locase)
409 		for (i = 0; i < name_len; i++)
410 			if ((u = le16_to_cpu(name[i])) < locase_len)
411 				name[i] = locase[u];
412 }
413 
414 /**
415  * ntfs_file_value_upcase - Convert a filename to upper case
416  * @file_name_attr:
417  * @upcase:
418  * @upcase_len:
419  *
420  * Description...
421  *
422  * Returns:
423  */
424 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
425 		const ntfschar *upcase, const u32 upcase_len)
426 {
427 	ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
428 			file_name_attr->file_name_length, upcase, upcase_len);
429 }
430 
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, as that
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets, but it does not convert anything if
   there was some error with the locale setup or none was set up, like
   when mount is called during early boot where one (by policy) does
   not use locales (and they may not be available if /usr is not yet
   mounted), so this patch fixes the resulting issues for systems which use
   UTF-8 and for others, specifying the locale in fstab brings them
   the encoding which they want.

   If no locale is defined or there was a problem with setting one
   up and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them) as a result.

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
454 
455 /*
456  * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
457  * null) to store a given UTF-16LE string.
458  *
459  * Return -1 with errno set if string has invalid byte sequence or too long.
460  */
461 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
462 {
463 	int i, ret = -1;
464 	int count = 0;
465 	BOOL surrog;
466 
467 	surrog = FALSE;
468 	for (i = 0; i < ins_len && ins[i]; i++) {
469 		unsigned short c = le16_to_cpu(ins[i]);
470 		if (surrog) {
471 			if ((c >= 0xdc00) && (c < 0xe000)) {
472 				surrog = FALSE;
473 				count += 4;
474 			} else
475 				goto fail;
476 		} else
477 			if (c < 0x80)
478 				count++;
479 			else if (c < 0x800)
480 				count += 2;
481 			else if (c < 0xd800)
482 				count += 3;
483 			else if (c < 0xdc00)
484 				surrog = TRUE;
485 #if NOREVBOM
486 			else if ((c >= 0xe000) && (c < 0xfffe))
487 #else
488 			else if (c >= 0xe000)
489 #endif
490 				count += 3;
491 			else
492 				goto fail;
493 		if (count > outs_len) {
494 			errno = ENAMETOOLONG;
495 			goto out;
496 		}
497 	}
498 	if (surrog)
499 		goto fail;
500 
501 	ret = count;
502 out:
503 	return ret;
504 fail:
505 	errno = EILSEQ;
506 	goto out;
507 }
508 
509 /*
510  * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
511  * @ins:	input utf16 string buffer
512  * @ins_len:	length of input string in utf16 characters
513  * @outs:	on return contains the (allocated) output multibyte string
514  * @outs_len:	length of output buffer in bytes
515  *
516  * Return -1 with errno set if string has invalid byte sequence or too long.
517  */
518 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
519 			      char **outs, int outs_len)
520 {
521 #if defined(__APPLE__) || defined(__DARWIN__)
522 #ifdef ENABLE_NFCONV
523 	char *original_outs_value = *outs;
524 	int original_outs_len = outs_len;
525 #endif /* ENABLE_NFCONV */
526 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
527 
528 	char *t;
529 	int i, size, ret = -1;
530 	int halfpair;
531 
532 	halfpair = 0;
533 	if (!*outs)
534 		outs_len = PATH_MAX;
535 
536 	size = utf16_to_utf8_size(ins, ins_len, outs_len);
537 
538 	if (size < 0)
539 		goto out;
540 
541 	if (!*outs) {
542 		outs_len = size + 1;
543 		*outs = ntfs_malloc(outs_len);
544 		if (!*outs)
545 			goto out;
546 	}
547 
548 	t = *outs;
549 
550 	for (i = 0; i < ins_len && ins[i]; i++) {
551 	    unsigned short c = le16_to_cpu(ins[i]);
552 			/* size not double-checked */
553 		if (halfpair) {
554 			if ((c >= 0xdc00) && (c < 0xe000)) {
555 				*t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
556 				*t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
557 				*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
558 				*t++ = 0x80 + (c & 63);
559 				halfpair = 0;
560 			} else
561 				goto fail;
562 		} else if (c < 0x80) {
563 			*t++ = c;
564 	    	} else {
565 			if (c < 0x800) {
566 			   	*t++ = (0xc0 | ((c >> 6) & 0x3f));
567 			        *t++ = 0x80 | (c & 0x3f);
568 			} else if (c < 0xd800) {
569 			   	*t++ = 0xe0 | (c >> 12);
570 			   	*t++ = 0x80 | ((c >> 6) & 0x3f);
571 		        	*t++ = 0x80 | (c & 0x3f);
572 			} else if (c < 0xdc00)
573 				halfpair = c;
574 			else if (c >= 0xe000) {
575 				*t++ = 0xe0 | (c >> 12);
576 				*t++ = 0x80 | ((c >> 6) & 0x3f);
577 			        *t++ = 0x80 | (c & 0x3f);
578 			} else
579 				goto fail;
580 	        }
581 	}
582 	*t = '\0';
583 
584 #if defined(__APPLE__) || defined(__DARWIN__)
585 #ifdef ENABLE_NFCONV
586 	if(nfconvert_utf8 && (t - *outs) > 0) {
587 		char *new_outs = NULL;
588 		int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
589 		if(new_outs_len >= 0 && new_outs != NULL) {
590 			if(original_outs_value != *outs) {
591 				// We have allocated outs ourselves.
592 				free(*outs);
593 				*outs = new_outs;
594 				t = *outs + new_outs_len;
595 			}
596 			else {
597 				// We need to copy new_outs into the fixed outs buffer.
598 				memset(*outs, 0, original_outs_len);
599 				strncpy(*outs, new_outs, original_outs_len-1);
600 				t = *outs + original_outs_len;
601 				free(new_outs);
602 			}
603 		}
604 		else {
605 			ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
606 			ntfs_log_error("  new_outs=0x%p\n", new_outs);
607 			ntfs_log_error("  new_outs_len=%d\n", new_outs_len);
608 		}
609 	}
610 #endif /* ENABLE_NFCONV */
611 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
612 
613 	ret = t - *outs;
614 out:
615 	return ret;
616 fail:
617 	errno = EILSEQ;
618 	goto out;
619 }
620 
/*
 * Return the amount of 16-bit elements in UTF-16LE needed
 * (without the terminating null) to store given UTF-8 string.
 *
 * Every byte below 0xc0 counts for one element. A byte from 0xc0 to 0xf4
 * is taken as the start of a sequence: up to three following bytes are
 * skipped according to its value, and it counts for one element, or for
 * two (a surrogate pair) if it is 0xf0 or above.
 *
 * Return -1 with errno set to ENAMETOOLONG if the count reaches PATH_MAX,
 * or to EILSEQ if a byte 0xf5 or above is met.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	size_t units = 0;

	for (;;) {
		unsigned int lead = *((const unsigned char *)s);

		s++;
		if (!lead)
			break;
		if (++units >= PATH_MAX)
			goto too_long;
		if (lead < 0xc0)
			continue;
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/*
		 * Skip the expected continuation bytes, stopping as soon
		 * as the end of the string shows up.
		 */
		if (!*s)
			break;
		s++;
		if (!*s)
			break;
		if (lead >= 0xE0) {
			s++;
			if (!*s)
				break;
		}
		if (lead >= 0xF0) {
			s++;
			/* Four byte sequences need a surrogate pair. */
			if (++units >= PATH_MAX)
				goto too_long;
		}
	}
	return units;

too_long:
	errno = ENAMETOOLONG;
	return -1;
}
668 /*
669  * This converts one UTF-8 sequence to cpu-endian Unicode value
670  * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
671  *
672  * Return the number of used utf8 bytes or -1 with errno set
673  * if sequence is invalid.
674  */
675 static int utf8_to_unicode(u32 *wc, const char *s)
676 {
677     	unsigned int byte = *((const unsigned char *)s);
678 
679 					/* single byte */
680 	if (byte == 0) {
681 		*wc = (u32) 0;
682 		return 0;
683 	} else if (byte < 0x80) {
684 		*wc = (u32) byte;
685 		return 1;
686 					/* double byte */
687 	} else if (byte < 0xc2) {
688 		goto fail;
689 	} else if (byte < 0xE0) {
690 		if ((s[1] & 0xC0) == 0x80) {
691 			*wc = ((u32)(byte & 0x1F) << 6)
692 			    | ((u32)(s[1] & 0x3F));
693 			return 2;
694 		} else
695 			goto fail;
696 					/* three-byte */
697 	} else if (byte < 0xF0) {
698 		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
699 			*wc = ((u32)(byte & 0x0F) << 12)
700 			    | ((u32)(s[1] & 0x3F) << 6)
701 			    | ((u32)(s[2] & 0x3F));
702 			/* Check valid ranges */
703 #if NOREVBOM
704 			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
705 			  || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
706 				return 3;
707 #else
708 			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
709 			  || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
710 				return 3;
711 #endif
712 		}
713 		goto fail;
714 					/* four-byte */
715 	} else if (byte < 0xF5) {
716 		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
717 		  && ((s[3] & 0xC0) == 0x80)) {
718 			*wc = ((u32)(byte & 0x07) << 18)
719 			    | ((u32)(s[1] & 0x3F) << 12)
720 			    | ((u32)(s[2] & 0x3F) << 6)
721 			    | ((u32)(s[3] & 0x3F));
722 		/* Check valid ranges */
723 		if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
724 			return 4;
725 		}
726 		goto fail;
727 	}
728 fail:
729 	errno = EILSEQ;
730 	return -1;
731 }
732 
733 /**
734  * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
735  * @ins:	input multibyte string buffer
736  * @outs:	on return contains the (allocated) output utf16 string
737  * @outs_len:	length of output buffer in utf16 characters
738  *
739  * Return -1 with errno set.
740  */
741 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
742 {
743 #if defined(__APPLE__) || defined(__DARWIN__)
744 #ifdef ENABLE_NFCONV
745 	char *new_ins = NULL;
746 	if(nfconvert_utf8) {
747 		int new_ins_len;
748 		new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
749 		if(new_ins_len >= 0)
750 			ins = new_ins;
751 		else
752 			ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
753 	}
754 #endif /* ENABLE_NFCONV */
755 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
756 	const char *t = ins;
757 	u32 wc;
758 	BOOL allocated;
759 	ntfschar *outpos;
760 	int shorts, ret = -1;
761 
762 	shorts = utf8_to_utf16_size(ins);
763 	if (shorts < 0)
764 		goto fail;
765 
766 	allocated = FALSE;
767 	if (!*outs) {
768 		*outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
769 		if (!*outs)
770 			goto fail;
771 		allocated = TRUE;
772 	}
773 
774 	outpos = *outs;
775 
776 	while(1) {
777 		int m  = utf8_to_unicode(&wc, t);
778 		if (m <= 0) {
779 			if (m < 0) {
780 				/* do not leave space allocated if failed */
781 				if (allocated) {
782 					free(*outs);
783 					*outs = (ntfschar*)NULL;
784 				}
785 				goto fail;
786 			}
787 			*outpos++ = const_cpu_to_le16(0);
788 			break;
789 		}
790 		if (wc < 0x10000)
791 			*outpos++ = cpu_to_le16(wc);
792 		else {
793 			wc -= 0x10000;
794 			*outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
795 			*outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
796 		}
797 		t += m;
798 	}
799 
800 	ret = --outpos - *outs;
801 fail:
802 #if defined(__APPLE__) || defined(__DARWIN__)
803 #ifdef ENABLE_NFCONV
804 	if(new_ins != NULL)
805 		free(new_ins);
806 #endif /* ENABLE_NFCONV */
807 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
808 	return ret;
809 }
810 
811 /**
812  * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
813  * @ins:	input Unicode string buffer
814  * @ins_len:	length of input string in Unicode characters
815  * @outs:	on return contains the (allocated) output multibyte string
816  * @outs_len:	length of output buffer in bytes
817  *
818  * Convert the input little endian, 2-byte Unicode string @ins, of length
819  * @ins_len into the multibyte string format dictated by the current locale.
820  *
821  * If *@outs is NULL, the function allocates the string and the caller is
822  * responsible for calling free(*@outs); when finished with it.
823  *
824  * On success the function returns the number of bytes written to the output
825  * string *@outs (>= 0), not counting the terminating NULL byte. If the output
826  * string buffer was allocated, *@outs is set to it.
827  *
828  * On error, -1 is returned, and errno is set to the error code. The following
829  * error codes can be expected:
830  *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
831  *	EILSEQ		The input string cannot be represented as a multibyte
832  *			sequence according to the current locale.
833  *	ENAMETOOLONG	Destination buffer is too small for input string.
834  *	ENOMEM		Not enough memory to allocate destination buffer.
835  */
836 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
837 		int outs_len)
838 {
839 	char *mbs;
840 	int mbs_len;
841 #ifdef MB_CUR_MAX
842 	wchar_t wc;
843 	int i, o;
844 	int cnt = 0;
845 #ifdef HAVE_MBSINIT
846 	mbstate_t mbstate;
847 #endif
848 #endif /* MB_CUR_MAX */
849 
850 	if (!ins || !outs) {
851 		errno = EINVAL;
852 		return -1;
853 	}
854 	mbs = *outs;
855 	mbs_len = outs_len;
856 	if (mbs && !mbs_len) {
857 		errno = ENAMETOOLONG;
858 		return -1;
859 	}
860 	if (use_utf8)
861 		return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
862 #ifdef MB_CUR_MAX
863 	if (!mbs) {
864 		mbs_len = (ins_len + 1) * MB_CUR_MAX;
865 		mbs = ntfs_malloc(mbs_len);
866 		if (!mbs)
867 			return -1;
868 	}
869 #ifdef HAVE_MBSINIT
870 	memset(&mbstate, 0, sizeof(mbstate));
871 #else
872 	wctomb(NULL, 0);
873 #endif
874 	for (i = o = 0; i < ins_len; i++) {
875 		/* Reallocate memory if necessary or abort. */
876 		if ((int)(o + MB_CUR_MAX) > mbs_len) {
877 			char *tc;
878 			if (mbs == *outs) {
879 				errno = ENAMETOOLONG;
880 				return -1;
881 			}
882 			tc = ntfs_malloc((mbs_len + 64) & ~63);
883 			if (!tc)
884 				goto err_out;
885 			memcpy(tc, mbs, mbs_len);
886 			mbs_len = (mbs_len + 64) & ~63;
887 			free(mbs);
888 			mbs = tc;
889 		}
890 		/* Convert the LE Unicode character to a CPU wide character. */
891 		wc = (wchar_t)le16_to_cpu(ins[i]);
892 		if (!wc)
893 			break;
894 		/* Convert the CPU endian wide character to multibyte. */
895 #ifdef HAVE_MBSINIT
896 		cnt = wcrtomb(mbs + o, wc, &mbstate);
897 #else
898 		cnt = wctomb(mbs + o, wc);
899 #endif
900 		if (cnt == -1)
901 			goto err_out;
902 		if (cnt <= 0) {
903 			ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
904 			errno = EINVAL;
905 			goto err_out;
906 		}
907 		o += cnt;
908 	}
909 #ifdef HAVE_MBSINIT
910 	/* Make sure we are back in the initial state. */
911 	if (!mbsinit(&mbstate)) {
912 		ntfs_log_debug("Eeek. mbstate not in initial state!\n");
913 		errno = EILSEQ;
914 		goto err_out;
915 	}
916 #endif
917 	/* Now write the NULL character. */
918 	mbs[o] = '\0';
919 	if (*outs != mbs)
920 		*outs = mbs;
921 	return o;
922 err_out:
923 	if (mbs != *outs) {
924 		int eo = errno;
925 		free(mbs);
926 		errno = eo;
927 	}
928 #else /* MB_CUR_MAX */
929 	errno = EILSEQ;
930 #endif /* MB_CUR_MAX */
931 	return -1;
932 }
933 
934 /**
935  * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
936  * @ins:	input multibyte string buffer
937  * @outs:	on return contains the (allocated) output Unicode string
938  *
939  * Convert the input multibyte string @ins, from the current locale into the
940  * corresponding little endian, 2-byte Unicode string.
941  *
942  * The function allocates the string and the caller is responsible for calling
943  * free(*@outs); when finished with it.
944  *
945  * On success the function returns the number of Unicode characters written to
946  * the output string *@outs (>= 0), not counting the terminating Unicode NULL
947  * character.
948  *
949  * On error, -1 is returned, and errno is set to the error code. The following
950  * error codes can be expected:
951  *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
952  *	EILSEQ		The input string cannot be represented as a Unicode
953  *			string according to the current locale.
954  *	ENAMETOOLONG	Destination buffer is too small for input string.
955  *	ENOMEM		Not enough memory to allocate destination buffer.
956  */
957 int ntfs_mbstoucs(const char *ins, ntfschar **outs)
958 {
959 #ifdef MB_CUR_MAX
960 	ntfschar *ucs;
961 	const char *s;
962 	wchar_t wc;
963 	int i, o, cnt, ins_len, ucs_len, ins_size;
964 #ifdef HAVE_MBSINIT
965 	mbstate_t mbstate;
966 #endif
967 #endif /* MB_CUR_MAX */
968 
969 	if (!ins || !outs) {
970 		errno = EINVAL;
971 		return -1;
972 	}
973 
974 	if (use_utf8)
975 		return ntfs_utf8_to_utf16(ins, outs);
976 
977 #ifdef MB_CUR_MAX
978 	/* Determine the size of the multi-byte string in bytes. */
979 	ins_size = strlen(ins);
980 	/* Determine the length of the multi-byte string. */
981 	s = ins;
982 #if defined(HAVE_MBSINIT)
983 	memset(&mbstate, 0, sizeof(mbstate));
984 	ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
985 #ifdef __CYGWIN32__
986 	if (!ins_len && *ins) {
987 		/* Older Cygwin had broken mbsrtowcs() implementation. */
988 		ins_len = strlen(ins);
989 	}
990 #endif
991 #elif !defined(DJGPP)
992 	ins_len = mbstowcs(NULL, s, 0);
993 #else
994 	/* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
995 	ins_len = strlen(ins);
996 #endif
997 	if (ins_len == -1)
998 		return ins_len;
999 #ifdef HAVE_MBSINIT
1000 	if ((s != ins) || !mbsinit(&mbstate)) {
1001 #else
1002 	if (s != ins) {
1003 #endif
1004 		errno = EILSEQ;
1005 		return -1;
1006 	}
1007 	/* Add the NULL terminator. */
1008 	ins_len++;
1009 	ucs_len = ins_len;
1010 	ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
1011 	if (!ucs)
1012 		return -1;
1013 #ifdef HAVE_MBSINIT
1014 	memset(&mbstate, 0, sizeof(mbstate));
1015 #else
1016 	mbtowc(NULL, NULL, 0);
1017 #endif
1018 	for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
1019 		/* Reallocate memory if necessary. */
1020 		if (o >= ucs_len) {
1021 			ntfschar *tc;
1022 			ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
1023 			tc = realloc(ucs, ucs_len);
1024 			if (!tc)
1025 				goto err_out;
1026 			ucs = tc;
1027 			ucs_len /= sizeof(ntfschar);
1028 		}
1029 		/* Convert the multibyte character to a wide character. */
1030 #ifdef HAVE_MBSINIT
1031 		cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
1032 #else
1033 		cnt = mbtowc(&wc, ins + i, ins_size - i);
1034 #endif
1035 		if (!cnt)
1036 			break;
1037 		if (cnt == -1)
1038 			goto err_out;
1039 		if (cnt < -1) {
1040 			ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1041 			errno = EINVAL;
1042 			goto err_out;
1043 		}
1044 		/* Make sure we are not overflowing the NTFS Unicode set. */
1045 		if ((unsigned long)wc >= (unsigned long)(1 <<
1046 				(8 * sizeof(ntfschar)))) {
1047 			errno = EILSEQ;
1048 			goto err_out;
1049 		}
1050 		/* Convert the CPU wide character to a LE Unicode character. */
1051 		ucs[o] = cpu_to_le16(wc);
1052 	}
1053 #ifdef HAVE_MBSINIT
1054 	/* Make sure we are back in the initial state. */
1055 	if (!mbsinit(&mbstate)) {
1056 		ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1057 		errno = EILSEQ;
1058 		goto err_out;
1059 	}
1060 #endif
1061 	/* Now write the NULL character. */
1062 	ucs[o] = cpu_to_le16(L'\0');
1063 	*outs = ucs;
1064 	return o;
1065 err_out:
1066 	free(ucs);
1067 #else /* MB_CUR_MAX */
1068 	errno = EILSEQ;
1069 #endif /* MB_CUR_MAX */
1070 	return -1;
1071 }
1072 
1073 /*
1074  *		Turn a UTF8 name uppercase
1075  *
1076  *	Returns an allocated uppercase name which has to be freed by caller
1077  *	or NULL if there is an error (described by errno)
1078  */
1079 
1080 char *ntfs_uppercase_mbs(const char *low,
1081 			const ntfschar *upcase, u32 upcase_size)
1082 {
1083 	int size;
1084 	char *upp;
1085 	u32 wc;
1086 	int n;
1087 	const char *s;
1088 	char *t;
1089 
1090 	size = strlen(low);
1091 	upp = (char*)ntfs_malloc(3*size + 1);
1092 	if (upp) {
1093 		s = low;
1094 		t = upp;
1095 		do {
1096 			n = utf8_to_unicode(&wc, s);
1097 			if (n > 0) {
1098 				if (wc < upcase_size)
1099 					wc = le16_to_cpu(upcase[wc]);
1100 				if (wc < 0x80)
1101 					*t++ = wc;
1102 				else if (wc < 0x800) {
1103 					*t++ = (0xc0 | ((wc >> 6) & 0x3f));
1104 					*t++ = 0x80 | (wc & 0x3f);
1105 				} else if (wc < 0x10000) {
1106 					*t++ = 0xe0 | (wc >> 12);
1107 					*t++ = 0x80 | ((wc >> 6) & 0x3f);
1108 					*t++ = 0x80 | (wc & 0x3f);
1109 				} else {
1110 					*t++ = 0xf0 | ((wc >> 18) & 7);
1111 					*t++ = 0x80 | ((wc >> 12) & 63);
1112 					*t++ = 0x80 | ((wc >> 6) & 0x3f);
1113 					*t++ = 0x80 | (wc & 0x3f);
1114 				}
1115 			s += n;
1116 			}
1117 		} while (n > 0);
1118 		if (n < 0) {
1119 			free(upp);
1120 			upp = (char*)NULL;
1121 			errno = EILSEQ;
1122 		}
1123 		*t = 0;
1124 	}
1125 	return (upp);
1126 }
1127 
1128 /**
1129  * ntfs_upcase_table_build - build the default upcase table for NTFS
1130  * @uc:		destination buffer where to store the built table
1131  * @uc_len:	size of destination buffer in bytes
1132  *
1133  * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1134  * stores it in the caller supplied buffer @uc of size @uc_len.
1135  *
1136  * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1137  */
1138 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1139 {
1140 #if 1 /* Vista */
1141 	/*
1142 	 *	This is the table as defined by Vista
1143 	 */
1144 	/*
1145 	 * "Start" is inclusive and "End" is exclusive, every value has the
1146 	 * value of "Add" added to it.
1147 	 */
1148 	static int uc_run_table[][3] = { /* Start, End, Add */
1149 	{0x0061, 0x007b,   -32}, {0x00e0, 0x00f7,  -32}, {0x00f8, 0x00ff, -32},
1150 	{0x0256, 0x0258,  -205}, {0x028a, 0x028c, -217}, {0x037b, 0x037e, 130},
1151 	{0x03ac, 0x03ad,   -38}, {0x03ad, 0x03b0,  -37}, {0x03b1, 0x03c2, -32},
1152 	{0x03c2, 0x03c3,   -31}, {0x03c3, 0x03cc,  -32}, {0x03cc, 0x03cd, -64},
1153 	{0x03cd, 0x03cf,   -63}, {0x0430, 0x0450,  -32}, {0x0450, 0x0460, -80},
1154 	{0x0561, 0x0587,   -48}, {0x1f00, 0x1f08,    8}, {0x1f10, 0x1f16,   8},
1155 	{0x1f20, 0x1f28,     8}, {0x1f30, 0x1f38,    8}, {0x1f40, 0x1f46,   8},
1156 	{0x1f51, 0x1f52,     8}, {0x1f53, 0x1f54,    8}, {0x1f55, 0x1f56,   8},
1157 	{0x1f57, 0x1f58,     8}, {0x1f60, 0x1f68,    8}, {0x1f70, 0x1f72,  74},
1158 	{0x1f72, 0x1f76,    86}, {0x1f76, 0x1f78,  100}, {0x1f78, 0x1f7a, 128},
1159 	{0x1f7a, 0x1f7c,   112}, {0x1f7c, 0x1f7e,  126}, {0x1f80, 0x1f88,   8},
1160 	{0x1f90, 0x1f98,     8}, {0x1fa0, 0x1fa8,    8}, {0x1fb0, 0x1fb2,   8},
1161 	{0x1fb3, 0x1fb4,     9}, {0x1fcc, 0x1fcd,   -9}, {0x1fd0, 0x1fd2,   8},
1162 	{0x1fe0, 0x1fe2,     8}, {0x1fe5, 0x1fe6,    7}, {0x1ffc, 0x1ffd,  -9},
1163 	{0x2170, 0x2180,   -16}, {0x24d0, 0x24ea,  -26}, {0x2c30, 0x2c5f, -48},
1164 	{0x2d00, 0x2d26, -7264}, {0xff41, 0xff5b,  -32}, {0}
1165 	};
1166 	/*
1167 	 * "Start" is exclusive and "End" is inclusive, every second value is
1168 	 * decremented by one.
1169 	 */
1170 	static int uc_dup_table[][2] = { /* Start, End */
1171 	{0x0100, 0x012f}, {0x0132, 0x0137}, {0x0139, 0x0149}, {0x014a, 0x0178},
1172 	{0x0179, 0x017e}, {0x01a0, 0x01a6}, {0x01b3, 0x01b7}, {0x01cd, 0x01dd},
1173 	{0x01de, 0x01ef}, {0x01f4, 0x01f5}, {0x01f8, 0x01f9}, {0x01fa, 0x0220},
1174 	{0x0222, 0x0234}, {0x023b, 0x023c}, {0x0241, 0x0242}, {0x0246, 0x024f},
1175 	{0x03d8, 0x03ef}, {0x03f7, 0x03f8}, {0x03fa, 0x03fb}, {0x0460, 0x0481},
1176 	{0x048a, 0x04bf}, {0x04c1, 0x04c4}, {0x04c5, 0x04c8}, {0x04c9, 0x04ce},
1177 	{0x04ec, 0x04ed}, {0x04d0, 0x04eb}, {0x04ee, 0x04f5}, {0x04f6, 0x0513},
1178 	{0x1e00, 0x1e95}, {0x1ea0, 0x1ef9}, {0x2183, 0x2184}, {0x2c60, 0x2c61},
1179 	{0x2c67, 0x2c6c}, {0x2c75, 0x2c76}, {0x2c80, 0x2ce3}, {0}
1180 	};
1181 	/*
1182 	 * Set the Unicode character at offset "Offset" to "Value".  Note,
1183 	 * "Value" is host endian.
1184 	 */
1185 	static int uc_byte_table[][2] = { /* Offset, Value */
1186 	{0x00ff, 0x0178}, {0x0180, 0x0243}, {0x0183, 0x0182}, {0x0185, 0x0184},
1187 	{0x0188, 0x0187}, {0x018c, 0x018b}, {0x0192, 0x0191}, {0x0195, 0x01f6},
1188 	{0x0199, 0x0198}, {0x019a, 0x023d}, {0x019e, 0x0220}, {0x01a8, 0x01a7},
1189 	{0x01ad, 0x01ac}, {0x01b0, 0x01af}, {0x01b9, 0x01b8}, {0x01bd, 0x01bc},
1190 	{0x01bf, 0x01f7}, {0x01c6, 0x01c4}, {0x01c9, 0x01c7}, {0x01cc, 0x01ca},
1191 	{0x01dd, 0x018e}, {0x01f3, 0x01f1}, {0x023a, 0x2c65}, {0x023e, 0x2c66},
1192 	{0x0253, 0x0181}, {0x0254, 0x0186}, {0x0259, 0x018f}, {0x025b, 0x0190},
1193 	{0x0260, 0x0193}, {0x0263, 0x0194}, {0x0268, 0x0197}, {0x0269, 0x0196},
1194 	{0x026b, 0x2c62}, {0x026f, 0x019c}, {0x0272, 0x019d}, {0x0275, 0x019f},
1195 	{0x027d, 0x2c64}, {0x0280, 0x01a6}, {0x0283, 0x01a9}, {0x0288, 0x01ae},
1196 	{0x0289, 0x0244}, {0x028c, 0x0245}, {0x0292, 0x01b7}, {0x03f2, 0x03f9},
1197 	{0x04cf, 0x04c0}, {0x1d7d, 0x2c63}, {0x214e, 0x2132}, {0}
1198 	};
1199 #else /* Vista */
1200 	/*
1201 	 *	This is the table as defined by Windows XP
1202 	 */
1203 	static int uc_run_table[][3] = { /* Start, End, Add */
1204 	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
1205 	{0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
1206 	{0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1207 	{0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
1208 	{0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
1209 	{0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
1210 	{0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
1211 	{0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
1212 	{0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
1213 	{0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
1214 	{0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
1215 	{0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
1216 	{0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
1217 	{0}
1218 	};
1219 	static int uc_dup_table[][2] = { /* Start, End */
1220 	{0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1221 	{0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1222 	{0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1223 	{0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1224 	{0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1225 	{0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1226 	{0}
1227 	};
1228 	static int uc_byte_table[][2] = { /* Offset, Value */
1229 	{0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1230 	{0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1231 	{0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1232 	{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1233 	{0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1234 	{0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1235 	{0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1236 	{0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1237 	{0}
1238 	};
1239 #endif /* Vista */
1240 	int i, r;
1241 	int k, off;
1242 
1243 	memset((char*)uc, 0, uc_len);
1244 	uc_len >>= 1;
1245 	if (uc_len > 65536)
1246 		uc_len = 65536;
1247 	for (i = 0; (u32)i < uc_len; i++)
1248 		uc[i] = cpu_to_le16(i);
1249 	for (r = 0; uc_run_table[r][0]; r++) {
1250 		off = uc_run_table[r][2];
1251 		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1252 			uc[i] = cpu_to_le16(i + off);
1253 	}
1254 	for (r = 0; uc_dup_table[r][0]; r++)
1255 		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1256 			uc[i + 1] = cpu_to_le16(i);
1257 	for (r = 0; uc_byte_table[r][0]; r++) {
1258 		k = uc_byte_table[r][1];
1259 		uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1260 	}
1261 }
1262 
1263 /*
1264  *		Allocate and build the default upcase table
1265  *
1266  *	Returns the number of entries
1267  *		0 if failed
1268  */
1269 
1270 #define UPCASE_LEN 65536 /* default number of entries in upcase */
1271 
1272 u32 ntfs_upcase_build_default(ntfschar **upcase)
1273 {
1274 	u32 upcase_len = 0;
1275 
1276 	*upcase = (ntfschar*)ntfs_malloc(UPCASE_LEN*2);
1277 	if (*upcase) {
1278 		ntfs_upcase_table_build(*upcase, UPCASE_LEN*2);
1279 		upcase_len = UPCASE_LEN;
1280 	}
1281 	return (upcase_len);
1282 }
1283 
1284 /*
1285  *		Build a table for converting to lower case
1286  *
1287  *	This is only meaningful when there is a single lower case
1288  *	character leading to an upper case one, and currently the
1289  *	only exception is the greek letter sigma which has a single
1290  *	upper case glyph (code U+03A3), but two lower case glyphs
1291  *	(code U+03C3 and U+03C2, the latter to be used at the end
1292  *	of a word). In the following implementation the upper case
1293  *	sigma will be lowercased as U+03C3.
1294  */
1295 
1296 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1297 {
1298 	ntfschar *lc;
1299 	u32 upp;
1300 	u32 i;
1301 
1302 	lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1303 	if (lc) {
1304 		for (i=0; i<uc_cnt; i++)
1305 			lc[i] = cpu_to_le16(i);
1306 		for (i=0; i<uc_cnt; i++) {
1307 			upp = le16_to_cpu(uc[i]);
1308 			if ((upp != i) && (upp < uc_cnt))
1309 				lc[upp] = cpu_to_le16(i);
1310 		}
1311 	} else
1312 		ntfs_log_error("Could not build the locase table\n");
1313 	return (lc);
1314 }
1315 
1316 /**
1317  * ntfs_str2ucs - convert a string to a valid NTFS file name
1318  * @s:		input string
1319  * @len:	length of output buffer in Unicode characters
1320  *
1321  * Convert the input @s string into the corresponding little endian,
1322  * 2-byte Unicode string. The length of the converted string is less
1323  * or equal to the maximum length allowed by the NTFS format (255).
1324  *
1325  * If @s is NULL then return AT_UNNAMED.
1326  *
1327  * On success the function returns the Unicode string in an allocated
1328  * buffer and the caller is responsible to free it when it's not needed
1329  * anymore.
1330  *
1331  * On error NULL is returned and errno is set to the error code.
1332  */
1333 ntfschar *ntfs_str2ucs(const char *s, int *len)
1334 {
1335 	ntfschar *ucs = NULL;
1336 
1337 	if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1338 		ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1339 		return NULL;
1340 	}
1341 	if (*len > NTFS_MAX_NAME_LEN) {
1342 		free(ucs);
1343 		errno = ENAMETOOLONG;
1344 		return NULL;
1345 	}
1346 	if (!ucs || !*len) {
1347 		ucs  = AT_UNNAMED;
1348 		*len = 0;
1349 	}
1350 	return ucs;
1351 }
1352 
1353 /**
1354  * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1355  * @ucs		input string to be freed
1356  *
1357  * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1358  *
1359  * Return value: none.
1360  */
1361 void ntfs_ucsfree(ntfschar *ucs)
1362 {
1363 	if (ucs && (ucs != AT_UNNAMED))
1364 		free(ucs);
1365 }
1366 
1367 /*
1368  *		Check whether a name contains no chars forbidden
1369  *	for DOS or Win32 use
1370  *
1371  *	If there is a bad char, errno is set to EINVAL
1372  */
1373 
1374 BOOL ntfs_forbidden_chars(const ntfschar *name, int len)
1375 {
1376 	BOOL forbidden;
1377 	int ch;
1378 	int i;
1379 	u32 mainset =     (1L << ('\"' - 0x20))
1380 			| (1L << ('*' - 0x20))
1381 			| (1L << ('/' - 0x20))
1382 			| (1L << (':' - 0x20))
1383 			| (1L << ('<' - 0x20))
1384 			| (1L << ('>' - 0x20))
1385 			| (1L << ('?' - 0x20));
1386 
1387 	forbidden = (len == 0)
1388 			|| (le16_to_cpu(name[len-1]) == ' ')
1389 			|| (le16_to_cpu(name[len-1]) == '.');
1390 	for (i=0; i<len; i++) {
1391 		ch = le16_to_cpu(name[i]);
1392 		if ((ch < 0x20)
1393 		    || ((ch < 0x40)
1394 			&& ((1L << (ch - 0x20)) & mainset))
1395 		    || (ch == '\\')
1396 		    || (ch == '|'))
1397 			forbidden = TRUE;
1398 	}
1399 	if (forbidden)
1400 		errno = EINVAL;
1401 	return (forbidden);
1402 }
1403 
1404 /*
1405  *		Check whether the same name can be used as a DOS and
1406  *	a Win32 name
1407  *
1408  *	The names must be the same, or the short name the uppercase
1409  *	variant of the long name
1410  */
1411 
1412 BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1413 			const ntfschar *shortname, int shortlen,
1414 			const ntfschar *longname, int longlen)
1415 {
1416 	BOOL collapsible;
1417 	unsigned int ch;
1418 	unsigned int cs;
1419 	int i;
1420 
1421 	collapsible = shortlen == longlen;
1422 	for (i=0; collapsible && (i<shortlen); i++) {
1423 		ch = le16_to_cpu(longname[i]);
1424 		cs = le16_to_cpu(shortname[i]);
1425 		if ((cs != ch)
1426 		    && ((ch >= vol->upcase_len)
1427 			|| (cs >= vol->upcase_len)
1428 			|| (vol->upcase[cs] != vol->upcase[ch])))
1429 				collapsible = FALSE;
1430 	}
1431 	return (collapsible);
1432 }
1433 
1434 /*
1435  * Define the character encoding to be used.
1436  * Use UTF-8 unless specified otherwise.
1437  */
1438 
1439 int ntfs_set_char_encoding(const char *locale)
1440 {
1441 	use_utf8 = 0;
1442 	if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1443 	    || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1444 		use_utf8 = 1;
1445 	else
1446 		if (setlocale(LC_ALL, locale))
1447 			use_utf8 = 0;
1448 		else {
1449 			ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1450 			use_utf8 = 1;
1451 	 	}
1452 	return 0; /* always successful */
1453 }
1454 
1455 #if defined(__APPLE__) || defined(__DARWIN__)
1456 
/*
 * Record whether UTF-8 file names have to be normalized.
 *
 * Only 0 and 1 are accepted for "normalize". Returns 0 if the setting
 * was recorded, or -1 if the value is invalid or the support was not
 * compiled in (ENABLE_NFCONV undefined).
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	if(normalize != 0 && normalize != 1)
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1469 
/*
 * Normalize a UTF-8 string through CoreFoundation, to form C when
 * "composed" is non-zero, to form D otherwise.
 *
 * On success the normalized, '\0'-terminated string is returned in
 * *target (allocated by ntfs_calloc(), to be freed by the caller) and
 * its length in bytes is returned.
 * On failure *target is left unchanged and a negative value is
 * returned : -2 if the CFString could not be created, -3 if its mutable
 * copy could not be created, -1 for any other failure or when the
 * support was not compiled in (ENABLE_NFCONV undefined).
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
 int composed) {
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buf = NULL;
	int bufsize = -1;

	/* Convert the UTF-8 string to a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8);
	if(source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Work on a mutable copy; the source string is not needed any more. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source);
	if(work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	/* Normalize the copy to the requested form. */
	CFStringNormalize(work, (composed ? kCFStringNormalizationFormC : kCFStringNormalizationFormD));

	/* First pass, with no buffer: get the number of bytes required. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if(CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false, NULL, 0, &needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	/* One more byte than required, so that the result is '\0'-terminated. */
	bufsize = sizeof(char)*(needed + 1);
	buf = ntfs_calloc(bufsize);
	if(buf == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", bufsize);
		goto release;
	}

	/* Second pass: do the conversion into the buffer. */
	if(CFStringGetBytes(work, whole, kCFStringEncodingUTF8,
			    0, false, (UInt8*)buf, bufsize-1, &needed) <= 0) {
		ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
		free(buf);
		buf = NULL;
	}

release:
	CFRelease(work);

	if(buf == NULL)
		return -1;
	*target = buf;
	return bufsize - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1532 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
1533