xref: /haiku/src/add-ons/kernel/file_systems/ntfs/libntfs/unistr.c (revision bf243977ffd34197ad7c0820c2da5d21abea0402)
1 /**
2  * unistr.c - Unicode string handling. Originated from the Linux-NTFS project.
3  *
4  * Copyright (c) 2000-2004 Anton Altaparmakov
5  * Copyright (c) 2002-2009 Szabolcs Szakacsits
6  * Copyright (c) 2008-2009 Jean-Pierre Andre
7  * Copyright (c) 2008      Bernhard Kaindl
8  *
9  * This program/include file is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as published
11  * by the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program/include file is distributed in the hope that it will be
15  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
16  * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program (in the main directory of the NTFS-3G
21  * distribution in the file COPYING); if not, write to the Free Software
22  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23  */
24 
25 #ifdef HAVE_CONFIG_H
26 #include "config.h"
27 #endif
28 
29 #ifdef HAVE_STDIO_H
30 #include <stdio.h>
31 #endif
32 #ifdef HAVE_STDLIB_H
33 #include <stdlib.h>
34 #endif
35 #ifdef HAVE_WCHAR_H
36 #include <wchar.h>
37 #endif
38 #ifdef HAVE_STRING_H
39 #include <string.h>
40 #endif
41 #ifdef HAVE_ERRNO_H
42 #include <errno.h>
43 #endif
44 #ifdef HAVE_LOCALE_H
45 #include <locale.h>
46 #endif
47 
48 #if defined(__APPLE__) || defined(__DARWIN__)
49 #ifdef ENABLE_NFCONV
50 #include <CoreFoundation/CoreFoundation.h>
51 #endif /* ENABLE_NFCONV */
52 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
53 
54 #include "compat.h"
55 #include "attrib.h"
56 #include "types.h"
57 #include "unistr.h"
58 #include "debug.h"
59 #include "logging.h"
60 #include "misc.h"
61 
/*
 * Compile-time switch for the UTF-8 <-> UTF-16 converters in this file:
 * when nonzero they additionally reject U+FFFE and U+FFFF; when zero those
 * two values are accepted like the rest of the U+E000..U+FFFF range.
 */
#define NOREVBOM 0  /* JPA rejecting U+FFFE and U+FFFF, open to debate */

// no wchar support in the Haiku kernel
// In a Haiku kernel build, mbstowcs(), wctomb(), mbtowc() and setlocale() are
// replaced by expressions that call panic() with the function name and then
// evaluate to 0.
#if defined(__HAIKU__) && defined(_KERNEL_MODE)
#	include <KernelExport.h>
#	define mbstowcs(a, b, c)	(panic("mbstowcs"), 0)
#	define wctomb(a, b)			(panic("wctomb"), 0)
#	define mbtowc(a, b, c)		(panic("mbtowc"), 0)
#	define setlocale(a, b)		(panic("setlocale"), 0)
#endif
72 
73 /*
74  * IMPORTANT
75  * =========
76  *
77  * All these routines assume that the Unicode characters are in little endian
78  * encoding inside the strings!!!
79  */
80 
/*
 * When nonzero, ntfs_ucstombs() and ntfs_mbstoucs() use the built-in
 * UTF-8 <-> UTF-16LE converters; when zero they fall through to the
 * locale-dependent multibyte routines.
 */
static int use_utf8 = 1; /* use UTF-8 encoding for file names */
82 
83 #if defined(__APPLE__) || defined(__DARWIN__)
84 #ifdef ENABLE_NFCONV
85 /**
86  * This variable controls whether or not automatic normalization form conversion
87  * should be performed when translating NTFS unicode file names to UTF-8.
88  * Defaults to on, but can be controlled from the outside using the function
89  *   int ntfs_macosx_normalize_filenames(int normalize);
90  */
91 static int nfconvert_utf8 = 1;
92 #endif /* ENABLE_NFCONV */
93 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
94 
95 /*
96  * This is used by the name collation functions to quickly determine what
97  * characters are (in)valid.
98  */
99 #if 0
100 static const u8 legal_ansi_char_array[0x40] = {
101 	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
102 	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
103 
104 	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
105 	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
106 
107 	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
108 	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
109 
110 	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
111 	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
112 };
113 #endif
114 
115 /**
116  * ntfs_names_are_equal - compare two Unicode names for equality
117  * @s1:			name to compare to @s2
118  * @s1_len:		length in Unicode characters of @s1
119  * @s2:			name to compare to @s1
120  * @s2_len:		length in Unicode characters of @s2
121  * @ic:			ignore case bool
122  * @upcase:		upcase table (only if @ic == IGNORE_CASE)
123  * @upcase_size:	length in Unicode characters of @upcase (if present)
124  *
125  * Compare the names @s1 and @s2 and return TRUE (1) if the names are
126  * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
127  * the @upcase table is used to perform a case insensitive comparison.
128  */
129 BOOL ntfs_names_are_equal(const ntfschar *s1, size_t s1_len,
130 		const ntfschar *s2, size_t s2_len,
131 		const IGNORE_CASE_BOOL ic,
132 		const ntfschar *upcase, const u32 upcase_size)
133 {
134 	if (s1_len != s2_len)
135 		return FALSE;
136 	if (!s1_len)
137 		return TRUE;
138 	if (ic == CASE_SENSITIVE)
139 		return ntfs_ucsncmp(s1, s2, s1_len) ? FALSE: TRUE;
140 	return ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size) ? FALSE:
141 								       TRUE;
142 }
143 
144 /*
145  * ntfs_names_full_collate() fully collate two Unicode names
146  *
147  * @name1:	first Unicode name to compare
148  * @name1_len:	length of first Unicode name to compare
149  * @name2:	second Unicode name to compare
150  * @name2_len:	length of second Unicode name to compare
151  * @ic:		either CASE_SENSITIVE or IGNORE_CASE
152  * @upcase:	upcase table (ignored if @ic is CASE_SENSITIVE)
153  * @upcase_len:	upcase table size (ignored if @ic is CASE_SENSITIVE)
154  *
155  *  -1 if the first name collates before the second one,
156  *   0 if the names match,
157  *   1 if the second name collates before the first one, or
158  *
159  */
160 int ntfs_names_full_collate(const ntfschar *name1, const u32 name1_len,
161 		const ntfschar *name2, const u32 name2_len,
162 		const IGNORE_CASE_BOOL ic, const ntfschar *upcase,
163 		const u32 upcase_len)
164 {
165 	u32 cnt;
166 	u16 c1, c2;
167 	u16 u1, u2;
168 
169 #ifdef DEBUG
170 	if (!name1 || !name2 || (ic && (!upcase || !upcase_len))) {
171 		ntfs_log_debug("ntfs_names_collate received NULL pointer!\n");
172 		exit(1);
173 	}
174 #endif
175 	cnt = min(name1_len, name2_len);
176 	if (cnt > 0) {
177 		if (ic == CASE_SENSITIVE) {
178 			do {
179 				c1 = le16_to_cpu(*name1);
180 				name1++;
181 				c2 = le16_to_cpu(*name2);
182 				name2++;
183 			} while (--cnt && (c1 == c2));
184 			u1 = c1;
185 			u2 = c2;
186 			if (u1 < upcase_len)
187 				u1 = le16_to_cpu(upcase[u1]);
188 			if (u2 < upcase_len)
189 				u2 = le16_to_cpu(upcase[u2]);
190 			if ((u1 == u2) && cnt)
191 				do {
192 					u1 = le16_to_cpu(*name1);
193 					name1++;
194 					u2 = le16_to_cpu(*name2);
195 					name2++;
196 					if (u1 < upcase_len)
197 						u1 = le16_to_cpu(upcase[u1]);
198 					if (u2 < upcase_len)
199 						u2 = le16_to_cpu(upcase[u2]);
200 				} while ((u1 == u2) && --cnt);
201 			if (u1 < u2)
202 				return -1;
203 			if (u1 > u2)
204 				return 1;
205 			if (name1_len < name2_len)
206 				return -1;
207 			if (name1_len > name2_len)
208 				return 1;
209 			if (c1 < c2)
210 				return -1;
211 			if (c1 > c2)
212 				return 1;
213 		} else {
214 			do {
215 				u1 = c1 = le16_to_cpu(*name1);
216 				name1++;
217 				u2 = c2 = le16_to_cpu(*name2);
218 				name2++;
219 				if (u1 < upcase_len)
220 					u1 = le16_to_cpu(upcase[u1]);
221 				if (u2 < upcase_len)
222 					u2 = le16_to_cpu(upcase[u2]);
223 			} while ((u1 == u2) && --cnt);
224 			if (u1 < u2)
225 				return -1;
226 			if (u1 > u2)
227 				return 1;
228 			if (name1_len < name2_len)
229 				return -1;
230 			if (name1_len > name2_len)
231 				return 1;
232 		}
233 	} else {
234 		if (name1_len < name2_len)
235 			return -1;
236 		if (name1_len > name2_len)
237 			return 1;
238 	}
239 	return 0;
240 }
241 
242 /**
243  * ntfs_ucsncmp - compare two little endian Unicode strings
244  * @s1:		first string
245  * @s2:		second string
246  * @n:		maximum unicode characters to compare
247  *
248  * Compare the first @n characters of the Unicode strings @s1 and @s2,
249  * The strings in little endian format and appropriate le16_to_cpu()
250  * conversion is performed on non-little endian machines.
251  *
252  * The function returns an integer less than, equal to, or greater than zero
253  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
254  * to be less than, to match, or be greater than @s2.
255  */
256 int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
257 {
258 	ntfschar c1, c2;
259 	size_t i;
260 
261 #ifdef DEBUG
262 	if (!s1 || !s2) {
263 		ntfs_log_debug("ntfs_wcsncmp() received NULL pointer!\n");
264 		exit(1);
265 	}
266 #endif
267 	for (i = 0; i < n; ++i) {
268 		c1 = le16_to_cpu(s1[i]);
269 		c2 = le16_to_cpu(s2[i]);
270 		if (c1 < c2)
271 			return -1;
272 		if (c1 > c2)
273 			return 1;
274 		if (!c1)
275 			break;
276 	}
277 	return 0;
278 }
279 
280 /**
281  * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
282  * @s1:			first string
283  * @s2:			second string
284  * @n:			maximum unicode characters to compare
285  * @upcase:		upcase table
286  * @upcase_size:	upcase table size in Unicode characters
287  *
288  * Compare the first @n characters of the Unicode strings @s1 and @s2,
289  * ignoring case. The strings in little endian format and appropriate
290  * le16_to_cpu() conversion is performed on non-little endian machines.
291  *
292  * Each character is uppercased using the @upcase table before the comparison.
293  *
294  * The function returns an integer less than, equal to, or greater than zero
295  * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
296  * to be less than, to match, or be greater than @s2.
297  */
298 int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
299 		const ntfschar *upcase, const u32 upcase_size)
300 {
301 	u16 c1, c2;
302 	size_t i;
303 
304 #ifdef DEBUG
305 	if (!s1 || !s2 || !upcase) {
306 		ntfs_log_debug("ntfs_wcsncasecmp() received NULL pointer!\n");
307 		exit(1);
308 	}
309 #endif
310 	for (i = 0; i < n; ++i) {
311 		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
312 			c1 = le16_to_cpu(upcase[c1]);
313 		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
314 			c2 = le16_to_cpu(upcase[c2]);
315 		if (c1 < c2)
316 			return -1;
317 		if (c1 > c2)
318 			return 1;
319 		if (!c1)
320 			break;
321 	}
322 	return 0;
323 }
324 
325 /**
326  * ntfs_ucsnlen - determine the length of a little endian Unicode string
327  * @s:		pointer to Unicode string
328  * @maxlen:	maximum length of string @s
329  *
330  * Return the number of Unicode characters in the little endian Unicode
331  * string @s up to a maximum of maxlen Unicode characters, not including
332  * the terminating (ntfschar)'\0'. If there is no (ntfschar)'\0' between @s
333  * and @s + @maxlen, @maxlen is returned.
334  *
335  * This function never looks beyond @s + @maxlen.
336  */
337 u32 ntfs_ucsnlen(const ntfschar *s, u32 maxlen)
338 {
339 	u32 i;
340 
341 	for (i = 0; i < maxlen; i++) {
342 		if (!le16_to_cpu(s[i]))
343 			break;
344 	}
345 	return i;
346 }
347 
348 /**
349  * ntfs_ucsndup - duplicate little endian Unicode string
350  * @s:		pointer to Unicode string
351  * @maxlen:	maximum length of string @s
352  *
353  * Return a pointer to a new little endian Unicode string which is a duplicate
354  * of the string s.  Memory for the new string is obtained with ntfs_malloc(3),
355  * and can be freed with free(3).
356  *
357  * A maximum of @maxlen Unicode characters are copied and a terminating
358  * (ntfschar)'\0' little endian Unicode character is added.
359  *
360  * This function never looks beyond @s + @maxlen.
361  *
362  * Return a pointer to the new little endian Unicode string on success and NULL
363  * on failure with errno set to the error code.
364  */
365 ntfschar *ntfs_ucsndup(const ntfschar *s, u32 maxlen)
366 {
367 	ntfschar *dst;
368 	u32 len;
369 
370 	len = ntfs_ucsnlen(s, maxlen);
371 	dst = ntfs_malloc((len + 1) * sizeof(ntfschar));
372 	if (dst) {
373 		memcpy(dst, s, len * sizeof(ntfschar));
374 		dst[len] = cpu_to_le16(L'\0');
375 	}
376 	return dst;
377 }
378 
379 /**
380  * ntfs_name_upcase - Map an Unicode name to its uppercase equivalent
381  * @name:
382  * @name_len:
383  * @upcase:
384  * @upcase_len:
385  *
386  * Description...
387  *
388  * Returns:
389  */
390 void ntfs_name_upcase(ntfschar *name, u32 name_len, const ntfschar *upcase,
391 		const u32 upcase_len)
392 {
393 	u32 i;
394 	u16 u;
395 
396 	for (i = 0; i < name_len; i++)
397 		if ((u = le16_to_cpu(name[i])) < upcase_len)
398 			name[i] = upcase[u];
399 }
400 
401 /**
402  * ntfs_name_locase - Map a Unicode name to its lowercase equivalent
403  */
404 void ntfs_name_locase(ntfschar *name, u32 name_len, const ntfschar *locase,
405 		const u32 locase_len)
406 {
407 	u32 i;
408 	u16 u;
409 
410 	if (locase)
411 		for (i = 0; i < name_len; i++)
412 			if ((u = le16_to_cpu(name[i])) < locase_len)
413 				name[i] = locase[u];
414 }
415 
416 /**
417  * ntfs_file_value_upcase - Convert a filename to upper case
418  * @file_name_attr:
419  * @upcase:
420  * @upcase_len:
421  *
422  * Description...
423  *
424  * Returns:
425  */
426 void ntfs_file_value_upcase(FILE_NAME_ATTR *file_name_attr,
427 		const ntfschar *upcase, const u32 upcase_len)
428 {
429 	ntfs_name_upcase((ntfschar*)&file_name_attr->file_name,
430 			file_name_attr->file_name_length, upcase, upcase_len);
431 }
432 
/*
   NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
   for now]) for path names, but the Unicode code points need to be
   converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
   glibc does this even without a locale in a hard-coded fashion, which
   appears to be easy because the low 7-bit ASCII range appears to be
   available in all charsets. However, it does not convert anything else if
   there was some error with the locale setup, or if no locale was set up,
   as when mount is called during early boot, where (by policy) locales
   are not used (and may not be available if /usr is not yet mounted).
   This patch fixes the resulting issues for systems which use UTF-8; for
   others, specifying the locale in fstab brings them the encoding which
   they want.

   If no locale is defined or there was a problem with setting one
   up, and whenever nl_langinfo(CODESET) returns a string starting with
   "ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
   the bug where NTFS-3G does not show any path names which include
   international characters (and also fails on creating them).

   Author: Bernhard Kaindl <bk@suse.de>
   Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
456 
457 /*
458  * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
459  * null) to store a given UTF-16LE string.
460  *
461  * Return -1 with errno set if string has invalid byte sequence or too long.
462  */
463 static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
464 {
465 	int i, ret = -1;
466 	int count = 0;
467 	BOOL surrog;
468 
469 	surrog = FALSE;
470 	for (i = 0; i < ins_len && ins[i]; i++) {
471 		unsigned short c = le16_to_cpu(ins[i]);
472 		if (surrog) {
473 			if ((c >= 0xdc00) && (c < 0xe000)) {
474 				surrog = FALSE;
475 				count += 4;
476 			} else
477 				goto fail;
478 		} else
479 			if (c < 0x80)
480 				count++;
481 			else if (c < 0x800)
482 				count += 2;
483 			else if (c < 0xd800)
484 				count += 3;
485 			else if (c < 0xdc00)
486 				surrog = TRUE;
487 #if NOREVBOM
488 			else if ((c >= 0xe000) && (c < 0xfffe))
489 #else
490 			else if (c >= 0xe000)
491 #endif
492 				count += 3;
493 			else
494 				goto fail;
495 		if (count > outs_len) {
496 			errno = ENAMETOOLONG;
497 			goto out;
498 		}
499 	}
500 	if (surrog)
501 		goto fail;
502 
503 	ret = count;
504 out:
505 	return ret;
506 fail:
507 	errno = EILSEQ;
508 	goto out;
509 }
510 
511 /*
512  * ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
513  * @ins:	input utf16 string buffer
514  * @ins_len:	length of input string in utf16 characters
515  * @outs:	on return contains the (allocated) output multibyte string
516  * @outs_len:	length of output buffer in bytes
517  *
518  * Return -1 with errno set if string has invalid byte sequence or too long.
519  */
520 static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
521 			      char **outs, int outs_len)
522 {
523 #if defined(__APPLE__) || defined(__DARWIN__)
524 #ifdef ENABLE_NFCONV
525 	char *original_outs_value = *outs;
526 	int original_outs_len = outs_len;
527 #endif /* ENABLE_NFCONV */
528 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
529 
530 	char *t;
531 	int i, size, ret = -1;
532 	int halfpair;
533 
534 	halfpair = 0;
535 	if (!*outs)
536 		outs_len = PATH_MAX;
537 
538 	size = utf16_to_utf8_size(ins, ins_len, outs_len);
539 
540 	if (size < 0)
541 		goto out;
542 
543 	if (!*outs) {
544 		outs_len = size + 1;
545 		*outs = ntfs_malloc(outs_len);
546 		if (!*outs)
547 			goto out;
548 	}
549 
550 	t = *outs;
551 
552 	for (i = 0; i < ins_len && ins[i]; i++) {
553 	    unsigned short c = le16_to_cpu(ins[i]);
554 			/* size not double-checked */
555 		if (halfpair) {
556 			if ((c >= 0xdc00) && (c < 0xe000)) {
557 				*t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
558 				*t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
559 				*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
560 				*t++ = 0x80 + (c & 63);
561 				halfpair = 0;
562 			} else
563 				goto fail;
564 		} else if (c < 0x80) {
565 			*t++ = c;
566 	    	} else {
567 			if (c < 0x800) {
568 			   	*t++ = (0xc0 | ((c >> 6) & 0x3f));
569 			        *t++ = 0x80 | (c & 0x3f);
570 			} else if (c < 0xd800) {
571 			   	*t++ = 0xe0 | (c >> 12);
572 			   	*t++ = 0x80 | ((c >> 6) & 0x3f);
573 		        	*t++ = 0x80 | (c & 0x3f);
574 			} else if (c < 0xdc00)
575 				halfpair = c;
576 			else if (c >= 0xe000) {
577 				*t++ = 0xe0 | (c >> 12);
578 				*t++ = 0x80 | ((c >> 6) & 0x3f);
579 			        *t++ = 0x80 | (c & 0x3f);
580 			} else
581 				goto fail;
582 	        }
583 	}
584 	*t = '\0';
585 
586 #if defined(__APPLE__) || defined(__DARWIN__)
587 #ifdef ENABLE_NFCONV
588 	if(nfconvert_utf8 && (t - *outs) > 0) {
589 		char *new_outs = NULL;
590 		int new_outs_len = ntfs_macosx_normalize_utf8(*outs, &new_outs, 0); // Normalize to decomposed form
591 		if(new_outs_len >= 0 && new_outs != NULL) {
592 			if(original_outs_value != *outs) {
593 				// We have allocated outs ourselves.
594 				free(*outs);
595 				*outs = new_outs;
596 				t = *outs + new_outs_len;
597 			}
598 			else {
599 				// We need to copy new_outs into the fixed outs buffer.
600 				memset(*outs, 0, original_outs_len);
601 				strncpy(*outs, new_outs, original_outs_len-1);
602 				t = *outs + original_outs_len;
603 				free(new_outs);
604 			}
605 		}
606 		else {
607 			ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFD: %s\n", *outs);
608 			ntfs_log_error("  new_outs=0x%p\n", new_outs);
609 			ntfs_log_error("  new_outs_len=%d\n", new_outs_len);
610 		}
611 	}
612 #endif /* ENABLE_NFCONV */
613 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
614 
615 	ret = t - *outs;
616 out:
617 	return ret;
618 fail:
619 	errno = EILSEQ;
620 	goto out;
621 }
622 
/*
 * Compute how many 16-bit elements (not counting a terminating null) the
 * UTF-16LE encoding of a UTF-8 string takes.
 *
 * Every byte which does not follow a lead byte counts for one element, and a
 * four-byte lead (0xF0..0xF4) counts for two once all of its three following
 * bytes are present. Counting stops at the end of the string, even if that
 * is in the middle of a sequence.
 *
 * Return the count, or -1 with errno set to ENAMETOOLONG if the count
 * reaches PATH_MAX, or to EILSEQ if a byte of 0xF5 or above is met where a
 * lead byte is expected.
 *
 * Note: This does not check whether the input sequence is a valid utf8 string,
 *	 and should be used only in context where such check is made!
 */
static int utf8_to_utf16_size(const char *s)
{
	const unsigned char *p = (const unsigned char *)s;
	size_t units = 0;
	unsigned int lead;
	int trail, k;

	while ((lead = *p++) != 0) {
		if (++units >= PATH_MAX) {
			errno = ENAMETOOLONG;
			return -1;
		}
		/* Plain ASCII and stray continuation bytes stand alone. */
		if (lead < 0xc0)
			continue;
		if (lead >= 0xF5) {
			errno = EILSEQ;
			return -1;
		}
		/* One, two or three bytes follow the lead byte. */
		trail = 1 + (lead >= 0xE0) + (lead >= 0xF0);
		for (k = 0; k < trail; k++) {
			/* String ends inside the sequence: stop counting. */
			if (!*p)
				return units;
			p++;
		}
		/* Four-byte sequences need a surrogate pair. */
		if ((lead >= 0xF0) && (++units >= PATH_MAX)) {
			errno = ENAMETOOLONG;
			return -1;
		}
	}
	return units;
}
670 /*
671  * This converts one UTF-8 sequence to cpu-endian Unicode value
672  * within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
673  *
674  * Return the number of used utf8 bytes or -1 with errno set
675  * if sequence is invalid.
676  */
677 static int utf8_to_unicode(u32 *wc, const char *s)
678 {
679     	unsigned int byte = *((const unsigned char *)s);
680 
681 					/* single byte */
682 	if (byte == 0) {
683 		*wc = (u32) 0;
684 		return 0;
685 	} else if (byte < 0x80) {
686 		*wc = (u32) byte;
687 		return 1;
688 					/* double byte */
689 	} else if (byte < 0xc2) {
690 		goto fail;
691 	} else if (byte < 0xE0) {
692 		if ((s[1] & 0xC0) == 0x80) {
693 			*wc = ((u32)(byte & 0x1F) << 6)
694 			    | ((u32)(s[1] & 0x3F));
695 			return 2;
696 		} else
697 			goto fail;
698 					/* three-byte */
699 	} else if (byte < 0xF0) {
700 		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
701 			*wc = ((u32)(byte & 0x0F) << 12)
702 			    | ((u32)(s[1] & 0x3F) << 6)
703 			    | ((u32)(s[2] & 0x3F));
704 			/* Check valid ranges */
705 #if NOREVBOM
706 			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
707 			  || ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
708 				return 3;
709 #else
710 			if (((*wc >= 0x800) && (*wc <= 0xD7FF))
711 			  || ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
712 				return 3;
713 #endif
714 		}
715 		goto fail;
716 					/* four-byte */
717 	} else if (byte < 0xF5) {
718 		if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
719 		  && ((s[3] & 0xC0) == 0x80)) {
720 			*wc = ((u32)(byte & 0x07) << 18)
721 			    | ((u32)(s[1] & 0x3F) << 12)
722 			    | ((u32)(s[2] & 0x3F) << 6)
723 			    | ((u32)(s[3] & 0x3F));
724 		/* Check valid ranges */
725 		if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
726 			return 4;
727 		}
728 		goto fail;
729 	}
730 fail:
731 	errno = EILSEQ;
732 	return -1;
733 }
734 
735 /**
736  * ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
737  * @ins:	input multibyte string buffer
738  * @outs:	on return contains the (allocated) output utf16 string
739  * @outs_len:	length of output buffer in utf16 characters
740  *
741  * Return -1 with errno set.
742  */
743 static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
744 {
745 #if defined(__APPLE__) || defined(__DARWIN__)
746 #ifdef ENABLE_NFCONV
747 	char *new_ins = NULL;
748 	if(nfconvert_utf8) {
749 		int new_ins_len;
750 		new_ins_len = ntfs_macosx_normalize_utf8(ins, &new_ins, 1); // Normalize to composed form
751 		if(new_ins_len >= 0)
752 			ins = new_ins;
753 		else
754 			ntfs_log_error("Failed to normalize NTFS string to UTF-8 NFC: %s\n", ins);
755 	}
756 #endif /* ENABLE_NFCONV */
757 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
758 	const char *t = ins;
759 	u32 wc;
760 	BOOL allocated;
761 	ntfschar *outpos;
762 	int shorts, ret = -1;
763 
764 	shorts = utf8_to_utf16_size(ins);
765 	if (shorts < 0)
766 		goto fail;
767 
768 	allocated = FALSE;
769 	if (!*outs) {
770 		*outs = ntfs_malloc((shorts + 1) * sizeof(ntfschar));
771 		if (!*outs)
772 			goto fail;
773 		allocated = TRUE;
774 	}
775 
776 	outpos = *outs;
777 
778 	while(1) {
779 		int m  = utf8_to_unicode(&wc, t);
780 		if (m <= 0) {
781 			if (m < 0) {
782 				/* do not leave space allocated if failed */
783 				if (allocated) {
784 					free(*outs);
785 					*outs = (ntfschar*)NULL;
786 				}
787 				goto fail;
788 			}
789 			*outpos++ = const_cpu_to_le16(0);
790 			break;
791 		}
792 		if (wc < 0x10000)
793 			*outpos++ = cpu_to_le16(wc);
794 		else {
795 			wc -= 0x10000;
796 			*outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
797 			*outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
798 		}
799 		t += m;
800 	}
801 
802 	ret = --outpos - *outs;
803 fail:
804 #if defined(__APPLE__) || defined(__DARWIN__)
805 #ifdef ENABLE_NFCONV
806 	if(new_ins != NULL)
807 		free(new_ins);
808 #endif /* ENABLE_NFCONV */
809 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
810 	return ret;
811 }
812 
813 /**
814  * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
815  * @ins:	input Unicode string buffer
816  * @ins_len:	length of input string in Unicode characters
817  * @outs:	on return contains the (allocated) output multibyte string
818  * @outs_len:	length of output buffer in bytes
819  *
820  * Convert the input little endian, 2-byte Unicode string @ins, of length
821  * @ins_len into the multibyte string format dictated by the current locale.
822  *
823  * If *@outs is NULL, the function allocates the string and the caller is
824  * responsible for calling free(*@outs); when finished with it.
825  *
826  * On success the function returns the number of bytes written to the output
827  * string *@outs (>= 0), not counting the terminating NULL byte. If the output
828  * string buffer was allocated, *@outs is set to it.
829  *
830  * On error, -1 is returned, and errno is set to the error code. The following
831  * error codes can be expected:
832  *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
833  *	EILSEQ		The input string cannot be represented as a multibyte
834  *			sequence according to the current locale.
835  *	ENAMETOOLONG	Destination buffer is too small for input string.
836  *	ENOMEM		Not enough memory to allocate destination buffer.
837  */
838 int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
839 		int outs_len)
840 {
841 	char *mbs;
842 	int mbs_len;
843 #ifdef MB_CUR_MAX
844 	wchar_t wc;
845 	int i, o;
846 	int cnt = 0;
847 #ifdef HAVE_MBSINIT
848 	mbstate_t mbstate;
849 #endif
850 #endif /* MB_CUR_MAX */
851 
852 	if (!ins || !outs) {
853 		errno = EINVAL;
854 		return -1;
855 	}
856 	mbs = *outs;
857 	mbs_len = outs_len;
858 	if (mbs && !mbs_len) {
859 		errno = ENAMETOOLONG;
860 		return -1;
861 	}
862 	if (use_utf8)
863 		return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
864 #ifdef MB_CUR_MAX
865 	if (!mbs) {
866 		mbs_len = (ins_len + 1) * MB_CUR_MAX;
867 		mbs = ntfs_malloc(mbs_len);
868 		if (!mbs)
869 			return -1;
870 	}
871 #ifdef HAVE_MBSINIT
872 	memset(&mbstate, 0, sizeof(mbstate));
873 #else
874 	wctomb(NULL, 0);
875 #endif
876 	for (i = o = 0; i < ins_len; i++) {
877 		/* Reallocate memory if necessary or abort. */
878 		if ((int)(o + MB_CUR_MAX) > mbs_len) {
879 			char *tc;
880 			if (mbs == *outs) {
881 				errno = ENAMETOOLONG;
882 				return -1;
883 			}
884 			tc = ntfs_malloc((mbs_len + 64) & ~63);
885 			if (!tc)
886 				goto err_out;
887 			memcpy(tc, mbs, mbs_len);
888 			mbs_len = (mbs_len + 64) & ~63;
889 			free(mbs);
890 			mbs = tc;
891 		}
892 		/* Convert the LE Unicode character to a CPU wide character. */
893 		wc = (wchar_t)le16_to_cpu(ins[i]);
894 		if (!wc)
895 			break;
896 		/* Convert the CPU endian wide character to multibyte. */
897 #ifdef HAVE_MBSINIT
898 		cnt = wcrtomb(mbs + o, wc, &mbstate);
899 #else
900 		cnt = wctomb(mbs + o, wc);
901 #endif
902 		if (cnt == -1)
903 			goto err_out;
904 		if (cnt <= 0) {
905 			ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
906 			errno = EINVAL;
907 			goto err_out;
908 		}
909 		o += cnt;
910 	}
911 #ifdef HAVE_MBSINIT
912 	/* Make sure we are back in the initial state. */
913 	if (!mbsinit(&mbstate)) {
914 		ntfs_log_debug("Eeek. mbstate not in initial state!\n");
915 		errno = EILSEQ;
916 		goto err_out;
917 	}
918 #endif
919 	/* Now write the NULL character. */
920 	mbs[o] = '\0';
921 	if (*outs != mbs)
922 		*outs = mbs;
923 	return o;
924 err_out:
925 	if (mbs != *outs) {
926 		int eo = errno;
927 		free(mbs);
928 		errno = eo;
929 	}
930 #else /* MB_CUR_MAX */
931 	errno = EILSEQ;
932 #endif /* MB_CUR_MAX */
933 	return -1;
934 }
935 
936 /**
937  * ntfs_mbstoucs - convert a multibyte string to a little endian Unicode string
938  * @ins:	input multibyte string buffer
939  * @outs:	on return contains the (allocated) output Unicode string
940  *
941  * Convert the input multibyte string @ins, from the current locale into the
942  * corresponding little endian, 2-byte Unicode string.
943  *
944  * The function allocates the string and the caller is responsible for calling
945  * free(*@outs); when finished with it.
946  *
947  * On success the function returns the number of Unicode characters written to
948  * the output string *@outs (>= 0), not counting the terminating Unicode NULL
949  * character.
950  *
951  * On error, -1 is returned, and errno is set to the error code. The following
952  * error codes can be expected:
953  *	EINVAL		Invalid arguments (e.g. @ins or @outs is NULL).
954  *	EILSEQ		The input string cannot be represented as a Unicode
955  *			string according to the current locale.
956  *	ENAMETOOLONG	Destination buffer is too small for input string.
957  *	ENOMEM		Not enough memory to allocate destination buffer.
958  */
959 int ntfs_mbstoucs(const char *ins, ntfschar **outs)
960 {
961 #ifdef MB_CUR_MAX
962 	ntfschar *ucs;
963 	const char *s;
964 	wchar_t wc;
965 	int i, o, cnt, ins_len, ucs_len, ins_size;
966 #ifdef HAVE_MBSINIT
967 	mbstate_t mbstate;
968 #endif
969 #endif /* MB_CUR_MAX */
970 
971 	if (!ins || !outs) {
972 		errno = EINVAL;
973 		return -1;
974 	}
975 
976 	if (use_utf8)
977 		return ntfs_utf8_to_utf16(ins, outs);
978 
979 #ifdef MB_CUR_MAX
980 	/* Determine the size of the multi-byte string in bytes. */
981 	ins_size = strlen(ins);
982 	/* Determine the length of the multi-byte string. */
983 	s = ins;
984 #if defined(HAVE_MBSINIT)
985 	memset(&mbstate, 0, sizeof(mbstate));
986 	ins_len = mbsrtowcs(NULL, (const char **)&s, 0, &mbstate);
987 #ifdef __CYGWIN32__
988 	if (!ins_len && *ins) {
989 		/* Older Cygwin had broken mbsrtowcs() implementation. */
990 		ins_len = strlen(ins);
991 	}
992 #endif
993 #elif !defined(DJGPP)
994 	ins_len = mbstowcs(NULL, s, 0);
995 #else
996 	/* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
997 	ins_len = strlen(ins);
998 #endif
999 	if (ins_len == -1)
1000 		return ins_len;
1001 #ifdef HAVE_MBSINIT
1002 	if ((s != ins) || !mbsinit(&mbstate)) {
1003 #else
1004 	if (s != ins) {
1005 #endif
1006 		errno = EILSEQ;
1007 		return -1;
1008 	}
1009 	/* Add the NULL terminator. */
1010 	ins_len++;
1011 	ucs_len = ins_len;
1012 	ucs = ntfs_malloc(ucs_len * sizeof(ntfschar));
1013 	if (!ucs)
1014 		return -1;
1015 #ifdef HAVE_MBSINIT
1016 	memset(&mbstate, 0, sizeof(mbstate));
1017 #else
1018 	mbtowc(NULL, NULL, 0);
1019 #endif
1020 	for (i = o = cnt = 0; i < ins_size; i += cnt, o++) {
1021 		/* Reallocate memory if necessary. */
1022 		if (o >= ucs_len) {
1023 			ntfschar *tc;
1024 			ucs_len = (ucs_len * sizeof(ntfschar) + 64) & ~63;
1025 			tc = realloc(ucs, ucs_len);
1026 			if (!tc)
1027 				goto err_out;
1028 			ucs = tc;
1029 			ucs_len /= sizeof(ntfschar);
1030 		}
1031 		/* Convert the multibyte character to a wide character. */
1032 #ifdef HAVE_MBSINIT
1033 		cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
1034 #else
1035 		cnt = mbtowc(&wc, ins + i, ins_size - i);
1036 #endif
1037 		if (!cnt)
1038 			break;
1039 		if (cnt == -1)
1040 			goto err_out;
1041 		if (cnt < -1) {
1042 			ntfs_log_trace("Eeek. cnt = %i\n", cnt);
1043 			errno = EINVAL;
1044 			goto err_out;
1045 		}
1046 		/* Make sure we are not overflowing the NTFS Unicode set. */
1047 		if ((unsigned long)wc >= (unsigned long)(1 <<
1048 				(8 * sizeof(ntfschar)))) {
1049 			errno = EILSEQ;
1050 			goto err_out;
1051 		}
1052 		/* Convert the CPU wide character to a LE Unicode character. */
1053 		ucs[o] = cpu_to_le16(wc);
1054 	}
1055 #ifdef HAVE_MBSINIT
1056 	/* Make sure we are back in the initial state. */
1057 	if (!mbsinit(&mbstate)) {
1058 		ntfs_log_trace("Eeek. mbstate not in initial state!\n");
1059 		errno = EILSEQ;
1060 		goto err_out;
1061 	}
1062 #endif
1063 	/* Now write the NULL character. */
1064 	ucs[o] = cpu_to_le16(L'\0');
1065 	*outs = ucs;
1066 	return o;
1067 err_out:
1068 	free(ucs);
1069 #else /* MB_CUR_MAX */
1070 	errno = EILSEQ;
1071 #endif /* MB_CUR_MAX */
1072 	return -1;
1073 }
1074 
1075 /*
1076  *		Turn a UTF8 name uppercase
1077  *
1078  *	Returns an allocated uppercase name which has to be freed by caller
1079  *	or NULL if there is an error (described by errno)
1080  */
1081 
1082 char *ntfs_uppercase_mbs(const char *low,
1083 			const ntfschar *upcase, u32 upcase_size)
1084 {
1085 	int size;
1086 	char *upp;
1087 	u32 wc;
1088 	int n;
1089 	const char *s;
1090 	char *t;
1091 
1092 	size = strlen(low);
1093 	upp = (char*)ntfs_malloc(3*size + 1);
1094 	if (upp) {
1095 		s = low;
1096 		t = upp;
1097 		do {
1098 			n = utf8_to_unicode(&wc, s);
1099 			if (n > 0) {
1100 				if (wc < upcase_size)
1101 					wc = le16_to_cpu(upcase[wc]);
1102 				if (wc < 0x80)
1103 					*t++ = wc;
1104 				else if (wc < 0x800) {
1105 					*t++ = (0xc0 | ((wc >> 6) & 0x3f));
1106 					*t++ = 0x80 | (wc & 0x3f);
1107 				} else if (wc < 0x10000) {
1108 					*t++ = 0xe0 | (wc >> 12);
1109 					*t++ = 0x80 | ((wc >> 6) & 0x3f);
1110 					*t++ = 0x80 | (wc & 0x3f);
1111 				} else {
1112 					*t++ = 0xf0 | ((wc >> 18) & 7);
1113 					*t++ = 0x80 | ((wc >> 12) & 63);
1114 					*t++ = 0x80 | ((wc >> 6) & 0x3f);
1115 					*t++ = 0x80 | (wc & 0x3f);
1116 				}
1117 			s += n;
1118 			}
1119 		} while (n > 0);
1120 		if (n < 0) {
1121 			free(upp);
1122 			upp = (char*)NULL;
1123 			errno = EILSEQ;
1124 		}
1125 		*t = 0;
1126 	}
1127 	return (upp);
1128 }
1129 
1130 /**
1131  * ntfs_upcase_table_build - build the default upcase table for NTFS
1132  * @uc:		destination buffer where to store the built table
1133  * @uc_len:	size of destination buffer in bytes
1134  *
1135  * ntfs_upcase_table_build() builds the default upcase table for NTFS and
1136  * stores it in the caller supplied buffer @uc of size @uc_len.
1137  *
1138  * Note, @uc_len must be at least 128kiB in size or bad things will happen!
1139  */
1140 void ntfs_upcase_table_build(ntfschar *uc, u32 uc_len)
1141 {
1142 	static int uc_run_table[][3] = { /* Start, End, Add */
1143 	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
1144 	{0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
1145 	{0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
1146 	{0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
1147 	{0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
1148 	{0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
1149 	{0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
1150 	{0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
1151 	{0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
1152 	{0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
1153 	{0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
1154 	{0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
1155 	{0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
1156 	{0}
1157 	};
1158 	static int uc_dup_table[][2] = { /* Start, End */
1159 	{0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
1160 	{0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
1161 	{0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
1162 	{0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
1163 	{0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
1164 	{0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
1165 	{0}
1166 	};
1167 	static int uc_byte_table[][2] = { /* Offset, Value */
1168 	{0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
1169 	{0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
1170 	{0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
1171 	{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
1172 	{0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
1173 	{0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
1174 	{0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
1175 	{0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
1176 	{0}
1177 	};
1178 	int i, r;
1179 	int k, off;
1180 
1181 	memset((char*)uc, 0, uc_len);
1182 	uc_len >>= 1;
1183 	if (uc_len > 65536)
1184 		uc_len = 65536;
1185 	for (i = 0; (u32)i < uc_len; i++)
1186 		uc[i] = cpu_to_le16(i);
1187 	for (r = 0; uc_run_table[r][0]; r++) {
1188 		off = uc_run_table[r][2];
1189 		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
1190 			uc[i] = cpu_to_le16(i + off);
1191 	}
1192 	for (r = 0; uc_dup_table[r][0]; r++)
1193 		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
1194 			uc[i + 1] = cpu_to_le16(i);
1195 	for (r = 0; uc_byte_table[r][0]; r++) {
1196 		k = uc_byte_table[r][1];
1197 		uc[uc_byte_table[r][0]] = cpu_to_le16(k);
1198 	}
1199 }
1200 
1201 /*
1202  *		Build a table for converting to lower case
1203  *
1204  *	This is only meaningful when there is a single lower case
1205  *	character leading to an upper case one, and currently the
1206  *	only exception is the greek letter sigma which has a single
1207  *	upper case glyph (code U+03A3), but two lower case glyphs
1208  *	(code U+03C3 and U+03C2, the latter to be used at the end
1209  *	of a word). In the following implementation the upper case
1210  *	sigma will be lowercased as U+03C3.
1211  */
1212 
1213 ntfschar *ntfs_locase_table_build(const ntfschar *uc, u32 uc_cnt)
1214 {
1215 	ntfschar *lc;
1216 	u32 upp;
1217 	u32 i;
1218 
1219 	lc = (ntfschar*)ntfs_malloc(uc_cnt*sizeof(ntfschar));
1220 	if (lc) {
1221 		for (i=0; i<uc_cnt; i++)
1222 			lc[i] = cpu_to_le16(i);
1223 		for (i=0; i<uc_cnt; i++) {
1224 			upp = le16_to_cpu(uc[i]);
1225 			if ((upp != i) && (upp < uc_cnt))
1226 				lc[upp] = cpu_to_le16(i);
1227 		}
1228 	} else
1229 		ntfs_log_error("Could not build the locase table\n");
1230 	return (lc);
1231 }
1232 
1233 /**
1234  * ntfs_str2ucs - convert a string to a valid NTFS file name
1235  * @s:		input string
1236  * @len:	length of output buffer in Unicode characters
1237  *
1238  * Convert the input @s string into the corresponding little endian,
1239  * 2-byte Unicode string. The length of the converted string is less
1240  * or equal to the maximum length allowed by the NTFS format (255).
1241  *
1242  * If @s is NULL then return AT_UNNAMED.
1243  *
1244  * On success the function returns the Unicode string in an allocated
1245  * buffer and the caller is responsible to free it when it's not needed
1246  * anymore.
1247  *
1248  * On error NULL is returned and errno is set to the error code.
1249  */
1250 ntfschar *ntfs_str2ucs(const char *s, int *len)
1251 {
1252 	ntfschar *ucs = NULL;
1253 
1254 	if (s && ((*len = ntfs_mbstoucs(s, &ucs)) == -1)) {
1255 		ntfs_log_perror("Couldn't convert '%s' to Unicode", s);
1256 		return NULL;
1257 	}
1258 	if (*len > NTFS_MAX_NAME_LEN) {
1259 		free(ucs);
1260 		errno = ENAMETOOLONG;
1261 		return NULL;
1262 	}
1263 	if (!ucs || !*len) {
1264 		ucs  = AT_UNNAMED;
1265 		*len = 0;
1266 	}
1267 	return ucs;
1268 }
1269 
1270 /**
1271  * ntfs_ucsfree - free memory allocated by ntfs_str2ucs()
1272  * @ucs		input string to be freed
1273  *
1274  * Free memory at @ucs and which was allocated by ntfs_str2ucs.
1275  *
1276  * Return value: none.
1277  */
1278 void ntfs_ucsfree(ntfschar *ucs)
1279 {
1280 	if (ucs && (ucs != AT_UNNAMED))
1281 		free(ucs);
1282 }
1283 
1284 /*
1285  *		Check whether a name contains no chars forbidden
1286  *	for DOS or Win32 use
1287  *
1288  *	If there is a bad char, errno is set to EINVAL
1289  */
1290 
1291 BOOL ntfs_forbidden_chars(const ntfschar *name, int len)
1292 {
1293 	BOOL forbidden;
1294 	int ch;
1295 	int i;
1296 	u32 mainset =     (1L << ('\"' - 0x20))
1297 			| (1L << ('*' - 0x20))
1298 			| (1L << ('/' - 0x20))
1299 			| (1L << (':' - 0x20))
1300 			| (1L << ('<' - 0x20))
1301 			| (1L << ('>' - 0x20))
1302 			| (1L << ('?' - 0x20));
1303 
1304 	forbidden = (len == 0)
1305 			|| (le16_to_cpu(name[len-1]) == ' ')
1306 			|| (le16_to_cpu(name[len-1]) == '.');
1307 	for (i=0; i<len; i++) {
1308 		ch = le16_to_cpu(name[i]);
1309 		if ((ch < 0x20)
1310 		    || ((ch < 0x40)
1311 			&& ((1L << (ch - 0x20)) & mainset))
1312 		    || (ch == '\\')
1313 		    || (ch == '|'))
1314 			forbidden = TRUE;
1315 	}
1316 	if (forbidden)
1317 		errno = EINVAL;
1318 	return (forbidden);
1319 }
1320 
1321 /*
1322  *		Check whether the same name can be used as a DOS and
1323  *	a Win32 name
1324  *
1325  *	The names must be the same, or the short name the uppercase
1326  *	variant of the long name
1327  */
1328 
1329 BOOL ntfs_collapsible_chars(ntfs_volume *vol,
1330 			const ntfschar *shortname, int shortlen,
1331 			const ntfschar *longname, int longlen)
1332 {
1333 	BOOL collapsible;
1334 	unsigned int ch;
1335 	int i;
1336 
1337 	collapsible = shortlen == longlen;
1338 	if (collapsible)
1339 		for (i=0; i<shortlen; i++) {
1340 			ch = le16_to_cpu(longname[i]);
1341 			if ((ch >= vol->upcase_len)
1342 		   	 || ((shortname[i] != longname[i])
1343 				&& (shortname[i] != vol->upcase[ch])))
1344 					collapsible = FALSE;
1345 	}
1346 	return (collapsible);
1347 }
1348 
1349 /*
1350  * Define the character encoding to be used.
1351  * Use UTF-8 unless specified otherwise.
1352  */
1353 
1354 int ntfs_set_char_encoding(const char *locale)
1355 {
1356 	use_utf8 = 0;
1357 	if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
1358 	    || strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
1359 		use_utf8 = 1;
1360 	else
1361 		if (setlocale(LC_ALL, locale))
1362 			use_utf8 = 0;
1363 		else {
1364 			ntfs_log_error("Invalid locale, encoding to UTF-8\n");
1365 			use_utf8 = 1;
1366 	 	}
1367 	return 0; /* always successful */
1368 }
1369 
1370 #if defined(__APPLE__) || defined(__DARWIN__)
1371 
/*
 * Enable (1) or disable (0) the normalization of UTF-8 file names.
 *
 * Returns 0 when the setting was recorded, or -1 if @normalize is
 * neither 0 nor 1 or if the build has no ENABLE_NFCONV support.
 */
int ntfs_macosx_normalize_filenames(int normalize) {
#ifdef ENABLE_NFCONV
	/* Only the two boolean values are accepted. */
	if ((normalize != 0) && (normalize != 1))
		return -1;
	nfconvert_utf8 = normalize;
	return 0;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1384 
/*
 * Normalize a UTF-8 string through CoreFoundation.
 *
 * @utf8_string is normalized to form C when @composed is non-zero and
 * to form D otherwise. On success the result is stored in a newly
 * allocated buffer returned through @target, and the number of UTF-8
 * bytes in it is returned. The buffer comes from ntfs_calloc() with one
 * extra byte, presumably left zeroed as the string terminator, and the
 * caller presumably has to free() it.
 *
 * Returns -2 if the CFString could not be created, -3 if the mutable
 * copy could not be created, and -1 on any other failure or when the
 * build has no ENABLE_NFCONV support. @target is left untouched on
 * failure.
 */
int ntfs_macosx_normalize_utf8(const char *utf8_string, char **target,
 int composed) {
#ifdef ENABLE_NFCONV
	/* For this code to compile, the CoreFoundation framework must be fed to the linker. */
	CFStringRef source;
	CFMutableStringRef work;
	CFRange whole;
	CFIndex needed;
	char *buf = NULL;
	int buf_size = -1;

	/* Wrap the UTF-8 input into a CFString. */
	source = CFStringCreateWithCString(kCFAllocatorDefault, utf8_string, kCFStringEncodingUTF8);
	if (source == NULL) {
		ntfs_log_error("CFStringCreateWithCString failed!\n");
		return -2;
	}

	/* Normalization is done in place, so a mutable copy is needed. */
	work = CFStringCreateMutableCopy(kCFAllocatorDefault, 0, source);
	CFRelease(source); /* Not needed beyond this point. */
	if (work == NULL) {
		ntfs_log_error("CFStringCreateMutableCopy failed!\n");
		return -3;
	}

	if (composed != 0)
		CFStringNormalize(work, kCFStringNormalizationFormC);
	else
		CFStringNormalize(work, kCFStringNormalizationFormD);

	/* First pass with no buffer : only get the required length. */
	whole = CFRangeMake(0, CFStringGetLength(work));
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8, 0, false, NULL, 0, &needed) <= 0) {
		ntfs_log_error("Could not perform check for required length of UTF-8 conversion of normalized CFMutableString.\n");
		goto release;
	}

	buf_size = sizeof(char)*(needed + 1);
	buf = ntfs_calloc(buf_size);
	if (buf == NULL) {
		ntfs_log_error("Could not perform a ntfs_calloc of %d bytes for char *result.\n", buf_size);
		goto release;
	}

	/* Second pass : actually store the bytes, keeping the last one free. */
	if (CFStringGetBytes(work, whole, kCFStringEncodingUTF8,
			     0, false, (UInt8*)buf, buf_size-1, &needed) <= 0) {
		ntfs_log_error("Could not perform UTF-8 conversion of normalized CFMutableString.\n");
		free(buf);
		buf = NULL;
	}

release:
	CFRelease(work);

	if (buf == NULL)
		return -1;
	*target = buf;
	return buf_size - 1;
#else
	return -1;
#endif /* ENABLE_NFCONV */
}
1447 #endif /* defined(__APPLE__) || defined(__DARWIN__) */
1448