xref: /haiku/src/system/kernel/convertutf.cpp (revision cbe0a0c436162d78cc3f92a305b64918c839d079)
1 /*
2  * Copyright 2014 Jonathan Schleifer <js@webkeks.org>
3  * Copyright 2014 Haiku, Inc. All rights reserved.
4  *
5  * Distributed under the terms of the MIT License.
6  *
7  * Authors:
8  *		Jonathan Schleifer, js@webkeks.org
9  *		John Scipione, jscipione@gmail.com
10  */
11 
12 
13 #include "convertutf.h"
14 
15 
16 #include <ByteOrder.h>
17 #include <Errors.h>
18 #include <StorageDefs.h>
19 
20 
/*!	Returns the number of bytes the UTF-8 encoding of \a glyph occupies.

	\param glyph The code point to measure.
	\return 1 to 4 for code points below 0x110000, or 0 if \a glyph is
		0x110000 or greater and thus cannot be encoded.
*/
static inline size_t
glyph_length(uint32 glyph)
{
	// Check from the largest range downwards; the first match wins.
	if (glyph >= 0x110000)
		return 0;
	if (glyph >= 0x10000)
		return 4;
	if (glyph >= 0x800)
		return 3;

	return glyph >= 0x80 ? 2 : 1;
}
35 
36 
/*!	Stores the UTF-8 encoding of \a glyph into \a buffer.

	Exactly \a glyphLength bytes are written; no terminating NUL is appended.

	\param glyph The code point to encode.
	\param glyphLength The number of bytes to emit (1 to 4). Any other value
		leaves \a buffer untouched.
	\param buffer Destination with room for at least \a glyphLength bytes.
*/
static void
encode_glyph(uint32 glyph, size_t glyphLength, char* buffer)
{
	switch (glyphLength) {
		case 1:
			// single byte, stored as is
			buffer[0] = glyph;
			break;

		case 2:
			// lead byte 110xxxxx followed by one continuation byte
			buffer[0] = 0xC0 | (glyph >> 6);
			buffer[1] = 0x80 | (glyph & 0x3F);
			break;

		case 3:
			// lead byte 1110xxxx followed by two continuation bytes
			buffer[0] = 0xE0 | (glyph >> 12);
			buffer[1] = 0x80 | (glyph >> 6 & 0x3F);
			buffer[2] = 0x80 | (glyph & 0x3F);
			break;

		case 4:
			// lead byte 11110xxx followed by three continuation bytes
			buffer[0] = 0xF0 | (glyph >> 18);
			buffer[1] = 0x80 | (glyph >> 12 & 0x3F);
			buffer[2] = 0x80 | (glyph >> 6 & 0x3F);
			buffer[3] = 0x80 | (glyph & 0x3F);
			break;

		default:
			// unsupported length, nothing is written
			break;
	}
}
56 
57 
58 static ssize_t
59 utf16_to_utf8(const uint16* source, size_t sourceCodeUnitCount, char* target,
60 	size_t targetLength, bool isLittleEndian)
61 {
62 	if (source == NULL || sourceCodeUnitCount == 0
63 		|| target == NULL || targetLength == 0) {
64 		return B_BAD_VALUE;
65 	}
66 
67 	ssize_t outLength = 0;
68 
69 	for (size_t i = 0; i < sourceCodeUnitCount; i++) {
70 		uint32 glyph = isLittleEndian
71 			? B_LENDIAN_TO_HOST_INT32(source[i])
72 			: B_BENDIAN_TO_HOST_INT32(source[i]);
73 
74 		if ((glyph & 0xFC00) == 0xDC00) {
75 			// missing high surrogate
76 			return B_BAD_VALUE;
77 		}
78 
79 		if ((glyph & 0xFC00) == 0xD800) {
80 			if (sourceCodeUnitCount <= i + 1) {
81 				// high surrogate at end of string
82 				return B_BAD_VALUE;
83 			}
84 
85 			uint32 low = isLittleEndian
86 				? B_LENDIAN_TO_HOST_INT32(source[i + 1])
87 				: B_BENDIAN_TO_HOST_INT32(source[i + 1]);
88 			if ((low & 0xFC00) != 0xDC00) {
89 				// missing low surrogate
90 				return B_BAD_VALUE;
91 			}
92 
93 			glyph = (((glyph & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
94 			i++;
95 		}
96 
97 		size_t glyphLength = glyph_length(glyph);
98 		if (glyphLength == 0)
99 			return B_BAD_VALUE;
100 		else if (outLength + glyphLength >= targetLength
101 			|| outLength + glyphLength >= B_FILE_NAME_LENGTH) {
102 			// NUL terminate the string so the caller can use the
103 			// abbreviated version in this case. Since the length
104 			// isn't returned the caller will need to call strlen()
105 			// to get the length of the string.
106 			target[outLength] = '\0';
107 			return B_NAME_TOO_LONG;
108 		}
109 
110 		encode_glyph(glyph, glyphLength, target + outLength);
111 		outLength += glyphLength;
112 	}
113 
114 	target[outLength] = '\0';
115 
116 	return outLength;
117 }
118 
119 
120 ssize_t
121 utf16le_to_utf8(const uint16* source, size_t sourceCodeUnitCount,
122 	char* target, size_t targetLength)
123 {
124 	return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength,
125 		true);
126 }
127 
128 
129 ssize_t
130 utf16be_to_utf8(const uint16* source, size_t sourceCodeUnitCount,
131 	char* target, size_t targetLength)
132 {
133 	return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength,
134 		false);
135 }
136