1 /*
2 * Copyright 2014 Jonathan Schleifer <js@webkeks.org>
3 * Copyright 2014 Haiku, Inc. All rights reserved.
4 *
5 * Distributed under the terms of the MIT License.
6 *
7 * Authors:
8 * Jonathan Schleifer, js@webkeks.org
9 * John Scipione, jscipione@gmail.com
10 */
11
12
13 #include "convertutf.h"
14
15
16 #include <ByteOrder.h>
17 #include <Errors.h>
18 #include <StorageDefs.h>
19
20
21 static inline size_t
glyph_length(uint32 glyph)22 glyph_length(uint32 glyph)
23 {
24 if (glyph < 0x80)
25 return 1;
26 else if (glyph < 0x800)
27 return 2;
28 else if (glyph < 0x10000)
29 return 3;
30 else if (glyph < 0x110000)
31 return 4;
32
33 return 0;
34 }
35
36
37 static void
encode_glyph(uint32 glyph,size_t glyphLength,char * buffer)38 encode_glyph(uint32 glyph, size_t glyphLength, char* buffer)
39 {
40 if (glyphLength == 1) {
41 *buffer = glyph;
42 } else if (glyphLength == 2) {
43 *buffer++ = 0xC0 | (glyph >> 6);
44 *buffer = 0x80 | (glyph & 0x3F);
45 } else if (glyphLength == 3) {
46 *buffer++ = 0xE0 | (glyph >> 12);
47 *buffer++ = 0x80 | (glyph >> 6 & 0x3F);
48 *buffer = 0x80 | (glyph & 0x3F);
49 } else if (glyphLength == 4) {
50 *buffer++ = 0xF0 | (glyph >> 18);
51 *buffer++ = 0x80 | (glyph >> 12 & 0x3F);
52 *buffer++ = 0x80 | (glyph >> 6 & 0x3F);
53 *buffer = 0x80 | (glyph & 0x3F);
54 }
55 }
56
57
58 static ssize_t
utf16_to_utf8(const uint16 * source,size_t sourceCodeUnitCount,char * target,size_t targetLength,bool isLittleEndian)59 utf16_to_utf8(const uint16* source, size_t sourceCodeUnitCount, char* target,
60 size_t targetLength, bool isLittleEndian)
61 {
62 if (source == NULL || sourceCodeUnitCount == 0
63 || target == NULL || targetLength == 0) {
64 return B_BAD_VALUE;
65 }
66
67 ssize_t outLength = 0;
68
69 for (size_t i = 0; i < sourceCodeUnitCount; i++) {
70 uint32 glyph = isLittleEndian
71 ? B_LENDIAN_TO_HOST_INT32(source[i])
72 : B_BENDIAN_TO_HOST_INT32(source[i]);
73
74 if ((glyph & 0xFC00) == 0xDC00) {
75 // missing high surrogate
76 return B_BAD_VALUE;
77 }
78
79 if ((glyph & 0xFC00) == 0xD800) {
80 if (sourceCodeUnitCount <= i + 1) {
81 // high surrogate at end of string
82 return B_BAD_VALUE;
83 }
84
85 uint32 low = isLittleEndian
86 ? B_LENDIAN_TO_HOST_INT32(source[i + 1])
87 : B_BENDIAN_TO_HOST_INT32(source[i + 1]);
88 if ((low & 0xFC00) != 0xDC00) {
89 // missing low surrogate
90 return B_BAD_VALUE;
91 }
92
93 glyph = (((glyph & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
94 i++;
95 }
96
97 size_t glyphLength = glyph_length(glyph);
98 if (glyphLength == 0)
99 return B_BAD_VALUE;
100 else if (outLength + glyphLength >= targetLength
101 || outLength + glyphLength >= B_FILE_NAME_LENGTH) {
102 // NUL terminate the string so the caller can use the
103 // abbreviated version in this case. Since the length
104 // isn't returned the caller will need to call strlen()
105 // to get the length of the string.
106 target[outLength] = '\0';
107 return B_NAME_TOO_LONG;
108 }
109
110 encode_glyph(glyph, glyphLength, target + outLength);
111 outLength += glyphLength;
112 }
113
114 target[outLength] = '\0';
115
116 return outLength;
117 }
118
119
120 ssize_t
utf16le_to_utf8(const uint16 * source,size_t sourceCodeUnitCount,char * target,size_t targetLength)121 utf16le_to_utf8(const uint16* source, size_t sourceCodeUnitCount,
122 char* target, size_t targetLength)
123 {
124 return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength,
125 true);
126 }
127
128
129 ssize_t
utf16be_to_utf8(const uint16 * source,size_t sourceCodeUnitCount,char * target,size_t targetLength)130 utf16be_to_utf8(const uint16* source, size_t sourceCodeUnitCount,
131 char* target, size_t targetLength)
132 {
133 return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength,
134 false);
135 }
136