xref: /haiku/src/add-ons/kernel/file_systems/udf/UdfString.cpp (revision 2b76973fa2401f7a5edf68e6470f3d3210cbcff3)
1 #include "UdfString.h"
2 
3 #include <ByteOrder.h>
4 
5 #include <AutoDeleter.h>
6 
7 
/*! \brief Encodes a single unicode code point as utf8.

	\param c The unicode code point to encode.
	\param out Address of a write cursor into a character buffer with
	           room for at least 4 more bytes. The encoded bytes are
	           stored at \c *out and the cursor is advanced past them,
	           so that consecutive calls append to the same buffer.
	           No terminating NUL is written.

	Code points above 0x10ffff produce no output and leave \c *out
	where it was.
*/
static void
unicode_to_utf8(uint32 c, char **out)
{
	char *cursor = *out;

	// Pick the lead byte marker and the number of continuation bytes.
	uint8 leadMarker;
	int32 trailing;
	if (c < 0x80) {
		leadMarker = 0x00;
		trailing = 0;
	} else if (c < 0x800) {
		leadMarker = 0xc0;
		trailing = 1;
	} else if (c < 0x10000) {
		leadMarker = 0xe0;
		trailing = 2;
	} else if (c <= 0x10ffff) {
		leadMarker = 0xf0;
		trailing = 3;
	} else {
		// Not encodable: emit nothing.
		*out = cursor;
		return;
	}

	// The lead byte carries the topmost payload bits...
	*cursor++ = leadMarker | (c >> (6 * trailing));

	// ...and every continuation byte the next lower 6 bits.
	for (int32 shift = 6 * (trailing - 1); shift >= 0; shift -= 6)
		*cursor++ = 0x80 | ((c >> shift) & 0x3f);

	*out = cursor;
}
43 
/*! \brief Decodes one utf8 sequence into a 4-byte unicode character.

	\param in Address of a read cursor into a NUL terminated utf8
	          string. On return the cursor has been advanced past the
	          bytes that were consumed, similarly to the \c out
	          parameter of unicode_to_utf8().

	\return The decoded unicode character. If the byte at the cursor
	        does not start a well-formed sequence (stray continuation
	        byte, lead byte 0xf8..0xff, or a lead byte that is not
	        followed by enough continuation bytes), that single byte is
	        returned as-is and the cursor advances by exactly one byte.
	        Returns 0 without touching the cursor if \a in or \c *in is
	        NULL.

	Only the structure of the sequence is checked; overlong encodings
	and surrogate code points are not rejected.
*/
static uint32
utf8_to_unicode(const char **in)
{
	if (in == NULL || *in == NULL)
		return 0;

	const uint8 *bytes = reinterpret_cast<const uint8 *>(*in);
	const uint8 lead = bytes[0];

	// Classify the lead byte: sequence length and its payload bits.
	int32 length;
	uint32 c;
	if ((lead & 0xe0) == 0xc0) {
		length = 2;
		c = lead & 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		length = 3;
		c = lead & 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		length = 4;
		c = lead & 0x07;
	} else {
		// Valid 1-byte character, or an invalid byte (continuation byte
		// without a lead, or 0xf8..0xff which never starts a sequence).
		(*in)++;
		return lead;
	}

	// A continuation byte must match 10xxxxxx. Testing only the top bit
	// would wrongly accept another lead byte here. The NUL terminator
	// fails the test too, so we never read beyond the end of the string.
	int32 i = 1;
	for (; i < length && (bytes[i] & 0xc0) == 0x80; i++)
		c = (c << 6) | (bytes[i] & 0x3f);

	if (i < length) {
		// Truncated or malformed sequence: consume the lead byte only, so
		// the caller resynchronizes on the very next byte.
		(*in)++;
		return lead;
	}

	*in += length;
	return c;
}
93 
94 
95 // #pragma mark -
96 
97 
98 /*! \brief Creates an empty string object. */
99 UdfString::UdfString()
100 	:
101 	fCs0String(NULL),
102 	fUtf8String(NULL)
103 {
104 }
105 
106 
107 /*! \brief Creates a new UdfString object from the given Utf8 string. */
108 UdfString::UdfString(const char *utf8)
109 	:
110 	fCs0String(NULL),
111 	fUtf8String(NULL)
112 {
113 	SetTo(utf8);
114 }
115 
116 
117 /*! \brief Creates a new UdfString object from the given Cs0 string. */
118 UdfString::UdfString(const char *cs0, uint32 length)
119 	:
120 	fCs0String(NULL),
121 	fUtf8String(NULL)
122 {
123 	SetTo(cs0, length);
124 }
125 
126 
127 UdfString::~UdfString()
128 {
129 	_Clear();
130 }
131 
132 
133 /*! \brief Assignment from a Utf8 string. */
134 void
135 UdfString::SetTo(const char *utf8)
136 {
137 	TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n",
138 		utf8, utf8 ? strlen(utf8) : 0));
139 	_Clear();
140 
141 	if (utf8 == NULL) {
142 		TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n"));
143 		return;
144 	}
145 
146 	uint32 length = strlen(utf8);
147 	// First copy the utf8 string
148 	fUtf8String = new(nothrow) char[length + 1];
149 	if (fUtf8String == NULL) {
150 		TRACE_ERROR(("UdfString::SetTo: fUtf8String[%" B_PRIu32
151 			"] allocation failed\n", length + 1));
152 		return;
153 	}
154 
155 	memcpy(fUtf8String, utf8, length + 1);
156 	// Next convert to raw 4-byte unicode. Then we'll do some
157 	// analysis to figure out if we have any invalid characters,
158 	// and whether we can get away with compressed 8-bit unicode,
159 	// or have to use burly 16-bit unicode.
160 	uint32 *raw = new(nothrow) uint32[length];
161 	if (raw == NULL) {
162 		TRACE_ERROR(("UdfString::SetTo: uint32 raw[%" B_PRIu32 "] temporary"
163 			" string allocation failed\n", length));
164 		_Clear();
165 		return;
166 	}
167 
168 	ArrayDeleter<uint32> rawDeleter(raw);
169 
170 	const char *in = utf8;
171 	uint32 rawLength = 0;
172 	for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++)
173 		raw[i] = utf8_to_unicode(&in);
174 
175 	// Check for invalids.
176 	uint32 mask = 0xffff0000;
177 	for (uint32 i = 0; i < rawLength; i++) {
178 		if (raw[i] & mask) {
179 			TRACE(("WARNING: utf8 string contained a multi-byte sequence which "
180 			       "was converted into a unicode character larger than 16-bits; "
181 			       "character will be converted to an underscore character for "
182 			       "safety.\n"));
183 			raw[i] = '_';
184 		}
185 	}
186 	// See if we can get away with 8-bit compressed unicode
187 	mask = 0xffffff00;
188 	bool canUse8bit = true;
189 	for (uint32 i = 0; i < rawLength; i++) {
190 		if (raw[i] & mask) {
191 			canUse8bit = false;
192 			break;
193 		}
194 	}
195 	// Build our cs0 string
196 	if (canUse8bit) {
197 		fCs0Length = rawLength + 1;
198 		fCs0String = new(nothrow) char[fCs0Length];
199 		if (fCs0String != NULL) {
200 			fCs0String[0] = '\x08';	// 8-bit compressed unicode
201 			for (uint32 i = 0; i < rawLength; i++)
202 				fCs0String[i + 1] = raw[i] % 256;
203 		} else {
204 			TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
205 				"] allocation failed\n", fCs0Length));
206 			_Clear();
207 			return;
208 		}
209 	} else {
210 		fCs0Length = rawLength * 2 + 1;
211 		fCs0String = new(nothrow) char[fCs0Length];
212 		if (fCs0String != NULL) {
213 			uint32 pos = 0;
214 			fCs0String[pos++] = '\x10';	// 16-bit unicode
215 			for (uint32 i = 0; i < rawLength; i++) {
216 				// 16-bit unicode chars must be written big endian
217 				uint16 value = uint16(raw[i]);
218 				uint8 high = uint8(value >> 8 & 0xff);
219 				uint8 low = uint8(value & 0xff);
220 				fCs0String[pos++] = high;
221 				fCs0String[pos++] = low;
222 			}
223 		} else {
224 			TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
225 				"] allocation failed\n", fCs0Length));
226 			_Clear();
227 			return;
228 		}
229 	}
230 }
231 
232 
233 /*! \brief Assignment from a Cs0 string. */
234 void
235 UdfString::SetTo(const char *cs0, uint32 length)
236 {
237 	DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length));
238 
239 	_Clear();
240 	if (length == 0)
241 		return;
242 	if (!cs0) {
243 		PRINT(("passed NULL cs0 string\n"));
244 		return;
245 	}
246 
247 	// First copy the Cs0 string and length
248 	fCs0String = new(nothrow) char[length];
249 	if (fCs0String) {
250 		memcpy(fCs0String, cs0, length);
251 		fCs0Length = length;
252 	} else {
253 		PRINT(("new fCs0String[%ld] allocation failed\n", length));
254 		return;
255 	}
256 
257 	// Now convert to utf8
258 
259 	// The first byte of the CS0 string is the compression ID.
260 	// - 8: 1 byte characters
261 	// - 16: 2 byte, big endian characters
262 	// - 254: "CS0 expansion is empty and unique", 1 byte characters
263 	// - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
264 	PRINT(("compression ID: %d\n", cs0[0]));
265 	switch (reinterpret_cast<const uint8*>(cs0)[0]) {
266 		case 8:
267 		case 254:
268 		{
269 			const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
270 			int32 maxLength = length-1;				// Max length of input string in uint8 characters
271 			int32 allocationLength = maxLength*2+1;	// Need at most 2 utf8 chars per uint8 char
272 			fUtf8String = new(nothrow) char[allocationLength];
273 			if (fUtf8String) {
274 				char *outputString = fUtf8String;
275 
276 				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
277 					unicode_to_utf8(inputString[i], &outputString);
278 				}
279 				outputString[0] = 0;
280 			} else {
281 				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
282 			}
283 
284 			break;
285 		}
286 
287 		case 16:
288 		case 255:
289 		{
290 			const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
291 			int32 maxLength = (length-1) / 2;		// Max length of input string in uint16 characters
292 			int32 allocationLength = maxLength*3+1;	// Need at most 3 utf8 chars per uint16 char
293 			fUtf8String = new(nothrow) char[allocationLength];
294 			if (fUtf8String) {
295 				char *outputString = fUtf8String;
296 
297 				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
298 					unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
299 				}
300 				outputString[0] = 0;
301 			} else {
302 				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
303 			}
304 
305 			break;
306 		}
307 
308 		default:
309 			PRINT(("invalid compression id!\n"));
310 			break;
311 	}
312 }
313 
314 void
315 UdfString::_Clear()
316 {
317 	DEBUG_INIT("UdfString");
318 
319 	delete [] fCs0String;
320 	fCs0String = NULL;
321 	delete [] fUtf8String;
322 	fUtf8String = NULL;
323 }
324