xref: /haiku/src/add-ons/kernel/file_systems/udf/UdfString.cpp (revision fce4895d1884da5ae6fb299d23c735c598e690b1)
1 #include "UdfString.h"
2 
3 #include <ByteOrder.h>
4 
5 #include <AutoDeleter.h>
6 
7 
8 using std::nothrow;
9 
10 
/*! \brief Encodes a single unicode code point as utf8.

	\param c The unicode code point to encode.
	\param out Address of a write cursor into a character buffer that has
	           room for at least 4 more bytes. The encoded bytes are
	           stored at the cursor, and the cursor is advanced past them,
	           so that consecutive calls append to the same buffer. No
	           terminating NUL is written. Code points above 0x10ffff
	           produce no output and leave the cursor where it was.
*/
static void
unicode_to_utf8(uint32 c, char **out)
{
	char *cursor = *out;

	// Number of continuation bytes that follow the lead byte, and the
	// marker bits identifying the lead byte of a sequence of that size.
	int trailing;
	uint32 leadBits;

	if (c <= 0x7f) {
		trailing = 0;
		leadBits = 0x00;
	} else if (c <= 0x7ff) {
		trailing = 1;
		leadBits = 0xc0;
	} else if (c <= 0xffff) {
		trailing = 2;
		leadBits = 0xe0;
	} else if (c <= 0x10ffff) {
		trailing = 3;
		leadBits = 0xf0;
	} else {
		// Outside the unicode range: emit nothing.
		return;
	}

	// The lead byte carries the most significant payload bits ...
	*cursor++ = char(leadBits | (c >> (6 * trailing)));

	// ... and each continuation byte the next lower group of 6 bits.
	for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6)
		*cursor++ = char(0x80 | ((c >> shift) & 0x3f));

	*out = cursor;
}
46 
/*! \brief Converts the given utf8 character to 4-byte unicode.

	The lead byte determines the length of the sequence (2 to 4 bytes);
	every following byte must be a continuation byte of the form
	10xxxxxx. Overlong encodings and surrogate values are not rejected.

	\param in Pointer to a C-String from which utf8 characters
	          will be read. *in will be incremented to reflect
	          the number of characters read, similarly to the
	          \c out parameter for unicode_to_utf8(). For an invalid
	          or truncated sequence *in is advanced by one byte only.

	\return The 4-byte unicode character, or **in if passed an
	        invalid character, or 0 if passed any NULL pointers.
*/
static uint32
utf8_to_unicode(const char **in)
{
	if (in == NULL || *in == NULL)
		return 0;

	const uint8 *bytes = reinterpret_cast<const uint8 *>(*in);
	const uint8 lead = bytes[0];

	// Classify the lead byte. The masks check the complete marker so that
	// bytes which can never start a sequence (0xf8..0xff) are not mistaken
	// for the lead of a 4-byte sequence.
	int32 length;
	uint8 payloadMask;
	if ((lead & 0xe0) == 0xc0) {
		length = 2;
		payloadMask = 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		length = 3;
		payloadMask = 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		length = 4;
		payloadMask = 0x07;
	} else {
		// valid 1-byte character
		// and invalid characters
		(*in)++;
		return lead;
	}

	uint32 c = lead & payloadMask;
	for (int32 i = 1; i < length; i++) {
		// Only 10xxxxxx is a continuation byte. This also stops at the
		// terminating NUL, so a truncated sequence at the end of the
		// string is never read past.
		if ((bytes[i] & 0xc0) != 0x80) {
			// invalid character
			(*in)++;
			return lead;
		}
		c = (c << 6) | (bytes[i] & 0x3f);
	}

	*in += length;
	return c;
}
96 
97 
98 // #pragma mark -
99 
100 
101 /*! \brief Creates an empty string object. */
102 UdfString::UdfString()
103 	:
104 	fCs0String(NULL),
105 	fUtf8String(NULL)
106 {
107 }
108 
109 
110 /*! \brief Creates a new UdfString object from the given Utf8 string. */
111 UdfString::UdfString(const char *utf8)
112 	:
113 	fCs0String(NULL),
114 	fUtf8String(NULL)
115 {
116 	SetTo(utf8);
117 }
118 
119 
120 /*! \brief Creates a new UdfString object from the given Cs0 string. */
121 UdfString::UdfString(const char *cs0, uint32 length)
122 	:
123 	fCs0String(NULL),
124 	fUtf8String(NULL)
125 {
126 	SetTo(cs0, length);
127 }
128 
129 
130 UdfString::~UdfString()
131 {
132 	_Clear();
133 }
134 
135 
136 /*! \brief Assignment from a Utf8 string. */
137 void
138 UdfString::SetTo(const char *utf8)
139 {
140 	TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n",
141 		utf8, utf8 ? strlen(utf8) : 0));
142 	_Clear();
143 
144 	if (utf8 == NULL) {
145 		TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n"));
146 		return;
147 	}
148 
149 	uint32 length = strlen(utf8);
150 	// First copy the utf8 string
151 	fUtf8String = new(nothrow) char[length + 1];
152 	if (fUtf8String == NULL) {
153 		TRACE_ERROR(("UdfString::SetTo: fUtf8String[%" B_PRIu32
154 			"] allocation failed\n", length + 1));
155 		return;
156 	}
157 
158 	memcpy(fUtf8String, utf8, length + 1);
159 	// Next convert to raw 4-byte unicode. Then we'll do some
160 	// analysis to figure out if we have any invalid characters,
161 	// and whether we can get away with compressed 8-bit unicode,
162 	// or have to use burly 16-bit unicode.
163 	uint32 *raw = new(nothrow) uint32[length];
164 	if (raw == NULL) {
165 		TRACE_ERROR(("UdfString::SetTo: uint32 raw[%" B_PRIu32 "] temporary"
166 			" string allocation failed\n", length));
167 		_Clear();
168 		return;
169 	}
170 
171 	ArrayDeleter<uint32> rawDeleter(raw);
172 
173 	const char *in = utf8;
174 	uint32 rawLength = 0;
175 	for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++)
176 		raw[i] = utf8_to_unicode(&in);
177 
178 	// Check for invalids.
179 	uint32 mask = 0xffff0000;
180 	for (uint32 i = 0; i < rawLength; i++) {
181 		if (raw[i] & mask) {
182 			TRACE(("WARNING: utf8 string contained a multi-byte sequence which "
183 			       "was converted into a unicode character larger than 16-bits; "
184 			       "character will be converted to an underscore character for "
185 			       "safety.\n"));
186 			raw[i] = '_';
187 		}
188 	}
189 	// See if we can get away with 8-bit compressed unicode
190 	mask = 0xffffff00;
191 	bool canUse8bit = true;
192 	for (uint32 i = 0; i < rawLength; i++) {
193 		if (raw[i] & mask) {
194 			canUse8bit = false;
195 			break;
196 		}
197 	}
198 	// Build our cs0 string
199 	if (canUse8bit) {
200 		fCs0Length = rawLength + 1;
201 		fCs0String = new(nothrow) char[fCs0Length];
202 		if (fCs0String != NULL) {
203 			fCs0String[0] = '\x08';	// 8-bit compressed unicode
204 			for (uint32 i = 0; i < rawLength; i++)
205 				fCs0String[i + 1] = raw[i] % 256;
206 		} else {
207 			TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
208 				"] allocation failed\n", fCs0Length));
209 			_Clear();
210 			return;
211 		}
212 	} else {
213 		fCs0Length = rawLength * 2 + 1;
214 		fCs0String = new(nothrow) char[fCs0Length];
215 		if (fCs0String != NULL) {
216 			uint32 pos = 0;
217 			fCs0String[pos++] = '\x10';	// 16-bit unicode
218 			for (uint32 i = 0; i < rawLength; i++) {
219 				// 16-bit unicode chars must be written big endian
220 				uint16 value = uint16(raw[i]);
221 				uint8 high = uint8(value >> 8 & 0xff);
222 				uint8 low = uint8(value & 0xff);
223 				fCs0String[pos++] = high;
224 				fCs0String[pos++] = low;
225 			}
226 		} else {
227 			TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
228 				"] allocation failed\n", fCs0Length));
229 			_Clear();
230 			return;
231 		}
232 	}
233 }
234 
235 
236 /*! \brief Assignment from a Cs0 string. */
237 void
238 UdfString::SetTo(const char *cs0, uint32 length)
239 {
240 	DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length));
241 
242 	_Clear();
243 	if (length == 0)
244 		return;
245 	if (!cs0) {
246 		PRINT(("passed NULL cs0 string\n"));
247 		return;
248 	}
249 
250 	// First copy the Cs0 string and length
251 	fCs0String = new(nothrow) char[length];
252 	if (fCs0String) {
253 		memcpy(fCs0String, cs0, length);
254 		fCs0Length = length;
255 	} else {
256 		PRINT(("new fCs0String[%ld] allocation failed\n", length));
257 		return;
258 	}
259 
260 	// Now convert to utf8
261 
262 	// The first byte of the CS0 string is the compression ID.
263 	// - 8: 1 byte characters
264 	// - 16: 2 byte, big endian characters
265 	// - 254: "CS0 expansion is empty and unique", 1 byte characters
266 	// - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
267 	PRINT(("compression ID: %d\n", cs0[0]));
268 	switch (reinterpret_cast<const uint8*>(cs0)[0]) {
269 		case 8:
270 		case 254:
271 		{
272 			const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
273 			int32 maxLength = length-1;				// Max length of input string in uint8 characters
274 			int32 allocationLength = maxLength*2+1;	// Need at most 2 utf8 chars per uint8 char
275 			fUtf8String = new(nothrow) char[allocationLength];
276 			if (fUtf8String) {
277 				char *outputString = fUtf8String;
278 
279 				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
280 					unicode_to_utf8(inputString[i], &outputString);
281 				}
282 				outputString[0] = 0;
283 			} else {
284 				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
285 			}
286 
287 			break;
288 		}
289 
290 		case 16:
291 		case 255:
292 		{
293 			const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
294 			int32 maxLength = (length-1) / 2;		// Max length of input string in uint16 characters
295 			int32 allocationLength = maxLength*3+1;	// Need at most 3 utf8 chars per uint16 char
296 			fUtf8String = new(nothrow) char[allocationLength];
297 			if (fUtf8String) {
298 				char *outputString = fUtf8String;
299 
300 				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
301 					unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
302 				}
303 				outputString[0] = 0;
304 			} else {
305 				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
306 			}
307 
308 			break;
309 		}
310 
311 		default:
312 			PRINT(("invalid compression id!\n"));
313 			break;
314 	}
315 }
316 
317 void
318 UdfString::_Clear()
319 {
320 	DEBUG_INIT("UdfString");
321 
322 	delete [] fCs0String;
323 	fCs0String = NULL;
324 	delete [] fUtf8String;
325 	fUtf8String = NULL;
326 }
327