xref: /haiku/src/add-ons/kernel/file_systems/udf/UdfString.cpp (revision 8df6a8dbf579280f55b61d725e470dee5d504e83)
1 #include "UdfString.h"
2 
3 #include "ByteOrder.h"
4 
5 
/*! \brief Encodes a single unicode character as utf8.

	\param c The unicode character to encode.
	\param out Pointer to a write cursor into a character buffer with
	           room for at least 4 more bytes. The encoded bytes are
	           stored at \c *out and \c *out is advanced past them, so
	           that consecutive calls append to the same buffer. No
	           terminating NUL is written.

	Characters above 0x10ffff are not encodable; for those nothing is
	written and \c *out is left where it was.
*/
static
void
unicode_to_utf8(uint32 c, char **out)
{
	char *cursor = *out;

	// Pick the marker bits of the lead byte and the number of
	// continuation bytes that follow it.
	uint32 leadBits;
	int continuationCount;
	if (c <= 0x7f) {
		leadBits = 0x00;
		continuationCount = 0;
	} else if (c <= 0x7ff) {
		leadBits = 0xc0;
		continuationCount = 1;
	} else if (c <= 0xffff) {
		leadBits = 0xe0;
		continuationCount = 2;
	} else if (c <= 0x10ffff) {
		leadBits = 0xf0;
		continuationCount = 3;
	} else {
		// Out of range: emit nothing
		return;
	}

	// The lead byte carries the topmost payload bits, each continuation
	// byte the next lower group of 6 bits.
	*cursor++ = leadBits | (c >> (6 * continuationCount));
	for (int shift = 6 * (continuationCount - 1); shift >= 0; shift -= 6)
		*cursor++ = 0x80 | ((c >> shift) & 0x3f);

	*out = cursor;
}
42 
/*! \brief Decodes one utf8 sequence into a 4-byte unicode character.

	\param in Pointer to a read cursor into a NUL-terminated C-string.
	          \c *in is advanced past the bytes consumed: the whole
	          sequence if it is well formed, a single byte otherwise.

	\return The decoded unicode character; the first input byte,
	        unchanged, if it does not start a well formed multi-byte
	        sequence; or 0 if \a in or \c *in is NULL.
*/
static
uint32
utf8_to_unicode(const char **in)
{
	if (in == NULL || *in == NULL)
		return 0;

	const uint8 *bytes = reinterpret_cast<const uint8*>(*in);
	const uint8 first = bytes[0];

	// Classify the lead byte: sequence length and payload bits it carries.
	int32 length;
	uint32 c;
	if ((first & 0xe0) == 0xc0) {
		length = 2;
		c = first & 0x1f;
	} else if ((first & 0xf0) == 0xe0) {
		length = 3;
		c = first & 0x0f;
	} else if ((first & 0xf8) == 0xf0) {
		length = 4;
		c = first & 0x07;
	} else {
		// Valid 1-byte character, or a byte that cannot start a sequence
		// (stray continuation byte, 0xf8..0xff): pass it through as is.
		(*in)++;
		return first;
	}

	// Every following byte has to be a continuation byte (10xxxxxx).
	// Testing the high bit alone would also accept lead bytes and thereby
	// swallow the start of the next character. The terminating NUL fails
	// the test as well, so the loop never runs past the end of the string.
	int32 i = 1;
	while (i < length && (bytes[i] & 0xc0) == 0x80) {
		c = (c << 6) | (bytes[i] & 0x3f);
		i++;
	}

	if (i < length) {
		// Truncated or malformed sequence: consume the lead byte only
		(*in)++;
		return first;
	}

	*in += length;
	return c;
}
93 
94 
95 /*! \brief Creates an empty string object. */
96 UdfString::UdfString()
97 	:
98 	fCs0String(NULL),
99 	fUtf8String(NULL)
100 {
101 }
102 
103 
104 /*! \brief Creates a new UdfString object from the given Utf8 string. */
105 UdfString::UdfString(const char *utf8)
106 	:
107 	fCs0String(NULL),
108 	fUtf8String(NULL)
109 {
110 	SetTo(utf8);
111 }
112 
113 
114 /*! \brief Creates a new UdfString object from the given Cs0 string. */
115 UdfString::UdfString(const char *cs0, uint32 length)
116 	:
117 	fCs0String(NULL),
118 	fUtf8String(NULL)
119 {
120 	SetTo(cs0, length);
121 }
122 
123 
124 UdfString::~UdfString()
125 {
126 	_Clear();
127 }
128 
129 
130 /*! \brief Assignment from a Utf8 string. */
131 void
132 UdfString::SetTo(const char *utf8)
133 {
134 	TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n",
135 		utf8, utf8 ? strlen(utf8) : 0));
136 	_Clear();
137 
138 	if (utf8 == NULL) {
139 		TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n"));
140 		return;
141 	}
142 
143 	uint32 length = strlen(utf8);
144 	// First copy the utf8 string
145 	fUtf8String = new(nothrow) char[length + 1];
146 	if (fUtf8String == NULL) {
147 		TRACE_ERROR(("UdfString::SetTo: fUtf8String[%ld] allocation failed\n",
148 			length + 1));
149 		return;
150 	}
151 
152 	memcpy(fUtf8String, utf8, length + 1);
153 	// Next convert to raw 4-byte unicode. Then we'll do some
154 	// analysis to figure out if we have any invalid characters,
155 	// and whether we can get away with compressed 8-bit unicode,
156 	// or have to use burly 16-bit unicode.
157 	uint32 *raw = new(nothrow) uint32[length];
158 	if (raw == NULL) {
159 		TRACE_ERROR(("UdfString::SetTo: uint32 raw[%ld] temporary string "
160 			"allocation failed\n", length));
161 		_Clear();
162 		return;
163 	}
164 
165 	const char *in = utf8;
166 	uint32 rawLength = 0;
167 	for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++)
168 		raw[i] = utf8_to_unicode(&in);
169 
170 	// Check for invalids.
171 	uint32 mask = 0xffff0000;
172 	for (uint32 i = 0; i < rawLength; i++) {
173 		if (raw[i] & mask) {
174 			TRACE(("WARNING: utf8 string contained a multi-byte sequence which "
175 			       "was converted into a unicode character larger than 16-bits; "
176 			       "character will be converted to an underscore character for "
177 			       "safety.\n"));
178 			raw[i] = '_';
179 		}
180 	}
181 	// See if we can get away with 8-bit compressed unicode
182 	mask = 0xffffff00;
183 	bool canUse8bit = true;
184 	for (uint32 i = 0; i < rawLength; i++) {
185 		if (raw[i] & mask) {
186 			canUse8bit = false;
187 			break;
188 		}
189 	}
190 	// Build our cs0 string
191 	if (canUse8bit) {
192 		fCs0Length = rawLength + 1;
193 		fCs0String = new(nothrow) char[fCs0Length];
194 		if (fCs0String) {
195 			fCs0String[0] = '\x08';	// 8-bit compressed unicode
196 			for (uint32 i = 0; i < rawLength; i++)
197 				fCs0String[i + 1] = raw[i] % 256;
198 		} else {
199 			TRACE_ERROR(("UdfString::SetTo: fCs0String[%ld] allocation failed\n",
200 				fCs0Length));
201 			_Clear();
202 			return;
203 		}
204 	} else {
205 		fCs0Length = rawLength * 2 + 1;
206 		fCs0String = new(nothrow) char[fCs0Length];
207 		if (fCs0String) {
208 			uint32 pos = 0;
209 			fCs0String[pos++] = '\x10';	// 16-bit unicode
210 			for (uint32 i = 0; i < rawLength; i++) {
211 				// 16-bit unicode chars must be written big endian
212 				uint16 value = uint16(raw[i]);
213 				uint8 high = uint8(value >> 8 & 0xff);
214 				uint8 low = uint8(value & 0xff);
215 				fCs0String[pos++] = high;
216 				fCs0String[pos++] = low;
217 			}
218 		} else {
219 			TRACE_ERROR(("UdfString::SetTo: fCs0String[%ld] allocation failed\n",
220 				fCs0Length));
221 			_Clear();
222 			return;
223 		}
224 	}
225 	// Clean up
226 	delete [] raw;
227 	raw = NULL;
228 }
229 
230 
231 /*! \brief Assignment from a Cs0 string. */
232 void
233 UdfString::SetTo(const char *cs0, uint32 length)
234 {
235 	DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length));
236 
237 	_Clear();
238 	if (length == 0)
239 		return;
240 	if (!cs0) {
241 		PRINT(("passed NULL cs0 string\n"));
242 		return;
243 	}
244 
245 	// First copy the Cs0 string and length
246 	fCs0String = new(nothrow) char[length];
247 	if (fCs0String) {
248 		memcpy(fCs0String, cs0, length);
249 		fCs0Length = length;
250 	} else {
251 		PRINT(("new fCs0String[%ld] allocation failed\n", length));
252 		return;
253 	}
254 
255 	// Now convert to utf8
256 
257 	// The first byte of the CS0 string is the compression ID.
258 	// - 8: 1 byte characters
259 	// - 16: 2 byte, big endian characters
260 	// - 254: "CS0 expansion is empty and unique", 1 byte characters
261 	// - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
262 	PRINT(("compression ID: %d\n", cs0[0]));
263 	switch (reinterpret_cast<const uint8*>(cs0)[0]) {
264 		case 8:
265 		case 254:
266 		{
267 			const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
268 			int32 maxLength = length-1;				// Max length of input string in uint8 characters
269 			int32 allocationLength = maxLength*2+1;	// Need at most 2 utf8 chars per uint8 char
270 			fUtf8String = new(nothrow) char[allocationLength];
271 			if (fUtf8String) {
272 				char *outputString = fUtf8String;
273 
274 				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
275 					unicode_to_utf8(inputString[i], &outputString);
276 				}
277 				outputString[0] = 0;
278 			} else {
279 				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
280 			}
281 
282 			break;
283 		}
284 
285 		case 16:
286 		case 255:
287 		{
288 			const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
289 			int32 maxLength = (length-1) / 2;		// Max length of input string in uint16 characters
290 			int32 allocationLength = maxLength*3+1;	// Need at most 3 utf8 chars per uint16 char
291 			fUtf8String = new(nothrow) char[allocationLength];
292 			if (fUtf8String) {
293 				char *outputString = fUtf8String;
294 
295 				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
296 					unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
297 				}
298 				outputString[0] = 0;
299 			} else {
300 				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
301 			}
302 
303 			break;
304 		}
305 
306 		default:
307 			PRINT(("invalid compression id!\n"));
308 			break;
309 	}
310 }
311 
312 void
313 UdfString::_Clear()
314 {
315 	DEBUG_INIT("UdfString");
316 
317 	delete [] fCs0String;
318 	fCs0String = NULL;
319 	delete [] fUtf8String;
320 	fUtf8String = NULL;
321 }
322