xref: /haiku/src/add-ons/kernel/file_systems/udf/UdfString.cpp (revision 4f00613311d0bd6b70fa82ce19931c41f071ea4e)
1 #include "UdfString.h"
2 
3 #include "ByteOrder.h"
4 
5 
/*! \brief Encodes a single unicode code point as utf8.

	Code points up to 0x7f take one byte, up to 0x7ff two bytes, up to
	0xffff three bytes and up to 0x10ffff four bytes. Anything larger
	is silently dropped: nothing is written and \a *out is left where
	it was.

	\param c The unicode character to encode.
	\param out Address of a cursor into the destination buffer, which
	           must have room for at least 4 more characters. On return
	           the cursor has been moved past the bytes that were
	           written, e.g. after a 4 byte sequence has been stored
	           starting at \c str[0], \a *out points at \c str[4].
	           No terminating NUL is appended.
*/
static
void
unicode_to_utf8(uint32 c, char **out)
{
	// Select the lead byte marker and the number of continuation bytes.
	uint32 lead;
	uint32 trailing;
	if (c <= 0x7f) {
		lead = 0x00;
		trailing = 0;
	} else if (c <= 0x7ff) {
		lead = 0xc0;
		trailing = 1;
	} else if (c <= 0xffff) {
		lead = 0xe0;
		trailing = 2;
	} else if (c <= 0x10ffff) {
		lead = 0xf0;
		trailing = 3;
	} else {
		// Not encodable; emit nothing and leave the cursor alone.
		return;
	}

	char *cursor = *out;

	// The lead byte carries the topmost payload bits...
	uint32 shift = 6 * trailing;
	*cursor++ = char(lead | (c >> shift));

	// ...and every continuation byte the next lower six.
	while (shift != 0) {
		shift -= 6;
		*cursor++ = char(0x80 | ((c >> shift) & 0x3f));
	}

	*out = cursor;
}
42 
/*! \brief Decodes one utf8 sequence into a 4-byte unicode character.

	A sequence is only accepted when its lead byte announces a 2, 3 or
	4 byte sequence (0xc0-0xf7) and every following byte is a proper
	continuation byte of the form 10xxxxxx. A NUL terminator is never a
	continuation byte, so a sequence that is cut short by the end of
	the string is rejected without reading past the terminator.

	\param in Address of a cursor into a NUL terminated utf8 C-string.
	          \a *in is advanced past the bytes that were consumed,
	          similarly to the \c out parameter of unicode_to_utf8():
	          by the sequence length for a valid multi-byte sequence,
	          and by exactly one byte otherwise.

	\return The decoded unicode character. For a single byte character
	        and for any invalid input (stray continuation byte, lead
	        byte 0xf8-0xff, missing or malformed continuation byte) the
	        byte at \a **in is returned as is. Returns 0 if \a in or
	        \a *in is NULL.
*/
static
uint32
utf8_to_unicode(const char **in)
{
	if (!in)
		return 0;
	const uint8 *bytes = reinterpret_cast<const uint8 *>(*in);
	if (!bytes)
		return 0;

	const uint8 lead = bytes[0];
	int32 length;
	uint8 mask;

	if ((lead & 0xe0) == 0xc0) {
		// 110xxxxx
		length = 2;
		mask = 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		// 1110xxxx
		length = 3;
		mask = 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		// 11110xxx
		length = 4;
		mask = 0x07;
	} else {
		// Valid 1-byte character, or an invalid one: a continuation
		// byte without a lead byte, or 0xf8-0xff which never starts a
		// utf8 sequence.
		(*in)++;
		return lead;
	}

	uint32 c = lead & mask;
	int32 i = 1;
	// Only bytes of the form 10xxxxxx continue a sequence. Testing the
	// high bit alone would swallow the lead byte of the next character.
	for (; i < length && (bytes[i] & 0xc0) == 0x80; i++)
		c = (c << 6) | (bytes[i] & 0x3f);

	if (i < length) {
		// Invalid character: pass the lead byte through and resume
		// decoding at the byte right after it.
		(*in)++;
		return lead;
	}
	*in += length;
	return c;
}
93 
94 using namespace Udf;
95 
96 /*! \brief Creates an empty string object.
97 */
98 String::String()
99 	: fCs0String(NULL)
100 	, fUtf8String(NULL)
101 {
102 }
103 
104 /*! \brief Creates a new String object from the given Utf8 string.
105 */
106 String::String(const char *utf8)
107 	: fCs0String(NULL)
108 	, fUtf8String(NULL)
109 {
110 	SetTo(utf8);
111 }
112 
113 /*! \brief Creates a new String object from the given Cs0 string.
114 */
115 String::String(const char *cs0, uint32 length)
116 	: fCs0String(NULL)
117 	, fUtf8String(NULL)
118 {
119 	SetTo(cs0, length);
120 }
121 
/*! \brief Destroys the object, releasing both string buffers via _Clear().
*/
String::~String()
{
	DEBUG_INIT("String");

	// _Clear() delete[]s fCs0String and fUtf8String.
	_Clear();
}
128 
129 /*! \brief Assignment from a Utf8 string.
130 */
131 void
132 String::SetTo(const char *utf8)
133 {
134 	DEBUG_INIT_ETC("String", ("utf8: `%s', strlen(utf8): %ld", utf8,
135 	               utf8 ? strlen(utf8) : 0));
136 	_Clear();
137 	if (!utf8) {
138 		PRINT(("passed NULL utf8 string\n"));
139 		return;
140 	}
141 	uint32 length = strlen(utf8);
142 	// First copy the utf8 string
143 	fUtf8String = new(nothrow) char[length+1];
144 	if (!fUtf8String){
145 		PRINT(("new fUtf8String[%ld] allocation failed\n", length+1));
146 		return;
147 	}
148 	memcpy(fUtf8String, utf8, length+1);
149 	// Next convert to raw 4-byte unicode. Then we'll do some
150 	// analysis to figure out if we have any invalid characters,
151 	// and whether we can get away with compressed 8-bit unicode,
152 	// or have to use burly 16-bit unicode.
153 	uint32 *raw = new(nothrow) uint32[length];
154 	if (!raw) {
155 		PRINT(("new uint32 raw[%ld] temporary string allocation failed\n", length));
156 		_Clear();
157 		return;
158 	}
159 	const char *in = utf8;
160 	uint32 rawLength = 0;
161 	for (uint32 i = 0; i < length && uint32(in-utf8) < length; i++, rawLength++)
162 		raw[i] = utf8_to_unicode(&in);
163 	// Check for invalids.
164 	uint32 mask = 0xffff0000;
165 	for (uint32 i = 0; i < rawLength; i++) {
166 		if (raw[i] & mask) {
167 			PRINT(("WARNING: utf8 string contained a multi-byte sequence which "
168 			       "was converted into a unicode character larger than 16-bits; "
169 			       "character will be converted to an underscore character for "
170 			       "safety.\n"));
171 			raw[i] = '_';
172 		}
173 	}
174 	// See if we can get away with 8-bit compressed unicode
175 	mask = 0xffffff00;
176 	bool canUse8bit = true;
177 	for (uint32 i = 0; i < rawLength; i++) {
178 		if (raw[i] & mask) {
179 			canUse8bit = false;
180 			break;
181 		}
182 	}
183 	// Build our cs0 string
184 	if (canUse8bit) {
185 		fCs0Length = rawLength+1;
186 		fCs0String = new(nothrow) char[fCs0Length];
187 		if (fCs0String) {
188 			fCs0String[0] = '\x08';	// 8-bit compressed unicode
189 			for (uint32 i = 0; i < rawLength; i++)
190 				fCs0String[i+1] = raw[i] % 256;
191 		} else {
192 			PRINT(("new fCs0String[%ld] allocation failed\n", fCs0Length));
193 			_Clear();
194 			return;
195 		}
196 	} else {
197 		fCs0Length = rawLength*2+1;
198 		fCs0String = new(nothrow) char[fCs0Length];
199 		if (fCs0String) {
200 			uint32 pos = 0;
201 			fCs0String[pos++] = '\x10';	// 16-bit unicode
202 			for (uint32 i = 0; i < rawLength; i++) {
203 				// 16-bit unicode chars must be written big endian
204 				uint16 value = uint16(raw[i]);
205 				uint8 high = uint8(value >> 8 & 0xff);
206 				uint8 low = uint8(value & 0xff);
207 				fCs0String[pos++] = high;
208 				fCs0String[pos++] = low;
209 			}
210 		} else {
211 			PRINT(("new fCs0String[%ld] allocation failed\n", fCs0Length));
212 			_Clear();
213 			return;
214 		}
215 	}
216 	// Clean up
217 	delete [] raw;
218 	raw = NULL;
219 }
220 
221 /*! \brief Assignment from a Cs0 string.
222 */
223 void
224 String::SetTo(const char *cs0, uint32 length)
225 {
226 	DEBUG_INIT_ETC("String", ("cs0: %p, length: %ld", cs0, length));
227 
228 	_Clear();
229 	if (length == 0)
230 		return;
231 	if (!cs0) {
232 		PRINT(("passed NULL cs0 string\n"));
233 		return;
234 	}
235 
236 	// First copy the Cs0 string and length
237 	fCs0String = new(nothrow) char[length];
238 	if (fCs0String) {
239 		memcpy(fCs0String, cs0, length);
240 		fCs0Length = length;
241 	} else {
242 		PRINT(("new fCs0String[%ld] allocation failed\n", length));
243 		return;
244 	}
245 
246 	// Now convert to utf8
247 
248 	// The first byte of the CS0 string is the compression ID.
249 	// - 8: 1 byte characters
250 	// - 16: 2 byte, big endian characters
251 	// - 254: "CS0 expansion is empty and unique", 1 byte characters
252 	// - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
253 	PRINT(("compression ID: %d\n", cs0[0]));
254 	switch (reinterpret_cast<const uint8*>(cs0)[0]) {
255 		case 8:
256 		case 254:
257 		{
258 			const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
259 			int32 maxLength = length-1;				// Max length of input string in uint8 characters
260 			int32 allocationLength = maxLength*2+1;	// Need at most 2 utf8 chars per uint8 char
261 			fUtf8String = new(nothrow) char[allocationLength];
262 			if (fUtf8String) {
263 				char *outputString = fUtf8String;
264 
265 				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
266 					unicode_to_utf8(inputString[i], &outputString);
267 				}
268 				outputString[0] = 0;
269 			} else {
270 				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
271 			}
272 
273 			break;
274 		}
275 
276 		case 16:
277 		case 255:
278 		{
279 			const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
280 			int32 maxLength = (length-1) / 2;		// Max length of input string in uint16 characters
281 			int32 allocationLength = maxLength*3+1;	// Need at most 3 utf8 chars per uint16 char
282 			fUtf8String = new(nothrow) char[allocationLength];
283 			if (fUtf8String) {
284 				char *outputString = fUtf8String;
285 
286 				for (int32 i = 0; i < maxLength && inputString[i]; i++) {
287 					unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
288 				}
289 				outputString[0] = 0;
290 			} else {
291 				PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
292 			}
293 
294 			break;
295 		}
296 
297 		default:
298 			PRINT(("invalid compression id!\n"));
299 			break;
300 	}
301 }
302 
303 void
304 String::_Clear()
305 {
306 	DEBUG_INIT("String");
307 
308 	delete [] fCs0String;
309 	fCs0String = NULL;
310 	delete [] fUtf8String;
311 	fUtf8String = NULL;
312 }
313