1 #include "UdfString.h" 2 3 #include "ByteOrder.h" 4 5 6 /*! \brief Converts the given unicode character to utf8. 7 8 \param c The unicode character. 9 \param out Pointer to a C-string of at least 4 characters 10 long into which the output utf8 characters will 11 be written. The string that is pointed to will 12 be incremented to reflect the number of characters 13 written, i.e. if \a out initially points to a pointer 14 to the first character in string named \c str, and 15 the function writes 4 characters to \c str, then 16 upon returning, out will point to a pointer to 17 the fifth character in \c str. 18 */ 19 static 20 void 21 unicode_to_utf8(uint32 c, char **out) 22 { 23 char *s = *out; 24 25 if (c < 0x80) 26 *(s++) = c; 27 else if (c < 0x800) { 28 *(s++) = 0xc0 | (c>>6); 29 *(s++) = 0x80 | (c & 0x3f); 30 } else if (c < 0x10000) { 31 *(s++) = 0xe0 | (c>>12); 32 *(s++) = 0x80 | ((c>>6) & 0x3f); 33 *(s++) = 0x80 | (c & 0x3f); 34 } else if (c <= 0x10ffff) { 35 *(s++) = 0xf0 | (c>>18); 36 *(s++) = 0x80 | ((c>>12) & 0x3f); 37 *(s++) = 0x80 | ((c>>6) & 0x3f); 38 *(s++) = 0x80 | (c & 0x3f); 39 } 40 *out = s; 41 } 42 43 /*! \brief Converts the given utf8 character to 4-byte unicode. 44 45 \param in Pointer to a C-String from which utf8 characters 46 will be read. *in will be incremented to reflect 47 the number of characters read, similarly to the 48 \c out parameter for unicode_to_utf8(). 49 50 \return The 4-byte unicode character, or **in if passed an 51 invalid character, or 0 if passed any NULL pointers. 52 */ 53 static 54 uint32 55 utf8_to_unicode(const char **in) 56 { 57 if (!in) 58 return 0; 59 uint8 *bytes = (uint8 *)*in; 60 if (!bytes) 61 return 0; 62 63 int32 length; 64 uint8 mask = 0x1f; 65 66 switch (bytes[0] & 0xf0) { 67 case 0xc0: 68 case 0xd0: length = 2; break; 69 case 0xe0: length = 3; break; 70 case 0xf0: 71 mask = 0x0f; 72 length = 4; 73 break; 74 default: 75 // valid 1-byte character 76 // and invalid characters 77 (*in)++; 78 return bytes[0]; 79 } 80 uint32 c = bytes[0] & mask; 81 int32 i = 1; 82 for (;i < length && (bytes[i] & 0x80) > 0;i++) 83 c = (c << 6) | (bytes[i] & 0x3f); 84 85 if (i < length) { 86 // invalid character 87 (*in)++; 88 return (uint32)bytes[0]; 89 } 90 *in += length; 91 return c; 92 } 93 94 95 /*! \brief Creates an empty string object. */ 96 UdfString::UdfString() 97 : 98 fCs0String(NULL), 99 fUtf8String(NULL) 100 { 101 } 102 103 104 /*! \brief Creates a new UdfString object from the given Utf8 string. */ 105 UdfString::UdfString(const char *utf8) 106 : 107 fCs0String(NULL), 108 fUtf8String(NULL) 109 { 110 SetTo(utf8); 111 } 112 113 114 /*! \brief Creates a new UdfString object from the given Cs0 string. */ 115 UdfString::UdfString(const char *cs0, uint32 length) 116 : 117 fCs0String(NULL), 118 fUtf8String(NULL) 119 { 120 SetTo(cs0, length); 121 } 122 123 124 UdfString::~UdfString() 125 { 126 _Clear(); 127 } 128 129 130 /*! \brief Assignment from a Utf8 string. */ 131 void 132 UdfString::SetTo(const char *utf8) 133 { 134 TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n", 135 utf8, utf8 ? strlen(utf8) : 0)); 136 _Clear(); 137 138 if (utf8 == NULL) { 139 TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n")); 140 return; 141 } 142 143 uint32 length = strlen(utf8); 144 // First copy the utf8 string 145 fUtf8String = new(nothrow) char[length + 1]; 146 if (fUtf8String == NULL) { 147 TRACE_ERROR(("UdfString::SetTo: fUtf8String[%ld] allocation failed\n", 148 length + 1)); 149 return; 150 } 151 152 memcpy(fUtf8String, utf8, length + 1); 153 // Next convert to raw 4-byte unicode. Then we'll do some 154 // analysis to figure out if we have any invalid characters, 155 // and whether we can get away with compressed 8-bit unicode, 156 // or have to use burly 16-bit unicode. 157 uint32 *raw = new(nothrow) uint32[length]; 158 if (raw == NULL) { 159 TRACE_ERROR(("UdfString::SetTo: uint32 raw[%ld] temporary string " 160 "allocation failed\n", length)); 161 _Clear(); 162 return; 163 } 164 165 const char *in = utf8; 166 uint32 rawLength = 0; 167 for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++) 168 raw[i] = utf8_to_unicode(&in); 169 170 // Check for invalids. 171 uint32 mask = 0xffff0000; 172 for (uint32 i = 0; i < rawLength; i++) { 173 if (raw[i] & mask) { 174 TRACE(("WARNING: utf8 string contained a multi-byte sequence which " 175 "was converted into a unicode character larger than 16-bits; " 176 "character will be converted to an underscore character for " 177 "safety.\n")); 178 raw[i] = '_'; 179 } 180 } 181 // See if we can get away with 8-bit compressed unicode 182 mask = 0xffffff00; 183 bool canUse8bit = true; 184 for (uint32 i = 0; i < rawLength; i++) { 185 if (raw[i] & mask) { 186 canUse8bit = false; 187 break; 188 } 189 } 190 // Build our cs0 string 191 if (canUse8bit) { 192 fCs0Length = rawLength + 1; 193 fCs0String = new(nothrow) char[fCs0Length]; 194 if (fCs0String) { 195 fCs0String[0] = '\x08'; // 8-bit compressed unicode 196 for (uint32 i = 0; i < rawLength; i++) 197 fCs0String[i + 1] = raw[i] % 256; 198 } else { 199 TRACE_ERROR(("UdfString::SetTo: fCs0String[%ld] allocation failed\n", 200 fCs0Length)); 201 _Clear(); 202 return; 203 } 204 } else { 205 fCs0Length = rawLength * 2 + 1; 206 fCs0String = new(nothrow) char[fCs0Length]; 207 if (fCs0String) { 208 uint32 pos = 0; 209 fCs0String[pos++] = '\x10'; // 16-bit unicode 210 for (uint32 i = 0; i < rawLength; i++) { 211 // 16-bit unicode chars must be written big endian 212 uint16 value = uint16(raw[i]); 213 uint8 high = uint8(value >> 8 & 0xff); 214 uint8 low = uint8(value & 0xff); 215 fCs0String[pos++] = high; 216 fCs0String[pos++] = low; 217 } 218 } else { 219 TRACE_ERROR(("UdfString::SetTo: fCs0String[%ld] allocation failed\n", 220 fCs0Length)); 221 _Clear(); 222 return; 223 } 224 } 225 // Clean up 226 delete [] raw; 227 raw = NULL; 228 } 229 230 231 /*! \brief Assignment from a Cs0 string. */ 232 void 233 UdfString::SetTo(const char *cs0, uint32 length) 234 { 235 DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length)); 236 237 _Clear(); 238 if (length == 0) 239 return; 240 if (!cs0) { 241 PRINT(("passed NULL cs0 string\n")); 242 return; 243 } 244 245 // First copy the Cs0 string and length 246 fCs0String = new(nothrow) char[length]; 247 if (fCs0String) { 248 memcpy(fCs0String, cs0, length); 249 fCs0Length = length; 250 } else { 251 PRINT(("new fCs0String[%ld] allocation failed\n", length)); 252 return; 253 } 254 255 // Now convert to utf8 256 257 // The first byte of the CS0 string is the compression ID. 258 // - 8: 1 byte characters 259 // - 16: 2 byte, big endian characters 260 // - 254: "CS0 expansion is empty and unique", 1 byte characters 261 // - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters 262 PRINT(("compression ID: %d\n", cs0[0])); 263 switch (reinterpret_cast<const uint8*>(cs0)[0]) { 264 case 8: 265 case 254: 266 { 267 const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1])); 268 int32 maxLength = length-1; // Max length of input string in uint8 characters 269 int32 allocationLength = maxLength*2+1; // Need at most 2 utf8 chars per uint8 char 270 fUtf8String = new(nothrow) char[allocationLength]; 271 if (fUtf8String) { 272 char *outputString = fUtf8String; 273 274 for (int32 i = 0; i < maxLength && inputString[i]; i++) { 275 unicode_to_utf8(inputString[i], &outputString); 276 } 277 outputString[0] = 0; 278 } else { 279 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength)); 280 } 281 282 break; 283 } 284 285 case 16: 286 case 255: 287 { 288 const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1])); 289 int32 maxLength = (length-1) / 2; // Max length of input string in uint16 characters 290 int32 allocationLength = maxLength*3+1; // Need at most 3 utf8 chars per uint16 char 291 fUtf8String = new(nothrow) char[allocationLength]; 292 if (fUtf8String) { 293 char *outputString = fUtf8String; 294 295 for (int32 i = 0; i < maxLength && inputString[i]; i++) { 296 unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString); 297 } 298 outputString[0] = 0; 299 } else { 300 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength)); 301 } 302 303 break; 304 } 305 306 default: 307 PRINT(("invalid compression id!\n")); 308 break; 309 } 310 } 311 312 void 313 UdfString::_Clear() 314 { 315 DEBUG_INIT("UdfString"); 316 317 delete [] fCs0String; 318 fCs0String = NULL; 319 delete [] fUtf8String; 320 fUtf8String = NULL; 321 } 322