1 #include "UdfString.h" 2 3 #include <ByteOrder.h> 4 5 #include <AutoDeleter.h> 6 7 8 /*! \brief Converts the given unicode character to utf8. 9 10 \param c The unicode character. 11 \param out Pointer to a C-string of at least 4 characters 12 long into which the output utf8 characters will 13 be written. The string that is pointed to will 14 be incremented to reflect the number of characters 15 written, i.e. if \a out initially points to a pointer 16 to the first character in string named \c str, and 17 the function writes 4 characters to \c str, then 18 upon returning, out will point to a pointer to 19 the fifth character in \c str. 20 */ 21 static void 22 unicode_to_utf8(uint32 c, char **out) 23 { 24 char *s = *out; 25 26 if (c < 0x80) 27 *(s++) = c; 28 else if (c < 0x800) { 29 *(s++) = 0xc0 | (c>>6); 30 *(s++) = 0x80 | (c & 0x3f); 31 } else if (c < 0x10000) { 32 *(s++) = 0xe0 | (c>>12); 33 *(s++) = 0x80 | ((c>>6) & 0x3f); 34 *(s++) = 0x80 | (c & 0x3f); 35 } else if (c <= 0x10ffff) { 36 *(s++) = 0xf0 | (c>>18); 37 *(s++) = 0x80 | ((c>>12) & 0x3f); 38 *(s++) = 0x80 | ((c>>6) & 0x3f); 39 *(s++) = 0x80 | (c & 0x3f); 40 } 41 *out = s; 42 } 43 44 /*! \brief Converts the given utf8 character to 4-byte unicode. 45 46 \param in Pointer to a C-String from which utf8 characters 47 will be read. *in will be incremented to reflect 48 the number of characters read, similarly to the 49 \c out parameter for unicode_to_utf8(). 50 51 \return The 4-byte unicode character, or **in if passed an 52 invalid character, or 0 if passed any NULL pointers. 53 */ 54 static uint32 55 utf8_to_unicode(const char **in) 56 { 57 if (!in) 58 return 0; 59 uint8 *bytes = (uint8 *)*in; 60 if (!bytes) 61 return 0; 62 63 int32 length; 64 uint8 mask = 0x1f; 65 66 switch (bytes[0] & 0xf0) { 67 case 0xc0: 68 case 0xd0: length = 2; break; 69 case 0xe0: length = 3; break; 70 case 0xf0: 71 mask = 0x0f; 72 length = 4; 73 break; 74 default: 75 // valid 1-byte character 76 // and invalid characters 77 (*in)++; 78 return bytes[0]; 79 } 80 uint32 c = bytes[0] & mask; 81 int32 i = 1; 82 for (;i < length && (bytes[i] & 0x80) > 0;i++) 83 c = (c << 6) | (bytes[i] & 0x3f); 84 85 if (i < length) { 86 // invalid character 87 (*in)++; 88 return (uint32)bytes[0]; 89 } 90 *in += length; 91 return c; 92 } 93 94 95 // #pragma mark - 96 97 98 /*! \brief Creates an empty string object. */ 99 UdfString::UdfString() 100 : 101 fCs0String(NULL), 102 fUtf8String(NULL) 103 { 104 } 105 106 107 /*! \brief Creates a new UdfString object from the given Utf8 string. */ 108 UdfString::UdfString(const char *utf8) 109 : 110 fCs0String(NULL), 111 fUtf8String(NULL) 112 { 113 SetTo(utf8); 114 } 115 116 117 /*! \brief Creates a new UdfString object from the given Cs0 string. */ 118 UdfString::UdfString(const char *cs0, uint32 length) 119 : 120 fCs0String(NULL), 121 fUtf8String(NULL) 122 { 123 SetTo(cs0, length); 124 } 125 126 127 UdfString::~UdfString() 128 { 129 _Clear(); 130 } 131 132 133 /*! \brief Assignment from a Utf8 string. */ 134 void 135 UdfString::SetTo(const char *utf8) 136 { 137 TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n", 138 utf8, utf8 ? strlen(utf8) : 0)); 139 _Clear(); 140 141 if (utf8 == NULL) { 142 TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n")); 143 return; 144 } 145 146 uint32 length = strlen(utf8); 147 // First copy the utf8 string 148 fUtf8String = new(nothrow) char[length + 1]; 149 if (fUtf8String == NULL) { 150 TRACE_ERROR(("UdfString::SetTo: fUtf8String[%" B_PRIu32 151 "] allocation failed\n", length + 1)); 152 return; 153 } 154 155 memcpy(fUtf8String, utf8, length + 1); 156 // Next convert to raw 4-byte unicode. Then we'll do some 157 // analysis to figure out if we have any invalid characters, 158 // and whether we can get away with compressed 8-bit unicode, 159 // or have to use burly 16-bit unicode. 160 uint32 *raw = new(nothrow) uint32[length]; 161 if (raw == NULL) { 162 TRACE_ERROR(("UdfString::SetTo: uint32 raw[%" B_PRIu32 "] temporary" 163 " string allocation failed\n", length)); 164 _Clear(); 165 return; 166 } 167 168 ArrayDeleter<uint32> rawDeleter(raw); 169 170 const char *in = utf8; 171 uint32 rawLength = 0; 172 for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++) 173 raw[i] = utf8_to_unicode(&in); 174 175 // Check for invalids. 176 uint32 mask = 0xffff0000; 177 for (uint32 i = 0; i < rawLength; i++) { 178 if (raw[i] & mask) { 179 TRACE(("WARNING: utf8 string contained a multi-byte sequence which " 180 "was converted into a unicode character larger than 16-bits; " 181 "character will be converted to an underscore character for " 182 "safety.\n")); 183 raw[i] = '_'; 184 } 185 } 186 // See if we can get away with 8-bit compressed unicode 187 mask = 0xffffff00; 188 bool canUse8bit = true; 189 for (uint32 i = 0; i < rawLength; i++) { 190 if (raw[i] & mask) { 191 canUse8bit = false; 192 break; 193 } 194 } 195 // Build our cs0 string 196 if (canUse8bit) { 197 fCs0Length = rawLength + 1; 198 fCs0String = new(nothrow) char[fCs0Length]; 199 if (fCs0String != NULL) { 200 fCs0String[0] = '\x08'; // 8-bit compressed unicode 201 for (uint32 i = 0; i < rawLength; i++) 202 fCs0String[i + 1] = raw[i] % 256; 203 } else { 204 TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32 205 "] allocation failed\n", fCs0Length)); 206 _Clear(); 207 return; 208 } 209 } else { 210 fCs0Length = rawLength * 2 + 1; 211 fCs0String = new(nothrow) char[fCs0Length]; 212 if (fCs0String != NULL) { 213 uint32 pos = 0; 214 fCs0String[pos++] = '\x10'; // 16-bit unicode 215 for (uint32 i = 0; i < rawLength; i++) { 216 // 16-bit unicode chars must be written big endian 217 uint16 value = uint16(raw[i]); 218 uint8 high = uint8(value >> 8 & 0xff); 219 uint8 low = uint8(value & 0xff); 220 fCs0String[pos++] = high; 221 fCs0String[pos++] = low; 222 } 223 } else { 224 TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32 225 "] allocation failed\n", fCs0Length)); 226 _Clear(); 227 return; 228 } 229 } 230 } 231 232 233 /*! \brief Assignment from a Cs0 string. */ 234 void 235 UdfString::SetTo(const char *cs0, uint32 length) 236 { 237 DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length)); 238 239 _Clear(); 240 if (length == 0) 241 return; 242 if (!cs0) { 243 PRINT(("passed NULL cs0 string\n")); 244 return; 245 } 246 247 // First copy the Cs0 string and length 248 fCs0String = new(nothrow) char[length]; 249 if (fCs0String) { 250 memcpy(fCs0String, cs0, length); 251 fCs0Length = length; 252 } else { 253 PRINT(("new fCs0String[%ld] allocation failed\n", length)); 254 return; 255 } 256 257 // Now convert to utf8 258 259 // The first byte of the CS0 string is the compression ID. 260 // - 8: 1 byte characters 261 // - 16: 2 byte, big endian characters 262 // - 254: "CS0 expansion is empty and unique", 1 byte characters 263 // - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters 264 PRINT(("compression ID: %d\n", cs0[0])); 265 switch (reinterpret_cast<const uint8*>(cs0)[0]) { 266 case 8: 267 case 254: 268 { 269 const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1])); 270 int32 maxLength = length-1; // Max length of input string in uint8 characters 271 int32 allocationLength = maxLength*2+1; // Need at most 2 utf8 chars per uint8 char 272 fUtf8String = new(nothrow) char[allocationLength]; 273 if (fUtf8String) { 274 char *outputString = fUtf8String; 275 276 for (int32 i = 0; i < maxLength && inputString[i]; i++) { 277 unicode_to_utf8(inputString[i], &outputString); 278 } 279 outputString[0] = 0; 280 } else { 281 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength)); 282 } 283 284 break; 285 } 286 287 case 16: 288 case 255: 289 { 290 const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1])); 291 int32 maxLength = (length-1) / 2; // Max length of input string in uint16 characters 292 int32 allocationLength = maxLength*3+1; // Need at most 3 utf8 chars per uint16 char 293 fUtf8String = new(nothrow) char[allocationLength]; 294 if (fUtf8String) { 295 char *outputString = fUtf8String; 296 297 for (int32 i = 0; i < maxLength && inputString[i]; i++) { 298 unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString); 299 } 300 outputString[0] = 0; 301 } else { 302 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength)); 303 } 304 305 break; 306 } 307 308 default: 309 PRINT(("invalid compression id!\n")); 310 break; 311 } 312 } 313 314 void 315 UdfString::_Clear() 316 { 317 DEBUG_INIT("UdfString"); 318 319 delete [] fCs0String; 320 fCs0String = NULL; 321 delete [] fUtf8String; 322 fUtf8String = NULL; 323 } 324