1 #include "UdfString.h" 2 3 #include <ByteOrder.h> 4 5 #include <AutoDeleter.h> 6 7 8 using std::nothrow; 9 10 11 /*! \brief Converts the given unicode character to utf8. 12 13 \param c The unicode character. 14 \param out Pointer to a C-string of at least 4 characters 15 long into which the output utf8 characters will 16 be written. The string that is pointed to will 17 be incremented to reflect the number of characters 18 written, i.e. if \a out initially points to a pointer 19 to the first character in string named \c str, and 20 the function writes 4 characters to \c str, then 21 upon returning, out will point to a pointer to 22 the fifth character in \c str. 23 */ 24 static void 25 unicode_to_utf8(uint32 c, char **out) 26 { 27 char *s = *out; 28 29 if (c < 0x80) 30 *(s++) = c; 31 else if (c < 0x800) { 32 *(s++) = 0xc0 | (c>>6); 33 *(s++) = 0x80 | (c & 0x3f); 34 } else if (c < 0x10000) { 35 *(s++) = 0xe0 | (c>>12); 36 *(s++) = 0x80 | ((c>>6) & 0x3f); 37 *(s++) = 0x80 | (c & 0x3f); 38 } else if (c <= 0x10ffff) { 39 *(s++) = 0xf0 | (c>>18); 40 *(s++) = 0x80 | ((c>>12) & 0x3f); 41 *(s++) = 0x80 | ((c>>6) & 0x3f); 42 *(s++) = 0x80 | (c & 0x3f); 43 } 44 *out = s; 45 } 46 47 /*! \brief Converts the given utf8 character to 4-byte unicode. 48 49 \param in Pointer to a C-String from which utf8 characters 50 will be read. *in will be incremented to reflect 51 the number of characters read, similarly to the 52 \c out parameter for unicode_to_utf8(). 53 54 \return The 4-byte unicode character, or **in if passed an 55 invalid character, or 0 if passed any NULL pointers. 56 */ 57 static uint32 58 utf8_to_unicode(const char **in) 59 { 60 if (!in) 61 return 0; 62 uint8 *bytes = (uint8 *)*in; 63 if (!bytes) 64 return 0; 65 66 int32 length; 67 uint8 mask = 0x1f; 68 69 switch (bytes[0] & 0xf0) { 70 case 0xc0: 71 case 0xd0: length = 2; break; 72 case 0xe0: length = 3; break; 73 case 0xf0: 74 mask = 0x0f; 75 length = 4; 76 break; 77 default: 78 // valid 1-byte character 79 // and invalid characters 80 (*in)++; 81 return bytes[0]; 82 } 83 uint32 c = bytes[0] & mask; 84 int32 i = 1; 85 for (;i < length && (bytes[i] & 0x80) > 0;i++) 86 c = (c << 6) | (bytes[i] & 0x3f); 87 88 if (i < length) { 89 // invalid character 90 (*in)++; 91 return (uint32)bytes[0]; 92 } 93 *in += length; 94 return c; 95 } 96 97 98 // #pragma mark - 99 100 101 /*! \brief Creates an empty string object. */ 102 UdfString::UdfString() 103 : 104 fCs0String(NULL), 105 fUtf8String(NULL) 106 { 107 } 108 109 110 /*! \brief Creates a new UdfString object from the given Utf8 string. */ 111 UdfString::UdfString(const char *utf8) 112 : 113 fCs0String(NULL), 114 fUtf8String(NULL) 115 { 116 SetTo(utf8); 117 } 118 119 120 /*! \brief Creates a new UdfString object from the given Cs0 string. */ 121 UdfString::UdfString(const char *cs0, uint32 length) 122 : 123 fCs0String(NULL), 124 fUtf8String(NULL) 125 { 126 SetTo(cs0, length); 127 } 128 129 130 UdfString::~UdfString() 131 { 132 _Clear(); 133 } 134 135 136 /*! \brief Assignment from a Utf8 string. */ 137 void 138 UdfString::SetTo(const char *utf8) 139 { 140 TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n", 141 utf8, utf8 ? strlen(utf8) : 0)); 142 _Clear(); 143 144 if (utf8 == NULL) { 145 TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n")); 146 return; 147 } 148 149 uint32 length = strlen(utf8); 150 // First copy the utf8 string 151 fUtf8String = new(nothrow) char[length + 1]; 152 if (fUtf8String == NULL) { 153 TRACE_ERROR(("UdfString::SetTo: fUtf8String[%" B_PRIu32 154 "] allocation failed\n", length + 1)); 155 return; 156 } 157 158 memcpy(fUtf8String, utf8, length + 1); 159 // Next convert to raw 4-byte unicode. Then we'll do some 160 // analysis to figure out if we have any invalid characters, 161 // and whether we can get away with compressed 8-bit unicode, 162 // or have to use burly 16-bit unicode. 163 uint32 *raw = new(nothrow) uint32[length]; 164 if (raw == NULL) { 165 TRACE_ERROR(("UdfString::SetTo: uint32 raw[%" B_PRIu32 "] temporary" 166 " string allocation failed\n", length)); 167 _Clear(); 168 return; 169 } 170 171 ArrayDeleter<uint32> rawDeleter(raw); 172 173 const char *in = utf8; 174 uint32 rawLength = 0; 175 for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++) 176 raw[i] = utf8_to_unicode(&in); 177 178 // Check for invalids. 179 uint32 mask = 0xffff0000; 180 for (uint32 i = 0; i < rawLength; i++) { 181 if (raw[i] & mask) { 182 TRACE(("WARNING: utf8 string contained a multi-byte sequence which " 183 "was converted into a unicode character larger than 16-bits; " 184 "character will be converted to an underscore character for " 185 "safety.\n")); 186 raw[i] = '_'; 187 } 188 } 189 // See if we can get away with 8-bit compressed unicode 190 mask = 0xffffff00; 191 bool canUse8bit = true; 192 for (uint32 i = 0; i < rawLength; i++) { 193 if (raw[i] & mask) { 194 canUse8bit = false; 195 break; 196 } 197 } 198 // Build our cs0 string 199 if (canUse8bit) { 200 fCs0Length = rawLength + 1; 201 fCs0String = new(nothrow) char[fCs0Length]; 202 if (fCs0String != NULL) { 203 fCs0String[0] = '\x08'; // 8-bit compressed unicode 204 for (uint32 i = 0; i < rawLength; i++) 205 fCs0String[i + 1] = raw[i] % 256; 206 } else { 207 TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32 208 "] allocation failed\n", fCs0Length)); 209 _Clear(); 210 return; 211 } 212 } else { 213 fCs0Length = rawLength * 2 + 1; 214 fCs0String = new(nothrow) char[fCs0Length]; 215 if (fCs0String != NULL) { 216 uint32 pos = 0; 217 fCs0String[pos++] = '\x10'; // 16-bit unicode 218 for (uint32 i = 0; i < rawLength; i++) { 219 // 16-bit unicode chars must be written big endian 220 uint16 value = uint16(raw[i]); 221 uint8 high = uint8(value >> 8 & 0xff); 222 uint8 low = uint8(value & 0xff); 223 fCs0String[pos++] = high; 224 fCs0String[pos++] = low; 225 } 226 } else { 227 TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32 228 "] allocation failed\n", fCs0Length)); 229 _Clear(); 230 return; 231 } 232 } 233 } 234 235 236 /*! \brief Assignment from a Cs0 string. */ 237 void 238 UdfString::SetTo(const char *cs0, uint32 length) 239 { 240 DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length)); 241 242 _Clear(); 243 if (length == 0) 244 return; 245 if (!cs0) { 246 PRINT(("passed NULL cs0 string\n")); 247 return; 248 } 249 250 // First copy the Cs0 string and length 251 fCs0String = new(nothrow) char[length]; 252 if (fCs0String) { 253 memcpy(fCs0String, cs0, length); 254 fCs0Length = length; 255 } else { 256 PRINT(("new fCs0String[%ld] allocation failed\n", length)); 257 return; 258 } 259 260 // Now convert to utf8 261 262 // The first byte of the CS0 string is the compression ID. 263 // - 8: 1 byte characters 264 // - 16: 2 byte, big endian characters 265 // - 254: "CS0 expansion is empty and unique", 1 byte characters 266 // - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters 267 PRINT(("compression ID: %d\n", cs0[0])); 268 switch (reinterpret_cast<const uint8*>(cs0)[0]) { 269 case 8: 270 case 254: 271 { 272 const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1])); 273 int32 maxLength = length-1; // Max length of input string in uint8 characters 274 int32 allocationLength = maxLength*2+1; // Need at most 2 utf8 chars per uint8 char 275 fUtf8String = new(nothrow) char[allocationLength]; 276 if (fUtf8String) { 277 char *outputString = fUtf8String; 278 279 for (int32 i = 0; i < maxLength && inputString[i]; i++) { 280 unicode_to_utf8(inputString[i], &outputString); 281 } 282 outputString[0] = 0; 283 } else { 284 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength)); 285 } 286 287 break; 288 } 289 290 case 16: 291 case 255: 292 { 293 const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1])); 294 int32 maxLength = (length-1) / 2; // Max length of input string in uint16 characters 295 int32 allocationLength = maxLength*3+1; // Need at most 3 utf8 chars per uint16 char 296 fUtf8String = new(nothrow) char[allocationLength]; 297 if (fUtf8String) { 298 char *outputString = fUtf8String; 299 300 for (int32 i = 0; i < maxLength && inputString[i]; i++) { 301 unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString); 302 } 303 outputString[0] = 0; 304 } else { 305 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength)); 306 } 307 308 break; 309 } 310 311 default: 312 PRINT(("invalid compression id!\n")); 313 break; 314 } 315 } 316 317 void 318 UdfString::_Clear() 319 { 320 DEBUG_INIT("UdfString"); 321 322 delete [] fCs0String; 323 fCs0String = NULL; 324 delete [] fUtf8String; 325 fUtf8String = NULL; 326 } 327