1 #include "UdfString.h" 2 3 #include "ByteOrder.h" 4 5 6 /*! \brief Converts the given unicode character to utf8. 7 8 \param c The unicode character. 9 \param out Pointer to a C-string of at least 4 characters 10 long into which the output utf8 characters will 11 be written. The string that is pointed to will 12 be incremented to reflect the number of characters 13 written, i.e. if \a out initially points to a pointer 14 to the first character in string named \c str, and 15 the function writes 4 characters to \c str, then 16 upon returning, out will point to a pointer to 17 the fifth character in \c str. 18 */ 19 static 20 void 21 unicode_to_utf8(uint32 c, char **out) 22 { 23 char *s = *out; 24 25 if (c < 0x80) 26 *(s++) = c; 27 else if (c < 0x800) { 28 *(s++) = 0xc0 | (c>>6); 29 *(s++) = 0x80 | (c & 0x3f); 30 } else if (c < 0x10000) { 31 *(s++) = 0xe0 | (c>>12); 32 *(s++) = 0x80 | ((c>>6) & 0x3f); 33 *(s++) = 0x80 | (c & 0x3f); 34 } else if (c <= 0x10ffff) { 35 *(s++) = 0xf0 | (c>>18); 36 *(s++) = 0x80 | ((c>>12) & 0x3f); 37 *(s++) = 0x80 | ((c>>6) & 0x3f); 38 *(s++) = 0x80 | (c & 0x3f); 39 } 40 *out = s; 41 } 42 43 /*! \brief Converts the given utf8 character to 4-byte unicode. 44 45 \param in Pointer to a C-String from which utf8 characters 46 will be read. *in will be incremented to reflect 47 the number of characters read, similarly to the 48 \c out parameter for Udf::unicode_to_utf8(). 49 50 \return The 4-byte unicode character, or **in if passed an 51 invalid character, or 0 if passed any NULL pointers. 52 */ 53 static 54 uint32 55 utf8_to_unicode(const char **in) 56 { 57 if (!in) 58 return 0; 59 uint8 *bytes = (uint8 *)*in; 60 if (!bytes) 61 return 0; 62 63 int32 length; 64 uint8 mask = 0x1f; 65 66 switch (bytes[0] & 0xf0) { 67 case 0xc0: 68 case 0xd0: length = 2; break; 69 case 0xe0: length = 3; break; 70 case 0xf0: 71 mask = 0x0f; 72 length = 4; 73 break; 74 default: 75 // valid 1-byte character 76 // and invalid characters 77 (*in)++; 78 return bytes[0]; 79 } 80 uint32 c = bytes[0] & mask; 81 int32 i = 1; 82 for (;i < length && (bytes[i] & 0x80) > 0;i++) 83 c = (c << 6) | (bytes[i] & 0x3f); 84 85 if (i < length) { 86 // invalid character 87 (*in)++; 88 return (uint32)bytes[0]; 89 } 90 *in += length; 91 return c; 92 } 93 94 using namespace Udf; 95 96 /*! \brief Creates an empty string object. 97 */ 98 String::String() 99 : fCs0String(NULL) 100 , fUtf8String(NULL) 101 { 102 } 103 104 /*! \brief Creates a new String object from the given Utf8 string. 105 */ 106 String::String(const char *utf8) 107 : fCs0String(NULL) 108 , fUtf8String(NULL) 109 { 110 SetTo(utf8); 111 } 112 113 /*! \brief Creates a new String object from the given Cs0 string. 114 */ 115 String::String(const char *cs0, uint32 length) 116 : fCs0String(NULL) 117 , fUtf8String(NULL) 118 { 119 SetTo(cs0, length); 120 } 121 122 String::~String() 123 { 124 DEBUG_INIT("String"); 125 126 _Clear(); 127 } 128 129 /*! \brief Assignment from a Utf8 string. 130 */ 131 void 132 String::SetTo(const char *utf8) 133 { 134 DEBUG_INIT_ETC("String", ("utf8: `%s', strlen(utf8): %ld", utf8, 135 utf8 ? strlen(utf8) : 0)); 136 _Clear(); 137 if (!utf8) { 138 PRINT(("passed NULL utf8 string\n")); 139 return; 140 } 141 uint32 length = strlen(utf8); 142 // First copy the utf8 string 143 fUtf8String = new(nothrow) char[length+1]; 144 if (!fUtf8String){ 145 PRINT(("new fUtf8String[%ld] allocation failed\n", length+1)); 146 return; 147 } 148 memcpy(fUtf8String, utf8, length+1); 149 // Next convert to raw 4-byte unicode. Then we'll do some 150 // analysis to figure out if we have any invalid characters, 151 // and whether we can get away with compressed 8-bit unicode, 152 // or have to use burly 16-bit unicode. 153 uint32 *raw = new(nothrow) uint32[length]; 154 if (!raw) { 155 PRINT(("new uint32 raw[%ld] temporary string allocation failed\n", length)); 156 _Clear(); 157 return; 158 } 159 const char *in = utf8; 160 uint32 rawLength = 0; 161 for (uint32 i = 0; i < length && uint32(in-utf8) < length; i++, rawLength++) 162 raw[i] = utf8_to_unicode(&in); 163 // Check for invalids. 164 uint32 mask = 0xffff0000; 165 for (uint32 i = 0; i < rawLength; i++) { 166 if (raw[i] & mask) { 167 PRINT(("WARNING: utf8 string contained a multi-byte sequence which " 168 "was converted into a unicode character larger than 16-bits; " 169 "character will be converted to an underscore character for " 170 "safety.\n")); 171 raw[i] = '_'; 172 } 173 } 174 // See if we can get away with 8-bit compressed unicode 175 mask = 0xffffff00; 176 bool canUse8bit = true; 177 for (uint32 i = 0; i < rawLength; i++) { 178 if (raw[i] & mask) { 179 canUse8bit = false; 180 break; 181 } 182 } 183 // Build our cs0 string 184 if (canUse8bit) { 185 fCs0Length = rawLength+1; 186 fCs0String = new(nothrow) char[fCs0Length]; 187 if (fCs0String) { 188 fCs0String[0] = '\x08'; // 8-bit compressed unicode 189 for (uint32 i = 0; i < rawLength; i++) 190 fCs0String[i+1] = raw[i] % 256; 191 } else { 192 PRINT(("new fCs0String[%ld] allocation failed\n", fCs0Length)); 193 _Clear(); 194 return; 195 } 196 } else { 197 fCs0Length = rawLength*2+1; 198 fCs0String = new(nothrow) char[fCs0Length]; 199 if (fCs0String) { 200 uint32 pos = 0; 201 fCs0String[pos++] = '\x10'; // 16-bit unicode 202 for (uint32 i = 0; i < rawLength; i++) { 203 // 16-bit unicode chars must be written big endian 204 uint16 value = uint16(raw[i]); 205 uint8 high = uint8(value >> 8 & 0xff); 206 uint8 low = uint8(value & 0xff); 207 fCs0String[pos++] = high; 208 fCs0String[pos++] = low; 209 } 210 } else { 211 PRINT(("new fCs0String[%ld] allocation failed\n", fCs0Length)); 212 _Clear(); 213 return; 214 } 215 } 216 // Clean up 217 delete [] raw; 218 raw = NULL; 219 } 220 221 /*! \brief Assignment from a Cs0 string. 222 */ 223 void 224 String::SetTo(const char *cs0, uint32 length) 225 { 226 DEBUG_INIT_ETC("String", ("cs0: %p, length: %ld", cs0, length)); 227 228 _Clear(); 229 if (length == 0) 230 return; 231 if (!cs0) { 232 PRINT(("passed NULL cs0 string\n")); 233 return; 234 } 235 236 // First copy the Cs0 string and length 237 fCs0String = new(nothrow) char[length]; 238 if (fCs0String) { 239 memcpy(fCs0String, cs0, length); 240 fCs0Length = length; 241 } else { 242 PRINT(("new fCs0String[%ld] allocation failed\n", length)); 243 return; 244 } 245 246 // Now convert to utf8 247 248 // The first byte of the CS0 string is the compression ID. 249 // - 8: 1 byte characters 250 // - 16: 2 byte, big endian characters 251 // - 254: "CS0 expansion is empty and unique", 1 byte characters 252 // - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters 253 PRINT(("compression ID: %d\n", cs0[0])); 254 switch (reinterpret_cast<const uint8*>(cs0)[0]) { 255 case 8: 256 case 254: 257 { 258 const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1])); 259 int32 maxLength = length-1; // Max length of input string in uint8 characters 260 int32 allocationLength = maxLength*2+1; // Need at most 2 utf8 chars per uint8 char 261 fUtf8String = new(nothrow) char[allocationLength]; 262 if (fUtf8String) { 263 char *outputString = fUtf8String; 264 265 for (int32 i = 0; i < maxLength && inputString[i]; i++) { 266 unicode_to_utf8(inputString[i], &outputString); 267 } 268 outputString[0] = 0; 269 } else { 270 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength)); 271 } 272 273 break; 274 } 275 276 case 16: 277 case 255: 278 { 279 const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1])); 280 int32 maxLength = (length-1) / 2; // Max length of input string in uint16 characters 281 int32 allocationLength = maxLength*3+1; // Need at most 3 utf8 chars per uint16 char 282 fUtf8String = new(nothrow) char[allocationLength]; 283 if (fUtf8String) { 284 char *outputString = fUtf8String; 285 286 for (int32 i = 0; i < maxLength && inputString[i]; i++) { 287 unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString); 288 } 289 outputString[0] = 0; 290 } else { 291 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength)); 292 } 293 294 break; 295 } 296 297 default: 298 PRINT(("invalid compression id!\n")); 299 break; 300 } 301 } 302 303 void 304 String::_Clear() 305 { 306 DEBUG_INIT("String"); 307 308 delete [] fCs0String; 309 fCs0String = NULL; 310 delete [] fUtf8String; 311 fUtf8String = NULL; 312 } 313