1 /* 2 * Copyright 2014 Jonathan Schleifer <js@webkeks.org> 3 * Copyright 2014 Haiku, Inc. All rights reserved. 4 * 5 * Distributed under the terms of the MIT License. 6 * 7 * Authors: 8 * Jonathan Schleifer, js@webkeks.org 9 * John Scipione, jscipione@gmail.com 10 */ 11 12 13 #include "convertutf.h" 14 15 16 #include <ByteOrder.h> 17 #include <Errors.h> 18 #include <StorageDefs.h> 19 20 21 static inline size_t 22 glyph_length(uint32 glyph) 23 { 24 if (glyph < 0x80) 25 return 1; 26 else if (glyph < 0x800) 27 return 2; 28 else if (glyph < 0x10000) 29 return 3; 30 else if (glyph < 0x110000) 31 return 4; 32 33 return 0; 34 } 35 36 37 static void 38 encode_glyph(uint32 glyph, size_t glyphLength, char* buffer) 39 { 40 if (glyphLength == 1) { 41 *buffer = glyph; 42 } else if (glyphLength == 2) { 43 *buffer++ = 0xC0 | (glyph >> 6); 44 *buffer = 0x80 | (glyph & 0x3F); 45 } else if (glyphLength == 3) { 46 *buffer++ = 0xE0 | (glyph >> 12); 47 *buffer++ = 0x80 | (glyph >> 6 & 0x3F); 48 *buffer = 0x80 | (glyph & 0x3F); 49 } else if (glyphLength == 4) { 50 *buffer++ = 0xF0 | (glyph >> 18); 51 *buffer++ = 0x80 | (glyph >> 12 & 0x3F); 52 *buffer++ = 0x80 | (glyph >> 6 & 0x3F); 53 *buffer = 0x80 | (glyph & 0x3F); 54 } 55 } 56 57 58 static ssize_t 59 utf16_to_utf8(const uint16* source, size_t sourceCodeUnitCount, char* target, 60 size_t targetLength, bool isLittleEndian) 61 { 62 if (source == NULL || sourceCodeUnitCount == 0 63 || target == NULL || targetLength == 0) { 64 return B_BAD_VALUE; 65 } 66 67 ssize_t outLength = 0; 68 69 for (size_t i = 0; i < sourceCodeUnitCount; i++) { 70 uint32 glyph = isLittleEndian 71 ? B_LENDIAN_TO_HOST_INT32(source[i]) 72 : B_BENDIAN_TO_HOST_INT32(source[i]); 73 74 if ((glyph & 0xFC00) == 0xDC00) { 75 // missing high surrogate 76 return B_BAD_VALUE; 77 } 78 79 if ((glyph & 0xFC00) == 0xD800) { 80 if (sourceCodeUnitCount <= i + 1) { 81 // high surrogate at end of string 82 return B_BAD_VALUE; 83 } 84 85 uint32 low = isLittleEndian 86 ? B_LENDIAN_TO_HOST_INT32(source[i + 1]) 87 : B_BENDIAN_TO_HOST_INT32(source[i + 1]); 88 if ((low & 0xFC00) != 0xDC00) { 89 // missing low surrogate 90 return B_BAD_VALUE; 91 } 92 93 glyph = (((glyph & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; 94 i++; 95 } 96 97 size_t glyphLength = glyph_length(glyph); 98 if (glyphLength == 0) 99 return B_BAD_VALUE; 100 else if (outLength + glyphLength >= targetLength 101 || outLength + glyphLength >= B_FILE_NAME_LENGTH) { 102 // NUL terminate the string so the caller can use the 103 // abbreviated version in this case. Since the length 104 // isn't returned the caller will need to call strlen() 105 // to get the length of the string. 106 target[outLength] = '\0'; 107 return B_NAME_TOO_LONG; 108 } 109 110 encode_glyph(glyph, glyphLength, target + outLength); 111 outLength += glyphLength; 112 } 113 114 target[outLength] = '\0'; 115 116 return outLength; 117 } 118 119 120 ssize_t 121 utf16le_to_utf8(const uint16* source, size_t sourceCodeUnitCount, 122 char* target, size_t targetLength) 123 { 124 return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength, 125 true); 126 } 127 128 129 ssize_t 130 utf16be_to_utf8(const uint16* source, size_t sourceCodeUnitCount, 131 char* target, size_t targetLength) 132 { 133 return utf16_to_utf8(source, sourceCodeUnitCount, target, targetLength, 134 false); 135 } 136