1 /* 2 * Copyright 2003-2008, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Bachmann 7 */ 8 9 10 #include <CharacterSet.h> 11 #include <CharacterSetRoster.h> 12 #include <UTF8.h> 13 14 #include <errno.h> 15 #include <iconv.h> 16 #include <stdio.h> 17 18 19 //#define DEBUG_CONV 1 20 21 #ifdef DEBUG_CONV 22 # define DEBPRINT(ARGS) printf ARGS; 23 #else 24 # define DEBPRINT(ARGS) ; 25 #endif 26 27 using namespace BPrivate; 28 29 int iconvctl(iconv_t icd, int request, void* argument); 30 31 32 static void 33 discard_invalid_input_character(iconv_t* conversion, char** inputBuffer, 34 size_t* inputLeft) 35 { 36 if (*inputLeft == 0) 37 return; 38 39 char outputBuffer[1]; 40 41 // skip the invalid input character only 42 size_t left = 1; 43 for (; left <= *inputLeft; left ++) { 44 // reset internal state 45 iconv(*conversion, NULL, NULL, NULL, NULL); 46 47 char* buffer = *inputBuffer; 48 char* output = outputBuffer; 49 size_t outputLeft = 1; 50 size_t size = iconv(*conversion, &buffer, &left, 51 &output, &outputLeft); 52 53 if (size != (size_t)-1) { 54 // should not reach here 55 break; 56 } 57 58 if (errno == EINVAL) { 59 // too few input bytes provided, 60 // increase input buffer size and try again 61 continue; 62 } 63 64 if (errno == EILSEQ) { 65 // minimal size of input buffer found 66 break; 67 } 68 69 // should not reach here 70 }; 71 72 *inputBuffer += left; 73 *inputLeft -= left; 74 } 75 76 77 status_t 78 convert_encoding(const char* from, const char* to, const char* src, 79 int32* srcLen, char* dst, int32* dstLen, int32* state, 80 char substitute) 81 { 82 if (*srcLen == 0) { 83 // nothing to do! 84 *dstLen = 0; 85 return B_OK; 86 } 87 88 // TODO: this doesn't work, as the state is reset every time! 89 iconv_t conversion = iconv_open(to, from); 90 if (conversion == (iconv_t)-1) { 91 DEBPRINT(("iconv_open failed\n")); 92 return B_ERROR; 93 } 94 95 size_t outputLeft = *dstLen; 96 97 if (state == NULL || *state == 0) { 98 if (state != NULL) 99 *state = 1; 100 101 iconv(conversion, NULL, NULL, &dst, &outputLeft); 102 } 103 104 char** inputBuffer = const_cast<char**>(&src); 105 size_t inputLeft = *srcLen; 106 do { 107 size_t nonReversibleConversions = iconv(conversion, inputBuffer, 108 &inputLeft, &dst, &outputLeft); 109 if (nonReversibleConversions == (size_t)-1) { 110 if (errno == E2BIG) { 111 // Not enough room in the output buffer for the next converted character 112 // This is not a "real" error, we just quit out. 113 break; 114 } 115 116 switch (errno) { 117 case EILSEQ: // unable to generate a corresponding character 118 { 119 discard_invalid_input_character(&conversion, inputBuffer, 120 &inputLeft); 121 122 // prepare to convert the substitute character to target encoding 123 char original = substitute; 124 size_t len = 1; 125 char* copy = &original; 126 127 // Perform the conversion 128 // We ignore any errors during this as part of robustness/best-effort 129 // We use ISO-8859-1 as a source because it is a single byte encoding 130 // It also overlaps UTF-8 for the lower 128 characters. It is also 131 // likely to have a mapping to almost any target encoding. 132 iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1"); 133 if (iso8859_1to != (iconv_t)-1) { 134 iconv(iso8859_1to, 0, 0, 0, 0); 135 iconv(iso8859_1to, ©, &len, &dst, &outputLeft); 136 iconv_close(iso8859_1to); 137 } 138 break; 139 } 140 141 case EINVAL: // incomplete multibyte sequence at the end of the input 142 // TODO inputLeft bytes from inputBuffer should 143 // be stored in state variable, so that conversion 144 // can continue when the caller provides the missing 145 // bytes with the next call of this method 146 147 // we just eat bad bytes, as part of robustness/best-effort 148 inputBuffer++; 149 inputLeft--; 150 break; 151 152 default: 153 // unknown error, completely bail 154 status_t status = errno; 155 iconv_close(conversion); 156 return status; 157 } 158 } 159 } while (inputLeft > 0 && outputLeft > 0); 160 161 *srcLen -= inputLeft; 162 *dstLen -= outputLeft; 163 iconv_close(conversion); 164 165 return B_OK; 166 } 167 168 169 status_t 170 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen, 171 char* dst, int32* dstLen, int32* state, char substitute) 172 { 173 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID( 174 srcEncoding); 175 if (charset == NULL) 176 return B_ERROR; 177 178 #if DEBUG_CONV 179 fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName()); 180 for (int i = 0 ; i < *srcLen ; i++) { 181 fprintf(stderr, "%c", src[i]); 182 } 183 fprintf(stderr, "\"\n"); 184 #endif 185 186 return convert_encoding(charset->GetName(), "UTF-8", src, srcLen, 187 dst, dstLen, state, substitute); 188 } 189 190 191 status_t 192 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen, 193 char* dst, int32* dstLen, int32* state, char substitute) 194 { 195 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID( 196 dstEncoding); 197 if (charset == NULL) 198 return B_ERROR; 199 200 #if DEBUG_CONV 201 fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName()); 202 for (int i = 0 ; i < *srcLen ; i++) { 203 fprintf(stderr, "%c", src[i]); 204 } 205 fprintf(stderr, "\"\n"); 206 #endif 207 208 return convert_encoding("UTF-8", charset->GetName(), src, srcLen, 209 dst, dstLen, state, substitute); 210 } 211 212