1 /* 2 * Copyright 2003-2008, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Bachmann 7 */ 8 9 10 #include <CharacterSet.h> 11 #include <CharacterSetRoster.h> 12 #include <UTF8.h> 13 14 #include <errno.h> 15 #include <iconv.h> 16 #include <stdio.h> 17 18 19 //#define DEBUG_CONV 1 20 21 #ifdef DEBUG_CONV 22 # define DEBPRINT(ARGS) printf ARGS; 23 #else 24 # define DEBPRINT(ARGS) ; 25 #endif 26 27 using namespace BPrivate; 28 29 int iconvctl(iconv_t icd, int request, void* argument); 30 31 32 status_t 33 convert_encoding(const char* from, const char* to, const char* src, 34 int32* srcLen, char* dst, int32* dstLen, int32* state, 35 char substitute) 36 { 37 if (*srcLen == 0) { 38 // nothing to do! 39 *dstLen = 0; 40 return B_OK; 41 } 42 43 // TODO: this doesn't work, as the state is reset every time! 44 iconv_t conversion = iconv_open(to, from); 45 if (conversion == (iconv_t)-1) { 46 DEBPRINT(("iconv_open failed\n")); 47 return B_ERROR; 48 } 49 50 size_t outputLeft = *dstLen; 51 52 if (state == NULL || *state == 0) { 53 if (state != NULL) 54 *state = 1; 55 56 iconv(conversion, NULL, NULL, &dst, &outputLeft); 57 } 58 59 char** inputBuffer = const_cast<char**>(&src); 60 size_t inputLeft = *srcLen; 61 do { 62 size_t nonReversibleConversions = iconv(conversion, inputBuffer, 63 &inputLeft, &dst, &outputLeft); 64 if (nonReversibleConversions == (size_t)-1) { 65 if (errno == E2BIG) { 66 // Not enough room in the output buffer for the next converted character 67 // This is not a "real" error, we just quit out. 68 break; 69 } 70 71 switch (errno) { 72 case EILSEQ: // unable to generate a corresponding character 73 { 74 // discard the input character 75 const int one = 1, zero = 0; 76 iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&one); 77 iconv(conversion, inputBuffer, &inputLeft, &dst, &outputLeft); 78 iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&zero); 79 80 // prepare to convert the substitute character to target encoding 81 char original = substitute; 82 size_t len = 1; 83 char* copy = &original; 84 85 // Perform the conversion 86 // We ignore any errors during this as part of robustness/best-effort 87 // We use ISO-8859-1 as a source because it is a single byte encoding 88 // It also overlaps UTF-8 for the lower 128 characters. It is also 89 // likely to have a mapping to almost any target encoding. 90 iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1"); 91 if (iso8859_1to != (iconv_t)-1) { 92 iconv(iso8859_1to, 0, 0, 0, 0); 93 iconv(iso8859_1to, ©, &len, &dst, &outputLeft); 94 iconv_close(iso8859_1to); 95 } 96 break; 97 } 98 99 case EINVAL: // incomplete multibyte sequence in the input 100 // we just eat bad bytes, as part of robustness/best-effort 101 inputBuffer++; 102 inputLeft--; 103 break; 104 105 default: 106 // unknown error, completely bail 107 status_t status = errno; 108 iconv_close(conversion); 109 return status; 110 } 111 } 112 } while (inputLeft > 0 && outputLeft > 0); 113 114 *srcLen -= inputLeft; 115 *dstLen -= outputLeft; 116 iconv_close(conversion); 117 118 return B_OK; 119 } 120 121 122 status_t 123 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen, 124 char* dst, int32* dstLen, int32* state, char substitute) 125 { 126 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID( 127 srcEncoding); 128 if (charset == NULL) 129 return B_ERROR; 130 131 #if DEBUG_CONV 132 fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName()); 133 for (int i = 0 ; i < *srcLen ; i++) { 134 fprintf(stderr, "%c", src[i]); 135 } 136 fprintf(stderr, "\"\n"); 137 #endif 138 139 return convert_encoding(charset->GetName(), "UTF-8", src, srcLen, 140 dst, dstLen, state, substitute); 141 } 142 143 144 status_t 145 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen, 146 char* dst, int32* dstLen, int32* state, char substitute) 147 { 148 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID( 149 dstEncoding); 150 if (charset == NULL) 151 return B_ERROR; 152 153 #if DEBUG_CONV 154 fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName()); 155 for (int i = 0 ; i < *srcLen ; i++) { 156 fprintf(stderr, "%c", src[i]); 157 } 158 fprintf(stderr, "\"\n"); 159 #endif 160 161 return convert_encoding("UTF-8", charset->GetName(), src, srcLen, 162 dst, dstLen, state, substitute); 163 } 164 165