1 /* 2 * Copyright 2003-2007, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Bachmann 7 */ 8 9 10 #include <CharacterSet.h> 11 #include <CharacterSetRoster.h> 12 #include <UTF8.h> 13 14 #include <errno.h> 15 #include <iconv.h> 16 #include <stdio.h> 17 18 19 //#define DEBUG_CONV 1 20 21 #ifdef DEBUG_CONV 22 # define DEBPRINT(ARGS) printf ARGS; 23 #else 24 # define DEBPRINT(ARGS) ; 25 #endif 26 27 using namespace BPrivate; 28 29 int iconvctl(iconv_t icd, int request, void* argument); 30 31 32 status_t 33 convert_encoding(const char* from, const char* to, const char* src, 34 int32* srcLen, char* dst, int32* dstLen, int32* state, 35 char substitute) 36 { 37 if (*srcLen == 0) { 38 // nothing to do! 39 *dstLen = 0; 40 return B_OK; 41 } 42 43 // TODO: this doesn't work, as the state is reset every time! 44 iconv_t conversion = iconv_open(to, from); 45 if (conversion == (iconv_t)-1) { 46 DEBPRINT(("iconv_open failed\n")); 47 return B_ERROR; 48 } 49 50 size_t outputLeft = *dstLen; 51 52 if (state == NULL || *state == 0) { 53 if (state != NULL) 54 *state = 1; 55 56 iconv(conversion, NULL, NULL, &dst, &outputLeft); 57 } 58 59 char** inputBuffer = const_cast<char**>(&src); 60 size_t inputLeft = *srcLen; 61 do { 62 size_t nonReversibleConversions = iconv(conversion, inputBuffer, 63 &inputLeft, &dst, &outputLeft); 64 if (nonReversibleConversions == (size_t)-1) { 65 if (errno == E2BIG) { 66 // Not enough room in the output buffer for the next converted character 67 // This is not a "real" error, we just quit out. 68 break; 69 } 70 71 switch (errno) { 72 case EILSEQ: // unable to generate a corresponding character 73 { 74 // discard the input character 75 const int one = 1, zero = 0; 76 iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&one); 77 iconv(conversion, inputBuffer, &inputLeft, &dst, &outputLeft); 78 iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&zero); 79 80 // prepare to convert the substitute character to target encoding 81 char* original = new char[1]; 82 original[0] = substitute; 83 size_t len = 1; 84 char* copy = original; 85 86 // Perform the conversion 87 // We ignore any errors during this as part of robustness/best-effort 88 // We use ISO-8859-1 as a source because it is a single byte encoding 89 // It also overlaps UTF-8 for the lower 128 characters. It is also 90 // likely to have a mapping to almost any target encoding. 91 iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1"); 92 if (iso8859_1to != (iconv_t)-1) { 93 iconv(iso8859_1to, 0, 0, 0, 0); 94 iconv(iso8859_1to, const_cast<char**>(©), &len, &dst, 95 &outputLeft); 96 iconv_close(iso8859_1to); 97 } 98 delete[] original; 99 break; 100 } 101 102 case EINVAL: // incomplete multibyte sequence in the input 103 // we just eat bad bytes, as part of robustness/best-effort 104 inputBuffer++; 105 inputLeft--; 106 break; 107 108 default: 109 // unknown error, completely bail 110 status_t status = errno; 111 iconv_close(conversion); 112 return status; 113 } 114 } 115 } while (inputLeft > 0 && outputLeft > 0); 116 117 *srcLen -= inputLeft; 118 *dstLen -= outputLeft; 119 iconv_close(conversion); 120 121 return B_OK; 122 } 123 124 125 status_t 126 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen, 127 char* dst, int32* dstLen, int32* state, char substitute) 128 { 129 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID( 130 srcEncoding); 131 if (charset == NULL) 132 return B_ERROR; 133 134 #if DEBUG_CONV 135 fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName()); 136 for (int i = 0 ; i < *srcLen ; i++) { 137 fprintf(stderr, "%c", src[i]); 138 } 139 fprintf(stderr, "\"\n"); 140 #endif 141 142 return convert_encoding(charset->GetName(), "UTF-8", src, srcLen, 143 dst, dstLen, state, substitute); 144 } 145 146 147 status_t 148 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen, 149 char* dst, int32* dstLen, int32* state, char substitute) 150 { 151 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID( 152 dstEncoding); 153 if (charset == NULL) 154 return B_ERROR; 155 156 #if DEBUG_CONV 157 fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName()); 158 for (int i = 0 ; i < *srcLen ; i++) { 159 fprintf(stderr, "%c", src[i]); 160 } 161 fprintf(stderr, "\"\n"); 162 #endif 163 164 return convert_encoding("UTF-8", charset->GetName(), src, srcLen, 165 dst, dstLen, state, substitute); 166 } 167 168