1 #include <UTF8.h> 2 #include <iconv.h> 3 #include <CharacterSet.h> 4 #include <CharacterSetRoster.h> 5 #include <Errors.h> 6 #include <errno.h> 7 #include <stdio.h> 8 #include <Debug.h> 9 10 //#define DEBUG_CONV 1 11 12 #ifdef DEBUG_CONV 13 #define DEBPRINT(ARGS) printf ARGS; 14 #else 15 #define DEBPRINT(ARGS) ; 16 #endif 17 18 using namespace BPrivate; 19 20 typedef char ** input_buffer_t; 21 22 int iconvctl (iconv_t icd, int request, void* argument); 23 24 status_t 25 convert_encoding(const char * from, const char * to, 26 const char * src, int32 * srcLen, 27 char * dst, int32 * dstLen, 28 int32 * state, char substitute) 29 { 30 status_t status; 31 if (*srcLen == 0) { 32 // nothing to do! 33 DEBPRINT(("nothing to do\n")); 34 return B_OK; 35 } 36 iconv_t conversion = iconv_open(to,from); 37 if (conversion == (iconv_t)-1) { 38 DEBPRINT(("iconv_open failed\n")); 39 return B_ERROR; 40 } 41 if ((state == NULL) || (*state == 0)) { 42 iconv(conversion,0,0,0,0); 43 } 44 input_buffer_t inputBuffer = const_cast<input_buffer_t>(&src); 45 size_t inputLeft = *srcLen; 46 size_t outputLeft = *dstLen; 47 do { 48 size_t nonReversibleConversions = iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft); 49 if (nonReversibleConversions == (size_t)-1) { 50 if (errno == E2BIG) { 51 // Not enough room in the output buffer for the next converted character 52 // This is not a "real" error, we just quit out. 53 break; 54 } 55 switch (errno) { 56 case EILSEQ: // unable to generate a corresponding character 57 { 58 // discard the input character 59 const int one = 1, zero = 0; 60 iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&one); 61 iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft); 62 iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&zero); 63 // prepare to convert the substitute character to target encoding 64 char * original = new char[1]; 65 original[0] = substitute; 66 size_t len = 1; 67 char * copy = original; 68 // Perform the conversion 69 // We ignore any errors during this as part of robustness/best-effort 70 // We use ISO-8859-1 as a source because it is a single byte encoding 71 // It also overlaps UTF-8 for the lower 128 characters. It is also 72 // likely to have a mapping to almost any target encoding. 73 iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1"); 74 if (iso8859_1to != (iconv_t)-1) { 75 iconv(iso8859_1to,0,0,0,0); 76 iconv(iso8859_1to,const_cast<input_buffer_t>(©),&len,&dst,&outputLeft); 77 iconv_close(iso8859_1to); 78 } 79 delete original; 80 } 81 break; 82 case EINVAL: // incomplete multibyte sequence in the input 83 // we just eat bad bytes, as part of robustness/best-effort 84 inputBuffer++; 85 inputLeft--; 86 break; 87 default: 88 // unknown error, completely bail 89 status = errno; 90 iconv_close(conversion); 91 return status; 92 } 93 } 94 } while ((inputLeft > 0) && (outputLeft > 0)); 95 *srcLen -= inputLeft; 96 *dstLen -= outputLeft; 97 iconv_close(conversion); 98 if (*srcLen != 0) { 99 // able to convert at least one character 100 DEBPRINT(("able to convert at least one character\n")); 101 return B_OK; 102 } else { 103 // not able to convert at least one character 104 DEBPRINT(("not able to convert at least one character\n")); 105 return B_ERROR; 106 } 107 } 108 109 status_t 110 convert_to_utf8(uint32 srcEncoding, 111 const char * src, int32 * srcLen, 112 char * dst, int32 * dstLen, 113 int32 * state, char substitute) 114 { 115 const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(srcEncoding); 116 if (charset == 0) { 117 return B_ERROR; 118 } 119 #if DEBUG_CONV 120 fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName()); 121 for (int i = 0 ; i < *srcLen ; i++) { 122 fprintf(stderr, "%c", src[i]); 123 } 124 fprintf(stderr, "\"\n"); 125 #endif 126 return convert_encoding(charset->GetName(),"UTF-8",src,srcLen,dst,dstLen,state,substitute); 127 } 128 129 status_t 130 convert_from_utf8(uint32 dstEncoding, 131 const char * src, int32 * srcLen, 132 char * dst, int32 * dstLen, 133 int32 * state, char substitute) 134 { 135 const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(dstEncoding); 136 if (charset == 0) { 137 return B_ERROR; 138 } 139 #if DEBUG_CONV 140 fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName()); 141 for (int i = 0 ; i < *srcLen ; i++) { 142 fprintf(stderr, "%c", src[i]); 143 } 144 fprintf(stderr, "\"\n"); 145 #endif 146 return convert_encoding("UTF-8",charset->GetName(),src,srcLen,dst,dstLen,state,substitute); 147 } 148