1 #include <UTF8.h> 2 #include <iconv.h> 3 #include <CharacterSet.h> 4 #include <CharacterSetRoster.h> 5 #include <Errors.h> 6 #include <errno.h> 7 #include <stdio.h> 8 #include <Debug.h> 9 10 using namespace BPrivate; 11 12 typedef char ** input_buffer_t; 13 14 int iconvctl (iconv_t icd, int request, void* argument); 15 16 status_t 17 convert_encoding(const char * from, const char * to, 18 const char * src, int32 * srcLen, 19 char * dst, int32 * dstLen, 20 int32 * state, char substitute) 21 { 22 status_t status; 23 if (*srcLen == 0) { 24 // nothing to do! 25 PRINT(("nothing to do\n")); 26 return B_OK; 27 } 28 iconv_t conversion = iconv_open(to,from); 29 if (conversion == (iconv_t)-1) { 30 PRINT(("iconv_open failed\n")); 31 return B_ERROR; 32 } 33 if ((state == NULL) || (*state == 0)) { 34 iconv(conversion,0,0,0,0); 35 } 36 input_buffer_t inputBuffer = const_cast<input_buffer_t>(&src); 37 size_t inputLeft = *srcLen; 38 size_t outputLeft = *dstLen; 39 do { 40 size_t nonReversibleConversions = iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft); 41 if (nonReversibleConversions == (size_t)-1) { 42 if (errno == E2BIG) { 43 // Not enough room in the output buffer for the next converted character 44 // This is not a "real" error, we just quit out. 45 break; 46 } 47 switch (errno) { 48 case EILSEQ: // unable to generate a corresponding character 49 { 50 // discard the input character 51 const int one = 1, zero = 0; 52 iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&one); 53 iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft); 54 iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&zero); 55 // prepare to convert the substitute character to target encoding 56 char * original = new char[1]; 57 original[0] = substitute; 58 size_t len = 1; 59 char * copy = original; 60 // Perform the conversion 61 // We ignore any errors during this as part of robustness/best-effort 62 // We use ISO-8859-1 as a source because it is a single byte encoding 63 // It also overlaps UTF-8 for the lower 128 characters. It is also 64 // likely to have a mapping to almost any target encoding. 65 iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1"); 66 if (iso8859_1to != (iconv_t)-1) { 67 iconv(iso8859_1to,0,0,0,0); 68 iconv(iso8859_1to,const_cast<input_buffer_t>(©),&len,&dst,&outputLeft); 69 iconv_close(iso8859_1to); 70 } 71 delete original; 72 } 73 break; 74 case EINVAL: // incomplete multibyte sequence in the input 75 // we just eat bad bytes, as part of robustness/best-effort 76 inputBuffer++; 77 inputLeft--; 78 break; 79 default: 80 // unknown error, completely bail 81 status = errno; 82 iconv_close(conversion); 83 return status; 84 } 85 } 86 } while ((inputLeft > 0) && (outputLeft > 0)); 87 *srcLen -= inputLeft; 88 *dstLen -= outputLeft; 89 iconv_close(conversion); 90 if (*srcLen != 0) { 91 // able to convert at least one character 92 PRINT(("able to convert at least one character\n")); 93 return B_OK; 94 } else { 95 // not able to convert at least one character 96 PRINT(("not able to convert at least one character\n")); 97 return B_ERROR; 98 } 99 } 100 101 status_t 102 convert_to_utf8(uint32 srcEncoding, 103 const char * src, int32 * srcLen, 104 char * dst, int32 * dstLen, 105 int32 * state, char substitute = B_SUBSTITUTE) 106 { 107 const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(srcEncoding); 108 if (charset == 0) { 109 return B_ERROR; 110 } 111 #if DEBUG 112 fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName()); 113 for (int i = 0 ; i < *srcLen ; i++) { 114 fprintf(stderr, "%c", src[i]); 115 } 116 fprintf(stderr, "\"\n"); 117 #endif 118 return convert_encoding(charset->GetName(),"UTF-8",src,srcLen,dst,dstLen,state,substitute); 119 } 120 121 status_t 122 convert_from_utf8(uint32 dstEncoding, 123 const char * src, int32 * srcLen, 124 char * dst, int32 * dstLen, 125 int32 * state, char substitute = B_SUBSTITUTE) 126 { 127 const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(dstEncoding); 128 if (charset == 0) { 129 return B_ERROR; 130 } 131 #if DEBUG 132 fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName()); 133 for (int i = 0 ; i < *srcLen ; i++) { 134 fprintf(stderr, "%c", src[i]); 135 } 136 fprintf(stderr, "\"\n"); 137 #endif 138 return convert_encoding("UTF-8",charset->GetName(),src,srcLen,dst,dstLen,state,substitute); 139 } 140