1 /*
2 * Copyright 2003-2008, Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
4 *
5 * Authors:
6 * Andrew Bachmann
7 */
8
9
10 #include <CharacterSet.h>
11 #include <CharacterSetRoster.h>
12 #include <UTF8.h>
13
14 #include <errno.h>
15 #include <iconv.h>
16 #include <stdio.h>
17
18
// Uncomment to enable debug output from the conversion functions.
//#define DEBUG_CONV 1

// DEBPRINT() takes a complete, parenthesized printf() argument list, e.g.
// DEBPRINT(("value: %d\n", value));
// Both variants expand to exactly one statement that still needs the
// caller's trailing semicolon, so the macro can be used safely as the body
// of an unbraced if/else.
#ifdef DEBUG_CONV
#	define DEBPRINT(ARGS) do { printf ARGS; } while (0)
#else
#	define DEBPRINT(ARGS) do { } while (0)
#endif
26
// Presumably brings BCharacterSet and BCharacterSetRoster, which are used
// unqualified below, into scope -- confirm against their headers.
using namespace BPrivate;

// NOTE(review): iconvctl() is only declared here; nothing in the visible part
// of this file calls it. Check whether the declaration is still needed.
int iconvctl(iconv_t icd, int request, void* argument);
30
31
// Skips the bytes at the start of *inputBuffer that the converter rejects.
//
// The length of the offending sequence is found by resetting the converter
// and feeding it 1, 2, 3, ... bytes until it answers with EILSEQ. Afterwards
// *inputBuffer is advanced past the sequence and *inputLeft is reduced by the
// same amount. Never more than *inputLeft bytes are skipped.
//
// Note that the conversion state of *conversion is reset as a side effect.
static void
discard_invalid_input_character(iconv_t* conversion, char** inputBuffer,
	size_t* inputLeft)
{
	if (*inputLeft == 0)
		return;

	char outputBuffer[1];

	// If every probe below only reports an incomplete sequence, the whole
	// remaining input is a truncated character and gets dropped. Bounding the
	// skip by *inputLeft keeps the pointer inside the input and prevents the
	// counter from wrapping around.
	size_t skip = *inputLeft;

	// skip the invalid input character only
	for (size_t probeLength = 1; probeLength <= *inputLeft; probeLength++) {
		// reset internal state
		iconv(*conversion, NULL, NULL, NULL, NULL);

		// iconv() advances the pointers and decrements the counters it is
		// given, so it only ever gets scratch copies; the probe length itself
		// must stay untouched.
		char* probe = *inputBuffer;
		size_t probeLeft = probeLength;
		char* output = outputBuffer;
		size_t outputLeft = sizeof(outputBuffer);
		size_t size = iconv(*conversion, &probe, &probeLeft,
			&output, &outputLeft);

		if (size != (size_t)-1) {
			// should not reach here: the bytes converted cleanly, so there
			// is nothing to discard
			skip = 0;
			break;
		}

		if (errno == EILSEQ) {
			// minimal size of input buffer found
			skip = probeLength;
			break;
		}

		// EINVAL: too few input bytes provided, increase the probe length
		// and try again. Any other error should not occur and is handled
		// the same way.
	}

	*inputBuffer += skip;
	*inputLeft -= skip;
}
75
76
77 status_t
convert_encoding(const char * from,const char * to,const char * src,int32 * srcLen,char * dst,int32 * dstLen,int32 * state,char substitute)78 convert_encoding(const char* from, const char* to, const char* src,
79 int32* srcLen, char* dst, int32* dstLen, int32* state,
80 char substitute)
81 {
82 if (*srcLen == 0) {
83 // nothing to do!
84 *dstLen = 0;
85 return B_OK;
86 }
87
88 // TODO: this doesn't work, as the state is reset every time!
89 iconv_t conversion = iconv_open(to, from);
90 if (conversion == (iconv_t)-1) {
91 DEBPRINT(("iconv_open failed\n"));
92 return B_ERROR;
93 }
94
95 size_t outputLeft = *dstLen;
96
97 if (state == NULL || *state == 0) {
98 if (state != NULL)
99 *state = 1;
100
101 iconv(conversion, NULL, NULL, &dst, &outputLeft);
102 }
103
104 char** inputBuffer = const_cast<char**>(&src);
105 size_t inputLeft = *srcLen;
106 do {
107 size_t nonReversibleConversions = iconv(conversion, inputBuffer,
108 &inputLeft, &dst, &outputLeft);
109 if (nonReversibleConversions == (size_t)-1) {
110 if (errno == E2BIG) {
111 // Not enough room in the output buffer for the next converted character
112 // This is not a "real" error, we just quit out.
113 break;
114 }
115
116 switch (errno) {
117 case EILSEQ: // unable to generate a corresponding character
118 {
119 discard_invalid_input_character(&conversion, inputBuffer,
120 &inputLeft);
121
122 // prepare to convert the substitute character to target encoding
123 char original = substitute;
124 size_t len = 1;
125 char* copy = &original;
126
127 // Perform the conversion
128 // We ignore any errors during this as part of robustness/best-effort
129 // We use ISO-8859-1 as a source because it is a single byte encoding
130 // It also overlaps UTF-8 for the lower 128 characters. It is also
131 // likely to have a mapping to almost any target encoding.
132 iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
133 if (iso8859_1to != (iconv_t)-1) {
134 iconv(iso8859_1to, 0, 0, 0, 0);
135 iconv(iso8859_1to, ©, &len, &dst, &outputLeft);
136 iconv_close(iso8859_1to);
137 }
138 break;
139 }
140
141 case EINVAL: // incomplete multibyte sequence at the end of the input
142 // TODO inputLeft bytes from inputBuffer should
143 // be stored in state variable, so that conversion
144 // can continue when the caller provides the missing
145 // bytes with the next call of this method
146
147 // we just eat bad bytes, as part of robustness/best-effort
148 inputBuffer++;
149 inputLeft--;
150 break;
151
152 default:
153 // unknown error, completely bail
154 status_t status = errno;
155 iconv_close(conversion);
156 return status;
157 }
158 }
159 } while (inputLeft > 0 && outputLeft > 0);
160
161 *srcLen -= inputLeft;
162 *dstLen -= outputLeft;
163 iconv_close(conversion);
164
165 return B_OK;
166 }
167
168
169 status_t
convert_to_utf8(uint32 srcEncoding,const char * src,int32 * srcLen,char * dst,int32 * dstLen,int32 * state,char substitute)170 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen,
171 char* dst, int32* dstLen, int32* state, char substitute)
172 {
173 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
174 srcEncoding);
175 if (charset == NULL)
176 return B_ERROR;
177
178 #if DEBUG_CONV
179 fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
180 for (int i = 0 ; i < *srcLen ; i++) {
181 fprintf(stderr, "%c", src[i]);
182 }
183 fprintf(stderr, "\"\n");
184 #endif
185
186 return convert_encoding(charset->GetName(), "UTF-8", src, srcLen,
187 dst, dstLen, state, substitute);
188 }
189
190
191 status_t
convert_from_utf8(uint32 dstEncoding,const char * src,int32 * srcLen,char * dst,int32 * dstLen,int32 * state,char substitute)192 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen,
193 char* dst, int32* dstLen, int32* state, char substitute)
194 {
195 const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
196 dstEncoding);
197 if (charset == NULL)
198 return B_ERROR;
199
200 #if DEBUG_CONV
201 fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
202 for (int i = 0 ; i < *srcLen ; i++) {
203 fprintf(stderr, "%c", src[i]);
204 }
205 fprintf(stderr, "\"\n");
206 #endif
207
208 return convert_encoding("UTF-8", charset->GetName(), src, srcLen,
209 dst, dstLen, state, substitute);
210 }
211
212