xref: /haiku/src/kits/textencoding/utf8_conversions.cpp (revision 21258e2674226d6aa732321b6f8494841895af5f)
1 /*
2  * Copyright 2003-2008, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Bachmann
7  */
8 
9 
10 #include <CharacterSet.h>
11 #include <CharacterSetRoster.h>
12 #include <UTF8.h>
13 
14 #include <errno.h>
15 #include <iconv.h>
16 #include <stdio.h>
17 
18 
//#define DEBUG_CONV 1

// DEBPRINT takes a complete, parenthesised printf() argument list, e.g.
//	DEBPRINT(("value: %d\n", value));
// Both variants expand to exactly one statement that still needs the
// caller's trailing semicolon, so the macro can be used safely as the
// body of an unbraced if/else.
#ifdef DEBUG_CONV
#	define DEBPRINT(ARGS) do { printf ARGS; } while (0)
#else
#	define DEBPRINT(ARGS) do { } while (0)
#endif
26 
27 using namespace BPrivate;
28 
29 int iconvctl(iconv_t icd, int request, void* argument);
30 
31 
// Skips the invalid character at the start of *inputBuffer.
//
// The length of the offending sequence is not known up front, so the
// converter is probed with a growing prefix of the input (1 byte, 2 bytes,
// ...) until it stops answering EINVAL ("sequence incomplete").
// *inputBuffer is then advanced and *inputLeft reduced by that prefix
// length. The converter's shift state is reset before every probe.
//
// At least one byte and at most *inputLeft bytes are skipped; nothing
// happens if *inputLeft is 0.
static void
discard_invalid_input_character(iconv_t* conversion, char** inputBuffer,
	size_t* inputLeft)
{
	if (*inputLeft == 0)
		return;

	// scratch space only, the converted output is never used
	char outputBuffer[1];

	size_t skip = 1;
	for (; skip <= *inputLeft; skip++) {
		// reset internal state
		iconv(*conversion, NULL, NULL, NULL, NULL);

		// iconv() updates the pointers and counters it is handed, so it
		// works on copies and never touches the loop counter itself
		char* probe = *inputBuffer;
		size_t probeLeft = skip;
		char* output = outputBuffer;
		size_t outputLeft = sizeof(outputBuffer);
		size_t result = iconv(*conversion, &probe, &probeLeft,
			&output, &outputLeft);

		if (result == (size_t)-1 && errno == EINVAL) {
			// too few input bytes provided,
			// increase the probe size and try again
			continue;
		}

		// EILSEQ: minimal size of the invalid sequence found.
		// Anything else (success, E2BIG, ...) is not expected here; stop
		// as well so that the caller is still guaranteed to make progress.
		break;
	}

	// If every probe reported EINVAL the loop ran one past the available
	// input; never skip more than what is actually there, otherwise
	// *inputLeft would wrap around.
	if (skip > *inputLeft)
		skip = *inputLeft;

	*inputBuffer += skip;
	*inputLeft -= skip;
}
75 
76 
77 status_t
78 convert_encoding(const char* from, const char* to, const char* src,
79 	int32* srcLen, char* dst, int32* dstLen, int32* state,
80 	char substitute)
81 {
82 	if (*srcLen == 0) {
83 		// nothing to do!
84 		*dstLen = 0;
85 		return B_OK;
86 	}
87 
88 	// TODO: this doesn't work, as the state is reset every time!
89 	iconv_t conversion = iconv_open(to, from);
90 	if (conversion == (iconv_t)-1) {
91 		DEBPRINT(("iconv_open failed\n"));
92 		return B_ERROR;
93 	}
94 
95 	size_t outputLeft = *dstLen;
96 
97 	if (state == NULL || *state == 0) {
98 		if (state != NULL)
99 			*state = 1;
100 
101 		iconv(conversion, NULL, NULL, &dst, &outputLeft);
102 	}
103 
104 	char** inputBuffer = const_cast<char**>(&src);
105 	size_t inputLeft = *srcLen;
106 	do {
107 		size_t nonReversibleConversions = iconv(conversion, inputBuffer,
108 			&inputLeft, &dst, &outputLeft);
109 		if (nonReversibleConversions == (size_t)-1) {
110 			if (errno == E2BIG) {
111 				// Not enough room in the output buffer for the next converted character
112 				// This is not a "real" error, we just quit out.
113 				break;
114 			}
115 
116 			switch (errno) {
117 				case EILSEQ: // unable to generate a corresponding character
118 				{
119 					discard_invalid_input_character(&conversion, inputBuffer,
120 						&inputLeft);
121 
122 					// prepare to convert the substitute character to target encoding
123 					char original = substitute;
124 					size_t len = 1;
125 					char* copy = &original;
126 
127 					// Perform the conversion
128 					// We ignore any errors during this as part of robustness/best-effort
129 					// We use ISO-8859-1 as a source because it is a single byte encoding
130 					// It also overlaps UTF-8 for the lower 128 characters.  It is also
131 					// likely to have a mapping to almost any target encoding.
132 					iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
133 					if (iso8859_1to != (iconv_t)-1) {
134 						iconv(iso8859_1to, 0, 0, 0, 0);
135 						iconv(iso8859_1to, &copy, &len, &dst, &outputLeft);
136 						iconv_close(iso8859_1to);
137 					}
138 					break;
139 				}
140 
141 				case EINVAL: // incomplete multibyte sequence at the end of the input
142 					// TODO inputLeft bytes from inputBuffer should
143 					// be stored in state variable, so that conversion
144 					// can continue when the caller provides the missing
145 					// bytes with the next call of this method
146 
147 					// we just eat bad bytes, as part of robustness/best-effort
148 					inputBuffer++;
149 					inputLeft--;
150 					break;
151 
152 				default:
153 					// unknown error, completely bail
154 					status_t status = errno;
155 					iconv_close(conversion);
156 					return status;
157 			}
158 		}
159 	} while (inputLeft > 0 && outputLeft > 0);
160 
161 	*srcLen -= inputLeft;
162 	*dstLen -= outputLeft;
163 	iconv_close(conversion);
164 
165 	return B_OK;
166 }
167 
168 
169 status_t
170 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen,
171 	char* dst, int32* dstLen, int32* state, char substitute)
172 {
173 	const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
174 		srcEncoding);
175 	if (charset == NULL)
176 		return B_ERROR;
177 
178 #if DEBUG_CONV
179 	fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
180 	for (int i = 0 ; i < *srcLen ; i++) {
181 		fprintf(stderr, "%c", src[i]);
182 	}
183 	fprintf(stderr, "\"\n");
184 #endif
185 
186 	return convert_encoding(charset->GetName(), "UTF-8", src, srcLen,
187 		dst, dstLen, state, substitute);
188 }
189 
190 
191 status_t
192 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen,
193 	char* dst, int32* dstLen, int32* state, char substitute)
194 {
195 	const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
196 		dstEncoding);
197 	if (charset == NULL)
198 		return B_ERROR;
199 
200 #if DEBUG_CONV
201 	fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
202 	for (int i = 0 ; i < *srcLen ; i++) {
203 		fprintf(stderr, "%c", src[i]);
204 	}
205 	fprintf(stderr, "\"\n");
206 #endif
207 
208 	return convert_encoding("UTF-8", charset->GetName(), src, srcLen,
209 		dst, dstLen, state, substitute);
210 }
211 
212