xref: /haiku/src/kits/textencoding/utf8_conversions.cpp (revision 239222b2369c39dc52df52b0a7cdd6cc0a91bc92)
1 /*
2  * Copyright 2003-2008, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Bachmann
7  */
8 
9 
10 #include <CharacterSet.h>
11 #include <CharacterSetRoster.h>
12 #include <UTF8.h>
13 
14 #include <errno.h>
15 #include <iconv.h>
16 #include <stdio.h>
17 
18 
19 //#define DEBUG_CONV 1
20 
// Debug tracing helper. Invoke it with a doubly parenthesized printf()
// argument list, e.g. DEBPRINT(("value: %d\n", value)); the text is only
// printed when DEBUG_CONV is defined.
// Both variants expand to exactly one statement that is completed by the
// caller's trailing semicolon, so the macro can be used as the unbraced body
// of an if/else without changing the control flow.
#ifdef DEBUG_CONV
#	define DEBPRINT(ARGS) do { printf ARGS; } while (0)
#else
#	define DEBPRINT(ARGS) do { } while (0)
#endif
26 
27 using namespace BPrivate;
28 
29 int iconvctl(iconv_t icd, int request, void* argument);
30 
31 
32 status_t
33 convert_encoding(const char* from, const char* to, const char* src,
34 	int32* srcLen, char* dst, int32* dstLen, int32* state,
35 	char substitute)
36 {
37 	if (*srcLen == 0) {
38 		// nothing to do!
39 		*dstLen = 0;
40 		return B_OK;
41 	}
42 
43 	// TODO: this doesn't work, as the state is reset every time!
44 	iconv_t conversion = iconv_open(to, from);
45 	if (conversion == (iconv_t)-1) {
46 		DEBPRINT(("iconv_open failed\n"));
47 		return B_ERROR;
48 	}
49 
50 	size_t outputLeft = *dstLen;
51 
52 	if (state == NULL || *state == 0) {
53 		if (state != NULL)
54 			*state = 1;
55 
56 		iconv(conversion, NULL, NULL, &dst, &outputLeft);
57 	}
58 
59 	char** inputBuffer = const_cast<char**>(&src);
60 	size_t inputLeft = *srcLen;
61 	do {
62 		size_t nonReversibleConversions = iconv(conversion, inputBuffer,
63 			&inputLeft, &dst, &outputLeft);
64 		if (nonReversibleConversions == (size_t)-1) {
65 			if (errno == E2BIG) {
66 				// Not enough room in the output buffer for the next converted character
67 				// This is not a "real" error, we just quit out.
68 				break;
69 			}
70 
71 			switch (errno) {
72 				case EILSEQ: // unable to generate a corresponding character
73 				{
74 					// discard the input character
75 					const int one = 1, zero = 0;
76 					iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&one);
77 					iconv(conversion, inputBuffer, &inputLeft, &dst, &outputLeft);
78 					iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&zero);
79 
80 					// prepare to convert the substitute character to target encoding
81 					char original = substitute;
82 					size_t len = 1;
83 					char* copy = &original;
84 
85 					// Perform the conversion
86 					// We ignore any errors during this as part of robustness/best-effort
87 					// We use ISO-8859-1 as a source because it is a single byte encoding
88 					// It also overlaps UTF-8 for the lower 128 characters.  It is also
89 					// likely to have a mapping to almost any target encoding.
90 					iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
91 					if (iso8859_1to != (iconv_t)-1) {
92 						iconv(iso8859_1to, 0, 0, 0, 0);
93 						iconv(iso8859_1to, &copy, &len, &dst, &outputLeft);
94 						iconv_close(iso8859_1to);
95 					}
96 					break;
97 				}
98 
99 				case EINVAL: // incomplete multibyte sequence in the input
100 					// we just eat bad bytes, as part of robustness/best-effort
101 					inputBuffer++;
102 					inputLeft--;
103 					break;
104 
105 				default:
106 					// unknown error, completely bail
107 					status_t status = errno;
108 					iconv_close(conversion);
109 					return status;
110 			}
111 		}
112 	} while (inputLeft > 0 && outputLeft > 0);
113 
114 	*srcLen -= inputLeft;
115 	*dstLen -= outputLeft;
116 	iconv_close(conversion);
117 
118 	return B_OK;
119 }
120 
121 
122 status_t
123 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen,
124 	char* dst, int32* dstLen, int32* state, char substitute)
125 {
126 	const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
127 		srcEncoding);
128 	if (charset == NULL)
129 		return B_ERROR;
130 
131 #if DEBUG_CONV
132 	fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
133 	for (int i = 0 ; i < *srcLen ; i++) {
134 		fprintf(stderr, "%c", src[i]);
135 	}
136 	fprintf(stderr, "\"\n");
137 #endif
138 
139 	return convert_encoding(charset->GetName(), "UTF-8", src, srcLen,
140 		dst, dstLen, state, substitute);
141 }
142 
143 
144 status_t
145 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen,
146 	char* dst, int32* dstLen, int32* state, char substitute)
147 {
148 	const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
149 		dstEncoding);
150 	if (charset == NULL)
151 		return B_ERROR;
152 
153 #if DEBUG_CONV
154 	fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
155 	for (int i = 0 ; i < *srcLen ; i++) {
156 		fprintf(stderr, "%c", src[i]);
157 	}
158 	fprintf(stderr, "\"\n");
159 #endif
160 
161 	return convert_encoding("UTF-8", charset->GetName(), src, srcLen,
162 		dst, dstLen, state, substitute);
163 }
164 
165