xref: /haiku/src/kits/textencoding/utf8_conversions.cpp (revision 93a78ecaa45114d68952d08c4778f073515102f2)
1 /*
2  * Copyright 2003-2007, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Bachmann
7  */
8 
9 
10 #include <CharacterSet.h>
11 #include <CharacterSetRoster.h>
12 #include <UTF8.h>
13 
14 #include <errno.h>
15 #include <iconv.h>
16 #include <stdio.h>
17 
18 
19 //#define DEBUG_CONV 1
20 
// DEBPRINT takes a parenthesized printf argument list, e.g.
//	DEBPRINT(("value: %d\n", value));
// Both variants expand to a single expression without a trailing semicolon,
// so the macro behaves like an ordinary statement once the caller adds the
// ';' -- including inside an unbraced if/else.
#ifdef DEBUG_CONV
#	define DEBPRINT(ARGS) printf ARGS
#else
#	define DEBPRINT(ARGS) ((void)0)
#endif
26 
27 using namespace BPrivate;
28 
29 int iconvctl(iconv_t icd, int request, void* argument);
30 
31 
32 status_t
33 convert_encoding(const char* from, const char* to, const char* src,
34 	int32* srcLen, char* dst, int32* dstLen, int32* state,
35 	char substitute)
36 {
37 	if (*srcLen == 0) {
38 		// nothing to do!
39 		*dstLen = 0;
40 		return B_OK;
41 	}
42 
43 	// TODO: this doesn't work, as the state is reset every time!
44 	iconv_t conversion = iconv_open(to, from);
45 	if (conversion == (iconv_t)-1) {
46 		DEBPRINT(("iconv_open failed\n"));
47 		return B_ERROR;
48 	}
49 
50 	size_t outputLeft = *dstLen;
51 
52 	if (state == NULL || *state == 0) {
53 		if (state != NULL)
54 			*state = 1;
55 
56 		iconv(conversion, NULL, NULL, &dst, &outputLeft);
57 	}
58 
59 	char** inputBuffer = const_cast<char**>(&src);
60 	size_t inputLeft = *srcLen;
61 	do {
62 		size_t nonReversibleConversions = iconv(conversion, inputBuffer,
63 			&inputLeft, &dst, &outputLeft);
64 		if (nonReversibleConversions == (size_t)-1) {
65 			if (errno == E2BIG) {
66 				// Not enough room in the output buffer for the next converted character
67 				// This is not a "real" error, we just quit out.
68 				break;
69 			}
70 
71 			switch (errno) {
72 				case EILSEQ: // unable to generate a corresponding character
73 				{
74 					// discard the input character
75 					const int one = 1, zero = 0;
76 					iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&one);
77 					iconv(conversion, inputBuffer, &inputLeft, &dst, &outputLeft);
78 					iconvctl(conversion, ICONV_SET_DISCARD_ILSEQ, (void*)&zero);
79 
80 					// prepare to convert the substitute character to target encoding
81 					char* original = new char[1];
82 					original[0] = substitute;
83 					size_t len = 1;
84 					char* copy = original;
85 
86 					// Perform the conversion
87 					// We ignore any errors during this as part of robustness/best-effort
88 					// We use ISO-8859-1 as a source because it is a single byte encoding
89 					// It also overlaps UTF-8 for the lower 128 characters.  It is also
90 					// likely to have a mapping to almost any target encoding.
91 					iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
92 					if (iso8859_1to != (iconv_t)-1) {
93 						iconv(iso8859_1to, 0, 0, 0, 0);
94 						iconv(iso8859_1to, const_cast<char**>(&copy), &len, &dst,
95 							&outputLeft);
96 						iconv_close(iso8859_1to);
97 					}
98 					delete original;
99 					break;
100 				}
101 
102 				case EINVAL: // incomplete multibyte sequence in the input
103 					// we just eat bad bytes, as part of robustness/best-effort
104 					inputBuffer++;
105 					inputLeft--;
106 					break;
107 
108 				default:
109 					// unknown error, completely bail
110 					status_t status = errno;
111 					iconv_close(conversion);
112 					return status;
113 			}
114 		}
115 	} while (inputLeft > 0 && outputLeft > 0);
116 
117 	*srcLen -= inputLeft;
118 	*dstLen -= outputLeft;
119 	iconv_close(conversion);
120 
121 	return B_OK;
122 }
123 
124 
125 status_t
126 convert_to_utf8(uint32 srcEncoding, const char* src, int32* srcLen,
127 	char* dst, int32* dstLen, int32* state, char substitute)
128 {
129 	const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
130 		srcEncoding);
131 	if (charset == NULL)
132 		return B_ERROR;
133 
134 #if DEBUG_CONV
135 	fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
136 	for (int i = 0 ; i < *srcLen ; i++) {
137 		fprintf(stderr, "%c", src[i]);
138 	}
139 	fprintf(stderr, "\"\n");
140 #endif
141 
142 	return convert_encoding(charset->GetName(), "UTF-8", src, srcLen,
143 		dst, dstLen, state, substitute);
144 }
145 
146 
147 status_t
148 convert_from_utf8(uint32 dstEncoding, const char* src, int32* srcLen,
149 	char* dst, int32* dstLen, int32* state, char substitute)
150 {
151 	const BCharacterSet* charset = BCharacterSetRoster::GetCharacterSetByConversionID(
152 		dstEncoding);
153 	if (charset == NULL)
154 		return B_ERROR;
155 
156 #if DEBUG_CONV
157 	fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
158 	for (int i = 0 ; i < *srcLen ; i++) {
159 		fprintf(stderr, "%c", src[i]);
160 	}
161 	fprintf(stderr, "\"\n");
162 #endif
163 
164 	return convert_encoding("UTF-8", charset->GetName(), src, srcLen,
165 		dst, dstLen, state, substitute);
166 }
167 
168