xref: /haiku/src/kits/textencoding/utf8_conversions.cpp (revision 4f00613311d0bd6b70fa82ce19931c41f071ea4e)
1 #include <UTF8.h>
2 #include <iconv.h>
3 #include <CharacterSet.h>
4 #include <CharacterSetRoster.h>
5 #include <Errors.h>
6 #include <errno.h>
7 #include <stdio.h>
8 #include <Debug.h>
9 
// Uncomment to enable debug tracing of the conversion routines.
//#define DEBUG_CONV 1

// DEBPRINT(("format", args...)) prints through printf when DEBUG_CONV is
// defined and compiles to nothing otherwise.  The double parentheses are
// required: the inner pair becomes printf's argument list.
// Both variants expand to a single statement that still needs the caller's
// trailing semicolon, so the macro is safe inside an unbraced if/else.
#ifdef DEBUG_CONV
	#define DEBPRINT(ARGS) do { printf ARGS; } while (0)
#else
	#define DEBPRINT(ARGS) do { } while (0)
#endif
17 
18 using namespace BPrivate;
19 
20 typedef char ** input_buffer_t;
21 
22 int iconvctl (iconv_t icd, int request, void* argument);
23 
24 status_t
25 convert_encoding(const char * from, const char * to,
26                  const char * src, int32 * srcLen,
27                  char * dst, int32 * dstLen,
28                  int32 * state, char substitute)
29 {
30 	status_t status;
31 	if (*srcLen == 0) {
32 		// nothing to do!
33 		DEBPRINT(("nothing to do\n"));
34 		return B_OK;
35 	}
36 	iconv_t conversion = iconv_open(to,from);
37 	if (conversion == (iconv_t)-1) {
38 		DEBPRINT(("iconv_open failed\n"));
39 		return B_ERROR;
40 	}
41 	if ((state == NULL) || (*state == 0)) {
42 		iconv(conversion,0,0,0,0);
43 	}
44 	input_buffer_t inputBuffer = const_cast<input_buffer_t>(&src);
45 	size_t inputLeft = *srcLen;
46 	size_t outputLeft = *dstLen;
47 	do {
48 		size_t nonReversibleConversions = iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft);
49 		if (nonReversibleConversions == (size_t)-1) {
50 			if (errno == E2BIG) {
51 				// Not enough room in the output buffer for the next converted character
52 				// This is not a "real" error, we just quit out.
53 				break;
54 			}
55 			switch (errno) {
56 			case EILSEQ: // unable to generate a corresponding character
57 				{
58 				// discard the input character
59 				const int one = 1, zero = 0;
60 				iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&one);
61 				iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft);
62 				iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&zero);
63 				// prepare to convert the substitute character to target encoding
64 				char * original = new char[1];
65 				original[0] = substitute;
66 				size_t len = 1;
67 				char * copy = original;
68 				// Perform the conversion
69 				// We ignore any errors during this as part of robustness/best-effort
70 				// We use ISO-8859-1 as a source because it is a single byte encoding
71 				// It also overlaps UTF-8 for the lower 128 characters.  It is also
72 				// likely to have a mapping to almost any target encoding.
73 				iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
74 				if (iso8859_1to != (iconv_t)-1) {
75 					iconv(iso8859_1to,0,0,0,0);
76 					iconv(iso8859_1to,const_cast<input_buffer_t>(&copy),&len,&dst,&outputLeft);
77 					iconv_close(iso8859_1to);
78 				}
79 				delete original;
80 				}
81 				break;
82 			case EINVAL: // incomplete multibyte sequence in the input
83 				// we just eat bad bytes, as part of robustness/best-effort
84 				inputBuffer++;
85 				inputLeft--;
86 				break;
87 			default:
88 				// unknown error, completely bail
89 				status = errno;
90 				iconv_close(conversion);
91 				return status;
92 			}
93 		}
94 	} while ((inputLeft > 0) && (outputLeft > 0));
95 	*srcLen -= inputLeft;
96 	*dstLen -= outputLeft;
97 	iconv_close(conversion);
98 	if (*srcLen != 0) {
99 		// able to convert at least one character
100 		DEBPRINT(("able to convert at least one character\n"));
101 		return B_OK;
102 	} else {
103 		// not able to convert at least one character
104 		DEBPRINT(("not able to convert at least one character\n"));
105 		return B_ERROR;
106 	}
107 }
108 
109 status_t
110 convert_to_utf8(uint32 srcEncoding,
111                 const char * src, int32 * srcLen,
112                 char * dst, int32 * dstLen,
113                 int32 * state, char substitute)
114 {
115 	const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(srcEncoding);
116 	if (charset == 0) {
117 		return B_ERROR;
118 	}
119 #if DEBUG_CONV
120 	fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
121 	for (int i = 0 ; i < *srcLen ; i++) {
122 		fprintf(stderr, "%c", src[i]);
123 	}
124 	fprintf(stderr, "\"\n");
125 #endif
126 	return convert_encoding(charset->GetName(),"UTF-8",src,srcLen,dst,dstLen,state,substitute);
127 }
128 
129 status_t
130 convert_from_utf8(uint32 dstEncoding,
131                   const char * src, int32 * srcLen,
132                   char * dst, int32 * dstLen,
133                   int32 * state, char substitute)
134 {
135 	const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(dstEncoding);
136 	if (charset == 0) {
137 		return B_ERROR;
138 	}
139 #if DEBUG_CONV
140 	fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
141 	for (int i = 0 ; i < *srcLen ; i++) {
142 		fprintf(stderr, "%c", src[i]);
143 	}
144 	fprintf(stderr, "\"\n");
145 #endif
146 	return convert_encoding("UTF-8",charset->GetName(),src,srcLen,dst,dstLen,state,substitute);
147 }
148