xref: /haiku/src/kits/textencoding/utf8_conversions.cpp (revision 95bac3fda53a4cb21880712d7b43f8c21db32a2e)
1 #include <UTF8.h>
2 #include <iconv.h>
3 #include <CharacterSet.h>
4 #include <CharacterSetRoster.h>
5 #include <Errors.h>
6 #include <errno.h>
7 #include <stdio.h>
8 #include <Debug.h>
9 
10 using namespace BPrivate;
11 
12 typedef char ** input_buffer_t;
13 
14 int iconvctl (iconv_t icd, int request, void* argument);
15 
16 status_t
17 convert_encoding(const char * from, const char * to,
18                  const char * src, int32 * srcLen,
19                  char * dst, int32 * dstLen,
20                  int32 * state, char substitute)
21 {
22 	status_t status;
23 	if (*srcLen == 0) {
24 		// nothing to do!
25 		PRINT(("nothing to do\n"));
26 		return B_OK;
27 	}
28 	iconv_t conversion = iconv_open(to,from);
29 	if (conversion == (iconv_t)-1) {
30 		PRINT(("iconv_open failed\n"));
31 		return B_ERROR;
32 	}
33 	if ((state == NULL) || (*state == 0)) {
34 		iconv(conversion,0,0,0,0);
35 	}
36 	input_buffer_t inputBuffer = const_cast<input_buffer_t>(&src);
37 	size_t inputLeft = *srcLen;
38 	size_t outputLeft = *dstLen;
39 	do {
40 		size_t nonReversibleConversions = iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft);
41 		if (nonReversibleConversions == (size_t)-1) {
42 			if (errno == E2BIG) {
43 				// Not enough room in the output buffer for the next converted character
44 				// This is not a "real" error, we just quit out.
45 				break;
46 			}
47 			switch (errno) {
48 			case EILSEQ: // unable to generate a corresponding character
49 				{
50 				// discard the input character
51 				const int one = 1, zero = 0;
52 				iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&one);
53 				iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft);
54 				iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&zero);
55 				// prepare to convert the substitute character to target encoding
56 				char * original = new char[1];
57 				original[0] = substitute;
58 				size_t len = 1;
59 				char * copy = original;
60 				// Perform the conversion
61 				// We ignore any errors during this as part of robustness/best-effort
62 				// We use ISO-8859-1 as a source because it is a single byte encoding
63 				// It also overlaps UTF-8 for the lower 128 characters.  It is also
64 				// likely to have a mapping to almost any target encoding.
65 				iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
66 				if (iso8859_1to != (iconv_t)-1) {
67 					iconv(iso8859_1to,0,0,0,0);
68 					iconv(iso8859_1to,const_cast<input_buffer_t>(&copy),&len,&dst,&outputLeft);
69 					iconv_close(iso8859_1to);
70 				}
71 				delete original;
72 				}
73 				break;
74 			case EINVAL: // incomplete multibyte sequence in the input
75 				// we just eat bad bytes, as part of robustness/best-effort
76 				inputBuffer++;
77 				inputLeft--;
78 				break;
79 			default:
80 				// unknown error, completely bail
81 				status = errno;
82 				iconv_close(conversion);
83 				return status;
84 			}
85 		}
86 	} while ((inputLeft > 0) && (outputLeft > 0));
87 	*srcLen -= inputLeft;
88 	*dstLen -= outputLeft;
89 	iconv_close(conversion);
90 	if (*srcLen != 0) {
91 		// able to convert at least one character
92 		PRINT(("able to convert at least one character\n"));
93 		return B_OK;
94 	} else {
95 		// not able to convert at least one character
96 		PRINT(("not able to convert at least one character\n"));
97 		return B_ERROR;
98 	}
99 }
100 
101 status_t
102 convert_to_utf8(uint32 srcEncoding,
103                 const char * src, int32 * srcLen,
104                 char * dst, int32 * dstLen,
105                 int32 * state, char substitute = B_SUBSTITUTE)
106 {
107 	const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(srcEncoding);
108 	if (charset == 0) {
109 		return B_ERROR;
110 	}
111 #if DEBUG
112 	fprintf(stderr, "convert_to_utf8(%s) : \"", charset->GetName());
113 	for (int i = 0 ; i < *srcLen ; i++) {
114 		fprintf(stderr, "%c", src[i]);
115 	}
116 	fprintf(stderr, "\"\n");
117 #endif
118 	return convert_encoding(charset->GetName(),"UTF-8",src,srcLen,dst,dstLen,state,substitute);
119 }
120 
121 status_t
122 convert_from_utf8(uint32 dstEncoding,
123                   const char * src, int32 * srcLen,
124                   char * dst, int32 * dstLen,
125                   int32 * state, char substitute = B_SUBSTITUTE)
126 {
127 	const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(dstEncoding);
128 	if (charset == 0) {
129 		return B_ERROR;
130 	}
131 #if DEBUG
132 	fprintf(stderr, "convert_from_utf8(%s) : \"", charset->GetName());
133 	for (int i = 0 ; i < *srcLen ; i++) {
134 		fprintf(stderr, "%c", src[i]);
135 	}
136 	fprintf(stderr, "\"\n");
137 #endif
138 	return convert_encoding("UTF-8",charset->GetName(),src,srcLen,dst,dstLen,state,substitute);
139 }
140