xref: /haiku/src/kits/textencoding/utf8_conversions.cpp (revision aa94570a34695672df9b47adda2257f75d8da880)
1 #include <UTF8.h>
2 #include <iconv.h>
3 #include <CharacterSet.h>
4 #include <CharacterSetRoster.h>
5 #include <Errors.h>
6 #include <errno.h>
7 #include <stdio.h>
8 
9 using namespace BPrivate;
10 
// Pointer-to-pointer type for the input cursor handed to iconv().
// convert_encoding() casts the address of its const source pointer to this
// type before passing it on.
typedef char ** input_buffer_t;

// Local declaration of iconvctl(), which convert_encoding() calls to toggle
// ICONV_SET_DISCARD_ILSEQ on a conversion descriptor.
// NOTE(review): presumably declared here because the <iconv.h> seen by this
// file does not provide a prototype -- confirm against the bundled header.
int iconvctl (iconv_t icd, int request, void* argument);
14 
15 status_t
16 convert_encoding(const char * from, const char * to,
17                  const char * src, int32 * srcLen,
18                  char * dst, int32 * dstLen,
19                  int32 * state, char substitute)
20 {
21 	status_t status;
22 	if (*srcLen == 0) {
23 		// nothing to do!
24 		return B_OK;
25 	}
26 	iconv_t conversion = iconv_open(to,from);
27 	if (conversion == (iconv_t)-1) {
28 		return B_ERROR;
29 	}
30 	if (state == 0) {
31 		return B_ERROR;
32 	}
33 	if (*state == 0) {
34 		iconv(conversion,0,0,0,0);
35 	}
36 	input_buffer_t inputBuffer = const_cast<input_buffer_t>(&src);
37 	size_t inputLeft = *srcLen;
38 	size_t outputLeft = *dstLen;
39 	do {
40 		size_t nonReversibleConversions = iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft);
41 		if (nonReversibleConversions == (size_t)-1) {
42 			if (errno == E2BIG) {
43 				// Not enough room in the output buffer for the next converted character
44 				// This is not a "real" error, we just quit out.
45 				break;
46 			}
47 			switch (errno) {
48 			case EILSEQ: // unable to generate a corresponding character
49 				{
50 				// discard the input character
51 				const int one = 1, zero = 0;
52 				iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&one);
53 				iconv(conversion,inputBuffer,&inputLeft,&dst,&outputLeft);
54 				iconvctl(conversion,ICONV_SET_DISCARD_ILSEQ,(void*)&zero);
55 				// prepare to convert the substitute character to target encoding
56 				char * original = new char[1];
57 				original[0] = substitute;
58 				size_t len = 1;
59 				char * copy = original;
60 				// Perform the conversion
61 				// We ignore any errors during this as part of robustness/best-effort
62 				// We use ISO-8859-1 as a source because it is a single byte encoding
63 				// It also overlaps UTF-8 for the lower 128 characters.  It is also
64 				// likely to have a mapping to almost any target encoding.
65 				iconv_t iso8859_1to = iconv_open(to,"ISO-8859-1");
66 				if (iso8859_1to != (iconv_t)-1) {
67 					iconv(iso8859_1to,0,0,0,0);
68 					iconv(iso8859_1to,const_cast<input_buffer_t>(&copy),&len,&dst,&outputLeft);
69 					iconv_close(iso8859_1to);
70 				}
71 				delete original;
72 				}
73 				break;
74 			case EINVAL: // incomplete multibyte sequence in the input
75 				// we just eat bad bytes, as part of robustness/best-effort
76 				inputBuffer++;
77 				inputLeft--;
78 				break;
79 			default:
80 				// unknown error, completely bail
81 				status = errno;
82 				iconv_close(conversion);
83 				return status;
84 			}
85 		}
86 	} while ((inputLeft > 0) && (outputLeft > 0));
87 	*srcLen -= inputLeft;
88 	*dstLen -= outputLeft;
89 	iconv_close(conversion);
90 	if (*srcLen != 0) {
91 		// able to convert at least one character
92 		return B_OK;
93 	} else {
94 		// not able to convert at least one character
95 		return B_ERROR;
96 	}
97 }
98 
99 status_t
100 convert_to_utf8(uint32 srcEncoding,
101                 const char * src, int32 * srcLen,
102                 char * dst, int32 * dstLen,
103                 int32 * state, char substitute = B_SUBSTITUTE)
104 {
105 	const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(srcEncoding);
106 	if (charset == 0) {
107 		return B_ERROR;
108 	}
109 	return convert_encoding(charset->GetName(),"UTF-8",src,srcLen,dst,dstLen,state,substitute);
110 }
111 
112 status_t
113 convert_from_utf8(uint32 dstEncoding,
114                   const char * src, int32 * srcLen,
115                   char * dst, int32 * dstLen,
116                   int32 * state, char substitute = B_SUBSTITUTE)
117 {
118 	const BCharacterSet * charset = BCharacterSetRoster::GetCharacterSetByConversionID(dstEncoding);
119 	if (charset == 0) {
120 		return B_ERROR;
121 	}
122 	return convert_encoding("UTF-8",charset->GetName(),src,srcLen,dst,dstLen,state,substitute);
123 }
124