xref: /haiku/src/system/libroot/add-ons/icu/ICUCtypeData.cpp (revision 05599114937b2bc98f0494fd8bcbbc0d6bd68a2c)
1 /*
2  * Copyright 2010-2011, Oliver Tappe, zooey@hirschkaefer.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "ICUCtypeData.h"
8 
9 #include <langinfo.h>
10 #include <stdlib.h>
11 #include <string.h>
12 
13 #include <algorithm>
14 
15 #include <unicode/uchar.h>
16 #include <unicode/uvernum.h>
17 
18 #include <Debug.h>
19 
20 
21 //#define TRACE_CTYPE
#undef TRACE
#ifdef TRACE_CTYPE
#	include <OS.h>
#	define TRACE(x) debug_printf x
#else
// When tracing is disabled, TRACE() still has to behave like one ordinary
// statement that is completed by the caller's semicolon. Expanding to a bare
// ';' would turn "if (c) TRACE((..)); else ..." into two statements and
// detach the else branch.
#	define TRACE(x) do { } while (0)
#endif
29 
30 
31 U_NAMESPACE_USE
32 
33 
34 namespace BPrivate {
35 namespace Libroot {
36 
37 
ICUCtypeData(pthread_key_t tlsKey)38 ICUCtypeData::ICUCtypeData(pthread_key_t tlsKey)
39 	:
40 	inherited(tlsKey),
41 	fDataBridge(NULL)
42 {
43 }
44 
45 
~ICUCtypeData()46 ICUCtypeData::~ICUCtypeData()
47 {
48 }
49 
50 
51 void
Initialize(LocaleCtypeDataBridge * dataBridge)52 ICUCtypeData::Initialize(LocaleCtypeDataBridge* dataBridge)
53 {
54 	*dataBridge->addrOfClassInfoTable = &fClassInfo[128];
55 	*dataBridge->addrOfToLowerTable = &fToLowerMap[128];
56 	*dataBridge->addrOfToUpperTable = &fToUpperMap[128];
57 	fDataBridge = dataBridge;
58 }
59 
60 
61 status_t
SetTo(const Locale & locale,const char * posixLocaleName)62 ICUCtypeData::SetTo(const Locale& locale, const char* posixLocaleName)
63 {
64 	status_t result = inherited::SetTo(locale, posixLocaleName);
65 	if (result != B_OK)
66 		return result;
67 
68 	UErrorCode icuStatus = U_ZERO_ERROR;
69 
70 	UConverter* converter;
71 	result = _GetConverter(converter);
72 	if (result != B_OK)
73 		return result;
74 
75 	ucnv_reset(converter);
76 
77 	fDataBridge->setMbCurMax(ucnv_getMaxCharSize(converter));
78 
79 	char buffer[] = { 0, 0 };
80 	for (int i = 0; i < 256; ++i) {
81 		const char* source = buffer;
82 		buffer[0] = (char)i;
83 		buffer[1] = '\0';
84 		icuStatus = U_ZERO_ERROR;
85 		UChar32 unicodeChar
86 			= ucnv_getNextUChar(converter, &source, source + 1, &icuStatus);
87 
88 		unsigned short classInfo = 0;
89 		unsigned int toLower = i;
90 		unsigned int toUpper = i;
91 		if (U_SUCCESS(icuStatus)) {
92 			if (u_isblank(unicodeChar))
93 				classInfo |= _ISblank;
94 			if (u_charType(unicodeChar) == U_CONTROL_CHAR)
95 				classInfo |= _IScntrl;
96 			if (u_ispunct(unicodeChar))
97 				classInfo |= _ISpunct;
98 			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_ALNUM))
99 				classInfo |= _ISalnum;
100 			if (u_isUUppercase(unicodeChar))
101 				classInfo |= _ISupper;
102 			if (u_isULowercase(unicodeChar))
103 				classInfo |= _ISlower;
104 			if (u_isUAlphabetic(unicodeChar))
105 				classInfo |= _ISalpha;
106 			if (u_isdigit(unicodeChar))
107 				classInfo |= _ISdigit;
108 			if (u_isxdigit(unicodeChar))
109 				classInfo |= _ISxdigit;
110 			if (u_isUWhiteSpace(unicodeChar))
111 				classInfo |= _ISspace;
112 			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_PRINT))
113 				classInfo |= _ISprint;
114 			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_GRAPH))
115 				classInfo |= _ISgraph;
116 
117 			UChar lowerChar = u_tolower(unicodeChar);
118 			icuStatus = U_ZERO_ERROR;
119 			ucnv_fromUChars(converter, buffer, 1, &lowerChar, 1, &icuStatus);
120 			if (U_SUCCESS(icuStatus))
121 				toLower = (unsigned char)buffer[0];
122 
123 			UChar upperChar = u_toupper(unicodeChar);
124 			icuStatus = U_ZERO_ERROR;
125 			ucnv_fromUChars(converter, buffer, 1, &upperChar, 1, &icuStatus);
126 			if (U_SUCCESS(icuStatus))
127 				toUpper = (unsigned char)buffer[0];
128 		}
129 		fClassInfo[i + 128] = classInfo;
130 		fToLowerMap[i + 128] = toLower;
131 		fToUpperMap[i + 128] = toUpper;
132 		if (i >= 128 && i < 255) {
133 			// mirror upper half at negative indices (except for -1 [=EOF])
134 			fClassInfo[i - 128] = classInfo;
135 			fToLowerMap[i - 128] = toLower;
136 			fToUpperMap[i - 128] = toUpper;
137 		}
138 	}
139 
140 	return B_OK;
141 }
142 
143 
144 status_t
SetToPosix()145 ICUCtypeData::SetToPosix()
146 {
147 	status_t result = inherited::SetToPosix();
148 
149 	if (result == B_OK) {
150 		memcpy(fClassInfo, fDataBridge->posixClassInfo, sizeof(fClassInfo));
151 		memcpy(fToLowerMap, fDataBridge->posixToLowerMap, sizeof(fToLowerMap));
152 		memcpy(fToUpperMap, fDataBridge->posixToUpperMap, sizeof(fToUpperMap));
153 
154 		fDataBridge->setMbCurMax(1);
155 	}
156 
157 	return result;
158 }
159 
160 
161 int
IsWCType(wint_t wc,wctype_t charClass)162 ICUCtypeData::IsWCType(wint_t wc, wctype_t charClass)
163 {
164 	if (wc == WEOF)
165 		return 0;
166 
167 	switch (charClass) {
168 		case _ISalnum:
169 			return u_hasBinaryProperty(wc, UCHAR_POSIX_ALNUM);
170 		case _ISalpha:
171 			return u_isUAlphabetic(wc);
172 		case _ISblank:
173 			return u_isblank(wc);
174 		case _IScntrl:
175 			return u_charType(wc) == U_CONTROL_CHAR;
176 		case _ISdigit:
177 			return u_isdigit(wc);
178 		case _ISgraph:
179 			return u_hasBinaryProperty(wc, UCHAR_POSIX_GRAPH);
180 		case _ISlower:
181 			return u_isULowercase(wc);
182 		case _ISprint:
183 			return u_hasBinaryProperty(wc, UCHAR_POSIX_PRINT);
184 		case _ISpunct:
185 			return u_ispunct(wc);
186 		case _ISspace:
187 			return u_isUWhiteSpace(wc);
188 		case _ISupper:
189 			return u_isUUppercase(wc);
190 		case _ISxdigit:
191 			return u_isxdigit(wc);
192 		default:
193 			return 0;
194 	}
195 }
196 
197 
198 status_t
ToWCTrans(wint_t wc,wctrans_t transition,wint_t & result)199 ICUCtypeData::ToWCTrans(wint_t wc, wctrans_t transition, wint_t& result)
200 {
201 	switch (transition) {
202 		case _ISlower:
203 			result = u_tolower(wc);
204 			return B_OK;
205 		case _ISupper:
206 			result = u_toupper(wc);
207 			return B_OK;
208 		default:
209 			return B_BAD_VALUE;
210 	}
211 }
212 
213 
214 status_t
MultibyteToWchar(wchar_t * wcOut,const char * mb,size_t mbLen,mbstate_t * mbState,size_t & lengthOut)215 ICUCtypeData::MultibyteToWchar(wchar_t* wcOut, const char* mb, size_t mbLen,
216 	mbstate_t* mbState, size_t& lengthOut)
217 {
218 	UConverter* converter = NULL;
219 	status_t result = _GetConverterForMbState(mbState, converter);
220 	if (result != B_OK) {
221 		TRACE(("MultibyteToWchar(): couldn't get converter for mbstate %p - "
222 				"%" B_PRIx32 "\n", mbState, result));
223 		return result;
224 	}
225 
226 	// do the conversion
227 	UErrorCode icuStatus = U_ZERO_ERROR;
228 
229 	const char* buffer = mb;
230 	UChar targetBuffer[3];
231 	UChar* target = targetBuffer;
232 	ucnv_toUnicode(converter, &target, target + 1, &buffer, buffer + mbLen,
233 		NULL, FALSE, &icuStatus);
234 	size_t sourceLengthUsed = buffer - mb;
235 	size_t targetLengthUsed = (size_t)(target - targetBuffer);
236 
237 	if (U16_IS_LEAD(targetBuffer[0])) {
238 		// we have a surrogate pair, read the second character
239 		TRACE(("MultibyteToWchar(): have a surrogate pair\n"));
240 		icuStatus = U_ZERO_ERROR;
241 		ucnv_toUnicode(converter, &target, target + 2 - targetLengthUsed,
242 			&buffer, buffer + mbLen - sourceLengthUsed,
243 			NULL, FALSE, &icuStatus);
244 		sourceLengthUsed = buffer - mb;
245 		targetLengthUsed = (size_t)(target - targetBuffer);
246 	}
247 
248 	if (icuStatus == U_BUFFER_OVERFLOW_ERROR && targetLengthUsed > 0) {
249 		// we've got one character, which is all that we wanted
250 		icuStatus = U_ZERO_ERROR;
251 	}
252 
253 	if (!U_SUCCESS(icuStatus)) {
254 		// conversion failed because of illegal character sequence
255 		TRACE(("MultibyteToWchar(): illegal character sequence\n"));
256 		ucnv_resetToUnicode(converter);
257 		result = B_BAD_DATA;
258 	} else if (targetLengthUsed == 0) {
259 		TRACE(("MultibyteToWchar(): incomplete character (len=%lu)\n", mbLen));
260 		for (size_t i = 0; i < mbLen; ++i)
261 			TRACE(("\tbyte %lu: %x\n", i, mb[i]));
262 		mbState->count = sourceLengthUsed;
263 		result = B_BAD_INDEX;
264 	} else {
265 		UChar32 unicodeChar = 0xBADBEEF;
266 		U16_GET(targetBuffer, 0, 0, targetLengthUsed, unicodeChar);
267 
268 		if (unicodeChar == 0) {
269 			// reset to initial state
270 			_DropConverterFromMbState(mbState);
271 			memset(mbState, 0, sizeof(mbstate_t));
272 			lengthOut = 0;
273 		} else {
274 			mbState->count = 0;
275 			lengthOut = sourceLengthUsed;
276 		}
277 
278 		if (wcOut != NULL)
279 			*wcOut = unicodeChar;
280 
281 		result = B_OK;
282 	}
283 
284 	return result;
285 }
286 
287 
288 status_t
MultibyteStringToWchar(wchar_t * wcDest,size_t wcDestLength,const char ** mbSource,size_t mbSourceLength,mbstate_t * mbState,size_t & lengthOut)289 ICUCtypeData::MultibyteStringToWchar(wchar_t* wcDest, size_t wcDestLength,
290 	const char** mbSource, size_t mbSourceLength, mbstate_t* mbState,
291 	size_t& lengthOut)
292 {
293 	UConverter* converter = NULL;
294 	status_t result = _GetConverterForMbState(mbState, converter);
295 	if (result != B_OK) {
296 		TRACE(("MultibyteStringToWchar(): couldn't get converter for mbstate %p"
297 				" - %" B_PRIx32 "\n", mbState, result));
298 		return result;
299 	}
300 
301 	bool wcsIsTerminated = false;
302 	const char* source = *mbSource;
303 	const char* sourceEnd = source + mbSourceLength;
304 	if (sourceEnd < source) {
305 		// overflow, clamp to highest possible address
306 		sourceEnd = (const char*)-1;
307 	}
308 
309 	if (wcDest == NULL) {
310 		// if there's no destination buffer, there's no length limit either
311 		wcDestLength = (size_t)-1;
312 	}
313 
314 	UErrorCode icuStatus = U_ZERO_ERROR;
315 	size_t sourceLengthUsed = 0;
316 	for (lengthOut = 0; lengthOut < wcDestLength; ++lengthOut) {
317 		if (sourceLengthUsed >= mbSourceLength)
318 			break;
319 		UChar32 unicodeChar = ucnv_getNextUChar(converter, &source,
320 			std::min(source + MB_CUR_MAX, sourceEnd), &icuStatus);
321 		TRACE(("MultibyteStringToWchar() l:%lu wl:%lu s:%p se:%p sl:%lu slu:%lu"
322 				" uchar:%x st:%x\n", lengthOut, wcDestLength, source, sourceEnd,
323 			mbSourceLength, sourceLengthUsed, unicodeChar, icuStatus));
324 		if (!U_SUCCESS(icuStatus))
325 			break;
326 		sourceLengthUsed = source - *mbSource;
327 		if (wcDest != NULL)
328 			*wcDest++ = unicodeChar;
329 		if (unicodeChar == L'\0') {
330 			wcsIsTerminated = true;
331 			break;
332 		}
333 		icuStatus = U_ZERO_ERROR;
334 	}
335 
336 	if (!U_SUCCESS(icuStatus)) {
337 		// conversion failed because of illegal character sequence
338 		TRACE(("MultibyteStringToWchar(): illegal character sequence\n"));
339 		ucnv_resetToUnicode(converter);
340 		result = B_BAD_DATA;
341 		if (wcDest != NULL)
342 			*mbSource = *mbSource + sourceLengthUsed;
343 	} else if (wcsIsTerminated) {
344 		// reset to initial state
345 		_DropConverterFromMbState(mbState);
346 		memset(mbState, 0, sizeof(mbstate_t));
347 		if (wcDest != NULL)
348 			*mbSource = NULL;
349 	} else {
350 		mbState->count = 0;
351 		if (wcDest != NULL)
352 			*mbSource = source;
353 	}
354 
355 	return result;
356 }
357 
358 
359 status_t
WcharToMultibyte(char * mbOut,wchar_t wc,mbstate_t * mbState,size_t & lengthOut)360 ICUCtypeData::WcharToMultibyte(char* mbOut, wchar_t wc, mbstate_t* mbState,
361 	size_t& lengthOut)
362 {
363 	UConverter* converter = NULL;
364 	status_t result = _GetConverterForMbState(mbState, converter);
365 	if (result != B_OK) {
366 		TRACE(("WcharToMultibyte(): couldn't get converter for mbstate %p - "
367 				"%" B_PRIx32 "\n", mbState, result));
368 		return result;
369 	}
370 
371 	// convert input from UTF-32 to UTF-16
372 	UChar ucharBuffer[2];
373 	size_t ucharLength;
374 	if (U_IS_BMP(wc)) {
375 		ucharBuffer[0] = wc;
376 		ucharLength = 1;
377 	} else {
378 		ucharBuffer[0] = U16_LEAD(wc);
379 		ucharBuffer[1] = U16_TRAIL(wc);
380 		ucharLength = 2;
381 	}
382 
383 	// do the actual conversion
384 	UErrorCode icuStatus = U_ZERO_ERROR;
385 	size_t mbLength = mbOut == NULL ? 0 : MB_CUR_MAX;
386 	lengthOut = ucnv_fromUChars(converter, mbOut, mbLength, ucharBuffer,
387 		ucharLength, &icuStatus);
388 	TRACE(("WcharToMultibyte() l:%lu mb:%p ml:%lu uchar:%x st:%x\n", lengthOut,
389 		mbOut, mbLength, wc, icuStatus));
390 
391 	if (icuStatus == U_BUFFER_OVERFLOW_ERROR && mbOut == NULL) {
392 		// we have no output buffer, so we ignore buffer overflows
393 		icuStatus = U_ZERO_ERROR;
394 	}
395 
396 	if (!U_SUCCESS(icuStatus)) {
397 		if (icuStatus == U_ILLEGAL_ARGUMENT_ERROR) {
398 			// bad converter (shouldn't really happen)
399 			TRACE(("WcharToMultibyte(): bad converter\n"));
400 			return B_BAD_VALUE;
401 		}
402 
403 		// conversion failed because of illegal/unmappable character
404 		TRACE(("WcharToMultibyte(): illegal character sequence\n"));
405 		ucnv_resetFromUnicode(converter);
406 		return B_BAD_DATA;
407 	}
408 
409 	if (wc == 0) {
410 		// reset to initial state
411 		_DropConverterFromMbState(mbState);
412 		memset(mbState, 0, sizeof(mbstate_t));
413 	}
414 
415 	return B_OK;
416 }
417 
418 
419 status_t
WcharStringToMultibyte(char * mbDest,size_t mbDestLength,const wchar_t ** wcSource,size_t wcSourceLength,mbstate_t * mbState,size_t & lengthOut)420 ICUCtypeData::WcharStringToMultibyte(char* mbDest, size_t mbDestLength,
421 	const wchar_t** wcSource, size_t wcSourceLength, mbstate_t* mbState,
422 	size_t& lengthOut)
423 {
424 	UConverter* converter = NULL;
425 	status_t result = _GetConverterForMbState(mbState, converter);
426 	if (result != B_OK) {
427 		TRACE(("WcharStringToMultibyte(): couldn't get converter for mbstate %p"
428 			" - %" B_PRIx32 "\n", mbState, result));
429 		return result;
430 	}
431 
432 	bool mbsIsTerminated = false;
433 	const UChar32* source = (UChar32*)*wcSource;
434 
435 	UErrorCode icuStatus = U_ZERO_ERROR;
436 	lengthOut = 0;
437 	size_t sourceLengthUsed = 0;
438 	for (; sourceLengthUsed < wcSourceLength; ++sourceLengthUsed, ++source) {
439 		if (mbDest != NULL && lengthOut >= mbDestLength)
440 			break;
441 
442 		// convert input from UTF-32 to UTF-16
443 		UChar ucharBuffer[2];
444 		size_t ucharLength;
445 		if (U_IS_BMP(*source)) {
446 			ucharBuffer[0] = *source;
447 			ucharLength = 1;
448 		} else {
449 			ucharBuffer[0] = U16_LEAD(*source);
450 			ucharBuffer[1] = U16_TRAIL(*source);
451 			ucharLength = 2;
452 		}
453 
454 		// do the actual conversion
455 		size_t destLength = mbDest == NULL ? 0 : mbDestLength - lengthOut;
456 		char buffer[MB_CUR_MAX];
457 		size_t mbLength = ucnv_fromUChars(converter,
458 			mbDest == NULL ? NULL : buffer, destLength, ucharBuffer,
459 			ucharLength, &icuStatus);
460 		TRACE(("WcharStringToMultibyte() l:%lu mb:%p ml:%lu s:%p ul:%lu slu:%lu"
461 				" uchar:%x st:%x\n", mbLength, mbDest, destLength, source,
462 			ucharLength, sourceLengthUsed, *source, icuStatus));
463 
464 		if (icuStatus == U_BUFFER_OVERFLOW_ERROR) {
465 			// ignore buffer overflows ...
466  			icuStatus = U_ZERO_ERROR;
467  			// ... but stop if the output buffer has been exceeded
468  			if (destLength > 0)
469  				break;
470 		} else if (mbDest != NULL)
471 			memcpy(mbDest, buffer, mbLength);
472 
473 		if (!U_SUCCESS(icuStatus))
474 			break;
475 		if (mbDest != NULL)
476 			mbDest += mbLength;
477 		if (*source == L'\0') {
478 			mbsIsTerminated = true;
479 			break;
480 		}
481 		lengthOut += mbLength;
482 		icuStatus = U_ZERO_ERROR;
483 	}
484 
485 	if (!U_SUCCESS(icuStatus)) {
486 		// conversion failed because of illegal character sequence
487 		TRACE(("WcharStringToMultibyte(): illegal character sequence\n"));
488 		ucnv_resetFromUnicode(converter);
489 		result = B_BAD_DATA;
490 		if (mbDest != NULL)
491 			*wcSource = *wcSource + sourceLengthUsed;
492 	} else if (mbsIsTerminated) {
493 		// reset to initial state
494 		_DropConverterFromMbState(mbState);
495 		memset(mbState, 0, sizeof(mbstate_t));
496 		if (mbDest != NULL)
497 			*wcSource = NULL;
498 	} else {
499 		mbState->count = 0;
500 		if (mbDest != NULL)
501 			*wcSource = (wchar_t*)source;
502 	}
503 
504 	return result;
505 }
506 
507 
508 const char*
GetLanginfo(int index)509 ICUCtypeData::GetLanginfo(int index)
510 {
511 	switch(index) {
512 		case CODESET:
513 			return fGivenCharset;
514 		default:
515 			return "";
516 	}
517 }
518 
519 
520 status_t
_GetConverterForMbState(mbstate_t * mbState,UConverter * & converterOut)521 ICUCtypeData::_GetConverterForMbState(mbstate_t* mbState,
522 	UConverter*& converterOut)
523 {
524 	if (strcmp(mbState->charset, fGivenCharset) == 0
525 			&& (char*)mbState->converter >= mbState->data
526 			&& (char*)mbState->converter < mbState->data + 8) {
527 		// charset matches and converter actually lives in *this* mbState,
528 		// so we can use it (if the converter points to the outside, it means
529 		// that the mbstate_t has been copied)
530 		converterOut = (UConverter*)mbState->converter;
531 		return B_OK;
532 	}
533 
534 	// charset no longer matches the converter, we need to dump it and
535 	// create a new one
536 	_DropConverterFromMbState(mbState);
537 
538 	// create a new converter for the current charset ...
539 	UConverter* icuConverter;
540 	status_t result = _GetConverter(icuConverter);
541 	if (result != B_OK)
542 		return result;
543 
544 	// ... and clone it into the mbstate
545 	UErrorCode icuStatus = U_ZERO_ERROR;
546 	int32_t bufferSize = sizeof(mbState->data);
547 	UConverter* clone
548 		= ucnv_safeClone(icuConverter, mbState->data, &bufferSize, &icuStatus);
549 
550 	if (clone == NULL || !U_SUCCESS(icuStatus))
551 		return B_ERROR;
552 
553 	if ((char*)clone < mbState->data || (char*)clone >= mbState->data + 8) {
554 		// buffer is too small (shouldn't happen according to ICU docs)
555 		return B_NO_MEMORY;
556 	}
557 
558 	strlcpy(mbState->charset, fGivenCharset, sizeof(mbState->charset));
559 	mbState->converter = clone;
560 
561 	converterOut = clone;
562 
563 	return B_OK;
564 }
565 
566 
567 status_t
_DropConverterFromMbState(mbstate_t * mbState)568 ICUCtypeData::_DropConverterFromMbState(mbstate_t* mbState)
569 {
570 	if (mbState->converter != NULL && (char*)mbState->converter >= mbState->data
571 			&& (char*)mbState->converter < mbState->data + 8) {
572 		// check that the converter actually lives in *this* mbState,
573 		// otherwise we risk freeing a converter that doesn't belong to us;
574 		// this parallels the check in _GetConverterForMbState()
575 		ucnv_close((UConverter*)mbState->converter);
576 	}
577 	memset(mbState, 0, sizeof(mbstate_t));
578 
579 	return B_OK;
580 }
581 
582 
583 }	// namespace Libroot
584 }	// namespace BPrivate
585