1 /*
2 * Copyright 2010-2011, Oliver Tappe, zooey@hirschkaefer.de.
3 * Distributed under the terms of the MIT License.
4 */
5
6
7 #include "ICUCtypeData.h"
8
9 #include <langinfo.h>
10 #include <stdlib.h>
11 #include <string.h>
12
13 #include <algorithm>
14
15 #include <unicode/uchar.h>
16 #include <unicode/uvernum.h>
17
18 #include <Debug.h>
19
20
21 //#define TRACE_CTYPE
22 #undef TRACE
23 #ifdef TRACE_CTYPE
24 # include <OS.h>
25 # define TRACE(x) debug_printf x
26 #else
27 # define TRACE(x) ;
28 #endif
29
30
31 U_NAMESPACE_USE
32
33
34 namespace BPrivate {
35 namespace Libroot {
36
37
ICUCtypeData(pthread_key_t tlsKey)38 ICUCtypeData::ICUCtypeData(pthread_key_t tlsKey)
39 :
40 inherited(tlsKey),
41 fDataBridge(NULL)
42 {
43 }
44
45
~ICUCtypeData()46 ICUCtypeData::~ICUCtypeData()
47 {
48 }
49
50
51 void
Initialize(LocaleCtypeDataBridge * dataBridge)52 ICUCtypeData::Initialize(LocaleCtypeDataBridge* dataBridge)
53 {
54 *dataBridge->addrOfClassInfoTable = &fClassInfo[128];
55 *dataBridge->addrOfToLowerTable = &fToLowerMap[128];
56 *dataBridge->addrOfToUpperTable = &fToUpperMap[128];
57 fDataBridge = dataBridge;
58 }
59
60
61 status_t
SetTo(const Locale & locale,const char * posixLocaleName)62 ICUCtypeData::SetTo(const Locale& locale, const char* posixLocaleName)
63 {
64 status_t result = inherited::SetTo(locale, posixLocaleName);
65 if (result != B_OK)
66 return result;
67
68 UErrorCode icuStatus = U_ZERO_ERROR;
69
70 UConverter* converter;
71 result = _GetConverter(converter);
72 if (result != B_OK)
73 return result;
74
75 ucnv_reset(converter);
76
77 fDataBridge->setMbCurMax(ucnv_getMaxCharSize(converter));
78
79 char buffer[] = { 0, 0 };
80 for (int i = 0; i < 256; ++i) {
81 const char* source = buffer;
82 buffer[0] = (char)i;
83 buffer[1] = '\0';
84 icuStatus = U_ZERO_ERROR;
85 UChar32 unicodeChar
86 = ucnv_getNextUChar(converter, &source, source + 1, &icuStatus);
87
88 unsigned short classInfo = 0;
89 unsigned int toLower = i;
90 unsigned int toUpper = i;
91 if (U_SUCCESS(icuStatus)) {
92 if (u_isblank(unicodeChar))
93 classInfo |= _ISblank;
94 if (u_charType(unicodeChar) == U_CONTROL_CHAR)
95 classInfo |= _IScntrl;
96 if (u_ispunct(unicodeChar))
97 classInfo |= _ISpunct;
98 if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_ALNUM))
99 classInfo |= _ISalnum;
100 if (u_isUUppercase(unicodeChar))
101 classInfo |= _ISupper;
102 if (u_isULowercase(unicodeChar))
103 classInfo |= _ISlower;
104 if (u_isUAlphabetic(unicodeChar))
105 classInfo |= _ISalpha;
106 if (u_isdigit(unicodeChar))
107 classInfo |= _ISdigit;
108 if (u_isxdigit(unicodeChar))
109 classInfo |= _ISxdigit;
110 if (u_isUWhiteSpace(unicodeChar))
111 classInfo |= _ISspace;
112 if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_PRINT))
113 classInfo |= _ISprint;
114 if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_GRAPH))
115 classInfo |= _ISgraph;
116
117 UChar lowerChar = u_tolower(unicodeChar);
118 icuStatus = U_ZERO_ERROR;
119 ucnv_fromUChars(converter, buffer, 1, &lowerChar, 1, &icuStatus);
120 if (U_SUCCESS(icuStatus))
121 toLower = (unsigned char)buffer[0];
122
123 UChar upperChar = u_toupper(unicodeChar);
124 icuStatus = U_ZERO_ERROR;
125 ucnv_fromUChars(converter, buffer, 1, &upperChar, 1, &icuStatus);
126 if (U_SUCCESS(icuStatus))
127 toUpper = (unsigned char)buffer[0];
128 }
129 fClassInfo[i + 128] = classInfo;
130 fToLowerMap[i + 128] = toLower;
131 fToUpperMap[i + 128] = toUpper;
132 if (i >= 128 && i < 255) {
133 // mirror upper half at negative indices (except for -1 [=EOF])
134 fClassInfo[i - 128] = classInfo;
135 fToLowerMap[i - 128] = toLower;
136 fToUpperMap[i - 128] = toUpper;
137 }
138 }
139
140 return B_OK;
141 }
142
143
144 status_t
SetToPosix()145 ICUCtypeData::SetToPosix()
146 {
147 status_t result = inherited::SetToPosix();
148
149 if (result == B_OK) {
150 memcpy(fClassInfo, fDataBridge->posixClassInfo, sizeof(fClassInfo));
151 memcpy(fToLowerMap, fDataBridge->posixToLowerMap, sizeof(fToLowerMap));
152 memcpy(fToUpperMap, fDataBridge->posixToUpperMap, sizeof(fToUpperMap));
153
154 fDataBridge->setMbCurMax(1);
155 }
156
157 return result;
158 }
159
160
161 int
IsWCType(wint_t wc,wctype_t charClass)162 ICUCtypeData::IsWCType(wint_t wc, wctype_t charClass)
163 {
164 if (wc == WEOF)
165 return 0;
166
167 switch (charClass) {
168 case _ISalnum:
169 return u_hasBinaryProperty(wc, UCHAR_POSIX_ALNUM);
170 case _ISalpha:
171 return u_isUAlphabetic(wc);
172 case _ISblank:
173 return u_isblank(wc);
174 case _IScntrl:
175 return u_charType(wc) == U_CONTROL_CHAR;
176 case _ISdigit:
177 return u_isdigit(wc);
178 case _ISgraph:
179 return u_hasBinaryProperty(wc, UCHAR_POSIX_GRAPH);
180 case _ISlower:
181 return u_isULowercase(wc);
182 case _ISprint:
183 return u_hasBinaryProperty(wc, UCHAR_POSIX_PRINT);
184 case _ISpunct:
185 return u_ispunct(wc);
186 case _ISspace:
187 return u_isUWhiteSpace(wc);
188 case _ISupper:
189 return u_isUUppercase(wc);
190 case _ISxdigit:
191 return u_isxdigit(wc);
192 default:
193 return 0;
194 }
195 }
196
197
198 status_t
ToWCTrans(wint_t wc,wctrans_t transition,wint_t & result)199 ICUCtypeData::ToWCTrans(wint_t wc, wctrans_t transition, wint_t& result)
200 {
201 switch (transition) {
202 case _ISlower:
203 result = u_tolower(wc);
204 return B_OK;
205 case _ISupper:
206 result = u_toupper(wc);
207 return B_OK;
208 default:
209 return B_BAD_VALUE;
210 }
211 }
212
213
214 status_t
MultibyteToWchar(wchar_t * wcOut,const char * mb,size_t mbLen,mbstate_t * mbState,size_t & lengthOut)215 ICUCtypeData::MultibyteToWchar(wchar_t* wcOut, const char* mb, size_t mbLen,
216 mbstate_t* mbState, size_t& lengthOut)
217 {
218 UConverter* converter = NULL;
219 status_t result = _GetConverterForMbState(mbState, converter);
220 if (result != B_OK) {
221 TRACE(("MultibyteToWchar(): couldn't get converter for mbstate %p - "
222 "%" B_PRIx32 "\n", mbState, result));
223 return result;
224 }
225
226 // do the conversion
227 UErrorCode icuStatus = U_ZERO_ERROR;
228
229 const char* buffer = mb;
230 UChar targetBuffer[3];
231 UChar* target = targetBuffer;
232 ucnv_toUnicode(converter, &target, target + 1, &buffer, buffer + mbLen,
233 NULL, FALSE, &icuStatus);
234 size_t sourceLengthUsed = buffer - mb;
235 size_t targetLengthUsed = (size_t)(target - targetBuffer);
236
237 if (U16_IS_LEAD(targetBuffer[0])) {
238 // we have a surrogate pair, read the second character
239 TRACE(("MultibyteToWchar(): have a surrogate pair\n"));
240 icuStatus = U_ZERO_ERROR;
241 ucnv_toUnicode(converter, &target, target + 2 - targetLengthUsed,
242 &buffer, buffer + mbLen - sourceLengthUsed,
243 NULL, FALSE, &icuStatus);
244 sourceLengthUsed = buffer - mb;
245 targetLengthUsed = (size_t)(target - targetBuffer);
246 }
247
248 if (icuStatus == U_BUFFER_OVERFLOW_ERROR && targetLengthUsed > 0) {
249 // we've got one character, which is all that we wanted
250 icuStatus = U_ZERO_ERROR;
251 }
252
253 if (!U_SUCCESS(icuStatus)) {
254 // conversion failed because of illegal character sequence
255 TRACE(("MultibyteToWchar(): illegal character sequence\n"));
256 ucnv_resetToUnicode(converter);
257 result = B_BAD_DATA;
258 } else if (targetLengthUsed == 0) {
259 TRACE(("MultibyteToWchar(): incomplete character (len=%lu)\n", mbLen));
260 for (size_t i = 0; i < mbLen; ++i)
261 TRACE(("\tbyte %lu: %x\n", i, mb[i]));
262 mbState->count = sourceLengthUsed;
263 result = B_BAD_INDEX;
264 } else {
265 UChar32 unicodeChar = 0xBADBEEF;
266 U16_GET(targetBuffer, 0, 0, targetLengthUsed, unicodeChar);
267
268 if (unicodeChar == 0) {
269 // reset to initial state
270 _DropConverterFromMbState(mbState);
271 memset(mbState, 0, sizeof(mbstate_t));
272 lengthOut = 0;
273 } else {
274 mbState->count = 0;
275 lengthOut = sourceLengthUsed;
276 }
277
278 if (wcOut != NULL)
279 *wcOut = unicodeChar;
280
281 result = B_OK;
282 }
283
284 return result;
285 }
286
287
288 status_t
MultibyteStringToWchar(wchar_t * wcDest,size_t wcDestLength,const char ** mbSource,size_t mbSourceLength,mbstate_t * mbState,size_t & lengthOut)289 ICUCtypeData::MultibyteStringToWchar(wchar_t* wcDest, size_t wcDestLength,
290 const char** mbSource, size_t mbSourceLength, mbstate_t* mbState,
291 size_t& lengthOut)
292 {
293 UConverter* converter = NULL;
294 status_t result = _GetConverterForMbState(mbState, converter);
295 if (result != B_OK) {
296 TRACE(("MultibyteStringToWchar(): couldn't get converter for mbstate %p"
297 " - %" B_PRIx32 "\n", mbState, result));
298 return result;
299 }
300
301 bool wcsIsTerminated = false;
302 const char* source = *mbSource;
303 const char* sourceEnd = source + mbSourceLength;
304 if (sourceEnd < source) {
305 // overflow, clamp to highest possible address
306 sourceEnd = (const char*)-1;
307 }
308
309 if (wcDest == NULL) {
310 // if there's no destination buffer, there's no length limit either
311 wcDestLength = (size_t)-1;
312 }
313
314 UErrorCode icuStatus = U_ZERO_ERROR;
315 size_t sourceLengthUsed = 0;
316 for (lengthOut = 0; lengthOut < wcDestLength; ++lengthOut) {
317 if (sourceLengthUsed >= mbSourceLength)
318 break;
319 UChar32 unicodeChar = ucnv_getNextUChar(converter, &source,
320 std::min(source + MB_CUR_MAX, sourceEnd), &icuStatus);
321 TRACE(("MultibyteStringToWchar() l:%lu wl:%lu s:%p se:%p sl:%lu slu:%lu"
322 " uchar:%x st:%x\n", lengthOut, wcDestLength, source, sourceEnd,
323 mbSourceLength, sourceLengthUsed, unicodeChar, icuStatus));
324 if (!U_SUCCESS(icuStatus))
325 break;
326 sourceLengthUsed = source - *mbSource;
327 if (wcDest != NULL)
328 *wcDest++ = unicodeChar;
329 if (unicodeChar == L'\0') {
330 wcsIsTerminated = true;
331 break;
332 }
333 icuStatus = U_ZERO_ERROR;
334 }
335
336 if (!U_SUCCESS(icuStatus)) {
337 // conversion failed because of illegal character sequence
338 TRACE(("MultibyteStringToWchar(): illegal character sequence\n"));
339 ucnv_resetToUnicode(converter);
340 result = B_BAD_DATA;
341 if (wcDest != NULL)
342 *mbSource = *mbSource + sourceLengthUsed;
343 } else if (wcsIsTerminated) {
344 // reset to initial state
345 _DropConverterFromMbState(mbState);
346 memset(mbState, 0, sizeof(mbstate_t));
347 if (wcDest != NULL)
348 *mbSource = NULL;
349 } else {
350 mbState->count = 0;
351 if (wcDest != NULL)
352 *mbSource = source;
353 }
354
355 return result;
356 }
357
358
359 status_t
WcharToMultibyte(char * mbOut,wchar_t wc,mbstate_t * mbState,size_t & lengthOut)360 ICUCtypeData::WcharToMultibyte(char* mbOut, wchar_t wc, mbstate_t* mbState,
361 size_t& lengthOut)
362 {
363 UConverter* converter = NULL;
364 status_t result = _GetConverterForMbState(mbState, converter);
365 if (result != B_OK) {
366 TRACE(("WcharToMultibyte(): couldn't get converter for mbstate %p - "
367 "%" B_PRIx32 "\n", mbState, result));
368 return result;
369 }
370
371 // convert input from UTF-32 to UTF-16
372 UChar ucharBuffer[2];
373 size_t ucharLength;
374 if (U_IS_BMP(wc)) {
375 ucharBuffer[0] = wc;
376 ucharLength = 1;
377 } else {
378 ucharBuffer[0] = U16_LEAD(wc);
379 ucharBuffer[1] = U16_TRAIL(wc);
380 ucharLength = 2;
381 }
382
383 // do the actual conversion
384 UErrorCode icuStatus = U_ZERO_ERROR;
385 size_t mbLength = mbOut == NULL ? 0 : MB_CUR_MAX;
386 lengthOut = ucnv_fromUChars(converter, mbOut, mbLength, ucharBuffer,
387 ucharLength, &icuStatus);
388 TRACE(("WcharToMultibyte() l:%lu mb:%p ml:%lu uchar:%x st:%x\n", lengthOut,
389 mbOut, mbLength, wc, icuStatus));
390
391 if (icuStatus == U_BUFFER_OVERFLOW_ERROR && mbOut == NULL) {
392 // we have no output buffer, so we ignore buffer overflows
393 icuStatus = U_ZERO_ERROR;
394 }
395
396 if (!U_SUCCESS(icuStatus)) {
397 if (icuStatus == U_ILLEGAL_ARGUMENT_ERROR) {
398 // bad converter (shouldn't really happen)
399 TRACE(("WcharToMultibyte(): bad converter\n"));
400 return B_BAD_VALUE;
401 }
402
403 // conversion failed because of illegal/unmappable character
404 TRACE(("WcharToMultibyte(): illegal character sequence\n"));
405 ucnv_resetFromUnicode(converter);
406 return B_BAD_DATA;
407 }
408
409 if (wc == 0) {
410 // reset to initial state
411 _DropConverterFromMbState(mbState);
412 memset(mbState, 0, sizeof(mbstate_t));
413 }
414
415 return B_OK;
416 }
417
418
419 status_t
WcharStringToMultibyte(char * mbDest,size_t mbDestLength,const wchar_t ** wcSource,size_t wcSourceLength,mbstate_t * mbState,size_t & lengthOut)420 ICUCtypeData::WcharStringToMultibyte(char* mbDest, size_t mbDestLength,
421 const wchar_t** wcSource, size_t wcSourceLength, mbstate_t* mbState,
422 size_t& lengthOut)
423 {
424 UConverter* converter = NULL;
425 status_t result = _GetConverterForMbState(mbState, converter);
426 if (result != B_OK) {
427 TRACE(("WcharStringToMultibyte(): couldn't get converter for mbstate %p"
428 " - %" B_PRIx32 "\n", mbState, result));
429 return result;
430 }
431
432 bool mbsIsTerminated = false;
433 const UChar32* source = (UChar32*)*wcSource;
434
435 UErrorCode icuStatus = U_ZERO_ERROR;
436 lengthOut = 0;
437 size_t sourceLengthUsed = 0;
438 for (; sourceLengthUsed < wcSourceLength; ++sourceLengthUsed, ++source) {
439 if (mbDest != NULL && lengthOut >= mbDestLength)
440 break;
441
442 // convert input from UTF-32 to UTF-16
443 UChar ucharBuffer[2];
444 size_t ucharLength;
445 if (U_IS_BMP(*source)) {
446 ucharBuffer[0] = *source;
447 ucharLength = 1;
448 } else {
449 ucharBuffer[0] = U16_LEAD(*source);
450 ucharBuffer[1] = U16_TRAIL(*source);
451 ucharLength = 2;
452 }
453
454 // do the actual conversion
455 size_t destLength = mbDest == NULL ? 0 : mbDestLength - lengthOut;
456 char buffer[MB_CUR_MAX];
457 size_t mbLength = ucnv_fromUChars(converter,
458 mbDest == NULL ? NULL : buffer, destLength, ucharBuffer,
459 ucharLength, &icuStatus);
460 TRACE(("WcharStringToMultibyte() l:%lu mb:%p ml:%lu s:%p ul:%lu slu:%lu"
461 " uchar:%x st:%x\n", mbLength, mbDest, destLength, source,
462 ucharLength, sourceLengthUsed, *source, icuStatus));
463
464 if (icuStatus == U_BUFFER_OVERFLOW_ERROR) {
465 // ignore buffer overflows ...
466 icuStatus = U_ZERO_ERROR;
467 // ... but stop if the output buffer has been exceeded
468 if (destLength > 0)
469 break;
470 } else if (mbDest != NULL)
471 memcpy(mbDest, buffer, mbLength);
472
473 if (!U_SUCCESS(icuStatus))
474 break;
475 if (mbDest != NULL)
476 mbDest += mbLength;
477 if (*source == L'\0') {
478 mbsIsTerminated = true;
479 break;
480 }
481 lengthOut += mbLength;
482 icuStatus = U_ZERO_ERROR;
483 }
484
485 if (!U_SUCCESS(icuStatus)) {
486 // conversion failed because of illegal character sequence
487 TRACE(("WcharStringToMultibyte(): illegal character sequence\n"));
488 ucnv_resetFromUnicode(converter);
489 result = B_BAD_DATA;
490 if (mbDest != NULL)
491 *wcSource = *wcSource + sourceLengthUsed;
492 } else if (mbsIsTerminated) {
493 // reset to initial state
494 _DropConverterFromMbState(mbState);
495 memset(mbState, 0, sizeof(mbstate_t));
496 if (mbDest != NULL)
497 *wcSource = NULL;
498 } else {
499 mbState->count = 0;
500 if (mbDest != NULL)
501 *wcSource = (wchar_t*)source;
502 }
503
504 return result;
505 }
506
507
508 const char*
GetLanginfo(int index)509 ICUCtypeData::GetLanginfo(int index)
510 {
511 switch(index) {
512 case CODESET:
513 return fGivenCharset;
514 default:
515 return "";
516 }
517 }
518
519
520 status_t
_GetConverterForMbState(mbstate_t * mbState,UConverter * & converterOut)521 ICUCtypeData::_GetConverterForMbState(mbstate_t* mbState,
522 UConverter*& converterOut)
523 {
524 if (strcmp(mbState->charset, fGivenCharset) == 0
525 && (char*)mbState->converter >= mbState->data
526 && (char*)mbState->converter < mbState->data + 8) {
527 // charset matches and converter actually lives in *this* mbState,
528 // so we can use it (if the converter points to the outside, it means
529 // that the mbstate_t has been copied)
530 converterOut = (UConverter*)mbState->converter;
531 return B_OK;
532 }
533
534 // charset no longer matches the converter, we need to dump it and
535 // create a new one
536 _DropConverterFromMbState(mbState);
537
538 // create a new converter for the current charset ...
539 UConverter* icuConverter;
540 status_t result = _GetConverter(icuConverter);
541 if (result != B_OK)
542 return result;
543
544 // ... and clone it into the mbstate
545 UErrorCode icuStatus = U_ZERO_ERROR;
546 int32_t bufferSize = sizeof(mbState->data);
547 UConverter* clone
548 = ucnv_safeClone(icuConverter, mbState->data, &bufferSize, &icuStatus);
549
550 if (clone == NULL || !U_SUCCESS(icuStatus))
551 return B_ERROR;
552
553 if ((char*)clone < mbState->data || (char*)clone >= mbState->data + 8) {
554 // buffer is too small (shouldn't happen according to ICU docs)
555 return B_NO_MEMORY;
556 }
557
558 strlcpy(mbState->charset, fGivenCharset, sizeof(mbState->charset));
559 mbState->converter = clone;
560
561 converterOut = clone;
562
563 return B_OK;
564 }
565
566
567 status_t
_DropConverterFromMbState(mbstate_t * mbState)568 ICUCtypeData::_DropConverterFromMbState(mbstate_t* mbState)
569 {
570 if (mbState->converter != NULL && (char*)mbState->converter >= mbState->data
571 && (char*)mbState->converter < mbState->data + 8) {
572 // check that the converter actually lives in *this* mbState,
573 // otherwise we risk freeing a converter that doesn't belong to us;
574 // this parallels the check in _GetConverterForMbState()
575 ucnv_close((UConverter*)mbState->converter);
576 }
577 memset(mbState, 0, sizeof(mbstate_t));
578
579 return B_OK;
580 }
581
582
583 } // namespace Libroot
584 } // namespace BPrivate
585