/* * Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. * Distributed under the terms of the MIT License. * * Authors: * Axel Dörfler, axeld@pinc-software.de * Siarzhuk Zharski, zharik@gmx.li * */ #include #include #include BUnicodeChar::BUnicodeChar() { } // Returns the general category value for the code point. int8 BUnicodeChar::Type(uint32 c) { BUnicodeChar(); return u_charType(c); } // Determines whether the specified code point is a letter character. // True for general categories "L" (letters). bool BUnicodeChar::IsAlpha(uint32 c) { BUnicodeChar(); return u_isalpha(c); } // Determines whether the specified code point is an alphanumeric character // (letter or digit). // True for characters with general categories // "L" (letters) and "Nd" (decimal digit numbers). bool BUnicodeChar::IsAlNum(uint32 c) { BUnicodeChar(); return u_isalnum(c); } // Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE). bool BUnicodeChar::IsLower(uint32 c) { BUnicodeChar(); return u_isULowercase(c); } // Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE). bool BUnicodeChar::IsUpper(uint32 c) { BUnicodeChar(); return u_isUUppercase(c); } // Determines whether the specified code point is a titlecase letter. // True for general category "Lt" (titlecase letter). bool BUnicodeChar::IsTitle(uint32 c) { BUnicodeChar(); return u_istitle(c); } // Determines whether the specified code point is a digit character. // True for characters with general category "Nd" (decimal digit numbers). // Beginning with Unicode 4, this is the same as // testing for the Numeric_Type of Decimal. bool BUnicodeChar::IsDigit(uint32 c) { BUnicodeChar(); return u_isdigit(c); } // Determines whether the specified code point is a hexadecimal digit. // This is equivalent to u_digit(c, 16)>=0. // True for characters with general category "Nd" (decimal digit numbers) // as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII. // (That is, for letters with code points // 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.) bool BUnicodeChar::IsHexDigit(uint32 c) { BUnicodeChar(); return u_isxdigit(c); } // Determines whether the specified code point is "defined", // which usually means that it is assigned a character. // True for general categories other than "Cn" (other, not assigned), // i.e., true for all code points mentioned in UnicodeData.txt. bool BUnicodeChar::IsDefined(uint32 c) { BUnicodeChar(); return u_isdefined(c); } // Determines whether the specified code point is a base character. // True for general categories "L" (letters), "N" (numbers), // "Mc" (spacing combining marks), and "Me" (enclosing marks). bool BUnicodeChar::IsBase(uint32 c) { BUnicodeChar(); return u_isbase(c); } // Determines whether the specified code point is a control character // (as defined by this function). // A control character is one of the following: // - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) // - U_CONTROL_CHAR (Cc) // - U_FORMAT_CHAR (Cf) // - U_LINE_SEPARATOR (Zl) // - U_PARAGRAPH_SEPARATOR (Zp) bool BUnicodeChar::IsControl(uint32 c) { BUnicodeChar(); return u_iscntrl(c); } // Determines whether the specified code point is a punctuation character. // True for characters with general categories "P" (punctuation). bool BUnicodeChar::IsPunctuation(uint32 c) { BUnicodeChar(); return u_ispunct(c); } // Determine if the specified code point is a space character according to Java. // True for characters with general categories "Z" (separators), // which does not include control codes (e.g., TAB or Line Feed). bool BUnicodeChar::IsSpace(uint32 c) { BUnicodeChar(); return u_isJavaSpaceChar(c); } // Determines if the specified code point is a whitespace character // A character is considered to be a whitespace character if and only // if it satisfies one of the following criteria: // - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), // but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space // or U+202F Narrow NBSP). // - It is U+0009 HORIZONTAL TABULATION. // - It is U+000A LINE FEED. // - It is U+000B VERTICAL TABULATION. // - It is U+000C FORM FEED. // - It is U+000D CARRIAGE RETURN. // - It is U+001C FILE SEPARATOR. // - It is U+001D GROUP SEPARATOR. // - It is U+001E RECORD SEPARATOR. // - It is U+001F UNIT SEPARATOR. bool BUnicodeChar::IsWhitespace(uint32 c) { BUnicodeChar(); return u_isWhitespace(c); } // Determines whether the specified code point is a printable character. // True for general categories other than "C" (controls). bool BUnicodeChar::IsPrintable(uint32 c) { BUnicodeChar(); return u_isprint(c); } // #pragma mark - uint32 BUnicodeChar::ToLower(uint32 c) { BUnicodeChar(); return u_tolower(c); } uint32 BUnicodeChar::ToUpper(uint32 c) { BUnicodeChar(); return u_toupper(c); } uint32 BUnicodeChar::ToTitle(uint32 c) { BUnicodeChar(); return u_totitle(c); } int32 BUnicodeChar::DigitValue(uint32 c) { BUnicodeChar(); return u_digit(c, 10); } unicode_east_asian_width BUnicodeChar::EastAsianWidth(uint32 c) { return (unicode_east_asian_width)u_getIntPropertyValue(c, UCHAR_EAST_ASIAN_WIDTH); } void BUnicodeChar::ToUTF8(uint32 c, char** out) { int i = 0; U8_APPEND_UNSAFE(*out, i, c); *out += i; } uint32 BUnicodeChar::FromUTF8(const char** in) { int i = 0; uint32 c = 0; U8_NEXT_UNSAFE(*in, i, c); *in += i; return c; } size_t BUnicodeChar::UTF8StringLength(const char* string) { size_t len = 0; while (*string) { FromUTF8(&string); len++; } return len; } size_t BUnicodeChar::UTF8StringLength(const char* string, size_t maxLength) { size_t len = 0; while (len < maxLength && *string) { FromUTF8(&string); len++; } return len; }