1 /* 2 * Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Axel Dörfler, axeld@pinc-software.de 7 * Siarzhuk Zharski, zharik@gmx.li 8 * 9 */ 10 11 12 #include <UnicodeChar.h> 13 14 #include <unicode/uchar.h> 15 #include <unicode/utf8.h> 16 17 18 BUnicodeChar::BUnicodeChar() 19 { 20 } 21 22 23 // Returns the general category value for the code point. 24 int8 25 BUnicodeChar::Type(uint32 c) 26 { 27 return u_charType(c); 28 } 29 30 31 // Determines whether the specified code point is a letter character. 32 // True for general categories "L" (letters). 33 bool 34 BUnicodeChar::IsAlpha(uint32 c) 35 { 36 return u_isalpha(c); 37 } 38 39 40 // Determines whether the specified code point is an alphanumeric character 41 // (letter or digit). 42 // True for characters with general categories 43 // "L" (letters) and "Nd" (decimal digit numbers). 44 bool 45 BUnicodeChar::IsAlNum(uint32 c) 46 { 47 return u_isalnum(c); 48 } 49 50 51 // Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE). 52 bool 53 BUnicodeChar::IsLower(uint32 c) 54 { 55 return u_isULowercase(c); 56 } 57 58 59 // Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE). 60 bool 61 BUnicodeChar::IsUpper(uint32 c) 62 { 63 return u_isUUppercase(c); 64 } 65 66 67 // Determines whether the specified code point is a titlecase letter. 68 // True for general category "Lt" (titlecase letter). 69 bool 70 BUnicodeChar::IsTitle(uint32 c) 71 { 72 return u_istitle(c); 73 } 74 75 76 // Determines whether the specified code point is a digit character. 77 // True for characters with general category "Nd" (decimal digit numbers). 78 // Beginning with Unicode 4, this is the same as 79 // testing for the Numeric_Type of Decimal. 80 bool 81 BUnicodeChar::IsDigit(uint32 c) 82 { 83 return u_isdigit(c); 84 } 85 86 87 // Determines whether the specified code point is a hexadecimal digit. 88 // This is equivalent to u_digit(c, 16)>=0. 89 // True for characters with general category "Nd" (decimal digit numbers) 90 // as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII. 91 // (That is, for letters with code points 92 // 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.) 93 bool 94 BUnicodeChar::IsHexDigit(uint32 c) 95 { 96 return u_isxdigit(c); 97 } 98 99 100 // Determines whether the specified code point is "defined", 101 // which usually means that it is assigned a character. 102 // True for general categories other than "Cn" (other, not assigned), 103 // i.e., true for all code points mentioned in UnicodeData.txt. 104 bool 105 BUnicodeChar::IsDefined(uint32 c) 106 { 107 return u_isdefined(c); 108 } 109 110 111 // Determines whether the specified code point is a base character. 112 // True for general categories "L" (letters), "N" (numbers), 113 // "Mc" (spacing combining marks), and "Me" (enclosing marks). 114 bool 115 BUnicodeChar::IsBase(uint32 c) 116 { 117 return u_isbase(c); 118 } 119 120 121 // Determines whether the specified code point is a control character 122 // (as defined by this function). 123 // A control character is one of the following: 124 // - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) 125 // - U_CONTROL_CHAR (Cc) 126 // - U_FORMAT_CHAR (Cf) 127 // - U_LINE_SEPARATOR (Zl) 128 // - U_PARAGRAPH_SEPARATOR (Zp) 129 bool 130 BUnicodeChar::IsControl(uint32 c) 131 { 132 return u_iscntrl(c); 133 } 134 135 136 // Determines whether the specified code point is a punctuation character. 137 // True for characters with general categories "P" (punctuation). 138 bool 139 BUnicodeChar::IsPunctuation(uint32 c) 140 { 141 return u_ispunct(c); 142 } 143 144 145 // Determine if the specified code point is a space character according to Java. 146 // True for characters with general categories "Z" (separators), 147 // which does not include control codes (e.g., TAB or Line Feed). 148 bool 149 BUnicodeChar::IsSpace(uint32 c) 150 { 151 return u_isJavaSpaceChar(c); 152 } 153 154 155 // Determines if the specified code point is a whitespace character 156 // A character is considered to be a whitespace character if and only 157 // if it satisfies one of the following criteria: 158 // - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), 159 // but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space 160 // or U+202F Narrow NBSP). 161 // - It is U+0009 HORIZONTAL TABULATION. 162 // - It is U+000A LINE FEED. 163 // - It is U+000B VERTICAL TABULATION. 164 // - It is U+000C FORM FEED. 165 // - It is U+000D CARRIAGE RETURN. 166 // - It is U+001C FILE SEPARATOR. 167 // - It is U+001D GROUP SEPARATOR. 168 // - It is U+001E RECORD SEPARATOR. 169 // - It is U+001F UNIT SEPARATOR. 170 bool 171 BUnicodeChar::IsWhitespace(uint32 c) 172 { 173 return u_isWhitespace(c); 174 } 175 176 177 // Determines whether the specified code point is a printable character. 178 // True for general categories other than "C" (controls). 179 bool 180 BUnicodeChar::IsPrintable(uint32 c) 181 { 182 return u_isprint(c); 183 } 184 185 186 // #pragma mark - 187 188 uint32 189 BUnicodeChar::ToLower(uint32 c) 190 { 191 return u_tolower(c); 192 } 193 194 195 uint32 196 BUnicodeChar::ToUpper(uint32 c) 197 { 198 return u_toupper(c); 199 } 200 201 202 uint32 203 BUnicodeChar::ToTitle(uint32 c) 204 { 205 return u_totitle(c); 206 } 207 208 209 int32 210 BUnicodeChar::DigitValue(uint32 c) 211 { 212 return u_digit(c, 10); 213 } 214 215 216 unicode_east_asian_width 217 BUnicodeChar::EastAsianWidth(uint32 c) 218 { 219 return (unicode_east_asian_width)u_getIntPropertyValue(c, 220 UCHAR_EAST_ASIAN_WIDTH); 221 } 222 223 224 void 225 BUnicodeChar::ToUTF8(uint32 c, char** out) 226 { 227 int i = 0; 228 U8_APPEND_UNSAFE(*out, i, c); 229 *out += i; 230 } 231 232 233 uint32 234 BUnicodeChar::FromUTF8(const char** in) 235 { 236 int i = 0; 237 uint32 c = 0; 238 U8_NEXT_UNSAFE(*in, i, c); 239 *in += i; 240 241 return c; 242 } 243 244 245 size_t 246 BUnicodeChar::UTF8StringLength(const char* string) 247 { 248 size_t len = 0; 249 while (*string) { 250 FromUTF8(&string); 251 len++; 252 } 253 return len; 254 } 255 256 257 size_t 258 BUnicodeChar::UTF8StringLength(const char* string, size_t maxLength) 259 { 260 size_t len = 0; 261 while (len < maxLength && *string) { 262 FromUTF8(&string); 263 len++; 264 } 265 return len; 266 } 267