1 /* 2 * Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Axel Dörfler, axeld@pinc-software.de 7 * Siarzhuk Zharski, zharik@gmx.li 8 * 9 */ 10 11 12 #include <UnicodeChar.h> 13 14 #include <unicode/uchar.h> 15 #include <unicode/utf8.h> 16 17 18 BUnicodeChar::BUnicodeChar() 19 { 20 } 21 22 23 // Returns the general category value for the code point. 24 int8 25 BUnicodeChar::Type(uint32 c) 26 { 27 BUnicodeChar(); 28 return u_charType(c); 29 } 30 31 32 // Determines whether the specified code point is a letter character. 33 // True for general categories "L" (letters). 34 bool 35 BUnicodeChar::IsAlpha(uint32 c) 36 { 37 BUnicodeChar(); 38 return u_isalpha(c); 39 } 40 41 42 // Determines whether the specified code point is an alphanumeric character 43 // (letter or digit). 44 // True for characters with general categories 45 // "L" (letters) and "Nd" (decimal digit numbers). 46 bool 47 BUnicodeChar::IsAlNum(uint32 c) 48 { 49 BUnicodeChar(); 50 return u_isalnum(c); 51 } 52 53 54 // Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE). 55 bool 56 BUnicodeChar::IsLower(uint32 c) 57 { 58 BUnicodeChar(); 59 return u_isULowercase(c); 60 } 61 62 63 // Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE). 64 bool 65 BUnicodeChar::IsUpper(uint32 c) 66 { 67 BUnicodeChar(); 68 return u_isUUppercase(c); 69 } 70 71 72 // Determines whether the specified code point is a titlecase letter. 73 // True for general category "Lt" (titlecase letter). 74 bool 75 BUnicodeChar::IsTitle(uint32 c) 76 { 77 BUnicodeChar(); 78 return u_istitle(c); 79 } 80 81 82 // Determines whether the specified code point is a digit character. 83 // True for characters with general category "Nd" (decimal digit numbers). 84 // Beginning with Unicode 4, this is the same as 85 // testing for the Numeric_Type of Decimal. 86 bool 87 BUnicodeChar::IsDigit(uint32 c) 88 { 89 BUnicodeChar(); 90 return u_isdigit(c); 91 } 92 93 94 // Determines whether the specified code point is a hexadecimal digit. 95 // This is equivalent to u_digit(c, 16)>=0. 96 // True for characters with general category "Nd" (decimal digit numbers) 97 // as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII. 98 // (That is, for letters with code points 99 // 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.) 100 bool 101 BUnicodeChar::IsHexDigit(uint32 c) 102 { 103 BUnicodeChar(); 104 return u_isxdigit(c); 105 } 106 107 108 // Determines whether the specified code point is "defined", 109 // which usually means that it is assigned a character. 110 // True for general categories other than "Cn" (other, not assigned), 111 // i.e., true for all code points mentioned in UnicodeData.txt. 112 bool 113 BUnicodeChar::IsDefined(uint32 c) 114 { 115 BUnicodeChar(); 116 return u_isdefined(c); 117 } 118 119 120 // Determines whether the specified code point is a base character. 121 // True for general categories "L" (letters), "N" (numbers), 122 // "Mc" (spacing combining marks), and "Me" (enclosing marks). 123 bool 124 BUnicodeChar::IsBase(uint32 c) 125 { 126 BUnicodeChar(); 127 return u_isbase(c); 128 } 129 130 131 // Determines whether the specified code point is a control character 132 // (as defined by this function). 133 // A control character is one of the following: 134 // - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) 135 // - U_CONTROL_CHAR (Cc) 136 // - U_FORMAT_CHAR (Cf) 137 // - U_LINE_SEPARATOR (Zl) 138 // - U_PARAGRAPH_SEPARATOR (Zp) 139 bool 140 BUnicodeChar::IsControl(uint32 c) 141 { 142 BUnicodeChar(); 143 return u_iscntrl(c); 144 } 145 146 147 // Determines whether the specified code point is a punctuation character. 148 // True for characters with general categories "P" (punctuation). 149 bool 150 BUnicodeChar::IsPunctuation(uint32 c) 151 { 152 BUnicodeChar(); 153 return u_ispunct(c); 154 } 155 156 157 // Determine if the specified code point is a space character according to Java. 158 // True for characters with general categories "Z" (separators), 159 // which does not include control codes (e.g., TAB or Line Feed). 160 bool 161 BUnicodeChar::IsSpace(uint32 c) 162 { 163 BUnicodeChar(); 164 return u_isJavaSpaceChar(c); 165 } 166 167 168 // Determines if the specified code point is a whitespace character 169 // A character is considered to be a whitespace character if and only 170 // if it satisfies one of the following criteria: 171 // - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), 172 // but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space 173 // or U+202F Narrow NBSP). 174 // - It is U+0009 HORIZONTAL TABULATION. 175 // - It is U+000A LINE FEED. 176 // - It is U+000B VERTICAL TABULATION. 177 // - It is U+000C FORM FEED. 178 // - It is U+000D CARRIAGE RETURN. 179 // - It is U+001C FILE SEPARATOR. 180 // - It is U+001D GROUP SEPARATOR. 181 // - It is U+001E RECORD SEPARATOR. 182 // - It is U+001F UNIT SEPARATOR. 183 bool 184 BUnicodeChar::IsWhitespace(uint32 c) 185 { 186 BUnicodeChar(); 187 return u_isWhitespace(c); 188 } 189 190 191 // Determines whether the specified code point is a printable character. 192 // True for general categories other than "C" (controls). 193 bool 194 BUnicodeChar::IsPrintable(uint32 c) 195 { 196 BUnicodeChar(); 197 return u_isprint(c); 198 } 199 200 201 // #pragma mark - 202 203 uint32 204 BUnicodeChar::ToLower(uint32 c) 205 { 206 BUnicodeChar(); 207 return u_tolower(c); 208 } 209 210 211 uint32 212 BUnicodeChar::ToUpper(uint32 c) 213 { 214 BUnicodeChar(); 215 return u_toupper(c); 216 } 217 218 219 uint32 220 BUnicodeChar::ToTitle(uint32 c) 221 { 222 BUnicodeChar(); 223 return u_totitle(c); 224 } 225 226 227 int32 228 BUnicodeChar::DigitValue(uint32 c) 229 { 230 BUnicodeChar(); 231 return u_digit(c, 10); 232 } 233 234 235 unicode_east_asian_width 236 BUnicodeChar::EastAsianWidth(uint32 c) 237 { 238 return (unicode_east_asian_width)u_getIntPropertyValue(c, 239 UCHAR_EAST_ASIAN_WIDTH); 240 } 241 242 243 void 244 BUnicodeChar::ToUTF8(uint32 c, char** out) 245 { 246 int i = 0; 247 U8_APPEND_UNSAFE(*out, i, c); 248 *out += i; 249 } 250 251 252 uint32 253 BUnicodeChar::FromUTF8(const char** in) 254 { 255 int i = 0; 256 uint32 c = 0; 257 U8_NEXT_UNSAFE(*in, i, c); 258 *in += i; 259 260 return c; 261 } 262 263 264 size_t 265 BUnicodeChar::UTF8StringLength(const char* string) 266 { 267 size_t len = 0; 268 while (*string) { 269 FromUTF8(&string); 270 len++; 271 } 272 return len; 273 } 274 275 276 size_t 277 BUnicodeChar::UTF8StringLength(const char* string, size_t maxLength) 278 { 279 size_t len = 0; 280 while (len < maxLength && *string) { 281 FromUTF8(&string); 282 len++; 283 } 284 return len; 285 } 286