1 #ifndef _UNICODE_CHAR_H_ 2 #define _UNICODE_CHAR_H_ 3 4 #include <SupportDefs.h> 5 6 #include <LocaleBuild.h> 7 8 enum unicode_char_category 9 { 10 // Non-category for unassigned and non-character code points. 11 B_UNICODE_UNASSIGNED = 0, 12 13 B_UNICODE_UPPERCASE_LETTER = 1, // Lu 14 B_UNICODE_LOWERCASE_LETTER = 2, // Ll 15 B_UNICODE_TITLECASE_LETTER = 3, // Lt 16 B_UNICODE_MODIFIER_LETTER = 4, // Lm 17 B_UNICODE_OTHER_LETTER = 5, // Lo 18 B_UNICODE_NON_SPACING_MARK = 6, // Mn 19 B_UNICODE_ENCLOSING_MARK = 7, // Me 20 B_UNICODE_COMBINING_SPACING_MARK = 8, // Mc 21 B_UNICODE_DECIMAL_DIGIT_NUMBER = 9, // Nd 22 B_UNICODE_LETTER_NUMBER = 10, // Nl 23 B_UNICODE_OTHER_NUMBER = 11, // No 24 B_UNICODE_SPACE_SEPARATOR = 12, // Zs 25 B_UNICODE_LINE_SEPARATOR = 13, // Zl 26 B_UNICODE_PARAGRAPH_SEPARATOR = 14, // Zp 27 B_UNICODE_CONTROL_CHAR = 15, // Cc 28 B_UNICODE_FORMAT_CHAR = 16, // Cf 29 B_UNICODE_PRIVATE_USE_CHAR = 17, // Co 30 B_UNICODE_SURROGATE = 18, // Cs 31 B_UNICODE_DASH_PUNCTUATION = 19, // Pd 32 B_UNICODE_START_PUNCTUATION = 20, // Ps 33 B_UNICODE_END_PUNCTUATION = 21, // Pe 34 B_UNICODE_CONNECTOR_PUNCTUATION = 22, // Pc 35 B_UNICODE_OTHER_PUNCTUATION = 23, // Po 36 B_UNICODE_MATH_SYMBOL = 24, // Sm 37 B_UNICODE_CURRENCY_SYMBOL = 25, // Sc 38 B_UNICODE_MODIFIER_SYMBOL = 26, // Sk 39 B_UNICODE_OTHER_SYMBOL = 27, // So 40 B_UNICODE_INITIAL_PUNCTUATION = 28, // Pi 41 B_UNICODE_FINAL_PUNCTUATION = 29, // Pf 42 B_UNICODE_GENERAL_OTHER_TYPES = 30, // Cn 43 44 B_UNICODE_CATEGORY_COUNT 45 }; 46 47 48 /** 49 * This specifies the language directional property of a character set. 50 */ 51 52 enum unicode_char_direction { 53 B_UNICODE_LEFT_TO_RIGHT = 0, 54 B_UNICODE_RIGHT_TO_LEFT = 1, 55 B_UNICODE_EUROPEAN_NUMBER = 2, 56 B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3, 57 B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4, 58 B_UNICODE_ARABIC_NUMBER = 5, 59 B_UNICODE_COMMON_NUMBER_SEPARATOR = 6, 60 B_UNICODE_BLOCK_SEPARATOR = 7, 61 B_UNICODE_SEGMENT_SEPARATOR = 8, 62 B_UNICODE_WHITE_SPACE_NEUTRAL = 9, 63 B_UNICODE_OTHER_NEUTRAL = 10, 64 B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11, 65 B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12, 66 B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13, 67 B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14, 68 B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15, 69 B_UNICODE_POP_DIRECTIONAL_FORMAT = 16, 70 B_UNICODE_DIR_NON_SPACING_MARK = 17, 71 B_UNICODE_BOUNDARY_NEUTRAL = 18, 72 73 B_UNICODE_DIRECTION_COUNT 74 }; 75 76 77 /** 78 * Script range as defined in the Unicode standard. 79 */ 80 81 enum unicode_char_script { 82 // Script names 83 B_UNICODE_BASIC_LATIN, 84 B_UNICODE_LATIN_1_SUPPLEMENT, 85 B_UNICODE_LATIN_EXTENDED_A, 86 B_UNICODE_LATIN_EXTENDED_B, 87 B_UNICODE_IPA_EXTENSIONS, 88 B_UNICODE_SPACING_MODIFIER_LETTERS, 89 B_UNICODE_COMBINING_DIACRITICAL_MARKS, 90 B_UNICODE_GREEK, 91 B_UNICODE_CYRILLIC, 92 B_UNICODE_ARMENIAN, 93 B_UNICODE_HEBREW, 94 B_UNICODE_ARABIC, 95 B_UNICODE_SYRIAC, 96 B_UNICODE_THAANA, 97 B_UNICODE_DEVANAGARI, 98 B_UNICODE_BENGALI, 99 B_UNICODE_GURMUKHI, 100 B_UNICODE_GUJARATI, 101 B_UNICODE_ORIYA, 102 B_UNICODE_TAMIL, 103 B_UNICODE_TELUGU, 104 B_UNICODE_KANNADA, 105 B_UNICODE_MALAYALAM, 106 B_UNICODE_SINHALA, 107 B_UNICODE_THAI, 108 B_UNICODE_LAO, 109 B_UNICODE_TIBETAN, 110 B_UNICODE_MYANMAR, 111 B_UNICODE_GEORGIAN, 112 B_UNICODE_HANGUL_JAMO, 113 B_UNICODE_ETHIOPIC, 114 B_UNICODE_CHEROKEE, 115 B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 116 B_UNICODE_OGHAM, 117 B_UNICODE_RUNIC, 118 B_UNICODE_KHMER, 119 B_UNICODE_MONGOLIAN, 120 B_UNICODE_LATIN_EXTENDED_ADDITIONAL, 121 B_UNICODE_GREEK_EXTENDED, 122 B_UNICODE_GENERAL_PUNCTUATION, 123 B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS, 124 B_UNICODE_CURRENCY_SYMBOLS, 125 B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS, 126 B_UNICODE_LETTERLIKE_SYMBOLS, 127 B_UNICODE_NUMBER_FORMS, 128 B_UNICODE_ARROWS, 129 B_UNICODE_MATHEMATICAL_OPERATORS, 130 B_UNICODE_MISCELLANEOUS_TECHNICAL, 131 B_UNICODE_CONTROL_PICTURES, 132 B_UNICODE_OPTICAL_CHARACTER_RECOGNITION, 133 B_UNICODE_ENCLOSED_ALPHANUMERICS, 134 B_UNICODE_BOX_DRAWING, 135 B_UNICODE_BLOCK_ELEMENTS, 136 B_UNICODE_GEOMETRIC_SHAPES, 137 B_UNICODE_MISCELLANEOUS_SYMBOLS, 138 B_UNICODE_DINGBATS, 139 B_UNICODE_BRAILLE_PATTERNS, 140 B_UNICODE_CJK_RADICALS_SUPPLEMENT, 141 B_UNICODE_KANGXI_RADICALS, 142 B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 143 B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION, 144 B_UNICODE_HIRAGANA, 145 B_UNICODE_KATAKANA, 146 B_UNICODE_BOPOMOFO, 147 B_UNICODE_HANGUL_COMPATIBILITY_JAMO, 148 B_UNICODE_KANBUN, 149 B_UNICODE_BOPOMOFO_EXTENDED, 150 B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS, 151 B_UNICODE_CJK_COMPATIBILITY, 152 B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 153 B_UNICODE_CJK_UNIFIED_IDEOGRAPHS, 154 B_UNICODE_YI_SYLLABLES, 155 B_UNICODE_YI_RADICALS, 156 B_UNICODE_HANGUL_SYLLABLES, 157 B_UNICODE_HIGH_SURROGATES, 158 B_UNICODE_HIGH_PRIVATE_USE_SURROGATES, 159 B_UNICODE_LOW_SURROGATES, 160 B_UNICODE_PRIVATE_USE_AREA, 161 B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS, 162 B_UNICODE_ALPHABETIC_PRESENTATION_FORMS, 163 B_UNICODE_ARABIC_PRESENTATION_FORMS_A, 164 B_UNICODE_COMBINING_HALF_MARKS, 165 B_UNICODE_CJK_COMPATIBILITY_FORMS, 166 B_UNICODE_SMALL_FORM_VARIANTS, 167 B_UNICODE_ARABIC_PRESENTATION_FORMS_B, 168 B_UNICODE_SPECIALS, 169 B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS, 170 171 B_UNICODE_SCRIPT_COUNT, 172 B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT 173 }; 174 175 176 /** 177 * Values returned by the u_getCellWidth() function. 178 */ 179 180 enum unicode_cell_width 181 { 182 B_UNICODE_ZERO_WIDTH = 0, 183 B_UNICODE_HALF_WIDTH = 1, 184 B_UNICODE_FULL_WIDTH = 2, 185 B_UNICODE_NEUTRAL_WIDTH = 3, 186 187 B_UNICODE_CELL_WIDTH_COUNT 188 }; 189 190 191 class _IMPEXP_LOCALE BUnicodeChar { 192 public: 193 static bool IsAlpha(uint32 c); 194 static bool IsAlNum(uint32 c); 195 static bool IsDigit(uint32 c); 196 static bool IsHexDigit(uint32 c); 197 static bool IsUpper(uint32 c); 198 static bool IsLower(uint32 c); 199 static bool IsSpace(uint32 c); 200 static bool IsWhitespace(uint32 c); 201 static bool IsControl(uint32 c); 202 static bool IsPunctuation(uint32 c); 203 static bool IsPrintable(uint32 c); 204 static bool IsTitle(uint32 c); 205 static bool IsDefined(uint32 c); 206 static bool IsBase(uint32 c); 207 208 static int8 Type(uint32 c); 209 210 static uint32 ToLower(uint32 c); 211 static uint32 ToUpper(uint32 c); 212 static uint32 ToTitle(uint32 c); 213 static int32 DigitValue(uint32 c); 214 215 static void ToUTF8(uint32 c, char **out); 216 static uint32 FromUTF8(const char **in); 217 static uint32 FromUTF8(const char *in); 218 219 static size_t UTF8StringLength(const char *str); 220 static size_t UTF8StringLength(const char *str, size_t maxLength); 221 222 private: 223 BUnicodeChar(); 224 }; 225 226 227 inline uint32 228 BUnicodeChar::FromUTF8(const char *in) 229 { 230 const char *string = in; 231 return FromUTF8(&string); 232 } 233 234 235 #endif /* _UNICODE_CHAR_H_ */ 236