1 #ifndef _UNICODE_CHAR_H_ 2 #define _UNICODE_CHAR_H_ 3 4 #include <SupportDefs.h> 5 6 enum unicode_char_category 7 { 8 // Non-category for unassigned and non-character code points. 9 B_UNICODE_UNASSIGNED = 0, 10 11 B_UNICODE_UPPERCASE_LETTER = 1, // Lu 12 B_UNICODE_LOWERCASE_LETTER = 2, // Ll 13 B_UNICODE_TITLECASE_LETTER = 3, // Lt 14 B_UNICODE_MODIFIER_LETTER = 4, // Lm 15 B_UNICODE_OTHER_LETTER = 5, // Lo 16 B_UNICODE_NON_SPACING_MARK = 6, // Mn 17 B_UNICODE_ENCLOSING_MARK = 7, // Me 18 B_UNICODE_COMBINING_SPACING_MARK = 8, // Mc 19 B_UNICODE_DECIMAL_DIGIT_NUMBER = 9, // Nd 20 B_UNICODE_LETTER_NUMBER = 10, // Nl 21 B_UNICODE_OTHER_NUMBER = 11, // No 22 B_UNICODE_SPACE_SEPARATOR = 12, // Zs 23 B_UNICODE_LINE_SEPARATOR = 13, // Zl 24 B_UNICODE_PARAGRAPH_SEPARATOR = 14, // Zp 25 B_UNICODE_CONTROL_CHAR = 15, // Cc 26 B_UNICODE_FORMAT_CHAR = 16, // Cf 27 B_UNICODE_PRIVATE_USE_CHAR = 17, // Co 28 B_UNICODE_SURROGATE = 18, // Cs 29 B_UNICODE_DASH_PUNCTUATION = 19, // Pd 30 B_UNICODE_START_PUNCTUATION = 20, // Ps 31 B_UNICODE_END_PUNCTUATION = 21, // Pe 32 B_UNICODE_CONNECTOR_PUNCTUATION = 22, // Pc 33 B_UNICODE_OTHER_PUNCTUATION = 23, // Po 34 B_UNICODE_MATH_SYMBOL = 24, // Sm 35 B_UNICODE_CURRENCY_SYMBOL = 25, // Sc 36 B_UNICODE_MODIFIER_SYMBOL = 26, // Sk 37 B_UNICODE_OTHER_SYMBOL = 27, // So 38 B_UNICODE_INITIAL_PUNCTUATION = 28, // Pi 39 B_UNICODE_FINAL_PUNCTUATION = 29, // Pf 40 B_UNICODE_GENERAL_OTHER_TYPES = 30, // Cn 41 42 B_UNICODE_CATEGORY_COUNT 43 }; 44 45 46 /** 47 * This specifies the language directional property of a character set. 48 */ 49 50 enum unicode_char_direction { 51 B_UNICODE_LEFT_TO_RIGHT = 0, 52 B_UNICODE_RIGHT_TO_LEFT = 1, 53 B_UNICODE_EUROPEAN_NUMBER = 2, 54 B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3, 55 B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4, 56 B_UNICODE_ARABIC_NUMBER = 5, 57 B_UNICODE_COMMON_NUMBER_SEPARATOR = 6, 58 B_UNICODE_BLOCK_SEPARATOR = 7, 59 B_UNICODE_SEGMENT_SEPARATOR = 8, 60 B_UNICODE_WHITE_SPACE_NEUTRAL = 9, 61 B_UNICODE_OTHER_NEUTRAL = 10, 62 B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11, 63 B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12, 64 B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13, 65 B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14, 66 B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15, 67 B_UNICODE_POP_DIRECTIONAL_FORMAT = 16, 68 B_UNICODE_DIR_NON_SPACING_MARK = 17, 69 B_UNICODE_BOUNDARY_NEUTRAL = 18, 70 71 B_UNICODE_DIRECTION_COUNT 72 }; 73 74 75 /** 76 * Script range as defined in the Unicode standard. 77 */ 78 79 enum unicode_char_script { 80 // Script names 81 B_UNICODE_BASIC_LATIN, 82 B_UNICODE_LATIN_1_SUPPLEMENT, 83 B_UNICODE_LATIN_EXTENDED_A, 84 B_UNICODE_LATIN_EXTENDED_B, 85 B_UNICODE_IPA_EXTENSIONS, 86 B_UNICODE_SPACING_MODIFIER_LETTERS, 87 B_UNICODE_COMBINING_DIACRITICAL_MARKS, 88 B_UNICODE_GREEK, 89 B_UNICODE_CYRILLIC, 90 B_UNICODE_ARMENIAN, 91 B_UNICODE_HEBREW, 92 B_UNICODE_ARABIC, 93 B_UNICODE_SYRIAC, 94 B_UNICODE_THAANA, 95 B_UNICODE_DEVANAGARI, 96 B_UNICODE_BENGALI, 97 B_UNICODE_GURMUKHI, 98 B_UNICODE_GUJARATI, 99 B_UNICODE_ORIYA, 100 B_UNICODE_TAMIL, 101 B_UNICODE_TELUGU, 102 B_UNICODE_KANNADA, 103 B_UNICODE_MALAYALAM, 104 B_UNICODE_SINHALA, 105 B_UNICODE_THAI, 106 B_UNICODE_LAO, 107 B_UNICODE_TIBETAN, 108 B_UNICODE_MYANMAR, 109 B_UNICODE_GEORGIAN, 110 B_UNICODE_HANGUL_JAMO, 111 B_UNICODE_ETHIOPIC, 112 B_UNICODE_CHEROKEE, 113 B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 114 B_UNICODE_OGHAM, 115 B_UNICODE_RUNIC, 116 B_UNICODE_KHMER, 117 B_UNICODE_MONGOLIAN, 118 B_UNICODE_LATIN_EXTENDED_ADDITIONAL, 119 B_UNICODE_GREEK_EXTENDED, 120 B_UNICODE_GENERAL_PUNCTUATION, 121 B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS, 122 B_UNICODE_CURRENCY_SYMBOLS, 123 B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS, 124 B_UNICODE_LETTERLIKE_SYMBOLS, 125 B_UNICODE_NUMBER_FORMS, 126 B_UNICODE_ARROWS, 127 B_UNICODE_MATHEMATICAL_OPERATORS, 128 B_UNICODE_MISCELLANEOUS_TECHNICAL, 129 B_UNICODE_CONTROL_PICTURES, 130 B_UNICODE_OPTICAL_CHARACTER_RECOGNITION, 131 B_UNICODE_ENCLOSED_ALPHANUMERICS, 132 B_UNICODE_BOX_DRAWING, 133 B_UNICODE_BLOCK_ELEMENTS, 134 B_UNICODE_GEOMETRIC_SHAPES, 135 B_UNICODE_MISCELLANEOUS_SYMBOLS, 136 B_UNICODE_DINGBATS, 137 B_UNICODE_BRAILLE_PATTERNS, 138 B_UNICODE_CJK_RADICALS_SUPPLEMENT, 139 B_UNICODE_KANGXI_RADICALS, 140 B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 141 B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION, 142 B_UNICODE_HIRAGANA, 143 B_UNICODE_KATAKANA, 144 B_UNICODE_BOPOMOFO, 145 B_UNICODE_HANGUL_COMPATIBILITY_JAMO, 146 B_UNICODE_KANBUN, 147 B_UNICODE_BOPOMOFO_EXTENDED, 148 B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS, 149 B_UNICODE_CJK_COMPATIBILITY, 150 B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 151 B_UNICODE_CJK_UNIFIED_IDEOGRAPHS, 152 B_UNICODE_YI_SYLLABLES, 153 B_UNICODE_YI_RADICALS, 154 B_UNICODE_HANGUL_SYLLABLES, 155 B_UNICODE_HIGH_SURROGATES, 156 B_UNICODE_HIGH_PRIVATE_USE_SURROGATES, 157 B_UNICODE_LOW_SURROGATES, 158 B_UNICODE_PRIVATE_USE_AREA, 159 B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS, 160 B_UNICODE_ALPHABETIC_PRESENTATION_FORMS, 161 B_UNICODE_ARABIC_PRESENTATION_FORMS_A, 162 B_UNICODE_COMBINING_HALF_MARKS, 163 B_UNICODE_CJK_COMPATIBILITY_FORMS, 164 B_UNICODE_SMALL_FORM_VARIANTS, 165 B_UNICODE_ARABIC_PRESENTATION_FORMS_B, 166 B_UNICODE_SPECIALS, 167 B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS, 168 169 B_UNICODE_SCRIPT_COUNT, 170 B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT 171 }; 172 173 174 /** 175 * Values returned by the u_getCellWidth() function. 176 */ 177 178 enum unicode_cell_width 179 { 180 B_UNICODE_ZERO_WIDTH = 0, 181 B_UNICODE_HALF_WIDTH = 1, 182 B_UNICODE_FULL_WIDTH = 2, 183 B_UNICODE_NEUTRAL_WIDTH = 3, 184 185 B_UNICODE_CELL_WIDTH_COUNT 186 }; 187 188 189 class BUnicodeChar { 190 public: 191 static bool IsAlpha(uint32 c); 192 static bool IsAlNum(uint32 c); 193 static bool IsDigit(uint32 c); 194 static bool IsHexDigit(uint32 c); 195 static bool IsUpper(uint32 c); 196 static bool IsLower(uint32 c); 197 static bool IsSpace(uint32 c); 198 static bool IsWhitespace(uint32 c); 199 static bool IsControl(uint32 c); 200 static bool IsPunctuation(uint32 c); 201 static bool IsPrintable(uint32 c); 202 static bool IsTitle(uint32 c); 203 static bool IsDefined(uint32 c); 204 static bool IsBase(uint32 c); 205 206 static int8 Type(uint32 c); 207 208 static uint32 ToLower(uint32 c); 209 static uint32 ToUpper(uint32 c); 210 static uint32 ToTitle(uint32 c); 211 static int32 DigitValue(uint32 c); 212 213 static void ToUTF8(uint32 c, char **out); 214 static uint32 FromUTF8(const char **in); 215 static uint32 FromUTF8(const char *in); 216 217 static size_t UTF8StringLength(const char *str); 218 static size_t UTF8StringLength(const char *str, size_t maxLength); 219 220 private: 221 BUnicodeChar(); 222 }; 223 224 225 inline uint32 226 BUnicodeChar::FromUTF8(const char *in) 227 { 228 const char *string = in; 229 return FromUTF8(&string); 230 } 231 232 233 #endif /* _UNICODE_CHAR_H_ */ 234