1 #ifndef _UNICODE_CHAR_H_ 2 #define _UNICODE_CHAR_H_ 3 4 #include <SupportDefs.h> 5 6 enum unicode_char_category 7 { 8 // Non-category for unassigned and non-character code points. 9 B_UNICODE_UNASSIGNED = 0, 10 11 B_UNICODE_GENERAL_OTHER_TYPES = 0, // Cn 12 B_UNICODE_UPPERCASE_LETTER = 1, // Lu 13 B_UNICODE_LOWERCASE_LETTER = 2, // Ll 14 B_UNICODE_TITLECASE_LETTER = 3, // Lt 15 B_UNICODE_MODIFIER_LETTER = 4, // Lm 16 B_UNICODE_OTHER_LETTER = 5, // Lo 17 B_UNICODE_NON_SPACING_MARK = 6, // Mn 18 B_UNICODE_ENCLOSING_MARK = 7, // Me 19 B_UNICODE_COMBINING_SPACING_MARK = 8, // Mc 20 B_UNICODE_DECIMAL_DIGIT_NUMBER = 9, // Nd 21 B_UNICODE_LETTER_NUMBER = 10, // Nl 22 B_UNICODE_OTHER_NUMBER = 11, // No 23 B_UNICODE_SPACE_SEPARATOR = 12, // Zs 24 B_UNICODE_LINE_SEPARATOR = 13, // Zl 25 B_UNICODE_PARAGRAPH_SEPARATOR = 14, // Zp 26 B_UNICODE_CONTROL_CHAR = 15, // Cc 27 B_UNICODE_FORMAT_CHAR = 16, // Cf 28 B_UNICODE_PRIVATE_USE_CHAR = 17, // Co 29 B_UNICODE_SURROGATE = 18, // Cs 30 B_UNICODE_DASH_PUNCTUATION = 19, // Pd 31 B_UNICODE_START_PUNCTUATION = 20, // Ps 32 B_UNICODE_END_PUNCTUATION = 21, // Pe 33 B_UNICODE_CONNECTOR_PUNCTUATION = 22, // Pc 34 B_UNICODE_OTHER_PUNCTUATION = 23, // Po 35 B_UNICODE_MATH_SYMBOL = 24, // Sm 36 B_UNICODE_CURRENCY_SYMBOL = 25, // Sc 37 B_UNICODE_MODIFIER_SYMBOL = 26, // Sk 38 B_UNICODE_OTHER_SYMBOL = 27, // So 39 B_UNICODE_INITIAL_PUNCTUATION = 28, // Pi 40 B_UNICODE_FINAL_PUNCTUATION = 29, // Pf 41 42 B_UNICODE_CATEGORY_COUNT 43 }; 44 45 46 // This specifies the language directional property of a character set. 
enum unicode_char_direction {
	B_UNICODE_LEFT_TO_RIGHT = 0,
	B_UNICODE_RIGHT_TO_LEFT = 1,
	B_UNICODE_EUROPEAN_NUMBER = 2,
	B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3,
	B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4,
	B_UNICODE_ARABIC_NUMBER = 5,
	B_UNICODE_COMMON_NUMBER_SEPARATOR = 6,
	B_UNICODE_BLOCK_SEPARATOR = 7,
	B_UNICODE_SEGMENT_SEPARATOR = 8,
	B_UNICODE_WHITE_SPACE_NEUTRAL = 9,
	B_UNICODE_OTHER_NEUTRAL = 10,
	B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11,
	B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12,
	B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13,
	B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14,
	B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15,
	B_UNICODE_POP_DIRECTIONAL_FORMAT = 16,
	B_UNICODE_DIR_NON_SPACING_MARK = 17,
	B_UNICODE_BOUNDARY_NEUTRAL = 18,

	// Implicitly one past the last direction (19); must stay the final entry.
	B_UNICODE_DIRECTION_COUNT
};


// Character ranges ("blocks") as defined in the Unicode standard. Although
// the enum is named "script", its entries are block identifiers. The
// bracketed hexadecimal number after an entry is the first code point of
// that block.
enum unicode_char_script {
	// New No_Block value in Unicode 4.
	B_UNICODE_NO_BLOCK = 0,	// [none] Special range

	B_UNICODE_BASIC_LATIN = 1,	// [0000]
	B_UNICODE_LATIN_1_SUPPLEMENT = 2,	// [0080]
	B_UNICODE_LATIN_EXTENDED_A = 3,	// [0100]
	B_UNICODE_LATIN_EXTENDED_B = 4,	// [0180]
	B_UNICODE_IPA_EXTENSIONS = 5,	// [0250]
	B_UNICODE_SPACING_MODIFIER_LETTERS = 6,	// [02B0]
	B_UNICODE_COMBINING_DIACRITICAL_MARKS = 7,	// [0300]
	B_UNICODE_GREEK = 8,	// [0370]
	B_UNICODE_CYRILLIC = 9,	// [0400]
	B_UNICODE_ARMENIAN = 10,	// [0530]
	B_UNICODE_HEBREW = 11,	// [0590]
	B_UNICODE_ARABIC = 12,	// [0600]
	B_UNICODE_SYRIAC = 13,	// [0700]
	B_UNICODE_THAANA = 14,	// [0780]
	B_UNICODE_DEVANAGARI = 15,	// [0900]
	B_UNICODE_BENGALI = 16,	// [0980]
	B_UNICODE_GURMUKHI = 17,	// [0A00]
	B_UNICODE_GUJARATI = 18,	// [0A80]
	B_UNICODE_ORIYA = 19,	// [0B00]
	B_UNICODE_TAMIL = 20,	// [0B80]
	B_UNICODE_TELUGU = 21,	// [0C00]
	B_UNICODE_KANNADA = 22,	// [0C80]
	B_UNICODE_MALAYALAM = 23,	// [0D00]
	B_UNICODE_SINHALA = 24,	// [0D80]
	B_UNICODE_THAI = 25,	// [0E00]
	B_UNICODE_LAO = 26,	// [0E80]
	B_UNICODE_TIBETAN = 27,	// [0F00]
	B_UNICODE_MYANMAR = 28,	// [1000]
	B_UNICODE_GEORGIAN = 29,	// [10A0]
	B_UNICODE_HANGUL_JAMO = 30,	// [1100]
	B_UNICODE_ETHIOPIC = 31,	// [1200]
	B_UNICODE_CHEROKEE = 32,	// [13A0]
	B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,	// [1400]
	B_UNICODE_OGHAM = 34,	// [1680]
	B_UNICODE_RUNIC = 35,	// [16A0]
	B_UNICODE_KHMER = 36,	// [1780]
	B_UNICODE_MONGOLIAN = 37,	// [1800]
	B_UNICODE_LATIN_EXTENDED_ADDITIONAL = 38,	// [1E00]
	B_UNICODE_GREEK_EXTENDED = 39,	// [1F00]
	B_UNICODE_GENERAL_PUNCTUATION = 40,	// [2000]
	B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS = 41,	// [2070]
	B_UNICODE_CURRENCY_SYMBOLS = 42,	// [20A0]
	B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS = 43,	// [20D0]
	B_UNICODE_LETTERLIKE_SYMBOLS = 44,	// [2100]
	B_UNICODE_NUMBER_FORMS = 45,	// [2150]
	B_UNICODE_ARROWS = 46,	// [2190]
	B_UNICODE_MATHEMATICAL_OPERATORS = 47,	// [2200]
	B_UNICODE_MISCELLANEOUS_TECHNICAL = 48,	// [2300]
	B_UNICODE_CONTROL_PICTURES = 49,	// [2400]
	B_UNICODE_OPTICAL_CHARACTER_RECOGNITION = 50,	// [2440]
	B_UNICODE_ENCLOSED_ALPHANUMERICS = 51,	// [2460]
	B_UNICODE_BOX_DRAWING = 52,	// [2500]
	B_UNICODE_BLOCK_ELEMENTS = 53,	// [2580]
	B_UNICODE_GEOMETRIC_SHAPES = 54,	// [25A0]
	B_UNICODE_MISCELLANEOUS_SYMBOLS = 55,	// [2600]
	B_UNICODE_DINGBATS = 56,	// [2700]
	B_UNICODE_BRAILLE_PATTERNS = 57,	// [2800]
	B_UNICODE_CJK_RADICALS_SUPPLEMENT = 58,	// [2E80]
	B_UNICODE_KANGXI_RADICALS = 59,	// [2F00]
	B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,	// [2FF0]
	B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION = 61,	// [3000]
	B_UNICODE_HIRAGANA = 62,	// [3040]
	B_UNICODE_KATAKANA = 63,	// [30A0]
	B_UNICODE_BOPOMOFO = 64,	// [3100]
	B_UNICODE_HANGUL_COMPATIBILITY_JAMO = 65,	// [3130]
	B_UNICODE_KANBUN = 66,	// [3190]
	B_UNICODE_BOPOMOFO_EXTENDED = 67,	// [31A0]
	B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,	// [3200]
	B_UNICODE_CJK_COMPATIBILITY = 69,	// [3300]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,	// [3400]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS = 71,	// [4E00]
	B_UNICODE_YI_SYLLABLES = 72,	// [A000]
	B_UNICODE_YI_RADICALS = 73,	// [A490]
	B_UNICODE_HANGUL_SYLLABLES = 74,	// [AC00]
	B_UNICODE_HIGH_SURROGATES = 75,	// [D800]
	B_UNICODE_HIGH_PRIVATE_USE_SURROGATES = 76,	// [DB80]
	B_UNICODE_LOW_SURROGATES = 77,	// [DC00]
	// Two names for the same block value.
	B_UNICODE_PRIVATE_USE = 78,
	B_UNICODE_PRIVATE_USE_AREA = B_UNICODE_PRIVATE_USE,	// [E000]
	B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS = 79,	// [F900]
	B_UNICODE_ALPHABETIC_PRESENTATION_FORMS = 80,	// [FB00]
	B_UNICODE_ARABIC_PRESENTATION_FORMS_A = 81,	// [FB50]
	B_UNICODE_COMBINING_HALF_MARKS = 82,	// [FE20]
	B_UNICODE_CJK_COMPATIBILITY_FORMS = 83,	// [FE30]
	B_UNICODE_SMALL_FORM_VARIANTS = 84,	// [FE50]
	B_UNICODE_ARABIC_PRESENTATION_FORMS_B = 85,	// [FE70]
	// Note: the next two entries are numbered out of code point order.
	B_UNICODE_SPECIALS = 86,	// [FFF0]
	B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS = 87,	// [FF00]

	// New blocks in Unicode 3.1
	B_UNICODE_OLD_ITALIC = 88,	// [10300]
	B_UNICODE_GOTHIC = 89,	// [10330]
	B_UNICODE_DESERET = 90,	// [10400]
	B_UNICODE_BYZANTINE_MUSICAL_SYMBOLS = 91,	// [1D000]
	B_UNICODE_MUSICAL_SYMBOLS = 92,	// [1D100]
	B_UNICODE_MATHEMATICAL_ALPHANUMERIC_SYMBOLS = 93,	// [1D400]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B = 94,	// [20000]
	B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 95,	// [2F800]
	B_UNICODE_TAGS = 96,	// [E0000]

	// New blocks in Unicode 3.2
	// Two names for the same block value.
	B_UNICODE_CYRILLIC_SUPPLEMENTARY = 97,
	B_UNICODE_CYRILLIC_SUPPLEMENT = B_UNICODE_CYRILLIC_SUPPLEMENTARY,	// [0500]
	B_UNICODE_TAGALOG = 98,	// [1700]
	B_UNICODE_HANUNOO = 99,	// [1720]
	B_UNICODE_BUHID = 100,	// [1740]
	B_UNICODE_TAGBANWA = 101,	// [1760]
	B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A = 102,	// [27C0]
	B_UNICODE_SUPPLEMENTAL_ARROWS_A = 103,	// [27F0]
	B_UNICODE_SUPPLEMENTAL_ARROWS_B = 104,	// [2900]
	B_UNICODE_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B = 105,	// [2980]
	B_UNICODE_SUPPLEMENTAL_MATHEMATICAL_OPERATORS = 106,	// [2A00]
	B_UNICODE_KATAKANA_PHONETIC_EXTENSIONS = 107,	// [31F0]
	B_UNICODE_VARIATION_SELECTORS = 108,	// [FE00]
	B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A = 109,	// [F0000]
	B_UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B = 110,	// [100000]

	// New blocks in Unicode 4
	B_UNICODE_LIMBU = 111,	// [1900]
	B_UNICODE_TAI_LE = 112,	// [1950]
	B_UNICODE_KHMER_SYMBOLS = 113,	// [19E0]
	B_UNICODE_PHONETIC_EXTENSIONS = 114,	// [1D00]
	B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_ARROWS = 115,	// [2B00]
	B_UNICODE_YIJING_HEXAGRAM_SYMBOLS = 116,	// [4DC0]
	B_UNICODE_LINEAR_B_SYLLABARY = 117,	// [10000]
	B_UNICODE_LINEAR_B_IDEOGRAMS = 118,	// [10080]
	B_UNICODE_AEGEAN_NUMBERS = 119,	// [10100]
	B_UNICODE_UGARITIC = 120,	// [10380]
	B_UNICODE_SHAVIAN = 121,	// [10450]
	B_UNICODE_OSMANYA = 122,	// [10480]
	B_UNICODE_CYPRIOT_SYLLABARY = 123,	// [10800]
	B_UNICODE_TAI_XUAN_JING_SYMBOLS = 124,	// [1D300]
	B_UNICODE_VARIATION_SELECTORS_SUPPLEMENT = 125,	// [E0100]

	// New blocks in Unicode 4.1
	B_UNICODE_ANCIENT_GREEK_MUSICAL_NOTATION = 126,	// [1D200]
	B_UNICODE_ANCIENT_GREEK_NUMBERS = 127,	// [10140]
	B_UNICODE_ARABIC_SUPPLEMENT = 128,	// [0750]
	B_UNICODE_BUGINESE = 129,	// [1A00]
	B_UNICODE_CJK_STROKES = 130,	// [31C0]
	B_UNICODE_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = 131,	// [1DC0]
	B_UNICODE_COPTIC = 132,	// [2C80]
	B_UNICODE_ETHIOPIC_EXTENDED = 133,	// [2D80]
	B_UNICODE_ETHIOPIC_SUPPLEMENT = 134,	// [1380]
	B_UNICODE_GEORGIAN_SUPPLEMENT = 135,	// [2D00]
	B_UNICODE_GLAGOLITIC = 136,	// [2C00]
	B_UNICODE_KHAROSHTHI = 137,	// [10A00]
	B_UNICODE_MODIFIER_TONE_LETTERS = 138,	// [A700]
	B_UNICODE_NEW_TAI_LUE = 139,	// [1980]
	B_UNICODE_OLD_PERSIAN = 140,	// [103A0]
	B_UNICODE_PHONETIC_EXTENSIONS_SUPPLEMENT = 141,	// [1D80]
	B_UNICODE_SUPPLEMENTAL_PUNCTUATION = 142,	// [2E00]
	B_UNICODE_SYLOTI_NAGRI = 143,	// [A800]
	B_UNICODE_TIFINAGH = 144,	// [2D30]
	B_UNICODE_VERTICAL_FORMS = 145,	// [FE10]

	// New blocks in Unicode 5.0
	B_UNICODE_NKO = 146,	// [07C0]
	B_UNICODE_BALINESE = 147,	// [1B00]
	B_UNICODE_LATIN_EXTENDED_C = 148,	// [2C60]
	B_UNICODE_LATIN_EXTENDED_D = 149,	// [A720]
	B_UNICODE_PHAGS_PA = 150,	// [A840]
	B_UNICODE_PHOENICIAN = 151,	// [10900]
	B_UNICODE_CUNEIFORM = 152,	// [12000]
	B_UNICODE_CUNEIFORM_NUMBERS_AND_PUNCTUATION = 153,	// [12400]
	B_UNICODE_COUNTING_ROD_NUMERALS = 154,	// [1D360]

	// New blocks in Unicode 5.1
	B_UNICODE_SUNDANESE = 155,	// [1B80]
	B_UNICODE_LEPCHA = 156,	// [1C00]
	B_UNICODE_OL_CHIKI = 157,	// [1C50]
	B_UNICODE_CYRILLIC_EXTENDED_A = 158,	// [2DE0]
	B_UNICODE_VAI = 159,	// [A500]
	B_UNICODE_CYRILLIC_EXTENDED_B = 160,	// [A640]
	B_UNICODE_SAURASHTRA = 161,	// [A880]
	B_UNICODE_KAYAH_LI = 162,	// [A900]
	B_UNICODE_REJANG = 163,	// [A930]
	B_UNICODE_CHAM = 164,	// [AA00]
	B_UNICODE_ANCIENT_SYMBOLS = 165,	// [10190]
	B_UNICODE_PHAISTOS_DISC = 166,	// [101D0]
	B_UNICODE_LYCIAN = 167,	// [10280]
	B_UNICODE_CARIAN = 168,	// [102A0]
	B_UNICODE_LYDIAN = 169,	// [10920]
	B_UNICODE_MAHJONG_TILES = 170,	// [1F000]
	B_UNICODE_DOMINO_TILES = 171,	// [1F030]

	// New blocks in Unicode 5.2
	B_UNICODE_SAMARITAN = 172,	// [0800]
	B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED = 173,	// [18B0]
	B_UNICODE_TAI_THAM = 174,	// [1A20]
	B_UNICODE_VEDIC_EXTENSIONS = 175,	// [1CD0]
	B_UNICODE_LISU = 176,	// [A4D0]
	B_UNICODE_BAMUM = 177,	// [A6A0]
	B_UNICODE_COMMON_INDIC_NUMBER_FORMS = 178,	// [A830]
	B_UNICODE_DEVANAGARI_EXTENDED = 179,	// [A8E0]
	B_UNICODE_HANGUL_JAMO_EXTENDED_A = 180,	// [A960]
	B_UNICODE_JAVANESE = 181,	// [A980]
	B_UNICODE_MYANMAR_EXTENDED_A = 182,	// [AA60]
	B_UNICODE_TAI_VIET = 183,	// [AA80]
	B_UNICODE_MEETEI_MAYEK = 184,	// [ABC0]
	B_UNICODE_HANGUL_JAMO_EXTENDED_B = 185,	// [D7B0]
	B_UNICODE_IMPERIAL_ARAMAIC = 186,	// [10840]
	B_UNICODE_OLD_SOUTH_ARABIAN = 187,	// [10A60]
	B_UNICODE_AVESTAN = 188,	// [10B00]
	B_UNICODE_INSCRIPTIONAL_PARTHIAN = 189,	// [10B40]
	B_UNICODE_INSCRIPTIONAL_PAHLAVI = 190,	// [10B60]
	B_UNICODE_OLD_TURKIC = 191,	// [10C00]
	B_UNICODE_RUMI_NUMERAL_SYMBOLS = 192,	// [10E60]
	B_UNICODE_KAITHI = 193,	// [11080]
	B_UNICODE_EGYPTIAN_HIEROGLYPHS = 194,	// [13000]
	B_UNICODE_ENCLOSED_ALPHANUMERIC_SUPPLEMENT = 195,	// [1F100]
	B_UNICODE_ENCLOSED_IDEOGRAPHIC_SUPPLEMENT = 196,	// [1F200]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C = 197,	// [2A700]

	// New blocks in Unicode 6.0
	B_UNICODE_MANDAIC = 198,	// [0840]
	B_UNICODE_BATAK = 199,	// [1BC0]
	B_UNICODE_ETHIOPIC_EXTENDED_A = 200,	// [AB00]
	B_UNICODE_BRAHMI = 201,	// [11000]
	B_UNICODE_BAMUM_SUPPLEMENT = 202,	// [16800]
	B_UNICODE_KANA_SUPPLEMENT = 203,	// [1B000]
	B_UNICODE_PLAYING_CARDS = 204,	// [1F0A0]
	B_UNICODE_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS = 205,	// [1F300]
	B_UNICODE_EMOTICONS = 206,	// [1F600]
	B_UNICODE_TRANSPORT_AND_MAP_SYMBOLS = 207,	// [1F680]
	B_UNICODE_ALCHEMICAL_SYMBOLS = 208,	// [1F700]
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D = 209,	// [2B740]

	// One past the highest block value above; B_UNICODE_NO_SCRIPT is an
	// alias for the same number.
	B_UNICODE_SCRIPT_COUNT = 210,
	B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT,

	// Negative value, outside the range of all block values above.
	B_UNICODE_INVALID_CODE = -1
};


// East Asian Width constants.
314 315 enum unicode_east_asian_width 316 { 317 B_UNICODE_EA_NEUTRAL, // [N] 318 B_UNICODE_EA_AMBIGUOUS, // [A] 319 B_UNICODE_EA_HALFWIDTH, // [H] 320 B_UNICODE_EA_FULLWIDTH, // [F] 321 B_UNICODE_EA_NARROW, // [Na] 322 B_UNICODE_EA_WIDE, // [W] 323 B_UNICODE_EA_COUNT 324 }; 325 326 327 class BUnicodeChar { 328 public: 329 static bool IsAlpha(uint32 c); 330 static bool IsAlNum(uint32 c); 331 static bool IsDigit(uint32 c); 332 static bool IsHexDigit(uint32 c); 333 static bool IsUpper(uint32 c); 334 static bool IsLower(uint32 c); 335 static bool IsSpace(uint32 c); 336 static bool IsWhitespace(uint32 c); 337 static bool IsControl(uint32 c); 338 static bool IsPunctuation(uint32 c); 339 static bool IsPrintable(uint32 c); 340 static bool IsTitle(uint32 c); 341 static bool IsDefined(uint32 c); 342 static bool IsBase(uint32 c); 343 344 static int8 Type(uint32 c); 345 346 static uint32 ToLower(uint32 c); 347 static uint32 ToUpper(uint32 c); 348 static uint32 ToTitle(uint32 c); 349 static int32 DigitValue(uint32 c); 350 static unicode_east_asian_width EastAsianWidth(uint32 c); 351 352 static void ToUTF8(uint32 c, char** out); 353 static uint32 FromUTF8(const char** in); 354 static uint32 FromUTF8(const char* in); 355 356 static size_t UTF8StringLength(const char* string); 357 static size_t UTF8StringLength(const char* string, size_t maxLength); 358 359 private: 360 BUnicodeChar(); 361 }; 362 363 364 inline uint32 365 BUnicodeChar::FromUTF8(const char* in) 366 { 367 const char* string = in; 368 return FromUTF8(&string); 369 } 370 371 372 #endif // _UNICODE_CHAR_H_ 373