xref: /haiku/headers/os/locale/UnicodeChar.h (revision c3ac87e8fc6f81023c44faed6cf00c8a916f4e31)
1 #ifndef _UNICODE_CHAR_H_
2 #define _UNICODE_CHAR_H_
3 
4 #include <SupportDefs.h>
5 
6 #include <LocaleBuild.h>
7 
// General category of a code point. The two-letter tag after an enumerator
// is the category abbreviation it stands for; the group headings below are
// derived from the first letter of that tag.
enum unicode_char_category {
	// Placeholder (not a real category) for unassigned and non-character
	// code points.
	B_UNICODE_UNASSIGNED = 0,

	// Letters (L*)
	B_UNICODE_UPPERCASE_LETTER = 1,			// Lu
	B_UNICODE_LOWERCASE_LETTER = 2,			// Ll
	B_UNICODE_TITLECASE_LETTER = 3,			// Lt
	B_UNICODE_MODIFIER_LETTER = 4,			// Lm
	B_UNICODE_OTHER_LETTER = 5,				// Lo

	// Marks (M*)
	B_UNICODE_NON_SPACING_MARK = 6,			// Mn
	B_UNICODE_ENCLOSING_MARK = 7,			// Me
	B_UNICODE_COMBINING_SPACING_MARK = 8,	// Mc

	// Numbers (N*)
	B_UNICODE_DECIMAL_DIGIT_NUMBER = 9,		// Nd
	B_UNICODE_LETTER_NUMBER = 10,			// Nl
	B_UNICODE_OTHER_NUMBER = 11,			// No

	// Separators (Z*)
	B_UNICODE_SPACE_SEPARATOR = 12,			// Zs
	B_UNICODE_LINE_SEPARATOR = 13,			// Zl
	B_UNICODE_PARAGRAPH_SEPARATOR = 14,		// Zp

	// Control, format, private use, surrogate (C*)
	B_UNICODE_CONTROL_CHAR = 15,			// Cc
	B_UNICODE_FORMAT_CHAR = 16,				// Cf
	B_UNICODE_PRIVATE_USE_CHAR = 17,		// Co
	B_UNICODE_SURROGATE = 18,				// Cs

	// Punctuation (P*), first part
	B_UNICODE_DASH_PUNCTUATION = 19,		// Pd
	B_UNICODE_START_PUNCTUATION = 20,		// Ps
	B_UNICODE_END_PUNCTUATION = 21,			// Pe
	B_UNICODE_CONNECTOR_PUNCTUATION = 22,	// Pc
	B_UNICODE_OTHER_PUNCTUATION = 23,		// Po

	// Symbols (S*)
	B_UNICODE_MATH_SYMBOL = 24,				// Sm
	B_UNICODE_CURRENCY_SYMBOL = 25,			// Sc
	B_UNICODE_MODIFIER_SYMBOL = 26,			// Sk
	B_UNICODE_OTHER_SYMBOL = 27,			// So

	// Punctuation (P*), second part: quotation-style punctuation
	B_UNICODE_INITIAL_PUNCTUATION = 28,		// Pi
	B_UNICODE_FINAL_PUNCTUATION = 29,		// Pf

	B_UNICODE_GENERAL_OTHER_TYPES = 30,		// Cn

	// One past the last category value; must stay last.
	B_UNICODE_CATEGORY_COUNT
};
46 
47 
/**
 * Directional property of a single character, as used when laying out
 * text that mixes left-to-right and right-to-left runs.
 *
 * NOTE(review): the names and numbering look like a one-to-one mirror of
 * ICU's UCharDirection -- confirm before relying on that equivalence.
 */
enum unicode_char_direction {
	B_UNICODE_LEFT_TO_RIGHT = 0,
	B_UNICODE_RIGHT_TO_LEFT = 1,
	B_UNICODE_EUROPEAN_NUMBER = 2,
	B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3,
	B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4,
	B_UNICODE_ARABIC_NUMBER = 5,
	B_UNICODE_COMMON_NUMBER_SEPARATOR = 6,
	B_UNICODE_BLOCK_SEPARATOR = 7,
	B_UNICODE_SEGMENT_SEPARATOR = 8,
	B_UNICODE_WHITE_SPACE_NEUTRAL = 9,
	B_UNICODE_OTHER_NEUTRAL = 10,
	B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11,
	B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12,
	B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13,
	B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14,
	B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15,
	B_UNICODE_POP_DIRECTIONAL_FORMAT = 16,
	B_UNICODE_DIR_NON_SPACING_MARK = 17,
	B_UNICODE_BOUNDARY_NEUTRAL = 18,

	// One past the last direction value; must stay last.
	B_UNICODE_DIRECTION_COUNT
};
75 
76 
/**
 * Named character ranges as defined in the Unicode standard.
 *
 * Despite the enum's name, the enumerators are named after ranges such as
 * "Basic Latin", "Arrows" or "General Punctuation" rather than strictly
 * after writing systems. Values are assigned sequentially from 0, so the
 * order of the list is part of the interface and must not change.
 */
enum unicode_char_script {
	B_UNICODE_BASIC_LATIN = 0,
	B_UNICODE_LATIN_1_SUPPLEMENT,
	B_UNICODE_LATIN_EXTENDED_A,
	B_UNICODE_LATIN_EXTENDED_B,
	B_UNICODE_IPA_EXTENSIONS,
	B_UNICODE_SPACING_MODIFIER_LETTERS,
	B_UNICODE_COMBINING_DIACRITICAL_MARKS,
	B_UNICODE_GREEK,
	B_UNICODE_CYRILLIC,
	B_UNICODE_ARMENIAN,
	B_UNICODE_HEBREW,
	B_UNICODE_ARABIC,
	B_UNICODE_SYRIAC,
	B_UNICODE_THAANA,
	B_UNICODE_DEVANAGARI,
	B_UNICODE_BENGALI,
	B_UNICODE_GURMUKHI,
	B_UNICODE_GUJARATI,
	B_UNICODE_ORIYA,
	B_UNICODE_TAMIL,
	B_UNICODE_TELUGU,
	B_UNICODE_KANNADA,
	B_UNICODE_MALAYALAM,
	B_UNICODE_SINHALA,
	B_UNICODE_THAI,
	B_UNICODE_LAO,
	B_UNICODE_TIBETAN,
	B_UNICODE_MYANMAR,
	B_UNICODE_GEORGIAN,
	B_UNICODE_HANGUL_JAMO,
	B_UNICODE_ETHIOPIC,
	B_UNICODE_CHEROKEE,
	B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
	B_UNICODE_OGHAM,
	B_UNICODE_RUNIC,
	B_UNICODE_KHMER,
	B_UNICODE_MONGOLIAN,
	B_UNICODE_LATIN_EXTENDED_ADDITIONAL,
	B_UNICODE_GREEK_EXTENDED,
	B_UNICODE_GENERAL_PUNCTUATION,
	B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS,
	B_UNICODE_CURRENCY_SYMBOLS,
	B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS,
	B_UNICODE_LETTERLIKE_SYMBOLS,
	B_UNICODE_NUMBER_FORMS,
	B_UNICODE_ARROWS,
	B_UNICODE_MATHEMATICAL_OPERATORS,
	B_UNICODE_MISCELLANEOUS_TECHNICAL,
	B_UNICODE_CONTROL_PICTURES,
	B_UNICODE_OPTICAL_CHARACTER_RECOGNITION,
	B_UNICODE_ENCLOSED_ALPHANUMERICS,
	B_UNICODE_BOX_DRAWING,
	B_UNICODE_BLOCK_ELEMENTS,
	B_UNICODE_GEOMETRIC_SHAPES,
	B_UNICODE_MISCELLANEOUS_SYMBOLS,
	B_UNICODE_DINGBATS,
	B_UNICODE_BRAILLE_PATTERNS,
	B_UNICODE_CJK_RADICALS_SUPPLEMENT,
	B_UNICODE_KANGXI_RADICALS,
	B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
	B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION,
	B_UNICODE_HIRAGANA,
	B_UNICODE_KATAKANA,
	B_UNICODE_BOPOMOFO,
	B_UNICODE_HANGUL_COMPATIBILITY_JAMO,
	B_UNICODE_KANBUN,
	B_UNICODE_BOPOMOFO_EXTENDED,
	B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS,
	B_UNICODE_CJK_COMPATIBILITY,
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS,
	B_UNICODE_YI_SYLLABLES,
	B_UNICODE_YI_RADICALS,
	B_UNICODE_HANGUL_SYLLABLES,
	B_UNICODE_HIGH_SURROGATES,
	B_UNICODE_HIGH_PRIVATE_USE_SURROGATES,
	B_UNICODE_LOW_SURROGATES,
	B_UNICODE_PRIVATE_USE_AREA,
	B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS,
	B_UNICODE_ALPHABETIC_PRESENTATION_FORMS,
	B_UNICODE_ARABIC_PRESENTATION_FORMS_A,
	B_UNICODE_COMBINING_HALF_MARKS,
	B_UNICODE_CJK_COMPATIBILITY_FORMS,
	B_UNICODE_SMALL_FORM_VARIANTS,
	B_UNICODE_ARABIC_PRESENTATION_FORMS_B,
	B_UNICODE_SPECIALS,
	B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS,

	// Number of named ranges above; must directly follow the last one.
	B_UNICODE_SCRIPT_COUNT,

	// "No range" marker; deliberately shares its value with the count.
	B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT
};
174 
175 
/**
 * Display cell width classes for a character.
 *
 * NOTE(review): the previous comment described these as the values
 * returned by u_getCellWidth(), a function that is not declared in this
 * header (presumably inherited from ICU) -- confirm which API, if any,
 * produces them.
 */
enum unicode_cell_width {
	B_UNICODE_ZERO_WIDTH = 0,
	B_UNICODE_HALF_WIDTH = 1,
	B_UNICODE_FULL_WIDTH = 2,
	B_UNICODE_NEUTRAL_WIDTH = 3,

	// One past the last width value; must stay last.
	B_UNICODE_CELL_WIDTH_COUNT
};
189 
190 
191 class _IMPEXP_LOCALE BUnicodeChar {
192 	public:
193 		static bool IsAlpha(uint32 c);
194 		static bool IsAlNum(uint32 c);
195 		static bool IsDigit(uint32 c);
196 		static bool IsHexDigit(uint32 c);
197 		static bool IsUpper(uint32 c);
198 		static bool IsLower(uint32 c);
199 		static bool IsSpace(uint32 c);
200 		static bool IsWhitespace(uint32 c);
201 		static bool IsControl(uint32 c);
202 		static bool IsPunctuation(uint32 c);
203 		static bool IsPrintable(uint32 c);
204 		static bool IsTitle(uint32 c);
205 		static bool IsDefined(uint32 c);
206 		static bool IsBase(uint32 c);
207 
208 		static int8 Type(uint32 c);
209 
210 		static uint32 ToLower(uint32 c);
211 		static uint32 ToUpper(uint32 c);
212 		static uint32 ToTitle(uint32 c);
213 		static int32 DigitValue(uint32 c);
214 
215 		static void ToUTF8(uint32 c, char **out);
216 		static uint32 FromUTF8(const char **in);
217 		static uint32 FromUTF8(const char *in);
218 
219 		static size_t UTF8StringLength(const char *str);
220 		static size_t UTF8StringLength(const char *str, size_t maxLength);
221 
222 	private:
223 		BUnicodeChar();
224 };
225 
226 
227 inline uint32
228 BUnicodeChar::FromUTF8(const char *in)
229 {
230 	const char *string = in;
231 	return FromUTF8(&string);
232 }
233 
234 
235 #endif	/* _UNICODE_CHAR_H_ */
236