xref: /haiku/headers/os/locale/UnicodeChar.h (revision 893988af824e65e49e55f517b157db8386e8002b)
1 #ifndef _UNICODE_CHAR_H_
2 #define _UNICODE_CHAR_H_
3 
4 #include <SupportDefs.h>
5 
/**
 * General category of a code point. The trailing comment on each constant
 * is the two-letter category abbreviation it stands for. The numeric values
 * are spelled out explicitly and must stay as they are.
 */
enum unicode_char_category {
	// Non-category for unassigned and non-character code points.
	B_UNICODE_UNASSIGNED = 0,

	// Letters
	B_UNICODE_UPPERCASE_LETTER = 1,			// Lu
	B_UNICODE_LOWERCASE_LETTER = 2,			// Ll
	B_UNICODE_TITLECASE_LETTER = 3,			// Lt
	B_UNICODE_MODIFIER_LETTER = 4,			// Lm
	B_UNICODE_OTHER_LETTER = 5,				// Lo

	// Marks
	B_UNICODE_NON_SPACING_MARK = 6,			// Mn
	B_UNICODE_ENCLOSING_MARK = 7,			// Me
	B_UNICODE_COMBINING_SPACING_MARK = 8,	// Mc

	// Numbers
	B_UNICODE_DECIMAL_DIGIT_NUMBER = 9,		// Nd
	B_UNICODE_LETTER_NUMBER = 10,			// Nl
	B_UNICODE_OTHER_NUMBER = 11,			// No

	// Separators
	B_UNICODE_SPACE_SEPARATOR = 12,			// Zs
	B_UNICODE_LINE_SEPARATOR = 13,			// Zl
	B_UNICODE_PARAGRAPH_SEPARATOR = 14,		// Zp

	// Control, format, private use and surrogate code points
	B_UNICODE_CONTROL_CHAR = 15,			// Cc
	B_UNICODE_FORMAT_CHAR = 16,				// Cf
	B_UNICODE_PRIVATE_USE_CHAR = 17,		// Co
	B_UNICODE_SURROGATE = 18,				// Cs

	// Punctuation
	B_UNICODE_DASH_PUNCTUATION = 19,		// Pd
	B_UNICODE_START_PUNCTUATION = 20,		// Ps
	B_UNICODE_END_PUNCTUATION = 21,			// Pe
	B_UNICODE_CONNECTOR_PUNCTUATION = 22,	// Pc
	B_UNICODE_OTHER_PUNCTUATION = 23,		// Po

	// Symbols
	B_UNICODE_MATH_SYMBOL = 24,				// Sm
	B_UNICODE_CURRENCY_SYMBOL = 25,			// Sc
	B_UNICODE_MODIFIER_SYMBOL = 26,			// Sk
	B_UNICODE_OTHER_SYMBOL = 27,			// So

	// Quotation-style punctuation, numbered after the symbols
	B_UNICODE_INITIAL_PUNCTUATION = 28,		// Pi
	B_UNICODE_FINAL_PUNCTUATION = 29,		// Pf

	B_UNICODE_GENERAL_OTHER_TYPES = 30,		// Cn

	// Not a category: one more than the highest value above (31).
	B_UNICODE_CATEGORY_COUNT
};
44 
45 
/**
 * Directional property values of a character. The numeric values are
 * spelled out explicitly and must stay as they are.
 */

enum unicode_char_direction
{
	B_UNICODE_LEFT_TO_RIGHT = 0,
	B_UNICODE_RIGHT_TO_LEFT = 1,

	// Numbers and the characters that separate or terminate them
	B_UNICODE_EUROPEAN_NUMBER = 2,
	B_UNICODE_EUROPEAN_NUMBER_SEPARATOR = 3,
	B_UNICODE_EUROPEAN_NUMBER_TERMINATOR = 4,
	B_UNICODE_ARABIC_NUMBER = 5,
	B_UNICODE_COMMON_NUMBER_SEPARATOR = 6,

	// Separators and neutrals
	B_UNICODE_BLOCK_SEPARATOR = 7,
	B_UNICODE_SEGMENT_SEPARATOR = 8,
	B_UNICODE_WHITE_SPACE_NEUTRAL = 9,
	B_UNICODE_OTHER_NEUTRAL = 10,

	// Embeddings, overrides and the remaining values
	B_UNICODE_LEFT_TO_RIGHT_EMBEDDING = 11,
	B_UNICODE_LEFT_TO_RIGHT_OVERRIDE = 12,
	B_UNICODE_RIGHT_TO_LEFT_ARABIC = 13,
	B_UNICODE_RIGHT_TO_LEFT_EMBEDDING = 14,
	B_UNICODE_RIGHT_TO_LEFT_OVERRIDE = 15,
	B_UNICODE_POP_DIRECTIONAL_FORMAT = 16,
	B_UNICODE_DIR_NON_SPACING_MARK = 17,
	B_UNICODE_BOUNDARY_NEUTRAL = 18,

	// Not a direction: one more than the highest value above (19).
	B_UNICODE_DIRECTION_COUNT
};
73 
74 
/**
 * Script range as defined in the Unicode standard.
 *
 * Only the first enumerator carries an explicit value; every following one
 * is the previous value plus one, so the order of the list defines the
 * numbering and must not be changed. The numbers in the trailing comments
 * are the resulting values, given every tenth entry as a reading aid.
 */

enum unicode_char_script
{
	// Script names
	B_UNICODE_BASIC_LATIN = 0,						// 0
	B_UNICODE_LATIN_1_SUPPLEMENT,
	B_UNICODE_LATIN_EXTENDED_A,
	B_UNICODE_LATIN_EXTENDED_B,
	B_UNICODE_IPA_EXTENSIONS,
	B_UNICODE_SPACING_MODIFIER_LETTERS,
	B_UNICODE_COMBINING_DIACRITICAL_MARKS,
	B_UNICODE_GREEK,
	B_UNICODE_CYRILLIC,
	B_UNICODE_ARMENIAN,
	B_UNICODE_HEBREW,								// 10
	B_UNICODE_ARABIC,
	B_UNICODE_SYRIAC,
	B_UNICODE_THAANA,
	B_UNICODE_DEVANAGARI,
	B_UNICODE_BENGALI,
	B_UNICODE_GURMUKHI,
	B_UNICODE_GUJARATI,
	B_UNICODE_ORIYA,
	B_UNICODE_TAMIL,
	B_UNICODE_TELUGU,								// 20
	B_UNICODE_KANNADA,
	B_UNICODE_MALAYALAM,
	B_UNICODE_SINHALA,
	B_UNICODE_THAI,
	B_UNICODE_LAO,
	B_UNICODE_TIBETAN,
	B_UNICODE_MYANMAR,
	B_UNICODE_GEORGIAN,
	B_UNICODE_HANGUL_JAMO,
	B_UNICODE_ETHIOPIC,								// 30
	B_UNICODE_CHEROKEE,
	B_UNICODE_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
	B_UNICODE_OGHAM,
	B_UNICODE_RUNIC,
	B_UNICODE_KHMER,
	B_UNICODE_MONGOLIAN,
	B_UNICODE_LATIN_EXTENDED_ADDITIONAL,
	B_UNICODE_GREEK_EXTENDED,
	B_UNICODE_GENERAL_PUNCTUATION,
	B_UNICODE_SUPERSCRIPTS_AND_SUBSCRIPTS,			// 40
	B_UNICODE_CURRENCY_SYMBOLS,
	B_UNICODE_COMBINING_MARKS_FOR_SYMBOLS,
	B_UNICODE_LETTERLIKE_SYMBOLS,
	B_UNICODE_NUMBER_FORMS,
	B_UNICODE_ARROWS,
	B_UNICODE_MATHEMATICAL_OPERATORS,
	B_UNICODE_MISCELLANEOUS_TECHNICAL,
	B_UNICODE_CONTROL_PICTURES,
	B_UNICODE_OPTICAL_CHARACTER_RECOGNITION,
	B_UNICODE_ENCLOSED_ALPHANUMERICS,				// 50
	B_UNICODE_BOX_DRAWING,
	B_UNICODE_BLOCK_ELEMENTS,
	B_UNICODE_GEOMETRIC_SHAPES,
	B_UNICODE_MISCELLANEOUS_SYMBOLS,
	B_UNICODE_DINGBATS,
	B_UNICODE_BRAILLE_PATTERNS,
	B_UNICODE_CJK_RADICALS_SUPPLEMENT,
	B_UNICODE_KANGXI_RADICALS,
	B_UNICODE_IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
	B_UNICODE_CJK_SYMBOLS_AND_PUNCTUATION,			// 60
	B_UNICODE_HIRAGANA,
	B_UNICODE_KATAKANA,
	B_UNICODE_BOPOMOFO,
	B_UNICODE_HANGUL_COMPATIBILITY_JAMO,
	B_UNICODE_KANBUN,
	B_UNICODE_BOPOMOFO_EXTENDED,
	B_UNICODE_ENCLOSED_CJK_LETTERS_AND_MONTHS,
	B_UNICODE_CJK_COMPATIBILITY,
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
	B_UNICODE_CJK_UNIFIED_IDEOGRAPHS,				// 70
	B_UNICODE_YI_SYLLABLES,
	B_UNICODE_YI_RADICALS,
	B_UNICODE_HANGUL_SYLLABLES,
	B_UNICODE_HIGH_SURROGATES,
	B_UNICODE_HIGH_PRIVATE_USE_SURROGATES,
	B_UNICODE_LOW_SURROGATES,
	B_UNICODE_PRIVATE_USE_AREA,
	B_UNICODE_CJK_COMPATIBILITY_IDEOGRAPHS,
	B_UNICODE_ALPHABETIC_PRESENTATION_FORMS,
	B_UNICODE_ARABIC_PRESENTATION_FORMS_A,			// 80
	B_UNICODE_COMBINING_HALF_MARKS,
	B_UNICODE_CJK_COMPATIBILITY_FORMS,
	B_UNICODE_SMALL_FORM_VARIANTS,
	B_UNICODE_ARABIC_PRESENTATION_FORMS_B,
	B_UNICODE_SPECIALS,
	B_UNICODE_HALFWIDTH_AND_FULLWIDTH_FORMS,		// 86

	// Number of entries above (87). B_UNICODE_NO_SCRIPT is an alias for the
	// same value.
	B_UNICODE_SCRIPT_COUNT,
	B_UNICODE_NO_SCRIPT = B_UNICODE_SCRIPT_COUNT
};
172 
173 
/**
 * Values returned by the u_getCellWidth() function.
 */

enum unicode_cell_width {
	B_UNICODE_ZERO_WIDTH = 0,
	B_UNICODE_HALF_WIDTH = 1,
	B_UNICODE_FULL_WIDTH = 2,
	B_UNICODE_NEUTRAL_WIDTH = 3,

	// Not a width: one more than the highest value above (4).
	B_UNICODE_CELL_WIDTH_COUNT
};
187 
188 
189 class BUnicodeChar {
190 	public:
191 		static bool IsAlpha(uint32 c);
192 		static bool IsAlNum(uint32 c);
193 		static bool IsDigit(uint32 c);
194 		static bool IsHexDigit(uint32 c);
195 		static bool IsUpper(uint32 c);
196 		static bool IsLower(uint32 c);
197 		static bool IsSpace(uint32 c);
198 		static bool IsWhitespace(uint32 c);
199 		static bool IsControl(uint32 c);
200 		static bool IsPunctuation(uint32 c);
201 		static bool IsPrintable(uint32 c);
202 		static bool IsTitle(uint32 c);
203 		static bool IsDefined(uint32 c);
204 		static bool IsBase(uint32 c);
205 
206 		static int8 Type(uint32 c);
207 
208 		static uint32 ToLower(uint32 c);
209 		static uint32 ToUpper(uint32 c);
210 		static uint32 ToTitle(uint32 c);
211 		static int32 DigitValue(uint32 c);
212 
213 		static void ToUTF8(uint32 c, char **out);
214 		static uint32 FromUTF8(const char **in);
215 		static uint32 FromUTF8(const char *in);
216 
217 		static size_t UTF8StringLength(const char *str);
218 		static size_t UTF8StringLength(const char *str, size_t maxLength);
219 
220 	private:
221 		BUnicodeChar();
222 };
223 
224 
225 inline uint32
226 BUnicodeChar::FromUTF8(const char *in)
227 {
228 	const char *string = in;
229 	return FromUTF8(&string);
230 }
231 
232 
233 #endif	/* _UNICODE_CHAR_H_ */
234