xref: /haiku/src/kits/locale/UnicodeChar.cpp (revision e81a954787e50e56a7f06f72705b7859b6ab06d1)
1 /*
2  * Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Axel Dörfler, axeld@pinc-software.de
7  *		Siarzhuk Zharski, zharik@gmx.li
8  *
9  */
10 
11 
12 #include <UnicodeChar.h>
13 
14 #include <unicode/uchar.h>
15 #include <unicode/utf8.h>
16 
17 
18 BUnicodeChar::BUnicodeChar()
19 {
20 }
21 
22 
23 // Returns the general category value for the code point.
24 int8
25 BUnicodeChar::Type(uint32 c)
26 {
27 	BUnicodeChar();
28 	return u_charType(c);
29 }
30 
31 
32 // Determines whether the specified code point is a letter character.
33 // True for general categories "L" (letters).
34 bool
35 BUnicodeChar::IsAlpha(uint32 c)
36 {
37 	BUnicodeChar();
38 	return u_isalpha(c);
39 }
40 
41 
42 // Determines whether the specified code point is an alphanumeric character
43 // (letter or digit).
44 // True for characters with general categories
45 // "L" (letters) and "Nd" (decimal digit numbers).
46 bool
47 BUnicodeChar::IsAlNum(uint32 c)
48 {
49 	BUnicodeChar();
50 	return u_isalnum(c);
51 }
52 
53 
54 // Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE).
55 bool
56 BUnicodeChar::IsLower(uint32 c)
57 {
58 	BUnicodeChar();
59 	return u_isULowercase(c);
60 }
61 
62 
63 // Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE).
64 bool
65 BUnicodeChar::IsUpper(uint32 c)
66 {
67 	BUnicodeChar();
68 	return u_isUUppercase(c);
69 }
70 
71 
72 // Determines whether the specified code point is a titlecase letter.
73 // True for general category "Lt" (titlecase letter).
74 bool
75 BUnicodeChar::IsTitle(uint32 c)
76 {
77 	BUnicodeChar();
78 	return u_istitle(c);
79 }
80 
81 
82 // Determines whether the specified code point is a digit character.
83 // True for characters with general category "Nd" (decimal digit numbers).
84 // Beginning with Unicode 4, this is the same as
85 // testing for the Numeric_Type of Decimal.
86 bool
87 BUnicodeChar::IsDigit(uint32 c)
88 {
89 	BUnicodeChar();
90 	return u_isdigit(c);
91 }
92 
93 
94 // Determines whether the specified code point is a hexadecimal digit.
95 // This is equivalent to u_digit(c, 16)>=0.
96 // True for characters with general category "Nd" (decimal digit numbers)
97 // as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII.
98 // (That is, for letters with code points
99 // 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.)
100 bool
101 BUnicodeChar::IsHexDigit(uint32 c)
102 {
103 	BUnicodeChar();
104 	return u_isxdigit(c);
105 }
106 
107 
108 // Determines whether the specified code point is "defined",
109 // which usually means that it is assigned a character.
110 // True for general categories other than "Cn" (other, not assigned),
111 // i.e., true for all code points mentioned in UnicodeData.txt.
112 bool
113 BUnicodeChar::IsDefined(uint32 c)
114 {
115 	BUnicodeChar();
116 	return u_isdefined(c);
117 }
118 
119 
120 // Determines whether the specified code point is a base character.
121 // True for general categories "L" (letters), "N" (numbers),
122 // "Mc" (spacing combining marks), and "Me" (enclosing marks).
123 bool
124 BUnicodeChar::IsBase(uint32 c)
125 {
126 	BUnicodeChar();
127 	return u_isbase(c);
128 }
129 
130 
131 // Determines whether the specified code point is a control character
132 // (as defined by this function).
133 // A control character is one of the following:
134 // - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f)
135 // - U_CONTROL_CHAR (Cc)
136 // - U_FORMAT_CHAR (Cf)
137 // - U_LINE_SEPARATOR (Zl)
138 // - U_PARAGRAPH_SEPARATOR (Zp)
139 bool
140 BUnicodeChar::IsControl(uint32 c)
141 {
142 	BUnicodeChar();
143 	return u_iscntrl(c);
144 }
145 
146 
147 // Determines whether the specified code point is a punctuation character.
148 // True for characters with general categories "P" (punctuation).
149 bool
150 BUnicodeChar::IsPunctuation(uint32 c)
151 {
152 	BUnicodeChar();
153 	return u_ispunct(c);
154 }
155 
156 
157 // Determine if the specified code point is a space character according to Java.
158 // True for characters with general categories "Z" (separators),
159 // which does not include control codes (e.g., TAB or Line Feed).
160 bool
161 BUnicodeChar::IsSpace(uint32 c)
162 {
163 	BUnicodeChar();
164 	return u_isJavaSpaceChar(c);
165 }
166 
167 
168 // Determines if the specified code point is a whitespace character
169 // A character is considered to be a whitespace character if and only
170 // if it satisfies one of the following criteria:
171 // - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"),
172 //		but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space
173 //		or U+202F Narrow NBSP).
174 // - It is U+0009 HORIZONTAL TABULATION.
175 // - It is U+000A LINE FEED.
176 // - It is U+000B VERTICAL TABULATION.
177 // - It is U+000C FORM FEED.
178 // - It is U+000D CARRIAGE RETURN.
179 // - It is U+001C FILE SEPARATOR.
180 // - It is U+001D GROUP SEPARATOR.
181 // - It is U+001E RECORD SEPARATOR.
182 // - It is U+001F UNIT SEPARATOR.
183 bool
184 BUnicodeChar::IsWhitespace(uint32 c)
185 {
186 	BUnicodeChar();
187 	return u_isWhitespace(c);
188 }
189 
190 
191 // Determines whether the specified code point is a printable character.
192 // True for general categories other than "C" (controls).
193 bool
194 BUnicodeChar::IsPrintable(uint32 c)
195 {
196 	BUnicodeChar();
197 	return u_isprint(c);
198 }
199 
200 
201 //	#pragma mark -
202 
203 uint32
204 BUnicodeChar::ToLower(uint32 c)
205 {
206 	BUnicodeChar();
207 	return u_tolower(c);
208 }
209 
210 
211 uint32
212 BUnicodeChar::ToUpper(uint32 c)
213 {
214 	BUnicodeChar();
215 	return u_toupper(c);
216 }
217 
218 
219 uint32
220 BUnicodeChar::ToTitle(uint32 c)
221 {
222 	BUnicodeChar();
223 	return u_totitle(c);
224 }
225 
226 
227 int32
228 BUnicodeChar::DigitValue(uint32 c)
229 {
230 	BUnicodeChar();
231 	return u_digit(c, 10);
232 }
233 
234 
235 unicode_east_asian_width
236 BUnicodeChar::EastAsianWidth(uint32 c)
237 {
238 	return (unicode_east_asian_width)u_getIntPropertyValue(c,
239 			UCHAR_EAST_ASIAN_WIDTH);
240 }
241 
242 
243 void
244 BUnicodeChar::ToUTF8(uint32 c, char** out)
245 {
246 	int i = 0;
247 	U8_APPEND_UNSAFE(*out, i, c);
248 	*out += i;
249 }
250 
251 
252 uint32
253 BUnicodeChar::FromUTF8(const char** in)
254 {
255 	int i = 0;
256 	uint32 c = 0;
257 	U8_NEXT_UNSAFE(*in, i, c);
258 	*in += i;
259 
260 	return c;
261 }
262 
263 
264 size_t
265 BUnicodeChar::UTF8StringLength(const char* string)
266 {
267 	size_t len = 0;
268 	while (*string) {
269 		FromUTF8(&string);
270 		len++;
271 	}
272 	return len;
273 }
274 
275 
276 size_t
277 BUnicodeChar::UTF8StringLength(const char* string, size_t maxLength)
278 {
279 	size_t len = 0;
280 	while (len < maxLength && *string) {
281 		FromUTF8(&string);
282 		len++;
283 	}
284 	return len;
285 }
286