xref: /haiku/src/kits/locale/UnicodeChar.cpp (revision 21258e2674226d6aa732321b6f8494841895af5f)
1 /*
2  * Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Axel Dörfler, axeld@pinc-software.de
7  *		Siarzhuk Zharski, zharik@gmx.li
8  *
9  */
10 
11 
12 #include <UnicodeChar.h>
13 
14 #include <unicode/uchar.h>
15 #include <unicode/utf8.h>
16 
17 
// Default constructor. The body is empty: no initialization is performed
// here. All functions defined in this file receive the code point (or
// string) to work on as an argument.
BUnicodeChar::BUnicodeChar()
{
}
21 
22 
23 // Returns the general category value for the code point.
24 int8
25 BUnicodeChar::Type(uint32 c)
26 {
27 	return u_charType(c);
28 }
29 
30 
31 // Determines whether the specified code point is a letter character.
32 // True for general categories "L" (letters).
33 bool
34 BUnicodeChar::IsAlpha(uint32 c)
35 {
36 	return u_isalpha(c);
37 }
38 
39 
40 // Determines whether the specified code point is an alphanumeric character
41 // (letter or digit).
42 // True for characters with general categories
43 // "L" (letters) and "Nd" (decimal digit numbers).
44 bool
45 BUnicodeChar::IsAlNum(uint32 c)
46 {
47 	return u_isalnum(c);
48 }
49 
50 
51 // Check if a code point has the Lowercase Unicode property (UCHAR_LOWERCASE).
52 bool
53 BUnicodeChar::IsLower(uint32 c)
54 {
55 	return u_isULowercase(c);
56 }
57 
58 
59 // Check if a code point has the Uppercase Unicode property (UCHAR_UPPERCASE).
60 bool
61 BUnicodeChar::IsUpper(uint32 c)
62 {
63 	return u_isUUppercase(c);
64 }
65 
66 
67 // Determines whether the specified code point is a titlecase letter.
68 // True for general category "Lt" (titlecase letter).
69 bool
70 BUnicodeChar::IsTitle(uint32 c)
71 {
72 	return u_istitle(c);
73 }
74 
75 
76 // Determines whether the specified code point is a digit character.
77 // True for characters with general category "Nd" (decimal digit numbers).
78 // Beginning with Unicode 4, this is the same as
79 // testing for the Numeric_Type of Decimal.
80 bool
81 BUnicodeChar::IsDigit(uint32 c)
82 {
83 	return u_isdigit(c);
84 }
85 
86 
87 // Determines whether the specified code point is a hexadecimal digit.
88 // This is equivalent to u_digit(c, 16)>=0.
89 // True for characters with general category "Nd" (decimal digit numbers)
90 // as well as Latin letters a-f and A-F in both ASCII and Fullwidth ASCII.
91 // (That is, for letters with code points
92 // 0041..0046, 0061..0066, FF21..FF26, FF41..FF46.)
93 bool
94 BUnicodeChar::IsHexDigit(uint32 c)
95 {
96 	return u_isxdigit(c);
97 }
98 
99 
100 // Determines whether the specified code point is "defined",
101 // which usually means that it is assigned a character.
102 // True for general categories other than "Cn" (other, not assigned),
103 // i.e., true for all code points mentioned in UnicodeData.txt.
104 bool
105 BUnicodeChar::IsDefined(uint32 c)
106 {
107 	return u_isdefined(c);
108 }
109 
110 
111 // Determines whether the specified code point is a base character.
112 // True for general categories "L" (letters), "N" (numbers),
113 // "Mc" (spacing combining marks), and "Me" (enclosing marks).
114 bool
115 BUnicodeChar::IsBase(uint32 c)
116 {
117 	return u_isbase(c);
118 }
119 
120 
121 // Determines whether the specified code point is a control character
122 // (as defined by this function).
123 // A control character is one of the following:
124 // - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f)
125 // - U_CONTROL_CHAR (Cc)
126 // - U_FORMAT_CHAR (Cf)
127 // - U_LINE_SEPARATOR (Zl)
128 // - U_PARAGRAPH_SEPARATOR (Zp)
129 bool
130 BUnicodeChar::IsControl(uint32 c)
131 {
132 	return u_iscntrl(c);
133 }
134 
135 
136 // Determines whether the specified code point is a punctuation character.
137 // True for characters with general categories "P" (punctuation).
138 bool
139 BUnicodeChar::IsPunctuation(uint32 c)
140 {
141 	return u_ispunct(c);
142 }
143 
144 
145 // Determine if the specified code point is a space character according to Java.
146 // True for characters with general categories "Z" (separators),
147 // which does not include control codes (e.g., TAB or Line Feed).
148 bool
149 BUnicodeChar::IsSpace(uint32 c)
150 {
151 	return u_isJavaSpaceChar(c);
152 }
153 
154 
155 // Determines if the specified code point is a whitespace character
156 // A character is considered to be a whitespace character if and only
157 // if it satisfies one of the following criteria:
158 // - It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"),
159 //		but is not also a non-breaking space (U+00A0 NBSP or U+2007 Figure Space
160 //		or U+202F Narrow NBSP).
161 // - It is U+0009 HORIZONTAL TABULATION.
162 // - It is U+000A LINE FEED.
163 // - It is U+000B VERTICAL TABULATION.
164 // - It is U+000C FORM FEED.
165 // - It is U+000D CARRIAGE RETURN.
166 // - It is U+001C FILE SEPARATOR.
167 // - It is U+001D GROUP SEPARATOR.
168 // - It is U+001E RECORD SEPARATOR.
169 // - It is U+001F UNIT SEPARATOR.
170 bool
171 BUnicodeChar::IsWhitespace(uint32 c)
172 {
173 	return u_isWhitespace(c);
174 }
175 
176 
177 // Determines whether the specified code point is a printable character.
178 // True for general categories other than "C" (controls).
179 bool
180 BUnicodeChar::IsPrintable(uint32 c)
181 {
182 	return u_isprint(c);
183 }
184 
185 
186 //	#pragma mark -
187 
188 uint32
189 BUnicodeChar::ToLower(uint32 c)
190 {
191 	return u_tolower(c);
192 }
193 
194 
195 uint32
196 BUnicodeChar::ToUpper(uint32 c)
197 {
198 	return u_toupper(c);
199 }
200 
201 
202 uint32
203 BUnicodeChar::ToTitle(uint32 c)
204 {
205 	return u_totitle(c);
206 }
207 
208 
209 int32
210 BUnicodeChar::DigitValue(uint32 c)
211 {
212 	return u_digit(c, 10);
213 }
214 
215 
216 unicode_east_asian_width
217 BUnicodeChar::EastAsianWidth(uint32 c)
218 {
219 	return (unicode_east_asian_width)u_getIntPropertyValue(c,
220 			UCHAR_EAST_ASIAN_WIDTH);
221 }
222 
223 
224 void
225 BUnicodeChar::ToUTF8(uint32 c, char** out)
226 {
227 	int i = 0;
228 	U8_APPEND_UNSAFE(*out, i, c);
229 	*out += i;
230 }
231 
232 
233 uint32
234 BUnicodeChar::FromUTF8(const char** in)
235 {
236 	int i = 0;
237 	uint32 c = 0;
238 	U8_NEXT_UNSAFE(*in, i, c);
239 	*in += i;
240 
241 	return c;
242 }
243 
244 
245 size_t
246 BUnicodeChar::UTF8StringLength(const char* string)
247 {
248 	size_t len = 0;
249 	while (*string) {
250 		FromUTF8(&string);
251 		len++;
252 	}
253 	return len;
254 }
255 
256 
257 size_t
258 BUnicodeChar::UTF8StringLength(const char* string, size_t maxLength)
259 {
260 	size_t len = 0;
261 	while (len < maxLength && *string) {
262 		FromUTF8(&string);
263 		len++;
264 	}
265 	return len;
266 }
267