xref: /haiku/docs/user/locale/UnicodeChar.dox (revision b6f76ebe7153b94820cf35f8db4facc158841abb)
1a33f8fbdSAdrien Destugues/*
2820dca4dSJohn Scipione * Copyright 2011 Haiku, Inc. All rights reserved.
3*b6f76ebeSAugustin Cavalier * Distributed under the terms of the MIT License.
4a33f8fbdSAdrien Destugues *
5a33f8fbdSAdrien Destugues * Authors:
6820dca4dSJohn Scipione *		Axel Dörfler, axeld@pinc-software.de
7820dca4dSJohn Scipione *		John Scipione, jscipione@gmail.com
8a33f8fbdSAdrien Destugues *
9a33f8fbdSAdrien Destugues * Corresponds to:
10820dca4dSJohn Scipione *		headers/os/locale/UnicodeChar.h	 rev 42274
11820dca4dSJohn Scipione *		src/kits/locale/UnicodeChar.cpp	 rev 42274
12a33f8fbdSAdrien Destugues */
13a33f8fbdSAdrien Destugues
14820dca4dSJohn Scipione
15820dca4dSJohn Scipione/*!
16820dca4dSJohn Scipione	\file UnicodeChar.h
17820dca4dSJohn Scipione	\ingroup locale
18820dca4dSJohn Scipione	\ingroup libbe
19820dca4dSJohn Scipione	\brief Provides the BUnicodeChar class.
20820dca4dSJohn Scipione*/
21820dca4dSJohn Scipione
22820dca4dSJohn Scipione
23a8a3468aSAdrien Destugues/*!
24a8a3468aSAdrien Destugues	\class BUnicodeChar
25a8a3468aSAdrien Destugues	\ingroup locale
26820dca4dSJohn Scipione	\ingroup libbe
27a8a3468aSAdrien Destugues	\brief Management of all information about characters.
28a8a3468aSAdrien Destugues
29a8a3468aSAdrien Destugues	This class provides a set of tools for managing the whole set of characters
30a33f8fbdSAdrien Destugues	defined by unicode. This includes information about special sets of
31a33f8fbdSAdrien Destugues	characters such as if the character is whitespace, or alphanumeric. It also
32a33f8fbdSAdrien Destugues	provides the uppercase equivalent of a character and determines whether a
33a33f8fbdSAdrien Destugues	character can be ornamented with accents.
34a8a3468aSAdrien Destugues
35a33f8fbdSAdrien Destugues	This class consists entirely of static methods, so you do not have to
36a33f8fbdSAdrien Destugues	instantiate it. You can call one of the methods passing in the character
37a33f8fbdSAdrien Destugues	that you want to be examined.
38a8a3468aSAdrien Destugues
39820dca4dSJohn Scipione	Note that all the functions work with characters encoded in UTF-32. This is
40a33f8fbdSAdrien Destugues	not the most usual way to handle characters, but it is the fastest. To
41820dca4dSJohn Scipione	convert a UTF-8 string to a UTF-32 character use the FromUTF8() method.
42edc845a3SJohn Scipione
43edc845a3SJohn Scipione	\since Haiku R1
44a8a3468aSAdrien Destugues*/
45a8a3468aSAdrien Destugues
46820dca4dSJohn Scipione
47a8a3468aSAdrien Destugues/*!
48a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsAlpha(uint32 c)
49a33f8fbdSAdrien Destugues	\brief Determine if \a c is alphabetic.
50a33f8fbdSAdrien Destugues
51a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is an
52a33f8fbdSAdrien Destugues	         alphabetic character.
53edc845a3SJohn Scipione
54edc845a3SJohn Scipione	\since Haiku R1
55a8a3468aSAdrien Destugues*/
56a8a3468aSAdrien Destugues
57820dca4dSJohn Scipione
58a8a3468aSAdrien Destugues/*!
59a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsAlNum(uint32 c)
60a33f8fbdSAdrien Destugues	\brief Determine if \a c is alphanumeric.
61a33f8fbdSAdrien Destugues
62a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is an
63a33f8fbdSAdrien Destugues	         alphabetic or numeric character.
64edc845a3SJohn Scipione
65edc845a3SJohn Scipione	\since Haiku R1
66a8a3468aSAdrien Destugues*/
67a8a3468aSAdrien Destugues
68820dca4dSJohn Scipione
69a8a3468aSAdrien Destugues/*!
70a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsDigit(uint32 c)
71a33f8fbdSAdrien Destugues	\brief Determine if \a c is numeric.
72a33f8fbdSAdrien Destugues
73a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a
74a33f8fbdSAdrien Destugues	         number character.
75edc845a3SJohn Scipione
76edc845a3SJohn Scipione	\since Haiku R1
77a8a3468aSAdrien Destugues*/
78a8a3468aSAdrien Destugues
79820dca4dSJohn Scipione
80a8a3468aSAdrien Destugues/*!
81a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsHexDigit(uint32 c)
82a33f8fbdSAdrien Destugues	\brief Determine if \a c is a hexadecimal digit.
83a33f8fbdSAdrien Destugues
84a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a
85a33f8fbdSAdrien Destugues	         hexadecimal number character.
86edc845a3SJohn Scipione
87edc845a3SJohn Scipione	\since Haiku R1
88a8a3468aSAdrien Destugues*/
89a8a3468aSAdrien Destugues
90820dca4dSJohn Scipione
91a8a3468aSAdrien Destugues/*!
92a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsUpper(uint32 c)
93a33f8fbdSAdrien Destugues	\brief Determine if \a c is uppercase.
94a33f8fbdSAdrien Destugues
95a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is an
96a33f8fbdSAdrien Destugues	         uppercase character.
97edc845a3SJohn Scipione
98edc845a3SJohn Scipione	\since Haiku R1
99a8a3468aSAdrien Destugues*/
100a8a3468aSAdrien Destugues
101820dca4dSJohn Scipione
102a8a3468aSAdrien Destugues/*!
103a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsLower(uint32 c)
104a33f8fbdSAdrien Destugues	\brief Determine if \a c is lowercase.
105a33f8fbdSAdrien Destugues
106a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a
107a33f8fbdSAdrien Destugues	         lowercase character.
108edc845a3SJohn Scipione
109edc845a3SJohn Scipione	\since Haiku R1
110a8a3468aSAdrien Destugues*/
111a8a3468aSAdrien Destugues
112820dca4dSJohn Scipione
113a8a3468aSAdrien Destugues/*!
114a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsSpace(uint32 c)
115a33f8fbdSAdrien Destugues	\brief Determine if \a c is a space.
116a8a3468aSAdrien Destugues
117a33f8fbdSAdrien Destugues	Unlike IsWhitespace() this function will return \c true for non-breakable
118a33f8fbdSAdrien Destugues	spaces. This method is useful for determining if the character will render
119a33f8fbdSAdrien Destugues	as an empty space which can be stretched on-screen.
120a33f8fbdSAdrien Destugues
121a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is some
122a33f8fbdSAdrien Destugues	         kind of a space character.
123a33f8fbdSAdrien Destugues
124a33f8fbdSAdrien Destugues	\sa IsWhitespace()
125edc845a3SJohn Scipione
126edc845a3SJohn Scipione	\since Haiku R1
127a8a3468aSAdrien Destugues*/
128a8a3468aSAdrien Destugues
129820dca4dSJohn Scipione
130a8a3468aSAdrien Destugues/*!
131a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsWhitespace(uint32 c)
132a33f8fbdSAdrien Destugues	\brief Determine if \a c is whitespace.
133a8a3468aSAdrien Destugues
134a33f8fbdSAdrien Destugues	This method is essentially the same as IsSpace(), but excludes all
135a33f8fbdSAdrien Destugues	non-breakable spaces.
136a33f8fbdSAdrien Destugues
137a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a whitespace
138a33f8fbdSAdrien Destugues	         character.
139a33f8fbdSAdrien Destugues
140a33f8fbdSAdrien Destugues	\sa IsSpace()
141edc845a3SJohn Scipione
142edc845a3SJohn Scipione	\since Haiku R1
143a8a3468aSAdrien Destugues*/
144a8a3468aSAdrien Destugues
145820dca4dSJohn Scipione
146a8a3468aSAdrien Destugues/*!
147a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsControl(uint32 c)
148a33f8fbdSAdrien Destugues	\brief Determine if \a c is a control character.
149a8a3468aSAdrien Destugues
150a33f8fbdSAdrien Destugues	Example control characters are the non-printable ASCII characters from
151a33f8fbdSAdrien Destugues	0x0 to 0x1F.
152a33f8fbdSAdrien Destugues
153a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a control
154a33f8fbdSAdrien Destugues	         character.
155a33f8fbdSAdrien Destugues
156a33f8fbdSAdrien Destugues	\sa IsPrintable()
157edc845a3SJohn Scipione
158edc845a3SJohn Scipione	\since Haiku R1
159a8a3468aSAdrien Destugues*/
160a8a3468aSAdrien Destugues
161820dca4dSJohn Scipione
162a8a3468aSAdrien Destugues/*!
163a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsPunctuation(uint32 c)
164a33f8fbdSAdrien Destugues	\brief Determine if \a c is a punctuation character.
165a33f8fbdSAdrien Destugues
166a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a
167a33f8fbdSAdrien Destugues	         punctuation character.
168edc845a3SJohn Scipione
169edc845a3SJohn Scipione	\since Haiku R1
170a8a3468aSAdrien Destugues*/
171a8a3468aSAdrien Destugues
172820dca4dSJohn Scipione
173a8a3468aSAdrien Destugues/*!
174a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsPrintable(uint32 c)
175a33f8fbdSAdrien Destugues	\brief Determine if \a c is printable.
176a33f8fbdSAdrien Destugues
177a33f8fbdSAdrien Destugues	Printable characters are not control characters.
178a33f8fbdSAdrien Destugues
179a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a printable
180a33f8fbdSAdrien Destugues	         character.
181a33f8fbdSAdrien Destugues
182a33f8fbdSAdrien Destugues	\sa IsControl()
183edc845a3SJohn Scipione
184edc845a3SJohn Scipione	\since Haiku R1
185a8a3468aSAdrien Destugues*/
186a8a3468aSAdrien Destugues
187820dca4dSJohn Scipione
188a8a3468aSAdrien Destugues/*!
189a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsTitle(uint32 c)
190a33f8fbdSAdrien Destugues	\brief Determine if \a c is title case.
191a8a3468aSAdrien Destugues
192a33f8fbdSAdrien Destugues	Title case characters are a smaller version of normal uppercase letters.
193a33f8fbdSAdrien Destugues
194a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a title case
195a33f8fbdSAdrien Destugues	         character.
196edc845a3SJohn Scipione
197edc845a3SJohn Scipione	\since Haiku R1
198a8a3468aSAdrien Destugues*/
199a8a3468aSAdrien Destugues
200820dca4dSJohn Scipione
201a8a3468aSAdrien Destugues/*!
202a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsDefined(uint32 c)
203a33f8fbdSAdrien Destugues	\brief Determine if \a c is defined.
204a8a3468aSAdrien Destugues
205a33f8fbdSAdrien Destugues	In unicode some codes are not valid or not attributed yet.
206a33f8fbdSAdrien Destugues	For these codes this method will return \c false.
207a33f8fbdSAdrien Destugues
208a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is defined.
209edc845a3SJohn Scipione
210edc845a3SJohn Scipione	\since Haiku R1
211a8a3468aSAdrien Destugues*/
212a8a3468aSAdrien Destugues
213820dca4dSJohn Scipione
214a8a3468aSAdrien Destugues/*!
215a8a3468aSAdrien Destugues	\fn static bool BUnicodeChar::IsBase(uint32 c)
216a33f8fbdSAdrien Destugues	\brief Determine if \a c can be used with a diacritic.
217a33f8fbdSAdrien Destugues
218a33f8fbdSAdrien Destugues	\note IsBase() does not determine if a unicode character is distinct.
219a33f8fbdSAdrien Destugues
220a33f8fbdSAdrien Destugues	\returns \c true if the specified unicode character is a base
221a33f8fbdSAdrien Destugues	         form character that can be used with a diacritic.
222edc845a3SJohn Scipione
223edc845a3SJohn Scipione	\since Haiku R1
224a8a3468aSAdrien Destugues*/
225a8a3468aSAdrien Destugues
226820dca4dSJohn Scipione
227a8a3468aSAdrien Destugues/*!
228a8a3468aSAdrien Destugues	\fn static int8 BUnicodeChar::Type(uint32 c)
229a33f8fbdSAdrien Destugues	\brief Gets the type of a character.
230a8a3468aSAdrien Destugues
231a33f8fbdSAdrien Destugues	\returns A member of the \c unicode_char_category enum.
232edc845a3SJohn Scipione
233edc845a3SJohn Scipione	\since Haiku R1
234a8a3468aSAdrien Destugues*/
235a8a3468aSAdrien Destugues
236820dca4dSJohn Scipione
237a8a3468aSAdrien Destugues/*!
238a33f8fbdSAdrien Destugues	\fn uint32  BUnicodeChar::ToLower(uint32 c)
239a33f8fbdSAdrien Destugues	\brief Transforms \a c to lowercase.
240a33f8fbdSAdrien Destugues
241a33f8fbdSAdrien Destugues	\returns The lowercase version of the specified unicode character.
242edc845a3SJohn Scipione
243edc845a3SJohn Scipione	\since Haiku R1
244a8a3468aSAdrien Destugues*/
245a8a3468aSAdrien Destugues
246820dca4dSJohn Scipione
247a8a3468aSAdrien Destugues/*!
248a33f8fbdSAdrien Destugues	\fn uint32 BUnicodeChar::ToUpper(uint32 c)
249a33f8fbdSAdrien Destugues	\brief Transforms \a c to uppercase.
250a33f8fbdSAdrien Destugues
251a33f8fbdSAdrien Destugues	\returns The uppercase version of the specified unicode character.
252edc845a3SJohn Scipione
253edc845a3SJohn Scipione	\since Haiku R1
254a8a3468aSAdrien Destugues*/
255a8a3468aSAdrien Destugues
256820dca4dSJohn Scipione
257a8a3468aSAdrien Destugues/*!
258a33f8fbdSAdrien Destugues	\fn uint32 BUnicodeChar::ToTitle(uint32 c)
259a33f8fbdSAdrien Destugues	\brief Transforms \a c to title case.
260a33f8fbdSAdrien Destugues
261a33f8fbdSAdrien Destugues	\returns The title case version of the specified unicode character.
262edc845a3SJohn Scipione
263edc845a3SJohn Scipione	\since Haiku R1
264a8a3468aSAdrien Destugues*/
265a8a3468aSAdrien Destugues
266820dca4dSJohn Scipione
267a8a3468aSAdrien Destugues/*!
268a33f8fbdSAdrien Destugues	\fn int32 BUnicodeChar::DigitValue(uint32 c)
269a33f8fbdSAdrien Destugues	\brief Gets the numeric value of \a c.
270a33f8fbdSAdrien Destugues
271a33f8fbdSAdrien Destugues	\returns The numeric version of the specified unicode character.
272edc845a3SJohn Scipione
273edc845a3SJohn Scipione	\since Haiku R1
274a8a3468aSAdrien Destugues*/
275a8a3468aSAdrien Destugues
276820dca4dSJohn Scipione
277a8a3468aSAdrien Destugues/*!
278a33f8fbdSAdrien Destugues	\fn void BUnicodeChar::ToUTF8(uint32 c, char** out)
279820dca4dSJohn Scipione	\brief Transform a character to UTF-8 encoding.
280a33f8fbdSAdrien Destugues
281820dca4dSJohn Scipione	\returns The UTF-8 encoding of the specified unicode character.
282edc845a3SJohn Scipione
283edc845a3SJohn Scipione	\since Haiku R1
284a8a3468aSAdrien Destugues*/
285a8a3468aSAdrien Destugues
286820dca4dSJohn Scipione
287a8a3468aSAdrien Destugues/*!
288a33f8fbdSAdrien Destugues	\fn uint32 BUnicodeChar::FromUTF8(const char** in)
289820dca4dSJohn Scipione	\brief Transform a UTF-8 string to a UTF-32 character.
290a8a3468aSAdrien Destugues
291a8a3468aSAdrien Destugues	If the string contains multiple characters, only the first one is used.
292a8a3468aSAdrien Destugues	This function updates the \a in pointer so that it points to the next
293a8a3468aSAdrien Destugues	character for the following call.
294a33f8fbdSAdrien Destugues
295820dca4dSJohn Scipione	\returns The UTF-32 encoded version of \a in.
296edc845a3SJohn Scipione
297edc845a3SJohn Scipione	\since Haiku R1
298a8a3468aSAdrien Destugues*/
299a8a3468aSAdrien Destugues
300820dca4dSJohn Scipione
301a8a3468aSAdrien Destugues/*!
302edc845a3SJohn Scipione	\fn size_t BUnicodeChar::UTF8StringLength(const char* string)
303a33f8fbdSAdrien Destugues	\brief Counts the characters in the given \c NUL terminated string.
304a8a3468aSAdrien Destugues
305820dca4dSJohn Scipione	\returns the number of UTF-8 characters in the \c NUL terminated string.
306a8a3468aSAdrien Destugues
307a8a3468aSAdrien Destugues	\sa BString::CountChars()
308edc845a3SJohn Scipione
309edc845a3SJohn Scipione	\since Haiku R1
310a8a3468aSAdrien Destugues*/
311a8a3468aSAdrien Destugues
312820dca4dSJohn Scipione
313a8a3468aSAdrien Destugues/*!
314edc845a3SJohn Scipione	\fn size_t BUnicodeChar::UTF8StringLength(const char* string,
315edc845a3SJohn Scipione		size_t maxLength)
316a33f8fbdSAdrien Destugues	\brief Counts the characters in the given string up to \a maxLength
317a33f8fbdSAdrien Destugues		characters.
318a8a3468aSAdrien Destugues
319edc845a3SJohn Scipione	\param string does not need to be \c NUL terminated if you specify a
320a33f8fbdSAdrien Destugues	       \a maxLength that is shorter than the maximum length of the string.
321edc845a3SJohn Scipione	\param maxLength The maximum length of the string in bytes.
322a33f8fbdSAdrien Destugues
323820dca4dSJohn Scipione	\returns the number of UTF-8 characters in the \c NUL terminated string
324a33f8fbdSAdrien Destugues	         up to \a maxLength characters.
325edc845a3SJohn Scipione
326edc845a3SJohn Scipione	\since Haiku R1
327a8a3468aSAdrien Destugues*/
328