xref: /haiku/src/kits/locale/Collator.cpp (revision 35ead8815b679605a9b4db8130613ea100f4b14c)
1 /*
2 ** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3 ** Distributed under the terms of the OpenBeOS License.
4 */
5 
6 
7 #include <Collator.h>
8 #include <UnicodeChar.h>
9 #include <String.h>
10 #include <Message.h>
11 
12 #include <typeinfo>
13 #include <ctype.h>
14 
15 
16 // conversion array for character ranges 192 - 223 & 224 - 255
17 static const uint8 kNoDiacrits[] = {
18 	'a','a','a','a','a','a','a',
19 	'c',
20 	'e','e','e','e',
21 	'i','i','i','i',
22 	240,	// eth
23 	'n',
24 	'o','o','o','o','o',
25 	247,	//
26 	'o',
27 	'u','u','u','u',
28 	'y',
29 	254,	// thorn
30 	'y'
31 };
32 
33 
34 static inline uint32
35 getPrimaryChar(uint32 c)
36 {
37 	if (c < 0x80)
38 		return tolower(c);
39 
40 	// this automatically returns lowercase letters
41 	if (c >= 192 && c < 223)
42 		return kNoDiacrits[c - 192];
43 	if (c == 223)	// ß
44 		return 's';
45 	if (c >= 224 && c < 256)
46 		return kNoDiacrits[c - 224];
47 
48 	return BUnicodeChar::ToLower(c);
49 }
50 
51 
52 BCollatorAddOn::input_context::input_context(bool ignorePunctuation)
53 	:
54 	ignore_punctuation(ignorePunctuation),
55 	next_char(0),
56 	reserved1(0),
57 	reserved2(0)
58 {
59 }
60 
61 
62 //	#pragma mark -
63 
64 
65 BCollator::BCollator()
66 	:
67 	fCollatorImage(B_ERROR),
68 	fStrength(B_COLLATE_PRIMARY),
69 	fIgnorePunctuation(true)
70 {
71 	// ToDo: the collator construction will have to change; the default
72 	//	collator should be constructed by the Locale/LocaleRoster, so we
73 	//	only need a constructor where you specify all details
74 
75 	fCollator = new BCollatorAddOn();
76 }
77 
78 
79 BCollator::BCollator(BCollatorAddOn *collator, int8 strength,
80 	bool ignorePunctuation)
81 	:
82 	fCollator(collator),
83 	fCollatorImage(B_ERROR),
84 	fStrength(strength),
85 	fIgnorePunctuation(ignorePunctuation)
86 {
87 	if (collator == NULL)
88 		fCollator = new BCollatorAddOn();
89 }
90 
91 
92 BCollator::BCollator(BMessage *archive)
93 	: BArchivable(archive),
94 	fCollator(NULL),
95 	fCollatorImage(B_ERROR),
96 	fIgnorePunctuation(true)
97 {
98 #if HAIKU_TARGET_PLATFORM_HAIKU
99 	int32 data;
100 	if (archive->FindInt32("loc:strength", &data) == B_OK)
101 		fStrength = (uint8)data;
102 	else
103 		fStrength = B_COLLATE_PRIMARY;
104 
105 	if (archive->FindBool("loc:punctuation", &fIgnorePunctuation) != B_OK)
106 		fIgnorePunctuation = true;
107 
108 	BMessage collatorArchive;
109 	if (archive->FindMessage("loc:collator", &collatorArchive) == B_OK) {
110 		BArchivable *unarchived =
111 			instantiate_object(&collatorArchive, &fCollatorImage);
112 
113 		// do we really have a BCollatorAddOn here?
114 		fCollator = dynamic_cast<BCollatorAddOn *>(unarchived);
115 		if (fCollator == NULL)
116 			delete unarchived;
117 	}
118 
119 	if (fCollator == NULL) {
120 		fCollator = new BCollatorAddOn();
121 		fCollatorImage = B_ERROR;
122 	}
123 #endif
124 }
125 
126 
127 BCollator::~BCollator()
128 {
129 	delete fCollator;
130 
131 	if (fCollatorImage >= B_OK)
132 		unload_add_on(fCollatorImage);
133 }
134 
135 
136 void
137 BCollator::SetDefaultStrength(int8 strength)
138 {
139 	fStrength = strength;
140 }
141 
142 
143 int8
144 BCollator::DefaultStrength() const
145 {
146 	return fStrength;
147 }
148 
149 
150 void
151 BCollator::SetIgnorePunctuation(bool ignore)
152 {
153 	fIgnorePunctuation = ignore;
154 }
155 
156 
157 bool
158 BCollator::IgnorePunctuation() const
159 {
160 	return fIgnorePunctuation;
161 }
162 
163 
164 status_t
165 BCollator::GetSortKey(const char *string, BString *key, int8 strength)
166 {
167 	if (strength == B_COLLATE_DEFAULT)
168 		strength = fStrength;
169 
170 	return fCollator->GetSortKey(string, key, strength, fIgnorePunctuation);
171 }
172 
173 
174 int
175 BCollator::Compare(const char *a, const char *b, int32 length, int8 strength)
176 {
177 	if (length == -1)	// match the whole string
178 		length = 0x7fffffff;
179 
180 	return fCollator->Compare(a, b, length,
181 				strength == B_COLLATE_DEFAULT ? fStrength : strength,
182 				fIgnorePunctuation);
183 }
184 
185 
186 status_t
187 BCollator::Archive(BMessage *archive, bool deep) const
188 {
189 	status_t status = BArchivable::Archive(archive, deep);
190 	if (status < B_OK)
191 		return status;
192 
193 	if (status == B_OK)
194 		status = archive->AddInt32("loc:strength", fStrength);
195 	if (status == B_OK)
196 		status = archive->AddBool("loc:punctuation", fIgnorePunctuation);
197 
198 	BMessage collatorArchive;
199 	if (status == B_OK && deep
200 		&& typeid(*fCollator) != typeid(BCollatorAddOn)
201 			// only archive subclasses from BCollatorAddOn
202 		&& (status = fCollator->Archive(&collatorArchive, true)) == B_OK)
203 		status = archive->AddMessage("loc:collator", &collatorArchive);
204 
205 	return status;
206 }
207 
208 
209 BArchivable *
210 BCollator::Instantiate(BMessage *archive)
211 {
212 	if (validate_instantiation(archive, "BCollator"))
213 		return new BCollator(archive);
214 
215 	return NULL;
216 }
217 
218 
219 //	#pragma mark -
220 
221 
222 BCollatorAddOn::BCollatorAddOn()
223 {
224 }
225 
226 
227 BCollatorAddOn::BCollatorAddOn(BMessage *archive)
228 	: BArchivable(archive)
229 {
230 }
231 
232 
233 BCollatorAddOn::~BCollatorAddOn()
234 {
235 }
236 
237 
238 /** This returns the next Unicode character from the UTF-8 encoded
239  *	input string, and bumps it to the next character.
240  *	It will ignore punctuation if specified by the context, and
241  *	might substitute characters if needed.
242  */
243 
244 uint32
245 BCollatorAddOn::GetNextChar(const char **string, input_context &context)
246 {
247 	uint32 c = context.next_char;
248 	if (c != 0) {
249 		context.next_char = 0;
250 		return c;
251 	}
252 
253 	do {
254 		c = BUnicodeChar::FromUTF8(string);
255 	} while (context.ignore_punctuation
256 		&& (BUnicodeChar::IsPunctuation(c) || BUnicodeChar::IsSpace(c)));
257 
258 	if (c == 223) {
259 		context.next_char = 's';
260 		return 's';
261 	}
262 
263 	return c;
264 }
265 
266 
267 /** Fills the specified buffer with the primary sort key. The buffer
268  *	has to be long enough to hold the key.
269  *	It returns the position in the buffer immediately after the key;
270  *	it does not add a terminating null byte!
271  */
272 
273 char *
274 BCollatorAddOn::PutPrimaryKey(const char *string, char *buffer, int32 length,
275 	bool ignorePunctuation)
276 {
277 	input_context context(ignorePunctuation);
278 
279 	uint32 c;
280 	for (int32 i = 0; (c = GetNextChar(&string, context)) != 0 && i < length;
281 		i++) {
282 		if (c < 0x80)
283 			*buffer++ = tolower(c);
284 		else
285 			BUnicodeChar::ToUTF8(getPrimaryChar(c), &buffer);
286 	}
287 
288 	return buffer;
289 }
290 
291 
292 size_t
293 BCollatorAddOn::PrimaryKeyLength(size_t length)
294 {
295 	return length * 2;
296 		// the primary key needs to make space for doubled characters (like 'ß')
297 }
298 
299 
300 status_t
301 BCollatorAddOn::GetSortKey(const char *string, BString *key, int8 strength,
302 	bool ignorePunctuation)
303 {
304 	if (strength >= B_COLLATE_QUATERNARY) {
305 		// the difference between tertiary and quaternary collation strength
306 		// are usually a different handling of punctuation characters
307 		ignorePunctuation = false;
308 	}
309 
310 	size_t length = strlen(string);
311 
312 	switch (strength) {
313 		case B_COLLATE_PRIMARY:
314 		{
315 			char *begin = key->LockBuffer(PrimaryKeyLength(length));
316 			if (begin == NULL)
317 				return B_NO_MEMORY;
318 
319 			char *end = PutPrimaryKey(string, begin, length, ignorePunctuation);
320 			*end = '\0';
321 
322 			key->UnlockBuffer(end - begin);
323 			break;
324 		}
325 
326 		case B_COLLATE_SECONDARY:
327 		{
328 			char *begin
329 				= key->LockBuffer(PrimaryKeyLength(length) + length + 1);
330 					// the primary key + the secondary key + separator char
331 			if (begin == NULL)
332 				return B_NO_MEMORY;
333 
334 			char *buffer = PutPrimaryKey(string, begin, length, ignorePunctuation);
335 			*buffer++ = '\01';
336 				// separator
337 
338 			input_context context(ignorePunctuation);
339 			uint32 c;
340 			for (uint32 i = 0;
341 				(c = GetNextChar(&string, context)) && i < length; i++) {
342 				if (c < 0x80)
343 					*buffer++ = tolower(c);
344 				else
345 					BUnicodeChar::ToUTF8(BUnicodeChar::ToLower(c), &buffer);
346 			}
347 			*buffer = '\0';
348 
349 			key->UnlockBuffer(buffer - begin);
350 			break;
351 		}
352 
353 		case B_COLLATE_TERTIARY:
354 		case B_COLLATE_QUATERNARY:
355 		{
356 			char *begin
357 				= key->LockBuffer(PrimaryKeyLength(length) + length + 1);
358 					// the primary key + the tertiary key + separator char
359 			if (begin == NULL)
360 				return B_NO_MEMORY;
361 
362 			char *buffer = PutPrimaryKey(string, begin, length, ignorePunctuation);
363 			*buffer++ = '\01';
364 				// separator
365 
366 			input_context context(ignorePunctuation);
367 			uint32 c;
368 			for (uint32 i = 0;
369 				(c = GetNextChar(&string, context)) && i < length; i++) {
370 				BUnicodeChar::ToUTF8(c, &buffer);
371 			}
372 			*buffer = '\0';
373 
374 			key->UnlockBuffer(buffer + length - begin);
375 			break;
376 		}
377 
378 		case B_COLLATE_IDENTICAL:
379 		default:
380 			key->SetTo(string, length);
381 				// is there any way to check if BString::SetTo() actually succeeded?
382 			break;
383 	}
384 	return B_OK;
385 }
386 
387 
388 int
389 BCollatorAddOn::Compare(const char *a, const char *b, int32 length,
390 	int8 strength, bool ignorePunctuation)
391 {
392 	if (strength >= B_COLLATE_QUATERNARY) {
393 		// the difference between tertiary and quaternary collation strength
394 		// are usually a different handling of punctuation characters
395 		ignorePunctuation = false;
396 	}
397 
398 	input_context contextA(ignorePunctuation);
399 	input_context contextB(ignorePunctuation);
400 
401 	switch (strength) {
402 		case B_COLLATE_PRIMARY:
403 		{
404 			for (int32 i = 0; i < length; i++) {
405 				uint32 charA = GetNextChar(&a, contextA);
406 				uint32 charB = GetNextChar(&b, contextB);
407 				if (charA == 0)
408 					return charB == 0 ? 0 : -(int32)charB;
409 				else if (charB == 0)
410 					return (int32)charA;
411 
412 				charA = getPrimaryChar(charA);
413 				charB = getPrimaryChar(charB);
414 
415 				if (charA != charB)
416 					return (int32)charA - (int32)charB;
417 			}
418 			return 0;
419 		}
420 
421 		case B_COLLATE_SECONDARY:
422 		{
423 			// diacriticals can only change the order between equal strings
424 			int32 compare
425 				= Compare(a, b, length, B_COLLATE_PRIMARY, ignorePunctuation);
426 			if (compare != 0)
427 				return compare;
428 
429 			for (int32 i = 0; i < length; i++) {
430 				uint32 charA = BUnicodeChar::ToLower(GetNextChar(&a, contextA));
431 				uint32 charB = BUnicodeChar::ToLower(GetNextChar(&b, contextB));
432 
433 				// the two strings does have the same size when we get here
434 				if (charA == 0)
435 					return 0;
436 
437 				if (charA != charB)
438 					return (int32)charA - (int32)charB;
439 			}
440 			return 0;
441 		}
442 
443 		case B_COLLATE_TERTIARY:
444 		case B_COLLATE_QUATERNARY:
445 		{
446 			// diacriticals can only change the order between equal strings
447 			int32 compare
448 				= Compare(a, b, length, B_COLLATE_PRIMARY, ignorePunctuation);
449 			if (compare != 0)
450 				return compare;
451 
452 			for (int32 i = 0; i < length; i++) {
453 				uint32 charA = GetNextChar(&a, contextA);
454 				uint32 charB = GetNextChar(&b, contextB);
455 
456 				// the two strings does have the same size when we get here
457 				if (charA == 0)
458 					return 0;
459 
460 				if (charA != charB)
461 					return (int32)charA - (int32)charB;
462 			}
463 			return 0;
464 		}
465 
466 		case B_COLLATE_IDENTICAL:
467 		default:
468 			return strncmp(a, b, length);
469 	}
470 }
471 
472 
473 status_t
474 BCollatorAddOn::Archive(BMessage *archive, bool deep) const
475 {
476 	return BArchivable::Archive(archive, deep);
477 }
478 
479 
480 BArchivable *
481 BCollatorAddOn::Instantiate(BMessage *archive)
482 {
483 	if (validate_instantiation(archive, "BCollatorAddOn"))
484 		return new BCollatorAddOn(archive);
485 
486 	return NULL;
487 }
488 
489