xref: /haiku/src/kits/locale/Collator.cpp (revision c9ad965c81b08802fed0827fd1dd16f45297928a)
1 /*
2 ** Copyright 2003, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3 ** Distributed under the terms of the OpenBeOS License.
4 */
5 
6 
7 #include <Collator.h>
8 #include <UnicodeChar.h>
9 #include <String.h>
10 #include <Message.h>
11 
12 #include <typeinfo>
13 #include <ctype.h>
14 
15 
16 // conversion array for character ranges 192 - 223 & 224 - 255
17 static const uint8 kNoDiacrits[] = {
18 	'a','a','a','a','a','a','a',
19 	'c',
20 	'e','e','e','e',
21 	'i','i','i','i',
22 	240,	// eth
23 	'n',
24 	'o','o','o','o','o',
25 	247,	//
26 	'o',
27 	'u','u','u','u',
28 	'y',
29 	254,	// thorn
30 	'y'
31 };
32 
33 
34 static inline uint32
35 getPrimaryChar(uint32 c)
36 {
37 	if (c < 0x80)
38 		return tolower(c);
39 
40 	// this automatically returns lowercase letters
41 	if (c >= 192 && c < 223)
42 		return kNoDiacrits[c - 192];
43 	if (c == 223)	// ß
44 		return 's';
45 	if (c >= 224 && c < 256)
46 		return kNoDiacrits[c - 224];
47 
48 	return BUnicodeChar::ToLower(c);
49 }
50 
51 
52 BCollatorAddOn::input_context::input_context(bool ignorePunctuation)
53 	:
54 	ignore_punctuation(ignorePunctuation),
55 	next_char(0),
56 	reserved1(0),
57 	reserved2(0)
58 {
59 }
60 
61 
62 //	#pragma mark -
63 
64 
65 BCollator::BCollator()
66 	:
67 	fCollatorImage(B_ERROR),
68 	fStrength(B_COLLATE_PRIMARY),
69 	fIgnorePunctuation(true)
70 {
71 	// ToDo: the collator construction will have to change; the default
72 	//	collator should be constructed by the Locale/LocaleRoster, so we
73 	//	only need a constructor where you specify all details
74 
75 	fCollator = new BCollatorAddOn();
76 }
77 
78 
79 BCollator::BCollator(BCollatorAddOn *collator, int8 strength,
80 	bool ignorePunctuation)
81 	:
82 	fCollator(collator),
83 	fCollatorImage(B_ERROR),
84 	fStrength(strength),
85 	fIgnorePunctuation(ignorePunctuation)
86 {
87 	if (collator == NULL)
88 		fCollator = new BCollatorAddOn();
89 }
90 
91 
92 BCollator::BCollator(BMessage *archive)
93 	: BArchivable(archive),
94 	fCollator(NULL),
95 	fCollatorImage(B_ERROR)
96 {
97 #if HAIKU_TARGET_PLATFORM_HAIKU
98 	int32 data;
99 	if (archive->FindInt32("loc:strength", &data) == B_OK)
100 		fStrength = (uint8)data;
101 	else
102 		fStrength = B_COLLATE_PRIMARY;
103 
104 	if (archive->FindBool("loc:punctuation", &fIgnorePunctuation) != B_OK)
105 		fIgnorePunctuation = true;
106 
107 	BMessage collatorArchive;
108 	if (archive->FindMessage("loc:collator", &collatorArchive) == B_OK) {
109 		BArchivable *unarchived =
110 			instantiate_object(&collatorArchive, &fCollatorImage);
111 
112 		// do we really have a BCollatorAddOn here?
113 		fCollator = dynamic_cast<BCollatorAddOn *>(unarchived);
114 		if (fCollator == NULL)
115 			delete unarchived;
116 	}
117 
118 	if (fCollator == NULL) {
119 		fCollator = new BCollatorAddOn();
120 		fCollatorImage = B_ERROR;
121 	}
122 #endif
123 }
124 
125 
126 BCollator::~BCollator()
127 {
128 	delete fCollator;
129 
130 	if (fCollatorImage >= B_OK)
131 		unload_add_on(fCollatorImage);
132 }
133 
134 
135 void
136 BCollator::SetDefaultStrength(int8 strength)
137 {
138 	fStrength = strength;
139 }
140 
141 
142 int8
143 BCollator::DefaultStrength() const
144 {
145 	return fStrength;
146 }
147 
148 
149 void
150 BCollator::SetIgnorePunctuation(bool ignore)
151 {
152 	fIgnorePunctuation = ignore;
153 }
154 
155 
156 bool
157 BCollator::IgnorePunctuation() const
158 {
159 	return fIgnorePunctuation;
160 }
161 
162 
163 status_t
164 BCollator::GetSortKey(const char *string, BString *key, int8 strength)
165 {
166 	if (strength == B_COLLATE_DEFAULT)
167 		strength = fStrength;
168 
169 	return fCollator->GetSortKey(string, key, strength, fIgnorePunctuation);
170 }
171 
172 
173 int
174 BCollator::Compare(const char *a, const char *b, int32 length, int8 strength)
175 {
176 	if (length == -1)	// match the whole string
177 		length = 0x7fffffff;
178 
179 	return fCollator->Compare(a, b, length,
180 				strength == B_COLLATE_DEFAULT ? fStrength : strength,
181 				fIgnorePunctuation);
182 }
183 
184 
185 status_t
186 BCollator::Archive(BMessage *archive, bool deep) const
187 {
188 	status_t status = BArchivable::Archive(archive, deep);
189 	if (status < B_OK)
190 		return status;
191 
192 	if (status == B_OK)
193 		status = archive->AddInt32("loc:strength", fStrength);
194 	if (status == B_OK)
195 		status = archive->AddBool("loc:punctuation", fIgnorePunctuation);
196 
197 	BMessage collatorArchive;
198 	if (status == B_OK && deep
199 		&& typeid(*fCollator) != typeid(BCollatorAddOn)
200 			// only archive subclasses from BCollatorAddOn
201 		&& (status = fCollator->Archive(&collatorArchive, true)) == B_OK)
202 		status = archive->AddMessage("loc:collator", &collatorArchive);
203 
204 	return status;
205 }
206 
207 
208 BArchivable *
209 BCollator::Instantiate(BMessage *archive)
210 {
211 	if (validate_instantiation(archive, "BCollator"))
212 		return new BCollator(archive);
213 
214 	return NULL;
215 }
216 
217 
218 //	#pragma mark -
219 
220 
221 BCollatorAddOn::BCollatorAddOn()
222 {
223 }
224 
225 
226 BCollatorAddOn::BCollatorAddOn(BMessage *archive)
227 	: BArchivable(archive)
228 {
229 }
230 
231 
232 BCollatorAddOn::~BCollatorAddOn()
233 {
234 }
235 
236 
237 /** This returns the next Unicode character from the UTF-8 encoded
238  *	input string, and bumps it to the next character.
239  *	It will ignore punctuation if specified by the context, and
240  *	might substitute characters if needed.
241  */
242 
243 uint32
244 BCollatorAddOn::GetNextChar(const char **string, input_context &context)
245 {
246 	uint32 c = context.next_char;
247 	if (c != 0) {
248 		context.next_char = 0;
249 		return c;
250 	}
251 
252 	do {
253 		c = BUnicodeChar::FromUTF8(string);
254 	} while (context.ignore_punctuation
255 		&& (BUnicodeChar::IsPunctuation(c) || BUnicodeChar::IsSpace(c)));
256 
257 	if (c == 223) {
258 		context.next_char = 's';
259 		return 's';
260 	}
261 
262 	return c;
263 }
264 
265 
266 /** Fills the specified buffer with the primary sort key. The buffer
267  *	has to be long enough to hold the key.
268  *	It returns the position in the buffer immediately after the key;
269  *	it does not add a terminating null byte!
270  */
271 
272 char *
273 BCollatorAddOn::PutPrimaryKey(const char *string, char *buffer, int32 length,
274 	bool ignorePunctuation)
275 {
276 	input_context context(ignorePunctuation);
277 
278 	uint32 c;
279 	for (int32 i = 0; (c = GetNextChar(&string, context)) != 0 && i < length;
280 		i++) {
281 		if (c < 0x80)
282 			*buffer++ = tolower(c);
283 		else
284 			BUnicodeChar::ToUTF8(getPrimaryChar(c), &buffer);
285 	}
286 
287 	return buffer;
288 }
289 
290 
291 size_t
292 BCollatorAddOn::PrimaryKeyLength(size_t length)
293 {
294 	return length * 2;
295 		// the primary key needs to make space for doubled characters (like 'ß')
296 }
297 
298 
299 status_t
300 BCollatorAddOn::GetSortKey(const char *string, BString *key, int8 strength,
301 	bool ignorePunctuation)
302 {
303 	if (strength >= B_COLLATE_QUATERNARY) {
304 		// the difference between tertiary and quaternary collation strength
305 		// are usually a different handling of punctuation characters
306 		ignorePunctuation = false;
307 	}
308 
309 	size_t length = strlen(string);
310 
311 	switch (strength) {
312 		case B_COLLATE_PRIMARY:
313 		{
314 			char *begin = key->LockBuffer(PrimaryKeyLength(length));
315 			if (begin == NULL)
316 				return B_NO_MEMORY;
317 
318 			char *end = PutPrimaryKey(string, begin, length, ignorePunctuation);
319 			*end = '\0';
320 
321 			key->UnlockBuffer(end - begin);
322 			break;
323 		}
324 
325 		case B_COLLATE_SECONDARY:
326 		{
327 			char *begin
328 				= key->LockBuffer(PrimaryKeyLength(length) + length + 1);
329 					// the primary key + the secondary key + separator char
330 			if (begin == NULL)
331 				return B_NO_MEMORY;
332 
333 			char *buffer = PutPrimaryKey(string, begin, length, ignorePunctuation);
334 			*buffer++ = '\01';
335 				// separator
336 
337 			input_context context(ignorePunctuation);
338 			uint32 c;
339 			for (uint32 i = 0;
340 				(c = GetNextChar(&string, context)) && i < length; i++) {
341 				if (c < 0x80)
342 					*buffer++ = tolower(c);
343 				else
344 					BUnicodeChar::ToUTF8(BUnicodeChar::ToLower(c), &buffer);
345 			}
346 			*buffer = '\0';
347 
348 			key->UnlockBuffer(buffer - begin);
349 			break;
350 		}
351 
352 		case B_COLLATE_TERTIARY:
353 		case B_COLLATE_QUATERNARY:
354 		{
355 			char *begin
356 				= key->LockBuffer(PrimaryKeyLength(length) + length + 1);
357 					// the primary key + the tertiary key + separator char
358 			if (begin == NULL)
359 				return B_NO_MEMORY;
360 
361 			char *buffer = PutPrimaryKey(string, begin, length, ignorePunctuation);
362 			*buffer++ = '\01';
363 				// separator
364 
365 			input_context context(ignorePunctuation);
366 			uint32 c;
367 			for (uint32 i = 0;
368 				(c = GetNextChar(&string, context)) && i < length; i++) {
369 				BUnicodeChar::ToUTF8(c, &buffer);
370 			}
371 			*buffer = '\0';
372 
373 			key->UnlockBuffer(buffer + length - begin);
374 			break;
375 		}
376 
377 		case B_COLLATE_IDENTICAL:
378 		default:
379 			key->SetTo(string, length);
380 				// is there any way to check if BString::SetTo() actually succeeded?
381 			break;
382 	}
383 	return B_OK;
384 }
385 
386 
387 int
388 BCollatorAddOn::Compare(const char *a, const char *b, int32 length,
389 	int8 strength, bool ignorePunctuation)
390 {
391 	if (strength >= B_COLLATE_QUATERNARY) {
392 		// the difference between tertiary and quaternary collation strength
393 		// are usually a different handling of punctuation characters
394 		ignorePunctuation = false;
395 	}
396 
397 	input_context contextA(ignorePunctuation);
398 	input_context contextB(ignorePunctuation);
399 
400 	switch (strength) {
401 		case B_COLLATE_PRIMARY:
402 		{
403 			for (int32 i = 0; i < length; i++) {
404 				uint32 charA = GetNextChar(&a, contextA);
405 				uint32 charB = GetNextChar(&b, contextB);
406 				if (charA == 0)
407 					return charB == 0 ? 0 : -(int32)charB;
408 				else if (charB == 0)
409 					return (int32)charA;
410 
411 				charA = getPrimaryChar(charA);
412 				charB = getPrimaryChar(charB);
413 
414 				if (charA != charB)
415 					return (int32)charA - (int32)charB;
416 			}
417 			return 0;
418 		}
419 
420 		case B_COLLATE_SECONDARY:
421 		{
422 			// diacriticals can only change the order between equal strings
423 			int32 compare
424 				= Compare(a, b, length, B_COLLATE_PRIMARY, ignorePunctuation);
425 			if (compare != 0)
426 				return compare;
427 
428 			for (int32 i = 0; i < length; i++) {
429 				uint32 charA = BUnicodeChar::ToLower(GetNextChar(&a, contextA));
430 				uint32 charB = BUnicodeChar::ToLower(GetNextChar(&b, contextB));
431 
432 				// the two strings does have the same size when we get here
433 				if (charA == 0)
434 					return 0;
435 
436 				if (charA != charB)
437 					return (int32)charA - (int32)charB;
438 			}
439 			return 0;
440 		}
441 
442 		case B_COLLATE_TERTIARY:
443 		case B_COLLATE_QUATERNARY:
444 		{
445 			// diacriticals can only change the order between equal strings
446 			int32 compare
447 				= Compare(a, b, length, B_COLLATE_PRIMARY, ignorePunctuation);
448 			if (compare != 0)
449 				return compare;
450 
451 			for (int32 i = 0; i < length; i++) {
452 				uint32 charA = GetNextChar(&a, contextA);
453 				uint32 charB = GetNextChar(&b, contextB);
454 
455 				// the two strings does have the same size when we get here
456 				if (charA == 0)
457 					return 0;
458 
459 				if (charA != charB)
460 					return (int32)charA - (int32)charB;
461 			}
462 			return 0;
463 		}
464 
465 		case B_COLLATE_IDENTICAL:
466 		default:
467 			return strncmp(a, b, length);
468 	}
469 }
470 
471 
472 status_t
473 BCollatorAddOn::Archive(BMessage *archive, bool deep) const
474 {
475 	return BArchivable::Archive(archive, deep);
476 }
477 
478 
479 BArchivable *
480 BCollatorAddOn::Instantiate(BMessage *archive)
481 {
482 	if (validate_instantiation(archive, "BCollatorAddOn"))
483 		return new BCollatorAddOn(archive);
484 
485 	return NULL;
486 }
487 
488