private/interface/utf8_functions.h

/*
 * Copyright 2004-2006, Haiku, Inc.
 * Distributed under the terms of the MIT License.
 */
#ifndef _UTF8_FUNCTIONS_H
#define _UTF8_FUNCTIONS_H


#include <SupportDefs.h>


static inline bool
IsInsideGlyph(uchar ch)
{
	return (ch & 0xc0) == 0x80;
}


static inline uint32
UTF8NextCharLenUnsafe(const char *text)
{
	const char *ptr = text;

	do {
		ptr++;
	} while (IsInsideGlyph(*ptr));

	return ptr - text;
}


static inline uint32
UTF8NextCharLen(const char *text)
{
	if (text == NULL || *text == 0)
		return 0;

	return UTF8NextCharLenUnsafe(text);
}


static inline uint32
UTF8PreviousCharLen(const char *text, const char *limit)
{
	const char *ptr = text;

	if (ptr == NULL || limit == NULL)
		return 0;

	do {
		if (ptr == limit)
			break;
		ptr--;
	} while (IsInsideGlyph(*ptr));

	return text - ptr;
}


/*!	UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
	numChars characters are read. If numChars is a negative value it is ignored
	and the string is read up to the terminating 0.
*/
static inline uint32
UTF8CountBytes(const char *bytes, int32 numChars)
{
	if (!bytes)
		return 0;

	if (numChars < 0)
		numChars = INT_MAX;

	const char *base = bytes;
	while (*bytes && numChars-- > 0) {
		if (bytes[0] & 0x80) {
			if (bytes[0] & 0x40) {
				if (bytes[0] & 0x20) {
					if (bytes[0] & 0x10) {
						if (bytes[1] == 0 || bytes[2] == 0 || bytes[3] == 0)
							return (bytes - base);

						bytes += 4;
						continue;
					}

					if (bytes[1] == 0 || bytes[2] == 0)
						return (bytes - base);

					bytes += 3;
					continue;
				}

				if (bytes[1] == 0)
					return (bytes - base);

				bytes += 2;
				continue;
			}

			/* Not a startbyte - skip */
			bytes += 1;
			continue;
		}

		bytes += 1;
	}

	return (bytes - base);
}


/*!	UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
	numBytes bytes are read. If numBytes is a negative value it is ignored
	and the string is read up to the terminating 0.
*/
static inline uint32
UTF8CountChars(const char *bytes, int32 numBytes)
{
	if (!bytes)
		return 0;

	uint32 length = 0;
	const char *last = bytes + numBytes - 1;
	if (numBytes < 0)
		last = (const char *)UINT_MAX;

	while (*bytes && bytes <= last) {
		if (bytes[0] & 0x80) {
			if (bytes[0] & 0x40) {
				if (bytes[0] & 0x20) {
					if (bytes[0] & 0x10) {
						if (bytes[1] == 0 || bytes[2] == 0 || bytes[3] == 0)
							return length;

						bytes += 4;
						length++;
						continue;
					}

					if (bytes[1] == 0 || bytes[2] == 0)
						return length;

					bytes += 3;
					length++;
					continue;
				}

				if (bytes[1] == 0)
					return length;

				bytes += 2;
				length++;
				continue;
			}

			/* Not a startbyte - skip */
			bytes += 1;
			continue;
		}

		bytes += 1;
		length++;
	}

	return length;
}


/*!	UTF8ToCharCode converts the input that includes potential multibyte chars
	to UTF-32 char codes that can be used by FreeType. The string pointer is
	then advanced to the next character in the string. In case the terminating
	0 is reached, the string pointer is not advanced anymore and spaces are
	returned. This makes it safe to overruns and enables streamed processing
	of UTF8 strings.
*/
static inline uint32
UTF8ToCharCode(const char **bytes)
{
	register uint32 result = 0;

	if ((*bytes)[0] & 0x80) {
		if ((*bytes)[0] & 0x40) {
			if ((*bytes)[0] & 0x20) {
				if ((*bytes)[0] & 0x10) {
					if ((*bytes)[0] & 0x08) {
						/*	A five byte char?!
							Something's wrong, substitute. */
						result += 0x20;
						(*bytes)++;
						return result;
					}

					if ((*bytes)[1] == 0 || (*bytes)[2] == 0 || (*bytes)[3] == 0)
						return 0x00;

					/* A four byte char */
					result += (*bytes)[0] & 0x07;
					result <<= 6;
					result += (*bytes)[1] & 0x3f;
					result <<= 6;
					result += (*bytes)[2] & 0x3f;
					result <<= 6;
					result += (*bytes)[3] & 0x3f;
					(*bytes) += 4;
					return result;
				}

				if ((*bytes)[1] == 0 || (*bytes)[2] == 0)
					return 0x00;

				/* A three byte char */
				result += (*bytes)[0] & 0x0f;
				result <<= 6;
				result += (*bytes)[1] & 0x3f;
				result <<= 6;
				result += (*bytes)[2] & 0x3f;
				(*bytes) += 3;
				return result;
			}

			if ((*bytes)[1] == 0)
				return 0x00;

			/* A two byte char */
			result += (*bytes)[0] & 0x1f;
			result <<= 6;
			result += (*bytes)[1] & 0x3f;
			(*bytes) += 2;
			return result;
		}

		/*	This (10) is not a startbyte.
			Substitute with a space. */
		result += 0x20;
		(*bytes)++;
		return result;
	}

	if ((*bytes)[0] == 0) {
		/*	We do not advance beyond the terminating 0. */
		return 0x00;
	}

	result += (*bytes)[0];
	(*bytes)++;
	return result;
}

#endif	// _UTF8_FUNCTIONS_H