152b1d543SIngo Weinhold /* 252b1d543SIngo Weinhold * Copyright 2008, Ingo Weinhold, ingo_weinhold@gmx.de. 352b1d543SIngo Weinhold * Distributed under the terms of the MIT License. 452b1d543SIngo Weinhold */ 552b1d543SIngo Weinhold #ifndef UTF8_CHAR_H 652b1d543SIngo Weinhold #define UTF8_CHAR_H 752b1d543SIngo Weinhold 852b1d543SIngo Weinhold #include <ctype.h> 952b1d543SIngo Weinhold #include <string.h> 1052b1d543SIngo Weinhold 11c7047b8fSSiarzhuk Zharski #include <UnicodeChar.h> 12c7047b8fSSiarzhuk Zharski 1352b1d543SIngo Weinhold 1452b1d543SIngo Weinhold struct UTF8Char { 1552b1d543SIngo Weinhold char bytes[4]; 1652b1d543SIngo Weinhold UTF8CharUTF8Char1752b1d543SIngo Weinhold UTF8Char() 1852b1d543SIngo Weinhold { 1934a1a44dSSiarzhuk Zharski bytes[0] = 0; 2052b1d543SIngo Weinhold } 2152b1d543SIngo Weinhold UTF8CharUTF8Char2252b1d543SIngo Weinhold UTF8Char(char c) 2352b1d543SIngo Weinhold { 2452b1d543SIngo Weinhold bytes[0] = c; 2552b1d543SIngo Weinhold } 2652b1d543SIngo Weinhold UTF8CharUTF8Char27daebca78SSiarzhuk Zharski UTF8Char(const char* c) 28daebca78SSiarzhuk Zharski { 29daebca78SSiarzhuk Zharski SetTo(c, ByteCount(*c)); 30daebca78SSiarzhuk Zharski } 31daebca78SSiarzhuk Zharski UTF8CharUTF8Char32bdc33077SIngo Weinhold UTF8Char(const char* c, int32 count) 33bdc33077SIngo Weinhold { 34bdc33077SIngo Weinhold SetTo(c, count); 35bdc33077SIngo Weinhold } 36bdc33077SIngo Weinhold SetToUTF8Char37bdc33077SIngo Weinhold void SetTo(const char* c, int32 count) 38bdc33077SIngo Weinhold { 39bdc33077SIngo Weinhold bytes[0] = c[0]; 40bdc33077SIngo Weinhold if (count > 1) { 41bdc33077SIngo Weinhold bytes[1] = c[1]; 42bdc33077SIngo Weinhold if (count > 2) { 43bdc33077SIngo Weinhold bytes[2] = c[2]; 44bdc33077SIngo Weinhold if (count > 3) 45bdc33077SIngo Weinhold bytes[3] = c[3]; 46bdc33077SIngo Weinhold } 47bdc33077SIngo Weinhold } 48bdc33077SIngo Weinhold } 49bdc33077SIngo Weinhold ByteCountUTF8Char5052b1d543SIngo Weinhold static int32 ByteCount(char firstChar) 5152b1d543SIngo Weinhold { 5252b1d543SIngo Weinhold // Note, this does not recognize invalid chars 5382224430SStephan Aßmus uchar c = firstChar; 5452b1d543SIngo Weinhold if (c < 0x80) 5552b1d543SIngo Weinhold return 1; 5652b1d543SIngo Weinhold if (c < 0xe0) 5752b1d543SIngo Weinhold return 2; 5852b1d543SIngo Weinhold return c < 0xf0 ? 3 : 4; 5952b1d543SIngo Weinhold } 6052b1d543SIngo Weinhold ByteCountUTF8Char6152b1d543SIngo Weinhold int32 ByteCount() const 6252b1d543SIngo Weinhold { 6352b1d543SIngo Weinhold return ByteCount(bytes[0]); 6452b1d543SIngo Weinhold } 6552b1d543SIngo Weinhold IsFullWidthUTF8Char664c9d4b02SIngo Weinhold bool IsFullWidth() const 674c9d4b02SIngo Weinhold { 68*3d149248SSiarzhuk Zharski switch (BUnicodeChar::EastAsianWidth(BUnicodeChar::FromUTF8(bytes))) { 69*3d149248SSiarzhuk Zharski case B_UNICODE_EA_FULLWIDTH: 70*3d149248SSiarzhuk Zharski case B_UNICODE_EA_WIDE: 71*3d149248SSiarzhuk Zharski return true; 72*3d149248SSiarzhuk Zharski default: 73*3d149248SSiarzhuk Zharski break; 74*3d149248SSiarzhuk Zharski } 754c9d4b02SIngo Weinhold return false; 764c9d4b02SIngo Weinhold } 774c9d4b02SIngo Weinhold IsSpaceUTF8Char7852b1d543SIngo Weinhold bool IsSpace() const 7952b1d543SIngo Weinhold { 80c7047b8fSSiarzhuk Zharski return BUnicodeChar::IsSpace(BUnicodeChar::FromUTF8(bytes)); 81c7047b8fSSiarzhuk Zharski } 82c7047b8fSSiarzhuk Zharski IsAlNumUTF8Char83c7047b8fSSiarzhuk Zharski bool IsAlNum() const 84c7047b8fSSiarzhuk Zharski { 85c7047b8fSSiarzhuk Zharski return BUnicodeChar::IsAlNum(BUnicodeChar::FromUTF8(bytes)); 8652b1d543SIngo Weinhold } 8752b1d543SIngo Weinhold ToLowerUTF8Char8852b1d543SIngo Weinhold UTF8Char ToLower() const 8952b1d543SIngo Weinhold { 90c7047b8fSSiarzhuk Zharski uint32 c = BUnicodeChar::ToLower(BUnicodeChar::FromUTF8(bytes)); 9152b1d543SIngo Weinhold 92c7047b8fSSiarzhuk Zharski UTF8Char character; 93c7047b8fSSiarzhuk Zharski char* utf8 = character.bytes; 94c7047b8fSSiarzhuk Zharski BUnicodeChar::ToUTF8(c, &utf8); 95c7047b8fSSiarzhuk Zharski 96c7047b8fSSiarzhuk Zharski return character; 9752b1d543SIngo Weinhold } 9852b1d543SIngo Weinhold 9952b1d543SIngo Weinhold bool operator==(const UTF8Char& other) const 10052b1d543SIngo Weinhold { 10152b1d543SIngo Weinhold int32 byteCount = ByteCount(); 10252b1d543SIngo Weinhold bool equals = bytes[0] == other.bytes[0]; 10352b1d543SIngo Weinhold if (byteCount > 1 && equals) { 10452b1d543SIngo Weinhold equals = bytes[1] == other.bytes[1]; 10552b1d543SIngo Weinhold if (byteCount > 2 && equals) { 10652b1d543SIngo Weinhold equals = bytes[2] == other.bytes[2]; 10752b1d543SIngo Weinhold if (byteCount > 3 && equals) 10852b1d543SIngo Weinhold equals = bytes[3] == other.bytes[3]; 10952b1d543SIngo Weinhold } 11052b1d543SIngo Weinhold } 11152b1d543SIngo Weinhold return equals; 11252b1d543SIngo Weinhold } 11352b1d543SIngo Weinhold 11452b1d543SIngo Weinhold bool operator!=(const UTF8Char& other) const 11552b1d543SIngo Weinhold { 11652b1d543SIngo Weinhold return !(*this == other); 11752b1d543SIngo Weinhold } 11852b1d543SIngo Weinhold }; 11952b1d543SIngo Weinhold 12052b1d543SIngo Weinhold 12152b1d543SIngo Weinhold #endif // UTF8_CHAR_H 122