1 /* 2 * Copyright 2004-2006, Haiku, Inc. 3 * Distributed under the terms of the MIT License. 4 */ 5 #ifndef _UTF8_FUNCTIONS_H 6 #define _UTF8_FUNCTIONS_H 7 8 9 #include <SupportDefs.h> 10 11 12 static inline bool 13 IsInsideGlyph(uchar ch) 14 { 15 return (ch & 0xC0) == 0x80; 16 } 17 18 static inline uint32 19 UTF8NextCharLenUnsafe(const char *text) 20 { 21 const char *ptr = text; 22 23 do { 24 ptr++; 25 } while (IsInsideGlyph(*ptr)); 26 27 return ptr - text; 28 } 29 30 static inline uint32 31 UTF8NextCharLen(const char *text) 32 { 33 if (text == NULL || *text == 0) 34 return 0; 35 36 return UTF8NextCharLenUnsafe(text); 37 } 38 39 static inline uint32 40 UTF8PreviousCharLen(const char *text, const char *limit) 41 { 42 const char *ptr = text; 43 44 if (ptr == NULL || limit == NULL) 45 return 0; 46 47 do { 48 if (ptr == limit) 49 break; 50 ptr--; 51 } while (IsInsideGlyph(*ptr)); 52 53 return text - ptr; 54 } 55 56 // TODO: use this function in other places of this file... 57 static inline uint32 58 count_utf8_bytes(uchar ch) 59 { 60 // the number of high bits set until the first 61 // unset bit determine the count of bytes used for 62 // this glyph from this byte on 63 uchar bit = 1 << 7; 64 uint32 count = 1; 65 if (ch & bit) { 66 bit = bit >> 1; 67 while (ch & bit) { 68 count++; 69 bit = bit >> 1; 70 } 71 } 72 return count; 73 } 74 75 static inline uint32 76 UTF8CountBytes(const char *text, uint32 numChars) 77 { 78 if (text) { 79 // iterate over numChars glyphs incrementing ptr by the 80 // number of bytes for each glyph, which is encoded in 81 // the first byte of any glyph. 82 const char *ptr = text; 83 while (numChars--) { 84 ptr += count_utf8_bytes(*ptr); 85 } 86 return ptr - text; 87 } 88 return 0; 89 } 90 91 static inline uint32 92 UTF8CountChars(const char *text, int32 numBytes) 93 { 94 const char* ptr = text; 95 const char* last = ptr + numBytes - 1; 96 97 uint32 count = 0; 98 while (ptr <= last) { 99 ptr += UTF8NextCharLen(ptr); 100 count++; 101 } 102 103 return count; 104 } 105 106 107 /* UTF8ToCharCode converts the input that includes potential multibyte chars 108 to UTF-32 char codes that can be used by FreeType. The string pointer is 109 then advanced to the next character in the string. In case the terminating 110 0 is reached, the string pointer is not advanced anymore and spaces are 111 returned. This makes it safe to overruns and enables streamed processing 112 of UTF8 strings. */ 113 static inline uint32 114 UTF8ToCharCode(const char **bytes) 115 { 116 register uint32 result = 0; 117 118 if ((*bytes)[0] & 0x80) { 119 if ((*bytes)[0] & 0x40) { 120 if ((*bytes)[0] & 0x20) { 121 if ((*bytes)[0] & 0x10) { 122 if ((*bytes)[0] & 0x08) { 123 /* A five byte char?! 124 Something's wrong, substitute. */ 125 result += 0x20; 126 (*bytes)++; 127 return result; 128 } 129 130 /* A four byte char */ 131 result += (*bytes)[0] & 0x07; 132 result <<= 6; 133 result += (*bytes)[1] & 0x3f; 134 result <<= 6; 135 result += (*bytes)[2] & 0x3f; 136 result <<= 6; 137 result += (*bytes)[3] & 0x3f; 138 (*bytes) += 3; 139 return result; 140 } 141 142 /* A three byte char */ 143 result += (*bytes)[0] & 0x0f; 144 result <<= 6; 145 result += (*bytes)[1] & 0x3f; 146 result <<= 6; 147 result += (*bytes)[2] & 0x3f; 148 (*bytes) += 3; 149 return result; 150 } 151 152 /* A two byte char */ 153 result += (*bytes)[0] & 0x1f; 154 result <<= 6; 155 result += (*bytes)[1] & 0x3f; 156 (*bytes) += 2; 157 return result; 158 } 159 160 /* This (10) is not a startbyte. 161 Substitute with a space. */ 162 result += 0x20; 163 (*bytes)++; 164 return result; 165 } 166 167 if ((*bytes)[0] == 0) { 168 /* We do not advance beyond the terminating 0. */ 169 return 0x00; 170 } 171 172 result += (*bytes)[0]; 173 (*bytes)++; 174 return result; 175 } 176 177 178 /* UTF8ToLength works like strlen() but takes UTF8 encoded multibyte chars 179 into account. It's a quicker version of UTF8CountChars above. */ 180 static inline int32 181 UTF8ToLength(const char *bytes) 182 { 183 int32 length = 0; 184 while (*bytes) { 185 length++; 186 187 if (bytes[0] & 0x80) { 188 if (bytes[0] & 0x40) { 189 if (bytes[0] & 0x20) { 190 if (bytes[0] & 0x10) { 191 bytes += 4; 192 continue; 193 } 194 195 bytes += 3; 196 continue; 197 } 198 199 bytes += 2; 200 continue; 201 } 202 203 /* Not a startbyte - skip */ 204 } 205 206 bytes += 1; 207 } 208 209 return length; 210 } 211 212 #endif // _UTF8_FUNCTIONS_H 213