1 #include "vterm_internal.h"
2
// Codepoint stored in the output wherever the input cannot be decoded
#define UNICODE_INVALID 0xFFFD

// Building with DEBUG set higher than 1 pulls in <stdio.h> and defines
// DEBUG_PRINT_UTF8, which enables the printf() tracing in the UTF-8 decoder
#if defined(DEBUG) && DEBUG > 1
# include <stdio.h>
# define DEBUG_PRINT_UTF8
#endif
9
/*
 * State of the UTF-8 decoder. It lives outside the decode function so that a
 * character whose bytes are split across two calls can still be assembled.
 */
struct UTF8DecoderData {
  // how many continuation bytes the character in progress still needs;
  // zero means no character is in progress
  int bytes_remaining;

  // encoded length of the character in progress, lead byte included;
  // consulted when the character completes to reject overlong encodings
  int bytes_total;

  // codepoint bits accumulated so far for the character in progress
  int this_cp;
};
20
/*
 * Puts the UTF-8 decoder state behind data_ into its idle condition: no
 * character in progress and no recorded sequence length. The accumulated
 * codepoint field is left as it is; it is overwritten by the next lead byte.
 * enc is not used.
 */
static void init_utf8(VTermEncoding *enc, void *data_)
{
  struct UTF8DecoderData *state = data_;

  state->bytes_total     = 0;
  state->bytes_remaining = 0;
}
28
/*
 * Incremental UTF-8 decoder.
 *
 * Reads bytes[*pos .. bytelen) and appends the decoded codepoints to
 * cp[*cpi .. cplen), advancing *pos past every byte consumed and *cpi past
 * every codepoint stored. Decoding stops when
 *   - the input is exhausted,
 *   - the output is full, or
 *   - a byte below 0x20 is met; that byte is left unconsumed and the decoder
 *     state is not touched.
 *
 * The state of a partially-received character is kept in data_ (a
 * struct UTF8DecoderData), so a sequence may continue in a later call.
 *
 * UNICODE_INVALID is stored for: a continuation byte with no sequence in
 * progress, a sequence cut short by an ASCII or lead byte, an overlong
 * encoding, the surrogate range U+D800..U+DFFF, U+FFFE, U+FFFF, and the bytes
 * 0xFE / 0xFF. Sequences of up to six bytes are accepted.
 *
 * Nothing is ever stored at or beyond cp[cplen]. enc is not used.
 */
static void decode_utf8(VTermEncoding *enc, void *data_,
                        uint32_t cp[], int *cpi, int cplen,
                        const char bytes[], size_t *pos, size_t bytelen)
{
  struct UTF8DecoderData *data = data_;

#ifdef DEBUG_PRINT_UTF8
  printf("BEGIN UTF-8\n");
#endif

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos];

#ifdef DEBUG_PRINT_UTF8
    printf(" pos=%zu c=%02x rem=%d\n", *pos, c, data->bytes_remaining);
#endif

    if(c < 0x20)
      // Not text: stop here with *pos still on this byte
      return;

    else if(c < 0x80) {
      // Single-byte character
      if(data->bytes_remaining) {
        // It interrupts an unfinished sequence, which becomes one invalid
        // codepoint. The loop condition only guarantees room for a single
        // store, so if the replacement filled the buffer, stop before
        // consuming this byte; with the sequence now abandoned, the next call
        // decodes it as an ordinary character.
        data->bytes_remaining = 0;
        cp[(*cpi)++] = UNICODE_INVALID;
        if(*cpi >= cplen)
          return;
      }

      cp[(*cpi)++] = c;
#ifdef DEBUG_PRINT_UTF8
      printf(" UTF-8 char: U+%04x\n", c);
#endif
      data->bytes_remaining = 0;
    }

    else if(c < 0xc0) {
      // Continuation byte
      if(!data->bytes_remaining) {
        // ...with nothing to continue
        cp[(*cpi)++] = UNICODE_INVALID;
        continue;
      }

      data->this_cp <<= 6;
      data->this_cp |= c & 0x3f;
      data->bytes_remaining--;

      if(!data->bytes_remaining) {
#ifdef DEBUG_PRINT_UTF8
        printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total);
#endif
        // Check for overlong sequences: the value must need this many bytes
        switch(data->bytes_total) {
          case 2:
            if(data->this_cp <  0x0080) data->this_cp = UNICODE_INVALID;
            break;
          case 3:
            if(data->this_cp <  0x0800) data->this_cp = UNICODE_INVALID;
            break;
          case 4:
            if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID;
            break;
          case 5:
            if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID;
            break;
          case 6:
            if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID;
            break;
          default:
            break;
        }
        // Now look for plain invalid ones
        if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) ||
           data->this_cp == 0xFFFE ||
           data->this_cp == 0xFFFF)
          data->this_cp = UNICODE_INVALID;
#ifdef DEBUG_PRINT_UTF8
        printf(" char: U+%04x\n", data->this_cp);
#endif
        cp[(*cpi)++] = data->this_cp;
      }
    }

    else if(c < 0xfe) {
      // Lead byte of a 2- to 6-byte sequence. Starting a new sequence while
      // another is unfinished abandons the old one as invalid.
      if(data->bytes_remaining)
        cp[(*cpi)++] = UNICODE_INVALID;

      // The number of leading 1 bits gives the sequence length; the bits
      // below them are the top payload bits of the codepoint.
      if(c < 0xe0) {
        data->this_cp = c & 0x1f;
        data->bytes_total = 2;
      }
      else if(c < 0xf0) {
        data->this_cp = c & 0x0f;
        data->bytes_total = 3;
      }
      else if(c < 0xf8) {
        data->this_cp = c & 0x07;
        data->bytes_total = 4;
      }
      else if(c < 0xfc) {
        data->this_cp = c & 0x03;
        data->bytes_total = 5;
      }
      else {
        data->this_cp = c & 0x01;
        data->bytes_total = 6;
      }
      data->bytes_remaining = data->bytes_total - 1;
    }

    else {
      // 0xFE and 0xFF are never valid; any sequence in progress is left alone
      cp[(*cpi)++] = UNICODE_INVALID;
    }
  }
}
154
155 static VTermEncoding encoding_utf8 = {
156 .init = &init_utf8,
157 .decode = &decode_utf8,
158 };
159
/*
 * Decoder for plain US-ASCII: each printable byte maps to the codepoint of
 * the same value.
 *
 * Reads bytes[*pos .. bytelen) and appends to cp[*cpi .. cplen), advancing
 * *pos and *cpi. The top bit of the first byte selects which half of the
 * byte range this run is in; that bit is flipped off every byte before it is
 * examined, so a run of high-half bytes decodes like its low-half equivalent.
 * Decoding stops, leaving the offending byte unconsumed, as soon as a byte
 * (after the flip) falls outside 0x20..0x7F. enc and data are not used.
 */
static void decode_usascii(VTermEncoding *enc, void *data,
                           uint32_t cp[], int *cpi, int cplen,
                           const char bytes[], size_t *pos, size_t bytelen)
{
  // No input: return before peeking at bytes[*pos], which would lie past the
  // end of the buffer
  if(*pos >= bytelen)
    return;

  int is_gr = bytes[*pos] & 0x80;

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos] ^ is_gr;

    if(c < 0x20 || c >= 0x80)
      return;

    cp[(*cpi)++] = c;
  }
}
175
176 static VTermEncoding encoding_usascii = {
177 .decode = &decode_usascii,
178 };
179
180 struct StaticTableEncoding {
181 const VTermEncoding enc;
182 const uint32_t chars[128];
183 };
184
/*
 * Decoder for table-driven 7-bit character sets.
 *
 * enc must point at the VTermEncoding embedded at the start of a
 * struct StaticTableEncoding; its chars[] table supplies the codepoint for
 * each byte, with a zero entry meaning "use the byte value unchanged".
 *
 * Reads bytes[*pos .. bytelen) and appends to cp[*cpi .. cplen), advancing
 * *pos and *cpi. The top bit of the first byte selects which half of the
 * byte range this run is in; that bit is flipped off every byte before it is
 * examined. Decoding stops, leaving the offending byte unconsumed, as soon as
 * a byte (after the flip) falls outside 0x20..0x7F. data is not used.
 */
static void decode_table(VTermEncoding *enc, void *data,
                         uint32_t cp[], int *cpi, int cplen,
                         const char bytes[], size_t *pos, size_t bytelen)
{
  struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc;

  // No input: return before peeking at bytes[*pos], which would lie past the
  // end of the buffer
  if(*pos >= bytelen)
    return;

  int is_gr = bytes[*pos] & 0x80;

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos] ^ is_gr;

    if(c < 0x20 || c >= 0x80)
      return;

    if(table->chars[c])
      cp[(*cpi)++] = table->chars[c];
    else
      cp[(*cpi)++] = c;
  }
}
204
205 #include "encoding/DECdrawing.inc"
206 #include "encoding/uk.inc"
207
208 static struct {
209 VTermEncodingType type;
210 char designation;
211 VTermEncoding *enc;
212 }
213 encodings[] = {
214 { ENC_UTF8, 'u', &encoding_utf8 },
215 { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing },
216 { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk },
217 { ENC_SINGLE_94, 'B', &encoding_usascii },
218 { 0 },
219 };
220
221 /* This ought to be INTERNAL but isn't because it's used by unit testing */
vterm_lookup_encoding(VTermEncodingType type,char designation)222 VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation)
223 {
224 int i;
225 for(i = 0; encodings[i].designation; i++)
226 if(encodings[i].type == type && encodings[i].designation == designation)
227 return encodings[i].enc;
228 return NULL;
229 }
230