/* Source: /haiku/src/apps/serialconnect/libvterm/src/encoding.c (revision c95da8e7006d9002c43a94b4525750ba7467e9e6) */
1 #include "vterm_internal.h"
2 
3 #define UNICODE_INVALID 0xFFFD
4 
5 #if defined(DEBUG) && DEBUG > 1
6 # include <stdio.h>
7 # define DEBUG_PRINT_UTF8
8 #endif
9 
/*
 * Decoder state for decode_utf8(). It lives outside the function so that a
 * multi-byte sequence may be split across two calls.
 */
struct UTF8DecoderData {
  /* Continuation bytes still expected before the current codepoint is
   * complete; 0 means no sequence is in progress. */
  int bytes_remaining;

  /* Full encoded length, in bytes, of the codepoint being assembled.
   * Consulted when the sequence completes, to detect overlong encodings. */
  int bytes_total;

  /* Value bits accumulated so far for the codepoint being assembled. */
  int this_cp;
};
20 
/*
 * Reset a UTF-8 decoder to the "no sequence in progress" state.
 *
 * enc   - unused here.
 * data_ - points at the struct UTF8DecoderData to reset.
 *
 * Only the two byte counters are cleared. this_cp is left untouched;
 * decode_utf8() assigns it whenever it sees a lead byte, before reading it.
 */
static void init_utf8(VTermEncoding *enc, void *data_)
{
  struct UTF8DecoderData *state = data_;

  state->bytes_total     = 0;
  state->bytes_remaining = 0;
}
28 
/*
 * Incrementally decode UTF-8 text into Unicode codepoints.
 *
 * enc     - unused here.
 * data_   - struct UTF8DecoderData carrying the state of a partially
 *           received sequence between calls.
 * cp      - output array of codepoints.
 * cpi     - in/out: index of the next free slot in cp[].
 * cplen   - capacity of cp[]; no slot at index >= cplen is ever written.
 * bytes   - input bytes.
 * pos     - in/out: index of the next unread byte; advanced past every byte
 *           that has been consumed.
 * bytelen - number of valid bytes in bytes[].
 *
 * Decoding stops when the input is exhausted, when cp[] is full, or at the
 * first byte below 0x20, which is left unconsumed. The caller can therefore
 * see *pos < bytelen on return and is expected to cope with that.
 *
 * Malformed input is reported in-band as U+FFFD (UNICODE_INVALID): stray
 * continuation bytes, sequences cut short by another byte, the bytes
 * 0xFE/0xFF, overlong encodings, surrogates (U+D800..U+DFFF), U+FFFE and
 * U+FFFF.
 *
 * The original 5- and 6-byte forms are still accepted, so values above
 * U+10FFFF are passed through unchanged.
 */
static void decode_utf8(VTermEncoding *enc, void *data_,
                        uint32_t cp[], int *cpi, int cplen,
                        const char bytes[], size_t *pos, size_t bytelen)
{
  struct UTF8DecoderData *data = data_;

#ifdef DEBUG_PRINT_UTF8
  printf("BEGIN UTF-8\n");
#endif

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos];

#ifdef DEBUG_PRINT_UTF8
    printf(" pos=%zu c=%02x rem=%d\n", *pos, c, data->bytes_remaining);
#endif

    // Bytes below 0x20 are not text; leave them unconsumed for the caller
    if(c < 0x20)
      return;

    // Single-byte (7-bit) character
    else if(c >= 0x20 && c < 0x80) {
      if(data->bytes_remaining) {
        // The sequence in progress was cut short by this byte
        cp[(*cpi)++] = UNICODE_INVALID;
        data->bytes_remaining = 0;

        // This branch needs two output slots but the loop condition only
        // guaranteed one. If the marker used the last slot, stop WITHOUT
        // consuming this byte (returning skips the loop's (*pos)++) so the
        // next call decodes it, rather than writing past the end of cp[].
        if(*cpi >= cplen)
          return;
      }

      cp[(*cpi)++] = c;
#ifdef DEBUG_PRINT_UTF8
      printf(" UTF-8 char: U+%04x\n", c);
#endif
    }

    // Continuation byte: 10xxxxxx
    else if(c >= 0x80 && c < 0xc0) {
      if(!data->bytes_remaining) {
        // Continuation byte with no sequence in progress
        cp[(*cpi)++] = UNICODE_INVALID;
        continue;
      }

      // Append the six payload bits
      data->this_cp <<= 6;
      data->this_cp |= c & 0x3f;
      data->bytes_remaining--;

      if(!data->bytes_remaining) {
#ifdef DEBUG_PRINT_UTF8
        printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total);
#endif
        // Check for overlong sequences: the value must be too large to fit
        // in a shorter encoding
        switch(data->bytes_total) {
        case 2:
          if(data->this_cp <  0x0080) data->this_cp = UNICODE_INVALID;
          break;
        case 3:
          if(data->this_cp <  0x0800) data->this_cp = UNICODE_INVALID;
          break;
        case 4:
          if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID;
          break;
        case 5:
          if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID;
          break;
        case 6:
          if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID;
          break;
        default:
          break;
        }
        // Now look for plain invalid ones
        if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) ||
           data->this_cp == 0xFFFE ||
           data->this_cp == 0xFFFF)
          data->this_cp = UNICODE_INVALID;
#ifdef DEBUG_PRINT_UTF8
        printf(" char: U+%04x\n", data->this_cp);
#endif
        cp[(*cpi)++] = data->this_cp;
      }
    }

    // Lead bytes. Each one starts a fresh sequence; if a previous sequence
    // was still incomplete it is reported as invalid first. Only one output
    // slot is ever used here, which the loop condition has already checked.

    // 110xxxxx: 2-byte sequence
    else if(c >= 0xc0 && c < 0xe0) {
      if(data->bytes_remaining)
        cp[(*cpi)++] = UNICODE_INVALID;

      data->this_cp = c & 0x1f;
      data->bytes_total = 2;
      data->bytes_remaining = 1;
    }

    // 1110xxxx: 3-byte sequence
    else if(c >= 0xe0 && c < 0xf0) {
      if(data->bytes_remaining)
        cp[(*cpi)++] = UNICODE_INVALID;

      data->this_cp = c & 0x0f;
      data->bytes_total = 3;
      data->bytes_remaining = 2;
    }

    // 11110xxx: 4-byte sequence
    else if(c >= 0xf0 && c < 0xf8) {
      if(data->bytes_remaining)
        cp[(*cpi)++] = UNICODE_INVALID;

      data->this_cp = c & 0x07;
      data->bytes_total = 4;
      data->bytes_remaining = 3;
    }

    // 111110xx: 5-byte sequence
    else if(c >= 0xf8 && c < 0xfc) {
      if(data->bytes_remaining)
        cp[(*cpi)++] = UNICODE_INVALID;

      data->this_cp = c & 0x03;
      data->bytes_total = 5;
      data->bytes_remaining = 4;
    }

    // 1111110x: 6-byte sequence
    else if(c >= 0xfc && c < 0xfe) {
      if(data->bytes_remaining)
        cp[(*cpi)++] = UNICODE_INVALID;

      data->this_cp = c & 0x01;
      data->bytes_total = 6;
      data->bytes_remaining = 5;
    }

    // 0xFE and 0xFF match none of the patterns above
    else {
      cp[(*cpi)++] = UNICODE_INVALID;
    }
  }
}
154 
155 static VTermEncoding encoding_utf8 = {
156   .init   = &init_utf8,
157   .decode = &decode_utf8,
158 };
159 
/*
 * Decode a run of printable 7-bit characters, one codepoint per byte.
 *
 * enc, data - unused here.
 * cp        - output array of codepoints.
 * cpi       - in/out: index of the next free slot in cp[].
 * cplen     - capacity of cp[].
 * bytes     - input bytes.
 * pos       - in/out: index of the next unread byte; advanced past every
 *             byte that has been consumed.
 * bytelen   - number of valid bytes in bytes[].
 *
 * The top bit of the first byte selects which half of the 8-bit range this
 * run belongs to. Every byte is XORed with that bit, so bytes from the same
 * half map onto 0x00..0x7F and bytes from the other half land at 0x80 or
 * above. The run ends, leaving the offending byte unconsumed, at the first
 * byte that maps outside 0x20..0x7F, or when the input or cp[] runs out.
 */
static void decode_usascii(VTermEncoding *enc, void *data,
                           uint32_t cp[], int *cpi, int cplen,
                           const char bytes[], size_t *pos, size_t bytelen)
{
  int is_gr;

  // With no input left there is no first byte; do not read bytes[*pos]
  // beyond the end of the buffer
  if(*pos >= bytelen)
    return;

  is_gr = bytes[*pos] & 0x80;

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos] ^ is_gr;

    if(c < 0x20 || c >= 0x80)
      return;

    cp[(*cpi)++] = c;
  }
}
175 
176 static VTermEncoding encoding_usascii = {
177   .decode = &decode_usascii,
178 };
179 
180 struct StaticTableEncoding {
181   const VTermEncoding enc;
182   const uint32_t chars[128];
183 };
184 
/*
 * Decode a run of printable 7-bit characters through a static translation
 * table.
 *
 * enc     - must point at the enc member of a struct StaticTableEncoding;
 *           it is cast back to that struct to reach the table.
 * data    - unused here.
 * cp      - output array of codepoints.
 * cpi     - in/out: index of the next free slot in cp[].
 * cplen   - capacity of cp[].
 * bytes   - input bytes.
 * pos     - in/out: index of the next unread byte; advanced past every byte
 *           that has been consumed.
 * bytelen - number of valid bytes in bytes[].
 *
 * Every byte is XORed with the top bit of the first byte of the run. The
 * run ends, leaving the offending byte unconsumed, at the first byte that
 * maps outside 0x20..0x7F, or when the input or cp[] runs out. A character
 * whose table entry is 0 is emitted unchanged.
 */
static void decode_table(VTermEncoding *enc, void *data,
                         uint32_t cp[], int *cpi, int cplen,
                         const char bytes[], size_t *pos, size_t bytelen)
{
  struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc;
  int is_gr;

  // With no input left there is no first byte; do not read bytes[*pos]
  // beyond the end of the buffer
  if(*pos >= bytelen)
    return;

  is_gr = bytes[*pos] & 0x80;

  for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    unsigned char c = bytes[*pos] ^ is_gr;

    // c is within 0x20..0x7F below this point, so it is a valid index
    // into the 128-entry table
    if(c < 0x20 || c >= 0x80)
      return;

    if(table->chars[c])
      cp[(*cpi)++] = table->chars[c];
    else
      cp[(*cpi)++] = c;
  }
}
204 
205 #include "encoding/DECdrawing.inc"
206 #include "encoding/uk.inc"
207 
208 static struct {
209   VTermEncodingType type;
210   char designation;
211   VTermEncoding *enc;
212 }
213 encodings[] = {
214   { ENC_UTF8,      'u', &encoding_utf8 },
215   { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing },
216   { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk },
217   { ENC_SINGLE_94, 'B', &encoding_usascii },
218   { 0 },
219 };
220 
221 /* This ought to be INTERNAL but isn't because it's used by unit testing */
vterm_lookup_encoding(VTermEncodingType type,char designation)222 VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation)
223 {
224   int i;
225   for(i = 0; encodings[i].designation; i++)
226     if(encodings[i].type == type && encodings[i].designation == designation)
227       return encodings[i].enc;
228   return NULL;
229 }
230