1 /*
2 * Copyright 2006-2014 Haiku, Inc. All Rights Reserved.
3 * Distributed under the terms of the MIT License.
4 *
5 * Authors:
6 * Stephan Aßmus <superstippi@gmx.de>
7 * Rene Gollent <rene@gollent.com>
8 * John Scipione <jscipione@gmail.com>
9 * Ingo Weinhold <bonefish@cs.tu-berlin.de>
10 */
11
12
13 #include "CLanguageTokenizer.h"
14
15 #include <ctype.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18
19
20 using CLanguage::ParseException;
21 using CLanguage::Token;
22 using CLanguage::Tokenizer;
23
24
25 // #pragma mark - Token
26
27
Token()28 Token::Token()
29 :
30 string(""),
31 type(TOKEN_NONE),
32 value(),
33 position(0)
34 {
35 }
36
37
// Copy constructor: duplicates the text, type, value and position of `other`.
Token::Token(const Token& other)
	: string(other.string), type(other.type), value(other.value),
	  position(other.position)
{
}
46
47
Token(const char * string,int32 length,int32 position,int32 type)48 Token::Token(const char* string, int32 length, int32 position, int32 type)
49 :
50 string(string, length),
51 type(type),
52 value(),
53 position(position)
54 {
55 }
56
57
58 Token&
operator =(const Token & other)59 Token::operator=(const Token& other)
60 {
61 string = other.string;
62 type = other.type;
63 value = other.value;
64 position = other.position;
65 return *this;
66 }
67
68
69 // #pragma mark - Tokenizer
70
71
Tokenizer()72 Tokenizer::Tokenizer()
73 :
74 fString(""),
75 fCurrentChar(NULL),
76 fCurrentToken(),
77 fReuseToken(false)
78 {
79 }
80
81
82 void
SetTo(const char * string)83 Tokenizer::SetTo(const char* string)
84 {
85 fString = string;
86 fCurrentChar = fString.String();
87 fCurrentToken = Token();
88 fReuseToken = false;
89 }
90
91
92 const Token&
NextToken()93 Tokenizer::NextToken()
94 {
95 if (fCurrentToken.type == TOKEN_END_OF_LINE)
96 return fCurrentToken;
97
98 if (fReuseToken) {
99 fReuseToken = false;
100 return fCurrentToken;
101 }
102
103 while (*fCurrentChar != 0 && isspace(*fCurrentChar))
104 fCurrentChar++;
105
106 if (*fCurrentChar == 0) {
107 return fCurrentToken = Token("", 0, _CurrentPos(),
108 TOKEN_END_OF_LINE);
109 }
110
111 bool decimal = *fCurrentChar == '.';
112
113 if (decimal || isdigit(*fCurrentChar)) {
114 if (*fCurrentChar == '0' && fCurrentChar[1] == 'x')
115 return _ParseHexOperand();
116
117 BString temp;
118
119 const char* begin = fCurrentChar;
120
121 // optional digits before the comma
122 while (isdigit(*fCurrentChar)) {
123 temp << *fCurrentChar;
124 fCurrentChar++;
125 }
126
127 // optional post decimal part
128 // (required if there are no digits before the decimal)
129 if (*fCurrentChar == '.') {
130 decimal = true;
131 temp << '.';
132 fCurrentChar++;
133
134 // optional post decimal digits
135 while (isdigit(*fCurrentChar)) {
136 temp << *fCurrentChar;
137 fCurrentChar++;
138 }
139 }
140
141 int32 length = fCurrentChar - begin;
142 if (length == 1 && decimal) {
143 // check for . operator
144 fCurrentChar = begin;
145 if (!_ParseOperator())
146 throw ParseException("unexpected character", _CurrentPos());
147
148 return fCurrentToken;
149 }
150
151 BString test = temp;
152 test << "&_";
153 double value;
154 char t[2];
155 int32 matches = sscanf(test.String(), "%lf&%s", &value, t);
156 if (matches != 2)
157 throw ParseException("error in constant", _CurrentPos() - length);
158
159 fCurrentToken = Token(begin, length, _CurrentPos() - length,
160 TOKEN_CONSTANT);
161 if (decimal)
162 fCurrentToken.value.SetTo(value);
163 else
164 fCurrentToken.value.SetTo((int64)strtoll(temp.String(), NULL, 10));
165 } else if (isalpha(*fCurrentChar) || *fCurrentChar == '_') {
166 const char* begin = fCurrentChar;
167 while (*fCurrentChar != 0 && (isalpha(*fCurrentChar)
168 || isdigit(*fCurrentChar) || *fCurrentChar == '_')) {
169 fCurrentChar++;
170 }
171 int32 length = fCurrentChar - begin;
172 fCurrentToken = Token(begin, length, _CurrentPos() - length,
173 TOKEN_IDENTIFIER);
174 } else if (*fCurrentChar == '"' || *fCurrentChar == '\'') {
175 bool terminatorFound = false;
176 const char* begin = fCurrentChar++;
177 while (*fCurrentChar != 0) {
178 if (*fCurrentChar == '\\') {
179 if (*(fCurrentChar++) != 0)
180 fCurrentChar++;
181 } else if (*(fCurrentChar++) == *begin) {
182 terminatorFound = true;
183 break;
184 }
185 }
186 int32 tokenType = TOKEN_STRING_LITERAL;
187 if (!terminatorFound) {
188 tokenType = *begin == '"' ? TOKEN_DOUBLE_QUOTE
189 : TOKEN_SINGLE_QUOTE;
190 fCurrentChar = begin + 1;
191 }
192
193 int32 length = fCurrentChar - begin;
194 fCurrentToken = Token(begin, length, _CurrentPos() - length,
195 tokenType);
196 } else {
197 if (!_ParseOperator()) {
198 int32 type = TOKEN_NONE;
199 switch (*fCurrentChar) {
200 case '\n':
201 type = TOKEN_END_OF_LINE;
202 break;
203
204 case '(':
205 type = TOKEN_OPENING_PAREN;
206 break;
207 case ')':
208 type = TOKEN_CLOSING_PAREN;
209 break;
210
211 case '[':
212 type = TOKEN_OPENING_SQUARE_BRACKET;
213 break;
214 case ']':
215 type = TOKEN_CLOSING_SQUARE_BRACKET;
216 break;
217
218 case '{':
219 type = TOKEN_OPENING_CURLY_BRACE;
220 break;
221 case '}':
222 type = TOKEN_CLOSING_CURLY_BRACE;
223 break;
224
225 case '\\':
226 type = TOKEN_BACKSLASH;
227 break;
228
229 case ':':
230 type = TOKEN_COLON;
231 break;
232
233 case ';':
234 type = TOKEN_SEMICOLON;
235 break;
236
237 case ',':
238 type = TOKEN_COMMA;
239 break;
240
241 case '.':
242 type = TOKEN_PERIOD;
243 break;
244
245 case '#':
246 type = TOKEN_POUND;
247 break;
248
249 default:
250 throw ParseException("unexpected character",
251 _CurrentPos());
252 }
253 fCurrentToken = Token(fCurrentChar, 1, _CurrentPos(),
254 type);
255 fCurrentChar++;
256 }
257 }
258
259 return fCurrentToken;
260 }
261
262
263 bool
_ParseOperator()264 Tokenizer::_ParseOperator()
265 {
266 int32 type = TOKEN_NONE;
267 int32 length = 0;
268 switch (*fCurrentChar) {
269 case '+':
270 type = TOKEN_PLUS;
271 length = 1;
272 break;
273
274 case '-':
275 if (_Peek() == '>') {
276 type = TOKEN_MEMBER_PTR;
277 length = 2;
278 } else {
279 type = TOKEN_MINUS;
280 length = 1;
281 }
282 break;
283
284 case '*':
285 switch (_Peek()) {
286 case '/':
287 type = TOKEN_END_COMMENT_BLOCK;
288 length = 2;
289 break;
290 default:
291 type = TOKEN_STAR;
292 length = 1;
293 break;
294 }
295 break;
296
297 case '/':
298 switch (_Peek()) {
299 case '*':
300 type = TOKEN_BEGIN_COMMENT_BLOCK;
301 length = 2;
302 break;
303 case '/':
304 type = TOKEN_INLINE_COMMENT;
305 length = 2;
306 break;
307 default:
308 type = TOKEN_SLASH;
309 length = 1;
310 break;
311 }
312 break;
313
314 case '%':
315 type = TOKEN_MODULO;
316 length = 1;
317 break;
318
319 case '^':
320 type = TOKEN_BITWISE_XOR;
321 length = 1;
322 break;
323
324 case '&':
325 if (_Peek() == '&') {
326 type = TOKEN_LOGICAL_AND;
327 length = 2;
328 } else {
329 type = TOKEN_BITWISE_AND;
330 length = 1;
331 }
332 break;
333
334 case '|':
335 if (_Peek() == '|') {
336 type = TOKEN_LOGICAL_OR;
337 length = 2;
338 } else {
339 type = TOKEN_BITWISE_OR;
340 length = 1;
341 }
342 break;
343
344 case '!':
345 if (_Peek() == '=') {
346 type = TOKEN_NE;
347 length = 2;
348 } else {
349 type = TOKEN_LOGICAL_NOT;
350 length = 1;
351 }
352 break;
353
354 case '=':
355 if (_Peek() == '=') {
356 type = TOKEN_EQ;
357 length = 2;
358 } else {
359 type = TOKEN_ASSIGN;
360 length = 1;
361 }
362 break;
363
364 case '>':
365 if (_Peek() == '=') {
366 type = TOKEN_GE;
367 length = 2;
368 } else {
369 type = TOKEN_GT;
370 length = 1;
371 }
372 break;
373
374 case '<':
375 if (_Peek() == '=') {
376 type = TOKEN_LE;
377 length = 2;
378 } else {
379 type = TOKEN_LT;
380 length = 1;
381 }
382 break;
383
384 case '~':
385 type = TOKEN_BITWISE_NOT;
386 length = 1;
387 break;
388
389
390 case '?':
391 type = TOKEN_CONDITION;
392 length = 1;
393 break;
394
395 case '.':
396 type = TOKEN_MEMBER_PTR;
397 length = 1;
398 break;
399
400 default:
401 break;
402 }
403
404 if (length == 0)
405 return false;
406
407 fCurrentToken = Token(fCurrentChar, length, _CurrentPos(), type);
408 fCurrentChar += length;
409
410 return true;
411 }
412
413
414 void
RewindToken()415 Tokenizer::RewindToken()
416 {
417 fReuseToken = true;
418 }
419
420
421 char
_Peek() const422 Tokenizer::_Peek() const
423 {
424 if (_CurrentPos() < fString.Length())
425 return *(fCurrentChar + 1);
426
427 return '\0';
428 }
429
430
431 /*static*/ bool
_IsHexDigit(char c)432 Tokenizer::_IsHexDigit(char c)
433 {
434 return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
435 }
436
437
438 Token&
_ParseHexOperand()439 Tokenizer::_ParseHexOperand()
440 {
441 const char* begin = fCurrentChar;
442 fCurrentChar += 2;
443 // skip "0x"
444
445 if (!_IsHexDigit(*fCurrentChar))
446 throw ParseException("expected hex digit", _CurrentPos());
447
448 fCurrentChar++;
449 while (_IsHexDigit(*fCurrentChar))
450 fCurrentChar++;
451
452 int32 length = fCurrentChar - begin;
453 fCurrentToken = Token(begin, length, _CurrentPos() - length,
454 TOKEN_CONSTANT);
455
456 if (length <= 10) {
457 // including the leading 0x, a 32-bit constant will be at most
458 // 10 characters. Anything larger, and 64 is necessary.
459 fCurrentToken.value.SetTo((uint32)strtoul(
460 fCurrentToken.string.String(), NULL, 16));
461 } else {
462 fCurrentToken.value.SetTo((uint64)strtoull(
463 fCurrentToken.string.String(), NULL, 16));
464 }
465 return fCurrentToken;
466 }
467
468
469 int32
_CurrentPos() const470 Tokenizer::_CurrentPos() const
471 {
472 return fCurrentChar - fString.String();
473 }
474