xref: /haiku/src/kits/debugger/source_language/c_family/CLanguageTokenizer.cpp (revision 1a76488fc88584bf66b9751d7fb9b6527ac20d87)
1 /*
2  * Copyright 2006-2014 Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Stephan Aßmus <superstippi@gmx.de>
7  *		Rene Gollent <rene@gollent.com>
8  *		John Scipione <jscipione@gmail.com>
9  *		Ingo Weinhold <bonefish@cs.tu-berlin.de>
10  */
11 
12 
13 #include "CLanguageTokenizer.h"
14 
15 #include <ctype.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 
19 
20 using CLanguage::ParseException;
21 using CLanguage::Token;
22 using CLanguage::Tokenizer;
23 
24 
25 // #pragma mark - Token
26 
27 
28 Token::Token()
29 	:
30 	string(""),
31 	type(TOKEN_NONE),
32 	value(),
33 	position(0)
34 {
35 }
36 
37 
38 Token::Token(const Token& other)
39 	:
40 	string(other.string),
41 	type(other.type),
42 	value(other.value),
43 	position(other.position)
44 {
45 }
46 
47 
48 Token::Token(const char* string, int32 length, int32 position, int32 type)
49 	:
50 	string(string, length),
51 	type(type),
52 	value(),
53 	position(position)
54 {
55 }
56 
57 
58 Token&
59 Token::operator=(const Token& other)
60 {
61 	string = other.string;
62 	type = other.type;
63 	value = other.value;
64 	position = other.position;
65 	return *this;
66 }
67 
68 
69 // #pragma mark - Tokenizer
70 
71 
72 Tokenizer::Tokenizer()
73 	:
74 	fString(""),
75 	fCurrentChar(NULL),
76 	fCurrentToken(),
77 	fReuseToken(false)
78 {
79 }
80 
81 
82 void
83 Tokenizer::SetTo(const char* string)
84 {
85 	fString = string;
86 	fCurrentChar = fString.String();
87 	fCurrentToken = Token();
88 	fReuseToken = false;
89 }
90 
91 
92 const Token&
93 Tokenizer::NextToken()
94 {
95 	if (fCurrentToken.type == TOKEN_END_OF_LINE)
96 		return fCurrentToken;
97 
98 	if (fReuseToken) {
99 		fReuseToken = false;
100 		return fCurrentToken;
101 	}
102 
103 	while (*fCurrentChar != 0 && isspace(*fCurrentChar))
104 		fCurrentChar++;
105 
106 	if (*fCurrentChar == 0) {
107 		return fCurrentToken = Token("", 0, _CurrentPos(),
108 			TOKEN_END_OF_LINE);
109 	}
110 
111 	bool decimal = *fCurrentChar == '.';
112 
113 	if (decimal || isdigit(*fCurrentChar)) {
114 		if (*fCurrentChar == '0' && fCurrentChar[1] == 'x')
115 			return _ParseHexOperand();
116 
117 		BString temp;
118 
119 		const char* begin = fCurrentChar;
120 
121 		// optional digits before the comma
122 		while (isdigit(*fCurrentChar)) {
123 			temp << *fCurrentChar;
124 			fCurrentChar++;
125 		}
126 
127 		// optional post decimal part
128 		// (required if there are no digits before the decimal)
129 		if (*fCurrentChar == '.') {
130 			decimal = true;
131 			temp << '.';
132 			fCurrentChar++;
133 
134 			// optional post decimal digits
135 			while (isdigit(*fCurrentChar)) {
136 				temp << *fCurrentChar;
137 				fCurrentChar++;
138 			}
139 		}
140 
141 		int32 length = fCurrentChar - begin;
142 		if (length == 1 && decimal) {
143 			// check for . operator
144 			fCurrentChar = begin;
145 			if (!_ParseOperator())
146 				throw ParseException("unexpected character", _CurrentPos());
147 
148 			return fCurrentToken;
149 		}
150 
151 		BString test = temp;
152 		test << "&_";
153 		double value;
154 		char t[2];
155 		int32 matches = sscanf(test.String(), "%lf&%s", &value, t);
156 		if (matches != 2)
157 			throw ParseException("error in constant", _CurrentPos() - length);
158 
159 		fCurrentToken = Token(begin, length, _CurrentPos() - length,
160 			TOKEN_CONSTANT);
161 		if (decimal)
162 			fCurrentToken.value.SetTo(value);
163 		else
164 			fCurrentToken.value.SetTo((int64)strtoll(temp.String(), NULL, 10));
165 	} else if (isalpha(*fCurrentChar) || *fCurrentChar == '_') {
166 		const char* begin = fCurrentChar;
167 		while (*fCurrentChar != 0 && (isalpha(*fCurrentChar)
168 			|| isdigit(*fCurrentChar) || *fCurrentChar == '_')) {
169 			fCurrentChar++;
170 		}
171 		int32 length = fCurrentChar - begin;
172 		fCurrentToken = Token(begin, length, _CurrentPos() - length,
173 			TOKEN_IDENTIFIER);
174 	} else if (*fCurrentChar == '"' || *fCurrentChar == '\'') {
175 		bool terminatorFound = false;
176 		const char* begin = fCurrentChar++;
177 		while (*fCurrentChar != 0) {
178 			if (*fCurrentChar == '\\') {
179 				if (*(fCurrentChar++) != 0)
180 					fCurrentChar++;
181 			} else if (*(fCurrentChar++) == *begin) {
182 				terminatorFound = true;
183 				break;
184 			}
185 		}
186 		int32 tokenType = TOKEN_STRING_LITERAL;
187 		if (!terminatorFound) {
188 			tokenType = *begin == '"' ? TOKEN_DOUBLE_QUOTE
189 					: TOKEN_SINGLE_QUOTE;
190 			fCurrentChar = begin + 1;
191 		}
192 
193 		int32 length = fCurrentChar - begin;
194 		fCurrentToken = Token(begin, length, _CurrentPos() - length,
195 			tokenType);
196 	} else {
197 		if (!_ParseOperator()) {
198 			int32 type = TOKEN_NONE;
199 			switch (*fCurrentChar) {
200 				case '\n':
201 					type = TOKEN_END_OF_LINE;
202 					break;
203 
204 				case '(':
205 					type = TOKEN_OPENING_PAREN;
206 					break;
207 				case ')':
208 					type = TOKEN_CLOSING_PAREN;
209 					break;
210 
211 				case '[':
212 					type = TOKEN_OPENING_SQUARE_BRACKET;
213 					break;
214 				case ']':
215 					type = TOKEN_CLOSING_SQUARE_BRACKET;
216 					break;
217 
218 				case '{':
219 					type = TOKEN_OPENING_CURLY_BRACE;
220 					break;
221 				case '}':
222 					type = TOKEN_CLOSING_CURLY_BRACE;
223 					break;
224 
225 				case '\\':
226 					type = TOKEN_BACKSLASH;
227 					break;
228 
229 				case ':':
230 					type = TOKEN_COLON;
231 					break;
232 
233 				case ';':
234 					type = TOKEN_SEMICOLON;
235 					break;
236 
237 				case ',':
238 					type = TOKEN_COMMA;
239 					break;
240 
241 				case '.':
242 					type = TOKEN_PERIOD;
243 					break;
244 
245 				case '#':
246 					type = TOKEN_POUND;
247 					break;
248 
249 				default:
250 					throw ParseException("unexpected character",
251 						_CurrentPos());
252 			}
253 			fCurrentToken = Token(fCurrentChar, 1, _CurrentPos(),
254 				type);
255 			fCurrentChar++;
256 		}
257 	}
258 
259 	return fCurrentToken;
260 }
261 
262 
263 bool
264 Tokenizer::_ParseOperator()
265 {
266 	int32 type = TOKEN_NONE;
267 	int32 length = 0;
268 	switch (*fCurrentChar) {
269 		case '+':
270 			type = TOKEN_PLUS;
271 			length = 1;
272 			break;
273 
274 		case '-':
275 			 if (_Peek() == '>') {
276 			 	type = TOKEN_MEMBER_PTR;
277 			 	length = 2;
278 			 } else {
279 				type = TOKEN_MINUS;
280 				length = 1;
281 			 }
282 			break;
283 
284 		case '*':
285 			switch (_Peek()) {
286 				case '/':
287 					type = TOKEN_END_COMMENT_BLOCK;
288 					length = 2;
289 					break;
290 				default:
291 					type = TOKEN_STAR;
292 					length = 1;
293 					break;
294 			}
295 			break;
296 
297 		case '/':
298 			switch (_Peek()) {
299 				case '*':
300 					type = TOKEN_BEGIN_COMMENT_BLOCK;
301 					length = 2;
302 					break;
303 				case '/':
304 					type = TOKEN_INLINE_COMMENT;
305 					length = 2;
306 					break;
307 				default:
308 					type = TOKEN_SLASH;
309 					length = 1;
310 					break;
311 			}
312 			break;
313 
314 		case '%':
315 			type = TOKEN_MODULO;
316 			length = 1;
317 			break;
318 
319 		case '^':
320 			type = TOKEN_BITWISE_XOR;
321 			length = 1;
322 			break;
323 
324 		case '&':
325 			if (_Peek() == '&') {
326 			 	type = TOKEN_LOGICAL_AND;
327 			 	length = 2;
328 			} else {
329 				type = TOKEN_BITWISE_AND;
330 				length = 1;
331 			}
332 			break;
333 
334 		case '|':
335 			if (_Peek() == '|') {
336 				type = TOKEN_LOGICAL_OR;
337 				length = 2;
338 			} else {
339 				type = TOKEN_BITWISE_OR;
340 				length = 1;
341 			}
342 			break;
343 
344 		case '!':
345 			if (_Peek() == '=') {
346 				type = TOKEN_NE;
347 				length = 2;
348 			} else {
349 				type = TOKEN_LOGICAL_NOT;
350 				length = 1;
351 			}
352 			break;
353 
354 		case '=':
355 			if (_Peek() == '=') {
356 				type = TOKEN_EQ;
357 				length = 2;
358 			} else {
359 				type = TOKEN_ASSIGN;
360 				length = 1;
361 			}
362 			break;
363 
364 		case '>':
365 			if (_Peek() == '=') {
366 				type = TOKEN_GE;
367 				length = 2;
368 			} else {
369 				type = TOKEN_GT;
370 				length = 1;
371 			}
372 			break;
373 
374 		case '<':
375 			if (_Peek() == '=') {
376 				type = TOKEN_LE;
377 				length = 2;
378 			} else {
379 				type = TOKEN_LT;
380 				length = 1;
381 			}
382 			break;
383 
384 		case '~':
385 			type = TOKEN_BITWISE_NOT;
386 			length = 1;
387 			break;
388 
389 
390 		case '?':
391 			type = TOKEN_CONDITION;
392 			length = 1;
393 			break;
394 
395 		case '.':
396 			type = TOKEN_MEMBER_PTR;
397 			length = 1;
398 			break;
399 
400 		default:
401 			break;
402 	}
403 
404 	if (length == 0)
405 		return false;
406 
407 	fCurrentToken = Token(fCurrentChar, length, _CurrentPos(), type);
408 	fCurrentChar += length;
409 
410 	return true;
411 }
412 
413 
414 void
415 Tokenizer::RewindToken()
416 {
417 	fReuseToken = true;
418 }
419 
420 
421 char
422 Tokenizer::_Peek() const
423 {
424 	if (_CurrentPos() < fString.Length())
425 		return *(fCurrentChar + 1);
426 
427 	return '\0';
428 }
429 
430 
431 /*static*/ bool
432 Tokenizer::_IsHexDigit(char c)
433 {
434 	return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
435 }
436 
437 
438 Token&
439 Tokenizer::_ParseHexOperand()
440 {
441 	const char* begin = fCurrentChar;
442 	fCurrentChar += 2;
443 		// skip "0x"
444 
445 	if (!_IsHexDigit(*fCurrentChar))
446 		throw ParseException("expected hex digit", _CurrentPos());
447 
448 	fCurrentChar++;
449 	while (_IsHexDigit(*fCurrentChar))
450 		fCurrentChar++;
451 
452 	int32 length = fCurrentChar - begin;
453 	fCurrentToken = Token(begin, length, _CurrentPos() - length,
454 		TOKEN_CONSTANT);
455 
456 	if (length <= 10) {
457 		// including the leading 0x, a 32-bit constant will be at most
458 		// 10 characters. Anything larger, and 64 is necessary.
459 		fCurrentToken.value.SetTo((uint32)strtoul(
460 			fCurrentToken.string.String(), NULL, 16));
461 	} else {
462 		fCurrentToken.value.SetTo((uint64)strtoull(
463 			fCurrentToken.string.String(), NULL, 16));
464 	}
465 	return fCurrentToken;
466 }
467 
468 
469 int32
470 Tokenizer::_CurrentPos() const
471 {
472 	return fCurrentChar - fString.String();
473 }
474