1 /* 2 * Copyright 2006-2014 Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Stephan Aßmus <superstippi@gmx.de> 7 * Rene Gollent <rene@gollent.com> 8 * John Scipione <jscipione@gmail.com> 9 * Ingo Weinhold <bonefish@cs.tu-berlin.de> 10 */ 11 12 13 #include "CLanguageTokenizer.h" 14 15 #include <ctype.h> 16 #include <stdio.h> 17 #include <stdlib.h> 18 19 20 using CLanguage::ParseException; 21 using CLanguage::Token; 22 using CLanguage::Tokenizer; 23 24 25 // #pragma mark - Token 26 27 28 Token::Token() 29 : 30 string(""), 31 type(TOKEN_NONE), 32 value(), 33 position(0) 34 { 35 } 36 37 38 Token::Token(const Token& other) 39 : 40 string(other.string), 41 type(other.type), 42 value(other.value), 43 position(other.position) 44 { 45 } 46 47 48 Token::Token(const char* string, int32 length, int32 position, int32 type) 49 : 50 string(string, length), 51 type(type), 52 value(), 53 position(position) 54 { 55 } 56 57 58 Token& 59 Token::operator=(const Token& other) 60 { 61 string = other.string; 62 type = other.type; 63 value = other.value; 64 position = other.position; 65 return *this; 66 } 67 68 69 // #pragma mark - Tokenizer 70 71 72 Tokenizer::Tokenizer() 73 : 74 fString(""), 75 fCurrentChar(NULL), 76 fCurrentToken(), 77 fReuseToken(false) 78 { 79 } 80 81 82 void 83 Tokenizer::SetTo(const char* string) 84 { 85 fString = string; 86 fCurrentChar = fString.String(); 87 fCurrentToken = Token(); 88 fReuseToken = false; 89 } 90 91 92 const Token& 93 Tokenizer::NextToken() 94 { 95 if (fCurrentToken.type == TOKEN_END_OF_LINE) 96 return fCurrentToken; 97 98 if (fReuseToken) { 99 fReuseToken = false; 100 return fCurrentToken; 101 } 102 103 while (*fCurrentChar != 0 && isspace(*fCurrentChar)) 104 fCurrentChar++; 105 106 if (*fCurrentChar == 0) { 107 return fCurrentToken = Token("", 0, _CurrentPos(), 108 TOKEN_END_OF_LINE); 109 } 110 111 bool decimal = *fCurrentChar == '.'; 112 113 if (decimal || isdigit(*fCurrentChar)) { 114 if (*fCurrentChar == '0' && fCurrentChar[1] == 'x') 115 return _ParseHexOperand(); 116 117 BString temp; 118 119 const char* begin = fCurrentChar; 120 121 // optional digits before the comma 122 while (isdigit(*fCurrentChar)) { 123 temp << *fCurrentChar; 124 fCurrentChar++; 125 } 126 127 // optional post decimal part 128 // (required if there are no digits before the decimal) 129 if (*fCurrentChar == '.') { 130 decimal = true; 131 temp << '.'; 132 fCurrentChar++; 133 134 // optional post decimal digits 135 while (isdigit(*fCurrentChar)) { 136 temp << *fCurrentChar; 137 fCurrentChar++; 138 } 139 } 140 141 int32 length = fCurrentChar - begin; 142 if (length == 1 && decimal) { 143 // check for . operator 144 fCurrentChar = begin; 145 if (!_ParseOperator()) 146 throw ParseException("unexpected character", _CurrentPos()); 147 148 return fCurrentToken; 149 } 150 151 BString test = temp; 152 test << "&_"; 153 double value; 154 char t[2]; 155 int32 matches = sscanf(test.String(), "%lf&%s", &value, t); 156 if (matches != 2) 157 throw ParseException("error in constant", _CurrentPos() - length); 158 159 fCurrentToken = Token(begin, length, _CurrentPos() - length, 160 TOKEN_CONSTANT); 161 if (decimal) 162 fCurrentToken.value.SetTo(value); 163 else 164 fCurrentToken.value.SetTo((int64)strtoll(temp.String(), NULL, 10)); 165 } else if (isalpha(*fCurrentChar) || *fCurrentChar == '_') { 166 const char* begin = fCurrentChar; 167 while (*fCurrentChar != 0 && (isalpha(*fCurrentChar) 168 || isdigit(*fCurrentChar) || *fCurrentChar == '_')) { 169 fCurrentChar++; 170 } 171 int32 length = fCurrentChar - begin; 172 fCurrentToken = Token(begin, length, _CurrentPos() - length, 173 TOKEN_IDENTIFIER); 174 } else if (*fCurrentChar == '"' || *fCurrentChar == '\'') { 175 bool terminatorFound = false; 176 const char* begin = fCurrentChar++; 177 while (*fCurrentChar != 0) { 178 if (*fCurrentChar == '\\') { 179 if (*(fCurrentChar++) != 0) 180 fCurrentChar++; 181 } else if (*(fCurrentChar++) == *begin) { 182 terminatorFound = true; 183 break; 184 } 185 } 186 int32 tokenType = TOKEN_STRING_LITERAL; 187 if (!terminatorFound) { 188 tokenType = *begin == '"' ? TOKEN_DOUBLE_QUOTE 189 : TOKEN_SINGLE_QUOTE; 190 fCurrentChar = begin + 1; 191 } 192 193 int32 length = fCurrentChar - begin; 194 fCurrentToken = Token(begin, length, _CurrentPos() - length, 195 tokenType); 196 } else { 197 if (!_ParseOperator()) { 198 int32 type = TOKEN_NONE; 199 switch (*fCurrentChar) { 200 case '\n': 201 type = TOKEN_END_OF_LINE; 202 break; 203 204 case '(': 205 type = TOKEN_OPENING_PAREN; 206 break; 207 case ')': 208 type = TOKEN_CLOSING_PAREN; 209 break; 210 211 case '[': 212 type = TOKEN_OPENING_SQUARE_BRACKET; 213 break; 214 case ']': 215 type = TOKEN_CLOSING_SQUARE_BRACKET; 216 break; 217 218 case '{': 219 type = TOKEN_OPENING_CURLY_BRACE; 220 break; 221 case '}': 222 type = TOKEN_CLOSING_CURLY_BRACE; 223 break; 224 225 case '\\': 226 type = TOKEN_BACKSLASH; 227 break; 228 229 case ':': 230 type = TOKEN_COLON; 231 break; 232 233 case ';': 234 type = TOKEN_SEMICOLON; 235 break; 236 237 case ',': 238 type = TOKEN_COMMA; 239 break; 240 241 case '.': 242 type = TOKEN_PERIOD; 243 break; 244 245 case '#': 246 type = TOKEN_POUND; 247 break; 248 249 default: 250 throw ParseException("unexpected character", 251 _CurrentPos()); 252 } 253 fCurrentToken = Token(fCurrentChar, 1, _CurrentPos(), 254 type); 255 fCurrentChar++; 256 } 257 } 258 259 return fCurrentToken; 260 } 261 262 263 bool 264 Tokenizer::_ParseOperator() 265 { 266 int32 type = TOKEN_NONE; 267 int32 length = 0; 268 switch (*fCurrentChar) { 269 case '+': 270 type = TOKEN_PLUS; 271 length = 1; 272 break; 273 274 case '-': 275 if (_Peek() == '>') { 276 type = TOKEN_MEMBER_PTR; 277 length = 2; 278 } else { 279 type = TOKEN_MINUS; 280 length = 1; 281 } 282 break; 283 284 case '*': 285 switch (_Peek()) { 286 case '/': 287 type = TOKEN_END_COMMENT_BLOCK; 288 length = 2; 289 break; 290 default: 291 type = TOKEN_STAR; 292 length = 1; 293 break; 294 } 295 break; 296 297 case '/': 298 switch (_Peek()) { 299 case '*': 300 type = TOKEN_BEGIN_COMMENT_BLOCK; 301 length = 2; 302 break; 303 case '/': 304 type = TOKEN_INLINE_COMMENT; 305 length = 2; 306 break; 307 default: 308 type = TOKEN_SLASH; 309 length = 1; 310 break; 311 } 312 break; 313 314 case '%': 315 type = TOKEN_MODULO; 316 length = 1; 317 break; 318 319 case '^': 320 type = TOKEN_BITWISE_XOR; 321 length = 1; 322 break; 323 324 case '&': 325 if (_Peek() == '&') { 326 type = TOKEN_LOGICAL_AND; 327 length = 2; 328 } else { 329 type = TOKEN_BITWISE_AND; 330 length = 1; 331 } 332 break; 333 334 case '|': 335 if (_Peek() == '|') { 336 type = TOKEN_LOGICAL_OR; 337 length = 2; 338 } else { 339 type = TOKEN_BITWISE_OR; 340 length = 1; 341 } 342 break; 343 344 case '!': 345 if (_Peek() == '=') { 346 type = TOKEN_NE; 347 length = 2; 348 } else { 349 type = TOKEN_LOGICAL_NOT; 350 length = 1; 351 } 352 break; 353 354 case '=': 355 if (_Peek() == '=') { 356 type = TOKEN_EQ; 357 length = 2; 358 } else { 359 type = TOKEN_ASSIGN; 360 length = 1; 361 } 362 break; 363 364 case '>': 365 if (_Peek() == '=') { 366 type = TOKEN_GE; 367 length = 2; 368 } else { 369 type = TOKEN_GT; 370 length = 1; 371 } 372 break; 373 374 case '<': 375 if (_Peek() == '=') { 376 type = TOKEN_LE; 377 length = 2; 378 } else { 379 type = TOKEN_LT; 380 length = 1; 381 } 382 break; 383 384 case '~': 385 type = TOKEN_BITWISE_NOT; 386 length = 1; 387 break; 388 389 390 case '?': 391 type = TOKEN_CONDITION; 392 length = 1; 393 break; 394 395 case '.': 396 type = TOKEN_MEMBER_PTR; 397 length = 1; 398 break; 399 400 default: 401 break; 402 } 403 404 if (length == 0) 405 return false; 406 407 fCurrentToken = Token(fCurrentChar, length, _CurrentPos(), type); 408 fCurrentChar += length; 409 410 return true; 411 } 412 413 414 void 415 Tokenizer::RewindToken() 416 { 417 fReuseToken = true; 418 } 419 420 421 char 422 Tokenizer::_Peek() const 423 { 424 if (_CurrentPos() < fString.Length()) 425 return *(fCurrentChar + 1); 426 427 return '\0'; 428 } 429 430 431 /*static*/ bool 432 Tokenizer::_IsHexDigit(char c) 433 { 434 return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); 435 } 436 437 438 Token& 439 Tokenizer::_ParseHexOperand() 440 { 441 const char* begin = fCurrentChar; 442 fCurrentChar += 2; 443 // skip "0x" 444 445 if (!_IsHexDigit(*fCurrentChar)) 446 throw ParseException("expected hex digit", _CurrentPos()); 447 448 fCurrentChar++; 449 while (_IsHexDigit(*fCurrentChar)) 450 fCurrentChar++; 451 452 int32 length = fCurrentChar - begin; 453 fCurrentToken = Token(begin, length, _CurrentPos() - length, 454 TOKEN_CONSTANT); 455 456 if (length <= 10) { 457 // including the leading 0x, a 32-bit constant will be at most 458 // 10 characters. Anything larger, and 64 is necessary. 459 fCurrentToken.value.SetTo((uint32)strtoul( 460 fCurrentToken.string.String(), NULL, 16)); 461 } else { 462 fCurrentToken.value.SetTo((uint64)strtoull( 463 fCurrentToken.string.String(), NULL, 16)); 464 } 465 return fCurrentToken; 466 } 467 468 469 int32 470 Tokenizer::_CurrentPos() const 471 { 472 return fCurrentChar - fString.String(); 473 } 474