This machine mirrors various open-source projects.
20 Gbit/s uplink.
If there are any issues or you want another project mirrored, please contact
mirror-service -=AT=- netcologne DOT de !
00001 //===-- parser/Lexer.cpp -------------------------------------- -*- C++ -*-===// 00002 // 00003 // This file is distributed under the MIT license. See LICENSE.txt for details. 00004 // 00005 // Copyright (C) 2008-2010, Stephen Wilson 00006 // 00007 //===----------------------------------------------------------------------===// 00008 00009 #include "comma/parser/Lexer.h" 00010 #include <cstring> 00011 00012 using namespace comma; 00013 00014 Lexer::Lexer(TextProvider &txtProvider, Diagnostic &diag) 00015 : txtProvider(txtProvider), 00016 diagnostic(diag), 00017 currentIter(txtProvider.begin()), 00018 errorCount(0), 00019 scanningAborted(false), 00020 index(0) 00021 { } 00022 00023 std::string Lexer::Token::getString() const 00024 { 00025 return Lexer::tokenString(*this); 00026 } 00027 00028 const char *Lexer::tokenString(const Code code) 00029 { 00030 const char *result; 00031 00032 switch (code) { 00033 default: 00034 result = 0; 00035 break; 00036 00037 #define RESERVED(NAME, STRING) case TKN_ ## NAME: result = STRING; break; 00038 #define GLYPH(NAME, STRING) case TKN_ ## NAME: result = STRING; break; 00039 #include "comma/parser/Tokens.def" 00040 #undef RESERVED 00041 #undef GLYPH 00042 } 00043 00044 return result; 00045 } 00046 00047 std::string Lexer::tokenString(const Token &token) 00048 { 00049 Code code = token.getCode(); 00050 00051 switch (code) { 00052 default: 00053 return std::string(tokenString(code)); 00054 break; 00055 00056 case TKN_IDENTIFIER: 00057 case TKN_INTEGER: 00058 case TKN_STRING: 00059 case TKN_CHARACTER: 00060 return std::string(token.getRep(), token.getLength()); 00061 } 00062 } 00063 00064 bool Lexer::isDecimalDigit(unsigned c) 00065 { 00066 return ('0' <= c && c <= '9'); 00067 } 00068 00069 bool Lexer::isInitialIdentifierChar(unsigned c) 00070 { 00071 if (('a' <= c && c <= 'z') || 00072 ('A' <= c && c <= 'Z') || 00073 (c == '%') || (c == '_')) 00074 return true; 00075 00076 return false; 00077 } 00078 00079 bool Lexer::isInnerIdentifierChar(unsigned c) 00080 { 00081 return isInitialIdentifierChar(c) || isDecimalDigit(c) || c == '?'; 00082 } 00083 00084 bool Lexer::isWhitespace(unsigned c) 00085 { 00086 return (c == ' ') || (c == '\t') || (c == '\n'); 00087 } 00088 00089 Location Lexer::currentLocation() const 00090 { 00091 return txtProvider.getLocation(currentIter); 00092 } 00093 00094 // Something of a fundamental function, since all characters are gathered from 00095 // the underlying stream via this routine. 00096 unsigned Lexer::readStream() 00097 { 00098 unsigned c = *currentIter; 00099 ++currentIter; 00100 00101 // Ensure that carriage returns and DOS style newline sequences are 00102 // canonicalized into single newline character codes. 00103 switch (c) { 00104 00105 case '\r': 00106 if (*currentIter == '\n') 00107 ++currentIter; 00108 case '\n': 00109 return '\n'; 00110 } 00111 00112 return c; 00113 } 00114 00115 unsigned Lexer::peekStream() 00116 { 00117 unsigned c = *currentIter; 00118 00119 if (c == '\r') 00120 return '\n'; 00121 00122 return c; 00123 } 00124 00125 void Lexer::ungetStream() 00126 { 00127 --currentIter; 00128 } 00129 00130 void Lexer::ignoreStream() 00131 { 00132 readStream(); 00133 } 00134 00135 bool Lexer::eatComment() 00136 { 00137 unsigned c = peekStream(); 00138 00139 if (c == '-') { 00140 ignoreStream(); 00141 if (peekStream() == '-') { 00142 // Loop until either a newline or the input stream is 00143 // exhausted. 00144 for (;;) { 00145 c = readStream(); 00146 if (c == '\n' || c == 0) 00147 return true; 00148 } 00149 } 00150 else { 00151 ungetStream(); 00152 return false; 00153 } 00154 } 00155 return false; 00156 } 00157 00158 bool Lexer::eatWhitespace() 00159 { 00160 unsigned c = peekStream(); 00161 00162 if (isWhitespace(c)) { 00163 do { 00164 ignoreStream(); 00165 } while (isWhitespace(c = peekStream())); 00166 return true; 00167 } 00168 return false; 00169 } 00170 00171 void Lexer::emitToken(Code code, 00172 const TextIterator &start, const TextIterator &end) 00173 { 00174 Location loc = txtProvider.getLocation(start); 00175 const char *string = &start; 00176 unsigned length = &end - &start; 00177 *targetToken = Token(code, loc, string, length); 00178 } 00179 00180 void Lexer::emitToken(Code code, Location loc) 00181 { 00182 *targetToken = Token(code, loc, 0, 0); 00183 } 00184 00185 void Lexer::emitStringToken(const TextIterator &start, const TextIterator &end) 00186 { 00187 emitToken(TKN_STRING, start, end); 00188 } 00189 00190 void Lexer::emitIntegerToken(const TextIterator &start, const TextIterator &end) 00191 { 00192 emitToken(TKN_INTEGER, start, end); 00193 } 00194 00195 void Lexer::emitIdentifierToken(const TextIterator &start, const TextIterator &end) 00196 { 00197 emitToken(TKN_IDENTIFIER, start, end); 00198 } 00199 00200 void Lexer::emitCharacterToken(const TextIterator &start, const TextIterator &end) 00201 { 00202 emitToken(TKN_CHARACTER, start, end); 00203 } 00204 00205 Lexer::Code Lexer::getTokenCode(TextIterator &start, TextIterator &end) const 00206 { 00207 Code code = UNUSED_ID; 00208 const char *str = &start; 00209 unsigned length = &end - &start; 00210 00211 switch (length) { 00212 case 1: 00213 if (strncmp(str, "%", length) == 0) 00214 code = TKN_PERCENT; 00215 break; 00216 00217 case 2: 00218 if (strncmp(str, "is", length) == 0) 00219 code = TKN_IS; 00220 else if (strncmp(str, "if", length) == 0) 00221 code = TKN_IF; 00222 else if (strncmp(str, "in", length) == 0) 00223 code = TKN_IN; 00224 else if (strncmp(str, "of", length) == 0) 00225 code = TKN_OF; 00226 else if (strncmp(str, "or", length) == 0) 00227 code = TKN_OR; 00228 break; 00229 00230 case 3: 00231 if (strncmp(str, "end", length) == 0) 00232 code = TKN_END; 00233 else if (strncmp(str, "out", length) == 0) 00234 code = TKN_OUT; 00235 else if (strncmp(str, "add", length) == 0) 00236 code = TKN_ADD; 00237 else if (strncmp(str, "inj", length) == 0) 00238 code = TKN_INJ; 00239 else if (strncmp(str, "prj", length) == 0) 00240 code = TKN_PRJ; 00241 else if (strncmp(str, "and", length) == 0) 00242 code = TKN_AND; 00243 else if (strncmp(str, "mod", length) == 0) 00244 code = TKN_MOD; 00245 else if (strncmp(str, "rem", length) == 0) 00246 code = TKN_REM; 00247 else if (strncmp(str, "for", length) == 0) 00248 code = TKN_FOR; 00249 else if (strncmp(str, "not", length) == 0) 00250 code = TKN_NOT; 00251 else if (strncmp(str, "xor", length) == 0) 00252 code = TKN_XOR; 00253 else if (strncmp(str, "new", length) == 0) 00254 code = TKN_NEW; 00255 else if (strncmp(str, "all", length) == 0) 00256 code = TKN_ALL; 00257 break; 00258 00259 case 4: 00260 if (strncmp(str, "else", length) == 0) 00261 code = TKN_ELSE; 00262 else if (strncmp(str, "loop", length) == 0) 00263 code = TKN_LOOP; 00264 else if (strncmp(str, "then", length) == 0) 00265 code = TKN_THEN; 00266 else if (strncmp(str, "with", length) == 0) 00267 code = TKN_WITH; 00268 else if (strncmp(str, "type", length) == 0) 00269 code = TKN_TYPE; 00270 else if (strncmp(str, "when", length) == 0) 00271 code = TKN_WHEN; 00272 else if (strncmp(str, "null", length) == 0) 00273 code = TKN_NULL; 00274 break; 00275 00276 case 5: 00277 if (strncmp(str, "begin", length) == 0) 00278 code = TKN_BEGIN; 00279 else if (strncmp(str, "elsif", length) == 0) 00280 code = TKN_ELSIF; 00281 else if (strncmp(str, "while", length) == 0) 00282 code = TKN_WHILE; 00283 else if (strncmp(str, "range", length) == 0) 00284 code = TKN_RANGE; 00285 else if (strncmp(str, "array", length) == 0) 00286 code = TKN_ARRAY; 00287 else if (strncmp(str, "raise", length) == 0) 00288 code = TKN_RAISE; 00289 break; 00290 00291 case 6: 00292 if (strncmp(str, "domain", length) == 0) 00293 code = TKN_DOMAIN; 00294 else if (strncmp(str, "return", length) == 0) 00295 code = TKN_RETURN; 00296 else if (strncmp(str, "import", length) == 0) 00297 code = TKN_IMPORT; 00298 else if (strncmp(str, "pragma", length) == 0) 00299 code = TKN_PRAGMA; 00300 else if (strncmp(str, "others", length) == 0) 00301 code = TKN_OTHERS; 00302 else if (strncmp(str, "record", length) == 0) 00303 code = TKN_RECORD; 00304 else if (strncmp(str, "access", length) == 0) 00305 code = TKN_ACCESS; 00306 break; 00307 00308 case 7: 00309 if (strncmp(str, "carrier", length) == 0) 00310 code = TKN_CARRIER; 00311 else if (strncmp(str, "declare", length) == 0) 00312 code = TKN_DECLARE; 00313 else if (strncmp(str, "generic", length) == 0) 00314 code = TKN_GENERIC; 00315 else if (strncmp(str, "subtype", length) == 0) 00316 code = TKN_SUBTYPE; 00317 else if (strncmp(str, "reverse", length) == 0) 00318 code = TKN_REVERSE; 00319 else if (strncmp(str, "renames", length) == 0) 00320 code = TKN_RENAMES; 00321 break; 00322 00323 case 8: 00324 if (strncmp(str, "function", length) == 0) 00325 code = TKN_FUNCTION; 00326 else if (strncmp(str, "abstract", length) == 0) 00327 code = TKN_ABSTRACT; 00328 break; 00329 00330 case 9: 00331 if (strncmp(str, "procedure", length) == 0) 00332 code = TKN_PROCEDURE; 00333 else if (strncmp(str, "signature", length) == 0) 00334 code = TKN_SIGNATURE; 00335 else if (strncmp(str, "exception", length) == 0) 00336 code = TKN_EXCEPTION; 00337 break; 00338 } 00339 return code; 00340 } 00341 00342 void Lexer::diagnoseConsecutiveUnderscores(unsigned c1, unsigned c2) 00343 { 00344 if (c1 == '_' && c2 == '_') { 00345 report(diag::CONSECUTIVE_UNDERSCORE); 00346 do { 00347 ignoreStream(); 00348 } while ((c2 = peekStream()) == '_'); 00349 } 00350 } 00351 00352 bool Lexer::scanWord() 00353 { 00354 TextIterator start = currentIter; 00355 unsigned c1, c2; 00356 00357 if (isInitialIdentifierChar(c1 = peekStream())) { 00358 do { 00359 ignoreStream(); 00360 c2 = peekStream(); 00361 diagnoseConsecutiveUnderscores(c1, c2); 00362 } while (isInnerIdentifierChar(c1 = c2)); 00363 00364 Code code = getTokenCode(start, currentIter); 00365 00366 if (code == UNUSED_ID) 00367 emitIdentifierToken(start, currentIter); 00368 else 00369 emitToken(code, txtProvider.getLocation(start)); 00370 return true; 00371 } 00372 return false; 00373 } 00374 00375 bool Lexer::scanGlyph() 00376 { 00377 Location loc = currentLocation(); 00378 unsigned c = readStream(); 00379 Code code = UNUSED_ID; 00380 00381 switch (c) { 00382 case '(': 00383 code = TKN_LPAREN; 00384 break; 00385 00386 case ')': 00387 code = TKN_RPAREN; 00388 break; 00389 00390 case ';': 00391 code = TKN_SEMI; 00392 break; 00393 00394 case '.': 00395 switch (peekStream()) { 00396 case '.': 00397 ignoreStream(); 00398 code = TKN_DDOT; 00399 break; 00400 00401 default: 00402 code = TKN_DOT; 00403 } 00404 break; 00405 00406 case ':': 00407 switch (peekStream()) { 00408 case '=': 00409 ignoreStream(); 00410 code = TKN_ASSIGN; 00411 break; 00412 00413 default: 00414 code = TKN_COLON; 00415 } 00416 break; 00417 00418 case ',': 00419 code = TKN_COMMA; 00420 break; 00421 00422 case '=': 00423 switch (peekStream()) { 00424 default: 00425 code = TKN_EQUAL; 00426 break; 00427 00428 case '>': 00429 ignoreStream(); 00430 code = TKN_RDARROW; 00431 break; 00432 } 00433 break; 00434 00435 case '<': 00436 switch (peekStream()) { 00437 default: 00438 code = TKN_LESS; 00439 break; 00440 00441 case '=': 00442 ignoreStream(); 00443 code = TKN_LEQ; 00444 break; 00445 00446 case '>': 00447 ignoreStream(); 00448 code = TKN_DIAMOND; 00449 } 00450 break; 00451 00452 case '>': 00453 switch (peekStream()) { 00454 default: 00455 code = TKN_GREAT; 00456 break; 00457 00458 case '=': 00459 ignoreStream(); 00460 code = TKN_GEQ; 00461 break; 00462 } 00463 break; 00464 00465 case '+': 00466 code = TKN_PLUS; 00467 break; 00468 00469 case '-': 00470 code = TKN_MINUS; 00471 break; 00472 00473 case '*': 00474 switch (peekStream()) { 00475 case '*': 00476 ignoreStream(); 00477 code = TKN_POW; 00478 break; 00479 00480 default: 00481 code = TKN_STAR; 00482 } 00483 break; 00484 00485 case '/': 00486 switch (peekStream()) { 00487 case '=': 00488 ignoreStream(); 00489 code = TKN_NEQUAL; 00490 break; 00491 00492 default: 00493 code = TKN_FSLASH; 00494 } 00495 break; 00496 00497 case '&': 00498 code = TKN_AMPER; 00499 break; 00500 00501 case '@': 00502 code = TKN_AT; 00503 break; 00504 00505 case '|': 00506 code = TKN_BAR; 00507 break; 00508 } 00509 00510 if (code == UNUSED_ID) { 00511 ungetStream(); 00512 return false; 00513 } 00514 00515 emitToken(code, loc); 00516 return true; 00517 } 00518 00519 bool Lexer::scanEscape() 00520 { 00521 Location loc = currentLocation(); 00522 unsigned c; 00523 00524 switch (c = readStream()) { 00525 case '\\': break; 00526 case '"' : break; 00527 case '\'': break; 00528 case 't' : break; 00529 case 'n' : break; 00530 case 'r' : break; 00531 case 'b' : break; 00532 00533 case 0: 00534 // Premature end of stream. We let this condition be picked up by the 00535 // caller. 00536 ungetStream(); 00537 return false; 00538 00539 default: 00540 // Illegal escape sequence. 00541 report(loc, diag::ILLEGAL_ESCAPE) << (char)c; 00542 return false; 00543 } 00544 return true; 00545 } 00546 00547 bool Lexer::scanCharacter() 00548 { 00549 TextIterator start = currentIter; 00550 Location loc = currentLocation(); 00551 unsigned c; 00552 00553 if (peekStream() == '\'') { 00554 ignoreStream(); 00555 c = readStream(); 00556 00557 if (c == '\'') { 00558 // Empty enumeration literal. This is not valid. Consume and 00559 // report. 00560 report(loc, diag::EMPTY_CHARACTER_LITERAL); 00561 emitCharacterToken(start, currentIter); 00562 return true; 00563 } 00564 00565 if (peekStream() != '\'') { 00566 // If the character is not terminated, this must be an attribute 00567 // selector. Unget the current character and return a quote token. 00568 ungetStream(); 00569 emitToken(TKN_QUOTE, loc); 00570 return true; 00571 } 00572 00573 // Special case for the character representing a left paren. We need to 00574 // deal with the special case of a qualified expression containing a 00575 // character. Take "Character'('x')" or "String'('x', 'y')" for 00576 // example. We handle this oddball case by checking if another quote is 00577 // two characters away. 00578 if (c == '(') { 00579 TextIterator cursor = currentIter; 00580 if (*++cursor && *++cursor == '\'') { 00581 ungetStream(); 00582 emitToken(TKN_QUOTE, loc); 00583 return true; 00584 } 00585 } 00586 00587 // Otherwise we have a character literal. 00588 // 00589 // FIXME: Ensure the character belongs to the standard character set. 00590 ignoreStream(); 00591 emitCharacterToken(start, currentIter); 00592 return true; 00593 } 00594 return false; 00595 } 00596 00597 bool Lexer::scanString() 00598 { 00599 TextIterator start = currentIter; 00600 Location loc = currentLocation(); 00601 unsigned c; 00602 00603 if (peekStream() == '"') { 00604 ignoreStream(); 00605 00606 for (;;) { 00607 switch (c = readStream()) { 00608 case '\\': 00609 // Note that if scanning of the escape fails, we simply do not 00610 // accumulate the offending sequence and continue scanning. 00611 scanEscape(); 00612 break; 00613 00614 case 0: 00615 // Premature end of stream. Form the string literal from all 00616 // tokens accumulated thus far. 00617 report(loc, diag::UNTERMINATED_STRING); 00618 emitStringToken(start, currentIter); 00619 return true; 00620 00621 case '\n': 00622 // Embedded newline. 00623 report(loc, diag::NEWLINE_IN_STRING_LIT); 00624 emitStringToken(start, currentIter); 00625 return true; 00626 00627 case '"': 00628 // End of string literal. 00629 emitStringToken(start, currentIter); 00630 return true; 00631 } 00632 } 00633 } 00634 return false; 00635 } 00636 00637 bool Lexer::scanNumeric() 00638 { 00639 Location loc = currentLocation(); 00640 TextIterator start = currentIter; 00641 unsigned c = peekStream(); 00642 00643 if (isDecimalDigit(c)) { 00644 ignoreStream(); 00645 00646 // Decimal literals cannot have a leading zero (except for the zero 00647 // literal, of course). When we spot such a malformed integer, emit a 00648 // diagnostic and drop the leading zeros. 00649 if (c == '0' && isDecimalDigit(peekStream())) { 00650 report(loc, diag::LEADING_ZERO_IN_INTEGER_LIT); 00651 00652 while (peekStream() == '0') ignoreStream(); 00653 00654 // Check if we have a string of zeros. Simply return the zero token 00655 // in such a case. Otherwise, continue scanning normally. 00656 if (!isDecimalDigit(peekStream())) { 00657 TextIterator end = start; 00658 emitIntegerToken(start, ++end); 00659 return true; 00660 } 00661 else c = readStream(); 00662 } 00663 00664 for (;;) { 00665 c = readStream(); 00666 00667 if (isDecimalDigit(c) || c == '_') 00668 continue; 00669 else { 00670 ungetStream(); 00671 break; 00672 } 00673 } 00674 emitIntegerToken(start, currentIter); 00675 return true; 00676 } 00677 return false; 00678 } 00679 00680 void Lexer::beginExcursion() 00681 { 00682 positionStack.push_back(index); 00683 } 00684 00685 void Lexer::endExcursion() 00686 { 00687 index = positionStack.back(); 00688 positionStack.pop_back(); 00689 } 00690 00691 void Lexer::forgetExcursion() 00692 { 00693 unsigned saved_index = positionStack.back(); 00694 positionStack.pop_back(); 00695 00696 if (positionStack.empty()) { 00697 assert(saved_index == 0 && "index/position mismatch!"); 00698 ((void)saved_index); 00699 tokens.clear(); 00700 } 00701 } 00702 00703 void Lexer::peek(Token &tkn, unsigned n) 00704 { 00705 unsigned numTokens = tokens.size(); 00706 00707 if (index + n < numTokens) { 00708 tkn = tokens[index + n]; 00709 return; 00710 } 00711 00712 unsigned tokensNeeded = index + n - numTokens; 00713 targetToken = &tkn; 00714 for (unsigned i = 0; i <= tokensNeeded; ++i) { 00715 scanToken(); 00716 if (targetToken->getCode() != TKN_EOT) 00717 tokens.push_back(*targetToken); 00718 } 00719 } 00720 00721 void Lexer::scan(Token &tkn) 00722 { 00723 unsigned numTokens = tokens.size(); 00724 00725 // Check if we have a cached token to return. 00726 if (index < numTokens) { 00727 tkn = tokens[index++]; 00728 return; 00729 } 00730 00731 // Clear the token buffer if it is not empty and we are not in an excursion. 00732 if (numTokens && positionStack.empty()) { 00733 tokens.clear(); 00734 index = 0; 00735 } 00736 00737 targetToken = &tkn; 00738 00739 scanToken(); 00740 00741 // Save the token if we are in an excursion and it is not EOT. 00742 if (!positionStack.empty() && targetToken->getCode() != TKN_EOT) { 00743 index++; 00744 tokens.push_back(*targetToken); 00745 } 00746 } 00747 00748 void Lexer::scanToken() 00749 { 00750 for (;;) { 00751 eatWhitespace(); 00752 while (eatComment()) eatWhitespace(); 00753 00754 if (peekStream() == 0 || scanningAborted) { 00755 emitToken(TKN_EOT, Location()); 00756 return; 00757 } 00758 00759 if (scanWord()) return; 00760 if (scanGlyph()) return; 00761 if (scanString()) return; 00762 if (scanNumeric()) return; 00763 if (scanCharacter()) return; 00764 00765 // For invalid character data emit a diagnostic and abort the scan. 00766 // 00767 // FIXME: There should be an isSourceChar function to check if the 00768 // character belongs to the source character set. Scanning could just 00769 // skip legal characters. Characters which do not fall into the 00770 // expected character set should likely have their hex value printed. 00771 report(diag::INVALID_CHARACTER) << static_cast<char>(peekStream()); 00772 ignoreStream(); 00773 abortScanning(); 00774 } 00775 } 00776