This machine mirrors various open-source projects.
20 Gbit/s uplink.
If there are any issues or you want another project mirrored, please contact
mirror-service -=AT=- netcologne DOT de !
00001 //===-- parser/Lexer.h ---------------------------------------- -*- C++ -*-===// 00002 // 00003 // This file is distributed under the MIT license. See LICENSE.txt for details. 00004 // 00005 // Copyright (C) 2008, Stephen Wilson 00006 // 00007 //===----------------------------------------------------------------------===// 00008 00009 #ifndef COMMA_PARSER_LEXER_HDR_GUARD 00010 #define COMMA_PARSER_LEXER_HDR_GUARD 00011 00012 #include "comma/basic/Diagnostic.h" 00013 #include "comma/basic/TextProvider.h" 00014 #include <iosfwd> 00015 #include <string> 00016 00017 namespace comma { 00018 00019 class Lexer { 00020 00021 public: 00022 Lexer(TextProvider &txtProvider, Diagnostic &diag); 00023 00024 // Aside from UNUSED_ID, the only codes which should be used by clients of 00025 // this class are those prefixed by TKN. All other codes are considered 00026 // internal and are subject to change. Questions about a codes kind (for 00027 // example, determining if a code represents a reserved word) should be 00028 // answered thru the supplied predicates and not be based on this 00029 // enumerations ordering. 00030 enum Code { 00031 UNUSED_ID, 00032 00033 #define RESERVED(NAME, STRING) TKN_ ## NAME, 00034 #define GLYPH(NAME, STRING) TKN_ ## NAME, 00035 #define TOKEN(NAME) TKN_ ## NAME, 00036 #include "comma/parser/Tokens.def" 00037 #undef RESERVED 00038 #undef GLYPH 00039 #undef TOKEN 00040 00041 NUMTOKEN_CODES 00042 }; 00043 00044 // The Token class represents the result of lexing process. Tokens are 00045 // identified by code. They provide access to their underlying string 00046 // representation, and have position information in the form of a single 00047 // Location entry (which must be interpreted with respect to a particular 00048 // TextProvider). 00049 class Token { 00050 00051 public: 00052 Token() : code(Lexer::UNUSED_ID) { } 00053 00054 Lexer::Code getCode() const { return code; } 00055 00056 Location getLocation() const { return location; } 00057 00058 const char *getRep() const { return string; } 00059 00060 unsigned getLength() const { return length; } 00061 00062 // This method provides a string representation of the token. 00063 std::string getString() const; 00064 00065 private: 00066 Lexer::Code code : 8; 00067 unsigned length : 24; 00068 Location location; 00069 const char *string; 00070 00071 // Declare Lexer as a friend to give access to the following 00072 // constructor. 00073 friend class Lexer; 00074 00075 Token(Lexer::Code code, 00076 Location location, 00077 const char *string, 00078 unsigned length) 00079 : code(code), 00080 length(length), 00081 location(location), 00082 string(string) { } 00083 }; 00084 00085 // Scans a single token from the input stream. When the stream is 00086 // exhausted, all further calls to this method will set the supplied tokens 00087 // code to TKN_EOT. 00088 void scan(Lexer::Token &tkn); 00089 00090 void peek(Lexer::Token &tkn, unsigned n); 00091 00092 // Saves the current "position" of the lexer. Further calls to Lexer::scan 00093 // will remember the resulting tokens. The token stream can be restored to 00094 // the state before saveExcursion was called with a call to 00095 // Lexer::endExcursion. Alternatively, the excursion can be forgotten with 00096 // a call to Lexer::forgetExcursion. 00097 void beginExcursion(); 00098 00099 void endExcursion(); 00100 00101 void forgetExcursion(); 00102 00103 // Returns true if the lexer has not seen an error while scanning its input. 00104 bool lexSuccessful() const { return errorCount == 0; } 00105 00106 // Returns the number of errors seen by the lexer. 00107 unsigned getErrorCount() const { return errorCount; } 00108 00112 void abortScanning() { scanningAborted = true; } 00113 00114 // Returns true if the given token is a glyph and it can name a function 00115 // (e.g. '+', '*', etc). 00116 static bool isFunctionGlyph(const Lexer::Token &tkn) { 00117 switch (tkn.getCode()) { 00118 case TKN_EQUAL: 00119 case TKN_NEQUAL: 00120 case TKN_LESS: 00121 case TKN_LEQ: 00122 case TKN_GREAT: 00123 case TKN_GEQ: 00124 case TKN_MINUS: 00125 case TKN_STAR: 00126 case TKN_PLUS: 00127 case TKN_FSLASH: 00128 case TKN_POW: 00129 case TKN_MOD: 00130 case TKN_REM: 00131 case TKN_AND: 00132 case TKN_NOT: 00133 case TKN_XOR: 00134 case TKN_OR: 00135 return true; 00136 default: 00137 return false; 00138 } 00139 } 00140 00141 // Returns a static string representation of the given token code, or NULL 00142 // if no such representation is available. 00143 static const char *tokenString(Code code); 00144 00145 // Returns the string representation of the given token. 00146 static std::string tokenString(const Token &tkn); 00147 00148 private: 00149 void scanToken(); 00150 00151 bool eatWhitespace(); 00152 00153 bool eatComment(); 00154 00155 bool scanWord(); 00156 00157 bool scanGlyph(); 00158 00159 bool scanCharacter(); 00160 00161 bool scanString(); 00162 00163 bool scanNumeric(); 00164 00165 bool scanEscape(); 00166 00167 static bool isAlphabetic(unsigned c); 00168 00169 static bool isInitialIdentifierChar(unsigned c); 00170 00171 static bool isInnerIdentifierChar(unsigned c); 00172 00173 static bool isWhitespace(unsigned c); 00174 00175 static bool isDecimalDigit(unsigned c); 00176 00177 Location currentLocation() const; 00178 00179 // We do not read from the supplied stream directly. Rather, we call the 00180 // following simple wrappers which ensure that our line and column 00181 // information is consistently updated. Carriage returns and DOS style 00182 // newlines are canonicalized to single newline characters. When the stream 00183 // is exhausted, 0 is returned to signal EOF. 00184 unsigned readStream(); 00185 unsigned peekStream(); 00186 void ungetStream(); 00187 void ignoreStream(); 00188 00189 // Returns the token code assoicated with the chatacter string delimited by 00190 // start and end. If the string exactly matches a reserved word (this 00191 // function will not recignize glyph tokens) the words corresponding code is 00192 // returned, else UNUSED_ID if there is no match. 00193 Code getTokenCode(TextIterator &start, TextIterator &end) const; 00194 00195 void emitToken(Code code, 00196 const TextIterator &start, const TextIterator &end); 00197 00198 // Used to emit reserved identifiers and glyphs which do not require a 00199 // string representation. 00200 void emitToken(Code code, Location loc); 00201 00202 void emitStringToken(const TextIterator &start, const TextIterator &end); 00203 00204 void emitIntegerToken(const TextIterator &start, const TextIterator &end); 00205 00206 void emitIdentifierToken(const TextIterator &start, 00207 const TextIterator &end); 00208 00209 void emitCharacterToken(const TextIterator &start, const TextIterator &end); 00210 00211 DiagnosticStream &report(Location loc, diag::Kind kind) { 00212 ++errorCount; 00213 SourceLocation sloc = txtProvider.getSourceLocation(loc); 00214 return diagnostic.report(sloc, kind); 00215 } 00216 00217 // If c1 and c2 are both '_' characters, report a consecutive underscore 00218 // error and drive the stream to consume all remaining '_' characters. 00219 void diagnoseConsecutiveUnderscores(unsigned c1, unsigned c2); 00220 00221 DiagnosticStream &report(SourceLocation sloc, diag::Kind kind) { 00222 ++errorCount; 00223 return diagnostic.report(sloc, kind); 00224 } 00225 00226 DiagnosticStream &report(diag::Kind kind) { 00227 ++errorCount; 00228 SourceLocation sloc = txtProvider.getSourceLocation(currentLocation()); 00229 return diagnostic.report(sloc, kind); 00230 } 00231 00232 // This is the stream we read from. 00233 TextProvider &txtProvider; 00234 00235 // Output stream to send messages. 00236 Diagnostic &diagnostic; 00237 00238 // An iterator into our stream. 00239 TextIterator currentIter; 00240 00241 // Numer of errors detected. 00242 unsigned errorCount; 00243 00244 // True is lexing has been cancelled with a call to abortScanning(). 00245 bool scanningAborted; 00246 00247 // The token parameter supplied to scan() is maintained here. This is 00248 // the destination of the lexing methods. 00249 Token *targetToken; 00250 00251 // A vector of tokens is used to implement lookahead. 00252 std::vector<Token> tokens; 00253 00254 // A stack of positions into the token vector, used to implement 00255 // savePosition and restorePosition. 00256 std::vector<unsigned> positionStack; 00257 00258 // Index into our token vector. This index is non-zero only when an 00259 // excursion has ended with a call to endExcursion. 00260 unsigned index; 00261 }; 00262 00263 } // End comma namespace 00264 00265 #endif