Comma: include/comma/parser/Lexer.h Source File

00001 //===-- parser/Lexer.h ---------------------------------------- -*- C++ -*-===//
00002 //
00003 // This file is distributed under the MIT license.  See LICENSE.txt for details.
00004 //
00005 // Copyright (C) 2008, Stephen Wilson
00006 //
00007 //===----------------------------------------------------------------------===//
00008 
00009 #ifndef COMMA_PARSER_LEXER_HDR_GUARD
00010 #define COMMA_PARSER_LEXER_HDR_GUARD
00011 
00012 #include "comma/basic/Diagnostic.h"
00013 #include "comma/basic/TextProvider.h"
00014 #include <iosfwd>
00015 #include <string>
00016 #include <vector>
00017 namespace comma {
00018 
00019 class Lexer {
00020 
00021 public:
          // Creates a lexer over the given text provider; diagnostics are sent to
          // diag.  Both arguments are taken by reference.  NOTE(review):
          // presumably they are bound to the txtProvider and diagnostic reference
          // members and so must outlive the lexer -- confirm in Lexer.cpp.
00022     Lexer(TextProvider &txtProvider, Diagnostic &diag);
00023 
00024     // Aside from UNUSED_ID, the only codes which should be used by clients of
00025     // this class are those prefixed by TKN.  All other codes are considered
00026     // internal and are subject to change.  Questions about a code's kind (for
00027     // example, determining if a code represents a reserved word) should be
00028     // answered through the supplied predicates and not be based on this
00029     // enumeration's ordering.
00030     enum Code {
              // First enumerator (value 0); the code of a default-constructed Token.
00031         UNUSED_ID,
00032 
      // X-macro expansion: every RESERVED, GLYPH and TOKEN entry of Tokens.def
      // becomes one TKN_-prefixed enumerator.  The STRING argument is not used
      // by this expansion.
00033 #define RESERVED(NAME, STRING) TKN_ ## NAME,
00034 #define GLYPH(NAME, STRING)    TKN_ ## NAME,
00035 #define TOKEN(NAME)            TKN_ ## NAME,
00036 #include "comma/parser/Tokens.def"
00037 #undef RESERVED
00038 #undef GLYPH
00039 #undef TOKEN
00040 
              // Last enumerator.  NOTE(review): equals the number of codes declared
              // above (UNUSED_ID included) provided Tokens.def assigns no explicit
              // values -- confirm.
00041         NUMTOKEN_CODES
00042     };
00043 
00044     // The Token class represents the result of the lexing process.  Tokens are
00045     // identified by code.  They provide access to their underlying string
00046     // representation, and have position information in the form of a single
00047     // Location entry (which must be interpreted with respect to a particular
00048     // TextProvider).
          //
          // A Token stores only a pointer to its text plus a length; it never
          // allocates or frees that text.  NOTE(review): presumably the pointer
          // refers into the TextProvider's buffer, so a token must not outlive
          // it -- confirm.
00049     class Token {
00050 
00051     public:
              // Creates an empty token: code UNUSED_ID, zero length, a
              // value-initialized location and a null representation.  Every
              // member is initialized so that the accessors below never read
              // an indeterminate value.
00052         Token() : code(Lexer::UNUSED_ID), length(0), location(), string(0) { }
00053 
              // Returns the code identifying this token.
00054         Lexer::Code getCode() const { return code; }
00055 
              // Returns the position recorded for this token.
00056         Location getLocation() const { return location; }
00057 
              // Returns the stored pointer to the token's text.  It is null for a
              // default-constructed token.  NOTE(review): the text is presumably
              // not NUL-terminated -- pair it with getLength().
00058         const char *getRep() const { return string; }
00059 
              // Returns the stored length of the token's text.
00060         unsigned getLength() const { return length; }
00061 
00062         // This method provides a string representation of the token.
00063         std::string getString() const;
00064 
00065     private:
              // code and length are bit-fields of 8 and 24 bits, so every Code
              // value must fit in 8 bits and lengths above 2^24 - 1 are truncated.
00066         Lexer::Code code   : 8;
00067         unsigned    length : 24;
00068         Location    location;
00069         const char *string;
00070 
00071         // Declare Lexer as a friend to give access to the following
00072         // constructor.
00073         friend class Lexer;
00074 
              // Builds a fully specified token.  Private: apart from the empty
              // default token, only Lexer (a friend) can create tokens.
00075         Token(Lexer::Code code,
00076               Location    location,
00077               const char *string,
00078               unsigned length)
00079             : code(code),
00080               length(length),
00081               location(location),
00082               string(string) { }
00083     };
00084 
00085     // Scans a single token from the input stream into tkn.  Once the stream
00086     // is exhausted, every further call sets the supplied token's code to
00087     // TKN_EOT.
00088     void scan(Lexer::Token &tkn);
00089 
          // NOTE(review): no contract is documented for this method.  Presumably
          // it fills tkn with the n'th upcoming token without consuming it (see
          // the `tokens' lookahead vector among the data members) -- confirm
          // against Lexer.cpp, including whether n is zero- or one-based.
00090     void peek(Lexer::Token &tkn, unsigned n);
00091 
00092     // Saves the current "position" of the lexer.  Further calls to Lexer::scan
00093     // will remember the resulting tokens.  The token stream can be restored to
00094     // the state before beginExcursion was called with a call to
00095     // Lexer::endExcursion.  Alternatively, the excursion can be forgotten with
00096     // a call to Lexer::forgetExcursion.
00097     void beginExcursion();
00098 
          // Ends an excursion by restoring the token stream to the state it had
          // when the matching beginExcursion was called.
00099     void endExcursion();
00100 
          // Forgets an excursion instead of restoring it.  NOTE(review):
          // presumably scanning then simply continues from the current position
          // -- confirm against Lexer.cpp.
00101     void forgetExcursion();
00102 
00103     // Returns true if the lexer has not seen an error while scanning its input.
00104     bool lexSuccessful() const { return errorCount == 0; }
00105 
00106     // Returns the number of errors seen by the lexer.
00107     unsigned getErrorCount() const { return errorCount; }
00108 
00112     void abortScanning() { scanningAborted = true; }
00113 
00114     // Tells whether tkn may be used as the name of a function.  This holds
00115     // exactly for the operator tokens enumerated below (e.g. '+', '*').
00116     static bool isFunctionGlyph(const Lexer::Token &tkn) {
00117         bool namesFunction = false;
00118         switch (tkn.getCode()) {
00119         case TKN_EQUAL: case TKN_NEQUAL:
00120         case TKN_LESS:  case TKN_LEQ:
00121         case TKN_GREAT: case TKN_GEQ:
00122         case TKN_MINUS: case TKN_STAR: case TKN_PLUS: case TKN_FSLASH:
00123         case TKN_POW:   case TKN_MOD:  case TKN_REM:
00124         case TKN_AND:   case TKN_NOT:  case TKN_XOR:  case TKN_OR:
00125             namesFunction = true;
00126             break;
00127         default:
00128             break;
00129         }
00130         return namesFunction;
00131     }
00140 
00141     // Returns a static string representation of the given token code, or NULL
00142     // if no such representation is available.  Callers must therefore check
          // the result before dereferencing it.
00143     static const char *tokenString(Code code);
00144 
00145     // Returns the string representation of the given token.  The result is
          // returned by value as a std::string.
00146     static std::string tokenString(const Token &tkn);
00147 
00148 private:
          // Internal scanning routines; their definitions are not part of this
          // header.  NOTE(review): the one-line summaries below are inferred from
          // the names alone, and the meaning of the bool results (presumably
          // "true if the construct was recognized") is undocumented here --
          // confirm against Lexer.cpp.
          //
          // Presumably scans the next token into the current target token.
00149     void scanToken();
00150 
          // Presumably skips a run of whitespace.
00151     bool eatWhitespace();
00152 
          // Presumably skips a comment.
00153     bool eatComment();
00154 
          // Presumably scans an identifier or reserved word.
00155     bool scanWord();
00156 
          // Presumably scans a glyph (punctuation / operator) token.
00157     bool scanGlyph();
00158 
          // Presumably scans a character literal.
00159     bool scanCharacter();
00160 
          // Presumably scans a string literal.
00161     bool scanString();
00162 
          // Presumably scans a numeric literal.
00163     bool scanNumeric();
00164 
          // Presumably scans an escape sequence.
00165     bool scanEscape();
00166 
          // Character classification predicates.  Each takes the character as an
          // unsigned value, the same type readStream() and peekStream() return.
          // NOTE(review): the exact character sets accepted are defined outside
          // this header -- confirm against Lexer.cpp.
00167     static bool isAlphabetic(unsigned c);
00168 
          // Presumably true if c may start an identifier.
00169     static bool isInitialIdentifierChar(unsigned c);
00170 
          // Presumably true if c may appear after the first identifier character.
00171     static bool isInnerIdentifierChar(unsigned c);
00172 
00173     static bool isWhitespace(unsigned c);
00174 
00175     static bool isDecimalDigit(unsigned c);
00176 
          // Returns the Location of the lexer's current position; report(diag::Kind)
          // uses it to place diagnostics which are given no explicit location.
00177     Location currentLocation() const;
00178 
00179     // We do not read from the supplied stream directly.  Rather, we call the
00180     // following simple wrappers which ensure that our line and column
00181     // information is consistently updated.  Carriage returns and DOS style
00182     // newlines are canonicalized to single newline characters.  When the stream
00183     // is exhausted, 0 is returned to signal EOF.
          //
          // NOTE(review): presumably readStream consumes and returns the next
          // character, peekStream returns it without consuming, ungetStream steps
          // back one character and ignoreStream discards one -- inferred from the
          // names, confirm against Lexer.cpp.
00184     unsigned readStream();
00185     unsigned peekStream();
00186     void ungetStream();
00187     void ignoreStream();
00188 
00189     // Returns the token code associated with the character string delimited by
00190     // start and end.  If the string exactly matches a reserved word, that word's
00191     // code is returned; otherwise UNUSED_ID is returned.  Glyph tokens are not
00192     // recognized by this function.
00193     Code getTokenCode(TextIterator &start, TextIterator &end) const;
00194 
          // Token emitters.  NOTE(review): presumably each one fills in the token
          // currently being scanned (see targetToken) from the text delimited by
          // start and end -- their definitions are not in this header, confirm
          // against Lexer.cpp.
          //
          // Emits a token of the given code for the text between start and end.
00195     void emitToken(Code code,
00196                    const TextIterator &start, const TextIterator &end);
00197 
00198     // Used to emit reserved identifiers and glyphs which do not require a
00199     // string representation.
00200     void emitToken(Code code, Location loc);
00201 
          // Presumably emits a string literal token.
00202     void emitStringToken(const TextIterator &start, const TextIterator &end);
00203 
          // Presumably emits an integer literal token.
00204     void emitIntegerToken(const TextIterator &start, const TextIterator &end);
00205 
          // Presumably emits an identifier token.
00206     void emitIdentifierToken(const TextIterator &start,
00207                              const TextIterator &end);
00208 
          // Presumably emits a character literal token.
00209     void emitCharacterToken(const TextIterator &start, const TextIterator &end);
00210 
          // Bumps the error count, maps loc to a SourceLocation through the text
          // provider, and hands back the stream of a new diagnostic of that kind.
00211     DiagnosticStream &report(Location loc, diag::Kind kind) {
00212         errorCount += 1;
00213         SourceLocation origin = txtProvider.getSourceLocation(loc);
00214         return diagnostic.report(origin, kind);
00215     }
00216 
00217     // When c1 and c2 are both '_', reports a consecutive underscore error and
00218     // drives the stream past all of the '_' characters which follow.
00219     void diagnoseConsecutiveUnderscores(unsigned c1, unsigned c2);
00220 
          // As the first overload, for a position that already is a SourceLocation.
00221     DiagnosticStream &report(SourceLocation sloc, diag::Kind kind) {
00222         errorCount += 1;
00223         return diagnostic.report(sloc, kind);
00224     }
00225 
          // As the first overload, reporting at the lexer's current location.
00226     DiagnosticStream &report(diag::Kind kind) {
00227         errorCount += 1;
00228         SourceLocation origin = txtProvider.getSourceLocation(currentLocation());
00229         return diagnostic.report(origin, kind);
00230     }
00231 
00232     // This is the stream we read from.  Held by reference.
00233     TextProvider &txtProvider;
00234 
00235     // Diagnostic object through which the report() overloads send messages.
00236     Diagnostic &diagnostic;
00237 
00238     // An iterator into our stream.
00239     TextIterator currentIter;
00240 
00241     // Number of errors detected.  Incremented by every report() overload.
00242     unsigned errorCount;
00243 
00244     // True if lexing has been cancelled with a call to abortScanning().
00245     bool scanningAborted;
00246 
00247     // The token parameter supplied to scan() is maintained here.  This is
00248     // the destination of the lexing methods.
00249     Token *targetToken;
00250 
00251     // A vector of tokens is used to implement lookahead.
00252     std::vector<Token> tokens;
00253 
00254     // A stack of positions into the token vector, used to implement
00255     // excursions (beginExcursion, endExcursion and forgetExcursion).
00256     std::vector<unsigned> positionStack;
00257 
00258     // Index into our token vector.  This index is non-zero only when an
00259     // excursion has ended with a call to endExcursion.
00260     unsigned index;
00261 };
00262 
00263 } // End comma namespace
00264 
00265 #endif
Welcome to the NetCologne GmbH open source mirroring service!

include/comma/parser/Lexer.h