LLVM API Documentation

TGLexer.cpp
Go to the documentation of this file.
00001 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // Implement the Lexer for TableGen.
00011 //
00012 //===----------------------------------------------------------------------===//
00013 
00014 #include "TGLexer.h"
00015 #include "llvm/ADT/StringSwitch.h"
00016 #include "llvm/ADT/Twine.h"
00017 #include "llvm/Config/config.h" // for strtoull()/strtoll() define
00018 #include "llvm/Support/MemoryBuffer.h"
00019 #include "llvm/Support/SourceMgr.h"
00020 #include "llvm/TableGen/Error.h"
00021 #include <cctype>
00022 #include <cerrno>
00023 #include <cstdio>
00024 #include <cstdlib>
00025 #include <cstring>
00026 
00027 using namespace llvm;
00028 
00029 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
00030   CurBuffer = SrcMgr.getMainFileID();
00031   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
00032   CurPtr = CurBuf.begin();
00033   TokStart = nullptr;
00034 }
00035 
00036 SMLoc TGLexer::getLoc() const {
00037   return SMLoc::getFromPointer(TokStart);
00038 }
00039 
00040 /// ReturnError - Set the error to the specified string at the specified
00041 /// location.  This is defined to always return tgtok::Error.
00042 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
00043   PrintError(Loc, Msg);
00044   return tgtok::Error;
00045 }
00046 
00047 int TGLexer::getNextChar() {
00048   char CurChar = *CurPtr++;
00049   switch (CurChar) {
00050   default:
00051     return (unsigned char)CurChar;
00052   case 0: {
00053     // A nul character in the stream is either the end of the current buffer or
00054     // a random nul in the file.  Disambiguate that here.
00055     if (CurPtr-1 != CurBuf.end())
00056       return 0;  // Just whitespace.
00057     
00058     // If this is the end of an included file, pop the parent file off the
00059     // include stack.
00060     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
00061     if (ParentIncludeLoc != SMLoc()) {
00062       CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
00063       CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
00064       CurPtr = ParentIncludeLoc.getPointer();
00065       return getNextChar();
00066     }
00067     
00068     // Otherwise, return end of file.
00069     --CurPtr;  // Another call to lex will return EOF again.  
00070     return EOF;
00071   }
00072   case '\n':
00073   case '\r':
00074     // Handle the newline character by ignoring it and incrementing the line
00075     // count.  However, be careful about 'dos style' files with \n\r in them.
00076     // Only treat a \n\r or \r\n as a single line.
00077     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
00078         *CurPtr != CurChar)
00079       ++CurPtr;  // Eat the two char newline sequence.
00080     return '\n';
00081   }  
00082 }
00083 
00084 int TGLexer::peekNextChar(int Index) {
00085   return *(CurPtr + Index);
00086 }
00087 
00088 tgtok::TokKind TGLexer::LexToken() {
00089   TokStart = CurPtr;
00090   // This always consumes at least one character.
00091   int CurChar = getNextChar();
00092 
00093   switch (CurChar) {
00094   default:
00095     // Handle letters: [a-zA-Z_]
00096     if (isalpha(CurChar) || CurChar == '_')
00097       return LexIdentifier();
00098 
00099     // Unknown character, emit an error.
00100     return ReturnError(TokStart, "Unexpected character");
00101   case EOF: return tgtok::Eof;
00102   case ':': return tgtok::colon;
00103   case ';': return tgtok::semi;
00104   case '.': return tgtok::period;
00105   case ',': return tgtok::comma;
00106   case '<': return tgtok::less;
00107   case '>': return tgtok::greater;
00108   case ']': return tgtok::r_square;
00109   case '{': return tgtok::l_brace;
00110   case '}': return tgtok::r_brace;
00111   case '(': return tgtok::l_paren;
00112   case ')': return tgtok::r_paren;
00113   case '=': return tgtok::equal;
00114   case '?': return tgtok::question;
00115   case '#': return tgtok::paste;
00116       
00117   case 0:
00118   case ' ':
00119   case '\t':
00120   case '\n':
00121   case '\r':
00122     // Ignore whitespace.
00123     return LexToken();
00124   case '/':
00125     // If this is the start of a // comment, skip until the end of the line or
00126     // the end of the buffer.
00127     if (*CurPtr == '/')
00128       SkipBCPLComment();
00129     else if (*CurPtr == '*') {
00130       if (SkipCComment())
00131         return tgtok::Error;
00132     } else // Otherwise, this is an error.
00133       return ReturnError(TokStart, "Unexpected character");
00134     return LexToken();
00135   case '-': case '+':
00136   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
00137   case '7': case '8': case '9': {
00138     int NextChar = 0;
00139     if (isdigit(CurChar)) {
00140       // Allow identifiers to start with a number if it is followed by
00141       // an identifier.  This can happen with paste operations like
00142       // foo#8i.
00143       int i = 0;
00144       do {
00145         NextChar = peekNextChar(i++);
00146       } while (isdigit(NextChar));
00147 
00148       if (NextChar == 'x' || NextChar == 'b') {
00149         // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
00150         // likely a number.
00151         int NextNextChar = peekNextChar(i);
00152         switch (NextNextChar) {
00153         default:
00154           break;
00155         case '0': case '1': 
00156           if (NextChar == 'b')
00157             return LexNumber();
00158           // Fallthrough
00159         case '2': case '3': case '4': case '5':
00160         case '6': case '7': case '8': case '9':
00161         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
00162         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
00163           if (NextChar == 'x')
00164             return LexNumber();
00165           break;
00166         }
00167       }
00168     }
00169 
00170     if (isalpha(NextChar) || NextChar == '_')
00171       return LexIdentifier();
00172 
00173     return LexNumber();
00174   }
00175   case '"': return LexString();
00176   case '$': return LexVarName();
00177   case '[': return LexBracket();
00178   case '!': return LexExclaim();
00179   }
00180 }
00181 
00182 /// LexString - Lex "[^"]*"
00183 tgtok::TokKind TGLexer::LexString() {
00184   const char *StrStart = CurPtr;
00185   
00186   CurStrVal = "";
00187   
00188   while (*CurPtr != '"') {
00189     // If we hit the end of the buffer, report an error.
00190     if (*CurPtr == 0 && CurPtr == CurBuf.end())
00191       return ReturnError(StrStart, "End of file in string literal");
00192     
00193     if (*CurPtr == '\n' || *CurPtr == '\r')
00194       return ReturnError(StrStart, "End of line in string literal");
00195     
00196     if (*CurPtr != '\\') {
00197       CurStrVal += *CurPtr++;
00198       continue;
00199     }
00200 
00201     ++CurPtr;
00202     
00203     switch (*CurPtr) {
00204     case '\\': case '\'': case '"':
00205       // These turn into their literal character.
00206       CurStrVal += *CurPtr++;
00207       break;
00208     case 't':
00209       CurStrVal += '\t';
00210       ++CurPtr;
00211       break;
00212     case 'n':
00213       CurStrVal += '\n';
00214       ++CurPtr;
00215       break;
00216         
00217     case '\n':
00218     case '\r':
00219       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
00220 
00221     // If we hit the end of the buffer, report an error.
00222     case '\0':
00223       if (CurPtr == CurBuf.end())
00224         return ReturnError(StrStart, "End of file in string literal");
00225       // FALL THROUGH
00226     default:
00227       return ReturnError(CurPtr, "invalid escape in string literal");
00228     }
00229   }
00230   
00231   ++CurPtr;
00232   return tgtok::StrVal;
00233 }
00234 
00235 tgtok::TokKind TGLexer::LexVarName() {
00236   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
00237     return ReturnError(TokStart, "Invalid variable name");
00238   
00239   // Otherwise, we're ok, consume the rest of the characters.
00240   const char *VarNameStart = CurPtr++;
00241   
00242   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
00243     ++CurPtr;
00244 
00245   CurStrVal.assign(VarNameStart, CurPtr);
00246   return tgtok::VarName;
00247 }
00248 
00249 
00250 tgtok::TokKind TGLexer::LexIdentifier() {
00251   // The first letter is [a-zA-Z_#].
00252   const char *IdentStart = TokStart;
00253 
00254   // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
00255   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
00256     ++CurPtr;
00257 
00258   // Check to see if this identifier is a keyword.
00259   StringRef Str(IdentStart, CurPtr-IdentStart);
00260 
00261   if (Str == "include") {
00262     if (LexInclude()) return tgtok::Error;
00263     return Lex();
00264   }
00265 
00266   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
00267     .Case("int", tgtok::Int)
00268     .Case("bit", tgtok::Bit)
00269     .Case("bits", tgtok::Bits)
00270     .Case("string", tgtok::String)
00271     .Case("list", tgtok::List)
00272     .Case("code", tgtok::Code)
00273     .Case("dag", tgtok::Dag)
00274     .Case("class", tgtok::Class)
00275     .Case("def", tgtok::Def)
00276     .Case("foreach", tgtok::Foreach)
00277     .Case("defm", tgtok::Defm)
00278     .Case("multiclass", tgtok::MultiClass)
00279     .Case("field", tgtok::Field)
00280     .Case("let", tgtok::Let)
00281     .Case("in", tgtok::In)
00282     .Default(tgtok::Id);
00283 
00284   if (Kind == tgtok::Id)
00285     CurStrVal.assign(Str.begin(), Str.end());
00286   return Kind;
00287 }
00288 
00289 /// LexInclude - We just read the "include" token.  Get the string token that
00290 /// comes next and enter the include.
00291 bool TGLexer::LexInclude() {
00292   // The token after the include must be a string.
00293   tgtok::TokKind Tok = LexToken();
00294   if (Tok == tgtok::Error) return true;
00295   if (Tok != tgtok::StrVal) {
00296     PrintError(getLoc(), "Expected filename after include");
00297     return true;
00298   }
00299 
00300   // Get the string.
00301   std::string Filename = CurStrVal;
00302   std::string IncludedFile;
00303 
00304   
00305   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
00306                                     IncludedFile);
00307   if (!CurBuffer) {
00308     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
00309     return true;
00310   }
00311   
00312   DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
00313   if (Found != Dependencies.end()) {
00314     PrintError(getLoc(),
00315                "File '" + IncludedFile + "' has already been included.");
00316     SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note,
00317                         "previously included here");
00318     return true;
00319   }
00320   Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
00321   // Save the line number and lex buffer of the includer.
00322   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
00323   CurPtr = CurBuf.begin();
00324   return false;
00325 }
00326 
00327 void TGLexer::SkipBCPLComment() {
00328   ++CurPtr;  // skip the second slash.
00329   while (1) {
00330     switch (*CurPtr) {
00331     case '\n':
00332     case '\r':
00333       return;  // Newline is end of comment.
00334     case 0:
00335       // If this is the end of the buffer, end the comment.
00336       if (CurPtr == CurBuf.end())
00337         return;
00338       break;
00339     }
00340     // Otherwise, skip the character.
00341     ++CurPtr;
00342   }
00343 }
00344 
00345 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
00346 /// is that we allow nesting.
00347 bool TGLexer::SkipCComment() {
00348   ++CurPtr;  // skip the star.
00349   unsigned CommentDepth = 1;
00350   
00351   while (1) {
00352     int CurChar = getNextChar();
00353     switch (CurChar) {
00354     case EOF:
00355       PrintError(TokStart, "Unterminated comment!");
00356       return true;
00357     case '*':
00358       // End of the comment?
00359       if (CurPtr[0] != '/') break;
00360       
00361       ++CurPtr;   // End the */.
00362       if (--CommentDepth == 0)
00363         return false;
00364       break;
00365     case '/':
00366       // Start of a nested comment?
00367       if (CurPtr[0] != '*') break;
00368       ++CurPtr;
00369       ++CommentDepth;
00370       break;
00371     }
00372   }
00373 }
00374 
00375 /// LexNumber - Lex:
00376 ///    [-+]?[0-9]+
00377 ///    0x[0-9a-fA-F]+
00378 ///    0b[01]+
00379 tgtok::TokKind TGLexer::LexNumber() {
00380   if (CurPtr[-1] == '0') {
00381     if (CurPtr[0] == 'x') {
00382       ++CurPtr;
00383       const char *NumStart = CurPtr;
00384       while (isxdigit(CurPtr[0]))
00385         ++CurPtr;
00386       
00387       // Requires at least one hex digit.
00388       if (CurPtr == NumStart)
00389         return ReturnError(TokStart, "Invalid hexadecimal number");
00390 
00391       errno = 0;
00392       CurIntVal = strtoll(NumStart, nullptr, 16);
00393       if (errno == EINVAL)
00394         return ReturnError(TokStart, "Invalid hexadecimal number");
00395       if (errno == ERANGE) {
00396         errno = 0;
00397         CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16);
00398         if (errno == EINVAL)
00399           return ReturnError(TokStart, "Invalid hexadecimal number");
00400         if (errno == ERANGE)
00401           return ReturnError(TokStart, "Hexadecimal number out of range");
00402       }
00403       return tgtok::IntVal;
00404     } else if (CurPtr[0] == 'b') {
00405       ++CurPtr;
00406       const char *NumStart = CurPtr;
00407       while (CurPtr[0] == '0' || CurPtr[0] == '1')
00408         ++CurPtr;
00409 
00410       // Requires at least one binary digit.
00411       if (CurPtr == NumStart)
00412         return ReturnError(CurPtr-2, "Invalid binary number");
00413       CurIntVal = strtoll(NumStart, nullptr, 2);
00414       return tgtok::BinaryIntVal;
00415     }
00416   }
00417 
00418   // Check for a sign without a digit.
00419   if (!isdigit(CurPtr[0])) {
00420     if (CurPtr[-1] == '-')
00421       return tgtok::minus;
00422     else if (CurPtr[-1] == '+')
00423       return tgtok::plus;
00424   }
00425   
00426   while (isdigit(CurPtr[0]))
00427     ++CurPtr;
00428   CurIntVal = strtoll(TokStart, nullptr, 10);
00429   return tgtok::IntVal;
00430 }
00431 
00432 /// LexBracket - We just read '['.  If this is a code block, return it,
00433 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
00434 tgtok::TokKind TGLexer::LexBracket() {
00435   if (CurPtr[0] != '{')
00436     return tgtok::l_square;
00437   ++CurPtr;
00438   const char *CodeStart = CurPtr;
00439   while (1) {
00440     int Char = getNextChar();
00441     if (Char == EOF) break;
00442     
00443     if (Char != '}') continue;
00444     
00445     Char = getNextChar();
00446     if (Char == EOF) break;
00447     if (Char == ']') {
00448       CurStrVal.assign(CodeStart, CurPtr-2);
00449       return tgtok::CodeFragment;
00450     }
00451   }
00452   
00453   return ReturnError(CodeStart-2, "Unterminated Code Block");
00454 }
00455 
00456 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
00457 tgtok::TokKind TGLexer::LexExclaim() {
00458   if (!isalpha(*CurPtr))
00459     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
00460   
00461   const char *Start = CurPtr++;
00462   while (isalpha(*CurPtr))
00463     ++CurPtr;
00464   
00465   // Check to see which operator this is.
00466   tgtok::TokKind Kind =
00467     StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
00468     .Case("eq", tgtok::XEq)
00469     .Case("if", tgtok::XIf)
00470     .Case("head", tgtok::XHead)
00471     .Case("tail", tgtok::XTail)
00472     .Case("con", tgtok::XConcat)
00473     .Case("add", tgtok::XADD)
00474     .Case("and", tgtok::XAND)
00475     .Case("shl", tgtok::XSHL)
00476     .Case("sra", tgtok::XSRA)
00477     .Case("srl", tgtok::XSRL)
00478     .Case("cast", tgtok::XCast)
00479     .Case("empty", tgtok::XEmpty)
00480     .Case("subst", tgtok::XSubst)
00481     .Case("foreach", tgtok::XForEach)
00482     .Case("listconcat", tgtok::XListConcat)
00483     .Case("strconcat", tgtok::XStrConcat)
00484     .Default(tgtok::Error);
00485 
00486   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
00487 }
00488