LLVM API Documentation
00001 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // Implement the Lexer for TableGen. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 00014 #include "TGLexer.h" 00015 #include "llvm/ADT/StringSwitch.h" 00016 #include "llvm/ADT/Twine.h" 00017 #include "llvm/Config/config.h" // for strtoull()/strtoll() define 00018 #include "llvm/Support/MemoryBuffer.h" 00019 #include "llvm/Support/SourceMgr.h" 00020 #include "llvm/TableGen/Error.h" 00021 #include <cctype> 00022 #include <cerrno> 00023 #include <cstdio> 00024 #include <cstdlib> 00025 #include <cstring> 00026 00027 using namespace llvm; 00028 00029 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { 00030 CurBuffer = SrcMgr.getMainFileID(); 00031 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 00032 CurPtr = CurBuf.begin(); 00033 TokStart = nullptr; 00034 } 00035 00036 SMLoc TGLexer::getLoc() const { 00037 return SMLoc::getFromPointer(TokStart); 00038 } 00039 00040 /// ReturnError - Set the error to the specified string at the specified 00041 /// location. This is defined to always return tgtok::Error. 00042 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 00043 PrintError(Loc, Msg); 00044 return tgtok::Error; 00045 } 00046 00047 int TGLexer::getNextChar() { 00048 char CurChar = *CurPtr++; 00049 switch (CurChar) { 00050 default: 00051 return (unsigned char)CurChar; 00052 case 0: { 00053 // A nul character in the stream is either the end of the current buffer or 00054 // a random nul in the file. Disambiguate that here. 00055 if (CurPtr-1 != CurBuf.end()) 00056 return 0; // Just whitespace. 00057 00058 // If this is the end of an included file, pop the parent file off the 00059 // include stack. 00060 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 00061 if (ParentIncludeLoc != SMLoc()) { 00062 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 00063 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 00064 CurPtr = ParentIncludeLoc.getPointer(); 00065 return getNextChar(); 00066 } 00067 00068 // Otherwise, return end of file. 00069 --CurPtr; // Another call to lex will return EOF again. 00070 return EOF; 00071 } 00072 case '\n': 00073 case '\r': 00074 // Handle the newline character by ignoring it and incrementing the line 00075 // count. However, be careful about 'dos style' files with \n\r in them. 00076 // Only treat a \n\r or \r\n as a single line. 00077 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 00078 *CurPtr != CurChar) 00079 ++CurPtr; // Eat the two char newline sequence. 00080 return '\n'; 00081 } 00082 } 00083 00084 int TGLexer::peekNextChar(int Index) { 00085 return *(CurPtr + Index); 00086 } 00087 00088 tgtok::TokKind TGLexer::LexToken() { 00089 TokStart = CurPtr; 00090 // This always consumes at least one character. 00091 int CurChar = getNextChar(); 00092 00093 switch (CurChar) { 00094 default: 00095 // Handle letters: [a-zA-Z_] 00096 if (isalpha(CurChar) || CurChar == '_') 00097 return LexIdentifier(); 00098 00099 // Unknown character, emit an error. 00100 return ReturnError(TokStart, "Unexpected character"); 00101 case EOF: return tgtok::Eof; 00102 case ':': return tgtok::colon; 00103 case ';': return tgtok::semi; 00104 case '.': return tgtok::period; 00105 case ',': return tgtok::comma; 00106 case '<': return tgtok::less; 00107 case '>': return tgtok::greater; 00108 case ']': return tgtok::r_square; 00109 case '{': return tgtok::l_brace; 00110 case '}': return tgtok::r_brace; 00111 case '(': return tgtok::l_paren; 00112 case ')': return tgtok::r_paren; 00113 case '=': return tgtok::equal; 00114 case '?': return tgtok::question; 00115 case '#': return tgtok::paste; 00116 00117 case 0: 00118 case ' ': 00119 case '\t': 00120 case '\n': 00121 case '\r': 00122 // Ignore whitespace. 00123 return LexToken(); 00124 case '/': 00125 // If this is the start of a // comment, skip until the end of the line or 00126 // the end of the buffer. 00127 if (*CurPtr == '/') 00128 SkipBCPLComment(); 00129 else if (*CurPtr == '*') { 00130 if (SkipCComment()) 00131 return tgtok::Error; 00132 } else // Otherwise, this is an error. 00133 return ReturnError(TokStart, "Unexpected character"); 00134 return LexToken(); 00135 case '-': case '+': 00136 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 00137 case '7': case '8': case '9': { 00138 int NextChar = 0; 00139 if (isdigit(CurChar)) { 00140 // Allow identifiers to start with a number if it is followed by 00141 // an identifier. This can happen with paste operations like 00142 // foo#8i. 00143 int i = 0; 00144 do { 00145 NextChar = peekNextChar(i++); 00146 } while (isdigit(NextChar)); 00147 00148 if (NextChar == 'x' || NextChar == 'b') { 00149 // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most 00150 // likely a number. 00151 int NextNextChar = peekNextChar(i); 00152 switch (NextNextChar) { 00153 default: 00154 break; 00155 case '0': case '1': 00156 if (NextChar == 'b') 00157 return LexNumber(); 00158 // Fallthrough 00159 case '2': case '3': case '4': case '5': 00160 case '6': case '7': case '8': case '9': 00161 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 00162 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 00163 if (NextChar == 'x') 00164 return LexNumber(); 00165 break; 00166 } 00167 } 00168 } 00169 00170 if (isalpha(NextChar) || NextChar == '_') 00171 return LexIdentifier(); 00172 00173 return LexNumber(); 00174 } 00175 case '"': return LexString(); 00176 case '$': return LexVarName(); 00177 case '[': return LexBracket(); 00178 case '!': return LexExclaim(); 00179 } 00180 } 00181 00182 /// LexString - Lex "[^"]*" 00183 tgtok::TokKind TGLexer::LexString() { 00184 const char *StrStart = CurPtr; 00185 00186 CurStrVal = ""; 00187 00188 while (*CurPtr != '"') { 00189 // If we hit the end of the buffer, report an error. 00190 if (*CurPtr == 0 && CurPtr == CurBuf.end()) 00191 return ReturnError(StrStart, "End of file in string literal"); 00192 00193 if (*CurPtr == '\n' || *CurPtr == '\r') 00194 return ReturnError(StrStart, "End of line in string literal"); 00195 00196 if (*CurPtr != '\\') { 00197 CurStrVal += *CurPtr++; 00198 continue; 00199 } 00200 00201 ++CurPtr; 00202 00203 switch (*CurPtr) { 00204 case '\\': case '\'': case '"': 00205 // These turn into their literal character. 00206 CurStrVal += *CurPtr++; 00207 break; 00208 case 't': 00209 CurStrVal += '\t'; 00210 ++CurPtr; 00211 break; 00212 case 'n': 00213 CurStrVal += '\n'; 00214 ++CurPtr; 00215 break; 00216 00217 case '\n': 00218 case '\r': 00219 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 00220 00221 // If we hit the end of the buffer, report an error. 00222 case '\0': 00223 if (CurPtr == CurBuf.end()) 00224 return ReturnError(StrStart, "End of file in string literal"); 00225 // FALL THROUGH 00226 default: 00227 return ReturnError(CurPtr, "invalid escape in string literal"); 00228 } 00229 } 00230 00231 ++CurPtr; 00232 return tgtok::StrVal; 00233 } 00234 00235 tgtok::TokKind TGLexer::LexVarName() { 00236 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 00237 return ReturnError(TokStart, "Invalid variable name"); 00238 00239 // Otherwise, we're ok, consume the rest of the characters. 00240 const char *VarNameStart = CurPtr++; 00241 00242 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 00243 ++CurPtr; 00244 00245 CurStrVal.assign(VarNameStart, CurPtr); 00246 return tgtok::VarName; 00247 } 00248 00249 00250 tgtok::TokKind TGLexer::LexIdentifier() { 00251 // The first letter is [a-zA-Z_#]. 00252 const char *IdentStart = TokStart; 00253 00254 // Match the rest of the identifier regex: [0-9a-zA-Z_#]* 00255 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 00256 ++CurPtr; 00257 00258 // Check to see if this identifier is a keyword. 00259 StringRef Str(IdentStart, CurPtr-IdentStart); 00260 00261 if (Str == "include") { 00262 if (LexInclude()) return tgtok::Error; 00263 return Lex(); 00264 } 00265 00266 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 00267 .Case("int", tgtok::Int) 00268 .Case("bit", tgtok::Bit) 00269 .Case("bits", tgtok::Bits) 00270 .Case("string", tgtok::String) 00271 .Case("list", tgtok::List) 00272 .Case("code", tgtok::Code) 00273 .Case("dag", tgtok::Dag) 00274 .Case("class", tgtok::Class) 00275 .Case("def", tgtok::Def) 00276 .Case("foreach", tgtok::Foreach) 00277 .Case("defm", tgtok::Defm) 00278 .Case("multiclass", tgtok::MultiClass) 00279 .Case("field", tgtok::Field) 00280 .Case("let", tgtok::Let) 00281 .Case("in", tgtok::In) 00282 .Default(tgtok::Id); 00283 00284 if (Kind == tgtok::Id) 00285 CurStrVal.assign(Str.begin(), Str.end()); 00286 return Kind; 00287 } 00288 00289 /// LexInclude - We just read the "include" token. Get the string token that 00290 /// comes next and enter the include. 00291 bool TGLexer::LexInclude() { 00292 // The token after the include must be a string. 00293 tgtok::TokKind Tok = LexToken(); 00294 if (Tok == tgtok::Error) return true; 00295 if (Tok != tgtok::StrVal) { 00296 PrintError(getLoc(), "Expected filename after include"); 00297 return true; 00298 } 00299 00300 // Get the string. 00301 std::string Filename = CurStrVal; 00302 std::string IncludedFile; 00303 00304 00305 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 00306 IncludedFile); 00307 if (!CurBuffer) { 00308 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 00309 return true; 00310 } 00311 00312 DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile); 00313 if (Found != Dependencies.end()) { 00314 PrintError(getLoc(), 00315 "File '" + IncludedFile + "' has already been included."); 00316 SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note, 00317 "previously included here"); 00318 return true; 00319 } 00320 Dependencies.insert(std::make_pair(IncludedFile, getLoc())); 00321 // Save the line number and lex buffer of the includer. 00322 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(); 00323 CurPtr = CurBuf.begin(); 00324 return false; 00325 } 00326 00327 void TGLexer::SkipBCPLComment() { 00328 ++CurPtr; // skip the second slash. 00329 while (1) { 00330 switch (*CurPtr) { 00331 case '\n': 00332 case '\r': 00333 return; // Newline is end of comment. 00334 case 0: 00335 // If this is the end of the buffer, end the comment. 00336 if (CurPtr == CurBuf.end()) 00337 return; 00338 break; 00339 } 00340 // Otherwise, skip the character. 00341 ++CurPtr; 00342 } 00343 } 00344 00345 /// SkipCComment - This skips C-style /**/ comments. The only difference from C 00346 /// is that we allow nesting. 00347 bool TGLexer::SkipCComment() { 00348 ++CurPtr; // skip the star. 00349 unsigned CommentDepth = 1; 00350 00351 while (1) { 00352 int CurChar = getNextChar(); 00353 switch (CurChar) { 00354 case EOF: 00355 PrintError(TokStart, "Unterminated comment!"); 00356 return true; 00357 case '*': 00358 // End of the comment? 00359 if (CurPtr[0] != '/') break; 00360 00361 ++CurPtr; // End the */. 00362 if (--CommentDepth == 0) 00363 return false; 00364 break; 00365 case '/': 00366 // Start of a nested comment? 00367 if (CurPtr[0] != '*') break; 00368 ++CurPtr; 00369 ++CommentDepth; 00370 break; 00371 } 00372 } 00373 } 00374 00375 /// LexNumber - Lex: 00376 /// [-+]?[0-9]+ 00377 /// 0x[0-9a-fA-F]+ 00378 /// 0b[01]+ 00379 tgtok::TokKind TGLexer::LexNumber() { 00380 if (CurPtr[-1] == '0') { 00381 if (CurPtr[0] == 'x') { 00382 ++CurPtr; 00383 const char *NumStart = CurPtr; 00384 while (isxdigit(CurPtr[0])) 00385 ++CurPtr; 00386 00387 // Requires at least one hex digit. 00388 if (CurPtr == NumStart) 00389 return ReturnError(TokStart, "Invalid hexadecimal number"); 00390 00391 errno = 0; 00392 CurIntVal = strtoll(NumStart, nullptr, 16); 00393 if (errno == EINVAL) 00394 return ReturnError(TokStart, "Invalid hexadecimal number"); 00395 if (errno == ERANGE) { 00396 errno = 0; 00397 CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16); 00398 if (errno == EINVAL) 00399 return ReturnError(TokStart, "Invalid hexadecimal number"); 00400 if (errno == ERANGE) 00401 return ReturnError(TokStart, "Hexadecimal number out of range"); 00402 } 00403 return tgtok::IntVal; 00404 } else if (CurPtr[0] == 'b') { 00405 ++CurPtr; 00406 const char *NumStart = CurPtr; 00407 while (CurPtr[0] == '0' || CurPtr[0] == '1') 00408 ++CurPtr; 00409 00410 // Requires at least one binary digit. 00411 if (CurPtr == NumStart) 00412 return ReturnError(CurPtr-2, "Invalid binary number"); 00413 CurIntVal = strtoll(NumStart, nullptr, 2); 00414 return tgtok::BinaryIntVal; 00415 } 00416 } 00417 00418 // Check for a sign without a digit. 00419 if (!isdigit(CurPtr[0])) { 00420 if (CurPtr[-1] == '-') 00421 return tgtok::minus; 00422 else if (CurPtr[-1] == '+') 00423 return tgtok::plus; 00424 } 00425 00426 while (isdigit(CurPtr[0])) 00427 ++CurPtr; 00428 CurIntVal = strtoll(TokStart, nullptr, 10); 00429 return tgtok::IntVal; 00430 } 00431 00432 /// LexBracket - We just read '['. If this is a code block, return it, 00433 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 00434 tgtok::TokKind TGLexer::LexBracket() { 00435 if (CurPtr[0] != '{') 00436 return tgtok::l_square; 00437 ++CurPtr; 00438 const char *CodeStart = CurPtr; 00439 while (1) { 00440 int Char = getNextChar(); 00441 if (Char == EOF) break; 00442 00443 if (Char != '}') continue; 00444 00445 Char = getNextChar(); 00446 if (Char == EOF) break; 00447 if (Char == ']') { 00448 CurStrVal.assign(CodeStart, CurPtr-2); 00449 return tgtok::CodeFragment; 00450 } 00451 } 00452 00453 return ReturnError(CodeStart-2, "Unterminated Code Block"); 00454 } 00455 00456 /// LexExclaim - Lex '!' and '![a-zA-Z]+'. 00457 tgtok::TokKind TGLexer::LexExclaim() { 00458 if (!isalpha(*CurPtr)) 00459 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 00460 00461 const char *Start = CurPtr++; 00462 while (isalpha(*CurPtr)) 00463 ++CurPtr; 00464 00465 // Check to see which operator this is. 00466 tgtok::TokKind Kind = 00467 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 00468 .Case("eq", tgtok::XEq) 00469 .Case("if", tgtok::XIf) 00470 .Case("head", tgtok::XHead) 00471 .Case("tail", tgtok::XTail) 00472 .Case("con", tgtok::XConcat) 00473 .Case("add", tgtok::XADD) 00474 .Case("and", tgtok::XAND) 00475 .Case("shl", tgtok::XSHL) 00476 .Case("sra", tgtok::XSRA) 00477 .Case("srl", tgtok::XSRL) 00478 .Case("cast", tgtok::XCast) 00479 .Case("empty", tgtok::XEmpty) 00480 .Case("subst", tgtok::XSubst) 00481 .Case("foreach", tgtok::XForEach) 00482 .Case("listconcat", tgtok::XListConcat) 00483 .Case("strconcat", tgtok::XStrConcat) 00484 .Default(tgtok::Error); 00485 00486 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 00487 } 00488