clang API Documentation
00001 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements the NumericLiteralParser, CharLiteralParser, and 00011 // StringLiteralParser interfaces. 00012 // 00013 //===----------------------------------------------------------------------===// 00014 00015 #include "clang/Lex/LiteralSupport.h" 00016 #include "clang/Basic/CharInfo.h" 00017 #include "clang/Basic/TargetInfo.h" 00018 #include "clang/Lex/LexDiagnostic.h" 00019 #include "clang/Lex/Preprocessor.h" 00020 #include "llvm/ADT/StringExtras.h" 00021 #include "llvm/Support/ConvertUTF.h" 00022 #include "llvm/Support/ErrorHandling.h" 00023 00024 using namespace clang; 00025 00026 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { 00027 switch (kind) { 00028 default: llvm_unreachable("Unknown token type!"); 00029 case tok::char_constant: 00030 case tok::string_literal: 00031 case tok::utf8_char_constant: 00032 case tok::utf8_string_literal: 00033 return Target.getCharWidth(); 00034 case tok::wide_char_constant: 00035 case tok::wide_string_literal: 00036 return Target.getWCharWidth(); 00037 case tok::utf16_char_constant: 00038 case tok::utf16_string_literal: 00039 return Target.getChar16Width(); 00040 case tok::utf32_char_constant: 00041 case tok::utf32_string_literal: 00042 return Target.getChar32Width(); 00043 } 00044 } 00045 00046 static CharSourceRange MakeCharSourceRange(const LangOptions &Features, 00047 FullSourceLoc TokLoc, 00048 const char *TokBegin, 00049 const char *TokRangeBegin, 00050 const char *TokRangeEnd) { 00051 SourceLocation Begin = 00052 Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, 00053 
TokLoc.getManager(), Features); 00054 SourceLocation End = 00055 Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin, 00056 TokLoc.getManager(), Features); 00057 return CharSourceRange::getCharRange(Begin, End); 00058 } 00059 00060 /// \brief Produce a diagnostic highlighting some portion of a literal. 00061 /// 00062 /// Emits the diagnostic \p DiagID, highlighting the range of characters from 00063 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be 00064 /// a substring of a spelling buffer for the token beginning at \p TokBegin. 00065 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, 00066 const LangOptions &Features, FullSourceLoc TokLoc, 00067 const char *TokBegin, const char *TokRangeBegin, 00068 const char *TokRangeEnd, unsigned DiagID) { 00069 SourceLocation Begin = 00070 Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, 00071 TokLoc.getManager(), Features); 00072 return Diags->Report(Begin, DiagID) << 00073 MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd); 00074 } 00075 00076 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in 00077 /// either a character or a string literal. 00078 static unsigned ProcessCharEscape(const char *ThisTokBegin, 00079 const char *&ThisTokBuf, 00080 const char *ThisTokEnd, bool &HadError, 00081 FullSourceLoc Loc, unsigned CharWidth, 00082 DiagnosticsEngine *Diags, 00083 const LangOptions &Features) { 00084 const char *EscapeBegin = ThisTokBuf; 00085 00086 // Skip the '\' char. 00087 ++ThisTokBuf; 00088 00089 // We know that this character can't be off the end of the buffer, because 00090 // that would have been \", which would not have been the end of string. 00091 unsigned ResultChar = *ThisTokBuf++; 00092 switch (ResultChar) { 00093 // These map to themselves. 00094 case '\\': case '\'': case '"': case '?': break; 00095 00096 // These have fixed mappings. 
00097 case 'a': 00098 // TODO: K&R: the meaning of '\\a' is different in traditional C 00099 ResultChar = 7; 00100 break; 00101 case 'b': 00102 ResultChar = 8; 00103 break; 00104 case 'e': 00105 if (Diags) 00106 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 00107 diag::ext_nonstandard_escape) << "e"; 00108 ResultChar = 27; 00109 break; 00110 case 'E': 00111 if (Diags) 00112 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 00113 diag::ext_nonstandard_escape) << "E"; 00114 ResultChar = 27; 00115 break; 00116 case 'f': 00117 ResultChar = 12; 00118 break; 00119 case 'n': 00120 ResultChar = 10; 00121 break; 00122 case 'r': 00123 ResultChar = 13; 00124 break; 00125 case 't': 00126 ResultChar = 9; 00127 break; 00128 case 'v': 00129 ResultChar = 11; 00130 break; 00131 case 'x': { // Hex escape. 00132 ResultChar = 0; 00133 if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { 00134 if (Diags) 00135 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 00136 diag::err_hex_escape_no_digits) << "x"; 00137 HadError = 1; 00138 break; 00139 } 00140 00141 // Hex escapes are a maximal series of hex digits. 00142 bool Overflow = false; 00143 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { 00144 int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); 00145 if (CharVal == -1) break; 00146 // About to shift out a digit? 00147 Overflow |= (ResultChar & 0xF0000000) ? true : false; 00148 ResultChar <<= 4; 00149 ResultChar |= CharVal; 00150 } 00151 00152 // See if any bits will be truncated when evaluated as a character. 00153 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 00154 Overflow = true; 00155 ResultChar &= ~0U >> (32-CharWidth); 00156 } 00157 00158 // Check for overflow. 
00159 if (Overflow && Diags) // Too many digits to fit in 00160 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 00161 diag::err_hex_escape_too_large); 00162 break; 00163 } 00164 case '0': case '1': case '2': case '3': 00165 case '4': case '5': case '6': case '7': { 00166 // Octal escapes. 00167 --ThisTokBuf; 00168 ResultChar = 0; 00169 00170 // Octal escapes are a series of octal digits with maximum length 3. 00171 // "\0123" is a two digit sequence equal to "\012" "3". 00172 unsigned NumDigits = 0; 00173 do { 00174 ResultChar <<= 3; 00175 ResultChar |= *ThisTokBuf++ - '0'; 00176 ++NumDigits; 00177 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 && 00178 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); 00179 00180 // Check for overflow. Reject '\777', but not L'\777'. 00181 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 00182 if (Diags) 00183 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 00184 diag::err_octal_escape_too_large); 00185 ResultChar &= ~0U >> (32-CharWidth); 00186 } 00187 break; 00188 } 00189 00190 // Otherwise, these are not valid escapes. 00191 case '(': case '{': case '[': case '%': 00192 // GCC accepts these as extensions. We warn about them as such though. 
00193 if (Diags) 00194 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 00195 diag::ext_nonstandard_escape) 00196 << std::string(1, ResultChar); 00197 break; 00198 default: 00199 if (!Diags) 00200 break; 00201 00202 if (isPrintable(ResultChar)) 00203 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 00204 diag::ext_unknown_escape) 00205 << std::string(1, ResultChar); 00206 else 00207 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 00208 diag::ext_unknown_escape) 00209 << "x" + llvm::utohexstr(ResultChar); 00210 break; 00211 } 00212 00213 return ResultChar; 00214 } 00215 00216 static void appendCodePoint(unsigned Codepoint, 00217 llvm::SmallVectorImpl<char> &Str) { 00218 char ResultBuf[4]; 00219 char *ResultPtr = ResultBuf; 00220 bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); 00221 (void)Res; 00222 assert(Res && "Unexpected conversion failure"); 00223 Str.append(ResultBuf, ResultPtr); 00224 } 00225 00226 void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { 00227 for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { 00228 if (*I != '\\') { 00229 Buf.push_back(*I); 00230 continue; 00231 } 00232 00233 ++I; 00234 assert(*I == 'u' || *I == 'U'); 00235 00236 unsigned NumHexDigits; 00237 if (*I == 'u') 00238 NumHexDigits = 4; 00239 else 00240 NumHexDigits = 8; 00241 00242 assert(I + NumHexDigits <= E); 00243 00244 uint32_t CodePoint = 0; 00245 for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { 00246 unsigned Value = llvm::hexDigitValue(*I); 00247 assert(Value != -1U); 00248 00249 CodePoint <<= 4; 00250 CodePoint += Value; 00251 } 00252 00253 appendCodePoint(CodePoint, Buf); 00254 --I; 00255 } 00256 } 00257 00258 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and 00259 /// return the UTF32. 
00260 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, 00261 const char *ThisTokEnd, 00262 uint32_t &UcnVal, unsigned short &UcnLen, 00263 FullSourceLoc Loc, DiagnosticsEngine *Diags, 00264 const LangOptions &Features, 00265 bool in_char_string_literal = false) { 00266 const char *UcnBegin = ThisTokBuf; 00267 00268 // Skip the '\u' char's. 00269 ThisTokBuf += 2; 00270 00271 if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { 00272 if (Diags) 00273 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 00274 diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1); 00275 return false; 00276 } 00277 UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); 00278 unsigned short UcnLenSave = UcnLen; 00279 for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) { 00280 int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); 00281 if (CharVal == -1) break; 00282 UcnVal <<= 4; 00283 UcnVal |= CharVal; 00284 } 00285 // If we didn't consume the proper number of digits, there is a problem. 
00286 if (UcnLenSave) { 00287 if (Diags) 00288 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 00289 diag::err_ucn_escape_incomplete); 00290 return false; 00291 } 00292 00293 // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2] 00294 if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints 00295 UcnVal > 0x10FFFF) { // maximum legal UTF32 value 00296 if (Diags) 00297 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 00298 diag::err_ucn_escape_invalid); 00299 return false; 00300 } 00301 00302 // C++11 allows UCNs that refer to control characters and basic source 00303 // characters inside character and string literals 00304 if (UcnVal < 0xa0 && 00305 (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, ` 00306 bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal); 00307 if (Diags) { 00308 char BasicSCSChar = UcnVal; 00309 if (UcnVal >= 0x20 && UcnVal < 0x7f) 00310 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 00311 IsError ? diag::err_ucn_escape_basic_scs : 00312 diag::warn_cxx98_compat_literal_ucn_escape_basic_scs) 00313 << StringRef(&BasicSCSChar, 1); 00314 else 00315 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 00316 IsError ? diag::err_ucn_control_character : 00317 diag::warn_cxx98_compat_literal_ucn_control_character); 00318 } 00319 if (IsError) 00320 return false; 00321 } 00322 00323 if (!Features.CPlusPlus && !Features.C99 && Diags) 00324 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 00325 diag::warn_ucn_not_valid_in_c89_literal); 00326 00327 return true; 00328 } 00329 00330 /// MeasureUCNEscape - Determine the number of bytes within the resulting string 00331 /// which this UCN will occupy. 00332 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, 00333 const char *ThisTokEnd, unsigned CharByteWidth, 00334 const LangOptions &Features, bool &HadError) { 00335 // UTF-32: 4 bytes per escape. 
00336 if (CharByteWidth == 4) 00337 return 4; 00338 00339 uint32_t UcnVal = 0; 00340 unsigned short UcnLen = 0; 00341 FullSourceLoc Loc; 00342 00343 if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, 00344 UcnLen, Loc, nullptr, Features, true)) { 00345 HadError = true; 00346 return 0; 00347 } 00348 00349 // UTF-16: 2 bytes for BMP, 4 bytes otherwise. 00350 if (CharByteWidth == 2) 00351 return UcnVal <= 0xFFFF ? 2 : 4; 00352 00353 // UTF-8. 00354 if (UcnVal < 0x80) 00355 return 1; 00356 if (UcnVal < 0x800) 00357 return 2; 00358 if (UcnVal < 0x10000) 00359 return 3; 00360 return 4; 00361 } 00362 00363 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and 00364 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of 00365 /// StringLiteralParser. When we decide to implement UCN's for identifiers, 00366 /// we will likely rework our support for UCN's. 00367 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, 00368 const char *ThisTokEnd, 00369 char *&ResultBuf, bool &HadError, 00370 FullSourceLoc Loc, unsigned CharByteWidth, 00371 DiagnosticsEngine *Diags, 00372 const LangOptions &Features) { 00373 typedef uint32_t UTF32; 00374 UTF32 UcnVal = 0; 00375 unsigned short UcnLen = 0; 00376 if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, 00377 Loc, Diags, Features, true)) { 00378 HadError = true; 00379 return; 00380 } 00381 00382 assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) && 00383 "only character widths of 1, 2, or 4 bytes supported"); 00384 00385 (void)UcnLen; 00386 assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); 00387 00388 if (CharByteWidth == 4) { 00389 // FIXME: Make the type of the result buffer correct instead of 00390 // using reinterpret_cast. 
00391 UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf); 00392 *ResultPtr = UcnVal; 00393 ResultBuf += 4; 00394 return; 00395 } 00396 00397 if (CharByteWidth == 2) { 00398 // FIXME: Make the type of the result buffer correct instead of 00399 // using reinterpret_cast. 00400 UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf); 00401 00402 if (UcnVal <= (UTF32)0xFFFF) { 00403 *ResultPtr = UcnVal; 00404 ResultBuf += 2; 00405 return; 00406 } 00407 00408 // Convert to UTF16. 00409 UcnVal -= 0x10000; 00410 *ResultPtr = 0xD800 + (UcnVal >> 10); 00411 *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF); 00412 ResultBuf += 4; 00413 return; 00414 } 00415 00416 assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters"); 00417 00418 // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. 00419 // The conversion below was inspired by: 00420 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c 00421 // First, we determine how many bytes the result will require. 00422 typedef uint8_t UTF8; 00423 00424 unsigned short bytesToWrite = 0; 00425 if (UcnVal < (UTF32)0x80) 00426 bytesToWrite = 1; 00427 else if (UcnVal < (UTF32)0x800) 00428 bytesToWrite = 2; 00429 else if (UcnVal < (UTF32)0x10000) 00430 bytesToWrite = 3; 00431 else 00432 bytesToWrite = 4; 00433 00434 const unsigned byteMask = 0xBF; 00435 const unsigned byteMark = 0x80; 00436 00437 // Once the bits are split out into bytes of UTF8, this is a mask OR-ed 00438 // into the first byte, depending on how many bytes follow. 00439 static const UTF8 firstByteMark[5] = { 00440 0x00, 0x00, 0xC0, 0xE0, 0xF0 00441 }; 00442 // Finally, we write the bytes into ResultBuf. 00443 ResultBuf += bytesToWrite; 00444 switch (bytesToWrite) { // note: everything falls through. 
00445 case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 00446 case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 00447 case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 00448 case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]); 00449 } 00450 // Update the buffer. 00451 ResultBuf += bytesToWrite; 00452 } 00453 00454 00455 /// integer-constant: [C99 6.4.4.1] 00456 /// decimal-constant integer-suffix 00457 /// octal-constant integer-suffix 00458 /// hexadecimal-constant integer-suffix 00459 /// binary-literal integer-suffix [GNU, C++1y] 00460 /// user-defined-integer-literal: [C++11 lex.ext] 00461 /// decimal-literal ud-suffix 00462 /// octal-literal ud-suffix 00463 /// hexadecimal-literal ud-suffix 00464 /// binary-literal ud-suffix [GNU, C++1y] 00465 /// decimal-constant: 00466 /// nonzero-digit 00467 /// decimal-constant digit 00468 /// octal-constant: 00469 /// 0 00470 /// octal-constant octal-digit 00471 /// hexadecimal-constant: 00472 /// hexadecimal-prefix hexadecimal-digit 00473 /// hexadecimal-constant hexadecimal-digit 00474 /// hexadecimal-prefix: one of 00475 /// 0x 0X 00476 /// binary-literal: 00477 /// 0b binary-digit 00478 /// 0B binary-digit 00479 /// binary-literal binary-digit 00480 /// integer-suffix: 00481 /// unsigned-suffix [long-suffix] 00482 /// unsigned-suffix [long-long-suffix] 00483 /// long-suffix [unsigned-suffix] 00484 /// long-long-suffix [unsigned-sufix] 00485 /// nonzero-digit: 00486 /// 1 2 3 4 5 6 7 8 9 00487 /// octal-digit: 00488 /// 0 1 2 3 4 5 6 7 00489 /// hexadecimal-digit: 00490 /// 0 1 2 3 4 5 6 7 8 9 00491 /// a b c d e f 00492 /// A B C D E F 00493 /// binary-digit: 00494 /// 0 00495 /// 1 00496 /// unsigned-suffix: one of 00497 /// u U 00498 /// long-suffix: one of 00499 /// l L 00500 /// long-long-suffix: one of 00501 /// ll LL 00502 /// 00503 /// floating-constant: [C99 6.4.4.2] 00504 /// TODO: add rules... 
00505 /// 00506 NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, 00507 SourceLocation TokLoc, 00508 Preprocessor &PP) 00509 : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) { 00510 00511 // This routine assumes that the range begin/end matches the regex for integer 00512 // and FP constants (specifically, the 'pp-number' regex), and assumes that 00513 // the byte at "*end" is both valid and not part of the regex. Because of 00514 // this, it doesn't have to check for 'overscan' in various places. 00515 assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?"); 00516 00517 s = DigitsBegin = ThisTokBegin; 00518 saw_exponent = false; 00519 saw_period = false; 00520 saw_ud_suffix = false; 00521 isLong = false; 00522 isUnsigned = false; 00523 isLongLong = false; 00524 isFloat = false; 00525 isImaginary = false; 00526 MicrosoftInteger = 0; 00527 hadError = false; 00528 00529 if (*s == '0') { // parse radix 00530 ParseNumberStartingWithZero(TokLoc); 00531 if (hadError) 00532 return; 00533 } else { // the first digit is non-zero 00534 radix = 10; 00535 s = SkipDigits(s); 00536 if (s == ThisTokEnd) { 00537 // Done. 
00538 } else if (isHexDigit(*s) && !(*s == 'e' || *s == 'E')) { 00539 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin), 00540 diag::err_invalid_decimal_digit) << StringRef(s, 1); 00541 hadError = true; 00542 return; 00543 } else if (*s == '.') { 00544 checkSeparator(TokLoc, s, CSK_AfterDigits); 00545 s++; 00546 saw_period = true; 00547 checkSeparator(TokLoc, s, CSK_BeforeDigits); 00548 s = SkipDigits(s); 00549 } 00550 if ((*s == 'e' || *s == 'E')) { // exponent 00551 checkSeparator(TokLoc, s, CSK_AfterDigits); 00552 const char *Exponent = s; 00553 s++; 00554 saw_exponent = true; 00555 if (*s == '+' || *s == '-') s++; // sign 00556 checkSeparator(TokLoc, s, CSK_BeforeDigits); 00557 const char *first_non_digit = SkipDigits(s); 00558 if (first_non_digit != s) { 00559 s = first_non_digit; 00560 } else { 00561 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent - ThisTokBegin), 00562 diag::err_exponent_has_no_digits); 00563 hadError = true; 00564 return; 00565 } 00566 } 00567 } 00568 00569 SuffixBegin = s; 00570 checkSeparator(TokLoc, s, CSK_AfterDigits); 00571 00572 // Parse the suffix. At this point we can classify whether we have an FP or 00573 // integer constant. 00574 bool isFPConstant = isFloatingLiteral(); 00575 const char *ImaginarySuffixLoc = nullptr; 00576 00577 // Loop over all of the characters of the suffix. If we see something bad, 00578 // we break out of the loop. 00579 for (; s != ThisTokEnd; ++s) { 00580 switch (*s) { 00581 case 'f': // FP Suffix for "float" 00582 case 'F': 00583 if (!isFPConstant) break; // Error for integer constant. 00584 if (isFloat || isLong) break; // FF, LF invalid. 00585 isFloat = true; 00586 continue; // Success. 00587 case 'u': 00588 case 'U': 00589 if (isFPConstant) break; // Error for floating constant. 00590 if (isUnsigned) break; // Cannot be repeated. 00591 isUnsigned = true; 00592 continue; // Success. 00593 case 'l': 00594 case 'L': 00595 if (isLong || isLongLong) break; // Cannot be repeated. 
00596 if (isFloat) break; // LF invalid. 00597 00598 // Check for long long. The L's need to be adjacent and the same case. 00599 if (s+1 != ThisTokEnd && s[1] == s[0]) { 00600 if (isFPConstant) break; // long long invalid for floats. 00601 isLongLong = true; 00602 ++s; // Eat both of them. 00603 } else { 00604 isLong = true; 00605 } 00606 continue; // Success. 00607 case 'i': 00608 case 'I': 00609 if (PP.getLangOpts().MicrosoftExt) { 00610 if (isLong || isLongLong || MicrosoftInteger) 00611 break; 00612 00613 // Allow i8, i16, i32, i64, and i128. 00614 if (s + 1 != ThisTokEnd) { 00615 switch (s[1]) { 00616 case '8': 00617 if (isFPConstant) break; 00618 s += 2; // i8 suffix 00619 MicrosoftInteger = 8; 00620 break; 00621 case '1': 00622 if (isFPConstant) break; 00623 if (s + 2 == ThisTokEnd) break; 00624 if (s[2] == '6') { 00625 s += 3; // i16 suffix 00626 MicrosoftInteger = 16; 00627 } 00628 else if (s[2] == '2') { 00629 if (s + 3 == ThisTokEnd) break; 00630 if (s[3] == '8') { 00631 s += 4; // i128 suffix 00632 MicrosoftInteger = 128; 00633 } 00634 } 00635 break; 00636 case '3': 00637 if (isFPConstant) break; 00638 if (s + 2 == ThisTokEnd) break; 00639 if (s[2] == '2') { 00640 s += 3; // i32 suffix 00641 MicrosoftInteger = 32; 00642 } 00643 break; 00644 case '6': 00645 if (isFPConstant) break; 00646 if (s + 2 == ThisTokEnd) break; 00647 if (s[2] == '4') { 00648 s += 3; // i64 suffix 00649 MicrosoftInteger = 64; 00650 } 00651 break; 00652 default: 00653 break; 00654 } 00655 if (MicrosoftInteger) 00656 break; 00657 } 00658 } 00659 // "i", "if", and "il" are user-defined suffixes in C++1y. 00660 if (PP.getLangOpts().CPlusPlus14 && *s == 'i') 00661 break; 00662 // fall through. 00663 case 'j': 00664 case 'J': 00665 if (isImaginary) break; // Cannot be repeated. 00666 isImaginary = true; 00667 ImaginarySuffixLoc = s; 00668 continue; // Success. 00669 } 00670 // If we reached here, there was an error or a ud-suffix. 
00671 break; 00672 } 00673 00674 if (s != ThisTokEnd) { 00675 // FIXME: Don't bother expanding UCNs if !tok.hasUCN(). 00676 expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)); 00677 if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) { 00678 // Any suffix pieces we might have parsed are actually part of the 00679 // ud-suffix. 00680 isLong = false; 00681 isUnsigned = false; 00682 isLongLong = false; 00683 isFloat = false; 00684 isImaginary = false; 00685 MicrosoftInteger = 0; 00686 00687 saw_ud_suffix = true; 00688 return; 00689 } 00690 00691 // Report an error if there are any. 00692 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin), 00693 isFPConstant ? diag::err_invalid_suffix_float_constant : 00694 diag::err_invalid_suffix_integer_constant) 00695 << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin); 00696 hadError = true; 00697 return; 00698 } 00699 00700 if (isImaginary) { 00701 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, 00702 ImaginarySuffixLoc - ThisTokBegin), 00703 diag::ext_imaginary_constant); 00704 } 00705 } 00706 00707 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved 00708 /// suffixes as ud-suffixes, because the diagnostic experience is better if we 00709 /// treat it as an invalid suffix. 00710 bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, 00711 StringRef Suffix) { 00712 if (!LangOpts.CPlusPlus11 || Suffix.empty()) 00713 return false; 00714 00715 // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid. 00716 if (Suffix[0] == '_') 00717 return true; 00718 00719 // In C++11, there are no library suffixes. 00720 if (!LangOpts.CPlusPlus14) 00721 return false; 00722 00723 // In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library. 00724 // Per tweaked N3660, "il", "i", and "if" are also used in the library. 
00725 return llvm::StringSwitch<bool>(Suffix) 00726 .Cases("h", "min", "s", true) 00727 .Cases("ms", "us", "ns", true) 00728 .Cases("il", "i", "if", true) 00729 .Default(false); 00730 } 00731 00732 void NumericLiteralParser::checkSeparator(SourceLocation TokLoc, 00733 const char *Pos, 00734 CheckSeparatorKind IsAfterDigits) { 00735 if (IsAfterDigits == CSK_AfterDigits) { 00736 if (Pos == ThisTokBegin) 00737 return; 00738 --Pos; 00739 } else if (Pos == ThisTokEnd) 00740 return; 00741 00742 if (isDigitSeparator(*Pos)) 00743 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin), 00744 diag::err_digit_separator_not_between_digits) 00745 << IsAfterDigits; 00746 } 00747 00748 /// ParseNumberStartingWithZero - This method is called when the first character 00749 /// of the number is found to be a zero. This means it is either an octal 00750 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or 00751 /// a floating point number (01239.123e4). Eat the prefix, determining the 00752 /// radix etc. 00753 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { 00754 assert(s[0] == '0' && "Invalid method call"); 00755 s++; 00756 00757 int c1 = s[0]; 00758 int c2 = s[1]; 00759 00760 // Handle a hex number like 0x1234. 00761 if ((c1 == 'x' || c1 == 'X') && (isHexDigit(c2) || c2 == '.')) { 00762 s++; 00763 radix = 16; 00764 DigitsBegin = s; 00765 s = SkipHexDigits(s); 00766 bool noSignificand = (s == DigitsBegin); 00767 if (s == ThisTokEnd) { 00768 // Done. 
00769 } else if (*s == '.') { 00770 s++; 00771 saw_period = true; 00772 const char *floatDigitsBegin = s; 00773 checkSeparator(TokLoc, s, CSK_BeforeDigits); 00774 s = SkipHexDigits(s); 00775 noSignificand &= (floatDigitsBegin == s); 00776 } 00777 00778 if (noSignificand) { 00779 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin), 00780 diag::err_hexconstant_requires_digits); 00781 hadError = true; 00782 return; 00783 } 00784 00785 // A binary exponent can appear with or with a '.'. If dotted, the 00786 // binary exponent is required. 00787 if (*s == 'p' || *s == 'P') { 00788 checkSeparator(TokLoc, s, CSK_AfterDigits); 00789 const char *Exponent = s; 00790 s++; 00791 saw_exponent = true; 00792 if (*s == '+' || *s == '-') s++; // sign 00793 const char *first_non_digit = SkipDigits(s); 00794 if (first_non_digit == s) { 00795 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 00796 diag::err_exponent_has_no_digits); 00797 hadError = true; 00798 return; 00799 } 00800 checkSeparator(TokLoc, s, CSK_BeforeDigits); 00801 s = first_non_digit; 00802 00803 if (!PP.getLangOpts().HexFloats) 00804 PP.Diag(TokLoc, diag::ext_hexconstant_invalid); 00805 } else if (saw_period) { 00806 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 00807 diag::err_hexconstant_requires_exponent); 00808 hadError = true; 00809 } 00810 return; 00811 } 00812 00813 // Handle simple binary numbers 0b01010 00814 if ((c1 == 'b' || c1 == 'B') && (c2 == '0' || c2 == '1')) { 00815 // 0b101010 is a C++1y / GCC extension. 00816 PP.Diag(TokLoc, 00817 PP.getLangOpts().CPlusPlus14 00818 ? diag::warn_cxx11_compat_binary_literal 00819 : PP.getLangOpts().CPlusPlus 00820 ? diag::ext_binary_literal_cxx14 00821 : diag::ext_binary_literal); 00822 ++s; 00823 radix = 2; 00824 DigitsBegin = s; 00825 s = SkipBinaryDigits(s); 00826 if (s == ThisTokEnd) { 00827 // Done. 
00828 } else if (isHexDigit(*s)) { 00829 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 00830 diag::err_invalid_binary_digit) << StringRef(s, 1); 00831 hadError = true; 00832 } 00833 // Other suffixes will be diagnosed by the caller. 00834 return; 00835 } 00836 00837 // For now, the radix is set to 8. If we discover that we have a 00838 // floating point constant, the radix will change to 10. Octal floating 00839 // point constants are not permitted (only decimal and hexadecimal). 00840 radix = 8; 00841 DigitsBegin = s; 00842 s = SkipOctalDigits(s); 00843 if (s == ThisTokEnd) 00844 return; // Done, simple octal number like 01234 00845 00846 // If we have some other non-octal digit that *is* a decimal digit, see if 00847 // this is part of a floating point number like 094.123 or 09e1. 00848 if (isDigit(*s)) { 00849 const char *EndDecimal = SkipDigits(s); 00850 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') { 00851 s = EndDecimal; 00852 radix = 10; 00853 } 00854 } 00855 00856 // If we have a hex digit other than 'e' (which denotes a FP exponent) then 00857 // the code is using an incorrect base. 00858 if (isHexDigit(*s) && *s != 'e' && *s != 'E') { 00859 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 00860 diag::err_invalid_octal_digit) << StringRef(s, 1); 00861 hadError = true; 00862 return; 00863 } 00864 00865 if (*s == '.') { 00866 s++; 00867 radix = 10; 00868 saw_period = true; 00869 checkSeparator(TokLoc, s, CSK_BeforeDigits); 00870 s = SkipDigits(s); // Skip suffix. 
00871 } 00872 if (*s == 'e' || *s == 'E') { // exponent 00873 checkSeparator(TokLoc, s, CSK_AfterDigits); 00874 const char *Exponent = s; 00875 s++; 00876 radix = 10; 00877 saw_exponent = true; 00878 if (*s == '+' || *s == '-') s++; // sign 00879 const char *first_non_digit = SkipDigits(s); 00880 if (first_non_digit != s) { 00881 checkSeparator(TokLoc, s, CSK_BeforeDigits); 00882 s = first_non_digit; 00883 } else { 00884 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 00885 diag::err_exponent_has_no_digits); 00886 hadError = true; 00887 return; 00888 } 00889 } 00890 } 00891 00892 static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) { 00893 switch (Radix) { 00894 case 2: 00895 return NumDigits <= 64; 00896 case 8: 00897 return NumDigits <= 64 / 3; // Digits are groups of 3 bits. 00898 case 10: 00899 return NumDigits <= 19; // floor(log10(2^64)) 00900 case 16: 00901 return NumDigits <= 64 / 4; // Digits are groups of 4 bits. 00902 default: 00903 llvm_unreachable("impossible Radix"); 00904 } 00905 } 00906 00907 /// GetIntegerValue - Convert this numeric literal value to an APInt that 00908 /// matches Val's input width. If there is an overflow, set Val to the low bits 00909 /// of the result and return true. Otherwise, return false. 00910 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { 00911 // Fast path: Compute a conservative bound on the maximum number of 00912 // bits per digit in this radix. If we can't possibly overflow a 00913 // uint64 based on that bound then do the simple conversion to 00914 // integer. This avoids the expensive overflow checking below, and 00915 // handles the common cases that matter (small decimal integers and 00916 // hex/octal values which don't overflow). 
00917 const unsigned NumDigits = SuffixBegin - DigitsBegin; 00918 if (alwaysFitsInto64Bits(radix, NumDigits)) { 00919 uint64_t N = 0; 00920 for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr) 00921 if (!isDigitSeparator(*Ptr)) 00922 N = N * radix + llvm::hexDigitValue(*Ptr); 00923 00924 // This will truncate the value to Val's input width. Simply check 00925 // for overflow by comparing. 00926 Val = N; 00927 return Val.getZExtValue() != N; 00928 } 00929 00930 Val = 0; 00931 const char *Ptr = DigitsBegin; 00932 00933 llvm::APInt RadixVal(Val.getBitWidth(), radix); 00934 llvm::APInt CharVal(Val.getBitWidth(), 0); 00935 llvm::APInt OldVal = Val; 00936 00937 bool OverflowOccurred = false; 00938 while (Ptr < SuffixBegin) { 00939 if (isDigitSeparator(*Ptr)) { 00940 ++Ptr; 00941 continue; 00942 } 00943 00944 unsigned C = llvm::hexDigitValue(*Ptr++); 00945 00946 // If this letter is out of bound for this radix, reject it. 00947 assert(C < radix && "NumericLiteralParser ctor should have rejected this"); 00948 00949 CharVal = C; 00950 00951 // Add the digit to the value in the appropriate radix. If adding in digits 00952 // made the value smaller, then this overflowed. 00953 OldVal = Val; 00954 00955 // Multiply by radix, did overflow occur on the multiply? 00956 Val *= RadixVal; 00957 OverflowOccurred |= Val.udiv(RadixVal) != OldVal; 00958 00959 // Add value, did overflow occur on the value? 
00960 // (a + b) ult b <=> overflow 00961 Val += CharVal; 00962 OverflowOccurred |= Val.ult(CharVal); 00963 } 00964 return OverflowOccurred; 00965 } 00966 00967 llvm::APFloat::opStatus 00968 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { 00969 using llvm::APFloat; 00970 00971 unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin); 00972 00973 llvm::SmallString<16> Buffer; 00974 StringRef Str(ThisTokBegin, n); 00975 if (Str.find('\'') != StringRef::npos) { 00976 Buffer.reserve(n); 00977 std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer), 00978 &isDigitSeparator); 00979 Str = Buffer; 00980 } 00981 00982 return Result.convertFromString(Str, APFloat::rmNearestTiesToEven); 00983 } 00984 00985 00986 /// \verbatim 00987 /// user-defined-character-literal: [C++11 lex.ext] 00988 /// character-literal ud-suffix 00989 /// ud-suffix: 00990 /// identifier 00991 /// character-literal: [C++11 lex.ccon] 00992 /// ' c-char-sequence ' 00993 /// u' c-char-sequence ' 00994 /// U' c-char-sequence ' 00995 /// L' c-char-sequence ' 00996 /// c-char-sequence: 00997 /// c-char 00998 /// c-char-sequence c-char 00999 /// c-char: 01000 /// any member of the source character set except the single-quote ', 01001 /// backslash \, or new-line character 01002 /// escape-sequence 01003 /// universal-character-name 01004 /// escape-sequence: 01005 /// simple-escape-sequence 01006 /// octal-escape-sequence 01007 /// hexadecimal-escape-sequence 01008 /// simple-escape-sequence: 01009 /// one of \' \" \? 
\\ \a \b \f \n \r \t \v 01010 /// octal-escape-sequence: 01011 /// \ octal-digit 01012 /// \ octal-digit octal-digit 01013 /// \ octal-digit octal-digit octal-digit 01014 /// hexadecimal-escape-sequence: 01015 /// \x hexadecimal-digit 01016 /// hexadecimal-escape-sequence hexadecimal-digit 01017 /// universal-character-name: [C++11 lex.charset] 01018 /// \u hex-quad 01019 /// \U hex-quad hex-quad 01020 /// hex-quad: 01021 /// hex-digit hex-digit hex-digit hex-digit 01022 /// \endverbatim 01023 /// 01024 CharLiteralParser::CharLiteralParser(const char *begin, const char *end, 01025 SourceLocation Loc, Preprocessor &PP, 01026 tok::TokenKind kind) { 01027 // At this point we know that the character matches the regex "(L|u|U)?'.*'". 01028 HadError = false; 01029 01030 Kind = kind; 01031 01032 const char *TokBegin = begin; 01033 01034 // Skip over wide character determinant. 01035 if (Kind != tok::char_constant) 01036 ++begin; 01037 if (Kind == tok::utf8_char_constant) 01038 ++begin; 01039 01040 // Skip over the entry quote. 01041 assert(begin[0] == '\'' && "Invalid token lexed"); 01042 ++begin; 01043 01044 // Remove an optional ud-suffix. 01045 if (end[-1] != '\'') { 01046 const char *UDSuffixEnd = end; 01047 do { 01048 --end; 01049 } while (end[-1] != '\''); 01050 // FIXME: Don't bother with this if !tok.hasUCN(). 01051 expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end)); 01052 UDSuffixOffset = end - TokBegin; 01053 } 01054 01055 // Trim the ending quote. 01056 assert(end != begin && "Invalid token lexed"); 01057 --end; 01058 01059 // FIXME: The "Value" is an uint64_t so we can handle char literals of 01060 // up to 64-bits. 01061 // FIXME: This extensively assumes that 'char' is 8-bits. 
01062 assert(PP.getTargetInfo().getCharWidth() == 8 && 01063 "Assumes char is 8 bits"); 01064 assert(PP.getTargetInfo().getIntWidth() <= 64 && 01065 (PP.getTargetInfo().getIntWidth() & 7) == 0 && 01066 "Assumes sizeof(int) on target is <= 64 and a multiple of char"); 01067 assert(PP.getTargetInfo().getWCharWidth() <= 64 && 01068 "Assumes sizeof(wchar) on target is <= 64"); 01069 01070 SmallVector<uint32_t, 4> codepoint_buffer; 01071 codepoint_buffer.resize(end - begin); 01072 uint32_t *buffer_begin = &codepoint_buffer.front(); 01073 uint32_t *buffer_end = buffer_begin + codepoint_buffer.size(); 01074 01075 // Unicode escapes representing characters that cannot be correctly 01076 // represented in a single code unit are disallowed in character literals 01077 // by this implementation. 01078 uint32_t largest_character_for_kind; 01079 if (tok::wide_char_constant == Kind) { 01080 largest_character_for_kind = 01081 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth()); 01082 } else if (tok::utf8_char_constant == Kind) { 01083 largest_character_for_kind = 0x7F; 01084 } else if (tok::utf16_char_constant == Kind) { 01085 largest_character_for_kind = 0xFFFF; 01086 } else if (tok::utf32_char_constant == Kind) { 01087 largest_character_for_kind = 0x10FFFF; 01088 } else { 01089 largest_character_for_kind = 0x7Fu; 01090 } 01091 01092 while (begin != end) { 01093 // Is this a span of non-escape characters? 
01094 if (begin[0] != '\\') { 01095 char const *start = begin; 01096 do { 01097 ++begin; 01098 } while (begin != end && *begin != '\\'); 01099 01100 char const *tmp_in_start = start; 01101 uint32_t *tmp_out_start = buffer_begin; 01102 ConversionResult res = 01103 ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start), 01104 reinterpret_cast<UTF8 const *>(begin), 01105 &buffer_begin, buffer_end, strictConversion); 01106 if (res != conversionOK) { 01107 // If we see bad encoding for unprefixed character literals, warn and 01108 // simply copy the byte values, for compatibility with gcc and 01109 // older versions of clang. 01110 bool NoErrorOnBadEncoding = isAscii(); 01111 unsigned Msg = diag::err_bad_character_encoding; 01112 if (NoErrorOnBadEncoding) 01113 Msg = diag::warn_bad_character_encoding; 01114 PP.Diag(Loc, Msg); 01115 if (NoErrorOnBadEncoding) { 01116 start = tmp_in_start; 01117 buffer_begin = tmp_out_start; 01118 for (; start != begin; ++start, ++buffer_begin) 01119 *buffer_begin = static_cast<uint8_t>(*start); 01120 } else { 01121 HadError = true; 01122 } 01123 } else { 01124 for (; tmp_out_start < buffer_begin; ++tmp_out_start) { 01125 if (*tmp_out_start > largest_character_for_kind) { 01126 HadError = true; 01127 PP.Diag(Loc, diag::err_character_too_large); 01128 } 01129 } 01130 } 01131 01132 continue; 01133 } 01134 // Is this a Universal Character Name escape? 
01135 if (begin[1] == 'u' || begin[1] == 'U') { 01136 unsigned short UcnLen = 0; 01137 if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, 01138 FullSourceLoc(Loc, PP.getSourceManager()), 01139 &PP.getDiagnostics(), PP.getLangOpts(), true)) { 01140 HadError = true; 01141 } else if (*buffer_begin > largest_character_for_kind) { 01142 HadError = true; 01143 PP.Diag(Loc, diag::err_character_too_large); 01144 } 01145 01146 ++buffer_begin; 01147 continue; 01148 } 01149 unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo()); 01150 uint64_t result = 01151 ProcessCharEscape(TokBegin, begin, end, HadError, 01152 FullSourceLoc(Loc,PP.getSourceManager()), 01153 CharWidth, &PP.getDiagnostics(), PP.getLangOpts()); 01154 *buffer_begin++ = result; 01155 } 01156 01157 unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front(); 01158 01159 if (NumCharsSoFar > 1) { 01160 if (isWide()) 01161 PP.Diag(Loc, diag::warn_extraneous_char_constant); 01162 else if (isAscii() && NumCharsSoFar == 4) 01163 PP.Diag(Loc, diag::ext_four_char_character_literal); 01164 else if (isAscii()) 01165 PP.Diag(Loc, diag::ext_multichar_character_literal); 01166 else 01167 PP.Diag(Loc, diag::err_multichar_utf_character_literal); 01168 IsMultiChar = true; 01169 } else { 01170 IsMultiChar = false; 01171 } 01172 01173 llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0); 01174 01175 // Narrow character literals act as though their value is concatenated 01176 // in this implementation, but warn on overflow. 
01177 bool multi_char_too_long = false; 01178 if (isAscii() && isMultiChar()) { 01179 LitVal = 0; 01180 for (size_t i = 0; i < NumCharsSoFar; ++i) { 01181 // check for enough leading zeros to shift into 01182 multi_char_too_long |= (LitVal.countLeadingZeros() < 8); 01183 LitVal <<= 8; 01184 LitVal = LitVal + (codepoint_buffer[i] & 0xFF); 01185 } 01186 } else if (NumCharsSoFar > 0) { 01187 // otherwise just take the last character 01188 LitVal = buffer_begin[-1]; 01189 } 01190 01191 if (!HadError && multi_char_too_long) { 01192 PP.Diag(Loc, diag::warn_char_constant_too_large); 01193 } 01194 01195 // Transfer the value from APInt to uint64_t 01196 Value = LitVal.getZExtValue(); 01197 01198 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") 01199 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple 01200 // character constants are not sign extended in the this implementation: 01201 // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. 
01202 if (isAscii() && NumCharsSoFar == 1 && (Value & 128) && 01203 PP.getLangOpts().CharIsSigned) 01204 Value = (signed char)Value; 01205 } 01206 01207 /// \verbatim 01208 /// string-literal: [C++0x lex.string] 01209 /// encoding-prefix " [s-char-sequence] " 01210 /// encoding-prefix R raw-string 01211 /// encoding-prefix: 01212 /// u8 01213 /// u 01214 /// U 01215 /// L 01216 /// s-char-sequence: 01217 /// s-char 01218 /// s-char-sequence s-char 01219 /// s-char: 01220 /// any member of the source character set except the double-quote ", 01221 /// backslash \, or new-line character 01222 /// escape-sequence 01223 /// universal-character-name 01224 /// raw-string: 01225 /// " d-char-sequence ( r-char-sequence ) d-char-sequence " 01226 /// r-char-sequence: 01227 /// r-char 01228 /// r-char-sequence r-char 01229 /// r-char: 01230 /// any member of the source character set, except a right parenthesis ) 01231 /// followed by the initial d-char-sequence (which may be empty) 01232 /// followed by a double quote ". 01233 /// d-char-sequence: 01234 /// d-char 01235 /// d-char-sequence d-char 01236 /// d-char: 01237 /// any member of the basic source character set except: 01238 /// space, the left parenthesis (, the right parenthesis ), 01239 /// the backslash \, and the control characters representing horizontal 01240 /// tab, vertical tab, form feed, and newline. 01241 /// escape-sequence: [C++0x lex.ccon] 01242 /// simple-escape-sequence 01243 /// octal-escape-sequence 01244 /// hexadecimal-escape-sequence 01245 /// simple-escape-sequence: 01246 /// one of \' \" \? 
///                 \\ \a \b \f \n \r \t \v
///       octal-escape-sequence:
///         \ octal-digit
///         \ octal-digit octal-digit
///         \ octal-digit octal-digit octal-digit
///       hexadecimal-escape-sequence:
///         \x hexadecimal-digit
///         hexadecimal-escape-sequence hexadecimal-digit
///       universal-character-name:
///         \u hex-quad
///         \U hex-quad hex-quad
///       hex-quad:
///         hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
/// Captures the source manager, language options and target from \p PP, keeps
/// a diagnostics engine only when \p Complain is true (otherwise Diags is
/// null and nothing is reported), and then parses \p StringToks via init().
StringLiteralParser::
StringLiteralParser(ArrayRef<Token> StringToks,
                    Preprocessor &PP, bool Complain)
  : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
    Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
  init(StringToks);
}

/// Concatenates and decodes the string-literal tokens in \p StringToks into
/// ResultBuf.  The work is done in two passes: the first computes the literal
/// kind, the longest token and an upper bound on the output size; the second
/// fetches each token's spelling and copies/decodes it through ResultPtr.
/// Any failure sets hadError; malformed tokens abort early through
/// DiagnoseLexingError().
void StringLiteralParser::init(ArrayRef<Token> StringToks){
  // The literal token may have come from an invalid source location (e.g. due
  // to a PCH error), in which case the token length will be 0.
  if (StringToks.empty() || StringToks[0].getLength() < 2)
    return DiagnoseLexingError(SourceLocation());

  // Scan all of the string portions, remember the max individual token length,
  // computing a bound on the concatenated string length, and see whether any
  // piece is a wide-string.  If any of the string portions is a wide-string
  // literal, the result is a wide-string literal [C99 6.4.5p4].
  assert(!StringToks.empty() && "expected at least one token");
  MaxTokenLength = StringToks[0].getLength();
  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
  Kind = StringToks[0].getKind();

  hadError = false;

  // Implement Translation Phase #6: concatenation of string literals
  // (C99 5.1.1.2p1).  The common case is only one string fragment.
  for (unsigned i = 1; i != StringToks.size(); ++i) {
    if (StringToks[i].getLength() < 2)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // The string could be shorter than this if it needs cleaning, but this is a
    // reasonable bound, which is all we need.
    assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
    SizeBound += StringToks[i].getLength()-2;  // -2 for "".

    // Remember maximum string piece length.
    if (StringToks[i].getLength() > MaxTokenLength)
      MaxTokenLength = StringToks[i].getLength();

    // Remember if we see any wide or utf-8/16/32 strings.
    // Also check for illegal concatenations.
    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
      if (isAscii()) {
        // Nothing but ordinary pieces so far: adopt this piece's kind.
        Kind = StringToks[i].getKind();
      } else {
        // Two different prefixed kinds cannot be concatenated.
        if (Diags)
          Diags->Report(StringToks[i].getLocation(),
                        diag::err_unsupported_string_concat);
        hadError = true;
      }
    }
  }

  // Include space for the null terminator.
  ++SizeBound;

  // TODO: K&R warning: "traditional C rejects string constant concatenation"

  // Get the width in bytes of char/wchar_t/char16_t/char32_t
  // (getCharWidth() is in bits, hence the division by 8 below).
  CharByteWidth = getCharWidth(Kind, Target);
  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  CharByteWidth /= 8;

  // The output buffer size needs to be large enough to hold wide characters.
  // This is a worst-case assumption which basically corresponds to L"" "long".
  SizeBound *= CharByteWidth;

  // Size the temporary buffer to hold the result string data.
  ResultBuf.resize(SizeBound);

  // Likewise, but for each string piece.
  SmallString<512> TokenBuf;
  TokenBuf.resize(MaxTokenLength);

  // Loop over all the strings, getting their spelling, and expanding them to
  // wide strings as appropriate.
  ResultPtr = &ResultBuf[0];   // Next byte to fill in.

  Pascal = false;

  // Location of the token that supplied the first ud-suffix; used as the
  // first source range in the mixed-suffix diagnostic below.
  SourceLocation UDSuffixTokLoc;

  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
    const char *ThisTokBuf = &TokenBuf[0];
    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
    // that ThisTokBuf points to a buffer that is big enough for the whole token
    // and 'spelled' tokens can only shrink.
    bool StringInvalid = false;
    unsigned ThisTokLen =
      Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                         &StringInvalid);
    if (StringInvalid)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // ThisTokBegin keeps the start of the spelling; ThisTokBuf is the cursor
    // that advances through it from here on.
    const char *ThisTokBegin = ThisTokBuf;
    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

    // Remove an optional ud-suffix.
    if (ThisTokEnd[-1] != '"') {
      const char *UDSuffixEnd = ThisTokEnd;
      do {
        --ThisTokEnd;
      } while (ThisTokEnd[-1] != '"');

      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

      if (UDSuffixBuf.empty()) {
        // First suffix seen: record it and where it came from.
        if (StringToks[i].hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
      } else {
        SmallString<32> ExpandedUDSuffix;
        if (StringToks[i].hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }

        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
        if (UDSuffixBuf != UDSuffix) {
          if (Diags) {
            SourceLocation TokLoc = StringToks[i].getLocation();
            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
              << UDSuffixBuf << UDSuffix
              << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
              << SourceRange(TokLoc, TokLoc);
          }
          hadError = true;
        }
      }
    }

    // Strip the end quote.
    --ThisTokEnd;

    // TODO: Input character set mapping support.

    // Skip marker for wide or unicode strings.
    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
      ++ThisTokBuf;
      // Skip 8 of u8 marker for utf8 strings.
      if (ThisTokBuf[0] == '8')
        ++ThisTokBuf;
    }

    // Check for raw string
    if (ThisTokBuf[0] == 'R') {
      ThisTokBuf += 2; // skip R"

      // Everything from here up to the '(' is the delimiter.
      const char *Prefix = ThisTokBuf;
      while (ThisTokBuf[0] != '(')
        ++ThisTokBuf;
      ++ThisTokBuf; // skip '('

      // Remove same number of characters from the end
      ThisTokEnd -= ThisTokBuf - Prefix;
      assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");

      // Copy the string over
      if (CopyStringFragment(StringToks[i], ThisTokBegin,
                             StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
        hadError = true;
    } else {
      if (ThisTokBuf[0] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ++ThisTokBuf; // skip "

      // Check if this is a pascal string
      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {

        // If the \p sequence is found in the first token, we have a pascal string
        // Otherwise, if we already have a pascal string, ignore the first \p
        if (i == 0) {
          // Only the backslash is skipped here, so the 'p' is still copied
          // into the first output slot; that slot is overwritten with the
          // length once all pieces have been processed.
          ++ThisTokBuf;
          Pascal = true;
        } else if (Pascal)
          ThisTokBuf += 2;
      }

      while (ThisTokBuf != ThisTokEnd) {
        // Is this a span of non-escape characters?
        if (ThisTokBuf[0] != '\\') {
          const char *InStart = ThisTokBuf;
          do {
            ++ThisTokBuf;
          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

          // Copy the character span over.
          if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                 StringRef(InStart, ThisTokBuf - InStart)))
            hadError = true;
          continue;
        }
        // Is this a Universal Character Name escape?
        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
                          ResultPtr, hadError,
                          FullSourceLoc(StringToks[i].getLocation(), SM),
                          CharByteWidth, Diags, Features);
          continue;
        }
        // Otherwise, this is a non-UCN escape character.  Process it.
        unsigned ResultChar =
          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                            FullSourceLoc(StringToks[i].getLocation(), SM),
                            CharByteWidth*8, Diags, Features);

        // Store the escape's value as one code unit of the output width.
        if (CharByteWidth == 4) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
          *ResultWidePtr = ResultChar;
          ResultPtr += 4;
        } else if (CharByteWidth == 2) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
          *ResultWidePtr = ResultChar & 0xFFFF;
          ResultPtr += 2;
        } else {
          assert(CharByteWidth == 1 && "Unexpected char width");
          *ResultPtr++ = ResultChar & 0xFF;
        }
      }
    }
  }

  if (Pascal) {
    // Write GetNumStringChars() - 1 into the first code unit of the result
    // (the slot reserved by the leading \p), at the output character width.
    if (CharByteWidth == 4) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else if (CharByteWidth == 2) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else {
      assert(CharByteWidth == 1 && "Unexpected char width");
      ResultBuf[0] = GetNumStringChars() - 1;
    }

    // Verify that pascal strings aren't too large.
    if (GetStringLength() > 256) {
      if (Diags)
        Diags->Report(StringToks.front().getLocation(),
                      diag::err_pascal_string_too_long)
          << SourceRange(StringToks.front().getLocation(),
                         StringToks.back().getLocation());
      hadError = true;
      return;
    }
  } else if (Diags) {
    // Complain if this string literal has too many characters.
    // The limit depends on the language mode: C++, C99, or neither.
    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

    if (GetNumStringChars() > MaxChars)
      Diags->Report(StringToks.front().getLocation(),
                    diag::ext_string_too_long)
        << GetNumStringChars() << MaxChars
        << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
        << SourceRange(StringToks.front().getLocation(),
                       StringToks.back().getLocation());
  }
}

/// Given \p Err pointing at a byte inside a buffer that ends at \p End,
/// returns the position just past the sequence that starts at \p Err.
/// Returns \p End when \p Err is already at the end.  Otherwise the scan
/// always steps over *Err itself, then over following bytes whose top two
/// bits are 10, and stops no later than getNumBytesForUTF8(*Err) bytes past
/// \p Err (clamped to \p End).
static const char *resyncUTF8(const char *Err, const char *End) {
  if (Err == End)
    return End;
  End = Err + std::min<unsigned>(getNumBytesForUTF8(*Err), End-Err);
  while (++Err != End && (*Err & 0xC0) == 0x80)
    ;
  return Err;
}

/// \brief This function copies from Fragment, which is a sequence of bytes
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
/// Performs widening for multi-byte characters.
///
/// \returns false when the fragment converted cleanly.  When conversion
/// fails, the result is !isAscii(): for literals where isAscii() holds the
/// raw bytes are copied through and false is returned; for all other kinds
/// true is returned.  If Diags is set, the offending byte ranges are
/// reported.
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
                                             const char *TokBegin,
                                             StringRef Fragment) {
  const UTF8 *ErrorPtrTmp;
  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
    return false;

  // If we see bad encoding for unprefixed string literals, warn and
  // simply copy the byte values, for compatibility with gcc and older
  // versions of clang.
  bool NoErrorOnBadEncoding = isAscii();
  if (NoErrorOnBadEncoding) {
    memcpy(ResultPtr, Fragment.data(), Fragment.size());
    ResultPtr += Fragment.size();
  }

  if (Diags) {
    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);

    // Start one diagnostic covering the first bad sequence; later bad
    // sequences are attached to the same diagnostic as extra ranges.
    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
    const DiagnosticBuilder &Builder =
      Diag(Diags, Features, SourceLoc, TokBegin,
           ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
           NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
                                : diag::err_bad_string_encoding);

    const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
    StringRef NextFragment(NextStart, Fragment.end()-NextStart);

    // Decode into a dummy buffer.
    SmallString<512> Dummy;
    Dummy.reserve(Fragment.size() * CharByteWidth);
    char *Ptr = Dummy.data();

    // Keep converting the remainder; each failure yields one more range and
    // restarts after the bad sequence.
    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
      NextStart = resyncUTF8(ErrorPtr, Fragment.end());
      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
                                     ErrorPtr, NextStart);
      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
    }
  }
  return !NoErrorOnBadEncoding;
}

/// Marks the parse as failed and, when diagnostics are enabled (Diags is
/// non-null), reports err_lexing_string at \p Loc.
void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  hadError = true;
  if (Diags)
    Diags->Report(Loc, diag::err_lexing_string);
}

/// getOffsetOfStringByte - This function returns the offset of the
/// specified byte of the string data represented by Token.  This handles
/// advancing over escape sequences in the string.
///
/// The offset is measured from the start of the token's spelling.  Returns 0
/// if the spelling cannot be obtained.  Wide and UTF-16/32 literals are not
/// handled (asserted); a "u8" prefix is skipped and the literal is then
/// treated as a narrow string.
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
                                                    unsigned ByteNo) const {
  // Get the spelling of the token.
  SmallString<32> SpellingBuffer;
  SpellingBuffer.resize(Tok.getLength());

  bool StringInvalid = false;
  const char *SpellingPtr = &SpellingBuffer[0];
  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
                                       &StringInvalid);
  if (StringInvalid)
    return 0;

  const char *SpellingStart = SpellingPtr;
  const char *SpellingEnd = SpellingPtr+TokLen;

  // Handle UTF-8 strings just like narrow strings.
  if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
    SpellingPtr += 2;

  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");

  // For raw string literals, this is easy.
  if (SpellingPtr[0] == 'R') {
    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
    // Skip 'R"'.
    SpellingPtr += 2;
    while (*SpellingPtr != '(') {
      ++SpellingPtr;
      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
    }
    // Skip '('.
    ++SpellingPtr;
    // Raw strings have no escapes, so spelling bytes map one-to-one onto
    // string bytes from here on.
    return SpellingPtr - SpellingStart + ByteNo;
  }

  // Skip over the leading quote
  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
  ++SpellingPtr;

  // Skip over bytes until we find the offset we're looking for.
  while (ByteNo) {
    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");

    // Step over non-escapes simply.
    if (*SpellingPtr != '\\') {
      ++SpellingPtr;
      --ByteNo;
      continue;
    }

    // Otherwise, this is an escape character.  Advance over it.
    bool HadError = false;
    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
      // A UCN may account for several string bytes; Len is the count that
      // MeasureUCNEscape reports for it.
      const char *EscapePtr = SpellingPtr;
      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
                                      1, Features, HadError);
      if (Len > ByteNo) {
        // ByteNo is somewhere within the escape sequence.
        SpellingPtr = EscapePtr;
        break;
      }
      ByteNo -= Len;
    } else {
      // Any other escape is counted as exactly one string byte.
      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
                        FullSourceLoc(Tok.getLocation(), SM),
                        CharByteWidth*8, Diags, Features);
      --ByteNo;
    }
    assert(!HadError && "This method isn't valid on erroneous strings");
  }

  return SpellingPtr-SpellingStart;
}