LLVM API Documentation
00001 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements a YAML parser. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 00014 #include "llvm/Support/YAMLParser.h" 00015 #include "llvm/ADT/SmallVector.h" 00016 #include "llvm/ADT/StringExtras.h" 00017 #include "llvm/ADT/Twine.h" 00018 #include "llvm/ADT/ilist.h" 00019 #include "llvm/ADT/ilist_node.h" 00020 #include "llvm/Support/ErrorHandling.h" 00021 #include "llvm/Support/MemoryBuffer.h" 00022 #include "llvm/Support/SourceMgr.h" 00023 #include "llvm/Support/raw_ostream.h" 00024 00025 using namespace llvm; 00026 using namespace yaml; 00027 00028 enum UnicodeEncodingForm { 00029 UEF_UTF32_LE, ///< UTF-32 Little Endian 00030 UEF_UTF32_BE, ///< UTF-32 Big Endian 00031 UEF_UTF16_LE, ///< UTF-16 Little Endian 00032 UEF_UTF16_BE, ///< UTF-16 Big Endian 00033 UEF_UTF8, ///< UTF-8 or ascii. 00034 UEF_Unknown ///< Not a valid Unicode encoding. 00035 }; 00036 00037 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 00038 /// it exists. Length is in {0, 2, 3, 4}. 00039 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 00040 00041 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 00042 /// encoding form of \a Input. 00043 /// 00044 /// @param Input A string of length 0 or more. 00045 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 00046 /// and how long the byte order mark is if one exists. 00047 static EncodingInfo getUnicodeEncoding(StringRef Input) { 00048 if (Input.size() == 0) 00049 return std::make_pair(UEF_Unknown, 0); 00050 00051 switch (uint8_t(Input[0])) { 00052 case 0x00: 00053 if (Input.size() >= 4) { 00054 if ( Input[1] == 0 00055 && uint8_t(Input[2]) == 0xFE 00056 && uint8_t(Input[3]) == 0xFF) 00057 return std::make_pair(UEF_UTF32_BE, 4); 00058 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 00059 return std::make_pair(UEF_UTF32_BE, 0); 00060 } 00061 00062 if (Input.size() >= 2 && Input[1] != 0) 00063 return std::make_pair(UEF_UTF16_BE, 0); 00064 return std::make_pair(UEF_Unknown, 0); 00065 case 0xFF: 00066 if ( Input.size() >= 4 00067 && uint8_t(Input[1]) == 0xFE 00068 && Input[2] == 0 00069 && Input[3] == 0) 00070 return std::make_pair(UEF_UTF32_LE, 4); 00071 00072 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 00073 return std::make_pair(UEF_UTF16_LE, 2); 00074 return std::make_pair(UEF_Unknown, 0); 00075 case 0xFE: 00076 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 00077 return std::make_pair(UEF_UTF16_BE, 2); 00078 return std::make_pair(UEF_Unknown, 0); 00079 case 0xEF: 00080 if ( Input.size() >= 3 00081 && uint8_t(Input[1]) == 0xBB 00082 && uint8_t(Input[2]) == 0xBF) 00083 return std::make_pair(UEF_UTF8, 3); 00084 return std::make_pair(UEF_Unknown, 0); 00085 } 00086 00087 // It could still be utf-32 or utf-16. 00088 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 00089 return std::make_pair(UEF_UTF32_LE, 0); 00090 00091 if (Input.size() >= 2 && Input[1] == 0) 00092 return std::make_pair(UEF_UTF16_LE, 0); 00093 00094 return std::make_pair(UEF_UTF8, 0); 00095 } 00096 00097 namespace llvm { 00098 namespace yaml { 00099 /// Pin the vtables to this file. 00100 void Node::anchor() {} 00101 void NullNode::anchor() {} 00102 void ScalarNode::anchor() {} 00103 void KeyValueNode::anchor() {} 00104 void MappingNode::anchor() {} 00105 void SequenceNode::anchor() {} 00106 void AliasNode::anchor() {} 00107 00108 /// Token - A single YAML token. 00109 struct Token : ilist_node<Token> { 00110 enum TokenKind { 00111 TK_Error, // Uninitialized token. 00112 TK_StreamStart, 00113 TK_StreamEnd, 00114 TK_VersionDirective, 00115 TK_TagDirective, 00116 TK_DocumentStart, 00117 TK_DocumentEnd, 00118 TK_BlockEntry, 00119 TK_BlockEnd, 00120 TK_BlockSequenceStart, 00121 TK_BlockMappingStart, 00122 TK_FlowEntry, 00123 TK_FlowSequenceStart, 00124 TK_FlowSequenceEnd, 00125 TK_FlowMappingStart, 00126 TK_FlowMappingEnd, 00127 TK_Key, 00128 TK_Value, 00129 TK_Scalar, 00130 TK_Alias, 00131 TK_Anchor, 00132 TK_Tag 00133 } Kind; 00134 00135 /// A string of length 0 or more whose begin() points to the logical location 00136 /// of the token in the input. 00137 StringRef Range; 00138 00139 Token() : Kind(TK_Error) {} 00140 }; 00141 } 00142 } 00143 00144 namespace llvm { 00145 template<> 00146 struct ilist_sentinel_traits<Token> { 00147 Token *createSentinel() const { 00148 return &Sentinel; 00149 } 00150 static void destroySentinel(Token*) {} 00151 00152 Token *provideInitialHead() const { return createSentinel(); } 00153 Token *ensureHead(Token*) const { return createSentinel(); } 00154 static void noteHead(Token*, Token*) {} 00155 00156 private: 00157 mutable Token Sentinel; 00158 }; 00159 00160 template<> 00161 struct ilist_node_traits<Token> { 00162 Token *createNode(const Token &V) { 00163 return new (Alloc.Allocate<Token>()) Token(V); 00164 } 00165 static void deleteNode(Token *V) {} 00166 00167 void addNodeToList(Token *) {} 00168 void removeNodeFromList(Token *) {} 00169 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 00170 ilist_iterator<Token> /*first*/, 00171 ilist_iterator<Token> /*last*/) {} 00172 00173 BumpPtrAllocator Alloc; 00174 }; 00175 } 00176 00177 typedef ilist<Token> TokenQueueT; 00178 00179 namespace { 00180 /// @brief This struct is used to track simple keys. 00181 /// 00182 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 00183 /// which could legally be the start of a simple key. When peekNext is called, 00184 /// if the Token To be returned is referenced by a SimpleKey, we continue 00185 /// tokenizing until that potential simple key has either been found to not be 00186 /// a simple key (we moved on to the next line or went further than 1024 chars). 00187 /// Or when we run into a Value, and then insert a Key token (and possibly 00188 /// others) before the SimpleKey's Tok. 00189 struct SimpleKey { 00190 TokenQueueT::iterator Tok; 00191 unsigned Column; 00192 unsigned Line; 00193 unsigned FlowLevel; 00194 bool IsRequired; 00195 00196 bool operator ==(const SimpleKey &Other) { 00197 return Tok == Other.Tok; 00198 } 00199 }; 00200 } 00201 00202 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 00203 /// subsequence and the subsequence's length in code units (uint8_t). 00204 /// A length of 0 represents an error. 00205 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 00206 00207 static UTF8Decoded decodeUTF8(StringRef Range) { 00208 StringRef::iterator Position= Range.begin(); 00209 StringRef::iterator End = Range.end(); 00210 // 1 byte: [0x00, 0x7f] 00211 // Bit pattern: 0xxxxxxx 00212 if ((*Position & 0x80) == 0) { 00213 return std::make_pair(*Position, 1); 00214 } 00215 // 2 bytes: [0x80, 0x7ff] 00216 // Bit pattern: 110xxxxx 10xxxxxx 00217 if (Position + 1 != End && 00218 ((*Position & 0xE0) == 0xC0) && 00219 ((*(Position + 1) & 0xC0) == 0x80)) { 00220 uint32_t codepoint = ((*Position & 0x1F) << 6) | 00221 (*(Position + 1) & 0x3F); 00222 if (codepoint >= 0x80) 00223 return std::make_pair(codepoint, 2); 00224 } 00225 // 3 bytes: [0x8000, 0xffff] 00226 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 00227 if (Position + 2 != End && 00228 ((*Position & 0xF0) == 0xE0) && 00229 ((*(Position + 1) & 0xC0) == 0x80) && 00230 ((*(Position + 2) & 0xC0) == 0x80)) { 00231 uint32_t codepoint = ((*Position & 0x0F) << 12) | 00232 ((*(Position + 1) & 0x3F) << 6) | 00233 (*(Position + 2) & 0x3F); 00234 // Codepoints between 0xD800 and 0xDFFF are invalid, as 00235 // they are high / low surrogate halves used by UTF-16. 00236 if (codepoint >= 0x800 && 00237 (codepoint < 0xD800 || codepoint > 0xDFFF)) 00238 return std::make_pair(codepoint, 3); 00239 } 00240 // 4 bytes: [0x10000, 0x10FFFF] 00241 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00242 if (Position + 3 != End && 00243 ((*Position & 0xF8) == 0xF0) && 00244 ((*(Position + 1) & 0xC0) == 0x80) && 00245 ((*(Position + 2) & 0xC0) == 0x80) && 00246 ((*(Position + 3) & 0xC0) == 0x80)) { 00247 uint32_t codepoint = ((*Position & 0x07) << 18) | 00248 ((*(Position + 1) & 0x3F) << 12) | 00249 ((*(Position + 2) & 0x3F) << 6) | 00250 (*(Position + 3) & 0x3F); 00251 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 00252 return std::make_pair(codepoint, 4); 00253 } 00254 return std::make_pair(0, 0); 00255 } 00256 00257 namespace llvm { 00258 namespace yaml { 00259 /// @brief Scans YAML tokens from a MemoryBuffer. 00260 class Scanner { 00261 public: 00262 Scanner(StringRef Input, SourceMgr &SM); 00263 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_); 00264 00265 /// @brief Parse the next token and return it without popping it. 00266 Token &peekNext(); 00267 00268 /// @brief Parse the next token and pop it from the queue. 00269 Token getNext(); 00270 00271 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 00272 ArrayRef<SMRange> Ranges = None) { 00273 SM.PrintMessage(Loc, Kind, Message, Ranges); 00274 } 00275 00276 void setError(const Twine &Message, StringRef::iterator Position) { 00277 if (Current >= End) 00278 Current = End - 1; 00279 00280 // Don't print out more errors after the first one we encounter. The rest 00281 // are just the result of the first, and have no meaning. 00282 if (!Failed) 00283 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 00284 Failed = true; 00285 } 00286 00287 void setError(const Twine &Message) { 00288 setError(Message, Current); 00289 } 00290 00291 /// @brief Returns true if an error occurred while parsing. 00292 bool failed() { 00293 return Failed; 00294 } 00295 00296 private: 00297 void init(MemoryBufferRef Buffer); 00298 00299 StringRef currentInput() { 00300 return StringRef(Current, End - Current); 00301 } 00302 00303 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 00304 /// at \a Position. 00305 /// 00306 /// If the UTF-8 code units starting at Position do not form a well-formed 00307 /// code unit subsequence, then the Unicode scalar value is 0, and the length 00308 /// is 0. 00309 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 00310 return ::decodeUTF8(StringRef(Position, End - Position)); 00311 } 00312 00313 // The following functions are based on the gramar rules in the YAML spec. The 00314 // style of the function names it meant to closely match how they are written 00315 // in the spec. The number within the [] is the number of the grammar rule in 00316 // the spec. 00317 // 00318 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 00319 // 00320 // c- 00321 // A production starting and ending with a special character. 00322 // b- 00323 // A production matching a single line break. 00324 // nb- 00325 // A production starting and ending with a non-break character. 00326 // s- 00327 // A production starting and ending with a white space character. 00328 // ns- 00329 // A production starting and ending with a non-space character. 00330 // l- 00331 // A production matching complete line(s). 00332 00333 /// @brief Skip a single nb-char[27] starting at Position. 00334 /// 00335 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 00336 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 00337 /// 00338 /// @returns The code unit after the nb-char, or Position if it's not an 00339 /// nb-char. 00340 StringRef::iterator skip_nb_char(StringRef::iterator Position); 00341 00342 /// @brief Skip a single b-break[28] starting at Position. 00343 /// 00344 /// A b-break is 0xD 0xA | 0xD | 0xA 00345 /// 00346 /// @returns The code unit after the b-break, or Position if it's not a 00347 /// b-break. 00348 StringRef::iterator skip_b_break(StringRef::iterator Position); 00349 00350 /// @brief Skip a single s-white[33] starting at Position. 00351 /// 00352 /// A s-white is 0x20 | 0x9 00353 /// 00354 /// @returns The code unit after the s-white, or Position if it's not a 00355 /// s-white. 00356 StringRef::iterator skip_s_white(StringRef::iterator Position); 00357 00358 /// @brief Skip a single ns-char[34] starting at Position. 00359 /// 00360 /// A ns-char is nb-char - s-white 00361 /// 00362 /// @returns The code unit after the ns-char, or Position if it's not a 00363 /// ns-char. 00364 StringRef::iterator skip_ns_char(StringRef::iterator Position); 00365 00366 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 00367 /// @brief Skip minimal well-formed code unit subsequences until Func 00368 /// returns its input. 00369 /// 00370 /// @returns The code unit after the last minimal well-formed code unit 00371 /// subsequence that Func accepted. 00372 StringRef::iterator skip_while( SkipWhileFunc Func 00373 , StringRef::iterator Position); 00374 00375 /// @brief Scan ns-uri-char[39]s starting at Cur. 00376 /// 00377 /// This updates Cur and Column while scanning. 00378 /// 00379 /// @returns A StringRef starting at Cur which covers the longest contiguous 00380 /// sequence of ns-uri-char. 00381 StringRef scan_ns_uri_char(); 00382 00383 /// @brief Consume a minimal well-formed code unit subsequence starting at 00384 /// \a Cur. Return false if it is not the same Unicode scalar value as 00385 /// \a Expected. This updates \a Column. 00386 bool consume(uint32_t Expected); 00387 00388 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 00389 void skip(uint32_t Distance); 00390 00391 /// @brief Return true if the minimal well-formed code unit subsequence at 00392 /// Pos is whitespace or a new line 00393 bool isBlankOrBreak(StringRef::iterator Position); 00394 00395 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 00396 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 00397 , unsigned AtColumn 00398 , bool IsRequired); 00399 00400 /// @brief Remove simple keys that can no longer be valid simple keys. 00401 /// 00402 /// Invalid simple keys are not on the current line or are further than 1024 00403 /// columns back. 00404 void removeStaleSimpleKeyCandidates(); 00405 00406 /// @brief Remove all simple keys on FlowLevel \a Level. 00407 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 00408 00409 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 00410 /// tokens if needed. 00411 bool unrollIndent(int ToColumn); 00412 00413 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 00414 /// if needed. 00415 bool rollIndent( int ToColumn 00416 , Token::TokenKind Kind 00417 , TokenQueueT::iterator InsertPoint); 00418 00419 /// @brief Skip whitespace and comments until the start of the next token. 00420 void scanToNextToken(); 00421 00422 /// @brief Must be the first token generated. 00423 bool scanStreamStart(); 00424 00425 /// @brief Generate tokens needed to close out the stream. 00426 bool scanStreamEnd(); 00427 00428 /// @brief Scan a %BLAH directive. 00429 bool scanDirective(); 00430 00431 /// @brief Scan a ... or ---. 00432 bool scanDocumentIndicator(bool IsStart); 00433 00434 /// @brief Scan a [ or { and generate the proper flow collection start token. 00435 bool scanFlowCollectionStart(bool IsSequence); 00436 00437 /// @brief Scan a ] or } and generate the proper flow collection end token. 00438 bool scanFlowCollectionEnd(bool IsSequence); 00439 00440 /// @brief Scan the , that separates entries in a flow collection. 00441 bool scanFlowEntry(); 00442 00443 /// @brief Scan the - that starts block sequence entries. 00444 bool scanBlockEntry(); 00445 00446 /// @brief Scan an explicit ? indicating a key. 00447 bool scanKey(); 00448 00449 /// @brief Scan an explicit : indicating a value. 00450 bool scanValue(); 00451 00452 /// @brief Scan a quoted scalar. 00453 bool scanFlowScalar(bool IsDoubleQuoted); 00454 00455 /// @brief Scan an unquoted scalar. 00456 bool scanPlainScalar(); 00457 00458 /// @brief Scan an Alias or Anchor starting with * or &. 00459 bool scanAliasOrAnchor(bool IsAlias); 00460 00461 /// @brief Scan a block scalar starting with | or >. 00462 bool scanBlockScalar(bool IsLiteral); 00463 00464 /// @brief Scan a tag of the form !stuff. 00465 bool scanTag(); 00466 00467 /// @brief Dispatch to the next scanning function based on \a *Cur. 00468 bool fetchMoreTokens(); 00469 00470 /// @brief The SourceMgr used for diagnostics and buffer management. 00471 SourceMgr &SM; 00472 00473 /// @brief The original input. 00474 MemoryBufferRef InputBuffer; 00475 00476 /// @brief The current position of the scanner. 00477 StringRef::iterator Current; 00478 00479 /// @brief The end of the input (one past the last character). 00480 StringRef::iterator End; 00481 00482 /// @brief Current YAML indentation level in spaces. 00483 int Indent; 00484 00485 /// @brief Current column number in Unicode code points. 00486 unsigned Column; 00487 00488 /// @brief Current line number. 00489 unsigned Line; 00490 00491 /// @brief How deep we are in flow style containers. 0 Means at block level. 00492 unsigned FlowLevel; 00493 00494 /// @brief Are we at the start of the stream? 00495 bool IsStartOfStream; 00496 00497 /// @brief Can the next token be the start of a simple key? 00498 bool IsSimpleKeyAllowed; 00499 00500 /// @brief True if an error has occurred. 00501 bool Failed; 00502 00503 /// @brief Queue of tokens. This is required to queue up tokens while looking 00504 /// for the end of a simple key. And for cases where a single character 00505 /// can produce multiple tokens (e.g. BlockEnd). 00506 TokenQueueT TokenQueue; 00507 00508 /// @brief Indentation levels. 00509 SmallVector<int, 4> Indents; 00510 00511 /// @brief Potential simple keys. 00512 SmallVector<SimpleKey, 4> SimpleKeys; 00513 }; 00514 00515 } // end namespace yaml 00516 } // end namespace llvm 00517 00518 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 00519 static void encodeUTF8( uint32_t UnicodeScalarValue 00520 , SmallVectorImpl<char> &Result) { 00521 if (UnicodeScalarValue <= 0x7F) { 00522 Result.push_back(UnicodeScalarValue & 0x7F); 00523 } else if (UnicodeScalarValue <= 0x7FF) { 00524 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 00525 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 00526 Result.push_back(FirstByte); 00527 Result.push_back(SecondByte); 00528 } else if (UnicodeScalarValue <= 0xFFFF) { 00529 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 00530 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 00531 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 00532 Result.push_back(FirstByte); 00533 Result.push_back(SecondByte); 00534 Result.push_back(ThirdByte); 00535 } else if (UnicodeScalarValue <= 0x10FFFF) { 00536 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 00537 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 00538 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 00539 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 00540 Result.push_back(FirstByte); 00541 Result.push_back(SecondByte); 00542 Result.push_back(ThirdByte); 00543 Result.push_back(FourthByte); 00544 } 00545 } 00546 00547 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 00548 SourceMgr SM; 00549 Scanner scanner(Input, SM); 00550 while (true) { 00551 Token T = scanner.getNext(); 00552 switch (T.Kind) { 00553 case Token::TK_StreamStart: 00554 OS << "Stream-Start: "; 00555 break; 00556 case Token::TK_StreamEnd: 00557 OS << "Stream-End: "; 00558 break; 00559 case Token::TK_VersionDirective: 00560 OS << "Version-Directive: "; 00561 break; 00562 case Token::TK_TagDirective: 00563 OS << "Tag-Directive: "; 00564 break; 00565 case Token::TK_DocumentStart: 00566 OS << "Document-Start: "; 00567 break; 00568 case Token::TK_DocumentEnd: 00569 OS << "Document-End: "; 00570 break; 00571 case Token::TK_BlockEntry: 00572 OS << "Block-Entry: "; 00573 break; 00574 case Token::TK_BlockEnd: 00575 OS << "Block-End: "; 00576 break; 00577 case Token::TK_BlockSequenceStart: 00578 OS << "Block-Sequence-Start: "; 00579 break; 00580 case Token::TK_BlockMappingStart: 00581 OS << "Block-Mapping-Start: "; 00582 break; 00583 case Token::TK_FlowEntry: 00584 OS << "Flow-Entry: "; 00585 break; 00586 case Token::TK_FlowSequenceStart: 00587 OS << "Flow-Sequence-Start: "; 00588 break; 00589 case Token::TK_FlowSequenceEnd: 00590 OS << "Flow-Sequence-End: "; 00591 break; 00592 case Token::TK_FlowMappingStart: 00593 OS << "Flow-Mapping-Start: "; 00594 break; 00595 case Token::TK_FlowMappingEnd: 00596 OS << "Flow-Mapping-End: "; 00597 break; 00598 case Token::TK_Key: 00599 OS << "Key: "; 00600 break; 00601 case Token::TK_Value: 00602 OS << "Value: "; 00603 break; 00604 case Token::TK_Scalar: 00605 OS << "Scalar: "; 00606 break; 00607 case Token::TK_Alias: 00608 OS << "Alias: "; 00609 break; 00610 case Token::TK_Anchor: 00611 OS << "Anchor: "; 00612 break; 00613 case Token::TK_Tag: 00614 OS << "Tag: "; 00615 break; 00616 case Token::TK_Error: 00617 break; 00618 } 00619 OS << T.Range << "\n"; 00620 if (T.Kind == Token::TK_StreamEnd) 00621 break; 00622 else if (T.Kind == Token::TK_Error) 00623 return false; 00624 } 00625 return true; 00626 } 00627 00628 bool yaml::scanTokens(StringRef Input) { 00629 llvm::SourceMgr SM; 00630 llvm::yaml::Scanner scanner(Input, SM); 00631 for (;;) { 00632 llvm::yaml::Token T = scanner.getNext(); 00633 if (T.Kind == Token::TK_StreamEnd) 00634 break; 00635 else if (T.Kind == Token::TK_Error) 00636 return false; 00637 } 00638 return true; 00639 } 00640 00641 std::string yaml::escape(StringRef Input) { 00642 std::string EscapedInput; 00643 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 00644 if (*i == '\\') 00645 EscapedInput += "\\\\"; 00646 else if (*i == '"') 00647 EscapedInput += "\\\""; 00648 else if (*i == 0) 00649 EscapedInput += "\\0"; 00650 else if (*i == 0x07) 00651 EscapedInput += "\\a"; 00652 else if (*i == 0x08) 00653 EscapedInput += "\\b"; 00654 else if (*i == 0x09) 00655 EscapedInput += "\\t"; 00656 else if (*i == 0x0A) 00657 EscapedInput += "\\n"; 00658 else if (*i == 0x0B) 00659 EscapedInput += "\\v"; 00660 else if (*i == 0x0C) 00661 EscapedInput += "\\f"; 00662 else if (*i == 0x0D) 00663 EscapedInput += "\\r"; 00664 else if (*i == 0x1B) 00665 EscapedInput += "\\e"; 00666 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 00667 std::string HexStr = utohexstr(*i); 00668 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 00669 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 00670 UTF8Decoded UnicodeScalarValue 00671 = decodeUTF8(StringRef(i, Input.end() - i)); 00672 if (UnicodeScalarValue.second == 0) { 00673 // Found invalid char. 00674 SmallString<4> Val; 00675 encodeUTF8(0xFFFD, Val); 00676 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 00677 // FIXME: Error reporting. 00678 return EscapedInput; 00679 } 00680 if (UnicodeScalarValue.first == 0x85) 00681 EscapedInput += "\\N"; 00682 else if (UnicodeScalarValue.first == 0xA0) 00683 EscapedInput += "\\_"; 00684 else if (UnicodeScalarValue.first == 0x2028) 00685 EscapedInput += "\\L"; 00686 else if (UnicodeScalarValue.first == 0x2029) 00687 EscapedInput += "\\P"; 00688 else { 00689 std::string HexStr = utohexstr(UnicodeScalarValue.first); 00690 if (HexStr.size() <= 2) 00691 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 00692 else if (HexStr.size() <= 4) 00693 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 00694 else if (HexStr.size() <= 8) 00695 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 00696 } 00697 i += UnicodeScalarValue.second - 1; 00698 } else 00699 EscapedInput.push_back(*i); 00700 } 00701 return EscapedInput; 00702 } 00703 00704 Scanner::Scanner(StringRef Input, SourceMgr &sm) : SM(sm) { 00705 init(MemoryBufferRef(Input, "YAML")); 00706 } 00707 00708 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_) : SM(SM_) { 00709 init(Buffer); 00710 } 00711 00712 void Scanner::init(MemoryBufferRef Buffer) { 00713 InputBuffer = Buffer; 00714 Current = InputBuffer.getBufferStart(); 00715 End = InputBuffer.getBufferEnd(); 00716 Indent = -1; 00717 Column = 0; 00718 Line = 0; 00719 FlowLevel = 0; 00720 IsStartOfStream = true; 00721 IsSimpleKeyAllowed = true; 00722 Failed = false; 00723 std::unique_ptr<MemoryBuffer> InputBufferOwner = 00724 MemoryBuffer::getMemBuffer(Buffer); 00725 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 00726 } 00727 00728 Token &Scanner::peekNext() { 00729 // If the current token is a possible simple key, keep parsing until we 00730 // can confirm. 00731 bool NeedMore = false; 00732 while (true) { 00733 if (TokenQueue.empty() || NeedMore) { 00734 if (!fetchMoreTokens()) { 00735 TokenQueue.clear(); 00736 TokenQueue.push_back(Token()); 00737 return TokenQueue.front(); 00738 } 00739 } 00740 assert(!TokenQueue.empty() && 00741 "fetchMoreTokens lied about getting tokens!"); 00742 00743 removeStaleSimpleKeyCandidates(); 00744 SimpleKey SK; 00745 SK.Tok = TokenQueue.front(); 00746 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 00747 == SimpleKeys.end()) 00748 break; 00749 else 00750 NeedMore = true; 00751 } 00752 return TokenQueue.front(); 00753 } 00754 00755 Token Scanner::getNext() { 00756 Token Ret = peekNext(); 00757 // TokenQueue can be empty if there was an error getting the next token. 00758 if (!TokenQueue.empty()) 00759 TokenQueue.pop_front(); 00760 00761 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 00762 // quick deallocation of them all. 00763 if (TokenQueue.empty()) { 00764 TokenQueue.Alloc.Reset(); 00765 } 00766 00767 return Ret; 00768 } 00769 00770 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 00771 if (Position == End) 00772 return Position; 00773 // Check 7 bit c-printable - b-char. 00774 if ( *Position == 0x09 00775 || (*Position >= 0x20 && *Position <= 0x7E)) 00776 return Position + 1; 00777 00778 // Check for valid UTF-8. 00779 if (uint8_t(*Position) & 0x80) { 00780 UTF8Decoded u8d = decodeUTF8(Position); 00781 if ( u8d.second != 0 00782 && u8d.first != 0xFEFF 00783 && ( u8d.first == 0x85 00784 || ( u8d.first >= 0xA0 00785 && u8d.first <= 0xD7FF) 00786 || ( u8d.first >= 0xE000 00787 && u8d.first <= 0xFFFD) 00788 || ( u8d.first >= 0x10000 00789 && u8d.first <= 0x10FFFF))) 00790 return Position + u8d.second; 00791 } 00792 return Position; 00793 } 00794 00795 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 00796 if (Position == End) 00797 return Position; 00798 if (*Position == 0x0D) { 00799 if (Position + 1 != End && *(Position + 1) == 0x0A) 00800 return Position + 2; 00801 return Position + 1; 00802 } 00803 00804 if (*Position == 0x0A) 00805 return Position + 1; 00806 return Position; 00807 } 00808 00809 00810 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 00811 if (Position == End) 00812 return Position; 00813 if (*Position == ' ' || *Position == '\t') 00814 return Position + 1; 00815 return Position; 00816 } 00817 00818 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 00819 if (Position == End) 00820 return Position; 00821 if (*Position == ' ' || *Position == '\t') 00822 return Position; 00823 return skip_nb_char(Position); 00824 } 00825 00826 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 00827 , StringRef::iterator Position) { 00828 while (true) { 00829 StringRef::iterator i = (this->*Func)(Position); 00830 if (i == Position) 00831 break; 00832 Position = i; 00833 } 00834 return Position; 00835 } 00836 00837 static bool is_ns_hex_digit(const char C) { 00838 return (C >= '0' && C <= '9') 00839 || (C >= 'a' && C <= 'z') 00840 || (C >= 'A' && C <= 'Z'); 00841 } 00842 00843 static bool is_ns_word_char(const char C) { 00844 return C == '-' 00845 || (C >= 'a' && C <= 'z') 00846 || (C >= 'A' && C <= 'Z'); 00847 } 00848 00849 StringRef Scanner::scan_ns_uri_char() { 00850 StringRef::iterator Start = Current; 00851 while (true) { 00852 if (Current == End) 00853 break; 00854 if (( *Current == '%' 00855 && Current + 2 < End 00856 && is_ns_hex_digit(*(Current + 1)) 00857 && is_ns_hex_digit(*(Current + 2))) 00858 || is_ns_word_char(*Current) 00859 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 00860 != StringRef::npos) { 00861 ++Current; 00862 ++Column; 00863 } else 00864 break; 00865 } 00866 return StringRef(Start, Current - Start); 00867 } 00868 00869 bool Scanner::consume(uint32_t Expected) { 00870 if (Expected >= 0x80) 00871 report_fatal_error("Not dealing with this yet"); 00872 if (Current == End) 00873 return false; 00874 if (uint8_t(*Current) >= 0x80) 00875 report_fatal_error("Not dealing with this yet"); 00876 if (uint8_t(*Current) == Expected) { 00877 ++Current; 00878 ++Column; 00879 return true; 00880 } 00881 return false; 00882 } 00883 00884 void Scanner::skip(uint32_t Distance) { 00885 Current += Distance; 00886 Column += Distance; 00887 assert(Current <= End && "Skipped past the end"); 00888 } 00889 00890 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 00891 if (Position == End) 00892 return false; 00893 if ( *Position == ' ' || *Position == '\t' 00894 || *Position == '\r' || *Position == '\n') 00895 return true; 00896 return false; 00897 } 00898 00899 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 00900 , unsigned AtColumn 00901 , bool IsRequired) { 00902 if (IsSimpleKeyAllowed) { 00903 SimpleKey SK; 00904 SK.Tok = Tok; 00905 SK.Line = Line; 00906 SK.Column = AtColumn; 00907 SK.IsRequired = IsRequired; 00908 SK.FlowLevel = FlowLevel; 00909 SimpleKeys.push_back(SK); 00910 } 00911 } 00912 00913 void Scanner::removeStaleSimpleKeyCandidates() { 00914 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 00915 i != SimpleKeys.end();) { 00916 if (i->Line != Line || i->Column + 1024 < Column) { 00917 if (i->IsRequired) 00918 setError( "Could not find expected : for simple key" 00919 , i->Tok->Range.begin()); 00920 i = SimpleKeys.erase(i); 00921 } else 00922 ++i; 00923 } 00924 } 00925 00926 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 00927 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 00928 SimpleKeys.pop_back(); 00929 } 00930 00931 bool Scanner::unrollIndent(int ToColumn) { 00932 Token T; 00933 // Indentation is ignored in flow. 00934 if (FlowLevel != 0) 00935 return true; 00936 00937 while (Indent > ToColumn) { 00938 T.Kind = Token::TK_BlockEnd; 00939 T.Range = StringRef(Current, 1); 00940 TokenQueue.push_back(T); 00941 Indent = Indents.pop_back_val(); 00942 } 00943 00944 return true; 00945 } 00946 00947 bool Scanner::rollIndent( int ToColumn 00948 , Token::TokenKind Kind 00949 , TokenQueueT::iterator InsertPoint) { 00950 if (FlowLevel) 00951 return true; 00952 if (Indent < ToColumn) { 00953 Indents.push_back(Indent); 00954 Indent = ToColumn; 00955 00956 Token T; 00957 T.Kind = Kind; 00958 T.Range = StringRef(Current, 0); 00959 TokenQueue.insert(InsertPoint, T); 00960 } 00961 return true; 00962 } 00963 00964 void Scanner::scanToNextToken() { 00965 while (true) { 00966 while (*Current == ' ' || *Current == '\t') { 00967 skip(1); 00968 } 00969 00970 // Skip comment. 00971 if (*Current == '#') { 00972 while (true) { 00973 // This may skip more than one byte, thus Column is only incremented 00974 // for code points. 00975 StringRef::iterator i = skip_nb_char(Current); 00976 if (i == Current) 00977 break; 00978 Current = i; 00979 ++Column; 00980 } 00981 } 00982 00983 // Skip EOL. 00984 StringRef::iterator i = skip_b_break(Current); 00985 if (i == Current) 00986 break; 00987 Current = i; 00988 ++Line; 00989 Column = 0; 00990 // New lines may start a simple key. 00991 if (!FlowLevel) 00992 IsSimpleKeyAllowed = true; 00993 } 00994 } 00995 00996 bool Scanner::scanStreamStart() { 00997 IsStartOfStream = false; 00998 00999 EncodingInfo EI = getUnicodeEncoding(currentInput()); 01000 01001 Token T; 01002 T.Kind = Token::TK_StreamStart; 01003 T.Range = StringRef(Current, EI.second); 01004 TokenQueue.push_back(T); 01005 Current += EI.second; 01006 return true; 01007 } 01008 01009 bool Scanner::scanStreamEnd() { 01010 // Force an ending new line if one isn't present. 01011 if (Column != 0) { 01012 Column = 0; 01013 ++Line; 01014 } 01015 01016 unrollIndent(-1); 01017 SimpleKeys.clear(); 01018 IsSimpleKeyAllowed = false; 01019 01020 Token T; 01021 T.Kind = Token::TK_StreamEnd; 01022 T.Range = StringRef(Current, 0); 01023 TokenQueue.push_back(T); 01024 return true; 01025 } 01026 01027 bool Scanner::scanDirective() { 01028 // Reset the indentation level. 01029 unrollIndent(-1); 01030 SimpleKeys.clear(); 01031 IsSimpleKeyAllowed = false; 01032 01033 StringRef::iterator Start = Current; 01034 consume('%'); 01035 StringRef::iterator NameStart = Current; 01036 Current = skip_while(&Scanner::skip_ns_char, Current); 01037 StringRef Name(NameStart, Current - NameStart); 01038 Current = skip_while(&Scanner::skip_s_white, Current); 01039 01040 Token T; 01041 if (Name == "YAML") { 01042 Current = skip_while(&Scanner::skip_ns_char, Current); 01043 T.Kind = Token::TK_VersionDirective; 01044 T.Range = StringRef(Start, Current - Start); 01045 TokenQueue.push_back(T); 01046 return true; 01047 } else if(Name == "TAG") { 01048 Current = skip_while(&Scanner::skip_ns_char, Current); 01049 Current = skip_while(&Scanner::skip_s_white, Current); 01050 Current = skip_while(&Scanner::skip_ns_char, Current); 01051 T.Kind = Token::TK_TagDirective; 01052 T.Range = StringRef(Start, Current - Start); 01053 TokenQueue.push_back(T); 01054 return true; 01055 } 01056 return false; 01057 } 01058 01059 bool Scanner::scanDocumentIndicator(bool IsStart) { 01060 unrollIndent(-1); 01061 SimpleKeys.clear(); 01062 IsSimpleKeyAllowed = false; 01063 01064 Token T; 01065 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 01066 T.Range = StringRef(Current, 3); 01067 skip(3); 01068 TokenQueue.push_back(T); 01069 return true; 01070 } 01071 01072 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 01073 Token T; 01074 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 01075 : Token::TK_FlowMappingStart; 01076 T.Range = StringRef(Current, 1); 01077 skip(1); 01078 TokenQueue.push_back(T); 01079 01080 // [ and { may begin a simple key. 01081 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 01082 01083 // And may also be followed by a simple key. 01084 IsSimpleKeyAllowed = true; 01085 ++FlowLevel; 01086 return true; 01087 } 01088 01089 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 01090 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 01091 IsSimpleKeyAllowed = false; 01092 Token T; 01093 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 01094 : Token::TK_FlowMappingEnd; 01095 T.Range = StringRef(Current, 1); 01096 skip(1); 01097 TokenQueue.push_back(T); 01098 if (FlowLevel) 01099 --FlowLevel; 01100 return true; 01101 } 01102 01103 bool Scanner::scanFlowEntry() { 01104 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 01105 IsSimpleKeyAllowed = true; 01106 Token T; 01107 T.Kind = Token::TK_FlowEntry; 01108 T.Range = StringRef(Current, 1); 01109 skip(1); 01110 TokenQueue.push_back(T); 01111 return true; 01112 } 01113 01114 bool Scanner::scanBlockEntry() { 01115 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 01116 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 01117 IsSimpleKeyAllowed = true; 01118 Token T; 01119 T.Kind = Token::TK_BlockEntry; 01120 T.Range = StringRef(Current, 1); 01121 skip(1); 01122 TokenQueue.push_back(T); 01123 return true; 01124 } 01125 01126 bool Scanner::scanKey() { 01127 if (!FlowLevel) 01128 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 01129 01130 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 01131 IsSimpleKeyAllowed = !FlowLevel; 01132 01133 Token T; 01134 T.Kind = Token::TK_Key; 01135 T.Range = StringRef(Current, 1); 01136 skip(1); 01137 TokenQueue.push_back(T); 01138 return true; 01139 } 01140 01141 bool Scanner::scanValue() { 01142 // If the previous token could have been a simple key, insert the key token 01143 // into the token queue. 01144 if (!SimpleKeys.empty()) { 01145 SimpleKey SK = SimpleKeys.pop_back_val(); 01146 Token T; 01147 T.Kind = Token::TK_Key; 01148 T.Range = SK.Tok->Range; 01149 TokenQueueT::iterator i, e; 01150 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 01151 if (i == SK.Tok) 01152 break; 01153 } 01154 assert(i != e && "SimpleKey not in token queue!"); 01155 i = TokenQueue.insert(i, T); 01156 01157 // We may also need to add a Block-Mapping-Start token. 01158 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 01159 01160 IsSimpleKeyAllowed = false; 01161 } else { 01162 if (!FlowLevel) 01163 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 01164 IsSimpleKeyAllowed = !FlowLevel; 01165 } 01166 01167 Token T; 01168 T.Kind = Token::TK_Value; 01169 T.Range = StringRef(Current, 1); 01170 skip(1); 01171 TokenQueue.push_back(T); 01172 return true; 01173 } 01174 01175 // Forbidding inlining improves performance by roughly 20%. 01176 // FIXME: Remove once llvm optimizes this to the faster version without hints. 01177 LLVM_ATTRIBUTE_NOINLINE static bool 01178 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 01179 01180 // Returns whether a character at 'Position' was escaped with a leading '\'. 01181 // 'First' specifies the position of the first character in the string. 01182 static bool wasEscaped(StringRef::iterator First, 01183 StringRef::iterator Position) { 01184 assert(Position - 1 >= First); 01185 StringRef::iterator I = Position - 1; 01186 // We calculate the number of consecutive '\'s before the current position 01187 // by iterating backwards through our string. 01188 while (I >= First && *I == '\\') --I; 01189 // (Position - 1 - I) now contains the number of '\'s before the current 01190 // position. If it is odd, the character at 'Position' was escaped. 01191 return (Position - 1 - I) % 2 == 1; 01192 } 01193 01194 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 01195 StringRef::iterator Start = Current; 01196 unsigned ColStart = Column; 01197 if (IsDoubleQuoted) { 01198 do { 01199 ++Current; 01200 while (Current != End && *Current != '"') 01201 ++Current; 01202 // Repeat until the previous character was not a '\' or was an escaped 01203 // backslash. 01204 } while ( Current != End 01205 && *(Current - 1) == '\\' 01206 && wasEscaped(Start + 1, Current)); 01207 } else { 01208 skip(1); 01209 while (true) { 01210 // Skip a ' followed by another '. 01211 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 01212 skip(2); 01213 continue; 01214 } else if (*Current == '\'') 01215 break; 01216 StringRef::iterator i = skip_nb_char(Current); 01217 if (i == Current) { 01218 i = skip_b_break(Current); 01219 if (i == Current) 01220 break; 01221 Current = i; 01222 Column = 0; 01223 ++Line; 01224 } else { 01225 if (i == End) 01226 break; 01227 Current = i; 01228 ++Column; 01229 } 01230 } 01231 } 01232 01233 if (Current == End) { 01234 setError("Expected quote at end of scalar", Current); 01235 return false; 01236 } 01237 01238 skip(1); // Skip ending quote. 01239 Token T; 01240 T.Kind = Token::TK_Scalar; 01241 T.Range = StringRef(Start, Current - Start); 01242 TokenQueue.push_back(T); 01243 01244 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 01245 01246 IsSimpleKeyAllowed = false; 01247 01248 return true; 01249 } 01250 01251 bool Scanner::scanPlainScalar() { 01252 StringRef::iterator Start = Current; 01253 unsigned ColStart = Column; 01254 unsigned LeadingBlanks = 0; 01255 assert(Indent >= -1 && "Indent must be >= -1 !"); 01256 unsigned indent = static_cast<unsigned>(Indent + 1); 01257 while (true) { 01258 if (*Current == '#') 01259 break; 01260 01261 while (!isBlankOrBreak(Current)) { 01262 if ( FlowLevel && *Current == ':' 01263 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 01264 setError("Found unexpected ':' while scanning a plain scalar", Current); 01265 return false; 01266 } 01267 01268 // Check for the end of the plain scalar. 01269 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 01270 || ( FlowLevel 01271 && (StringRef(Current, 1).find_first_of(",:?[]{}") 01272 != StringRef::npos))) 01273 break; 01274 01275 StringRef::iterator i = skip_nb_char(Current); 01276 if (i == Current) 01277 break; 01278 Current = i; 01279 ++Column; 01280 } 01281 01282 // Are we at the end? 01283 if (!isBlankOrBreak(Current)) 01284 break; 01285 01286 // Eat blanks. 01287 StringRef::iterator Tmp = Current; 01288 while (isBlankOrBreak(Tmp)) { 01289 StringRef::iterator i = skip_s_white(Tmp); 01290 if (i != Tmp) { 01291 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 01292 setError("Found invalid tab character in indentation", Tmp); 01293 return false; 01294 } 01295 Tmp = i; 01296 ++Column; 01297 } else { 01298 i = skip_b_break(Tmp); 01299 if (!LeadingBlanks) 01300 LeadingBlanks = 1; 01301 Tmp = i; 01302 Column = 0; 01303 ++Line; 01304 } 01305 } 01306 01307 if (!FlowLevel && Column < indent) 01308 break; 01309 01310 Current = Tmp; 01311 } 01312 if (Start == Current) { 01313 setError("Got empty plain scalar", Start); 01314 return false; 01315 } 01316 Token T; 01317 T.Kind = Token::TK_Scalar; 01318 T.Range = StringRef(Start, Current - Start); 01319 TokenQueue.push_back(T); 01320 01321 // Plain scalars can be simple keys. 01322 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 01323 01324 IsSimpleKeyAllowed = false; 01325 01326 return true; 01327 } 01328 01329 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 01330 StringRef::iterator Start = Current; 01331 unsigned ColStart = Column; 01332 skip(1); 01333 while(true) { 01334 if ( *Current == '[' || *Current == ']' 01335 || *Current == '{' || *Current == '}' 01336 || *Current == ',' 01337 || *Current == ':') 01338 break; 01339 StringRef::iterator i = skip_ns_char(Current); 01340 if (i == Current) 01341 break; 01342 Current = i; 01343 ++Column; 01344 } 01345 01346 if (Start == Current) { 01347 setError("Got empty alias or anchor", Start); 01348 return false; 01349 } 01350 01351 Token T; 01352 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 01353 T.Range = StringRef(Start, Current - Start); 01354 TokenQueue.push_back(T); 01355 01356 // Alias and anchors can be simple keys. 01357 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 01358 01359 IsSimpleKeyAllowed = false; 01360 01361 return true; 01362 } 01363 01364 bool Scanner::scanBlockScalar(bool IsLiteral) { 01365 StringRef::iterator Start = Current; 01366 skip(1); // Eat | or > 01367 while(true) { 01368 StringRef::iterator i = skip_nb_char(Current); 01369 if (i == Current) { 01370 if (Column == 0) 01371 break; 01372 i = skip_b_break(Current); 01373 if (i != Current) { 01374 // We got a line break. 01375 Column = 0; 01376 ++Line; 01377 Current = i; 01378 continue; 01379 } else { 01380 // There was an error, which should already have been printed out. 01381 return false; 01382 } 01383 } 01384 Current = i; 01385 ++Column; 01386 } 01387 01388 if (Start == Current) { 01389 setError("Got empty block scalar", Start); 01390 return false; 01391 } 01392 01393 Token T; 01394 T.Kind = Token::TK_Scalar; 01395 T.Range = StringRef(Start, Current - Start); 01396 TokenQueue.push_back(T); 01397 return true; 01398 } 01399 01400 bool Scanner::scanTag() { 01401 StringRef::iterator Start = Current; 01402 unsigned ColStart = Column; 01403 skip(1); // Eat !. 01404 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 01405 else if (*Current == '<') { 01406 skip(1); 01407 scan_ns_uri_char(); 01408 if (!consume('>')) 01409 return false; 01410 } else { 01411 // FIXME: Actually parse the c-ns-shorthand-tag rule. 01412 Current = skip_while(&Scanner::skip_ns_char, Current); 01413 } 01414 01415 Token T; 01416 T.Kind = Token::TK_Tag; 01417 T.Range = StringRef(Start, Current - Start); 01418 TokenQueue.push_back(T); 01419 01420 // Tags can be simple keys. 01421 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 01422 01423 IsSimpleKeyAllowed = false; 01424 01425 return true; 01426 } 01427 01428 bool Scanner::fetchMoreTokens() { 01429 if (IsStartOfStream) 01430 return scanStreamStart(); 01431 01432 scanToNextToken(); 01433 01434 if (Current == End) 01435 return scanStreamEnd(); 01436 01437 removeStaleSimpleKeyCandidates(); 01438 01439 unrollIndent(Column); 01440 01441 if (Column == 0 && *Current == '%') 01442 return scanDirective(); 01443 01444 if (Column == 0 && Current + 4 <= End 01445 && *Current == '-' 01446 && *(Current + 1) == '-' 01447 && *(Current + 2) == '-' 01448 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 01449 return scanDocumentIndicator(true); 01450 01451 if (Column == 0 && Current + 4 <= End 01452 && *Current == '.' 01453 && *(Current + 1) == '.' 01454 && *(Current + 2) == '.' 01455 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 01456 return scanDocumentIndicator(false); 01457 01458 if (*Current == '[') 01459 return scanFlowCollectionStart(true); 01460 01461 if (*Current == '{') 01462 return scanFlowCollectionStart(false); 01463 01464 if (*Current == ']') 01465 return scanFlowCollectionEnd(true); 01466 01467 if (*Current == '}') 01468 return scanFlowCollectionEnd(false); 01469 01470 if (*Current == ',') 01471 return scanFlowEntry(); 01472 01473 if (*Current == '-' && isBlankOrBreak(Current + 1)) 01474 return scanBlockEntry(); 01475 01476 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 01477 return scanKey(); 01478 01479 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 01480 return scanValue(); 01481 01482 if (*Current == '*') 01483 return scanAliasOrAnchor(true); 01484 01485 if (*Current == '&') 01486 return scanAliasOrAnchor(false); 01487 01488 if (*Current == '!') 01489 return scanTag(); 01490 01491 if (*Current == '|' && !FlowLevel) 01492 return scanBlockScalar(true); 01493 01494 if (*Current == '>' && !FlowLevel) 01495 return scanBlockScalar(false); 01496 01497 if (*Current == '\'') 01498 return scanFlowScalar(false); 01499 01500 if (*Current == '"') 01501 return scanFlowScalar(true); 01502 01503 // Get a plain scalar. 01504 StringRef FirstChar(Current, 1); 01505 if (!(isBlankOrBreak(Current) 01506 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 01507 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 01508 || (!FlowLevel && (*Current == '?' || *Current == ':') 01509 && isBlankOrBreak(Current + 1)) 01510 || (!FlowLevel && *Current == ':' 01511 && Current + 2 < End 01512 && *(Current + 1) == ':' 01513 && !isBlankOrBreak(Current + 2))) 01514 return scanPlainScalar(); 01515 01516 setError("Unrecognized character while tokenizing."); 01517 return false; 01518 } 01519 01520 Stream::Stream(StringRef Input, SourceMgr &SM) 01521 : scanner(new Scanner(Input, SM)), CurrentDoc() {} 01522 01523 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM) 01524 : scanner(new Scanner(InputBuffer, SM)), CurrentDoc() {} 01525 01526 Stream::~Stream() {} 01527 01528 bool Stream::failed() { return scanner->failed(); } 01529 01530 void Stream::printError(Node *N, const Twine &Msg) { 01531 SmallVector<SMRange, 1> Ranges; 01532 Ranges.push_back(N->getSourceRange()); 01533 scanner->printError( N->getSourceRange().Start 01534 , SourceMgr::DK_Error 01535 , Msg 01536 , Ranges); 01537 } 01538 01539 document_iterator Stream::begin() { 01540 if (CurrentDoc) 01541 report_fatal_error("Can only iterate over the stream once"); 01542 01543 // Skip Stream-Start. 01544 scanner->getNext(); 01545 01546 CurrentDoc.reset(new Document(*this)); 01547 return document_iterator(CurrentDoc); 01548 } 01549 01550 document_iterator Stream::end() { 01551 return document_iterator(); 01552 } 01553 01554 void Stream::skip() { 01555 for (document_iterator i = begin(), e = end(); i != e; ++i) 01556 i->skip(); 01557 } 01558 01559 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 01560 StringRef T) 01561 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 01562 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 01563 SourceRange = SMRange(Start, Start); 01564 } 01565 01566 std::string Node::getVerbatimTag() const { 01567 StringRef Raw = getRawTag(); 01568 if (!Raw.empty() && Raw != "!") { 01569 std::string Ret; 01570 if (Raw.find_last_of('!') == 0) { 01571 Ret = Doc->getTagMap().find("!")->second; 01572 Ret += Raw.substr(1); 01573 return std::move(Ret); 01574 } else if (Raw.startswith("!!")) { 01575 Ret = Doc->getTagMap().find("!!")->second; 01576 Ret += Raw.substr(2); 01577 return std::move(Ret); 01578 } else { 01579 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 01580 std::map<StringRef, StringRef>::const_iterator It = 01581 Doc->getTagMap().find(TagHandle); 01582 if (It != Doc->getTagMap().end()) 01583 Ret = It->second; 01584 else { 01585 Token T; 01586 T.Kind = Token::TK_Tag; 01587 T.Range = TagHandle; 01588 setError(Twine("Unknown tag handle ") + TagHandle, T); 01589 } 01590 Ret += Raw.substr(Raw.find_last_of('!') + 1); 01591 return std::move(Ret); 01592 } 01593 } 01594 01595 switch (getType()) { 01596 case NK_Null: 01597 return "tag:yaml.org,2002:null"; 01598 case NK_Scalar: 01599 // TODO: Tag resolution. 01600 return "tag:yaml.org,2002:str"; 01601 case NK_Mapping: 01602 return "tag:yaml.org,2002:map"; 01603 case NK_Sequence: 01604 return "tag:yaml.org,2002:seq"; 01605 } 01606 01607 return ""; 01608 } 01609 01610 Token &Node::peekNext() { 01611 return Doc->peekNext(); 01612 } 01613 01614 Token Node::getNext() { 01615 return Doc->getNext(); 01616 } 01617 01618 Node *Node::parseBlockNode() { 01619 return Doc->parseBlockNode(); 01620 } 01621 01622 BumpPtrAllocator &Node::getAllocator() { 01623 return Doc->NodeAllocator; 01624 } 01625 01626 void Node::setError(const Twine &Msg, Token &Tok) const { 01627 Doc->setError(Msg, Tok); 01628 } 01629 01630 bool Node::failed() const { 01631 return Doc->failed(); 01632 } 01633 01634 01635 01636 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 01637 // TODO: Handle newlines properly. We need to remove leading whitespace. 01638 if (Value[0] == '"') { // Double quoted. 01639 // Pull off the leading and trailing "s. 01640 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 01641 // Search for characters that would require unescaping the value. 01642 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 01643 if (i != StringRef::npos) 01644 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 01645 return UnquotedValue; 01646 } else if (Value[0] == '\'') { // Single quoted. 01647 // Pull off the leading and trailing 's. 01648 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 01649 StringRef::size_type i = UnquotedValue.find('\''); 01650 if (i != StringRef::npos) { 01651 // We're going to need Storage. 01652 Storage.clear(); 01653 Storage.reserve(UnquotedValue.size()); 01654 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 01655 StringRef Valid(UnquotedValue.begin(), i); 01656 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 01657 Storage.push_back('\''); 01658 UnquotedValue = UnquotedValue.substr(i + 2); 01659 } 01660 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 01661 return StringRef(Storage.begin(), Storage.size()); 01662 } 01663 return UnquotedValue; 01664 } 01665 // Plain or block. 01666 return Value.rtrim(" "); 01667 } 01668 01669 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 01670 , StringRef::size_type i 01671 , SmallVectorImpl<char> &Storage) 01672 const { 01673 // Use Storage to build proper value. 01674 Storage.clear(); 01675 Storage.reserve(UnquotedValue.size()); 01676 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 01677 // Insert all previous chars into Storage. 01678 StringRef Valid(UnquotedValue.begin(), i); 01679 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 01680 // Chop off inserted chars. 01681 UnquotedValue = UnquotedValue.substr(i); 01682 01683 assert(!UnquotedValue.empty() && "Can't be empty!"); 01684 01685 // Parse escape or line break. 01686 switch (UnquotedValue[0]) { 01687 case '\r': 01688 case '\n': 01689 Storage.push_back('\n'); 01690 if ( UnquotedValue.size() > 1 01691 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 01692 UnquotedValue = UnquotedValue.substr(1); 01693 UnquotedValue = UnquotedValue.substr(1); 01694 break; 01695 default: 01696 if (UnquotedValue.size() == 1) 01697 // TODO: Report error. 01698 break; 01699 UnquotedValue = UnquotedValue.substr(1); 01700 switch (UnquotedValue[0]) { 01701 default: { 01702 Token T; 01703 T.Range = StringRef(UnquotedValue.begin(), 1); 01704 setError("Unrecognized escape code!", T); 01705 return ""; 01706 } 01707 case '\r': 01708 case '\n': 01709 // Remove the new line. 01710 if ( UnquotedValue.size() > 1 01711 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 01712 UnquotedValue = UnquotedValue.substr(1); 01713 // If this was just a single byte newline, it will get skipped 01714 // below. 01715 break; 01716 case '0': 01717 Storage.push_back(0x00); 01718 break; 01719 case 'a': 01720 Storage.push_back(0x07); 01721 break; 01722 case 'b': 01723 Storage.push_back(0x08); 01724 break; 01725 case 't': 01726 case 0x09: 01727 Storage.push_back(0x09); 01728 break; 01729 case 'n': 01730 Storage.push_back(0x0A); 01731 break; 01732 case 'v': 01733 Storage.push_back(0x0B); 01734 break; 01735 case 'f': 01736 Storage.push_back(0x0C); 01737 break; 01738 case 'r': 01739 Storage.push_back(0x0D); 01740 break; 01741 case 'e': 01742 Storage.push_back(0x1B); 01743 break; 01744 case ' ': 01745 Storage.push_back(0x20); 01746 break; 01747 case '"': 01748 Storage.push_back(0x22); 01749 break; 01750 case '/': 01751 Storage.push_back(0x2F); 01752 break; 01753 case '\\': 01754 Storage.push_back(0x5C); 01755 break; 01756 case 'N': 01757 encodeUTF8(0x85, Storage); 01758 break; 01759 case '_': 01760 encodeUTF8(0xA0, Storage); 01761 break; 01762 case 'L': 01763 encodeUTF8(0x2028, Storage); 01764 break; 01765 case 'P': 01766 encodeUTF8(0x2029, Storage); 01767 break; 01768 case 'x': { 01769 if (UnquotedValue.size() < 3) 01770 // TODO: Report error. 01771 break; 01772 unsigned int UnicodeScalarValue; 01773 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 01774 // TODO: Report error. 01775 UnicodeScalarValue = 0xFFFD; 01776 encodeUTF8(UnicodeScalarValue, Storage); 01777 UnquotedValue = UnquotedValue.substr(2); 01778 break; 01779 } 01780 case 'u': { 01781 if (UnquotedValue.size() < 5) 01782 // TODO: Report error. 01783 break; 01784 unsigned int UnicodeScalarValue; 01785 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 01786 // TODO: Report error. 01787 UnicodeScalarValue = 0xFFFD; 01788 encodeUTF8(UnicodeScalarValue, Storage); 01789 UnquotedValue = UnquotedValue.substr(4); 01790 break; 01791 } 01792 case 'U': { 01793 if (UnquotedValue.size() < 9) 01794 // TODO: Report error. 01795 break; 01796 unsigned int UnicodeScalarValue; 01797 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 01798 // TODO: Report error. 01799 UnicodeScalarValue = 0xFFFD; 01800 encodeUTF8(UnicodeScalarValue, Storage); 01801 UnquotedValue = UnquotedValue.substr(8); 01802 break; 01803 } 01804 } 01805 UnquotedValue = UnquotedValue.substr(1); 01806 } 01807 } 01808 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 01809 return StringRef(Storage.begin(), Storage.size()); 01810 } 01811 01812 Node *KeyValueNode::getKey() { 01813 if (Key) 01814 return Key; 01815 // Handle implicit null keys. 01816 { 01817 Token &t = peekNext(); 01818 if ( t.Kind == Token::TK_BlockEnd 01819 || t.Kind == Token::TK_Value 01820 || t.Kind == Token::TK_Error) { 01821 return Key = new (getAllocator()) NullNode(Doc); 01822 } 01823 if (t.Kind == Token::TK_Key) 01824 getNext(); // skip TK_Key. 01825 } 01826 01827 // Handle explicit null keys. 01828 Token &t = peekNext(); 01829 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 01830 return Key = new (getAllocator()) NullNode(Doc); 01831 } 01832 01833 // We've got a normal key. 01834 return Key = parseBlockNode(); 01835 } 01836 01837 Node *KeyValueNode::getValue() { 01838 if (Value) 01839 return Value; 01840 getKey()->skip(); 01841 if (failed()) 01842 return Value = new (getAllocator()) NullNode(Doc); 01843 01844 // Handle implicit null values. 01845 { 01846 Token &t = peekNext(); 01847 if ( t.Kind == Token::TK_BlockEnd 01848 || t.Kind == Token::TK_FlowMappingEnd 01849 || t.Kind == Token::TK_Key 01850 || t.Kind == Token::TK_FlowEntry 01851 || t.Kind == Token::TK_Error) { 01852 return Value = new (getAllocator()) NullNode(Doc); 01853 } 01854 01855 if (t.Kind != Token::TK_Value) { 01856 setError("Unexpected token in Key Value.", t); 01857 return Value = new (getAllocator()) NullNode(Doc); 01858 } 01859 getNext(); // skip TK_Value. 01860 } 01861 01862 // Handle explicit null values. 01863 Token &t = peekNext(); 01864 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 01865 return Value = new (getAllocator()) NullNode(Doc); 01866 } 01867 01868 // We got a normal value. 01869 return Value = parseBlockNode(); 01870 } 01871 01872 void MappingNode::increment() { 01873 if (failed()) { 01874 IsAtEnd = true; 01875 CurrentEntry = nullptr; 01876 return; 01877 } 01878 if (CurrentEntry) { 01879 CurrentEntry->skip(); 01880 if (Type == MT_Inline) { 01881 IsAtEnd = true; 01882 CurrentEntry = nullptr; 01883 return; 01884 } 01885 } 01886 Token T = peekNext(); 01887 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 01888 // KeyValueNode eats the TK_Key. That way it can detect null keys. 01889 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 01890 } else if (Type == MT_Block) { 01891 switch (T.Kind) { 01892 case Token::TK_BlockEnd: 01893 getNext(); 01894 IsAtEnd = true; 01895 CurrentEntry = nullptr; 01896 break; 01897 default: 01898 setError("Unexpected token. Expected Key or Block End", T); 01899 case Token::TK_Error: 01900 IsAtEnd = true; 01901 CurrentEntry = nullptr; 01902 } 01903 } else { 01904 switch (T.Kind) { 01905 case Token::TK_FlowEntry: 01906 // Eat the flow entry and recurse. 01907 getNext(); 01908 return increment(); 01909 case Token::TK_FlowMappingEnd: 01910 getNext(); 01911 case Token::TK_Error: 01912 // Set this to end iterator. 01913 IsAtEnd = true; 01914 CurrentEntry = nullptr; 01915 break; 01916 default: 01917 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 01918 "Mapping End." 01919 , T); 01920 IsAtEnd = true; 01921 CurrentEntry = nullptr; 01922 } 01923 } 01924 } 01925 01926 void SequenceNode::increment() { 01927 if (failed()) { 01928 IsAtEnd = true; 01929 CurrentEntry = nullptr; 01930 return; 01931 } 01932 if (CurrentEntry) 01933 CurrentEntry->skip(); 01934 Token T = peekNext(); 01935 if (SeqType == ST_Block) { 01936 switch (T.Kind) { 01937 case Token::TK_BlockEntry: 01938 getNext(); 01939 CurrentEntry = parseBlockNode(); 01940 if (!CurrentEntry) { // An error occurred. 01941 IsAtEnd = true; 01942 CurrentEntry = nullptr; 01943 } 01944 break; 01945 case Token::TK_BlockEnd: 01946 getNext(); 01947 IsAtEnd = true; 01948 CurrentEntry = nullptr; 01949 break; 01950 default: 01951 setError( "Unexpected token. Expected Block Entry or Block End." 01952 , T); 01953 case Token::TK_Error: 01954 IsAtEnd = true; 01955 CurrentEntry = nullptr; 01956 } 01957 } else if (SeqType == ST_Indentless) { 01958 switch (T.Kind) { 01959 case Token::TK_BlockEntry: 01960 getNext(); 01961 CurrentEntry = parseBlockNode(); 01962 if (!CurrentEntry) { // An error occurred. 01963 IsAtEnd = true; 01964 CurrentEntry = nullptr; 01965 } 01966 break; 01967 default: 01968 case Token::TK_Error: 01969 IsAtEnd = true; 01970 CurrentEntry = nullptr; 01971 } 01972 } else if (SeqType == ST_Flow) { 01973 switch (T.Kind) { 01974 case Token::TK_FlowEntry: 01975 // Eat the flow entry and recurse. 01976 getNext(); 01977 WasPreviousTokenFlowEntry = true; 01978 return increment(); 01979 case Token::TK_FlowSequenceEnd: 01980 getNext(); 01981 case Token::TK_Error: 01982 // Set this to end iterator. 01983 IsAtEnd = true; 01984 CurrentEntry = nullptr; 01985 break; 01986 case Token::TK_StreamEnd: 01987 case Token::TK_DocumentEnd: 01988 case Token::TK_DocumentStart: 01989 setError("Could not find closing ]!", T); 01990 // Set this to end iterator. 01991 IsAtEnd = true; 01992 CurrentEntry = nullptr; 01993 break; 01994 default: 01995 if (!WasPreviousTokenFlowEntry) { 01996 setError("Expected , between entries!", T); 01997 IsAtEnd = true; 01998 CurrentEntry = nullptr; 01999 break; 02000 } 02001 // Otherwise it must be a flow entry. 02002 CurrentEntry = parseBlockNode(); 02003 if (!CurrentEntry) { 02004 IsAtEnd = true; 02005 } 02006 WasPreviousTokenFlowEntry = false; 02007 break; 02008 } 02009 } 02010 } 02011 02012 Document::Document(Stream &S) : stream(S), Root(nullptr) { 02013 // Tag maps starts with two default mappings. 02014 TagMap["!"] = "!"; 02015 TagMap["!!"] = "tag:yaml.org,2002:"; 02016 02017 if (parseDirectives()) 02018 expectToken(Token::TK_DocumentStart); 02019 Token &T = peekNext(); 02020 if (T.Kind == Token::TK_DocumentStart) 02021 getNext(); 02022 } 02023 02024 bool Document::skip() { 02025 if (stream.scanner->failed()) 02026 return false; 02027 if (!Root) 02028 getRoot(); 02029 Root->skip(); 02030 Token &T = peekNext(); 02031 if (T.Kind == Token::TK_StreamEnd) 02032 return false; 02033 if (T.Kind == Token::TK_DocumentEnd) { 02034 getNext(); 02035 return skip(); 02036 } 02037 return true; 02038 } 02039 02040 Token &Document::peekNext() { 02041 return stream.scanner->peekNext(); 02042 } 02043 02044 Token Document::getNext() { 02045 return stream.scanner->getNext(); 02046 } 02047 02048 void Document::setError(const Twine &Message, Token &Location) const { 02049 stream.scanner->setError(Message, Location.Range.begin()); 02050 } 02051 02052 bool Document::failed() const { 02053 return stream.scanner->failed(); 02054 } 02055 02056 Node *Document::parseBlockNode() { 02057 Token T = peekNext(); 02058 // Handle properties. 02059 Token AnchorInfo; 02060 Token TagInfo; 02061 parse_property: 02062 switch (T.Kind) { 02063 case Token::TK_Alias: 02064 getNext(); 02065 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 02066 case Token::TK_Anchor: 02067 if (AnchorInfo.Kind == Token::TK_Anchor) { 02068 setError("Already encountered an anchor for this node!", T); 02069 return nullptr; 02070 } 02071 AnchorInfo = getNext(); // Consume TK_Anchor. 02072 T = peekNext(); 02073 goto parse_property; 02074 case Token::TK_Tag: 02075 if (TagInfo.Kind == Token::TK_Tag) { 02076 setError("Already encountered a tag for this node!", T); 02077 return nullptr; 02078 } 02079 TagInfo = getNext(); // Consume TK_Tag. 02080 T = peekNext(); 02081 goto parse_property; 02082 default: 02083 break; 02084 } 02085 02086 switch (T.Kind) { 02087 case Token::TK_BlockEntry: 02088 // We got an unindented BlockEntry sequence. This is not terminated with 02089 // a BlockEnd. 02090 // Don't eat the TK_BlockEntry, SequenceNode needs it. 02091 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 02092 , AnchorInfo.Range.substr(1) 02093 , TagInfo.Range 02094 , SequenceNode::ST_Indentless); 02095 case Token::TK_BlockSequenceStart: 02096 getNext(); 02097 return new (NodeAllocator) 02098 SequenceNode( stream.CurrentDoc 02099 , AnchorInfo.Range.substr(1) 02100 , TagInfo.Range 02101 , SequenceNode::ST_Block); 02102 case Token::TK_BlockMappingStart: 02103 getNext(); 02104 return new (NodeAllocator) 02105 MappingNode( stream.CurrentDoc 02106 , AnchorInfo.Range.substr(1) 02107 , TagInfo.Range 02108 , MappingNode::MT_Block); 02109 case Token::TK_FlowSequenceStart: 02110 getNext(); 02111 return new (NodeAllocator) 02112 SequenceNode( stream.CurrentDoc 02113 , AnchorInfo.Range.substr(1) 02114 , TagInfo.Range 02115 , SequenceNode::ST_Flow); 02116 case Token::TK_FlowMappingStart: 02117 getNext(); 02118 return new (NodeAllocator) 02119 MappingNode( stream.CurrentDoc 02120 , AnchorInfo.Range.substr(1) 02121 , TagInfo.Range 02122 , MappingNode::MT_Flow); 02123 case Token::TK_Scalar: 02124 getNext(); 02125 return new (NodeAllocator) 02126 ScalarNode( stream.CurrentDoc 02127 , AnchorInfo.Range.substr(1) 02128 , TagInfo.Range 02129 , T.Range); 02130 case Token::TK_Key: 02131 // Don't eat the TK_Key, KeyValueNode expects it. 02132 return new (NodeAllocator) 02133 MappingNode( stream.CurrentDoc 02134 , AnchorInfo.Range.substr(1) 02135 , TagInfo.Range 02136 , MappingNode::MT_Inline); 02137 case Token::TK_DocumentStart: 02138 case Token::TK_DocumentEnd: 02139 case Token::TK_StreamEnd: 02140 default: 02141 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 02142 // !!null null. 02143 return new (NodeAllocator) NullNode(stream.CurrentDoc); 02144 case Token::TK_Error: 02145 return nullptr; 02146 } 02147 llvm_unreachable("Control flow shouldn't reach here."); 02148 return nullptr; 02149 } 02150 02151 bool Document::parseDirectives() { 02152 bool isDirective = false; 02153 while (true) { 02154 Token T = peekNext(); 02155 if (T.Kind == Token::TK_TagDirective) { 02156 parseTAGDirective(); 02157 isDirective = true; 02158 } else if (T.Kind == Token::TK_VersionDirective) { 02159 parseYAMLDirective(); 02160 isDirective = true; 02161 } else 02162 break; 02163 } 02164 return isDirective; 02165 } 02166 02167 void Document::parseYAMLDirective() { 02168 getNext(); // Eat %YAML <version> 02169 } 02170 02171 void Document::parseTAGDirective() { 02172 Token Tag = getNext(); // %TAG <handle> <prefix> 02173 StringRef T = Tag.Range; 02174 // Strip %TAG 02175 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 02176 std::size_t HandleEnd = T.find_first_of(" \t"); 02177 StringRef TagHandle = T.substr(0, HandleEnd); 02178 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 02179 TagMap[TagHandle] = TagPrefix; 02180 } 02181 02182 bool Document::expectToken(int TK) { 02183 Token T = getNext(); 02184 if (T.Kind != TK) { 02185 setError("Unexpected token", T); 02186 return false; 02187 } 02188 return true; 02189 }