1 | //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This is a YAML 1.2 parser. |
10 | // |
11 | // See http://www.yaml.org/spec/1.2/spec.html for the full standard. |
12 | // |
13 | // This currently does not implement the following: |
14 | // * Multi-line literal folding. |
15 | // * Tag resolution. |
16 | // * UTF-16. |
17 | // * BOMs anywhere other than the first Unicode scalar value in the file. |
18 | // |
19 | // The most important class here is Stream. This represents a YAML stream with |
20 | // 0, 1, or many documents. |
21 | // |
22 | // SourceMgr sm; |
23 | // StringRef input = getInput(); |
24 | // yaml::Stream stream(input, sm); |
25 | // |
26 | // for (yaml::document_iterator di = stream.begin(), de = stream.end(); |
27 | // di != de; ++di) { |
28 | // yaml::Node *n = di->getRoot(); |
29 | // if (n) { |
30 | // // Do something with n... |
31 | // } else |
32 | // break; |
33 | // } |
34 | // |
35 | //===----------------------------------------------------------------------===// |
36 | |
37 | #ifndef LLVM_SUPPORT_YAMLPARSER_H |
38 | #define LLVM_SUPPORT_YAMLPARSER_H |
39 | |
40 | #include "llvm/ADT/StringRef.h" |
41 | #include "llvm/Support/Allocator.h" |
42 | #include "llvm/Support/SMLoc.h" |
43 | #include "llvm/Support/SourceMgr.h" |
44 | #include <cassert> |
45 | #include <cstddef> |
46 | #include <iterator> |
47 | #include <map> |
48 | #include <memory> |
49 | #include <string> |
50 | #include <system_error> |
51 | |
52 | namespace llvm { |
53 | |
54 | class MemoryBufferRef; |
55 | class raw_ostream; |
56 | class Twine; |
57 | |
58 | namespace yaml { |
59 | |
60 | class Document; |
61 | class document_iterator; |
62 | class Node; |
63 | class Scanner; |
64 | struct Token; |
65 | |
66 | /// Dump all the tokens in this stream to OS. |
67 | /// \returns true if there was an error, false otherwise. |
68 | bool dumpTokens(StringRef Input, raw_ostream &); |
69 | |
70 | /// Scans all tokens in input without outputting anything. This is used |
71 | /// for benchmarking the tokenizer. |
72 | /// \returns true if there was an error, false otherwise. |
73 | bool scanTokens(StringRef Input); |
74 | |
75 | /// Escape \a Input for a double quoted scalar; if \p EscapePrintable |
76 | /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is |
77 | /// false, those UTF8 sequences encoding printable unicode scalars will not be |
78 | /// escaped, but emitted verbatim. |
79 | std::string escape(StringRef Input, bool EscapePrintable = true); |
80 | |
81 | /// Parse \p S as a bool according to https://yaml.org/type/bool.html. |
82 | llvm::Optional<bool> parseBool(StringRef S); |
83 | |
84 | /// This class represents a YAML stream potentially containing multiple |
85 | /// documents. |
86 | class Stream { |
87 | public: |
88 | /// This keeps a reference to the string referenced by \p Input. |
89 | Stream(StringRef Input, SourceMgr &, bool ShowColors = true, |
90 | std::error_code *EC = nullptr); |
91 | |
92 | Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true, |
93 | std::error_code *EC = nullptr); |
94 | ~Stream(); |
95 | |
96 | document_iterator begin(); |
97 | document_iterator end(); |
98 | void skip(); |
99 | bool failed(); |
100 | |
101 | bool validate() { |
102 | skip(); |
103 | return !failed(); |
104 | } |
105 | |
106 | void printError(Node *N, const Twine &Msg, |
107 | SourceMgr::DiagKind Kind = SourceMgr::DK_Error); |
108 | void printError(const SMRange &Range, const Twine &Msg, |
109 | SourceMgr::DiagKind Kind = SourceMgr::DK_Error); |
110 | |
111 | private: |
112 | friend class Document; |
113 | |
114 | std::unique_ptr<Scanner> scanner; |
115 | std::unique_ptr<Document> CurrentDoc; |
116 | }; |
117 | |
118 | /// Abstract base class for all Nodes. |
119 | class Node { |
120 | virtual void anchor(); |
121 | |
122 | public: |
123 | enum NodeKind { |
124 | NK_Null, |
125 | NK_Scalar, |
126 | NK_BlockScalar, |
127 | NK_KeyValue, |
128 | NK_Mapping, |
129 | NK_Sequence, |
130 | NK_Alias |
131 | }; |
132 | |
133 | Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, |
134 | StringRef Tag); |
135 | |
136 | // It's not safe to copy YAML nodes; the document is streamed and the position |
137 | // is part of the state. |
138 | Node(const Node &) = delete; |
139 | void operator=(const Node &) = delete; |
140 | |
141 | void *operator new(size_t Size, BumpPtrAllocator &Alloc, |
142 | size_t Alignment = 16) noexcept { |
143 | return Alloc.Allocate(Size, Alignment); |
144 | } |
145 | |
146 | void operator delete(void *Ptr, BumpPtrAllocator &Alloc, |
147 | size_t Size) noexcept { |
148 | Alloc.Deallocate(Ptr, Size, 0); |
149 | } |
150 | |
151 | void operator delete(void *) noexcept = delete; |
152 | |
153 | /// Get the value of the anchor attached to this node. If it does not |
154 | /// have one, getAnchor().size() will be 0. |
155 | StringRef getAnchor() const { return Anchor; } |
156 | |
157 | /// Get the tag as it was written in the document. This does not |
158 | /// perform tag resolution. |
159 | StringRef getRawTag() const { return Tag; } |
160 | |
161 | /// Get the verbatium tag for a given Node. This performs tag resoluton |
162 | /// and substitution. |
163 | std::string getVerbatimTag() const; |
164 | |
165 | SMRange getSourceRange() const { return SourceRange; } |
166 | void setSourceRange(SMRange SR) { SourceRange = SR; } |
167 | |
168 | // These functions forward to Document and Scanner. |
169 | Token &peekNext(); |
170 | Token getNext(); |
171 | Node *parseBlockNode(); |
172 | BumpPtrAllocator &getAllocator(); |
173 | void setError(const Twine &Message, Token &Location) const; |
174 | bool failed() const; |
175 | |
176 | virtual void skip() {} |
177 | |
178 | unsigned int getType() const { return TypeID; } |
179 | |
180 | protected: |
181 | std::unique_ptr<Document> &Doc; |
182 | SMRange SourceRange; |
183 | |
184 | ~Node() = default; |
185 | |
186 | private: |
187 | unsigned int TypeID; |
188 | StringRef Anchor; |
189 | /// The tag as typed in the document. |
190 | StringRef Tag; |
191 | }; |
192 | |
193 | /// A null value. |
194 | /// |
195 | /// Example: |
196 | /// !!null null |
197 | class NullNode final : public Node { |
198 | void anchor() override; |
199 | |
200 | public: |
201 | NullNode(std::unique_ptr<Document> &D) |
202 | : Node(NK_Null, D, StringRef(), StringRef()) {} |
203 | |
204 | static bool classof(const Node *N) { return N->getType() == NK_Null; } |
205 | }; |
206 | |
207 | /// A scalar node is an opaque datum that can be presented as a |
208 | /// series of zero or more Unicode scalar values. |
209 | /// |
210 | /// Example: |
211 | /// Adena |
212 | class ScalarNode final : public Node { |
213 | void anchor() override; |
214 | |
215 | public: |
216 | ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
217 | StringRef Val) |
218 | : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { |
219 | SMLoc Start = SMLoc::getFromPointer(Val.begin()); |
220 | SMLoc End = SMLoc::getFromPointer(Val.end()); |
221 | SourceRange = SMRange(Start, End); |
222 | } |
223 | |
224 | // Return Value without any escaping or folding or other fun YAML stuff. This |
225 | // is the exact bytes that are contained in the file (after conversion to |
226 | // utf8). |
227 | StringRef getRawValue() const { return Value; } |
228 | |
229 | /// Gets the value of this node as a StringRef. |
230 | /// |
231 | /// \param Storage is used to store the content of the returned StringRef if |
232 | /// it requires any modification from how it appeared in the source. |
233 | /// This happens with escaped characters and multi-line literals. |
234 | StringRef getValue(SmallVectorImpl<char> &Storage) const; |
235 | |
236 | static bool classof(const Node *N) { |
237 | return N->getType() == NK_Scalar; |
238 | } |
239 | |
240 | private: |
241 | StringRef Value; |
242 | |
243 | StringRef unescapeDoubleQuoted(StringRef UnquotedValue, |
244 | StringRef::size_type Start, |
245 | SmallVectorImpl<char> &Storage) const; |
246 | }; |
247 | |
248 | /// A block scalar node is an opaque datum that can be presented as a |
249 | /// series of zero or more Unicode scalar values. |
250 | /// |
251 | /// Example: |
252 | /// | |
253 | /// Hello |
254 | /// World |
255 | class BlockScalarNode final : public Node { |
256 | void anchor() override; |
257 | |
258 | public: |
259 | BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
260 | StringRef Value, StringRef RawVal) |
261 | : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) { |
262 | SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); |
263 | SMLoc End = SMLoc::getFromPointer(RawVal.end()); |
264 | SourceRange = SMRange(Start, End); |
265 | } |
266 | |
267 | /// Gets the value of this node as a StringRef. |
268 | StringRef getValue() const { return Value; } |
269 | |
270 | static bool classof(const Node *N) { |
271 | return N->getType() == NK_BlockScalar; |
272 | } |
273 | |
274 | private: |
275 | StringRef Value; |
276 | }; |
277 | |
278 | /// A key and value pair. While not technically a Node under the YAML |
279 | /// representation graph, it is easier to treat them this way. |
280 | /// |
281 | /// TODO: Consider making this not a child of Node. |
282 | /// |
283 | /// Example: |
284 | /// Section: .text |
285 | class KeyValueNode final : public Node { |
286 | void anchor() override; |
287 | |
288 | public: |
289 | KeyValueNode(std::unique_ptr<Document> &D) |
290 | : Node(NK_KeyValue, D, StringRef(), StringRef()) {} |
291 | |
292 | /// Parse and return the key. |
293 | /// |
294 | /// This may be called multiple times. |
295 | /// |
296 | /// \returns The key, or nullptr if failed() == true. |
297 | Node *getKey(); |
298 | |
299 | /// Parse and return the value. |
300 | /// |
301 | /// This may be called multiple times. |
302 | /// |
303 | /// \returns The value, or nullptr if failed() == true. |
304 | Node *getValue(); |
305 | |
306 | void skip() override { |
307 | if (Node *Key = getKey()) { |
308 | Key->skip(); |
309 | if (Node *Val = getValue()) |
310 | Val->skip(); |
311 | } |
312 | } |
313 | |
314 | static bool classof(const Node *N) { |
315 | return N->getType() == NK_KeyValue; |
316 | } |
317 | |
318 | private: |
319 | Node *Key = nullptr; |
320 | Node *Value = nullptr; |
321 | }; |
322 | |
323 | /// This is an iterator abstraction over YAML collections shared by both |
324 | /// sequences and maps. |
325 | /// |
326 | /// BaseT must have a ValueT* member named CurrentEntry and a member function |
327 | /// increment() which must set CurrentEntry to 0 to create an end iterator. |
328 | template <class BaseT, class ValueT> class basic_collection_iterator { |
329 | public: |
330 | using iterator_category = std::input_iterator_tag; |
331 | using value_type = ValueT; |
332 | using difference_type = std::ptrdiff_t; |
333 | using pointer = value_type *; |
334 | using reference = value_type &; |
335 | |
336 | basic_collection_iterator() = default; |
337 | basic_collection_iterator(BaseT *B) : Base(B) {} |
338 | |
339 | ValueT *operator->() const { |
340 | assert(Base && Base->CurrentEntry && "Attempted to access end iterator!" ); |
341 | return Base->CurrentEntry; |
342 | } |
343 | |
344 | ValueT &operator*() const { |
345 | assert(Base && Base->CurrentEntry && |
346 | "Attempted to dereference end iterator!" ); |
347 | return *Base->CurrentEntry; |
348 | } |
349 | |
350 | operator ValueT *() const { |
351 | assert(Base && Base->CurrentEntry && "Attempted to access end iterator!" ); |
352 | return Base->CurrentEntry; |
353 | } |
354 | |
355 | /// Note on EqualityComparable: |
356 | /// |
357 | /// The iterator is not re-entrant, |
358 | /// it is meant to be used for parsing YAML on-demand |
359 | /// Once iteration started - it can point only to one entry at a time |
360 | /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal |
361 | /// iff Base and Other.Base are equal. |
362 | bool operator==(const basic_collection_iterator &Other) const { |
363 | if (Base && (Base == Other.Base)) { |
364 | assert((Base->CurrentEntry == Other.Base->CurrentEntry) |
365 | && "Equal Bases expected to point to equal Entries" ); |
366 | } |
367 | |
368 | return Base == Other.Base; |
369 | } |
370 | |
371 | bool operator!=(const basic_collection_iterator &Other) const { |
372 | return !(Base == Other.Base); |
373 | } |
374 | |
375 | basic_collection_iterator &operator++() { |
376 | assert(Base && "Attempted to advance iterator past end!" ); |
377 | Base->increment(); |
378 | // Create an end iterator. |
379 | if (!Base->CurrentEntry) |
380 | Base = nullptr; |
381 | return *this; |
382 | } |
383 | |
384 | private: |
385 | BaseT *Base = nullptr; |
386 | }; |
387 | |
388 | // The following two templates are used for both MappingNode and Sequence Node. |
389 | template <class CollectionType> |
390 | typename CollectionType::iterator begin(CollectionType &C) { |
391 | assert(C.IsAtBeginning && "You may only iterate over a collection once!" ); |
392 | C.IsAtBeginning = false; |
393 | typename CollectionType::iterator ret(&C); |
394 | ++ret; |
395 | return ret; |
396 | } |
397 | |
398 | template <class CollectionType> void skip(CollectionType &C) { |
399 | // TODO: support skipping from the middle of a parsed collection ;/ |
400 | assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!" ); |
401 | if (C.IsAtBeginning) |
402 | for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; |
403 | ++i) |
404 | i->skip(); |
405 | } |
406 | |
407 | /// Represents a YAML map created from either a block map for a flow map. |
408 | /// |
409 | /// This parses the YAML stream as increment() is called. |
410 | /// |
411 | /// Example: |
412 | /// Name: _main |
413 | /// Scope: Global |
414 | class MappingNode final : public Node { |
415 | void anchor() override; |
416 | |
417 | public: |
418 | enum MappingType { |
419 | MT_Block, |
420 | MT_Flow, |
421 | MT_Inline ///< An inline mapping node is used for "[key: value]". |
422 | }; |
423 | |
424 | MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
425 | MappingType MT) |
426 | : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {} |
427 | |
428 | friend class basic_collection_iterator<MappingNode, KeyValueNode>; |
429 | |
430 | using iterator = basic_collection_iterator<MappingNode, KeyValueNode>; |
431 | |
432 | template <class T> friend typename T::iterator yaml::begin(T &); |
433 | template <class T> friend void yaml::skip(T &); |
434 | |
435 | iterator begin() { return yaml::begin(*this); } |
436 | |
437 | iterator end() { return iterator(); } |
438 | |
439 | void skip() override { yaml::skip(*this); } |
440 | |
441 | static bool classof(const Node *N) { |
442 | return N->getType() == NK_Mapping; |
443 | } |
444 | |
445 | private: |
446 | MappingType Type; |
447 | bool IsAtBeginning = true; |
448 | bool IsAtEnd = false; |
449 | KeyValueNode *CurrentEntry = nullptr; |
450 | |
451 | void increment(); |
452 | }; |
453 | |
454 | /// Represents a YAML sequence created from either a block sequence for a |
455 | /// flow sequence. |
456 | /// |
457 | /// This parses the YAML stream as increment() is called. |
458 | /// |
459 | /// Example: |
460 | /// - Hello |
461 | /// - World |
462 | class SequenceNode final : public Node { |
463 | void anchor() override; |
464 | |
465 | public: |
466 | enum SequenceType { |
467 | ST_Block, |
468 | ST_Flow, |
469 | // Use for: |
470 | // |
471 | // key: |
472 | // - val1 |
473 | // - val2 |
474 | // |
475 | // As a BlockMappingEntry and BlockEnd are not created in this case. |
476 | ST_Indentless |
477 | }; |
478 | |
479 | SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, |
480 | SequenceType ST) |
481 | : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {} |
482 | |
483 | friend class basic_collection_iterator<SequenceNode, Node>; |
484 | |
485 | using iterator = basic_collection_iterator<SequenceNode, Node>; |
486 | |
487 | template <class T> friend typename T::iterator yaml::begin(T &); |
488 | template <class T> friend void yaml::skip(T &); |
489 | |
490 | void increment(); |
491 | |
492 | iterator begin() { return yaml::begin(*this); } |
493 | |
494 | iterator end() { return iterator(); } |
495 | |
496 | void skip() override { yaml::skip(*this); } |
497 | |
498 | static bool classof(const Node *N) { |
499 | return N->getType() == NK_Sequence; |
500 | } |
501 | |
502 | private: |
503 | SequenceType SeqType; |
504 | bool IsAtBeginning = true; |
505 | bool IsAtEnd = false; |
506 | bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','. |
507 | Node *CurrentEntry = nullptr; |
508 | }; |
509 | |
510 | /// Represents an alias to a Node with an anchor. |
511 | /// |
512 | /// Example: |
513 | /// *AnchorName |
514 | class AliasNode final : public Node { |
515 | void anchor() override; |
516 | |
517 | public: |
518 | AliasNode(std::unique_ptr<Document> &D, StringRef Val) |
519 | : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} |
520 | |
521 | StringRef getName() const { return Name; } |
522 | |
523 | static bool classof(const Node *N) { return N->getType() == NK_Alias; } |
524 | |
525 | private: |
526 | StringRef Name; |
527 | }; |
528 | |
529 | /// A YAML Stream is a sequence of Documents. A document contains a root |
530 | /// node. |
531 | class Document { |
532 | public: |
533 | Document(Stream &ParentStream); |
534 | |
535 | /// Root for parsing a node. Returns a single node. |
536 | Node *parseBlockNode(); |
537 | |
538 | /// Finish parsing the current document and return true if there are |
539 | /// more. Return false otherwise. |
540 | bool skip(); |
541 | |
542 | /// Parse and return the root level node. |
543 | Node *getRoot() { |
544 | if (Root) |
545 | return Root; |
546 | return Root = parseBlockNode(); |
547 | } |
548 | |
549 | const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } |
550 | |
551 | private: |
552 | friend class Node; |
553 | friend class document_iterator; |
554 | |
555 | /// Stream to read tokens from. |
556 | Stream &stream; |
557 | |
558 | /// Used to allocate nodes to. All are destroyed without calling their |
559 | /// destructor when the document is destroyed. |
560 | BumpPtrAllocator NodeAllocator; |
561 | |
562 | /// The root node. Used to support skipping a partially parsed |
563 | /// document. |
564 | Node *Root; |
565 | |
566 | /// Maps tag prefixes to their expansion. |
567 | std::map<StringRef, StringRef> TagMap; |
568 | |
569 | Token &peekNext(); |
570 | Token getNext(); |
571 | void setError(const Twine &Message, Token &Location) const; |
572 | bool failed() const; |
573 | |
574 | /// Parse %BLAH directives and return true if any were encountered. |
575 | bool parseDirectives(); |
576 | |
577 | /// Parse %YAML |
578 | void parseYAMLDirective(); |
579 | |
580 | /// Parse %TAG |
581 | void parseTAGDirective(); |
582 | |
583 | /// Consume the next token and error if it is not \a TK. |
584 | bool expectToken(int TK); |
585 | }; |
586 | |
587 | /// Iterator abstraction for Documents over a Stream. |
588 | class document_iterator { |
589 | public: |
590 | document_iterator() = default; |
591 | document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} |
592 | |
593 | bool operator==(const document_iterator &Other) const { |
594 | if (isAtEnd() || Other.isAtEnd()) |
595 | return isAtEnd() && Other.isAtEnd(); |
596 | |
597 | return Doc == Other.Doc; |
598 | } |
599 | bool operator!=(const document_iterator &Other) const { |
600 | return !(*this == Other); |
601 | } |
602 | |
603 | document_iterator operator++() { |
604 | assert(Doc && "incrementing iterator past the end." ); |
605 | if (!(*Doc)->skip()) { |
606 | Doc->reset(nullptr); |
607 | } else { |
608 | Stream &S = (*Doc)->stream; |
609 | Doc->reset(new Document(S)); |
610 | } |
611 | return *this; |
612 | } |
613 | |
614 | Document &operator*() { return *Doc->get(); } |
615 | |
616 | std::unique_ptr<Document> &operator->() { return *Doc; } |
617 | |
618 | private: |
619 | bool isAtEnd() const { return !Doc || !*Doc; } |
620 | |
621 | std::unique_ptr<Document> *Doc = nullptr; |
622 | }; |
623 | |
624 | } // end namespace yaml |
625 | |
626 | } // end namespace llvm |
627 | |
628 | #endif // LLVM_SUPPORT_YAMLPARSER_H |
629 | |