//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This implements a standard Generalized LR (GLR) parsing algorithm. // // The GLR parser behaves as a normal LR parser until it encounters a conflict. // To handle a conflict (where there are multiple actions could perform), the // parser will simulate nondeterminism by doing a breadth-first search // over all the possibilities. // // Basic mechanisims of the GLR parser: // - A number of processes are operated in parallel. // - Each process has its own parsing stack and behaves as a standard // determinism LR parser. // - When a process encounters a conflict, it will be fork (one for each // avaiable action). // - When a process encounters an error, it is abandoned. // - All process are synchronized by the lookahead token: they perfrom shift // action at the same time, which means some processes need wait until other // processes have performed all reduce actions. // //===----------------------------------------------------------------------===// #ifndef CLANG_PSEUDO_GLR_H #define CLANG_PSEUDO_GLR_H #include "clang-pseudo/Forest.h" #include "clang-pseudo/Language.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRTable.h" #include "llvm/Support/Allocator.h" #include namespace clang { namespace pseudo { // A Graph-Structured Stack efficiently represents all parse stacks of a GLR // parser. // // Each node stores a parse state, the last parsed ForestNode, and the parent // node. There may be several heads (top of stack), and the parser operates by: // - shift: pushing terminal symbols on top of the stack // - reduce: replace N symbols on top of the stack with one nonterminal // // The structure is a DAG rather than a linear stack: // - GLR allows multiple actions (conflicts) on the same head, producing forks // where several nodes have the same parent // - The parser merges nodes with the same (state, ForestNode), producing joins // where one node has multiple parents // // The parser is responsible for creating nodes and keeping track of the set of // heads. The GSS class is mostly an arena for them. struct GSS { // A node represents a partial parse of the input up to some point. // // It is the equivalent of a frame in an LR parse stack. // Like such a frame, it has an LR parse state and a syntax-tree node // representing the last parsed symbol (a ForestNode in our case). // Unlike a regular LR stack frame, it may have multiple parents. // // Nodes are not exactly pushed and popped on the stack: pushing is just // allocating a new head node with a parent pointer to the old head. Popping // is just forgetting about a node and remembering its parent instead. struct alignas(struct Node *) Node { // LR state describing how parsing should continue from this head. LRTable::StateID State; // Used internally to track reachability during garbage collection. bool GCParity; // Have we already used this node for error recovery? (prevents loops) mutable bool Recovered = false; // Number of the parents of this node. // The parents hold previous parsed symbols, and may resume control after // this node is reduced. unsigned ParentCount; // The parse node for the last parsed symbol. // This symbol appears on the left of the dot in the parse state's items. // (In the literature, the node is attached to the *edge* to the parent). const ForestNode *Payload = nullptr; llvm::ArrayRef parents() const { return llvm::ArrayRef(reinterpret_cast(this + 1), ParentCount); }; // Parents are stored as a trailing array of Node*. }; // Allocates a new node in the graph. const Node *addNode(LRTable::StateID State, const ForestNode *Symbol, llvm::ArrayRef Parents); // Frees all nodes not reachable as ancestors of Roots, and returns the count. // Calling this periodically prevents steady memory growth of the GSS. unsigned gc(std::vector &&Roots); size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); } size_t nodesCreated() const { return NodesCreated; } private: // Nodes are recycled using freelists. // They are variable size, so use one free-list per distinct #parents. std::vector> FreeList; Node *allocate(unsigned Parents); void destroy(Node *N); // The list of nodes created and not destroyed - our candidates for gc(). std::vector Alive; bool GCParity = false; // All nodes should match this, except during GC. llvm::BumpPtrAllocator Arena; unsigned NodesCreated = 0; }; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &); // Parameters for the GLR parsing. struct ParseParams { // The token stream to parse. const TokenStream &Code; // Arena for data structure used by the GLR algorithm. ForestArena &Forest; // Storage for the output forest. GSS &GSStack; // Storage for parsing stacks. }; // Parses the given token stream as the start symbol with the GLR algorithm, // and returns a forest node of the start symbol. // // A rule `_ := StartSymbol` must exit for the chosen start symbol. // // If the parsing fails, we model it as an opaque node in the forest. ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol, const Language &Lang); // Shift a token onto all OldHeads, placing the results into NewHeads. // // Exposed for testing only. void glrShift(llvm::ArrayRef OldHeads, const ForestNode &NextTok, const ParseParams &Params, const Language &Lang, std::vector &NewHeads); // Applies available reductions on Heads, appending resulting heads to the list. // // Exposed for testing only. void glrReduce(std::vector &Heads, SymbolID Lookahead, const ParseParams &Params, const Language &Lang); // Heuristically recover from a state where no further parsing is possible. // // OldHeads is the parse state at TokenIndex. // This function consumes zero or more tokens by advancing TokenIndex, // and places any recovery states created in NewHeads. // // On failure, NewHeads is empty and TokenIndex is unchanged. // // WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens, // there is a risk of the parser falling into an infinite loop, creating an // endless sequence of recovery nodes. // Generally it is safe for recovery to match 0 tokens against sequence symbols // like `statement-seq`, as the grammar won't permit another statement-seq // immediately afterwards. However recovery strategies for `statement` should // consume at least one token, as statements may be adjacent in the input. void glrRecover(llvm::ArrayRef OldHeads, unsigned &TokenIndex, const ParseParams &Params, const Language &Lang, std::vector &NewHeads); } // namespace pseudo } // namespace clang #endif // CLANG_PSEUDO_GLR_H