//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "clang-pseudo/grammar/LRGraph.h" #include "clang-pseudo/grammar/Grammar.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" using ItemSet = std::vector; namespace llvm { // Support clang::pseudo::Item as DenseMap keys. template <> struct DenseMapInfo { static inline ItemSet getEmptyKey() { return {DenseMapInfo::getEmptyKey()}; } static inline ItemSet getTombstoneKey() { return {DenseMapInfo::getTombstoneKey()}; } static unsigned getHashValue(const ItemSet &I) { return llvm::hash_combine_range(I.begin(), I.end()); } static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) { return LHS == RHS; } }; } // namespace llvm namespace clang { namespace pseudo { namespace { struct SortByNextSymbol { SortByNextSymbol(const Grammar &G) : G(G) {} bool operator()(const Item &L, const Item &R) { if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G)) return L.next(G) < R.next(G); if (L.hasNext() != R.hasNext()) return L.hasNext() < R.hasNext(); // a trailing dot is minimal. return L < R; } const Grammar &G; }; // Computes a closure of the given item set S: // - extends the given S to contain all options for parsing next token; // - nonterminals after a dot are recursively expanded into the begin-state // of all production rules that produce that nonterminal; // // Given // Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ] // Input = [ E := . T ] // returns [ E := . T, T := . n, T := . ( E ) ] State closure(ItemSet Queue, const Grammar &G) { llvm::DenseSet InQueue = {Queue.begin(), Queue.end()}; // We reuse the passed-by-value Queue as the final result, as it's already // initialized to the right elements. size_t ItIndex = 0; while (ItIndex < Queue.size()) { const Item &ExpandingItem = Queue[ItIndex]; ++ItIndex; if (!ExpandingItem.hasNext()) continue; SymbolID NextSym = ExpandingItem.next(G); if (pseudo::isToken(NextSym)) continue; auto RRange = G.table().Nonterminals[NextSym].RuleRange; for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) { Item NewItem = Item::start(RID, G); if (InQueue.insert(NewItem).second) // new Queue.push_back(std::move(NewItem)); } } Queue.shrink_to_fit(); llvm::sort(Queue, SortByNextSymbol(G)); return {std::move(Queue)}; } // Returns all next (with a dot advanced) kernel item sets, partitioned by the // advanced symbol. // // Given // S = [ E := . a b, E := E . - T ] // returns [ // {id(a), [ E := a . b ]}, // {id(-), [ E := E - . T ]} // ] std::vector> nextAvailableKernelItems(const State &S, const Grammar &G) { std::vector> Results; llvm::ArrayRef AllItems = S.Items; AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); }); while (!AllItems.empty()) { SymbolID AdvancedSymbol = AllItems.front().next(G); auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) { assert(I.hasNext()); return I.next(G) == AdvancedSymbol; }); assert(!Batch.empty()); AllItems = AllItems.drop_front(Batch.size()); // Advance a dot over the Symbol. ItemSet Next; for (const Item &I : Batch) Next.push_back(I.advance()); // sort the set to keep order determinism for hash computation. llvm::sort(Next); Results.push_back({AdvancedSymbol, std::move(Next)}); } return Results; } std::vector> availableRecovery(const State &S, const Grammar &G) { std::vector> Result; for (const Item &I : S.Items) { const auto &Rule = G.lookupRule(I.rule()); if (I.dot() != Rule.RecoveryIndex) continue; Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]}); } llvm::sort(Result); Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); return Result; } } // namespace std::string Item::dump(const Grammar &G) const { const auto &Rule = G.lookupRule(RID); auto ToNames = [&](llvm::ArrayRef Syms) { std::vector Results; for (auto SID : Syms) Results.push_back(G.symbolName(SID)); return Results; }; return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target), llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "), llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "), Rule.RecoveryIndex == DotPos ? " [recovery]" : "") .str(); } std::string State::dump(const Grammar &G, unsigned Indent) const { std::string Result; llvm::raw_string_ostream OS(Result); for (const auto &Item : Items) OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G)); return OS.str(); } std::string LRGraph::dumpForTests(const Grammar &G) const { std::string Result; llvm::raw_string_ostream OS(Result); OS << "States:\n"; for (StateID ID = 0; ID < States.size(); ++ID) { OS << llvm::formatv("State {0}\n", ID); OS << States[ID].dump(G, /*Indent*/ 4); } for (const auto &E : Edges) { OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label), E.Dst); } return OS.str(); } LRGraph LRGraph::buildLR0(const Grammar &G) { class Builder { public: Builder(const Grammar &G) : G(G) {} // Adds a given state if not existed. std::pair insert(ItemSet KernelItems) { assert(llvm::is_sorted(KernelItems) && "Item must be sorted before inserting to a hash map!"); auto It = StatesIndex.find(KernelItems); if (It != StatesIndex.end()) return {It->second, false}; States.push_back(closure(KernelItems, G)); StateID NextStateID = States.size() - 1; StatesIndex.insert({std::move(KernelItems), NextStateID}); return {NextStateID, true}; } void insertEdge(StateID Src, StateID Dst, SymbolID Label) { Edges.push_back({Src, Dst, Label}); } void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) { Recoveries.push_back({Src, Strategy, Result}); } // Returns a state with the given id. const State &find(StateID ID) const { assert(ID < States.size()); return States[ID]; } void addStartState(SymbolID Sym, StateID State) { StartStates.push_back({Sym, State}); } LRGraph build() && { States.shrink_to_fit(); Edges.shrink_to_fit(); Recoveries.shrink_to_fit(); llvm::sort(StartStates); StartStates.shrink_to_fit(); return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries), std::move(StartStates)); } private: // Key is the **kernel** item sets. llvm::DenseMap StatesIndex; std::vector States; std::vector Edges; std::vector Recoveries; const Grammar &G; std::vector> StartStates; } Builder(G); std::vector PendingStates; // Initialize states with the start symbol. auto RRange = G.table().Nonterminals[G.underscore()].RuleRange; for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) { auto StartState = std::vector{Item::start(RID, G)}; auto Result = Builder.insert(std::move(StartState)); assert(Result.second && "State must be new"); PendingStates.push_back(Result.first); const Rule &StartRule = G.lookupRule(RID); assert(StartRule.Size == 2 && StartRule.seq().back() == tokenSymbol(tok::eof) && "Start rule must be of the form `_ := start-symbol EOF`!"); Builder.addStartState(StartRule.seq().front(), Result.first); } while (!PendingStates.empty()) { auto StateID = PendingStates.back(); PendingStates.pop_back(); for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) { auto Insert = Builder.insert(Next.second); if (Insert.second) // new state, insert to the pending queue. PendingStates.push_back(Insert.first); Builder.insertEdge(StateID, Insert.first, Next.first); } for (auto Recovery : availableRecovery(Builder.find(StateID), G)) Builder.insertRecovery(StateID, Recovery.first, Recovery.second); } return std::move(Builder).build(); } } // namespace pseudo } // namespace clang