//===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "clang-pseudo/Bracket.h" #include "clang-pseudo/DirectiveTree.h" #include "clang-pseudo/Disambiguate.h" #include "clang-pseudo/Forest.h" #include "clang-pseudo/GLR.h" #include "clang-pseudo/Language.h" #include "clang-pseudo/Token.h" #include "clang-pseudo/cli/CLI.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRGraph.h" #include "clang-pseudo/grammar/LRTable.h" #include "clang/Basic/LangOptions.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Signals.h" #include using clang::pseudo::ForestNode; using clang::pseudo::Token; using clang::pseudo::TokenStream; using llvm::cl::desc; using llvm::cl::init; using llvm::cl::opt; static opt PrintGrammar("print-grammar", desc("Print the grammar")); static opt PrintGraph("print-graph", desc("Print the LR graph for the grammar")); static opt PrintTable("print-table", desc("Print the LR table for the grammar")); static opt Source("source", desc("Source file")); static opt PrintSource("print-source", desc("Print token stream")); static opt PrintTokens("print-tokens", desc("Print detailed token info")); static opt PrintDirectiveTree("print-directive-tree", desc("Print directive structure of source code")); static opt StripDirectives("strip-directives", desc("Strip directives and select conditional sections")); static opt Disambiguate("disambiguate", desc("Choose best tree from parse forest")); static opt PrintStatistics("print-statistics", desc("Print GLR parser statistics")); static opt PrintForest("print-forest", desc("Print parse forest")); static opt ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"), init(true)); static opt HTMLForest("html-forest", desc("output file for HTML forest")); static opt StartSymbol("start-symbol", desc("Specify the start symbol to parse"), init("translation-unit")); static std::string readOrDie(llvm::StringRef Path) { llvm::ErrorOr> Text = llvm::MemoryBuffer::getFile(Path); if (std::error_code EC = Text.getError()) { llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message() << "\n"; ::exit(1); } return Text.get()->getBuffer().str(); } namespace clang { namespace pseudo { // Defined in HTMLForest.cpp void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &, const ForestNode &Root, const Disambiguation &, const TokenStream &); namespace { struct NodeStats { unsigned Total = 0; std::vector> BySymbol; NodeStats(const ForestNode &Root, llvm::function_ref Filter) { llvm::DenseMap Map; for (const ForestNode &N : Root.descendants()) if (Filter(N)) { ++Total; ++Map[N.symbol()]; } BySymbol = {Map.begin(), Map.end()}; // Sort by count descending, then symbol ascending. llvm::sort(BySymbol, [](const auto &L, const auto &R) { return std::tie(R.second, L.first) < std::tie(L.second, R.first); }); } }; } // namespace } // namespace pseudo } // namespace clang int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); clang::LangOptions LangOpts = clang::pseudo::genericLangOpts(); std::string SourceText; std::optional RawStream; std::optional PreprocessedStream; std::optional ParseableStream; if (Source.getNumOccurrences()) { SourceText = readOrDie(Source); RawStream = clang::pseudo::lex(SourceText, LangOpts); TokenStream *Stream = &*RawStream; auto DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream); clang::pseudo::chooseConditionalBranches(DirectiveStructure, *RawStream); std::optional Preprocessed; if (StripDirectives) { Preprocessed = DirectiveStructure.stripDirectives(*Stream); Stream = &*Preprocessed; } if (PrintSource) Stream->print(llvm::outs()); if (PrintTokens) llvm::outs() << *Stream; if (PrintDirectiveTree) llvm::outs() << DirectiveStructure; ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts)); pairBrackets(*ParseableStream); } const auto &Lang = clang::pseudo::getLanguageFromFlags(); if (PrintGrammar) llvm::outs() << Lang.G.dump(); if (PrintGraph) llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests( Lang.G); if (PrintTable) llvm::outs() << Lang.Table.dumpForTests(Lang.G); if (PrintStatistics) llvm::outs() << Lang.Table.dumpStatistics(); if (ParseableStream) { clang::pseudo::ForestArena Arena; clang::pseudo::GSS GSS; std::optional StartSymID = Lang.G.findNonterminal(StartSymbol); if (!StartSymID) { llvm::errs() << llvm::formatv( "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol); return 2; } auto &Root = glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS}, *StartSymID, Lang); // If we're disambiguating, we'll print at the end instead. if (PrintForest && !Disambiguate) llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev); clang::pseudo::Disambiguation Disambig; if (Disambiguate) Disambig = clang::pseudo::disambiguate(&Root, {}); if (HTMLForest.getNumOccurrences()) { std::error_code EC; llvm::raw_fd_ostream HTMLOut(HTMLForest, EC); if (EC) { llvm::errs() << "Couldn't write " << HTMLForest << ": " << EC.message() << "\n"; return 2; } clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, Disambig, *ParseableStream); } if (PrintStatistics) { llvm::outs() << "Forest bytes: " << Arena.bytes() << " nodes: " << Arena.nodeCount() << "\n"; llvm::outs() << "GSS bytes: " << GSS.bytes() << " nodes: " << GSS.nodesCreated() << "\n"; for (auto &P : {std::make_pair("Ambiguous", ForestNode::Ambiguous), std::make_pair("Opaque", ForestNode::Opaque)}) { clang::pseudo::NodeStats Stats( Root, [&](const auto &N) { return N.kind() == P.second; }); llvm::outs() << "\n" << Stats.Total << " " << P.first << " nodes:\n"; for (const auto &S : Stats.BySymbol) llvm::outs() << llvm::formatv(" {0,3} {1}\n", S.second, Lang.G.symbolName(S.first)); } // Metrics for how imprecise parsing was. // These are rough but aim to be: // - linear: if we eliminate half the errors the metric should halve // - length-independent unsigned UnparsedTokens = 0; // Tokens covered by Opaque. (not unique) unsigned Misparses = 0; // Sum of alternatives-1 llvm::DenseSet Visited; auto DFS = [&](const ForestNode &N, Token::Index End, auto &DFS) -> void { if (N.kind() == ForestNode::Opaque) { UnparsedTokens += End - N.startTokenIndex(); } else if (N.kind() == ForestNode::Ambiguous) { Misparses += N.alternatives().size() - 1; for (const auto *C : N.alternatives()) if (Visited.insert(C).second) DFS(*C, End, DFS); } else if (N.kind() == ForestNode::Sequence) { for (unsigned I = 0, E = N.children().size(); I < E; ++I) if (Visited.insert(N.children()[I]).second) DFS(*N.children()[I], I + 1 == N.children().size() ? End : N.children()[I + 1]->startTokenIndex(), DFS); } }; unsigned Len = ParseableStream->tokens().size(); DFS(Root, Len, DFS); llvm::outs() << "\n"; llvm::outs() << llvm::formatv("Ambiguity: {0} misparses/token\n", double(Misparses) / Len); llvm::outs() << llvm::formatv("Unparsed: {0}%\n", 100.0 * UnparsedTokens / Len); } if (Disambiguate && PrintForest) { ForestNode *DisambigRoot = &Root; removeAmbiguities(DisambigRoot, Disambig); llvm::outs() << "Disambiguated tree:\n"; llvm::outs() << DisambigRoot->dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev); } } return 0; }