//===--- CXX.cpp - Define public interfaces for C++ grammar ---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "clang-pseudo/cxx/CXX.h" #include "clang-pseudo/Forest.h" #include "clang-pseudo/Language.h" #include "clang-pseudo/grammar/Grammar.h" #include "clang-pseudo/grammar/LRTable.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/TokenKinds.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Debug.h" #include #define DEBUG_TYPE "CXX.cpp" namespace clang { namespace pseudo { namespace cxx { namespace { static const char *CXXBNF = #include "CXXBNF.inc" ; // User-defined string literals look like `""suffix`. bool isStringUserDefined(const Token &Tok) { return !Tok.text().ends_with("\""); } bool isCharUserDefined(const Token &Tok) { return !Tok.text().ends_with("'"); } // Combinable flags describing numbers. // Clang has just one numeric_token kind, the grammar has 4. enum NumericKind { Integer = 0, Floating = 1 << 0, UserDefined = 1 << 1, }; // Determine the kind of numeric_constant we have. // We can assume it's something valid, as it has been lexed. // FIXME: is this expensive enough that we should set flags on the token // and reuse them rather than computing it for each guard? unsigned numKind(const Token &Tok) { assert(Tok.Kind == tok::numeric_constant); llvm::StringRef Text = Tok.text(); if (Text.size() <= 1) return Integer; bool Hex = Text.size() > 2 && Text[0] == '0' && (Text[1] == 'x' || Text[1] == 'X'); uint8_t K = Integer; for (char C : Text) { switch (C) { case '.': K |= Floating; break; case 'e': case 'E': if (!Hex) K |= Floating; break; case 'p': case 'P': if (Hex) K |= Floating; break; case '_': K |= UserDefined; break; default: break; } } // We would be done here, but there are stdlib UDLs that lack _. // We must distinguish these from the builtin suffixes. unsigned LastLetter = Text.size(); while (LastLetter > 0 && isLetter(Text[LastLetter - 1])) --LastLetter; if (LastLetter == Text.size()) // Common case return NumericKind(K); // Trailing d/e/f are not part of the suffix in hex numbers. while (Hex && LastLetter < Text.size() && isHexDigit(Text[LastLetter])) ++LastLetter; return llvm::StringSwitch(Text.substr(LastLetter)) // std::chrono .Cases("h", "min", "s", "ms", "us", "ns", "d", "y", K | UserDefined) // complex .Cases("il", "i", "if", K | UserDefined) .Default(K); } // RHS is expected to contain a single terminal. // Returns the corresponding token. const Token &onlyToken(tok::TokenKind Kind, const ArrayRef RHS, const TokenStream &Tokens) { assert(RHS.size() == 1 && RHS.front()->symbol() == tokenSymbol(Kind)); return Tokens.tokens()[RHS.front()->startTokenIndex()]; } // RHS is expected to contain a single symbol. // Returns the corresponding ForestNode. const ForestNode &onlySymbol(SymbolID Kind, const ArrayRef RHS, const TokenStream &Tokens) { assert(RHS.size() == 1 && RHS.front()->symbol() == Kind); return *RHS.front(); } bool isFunctionDeclarator(const ForestNode *Declarator) { assert(Declarator->symbol() == cxx::Symbol::declarator); bool IsFunction = false; while (true) { // not well-formed code, return the best guess. if (Declarator->kind() != ForestNode::Sequence) return IsFunction; switch (Declarator->rule()) { case rule::noptr_declarator::declarator_id: // reached the bottom return IsFunction; // *X is a nonfunction (unless X is a function). case rule::ptr_declarator::ptr_operator__ptr_declarator: Declarator = Declarator->elements()[1]; IsFunction = false; continue; // X() is a function (unless X is a pointer or similar). case rule::declarator:: noptr_declarator__parameters_and_qualifiers__trailing_return_type: case rule::noptr_declarator::noptr_declarator__parameters_and_qualifiers: Declarator = Declarator->elements()[0]; IsFunction = true; continue; // X[] is an array (unless X is a pointer or function). case rule::noptr_declarator:: noptr_declarator__L_SQUARE__constant_expression__R_SQUARE: case rule::noptr_declarator::noptr_declarator__L_SQUARE__R_SQUARE: Declarator = Declarator->elements()[0]; IsFunction = false; continue; // (X) is whatever X is. case rule::noptr_declarator::L_PAREN__ptr_declarator__R_PAREN: Declarator = Declarator->elements()[1]; continue; case rule::ptr_declarator::noptr_declarator: case rule::declarator::ptr_declarator: Declarator = Declarator->elements()[0]; continue; default: assert(false && "unhandled declarator for IsFunction"); return IsFunction; } } llvm_unreachable("unreachable"); } bool guardNextTokenNotElse(const GuardParams &P) { return symbolToToken(P.Lookahead) != tok::kw_else; } bool specifiesStructuredBinding(const GuardParams &P) { const auto DSS = P.RHS[0]; assert(DSS->symbol() == Symbol::decl_specifier_seq); auto Length = P.RHS[1]->startTokenIndex() - DSS->startTokenIndex(); for (const auto &T : P.Tokens.tokens().slice(DSS->startTokenIndex(), Length)) { switch (T.Kind) { case clang::tok::kw_static: case clang::tok::kw_thread_local: case clang::tok::kw_auto: case clang::tok::kw_const: case clang::tok::kw_volatile: break; default: return false; } } return true; } // Whether this e.g. decl-specifier contains an "exclusive" type such as a class // name, and thus can't combine with a second exclusive type. // // Returns false for // - non-types // - "unsigned" etc that may suffice as types but may modify others // - cases of uncertainty (e.g. due to ambiguity) bool hasExclusiveType(const ForestNode *N) { // FIXME: every time we apply this check, we walk the whole subtree. // Add per-node caching instead. while (true) { assert(N->symbol() == Symbol::decl_specifier_seq || N->symbol() == Symbol::type_specifier_seq || N->symbol() == Symbol::defining_type_specifier_seq || N->symbol() == Symbol::decl_specifier || N->symbol() == Symbol::type_specifier || N->symbol() == Symbol::defining_type_specifier || N->symbol() == Symbol::simple_type_specifier); if (N->kind() == ForestNode::Opaque) return false; // conservative if (N->kind() == ForestNode::Ambiguous) return llvm::all_of(N->alternatives(), hasExclusiveType); // conservative // All supported symbols are nonterminals. assert(N->kind() == ForestNode::Sequence); switch (N->rule()) { // seq := element seq: check element then continue into seq case rule::decl_specifier_seq::decl_specifier__decl_specifier_seq: case rule::defining_type_specifier_seq::defining_type_specifier__defining_type_specifier_seq: case rule::type_specifier_seq::type_specifier__type_specifier_seq: if (hasExclusiveType(N->children()[0])) return true; N = N->children()[1]; continue; // seq := element: continue into element case rule::decl_specifier_seq::decl_specifier: case rule::type_specifier_seq::type_specifier: case rule::defining_type_specifier_seq::defining_type_specifier: N = N->children()[0]; continue; // defining-type-specifier case rule::defining_type_specifier::type_specifier: N = N->children()[0]; continue; case rule::defining_type_specifier::class_specifier: case rule::defining_type_specifier::enum_specifier: return true; // decl-specifier case rule::decl_specifier::defining_type_specifier: N = N->children()[0]; continue; case rule::decl_specifier::CONSTEVAL: case rule::decl_specifier::CONSTEXPR: case rule::decl_specifier::CONSTINIT: case rule::decl_specifier::INLINE: case rule::decl_specifier::FRIEND: case rule::decl_specifier::storage_class_specifier: case rule::decl_specifier::TYPEDEF: case rule::decl_specifier::function_specifier: return false; // type-specifier case rule::type_specifier::elaborated_type_specifier: case rule::type_specifier::typename_specifier: return true; case rule::type_specifier::simple_type_specifier: N = N->children()[0]; continue; case rule::type_specifier::cv_qualifier: return false; // simple-type-specifier case rule::simple_type_specifier::type_name: case rule::simple_type_specifier::template_name: case rule::simple_type_specifier::builtin_type: case rule::simple_type_specifier::nested_name_specifier__TEMPLATE__simple_template_id: case rule::simple_type_specifier::nested_name_specifier__template_name: case rule::simple_type_specifier::nested_name_specifier__type_name: case rule::simple_type_specifier::decltype_specifier: case rule::simple_type_specifier::placeholder_type_specifier: return true; case rule::simple_type_specifier::LONG: case rule::simple_type_specifier::SHORT: case rule::simple_type_specifier::SIGNED: case rule::simple_type_specifier::UNSIGNED: return false; default: LLVM_DEBUG(llvm::errs() << "Unhandled rule " << N->rule() << "\n"); llvm_unreachable("hasExclusiveType be exhaustive!"); } } } llvm::DenseMap buildGuards() { #define GUARD(cond) \ { \ [](const GuardParams &P) { return cond; } \ } #define TOKEN_GUARD(kind, cond) \ [](const GuardParams& P) { \ const Token &Tok = onlyToken(tok::kind, P.RHS, P.Tokens); \ return cond; \ } #define SYMBOL_GUARD(kind, cond) \ [](const GuardParams& P) { \ const ForestNode &N = onlySymbol(Symbol::kind, P.RHS, P.Tokens); \ return cond; \ } return { {rule::function_declarator::declarator, SYMBOL_GUARD(declarator, isFunctionDeclarator(&N))}, {rule::non_function_declarator::declarator, SYMBOL_GUARD(declarator, !isFunctionDeclarator(&N))}, // A {decl,type,defining-type}-specifier-sequence cannot have multiple // "exclusive" types (like class names): a value has only one type. {rule::defining_type_specifier_seq:: defining_type_specifier__defining_type_specifier_seq, GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, {rule::type_specifier_seq::type_specifier__type_specifier_seq, GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, {rule::decl_specifier_seq::decl_specifier__decl_specifier_seq, GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, {rule::contextual_override::IDENTIFIER, TOKEN_GUARD(identifier, Tok.text() == "override")}, {rule::contextual_final::IDENTIFIER, TOKEN_GUARD(identifier, Tok.text() == "final")}, {rule::import_keyword::IDENTIFIER, TOKEN_GUARD(identifier, Tok.text() == "import")}, {rule::export_keyword::IDENTIFIER, TOKEN_GUARD(identifier, Tok.text() == "export")}, {rule::module_keyword::IDENTIFIER, TOKEN_GUARD(identifier, Tok.text() == "module")}, {rule::contextual_zero::NUMERIC_CONSTANT, TOKEN_GUARD(numeric_constant, Tok.text() == "0")}, {rule::selection_statement::IF__L_PAREN__condition__R_PAREN__statement, guardNextTokenNotElse}, {rule::selection_statement:: IF__L_PAREN__init_statement__condition__R_PAREN__statement, guardNextTokenNotElse}, {rule::selection_statement:: IF__CONSTEXPR__L_PAREN__condition__R_PAREN__statement, guardNextTokenNotElse}, {rule::selection_statement:: IF__CONSTEXPR__L_PAREN__init_statement__condition__R_PAREN__statement, guardNextTokenNotElse}, // Implement C++ [basic.lookup.qual.general]: // If a name, template-id, or decltype-specifier is followed by a // ​::​, it shall designate a namespace, class, enumeration, or // dependent type, and the ​::​ is never interpreted as a complete // nested-name-specifier. {rule::nested_name_specifier::COLONCOLON, TOKEN_GUARD(coloncolon, Tok.prev().Kind != tok::identifier)}, // Implement C++ [dcl.pre#6]: // A simple-declaration with an identifier-list is called a structured // binding declaration ([dcl.struct.bind]). If the decl-specifier-seq // contains any decl-specifier other than static, thread_­local, auto, // or cv-qualifiers, the program is ill-formed. {rule::simple_declaration:: decl_specifier_seq__ref_qualifier__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI, specifiesStructuredBinding}, {rule::simple_declaration:: decl_specifier_seq__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI, specifiesStructuredBinding}, // The grammar distinguishes (only) user-defined vs plain string literals, // where the clang lexer distinguishes (only) encoding types. {rule::user_defined_string_literal_chunk::STRING_LITERAL, TOKEN_GUARD(string_literal, isStringUserDefined(Tok))}, {rule::user_defined_string_literal_chunk::UTF8_STRING_LITERAL, TOKEN_GUARD(utf8_string_literal, isStringUserDefined(Tok))}, {rule::user_defined_string_literal_chunk::UTF16_STRING_LITERAL, TOKEN_GUARD(utf16_string_literal, isStringUserDefined(Tok))}, {rule::user_defined_string_literal_chunk::UTF32_STRING_LITERAL, TOKEN_GUARD(utf32_string_literal, isStringUserDefined(Tok))}, {rule::user_defined_string_literal_chunk::WIDE_STRING_LITERAL, TOKEN_GUARD(wide_string_literal, isStringUserDefined(Tok))}, {rule::string_literal_chunk::STRING_LITERAL, TOKEN_GUARD(string_literal, !isStringUserDefined(Tok))}, {rule::string_literal_chunk::UTF8_STRING_LITERAL, TOKEN_GUARD(utf8_string_literal, !isStringUserDefined(Tok))}, {rule::string_literal_chunk::UTF16_STRING_LITERAL, TOKEN_GUARD(utf16_string_literal, !isStringUserDefined(Tok))}, {rule::string_literal_chunk::UTF32_STRING_LITERAL, TOKEN_GUARD(utf32_string_literal, !isStringUserDefined(Tok))}, {rule::string_literal_chunk::WIDE_STRING_LITERAL, TOKEN_GUARD(wide_string_literal, !isStringUserDefined(Tok))}, // And the same for chars. {rule::user_defined_character_literal::CHAR_CONSTANT, TOKEN_GUARD(char_constant, isCharUserDefined(Tok))}, {rule::user_defined_character_literal::UTF8_CHAR_CONSTANT, TOKEN_GUARD(utf8_char_constant, isCharUserDefined(Tok))}, {rule::user_defined_character_literal::UTF16_CHAR_CONSTANT, TOKEN_GUARD(utf16_char_constant, isCharUserDefined(Tok))}, {rule::user_defined_character_literal::UTF32_CHAR_CONSTANT, TOKEN_GUARD(utf32_char_constant, isCharUserDefined(Tok))}, {rule::user_defined_character_literal::WIDE_CHAR_CONSTANT, TOKEN_GUARD(wide_char_constant, isCharUserDefined(Tok))}, {rule::character_literal::CHAR_CONSTANT, TOKEN_GUARD(char_constant, !isCharUserDefined(Tok))}, {rule::character_literal::UTF8_CHAR_CONSTANT, TOKEN_GUARD(utf8_char_constant, !isCharUserDefined(Tok))}, {rule::character_literal::UTF16_CHAR_CONSTANT, TOKEN_GUARD(utf16_char_constant, !isCharUserDefined(Tok))}, {rule::character_literal::UTF32_CHAR_CONSTANT, TOKEN_GUARD(utf32_char_constant, !isCharUserDefined(Tok))}, {rule::character_literal::WIDE_CHAR_CONSTANT, TOKEN_GUARD(wide_char_constant, !isCharUserDefined(Tok))}, // clang just has one NUMERIC_CONSTANT token for {ud,plain}x{float,int} {rule::user_defined_integer_literal::NUMERIC_CONSTANT, TOKEN_GUARD(numeric_constant, numKind(Tok) == (Integer | UserDefined))}, {rule::user_defined_floating_point_literal::NUMERIC_CONSTANT, TOKEN_GUARD(numeric_constant, numKind(Tok) == (Floating | UserDefined))}, {rule::integer_literal::NUMERIC_CONSTANT, TOKEN_GUARD(numeric_constant, numKind(Tok) == Integer)}, {rule::floating_point_literal::NUMERIC_CONSTANT, TOKEN_GUARD(numeric_constant, numKind(Tok) == Floating)}, }; #undef TOKEN_GUARD #undef SYMBOL_GUARD } Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) { assert(Begin > 0); const Token &Left = Tokens.tokens()[Begin - 1]; assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren || Left.Kind == tok::l_square); if (const Token *Right = Left.pair()) { assert(Tokens.index(*Right) > Begin - 1); return Tokens.index(*Right); } return Token::Invalid; } llvm::DenseMap buildRecoveryStrategies() { return { {Extension::Brackets, recoverBrackets}, }; } } // namespace const Language &getLanguage() { static const auto &CXXLanguage = []() -> const Language & { std::vector Diags; auto G = Grammar::parseBNF(CXXBNF, Diags); assert(Diags.empty()); LRTable Table = LRTable::buildSLR(G); const Language *PL = new Language{ std::move(G), std::move(Table), buildGuards(), buildRecoveryStrategies(), }; return *PL; }(); return CXXLanguage; } } // namespace cxx } // namespace pseudo } // namespace clang