From fee5085497c9ac623ba0d89b37bf5980c21a9773 Mon Sep 17 00:00:00 2001 From: Sam Vervaeck Date: Fri, 19 Aug 2022 19:52:57 +0200 Subject: [PATCH] Create a working scanner/parser for a subset of the language --- .gitignore | 2 + .vscode/launch.json | 17 + .vscode/settings.json | 32 ++ .vscode/tasks.json | 16 + CMakeLists.txt | 41 ++ include/bolt/ByteString.hpp | 15 + include/bolt/CST.hpp | 734 +++++++++++++++++++++++++++++++++++ include/bolt/Diagnostics.hpp | 39 ++ include/bolt/Integer.hpp | 10 + include/bolt/Parser.hpp | 45 +++ include/bolt/Scanner.hpp | 140 +++++++ include/bolt/String.hpp | 13 + include/bolt/Text.hpp | 37 ++ src/CST.cc | 317 +++++++++++++++ src/Diagnostics.cc | 9 + src/Parser.cc | 225 +++++++++++ src/Scanner.cc | 326 ++++++++++++++++ src/main.cc | 129 ++++++ 18 files changed, 2147 insertions(+) create mode 100644 .gitignore create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100644 CMakeLists.txt create mode 100644 include/bolt/ByteString.hpp create mode 100644 include/bolt/CST.hpp create mode 100644 include/bolt/Diagnostics.hpp create mode 100644 include/bolt/Integer.hpp create mode 100644 include/bolt/Parser.hpp create mode 100644 include/bolt/Scanner.hpp create mode 100644 include/bolt/String.hpp create mode 100644 include/bolt/Text.hpp create mode 100644 src/CST.cc create mode 100644 src/Diagnostics.cc create mode 100644 src/Parser.cc create mode 100644 src/Scanner.cc create mode 100644 src/main.cc diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..c8ca99457 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/build/ +.cache/ diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 000000000..0e0386686 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,17 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug", + "program": "${workspaceFolder}/build/bolt", + "args": ["test.bolt"], + "cwd": "${workspaceFolder}", + "preLaunchTask": "CMake: build" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..5efd846a1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,32 @@ +{ + "files.associations": { + "*.tcc": "cpp", + "fstream": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "sstream": "cpp", + "streambuf": "cpp", + "typeinfo": "cpp", + "cstdlib": "cpp", + "array": "cpp", + "chrono": "cpp", + "cmath": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "functional": "cpp", + "string_view": "cpp", + "memory": "cpp", + "random": "cpp", + "initializer_list": "cpp", + "numeric": "cpp", + "ostream": "cpp", + "system_error": "cpp" + } +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 000000000..4edc48b2e --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,16 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cmake", + "label": "CMake: build", + "command": "build", + "targets": [ + "all" + ], + "group": "build", + "problemMatcher": [], + "detail": "CMake template build task" + } + ] +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..ab195d38f --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,41 @@ + +cmake_minimum_required(VERSION 3.10) + +project(Bolt CXX) + +set(CMAKE_CXX_STANDARD 17) + +add_subdirectory(deps/zen EXCLUDE_FROM_ALL) + +add_executable( + bolt + src/CST.cc + src/Diagnostics.cc + src/Scanner.cc + src/Parser.cc + src/main.cc +) 
+target_compile_options( + bolt + PUBLIC + -fstandalone-debug +) +target_include_directories( + bolt + PUBLIC + include +) +target_link_libraries( + bolt + PUBLIC + zen +) + +add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/include/bolt/CST.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/CST.cc" + COMMAND scripts/gennodes.py --name=CST ./bolt-cst-spec.txt -Iinclude/ --include-root=bolt --source-root=src/ --namespace=bolt + DEPENDS scripts/gennodes.py + MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/bolt-cst-spec.txt" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" +) + diff --git a/include/bolt/ByteString.hpp b/include/bolt/ByteString.hpp new file mode 100644 index 000000000..7d40d45c9 --- /dev/null +++ b/include/bolt/ByteString.hpp @@ -0,0 +1,15 @@ +#ifndef BOLT_STRING_HPP +#define BOLT_STRING_HPP + +#include +#include + +namespace bolt { + + using ByteString = std::string; + + using ByteStringView = std::string_view; + +} + +#endif // of #ifndef BOLT_STRING_HPP diff --git a/include/bolt/CST.hpp b/include/bolt/CST.hpp new file mode 100644 index 000000000..655bdafd4 --- /dev/null +++ b/include/bolt/CST.hpp @@ -0,0 +1,734 @@ +#ifndef BOLT_CST_HPP +#define BOLT_CST_HPP + +#include + +#include "bolt/Text.hpp" +#include "bolt/Integer.hpp" +#include "bolt/ByteString.hpp" + +namespace bolt { + + enum class NodeType { + Equals, + Colon, + Dot, + DotDot, + LParen, + RParen, + LBracket, + RBracket, + LBrace, + RBrace, + LetKeyword, + MutKeyword, + PubKeyword, + TypeKeyword, + ReturnKeyword, + ModKeyword, + StructKeyword, + Invalid, + EndOfFile, + BlockStart, + BlockEnd, + LineFoldEnd, + CustomOperator, + Identifier, + StringLiteral, + IntegerLiteral, + QualifiedName, + ReferenceTypeExpression, + BindPattern, + ReferenceExpression, + ConstantExpression, + CallExpression, + ExpressionStatement, + ReturnStatement, + TypeAssert, + Param, + LetBlockBody, + LetExprBody, + LetDeclaration, + StructDeclField, + StructDecl, + SourceFile, + }; + + + class Node { + + unsigned refcount = 
0; + + public: + + inline void ref() { + ++refcount; + } + + inline void unref() { + --refcount; + if (refcount == 0) { + delete this; + } + } + + const NodeType Type; + + inline Node(NodeType Type): + Type(Type) {} + + + virtual ~Node(); + + }; + + class Token : public Node { + + TextLoc StartLoc; + + public: + + Token(NodeType Type, TextLoc StartLoc): Node(Type), StartLoc(StartLoc) {} + + virtual std::string getText() const = 0; + + inline TextLoc getStartLoc() { + return StartLoc; + } + + inline TextLoc getEndLoc() { + TextLoc EndLoc; + EndLoc.advance(getText()); + return EndLoc; + } + + inline size_t getStartLine() { + return StartLoc.Line; + } + + inline size_t getStartColumn() { + return StartLoc.Column; + } + + inline size_t getEndLine() { + return getEndLoc().Line; + } + + inline size_t getEndColumn() { + return getEndLoc().Column; + } + + ~Token(); + + }; + + class Equals : public Token { + public: + + Equals(TextLoc StartLoc): Token(NodeType::Equals, StartLoc) {} + + std::string getText() const override; + + ~Equals(); + + }; + + class Colon : public Token { + public: + + Colon(TextLoc StartLoc): Token(NodeType::Colon, StartLoc) {} + + std::string getText() const override; + + ~Colon(); + + }; + + class Dot : public Token { + public: + + Dot(TextLoc StartLoc): Token(NodeType::Dot, StartLoc) {} + + std::string getText() const override; + + ~Dot(); + + }; + + class DotDot : public Token { + public: + + DotDot(TextLoc StartLoc): Token(NodeType::DotDot, StartLoc) {} + + std::string getText() const override; + + ~DotDot(); + + }; + + class LParen : public Token { + public: + + LParen(TextLoc StartLoc): Token(NodeType::LParen, StartLoc) {} + + std::string getText() const override; + + ~LParen(); + + }; + + class RParen : public Token { + public: + + RParen(TextLoc StartLoc): Token(NodeType::RParen, StartLoc) {} + + std::string getText() const override; + + ~RParen(); + + }; + + class LBracket : public Token { + public: + + LBracket(TextLoc StartLoc): 
Token(NodeType::LBracket, StartLoc) {} + + std::string getText() const override; + + ~LBracket(); + + }; + + class RBracket : public Token { + public: + + RBracket(TextLoc StartLoc): Token(NodeType::RBracket, StartLoc) {} + + std::string getText() const override; + + ~RBracket(); + + }; + + class LBrace : public Token { + public: + + LBrace(TextLoc StartLoc): Token(NodeType::LBrace, StartLoc) {} + + std::string getText() const override; + + ~LBrace(); + + }; + + class RBrace : public Token { + public: + + RBrace(TextLoc StartLoc): Token(NodeType::RBrace, StartLoc) {} + + std::string getText() const override; + + ~RBrace(); + + }; + + class LetKeyword : public Token { + public: + + LetKeyword(TextLoc StartLoc): Token(NodeType::LetKeyword, StartLoc) {} + + std::string getText() const override; + + ~LetKeyword(); + + }; + + class MutKeyword : public Token { + public: + + MutKeyword(TextLoc StartLoc): Token(NodeType::MutKeyword, StartLoc) {} + + std::string getText() const override; + + ~MutKeyword(); + + }; + + class PubKeyword : public Token { + public: + + PubKeyword(TextLoc StartLoc): Token(NodeType::PubKeyword, StartLoc) {} + + std::string getText() const override; + + ~PubKeyword(); + + }; + + class TypeKeyword : public Token { + public: + + TypeKeyword(TextLoc StartLoc): Token(NodeType::TypeKeyword, StartLoc) {} + + std::string getText() const override; + + ~TypeKeyword(); + + }; + + class ReturnKeyword : public Token { + public: + + ReturnKeyword(TextLoc StartLoc): Token(NodeType::ReturnKeyword, StartLoc) {} + + std::string getText() const override; + + ~ReturnKeyword(); + + }; + + class ModKeyword : public Token { + public: + + ModKeyword(TextLoc StartLoc): Token(NodeType::ModKeyword, StartLoc) {} + + std::string getText() const override; + + ~ModKeyword(); + + }; + + class StructKeyword : public Token { + public: + + StructKeyword(TextLoc StartLoc): Token(NodeType::StructKeyword, StartLoc) {} + + std::string getText() const override; + + ~StructKeyword(); + + 
}; + + class Invalid : public Token { + public: + + Invalid(TextLoc StartLoc): Token(NodeType::Invalid, StartLoc) {} + + std::string getText() const override; + + ~Invalid(); + + }; + + class EndOfFile : public Token { + public: + + EndOfFile(TextLoc StartLoc): Token(NodeType::EndOfFile, StartLoc) {} + + std::string getText() const override; + + ~EndOfFile(); + + }; + + class BlockStart : public Token { + public: + + BlockStart(TextLoc StartLoc): Token(NodeType::BlockStart, StartLoc) {} + + std::string getText() const override; + + ~BlockStart(); + + }; + + class BlockEnd : public Token { + public: + + BlockEnd(TextLoc StartLoc): Token(NodeType::BlockEnd, StartLoc) {} + + std::string getText() const override; + + ~BlockEnd(); + + }; + + class LineFoldEnd : public Token { + public: + + LineFoldEnd(TextLoc StartLoc): Token(NodeType::LineFoldEnd, StartLoc) {} + + std::string getText() const override; + + ~LineFoldEnd(); + + }; + + class CustomOperator : public Token { + public: + + ByteString Text; + + CustomOperator(ByteString Text, TextLoc StartLoc): Token(NodeType::CustomOperator, StartLoc), Text(Text) {} + + std::string getText() const override; + + ~CustomOperator(); + + }; + + class Identifier : public Token { + public: + + ByteString Text; + + Identifier(ByteString Text, TextLoc StartLoc): Token(NodeType::Identifier, StartLoc), Text(Text) {} + + std::string getText() const override; + + ~Identifier(); + + }; + + class StringLiteral : public Token { + public: + + ByteString Text; + + StringLiteral(ByteString Text, TextLoc StartLoc): Token(NodeType::StringLiteral, StartLoc), Text(Text) {} + + std::string getText() const override; + + ~StringLiteral(); + + }; + + class IntegerLiteral : public Token { + public: + + Integer Value; + + IntegerLiteral(Integer Value, TextLoc StartLoc): Token(NodeType::IntegerLiteral, StartLoc), Value(Value) {} + + std::string getText() const override; + + ~IntegerLiteral(); + + }; + + class QualifiedName : public Node { + public: + + 
std::vector ModulePath; + Identifier* Name; + + QualifiedName( + std::vector ModulePath, + Identifier* Name + ): Node(NodeType::QualifiedName), + ModulePath(ModulePath), + Name(Name) {} + + ~QualifiedName(); + + }; + + class SourceElement : public Node { + public: + + SourceElement(NodeType Type): Node(Type) {} + + ~SourceElement(); + + }; + + class LetBodyElement : public Node { + public: + + LetBodyElement(NodeType Type): Node(Type) {} + + ~LetBodyElement(); + + }; + + class TypeExpression : public Node { + public: + + TypeExpression(NodeType Type): Node(Type) {} + + ~TypeExpression(); + + }; + + class ReferenceTypeExpression : public TypeExpression { + public: + + QualifiedName* Name; + + ReferenceTypeExpression( + QualifiedName* Name + ): TypeExpression(NodeType::ReferenceTypeExpression), + Name(Name) {} + + ~ReferenceTypeExpression(); + + }; + + class Pattern : public Node { + public: + + Pattern(NodeType Type): Node(Type) {} + + ~Pattern(); + + }; + + class BindPattern : public Pattern { + public: + + Identifier* Name; + + BindPattern( + Identifier* Name + ): Pattern(NodeType::BindPattern), + Name(Name) {} + + ~BindPattern(); + + }; + + class Expression : public Node { + public: + + Expression(NodeType Type): Node(Type) {} + + ~Expression(); + + }; + + class ReferenceExpression : public Expression { + public: + + Identifier* Name; + + ReferenceExpression( + Identifier* Name + ): Expression(NodeType::ReferenceExpression), + Name(Name) {} + + ~ReferenceExpression(); + + }; + + class ConstantExpression : public Expression { + public: + + Token* Token; + + ConstantExpression( + class Token* Token + ): Expression(NodeType::ConstantExpression), + Token(Token) {} + + ~ConstantExpression(); + + }; + + class CallExpression : public Expression { + public: + + CallExpression(Expression* Function, std::vector Args): Expression(NodeType::CallExpression), Function(Function), Args(Args) {} + + Expression* Function; + std::vector Args; + + ~CallExpression(); + + }; + + class 
Statement : public LetBodyElement { + public: + + Statement(NodeType Type): LetBodyElement(Type) {} + + ~Statement(); + + }; + + class ExpressionStatement : public Statement, public SourceElement { + public: + + ExpressionStatement(Expression* Expression): Statement(NodeType::ExpressionStatement), SourceElement(NodeType::ExpressionStatement), Expression(Expression) {} + + Expression* Expression; + + ~ExpressionStatement(); + + }; + + class ReturnStatement : public Statement { + public: + + ReturnStatement(ReturnKeyword* ReturnKeyword, Expression* Expression): Statement(NodeType::ReturnStatement), ReturnKeyword(ReturnKeyword), Expression(Expression) {} + + ReturnKeyword* ReturnKeyword; + Expression* Expression; + + ~ReturnStatement(); + + }; + + class TypeAssert : public Node { + + public: + + TypeAssert(Colon* Colon, TypeExpression* TypeExpression): Node(NodeType::TypeAssert), Colon(Colon), TypeExpression(TypeExpression) {} + + Colon* Colon; + TypeExpression* TypeExpression; + + ~TypeAssert(); + + }; + + class Param : public Node { + public: + + Param(Pattern* Pattern, TypeAssert* TypeAssert): Node(NodeType::Param), Pattern(Pattern), TypeAssert(TypeAssert) {} + + Pattern* Pattern; + TypeAssert* TypeAssert; + + ~Param(); + + }; + + class LetBody : public Node { + public: + + LetBody(NodeType Type): Node(Type) {} + + ~LetBody(); + + }; + + class LetBlockBody : public LetBody { + public: + + LetBlockBody(BlockStart* BlockStart, std::vector Elements): LetBody(NodeType::LetBlockBody), BlockStart(BlockStart), Elements(Elements) {} + + BlockStart* BlockStart; + std::vector Elements; + + ~LetBlockBody(); + + }; + + class LetExprBody : public LetBody { + public: + + Equals* Equals; + Expression* Expression; + + LetExprBody( + class Equals* Equals, + class Expression* Expression + ): LetBody(NodeType::LetExprBody), + Equals(Equals), + Expression(Expression) {} + + ~LetExprBody(); + + }; + + class LetDeclaration : public SourceElement, public LetBodyElement { + public: + + 
PubKeyword* PubKeyword; + LetKeyword* LetKeywod; + MutKeyword* MutKeyword; + Pattern* Pattern; + std::vector Params; + TypeAssert* TypeAssert; + LetBody* Body; + + LetDeclaration( + class PubKeyword* PubKeyword, + class LetKeyword* LetKeywod, + class MutKeyword* MutKeyword, + class Pattern* Pattern, + std::vector Params, + class TypeAssert* TypeAssert, + LetBody* Body + ): SourceElement(NodeType::LetDeclaration), + LetBodyElement(NodeType::LetDeclaration), + PubKeyword(PubKeyword), + LetKeywod(LetKeywod), + MutKeyword(MutKeyword), + Pattern(Pattern), + Params(Params), + TypeAssert(TypeAssert), + Body(Body) {} + + ~LetDeclaration(); + + }; + + class StructDeclField : public Node { + public: + + StructDeclField( + Identifier* Name, + Colon* Colon, + TypeExpression* TypeExpression + ): Node(NodeType::StructDeclField), + Name(Name), + Colon(Colon), + TypeExpression(TypeExpression) {} + + Identifier* Name; + Colon* Colon; + TypeExpression* TypeExpression; + + ~StructDeclField(); + + }; + + class StructDecl : public SourceElement { + public: + + StructDecl( + StructKeyword* StructKeyword, + Identifier* Name, + Dot* Dot, + std::vector Fields + ): SourceElement(NodeType::StructDecl), + StructKeyword(StructKeyword), + Name(Name), + Dot(Dot), + Fields(Fields) {} + + StructKeyword* StructKeyword; + Identifier* Name; + Dot* Dot; + std::vector Fields; + + ~StructDecl(); + + }; + + class SourceFile : public Node { + + public: + + SourceFile(std::vector Elements): Node(NodeType::SourceFile), Elements(Elements) {} + + std::vector Elements; + + ~SourceFile(); + + }; + +} + +#endif diff --git a/include/bolt/Diagnostics.hpp b/include/bolt/Diagnostics.hpp new file mode 100644 index 000000000..447403ca0 --- /dev/null +++ b/include/bolt/Diagnostics.hpp @@ -0,0 +1,39 @@ + +#pragma once + +#include +#include + +#include "bolt/String.hpp" +#include "bolt/CST.hpp" + +namespace bolt { + + class Diagnostic : std::runtime_error { + public: + Diagnostic(); + }; + + class 
UnexpectedTokenDiagnostic : public Diagnostic { + public: + + Token* Actual; + std::vector Expected; + + inline UnexpectedTokenDiagnostic(Token* Actual, std::vector Expected): + Actual(Actual), Expected(Expected) {} + + }; + + class UnexpectedStringDiagnostic : public Diagnostic { + public: + + TextLoc Location; + String Actual; + + inline UnexpectedStringDiagnostic(TextLoc Location, String Actual): + Location(Location), Actual(Actual) {} + + }; + +} diff --git a/include/bolt/Integer.hpp b/include/bolt/Integer.hpp new file mode 100644 index 000000000..729e80459 --- /dev/null +++ b/include/bolt/Integer.hpp @@ -0,0 +1,10 @@ +#ifndef BOLT_INTEGER_HPP +#define BOLT_INTEGER_HPP + +namespace bolt { + + using Integer = long long; + +} + +#endif // of #ifndef BOLT_INTEGER_HPP diff --git a/include/bolt/Parser.hpp b/include/bolt/Parser.hpp new file mode 100644 index 000000000..49a14b49b --- /dev/null +++ b/include/bolt/Parser.hpp @@ -0,0 +1,45 @@ + +#pragma once + +#include "bolt/CST.hpp" + +namespace bolt { + + class Scanner; + + class Parser { + + Stream& Tokens; + + Token* peekFirstTokenAfterModifiers(); + + public: + + Parser(Stream& S); + + QualifiedName* parseQualifiedName(); + + TypeExpression* parseTypeExpression(); + + Pattern* parsePattern(); + + Param* parseParam(); + + ReferenceExpression* parseReferenceExpression(); + + Expression* parseExpression(); + + ExpressionStatement* parseExpressionStatement(); + + LetBodyElement* parseLetBodyElement(); + + LetDeclaration* parseLetDeclaration(); + + SourceElement* parseSourceElement(); + + SourceFile* parseSourceFile(); + + }; + +} + diff --git a/include/bolt/Scanner.hpp b/include/bolt/Scanner.hpp new file mode 100644 index 000000000..e2e6a6d40 --- /dev/null +++ b/include/bolt/Scanner.hpp @@ -0,0 +1,140 @@ + +#pragma once + +#include +#include +#include +#include + +#include "bolt/Text.hpp" +#include "bolt/String.hpp" + +namespace bolt { + + class Token; + + template + class Stream { + public: + + virtual T get() = 0; + 
virtual T peek(std::size_t Offset = 0) = 0; + + virtual ~Stream() {} + + }; + + template + class VectorStream : public Stream { + public: + + using value_type = typename ContainerT::value_type; + + ContainerT& Data; + value_type Sentry; + std::size_t Offset; + + VectorStream(ContainerT& Data, value_type Sentry, std::size_t Offset = 0): + Data(Data), Sentry(Sentry), Offset(Offset) {} + + value_type get() override { + return Offset < Data.size() ? Data[Offset++] : Sentry; + } + + value_type peek(std::size_t Offset2) override { + auto I = Offset + Offset2; + return I < Data.size() ? Data[I] : Sentry; + } + + }; + + template + class BufferedStream : public Stream { + + std::deque Buffer; + + protected: + + virtual T read() = 0; + + public: + + using value_type = T; + + value_type get() override { + if (Buffer.empty()) { + return read(); + } else { + auto Keep = Buffer.front(); + Buffer.pop_front(); + return Keep; + } + } + + value_type peek(std::size_t Offset = 0) override { + while (Buffer.size() <= Offset) { + Buffer.push_back(read()); + } + return Buffer[Offset]; + } + + }; + + class Scanner : public BufferedStream { + + Stream& Chars; + + TextLoc CurrLoc; + + inline TextLoc getCurrentLoc() const { + return CurrLoc; + } + + inline Char getChar() { + auto Chr = Chars.get(); + if (Chr == '\n') { + CurrLoc.Line += 1; + CurrLoc.Column = 1; + } else { + CurrLoc.Column += 1; + } + return Chr; + } + + inline Char peekChar(std::size_t Offset = 0) { + return Chars.peek(Offset); + } + + protected: + + Token* read() override; + + public: + + Scanner(Stream& Chars); + + }; + + enum class FrameType { + Block, + LineFold, + }; + + class Punctuator : public BufferedStream { + + Stream& Tokens; + + std::stack Frames; + std::stack Locations; + + protected: + + virtual Token* read() override; + + public: + + Punctuator(Stream& Tokens); + + }; + +} diff --git a/include/bolt/String.hpp b/include/bolt/String.hpp new file mode 100644 index 000000000..7164d712e --- /dev/null +++ 
b/include/bolt/String.hpp @@ -0,0 +1,13 @@ + +#pragma once + +#include + +namespace bolt { + + using Char = char32_t; + + using String = std::basic_string; + +} + diff --git a/include/bolt/Text.hpp b/include/bolt/Text.hpp new file mode 100644 index 000000000..d5707495d --- /dev/null +++ b/include/bolt/Text.hpp @@ -0,0 +1,37 @@ +#ifndef BOLT_TEXT_HPP +#define BOLT_TEXT_HPP + +#include + +#include + +namespace bolt { + + class TextLoc { + public: + + size_t Line = 1; + size_t Column = 1; + + void advance(const std::string& Text) { + for (auto Chr: Text) { + if (Chr == '\n') { + Line++; + Column = 1; + } else { + Column++; + } + } + } + + }; + + class TextRange { + public: + TextLoc Start; + TextLoc End; + }; + +} + +#endif // of #ifndef BOLT_TEXT_HPP diff --git a/src/CST.cc b/src/CST.cc new file mode 100644 index 000000000..b2e05ce72 --- /dev/null +++ b/src/CST.cc @@ -0,0 +1,317 @@ + +#include "bolt/CST.hpp" + +namespace bolt { + + Node::~Node() { } + + Token::~Token() { + } + + Equals::~Equals() { + } + + Colon::~Colon() { + } + + Dot::~Dot() { + } + + DotDot::~DotDot() { + } + + LParen::~LParen() { + } + + RParen::~RParen() { + } + + LBracket::~LBracket() { + } + + RBracket::~RBracket() { + } + + LBrace::~LBrace() { + } + + RBrace::~RBrace() { + } + + LetKeyword::~LetKeyword() { + } + + MutKeyword::~MutKeyword() { + } + + PubKeyword::~PubKeyword() { + } + + TypeKeyword::~TypeKeyword() { + } + + ReturnKeyword::~ReturnKeyword() { + } + + ModKeyword::~ModKeyword() { + } + + StructKeyword::~StructKeyword() { + } + + Invalid::~Invalid() { + } + + EndOfFile::~EndOfFile() { + } + + BlockStart::~BlockStart() { + } + + BlockEnd::~BlockEnd() { + } + + LineFoldEnd::~LineFoldEnd() { + } + + CustomOperator::~CustomOperator() { + } + + Identifier::~Identifier() { + } + + StringLiteral::~StringLiteral() { + } + + IntegerLiteral::~IntegerLiteral() { + } + + QualifiedName::~QualifiedName() { + for (auto& Element: ModulePath){ + Element->unref(); + } + Name->unref(); + } + + 
SourceElement::~SourceElement() { + } + + LetBodyElement::~LetBodyElement() { + } + + TypeExpression::~TypeExpression() { + } + + ReferenceTypeExpression::~ReferenceTypeExpression() { + Name->unref(); + } + + Pattern::~Pattern() { + } + + BindPattern::~BindPattern() { + Name->unref(); + } + + Expression::~Expression() { + } + + ReferenceExpression::~ReferenceExpression() { + Name->unref(); + } + + ConstantExpression::~ConstantExpression() { + Token->unref(); + } + + CallExpression::~CallExpression() { + Function->unref(); + for (auto& Element: Args){ + Element->unref(); + } + } + + Statement::~Statement() { + } + + ExpressionStatement::~ExpressionStatement() { + Expression->unref(); + } + + ReturnStatement::~ReturnStatement() { + ReturnKeyword->unref(); + Expression->unref(); + } + + TypeAssert::~TypeAssert() { + Colon->unref(); + TypeExpression->unref(); + } + + Param::~Param() { + Pattern->unref(); + TypeAssert->unref(); + } + + LetBody::~LetBody() { + } + + LetBlockBody::~LetBlockBody() { + BlockStart->unref(); + for (auto& Element: Elements){ + Element->unref(); + } + } + + LetExprBody::~LetExprBody() { + Equals->unref(); + Expression->unref(); + } + + LetDeclaration::~LetDeclaration() { + if (PubKeyword) { + PubKeyword->unref(); + } + LetKeywod->unref(); + if (MutKeyword) { + MutKeyword->unref(); + } + Pattern->unref(); + for (auto& Element: Params){ + Element->unref(); + } + if (TypeAssert) { + TypeAssert->unref(); + } + if (Body) { + Body->unref(); + } + } + + StructDeclField::~StructDeclField() { + Name->unref(); + Colon->unref(); + TypeExpression->unref(); + } + + StructDecl::~StructDecl() { + StructKeyword->unref(); + Name->unref(); + Dot->unref(); + for (auto& Element: Fields){ + Element->unref(); + } + } + + SourceFile::~SourceFile() { + for (auto& Element: Elements){ + Element->unref(); + } + } + + std::string Equals::getText() const { + return "="; + } + + std::string Colon::getText() const { + return ":"; + } + + std::string Dot::getText() const { + 
return "."; + } + + std::string LParen::getText() const { + return "("; + } + + std::string RParen::getText() const { + return ")"; + } + + std::string LBracket::getText() const { + return "["; + } + + std::string RBracket::getText() const { + return "]"; + } + + std::string LBrace::getText() const { + return "{"; + } + + std::string RBrace::getText() const { + return "}"; + } + + std::string LetKeyword::getText() const { + return "let"; + } + + std::string MutKeyword::getText() const { + return "mut"; + } + + std::string PubKeyword::getText() const { + return "pub"; + } + + std::string TypeKeyword::getText() const { + return "type"; + } + + std::string ReturnKeyword::getText() const { + return "return"; + } + + std::string ModKeyword::getText() const { + return "mod"; + } + + std::string StructKeyword::getText() const { + return "struct"; + } + + std::string Invalid::getText() const { + return ""; + } + + std::string EndOfFile::getText() const { + return ""; + } + + std::string BlockStart::getText() const { + return "."; + } + + std::string BlockEnd::getText() const { + return ""; + } + + std::string LineFoldEnd::getText() const { + return ""; + } + + std::string CustomOperator::getText() const { + return Text; + } + + std::string Identifier::getText() const { + return Text; + } + + std::string StringLiteral::getText() const { + return "\"" + Text + "\""; + } + + std::string IntegerLiteral::getText() const { + return std::to_string(Value); + } + + std::string DotDot::getText() const { + return ".."; + } + +} + diff --git a/src/Diagnostics.cc b/src/Diagnostics.cc new file mode 100644 index 000000000..6acfda46f --- /dev/null +++ b/src/Diagnostics.cc @@ -0,0 +1,9 @@ + +#include "bolt/Diagnostics.hpp" + +namespace bolt { + + Diagnostic::Diagnostic(): + std::runtime_error("a compiler error occurred without being caught") {} + +} diff --git a/src/Parser.cc b/src/Parser.cc new file mode 100644 index 000000000..b8ab78687 --- /dev/null +++ b/src/Parser.cc @@ -0,0 +1,225 @@ 
+ +#include "bolt/CST.hpp" +#include "bolt/Scanner.hpp" +#include "bolt/Parser.hpp" +#include "bolt/Diagnostics.hpp" + +namespace bolt { + + Parser::Parser(Stream& S): + Tokens(S) {} + + Token* Parser::peekFirstTokenAfterModifiers() { + std::size_t I = 0; + for (;;) { + auto T0 = Tokens.peek(I++); + switch (T0->Type) { + case NodeType::PubKeyword: + case NodeType::MutKeyword: + continue; + default: + return T0; + } + } + } + +#define BOLT_EXPECT_TOKEN(name) \ + { \ + auto __Token = Tokens.get(); \ + if (__Token->Type != NodeType::name) { \ + throw UnexpectedTokenDiagnostic(__Token, std::vector { NodeType::name }); \ + } \ + } + + Pattern* Parser::parsePattern() { + auto T0 = Tokens.peek(); + switch (T0->Type) { + case NodeType::Identifier: + Tokens.get(); + return new BindPattern(static_cast(T0)); + default: + throw UnexpectedTokenDiagnostic(T0, std::vector { NodeType::Identifier }); + } + } + + QualifiedName* Parser::parseQualifiedName() { + std::vector ModulePath; + auto Name = Tokens.get(); + if (Name->Type != NodeType::Identifier) { + throw UnexpectedTokenDiagnostic(Name, std::vector { NodeType::Identifier }); + } + for (;;) { + auto T1 = Tokens.peek(); + if (T1->Type == NodeType::Dot) { + break; + } + Tokens.get(); + ModulePath.push_back(static_cast(Name)); + Name = Tokens.get(); + if (Name->Type != NodeType::Identifier) { + throw UnexpectedTokenDiagnostic(Name, std::vector { NodeType::Identifier }); + } + } + return new QualifiedName(ModulePath, static_cast(Name)); + } + + TypeExpression* Parser::parseTypeExpression() { + auto T0 = Tokens.peek(); + switch (T0->Type) { + case NodeType::Identifier: + return new ReferenceTypeExpression(parseQualifiedName()); + default: + throw UnexpectedTokenDiagnostic(T0, std::vector { NodeType::Identifier }); + } + } + + Expression* Parser::parseExpression() { + auto T0 = Tokens.peek(); + switch (T0->Type) { + case NodeType::Identifier: + Tokens.get(); + return new ReferenceExpression(static_cast(T0)); + case 
NodeType::IntegerLiteral: + case NodeType::StringLiteral: + Tokens.get(); + return new ConstantExpression(T0); + default: + throw UnexpectedTokenDiagnostic(T0, std::vector { NodeType::Identifier, NodeType::IntegerLiteral }); + } + } + + ExpressionStatement* Parser::parseExpressionStatement() { + auto E = parseExpression(); + BOLT_EXPECT_TOKEN(LineFoldEnd); + return new ExpressionStatement(E); + } + + LetDeclaration* Parser::parseLetDeclaration() { + + PubKeyword* Pub; + LetKeyword* Let; + MutKeyword* Mut; + auto T0 = Tokens.get(); + if (T0->Type == NodeType::PubKeyword) { + Pub = static_cast(T0); + T0 = Tokens.get(); + } + if (T0->Type != NodeType::LetKeyword) { + throw UnexpectedTokenDiagnostic(T0, std::vector { NodeType::LetKeyword }); + } + Let = static_cast(T0); + auto T1 = Tokens.peek(); + if (T1->Type == NodeType::MutKeyword) { + Mut = static_cast(T1); + Tokens.get(); + } + + auto Patt = parsePattern(); + + std::vector Params; + Token* T2; + for (;;) { + T2 = Tokens.peek(); + switch (T2->Type) { + case NodeType::LineFoldEnd: + case NodeType::BlockStart: + case NodeType::Equals: + case NodeType::Colon: + goto after_params; + default: + Params.push_back(new Param(parsePattern(), nullptr)); + } + } + +after_params: + + TypeAssert* TA = nullptr; + if (T2->Type == NodeType::Colon) { + Tokens.get(); + auto TE = parseTypeExpression(); + TA = new TypeAssert(static_cast(T2), TE); + T2 = Tokens.peek(); + } + + LetBody* Body; + switch (T2->Type) { + case NodeType::BlockStart: + { + Tokens.get(); + std::vector Elements; + for (;;) { + auto T3 = Tokens.peek(); + if (T3->Type == NodeType::BlockEnd) { + break; + } + Elements.push_back(parseLetBodyElement()); + } + Tokens.get(); + Body = new LetBlockBody(static_cast(T2), Elements); + break; + } + case NodeType::Equals: + Tokens.get(); + Body = new LetExprBody(static_cast(T2), parseExpression()); + break; + case NodeType::LineFoldEnd: + Body = nullptr; + break; + default: + std::vector Expected { NodeType::BlockStart, 
NodeType::LineFoldEnd, NodeType::Equals }; + if (TA == nullptr) { + // First tokens of TypeAssert + Expected.push_back(NodeType::Colon); + // First tokens of Pattern + Expected.push_back(NodeType::Identifier); + } + throw UnexpectedTokenDiagnostic(T2, Expected); + } + + BOLT_EXPECT_TOKEN(LineFoldEnd); + + return new LetDeclaration( + Pub, + Let, + Mut, + Patt, + Params, + TA, + Body + ); + } + + LetBodyElement* Parser::parseLetBodyElement() { + auto T0 = peekFirstTokenAfterModifiers(); + switch (T0->Type) { + case NodeType::LetKeyword: + return parseLetDeclaration(); + default: + return parseExpressionStatement(); + } + } + + SourceElement* Parser::parseSourceElement() { + auto T0 = peekFirstTokenAfterModifiers(); + switch (T0->Type) { + case NodeType::LetKeyword: + return parseLetDeclaration(); + default: + return parseExpressionStatement(); + } + } + + SourceFile* Parser::parseSourceFile() { + std::vector Elements; + for (;;) { + auto T0 = Tokens.peek(); + if (T0->Type == NodeType::EndOfFile) { + break; + } + Elements.push_back(parseSourceElement()); + } + return new SourceFile(Elements); + } + +} + diff --git a/src/Scanner.cc b/src/Scanner.cc new file mode 100644 index 000000000..cbdcb041e --- /dev/null +++ b/src/Scanner.cc @@ -0,0 +1,326 @@ + +#include + +#include "zen/config.hpp" + +#include "bolt/Integer.hpp" +#include "bolt/CST.hpp" +#include "bolt/Diagnostics.hpp" +#include "bolt/Scanner.hpp" + +namespace bolt { + + static inline bool isWhiteSpace(Char Chr) { + switch (Chr) { + case ' ': + case '\n': + case '\r': + case '\t': + return true; + default: + return false; + } + } + + static bool isIdentifierPart(Char Chr) { + return (Chr >= 65 && Chr <= 90) // Uppercase letter + || (Chr >= 96 && Chr <= 122) // Lowercase letter + || (Chr >= 48 && Chr <= 57) // Digit + || Chr == '_'; + } + + static int toDigit(Char Chr) { + ZEN_ASSERT(Chr >= 48 && Chr <= 57); + return Chr - 48; + } + + std::unordered_map Keywords = { + { "pub", NodeType::PubKeyword }, + { "let", 
NodeType::LetKeyword },
+    { "mut", NodeType::MutKeyword },
+    { "return", NodeType::ReturnKeyword },
+    { "type", NodeType::TypeKeyword },
+    { "mod", NodeType::ModKeyword },
+  };
+
+  // Create a scanner that reads its characters from Chars. Only a reference
+  // is stored, so the stream has to outlive the scanner.
+  // NOTE(review): the element type Char is inferred from how peekChar() and
+  // getChar() are used below - confirm against the declaration in Scanner.hpp.
+  Scanner::Scanner(Stream<Char>& Chars):
+    Chars(Chars) {}
+
+  // Scan and return the next token.
+  //
+  // Leading whitespace is skipped first. The kind of token is then chosen
+  // from the first remaining character, and the location stored in the token
+  // is the position of that character. Every token is allocated with new.
+  Token* Scanner::read() {
+
+    Char C0;
+
+    // Skip whitespace; C0 ends up holding the first significant character,
+    // which is still unconsumed at this point.
+    for (;;) {
+      C0 = peekChar();
+      if (!isWhiteSpace(C0)) {
+        break;
+      }
+      getChar();
+    }
+
+    auto StartLoc = getCurrentLoc();
+
+    switch (C0) {
+
+      case static_cast<Char>(EOF):
+        return new EndOfFile(StartLoc);
+
+      // Integer literal: one or more decimal digits
+      case '0': case '1': case '2': case '3': case '4':
+      case '5': case '6': case '7': case '8': case '9':
+      {
+        getChar();
+        Integer I = toDigit(C0);
+        for (;;) {
+          auto C1 = peekChar();
+          switch (C1) {
+            case '0': case '1': case '2': case '3': case '4':
+            case '5': case '6': case '7': case '8': case '9':
+              getChar();
+              I = I * 10 + toDigit(C1);
+              break;
+            default:
+              // First non-digit ends the literal and is left in the stream
+              goto digit_finish;
+          }
+        }
+digit_finish:
+        return new IntegerLiteral(I, StartLoc);
+      }
+
+      // Identifier or keyword: a letter or underscore followed by any number
+      // of identifier characters
+      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+      case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+      case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+      case 'v': case 'w': case 'x': case 'y': case 'z':
+      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+      case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+      case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+      case 'V': case 'W': case 'X': case 'Y': case 'Z':
+      case '_':
+      {
+        getChar();
+        ByteString Text { static_cast<char>(C0) };
+        for (;;) {
+          auto C1 = peekChar();
+          if (!isIdentifierPart(C1)) {
+            break;
+          }
+          Text.push_back(C1);
+          getChar();
+        }
+        // Reserved words get their own token class instead of Identifier
+        auto Match = Keywords.find(Text);
+        if (Match != Keywords.end()) {
+          switch (Match->second) {
+            case NodeType::PubKeyword:
+              return new PubKeyword(StartLoc);
+            case NodeType::LetKeyword:
+              return new
LetKeyword(StartLoc); + case NodeType::MutKeyword: + return new MutKeyword(StartLoc); + case NodeType::TypeKeyword: + return new TypeKeyword(StartLoc); + case NodeType::ReturnKeyword: + return new ReturnKeyword(StartLoc); + default: + ZEN_UNREACHABLE + } + } + return new Identifier(Text, StartLoc); + } + + case '"': + { + getChar(); + ByteString Text; + bool Escaping = false; + for (;;) { + auto Loc = getCurrentLoc(); + auto C1 = getChar(); + if (Escaping) { + switch (C1) { + case 'a': Text.push_back('\a'); break; + case 'b': Text.push_back('\b'); break; + case 'f': Text.push_back('\f'); break; + case 'n': Text.push_back('\n'); break; + case 'r': Text.push_back('\r'); break; + case 't': Text.push_back('\t'); break; + case 'v': Text.push_back('\v'); break; + case '0': Text.push_back('\0'); break; + case '\'': Text.push_back('\''); break; + case '"': Text.push_back('"'); break; + default: + throw UnexpectedStringDiagnostic(Loc, String { C1 }); + } + Escaping = false; + } else { + switch (C1) { + case '"': + goto after_string_contents; + case '\\': + Escaping = true; + break; + default: + Text.push_back(C1); + break; + } + } + } +after_string_contents: + return new StringLiteral(Text, StartLoc); + } + + case '.': + { + getChar(); + auto C1 = peekChar(); + if (C1 == '.') { + getChar(); + auto C2 = peekChar(); + if (C2 == '.') { + throw UnexpectedStringDiagnostic(getCurrentLoc(), String { C2 }); + } + return new DotDot(StartLoc); + } + return new Dot(StartLoc); + } + +#define BOLT_SIMPLE_TOKEN(ch, name) case ch: getChar(); return new name(StartLoc); + + BOLT_SIMPLE_TOKEN(':', Colon) + BOLT_SIMPLE_TOKEN('(', LParen) + BOLT_SIMPLE_TOKEN(')', RParen) + BOLT_SIMPLE_TOKEN('[', LBracket) + BOLT_SIMPLE_TOKEN(']', RBracket) + BOLT_SIMPLE_TOKEN('{', LBrace) + BOLT_SIMPLE_TOKEN('}', RBrace) + BOLT_SIMPLE_TOKEN('=', Equals) + + default: + + // TODO Add a diagnostic message indicating that scanning failed. 
+ return new Invalid(StartLoc); + + } + + } + + Punctuator::Punctuator(Stream& Tokens): + Tokens(Tokens) { + Frames.push(FrameType::Block); + Locations.push(TextLoc { 0, 0 }); + } + + Token* Punctuator::read() { + + auto T0 = Tokens.peek(); + + if (T0->Type == NodeType::EndOfFile) { + if (Frames.size() == 1) { + return T0; + } + auto Frame = Frames.top(); + Frames.pop(); + switch (Frame) { + case FrameType::Block: + return new BlockEnd(T0->getStartLoc()); + case FrameType::LineFold: + return new LineFoldEnd(T0->getStartLoc()); + } + } + + auto RefLoc = Locations.top(); + switch (Frames.top()) { + case FrameType::LineFold: + { + if (T0->getStartLine() > RefLoc.Line + && T0->getStartColumn() <= RefLoc.Column) { + Frames.pop(); + Locations.pop(); + return new LineFoldEnd(T0->getStartLoc()); + } + if (T0->Type == NodeType::Dot) { + auto T1 = Tokens.peek(1); + if (T1->getStartLine() > T0->getEndLine()) { + Tokens.get(); + Frames.push(FrameType::Block); + return new BlockStart(T0->getStartLoc()); + } + } + return Tokens.get(); + } + case FrameType::Block: + { + if (T0->getStartColumn() <= RefLoc.Column) { + Frames.pop(); + return new BlockEnd(T0->getStartLoc()); + } + + Frames.push(FrameType::LineFold); + Locations.push(T0->getStartLoc()); + + return Tokens.get(); + } + } + + + } + +} diff --git a/src/main.cc b/src/main.cc new file mode 100644 index 000000000..48289cae0 --- /dev/null +++ b/src/main.cc @@ -0,0 +1,129 @@ + +#include + +#include +#include + +#include "zen/config.hpp" + +#include "bolt/CST.hpp" +#include "bolt/Diagnostics.hpp" +#include "bolt/Scanner.hpp" +#include "bolt/Parser.hpp" + +using namespace bolt; + +String readFile(std::string Path) { + + std::ifstream File(Path); + String Out; + + File.seekg(0, std::ios::end); + Out.reserve(File.tellg()); + File.seekg(0, std::ios::beg); + + Out.assign((std::istreambuf_iterator(File)), + std::istreambuf_iterator()); + + return Out; +} + +std::string describe(NodeType Type) { + switch (Type) { + case 
NodeType::Identifier: + return "an identifier"; + case NodeType::CustomOperator: + return "an operator"; + case NodeType::IntegerLiteral: + return "an integer literal"; + case NodeType::EndOfFile: + return "end-of-file"; + case NodeType::BlockStart: + return "the start of a new indented block"; + case NodeType::BlockEnd: + return "the end of the current indented block"; + case NodeType::LineFoldEnd: + return "the end of the current line-fold"; + case NodeType::LParen: + return "'('"; + case NodeType::RParen: + return "')'"; + case NodeType::LBrace: + return "'['"; + case NodeType::RBrace: + return "']'"; + case NodeType::LBracket: + return "'{'"; + case NodeType::RBracket: + return "'}'"; + case NodeType::Colon: + return "':'"; + case NodeType::Equals: + return "'='"; + case NodeType::StringLiteral: + return "a string literal"; + case NodeType::Dot: + return "'.'"; + case NodeType::PubKeyword: + return "'pub'"; + case NodeType::LetKeyword: + return "'let'"; + case NodeType::MutKeyword: + return "'mut'"; + case NodeType::ReturnKeyword: + return "'return'"; + case NodeType::TypeKeyword: + return "'type'"; + default: + ZEN_UNREACHABLE + } +} + + +int main(int argc, const char* argv[]) { + + if (argc < 2) { + fprintf(stderr, "Not enough arguments provided.\n"); + return 1; + } + + auto Text = readFile(argv[1]); + VectorStream Chars(Text, EOF); + Scanner S(Chars); + Punctuator PT(S); + Parser P(PT); + + SourceFile* SF; + +#ifdef NDEBUG + try { + SF = P.parseSourceFile(); + } catch (UnexpectedTokenDiagnostic& E) { + std::cerr << ":" << E.Actual->getStartLine() << ":" << E.Actual->getStartColumn() << ": expected "; + switch (E.Expected.size()) { + case 0: + std::cerr << "nothing"; + break; + case 1: + std::cerr << describe(E.Expected[0]); + break; + default: + auto Iter = E.Expected.begin(); + std::cerr << describe(*Iter++); + NodeType Prev; + while (Iter != E.Expected.end()) { + std::cerr << ", " << describe(Prev); + Prev = *Iter++; + } + std::cerr << " or " << 
describe(Prev); + break; + } + std::cerr << " but instead got '" << E.Actual->getText() << "'\n"; + } +#else + SF = P.parseSourceFile(); +#endif + + return 0; +} +