504 lines
12 KiB
TypeScript
504 lines
12 KiB
TypeScript
|
|
import {
|
|
SyntaxKind,
|
|
Token,
|
|
Identifier,
|
|
StringLiteral,
|
|
EndOfFile,
|
|
BlockStart,
|
|
BlockEnd,
|
|
LineFoldEnd,
|
|
PubKeyword,
|
|
MutKeyword,
|
|
LetKeyword,
|
|
ImportKeyword,
|
|
TypeKeyword,
|
|
TextPosition,
|
|
Colon,
|
|
Comma,
|
|
Equals,
|
|
LParen,
|
|
RParen,
|
|
LBrace,
|
|
LBracket,
|
|
RBrace,
|
|
RBracket,
|
|
ReturnKeyword,
|
|
CustomOperator,
|
|
IdentifierAlt,
|
|
Integer,
|
|
TextFile,
|
|
Dot,
|
|
DotDot,
|
|
Assignment,
|
|
ElifKeyword,
|
|
ElseKeyword,
|
|
IfKeyword,
|
|
StructKeyword,
|
|
RArrow,
|
|
EnumKeyword,
|
|
MatchKeyword,
|
|
RArrowAlt,
|
|
VBar,
|
|
} from "./cst"
|
|
import { Diagnostics, UnexpectedCharDiagnostic } from "./diagnostics"
|
|
import { Stream, BufferedStream, assert } from "./util";
|
|
|
|
const EOF = '\uFFFF'
|
|
|
|
function isUpper(ch: string): boolean {
|
|
return ch.toUpperCase() === ch;
|
|
}
|
|
|
|
function isWhiteSpace(ch: string): boolean {
|
|
return /[\r\n\t ]/.test(ch);
|
|
}
|
|
|
|
function isIdentPart(ch: string): boolean {
|
|
return /[a-zA-Z0-9_]/.test(ch);
|
|
}
|
|
|
|
function isIdentStart(ch: string): boolean {
|
|
return /[a-zA-Z_]/.test(ch)
|
|
}
|
|
|
|
function isDecimalDigit(ch: string): boolean {
|
|
return /[0-9]/.test(ch);
|
|
}
|
|
|
|
function toDecimal(ch: string): number {
|
|
const code = ch.charCodeAt(0);
|
|
assert(code >= 48 && code <= 57);
|
|
return code - 48;
|
|
}
|
|
|
|
function isOperatorPart(ch: string): boolean {
|
|
return /[+\-*\/%^&|$<>!?=]/.test(ch);
|
|
}
|
|
|
|
export class ScanError extends Error {
|
|
|
|
public constructor(
|
|
public file: TextFile,
|
|
public position: TextPosition,
|
|
public actual: string,
|
|
) {
|
|
super(`Uncaught scanner error`);
|
|
}
|
|
|
|
}
|
|
|
|
export class Scanner extends BufferedStream<Token> {
|
|
|
|
private currLine = 1;
|
|
private currColumn = 1;
|
|
|
|
public constructor(
|
|
public text: string,
|
|
public textOffset: number = 0,
|
|
public diagnostics: Diagnostics,
|
|
private file: TextFile,
|
|
) {
|
|
super();
|
|
}
|
|
|
|
private peekChar(offset = 1): string {
|
|
const i = this.textOffset + offset - 1;
|
|
return i < this.text.length ? this.text[i] : EOF;
|
|
}
|
|
|
|
private getChar(): string {
|
|
const ch = this.textOffset < this.text.length
|
|
? this.text[this.textOffset++]
|
|
: EOF
|
|
if (ch === '\n') {
|
|
this.currLine++;
|
|
this.currColumn = 1;
|
|
} else {
|
|
this.currColumn++;
|
|
}
|
|
return ch;
|
|
}
|
|
|
|
private takeWhile(pred: (ch: string) => boolean): string {
|
|
let out = ''
|
|
for (;;) {
|
|
const c0 = this.peekChar()
|
|
if (!pred(c0)) {
|
|
break;
|
|
}
|
|
this.getChar()
|
|
out += c0;
|
|
}
|
|
return out;
|
|
}
|
|
|
|
private getCurrentPosition(): TextPosition {
|
|
return new TextPosition(
|
|
this.textOffset,
|
|
this.currLine,
|
|
this.currColumn
|
|
);
|
|
}
|
|
|
|
public read(): Token {
|
|
|
|
let c0: string;
|
|
|
|
// Skip whitespace and comments
|
|
for (;;) {
|
|
|
|
for (;;) {
|
|
c0 = this.peekChar();
|
|
if (isWhiteSpace(c0)) {
|
|
this.getChar();
|
|
continue;
|
|
}
|
|
if (c0 === '#') {
|
|
this.getChar();
|
|
for (;;) {
|
|
const c1 = this.getChar();
|
|
if (c1 === '\n' || c1 === EOF) {
|
|
break;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// We failed to match a newline or line comment, so there's nothing to skip
|
|
break;
|
|
|
|
}
|
|
|
|
const startPos = this.getCurrentPosition();
|
|
this.getChar();
|
|
|
|
switch (c0) {
|
|
|
|
case '"':
|
|
{
|
|
const startPos = this.getCurrentPosition();
|
|
let contents = '';
|
|
let escaping = false;
|
|
for (;;) {
|
|
if (escaping) {
|
|
const startPos = this.getCurrentPosition();
|
|
const c1 = this.getChar();
|
|
switch (c1) {
|
|
case 'a': contents += '\a'; break;
|
|
case 'b': contents += '\b'; break;
|
|
case 'f': contents += '\f'; break;
|
|
case 'n': contents += '\n'; break;
|
|
case 'r': contents += '\r'; break;
|
|
case 't': contents += '\t'; break;
|
|
case 'v': contents += '\v'; break;
|
|
case '0': contents += '\0'; break;
|
|
case '\'': contents += '\''; break;
|
|
case '\"': contents += '\"'; break;
|
|
default:
|
|
throw new ScanError(this.file, startPos, c1);
|
|
}
|
|
escaping = false;
|
|
} else {
|
|
const c1 = this.getChar();
|
|
if (c1 === '"') {
|
|
break;
|
|
} else {
|
|
contents += c1;
|
|
}
|
|
}
|
|
}
|
|
return new StringLiteral(contents, startPos);
|
|
}
|
|
|
|
case EOF:
|
|
{
|
|
return new EndOfFile(startPos);
|
|
}
|
|
|
|
case '(': return new LParen(startPos);
|
|
case ')': return new RParen(startPos);
|
|
case '[': return new LBracket(startPos);
|
|
case ']': return new RBracket(startPos);
|
|
case '{': return new LBrace(startPos);
|
|
case '}': return new RBrace(startPos);
|
|
case ',': return new Comma(startPos);
|
|
case ':': return new Colon(startPos);
|
|
case '.': {
|
|
const dots = c0 + this.takeWhile(ch => ch === '.');
|
|
if (dots === '.') {
|
|
return new Dot(startPos);
|
|
} else if (dots === '..') {
|
|
return new DotDot(startPos);
|
|
} else {
|
|
throw new ScanError(this.file, startPos, dots);
|
|
}
|
|
}
|
|
|
|
case '+':
|
|
case '-':
|
|
case '*':
|
|
case '/':
|
|
case '%':
|
|
case '&':
|
|
case '^':
|
|
case '|':
|
|
case '$':
|
|
case '<':
|
|
case '>':
|
|
case '=':
|
|
case '!':
|
|
case '?':
|
|
{
|
|
const text = c0 + this.takeWhile(isOperatorPart);
|
|
if (text === '->') {
|
|
return new RArrow(startPos);
|
|
} else if (text === '=>') {
|
|
return new RArrowAlt(startPos);
|
|
} else if (text === '|') {
|
|
return new VBar(startPos);
|
|
} else if (text === '=') {
|
|
return new Equals(startPos);
|
|
} else if (text.endsWith('=') && text[text.length-2] !== '=') {
|
|
return new Assignment(text, startPos);
|
|
} else {
|
|
return new CustomOperator(text, startPos);
|
|
}
|
|
}
|
|
|
|
case '0':
|
|
{
|
|
const c1 = this.peekChar();
|
|
switch (c1) {
|
|
case 'x': // TODO
|
|
case 'o': // TODO
|
|
case 'b': // TODO
|
|
}
|
|
}
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
case '8':
|
|
case '9':
|
|
{
|
|
let value = BigInt(toDecimal(c0));
|
|
for (;;) {
|
|
const c1 = this.peekChar();
|
|
if (!isDecimalDigit(c1)) {
|
|
break;
|
|
}
|
|
this.getChar();
|
|
value = value * BigInt(10) + BigInt(toDecimal(c1));
|
|
}
|
|
return new Integer(value, 10, startPos);
|
|
}
|
|
|
|
case 'a':
|
|
case 'b':
|
|
case 'c':
|
|
case 'd':
|
|
case 'e':
|
|
case 'f':
|
|
case 'g':
|
|
case 'h':
|
|
case 'i':
|
|
case 'j':
|
|
case 'k':
|
|
case 'l':
|
|
case 'm':
|
|
case 'n':
|
|
case 'o':
|
|
case 'p':
|
|
case 'q':
|
|
case 'r':
|
|
case 's':
|
|
case 't':
|
|
case 'u':
|
|
case 'v':
|
|
case 'w':
|
|
case 'x':
|
|
case 'y':
|
|
case 'z':
|
|
case 'A':
|
|
case 'B':
|
|
case 'C':
|
|
case 'D':
|
|
case 'E':
|
|
case 'F':
|
|
case 'G':
|
|
case 'H':
|
|
case 'I':
|
|
case 'J':
|
|
case 'K':
|
|
case 'L':
|
|
case 'M':
|
|
case 'N':
|
|
case 'O':
|
|
case 'P':
|
|
case 'Q':
|
|
case 'R':
|
|
case 'S':
|
|
case 'T':
|
|
case 'U':
|
|
case 'V':
|
|
case 'W':
|
|
case 'X':
|
|
case 'Y':
|
|
case 'Z':
|
|
case '_':
|
|
{
|
|
const text = c0 + this.takeWhile(isIdentPart);
|
|
switch (text) {
|
|
case 'import': return new ImportKeyword(startPos);
|
|
case 'pub': return new PubKeyword(startPos);
|
|
case 'mut': return new MutKeyword(startPos);
|
|
case 'let': return new LetKeyword(startPos);
|
|
case 'import': return new ImportKeyword(startPos);
|
|
case 'return': return new ReturnKeyword(startPos);
|
|
case 'type': return new TypeKeyword(startPos);
|
|
case 'if': return new IfKeyword(startPos);
|
|
case 'else': return new ElseKeyword(startPos);
|
|
case 'elif': return new ElifKeyword(startPos);
|
|
case 'struct': return new StructKeyword(startPos);
|
|
case 'enum': return new EnumKeyword(startPos);
|
|
case 'match': return new MatchKeyword(startPos);
|
|
default:
|
|
if (isUpper(text[0])) {
|
|
return new IdentifierAlt(text, startPos);
|
|
} else {
|
|
return new Identifier(text, startPos);
|
|
}
|
|
}
|
|
}
|
|
|
|
default:
|
|
|
|
// Nothing matched, so the current character is unrecognisable
|
|
throw new ScanError(this.file, startPos, c0);
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
const enum FrameType {
|
|
Block,
|
|
LineFold,
|
|
Fallthrough,
|
|
}
|
|
|
|
const INIT_POS = new TextPosition(0, 0, 0);
|
|
|
|
export class Punctuator extends BufferedStream<Token> {
|
|
|
|
private referencePositions: TextPosition[] = [ INIT_POS ];
|
|
|
|
private frameTypes: FrameType[] = [ FrameType.Block ];
|
|
|
|
public constructor(
|
|
private tokens: Stream<Token>,
|
|
) {
|
|
super();
|
|
}
|
|
|
|
public read(): Token {
|
|
|
|
const t0 = this.tokens.peek(1);
|
|
|
|
switch (t0.kind) {
|
|
case SyntaxKind.LBrace:
|
|
this.frameTypes.push(FrameType.Fallthrough);
|
|
break;
|
|
case SyntaxKind.EndOfFile:
|
|
{
|
|
if (this.frameTypes.length === 1) {
|
|
return t0;
|
|
}
|
|
const frameType = this.frameTypes.pop()!;
|
|
switch (frameType) {
|
|
case FrameType.LineFold:
|
|
return new LineFoldEnd(t0.getStartPosition());
|
|
case FrameType.Block:
|
|
return new BlockEnd(t0.getStartPosition());
|
|
}
|
|
}
|
|
}
|
|
|
|
const refPos = this.referencePositions[this.referencePositions.length-1];
|
|
const frameType = this.frameTypes[this.frameTypes.length-1];
|
|
|
|
switch (frameType) {
|
|
|
|
case FrameType.Fallthrough:
|
|
{
|
|
if (t0.kind === SyntaxKind.RBrace) {
|
|
this.frameTypes.pop()!;
|
|
}
|
|
this.tokens.get();
|
|
return t0;
|
|
}
|
|
|
|
case FrameType.LineFold:
|
|
{
|
|
|
|
// This important check verifies we're still inside the line-fold. If
|
|
// we aren't, we need to clean up the stack a bit and eventually return
|
|
// a token that indicates the line-fold ended.
|
|
if (t0.getStartLine() > refPos.line
|
|
&& t0.getStartColumn() <= refPos.column) {
|
|
this.frameTypes.pop();
|
|
this.referencePositions.pop();
|
|
return new LineFoldEnd(t0.getStartPosition());
|
|
}
|
|
|
|
const t1 = this.tokens.peek(2);
|
|
if (t0.kind === SyntaxKind.Dot && t0.getEndLine() < t1.getStartLine()) {
|
|
this.tokens.get();
|
|
this.frameTypes.push(FrameType.Block);
|
|
return new BlockStart(t0.getStartPosition());
|
|
}
|
|
|
|
// If we got here, this is an ordinary token that is part of the
|
|
// line-fold. Make sure to consume it and return it to the caller.
|
|
this.tokens.get();
|
|
return t0;
|
|
}
|
|
|
|
case FrameType.Block:
|
|
{
|
|
|
|
if (t0.getStartColumn() <= refPos.column) {
|
|
|
|
// We only get here if the current token is less indented than the
|
|
// current reference token. Pop the block indicator and leave the
|
|
// reference position be for the edge case where the parent line-fold
|
|
// continues after the block.
|
|
this.frameTypes.pop();
|
|
return new BlockEnd(t0.getStartPosition());
|
|
|
|
}
|
|
|
|
this.frameTypes.push(FrameType.LineFold);
|
|
this.referencePositions.push(t0.getStartPosition());
|
|
|
|
// In theory, we could explictly issue a LineFoldStart and let all
|
|
// tokens be passed through in the FrameType.LineFold case. It does add
|
|
// more logic to the parser for no real benefit, which is why it was
|
|
// omitted.
|
|
this.tokens.get();
|
|
return t0;
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|