// ethos/src/front/parse.ts
// 2026-04-16 13:02:10 +02:00
//
// 575 lines
// 17 KiB
// TypeScript

import * as ast from "../ast.ts";
import { FileReporter, Loc } from "../diagnostics.ts";
/**
 * Parses a whole source file.
 *
 * @param text Source text to parse.
 * @param reporter Receives the diagnostics produced while lexing and parsing.
 * @returns The node produced by `Parser.parseFile()`.
 */
export function parse(text: string, reporter: FileReporter): ast.Node {
    const parser = new Parser(text, reporter);
    return parser.parseFile();
}
/**
 * Recursive-descent parser producing `ast.Node` trees.
 *
 * The constructor tokenizes the whole input up front; the `parse*` methods
 * then walk the token array with a single cursor. Syntax errors are sent to
 * the `FileReporter`, followed by a call to `reporter.abort()`.
 */
export class Parser {
    /**
     * Binary operators as `[token, AST operator, precedence level]`.
     * Level 1 binds tightest (`* / %`), level 9 loosest (`or`).
     */
    private static readonly binaryOps: ReadonlyArray<
        readonly [Tok["type"], ast.BinaryOp, number]
    > = [
        ["or", "Or", 9],
        ["and", "And", 8],
        ["==", "Eq", 7],
        ["!=", "Ne", 7],
        ["<", "Lt", 7],
        [">", "Gt", 7],
        ["<=", "Lte", 7],
        [">=", "Gte", 7],
        ["|", "BitOr", 6],
        ["^", "BitXor", 5],
        ["&", "BitAnd", 4],
        ["<<", "Shl", 3],
        [">>", "Shr", 3],
        ["+", "Add", 2],
        ["-", "Sub", 2],
        ["*", "Mul", 1],
        ["/", "Div", 1],
        ["%", "Rem", 1],
    ];

    /** Prefix operators as `[token, AST operator]`; `&` / `&mut` are handled separately. */
    private static readonly prefixOps: ReadonlyArray<
        readonly [Tok["type"], ast.UnaryOp]
    > = [
        ["not", "Not"],
        ["-", "Neg"],
        ["*", "Deref"],
    ];

    private toks: Tok[];
    /** Index of the current token in `toks`. */
    private idx = 0;
    /** Location of the current token; keeps the last token's location at end of input. */
    private currentLoc: Loc = { idx: 0, line: 1, col: 1 };
    /** The token most recently stepped over, if any. */
    private prevTok: Tok | null = null;

    constructor(
        private text: string,
        private reporter: FileReporter,
    ) {
        this.toks = tokenize(this.text, this.reporter);
        // Keep the invariant that `currentLoc` is the current token's location
        // from the very start; otherwise leading whitespace or comments make
        // the first statement report line 1, column 1.
        const first = this.toks[0];
        if (first !== undefined) {
            this.currentLoc = first.loc;
        }
    }

    /** Parses statements until the input is exhausted and wraps them in a `File` node. */
    parseFile(): ast.Node {
        // The file node itself is located at the start of the text.
        const loc: Loc = { idx: 0, line: 1, col: 1 };
        const stmts: ast.Node[] = [];
        while (!this.done) {
            stmts.push(this.parseStmt());
        }
        return ast.Node.create(loc, "File", { stmts });
    }

    /** Parses `{ stmt* }` into a `Block` node. */
    parseBlock(): ast.Node {
        const loc = this.loc();
        this.mustEat("{");
        const stmts: ast.Node[] = [];
        while (!this.done && !this.test("}")) {
            stmts.push(this.parseStmt());
        }
        this.mustEat("}");
        return ast.Node.create(loc, "Block", { stmts });
    }

    /**
     * Parses one statement, dispatching on the leading keyword. Anything else
     * is an expression followed by `;`, or by `= expr ;` for an assignment.
     */
    parseStmt(): ast.Node {
        const loc = this.loc();
        if (this.test("fn")) {
            return this.parseFnStmt();
        } else if (this.test("return")) {
            return this.parseReturnStmt();
        } else if (this.test("let")) {
            return this.parseLetStmt();
        } else if (this.test("if")) {
            return this.parseIfStmt();
        } else if (this.test("while")) {
            return this.parseWhileStmt();
        } else if (this.test("break")) {
            return this.parseBreakStmt();
        } else {
            const place = this.parseExpr();
            if (this.eat("=")) {
                const expr = this.parseExpr();
                this.mustEat(";");
                return ast.Node.create(loc, "AssignStmt", { place, expr });
            }
            this.mustEat(";");
            return ast.Node.create(loc, "ExprStmt", { expr: place });
        }
    }

    /**
     * Parses `fn ident <generics>? ( params ) (-> ty)? block`.
     * The parameter list accepts a trailing comma.
     */
    parseFnStmt(): ast.Node {
        const loc = this.loc();
        this.step(); // skip `fn`
        const ident = this.mustEat("ident").value;
        const genericParams = this.parseGenericParams();
        this.mustEat("(");
        const params: ast.Node[] = [];
        if (!this.test(")")) {
            params.push(this.parseParam());
            while (this.eat(",")) {
                if (this.test(")")) {
                    break;
                }
                params.push(this.parseParam());
            }
        }
        this.mustEat(")");
        let retTy: ast.Node | null = null;
        if (this.eat("->")) {
            retTy = this.parseTy();
        }
        const body = this.parseBlock();
        return ast.Node.create(loc, "FnStmt", {
            ident,
            genericParams,
            params,
            retTy,
            body,
        });
    }

    /** Parses `return expr? ;`. */
    parseReturnStmt(): ast.Node {
        const loc = this.loc();
        this.step(); // skip `return`
        let expr: ast.Node | null = null;
        if (!this.test(";")) {
            expr = this.parseExpr();
        }
        this.mustEat(";");
        return ast.Node.create(loc, "ReturnStmt", { expr });
    }

    /** Parses `let param = expr ;`. */
    parseLetStmt(): ast.Node {
        const loc = this.loc();
        this.step(); // skip `let`
        const param = this.parseParam();
        this.mustEat("=");
        const expr = this.parseExpr();
        this.mustEat(";");
        return ast.Node.create(loc, "LetStmt", { param, expr });
    }

    /** Parses `if cond block (else block)?`. */
    parseIfStmt(): ast.Node {
        const loc = this.loc();
        this.step(); // skip `if`
        const cond = this.parseExpr();
        const truthy = this.parseBlock();
        let falsy: ast.Node | null = null;
        if (this.eat("else")) {
            falsy = this.parseBlock();
        }
        return ast.Node.create(loc, "IfStmt", { cond, truthy, falsy });
    }

    /** Parses `while cond block`. */
    parseWhileStmt(): ast.Node {
        const loc = this.loc();
        this.step(); // skip `while`
        const cond = this.parseExpr();
        const body = this.parseBlock();
        return ast.Node.create(loc, "WhileStmt", { cond, body });
    }

    /** Parses `break ;`. */
    parseBreakStmt(): ast.Node {
        const loc = this.loc();
        this.step(); // skip `break`
        this.mustEat(";");
        return ast.Node.create(loc, "BreakStmt", {});
    }

    /** Parses `ident (: ty)?`; `ty` is `null` when no annotation is given. */
    parseParam(): ast.Node {
        const loc = this.loc();
        const ident = this.mustEat("ident").value;
        let ty: ast.Node | null = null;
        if (this.eat(":")) {
            ty = this.parseTy();
        }
        return ast.Node.create(loc, "Param", { ident, ty });
    }

    /** Entry point for expressions; ranges are the outermost expression form. */
    parseExpr(): ast.Node {
        return this.parseRange();
    }

    /**
     * Parses an optional range around a binary expression: `..end`, `..=end`,
     * `begin..end`, `begin..=end`, or a plain binary expression.
     */
    parseRange(): ast.Node {
        const loc = this.loc();
        if (this.eat("..") || this.eat("..=")) {
            return this.parseRangeTail(loc, null, this.prevTok!.type);
        } else {
            const begin = this.parseBinary();
            if (this.eat("..") || this.eat("..=")) {
                return this.parseRangeTail(loc, begin, this.prevTok!.type);
            } else {
                return begin;
            }
        }
    }

    /**
     * Parses what follows a range operator.
     *
     * @param loc Location to give the resulting node.
     * @param begin Lower bound, or `null` for a range without one.
     * @param tok The range operator that was consumed (`..` or `..=`).
     */
    parseRangeTail(loc: Loc, begin: ast.Node | null, tok: string): ast.Node {
        const limit: ast.RangeLimit = tok === ".." ? "Exclusive" : "Inclusive";
        let end: ast.Node | null = null;
        // The upper bound is omitted when the range is directly followed by
        // one of these closing tokens.
        if (![";", ",", ")", "]"].some((closer) => this.test(closer))) {
            end = this.parseBinary();
        }
        // NOTE(review): this call used `ast.create`; switched to
        // `ast.Node.create` to match the rest of the parser — confirm the two
        // are equivalent in ast.ts.
        return ast.Node.create(loc, "RangeExpr", { begin, end, limit });
    }

    /**
     * Parses left-associative binary expressions by precedence climbing.
     *
     * @param prec Loosest precedence level to accept. Defaults to 9, the
     *   loosest level in `binaryOps`, so that `and` / `or` are included.
     *   Level 0 falls through to prefix expressions.
     */
    parseBinary(prec = 9): ast.Node {
        const loc = this.loc();
        if (prec === 0) {
            return this.parsePrefix();
        }
        let left = this.parseBinary(prec - 1);
        let matched = true;
        while (matched) {
            matched = false;
            for (const [tok, op, p] of Parser.binaryOps) {
                if (prec >= p && this.eat(tok)) {
                    const right = this.parseBinary(prec - 1);
                    left = ast.Node.create(
                        loc,
                        "BinaryExpr",
                        { op, left, right, tok },
                    );
                    matched = true;
                    break;
                }
            }
        }
        return left;
    }

    /** Parses prefix operators (`not`, `-`, `*`, `&`, `&mut`), then a postfix expression. */
    parsePrefix(): ast.Node {
        const loc = this.loc();
        for (const [tok, op] of Parser.prefixOps) {
            if (this.eat(tok)) {
                const expr = this.parsePrefix();
                return ast.Node.create(loc, "UnaryExpr", { op, expr, tok });
            }
        }
        if (this.eat("&")) {
            const op: ast.UnaryOp = this.eat("mut") ? "RefMut" : "Ref";
            const expr = this.parsePrefix();
            const tok = op === "Ref" ? "&" : "&mut";
            return ast.Node.create(loc, "UnaryExpr", { op, expr, tok });
        }
        return this.parsePostfix();
    }

    /**
     * Parses an operand followed by any number of postfix forms: `.*`,
     * `[index]`, `::<generics>(args)` and `(args)`.
     */
    parsePostfix(): ast.Node {
        let expr = this.parseOperand();
        while (true) {
            const loc = this.loc();
            if (this.eat(".*")) {
                // use unary because it's already there
                // TODO: consider making a separate node type
                expr = ast.Node
                    .create(loc, "UnaryExpr", { expr, op: "Deref", tok: ".*" });
            } else if (this.eat("[")) {
                const arg = this.parseExpr();
                this.mustEat("]");
                expr = ast.Node.create(loc, "IndexExpr", { value: expr, arg });
            } else if (this.test("::<")) {
                const generics = this.parseGenericArgs();
                this.mustEat("(");
                expr = this.parseCallExprTail(expr, loc, generics);
            } else if (this.eat("(")) {
                expr = this.parseCallExprTail(expr, loc, null);
            } else {
                break;
            }
        }
        return expr;
    }

    /**
     * Parses call arguments after the opening `(` has been consumed.
     * The argument list accepts a trailing comma.
     *
     * @param expr The callee.
     * @param loc Location to give the call node.
     * @param generics Explicit generic arguments, or `null` when none were written.
     */
    parseCallExprTail(
        expr: ast.Node,
        loc: Loc,
        generics: ast.Node[] | null,
    ): ast.Node {
        const args: ast.Node[] = [];
        if (!this.test(")")) {
            args.push(this.parseExpr());
            while (this.eat(",")) {
                if (this.done || this.test(")")) {
                    break;
                }
                args.push(this.parseExpr());
            }
        }
        this.mustEat(")");
        return ast.Node.create(loc, "CallExpr", {
            value: expr,
            generics,
            args,
        });
    }

    /**
     * Parses a primary expression: identifier, integer literal, string
     * literal, parenthesized expression or array literal.
     */
    parseOperand(): ast.Node {
        const loc = this.loc();
        if (this.test("ident")) {
            const ident = this.current.value;
            this.step();
            return ast.Node.create(loc, "IdentExpr", { ident });
        } else if (this.test("int")) {
            const match = this.current.value
                .match(/(0|(?:[1-9][0-9]*))([iu](?:8|16|32|64|size))?$/);
            if (!match) {
                throw new Error(
                    `malformed integer literal '${this.current.value}'`,
                );
            }
            const value = Number(match[1]);
            const intTy = match[2];
            if (
                intTy &&
                !["8", "16", "32", "64", "size"].includes(intTy.slice(1))
            ) {
                // Report the whole size suffix, not just its first character.
                this.reporter.error(
                    loc,
                    `invalid integer size '${intTy.slice(1)}'`,
                );
                this.reporter.abort();
            }
            this.step();
            return ast.Node.create(loc, "IntExpr", {
                value,
                intTy: (intTy as ast.IntTy | undefined) ?? "any",
            });
        } else if (this.test("str")) {
            const value = this.current.value;
            this.step();
            return ast.Node.create(loc, "StrExpr", { value });
        } else if (this.eat("(")) {
            const expr = this.parseExpr();
            this.mustEat(")");
            return expr;
        } else if (this.eat("[")) {
            const values: ast.Node[] = [];
            if (!this.done && !this.test("]")) {
                values.push(this.parseExpr());
                while (this.eat(",")) {
                    if (this.test("]")) {
                        break;
                    }
                    values.push(this.parseExpr());
                }
            }
            this.mustEat("]");
            return ast.Node.create(loc, "ArrayExpr", { values });
        } else {
            // No token has this type, so this always reports
            // "expected '<expression>'" and aborts.
            this.mustEat("<expression>");
            throw new Error("expected expression");
        }
    }

    /**
     * Parses a type: an identifier, `*ty` / `*mut ty`, `[ty; length]` or `[ty]`.
     */
    parseTy(): ast.Node {
        const loc = this.loc();
        if (this.test("ident")) {
            const ident = this.current.value;
            this.step();
            return ast.Node.create(loc, "IdentTy", { ident });
        } else if (this.eat("*")) {
            const mutable = this.eat("mut");
            const ty = this.parseTy();
            return ast.Node.create(loc, mutable ? "PtrMutTy" : "PtrTy", { ty });
        } else if (this.eat("[")) {
            const ty = this.parseTy();
            if (this.eat(";")) {
                const length = this.parseExpr();
                this.mustEat("]");
                return ast.Node.create(loc, "ArrayTy", { ty, length });
            } else {
                this.mustEat("]");
                return ast.Node.create(loc, "SliceTy", { ty });
            }
        } else {
            // No token has this type, so this always reports
            // "expected '<type>'" and aborts.
            this.mustEat("<type>");
            throw new Error("expected type");
        }
    }

    /**
     * Parses `::< ty, ... >`. The list may be empty and may end with a
     * trailing comma.
     *
     * @returns The type arguments, or `null` when the next token is not `::<`.
     */
    parseGenericArgs(): ast.Node[] | null {
        if (!this.eat("::<")) {
            return null;
        }
        const args: ast.Node[] = [];
        // Stop at the closing `>`.
        while (!this.done && !this.test(">")) {
            args.push(this.parseTy());
            if (!this.eat(",")) {
                break;
            }
        }
        this.mustEat(">");
        return args;
    }

    /**
     * Parses `< ident, ... >`. The list may be empty and may end with a
     * trailing comma.
     *
     * @returns The generic parameters, or `null` when the next token is not `<`.
     */
    parseGenericParams(): ast.Node[] | null {
        if (!this.eat("<")) {
            return null;
        }
        const params: ast.Node[] = [];
        // Stop at the closing `>`.
        while (!this.done && !this.test(">")) {
            const loc = this.loc();
            const identTok = this.mustEat("ident");
            // NOTE(review): this call used `ast.create`; switched to
            // `ast.Node.create` to match the rest of the parser — confirm the
            // two are equivalent in ast.ts.
            params.push(
                ast.Node.create(loc, "Generic", { ident: identTok.value }),
            );
            if (!this.eat(",")) {
                break;
            }
        }
        this.mustEat(">");
        return params;
    }

    /**
     * Consumes the current token, which must have the given type. Otherwise
     * an error is reported (with a hint at the previous token when a `;` is
     * missing) and `reporter.abort()` is called.
     *
     * @param type Required token type.
     * @param loc Location used for the error; defaults to the current location.
     * @returns The consumed token.
     */
    private mustEat(type: string, loc = this.loc()): Tok {
        const tok = this.current;
        // At end of input `tok` is undefined, so check `done` before reading
        // `tok.type`.
        if (this.done || tok.type !== type) {
            this.reporter.error(
                loc,
                `expected '${type}', got '${this.done ? "eof" : tok.type}'`,
            );
            if (type === ";" && this.idx > 0) {
                this.reporter.info(
                    this.toks[this.idx - 1].loc,
                    `try adding '${type}' here`,
                );
            }
            this.reporter.abort();
        }
        this.step();
        return tok;
    }

    /** Consumes the current token if it has the given type; returns whether it did. */
    private eat(type: string): boolean {
        if (this.test(type)) {
            this.step();
            return true;
        }
        return false;
    }

    /** Advances to the next token, recording the previous token and the new location. */
    private step() {
        if (!this.done) {
            this.prevTok = this.current;
        }
        this.idx += 1;
        if (!this.done) {
            this.currentLoc = this.current.loc;
        }
    }

    /** Whether there is a current token and it has the given type. */
    private test(type: string): boolean {
        return !this.done && this.current.type === type;
    }

    private loc(): Loc {
        return this.currentLoc;
    }

    /** The current token; undefined at runtime once `done` is true. */
    private get current(): Tok {
        return this.toks[this.idx];
    }

    /** Whether every token has been consumed. */
    private get done(): boolean {
        return this.idx >= this.toks.length;
    }
}
/**
 * A lexical token.
 *
 * `type` is the token kind: `"ident"`, `"int"`, `"str"`, or the token's own
 * text for keywords and operators. `value` is the matched text (for `"str"`
 * tokens, the contents with quotes removed and escapes resolved). `loc` is
 * the position where the token starts.
 */
export type Tok = { type: string; value: string; loc: Loc };
/**
 * Matches a word that is exactly one of the reserved keywords.
 *
 * The pattern is anchored at both ends so that identifiers which merely
 * start with a keyword (`letter`, `order`, `mutable`, ...) are not
 * classified as keywords.
 */
const keywordPattern =
    /^(?:fn|return|let|if|else|while|break|or|and|not|mut)$/;
/**
 * Matches one operator or punctuation token.
 *
 * Alternatives are tried left to right, so every multi-character operator
 * must come before any operator that is a prefix of it: `..=` before `..`,
 * and all of them before the single-character class at the end.
 */
const operatorPattern2 =
    /((?:\->)|(?:==)|(?:!=)|(?:<=)|(?:>=)|(?:\:\:<)|(?:<<)|(?:>>)|(?:\.\*)|(?:\.\.=)|(?:\.\.)|[\n\(\)\{\}\[\]\,\.\;\:\!\=\<\>\&\^\|\+\-\*\/\%])/g;
/**
 * Splits `text` into tokens.
 *
 * Rules are registered in priority order: whitespace and `//` comments
 * (dropped), operators, words (keywords or identifiers), integer literals,
 * string literals, and finally a catch-all that reports any other character
 * as illegal and produces no token for it.
 *
 * @param text Source text to tokenize.
 * @param reporter Receives an error for every illegal character.
 * @returns The tokens in source order.
 */
export function tokenize(text: string, reporter: FileReporter): Tok[] {
    // Escape letters that stand for a control character; any other escaped
    // character stands for itself.
    const escapeTable: Record<string, string> = {
        "0": "\0",
        "t": "\t",
        "r": "\r",
        "n": "\n",
    };

    // Drops the surrounding quotes of a string literal and resolves its
    // backslash escapes.
    const unquote = (literal: string): string => {
        let decoded = "";
        for (let pos = 1; pos < literal.length - 1; pos += 1) {
            if (literal[pos] !== "\\") {
                decoded += literal[pos];
                continue;
            }
            pos += 1;
            decoded += escapeTable[literal[pos]] ?? literal[pos];
        }
        return decoded;
    };

    const lexer = new Lexer();

    // Whitespace and line comments produce no tokens.
    lexer.add(/[ \t\r\n]+/, (_) => null);
    lexer.add(/\/\/[^\n]*/, (_) => null);

    // Operators and punctuation: the token type is the matched text.
    lexer.add(operatorPattern2, (loc, value) => ({ type: value, value, loc }));

    // Words: keywords use their own text as the type, the rest are "ident".
    lexer.add(/[a-zA-Z_][a-zA-Z0-9_]*/, (loc, value) => {
        if (keywordPattern.test(value)) {
            return { type: value, value, loc };
        }
        return { type: "ident", value, loc };
    });

    // Integer literals with an optional type suffix.
    lexer.add(
        /(?:0|(?:[1-9][0-9]*))(?:[iu](?:8|16|32|64|size))?/,
        (loc, value) => ({ type: "int", value, loc }),
    );

    // String literals: the token value is the unescaped contents.
    lexer.add(
        /"(?:[^\\"]|\\.)*"/,
        (loc, literal) => ({ type: "str", value: unquote(literal), loc }),
    );

    // Anything else is an illegal character: report it and emit nothing.
    lexer.add(/./, (loc, value) => {
        const quoted = JSON.stringify(value[0]);
        const escapedChar = quoted.slice(1, -1);
        reporter.error(loc, `illegal character '${escapedChar}'`);
        return null;
    });

    return lexer.lex(text);
}
/** One lexer rule, as stored by `Lexer`. */
type LexRule = {
// Regular expression tried at the current input position.
pattern: RegExp;
// Called with the start location and matched text when the pattern matches.
action: LexAction;
};
/**
 * Callback run for a matched lexeme: receives the location where the match
 * starts and the matched text, and returns the token to emit, or `null` to
 * emit nothing.
 */
type LexAction = (loc: Loc, match: string) => Tok | null;
/**
 * Small rule-driven lexer.
 *
 * Rules are tried in the order they were added; the first rule that matches
 * a non-empty piece of text at the current position wins.
 */
class Lexer {
    private rules: LexRule[] = [];

    /**
     * Registers a rule.
     *
     * The pattern is recompiled as a sticky regex so it can be matched at an
     * arbitrary offset without slicing the input. The caller's `g` and `y`
     * flags are dropped; other flags (e.g. `i`) are kept.
     *
     * @param pattern What the rule matches.
     * @param action Called for every match; returns a token or `null`.
     * @returns This lexer, so calls can be chained.
     */
    add(pattern: RegExp, action: LexAction): this {
        const flags = pattern.flags.replace(/[gy]/g, "") + "y";
        this.rules.push({
            pattern: new RegExp(`(?:${pattern.source})`, flags),
            action,
        });
        return this;
    }

    /**
     * Tokenizes `text`.
     *
     * Line and column are 1-based; a `"\n"` inside a match starts a new line.
     *
     * @param text Input to tokenize.
     * @returns The tokens produced by the rule actions, in input order.
     * @throws Error when no rule matches at some position.
     */
    lex(text: string): Tok[] {
        const toks: Tok[] = [];
        let idx = 0;
        let line = 1;
        let col = 1;
        while (idx < text.length) {
            const hit = this.matchAt(text, idx);
            if (hit === null) {
                throw new Error(`no rule for character '${text[idx]}'`);
            }
            const loc: Loc = { idx, line, col };
            for (let i = 0; i < hit.lexeme.length; i += 1) {
                if (hit.lexeme[i] === "\n") {
                    line += 1;
                    col = 1;
                } else {
                    col += 1;
                }
            }
            idx += hit.lexeme.length;
            const tok = hit.action(loc, hit.lexeme);
            if (tok) {
                toks.push(tok);
            }
        }
        return toks;
    }

    /**
     * Finds the first rule matching at `idx`. Empty matches are ignored:
     * accepting one would leave the cursor in place and loop forever.
     */
    private matchAt(
        text: string,
        idx: number,
    ): { lexeme: string; action: LexAction } | null {
        for (const rule of this.rules) {
            rule.pattern.lastIndex = idx;
            const match = rule.pattern.exec(text);
            if (match !== null && match[0].length > 0) {
                return { lexeme: match[0], action: rule.action };
            }
        }
        return null;
    }
}