new tokenizer

sfja 2025-12-11 03:03:48 +01:00
parent 9a7e263bed
commit 3f1408376b
3 changed files with 73 additions and 70 deletions

View File

@@ -3,7 +3,7 @@ fn main() -> int {
     let v: int = 123;
     let ch = 'c';
-    let s = "hello\ world";
+    let s = "hello world";
     inner();
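
A note on this sample change: the old tokenize() padded punctuation with blanks and then split the whole source on unescaped blanks, so a space inside a string literal had to be escaped as "\ " to survive the split. The new tokenizer matches a complete string literal in place, so the space no longer needs escaping. A minimal sketch of the difference, reusing the patterns from the tokenizer diff below (not part of the commit):

    const src = 'let s = "hello world";';
    // Old pipeline: pad punctuation, split on unescaped blanks; the plain
    // space inside the literal cuts the token in two.
    const padded = src.replace(/([^a-zA-Z0-9_'"\\ \t\r])/g, " $1 ");
    console.log(padded.split(/(?<!\\)[ \t\r]/).filter((t) => t !== ""));
    // => [ "let", "s", "=", '"hello', 'world"', ";" ]
    // New rule: one regex consumes the whole literal where it starts.
    console.log(src.slice(8).match(/^"(?:(?:\\.)|[^"])*"/s)?.[0]);
    // => '"hello world"'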

View File

@@ -6,7 +6,8 @@ import { Resolver } from "./resolve.ts";
 async function main() {
     const text = await Deno.readTextFile(Deno.args[0]);
     const toks = tokenize(text);
-    // console.log({ toks });
+    console.log({ toks });
+    return;
     const parser = new Parser(toks);
     const file = parser.parseFile();

View File

@@ -1,81 +1,83 @@
 export type Tok = {
     type: string;
+    idx: number;
     line: number;
+    col: number;
+    length: number;
     value?: string;
 };
-const keywords = new Set([
-    "true",
-    "false",
-    "bool",
-    "int",
-    "char",
-    "str",
-    "fn",
-    "let",
-]);
-type OpTree = Map<string, OpTree | null>;
-const opTreeRoot: OpTree = new Map(
-    Object.entries({
-        "-": new Map(Object.entries({
-            ">": null,
-        })),
-    }),
-);
+const keywords = ["true", "false", "bool", "int", "char", "str", "fn", "let"];
 export function tokenize(text: string): Tok[] {
-    return text
-        .replace(/\/\/[^\n]*/g, "")
-        .replace(/\/\*.*?\*\//gs, "")
-        .replace(/([^a-zA-Z0-9_'"\\ \t\r])/g, " $1 ")
-        .split(/(?<!\\)[ \t\r]/)
-        .filter((tok) => tok !== "")
-        .map((tok) => tok.replace(/\\ /g, " "))
-        .reduce<[string[], OpTree]>(([toks, opTree], tok) => {
-            if (toks.length === 0) {
-                toks.push(tok);
-                return [toks, opTree];
-            }
-            const last = toks.at(-1)!;
-            if (!opTree.has(last)) {
-                toks.push(tok);
-                return [toks, opTreeRoot];
-            }
-            if (opTree.get(last) === null) {
-                toks.push(tok);
-                return [toks, opTreeRoot];
-            } else if (opTree.get(last)!.has(tok)) {
-                toks[toks.length - 1] += tok;
-                return [toks, opTree.get(last)!];
-            }
-            toks.push(tok);
-            return [toks, opTreeRoot];
-        }, [[], opTreeRoot])[0]
-        .slice(0, -1)
-        .reduce<[Tok[], number]>(([toks, line], type) => {
-            if (type === "\n") {
-                return [toks, line + 1];
-            } else {
-                toks.push({ type, line });
-                return [toks, line];
-            }
-        }, [[], 1])[0]
-        .map((tok) => {
-            if (
-                /^[a-zA-Z_][a-zA-Z0-9_]*$/.test(tok.type) &&
-                !keywords.has(tok.type)
-            ) {
-                return { type: "ident", line: tok.line, value: tok.type };
-            } else if (/^[0-9_]+$/.test(tok.type)) {
-                return { type: "int", line: tok.line, value: tok.type };
-            } else if (/^'.*?'$/.test(tok.type)) {
-                return { type: "char", line: tok.line, value: tok.type };
-            } else if (/^".*?"$/.test(tok.type)) {
-                return { type: "str", line: tok.line, value: tok.type };
-            } else {
-                return tok;
-            }
-        });
+    // Rules are tried in order at the current position; "ignore" rules
+    // advance past trivia without emitting a token.
+    const rules: Record<string, { match: RegExp; ignore?: boolean }> = {
+        "whitespace": { match: /^[ \t\r]+/, ignore: true },
+        "newline": { match: /^\n/s, ignore: true },
+        "linecomment": { match: /^\/\/[^\n]*/, ignore: true },
+        "blockcomment": { match: /^\/\*.*?\*\//s, ignore: true },
+        _keywords: {
+            match: new RegExp(
+                `^(${keywords.map((s) => `(?:${s})`).join("|")})`,
+            ),
+        },
+        _identity: {
+            match: new RegExp(
+                `^(?:(?:\-\>)|[${RegExp.escape("()[]{}+-*/,.;:!=<>&|?")}])`,
+            ),
+        },
+        "ident": { match: /^[a-zA-Z_][a-zA-Z0-9_]*/ },
+        "int": { match: /^[0-9_]+/ },
+        "char": { match: /^'(?:(?:\\.)|[^'\n])'/ },
+        "str": { match: /^"(?:(?:\\.)|[^"])*"/s },
+    };
+    const toks: Tok[] = [];
+    let idx = 0;
+    let line = 1;
+    let col = 1;
+    while (idx < text.length) {
+        let found = false;
+        for (const [id, rule] of Object.entries(rules)) {
+            const match = text.slice(idx).match(rule.match);
+            if (!match) {
+                continue;
+            }
+            found = true;
+            const length = match[0].length;
+            // Record the token at its starting position, then advance.
+            const tok: Tok = { type: id, idx, line, col, length };
+            idx += length;
+            if (rule.match.dotAll && /\n/.test(match[0])) {
+                line += (match[0].match(/\n/g) ?? []).length;
+                col = match[0].length - match[0].lastIndexOf("\n");
+            } else {
+                col += length;
+            }
+            if (rule.ignore) continue;
+            if (id === "_keywords" || id === "_identity") {
+                tok.type = match[0];
+            } else {
+                tok.value = match[0];
+            }
+            toks.push(tok);
+            break;
+        }
+        if (!found) {
+            printError(line, `invalid character '${text[idx]}'`);
+            idx += 1;
+        }
+    }
+    return toks;
 }
+
+function printError(line: number, message: string) {
+    console.error(
+        `%cerror%c: ${message}\n %c--> line ${line}%c`,
+        "font-weight: bold; color: red",
+        "font-weight: bold; color: white",
+        "color: cyan",
+        "",
+    );
+}
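
For reference, a sketch of what the console.log({ toks }) enabled in the second file would print for the sample program's first line, assuming tokenize() is imported from this module (the file path is not shown in this view) and the start-of-token bookkeeping above:

    const toks = tokenize("fn main() -> int {");
    // Keyword and operator tokens take the matched text itself as their
    // type (via _keywords/_identity); the rest carry a value string:
    //   { type: "fn",    idx: 0,  line: 1, col: 1,  length: 2 }
    //   { type: "ident", idx: 3,  line: 1, col: 4,  length: 4, value: "main" }
    //   { type: "(",     idx: 7,  line: 1, col: 8,  length: 1 }
    //   { type: ")",     idx: 8,  line: 1, col: 9,  length: 1 }
    //   { type: "->",    idx: 10, line: 1, col: 11, length: 2 }
    //   { type: "int",   idx: 13, line: 1, col: 14, length: 3 }
    //   { type: "{",     idx: 17, line: 1, col: 18, length: 1 }
    console.log({ toks });

The arrow arrives as a single "->" token because the _identity pattern tries its (?:\-\>) alternative before the single-character class; this replaces the old OpTree machinery that merged a "-" token with a following ">".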