new tokenizer

sfja 2025-12-11 03:03:48 +01:00
parent 9a7e263bed
commit 3f1408376b
3 changed files with 73 additions and 70 deletions

View File

@@ -3,7 +3,7 @@ fn main() -> int {
     let v: int = 123;
     let ch = 'c';
-    let s = "hello\ world";
+    let s = "hello world";
     inner();

View File

@ -6,7 +6,8 @@ import { Resolver } from "./resolve.ts";
async function main() { async function main() {
const text = await Deno.readTextFile(Deno.args[0]); const text = await Deno.readTextFile(Deno.args[0]);
const toks = tokenize(text); const toks = tokenize(text);
// console.log({ toks }); console.log({ toks });
return;
const parser = new Parser(toks); const parser = new Parser(toks);
const file = parser.parseFile(); const file = parser.parseFile();

View File

@@ -1,81 +1,83 @@
 export type Tok = {
   type: string;
+  idx: number;
   line: number;
+  col: number;
+  length: number;
   value?: string;
 };

-const keywords = new Set([
-  "true",
-  "false",
-  "bool",
-  "int",
-  "char",
-  "str",
-  "fn",
-  "let",
-]);
-
-type OpTree = Map<string, OpTree | null>;
-
-const opTreeRoot: OpTree = new Map(
-  Object.entries({
-    "-": new Map(Object.entries({
-      ">": null,
-    })),
-  }),
-);
+const keywords = ["true", "false", "bool", "int", "char", "str", "fn", "let"];

 export function tokenize(text: string): Tok[] {
-  return text
-    .replace(/\/\/[^\n]*/g, "")
-    .replace(/\/\*.*?\*\//gs, "")
-    .replace(/([^a-zA-Z0-9_'"\\ \t\r])/g, " $1 ")
-    .split(/(?<!\\)[ \t\r]/)
-    .filter((tok) => tok !== "")
-    .map((tok) => tok.replace(/\\ /g, " "))
-    .reduce<[string[], OpTree]>(([toks, opTree], tok) => {
-      if (toks.length === 0) {
-        toks.push(tok);
-        return [toks, opTree];
-      }
-      const last = toks.at(-1)!;
-      if (!opTree.has(last)) {
-        toks.push(tok);
-        return [toks, opTreeRoot];
-      }
-      if (opTree.get(last) === null) {
-        toks.push(tok);
-        return [toks, opTreeRoot];
-      } else if (opTree.get(last)!.has(tok)) {
-        toks[toks.length - 1] += tok;
-        return [toks, opTree.get(last)!];
-      } else {
-        toks.push(tok);
-        return [toks, opTreeRoot];
-      }
-    }, [[], opTreeRoot])[0]
-    .slice(0, -1)
-    .reduce<[Tok[], number]>(([toks, line], type) => {
-      if (type === "\n") {
-        return [toks, line + 1];
-      } else {
-        toks.push({ type, line });
-        return [toks, line];
-      }
-    }, [[], 1])[0]
-    .map((tok) => {
-      if (
-        /^[a-zA-Z_][a-zA-Z0-9_]*$/.test(tok.type) &&
-        !keywords.has(tok.type)
-      ) {
-        return { type: "ident", line: tok.line, value: tok.type };
-      } else if (/^[0-9_]+$/.test(tok.type)) {
-        return { type: "int", line: tok.line, value: tok.type };
-      } else if (/^'.*?'$/.test(tok.type)) {
-        return { type: "char", line: tok.line, value: tok.type };
-      } else if (/^".*?"$/.test(tok.type)) {
-        return { type: "str", line: tok.line, value: tok.type };
-      } else {
-        return tok;
-      }
-    });
+  const rules: Record<string, { match: RegExp; ignore?: boolean }> = {
+    "whitespace": { match: /^[ \t\r]+/, ignore: true },
+    "newline": { match: /^\n/s, ignore: true },
+    "linecomment": { match: /^\/\/[^\n]*/, ignore: true },
+    "blockcomment": { match: /^\/\*.*?\*\//s, ignore: true },
+    _keywords: {
+      match: new RegExp(
+        `^(${keywords.map((s) => `(?:${s})`).join("|")})`,
+      ),
+    },
+    _identity: {
+      match: new RegExp(
+        `^(?:(?:\-\>)|[${RegExp.escape("()[]{}+-*/,.;:!=<>&|?")}])`,
+      ),
+    },
+    "ident": { match: /^[a-zA-Z_][a-zA-Z0-9_]*/ },
+    "int": { match: /^[0-9_]+/ },
+    "char": { match: /^'(?:(?:\\.)|[^'\n])'/ },
+    "str": { match: /^"(?:(?:\\.)|[^"])*"/s },
+  };
+
+  const toks: Tok[] = [];
+  let idx = 0;
+  let line = 1;
+  let col = 1;
+  while (idx < text.length) {
+    let found = false;
+    for (const [id, rule] of Object.entries(rules)) {
+      const match = text.slice(idx).match(rule.match);
+      if (!match) {
+        continue;
+      }
+      found = true;
+
+      idx += match[0].length;
+      if (rule.match.dotAll && /\n/.test(match[0])) {
+        line += (match[0].match(/\n/g) ?? []).length;
+        col = match[0].length - match[0].lastIndexOf("\n");
+      } else {
+        col += match[0].length;
+      }
+
+      if (rule.ignore) continue;
+
+      const length = match[0].length;
+      const tok: Tok = { type: id, idx, line, col, length };
+      if (id === "_keywords" || id === "_identity") {
+        tok.type = match[0];
+      } else {
+        tok.value = match[0];
+      }
+      toks.push(tok);
+      break;
+    }
+    if (!found) {
+      printError(line, `invalid character '${text[idx]}'`);
+      idx += 1;
+    }
+  }
+
+  return toks;
+}
+
+function printError(line: number, message: string) {
+  console.error(
+    `%cerror%c: ${message}\n %c--> line ${line}%c`,
+    "font-weight: bold; color: red",
+    "font-weight: bold; color: white",
+    "color: cyan",
+    "",
+  );
 }
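
For context, a minimal usage sketch of the new rule-based tokenizer. This is not part of the commit: the module path "./tokenize.ts" is assumed, the expected token shapes are inferred from the rules in the diff above, and it assumes a runtime where RegExp.escape is available (the _identity rule relies on it).

// Hypothetical usage sketch; the import path "./tokenize.ts" is an assumption.
import { tokenize } from "./tokenize.ts";

// The tokenizer scans left to right, trying each rule's anchored regex at the
// current index and emitting one Tok per match; `ignore` rules only advance
// the position (whitespace, newlines, comments).
const toks = tokenize(`let v: int = 123; // trailing comment`);
console.log(toks);
// Based on the rules above, this should yield keyword/punctuation tokens such
// as { type: "let", ... } and { type: ";", ... }, plus value-carrying tokens
// like { type: "ident", value: "v", ... } and { type: "int", value: "123", ... };
// the whitespace and the line comment are skipped.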