new tokenizer

sfja 2025-12-11 03:03:48 +01:00
parent 9a7e263bed
commit 3f1408376b
3 changed files with 73 additions and 70 deletions

View File

@@ -3,7 +3,7 @@ fn main() -> int {
     let v: int = 123;
     let ch = 'c';
-    let s = "hello\ world";
+    let s = "hello world";
     inner();
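
A note on this sample change: the old tokenize() padded punctuation with blanks and then split the whole source on unescaped blanks, so a space inside a string literal had to be escaped as "\ " to survive the split. The new tokenizer matches a complete string literal in place, so the space no longer needs escaping. A minimal sketch of the difference, reusing the patterns from the tokenizer diff below (not part of the commit):

    const src = 'let s = "hello world";';
    // Old pipeline: pad punctuation, split on unescaped blanks; the plain
    // space inside the literal cuts the token in two.
    const padded = src.replace(/([^a-zA-Z0-9_'"\\ \t\r])/g, " $1 ");
    console.log(padded.split(/(?<!\\)[ \t\r]/).filter((t) => t !== ""));
    // => [ "let", "s", "=", '"hello', 'world"', ";" ]
    // New rule: one regex consumes the whole literal where it starts.
    console.log(src.slice(8).match(/^"(?:(?:\\.)|[^"])*"/s)?.[0]);
    // => '"hello world"'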

View File

@@ -6,7 +6,8 @@ import { Resolver } from "./resolve.ts";
 async function main() {
     const text = await Deno.readTextFile(Deno.args[0]);
     const toks = tokenize(text);
-    // console.log({ toks });
+    console.log({ toks });
+    return;
     const parser = new Parser(toks);
     const file = parser.parseFile();

View File

@@ -1,81 +1,83 @@
 export type Tok = {
     type: string;
+    idx: number;
     line: number;
+    col: number;
+    length: number;
     value?: string;
 };
-const keywords = new Set([
-    "true",
-    "false",
-    "bool",
-    "int",
-    "char",
-    "str",
-    "fn",
-    "let",
-]);
-type OpTree = Map<string, OpTree | null>;
-const opTreeRoot: OpTree = new Map(
-    Object.entries({
-        "-": new Map(Object.entries({
-            ">": null,
-        })),
-    }),
-);
+const keywords = ["true", "false", "bool", "int", "char", "str", "fn", "let"];
 export function tokenize(text: string): Tok[] {
-    return text
-        .replace(/\/\/[^\n]*/g, "")
-        .replace(/\/\*.*?\*\//gs, "")
-        .replace(/([^a-zA-Z0-9_'"\\ \t\r])/g, " $1 ")
-        .split(/(?<!\\)[ \t\r]/)
-        .filter((tok) => tok !== "")
-        .map((tok) => tok.replace(/\\ /g, " "))
-        .reduce<[string[], OpTree]>(([toks, opTree], tok) => {
-            if (toks.length === 0) {
-                toks.push(tok);
-                return [toks, opTree];
-            }
-            const last = toks.at(-1)!;
-            if (!opTree.has(last)) {
-                toks.push(tok);
-                return [toks, opTreeRoot];
-            }
-            if (opTree.get(last) === null) {
-                toks.push(tok);
-                return [toks, opTreeRoot];
-            } else if (opTree.get(last)!.has(tok)) {
-                toks[toks.length - 1] += tok;
-                return [toks, opTree.get(last)!];
-            }
-            toks.push(tok);
-            return [toks, opTreeRoot];
-        }, [[], opTreeRoot])[0]
-        .slice(0, -1)
-        .reduce<[Tok[], number]>(([toks, line], type) => {
-            if (type === "\n") {
-                return [toks, line + 1];
-            } else {
-                toks.push({ type, line });
-                return [toks, line];
-            }
-        }, [[], 1])[0]
-        .map((tok) => {
-            if (
-                /^[a-zA-Z_][a-zA-Z0-9_]*$/.test(tok.type) &&
-                !keywords.has(tok.type)
-            ) {
-                return { type: "ident", line: tok.line, value: tok.type };
-            } else if (/^[0-9_]+$/.test(tok.type)) {
-                return { type: "int", line: tok.line, value: tok.type };
-            } else if (/^'.*?'$/.test(tok.type)) {
-                return { type: "char", line: tok.line, value: tok.type };
-            } else if (/^".*?"$/.test(tok.type)) {
-                return { type: "str", line: tok.line, value: tok.type };
-            } else {
-                return tok;
-            }
-        });
+    // Rules are tried in order at the current position; "ignore" rules
+    // advance past trivia without emitting a token.
+    const rules: Record<string, { match: RegExp; ignore?: boolean }> = {
+        "whitespace": { match: /^[ \t\r]+/, ignore: true },
+        "newline": { match: /^\n/s, ignore: true },
+        "linecomment": { match: /^\/\/[^\n]*/, ignore: true },
+        "blockcomment": { match: /^\/\*.*?\*\//s, ignore: true },
+        _keywords: {
+            match: new RegExp(
+                `^(${keywords.map((s) => `(?:${s})`).join("|")})`,
+            ),
+        },
+        _identity: {
+            match: new RegExp(
+                `^(?:(?:\-\>)|[${RegExp.escape("()[]{}+-*/,.;:!=<>&|?")}])`,
+            ),
+        },
+        "ident": { match: /^[a-zA-Z_][a-zA-Z0-9_]*/ },
+        "int": { match: /^[0-9_]+/ },
+        "char": { match: /^'(?:(?:\\.)|[^'\n])'/ },
+        "str": { match: /^"(?:(?:\\.)|[^"])*"/s },
+    };
+    const toks: Tok[] = [];
+    let idx = 0;
+    let line = 1;
+    let col = 1;
+    while (idx < text.length) {
+        let found = false;
+        for (const [id, rule] of Object.entries(rules)) {
+            const match = text.slice(idx).match(rule.match);
+            if (!match) {
+                continue;
+            }
+            found = true;
+            const length = match[0].length;
+            // Record the token at its starting position, then advance.
+            const tok: Tok = { type: id, idx, line, col, length };
+            idx += length;
+            if (rule.match.dotAll && /\n/.test(match[0])) {
+                line += (match[0].match(/\n/g) ?? []).length;
+                col = match[0].length - match[0].lastIndexOf("\n");
+            } else {
+                col += length;
+            }
+            if (rule.ignore) continue;
+            if (id === "_keywords" || id === "_identity") {
+                tok.type = match[0];
+            } else {
+                tok.value = match[0];
+            }
+            toks.push(tok);
+            break;
+        }
+        if (!found) {
+            printError(line, `invalid character '${text[idx]}'`);
+            idx += 1;
+        }
+    }
+    return toks;
 }
+
+function printError(line: number, message: string) {
+    console.error(
+        `%cerror%c: ${message}\n %c--> line ${line}%c`,
+        "font-weight: bold; color: red",
+        "font-weight: bold; color: white",
+        "color: cyan",
+        "",
+    );
+}
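
For reference, a sketch of what the console.log({ toks }) enabled in the second file would print for the sample program's first line, assuming tokenize() is imported from this module (the file path is not shown in this view) and the start-of-token bookkeeping above:

    const toks = tokenize("fn main() -> int {");
    // Keyword and operator tokens take the matched text itself as their
    // type (via _keywords/_identity); the rest carry a value string:
    //   { type: "fn",    idx: 0,  line: 1, col: 1,  length: 2 }
    //   { type: "ident", idx: 3,  line: 1, col: 4,  length: 4, value: "main" }
    //   { type: "(",     idx: 7,  line: 1, col: 8,  length: 1 }
    //   { type: ")",     idx: 8,  line: 1, col: 9,  length: 1 }
    //   { type: "->",    idx: 10, line: 1, col: 11, length: 2 }
    //   { type: "int",   idx: 13, line: 1, col: 14, length: 3 }
    //   { type: "{",     idx: 17, line: 1, col: 18, length: 1 }
    console.log({ toks });

The arrow arrives as a single "->" token because the _identity pattern tries its (?:\-\>) alternative before the single-character class; this replaces the old OpTree machinery that merged a "-" token with a following ">".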