new tokenizer
This commit is contained in:
parent
9a7e263bed
commit
3f1408376b
@ -3,7 +3,7 @@ fn main() -> int {
|
||||
let v: int = 123;
|
||||
|
||||
let ch = 'c';
|
||||
let s = "hello\ world";
|
||||
let s = "hello world";
|
||||
|
||||
inner();
|
||||
|
||||
|
||||
@ -6,7 +6,8 @@ import { Resolver } from "./resolve.ts";
|
||||
async function main() {
|
||||
const text = await Deno.readTextFile(Deno.args[0]);
|
||||
const toks = tokenize(text);
|
||||
// console.log({ toks });
|
||||
console.log({ toks });
|
||||
return;
|
||||
|
||||
const parser = new Parser(toks);
|
||||
const file = parser.parseFile();
|
||||
|
||||
136
src/tok.ts
136
src/tok.ts
@ -1,81 +1,83 @@
|
||||
export type Tok = {
|
||||
type: string;
|
||||
idx: number;
|
||||
line: number;
|
||||
col: number;
|
||||
length: number;
|
||||
value?: string;
|
||||
};
|
||||
|
||||
const keywords = new Set([
|
||||
"true",
|
||||
"false",
|
||||
"bool",
|
||||
"int",
|
||||
"char",
|
||||
"str",
|
||||
"fn",
|
||||
"let",
|
||||
]);
|
||||
|
||||
type OpTree = Map<string, OpTree | null>;
|
||||
const opTreeRoot: OpTree = new Map(
|
||||
Object.entries({
|
||||
"-": new Map(Object.entries({
|
||||
">": null,
|
||||
})),
|
||||
}),
|
||||
);
|
||||
const keywords = ["true", "false", "bool", "int", "char", "str", "fn", "let"];
|
||||
|
||||
export function tokenize(text: string): Tok[] {
|
||||
return text
|
||||
.replace(/\/\/[^\n]*/g, "")
|
||||
.replace(/\/\*.*?\*\//gs, "")
|
||||
.replace(/([^a-zA-Z0-9_'"\\ \t\r])/g, " $1 ")
|
||||
.split(/(?<!\\)[ \t\r]/)
|
||||
.filter((tok) => tok !== "")
|
||||
.map((tok) => tok.replace(/\\ /g, " "))
|
||||
.reduce<[string[], OpTree]>(([toks, opTree], tok) => {
|
||||
if (toks.length === 0) {
|
||||
toks.push(tok);
|
||||
return [toks, opTree];
|
||||
const rules: Record<string, { match: RegExp; ignore?: boolean }> = {
|
||||
"whitespace": { match: /^[ \t\r]+/, ignore: true },
|
||||
"newline": { match: /^\n/s, ignore: true },
|
||||
"linecomment": { match: /^\/\/[^\n]*/, ignore: true },
|
||||
"blockcomment": { match: /^\/\*.*?\*\//s, ignore: true },
|
||||
_keywords: {
|
||||
match: new RegExp(
|
||||
`^(${keywords.map((s) => `(?:${s})`).join("|")})`,
|
||||
),
|
||||
},
|
||||
_identity: {
|
||||
match: new RegExp(
|
||||
`^(?:(?:\-\>)|[${RegExp.escape("()[]{}+-*/,.;:!=<>&|?")}])`,
|
||||
),
|
||||
},
|
||||
"ident": { match: /^[a-zA-Z_][a-zA-Z0-9_]*/ },
|
||||
"int": { match: /^[0-9_]+/ },
|
||||
"char": { match: /^'(?:(?:\\.)|[^'\n])'/ },
|
||||
"str": { match: /^"(?:(?:\\.)|[^"])*"/s },
|
||||
};
|
||||
|
||||
const toks: Tok[] = [];
|
||||
let idx = 0;
|
||||
let line = 1;
|
||||
let col = 1;
|
||||
while (idx < text.length) {
|
||||
let found = false;
|
||||
for (const [id, rule] of Object.entries(rules)) {
|
||||
const match = text.slice(idx).match(rule.match);
|
||||
if (!match) {
|
||||
continue;
|
||||
}
|
||||
const last = toks.at(-1)!;
|
||||
if (!opTree.has(last)) {
|
||||
toks.push(tok);
|
||||
return [toks, opTreeRoot];
|
||||
}
|
||||
if (opTree.get(last) === null) {
|
||||
toks.push(tok);
|
||||
return [toks, opTreeRoot];
|
||||
} else if (opTree.get(last)!.has(tok)) {
|
||||
toks[toks.length - 1] += tok;
|
||||
return [toks, opTree.get(last)!];
|
||||
found = true;
|
||||
|
||||
idx += match[0].length;
|
||||
if (rule.match.dotAll && /\n/.test(match[0])) {
|
||||
line += (match[0].match(/\n/g) ?? []).length;
|
||||
col = match[0].length - match[0].lastIndexOf("\n");
|
||||
} else {
|
||||
col += match[0].length;
|
||||
}
|
||||
|
||||
if (rule.ignore) continue;
|
||||
|
||||
const length = match[0].length;
|
||||
const tok: Tok = { type: id, idx, line, col, length };
|
||||
if (id === "_keywords" || id === "_identity") {
|
||||
tok.type = match[0];
|
||||
} else {
|
||||
tok.value = match[0];
|
||||
}
|
||||
toks.push(tok);
|
||||
return [toks, opTreeRoot];
|
||||
break;
|
||||
}
|
||||
}, [[], opTreeRoot])[0]
|
||||
.slice(0, -1)
|
||||
.reduce<[Tok[], number]>(([toks, line], type) => {
|
||||
if (type === "\n") {
|
||||
return [toks, line + 1];
|
||||
} else {
|
||||
toks.push({ type, line });
|
||||
return [toks, line];
|
||||
if (!found) {
|
||||
printError(line, `invalid character '${text[idx]}'`);
|
||||
idx += 1;
|
||||
}
|
||||
}, [[], 1])[0]
|
||||
.map((tok) => {
|
||||
if (
|
||||
/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(tok.type) &&
|
||||
!keywords.has(tok.type)
|
||||
) {
|
||||
return { type: "ident", line: tok.line, value: tok.type };
|
||||
} else if (/^[0-9_]+$/.test(tok.type)) {
|
||||
return { type: "int", line: tok.line, value: tok.type };
|
||||
} else if (/^'.*?'$/.test(tok.type)) {
|
||||
return { type: "char", line: tok.line, value: tok.type };
|
||||
} else if (/^".*?"$/.test(tok.type)) {
|
||||
return { type: "str", line: tok.line, value: tok.type };
|
||||
} else {
|
||||
return tok;
|
||||
}
|
||||
});
|
||||
|
||||
return toks;
|
||||
}
|
||||
|
||||
/**
 * Prints a formatted error message together with the line it refers to.
 *
 * The message is emitted through `console.error` using `%c` style directives:
 * the `error` label is bold red, the message text is bold white, and the
 * `--> line N` location marker is cyan. The trailing empty style string resets
 * styling after the location marker.
 *
 * @param line - Line number shown in the `--> line N` location marker.
 * @param message - Description of the problem, printed after the `error:` label.
 */
function printError(line: number, message: string) {
    console.error(
        `%cerror%c: ${message}\n %c--> line ${line}%c`,
        "font-weight: bold; color: red",
        // Previously "color: while" — not a valid CSS colour, so the intended
        // white styling was never applied.
        "font-weight: bold; color: white",
        "color: cyan",
        "",
    );
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user