new tokenizer
This commit is contained in:
parent
9a7e263bed
commit
3f1408376b
@ -3,7 +3,7 @@ fn main() -> int {
|
|||||||
let v: int = 123;
|
let v: int = 123;
|
||||||
|
|
||||||
let ch = 'c';
|
let ch = 'c';
|
||||||
let s = "hello\ world";
|
let s = "hello world";
|
||||||
|
|
||||||
inner();
|
inner();
|
||||||
|
|
||||||
|
|||||||
@ -6,7 +6,8 @@ import { Resolver } from "./resolve.ts";
|
|||||||
async function main() {
|
async function main() {
|
||||||
const text = await Deno.readTextFile(Deno.args[0]);
|
const text = await Deno.readTextFile(Deno.args[0]);
|
||||||
const toks = tokenize(text);
|
const toks = tokenize(text);
|
||||||
// console.log({ toks });
|
console.log({ toks });
|
||||||
|
return;
|
||||||
|
|
||||||
const parser = new Parser(toks);
|
const parser = new Parser(toks);
|
||||||
const file = parser.parseFile();
|
const file = parser.parseFile();
|
||||||
|
|||||||
138
src/tok.ts
138
src/tok.ts
@ -1,81 +1,83 @@
|
|||||||
export type Tok = {
|
export type Tok = {
|
||||||
type: string;
|
type: string;
|
||||||
|
idx: number;
|
||||||
line: number;
|
line: number;
|
||||||
|
col: number;
|
||||||
|
length: number;
|
||||||
value?: string;
|
value?: string;
|
||||||
};
|
};
|
||||||
|
const keywords = ["true", "false", "bool", "int", "char", "str", "fn", "let"];
|
||||||
const keywords = new Set([
|
|
||||||
"true",
|
|
||||||
"false",
|
|
||||||
"bool",
|
|
||||||
"int",
|
|
||||||
"char",
|
|
||||||
"str",
|
|
||||||
"fn",
|
|
||||||
"let",
|
|
||||||
]);
|
|
||||||
|
|
||||||
type OpTree = Map<string, OpTree | null>;
|
|
||||||
const opTreeRoot: OpTree = new Map(
|
|
||||||
Object.entries({
|
|
||||||
"-": new Map(Object.entries({
|
|
||||||
">": null,
|
|
||||||
})),
|
|
||||||
}),
|
|
||||||
);
|
|
||||||
|
|
||||||
export function tokenize(text: string): Tok[] {
|
export function tokenize(text: string): Tok[] {
|
||||||
return text
|
const rules: Record<string, { match: RegExp; ignore?: boolean }> = {
|
||||||
.replace(/\/\/[^\n]*/g, "")
|
"whitespace": { match: /^[ \t\r]+/, ignore: true },
|
||||||
.replace(/\/\*.*?\*\//gs, "")
|
"newline": { match: /^\n/s, ignore: true },
|
||||||
.replace(/([^a-zA-Z0-9_'"\\ \t\r])/g, " $1 ")
|
"linecomment": { match: /^\/\/[^\n]*/, ignore: true },
|
||||||
.split(/(?<!\\)[ \t\r]/)
|
"blockcomment": { match: /^\/\*.*?\*\//s, ignore: true },
|
||||||
.filter((tok) => tok !== "")
|
_keywords: {
|
||||||
.map((tok) => tok.replace(/\\ /g, " "))
|
match: new RegExp(
|
||||||
.reduce<[string[], OpTree]>(([toks, opTree], tok) => {
|
`^(${keywords.map((s) => `(?:${s})`).join("|")})`,
|
||||||
if (toks.length === 0) {
|
),
|
||||||
toks.push(tok);
|
},
|
||||||
return [toks, opTree];
|
_identity: {
|
||||||
|
match: new RegExp(
|
||||||
|
`^(?:(?:\-\>)|[${RegExp.escape("()[]{}+-*/,.;:!=<>&|?")}])`,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
"ident": { match: /^[a-zA-Z_][a-zA-Z0-9_]*/ },
|
||||||
|
"int": { match: /^[0-9_]+/ },
|
||||||
|
"char": { match: /^'(?:(?:\\.)|[^'\n])'/ },
|
||||||
|
"str": { match: /^"(?:(?:\\.)|[^"])*"/s },
|
||||||
|
};
|
||||||
|
|
||||||
|
const toks: Tok[] = [];
|
||||||
|
let idx = 0;
|
||||||
|
let line = 1;
|
||||||
|
let col = 1;
|
||||||
|
while (idx < text.length) {
|
||||||
|
let found = false;
|
||||||
|
for (const [id, rule] of Object.entries(rules)) {
|
||||||
|
const match = text.slice(idx).match(rule.match);
|
||||||
|
if (!match) {
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
const last = toks.at(-1)!;
|
found = true;
|
||||||
if (!opTree.has(last)) {
|
|
||||||
toks.push(tok);
|
idx += match[0].length;
|
||||||
return [toks, opTreeRoot];
|
if (rule.match.dotAll && /\n/.test(match[0])) {
|
||||||
}
|
line += (match[0].match(/\n/g) ?? []).length;
|
||||||
if (opTree.get(last) === null) {
|
col = match[0].length - match[0].lastIndexOf("\n");
|
||||||
toks.push(tok);
|
|
||||||
return [toks, opTreeRoot];
|
|
||||||
} else if (opTree.get(last)!.has(tok)) {
|
|
||||||
toks[toks.length - 1] += tok;
|
|
||||||
return [toks, opTree.get(last)!];
|
|
||||||
} else {
|
} else {
|
||||||
toks.push(tok);
|
col += match[0].length;
|
||||||
return [toks, opTreeRoot];
|
|
||||||
}
|
}
|
||||||
}, [[], opTreeRoot])[0]
|
|
||||||
.slice(0, -1)
|
if (rule.ignore) continue;
|
||||||
.reduce<[Tok[], number]>(([toks, line], type) => {
|
|
||||||
if (type === "\n") {
|
const length = match[0].length;
|
||||||
return [toks, line + 1];
|
const tok: Tok = { type: id, idx, line, col, length };
|
||||||
|
if (id === "_keywords" || id === "_identity") {
|
||||||
|
tok.type = match[0];
|
||||||
} else {
|
} else {
|
||||||
toks.push({ type, line });
|
tok.value = match[0];
|
||||||
return [toks, line];
|
|
||||||
}
|
}
|
||||||
}, [[], 1])[0]
|
toks.push(tok);
|
||||||
.map((tok) => {
|
break;
|
||||||
if (
|
}
|
||||||
/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(tok.type) &&
|
if (!found) {
|
||||||
!keywords.has(tok.type)
|
printError(line, `invalid character '${text[idx]}'`);
|
||||||
) {
|
idx += 1;
|
||||||
return { type: "ident", line: tok.line, value: tok.type };
|
}
|
||||||
} else if (/^[0-9_]+$/.test(tok.type)) {
|
}
|
||||||
return { type: "int", line: tok.line, value: tok.type };
|
|
||||||
} else if (/^'.*?'$/.test(tok.type)) {
|
return toks;
|
||||||
return { type: "char", line: tok.line, value: tok.type };
|
}
|
||||||
} else if (/^".*?"$/.test(tok.type)) {
|
|
||||||
return { type: "str", line: tok.line, value: tok.type };
|
function printError(line: number, message: string) {
|
||||||
} else {
|
console.error(
|
||||||
return tok;
|
`%cerror%c: ${message}\n %c--> line ${line}%c`,
|
||||||
}
|
"font-weight: bold; color: red",
|
||||||
});
|
"font-weight: bold; color: while",
|
||||||
|
"color: cyan",
|
||||||
|
"",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user