json-parser-in-c-2/src/json_parse.c
2026-03-19 23:55:05 +01:00

340 lines
8.7 KiB
C

#include "collections.h"
#include "json.h"
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Position of a token (or of an error) inside the source text. */
struct loc {
/* Byte offset into the text. */
size_t idx;
/* Line number; the tokenizer starts counting at 1. */
int line;
/* Column number; the tokenizer starts at 1 and resets to 1 after a newline. */
int col;
};
/*
 * Print a diagnostic to stderr: the message itself and, when `text` is
 * given, the source line containing `loc` with a caret under column
 * `loc.col`.
 *
 *   loc       position the diagnostic refers to
 *   message   human-readable description, printed after "error: "
 *   text      source buffer, or NULL to print the message only
 *   text_len  number of valid bytes in `text`
 *
 * The whole diagnostic goes to one stream so that message and snippet
 * cannot be separated or reordered by differing buffering.
 */
static void report(
    struct loc loc, char const *message, char const *text, size_t text_len)
{
    fprintf(stderr, "error: %s\n", message);
    if (!text)
        return;

    /* An end-of-input location has idx == text_len; clamp so that no byte
     * outside [0, text_len) is ever inspected. */
    size_t at = loc.idx < text_len ? loc.idx : text_len;

    /* Walk back to the first byte after the previous newline. */
    size_t line_begin = at;
    while (line_begin > 0 && text[line_begin - 1] != '\n') {
        line_begin -= 1;
    }

    /* Walk forward to the newline that ends the line (or end of input). */
    size_t line_end = at;
    while (line_end < text_len && text[line_end] != '\n') {
        line_end += 1;
    }

    int linenr_width = snprintf(NULL, 0, "%d", loc.line);
    if (linenr_width < 0)
        linenr_width = 0;

    /* A negative field width would mean "left-justify", so never pass one. */
    int caret_pad = loc.col > 1 ? loc.col - 1 : 0;

    /* "%*s" with an empty string emits exactly `width` spaces, so the
     * gutter and the caret offset are not limited by the length of a
     * fixed padding string. */
    fprintf(stderr,
        "%*s|\n"
        "%d|%.*s\n"
        "%*s|%*s^\n"
        "%*s|\n",
        linenr_width,
        "",
        loc.line,
        (int)(line_end - line_begin),
        &text[line_begin],
        linenr_width,
        "",
        caret_pad,
        "",
        linenr_width,
        "");
}
/*
 * Token kinds produced by the tokenizer.
 *
 * The keyword kinds reuse the json_null/json_false/json_true values so that
 * parser_parse can cast them directly to enum json_type. tt_int and the
 * punctuation kinds reuse a character code so that tokenizer_next can cast
 * the character it just read directly to enum tokty.
 *
 * NOTE(review): this relies on the json_* values not colliding with 0
 * (tt_eof) or with the character codes below -- confirm against json.h.
 */
enum tokty {
/* End of input. */
tt_eof,
tt_null = json_null,
tt_false = json_false,
tt_true = json_true,
/* tt_string and tt_float take the two values following json_true. */
tt_string,
tt_float,
/* A lone '0' character is cast to this kind by the tokenizer. */
tt_int = '0',
tt_comma = ',',
tt_colon = ':',
tt_lbracket = '[',
tt_rbracket = ']',
tt_lbrace = '{',
tt_rbrace = '}',
};
/* A single token; it refers to the source text and owns no memory. */
struct tok {
/* Kind of token. */
enum tokty ty;
/* First byte of the token inside the source text. */
char const *ptr;
/* Number of bytes the token spans (string tokens include both quotes). */
size_t len;
/* Position at which the token starts. */
struct loc loc;
};
/* Scanner state over a source buffer of `len` bytes. */
struct tokenizer {
/* Source buffer being scanned. */
char const *text;
/* Number of bytes in `text`. */
size_t len;
/* Offset of the next byte to read. */
size_t idx;
/* Line of the next byte to read; initialised to 1. */
int line;
/* Column of the next byte to read; initialised to 1. */
int col;
/* Set to true whenever the tokenizer reports an error. */
bool failed;
};
/*
 * Consume one byte of input and keep the line/column counters in sync.
 * Does nothing once the end of the text has been reached.
 */
static void t_step(struct tokenizer *t)
{
    if (t->idx < t->len) {
        char const consumed = t->text[t->idx++];
        if (consumed != '\n') {
            t->col++;
        } else {
            /* A newline moves to the first column of the next line. */
            t->line++;
            t->col = 1;
        }
    }
}
static struct tok t_tok(struct tokenizer *t, enum tokty ty, struct loc loc)
{
return (struct tok) { ty, &t->text[loc.idx], t->idx - loc.idx, loc };
}
/* Forward declaration: t_make_ident_tok and t_make_string_tok resume
 * scanning through tokenizer_next after reporting an error. */
static struct tok tokenizer_next(struct tokenizer *t);
/*
 * Turn the run of lowercase letters that starts at `loc` and ends just
 * before `*i` into a keyword token.
 *
 *   t    tokenizer whose read position is already past the identifier
 *   loc  position of the identifier's first character
 *   i    points at the tokenizer's read index (one past the identifier)
 *
 * Returns a tt_null / tt_false / tt_true token when the identifier is
 * exactly one of the JSON keywords. Any other identifier is reported,
 * marks the tokenizer as failed, and the next token is returned instead.
 */
static struct tok t_make_ident_tok(
    struct tokenizer *t, struct loc loc, size_t *i)
{
    static char const *const kws[] = { "null", "false", "true" };
    static enum tokty const tys[] = { tt_null, tt_false, tt_true };

    char const *word = &t->text[loc.idx];
    size_t word_len = *i - loc.idx;

    for (size_t kw_i = 0; kw_i < sizeof(kws) / sizeof(kws[0]); ++kw_i) {
        /* Both the length and the bytes must match: comparing only
         * `word_len` bytes would accept prefixes such as "nu" or "t". */
        if (strlen(kws[kw_i]) == word_len
            && memcmp(kws[kw_i], word, word_len) == 0) {
            return t_tok(t, tys[kw_i], loc);
        }
    }

    report(loc, "invalid identifier", t->text, t->len);
    t->failed = true;
    return tokenizer_next(t);
}
/*
 * Scan the remainder of a number whose first character is at or before
 * `*i` and return it as a token starting at `loc`.
 *
 *   t    tokenizer positioned on the number's digits
 *   loc  position at which the number token starts
 *   i    points at the tokenizer's read index
 *
 * Grammar accepted: digits, an optional '.' followed by digits, and an
 * optional exponent ('e' or 'E', optional sign, at least one digit).
 * The token is tt_int when it consists of digits only and tt_float when a
 * fraction or an exponent is present.
 */
static struct tok t_make_number_tok(
    struct tokenizer *t, struct loc loc, size_t *i)
{
    while (*i < t->len && t->text[*i] >= '0' && t->text[*i] <= '9') {
        t_step(t);
    }
    enum tokty ty = tt_int;
    if (*i < t->len && t->text[*i] == '.') {
        ty = tt_float;
        t_step(t);
        while (*i < t->len && t->text[*i] >= '0' && t->text[*i] <= '9') {
            t_step(t);
        }
    }
    if (*i < t->len && (t->text[*i] == 'e' || t->text[*i] == 'E')) {
        /* Look ahead first: the exponent marker is only consumed when it
         * is followed by (an optional sign and) a digit, so that input
         * such as "1e" is left for the caller to diagnose. */
        size_t digits_at = *i + 1;
        if (digits_at < t->len
            && (t->text[digits_at] == '+' || t->text[digits_at] == '-')) {
            digits_at += 1;
        }
        if (digits_at < t->len && t->text[digits_at] >= '0'
            && t->text[digits_at] <= '9') {
            ty = tt_float;
            while (*i < digits_at) {
                t_step(t);
            }
            while (*i < t->len && t->text[*i] >= '0' && t->text[*i] <= '9') {
                t_step(t);
            }
        }
    }
    return t_tok(t, ty, loc);
}
/*
 * Scan a string literal whose opening quote is at `*i` and return it as a
 * tt_string token that includes both quotes.
 *
 *   t    tokenizer positioned on the opening quote
 *   loc  position of the opening quote
 *   i    points at the tokenizer's read index
 *
 * A backslash causes the following byte to be skipped, so an escaped quote
 * does not end the string. Escape sequences are not decoded here. When the
 * input ends before the closing quote, the error is reported, the
 * tokenizer is marked as failed and the next token is returned instead.
 */
static struct tok t_make_string_tok(
    struct tokenizer *t, struct loc loc, size_t *i)
{
    t_step(t); /* opening quote */
    while (*i < t->len && t->text[*i] != '\"') {
        if (t->text[*i] == '\\') {
            t_step(t);
            if (*i >= t->len)
                break;
        }
        t_step(t);
    }
    /* The loop stops either on the closing quote or at end of input. Only
     * the index is tested: the byte at t->len is outside the buffer and
     * must neither be read nor be mistaken for a closing quote. */
    if (*i >= t->len) {
        report(loc, "malformed string", t->text, t->len);
        t->failed = true;
        return tokenizer_next(t);
    }
    t_step(t); /* closing quote */
    return t_tok(t, tt_string, loc);
}
static struct tok tokenizer_next(struct tokenizer *t)
{
struct loc loc = { t->idx, t->line, t->col };
size_t *i = &t->idx;
if (*i >= t->len) {
return t_tok(t, tt_eof, loc);
}
bool matched = false;
while (*i < t->len && strchr(" \t\r\n", t->text[*i]) != NULL) {
matched = true;
t_step(t);
}
if (matched) {
return tokenizer_next(t);
}
if (strchr(",:[]{}0", t->text[*i]) != NULL) {
enum tokty ty = (enum tokty)t->text[*i];
t_step(t);
return t_tok(t, ty, loc);
}
while (*i < t->len && t->text[*i] >= 'a' && t->text[*i] <= 'z') {
matched = true;
t_step(t);
}
if (matched) {
return t_make_ident_tok(t, loc, i);
}
if (t->text[*i] >= '1' && t->text[*i] <= '9') {
return t_make_number_tok(t, loc, i);
}
if (t->text[*i] == '\"') {
return t_make_string_tok(t, loc, i);
}
report(loc, "illegal character", t->text, t->len);
t->failed = true;
t_step(t);
return tokenizer_next(t);
}
/* Parser state: a tokenizer plus one token of lookahead. */
struct parser {
/* Source of tokens. */
struct tokenizer tokenizer;
/* Current (not yet consumed) token. */
struct tok tok;
/* Allocator from which parser_parse takes storage for string values. */
struct blockalloc allocator;
};
static void p_step(struct parser *p)
{
p->tok = tokenizer_next(&p->tokenizer);
}
static void parser_construct(
struct parser *p, char const *text, size_t text_len)
{
*p = (struct parser) {
.tokenizer = (struct tokenizer) { text, text_len, 0, 1, 1, false },
.tok = (struct tok) { 0 },
.allocator = (struct blockalloc) { 0 },
};
blockalloc_construct(&p->allocator);
p_step(p);
}
static void p_report(struct parser *p, struct loc loc, char const *message)
{
report(loc, message, p->tokenizer.text, p->tokenizer.len);
}
/* Forward declaration: parser_parse and the array/object parsers below
 * call each other recursively. */
static struct json_value *parser_parse(struct parser *p);
static struct json_value *parser_parse_array(struct parser *p, enum tokty *ty)
{
p_step(p);
struct json_value *val = json_new(json_array);
bool tail = false;
while (*ty != tt_eof && ((!tail && *ty != ']') || (tail && *ty == ','))) {
if (tail)
p_step(p);
struct json_value *child = parser_parse(p);
if (!child)
goto array_leave_error_free_val;
json_push(val, child);
tail = true;
}
if (*ty == tt_eof || *ty != ']') {
p_report(p, p->tok.loc, "expected ']'");
goto array_leave_error_free_val;
}
p_step(p);
return val;
array_leave_error_free_val:
json_free(val);
return NULL;
}
static struct json_value *parser_parse_object(struct parser *p, enum tokty *ty)
{
p_step(p);
struct json_value *val = json_new(json_object);
bool tail = false;
while (*ty != tt_eof && ((!tail && *ty != '}') || (tail && *ty == ','))) {
if (tail)
p_step(p);
if (*ty != tt_string) {
p_report(p, p->tok.loc, "expected string");
goto object_leave_error_free_val;
}
struct tok key_tok = p->tok;
p_step(p);
if (*ty != ':') {
p_report(p, p->tok.loc, "expected ':'");
goto object_leave_error_free_val;
}
p_step(p);
struct json_value *child = parser_parse(p);
if (!child)
goto object_leave_error_free_val;
json_set_sized(val, key_tok.ptr + 1, key_tok.len - 2, child);
tail = true;
}
if (*ty == tt_eof || *ty != '}') {
p_report(p, p->tok.loc, "expected '}'");
goto object_leave_error_free_val;
}
p_step(p);
return val;
object_leave_error_free_val:
json_free(val);
return NULL;
}
/*
 * Parse one JSON value starting at the parser's current token and leave
 * the parser on the first token after it.
 *
 * Returns the new value, or NULL after reporting an error.
 *
 * Numbers are converted from a NUL-terminated copy of the token, so the
 * conversion sees exactly the token's characters: it cannot run into the
 * text that follows, nor past the end of a buffer that is not
 * NUL-terminated. String values are copied without their quotes; escape
 * sequences are stored as written.
 */
static struct json_value *parser_parse(struct parser *p)
{
    struct loc loc = p->tok.loc;
    enum tokty *ty = &p->tok.ty;
    if (*ty == tt_null || *ty == tt_false || *ty == tt_true) {
        /* Keyword token kinds share their values with enum json_type. */
        struct json_value *val = json_new((enum json_type)*ty);
        p_step(p);
        return val;
    } else if (*ty == tt_int || *ty == tt_float) {
        char stack_buf[64];
        char *number = stack_buf;
        size_t number_len = p->tok.len;
        /* Unusually long tokens do not fit the stack buffer. */
        if (number_len >= sizeof stack_buf) {
            number = malloc(number_len + 1);
            if (!number) {
                p_report(p, loc, "out of memory");
                return NULL;
            }
        }
        memcpy(number, p->tok.ptr, number_len);
        number[number_len] = '\0';

        struct json_value *val;
        if (*ty == tt_int) {
            /* strtoll, not strtol: long is narrower than int64_t on
             * LLP64 and ILP32 platforms. */
            int64_t value = strtoll(number, NULL, 10);
            val = json_new(json_int);
            json_set_int(val, value);
        } else {
            double value = strtod(number, NULL);
            val = json_new(json_float);
            json_set_float(val, value);
        }
        if (number != stack_buf) {
            free(number);
        }
        p_step(p);
        return val;
    } else if (*ty == tt_string) {
        /* The token includes both quotes; the stored value does not. */
        size_t value_len = p->tok.len - 2;
        /* TODO confirm whether blockalloc_alloc can return NULL. */
        char *value = blockalloc_alloc(&p->allocator, value_len + 1, 2);
        memcpy(value, p->tok.ptr + 1, value_len);
        value[value_len] = '\0';
        struct json_value *val = json_new(json_string);
        json_set_string(val, value);
        p_step(p);
        return val;
    } else if (*ty == '[') {
        return parser_parse_array(p, ty);
    } else if (*ty == '{') {
        return parser_parse_object(p, ty);
    } else {
        p_report(p, loc, "expected expression");
        return NULL;
    }
}
/*
 * Parse the JSON document in `text`.
 *
 *   text       source text; NULL yields NULL
 *   text_size  number of bytes in `text`; 0 means `text` is NUL-terminated
 *              and its length is taken with strlen
 *
 * Returns the parsed value, or NULL when an error was reported. An error
 * reported by the tokenizer fails the parse even if a value could still
 * be assembled from the remaining tokens.
 *
 * NOTE(review): tokens that follow the first complete value are not
 * rejected here, and the parser's allocator is never torn down in this
 * function -- confirm the intended behaviour and the blockalloc lifetime.
 */
struct json_value *json_parse(char const *text, size_t text_size)
{
    if (!text)
        return NULL;
    text_size = text_size > 0 ? text_size : strlen(text);

    struct parser p;
    parser_construct(&p, text, text_size);

    struct json_value *val = parser_parse(&p);
    if (val && p.tokenizer.failed) {
        /* The error has already been printed; do not hand back a value
         * built from input that was reported as invalid. */
        json_free(val);
        val = NULL;
    }
    return val;
}