Implement mostly correct parsing

This commit is contained in:
2022-10-22 16:04:03 -04:00
parent 9f696399be
commit 07a486cd16
3 changed files with 265 additions and 47 deletions

View File

@@ -13,6 +13,12 @@
// Character that closes a list; the parser compares it against the first
// character of a symbol token.
#define END_LIST_CHAR ')'
// Double-quote character.
// NOTE(review): presumably delimits string literals in the tokenizer; its use
// is outside this view -- confirm.
#define QUOTE_CHAR '"'
// Symbol spellings that are never atoms: the two list delimiters and the
// double-quote. nl_parse_token_atom() returns NULL for any symbol token whose
// text matches one of these entries.
static const char *reserved_symbols[] = { "(", ")", "\"" };
// TODO: remove malloc and strndup calls
static bool nl_is_whitespace(char c) {
@@ -32,7 +38,6 @@ struct nl_object *nl_token_next(const char **curr_src) {
const char *start = *curr_src;
while (true) {
struct nl_object *curr = NULL;
char *str = NULL;
switch (**curr_src) {
case '\0':
@@ -90,17 +95,123 @@ struct nl_object *nl_tokenize(const char *source) {
return tokens;
}
// Converts a single token into a newly created atom object.
//
// maybe_atom: the token to convert; it is only read, never modified.
//
// Returns a new symbol or string object built from a strdup'd copy of the
// token's text, or NULL when the token is not an atom, i.e. it is a symbol
// matching one of reserved_symbols. Cell and count types are invalid for
// tokens and trip an assert; integer tokens are not implemented yet and also
// assert. With asserts compiled out those cases return NULL as well.
//
// NOTE(review): the strdup result is passed straight to the create functions
// without a NULL check; whether they tolerate NULL is not visible here.
struct nl_object *nl_parse_token_atom(struct nl_object *maybe_atom) {
    struct nl_object *atom = NULL;

    switch (maybe_atom->type) {
    case NL_TYPE_CELL:
        // Cell types currently are not valid for tokens
        assert(false);
        break;
    case NL_TYPE_SYMBOL:
        // Reserved tokens indicate special, non-atom behavior, so check for
        // them before copying. Read the symbol member here, matching the
        // type that was just switched on.
        for (size_t i = 0; i < ARRAY_SIZE(reserved_symbols); i++) {
            if (strcmp(maybe_atom->symbol, reserved_symbols[i]) == 0) {
                return NULL;
            }
        }
        atom = nl_symbol_create(strdup(maybe_atom->symbol));
        break;
    case NL_TYPE_STRING:
        atom = nl_string_create(strdup(maybe_atom->string));
        break;
    case NL_TYPE_INT:
        // TODO: Copy the token
        assert(false);
        break;
    case NL_TYPE_COUNT:
        // Not a real object type
        assert(false);
        break;
    }

    return atom;
}
// Parses one s-expression (an atom or a parenthesized list) starting at the
// token cell *token_iter, advancing *token_iter past every token it consumes.
//
// token_iter: must not be NULL; *token_iter is the current cell of the token
//             list, or NULL when the list is exhausted.
//
// Returns:
// - the parsed atom, or the parsed list (an empty list "()" is returned as
//   nl_cell_create(NULL, NULL));
// - NULL if *token_iter is already NULL on entry;
// - NULL if the current token is an unmatched END_LIST_CHAR;
// - NULL if the tokens run out before an opened list is closed, or if parsing
//   any element of the list fails. The partially built list is deleted first.
static struct nl_object *nl_parse_tokens_recursive(struct nl_object **token_iter) {
    assert(token_iter != NULL);
    if (*token_iter == NULL) {
        return NULL;
    }
    assert((*token_iter)->type == NL_TYPE_CELL);

    struct nl_object *token = (*token_iter)->cell.car;
    struct nl_object *next_sexp = nl_parse_token_atom(token);
    if (next_sexp != NULL) {
        // Plain atom: consume it and hand it back
        *token_iter = (*token_iter)->cell.cdr;
        return next_sexp;
    }

    // Not an atom, so it has to be one of the reserved symbols
    assert(token->type == NL_TYPE_SYMBOL);

    if (token->symbol[0] == START_LIST_CHAR) {
        struct nl_object *list = NULL;
        // Always points at the slot where the next list cell gets linked in
        struct nl_object **next_node = &list;

        // Consume the START_LIST_CHAR
        *token_iter = (*token_iter)->cell.cdr;

        while (true) {
            if (*token_iter == NULL) {
                // Ran out of tokens before the matching END_LIST_CHAR.
                // NOTE(review): list may still be NULL here, exactly as on
                // the element-error path below; assumes nl_object_delete
                // accepts NULL -- confirm.
                nl_object_delete(list);
                return NULL;
            }

            token = (*token_iter)->cell.car;
            // Only a symbol token can close the list; a string token whose
            // text happens to start with ')' is an ordinary element.
            if (token->type == NL_TYPE_SYMBOL && token->symbol[0] == END_LIST_CHAR) {
                *token_iter = (*token_iter)->cell.cdr;
                if (list == NULL) {
                    list = nl_cell_create(NULL, NULL);
                }
                return list;
            }

            next_sexp = nl_parse_tokens_recursive(token_iter);
            if (next_sexp == NULL) {
                // Error somewhere in the recursive parsing
                nl_object_delete(list);
                return NULL;
            }

            *next_node = nl_cell_create(next_sexp, NULL);
            next_node = &(*next_node)->cell.cdr;
        }
    } else if (token->symbol[0] == END_LIST_CHAR) {
        // Mismatched parens
        return NULL;
    }

    // Any other symbol type should have been an atom, this shouldn't happen.
    // NOTE(review): a reserved symbol that is neither list delimiter (the
    // quote entry in reserved_symbols) looks like it would reach this point.
    assert(false);
    return NULL;
}
// nl_parse_tokens           -> doesn't care about quotes, terminates on EOF
// nl_parse_tokens_recursive -> treats EOF as an error
// Parses every s-expression in the token list `tokens` and returns them, in
// order, as a list of cells. An empty token list yields
// nl_cell_create(NULL, NULL). If any s-expression fails to parse, whatever
// was collected so far is deleted and NULL is returned. The caller's token
// list is only walked, never modified or freed here.
struct nl_object *nl_parse_tokens(struct nl_object *tokens) {
    struct nl_object *head = NULL;
    // Slot where the next result cell will be linked in
    struct nl_object **tail = &head;

    for (struct nl_object *cursor = tokens; cursor != NULL;) {
        // The recursive parser advances cursor past what it consumed
        struct nl_object *sexp = nl_parse_tokens_recursive(&cursor);
        if (sexp == NULL) {
            nl_object_delete(head);
            return NULL;
        }

        struct nl_object *cell = nl_cell_create(sexp, NULL);
        *tail = cell;
        tail = &cell->cell.cdr;
    }

    return head != NULL ? head : nl_cell_create(NULL, NULL);
}
// TODO: Should this parse a single sexp (returning the last-parsed position),
// or all sexps in the source (returning a list of sexps)?
// Tokenizes `source`, parses all of its s-expressions with nl_parse_tokens(),
// and deletes the intermediate token list before returning.
//
// Returns whatever nl_parse_tokens() produced: the list of parsed
// s-expressions, or NULL if parsing failed.
struct nl_object *nl_parse(const char *source) {
    struct nl_object *tokens = nl_tokenize(source);
    struct nl_object *sexp = nl_parse_tokens(tokens);

    // The parsed result holds its own copies, so the tokens can go
    nl_object_delete(tokens);
    return sexp;
}