From 07a486cd16f3cfed26e2731ca47e25aa1d21b9bb Mon Sep 17 00:00:00 2001 From: Max Regan Date: Sat, 22 Oct 2022 16:04:03 -0400 Subject: [PATCH] Implement mostly correct parsing --- src/nihilispm_internal.h | 10 +++ src/parse.c | 127 ++++++++++++++++++++++++++-- test/test_parse.c | 175 ++++++++++++++++++++++++++++++--------- 3 files changed, 265 insertions(+), 47 deletions(-) diff --git a/src/nihilispm_internal.h b/src/nihilispm_internal.h index c974e48..a3c340f 100644 --- a/src/nihilispm_internal.h +++ b/src/nihilispm_internal.h @@ -1,11 +1,21 @@ #include "nihilispm.h" +#define ARRAY_SIZE(x) sizeof(x) / sizeof(x[0]) +#define FOREACH_LIST(list, iter, item) \ + for (struct nl_object *iter = list, *item = iter->cell.car; \ + iter != NULL; \ + iter = token->cell.cdr, item = iter->cell.car) + + + struct nl_object *nl_cell_create(struct nl_object *car, struct nl_object *cdr); struct nl_object *nl_int_create(int integer); struct nl_object *nl_symbol_create(const char* symbol); struct nl_object *nl_string_create(const char* string); + void nl_object_delete(struct nl_object *obj); // For testing struct nl_object *nl_token_next(const char **curr_src); +struct nl_object *nl_parse_token_atom(struct nl_object *maybe_atom); diff --git a/src/parse.c b/src/parse.c index 2fae6ff..6796e1a 100644 --- a/src/parse.c +++ b/src/parse.c @@ -13,6 +13,12 @@ #define END_LIST_CHAR ')' #define QUOTE_CHAR '"' +static const char *reserved_symbols[] = { + "(", + ")", + "\"", +}; + // TODO: remove malloc and strndup calls static bool nl_is_whitespace(char c) { @@ -32,7 +38,6 @@ struct nl_object *nl_token_next(const char **curr_src) { const char *start = *curr_src; while (true) { - struct nl_object *curr = NULL; char *str = NULL; switch (**curr_src) { case '\0': @@ -90,17 +95,123 @@ struct nl_object *nl_tokenize(const char *source) { return tokens; } +struct nl_object *nl_parse_token_atom(struct nl_object *maybe_atom) { + struct nl_object *atom = NULL; + switch (maybe_atom->type) { + case 
NL_TYPE_CELL: + // Cell types currently are not valid for tokens + assert(false); + break; + case NL_TYPE_SYMBOL: { + // Check for reserved tokens first, which indicate special, non-atom behavior + for (int i = 0; i < ARRAY_SIZE(reserved_symbols); i++) { + if (!strcmp(maybe_atom->string, reserved_symbols[i])) { + return NULL; + } + } + atom = nl_symbol_create(strdup(maybe_atom->symbol)); + break; + } + case NL_TYPE_STRING: + atom = nl_string_create(strdup(maybe_atom->string)); + break; + case NL_TYPE_INT: + // TODO: Copy the token + assert(0); + break; + case NL_TYPE_COUNT: + assert(false); + } + return atom; +} + +static struct nl_object *nl_parse_tokens_recursive(struct nl_object **token_iter) { + // Invariants: + // - This function returns NULL if it encounters the first unmatched END_LIST_CHAR + // - This function returns NULL if it encounters end-of-list without first finding a START_LIST_CHAR + // - This function returns an nl_object of the atom or list at **token_iter + assert(token_iter != NULL); + if (*token_iter == NULL) { + return NULL; + } + + assert((*token_iter)->type == NL_TYPE_CELL); + + struct nl_object *token = (*token_iter)->cell.car; + struct nl_object *next_sexp = nl_parse_token_atom(token); + if (next_sexp != NULL) { + *token_iter = (*token_iter)->cell.cdr; + return next_sexp; + } + assert(token->type == NL_TYPE_SYMBOL); + + if (token->symbol[0] == START_LIST_CHAR) { + struct nl_object *list = NULL; + struct nl_object **next_node = &list; + // Consume the START_LIST_CHAR + *token_iter = (*token_iter)->cell.cdr; + while (1) { + token = (*token_iter)->cell.car; + if (token->symbol[0] == END_LIST_CHAR) { + *token_iter = (*token_iter)->cell.cdr; + if (list == NULL) { + list = nl_cell_create(NULL, NULL); + } + return list; + } + + next_sexp = nl_parse_tokens_recursive(token_iter); + if (next_sexp == NULL) { + // Error somewhere in the recursive parsing + nl_object_delete(list); + return NULL; + } + + *next_node = nl_cell_create(next_sexp, NULL); + 
next_node = &(*next_node)->cell.cdr; + } + } else if (token->symbol[0] == END_LIST_CHAR) { + // Mismatched parens + return NULL; + } + // Any other symbol type should have been an atom, this shouldn't happen + assert(false); + return NULL; +} + +// parse_tokens -> doesn't care about quotes, terminates on EOF +// parse_tokens_recursive -> error on EOF + +struct nl_object *nl_parse_tokens(struct nl_object *tokens) { + struct nl_object* resultl = NULL; + struct nl_object** token_iter = &tokens; + struct nl_object** next_cell = &resultl; + + while (*token_iter != NULL) { + struct nl_object *new_sexp = nl_parse_tokens_recursive(token_iter); + if (new_sexp == NULL) { + goto error; + } + + *next_cell = nl_cell_create(new_sexp, NULL); + next_cell = &(*next_cell)->cell.cdr; + } + + if (resultl == NULL) { + return nl_cell_create(NULL, NULL); + } + + return resultl; + error: + nl_object_delete(resultl); + return NULL; +} + // TODO: Should the parse a single sexp (return the last-parsed position), or // all sexps in the source (return a list of sexps)? 
struct nl_object *nl_parse(const char *source) { struct nl_object *tokens = nl_tokenize(source); - struct nl_object *sexp = NULL; - - for (struct nl_object *token = tokens; - token != NULL; - token = token->cell.cdr) { - } - + struct nl_object *sexp = nl_parse_tokens(tokens); nl_object_delete(tokens); return sexp; } diff --git a/test/test_parse.c b/test/test_parse.c index 3e9d6b4..1a917be 100644 --- a/test/test_parse.c +++ b/test/test_parse.c @@ -1,5 +1,6 @@ #include #include +#include #include "nihilispm.h" #include "nihilispm_internal.h" @@ -15,21 +16,7 @@ void tearDown(void) { nl_object_delete(response); } -/* void test_parse_null() { */ -/* result = nl_parse("()"); */ -/* TEST_ASSERT_EQUAL(result->result, 0); */ -/* TEST_ASSERT_NOT_NULL(result->statement); */ -/* TEST_ASSERT_NULL(result->statement->car); */ -/* TEST_ASSERT_NULL(result->statement->cdr); */ -/* } */ - -/* void test_parse_error() { */ -/* result = nl_parse("("); */ -/* TEST_ASSERT_EQUAL(result->result, 0); */ -/* TEST_ASSERT_NULL(result->statement); */ -/* } */ - -void test_token_next_empty_str() { +static void test_token_next_empty_str(void) { const char *input = ""; const char *curr = input; @@ -39,7 +26,7 @@ void test_token_next_empty_str() { TEST_ASSERT_EQUAL('\0', *curr); } -void test_token_next_only_whitespace() { +static void test_token_next_only_whitespace(void) { const char *input = " \n"; const char *curr = input; @@ -49,7 +36,7 @@ void test_token_next_only_whitespace() { TEST_ASSERT_EQUAL('\0', *curr); } -void test_token_next_lparen() { +static void test_token_next_lparen(void) { const char *input = "("; const char *curr = input; @@ -62,7 +49,7 @@ void test_token_next_lparen() { TEST_ASSERT_EQUAL('\0', *curr); } -void test_token_next_rparen() { +static void test_token_next_rparen(void) { const char *input = ")"; const char *curr = input; @@ -75,7 +62,7 @@ void test_token_next_rparen() { TEST_ASSERT_EQUAL('\0', *curr); } -void test_token_next_lrparen() { +static void 
test_token_next_lrparen(void) { const char *input = "()"; const char *curr = input; @@ -97,7 +84,7 @@ void test_token_next_lrparen() { TEST_ASSERT_EQUAL('\0', *curr); } -void test_token_next_string() { +static void test_token_next_string(void) { const char *input = "\"foo\""; const char *curr = input; @@ -110,7 +97,7 @@ void test_token_next_string() { TEST_ASSERT_EQUAL('\0', *curr); } -void test_token_next_string_w_whitespace() { +static void test_token_next_string_w_whitespace(void) { const char *input = " \"foo\" "; const char *curr = input; @@ -123,7 +110,7 @@ void test_token_next_string_w_whitespace() { TEST_ASSERT_EQUAL_STRING(" ", curr); } -void test_token_next_symbol() { +static void test_token_next_symbol(void) { const char *input = "foo"; const char *curr = input; @@ -136,7 +123,7 @@ void test_token_next_symbol() { TEST_ASSERT_EQUAL_STRING("", curr); } -void test_token_next_symbol_w_whitespace() { +static void test_token_next_symbol_w_whitespace(void) { const char *input = " foo "; const char *curr = input; @@ -149,13 +136,13 @@ void test_token_next_symbol_w_whitespace() { TEST_ASSERT_EQUAL_STRING(" ", curr); } -void test_tokenize_empty_str() { +static void test_tokenize_empty_str(void) { response = nl_tokenize(""); TEST_ASSERT_NULL(response); } -void test_tokenize_nil() { +static void test_tokenize_nil(void) { response = nl_tokenize("()"); TEST_ASSERT_NOT_NULL(response); @@ -175,7 +162,7 @@ void test_tokenize_nil() { TEST_ASSERT_NULL(token2->cell.cdr); } -void test_tokenize_statement() { +static void test_tokenize_statement(void) { response = nl_tokenize("(foo)"); TEST_ASSERT_NOT_NULL(response); @@ -200,21 +187,28 @@ void test_tokenize_statement() { TEST_ASSERT_EQUAL_STRING(")", token3->cell.car->string); } -void test_parse_empty_str() { - response = nl_parse(""); +static void test_parse_atom_symbol(void) { + response = nl_parse_token_atom(nl_symbol_create(strdup("foo"))); + + TEST_ASSERT_EQUAL(response->type, NL_TYPE_SYMBOL); + 
TEST_ASSERT_EQUAL_STRING(response->symbol, "foo"); +} + +static void test_parse_atom_lparen(void) { + response = nl_parse_token_atom(nl_symbol_create(strdup("("))); + TEST_ASSERT_NULL(response); } -void test_parse_symbol() { - response = nl_parse("foo"); +static void test_parse_atom_rparen(void) { + response = nl_parse_token_atom(nl_symbol_create(strdup(")"))); - TEST_ASSERT_NOT_NULL(response); - TEST_ASSERT_EQUAL(NL_TYPE_SYMBOL, response->type); - TEST_ASSERT_EQUAL_STRING(NL_TYPE_SYMBOL, "foo"); + TEST_ASSERT_NULL(response); } -void test_parse_nil() { - response = nl_parse("()"); + +static void test_parse_empty_str(void) { + response = nl_parse(""); TEST_ASSERT_NOT_NULL(response); TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->type); @@ -222,26 +216,118 @@ void test_parse_nil() { TEST_ASSERT_NULL(response->cell.cdr); } -void test_parse_list_1elem() { +static void test_parse_symbol(void) { + response = nl_parse("foo"); + + TEST_ASSERT_NOT_NULL(response); + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->type); + + TEST_ASSERT_NOT_NULL(response->cell.car); + TEST_ASSERT_EQUAL(response->cell.car->type, NL_TYPE_SYMBOL); + TEST_ASSERT_EQUAL_STRING(response->cell.car->symbol, "foo"); +} + +static void test_parse_nil(void) { + response = nl_parse("()"); + + TEST_ASSERT_NOT_NULL(response); + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->type); + TEST_ASSERT_NOT_NULL(response->cell.car); + TEST_ASSERT_NULL(response->cell.cdr); + + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->cell.car->type); + TEST_ASSERT_NULL(response->cell.car->cell.car); + TEST_ASSERT_NULL(response->cell.car->cell.cdr); +} + +static void test_parse_list_1elem(void) { response = nl_parse("(foo)"); TEST_ASSERT_NOT_NULL(response); TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->type); TEST_ASSERT_NOT_NULL(response->cell.car); - TEST_ASSERT_EQUAL(NL_TYPE_SYMBOL, response->cell.car->type); - TEST_ASSERT_EQUAL_STRING("foo", response->cell.car->symbol); + TEST_ASSERT_NULL(response->cell.cdr); + + TEST_ASSERT_EQUAL(NL_TYPE_CELL, 
response->cell.car->type); + TEST_ASSERT_NULL(response->cell.car->cell.cdr); + + TEST_ASSERT_EQUAL(NL_TYPE_SYMBOL, response->cell.car->cell.car->type); + TEST_ASSERT_EQUAL_STRING("foo", response->cell.car->cell.car->symbol); } -void test_parse_list_2elem() { +static void test_parse_list_2elem(void) { response = nl_parse("(foo bar)"); TEST_ASSERT_NOT_NULL(response); TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->type); + TEST_ASSERT_NOT_NULL(response->cell.car); + TEST_ASSERT_NULL(response->cell.cdr); + + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->cell.car->type); + TEST_ASSERT_NOT_NULL(response->cell.car->cell.cdr); + + TEST_ASSERT_EQUAL(NL_TYPE_SYMBOL, response->cell.car->cell.car->type); + TEST_ASSERT_EQUAL_STRING("foo", response->cell.car->cell.car->symbol); + + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->cell.car->cell.cdr->type); + TEST_ASSERT_EQUAL(NL_TYPE_SYMBOL, response->cell.car->cell.cdr->cell.car->type); + TEST_ASSERT_EQUAL_STRING("bar", response->cell.car->cell.cdr->cell.car->string); + +} + +static void test_parse_2elem(void) { + response = nl_parse("foo bar"); + + TEST_ASSERT_NOT_NULL(response); + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->type); + TEST_ASSERT_NOT_NULL(response->cell.car); TEST_ASSERT_EQUAL(NL_TYPE_SYMBOL, response->cell.car->type); TEST_ASSERT_EQUAL_STRING("foo", response->cell.car->symbol); + + TEST_ASSERT_NOT_NULL(response->cell.cdr); + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->cell.cdr->type); + + TEST_ASSERT_EQUAL(NL_TYPE_SYMBOL, response->cell.cdr->cell.car->type); + TEST_ASSERT_EQUAL_STRING("bar", response->cell.cdr->cell.car->symbol); +} + +static void test_parse_2elem_str(void) { + response = nl_parse("\"foo\" \"bar\""); + + TEST_ASSERT_NOT_NULL(response); + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->type); + + TEST_ASSERT_NOT_NULL(response->cell.car); + TEST_ASSERT_EQUAL(NL_TYPE_STRING, response->cell.car->type); + TEST_ASSERT_EQUAL_STRING("foo", response->cell.car->string); + + TEST_ASSERT_NOT_NULL(response->cell.cdr); + 
TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->cell.cdr->type); + + TEST_ASSERT_EQUAL(NL_TYPE_STRING, response->cell.cdr->cell.car->type); + TEST_ASSERT_EQUAL_STRING("bar", response->cell.cdr->cell.car->string); +} + +static void test_parse_nested(void) { + response = nl_parse("((foo))"); + + TEST_ASSERT_NOT_NULL(response); + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->type); + + TEST_ASSERT_NOT_NULL(response->cell.car); + TEST_ASSERT_NULL(response->cell.cdr); + + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->cell.car->type); + TEST_ASSERT_NULL(response->cell.car->cell.cdr); + + TEST_ASSERT_EQUAL(NL_TYPE_CELL, response->cell.car->cell.car->type); + TEST_ASSERT_NULL(response->cell.car->cell.car->cell.cdr); + + TEST_ASSERT_EQUAL(NL_TYPE_SYMBOL, response->cell.car->cell.car->cell.car->type); + TEST_ASSERT_EQUAL_STRING("foo", response->cell.car->cell.car->cell.car->symbol); } @@ -259,5 +345,16 @@ int main(void) { RUN_TEST(test_tokenize_empty_str); RUN_TEST(test_tokenize_nil); RUN_TEST(test_tokenize_statement); + RUN_TEST(test_parse_atom_symbol); + RUN_TEST(test_parse_atom_lparen); + RUN_TEST(test_parse_atom_rparen); + RUN_TEST(test_parse_empty_str); + RUN_TEST(test_parse_symbol); + RUN_TEST(test_parse_nil); + RUN_TEST(test_parse_list_1elem); + RUN_TEST(test_parse_list_2elem); + RUN_TEST(test_parse_2elem); + RUN_TEST(test_parse_2elem_str); + RUN_TEST(test_parse_nested); return UNITY_END(); }