Implement mostly correct parsing

2022-10-22 16:04:03 -04:00
parent 9f696399be
commit 07a486cd16
3 changed files with 265 additions and 47 deletions
--- a/src/nihilispm_internal.h
+++ b/src/nihilispm_internal.h
@@ -1,11 +1,21 @@
 #include "nihilispm.h"

+// Number of elements in a true array (not a pointer). The expansion and the
+// argument are fully parenthesized so the macro is safe inside larger
+// expressions such as `n % ARRAY_SIZE(a)`.
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+// Iterate over a cons-list: `iter` walks the cells and `item` is the car of
+// the current cell. `item` is set to NULL whenever `iter` is NULL, so an
+// empty list and the step past the last cell never dereference a NULL cell.
+#define FOREACH_LIST(list, iter, item) \
+    for (struct nl_object *iter = (list), *item = iter ? iter->cell.car : NULL; \
+         iter != NULL; \
+         iter = iter->cell.cdr, item = iter ? iter->cell.car : NULL)
+
+
+
 struct nl_object *nl_cell_create(struct nl_object *car, struct nl_object *cdr);
 struct nl_object *nl_int_create(int integer);
 struct nl_object *nl_symbol_create(const char* symbol);
 struct nl_object *nl_string_create(const char* string);

+
 void nl_object_delete(struct nl_object *obj);

 // For testing
 struct nl_object *nl_token_next(const char **curr_src);
+struct nl_object *nl_parse_token_atom(struct nl_object *maybe_atom);  // new atom created from a token; NULL when the token is a reserved symbol
--- a/src/parse.c
+++ b/src/parse.c
@@ -13,6 +13,12 @@
 #define END_LIST_CHAR ')'
 #define QUOTE_CHAR '"'

+// Symbol tokens with structural meaning. nl_parse_token_atom compares symbol
+// tokens against this table and returns NULL instead of an atom on a match.
+// The pointers themselves are const so the table cannot be modified.
+static const char *const reserved_symbols[] = {
+    "(",
+    ")",
+    "\"",
+};
+
 // TODO: remove malloc and strndup calls

 static bool nl_is_whitespace(char c) {
@@ -32,7 +38,6 @@ struct nl_object *nl_token_next(const char **curr_src) {
    const char *start = *curr_src;

    while (true) {
-        struct nl_object *curr = NULL;
        char *str = NULL;
        switch (**curr_src) {
            case '\0':
@@ -90,17 +95,123 @@ struct nl_object *nl_tokenize(const char *source) {
    return tokens;
 }

+// Create a new atom object from a single token.
+//
+// Returns a new symbol or string object built from a strdup'd copy of the
+// token's text. Returns NULL when the token is a symbol listed in
+// reserved_symbols; the caller is expected to handle those itself.
+// Cell and integer tokens are not supported yet and trip an assert.
+struct nl_object *nl_parse_token_atom(struct nl_object *maybe_atom) {
+    struct nl_object *atom = NULL;
+    switch (maybe_atom->type) {
+        case NL_TYPE_CELL:
+            // Cell types currently are not valid for tokens
+            assert(false);
+            break;
+        case NL_TYPE_SYMBOL: {
+            // Check for reserved tokens first, which indicate special, non-atom behavior
+            for (size_t i = 0; i < ARRAY_SIZE(reserved_symbols); i++) {
+                if (strcmp(maybe_atom->symbol, reserved_symbols[i]) == 0) {
+                    return NULL;
+                }
+            }
+            atom = nl_symbol_create(strdup(maybe_atom->symbol));
+            break;
+        }
+        case NL_TYPE_STRING:
+            atom = nl_string_create(strdup(maybe_atom->string));
+            break;
+        case NL_TYPE_INT:
+            // TODO: Copy the token
+            assert(0);
+            break;
+        case NL_TYPE_COUNT:
+            // Not a real object type
+            assert(false);
+            break;
+    }
+    return atom;
+}
+
+// True when `token` is a symbol token whose text begins with `ch`.
+// The type check matters: a string token may legitimately start with a
+// list delimiter character and must not be mistaken for one.
+static bool nl_token_is_delimiter(const struct nl_object *token, char ch) {
+    return token->type == NL_TYPE_SYMBOL && token->symbol[0] == ch;
+}
+
+// Parse one s-expression (an atom or a whole list) starting at *token_iter,
+// advancing *token_iter past every token that was consumed.
+static struct nl_object *nl_parse_tokens_recursive(struct nl_object **token_iter) {
+    // Invariants:
+    //  - This function returns NULL if it encounters the first unmatched END_LIST_CHAR
+    //  - This function returns NULL if the tokens run out before a list is closed
+    //  - This function returns an nl_object of the atom or list at **token_iter
+    assert(token_iter != NULL);
+    if (*token_iter == NULL) {
+        return NULL;
+    }
+
+    assert((*token_iter)->type == NL_TYPE_CELL);
+
+    struct nl_object *token = (*token_iter)->cell.car;
+    struct nl_object *next_sexp = nl_parse_token_atom(token);
+    if (next_sexp != NULL) {
+        *token_iter = (*token_iter)->cell.cdr;
+        return next_sexp;
+    }
+    assert(token->type == NL_TYPE_SYMBOL);
+
+    if (nl_token_is_delimiter(token, START_LIST_CHAR)) {
+        struct nl_object *list = NULL;
+        struct nl_object **next_node = &list;
+        // Consume the START_LIST_CHAR
+        *token_iter = (*token_iter)->cell.cdr;
+        // Stop when the tokens run out: an unterminated list is an error,
+        // not a reason to dereference a NULL cell.
+        while (*token_iter != NULL) {
+            if (nl_token_is_delimiter((*token_iter)->cell.car, END_LIST_CHAR)) {
+                // Consume the END_LIST_CHAR
+                *token_iter = (*token_iter)->cell.cdr;
+                if (list == NULL) {
+                    // "()" is represented as an empty cell
+                    list = nl_cell_create(NULL, NULL);
+                }
+                return list;
+            }
+
+            next_sexp = nl_parse_tokens_recursive(token_iter);
+            if (next_sexp == NULL) {
+                // Error somewhere in the recursive parsing
+                break;
+            }
+
+            *next_node = nl_cell_create(next_sexp, NULL);
+            next_node = &(*next_node)->cell.cdr;
+        }
+        // Either a nested parse failed or the list was never closed
+        if (list != NULL) {
+            nl_object_delete(list);
+        }
+        return NULL;
+    } else if (nl_token_is_delimiter(token, END_LIST_CHAR)) {
+        // Mismatched parens
+        return NULL;
+    }
+    // Any other symbol type should have been an atom, this shouldn't happen
+    assert(false);
+    return NULL;
+}
+
+// parse_tokens -> doesn't care about quotes, terminates on EOF
+// parse_tokens_recursive -> error on EOF
+
+// Parse a whole token list into a list of s-expressions.
+// Returns an empty cell when there are no tokens, and NULL if any
+// s-expression fails to parse.
+struct nl_object *nl_parse_tokens(struct nl_object *tokens) {
+    struct nl_object *sexps = NULL;
+    struct nl_object **tail = &sexps;
+
+    for (struct nl_object **cursor = &tokens; *cursor != NULL; ) {
+        struct nl_object *parsed = nl_parse_tokens_recursive(cursor);
+        if (parsed == NULL) {
+            // Discard whatever was built before the failure
+            nl_object_delete(sexps);
+            return NULL;
+        }
+
+        // Append to the end of the result list and move the tail forward
+        *tail = nl_cell_create(parsed, NULL);
+        tail = &(*tail)->cell.cdr;
+    }
+
+    return sexps != NULL ? sexps : nl_cell_create(NULL, NULL);
+}
+
 // TODO: Should the parse a single sexp (return the last-parsed position), or
 // all sexps in the source (return a list of sexps)?
 struct nl_object *nl_parse(const char *source) {
    struct nl_object *tokens = nl_tokenize(source);
-    struct nl_object *sexp = NULL;
-
-    for (struct nl_object *token = tokens;
-         token != NULL;
-         token = token->cell.cdr) {
-    }
-
+    struct nl_object *sexp  = nl_parse_tokens(tokens);  // parses the whole token list; NULL on a parse error
    nl_object_delete(tokens);
    return sexp;
 }