Add initial tokenizer and tests

2022-10-14 19:37:12 -04:00
parent 9cde08b910
commit 2f8bff9c3b
7 changed files with 516 additions and 3 deletions
--- a/src/memory.c
+++ b/src/memory.c
@@ -0,0 +1,73 @@
+#include "nihilispm.h"
+#include "nihilispm_internal.h"
+
+#include <stddef.h>
+#include <stdlib.h>
+
+
+/*
+ * Forward declaration: the raw allocator is defined after the constructors
+ * that call it. Declared with (void) so it is a real prototype and calls
+ * with stray arguments are rejected at compile time.
+ */
+static struct nl_object *nl_object_alloc(void);
+
+/**
+ * Create a cons cell whose car and cdr are the given objects.
+ *
+ * The pointers are stored as-is (nothing is copied). nl_object_delete() on
+ * the returned cell also deletes both children.
+ *
+ * Returns the new cell, or NULL if allocation fails. On failure car and cdr
+ * are left untouched and remain the caller's responsibility.
+ */
+struct nl_object *nl_cell_create(struct nl_object *car, struct nl_object *cdr)
+{
+    struct nl_object *obj = nl_object_alloc();
+    if (obj == NULL) {
+        return NULL;
+    }
+
+    obj->type = NL_TYPE_CELL;
+    obj->cell.car = car;
+    obj->cell.cdr = cdr;
+    return obj;
+}
+
+/**
+ * Create an integer object holding the given value.
+ *
+ * Returns the new object, or NULL if allocation fails.
+ */
+struct nl_object *nl_int_create(int integer)
+{
+    struct nl_object *obj = nl_object_alloc();
+    if (obj == NULL) {
+        return NULL;
+    }
+
+    obj->type = NL_TYPE_INT;
+    obj->integer = integer;
+    return obj;
+}
+
+/**
+ * Create a symbol object that refers to the given name.
+ *
+ * The pointer is stored without copying, and nl_object_delete() passes it to
+ * free(). The caller must therefore hand over a heap-allocated string and
+ * must not free it after a successful call.
+ *
+ * Returns the new object, or NULL if allocation fails. On failure the string
+ * is not freed and still belongs to the caller.
+ */
+struct nl_object *nl_symbol_create(const char *symbol)
+{
+    struct nl_object *obj = nl_object_alloc();
+    if (obj == NULL) {
+        return NULL;
+    }
+
+    obj->type = NL_TYPE_SYMBOL;
+    obj->symbol = symbol;
+    return obj;
+}
+
+/**
+ * Create a string object that refers to the given text.
+ *
+ * The pointer is stored without copying, and nl_object_delete() passes it to
+ * free(). The caller must therefore hand over a heap-allocated string and
+ * must not free it after a successful call.
+ *
+ * Returns the new object, or NULL if allocation fails. On failure the string
+ * is not freed and still belongs to the caller.
+ */
+struct nl_object *nl_string_create(const char *string)
+{
+    struct nl_object *obj = nl_object_alloc();
+    if (obj == NULL) {
+        return NULL;
+    }
+
+    obj->type = NL_TYPE_STRING;
+    obj->string = string;
+    return obj;
+}
+
+/*
+ * Allocate storage for one object.
+ *
+ * calloc() is used so the object never holds indeterminate bytes: until a
+ * constructor fills it in, the type tag reads as 0 (NL_TYPE_CELL) and, on
+ * platforms where a null pointer is all-bits-zero, car/cdr read as NULL.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct nl_object *nl_object_alloc(void) {
+    return calloc(1, sizeof(struct nl_object));
+}
+
+/**
+ * Destroy an object together with everything it owns.
+ *
+ * - Cells: the car subtree and the cdr chain are destroyed as well.
+ * - Symbols and strings: the referenced character data is passed to free().
+ * - Integers: only the object itself is released.
+ *
+ * Passing NULL is a no-op. The cdr chain is walked iteratively so that a long
+ * list (such as the token list, which has one cell per token) does not need
+ * one stack frame per element; recursion depth follows car nesting only.
+ */
+void nl_object_delete(struct nl_object *obj) {
+    while (obj != NULL) {
+        struct nl_object *next = NULL;
+
+        switch (obj->type) {
+            case NL_TYPE_CELL:
+                nl_object_delete(obj->cell.car);
+                next = obj->cell.cdr;
+                break;
+            case NL_TYPE_SYMBOL:
+                // The field is const-qualified for readers, but the object
+                // owns the buffer; the cast drops const only to release it.
+                free((void *)obj->symbol);
+                break;
+            case NL_TYPE_STRING:
+                free((void *)obj->string);
+                break;
+            case NL_TYPE_INT:
+            case NL_TYPE_COUNT:
+                // No payload to release.
+                break;
+        }
+
+        free(obj);
+        obj = next;
+    }
+}
+
--- a/src/nihilispm.h
+++ b/src/nihilispm.h
@@ -0,0 +1,40 @@
+#ifndef _NIHILISPM_H_
+#define _NIHILISPM_H_
+
+/* Tag stored in nl_object.type; it selects which union member is in use. */
+enum nl_type {
+    /* nl_object.cell (car/cdr pair) is the active member. */
+    NL_TYPE_CELL = 0,
+    /* nl_object.symbol is the active member. */
+    NL_TYPE_SYMBOL = 1,
+    /* nl_object.integer is the active member. */
+    NL_TYPE_INT = 2,
+    /* nl_object.string is the active member. */
+    NL_TYPE_STRING = 3,
+    /* Presumably the number of real types rather than a type — TODO confirm. */
+    NL_TYPE_COUNT = 4,
+};
+
+/* A pair of object pointers; lists are built by chaining cells through cdr. */
+struct nl_cell {
+    /* First element of the pair; may be NULL. */
+    struct nl_object *car;
+    /* Second element of the pair (the next cell in a list); may be NULL. */
+    struct nl_object *cdr;
+};
+
+/* A tagged value: `type` says which member of the anonymous union is valid. */
+struct nl_object {
+    enum nl_type type;
+    union {
+        /* Valid when type == NL_TYPE_CELL. */
+        struct nl_cell cell;
+        /* Valid when type == NL_TYPE_SYMBOL; freed by nl_object_delete(). */
+        const char *symbol;
+        /* Valid when type == NL_TYPE_INT. */
+        int integer;
+        /* Valid when type == NL_TYPE_STRING; freed by nl_object_delete(). */
+        const char *string;
+    };
+};
+
+
+
+/*
+ * NOTE(review): not referenced by any code visible in this change; looks like
+ * the intended return type of a future parser — confirm before relying on it.
+ */
+struct nl_parse_result {
+    /* Presumably a status code; the meaning of its values is not defined yet. */
+    int result;
+    /* Presumably the parsed statement — TODO confirm ownership. */
+    struct nl_cell *statement;
+};
+
+struct nl_state; // TODO
+
+/*
+ * Split source text into tokens.
+ *
+ * Returns a list of cells chained through cdr, one cell per token, where each
+ * car is a symbol object (parentheses and bare words) or a string object
+ * (quoted text). Returns NULL when the text contains no tokens. Release the
+ * list with nl_object_delete().
+ */
+struct nl_object *nl_tokenize(const char *source);
+
+/*
+ * Parse source text. The parameter name matches the definition in parse.c.
+ * The current implementation is a stub that always returns NULL.
+ */
+struct nl_object *nl_parse(const char *source);
+
+/* NOTE(review): no definition is visible in this change — contract TBD. */
+struct nl_cell *nl_evaluate(const struct nl_cell *sexp);
+
+#endif
--- a/src/nihilispm_internal.h
+++ b/src/nihilispm_internal.h
@@ -0,0 +1,11 @@
+#ifndef NIHILISPM_INTERNAL_H
+#define NIHILISPM_INTERNAL_H
+
+#include "nihilispm.h"
+
+/*
+ * Object constructors. Each returns a newly allocated object that must be
+ * released with nl_object_delete().
+ */
+
+/* Stores car and cdr as-is; deleting the cell deletes both children. */
+struct nl_object *nl_cell_create(struct nl_object *car, struct nl_object *cdr);
+struct nl_object *nl_int_create(int integer);
+/*
+ * The symbol/string constructors store the pointer without copying and
+ * nl_object_delete() passes it to free(), so pass heap-allocated text.
+ */
+struct nl_object *nl_symbol_create(const char *symbol);
+struct nl_object *nl_string_create(const char *string);
+
+/* Destroys obj and everything reachable from it; NULL is a no-op. */
+void nl_object_delete(struct nl_object *obj);
+
+// For testing
+/*
+ * Reads one token starting at *curr_src and advances *curr_src past it.
+ * Returns a single cell whose car is the token, or NULL at end of input.
+ */
+struct nl_object *nl_token_next(const char **curr_src);
+
+#endif /* NIHILISPM_INTERNAL_H */
--- a/src/parse.c
+++ b/src/parse.c
@@ -0,0 +1,123 @@
+#include "nihilispm.h"
+#include "nihilispm_internal.h"
+
+// TODO: remove these
+#include <stdbool.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define START_LIST_CHAR '('
+#define END_LIST_CHAR ')'
+#define QUOTE_CHAR '"'
+
+// TODO: remove malloc and strndup calls
+
+/* True for the two characters the tokenizer skips: space and newline. */
+static bool nl_is_whitespace(char c) {
+    switch (c) {
+        case ' ':
+        case '\n':
+            return true;
+        default:
+            return false;
+    }
+}
+
+/* True for characters that form or begin a token on their own: ( ) and ". */
+static bool nl_is_token(char c) {
+    switch (c) {
+        case START_LIST_CHAR:
+        case END_LIST_CHAR:
+        case QUOTE_CHAR:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/* True for any character that ends a bare word, including the terminator. */
+static bool nl_is_delimiter(char c) {
+    if (c == '\0') {
+        return true;
+    }
+    if (nl_is_token(c)) {
+        return true;
+    }
+    return nl_is_whitespace(c);
+}
+
+/*
+ * Wrap freshly allocated token text in an atom and a one-cell list node.
+ *
+ * Takes ownership of text. If text is NULL (the copy failed) or any later
+ * allocation fails, everything built so far is released and NULL is returned.
+ */
+static struct nl_object *nl_token_wrap(char *text, bool is_string) {
+    if (text == NULL) {
+        return NULL;
+    }
+
+    struct nl_object *atom = is_string ? nl_string_create(text)
+                                       : nl_symbol_create(text);
+    if (atom == NULL) {
+        free(text);
+        return NULL;
+    }
+
+    struct nl_object *token = nl_cell_create(atom, NULL);
+    if (token == NULL) {
+        // The atom owns text, so deleting it releases both.
+        nl_object_delete(atom);
+    }
+    return token;
+}
+
+/*
+ * Read the next token from *curr_src and advance *curr_src past it.
+ *
+ * Leading spaces and newlines are skipped. Tokens are:
+ *   - "(" or ")"            -> symbol holding that single character
+ *   - "text" in quotes      -> string holding the text without the quotes
+ *   - anything else         -> symbol running up to the next delimiter
+ *
+ * A string with no closing quote extends to the end of the input; the scan
+ * stops at the terminator instead of reading past it.
+ *
+ * Returns a single cell (cdr == NULL) whose car is the token, or NULL at end
+ * of input. NULL is also returned if an allocation fails, in which case
+ * *curr_src has already moved past the token that could not be stored.
+ */
+struct nl_object *nl_token_next(const char **curr_src) {
+    assert(curr_src != NULL);
+    assert(*curr_src != NULL);
+
+    while (nl_is_whitespace(**curr_src)) {
+        (*curr_src)++;
+    }
+    const char *start = *curr_src;
+
+    switch (*start) {
+        case '\0':
+            return NULL;
+        case START_LIST_CHAR:
+        case END_LIST_CHAR:
+            *curr_src = start + 1;
+            return nl_token_wrap(strndup(start, 1), false);
+        case QUOTE_CHAR: {
+            // TODO: Support escaping
+            const char *text = start + 1;
+            const char *end = text;
+            while (*end != QUOTE_CHAR && *end != '\0') {
+                end++;
+            }
+            // Step over the closing quote only if there is one.
+            *curr_src = (*end == QUOTE_CHAR) ? end + 1 : end;
+            return nl_token_wrap(strndup(text, (size_t)(end - text)), true);
+        }
+        // TODO: Parse integers
+        default: {
+            // The first character is not a delimiter (all delimiters are
+            // handled above), so this always consumes at least one character.
+            const char *end = start;
+            while (!nl_is_delimiter(*end)) {
+                end++;
+            }
+            *curr_src = end;
+            return nl_token_wrap(strndup(start, (size_t)(end - start)), false);
+        }
+    }
+}
+
+/*
+ * Tokenize an entire source string.
+ *
+ * Returns the tokens as a list of cells chained through cdr, in source order,
+ * or NULL when source is NULL or contains no tokens. The caller releases the
+ * list with nl_object_delete().
+ */
+struct nl_object *nl_tokenize(const char *source) {
+    if (source == NULL) {
+        return NULL;
+    }
+
+    struct nl_object *tokens = NULL;
+    // Points at the slot the next token goes into: first the list head,
+    // afterwards the cdr of the most recently appended cell.
+    struct nl_object **link = &tokens;
+    const char *cursor = source;
+
+    for (struct nl_object *token = nl_token_next(&cursor);
+         token != NULL;
+         token = nl_token_next(&cursor)) {
+        *link = token;
+        link = &token->cell.cdr;
+    }
+
+    return tokens;
+}
+
+// TODO: Should this parse a single sexp (and return the last-parsed
+// position), or all sexps in the source (and return a list of sexps)?
+//
+// Work in progress: the tokens are produced, visited without being acted on,
+// and released again, so the result is always NULL for now.
+struct nl_object *nl_parse(const char *source) {
+    struct nl_object *sexp = NULL;
+    struct nl_object *tokens = nl_tokenize(source);
+
+    // Placeholder walk over the token list; the parsing logic goes here.
+    struct nl_object *token = tokens;
+    while (token != NULL) {
+        token = token->cell.cdr;
+    }
+
+    nl_object_delete(tokens);
+    return sexp;
+}
+
+/* struct ParseResult *nl_parse(const char *source) { */
+/*     struct Cell *tokens = nl_tokenize(source); */
+/*     struct Cell *sexp = n */
+
+/* } */