// uclisp/src/parse.c

#include "nihilispm.h"
#include "nihilispm_internal.h"

// TODO: remove these
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>

// Single-character tokens recognized by the tokenizer: list open, list close,
// and the string-literal delimiter.
#define START_LIST_CHAR '('
#define END_LIST_CHAR ')'
#define QUOTE_CHAR '"'

// Symbol spellings that nl_parse_token_atom() refuses to turn into atoms.
// The entries are the string forms of the three character macros above, so
// the two lists have to be kept in sync by hand.
static const char *reserved_symbols[] = {
    "(",
    ")",
    "\"",
};

// TODO: remove malloc and strndup calls

// True for the characters the tokenizer skips between tokens:
// space, newline, and horizontal tab. Nothing else counts as whitespace.
static bool nl_is_whitespace(char c) {
    switch (c) {
        case ' ':
        case '\n':
        case '\t':
            return true;
        default:
            return false;
    }
}

// True when `c` is one of the single-character tokens: the list opener,
// the list closer, or the string quote.
static bool nl_is_token(char c) {
    switch (c) {
        case START_LIST_CHAR:
        case END_LIST_CHAR:
        case QUOTE_CHAR:
            return true;
        default:
            return false;
    }
}

// True when `c` ends a bare symbol: the string terminator, any whitespace
// character, or any single-character token.
static bool nl_is_delimiter(char c) {
    if (nl_is_whitespace(c)) {
        return true;
    }
    if (nl_is_token(c)) {
        return true;
    }
    return c == '\0';
}

/*
 * Scan one token starting at *curr_src and advance *curr_src past it.
 *
 * Leading whitespace is skipped. The token is then one of:
 *   - START_LIST_CHAR or END_LIST_CHAR, returned as a one-character symbol;
 *   - a QUOTE_CHAR-delimited string, returned as a string object holding the
 *     text between the quotes (escape sequences are not supported yet);
 *   - anything else up to the next delimiter, returned as a symbol.
 *
 * The token object is wrapped in a fresh cell (token in car, NULL in cdr) so
 * the caller can chain tokens through cdr.
 *
 * Returns NULL when no token can be produced:
 *   - the input is exhausted;
 *   - a string literal is missing its closing quote (*curr_src is moved to
 *     the terminating NUL, so tokenizing stops there instead of reading past
 *     the end of the buffer);
 *   - duplicating the token text failed (*curr_src stays at the token start).
 * This interface has no separate error channel, so callers cannot tell these
 * cases apart.
 *
 * NOTE(review): the duplicated text is handed to nl_symbol_create() /
 * nl_string_create(), which presumably take ownership of it -- confirm.
 */
struct nl_object *nl_token_next(const char **curr_src) {
    assert(curr_src != NULL);
    assert(*curr_src != NULL);

    // Skip separators first so `start` lands on the token's first character.
    while (nl_is_whitespace(**curr_src)) {
        (*curr_src)++;
    }

    const char *start = *curr_src;
    char *str = NULL;

    switch (*start) {
        case '\0':
            return NULL;
        case START_LIST_CHAR:
        case END_LIST_CHAR:
            str = strndup(start, 1);
            if (str == NULL) {
                return NULL;
            }
            *curr_src = start + 1;
            return nl_cell_create(nl_symbol_create(str), NULL);
        case QUOTE_CHAR: {
            // TODO: Support escaping
            const char *body = start + 1;
            const char *closing = strchr(body, QUOTE_CHAR);
            if (closing == NULL) {
                // Unterminated literal: consume the rest of the input and
                // stop, rather than scanning beyond the NUL terminator.
                *curr_src = body + strlen(body);
                return NULL;
            }
            str = strndup(body, (size_t)(closing - body));
            if (str == NULL) {
                return NULL;
            }
            // Resume after the closing quote.
            *curr_src = closing + 1;
            return nl_cell_create(nl_string_create(str), NULL);
        }
        //case '0':
            // TODO: Parse integers
        default: {
            // The first character is not a delimiter here, so the symbol
            // is at least one character long.
            const char *end = start;
            while (!nl_is_delimiter(*end)) {
                end++;
            }
            str = strndup(start, (size_t)(end - start));
            if (str == NULL) {
                return NULL;
            }
            *curr_src = end;
            return nl_cell_create(nl_symbol_create(str), NULL);
        }
    }
}

struct nl_object *nl_tokenize(const char *source) {
    struct nl_object *tokens = NULL, *curr_token = NULL, **next_token = &tokens;
    const char *curr_src = source;

    while ((curr_token = nl_token_next(&curr_src)) != NULL) {
        *next_token = curr_token;
        next_token = &curr_token->cell.cdr;
    }

    return tokens;
}

/*
 * Build a standalone atom from a token object.
 *
 * Symbol tokens whose text matches an entry in reserved_symbols are not
 * atoms (they drive list structure), so NULL is returned for them. Other
 * symbol tokens and all string tokens are copied: the text is duplicated
 * with strdup() and wrapped in a new symbol/string object, leaving the token
 * itself untouched.
 *
 * Returns NULL for a reserved symbol, for a NULL token, when duplicating the
 * text fails, and (after asserting in debug builds) for token types that are
 * not valid or not supported yet (cells, ints).
 */
struct nl_object *nl_parse_token_atom(struct nl_object *maybe_atom) {
    if (maybe_atom == NULL) {
        return NULL;
    }

    char *copy = NULL;
    switch (maybe_atom->type) {
        case NL_TYPE_CELL:
            // Cell types currently are not valid for tokens
            assert(false);
            break;
        case NL_TYPE_SYMBOL:
            // Check for reserved tokens first, which indicate special, non-atom behavior
            for (size_t i = 0; i < ARRAY_SIZE(reserved_symbols); i++) {
                if (strcmp(maybe_atom->symbol, reserved_symbols[i]) == 0) {
                    return NULL;
                }
            }
            copy = strdup(maybe_atom->symbol);
            if (copy == NULL) {
                return NULL;
            }
            return nl_symbol_create(copy);
        case NL_TYPE_STRING:
            copy = strdup(maybe_atom->string);
            if (copy == NULL) {
                return NULL;
            }
            return nl_string_create(copy);
        case NL_TYPE_INT:
            // TODO: Copy the token
            assert(0);
            break;
        case NL_TYPE_COUNT:
        default:
            // Not a real object type
            assert(false);
            break;
    }
    return NULL;
}

/*
 * Parse one s-expression (an atom or a complete list) starting at
 * *token_iter, and advance *token_iter past the tokens it consumed.
 *
 * Returns:
 *  - the atom or list found at *token_iter on success; an empty list "()"
 *    is returned as a cell with NULL car and NULL cdr;
 *  - NULL if *token_iter is already at the end of the token list;
 *  - NULL if the first token is an unmatched END_LIST_CHAR;
 *  - NULL if a list is opened but the tokens run out before its
 *    END_LIST_CHAR;
 *  - NULL if a nested element fails to parse or an object cannot be created.
 * On failure, any partially built list is released with nl_object_delete().
 */
static struct nl_object *nl_parse_tokens_recursive(struct nl_object **token_iter) {
    assert(token_iter != NULL);
    if (*token_iter == NULL) {
        return NULL;
    }

    assert((*token_iter)->type == NL_TYPE_CELL);

    struct nl_object *token = (*token_iter)->cell.car;
    if (token == NULL) {
        return NULL;
    }

    struct nl_object *next_sexp = nl_parse_token_atom(token);
    if (next_sexp != NULL) {
        *token_iter = (*token_iter)->cell.cdr;
        return next_sexp;
    }

    // Only a list opener can start a non-atom s-expression. A stray
    // END_LIST_CHAR (mismatched parens) or a token that could not be turned
    // into an atom is a parse error.
    if (token->type != NL_TYPE_SYMBOL || token->symbol[0] != START_LIST_CHAR) {
        return NULL;
    }

    struct nl_object *list = NULL;
    struct nl_object **next_node = &list;

    // Consume the START_LIST_CHAR
    *token_iter = (*token_iter)->cell.cdr;

    while (*token_iter != NULL) {
        assert((*token_iter)->type == NL_TYPE_CELL);
        token = (*token_iter)->cell.car;

        // The type check matters: a string token whose text begins with ')'
        // must not be mistaken for the closing paren.
        if (token != NULL && token->type == NL_TYPE_SYMBOL
                && token->symbol[0] == END_LIST_CHAR) {
            *token_iter = (*token_iter)->cell.cdr;
            if (list == NULL) {
                list = nl_cell_create(NULL, NULL);
            }
            return list;
        }

        next_sexp = nl_parse_tokens_recursive(token_iter);
        if (next_sexp == NULL) {
            // Error somewhere in the recursive parsing
            nl_object_delete(list);
            return NULL;
        }

        struct nl_object *node = nl_cell_create(next_sexp, NULL);
        if (node == NULL) {
            nl_object_delete(next_sexp);
            nl_object_delete(list);
            return NULL;
        }
        *next_node = node;
        next_node = &node->cell.cdr;
    }

    // Ran out of tokens before the matching END_LIST_CHAR
    nl_object_delete(list);
    return NULL;
}

// nl_parse_tokens           -> doesn't care about quotes, terminates on EOF
// nl_parse_tokens_recursive -> error on EOF

/*
 * Parse an entire token list into a list of s-expressions.
 *
 * Repeatedly parses one s-expression from the front of `tokens` and appends
 * it, wrapped in a new cell, to the result. If any s-expression fails to
 * parse, everything built so far is deleted and NULL is returned. An empty
 * token list yields a single cell with NULL car and NULL cdr.
 */
struct nl_object *nl_parse_tokens(struct nl_object *tokens) {
    struct nl_object *parsed = NULL;
    // Points at the slot (list head or last cdr) that receives the next cell.
    struct nl_object **link = &parsed;

    for (;;) {
        if (tokens == NULL) {
            break;
        }

        struct nl_object *sexp = nl_parse_tokens_recursive(&tokens);
        if (sexp == NULL) {
            nl_object_delete(parsed);
            return NULL;
        }

        *link = nl_cell_create(sexp, NULL);
        link = &(*link)->cell.cdr;
    }

    return parsed != NULL ? parsed : nl_cell_create(NULL, NULL);
}

// TODO: Should this parse a single sexp (and return the last-parsed position),
// or all sexps in the source (and return a list of sexps)?
/*
 * Parse `source` text into s-expressions.
 *
 * Tokenizes the text, parses the token list, then deletes the token list.
 * Returns whatever nl_parse_tokens() returned for those tokens.
 */
struct nl_object *nl_parse(const char *source) {
    struct nl_object *token_list = nl_tokenize(source);
    struct nl_object *parsed = nl_parse_tokens(token_list);

    // Atoms in the result are built from strdup'd copies of the token text,
    // so the tokens are no longer needed once parsing is done.
    nl_object_delete(token_list);

    return parsed;
}

/* struct ParseResult *nl_parse(const char *source) { */
/*     struct Cell *tokens = nl_tokenize(source); */
/*     struct Cell *sexp = n */

/* } */