224 lines
6.6 KiB
C
224 lines
6.6 KiB
C
#include "nihilispm.h"
|
|
#include "nihilispm_internal.h"
|
|
|
|
// TODO: remove these
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <assert.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
// Single-character tokens with structural meaning to the tokenizer.
#define START_LIST_CHAR '('  // opens a list
#define END_LIST_CHAR   ')'  // closes a list
#define QUOTE_CHAR      '"'  // opens and closes a string literal
|
|
|
|
// Symbol spellings that nl_parse_token_atom refuses to turn into atoms.
// Each entry is the string form of one of the token characters above.
static const char *reserved_symbols[] = {
    "(",   // START_LIST_CHAR
    ")",   // END_LIST_CHAR
    "\"",  // QUOTE_CHAR
};
|
|
|
|
// TODO: remove malloc and strndup calls
|
|
|
|
// Returns true when `c` is a space, a newline, or a horizontal tab.
// No other character (including '\r') counts as whitespace here.
static bool nl_is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\n':
    case '\t':
        return true;
    default:
        return false;
    }
}
|
|
|
|
static bool nl_is_token(char c) {
|
|
return c == START_LIST_CHAR || c == END_LIST_CHAR || c == QUOTE_CHAR;
|
|
}
|
|
|
|
// Returns true when `c` ends a bare symbol: whitespace, a structural token
// character, or the terminating NUL of the source string.
static bool nl_is_delimiter(char c) {
    if (nl_is_whitespace(c)) {
        return true;
    }
    if (nl_is_token(c)) {
        return true;
    }
    return c == '\0';
}
|
|
|
|
struct nl_object *nl_token_next(const char **curr_src) {
|
|
assert(*curr_src != NULL);
|
|
const char *start = *curr_src;
|
|
|
|
while (true) {
|
|
char *str = NULL;
|
|
switch (**curr_src) {
|
|
case '\0':
|
|
return NULL;
|
|
case ' ':
|
|
case '\n':
|
|
case '\t':
|
|
start++;
|
|
(*curr_src)++;
|
|
continue;
|
|
case START_LIST_CHAR:
|
|
case END_LIST_CHAR:
|
|
str = malloc(sizeof(char) * 2);
|
|
str[0] = **curr_src;
|
|
str[1] = '\0';
|
|
(*curr_src)++;
|
|
return nl_cell_create(nl_symbol_create(str), NULL);
|
|
case QUOTE_CHAR:
|
|
// skip beginning quote
|
|
(*curr_src)++;
|
|
// TODO: Support escaping
|
|
while (**curr_src != '"') {
|
|
(*curr_src)++;
|
|
}
|
|
// skip end quote
|
|
(*curr_src)++;
|
|
// -2 for removing start/end quotes
|
|
str = strndup(start + 1, *curr_src - start - 2);
|
|
return nl_cell_create(nl_string_create(str), NULL);
|
|
//case '0':
|
|
// TODO: Parse integers
|
|
default:
|
|
while (!nl_is_delimiter(**curr_src)) {
|
|
(*curr_src)++;
|
|
}
|
|
str = strndup(start, *curr_src - start);
|
|
return nl_cell_create(nl_symbol_create(str), NULL);
|
|
}
|
|
}
|
|
|
|
// Unreachable
|
|
assert(false);
|
|
return NULL;
|
|
}
|
|
|
|
struct nl_object *nl_tokenize(const char *source) {
|
|
struct nl_object *tokens = NULL, *curr_token = NULL, **next_token = &tokens;
|
|
const char *curr_src = source;
|
|
|
|
while ((curr_token = nl_token_next(&curr_src)) != NULL) {
|
|
*next_token = curr_token;
|
|
next_token = &curr_token->cell.cdr;
|
|
}
|
|
|
|
return tokens;
|
|
}
|
|
|
|
struct nl_object *nl_parse_token_atom(struct nl_object *maybe_atom) {
|
|
struct nl_object *atom = NULL;
|
|
switch (maybe_atom->type) {
|
|
case NL_TYPE_CELL:
|
|
// Cell types currently are not valid for tokens
|
|
assert(false);
|
|
break;
|
|
case NL_TYPE_SYMBOL: {
|
|
// Check for reserved tokens first, which indicate special, non-atom behavior
|
|
for (int i = 0; i < ARRAY_SIZE(reserved_symbols); i++) {
|
|
if (!strcmp(maybe_atom->string, reserved_symbols[i])) {
|
|
return NULL;
|
|
}
|
|
}
|
|
atom = nl_symbol_create(strdup(maybe_atom->symbol));
|
|
break;
|
|
}
|
|
case NL_TYPE_STRING:
|
|
atom = nl_string_create(strdup(maybe_atom->string));
|
|
break;
|
|
case NL_TYPE_INT:
|
|
// TODO: Copy the token
|
|
assert(0);
|
|
break;
|
|
case NL_TYPE_COUNT:
|
|
assert(false);
|
|
}
|
|
return atom;
|
|
}
|
|
|
|
static struct nl_object *nl_parse_tokens_recursive(struct nl_object **token_iter) {
|
|
// Invariants:
|
|
// - This function returns NULL if it encounters the first unmatched END_LIST_CHAR
|
|
// - This function returns NULL if it encounters end-of-list without first finding a START_LIST_CHAR
|
|
// - This function returns an nl_object of the atom or list at **token_iter
|
|
assert(token_iter != NULL);
|
|
if (*token_iter == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
assert((*token_iter)->type == NL_TYPE_CELL);
|
|
|
|
struct nl_object *token = (*token_iter)->cell.car;
|
|
struct nl_object *next_sexp = nl_parse_token_atom(token);
|
|
if (next_sexp != NULL) {
|
|
*token_iter = (*token_iter)->cell.cdr;
|
|
return next_sexp;
|
|
}
|
|
assert(token->type == NL_TYPE_SYMBOL);
|
|
|
|
if (token->symbol[0] == START_LIST_CHAR) {
|
|
struct nl_object *list = NULL;
|
|
struct nl_object **next_node = &list;
|
|
// Consume the START_LIST_CHAR
|
|
*token_iter = (*token_iter)->cell.cdr;
|
|
while (1) {
|
|
token = (*token_iter)->cell.car;
|
|
if (token->symbol[0] == END_LIST_CHAR) {
|
|
*token_iter = (*token_iter)->cell.cdr;
|
|
if (list == NULL) {
|
|
list = nl_cell_create(NULL, NULL);
|
|
}
|
|
return list;
|
|
}
|
|
|
|
next_sexp = nl_parse_tokens_recursive(token_iter);
|
|
if (next_sexp == NULL) {
|
|
// Error somewhere in the recursive parsing
|
|
nl_object_delete(list);
|
|
return NULL;
|
|
}
|
|
|
|
*next_node = nl_cell_create(next_sexp, NULL);
|
|
next_node = &(*next_node)->cell.cdr;
|
|
}
|
|
} else if (token->symbol[0] == END_LIST_CHAR) {
|
|
// Mismatched parens
|
|
return NULL;
|
|
}
|
|
// Any other symbol type should have been an atom, this shouldn't happen
|
|
assert(false);
|
|
return NULL;
|
|
}
|
|
|
|
// nl_parse_tokens           -> doesn't care about quotes, terminates on EOF
// nl_parse_tokens_recursive -> error on EOF
|
|
|
|
struct nl_object *nl_parse_tokens(struct nl_object *tokens) {
|
|
struct nl_object* resultl = NULL;
|
|
struct nl_object** token_iter = &tokens;
|
|
struct nl_object** next_cell = &resultl;
|
|
|
|
while (*token_iter != NULL) {
|
|
struct nl_object *new_sexp = nl_parse_tokens_recursive(token_iter);
|
|
if (new_sexp == NULL) {
|
|
goto error;
|
|
}
|
|
|
|
*next_cell = nl_cell_create(new_sexp, NULL);
|
|
next_cell = &(*next_cell)->cell.cdr;
|
|
}
|
|
|
|
if (resultl == NULL) {
|
|
return nl_cell_create(NULL, NULL);
|
|
}
|
|
|
|
return resultl;
|
|
error:
|
|
nl_object_delete(resultl);
|
|
return NULL;
|
|
}
|
|
|
|
// TODO: Should this parse a single sexp (and return the last-parsed position),
// or all sexps in the source (and return a list of sexps)?
|
|
// Tokenizes `source`, parses the tokens, and returns whatever nl_parse_tokens
// produced (NULL on a parse error).
struct nl_object *nl_parse(const char *source) {
    struct nl_object *token_list = nl_tokenize(source);
    struct nl_object *parsed = nl_parse_tokens(token_list);

    // Atoms in the parsed result are built from strdup'd copies of the token
    // text, so the token list is released before returning.
    nl_object_delete(token_list);

    return parsed;
}
|
|
|
|
/* struct ParseResult *nl_parse(const char *source) { */
|
|
/* struct Cell *tokens = nl_tokenize(source); */
|
|
/* struct Cell *sexp = n */
|
|
|
|
/* } */
|