#include "nihilispm.h"
#include "nihilispm_internal.h"

// TODO: remove these
// NOTE(review): the header names of these six #include directives were missing
// from the reviewed text; they are reconstructed from the identifiers used in
// this file (assert, bool, NULL, malloc, strcmp/strdup/strndup) -- confirm.
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define START_LIST_CHAR '('
#define END_LIST_CHAR ')'
#define QUOTE_CHAR '"'

// Symbol tokens that are syntax rather than atoms.
static const char *reserved_symbols[] = {
    "(",
    ")",
    "\"",
};

// TODO: remove malloc and strndup calls

// True for the characters the tokenizer skips between tokens.
static bool nl_is_whitespace(char c)
{
    return c == ' ' || c == '\n' || c == '\t';
}

// True for the single-character syntax tokens.
static bool nl_is_token(char c)
{
    return c == START_LIST_CHAR || c == END_LIST_CHAR || c == QUOTE_CHAR;
}

// True for any character that ends a bare symbol, including the terminating NUL.
static bool nl_is_delimiter(char c)
{
    return nl_is_whitespace(c) || nl_is_token(c) || c == '\0';
}

/*
 * Reads the next token from *curr_src and advances *curr_src past it.
 *
 * Returns a single cell (cdr == NULL) whose car is:
 *   - a symbol "(" or ")" for list delimiters,
 *   - a string holding the text between two QUOTE_CHARs (no escapes yet),
 *   - a symbol holding any other run of non-delimiter characters.
 *
 * Returns NULL at end of input. NULL is also returned if the token text cannot
 * be allocated; callers cannot currently tell the two apart.
 *
 * An unterminated string literal ends at the end of input instead of reading
 * past the terminating NUL.
 */
struct nl_object *nl_token_next(const char **curr_src)
{
    assert(curr_src != NULL);
    assert(*curr_src != NULL);

    while (nl_is_whitespace(**curr_src)) {
        (*curr_src)++;
    }

    const char *start = *curr_src;
    char *str = NULL;

    switch (*start) {
    case '\0':
        return NULL;

    case START_LIST_CHAR:
    case END_LIST_CHAR:
        str = malloc(2);
        if (str == NULL) {
            return NULL;
        }
        str[0] = *start;
        str[1] = '\0';
        *curr_src = start + 1;
        return nl_cell_create(nl_symbol_create(str), NULL);

    case QUOTE_CHAR: {
        // TODO: Support escaping
        const char *body = start + 1; // skip beginning quote
        const char *end = body;

        // Stop at the NUL as well, so a missing closing quote cannot run
        // off the end of the source buffer.
        while (*end != QUOTE_CHAR && *end != '\0') {
            end++;
        }

        str = strndup(body, (size_t)(end - body));
        if (str == NULL) {
            return NULL;
        }

        // Skip the end quote only if there is one; otherwise stay on the NUL
        // so the next call reports end of input.
        // TODO: report an unterminated string once there is an error channel
        *curr_src = (*end == QUOTE_CHAR) ? end + 1 : end;
        return nl_cell_create(nl_string_create(str), NULL);
    }

    // case '0': // TODO: Parse integers

    default: {
        const char *end = start;

        while (!nl_is_delimiter(*end)) {
            end++;
        }

        str = strndup(start, (size_t)(end - start));
        if (str == NULL) {
            return NULL;
        }

        *curr_src = end;
        return nl_cell_create(nl_symbol_create(str), NULL);
    }
    }
}

/*
 * Tokenizes the whole of `source` into a list of token cells chained through
 * cell.cdr, in source order. Returns NULL when `source` holds no tokens.
 */
struct nl_object *nl_tokenize(const char *source)
{
    struct nl_object *tokens = NULL;
    struct nl_object **next_token = &tokens;
    struct nl_object *curr_token = NULL;
    const char *curr_src = source;

    while ((curr_token = nl_token_next(&curr_src)) != NULL) {
        *next_token = curr_token;
        next_token = &curr_token->cell.cdr;
    }

    return tokens;
}

/*
 * Converts a token into a freshly created atom carrying a copy of the token's
 * text. Returns NULL when the token is one of reserved_symbols, i.e. syntax
 * that the caller has to handle itself.
 */
struct nl_object *nl_parse_token_atom(struct nl_object *maybe_atom)
{
    struct nl_object *atom = NULL;

    switch (maybe_atom->type) {
    case NL_TYPE_CELL:
        // Cell types currently are not valid for tokens
        assert(false);
        break;

    case NL_TYPE_SYMBOL:
        // Check for reserved tokens first, which indicate special, non-atom behavior
        for (size_t i = 0; i < ARRAY_SIZE(reserved_symbols); i++) {
            if (strcmp(maybe_atom->symbol, reserved_symbols[i]) == 0) {
                return NULL;
            }
        }
        atom = nl_symbol_create(strdup(maybe_atom->symbol));
        break;

    case NL_TYPE_STRING:
        atom = nl_string_create(strdup(maybe_atom->string));
        break;

    case NL_TYPE_INT:
        // TODO: Copy the token
        assert(0);
        break;

    case NL_TYPE_COUNT:
        assert(false);
    }

    return atom;
}

/*
 * Parses one s-expression (an atom or a complete parenthesized list) starting
 * at **token_iter and advances *token_iter past the tokens it consumed.
 *
 * Invariants:
 * - Returns NULL if it encounters an unmatched END_LIST_CHAR
 * - Returns NULL if the tokens run out (at entry, or inside a list before its
 *   END_LIST_CHAR is found)
 * - Otherwise returns an nl_object of the atom or list at **token_iter; an
 *   empty list "()" is returned as a cell whose car and cdr are both NULL
 */
static struct nl_object *nl_parse_tokens_recursive(struct nl_object **token_iter)
{
    assert(token_iter != NULL);

    if (*token_iter == NULL) {
        return NULL;
    }

    assert((*token_iter)->type == NL_TYPE_CELL);

    struct nl_object *token = (*token_iter)->cell.car;
    struct nl_object *next_sexp = nl_parse_token_atom(token);

    if (next_sexp != NULL) {
        *token_iter = (*token_iter)->cell.cdr;
        return next_sexp;
    }

    assert(token->type == NL_TYPE_SYMBOL);

    if (token->symbol[0] == END_LIST_CHAR) {
        // Mismatched parens
        return NULL;
    }

    if (token->symbol[0] != START_LIST_CHAR) {
        // Any other symbol type should have been an atom, this shouldn't happen
        assert(false);
        return NULL;
    }

    struct nl_object *list = NULL;
    struct nl_object **next_node = &list;

    // Consume the START_LIST_CHAR
    *token_iter = (*token_iter)->cell.cdr;

    while (true) {
        if (*token_iter == NULL) {
            // Tokens ran out before the matching END_LIST_CHAR
            nl_object_delete(list);
            return NULL;
        }

        token = (*token_iter)->cell.car;

        // Only a symbol token can close the list; a string token whose text
        // happens to start with ')' is an ordinary element.
        if (token->type == NL_TYPE_SYMBOL && token->symbol[0] == END_LIST_CHAR) {
            *token_iter = (*token_iter)->cell.cdr;
            if (list == NULL) {
                list = nl_cell_create(NULL, NULL);
            }
            return list;
        }

        next_sexp = nl_parse_tokens_recursive(token_iter);
        if (next_sexp == NULL) {
            // Error somewhere in the recursive parsing
            nl_object_delete(list);
            return NULL;
        }

        *next_node = nl_cell_create(next_sexp, NULL);
        next_node = &(*next_node)->cell.cdr;
    }
}

// parse_tokens -> doesn't care about quotes, terminates on EOF
// parse_tokens_recursive -> error on EOF
/*
 * Parses every s-expression in the token list and returns them as a list of
 * cells chained through cell.cdr. An empty token list yields a single cell
 * whose car and cdr are both NULL. Returns NULL if any s-expression fails to
 * parse; the partial result is passed to nl_object_delete first.
 */
struct nl_object *nl_parse_tokens(struct nl_object *tokens)
{
    struct nl_object *result = NULL;
    struct nl_object **next_cell = &result;
    struct nl_object **token_iter = &tokens;

    while (*token_iter != NULL) {
        struct nl_object *new_sexp = nl_parse_tokens_recursive(token_iter);

        if (new_sexp == NULL) {
            nl_object_delete(result);
            return NULL;
        }

        *next_cell = nl_cell_create(new_sexp, NULL);
        next_cell = &(*next_cell)->cell.cdr;
    }

    if (result == NULL) {
        return nl_cell_create(NULL, NULL);
    }

    return result;
}

// TODO: Should this parse a single sexp (return the last-parsed position), or
// all sexps in the source (return a list of sexps)?
/*
 * Tokenizes and parses `source`, returning what nl_parse_tokens returns for
 * its tokens. The intermediate token list is handed to nl_object_delete before
 * returning.
 */
struct nl_object *nl_parse(const char *source)
{
    struct nl_object *tokens = nl_tokenize(source);
    struct nl_object *sexp = nl_parse_tokens(tokens);

    nl_object_delete(tokens);
    return sexp;
}

/* struct ParseResult *nl_parse(const char *source) { */
/*     struct Cell *tokens = nl_tokenize(source); */
/*     struct Cell *sexp = n */
/* } */