minishell/parser/src/parser.c
2024-09-05 19:04:57 +00:00

1603 lines
49 KiB
C

/* ************************************************************************** */
/* */
/* ::: :::::::: */
/* parser.c :+: :+: :+: */
/* +:+ +:+ +:+ */
/* By: maiboyer <maiboyer@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/03 14:08:00 by maiboyer #+# #+# */
/* Updated: 2024/09/03 14:08:01 by maiboyer ### ########.fr */
/* */
/* ************************************************************************** */
#include "me/mem/mem.h"
#include "me/types.h"
#include "me/vec/vec_subtree.h"
#include "parser/api.h"
#include "parser/array.h"
#include "parser/language.h"
#include "parser/length.h"
#include "parser/lexer.h"
#include "parser/reduce_action.h"
#include "parser/stack.h"
#include "parser/subtree.h"
#include "parser/tree.h"
#include <assert.h>
#include <stdio.h>
// Limit on the number of stack versions: recovery stops adding alternatives
// once the count exceeds it, and potential-reduction passes stop renumbering
// after that many iterations.
#define MAX_VERSION_COUNT 40
// Extra headroom a reduction may temporarily use on top of MAX_VERSION_COUNT
// before freshly created versions are dropped.
#define MAX_VERSION_COUNT_OVERFLOW 60
// NOTE(review): not referenced in the visible part of this file; presumably
// the depth passed when recording a stack summary -- confirm at the use site.
#define MAX_SUMMARY_DEPTH 1
// Threshold on the weighted cost gap above which one stack version is
// considered decisively better than another. Parenthesized so the macro
// expands as a single operand in any expression (e.g. `x / MAX_COST_DIFFERENCE`).
#define MAX_COST_DIFFERENCE (16 * ERROR_COST_PER_SKIPPED_TREE)
// Short aliases for the helper types defined just below in this file.
typedef struct s_error_status t_error_status;
typedef enum e_error_comparison t_error_comparison;
typedef struct s_string_input t_string_input;
// Prototype for a lexer routine this file calls directly when positioning a
// missing token; its definition is not part of this file's visible portion.
void ts_lexer__mark_end(TSLexer *_self);
// Complete state of one parser instance.
struct TSParser
{
// Lexer that is reset/started/finished around every token read.
t_lexer lexer;
// Parse stack; it holds several numbered "versions" at once.
t_stack *stack;
// Grammar tables and callbacks (lex modes, lex functions, external scanner).
const TSLanguage *language;
// Scratch set, cleared and refilled while collecting possible reductions.
ReduceActionSet reduce_actions;
// Best tree accepted so far (NULL until a version has been accepted).
t_subtree finished_tree;
// Scratch arrays receiving the trailing extra tokens stripped from popped
// children; the second one is used while comparing an alternative child list.
t_vec_subtree trailing_extras;
t_vec_subtree trailing_extras2;
// Scratch array used to build a temporary node when comparing child lists.
t_vec_subtree scratch_trees;
// Opaque value returned by the external scanner's create() callback.
void *external_scanner_payload;
// Incremented once for every tree considered when a version is accepted.
t_u32 accept_count;
// NOTE(review): not touched in the visible part of this file.
t_u32 operation_count;
// Checked right after an external scan; lexing gives up when it is set.
// NOTE(review): where it gets set is not visible here.
bool has_scanner_error;
};
// Snapshot of how "bad" a stack version currently is; two of these are
// compared to decide which version to keep.
struct s_error_status
{
// Accumulated error cost of the version (a paused version is charged one
// extra ERROR_COST_PER_SKIPPED_TREE when the snapshot is taken).
t_u32 cost;
// Value of ts_stack_node_count_since_error() for the version.
t_u32 node_count;
// Value of ts_stack_dynamic_precedence() for the version.
int dynamic_precedence;
// True when the version is paused or sits in ERROR_STATE.
bool is_in_error;
};
// Outcome of comparing two version statuses ("left" is the first argument,
// "right" the second). "Take" means decisively better, "Prefer" means better
// but not by enough to discard the other side outright.
enum e_error_comparison
{
ECTakeLeft,
ECPreferLeft,
// Neither side is better.
ECNone,
ECPreferRight,
ECTakeRight,
};
// In-memory input source consumed by ts_string_input_read().
struct s_string_input
{
// First byte of the buffer.
const t_u8 *string;
// Number of readable bytes in `string`.
t_u32 length;
};
// StringInput
// Read callback over an in-memory buffer.
//
// `_self` must point to a t_string_input. Starting at offset `byte`, the
// whole remainder of the buffer is handed back in one piece: `*length`
// receives the number of bytes available and the returned pointer addresses
// the first of them. Past the end of the buffer `*length` is 0 and an empty
// string is returned. `point` is ignored.
static const t_u8 *ts_string_input_read(void *_self, t_u32 byte, TSPoint point, t_u32 *length)
{
	t_string_input	*input;

	(void)point;
	input = (t_string_input *)_self;
	if (byte < input->length)
	{
		*length = input->length - byte;
		return (input->string + byte);
	}
	*length = 0;
	return ((const t_u8 *)"");
}
// Parser - Private
// Replaces the pending node(s) on top of `version` with their children.
//
// Each round pops the pending entries, pushes every child of the popped
// parent back (tracking the parse state child by child), re-pushes the other
// subtrees that came off with it, then releases the parent. Rounds repeat
// for as long as the last child pushed has children of its own.
// Returns true when at least one node was broken down.
static bool ts_parser__breakdown_top_of_stack(TSParser *self, t_stack_version version)
{
	t_stack_slice_array	popped;
	t_stack_slice		cur;
	t_subtree			node;
	t_subtree			kid;
	TSStateId			st;
	t_u32				slice_idx;
	t_u32				kid_idx;
	t_u32				kid_count;
	bool				has_pending;
	bool				broke_down;

	broke_down = false;
	has_pending = false;
	while (true)
	{
		popped = ts_stack_pop_pending(self->stack, version);
		if (popped.size == 0)
			break;
		broke_down = true;
		has_pending = false;
		slice_idx = 0;
		while (slice_idx < popped.size)
		{
			cur = popped.contents[slice_idx];
			st = ts_stack_state(self->stack, cur.version);
			// The node being broken down is the first popped subtree.
			node = cur.subtrees.buffer[0];
			kid_count = ts_subtree_child_count(node);
			kid_idx = 0;
			while (kid_idx < kid_count)
			{
				kid = ts_subtree_children(node)[kid_idx];
				has_pending = (ts_subtree_child_count(kid) > 0);
				if (ts_subtree_is_error(kid))
					st = ERROR_STATE;
				else if (!ts_subtree_extra(kid))
					st = ts_language_next_state(self->language, st, ts_subtree_symbol(kid));
				// The stack takes its own reference on the child.
				kid->ref_count++;
				ts_stack_push(self->stack, cur.version, kid, has_pending, st);
				kid_idx++;
			}
			// Whatever was popped together with the node goes back on top.
			kid_idx = 1;
			while (kid_idx < cur.subtrees.len)
			{
				ts_stack_push(self->stack, cur.version, cur.subtrees.buffer[kid_idx], false, st);
				kid_idx++;
			}
			ts_subtree_release(node);
			array_delete(&cur.subtrees);
			slice_idx++;
		}
		if (!has_pending)
			break;
	}
	return (broke_down);
}
// Ranks two version statuses against each other (`a` is "left", `b` is
// "right").
//
// 1. If exactly one side is in an error, the other side wins; it is a
//    decisive "Take" only when its cost is also strictly lower.
// 2. Otherwise the cheaper side wins; the win is decisive when the cost gap,
//    weighted by (1 + winner's node_count), exceeds MAX_COST_DIFFERENCE.
// 3. With equal costs the higher dynamic precedence is preferred.
// 4. Otherwise ECNone.
// `self` is unused.
static t_error_comparison ts_parser__compare_versions(TSParser *self, t_error_status a, t_error_status b)
{
	(void)self;
	if (a.is_in_error != b.is_in_error)
	{
		if (b.is_in_error)
		{
			if (a.cost < b.cost)
				return (ECTakeLeft);
			return (ECPreferLeft);
		}
		if (b.cost < a.cost)
			return (ECTakeRight);
		return (ECPreferRight);
	}
	if (a.cost < b.cost)
	{
		if ((b.cost - a.cost) * (1 + a.node_count) > MAX_COST_DIFFERENCE)
			return (ECTakeLeft);
		return (ECPreferLeft);
	}
	if (b.cost < a.cost)
	{
		if ((a.cost - b.cost) * (1 + b.node_count) > MAX_COST_DIFFERENCE)
			return (ECTakeRight);
		return (ECPreferRight);
	}
	if (a.dynamic_precedence != b.dynamic_precedence)
	{
		if (a.dynamic_precedence > b.dynamic_precedence)
			return (ECPreferLeft);
		return (ECPreferRight);
	}
	return (ECNone);
}
// Builds the error-status snapshot of stack version `version`.
//
// A paused version is charged one extra ERROR_COST_PER_SKIPPED_TREE and is
// reported as being in an error, just like a version in ERROR_STATE.
static t_error_status ts_parser__version_status(TSParser *self, t_stack_version version)
{
	t_error_status	status;
	bool			paused;

	paused = ts_stack_is_paused(self->stack, version);
	status.cost = ts_stack_error_cost(self->stack, version);
	if (paused)
		status.cost += ERROR_COST_PER_SKIPPED_TREE;
	status.node_count = ts_stack_node_count_since_error(self->stack, version);
	status.dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version);
	status.is_in_error = (paused || ts_stack_state(self->stack, version) == ERROR_STATE);
	return (status);
}
// Tells whether continuing `version` with the hypothetical status
// (`is_in_error`, `cost`) is pointless because something better already
// exists.
//
// Returns true when:
//  - a finished tree exists whose error cost is <= `cost`, or
//  - another active version, positioned at or after this one, is decisively
//    better (ECTakeRight), or
//  - such a version is merely preferable (ECPreferRight) but could be merged
//    with `version`.
static bool ts_parser__better_version_exists(TSParser *self, t_stack_version version, bool is_in_error, t_u32 cost)
{
	t_error_status		mine;
	t_error_status		other;
	t_error_comparison	verdict;
	Length				my_position;
	t_stack_version		idx;
	t_stack_version		count;

	if (self->finished_tree && ts_subtree_error_cost(self->finished_tree) <= cost)
		return (true);
	my_position = ts_stack_position(self->stack, version);
	mine.cost = cost;
	mine.is_in_error = is_in_error;
	mine.dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version);
	mine.node_count = ts_stack_node_count_since_error(self->stack, version);
	count = ts_stack_version_count(self->stack);
	idx = 0;
	while (idx < count)
	{
		// Only other, still-active versions that are not behind us compete.
		if (idx != version && ts_stack_is_active(self->stack, idx)
			&& ts_stack_position(self->stack, idx).bytes >= my_position.bytes)
		{
			other = ts_parser__version_status(self, idx);
			verdict = ts_parser__compare_versions(self, mine, other);
			if (verdict == ECTakeRight)
				return (true);
			if (verdict == ECPreferRight && ts_stack_can_merge(self->stack, idx, version))
				return (true);
		}
		idx++;
	}
	return (false);
}
// static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexMode lex_mode)
// {
// return self->language->lex_fn(&self->lexer.data, lex_mode.lex_state);
// }
//
// static bool ts_parser__call_keyword_lex_fn(TSParser *self, TSLexMode lex_mode)
// {
// (void)(lex_mode);
// return self->language->keyword_lex_fn(&self->lexer.data, 0);
// }
// Instantiates the language's external scanner, if it has one.
//
// Nothing happens unless a language is set, it declares external scanner
// states, and it provides a create() callback; in that case the callback's
// result is stored as the parser's scanner payload.
static void ts_parser__external_scanner_create(TSParser *self)
{
	const TSLanguage	*lang;

	lang = self->language;
	if (!lang)
		return;
	if (!lang->external_scanner.states)
		return;
	if (!lang->external_scanner.create)
		return;
	self->external_scanner_payload = lang->external_scanner.create();
}
// Tears down the external scanner payload.
//
// The language's destroy() callback is invoked only when a language is set,
// a payload exists and the callback is provided. The payload pointer is
// reset to NULL in every case.
static void ts_parser__external_scanner_destroy(TSParser *self)
{
	void	*payload;

	payload = self->external_scanner_payload;
	if (self->language != NULL && payload != NULL)
	{
		if (self->language->external_scanner.destroy != NULL)
			self->language->external_scanner.destroy(payload);
	}
	self->external_scanner_payload = NULL;
}
// Asks the external scanner to serialize its state into the lexer's
// debug_buffer and returns the number of bytes it reports having written.
//
// Aborts through me_abort() when the scanner reports more than
// TREE_SITTER_SERIALIZATION_BUFFER_SIZE bytes.
static t_u32 ts_parser__external_scanner_serialize(TSParser *self)
{
	t_u32	length;

	length = self->language->external_scanner.serialize(self->external_scanner_payload, self->lexer.debug_buffer);
	// The invariant is `length <= BUFFER_SIZE`; the message quotes that
	// invariant so the abort reads like a failed assert().
	if (length > TREE_SITTER_SERIALIZATION_BUFFER_SIZE)
		me_abort("assertion failed in " __FILE__ " `length <= "
			"TREE_SITTER_SERIALIZATION_BUFFER_SIZE`");
	return (length);
}
// Restores the external scanner's state from the one recorded on
// `external_token`.
//
// When `external_token` is NULL the scanner's deserialize() callback is
// called with a NULL buffer and a length of 0.
static void ts_parser__external_scanner_deserialize(TSParser *self, t_subtree external_token)
{
	const t_u8	*data;
	t_u32		length;

	data = NULL;
	length = 0;
	if (external_token)
	{
		data = ts_external_scanner_state_data(&external_token->external_scanner_state);
		length = external_token->external_scanner_state.length;
	}
	self->language->external_scanner.deserialize(self->external_scanner_payload, data, length);
}
// Runs the language's external scanner on the parser's lexer, passing it the
// table of external tokens enabled for `external_lex_state`.
// Returns whatever the scanner's scan() callback returns.
static bool ts_parser__external_scanner_scan(TSParser *self, TSStateId external_lex_state)
{
	return (self->language->external_scanner.scan(
			self->external_scanner_payload,
			&self->lexer.data,
			ts_language_enabled_external_tokens(self->language, external_lex_state)));
}
// Lexes the next token for stack `version`, using the lex mode that the
// language associates with `parse_state`.
//
// Returns NULL when that mode's lex_state is (t_u16)-1, or when
// has_scanner_error is set after an external scan. Otherwise returns a new
// subtree: a leaf for a recognised token, or an error subtree covering the
// characters that had to be skipped before lexing could stop.
//
// Order of attempts in each loop round: external scanner (only when the mode
// has a non-zero external_lex_state), then the language's lex_fn. If both
// fail outside error mode, lexing restarts from the initial position with
// the ERROR_STATE lex mode; if they fail in error mode, at most one
// character is skipped per round and the round is retried.
static t_subtree ts_parser__lex(TSParser *self, t_stack_version version, TSStateId parse_state)
{
Length start_position;
t_subtree external_token;
TSLexMode lex_mode;
bool found_external_token;
bool error_mode;
bool skipped_error;
bool called_get_column;
t_i32 first_error_character;
Length error_start_position;
Length error_end_position;
t_u32 lookahead_end_byte;
t_u32 external_scanner_state_len;
bool external_scanner_state_changed;
bool found_token;
Length current_position;
t_subtree result;
Length padding;
Length size;
t_u32 lookahead_bytes;
bool is_keyword;
TSSymbol symbol;
t_u32 end_byte;
lex_mode = self->language->lex_modes[parse_state];
if (lex_mode.lex_state == (t_u16)-1)
return NULL;
start_position = ts_stack_position(self->stack, version);
external_token = ts_stack_last_external_token(self->stack, version);
found_external_token = false;
error_mode = parse_state == ERROR_STATE;
skipped_error = false;
called_get_column = false;
first_error_character = 0;
error_start_position = length_zero();
error_end_position = length_zero();
lookahead_end_byte = 0;
external_scanner_state_len = 0;
external_scanner_state_changed = false;
ts_lexer_reset(&self->lexer, start_position);
for (;;)
{
found_token = false;
current_position = self->lexer.current_position;
// First chance: the external scanner, restored to the state recorded on
// the last external token of this stack version.
if (lex_mode.external_lex_state != 0)
{
ts_lexer_start(&self->lexer);
ts_parser__external_scanner_deserialize(self, external_token);
found_token = ts_parser__external_scanner_scan(self, lex_mode.external_lex_state);
if (self->has_scanner_error)
return NULL;
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
if (found_token)
{
// The serialized scanner state lands in lexer.debug_buffer; it is
// compared with the previous token's state here and copied onto the
// resulting leaf at the end of the function.
external_scanner_state_len = ts_parser__external_scanner_serialize(self);
external_scanner_state_changed = !ts_external_scanner_state_eq(ts_subtree_external_scanner_state(external_token),
self->lexer.debug_buffer, external_scanner_state_len);
// When recovering from an error, ignore any
// zero-length external tokens unless they
// have changed the external scanner's
// state. This helps to avoid infinite loops
// which could otherwise occur, because the
// lexer is looking for any possible token,
// instead of looking for the specific set
// of tokens that are valid in some parse
// state.
//
// Note that it's possible that the token
// end position may be *before* the original
// position of the lexer because of the way
// that tokens are positioned at included
// range boundaries: when a token is
// terminated at the start of an included
// range, it is marked as ending at the
// *end* of the preceding included range.
if (self->lexer.token_end_position.bytes <= current_position.bytes &&
(error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) && !external_scanner_state_changed)
{
found_token = false;
}
}
if (found_token)
{
found_external_token = true;
called_get_column = self->lexer.did_get_column;
break;
}
// External scan failed or was rejected: rewind before the regular lexer runs.
ts_lexer_reset(&self->lexer, current_position);
}
// Second chance: the language's generated lexer.
ts_lexer_start(&self->lexer);
found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state);
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
if (found_token)
break;
// Nothing matched in a normal state: retry once from the start using the
// ERROR_STATE lex mode.
if (!error_mode)
{
error_mode = true;
lex_mode = self->language->lex_modes[ERROR_STATE];
ts_lexer_reset(&self->lexer, start_position);
continue;
}
// Nothing matched in error mode either: remember where the skipped span
// begins (first failure only) and which character started it.
if (!skipped_error)
{
skipped_error = true;
error_start_position = self->lexer.token_start_position;
error_end_position = self->lexer.token_start_position;
first_error_character = self->lexer.data.lookahead;
}
// If the failed attempt consumed nothing, skip one character by hand; at
// end of input stop with the builtin error symbol instead.
if (self->lexer.current_position.bytes == error_end_position.bytes)
{
if (self->lexer.data.eof(&self->lexer.data))
{
self->lexer.data.result_symbol = ts_builtin_sym_error;
break;
}
self->lexer.data.advance(&self->lexer.data, false);
}
error_end_position = self->lexer.current_position;
}
if (skipped_error)
{
// Characters were skipped: the result is an error subtree spanning them.
padding = length_sub(error_start_position, start_position);
size = length_sub(error_end_position, error_start_position);
lookahead_bytes = lookahead_end_byte - error_end_position.bytes;
result = ts_subtree_new_error(first_error_character, padding, size, lookahead_bytes, parse_state, self->language);
}
else
{
is_keyword = false;
symbol = self->lexer.data.result_symbol;
padding = length_sub(self->lexer.token_start_position, start_position);
size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position);
lookahead_bytes = lookahead_end_byte - self->lexer.token_end_position.bytes;
if (found_external_token)
{
// External scanners report indices that are translated through symbol_map.
symbol = self->language->external_scanner.symbol_map[symbol];
}
else if (symbol == self->language->keyword_capture_token && symbol != 0)
{
// The token is the keyword-capture token: re-lex the same span with the
// keyword lexer and adopt its symbol only if it ends at the same byte
// and the current parse state has actions for it.
end_byte = self->lexer.token_end_position.bytes;
ts_lexer_reset(&self->lexer, self->lexer.token_start_position);
ts_lexer_start(&self->lexer);
is_keyword = self->language->keyword_lex_fn(&self->lexer.data, 0);
if (is_keyword && self->lexer.token_end_position.bytes == end_byte &&
ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol))
{
symbol = self->lexer.data.result_symbol;
}
}
result = ts_subtree_new_leaf(symbol, padding, size, lookahead_bytes, parse_state, found_external_token, called_get_column,
is_keyword, self->language);
if (found_external_token)
{
// Attach the serialized scanner state to the leaf.
ts_external_scanner_state_init(&result->external_scanner_state, self->lexer.debug_buffer, external_scanner_state_len);
result->has_external_scanner_state_change = external_scanner_state_changed;
}
}
return result;
}
// Decides whether `right` should replace `left`.
//
// Returns true to pick `right`, false to keep `left`. Criteria, in order:
//  - a NULL `left` always loses; otherwise a NULL `right` always loses;
//  - the tree with the lower error cost wins;
//  - the tree with the higher dynamic precedence wins;
//  - if both still tie and carry a non-zero error cost, `right` wins;
//  - otherwise `right` wins only when ts_subtree_compare(left, right)
//    returns exactly 1.
// `self` is unused.
static bool ts_parser__select_tree(TSParser *self, t_subtree left, t_subtree right)
{
	(void)(self);
	if (!left)
		return (true);
	if (!right)
		return (false);
	if (ts_subtree_error_cost(right) != ts_subtree_error_cost(left))
		return (ts_subtree_error_cost(right) < ts_subtree_error_cost(left));
	if (ts_subtree_dynamic_precedence(right) != ts_subtree_dynamic_precedence(left))
		return (ts_subtree_dynamic_precedence(right) > ts_subtree_dynamic_precedence(left));
	if (ts_subtree_error_cost(left) > 0)
		return (true);
	return (ts_subtree_compare(left, right) == 1);
}
// Decides whether the children of `left` should be replaced by the
// alternative list `children`.
//
// The candidate list is copied into the parser's scratch array, a temporary
// node with the same symbol as `left` is built over it, and that node is
// ranked against `left` with ts_parser__select_tree().
// NOTE(review): the temporary node is not released here -- confirm that
// ts_subtree_new_node() over the scratch array makes that unnecessary.
static bool ts_parser__select_children(TSParser *self, t_subtree left, const t_vec_subtree *children)
{
	t_subtree	candidate;

	vec_subtree_copy_into(&self->scratch_trees, (void *)children);
	candidate = ts_subtree_new_node(ts_subtree_symbol(left), &self->scratch_trees, 0, self->language);
	return (ts_parser__select_tree(self, left, candidate));
}
// Pushes `lookahead` onto stack `version`, moving it to `state`.
//
// A leaf whose "extra" flag differs from `extra` is first turned into an
// owned subtree (ts_subtree_ensure_owner) and re-flagged. Subtrees with
// children are pushed as pending. If the pushed subtree contains external
// tokens, the version's last external token is updated from it.
static void ts_parser__shift(TSParser *self, t_stack_version version, TSStateId state, t_subtree lookahead, bool extra)
{
	t_subtree	pushed;
	bool		has_children;

	pushed = lookahead;
	has_children = (ts_subtree_child_count(lookahead) != 0);
	if (!has_children && ts_subtree_extra(lookahead) != extra)
	{
		pushed = ts_subtree_ensure_owner(lookahead);
		ts_subtree_set_extra(&pushed, extra);
	}
	ts_stack_push(self->stack, version, pushed, has_children, state);
	if (!ts_subtree_has_external_tokens(pushed))
		return;
	ts_stack_set_last_external_token(self->stack, version, ts_subtree_last_external_token(pushed));
}
// Performs one reduction on stack `version`: pops `count` entries and
// replaces them with a new parent node of type `symbol`.
//
// - `dynamic_precedence` is added to the new parent's dynamic precedence.
// - `production_id` is forwarded to ts_subtree_new_node().
// - `is_fragile` forces the parent to be marked fragile (this also happens
//   when the pop produced several slices or several versions existed on
//   entry).
// - `end_of_non_terminal_extra`: when set and the state does not change,
//   the parent is flagged as an extra.
//
// Returns the index of the first stack version created by this call
// (i.e. the version count on entry), or STACK_VERSION_NONE when the
// version count did not grow.
static t_stack_version ts_parser__reduce(TSParser *self, t_stack_version version, TSSymbol symbol, t_u32 count, int dynamic_precedence,
t_u16 production_id, bool is_fragile, bool end_of_non_terminal_extra)
{
t_u32 initial_version_count;
t_stack_slice_array pop;
t_u32 removed_version_count;
t_stack_slice slice;
t_stack_version slice_version;
t_stack_slice next_slice;
t_u32 i;
t_vec_subtree children;
t_subtree parent;
t_vec_subtree next_slice_children;
TSStateId state;
TSStateId next_state;
t_u32 j;
t_stack_version k;
initial_version_count = ts_stack_version_count(self->stack);
// Pop the given number of nodes from the given version
// of the parse stack. If stack versions have previously
// merged, then there may be more than one path back
// through the stack. For each path, create a new parent
// node to contain the popped children, and push it onto
// the stack in place of the children.
pop = ts_stack_pop_count(self->stack, version, count);
removed_version_count = 0;
for (i = 0; i < pop.size; i++)
{
slice = pop.contents[i];
// Versions removed earlier in this loop shift later indices down.
slice_version = slice.version - removed_version_count;
// This is where new versions are added to the parse
// stack. The versions will all be sorted and
// truncated at the end of the outer parsing loop.
// Allow the maximum version count to be temporarily
// exceeded, but only by a limited threshold.
if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW)
{
ts_stack_remove_version(self->stack, slice_version);
ts_subtree_array_delete(&slice.subtrees);
removed_version_count++;
// Also drop every following slice that belongs to the same version.
while (i + 1 < pop.size)
{
next_slice = pop.contents[i + 1];
if (next_slice.version != slice.version)
break;
ts_subtree_array_delete(&next_slice.subtrees);
i++;
}
continue;
}
// Extra tokens on top of the stack should not be
// included in this new parent node. They will be
// re-pushed onto the stack after the parent node is
// created and pushed.
children = slice.subtrees;
ts_subtree_array_remove_trailing_extras(&children, &self->trailing_extras);
parent = ts_subtree_new_node(symbol, &children, production_id, self->language);
// This pop operation may have caused multiple stack
// versions to collapse into one, because they all
// diverged from a common state. In that case,
// choose one of the arrays of trees to be the
// parent node's children, and delete the rest of
// the tree arrays.
while (i + 1 < pop.size)
{
next_slice = pop.contents[i + 1];
if (next_slice.version != slice.version)
break;
i++;
next_slice_children = next_slice.subtrees;
ts_subtree_array_remove_trailing_extras(&next_slice_children, &self->trailing_extras2);
if (ts_parser__select_children(self, (parent), &next_slice_children))
{
// The alternative wins: discard the current parent and its extras,
// and make the alternative's extras the current ones.
ts_subtree_array_clear(
/*&self->tree_pool,*/ &self->trailing_extras);
ts_subtree_release(
/*&self->tree_pool,*/ (parent));
array_swap(&self->trailing_extras, &self->trailing_extras2);
parent = ts_subtree_new_node(symbol, &next_slice_children, production_id, self->language);
}
else
{
// The current parent stays: discard the alternative.
self->trailing_extras2.len = 0;
ts_subtree_array_delete(
/*&self->tree_pool,*/ &next_slice.subtrees);
}
}
state = ts_stack_state(self->stack, slice_version);
next_state = ts_language_next_state(self->language, state, symbol);
if (end_of_non_terminal_extra && next_state == state)
{
parent->extra = true;
}
if (is_fragile || pop.size > 1 || initial_version_count > 1)
{
parent->fragile_left = true;
parent->fragile_right = true;
parent->parse_state = TS_TREE_STATE_NONE;
}
else
{
parent->parse_state = state;
}
parent->dynamic_precedence += dynamic_precedence;
// Push the parent node onto the stack, along with
// any extra tokens that were previously on top of
// the stack.
ts_stack_push(self->stack, slice_version, (parent), false, next_state);
for (j = 0; j < self->trailing_extras.len; j++)
{
ts_stack_push(self->stack, slice_version, self->trailing_extras.buffer[j], false, next_state);
}
// Try to fold this version into an earlier one (other than the version
// being reduced); a successful merge removes one version.
for (k = 0; k < slice_version; k++)
{
if (k == version)
continue;
if (ts_stack_merge(self->stack, k, slice_version))
{
removed_version_count++;
break;
}
}
}
// Return the first new stack version that was created.
return ts_stack_version_count(self->stack) > initial_version_count ? initial_version_count : STACK_VERSION_NONE;
}
// Finishes stack `version` with the end-of-file `lookahead` and records the
// resulting tree as a candidate for the parser's finished tree.
//
// The lookahead (asserted to be EOF) is pushed, then the whole version is
// popped. For every popped path, the last non-extra subtree is replaced in
// place by its own children and a new root with that subtree's symbol and
// production id is built over the whole list. The candidate replaces
// self->finished_tree when there is none yet or when
// ts_parser__select_tree() prefers it; otherwise the candidate is released.
// Finally the version of the first popped slice is removed and `version` is
// halted.
static void ts_parser__accept(TSParser *self, t_stack_version version, t_subtree lookahead)
{
t_u32 child_count;
const t_subtree *children;
t_stack_slice_array pop;
t_vec_subtree trees;
t_subtree root;
t_u32 i;
t_u32 j;
t_u32 k;
t_subtree tree;
assert(ts_subtree_is_eof(lookahead));
ts_stack_push(self->stack, version, lookahead, false, 1);
pop = ts_stack_pop_all(self->stack, version);
for (i = 0; i < pop.size; i++)
{
trees = pop.contents[i].subtrees;
root = NULL;
// Walk from the last tree down to index 0; with an unsigned j the loop
// ends when j wraps around below zero (`j + 1` becomes 0).
for (j = trees.len - 1; j + 1 > 0; j--)
{
tree = trees.buffer[j];
if (!ts_subtree_extra(tree))
{
child_count = ts_subtree_child_count(tree);
children = ts_subtree_children(tree);
// The children are about to be referenced by the new root as well.
for (k = 0; k < child_count; k++)
children[k]->ref_count++;
// Replace the one entry at index j with that tree's children.
vec_subtree_splice(&trees, vec_subtree_splice_args(j, 1, child_count, children));
root = (ts_subtree_new_node(ts_subtree_symbol(tree), &trees, tree->production_id, self->language));
ts_subtree_release(tree);
break;
}
}
self->accept_count++;
// NOTE(review): root stays NULL if every popped tree is an extra; both
// branches below are then reached with a NULL root.
if (self->finished_tree)
{
if (ts_parser__select_tree(self, self->finished_tree, root))
{
ts_subtree_release(self->finished_tree);
self->finished_tree = root;
}
else
ts_subtree_release(root);
}
else
self->finished_tree = root;
}
// NOTE(review): reads pop.contents[0] without checking pop.size.
ts_stack_remove_version(self->stack, pop.contents[0].version);
ts_stack_halt(self->stack, version);
}
// Applies every reduction that is possible from `starting_version`, and
// from the versions those reductions create.
//
// With a non-zero `lookahead_symbol` only the actions for that symbol are
// considered; with 0, the actions of every symbol from 1 up to (excluding)
// the language's token_count are considered.
//
// Returns true when at least one visited version has a shift or recover
// action that is neither an "extra" shift nor a repetition.
static bool ts_parser__do_all_potential_reductions(TSParser *self, t_stack_version starting_version, TSSymbol lookahead_symbol)
{
t_u32 initial_version_count;
bool can_shift_lookahead_symbol;
t_stack_version version;
t_u32 i;
t_u32 version_count;
bool merged;
t_stack_version j;
TSStateId state;
bool has_shift_action;
TSSymbol first_symbol;
TSSymbol end_symbol;
t_stack_version reduction_version;
ReduceAction reduce_action;
t_u32 k;
TSSymbol symbol;
TableEntry entry;
TSParseAction action;
initial_version_count = ts_stack_version_count(self->stack);
can_shift_lookahead_symbol = false;
version = starting_version;
// `i` only counts iterations; it caps the renumber-and-retry path below.
for (i = 0; true; i++)
{
version_count = ts_stack_version_count(self->stack);
if (version >= version_count)
break;
// Fold the current version into one of the versions created since entry
// when possible; after a merge the same index is examined again.
merged = false;
for (j = initial_version_count; j < version; j++)
{
if (ts_stack_merge(self->stack, j, version))
{
merged = true;
break;
}
}
if (merged)
continue;
state = ts_stack_state(self->stack, version);
has_shift_action = false;
array_clear(&self->reduce_actions);
if (lookahead_symbol != 0)
{
first_symbol = lookahead_symbol;
end_symbol = lookahead_symbol + 1;
}
else
{
first_symbol = 1;
end_symbol = self->language->token_count;
}
// Collect the distinct reductions available in this state and note whether
// a real shift is possible.
for (symbol = first_symbol; symbol < end_symbol; symbol++)
{
ts_language_table_entry(self->language, state, symbol, &entry);
for (k = 0; k < entry.action_count; k++)
{
action = entry.actions[k];
switch (action.type)
{
case TSParseActionTypeShift:
case TSParseActionTypeRecover:
if (!action.shift.extra && !action.shift.repetition)
has_shift_action = true;
break;
case TSParseActionTypeReduce:
// Reductions of zero children are ignored.
if (action.reduce.child_count > 0)
ts_reduce_action_set_add(&self->reduce_actions, (ReduceAction){
.symbol = action.reduce.symbol,
.count = action.reduce.child_count,
.dynamic_precedence = action.reduce.dynamic_precedence,
.production_id = action.reduce.production_id,
});
break;
default:
break;
}
}
}
// Apply every collected reduction (always as fragile); only the result of
// the last call is kept in reduction_version.
reduction_version = STACK_VERSION_NONE;
for (k = 0; k < self->reduce_actions.size; k++)
{
reduce_action = self->reduce_actions.contents[k];
reduction_version = ts_parser__reduce(self, version, reduce_action.symbol, reduce_action.count,
reduce_action.dynamic_precedence, reduce_action.production_id, true, false);
}
if (has_shift_action)
can_shift_lookahead_symbol = true;
else if (reduction_version != STACK_VERSION_NONE && i < MAX_VERSION_COUNT)
{
// No shift possible here: let the reduced version take this version's
// slot and examine that slot again.
ts_stack_renumber_version(self->stack, reduction_version, version);
continue;
}
else if (lookahead_symbol != 0)
ts_stack_remove_version(self->stack, version);
// After the starting version, jump to the versions that existed past the
// count sampled at the top of this iteration; then advance one by one.
if (version == starting_version)
version = version_count;
else
version++;
}
return can_shift_lookahead_symbol;
}
// Tries to bring stack `version` back to `goal_state` by popping `depth`
// entries and wrapping what was popped in an error node.
//
// For every popped path: duplicates of the previous slice's version are
// discarded; paths that do not end up in `goal_state` are halted and
// discarded. For the remaining ones, the children of an error already on
// top of the stack (if any) are prepended, trailing extras are split off,
// the rest is pushed as one error node and the extras are pushed after it.
//
// Returns true when at least one path was recovered.
static bool ts_parser__recover_to_state(TSParser *self, t_stack_version version, t_u32 depth, TSStateId goal_state)
{
t_stack_slice_array pop;
t_stack_version previous_version;
t_stack_slice slice;
t_u32 i;
t_u32 j;
t_vec_subtree error_trees;
t_subtree error_tree;
t_u32 error_child_count;
t_subtree tree;
t_subtree error;
previous_version = STACK_VERSION_NONE;
pop = ts_stack_pop_count(self->stack, version, depth);
for (i = 0; i < pop.size; i++)
{
slice = pop.contents[i];
// `array_erase(&pop, i--)` drops the current entry and steps the index
// back so the loop's i++ lands on the element that moved into its place
// (for i == 0 this relies on i being unsigned and wrapping around).
if (slice.version == previous_version)
{
ts_subtree_array_delete(&slice.subtrees);
array_erase(&pop, i--);
continue;
}
if (ts_stack_state(self->stack, slice.version) != goal_state)
{
ts_stack_halt(self->stack, slice.version);
ts_subtree_array_delete(&slice.subtrees);
array_erase(&pop, i--);
continue;
}
// If an error was already on top of the stack, its children are
// inserted in front of the popped subtrees.
error_trees = ts_stack_pop_error(self->stack, slice.version);
if (error_trees.len > 0)
{
error_tree = error_trees.buffer[0];
error_child_count = ts_subtree_child_count(error_tree);
if (error_child_count > 0)
{
vec_subtree_splice(&slice.subtrees, vec_subtree_splice_args(0, 0, error_child_count, ts_subtree_children(error_tree)));
// The spliced-in children now have one more holder.
for (j = 0; j < error_child_count; j++)
slice.subtrees.buffer[j]->ref_count++;
}
ts_subtree_array_delete(&error_trees);
}
ts_subtree_array_remove_trailing_extras(&slice.subtrees, &self->trailing_extras);
if (slice.subtrees.len > 0)
{
error = ts_subtree_new_error_node(&slice.subtrees, true, self->language);
ts_stack_push(self->stack, slice.version, error, false, goal_state);
}
else
{
// Nothing left to wrap: only the array itself has to go.
vec_subtree_free(slice.subtrees);
}
for (j = 0; j < self->trailing_extras.len; j++)
{
tree = self->trailing_extras.buffer[j];
ts_stack_push(self->stack, slice.version, tree, false, goal_state);
}
previous_version = slice.version;
}
return previous_version != STACK_VERSION_NONE;
}
// Handles `lookahead` for a stack `version` that is in the error state.
//
// Two strategies are combined (see the long comment below): going back to
// an earlier state in which the lookahead is valid, and skipping the
// lookahead by wrapping it in an error. `lookahead` is consumed in every
// path: it is either released or ends up on the stack.
static void ts_parser__recover(TSParser *self, t_stack_version version, t_subtree lookahead)
{
Length position;
bool did_recover;
bool would_merge;
t_stack_slice_array pop;
t_stack_summary *summary;
t_stack_summary_entry entry;
t_subtree parent;
t_u32 current_error_cost;
t_u32 depth;
t_u32 i;
t_u32 j;
t_u32 new_cost;
t_u32 node_count_since_error;
t_u32 previous_version_count;
t_vec_subtree children;
t_u32 n;
const TSParseAction *actions;
t_subtree error_repeat;
t_subtree mutable_lookahead;
did_recover = false;
previous_version_count = ts_stack_version_count(self->stack);
position = ts_stack_position(self->stack, version);
summary = ts_stack_get_summary(self->stack, version);
node_count_since_error = ts_stack_node_count_since_error(self->stack, version);
current_error_cost = ts_stack_error_cost(self->stack, version);
// When the parser is in the error state, there are two
// strategies for recovering with a given lookahead
// token:
// 1. Find a previous state on the stack in which that
// lookahead token would be valid. Then,
// create a new stack version that is in that state
// again. This entails popping all of the subtrees
// that have been pushed onto the stack since that
// previous state, and wrapping them in an ERROR
// node.
// 2. Wrap the lookahead token in an ERROR node, push
// that ERROR node onto the stack, and
// move on to the next lookahead token, remaining in
// the error state.
//
// First, try the strategy 1. Upon entering the error
// state, the parser recorded a summary of the previous
// parse states and their depths. Look at each state in
// the summary, to see if the current lookahead token
// would be valid in that state.
if (summary && !ts_subtree_is_error(lookahead))
{
for (i = 0; i < summary->size; i++)
{
entry = summary->contents[i];
if (entry.state == ERROR_STATE)
continue;
if (entry.position.bytes == position.bytes)
continue;
// One more entry has to be popped when nodes were pushed since the error.
depth = entry.depth;
if (node_count_since_error > 0)
depth++;
// Do not recover in ways that create redundant
// stack versions.
would_merge = false;
for (j = 0; j < previous_version_count; j++)
{
if (ts_stack_state(self->stack, j) == entry.state && ts_stack_position(self->stack, j).bytes == position.bytes)
{
would_merge = true;
break;
}
}
if (would_merge)
continue;
// Do not recover if the result would clearly be
// worse than some existing stack version.
new_cost = current_error_cost + entry.depth * ERROR_COST_PER_SKIPPED_TREE +
(position.bytes - entry.position.bytes) * ERROR_COST_PER_SKIPPED_CHAR +
(position.extent.row - entry.position.extent.row) * ERROR_COST_PER_SKIPPED_LINE;
if (ts_parser__better_version_exists(self, version, false, new_cost))
break;
// If the current lookahead token is valid in
// some previous state, recover to that state.
// Then stop looking for further recoveries.
if (ts_language_has_actions(self->language, entry.state, ts_subtree_symbol(lookahead)))
{
if (ts_parser__recover_to_state(self, version, depth, entry.state))
{
did_recover = true;
break;
}
}
}
}
// In the process of attempting to recover, some stack
// versions may have been created and subsequently
// halted. Remove those versions.
for (i = previous_version_count; i < ts_stack_version_count(self->stack); i++)
if (!ts_stack_is_active(self->stack, i))
ts_stack_remove_version(self->stack, i--);
// If strategy 1 succeeded, a new stack version will
// have been created which is able to handle the current
// lookahead token. Now, in addition, try strategy 2
// described above: skip the current lookahead token by
// wrapping it in an ERROR node.
// Don't pursue this additional strategy if there are
// already too many stack versions.
if (did_recover && ts_stack_version_count(self->stack) > MAX_VERSION_COUNT)
{
ts_stack_halt(self->stack, version);
ts_subtree_release(lookahead);
return;
}
// Likewise, after a successful strategy 1, do not also skip a lookahead
// that changed the external scanner's state.
if (did_recover && ts_subtree_has_external_scanner_state_change(lookahead))
{
ts_stack_halt(self->stack, version);
ts_subtree_release(lookahead);
return;
}
// If the parser is still in the error state at the end
// of the file, just wrap everything in an ERROR node
// and terminate.
if (ts_subtree_is_eof(lookahead))
{
children = vec_subtree_new(16, NULL);
parent = ts_subtree_new_error_node(&children, false, self->language);
ts_stack_push(self->stack, version, parent, false, 1);
ts_parser__accept(self, version, lookahead);
return;
}
// Do not recover if the result would clearly be worse
// than some existing stack version.
new_cost = current_error_cost + ERROR_COST_PER_SKIPPED_TREE + ts_subtree_total_bytes(lookahead) * ERROR_COST_PER_SKIPPED_CHAR +
ts_subtree_total_size(lookahead).extent.row * ERROR_COST_PER_SKIPPED_LINE;
if (ts_parser__better_version_exists(self, version, false, new_cost))
{
ts_stack_halt(self->stack, version);
ts_subtree_release(lookahead);
return;
}
// If the current lookahead token is an extra token,
// mark it as extra. This means it won't be counted in
// error cost calculations.
// The lookup uses parse state 1; only the last listed action is inspected.
actions = ts_language_actions(self->language, 1, ts_subtree_symbol(lookahead), &n);
if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && actions[n - 1].shift.extra)
{
mutable_lookahead = ts_subtree_ensure_owner(lookahead);
ts_subtree_set_extra(&mutable_lookahead, true);
lookahead = (mutable_lookahead);
}
// Wrap the lookahead token in an ERROR.
children = vec_subtree_new(1, NULL);
vec_subtree_push(&children, lookahead);
error_repeat = ts_subtree_new_node(ts_builtin_sym_error_repeat, &children, 0, self->language);
// If other tokens have already been skipped, so there
// is already an ERROR at the top of the stack, then pop
// that ERROR off the stack and wrap the two ERRORs
// together into one larger ERROR.
if (node_count_since_error > 0)
{
pop = ts_stack_pop_count(self->stack, version, 1);
// TODO: Figure out how to make this condition
// occur. See
// https://github.com/atom/atom/issues/18450#issuecomment-439579778
// If multiple stack versions have merged at this
// point, just pick one of the errors arbitrarily
// and discard the rest.
if (pop.size > 1)
{
for (i = 1; i < pop.size; i++)
ts_subtree_array_delete(
/*&self->tree_pool,*/ &pop.contents[i].subtrees);
while (ts_stack_version_count(self->stack) > pop.contents[0].version + 1)
ts_stack_remove_version(self->stack, pop.contents[0].version + 1);
}
// The popped version takes over the slot of `version`.
ts_stack_renumber_version(self->stack, pop.contents[0].version, version);
vec_subtree_push(&pop.contents[0].subtrees, (error_repeat));
error_repeat = ts_subtree_new_node(ts_builtin_sym_error_repeat, &pop.contents[0].subtrees, 0, self->language);
}
// Push the new ERROR onto the stack.
ts_stack_push(self->stack, version, (error_repeat), false, ERROR_STATE);
if (ts_subtree_has_external_tokens(lookahead))
ts_stack_set_last_external_token(self->stack, version, ts_subtree_last_external_token(lookahead));
}
/*
 * Begin error recovery for stack `version`, whose `lookahead` token could not
 * be handled in its current state.
 *
 * Steps, in order:
 *   1. Run every reduction possible in the current state (lookahead symbol 0),
 *      which may create additional stack versions.
 *   2. For `version` and each newly created version, try once to insert a
 *      MISSING token that would make `lookahead` reducible, then push a NULL
 *      subtree with ERROR_STATE onto that version.
 *   3. Merge the versions created by step 1 back into `version`.
 *   4. Record a stack summary and call ts_parser__recover with `lookahead`.
 */
static void ts_parser__handle_error(TSParser *self, t_stack_version version, t_subtree lookahead)
{
	t_u32           count_before;
	t_u32           count_after;
	t_u32           merge_index;
	Length          start;
	Length          pad;
	bool            inserted;
	t_stack_version cur;
	t_stack_version with_missing;
	TSStateId       cur_state;
	TSStateId       next_state;
	TSSymbol        candidate;
	t_u32           ahead_bytes;
	t_subtree       missing_leaf;

	count_before = ts_stack_version_count(self->stack);
	// Reductions that do not depend on the lookahead come first: after
	// skipping invalid tokens, a token may show up that would have allowed
	// one of them.
	ts_parser__do_all_potential_reductions(self, version, 0);
	count_after = ts_stack_version_count(self->stack);
	start = ts_stack_position(self->stack, version);
	inserted = false;
	cur = version;
	while (cur < count_after)
	{
		if (!inserted)
		{
			cur_state = ts_stack_state(self->stack, cur);
			for (candidate = 1; candidate < (t_u16)self->language->token_count; candidate++)
			{
				next_state = ts_language_next_state(self->language, cur_state, candidate);
				// Only consider symbols that move to a different, non-zero
				// state from which the lookahead triggers a reduction.
				if (next_state != 0 && next_state != cur_state &&
					ts_language_has_reduce_action(self->language, next_state, ts_subtree_leaf_symbol(lookahead)))
				{
					// Reset the lexer to the stack position and mark the
					// token end there; the distance between the two becomes
					// the missing token's padding (non-zero when the lexer
					// snaps forward to the next included range).
					ts_lexer_reset(&self->lexer, start);
					ts_lexer__mark_end((void *)&self->lexer);
					pad = length_sub(self->lexer.token_end_position, start);
					ahead_bytes = ts_subtree_total_bytes(lookahead) + ts_subtree_lookahead_bytes(lookahead);
					with_missing = ts_stack_copy_version(self->stack, cur);
					missing_leaf = ts_subtree_new_missing_leaf(candidate, pad, ahead_bytes, self->language);
					ts_stack_push(self->stack, with_missing, missing_leaf, false, next_state);
					if (ts_parser__do_all_potential_reductions(self, with_missing, ts_subtree_leaf_symbol(lookahead)))
					{
						inserted = true;
						break;
					}
				}
			}
		}
		// Push the discontinuity marker onto this version.
		ts_stack_push(self->stack, cur, NULL, false, ERROR_STATE);
		// After the original version, jump to the first version created by
		// the reductions; afterwards walk them one by one.
		if (cur == version)
			cur = count_before;
		else
			cur = cur + 1;
	}
	// Fold every version created by the reductions back into `version`.
	// The merged slot is always `count_before`.
	merge_index = count_before;
	while (merge_index < count_after)
	{
		ts_stack_merge(self->stack, version, count_before);
		merge_index++;
	}
	ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH);
	// Recover immediately with the current lookahead instead of waiting for
	// the next turn of the parse loop, so the tree accounts for the token's
	// "lookahead bytes".
	ts_parser__recover(self, version, lookahead);
}
/*
 * Advance stack `version` by processing one lookahead token.
 *
 * A token is lexed for the version's current state and the parse-table entry
 * for it is applied. Reductions are performed and the loop restarts with the
 * same lookahead; the function returns once the token has been shifted,
 * accepted, handed to error recovery, or the version has been halted or
 * paused.
 *
 * `allow_node_reuse` is accepted for interface compatibility but is unused
 * (explicitly voided below).
 *
 * Returns false only when `self->has_scanner_error` is set right after
 * lexing; every other exit path returns true.
 */
static bool ts_parser__advance(TSParser *self, t_stack_version version, bool allow_node_reuse)
{
TSStateId state;
t_subtree mutable_lookahead;
t_subtree lookahead;
TableEntry table_entry;
bool needs_lex;
t_u32 i;
t_stack_version last_reduction_version;
TSParseAction action;
TSStateId next_state;
bool is_fragile;
bool end_of_non_terminal_extra;
t_stack_version reduction_version;
(void)(allow_node_reuse);
lookahead = NULL;
table_entry = (TableEntry){.action_count = 0};
state = ts_stack_state(self->stack, version);
needs_lex = true;
for (;;)
{
// Run the lexer whenever a fresh lookahead is needed
// for the current state (first pass, after a
// non-terminal extra reduction, or after a breakdown).
if (needs_lex)
{
needs_lex = false;
lookahead = ts_parser__lex(self, version, state);
if (self->has_scanner_error)
return false;
if (lookahead)
{
ts_language_table_entry(self->language, state, ts_subtree_symbol(lookahead), &table_entry);
}
// When parsing a non-terminal extra, a null
// lookahead indicates the end of the rule. The
// reduction is stored in the EOF table entry.
// After the reduction, the lexer needs to be
// run again.
else
{
ts_language_table_entry(self->language, state, ts_builtin_sym_end, &table_entry);
}
}
// Process each parse action for the current
// lookahead token in the current state. If there
// are multiple actions, then this is an ambiguous
// state. REDUCE actions always create a new stack
// version, whereas SHIFT actions update the
// existing stack version and terminate this loop.
last_reduction_version = STACK_VERSION_NONE;
for (i = 0; i < table_entry.action_count; i++)
{
action = table_entry.actions[i];
switch (action.type)
{
case TSParseActionTypeShift: {
// Shift entries flagged as `repetition` are skipped.
if (action.shift.repetition)
break;
// An extra token leaves the parse state unchanged.
if (action.shift.extra)
{
next_state = state;
}
else
{
next_state = action.shift.state;
}
// If the lookahead has children, the target state is
// looked up from the language by the lookahead's symbol
// instead of taken from the action.
if (ts_subtree_child_count(lookahead) > 0)
{
next_state = ts_language_next_state(self->language, state, ts_subtree_symbol(lookahead));
}
ts_parser__shift(self, version, next_state, lookahead, action.shift.extra);
return true;
}
case TSParseActionTypeReduce: {
// A reduction is fragile when it is one of several
// actions for this table entry.
is_fragile = table_entry.action_count > 1;
end_of_non_terminal_extra = lookahead == NULL;
reduction_version =
ts_parser__reduce(self, version, action.reduce.symbol, action.reduce.child_count, action.reduce.dynamic_precedence,
action.reduce.production_id, is_fragile, end_of_non_terminal_extra);
// Remember the most recent version produced by a
// reduction; it replaces `version` after the action loop.
if (reduction_version != STACK_VERSION_NONE)
{
last_reduction_version = reduction_version;
}
break;
}
case TSParseActionTypeAccept: {
ts_parser__accept(self, version, lookahead);
return true;
}
case TSParseActionTypeRecover: {
ts_parser__recover(self, version, lookahead);
return true;
}
}
}
// If a reduction was performed, then replace the
// current stack version with one of the stack
// versions created by a reduction, and continue
// processing this version of the stack with the
// same lookahead symbol.
if (last_reduction_version != STACK_VERSION_NONE)
{
ts_stack_renumber_version(self->stack, last_reduction_version, version);
state = ts_stack_state(self->stack, version);
// At the end of a non-terminal extra rule, the
// lexer will return a null subtree, because the
// parser needs to perform a fixed reduction
// regardless of the lookahead node. After
// performing that reduction, (and completing
// the non-terminal extra rule) run the lexer
// again based on the current parse state.
if (!lookahead)
needs_lex = true;
else
ts_language_table_entry(self->language, state, ts_subtree_leaf_symbol(lookahead), &table_entry);
continue;
}
// A non-terminal extra rule was reduced and merged
// into an existing stack version. This version can
// be discarded.
if (!lookahead)
{
ts_stack_halt(self->stack, version);
return true;
}
// If there were no parse actions for the current
// lookahead token, then it is not valid in this
// state. If the current lookahead token is a
// keyword, then switch to treating it as the normal
// word token if that token is valid in this state.
if (ts_subtree_is_keyword(lookahead) && ts_subtree_symbol(lookahead) != self->language->keyword_capture_token)
{
ts_language_table_entry(self->language, state, self->language->keyword_capture_token, &table_entry);
if (table_entry.action_count > 0)
{
// Re-label the token through an owned copy, then retry
// the action loop with the table entry fetched above.
mutable_lookahead = ts_subtree_ensure_owner(lookahead);
ts_subtree_set_symbol(&mutable_lookahead, self->language->keyword_capture_token, self->language);
lookahead = mutable_lookahead;
continue;
}
}
// If the current lookahead token is not valid and
// the parser is already in the error state, restart
// the error recovery process.
// TODO - can this be unified with the other
// `RECOVER` case above?
if (state == ERROR_STATE)
{
ts_parser__recover(self, version, lookahead);
return true;
}
// If the current lookahead token is not valid and
// the previous subtree on the stack was reused from
// an old tree, it isn't actually valid to reuse it.
// Remove it from the stack, and in its place, push
// each of its children. Then try again to process
// the current lookahead.
if (ts_parser__breakdown_top_of_stack(self, version))
{
state = ts_stack_state(self->stack, version);
// The current lookahead is released here; a new one is
// lexed on the next iteration.
ts_subtree_release(
/*&self->tree_pool,*/ lookahead);
needs_lex = true;
continue;
}
// At this point, the current lookahead token is
// definitely not valid for this parse stack
// version. Mark this version as paused and continue
// processing any other stack versions that might
// exist. If some other version advances
// successfully, then this version can simply be
// removed. But if all versions end up paused, then
// error recovery is needed.
ts_stack_pause(self->stack, version, lookahead);
return true;
}
}
/*
 * Prune, merge and reorder the stack versions after a round of advancing.
 *
 * 1. Removes halted versions and compares every remaining pair of versions,
 *    removing, merging or swapping them according to
 *    ts_parser__compare_versions.
 * 2. Drops versions beyond MAX_VERSION_COUNT.
 * 3. If the first version encountered is paused (no unpaused version before
 *    it) and `accept_count` is below MAX_VERSION_COUNT, resumes it and starts
 *    error handling on it; every other paused version is removed.
 *
 * Returns the smallest error cost among versions that are not in error
 * (UINT_MAX if there is none), or, when a paused version was resumed, the
 * error cost of that version as read just before resuming it.
 */
static t_u32 ts_parser__condense_stack(TSParser *self)
{
t_error_status status_i;
t_error_status status_j;
t_stack_version i;
t_stack_version j;
t_stack_version n;
t_subtree lookahead;
t_u32 min_error_cost;
bool has_unpaused_version;
min_error_cost = UINT_MAX;
for (i = 0; i < ts_stack_version_count(self->stack); i++)
{
// Prune any versions that have been marked for
// removal.
if (ts_stack_is_halted(self->stack, i))
{
ts_stack_remove_version(self->stack, i);
// Step back so that the loop's i++ revisits the same
// index, which now holds the following version.
i--;
continue;
}
// Keep track of the minimum error cost of any stack
// version so that it can be returned.
status_i = ts_parser__version_status(self, i);
if (!status_i.is_in_error && status_i.cost < min_error_cost)
{
min_error_cost = status_i.cost;
}
// Examine each pair of stack versions, removing any
// versions that are clearly worse than another
// version. Ensure that the versions are ordered
// from most promising to least promising.
for (j = 0; j < i; j++)
{
status_j = ts_parser__version_status(self, j);
switch (ts_parser__compare_versions(self, status_j, status_i))
{
// Version j wins outright: drop version i. Setting j = i
// ends the inner loop after its j++.
case ECTakeLeft:
ts_stack_remove_version(self->stack, i);
i--;
j = i;
break;
// No clear winner, or j only preferred: try to merge i
// into j. On success version i is gone, so the inner loop
// is ended the same way as above.
case ECPreferLeft:
case ECNone:
if (ts_stack_merge(self->stack, j, i))
{
i--;
j = i;
}
break;
// Version i is preferred: merge if possible, otherwise
// swap the two so the better one comes first.
case ECPreferRight:
if (ts_stack_merge(self->stack, j, i))
{
i--;
j = i;
}
else
{
ts_stack_swap_versions(self->stack, i, j);
}
break;
// Version i wins outright: drop version j. Both indices
// step back because a version before i was removed.
case ECTakeRight:
ts_stack_remove_version(self->stack, j);
i--;
j--;
break;
}
}
}
// Enforce a hard upper bound on the number of stack
// versions by discarding the least promising versions.
while (ts_stack_version_count(self->stack) > MAX_VERSION_COUNT)
ts_stack_remove_version(self->stack, MAX_VERSION_COUNT);
// If the best-performing stack version is currently
// paused, or all versions are paused, then resume the
// best paused version and begin the error recovery
// process. Otherwise, remove the paused versions.
if (ts_stack_version_count(self->stack) > 0)
{
has_unpaused_version = false;
for (i = 0, n = ts_stack_version_count(self->stack); i < n; i++)
{
if (ts_stack_is_paused(self->stack, i))
{
if (!has_unpaused_version && self->accept_count < MAX_VERSION_COUNT)
{
// The returned cost becomes that of the version being
// resumed, read before error handling runs on it.
min_error_cost = ts_stack_error_cost(self->stack, i);
lookahead = ts_stack_resume(self->stack, i);
ts_parser__handle_error(self, i, lookahead);
has_unpaused_version = true;
}
else
{
// Removing shifts later versions down by one, so both the
// index and the cached count are decremented.
ts_stack_remove_version(self->stack, i);
i--;
n--;
}
}
else
{
has_unpaused_version = true;
}
}
}
return min_error_cost;
}
/*
 * Report whether the parser still holds state from an earlier parse.
 *
 * Returns true when any of the following holds, checked in this order:
 *   - an external scanner payload is present,
 *   - stack version 0 is not in state 1,
 *   - stack version 0 has a non-zero node count since the last error.
 */
static bool ts_parser_has_outstanding_parse(TSParser *self)
{
	if (self->external_scanner_payload)
		return true;
	if (ts_stack_state(self->stack, 0) != 1)
		return true;
	return ts_stack_node_count_since_error(self->stack, 0) != 0;
}
// Parser - Public
TSParser *ts_parser_new(void)
{
TSParser *self;
self = mem_alloc(sizeof(*self));
ts_lexer_init(&self->lexer);
array_init(&self->reduce_actions);
array_reserve(&self->reduce_actions, 4);
self->stack = ts_stack_new();
self->finished_tree = NULL;
self->language = NULL;
self->has_scanner_error = false;
self->external_scanner_payload = NULL;
self->operation_count = 0;
return self;
}
/*
 * Destroy a parser created by ts_parser_new. Passing NULL is a no-op.
 *
 * The language is cleared first (which also resets the parser), then the
 * stack, the reduce-action array, the three subtree vectors and finally the
 * parser itself are released.
 */
void ts_parser_delete(TSParser *self)
{
	if (self != NULL)
	{
		// Must run before the stack is deleted: clearing the language
		// resets the parser, and the reset clears the stack.
		ts_parser_set_language(self, NULL);
		ts_stack_delete(self->stack);
		if (self->reduce_actions.contents != NULL)
		{
			array_delete(&self->reduce_actions);
		}
		array_delete(&self->trailing_extras);
		array_delete(&self->trailing_extras2);
		array_delete(&self->scratch_trees);
		mem_free(self);
	}
}
/*
 * Return the language currently assigned to the parser (NULL if none has
 * been set). `self` must not be NULL.
 */
const TSLanguage *ts_parser_language(const TSParser *self)
{
	const TSLanguage *current;

	current = self->language;
	return current;
}
/*
 * Assign `language` (which may be NULL) to the parser.
 *
 * The parser is reset before the new language is stored, so the reset still
 * sees the previous language. Always returns true.
 */
bool ts_parser_set_language(TSParser *self, const TSLanguage *language)
{
	// Reset first, while self->language is still the old one.
	ts_parser_reset(self);
	self->language = language;
	return (true);
}
/*
 * Return the parser to its pre-parse state while keeping its language.
 *
 * Destroys the external scanner, rewinds the lexer to position zero, clears
 * the stack, releases any finished tree still held, and clears the accept
 * counter and the scanner-error flag.
 */
void ts_parser_reset(TSParser *self)
{
	ts_parser__external_scanner_destroy(self);
	ts_lexer_reset(&self->lexer, length_zero());
	ts_stack_clear(self->stack);
	if (self->finished_tree != NULL)
	{
		ts_subtree_release(self->finished_tree);
		self->finished_tree = NULL;
	}
	self->has_scanner_error = false;
	self->accept_count = 0;
}
/*
 * Parse the text supplied by `input` and return the resulting tree.
 *
 * Returns NULL when no language is set, when `input.read` is NULL, or when
 * the external scanner reports an error (the parser is reset in that case).
 * Aborts through me_abort if the parse loop ends without a finished tree.
 * On success the parser is reset and the returned tree is built from the
 * finished subtree.
 */
TSTree *ts_parser_parse(TSParser *self, TSInput input)
{
	TSTree         *tree;
	t_u32           cur_pos;
	t_u32           prev_pos;
	t_u32           live_versions;
	t_u32           best_cost;
	t_stack_version v;

	if (self->language == NULL || input.read == NULL)
		return NULL;
	ts_lexer_set_input(&self->lexer, input);
	// Only create the external scanner when nothing is left over from a
	// previous parse.
	if (!ts_parser_has_outstanding_parse(self))
	{
		ts_parser__external_scanner_create(self);
		if (self->has_scanner_error)
		{
			ts_parser_reset(self);
			return NULL;
		}
	}
	self->operation_count = 0;
	prev_pos = 0;
	for (;;)
	{
		// The version count is re-read before every version because
		// advancing may add or remove versions.
		v = 0;
		live_versions = ts_stack_version_count(self->stack);
		while (v < live_versions)
		{
			while (ts_stack_is_active(self->stack, v))
			{
				if (!ts_parser__advance(self, v, live_versions == 1))
				{
					if (self->has_scanner_error)
						ts_parser_reset(self);
					return NULL;
				}
				// Stop advancing this version once it has moved past the
				// furthest position so far, or (for versions other than 0)
				// has caught up with it.
				cur_pos = ts_stack_position(self->stack, v).bytes;
				if (cur_pos > prev_pos || (v > 0 && cur_pos == prev_pos))
				{
					prev_pos = cur_pos;
					break;
				}
			}
			v++;
			live_versions = ts_stack_version_count(self->stack);
		}
		// Re-sort the versions by cost and drop the ones no longer worth
		// pursuing.
		best_cost = ts_parser__condense_stack(self);
		// A finished tree cheaper than every in-progress version ends the
		// parse. Clearing the stack drops its references to subtrees of
		// the finished tree so they can be rebalanced in place.
		if (self->finished_tree && ts_subtree_error_cost(self->finished_tree) < best_cost)
		{
			ts_stack_clear(self->stack);
			break;
		}
		// `live_versions` is the count seen when the version loop ended,
		// i.e. before condensing.
		if (live_versions == 0)
			break;
	}
	if (self->finished_tree == NULL)
		me_abort("self->finished_tree == NULL");
	ts_subtree_balance(self->finished_tree, self->language);
	tree = ts_tree_new(self->finished_tree, self->language);
	// Ownership moved into the tree; keep the reset from releasing it.
	self->finished_tree = NULL;
	ts_parser_reset(self);
	return tree;
}
/*
 * Parse an in-memory buffer of `length` bytes starting at `string`.
 *
 * Wraps the buffer in a t_string_input read through ts_string_input_read
 * and forwards to ts_parser_parse; the return value is whatever that call
 * returns.
 */
TSTree *ts_parser_parse_string(TSParser *self, t_const_str string, t_u32 length)
{
	t_string_input source;
	TSInput        input;

	source = (t_string_input){(const t_u8 *)string, length};
	input = (TSInput){
		&source,
		ts_string_input_read,
	};
	return ts_parser_parse(self, input);
}