started to work again on the parser

This commit is contained in:
Maieul BOYER 2024-05-30 15:38:35 +02:00
parent ffc7a2d0fc
commit f5e048d02e
No known key found for this signature in database
5 changed files with 1293 additions and 1192 deletions

View file

@ -1,4 +1,5 @@
#include "./api.h"
#include "./structs.h"
uint32_t ts_node_end_byte(t_parse_node self);
t_parse_node ts_node_parent(t_parse_node self);
@ -146,14 +147,6 @@ void ts_range_array_get_changed_ranges(const t_parse_range *old_ranges,
}
}
// t_iterator - state created by iterator_new() from a tree cursor, a subtree
// and a language, then passed to iterator_advance() and iterator_compare().
// NOTE(review): `visible_depth` is presumably a depth counted in visible nodes
// only, and `in_padding` a flag for "currently inside a node's padding" -
// confirm against iterator_advance().
typedef struct s_iterator
{
t_tree_cursor cursor;
const t_language *language;
unsigned visible_depth;
bool in_padding;
} t_iterator;
static t_iterator iterator_new(t_tree_cursor *cursor, const t_subtree *tree,
const t_language *language)
{
@ -387,13 +380,6 @@ static void iterator_advance(t_iterator *self)
}
}
// t_iterator_comparison - result type of iterator_compare(), which receives
// an old and a new t_iterator.
// NOTE(review): the enumerators presumably mean "definitely different",
// "cannot be decided at this level" and "identical" - confirm at the call
// sites of iterator_compare().
typedef enum e_iterator_comparison
{
IteratorDiffers,
IteratorMayDiffer,
IteratorMatches,
} t_iterator_comparison;
static t_iterator_comparison iterator_compare(const t_iterator *old_iter,
const t_iterator *new_iter)
{
@ -919,9 +905,6 @@ uint32_t ascii_decode(const uint8_t *chunk, uint32_t size, int32_t *codepoint)
return (1);
}
// UnicodeDecodeFunction - pointer to a decoder with the same signature as
// ascii_decode(): it receives a chunk of bytes and its size, writes the
// decoded value through `codepoint`, and returns a uint32_t that
// ts_lexer__get_lookahead() stores in `lookahead_size`.
typedef uint32_t (*UnicodeDecodeFunction)(const uint8_t *chunk, uint32_t size,
int32_t *codepoint);
// Decode the next unicode character in the current chunk of source code.
// This assumes that the lexer has already retrieved a chunk of source
// code that spans the current position.
@ -939,7 +922,7 @@ static void ts_lexer__get_lookahead(t_lexer *self)
}
const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
UnicodeDecodeFunction decode = ascii_decode;
t_unicode_decode_function decode = ascii_decode;
self->lookahead_size = decode(chunk, size, &self->data.lookahead);
@ -1326,16 +1309,6 @@ t_parse_range *ts_lexer_included_ranges(const t_lexer *self, uint32_t *count)
#undef LOG
// t_node_child_iterator - iteration state over the children of `parent`
// inside `tree`.
// NOTE(review): presumably `child_index` counts every child while
// `structural_child_index` skips extras, and `alias_sequence` points at the
// alias table of the parent's production - confirm against the functions that
// advance this iterator.
typedef struct s_node_child_iterator
{
t_subtree parent;
const t_first_tree *tree;
t_length position;
uint32_t child_index;
uint32_t structural_child_index;
const t_symbol *alias_sequence;
} t_node_child_iterator;
// t_parse_node - constructors
t_parse_node ts_node_new(const t_first_tree *tree, const t_subtree *subtree,
@ -2269,61 +2242,6 @@ static const unsigned MAX_SUMMARY_DEPTH = 16;
static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE;
static const unsigned OP_COUNT_PER_TIMEOUT_CHECK = 100;
// t_token_cache - a token subtree stored together with an external token and
// a byte index; embedded in struct s_first_parser as `token_cache`.
// NOTE(review): presumably lets the parser reuse the last lexed token when it
// is asked again for the same `byte_index` with the same
// `last_external_token` - confirm against the lexing code.
typedef struct s_token_cache
{
t_subtree token;
t_subtree last_external_token;
uint32_t byte_index;
} t_token_cache;
// struct s_first_parser - the parser's state: its lexer, parse stack, subtree
// pool, language, scratch arrays and bookkeeping counters.
// NOTE(review): presumably the struct behind the `t_first_parser` handle used
// by ts_parser_parse_string_encoding() - confirm where that typedef lives.
// `cancellation_flag` is a pointer to `const volatile size_t`, so the parser
// can only read the flag through it.
// NOTE(review): `end_clock` / `timeout_duration` look like the timeout
// bookkeeping checked every OP_COUNT_PER_TIMEOUT_CHECK operations - confirm.
struct s_first_parser
{
t_lexer lexer;
t_stack *stack;
t_subtree_pool tree_pool;
const t_language *language;
t_reduce_action_set reduce_actions;
t_subtree finished_tree;
t_subtree_array trailing_extras;
t_subtree_array trailing_extras2;
t_subtree_array scratch_trees;
t_token_cache token_cache;
t_reusable_node reusable_node;
void *external_scanner_payload;
t_parser_clock end_clock;
t_parser_duration timeout_duration;
unsigned accept_count;
unsigned operation_count;
const volatile size_t *cancellation_flag;
t_subtree old_tree;
t_range_array included_range_differences;
unsigned included_range_difference_index;
bool has_scanner_error;
};
// t_error_status - a group of error-related metrics: an error cost, a node
// count, a dynamic precedence and an "is in error" flag.
// NOTE(review): presumably a snapshot of one stack version that is ranked
// against another to produce a t_error_comparaison - confirm at the
// comparison function.
typedef struct s_error_status
{
unsigned cost;
unsigned node_count;
int dynamic_precedence;
bool is_in_error;
} t_error_status;
// t_error_comparaison - five-way outcome of comparing a "left" and a "right"
// candidate, ordered from "take left" through "none" to "take right".
// NOTE(review): "comparaison" is a misspelling of "comparison" (the
// enumerators use the correct spelling); renaming the type requires updating
// every user, so it is left untouched here.
// NOTE(review): "Take" presumably means the other side is discarded and
// "Prefer" that both are kept but ordered - confirm at the call sites.
typedef enum e_error_comparaison
{
ErrorComparisonTakeLeft,
ErrorComparisonPreferLeft,
ErrorComparisonNone,
ErrorComparisonPreferRight,
ErrorComparisonTakeRight,
} t_error_comparaison;
// t_string_input - a string pointer paired with an explicit 32-bit length.
// NOTE(review): presumably the object passed as `_self` to
// ts_string_input_read(), with `length` counted in bytes - confirm in that
// function.
typedef struct s_string_input
{
const char *string;
uint32_t length;
} t_string_input;
// StringInput
static const char *ts_string_input_read(void *_self, uint32_t byte,
@ -4380,329 +4298,6 @@ t_first_tree *ts_parser_parse_string_encoding(t_first_parser *self,
});
}
/*
* t_stream - A sequence of unicode characters derived from a UTF8 string.
* This struct is used in parsing queries from S-expressions.
*
* NOTE(review): presumably `start`/`end` delimit the whole string, `input`
* is the current read position, and `next`/`next_size` hold the code point
* decoded at `input` and its length in bytes - confirm against the stream
* helper functions.
*/
typedef struct s_stream
{
const char *input;
const char *start;
const char *end;
int32_t next;
uint8_t next_size;
} t_stream;
/*
* t_query_step - A step in the process of matching a query. Each node within
* a query S-expression corresponds to one of these steps. An entire pattern
* is represented as a sequence of these steps. The basic properties of a
* node are represented by these fields:
* - `symbol` - The grammar symbol to match. A zero value represents the
* wildcard symbol, '_'.
* - `field` - The field name to match. A zero value means that a field name
* was not specified.
* - `capture_ids` - An array of integers representing the names of captures
* associated with this node in the pattern, terminated by a `NONE` value.
* - `depth` - The depth where this node occurs in the pattern. The root node
* of the pattern has depth zero.
* - `negated_field_list_id` - An id representing a set of fields that must
* not be present on a node matching this step.
*
* Steps have some additional fields in order to handle the `.` (or "anchor")
* operator, which forbids additional child nodes:
* - `is_immediate` - Indicates that the node matching this step cannot be
* preceded by other sibling nodes that weren't specified in the pattern.
* - `is_last_child` - Indicates that the node matching this step cannot have
* any subsequent named siblings.
*
* For simple patterns, steps are matched in sequential order. But in order to
* handle alternative/repeated/optional sub-patterns, query steps are not always
* structured as a linear sequence; they sometimes need to split and merge. This
* is done using the following fields:
* - `alternative_index` - The index of a different query step that serves as
* an alternative to this step. A `NONE` value represents no alternative.
* When a query state reaches a step with an alternative index, the state
* is duplicated, with one copy remaining at the original step, and one copy
* moving to the alternative step. The alternative may have its own
* alternative step, so this splitting is an iterative process.
* - `is_dead_end` - Indicates that this step cannot be passed directly, and
* exists only in order to redirect to an alternative index, with no
* splitting.
* - `is_pass_through` - Indicates that this step has no matching logic of its
* own, and exists only to split a state. One copy of the state advances
* immediately to the next step, and one moves to the alternative step.
* - `alternative_is_immediate` - Indicates that this step's alternative step
* should be treated as if `is_immediate` is true.
*
* Steps also store some derived state that summarizes how they relate to other
* steps within the same pattern. This is used to optimize the matching process:
* - `contains_captures` - Indicates that this step or one of its child steps
* has a non-empty `capture_ids` list.
* - `parent_pattern_guaranteed` - Indicates that if this step is reached, then
* it and all of its subsequent sibling steps within the same parent pattern
* are guaranteed to match.
* - `root_pattern_guaranteed` - Similar to `parent_pattern_guaranteed`, but
* for the entire top-level pattern. When iterating through a query's
* captures using `ts_query_cursor_next_capture`, this field is used to
* detect that a capture can safely be returned from a match that has not
* even completed yet.
*
* NOTE(review): `supertype_symbol` and `is_named` are not described above.
* Presumably `supertype_symbol` is the supertype a matching node must belong
* to and `is_named` restricts the match to named nodes - confirm against the
* query parser.
*/
typedef struct s_query_step
{
t_symbol symbol;
t_symbol supertype_symbol;
t_field_id field;
uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT];
uint16_t depth;
uint16_t alternative_index;
uint16_t negated_field_list_id;
bool is_named : 1;
bool is_immediate : 1;
bool is_last_child : 1;
bool is_pass_through : 1;
bool is_dead_end : 1;
bool alternative_is_immediate : 1;
bool contains_captures : 1;
bool root_pattern_guaranteed : 1;
bool parent_pattern_guaranteed : 1;
} t_query_step;
/*
* t_slice - A slice of an external array. Within a query, capture names,
* literal string values, and predicate step information are stored in three
* contiguous arrays. Individual captures, string values, and predicates are
* represented as slices of these three arrays.
*
* NOTE(review): `offset` and `length` are presumably expressed in elements of
* the array being sliced (bytes for a char array) - confirm at the users.
*/
typedef struct s_slice
{
uint32_t offset;
uint32_t length;
} t_slice;
/*
* t_symbol_table - a two-way mapping of strings to ids.
*
* NOTE(review): presumably every string is stored back to back in
* `characters`, each entry of `slices` delimits one of them, and the index of
* the slice is the string's id - confirm against the symbol table helpers.
*/
typedef struct s_symbol_table
{
Array(char) characters;
Array(t_slice) slices;
} t_symbol_table;
/**
* t_capture_quantifiers - a data structure holding the quantifiers of pattern
* captures, stored as an array with one byte per entry.
*/
typedef Array(uint8_t) t_capture_quantifiers;
/*
* t_pattern_entry - Information about the starting point for matching a
* particular pattern. These entries are stored in a 'pattern map' - a sorted
* array that makes it possible to efficiently look up patterns based on the
* symbol for their first step. The entry consists of the following fields:
* - `pattern_index` - the index of the pattern within the query
* - `step_index` - the index of the pattern's first step in the shared `steps`
* array
* - `is_rooted` - whether or not the pattern has a single root node. This
* property affects decisions about whether or not to start the pattern for
* nodes outside of a `t_query_cursor`'s range restriction.
*/
typedef struct s_pattern_entry
{
uint16_t step_index;
uint16_t pattern_index;
bool is_rooted;
} t_pattern_entry;
// t_query_pattern - per-pattern record stored in the `patterns` array of
// struct s_parse_query. `steps` and `predicate_steps` are slices.
// NOTE(review): the slices presumably index the query's `steps` and
// `predicate_steps` arrays, and `start_byte` is presumably the pattern's byte
// offset in the query source string - confirm against the query parser.
typedef struct s_query_pattern
{
t_slice steps;
t_slice predicate_steps;
uint32_t start_byte;
bool is_non_local;
} t_query_pattern;
// t_step_offset - pairs a step index with a byte offset; stored in the
// `step_offsets` array of struct s_parse_query.
// NOTE(review): presumably maps a query step back to its position in the
// query source text - confirm against the code that fills `step_offsets`.
typedef struct s_step_offset
{
uint32_t byte_offset;
uint16_t step_index;
} t_step_offset;
/*
* t_query_state - The state of an in-progress match of a particular pattern
* in a query. While executing, a `t_query_cursor` must keep track of a number
* of possible in-progress matches. Each of those possible matches is
* represented as one of these states. Fields:
* - `id` - A numeric id that is exposed to the public API. This allows the
* caller to remove a given match, preventing any more of its captures
* from being returned.
* - `start_depth` - The depth in the tree where the first step of the state's
* pattern was matched.
* - `pattern_index` - The pattern that the state is matching.
* - `consumed_capture_count` - The number of captures from this match that
* have already been returned. Stored in a 12-bit field, so it cannot
* exceed 4095.
* - `capture_list_id` - A numeric id that can be used to retrieve the state's
* list of captures from the `t_capture_list_pool`.
* - `seeking_immediate_match` - A flag that indicates that the state's next
* step must be matched by the very next sibling. This is used when
* processing repetitions.
* - `has_in_progress_alternatives` - A flag that indicates that there are
* other states that have the same captures as this state, but are at
* different steps in their pattern. This means that in order to obey the
* 'longest-match' rule, this state should not be returned as a match until
* it is clear that there can be no other alternative match with more
* captures.
*
* NOTE(review): `step_index`, `dead` and `needs_parent` are not described
* above. Presumably `step_index` is the step the state is currently waiting
* on and `dead` marks a state scheduled for removal - confirm against the
* cursor's advance loop.
*/
typedef struct s_query_state
{
uint32_t id;
uint32_t capture_list_id;
uint16_t start_depth;
uint16_t step_index;
uint16_t pattern_index;
uint16_t consumed_capture_count : 12;
bool seeking_immediate_match : 1;
bool has_in_progress_alternatives : 1;
bool dead : 1;
bool needs_parent : 1;
} t_query_state;
// t_capture_list - a growable array of t_query_capture values.
typedef Array(t_query_capture) t_capture_list;
/*
* t_capture_list_pool - A collection of *lists* of captures. Each query state
* needs to maintain its own list of captures. To avoid repeated allocations,
* this struct maintains a fixed set of capture lists, and keeps track of which
* ones are currently in use by a query state.
*
* NOTE(review): `empty_list` is presumably the list handed out when a
* requested id is not available - confirm against the pool accessors.
*/
typedef struct s_capture_list_pool
{
Array(t_capture_list) list;
t_capture_list empty_list;
// The maximum number of capture lists that we are allowed to allocate. We
// never allow `list` to allocate more entries than this, dropping pending
// matches if needed to stay under the limit.
uint32_t max_capture_list_count;
// The number of capture lists allocated in `list` that are not currently in
// use. We reuse those existing-but-unused capture lists before trying to
// allocate any new ones. We use an invalid value (UINT32_MAX) for a capture
// list's length to indicate that it's not in use.
uint32_t free_capture_list_count;
} t_capture_list_pool;
/*
* t_analysis_state - The state needed for walking the parse table when
* analyzing a query pattern, to determine at which steps the pattern might fail
* to match.
*
* t_analysis_state_entry, defined first, is the element type of the `stack`
* array inside t_analysis_state. `field_id` (15 bits) and `done` (1 bit) are
* bit-fields, so a field id stored here is limited to 15 bits.
*/
typedef struct s_analysis_state_entry
{
t_state_id parse_state;
t_symbol parent_symbol;
uint16_t child_index;
t_field_id field_id : 15;
bool done : 1;
} t_analysis_state_entry;
// t_analysis_state - a fixed-capacity stack of t_analysis_state_entry values
// (at most MAX_ANALYSIS_STATE_DEPTH), plus a depth, a step index and a root
// symbol.
// NOTE(review): `depth` is presumably the number of `stack` entries currently
// in use - confirm against the push/pop helpers.
typedef struct s_analysis_state
{
t_analysis_state_entry stack[MAX_ANALYSIS_STATE_DEPTH];
uint16_t depth;
uint16_t step_index;
t_symbol root_symbol;
} t_analysis_state;
// t_analysis_state_set - an array of pointers to t_analysis_state.
typedef Array(t_analysis_state *) t_analysis_state_set;
// t_query_analysis - working data for query analysis: four state sets, two
// result arrays and an abort flag.
// NOTE(review): `state_pool` presumably holds recycled t_analysis_state
// allocations, and `did_abort` presumably records that the analysis gave up
// before completing - confirm against the analysis routine.
typedef struct s_query_analysis
{
t_analysis_state_set states;
t_analysis_state_set next_states;
t_analysis_state_set deeper_states;
t_analysis_state_set state_pool;
Array(uint16_t) final_step_indices;
Array(t_symbol) finished_parent_symbols;
bool did_abort;
} t_query_analysis;
/*
* t_analysis_subgraph - A subset of the states in the parse table that are used
* in constructing nodes with a certain symbol. Each state is accompanied by
* some information about the possible node that could be produced in
* downstream states.
*
* t_analysis_subgraph_node, defined first, is the element type of the `nodes`
* array inside t_analysis_subgraph. `child_index` is a 7-bit field, so it
* cannot exceed 127; `done` uses the remaining bit of the same byte-sized
* declaration.
*/
typedef struct s_analysis_subgraph_node
{
t_state_id state;
uint16_t production_id;
uint8_t child_index : 7;
bool done : 1;
} t_analysis_subgraph_node;
// t_analysis_subgraph - for one `symbol`, the array of parse states it can
// start in and the array of subgraph nodes associated with it.
typedef struct s_analysis_subgraph
{
t_symbol symbol;
Array(t_state_id) start_states;
Array(t_analysis_subgraph_node) nodes;
} t_analysis_subgraph;
// t_analysis_subgraph_array - an array of t_analysis_subgraph values.
typedef Array(t_analysis_subgraph) t_analysis_subgraph_array;
/*
* t_state_predecessor_map - A map that stores the predecessors of each parse
* state. This is used during query analysis to determine which parse states can
* lead to which reduce actions.
*
* NOTE(review): the map is a single flat `contents` buffer with no stored
* size; the per-state layout is not visible here - confirm against the
* functions that create and index the map.
*/
typedef struct s_state_predecessor_map
{
t_state_id *contents;
} t_state_predecessor_map;
/*
* t_parse_query - A tree query, compiled from a string of S-expressions. The
* query itself is immutable. The mutable state used in the process of executing
* the query is stored in a `t_query_cursor`.
*
* `captures` and `predicate_values` are symbol tables; the remaining Array
* members hold the compiled steps, the pattern map, predicate steps, patterns,
* step offsets and negated field lists.
* NOTE(review): `string_buffer` is presumably scratch space used while
* parsing string literals - confirm against the query parser.
*/
struct s_parse_query
{
t_symbol_table captures;
t_symbol_table predicate_values;
Array(t_capture_quantifiers) capture_quantifiers;
Array(t_query_step) steps;
Array(t_pattern_entry) pattern_map;
Array(t_query_predicate_step) predicate_steps;
Array(t_query_pattern) patterns;
Array(t_step_offset) step_offsets;
Array(t_field_id) negated_fields;
Array(char) string_buffer;
Array(t_symbol) repeat_symbols_with_rootless_patterns;
const t_language *language;
uint16_t wildcard_root_pattern_count;
};
/*
* t_query_cursor - A stateful struct used to execute a query on a tree.
*
* It references the (const) query being executed, owns a tree cursor, the
* arrays of in-progress and finished states, and the capture list pool.
* NOTE(review): `start_byte`/`end_byte` and `start_point`/`end_point` are
* presumably the range restriction applied to matches, and `max_start_depth`
* the limit set through ts_query_cursor_set_max_start_depth() - confirm
* against the cursor setters.
*/
struct s_query_cursor
{
const t_parse_query *query;
t_tree_cursor cursor;
Array(t_query_state) states;
Array(t_query_state) finished_states;
t_capture_list_pool capture_list_pool;
uint32_t depth;
uint32_t max_start_depth;
uint32_t start_byte;
uint32_t end_byte;
t_point start_point;
t_point end_point;
uint32_t next_state_id;
bool on_visible_node;
bool ascending;
bool halted;
bool did_exceed_match_limit;
};
// Sentinel values used by the query code.
// `NONE` is the value that terminates a step's `capture_ids` list and that
// marks a missing `alternative_index` (see the t_query_step description).
// NOTE(review): PATTERN_DONE_MARKER and NONE are both UINT16_MAX, so they
// cannot be told apart by value and must only be used in different fields.
// NOTE(review): PARENT_DONE stores -1 in a t_query_error; presumably an
// out-of-band value that is never returned to API callers - confirm.
static const t_query_error PARENT_DONE = -1;
static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX;
static const uint16_t NONE = UINT16_MAX;
@ -8991,74 +8586,6 @@ void ts_query_cursor_set_max_start_depth(t_query_cursor *self,
#undef LOG
// Forward declaration: t_stack_link and struct s_stack_node refer to each
// other.
typedef struct s_stack_node t_stack_node;
// t_stack_link - one edge stored in a stack node's `links` array: the node it
// points to, the subtree attached to the edge, and a pending flag.
// NOTE(review): `node` is presumably the predecessor (older) node on the
// stack - confirm against the push/pop code.
typedef struct s_stack_link
{
t_stack_node *node;
t_subtree subtree;
bool is_pending;
} t_stack_link;
// struct s_stack_node - a node of the parse stack: a parse state and position,
// up to MAX_LINK_COUNT links to other nodes, and per-node metrics.
// NOTE(review): `link_count` is presumably the number of `links` entries in
// use, and `ref_count` the reference count adjusted by stack_node_retain() -
// confirm in the retain/release helpers.
struct s_stack_node
{
t_state_id state;
t_length position;
t_stack_link links[MAX_LINK_COUNT];
short unsigned int link_count;
uint32_t ref_count;
unsigned error_cost;
unsigned node_count;
int dynamic_precedence;
};
// t_stack_iterator - the per-path state passed (as const) to a
// t_stack_callback: the node reached, the subtrees gathered, a subtree count
// and a pending flag.
// NOTE(review): `subtree_count` is kept separately from `subtrees`;
// presumably it counts only some of the collected subtrees - confirm in
// stack__iter().
typedef struct s_stack_iterator
{
t_stack_node *node;
t_subtree_array subtrees;
uint32_t subtree_count;
bool is_pending;
} t_stack_iterator;
// t_stack_node_array - an array of stack node pointers (the type of the
// stack's `node_pool`).
typedef Array(t_stack_node *) t_stack_node_array;
// t_stack_status - status stored in each t_stack_head: active, paused or
// halted.
// NOTE(review): a paused head presumably keeps its lookahead in
// `lookahead_when_paused` until it is resumed - confirm in the pause/resume
// functions.
typedef enum e_stack_status
{
StackStatusActive,
StackStatusPaused,
StackStatusHalted,
} t_stack_status;
// t_stack_head - one entry of the stack's `heads` array: the head's top node,
// an optional summary, two saved subtrees and a status.
// NOTE(review): each head presumably corresponds to one stack version (the
// t_stack_version passed to functions such as ts_stack_pop_all()) - confirm.
typedef struct s_stack_head
{
t_stack_node *node;
t_stack_summary *summary;
unsigned node_count_at_last_error;
t_subtree last_external_token;
t_subtree lookahead_when_paused;
t_stack_status status;
} t_stack_head;
// struct s_stack - the parse stack: its heads, the slice and iterator arrays,
// a pool of nodes, the base node and a pointer to the subtree pool.
// NOTE(review): `slices` and `iterators` are presumably scratch arrays reused
// by stack__iter(), and `node_pool` presumably holds freed nodes for reuse -
// confirm.
struct s_stack
{
Array(t_stack_head) heads;
t_stack_slice_array slices;
Array(t_stack_iterator) iterators;
t_stack_node_array node_pool;
t_stack_node *base_node;
t_subtree_pool *subtree_pool;
};
// t_stack_action - unsigned value returned by a stack callback; its values
// come from enum e_stack_action below.
typedef unsigned t_stack_action;
// StackActionNone is 0, StackActionStop is 1 and StackActionPop is 2.
// NOTE(review): Stop and Pop occupy distinct bits, so they can presumably be
// OR-ed together in one t_stack_action - confirm in stack__iter().
enum e_stack_action
{
StackActionNone,
StackActionStop = 1,
StackActionPop = 2,
};
// t_stack_callback - callback signature taking an opaque payload and the
// current iterator and returning a t_stack_action; summarize_stack_callback()
// has this signature.
typedef t_stack_action (*t_stack_callback)(void *, const t_stack_iterator *);
static void stack_node_retain(t_stack_node *self)
{
if (!self)
@ -9663,12 +9190,6 @@ t_stack_slice_array ts_stack_pop_all(t_stack *self, t_stack_version version)
return stack__iter(self, version, pop_all_callback, NULL, 0);
}
// t_summarize_stack_session - a stack summary pointer paired with a maximum
// depth.
// NOTE(review): presumably the object passed as `payload` to
// summarize_stack_callback() - confirm in that callback.
typedef struct s_summarize_stack_session
{
t_stack_summary *summary;
unsigned max_depth;
} t_summarize_stack_session;
static inline t_stack_action summarize_stack_callback(
void *payload, const t_stack_iterator *iterator)
{
@ -9894,13 +9415,6 @@ bool ts_stack_print_dot_graph(t_stack *self, const t_language *language,
return (false);
}
// t_edit - an edit expressed as three t_length values: where it starts, where
// the replaced text ended, and where the new text ends.
// NOTE(review): presumably the internal form of the t_input_edit received by
// ts_subtree_edit() - confirm in that function.
typedef struct s_edit
{
t_length start;
t_length old_end;
t_length new_end;
} t_edit;
// t_external_scanner_state
void ts_external_scanner_state_init(t_external_scanner_state *self,
@ -10659,11 +10173,6 @@ static inline void ts_subtree_set_has_changes(t_mutable_subtree *self)
t_subtree ts_subtree_edit(t_subtree self, const t_input_edit *input_edit,
t_subtree_pool *pool)
{
typedef struct s_edit_entry
{
t_subtree *tree;
t_edit edit;
} t_edit_entry;
Array(t_edit_entry) stack = array_new();
array_push(
@ -11095,17 +10604,6 @@ void ts_tree_print_dot_graph(const t_first_tree *self, int file_descriptor)
#endif
// t_cursor_child_iterator - iteration state over the children of `parent`
// inside `tree`, used by the tree cursor code. It has the same fields as
// t_node_child_iterator plus `descendant_index`.
// NOTE(review): `descendant_index` is presumably a running index of the
// current child among all descendants of the tree - confirm against the
// cursor's child-iteration helpers.
typedef struct s_cursor_child_iterator
{
t_subtree parent;
const t_first_tree *tree;
t_length position;
uint32_t child_index;
uint32_t structural_child_index;
uint32_t descendant_index;
const t_symbol *alias_sequence;
} t_cursor_child_iterator;
// t_cursor_child_iterator
static inline bool ts_tree_cursor_is_entry_visible(const t_tree_cursor *self,