started to work again on the parser
This commit is contained in:
parent
ffc7a2d0fc
commit
f5e048d02e
5 changed files with 1293 additions and 1192 deletions
545
parser/src/structs.h
Normal file
545
parser/src/structs.h
Normal file
|
|
@ -0,0 +1,545 @@
|
|||
#ifndef STRUCTS_H
|
||||
#define STRUCTS_H
|
||||
|
||||
#include "./api.h"
|
||||
|
||||
typedef unsigned t_stack_action;
|
||||
|
||||
typedef struct s_edit t_edit;
|
||||
typedef struct s_edit_entry t_edit_entry;
|
||||
typedef struct s_cursor_child_iterator t_cursor_child_iterator;
|
||||
typedef struct s_summarize_stack_session t_summarize_stack_session;
|
||||
typedef struct s_stack_node t_stack_node;
|
||||
typedef struct s_stack_link t_stack_link;
|
||||
typedef struct s_stack_head t_stack_head;
|
||||
typedef struct s_stack_iterator t_stack_iterator;
|
||||
typedef struct s_stack t_stack;
|
||||
typedef struct s_stack_head t_stack_head;
|
||||
typedef struct s_stack_iterator t_stack_iterator;
|
||||
typedef struct s_query_cursor t_query_cursor;
|
||||
typedef struct s_parse_query t_parse_query;
|
||||
typedef struct s_state_predecessor_map t_state_predecessor_map;
|
||||
typedef struct s_analysis_subgraph t_analysis_subgraph;
|
||||
typedef struct s_analysis_subgraph_node t_analysis_subgraph_node;
|
||||
typedef struct s_query_analysis t_query_analysis;
|
||||
typedef struct s_analysis_state t_analysis_state;
|
||||
typedef struct s_analysis_state_entry t_analysis_state_entry;
|
||||
typedef struct s_capture_list_pool t_capture_list_pool;
|
||||
typedef struct s_query_state t_query_state;
|
||||
typedef struct s_step_offset t_step_offset;
|
||||
typedef struct s_query_pattern t_query_pattern;
|
||||
typedef struct s_pattern_entry t_pattern_entry;
|
||||
typedef struct s_symbol_table t_symbol_table;
|
||||
typedef struct s_slice t_slice;
|
||||
typedef struct s_query_step t_query_step;
|
||||
typedef struct s_stream t_stream;
|
||||
typedef struct s_string_input t_string_input;
|
||||
typedef struct s_error_status t_error_status;
|
||||
typedef struct s_first_parser t_first_parser;
|
||||
typedef struct s_token_cache t_token_cache;
|
||||
typedef struct s_node_child_iterator t_node_child_iterator;
|
||||
typedef struct s_iterator t_iterator;
|
||||
typedef struct s_parse_query t_parse_query;
|
||||
|
||||
typedef t_stack_action (*t_stack_callback)(void *, const t_stack_iterator *);
|
||||
typedef Array(t_stack_node *) t_stack_node_array;
|
||||
typedef Array(t_analysis_subgraph) t_analysis_subgraph_array;
|
||||
typedef Array(t_analysis_state *) t_analysis_state_set;
|
||||
typedef Array(uint8_t) t_capture_quantifiers;
|
||||
typedef uint32_t (*t_unicode_decode_function)(const uint8_t *chunk,
|
||||
uint32_t size,
|
||||
int32_t *codepoint);
|
||||
typedef Array(t_query_capture) t_capture_list;
|
||||
|
||||
typedef enum e_stack_status t_stack_status;
|
||||
typedef enum e_error_comparaison t_error_comparaison;
|
||||
typedef enum e_iterator_comparison t_iterator_comparison;
|
||||
|
||||
struct s_iterator
|
||||
{
|
||||
t_tree_cursor cursor;
|
||||
const t_language *language;
|
||||
unsigned visible_depth;
|
||||
bool in_padding;
|
||||
};
|
||||
|
||||
enum e_iterator_comparison
|
||||
{
|
||||
IteratorDiffers,
|
||||
IteratorMayDiffer,
|
||||
IteratorMatches,
|
||||
};
|
||||
|
||||
struct s_node_child_iterator
|
||||
{
|
||||
t_subtree parent;
|
||||
const t_first_tree *tree;
|
||||
t_length position;
|
||||
uint32_t child_index;
|
||||
uint32_t structural_child_index;
|
||||
const t_symbol *alias_sequence;
|
||||
};
|
||||
struct s_token_cache
|
||||
{
|
||||
t_subtree token;
|
||||
t_subtree last_external_token;
|
||||
uint32_t byte_index;
|
||||
};
|
||||
|
||||
struct s_first_parser
|
||||
{
|
||||
t_lexer lexer;
|
||||
t_stack *stack;
|
||||
t_subtree_pool tree_pool;
|
||||
const t_language *language;
|
||||
t_reduce_action_set reduce_actions;
|
||||
t_subtree finished_tree;
|
||||
t_subtree_array trailing_extras;
|
||||
t_subtree_array trailing_extras2;
|
||||
t_subtree_array scratch_trees;
|
||||
t_token_cache token_cache;
|
||||
t_reusable_node reusable_node;
|
||||
void *external_scanner_payload;
|
||||
t_parser_clock end_clock;
|
||||
t_parser_duration timeout_duration;
|
||||
unsigned accept_count;
|
||||
unsigned operation_count;
|
||||
const volatile size_t *cancellation_flag;
|
||||
t_subtree old_tree;
|
||||
t_range_array included_range_differences;
|
||||
unsigned included_range_difference_index;
|
||||
bool has_scanner_error;
|
||||
};
|
||||
|
||||
struct s_error_status
|
||||
{
|
||||
unsigned cost;
|
||||
unsigned node_count;
|
||||
int dynamic_precedence;
|
||||
bool is_in_error;
|
||||
};
|
||||
|
||||
enum e_error_comparaison
|
||||
{
|
||||
ErrorComparisonTakeLeft,
|
||||
ErrorComparisonPreferLeft,
|
||||
ErrorComparisonNone,
|
||||
ErrorComparisonPreferRight,
|
||||
ErrorComparisonTakeRight,
|
||||
};
|
||||
|
||||
struct s_string_input
|
||||
{
|
||||
const char *string;
|
||||
uint32_t length;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_stream - A sequence of unicode characters derived from a UTF8 string.
|
||||
* This struct is used in parsing queries from S-expressions.
|
||||
*/
|
||||
struct s_stream
|
||||
{
|
||||
const char *input;
|
||||
const char *start;
|
||||
const char *end;
|
||||
int32_t next;
|
||||
uint8_t next_size;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_query_step - A step in the process of matching a query. Each node within
|
||||
* a query S-expression corresponds to one of these steps. An entire pattern
|
||||
* is represented as a sequence of these steps. The basic properties of a
|
||||
* node are represented by these fields:
|
||||
* - `symbol` - The grammar symbol to match. A zero value represents the
|
||||
* wildcard symbol, '_'.
|
||||
* - `field` - The field name to match. A zero value means that a field name
|
||||
* was not specified.
|
||||
* - `capture_ids` - An array of integers representing the names of captures
|
||||
* associated with this node in the pattern, terminated by a `NONE` value.
|
||||
* - `depth` - The depth where this node occurs in the pattern. The root node
|
||||
* of the pattern has depth zero.
|
||||
* - `negated_field_list_id` - An id representing a set of fields that must
|
||||
* not be present on a node matching this step.
|
||||
*
|
||||
* Steps have some additional fields in order to handle the `.` (or "anchor")
|
||||
* operator, which forbids additional child nodes:
|
||||
* - `is_immediate` - Indicates that the node matching this step cannot be
|
||||
* preceded by other sibling nodes that weren't specified in the pattern.
|
||||
* - `is_last_child` - Indicates that the node matching this step cannot have
|
||||
* any subsequent named siblings.
|
||||
*
|
||||
* For simple patterns, steps are matched in sequential order. But in order to
|
||||
* handle alternative/repeated/optional sub-patterns, query steps are not always
|
||||
* structured as a linear sequence; they sometimes need to split and merge. This
|
||||
* is done using the following fields:
|
||||
* - `alternative_index` - The index of a different query step that serves as
|
||||
* an alternative to this step. A `NONE` value represents no alternative.
|
||||
* When a query state reaches a step with an alternative index, the state
|
||||
* is duplicated, with one copy remaining at the original step, and one copy
|
||||
* moving to the alternative step. The alternative may have its own
|
||||
* alternative step, so this splitting is an iterative process.
|
||||
* - `is_dead_end` - Indicates that this state cannot be passed directly, and
|
||||
* exists only in order to redirect to an alternative index, with no
|
||||
* splitting.
|
||||
* - `is_pass_through` - Indicates that state has no matching logic of its own,
|
||||
* and exists only to split a state. One copy of the state advances
|
||||
* immediately to the next step, and one moves to the alternative step.
|
||||
* - `alternative_is_immediate` - Indicates that this step's alternative step
|
||||
* should be treated as if `is_immediate` is true.
|
||||
*
|
||||
* Steps also store some derived state that summarizes how they relate to other
|
||||
* steps within the same pattern. This is used to optimize the matching process:
|
||||
* - `contains_captures` - Indicates that this step or one of its child steps
|
||||
* has a non-empty `capture_ids` list.
|
||||
* - `parent_pattern_guaranteed` - Indicates that if this step is reached, then
|
||||
* it and all of its subsequent sibling steps within the same parent pattern
|
||||
* are guaranteed to match.
|
||||
* - `root_pattern_guaranteed` - Similar to `parent_pattern_guaranteed`, but
|
||||
* for the entire top-level pattern. When iterating through a query's
|
||||
* captures using `ts_query_cursor_next_capture`, this field is used to
|
||||
* detect that a capture can safely be returned from a match that has not
|
||||
* even completed yet.
|
||||
*/
|
||||
struct s_query_step
|
||||
{
|
||||
t_symbol symbol;
|
||||
t_symbol supertype_symbol;
|
||||
t_field_id field;
|
||||
uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT];
|
||||
uint16_t depth;
|
||||
uint16_t alternative_index;
|
||||
uint16_t negated_field_list_id;
|
||||
bool is_named : 1;
|
||||
bool is_immediate : 1;
|
||||
bool is_last_child : 1;
|
||||
bool is_pass_through : 1;
|
||||
bool is_dead_end : 1;
|
||||
bool alternative_is_immediate : 1;
|
||||
bool contains_captures : 1;
|
||||
bool root_pattern_guaranteed : 1;
|
||||
bool parent_pattern_guaranteed : 1;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_slice - A slice of an external array. Within a query, capture names,
|
||||
* literal string values, and predicate step information are stored in three
|
||||
* contiguous arrays. Individual captures, string values, and predicates are
|
||||
* represented as slices of these three arrays.
|
||||
*/
|
||||
struct s_slice
|
||||
{
|
||||
uint32_t offset;
|
||||
uint32_t length;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_symbol_table - a two-way mapping of strings to ids.
|
||||
*/
|
||||
struct s_symbol_table
|
||||
{
|
||||
Array(char) characters;
|
||||
Array(t_slice) slices;
|
||||
};
|
||||
|
||||
/**
|
||||
* CaptureQuantififers - a data structure holding the quantifiers of pattern
|
||||
* captures.
|
||||
*/
|
||||
|
||||
/*
|
||||
* t_pattern_entry - Information about the starting point for matching a
|
||||
* particular pattern. These entries are stored in a 'pattern map' - a sorted
|
||||
* array that makes it possible to efficiently lookup patterns based on the
|
||||
* symbol for their first step. The entry consists of the following fields:
|
||||
* - `pattern_index` - the index of the pattern within the query
|
||||
* - `step_index` - the index of the pattern's first step in the shared `steps`
|
||||
* array
|
||||
* - `is_rooted` - whether or not the pattern has a single root node. This
|
||||
* property affects decisions about whether or not to start the pattern for
|
||||
* nodes outside of a QueryCursor's range restriction.
|
||||
*/
|
||||
struct s_pattern_entry
|
||||
{
|
||||
uint16_t step_index;
|
||||
uint16_t pattern_index;
|
||||
bool is_rooted;
|
||||
};
|
||||
|
||||
struct s_query_pattern
|
||||
{
|
||||
t_slice steps;
|
||||
t_slice predicate_steps;
|
||||
uint32_t start_byte;
|
||||
bool is_non_local;
|
||||
};
|
||||
|
||||
struct s_step_offset
|
||||
{
|
||||
uint32_t byte_offset;
|
||||
uint16_t step_index;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_query_state - The state of an in-progress match of a particular pattern
|
||||
* in a query. While executing, a `t_query_cursor` must keep track of a number
|
||||
* of possible in-progress matches. Each of those possible matches is
|
||||
* represented as one of these states. Fields:
|
||||
* - `id` - A numeric id that is exposed to the public API. This allows the
|
||||
* caller to remove a given match, preventing any more of its captures
|
||||
* from being returned.
|
||||
* - `start_depth` - The depth in the tree where the first step of the state's
|
||||
* pattern was matched.
|
||||
* - `pattern_index` - The pattern that the state is matching.
|
||||
* - `consumed_capture_count` - The number of captures from this match that
|
||||
* have already been returned.
|
||||
* - `capture_list_id` - A numeric id that can be used to retrieve the state's
|
||||
* list of captures from the `t_capture_list_pool`.
|
||||
* - `seeking_immediate_match` - A flag that indicates that the state's next
|
||||
* step must be matched by the very next sibling. This is used when
|
||||
* processing repetitions.
|
||||
* - `has_in_progress_alternatives` - A flag that indicates that there is are
|
||||
* other states that have the same captures as this state, but are at
|
||||
* different steps in their pattern. This means that in order to obey the
|
||||
* 'longest-match' rule, this state should not be returned as a match until
|
||||
* it is clear that there can be no other alternative match with more
|
||||
* captures.
|
||||
*/
|
||||
struct s_query_state
|
||||
{
|
||||
uint32_t id;
|
||||
uint32_t capture_list_id;
|
||||
uint16_t start_depth;
|
||||
uint16_t step_index;
|
||||
uint16_t pattern_index;
|
||||
uint16_t consumed_capture_count : 12;
|
||||
bool seeking_immediate_match : 1;
|
||||
bool has_in_progress_alternatives : 1;
|
||||
bool dead : 1;
|
||||
bool needs_parent : 1;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_capture_list_pool - A collection of *lists* of captures. Each query state
|
||||
* needs to maintain its own list of captures. To avoid repeated allocations,
|
||||
* this struct maintains a fixed set of capture lists, and keeps track of which
|
||||
* ones are currently in use by a query state.
|
||||
*/
|
||||
struct s_capture_list_pool
|
||||
{
|
||||
Array(t_capture_list) list;
|
||||
t_capture_list empty_list;
|
||||
// The maximum number of capture lists that we are allowed to allocate. We
|
||||
// never allow `list` to allocate more entries than this, dropping pending
|
||||
// matches if needed to stay under the limit.
|
||||
uint32_t max_capture_list_count;
|
||||
// The number of capture lists allocated in `list` that are not currently in
|
||||
// use. We reuse those existing-but-unused capture lists before trying to
|
||||
// allocate any new ones. We use an invalid value (UINT32_MAX) for a capture
|
||||
// list's length to indicate that it's not in use.
|
||||
uint32_t free_capture_list_count;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_analysis_state - The state needed for walking the parse table when
|
||||
* analyzing a query pattern, to determine at which steps the pattern might fail
|
||||
* to match.
|
||||
*/
|
||||
struct s_analysis_state_entry
|
||||
{
|
||||
t_state_id parse_state;
|
||||
t_symbol parent_symbol;
|
||||
uint16_t child_index;
|
||||
t_field_id field_id : 15;
|
||||
bool done : 1;
|
||||
};
|
||||
|
||||
struct s_analysis_state
|
||||
{
|
||||
t_analysis_state_entry stack[MAX_ANALYSIS_STATE_DEPTH];
|
||||
uint16_t depth;
|
||||
uint16_t step_index;
|
||||
t_symbol root_symbol;
|
||||
};
|
||||
|
||||
struct s_query_analysis
|
||||
{
|
||||
t_analysis_state_set states;
|
||||
t_analysis_state_set next_states;
|
||||
t_analysis_state_set deeper_states;
|
||||
t_analysis_state_set state_pool;
|
||||
Array(uint16_t) final_step_indices;
|
||||
Array(t_symbol) finished_parent_symbols;
|
||||
bool did_abort;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_analysis_subgraph - A subset of the states in the parse table that are used
|
||||
* in constructing nodes with a certain symbol. Each state is accompanied by
|
||||
* some information about the possible node that could be produced in
|
||||
* downstream states.
|
||||
*/
|
||||
struct s_analysis_subgraph_node
|
||||
{
|
||||
t_state_id state;
|
||||
uint16_t production_id;
|
||||
uint8_t child_index : 7;
|
||||
bool done : 1;
|
||||
};
|
||||
|
||||
struct s_analysis_subgraph
|
||||
{
|
||||
t_symbol symbol;
|
||||
Array(t_state_id) start_states;
|
||||
Array(t_analysis_subgraph_node) nodes;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_state_predecessor_map - A map that stores the predecessors of each parse
|
||||
* state. This is used during query analysis to determine which parse states can
|
||||
* lead to which reduce actions.
|
||||
*/
|
||||
|
||||
struct s_state_predecessor_map
|
||||
{
|
||||
t_state_id *contents;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_parse_query - A tree query, compiled from a string of S-expressions. The
|
||||
* query itself is immutable. The mutable state used in the process of executing
|
||||
* the query is stored in a `t_query_cursor`.
|
||||
*/
|
||||
struct s_parse_query
|
||||
{
|
||||
t_symbol_table captures;
|
||||
t_symbol_table predicate_values;
|
||||
Array(t_capture_quantifiers) capture_quantifiers;
|
||||
Array(t_query_step) steps;
|
||||
Array(t_pattern_entry) pattern_map;
|
||||
Array(t_query_predicate_step) predicate_steps;
|
||||
Array(t_query_pattern) patterns;
|
||||
Array(t_step_offset) step_offsets;
|
||||
Array(t_field_id) negated_fields;
|
||||
Array(char) string_buffer;
|
||||
Array(t_symbol) repeat_symbols_with_rootless_patterns;
|
||||
const t_language *language;
|
||||
uint16_t wildcard_root_pattern_count;
|
||||
};
|
||||
|
||||
/*
|
||||
* t_query_cursor - A stateful struct used to execute a query on a tree.
|
||||
*/
|
||||
struct s_query_cursor
|
||||
{
|
||||
const t_parse_query *query;
|
||||
t_tree_cursor cursor;
|
||||
Array(t_query_state) states;
|
||||
Array(t_query_state) finished_states;
|
||||
t_capture_list_pool capture_list_pool;
|
||||
uint32_t depth;
|
||||
uint32_t max_start_depth;
|
||||
uint32_t start_byte;
|
||||
uint32_t end_byte;
|
||||
t_point start_point;
|
||||
t_point end_point;
|
||||
uint32_t next_state_id;
|
||||
bool on_visible_node;
|
||||
bool ascending;
|
||||
bool halted;
|
||||
bool did_exceed_match_limit;
|
||||
};
|
||||
|
||||
struct s_stack_link
|
||||
{
|
||||
t_stack_node *node;
|
||||
t_subtree subtree;
|
||||
bool is_pending;
|
||||
};
|
||||
|
||||
struct s_stack_node
|
||||
{
|
||||
t_state_id state;
|
||||
t_length position;
|
||||
t_stack_link links[MAX_LINK_COUNT];
|
||||
short unsigned int link_count;
|
||||
uint32_t ref_count;
|
||||
unsigned error_cost;
|
||||
unsigned node_count;
|
||||
int dynamic_precedence;
|
||||
};
|
||||
|
||||
struct s_stack_iterator
|
||||
{
|
||||
t_stack_node *node;
|
||||
t_subtree_array subtrees;
|
||||
uint32_t subtree_count;
|
||||
bool is_pending;
|
||||
};
|
||||
|
||||
enum e_stack_status
|
||||
{
|
||||
StackStatusActive,
|
||||
StackStatusPaused,
|
||||
StackStatusHalted,
|
||||
};
|
||||
|
||||
struct s_stack_head
|
||||
{
|
||||
t_stack_node *node;
|
||||
t_stack_summary *summary;
|
||||
unsigned node_count_at_last_error;
|
||||
t_subtree last_external_token;
|
||||
t_subtree lookahead_when_paused;
|
||||
t_stack_status status;
|
||||
};
|
||||
|
||||
struct s_stack
|
||||
{
|
||||
Array(t_stack_head) heads;
|
||||
t_stack_slice_array slices;
|
||||
Array(t_stack_iterator) iterators;
|
||||
t_stack_node_array node_pool;
|
||||
t_stack_node *base_node;
|
||||
t_subtree_pool *subtree_pool;
|
||||
};
|
||||
|
||||
enum e_stack_action
|
||||
{
|
||||
StackActionNone,
|
||||
StackActionStop = 1,
|
||||
StackActionPop = 2,
|
||||
};
|
||||
|
||||
struct s_summarize_stack_session
|
||||
{
|
||||
t_stack_summary *summary;
|
||||
unsigned max_depth;
|
||||
};
|
||||
|
||||
struct s_edit
|
||||
{
|
||||
t_length start;
|
||||
t_length old_end;
|
||||
t_length new_end;
|
||||
};
|
||||
|
||||
struct s_edit_entry
|
||||
{
|
||||
t_subtree *tree;
|
||||
t_edit edit;
|
||||
};
|
||||
|
||||
struct s_cursor_child_iterator
|
||||
{
|
||||
t_subtree parent;
|
||||
const t_first_tree *tree;
|
||||
t_length position;
|
||||
uint32_t child_index;
|
||||
uint32_t structural_child_index;
|
||||
uint32_t descendant_index;
|
||||
const t_symbol *alias_sequence;
|
||||
};
|
||||
|
||||
#endif // STRUCTS_H
|
||||
Loading…
Add table
Add a link
Reference in a new issue