/* minishell/parser/src/structs.h */

#ifndef STRUCTS_H
#define STRUCTS_H

#include "./api.h"

/*
 * t_stack_action - Return type of a t_stack_callback. It is a plain
 * `unsigned`, not the enum type itself; the named values live in
 * `enum e_stack_action` (0, 1 and 2).
 * NOTE(review): the 1/2 values look like they are meant to be OR-ed together
 * as flags, which would explain the integer type - confirm against the
 * callbacks.
 */
typedef unsigned t_stack_action;

/*
 * Forward declarations. Every struct tag defined in this header gets its
 * `t_` typedef name up front, so the definitions below can refer to one
 * another (by pointer, or by value once the referenced struct is complete)
 * regardless of the order they are written in.
 */
typedef struct s_analysis_state			 t_analysis_state;
typedef struct s_analysis_state_entry	 t_analysis_state_entry;
typedef struct s_analysis_subgraph		 t_analysis_subgraph;
typedef struct s_analysis_subgraph_node	 t_analysis_subgraph_node;
typedef struct s_capture_list_pool		 t_capture_list_pool;
typedef struct s_cursor_child_iterator	 t_cursor_child_iterator;
typedef struct s_edit					 t_edit;
typedef struct s_edit_entry				 t_edit_entry;
typedef struct s_error_status			 t_error_status;
typedef struct s_first_parser			 t_first_parser;
typedef struct s_iterator				 t_iterator;
typedef struct s_node_child_iterator	 t_node_child_iterator;
typedef struct s_parse_query			 t_parse_query;
typedef struct s_pattern_entry			 t_pattern_entry;
typedef struct s_query_analysis			 t_query_analysis;
typedef struct s_query_cursor			 t_query_cursor;
typedef struct s_query_pattern			 t_query_pattern;
typedef struct s_query_state			 t_query_state;
typedef struct s_query_step				 t_query_step;
typedef struct s_slice					 t_slice;
typedef struct s_stack					 t_stack;
typedef struct s_stack_head				 t_stack_head;
typedef struct s_stack_iterator			 t_stack_iterator;
typedef struct s_stack_link				 t_stack_link;
typedef struct s_stack_node				 t_stack_node;
typedef struct s_state_predecessor_map	 t_state_predecessor_map;
typedef struct s_step_offset			 t_step_offset;
typedef struct s_stream					 t_stream;
typedef struct s_string_input			 t_string_input;
typedef struct s_summarize_stack_session t_summarize_stack_session;
typedef struct s_symbol_table			 t_symbol_table;
typedef struct s_token_cache			 t_token_cache;

/*
 * t_stack_callback - Pointer to a function that receives an untyped pointer
 * and a read-only stack iterator, and returns a t_stack_action.
 * NOTE(review): the `void *` is presumably a caller-supplied payload passed
 * through unchanged - confirm at the call sites.
 */
typedef t_stack_action (*t_stack_callback)(void *, const t_stack_iterator *);
/*
 * t_unicode_decode_function - Pointer to a decoder that reads from a byte
 * buffer and writes one code point through an out-parameter.
 * - `chunk` - the input bytes (read-only).
 * - `size` - a 32-bit size that accompanies `chunk`.
 * - `codepoint` - where the decoded value is stored.
 * NOTE(review): the return value is presumably the number of bytes consumed,
 * and `size` presumably the number of bytes available in `chunk` - neither
 * is visible from this header; confirm against the implementations.
 */
typedef uint32_t (*t_unicode_decode_function)(const uint8_t *chunk,
											  uint32_t		 size,
											  int32_t		*codepoint);

/*
 * Named instantiations of the `Array(T)` macro. The macro is not defined in
 * this file; presumably it arrives through "./api.h" - confirm.
 * Element types:
 * - t_analysis_state_set - pointers to t_analysis_state.
 * - t_analysis_subgraph_array - t_analysis_subgraph values.
 * - t_capture_list - t_query_capture values.
 * - t_stack_node_array - pointers to t_stack_node.
 * - t_capture_quantifiers - single bytes (uint8_t).
 */
typedef Array(t_analysis_state *) t_analysis_state_set;
typedef Array(t_analysis_subgraph) t_analysis_subgraph_array;
typedef Array(t_query_capture) t_capture_list;
typedef Array(t_stack_node *) t_stack_node_array;
typedef Array(uint8_t) t_capture_quantifiers;

/*
 * Typedef names for three of the enums defined further down in this header.
 * (`enum e_stack_action` has no typedef here; t_stack_action is a plain
 * `unsigned`.)
 *
 * NOTE(review): each line names an enum tag before its enumerator list has
 * been seen. ISO C only permits `enum tag` without a list once the type is
 * complete (C11 6.7.2.3p3); GCC and Clang accept it as an extension and warn
 * under -pedantic. Moving each typedef below its enum definition would make
 * the header strictly conforming.
 *
 * NOTE(review): `e_error_comparaison` / `t_error_comparaison` look like a
 * misspelling of "comparison" (compare `t_iterator_comparison`). Renaming
 * would touch every user of the type, so it is only flagged here.
 */
typedef enum e_stack_status		   t_stack_status;
typedef enum e_error_comparaison   t_error_comparaison;
typedef enum e_iterator_comparison t_iterator_comparison;

/*
 * t_iterator - A tree cursor bundled with a language pointer and two pieces
 * of traversal bookkeeping.
 * - `cursor` - the tree cursor, stored by value.
 * - `language` - the language definition; not modified through this struct.
 * - `visible_depth` - a depth counter. NOTE(review): presumably counts only
 *    visible nodes, as the name suggests - confirm.
 * - `in_padding` - NOTE(review): presumably set while the iterator sits in
 *    the padding that precedes a node rather than in the node itself -
 *    confirm.
 */
struct s_iterator
{
	t_tree_cursor	  cursor;
	const t_language *language;
	unsigned		  visible_depth;
	bool			  in_padding;
};

/*
 * e_iterator_comparison - Three-valued outcome of comparing two iterators
 * (going by the enumerator names: definitely different, possibly different,
 * or matching). The values are spelled out; they are the same 0, 1, 2 the
 * compiler would assign in declaration order.
 */
enum e_iterator_comparison
{
	IteratorDiffers = 0,
	IteratorMayDiffer = 1,
	IteratorMatches = 2,
};

/*
 * t_node_child_iterator - Iteration state for stepping through the children
 * of one subtree.
 * - `parent` - the subtree being iterated (by value).
 * - `tree` - the tree the iteration belongs to; read-only.
 * - `position` - a t_length position. NOTE(review): presumably the position
 *    of the next child to be returned - confirm.
 * - `child_index` - index of the current child.
 * - `structural_child_index` - a second child index. NOTE(review):
 *    presumably skips "extra" children so that it can be used to index
 *    `alias_sequence` - confirm.
 * - `alias_sequence` - read-only array of symbols. NOTE(review): presumably
 *    the alias sequence of the parent's production - confirm.
 */
struct s_node_child_iterator
{
	t_subtree			parent;
	const t_first_tree *tree;
	t_length			position;
	uint32_t			child_index;
	uint32_t			structural_child_index;
	const t_symbol	   *alias_sequence;
};
/*
 * t_token_cache - A token subtree stored together with a "last external
 * token" subtree and a byte index. It is embedded by value in
 * t_first_parser.
 * NOTE(review): presumably `byte_index` and `last_external_token` identify
 * the lexing context in which `token` was produced, so the token can be
 * reused when the parser returns to the same context - confirm against the
 * parser.
 */
struct s_token_cache
{
	t_subtree token;
	t_subtree last_external_token;
	uint32_t  byte_index;
};

/*
 * t_first_parser - The parser's complete working state.
 *
 * Stored by value: `lexer`, `tree_pool`, `reduce_actions`, `finished_tree`,
 * the three subtree arrays, `token_cache`, `reusable_node`, `old_tree` and
 * `included_range_differences`.
 * Held by pointer: `stack`, `language`, `external_scanner_payload` and
 * `cancellation_flag`.
 *
 * - `language` - the grammar in use; not modified through this struct.
 * - `external_scanner_payload` - untyped pointer. NOTE(review): presumably
 *    owned by the language's external scanner - confirm who frees it.
 * - `end_clock` / `timeout_duration` - NOTE(review): presumably the deadline
 *    and the configured time budget for a parse - confirm.
 * - `accept_count`, `operation_count` - unsigned counters.
 * - `cancellation_flag` - pointer to a `size_t` the parser may only read.
 *    It is declared `volatile`. NOTE(review): `volatile` alone does not give
 *    inter-thread synchronisation in C11; if another thread writes the
 *    flag, confirm this is adequate for the intended use.
 * - `included_range_difference_index` - an index. NOTE(review): presumably
 *    into `included_range_differences` - confirm.
 * - `has_scanner_error` - NOTE(review): presumably set when the external
 *    scanner reports a failure - confirm.
 */
struct s_first_parser
{
	t_lexer				   lexer;
	t_stack				  *stack;
	t_subtree_pool		   tree_pool;
	const t_language	  *language;
	t_reduce_action_set	   reduce_actions;
	t_subtree			   finished_tree;
	t_subtree_array		   trailing_extras;
	t_subtree_array		   trailing_extras2;
	t_subtree_array		   scratch_trees;
	t_token_cache		   token_cache;
	t_reusable_node		   reusable_node;
	void				  *external_scanner_payload;
	t_parser_clock		   end_clock;
	t_parser_duration	   timeout_duration;
	unsigned			   accept_count;
	unsigned			   operation_count;
	const volatile size_t *cancellation_flag;
	t_subtree			   old_tree;
	t_range_array		   included_range_differences;
	unsigned			   included_range_difference_index;
	bool				   has_scanner_error;
};

/*
 * t_error_status - Four numbers/flags summarising one parse alternative:
 * an error cost, a node count, a (signed) dynamic precedence and whether it
 * is currently inside an error.
 * NOTE(review): presumably two of these are compared to produce a
 * t_error_comparaison - confirm against the parser.
 */
struct s_error_status
{
	unsigned cost;
	unsigned node_count;
	int		 dynamic_precedence;
	bool	 is_in_error;
};

/*
 * e_error_comparaison - Five-way comparison outcome. Going by the names, the
 * enumerators run from "left wins outright" through "no preference" to
 * "right wins outright". The values are spelled out; they are the same
 * 0..4 the compiler would assign in declaration order.
 * The tag keeps its existing spelling because other declarations refer to
 * it.
 */
enum e_error_comparaison
{
	ErrorComparisonTakeLeft = 0,
	ErrorComparisonPreferLeft = 1,
	ErrorComparisonNone = 2,
	ErrorComparisonPreferRight = 3,
	ErrorComparisonTakeRight = 4,
};

/*
 * t_string_input - A read-only character pointer paired with an explicit
 * 32-bit length.
 * NOTE(review): whether `length` is counted in bytes and whether `string`
 * must also be NUL-terminated cannot be told from this header - confirm
 * against the code that reads from it.
 */
struct s_string_input
{
	const char *string;
	uint32_t	length;
};

/*
 * t_stream - A sequence of unicode characters derived from a UTF8 string.
 * This struct is used in parsing queries from S-expressions.
 * - `input`, `start`, `end` - three read-only pointers into the text.
 *    NOTE(review): presumably `start`/`end` bound the whole string and
 *    `input` is the current read position - confirm.
 * - `next` - a signed 32-bit value. NOTE(review): presumably the code point
 *    at the current position - confirm.
 * - `next_size` - NOTE(review): presumably the number of bytes that `next`
 *    occupies in the UTF8 text - confirm.
 */
struct s_stream
{
	const char *input;
	const char *start;
	const char *end;
	int32_t		next;
	uint8_t		next_size;
};

/*
 * t_query_step - A step in the process of matching a query. Each node within
 * a query S-expression corresponds to one of these steps. An entire pattern
 * is represented as a sequence of these steps. The basic properties of a
 * node are represented by these fields:
 * - `symbol` - The grammar symbol to match. A zero value represents the
 *    wildcard symbol, '_'.
 * - `field` - The field name to match. A zero value means that a field name
 *    was not specified.
 * - `capture_ids` - An array of integers representing the names of captures
 *    associated with this node in the pattern, terminated by a `NONE` value.
 *    The array has room for MAX_STEP_CAPTURE_COUNT entries.
 * - `depth` - The depth where this node occurs in the pattern. The root node
 *    of the pattern has depth zero.
 * - `negated_field_list_id` - An id representing a set of fields that must
 *    not be present on a node matching this step.
 *
 * Two fields are not covered by the notes above:
 * - `supertype_symbol` - NOTE(review): presumably the supertype given in a
 *    `supertype/subtype` pattern, with zero meaning "none" - confirm.
 * - `is_named` - NOTE(review): presumably whether the step only matches
 *    named nodes - confirm.
 *
 * Steps have some additional fields in order to handle the `.` (or "anchor")
 * operator, which forbids additional child nodes:
 * - `is_immediate` - Indicates that the node matching this step cannot be
 *    preceded by other sibling nodes that weren't specified in the pattern.
 * - `is_last_child` - Indicates that the node matching this step cannot have
 *    any subsequent named siblings.
 *
 * For simple patterns, steps are matched in sequential order. But in order to
 * handle alternative/repeated/optional sub-patterns, query steps are not always
 * structured as a linear sequence; they sometimes need to split and merge. This
 * is done using the following fields:
 * - `alternative_index` - The index of a different query step that serves as
 *    an alternative to this step. A `NONE` value represents no alternative.
 *    When a query state reaches a step with an alternative index, the state
 *    is duplicated, with one copy remaining at the original step, and one copy
 *    moving to the alternative step. The alternative may have its own
 *    alternative step, so this splitting is an iterative process.
 * - `is_dead_end` - Indicates that this state cannot be passed directly, and
 *    exists only in order to redirect to an alternative index, with no
 *    splitting.
 * - `is_pass_through` - Indicates that state has no matching logic of its own,
 *    and exists only to split a state. One copy of the state advances
 *    immediately to the next step, and one moves to the alternative step.
 * - `alternative_is_immediate` - Indicates that this step's alternative step
 *    should be treated as if `is_immediate` is true.
 *
 * Steps also store some derived state that summarizes how they relate to other
 * steps within the same pattern. This is used to optimize the matching process:
 * - `contains_captures` - Indicates that this step or one of its child steps
 *    has a non-empty `capture_ids` list.
 * - `parent_pattern_guaranteed` - Indicates that if this step is reached, then
 *    it and all of its subsequent sibling steps within the same parent pattern
 *    are guaranteed to match.
 * - `root_pattern_guaranteed` - Similar to `parent_pattern_guaranteed`, but
 *    for the entire top-level pattern. When iterating through a query's
 *    captures using `ts_query_cursor_next_capture`, this field is used to
 *    detect that a capture can safely be returned from a match that has not
 *    even completed yet.
 *
 * All nine flags are declared as 1-bit `bool` bit-fields.
 */
struct s_query_step
{
	t_symbol   symbol;
	t_symbol   supertype_symbol;
	t_field_id field;
	uint16_t   capture_ids[MAX_STEP_CAPTURE_COUNT];
	uint16_t   depth;
	uint16_t   alternative_index;
	uint16_t   negated_field_list_id;
	bool	   is_named : 1;
	bool	   is_immediate : 1;
	bool	   is_last_child : 1;
	bool	   is_pass_through : 1;
	bool	   is_dead_end : 1;
	bool	   alternative_is_immediate : 1;
	bool	   contains_captures : 1;
	bool	   root_pattern_guaranteed : 1;
	bool	   parent_pattern_guaranteed : 1;
};

/*
 * t_slice - A slice of an external array. Within a query, capture names,
 * literal string values, and predicate step information are stored in three
 * contiguous arrays. Individual captures, string values, and predicates are
 * represented as slices of these three arrays.
 * - `offset` - where the slice begins within its backing array.
 * - `length` - how much of the backing array the slice covers.
 * The slice does not record which array it refers to; that is implied by
 * where the slice is stored.
 */
struct s_slice
{
	uint32_t offset;
	uint32_t length;
};

/*
 * t_symbol_table - a two-way mapping of strings to ids.
 * - `characters` - a single array of `char`.
 * - `slices` - an array of t_slice.
 * NOTE(review): presumably all strings are stored back to back in
 * `characters`, each entry of `slices` delimits one of them, and a string's
 * id is its index in `slices` - confirm against the lookup/insert code.
 */
struct s_symbol_table
{
	Array(char) characters;
	Array(t_slice) slices;
};

/*
 * t_capture_quantifiers - a data structure holding the quantifiers of pattern
 * captures. The type itself is declared near the top of this header as an
 * Array(uint8_t).
 */

/*
 * t_pattern_entry - Information about the starting point for matching a
 * particular pattern. These entries are stored in a 'pattern map' - a sorted
 * array that makes it possible to efficiently lookup patterns based on the
 * symbol for their first step. The entry consists of the following fields:
 * - `step_index` - the index of the pattern's first step in the shared
 *    `steps` array.
 * - `pattern_index` - the index of the pattern within the query.
 * - `is_rooted` - whether or not the pattern has a single root node. This
 *    property affects decisions about whether or not to start the pattern
 *    for nodes outside of a QueryCursor's range restriction.
 * Both indices are 16 bits wide.
 */
struct s_pattern_entry
{
	uint16_t step_index;
	uint16_t pattern_index;
	bool	 is_rooted;
};

/*
 * t_query_pattern - Per-pattern bookkeeping inside a query.
 * - `steps` - a slice. NOTE(review): presumably of the query's shared
 *    `steps` array - confirm.
 * - `predicate_steps` - a slice. NOTE(review): presumably of the query's
 *    shared `predicate_steps` array - confirm.
 * - `start_byte` - a byte position. NOTE(review): presumably where the
 *    pattern starts in the query source text - confirm.
 * - `is_non_local` - NOTE(review): meaning not visible from this header;
 *    confirm against the query compiler.
 */
struct s_query_pattern
{
	t_slice	 steps;
	t_slice	 predicate_steps;
	uint32_t start_byte;
	bool	 is_non_local;
};

/*
 * t_step_offset - Associates a step index with a byte offset.
 * NOTE(review): presumably records where in the query source text a given
 * step came from, so that positions can be reported per step - confirm.
 */
struct s_step_offset
{
	uint32_t byte_offset;
	uint16_t step_index;
};

/*
 * t_query_state - The state of an in-progress match of a particular pattern
 * in a query. While executing, a `t_query_cursor` must keep track of a number
 * of possible in-progress matches. Each of those possible matches is
 * represented as one of these states. Fields:
 * - `id` - A numeric id that is exposed to the public API. This allows the
 *    caller to remove a given match, preventing any more of its captures
 *    from being returned.
 * - `start_depth` - The depth in the tree where the first step of the state's
 *    pattern was matched.
 * - `pattern_index` - The pattern that the state is matching.
 * - `consumed_capture_count` - The number of captures from this match that
 *    have already been returned. Stored in a 12-bit field, so it cannot
 *    exceed 4095.
 * - `capture_list_id` - A numeric id that can be used to retrieve the state's
 *    list of captures from the `t_capture_list_pool`.
 * - `seeking_immediate_match` - A flag that indicates that the state's next
 *    step must be matched by the very next sibling. This is used when
 *    processing repetitions.
 * - `has_in_progress_alternatives` - A flag that indicates that there are
 *    other states that have the same captures as this state, but are at
 *    different steps in their pattern. This means that in order to obey the
 *    'longest-match' rule, this state should not be returned as a match until
 *    it is clear that there can be no other alternative match with more
 *    captures.
 *
 * Fields not covered by the notes above:
 * - `step_index` - NOTE(review): presumably the index of the step the state
 *    is currently waiting to match - confirm.
 * - `dead` - NOTE(review): presumably marks a state that should be discarded
 *    - confirm.
 * - `needs_parent` - NOTE(review): meaning not visible from this header;
 *    confirm against the cursor code.
 */
struct s_query_state
{
	uint32_t id;
	uint32_t capture_list_id;
	uint16_t start_depth;
	uint16_t step_index;
	uint16_t pattern_index;
	uint16_t consumed_capture_count : 12;
	bool	 seeking_immediate_match : 1;
	bool	 has_in_progress_alternatives : 1;
	bool	 dead : 1;
	bool	 needs_parent : 1;
};

/*
 * t_capture_list_pool - A collection of *lists* of captures. Each query state
 * needs to maintain its own list of captures. To avoid repeated allocations,
 * this struct maintains a fixed set of capture lists, and keeps track of which
 * ones are currently in use by a query state.
 * - `list` - the pooled capture lists.
 * - `empty_list` - a separate capture list that is not part of `list`.
 *    NOTE(review): presumably returned when a lookup has no real list to
 *    hand out - confirm.
 */
struct s_capture_list_pool
{
	Array(t_capture_list) list;
	t_capture_list empty_list;
	// The maximum number of capture lists that we are allowed to allocate. We
	// never allow `list` to allocate more entries than this, dropping pending
	// matches if needed to stay under the limit.
	uint32_t max_capture_list_count;
	// The number of capture lists allocated in `list` that are not currently in
	// use. We reuse those existing-but-unused capture lists before trying to
	// allocate any new ones. We use an invalid value (UINT32_MAX) for a capture
	// list's length to indicate that it's not in use.
	uint32_t free_capture_list_count;
};

/*
 * t_analysis_state - The state needed for walking the parse table when
 * analyzing a query pattern, to determine at which steps the pattern might fail
 * to match.
 *
 * t_analysis_state_entry - One element of the `stack` array inside a
 * t_analysis_state.
 * - `parse_state` - a parse-table state id.
 * - `parent_symbol` - a grammar symbol. NOTE(review): presumably the symbol
 *    of the node being built at this stack level - confirm.
 * - `child_index` - NOTE(review): presumably the position reached among that
 *    node's children - confirm.
 * - `field_id` - a field id stored in a 15-bit bit-field.
 * - `done` - a 1-bit flag declared next to `field_id`. NOTE(review):
 *    presumably set once this level's node is complete - confirm.
 */
struct s_analysis_state_entry
{
	t_state_id parse_state;
	t_symbol   parent_symbol;
	uint16_t   child_index;
	t_field_id field_id : 15;
	bool	   done : 1;
};

/*
 * t_analysis_state - A fixed-capacity stack of t_analysis_state_entry plus
 * the bookkeeping that goes with it.
 * - `stack` - inline storage for MAX_ANALYSIS_STATE_DEPTH entries.
 * - `depth` - NOTE(review): presumably the number of entries of `stack`
 *    currently in use - confirm.
 * - `step_index` - NOTE(review): presumably the query step this state has
 *    reached - confirm.
 * - `root_symbol` - a grammar symbol. NOTE(review): presumably the symbol
 *    the analysis started from - confirm.
 */
struct s_analysis_state
{
	t_analysis_state_entry stack[MAX_ANALYSIS_STATE_DEPTH];
	uint16_t			   depth;
	uint16_t			   step_index;
	t_symbol			   root_symbol;
};

/*
 * t_query_analysis - Working storage for one run of query analysis.
 * - `states`, `next_states`, `deeper_states`, `state_pool` - four sets of
 *    pointers to t_analysis_state. NOTE(review): presumably the current
 *    frontier, the frontier being built, states deferred because they go
 *    deeper, and a free list for reuse - confirm.
 * - `final_step_indices` - an array of 16-bit step indices.
 * - `finished_parent_symbols` - an array of grammar symbols.
 * - `did_abort` - NOTE(review): presumably set when the analysis gave up
 *    before finishing - confirm.
 */
struct s_query_analysis
{
	t_analysis_state_set states;
	t_analysis_state_set next_states;
	t_analysis_state_set deeper_states;
	t_analysis_state_set state_pool;
	Array(uint16_t) final_step_indices;
	Array(t_symbol) finished_parent_symbols;
	bool did_abort;
};

/*
 * t_analysis_subgraph - A subset of the states in the parse table that are used
 * in constructing nodes with a certain symbol. Each state is accompanied by
 * some information about the possible node that could be produced in
 * downstream states.
 *
 * t_analysis_subgraph_node - One element of the `nodes` array inside a
 * t_analysis_subgraph.
 * - `state` - a parse-table state id.
 * - `production_id` - a 16-bit production id.
 * - `child_index` - a child index stored in 7 bits, so it cannot exceed 127.
 * - `done` - a 1-bit flag sharing a byte with `child_index`. NOTE(review):
 *    presumably set when the node would be complete in this state -
 *    confirm.
 */
struct s_analysis_subgraph_node
{
	t_state_id state;
	uint16_t   production_id;
	uint8_t	   child_index : 7;
	bool	   done : 1;
};

/*
 * t_analysis_subgraph - The parse states involved in building nodes of one
 * symbol.
 * - `symbol` - the grammar symbol the subgraph is about.
 * - `start_states` - an array of parse-table state ids. NOTE(review):
 *    presumably the states in which a node of `symbol` can begin -
 *    confirm.
 * - `nodes` - an array of t_analysis_subgraph_node.
 */
struct s_analysis_subgraph
{
	t_symbol symbol;
	Array(t_state_id) start_states;
	Array(t_analysis_subgraph_node) nodes;
};

/*
 * t_state_predecessor_map - A map that stores the predecessors of each parse
 * state. This is used during query analysis to determine which parse states can
 * lead to which reduce actions.
 * - `contents` - pointer to a buffer of parse-table state ids. The struct
 *    stores no size or stride. NOTE(review): the buffer's layout and who
 *    allocates/frees it are not visible from this header - confirm against
 *    the functions that build and read the map.
 */

struct s_state_predecessor_map
{
	t_state_id *contents;
};

/*
 * t_parse_query - A tree query, compiled from a string of S-expressions. The
 * query itself is immutable. The mutable state used in the process of executing
 * the query is stored in a `t_query_cursor`.
 * - `captures`, `predicate_values` - two string/id symbol tables.
 * - `capture_quantifiers` - an array of t_capture_quantifiers.
 *    NOTE(review): presumably one entry per pattern - confirm.
 * - `steps` - the shared array of query steps.
 * - `pattern_map` - an array of t_pattern_entry.
 * - `predicate_steps` - an array of predicate steps.
 * - `patterns` - an array of t_query_pattern.
 * - `step_offsets` - an array of t_step_offset.
 * - `negated_fields` - an array of field ids. NOTE(review): presumably what
 *    a step's `negated_field_list_id` refers into - confirm.
 * - `string_buffer` - an array of `char`. NOTE(review): presumably scratch
 *    space used while compiling string literals - confirm.
 * - `repeat_symbols_with_rootless_patterns` - an array of grammar symbols.
 * - `language` - the language the query is for; not modified through this
 *    struct.
 * - `wildcard_root_pattern_count` - a 16-bit count.
 */
struct s_parse_query
{
	t_symbol_table captures;
	t_symbol_table predicate_values;
	Array(t_capture_quantifiers) capture_quantifiers;
	Array(t_query_step) steps;
	Array(t_pattern_entry) pattern_map;
	Array(t_query_predicate_step) predicate_steps;
	Array(t_query_pattern) patterns;
	Array(t_step_offset) step_offsets;
	Array(t_field_id) negated_fields;
	Array(char) string_buffer;
	Array(t_symbol) repeat_symbols_with_rootless_patterns;
	const t_language *language;
	uint16_t		  wildcard_root_pattern_count;
};

/*
 * t_query_cursor - A stateful struct used to execute a query on a tree.
 * - `query` - the query being executed; not modified through this struct.
 * - `cursor` - the tree cursor, stored by value.
 * - `states` - an array of t_query_state.
 * - `finished_states` - a second array of t_query_state. NOTE(review):
 *    presumably `states` holds the in-progress matches and this one the
 *    completed ones - confirm.
 * - `capture_list_pool` - the pool that backs each state's capture list.
 * - `depth` - NOTE(review): presumably the cursor's current depth in the
 *    tree - confirm.
 * - `max_start_depth` - NOTE(review): presumably an upper bound on the depth
 *    at which a match may start - confirm.
 * - `start_byte` / `end_byte`, `start_point` / `end_point` - a byte range
 *    and a point range. NOTE(review): presumably restrict which part of the
 *    tree is searched - confirm.
 * - `next_state_id` - NOTE(review): presumably the value assigned to the
 *    `id` of the next state created - confirm.
 * - `on_visible_node`, `ascending`, `halted` - traversal flags.
 * - `did_exceed_match_limit` - NOTE(review): presumably set when matches had
 *    to be dropped because the capture-list limit was reached - confirm.
 */
struct s_query_cursor
{
	const t_parse_query *query;
	t_tree_cursor		 cursor;
	Array(t_query_state) states;
	Array(t_query_state) finished_states;
	t_capture_list_pool capture_list_pool;
	uint32_t			depth;
	uint32_t			max_start_depth;
	uint32_t			start_byte;
	uint32_t			end_byte;
	t_point				start_point;
	t_point				end_point;
	uint32_t			next_state_id;
	bool				on_visible_node;
	bool				ascending;
	bool				halted;
	bool				did_exceed_match_limit;
};

/*
 * t_stack_link - One element of the `links` array of a t_stack_node: a
 * pointer to another stack node together with a subtree and a flag.
 * - `node` - the stack node at the other end of the link.
 * - `subtree` - the subtree attached to the link.
 * - `is_pending` - NOTE(review): meaning not visible from this header;
 *    confirm against the stack code.
 */
struct s_stack_link
{
	t_stack_node *node;
	t_subtree	  subtree;
	bool		  is_pending;
};

/*
 * t_stack_node - A node of the parse stack. Each node stores its links
 * inline, so a node can point at several other nodes.
 * - `state` - a parse-table state id.
 * - `position` - a t_length position.
 * - `links` - inline storage for MAX_LINK_COUNT links.
 * - `link_count` - NOTE(review): presumably how many entries of `links` are
 *    in use - confirm.
 * - `ref_count` - a 32-bit counter. NOTE(review): presumably a reference
 *    count that controls when the node is released - confirm.
 * - `error_cost`, `node_count`, `dynamic_precedence` - NOTE(review):
 *    presumably running totals along the path to this node - confirm.
 */
struct s_stack_node
{
	t_state_id		   state;
	t_length		   position;
	t_stack_link	   links[MAX_LINK_COUNT];
	short unsigned int link_count;
	uint32_t		   ref_count;
	unsigned		   error_cost;
	unsigned		   node_count;
	int				   dynamic_precedence;
};

/*
 * t_stack_iterator - The state of one walk through the stack. A read-only
 * pointer to it is what a t_stack_callback receives.
 * - `node` - the stack node the walk is currently at.
 * - `subtrees` - an array of subtrees. NOTE(review): presumably those
 *    collected along the path walked so far - confirm.
 * - `subtree_count` - a separate 32-bit count. NOTE(review): how it differs
 *    from the size of `subtrees` is not visible here - confirm.
 * - `is_pending` - NOTE(review): meaning not visible from this header;
 *    confirm against the stack code.
 */
struct s_stack_iterator
{
	t_stack_node   *node;
	t_subtree_array subtrees;
	uint32_t		subtree_count;
	bool			is_pending;
};

/*
 * e_stack_status - Status stored in the `status` field of a t_stack_head.
 * The values are spelled out; they are the same 0, 1, 2 the compiler would
 * assign in declaration order.
 */
enum e_stack_status
{
	StackStatusActive = 0,
	StackStatusPaused = 1,
	StackStatusHalted = 2,
};

/*
 * t_stack_head - One element of the `heads` array of a t_stack.
 * - `node` - the stack node this head points at.
 * - `summary` - pointer to a stack summary. NOTE(review): presumably may be
 *    NULL when no summary has been recorded - confirm.
 * - `node_count_at_last_error` - an unsigned count.
 * - `last_external_token` - a subtree.
 * - `lookahead_when_paused` - a subtree. NOTE(review): presumably only
 *    meaningful while `status` is StackStatusPaused - confirm.
 * - `status` - one of the e_stack_status values.
 */
struct s_stack_head
{
	t_stack_node	*node;
	t_stack_summary *summary;
	unsigned		 node_count_at_last_error;
	t_subtree		 last_external_token;
	t_subtree		 lookahead_when_paused;
	t_stack_status	 status;
};

/*
 * t_stack - The parse stack as a whole.
 * - `heads` - an array of t_stack_head.
 * - `slices` - an array of stack slices.
 * - `iterators` - an array of t_stack_iterator. NOTE(review): presumably
 *    scratch storage reused between walks - confirm.
 * - `node_pool` - an array of pointers to t_stack_node. NOTE(review):
 *    presumably released nodes kept for reuse - confirm.
 * - `base_node` - pointer to a stack node. NOTE(review): presumably the
 *    bottom of the stack - confirm.
 * - `subtree_pool` - pointer to a subtree pool; the pool itself lives
 *    elsewhere (t_first_parser, for one, embeds a t_subtree_pool by value).
 */
struct s_stack
{
	Array(t_stack_head) heads;
	t_stack_slice_array slices;
	Array(t_stack_iterator) iterators;
	t_stack_node_array node_pool;
	t_stack_node	  *base_node;
	t_subtree_pool	  *subtree_pool;
};

/*
 * e_stack_action - Named values for t_stack_action, which is itself a plain
 * `unsigned`. All three values are written out explicitly.
 * NOTE(review): Stop and Pop occupy distinct bits (1 and 2), so they look
 * intended to be OR-ed together - confirm against the stack callbacks.
 */
enum e_stack_action
{
	StackActionNone = 0,
	StackActionStop = 1,
	StackActionPop = 2,
};

/*
 * t_summarize_stack_session - A stack-summary pointer paired with a depth
 * limit.
 * NOTE(review): presumably passed as the `void *` argument of a
 * t_stack_callback that fills in `summary` and stops once `max_depth` is
 * reached - confirm against the code that builds summaries.
 */
struct s_summarize_stack_session
{
	t_stack_summary *summary;
	unsigned		 max_depth;
};

/*
 * t_edit - An edit described by three t_length positions: where it starts,
 * where the replaced text used to end, and where the replacement text now
 * ends.
 */
struct s_edit
{
	t_length start;
	t_length old_end;
	t_length new_end;
};

/*
 * t_edit_entry - A pointer to a subtree paired with a t_edit.
 * NOTE(review): presumably a work-list item meaning "apply `edit` to
 * `*tree`" - confirm against the code that edits subtrees.
 */
struct s_edit_entry
{
	t_subtree *tree;
	t_edit	   edit;
};

/*
 * t_cursor_child_iterator - Iteration state for stepping through the
 * children of one subtree. It has the same fields as t_node_child_iterator
 * plus `descendant_index`.
 * - `parent` - the subtree being iterated (by value).
 * - `tree` - the tree the iteration belongs to; read-only.
 * - `position` - a t_length position.
 * - `child_index` - index of the current child.
 * - `structural_child_index` - a second child index. NOTE(review):
 *    presumably skips "extra" children so that it can be used to index
 *    `alias_sequence` - confirm.
 * - `descendant_index` - NOTE(review): presumably a running index over all
 *    descendants visited so far - confirm.
 * - `alias_sequence` - read-only array of symbols.
 */
struct s_cursor_child_iterator
{
	t_subtree			parent;
	const t_first_tree *tree;
	t_length			position;
	uint32_t			child_index;
	uint32_t			structural_child_index;
	uint32_t			descendant_index;
	const t_symbol	   *alias_sequence;
};

#endif // STRUCTS_H