diff --git a/shcat_c/parser/Makefile b/shcat_c/parser/Makefile index 968c5a19..0b5ed830 100644 --- a/shcat_c/parser/Makefile +++ b/shcat_c/parser/Makefile @@ -6,7 +6,7 @@ # By: maiboyer +#+ +:+ +#+ # # +#+#+#+#+#+ +#+ # # Created: 2023/11/03 13:20:01 by maiboyer #+# #+# # -# Updated: 2024/04/28 17:23:03 by maiboyer ### ########.fr # +# Updated: 2024/04/28 17:48:04 by maiboyer ### ########.fr # # # # **************************************************************************** # @@ -71,8 +71,8 @@ fclean: clean rm -f $(BUILD_DIR)/$(NAME) re: - $(MAKE) --no-print-directory fclean - $(MAKE) --no-print-directory all + @$(MAKE) --no-print-directory fclean + @$(MAKE) --no-print-directory all generate_filelist: @/usr/bin/env zsh -c "tree -iFf --noreport $(SRC_DIR) | rg '^$(SRC_DIR)/(.*)\.c\$$' --replace '\$$1' | sort -u" > ./source_files.list diff --git a/shcat_c/parser/src/array.h b/shcat_c/parser/src/array.h new file mode 100644 index 00000000..9319e790 --- /dev/null +++ b/shcat_c/parser/src/array.h @@ -0,0 +1,290 @@ +#ifndef TREE_SITTER_ARRAY_H_ +#define TREE_SITTER_ARRAY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + + + +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#pragma warning(disable : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#define Array(T) \ + struct { \ + T *contents; \ + uint32_t size; \ + uint32_t capacity; \ + } + +/// Initialize an array. +#define array_init(self) \ + ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL) + +/// Create an empty array. +#define array_new() \ + { NULL, 0, 0 } + +/// Get a pointer to the element at a given `index` in the array. +#define array_get(self, _index) \ + (assert((uint32_t)(_index) < (self)->size), &(self)->contents[_index]) + +/// Get a pointer to the first element in the array. +#define array_front(self) array_get(self, 0) + +/// Get a pointer to the last element in the array. 
#define array_back(self) array_get(self, (self)->size - 1)

/// Clear the array, setting its size to zero. Note that this does not free
/// the array's buffer; the capacity is left unchanged.
#define array_clear(self) ((self)->size = 0)

/// Reserve `new_capacity` elements of space in the array. If `new_capacity` is
/// less than or equal to the array's current capacity, this has no effect.
#define array_reserve(self, new_capacity) \
  _array__reserve((Array *)(self), array_elem_size(self), new_capacity)

/// Free the buffer owned by this array and reset it to the empty state. Note
/// that this does not release anything the individual elements may point to.
#define array_delete(self) _array__delete((Array *)(self))

/// Push a new `element` onto the end of the array.
#define array_push(self, element) \
  (_array__grow((Array *)(self), 1, array_elem_size(self)), \
   (self)->contents[(self)->size++] = (element))

/// Increase the array's size by `count` elements.
/// New elements are zero-initialized.
///
/// `count` is evaluated exactly once, so an argument with side effects
/// (e.g. `n++`) is safe to pass.
#define array_grow_by(self, count) \
  do { \
    uint32_t _array_grow_count = (uint32_t)(count); \
    if (_array_grow_count == 0) break; \
    _array__grow((Array *)(self), _array_grow_count, array_elem_size(self)); \
    memset((self)->contents + (self)->size, 0, _array_grow_count * array_elem_size(self)); \
    (self)->size += _array_grow_count; \
  } while (0)

/// Append all elements from one array to the end of another.
#define array_push_all(self, other) \
  array_extend((self), (other)->size, (other)->contents)

/// Append `count` elements to the end of the array, reading their values from the
/// `contents` pointer.
#define array_extend(self, count, contents) \
  _array__splice( \
    (Array *)(self), array_elem_size(self), (self)->size, \
    0, count, contents \
  )

/// Remove `old_count` elements from the array starting at the given `index`. At
/// the same index, insert `new_count` new elements, reading their values from the
/// `new_contents` pointer.
#define array_splice(self, _index, old_count, new_count, new_contents) \
  _array__splice( \
    (Array *)(self), array_elem_size(self), _index, \
    old_count, new_count, new_contents \
  )

/// Insert one `element` into the array at the given `index`.
/// `element` must be an lvalue, because its address is taken.
#define array_insert(self, _index, element) \
  _array__splice((Array *)(self), array_elem_size(self), _index, 0, 1, &(element))

/// Remove one element from the array at the given `index`.
#define array_erase(self, _index) \
  _array__erase((Array *)(self), array_elem_size(self), _index)

/// Pop the last element off the array, returning the element by value.
/// The array must not be empty; like `array_get`, this is checked with
/// `assert`. The expansion dereferences a pointer so that it remains an
/// lvalue, exactly as the unchecked form was.
#define array_pop(self) \
  (*(assert((self)->size > 0), &(self)->contents[--(self)->size]))

/// Assign the contents of one array to another, reallocating if necessary.
#define array_assign(self, other) \
  _array__assign((Array *)(self), (const Array *)(other), array_elem_size(self))

/// Swap one array with another
#define array_swap(self, other) \
  _array__swap((Array *)(self), (Array *)(other))

/// Get the size, in bytes, of a single element of the array.
#define array_elem_size(self) (sizeof *(self)->contents)

/// Search a sorted array for a given `needle` value, using the given `compare`
/// callback to determine the order.
///
/// If an existing element is found to be equal to `needle`, then the `index`
/// out-parameter is set to the existing value's index, and the `exists`
/// out-parameter is set to true. Otherwise, `index` is set to an index where
/// `needle` should be inserted in order to preserve the sorting, and `exists`
/// is set to false.
#define array_search_sorted_with(self, compare, needle, _index, _exists) \
  _array__search_sorted(self, 0, compare, , needle, _index, _exists)

/// Search a sorted array for a given `needle` value, using integer comparisons
/// of a given struct field (specified with a leading dot) to determine the order.
///
/// See also `array_search_sorted_with`.
+#define array_search_sorted_by(self, field, needle, _index, _exists) \ + _array__search_sorted(self, 0, _compare_int, field, needle, _index, _exists) + +/// Insert a given `value` into a sorted array, using the given `compare` +/// callback to determine the order. +#define array_insert_sorted_with(self, compare, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_with(self, compare, &(value), &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +/// Insert a given `value` into a sorted array, using integer comparisons of +/// a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_by`. +#define array_insert_sorted_by(self, field, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_by(self, field, (value) field, &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +// Private + +typedef Array(void) Array; + +/// This is not what you're looking for, see `array_delete`. +static inline void _array__delete(Array *self) { + if (self->contents) { + free(self->contents); + self->contents = NULL; + self->size = 0; + self->capacity = 0; + } +} + +/// This is not what you're looking for, see `array_erase`. +static inline void _array__erase(Array *self, size_t element_size, + uint32_t index) { + assert(index < self->size); + char *contents = (char *)self->contents; + memmove(contents + index * element_size, contents + (index + 1) * element_size, + (self->size - index - 1) * element_size); + self->size--; +} + +/// This is not what you're looking for, see `array_reserve`. 
+static inline void _array__reserve(Array *self, size_t element_size, uint32_t new_capacity) { + if (new_capacity > self->capacity) { + if (self->contents) { + self->contents = realloc(self->contents, new_capacity * element_size); + } else { + self->contents = malloc(new_capacity * element_size); + } + self->capacity = new_capacity; + } +} + +/// This is not what you're looking for, see `array_assign`. +static inline void _array__assign(Array *self, const Array *other, size_t element_size) { + _array__reserve(self, element_size, other->size); + self->size = other->size; + memcpy(self->contents, other->contents, self->size * element_size); +} + +/// This is not what you're looking for, see `array_swap`. +static inline void _array__swap(Array *self, Array *other) { + Array swap = *other; + *other = *self; + *self = swap; +} + +/// This is not what you're looking for, see `array_push` or `array_grow_by`. +static inline void _array__grow(Array *self, uint32_t count, size_t element_size) { + uint32_t new_size = self->size + count; + if (new_size > self->capacity) { + uint32_t new_capacity = self->capacity * 2; + if (new_capacity < 8) new_capacity = 8; + if (new_capacity < new_size) new_capacity = new_size; + _array__reserve(self, element_size, new_capacity); + } +} + +/// This is not what you're looking for, see `array_splice`. 
+static inline void _array__splice(Array *self, size_t element_size, + uint32_t index, uint32_t old_count, + uint32_t new_count, const void *elements) { + uint32_t new_size = self->size + new_count - old_count; + uint32_t old_end = index + old_count; + uint32_t new_end = index + new_count; + assert(old_end <= self->size); + + _array__reserve(self, element_size, new_size); + + char *contents = (char *)self->contents; + if (self->size > old_end) { + memmove( + contents + new_end * element_size, + contents + old_end * element_size, + (self->size - old_end) * element_size + ); + } + if (new_count > 0) { + if (elements) { + memcpy( + (contents + index * element_size), + elements, + new_count * element_size + ); + } else { + memset( + (contents + index * element_size), + 0, + new_count * element_size + ); + } + } + self->size += new_count - old_count; +} + +/// A binary search routine, based on Rust's `std::slice::binary_search_by`. +/// This is not what you're looking for, see `array_search_sorted_with` or `array_search_sorted_by`. +#define _array__search_sorted(self, start, compare, suffix, needle, _index, _exists) \ + do { \ + *(_index) = start; \ + *(_exists) = false; \ + uint32_t size = (self)->size - *(_index); \ + if (size == 0) break; \ + int comparison; \ + while (size > 1) { \ + uint32_t half_size = size / 2; \ + uint32_t mid_index = *(_index) + half_size; \ + comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \ + if (comparison <= 0) *(_index) = mid_index; \ + size -= half_size; \ + } \ + comparison = compare(&((self)->contents[*(_index)] suffix), (needle)); \ + if (comparison == 0) *(_exists) = true; \ + else if (comparison < 0) *(_index) += 1; \ + } while (0) + +/// Helper macro for the `_sorted_by` routines below. This takes the left (existing) +/// parameter by reference in order to work with the generic sorting function above. 
+#define _compare_int(a, b) ((int)*(a) - (int)(b)) + +#ifdef _MSC_VER +#pragma warning(default : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ARRAY_H_ diff --git a/shcat_c/parser/src/error_costs.h b/shcat_c/parser/src/error_costs.h new file mode 100644 index 00000000..32d3666a --- /dev/null +++ b/shcat_c/parser/src/error_costs.h @@ -0,0 +1,11 @@ +#ifndef TREE_SITTER_ERROR_COSTS_H_ +#define TREE_SITTER_ERROR_COSTS_H_ + +#define ERROR_STATE 0 +#define ERROR_COST_PER_RECOVERY 500 +#define ERROR_COST_PER_MISSING_TREE 110 +#define ERROR_COST_PER_SKIPPED_TREE 100 +#define ERROR_COST_PER_SKIPPED_LINE 30 +#define ERROR_COST_PER_SKIPPED_CHAR 1 + +#endif diff --git a/shcat_c/parser/src/host.h b/shcat_c/parser/src/host.h new file mode 100644 index 00000000..a07e9f89 --- /dev/null +++ b/shcat_c/parser/src/host.h @@ -0,0 +1,21 @@ + +// Determine endian and pointer size based on known defines. +// TS_BIG_ENDIAN and TS_PTR_SIZE can be set as -D compiler arguments +// to override this. 
+ +#if !defined(TS_BIG_ENDIAN) +#if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) \ + || (defined( __APPLE_CC__) && (defined(__ppc__) || defined(__ppc64__))) +#define TS_BIG_ENDIAN 1 +#else +#define TS_BIG_ENDIAN 0 +#endif +#endif + +#if !defined(TS_PTR_SIZE) +#if UINTPTR_MAX == 0xFFFFFFFF +#define TS_PTR_SIZE 32 +#else +#define TS_PTR_SIZE 64 +#endif +#endif diff --git a/shcat_c/parser/src/language.c b/shcat_c/parser/src/language.c new file mode 100644 index 00000000..cb87c043 --- /dev/null +++ b/shcat_c/parser/src/language.c @@ -0,0 +1,214 @@ +#include "./language.h" +#include "tree_sitter/api.h" +#include + +const TSLanguage *ts_language_copy(const TSLanguage *self) { + return self; +} + +void ts_language_delete(const TSLanguage *self) { +} + +uint32_t ts_language_symbol_count(const TSLanguage *self) { + return self->symbol_count + self->alias_count; +} + +uint32_t ts_language_state_count(const TSLanguage *self) { + return self->state_count; +} + +uint32_t ts_language_version(const TSLanguage *self) { + return self->version; +} + +uint32_t ts_language_field_count(const TSLanguage *self) { + return self->field_count; +} + +void ts_language_table_entry( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol, + TableEntry *result +) { + if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { + result->action_count = 0; + result->is_reusable = false; + result->actions = NULL; + } else { + assert(symbol < self->token_count); + uint32_t action_index = ts_language_lookup(self, state, symbol); + const TSParseActionEntry *entry = &self->parse_actions[action_index]; + result->action_count = entry->entry.count; + result->is_reusable = entry->entry.reusable; + result->actions = (const TSParseAction *)(entry + 1); + } +} + +TSSymbolMetadata ts_language_symbol_metadata( + const TSLanguage *self, + TSSymbol symbol +) { + if (symbol == ts_builtin_sym_error) { + return (TSSymbolMetadata) {.visible = true, .named = true}; + } 
else if (symbol == ts_builtin_sym_error_repeat) { + return (TSSymbolMetadata) {.visible = false, .named = false}; + } else { + return self->symbol_metadata[symbol]; + } +} + +TSSymbol ts_language_public_symbol( + const TSLanguage *self, + TSSymbol symbol +) { + if (symbol == ts_builtin_sym_error) return symbol; + return self->public_symbol_map[symbol]; +} + +TSStateId ts_language_next_state( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { + if (symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat) { + return 0; + } else if (symbol < self->token_count) { + uint32_t count; + const TSParseAction *actions = ts_language_actions(self, state, symbol, &count); + if (count > 0) { + TSParseAction action = actions[count - 1]; + if (action.type == TSParseActionTypeShift) { + return action.shift.extra ? state : action.shift.state; + } + } + return 0; + } else { + return ts_language_lookup(self, state, symbol); + } +} + +const char *ts_language_symbol_name( + const TSLanguage *self, + TSSymbol symbol +) { + if (symbol == ts_builtin_sym_error) { + return "ERROR"; + } else if (symbol == ts_builtin_sym_error_repeat) { + return "_ERROR"; + } else if (symbol < ts_language_symbol_count(self)) { + return self->symbol_names[symbol]; + } else { + return NULL; + } +} + +TSSymbol ts_language_symbol_for_name( + const TSLanguage *self, + const char *string, + uint32_t length, + bool is_named +) { + if (!strncmp(string, "ERROR", length)) return ts_builtin_sym_error; + uint16_t count = (uint16_t)ts_language_symbol_count(self); + for (TSSymbol i = 0; i < count; i++) { + TSSymbolMetadata metadata = ts_language_symbol_metadata(self, i); + if ((!metadata.visible && !metadata.supertype) || metadata.named != is_named) continue; + const char *symbol_name = self->symbol_names[i]; + if (!strncmp(symbol_name, string, length) && !symbol_name[length]) { + return self->public_symbol_map[i]; + } + } + return 0; +} + +TSSymbolType ts_language_symbol_type( + const 
TSLanguage *self, + TSSymbol symbol +) { + TSSymbolMetadata metadata = ts_language_symbol_metadata(self, symbol); + if (metadata.named && metadata.visible) { + return TSSymbolTypeRegular; + } else if (metadata.visible) { + return TSSymbolTypeAnonymous; + } else { + return TSSymbolTypeAuxiliary; + } +} + +const char *ts_language_field_name_for_id( + const TSLanguage *self, + TSFieldId id +) { + uint32_t count = ts_language_field_count(self); + if (count && id <= count) { + return self->field_names[id]; + } else { + return NULL; + } +} + +TSFieldId ts_language_field_id_for_name( + const TSLanguage *self, + const char *name, + uint32_t name_length +) { + uint16_t count = (uint16_t)ts_language_field_count(self); + for (TSSymbol i = 1; i < count + 1; i++) { + switch (strncmp(name, self->field_names[i], name_length)) { + case 0: + if (self->field_names[i][name_length] == 0) return i; + break; + case -1: + return 0; + default: + break; + } + } + return 0; +} + +TSLookaheadIterator *ts_lookahead_iterator_new(const TSLanguage *self, TSStateId state) { + if (state >= self->state_count) return NULL; + LookaheadIterator *iterator = malloc(sizeof(LookaheadIterator)); + *iterator = ts_language_lookaheads(self, state); + return (TSLookaheadIterator *)iterator; +} + +void ts_lookahead_iterator_delete(TSLookaheadIterator *self) { + free(self); +} + +bool ts_lookahead_iterator_reset_state(TSLookaheadIterator * self, TSStateId state) { + LookaheadIterator *iterator = (LookaheadIterator *)self; + if (state >= iterator->language->state_count) return false; + *iterator = ts_language_lookaheads(iterator->language, state); + return true; +} + +const TSLanguage *ts_lookahead_iterator_language(const TSLookaheadIterator *self) { + const LookaheadIterator *iterator = (const LookaheadIterator *)self; + return iterator->language; +} + +bool ts_lookahead_iterator_reset(TSLookaheadIterator *self, const TSLanguage *language, TSStateId state) { + if (state >= language->state_count) return false; + 
LookaheadIterator *iterator = (LookaheadIterator *)self; + *iterator = ts_language_lookaheads(language, state); + return true; +} + +bool ts_lookahead_iterator_next(TSLookaheadIterator *self) { + LookaheadIterator *iterator = (LookaheadIterator *)self; + return ts_lookahead_iterator__next(iterator); +} + +TSSymbol ts_lookahead_iterator_current_symbol(const TSLookaheadIterator *self) { + const LookaheadIterator *iterator = (const LookaheadIterator *)self; + return iterator->symbol; +} + +const char *ts_lookahead_iterator_current_symbol_name(const TSLookaheadIterator *self) { + const LookaheadIterator *iterator = (const LookaheadIterator *)self; + return ts_language_symbol_name(iterator->language, iterator->symbol); +} diff --git a/shcat_c/parser/src/language.h b/shcat_c/parser/src/language.h new file mode 100644 index 00000000..4e2769b4 --- /dev/null +++ b/shcat_c/parser/src/language.h @@ -0,0 +1,299 @@ +#ifndef TREE_SITTER_LANGUAGE_H_ +#define TREE_SITTER_LANGUAGE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./subtree.h" +#include "./parser.h" + +#define ts_builtin_sym_error_repeat (ts_builtin_sym_error - 1) + +#define LANGUAGE_VERSION_WITH_PRIMARY_STATES 14 +#define LANGUAGE_VERSION_USABLE_VIA_WASM 13 + +typedef struct { + const TSParseAction *actions; + uint32_t action_count; + bool is_reusable; +} TableEntry; + +typedef struct { + const TSLanguage *language; + const uint16_t *data; + const uint16_t *group_end; + TSStateId state; + uint16_t table_value; + uint16_t section_index; + uint16_t group_count; + bool is_small_state; + + const TSParseAction *actions; + TSSymbol symbol; + TSStateId next_state; + uint16_t action_count; +} LookaheadIterator; + +void ts_language_table_entry(const TSLanguage *, TSStateId, TSSymbol, TableEntry *); + +TSSymbolMetadata ts_language_symbol_metadata(const TSLanguage *, TSSymbol); + +TSSymbol ts_language_public_symbol(const TSLanguage *, TSSymbol); + +TSStateId ts_language_next_state(const TSLanguage *self, TSStateId 
state, TSSymbol symbol); + +static inline bool ts_language_is_symbol_external(const TSLanguage *self, TSSymbol symbol) { + return 0 < symbol && symbol < self->external_token_count + 1; +} + +static inline const TSParseAction *ts_language_actions( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol, + uint32_t *count +) { + TableEntry entry; + ts_language_table_entry(self, state, symbol, &entry); + *count = entry.action_count; + return entry.actions; +} + +static inline bool ts_language_has_reduce_action( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { + TableEntry entry; + ts_language_table_entry(self, state, symbol, &entry); + return entry.action_count > 0 && entry.actions[0].type == TSParseActionTypeReduce; +} + +// Lookup the table value for a given symbol and state. +// +// For non-terminal symbols, the table value represents a successor state. +// For terminal symbols, it represents an index in the actions table. +// For 'large' parse states, this is a direct lookup. For 'small' parse +// states, this requires searching through the symbol groups to find +// the given symbol. 
+static inline uint16_t ts_language_lookup( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { + if (state >= self->large_state_count) { + uint32_t index = self->small_parse_table_map[state - self->large_state_count]; + const uint16_t *data = &self->small_parse_table[index]; + uint16_t group_count = *(data++); + for (unsigned i = 0; i < group_count; i++) { + uint16_t section_value = *(data++); + uint16_t symbol_count = *(data++); + for (unsigned j = 0; j < symbol_count; j++) { + if (*(data++) == symbol) return section_value; + } + } + return 0; + } else { + return self->parse_table[state * self->symbol_count + symbol]; + } +} + +static inline bool ts_language_has_actions( + const TSLanguage *self, + TSStateId state, + TSSymbol symbol +) { + return ts_language_lookup(self, state, symbol) != 0; +} + +// Iterate over all of the symbols that are valid in the given state. +// +// For 'large' parse states, this just requires iterating through +// all possible symbols and checking the parse table for each one. +// For 'small' parse states, this exploits the structure of the +// table to only visit the valid symbols. 
+static inline LookaheadIterator ts_language_lookaheads( + const TSLanguage *self, + TSStateId state +) { + bool is_small_state = state >= self->large_state_count; + const uint16_t *data; + const uint16_t *group_end = NULL; + uint16_t group_count = 0; + if (is_small_state) { + uint32_t index = self->small_parse_table_map[state - self->large_state_count]; + data = &self->small_parse_table[index]; + group_end = data + 1; + group_count = *data; + } else { + data = &self->parse_table[state * self->symbol_count] - 1; + } + return (LookaheadIterator) { + .language = self, + .data = data, + .group_end = group_end, + .group_count = group_count, + .is_small_state = is_small_state, + .symbol = UINT16_MAX, + .next_state = 0, + }; +} + +static inline bool ts_lookahead_iterator__next(LookaheadIterator *self) { + // For small parse states, valid symbols are listed explicitly, + // grouped by their value. There's no need to look up the actions + // again until moving to the next group. + if (self->is_small_state) { + self->data++; + if (self->data == self->group_end) { + if (self->group_count == 0) return false; + self->group_count--; + self->table_value = *(self->data++); + unsigned symbol_count = *(self->data++); + self->group_end = self->data + symbol_count; + self->symbol = *self->data; + } else { + self->symbol = *self->data; + return true; + } + } + + // For large parse states, iterate through every symbol until one + // is found that has valid actions. + else { + do { + self->data++; + self->symbol++; + if (self->symbol >= self->language->symbol_count) return false; + self->table_value = *self->data; + } while (!self->table_value); + } + + // Depending on if the symbols is terminal or non-terminal, the table value either + // represents a list of actions or a successor state. 
+ if (self->symbol < self->language->token_count) { + const TSParseActionEntry *entry = &self->language->parse_actions[self->table_value]; + self->action_count = entry->entry.count; + self->actions = (const TSParseAction *)(entry + 1); + self->next_state = 0; + } else { + self->action_count = 0; + self->next_state = self->table_value; + } + return true; +} + +// Whether the state is a "primary state". If this returns false, it indicates that there exists +// another state that behaves identically to this one with respect to query analysis. +static inline bool ts_language_state_is_primary( + const TSLanguage *self, + TSStateId state +) { + if (self->version >= LANGUAGE_VERSION_WITH_PRIMARY_STATES) { + return state == self->primary_state_ids[state]; + } else { + return true; + } +} + +static inline const bool *ts_language_enabled_external_tokens( + const TSLanguage *self, + unsigned external_scanner_state +) { + if (external_scanner_state == 0) { + return NULL; + } else { + return self->external_scanner.states + self->external_token_count * external_scanner_state; + } +} + +static inline const TSSymbol *ts_language_alias_sequence( + const TSLanguage *self, + uint32_t production_id +) { + return production_id ? + &self->alias_sequences[production_id * self->max_alias_sequence_length] : + NULL; +} + +static inline TSSymbol ts_language_alias_at( + const TSLanguage *self, + uint32_t production_id, + uint32_t child_index +) { + return production_id ? 
+ self->alias_sequences[production_id * self->max_alias_sequence_length + child_index] : + 0; +} + +static inline void ts_language_field_map( + const TSLanguage *self, + uint32_t production_id, + const TSFieldMapEntry **start, + const TSFieldMapEntry **end +) { + if (self->field_count == 0) { + *start = NULL; + *end = NULL; + return; + } + + TSFieldMapSlice slice = self->field_map_slices[production_id]; + *start = &self->field_map_entries[slice.index]; + *end = &self->field_map_entries[slice.index] + slice.length; +} + +static inline void ts_language_aliases_for_symbol( + const TSLanguage *self, + TSSymbol original_symbol, + const TSSymbol **start, + const TSSymbol **end +) { + *start = &self->public_symbol_map[original_symbol]; + *end = *start + 1; + + unsigned idx = 0; + for (;;) { + TSSymbol symbol = self->alias_map[idx++]; + if (symbol == 0 || symbol > original_symbol) break; + uint16_t count = self->alias_map[idx++]; + if (symbol == original_symbol) { + *start = &self->alias_map[idx]; + *end = &self->alias_map[idx + count]; + break; + } + idx += count; + } +} + +static inline void ts_language_write_symbol_as_dot_string( + const TSLanguage *self, + FILE *f, + TSSymbol symbol +) { + const char *name = ts_language_symbol_name(self, symbol); + for (const char *chr = name; *chr; chr++) { + switch (*chr) { + case '"': + case '\\': + fputc('\\', f); + fputc(*chr, f); + break; + case '\n': + fputs("\\n", f); + break; + case '\t': + fputs("\\t", f); + break; + default: + fputc(*chr, f); + break; + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_LANGUAGE_H_ diff --git a/shcat_c/parser/src/length.h b/shcat_c/parser/src/length.h new file mode 100644 index 00000000..42d61ef3 --- /dev/null +++ b/shcat_c/parser/src/length.h @@ -0,0 +1,52 @@ +#ifndef TREE_SITTER_LENGTH_H_ +#define TREE_SITTER_LENGTH_H_ + +#include +#include +#include "./point.h" +#include "tree_sitter/api.h" + +typedef struct { + uint32_t bytes; + TSPoint extent; +} Length; + +static const 
Length LENGTH_UNDEFINED = {0, {0, 1}}; +static const Length LENGTH_MAX = {UINT32_MAX, {UINT32_MAX, UINT32_MAX}}; + +static inline bool length_is_undefined(Length length) { + return length.bytes == 0 && length.extent.column != 0; +} + +static inline Length length_min(Length len1, Length len2) { + return (len1.bytes < len2.bytes) ? len1 : len2; +} + +static inline Length length_add(Length len1, Length len2) { + Length result; + result.bytes = len1.bytes + len2.bytes; + result.extent = point_add(len1.extent, len2.extent); + return result; +} + +static inline Length length_sub(Length len1, Length len2) { + Length result; + result.bytes = len1.bytes - len2.bytes; + result.extent = point_sub(len1.extent, len2.extent); + return result; +} + +static inline Length length_zero(void) { + Length result = {0, {0, 0}}; + return result; +} + +static inline Length length_saturating_sub(Length len1, Length len2) { + if (len1.bytes > len2.bytes) { + return length_sub(len1, len2); + } else { + return length_zero(); + } +} + +#endif diff --git a/shcat_c/parser/src/lexer.c b/shcat_c/parser/src/lexer.c new file mode 100644 index 00000000..606fe53a --- /dev/null +++ b/shcat_c/parser/src/lexer.c @@ -0,0 +1,455 @@ +#include "./lexer.h" +#include "./length.h" +#include "./subtree.h" +#include +#include + +#define LOG(message, character) \ + if (self->logger.log) \ + { \ + snprintf(self->debug_buffer, TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \ + 32 <= character && character < 127 ? message \ + " character:'%c'" \ + : message " character:%d", \ + character); \ + self->logger.log(self->logger.payload, TSLogTypeLex, \ + self->debug_buffer); \ + } + +static const int32_t BYTE_ORDER_MARK = 0xFEFF; + +static const TSRange DEFAULT_RANGE = {.start_point = + { + .row = 0, + .column = 0, + }, + .end_point = + { + .row = UINT32_MAX, + .column = UINT32_MAX, + }, + .start_byte = 0, + .end_byte = UINT32_MAX}; + +// Check if the lexer has reached EOF. 
This state is stored +// by setting the lexer's `current_included_range_index` such that +// it has consumed all of its available ranges. +static bool ts_lexer__eof(const TSLexer *_self) +{ + Lexer *self = (Lexer *)_self; + return self->current_included_range_index == self->included_range_count; +} + +// Clear the currently stored chunk of source code, because the lexer's +// position has changed. +static void ts_lexer__clear_chunk(Lexer *self) +{ + self->chunk = NULL; + self->chunk_size = 0; + self->chunk_start = 0; +} + +// Call the lexer's input callback to obtain a new chunk of source code +// for the current position. +static void ts_lexer__get_chunk(Lexer *self) +{ + self->chunk_start = self->current_position.bytes; + self->chunk = + self->input.read(self->input.payload, self->current_position.bytes, + self->current_position.extent, &self->chunk_size); + if (!self->chunk_size) + { + self->current_included_range_index = self->included_range_count; + self->chunk = NULL; + } +} + +typedef uint32_t (*UnicodeDecodeFunction)(const uint8_t *chunk, uint32_t size, + int32_t *lookahead); + +uint32_t my_decode(const uint8_t *chunk, uint32_t size, int32_t *lookahead) +{ + *((uint32_t *)lookahead) = *chunk; + return (1); +} + +#define TS_DECODE_ERROR -1 + +// Decode the next unicode character in the current chunk of source code. +// This assumes that the lexer has already retrieved a chunk of source +// code that spans the current position. 
+static void ts_lexer__get_lookahead(Lexer *self) +{ + uint32_t position_in_chunk = + self->current_position.bytes - self->chunk_start; + uint32_t size = self->chunk_size - position_in_chunk; + + if (size == 0) + { + self->lookahead_size = 1; + self->data.lookahead = '\0'; + return; + } + + const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk; + UnicodeDecodeFunction decode = my_decode; + + self->lookahead_size = decode(chunk, size, &self->data.lookahead); + + // If this chunk ended in the middle of a multi-byte character, + // try again with a fresh chunk. + if (self->data.lookahead == TS_DECODE_ERROR && size < 4) + { + ts_lexer__get_chunk(self); + chunk = (const uint8_t *)self->chunk; + size = self->chunk_size; + self->lookahead_size = decode(chunk, size, &self->data.lookahead); + } + + if (self->data.lookahead == TS_DECODE_ERROR) + { + self->lookahead_size = 1; + } +} + +static void ts_lexer_goto(Lexer *self, Length position) +{ + self->current_position = position; + + // Move to the first valid position at or after the given position. + bool found_included_range = false; + for (unsigned i = 0; i < self->included_range_count; i++) + { + TSRange *included_range = &self->included_ranges[i]; + if (included_range->end_byte > self->current_position.bytes && + included_range->end_byte > included_range->start_byte) + { + if (included_range->start_byte >= self->current_position.bytes) + { + self->current_position = (Length){ + .bytes = included_range->start_byte, + .extent = included_range->start_point, + }; + } + + self->current_included_range_index = i; + found_included_range = true; + break; + } + } + + if (found_included_range) + { + // If the current position is outside of the current chunk of text, + // then clear out the current chunk of text. 
+ if (self->chunk && (self->current_position.bytes < self->chunk_start || + self->current_position.bytes >= + self->chunk_start + self->chunk_size)) + { + ts_lexer__clear_chunk(self); + } + + self->lookahead_size = 0; + self->data.lookahead = '\0'; + } + + // If the given position is beyond any of included ranges, move to the EOF + // state - past the end of the included ranges. + else + { + self->current_included_range_index = self->included_range_count; + TSRange *last_included_range = + &self->included_ranges[self->included_range_count - 1]; + self->current_position = (Length){ + .bytes = last_included_range->end_byte, + .extent = last_included_range->end_point, + }; + ts_lexer__clear_chunk(self); + self->lookahead_size = 1; + self->data.lookahead = '\0'; + } +} + +// Intended to be called only from functions that control logging. +static void ts_lexer__do_advance(Lexer *self, bool skip) +{ + if (self->lookahead_size) + { + self->current_position.bytes += self->lookahead_size; + if (self->data.lookahead == '\n') + { + self->current_position.extent.row++; + self->current_position.extent.column = 0; + } + else + { + self->current_position.extent.column += self->lookahead_size; + } + } + + const TSRange *current_range = + &self->included_ranges[self->current_included_range_index]; + while (self->current_position.bytes >= current_range->end_byte || + current_range->end_byte == current_range->start_byte) + { + if (self->current_included_range_index < self->included_range_count) + { + self->current_included_range_index++; + } + if (self->current_included_range_index < self->included_range_count) + { + current_range++; + self->current_position = (Length){ + current_range->start_byte, + current_range->start_point, + }; + } + else + { + current_range = NULL; + break; + } + } + + if (skip) + self->token_start_position = self->current_position; + + if (current_range) + { + if (self->current_position.bytes < self->chunk_start || + self->current_position.bytes >= + 
self->chunk_start + self->chunk_size) + { + ts_lexer__get_chunk(self); + } + ts_lexer__get_lookahead(self); + } + else + { + ts_lexer__clear_chunk(self); + self->data.lookahead = '\0'; + self->lookahead_size = 1; + } +} + +// Advance to the next character in the source code, retrieving a new +// chunk of source code if needed. +static void ts_lexer__advance(TSLexer *_self, bool skip) +{ + Lexer *self = (Lexer *)_self; + if (!self->chunk) + return; + ts_lexer__do_advance(self, skip); +} + +// Mark that a token match has completed. This can be called multiple +// times if a longer match is found later. +static void ts_lexer__mark_end(TSLexer *_self) +{ + Lexer *self = (Lexer *)_self; + if (!ts_lexer__eof(&self->data)) + { + // If the lexer is right at the beginning of included range, + // then the token should be considered to end at the *end* of the + // previous included range, rather than here. + TSRange *current_included_range = + &self->included_ranges[self->current_included_range_index]; + if (self->current_included_range_index > 0 && + self->current_position.bytes == current_included_range->start_byte) + { + TSRange *previous_included_range = current_included_range - 1; + self->token_end_position = (Length){ + previous_included_range->end_byte, + previous_included_range->end_point, + }; + return; + } + } + self->token_end_position = self->current_position; +} + +static uint32_t ts_lexer__get_column(TSLexer *_self) +{ + Lexer *self = (Lexer *)_self; + + uint32_t goal_byte = self->current_position.bytes; + + self->did_get_column = true; + self->current_position.bytes -= self->current_position.extent.column; + self->current_position.extent.column = 0; + + if (self->current_position.bytes < self->chunk_start) + { + ts_lexer__get_chunk(self); + } + + uint32_t result = 0; + if (!ts_lexer__eof(_self)) + { + ts_lexer__get_lookahead(self); + while (self->current_position.bytes < goal_byte && self->chunk) + { + result++; + ts_lexer__do_advance(self, false); + if 
(ts_lexer__eof(_self)) + break; + } + } + + return result; +} + +// Is the lexer at a boundary between two disjoint included ranges of +// source code? This is exposed as an API because some languages' external +// scanners need to perform custom actions at these boundaries. +static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) +{ + const Lexer *self = (const Lexer *)_self; + if (self->current_included_range_index < self->included_range_count) + { + TSRange *current_range = + &self->included_ranges[self->current_included_range_index]; + return self->current_position.bytes == current_range->start_byte; + } + else + { + return false; + } +} + +void ts_lexer_init(Lexer *self) +{ + *self = (Lexer){ + .data = + { + // The lexer's methods are stored as struct fields so that + // generated + // parsers can call them without needing to be linked against + // this + // library. + .advance = ts_lexer__advance, + .mark_end = ts_lexer__mark_end, + .get_column = ts_lexer__get_column, + .is_at_included_range_start = + ts_lexer__is_at_included_range_start, + .eof = ts_lexer__eof, + .lookahead = 0, + .result_symbol = 0, + }, + .chunk = NULL, + .chunk_size = 0, + .chunk_start = 0, + .current_position = {0, {0, 0}}, + .logger = {.payload = NULL, .log = NULL}, + .included_ranges = NULL, + .included_range_count = 0, + .current_included_range_index = 0, + }; + ts_lexer_set_included_ranges(self, NULL, 0); +} + +void ts_lexer_delete(Lexer *self) +{ + free(self->included_ranges); +} + +void ts_lexer_set_input(Lexer *self, TSInput input) +{ + self->input = input; + ts_lexer__clear_chunk(self); + ts_lexer_goto(self, self->current_position); +} + +// Move the lexer to the given position. This doesn't do any work +// if the parser is already at the given position. 
+void ts_lexer_reset(Lexer *self, Length position) +{ + if (position.bytes != self->current_position.bytes) + { + ts_lexer_goto(self, position); + } +} + +void ts_lexer_start(Lexer *self) +{ + self->token_start_position = self->current_position; + self->token_end_position = LENGTH_UNDEFINED; + self->data.result_symbol = 0; + self->did_get_column = false; + if (!ts_lexer__eof(&self->data)) + { + if (!self->chunk_size) + ts_lexer__get_chunk(self); + if (!self->lookahead_size) + ts_lexer__get_lookahead(self); + if (self->current_position.bytes == 0 && + self->data.lookahead == BYTE_ORDER_MARK) + ts_lexer__advance(&self->data, true); + } +} + +void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) +{ + if (length_is_undefined(self->token_end_position)) + { + ts_lexer__mark_end(&self->data); + } + + // If the token ended at an included range boundary, then its end position + // will have been reset to the end of the preceding range. Reset the start + // position to match. + if (self->token_end_position.bytes < self->token_start_position.bytes) + { + self->token_start_position = self->token_end_position; + } + + uint32_t current_lookahead_end_byte = self->current_position.bytes + 1; + + // In order to determine that a byte sequence is invalid UTF8 or UTF16, + // the character decoding algorithm may have looked at the following byte. + // Therefore, the next byte *after* the current (invalid) character + // affects the interpretation of the current character. 
+ if (self->data.lookahead == TS_DECODE_ERROR) + { + current_lookahead_end_byte++; + } + + if (current_lookahead_end_byte > *lookahead_end_byte) + { + *lookahead_end_byte = current_lookahead_end_byte; + } +} + +void ts_lexer_advance_to_end(Lexer *self) +{ + while (self->chunk) + { + ts_lexer__advance(&self->data, false); + } +} + +void ts_lexer_mark_end(Lexer *self) +{ + ts_lexer__mark_end(&self->data); +} + +bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, + uint32_t count) +{ + ranges = &DEFAULT_RANGE; + count = 1; + size_t size = count * sizeof(TSRange); + self->included_ranges = realloc(self->included_ranges, size); + memcpy(self->included_ranges, ranges, size); + self->included_range_count = count; + ts_lexer_goto(self, self->current_position); + return true; +} + +TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) +{ + *count = self->included_range_count; + return self->included_ranges; +} + +#undef LOG diff --git a/shcat_c/parser/src/lexer.h b/shcat_c/parser/src/lexer.h new file mode 100644 index 00000000..445c4fdc --- /dev/null +++ b/shcat_c/parser/src/lexer.h @@ -0,0 +1,49 @@ +#ifndef TREE_SITTER_LEXER_H_ +#define TREE_SITTER_LEXER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./length.h" +#include "./subtree.h" +#include "tree_sitter/api.h" +#include "./parser.h" + +typedef struct { + TSLexer data; + Length current_position; + Length token_start_position; + Length token_end_position; + + TSRange *included_ranges; + const char *chunk; + TSInput input; + TSLogger logger; + + uint32_t included_range_count; + uint32_t current_included_range_index; + uint32_t chunk_start; + uint32_t chunk_size; + uint32_t lookahead_size; + bool did_get_column; + + char debug_buffer[TREE_SITTER_SERIALIZATION_BUFFER_SIZE]; +} Lexer; + +void ts_lexer_init(Lexer *); +void ts_lexer_delete(Lexer *); +void ts_lexer_set_input(Lexer *, TSInput); +void ts_lexer_reset(Lexer *, Length); +void ts_lexer_start(Lexer *); +void 
ts_lexer_finish(Lexer *, uint32_t *); +void ts_lexer_advance_to_end(Lexer *); +void ts_lexer_mark_end(Lexer *); +bool ts_lexer_set_included_ranges(Lexer *self, const TSRange *ranges, uint32_t count); +TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count); + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_LEXER_H_ diff --git a/shcat_c/parser/src/node.c b/shcat_c/parser/src/node.c new file mode 100644 index 00000000..f9960213 --- /dev/null +++ b/shcat_c/parser/src/node.c @@ -0,0 +1,774 @@ +#include +#include "./subtree.h" +#include "./tree.h" +#include "./language.h" + +typedef struct { + Subtree parent; + const TSTree *tree; + Length position; + uint32_t child_index; + uint32_t structural_child_index; + const TSSymbol *alias_sequence; +} NodeChildIterator; + +// TSNode - constructors + +TSNode ts_node_new( + const TSTree *tree, + const Subtree *subtree, + Length position, + TSSymbol alias +) { + return (TSNode) { + {position.bytes, position.extent.row, position.extent.column, alias}, + subtree, + tree, + }; +} + +static inline TSNode ts_node__null(void) { + return ts_node_new(NULL, NULL, length_zero(), 0); +} + +// TSNode - accessors + +uint32_t ts_node_start_byte(TSNode self) { + return self.context[0]; +} + +TSPoint ts_node_start_point(TSNode self) { + return (TSPoint) {self.context[1], self.context[2]}; +} + +static inline uint32_t ts_node__alias(const TSNode *self) { + return self->context[3]; +} + +static inline Subtree ts_node__subtree(TSNode self) { + return *(const Subtree *)self.id; +} + +// NodeChildIterator + +static inline NodeChildIterator ts_node_iterate_children(const TSNode *node) { + Subtree subtree = ts_node__subtree(*node); + if (ts_subtree_child_count(subtree) == 0) { + return (NodeChildIterator) {NULL_SUBTREE, node->tree, length_zero(), 0, 0, NULL}; + } + const TSSymbol *alias_sequence = ts_language_alias_sequence( + node->tree->language, + subtree.ptr->production_id + ); + return (NodeChildIterator) { + .tree = 
node->tree, + .parent = subtree, + .position = {ts_node_start_byte(*node), ts_node_start_point(*node)}, + .child_index = 0, + .structural_child_index = 0, + .alias_sequence = alias_sequence, + }; +} + +static inline bool ts_node_child_iterator_done(NodeChildIterator *self) { + return self->child_index == self->parent.ptr->child_count; +} + +static inline bool ts_node_child_iterator_next( + NodeChildIterator *self, + TSNode *result +) { + if (!self->parent.ptr || ts_node_child_iterator_done(self)) return false; + const Subtree *child = &ts_subtree_children(self->parent)[self->child_index]; + TSSymbol alias_symbol = 0; + if (!ts_subtree_extra(*child)) { + if (self->alias_sequence) { + alias_symbol = self->alias_sequence[self->structural_child_index]; + } + self->structural_child_index++; + } + if (self->child_index > 0) { + self->position = length_add(self->position, ts_subtree_padding(*child)); + } + *result = ts_node_new( + self->tree, + child, + self->position, + alias_symbol + ); + self->position = length_add(self->position, ts_subtree_size(*child)); + self->child_index++; + return true; +} + +// TSNode - private + +static inline bool ts_node__is_relevant(TSNode self, bool include_anonymous) { + Subtree tree = ts_node__subtree(self); + if (include_anonymous) { + return ts_subtree_visible(tree) || ts_node__alias(&self); + } else { + TSSymbol alias = ts_node__alias(&self); + if (alias) { + return ts_language_symbol_metadata(self.tree->language, alias).named; + } else { + return ts_subtree_visible(tree) && ts_subtree_named(tree); + } + } +} + +static inline uint32_t ts_node__relevant_child_count( + TSNode self, + bool include_anonymous +) { + Subtree tree = ts_node__subtree(self); + if (ts_subtree_child_count(tree) > 0) { + if (include_anonymous) { + return tree.ptr->visible_child_count; + } else { + return tree.ptr->named_child_count; + } + } else { + return 0; + } +} + +static inline TSNode ts_node__child( + TSNode self, + uint32_t child_index, + bool 
include_anonymous +) { + TSNode result = self; + bool did_descend = true; + + while (did_descend) { + did_descend = false; + + TSNode child; + uint32_t index = 0; + NodeChildIterator iterator = ts_node_iterate_children(&result); + while (ts_node_child_iterator_next(&iterator, &child)) { + if (ts_node__is_relevant(child, include_anonymous)) { + if (index == child_index) { + return child; + } + index++; + } else { + uint32_t grandchild_index = child_index - index; + uint32_t grandchild_count = ts_node__relevant_child_count(child, include_anonymous); + if (grandchild_index < grandchild_count) { + did_descend = true; + result = child; + child_index = grandchild_index; + break; + } + index += grandchild_count; + } + } + } + + return ts_node__null(); +} + +static bool ts_subtree_has_trailing_empty_descendant( + Subtree self, + Subtree other +) { + for (unsigned i = ts_subtree_child_count(self) - 1; i + 1 > 0; i--) { + Subtree child = ts_subtree_children(self)[i]; + if (ts_subtree_total_bytes(child) > 0) break; + if (child.ptr == other.ptr || ts_subtree_has_trailing_empty_descendant(child, other)) { + return true; + } + } + return false; +} + +static inline TSNode ts_node__prev_sibling(TSNode self, bool include_anonymous) { + Subtree self_subtree = ts_node__subtree(self); + bool self_is_empty = ts_subtree_total_bytes(self_subtree) == 0; + uint32_t target_end_byte = ts_node_end_byte(self); + + TSNode node = ts_node_parent(self); + TSNode earlier_node = ts_node__null(); + bool earlier_node_is_relevant = false; + + while (!ts_node_is_null(node)) { + TSNode earlier_child = ts_node__null(); + bool earlier_child_is_relevant = false; + bool found_child_containing_target = false; + + TSNode child; + NodeChildIterator iterator = ts_node_iterate_children(&node); + while (ts_node_child_iterator_next(&iterator, &child)) { + if (child.id == self.id) break; + if (iterator.position.bytes > target_end_byte) { + found_child_containing_target = true; + break; + } + + if 
(iterator.position.bytes == target_end_byte && + (!self_is_empty || + ts_subtree_has_trailing_empty_descendant(ts_node__subtree(child), self_subtree))) { + found_child_containing_target = true; + break; + } + + if (ts_node__is_relevant(child, include_anonymous)) { + earlier_child = child; + earlier_child_is_relevant = true; + } else if (ts_node__relevant_child_count(child, include_anonymous) > 0) { + earlier_child = child; + earlier_child_is_relevant = false; + } + } + + if (found_child_containing_target) { + if (!ts_node_is_null(earlier_child)) { + earlier_node = earlier_child; + earlier_node_is_relevant = earlier_child_is_relevant; + } + node = child; + } else if (earlier_child_is_relevant) { + return earlier_child; + } else if (!ts_node_is_null(earlier_child)) { + node = earlier_child; + } else if (earlier_node_is_relevant) { + return earlier_node; + } else { + node = earlier_node; + earlier_node = ts_node__null(); + earlier_node_is_relevant = false; + } + } + + return ts_node__null(); +} + +static inline TSNode ts_node__next_sibling(TSNode self, bool include_anonymous) { + uint32_t target_end_byte = ts_node_end_byte(self); + + TSNode node = ts_node_parent(self); + TSNode later_node = ts_node__null(); + bool later_node_is_relevant = false; + + while (!ts_node_is_null(node)) { + TSNode later_child = ts_node__null(); + bool later_child_is_relevant = false; + TSNode child_containing_target = ts_node__null(); + + TSNode child; + NodeChildIterator iterator = ts_node_iterate_children(&node); + while (ts_node_child_iterator_next(&iterator, &child)) { + if (iterator.position.bytes < target_end_byte) continue; + if (ts_node_start_byte(child) <= ts_node_start_byte(self)) { + if (ts_node__subtree(child).ptr != ts_node__subtree(self).ptr) { + child_containing_target = child; + } + } else if (ts_node__is_relevant(child, include_anonymous)) { + later_child = child; + later_child_is_relevant = true; + break; + } else if (ts_node__relevant_child_count(child, include_anonymous) 
> 0) { + later_child = child; + later_child_is_relevant = false; + break; + } + } + + if (!ts_node_is_null(child_containing_target)) { + if (!ts_node_is_null(later_child)) { + later_node = later_child; + later_node_is_relevant = later_child_is_relevant; + } + node = child_containing_target; + } else if (later_child_is_relevant) { + return later_child; + } else if (!ts_node_is_null(later_child)) { + node = later_child; + } else if (later_node_is_relevant) { + return later_node; + } else { + node = later_node; + } + } + + return ts_node__null(); +} + +static inline TSNode ts_node__first_child_for_byte( + TSNode self, + uint32_t goal, + bool include_anonymous +) { + TSNode node = self; + bool did_descend = true; + + while (did_descend) { + did_descend = false; + + TSNode child; + NodeChildIterator iterator = ts_node_iterate_children(&node); + while (ts_node_child_iterator_next(&iterator, &child)) { + if (ts_node_end_byte(child) > goal) { + if (ts_node__is_relevant(child, include_anonymous)) { + return child; + } else if (ts_node_child_count(child) > 0) { + did_descend = true; + node = child; + break; + } + } + } + } + + return ts_node__null(); +} + +static inline TSNode ts_node__descendant_for_byte_range( + TSNode self, + uint32_t range_start, + uint32_t range_end, + bool include_anonymous +) { + TSNode node = self; + TSNode last_visible_node = self; + + bool did_descend = true; + while (did_descend) { + did_descend = false; + + TSNode child; + NodeChildIterator iterator = ts_node_iterate_children(&node); + while (ts_node_child_iterator_next(&iterator, &child)) { + uint32_t node_end = iterator.position.bytes; + + // The end of this node must extend far enough forward to touch + // the end of the range and exceed the start of the range. + if (node_end < range_end) continue; + if (node_end <= range_start) continue; + + // The start of this node must extend far enough backward to + // touch the start of the range. 
+ if (range_start < ts_node_start_byte(child)) break; + + node = child; + if (ts_node__is_relevant(node, include_anonymous)) { + last_visible_node = node; + } + did_descend = true; + break; + } + } + + return last_visible_node; +} + +static inline TSNode ts_node__descendant_for_point_range( + TSNode self, + TSPoint range_start, + TSPoint range_end, + bool include_anonymous +) { + TSNode node = self; + TSNode last_visible_node = self; + + bool did_descend = true; + while (did_descend) { + did_descend = false; + + TSNode child; + NodeChildIterator iterator = ts_node_iterate_children(&node); + while (ts_node_child_iterator_next(&iterator, &child)) { + TSPoint node_end = iterator.position.extent; + + // The end of this node must extend far enough forward to touch + // the end of the range and exceed the start of the range. + if (point_lt(node_end, range_end)) continue; + if (point_lte(node_end, range_start)) continue; + + // The start of this node must extend far enough backward to + // touch the start of the range. 
+ if (point_lt(range_start, ts_node_start_point(child))) break; + + node = child; + if (ts_node__is_relevant(node, include_anonymous)) { + last_visible_node = node; + } + did_descend = true; + break; + } + } + + return last_visible_node; +} + +// TSNode - public + +uint32_t ts_node_end_byte(TSNode self) { + return ts_node_start_byte(self) + ts_subtree_size(ts_node__subtree(self)).bytes; +} + +TSPoint ts_node_end_point(TSNode self) { + return point_add(ts_node_start_point(self), ts_subtree_size(ts_node__subtree(self)).extent); +} + +TSSymbol ts_node_symbol(TSNode self) { + TSSymbol symbol = ts_node__alias(&self); + if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self)); + return ts_language_public_symbol(self.tree->language, symbol); +} + +const char *ts_node_type(TSNode self) { + TSSymbol symbol = ts_node__alias(&self); + if (!symbol) symbol = ts_subtree_symbol(ts_node__subtree(self)); + return ts_language_symbol_name(self.tree->language, symbol); +} + +const TSLanguage *ts_node_language(TSNode self) { + return self.tree->language; +} + +TSSymbol ts_node_grammar_symbol(TSNode self) { + return ts_subtree_symbol(ts_node__subtree(self)); +} + +const char *ts_node_grammar_type(TSNode self) { + TSSymbol symbol = ts_subtree_symbol(ts_node__subtree(self)); + return ts_language_symbol_name(self.tree->language, symbol); +} + +char *ts_node_string(TSNode self) { + TSSymbol alias_symbol = ts_node__alias(&self); + return ts_subtree_string( + ts_node__subtree(self), + alias_symbol, + ts_language_symbol_metadata(self.tree->language, alias_symbol).visible, + self.tree->language, + false + ); +} + +bool ts_node_eq(TSNode self, TSNode other) { + return self.tree == other.tree && self.id == other.id; +} + +bool ts_node_is_null(TSNode self) { + return self.id == 0; +} + +bool ts_node_is_extra(TSNode self) { + return ts_subtree_extra(ts_node__subtree(self)); +} + +bool ts_node_is_named(TSNode self) { + TSSymbol alias = ts_node__alias(&self); + return alias + ? 
ts_language_symbol_metadata(self.tree->language, alias).named + : ts_subtree_named(ts_node__subtree(self)); +} + +bool ts_node_is_missing(TSNode self) { + return ts_subtree_missing(ts_node__subtree(self)); +} + +bool ts_node_has_changes(TSNode self) { + return ts_subtree_has_changes(ts_node__subtree(self)); +} + +bool ts_node_has_error(TSNode self) { + return ts_subtree_error_cost(ts_node__subtree(self)) > 0; +} + +bool ts_node_is_error(TSNode self) { + TSSymbol symbol = ts_node_symbol(self); + return symbol == ts_builtin_sym_error; +} + +uint32_t ts_node_descendant_count(TSNode self) { + return ts_subtree_visible_descendant_count(ts_node__subtree(self)) + 1; +} + +TSStateId ts_node_parse_state(TSNode self) { + return ts_subtree_parse_state(ts_node__subtree(self)); +} + +TSStateId ts_node_next_parse_state(TSNode self) { + const TSLanguage *language = self.tree->language; + uint16_t state = ts_node_parse_state(self); + if (state == TS_TREE_STATE_NONE) { + return TS_TREE_STATE_NONE; + } + uint16_t symbol = ts_node_grammar_symbol(self); + return ts_language_next_state(language, state, symbol); +} + +TSNode ts_node_parent(TSNode self) { + TSNode node = ts_tree_root_node(self.tree); + uint32_t end_byte = ts_node_end_byte(self); + if (node.id == self.id) return ts_node__null(); + + TSNode last_visible_node = node; + bool did_descend = true; + while (did_descend) { + did_descend = false; + + TSNode child; + NodeChildIterator iterator = ts_node_iterate_children(&node); + while (ts_node_child_iterator_next(&iterator, &child)) { + if ( + ts_node_start_byte(child) > ts_node_start_byte(self) || + child.id == self.id + ) break; + if (iterator.position.bytes >= end_byte && ts_node_child_count(child) > 0) { + node = child; + if (ts_node__is_relevant(child, true)) { + last_visible_node = node; + } + did_descend = true; + break; + } + } + } + + return last_visible_node; +} + +TSNode ts_node_child(TSNode self, uint32_t child_index) { + return ts_node__child(self, child_index, true); 
+} + +TSNode ts_node_named_child(TSNode self, uint32_t child_index) { + return ts_node__child(self, child_index, false); +} + +TSNode ts_node_child_by_field_id(TSNode self, TSFieldId field_id) { +recur: + if (!field_id || ts_node_child_count(self) == 0) return ts_node__null(); + + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map( + self.tree->language, + ts_node__subtree(self).ptr->production_id, + &field_map, + &field_map_end + ); + if (field_map == field_map_end) return ts_node__null(); + + // The field mappings are sorted by their field id. Scan all + // the mappings to find the ones for the given field id. + while (field_map->field_id < field_id) { + field_map++; + if (field_map == field_map_end) return ts_node__null(); + } + while (field_map_end[-1].field_id > field_id) { + field_map_end--; + if (field_map == field_map_end) return ts_node__null(); + } + + TSNode child; + NodeChildIterator iterator = ts_node_iterate_children(&self); + while (ts_node_child_iterator_next(&iterator, &child)) { + if (!ts_subtree_extra(ts_node__subtree(child))) { + uint32_t index = iterator.structural_child_index - 1; + if (index < field_map->child_index) continue; + + // Hidden nodes' fields are "inherited" by their visible parent. + if (field_map->inherited) { + + // If this is the *last* possible child node for this field, + // then perform a tail call to avoid recursion. + if (field_map + 1 == field_map_end) { + self = child; + goto recur; + } + + // Otherwise, descend into this child, but if it doesn't contain + // the field, continue searching subsequent children. + else { + TSNode result = ts_node_child_by_field_id(child, field_id); + if (result.id) return result; + field_map++; + if (field_map == field_map_end) return ts_node__null(); + } + } + + else if (ts_node__is_relevant(child, true)) { + return child; + } + + // If the field refers to a hidden node with visible children, + // return the first visible child. 
+ else if (ts_node_child_count(child) > 0 ) { + return ts_node_child(child, 0); + } + + // Otherwise, continue searching subsequent children. + else { + field_map++; + if (field_map == field_map_end) return ts_node__null(); + } + } + } + + return ts_node__null(); +} + +static inline const char *ts_node__field_name_from_language(TSNode self, uint32_t structural_child_index) { + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map( + self.tree->language, + ts_node__subtree(self).ptr->production_id, + &field_map, + &field_map_end + ); + for (; field_map != field_map_end; field_map++) { + if (!field_map->inherited && field_map->child_index == structural_child_index) { + return self.tree->language->field_names[field_map->field_id]; + } + } + return NULL; +} + +const char *ts_node_field_name_for_child(TSNode self, uint32_t child_index) { + TSNode result = self; + bool did_descend = true; + const char *inherited_field_name = NULL; + + while (did_descend) { + did_descend = false; + + TSNode child; + uint32_t index = 0; + NodeChildIterator iterator = ts_node_iterate_children(&result); + while (ts_node_child_iterator_next(&iterator, &child)) { + if (ts_node__is_relevant(child, true)) { + if (index == child_index) { + const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1); + if (field_name) return field_name; + return inherited_field_name; + } + index++; + } else { + uint32_t grandchild_index = child_index - index; + uint32_t grandchild_count = ts_node__relevant_child_count(child, true); + if (grandchild_index < grandchild_count) { + const char *field_name = ts_node__field_name_from_language(result, iterator.structural_child_index - 1); + if (field_name) inherited_field_name = field_name; + + did_descend = true; + result = child; + child_index = grandchild_index; + break; + } + index += grandchild_count; + } + } + } + + return NULL; +} + +TSNode ts_node_child_by_field_name( + TSNode self, + const char 
*name, + uint32_t name_length +) { + TSFieldId field_id = ts_language_field_id_for_name( + self.tree->language, + name, + name_length + ); + return ts_node_child_by_field_id(self, field_id); +} + +uint32_t ts_node_child_count(TSNode self) { + Subtree tree = ts_node__subtree(self); + if (ts_subtree_child_count(tree) > 0) { + return tree.ptr->visible_child_count; + } else { + return 0; + } +} + +uint32_t ts_node_named_child_count(TSNode self) { + Subtree tree = ts_node__subtree(self); + if (ts_subtree_child_count(tree) > 0) { + return tree.ptr->named_child_count; + } else { + return 0; + } +} + +TSNode ts_node_next_sibling(TSNode self) { + return ts_node__next_sibling(self, true); +} + +TSNode ts_node_next_named_sibling(TSNode self) { + return ts_node__next_sibling(self, false); +} + +TSNode ts_node_prev_sibling(TSNode self) { + return ts_node__prev_sibling(self, true); +} + +TSNode ts_node_prev_named_sibling(TSNode self) { + return ts_node__prev_sibling(self, false); +} + +TSNode ts_node_first_child_for_byte(TSNode self, uint32_t byte) { + return ts_node__first_child_for_byte(self, byte, true); +} + +TSNode ts_node_first_named_child_for_byte(TSNode self, uint32_t byte) { + return ts_node__first_child_for_byte(self, byte, false); +} + +TSNode ts_node_descendant_for_byte_range( + TSNode self, + uint32_t start, + uint32_t end +) { + return ts_node__descendant_for_byte_range(self, start, end, true); +} + +TSNode ts_node_named_descendant_for_byte_range( + TSNode self, + uint32_t start, + uint32_t end +) { + return ts_node__descendant_for_byte_range(self, start, end, false); +} + +TSNode ts_node_descendant_for_point_range( + TSNode self, + TSPoint start, + TSPoint end +) { + return ts_node__descendant_for_point_range(self, start, end, true); +} + +TSNode ts_node_named_descendant_for_point_range( + TSNode self, + TSPoint start, + TSPoint end +) { + return ts_node__descendant_for_point_range(self, start, end, false); +} + +void ts_node_edit(TSNode *self, const TSInputEdit 
*edit) { + uint32_t start_byte = ts_node_start_byte(*self); + TSPoint start_point = ts_node_start_point(*self); + + if (start_byte >= edit->old_end_byte) { + start_byte = edit->new_end_byte + (start_byte - edit->old_end_byte); + start_point = point_add(edit->new_end_point, point_sub(start_point, edit->old_end_point)); + } else if (start_byte > edit->start_byte) { + start_byte = edit->new_end_byte; + start_point = edit->new_end_point; + } + + self->context[0] = start_byte; + self->context[1] = start_point.row; + self->context[2] = start_point.column; +} diff --git a/shcat_c/parser/src/parser.c b/shcat_c/parser/src/parser.c new file mode 100644 index 00000000..c805ef71 --- /dev/null +++ b/shcat_c/parser/src/parser.c @@ -0,0 +1,2272 @@ +#define _POSIX_C_SOURCE 200112L + +#include "./array.h" +#include "./error_costs.h" +#include "./language.h" +#include "./length.h" +#include "./lexer.h" +#include "./reduce_action.h" +#include "./reusable_node.h" +#include "./stack.h" +#include "./subtree.h" +#include "./tree.h" + +#include "tree_sitter/api.h" +#include +#include +#include +#include +#include +#include + +typedef Array(TSRange) ArrayRange; +typedef uint64_t TSDuration; +typedef uint64_t TSClock; + +#define LOG(...) 
\ + if (self->lexer.logger.log || self->dot_graph_file) \ + { \ + snprintf(self->lexer.debug_buffer, \ + TREE_SITTER_SERIALIZATION_BUFFER_SIZE, __VA_ARGS__); \ + ts_parser__log(self); \ + } + +#define LOG_LOOKAHEAD(symbol_name, size) \ + if (self->lexer.logger.log || self->dot_graph_file) \ + { \ + char *buf = self->lexer.debug_buffer; \ + const char *symbol = symbol_name; \ + int off = sprintf(buf, "lexed_lookahead sym:"); \ + for (int i = 0; \ + symbol[i] != '\0' && off < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; \ + i++) \ + { \ + switch (symbol[i]) \ + { \ + case '\t': \ + buf[off++] = '\\'; \ + buf[off++] = 't'; \ + break; \ + case '\n': \ + buf[off++] = '\\'; \ + buf[off++] = 'n'; \ + break; \ + case '\v': \ + buf[off++] = '\\'; \ + buf[off++] = 'v'; \ + break; \ + case '\f': \ + buf[off++] = '\\'; \ + buf[off++] = 'f'; \ + break; \ + case '\r': \ + buf[off++] = '\\'; \ + buf[off++] = 'r'; \ + break; \ + case '\\': \ + buf[off++] = '\\'; \ + buf[off++] = '\\'; \ + break; \ + default: \ + buf[off++] = symbol[i]; \ + break; \ + } \ + } \ + snprintf(buf + off, TREE_SITTER_SERIALIZATION_BUFFER_SIZE - off, \ + ", size:%u", size); \ + ts_parser__log(self); \ + } + +#define LOG_STACK() \ + if (self->dot_graph_file) \ + { \ + ts_stack_print_dot_graph(self->stack, self->language, \ + self->dot_graph_file); \ + fputs("\n\n", self->dot_graph_file); \ + } + +#define LOG_TREE(tree) \ + if (self->dot_graph_file) \ + { \ + ts_subtree_print_dot_graph(tree, self->language, \ + self->dot_graph_file); \ + fputs("\n", self->dot_graph_file); \ + } + +#define SYM_NAME(symbol) ts_language_symbol_name(self->language, symbol) + +#define TREE_NAME(tree) SYM_NAME(ts_subtree_symbol(tree)) + +static const unsigned MAX_VERSION_COUNT = 6; +static const unsigned MAX_VERSION_COUNT_OVERFLOW = 4; +static const unsigned MAX_SUMMARY_DEPTH = 16; +static const unsigned MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; +static const unsigned OP_COUNT_PER_TIMEOUT_CHECK = 100; + +typedef struct +{ 
+ Subtree token; + Subtree last_external_token; + uint32_t byte_index; +} TokenCache; + +struct TSParser +{ + Lexer lexer; + Stack *stack; + SubtreePool tree_pool; + const TSLanguage *language; + void *wasm_store; + ReduceActionSet reduce_actions; + Subtree finished_tree; + SubtreeArray trailing_extras; + SubtreeArray trailing_extras2; + SubtreeArray scratch_trees; + TokenCache token_cache; + ReusableNode reusable_node; + void *external_scanner_payload; + FILE *dot_graph_file; + TSClock end_clock; + TSDuration timeout_duration; + unsigned accept_count; + unsigned operation_count; + const volatile size_t *cancellation_flag; + Subtree old_tree; + ArrayRange included_range_differences; + unsigned included_range_difference_index; + bool has_scanner_error; +}; + +typedef struct +{ + unsigned cost; + unsigned node_count; + int dynamic_precedence; + bool is_in_error; +} ErrorStatus; + +typedef enum +{ + ErrorComparisonTakeLeft, + ErrorComparisonPreferLeft, + ErrorComparisonNone, + ErrorComparisonPreferRight, + ErrorComparisonTakeRight, +} ErrorComparison; + +typedef struct +{ + const char *string; + uint32_t length; +} TSStringInput; + +// StringInput + +static const char *ts_string_input_read(void *_self, uint32_t byte, + TSPoint point, uint32_t *length) +{ + (void)point; + TSStringInput *self = (TSStringInput *)_self; + if (byte >= self->length) + { + *length = 0; + return ""; + } + else + { + *length = self->length - byte; + return self->string + byte; + } +} + +// Parser - Private + +static void ts_parser__log(TSParser *self) +{ + if (self->lexer.logger.log) + { + self->lexer.logger.log(self->lexer.logger.payload, TSLogTypeParse, + self->lexer.debug_buffer); + } + + if (self->dot_graph_file) + { + fprintf(self->dot_graph_file, "graph {\nlabel=\""); + for (char *chr = &self->lexer.debug_buffer[0]; *chr != 0; chr++) + { + if (*chr == '"' || *chr == '\\') + fputc('\\', self->dot_graph_file); + fputc(*chr, self->dot_graph_file); + } + fprintf(self->dot_graph_file, 
"\"\n}\n\n"); + } +} + +static bool ts_parser__breakdown_top_of_stack(TSParser *self, + StackVersion version) +{ + bool did_break_down = false; + bool pending = false; + + do + { + StackSliceArray pop = ts_stack_pop_pending(self->stack, version); + if (!pop.size) + break; + + did_break_down = true; + pending = false; + for (uint32_t i = 0; i < pop.size; i++) + { + StackSlice slice = pop.contents[i]; + TSStateId state = ts_stack_state(self->stack, slice.version); + Subtree parent = *array_front(&slice.subtrees); + + for (uint32_t j = 0, n = ts_subtree_child_count(parent); j < n; j++) + { + Subtree child = ts_subtree_children(parent)[j]; + pending = ts_subtree_child_count(child) > 0; + + if (ts_subtree_is_error(child)) + { + state = ERROR_STATE; + } + else if (!ts_subtree_extra(child)) + { + state = ts_language_next_state(self->language, state, + ts_subtree_symbol(child)); + } + + ts_subtree_retain(child); + ts_stack_push(self->stack, slice.version, child, pending, + state); + } + + for (uint32_t j = 1; j < slice.subtrees.size; j++) + { + Subtree tree = slice.subtrees.contents[j]; + ts_stack_push(self->stack, slice.version, tree, false, state); + } + + ts_subtree_release(&self->tree_pool, parent); + array_delete(&slice.subtrees); + + LOG("breakdown_top_of_stack tree:%s", TREE_NAME(parent)); + LOG_STACK(); + } + } while (pending); + + return did_break_down; +} + +static void ts_parser__breakdown_lookahead(TSParser *self, Subtree *lookahead, + TSStateId state, + ReusableNode *reusable_node) +{ + bool did_descend = false; + Subtree tree = reusable_node_tree(reusable_node); + while (ts_subtree_child_count(tree) > 0 && + ts_subtree_parse_state(tree) != state) + { + LOG("state_mismatch sym:%s", TREE_NAME(tree)); + reusable_node_descend(reusable_node); + tree = reusable_node_tree(reusable_node); + did_descend = true; + } + + if (did_descend) + { + ts_subtree_release(&self->tree_pool, *lookahead); + *lookahead = tree; + ts_subtree_retain(*lookahead); + } +} + +static 
ErrorComparison ts_parser__compare_versions(TSParser *self, + ErrorStatus a, ErrorStatus b) +{ + (void)self; + if (!a.is_in_error && b.is_in_error) + { + if (a.cost < b.cost) + { + return ErrorComparisonTakeLeft; + } + else + { + return ErrorComparisonPreferLeft; + } + } + + if (a.is_in_error && !b.is_in_error) + { + if (b.cost < a.cost) + { + return ErrorComparisonTakeRight; + } + else + { + return ErrorComparisonPreferRight; + } + } + + if (a.cost < b.cost) + { + if ((b.cost - a.cost) * (1 + a.node_count) > MAX_COST_DIFFERENCE) + { + return ErrorComparisonTakeLeft; + } + else + { + return ErrorComparisonPreferLeft; + } + } + + if (b.cost < a.cost) + { + if ((a.cost - b.cost) * (1 + b.node_count) > MAX_COST_DIFFERENCE) + { + return ErrorComparisonTakeRight; + } + else + { + return ErrorComparisonPreferRight; + } + } + + if (a.dynamic_precedence > b.dynamic_precedence) + return ErrorComparisonPreferLeft; + if (b.dynamic_precedence > a.dynamic_precedence) + return ErrorComparisonPreferRight; + return ErrorComparisonNone; +} + +static ErrorStatus ts_parser__version_status(TSParser *self, + StackVersion version) +{ + unsigned cost = ts_stack_error_cost(self->stack, version); + bool is_paused = ts_stack_is_paused(self->stack, version); + if (is_paused) + cost += ERROR_COST_PER_SKIPPED_TREE; + return (ErrorStatus){ + .cost = cost, + .node_count = ts_stack_node_count_since_error(self->stack, version), + .dynamic_precedence = ts_stack_dynamic_precedence(self->stack, version), + .is_in_error = + is_paused || ts_stack_state(self->stack, version) == ERROR_STATE}; +} + +static bool ts_parser__better_version_exists(TSParser *self, + StackVersion version, + bool is_in_error, unsigned cost) +{ + if (self->finished_tree.ptr && + ts_subtree_error_cost(self->finished_tree) <= cost) + { + return true; + } + + Length position = ts_stack_position(self->stack, version); + ErrorStatus status = { + .cost = cost, + .is_in_error = is_in_error, + .dynamic_precedence = 
ts_stack_dynamic_precedence(self->stack, version), + .node_count = ts_stack_node_count_since_error(self->stack, version), + }; + + for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; + i++) + { + if (i == version || !ts_stack_is_active(self->stack, i) || + ts_stack_position(self->stack, i).bytes < position.bytes) + continue; + ErrorStatus status_i = ts_parser__version_status(self, i); + switch (ts_parser__compare_versions(self, status, status_i)) + { + case ErrorComparisonTakeRight: + return true; + case ErrorComparisonPreferRight: + if (ts_stack_can_merge(self->stack, i, version)) + return true; + break; + default: + break; + } + } + + return false; +} + +static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexMode lex_mode) +{ + return self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); +} + +static bool ts_parser__call_keyword_lex_fn(TSParser *self, TSLexMode lex_mode) +{ + return self->language->keyword_lex_fn(&self->lexer.data, 0); +} + +static void ts_parser__external_scanner_create(TSParser *self) +{ + if (self->language && self->language->external_scanner.states) + { + self->external_scanner_payload = + self->language->external_scanner.create(); + } +} + +static void ts_parser__external_scanner_destroy(TSParser *self) +{ + if (self->language && self->external_scanner_payload && + self->language->external_scanner.destroy) + { + self->language->external_scanner.destroy( + self->external_scanner_payload); + } + self->external_scanner_payload = NULL; +} + +static unsigned ts_parser__external_scanner_serialize(TSParser *self) +{ + return self->language->external_scanner.serialize( + self->external_scanner_payload, self->lexer.debug_buffer); +} + +static void ts_parser__external_scanner_deserialize(TSParser *self, + Subtree external_token) +{ + const char *data = NULL; + uint32_t length = 0; + if (external_token.ptr) + { + data = ts_external_scanner_state_data( + &external_token.ptr->external_scanner_state); + length = 
external_token.ptr->external_scanner_state.length; + } + + self->language->external_scanner.deserialize(self->external_scanner_payload, + data, length); +} + +static bool ts_parser__external_scanner_scan(TSParser *self, + TSStateId external_lex_state) +{ + const bool *valid_external_tokens = + ts_language_enabled_external_tokens(self->language, external_lex_state); + return self->language->external_scanner.scan(self->external_scanner_payload, + &self->lexer.data, + valid_external_tokens); +} + +static bool ts_parser__can_reuse_first_leaf(TSParser *self, TSStateId state, + Subtree tree, + TableEntry *table_entry) +{ + TSLexMode current_lex_mode = self->language->lex_modes[state]; + TSSymbol leaf_symbol = ts_subtree_leaf_symbol(tree); + TSStateId leaf_state = ts_subtree_leaf_parse_state(tree); + TSLexMode leaf_lex_mode = self->language->lex_modes[leaf_state]; + + // At the end of a non-terminal extra node, the lexer normally returns + // NULL, which indicates that the parser should look for a reduce action + // at symbol `0`. Avoid reusing tokens in this situation to ensure that + // the same thing happens when incrementally reparsing. + if (current_lex_mode.lex_state == (uint16_t)(-1)) + return false; + + // If the token was created in a state with the same set of lookaheads, it + // is reusable. + if (table_entry->action_count > 0 && + memcmp(&leaf_lex_mode, ¤t_lex_mode, sizeof(TSLexMode)) == 0 && + (leaf_symbol != self->language->keyword_capture_token || + (!ts_subtree_is_keyword(tree) && + ts_subtree_parse_state(tree) == state))) + return true; + + // Empty tokens are not reusable in states with different lookaheads. + if (ts_subtree_size(tree).bytes == 0 && leaf_symbol != ts_builtin_sym_end) + return false; + + // If the current state allows external tokens or other tokens that conflict + // with this token, this token is not reusable. 
+ return current_lex_mode.external_lex_state == 0 && table_entry->is_reusable; +} + +static Subtree ts_parser__lex(TSParser *self, StackVersion version, + TSStateId parse_state) +{ + TSLexMode lex_mode = self->language->lex_modes[parse_state]; + if (lex_mode.lex_state == (uint16_t)-1) + { + LOG("no_lookahead_after_non_terminal_extra"); + return NULL_SUBTREE; + } + + const Length start_position = ts_stack_position(self->stack, version); + const Subtree external_token = + ts_stack_last_external_token(self->stack, version); + + bool found_external_token = false; + bool error_mode = parse_state == ERROR_STATE; + bool skipped_error = false; + bool called_get_column = false; + int32_t first_error_character = 0; + Length error_start_position = length_zero(); + Length error_end_position = length_zero(); + uint32_t lookahead_end_byte = 0; + uint32_t external_scanner_state_len = 0; + bool external_scanner_state_changed = false; + ts_lexer_reset(&self->lexer, start_position); + + for (;;) + { + bool found_token = false; + Length current_position = self->lexer.current_position; + + if (lex_mode.external_lex_state != 0) + { + LOG("lex_external state:%d, row:%u, column:%u", + lex_mode.external_lex_state, current_position.extent.row, + current_position.extent.column); + ts_lexer_start(&self->lexer); + ts_parser__external_scanner_deserialize(self, external_token); + found_token = ts_parser__external_scanner_scan( + self, lex_mode.external_lex_state); + if (self->has_scanner_error) + return NULL_SUBTREE; + ts_lexer_finish(&self->lexer, &lookahead_end_byte); + + if (found_token) + { + external_scanner_state_len = + ts_parser__external_scanner_serialize(self); + external_scanner_state_changed = !ts_external_scanner_state_eq( + ts_subtree_external_scanner_state(external_token), + self->lexer.debug_buffer, external_scanner_state_len); + + // When recovering from an error, ignore any zero-length + // external tokens unless they have changed the external + // scanner's state. 
This helps to avoid infinite loops which + // could otherwise occur, because the lexer is looking for any + // possible token, instead of looking for the specific set of + // tokens that are valid in some parse state. + // + // Note that it's possible that the token end position may be + // *before* the original position of the lexer because of the + // way that tokens are positioned at included range boundaries: + // when a token is terminated at the start of an included range, + // it is marked as ending at the *end* of the preceding included + // range. + if (self->lexer.token_end_position.bytes <= + current_position.bytes && + (error_mode || !ts_stack_has_advanced_since_error( + self->stack, version)) && + !external_scanner_state_changed) + { + LOG("ignore_empty_external_token symbol:%s", + SYM_NAME( + self->language->external_scanner + .symbol_map[self->lexer.data.result_symbol])) + found_token = false; + } + } + + if (found_token) + { + found_external_token = true; + called_get_column = self->lexer.did_get_column; + break; + } + + ts_lexer_reset(&self->lexer, current_position); + } + + LOG("lex_internal state:%d, row:%u, column:%u", lex_mode.lex_state, + current_position.extent.row, current_position.extent.column); + ts_lexer_start(&self->lexer); + found_token = ts_parser__call_main_lex_fn(self, lex_mode); + ts_lexer_finish(&self->lexer, &lookahead_end_byte); + if (found_token) + break; + + if (!error_mode) + { + error_mode = true; + lex_mode = self->language->lex_modes[ERROR_STATE]; + ts_lexer_reset(&self->lexer, start_position); + continue; + } + + if (!skipped_error) + { + LOG("skip_unrecognized_character"); + skipped_error = true; + error_start_position = self->lexer.token_start_position; + error_end_position = self->lexer.token_start_position; + first_error_character = self->lexer.data.lookahead; + } + + if (self->lexer.current_position.bytes == error_end_position.bytes) + { + if (self->lexer.data.eof(&self->lexer.data)) + { + 
self->lexer.data.result_symbol = ts_builtin_sym_error; + break; + } + self->lexer.data.advance(&self->lexer.data, false); + } + + error_end_position = self->lexer.current_position; + } + + Subtree result; + if (skipped_error) + { + Length padding = length_sub(error_start_position, start_position); + Length size = length_sub(error_end_position, error_start_position); + uint32_t lookahead_bytes = + lookahead_end_byte - error_end_position.bytes; + result = ts_subtree_new_error(&self->tree_pool, first_error_character, + padding, size, lookahead_bytes, + parse_state, self->language); + } + else + { + bool is_keyword = false; + TSSymbol symbol = self->lexer.data.result_symbol; + Length padding = + length_sub(self->lexer.token_start_position, start_position); + Length size = length_sub(self->lexer.token_end_position, + self->lexer.token_start_position); + uint32_t lookahead_bytes = + lookahead_end_byte - self->lexer.token_end_position.bytes; + + if (found_external_token) + { + symbol = self->language->external_scanner.symbol_map[symbol]; + } + else if (symbol == self->language->keyword_capture_token && symbol != 0) + { + uint32_t end_byte = self->lexer.token_end_position.bytes; + ts_lexer_reset(&self->lexer, self->lexer.token_start_position); + ts_lexer_start(&self->lexer); + + is_keyword = ts_parser__call_keyword_lex_fn(self, lex_mode); + + if (is_keyword && + self->lexer.token_end_position.bytes == end_byte && + ts_language_has_actions(self->language, parse_state, + self->lexer.data.result_symbol)) + { + symbol = self->lexer.data.result_symbol; + } + } + + result = ts_subtree_new_leaf(&self->tree_pool, symbol, padding, size, + lookahead_bytes, parse_state, + found_external_token, called_get_column, + is_keyword, self->language); + + if (found_external_token) + { + MutableSubtree mut_result = ts_subtree_to_mut_unsafe(result); + ts_external_scanner_state_init( + &mut_result.ptr->external_scanner_state, + self->lexer.debug_buffer, external_scanner_state_len); + 
mut_result.ptr->has_external_scanner_state_change = + external_scanner_state_changed; + } + } + + LOG_LOOKAHEAD(SYM_NAME(ts_subtree_symbol(result)), + ts_subtree_total_size(result).bytes); + return result; +} + +static Subtree ts_parser__get_cached_token(TSParser *self, TSStateId state, + size_t position, + Subtree last_external_token, + TableEntry *table_entry) +{ + TokenCache *cache = &self->token_cache; + if (cache->token.ptr && cache->byte_index == position && + ts_subtree_external_scanner_state_eq(cache->last_external_token, + last_external_token)) + { + ts_language_table_entry(self->language, state, + ts_subtree_symbol(cache->token), table_entry); + if (ts_parser__can_reuse_first_leaf(self, state, cache->token, + table_entry)) + { + ts_subtree_retain(cache->token); + return cache->token; + } + } + return NULL_SUBTREE; +} + +static void ts_parser__set_cached_token(TSParser *self, uint32_t byte_index, + Subtree last_external_token, + Subtree token) +{ + TokenCache *cache = &self->token_cache; + if (token.ptr) + ts_subtree_retain(token); + if (last_external_token.ptr) + ts_subtree_retain(last_external_token); + if (cache->token.ptr) + ts_subtree_release(&self->tree_pool, cache->token); + if (cache->last_external_token.ptr) + ts_subtree_release(&self->tree_pool, cache->last_external_token); + cache->token = token; + cache->byte_index = byte_index; + cache->last_external_token = last_external_token; +} + +static Subtree ts_parser__reuse_node(TSParser *self, StackVersion version, + TSStateId *state, uint32_t position, + Subtree last_external_token, + TableEntry *table_entry) +{ + Subtree result; + while ((result = reusable_node_tree(&self->reusable_node)).ptr) + { + uint32_t byte_offset = reusable_node_byte_offset(&self->reusable_node); + uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result); + + // Do not reuse an EOF node if the included ranges array has changes + // later on in the file. 
+ if (ts_subtree_is_eof(result)) + end_byte_offset = UINT32_MAX; + + if (byte_offset > position) + { + LOG("before_reusable_node symbol:%s", TREE_NAME(result)); + break; + } + + if (byte_offset < position) + { + LOG("past_reusable_node symbol:%s", TREE_NAME(result)); + if (end_byte_offset <= position || + !reusable_node_descend(&self->reusable_node)) + { + reusable_node_advance(&self->reusable_node); + } + continue; + } + + if (!ts_subtree_external_scanner_state_eq( + self->reusable_node.last_external_token, last_external_token)) + { + LOG("reusable_node_has_different_external_scanner_state symbol:%s", + TREE_NAME(result)); + reusable_node_advance(&self->reusable_node); + continue; + } + + const char *reason = NULL; + if (ts_subtree_has_changes(result)) + { + reason = "has_changes"; + } + else if (ts_subtree_is_error(result)) + { + reason = "is_error"; + } + else if (ts_subtree_missing(result)) + { + reason = "is_missing"; + } + else if (ts_subtree_is_fragile(result)) + { + reason = "is_fragile"; + } + + if (reason) + { + LOG("cant_reuse_node_%s tree:%s", reason, TREE_NAME(result)); + if (!reusable_node_descend(&self->reusable_node)) + { + reusable_node_advance(&self->reusable_node); + ts_parser__breakdown_top_of_stack(self, version); + *state = ts_stack_state(self->stack, version); + } + continue; + } + + TSSymbol leaf_symbol = ts_subtree_leaf_symbol(result); + ts_language_table_entry(self->language, *state, leaf_symbol, + table_entry); + if (!ts_parser__can_reuse_first_leaf(self, *state, result, table_entry)) + { + LOG("cant_reuse_node symbol:%s, first_leaf_symbol:%s", + TREE_NAME(result), SYM_NAME(leaf_symbol)); + reusable_node_advance_past_leaf(&self->reusable_node); + break; + } + + LOG("reuse_node symbol:%s", TREE_NAME(result)); + ts_subtree_retain(result); + return result; + } + + return NULL_SUBTREE; +} + +// Determine if a given tree should be replaced by an alternative tree. 
+// +// The decision is based on the trees' error costs (if any), their dynamic +// precedence, and finally, as a default, by a recursive comparison of the +// trees' symbols. +static bool ts_parser__select_tree(TSParser *self, Subtree left, Subtree right) +{ + if (!left.ptr) + return true; + if (!right.ptr) + return false; + + if (ts_subtree_error_cost(right) < ts_subtree_error_cost(left)) + { + LOG("select_smaller_error symbol:%s, over_symbol:%s", TREE_NAME(right), + TREE_NAME(left)); + return true; + } + + if (ts_subtree_error_cost(left) < ts_subtree_error_cost(right)) + { + LOG("select_smaller_error symbol:%s, over_symbol:%s", TREE_NAME(left), + TREE_NAME(right)); + return false; + } + + if (ts_subtree_dynamic_precedence(right) > + ts_subtree_dynamic_precedence(left)) + { + LOG("select_higher_precedence symbol:%s, prec:%" PRId32 + ", over_symbol:%s, other_prec:%" PRId32, + TREE_NAME(right), ts_subtree_dynamic_precedence(right), + TREE_NAME(left), ts_subtree_dynamic_precedence(left)); + return true; + } + + if (ts_subtree_dynamic_precedence(left) > + ts_subtree_dynamic_precedence(right)) + { + LOG("select_higher_precedence symbol:%s, prec:%" PRId32 + ", over_symbol:%s, other_prec:%" PRId32, + TREE_NAME(left), ts_subtree_dynamic_precedence(left), + TREE_NAME(right), ts_subtree_dynamic_precedence(right)); + return false; + } + + if (ts_subtree_error_cost(left) > 0) + return true; + + int comparison = ts_subtree_compare(left, right, &self->tree_pool); + switch (comparison) + { + case -1: + LOG("select_earlier symbol:%s, over_symbol:%s", TREE_NAME(left), + TREE_NAME(right)); + return false; + break; + case 1: + LOG("select_earlier symbol:%s, over_symbol:%s", TREE_NAME(right), + TREE_NAME(left)); + return true; + default: + LOG("select_existing symbol:%s, over_symbol:%s", TREE_NAME(left), + TREE_NAME(right)); + return false; + } +} + +// Determine if a given tree's children should be replaced by an alternative +// array of children. 
+static bool ts_parser__select_children(TSParser *self, Subtree left, + const SubtreeArray *children) +{ + array_assign(&self->scratch_trees, children); + + // Create a temporary subtree using the scratch trees array. This node does + // not perform any allocation except for possibly growing the array to make + // room for its own heap data. The scratch tree is never explicitly + // released, so the same 'scratch trees' array can be reused again later. + MutableSubtree scratch_tree = ts_subtree_new_node( + ts_subtree_symbol(left), &self->scratch_trees, 0, self->language); + + return ts_parser__select_tree(self, left, + ts_subtree_from_mut(scratch_tree)); +} + +static void ts_parser__shift(TSParser *self, StackVersion version, + TSStateId state, Subtree lookahead, bool extra) +{ + bool is_leaf = ts_subtree_child_count(lookahead) == 0; + Subtree subtree_to_push = lookahead; + if (extra != ts_subtree_extra(lookahead) && is_leaf) + { + MutableSubtree result = + ts_subtree_make_mut(&self->tree_pool, lookahead); + ts_subtree_set_extra(&result, extra); + subtree_to_push = ts_subtree_from_mut(result); + } + + ts_stack_push(self->stack, version, subtree_to_push, !is_leaf, state); + if (ts_subtree_has_external_tokens(subtree_to_push)) + { + ts_stack_set_last_external_token( + self->stack, version, + ts_subtree_last_external_token(subtree_to_push)); + } +} + +static StackVersion ts_parser__reduce(TSParser *self, StackVersion version, + TSSymbol symbol, uint32_t count, + int dynamic_precedence, + uint16_t production_id, bool is_fragile, + bool end_of_non_terminal_extra) +{ + uint32_t initial_version_count = ts_stack_version_count(self->stack); + + // Pop the given number of nodes from the given version of the parse stack. + // If stack versions have previously merged, then there may be more than one + // path back through the stack. For each path, create a new parent node to + // contain the popped children, and push it onto the stack in place of the + // children. 
+ StackSliceArray pop = ts_stack_pop_count(self->stack, version, count); + uint32_t removed_version_count = 0; + for (uint32_t i = 0; i < pop.size; i++) + { + StackSlice slice = pop.contents[i]; + StackVersion slice_version = slice.version - removed_version_count; + + // This is where new versions are added to the parse stack. The versions + // will all be sorted and truncated at the end of the outer parsing + // loop. Allow the maximum version count to be temporarily exceeded, but + // only by a limited threshold. + if (slice_version > MAX_VERSION_COUNT + MAX_VERSION_COUNT_OVERFLOW) + { + ts_stack_remove_version(self->stack, slice_version); + ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); + removed_version_count++; + while (i + 1 < pop.size) + { + StackSlice next_slice = pop.contents[i + 1]; + if (next_slice.version != slice.version) + break; + ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); + i++; + } + continue; + } + + // Extra tokens on top of the stack should not be included in this new + // parent node. They will be re-pushed onto the stack after the parent + // node is created and pushed. + SubtreeArray children = slice.subtrees; + ts_subtree_array_remove_trailing_extras(&children, + &self->trailing_extras); + + MutableSubtree parent = ts_subtree_new_node( + symbol, &children, production_id, self->language); + + // This pop operation may have caused multiple stack versions to + // collapse into one, because they all diverged from a common state. In + // that case, choose one of the arrays of trees to be the parent node's + // children, and delete the rest of the tree arrays. 
+ while (i + 1 < pop.size) + { + StackSlice next_slice = pop.contents[i + 1]; + if (next_slice.version != slice.version) + break; + i++; + + SubtreeArray next_slice_children = next_slice.subtrees; + ts_subtree_array_remove_trailing_extras(&next_slice_children, + &self->trailing_extras2); + + if (ts_parser__select_children(self, ts_subtree_from_mut(parent), + &next_slice_children)) + { + ts_subtree_array_clear(&self->tree_pool, + &self->trailing_extras); + ts_subtree_release(&self->tree_pool, + ts_subtree_from_mut(parent)); + array_swap(&self->trailing_extras, &self->trailing_extras2); + parent = ts_subtree_new_node(symbol, &next_slice_children, + production_id, self->language); + } + else + { + array_clear(&self->trailing_extras2); + ts_subtree_array_delete(&self->tree_pool, &next_slice.subtrees); + } + } + + TSStateId state = ts_stack_state(self->stack, slice_version); + TSStateId next_state = + ts_language_next_state(self->language, state, symbol); + if (end_of_non_terminal_extra && next_state == state) + { + parent.ptr->extra = true; + } + if (is_fragile || pop.size > 1 || initial_version_count > 1) + { + parent.ptr->fragile_left = true; + parent.ptr->fragile_right = true; + parent.ptr->parse_state = TS_TREE_STATE_NONE; + } + else + { + parent.ptr->parse_state = state; + } + parent.ptr->dynamic_precedence += dynamic_precedence; + + // Push the parent node onto the stack, along with any extra tokens that + // were previously on top of the stack. + ts_stack_push(self->stack, slice_version, ts_subtree_from_mut(parent), + false, next_state); + for (uint32_t j = 0; j < self->trailing_extras.size; j++) + { + ts_stack_push(self->stack, slice_version, + self->trailing_extras.contents[j], false, next_state); + } + + for (StackVersion j = 0; j < slice_version; j++) + { + if (j == version) + continue; + if (ts_stack_merge(self->stack, j, slice_version)) + { + removed_version_count++; + break; + } + } + } + + // Return the first new stack version that was created. 
+ return ts_stack_version_count(self->stack) > initial_version_count + ? initial_version_count + : STACK_VERSION_NONE; +} + +static void ts_parser__accept(TSParser *self, StackVersion version, + Subtree lookahead) +{ + assert(ts_subtree_is_eof(lookahead)); + ts_stack_push(self->stack, version, lookahead, false, 1); + + StackSliceArray pop = ts_stack_pop_all(self->stack, version); + for (uint32_t i = 0; i < pop.size; i++) + { + SubtreeArray trees = pop.contents[i].subtrees; + + Subtree root = NULL_SUBTREE; + for (uint32_t j = trees.size - 1; j + 1 > 0; j--) + { + Subtree tree = trees.contents[j]; + if (!ts_subtree_extra(tree)) + { + assert(!tree.data.is_inline); + uint32_t child_count = ts_subtree_child_count(tree); + const Subtree *children = ts_subtree_children(tree); + for (uint32_t k = 0; k < child_count; k++) + { + ts_subtree_retain(children[k]); + } + array_splice(&trees, j, 1, child_count, children); + root = ts_subtree_from_mut(ts_subtree_new_node( + ts_subtree_symbol(tree), &trees, tree.ptr->production_id, + self->language)); + ts_subtree_release(&self->tree_pool, tree); + break; + } + } + + assert(root.ptr); + self->accept_count++; + + if (self->finished_tree.ptr) + { + if (ts_parser__select_tree(self, self->finished_tree, root)) + { + ts_subtree_release(&self->tree_pool, self->finished_tree); + self->finished_tree = root; + } + else + { + ts_subtree_release(&self->tree_pool, root); + } + } + else + { + self->finished_tree = root; + } + } + + ts_stack_remove_version(self->stack, pop.contents[0].version); + ts_stack_halt(self->stack, version); +} + +static bool ts_parser__do_all_potential_reductions( + TSParser *self, StackVersion starting_version, TSSymbol lookahead_symbol) +{ + uint32_t initial_version_count = ts_stack_version_count(self->stack); + + bool can_shift_lookahead_symbol = false; + StackVersion version = starting_version; + for (unsigned i = 0; true; i++) + { + uint32_t version_count = ts_stack_version_count(self->stack); + if (version >= 
version_count) + break; + + bool merged = false; + for (StackVersion j = initial_version_count; j < version; j++) + { + if (ts_stack_merge(self->stack, j, version)) + { + merged = true; + break; + } + } + if (merged) + continue; + + TSStateId state = ts_stack_state(self->stack, version); + bool has_shift_action = false; + array_clear(&self->reduce_actions); + + TSSymbol first_symbol, end_symbol; + if (lookahead_symbol != 0) + { + first_symbol = lookahead_symbol; + end_symbol = lookahead_symbol + 1; + } + else + { + first_symbol = 1; + end_symbol = self->language->token_count; + } + + for (TSSymbol symbol = first_symbol; symbol < end_symbol; symbol++) + { + TableEntry entry; + ts_language_table_entry(self->language, state, symbol, &entry); + for (uint32_t j = 0; j < entry.action_count; j++) + { + TSParseAction action = entry.actions[j]; + switch (action.type) + { + case TSParseActionTypeShift: + case TSParseActionTypeRecover: + if (!action.shift.extra && !action.shift.repetition) + has_shift_action = true; + break; + case TSParseActionTypeReduce: + if (action.reduce.child_count > 0) + ts_reduce_action_set_add( + &self->reduce_actions, + (ReduceAction){ + .symbol = action.reduce.symbol, + .count = action.reduce.child_count, + .dynamic_precedence = + action.reduce.dynamic_precedence, + .production_id = action.reduce.production_id, + }); + break; + default: + break; + } + } + } + + StackVersion reduction_version = STACK_VERSION_NONE; + for (uint32_t j = 0; j < self->reduce_actions.size; j++) + { + ReduceAction action = self->reduce_actions.contents[j]; + + reduction_version = ts_parser__reduce( + self, version, action.symbol, action.count, + action.dynamic_precedence, action.production_id, true, false); + } + + if (has_shift_action) + { + can_shift_lookahead_symbol = true; + } + else if (reduction_version != STACK_VERSION_NONE && + i < MAX_VERSION_COUNT) + { + ts_stack_renumber_version(self->stack, reduction_version, version); + continue; + } + else if 
(lookahead_symbol != 0) + { + ts_stack_remove_version(self->stack, version); + } + + if (version == starting_version) + { + version = version_count; + } + else + { + version++; + } + } + + return can_shift_lookahead_symbol; +} + +static bool ts_parser__recover_to_state(TSParser *self, StackVersion version, + unsigned depth, TSStateId goal_state) +{ + StackSliceArray pop = ts_stack_pop_count(self->stack, version, depth); + StackVersion previous_version = STACK_VERSION_NONE; + + for (unsigned i = 0; i < pop.size; i++) + { + StackSlice slice = pop.contents[i]; + + if (slice.version == previous_version) + { + ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); + array_erase(&pop, i--); + continue; + } + + if (ts_stack_state(self->stack, slice.version) != goal_state) + { + ts_stack_halt(self->stack, slice.version); + ts_subtree_array_delete(&self->tree_pool, &slice.subtrees); + array_erase(&pop, i--); + continue; + } + + SubtreeArray error_trees = + ts_stack_pop_error(self->stack, slice.version); + if (error_trees.size > 0) + { + assert(error_trees.size == 1); + Subtree error_tree = error_trees.contents[0]; + uint32_t error_child_count = ts_subtree_child_count(error_tree); + if (error_child_count > 0) + { + array_splice(&slice.subtrees, 0, 0, error_child_count, + ts_subtree_children(error_tree)); + for (unsigned j = 0; j < error_child_count; j++) + { + ts_subtree_retain(slice.subtrees.contents[j]); + } + } + ts_subtree_array_delete(&self->tree_pool, &error_trees); + } + + ts_subtree_array_remove_trailing_extras(&slice.subtrees, + &self->trailing_extras); + + if (slice.subtrees.size > 0) + { + Subtree error = ts_subtree_new_error_node(&slice.subtrees, true, + self->language); + ts_stack_push(self->stack, slice.version, error, false, goal_state); + } + else + { + array_delete(&slice.subtrees); + } + + for (unsigned j = 0; j < self->trailing_extras.size; j++) + { + Subtree tree = self->trailing_extras.contents[j]; + ts_stack_push(self->stack, slice.version, tree, 
false, goal_state); + } + + previous_version = slice.version; + } + + return previous_version != STACK_VERSION_NONE; +} + +static void ts_parser__recover(TSParser *self, StackVersion version, + Subtree lookahead) +{ + bool did_recover = false; + unsigned previous_version_count = ts_stack_version_count(self->stack); + Length position = ts_stack_position(self->stack, version); + StackSummary *summary = ts_stack_get_summary(self->stack, version); + unsigned node_count_since_error = + ts_stack_node_count_since_error(self->stack, version); + unsigned current_error_cost = ts_stack_error_cost(self->stack, version); + + // When the parser is in the error state, there are two strategies for + // recovering with a given lookahead token: + // 1. Find a previous state on the stack in which that lookahead token would + // be valid. Then, + // create a new stack version that is in that state again. This entails + // popping all of the subtrees that have been pushed onto the stack since + // that previous state, and wrapping them in an ERROR node. + // 2. Wrap the lookahead token in an ERROR node, push that ERROR node onto + // the stack, and + // move on to the next lookahead token, remaining in the error state. + // + // First, try the strategy 1. Upon entering the error state, the parser + // recorded a summary of the previous parse states and their depths. Look at + // each state in the summary, to see if the current lookahead token would be + // valid in that state. + if (summary && !ts_subtree_is_error(lookahead)) + { + for (unsigned i = 0; i < summary->size; i++) + { + StackSummaryEntry entry = summary->contents[i]; + + if (entry.state == ERROR_STATE) + continue; + if (entry.position.bytes == position.bytes) + continue; + unsigned depth = entry.depth; + if (node_count_since_error > 0) + depth++; + + // Do not recover in ways that create redundant stack versions. 
+ bool would_merge = false; + for (unsigned j = 0; j < previous_version_count; j++) + { + if (ts_stack_state(self->stack, j) == entry.state && + ts_stack_position(self->stack, j).bytes == position.bytes) + { + would_merge = true; + break; + } + } + if (would_merge) + continue; + + // Do not recover if the result would clearly be worse than some + // existing stack version. + unsigned new_cost = + current_error_cost + entry.depth * ERROR_COST_PER_SKIPPED_TREE + + (position.bytes - entry.position.bytes) * + ERROR_COST_PER_SKIPPED_CHAR + + (position.extent.row - entry.position.extent.row) * + ERROR_COST_PER_SKIPPED_LINE; + if (ts_parser__better_version_exists(self, version, false, + new_cost)) + break; + + // If the current lookahead token is valid in some previous state, + // recover to that state. Then stop looking for further recoveries. + if (ts_language_has_actions(self->language, entry.state, + ts_subtree_symbol(lookahead))) + { + if (ts_parser__recover_to_state(self, version, depth, + entry.state)) + { + did_recover = true; + LOG("recover_to_previous state:%u, depth:%u", entry.state, + depth); + LOG_STACK(); + break; + } + } + } + } + + // In the process of attempting to recover, some stack versions may have + // been created and subsequently halted. Remove those versions. + for (unsigned i = previous_version_count; + i < ts_stack_version_count(self->stack); i++) + { + if (!ts_stack_is_active(self->stack, i)) + { + ts_stack_remove_version(self->stack, i--); + } + } + + // If strategy 1 succeeded, a new stack version will have been created which + // is able to handle the current lookahead token. Now, in addition, try + // strategy 2 described above: skip the current lookahead token by wrapping + // it in an ERROR node. + + // Don't pursue this additional strategy if there are already too many stack + // versions. 
+ if (did_recover && ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) + { + ts_stack_halt(self->stack, version); + ts_subtree_release(&self->tree_pool, lookahead); + return; + } + + if (did_recover && ts_subtree_has_external_scanner_state_change(lookahead)) + { + ts_stack_halt(self->stack, version); + ts_subtree_release(&self->tree_pool, lookahead); + return; + } + + // If the parser is still in the error state at the end of the file, just + // wrap everything in an ERROR node and terminate. + if (ts_subtree_is_eof(lookahead)) + { + LOG("recover_eof"); + SubtreeArray children = array_new(); + Subtree parent = + ts_subtree_new_error_node(&children, false, self->language); + ts_stack_push(self->stack, version, parent, false, 1); + ts_parser__accept(self, version, lookahead); + return; + } + + // Do not recover if the result would clearly be worse than some existing + // stack version. + unsigned new_cost = + current_error_cost + ERROR_COST_PER_SKIPPED_TREE + + ts_subtree_total_bytes(lookahead) * ERROR_COST_PER_SKIPPED_CHAR + + ts_subtree_total_size(lookahead).extent.row * + ERROR_COST_PER_SKIPPED_LINE; + if (ts_parser__better_version_exists(self, version, false, new_cost)) + { + ts_stack_halt(self->stack, version); + ts_subtree_release(&self->tree_pool, lookahead); + return; + } + + // If the current lookahead token is an extra token, mark it as extra. This + // means it won't be counted in error cost calculations. + unsigned n; + const TSParseAction *actions = ts_language_actions( + self->language, 1, ts_subtree_symbol(lookahead), &n); + if (n > 0 && actions[n - 1].type == TSParseActionTypeShift && + actions[n - 1].shift.extra) + { + MutableSubtree mutable_lookahead = + ts_subtree_make_mut(&self->tree_pool, lookahead); + ts_subtree_set_extra(&mutable_lookahead, true); + lookahead = ts_subtree_from_mut(mutable_lookahead); + } + + // Wrap the lookahead token in an ERROR. 
+ LOG("skip_token symbol:%s", TREE_NAME(lookahead)); + SubtreeArray children = array_new(); + array_reserve(&children, 1); + array_push(&children, lookahead); + MutableSubtree error_repeat = ts_subtree_new_node( + ts_builtin_sym_error_repeat, &children, 0, self->language); + + // If other tokens have already been skipped, so there is already an ERROR + // at the top of the stack, then pop that ERROR off the stack and wrap the + // two ERRORs together into one larger ERROR. + if (node_count_since_error > 0) + { + StackSliceArray pop = ts_stack_pop_count(self->stack, version, 1); + + // TODO: Figure out how to make this condition occur. + // See https://github.com/atom/atom/issues/18450#issuecomment-439579778 + // If multiple stack versions have merged at this point, just pick one + // of the errors arbitrarily and discard the rest. + if (pop.size > 1) + { + for (unsigned i = 1; i < pop.size; i++) + { + ts_subtree_array_delete(&self->tree_pool, + &pop.contents[i].subtrees); + } + while (ts_stack_version_count(self->stack) > + pop.contents[0].version + 1) + { + ts_stack_remove_version(self->stack, + pop.contents[0].version + 1); + } + } + + ts_stack_renumber_version(self->stack, pop.contents[0].version, + version); + array_push(&pop.contents[0].subtrees, + ts_subtree_from_mut(error_repeat)); + error_repeat = + ts_subtree_new_node(ts_builtin_sym_error_repeat, + &pop.contents[0].subtrees, 0, self->language); + } + + // Push the new ERROR onto the stack. 
// Start error recovery for stack `version`, whose `lookahead` was rejected.
//
// Steps, in order:
//   1. Run every reduction available in the current state, independent of the
//      lookahead (symbol argument 0).
//   2. Visit `version` and then the indices previous_version_count ..
//      version_count - 1 (presumably the versions appended by step 1). Until
//      one attempt succeeds, try to recover by inserting a MISSING token on a
//      copy of the visited version; then push a NULL_SUBTREE entry in
//      ERROR_STATE onto the visited version.
//   3. Merge the versions visited after `version` back into `version`.
//   4. Record a stack summary of up to MAX_SUMMARY_DEPTH entries and call
//      ts_parser__recover right away with the current lookahead.
static void ts_parser__handle_error(TSParser *self, StackVersion version,
                                    Subtree lookahead)
{
    uint32_t previous_version_count = ts_stack_version_count(self->stack);

    // Perform any reductions that can happen in this state, regardless of the
    // lookahead. After skipping one or more invalid tokens, the parser might
    // find a token that would have allowed a reduction to take place.
    ts_parser__do_all_potential_reductions(self, version, 0);
    uint32_t version_count = ts_stack_version_count(self->stack);
    Length position = ts_stack_position(self->stack, version);

    // Push a discontinuity onto the stack. Merge all of the stack versions that
    // were created in the previous step.
    bool did_insert_missing_token = false;
    for (StackVersion v = version; v < version_count;)
    {
        if (!did_insert_missing_token)
        {
            TSStateId state = ts_stack_state(self->stack, v);
            // Candidate missing tokens are the symbols 1 .. token_count - 1.
            for (TSSymbol missing_symbol = 1;
                 missing_symbol < (uint16_t)self->language->token_count;
                 missing_symbol++)
            {
                TSStateId state_after_missing_symbol = ts_language_next_state(
                    self->language, state, missing_symbol);
                // Skip symbols whose next state is 0 or equal to the current
                // state.
                if (state_after_missing_symbol == 0 ||
                    state_after_missing_symbol == state)
                {
                    continue;
                }

                // Only consider the symbol if the lookahead's leaf symbol has
                // a reduce action in the state reached after it.
                if (ts_language_has_reduce_action(
                        self->language, state_after_missing_symbol,
                        ts_subtree_leaf_symbol(lookahead)))
                {
                    // In case the parser is currently outside of any included
                    // range, the lexer will snap to the beginning of the next
                    // included range. The missing token's padding must be
                    // assigned to position it within the next included range.
                    ts_lexer_reset(&self->lexer, position);
                    ts_lexer_mark_end(&self->lexer);
                    Length padding =
                        length_sub(self->lexer.token_end_position, position);
                    uint32_t lookahead_bytes =
                        ts_subtree_total_bytes(lookahead) +
                        ts_subtree_lookahead_bytes(lookahead);

                    // The missing leaf is pushed onto a copy, so `v` itself
                    // is not modified by the attempt.
                    StackVersion version_with_missing_tree =
                        ts_stack_copy_version(self->stack, v);
                    Subtree missing_tree = ts_subtree_new_missing_leaf(
                        &self->tree_pool, missing_symbol, padding,
                        lookahead_bytes, self->language);
                    ts_stack_push(self->stack, version_with_missing_tree,
                                  missing_tree, false,
                                  state_after_missing_symbol);

                    // The attempt counts as successful when the reductions
                    // restricted to the lookahead's leaf symbol report true.
                    if (ts_parser__do_all_potential_reductions(
                            self, version_with_missing_tree,
                            ts_subtree_leaf_symbol(lookahead)))
                    {
                        LOG("recover_with_missing symbol:%s, state:%u",
                            SYM_NAME(missing_symbol),
                            ts_stack_state(self->stack,
                                           version_with_missing_tree));
                        did_insert_missing_token = true;
                        break;
                    }
                }
            }
        }

        ts_stack_push(self->stack, v, NULL_SUBTREE, false, ERROR_STATE);
        // After `version`, continue with the first index that did not exist
        // before the reductions above.
        v = (v == version) ? previous_version_count : v + 1;
    }

    // Every iteration merges the version at index previous_version_count into
    // `version`. The index is never advanced -- presumably a successful merge
    // removes that version and shifts the later ones down; confirm against
    // ts_stack_merge.
    for (unsigned i = previous_version_count; i < version_count; i++)
    {
        bool did_merge =
            ts_stack_merge(self->stack, version, previous_version_count);
        assert(did_merge);
        (void)did_merge; // fix warning/error with clang -Os
    }

    ts_stack_record_summary(self->stack, version, MAX_SUMMARY_DEPTH);

    // Begin recovery with the current lookahead node, rather than waiting for
    // the next turn of the parse loop. This ensures that the tree accounts for
    // the current lookahead token's "lookahead bytes" value, which describes
    // how far the lexer needed to look ahead beyond the content of the token in
    // order to recognize it.
    if (ts_subtree_child_count(lookahead) > 0)
    {
        ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE,
                                       &self->reusable_node);
    }
    ts_parser__recover(self, version, lookahead);

    LOG_STACK();
}
+ else + { + ts_language_table_entry(self->language, state, + ts_builtin_sym_end, &table_entry); + } + } + + // If a cancellation flag or a timeout was provided, then check every + // time a fixed number of parse actions has been processed. + + // Process each parse action for the current lookahead token in + // the current state. If there are multiple actions, then this is + // an ambiguous state. REDUCE actions always create a new stack + // version, whereas SHIFT actions update the existing stack version + // and terminate this loop. + StackVersion last_reduction_version = STACK_VERSION_NONE; + for (uint32_t i = 0; i < table_entry.action_count; i++) + { + TSParseAction action = table_entry.actions[i]; + + switch (action.type) + { + case TSParseActionTypeShift: { + if (action.shift.repetition) + break; + TSStateId next_state; + if (action.shift.extra) + { + next_state = state; + LOG("shift_extra"); + } + else + { + next_state = action.shift.state; + LOG("shift state:%u", next_state); + } + + if (ts_subtree_child_count(lookahead) > 0) + { + ts_parser__breakdown_lookahead(self, &lookahead, state, + &self->reusable_node); + next_state = ts_language_next_state( + self->language, state, ts_subtree_symbol(lookahead)); + } + + ts_parser__shift(self, version, next_state, lookahead, + action.shift.extra); + if (did_reuse) + reusable_node_advance(&self->reusable_node); + return true; + } + + case TSParseActionTypeReduce: { + bool is_fragile = table_entry.action_count > 1; + bool end_of_non_terminal_extra = lookahead.ptr == NULL; + LOG("reduce sym:%s, child_count:%u", + SYM_NAME(action.reduce.symbol), action.reduce.child_count); + StackVersion reduction_version = ts_parser__reduce( + self, version, action.reduce.symbol, + action.reduce.child_count, action.reduce.dynamic_precedence, + action.reduce.production_id, is_fragile, + end_of_non_terminal_extra); + if (reduction_version != STACK_VERSION_NONE) + { + last_reduction_version = reduction_version; + } + break; + } + + case 
TSParseActionTypeAccept: { + LOG("accept"); + ts_parser__accept(self, version, lookahead); + return true; + } + + case TSParseActionTypeRecover: { + if (ts_subtree_child_count(lookahead) > 0) + { + ts_parser__breakdown_lookahead( + self, &lookahead, ERROR_STATE, &self->reusable_node); + } + + ts_parser__recover(self, version, lookahead); + if (did_reuse) + reusable_node_advance(&self->reusable_node); + return true; + } + } + } + + // If a reduction was performed, then replace the current stack version + // with one of the stack versions created by a reduction, and continue + // processing this version of the stack with the same lookahead symbol. + if (last_reduction_version != STACK_VERSION_NONE) + { + ts_stack_renumber_version(self->stack, last_reduction_version, + version); + LOG_STACK(); + state = ts_stack_state(self->stack, version); + + // At the end of a non-terminal extra rule, the lexer will return a + // null subtree, because the parser needs to perform a fixed + // reduction regardless of the lookahead node. After performing that + // reduction, (and completing the non-terminal extra rule) run the + // lexer again based on the current parse state. + if (!lookahead.ptr) + { + needs_lex = true; + } + else + { + ts_language_table_entry(self->language, state, + ts_subtree_leaf_symbol(lookahead), + &table_entry); + } + + continue; + } + + // A non-terminal extra rule was reduced and merged into an existing + // stack version. This version can be discarded. + if (!lookahead.ptr) + { + ts_stack_halt(self->stack, version); + return true; + } + + // If there were no parse actions for the current lookahead token, then + // it is not valid in this state. If the current lookahead token is a + // keyword, then switch to treating it as the normal word token if that + // token is valid in this state. 
+ if (ts_subtree_is_keyword(lookahead) && + ts_subtree_symbol(lookahead) != + self->language->keyword_capture_token) + { + ts_language_table_entry(self->language, state, + self->language->keyword_capture_token, + &table_entry); + if (table_entry.action_count > 0) + { + LOG("switch from_keyword:%s, to_word_token:%s", + TREE_NAME(lookahead), + SYM_NAME(self->language->keyword_capture_token)); + + MutableSubtree mutable_lookahead = + ts_subtree_make_mut(&self->tree_pool, lookahead); + ts_subtree_set_symbol(&mutable_lookahead, + self->language->keyword_capture_token, + self->language); + lookahead = ts_subtree_from_mut(mutable_lookahead); + continue; + } + } + + // If the current lookahead token is not valid and the parser is + // already in the error state, restart the error recovery process. + // TODO - can this be unified with the other `RECOVER` case above? + if (state == ERROR_STATE) + { + ts_parser__recover(self, version, lookahead); + return true; + } + + // If the current lookahead token is not valid and the previous + // subtree on the stack was reused from an old tree, it isn't actually + // valid to reuse it. Remove it from the stack, and in its place, + // push each of its children. Then try again to process the current + // lookahead. + if (ts_parser__breakdown_top_of_stack(self, version)) + { + state = ts_stack_state(self->stack, version); + ts_subtree_release(&self->tree_pool, lookahead); + needs_lex = true; + continue; + } + + // At this point, the current lookahead token is definitely not valid + // for this parse stack version. Mark this version as paused and + // continue processing any other stack versions that might exist. If + // some other version advances successfully, then this version can + // simply be removed. But if all versions end up paused, then error + // recovery is needed. 
+ LOG("detect_error"); + ts_stack_pause(self->stack, version, lookahead); + return true; + } +} + +static unsigned ts_parser__condense_stack(TSParser *self) +{ + bool made_changes = false; + unsigned min_error_cost = UINT_MAX; + for (StackVersion i = 0; i < ts_stack_version_count(self->stack); i++) + { + // Prune any versions that have been marked for removal. + if (ts_stack_is_halted(self->stack, i)) + { + ts_stack_remove_version(self->stack, i); + i--; + continue; + } + + // Keep track of the minimum error cost of any stack version so + // that it can be returned. + ErrorStatus status_i = ts_parser__version_status(self, i); + if (!status_i.is_in_error && status_i.cost < min_error_cost) + { + min_error_cost = status_i.cost; + } + + // Examine each pair of stack versions, removing any versions that + // are clearly worse than another version. Ensure that the versions + // are ordered from most promising to least promising. + for (StackVersion j = 0; j < i; j++) + { + ErrorStatus status_j = ts_parser__version_status(self, j); + + switch (ts_parser__compare_versions(self, status_j, status_i)) + { + case ErrorComparisonTakeLeft: + made_changes = true; + ts_stack_remove_version(self->stack, i); + i--; + j = i; + break; + + case ErrorComparisonPreferLeft: + case ErrorComparisonNone: + if (ts_stack_merge(self->stack, j, i)) + { + made_changes = true; + i--; + j = i; + } + break; + + case ErrorComparisonPreferRight: + made_changes = true; + if (ts_stack_merge(self->stack, j, i)) + { + i--; + j = i; + } + else + { + ts_stack_swap_versions(self->stack, i, j); + } + break; + + case ErrorComparisonTakeRight: + made_changes = true; + ts_stack_remove_version(self->stack, j); + i--; + j--; + break; + } + } + } + + // Enforce a hard upper bound on the number of stack versions by + // discarding the least promising versions. 
+ while (ts_stack_version_count(self->stack) > MAX_VERSION_COUNT) + { + ts_stack_remove_version(self->stack, MAX_VERSION_COUNT); + made_changes = true; + } + + // If the best-performing stack version is currently paused, or all + // versions are paused, then resume the best paused version and begin + // the error recovery process. Otherwise, remove the paused versions. + if (ts_stack_version_count(self->stack) > 0) + { + bool has_unpaused_version = false; + for (StackVersion i = 0, n = ts_stack_version_count(self->stack); i < n; + i++) + { + if (ts_stack_is_paused(self->stack, i)) + { + if (!has_unpaused_version && + self->accept_count < MAX_VERSION_COUNT) + { + LOG("resume version:%u", i); + min_error_cost = ts_stack_error_cost(self->stack, i); + Subtree lookahead = ts_stack_resume(self->stack, i); + ts_parser__handle_error(self, i, lookahead); + has_unpaused_version = true; + } + else + { + ts_stack_remove_version(self->stack, i); + i--; + n--; + } + } + else + { + has_unpaused_version = true; + } + } + } + + if (made_changes) + { + LOG("condense"); + LOG_STACK(); + } + + return min_error_cost; +} + +static bool ts_parser_has_outstanding_parse(TSParser *self) +{ + return (self->external_scanner_payload || + ts_stack_state(self->stack, 0) != 1 || + ts_stack_node_count_since_error(self->stack, 0) != 0); +} + +// Parser - Public + +TSParser *ts_parser_new(void) +{ + TSParser *self = calloc(1, sizeof(TSParser)); + ts_lexer_init(&self->lexer); + array_init(&self->reduce_actions); + array_reserve(&self->reduce_actions, 4); + self->tree_pool = ts_subtree_pool_new(32); + self->stack = ts_stack_new(&self->tree_pool); + self->finished_tree = NULL_SUBTREE; + self->reusable_node = reusable_node_new(); + self->dot_graph_file = NULL; + self->cancellation_flag = NULL; + self->timeout_duration = 0; + self->language = NULL; + self->has_scanner_error = false; + self->external_scanner_payload = NULL; + self->end_clock = 0; + self->operation_count = 0; + self->old_tree = 
NULL_SUBTREE; + self->included_range_differences = (ArrayRange)array_new(); + self->included_range_difference_index = 0; + ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); + return self; +} + +void ts_parser_delete(TSParser *self) +{ + if (!self) + return; + + ts_parser_set_language(self, NULL); + ts_stack_delete(self->stack); + if (self->reduce_actions.contents) + { + array_delete(&self->reduce_actions); + } + if (self->included_range_differences.contents) + { + array_delete(&self->included_range_differences); + } + if (self->old_tree.ptr) + { + ts_subtree_release(&self->tree_pool, self->old_tree); + self->old_tree = NULL_SUBTREE; + } + ts_lexer_delete(&self->lexer); + ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); + ts_subtree_pool_delete(&self->tree_pool); + reusable_node_delete(&self->reusable_node); + array_delete(&self->trailing_extras); + array_delete(&self->trailing_extras2); + array_delete(&self->scratch_trees); + free(self); +} + +const TSLanguage *ts_parser_language(const TSParser *self) +{ + return self->language; +} + +bool ts_parser_set_language(TSParser *self, const TSLanguage *language) +{ + ts_parser_reset(self); + ts_language_delete(self->language); + self->language = NULL; + + if (language) + { + if (language->version > TREE_SITTER_LANGUAGE_VERSION || + language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION) + return false; + } + + self->language = ts_language_copy(language); + return true; +} + +TSLogger ts_parser_logger(const TSParser *self) +{ + return self->lexer.logger; +} + +void ts_parser_set_logger(TSParser *self, TSLogger logger) +{ + self->lexer.logger = logger; +} + +void ts_parser_print_dot_graphs(TSParser *self, int fd) +{ + if (self->dot_graph_file) + { + fclose(self->dot_graph_file); + } + + if (fd >= 0) + { +#ifdef _WIN32 + self->dot_graph_file = _fdopen(fd, "a"); +#else + self->dot_graph_file = fdopen(fd, "a"); +#endif + } + else + { + self->dot_graph_file = NULL; + } +} + +const 
size_t *ts_parser_cancellation_flag(const TSParser *self) +{ + return (const size_t *)self->cancellation_flag; +} + +void ts_parser_set_cancellation_flag(TSParser *self, const size_t *flag) +{ + self->cancellation_flag = (const volatile size_t *)flag; +} + +uint64_t ts_parser_timeout_micros(const TSParser *self) +{ + return 0; +} + +void ts_parser_set_timeout_micros(TSParser *self, uint64_t timeout_micros) +{ + (void)(timeout_micros); + self->timeout_duration = 0; +} + +bool ts_parser_set_included_ranges(TSParser *self, const TSRange *ranges, + uint32_t count) +{ + return ts_lexer_set_included_ranges(&self->lexer, ranges, count); +} + +const TSRange *ts_parser_included_ranges(const TSParser *self, uint32_t *count) +{ + return ts_lexer_included_ranges(&self->lexer, count); +} + +void ts_parser_reset(TSParser *self) +{ + ts_parser__external_scanner_destroy(self); + + if (self->old_tree.ptr) + { + ts_subtree_release(&self->tree_pool, self->old_tree); + self->old_tree = NULL_SUBTREE; + } + + reusable_node_clear(&self->reusable_node); + ts_lexer_reset(&self->lexer, length_zero()); + ts_stack_clear(self->stack); + ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE); + if (self->finished_tree.ptr) + { + ts_subtree_release(&self->tree_pool, self->finished_tree); + self->finished_tree = NULL_SUBTREE; + } + self->accept_count = 0; + self->has_scanner_error = false; +} + +TSTree *ts_parser_parse(TSParser *self, const TSTree *old_tree, TSInput input) +{ + TSTree *result = NULL; + old_tree = NULL; + (void)(old_tree); + if (!self->language || !input.read) + return NULL; + + ts_lexer_set_input(&self->lexer, input); + array_clear(&self->included_range_differences); + self->included_range_difference_index = 0; + + if (ts_parser_has_outstanding_parse(self)) + { + LOG("resume_parsing"); + } + else + { + ts_parser__external_scanner_create(self); + if (self->has_scanner_error) + goto exit; + + reusable_node_clear(&self->reusable_node); + LOG("new_parse"); + } + + 
self->operation_count = 0; + + uint32_t position = 0, last_position = 0, version_count = 0; + do + { + for (StackVersion version = 0; + version_count = ts_stack_version_count(self->stack), + version < version_count; + version++) + { + bool allow_node_reuse = version_count == 1; + while (ts_stack_is_active(self->stack, version)) + { + LOG("process version:%u, version_count:%u, state:%d, row:%u, " + "col:%u", + version, ts_stack_version_count(self->stack), + ts_stack_state(self->stack, version), + ts_stack_position(self->stack, version).extent.row, + ts_stack_position(self->stack, version).extent.column); + + if (!ts_parser__advance(self, version, allow_node_reuse)) + { + if (self->has_scanner_error) + goto exit; + return NULL; + } + + LOG_STACK(); + + position = ts_stack_position(self->stack, version).bytes; + if (position > last_position || + (version > 0 && position == last_position)) + { + last_position = position; + break; + } + } + } + + // After advancing each version of the stack, re-sort the versions by + // their cost, removing any versions that are no longer worth pursuing. + unsigned min_error_cost = ts_parser__condense_stack(self); + + // If there's already a finished parse tree that's better than any + // in-progress version, then terminate parsing. Clear the parse stack to + // remove any extra references to subtrees within the finished tree, + // ensuring that these subtrees can be safely mutated in-place for + // rebalancing. 
+ if (self->finished_tree.ptr && + ts_subtree_error_cost(self->finished_tree) < min_error_cost) + { + ts_stack_clear(self->stack); + break; + } + + while (self->included_range_difference_index < + self->included_range_differences.size) + { + TSRange *range = + &self->included_range_differences + .contents[self->included_range_difference_index]; + if (range->end_byte <= position) + { + self->included_range_difference_index++; + } + else + { + break; + } + } + } while (version_count != 0); + + assert(self->finished_tree.ptr); + ts_subtree_balance(self->finished_tree, &self->tree_pool, self->language); + LOG("done"); + LOG_TREE(self->finished_tree); + + result = ts_tree_new(self->finished_tree, self->language, + self->lexer.included_ranges, + self->lexer.included_range_count); + self->finished_tree = NULL_SUBTREE; + +exit: + ts_parser_reset(self); + return result; +} + +TSTree *ts_parser_parse_string(TSParser *self, const TSTree *old_tree, + const char *string, uint32_t length) +{ + return ts_parser_parse_string_encoding(self, old_tree, string, length, + TSInputEncodingUTF8); +} + +TSTree *ts_parser_parse_string_encoding(TSParser *self, const TSTree *old_tree, + const char *string, uint32_t length, + TSInputEncoding encoding) +{ + TSStringInput input = {string, length}; + return ts_parser_parse(self, old_tree, + (TSInput){ + &input, + ts_string_input_read, + encoding, + }); +} + +#undef LOG diff --git a/shcat_c/parser/src/parser.h b/shcat_c/parser/src/parser.h new file mode 100644 index 00000000..17f0e94b --- /dev/null +++ b/shcat_c/parser/src/parser.h @@ -0,0 +1,265 @@ +#ifndef TREE_SITTER_PARSER_H_ +#define TREE_SITTER_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#define ts_builtin_sym_error ((TSSymbol)-1) +#define ts_builtin_sym_end 0 +#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 + +#ifndef TREE_SITTER_API_H_ +typedef uint16_t TSStateId; +typedef uint16_t TSSymbol; +typedef uint16_t TSFieldId; +typedef struct 
TSLanguage TSLanguage; +#endif + +typedef struct { + TSFieldId field_id; + uint8_t child_index; + bool inherited; +} TSFieldMapEntry; + +typedef struct { + uint16_t index; + uint16_t length; +} TSFieldMapSlice; + +typedef struct { + bool visible; + bool named; + bool supertype; +} TSSymbolMetadata; + +typedef struct TSLexer TSLexer; + +struct TSLexer { + int32_t lookahead; + TSSymbol result_symbol; + void (*advance)(TSLexer *, bool); + void (*mark_end)(TSLexer *); + uint32_t (*get_column)(TSLexer *); + bool (*is_at_included_range_start)(const TSLexer *); + bool (*eof)(const TSLexer *); +}; + +typedef enum { + TSParseActionTypeShift, + TSParseActionTypeReduce, + TSParseActionTypeAccept, + TSParseActionTypeRecover, +} TSParseActionType; + +typedef union { + struct { + uint8_t type; + TSStateId state; + bool extra; + bool repetition; + } shift; + struct { + uint8_t type; + uint8_t child_count; + TSSymbol symbol; + int16_t dynamic_precedence; + uint16_t production_id; + } reduce; + uint8_t type; +} TSParseAction; + +typedef struct { + uint16_t lex_state; + uint16_t external_lex_state; +} TSLexMode; + +typedef union { + TSParseAction action; + struct { + uint8_t count; + bool reusable; + } entry; +} TSParseActionEntry; + +typedef struct { + int32_t start; + int32_t end; +} TSCharacterRange; + +struct TSLanguage { + uint32_t version; + uint32_t symbol_count; + uint32_t alias_count; + uint32_t token_count; + uint32_t external_token_count; + uint32_t state_count; + uint32_t large_state_count; + uint32_t production_id_count; + uint32_t field_count; + uint16_t max_alias_sequence_length; + const uint16_t *parse_table; + const uint16_t *small_parse_table; + const uint32_t *small_parse_table_map; + const TSParseActionEntry *parse_actions; + const char * const *symbol_names; + const char * const *field_names; + const TSFieldMapSlice *field_map_slices; + const TSFieldMapEntry *field_map_entries; + const TSSymbolMetadata *symbol_metadata; + const TSSymbol *public_symbol_map; + 
const uint16_t *alias_map; + const TSSymbol *alias_sequences; + const TSLexMode *lex_modes; + bool (*lex_fn)(TSLexer *, TSStateId); + bool (*keyword_lex_fn)(TSLexer *, TSStateId); + TSSymbol keyword_capture_token; + struct { + const bool *states; + const TSSymbol *symbol_map; + void *(*create)(void); + void (*destroy)(void *); + bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); + unsigned (*serialize)(void *, char *); + void (*deserialize)(void *, const char *, unsigned); + } external_scanner; + const TSStateId *primary_state_ids; +}; + +static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) { + uint32_t index = 0; + uint32_t size = len - index; + while (size > 1) { + uint32_t half_size = size / 2; + uint32_t mid_index = index + half_size; + TSCharacterRange *range = &ranges[mid_index]; + if (lookahead >= range->start && lookahead <= range->end) { + return true; + } else if (lookahead > range->end) { + index = mid_index; + } + size -= half_size; + } + TSCharacterRange *range = &ranges[index]; + return (lookahead >= range->start && lookahead <= range->end); +} + +/* + * Lexer Macros + */ + +#ifdef _MSC_VER +#define UNUSED __pragma(warning(suppress : 4101)) +#else +#define UNUSED __attribute__((unused)) +#endif + +#define START_LEXER() \ + bool result = false; \ + bool skip = false; \ + UNUSED \ + bool eof = false; \ + int32_t lookahead; \ + goto start; \ + next_state: \ + lexer->advance(lexer, skip); \ + start: \ + skip = false; \ + lookahead = lexer->lookahead; + +#define ADVANCE(state_value) \ + { \ + state = state_value; \ + goto next_state; \ + } + +#define ADVANCE_MAP(...) 
\ + { \ + static const uint16_t map[] = { __VA_ARGS__ }; \ + for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) { \ + if (map[i] == lookahead) { \ + state = map[i + 1]; \ + goto next_state; \ + } \ + } \ + } + +#define SKIP(state_value) \ + { \ + skip = true; \ + state = state_value; \ + goto next_state; \ + } + +#define ACCEPT_TOKEN(symbol_value) \ + result = true; \ + lexer->result_symbol = symbol_value; \ + lexer->mark_end(lexer); + +#define END_STATE() return result; + +/* + * Parse Table Macros + */ + +#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT) + +#define STATE(id) id + +#define ACTIONS(id) id + +#define SHIFT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = (state_value) \ + } \ + }} + +#define SHIFT_REPEAT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = (state_value), \ + .repetition = true \ + } \ + }} + +#define SHIFT_EXTRA() \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .extra = true \ + } \ + }} + +#define REDUCE(symbol_name, children, precedence, prod_id) \ + {{ \ + .reduce = { \ + .type = TSParseActionTypeReduce, \ + .symbol = symbol_name, \ + .child_count = children, \ + .dynamic_precedence = precedence, \ + .production_id = prod_id \ + }, \ + }} + +#define RECOVER() \ + {{ \ + .type = TSParseActionTypeRecover \ + }} + +#define ACCEPT_INPUT() \ + {{ \ + .type = TSParseActionTypeAccept \ + }} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_PARSER_H_ diff --git a/shcat_c/parser/src/point.h b/shcat_c/parser/src/point.h new file mode 100644 index 00000000..37346c8d --- /dev/null +++ b/shcat_c/parser/src/point.h @@ -0,0 +1,62 @@ +#ifndef TREE_SITTER_POINT_H_ +#define TREE_SITTER_POINT_H_ + +#include "tree_sitter/api.h" + +#define POINT_ZERO ((TSPoint) {0, 0}) +#define POINT_MAX ((TSPoint) {UINT32_MAX, UINT32_MAX}) + +static inline TSPoint point__new(unsigned row, unsigned column) { + TSPoint result = {row, column}; + return result; +} + 
+static inline TSPoint point_add(TSPoint a, TSPoint b) { + if (b.row > 0) + return point__new(a.row + b.row, b.column); + else + return point__new(a.row, a.column + b.column); +} + +static inline TSPoint point_sub(TSPoint a, TSPoint b) { + if (a.row > b.row) + return point__new(a.row - b.row, a.column); + else + return point__new(0, a.column - b.column); +} + +static inline bool point_lte(TSPoint a, TSPoint b) { + return (a.row < b.row) || (a.row == b.row && a.column <= b.column); +} + +static inline bool point_lt(TSPoint a, TSPoint b) { + return (a.row < b.row) || (a.row == b.row && a.column < b.column); +} + +static inline bool point_gt(TSPoint a, TSPoint b) { + return (a.row > b.row) || (a.row == b.row && a.column > b.column); +} + +static inline bool point_gte(TSPoint a, TSPoint b) { + return (a.row > b.row) || (a.row == b.row && a.column >= b.column); +} + +static inline bool point_eq(TSPoint a, TSPoint b) { + return a.row == b.row && a.column == b.column; +} + +static inline TSPoint point_min(TSPoint a, TSPoint b) { + if (a.row < b.row || (a.row == b.row && a.column < b.column)) + return a; + else + return b; +} + +static inline TSPoint point_max(TSPoint a, TSPoint b) { + if (a.row > b.row || (a.row == b.row && a.column > b.column)) + return a; + else + return b; +} + +#endif diff --git a/shcat_c/parser/src/reduce_action.h b/shcat_c/parser/src/reduce_action.h new file mode 100644 index 00000000..72aff08d --- /dev/null +++ b/shcat_c/parser/src/reduce_action.h @@ -0,0 +1,34 @@ +#ifndef TREE_SITTER_REDUCE_ACTION_H_ +#define TREE_SITTER_REDUCE_ACTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./array.h" +#include "tree_sitter/api.h" + +typedef struct { + uint32_t count; + TSSymbol symbol; + int dynamic_precedence; + unsigned short production_id; +} ReduceAction; + +typedef Array(ReduceAction) ReduceActionSet; + +static inline void ts_reduce_action_set_add(ReduceActionSet *self, + ReduceAction new_action) { + for (uint32_t i = 0; i < self->size; 
i++) { + ReduceAction action = self->contents[i]; + if (action.symbol == new_action.symbol && action.count == new_action.count) + return; + } + array_push(self, new_action); +} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_REDUCE_ACTION_H_ diff --git a/shcat_c/parser/src/reusable_node.h b/shcat_c/parser/src/reusable_node.h new file mode 100644 index 00000000..63fe3c1a --- /dev/null +++ b/shcat_c/parser/src/reusable_node.h @@ -0,0 +1,95 @@ +#include "./subtree.h" + +typedef struct { + Subtree tree; + uint32_t child_index; + uint32_t byte_offset; +} StackEntry; + +typedef struct { + Array(StackEntry) stack; + Subtree last_external_token; +} ReusableNode; + +static inline ReusableNode reusable_node_new(void) { + return (ReusableNode) {array_new(), NULL_SUBTREE}; +} + +static inline void reusable_node_clear(ReusableNode *self) { + array_clear(&self->stack); + self->last_external_token = NULL_SUBTREE; +} + +static inline Subtree reusable_node_tree(ReusableNode *self) { + return self->stack.size > 0 + ? self->stack.contents[self->stack.size - 1].tree + : NULL_SUBTREE; +} + +static inline uint32_t reusable_node_byte_offset(ReusableNode *self) { + return self->stack.size > 0 + ? 
self->stack.contents[self->stack.size - 1].byte_offset + : UINT32_MAX; +} + +static inline void reusable_node_delete(ReusableNode *self) { + array_delete(&self->stack); +} + +static inline void reusable_node_advance(ReusableNode *self) { + StackEntry last_entry = *array_back(&self->stack); + uint32_t byte_offset = last_entry.byte_offset + ts_subtree_total_bytes(last_entry.tree); + if (ts_subtree_has_external_tokens(last_entry.tree)) { + self->last_external_token = ts_subtree_last_external_token(last_entry.tree); + } + + Subtree tree; + uint32_t next_index; + do { + StackEntry popped_entry = array_pop(&self->stack); + next_index = popped_entry.child_index + 1; + if (self->stack.size == 0) return; + tree = array_back(&self->stack)->tree; + } while (ts_subtree_child_count(tree) <= next_index); + + array_push(&self->stack, ((StackEntry) { + .tree = ts_subtree_children(tree)[next_index], + .child_index = next_index, + .byte_offset = byte_offset, + })); +} + +static inline bool reusable_node_descend(ReusableNode *self) { + StackEntry last_entry = *array_back(&self->stack); + if (ts_subtree_child_count(last_entry.tree) > 0) { + array_push(&self->stack, ((StackEntry) { + .tree = ts_subtree_children(last_entry.tree)[0], + .child_index = 0, + .byte_offset = last_entry.byte_offset, + })); + return true; + } else { + return false; + } +} + +static inline void reusable_node_advance_past_leaf(ReusableNode *self) { + while (reusable_node_descend(self)) {} + reusable_node_advance(self); +} + +static inline void reusable_node_reset(ReusableNode *self, Subtree tree) { + reusable_node_clear(self); + array_push(&self->stack, ((StackEntry) { + .tree = tree, + .child_index = 0, + .byte_offset = 0, + })); + + // Never reuse the root node, because it has a non-standard internal structure + // due to transformations that are applied when it is accepted: adding the EOF + // child and any extra children. 
+ if (!reusable_node_descend(self)) { + reusable_node_clear(self); + } +} diff --git a/shcat_c/parser/src/scanner.c b/shcat_c/parser/src/scanner.c new file mode 100644 index 00000000..378f6fda --- /dev/null +++ b/shcat_c/parser/src/scanner.c @@ -0,0 +1,1514 @@ +#include "tree_sitter/array.h" +#include "tree_sitter/parser.h" + +#include +#include +#include +#include + +enum TokenType +{ + HEREDOC_START, + SIMPLE_HEREDOC_BODY, + HEREDOC_BODY_BEGINNING, + HEREDOC_CONTENT, + HEREDOC_END, + FILE_DESCRIPTOR, + EMPTY_VALUE, + CONCAT, + VARIABLE_NAME, + TEST_OPERATOR, + REGEX, + REGEX_NO_SLASH, + REGEX_NO_SPACE, + EXPANSION_WORD, + EXTGLOB_PATTERN, + BARE_DOLLAR, + BRACE_START, + IMMEDIATE_DOUBLE_HASH, + EXTERNAL_EXPANSION_SYM_HASH, + EXTERNAL_EXPANSION_SYM_BANG, + EXTERNAL_EXPANSION_SYM_EQUAL, + CLOSING_BRACE, + CLOSING_BRACKET, + HEREDOC_ARROW, + HEREDOC_ARROW_DASH, + NEWLINE, + OPENING_PAREN, + ESAC, + ERROR_RECOVERY, +}; + +typedef Array(char) t_string; +// typedef void *String; + +typedef struct s_heredoc +{ + bool is_raw; + bool started; + bool allows_indent; + t_string delimiter; + t_string current_leading_word; +} t_heredoc; + +static inline t_heredoc heredoc_new(void) +{ + return ((t_heredoc){ + .is_raw = false, + .started = false, + .allows_indent = false, + .delimiter = array_new(), + .current_leading_word = array_new(), + }); +} + +typedef struct s_scanner +{ + uint8_t last_glob_paren_depth; + bool ext_was_in_double_quote; + bool ext_saw_outside_quote; + Array(t_heredoc) heredocs; +} t_scanner; + +static inline void advance(TSLexer *lexer) +{ + lexer->advance(lexer, false); +} + +static inline void skip(TSLexer *lexer) +{ + lexer->advance(lexer, true); +} + +static inline bool in_error_recovery(const bool *valid_symbols) +{ + return valid_symbols[ERROR_RECOVERY]; +} + +static inline void reset_string(t_string *string) +{ + if (string->size > 0) + { + memset(string->contents, 0, string->size); + array_clear(string); + } +} + +static inline void 
reset_heredoc(t_heredoc *heredoc) +{ + heredoc->is_raw = false; + heredoc->started = false; + heredoc->allows_indent = false; + reset_string(&heredoc->delimiter); +} + +static inline void reset(t_scanner *scanner) +{ + uint32_t i; + + i = 0; + while (i < scanner->heredocs.size) + { + reset_heredoc(array_get(&scanner->heredocs, i)); + i++; + } +} + +static unsigned serialize(t_scanner *scanner, char *buffer) +{ + uint32_t size; + uint32_t i; + t_heredoc *heredoc; + + size = 0; + buffer[size++] = (char)scanner->last_glob_paren_depth; + buffer[size++] = (char)scanner->ext_was_in_double_quote; + buffer[size++] = (char)scanner->ext_saw_outside_quote; + buffer[size++] = (char)scanner->heredocs.size; + i = 0; + while (i < scanner->heredocs.size) + { + heredoc = array_get(&scanner->heredocs, i); + if (heredoc->delimiter.size + 3 + size >= + TREE_SITTER_SERIALIZATION_BUFFER_SIZE) + return 0; + buffer[size++] = (char)heredoc->is_raw; + buffer[size++] = (char)heredoc->started; + buffer[size++] = (char)heredoc->allows_indent; + memcpy(&buffer[size], &heredoc->delimiter.size, sizeof(uint32_t)); + size += sizeof(uint32_t); + if (heredoc->delimiter.size > 0) + { + memcpy(&buffer[size], heredoc->delimiter.contents, + heredoc->delimiter.size); + size += heredoc->delimiter.size; + } + i++; + } + return size; +} + +static void deserialize(t_scanner *scanner, const char *buffer, unsigned length) +{ + uint32_t size; + uint32_t heredoc_count; + t_heredoc *heredoc; + uint32_t i; + + size = 0; + if (length == 0) + reset(scanner); + else + { + scanner->last_glob_paren_depth = buffer[size++]; + scanner->ext_was_in_double_quote = buffer[size++]; + scanner->ext_saw_outside_quote = buffer[size++]; + heredoc_count = (unsigned char)buffer[size++]; + i = 0; + while (i < heredoc_count) + { + heredoc = NULL; + if (i < scanner->heredocs.size) + heredoc = array_get(&scanner->heredocs, i); + else + { + array_push(&scanner->heredocs, heredoc_new()); + heredoc = array_back(&scanner->heredocs); + } + 
heredoc->is_raw = buffer[size++]; + heredoc->started = buffer[size++]; + heredoc->allows_indent = buffer[size++]; + memcpy(&heredoc->delimiter.size, &buffer[size], sizeof(uint32_t)); + size += sizeof(uint32_t); + array_reserve(&heredoc->delimiter, heredoc->delimiter.size); + if (heredoc->delimiter.size > 0) + { + memcpy(heredoc->delimiter.contents, &buffer[size], + heredoc->delimiter.size); + size += heredoc->delimiter.size; + } + i++; + } + assert(size == length); + } +} + +/** + * Consume a "word" in POSIX parlance, and returns it unquoted. + * + * This is an approximate implementation that doesn't deal with any + * POSIX-mandated substitution, and assumes the default value for + * IFS. + */ +static bool advance_word(TSLexer *lexer, t_string *unquoted_word) +{ + bool empty; + int32_t quote; + + quote = 0; + empty = true; + if (lexer->lookahead == '\'' || lexer->lookahead == '"') + (quote = lexer->lookahead, advance(lexer)); + while (lexer->lookahead && + !((quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || + lexer->lookahead == '\n' + : isspace(lexer->lookahead)))) + { + if (lexer->lookahead == '\\') + { + advance(lexer); + if (!lexer->lookahead) + return (false); + } + empty = false; + array_push(unquoted_word, lexer->lookahead); + advance(lexer); + } + array_push(unquoted_word, '\0'); + if (quote && lexer->lookahead == quote) + advance(lexer); + return (!empty); +} + +static inline bool scan_bare_dollar(TSLexer *lexer) +{ + while (isspace(lexer->lookahead) && lexer->lookahead != '\n' && + !lexer->eof(lexer)) + skip(lexer); + + if (lexer->lookahead == '$') + { + advance(lexer); + lexer->result_symbol = BARE_DOLLAR; + lexer->mark_end(lexer); + return (isspace(lexer->lookahead) || lexer->eof(lexer) || + lexer->lookahead == '\"'); + } + + return (false); +} + +static bool scan_heredoc_start(t_heredoc *heredoc, TSLexer *lexer) +{ + bool found_delimiter; + + while (isspace(lexer->lookahead)) + skip(lexer); + lexer->result_symbol = HEREDOC_START; + 
heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' || + lexer->lookahead == '\\'; + found_delimiter = advance_word(lexer, &heredoc->delimiter); + if (!found_delimiter) + { + reset_string(&heredoc->delimiter); + return false; + } + return found_delimiter; +} + +static bool scan_heredoc_end_identifier(t_heredoc *heredoc, TSLexer *lexer) +{ + reset_string(&heredoc->current_leading_word); + // Scan the first 'n' characters on this line, to see if they match the + // heredoc delimiter + int32_t size; + + size = 0; + if (heredoc->delimiter.size > 0) + { + while (lexer->lookahead != '\0' && lexer->lookahead != '\n' && + (int32_t)*array_get(&heredoc->delimiter, size) == + lexer->lookahead && + heredoc->current_leading_word.size < heredoc->delimiter.size) + { + array_push(&heredoc->current_leading_word, lexer->lookahead); + advance(lexer); + size++; + } + } + array_push(&heredoc->current_leading_word, '\0'); + return heredoc->delimiter.size == 0 + ? false + : strcmp(heredoc->current_leading_word.contents, + heredoc->delimiter.contents) == 0; +} + +static bool scan_heredoc_content(t_scanner *scanner, TSLexer *lexer, + enum TokenType middle_type, + enum TokenType end_type) +{ + bool did_advance = false; + t_heredoc *heredoc = array_back(&scanner->heredocs); + + for (;;) + { + switch (lexer->lookahead) + { + case '\0': { + if (lexer->eof(lexer) && did_advance) + { + reset_heredoc(heredoc); + lexer->result_symbol = end_type; + return true; + } + return false; + } + + case '\\': { + did_advance = true; + advance(lexer); + advance(lexer); + break; + } + + case '$': { + if (heredoc->is_raw) + { + did_advance = true; + advance(lexer); + break; + } + if (did_advance) + { + lexer->mark_end(lexer); + lexer->result_symbol = middle_type; + heredoc->started = true; + advance(lexer); + if (isalpha(lexer->lookahead) || lexer->lookahead == '{' || + lexer->lookahead == '(') + { + return true; + } + break; + } + if (middle_type == HEREDOC_BODY_BEGINNING && + 
lexer->get_column(lexer) == 0) + { + lexer->result_symbol = middle_type; + heredoc->started = true; + return true; + } + return false; + } + + case '\n': { + if (!did_advance) + { + skip(lexer); + } + else + { + advance(lexer); + } + did_advance = true; + if (heredoc->allows_indent) + { + while (isspace(lexer->lookahead)) + { + advance(lexer); + } + } + lexer->result_symbol = heredoc->started ? middle_type : end_type; + lexer->mark_end(lexer); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + if (lexer->result_symbol == HEREDOC_END) + { + array_pop(&scanner->heredocs); + } + return true; + } + break; + } + + default: { + if (lexer->get_column(lexer) == 0) + { + // an alternative is to check the starting column of the + // heredoc body and track that statefully + while (isspace(lexer->lookahead)) + { + if (did_advance) + { + advance(lexer); + } + else + { + skip(lexer); + } + } + if (end_type != SIMPLE_HEREDOC_BODY) + { + lexer->result_symbol = middle_type; + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + return true; + } + } + if (end_type == SIMPLE_HEREDOC_BODY) + { + lexer->result_symbol = end_type; + lexer->mark_end(lexer); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + return true; + } + } + } + did_advance = true; + advance(lexer); + break; + } + } + } +} +static bool regex_scan(t_scanner *scanner, TSLexer *lexer, + const bool *valid_symbols) +{ + if ((valid_symbols[REGEX] || valid_symbols[REGEX_NO_SLASH] || + valid_symbols[REGEX_NO_SPACE]) && + !in_error_recovery(valid_symbols)) + { + if (valid_symbols[REGEX] || valid_symbols[REGEX_NO_SPACE]) + { + while (isspace(lexer->lookahead)) + { + skip(lexer); + } + } + + if ((lexer->lookahead != '"' && lexer->lookahead != '\'') || + ((lexer->lookahead == '$' || lexer->lookahead == '\'') && + valid_symbols[REGEX_NO_SLASH]) || + (lexer->lookahead == '\'' && valid_symbols[REGEX_NO_SPACE])) + { + typedef struct + { + bool done; + bool advanced_once; + bool found_non_alnumdollarunderdash; + bool 
last_was_escape; + bool in_single_quote; + uint32_t paren_depth; + uint32_t bracket_depth; + uint32_t brace_depth; + } State; + + if (lexer->lookahead == '$' && valid_symbols[REGEX_NO_SLASH]) + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '(') + { + return false; + } + } + + lexer->mark_end(lexer); + + State state = {false, false, false, false, false, 0, 0, 0}; + while (!state.done) + { + if (state.in_single_quote) + { + if (lexer->lookahead == '\'') + { + state.in_single_quote = false; + advance(lexer); + lexer->mark_end(lexer); + } + } + switch (lexer->lookahead) + { + case '\\': + state.last_was_escape = true; + break; + case '\0': + return false; + case '(': + state.paren_depth++; + state.last_was_escape = false; + break; + case '[': + state.bracket_depth++; + state.last_was_escape = false; + break; + case '{': + if (!state.last_was_escape) + { + state.brace_depth++; + } + state.last_was_escape = false; + break; + case ')': + if (state.paren_depth == 0) + { + state.done = true; + } + state.paren_depth--; + state.last_was_escape = false; + break; + case ']': + if (state.bracket_depth == 0) + { + state.done = true; + } + state.bracket_depth--; + state.last_was_escape = false; + break; + case '}': + if (state.brace_depth == 0) + { + state.done = true; + } + state.brace_depth--; + state.last_was_escape = false; + break; + case '\'': + // Enter or exit a single-quoted string. 
+ state.in_single_quote = !state.in_single_quote; + advance(lexer); + state.advanced_once = true; + state.last_was_escape = false; + continue; + default: + state.last_was_escape = false; + break; + } + + if (!state.done) + { + if (valid_symbols[REGEX]) + { + bool was_space = + !state.in_single_quote && isspace(lexer->lookahead); + advance(lexer); + state.advanced_once = true; + if (!was_space || state.paren_depth > 0) + { + lexer->mark_end(lexer); + } + } + else if (valid_symbols[REGEX_NO_SLASH]) + { + if (lexer->lookahead == '/') + { + lexer->mark_end(lexer); + lexer->result_symbol = REGEX_NO_SLASH; + return state.advanced_once; + } + if (lexer->lookahead == '\\') + { + advance(lexer); + state.advanced_once = true; + if (!lexer->eof(lexer) && lexer->lookahead != '[' && + lexer->lookahead != '/') + { + advance(lexer); + lexer->mark_end(lexer); + } + } + else + { + bool was_space = !state.in_single_quote && + isspace(lexer->lookahead); + advance(lexer); + state.advanced_once = true; + if (!was_space) + { + lexer->mark_end(lexer); + } + } + } + else if (valid_symbols[REGEX_NO_SPACE]) + { + if (lexer->lookahead == '\\') + { + state.found_non_alnumdollarunderdash = true; + advance(lexer); + if (!lexer->eof(lexer)) + { + advance(lexer); + } + } + else if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + // do not parse a command + // substitution + if (lexer->lookahead == '(') + { + return false; + } + // end $ always means regex, e.g. 
+ // 99999999$ + if (isspace(lexer->lookahead)) + { + lexer->result_symbol = REGEX_NO_SPACE; + lexer->mark_end(lexer); + return true; + } + } + else + { + bool was_space = !state.in_single_quote && + isspace(lexer->lookahead); + if (was_space && state.paren_depth == 0) + { + lexer->mark_end(lexer); + lexer->result_symbol = REGEX_NO_SPACE; + return state.found_non_alnumdollarunderdash; + } + if (!isalnum(lexer->lookahead) && + lexer->lookahead != '$' && + lexer->lookahead != '-' && + lexer->lookahead != '_') + { + state.found_non_alnumdollarunderdash = true; + } + advance(lexer); + } + } + } + } + + lexer->result_symbol = + valid_symbols[REGEX_NO_SLASH] ? REGEX_NO_SLASH + : valid_symbols[REGEX_NO_SPACE] ? REGEX_NO_SPACE + : REGEX; + if (valid_symbols[REGEX] && !state.advanced_once) + return (false); + return (true); + } + } + return (false); +} + +static bool extglob_pattern_scan(t_scanner *scanner, TSLexer *lexer, + const bool *valid_symbols) +{ + if (valid_symbols[EXTGLOB_PATTERN] && !in_error_recovery(valid_symbols)) + { + // first skip ws, then check for ? * + @ ! + while (isspace(lexer->lookahead)) + { + skip(lexer); + } + + if (lexer->lookahead == '?' || lexer->lookahead == '*' || + lexer->lookahead == '+' || lexer->lookahead == '@' || + lexer->lookahead == '!' || lexer->lookahead == '-' || + lexer->lookahead == ')' || lexer->lookahead == '\\' || + lexer->lookahead == '.' 
|| lexer->lookahead == '[' || + (isalpha(lexer->lookahead))) + { + if (lexer->lookahead == '\\') + { + advance(lexer); + if ((isspace(lexer->lookahead) || lexer->lookahead == '"') && + lexer->lookahead != '\r' && lexer->lookahead != '\n') + { + advance(lexer); + } + else + { + return false; + } + } + + if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) + { + lexer->mark_end(lexer); + advance(lexer); + + if (isspace(lexer->lookahead)) + { + return false; + } + } + + lexer->mark_end(lexer); + bool was_non_alpha = !isalpha(lexer->lookahead); + if (lexer->lookahead != '[') + { + // no esac + if (lexer->lookahead == 'e') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == 's') + { + advance(lexer); + if (lexer->lookahead == 'a') + { + advance(lexer); + if (lexer->lookahead == 'c') + { + advance(lexer); + if (isspace(lexer->lookahead)) + { + return false; + } + } + } + } + } + else + { + advance(lexer); + } + } + + // -\w is just a word, find something else special + if (lexer->lookahead == '-') + { + lexer->mark_end(lexer); + advance(lexer); + while (isalnum(lexer->lookahead)) + { + advance(lexer); + } + + if (lexer->lookahead == ')' || lexer->lookahead == '\\' || + lexer->lookahead == '.') + { + return false; + } + lexer->mark_end(lexer); + } + + // case item -) or *) + if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) + { + lexer->mark_end(lexer); + advance(lexer); + if (isspace(lexer->lookahead)) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return was_non_alpha; + } + } + + if (isspace(lexer->lookahead)) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return true; + } + + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(') + { + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (lexer->lookahead == '|') + { + lexer->mark_end(lexer); + 
advance(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + + if (!isalnum(lexer->lookahead) && lexer->lookahead != '(' && + lexer->lookahead != '"' && lexer->lookahead != '[' && + lexer->lookahead != '?' && lexer->lookahead != '/' && + lexer->lookahead != '\\' && lexer->lookahead != '_' && + lexer->lookahead != '*') + { + return false; + } + + typedef struct + { + bool done; + bool saw_non_alphadot; + uint32_t paren_depth; + uint32_t bracket_depth; + uint32_t brace_depth; + } State; + + State state = {false, was_non_alpha, scanner->last_glob_paren_depth, + 0, 0}; + while (!state.done) + { + switch (lexer->lookahead) + { + case '\0': + return false; + case '(': + state.paren_depth++; + break; + case '[': + state.bracket_depth++; + break; + case '{': + state.brace_depth++; + break; + case ')': + if (state.paren_depth == 0) + { + state.done = true; + } + state.paren_depth--; + break; + case ']': + if (state.bracket_depth == 0) + { + state.done = true; + } + state.bracket_depth--; + break; + case '}': + if (state.brace_depth == 0) + { + state.done = true; + } + state.brace_depth--; + break; + } + + if (lexer->lookahead == '|') + { + lexer->mark_end(lexer); + advance(lexer); + if (state.paren_depth == 0 && state.bracket_depth == 0 && + state.brace_depth == 0) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (!state.done) + { + bool was_space = isspace(lexer->lookahead); + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + if (!isalpha(lexer->lookahead) && + lexer->lookahead != '.' 
&& lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + if (lexer->lookahead == '(' || lexer->lookahead == '{') + { + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = state.paren_depth; + return state.saw_non_alphadot; + } + } + if (was_space) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + if (lexer->lookahead == '"') + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + if (lexer->lookahead == '\\') + { + if (!isalpha(lexer->lookahead) && + lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + if (isspace(lexer->lookahead) || + lexer->lookahead == '"') + { + advance(lexer); + } + } + else + { + if (!isalpha(lexer->lookahead) && + lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + } + if (!was_space) + { + lexer->mark_end(lexer); + } + } + } + + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + scanner->last_glob_paren_depth = 0; + + return false; + } + return (false); +} + +static bool expansion_word_scan(t_scanner *scanner, TSLexer *lexer, + const bool *valid_symbols) +{ + if (valid_symbols[EXPANSION_WORD]) + { + bool advanced_once = false; + bool advance_once_space = false; + for (;;) + { + if (lexer->lookahead == '\"') + { + return false; + } + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(' || + lexer->lookahead == '\'' || isalnum(lexer->lookahead)) + { + lexer->result_symbol = EXPANSION_WORD; + return advanced_once; + } + advanced_once = true; + } + + if (lexer->lookahead == '}') + { + lexer->mark_end(lexer); + lexer->result_symbol = 
EXPANSION_WORD; + return advanced_once || advance_once_space; + } + + if (lexer->lookahead == '(' && + !(advanced_once || advance_once_space)) + { + lexer->mark_end(lexer); + advance(lexer); + while (lexer->lookahead != ')' && !lexer->eof(lexer)) + { + // if we find a $( or ${ assume this is valid and is + // a garbage concatenation of some weird word + an + // expansion + // I wonder where this can fail + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || + lexer->lookahead == '(' || + lexer->lookahead == '\'' || + isalnum(lexer->lookahead)) + { + lexer->result_symbol = EXPANSION_WORD; + return advanced_once; + } + advanced_once = true; + } + else + { + advanced_once = + advanced_once || !isspace(lexer->lookahead); + advance_once_space = + advance_once_space || isspace(lexer->lookahead); + advance(lexer); + } + } + lexer->mark_end(lexer); + if (lexer->lookahead == ')') + { + advanced_once = true; + advance(lexer); + lexer->mark_end(lexer); + if (lexer->lookahead == '}') + { + return false; + } + } + else + { + return false; + } + } + + if (lexer->lookahead == '\'') + { + return false; + } + + if (lexer->eof(lexer)) + { + return false; + } + advanced_once = advanced_once || !isspace(lexer->lookahead); + advance_once_space = + advance_once_space || isspace(lexer->lookahead); + advance(lexer); + } + } + return (false); +} + +static bool brace_start_scan(t_scanner *scanner, TSLexer *lexer, + const bool *valid_symbols) +{ + if (valid_symbols[BRACE_START] && !in_error_recovery(valid_symbols)) + { + while (isspace(lexer->lookahead)) + { + skip(lexer); + } + + if (lexer->lookahead != '{') + { + return false; + } + + advance(lexer); + lexer->mark_end(lexer); + + while (isdigit(lexer->lookahead)) + { + advance(lexer); + } + + if (lexer->lookahead != '.') + { + return false; + } + advance(lexer); + + if (lexer->lookahead != '.') + { + return false; + } + advance(lexer); + + while (isdigit(lexer->lookahead)) + { + 
advance(lexer); + } + + if (lexer->lookahead != '}') + { + return false; + } + + lexer->result_symbol = BRACE_START; + return true; + } + return (false); +} +static bool scan(t_scanner *scanner, TSLexer *lexer, const bool *valid_symbols) +{ + if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols)) + { + if (!(lexer->lookahead == 0 || isspace(lexer->lookahead) || + lexer->lookahead == '>' || lexer->lookahead == '<' || + lexer->lookahead == ')' || lexer->lookahead == '(' || + lexer->lookahead == ';' || lexer->lookahead == '&' || + lexer->lookahead == '|' || + (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) || + (lexer->lookahead == ']' && valid_symbols[CLOSING_BRACKET]))) + { + lexer->result_symbol = CONCAT; + // So for a`b`, we want to return a concat. We check if the + // 2nd backtick has whitespace after it, and if it does we + // return concat. + if (lexer->lookahead == '`') + { + lexer->mark_end(lexer); + advance(lexer); + while (lexer->lookahead != '`' && !lexer->eof(lexer)) + advance(lexer); + if (lexer->eof(lexer)) + return (false); + if (lexer->lookahead == '`') + advance(lexer); + return (isspace(lexer->lookahead) || lexer->eof(lexer)); + } + // strings w/ expansions that contains escaped quotes or + // backslashes need this to return a concat + if (lexer->lookahead == '\\') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '"' || lexer->lookahead == '\'' || + lexer->lookahead == '\\') + return (true); + if (lexer->eof(lexer)) + return (false); + } + else + return (true); + } + if (isspace(lexer->lookahead) && valid_symbols[CLOSING_BRACE] && + !valid_symbols[EXPANSION_WORD]) + { + lexer->result_symbol = CONCAT; + return (true); + } + } + + if (valid_symbols[IMMEDIATE_DOUBLE_HASH] && + !in_error_recovery(valid_symbols)) + { + // advance two # and ensure not } after + if (lexer->lookahead == '#') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '#') + { + advance(lexer); + if (lexer->lookahead 
!= '}') + { + lexer->result_symbol = IMMEDIATE_DOUBLE_HASH; + lexer->mark_end(lexer); + return (true); + } + } + } + } + + if (valid_symbols[EXTERNAL_EXPANSION_SYM_HASH] && + !in_error_recovery(valid_symbols)) + { + if (lexer->lookahead == '#' || lexer->lookahead == '=' || + lexer->lookahead == '!') + { + lexer->result_symbol = + lexer->lookahead == '#' ? EXTERNAL_EXPANSION_SYM_HASH + : lexer->lookahead == '!' ? EXTERNAL_EXPANSION_SYM_BANG + : EXTERNAL_EXPANSION_SYM_EQUAL; + advance(lexer); + lexer->mark_end(lexer); + while (lexer->lookahead == '#' || lexer->lookahead == '=' || + lexer->lookahead == '!') + advance(lexer); + while (isspace(lexer->lookahead)) + skip(lexer); + if (lexer->lookahead == '}') + return (true); + return (false); + } + } + + if (valid_symbols[EMPTY_VALUE]) + { + if (isspace(lexer->lookahead) || lexer->eof(lexer) || + lexer->lookahead == ';' || lexer->lookahead == '&') + { + lexer->result_symbol = EMPTY_VALUE; + return (true); + } + } + + if ((valid_symbols[HEREDOC_BODY_BEGINNING] || + valid_symbols[SIMPLE_HEREDOC_BODY]) && + scanner->heredocs.size > 0 && + !array_back(&scanner->heredocs)->started && + !in_error_recovery(valid_symbols)) + return (scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, + SIMPLE_HEREDOC_BODY)); + + if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0) + { + t_heredoc *heredoc = array_back(&scanner->heredocs); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + array_delete(&heredoc->current_leading_word); + array_delete(&heredoc->delimiter); + array_pop(&scanner->heredocs); + lexer->result_symbol = HEREDOC_END; + return (true); + } + } + + if (valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 && + array_back(&scanner->heredocs)->started && + !in_error_recovery(valid_symbols)) + return ( + scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, HEREDOC_END)); + + if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) && + scanner->heredocs.size > 0) + return 
(scan_heredoc_start(array_back(&scanner->heredocs), lexer)); + + if (valid_symbols[TEST_OPERATOR] && !valid_symbols[EXPANSION_WORD]) + { + while (isspace(lexer->lookahead) && lexer->lookahead != '\n') + skip(lexer); + + if (lexer->lookahead == '\\') + { + if (valid_symbols[EXTGLOB_PATTERN]) + return (extglob_pattern_scan(scanner, lexer, valid_symbols)); + if (valid_symbols[REGEX_NO_SPACE]) + return (regex_scan(scanner, lexer, valid_symbols)); + skip(lexer); + + if (lexer->eof(lexer)) + return false; + + if (lexer->lookahead == '\r') + { + skip(lexer); + if (lexer->lookahead == '\n') + skip(lexer); + } + else if (lexer->lookahead == '\n') + skip(lexer); + else + return (false); + + while (isspace(lexer->lookahead)) + skip(lexer); + } + + if (lexer->lookahead == '\n' && !valid_symbols[NEWLINE]) + { + skip(lexer); + while (isspace(lexer->lookahead)) + skip(lexer); + } + + if (lexer->lookahead == '-') + { + advance(lexer); + + bool advanced_once = false; + while (isalpha(lexer->lookahead)) + { + advanced_once = true; + advance(lexer); + } + + if (isspace(lexer->lookahead) && advanced_once) + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) + { + if (valid_symbols[EXPANSION_WORD]) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXPANSION_WORD; + return (true); + } + return (false); + } + lexer->result_symbol = TEST_OPERATOR; + return (true); + } + if (isspace(lexer->lookahead) && valid_symbols[EXTGLOB_PATTERN]) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return (true); + } + } + + if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && + scan_bare_dollar(lexer)) + return (true); + } + + if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || + valid_symbols[HEREDOC_ARROW]) && + !valid_symbols[REGEX_NO_SLASH] && !in_error_recovery(valid_symbols)) + { + while (true) + { + if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || + lexer->lookahead == '\r' || + 
(lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) && + !valid_symbols[EXPANSION_WORD]) + skip(lexer); + else if (lexer->lookahead == '\\') + { + skip(lexer); + + if (lexer->eof(lexer)) + { + lexer->mark_end(lexer); + lexer->result_symbol = VARIABLE_NAME; + return (true); + } + + if (lexer->lookahead == '\r') + + skip(lexer); + + if (lexer->lookahead == '\n') + + skip(lexer); + + else + { + if (lexer->lookahead == '\\' && + valid_symbols[EXPANSION_WORD]) + + return ( + expansion_word_scan(scanner, lexer, valid_symbols)); + + return (false); + } + } + else + + break; + } + + // no '*', '@', '?', '-', '$', '0', '_' + if (!valid_symbols[EXPANSION_WORD] && + (lexer->lookahead == '*' || lexer->lookahead == '@' || + lexer->lookahead == '?' || lexer->lookahead == '-' || + lexer->lookahead == '0' || lexer->lookahead == '_')) + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '=' || lexer->lookahead == '[' || + lexer->lookahead == ':' || lexer->lookahead == '-' || + lexer->lookahead == '%' || lexer->lookahead == '#' || + lexer->lookahead == '/') + + return (false); + + if (valid_symbols[EXTGLOB_PATTERN] && isspace(lexer->lookahead)) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + return (true); + } + } + + if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') + { + advance(lexer); + if (lexer->lookahead == '<') + { + advance(lexer); + if (lexer->lookahead == '-') + { + advance(lexer); + t_heredoc heredoc = heredoc_new(); + heredoc.allows_indent = true; + array_push(&scanner->heredocs, heredoc); + lexer->result_symbol = HEREDOC_ARROW_DASH; + } + else if (lexer->lookahead == '<' || lexer->lookahead == '=') + return (false); + else + { + t_heredoc heredoc = heredoc_new(); + array_push(&scanner->heredocs, heredoc); + lexer->result_symbol = HEREDOC_ARROW; + } + return (true); + } + return (false); + } + + bool is_number = true; + if (isdigit(lexer->lookahead)) + advance(lexer); + else if (isalpha(lexer->lookahead) || 
lexer->lookahead == '_') + { + is_number = false; + advance(lexer); + } + else + { + if (lexer->lookahead == '{') + return (brace_start_scan(scanner, lexer, valid_symbols)); + if (valid_symbols[EXPANSION_WORD]) + return (expansion_word_scan(scanner, lexer, valid_symbols)); + if (valid_symbols[EXTGLOB_PATTERN]) + return (extglob_pattern_scan(scanner, lexer, valid_symbols)); + return false; + } + + while (true) + { + if (isdigit(lexer->lookahead)) + advance(lexer); + else if (isalpha(lexer->lookahead) || lexer->lookahead == '_') + (is_number = false, advance(lexer)); + else + break; + } + + if (is_number && valid_symbols[FILE_DESCRIPTOR] && + (lexer->lookahead == '>' || lexer->lookahead == '<')) + { + lexer->result_symbol = FILE_DESCRIPTOR; + return (true); + } + + if (valid_symbols[VARIABLE_NAME]) + { + if (lexer->lookahead == '+') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '=' || lexer->lookahead == ':' || + valid_symbols[CLOSING_BRACE]) + { + lexer->result_symbol = VARIABLE_NAME; + return (true); + } + return (false); + } + if (lexer->lookahead == '/') + { + return (false); + } + if (lexer->lookahead == '=' || lexer->lookahead == '[' || + (lexer->lookahead == ':' && !valid_symbols[CLOSING_BRACE] && + !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases + // for regular word chars but + // not variable names for + // function words, only + // handling : for now? 
#235 + lexer->lookahead == '%' || + (lexer->lookahead == '#' && !is_number) || + lexer->lookahead == '@' || + (lexer->lookahead == '-' && valid_symbols[CLOSING_BRACE])) + { + lexer->mark_end(lexer); + lexer->result_symbol = VARIABLE_NAME; + return (true); + } + + if (lexer->lookahead == '?') + { + lexer->mark_end(lexer); + advance(lexer); + lexer->result_symbol = VARIABLE_NAME; + return (isalpha(lexer->lookahead)); + } + } + + return (false); + } + + if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && + scan_bare_dollar(lexer)) + return (true); + return (false); +} + +void *tree_sitter_bash_external_scanner_create() +{ + t_scanner *scanner = calloc(1, sizeof(t_scanner)); + array_init(&scanner->heredocs); + return (scanner); +} + +bool tree_sitter_bash_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) +{ + t_scanner *scanner = (t_scanner *)payload; + return (scan(scanner, lexer, valid_symbols)); +} + +unsigned tree_sitter_bash_external_scanner_serialize(void *payload, char *state) +{ + t_scanner *scanner = (t_scanner *)payload; + return (serialize(scanner, state)); +} + +void tree_sitter_bash_external_scanner_deserialize(void *payload, + const char *state, + unsigned length) +{ + t_scanner *scanner = (t_scanner *)payload; + deserialize(scanner, state, length); +} + +void tree_sitter_bash_external_scanner_destroy(void *payload) +{ + t_scanner *scanner = (t_scanner *)payload; + for (size_t i = 0; i < scanner->heredocs.size; i++) + { + t_heredoc *heredoc = array_get(&scanner->heredocs, i); + array_delete(&heredoc->current_leading_word); + array_delete(&heredoc->delimiter); + } + array_delete(&scanner->heredocs); + free(scanner); +} diff --git a/shcat_c/parser/src/stack.c b/shcat_c/parser/src/stack.c new file mode 100644 index 00000000..75b1660c --- /dev/null +++ b/shcat_c/parser/src/stack.c @@ -0,0 +1,899 @@ + +#include "./language.h" +#include "./subtree.h" +#include "./array.h" +#include "./stack.h" +#include 
"./length.h" +#include +#include +#include + +#define MAX_LINK_COUNT 8 +#define MAX_NODE_POOL_SIZE 50 +#define MAX_ITERATOR_COUNT 64 + +#if defined _WIN32 && !defined __GNUC__ +#define forceinline __forceinline +#else +#define forceinline static inline __attribute__((always_inline)) +#endif + +typedef struct StackNode StackNode; + +typedef struct { + StackNode *node; + Subtree subtree; + bool is_pending; +} StackLink; + +struct StackNode { + TSStateId state; + Length position; + StackLink links[MAX_LINK_COUNT]; + short unsigned int link_count; + uint32_t ref_count; + unsigned error_cost; + unsigned node_count; + int dynamic_precedence; +}; + +typedef struct { + StackNode *node; + SubtreeArray subtrees; + uint32_t subtree_count; + bool is_pending; +} StackIterator; + +typedef Array(StackNode *) StackNodeArray; + +typedef enum { + StackStatusActive, + StackStatusPaused, + StackStatusHalted, +} StackStatus; + +typedef struct { + StackNode *node; + StackSummary *summary; + unsigned node_count_at_last_error; + Subtree last_external_token; + Subtree lookahead_when_paused; + StackStatus status; +} StackHead; + +struct Stack { + Array(StackHead) heads; + StackSliceArray slices; + Array(StackIterator) iterators; + StackNodeArray node_pool; + StackNode *base_node; + SubtreePool *subtree_pool; +}; + +typedef unsigned StackAction; +enum { + StackActionNone, + StackActionStop = 1, + StackActionPop = 2, +}; + +typedef StackAction (*StackCallback)(void *, const StackIterator *); + +static void stack_node_retain(StackNode *self) { + if (!self) + return; + assert(self->ref_count > 0); + self->ref_count++; + assert(self->ref_count != 0); +} + +static void stack_node_release( + StackNode *self, + StackNodeArray *pool, + SubtreePool *subtree_pool +) { +recur: + assert(self->ref_count != 0); + self->ref_count--; + if (self->ref_count > 0) return; + + StackNode *first_predecessor = NULL; + if (self->link_count > 0) { + for (unsigned i = self->link_count - 1; i > 0; i--) { + StackLink 
link = self->links[i]; + if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree); + stack_node_release(link.node, pool, subtree_pool); + } + StackLink link = self->links[0]; + if (link.subtree.ptr) ts_subtree_release(subtree_pool, link.subtree); + first_predecessor = self->links[0].node; + } + + if (pool->size < MAX_NODE_POOL_SIZE) { + array_push(pool, self); + } else { + free(self); + } + + if (first_predecessor) { + self = first_predecessor; + goto recur; + } +} + +/// Get the number of nodes in the subtree, for the purpose of measuring +/// how much progress has been made by a given version of the stack. +static uint32_t stack__subtree_node_count(Subtree subtree) { + uint32_t count = ts_subtree_visible_descendant_count(subtree); + if (ts_subtree_visible(subtree)) count++; + + // Count intermediate error nodes even though they are not visible, + // because a stack version's node count is used to check whether it + // has made any progress since the last time it encountered an error. + if (ts_subtree_symbol(subtree) == ts_builtin_sym_error_repeat) count++; + + return count; +} + +static StackNode *stack_node_new( + StackNode *previous_node, + Subtree subtree, + bool is_pending, + TSStateId state, + StackNodeArray *pool +) { + StackNode *node = pool->size > 0 + ? 
array_pop(pool) + : malloc(sizeof(StackNode)); + *node = (StackNode) { + .ref_count = 1, + .link_count = 0, + .state = state + }; + + if (previous_node) { + node->link_count = 1; + node->links[0] = (StackLink) { + .node = previous_node, + .subtree = subtree, + .is_pending = is_pending, + }; + + node->position = previous_node->position; + node->error_cost = previous_node->error_cost; + node->dynamic_precedence = previous_node->dynamic_precedence; + node->node_count = previous_node->node_count; + + if (subtree.ptr) { + node->error_cost += ts_subtree_error_cost(subtree); + node->position = length_add(node->position, ts_subtree_total_size(subtree)); + node->node_count += stack__subtree_node_count(subtree); + node->dynamic_precedence += ts_subtree_dynamic_precedence(subtree); + } + } else { + node->position = length_zero(); + node->error_cost = 0; + } + + return node; +} + +static bool stack__subtree_is_equivalent(Subtree left, Subtree right) { + if (left.ptr == right.ptr) return true; + if (!left.ptr || !right.ptr) return false; + + // Symbols must match + if (ts_subtree_symbol(left) != ts_subtree_symbol(right)) return false; + + // If both have errors, don't bother keeping both. 
+ if (ts_subtree_error_cost(left) > 0 && ts_subtree_error_cost(right) > 0) return true; + + return ( + ts_subtree_padding(left).bytes == ts_subtree_padding(right).bytes && + ts_subtree_size(left).bytes == ts_subtree_size(right).bytes && + ts_subtree_child_count(left) == ts_subtree_child_count(right) && + ts_subtree_extra(left) == ts_subtree_extra(right) && + ts_subtree_external_scanner_state_eq(left, right) + ); +} + +static void stack_node_add_link( + StackNode *self, + StackLink link, + SubtreePool *subtree_pool +) { + if (link.node == self) return; + + for (int i = 0; i < self->link_count; i++) { + StackLink *existing_link = &self->links[i]; + if (stack__subtree_is_equivalent(existing_link->subtree, link.subtree)) { + // In general, we preserve ambiguities until they are removed from the stack + // during a pop operation where multiple paths lead to the same node. But in + // the special case where two links directly connect the same pair of nodes, + // we can safely remove the ambiguity ahead of time without changing behavior. + if (existing_link->node == link.node) { + if ( + ts_subtree_dynamic_precedence(link.subtree) > + ts_subtree_dynamic_precedence(existing_link->subtree) + ) { + ts_subtree_retain(link.subtree); + ts_subtree_release(subtree_pool, existing_link->subtree); + existing_link->subtree = link.subtree; + self->dynamic_precedence = + link.node->dynamic_precedence + ts_subtree_dynamic_precedence(link.subtree); + } + return; + } + + // If the previous nodes are mergeable, merge them recursively. 
+ if ( + existing_link->node->state == link.node->state && + existing_link->node->position.bytes == link.node->position.bytes && + existing_link->node->error_cost == link.node->error_cost + ) { + for (int j = 0; j < link.node->link_count; j++) { + stack_node_add_link(existing_link->node, link.node->links[j], subtree_pool); + } + int32_t dynamic_precedence = link.node->dynamic_precedence; + if (link.subtree.ptr) { + dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree); + } + if (dynamic_precedence > self->dynamic_precedence) { + self->dynamic_precedence = dynamic_precedence; + } + return; + } + } + } + + if (self->link_count == MAX_LINK_COUNT) return; + + stack_node_retain(link.node); + unsigned node_count = link.node->node_count; + int dynamic_precedence = link.node->dynamic_precedence; + self->links[self->link_count++] = link; + + if (link.subtree.ptr) { + ts_subtree_retain(link.subtree); + node_count += stack__subtree_node_count(link.subtree); + dynamic_precedence += ts_subtree_dynamic_precedence(link.subtree); + } + + if (node_count > self->node_count) self->node_count = node_count; + if (dynamic_precedence > self->dynamic_precedence) self->dynamic_precedence = dynamic_precedence; +} + +static void stack_head_delete( + StackHead *self, + StackNodeArray *pool, + SubtreePool *subtree_pool +) { + if (self->node) { + if (self->last_external_token.ptr) { + ts_subtree_release(subtree_pool, self->last_external_token); + } + if (self->lookahead_when_paused.ptr) { + ts_subtree_release(subtree_pool, self->lookahead_when_paused); + } + if (self->summary) { + array_delete(self->summary); + free(self->summary); + } + stack_node_release(self->node, pool, subtree_pool); + } +} + +static StackVersion ts_stack__add_version( + Stack *self, + StackVersion original_version, + StackNode *node +) { + StackHead head = { + .node = node, + .node_count_at_last_error = self->heads.contents[original_version].node_count_at_last_error, + .last_external_token = 
self->heads.contents[original_version].last_external_token, + .status = StackStatusActive, + .lookahead_when_paused = NULL_SUBTREE, + }; + array_push(&self->heads, head); + stack_node_retain(node); + if (head.last_external_token.ptr) ts_subtree_retain(head.last_external_token); + return (StackVersion)(self->heads.size - 1); +} + +static void ts_stack__add_slice( + Stack *self, + StackVersion original_version, + StackNode *node, + SubtreeArray *subtrees +) { + for (uint32_t i = self->slices.size - 1; i + 1 > 0; i--) { + StackVersion version = self->slices.contents[i].version; + if (self->heads.contents[version].node == node) { + StackSlice slice = {*subtrees, version}; + array_insert(&self->slices, i + 1, slice); + return; + } + } + + StackVersion version = ts_stack__add_version(self, original_version, node); + StackSlice slice = { *subtrees, version }; + array_push(&self->slices, slice); +} + +static StackSliceArray stack__iter( + Stack *self, + StackVersion version, + StackCallback callback, + void *payload, + int goal_subtree_count +) { + array_clear(&self->slices); + array_clear(&self->iterators); + + StackHead *head = array_get(&self->heads, version); + StackIterator new_iterator = { + .node = head->node, + .subtrees = array_new(), + .subtree_count = 0, + .is_pending = true, + }; + + bool include_subtrees = false; + if (goal_subtree_count >= 0) { + include_subtrees = true; + array_reserve(&new_iterator.subtrees, (uint32_t)ts_subtree_alloc_size(goal_subtree_count) / sizeof(Subtree)); + } + + array_push(&self->iterators, new_iterator); + + while (self->iterators.size > 0) { + for (uint32_t i = 0, size = self->iterators.size; i < size; i++) { + StackIterator *iterator = &self->iterators.contents[i]; + StackNode *node = iterator->node; + + StackAction action = callback(payload, iterator); + bool should_pop = action & StackActionPop; + bool should_stop = action & StackActionStop || node->link_count == 0; + + if (should_pop) { + SubtreeArray subtrees = 
iterator->subtrees; + if (!should_stop) { + ts_subtree_array_copy(subtrees, &subtrees); + } + ts_subtree_array_reverse(&subtrees); + ts_stack__add_slice( + self, + version, + node, + &subtrees + ); + } + + if (should_stop) { + if (!should_pop) { + ts_subtree_array_delete(self->subtree_pool, &iterator->subtrees); + } + array_erase(&self->iterators, i); + i--, size--; + continue; + } + + for (uint32_t j = 1; j <= node->link_count; j++) { + StackIterator *next_iterator; + StackLink link; + if (j == node->link_count) { + link = node->links[0]; + next_iterator = &self->iterators.contents[i]; + } else { + if (self->iterators.size >= MAX_ITERATOR_COUNT) continue; + link = node->links[j]; + StackIterator current_iterator = self->iterators.contents[i]; + array_push(&self->iterators, current_iterator); + next_iterator = array_back(&self->iterators); + ts_subtree_array_copy(next_iterator->subtrees, &next_iterator->subtrees); + } + + next_iterator->node = link.node; + if (link.subtree.ptr) { + if (include_subtrees) { + array_push(&next_iterator->subtrees, link.subtree); + ts_subtree_retain(link.subtree); + } + + if (!ts_subtree_extra(link.subtree)) { + next_iterator->subtree_count++; + if (!link.is_pending) { + next_iterator->is_pending = false; + } + } + } else { + next_iterator->subtree_count++; + next_iterator->is_pending = false; + } + } + } + } + + return self->slices; +} + +Stack *ts_stack_new(SubtreePool *subtree_pool) { + Stack *self = calloc(1, sizeof(Stack)); + + array_init(&self->heads); + array_init(&self->slices); + array_init(&self->iterators); + array_init(&self->node_pool); + array_reserve(&self->heads, 4); + array_reserve(&self->slices, 4); + array_reserve(&self->iterators, 4); + array_reserve(&self->node_pool, MAX_NODE_POOL_SIZE); + + self->subtree_pool = subtree_pool; + self->base_node = stack_node_new(NULL, NULL_SUBTREE, false, 1, &self->node_pool); + ts_stack_clear(self); + + return self; +} + +void ts_stack_delete(Stack *self) { + if 
(self->slices.contents) + array_delete(&self->slices); + if (self->iterators.contents) + array_delete(&self->iterators); + stack_node_release(self->base_node, &self->node_pool, self->subtree_pool); + for (uint32_t i = 0; i < self->heads.size; i++) { + stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); + } + array_clear(&self->heads); + if (self->node_pool.contents) { + for (uint32_t i = 0; i < self->node_pool.size; i++) + free(self->node_pool.contents[i]); + array_delete(&self->node_pool); + } + array_delete(&self->heads); + free(self); +} + +uint32_t ts_stack_version_count(const Stack *self) { + return self->heads.size; +} + +TSStateId ts_stack_state(const Stack *self, StackVersion version) { + return array_get(&self->heads, version)->node->state; +} + +Length ts_stack_position(const Stack *self, StackVersion version) { + return array_get(&self->heads, version)->node->position; +} + +Subtree ts_stack_last_external_token(const Stack *self, StackVersion version) { + return array_get(&self->heads, version)->last_external_token; +} + +void ts_stack_set_last_external_token(Stack *self, StackVersion version, Subtree token) { + StackHead *head = array_get(&self->heads, version); + if (token.ptr) ts_subtree_retain(token); + if (head->last_external_token.ptr) ts_subtree_release(self->subtree_pool, head->last_external_token); + head->last_external_token = token; +} + +unsigned ts_stack_error_cost(const Stack *self, StackVersion version) { + StackHead *head = array_get(&self->heads, version); + unsigned result = head->node->error_cost; + if ( + head->status == StackStatusPaused || + (head->node->state == ERROR_STATE && !head->node->links[0].subtree.ptr)) { + result += ERROR_COST_PER_RECOVERY; + } + return result; +} + +unsigned ts_stack_node_count_since_error(const Stack *self, StackVersion version) { + StackHead *head = array_get(&self->heads, version); + if (head->node->node_count < head->node_count_at_last_error) { + 
head->node_count_at_last_error = head->node->node_count; + } + return head->node->node_count - head->node_count_at_last_error; +} + +void ts_stack_push( + Stack *self, + StackVersion version, + Subtree subtree, + bool pending, + TSStateId state +) { + StackHead *head = array_get(&self->heads, version); + StackNode *new_node = stack_node_new(head->node, subtree, pending, state, &self->node_pool); + if (!subtree.ptr) head->node_count_at_last_error = new_node->node_count; + head->node = new_node; +} + +forceinline StackAction pop_count_callback(void *payload, const StackIterator *iterator) { + unsigned *goal_subtree_count = payload; + if (iterator->subtree_count == *goal_subtree_count) { + return StackActionPop | StackActionStop; + } else { + return StackActionNone; + } +} + +StackSliceArray ts_stack_pop_count(Stack *self, StackVersion version, uint32_t count) { + return stack__iter(self, version, pop_count_callback, &count, (int)count); +} + +forceinline StackAction pop_pending_callback(void *payload, const StackIterator *iterator) { + (void)payload; + if (iterator->subtree_count >= 1) { + if (iterator->is_pending) { + return StackActionPop | StackActionStop; + } else { + return StackActionStop; + } + } else { + return StackActionNone; + } +} + +StackSliceArray ts_stack_pop_pending(Stack *self, StackVersion version) { + StackSliceArray pop = stack__iter(self, version, pop_pending_callback, NULL, 0); + if (pop.size > 0) { + ts_stack_renumber_version(self, pop.contents[0].version, version); + pop.contents[0].version = version; + } + return pop; +} + +forceinline StackAction pop_error_callback(void *payload, const StackIterator *iterator) { + if (iterator->subtrees.size > 0) { + bool *found_error = payload; + if (!*found_error && ts_subtree_is_error(iterator->subtrees.contents[0])) { + *found_error = true; + return StackActionPop | StackActionStop; + } else { + return StackActionStop; + } + } else { + return StackActionNone; + } +} + +SubtreeArray 
ts_stack_pop_error(Stack *self, StackVersion version) { + StackNode *node = array_get(&self->heads, version)->node; + for (unsigned i = 0; i < node->link_count; i++) { + if (node->links[i].subtree.ptr && ts_subtree_is_error(node->links[i].subtree)) { + bool found_error = false; + StackSliceArray pop = stack__iter(self, version, pop_error_callback, &found_error, 1); + if (pop.size > 0) { + assert(pop.size == 1); + ts_stack_renumber_version(self, pop.contents[0].version, version); + return pop.contents[0].subtrees; + } + break; + } + } + return (SubtreeArray) {.size = 0}; +} + +forceinline StackAction pop_all_callback(void *payload, const StackIterator *iterator) { + (void)payload; + return iterator->node->link_count == 0 ? StackActionPop : StackActionNone; +} + +StackSliceArray ts_stack_pop_all(Stack *self, StackVersion version) { + return stack__iter(self, version, pop_all_callback, NULL, 0); +} + +typedef struct { + StackSummary *summary; + unsigned max_depth; +} SummarizeStackSession; + +forceinline StackAction summarize_stack_callback(void *payload, const StackIterator *iterator) { + SummarizeStackSession *session = payload; + TSStateId state = iterator->node->state; + unsigned depth = iterator->subtree_count; + if (depth > session->max_depth) return StackActionStop; + for (unsigned i = session->summary->size - 1; i + 1 > 0; i--) { + StackSummaryEntry entry = session->summary->contents[i]; + if (entry.depth < depth) break; + if (entry.depth == depth && entry.state == state) return StackActionNone; + } + array_push(session->summary, ((StackSummaryEntry) { + .position = iterator->node->position, + .depth = depth, + .state = state, + })); + return StackActionNone; +} + +void ts_stack_record_summary(Stack *self, StackVersion version, unsigned max_depth) { + SummarizeStackSession session = { + .summary = malloc(sizeof(StackSummary)), + .max_depth = max_depth + }; + array_init(session.summary); + stack__iter(self, version, summarize_stack_callback, &session, -1); + 
StackHead *head = &self->heads.contents[version]; + if (head->summary) { + array_delete(head->summary); + free(head->summary); + } + head->summary = session.summary; +} + +StackSummary *ts_stack_get_summary(Stack *self, StackVersion version) { + return array_get(&self->heads, version)->summary; +} + +int ts_stack_dynamic_precedence(Stack *self, StackVersion version) { + return array_get(&self->heads, version)->node->dynamic_precedence; +} + +bool ts_stack_has_advanced_since_error(const Stack *self, StackVersion version) { + const StackHead *head = array_get(&self->heads, version); + const StackNode *node = head->node; + if (node->error_cost == 0) return true; + while (node) { + if (node->link_count > 0) { + Subtree subtree = node->links[0].subtree; + if (subtree.ptr) { + if (ts_subtree_total_bytes(subtree) > 0) { + return true; + } else if ( + node->node_count > head->node_count_at_last_error && + ts_subtree_error_cost(subtree) == 0 + ) { + node = node->links[0].node; + continue; + } + } + } + break; + } + return false; +} + +void ts_stack_remove_version(Stack *self, StackVersion version) { + stack_head_delete(array_get(&self->heads, version), &self->node_pool, self->subtree_pool); + array_erase(&self->heads, version); +} + +void ts_stack_renumber_version(Stack *self, StackVersion v1, StackVersion v2) { + if (v1 == v2) return; + assert(v2 < v1); + assert((uint32_t)v1 < self->heads.size); + StackHead *source_head = &self->heads.contents[v1]; + StackHead *target_head = &self->heads.contents[v2]; + if (target_head->summary && !source_head->summary) { + source_head->summary = target_head->summary; + target_head->summary = NULL; + } + stack_head_delete(target_head, &self->node_pool, self->subtree_pool); + *target_head = *source_head; + array_erase(&self->heads, v1); +} + +void ts_stack_swap_versions(Stack *self, StackVersion v1, StackVersion v2) { + StackHead temporary_head = self->heads.contents[v1]; + self->heads.contents[v1] = self->heads.contents[v2]; + 
self->heads.contents[v2] = temporary_head; +} + +StackVersion ts_stack_copy_version(Stack *self, StackVersion version) { + assert(version < self->heads.size); + array_push(&self->heads, self->heads.contents[version]); + StackHead *head = array_back(&self->heads); + stack_node_retain(head->node); + if (head->last_external_token.ptr) ts_subtree_retain(head->last_external_token); + head->summary = NULL; + return self->heads.size - 1; +} + +bool ts_stack_merge(Stack *self, StackVersion version1, StackVersion version2) { + if (!ts_stack_can_merge(self, version1, version2)) return false; + StackHead *head1 = &self->heads.contents[version1]; + StackHead *head2 = &self->heads.contents[version2]; + for (uint32_t i = 0; i < head2->node->link_count; i++) { + stack_node_add_link(head1->node, head2->node->links[i], self->subtree_pool); + } + if (head1->node->state == ERROR_STATE) { + head1->node_count_at_last_error = head1->node->node_count; + } + ts_stack_remove_version(self, version2); + return true; +} + +bool ts_stack_can_merge(Stack *self, StackVersion version1, StackVersion version2) { + StackHead *head1 = &self->heads.contents[version1]; + StackHead *head2 = &self->heads.contents[version2]; + return + head1->status == StackStatusActive && + head2->status == StackStatusActive && + head1->node->state == head2->node->state && + head1->node->position.bytes == head2->node->position.bytes && + head1->node->error_cost == head2->node->error_cost && + ts_subtree_external_scanner_state_eq(head1->last_external_token, head2->last_external_token); +} + +void ts_stack_halt(Stack *self, StackVersion version) { + array_get(&self->heads, version)->status = StackStatusHalted; +} + +void ts_stack_pause(Stack *self, StackVersion version, Subtree lookahead) { + StackHead *head = array_get(&self->heads, version); + head->status = StackStatusPaused; + head->lookahead_when_paused = lookahead; + head->node_count_at_last_error = head->node->node_count; +} + +bool ts_stack_is_active(const Stack 
*self, StackVersion version) { + return array_get(&self->heads, version)->status == StackStatusActive; +} + +bool ts_stack_is_halted(const Stack *self, StackVersion version) { + return array_get(&self->heads, version)->status == StackStatusHalted; +} + +bool ts_stack_is_paused(const Stack *self, StackVersion version) { + return array_get(&self->heads, version)->status == StackStatusPaused; +} + +Subtree ts_stack_resume(Stack *self, StackVersion version) { + StackHead *head = array_get(&self->heads, version); + assert(head->status == StackStatusPaused); + Subtree result = head->lookahead_when_paused; + head->status = StackStatusActive; + head->lookahead_when_paused = NULL_SUBTREE; + return result; +} + +void ts_stack_clear(Stack *self) { + stack_node_retain(self->base_node); + for (uint32_t i = 0; i < self->heads.size; i++) { + stack_head_delete(&self->heads.contents[i], &self->node_pool, self->subtree_pool); + } + array_clear(&self->heads); + array_push(&self->heads, ((StackHead) { + .node = self->base_node, + .status = StackStatusActive, + .last_external_token = NULL_SUBTREE, + .lookahead_when_paused = NULL_SUBTREE, + })); +} + +bool ts_stack_print_dot_graph(Stack *self, const TSLanguage *language, FILE *f) { + array_reserve(&self->iterators, 32); + if (!f) f = stderr; + + fprintf(f, "digraph stack {\n"); + fprintf(f, "rankdir=\"RL\";\n"); + fprintf(f, "edge [arrowhead=none]\n"); + + Array(StackNode *) visited_nodes = array_new(); + + array_clear(&self->iterators); + for (uint32_t i = 0; i < self->heads.size; i++) { + StackHead *head = &self->heads.contents[i]; + if (head->status == StackStatusHalted) continue; + + fprintf(f, "node_head_%u [shape=none, label=\"\"]\n", i); + fprintf(f, "node_head_%u -> node_%p [", i, (void *)head->node); + + if (head->status == StackStatusPaused) { + fprintf(f, "color=red "); + } + fprintf(f, + "label=%u, fontcolor=blue, weight=10000, labeltooltip=\"node_count: %u\nerror_cost: %u", + i, + ts_stack_node_count_since_error(self, i), + 
ts_stack_error_cost(self, i) + ); + + if (head->summary) { + fprintf(f, "\nsummary:"); + for (uint32_t j = 0; j < head->summary->size; j++) fprintf(f, " %u", head->summary->contents[j].state); + } + + if (head->last_external_token.ptr) { + const ExternalScannerState *state = &head->last_external_token.ptr->external_scanner_state; + const char *data = ts_external_scanner_state_data(state); + fprintf(f, "\nexternal_scanner_state:"); + for (uint32_t j = 0; j < state->length; j++) fprintf(f, " %2X", data[j]); + } + + fprintf(f, "\"]\n"); + array_push(&self->iterators, ((StackIterator) { + .node = head->node + })); + } + + bool all_iterators_done = false; + while (!all_iterators_done) { + all_iterators_done = true; + + for (uint32_t i = 0; i < self->iterators.size; i++) { + StackIterator iterator = self->iterators.contents[i]; + StackNode *node = iterator.node; + + for (uint32_t j = 0; j < visited_nodes.size; j++) { + if (visited_nodes.contents[j] == node) { + node = NULL; + break; + } + } + + if (!node) continue; + all_iterators_done = false; + + fprintf(f, "node_%p [", (void *)node); + if (node->state == ERROR_STATE) { + fprintf(f, "label=\"?\""); + } else if ( + node->link_count == 1 && + node->links[0].subtree.ptr && + ts_subtree_extra(node->links[0].subtree) + ) { + fprintf(f, "shape=point margin=0 label=\"\""); + } else { + fprintf(f, "label=\"%d\"", node->state); + } + + fprintf( + f, + " tooltip=\"position: %u,%u\nnode_count:%u\nerror_cost: %u\ndynamic_precedence: %d\"];\n", + node->position.extent.row + 1, + node->position.extent.column, + node->node_count, + node->error_cost, + node->dynamic_precedence + ); + + for (int j = 0; j < node->link_count; j++) { + StackLink link = node->links[j]; + fprintf(f, "node_%p -> node_%p [", (void *)node, (void *)link.node); + if (link.is_pending) fprintf(f, "style=dashed "); + if (link.subtree.ptr && ts_subtree_extra(link.subtree)) fprintf(f, "fontcolor=gray "); + + if (!link.subtree.ptr) { + fprintf(f, "color=red"); + } 
else { + fprintf(f, "label=\""); + bool quoted = ts_subtree_visible(link.subtree) && !ts_subtree_named(link.subtree); + if (quoted) fprintf(f, "'"); + ts_language_write_symbol_as_dot_string(language, f, ts_subtree_symbol(link.subtree)); + if (quoted) fprintf(f, "'"); + fprintf(f, "\""); + fprintf( + f, + "labeltooltip=\"error_cost: %u\ndynamic_precedence: %" PRId32 "\"", + ts_subtree_error_cost(link.subtree), + ts_subtree_dynamic_precedence(link.subtree) + ); + } + + fprintf(f, "];\n"); + + StackIterator *next_iterator; + if (j == 0) { + next_iterator = &self->iterators.contents[i]; + } else { + array_push(&self->iterators, iterator); + next_iterator = array_back(&self->iterators); + } + next_iterator->node = link.node; + } + + array_push(&visited_nodes, node); + } + } + + fprintf(f, "}\n"); + + array_delete(&visited_nodes); + return true; +} + +#undef forceinline diff --git a/shcat_c/parser/src/stack.h b/shcat_c/parser/src/stack.h new file mode 100644 index 00000000..86abbc9d --- /dev/null +++ b/shcat_c/parser/src/stack.h @@ -0,0 +1,133 @@ +#ifndef TREE_SITTER_PARSE_STACK_H_ +#define TREE_SITTER_PARSE_STACK_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./array.h" +#include "./subtree.h" +#include "./error_costs.h" +#include + +typedef struct Stack Stack; + +typedef unsigned StackVersion; +#define STACK_VERSION_NONE ((StackVersion)-1) + +typedef struct { + SubtreeArray subtrees; + StackVersion version; +} StackSlice; +typedef Array(StackSlice) StackSliceArray; + +typedef struct { + Length position; + unsigned depth; + TSStateId state; +} StackSummaryEntry; +typedef Array(StackSummaryEntry) StackSummary; + +// Create a stack. +Stack *ts_stack_new(SubtreePool *); + +// Release the memory reserved for a given stack. +void ts_stack_delete(Stack *); + +// Get the stack's current number of versions. +uint32_t ts_stack_version_count(const Stack *); + +// Get the state at the top of the given version of the stack. 
If the stack is
// empty, this returns the initial state, 0.
TSStateId ts_stack_state(const Stack *, StackVersion);

// Get the last external token associated with a given version of the stack.
Subtree ts_stack_last_external_token(const Stack *, StackVersion);

// Set the last external token associated with a given version of the stack.
void ts_stack_set_last_external_token(Stack *, StackVersion, Subtree );

// Get the position of the given version of the stack within the document.
Length ts_stack_position(const Stack *, StackVersion);

// Push a tree and state onto the given version of the stack.
//
// This transfers ownership of the tree to the Stack. Callers that
// need to retain ownership of the tree for their own purposes should
// first retain the tree.
//
// NOTE(review): the `bool` parameter is unnamed in this prototype; it is
// presumably the "pending" flag of the pushed link — confirm against the
// definition in stack.c.
void ts_stack_push(Stack *, StackVersion, Subtree , bool, TSStateId);

// Pop the given number of entries from the given version of the stack. This
// operation can increase the number of stack versions by revealing multiple
// versions which had previously been merged. It returns an array that
// specifies the index of each revealed version and the trees that were
// removed from that version.
StackSliceArray ts_stack_pop_count(Stack *, StackVersion, uint32_t count);

// Remove an error at the top of the given version of the stack.
SubtreeArray ts_stack_pop_error(Stack *, StackVersion);

// Remove any pending trees from the top of the given version of the stack.
StackSliceArray ts_stack_pop_pending(Stack *, StackVersion);

// Remove all trees from the given version of the stack.
StackSliceArray ts_stack_pop_all(Stack *, StackVersion);

// Get the maximum number of tree nodes reachable from this version of the stack
// since the last error was detected.
unsigned ts_stack_node_count_since_error(const Stack *, StackVersion);

// NOTE(review): presumably the dynamic precedence accumulated on the given
// version (stack nodes carry a `dynamic_precedence` field) — confirm against
// the definition in stack.c.
int ts_stack_dynamic_precedence(Stack *, StackVersion);

bool ts_stack_has_advanced_since_error(const Stack *, StackVersion);

// Compute a summary of all the parse states near the top of the given
// version of the stack and store the summary for later retrieval.
void ts_stack_record_summary(Stack *, StackVersion, unsigned max_depth);

// Retrieve a summary of all the parse states near the top of the
// given version of the stack.
StackSummary *ts_stack_get_summary(Stack *, StackVersion);

// Get the total cost of all errors on the given version of the stack.
unsigned ts_stack_error_cost(const Stack *, StackVersion version);

// Merge the given two stack versions if possible, returning true
// if they were successfully merged and false otherwise.
bool ts_stack_merge(Stack *, StackVersion, StackVersion);

// Determine whether the given two stack versions can be merged.
bool ts_stack_can_merge(Stack *, StackVersion, StackVersion);

// Reactivate a paused version. The version must currently be paused (this is
// asserted). Returns the lookahead subtree that was stored while the version
// was paused, marks the version active again and resets the stored lookahead
// to NULL_SUBTREE.
Subtree ts_stack_resume(Stack *, StackVersion);

// NOTE(review): presumably marks the version as paused and stores the given
// subtree as the lookahead that ts_stack_resume() later returns — confirm
// against the definition in stack.c.
void ts_stack_pause(Stack *, StackVersion, Subtree);

// NOTE(review): presumably marks the version as halted — confirm against the
// definition in stack.c.
void ts_stack_halt(Stack *, StackVersion);

// Whether the status of the given version is "active".
bool ts_stack_is_active(const Stack *, StackVersion);

// Whether the status of the given version is "paused".
bool ts_stack_is_paused(const Stack *, StackVersion);

// Whether the status of the given version is "halted".
bool ts_stack_is_halted(const Stack *, StackVersion);

void ts_stack_renumber_version(Stack *, StackVersion, StackVersion);

void ts_stack_swap_versions(Stack *, StackVersion, StackVersion);

StackVersion ts_stack_copy_version(Stack *, StackVersion);

// Remove the given version from the stack.
+void ts_stack_remove_version(Stack *, StackVersion); + +void ts_stack_clear(Stack *); + +bool ts_stack_print_dot_graph(Stack *, const TSLanguage *, FILE *); + +typedef void (*StackIterateCallback)(void *, TSStateId, uint32_t); + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_PARSE_STACK_H_ diff --git a/shcat_c/parser/src/subtree.c b/shcat_c/parser/src/subtree.c new file mode 100644 index 00000000..85614d42 --- /dev/null +++ b/shcat_c/parser/src/subtree.c @@ -0,0 +1,1238 @@ +#include +#include +#include +#include +#include +#include + +#include "./array.h" + +#include "./error_costs.h" +#include "./language.h" +#include "./length.h" +#include "./subtree.h" +#include + +typedef struct +{ + Length start; + Length old_end; + Length new_end; +} Edit; + +#define TS_MAX_INLINE_TREE_LENGTH UINT8_MAX +#define TS_MAX_TREE_POOL_SIZE 32 + +// ExternalScannerState + +void ts_external_scanner_state_init(ExternalScannerState *self, + const char *data, unsigned length) +{ + self->length = length; + if (length > sizeof(self->short_data)) + { + self->long_data = malloc(length); + memcpy(self->long_data, data, length); + } + else + { + memcpy(self->short_data, data, length); + } +} + +ExternalScannerState ts_external_scanner_state_copy( + const ExternalScannerState *self) +{ + ExternalScannerState result = *self; + if (self->length > sizeof(self->short_data)) + { + result.long_data = malloc(self->length); + memcpy(result.long_data, self->long_data, self->length); + } + return result; +} + +void ts_external_scanner_state_delete(ExternalScannerState *self) +{ + if (self->length > sizeof(self->short_data)) + { + free(self->long_data); + } +} + +const char *ts_external_scanner_state_data(const ExternalScannerState *self) +{ + if (self->length > sizeof(self->short_data)) + { + return self->long_data; + } + else + { + return self->short_data; + } +} + +bool ts_external_scanner_state_eq(const ExternalScannerState *self, + const char *buffer, unsigned length) +{ + return 
self->length == length && + memcmp(ts_external_scanner_state_data(self), buffer, length) == 0; +} + +// SubtreeArray + +void ts_subtree_array_copy(SubtreeArray self, SubtreeArray *dest) +{ + dest->size = self.size; + dest->capacity = self.capacity; + dest->contents = self.contents; + if (self.capacity > 0) + { + dest->contents = calloc(self.capacity, sizeof(Subtree)); + memcpy(dest->contents, self.contents, self.size * sizeof(Subtree)); + for (uint32_t i = 0; i < self.size; i++) + { + ts_subtree_retain(dest->contents[i]); + } + } +} + +void ts_subtree_array_clear(SubtreePool *pool, SubtreeArray *self) +{ + for (uint32_t i = 0; i < self->size; i++) + { + ts_subtree_release(pool, self->contents[i]); + } + array_clear(self); +} + +void ts_subtree_array_delete(SubtreePool *pool, SubtreeArray *self) +{ + ts_subtree_array_clear(pool, self); + array_delete(self); +} + +void ts_subtree_array_remove_trailing_extras(SubtreeArray *self, + SubtreeArray *destination) +{ + array_clear(destination); + while (self->size > 0) + { + Subtree last = self->contents[self->size - 1]; + if (ts_subtree_extra(last)) + { + self->size--; + array_push(destination, last); + } + else + { + break; + } + } + ts_subtree_array_reverse(destination); +} + +void ts_subtree_array_reverse(SubtreeArray *self) +{ + for (uint32_t i = 0, limit = self->size / 2; i < limit; i++) + { + size_t reverse_index = self->size - 1 - i; + Subtree swap = self->contents[i]; + self->contents[i] = self->contents[reverse_index]; + self->contents[reverse_index] = swap; + } +} + +// SubtreePool + +SubtreePool ts_subtree_pool_new(uint32_t capacity) +{ + SubtreePool self = {array_new(), array_new()}; + array_reserve(&self.free_trees, capacity); + return self; +} + +void ts_subtree_pool_delete(SubtreePool *self) +{ + if (self->free_trees.contents) + { + for (unsigned i = 0; i < self->free_trees.size; i++) + { + free(self->free_trees.contents[i].ptr); + } + array_delete(&self->free_trees); + } + if (self->tree_stack.contents) + 
array_delete(&self->tree_stack); +} + +static SubtreeHeapData *ts_subtree_pool_allocate(SubtreePool *self) +{ + if (self->free_trees.size > 0) + { + return array_pop(&self->free_trees).ptr; + } + else + { + return malloc(sizeof(SubtreeHeapData)); + } +} + +static void ts_subtree_pool_free(SubtreePool *self, SubtreeHeapData *tree) +{ + if (self->free_trees.capacity > 0 && + self->free_trees.size + 1 <= TS_MAX_TREE_POOL_SIZE) + { + array_push(&self->free_trees, (MutableSubtree){.ptr = tree}); + } + else + { + free(tree); + } +} + +// Subtree + +static inline bool ts_subtree_can_inline(Length padding, Length size, + uint32_t lookahead_bytes) +{ + return padding.bytes < TS_MAX_INLINE_TREE_LENGTH && + padding.extent.row < 16 && + padding.extent.column < TS_MAX_INLINE_TREE_LENGTH && + size.extent.row == 0 && + size.extent.column < TS_MAX_INLINE_TREE_LENGTH && + lookahead_bytes < 16; +} + +Subtree ts_subtree_new_leaf(SubtreePool *pool, TSSymbol symbol, Length padding, + Length size, uint32_t lookahead_bytes, + TSStateId parse_state, bool has_external_tokens, + bool depends_on_column, bool is_keyword, + const TSLanguage *language) +{ + TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); + bool extra = symbol == ts_builtin_sym_end; + + bool is_inline = (symbol <= UINT8_MAX && !has_external_tokens && + ts_subtree_can_inline(padding, size, lookahead_bytes)); + + if (is_inline) + { + return (Subtree){{ + .parse_state = parse_state, + .symbol = symbol, + .padding_bytes = padding.bytes, + .padding_rows = padding.extent.row, + .padding_columns = padding.extent.column, + .size_bytes = size.bytes, + .lookahead_bytes = lookahead_bytes, + .visible = metadata.visible, + .named = metadata.named, + .extra = extra, + .has_changes = false, + .is_missing = false, + .is_keyword = is_keyword, + .is_inline = true, + }}; + } + else + { + SubtreeHeapData *data = ts_subtree_pool_allocate(pool); + *data = (SubtreeHeapData){ + .ref_count = 1, + .padding = padding, + .size = 
size, + .lookahead_bytes = lookahead_bytes, + .error_cost = 0, + .child_count = 0, + .symbol = symbol, + .parse_state = parse_state, + .visible = metadata.visible, + .named = metadata.named, + .extra = extra, + .fragile_left = false, + .fragile_right = false, + .has_changes = false, + .has_external_tokens = has_external_tokens, + .has_external_scanner_state_change = false, + .depends_on_column = depends_on_column, + .is_missing = false, + .is_keyword = is_keyword, + {{.first_leaf = {.symbol = 0, .parse_state = 0}}}}; + return (Subtree){.ptr = data}; + } +} + +void ts_subtree_set_symbol(MutableSubtree *self, TSSymbol symbol, + const TSLanguage *language) +{ + TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); + if (self->data.is_inline) + { + assert(symbol < UINT8_MAX); + self->data.symbol = symbol; + self->data.named = metadata.named; + self->data.visible = metadata.visible; + } + else + { + self->ptr->symbol = symbol; + self->ptr->named = metadata.named; + self->ptr->visible = metadata.visible; + } +} + +Subtree ts_subtree_new_error(SubtreePool *pool, int32_t lookahead_char, + Length padding, Length size, + uint32_t bytes_scanned, TSStateId parse_state, + const TSLanguage *language) +{ + Subtree result = ts_subtree_new_leaf(pool, ts_builtin_sym_error, padding, + size, bytes_scanned, parse_state, + false, false, false, language); + SubtreeHeapData *data = (SubtreeHeapData *)result.ptr; + data->fragile_left = true; + data->fragile_right = true; + data->lookahead_char = lookahead_char; + return result; +} + +// Clone a subtree. 
+MutableSubtree ts_subtree_clone(Subtree self) +{ + size_t alloc_size = ts_subtree_alloc_size(self.ptr->child_count); + Subtree *new_children = malloc(alloc_size); + Subtree *old_children = ts_subtree_children(self); + memcpy(new_children, old_children, alloc_size); + SubtreeHeapData *result = + (SubtreeHeapData *)&new_children[self.ptr->child_count]; + if (self.ptr->child_count > 0) + { + for (uint32_t i = 0; i < self.ptr->child_count; i++) + { + ts_subtree_retain(new_children[i]); + } + } + else if (self.ptr->has_external_tokens) + { + result->external_scanner_state = + ts_external_scanner_state_copy(&self.ptr->external_scanner_state); + } + result->ref_count = 1; + return (MutableSubtree){.ptr = result}; +} + +// Get mutable version of a subtree. +// +// This takes ownership of the subtree. If the subtree has only one owner, +// this will directly convert it into a mutable version. Otherwise, it will +// perform a copy. +MutableSubtree ts_subtree_make_mut(SubtreePool *pool, Subtree self) +{ + if (self.data.is_inline) + return (MutableSubtree){self.data}; + if (self.ptr->ref_count == 1) + return ts_subtree_to_mut_unsafe(self); + MutableSubtree result = ts_subtree_clone(self); + ts_subtree_release(pool, self); + return result; +} + +static void ts_subtree__compress(MutableSubtree self, unsigned count, + const TSLanguage *language, + MutableSubtreeArray *stack) +{ + unsigned initial_stack_size = stack->size; + + MutableSubtree tree = self; + TSSymbol symbol = tree.ptr->symbol; + for (unsigned i = 0; i < count; i++) + { + if (tree.ptr->ref_count > 1 || tree.ptr->child_count < 2) + break; + + MutableSubtree child = + ts_subtree_to_mut_unsafe(ts_subtree_children(tree)[0]); + if (child.data.is_inline || child.ptr->child_count < 2 || + child.ptr->ref_count > 1 || child.ptr->symbol != symbol) + break; + + MutableSubtree grandchild = + ts_subtree_to_mut_unsafe(ts_subtree_children(child)[0]); + if (grandchild.data.is_inline || grandchild.ptr->child_count < 2 || + 
grandchild.ptr->ref_count > 1 || grandchild.ptr->symbol != symbol) + break; + + ts_subtree_children(tree)[0] = ts_subtree_from_mut(grandchild); + ts_subtree_children(child)[0] = + ts_subtree_children(grandchild)[grandchild.ptr->child_count - 1]; + ts_subtree_children(grandchild)[grandchild.ptr->child_count - 1] = + ts_subtree_from_mut(child); + array_push(stack, tree); + tree = grandchild; + } + + while (stack->size > initial_stack_size) + { + tree = array_pop(stack); + MutableSubtree child = + ts_subtree_to_mut_unsafe(ts_subtree_children(tree)[0]); + MutableSubtree grandchild = ts_subtree_to_mut_unsafe( + ts_subtree_children(child)[child.ptr->child_count - 1]); + ts_subtree_summarize_children(grandchild, language); + ts_subtree_summarize_children(child, language); + ts_subtree_summarize_children(tree, language); + } +} + +void ts_subtree_balance(Subtree self, SubtreePool *pool, + const TSLanguage *language) +{ + array_clear(&pool->tree_stack); + + if (ts_subtree_child_count(self) > 0 && self.ptr->ref_count == 1) + { + array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self)); + } + + while (pool->tree_stack.size > 0) + { + MutableSubtree tree = array_pop(&pool->tree_stack); + + if (tree.ptr->repeat_depth > 0) + { + Subtree child1 = ts_subtree_children(tree)[0]; + Subtree child2 = + ts_subtree_children(tree)[tree.ptr->child_count - 1]; + long repeat_delta = (long)ts_subtree_repeat_depth(child1) - + (long)ts_subtree_repeat_depth(child2); + if (repeat_delta > 0) + { + unsigned n = (unsigned)repeat_delta; + for (unsigned i = n / 2; i > 0; i /= 2) + { + ts_subtree__compress(tree, i, language, &pool->tree_stack); + n -= i; + } + } + } + + for (uint32_t i = 0; i < tree.ptr->child_count; i++) + { + Subtree child = ts_subtree_children(tree)[i]; + if (ts_subtree_child_count(child) > 0 && child.ptr->ref_count == 1) + { + array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(child)); + } + } + } +} + +// Assign all of the node's properties that depend on its children. 
+void ts_subtree_summarize_children(MutableSubtree self, + const TSLanguage *language) +{ + assert(!self.data.is_inline); + + self.ptr->named_child_count = 0; + self.ptr->visible_child_count = 0; + self.ptr->error_cost = 0; + self.ptr->repeat_depth = 0; + self.ptr->visible_descendant_count = 0; + self.ptr->has_external_tokens = false; + self.ptr->depends_on_column = false; + self.ptr->has_external_scanner_state_change = false; + self.ptr->dynamic_precedence = 0; + + uint32_t structural_index = 0; + const TSSymbol *alias_sequence = + ts_language_alias_sequence(language, self.ptr->production_id); + uint32_t lookahead_end_byte = 0; + + const Subtree *children = ts_subtree_children(self); + for (uint32_t i = 0; i < self.ptr->child_count; i++) + { + Subtree child = children[i]; + + if (self.ptr->size.extent.row == 0 && + ts_subtree_depends_on_column(child)) + { + self.ptr->depends_on_column = true; + } + + if (ts_subtree_has_external_scanner_state_change(child)) + { + self.ptr->has_external_scanner_state_change = true; + } + + if (i == 0) + { + self.ptr->padding = ts_subtree_padding(child); + self.ptr->size = ts_subtree_size(child); + } + else + { + self.ptr->size = + length_add(self.ptr->size, ts_subtree_total_size(child)); + } + + uint32_t child_lookahead_end_byte = self.ptr->padding.bytes + + self.ptr->size.bytes + + ts_subtree_lookahead_bytes(child); + if (child_lookahead_end_byte > lookahead_end_byte) + { + lookahead_end_byte = child_lookahead_end_byte; + } + + if (ts_subtree_symbol(child) != ts_builtin_sym_error_repeat) + { + self.ptr->error_cost += ts_subtree_error_cost(child); + } + + uint32_t grandchild_count = ts_subtree_child_count(child); + if (self.ptr->symbol == ts_builtin_sym_error || + self.ptr->symbol == ts_builtin_sym_error_repeat) + { + if (!ts_subtree_extra(child) && + !(ts_subtree_is_error(child) && grandchild_count == 0)) + { + if (ts_subtree_visible(child)) + { + self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE; + } + else if (grandchild_count 
> 0) + { + self.ptr->error_cost += ERROR_COST_PER_SKIPPED_TREE * + child.ptr->visible_child_count; + } + } + } + + self.ptr->dynamic_precedence += ts_subtree_dynamic_precedence(child); + self.ptr->visible_descendant_count += + ts_subtree_visible_descendant_count(child); + + if (alias_sequence && alias_sequence[structural_index] != 0 && + !ts_subtree_extra(child)) + { + self.ptr->visible_descendant_count++; + self.ptr->visible_child_count++; + if (ts_language_symbol_metadata(language, + alias_sequence[structural_index]) + .named) + { + self.ptr->named_child_count++; + } + } + else if (ts_subtree_visible(child)) + { + self.ptr->visible_descendant_count++; + self.ptr->visible_child_count++; + if (ts_subtree_named(child)) + self.ptr->named_child_count++; + } + else if (grandchild_count > 0) + { + self.ptr->visible_child_count += child.ptr->visible_child_count; + self.ptr->named_child_count += child.ptr->named_child_count; + } + + if (ts_subtree_has_external_tokens(child)) + self.ptr->has_external_tokens = true; + + if (ts_subtree_is_error(child)) + { + self.ptr->fragile_left = self.ptr->fragile_right = true; + self.ptr->parse_state = TS_TREE_STATE_NONE; + } + + if (!ts_subtree_extra(child)) + structural_index++; + } + + self.ptr->lookahead_bytes = + lookahead_end_byte - self.ptr->size.bytes - self.ptr->padding.bytes; + + if (self.ptr->symbol == ts_builtin_sym_error || + self.ptr->symbol == ts_builtin_sym_error_repeat) + { + self.ptr->error_cost += + ERROR_COST_PER_RECOVERY + + ERROR_COST_PER_SKIPPED_CHAR * self.ptr->size.bytes + + ERROR_COST_PER_SKIPPED_LINE * self.ptr->size.extent.row; + } + + if (self.ptr->child_count > 0) + { + Subtree first_child = children[0]; + Subtree last_child = children[self.ptr->child_count - 1]; + + self.ptr->first_leaf.symbol = ts_subtree_leaf_symbol(first_child); + self.ptr->first_leaf.parse_state = + ts_subtree_leaf_parse_state(first_child); + + if (ts_subtree_fragile_left(first_child)) + self.ptr->fragile_left = true; + if 
(ts_subtree_fragile_right(last_child)) + self.ptr->fragile_right = true; + + if (self.ptr->child_count >= 2 && !self.ptr->visible && + !self.ptr->named && + ts_subtree_symbol(first_child) == self.ptr->symbol) + { + if (ts_subtree_repeat_depth(first_child) > + ts_subtree_repeat_depth(last_child)) + { + self.ptr->repeat_depth = + ts_subtree_repeat_depth(first_child) + 1; + } + else + { + self.ptr->repeat_depth = + ts_subtree_repeat_depth(last_child) + 1; + } + } + } +} + +// Create a new parent node with the given children. +// +// This takes ownership of the children array. +MutableSubtree ts_subtree_new_node(TSSymbol symbol, SubtreeArray *children, + unsigned production_id, + const TSLanguage *language) +{ + TSSymbolMetadata metadata = ts_language_symbol_metadata(language, symbol); + bool fragile = + symbol == ts_builtin_sym_error || symbol == ts_builtin_sym_error_repeat; + + // Allocate the node's data at the end of the array of children. + size_t new_byte_size = ts_subtree_alloc_size(children->size); + if (children->capacity * sizeof(Subtree) < new_byte_size) + { + children->contents = realloc(children->contents, new_byte_size); + children->capacity = (uint32_t)(new_byte_size / sizeof(Subtree)); + } + SubtreeHeapData *data = + (SubtreeHeapData *)&children->contents[children->size]; + + *data = (SubtreeHeapData){.ref_count = 1, + .symbol = symbol, + .child_count = children->size, + .visible = metadata.visible, + .named = metadata.named, + .has_changes = false, + .has_external_scanner_state_change = false, + .fragile_left = fragile, + .fragile_right = fragile, + .is_keyword = false, + {{ + .visible_descendant_count = 0, + .production_id = production_id, + .first_leaf = {.symbol = 0, .parse_state = 0}, + }}}; + MutableSubtree result = {.ptr = data}; + ts_subtree_summarize_children(result, language); + return result; +} + +// Create a new error node containing the given children. +// +// This node is treated as 'extra'. 
Its children are prevented from having +// having any effect on the parse state. +Subtree ts_subtree_new_error_node(SubtreeArray *children, bool extra, + const TSLanguage *language) +{ + MutableSubtree result = + ts_subtree_new_node(ts_builtin_sym_error, children, 0, language); + result.ptr->extra = extra; + return ts_subtree_from_mut(result); +} + +// Create a new 'missing leaf' node. +// +// This node is treated as 'extra'. Its children are prevented from having +// having any effect on the parse state. +Subtree ts_subtree_new_missing_leaf(SubtreePool *pool, TSSymbol symbol, + Length padding, uint32_t lookahead_bytes, + const TSLanguage *language) +{ + Subtree result = + ts_subtree_new_leaf(pool, symbol, padding, length_zero(), + lookahead_bytes, 0, false, false, false, language); + if (result.data.is_inline) + { + result.data.is_missing = true; + } + else + { + ((SubtreeHeapData *)result.ptr)->is_missing = true; + } + return result; +} + +void ts_subtree_retain(Subtree self) +{ + if (self.data.is_inline) + return; + assert(self.ptr->ref_count > 0); + *(uint32_t *)&self.ptr->ref_count += 1; + assert(self.ptr->ref_count != 0); +} + +void ts_subtree_release(SubtreePool *pool, Subtree self) +{ + if (self.data.is_inline) + return; + array_clear(&pool->tree_stack); + + assert(self.ptr->ref_count > 0); + if (--(*(uint32_t *)&self.ptr->ref_count) == 0) + { + array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(self)); + } + + while (pool->tree_stack.size > 0) + { + MutableSubtree tree = array_pop(&pool->tree_stack); + if (tree.ptr->child_count > 0) + { + Subtree *children = ts_subtree_children(tree); + for (uint32_t i = 0; i < tree.ptr->child_count; i++) + { + Subtree child = children[i]; + if (child.data.is_inline) + continue; + assert(child.ptr->ref_count > 0); + if (--*(uint32_t *)&child.ptr->ref_count == 0) + { + array_push(&pool->tree_stack, + ts_subtree_to_mut_unsafe(child)); + } + } + free(children); + } + else + { + if (tree.ptr->has_external_tokens) + { + 
ts_external_scanner_state_delete( + &tree.ptr->external_scanner_state); + } + ts_subtree_pool_free(pool, tree.ptr); + } + } +} + +int ts_subtree_compare(Subtree left, Subtree right, SubtreePool *pool) +{ + array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(left)); + array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(right)); + + while (pool->tree_stack.size > 0) + { + right = ts_subtree_from_mut(array_pop(&pool->tree_stack)); + left = ts_subtree_from_mut(array_pop(&pool->tree_stack)); + + int result = 0; + if (ts_subtree_symbol(left) < ts_subtree_symbol(right)) + result = -1; + else if (ts_subtree_symbol(right) < ts_subtree_symbol(left)) + result = 1; + else if (ts_subtree_child_count(left) < ts_subtree_child_count(right)) + result = -1; + else if (ts_subtree_child_count(right) < ts_subtree_child_count(left)) + result = 1; + if (result != 0) + { + array_clear(&pool->tree_stack); + return result; + } + + for (uint32_t i = ts_subtree_child_count(left); i > 0; i--) + { + Subtree left_child = ts_subtree_children(left)[i - 1]; + Subtree right_child = ts_subtree_children(right)[i - 1]; + array_push(&pool->tree_stack, ts_subtree_to_mut_unsafe(left_child)); + array_push(&pool->tree_stack, + ts_subtree_to_mut_unsafe(right_child)); + } + } + + return 0; +} + +static inline void ts_subtree_set_has_changes(MutableSubtree *self) +{ + if (self->data.is_inline) + { + self->data.has_changes = true; + } + else + { + self->ptr->has_changes = true; + } +} + +Subtree ts_subtree_edit(Subtree self, const TSInputEdit *input_edit, + SubtreePool *pool) +{ + typedef struct + { + Subtree *tree; + Edit edit; + } EditEntry; + + Array(EditEntry) stack = array_new(); + array_push( + &stack, + ((EditEntry){ + .tree = &self, + .edit = + (Edit){ + .start = {input_edit->start_byte, input_edit->start_point}, + .old_end = {input_edit->old_end_byte, + input_edit->old_end_point}, + .new_end = {input_edit->new_end_byte, + input_edit->new_end_point}, + }, + })); + + while (stack.size) + { + 
EditEntry entry = array_pop(&stack); + Edit edit = entry.edit; + bool is_noop = edit.old_end.bytes == edit.start.bytes && + edit.new_end.bytes == edit.start.bytes; + bool is_pure_insertion = edit.old_end.bytes == edit.start.bytes; + bool invalidate_first_row = ts_subtree_depends_on_column(*entry.tree); + + Length size = ts_subtree_size(*entry.tree); + Length padding = ts_subtree_padding(*entry.tree); + Length total_size = length_add(padding, size); + uint32_t lookahead_bytes = ts_subtree_lookahead_bytes(*entry.tree); + uint32_t end_byte = total_size.bytes + lookahead_bytes; + if (edit.start.bytes > end_byte || + (is_noop && edit.start.bytes == end_byte)) + continue; + + // If the edit is entirely within the space before this subtree, then + // shift this subtree over according to the edit without changing its + // size. + if (edit.old_end.bytes <= padding.bytes) + { + padding = + length_add(edit.new_end, length_sub(padding, edit.old_end)); + } + + // If the edit starts in the space before this subtree and extends into + // this subtree, shrink the subtree's content to compensate for the + // change in the space before it. + else if (edit.start.bytes < padding.bytes) + { + size = + length_saturating_sub(size, length_sub(edit.old_end, padding)); + padding = edit.new_end; + } + + // If the edit is a pure insertion right at the start of the subtree, + // shift the subtree over according to the insertion. + else if (edit.start.bytes == padding.bytes && is_pure_insertion) + { + padding = edit.new_end; + } + + // If the edit is within this subtree, resize the subtree to reflect the + // edit. 
+ else if (edit.start.bytes < total_size.bytes || + (edit.start.bytes == total_size.bytes && is_pure_insertion)) + { + size = length_add(length_sub(edit.new_end, padding), + length_saturating_sub(total_size, edit.old_end)); + } + + MutableSubtree result = ts_subtree_make_mut(pool, *entry.tree); + + if (result.data.is_inline) + { + if (ts_subtree_can_inline(padding, size, lookahead_bytes)) + { + result.data.padding_bytes = padding.bytes; + result.data.padding_rows = padding.extent.row; + result.data.padding_columns = padding.extent.column; + result.data.size_bytes = size.bytes; + } + else + { + SubtreeHeapData *data = ts_subtree_pool_allocate(pool); + data->ref_count = 1; + data->padding = padding; + data->size = size; + data->lookahead_bytes = lookahead_bytes; + data->error_cost = 0; + data->child_count = 0; + data->symbol = result.data.symbol; + data->parse_state = result.data.parse_state; + data->visible = result.data.visible; + data->named = result.data.named; + data->extra = result.data.extra; + data->fragile_left = false; + data->fragile_right = false; + data->has_changes = false; + data->has_external_tokens = false; + data->depends_on_column = false; + data->is_missing = result.data.is_missing; + data->is_keyword = result.data.is_keyword; + result.ptr = data; + } + } + else + { + result.ptr->padding = padding; + result.ptr->size = size; + } + + ts_subtree_set_has_changes(&result); + *entry.tree = ts_subtree_from_mut(result); + + Length child_left, child_right = length_zero(); + for (uint32_t i = 0, n = ts_subtree_child_count(*entry.tree); i < n; + i++) + { + Subtree *child = &ts_subtree_children(*entry.tree)[i]; + Length child_size = ts_subtree_total_size(*child); + child_left = child_right; + child_right = length_add(child_left, child_size); + + // If this child ends before the edit, it is not affected. 
+ if (child_right.bytes + ts_subtree_lookahead_bytes(*child) < + edit.start.bytes) + continue; + + // Keep editing child nodes until a node is reached that starts + // after the edit. Also, if this node's validity depends on its + // column position, then continue invaliditing child nodes until + // reaching a line break. + if (((child_left.bytes > edit.old_end.bytes) || + (child_left.bytes == edit.old_end.bytes && + child_size.bytes > 0 && i > 0)) && + (!invalidate_first_row || + child_left.extent.row > entry.tree->ptr->padding.extent.row)) + { + break; + } + + // Transform edit into the child's coordinate space. + Edit child_edit = { + .start = length_saturating_sub(edit.start, child_left), + .old_end = length_saturating_sub(edit.old_end, child_left), + .new_end = length_saturating_sub(edit.new_end, child_left), + }; + + // Interpret all inserted text as applying to the *first* child that + // touches the edit. Subsequent children are only never have any + // text inserted into them; they are only shrunk to compensate for + // the edit. + if (child_right.bytes > edit.start.bytes || + (child_right.bytes == edit.start.bytes && is_pure_insertion)) + { + edit.new_end = edit.start; + } + + // Children that occur before the edit are not reshaped by the edit. + else + { + child_edit.old_end = child_edit.start; + child_edit.new_end = child_edit.start; + } + + // Queue processing of this child's subtree. 
+ array_push(&stack, ((EditEntry){ + .tree = child, + .edit = child_edit, + })); + } + } + + array_delete(&stack); + return self; +} + +Subtree ts_subtree_last_external_token(Subtree tree) +{ + if (!ts_subtree_has_external_tokens(tree)) + return NULL_SUBTREE; + while (tree.ptr->child_count > 0) + { + for (uint32_t i = tree.ptr->child_count - 1; i + 1 > 0; i--) + { + Subtree child = ts_subtree_children(tree)[i]; + if (ts_subtree_has_external_tokens(child)) + { + tree = child; + break; + } + } + } + return tree; +} + +static size_t ts_subtree__write_char_to_string(char *str, size_t n, int32_t chr) +{ + if (chr == -1) + return snprintf(str, n, "INVALID"); + else if (chr == '\0') + return snprintf(str, n, "'\\0'"); + else if (chr == '\n') + return snprintf(str, n, "'\\n'"); + else if (chr == '\t') + return snprintf(str, n, "'\\t'"); + else if (chr == '\r') + return snprintf(str, n, "'\\r'"); + else if (0 < chr && chr < 128 && isprint(chr)) + return snprintf(str, n, "'%c'", chr); + else + return snprintf(str, n, "%d", chr); +} + +static const char *const ROOT_FIELD = "__ROOT__"; + +static size_t ts_subtree__write_to_string( + Subtree self, char *string, size_t limit, const TSLanguage *language, + bool include_all, TSSymbol alias_symbol, bool alias_is_named, + const char *field_name) +{ + if (!self.ptr) + return snprintf(string, limit, "(NULL)"); + + char *cursor = string; + char **writer = (limit > 1) ? &cursor : &string; + bool is_root = field_name == ROOT_FIELD; + bool is_visible = + include_all || ts_subtree_missing(self) || + (alias_symbol ? 
alias_is_named + : ts_subtree_visible(self) && ts_subtree_named(self)); + + if (is_visible) + { + if (!is_root) + { + cursor += snprintf(*writer, limit, " "); + if (field_name) + { + cursor += snprintf(*writer, limit, "%s: ", field_name); + } + } + + if (ts_subtree_is_error(self) && ts_subtree_child_count(self) == 0 && + self.ptr->size.bytes > 0) + { + cursor += snprintf(*writer, limit, "(UNEXPECTED "); + cursor += ts_subtree__write_char_to_string( + *writer, limit, self.ptr->lookahead_char); + } + else + { + TSSymbol symbol = + alias_symbol ? alias_symbol : ts_subtree_symbol(self); + const char *symbol_name = ts_language_symbol_name(language, symbol); + if (ts_subtree_missing(self)) + { + cursor += snprintf(*writer, limit, "(MISSING "); + if (alias_is_named || ts_subtree_named(self)) + { + cursor += snprintf(*writer, limit, "%s", symbol_name); + } + else + { + cursor += snprintf(*writer, limit, "\"%s\"", symbol_name); + } + } + else + { + cursor += snprintf(*writer, limit, "(%s", symbol_name); + } + } + } + else if (is_root) + { + TSSymbol symbol = alias_symbol ? 
alias_symbol : ts_subtree_symbol(self); + const char *symbol_name = ts_language_symbol_name(language, symbol); + if (ts_subtree_child_count(self) > 0) + { + cursor += snprintf(*writer, limit, "(%s", symbol_name); + } + else if (ts_subtree_named(self)) + { + cursor += snprintf(*writer, limit, "(%s)", symbol_name); + } + else + { + cursor += snprintf(*writer, limit, "(\"%s\")", symbol_name); + } + } + + if (ts_subtree_child_count(self)) + { + const TSSymbol *alias_sequence = + ts_language_alias_sequence(language, self.ptr->production_id); + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map(language, self.ptr->production_id, &field_map, + &field_map_end); + + uint32_t structural_child_index = 0; + for (uint32_t i = 0; i < self.ptr->child_count; i++) + { + Subtree child = ts_subtree_children(self)[i]; + if (ts_subtree_extra(child)) + { + cursor += + ts_subtree__write_to_string(child, *writer, limit, language, + include_all, 0, false, NULL); + } + else + { + TSSymbol subtree_alias_symbol = + alias_sequence ? alias_sequence[structural_child_index] : 0; + bool subtree_alias_is_named = + subtree_alias_symbol ? ts_language_symbol_metadata( + language, subtree_alias_symbol) + .named + : false; + + const char *child_field_name = is_visible ? 
NULL : field_name; + for (const TSFieldMapEntry *map = field_map; + map < field_map_end; map++) + { + if (!map->inherited && + map->child_index == structural_child_index) + { + child_field_name = language->field_names[map->field_id]; + break; + } + } + + cursor += ts_subtree__write_to_string( + child, *writer, limit, language, include_all, + subtree_alias_symbol, subtree_alias_is_named, + child_field_name); + structural_child_index++; + } + } + } + + if (is_visible) + cursor += snprintf(*writer, limit, ")"); + + return cursor - string; +} + +// Render `self` as a newly allocated, NUL-terminated parenthesized string. +// The tree is written twice: first into a one-byte scratch buffer, only to +// learn the required length, then into a buffer of exactly that size. +// The caller owns the returned buffer and must `free` it. Returns NULL if the +// buffer cannot be allocated. +char *ts_subtree_string(Subtree self, TSSymbol alias_symbol, + bool alias_is_named, const TSLanguage *language, + bool include_all) +{ + char scratch_string[1]; + size_t size = ts_subtree__write_to_string(self, scratch_string, 1, language, + include_all, alias_symbol, + alias_is_named, ROOT_FIELD) + + 1; + char *result = malloc(size * sizeof(char)); + // Without this check the second pass would write through a NULL pointer. + if (!result) + return NULL; + ts_subtree__write_to_string(self, result, size, language, include_all, + alias_symbol, alias_is_named, ROOT_FIELD); + return result; +} + +void ts_subtree__print_dot_graph(const Subtree *self, uint32_t start_offset, + const TSLanguage *language, + TSSymbol alias_symbol, FILE *f) +{ + TSSymbol subtree_symbol = ts_subtree_symbol(*self); + TSSymbol symbol = alias_symbol ?
alias_symbol : subtree_symbol; + uint32_t end_offset = start_offset + ts_subtree_total_bytes(*self); + fprintf(f, "tree_%p [label=\"", (void *)self); + ts_language_write_symbol_as_dot_string(language, f, symbol); + fprintf(f, "\""); + + if (ts_subtree_child_count(*self) == 0) + fprintf(f, ", shape=plaintext"); + if (ts_subtree_extra(*self)) + fprintf(f, ", fontcolor=gray"); + + fprintf(f, + ", tooltip=\"" + "range: %u - %u\n" + "state: %d\n" + "error-cost: %u\n" + "has-changes: %u\n" + "depends-on-column: %u\n" + "descendant-count: %u\n" + "repeat-depth: %u\n" + "lookahead-bytes: %u", + start_offset, end_offset, ts_subtree_parse_state(*self), + ts_subtree_error_cost(*self), ts_subtree_has_changes(*self), + ts_subtree_depends_on_column(*self), + ts_subtree_visible_descendant_count(*self), + ts_subtree_repeat_depth(*self), ts_subtree_lookahead_bytes(*self)); + + if (ts_subtree_is_error(*self) && ts_subtree_child_count(*self) == 0 && + self->ptr->lookahead_char != 0) + { + fprintf(f, "\ncharacter: '%c'", self->ptr->lookahead_char); + } + + fprintf(f, "\"]\n"); + + uint32_t child_start_offset = start_offset; + uint32_t child_info_offset = + language->max_alias_sequence_length * ts_subtree_production_id(*self); + for (uint32_t i = 0, n = ts_subtree_child_count(*self); i < n; i++) + { + const Subtree *child = &ts_subtree_children(*self)[i]; + TSSymbol subtree_alias_symbol = 0; + if (!ts_subtree_extra(*child) && child_info_offset) + { + subtree_alias_symbol = language->alias_sequences[child_info_offset]; + child_info_offset++; + } + ts_subtree__print_dot_graph(child, child_start_offset, language, + subtree_alias_symbol, f); + fprintf(f, "tree_%p -> tree_%p [tooltip=%u]\n", (void *)self, + (void *)child, i); + child_start_offset += ts_subtree_total_bytes(*child); + } +} + +void ts_subtree_print_dot_graph(Subtree self, const TSLanguage *language, + FILE *f) +{ + fprintf(f, "digraph tree {\n"); + fprintf(f, "edge [arrowhead=none]\n"); + ts_subtree__print_dot_graph(&self, 0, 
language, 0, f); + fprintf(f, "}\n"); +} + +const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self) +{ + static const ExternalScannerState empty_state = {{.short_data = {0}}, + .length = 0}; + if (self.ptr && !self.data.is_inline && self.ptr->has_external_tokens && + self.ptr->child_count == 0) + { + return &self.ptr->external_scanner_state; + } + else + { + return &empty_state; + } +} + +bool ts_subtree_external_scanner_state_eq(Subtree self, Subtree other) +{ + const ExternalScannerState *state_self = + ts_subtree_external_scanner_state(self); + const ExternalScannerState *state_other = + ts_subtree_external_scanner_state(other); + return ts_external_scanner_state_eq( + state_self, ts_external_scanner_state_data(state_other), + state_other->length); +} diff --git a/shcat_c/parser/src/subtree.h b/shcat_c/parser/src/subtree.h new file mode 100644 index 00000000..f140ecdb --- /dev/null +++ b/shcat_c/parser/src/subtree.h @@ -0,0 +1,382 @@ +#ifndef TREE_SITTER_SUBTREE_H_ +#define TREE_SITTER_SUBTREE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include "./length.h" +#include "./array.h" +#include "./error_costs.h" +#include "./host.h" +#include "tree_sitter/api.h" +#include "./parser.h" + +#define TS_TREE_STATE_NONE USHRT_MAX +#define NULL_SUBTREE ((Subtree) {.ptr = NULL}) + +// The serialized state of an external scanner. +// +// Every time an external token subtree is created after a call to an +// external scanner, the scanner's `serialize` function is called to +// retrieve a serialized copy of its state. The bytes are then copied +// onto the subtree itself so that the scanner's state can later be +// restored using its `deserialize` function. +// +// Small byte arrays are stored inline, and long ones are allocated +// separately on the heap. +typedef struct { + union { + char *long_data; + char short_data[24]; + }; + uint32_t length; +} ExternalScannerState; + +// A compact representation of a subtree. 
+// +// This representation is used for small leaf nodes that are not +// errors, and were not created by an external scanner. +// +// The idea behind the layout of this struct is that the `is_inline` +// bit will fall exactly into the same location as the least significant +// bit of the pointer in `Subtree` or `MutableSubtree`, respectively. +// Because of alignment, for any valid pointer this will be 0, giving +// us the opportunity to make use of this bit to signify whether to use +// the pointer or the inline struct. +typedef struct SubtreeInlineData SubtreeInlineData; + +#define SUBTREE_BITS \ + bool visible : 1; \ + bool named : 1; \ + bool extra : 1; \ + bool has_changes : 1; \ + bool is_missing : 1; \ + bool is_keyword : 1; + +#define SUBTREE_SIZE \ + uint8_t padding_columns; \ + uint8_t padding_rows : 4; \ + uint8_t lookahead_bytes : 4; \ + uint8_t padding_bytes; \ + uint8_t size_bytes; + +#if TS_BIG_ENDIAN +#if TS_PTR_SIZE == 32 + +struct SubtreeInlineData { + uint16_t parse_state; + uint8_t symbol; + SUBTREE_BITS + bool unused : 1; + bool is_inline : 1; + SUBTREE_SIZE +}; + +#else + +struct SubtreeInlineData { + SUBTREE_SIZE + uint16_t parse_state; + uint8_t symbol; + SUBTREE_BITS + bool unused : 1; + bool is_inline : 1; +}; + +#endif +#else + +struct SubtreeInlineData { + bool is_inline : 1; + SUBTREE_BITS + uint8_t symbol; + uint16_t parse_state; + SUBTREE_SIZE +}; + +#endif + +#undef SUBTREE_BITS +#undef SUBTREE_SIZE + +// A heap-allocated representation of a subtree. +// +// This representation is used for parent nodes, external tokens, +// errors, and other leaf nodes whose data is too large to fit into +// the inline representation. 
+typedef struct { + volatile uint32_t ref_count; + Length padding; + Length size; + uint32_t lookahead_bytes; + uint32_t error_cost; + uint32_t child_count; + TSSymbol symbol; + TSStateId parse_state; + + bool visible : 1; + bool named : 1; + bool extra : 1; + bool fragile_left : 1; + bool fragile_right : 1; + bool has_changes : 1; + bool has_external_tokens : 1; + bool has_external_scanner_state_change : 1; + bool depends_on_column: 1; + bool is_missing : 1; + bool is_keyword : 1; + + union { + // Non-terminal subtrees (`child_count > 0`) + struct { + uint32_t visible_child_count; + uint32_t named_child_count; + uint32_t visible_descendant_count; + int32_t dynamic_precedence; + uint16_t repeat_depth; + uint16_t production_id; + struct { + TSSymbol symbol; + TSStateId parse_state; + } first_leaf; + }; + + // External terminal subtrees (`child_count == 0 && has_external_tokens`) + ExternalScannerState external_scanner_state; + + // Error terminal subtrees (`child_count == 0 && symbol == ts_builtin_sym_error`) + int32_t lookahead_char; + }; +} SubtreeHeapData; + +// The fundamental building block of a syntax tree. +typedef union { + SubtreeInlineData data; + const SubtreeHeapData *ptr; +} Subtree; + +// Like Subtree, but mutable. 
+typedef union { + SubtreeInlineData data; + SubtreeHeapData *ptr; +} MutableSubtree; + +typedef Array(Subtree) SubtreeArray; +typedef Array(MutableSubtree) MutableSubtreeArray; + +typedef struct { + MutableSubtreeArray free_trees; + MutableSubtreeArray tree_stack; +} SubtreePool; + +void ts_external_scanner_state_init(ExternalScannerState *, const char *, unsigned); +const char *ts_external_scanner_state_data(const ExternalScannerState *); +bool ts_external_scanner_state_eq(const ExternalScannerState *self, const char *, unsigned); +void ts_external_scanner_state_delete(ExternalScannerState *self); + +void ts_subtree_array_copy(SubtreeArray, SubtreeArray *); +void ts_subtree_array_clear(SubtreePool *, SubtreeArray *); +void ts_subtree_array_delete(SubtreePool *, SubtreeArray *); +void ts_subtree_array_remove_trailing_extras(SubtreeArray *, SubtreeArray *); +void ts_subtree_array_reverse(SubtreeArray *); + +SubtreePool ts_subtree_pool_new(uint32_t capacity); +void ts_subtree_pool_delete(SubtreePool *); + +Subtree ts_subtree_new_leaf( + SubtreePool *, TSSymbol, Length, Length, uint32_t, + TSStateId, bool, bool, bool, const TSLanguage * +); +Subtree ts_subtree_new_error( + SubtreePool *, int32_t, Length, Length, uint32_t, TSStateId, const TSLanguage * +); +MutableSubtree ts_subtree_new_node(TSSymbol, SubtreeArray *, unsigned, const TSLanguage *); +Subtree ts_subtree_new_error_node(SubtreeArray *, bool, const TSLanguage *); +Subtree ts_subtree_new_missing_leaf(SubtreePool *, TSSymbol, Length, uint32_t, const TSLanguage *); +MutableSubtree ts_subtree_make_mut(SubtreePool *, Subtree); +void ts_subtree_retain(Subtree); +void ts_subtree_release(SubtreePool *, Subtree); +int ts_subtree_compare(Subtree, Subtree, SubtreePool *); +void ts_subtree_set_symbol(MutableSubtree *, TSSymbol, const TSLanguage *); +void ts_subtree_summarize(MutableSubtree, const Subtree *, uint32_t, const TSLanguage *); +void ts_subtree_summarize_children(MutableSubtree, const TSLanguage *); +void 
ts_subtree_balance(Subtree, SubtreePool *, const TSLanguage *); +Subtree ts_subtree_edit(Subtree, const TSInputEdit *edit, SubtreePool *); +char *ts_subtree_string(Subtree, TSSymbol, bool, const TSLanguage *, bool include_all); +void ts_subtree_print_dot_graph(Subtree, const TSLanguage *, FILE *); +Subtree ts_subtree_last_external_token(Subtree); +const ExternalScannerState *ts_subtree_external_scanner_state(Subtree self); +bool ts_subtree_external_scanner_state_eq(Subtree, Subtree); + +#define SUBTREE_GET(self, name) ((self).data.is_inline ? (self).data.name : (self).ptr->name) + +static inline TSSymbol ts_subtree_symbol(Subtree self) { return SUBTREE_GET(self, symbol); } +static inline bool ts_subtree_visible(Subtree self) { return SUBTREE_GET(self, visible); } +static inline bool ts_subtree_named(Subtree self) { return SUBTREE_GET(self, named); } +static inline bool ts_subtree_extra(Subtree self) { return SUBTREE_GET(self, extra); } +static inline bool ts_subtree_has_changes(Subtree self) { return SUBTREE_GET(self, has_changes); } +static inline bool ts_subtree_missing(Subtree self) { return SUBTREE_GET(self, is_missing); } +static inline bool ts_subtree_is_keyword(Subtree self) { return SUBTREE_GET(self, is_keyword); } +static inline TSStateId ts_subtree_parse_state(Subtree self) { return SUBTREE_GET(self, parse_state); } +static inline uint32_t ts_subtree_lookahead_bytes(Subtree self) { return SUBTREE_GET(self, lookahead_bytes); } + +#undef SUBTREE_GET + +// Get the size needed to store a heap-allocated subtree with the given +// number of children. +static inline size_t ts_subtree_alloc_size(uint32_t child_count) { + return child_count * sizeof(Subtree) + sizeof(SubtreeHeapData); +} + +// Get a subtree's children, which are allocated immediately before the +// tree's own heap data. +#define ts_subtree_children(self) \ + ((self).data.is_inline ? 
NULL : (Subtree *)((self).ptr) - (self).ptr->child_count) + +static inline void ts_subtree_set_extra(MutableSubtree *self, bool is_extra) { + if (self->data.is_inline) { + self->data.extra = is_extra; + } else { + self->ptr->extra = is_extra; + } +} + +static inline TSSymbol ts_subtree_leaf_symbol(Subtree self) { + if (self.data.is_inline) return self.data.symbol; + if (self.ptr->child_count == 0) return self.ptr->symbol; + return self.ptr->first_leaf.symbol; +} + +static inline TSStateId ts_subtree_leaf_parse_state(Subtree self) { + if (self.data.is_inline) return self.data.parse_state; + if (self.ptr->child_count == 0) return self.ptr->parse_state; + return self.ptr->first_leaf.parse_state; +} + +static inline Length ts_subtree_padding(Subtree self) { + if (self.data.is_inline) { + Length result = {self.data.padding_bytes, {self.data.padding_rows, self.data.padding_columns}}; + return result; + } else { + return self.ptr->padding; + } +} + +static inline Length ts_subtree_size(Subtree self) { + if (self.data.is_inline) { + Length result = {self.data.size_bytes, {0, self.data.size_bytes}}; + return result; + } else { + return self.ptr->size; + } +} + +static inline Length ts_subtree_total_size(Subtree self) { + return length_add(ts_subtree_padding(self), ts_subtree_size(self)); +} + +static inline uint32_t ts_subtree_total_bytes(Subtree self) { + return ts_subtree_total_size(self).bytes; +} + +static inline uint32_t ts_subtree_child_count(Subtree self) { + return self.data.is_inline ? 0 : self.ptr->child_count; +} + +static inline uint32_t ts_subtree_repeat_depth(Subtree self) { + return self.data.is_inline ? 0 : self.ptr->repeat_depth; +} + +static inline uint32_t ts_subtree_is_repetition(Subtree self) { + return self.data.is_inline + ? 0 + : !self.ptr->named && !self.ptr->visible && self.ptr->child_count != 0; +} + +static inline uint32_t ts_subtree_visible_descendant_count(Subtree self) { + return (self.data.is_inline || self.ptr->child_count == 0) + ? 
0 + : self.ptr->visible_descendant_count; +} + +static inline uint32_t ts_subtree_visible_child_count(Subtree self) { + if (ts_subtree_child_count(self) > 0) { + return self.ptr->visible_child_count; + } else { + return 0; + } +} + +static inline uint32_t ts_subtree_error_cost(Subtree self) { + if (ts_subtree_missing(self)) { + return ERROR_COST_PER_MISSING_TREE + ERROR_COST_PER_RECOVERY; + } else { + return self.data.is_inline ? 0 : self.ptr->error_cost; + } +} + +static inline int32_t ts_subtree_dynamic_precedence(Subtree self) { + return (self.data.is_inline || self.ptr->child_count == 0) ? 0 : self.ptr->dynamic_precedence; +} + +static inline uint16_t ts_subtree_production_id(Subtree self) { + if (ts_subtree_child_count(self) > 0) { + return self.ptr->production_id; + } else { + return 0; + } +} + +static inline bool ts_subtree_fragile_left(Subtree self) { + return self.data.is_inline ? false : self.ptr->fragile_left; +} + +static inline bool ts_subtree_fragile_right(Subtree self) { + return self.data.is_inline ? false : self.ptr->fragile_right; +} + +static inline bool ts_subtree_has_external_tokens(Subtree self) { + return self.data.is_inline ? false : self.ptr->has_external_tokens; +} + +static inline bool ts_subtree_has_external_scanner_state_change(Subtree self) { + return self.data.is_inline ? false : self.ptr->has_external_scanner_state_change; +} + +static inline bool ts_subtree_depends_on_column(Subtree self) { + return self.data.is_inline ? false : self.ptr->depends_on_column; +} + +static inline bool ts_subtree_is_fragile(Subtree self) { + return self.data.is_inline ? 
false : (self.ptr->fragile_left || self.ptr->fragile_right); +} + +static inline bool ts_subtree_is_error(Subtree self) { + return ts_subtree_symbol(self) == ts_builtin_sym_error; +} + +static inline bool ts_subtree_is_eof(Subtree self) { + return ts_subtree_symbol(self) == ts_builtin_sym_end; +} + +static inline Subtree ts_subtree_from_mut(MutableSubtree self) { + Subtree result; + result.data = self.data; + return result; +} + +static inline MutableSubtree ts_subtree_to_mut_unsafe(Subtree self) { + MutableSubtree result; + result.data = self.data; + return result; +} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_SUBTREE_H_ diff --git a/shcat_c/parser/src/tree.c b/shcat_c/parser/src/tree.c new file mode 100644 index 00000000..e7d28c38 --- /dev/null +++ b/shcat_c/parser/src/tree.c @@ -0,0 +1,140 @@ +#define _POSIX_C_SOURCE 200112L + +#include "tree_sitter/api.h" +#include "./array.h" + +#include "./length.h" +#include "./subtree.h" +#include "./tree_cursor.h" +#include "./tree.h" + +TSTree *ts_tree_new( + Subtree root, const TSLanguage *language, + const TSRange *included_ranges, unsigned included_range_count +) { + TSTree *result = malloc(sizeof(TSTree)); + result->root = root; + result->language = ts_language_copy(language); + result->included_ranges = calloc(included_range_count, sizeof(TSRange)); + memcpy(result->included_ranges, included_ranges, included_range_count * sizeof(TSRange)); + result->included_range_count = included_range_count; + return result; +} + +TSTree *ts_tree_copy(const TSTree *self) { + ts_subtree_retain(self->root); + return ts_tree_new(self->root, self->language, self->included_ranges, self->included_range_count); +} + +void ts_tree_delete(TSTree *self) { + if (!self) return; + + SubtreePool pool = ts_subtree_pool_new(0); + ts_subtree_release(&pool, self->root); + ts_subtree_pool_delete(&pool); + ts_language_delete(self->language); + free(self->included_ranges); + free(self); +} + +TSNode ts_tree_root_node(const TSTree *self) 
{ + return ts_node_new(self, &self->root, ts_subtree_padding(self->root), 0); +} + +TSNode ts_tree_root_node_with_offset( + const TSTree *self, + uint32_t offset_bytes, + TSPoint offset_extent +) { + Length offset = {offset_bytes, offset_extent}; + return ts_node_new(self, &self->root, length_add(offset, ts_subtree_padding(self->root)), 0); +} + +const TSLanguage *ts_tree_language(const TSTree *self) { + return self->language; +} + +void ts_tree_edit(TSTree *self, const TSInputEdit *edit) { + for (unsigned i = 0; i < self->included_range_count; i++) { + TSRange *range = &self->included_ranges[i]; + if (range->end_byte >= edit->old_end_byte) { + if (range->end_byte != UINT32_MAX) { + range->end_byte = edit->new_end_byte + (range->end_byte - edit->old_end_byte); + range->end_point = point_add( + edit->new_end_point, + point_sub(range->end_point, edit->old_end_point) + ); + if (range->end_byte < edit->new_end_byte) { + range->end_byte = UINT32_MAX; + range->end_point = POINT_MAX; + } + } + } else if (range->end_byte > edit->start_byte) { + range->end_byte = edit->start_byte; + range->end_point = edit->start_point; + } + if (range->start_byte >= edit->old_end_byte) { + range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte); + range->start_point = point_add( + edit->new_end_point, + point_sub(range->start_point, edit->old_end_point) + ); + if (range->start_byte < edit->new_end_byte) { + range->start_byte = UINT32_MAX; + range->start_point = POINT_MAX; + } + } else if (range->start_byte > edit->start_byte) { + range->start_byte = edit->start_byte; + range->start_point = edit->start_point; + } + } + + SubtreePool pool = ts_subtree_pool_new(0); + self->root = ts_subtree_edit(self->root, edit, &pool); + ts_subtree_pool_delete(&pool); +} + +TSRange *ts_tree_included_ranges(const TSTree *self, uint32_t *length) { + *length = self->included_range_count; + TSRange *ranges = calloc(self->included_range_count, sizeof(TSRange)); + memcpy(ranges, 
self->included_ranges, self->included_range_count * sizeof(TSRange)); + return ranges; +} + +#ifdef _WIN32 + +#include +#include + +int _ts_dup(HANDLE handle) { + HANDLE dup_handle; + if (!DuplicateHandle( + GetCurrentProcess(), handle, + GetCurrentProcess(), &dup_handle, + 0, FALSE, DUPLICATE_SAME_ACCESS + )) return -1; + + return _open_osfhandle((intptr_t)dup_handle, 0); +} + +// Write a DOT graph of the tree to a duplicate of `fd`, so that closing the +// stream leaves the caller's descriptor open. Does nothing if the descriptor +// cannot be duplicated or opened as a stream. +void ts_tree_print_dot_graph(const TSTree *self, int fd) { + int dup_fd = _ts_dup((HANDLE)_get_osfhandle(fd)); + if (dup_fd < 0) return; + FILE *file = _fdopen(dup_fd, "a"); + if (!file) { + // The stream never took ownership, so release the duplicate here. + _close(dup_fd); + return; + } + ts_subtree_print_dot_graph(self->root, self->language, file); + fclose(file); +} + +#else + +#include + +int _ts_dup(int file_descriptor) { + return dup(file_descriptor); +} + +// Write a DOT graph of the tree to a duplicate of `file_descriptor`, so that +// closing the stream leaves the caller's descriptor open. Does nothing if the +// descriptor cannot be duplicated or opened as a stream. +void ts_tree_print_dot_graph(const TSTree *self, int file_descriptor) { + int dup_fd = _ts_dup(file_descriptor); + if (dup_fd < 0) return; + FILE *file = fdopen(dup_fd, "a"); + if (!file) { + // The stream never took ownership, so release the duplicate here. + close(dup_fd); + return; + } + ts_subtree_print_dot_graph(self->root, self->language, file); + fclose(file); +} + +#endif diff --git a/shcat_c/parser/src/tree.h b/shcat_c/parser/src/tree.h new file mode 100644 index 00000000..f012f888 --- /dev/null +++ b/shcat_c/parser/src/tree.h @@ -0,0 +1,31 @@ +#ifndef TREE_SITTER_TREE_H_ +#define TREE_SITTER_TREE_H_ + +#include "./subtree.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + const Subtree *child; + const Subtree *parent; + Length position; + TSSymbol alias_symbol; +} ParentCacheEntry; + +struct TSTree { + Subtree root; + const TSLanguage *language; + TSRange *included_ranges; + unsigned included_range_count; +}; + +TSTree *ts_tree_new(Subtree root, const TSLanguage *language, const TSRange *, unsigned); +TSNode ts_node_new(const TSTree *, const Subtree *, Length, TSSymbol); + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_TREE_H_ diff --git a/shcat_c/parser/src/tree_cursor.c b/shcat_c/parser/src/tree_cursor.c new file mode 100644 index 00000000..457bb14c --- /dev/null +++ b/shcat_c/parser/src/tree_cursor.c @@ -0,0 +1,714 @@ +#include "tree_sitter/api.h" + +#include "./tree_cursor.h" +#include "./language.h" +#include
"./tree.h" + +typedef struct { + Subtree parent; + const TSTree *tree; + Length position; + uint32_t child_index; + uint32_t structural_child_index; + uint32_t descendant_index; + const TSSymbol *alias_sequence; +} CursorChildIterator; + +// CursorChildIterator + +static inline bool ts_tree_cursor_is_entry_visible(const TreeCursor *self, uint32_t index) { + TreeCursorEntry *entry = &self->stack.contents[index]; + if (index == 0 || ts_subtree_visible(*entry->subtree)) { + return true; + } else if (!ts_subtree_extra(*entry->subtree)) { + TreeCursorEntry *parent_entry = &self->stack.contents[index - 1]; + return ts_language_alias_at( + self->tree->language, + parent_entry->subtree->ptr->production_id, + entry->structural_child_index + ); + } else { + return false; + } +} + +static inline CursorChildIterator ts_tree_cursor_iterate_children(const TreeCursor *self) { + TreeCursorEntry *last_entry = array_back(&self->stack); + if (ts_subtree_child_count(*last_entry->subtree) == 0) { + return (CursorChildIterator) {NULL_SUBTREE, self->tree, length_zero(), 0, 0, 0, NULL}; + } + const TSSymbol *alias_sequence = ts_language_alias_sequence( + self->tree->language, + last_entry->subtree->ptr->production_id + ); + + uint32_t descendant_index = last_entry->descendant_index; + if (ts_tree_cursor_is_entry_visible(self, self->stack.size - 1)) { + descendant_index += 1; + } + + return (CursorChildIterator) { + .tree = self->tree, + .parent = *last_entry->subtree, + .position = last_entry->position, + .child_index = 0, + .structural_child_index = 0, + .descendant_index = descendant_index, + .alias_sequence = alias_sequence, + }; +} + +static inline bool ts_tree_cursor_child_iterator_next( + CursorChildIterator *self, + TreeCursorEntry *result, + bool *visible +) { + if (!self->parent.ptr || self->child_index == self->parent.ptr->child_count) return false; + const Subtree *child = &ts_subtree_children(self->parent)[self->child_index]; + *result = (TreeCursorEntry) { + .subtree = child, 
+ .position = self->position, + .child_index = self->child_index, + .structural_child_index = self->structural_child_index, + .descendant_index = self->descendant_index, + }; + *visible = ts_subtree_visible(*child); + bool extra = ts_subtree_extra(*child); + if (!extra) { + if (self->alias_sequence) { + *visible |= self->alias_sequence[self->structural_child_index]; + } + self->structural_child_index++; + } + + self->descendant_index += ts_subtree_visible_descendant_count(*child); + if (*visible) { + self->descendant_index += 1; + } + + self->position = length_add(self->position, ts_subtree_size(*child)); + self->child_index++; + + if (self->child_index < self->parent.ptr->child_count) { + Subtree next_child = ts_subtree_children(self->parent)[self->child_index]; + self->position = length_add(self->position, ts_subtree_padding(next_child)); + } + + return true; +} + +// Return a position that, when `b` is added to it, yields `a`. This +// can only be computed if `b` has zero rows. Otherwise, this function +// returns `LENGTH_UNDEFINED`, and the caller needs to recompute +// the position some other way. 
+static inline Length length_backtrack(Length a, Length b) { + if (length_is_undefined(a) || b.extent.row != 0) { + return LENGTH_UNDEFINED; + } + + Length result; + result.bytes = a.bytes - b.bytes; + result.extent.row = a.extent.row; + result.extent.column = a.extent.column - b.extent.column; + return result; +} + +// Step the iterator one child backwards. Stores the child at the current +// index in `result`, sets `*visible` from the child's own visibility or its +// alias, then moves the index and position back by one child. Returns false +// when there is no parent or the index is not a valid child index. +static inline bool ts_tree_cursor_child_iterator_previous( + CursorChildIterator *self, + TreeCursorEntry *result, + bool *visible +) { + // this is mostly a reverse `ts_tree_cursor_child_iterator_next` taking into + // account unsigned underflow: decrementing index 0 wraps around to a value + // that is never below `child_count`, so one bounds check covers both cases + if (!self->parent.ptr || self->child_index >= self->parent.ptr->child_count) return false; + const Subtree *child = &ts_subtree_children(self->parent)[self->child_index]; + *result = (TreeCursorEntry) { + .subtree = child, + .position = self->position, + .child_index = self->child_index, + .structural_child_index = self->structural_child_index, + }; + *visible = ts_subtree_visible(*child); + bool extra = ts_subtree_extra(*child); + if (!extra && self->alias_sequence) { + *visible |= self->alias_sequence[self->structural_child_index]; + self->structural_child_index--; + } + + self->position = length_backtrack(self->position, ts_subtree_padding(*child)); + self->child_index--; + + // unsigned can underflow so compare it to child_count + if (self->child_index < self->parent.ptr->child_count) { + Subtree previous_child = ts_subtree_children(self->parent)[self->child_index]; + Length size = ts_subtree_size(previous_child); + self->position = length_backtrack(self->position, size); + } + + return true; +} + +// TSTreeCursor - lifecycle + +TSTreeCursor ts_tree_cursor_new(TSNode node) { + TSTreeCursor self = {NULL, NULL, {0, 0, 0}}; + ts_tree_cursor_init((TreeCursor *)&self, node); + return self; +} + +void ts_tree_cursor_reset(TSTreeCursor *_self, TSNode node) { + ts_tree_cursor_init((TreeCursor *)_self, node); +} + +void ts_tree_cursor_init(TreeCursor *self, TSNode node) { + self->tree = node.tree; + self->root_alias_symbol =
node.context[3]; + array_clear(&self->stack); + array_push(&self->stack, ((TreeCursorEntry) { + .subtree = (const Subtree *)node.id, + .position = { + ts_node_start_byte(node), + ts_node_start_point(node) + }, + .child_index = 0, + .structural_child_index = 0, + .descendant_index = 0, + })); +} + +void ts_tree_cursor_delete(TSTreeCursor *_self) { + TreeCursor *self = (TreeCursor *)_self; + array_delete(&self->stack); +} + +// TSTreeCursor - walking the tree + +TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *_self) { + TreeCursor *self = (TreeCursor *)_self; + bool visible; + TreeCursorEntry entry; + CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); + while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { + if (visible) { + array_push(&self->stack, entry); + return TreeCursorStepVisible; + } + if (ts_subtree_visible_child_count(*entry.subtree) > 0) { + array_push(&self->stack, entry); + return TreeCursorStepHidden; + } + } + return TreeCursorStepNone; +} + +bool ts_tree_cursor_goto_first_child(TSTreeCursor *self) { + for (;;) { + switch (ts_tree_cursor_goto_first_child_internal(self)) { + case TreeCursorStepHidden: + continue; + case TreeCursorStepVisible: + return true; + default: + return false; + } + } + return false; +} + +TreeCursorStep ts_tree_cursor_goto_last_child_internal(TSTreeCursor *_self) { + TreeCursor *self = (TreeCursor *)_self; + bool visible; + TreeCursorEntry entry; + CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); + if (!iterator.parent.ptr || iterator.parent.ptr->child_count == 0) return TreeCursorStepNone; + + TreeCursorEntry last_entry = {0}; + TreeCursorStep last_step = TreeCursorStepNone; + while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { + if (visible) { + last_entry = entry; + last_step = TreeCursorStepVisible; + } + else if (ts_subtree_visible_child_count(*entry.subtree) > 0) { + last_entry = entry; + last_step = 
TreeCursorStepHidden; + } + } + if (last_entry.subtree) { + array_push(&self->stack, last_entry); + return last_step; + } + + return TreeCursorStepNone; +} + +bool ts_tree_cursor_goto_last_child(TSTreeCursor *self) { + for (;;) { + switch (ts_tree_cursor_goto_last_child_internal(self)) { + case TreeCursorStepHidden: + continue; + case TreeCursorStepVisible: + return true; + default: + return false; + } + } + return false; +} + +static inline int64_t ts_tree_cursor_goto_first_child_for_byte_and_point( + TSTreeCursor *_self, + uint32_t goal_byte, + TSPoint goal_point +) { + TreeCursor *self = (TreeCursor *)_self; + uint32_t initial_size = self->stack.size; + uint32_t visible_child_index = 0; + + bool did_descend; + do { + did_descend = false; + + bool visible; + TreeCursorEntry entry; + CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); + while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { + Length entry_end = length_add(entry.position, ts_subtree_size(*entry.subtree)); + bool at_goal = entry_end.bytes >= goal_byte && point_gte(entry_end.extent, goal_point); + uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree); + if (at_goal) { + if (visible) { + array_push(&self->stack, entry); + return visible_child_index; + } + if (visible_child_count > 0) { + array_push(&self->stack, entry); + did_descend = true; + break; + } + } else if (visible) { + visible_child_index++; + } else { + visible_child_index += visible_child_count; + } + } + } while (did_descend); + + self->stack.size = initial_size; + return -1; +} + +int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *self, uint32_t goal_byte) { + return ts_tree_cursor_goto_first_child_for_byte_and_point(self, goal_byte, POINT_ZERO); +} + +int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint goal_point) { + return ts_tree_cursor_goto_first_child_for_byte_and_point(self, 0, goal_point); +} + +TreeCursorStep 
ts_tree_cursor_goto_sibling_internal( + TSTreeCursor *_self, + bool (*advance)(CursorChildIterator *, TreeCursorEntry *, bool *)) { + TreeCursor *self = (TreeCursor *)_self; + uint32_t initial_size = self->stack.size; + + while (self->stack.size > 1) { + TreeCursorEntry entry = array_pop(&self->stack); + CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); + iterator.child_index = entry.child_index; + iterator.structural_child_index = entry.structural_child_index; + iterator.position = entry.position; + iterator.descendant_index = entry.descendant_index; + + bool visible = false; + advance(&iterator, &entry, &visible); + if (visible && self->stack.size + 1 < initial_size) break; + + while (advance(&iterator, &entry, &visible)) { + if (visible) { + array_push(&self->stack, entry); + return TreeCursorStepVisible; + } + + if (ts_subtree_visible_child_count(*entry.subtree)) { + array_push(&self->stack, entry); + return TreeCursorStepHidden; + } + } + } + + self->stack.size = initial_size; + return TreeCursorStepNone; +} + +TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *_self) { + return ts_tree_cursor_goto_sibling_internal(_self, ts_tree_cursor_child_iterator_next); +} + +bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *self) { + switch (ts_tree_cursor_goto_next_sibling_internal(self)) { + case TreeCursorStepHidden: + ts_tree_cursor_goto_first_child(self); + return true; + case TreeCursorStepVisible: + return true; + default: + return false; + } +} + +TreeCursorStep ts_tree_cursor_goto_previous_sibling_internal(TSTreeCursor *_self) { + // since subtracting across row loses column information, we may have to + // restore it + TreeCursor *self = (TreeCursor *)_self; + + // for that, save current position before traversing + TreeCursorStep step = ts_tree_cursor_goto_sibling_internal( + _self, ts_tree_cursor_child_iterator_previous); + if (step == TreeCursorStepNone) + return step; + + // if length is already valid, there's no 
need to recompute it + if (!length_is_undefined(array_back(&self->stack)->position)) + return step; + + // restore position from the parent node + const TreeCursorEntry *parent = &self->stack.contents[self->stack.size - 2]; + Length position = parent->position; + uint32_t child_index = array_back(&self->stack)->child_index; + const Subtree *children = ts_subtree_children((*(parent->subtree))); + + if (child_index > 0) { + // skip first child padding since its position should match the position of the parent + position = length_add(position, ts_subtree_size(children[0])); + for (uint32_t i = 1; i < child_index; ++i) { + position = length_add(position, ts_subtree_total_size(children[i])); + } + position = length_add(position, ts_subtree_padding(children[child_index])); + } + + array_back(&self->stack)->position = position; + + return step; +} + +bool ts_tree_cursor_goto_previous_sibling(TSTreeCursor *self) { + switch (ts_tree_cursor_goto_previous_sibling_internal(self)) { + case TreeCursorStepHidden: + ts_tree_cursor_goto_last_child(self); + return true; + case TreeCursorStepVisible: + return true; + default: + return false; + } +} + +bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) { + TreeCursor *self = (TreeCursor *)_self; + for (unsigned i = self->stack.size - 2; i + 1 > 0; i--) { + if (ts_tree_cursor_is_entry_visible(self, i)) { + self->stack.size = i + 1; + return true; + } + } + return false; +} + +void ts_tree_cursor_goto_descendant( + TSTreeCursor *_self, + uint32_t goal_descendant_index +) { + TreeCursor *self = (TreeCursor *)_self; + + // Ascend to the lowest ancestor that contains the goal node. + for (;;) { + uint32_t i = self->stack.size - 1; + TreeCursorEntry *entry = &self->stack.contents[i]; + uint32_t next_descendant_index = + entry->descendant_index + + (ts_tree_cursor_is_entry_visible(self, i) ? 
1 : 0) + + ts_subtree_visible_descendant_count(*entry->subtree); + if ( + (entry->descendant_index <= goal_descendant_index) && + (next_descendant_index > goal_descendant_index) + ) { + break; + } else if (self->stack.size <= 1) { + return; + } else { + self->stack.size--; + } + } + + // Descend to the goal node. + bool did_descend = true; + do { + did_descend = false; + bool visible; + TreeCursorEntry entry; + CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); + if (iterator.descendant_index > goal_descendant_index) { + return; + } + + while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { + if (iterator.descendant_index > goal_descendant_index) { + array_push(&self->stack, entry); + if (visible && entry.descendant_index == goal_descendant_index) { + return; + } else { + did_descend = true; + break; + } + } + } + } while (did_descend); +} + +uint32_t ts_tree_cursor_current_descendant_index(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + TreeCursorEntry *last_entry = array_back(&self->stack); + return last_entry->descendant_index; +} + +TSNode ts_tree_cursor_current_node(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + TreeCursorEntry *last_entry = array_back(&self->stack); + TSSymbol alias_symbol = self->root_alias_symbol; + if (self->stack.size > 1 && !ts_subtree_extra(*last_entry->subtree)) { + TreeCursorEntry *parent_entry = &self->stack.contents[self->stack.size - 2]; + alias_symbol = ts_language_alias_at( + self->tree->language, + parent_entry->subtree->ptr->production_id, + last_entry->structural_child_index + ); + } + return ts_node_new( + self->tree, + last_entry->subtree, + last_entry->position, + alias_symbol + ); +} + +// Private - Get various facts about the current node that are needed +// when executing tree queries. 
+void ts_tree_cursor_current_status( + const TSTreeCursor *_self, + TSFieldId *field_id, + bool *has_later_siblings, + bool *has_later_named_siblings, + bool *can_have_later_siblings_with_this_field, + TSSymbol *supertypes, + unsigned *supertype_count +) { + const TreeCursor *self = (const TreeCursor *)_self; + unsigned max_supertypes = *supertype_count; + *field_id = 0; + *supertype_count = 0; + *has_later_siblings = false; + *has_later_named_siblings = false; + *can_have_later_siblings_with_this_field = false; + + // Walk up the tree, visiting the current node and its invisible ancestors, + // because fields can refer to nodes through invisible *wrapper* nodes, + for (unsigned i = self->stack.size - 1; i > 0; i--) { + TreeCursorEntry *entry = &self->stack.contents[i]; + TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + + const TSSymbol *alias_sequence = ts_language_alias_sequence( + self->tree->language, + parent_entry->subtree->ptr->production_id + ); + + #define subtree_symbol(subtree, structural_child_index) \ + (( \ + !ts_subtree_extra(subtree) && \ + alias_sequence && \ + alias_sequence[structural_child_index] \ + ) ? \ + alias_sequence[structural_child_index] : \ + ts_subtree_symbol(subtree)) + + // Stop walking up when a visible ancestor is found. + TSSymbol entry_symbol = subtree_symbol( + *entry->subtree, + entry->structural_child_index + ); + TSSymbolMetadata entry_metadata = ts_language_symbol_metadata( + self->tree->language, + entry_symbol + ); + if (i != self->stack.size - 1 && entry_metadata.visible) break; + + // Record any supertypes + if (entry_metadata.supertype && *supertype_count < max_supertypes) { + supertypes[*supertype_count] = entry_symbol; + (*supertype_count)++; + } + + // Determine if the current node has later siblings. 
+ if (!*has_later_siblings) { + unsigned sibling_count = parent_entry->subtree->ptr->child_count; + unsigned structural_child_index = entry->structural_child_index; + if (!ts_subtree_extra(*entry->subtree)) structural_child_index++; + for (unsigned j = entry->child_index + 1; j < sibling_count; j++) { + Subtree sibling = ts_subtree_children(*parent_entry->subtree)[j]; + TSSymbolMetadata sibling_metadata = ts_language_symbol_metadata( + self->tree->language, + subtree_symbol(sibling, structural_child_index) + ); + if (sibling_metadata.visible) { + *has_later_siblings = true; + if (*has_later_named_siblings) break; + if (sibling_metadata.named) { + *has_later_named_siblings = true; + break; + } + } else if (ts_subtree_visible_child_count(sibling) > 0) { + *has_later_siblings = true; + if (*has_later_named_siblings) break; + if (sibling.ptr->named_child_count > 0) { + *has_later_named_siblings = true; + break; + } + } + if (!ts_subtree_extra(sibling)) structural_child_index++; + } + } + + #undef subtree_symbol + + if (!ts_subtree_extra(*entry->subtree)) { + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map( + self->tree->language, + parent_entry->subtree->ptr->production_id, + &field_map, &field_map_end + ); + + // Look for a field name associated with the current node. + if (!*field_id) { + for (const TSFieldMapEntry *map = field_map; map < field_map_end; map++) { + if (!map->inherited && map->child_index == entry->structural_child_index) { + *field_id = map->field_id; + break; + } + } + } + + // Determine if the current node can have later siblings with the same field name. 
+ if (*field_id) { + for (const TSFieldMapEntry *map = field_map; map < field_map_end; map++) { + if ( + map->field_id == *field_id && + map->child_index > entry->structural_child_index + ) { + *can_have_later_siblings_with_this_field = true; + break; + } + } + } + } + } +} + +uint32_t ts_tree_cursor_current_depth(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + uint32_t depth = 0; + for (unsigned i = 1; i < self->stack.size; i++) { + if (ts_tree_cursor_is_entry_visible(self, i)) { + depth++; + } + } + return depth; +} + +TSNode ts_tree_cursor_parent_node(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + for (int i = (int)self->stack.size - 2; i >= 0; i--) { + TreeCursorEntry *entry = &self->stack.contents[i]; + bool is_visible = true; + TSSymbol alias_symbol = 0; + if (i > 0) { + TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + alias_symbol = ts_language_alias_at( + self->tree->language, + parent_entry->subtree->ptr->production_id, + entry->structural_child_index + ); + is_visible = (alias_symbol != 0) || ts_subtree_visible(*entry->subtree); + } + if (is_visible) { + return ts_node_new( + self->tree, + entry->subtree, + entry->position, + alias_symbol + ); + } + } + return ts_node_new(NULL, NULL, length_zero(), 0); +} + +TSFieldId ts_tree_cursor_current_field_id(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + + // Walk up the tree, visiting the current node and its invisible ancestors. + for (unsigned i = self->stack.size - 1; i > 0; i--) { + TreeCursorEntry *entry = &self->stack.contents[i]; + TreeCursorEntry *parent_entry = &self->stack.contents[i - 1]; + + // Stop walking up when another visible node is found. 
+ if ( + i != self->stack.size - 1 && + ts_tree_cursor_is_entry_visible(self, i) + ) break; + + if (ts_subtree_extra(*entry->subtree)) break; + + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map( + self->tree->language, + parent_entry->subtree->ptr->production_id, + &field_map, &field_map_end + ); + for (const TSFieldMapEntry *map = field_map; map < field_map_end; map++) { + if (!map->inherited && map->child_index == entry->structural_child_index) { + return map->field_id; + } + } + } + return 0; +} + +const char *ts_tree_cursor_current_field_name(const TSTreeCursor *_self) { + TSFieldId id = ts_tree_cursor_current_field_id(_self); + if (id) { + const TreeCursor *self = (const TreeCursor *)_self; + return self->tree->language->field_names[id]; + } else { + return NULL; + } +} + +TSTreeCursor ts_tree_cursor_copy(const TSTreeCursor *_cursor) { + const TreeCursor *cursor = (const TreeCursor *)_cursor; + TSTreeCursor res = {NULL, NULL, {0, 0}}; + TreeCursor *copy = (TreeCursor *)&res; + copy->tree = cursor->tree; + copy->root_alias_symbol = cursor->root_alias_symbol; + array_init(©->stack); + array_push_all(©->stack, &cursor->stack); + return res; +} + +void ts_tree_cursor_reset_to(TSTreeCursor *_dst, const TSTreeCursor *_src) { + const TreeCursor *cursor = (const TreeCursor *)_src; + TreeCursor *copy = (TreeCursor *)_dst; + copy->tree = cursor->tree; + copy->root_alias_symbol = cursor->root_alias_symbol; + array_clear(©->stack); + array_push_all(©->stack, &cursor->stack); +} diff --git a/shcat_c/parser/src/tree_cursor.h b/shcat_c/parser/src/tree_cursor.h new file mode 100644 index 00000000..96a386df --- /dev/null +++ b/shcat_c/parser/src/tree_cursor.h @@ -0,0 +1,48 @@ +#ifndef TREE_SITTER_TREE_CURSOR_H_ +#define TREE_SITTER_TREE_CURSOR_H_ + +#include "./subtree.h" + +typedef struct { + const Subtree *subtree; + Length position; + uint32_t child_index; + uint32_t structural_child_index; + uint32_t descendant_index; +} TreeCursorEntry; + 
+typedef struct { + const TSTree *tree; + Array(TreeCursorEntry) stack; + TSSymbol root_alias_symbol; +} TreeCursor; + +typedef enum { + TreeCursorStepNone, + TreeCursorStepHidden, + TreeCursorStepVisible, +} TreeCursorStep; + +void ts_tree_cursor_init(TreeCursor *, TSNode); +void ts_tree_cursor_current_status( + const TSTreeCursor *, + TSFieldId *, + bool *, + bool *, + bool *, + TSSymbol *, + unsigned * +); + +TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *); +TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *); + +static inline Subtree ts_tree_cursor_current_subtree(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + TreeCursorEntry *last_entry = array_back(&self->stack); + return *last_entry->subtree; +} + +TSNode ts_tree_cursor_parent_node(const TSTreeCursor *); + +#endif // TREE_SITTER_TREE_CURSOR_H_