Removing even more shit in the parser lib

This commit is contained in:
Maix0 2024-07-03 18:55:25 +02:00
parent 7e88e17d67
commit a7bfe526b0
20 changed files with 215 additions and 461 deletions

View file

@ -6,7 +6,7 @@
/* By: maiboyer <maiboyer@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2024/06/17 12:41:56 by maiboyer #+# #+# */
/* Updated: 2024/07/02 21:55:19 by maiboyer ### ########.fr */
/* Updated: 2024/07/03 18:47:44 by maiboyer ### ########.fr */
/* */
/* ************************************************************************** */
@ -666,7 +666,7 @@ t_error build_sym_expansion(t_parse_node self, t_const_str input, t_ast_node *ou
(void)(self);
if (out == NULL)
return (ERROR);
if (ts_node_symbol(self) != sym_simple_expansion)
if (ts_node_symbol(self) != sym_expansion)
return (ERROR);
ret = ast_alloc(AST_EXPANSION);
ret->data.expansion.kind = E_OP_NONE;

View file

View file

View file

@ -4,7 +4,7 @@
#include "api.h"
#include <stdbool.h>
typedef struct
typedef struct NodeChildIterator
{
Subtree parent;
const TSTree *tree;

View file

@ -1,21 +1,16 @@
#define _POSIX_C_SOURCE 200112L
#include "./api.h"
#include "./array.h"
#include "me/mem/mem.h"
// #include "./atomic.h"
// #include "./clock.h"
#include "./error_costs.h"
#include "./language.h"
#include "./length.h"
#include "./lexer.h"
#include "./reduce_action.h"
#include "./reusable_node.h"
#include "./stack.h"
#include "./subtree.h"
#include "./tree.h"
#include "api.h"
#include "me/mem/mem.h"
#include <assert.h>
#include <inttypes.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
@ -55,7 +50,6 @@ struct TSParser
SubtreeArray trailing_extras2;
SubtreeArray scratch_trees;
TokenCache token_cache;
ReusableNode reusable_node;
void *external_scanner_payload;
unsigned accept_count;
unsigned operation_count;
@ -182,26 +176,6 @@ static bool ts_parser__breakdown_top_of_stack(TSParser *self, StackVersion versi
return did_break_down;
}
// Walk the reusable-node cursor down into its first children for as long as
// the current node still has children and was not parsed in `state`.
// If at least one descent happened, the caller's `*lookahead` is released and
// replaced by a retained reference to the node the cursor stopped on;
// otherwise `*lookahead` is left untouched.
static void ts_parser__breakdown_lookahead(TSParser *self, Subtree *lookahead, TSStateId state, ReusableNode *reusable_node)
{
	Subtree current = reusable_node_tree(reusable_node);
	bool	descended = false;

	for (; ts_subtree_child_count(current) > 0 && ts_subtree_parse_state(current) != state; current = reusable_node_tree(reusable_node))
	{
		LOG("state_mismatch sym:%s", TREE_NAME(current));
		reusable_node_descend(reusable_node);
		descended = true;
	}
	// Nothing was broken down: keep the lookahead the caller already holds.
	if (!descended)
		return;
	ts_subtree_release(&self->tree_pool, *lookahead);
	*lookahead = current;
	ts_subtree_retain(*lookahead);
}
static ErrorComparison ts_parser__compare_versions(TSParser *self, ErrorStatus a, ErrorStatus b)
{
(void)self;
@ -592,100 +566,6 @@ static void ts_parser__set_cached_token(TSParser *self, uint32_t byte_index, Sub
cache->last_external_token = last_external_token;
}
// static bool ts_parser__has_included_range_difference(const TSParser *self, uint32_t start_position, uint32_t end_position)
// {
// return ts_range_array_intersects(&self->included_range_differences, self->included_range_difference_index, start_position,
// end_position);
// }
// Scan the parser's reusable-node cursor for a subtree that starts exactly at
// `position` and can be used as the next lookahead.
//
// Returns the subtree (retained) when one is found. Returns NULL_SUBTREE when
// the cursor is exhausted, when the cursor's current node starts after
// `position`, or when the candidate's first leaf cannot be reused in `*state`.
//
// `*state` is refreshed from the stack whenever the top of the stack is broken
// down. `table_entry` is filled by ts_language_table_entry for the candidate's
// leaf symbol before the first-leaf check.
static Subtree ts_parser__reuse_node(TSParser *self, StackVersion version, TSStateId *state, uint32_t position, Subtree last_external_token,
TableEntry *table_entry)
{
Subtree result;
// Loop until the cursor has no current node (its `ptr` is NULL).
while ((result = reusable_node_tree(&self->reusable_node)).ptr)
{
uint32_t byte_offset = reusable_node_byte_offset(&self->reusable_node);
uint32_t end_byte_offset = byte_offset + ts_subtree_total_bytes(result);
// Do not reuse an EOF node if the included ranges array has changes
// later on in the file.
// NOTE(review): the included-range check that consumed this value is
// commented out below; the widened end offset now only affects the
// `end_byte_offset <= position` test.
if (ts_subtree_is_eof(result))
end_byte_offset = UINT32_MAX;
// The candidate starts after the requested position: stop scanning and
// report that nothing can be reused here.
if (byte_offset > position)
{
LOG("before_reusable_node symbol:%s", TREE_NAME(result));
break;
}
// The candidate starts before the requested position. If it ends at or
// before `position`, or it has no children to step into, move on to the
// next node; otherwise the cursor has descended into its first child.
if (byte_offset < position)
{
LOG("past_reusable_node symbol:%s", TREE_NAME(result));
if (end_byte_offset <= position || !reusable_node_descend(&self->reusable_node))
{
reusable_node_advance(&self->reusable_node);
}
continue;
}
// From here on the candidate starts exactly at `position`.
// Skip it if the external scanner state recorded by the cursor differs
// from the one at the top of the stack.
if (!ts_subtree_external_scanner_state_eq(self->reusable_node.last_external_token, last_external_token))
{
LOG("reusable_node_has_different_external_scanner_state symbol:%s", TREE_NAME(result));
reusable_node_advance(&self->reusable_node);
continue;
}
// Collect the first property (if any) that forbids reusing this node as a
// whole; `reason` doubles as the log label.
const char *reason = NULL;
if (ts_subtree_has_changes(result))
{
reason = "has_changes";
}
else if (ts_subtree_is_error(result))
{
reason = "is_error";
}
else if (ts_subtree_missing(result))
{
reason = "is_missing";
}
else if (ts_subtree_is_fragile(result))
{
reason = "is_fragile";
}
// else if (ts_parser__has_included_range_difference(self, byte_offset, end_byte_offset))
// {
// reason = "contains_different_included_range";
// }
if (reason)
{
LOG("cant_reuse_node_%s tree:%s", reason, TREE_NAME(result));
// Try the node's first child instead. If it has no children, skip it,
// break down the top of the stack and re-read the parse state.
if (!reusable_node_descend(&self->reusable_node))
{
reusable_node_advance(&self->reusable_node);
ts_parser__breakdown_top_of_stack(self, version);
*state = ts_stack_state(self->stack, version);
}
continue;
}
// Look up the actions for the candidate's leaf symbol in the current state
// and check that this leaf may be reused there.
TSSymbol leaf_symbol = ts_subtree_leaf_symbol(result);
ts_language_table_entry(self->language, *state, leaf_symbol, table_entry);
if (!ts_parser__can_reuse_first_leaf(self, *state, result, table_entry))
{
LOG("cant_reuse_node symbol:%s, first_leaf_symbol:%s", TREE_NAME(result), SYM_NAME(leaf_symbol));
// Move the cursor past the offending leaf and give up for this position.
reusable_node_advance_past_leaf(&self->reusable_node);
break;
}
LOG("reuse_node symbol:%s", TREE_NAME(result));
// Retain the subtree before returning it.
ts_subtree_retain(result);
return result;
}
return NULL_SUBTREE;
}
// Determine if a given tree should be replaced by an alternative tree.
//
// The decision is based on the trees' error costs (if any), their dynamic precedence,
@ -1361,10 +1241,6 @@ static void ts_parser__handle_error(TSParser *self, StackVersion version, Subtre
// current lookahead token's "lookahead bytes" value, which describes how far
// the lexer needed to look ahead beyond the content of the token in order to
// recognize it.
if (ts_subtree_child_count(lookahead) > 0)
{
ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node);
}
ts_parser__recover(self, version, lookahead);
LOG_STACK();
@ -1372,25 +1248,18 @@ static void ts_parser__handle_error(TSParser *self, StackVersion version, Subtre
static bool ts_parser__advance(TSParser *self, StackVersion version, bool allow_node_reuse)
{
(void)(allow_node_reuse);
TSStateId state = ts_stack_state(self->stack, version);
uint32_t position = ts_stack_position(self->stack, version).bytes;
Subtree last_external_token = ts_stack_last_external_token(self->stack, version);
bool did_reuse = true;
Subtree lookahead = NULL_SUBTREE;
TableEntry table_entry = {.action_count = 0};
// If possible, reuse a node from the previous syntax tree.
if (allow_node_reuse)
{
lookahead = ts_parser__reuse_node(self, version, &state, position, last_external_token, &table_entry);
}
// If no node from the previous syntax tree could be reused, then try to
// reuse the token previously returned by the lexer.
if (!lookahead.ptr)
{
did_reuse = false;
lookahead = ts_parser__get_cached_token(self, state, position, last_external_token, &table_entry);
}
@ -1464,13 +1333,12 @@ static bool ts_parser__advance(TSParser *self, StackVersion version, bool allow_
if (ts_subtree_child_count(lookahead) > 0)
{
ts_parser__breakdown_lookahead(self, &lookahead, state, &self->reusable_node);
next_state = ts_language_next_state(self->language, state, ts_subtree_symbol(lookahead));
}
ts_parser__shift(self, version, next_state, lookahead, action.shift.extra);
if (did_reuse)
reusable_node_advance(&self->reusable_node);
// if (did_reuse)
// reusable_node_advance(&self->reusable_node);
return true;
}
@ -1495,14 +1363,8 @@ static bool ts_parser__advance(TSParser *self, StackVersion version, bool allow_
}
case TSParseActionTypeRecover: {
if (ts_subtree_child_count(lookahead) > 0)
{
ts_parser__breakdown_lookahead(self, &lookahead, ERROR_STATE, &self->reusable_node);
}
ts_parser__recover(self, version, lookahead);
if (did_reuse)
reusable_node_advance(&self->reusable_node);
return true;
}
}
@ -1729,7 +1591,6 @@ TSParser *ts_parser_new(void)
self->tree_pool = ts_subtree_pool_new(32);
self->stack = ts_stack_new(&self->tree_pool);
self->finished_tree = NULL_SUBTREE;
self->reusable_node = reusable_node_new();
self->cancellation_flag = NULL;
self->language = NULL;
self->has_scanner_error = false;
@ -1760,7 +1621,6 @@ void ts_parser_delete(TSParser *self)
ts_lexer_delete(&self->lexer);
ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE);
ts_subtree_pool_delete(&self->tree_pool);
reusable_node_delete(&self->reusable_node);
array_delete(&self->trailing_extras);
array_delete(&self->trailing_extras2);
array_delete(&self->scratch_trees);
@ -1797,7 +1657,6 @@ void ts_parser_reset(TSParser *self)
self->old_tree = NULL_SUBTREE;
}
reusable_node_clear(&self->reusable_node);
ts_lexer_reset(&self->lexer, length_zero());
ts_stack_clear(self->stack);
ts_parser__set_cached_token(self, 0, NULL_SUBTREE, NULL_SUBTREE);
@ -1830,7 +1689,6 @@ TSTree *ts_parser_parse(TSParser *self, const TSTree *old_tree, TSInput input)
if (self->has_scanner_error)
goto exit;
reusable_node_clear(&self->reusable_node);
LOG("new_parse");
}

View file

@ -1,10 +1,6 @@
#ifndef TREE_SITTER_PARSER_H_
#define TREE_SITTER_PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
@ -20,18 +16,21 @@ typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
#endif
typedef struct {
typedef struct TSFieldMapEntry
{
TSFieldId field_id;
uint8_t child_index;
bool inherited;
} TSFieldMapEntry;
typedef struct {
typedef struct TSFieldMapSlice
{
uint16_t index;
uint16_t length;
} TSFieldMapSlice;
typedef struct {
typedef struct TSSymbolMetadata
{
bool visible;
bool named;
bool supertype;
@ -39,7 +38,8 @@ typedef struct {
typedef struct TSLexer TSLexer;
struct TSLexer {
struct TSLexer
{
int32_t lookahead;
TSSymbol result_symbol;
void (*advance)(TSLexer *, bool);
@ -49,21 +49,24 @@ struct TSLexer {
bool (*eof)(const TSLexer *);
};
typedef enum {
typedef enum TSParseActionType
{
TSParseActionTypeShift,
TSParseActionTypeReduce,
TSParseActionTypeAccept,
TSParseActionTypeRecover,
} TSParseActionType;
typedef union {
struct {
typedef union TSParseAction {
struct TSParseActionShift
{
uint8_t type;
TSStateId state;
bool extra;
bool repetition;
} shift;
struct {
struct TSParseActionReduce
{
uint8_t type;
uint8_t child_count;
TSSymbol symbol;
@ -73,25 +76,29 @@ typedef union {
uint8_t type;
} TSParseAction;
typedef struct {
typedef struct TSLexMode
{
uint16_t lex_state;
uint16_t external_lex_state;
} TSLexMode;
typedef union {
typedef union TSParseActionEntry {
TSParseAction action;
struct {
struct TSParseActionEntryInner
{
uint8_t count;
bool reusable;
} entry;
} TSParseActionEntry;
typedef struct {
typedef struct TSCharacterRange
{
int32_t start;
int32_t end;
} TSCharacterRange;
struct TSLanguage {
struct TSLanguage
{
uint32_t version;
uint32_t symbol_count;
uint32_t alias_count;
@ -106,8 +113,8 @@ struct TSLanguage {
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
const TSParseActionEntry *parse_actions;
const char * const *symbol_names;
const char * const *field_names;
const char *const *symbol_names;
const char *const *field_names;
const TSFieldMapSlice *field_map_slices;
const TSFieldMapEntry *field_map_entries;
const TSSymbolMetadata *symbol_metadata;
@ -118,7 +125,8 @@ struct TSLanguage {
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
TSSymbol keyword_capture_token;
struct ExternalScannerDefinition {
struct ExternalScannerDefinition
{
const bool *states;
const TSSymbol *symbol_map;
void *(*create)(void);
@ -130,16 +138,21 @@ struct TSLanguage {
const TSStateId *primary_state_ids;
};
static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead) {
static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t lookahead)
{
uint32_t index = 0;
uint32_t size = len - index;
while (size > 1) {
while (size > 1)
{
uint32_t half_size = size / 2;
uint32_t mid_index = index + half_size;
TSCharacterRange *range = &ranges[mid_index];
if (lookahead >= range->start && lookahead <= range->end) {
if (lookahead >= range->start && lookahead <= range->end)
{
return true;
} else if (lookahead > range->end) {
}
else if (lookahead > range->end)
{
index = mid_index;
}
size -= half_size;
@ -152,11 +165,7 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
* Lexer Macros
*/
#ifdef _MSC_VER
#define UNUSED __pragma(warning(suppress : 4101))
#else
#define UNUSED __attribute__((unused))
#endif
#define START_LEXER() \
bool result = false; \
@ -165,9 +174,9 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \
next_state: \
lexer->advance(lexer, skip); \
start: \
start: \
skip = false; \
lookahead = lexer->lookahead;
@ -179,9 +188,11 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
#define ADVANCE_MAP(...) \
{ \
static const uint16_t map[] = { __VA_ARGS__ }; \
for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) { \
if (map[i] == lookahead) { \
static const uint16_t map[] = {__VA_ARGS__}; \
for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) \
{ \
if (map[i] == lookahead) \
{ \
state = map[i + 1]; \
goto next_state; \
} \
@ -206,60 +217,56 @@ static inline bool set_contains(TSCharacterRange *ranges, uint32_t len, int32_t
* Parse Table Macros
*/
#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT)
#define SMALL_STATE(id) ((id)-LARGE_STATE_COUNT)
#define STATE(id) id
#define ACTIONS(id) id
#define SHIFT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = (state_value) \
{ \
{ \
.shift = {.type = TSParseActionTypeShift, .state = (state_value) } \
} \
}}
}
#define SHIFT_REPEAT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = (state_value), \
.repetition = true \
{ \
{ \
.shift = {.type = TSParseActionTypeShift, .state = (state_value), .repetition = true } \
} \
}}
}
#define SHIFT_EXTRA() \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.extra = true \
{ \
{ \
.shift = {.type = TSParseActionTypeShift, .extra = true } \
} \
}}
}
#define REDUCE(symbol_name, children, precedence, prod_id) \
{{ \
.reduce = { \
.type = TSParseActionTypeReduce, \
{ \
{ \
.reduce = {.type = TSParseActionTypeReduce, \
.symbol = symbol_name, \
.child_count = children, \
.dynamic_precedence = precedence, \
.production_id = prod_id \
}, \
}}
.production_id = prod_id}, \
} \
}
#define RECOVER() \
{{ \
{ \
{ \
.type = TSParseActionTypeRecover \
}}
} \
}
#define ACCEPT_INPUT() \
{{ \
{ \
{ \
.type = TSParseActionTypeAccept \
}}
#ifdef __cplusplus
}
#endif
} \
}
#endif // TREE_SITTER_PARSER_H_

View file

@ -1,111 +0,0 @@
#include "./subtree.h"
// One level of the reusable-node cursor: a subtree together with where it
// sits in its parent and where it starts in the input.
typedef struct StackEntry
{
// The subtree this entry refers to.
Subtree tree;
// Index of `tree` among the children of the entry below it on the stack
// (0 for the root entry and for a first child).
uint32_t child_index;
// Byte offset at which `tree` starts: a first child inherits its parent's
// offset, a later sibling starts where the previous node's total bytes end.
uint32_t byte_offset;
} StackEntry;
// Cursor over a subtree hierarchy. The top of `stack` is the current node;
// the entries below it are its ancestors.
// Field order matters: reusable_node_new() initializes this struct
// positionally.
typedef struct ReusableNode
{
// Path from the root entry down to the current node.
Array(StackEntry) stack;
// Last external token of the most recent node the cursor advanced past that
// had external tokens; NULL_SUBTREE after a clear.
Subtree last_external_token;
} ReusableNode;
// Build an empty cursor: no stack entries and no external token recorded.
static inline ReusableNode reusable_node_new(void)
{
	return (ReusableNode){
		.stack = array_new(),
		.last_external_token = NULL_SUBTREE,
	};
}
// Return the cursor to its empty state: forget the recorded external token
// and drop every stack entry.
static inline void reusable_node_clear(ReusableNode *self)
{
	self->last_external_token = NULL_SUBTREE;
	array_clear(&self->stack);
}
// Subtree at the top of the cursor's stack, or NULL_SUBTREE when the stack
// is empty.
static inline Subtree reusable_node_tree(ReusableNode *self)
{
	if (self->stack.size == 0)
		return NULL_SUBTREE;
	return self->stack.contents[self->stack.size - 1].tree;
}
// Start byte offset of the node at the top of the cursor's stack, or
// UINT32_MAX when the stack is empty.
static inline uint32_t reusable_node_byte_offset(ReusableNode *self)
{
	if (self->stack.size == 0)
		return UINT32_MAX;
	return self->stack.contents[self->stack.size - 1].byte_offset;
}
// Dispose of the cursor's entry stack via array_delete. `last_external_token`
// is not touched here.
static inline void reusable_node_delete(ReusableNode *self)
{
array_delete(&self->stack);
}
// Move the cursor to the node that follows the current one: the next sibling
// of the current node, or of the nearest ancestor that still has one.
// If no such node exists the stack ends up empty.
// Before moving, the current node's last external token is recorded when the
// node has external tokens.
static inline void reusable_node_advance(ReusableNode *self)
{
	StackEntry current = *array_back(&self->stack);
	// The following node starts right after everything the current one spans.
	uint32_t   following_offset = current.byte_offset + ts_subtree_total_bytes(current.tree);
	Subtree	   parent;
	uint32_t   sibling_index;

	if (ts_subtree_has_external_tokens(current.tree))
		self->last_external_token = ts_subtree_last_external_token(current.tree);

	// Pop entries until one of them has a sibling after it in its parent.
	for (;;)
	{
		StackEntry popped = array_pop(&self->stack);

		sibling_index = popped.child_index + 1;
		if (self->stack.size == 0)
			return;
		parent = array_back(&self->stack)->tree;
		if (sibling_index < ts_subtree_child_count(parent))
			break;
	}

	array_push(&self->stack, ((StackEntry){
								 .tree = ts_subtree_children(parent)[sibling_index],
								 .child_index = sibling_index,
								 .byte_offset = following_offset,
							 }));
}
// Step from the current node into its first child.
// Returns true after pushing the child (which starts at the same byte offset
// as its parent); returns false and leaves the cursor unchanged when the
// current node has no children.
static inline bool reusable_node_descend(ReusableNode *self)
{
	StackEntry top = *array_back(&self->stack);

	if (ts_subtree_child_count(top.tree) == 0)
		return false;

	Subtree first_child = ts_subtree_children(top.tree)[0];

	array_push(&self->stack, ((StackEntry){
								 .tree = first_child,
								 .child_index = 0,
								 .byte_offset = top.byte_offset,
							 }));
	return true;
}
// Descend along first children until a node without children is reached,
// then advance the cursor past that node.
static inline void reusable_node_advance_past_leaf(ReusableNode *self)
{
	bool went_down;

	do
	{
		went_down = reusable_node_descend(self);
	} while (went_down);
	reusable_node_advance(self);
}
// Restart the cursor on `tree`, positioned on the root's first child.
// If the root has no children the cursor is left empty.
static inline void reusable_node_reset(ReusableNode *self, Subtree tree)
{
	StackEntry root_entry = {
		.tree = tree,
		.child_index = 0,
		.byte_offset = 0,
	};

	reusable_node_clear(self);
	array_push(&self->stack, root_entry);

	// Never reuse the root node, because it has a non-standard internal
	// structure due to transformations that are applied when it is accepted:
	// adding the EOF child and any extra children.
	if (reusable_node_descend(self))
		return;
	reusable_node_clear(self);
}

View file

@ -6,7 +6,7 @@
/* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2024/03/28 14:40:38 by rparodi #+# #+# */
/* Updated: 2024/06/30 16:44:34 by maiboyer ### ########.fr */
/* Updated: 2024/07/03 18:46:59 by maiboyer ### ########.fr */
/* */
/* ************************************************************************** */
@ -80,7 +80,7 @@ void print_node_data(t_node *t, t_usize depth)
idx = 0;
if (t->kind == 7)
return;
printf("\x1b[%im[%s](%lu)\x1b[0m", t->field_str == NULL ? 90 : 32, t->field_str == NULL ? "nil" : t->field_str, t->field);
printf("\x1b[%im[%-6s](%lu)\x1b[0m", t->field_str == NULL ? 90 : 32, t->field_str == NULL ? "nil" : t->field_str, t->field);
while (idx++ < depth + 1)
printf("\t");
idx = 0;