From 92823aff807ad268f3b20a347c3e7550e595ae54 Mon Sep 17 00:00:00 2001 From: Maieul BOYER Date: Wed, 4 Sep 2024 16:18:27 +0000 Subject: [PATCH] sadge to have to do a step backwards but it has to work --- line/src/line_no_tty.c | 2 +- parser/Filelist.parser.mk | 13 +- parser/src/parser.c | 29 +- parser/src/scanner.c | 1198 ++++++++++++++++++++++++ parser/src/scanner/advance_words.c | 52 - parser/src/scanner/deserialize.c | 68 -- parser/src/scanner/heredoc.c | 103 -- parser/src/scanner/heredoc_functions.c | 117 --- parser/src/scanner/lifetime.c | 47 - parser/src/scanner/scan.c | 59 -- parser/src/scanner/scan_concat.c | 57 -- parser/src/scanner/scan_dollar.c | 30 - parser/src/scanner/scan_double_hash.c | 38 - parser/src/scanner/scan_varname.c | 140 --- parser/src/scanner/scan_word.c | 97 -- parser/src/scanner/serialize.c | 62 -- 16 files changed, 1215 insertions(+), 897 deletions(-) create mode 100644 parser/src/scanner.c delete mode 100644 parser/src/scanner/advance_words.c delete mode 100644 parser/src/scanner/deserialize.c delete mode 100644 parser/src/scanner/heredoc.c delete mode 100644 parser/src/scanner/heredoc_functions.c delete mode 100644 parser/src/scanner/lifetime.c delete mode 100644 parser/src/scanner/scan.c delete mode 100644 parser/src/scanner/scan_concat.c delete mode 100644 parser/src/scanner/scan_dollar.c delete mode 100644 parser/src/scanner/scan_double_hash.c delete mode 100644 parser/src/scanner/scan_varname.c delete mode 100644 parser/src/scanner/scan_word.c delete mode 100644 parser/src/scanner/serialize.c diff --git a/line/src/line_no_tty.c b/line/src/line_no_tty.c index 58f0d97d..2374433f 100644 --- a/line/src/line_no_tty.c +++ b/line/src/line_no_tty.c @@ -32,7 +32,7 @@ bool line_no_tty_impl(t_str *out) chr = '\n'; if (read_fd(get_stdin(), (t_u8 *)&chr, 1, &ret)) return (string_free(line), *out = NULL, true); - if (ret == 0 || chr == '\n') + if (ret == 0) { if (line.len == 0) return (string_free(line), *out = NULL, true); diff --git 
a/parser/Filelist.parser.mk b/parser/Filelist.parser.mk index fe47ef0f..9a7b63fd 100644 --- a/parser/Filelist.parser.mk +++ b/parser/Filelist.parser.mk @@ -32,18 +32,7 @@ node/node_relevent \ parser \ point/point_funcs1 \ point/point_funcs2 \ -scanner/advance_words \ -scanner/deserialize \ -scanner/heredoc \ -scanner/heredoc_functions \ -scanner/lifetime \ -scanner/scan \ -scanner/scan_concat \ -scanner/scan_dollar \ -scanner/scan_double_hash \ -scanner/scan_varname \ -scanner/scan_word \ -scanner/serialize \ +scanner \ stack/stack_add_link \ stack/stack_funcs1 \ stack/stack_funcs2 \ diff --git a/parser/src/parser.c b/parser/src/parser.c index 263f4403..9277df4c 100644 --- a/parser/src/parser.c +++ b/parser/src/parser.c @@ -28,8 +28,7 @@ static const t_u32 MAX_VERSION_COUNT = 4; static const t_u32 MAX_VERSION_COUNT_OVERFLOW = 6; static const t_u32 MAX_SUMMARY_DEPTH = 1; -static const t_u32 MAX_COST_DIFFERENCE = 16 - * ERROR_COST_PER_SKIPPED_TREE; +static const t_u32 MAX_COST_DIFFERENCE = 16 * ERROR_COST_PER_SKIPPED_TREE; typedef struct s_error_status t_error_status; typedef enum e_error_comparison t_error_comparison; @@ -269,16 +268,16 @@ static bool ts_parser__better_version_exists(TSParser *self, return false; } -static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexMode lex_mode) -{ - return self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); -} - -static bool ts_parser__call_keyword_lex_fn(TSParser *self, TSLexMode lex_mode) -{ - (void)(lex_mode); - return self->language->keyword_lex_fn(&self->lexer.data, 0); -} +// static bool ts_parser__call_main_lex_fn(TSParser *self, TSLexMode lex_mode) +// { +// return self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); +// } +// +// static bool ts_parser__call_keyword_lex_fn(TSParser *self, TSLexMode lex_mode) +// { +// (void)(lex_mode); +// return self->language->keyword_lex_fn(&self->lexer.data, 0); +// } static void ts_parser__external_scanner_create(TSParser *self) { @@ -299,6 +298,7 @@ 
static t_u32 ts_parser__external_scanner_serialize(TSParser *self) { t_u32 length; + length = self->language->external_scanner.serialize(self->external_scanner_payload, self->lexer.debug_buffer); if (length > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) @@ -319,6 +319,7 @@ static void ts_parser__external_scanner_deserialize(TSParser *self, { data = ts_external_scanner_state_data(&external_token->external_scanner_state); length = external_token->external_scanner_state.length; + printf("HERE\n"); } self->language->external_scanner.deserialize(self->external_scanner_payload, data, length); @@ -430,7 +431,7 @@ static t_subtree ts_parser__lex(TSParser *self, t_stack_version version, ts_lexer_reset(&self->lexer, current_position); } ts_lexer_start(&self->lexer); - found_token = ts_parser__call_main_lex_fn(self, lex_mode); + found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state); ts_lexer_finish(&self->lexer, &lookahead_end_byte); if (found_token) break ; @@ -485,7 +486,7 @@ static t_subtree ts_parser__lex(TSParser *self, t_stack_version version, end_byte = self->lexer.token_end_position.bytes; ts_lexer_reset(&self->lexer, self->lexer.token_start_position); ts_lexer_start(&self->lexer); - is_keyword = ts_parser__call_keyword_lex_fn(self, lex_mode); + is_keyword = self->language->keyword_lex_fn(&self->lexer.data, 0); if (is_keyword && self->lexer.token_end_position.bytes == end_byte && ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol)) diff --git a/parser/src/scanner.c b/parser/src/scanner.c new file mode 100644 index 00000000..024a0418 --- /dev/null +++ b/parser/src/scanner.c @@ -0,0 +1,1198 @@ +#include "parser/array.h" +#include "parser/parser.h" +#include "me/types.h" +#include <assert.h> +#include <string.h> +#include <wctype.h> + +enum TokenType +{ + HEREDOC_START, + SIMPLE_HEREDOC_BODY, + HEREDOC_BODY_BEGINNING, + HEREDOC_CONTENT, + HEREDOC_END, + FILE_DESCRIPTOR, + EMPTY_VALUE, + CONCAT, + VARIABLE_NAME, + REGEX, + EXPANSION_WORD, + 
EXTGLOB_PATTERN, + BARE_DOLLAR, + IMMEDIATE_DOUBLE_HASH, + HEREDOC_ARROW, + HEREDOC_ARROW_DASH, + NEWLINE, + OPENING_PAREN, + ESAC, + ERROR_RECOVERY, +}; + +typedef Array(char) String; + +typedef struct Heredoc +{ + bool is_raw; + bool started; + bool allows_indent; + String delimiter; + String current_leading_word; +} Heredoc; + +#define heredoc_new() \ + { \ + .is_raw = false, \ + .started = false, \ + .allows_indent = false, \ + .delimiter = array_new(), \ + .current_leading_word = array_new(), \ + }; + +typedef struct Scanner +{ + t_u8 last_glob_paren_depth; + bool ext_was_in_double_quote; + bool ext_saw_outside_quote; + Array(Heredoc) heredocs; +} Scanner; + +static inline void advance(TSLexer *lexer) +{ + lexer->advance(lexer, false); +} + +static inline void skip(TSLexer *lexer) +{ + lexer->advance(lexer, true); +} + +static inline bool in_error_recovery(const bool *valid_symbols) +{ + return valid_symbols[ERROR_RECOVERY]; +} + +static inline void reset_string(String *string) +{ + if (string->size > 0) + { + memset(string->contents, 0, string->size); + array_clear(string); + } +} + +static inline void reset_heredoc(Heredoc *heredoc) +{ + heredoc->is_raw = false; + heredoc->started = false; + heredoc->allows_indent = false; + reset_string(&heredoc->delimiter); +} + +static inline void reset(Scanner *scanner) +{ + for (t_u32 i = 0; i < scanner->heredocs.size; i++) + { + reset_heredoc(array_get(&scanner->heredocs, i)); + } +} + +static t_u32 serialize(Scanner *scanner, t_u8 *buffer) +{ + t_u32 size = 0; + + buffer[size++] = (char)scanner->last_glob_paren_depth; + buffer[size++] = (char)scanner->ext_was_in_double_quote; + buffer[size++] = (char)scanner->ext_saw_outside_quote; + buffer[size++] = (char)scanner->heredocs.size; + + for (t_u32 i = 0; i < scanner->heredocs.size; i++) + { + Heredoc *heredoc = array_get(&scanner->heredocs, i); + if (heredoc->delimiter.size + 3 + size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) + { + return 0; + } + + buffer[size++] = 
(char)heredoc->is_raw; + buffer[size++] = (char)heredoc->started; + buffer[size++] = (char)heredoc->allows_indent; + + memcpy(&buffer[size], &heredoc->delimiter.size, sizeof(t_u32)); + size += sizeof(t_u32); + if (heredoc->delimiter.size > 0) + { + memcpy(&buffer[size], heredoc->delimiter.contents, heredoc->delimiter.size); + size += heredoc->delimiter.size; + } + } + return size; +} + +static void deserialize(Scanner *scanner, const t_u8 *buffer, t_u32 length) +{ + if (length == 0) + { + reset(scanner); + } + else + { + t_u32 size = 0; + scanner->last_glob_paren_depth = buffer[size++]; + scanner->ext_was_in_double_quote = buffer[size++]; + scanner->ext_saw_outside_quote = buffer[size++]; + t_u32 heredoc_count = (t_u8)buffer[size++]; + for (t_u32 i = 0; i < heredoc_count; i++) + { + Heredoc *heredoc = NULL; + if (i < scanner->heredocs.size) + { + heredoc = array_get(&scanner->heredocs, i); + } + else + { + Heredoc new_heredoc = heredoc_new(); + array_push(&scanner->heredocs, new_heredoc); + heredoc = array_back(&scanner->heredocs); + } + + heredoc->is_raw = buffer[size++]; + heredoc->started = buffer[size++]; + heredoc->allows_indent = buffer[size++]; + + memcpy(&heredoc->delimiter.size, &buffer[size], sizeof(t_u32)); + size += sizeof(t_u32); + array_reserve(&heredoc->delimiter, heredoc->delimiter.size); + + if (heredoc->delimiter.size > 0) + { + memcpy(heredoc->delimiter.contents, &buffer[size], heredoc->delimiter.size); + size += heredoc->delimiter.size; + } + } + assert(size == length); + } +} + +/** + * Consume a "word" in POSIX parlance, and returns it unquoted. + * + * This is an approximate implementation that doesn't deal with any + * POSIX-mandated substitution, and assumes the default value for + * IFS. 
+ */ +static bool advance_word(TSLexer *lexer, String *unquoted_word) +{ + bool empty = true; + t_i32 quote = 0; + + if (lexer->lookahead == '\'' || lexer->lookahead == '"') + { + quote = lexer->lookahead; + advance(lexer); + } + + while (lexer->lookahead && + !(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n' : iswspace(lexer->lookahead))) + { + if (lexer->lookahead == '\\') + { + advance(lexer); + if (!lexer->lookahead) + return false; + } + empty = false; + array_push(unquoted_word, lexer->lookahead); + advance(lexer); + } + array_push(unquoted_word, '\0'); + + if (quote && lexer->lookahead == quote) + advance(lexer); + + return !empty; +} + +static inline bool scan_bare_dollar(TSLexer *lexer) +{ + while (iswspace(lexer->lookahead) && lexer->lookahead != '\n' && !lexer->eof(lexer)) + skip(lexer); + + if (lexer->lookahead == '$') + { + advance(lexer); + lexer->result_symbol = BARE_DOLLAR; + lexer->mark_end(lexer); + return (iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == '\"'); + } + + return false; +} + +static bool scan_heredoc_start(Heredoc *heredoc, TSLexer *lexer) +{ + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + + lexer->result_symbol = HEREDOC_START; + heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' || lexer->lookahead == '\\'; + + bool found_delimiter = advance_word(lexer, &heredoc->delimiter); + if (!found_delimiter) + { + reset_string(&heredoc->delimiter); + return false; + } + return found_delimiter; +} + +static bool scan_heredoc_end_identifier(Heredoc *heredoc, TSLexer *lexer) +{ + reset_string(&heredoc->current_leading_word); + // Scan the first 'n' characters on this line, to see if they match the + // heredoc delimiter + t_i32 size = 0; + if (heredoc->delimiter.size > 0) + { + while (lexer->lookahead != '\0' && lexer->lookahead != '\n' && (t_i32)*array_get(&heredoc->delimiter, size) == lexer->lookahead && + heredoc->current_leading_word.size < 
heredoc->delimiter.size) + { + array_push(&heredoc->current_leading_word, lexer->lookahead); + advance(lexer); + size++; + } + } + array_push(&heredoc->current_leading_word, '\0'); + return heredoc->delimiter.size == 0 ? false : strcmp(heredoc->current_leading_word.contents, heredoc->delimiter.contents) == 0; +} + +static bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer, enum TokenType middle_type, enum TokenType end_type) +{ + bool did_advance = false; + Heredoc *heredoc = array_back(&scanner->heredocs); + + for (;;) + { + switch (lexer->lookahead) + { + case '\0': { + if (lexer->eof(lexer) && did_advance) + { + reset_heredoc(heredoc); + lexer->result_symbol = end_type; + return true; + } + return false; + } + + case '\\': { + did_advance = true; + advance(lexer); + advance(lexer); + break; + } + + case '$': { + if (heredoc->is_raw) + { + did_advance = true; + advance(lexer); + break; + } + if (did_advance) + { + lexer->mark_end(lexer); + lexer->result_symbol = middle_type; + heredoc->started = true; + advance(lexer); + if (iswalpha(lexer->lookahead) || lexer->lookahead == '{' || lexer->lookahead == '(') + { + return true; + } + break; + } + if (middle_type == HEREDOC_BODY_BEGINNING && lexer->get_column(lexer) == 0) + { + lexer->result_symbol = middle_type; + heredoc->started = true; + return true; + } + return false; + } + + case '\n': { + if (!did_advance) + { + skip(lexer); + } + else + { + advance(lexer); + } + did_advance = true; + if (heredoc->allows_indent) + { + while (iswspace(lexer->lookahead)) + { + advance(lexer); + } + } + lexer->result_symbol = heredoc->started ? 
middle_type : end_type; + lexer->mark_end(lexer); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + if (lexer->result_symbol == HEREDOC_END) + { + (void)array_pop(&scanner->heredocs); + } + return true; + } + break; + } + + default: { + if (lexer->get_column(lexer) == 0) + { + // an alternative is to check the starting column of the + // heredoc body and track that statefully + while (iswspace(lexer->lookahead)) + { + if (did_advance) + { + advance(lexer); + } + else + { + skip(lexer); + } + } + if (end_type != SIMPLE_HEREDOC_BODY) + { + lexer->result_symbol = middle_type; + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + return true; + } + } + if (end_type == SIMPLE_HEREDOC_BODY) + { + lexer->result_symbol = end_type; + lexer->mark_end(lexer); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + return true; + } + } + } + did_advance = true; + advance(lexer); + break; + } + } + } +} + +static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) +{ + if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols)) + { + if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' || lexer->lookahead == '<' || + lexer->lookahead == ')' || lexer->lookahead == '(' || lexer->lookahead == ';' || lexer->lookahead == '&' || + lexer->lookahead == '|' || lexer->lookahead == '{' || lexer->lookahead == '}')) + { + lexer->result_symbol = CONCAT; + // So for a`b`, we want to return a concat. We check if the + // 2nd backtick has whitespace after it, and if it does we + // return concat. 
+ if (lexer->lookahead == '`') + { + lexer->mark_end(lexer); + advance(lexer); + while (lexer->lookahead != '`' && !lexer->eof(lexer)) + { + advance(lexer); + } + if (lexer->eof(lexer)) + { + return false; + } + if (lexer->lookahead == '`') + { + advance(lexer); + } + return iswspace(lexer->lookahead) || lexer->eof(lexer); + } + // strings w/ expansions that contains escaped quotes or + // backslashes need this to return a concat + if (lexer->lookahead == '\\') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '"' || lexer->lookahead == '\'' || lexer->lookahead == '\\') + { + return true; + } + if (lexer->eof(lexer)) + { + return false; + } + } + else + { + return true; + } + } + } + + if (valid_symbols[IMMEDIATE_DOUBLE_HASH] && !in_error_recovery(valid_symbols)) + { + // advance two # and ensure not } after + if (lexer->lookahead == '#') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '#') + { + advance(lexer); + if (lexer->lookahead != '}') + { + lexer->result_symbol = IMMEDIATE_DOUBLE_HASH; + lexer->mark_end(lexer); + return true; + } + } + } + } + + if (valid_symbols[EMPTY_VALUE]) + { + if (iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == ';' || lexer->lookahead == '&') + { + lexer->result_symbol = EMPTY_VALUE; + return true; + } + } + + if ((valid_symbols[HEREDOC_BODY_BEGINNING] || valid_symbols[SIMPLE_HEREDOC_BODY]) && scanner->heredocs.size > 0 && + !array_back(&scanner->heredocs)->started && !in_error_recovery(valid_symbols)) + { + return scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY); + } + + if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0) + { + Heredoc *heredoc = array_back(&scanner->heredocs); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + array_delete(&heredoc->current_leading_word); + array_delete(&heredoc->delimiter); + (void)array_pop(&scanner->heredocs); + lexer->result_symbol = HEREDOC_END; + return true; + } + } + + if 
(valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 && array_back(&scanner->heredocs)->started && + !in_error_recovery(valid_symbols)) + { + return scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, HEREDOC_END); + } + + if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) && scanner->heredocs.size > 0) + { + return scan_heredoc_start(array_back(&scanner->heredocs), lexer); + } + + if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) && + !in_error_recovery(valid_symbols)) + { + for (;;) + { + if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' || + (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) && + !valid_symbols[EXPANSION_WORD]) + { + skip(lexer); + } + else if (lexer->lookahead == '\\') + { + skip(lexer); + + if (lexer->eof(lexer)) + { + lexer->mark_end(lexer); + lexer->result_symbol = VARIABLE_NAME; + return true; + } + + if (lexer->lookahead == '\r') + { + skip(lexer); + } + if (lexer->lookahead == '\n') + { + skip(lexer); + } + else + { + if (lexer->lookahead == '\\' && valid_symbols[EXPANSION_WORD]) + { + goto expansion_word; + } + return false; + } + } + else + { + break; + } + } + + // no '*', '@', '?', '-', '$', '0', '_' + if (!valid_symbols[EXPANSION_WORD] && (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' 
|| + lexer->lookahead == '-' || lexer->lookahead == '0' || lexer->lookahead == '_')) + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' || lexer->lookahead == '-' || + lexer->lookahead == '%' || lexer->lookahead == '#' || lexer->lookahead == '/') + { + return false; + } + if (valid_symbols[EXTGLOB_PATTERN] && iswspace(lexer->lookahead)) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') + { + advance(lexer); + if (lexer->lookahead == '<') + { + advance(lexer); + if (lexer->lookahead == '-') + { + advance(lexer); + Heredoc heredoc = heredoc_new(); + heredoc.allows_indent = true; + array_push(&scanner->heredocs, heredoc); + lexer->result_symbol = HEREDOC_ARROW_DASH; + } + // else if (lexer->lookahead == '<' || lexer->lookahead == '=') + // { + // return false; + // } + else + { + Heredoc heredoc = heredoc_new(); + array_push(&scanner->heredocs, heredoc); + lexer->result_symbol = HEREDOC_ARROW; + } + return true; + } + return false; + } + + bool is_number = true; + if (iswdigit(lexer->lookahead)) + { + advance(lexer); + } + else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') + { + is_number = false; + advance(lexer); + } + else + { + if (lexer->lookahead == '{') + { + goto brace_start; + } + if (valid_symbols[EXPANSION_WORD]) + { + goto expansion_word; + } + if (valid_symbols[EXTGLOB_PATTERN]) + { + goto extglob_pattern; + } + return false; + } + + for (;;) + { + if (iswdigit(lexer->lookahead)) + { + advance(lexer); + } + else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') + { + is_number = false; + advance(lexer); + } + else + { + break; + } + } + + if (is_number && valid_symbols[FILE_DESCRIPTOR] && (lexer->lookahead == '>' || lexer->lookahead == '<')) + { + lexer->result_symbol = FILE_DESCRIPTOR; + return true; + } + + if (valid_symbols[VARIABLE_NAME]) + { 
+ if (lexer->lookahead == '+') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '=' || lexer->lookahead == ':') + { + lexer->result_symbol = VARIABLE_NAME; + return true; + } + return false; + } + if (lexer->lookahead == '/') + { + return false; + } + if (lexer->lookahead == '=' || lexer->lookahead == '[' || + (lexer->lookahead == ':' && + !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable + // names for function words, only handling : for now? #235 + lexer->lookahead == '%' || + (lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' || (lexer->lookahead == '-')) + { + lexer->mark_end(lexer); + lexer->result_symbol = VARIABLE_NAME; + return true; + } + + if (lexer->lookahead == '?') + { + lexer->mark_end(lexer); + advance(lexer); + lexer->result_symbol = VARIABLE_NAME; + return iswalpha(lexer->lookahead); + } + } + + return false; + } + + if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && scan_bare_dollar(lexer)) + { + return true; + } + + if ((valid_symbols[REGEX]) && !in_error_recovery(valid_symbols)) + { + if (valid_symbols[REGEX]) + { + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + } + + if ((lexer->lookahead != '"' && lexer->lookahead != '\'') || ((lexer->lookahead == '$' || lexer->lookahead == '\'')) || + (lexer->lookahead == '\'')) + { + typedef struct + { + bool done; + bool advanced_once; + bool found_non_alnumdollarunderdash; + bool last_was_escape; + bool in_single_quote; + t_u32 paren_depth; + t_u32 bracket_depth; + t_u32 brace_depth; + } State; + + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '(') + { + return false; + } + } + + lexer->mark_end(lexer); + + State state = {false, false, false, false, false, 0, 0, 0}; + while (!state.done) + { + if (state.in_single_quote) + { + if (lexer->lookahead == '\'') + { + state.in_single_quote = false; + advance(lexer); + 
lexer->mark_end(lexer); + } + } + switch (lexer->lookahead) + { + case '\\': + state.last_was_escape = true; + break; + case '\0': + return false; + case '(': + state.paren_depth++; + state.last_was_escape = false; + break; + case '[': + state.bracket_depth++; + state.last_was_escape = false; + break; + case '{': + if (!state.last_was_escape) + state.brace_depth++; + state.last_was_escape = false; + break; + case ')': + if (state.paren_depth == 0) + state.done = true; + state.paren_depth--; + state.last_was_escape = false; + break; + case ']': + if (state.bracket_depth == 0) + state.done = true; + state.bracket_depth--; + state.last_was_escape = false; + break; + case '}': + if (state.brace_depth == 0) + state.done = true; + state.brace_depth--; + state.last_was_escape = false; + break; + case '\'': + // Enter or exit a single-quoted string. + state.in_single_quote = !state.in_single_quote; + advance(lexer); + state.advanced_once = true; + state.last_was_escape = false; + continue; + default: + state.last_was_escape = false; + break; + } + + if (!state.done) + { + if (valid_symbols[REGEX]) + { + bool was_space = !state.in_single_quote && iswspace(lexer->lookahead); + advance(lexer); + state.advanced_once = true; + if (!was_space || state.paren_depth > 0) + { + lexer->mark_end(lexer); + } + } + } + } + + lexer->result_symbol = REGEX; + if (valid_symbols[REGEX] && !state.advanced_once) + { + return false; + } + return true; + } + } + +extglob_pattern: + if (valid_symbols[EXTGLOB_PATTERN] && !in_error_recovery(valid_symbols)) + { + // first skip ws, then check for ? * + @ ! + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + + if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' || + lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' || + lexer->lookahead == '.' 
|| lexer->lookahead == '[' || (iswalpha(lexer->lookahead))) + { + if (lexer->lookahead == '\\') + { + advance(lexer); + if ((iswspace(lexer->lookahead) || lexer->lookahead == '"') && lexer->lookahead != '\r' && lexer->lookahead != '\n') + { + advance(lexer); + } + else + { + return false; + } + } + + if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) + { + lexer->mark_end(lexer); + advance(lexer); + + if (iswspace(lexer->lookahead)) + { + return false; + } + } + + lexer->mark_end(lexer); + bool was_non_alpha = !iswalpha(lexer->lookahead); + if (lexer->lookahead != '[') + { + // no esac + if (lexer->lookahead == 'e') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == 's') + { + advance(lexer); + if (lexer->lookahead == 'a') + { + advance(lexer); + if (lexer->lookahead == 'c') + { + advance(lexer); + if (iswspace(lexer->lookahead)) + { + return false; + } + } + } + } + } + else + { + advance(lexer); + } + } + + // -\w is just a word, find something else special + if (lexer->lookahead == '-') + { + lexer->mark_end(lexer); + advance(lexer); + while (iswalnum(lexer->lookahead)) + { + advance(lexer); + } + + if (lexer->lookahead == ')' || lexer->lookahead == '\\' || lexer->lookahead == '.') + { + return false; + } + lexer->mark_end(lexer); + } + + // case item -) or *) + if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) + { + lexer->mark_end(lexer); + advance(lexer); + if (iswspace(lexer->lookahead)) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return was_non_alpha; + } + } + + if (iswspace(lexer->lookahead)) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return true; + } + + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(') + { + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (lexer->lookahead == '|') + { + lexer->mark_end(lexer); + 
advance(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + + if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' && lexer->lookahead != '[' && + lexer->lookahead != '?' && lexer->lookahead != '/' && lexer->lookahead != '\\' && lexer->lookahead != '_' && + lexer->lookahead != '*') + { + return false; + } + + typedef struct + { + bool done; + bool saw_non_alphadot; + t_u32 paren_depth; + t_u32 bracket_depth; + t_u32 brace_depth; + } State; + + State state = {false, was_non_alpha, scanner->last_glob_paren_depth, 0, 0}; + while (!state.done) + { + switch (lexer->lookahead) + { + case '\0': + return false; + case '(': + state.paren_depth++; + break; + case '[': + state.bracket_depth++; + break; + case '{': + state.brace_depth++; + break; + case ')': + if (state.paren_depth == 0) + { + state.done = true; + } + state.paren_depth--; + break; + case ']': + if (state.bracket_depth == 0) + { + state.done = true; + } + state.bracket_depth--; + break; + case '}': + if (state.brace_depth == 0) + { + state.done = true; + } + state.brace_depth--; + break; + } + + if (lexer->lookahead == '|') + { + lexer->mark_end(lexer); + advance(lexer); + if (state.paren_depth == 0 && state.bracket_depth == 0 && state.brace_depth == 0) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (!state.done) + { + bool was_space = iswspace(lexer->lookahead); + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' 
&& lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + if (lexer->lookahead == '(' || lexer->lookahead == '{') + { + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = state.paren_depth; + return state.saw_non_alphadot; + } + } + if (was_space) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + if (lexer->lookahead == '"') + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + if (lexer->lookahead == '\\') + { + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + if (iswspace(lexer->lookahead) || lexer->lookahead == '"') + { + advance(lexer); + } + } + else + { + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + } + if (!was_space) + { + lexer->mark_end(lexer); + } + } + } + + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + scanner->last_glob_paren_depth = 0; + + return false; + } + +expansion_word: + if (valid_symbols[EXPANSION_WORD]) + { + bool advanced_once = false; + bool advance_once_space = false; + for (;;) + { + if (lexer->lookahead == '\"') + return false; + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' || iswalnum(lexer->lookahead)) + { + lexer->result_symbol = EXPANSION_WORD; + return advanced_once; + } + advanced_once = true; + } + + if (lexer->lookahead == '}') + { + lexer->mark_end(lexer); + lexer->result_symbol = EXPANSION_WORD; + return advanced_once || advance_once_space; + } + + if (lexer->lookahead == '(' && !(advanced_once || 
advance_once_space)) + { + lexer->mark_end(lexer); + advance(lexer); + while (lexer->lookahead != ')' && !lexer->eof(lexer)) + { + // if we find a $( or ${ assume this is valid and is + // a garbage concatenation of some weird word + an + // expansion + // I wonder where this can fail + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' || iswalnum(lexer->lookahead)) + { + lexer->result_symbol = EXPANSION_WORD; + return advanced_once; + } + advanced_once = true; + } + else + { + advanced_once = advanced_once || !iswspace(lexer->lookahead); + advance_once_space = advance_once_space || iswspace(lexer->lookahead); + advance(lexer); + } + } + lexer->mark_end(lexer); + if (lexer->lookahead == ')') + { + advanced_once = true; + advance(lexer); + lexer->mark_end(lexer); + if (lexer->lookahead == '}') + return false; + } + else + return false; + } + + if (lexer->lookahead == '\'') + return false; + if (lexer->eof(lexer)) + return false; + advanced_once = advanced_once || !iswspace(lexer->lookahead); + advance_once_space = advance_once_space || iswspace(lexer->lookahead); + advance(lexer); + } + } + +brace_start: + return false; +} + +void *tree_sitter_sh_external_scanner_create() +{ + Scanner *scanner = mem_alloc(sizeof(Scanner)); + array_init(&scanner->heredocs); + return scanner; +} + +bool tree_sitter_sh_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) +{ + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +t_u32 tree_sitter_sh_external_scanner_serialize(void *payload, t_u8 *state) +{ + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, state); +} + +void tree_sitter_sh_external_scanner_deserialize(void *payload, const t_u8 *state, t_u32 length) +{ + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, state, length); +} + +void 
tree_sitter_sh_external_scanner_destroy(void *payload) +{ + Scanner *scanner = (Scanner *)payload; + for (size_t i = 0; i < scanner->heredocs.size; i++) + { + Heredoc *heredoc = array_get(&scanner->heredocs, i); + array_delete(&heredoc->current_leading_word); + array_delete(&heredoc->delimiter); + } + array_delete(&scanner->heredocs); + mem_free(scanner); +} diff --git a/parser/src/scanner/advance_words.c b/parser/src/scanner/advance_words.c deleted file mode 100644 index e9cb99c7..00000000 --- a/parser/src/scanner/advance_words.c +++ /dev/null @@ -1,52 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* advance_words.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/01 19:28:19 by maiboyer #+# #+# */ -/* Updated: 2024/09/01 20:10:14 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/char/char.h" -#include "me/string/string.h" -#include "me/types.h" -#include "parser/parser.h" - -void advance_word_inner(TSLexer *lexer, bool *empty, t_i32 *quote) -{ - *empty = true; - *quote = 0; - if (lexer->lookahead == '\'' || lexer->lookahead == '"') - { - *quote = lexer->lookahead; - lexer->advance(lexer, false); - } -} - -bool advance_word(TSLexer *lexer, t_string *unquoted_word) -{ - bool empty; - t_i32 quote; - - advance_word_inner(lexer, &empty, &quote); - while (lexer->lookahead && !((quote && (lexer->lookahead == quote - || lexer->lookahead == '\r' || lexer->lookahead == '\n')) - || (!quote && (me_isspace(lexer->lookahead))))) - { - if (lexer->lookahead == '\\') - { - lexer->advance(lexer, false); - if (!lexer->lookahead) - return (false); - } - empty = false; - string_push_char(unquoted_word, lexer->lookahead); - lexer->advance(lexer, false); - } - if (quote && lexer->lookahead == quote) - lexer->advance(lexer, false); - return (!empty); -} diff --git 
a/parser/src/scanner/deserialize.c b/parser/src/scanner/deserialize.c deleted file mode 100644 index 2a71493a..00000000 --- a/parser/src/scanner/deserialize.c +++ /dev/null @@ -1,68 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* deserialize.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/01 15:06:56 by maiboyer #+# #+# */ -/* Updated: 2024/09/01 20:08:37 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/mem/mem.h" -#include "me/types.h" -#include "me/vec/vec_heredoc.h" -#include "parser/inner/heredoc.h" -#include "parser/inner/scanner.h" - -void reset(t_scanner *scanner); - -void actual_reset(\ - t_scanner *scanner, const t_u8 *buffer, t_usize i, t_u32 *size) -{ - t_heredoc *heredoc; - t_usize delim_size; - - heredoc = NULL; - if (i < scanner->heredocs.len) - heredoc = vec_heredoc_get(&scanner->heredocs, i); - else - { - vec_heredoc_push(&scanner->heredocs, heredoc_new()); - heredoc = vec_heredoc_last(&scanner->heredocs); - } - heredoc->is_raw = buffer[(*size)++]; - heredoc->started = buffer[(*size)++]; - heredoc->allows_indent = buffer[(*size)++]; - mem_copy(&delim_size, &buffer[(*size)], sizeof(t_usize)); - (*size) += sizeof(t_usize); - string_reserve(&heredoc->delimiter, delim_size + 1); - heredoc->delimiter.len = delim_size - 1; - if (delim_size > 0) - { - mem_copy(heredoc->delimiter.buf, &buffer[(*size)], delim_size); - (*size) += delim_size; - } -} - -void tree_sitter_sh_external_scanner_deserialize(t_scanner *scanner, - const t_u8 *buffer, t_u32 length) -{ - t_u32 size; - t_u32 heredoc_count; - t_usize i; - - if (length == 0) - return (reset(scanner)); - size = 0; - scanner->last_glob_paren_depth = buffer[size++]; - scanner->ext_was_in_double_quote = buffer[size++]; - scanner->ext_saw_outside_quote = buffer[size++]; - heredoc_count = 
(t_u8)buffer[size++]; - i = 0; - while (i < heredoc_count) - actual_reset(scanner, buffer, i++, &size); - if (size != length) - me_abort("size != length"); -} diff --git a/parser/src/scanner/heredoc.c b/parser/src/scanner/heredoc.c deleted file mode 100644 index 47c501aa..00000000 --- a/parser/src/scanner/heredoc.c +++ /dev/null @@ -1,103 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* heredoc.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/01 19:33:04 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 17:31:02 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/char/char.h" -#include "me/str/str.h" -#include "me/types.h" -#include "me/vec/vec_heredoc.h" -#include "parser/inner/heredoc.h" -#include "parser/inner/scanner.h" -#include "parser/parser.h" - -bool scan_heredoc_start(t_heredoc *heredoc, TSLexer *lexer) -{ - bool found_delimiter; - - while (me_isspace(lexer->lookahead)) - lexer->advance(lexer, true); - lexer->result_symbol = HEREDOC_START; - heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' - || lexer->lookahead == '\\'; - found_delimiter = advance_word(lexer, &heredoc->delimiter); - if (!found_delimiter) - return (string_clear(&heredoc->delimiter), false); - return (found_delimiter); -} - -bool scan_heredoc_end_identifier(t_heredoc *heredoc, TSLexer *lexer) -{ - t_i32 size; - - size = 0; - string_clear(&heredoc->current_leading_word); - if (heredoc->delimiter.len > 0) - { - while (lexer->lookahead != '\0' && lexer->lookahead != '\n' - && (t_i32)heredoc->delimiter.buf[size] == lexer->lookahead - && heredoc->current_leading_word.len < heredoc->delimiter.len) - { - string_push_char(&heredoc->current_leading_word, lexer->lookahead); - lexer->advance(lexer, false); - size++; - } - } - if (heredoc->delimiter.len == 0) - return 
(false); - return (str_compare(heredoc->current_leading_word.buf, - heredoc->delimiter.buf)); -} - -bool scan_heredoc_content_nullbyte(struct s_heredoc_scan_state *state); -bool scan_heredoc_content_backslash(struct s_heredoc_scan_state *state); -bool scan_heredoc_content_dollar(struct s_heredoc_scan_state *state); -bool scan_heredoc_content_newline(struct s_heredoc_scan_state *state); -bool scan_heredoc_content_other(struct s_heredoc_scan_state *state); - -bool scan_heredoc_content(t_scanner *scanner, TSLexer *lexer, - enum e_token_type middle_type, enum e_token_type end_type) -{ - struct s_heredoc_scan_state state; - t_heredoc_content_func func; - - state = (struct s_heredoc_scan_state){.did_advance = false, .lexer = lexer, - .heredoc = vec_heredoc_last(&scanner->heredocs), .scanner = scanner, - .middle_type = middle_type, .end_type = end_type, - .return_value = false}; - while (true) - { - if (lexer->lookahead == '\0') - func = scan_heredoc_content_nullbyte; - else if (lexer->lookahead == '\\') - func = scan_heredoc_content_backslash; - else if (lexer->lookahead == '$') - func = scan_heredoc_content_dollar; - else if (lexer->lookahead == '\n') - func = scan_heredoc_content_newline; - else - func = scan_heredoc_content_other; - if (func(&state)) - return (state.return_value); - } - return (false); -} - -bool scan_heredoc_end(t_scanner *scanner, TSLexer *lexer) -{ - t_heredoc *heredoc; - - heredoc = vec_heredoc_last(&scanner->heredocs); - string_free(heredoc->current_leading_word); - string_free(heredoc->delimiter); - vec_heredoc_pop(&scanner->heredocs, NULL); - lexer->result_symbol = HEREDOC_END; - return (true); -} diff --git a/parser/src/scanner/heredoc_functions.c b/parser/src/scanner/heredoc_functions.c deleted file mode 100644 index e5aa249c..00000000 --- a/parser/src/scanner/heredoc_functions.c +++ /dev/null @@ -1,117 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* heredoc_functions.c 
:+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/01 19:36:53 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 18:04:32 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/char/char.h" -#include "me/str/str.h" -#include "me/types.h" -#include "me/vec/vec_heredoc.h" -#include "parser/inner/heredoc.h" -#include "parser/inner/scanner.h" -#include "parser/parser.h" - -bool scan_heredoc_end_identifier(t_heredoc *heredoc, TSLexer *lexer); - -bool scan_heredoc_content_nullbyte(struct s_heredoc_scan_state *state) -{ - if (state->lexer->eof(state->lexer) && state->did_advance) - { - reset_heredoc(state->heredoc); - state->lexer->result_symbol = state->end_type; - return (state->return_value = true, true); - } - return (state->return_value = false, true); -} - -bool scan_heredoc_content_backslash(struct s_heredoc_scan_state *state) -{ - state->did_advance = true; - state->lexer->advance(state->lexer, false); - state->lexer->advance(state->lexer, false); - return (false); -} - -bool scan_heredoc_content_dollar(struct s_heredoc_scan_state *state) -{ - if (state->heredoc->is_raw) - { - state->did_advance = true; - state->lexer->advance(state->lexer, false); - } - if (state->did_advance) - { - state->lexer->mark_end(state->lexer); - state->lexer->result_symbol = state->middle_type; - state->heredoc->started = true; - state->lexer->advance(state->lexer, false); - if (me_isalpha(state->lexer->lookahead) - || state->lexer->lookahead == '{' || state->lexer->lookahead == '(') - return (state->return_value = true, true); - } - if (state->middle_type == HEREDOC_BODY_BEGINNING - && state->lexer->get_column(state->lexer) == 0) - { - state->lexer->result_symbol = state->middle_type; - state->heredoc->started = true; - return (state->return_value = true, true); - } - return (state->return_value = false, true); -} - -bool 
scan_heredoc_content_newline(struct s_heredoc_scan_state *state) -{ - if (!state->did_advance) - state->lexer->advance(state->lexer, true); - else - state->lexer->advance(state->lexer, false); - state->did_advance = true; - if (state->heredoc->allows_indent) - { - while (me_isspace(state->lexer->lookahead)) - state->lexer->advance(state->lexer, false); - } - state->lexer->result_symbol = state->end_type; - if (state->heredoc->started) - state->lexer->result_symbol = state->middle_type; - state->lexer->mark_end(state->lexer); - if (scan_heredoc_end_identifier(state->heredoc, state->lexer)) - { - if (state->lexer->result_symbol == HEREDOC_END) - vec_heredoc_pop(&state->scanner->heredocs, NULL); - return (state->return_value = true, true); - } - return (false); -} - -bool scan_heredoc_content_other(struct s_heredoc_scan_state *state) -{ - if (state->lexer->get_column(state->lexer) == 0) - { - while (me_isspace(state->lexer->lookahead)) - state->lexer->advance(state->lexer, !state->did_advance); - if (state->end_type != SIMPLE_HEREDOC_BODY) - { - state->lexer->result_symbol = state->middle_type; - if (scan_heredoc_end_identifier(state->heredoc, state->lexer)) - return (state->return_value = true, true); - } - if (state->end_type == SIMPLE_HEREDOC_BODY) - { - state->lexer->result_symbol = state->end_type; - state->lexer->mark_end(state->lexer); - if (scan_heredoc_end_identifier(state->heredoc, state->lexer)) - return (state->return_value = true, true); - } - } - state->did_advance = true; - state->lexer->advance(state->lexer, false); - return (false); -} diff --git a/parser/src/scanner/lifetime.c b/parser/src/scanner/lifetime.c deleted file mode 100644 index 0ff5dfad..00000000 --- a/parser/src/scanner/lifetime.c +++ /dev/null @@ -1,47 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* lifetime.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 
2024/09/02 13:17:17 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 17:45:47 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/mem/mem.h" -#include "parser/inner/scanner.h" - -void *tree_sitter_sh_external_scanner_create(void) -{ - t_scanner *scanner; - - scanner = mem_alloc(sizeof(*scanner)); - scanner->heredocs = vec_heredoc_new(16, heredoc_free); - return (scanner); -} - -bool tree_sitter_sh_external_scanner_scan(\ - void *payload, TSLexer *lexer, const bool *valid_symbols) -{ - t_scanner *scanner; - - scanner = (t_scanner *)payload; - return (scan(scanner, lexer, valid_symbols)); -} - -void tree_sitter_sh_external_scanner_destroy(void *payload) -{ - vec_heredoc_free(((t_scanner *)payload)->heredocs); - mem_free((t_scanner *)payload); -} - -void reset(t_scanner *scanner) -{ - t_usize i; - - i = 0; - while (i < scanner->heredocs.len) - reset_heredoc(&scanner->heredocs.buffer[i++]); -} diff --git a/parser/src/scanner/scan.c b/parser/src/scanner/scan.c deleted file mode 100644 index a016b3e3..00000000 --- a/parser/src/scanner/scan.c +++ /dev/null @@ -1,59 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* scan.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/02 17:40:46 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 17:41:52 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/char/char.h" -#include "parser/inner/scanner.h" - -static bool scan2(t_scanner *scanner, TSLexer *lexer, const bool *valid_symbols) -{ - if (valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.len > 0 - && vec_heredoc_last(&scanner->heredocs)->started - && !valid_symbols[ERROR_RECOVERY]) - return (scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, - HEREDOC_END)); - if 
(valid_symbols[HEREDOC_START] && !valid_symbols[ERROR_RECOVERY] - && scanner->heredocs.len > 0) - return (scan_heredoc_start(vec_heredoc_last(&scanner->heredocs), - lexer)); - if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] - || valid_symbols[HEREDOC_ARROW]) && !valid_symbols[ERROR_RECOVERY]) - return (scan_varname(scanner, lexer, valid_symbols)); - if (valid_symbols[BARE_DOLLAR] && !valid_symbols[ERROR_RECOVERY] - && scan_bare_dollar(lexer)) - return (true); - if (valid_symbols[EXPANSION_WORD]) - return (scan_expansion_word(scanner, lexer, valid_symbols)); - return (false); -} - -bool scan(t_scanner *scanner, TSLexer *lexer, const bool *valid_symbols) -{ - if (check_scan_concat(scanner, lexer, valid_symbols)) - return (scan_concat(scanner, lexer, valid_symbols)); - if (scan_double_hash(scanner, lexer, valid_symbols)) - return (true); - if (valid_symbols[EMPTY_VALUE] && (me_isspace(lexer->lookahead) - || lexer->eof(lexer) || lexer->lookahead == ';' - || lexer->lookahead == '&')) - return (lexer->result_symbol = EMPTY_VALUE, true); - if ((valid_symbols[HEREDOC_BODY_BEGINNING] - || valid_symbols[SIMPLE_HEREDOC_BODY]) && scanner->heredocs.len > 0 - && !vec_heredoc_last(&scanner->heredocs)->started - && !valid_symbols[ERROR_RECOVERY]) - return (scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, - SIMPLE_HEREDOC_BODY)); - if (valid_symbols[HEREDOC_END] && scanner->heredocs.len > 0 - && scan_heredoc_end_identifier(vec_heredoc_last(&scanner->heredocs), - lexer)) - return (scan_heredoc_end(scanner, lexer)); - return (scan2(scanner, lexer, valid_symbols)); -} diff --git a/parser/src/scanner/scan_concat.c b/parser/src/scanner/scan_concat.c deleted file mode 100644 index f91da0e8..00000000 --- a/parser/src/scanner/scan_concat.c +++ /dev/null @@ -1,57 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* scan_concat.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ 
*/ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/02 17:37:05 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 18:03:17 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/char/char.h" -#include "parser/inner/scanner.h" - -bool scan_concat(\ - t_scanner *scanner, TSLexer *lexer, const bool *valid_symbols) -{ - ((void)(scanner), (void)(valid_symbols)); - lexer->result_symbol = CONCAT; - if (lexer->lookahead == '`') - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - while (lexer->lookahead != '`' && !lexer->eof(lexer)) - lexer->advance(lexer, false); - if (lexer->eof(lexer)) - return (false); - if (lexer->lookahead == '`') - lexer->advance(lexer, false); - return (me_isspace(lexer->lookahead) || lexer->eof(lexer)); - } - if (lexer->lookahead == '\\') - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - if (lexer->lookahead == '"' || lexer->lookahead == '\'' \ - || lexer->lookahead == '\\') - return (true); - if (lexer->eof(lexer)) - return (false); - } - return (true); -} - -bool check_scan_concat(\ - t_scanner *scanner, TSLexer *lexer, const bool *valid_symbols) -{ - (void)(scanner); - return ((valid_symbols[CONCAT] && !valid_symbols[ERROR_RECOVERY]) \ - && (!(lexer->lookahead == 0 || me_isspace(lexer->lookahead) \ - || lexer->lookahead == '>' || lexer->lookahead == '<' \ - || lexer->lookahead == ')' || lexer->lookahead == '(' \ - || lexer->lookahead == ';' || lexer->lookahead == '&' \ - || lexer->lookahead == '|' || lexer->lookahead == '{' \ - || lexer->lookahead == '}'))); -} diff --git a/parser/src/scanner/scan_dollar.c b/parser/src/scanner/scan_dollar.c deleted file mode 100644 index 68a7f95f..00000000 --- a/parser/src/scanner/scan_dollar.c +++ /dev/null @@ -1,30 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* scan_dollar.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ 
-/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/02 13:26:00 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 17:46:26 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/char/char.h" -#include "parser/inner/scanner.h" - -bool scan_bare_dollar(TSLexer *lexer) -{ - while (me_isspace(lexer->lookahead) \ - && lexer->lookahead != '\n' && !lexer->eof(lexer)) - lexer->advance(lexer, true); - if (lexer->lookahead == '$') - { - lexer->advance(lexer, false); - lexer->result_symbol = BARE_DOLLAR; - lexer->mark_end(lexer); - return (me_isspace(lexer->lookahead) \ - || lexer->eof(lexer) || lexer->lookahead == '\"'); - } - return (false); -} diff --git a/parser/src/scanner/scan_double_hash.c b/parser/src/scanner/scan_double_hash.c deleted file mode 100644 index a5e9883f..00000000 --- a/parser/src/scanner/scan_double_hash.c +++ /dev/null @@ -1,38 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* scan_double_hash.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/02 17:32:35 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 18:00:13 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "parser/inner/scanner.h" - -bool scan_double_hash(\ - t_scanner *scanner, TSLexer *lexer, const bool *valid_symbols) -{ - (void)(scanner); - if (valid_symbols[IMMEDIATE_DOUBLE_HASH] && !valid_symbols[ERROR_RECOVERY]) - { - if (lexer->lookahead == '#') - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - if (lexer->lookahead == '#') - { - lexer->advance(lexer, false); - if (lexer->lookahead != '}') - { - lexer->result_symbol = IMMEDIATE_DOUBLE_HASH; - lexer->mark_end(lexer); - return (true); - } - } - } - } - return (false); -} diff --git a/parser/src/scanner/scan_varname.c b/parser/src/scanner/scan_varname.c 
deleted file mode 100644 index b5cc87e0..00000000 --- a/parser/src/scanner/scan_varname.c +++ /dev/null @@ -1,140 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* scan_varname.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/02 17:26:05 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 18:01:41 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/char/char.h" -#include "parser/inner/scanner.h" - -bool scan_varname(t_scanner *scanner, TSLexer *lexer, - const bool *valid_symbols) -{ - t_heredoc heredoc; - bool is_number; - - while (true) - { - if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' - || lexer->lookahead == '\r' || (lexer->lookahead == '\n' - && !valid_symbols[NEWLINE])) - && !valid_symbols[EXPANSION_WORD]) - lexer->advance(lexer, true); - else if (lexer->lookahead == '\\') - { - lexer->advance(lexer, true); - if (lexer->eof(lexer)) - { - lexer->mark_end(lexer); - lexer->result_symbol = VARIABLE_NAME; - return (true); - } - if (lexer->lookahead == '\r') - lexer->advance(lexer, true); - if (lexer->lookahead == '\n') - lexer->advance(lexer, true); - else - { - if (lexer->lookahead == '\\' && valid_symbols[EXPANSION_WORD]) - return (scan_expansion_word(scanner, lexer, valid_symbols)); - return (false); - } - } - else - break ; - } - if (!valid_symbols[EXPANSION_WORD] && (lexer->lookahead == '*' - || lexer->lookahead == '@' || lexer->lookahead == '?' 
- || lexer->lookahead == '-' || lexer->lookahead == '0' - || lexer->lookahead == '_')) - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - if (lexer->lookahead == '=' || lexer->lookahead == '[' - || lexer->lookahead == ':' || lexer->lookahead == '-' - || lexer->lookahead == '%' || lexer->lookahead == '#' - || lexer->lookahead == '/') - return (false); - if (valid_symbols[EXTGLOB_PATTERN] && me_isspace(lexer->lookahead)) - { - lexer->mark_end(lexer); - lexer->result_symbol = EXTGLOB_PATTERN; - return (true); - } - } - if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') - { - lexer->advance(lexer, false); - if (lexer->lookahead == '<') - { - lexer->advance(lexer, false); - heredoc = heredoc_new(); - vec_heredoc_push(&scanner->heredocs, heredoc); - lexer->result_symbol = HEREDOC_ARROW; - return (true); - } - return (false); - } - is_number = true; - if (me_isdigit(lexer->lookahead)) - lexer->advance(lexer, false); - else if (me_isalpha(lexer->lookahead) || lexer->lookahead == '_') - { - is_number = false; - lexer->advance(lexer, false); - } - else - { - if (lexer->lookahead == '{') - return (false); - if (valid_symbols[EXPANSION_WORD]) - return (scan_expansion_word(scanner, lexer, valid_symbols)); - return (false); - } - while (true) - { - if (me_isdigit(lexer->lookahead)) - lexer->advance(lexer, false); - else if (me_isalpha(lexer->lookahead) || lexer->lookahead == '_') - is_number = (lexer->advance(lexer, false), false); - else - break ; - } - if (is_number && valid_symbols[FILE_DESCRIPTOR] && (lexer->lookahead == '>' - || lexer->lookahead == '<')) - return (lexer->result_symbol = FILE_DESCRIPTOR, true); - if (valid_symbols[VARIABLE_NAME]) - { - if (lexer->lookahead == '+') - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - if (lexer->lookahead == '=' || lexer->lookahead == ':') - return (lexer->result_symbol = VARIABLE_NAME, true); - return (false); - } - if (lexer->lookahead == '/') - return (false); - if (lexer->lookahead == '=' 
|| lexer->lookahead == '[' - || (lexer->lookahead == ':' && !valid_symbols[OPENING_PAREN]) - || lexer->lookahead == '%' || (lexer->lookahead == '#' - && !is_number) || lexer->lookahead == '@' - || (lexer->lookahead == '-')) - return (lexer->mark_end(lexer), - lexer->result_symbol = VARIABLE_NAME, true); - if (lexer->lookahead == '?') - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - lexer->result_symbol = VARIABLE_NAME; - return (me_isalpha(lexer->lookahead)); - } - } - return (false); -} diff --git a/parser/src/scanner/scan_word.c b/parser/src/scanner/scan_word.c deleted file mode 100644 index de97c7fb..00000000 --- a/parser/src/scanner/scan_word.c +++ /dev/null @@ -1,97 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* scan_word.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/02 16:59:16 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 17:58:43 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/char/char.h" -#include "parser/inner/scanner.h" - -bool scan_expansion_word(t_scanner *scanner, TSLexer *lexer, - const bool *valid_symbols) -{ - bool advanced_once; - bool advance_once_space; - - advanced_once = false; - advance_once_space = false; - (void)(scanner); - (void)(lexer); - (void)(valid_symbols); - while (true) - { - if (lexer->lookahead == '\"') - return (false); - if (lexer->lookahead == '$') - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - if (lexer->lookahead == '{' || lexer->lookahead == '(' - || lexer->lookahead == '\'' || me_isalnum(lexer->lookahead)) - { - lexer->result_symbol = EXPANSION_WORD; - return (advanced_once); - } - advanced_once = true; - } - if (lexer->lookahead == '}') - { - lexer->mark_end(lexer); - lexer->result_symbol = EXPANSION_WORD; - return (advanced_once || advance_once_space); - } - if 
(lexer->lookahead == '(' && !(advanced_once || advance_once_space)) - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - while (lexer->lookahead != ')' && !lexer->eof(lexer)) - { - if (lexer->lookahead == '$') - { - lexer->mark_end(lexer); - lexer->advance(lexer, false); - if (lexer->lookahead == '{' || lexer->lookahead == '(' - || lexer->lookahead == '\'' - || me_isalnum(lexer->lookahead)) - { - lexer->result_symbol = EXPANSION_WORD; - return (advanced_once); - } - advanced_once = true; - } - else - { - advanced_once = advanced_once - || !me_isspace(lexer->lookahead); - advance_once_space = advance_once_space - || me_isspace(lexer->lookahead); - lexer->advance(lexer, false); - } - } - lexer->mark_end(lexer); - if (lexer->lookahead == ')') - { - advanced_once = true; - lexer->advance(lexer, false); - lexer->mark_end(lexer); - if (lexer->lookahead == '}') - return (false); - } - else - return (false); - } - if (lexer->lookahead == '\'') - return (false); - if (lexer->eof(lexer)) - return (false); - advanced_once = advanced_once || !me_isspace(lexer->lookahead); - advance_once_space = advance_once_space || me_isspace(lexer->lookahead); - lexer->advance(lexer, false); - } -} diff --git a/parser/src/scanner/serialize.c b/parser/src/scanner/serialize.c deleted file mode 100644 index 0e55b628..00000000 --- a/parser/src/scanner/serialize.c +++ /dev/null @@ -1,62 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* serialize.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: maiboyer +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/01 15:06:56 by maiboyer #+# #+# */ -/* Updated: 2024/09/02 17:06:26 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/types.h" -#include "parser/array.h" -#include "parser/inner/heredoc.h" -#include "parser/inner/scanner.h" -#include "parser/parser.h" - -t_error 
serialize_heredocs(t_scanner *scanner, t_u8 *buffer, t_u32 *size, - t_usize i) -{ - t_heredoc *heredoc; - - heredoc = vec_heredoc_get(&scanner->heredocs, i); - if (heredoc == NULL) - return (ERROR); - if (heredoc->delimiter.len + 1 + sizeof(t_usize) - + (*size) >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) - return (ERROR); - buffer[(*size)++] = (char)heredoc->is_raw; - buffer[(*size)++] = (char)heredoc->started; - buffer[(*size)++] = (char)heredoc->allows_indent; - heredoc->delimiter.len++; - mem_copy(&buffer[(*size)], &heredoc->delimiter.len, sizeof(t_usize)); - size += sizeof(t_usize); - if (heredoc->delimiter.len > 0) - { - mem_copy(&buffer[(*size)], heredoc->delimiter.buf, - heredoc->delimiter.len); - (*size) += heredoc->delimiter.len; - } - heredoc->delimiter.len--; - return (NO_ERROR); -} - -t_u32 tree_sitter_sh_external_scanner_serialize(t_scanner *scanner, - t_u8 *buffer) -{ - t_u32 size; - t_usize i; - - size = 0; - buffer[size++] = (char)scanner->last_glob_paren_depth; - buffer[size++] = (char)scanner->ext_was_in_double_quote; - buffer[size++] = (char)scanner->ext_saw_outside_quote; - buffer[size++] = (char)scanner->heredocs.len; - i = 0; - while (i < scanner->heredocs.len) - if (serialize_heredocs(scanner, buffer, &size, i++)) - return (0); - return (size); -}