diff --git a/.gitignore b/.gitignore index af44c8f7..5b65811e 100644 --- a/.gitignore +++ b/.gitignore @@ -67,4 +67,7 @@ tree-sitter-sh/bindings/ tree-sitter-sh/package-lock.json tree-sitter-sh/pyproject.toml tree-sitter-sh/setup.py -tree-sitter-sh/src/ +tree-sitter-sh/src/grammar.json +tree-sitter-sh/src/parser.c +tree-sitter-sh/src/tree_sitter/ +tree-sitter-sh/src/node-types.json diff --git a/tree-sitter-sh/grammar.js b/tree-sitter-sh/grammar.js index 94c60359..3c221e7b 100644 --- a/tree-sitter-sh/grammar.js +++ b/tree-sitter-sh/grammar.js @@ -9,16 +9,8 @@ // @ts-check const SPECIAL_CHARACTERS = [ - '\'', '"', - '<', '>', - '{', '}', - '\\[', '\\]', - '(', ')', - '`', '$', - '|', '&', ';', - '\\', - '\\s', -]; + "|", "&", ";", "<", ">", "(", ")", "$", "`", "\\", "\"", "'", " ", "\t", "\n", +] const PREC = { UPDATE: 0, @@ -63,7 +55,6 @@ module.exports = grammar({ $._multiline_variable_name, $._special_variable_name, $._statement_not_subshell, - $._redirect, ], externals: $ => [ @@ -76,17 +67,11 @@ module.exports = grammar({ $._empty_value, $._concat, $.variable_name, // Variable name followed by an operator like '=' or '+=' - $.test_operator, $.regex, - $._regex_no_slash, - $._regex_no_space, $._expansion_word, $.extglob_pattern, $._bare_dollar, - $._brace_start, $._immediate_double_hash, - '}', - ']', '<<', '<<-', /\n/, @@ -174,7 +159,7 @@ module.exports = grammar({ field('body', $._statement), field('redirect', repeat1(choice($.file_redirect, $.heredoc_redirect))), ), - field('redirect', repeat1($._redirect)), + field('redirect', repeat1($.file_redirect)), ))), for_statement: $ => seq( @@ -229,64 +214,43 @@ module.exports = grammar({ 'in', optional($._terminator), optional(seq( - repeat($.case_item), - alias($.last_case_item, $.case_item), + repeat(field('cases', $.case_item)), + field('cases', alias($._case_item_last, $.case_item)) )), 'esac', ), - case_item: $ => seq( - choice( - seq( - optional('('), - field('value', choice($._literal, $._extglob_blob)), - 
repeat(seq('|', field('value', choice($._literal, $._extglob_blob)))), - ')', - ), - ), - optional($._statements), - prec(1, choice( - field('termination', ';;'), - field('fallthrough', choice(';&', ';;&')), - )), - ), - - last_case_item: $ => seq( + _case_item_last: $ => seq( optional('('), field('value', choice($._literal, $._extglob_blob)), repeat(seq('|', field('value', choice($._literal, $._extglob_blob)))), ')', - optional($._statements), - optional(prec(1, ';;')), + repeat('\n'), + choice(field('statements', $._statements)), + optional(';;') + ), + + case_item: $ => seq( + optional('('), + field('value', choice($._literal, $._extglob_blob)), + repeat(seq('|', field('value', choice($._literal, $._extglob_blob)))), + ')', + repeat('\n'), + choice(field('statements', $._statements)), + ';;' ), function_definition: $ => prec.right(seq( - choice( - seq( - 'function', - field('name', $.word), - optional(seq('(', ')')), - ), - seq( - field('name', $.word), - '(', ')', - ), - ), - field( - 'body', - choice( - $.compound_statement, - $.subshell, - $.if_statement, - ), - ), - field('redirect', optional($._redirect)), + field('name', $.word), + '(', ')', + field('body', choice($.compound_statement, $.subshell)), + field('redirect', optional($.file_redirect)), )), compound_statement: $ => seq( '{', - optional($._terminated_statement), - token(prec(-1, '}')), + $._terminated_statement, + token(prec(1, '}')), ), subshell: $ => seq('(', $._statements, ')'), @@ -336,7 +300,7 @@ module.exports = grammar({ command: $ => prec.left(seq( repeat(choice( $.variable_assignment, - field('redirect', $._redirect), + field('redirect', $.file_redirect), )), field('name', $.command_name), choice( @@ -386,7 +350,7 @@ module.exports = grammar({ optional(choice( alias($._heredoc_pipeline, $.pipeline), seq( - field('redirect', repeat1($._redirect)), + field('redirect', repeat1($.file_redirect)), optional($._heredoc_expression), ), $._heredoc_expression, @@ -428,19 +392,16 @@ module.exports = 
grammar({ $.heredoc_end, ), - _redirect: $ => $.file_redirect, - // Literals _literal: $ => choice( $.concatenation, $._primary_expression, - alias(prec(-2, repeat1($._special_character)), $.word), + // alias(prec(-2, repeat1($._special_character)), $.word), ), _primary_expression: $ => choice( $.word, - alias($.test_operator, $.word), $.string, $.raw_string, $.number, @@ -448,18 +409,10 @@ module.exports = grammar({ $.simple_expansion, $.command_substitution, $.arithmetic_expansion, - $.brace_expression, ), - arithmetic_expansion: $ => seq('$((', commaSep1($._arithmetic_expression), '))'), + arithmetic_expansion: $ => seq('$((', optional($._arithmetic_expression), '))'), - brace_expression: $ => seq( - alias($._brace_start, '{'), - alias(token.immediate(/\d+/), $.number), - token.immediate('..'), - alias(token.immediate(/\d+/), $.number), - token.immediate('}'), - ), _arithmetic_expression: $ => prec(1, choice( $._arithmetic_literal, alias($._arithmetic_unary_expression, $.unary_expression), @@ -534,23 +487,14 @@ module.exports = grammar({ field('operator', choice('++', '--')), )), - _arithmetic_parenthesized_expression: $ => seq( - '(', - $._arithmetic_expression, - ')', - ), - + _arithmetic_parenthesized_expression: $ => seq('(', $._arithmetic_expression, ')'), concatenation: $ => prec(-1, seq( - choice( - $._primary_expression, - alias($._special_character, $.word), - ), + $._primary_expression, repeat1(seq( choice($._concat, alias(/`\s*`/, '``')), choice( $._primary_expression, - alias($._special_character, $.word), alias($._comment_word, $.word), alias($._bare_dollar, '$'), ), @@ -558,8 +502,6 @@ module.exports = grammar({ optional(seq($._concat, '$')), )), - _special_character: _ => token(prec(-1, choice('{', '}', '[', ']'))), - string: $ => seq( '"', repeat(seq( diff --git a/tree-sitter-sh/src/scanner.c b/tree-sitter-sh/src/scanner.c new file mode 100644 index 00000000..e94166c3 --- /dev/null +++ b/tree-sitter-sh/src/scanner.c @@ -0,0 +1,1243 @@ +#include 
"tree_sitter/array.h" +#include "tree_sitter/parser.h" + +#include +#include +#include +#include + +enum TokenType +{ + HEREDOC_START, + SIMPLE_HEREDOC_BODY, + HEREDOC_BODY_BEGINNING, + HEREDOC_CONTENT, + HEREDOC_END, + FILE_DESCRIPTOR, + EMPTY_VALUE, + CONCAT, + VARIABLE_NAME, + REGEX, + EXPANSION_WORD, + EXTGLOB_PATTERN, + BARE_DOLLAR, + IMMEDIATE_DOUBLE_HASH, + HEREDOC_ARROW, + HEREDOC_ARROW_DASH, + NEWLINE, + OPENING_PAREN, + ESAC, + ERROR_RECOVERY, +}; +// enum TokenType { +// HEREDOC_START, +// SIMPLE_HEREDOC_BODY, +// HEREDOC_BODY_BEGINNING, +// HEREDOC_CONTENT, +// HEREDOC_END, +// FILE_DESCRIPTOR, +// EMPTY_VALUE, +// CONCAT, +// VARIABLE_NAME, +// TEST_OPERATOR, +// REGEX, +// REGEX_NO_SLASH, +// REGEX_NO_SPACE, +// EXPANSION_WORD, +// EXTGLOB_PATTERN, +// BARE_DOLLAR, +// BRACE_START, +// IMMEDIATE_DOUBLE_HASH, +// EXTERNAL_EXPANSION_SYM_HASH, +// EXTERNAL_EXPANSION_SYM_BANG, +// EXTERNAL_EXPANSION_SYM_EQUAL, +// CLOSING_BRACE, +// CLOSING_BRACKET, +// HEREDOC_ARROW, +// HEREDOC_ARROW_DASH, +// NEWLINE, +// OPENING_PAREN, +// ESAC, +// ERROR_RECOVERY, +// }; + +typedef Array(char) String; + +typedef struct +{ + bool is_raw; + bool started; + bool allows_indent; + String delimiter; + String current_leading_word; +} Heredoc; + +#define heredoc_new() \ + { \ + .is_raw = false, \ + .started = false, \ + .allows_indent = false, \ + .delimiter = array_new(), \ + .current_leading_word = array_new(), \ + }; + +typedef struct +{ + uint8_t last_glob_paren_depth; + bool ext_was_in_double_quote; + bool ext_saw_outside_quote; + Array(Heredoc) heredocs; +} Scanner; + +static inline void advance(TSLexer *lexer) +{ + lexer->advance(lexer, false); +} + +static inline void skip(TSLexer *lexer) +{ + lexer->advance(lexer, true); +} + +static inline bool in_error_recovery(const bool *valid_symbols) +{ + return valid_symbols[ERROR_RECOVERY]; +} + +static inline void reset_string(String *string) +{ + if (string->size > 0) + { + memset(string->contents, 0, string->size); + 
array_clear(string);
+    }
+}
+
+/*
+ * Put a heredoc back into its initial state: flags cleared and delimiter
+ * emptied. current_leading_word is left as it is.
+ */
+static inline void reset_heredoc(Heredoc *heredoc)
+{
+    heredoc->is_raw = false;
+    heredoc->started = false;
+    heredoc->allows_indent = false;
+    reset_string(&heredoc->delimiter);
+}
+
+/* Reset every heredoc currently held by the scanner; the array keeps its size. */
+static inline void reset(Scanner *scanner)
+{
+    for (uint32_t i = 0; i < scanner->heredocs.size; i++)
+    {
+        reset_heredoc(array_get(&scanner->heredocs, i));
+    }
+}
+
+/*
+ * Write the scanner state into `buffer`.
+ *
+ * Layout: 4 header bytes (glob paren depth, two ext flags, heredoc count),
+ * then for each heredoc 3 flag bytes, a 4-byte delimiter length and the
+ * delimiter bytes themselves.
+ *
+ * Returns the number of bytes written, or 0 when the state does not fit
+ * into TREE_SITTER_SERIALIZATION_BUFFER_SIZE.
+ */
+static unsigned serialize(Scanner *scanner, char *buffer)
+{
+    uint32_t size = 0;
+
+    buffer[size++] = (char)scanner->last_glob_paren_depth;
+    buffer[size++] = (char)scanner->ext_was_in_double_quote;
+    buffer[size++] = (char)scanner->ext_saw_outside_quote;
+    buffer[size++] = (char)scanner->heredocs.size;
+
+    for (uint32_t i = 0; i < scanner->heredocs.size; i++)
+    {
+        Heredoc *heredoc = array_get(&scanner->heredocs, i);
+        /* The 3 flag bytes, the 4-byte length field and the delimiter must
+         * all fit; leaving out the length field allowed a write of up to
+         * sizeof(uint32_t) bytes past the end of the buffer. */
+        if (size + 3 + sizeof(uint32_t) + heredoc->delimiter.size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE)
+        {
+            return 0;
+        }
+
+        buffer[size++] = (char)heredoc->is_raw;
+        buffer[size++] = (char)heredoc->started;
+        buffer[size++] = (char)heredoc->allows_indent;
+
+        memcpy(&buffer[size], &heredoc->delimiter.size, sizeof(uint32_t));
+        size += sizeof(uint32_t);
+        if (heredoc->delimiter.size > 0)
+        {
+            memcpy(&buffer[size], heredoc->delimiter.contents, heredoc->delimiter.size);
+            size += heredoc->delimiter.size;
+        }
+    }
+    return size;
+}
+
+/*
+ * Restore the scanner state from `buffer`. An empty buffer resets the
+ * heredocs in place; otherwise the layout written by serialize() is read
+ * back, reusing existing Heredoc slots and appending new ones as needed.
+ */
+static void deserialize(Scanner *scanner, const char *buffer, unsigned length)
+{
+    if (length == 0)
+    {
+        reset(scanner);
+    }
+    else
+    {
+        uint32_t size = 0;
+        scanner->last_glob_paren_depth = buffer[size++];
+        scanner->ext_was_in_double_quote = buffer[size++];
+        scanner->ext_saw_outside_quote = buffer[size++];
+        uint32_t heredoc_count = (unsigned char)buffer[size++];
+        for (uint32_t i = 0; i < heredoc_count; i++)
+        {
+            Heredoc *heredoc = NULL;
+            if (i < scanner->heredocs.size)
+            {
+                heredoc = array_get(&scanner->heredocs, i);
+            }
+            else
+            {
+                Heredoc new_heredoc = heredoc_new();
+                array_push(&scanner->heredocs, new_heredoc);
+                heredoc =
array_back(&scanner->heredocs); + } + + heredoc->is_raw = buffer[size++]; + heredoc->started = buffer[size++]; + heredoc->allows_indent = buffer[size++]; + /* The delimiter is stored as a 4-byte length followed by that many bytes. */ + memcpy(&heredoc->delimiter.size, &buffer[size], sizeof(uint32_t)); + size += sizeof(uint32_t); + array_reserve(&heredoc->delimiter, heredoc->delimiter.size); + + if (heredoc->delimiter.size > 0) + { + memcpy(heredoc->delimiter.contents, &buffer[size], heredoc->delimiter.size); + size += heredoc->delimiter.size; + } + } /* NOTE(review): entries already in scanner->heredocs beyond heredoc_count are left in place here; confirm that is intended. */ + assert(size == length); /* every byte of the buffer must have been consumed */ + } +} + +/** + * Consume a "word" in POSIX parlance, and returns it unquoted. + * + * This is an approximate implementation that doesn't deal with any + * POSIX-mandated substitution, and assumes the default value for + * IFS. + * + * Returns true when at least one character was appended to unquoted_word. + * The buffer is NUL-terminated on every path except the early return taken + * when the input ends right after a backslash. + */ +static bool advance_word(TSLexer *lexer, String *unquoted_word) +{ + bool empty = true; + int32_t quote = 0; + /* An opening quote selects quoted mode: the word then ends at the matching quote or at end of line instead of at whitespace. */ + if (lexer->lookahead == '\'' || lexer->lookahead == '"') + { + quote = lexer->lookahead; + advance(lexer); + } + + while (lexer->lookahead && + !(quote ? 
lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n' : iswspace(lexer->lookahead))) + { + if (lexer->lookahead == '\\') + { + advance(lexer); /* drop the backslash; the character after it is taken literally */ + if (!lexer->lookahead) + return false; + } + empty = false; + array_push(unquoted_word, lexer->lookahead); + advance(lexer); + } + array_push(unquoted_word, '\0'); /* terminator, so the contents can be compared with strcmp */ + + if (quote && lexer->lookahead == quote) + advance(lexer); + + return !empty; +} + /* Skip whitespace other than newlines, then consume a '$'. Returns true only when that '$' is followed by whitespace, end of input or a double quote. */ +static inline bool scan_bare_dollar(TSLexer *lexer) +{ + while (iswspace(lexer->lookahead) && lexer->lookahead != '\n' && !lexer->eof(lexer)) + skip(lexer); + + + if (lexer->lookahead == '$') + { + advance(lexer); + lexer->result_symbol = BARE_DOLLAR; + lexer->mark_end(lexer); + return (iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == '\"'); + } + + return false; +} + /* Skip whitespace, mark the heredoc as raw when the delimiter starts with a quote or backslash, and read the delimiter word into heredoc->delimiter. Returns false (with the delimiter emptied) when no word was read. */ +static bool scan_heredoc_start(Heredoc *heredoc, TSLexer *lexer) +{ + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + + lexer->result_symbol = HEREDOC_START; + heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' || lexer->lookahead == '\\'; + + bool found_delimiter = advance_word(lexer, &heredoc->delimiter); + if (!found_delimiter) + { + reset_string(&heredoc->delimiter); + return false; + } + return found_delimiter; +} + /* Consume input for as long as it matches the heredoc delimiter, collecting it in current_leading_word. Returns true when the collected text equals the delimiter; always false for an empty delimiter. */ +static bool scan_heredoc_end_identifier(Heredoc *heredoc, TSLexer *lexer) +{ + reset_string(&heredoc->current_leading_word); + // Scan the first 'n' characters on this line, to see if they match the + // heredoc delimiter + int32_t size = 0; + if (heredoc->delimiter.size > 0) + { + while (lexer->lookahead != '\0' && lexer->lookahead != '\n' && (int32_t)*array_get(&heredoc->delimiter, size) == lexer->lookahead && + heredoc->current_leading_word.size < heredoc->delimiter.size) + { + array_push(&heredoc->current_leading_word, lexer->lookahead); + advance(lexer); + size++; + } + } + array_push(&heredoc->current_leading_word, '\0'); + return heredoc->delimiter.size == 0 ? 
false : strcmp(heredoc->current_leading_word.contents, heredoc->delimiter.contents) == 0;
+}
+
+/*
+ * Scan heredoc body text for the innermost heredoc.
+ *
+ * middle_type is reported when the scan stops at the start of an expansion
+ * (or at the delimiter once the heredoc has `started`); end_type is reported
+ * when the body ends before any middle token was produced.
+ *
+ *  - end of input after consuming something: reset the heredoc, emit end_type
+ *  - backslash: consumed together with the character after it
+ *  - '$': plain text in a raw heredoc; otherwise the token ends before it
+ *    when it is followed by a letter, '{' or '('
+ *  - newline / column 0: try to match the closing delimiter
+ *
+ * Returns true when lexer->result_symbol holds a token to emit.
+ */
+static bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer, enum TokenType middle_type, enum TokenType end_type)
+{
+    bool did_advance = false;
+    Heredoc *heredoc = array_back(&scanner->heredocs);
+
+    for (;;)
+    {
+        switch (lexer->lookahead)
+        {
+        case '\0': {
+            if (lexer->eof(lexer) && did_advance)
+            {
+                reset_heredoc(heredoc);
+                lexer->result_symbol = end_type;
+                return true;
+            }
+            return false;
+        }
+
+        case '\\': {
+            did_advance = true;
+            advance(lexer);
+            advance(lexer);
+            break;
+        }
+
+        case '$': {
+            if (heredoc->is_raw)
+            {
+                did_advance = true;
+                advance(lexer);
+                break;
+            }
+            if (did_advance)
+            {
+                /* End the token before the '$' in case an expansion follows. */
+                lexer->mark_end(lexer);
+                lexer->result_symbol = middle_type;
+                heredoc->started = true;
+                advance(lexer);
+                if (iswalpha(lexer->lookahead) || lexer->lookahead == '{' || lexer->lookahead == '(')
+                {
+                    return true;
+                }
+                break;
+            }
+            if (middle_type == HEREDOC_BODY_BEGINNING && lexer->get_column(lexer) == 0)
+            {
+                lexer->result_symbol = middle_type;
+                heredoc->started = true;
+                return true;
+            }
+            return false;
+        }
+
+        case '\n': {
+            if (!did_advance)
+            {
+                skip(lexer);
+            }
+            else
+            {
+                advance(lexer);
+            }
+            did_advance = true;
+            if (heredoc->allows_indent)
+            {
+                while (iswspace(lexer->lookahead))
+                {
+                    advance(lexer);
+                }
+            }
+            lexer->result_symbol = heredoc->started ? middle_type : end_type;
+            lexer->mark_end(lexer);
+            if (scan_heredoc_end_identifier(heredoc, lexer))
+            {
+                if (lexer->result_symbol == HEREDOC_END)
+                {
+                    /* Release the heredoc's strings before dropping the
+                     * entry, as the HEREDOC_END branch of scan() does;
+                     * array_pop on its own would leak both buffers. */
+                    array_delete(&heredoc->current_leading_word);
+                    array_delete(&heredoc->delimiter);
+                    array_pop(&scanner->heredocs);
+                }
+                return true;
+            }
+            break;
+        }
+
+        default: {
+            if (lexer->get_column(lexer) == 0)
+            {
+                // an alternative is to check the starting column of the
+                // heredoc body and track that statefully
+                while (iswspace(lexer->lookahead))
+                {
+                    if (did_advance)
+                    {
+                        advance(lexer);
+                    }
+                    else
+                    {
+                        skip(lexer);
+                    }
+                }
+                if (end_type != SIMPLE_HEREDOC_BODY)
+                {
+                    lexer->result_symbol = middle_type;
+                    if (scan_heredoc_end_identifier(heredoc, lexer))
+                    {
+                        return true;
+                    }
+                }
+                if (end_type == SIMPLE_HEREDOC_BODY)
+                {
+                    lexer->result_symbol = end_type;
+                    lexer->mark_end(lexer);
+                    if (scan_heredoc_end_identifier(heredoc, lexer))
+                    {
+                        return true;
+                    }
+                }
+            }
+            did_advance = true;
+            advance(lexer);
+            break;
+        }
+        }
+    }
+}
+
+static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
+{
+    if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols))
+    {
+        if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' || lexer->lookahead == '<' ||
+              lexer->lookahead == ')' || lexer->lookahead == '(' || lexer->lookahead == ';' || lexer->lookahead == '&' ||
+              lexer->lookahead == '|'))
+        {
+            lexer->result_symbol = CONCAT;
+            // So for a`b`, we want to return a concat. We check if the
+            // 2nd backtick has whitespace after it, and if it does we
+            // return concat.
+ if (lexer->lookahead == '`') + { + lexer->mark_end(lexer); + advance(lexer); + while (lexer->lookahead != '`' && !lexer->eof(lexer)) + { + advance(lexer); + } + if (lexer->eof(lexer)) + { + return false; + } + if (lexer->lookahead == '`') + { + advance(lexer); + } + return iswspace(lexer->lookahead) || lexer->eof(lexer); + } + // strings w/ expansions that contains escaped quotes or + // backslashes need this to return a concat + if (lexer->lookahead == '\\') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '"' || lexer->lookahead == '\'' || lexer->lookahead == '\\') + { + return true; + } + if (lexer->eof(lexer)) + { + return false; + } + } + else + { + return true; + } + } + } + + if (valid_symbols[IMMEDIATE_DOUBLE_HASH] && !in_error_recovery(valid_symbols)) + { + // advance two # and ensure not } after + if (lexer->lookahead == '#') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '#') + { + advance(lexer); + if (lexer->lookahead != '}') + { + lexer->result_symbol = IMMEDIATE_DOUBLE_HASH; + lexer->mark_end(lexer); + return true; + } + } + } + } + + if (valid_symbols[EMPTY_VALUE]) + { + if (iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == ';' || lexer->lookahead == '&') + { + lexer->result_symbol = EMPTY_VALUE; + return true; + } + } + + if ((valid_symbols[HEREDOC_BODY_BEGINNING] || valid_symbols[SIMPLE_HEREDOC_BODY]) && scanner->heredocs.size > 0 && + !array_back(&scanner->heredocs)->started && !in_error_recovery(valid_symbols)) + { + return scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY); + } + + if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0) + { + Heredoc *heredoc = array_back(&scanner->heredocs); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + array_delete(&heredoc->current_leading_word); + array_delete(&heredoc->delimiter); + array_pop(&scanner->heredocs); + lexer->result_symbol = HEREDOC_END; + return true; + } + } + + if 
(valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 && array_back(&scanner->heredocs)->started && + !in_error_recovery(valid_symbols)) + { + return scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, HEREDOC_END); + } + + if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) && scanner->heredocs.size > 0) + { + return scan_heredoc_start(array_back(&scanner->heredocs), lexer); + } + + if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) && + !in_error_recovery(valid_symbols)) + { + for (;;) + { + if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' || + (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) && + !valid_symbols[EXPANSION_WORD]) + { + skip(lexer); + } + else if (lexer->lookahead == '\\') + { + skip(lexer); + + if (lexer->eof(lexer)) + { + lexer->mark_end(lexer); + lexer->result_symbol = VARIABLE_NAME; + return true; + } + + if (lexer->lookahead == '\r') + { + skip(lexer); + } + if (lexer->lookahead == '\n') + { + skip(lexer); + } + else + { + if (lexer->lookahead == '\\' && valid_symbols[EXPANSION_WORD]) + { + goto expansion_word; + } + return false; + } + } + else + { + break; + } + } + + // no '*', '@', '?', '-', '$', '0', '_' + if (!valid_symbols[EXPANSION_WORD] && (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' 
|| + lexer->lookahead == '-' || lexer->lookahead == '0' || lexer->lookahead == '_')) + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' || lexer->lookahead == '-' || + lexer->lookahead == '%' || lexer->lookahead == '#' || lexer->lookahead == '/') + { + return false; + } + if (valid_symbols[EXTGLOB_PATTERN] && iswspace(lexer->lookahead)) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') + { + advance(lexer); + if (lexer->lookahead == '<') + { + advance(lexer); + if (lexer->lookahead == '-') + { + advance(lexer); + Heredoc heredoc = heredoc_new(); + heredoc.allows_indent = true; + array_push(&scanner->heredocs, heredoc); + lexer->result_symbol = HEREDOC_ARROW_DASH; + } + else if (lexer->lookahead == '<' || lexer->lookahead == '=') + { + return false; + } + else + { + Heredoc heredoc = heredoc_new(); + array_push(&scanner->heredocs, heredoc); + lexer->result_symbol = HEREDOC_ARROW; + } + return true; + } + return false; + } + + bool is_number = true; + if (iswdigit(lexer->lookahead)) + { + advance(lexer); + } + else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') + { + is_number = false; + advance(lexer); + } + else + { + if (lexer->lookahead == '{') + { + goto brace_start; + } + if (valid_symbols[EXPANSION_WORD]) + { + goto expansion_word; + } + if (valid_symbols[EXTGLOB_PATTERN]) + { + goto extglob_pattern; + } + return false; + } + + for (;;) + { + if (iswdigit(lexer->lookahead)) + { + advance(lexer); + } + else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') + { + is_number = false; + advance(lexer); + } + else + { + break; + } + } + + if (is_number && valid_symbols[FILE_DESCRIPTOR] && (lexer->lookahead == '>' || lexer->lookahead == '<')) + { + lexer->result_symbol = FILE_DESCRIPTOR; + return true; + } + + if (valid_symbols[VARIABLE_NAME]) + { + if 
(lexer->lookahead == '+') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '=' || lexer->lookahead == ':') + { + lexer->result_symbol = VARIABLE_NAME; + return true; + } + return false; + } + if (lexer->lookahead == '/') + { + return false; + } + if (lexer->lookahead == '=' || lexer->lookahead == '[' || + (lexer->lookahead == ':' && + !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable + // names for function words, only handling : for now? #235 + lexer->lookahead == '%' || + (lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' || (lexer->lookahead == '-')) + { + lexer->mark_end(lexer); + lexer->result_symbol = VARIABLE_NAME; + return true; + } + + if (lexer->lookahead == '?') + { + lexer->mark_end(lexer); + advance(lexer); + lexer->result_symbol = VARIABLE_NAME; + return iswalpha(lexer->lookahead); + } + } + + return false; + } + + if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && scan_bare_dollar(lexer)) + { + return true; + } + +regex: + if ((valid_symbols[REGEX]) && !in_error_recovery(valid_symbols)) + { + if (valid_symbols[REGEX]) + { + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + } + + if ((lexer->lookahead != '"' && lexer->lookahead != '\'') || ((lexer->lookahead == '$' || lexer->lookahead == '\'')) || + (lexer->lookahead == '\'')) + { + typedef struct + { + bool done; + bool advanced_once; + bool found_non_alnumdollarunderdash; + bool last_was_escape; + bool in_single_quote; + uint32_t paren_depth; + uint32_t bracket_depth; + uint32_t brace_depth; + } State; + + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '(') + { + return false; + } + } + + lexer->mark_end(lexer); + + State state = {false, false, false, false, false, 0, 0, 0}; + while (!state.done) + { + if (state.in_single_quote) + { + if (lexer->lookahead == '\'') + { + state.in_single_quote = false; + advance(lexer); + 
lexer->mark_end(lexer); + } + } + switch (lexer->lookahead) + { + case '\\': + state.last_was_escape = true; + break; + case '\0': + return false; + case '(': + state.paren_depth++; + state.last_was_escape = false; + break; + case '[': + state.bracket_depth++; + state.last_was_escape = false; + break; + case '{': + if (!state.last_was_escape) + state.brace_depth++; + state.last_was_escape = false; + break; + case ')': + if (state.paren_depth == 0) + state.done = true; + state.paren_depth--; + state.last_was_escape = false; + break; + case ']': + if (state.bracket_depth == 0) + state.done = true; + state.bracket_depth--; + state.last_was_escape = false; + break; + case '}': + if (state.brace_depth == 0) + state.done = true; + state.brace_depth--; + state.last_was_escape = false; + break; + case '\'': + // Enter or exit a single-quoted string. + state.in_single_quote = !state.in_single_quote; + advance(lexer); + state.advanced_once = true; + state.last_was_escape = false; + continue; + default: + state.last_was_escape = false; + break; + } + + if (!state.done) + { + if (valid_symbols[REGEX]) + { + bool was_space = !state.in_single_quote && iswspace(lexer->lookahead); + advance(lexer); + state.advanced_once = true; + if (!was_space || state.paren_depth > 0) + { + lexer->mark_end(lexer); + } + } + } + } + + lexer->result_symbol = REGEX; + if (valid_symbols[REGEX] && !state.advanced_once) + { + return false; + } + return true; + } + } + +extglob_pattern: + if (valid_symbols[EXTGLOB_PATTERN] && !in_error_recovery(valid_symbols)) + { + // first skip ws, then check for ? * + @ ! + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + + if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' || + lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' || + lexer->lookahead == '.' 
|| lexer->lookahead == '[' || (iswalpha(lexer->lookahead))) + { + if (lexer->lookahead == '\\') + { + advance(lexer); + if ((iswspace(lexer->lookahead) || lexer->lookahead == '"') && lexer->lookahead != '\r' && lexer->lookahead != '\n') + { + advance(lexer); + } + else + { + return false; + } + } + + if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) + { + lexer->mark_end(lexer); + advance(lexer); + + if (iswspace(lexer->lookahead)) + { + return false; + } + } + + lexer->mark_end(lexer); + bool was_non_alpha = !iswalpha(lexer->lookahead); + if (lexer->lookahead != '[') + { + // no esac + if (lexer->lookahead == 'e') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == 's') + { + advance(lexer); + if (lexer->lookahead == 'a') + { + advance(lexer); + if (lexer->lookahead == 'c') + { + advance(lexer); + if (iswspace(lexer->lookahead)) + { + return false; + } + } + } + } + } + else + { + advance(lexer); + } + } + + // -\w is just a word, find something else special + if (lexer->lookahead == '-') + { + lexer->mark_end(lexer); + advance(lexer); + while (iswalnum(lexer->lookahead)) + { + advance(lexer); + } + + if (lexer->lookahead == ')' || lexer->lookahead == '\\' || lexer->lookahead == '.') + { + return false; + } + lexer->mark_end(lexer); + } + + // case item -) or *) + if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) + { + lexer->mark_end(lexer); + advance(lexer); + if (iswspace(lexer->lookahead)) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return was_non_alpha; + } + } + + if (iswspace(lexer->lookahead)) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return true; + } + + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(') + { + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (lexer->lookahead == '|') + { + lexer->mark_end(lexer); + 
advance(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + + if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' && lexer->lookahead != '[' && + lexer->lookahead != '?' && lexer->lookahead != '/' && lexer->lookahead != '\\' && lexer->lookahead != '_' && + lexer->lookahead != '*') + { + return false; + } + + typedef struct + { + bool done; + bool saw_non_alphadot; + uint32_t paren_depth; + uint32_t bracket_depth; + uint32_t brace_depth; + } State; + + State state = {false, was_non_alpha, scanner->last_glob_paren_depth, 0, 0}; + while (!state.done) + { + switch (lexer->lookahead) + { + case '\0': + return false; + case '(': + state.paren_depth++; + break; + case '[': + state.bracket_depth++; + break; + case '{': + state.brace_depth++; + break; + case ')': + if (state.paren_depth == 0) + { + state.done = true; + } + state.paren_depth--; + break; + case ']': + if (state.bracket_depth == 0) + { + state.done = true; + } + state.bracket_depth--; + break; + case '}': + if (state.brace_depth == 0) + { + state.done = true; + } + state.brace_depth--; + break; + } + + if (lexer->lookahead == '|') + { + lexer->mark_end(lexer); + advance(lexer); + if (state.paren_depth == 0 && state.bracket_depth == 0 && state.brace_depth == 0) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (!state.done) + { + bool was_space = iswspace(lexer->lookahead); + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' 
&& lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + if (lexer->lookahead == '(' || lexer->lookahead == '{') + { + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = state.paren_depth; + return state.saw_non_alphadot; + } + } + if (was_space) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + if (lexer->lookahead == '"') + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + if (lexer->lookahead == '\\') + { + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + if (iswspace(lexer->lookahead) || lexer->lookahead == '"') + { + advance(lexer); + } + } + else + { + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + } + if (!was_space) + { + lexer->mark_end(lexer); + } + } + } + + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + scanner->last_glob_paren_depth = 0; + + return false; + } + +expansion_word: + if (valid_symbols[EXPANSION_WORD]) + { + bool advanced_once = false; + bool advance_once_space = false; + for (;;) + { + if (lexer->lookahead == '\"') + { + return false; + } + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' || iswalnum(lexer->lookahead)) + { + lexer->result_symbol = EXPANSION_WORD; + return advanced_once; + } + advanced_once = true; + } + + if (lexer->lookahead == '}') + { + lexer->mark_end(lexer); + lexer->result_symbol = EXPANSION_WORD; + return advanced_once || advance_once_space; + } + + if (lexer->lookahead == '(' && 
!(advanced_once || advance_once_space)) + { + lexer->mark_end(lexer); + advance(lexer); + while (lexer->lookahead != ')' && !lexer->eof(lexer)) + { + // if we find a $( or ${ assume this is valid and is + // a garbage concatenation of some weird word + an + // expansion + // I wonder where this can fail + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' || iswalnum(lexer->lookahead)) + { + lexer->result_symbol = EXPANSION_WORD; + return advanced_once; + } + advanced_once = true; + } + else + { + advanced_once = advanced_once || !iswspace(lexer->lookahead); + advance_once_space = advance_once_space || iswspace(lexer->lookahead); + advance(lexer); + } + } + lexer->mark_end(lexer); + if (lexer->lookahead == ')') + { + advanced_once = true; + advance(lexer); + lexer->mark_end(lexer); + if (lexer->lookahead == '}') + { + return false; + } + } + else + { + return false; + } + } + + if (lexer->lookahead == '\'') + { + return false; + } + + if (lexer->eof(lexer)) + { + return false; + } + advanced_once = advanced_once || !iswspace(lexer->lookahead); + advance_once_space = advance_once_space || iswspace(lexer->lookahead); + advance(lexer); + } + } + +brace_start: + return false; +} + +void *tree_sitter_bash_external_scanner_create() +{ + Scanner *scanner = calloc(1, sizeof(Scanner)); + array_init(&scanner->heredocs); + return scanner; +} + +bool tree_sitter_bash_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) +{ + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +unsigned tree_sitter_bash_external_scanner_serialize(void *payload, char *state) +{ + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, state); +} + +void tree_sitter_bash_external_scanner_deserialize(void *payload, const char *state, unsigned length) +{ + Scanner *scanner = (Scanner *)payload; + 
deserialize(scanner, state, length); +} + +void tree_sitter_bash_external_scanner_destroy(void *payload) +{ + Scanner *scanner = (Scanner *)payload; + for (size_t i = 0; i < scanner->heredocs.size; i++) + { + Heredoc *heredoc = array_get(&scanner->heredocs, i); + array_delete(&heredoc->current_leading_word); + array_delete(&heredoc->delimiter); + } + array_delete(&scanner->heredocs); + free(scanner); +}