From dcdefc2ac3f8e4dfef18556507c9278d417ed3a5 Mon Sep 17 00:00:00 2001 From: Maix0 Date: Tue, 2 Jul 2024 20:43:02 +0200 Subject: [PATCH] Updated grammar file (not code) --- ast/src/from_node.c | 7 +- tree-sitter-sh/grammar.js | 1197 ++++++++++++++++----------------- tree-sitter-sh/src/scanner.c | 1200 ++++++++++++++++++++++++++++++++++ 3 files changed, 1809 insertions(+), 595 deletions(-) create mode 100644 tree-sitter-sh/src/scanner.c diff --git a/ast/src/from_node.c b/ast/src/from_node.c index 0c58d78c..78253773 100644 --- a/ast/src/from_node.c +++ b/ast/src/from_node.c @@ -6,7 +6,7 @@ /* By: maiboyer +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/06/17 12:41:56 by maiboyer #+# #+# */ -/* Updated: 2024/07/01 21:44:49 by maiboyer ### ########.fr */ +/* Updated: 2024/07/02 15:49:56 by maiboyer ### ########.fr */ /* */ /* ************************************************************************** */ @@ -593,13 +593,15 @@ t_error build_sym_program(t_parse_node self, t_const_str input, t_ast_node *out) t_error build_sym_raw_string(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_redirected_statement(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_regex(t_parse_node self, t_const_str input, t_ast_node *out); +t_error build_sym_simple_expansion(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_string_content(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_subshell(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_variable_assignment(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_while_statement(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_word(t_parse_node self, t_const_str input, t_ast_node *out); -t_error build_sym_simple_expansion(t_parse_node self, t_const_str input, t_ast_node *out); +t_error build_sym_expansion(t_parse_node self, t_const_str input, t_ast_node *out); + /* FUNCTION THAT ARE NOT DONE */ @@ -612,7 +614,6 @@ t_error build_sym_arithmetic_unary_expression(t_parse_node self, t_const_str inp t_error build_sym_arithmetic_expansion(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_command_substitution(t_parse_node self, t_const_str input, t_ast_node *out); -t_error build_sym_expansion(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_expansion_expression(t_parse_node self, t_const_str input, t_ast_node *out); t_error build_sym_expansion_regex(t_parse_node self, t_const_str input, t_ast_node *out); diff --git a/tree-sitter-sh/grammar.js b/tree-sitter-sh/grammar.js index 72359299..d490d3b5 100644 --- a/tree-sitter-sh/grammar.js +++ b/tree-sitter-sh/grammar.js @@ -9,598 +9,611 @@ // @ts-check const SPECIAL_CHARACTERS = [ - '|', '&', ';', '<', '>', '(', ')', '$', '`', '\\', '\"', '\'', ' ', '\t', '\n', + '|', '&', ';', '<', '>', '(', ')', '$', '`', '\\', '\"', '\'', ' ', '\t', '\n', ] const PREC = { - UPDATE: 0, - ASSIGN: 1, - TERNARY: 2, - LOGICAL_OR: 3, - LOGICAL_AND: 4, - BITWISE_OR: 5, - BITWISE_XOR: 6, - BITWISE_AND: 7, - EQUALITY: 8, - COMPARE: 9, - TEST: 10, - UNARY: 11, - SHIFT: 12, - ADD: 13, - MULTIPLY: 14, - EXPONENT: 15, - NEGATE: 16, - PREFIX: 17, - POSTFIX: 18, + UPDATE: 0, + ASSIGN: 1, + TERNARY: 2, + LOGICAL_OR: 3, + LOGICAL_AND: 4, + BITWISE_OR: 5, + BITWISE_XOR: 6, + BITWISE_AND: 7, + EQUALITY: 8, + COMPARE: 9, + TEST: 10, + UNARY: 11, + SHIFT: 12, + ADD: 13, + MULTIPLY: 14, + EXPONENT: 15, + NEGATE: 16, + PREFIX: 17, + POSTFIX: 18, }; module.exports = grammar({ - name: 'sh', - - conflicts: $ => [ - [$.command, $._variable_assignments], - [$.redirected_statement, $.command], - [$.redirected_statement, $.command_substitution], - [$.function_definition, $.command_name], - [$.pipeline], - ], - - inline: $ => [ - $._statement, - // $._terminator, - $._literal, - $._terminated_statement, - $._primary_expression, - $._simple_variable_name, - $._multiline_variable_name, - $._special_variable_name, - $._statement_not_subshell, - ], - - externals: $ => [ - $.heredoc_start, - $.simple_heredoc_body, - $._heredoc_body_beginning, - $.heredoc_content, - $.heredoc_end, - $.file_descriptor, - $._empty_value, - $._concat, - $.variable_name, - $.regex, - $._expansion_word, - $.extglob_pattern, - $._bare_dollar, - $._immediate_double_hash, - '<<', - '<<-', - /\n/, - '(', - 'esac', - $.__error_recovery, - ], - - extras: $ => [ - $.comment, - /\s/, - /\\\r?\n/, - /\\( |\t|\v|\f)/, - ], - - // supertypes: $ => [ - // $._statement, - // $._primary_expression, - // ], - - word: $ => $.word, - - rules: { - program: $ => optional($._statements), - - _statements: $ => prec(1, seq( - repeat(seq( - field('stmt', $._statement), - field('terminator', $.terminator), - )), - field('stmt', $._statement), - field('terminator', optional($.terminator)), - )), - - _terminated_statement: $ => repeat1(seq( - field('stmt', $._statement), - field('terminator', $.terminator) - )), - - // Statements - - _statement: $ => choice( - $._statement_not_subshell, - $.subshell, - ), - - _statement_not_subshell: $ => choice( - $.case_statement, - $.command, - $.compound_statement, - $.for_statement, - $.function_definition, - $.if_statement, - $.list, - $.negated_command, - $.pipeline, - $.redirected_statement, - $.variable_assignment, - $._variable_assignments, - $.while_statement, - ), - - _statement_not_pipeline: $ => prec(1, choice( - $.case_statement, - $.command, - $.compound_statement, - $.for_statement, - $.function_definition, - $.if_statement, - $.list, - $.negated_command, - $.redirected_statement, - $.subshell, - $.variable_assignment, - $._variable_assignments, - $.while_statement, - )), - - redirected_statement: $ => prec.dynamic(-1, prec.right(-1, choice( - seq( - field('body', $._statement), - field('redirect', repeat1(choice($.file_redirect, $.heredoc_redirect))), - ), - field('redirect', repeat1($.file_redirect)), - ))), - - for_statement: $ => seq( - 'for', - field('variable', $._simple_variable_name), - optional(seq( - 'in', - field('value', repeat1($._literal)), - )), - $.terminator, - field('body', $.do_group), - ), - - while_statement: $ => seq( - choice('while', 'until'), - field('condition', $._terminated_statement), - field('body', $.do_group), - ), - - do_group: $ => seq( - 'do', - optional($._terminated_statement), - 'done', - ), - - if_statement: $ => seq( - 'if', - field('condition', alias($._terminated_statement, $.statements)), - 'then', - field('body', alias(optional($._terminated_statement), $.statements)), - field('elif', repeat($.elif_clause)), - field('else', optional($.else_clause)), - 'fi', - ), - - elif_clause: $ => seq( - 'elif', - field('condition', alias($._terminated_statement, $.statements)), - 'then', - field('body', alias(optional($._terminated_statement), $.statements)), - ), - - else_clause: $ => seq( - 'else', - field('body', alias(optional($._terminated_statement), $.statements)), - ), - - case_statement: $ => seq( - 'case', - field('value', $._literal), - optional($.terminator), - 'in', - optional($.terminator), - optional(seq( - repeat(field('cases', $.case_item)), - field('cases', alias($._case_item_last, $.case_item)) - )), - 'esac', - ), - - _case_item_last: $ => seq( - optional('('), - field('value', choice($._literal, $._extglob_blob)), - repeat(seq('|', field('value', choice($._literal, $._extglob_blob)))), - ')', - repeat('\n'), - choice(field('body', alias($._statements, $.statements)),), - optional(';;') - ), - - case_item: $ => seq( - optional('('), - field('value', choice($._literal, $._extglob_blob)), - repeat(seq('|', field('value', choice($._literal, $._extglob_blob)))), - ')', - repeat('\n'), - choice(field('body', alias($._statements, $.statements))), - ';;' - ), - - function_definition: $ => prec.right(seq( - field('name', $.word), - '(', ')', - field('body', choice($.compound_statement, $.subshell, $.command, $.while_statement, $.if_statement, $.for_statement, $._variable_assignments, repeat1($.file_redirect))), - )), - - compound_statement: $ => seq('{', $._terminated_statement, '}'), - subshell: $ => seq('(', $._statements, ')'), - - pipeline: $ => prec.right(seq( - $._statement_not_pipeline, - repeat1(seq('|', $._statement_not_pipeline)), - )), - - list: $ => prec.left(-1, seq( - field('cmd', $._statement), - field('op', alias(choice('&&', '||'), $.operator)), - field('cmd', $._statement), - )), - - // Commands - - negated_command: $ => seq( - '!', - choice( - prec(2, $.command), - prec(1, $.variable_assignment), - $.subshell, - ), - ), - - command: $ => prec.left(seq( - repeat(choice( - $.variable_assignment, - field('redirect', $.file_redirect), - )), - field('name', $.command_name), - choice( - repeat(choice( - field('arg', $._literal), - field('arg', alias($._bare_dollar, '$')), - )), - $.subshell, - ), - )), - - command_name: $ => $._literal, - - variable_assignment: $ => seq( - field('name', choice( - $.variable_name, - )), - '=', - field('value', choice( - $._literal, - $._empty_value, - alias($._comment_word, $.word), - )), - ), - - _variable_assignments: $ => seq($.variable_assignment, repeat1($.variable_assignment)), - - file_redirect: $ => prec.left(seq( - field('fd', optional($.file_descriptor)), - field('op', alias(choice('<', '>', '>>', '<&', '>&', '>|', '<>'), $.operator)), - field('dest', repeat1($._literal)), - )), - - heredoc_redirect: $ => seq( - field('fd', optional($.file_descriptor)), - field('op', alias(choice('<<', '<<-'), $.operator)), - $.heredoc_start, - optional(choice( - alias($._heredoc_pipeline, $.pipeline), - seq( - field('redirect', repeat1($.file_redirect)), - optional($._heredoc_expression), - ), - $._heredoc_expression, - $._heredoc_command, - )), - /\n/, - choice($._heredoc_body, $._simple_heredoc_body), - ), - - _heredoc_pipeline: $ => seq('|', $._statement,), - - _heredoc_expression: $ => seq( - field('op', alias(choice('||', '&&'), $.operator)), - field('right', $._statement), - ), - - _heredoc_command: $ => repeat1(field('arg', $._literal)), - - _heredoc_body: $ => seq( - $.heredoc_body, - $.heredoc_end, - ), - - heredoc_body: $ => seq( - $._heredoc_body_beginning, - repeat(choice( - $.expansion, - $.simple_expansion, - $.command_substitution, - $.heredoc_content, - )), - ), - - _simple_heredoc_body: $ => seq(alias($.simple_heredoc_body, $.heredoc_body), $.heredoc_end), - - // Literals - - _literal: $ => choice($.concatenation, $._primary_expression), - - _primary_expression: $ => choice( - $.word, - $.string, - $.raw_string, - $.number, - $.expansion, - $.simple_expansion, - $.command_substitution, - $.arithmetic_expansion, - ), - - arithmetic_expansion: $ => seq('$((', optional($._arithmetic_expression), '))'), - - _arithmetic_expression: $ => prec(1, choice( - $.arithmetic_literal, - $.arithmetic_unary_expression, - $.arithmetic_ternary_expression, - $.arithmetic_binary_expression, - $.arithmetic_postfix_expression, - $.arithmetic_parenthesized_expression, - $.command_substitution, - )), - - arithmetic_literal: $ => prec(1, choice( - $.number, - $.simple_expansion, - $.expansion, - $._simple_variable_name, - $.variable_name, - $.string, - )), - - arithmetic_binary_expression: $ => { - - /** @type {[RuleOrLiteral, number][]} */ - const table = [ - [choice('+=', '-=', '*=', '/=', '%=', '<<=', '>>=', '&=', '^=', '|='), PREC.UPDATE], - ['=', PREC.ASSIGN], - ['||', PREC.LOGICAL_OR], - ['&&', PREC.LOGICAL_AND], - ['|', PREC.BITWISE_OR], - ['^', PREC.BITWISE_XOR], - ['&', PREC.BITWISE_AND], - [choice('==', '!='), PREC.EQUALITY], - [choice('<', '>', '<=', '>='), PREC.COMPARE], - [choice('<<', '>>'), PREC.SHIFT], - [choice('+', '-'), PREC.ADD], - [choice('*', '/', '%'), PREC.MULTIPLY], - ]; - - return choice(...table.map(([operator, precedence]) => - prec.left(precedence, seq( - field('left', $._arithmetic_expression), - field('op', alias(operator, $.operator)), - field('right', $._arithmetic_expression), - )) - )); - }, - - arithmetic_ternary_expression: $ => prec.left(PREC.TERNARY, seq( - field('condition', $._arithmetic_expression), - '?', - field('consequence', $._arithmetic_expression), - ':', - field('alternative', $._arithmetic_expression), - )), - - arithmetic_unary_expression: $ => choice( - prec(PREC.PREFIX, seq( - field('op', alias(tokenLiterals(1, '++', '--'), $.operator)), - $._arithmetic_expression, - )), - prec(PREC.UNARY, seq( - field('op', alias(tokenLiterals(1, '-', '+', '~'), $.operator)), - $._arithmetic_expression, - )), - prec.right(PREC.UNARY, seq( - field('op', alias('!', $.operator)), - $._arithmetic_expression, - )), - ), - - arithmetic_postfix_expression: $ => prec(PREC.POSTFIX, seq( - $._arithmetic_expression, - field('op', alias(choice('++', '--'), $.operator)), - )), - - arithmetic_parenthesized_expression: $ => seq('(', $._arithmetic_expression, ')'), - - concatenation: $ => prec(-1, seq( - $._primary_expression, - repeat1(seq( - choice($._concat, alias(/`\s*`/, '``')), - choice( - $._primary_expression, - alias($._comment_word, $.word), - alias($._bare_dollar, $.word), - alias(/`\s*`/, '``') - ), - )), - optional(seq($._concat, alias('$', $.word))), - )), - - string: $ => seq( - '"', - repeat(seq( - choice( - seq(optional('$'), $.string_content), - $.expansion, - $.simple_expansion, - $.command_substitution, - $.arithmetic_expansion, - ), - optional($._concat), - )), - optional(alias('$', $.string_content)), - '"', - ), - - string_content: _ => token(prec(-1, /([^"`$\\\r\n]|\\(.|\r?\n))+/)), - - raw_string: _ => /'[^']*'/, - - number: _ => /[0-9]+/, - - simple_expansion: $ => seq( - '$', - choice( - $._simple_variable_name, - $._multiline_variable_name, - $._special_variable_name, - $.variable_name, - alias('!', $.special_variable_name), - alias('#', $.special_variable_name), - ), - ), - - expansion: $ => seq( - '${', - optional($._expansion_body), - '}', - ), - _expansion_body: $ => seq( - field('name', choice($.variable_name, $._simple_variable_name, $._special_variable_name)), - field('op', optional(choice($.expansion_expression, $.expansion_regex))), - ), - - - expansion_expression: $ => prec(1, seq( - field('op', alias(immediateLiterals(':-', '-', ':=', '=', ':?', '?', ':+', '+'), $.operator)), - optional(seq( - choice( - alias($._concatenation_in_expansion, $.concatenation), - $.word, - $.expansion, - $.string, - $.raw_string, - alias($._expansion_word, $.word), - ), - )), - )), - - expansion_regex: $ => seq( - field('op', alias(choice('#', $._immediate_double_hash, '%', '%%'), $.operator)), - repeat(choice( - $.regex, - alias(')', $.regex), - $.string, - $.raw_string, - alias(/\s+/, $.regex), - )), - ), - - - _concatenation_in_expansion: $ => prec(-2, seq( - choice( - $.word, - $.variable_name, - $.simple_expansion, - $.expansion, - $.string, - $.raw_string, - $.command_substitution, - alias($._expansion_word, $.word), - ), - repeat1(seq( - choice($._concat, alias(/`\s*`/, '``')), - choice( - $.word, - $.variable_name, - $.simple_expansion, - $.expansion, - $.string, - $.raw_string, - $.command_substitution, - alias($._expansion_word, $.word), - ), - )), - )), - - command_substitution: $ => choice( - seq('$(', $._statements, ')'), - seq('$(', field('redirect', $.file_redirect), ')'), - prec(1, seq('`', $._statements, '`')), - ), - - _extglob_blob: $ => choice( - $.extglob_pattern, - seq( - $.extglob_pattern, - choice($.string, $.expansion, $.command_substitution), - optional($.extglob_pattern), - ), - ), - - comment: _ => token(prec(-10, /#.*/)), - - _comment_word: _ => token(prec(-8, seq( - choice( - noneOf(...SPECIAL_CHARACTERS), - seq('\\', noneOf('\\s')), - ), - repeat(choice( - noneOf(...SPECIAL_CHARACTERS), - seq('\\', noneOf('\\s')), - '\\ ', - )), - ))), - - _simple_variable_name: $ => alias(/\w+/, $.variable_name), - _multiline_variable_name: $ => alias( - token(prec(-1, /(\w|\\\r?\n)+/)), - $.variable_name, - ), - - _special_variable_name: $ => alias(choice('*', '@', '?', '!', '#', '-', '$', '0', '_'), $.special_variable_name), - - word: _ => token(seq( - choice( - noneOf('#', ...SPECIAL_CHARACTERS), - seq('\\', noneOf('\\s')), - ), - repeat(choice( - noneOf(...SPECIAL_CHARACTERS), - seq('\\', noneOf('\\s')), - '\\ ', - )), - )), - terminator: _ => choice(';', ';;', /\n/, '&'), - }, + name: 'sh', + + conflicts: $ => [ + [$.command, $._variable_assignments], + [$.redirected_statement, $.command], + [$.redirected_statement, $.command_substitution], + [$.function_definition, $.command_name], + [$._expansion_body, $._expansion_regex], + [$.pipeline], + ], + + inline: $ => [ + $._statement, + $._literal, + $._terminated_statement, + $._primary_expression, + $._simple_variable_name, + $._multiline_variable_name, + $._special_variable_name, + $._statement_not_subshell, + ], + + externals: $ => [ + $.heredoc_start, + $.simple_heredoc_body, + $._heredoc_body_beginning, + $.heredoc_content, + $.heredoc_end, + $.file_descriptor, + $._empty_value, + $._concat, + $.variable_name, + $.regex, + $._expansion_word, + $.extglob_pattern, + $._bare_dollar, + $._immediate_double_hash, + '<<', + '<<-', + /\n/, + '(', + 'esac', + $.__error_recovery, + ], + + extras: $ => [ + $.comment, + /\s/, + /\\\r?\n/, + /\\( |\t|\v|\f)/, + ], + + // supertypes: $ => [ + // $._statement, + // $._primary_expression, + // ], + + word: $ => $.word, + + rules: { + program: $ => optional($._statements), + + _statements: $ => prec(1, seq( + repeat(seq( + field('stmt', $._statement), + field('term', $.terminator), + )), + field('stmt', $._statement), + field('term', optional($.terminator)), + )), + + _terminated_statement: $ => repeat1(seq( + field('stmt', $._statement), + field('term', $.terminator) + )), + + // Statements + + _statement: $ => choice( + $._statement_not_subshell, + $.subshell, + ), + + _statement_not_subshell: $ => choice( + $.case_statement, + $.command, + $.compound_statement, + $.for_statement, + $.function_definition, + $.if_statement, + $.list, + $.negated_command, + $.pipeline, + $.redirected_statement, + $.variable_assignment, + $._variable_assignments, + $.while_statement, + ), + + _statement_not_pipeline: $ => prec(1, choice( + $.case_statement, + $.command, + $.compound_statement, + $.for_statement, + $.function_definition, + $.if_statement, + $.list, + $.negated_command, + $.redirected_statement, + $.subshell, + $.variable_assignment, + $._variable_assignments, + $.while_statement, + )), + + redirected_statement: $ => prec.dynamic(-1, prec.right(-1, choice( + seq( + field('body', $._statement), + field('redr', repeat1(choice($.file_redirect, $.heredoc_redirect))), + ), + field('redr', repeat1($.file_redirect)), + ))), + + for_statement: $ => seq( + 'for', + field('var', $._simple_variable_name), + optional(seq( + 'in', + field('value', repeat1($._literal)), + )), + $.terminator, + field('body', $.do_group), + ), + + while_statement: $ => seq( + choice('while', 'until'), + field('cond', $._terminated_statement), + field('body', $.do_group), + ), + + do_group: $ => seq( + 'do', + optional($._terminated_statement), + 'done', + ), + + if_statement: $ => seq( + 'if', + field('cond', alias($._terminated_statement, $.statements)), + 'then', + field('body', alias(optional($._terminated_statement), $.statements)), + field('elif', repeat($.elif_clause)), + field('else', optional($.else_clause)), + 'fi', + ), + + elif_clause: $ => seq( + 'elif', + field('cond', alias($._terminated_statement, $.statements)), + 'then', + field('body', alias(optional($._terminated_statement), $.statements)), + ), + + else_clause: $ => seq( + 'else', + field('body', alias(optional($._terminated_statement), $.statements)), + ), + + case_statement: $ => seq( + 'case', + field('value', $._literal), + optional($.terminator), + 'in', + optional($.terminator), + optional(seq( + repeat(field('cases', $.case_item)), + field('cases', alias($._case_item_last, $.case_item)) + )), + 'esac', + ), + + _case_item_last: $ => seq( + optional('('), + field('value', choice($._literal, $._extglob_blob)), + repeat(seq('|', field('value', choice($._literal, $._extglob_blob)))), + ')', + repeat('\n'), + choice(field('body', alias($._statements, $.statements)),), + optional(';;') + ), + + case_item: $ => seq( + optional('('), + field('value', choice($._literal, $._extglob_blob)), + repeat(seq('|', field('value', choice($._literal, $._extglob_blob)))), + ')', + repeat('\n'), + choice(field('body', alias($._statements, $.statements))), + ';;' + ), + + function_definition: $ => prec.right(seq( + field('name', $.word), + '(', ')', + field('body', choice($.compound_statement, $.subshell, $.command, $.while_statement, $.if_statement, $.for_statement, $._variable_assignments, repeat1($.file_redirect))), + )), + + compound_statement: $ => seq('{', $._terminated_statement, '}'), + subshell: $ => seq('(', $._statements, ')'), + + pipeline: $ => prec.right(seq( + $._statement_not_pipeline, + repeat1(seq('|', $._statement_not_pipeline)), + )), + + list: $ => prec.left(-1, seq( + field('cmd', $._statement), + field('op', alias(choice('&&', '||'), $.operator)), + field('cmd', $._statement), + )), + + // Commands + + negated_command: $ => seq( + '!', + choice( + prec(2, $.command), + prec(1, $.variable_assignment), + $.subshell, + ), + ), + + command: $ => prec.left(seq( + repeat(choice( + $.variable_assignment, + field('redr', $.file_redirect), + )), + field('name', $.command_name), + choice( + repeat(choice( + field('arg', $._literal), + field('arg', alias($._bare_dollar, '$')), + )), + $.subshell, + ), + )), + + command_name: $ => $._literal, + + variable_assignment: $ => seq( + field('name', choice( + $.variable_name, + )), + '=', + field('value', choice( + $._literal, + $._empty_value, + alias($._comment_word, $.word), + )), + ), + + _variable_assignments: $ => seq($.variable_assignment, repeat1($.variable_assignment)), + + file_redirect: $ => prec.left(seq( + field('fd', optional($.file_descriptor)), + field('op', alias(choice('<', '>', '>>', '<&', '>&', '>|', '<>'), $.operator)), + field('dest', repeat1($._literal)), + )), + + heredoc_redirect: $ => seq( + field('fd', optional($.file_descriptor)), + field('op', alias(choice('<<', '<<-'), $.operator)), + $.heredoc_start, + optional(choice( + alias($._heredoc_pipeline, $.pipeline), + seq( + field('redr', repeat1($.file_redirect)), + optional($._heredoc_expression), + ), + $._heredoc_expression, + $._heredoc_command, + )), + /\n/, + choice($._heredoc_body, $._simple_heredoc_body), + ), + + _heredoc_pipeline: $ => seq('|', $._statement,), + + _heredoc_expression: $ => seq( + field('op', alias(choice('||', '&&'), $.operator)), + field('rhs', $._statement), + ), + + _heredoc_command: $ => repeat1(field('arg', $._literal)), + + _heredoc_body: $ => seq( + $.heredoc_body, + $.heredoc_end, + ), + + heredoc_body: $ => seq( + $._heredoc_body_beginning, + repeat(choice( + $.expansion, + $.simple_expansion, + $.command_substitution, + $.heredoc_content, + )), + ), + + _simple_heredoc_body: $ => seq(alias($.simple_heredoc_body, $.heredoc_body), $.heredoc_end), + + // Literals + + _literal: $ => choice($.concatenation, $._primary_expression), + + _primary_expression: $ => choice( + $.word, + $.string, + $.raw_string, + $.number, + $.expansion, + $.simple_expansion, + $.command_substitution, + $.arithmetic_expansion, + ), + + arithmetic_expansion: $ => seq('$((', optional($._arithmetic_expression), '))'), + + _arithmetic_expression: $ => prec(1, choice( + $.arithmetic_literal, + $.arithmetic_unary_expression, + $.arithmetic_ternary_expression, + $.arithmetic_binary_expression, + $.arithmetic_postfix_expression, + $.arithmetic_parenthesized_expression, + $.command_substitution, + )), + + arithmetic_literal: $ => prec(1, choice( + $.number, + $.simple_expansion, + $.expansion, + $._simple_variable_name, + $.variable_name, + $.string, + )), + + arithmetic_binary_expression: $ => { + + /** @type {[RuleOrLiteral, number][]} */ + const table = [ + [choice('+=', '-=', '*=', '/=', '%=', '<<=', '>>=', '&=', '^=', '|='), PREC.UPDATE], + ['=', PREC.ASSIGN], + ['||', PREC.LOGICAL_OR], + ['&&', PREC.LOGICAL_AND], + ['|', PREC.BITWISE_OR], + ['^', PREC.BITWISE_XOR], + ['&', PREC.BITWISE_AND], + [choice('==', '!='), PREC.EQUALITY], + [choice('<', '>', '<=', '>='), PREC.COMPARE], + [choice('<<', '>>'), PREC.SHIFT], + [choice('+', '-'), PREC.ADD], + [choice('*', '/', '%'), PREC.MULTIPLY], + ]; + + return choice(...table.map(([operator, precedence]) => + prec.left(precedence, seq( + field('lhs', $._arithmetic_expression), + field('op', alias(operator, $.operator)), + field('rhs', $._arithmetic_expression), + )) + )); + }, + + arithmetic_ternary_expression: $ => prec.left(PREC.TERNARY, seq( + field('cond', $._arithmetic_expression), + '?', + field('then', $._arithmetic_expression), + ':', + field('else', $._arithmetic_expression), + )), + + arithmetic_unary_expression: $ => choice( + prec(PREC.PREFIX, seq( + field('op', alias(tokenLiterals(1, '++', '--'), $.operator)), + $._arithmetic_expression, + )), + prec(PREC.UNARY, seq( + field('op', alias(tokenLiterals(1, '-', '+', '~'), $.operator)), + $._arithmetic_expression, + )), + prec.right(PREC.UNARY, seq( + field('op', alias('!', $.operator)), + $._arithmetic_expression, + )), + ), + + arithmetic_postfix_expression: $ => prec(PREC.POSTFIX, seq( + $._arithmetic_expression, + field('op', alias(choice('++', '--'), $.operator)), + )), + + arithmetic_parenthesized_expression: $ => seq('(', $._arithmetic_expression, ')'), + + concatenation: $ => prec(-1, seq( + $._primary_expression, + repeat1(seq( + choice($._concat, alias(/`\s*`/, '``')), + choice( + $._primary_expression, + alias($._comment_word, $.word), + alias($._bare_dollar, $.word), + alias(/`\s*`/, '``') + ), + )), + optional(seq($._concat, alias('$', $.word))), + )), + + string: $ => seq( + '"', + repeat(seq( + choice( + seq(optional('$'), $.string_content), + $.expansion, + $.simple_expansion, + $.command_substitution, + $.arithmetic_expansion, + ), + optional($._concat), + )), + optional(alias('$', $.string_content)), + '"', + ), + + string_content: _ => token(prec(-1, /([^"`$\\\r\n]|\\(.|\r?\n))+/)), + + raw_string: _ => /'[^']*'/, + + number: _ => /[0-9]+/, + + simple_expansion: $ => seq( + '$', + choice( + $._simple_variable_name, + $._multiline_variable_name, + $._special_variable_name, + $.variable_name, + alias('!', $.special_variable_name), + alias('#', $.special_variable_name), + ), + ), + + expansion: $ => seq( + '${', + optional($._expansion_body), + '}', + ), + + _expansion_body: $ => seq( + field('len', optional(alias('#', $.operator))), + field('name', choice($.variable_name, $._simple_variable_name, $._special_variable_name)), + optional(choice($._expansion_expression, $._expansion_regex)), + ), + + + _expansion_expression: $ => prec(1, seq( + field('op', alias(immediateLiterals(':-', '-', ':=', '=', ':?', '?', ':+', '+'), $.operator)), + field('args', optional(choice( + alias($._concatenation_in_expansion, $.concatenation), + //alias($._expansion_word, $.word1), + alias(prec(10000000, $._word_no_brace), $.word2), + $.expansion, + $.raw_string, + $.string, + ))), + )), + + _expansion_regex: $ => seq( + field('op', alias(choice('#', $._immediate_double_hash, '%', '%%'), $.operator)), + field('args', repeat(choice( + $.raw_string, + $.regex, + $.string, + alias(')', $.regex), + alias(/\s+/, $.regex), + ))), + ), + + + _concatenation_in_expansion: $ => prec(-2, seq( + choice( + alias($._word_no_brace, $.word), + alias($._expansion_word, $.word), + $.variable_name, + $.simple_expansion, + $.expansion, + $.string, + $.raw_string, + $.command_substitution, + ), + repeat1(seq( + choice($._concat, alias(/`\s*`/, '``')), + choice( + alias($._word_no_brace, $.word), + alias($._expansion_word, $.word), + $.variable_name, + $.simple_expansion, + $.expansion, + $.string, + $.raw_string, + $.command_substitution, + ), + )), + )), + + command_substitution: $ => choice( + seq('$(', $._statements, ')'), + seq('$(', field('redr', $.file_redirect), ')'), + prec(1, seq('`', $._statements, '`')), + ), + + _extglob_blob: $ => choice( + $.extglob_pattern, + seq( + $.extglob_pattern, + choice($.string, $.expansion, $.command_substitution), + optional($.extglob_pattern), + ), + ), + + comment: _ => token(prec(-10, /#.*/)), + + _comment_word: _ => token(prec(-8, seq( + choice( + noneOf(...SPECIAL_CHARACTERS), + seq('\\', noneOf('\\s')), + ), + repeat(choice( + noneOf(...SPECIAL_CHARACTERS), + seq('\\', noneOf('\\s')), + '\\ ', + )), + ))), + + _simple_variable_name: $ => alias(/\w+/, $.variable_name), + _multiline_variable_name: $ => alias( + token(prec(-1, /(\w|\\\r?\n)+/)), + $.variable_name, + ), + + _special_variable_name: $ => alias(choice('*', '@', '?', '!', '#', '-', '$', '0', '_'), $.special_variable_name), + + word: _ => token(seq( + choice( + noneOf('#', ...SPECIAL_CHARACTERS), + seq('\\', noneOf('\\s')), + ), + repeat(choice( + noneOf(...SPECIAL_CHARACTERS), + seq('\\', noneOf('\\s')), + '\\ ', + )), + )), + + + _word_no_brace: _ => prec(2, token(seq( + choice( + noneOf('#', '{', '}', ...SPECIAL_CHARACTERS), + seq('\\', noneOf('\\s')), + ), + repeat(choice( + noneOf('{', '}', ...SPECIAL_CHARACTERS), + seq('\\', noneOf('\\s')), + '\\ ', + )), + ))), + terminator: _ => choice(';', ';;', /\n/, '&'), + }, }); /** @@ -613,8 +626,8 @@ module.exports = grammar({ * */ function noneOf(...characters) { - const negatedString = characters.map(c => c == '\\' ? '\\\\' : c).join(''); - return new RegExp('[^' + negatedString + ']'); + const negatedString = characters.map(c => c == '\\' ? '\\\\' : c).join(''); + return new RegExp('[^' + negatedString + ']'); } /** @@ -626,7 +639,7 @@ function noneOf(...characters) { * */ function commaSep(rule) { - return optional(commaSep1(rule)); + return optional(commaSep1(rule)); } /** @@ -638,7 +651,7 @@ function commaSep(rule) { * */ function commaSep1(rule) { - return seq(rule, repeat(seq(',', rule))); + return seq(rule, repeat(seq(',', rule))); } /** @@ -650,7 +663,7 @@ function commaSep1(rule) { * @return {ChoiceRule} */ function immediateLiterals(...literals) { - return choice(...literals.map(l => token.immediate(l))); + return choice(...literals.map(l => token.immediate(l))); } /** @@ -664,5 +677,5 @@ function immediateLiterals(...literals) { * @return {ChoiceRule} */ function tokenLiterals(precedence, ...literals) { - return choice(...literals.map(l => token(prec(precedence, l)))); + return choice(...literals.map(l => token(prec(precedence, l)))); } diff --git a/tree-sitter-sh/src/scanner.c b/tree-sitter-sh/src/scanner.c new file mode 100644 index 00000000..bd0db274 --- /dev/null +++ b/tree-sitter-sh/src/scanner.c @@ -0,0 +1,1200 @@ +#include "tree_sitter/alloc.h" +#include "tree_sitter/array.h" +#include "tree_sitter/parser.h" + +#include +#include +#include +#include + +enum TokenType +{ + HEREDOC_START, + SIMPLE_HEREDOC_BODY, + HEREDOC_BODY_BEGINNING, + HEREDOC_CONTENT, + HEREDOC_END, + FILE_DESCRIPTOR, + EMPTY_VALUE, + CONCAT, + VARIABLE_NAME, + REGEX, + EXPANSION_WORD, + EXTGLOB_PATTERN, + BARE_DOLLAR, + IMMEDIATE_DOUBLE_HASH, + HEREDOC_ARROW, + HEREDOC_ARROW_DASH, + NEWLINE, + OPENING_PAREN, + ESAC, + ERROR_RECOVERY, +}; + +typedef Array(char) String; + +typedef struct Heredoc +{ + bool is_raw; + bool started; + bool allows_indent; + String delimiter; + String current_leading_word; +} Heredoc; + +#define heredoc_new() \ + { \ + .is_raw = false, \ + .started = false, \ + .allows_indent = false, \ + .delimiter = array_new(), \ + .current_leading_word = array_new(), \ + }; + +typedef struct Scanner +{ + uint8_t last_glob_paren_depth; + bool ext_was_in_double_quote; + bool ext_saw_outside_quote; + Array(Heredoc) heredocs; +} Scanner; + +static inline void advance(TSLexer *lexer) +{ + lexer->advance(lexer, false); +} + +static inline void skip(TSLexer *lexer) +{ + lexer->advance(lexer, true); +} + +static inline bool in_error_recovery(const bool *valid_symbols) +{ + return valid_symbols[ERROR_RECOVERY]; +} + +static inline void reset_string(String *string) +{ + if (string->size > 0) + { + memset(string->contents, 0, string->size); + array_clear(string); + } +} + +static inline void reset_heredoc(Heredoc *heredoc) +{ + heredoc->is_raw = false; + heredoc->started = false; + heredoc->allows_indent = false; + reset_string(&heredoc->delimiter); +} + +static inline void reset(Scanner *scanner) +{ + for (uint32_t i = 0; i < scanner->heredocs.size; i++) + { + reset_heredoc(array_get(&scanner->heredocs, i)); + } +} + +static unsigned serialize(Scanner *scanner, char *buffer) +{ + uint32_t size = 0; + + buffer[size++] = (char)scanner->last_glob_paren_depth; + buffer[size++] = (char)scanner->ext_was_in_double_quote; + buffer[size++] = (char)scanner->ext_saw_outside_quote; + buffer[size++] = (char)scanner->heredocs.size; + + for (uint32_t i = 0; i < scanner->heredocs.size; i++) + { + Heredoc *heredoc = array_get(&scanner->heredocs, i); + if (heredoc->delimiter.size + 3 + size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) + { + return 0; + } + + buffer[size++] = (char)heredoc->is_raw; + buffer[size++] = (char)heredoc->started; + buffer[size++] = (char)heredoc->allows_indent; + + memcpy(&buffer[size], &heredoc->delimiter.size, sizeof(uint32_t)); + size += sizeof(uint32_t); + if (heredoc->delimiter.size > 0) + { + memcpy(&buffer[size], heredoc->delimiter.contents, heredoc->delimiter.size); + size += heredoc->delimiter.size; + } + } + return size; +} + +static void deserialize(Scanner *scanner, const char *buffer, unsigned length) +{ + if (length == 0) + { + reset(scanner); + } + else + { + uint32_t size = 0; + scanner->last_glob_paren_depth = buffer[size++]; + scanner->ext_was_in_double_quote = buffer[size++]; + scanner->ext_saw_outside_quote = buffer[size++]; + uint32_t heredoc_count = (unsigned char)buffer[size++]; + for (uint32_t i = 0; i < heredoc_count; i++) + { + Heredoc *heredoc = NULL; + if (i < scanner->heredocs.size) + { + heredoc = array_get(&scanner->heredocs, i); + } + else + { + Heredoc new_heredoc = heredoc_new(); + array_push(&scanner->heredocs, new_heredoc); + heredoc = array_back(&scanner->heredocs); + } + + heredoc->is_raw = buffer[size++]; + heredoc->started = buffer[size++]; + heredoc->allows_indent = buffer[size++]; + + memcpy(&heredoc->delimiter.size, &buffer[size], sizeof(uint32_t)); + size += sizeof(uint32_t); + array_reserve(&heredoc->delimiter, heredoc->delimiter.size); + + if (heredoc->delimiter.size > 0) + { + memcpy(heredoc->delimiter.contents, &buffer[size], heredoc->delimiter.size); + size += heredoc->delimiter.size; + } + } + assert(size == length); + } +} + +/** + * Consume a "word" in POSIX parlance, and returns it unquoted. + * + * This is an approximate implementation that doesn't deal with any + * POSIX-mandated substitution, and assumes the default value for + * IFS. + */ +static bool advance_word(TSLexer *lexer, String *unquoted_word) +{ + bool empty = true; + int32_t quote = 0; + + if (lexer->lookahead == '\'' || lexer->lookahead == '"') + { + quote = lexer->lookahead; + advance(lexer); + } + + while (lexer->lookahead && + !(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n' : iswspace(lexer->lookahead))) + { + if (lexer->lookahead == '\\') + { + advance(lexer); + if (!lexer->lookahead) + return false; + } + empty = false; + array_push(unquoted_word, lexer->lookahead); + advance(lexer); + } + array_push(unquoted_word, '\0'); + + if (quote && lexer->lookahead == quote) + advance(lexer); + + return !empty; +} + +static inline bool scan_bare_dollar(TSLexer *lexer) +{ + while (iswspace(lexer->lookahead) && lexer->lookahead != '\n' && !lexer->eof(lexer)) + skip(lexer); + + if (lexer->lookahead == '$') + { + advance(lexer); + lexer->result_symbol = BARE_DOLLAR; + lexer->mark_end(lexer); + return (iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == '\"'); + } + + return false; +} + +static bool scan_heredoc_start(Heredoc *heredoc, TSLexer *lexer) +{ + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + + lexer->result_symbol = HEREDOC_START; + heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' || lexer->lookahead == '\\'; + + bool found_delimiter = advance_word(lexer, &heredoc->delimiter); + if (!found_delimiter) + { + reset_string(&heredoc->delimiter); + return false; + } + return found_delimiter; +} + +static bool scan_heredoc_end_identifier(Heredoc *heredoc, TSLexer *lexer) +{ + reset_string(&heredoc->current_leading_word); + // Scan the first 'n' characters on this line, to see if they match the + // heredoc delimiter + int32_t size = 0; + if (heredoc->delimiter.size > 0) + { + while (lexer->lookahead != '\0' && lexer->lookahead != '\n' && (int32_t)*array_get(&heredoc->delimiter, size) == lexer->lookahead && + heredoc->current_leading_word.size < heredoc->delimiter.size) + { + array_push(&heredoc->current_leading_word, lexer->lookahead); + advance(lexer); + size++; + } + } + array_push(&heredoc->current_leading_word, '\0'); + return heredoc->delimiter.size == 0 ? false : strcmp(heredoc->current_leading_word.contents, heredoc->delimiter.contents) == 0; +} + +static bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer, enum TokenType middle_type, enum TokenType end_type) +{ + bool did_advance = false; + Heredoc *heredoc = array_back(&scanner->heredocs); + + for (;;) + { + switch (lexer->lookahead) + { + case '\0': { + if (lexer->eof(lexer) && did_advance) + { + reset_heredoc(heredoc); + lexer->result_symbol = end_type; + return true; + } + return false; + } + + case '\\': { + did_advance = true; + advance(lexer); + advance(lexer); + break; + } + + case '$': { + if (heredoc->is_raw) + { + did_advance = true; + advance(lexer); + break; + } + if (did_advance) + { + lexer->mark_end(lexer); + lexer->result_symbol = middle_type; + heredoc->started = true; + advance(lexer); + if (iswalpha(lexer->lookahead) || lexer->lookahead == '{' || lexer->lookahead == '(') + { + return true; + } + break; + } + if (middle_type == HEREDOC_BODY_BEGINNING && lexer->get_column(lexer) == 0) + { + lexer->result_symbol = middle_type; + heredoc->started = true; + return true; + } + return false; + } + + case '\n': { + if (!did_advance) + { + skip(lexer); + } + else + { + advance(lexer); + } + did_advance = true; + if (heredoc->allows_indent) + { + while (iswspace(lexer->lookahead)) + { + advance(lexer); + } + } + lexer->result_symbol = heredoc->started ? middle_type : end_type; + lexer->mark_end(lexer); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + if (lexer->result_symbol == HEREDOC_END) + { + (void)array_pop(&scanner->heredocs); + } + return true; + } + break; + } + + default: { + if (lexer->get_column(lexer) == 0) + { + // an alternative is to check the starting column of the + // heredoc body and track that statefully + while (iswspace(lexer->lookahead)) + { + if (did_advance) + { + advance(lexer); + } + else + { + skip(lexer); + } + } + if (end_type != SIMPLE_HEREDOC_BODY) + { + lexer->result_symbol = middle_type; + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + return true; + } + } + if (end_type == SIMPLE_HEREDOC_BODY) + { + lexer->result_symbol = end_type; + lexer->mark_end(lexer); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + return true; + } + } + } + did_advance = true; + advance(lexer); + break; + } + } + } +} + +static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) +{ + if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols)) + { + if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' || lexer->lookahead == '<' || + lexer->lookahead == ')' || lexer->lookahead == '(' || lexer->lookahead == ';' || lexer->lookahead == '&' || + lexer->lookahead == '|' || lexer->lookahead == '{' || lexer->lookahead == '}')) + { + lexer->result_symbol = CONCAT; + // So for a`b`, we want to return a concat. We check if the + // 2nd backtick has whitespace after it, and if it does we + // return concat. + if (lexer->lookahead == '`') + { + lexer->mark_end(lexer); + advance(lexer); + while (lexer->lookahead != '`' && !lexer->eof(lexer)) + { + advance(lexer); + } + if (lexer->eof(lexer)) + { + return false; + } + if (lexer->lookahead == '`') + { + advance(lexer); + } + return iswspace(lexer->lookahead) || lexer->eof(lexer); + } + // strings w/ expansions that contains escaped quotes or + // backslashes need this to return a concat + if (lexer->lookahead == '\\') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '"' || lexer->lookahead == '\'' || lexer->lookahead == '\\') + { + return true; + } + if (lexer->eof(lexer)) + { + return false; + } + } + else + { + return true; + } + } + } + + if (valid_symbols[IMMEDIATE_DOUBLE_HASH] && !in_error_recovery(valid_symbols)) + { + // advance two # and ensure not } after + if (lexer->lookahead == '#') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '#') + { + advance(lexer); + if (lexer->lookahead != '}') + { + lexer->result_symbol = IMMEDIATE_DOUBLE_HASH; + lexer->mark_end(lexer); + return true; + } + } + } + } + + if (valid_symbols[EMPTY_VALUE]) + { + if (iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == ';' || lexer->lookahead == '&') + { + lexer->result_symbol = EMPTY_VALUE; + return true; + } + } + + if ((valid_symbols[HEREDOC_BODY_BEGINNING] || valid_symbols[SIMPLE_HEREDOC_BODY]) && scanner->heredocs.size > 0 && + !array_back(&scanner->heredocs)->started && !in_error_recovery(valid_symbols)) + { + return scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY); + } + + if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0) + { + Heredoc *heredoc = array_back(&scanner->heredocs); + if (scan_heredoc_end_identifier(heredoc, lexer)) + { + array_delete(&heredoc->current_leading_word); + array_delete(&heredoc->delimiter); + (void)array_pop(&scanner->heredocs); + lexer->result_symbol = HEREDOC_END; + return true; + } + } + + if (valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 && array_back(&scanner->heredocs)->started && + !in_error_recovery(valid_symbols)) + { + return scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, HEREDOC_END); + } + + if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) && scanner->heredocs.size > 0) + { + return scan_heredoc_start(array_back(&scanner->heredocs), lexer); + } + + if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) && + !in_error_recovery(valid_symbols)) + { + for (;;) + { + if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' || + (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) && + !valid_symbols[EXPANSION_WORD]) + { + skip(lexer); + } + else if (lexer->lookahead == '\\') + { + skip(lexer); + + if (lexer->eof(lexer)) + { + lexer->mark_end(lexer); + lexer->result_symbol = VARIABLE_NAME; + return true; + } + + if (lexer->lookahead == '\r') + { + skip(lexer); + } + if (lexer->lookahead == '\n') + { + skip(lexer); + } + else + { + if (lexer->lookahead == '\\' && valid_symbols[EXPANSION_WORD]) + { + goto expansion_word; + } + return false; + } + } + else + { + break; + } + } + + // no '*', '@', '?', '-', '$', '0', '_' + if (!valid_symbols[EXPANSION_WORD] && (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' || + lexer->lookahead == '-' || lexer->lookahead == '0' || lexer->lookahead == '_')) + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' || lexer->lookahead == '-' || + lexer->lookahead == '%' || lexer->lookahead == '#' || lexer->lookahead == '/') + { + return false; + } + if (valid_symbols[EXTGLOB_PATTERN] && iswspace(lexer->lookahead)) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') + { + advance(lexer); + if (lexer->lookahead == '<') + { + advance(lexer); + if (lexer->lookahead == '-') + { + advance(lexer); + Heredoc heredoc = heredoc_new(); + heredoc.allows_indent = true; + array_push(&scanner->heredocs, heredoc); + lexer->result_symbol = HEREDOC_ARROW_DASH; + } + // else if (lexer->lookahead == '<' || lexer->lookahead == '=') + // { + // return false; + // } + else + { + Heredoc heredoc = heredoc_new(); + array_push(&scanner->heredocs, heredoc); + lexer->result_symbol = HEREDOC_ARROW; + } + return true; + } + return false; + } + + bool is_number = true; + if (iswdigit(lexer->lookahead)) + { + advance(lexer); + } + else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') + { + is_number = false; + advance(lexer); + } + else + { + if (lexer->lookahead == '{') + { + goto brace_start; + } + if (valid_symbols[EXPANSION_WORD]) + { + goto expansion_word; + } + if (valid_symbols[EXTGLOB_PATTERN]) + { + goto extglob_pattern; + } + return false; + } + + for (;;) + { + if (iswdigit(lexer->lookahead)) + { + advance(lexer); + } + else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') + { + is_number = false; + advance(lexer); + } + else + { + break; + } + } + + if (is_number && valid_symbols[FILE_DESCRIPTOR] && (lexer->lookahead == '>' || lexer->lookahead == '<')) + { + lexer->result_symbol = FILE_DESCRIPTOR; + return true; + } + + if (valid_symbols[VARIABLE_NAME]) + { + if (lexer->lookahead == '+') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '=' || lexer->lookahead == ':') + { + lexer->result_symbol = VARIABLE_NAME; + return true; + } + return false; + } + if (lexer->lookahead == '/') + { + return false; + } + if (lexer->lookahead == '=' || lexer->lookahead == '[' || + (lexer->lookahead == ':' && + !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable + // names for function words, only handling : for now? #235 + lexer->lookahead == '%' || + (lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' || (lexer->lookahead == '-')) + { + lexer->mark_end(lexer); + lexer->result_symbol = VARIABLE_NAME; + return true; + } + + if (lexer->lookahead == '?') + { + lexer->mark_end(lexer); + advance(lexer); + lexer->result_symbol = VARIABLE_NAME; + return iswalpha(lexer->lookahead); + } + } + + return false; + } + + if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && scan_bare_dollar(lexer)) + { + return true; + } + + if ((valid_symbols[REGEX]) && !in_error_recovery(valid_symbols)) + { + if (valid_symbols[REGEX]) + { + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + } + + if ((lexer->lookahead != '"' && lexer->lookahead != '\'') || ((lexer->lookahead == '$' || lexer->lookahead == '\'')) || + (lexer->lookahead == '\'')) + { + typedef struct + { + bool done; + bool advanced_once; + bool found_non_alnumdollarunderdash; + bool last_was_escape; + bool in_single_quote; + uint32_t paren_depth; + uint32_t bracket_depth; + uint32_t brace_depth; + } State; + + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '(') + { + return false; + } + } + + lexer->mark_end(lexer); + + State state = {false, false, false, false, false, 0, 0, 0}; + while (!state.done) + { + if (state.in_single_quote) + { + if (lexer->lookahead == '\'') + { + state.in_single_quote = false; + advance(lexer); + lexer->mark_end(lexer); + } + } + switch (lexer->lookahead) + { + case '\\': + state.last_was_escape = true; + break; + case '\0': + return false; + case '(': + state.paren_depth++; + state.last_was_escape = false; + break; + case '[': + state.bracket_depth++; + state.last_was_escape = false; + break; + case '{': + if (!state.last_was_escape) + state.brace_depth++; + state.last_was_escape = false; + break; + case ')': + if (state.paren_depth == 0) + state.done = true; + state.paren_depth--; + state.last_was_escape = false; + break; + case ']': + if (state.bracket_depth == 0) + state.done = true; + state.bracket_depth--; + state.last_was_escape = false; + break; + case '}': + if (state.brace_depth == 0) + state.done = true; + state.brace_depth--; + state.last_was_escape = false; + break; + case '\'': + // Enter or exit a single-quoted string. + state.in_single_quote = !state.in_single_quote; + advance(lexer); + state.advanced_once = true; + state.last_was_escape = false; + continue; + default: + state.last_was_escape = false; + break; + } + + if (!state.done) + { + if (valid_symbols[REGEX]) + { + bool was_space = !state.in_single_quote && iswspace(lexer->lookahead); + advance(lexer); + state.advanced_once = true; + if (!was_space || state.paren_depth > 0) + { + lexer->mark_end(lexer); + } + } + } + } + + lexer->result_symbol = REGEX; + if (valid_symbols[REGEX] && !state.advanced_once) + { + return false; + } + return true; + } + } + +extglob_pattern: + if (valid_symbols[EXTGLOB_PATTERN] && !in_error_recovery(valid_symbols)) + { + // first skip ws, then check for ? * + @ ! + while (iswspace(lexer->lookahead)) + { + skip(lexer); + } + + if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' || + lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' || + lexer->lookahead == '.' || lexer->lookahead == '[' || (iswalpha(lexer->lookahead))) + { + if (lexer->lookahead == '\\') + { + advance(lexer); + if ((iswspace(lexer->lookahead) || lexer->lookahead == '"') && lexer->lookahead != '\r' && lexer->lookahead != '\n') + { + advance(lexer); + } + else + { + return false; + } + } + + if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) + { + lexer->mark_end(lexer); + advance(lexer); + + if (iswspace(lexer->lookahead)) + { + return false; + } + } + + lexer->mark_end(lexer); + bool was_non_alpha = !iswalpha(lexer->lookahead); + if (lexer->lookahead != '[') + { + // no esac + if (lexer->lookahead == 'e') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == 's') + { + advance(lexer); + if (lexer->lookahead == 'a') + { + advance(lexer); + if (lexer->lookahead == 'c') + { + advance(lexer); + if (iswspace(lexer->lookahead)) + { + return false; + } + } + } + } + } + else + { + advance(lexer); + } + } + + // -\w is just a word, find something else special + if (lexer->lookahead == '-') + { + lexer->mark_end(lexer); + advance(lexer); + while (iswalnum(lexer->lookahead)) + { + advance(lexer); + } + + if (lexer->lookahead == ')' || lexer->lookahead == '\\' || lexer->lookahead == '.') + { + return false; + } + lexer->mark_end(lexer); + } + + // case item -) or *) + if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) + { + lexer->mark_end(lexer); + advance(lexer); + if (iswspace(lexer->lookahead)) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return was_non_alpha; + } + } + + if (iswspace(lexer->lookahead)) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return true; + } + + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(') + { + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (lexer->lookahead == '|') + { + lexer->mark_end(lexer); + advance(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + + if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' && lexer->lookahead != '[' && + lexer->lookahead != '?' && lexer->lookahead != '/' && lexer->lookahead != '\\' && lexer->lookahead != '_' && + lexer->lookahead != '*') + { + return false; + } + + typedef struct + { + bool done; + bool saw_non_alphadot; + uint32_t paren_depth; + uint32_t bracket_depth; + uint32_t brace_depth; + } State; + + State state = {false, was_non_alpha, scanner->last_glob_paren_depth, 0, 0}; + while (!state.done) + { + switch (lexer->lookahead) + { + case '\0': + return false; + case '(': + state.paren_depth++; + break; + case '[': + state.bracket_depth++; + break; + case '{': + state.brace_depth++; + break; + case ')': + if (state.paren_depth == 0) + { + state.done = true; + } + state.paren_depth--; + break; + case ']': + if (state.bracket_depth == 0) + { + state.done = true; + } + state.bracket_depth--; + break; + case '}': + if (state.brace_depth == 0) + { + state.done = true; + } + state.brace_depth--; + break; + } + + if (lexer->lookahead == '|') + { + lexer->mark_end(lexer); + advance(lexer); + if (state.paren_depth == 0 && state.bracket_depth == 0 && state.brace_depth == 0) + { + lexer->result_symbol = EXTGLOB_PATTERN; + return true; + } + } + + if (!state.done) + { + bool was_space = iswspace(lexer->lookahead); + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + if (lexer->lookahead == '(' || lexer->lookahead == '{') + { + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = state.paren_depth; + return state.saw_non_alphadot; + } + } + if (was_space) + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + if (lexer->lookahead == '"') + { + lexer->mark_end(lexer); + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + if (lexer->lookahead == '\\') + { + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + if (iswspace(lexer->lookahead) || lexer->lookahead == '"') + { + advance(lexer); + } + } + else + { + if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') + { + state.saw_non_alphadot = true; + } + advance(lexer); + } + if (!was_space) + { + lexer->mark_end(lexer); + } + } + } + + lexer->result_symbol = EXTGLOB_PATTERN; + scanner->last_glob_paren_depth = 0; + return state.saw_non_alphadot; + } + scanner->last_glob_paren_depth = 0; + + return false; + } + +expansion_word: + if (valid_symbols[EXPANSION_WORD]) + { + bool advanced_once = false; + bool advance_once_space = false; + for (;;) + { + if (lexer->lookahead == '\"') + return false; + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' || iswalnum(lexer->lookahead)) + { + lexer->result_symbol = EXPANSION_WORD; + return advanced_once; + } + advanced_once = true; + } + + if (lexer->lookahead == '}') + { + lexer->mark_end(lexer); + lexer->result_symbol = EXPANSION_WORD; + return advanced_once || advance_once_space; + } + + if (lexer->lookahead == '(' && !(advanced_once || advance_once_space)) + { + lexer->mark_end(lexer); + advance(lexer); + while (lexer->lookahead != ')' && !lexer->eof(lexer)) + { + // if we find a $( or ${ assume this is valid and is + // a garbage concatenation of some weird word + an + // expansion + // I wonder where this can fail + if (lexer->lookahead == '$') + { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' || iswalnum(lexer->lookahead)) + { + lexer->result_symbol = EXPANSION_WORD; + return advanced_once; + } + advanced_once = true; + } + else + { + advanced_once = advanced_once || !iswspace(lexer->lookahead); + advance_once_space = advance_once_space || iswspace(lexer->lookahead); + advance(lexer); + } + } + lexer->mark_end(lexer); + if (lexer->lookahead == ')') + { + advanced_once = true; + advance(lexer); + lexer->mark_end(lexer); + if (lexer->lookahead == '}') + return false; + } + else + return false; + } + + if (lexer->lookahead == '\'') + return false; + if (lexer->eof(lexer)) + return false; + advanced_once = advanced_once || !iswspace(lexer->lookahead); + advance_once_space = advance_once_space || iswspace(lexer->lookahead); + advance(lexer); + } + } + +brace_start: + return false; +} + +void *tree_sitter_sh_external_scanner_create() +{ + Scanner *scanner = calloc(1, sizeof(Scanner)); + array_init(&scanner->heredocs); + return scanner; +} + +bool tree_sitter_sh_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) +{ + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +unsigned tree_sitter_sh_external_scanner_serialize(void *payload, char *state) +{ + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, state); +} + +void tree_sitter_sh_external_scanner_deserialize(void *payload, const char *state, unsigned length) +{ + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, state, length); +} + +void tree_sitter_sh_external_scanner_destroy(void *payload) +{ + Scanner *scanner = (Scanner *)payload; + for (size_t i = 0; i < scanner->heredocs.size; i++) + { + Heredoc *heredoc = array_get(&scanner->heredocs, i); + array_delete(&heredoc->current_leading_word); + array_delete(&heredoc->delimiter); + } + array_delete(&scanner->heredocs); + free(scanner); +}