update: change how heredocs are parsed (drop the external-scanner heredoc tokens; match the heredoc delimiter with a grammar regex instead)

2024-09-15 20:29:27 +00:00 · 2024-09-15 20:29:27 +00:00 · 43b969183d
commit 43b969183d
parent 8272d72997
365 changed files with 20907 additions and 51362 deletions
--- a/.tree-sitter-sh/grammar.js
+++ b/.tree-sitter-sh/grammar.js
@ -57,11 +57,6 @@ module.exports = grammar({
 	],

 	externals: $ => [
-		$.heredoc_start,
-		$.simple_heredoc_body,
-		$._heredoc_body_beginning,
-		$.heredoc_content,
-		$.heredoc_end,
 		$.file_descriptor,
 		$._empty_value,
 		$._concat,
@ -71,11 +66,9 @@ module.exports = grammar({
 		$.extglob_pattern,
 		$._bare_dollar,
 		$._immediate_double_hash,
-		'<<',
-		'<<-',
+		//'<<',
 		/\n/,
 		'(',
-		'esac',
 		$.__error_recovery,
 	],

@ -86,11 +79,6 @@ module.exports = grammar({
 		/\\( |\t|\v|\f)/,
 	],

-	// supertypes: $ => [
-	//   $._statement,
-	//   $._primary_expression,
-	// ],
-
 	word: $ => $.word,

 	rules: {
@ -118,35 +106,35 @@ module.exports = grammar({
 		),

 		_statement_not_subshell: $ => choice(
-			$.case_statement,
+			// $.case_statement,
 			$.command,
 			$.compound_statement,
-			$.for_statement,
-			$.function_definition,
-			$.if_statement,
+			// $.for_statement,
+			// $.function_definition,
+			// $.if_statement,
 			$.list,
 			$.negated_command,
 			$.pipeline,
 			$.redirected_statement,
 			$.variable_assignment,
 			$._variable_assignments,
-			$.while_statement,
+			// $.while_statement,
 		),

 		_statement_not_pipeline: $ => prec(1, choice(
-			$.case_statement,
+			// $.case_statement,
 			$.command,
 			$.compound_statement,
-			$.for_statement,
-			$.function_definition,
-			$.if_statement,
+			// $.for_statement,
+			// $.function_definition,
+			// $.if_statement,
 			$.list,
 			$.negated_command,
 			$.redirected_statement,
 			$.subshell,
 			$.variable_assignment,
 			$._variable_assignments,
-			$.while_statement,
+			// $.while_statement,
 		)),

 		redirected_statement: $ => prec.dynamic(-1, prec.right(-1, choice(
@ -157,6 +145,7 @@ module.exports = grammar({
 			field('redr', repeat1($.file_redirect)),
 		))),

+		/*
 		for_statement: $ => seq(
 			'for',
 			field('var', $._simple_variable_name),
@ -240,6 +229,7 @@ module.exports = grammar({
 			'(', ')',
 			field('body', choice($.compound_statement, $.subshell, $.command, $.while_statement, $.if_statement, $.for_statement, $._variable_assignments, repeat1($.file_redirect))),
 		)),
+		*/

 		compound_statement: $ => seq('{', $._terminated_statement, '}'),
 		subshell: $ => seq('(', $._statements, ')'),
@ -301,46 +291,9 @@ module.exports = grammar({

 		heredoc_redirect: $ => seq(
 			field('op', alias('<<', $.operator)),
-			$.heredoc_start,
-			optional(choice(
-				alias($._heredoc_pipeline, $.pipeline),
-				seq(
-					field('redr', repeat1($.file_redirect)),
-					optional($._heredoc_expression),
-				),
-				$._heredoc_expression,
-				$._heredoc_command,
-			)),
-			/\n/,
-			choice($._heredoc_body, $._simple_heredoc_body),
+			field('del', alias(/[\w\d\-\._]+/, $.heredoc_delimiter)),
 		),

-		_heredoc_pipeline: $ => seq('|', $._statement,),
-
-		_heredoc_expression: $ => seq(
-			field('op', alias(choice('||', '&&'), $.operator)),
-			field('rhs', $._statement),
-		),
-
-		_heredoc_command: $ => repeat1(field('arg', $._literal)),
-
-		_heredoc_body: $ => seq(
-			$.heredoc_body,
-			$.heredoc_end,
-		),
-
-		heredoc_body: $ => seq(
-			$._heredoc_body_beginning,
-			repeat(choice(
-				$.expansion,
-				$.simple_expansion,
-				$.command_substitution,
-				$.heredoc_content,
-			)),
-		),
-
-		_simple_heredoc_body: $ => seq(alias($.simple_heredoc_body, $.heredoc_body), $.heredoc_end),
-
 		// Literals

 		_literal: $ => choice($.concatenation, $._primary_expression),
@ -401,10 +354,10 @@ module.exports = grammar({
 			field('else', $._arithmetic_expression),
 		)),

-		arithmetic_unary_expression: $ =>prec(PREC.UNARY, seq(
-				field('op', alias(tokenLiterals(1, '-', '+'), $.operator)),
-				$._arithmetic_expression,
-			)),
+		arithmetic_unary_expression: $ => prec(PREC.UNARY, seq(
+			field('op', alias(tokenLiterals(1, '-', '+'), $.operator)),
+			$._arithmetic_expression,
+		)),

 		arithmetic_postfix_expression: $ => prec(PREC.POSTFIX, seq(
 			$._arithmetic_expression,
@ -478,8 +431,7 @@ module.exports = grammar({
 			field('op', alias(immediateLiterals(':-', '-', ':=', '=', ':?', '?', ':+', '+'), $.operator)),
 			field('args', optional(choice(
 				alias($._concatenation_in_expansion, $.concatenation),
-				//alias($._expansion_word, $.word1),
-				alias(prec(10000000, $._word_no_brace), $.word2),
+				alias(prec(1, $._word_no_brace), $.word2),
 				$.expansion,
 				$.raw_string,
 				$.string,
--- a/.tree-sitter-sh/src/scanner.c
+++ b/.tree-sitter-sh/src/scanner.c
@ -9,11 +9,6 @@

 enum TokenType
 {
-	HEREDOC_START,
-	SIMPLE_HEREDOC_BODY,
-	HEREDOC_BODY_BEGINNING,
-	HEREDOC_CONTENT,
-	HEREDOC_END,
 	FILE_DESCRIPTOR,
 	EMPTY_VALUE,
 	CONCAT,
@ -23,11 +18,10 @@ enum TokenType
 	EXTGLOB_PATTERN,
 	BARE_DOLLAR,
 	IMMEDIATE_DOUBLE_HASH,
-	HEREDOC_ARROW,
-	HEREDOC_ARROW_DASH,
+	// HEREDOC_ARROW,
+	// HEREDOC_ARROW_DASH,
 	NEWLINE,
 	OPENING_PAREN,
-	ESAC,
 	ERROR_RECOVERY,
 };

@ -42,13 +36,13 @@ typedef struct Heredoc
 	String current_leading_word;
 } Heredoc;

-#define heredoc_new()                                                                                                                      \
-	{                                                                                                                                      \
-		.is_raw = false,                                                                                                                   \
-		.started = false,                                                                                                                  \
-		.allows_indent = false,                                                                                                            \
-		.delimiter = array_new(),                                                                                                          \
-		.current_leading_word = array_new(),                                                                                               \
+#define heredoc_new()                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          \
+	{                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          \
+		.is_raw = false,                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       \
+		.started = false,                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      \
+		.allows_indent = false,                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                \
+		.delimiter = array_new(),                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              \
+		.current_leading_word = array_new(),                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   \
 	};

 typedef struct Scanner
@ -194,8 +188,7 @@ static bool advance_word(TSLexer *lexer, String *unquoted_word)
 		advance(lexer);
 	}

-	while (lexer->lookahead &&
-		   !(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n' : iswspace(lexer->lookahead)))
+	while (lexer->lookahead && !(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n' : iswspace(lexer->lookahead)))
 	{
 		if (lexer->lookahead == '\\')
 		{
@ -231,178 +224,11 @@ static inline bool scan_bare_dollar(TSLexer *lexer)
 	return false;
 }

-static bool scan_heredoc_start(Heredoc *heredoc, TSLexer *lexer)
-{
-	while (iswspace(lexer->lookahead))
-	{
-		skip(lexer);
-	}
-
-	lexer->result_symbol = HEREDOC_START;
-	heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' || lexer->lookahead == '\\';
-
-	bool found_delimiter = advance_word(lexer, &heredoc->delimiter);
-	if (!found_delimiter)
-	{
-		reset_string(&heredoc->delimiter);
-		return false;
-	}
-	return found_delimiter;
-}
-
-static bool scan_heredoc_end_identifier(Heredoc *heredoc, TSLexer *lexer)
-{
-	reset_string(&heredoc->current_leading_word);
-	// Scan the first 'n' characters on this line, to see if they match the
-	// heredoc delimiter
-	int32_t size = 0;
-	if (heredoc->delimiter.size > 0)
-	{
-		while (lexer->lookahead != '\0' && lexer->lookahead != '\n' && (int32_t)*array_get(&heredoc->delimiter, size) == lexer->lookahead &&
-			   heredoc->current_leading_word.size < heredoc->delimiter.size)
-		{
-			array_push(&heredoc->current_leading_word, lexer->lookahead);
-			advance(lexer);
-			size++;
-		}
-	}
-	array_push(&heredoc->current_leading_word, '\0');
-	return heredoc->delimiter.size == 0 ? false : strcmp(heredoc->current_leading_word.contents, heredoc->delimiter.contents) == 0;
-}
-
-static bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer, enum TokenType middle_type, enum TokenType end_type)
-{
-	bool	 did_advance = false;
-	Heredoc *heredoc = array_back(&scanner->heredocs);
-
-	for (;;)
-	{
-		switch (lexer->lookahead)
-		{
-		case '\0': {
-			if (lexer->eof(lexer) && did_advance)
-			{
-				reset_heredoc(heredoc);
-				lexer->result_symbol = end_type;
-				return true;
-			}
-			return false;
-		}
-
-		case '\\': {
-			did_advance = true;
-			advance(lexer);
-			advance(lexer);
-			break;
-		}
-
-		case '$': {
-			if (heredoc->is_raw)
-			{
-				did_advance = true;
-				advance(lexer);
-				break;
-			}
-			if (did_advance)
-			{
-				lexer->mark_end(lexer);
-				lexer->result_symbol = middle_type;
-				heredoc->started = true;
-				advance(lexer);
-				if (iswalpha(lexer->lookahead) || lexer->lookahead == '{' || lexer->lookahead == '(')
-				{
-					return true;
-				}
-				break;
-			}
-			if (middle_type == HEREDOC_BODY_BEGINNING && lexer->get_column(lexer) == 0)
-			{
-				lexer->result_symbol = middle_type;
-				heredoc->started = true;
-				return true;
-			}
-			return false;
-		}
-
-		case '\n': {
-			if (!did_advance)
-			{
-				skip(lexer);
-			}
-			else
-			{
-				advance(lexer);
-			}
-			did_advance = true;
-			if (heredoc->allows_indent)
-			{
-				while (iswspace(lexer->lookahead))
-				{
-					advance(lexer);
-				}
-			}
-			lexer->result_symbol = heredoc->started ? middle_type : end_type;
-			lexer->mark_end(lexer);
-			if (scan_heredoc_end_identifier(heredoc, lexer))
-			{
-				if (lexer->result_symbol == HEREDOC_END)
-				{
-					(void)array_pop(&scanner->heredocs);
-				}
-				return true;
-			}
-			break;
-		}
-
-		default: {
-			if (lexer->get_column(lexer) == 0)
-			{
-				// an alternative is to check the starting column of the
-				// heredoc body and track that statefully
-				while (iswspace(lexer->lookahead))
-				{
-					if (did_advance)
-					{
-						advance(lexer);
-					}
-					else
-					{
-						skip(lexer);
-					}
-				}
-				if (end_type != SIMPLE_HEREDOC_BODY)
-				{
-					lexer->result_symbol = middle_type;
-					if (scan_heredoc_end_identifier(heredoc, lexer))
-					{
-						return true;
-					}
-				}
-				if (end_type == SIMPLE_HEREDOC_BODY)
-				{
-					lexer->result_symbol = end_type;
-					lexer->mark_end(lexer);
-					if (scan_heredoc_end_identifier(heredoc, lexer))
-					{
-						return true;
-					}
-				}
-			}
-			did_advance = true;
-			advance(lexer);
-			break;
-		}
-		}
-	}
-}
-
 static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
 {
 	if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols))
 	{
-		if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' || lexer->lookahead == '<' ||
-			  lexer->lookahead == ')' || lexer->lookahead == '(' || lexer->lookahead == ';' || lexer->lookahead == '&' ||
-			  lexer->lookahead == '|' || lexer->lookahead == '{' || lexer->lookahead == '}'))
+		if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' || lexer->lookahead == '<' || lexer->lookahead == ')' || lexer->lookahead == '(' || lexer->lookahead == ';' || lexer->lookahead == '&' || lexer->lookahead == '|' || lexer->lookahead == '{' || lexer->lookahead == '}'))
 		{
 			lexer->result_symbol = CONCAT;
 			// So for a`b`, we want to return a concat. We check if the
@ -477,44 +303,11 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
 		}
 	}

-	if ((valid_symbols[HEREDOC_BODY_BEGINNING] || valid_symbols[SIMPLE_HEREDOC_BODY]) && scanner->heredocs.size > 0 &&
-		!array_back(&scanner->heredocs)->started && !in_error_recovery(valid_symbols))
-	{
-		return scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY);
-	}
-
-	if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0)
-	{
-		Heredoc *heredoc = array_back(&scanner->heredocs);
-		if (scan_heredoc_end_identifier(heredoc, lexer))
-		{
-			array_delete(&heredoc->current_leading_word);
-			array_delete(&heredoc->delimiter);
-			(void)array_pop(&scanner->heredocs);
-			lexer->result_symbol = HEREDOC_END;
-			return true;
-		}
-	}
-
-	if (valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 && array_back(&scanner->heredocs)->started &&
-		!in_error_recovery(valid_symbols))
-	{
-		return scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, HEREDOC_END);
-	}
-
-	if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) && scanner->heredocs.size > 0)
-	{
-		return scan_heredoc_start(array_back(&scanner->heredocs), lexer);
-	}
-
-	if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) &&
-		!in_error_recovery(valid_symbols))
+	if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR]) && !in_error_recovery(valid_symbols))
 	{
 		for (;;)
 		{
-			if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' ||
-				 (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) &&
-				!valid_symbols[EXPANSION_WORD])
+			if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' || (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) && !valid_symbols[EXPANSION_WORD])
 			{
 				skip(lexer);
 			}
@ -553,13 +346,11 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
 		}

 		// no '*', '@', '?', '-', '$', '0', '_'
-		if (!valid_symbols[EXPANSION_WORD] && (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' ||
-											   lexer->lookahead == '-' || lexer->lookahead == '0' || lexer->lookahead == '_'))
+		if (!valid_symbols[EXPANSION_WORD] && (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' || lexer->lookahead == '-' || lexer->lookahead == '0' || lexer->lookahead == '_'))
 		{
 			lexer->mark_end(lexer);
 			advance(lexer);
-			if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' || lexer->lookahead == '-' ||
-				lexer->lookahead == '%' || lexer->lookahead == '#' || lexer->lookahead == '/')
+			if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' || lexer->lookahead == '-' || lexer->lookahead == '%' || lexer->lookahead == '#' || lexer->lookahead == '/')
 			{
 				return false;
 			}
@ -571,35 +362,6 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
 			}
 		}

-		if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<')
-		{
-			advance(lexer);
-			if (lexer->lookahead == '<')
-			{
-				advance(lexer);
-				if (lexer->lookahead == '-')
-				{
-					advance(lexer);
-					Heredoc heredoc = heredoc_new();
-					heredoc.allows_indent = true;
-					array_push(&scanner->heredocs, heredoc);
-					lexer->result_symbol = HEREDOC_ARROW_DASH;
-				}
-				// else if (lexer->lookahead == '<' || lexer->lookahead == '=')
-				// {
-				// 	return false;
-				// }
-				else
-				{
-					Heredoc heredoc = heredoc_new();
-					array_push(&scanner->heredocs, heredoc);
-					lexer->result_symbol = HEREDOC_ARROW;
-				}
-				return true;
-			}
-			return false;
-		}
-
 		bool is_number = true;
 		if (iswdigit(lexer->lookahead))
 		{
@ -667,12 +429,9 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
 			{
 				return false;
 			}
-			if (lexer->lookahead == '=' || lexer->lookahead == '[' ||
-				(lexer->lookahead == ':' &&
-				 !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable
-												   // names for function words, only handling : for now? #235
-				lexer->lookahead == '%' ||
-				(lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' || (lexer->lookahead == '-'))
+			if (lexer->lookahead == '=' || lexer->lookahead == '[' || (lexer->lookahead == ':' && !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable
+																																	// names for function words, only handling : for now? #235
+				lexer->lookahead == '%' || (lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' || (lexer->lookahead == '-'))
 			{
 				lexer->mark_end(lexer);
 				lexer->result_symbol = VARIABLE_NAME;
@ -706,8 +465,7 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
 			}
 		}

-		if ((lexer->lookahead != '"' && lexer->lookahead != '\'') || ((lexer->lookahead == '$' || lexer->lookahead == '\'')) ||
-			(lexer->lookahead == '\''))
+		if ((lexer->lookahead != '"' && lexer->lookahead != '\'') || ((lexer->lookahead == '$' || lexer->lookahead == '\'')) || (lexer->lookahead == '\''))
 		{
 			typedef struct
 			{
@ -828,9 +586,7 @@ extglob_pattern:
 			skip(lexer);
 		}

-		if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' ||
-			lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' ||
-			lexer->lookahead == '.' || lexer->lookahead == '[' || (iswalpha(lexer->lookahead)))
+		if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' || lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' || lexer->lookahead == '.' || lexer->lookahead == '[' || (iswalpha(lexer->lookahead)))
 		{
 			if (lexer->lookahead == '\\')
 			{
@ -944,9 +700,7 @@ extglob_pattern:
 				return true;
 			}

-			if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' && lexer->lookahead != '[' &&
-				lexer->lookahead != '?' && lexer->lookahead != '/' && lexer->lookahead != '\\' && lexer->lookahead != '_' &&
-				lexer->lookahead != '*')
+			if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' && lexer->lookahead != '[' && lexer->lookahead != '?' && lexer->lookahead != '/' && lexer->lookahead != '\\' && lexer->lookahead != '_' && lexer->lookahead != '*')
 			{
 				return false;
 			}