update: changed the way heredocs are parsed

This commit is contained in:
maix0 2024-09-15 20:29:27 +00:00
parent 8272d72997
commit 43b969183d
365 changed files with 20907 additions and 51362 deletions

View file

@ -57,11 +57,6 @@ module.exports = grammar({
],
externals: $ => [
$.heredoc_start,
$.simple_heredoc_body,
$._heredoc_body_beginning,
$.heredoc_content,
$.heredoc_end,
$.file_descriptor,
$._empty_value,
$._concat,
@ -71,11 +66,9 @@ module.exports = grammar({
$.extglob_pattern,
$._bare_dollar,
$._immediate_double_hash,
'<<',
'<<-',
//'<<',
/\n/,
'(',
'esac',
$.__error_recovery,
],
@ -86,11 +79,6 @@ module.exports = grammar({
/\\( |\t|\v|\f)/,
],
// supertypes: $ => [
// $._statement,
// $._primary_expression,
// ],
word: $ => $.word,
rules: {
@ -118,35 +106,35 @@ module.exports = grammar({
),
_statement_not_subshell: $ => choice(
$.case_statement,
// $.case_statement,
$.command,
$.compound_statement,
$.for_statement,
$.function_definition,
$.if_statement,
// $.for_statement,
// $.function_definition,
// $.if_statement,
$.list,
$.negated_command,
$.pipeline,
$.redirected_statement,
$.variable_assignment,
$._variable_assignments,
$.while_statement,
// $.while_statement,
),
_statement_not_pipeline: $ => prec(1, choice(
$.case_statement,
// $.case_statement,
$.command,
$.compound_statement,
$.for_statement,
$.function_definition,
$.if_statement,
// $.for_statement,
// $.function_definition,
// $.if_statement,
$.list,
$.negated_command,
$.redirected_statement,
$.subshell,
$.variable_assignment,
$._variable_assignments,
$.while_statement,
// $.while_statement,
)),
redirected_statement: $ => prec.dynamic(-1, prec.right(-1, choice(
@ -157,6 +145,7 @@ module.exports = grammar({
field('redr', repeat1($.file_redirect)),
))),
/*
for_statement: $ => seq(
'for',
field('var', $._simple_variable_name),
@ -240,6 +229,7 @@ module.exports = grammar({
'(', ')',
field('body', choice($.compound_statement, $.subshell, $.command, $.while_statement, $.if_statement, $.for_statement, $._variable_assignments, repeat1($.file_redirect))),
)),
*/
compound_statement: $ => seq('{', $._terminated_statement, '}'),
subshell: $ => seq('(', $._statements, ')'),
@ -301,46 +291,9 @@ module.exports = grammar({
heredoc_redirect: $ => seq(
field('op', alias('<<', $.operator)),
$.heredoc_start,
optional(choice(
alias($._heredoc_pipeline, $.pipeline),
seq(
field('redr', repeat1($.file_redirect)),
optional($._heredoc_expression),
),
$._heredoc_expression,
$._heredoc_command,
)),
/\n/,
choice($._heredoc_body, $._simple_heredoc_body),
field('del', alias(/[\w\d\-\._]+/, $.heredoc_delimiter)),
),
_heredoc_pipeline: $ => seq('|', $._statement,),
_heredoc_expression: $ => seq(
field('op', alias(choice('||', '&&'), $.operator)),
field('rhs', $._statement),
),
_heredoc_command: $ => repeat1(field('arg', $._literal)),
_heredoc_body: $ => seq(
$.heredoc_body,
$.heredoc_end,
),
heredoc_body: $ => seq(
$._heredoc_body_beginning,
repeat(choice(
$.expansion,
$.simple_expansion,
$.command_substitution,
$.heredoc_content,
)),
),
_simple_heredoc_body: $ => seq(alias($.simple_heredoc_body, $.heredoc_body), $.heredoc_end),
// Literals
_literal: $ => choice($.concatenation, $._primary_expression),
@ -401,10 +354,10 @@ module.exports = grammar({
field('else', $._arithmetic_expression),
)),
arithmetic_unary_expression: $ =>prec(PREC.UNARY, seq(
field('op', alias(tokenLiterals(1, '-', '+'), $.operator)),
$._arithmetic_expression,
)),
arithmetic_unary_expression: $ => prec(PREC.UNARY, seq(
field('op', alias(tokenLiterals(1, '-', '+'), $.operator)),
$._arithmetic_expression,
)),
arithmetic_postfix_expression: $ => prec(PREC.POSTFIX, seq(
$._arithmetic_expression,
@ -478,8 +431,7 @@ module.exports = grammar({
field('op', alias(immediateLiterals(':-', '-', ':=', '=', ':?', '?', ':+', '+'), $.operator)),
field('args', optional(choice(
alias($._concatenation_in_expansion, $.concatenation),
//alias($._expansion_word, $.word1),
alias(prec(10000000, $._word_no_brace), $.word2),
alias(prec(1, $._word_no_brace), $.word2),
$.expansion,
$.raw_string,
$.string,

View file

@ -9,11 +9,6 @@
enum TokenType
{
HEREDOC_START,
SIMPLE_HEREDOC_BODY,
HEREDOC_BODY_BEGINNING,
HEREDOC_CONTENT,
HEREDOC_END,
FILE_DESCRIPTOR,
EMPTY_VALUE,
CONCAT,
@ -23,11 +18,10 @@ enum TokenType
EXTGLOB_PATTERN,
BARE_DOLLAR,
IMMEDIATE_DOUBLE_HASH,
HEREDOC_ARROW,
HEREDOC_ARROW_DASH,
// HEREDOC_ARROW,
// HEREDOC_ARROW_DASH,
NEWLINE,
OPENING_PAREN,
ESAC,
ERROR_RECOVERY,
};
@ -42,13 +36,13 @@ typedef struct Heredoc
String current_leading_word;
} Heredoc;
#define heredoc_new() \
{ \
.is_raw = false, \
.started = false, \
.allows_indent = false, \
.delimiter = array_new(), \
.current_leading_word = array_new(), \
#define heredoc_new() \
{ \
.is_raw = false, \
.started = false, \
.allows_indent = false, \
.delimiter = array_new(), \
.current_leading_word = array_new(), \
};
typedef struct Scanner
@ -194,8 +188,7 @@ static bool advance_word(TSLexer *lexer, String *unquoted_word)
advance(lexer);
}
while (lexer->lookahead &&
!(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n' : iswspace(lexer->lookahead)))
while (lexer->lookahead && !(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n' : iswspace(lexer->lookahead)))
{
if (lexer->lookahead == '\\')
{
@ -231,178 +224,11 @@ static inline bool scan_bare_dollar(TSLexer *lexer)
return false;
}
/*
 * Scans the delimiter word of a heredoc.
 *
 * Leading whitespace is skipped, the token type is set to HEREDOC_START, and
 * `heredoc->is_raw` records whether the delimiter begins with a single quote,
 * a double quote or a backslash. The word itself is read by advance_word()
 * into `heredoc->delimiter`.
 *
 * Returns true when advance_word() reports a delimiter; otherwise the
 * delimiter string is reset and false is returned.
 */
static bool scan_heredoc_start(Heredoc *heredoc, TSLexer *lexer)
{
    while (iswspace(lexer->lookahead))
    {
        skip(lexer);
    }

    lexer->result_symbol = HEREDOC_START;

    // The raw flag is decided by the first character of the delimiter only.
    switch (lexer->lookahead)
    {
    case '\'':
    case '"':
    case '\\':
        heredoc->is_raw = true;
        break;
    default:
        heredoc->is_raw = false;
        break;
    }

    if (advance_word(lexer, &heredoc->delimiter))
    {
        return true;
    }

    // No delimiter was read: discard whatever advance_word() left behind.
    reset_string(&heredoc->delimiter);
    return false;
}
/*
 * Tests whether the text at the lexer's current position spells the heredoc
 * delimiter.
 *
 * Characters are consumed from the lexer for as long as they match the
 * delimiter position by position. Scanning stops at NUL, at a newline, at the
 * first mismatch, or once `heredoc->delimiter.size` characters have been
 * collected. The consumed prefix is stored, NUL-terminated, in
 * `heredoc->current_leading_word`. Consumed characters are not given back
 * when the match fails.
 *
 * Returns true only when the delimiter is non-empty and strcmp() reports the
 * collected prefix equal to it.
 * NOTE(review): the strcmp() relies on `delimiter.contents` being
 * NUL-terminated by whoever filled it — confirm against advance_word().
 */
static bool scan_heredoc_end_identifier(Heredoc *heredoc, TSLexer *lexer)
{
    reset_string(&heredoc->current_leading_word);

    int32_t size = 0;
    if (heredoc->delimiter.size > 0)
    {
        // The length guard is evaluated before the indexed read so that
        // array_get() is never asked for an element past the delimiter's end.
        while (lexer->lookahead != '\0' && lexer->lookahead != '\n' &&
               heredoc->current_leading_word.size < heredoc->delimiter.size &&
               (int32_t)*array_get(&heredoc->delimiter, size) == lexer->lookahead)
        {
            array_push(&heredoc->current_leading_word, lexer->lookahead);
            advance(lexer);
            size++;
        }
    }

    // Terminate the collected prefix so it can be compared as a C string.
    array_push(&heredoc->current_leading_word, '\0');

    return heredoc->delimiter.size != 0 &&
           strcmp(heredoc->current_leading_word.contents, heredoc->delimiter.contents) == 0;
}
/*
 * Scans heredoc body text for the heredoc on top of `scanner->heredocs`.
 *
 * The loop consumes input one construct at a time and returns as soon as a
 * token boundary is reached:
 *   - `middle_type` is produced when the body is interrupted (a `$` that
 *     introduces an expansion) or when the delimiter is found while the
 *     heredoc is already marked as started;
 *   - `end_type` is produced at end of input, or when the delimiter is found
 *     while the heredoc has not been marked as started.
 *
 * Returns true when `lexer->result_symbol` has been set to a token to emit,
 * false when nothing could be recognised here.
 */
static bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer, enum TokenType middle_type, enum TokenType end_type)
{
// Tracks whether at least one character has been handled by this call.
bool did_advance = false;
Heredoc *heredoc = array_back(&scanner->heredocs);
for (;;)
{
switch (lexer->lookahead)
{
case '\0': {
// At real end of input with some content consumed, close the heredoc with
// `end_type`; a NUL that is not EOF, or an empty scan, is a failure.
if (lexer->eof(lexer) && did_advance)
{
reset_heredoc(heredoc);
lexer->result_symbol = end_type;
return true;
}
return false;
}
case '\\': {
// Consume the backslash together with the character that follows it.
did_advance = true;
advance(lexer);
advance(lexer);
break;
}
case '$': {
// In a raw heredoc `$` is consumed like any other character.
if (heredoc->is_raw)
{
did_advance = true;
advance(lexer);
break;
}
if (did_advance)
{
// End the token just before the `$`, then peek past it: only when an
// alphabetic character, `{` or `(` follows is the token returned here;
// otherwise scanning continues.
lexer->mark_end(lexer);
lexer->result_symbol = middle_type;
heredoc->started = true;
advance(lexer);
if (iswalpha(lexer->lookahead) || lexer->lookahead == '{' || lexer->lookahead == '(')
{
return true;
}
break;
}
// Nothing consumed yet: a `$` in column 0 still yields a body-beginning
// token (without consuming anything) so the heredoc is marked as started.
if (middle_type == HEREDOC_BODY_BEGINNING && lexer->get_column(lexer) == 0)
{
lexer->result_symbol = middle_type;
heredoc->started = true;
return true;
}
return false;
}
case '\n': {
// A newline seen before any content is skipped; later ones are consumed
// as part of the token.
if (!did_advance)
{
skip(lexer);
}
else
{
advance(lexer);
}
did_advance = true;
// When indentation is allowed, whitespace after the newline is consumed
// before the delimiter check.
if (heredoc->allows_indent)
{
while (iswspace(lexer->lookahead))
{
advance(lexer);
}
}
// The token ends here if the next line starts with the delimiter.
lexer->result_symbol = heredoc->started ? middle_type : end_type;
lexer->mark_end(lexer);
if (scan_heredoc_end_identifier(heredoc, lexer))
{
// The heredoc is popped only when the emitted symbol is HEREDOC_END.
if (lexer->result_symbol == HEREDOC_END)
{
(void)array_pop(&scanner->heredocs);
}
return true;
}
break;
}
default: {
// At the start of a line, leading whitespace is passed over and the line
// is checked against the delimiter before being treated as content.
if (lexer->get_column(lexer) == 0)
{
// an alternative is to check the starting column of the
// heredoc body and track that statefully
while (iswspace(lexer->lookahead))
{
if (did_advance)
{
advance(lexer);
}
else
{
skip(lexer);
}
}
// Non-simple body: a delimiter match is reported as `middle_type`, with no
// mark_end() call in this branch.
if (end_type != SIMPLE_HEREDOC_BODY)
{
lexer->result_symbol = middle_type;
if (scan_heredoc_end_identifier(heredoc, lexer))
{
return true;
}
}
// Simple body: the token is ended before the delimiter and reported as
// `end_type`.
if (end_type == SIMPLE_HEREDOC_BODY)
{
lexer->result_symbol = end_type;
lexer->mark_end(lexer);
if (scan_heredoc_end_identifier(heredoc, lexer))
{
return true;
}
}
}
// Ordinary content character.
did_advance = true;
advance(lexer);
break;
}
}
}
}
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
{
if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols))
{
if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' || lexer->lookahead == '<' ||
lexer->lookahead == ')' || lexer->lookahead == '(' || lexer->lookahead == ';' || lexer->lookahead == '&' ||
lexer->lookahead == '|' || lexer->lookahead == '{' || lexer->lookahead == '}'))
if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' || lexer->lookahead == '<' || lexer->lookahead == ')' || lexer->lookahead == '(' || lexer->lookahead == ';' || lexer->lookahead == '&' || lexer->lookahead == '|' || lexer->lookahead == '{' || lexer->lookahead == '}'))
{
lexer->result_symbol = CONCAT;
// So for a`b`, we want to return a concat. We check if the
@ -477,44 +303,11 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
}
}
if ((valid_symbols[HEREDOC_BODY_BEGINNING] || valid_symbols[SIMPLE_HEREDOC_BODY]) && scanner->heredocs.size > 0 &&
!array_back(&scanner->heredocs)->started && !in_error_recovery(valid_symbols))
{
return scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY);
}
if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0)
{
Heredoc *heredoc = array_back(&scanner->heredocs);
if (scan_heredoc_end_identifier(heredoc, lexer))
{
array_delete(&heredoc->current_leading_word);
array_delete(&heredoc->delimiter);
(void)array_pop(&scanner->heredocs);
lexer->result_symbol = HEREDOC_END;
return true;
}
}
if (valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 && array_back(&scanner->heredocs)->started &&
!in_error_recovery(valid_symbols))
{
return scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, HEREDOC_END);
}
if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) && scanner->heredocs.size > 0)
{
return scan_heredoc_start(array_back(&scanner->heredocs), lexer);
}
if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) &&
!in_error_recovery(valid_symbols))
if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR]) && !in_error_recovery(valid_symbols))
{
for (;;)
{
if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' ||
(lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) &&
!valid_symbols[EXPANSION_WORD])
if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' || (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) && !valid_symbols[EXPANSION_WORD])
{
skip(lexer);
}
@ -553,13 +346,11 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
}
// no '*', '@', '?', '-', '$', '0', '_'
if (!valid_symbols[EXPANSION_WORD] && (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' ||
lexer->lookahead == '-' || lexer->lookahead == '0' || lexer->lookahead == '_'))
if (!valid_symbols[EXPANSION_WORD] && (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' || lexer->lookahead == '-' || lexer->lookahead == '0' || lexer->lookahead == '_'))
{
lexer->mark_end(lexer);
advance(lexer);
if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' || lexer->lookahead == '-' ||
lexer->lookahead == '%' || lexer->lookahead == '#' || lexer->lookahead == '/')
if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' || lexer->lookahead == '-' || lexer->lookahead == '%' || lexer->lookahead == '#' || lexer->lookahead == '/')
{
return false;
}
@ -571,35 +362,6 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
}
}
if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<')
{
advance(lexer);
if (lexer->lookahead == '<')
{
advance(lexer);
if (lexer->lookahead == '-')
{
advance(lexer);
Heredoc heredoc = heredoc_new();
heredoc.allows_indent = true;
array_push(&scanner->heredocs, heredoc);
lexer->result_symbol = HEREDOC_ARROW_DASH;
}
// else if (lexer->lookahead == '<' || lexer->lookahead == '=')
// {
// return false;
// }
else
{
Heredoc heredoc = heredoc_new();
array_push(&scanner->heredocs, heredoc);
lexer->result_symbol = HEREDOC_ARROW;
}
return true;
}
return false;
}
bool is_number = true;
if (iswdigit(lexer->lookahead))
{
@ -667,12 +429,9 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
{
return false;
}
if (lexer->lookahead == '=' || lexer->lookahead == '[' ||
(lexer->lookahead == ':' &&
!valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable
// names for function words, only handling : for now? #235
lexer->lookahead == '%' ||
(lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' || (lexer->lookahead == '-'))
if (lexer->lookahead == '=' || lexer->lookahead == '[' || (lexer->lookahead == ':' && !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable
// names for function words, only handling : for now? #235
lexer->lookahead == '%' || (lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' || (lexer->lookahead == '-'))
{
lexer->mark_end(lexer);
lexer->result_symbol = VARIABLE_NAME;
@ -706,8 +465,7 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
}
}
if ((lexer->lookahead != '"' && lexer->lookahead != '\'') || ((lexer->lookahead == '$' || lexer->lookahead == '\'')) ||
(lexer->lookahead == '\''))
if ((lexer->lookahead != '"' && lexer->lookahead != '\'') || ((lexer->lookahead == '$' || lexer->lookahead == '\'')) || (lexer->lookahead == '\''))
{
typedef struct
{
@ -828,9 +586,7 @@ extglob_pattern:
skip(lexer);
}
if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' ||
lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' ||
lexer->lookahead == '.' || lexer->lookahead == '[' || (iswalpha(lexer->lookahead)))
if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' || lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' || lexer->lookahead == '.' || lexer->lookahead == '[' || (iswalpha(lexer->lookahead)))
{
if (lexer->lookahead == '\\')
{
@ -944,9 +700,7 @@ extglob_pattern:
return true;
}
if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' && lexer->lookahead != '[' &&
lexer->lookahead != '?' && lexer->lookahead != '/' && lexer->lookahead != '\\' && lexer->lookahead != '_' &&
lexer->lookahead != '*')
if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' && lexer->lookahead != '[' && lexer->lookahead != '?' && lexer->lookahead != '/' && lexer->lookahead != '\\' && lexer->lookahead != '_' && lexer->lookahead != '*')
{
return false;
}