From 4a8fb259dc94598e5749bd45d8c797be67bb1913 Mon Sep 17 00:00:00 2001 From: Maieul BOYER Date: Sat, 31 Aug 2024 18:26:15 +0000 Subject: [PATCH] Normed lexer --- parser/src/lexer/lexer_advance.c | 89 ++++++++++++++++------------ parser/src/lexer/lexer_chunk.c | 28 +++++---- parser/src/lexer/lexer_end.c | 49 +++++++-------- parser/src/lexer/lexer_get_column.c | 25 ++++---- parser/src/lexer/lexer_goto.c | 92 +++++++++++++++++------------ parser/src/lexer/lexer_lifetime.c | 40 +++++++------ parser/src/lexer/lexer_lookahead.c | 26 ++++---- 7 files changed, 191 insertions(+), 158 deletions(-) diff --git a/parser/src/lexer/lexer_advance.c b/parser/src/lexer/lexer_advance.c index fc3f103f..eb049775 100644 --- a/parser/src/lexer/lexer_advance.c +++ b/parser/src/lexer/lexer_advance.c @@ -6,7 +6,7 @@ /* By: maiboyer +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/08/31 18:06:07 by maiboyer #+# #+# */ -/* Updated: 2024/08/31 18:06:39 by maiboyer ### ########.fr */ +/* Updated: 2024/08/31 18:23:07 by maiboyer ### ########.fr */ /* */ /* ************************************************************************** */ @@ -23,9 +23,16 @@ void ts_lexer__mark_end(TSLexer *_self); void ts_lexer_advance_to_end(Lexer *self); void ts_lexer_goto(Lexer *self, Length position); +bool ts_lexer__do_advance_loop(Lexer *self, const TSRange **current_range); +void ts_lexer__do_advance_after_loop(Lexer *self, bool skip, + const TSRange *cur); + // Intended to be called only from functions that control logging. -void ts_lexer__do_advance(Lexer *self, bool skip) +void ts_lexer__do_advance(Lexer *self, bool skip) { + const TSRange *cur = \ + &self->included_ranges[self->current_included_range_index]; + if (self->lookahead_size) { self->current_position.bytes += self->lookahead_size; @@ -35,42 +42,58 @@ void ts_lexer__do_advance(Lexer *self, bool skip) self->current_position.extent.column = 0; } else - { self->current_position.extent.column += self->lookahead_size; - } } + while (self->current_position.bytes >= cur->end_byte + || cur->end_byte == cur->start_byte) + if (ts_lexer__do_advance_loop(self, &cur)) + break ; + ts_lexer__do_advance_after_loop(self, skip, cur); +} - const TSRange *current_range = &self->included_ranges[self->current_included_range_index]; - while (self->current_position.bytes >= current_range->end_byte || current_range->end_byte == current_range->start_byte) +// Advance to the next character in the source code, retrieving a new +// chunk of source code if needed. +void ts_lexer__advance(TSLexer *_self, bool skip) +{ + Lexer *self; + + self = (Lexer *)_self; + if (!self->chunk) + return ; + ts_lexer__do_advance(self, skip); +} + +bool ts_lexer__do_advance_loop(Lexer *self, const TSRange **current_range) +{ + if (self->current_included_range_index < self->included_range_count) + self->current_included_range_index++; + if (self->current_included_range_index < self->included_range_count) { - if (self->current_included_range_index < self->included_range_count) - { - self->current_included_range_index++; - } - if (self->current_included_range_index < self->included_range_count) - { - current_range++; - self->current_position = (Length){ - current_range->start_byte, - current_range->start_point, - }; - } - else - { - current_range = NULL; - break; - } + (*current_range)++; + self->current_position = (Length){ + (*current_range)->start_byte, + (*current_range)->start_point, + }; } + else + { + (*current_range) = NULL; + return (true); + } + return (false); +} +void ts_lexer__do_advance_after_loop(Lexer *self, bool skip, + const TSRange *cur) +{ if (skip) self->token_start_position = self->current_position; - - if (current_range) + if (cur) { - if (self->current_position.bytes < self->chunk_start || self->current_position.bytes >= self->chunk_start + self->chunk_size) - { + if (self->current_position.bytes < self->chunk_start + || self->current_position.bytes >= self->chunk_start + + self->chunk_size) ts_lexer__get_chunk(self); - } ts_lexer__get_lookahead(self); } else @@ -80,13 +103,3 @@ void ts_lexer__do_advance(Lexer *self, bool skip) self->lookahead_size = 1; } } - -// Advance to the next character in the source code, retrieving a new -// chunk of source code if needed. -void ts_lexer__advance(TSLexer *_self, bool skip) -{ - Lexer *self = (Lexer *)_self; - if (!self->chunk) - return; - ts_lexer__do_advance(self, skip); -} diff --git a/parser/src/lexer/lexer_chunk.c b/parser/src/lexer/lexer_chunk.c index c047a376..75b4fe0d 100644 --- a/parser/src/lexer/lexer_chunk.c +++ b/parser/src/lexer/lexer_chunk.c @@ -13,20 +13,20 @@ #include "me/types.h" #include "parser/lexer.h" -bool ts_lexer__eof(const TSLexer *_self); -t_u32 ts_lexer__get_column(TSLexer *_self); -void ts_lexer__advance(TSLexer *_self, bool skip); -void ts_lexer__do_advance(Lexer *self, bool skip); -void ts_lexer__clear_chunk(Lexer *self); -void ts_lexer__get_chunk(Lexer *self); -void ts_lexer__get_lookahead(Lexer *self); -void ts_lexer__mark_end(TSLexer *_self); -void ts_lexer_advance_to_end(Lexer *self); -void ts_lexer_goto(Lexer *self, Length position); +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); // Clear the currently stored chunk of source code, because the lexer's // position has changed. -void ts_lexer__clear_chunk(Lexer *self) +void ts_lexer__clear_chunk(Lexer *self) { self->chunk = NULL; self->chunk_size = 0; @@ -35,10 +35,12 @@ void ts_lexer__clear_chunk(Lexer *self) // Call the lexer's input callback to obtain a new chunk of source code // for the current position. -void ts_lexer__get_chunk(Lexer *self) +void ts_lexer__get_chunk(Lexer *self) { self->chunk_start = self->current_position.bytes; - self->chunk = self->input.read(self->input.payload, self->current_position.bytes, self->current_position.extent, &self->chunk_size); + self->chunk = self->input.read(self->input.payload, + self->current_position.bytes, self->current_position.extent, + &self->chunk_size); if (!self->chunk_size) { self->current_included_range_index = self->included_range_count; diff --git a/parser/src/lexer/lexer_end.c b/parser/src/lexer/lexer_end.c index 9bde7e59..b5473a4e 100644 --- a/parser/src/lexer/lexer_end.c +++ b/parser/src/lexer/lexer_end.c @@ -6,30 +6,30 @@ /* By: maiboyer +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/08/31 18:07:07 by maiboyer #+# #+# */ -/* Updated: 2024/08/31 18:07:21 by maiboyer ### ########.fr */ +/* Updated: 2024/08/31 18:12:10 by maiboyer ### ########.fr */ /* */ /* ************************************************************************** */ #include "me/types.h" #include "parser/lexer.h" -bool ts_lexer__eof(const TSLexer *_self); -t_u32 ts_lexer__get_column(TSLexer *_self); -void ts_lexer__advance(TSLexer *_self, bool skip); -void ts_lexer__do_advance(Lexer *self, bool skip); -void ts_lexer__clear_chunk(Lexer *self); -void ts_lexer__get_chunk(Lexer *self); -void ts_lexer__get_lookahead(Lexer *self); -void ts_lexer__mark_end(TSLexer *_self); -void ts_lexer_advance_to_end(Lexer *self); -void ts_lexer_goto(Lexer *self, Length position); +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); // Check if the lexer has reached EOF. This state is stored // by setting the lexer's `current_included_range_index` such that // it has consumed all of its available ranges. -bool ts_lexer__eof(const TSLexer *_self) +bool ts_lexer__eof(const TSLexer *_self) { - Lexer *self; + Lexer *self; self = (Lexer *)_self; return (self->current_included_range_index == self->included_range_count); @@ -37,29 +37,32 @@ bool ts_lexer__eof(const TSLexer *_self) // Mark that a token match has completed. This can be called multiple // times if a longer match is found later. -void ts_lexer__mark_end(TSLexer *_self) +void ts_lexer__mark_end(TSLexer *_self) { - Lexer *self = (Lexer *)_self; + Lexer *self; + TSRange *current_included_range; + TSRange *previous_included_range; + + self = (Lexer *)_self; if (!ts_lexer__eof(&self->data)) { - // If the lexer is right at the beginning of included range, - // then the token should be considered to end at the *end* of the - // previous included range, rather than here. - TSRange *current_included_range = &self->included_ranges[self->current_included_range_index]; - if (self->current_included_range_index > 0 && self->current_position.bytes == current_included_range->start_byte) + current_included_range = \ + &self->included_ranges[self->current_included_range_index]; + if (self->current_included_range_index > 0 \ + && self->current_position.bytes == current_included_range->start_byte) { - TSRange *previous_included_range = current_included_range - 1; + previous_included_range = current_included_range - 1; self->token_end_position = (Length){ previous_included_range->end_byte, previous_included_range->end_point, }; - return; + return ; } } self->token_end_position = self->current_position; } -void ts_lexer_advance_to_end(Lexer *self) +void ts_lexer_advance_to_end(Lexer *self) { while (self->chunk) ts_lexer__advance(&self->data, false); diff --git a/parser/src/lexer/lexer_get_column.c b/parser/src/lexer/lexer_get_column.c index 98201fb0..c5765b07 100644 --- a/parser/src/lexer/lexer_get_column.c +++ b/parser/src/lexer/lexer_get_column.c @@ -6,7 +6,7 @@ /* By: maiboyer +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/08/31 18:04:55 by maiboyer #+# #+# */ -/* Updated: 2024/08/31 18:05:47 by maiboyer ### ########.fr */ +/* Updated: 2024/08/31 18:18:31 by maiboyer ### ########.fr */ /* */ /* ************************************************************************** */ @@ -16,7 +16,7 @@ bool ts_lexer__eof(const TSLexer *_self); t_u32 ts_lexer__get_column(TSLexer *_self); void ts_lexer__advance(TSLexer *_self, bool skip); -void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); void ts_lexer__clear_chunk(Lexer *self); void ts_lexer__get_chunk(Lexer *self); void ts_lexer__get_lookahead(Lexer *self); @@ -24,22 +24,20 @@ void ts_lexer__mark_end(TSLexer *_self); void ts_lexer_advance_to_end(Lexer *self); void ts_lexer_goto(Lexer *self, Length position); -t_u32 ts_lexer__get_column(TSLexer *_self) +t_u32 ts_lexer__get_column(TSLexer *_self) { - Lexer *self = (Lexer *)_self; - - t_u32 goal_byte = self->current_position.bytes; + Lexer *self; + t_u32 goal_byte; + t_u32 result; + self = (Lexer *)_self; + goal_byte = self->current_position.bytes; self->did_get_column = true; self->current_position.bytes -= self->current_position.extent.column; self->current_position.extent.column = 0; - if (self->current_position.bytes < self->chunk_start) - { ts_lexer__get_chunk(self); - } - - t_u32 result = 0; + result = 0; if (!ts_lexer__eof(_self)) { ts_lexer__get_lookahead(self); @@ -48,9 +46,8 @@ t_u32 ts_lexer__get_column(TSLexer *_self) result++; ts_lexer__do_advance(self, false); if (ts_lexer__eof(_self)) - break; + break ; } } - - return result; + return (result); } diff --git a/parser/src/lexer/lexer_goto.c b/parser/src/lexer/lexer_goto.c index 86c4b2ba..de53a09d 100644 --- a/parser/src/lexer/lexer_goto.c +++ b/parser/src/lexer/lexer_goto.c @@ -6,69 +6,85 @@ /* By: maiboyer +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/08/31 18:08:11 by maiboyer #+# #+# */ -/* Updated: 2024/08/31 18:08:20 by maiboyer ### ########.fr */ +/* Updated: 2024/08/31 18:25:58 by maiboyer ### ########.fr */ /* */ /* ************************************************************************** */ #include "me/types.h" #include "parser/lexer.h" -bool ts_lexer__eof(const TSLexer *_self); -t_u32 ts_lexer__get_column(TSLexer *_self); -void ts_lexer__advance(TSLexer *_self, bool skip); -void ts_lexer__do_advance(Lexer *self, bool skip); -void ts_lexer__clear_chunk(Lexer *self); -void ts_lexer__get_chunk(Lexer *self); -void ts_lexer__get_lookahead(Lexer *self); -void ts_lexer__mark_end(TSLexer *_self); -void ts_lexer_advance_to_end(Lexer *self); -void ts_lexer_goto(Lexer *self, Length position); +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); -void ts_lexer_goto(Lexer *self, Length position) +void ts_lexer_goto_inside_loop(Lexer *self, bool *found_included_range, + TSRange *included_range, t_usize i); +void ts_lexer_goto_after_loop(Lexer *self, bool found_included_range); + +void ts_lexer_goto(Lexer *self, Length position) { - bool found_included_range; - TSRange *included_range; - TSRange *last_included_range; + bool found_included_range; + TSRange *included_range; + t_u32 i; + included_range = NULL; found_included_range = false; self->current_position = position; - for (t_u32 i = 0; i < self->included_range_count; i++) + i = 0; + while (i < self->included_range_count) { included_range = &self->included_ranges[i]; - if (included_range->end_byte > self->current_position.bytes && included_range->end_byte > included_range->start_byte) + if (included_range->end_byte > self->current_position.bytes + && included_range->end_byte > included_range->start_byte) { - if (included_range->start_byte >= self->current_position.bytes) - { - self->current_position = (Length){ - .bytes = included_range->start_byte, - .extent = included_range->start_point, - }; - } - - self->current_included_range_index = i; - found_included_range = true; - break; + ts_lexer_goto_inside_loop(self, &found_included_range, + included_range, i); + break ; } + i++; } + ts_lexer_goto_after_loop(self, found_included_range); +} + +void ts_lexer_goto_inside_loop(Lexer *self, bool *found_included_range, + TSRange *included_range, t_usize i) +{ + if (included_range->start_byte >= self->current_position.bytes) + { + self->current_position = (Length){ + .bytes = included_range->start_byte, + .extent = included_range->start_point, + }; + } + self->current_included_range_index = i; + *found_included_range = true; +} + +void ts_lexer_goto_after_loop(Lexer *self, bool found_included_range) +{ + TSRange *last_included_range; + if (found_included_range) { - // If the current position is outside of the current chunk of text, - // then clear out the current chunk of text. - if (self->chunk && - (self->current_position.bytes < self->chunk_start || self->current_position.bytes >= self->chunk_start + self->chunk_size)) - { + if (self->chunk && (self->current_position.bytes < self->chunk_start + || self->current_position.bytes >= self->chunk_start + + self->chunk_size)) ts_lexer__clear_chunk(self); - } - self->lookahead_size = 0; self->data.lookahead = '\0'; } - // If the given position is beyond any of included ranges, move to the EOF - // state - past the end of the included ranges. else { self->current_included_range_index = self->included_range_count; - last_included_range = &self->included_ranges[self->included_range_count - 1]; + last_included_range = &self->included_ranges[self->included_range_count + - 1]; self->current_position = (Length){ .bytes = last_included_range->end_byte, .extent = last_included_range->end_point, diff --git a/parser/src/lexer/lexer_lifetime.c b/parser/src/lexer/lexer_lifetime.c index e7a00c38..94959d02 100644 --- a/parser/src/lexer/lexer_lifetime.c +++ b/parser/src/lexer/lexer_lifetime.c @@ -6,7 +6,7 @@ /* By: maiboyer +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/08/31 17:58:01 by maiboyer #+# #+# */ -/* Updated: 2024/08/31 18:03:48 by maiboyer ### ########.fr */ +/* Updated: 2024/08/31 18:25:16 by maiboyer ### ########.fr */ /* */ /* ************************************************************************** */ @@ -25,43 +25,44 @@ void ts_lexer__mark_end(TSLexer *_self); void ts_lexer_advance_to_end(Lexer *self); void ts_lexer_goto(Lexer *self, Length position); -void ts_lexer_init(Lexer *self) +void ts_lexer_init(Lexer *self) { - static TSRange DEFAULT_RANGE = {.start_point = \ - { .row = 0, .column = 0, }, \ - .end_point = { .row = UINT32_MAX, .column = UINT32_MAX, }, \ - .start_byte = 0, .end_byte = UINT32_MAX}; + static TSRange default_range = {.start_point = {\ + .row = 0, .column = 0, }, .end_point = {.row = UINT32_MAX, \ + .column = UINT32_MAX, }, .start_byte = 0, .end_byte = UINT32_MAX}; *self = (Lexer){ .data = { - .advance = ts_lexer__advance, - .mark_end = ts_lexer__mark_end, - .get_column = ts_lexer__get_column, - .eof = ts_lexer__eof, - .lookahead = 0, - .result_symbol = 0, }, - .chunk = NULL, .chunk_size = 0, .chunk_start = 0, \ + .advance = ts_lexer__advance, + .mark_end = ts_lexer__mark_end, + .get_column = ts_lexer__get_column, + .eof = ts_lexer__eof, + .lookahead = 0, + .result_symbol = 0, }, + .chunk = NULL, + .chunk_size = 0, + .chunk_start = 0, .current_position = {0, {0, 0}}, - .included_ranges = (void *)&DEFAULT_RANGE, + .included_ranges = (void *)&default_range, .included_range_count = 1, .current_included_range_index = 0, }; } -void ts_lexer_set_input(Lexer *self, TSInput input) +void ts_lexer_set_input(Lexer *self, TSInput input) { self->input = input; ts_lexer__clear_chunk(self); ts_lexer_goto(self, self->current_position); } -void ts_lexer_reset(Lexer *self, Length position) +void ts_lexer_reset(Lexer *self, Length position) { if (position.bytes != self->current_position.bytes) ts_lexer_goto(self, position); } -void ts_lexer_start(Lexer *self) +void ts_lexer_start(Lexer *self) { self->token_start_position = self->current_position; self->token_end_position = LENGTH_UNDEFINED; @@ -73,12 +74,13 @@ void ts_lexer_start(Lexer *self) ts_lexer__get_chunk(self); if (!self->lookahead_size) ts_lexer__get_lookahead(self); - if (self->current_position.bytes == 0 && self->data.lookahead == BYTE_ORDER_MARK) + if (self->current_position.bytes == 0 + && self->data.lookahead == BYTE_ORDER_MARK) ts_lexer__advance(&self->data, true); } } -void ts_lexer_finish(Lexer *self, t_u32 *lookahead_end_byte) +void ts_lexer_finish(Lexer *self, t_u32 *lookahead_end_byte) { if (length_is_undefined(self->token_end_position)) ts_lexer__mark_end(&self->data); diff --git a/parser/src/lexer/lexer_lookahead.c b/parser/src/lexer/lexer_lookahead.c index 4df93978..4b04e66d 100644 --- a/parser/src/lexer/lexer_lookahead.c +++ b/parser/src/lexer/lexer_lookahead.c @@ -14,22 +14,22 @@ #include "parser/input.h" #include "parser/lexer.h" -bool ts_lexer__eof(const TSLexer *_self); -t_u32 ts_lexer__get_column(TSLexer *_self); -void ts_lexer__advance(TSLexer *_self, bool skip); -void ts_lexer__do_advance(Lexer *self, bool skip); -void ts_lexer__clear_chunk(Lexer *self); -void ts_lexer__get_chunk(Lexer *self); -void ts_lexer__get_lookahead(Lexer *self); -void ts_lexer__mark_end(TSLexer *_self); -void ts_lexer_advance_to_end(Lexer *self); -void ts_lexer_goto(Lexer *self, Length position); +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); -void ts_lexer__get_lookahead(Lexer *self) +void ts_lexer__get_lookahead(Lexer *self) { t_u32 position_in_chunk; t_u32 size; - const t_u8 *chunk; + const t_u8 *chunk; position_in_chunk = self->current_position.bytes - self->chunk_start; size = self->chunk_size - position_in_chunk; @@ -37,7 +37,7 @@ void ts_lexer__get_lookahead(Lexer *self) { self->lookahead_size = 1; self->data.lookahead = '\0'; - return; + return ; } chunk = (const t_u8 *)self->chunk + position_in_chunk; self->lookahead_size = ts_decode_ascii(chunk, size, &self->data.lookahead);