From 5ebadce4f84f81e859c95fdaaba749b591f721db Mon Sep 17 00:00:00 2001 From: Maieul BOYER Date: Sat, 31 Aug 2024 18:10:13 +0000 Subject: [PATCH] Working state --- parser/Filelist.parser.mk | 25 +- parser/src/language/language_field.c | 3 +- parser/src/lexer.c | 339 --------------------------- parser/src/lexer/lexer_advance.c | 92 ++++++++ parser/src/lexer/lexer_chunk.c | 47 ++++ parser/src/lexer/lexer_end.c | 66 ++++++ parser/src/lexer/lexer_get_column.c | 56 +++++ parser/src/lexer/lexer_goto.c | 80 +++++++ parser/src/lexer/lexer_lifetime.c | 88 +++++++ parser/src/lexer/lexer_lookahead.c | 46 ++++ 10 files changed, 494 insertions(+), 348 deletions(-) delete mode 100644 parser/src/lexer.c create mode 100644 parser/src/lexer/lexer_advance.c create mode 100644 parser/src/lexer/lexer_chunk.c create mode 100644 parser/src/lexer/lexer_end.c create mode 100644 parser/src/lexer/lexer_get_column.c create mode 100644 parser/src/lexer/lexer_goto.c create mode 100644 parser/src/lexer/lexer_lifetime.c create mode 100644 parser/src/lexer/lexer_lookahead.c diff --git a/parser/Filelist.parser.mk b/parser/Filelist.parser.mk index 0c6b559a..d45737e9 100644 --- a/parser/Filelist.parser.mk +++ b/parser/Filelist.parser.mk @@ -1,13 +1,25 @@ SRC_FILES = \ -create_language \ -external_scanner_state \ -external_scanner_state2 \ -input \ -language \ +language/language_field \ +language/language_getters \ +language/language_getters2 \ +language/language_misc \ +language/language_symbol \ +language/language_symbol2 \ length/length_funcs1 \ length/length_funcs2 \ lex \ -lexer \ +lexer/lexer_advance \ +lexer/lexer_chunk \ +lexer/lexer_end \ +lexer/lexer_get_column \ +lexer/lexer_goto \ +lexer/lexer_lifetime \ +lexer/lexer_lookahead \ +misc/create_language \ +misc/external_scanner_state \ +misc/external_scanner_state2 \ +misc/input \ +misc/reduce_action \ node/node_child \ node/node_child_inner \ node/node_constructor \ @@ -20,7 +32,6 @@ node/node_relevent \ parser \ point/point_funcs1 \ point/point_funcs2 \ -reduce_action \ scanner \ stack/stack_add_link \ stack/stack_funcs1 \ diff --git a/parser/src/language/language_field.c b/parser/src/language/language_field.c index ef8724b6..6b5ceec0 100644 --- a/parser/src/language/language_field.c +++ b/parser/src/language/language_field.c @@ -6,7 +6,7 @@ /* By: maiboyer +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/08/31 17:47:24 by maiboyer #+# #+# */ -/* Updated: 2024/08/31 17:48:55 by maiboyer ### ########.fr */ +/* Updated: 2024/08/31 18:10:02 by maiboyer ### ########.fr */ /* */ /* ************************************************************************** */ @@ -32,7 +32,6 @@ TSFieldId ts_language_field_id_for_name(const TSLanguage *self, { t_u16 count; TSSymbol i; - bool res; count = (t_u16)ts_language_field_count(self); i = 1; diff --git a/parser/src/lexer.c b/parser/src/lexer.c deleted file mode 100644 index bad21ac8..00000000 --- a/parser/src/lexer.c +++ /dev/null @@ -1,339 +0,0 @@ -#include "parser/lexer.h" -#include "me/mem/mem.h" -#include "me/types.h" -#include "parser/input.h" -#include "parser/length.h" -#include -#include - -#define BYTE_ORDER_MARK 0xFEFF - -// Check if the lexer has reached EOF. This state is stored -// by setting the lexer's `current_included_range_index` such that -// it has consumed all of its available ranges. -static bool ts_lexer__eof(const TSLexer *_self) -{ - Lexer *self; - - self = (Lexer *)_self; - return (self->current_included_range_index == self->included_range_count); -} - -// Clear the currently stored chunk of source code, because the lexer's -// position has changed. -static void ts_lexer__clear_chunk(Lexer *self) -{ - self->chunk = NULL; - self->chunk_size = 0; - self->chunk_start = 0; -} - -// Call the lexer's input callback to obtain a new chunk of source code -// for the current position. -static void ts_lexer__get_chunk(Lexer *self) -{ - self->chunk_start = self->current_position.bytes; - self->chunk = self->input.read(self->input.payload, self->current_position.bytes, self->current_position.extent, &self->chunk_size); - if (!self->chunk_size) - { - self->current_included_range_index = self->included_range_count; - self->chunk = NULL; - } -} - -// Decode the next unicode character in the current chunk of source code. -// This assumes that the lexer has already retrieved a chunk of source -// code that spans the current position. -static void ts_lexer__get_lookahead(Lexer *self) -{ - t_u32 position_in_chunk; - t_u32 size; - const t_u8 *chunk; - - position_in_chunk = self->current_position.bytes - self->chunk_start; - size = self->chunk_size - position_in_chunk; - if (size == 0) - { - self->lookahead_size = 1; - self->data.lookahead = '\0'; - return; - } - chunk = (const t_u8 *)self->chunk + position_in_chunk; - self->lookahead_size = ts_decode_ascii(chunk, size, &self->data.lookahead); - if (self->data.lookahead == TS_DECODE_ERROR) - self->lookahead_size = 1; -} - -static void ts_lexer_goto(Lexer *self, Length position) -{ - bool found_included_range; - TSRange *included_range; - TSRange *last_included_range; - - found_included_range = false; - self->current_position = position; - for (t_u32 i = 0; i < self->included_range_count; i++) - { - included_range = &self->included_ranges[i]; - if (included_range->end_byte > self->current_position.bytes && included_range->end_byte > included_range->start_byte) - { - if (included_range->start_byte >= self->current_position.bytes) - { - self->current_position = (Length){ - .bytes = included_range->start_byte, - .extent = included_range->start_point, - }; - } - - self->current_included_range_index = i; - found_included_range = true; - break; - } - } - if (found_included_range) - { - // If the current position is outside of the current chunk of text, - // then clear out the current chunk of text. - if (self->chunk && - (self->current_position.bytes < self->chunk_start || self->current_position.bytes >= self->chunk_start + self->chunk_size)) - { - ts_lexer__clear_chunk(self); - } - - self->lookahead_size = 0; - self->data.lookahead = '\0'; - } - // If the given position is beyond any of included ranges, move to the EOF - // state - past the end of the included ranges. - else - { - self->current_included_range_index = self->included_range_count; - last_included_range = &self->included_ranges[self->included_range_count - 1]; - self->current_position = (Length){ - .bytes = last_included_range->end_byte, - .extent = last_included_range->end_point, - }; - ts_lexer__clear_chunk(self); - self->lookahead_size = 1; - self->data.lookahead = '\0'; - } -} - -// Intended to be called only from functions that control logging. -static void ts_lexer__do_advance(Lexer *self, bool skip) -{ - if (self->lookahead_size) - { - self->current_position.bytes += self->lookahead_size; - if (self->data.lookahead == '\n') - { - self->current_position.extent.row++; - self->current_position.extent.column = 0; - } - else - { - self->current_position.extent.column += self->lookahead_size; - } - } - - const TSRange *current_range = &self->included_ranges[self->current_included_range_index]; - while (self->current_position.bytes >= current_range->end_byte || current_range->end_byte == current_range->start_byte) - { - if (self->current_included_range_index < self->included_range_count) - { - self->current_included_range_index++; - } - if (self->current_included_range_index < self->included_range_count) - { - current_range++; - self->current_position = (Length){ - current_range->start_byte, - current_range->start_point, - }; - } - else - { - current_range = NULL; - break; - } - } - - if (skip) - self->token_start_position = self->current_position; - - if (current_range) - { - if (self->current_position.bytes < self->chunk_start || self->current_position.bytes >= self->chunk_start + self->chunk_size) - { - ts_lexer__get_chunk(self); - } - ts_lexer__get_lookahead(self); - } - else - { - ts_lexer__clear_chunk(self); - self->data.lookahead = '\0'; - self->lookahead_size = 1; - } -} - -// Advance to the next character in the source code, retrieving a new -// chunk of source code if needed. -static void ts_lexer__advance(TSLexer *_self, bool skip) -{ - Lexer *self = (Lexer *)_self; - if (!self->chunk) - return; - ts_lexer__do_advance(self, skip); -} - -// Mark that a token match has completed. This can be called multiple -// times if a longer match is found later. -static void ts_lexer__mark_end(TSLexer *_self) -{ - Lexer *self = (Lexer *)_self; - if (!ts_lexer__eof(&self->data)) - { - // If the lexer is right at the beginning of included range, - // then the token should be considered to end at the *end* of the - // previous included range, rather than here. - TSRange *current_included_range = &self->included_ranges[self->current_included_range_index]; - if (self->current_included_range_index > 0 && self->current_position.bytes == current_included_range->start_byte) - { - TSRange *previous_included_range = current_included_range - 1; - self->token_end_position = (Length){ - previous_included_range->end_byte, - previous_included_range->end_point, - }; - return; - } - } - self->token_end_position = self->current_position; -} - -static t_u32 ts_lexer__get_column(TSLexer *_self) -{ - Lexer *self = (Lexer *)_self; - - t_u32 goal_byte = self->current_position.bytes; - - self->did_get_column = true; - self->current_position.bytes -= self->current_position.extent.column; - self->current_position.extent.column = 0; - - if (self->current_position.bytes < self->chunk_start) - { - ts_lexer__get_chunk(self); - } - - t_u32 result = 0; - if (!ts_lexer__eof(_self)) - { - ts_lexer__get_lookahead(self); - while (self->current_position.bytes < goal_byte && self->chunk) - { - result++; - ts_lexer__do_advance(self, false); - if (ts_lexer__eof(_self)) - break; - } - } - - return result; -} - -// Is the lexer at a boundary between two disjoint included ranges of -// source code? This is exposed as an API because some languages' external -// scanners need to perform custom actions at these boundaries. -static const TSRange DEFAULT_RANGE = {.start_point = - { - .row = 0, - .column = 0, - }, - .end_point = - { - .row = UINT32_MAX, - .column = UINT32_MAX, - }, - .start_byte = 0, - .end_byte = UINT32_MAX}; - -void ts_lexer_init(Lexer *self) -{ - *self = (Lexer){ - .data = - { - .advance = ts_lexer__advance, - .mark_end = ts_lexer__mark_end, - .get_column = ts_lexer__get_column, - .eof = ts_lexer__eof, - .lookahead = 0, - .result_symbol = 0, - }, - .chunk = NULL, - .chunk_size = 0, - .chunk_start = 0, - .current_position = {0, {0, 0}}, - .logger = {.payload = NULL, .log = NULL}, - .included_ranges = (void *)&DEFAULT_RANGE, - .included_range_count = 1, - .current_included_range_index = 0, - }; -} - -void ts_lexer_set_input(Lexer *self, TSInput input) -{ - self->input = input; - ts_lexer__clear_chunk(self); - ts_lexer_goto(self, self->current_position); -} - -// Move the lexer to the given position. This doesn't do any work -// if the parser is already at the given position. -void ts_lexer_reset(Lexer *self, Length position) -{ - if (position.bytes != self->current_position.bytes) - ts_lexer_goto(self, position); -} - -void ts_lexer_start(Lexer *self) -{ - self->token_start_position = self->current_position; - self->token_end_position = LENGTH_UNDEFINED; - self->data.result_symbol = 0; - self->did_get_column = false; - if (!ts_lexer__eof(&self->data)) - { - if (!self->chunk_size) - ts_lexer__get_chunk(self); - if (!self->lookahead_size) - ts_lexer__get_lookahead(self); - if (self->current_position.bytes == 0 && self->data.lookahead == BYTE_ORDER_MARK) - ts_lexer__advance(&self->data, true); - } -} - -void ts_lexer_finish(Lexer *self, t_u32 *lookahead_end_byte) -{ - if (length_is_undefined(self->token_end_position)) - ts_lexer__mark_end(&self->data); - (void)(lookahead_end_byte); - // If the token ended at an included range boundary, then its end position - // will have been reset to the end of the preceding range. Reset the start - // position to match. - if (self->token_end_position.bytes < self->token_start_position.bytes) - { - self->token_start_position = self->token_end_position; - } -} - -void ts_lexer_advance_to_end(Lexer *self) -{ - while (self->chunk) - ts_lexer__advance(&self->data, false); -} - -void ts_lexer_mark_end(Lexer *self) -{ - ts_lexer__mark_end(&self->data); -} diff --git a/parser/src/lexer/lexer_advance.c b/parser/src/lexer/lexer_advance.c new file mode 100644 index 00000000..fc3f103f --- /dev/null +++ b/parser/src/lexer/lexer_advance.c @@ -0,0 +1,92 @@ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* lexer_advance.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: maiboyer +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2024/08/31 18:06:07 by maiboyer #+# #+# */ +/* Updated: 2024/08/31 18:06:39 by maiboyer ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "me/types.h" +#include "parser/lexer.h" + +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); + +// Intended to be called only from functions that control logging. +void ts_lexer__do_advance(Lexer *self, bool skip) +{ + if (self->lookahead_size) + { + self->current_position.bytes += self->lookahead_size; + if (self->data.lookahead == '\n') + { + self->current_position.extent.row++; + self->current_position.extent.column = 0; + } + else + { + self->current_position.extent.column += self->lookahead_size; + } + } + + const TSRange *current_range = &self->included_ranges[self->current_included_range_index]; + while (self->current_position.bytes >= current_range->end_byte || current_range->end_byte == current_range->start_byte) + { + if (self->current_included_range_index < self->included_range_count) + { + self->current_included_range_index++; + } + if (self->current_included_range_index < self->included_range_count) + { + current_range++; + self->current_position = (Length){ + current_range->start_byte, + current_range->start_point, + }; + } + else + { + current_range = NULL; + break; + } + } + + if (skip) + self->token_start_position = self->current_position; + + if (current_range) + { + if (self->current_position.bytes < self->chunk_start || self->current_position.bytes >= self->chunk_start + self->chunk_size) + { + ts_lexer__get_chunk(self); + } + ts_lexer__get_lookahead(self); + } + else + { + ts_lexer__clear_chunk(self); + self->data.lookahead = '\0'; + self->lookahead_size = 1; + } +} + +// Advance to the next character in the source code, retrieving a new +// chunk of source code if needed. +void ts_lexer__advance(TSLexer *_self, bool skip) +{ + Lexer *self = (Lexer *)_self; + if (!self->chunk) + return; + ts_lexer__do_advance(self, skip); +} diff --git a/parser/src/lexer/lexer_chunk.c b/parser/src/lexer/lexer_chunk.c new file mode 100644 index 00000000..c047a376 --- /dev/null +++ b/parser/src/lexer/lexer_chunk.c @@ -0,0 +1,47 @@ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* lexer_chunk.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: maiboyer +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2024/08/31 18:07:46 by maiboyer #+# #+# */ +/* Updated: 2024/08/31 18:07:52 by maiboyer ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "me/types.h" +#include "parser/lexer.h" + +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); + +// Clear the currently stored chunk of source code, because the lexer's +// position has changed. +void ts_lexer__clear_chunk(Lexer *self) +{ + self->chunk = NULL; + self->chunk_size = 0; + self->chunk_start = 0; +} + +// Call the lexer's input callback to obtain a new chunk of source code +// for the current position. +void ts_lexer__get_chunk(Lexer *self) +{ + self->chunk_start = self->current_position.bytes; + self->chunk = self->input.read(self->input.payload, self->current_position.bytes, self->current_position.extent, &self->chunk_size); + if (!self->chunk_size) + { + self->current_included_range_index = self->included_range_count; + self->chunk = NULL; + } +} diff --git a/parser/src/lexer/lexer_end.c b/parser/src/lexer/lexer_end.c new file mode 100644 index 00000000..9bde7e59 --- /dev/null +++ b/parser/src/lexer/lexer_end.c @@ -0,0 +1,66 @@ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* lexer_end.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: maiboyer +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2024/08/31 18:07:07 by maiboyer #+# #+# */ +/* Updated: 2024/08/31 18:07:21 by maiboyer ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "me/types.h" +#include "parser/lexer.h" + +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); + +// Check if the lexer has reached EOF. This state is stored +// by setting the lexer's `current_included_range_index` such that +// it has consumed all of its available ranges. +bool ts_lexer__eof(const TSLexer *_self) +{ + Lexer *self; + + self = (Lexer *)_self; + return (self->current_included_range_index == self->included_range_count); +} + +// Mark that a token match has completed. This can be called multiple +// times if a longer match is found later. +void ts_lexer__mark_end(TSLexer *_self) +{ + Lexer *self = (Lexer *)_self; + if (!ts_lexer__eof(&self->data)) + { + // If the lexer is right at the beginning of included range, + // then the token should be considered to end at the *end* of the + // previous included range, rather than here. + TSRange *current_included_range = &self->included_ranges[self->current_included_range_index]; + if (self->current_included_range_index > 0 && self->current_position.bytes == current_included_range->start_byte) + { + TSRange *previous_included_range = current_included_range - 1; + self->token_end_position = (Length){ + previous_included_range->end_byte, + previous_included_range->end_point, + }; + return; + } + } + self->token_end_position = self->current_position; +} + +void ts_lexer_advance_to_end(Lexer *self) +{ + while (self->chunk) + ts_lexer__advance(&self->data, false); +} diff --git a/parser/src/lexer/lexer_get_column.c b/parser/src/lexer/lexer_get_column.c new file mode 100644 index 00000000..98201fb0 --- /dev/null +++ b/parser/src/lexer/lexer_get_column.c @@ -0,0 +1,56 @@ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* lexer_get_column.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: maiboyer +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2024/08/31 18:04:55 by maiboyer #+# #+# */ +/* Updated: 2024/08/31 18:05:47 by maiboyer ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "me/types.h" +#include "parser/lexer.h" + +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); + +t_u32 ts_lexer__get_column(TSLexer *_self) +{ + Lexer *self = (Lexer *)_self; + + t_u32 goal_byte = self->current_position.bytes; + + self->did_get_column = true; + self->current_position.bytes -= self->current_position.extent.column; + self->current_position.extent.column = 0; + + if (self->current_position.bytes < self->chunk_start) + { + ts_lexer__get_chunk(self); + } + + t_u32 result = 0; + if (!ts_lexer__eof(_self)) + { + ts_lexer__get_lookahead(self); + while (self->current_position.bytes < goal_byte && self->chunk) + { + result++; + ts_lexer__do_advance(self, false); + if (ts_lexer__eof(_self)) + break; + } + } + + return result; +} diff --git a/parser/src/lexer/lexer_goto.c b/parser/src/lexer/lexer_goto.c new file mode 100644 index 00000000..86c4b2ba --- /dev/null +++ b/parser/src/lexer/lexer_goto.c @@ -0,0 +1,80 @@ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* lexer_goto.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: maiboyer +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2024/08/31 18:08:11 by maiboyer #+# #+# */ +/* Updated: 2024/08/31 18:08:20 by maiboyer ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "me/types.h" +#include "parser/lexer.h" + +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); + +void ts_lexer_goto(Lexer *self, Length position) +{ + bool found_included_range; + TSRange *included_range; + TSRange *last_included_range; + + found_included_range = false; + self->current_position = position; + for (t_u32 i = 0; i < self->included_range_count; i++) + { + included_range = &self->included_ranges[i]; + if (included_range->end_byte > self->current_position.bytes && included_range->end_byte > included_range->start_byte) + { + if (included_range->start_byte >= self->current_position.bytes) + { + self->current_position = (Length){ + .bytes = included_range->start_byte, + .extent = included_range->start_point, + }; + } + + self->current_included_range_index = i; + found_included_range = true; + break; + } + } + if (found_included_range) + { + // If the current position is outside of the current chunk of text, + // then clear out the current chunk of text. + if (self->chunk && + (self->current_position.bytes < self->chunk_start || self->current_position.bytes >= self->chunk_start + self->chunk_size)) + { + ts_lexer__clear_chunk(self); + } + + self->lookahead_size = 0; + self->data.lookahead = '\0'; + } + // If the given position is beyond any of included ranges, move to the EOF + // state - past the end of the included ranges. + else + { + self->current_included_range_index = self->included_range_count; + last_included_range = &self->included_ranges[self->included_range_count - 1]; + self->current_position = (Length){ + .bytes = last_included_range->end_byte, + .extent = last_included_range->end_point, + }; + ts_lexer__clear_chunk(self); + self->lookahead_size = 1; + self->data.lookahead = '\0'; + } +} diff --git a/parser/src/lexer/lexer_lifetime.c b/parser/src/lexer/lexer_lifetime.c new file mode 100644 index 00000000..e7a00c38 --- /dev/null +++ b/parser/src/lexer/lexer_lifetime.c @@ -0,0 +1,88 @@ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* lexer_lifetime.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: maiboyer +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2024/08/31 17:58:01 by maiboyer #+# #+# */ +/* Updated: 2024/08/31 18:03:48 by maiboyer ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "me/types.h" +#include "parser/lexer.h" + +#define BYTE_ORDER_MARK 0xFEFF + +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); + +void ts_lexer_init(Lexer *self) +{ + static TSRange DEFAULT_RANGE = {.start_point = \ + { .row = 0, .column = 0, }, \ + .end_point = { .row = UINT32_MAX, .column = UINT32_MAX, }, \ + .start_byte = 0, .end_byte = UINT32_MAX}; + + *self = (Lexer){ + .data = { + .advance = ts_lexer__advance, + .mark_end = ts_lexer__mark_end, + .get_column = ts_lexer__get_column, + .eof = ts_lexer__eof, + .lookahead = 0, + .result_symbol = 0, }, + .chunk = NULL, .chunk_size = 0, .chunk_start = 0, \ + .current_position = {0, {0, 0}}, + .included_ranges = (void *)&DEFAULT_RANGE, + .included_range_count = 1, + .current_included_range_index = 0, + }; +} + +void ts_lexer_set_input(Lexer *self, TSInput input) +{ + self->input = input; + ts_lexer__clear_chunk(self); + ts_lexer_goto(self, self->current_position); +} + +void ts_lexer_reset(Lexer *self, Length position) +{ + if (position.bytes != self->current_position.bytes) + ts_lexer_goto(self, position); +} + +void ts_lexer_start(Lexer *self) +{ + self->token_start_position = self->current_position; + self->token_end_position = LENGTH_UNDEFINED; + self->data.result_symbol = 0; + self->did_get_column = false; + if (!ts_lexer__eof(&self->data)) + { + if (!self->chunk_size) + ts_lexer__get_chunk(self); + if (!self->lookahead_size) + ts_lexer__get_lookahead(self); + if (self->current_position.bytes == 0 && self->data.lookahead == BYTE_ORDER_MARK) + ts_lexer__advance(&self->data, true); + } +} + +void ts_lexer_finish(Lexer *self, t_u32 *lookahead_end_byte) +{ + if (length_is_undefined(self->token_end_position)) + ts_lexer__mark_end(&self->data); + (void)(lookahead_end_byte); + if (self->token_end_position.bytes < self->token_start_position.bytes) + self->token_start_position = self->token_end_position; +} diff --git a/parser/src/lexer/lexer_lookahead.c b/parser/src/lexer/lexer_lookahead.c new file mode 100644 index 00000000..4df93978 --- /dev/null +++ b/parser/src/lexer/lexer_lookahead.c @@ -0,0 +1,46 @@ +/* ************************************************************************** */ +/* */ +/* ::: :::::::: */ +/* lexer_lookahead.c :+: :+: :+: */ +/* +:+ +:+ +:+ */ +/* By: maiboyer +#+ +:+ +#+ */ +/* +#+#+#+#+#+ +#+ */ +/* Created: 2024/08/31 18:09:03 by maiboyer #+# #+# */ +/* Updated: 2024/08/31 18:09:03 by maiboyer ### ########.fr */ +/* */ +/* ************************************************************************** */ + +#include "me/types.h" +#include "parser/input.h" +#include "parser/lexer.h" + +bool ts_lexer__eof(const TSLexer *_self); +t_u32 ts_lexer__get_column(TSLexer *_self); +void ts_lexer__advance(TSLexer *_self, bool skip); +void ts_lexer__do_advance(Lexer *self, bool skip); +void ts_lexer__clear_chunk(Lexer *self); +void ts_lexer__get_chunk(Lexer *self); +void ts_lexer__get_lookahead(Lexer *self); +void ts_lexer__mark_end(TSLexer *_self); +void ts_lexer_advance_to_end(Lexer *self); +void ts_lexer_goto(Lexer *self, Length position); + +void ts_lexer__get_lookahead(Lexer *self) +{ + t_u32 position_in_chunk; + t_u32 size; + const t_u8 *chunk; + + position_in_chunk = self->current_position.bytes - self->chunk_start; + size = self->chunk_size - position_in_chunk; + if (size == 0) + { + self->lookahead_size = 1; + self->data.lookahead = '\0'; + return; + } + chunk = (const t_u8 *)self->chunk + position_in_chunk; + self->lookahead_size = ts_decode_ascii(chunk, size, &self->data.lookahead); + if (self->data.lookahead == TS_DECODE_ERROR) + self->lookahead_size = 1; +}