From b5556b9063fd17411086d2842bfa0eab5d6e69d1 Mon Sep 17 00:00:00 2001 From: maix0 Date: Mon, 30 Sep 2024 20:25:03 +0200 Subject: [PATCH] update: base of tokenizer --- parser/Filelist.parser.mk | 4 +- parser/include/parser/token.h | 42 +++++++------ parser/src/dollar.c | 36 ----------- parser/src/parentheses.c | 29 --------- parser/src/quotes.c | 59 ----------------- parser/src/token.c | 59 ----------------- parser/src/token_lifetime.c | 69 +++++++++++++++++--- parser/src/tokenizer.c | 115 ++++++++++++++++++++++++++++++++++ sources/_helper_main.c | 3 +- sources/main.c | 18 +++++- 10 files changed, 215 insertions(+), 219 deletions(-) delete mode 100644 parser/src/dollar.c delete mode 100644 parser/src/parentheses.c delete mode 100644 parser/src/quotes.c delete mode 100644 parser/src/token.c create mode 100644 parser/src/tokenizer.c diff --git a/parser/Filelist.parser.mk b/parser/Filelist.parser.mk index 32e1df85..d22139eb 100644 --- a/parser/Filelist.parser.mk +++ b/parser/Filelist.parser.mk @@ -1,8 +1,6 @@ SRC_FILES = \ -dollar \ -quotes \ -token \ token_lifetime \ +tokenizer \ GEN_FILES = \ \ diff --git a/parser/include/parser/token.h b/parser/include/parser/token.h index 36a3b3d9..dbaf637d 100644 --- a/parser/include/parser/token.h +++ b/parser/include/parser/token.h @@ -6,7 +6,7 @@ /* By: maiboyer +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/09/26 17:59:23 by maiboyer #+# #+# */ -/* Updated: 2024/09/29 13:30:06 by rparodi ### ########.fr */ +/* Updated: 2024/09/30 19:47:53 by maiboyer ### ########.fr */ /* */ /* ************************************************************************** */ @@ -18,25 +18,26 @@ enum e_token { - AMP, // ampersand == & - AND, // and == && - CARRET, // any carret == < > << >> - DLCARRET, // double left carret == << - DOLLAR, // dollar == $ - DQUOTE, // double quote string - DRCARRET, // double right carret == >> - EXPENSION, // an expension == $ - LCARRET, // left carret == < - LPAREN, // left parenthesis == ( - NQUOTE, // no
quote string - OR, // or == || - PIPE, // pipe == | - RCARRET, // right carret == > - RPAREN, // right parenthesis == ) - SEMICOLON, // semicolor == ; - SQUOTE, // single quote string - WHITESPACE, // whitespace outside of quoted strings - WORD, // a meta token, which contains subtokens + TOK_NONE, // NO TOKEN TYPE == INVALID / NONEXISTENT TOKEN + TOK_AMP, // ampersand == & + TOK_AND, // and == && + TOK_CARRET, // any carret == < > << >> + TOK_DLCARRET, // double left carret == << + TOK_DOLLAR, // dollar == $ + TOK_DQUOTE, // double quote string + TOK_DRCARRET, // double right carret == >> + TOK_EXPENSION, // an expansion == $ + TOK_LCARRET, // left carret == < + TOK_LPAREN, // left parenthesis == ( + TOK_NQUOTE, // no quote string + TOK_OR, // or == || + TOK_PIPE, // pipe == | + TOK_RCARRET, // right carret == > + TOK_RPAREN, // right parenthesis == ) + TOK_SEMICOLON, // semicolon == ; + TOK_SQUOTE, // single quote string + TOK_WHITESPACE, // whitespace outside of quoted strings + TOK_WORD, // a meta token, which contains subtokens }; typedef struct s_token @@ -51,6 +52,7 @@ typedef struct s_token t_token token_new_meta(enum e_token type); // This create a "simple" token consisting of a string t_token token_new(enum e_token type); +t_token token_new_none(void); void token_free(t_token tok); bool token_is_meta(t_token tok); diff --git a/parser/src/dollar.c b/parser/src/dollar.c deleted file mode 100644 index 2cc2d3b4..00000000 --- a/parser/src/dollar.c +++ /dev/null @@ -1,36 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* dollar.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: rparodi +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/27 22:18:46 by rparodi #+# #+# */ -/* Updated: 2024/09/28 14:50:56 by maiboyer ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/vec/vec_token.h" -#include "parser/token.h" -#include
"me/string/string.h" -#include "me/types.h" -#include - -// MAIX: C'est necessaire de split ca dans une fonction a par vu que ca retourne -// la valeur de la comparaion ? -// Ca serai pas mieux de faire une fonction du genre -// "bool create_single_char_token(char c, t_token *tok)" qui cree un token -// dans `tok` et retourne true si il a match un char qui correspond -// a un token specific (genre $ ou parentheses par example) -/** - * @brief boolean function that's say if it's a dollar or not - * - * @param c character will be checked - * @return true if it's dollar, if not return false - */ -bool is_dollar(char c) -{ - if (c == '$') - return (true); - return (false); -} diff --git a/parser/src/parentheses.c b/parser/src/parentheses.c deleted file mode 100644 index 5856d61d..00000000 --- a/parser/src/parentheses.c +++ /dev/null @@ -1,29 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* parentheses.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: rparodi +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/30 12:25:22 by rparodi #+# #+# */ -/* Updated: 2024/09/30 12:28:26 by rparodi ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/vec/vec_token.h" -#include "parser/token.h" -#include "me/string/string.h" -#include "me/types.h" -#include - -char is_parentheses(char c, char next) -{ - if (c == '(') - { - if (next == '(') - return (2); - else - return (1); - } - return (0); -} diff --git a/parser/src/quotes.c b/parser/src/quotes.c deleted file mode 100644 index 6c38a94a..00000000 --- a/parser/src/quotes.c +++ /dev/null @@ -1,59 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* quotes.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: rparodi +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/27 11:46:45 by rparodi #+# #+# */ -/* Updated: 2024/09/30 
12:28:26 by rparodi ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "me/vec/vec_token.h" -#include "parser/token.h" -#include "me/string/string.h" -#include "me/types.h" -#include - -/** - * @brief boolean function that's say if it's a quote or not - * - * @param c character will be checked - * @return true if it's quote, if not return false - */ -bool is_quote(char c) -{ - if (c == '"' || c == '\'') - return (true); - return (false); -} - -/** - * @brief token function that's read the string and return the tokens - * - * @param raw the input from stdin - * @param start the index where the first quote was found - * @param output the token of the string - * @return Check if there is an error on this function - */ -t_error find_end_string(t_const_str raw, t_usize *start, t_token *output) -{ - if (!raw || !output) - return (ERROR); - if (is_quote(raw[(*start)])) - { - string_push_char(&output->string, raw[(*start)]); - (*start)++; - if (raw[(*start)] == '\0') - return (ERROR); - while (raw[(*start)] != '\0') - { - string_push_char(&output->string, raw[(*start)]); - if (is_quote(raw[(*start)])) - return (NO_ERROR); - (*start)++; - } - } - return (ERROR); -} diff --git a/parser/src/token.c b/parser/src/token.c deleted file mode 100644 index 37a8ae9b..00000000 --- a/parser/src/token.c +++ /dev/null @@ -1,59 +0,0 @@ -/* ************************************************************************** */ -/* */ -/* ::: :::::::: */ -/* token.c :+: :+: :+: */ -/* +:+ +:+ +:+ */ -/* By: rparodi +#+ +:+ +#+ */ -/* +#+#+#+#+#+ +#+ */ -/* Created: 2024/09/25 16:27:03 by rparodi #+# #+# */ -/* Updated: 2024/09/29 15:24:11 by rparodi ### ########.fr */ -/* */ -/* ************************************************************************** */ - -#include "parser/token.h" -#include "me/string/string.h" -#include "me/types.h" -#include "me/char/char.h" -#include "me/vec/vec_token.h" -#include -#include -#include 
"me/mem/mem.h"
-
-// MAIX: tu peux faire un token par character "whitespace", vu qu'on va
-// manipuler la list de token apres pour faire des truc plus simple a process
-// on se debrouillera pour plus avoir plein de token whitespace :)
-t_error start_analyse(t_const_str raw, t_vec_token *output)
-{
-	t_usize i;
-	t_token token;
-
-	if (!raw || !output)
-		return (ERROR);
-	i = 0;
-	while (raw[i] != '\0')
-	{
-		if (me_isspace(raw[i]))
-			token = token_new(WHITESPACE);
-		if (is_quote(raw[i]))
-			find_end_string(raw, &i, &token);
-		if (is_dollar(raw[i]))
-			token = token_new(DOLLAR);
-		vec_token_push(output, token);
-		i++;
-	}
-	return (NO_ERROR);
-}
-
-// MAIX: attention tu ne fais rien avec le vec_token ici :D
-// aussi l'argument list est censer faire quoi ?
-// c'est un reste d'une version ancienne ?
-t_error tokeniser(t_const_str raw)
-{
-	t_vec_token output;
-
-	if (!raw)
-		return (ERROR);
-	output = vec_token_new(16, NULL);
-	start_analyse(raw, &output);
-	return (NO_ERROR);
-}
diff --git a/parser/src/token_lifetime.c b/parser/src/token_lifetime.c
index 226fb959..4ff16b4b 100644
--- a/parser/src/token_lifetime.c
+++ b/parser/src/token_lifetime.c
@@ -6,14 +6,14 @@
 /* By: maiboyer +#+ +:+ +#+ */
 /* +#+#+#+#+#+ +#+ */
 /* Created: 2024/09/28 14:37:13 by maiboyer #+# #+# */
-/* Updated: 2024/09/28 15:24:06 by rparodi ### ########.fr */
+/* Updated: 2024/09/30 20:15:05 by maiboyer ### ########.fr */
 /* */
 /* ************************************************************************** */
 
 #include "me/vec/vec_token.h"
 #include "parser/token.h"
 
-void	token_free(t_token tok)
+void token_free(t_token tok)
 {
 	if (tok.string.buf != NULL)
 		string_free(tok.string);
@@ -21,21 +21,88 @@ void token_free(t_token tok)
 		vec_token_free(tok.subtokens);
 }
 
-t_token	token_new(enum e_token type)
+/**
+ * @brief create a `type` token with a string from string_new(16) and no subtokens
+ */
+t_token token_new(enum e_token type)
 {
-	return ((t_token){.type = type, .string = string_new(16), \
-		.subtokens = {NULL, 0, 0, NULL}});
+	return ((t_token){.type = type, .string = string_new(16), .subtokens = {NULL, 0, 0, NULL}});
 }
 
-t_token	token_new_meta(enum e_token type)
+/**
+ * @brief create a `type` token with no string and a subtokens vector of its own
+ */
+t_token token_new_meta(enum e_token type)
 {
-	return ((t_token){.type = type, .string = {NULL, 0, 0}, \
-		.subtokens = vec_token_new(16, token_free)});
+	return ((t_token){.type = type, .string = {NULL, 0, 0}, .subtokens = vec_token_new(16, token_free)});
 }
 
-bool	token_is_meta(t_token tok)
+/**
+ * @brief tell whether `tok` is a meta token; only TOK_WORD is one
+ */
+bool token_is_meta(t_token tok)
 {
-	if (tok.type == WORD)
+	if (tok.type == TOK_WORD)
 		return (true);
 	return (false);
 }
+
+/**
+ * @brief create the TOK_NONE placeholder token
+ *
+ * It owns neither a string nor a subtokens vector, so a placeholder can be
+ * overwritten or dropped without being freed.
+ */
+t_token token_new_none(void)
+{
+	return ((t_token){.type = TOK_NONE, .string = {NULL, 0, 0}, .subtokens = {NULL, 0, 0, NULL}});
+}
+
+// TO REMOVE
+/**
+ * @brief debug helper: name of the token's type, NULL if the type is unknown
+ */
+t_str token_name(t_token *token)
+{
+	if (token->type == TOK_NONE)
+		return ("NONE");
+	if (token->type == TOK_AMP)
+		return ("AMP");
+	if (token->type == TOK_AND)
+		return ("AND");
+	if (token->type == TOK_CARRET)
+		return ("CARRET");
+	if (token->type == TOK_DLCARRET)
+		return ("DLCARRET");
+	if (token->type == TOK_DOLLAR)
+		return ("DOLLAR");
+	if (token->type == TOK_DQUOTE)
+		return ("DQUOTE");
+	if (token->type == TOK_DRCARRET)
+		return ("DRCARRET");
+	if (token->type == TOK_EXPENSION)
+		return ("EXPENSION");
+	if (token->type == TOK_LCARRET)
+		return ("LCARRET");
+	if (token->type == TOK_LPAREN)
+		return ("LPAREN");
+	if (token->type == TOK_NQUOTE)
+		return ("NQUOTE");
+	if (token->type == TOK_OR)
+		return ("OR");
+	if (token->type == TOK_PIPE)
+		return ("PIPE");
+	if (token->type == TOK_RCARRET)
+		return ("RCARRET");
+	if (token->type == TOK_RPAREN)
+		return ("RPAREN");
+	if (token->type == TOK_SEMICOLON)
+		return ("SEMICOLON");
+	if (token->type == TOK_SQUOTE)
+		return ("SQUOTE");
+	if (token->type == TOK_WHITESPACE)
+		return ("WHITESPACE");
+	if (token->type == TOK_WORD)
+		return ("WORD");
+	return (NULL);
+}
diff --git a/parser/src/tokenizer.c b/parser/src/tokenizer.c
new file mode 100644
index 00000000..d7002676
--- /dev/null
+++ b/parser/src/tokenizer.c
@@ -0,0 +1,115 @@
+/*
************************************************************************** */
+/* */
+/* ::: :::::::: */
+/* tokenizer.c :+: :+: :+: */
+/* +:+ +:+ +:+ */
+/* By: maiboyer +#+ +:+ +#+ */
+/* +#+#+#+#+#+ +#+ */
+/* Created: 2024/09/30 19:39:39 by maiboyer #+# #+# */
+/* Updated: 2024/09/30 20:19:06 by maiboyer ### ########.fr */
+/* */
+/* ************************************************************************** */
+
+#include "me/char/char.h"
+#include "me/string/string.h"
+#include "me/types.h"
+#include "me/vec/vec_token.h"
+#include "parser/token.h"
+
+/**
+ * @brief push the pending token (if any), reset it to a TOK_NONE
+ *        placeholder, then push a new `ttype` token holding `s`
+ */
+static void push_token_and_create_new(t_vec_token *tokens, t_token *tok, enum e_token ttype, t_const_str s)
+{
+	t_token tmp;
+	if (tok->type != TOK_NONE)
+		vec_token_push(tokens, *tok);
+	*tok = token_new_none();
+	tmp = token_new(ttype);
+	string_push(&tmp.string, s);
+	vec_token_push(tokens, tmp);
+}
+
+/**
+ * @brief handle one character read outside of any quote
+ * @return the quote character if `c` opens a quote, '\0' otherwise
+ */
+static char tokenize_unquoted(t_vec_token *ret, t_token *tok, char c)
+{
+	if (c == '\"' || c == '\'')
+	{
+		// the quote token stays pending in *tok until its closing quote
+		if (tok->type != TOK_NONE)
+			vec_token_push(ret, *tok);
+		if (c == '\"')
+			*tok = token_new(TOK_DQUOTE);
+		else
+			*tok = token_new(TOK_SQUOTE);
+		return (c);
+	}
+	if (c == '$')
+		push_token_and_create_new(ret, tok, TOK_DOLLAR, "$");
+	else if (c == '>')
+		push_token_and_create_new(ret, tok, TOK_RCARRET, ">");
+	else if (c == '<')
+		push_token_and_create_new(ret, tok, TOK_LCARRET, "<");
+	else if (c == '&')
+		push_token_and_create_new(ret, tok, TOK_AMP, "&");
+	else if (c == '|')
+		push_token_and_create_new(ret, tok, TOK_PIPE, "|");
+	else if (c == '(')
+		push_token_and_create_new(ret, tok, TOK_LPAREN, "(");
+	else if (c == ')')
+		push_token_and_create_new(ret, tok, TOK_RPAREN, ")");
+	else if (c == ';')
+		push_token_and_create_new(ret, tok, TOK_SEMICOLON, ";");
+	else if (me_isspace(c))
+		push_token_and_create_new(ret, tok, TOK_WHITESPACE, " ");
+	else
+	{
+		if (tok->type == TOK_NONE)
+			*tok = token_new(TOK_NQUOTE);
+		string_push_char(&tok->string, c);
+	}
+	return ('\0');
+}
+
+/**
+ * @brief split `s` into operator, whitespace, quoted and unquoted tokens
+ * @param out receives the token vector, written only on success
+ * @return NO_ERROR, or ERROR if an argument is NULL or a quote is left open
+ */
+t_error tokenize(t_const_str s, t_vec_token *out)
+{
+	t_usize i;
+	char quote;
+	t_vec_token ret;
+	t_token tok;
+
+	if (s == NULL || out == NULL)
+		return (ERROR);
+	i = 0;
+	quote = '\0';
+	tok = token_new_none();
+	ret = vec_token_new(16, token_free);
+	while (s[i] != '\0')
+	{
+		if (quote == '\0')
+			quote = tokenize_unquoted(&ret, &tok, s[i]);
+		else if (s[i] != quote)
+			string_push_char(&tok.string, s[i]);
+		else
+		{
+			quote = '\0';
+			vec_token_push(&ret, tok);
+			tok = token_new_none();
+		}
+		i++;
+	}
+	if (quote != '\0')
+		return (token_free(tok), vec_token_free(ret), ERROR);
+	if (tok.type != TOK_NONE)
+		vec_token_push(&ret, tok);
+	return (*out = ret, NO_ERROR);
+}
diff --git a/sources/_helper_main.c b/sources/_helper_main.c
index 3d69b9ac..c5192216 100644
--- a/sources/_helper_main.c
+++ b/sources/_helper_main.c
@@ -6,7 +6,7 @@
 /* By: rparodi +#+ +:+ +#+ */
 /* +#+#+#+#+#+ +#+ */
 /* Created: 2024/09/06 16:31:41 by rparodi #+# #+# */
-/* Updated: 2024/09/26 18:14:19 by maiboyer ### ########.fr */
+/* Updated: 2024/09/30 20:06:27 by maiboyer ### ########.fr */
 /* */
 /* ************************************************************************** */
 
@@ -47,6 +47,7 @@ t_error get_user_input(t_state *state)
 		}
 	}
 	line_edit_stop(&lstate);
+	printf("state->str_input = %s\n", state->str_input);
 	return (NO_ERROR);
 }
 
diff --git a/sources/main.c b/sources/main.c
index 1c69493d..2d488cf5 100644
--- a/sources/main.c
+++ b/sources/main.c
@@ -6,7 +6,7 @@
 /* By: rparodi +#+ +:+ +#+ */
 /* +#+#+#+#+#+ +#+ */
 /* Created: 2024/03/28 14:40:38 by rparodi #+# #+# */
-/* Updated: 2024/09/26 18:14:59 by maiboyer ### ########.fr */
+/* Updated: 2024/09/30 20:11:12 by maiboyer ### ########.fr */
 /* */
 /*
************************************************************************** */
 
@@ -22,6 +22,8 @@
 #include "me/str/str.h"
 #include "me/string/string.h"
 #include "me/types.h"
+#include "me/vec/vec_str.h"
+#include "me/vec/vec_token.h"
 #include
 #include
 
@@ -97,9 +99,32 @@ void print_node_data(t_node *t, t_usize depth)
 }
 */
 
-void parse_str(t_state *state)
+t_str token_name(t_token *out);
+
+/**
+ * @brief debug callback given to vec_token_iter: print one token's name and text
+ */
+void func(t_usize i, t_token *token, void *state)
 {
 	(void)(state);
+	(void)(i);
+	printf("%s => %s\n", token_name(token), token->string.buf);
+}
+
+t_error tokenize(t_const_str s, t_vec_token *out);
+
+/**
+ * @brief tokenize the current input line, print every token, then free them
+ *
+ * @param state shell state; state->str_input is the line handed to tokenize
+ */
+void parse_str(t_state *state)
+{
+	t_vec_token tokens;
+	if (tokenize(state->str_input, &tokens))
+		return ;
+	vec_token_iter(&tokens, func, NULL);
+	vec_token_free(tokens);
 }
 
 t_i32 main(t_i32 argc, t_str argv[], t_str envp[])