update: base of tokenizer

This commit is contained in:
maix0 2024-09-30 20:25:03 +02:00
parent 24d8bf5fc9
commit b5556b9063
10 changed files with 215 additions and 219 deletions

View file

@ -1,8 +1,6 @@
SRC_FILES = \ SRC_FILES = \
dollar \
quotes \
token \
token_lifetime \ token_lifetime \
tokenizer \
GEN_FILES = \ GEN_FILES = \
\ \

View file

@ -6,7 +6,7 @@
/* By: maiboyer <maiboyer@student.42.fr> +#+ +:+ +#+ */ /* By: maiboyer <maiboyer@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */ /* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/26 17:59:23 by maiboyer #+# #+# */ /* Created: 2024/09/26 17:59:23 by maiboyer #+# #+# */
/* Updated: 2024/09/29 13:30:06 by rparodi ### ########.fr */ /* Updated: 2024/09/30 19:47:53 by maiboyer ### ########.fr */
/* */ /* */
/* ************************************************************************** */ /* ************************************************************************** */
@ -18,25 +18,26 @@
enum e_token enum e_token
{ {
AMP, // ampersand == & TOK_NONE, // NO TOKEN TYPE == INVALID / INEXISTANT TOKEN
AND, // and == && TOK_AMP, // ampersand == &
CARRET, // any carret == < > << >> TOK_AND, // and == &&
DLCARRET, // double left carret == << TOK_CARRET, // any carret == < > << >>
DOLLAR, // dollar == $ TOK_DLCARRET, // double left carret == <<
DQUOTE, // double quote string TOK_DOLLAR, // dollar == $
DRCARRET, // double right carret == >> TOK_DQUOTE, // double quote string
EXPENSION, // an expension == $<no_quote_word> TOK_DRCARRET, // double right carret == >>
LCARRET, // left carret == < TOK_EXPENSION, // an expension == $<no_quote_word>
LPAREN, // left parenthesis == ( TOK_LCARRET, // left carret == <
NQUOTE, // no quote string TOK_LPAREN, // left parenthesis == (
OR, // or == || TOK_NQUOTE, // no quote string
PIPE, // pipe == | TOK_OR, // or == ||
RCARRET, // right carret == > TOK_PIPE, // pipe == |
RPAREN, // right parenthesis == ) TOK_RCARRET, // right carret == >
SEMICOLON, // semicolor == ; TOK_RPAREN, // right parenthesis == )
SQUOTE, // single quote string TOK_SEMICOLON, // semicolor == ;
WHITESPACE, // whitespace outside of quoted strings TOK_SQUOTE, // single quote string
WORD, // a meta token, which contains subtokens TOK_WHITESPACE, // whitespace outside of quoted strings
TOK_WORD, // a meta token, which contains subtokens
}; };
typedef struct s_token typedef struct s_token
@ -51,6 +52,7 @@ typedef struct s_token
t_token token_new_meta(enum e_token type); t_token token_new_meta(enum e_token type);
// This create a "simple" token consisting of a string // This create a "simple" token consisting of a string
t_token token_new(enum e_token type); t_token token_new(enum e_token type);
t_token token_new_none(void);
void token_free(t_token tok); void token_free(t_token tok);
bool token_is_meta(t_token tok); bool token_is_meta(t_token tok);

View file

@ -1,36 +0,0 @@
/* ************************************************************************** */
/* */
/* ::: :::::::: */
/* dollar.c :+: :+: :+: */
/* +:+ +:+ +:+ */
/* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/27 22:18:46 by rparodi #+# #+# */
/* Updated: 2024/09/28 14:50:56 by maiboyer ### ########.fr */
/* */
/* ************************************************************************** */
#include "me/vec/vec_token.h"
#include "parser/token.h"
#include "me/string/string.h"
#include "me/types.h"
#include <stdio.h>
// MAIX: Is it really necessary to split this into its own function, given
// that it only returns the result of the comparison?
// Wouldn't it be better to write something like
// "bool create_single_char_token(char c, t_token *tok)" that builds a token
// in `tok` and returns true when it matched a character that maps to a
// specific token (for example $ or a parenthesis)?
/**
 * @brief Tells whether a character is the dollar sign.
 *
 * @param c character to test
 * @return true when `c` is '$', false for any other character
 */
bool	is_dollar(char c)
{
	return (c == '$');
}

View file

@ -1,29 +0,0 @@
/* ************************************************************************** */
/* */
/* ::: :::::::: */
/* parentheses.c :+: :+: :+: */
/* +:+ +:+ +:+ */
/* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/30 12:25:22 by rparodi #+# #+# */
/* Updated: 2024/09/30 12:28:26 by rparodi ### ########.fr */
/* */
/* ************************************************************************** */
#include "me/vec/vec_token.h"
#include "parser/token.h"
#include "me/string/string.h"
#include "me/types.h"
#include <stdio.h>
/**
 * @brief Counts the opening parentheses found at the current position.
 *
 * @param c current character
 * @param next character that follows `c`
 * @return 0 when `c` is not '(', 1 when only `c` is '(', 2 when both `c`
 * and `next` are '('
 */
char	is_parentheses(char c, char next)
{
	if (c != '(')
		return (0);
	if (next != '(')
		return (1);
	return (2);
}

View file

@ -1,59 +0,0 @@
/* ************************************************************************** */
/* */
/* ::: :::::::: */
/* quotes.c :+: :+: :+: */
/* +:+ +:+ +:+ */
/* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/27 11:46:45 by rparodi #+# #+# */
/* Updated: 2024/09/30 12:28:26 by rparodi ### ########.fr */
/* */
/* ************************************************************************** */
#include "me/vec/vec_token.h"
#include "parser/token.h"
#include "me/string/string.h"
#include "me/types.h"
#include <stdio.h>
/**
 * @brief Tells whether a character is a quote character.
 *
 * @param c character to test
 * @return true when `c` is a single or a double quote, false otherwise
 */
bool	is_quote(char c)
{
	return (c == '\'' || c == '"');
}
/**
 * @brief Copies a quoted string, quotes included, into a token.
 *
 * The character at `raw[*start]` must be a single or double quote. Every
 * character up to and including the matching closing quote is appended to
 * `output->string`. Only the same quote character that opened the string
 * closes it, so `"it's"` is read as one string.
 *
 * @param raw the input from stdin
 * @param start in: index of the opening quote; out: index of the closing
 * quote on success, index of the terminating NUL when the quote is never
 * closed
 * @param output token whose string receives the characters; it must
 * already hold a usable string
 * @return NO_ERROR when the closing quote was found, ERROR on a NULL
 * argument, when `raw[*start]` is not a quote, or when the string is
 * unterminated
 */
t_error	find_end_string(t_const_str raw, t_usize *start, t_token *output)
{
	char	opening;

	if (!raw || !start || !output)
		return (ERROR);
	opening = raw[*start];
	if (opening != '"' && opening != '\'')
		return (ERROR);
	string_push_char(&output->string, opening);
	(*start)++;
	while (raw[*start] != '\0')
	{
		string_push_char(&output->string, raw[*start]);
		// only the quote that opened the string may close it
		if (raw[*start] == opening)
			return (NO_ERROR);
		(*start)++;
	}
	return (ERROR);
}

View file

@ -1,59 +0,0 @@
/* ************************************************************************** */
/* */
/* ::: :::::::: */
/* token.c :+: :+: :+: */
/* +:+ +:+ +:+ */
/* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/25 16:27:03 by rparodi #+# #+# */
/* Updated: 2024/09/29 15:24:11 by rparodi ### ########.fr */
/* */
/* ************************************************************************** */
#include "parser/token.h"
#include "me/string/string.h"
#include "me/types.h"
#include "me/char/char.h"
#include "me/vec/vec_token.h"
#include <stdio.h>
#include <stdbool.h>
#include "me/mem/mem.h"
// MAIX: you can emit one token per "whitespace" character: the token list is
// post-processed afterwards into something simpler to handle, so we will take
// care of collapsing the runs of whitespace tokens there :)
/**
 * @brief Splits the raw input into a flat list of tokens.
 *
 * Each iteration builds exactly one fully initialised token and pushes it:
 * a quoted string (quotes included), a whitespace token, a dollar token, or
 * a one-character unquoted token for anything else.
 *
 * @param raw the input from stdin
 * @param output vector that receives the tokens
 * @return NO_ERROR on success, ERROR on a NULL argument or on an
 * unterminated quoted string (tokens pushed so far stay in `output`)
 */
t_error	start_analyse(t_const_str raw, t_vec_token *output)
{
	t_usize	i;
	t_token	token;

	if (!raw || !output)
		return (ERROR);
	i = 0;
	while (raw[i] != '\0')
	{
		if (is_quote(raw[i]))
		{
			// the token must exist before find_end_string() fills it
			if (raw[i] == '"')
				token = token_new(DQUOTE);
			else
				token = token_new(SQUOTE);
			if (find_end_string(raw, &i, &token) != NO_ERROR)
			{
				token_free(token);
				return (ERROR);
			}
		}
		else if (me_isspace(raw[i]))
			token = token_new(WHITESPACE);
		else if (is_dollar(raw[i]))
			token = token_new(DOLLAR);
		else
		{
			// NOTE(review): ordinary characters used to push an
			// uninitialised token; one NQUOTE token per character mirrors
			// the whitespace policy above -- confirm this is what is wanted
			token = token_new(NQUOTE);
			string_push_char(&token.string, raw[i]);
		}
		vec_token_push(output, token);
		i++;
	}
	return (NO_ERROR);
}
// MAIX: careful, nothing is done with the vec_token here :D
// also, what is the `list` argument supposed to do?
// is it a leftover from an older version?
/**
 * @brief Tokenises the raw input and reports whether it could be analysed.
 *
 * The token vector is local to this function: it is created with
 * `token_free` as element destructor and released before returning, so
 * nothing is leaked while the tokens are not handed to a caller yet.
 *
 * @param raw the input from stdin
 * @return ERROR when `raw` is NULL or the analysis failed, NO_ERROR
 * otherwise
 */
t_error	tokeniser(t_const_str raw)
{
	t_vec_token	output;
	t_error		status;

	if (!raw)
		return (ERROR);
	output = vec_token_new(16, token_free);
	status = start_analyse(raw, &output);
	vec_token_free(output);
	return (status);
}

View file

@ -6,7 +6,7 @@
/* By: maiboyer <maiboyer@student.42.fr> +#+ +:+ +#+ */ /* By: maiboyer <maiboyer@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */ /* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/28 14:37:13 by maiboyer #+# #+# */ /* Created: 2024/09/28 14:37:13 by maiboyer #+# #+# */
/* Updated: 2024/09/28 15:24:06 by rparodi ### ########.fr */ /* Updated: 2024/09/30 20:15:05 by maiboyer ### ########.fr */
/* */ /* */
/* ************************************************************************** */ /* ************************************************************************** */
@ -23,19 +23,68 @@ void token_free(t_token tok)
t_token token_new(enum e_token type) t_token token_new(enum e_token type)
{ {
return ((t_token){.type = type, .string = string_new(16), \ return ((t_token){.type = type, .string = string_new(16), .subtokens = {NULL, 0, 0, NULL}});
.subtokens = {NULL, 0, 0, NULL}});
} }
t_token token_new_meta(enum e_token type) t_token token_new_meta(enum e_token type)
{ {
return ((t_token){.type = type, .string = {NULL, 0, 0}, \ return ((t_token){.type = type, .string = {NULL, 0, 0}, .subtokens = vec_token_new(16, token_free)});
.subtokens = vec_token_new(16, token_free)});
} }
bool token_is_meta(t_token tok) bool token_is_meta(t_token tok)
{ {
if (tok.type == WORD) if (tok.type == TOK_WORD)
return (true); return (true);
return (false); return (false);
} }
/**
 * @brief Builds the placeholder token meaning "no token yet".
 *
 * The placeholder owns nothing: both its string and its sub-token vector
 * are empty, like the unused half of the tokens built by token_new() and
 * token_new_meta(). Callers routinely overwrite a TOK_NONE token without
 * freeing it, so it must not carry a sub-token vector of its own.
 *
 * @return a token of type TOK_NONE with no string and no sub-tokens
 */
t_token	token_new_none(void)
{
	return ((t_token){.type = TOK_NONE, .string = {NULL, 0, 0},
		.subtokens = {NULL, 0, 0, NULL}});
}
// TO REMOVE
/**
 * @brief Gives the printable name of a token's type (debug helper).
 *
 * @param token token to describe, must not be NULL
 * @return the type name without its TOK_ prefix, or NULL when the type is
 * not one of the known enumerators
 */
t_str	token_name(t_token *token)
{
	static t_str const	names[] = {
	[TOK_NONE] = "NONE",
	[TOK_AMP] = "AMP",
	[TOK_AND] = "AND",
	[TOK_CARRET] = "CARRET",
	[TOK_DLCARRET] = "DLCARRET",
	[TOK_DOLLAR] = "DOLLAR",
	[TOK_DQUOTE] = "DQUOTE",
	[TOK_DRCARRET] = "DRCARRET",
	[TOK_EXPENSION] = "EXPENSION",
	[TOK_LCARRET] = "LCARRET",
	[TOK_LPAREN] = "LPAREN",
	[TOK_NQUOTE] = "NQUOTE",
	[TOK_OR] = "OR",
	[TOK_PIPE] = "PIPE",
	[TOK_RCARRET] = "RCARRET",
	[TOK_RPAREN] = "RPAREN",
	[TOK_SEMICOLON] = "SEMICOLON",
	[TOK_SQUOTE] = "SQUOTE",
	[TOK_WHITESPACE] = "WHITESPACE",
	[TOK_WORD] = "WORD",
	};
	t_usize				index;

	index = (t_usize)token->type;
	// anything outside the table is an unknown type
	if (index >= sizeof(names) / sizeof(names[0]))
		return (NULL);
	return (names[index]);
}

115
parser/src/tokenizer.c Normal file
View file

@ -0,0 +1,115 @@
/* ************************************************************************** */
/* */
/* ::: :::::::: */
/* tokenizer.c :+: :+: :+: */
/* +:+ +:+ +:+ */
/* By: maiboyer <maiboyer@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/30 19:39:39 by maiboyer #+# #+# */
/* Updated: 2024/09/30 20:19:06 by maiboyer ### ########.fr */
/* */
/* ************************************************************************** */
#include "me/char/char.h"
#include "me/string/string.h"
#include "me/types.h"
#include "me/vec/vec_token.h"
#include "parser/token.h"
/**
 * @brief Flushes the pending token, then emits a complete token at once.
 *
 * @param tokens vector receiving the tokens
 * @param tok pending token; pushed to `tokens` unless it is TOK_NONE, and
 * reset to a fresh TOK_NONE placeholder in every case
 * @param ttype type of the token to emit
 * @param s text stored in the emitted token
 */
static void	push_token_and_create_new(t_vec_token *tokens, t_token *tok,
				enum e_token ttype, t_const_str s)
{
	t_token	emitted;

	// the pending token has to land in the vector before the new one
	if (tok->type != TOK_NONE)
		vec_token_push(tokens, *tok);
	*tok = token_new_none();
	emitted = token_new(ttype);
	string_push(&emitted.string, s);
	vec_token_push(tokens, emitted);
}
t_error tokenize(t_const_str s, t_vec_token *out)
{
t_usize i;
char quote;
t_vec_token ret;
t_token tok;
if (s == NULL || out == NULL)
return (ERROR);
i = 0;
quote = '\0';
tok = token_new_none();
ret = vec_token_new(16, token_free);
while (s[i] != '\0')
{
if (quote == '\0')
{
quote = s[i];
if (s[i] == '\"')
push_token_and_create_new(&ret, &tok, TOK_DQUOTE, "");
else if (s[i] == '\'')
push_token_and_create_new(&ret, &tok, TOK_SQUOTE, "");
else
{
quote = '\0';
if (s[i] == '$')
push_token_and_create_new(&ret, &tok, TOK_DOLLAR, "$");
else if (s[i] == '>')
push_token_and_create_new(&ret, &tok, TOK_RCARRET, ">");
else if (s[i] == '<')
push_token_and_create_new(&ret, &tok, TOK_LCARRET, "<");
else if (s[i] == '&')
push_token_and_create_new(&ret, &tok, TOK_AMP, "&");
else if (s[i] == '|')
push_token_and_create_new(&ret, &tok, TOK_PIPE, "|");
else if (s[i] == '(')
push_token_and_create_new(&ret, &tok, TOK_LPAREN, "(");
else if (s[i] == ')')
push_token_and_create_new(&ret, &tok, TOK_RPAREN, ")");
else if (s[i] == ';')
push_token_and_create_new(&ret, &tok, TOK_RPAREN, ";");
else if (me_isspace(s[i]))
push_token_and_create_new(&ret, &tok, TOK_WHITESPACE, " ");
else
{
if (tok.type == TOK_NONE)
tok = token_new(TOK_NQUOTE);
string_push_char(&tok.string, s[i]);
}
}
}
else if (quote == '\'')
{
if (s[i] == '\'')
{
quote = '\0';
if (tok.type != TOK_NONE)
vec_token_push(&ret, tok);
tok = token_new_none();
}
else
string_push_char(&tok.string, s[i]);
}
else if (quote == '\"')
{
if (s[i] == '\"')
{
quote = '\0';
if (tok.type != TOK_NONE)
vec_token_push(&ret, tok);
tok = token_new_none();
}
else
string_push_char(&tok.string, s[i]);
}
else
me_abort("invalid quote type");
i++;
}
if (tok.type == TOK_NQUOTE)
vec_token_push(&ret, tok);
if (tok.type == TOK_NQUOTE || tok.type == TOK_NONE)
return (*out = ret, NO_ERROR);
else
return (vec_token_free(ret), ERROR);
}

View file

@ -6,7 +6,7 @@
/* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */ /* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */ /* +#+#+#+#+#+ +#+ */
/* Created: 2024/09/06 16:31:41 by rparodi #+# #+# */ /* Created: 2024/09/06 16:31:41 by rparodi #+# #+# */
/* Updated: 2024/09/26 18:14:19 by maiboyer ### ########.fr */ /* Updated: 2024/09/30 20:06:27 by maiboyer ### ########.fr */
/* */ /* */
/* ************************************************************************** */ /* ************************************************************************** */
@ -47,6 +47,7 @@ t_error get_user_input(t_state *state)
} }
} }
line_edit_stop(&lstate); line_edit_stop(&lstate);
printf("state->str_input = %s\n", state->str_input);
return (NO_ERROR); return (NO_ERROR);
} }

View file

@ -6,7 +6,7 @@
/* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */ /* By: rparodi <rparodi@student.42.fr> +#+ +:+ +#+ */
/* +#+#+#+#+#+ +#+ */ /* +#+#+#+#+#+ +#+ */
/* Created: 2024/03/28 14:40:38 by rparodi #+# #+# */ /* Created: 2024/03/28 14:40:38 by rparodi #+# #+# */
/* Updated: 2024/09/26 18:14:59 by maiboyer ### ########.fr */ /* Updated: 2024/09/30 20:11:12 by maiboyer ### ########.fr */
/* */ /* */
/* ************************************************************************** */ /* ************************************************************************** */
@ -22,6 +22,8 @@
#include "me/str/str.h" #include "me/str/str.h"
#include "me/string/string.h" #include "me/string/string.h"
#include "me/types.h" #include "me/types.h"
#include "me/vec/vec_str.h"
#include "me/vec/vec_token.h"
#include <errno.h> #include <errno.h>
#include <sys/types.h> #include <sys/types.h>
@ -97,9 +99,21 @@ void print_node_data(t_node *t, t_usize depth)
} }
*/ */
void parse_str(t_state *state) t_str token_name(t_token *out);
void func(t_usize i, t_token *token, void *state)
{ {
(void)(state); (void)(state);
(void)(i);
printf("%s => %s\n", token_name(token), token->string.buf);
}
t_error tokenize(t_const_str s, t_vec_token *out);
void parse_str(t_state *state)
{
t_vec_token tokens;
if (tokenize(state->str_input, &tokens))
return ;
vec_token_iter(&tokens, func, NULL);
} }
t_i32 main(t_i32 argc, t_str argv[], t_str envp[]) t_i32 main(t_i32 argc, t_str argv[], t_str envp[])