update: base of tokenizer

2024-09-30 20:25:03 +02:00 · 2024-09-30 20:25:03 +02:00 · b5556b9063
commit b5556b9063
parent 24d8bf5fc9
10 changed files with 215 additions and 219 deletions
--- a/parser/Filelist.parser.mk
+++ b/parser/Filelist.parser.mk
@ -1,8 +1,6 @@
 SRC_FILES =                                                                   \
-dollar                                                                        \
-quotes                                                                        \
-token                                                                         \
 token_lifetime                                                                \
+tokenizer                                                                     \

 GEN_FILES =                                                                   \
                                                                              \
--- a/parser/include/parser/token.h
+++ b/parser/include/parser/token.h
@ -6,7 +6,7 @@
 /*   By: maiboyer <maiboyer@student.42.fr>          +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/09/26 17:59:23 by maiboyer          #+#    #+#             */
-/*   Updated: 2024/09/29 13:30:06 by rparodi          ###   ########.fr       */
+/*   Updated: 2024/09/30 19:47:53 by maiboyer         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */

@ -18,25 +18,26 @@

+// Kinds of token handled by the parser; TOK_NONE marks "no token".
 enum e_token
 {
-	AMP,		// ampersand == &
-	AND,		// and == &&
-	CARRET,		// any carret == < > << >>
-	DLCARRET,	// double left carret == <<
-	DOLLAR,		// dollar == $
-	DQUOTE,		// double quote string
-	DRCARRET,	// double right carret == >>
-	EXPENSION,	// an expension == $<no_quote_word>
-	LCARRET,	// left carret == <
-	LPAREN,		// left parenthesis == (
-	NQUOTE,		// no quote string
-	OR,			// or == ||
-	PIPE,		// pipe == |
-	RCARRET,	// right carret == >
-	RPAREN,		// right parenthesis == )
-	SEMICOLON,	// semicolor == ;
-	SQUOTE,		// single quote string
-	WHITESPACE, // whitespace outside of quoted strings
-	WORD,		// a meta token, which contains subtokens
+	TOK_NONE,		// no token type == invalid / nonexistent token
+	TOK_AMP,		// ampersand == &
+	TOK_AND,		// and == &&
+	TOK_CARRET,		// any caret == < > << >>
+	TOK_DLCARRET,	// double left caret == <<
+	TOK_DOLLAR,		// dollar == $
+	TOK_DQUOTE,		// double quote string
+	TOK_DRCARRET,	// double right caret == >>
+	TOK_EXPENSION,	// an expansion == $<no_quote_word>
+	TOK_LCARRET,	// left caret == <
+	TOK_LPAREN,		// left parenthesis == (
+	TOK_NQUOTE,		// no quote string
+	TOK_OR,			// or == ||
+	TOK_PIPE,		// pipe == |
+	TOK_RCARRET,	// right caret == >
+	TOK_RPAREN,		// right parenthesis == )
+	TOK_SEMICOLON,	// semicolon == ;
+	TOK_SQUOTE,		// single quote string
+	TOK_WHITESPACE, // whitespace outside of quoted strings
+	TOK_WORD,		// a meta token, which contains subtokens
 };

 typedef struct s_token
@ -51,6 +52,7 @@ typedef struct s_token
 t_token	token_new_meta(enum e_token type);
 // This create a "simple" token consisting of a string
 t_token	token_new(enum e_token type);
+t_token	token_new_none(void);
 void	token_free(t_token tok);
 bool	token_is_meta(t_token tok);

--- a/parser/src/dollar.c
+++ b/parser/src/dollar.c
@ -1,36 +0,0 @@
-/* ************************************************************************** */
-/*                                                                            */
-/*                                                        :::      ::::::::   */
-/*   dollar.c                                           :+:      :+:    :+:   */
-/*                                                    +:+ +:+         +:+     */
-/*   By: rparodi <rparodi@student.42.fr>            +#+  +:+       +#+        */
-/*                                                +#+#+#+#+#+   +#+           */
-/*   Created: 2024/09/27 22:18:46 by rparodi           #+#    #+#             */
-/*   Updated: 2024/09/28 14:50:56 by maiboyer         ###   ########.fr       */
-/*                                                                            */
-/* ************************************************************************** */
-
-#include "me/vec/vec_token.h"
-#include "parser/token.h"
-#include "me/string/string.h"
-#include "me/types.h"
-#include <stdio.h>
-
-// MAIX: C'est necessaire de split ca dans une fonction a par vu que ca retourne
-//		la valeur de la comparaion ?
-//		Ca serai pas mieux de faire une fonction du genre 
-//		"bool create_single_char_token(char c, t_token *tok)" qui cree un token
-//		dans `tok` et retourne true si il a match un char qui correspond 
-//		a un token specific (genre $ ou parentheses par example)
-/**
- * @brief boolean function that's say if it's a dollar or not
- *
- * @param c character will be checked
- * @return true if it's dollar, if not return false
- */
-bool	is_dollar(char c)
-{
-	if (c == '$')
-		return (true);
-	return (false);
-}
--- a/parser/src/parentheses.c
+++ b/parser/src/parentheses.c
@ -1,29 +0,0 @@
-/* ************************************************************************** */
-/*                                                                            */
-/*                                                        :::      ::::::::   */
-/*   parentheses.c                                      :+:      :+:    :+:   */
-/*                                                    +:+ +:+         +:+     */
-/*   By: rparodi <rparodi@student.42.fr>            +#+  +:+       +#+        */
-/*                                                +#+#+#+#+#+   +#+           */
-/*   Created: 2024/09/30 12:25:22 by rparodi           #+#    #+#             */
-/*   Updated: 2024/09/30 12:28:26 by rparodi          ###   ########.fr       */
-/*                                                                            */
-/* ************************************************************************** */
-
-#include "me/vec/vec_token.h"
-#include "parser/token.h"
-#include "me/string/string.h"
-#include "me/types.h"
-#include <stdio.h>
-
-char	is_parentheses(char c, char next)
-{
-	if (c == '(')
-	{
-		if (next == '(')
-			return (2);
-		else
-			return (1);
-	}
-	return (0);
-}
--- a/parser/src/quotes.c
+++ b/parser/src/quotes.c
@ -1,59 +0,0 @@
-/* ************************************************************************** */
-/*                                                                            */
-/*                                                        :::      ::::::::   */
-/*   quotes.c                                           :+:      :+:    :+:   */
-/*                                                    +:+ +:+         +:+     */
-/*   By: rparodi <rparodi@student.42.fr>            +#+  +:+       +#+        */
-/*                                                +#+#+#+#+#+   +#+           */
-/*   Created: 2024/09/27 11:46:45 by rparodi           #+#    #+#             */
-/*   Updated: 2024/09/30 12:28:26 by rparodi          ###   ########.fr       */
-/*                                                                            */
-/* ************************************************************************** */
-
-#include "me/vec/vec_token.h"
-#include "parser/token.h"
-#include "me/string/string.h"
-#include "me/types.h"
-#include <stdio.h>
-
-/**
- * @brief boolean function that's say if it's a quote or not
- *
- * @param c character will be checked
- * @return true if it's quote, if not return false
- */
-bool	is_quote(char c)
-{
-	if (c == '"' || c == '\'')
-		return (true);
-	return (false);
-}
-
-/**
- * @brief token function that's read the string and return the tokens
- *
- * @param raw the input from stdin
- * @param start the index where the first quote was found
- * @param output the token of the string
- * @return Check if there is an error on this function
- */
-t_error	find_end_string(t_const_str raw, t_usize *start, t_token *output)
-{
-	if (!raw || !output)
-		return (ERROR);
-	if (is_quote(raw[(*start)]))
-	{
-		string_push_char(&output->string, raw[(*start)]);
-		(*start)++;
-		if (raw[(*start)] == '\0')
-			return (ERROR);
-		while (raw[(*start)] != '\0')
-		{
-			string_push_char(&output->string, raw[(*start)]);
-			if (is_quote(raw[(*start)]))
-				return (NO_ERROR);
-			(*start)++;
-		}
-	}
-	return (ERROR);
-}
--- a/parser/src/token.c
+++ b/parser/src/token.c
@ -1,59 +0,0 @@
-/* ************************************************************************** */
-/*                                                                            */
-/*                                                        :::      ::::::::   */
-/*   token.c                                            :+:      :+:    :+:   */
-/*                                                    +:+ +:+         +:+     */
-/*   By: rparodi <rparodi@student.42.fr>            +#+  +:+       +#+        */
-/*                                                +#+#+#+#+#+   +#+           */
-/*   Created: 2024/09/25 16:27:03 by rparodi           #+#    #+#             */
-/*   Updated: 2024/09/29 15:24:11 by rparodi          ###   ########.fr       */
-/*                                                                            */
-/* ************************************************************************** */
-
-#include "parser/token.h"
-#include "me/string/string.h"
-#include "me/types.h"
-#include "me/char/char.h"
-#include "me/vec/vec_token.h"
-#include <stdio.h>
-#include <stdbool.h>
-#include "me/mem/mem.h"
-
-// MAIX: tu peux faire un token par character "whitespace", vu qu'on va 
-// manipuler la list de token apres pour faire des truc plus simple a process 
-// on se debrouillera pour plus avoir plein de token whitespace :)
-t_error	start_analyse(t_const_str raw, t_vec_token *output)
-{
-	t_usize	i;
-	t_token	token;
-
-	if (!raw || !output)
-		return (ERROR);
-	i = 0;
-	while (raw[i] != '\0')
-	{
-		if (me_isspace(raw[i]))
-			token = token_new(WHITESPACE);
-		if (is_quote(raw[i]))
-			find_end_string(raw, &i, &token);
-		if (is_dollar(raw[i]))
-			token = token_new(DOLLAR);
-		vec_token_push(output, token);
-		i++;
-	}
-	return (NO_ERROR);
-}
-
-// MAIX: attention tu ne fais rien avec le vec_token ici :D
-//		aussi l'argument list est censer faire quoi ? 
-//		c'est un reste d'une version ancienne ?
-t_error	tokeniser(t_const_str raw)
-{
-	t_vec_token	output;
-
-	if (!raw)
-		return (ERROR);
-	output = vec_token_new(16, NULL);
-	start_analyse(raw, &output);
-	return (NO_ERROR);
-}
--- a/parser/src/token_lifetime.c
+++ b/parser/src/token_lifetime.c
@ -6,14 +6,14 @@
 /*   By: maiboyer <maiboyer@student.42.fr>          +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/09/28 14:37:13 by maiboyer          #+#    #+#             */
-/*   Updated: 2024/09/28 15:24:06 by rparodi          ###   ########.fr       */
+/*   Updated: 2024/09/30 20:15:05 by maiboyer         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */

 #include "me/vec/vec_token.h"
 #include "parser/token.h"

-void	token_free(t_token tok)
+void token_free(t_token tok)
 {
 	if (tok.string.buf != NULL)
 		string_free(tok.string);
@ -21,21 +21,70 @@ void	token_free(t_token tok)
 		vec_token_free(tok.subtokens);
 }

-t_token	token_new(enum e_token type)
+/**
+ * @brief Build a "simple" token: one that carries text and no subtokens.
+ *
+ * @param type kind stored in the returned token
+ * @return a token whose string comes from string_new(16) and whose subtokens
+ *         vector is left null/zeroed
+ */
+t_token token_new(enum e_token type)
 {
-	return ((t_token){.type = type, .string = string_new(16), \
-		.subtokens = {NULL, 0, 0, NULL}});
+	return ((t_token){.type = type, .string = string_new(16), .subtokens = {NULL, 0, 0, NULL}});
 }

-t_token	token_new_meta(enum e_token type)
+/**
+ * @brief Build a "meta" token: one that carries subtokens and no text.
+ *
+ * @param type kind stored in the returned token
+ * @return a token whose string is left null/zeroed and whose subtokens vector
+ *         comes from vec_token_new(16, token_free), i.e. token_free is
+ *         registered as the element destructor
+ */
+t_token token_new_meta(enum e_token type)
 {
-	return ((t_token){.type = type, .string = {NULL, 0, 0}, \
-		.subtokens = vec_token_new(16, token_free)});
+	return ((t_token){.type = type, .string = {NULL, 0, 0}, .subtokens = vec_token_new(16, token_free)});
 }

-bool	token_is_meta(t_token tok)
+/**
+ * @brief Tell whether a token is a meta token.
+ *
+ * @param tok token to inspect (only its type is read)
+ * @return true when the type is TOK_WORD, false for every other type
+ */
+bool token_is_meta(t_token tok)
 {
-	if (tok.type == WORD)
+	if (tok.type == TOK_WORD)
 		return (true);
 	return (false);
 }
+
+/**
+ * @brief Build the placeholder token meaning "no token".
+ *
+ * The placeholder owns nothing: both its string and its subtokens vector are
+ * left null/zeroed, so it can be overwritten or dropped without a matching
+ * token_free(). The tokenizer creates and discards placeholders constantly;
+ * reserving a subtokens vector for each of them (as vec_token_new(16, ...)
+ * presumably does) would leak on every discard.
+ *
+ * @return a token of type TOK_NONE with no string and no subtokens
+ */
+t_token token_new_none(void)
+{
+	return ((t_token){.type = TOK_NONE, .string = {NULL, 0, 0}, .subtokens = {NULL, 0, 0, NULL}});
+}
+
+// TO REMOVE
+/**
+ * @brief Debug helper: printable name of a token's type.
+ *
+ * @param token token whose type is looked up (must not be NULL)
+ * @return the type name without its TOK_ prefix, or NULL when the type is not
+ *         one of the e_token enumerators
+ */
+t_str token_name(t_token *token)
+{
+	static const t_str	names[] = {
+	[TOK_NONE] = "NONE",
+	[TOK_AMP] = "AMP",
+	[TOK_AND] = "AND",
+	[TOK_CARRET] = "CARRET",
+	[TOK_DLCARRET] = "DLCARRET",
+	[TOK_DOLLAR] = "DOLLAR",
+	[TOK_DQUOTE] = "DQUOTE",
+	[TOK_DRCARRET] = "DRCARRET",
+	[TOK_EXPENSION] = "EXPENSION",
+	[TOK_LCARRET] = "LCARRET",
+	[TOK_LPAREN] = "LPAREN",
+	[TOK_NQUOTE] = "NQUOTE",
+	[TOK_OR] = "OR",
+	[TOK_PIPE] = "PIPE",
+	[TOK_RCARRET] = "RCARRET",
+	[TOK_RPAREN] = "RPAREN",
+	[TOK_SEMICOLON] = "SEMICOLON",
+	[TOK_SQUOTE] = "SQUOTE",
+	[TOK_WHITESPACE] = "WHITESPACE",
+	[TOK_WORD] = "WORD",
+	};
+	unsigned int		kind;
+
+	kind = (unsigned int)token->type;
+	// TOK_WORD is the last enumerator; anything past it has no name.
+	if (kind > (unsigned int)TOK_WORD)
+		return (NULL);
+	return (names[kind]);
+}
--- a/parser/src/tokenizer.c
+++ b/parser/src/tokenizer.c
@ -0,0 +1,115 @@
+/* ************************************************************************** */
+/*                                                                            */
+/*                                                        :::      ::::::::   */
+/*   tokenizer.c                                        :+:      :+:    :+:   */
+/*                                                    +:+ +:+         +:+     */
+/*   By: maiboyer <maiboyer@student.42.fr>          +#+  +:+       +#+        */
+/*                                                +#+#+#+#+#+   +#+           */
+/*   Created: 2024/09/30 19:39:39 by maiboyer          #+#    #+#             */
+/*   Updated: 2024/09/30 20:19:06 by maiboyer         ###   ########.fr       */
+/*                                                                            */
+/* ************************************************************************** */
+
+#include "me/char/char.h"
+#include "me/string/string.h"
+#include "me/types.h"
+#include "me/vec/vec_token.h"
+#include "parser/token.h"
+
+/**
+ * @brief Flush the pending token, then append a new token of type `ttype`.
+ *
+ * @param tokens vector receiving the tokens
+ * @param tok    pending token; pushed to `tokens` and reset to the TOK_NONE
+ *               placeholder when it is not already a placeholder
+ * @param ttype  type of the token to append after the pending one
+ * @param s      text copied into the appended token
+ */
+static void push_token_and_create_new(t_vec_token *tokens, t_token *tok, enum e_token ttype, t_const_str s)
+{
+	t_token tmp;
+
+	// Only reset after a real push: overwriting an existing placeholder with
+	// a fresh one would drop the old placeholder without freeing it.
+	if (tok->type != TOK_NONE)
+	{
+		vec_token_push(tokens, *tok);
+		*tok = token_new_none();
+	}
+	tmp = token_new(ttype);
+	string_push(&tmp.string, s);
+	vec_token_push(tokens, tmp);
+}
+
+/* Type of the token a character forms on its own, TOK_NONE for plain text. */
+static enum e_token single_char_token_type(char c)
+{
+	if (c == '$')
+		return (TOK_DOLLAR);
+	if (c == '>')
+		return (TOK_RCARRET);
+	if (c == '<')
+		return (TOK_LCARRET);
+	if (c == '&')
+		return (TOK_AMP);
+	if (c == '|')
+		return (TOK_PIPE);
+	if (c == '(')
+		return (TOK_LPAREN);
+	if (c == ')')
+		return (TOK_RPAREN);
+	if (c == ';')
+		return (TOK_SEMICOLON);
+	if (me_isspace(c))
+		return (TOK_WHITESPACE);
+	return (TOK_NONE);
+}
+
+/* Consume one character read inside the quote opened by `*quote`. */
+static void tokenize_quoted(t_vec_token *tokens, t_token *tok, char *quote, char c)
+{
+	if (c != *quote)
+	{
+		string_push_char(&tok->string, c);
+		return ;
+	}
+	// Closing quote: the quote token is complete, even if it is empty.
+	vec_token_push(tokens, *tok);
+	*tok = token_new_none();
+	*quote = '\0';
+}
+
+/* Consume one character read outside of any quote. */
+static void tokenize_unquoted(t_vec_token *tokens, t_token *tok, char *quote, char c)
+{
+	enum e_token	type;
+	char			text[2];
+
+	if (c == '\"' || c == '\'')
+	{
+		// Flush the pending word (or drop the placeholder), then start
+		// collecting the quoted text in a token of the matching quote type.
+		if (tok->type != TOK_NONE)
+			vec_token_push(tokens, *tok);
+		else
+			token_free(*tok);
+		*tok = token_new(TOK_SQUOTE);
+		if (c == '\"')
+			tok->type = TOK_DQUOTE;
+		*quote = c;
+		return ;
+	}
+	type = single_char_token_type(c);
+	if (type != TOK_NONE)
+	{
+		// Every whitespace character is recorded as a single space.
+		text[0] = c;
+		if (type == TOK_WHITESPACE)
+			text[0] = ' ';
+		text[1] = '\0';
+		push_token_and_create_new(tokens, tok, type, text);
+		return ;
+	}
+	if (tok->type == TOK_NONE)
+	{
+		token_free(*tok);
+		*tok = token_new(TOK_NQUOTE);
+	}
+	string_push_char(&tok->string, c);
+}
+
+/**
+ * @brief Split a command line into a flat list of tokens.
+ *
+ * Outside quotes, each of `$ > < & | ( ) ;` and each whitespace character
+ * becomes its own token, and any other run of characters becomes one
+ * TOK_NQUOTE token. Text between a pair of `'` or `"` becomes one TOK_SQUOTE
+ * or TOK_DQUOTE token holding the text without the quote characters.
+ *
+ * @param s   string to tokenize
+ * @param out receives the token vector on success; left untouched on error
+ * @return NO_ERROR on success; ERROR when `s` or `out` is NULL, or when a
+ *         quote is still open at the end of `s` (everything built so far is
+ *         freed in that case)
+ */
+t_error tokenize(t_const_str s, t_vec_token *out)
+{
+	t_usize		i;
+	char		quote;
+	t_vec_token ret;
+	t_token		tok;
+
+	if (s == NULL || out == NULL)
+		return (ERROR);
+	i = 0;
+	quote = '\0';
+	tok = token_new_none();
+	ret = vec_token_new(16, token_free);
+	while (s[i] != '\0')
+	{
+		if (quote == '\0')
+			tokenize_unquoted(&ret, &tok, &quote, s[i]);
+		else
+			tokenize_quoted(&ret, &tok, &quote, s[i]);
+		i++;
+	}
+	// A quote still open here means the input is unterminated.
+	if (quote != '\0')
+		return (token_free(tok), vec_token_free(ret), ERROR);
+	// With no quote open, tok is either a pending word or the placeholder.
+	if (tok.type == TOK_NQUOTE)
+		vec_token_push(&ret, tok);
+	else
+		token_free(tok);
+	return (*out = ret, NO_ERROR);
+}