diff --git a/parser/token.py/app.py b/parser/token.py/app.py
index 6f121d82..faa03e1c 100644
--- a/parser/token.py/app.py
+++ b/parser/token.py/app.py
@@ -1,9 +1,10 @@
-import make_token
-import prettier
+import str_to_token
+import concat
+import ttoken
 
 s = input("> ")
-print(s);
-first = make_token.me_tokenize(s)
-pass1 = prettier.pass1(first)
+print(s)
+tokens = str_to_token.str_to_token(s)
+concatenated_tokens = concat.concat(tokens)
 
-prettier.print_tokenlist(pass1)
+ttoken.print_tokenlist(concatenated_tokens)
diff --git a/parser/token.py/concat.py b/parser/token.py/concat.py
new file mode 100644
index 00000000..ae284f8e
--- /dev/null
+++ b/parser/token.py/concat.py
@@ -0,0 +1,21 @@
+from ttoken import *
+
+def concat(tokens: list[Token]):
+    """Merge each run of adjacent word-like tokens into a single WORD token."""
+    i = 0
+    out = []
+    while i < len(tokens):
+        tok = tokens[i]
+        if tok.is_word():
+            word = Token(TokenType.WORD, subtokens=[])
+            word.subtokens.append(tok)
+            j = 1
+            while i + j < len(tokens) and tokens[i + j].is_word():
+                word.subtokens.append(tokens[i + j])
+                j += 1
+            i += j  # skip past every token absorbed into the WORD
+            out.append(word)
+        else:
+            out.append(tok)
+            i += 1
+    return out
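
A minimal sketch (not part of the patch) of what concat() does, using the
Token and TokenType definitions from ttoken.py further down. The token list
here is hand-built for illustration; in the real pipeline it comes from
str_to_token().

    from ttoken import Token, TokenType
    from concat import concat

    # 'hello'$x tokenizes into three adjacent word-like tokens
    # (SQUOTE, DOLLAR, NQUOTE); concat() folds them into one WORD.
    tokens = [
        Token(TokenType.NQUOTE, string="echo"),
        Token(TokenType.WHITESPACE, string=" "),
        Token(TokenType.SQUOTE, string="hello"),
        Token(TokenType.DOLLAR, string="$"),
        Token(TokenType.NQUOTE, string="x"),
    ]
    out = concat(tokens)
    # out == [WORD[NQUOTE], WHITESPACE, WORD[SQUOTE, DOLLAR, NQUOTE]]
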
diff --git a/parser/token.py/make_token.py b/parser/token.py/make_token.py
deleted file mode 100644
index 074dccd5..00000000
--- a/parser/token.py/make_token.py
+++ /dev/null
@@ -1,142 +0,0 @@
-from enum import Enum
-from dataclasses import dataclass
-
-
-TokenType = Enum(
-    "TokenType",
-    [
-        "AMP",
-        "DOLLAR",
-        "DQUOTE",
-        "LPAREN",
-        "NQUOTE",
-        "PIPE",
-        "CARRET",
-        "RPAREN",
-        "SEMICOLON",
-        "SQUOTE",
-        "WHITESPACE",
-    ],
-)
-
-@dataclass
-class Token:
-    raw: str
-    ty: TokenType
-
-def print_tokenlist(tokens: list[Token]):
-    print("\n")
-    for tok in tokens:
-        col = "0"
-        if tok.ty == TokenType.SQUOTE:
-            col = "33"
-        if tok.ty == TokenType.DQUOTE:
-            col = "32"
-        if tok.ty == TokenType.WHITESPACE:
-            col = "31;4"
-        if tok.ty == TokenType.DOLLAR:
-            col = "31"
-        if tok.ty == TokenType.LPAREN:
-            col = "35"
-        if tok.ty == TokenType.RPAREN:
-            col = "35"
-        if tok.ty == TokenType.AMP:
-            col = "35"
-        if tok.ty == TokenType.PIPE:
-            col = "35"
-        if tok.ty == TokenType.SEMICOLON:
-            col = "35"
-        if tok.ty == TokenType.CARRET:
-            col = "35"
-        print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
-    print("\n")
-
-
-def is_quote(c: chr):
-    return c == "'" or c == '"'
-
-def me_tokenize(s: str):
-    tokens = []
-    current_token = None
-    quote = 0
-    i = 0
-    while i < len(s):
-        c = s[i]
-        if quote == 0:
-            if is_quote(c):
-                if current_token != None:
-                    tokens.append(current_token)
-                quote = c
-                current_token = Token(
-                    "", TokenType.DQUOTE if c == '"' else TokenType.SQUOTE
-                )
-            else:
-                if current_token == None:
-                    current_token = Token("", TokenType.NQUOTE)
-                if c.isspace():
-                    if (
-                        len(current_token.raw) != 0
-                        and current_token.ty != TokenType.WHITESPACE
-                    ):
-                        tokens.append(current_token)
-                        current_token = Token("", TokenType.WHITESPACE)
-                else:
-                    if current_token.ty == TokenType.WHITESPACE:
-                        tokens.append(current_token)
-                        current_token = Token("", TokenType.NQUOTE)
-                    if c == "$":
-                        tokens.append(current_token)
-                        current_token = None
-                        tokens.append(Token("$", TokenType.DOLLAR))
-                    elif c == "(":
-                        tokens.append(current_token)
-                        current_token = None
-                        tokens.append(Token("(", TokenType.LPAREN))
-                    elif c == ")":
-                        tokens.append(current_token)
-                        current_token = None
-                        tokens.append(Token(")", TokenType.RPAREN))
-                    elif c == "|":
-                        tokens.append(current_token)
-                        current_token = None
-                        tokens.append(Token("|", TokenType.PIPE))
-                    elif c == "&":
-                        tokens.append(current_token)
-                        current_token = None
-                        tokens.append(Token("&", TokenType.AMP))
-                    elif c == ";":
-                        tokens.append(current_token)
-                        current_token = None
-                        tokens.append(Token(";", TokenType.CARRET))
-                    elif c == ">" or c == "<":
-                        tokens.append(current_token)
-                        current_token = None
-                        tokens.append(Token(c, TokenType.CARRET))
-                    else:
-                        current_token.raw += c
-        elif quote == "'":
-            if c == "'":
-                tokens.append(current_token)
-                current_token = None
-                quote = 0
-            else:
-                if current_token == None:
-                    current_token = Token("", TokenType.SQUOTE)
-                current_token.raw += c
-
-        elif quote == '"':
-            if c == '"':
-                tokens.append(current_token)
-                current_token = None
-                quote = 0
-            else:
-                if current_token == None:
-                    current_token = Token("", TokenType.DQUOTE)
-                current_token.raw += c
-        else:
-            print("you fucked up you quote thingy")
-        i += 1
-    if current_token != None and current_token.ty == TokenType.NQUOTE:
-        tokens.append(current_token)
-    return tokens
-
diff --git a/parser/token.py/prettier.py b/parser/token.py/prettier.py
deleted file mode 100644
index a60ba8fc..00000000
--- a/parser/token.py/prettier.py
+++ /dev/null
@@ -1,92 +0,0 @@
-from enum import Enum
-from dataclasses import dataclass
-import make_token as mt
-
-
-TokenType = Enum(
-    "TokenType",
-    [
-        "AND",
-        "DOLLAR",
-        "DQUOTE",
-        "EXPENSION",
-        "LCARRET",
-        "LCARRET_DOUBLE",
-        "LPAREN",
-        "NQUOTE",
-        "OR",
-        "PIPE",
-        "RCARRET",
-        "RCARRET_DOUBLE",
-        "RPAREN",
-        "SEMICOLON",
-        "SQUOTE",
-        "WHITESPACE",
-        "WORD",
-    ],
-)
-
-
-@dataclass
-class Token:
-    raw: str
-    raw_list: list
-    ty: TokenType
-
-    def is_list(self):
-        return self.ty == TokenType.WORD or self.ty == TokenType.EXPENSION
-
-
-def is_word_mt(tok: mt.Token):
-    return (
-        tok.ty == mt.TokenType.SQUOTE
-        or tok.ty == mt.TokenType.DQUOTE
-        or tok.ty == mt.TokenType.NQUOTE
-        or tok.ty == mt.TokenType.DOLLAR
-    )
-
-
-def pass1(tokens: list[mt.Token]):
-    i = 0
-    out = []
-    while i < len(tokens):
-        tok = tokens[i]
-        if is_word_mt(tok):
-            concat = Token("", [], TokenType.WORD)
-            concat.raw_list.append(tok)
-            j = 1
-            while i + j < len(tokens) and is_word_mt(tokens[i + j]):
-                concat.raw_list.append(tokens[i + j])
-                j += 1
-            i += j
-        else:
-            out.append(tok)
-            i += 1
-    return out
-
-
-def print_tokenlist(tokens: list[Token]):
-    print("\n")
-    for tok in tokens:
-        col = "0"
-        if tok.ty == TokenType.SQUOTE:
-            col = "33"
-        if tok.ty == TokenType.DQUOTE:
-            col = "32"
-        if tok.ty == TokenType.WHITESPACE:
-            col = "31;4"
-        if tok.ty == TokenType.DOLLAR:
-            col = "31"
-        if tok.ty == TokenType.LPAREN:
-            col = "35"
-        if tok.ty == TokenType.RPAREN:
-            col = "35"
-        if tok.ty == TokenType.PIPE:
-            col = "35"
-        if tok.ty == TokenType.SEMICOLON:
-            col = "35"
-        if not Token.is_list(tok):
-            print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
-        else:
-            print("NOT_PRINT_YET_LOL", end="")
-    print("\n")
diff --git a/parser/token.py/str_to_token.py b/parser/token.py/str_to_token.py
new file mode 100644
index 00000000..2abed189
--- /dev/null
+++ b/parser/token.py/str_to_token.py
@@ -0,0 +1,89 @@
+from ttoken import *
+
+TT = TokenType
+
+
+def is_quote(c: str):
+    return c == "'" or c == '"'
+
+
+def str_to_token(s: str):
+    tokens = []
+    current_token = None
+    quote = 0
+    i = 0
+    while i < len(s):
+        c = s[i]
+        if quote == 0:
+            if is_quote(c):
+                if current_token != None:
+                    tokens.append(current_token)
+                quote = c
+                current_token = Token(TT.DQUOTE if c == '"' else TT.SQUOTE, string="")
+            else:
+                if current_token == None:
+                    current_token = Token(TT.NQUOTE, string="")
+                if c.isspace():
+                    if current_token.ty != TT.WHITESPACE:
+                        if len(current_token.string) != 0:
+                            tokens.append(current_token)
+                        current_token = Token(TT.WHITESPACE, string="")
+                    current_token.append_char(c)  # keep the whitespace text for re-printing
+                else:
+                    if current_token.ty == TT.WHITESPACE:
+                        tokens.append(current_token)
+                        current_token = Token(TT.NQUOTE, string="")
+                    if c == "$":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(TT.DOLLAR, string="$"))
+                    elif c == "(":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(TT.LPAREN, string="("))
+                    elif c == ")":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(TT.RPAREN, string=")"))
+                    elif c == "|":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(TT.PIPE, string="|"))
+                    elif c == "&":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(TT.AMP, string="&"))
+                    elif c == ";":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(TT.SEMICOLON, string=";"))
+                    elif c == ">" or c == "<":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(TT.CARRET, string=c))
+                    else:
+                        current_token.append_char(c)
+        elif quote == "'":
+            if c == "'":
+                tokens.append(current_token)
+                current_token = None
+                quote = 0
+            else:
+                if current_token == None:
+                    current_token = Token(TT.SQUOTE, string="")
+                current_token.append_char(c)
+        elif quote == '"':
+            if c == '"':
+                tokens.append(current_token)
+                current_token = None
+                quote = 0
+            else:
+                if current_token == None:
+                    current_token = Token(TT.DQUOTE, string="")
+                current_token.append_char(c)
+        else:
+            raise Exception(f"invalid quote state: {quote!r}")
+        i += 1
+    if current_token != None and current_token.ty == TT.NQUOTE:
+        tokens.append(current_token)
+    return tokens
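
A quick sketch (not part of the patch) of what str_to_token() returns,
assuming the whitespace handling above: a run of spaces becomes a single
WHITESPACE token carrying its text, and quoted spans keep their quote type
but drop the quote characters themselves.

    from str_to_token import str_to_token

    for tok in str_to_token('echo "a b"'):
        print(tok.ty.name, repr(tok.string))
    # NQUOTE 'echo'
    # WHITESPACE ' '
    # DQUOTE 'a b'
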
diff --git a/parser/token.py/ttoken.py b/parser/token.py/ttoken.py
new file mode 100644
index 00000000..4e377d15
--- /dev/null
+++ b/parser/token.py/ttoken.py
@@ -0,0 +1,83 @@
+from enum import Enum
+from dataclasses import dataclass
+
+TokenType = Enum(
+    "TokenType",
+    [
+        "AMP",
+        "AND",
+        "CARRET",
+        "DOLLAR",
+        "DQUOTE",
+        "EXPENSION",
+        "LCARRET",
+        "LCARRET_DOUBLE",
+        "LPAREN",
+        "NQUOTE",
+        "OR",
+        "PIPE",
+        "RCARRET",
+        "RCARRET_DOUBLE",
+        "RPAREN",
+        "SEMICOLON",
+        "SQUOTE",
+        "WHITESPACE",
+        "WORD",
+    ],
+)
+
+
+@dataclass
+class Token:
+    ty: TokenType
+    string: str | None = None
+    subtokens: list | None = None
+
+    def is_subtoken(self) -> bool:
+        return self.subtokens != None
+
+    def append_char(self, c: str):
+        if self.string is None:
+            raise Exception(f"Tried to push a char on a token that contains subtokens, TT={self.ty}")
+        self.string += c
+
+    def is_word(self):
+        return (
+            self.ty == TokenType.SQUOTE
+            or self.ty == TokenType.DQUOTE
+            or self.ty == TokenType.NQUOTE
+            or self.ty == TokenType.DOLLAR
+        )
+
+
+def print_tokenlist(tokens: list[Token], *, between="", end="\n"):
+    for tok in tokens:
+        col = "0"
+        if tok.ty == TokenType.SQUOTE:
+            col = "33"
+        if tok.ty == TokenType.DQUOTE:
+            col = "32"
+        if tok.ty == TokenType.WHITESPACE:
+            col = "31;4"
+        if tok.ty == TokenType.DOLLAR:
+            col = "31"
+        if tok.ty == TokenType.LPAREN:
+            col = "35"
+        if tok.ty == TokenType.RPAREN:
+            col = "35"
+        if tok.ty == TokenType.AMP:
+            col = "35"
+        if tok.ty == TokenType.PIPE:
+            col = "35"
+        if tok.ty == TokenType.SEMICOLON:
+            col = "35"
+        if tok.ty == TokenType.CARRET:
+            col = "35"
+        if tok.is_subtoken():
+            print_tokenlist(tok.subtokens, between="\x1b[100m", end="")
+        else:
+            print(f"\x1b[{col}m{between}{tok.string}\x1b[0m", end="")
+    print(end=end)
+
+
+__all__ = ["TokenType", "Token", "print_tokenlist"]
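
And the end-to-end pipeline from app.py, run on a fixed string instead of
input() (illustration, not part of the patch): print_tokenlist() renders
each token in the ANSI color picked for its type, and WORD groups print
their subtokens on a gray background (\x1b[100m).

    import str_to_token
    import concat
    import ttoken

    tokens = str_to_token.str_to_token("ls -la|wc")
    words = concat.concat(tokens)
    ttoken.print_tokenlist(words)
    # 'ls', '-la' and 'wc' come out as WORD groups (gray background),
    # the space as an underlined WHITESPACE, and '|' as a magenta PIPE.
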