update: pushed more readable tokens

This commit is contained in:
maix0 2024-09-26 17:37:57 +02:00
parent 06c2d19097
commit 285104a19a
6 changed files with 199 additions and 239 deletions

View file

@ -1,9 +1,10 @@
@@ -1,9 +1,10 @@
-import make_token
+import str_to_token
 import prettier
+import ttoken
 s = input("> ")
-print(s);
+print(s)
-first = make_token.me_tokenize(s)
+tokens = str_to_token.str_to_token(s)
-pass1 = prettier.pass1(first)
+concated_tokens = concat.concat(tokens)
-prettier.print_tokenlist(pass1)
+ttoken.print_tokenlist(concated_tokens)
[NOTE(review): the new code calls concat.concat() but the file never imports concat — add "import concat".]

20
parser/token.py/concat.py Normal file
View file

@ -0,0 +1,20 @@
from ttoken import *
def concat(tokens: list[Token]):
    """Merge runs of adjacent word-like tokens into single WORD tokens.

    Consecutive tokens for which Token.is_word() is true (quoted strings,
    bare words, `$`) are collapsed into one Token(TokenType.WORD) whose
    `subtokens` list holds the originals in order.  All other tokens are
    passed through unchanged.  Returns a new list; `tokens` is untouched.
    """
    out = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if not tok.is_word():
            out.append(tok)
            i += 1
            continue
        word = Token(TokenType.WORD, subtokens=[tok])
        j = 1
        while i + j < len(tokens) and tokens[i + j].is_word():
            word.subtokens.append(tokens[i + j])
            j += 1
        out.append(word)
        # BUG FIX: advance by exactly the j tokens consumed.  The old code
        # did `i += j` and then also fell through to a shared `i += 1`,
        # which silently dropped the first token after every word run
        # (e.g. the `|` in "a|b" vanished from the output).
        i += j
    return out

View file

@ -1,142 +0,0 @@
from enum import Enum
from dataclasses import dataclass
class TokenType(Enum):
    """Lexical categories produced by me_tokenize()."""

    AMP = 1
    DOLLAR = 2
    DQUOTE = 3
    LPAREN = 4
    NQUOTE = 5
    PIPE = 6
    CARRET = 7
    RPAREN = 8
    SEMICOLON = 9
    SQUOTE = 10
    WHITESPACE = 11
@dataclass
class Token:
    """A single lexeme: its raw text plus its lexical category."""

    # literal characters of the token (may be empty while it is being built)
    raw: str
    # lexical category, one of TokenType
    ty: TokenType
def print_tokenlist(tokens: list[Token]):
    """Echo the token stream, colour-coding each token with ANSI SGR codes.

    Quotes, whitespace, `$` and the operators each get a distinct colour;
    any unlisted type prints with attribute "0" (reset).
    """
    palette = {
        TokenType.SQUOTE: "33",
        TokenType.DQUOTE: "32",
        TokenType.WHITESPACE: "31;4",
        TokenType.DOLLAR: "31",
        TokenType.LPAREN: "35",
        TokenType.RPAREN: "35",
        TokenType.AMP: "35",
        TokenType.PIPE: "35",
        TokenType.SEMICOLON: "35",
        TokenType.CARRET: "35",
    }
    print("\n")
    for tok in tokens:
        col = palette.get(tok.ty, "0")
        print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
    print("\n")
def is_quote(c: str):
    """Return True when *c* is a single- or double-quote character."""
    return c in {"'", '"'}
def me_tokenize(s: str):
    """Tokenize shell-like input *s* into a flat list of Token objects.

    A small state machine driven by `quote`:
      * quote == 0   -- unquoted text: whitespace and the single-character
        operators below delimit bare-word (NQUOTE) tokens;
      * quote == "'" -- inside single quotes until the closing ';
      * quote == '"' -- inside double quotes until the closing ".
    The quote characters themselves are not stored in the token text.
    NOTE(review): a still-open quoted token at end of input is dropped
    (only a pending NQUOTE is flushed) -- pre-existing behaviour, confirm
    it is intended.
    """
    # Single-character operators and the token type each one produces.
    operators = {
        "$": TokenType.DOLLAR,
        "(": TokenType.LPAREN,
        ")": TokenType.RPAREN,
        "|": TokenType.PIPE,
        "&": TokenType.AMP,
        ";": TokenType.SEMICOLON,  # BUG FIX: ';' was mis-tagged CARRET
        ">": TokenType.CARRET,
        "<": TokenType.CARRET,
    }
    tokens = []
    current_token = None
    quote = 0  # 0 while unquoted, else the opening quote character
    for c in s:
        if quote == 0:
            if is_quote(c):
                # Flush whatever was building, then open a quoted token.
                if current_token is not None:
                    tokens.append(current_token)
                quote = c
                current_token = Token(
                    "", TokenType.DQUOTE if c == '"' else TokenType.SQUOTE
                )
            else:
                if current_token is None:
                    current_token = Token("", TokenType.NQUOTE)
                if c.isspace():
                    # BUG FIX: record the whitespace characters so the echoed
                    # line keeps its spacing -- the old code dropped them,
                    # leaving every WHITESPACE token with empty raw text even
                    # though print_tokenlist underlines whitespace.
                    if current_token.ty != TokenType.WHITESPACE:
                        if len(current_token.raw) != 0:
                            tokens.append(current_token)
                        current_token = Token("", TokenType.WHITESPACE)
                    current_token.raw += c
                else:
                    # A whitespace run ends at the first non-space character.
                    if current_token.ty == TokenType.WHITESPACE:
                        tokens.append(current_token)
                        current_token = Token("", TokenType.NQUOTE)
                    if c in operators:
                        tokens.append(current_token)
                        current_token = None
                        tokens.append(Token(c, operators[c]))
                    else:
                        current_token.raw += c
        elif quote == "'":
            if c == "'":
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token("", TokenType.SQUOTE)
                current_token.raw += c
        elif quote == '"':
            if c == '"':
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token("", TokenType.DQUOTE)
                current_token.raw += c
        else:
            print("you fucked up you quote thingy")
    # Flush a trailing bare word; other pending token kinds are dropped.
    if current_token is not None and current_token.ty == TokenType.NQUOTE:
        tokens.append(current_token)
    return tokens

View file

@ -1,92 +0,0 @@
from enum import Enum
from dataclasses import dataclass
import make_token as mt
class TokenType(Enum):
    """Token categories used by the prettifier passes."""

    AND = 1
    DOLLAR = 2
    DQUOTE = 3
    EXPENSION = 4
    LCARRET = 5
    LCARRET_DOUBLE = 6
    LPAREN = 7
    NQUOTE = 8
    OR = 9
    PIPE = 10
    RCARRET = 11
    RCARRET_DOUBLE = 12
    RPAREN = 13
    SEMICOLON = 14
    SQUOTE = 15
    WHITESPACE = 16
    WORD = 17
@dataclass
class Token:
    """A prettifier token: a leaf (text in `raw`) or a composite
    (child make_token tokens collected in `raw_list`)."""

    raw: str        # literal text, meaningful for leaf tokens
    raw_list: list  # child tokens, meaningful for composite tokens
    ty: TokenType

    def is_list(self):
        """True for composite tokens whose content lives in raw_list."""
        return self.ty in (TokenType.WORD, TokenType.EXPENSION)
def is_word_mt(tok: mt.Token):
    """Return True when a make_token token can form part of a word."""
    word_types = (
        mt.TokenType.SQUOTE,
        mt.TokenType.DQUOTE,
        mt.TokenType.NQUOTE,
        mt.TokenType.DOLLAR,
    )
    return tok.ty in word_types
def pass1(tokens: list[mt.Token]):
    """Group runs of word-like make_token tokens into WORD tokens.

    Consecutive tokens accepted by is_word_mt() are collapsed into a
    single Token(ty=WORD) carrying the originals in `raw_list`; every
    other token passes through unchanged.  Returns a new list.
    """
    out = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if not is_word_mt(tok):
            out.append(tok)
            i += 1
            continue
        concat = Token("", [tok], TokenType.WORD)
        j = 1
        while i + j < len(tokens) and is_word_mt(tokens[i + j]):
            concat.raw_list.append(tokens[i + j])
            j += 1
        # BUG FIX: the grouped WORD token was built but never emitted, so
        # every word silently vanished from the output.
        out.append(concat)
        # BUG FIX: advance by exactly the j tokens consumed; the old
        # `i += j` plus the shared trailing `i += 1` also skipped the
        # first token after each word run.
        i += j
    return out
def print_tokenlist(tokens: list[Token]):
    """Colour-print a token list; composite tokens print a placeholder."""
    palette = {
        TokenType.SQUOTE: "33",
        TokenType.DQUOTE: "32",
        TokenType.WHITESPACE: "31;4",
        TokenType.DOLLAR: "31",
        TokenType.LPAREN: "35",
        TokenType.RPAREN: "35",
        TokenType.PIPE: "35",
        TokenType.SEMICOLON: "35",
    }
    print("\n")
    for tok in tokens:
        if tok.is_list():
            # composite (WORD/EXPENSION) rendering not implemented yet
            print("NOT_PRINT_YET_LOL", end="")
        else:
            col = palette.get(tok.ty, "0")
            print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
    print("\n")

View file

@ -0,0 +1,90 @@
from ttoken import *
TT = TokenType
def is_quote(c: str):
    """Return True when *c* is a single- or double-quote character."""
    return c in {"'", '"'}
def str_to_token(s: str):
    """Tokenize shell-like input *s* into a flat list of ttoken.Tokens.

    A small state machine driven by `quote`:
      * quote == 0   -- unquoted text: whitespace and the single-character
        operators below delimit bare-word (NQUOTE) tokens;
      * quote == "'" -- inside single quotes until the closing ';
      * quote == '"' -- inside double quotes until the closing ".
    The quote characters themselves are not stored in the token text.
    NOTE(review): a still-open quoted token at end of input is dropped
    (only a pending NQUOTE is flushed) -- pre-existing behaviour, confirm
    it is intended.
    """
    # Single-character operators and the token type each one produces.
    operators = {
        "$": TT.DOLLAR,
        "(": TT.LPAREN,
        ")": TT.RPAREN,
        "|": TT.PIPE,
        "&": TT.AMP,
        ";": TT.SEMICOLON,  # BUG FIX: ';' was mis-tagged CARRET
        ">": TT.CARRET,
        "<": TT.CARRET,
    }
    tokens = []
    current_token = None
    quote = 0  # 0 while unquoted, else the opening quote character
    for c in s:
        if quote == 0:
            if is_quote(c):
                # Flush whatever was building, then open a quoted token.
                if current_token is not None:
                    tokens.append(current_token)
                quote = c
                current_token = Token(TT.DQUOTE if c == '"' else TT.SQUOTE, string="")
            else:
                if current_token is None:
                    current_token = Token(TT.NQUOTE, string="")
                if c.isspace():
                    # BUG FIX: record the whitespace characters so the echoed
                    # line keeps its spacing -- the old code dropped them,
                    # leaving every WHITESPACE token with an empty string.
                    if current_token.ty != TT.WHITESPACE:
                        if len(current_token.string) != 0:
                            tokens.append(current_token)
                        current_token = Token(TT.WHITESPACE, string="")
                    current_token.append_char(c)
                else:
                    # A whitespace run ends at the first non-space character.
                    if current_token.ty == TT.WHITESPACE:
                        tokens.append(current_token)
                        current_token = Token(TT.NQUOTE, string="")
                    if c in operators:
                        tokens.append(current_token)
                        current_token = None
                        tokens.append(Token(operators[c], string=c))
                    else:
                        current_token.append_char(c)
        elif quote == "'":
            if c == "'":
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token(TT.SQUOTE, string="")
                current_token.append_char(c)
        elif quote == '"':
            if c == '"':
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token(TT.DQUOTE, string="")
                current_token.append_char(c)
        else:
            print("you fucked up you quote thingy")
    # Flush a trailing bare word; other pending token kinds are dropped.
    if current_token is not None and current_token.ty == TT.NQUOTE:
        tokens.append(current_token)
    return tokens

83
parser/token.py/ttoken.py Normal file
View file

@ -0,0 +1,83 @@
from enum import Enum
from dataclasses import dataclass
class TokenType(Enum):
    """Every lexical category used across the tokenizer passes."""

    AMP = 1
    AND = 2
    CARRET = 3
    DOLLAR = 4
    DQUOTE = 5
    EXPENSION = 6
    LCARRET = 7
    LCARRET_DOUBLE = 8
    LPAREN = 9
    NQUOTE = 10
    OR = 11
    PIPE = 12
    RCARRET = 13
    RCARRET_DOUBLE = 14
    RPAREN = 15
    SEMICOLON = 16
    SQUOTE = 17
    WHITESPACE = 18
    WORD = 19


@dataclass
class Token:
    """A token: either a leaf (text in `string`) or a composite
    (children in `subtokens`); normally exactly one of the two is set."""

    ty: TokenType
    string: str = None
    subtokens: list = None

    def is_subtoken(self) -> bool:
        """True for composite tokens that carry a subtokens list."""
        return self.subtokens is not None

    def append_char(self, c: str):
        """Append one character to a leaf token's text.

        Raises Exception when called on a composite token.
        """
        if self.string is None:
            raise Exception(
                f"Tried to push a char on a token that contains subtokens, TT={self.ty}"
            )
        self.string += c

    def is_word(self):
        """True when this token can be merged into a WORD token."""
        return self.ty in (
            TokenType.SQUOTE,
            TokenType.DQUOTE,
            TokenType.NQUOTE,
            TokenType.DOLLAR,
        )
def print_tokenlist(tokens: list[Token], *, between="", end="\n"):
    """Colour-print a token stream.

    Leaf tokens print their text in a per-type ANSI colour, prefixed by
    *between*; composite tokens recurse with a grey-background prefix.
    NOTE(review): the *end* parameter is currently unused (the trailing
    print was commented out in the original) -- kept for interface
    compatibility.
    """
    palette = {
        TokenType.SQUOTE: "33",
        TokenType.DQUOTE: "32",
        TokenType.WHITESPACE: "31;4",
        TokenType.DOLLAR: "31",
        TokenType.LPAREN: "35",
        TokenType.RPAREN: "35",
        TokenType.AMP: "35",
        TokenType.PIPE: "35",
        TokenType.SEMICOLON: "35",
        TokenType.CARRET: "35",
    }
    for tok in tokens:
        if tok.is_subtoken():
            print_tokenlist(tok.subtokens, between="\x1b[100m", end="")
        else:
            col = palette.get(tok.ty, "0")
            print(f"\x1b[{col}m{between}{tok.string}\x1b[0m", end="")


__all__ = ["TokenType", "Token", "print_tokenlist"]