update: pushed more readable tokens
parent 06c2d19097
commit 285104a19a
6 changed files with 199 additions and 239 deletions
@@ -1,9 +1,10 @@
-import make_token
-import prettier
+import str_to_token
+import concat
+import ttoken
 
 s = input("> ")
-print(s);
-first = make_token.me_tokenize(s)
-pass1 = prettier.pass1(first)
+print(s)
+tokens = str_to_token.str_to_token(s)
+concated_tokens = concat.concat(tokens)
 
-prettier.print_tokenlist(pass1)
+ttoken.print_tokenlist(concated_tokens)
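
Put together, the new entry point reads one line, lexes it with str_to_token, merges adjacent word pieces with concat, and pretty-prints via ttoken. A non-interactive sketch of the same pipeline, assuming the three new modules below are importable (the helper name run and the sample input are invented):

    import str_to_token
    import concat
    import ttoken

    def run(line: str) -> None:
        # Same three stages as the script above, minus input().
        tokens = str_to_token.str_to_token(line)
        merged = concat.concat(tokens)
        ttoken.print_tokenlist(merged)

    run("cat file.txt | wc -l")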

parser/token.py/concat.py (new file, 20 lines)
@@ -0,0 +1,20 @@
from ttoken import *


# Collapse runs of adjacent word-like tokens into a single WORD token,
# keeping the original pieces as subtokens.
def concat(tokens: list[Token]):
    i = 0
    out = []
    while i < len(tokens):
        tok = tokens[i]
        if tok.is_word():
            word = Token(TokenType.WORD, subtokens=[])
            word.subtokens.append(tok)
            j = 1
            while i + j < len(tokens) and tokens[i + j].is_word():
                word.subtokens.append(tokens[i + j])
                j += 1
            i += j
            out.append(word)
        else:
            out.append(tok)
            i += 1
    return out
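
A quick sketch of what concat does end to end, assuming str_to_token.py and ttoken.py from this commit are on the import path; the input string "echo 'a'b" is an invented sample:

    import str_to_token
    import concat

    tokens = concat.concat(str_to_token.str_to_token("echo 'a'b"))
    for tok in tokens:
        if tok.is_subtoken():
            print("WORD", [t.string for t in tok.subtokens])
        else:
            print(tok.ty.name, repr(tok.string))

    # Expected, under these assumptions:
    #   WORD ['echo']
    #   WHITESPACE ' '
    #   WORD ['a', 'b']

Adjacent NQUOTE/SQUOTE/DQUOTE/DOLLAR tokens collapse into one WORD; the WHITESPACE token passes through unchanged.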

parser/token.py/make_token.py (deleted, 142 lines)

@@ -1,142 +0,0 @@
from enum import Enum
from dataclasses import dataclass


TokenType = Enum(
    "TokenType",
    [
        "AMP",
        "DOLLAR",
        "DQUOTE",
        "LPAREN",
        "NQUOTE",
        "PIPE",
        "CARRET",
        "RPAREN",
        "SEMICOLON",
        "SQUOTE",
        "WHITESPACE",
    ],
)


@dataclass
class Token:
    raw: str
    ty: TokenType


def print_tokenlist(tokens: list[Token]):
    print("\n")
    for tok in tokens:
        col = "0"
        if tok.ty == TokenType.SQUOTE:
            col = "33"
        if tok.ty == TokenType.DQUOTE:
            col = "32"
        if tok.ty == TokenType.WHITESPACE:
            col = "31;4"
        if tok.ty == TokenType.DOLLAR:
            col = "31"
        if tok.ty == TokenType.LPAREN:
            col = "35"
        if tok.ty == TokenType.RPAREN:
            col = "35"
        if tok.ty == TokenType.AMP:
            col = "35"
        if tok.ty == TokenType.PIPE:
            col = "35"
        if tok.ty == TokenType.SEMICOLON:
            col = "35"
        if tok.ty == TokenType.CARRET:
            col = "35"
        print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
    print("\n")


def is_quote(c: chr):
    return c == "'" or c == '"'


def me_tokenize(s: str):
    tokens = []
    current_token = None
    quote = 0
    i = 0
    while i < len(s):
        c = s[i]
        if quote == 0:
            if is_quote(c):
                if current_token != None:
                    tokens.append(current_token)
                quote = c
                current_token = Token(
                    "", TokenType.DQUOTE if c == '"' else TokenType.SQUOTE
                )
            else:
                if current_token == None:
                    current_token = Token("", TokenType.NQUOTE)
                if c.isspace():
                    if (
                        len(current_token.raw) != 0
                        and current_token.ty != TokenType.WHITESPACE
                    ):
                        tokens.append(current_token)
                        current_token = Token("", TokenType.WHITESPACE)
                else:
                    if current_token.ty == TokenType.WHITESPACE:
                        tokens.append(current_token)
                        current_token = Token("", TokenType.NQUOTE)
                if c == "$":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token("$", TokenType.DOLLAR))
                elif c == "(":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token("(", TokenType.LPAREN))
                elif c == ")":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(")", TokenType.RPAREN))
                elif c == "|":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token("|", TokenType.PIPE))
                elif c == "&":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token("&", TokenType.AMP))
                elif c == ";":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(";", TokenType.CARRET))
                elif c == ">" or c == "<":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(c, TokenType.CARRET))
                else:
                    current_token.raw += c
        elif quote == "'":
            if c == "'":
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token == None:
                    current_token = Token("", TokenType.SQUOTE)
                current_token.raw += c

        elif quote == '"':
            if c == '"':
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token == None:
                    current_token = Token("", TokenType.DQUOTE)
                current_token.raw += c
        else:
            print("you fucked up you quote thingy")
        i += 1
    if current_token != None and current_token.ty == TokenType.NQUOTE:
        tokens.append(current_token)
    return tokens
parser/token.py/prettier.py (deleted, 92 lines)

@@ -1,92 +0,0 @@
from enum import Enum
from dataclasses import dataclass
import make_token as mt


TokenType = Enum(
    "TokenType",
    [
        "AND",
        "DOLLAR",
        "DQUOTE",
        "EXPENSION",
        "LCARRET",
        "LCARRET_DOUBLE",
        "LPAREN",
        "NQUOTE",
        "OR",
        "PIPE",
        "RCARRET",
        "RCARRET_DOUBLE",
        "RPAREN",
        "SEMICOLON",
        "SQUOTE",
        "WHITESPACE",
        "WORD",
    ],
)


@dataclass
class Token:
    raw: str
    raw_list: list
    ty: TokenType

    def is_list(self):
        return self.ty == TokenType.WORD or self.ty == TokenType.EXPENSION


def is_word_mt(tok: mt.Token):
    return (
        tok.ty == mt.TokenType.SQUOTE
        or tok.ty == mt.TokenType.DQUOTE
        or tok.ty == mt.TokenType.NQUOTE
        or tok.ty == mt.TokenType.DOLLAR
    )


def pass1(tokens: list[mt.Token]):
    i = 0
    out = []
    while i < len(tokens):
        tok = tokens[i]
        if is_word_mt(tok):
            concat = Token("", [], TokenType.WORD)
            concat.raw_list.append(tok)
            j = 1
            while i + j < len(tokens) and is_word_mt(tokens[i + j]):
                concat.raw_list.append(tokens[i + j])
                j += 1
            i += j
            # note: the merged WORD token is never appended to out here;
            # the replacement concat.py does append it
        else:
            out.append(tok)
            i += 1
    return out


def print_tokenlist(tokens: list[Token]):
    print("\n")
    for tok in tokens:
        col = "0"
        if tok.ty == TokenType.SQUOTE:
            col = "33"
        if tok.ty == TokenType.DQUOTE:
            col = "32"
        if tok.ty == TokenType.WHITESPACE:
            col = "31;4"
        if tok.ty == TokenType.DOLLAR:
            col = "31"
        if tok.ty == TokenType.LPAREN:
            col = "35"
        if tok.ty == TokenType.RPAREN:
            col = "35"
        if tok.ty == TokenType.PIPE:
            col = "35"
        if tok.ty == TokenType.SEMICOLON:
            col = "35"
        if not Token.is_list(tok):
            print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
        else:
            print("NOT_PRINT_YET_LOL", end="")
    print("\n")
parser/token.py/str_to_token.py (new file, 90 lines)
@@ -0,0 +1,90 @@
from ttoken import *

TT = TokenType


def is_quote(c: str):
    return c == "'" or c == '"'


def str_to_token(s: str):
    tokens = []
    current_token = None
    quote = 0
    i = 0
    while i < len(s):
        c = s[i]
        if quote == 0:
            if is_quote(c):
                # entering a quoted section; remember which quote opened it
                if current_token != None:
                    tokens.append(current_token)
                quote = c
                current_token = Token(TT.DQUOTE if c == '"' else TT.SQUOTE, string="")
            else:
                if current_token == None:
                    current_token = Token(TT.NQUOTE, string="")
                if c.isspace():
                    if (
                        len(current_token.string) != 0
                        and current_token.ty != TT.WHITESPACE
                    ):
                        tokens.append(current_token)
                        current_token = Token(TT.WHITESPACE, string="")
                else:
                    if current_token.ty == TT.WHITESPACE:
                        tokens.append(current_token)
                        current_token = Token(TT.NQUOTE, string="")
                if c == "$":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(TT.DOLLAR, string="$"))
                elif c == "(":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(TT.LPAREN, string="("))
                elif c == ")":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(TT.RPAREN, string=")"))
                elif c == "|":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(TT.PIPE, string="|"))
                elif c == "&":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(TT.AMP, string="&"))
                elif c == ";":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(TT.SEMICOLON, string=";"))
                elif c == ">" or c == "<":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(TT.CARRET, string=c))
                else:
                    current_token.append_char(c)
        elif quote == "'":
            if c == "'":
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token == None:
                    current_token = Token(TT.SQUOTE, string="")
                current_token.append_char(c)
        elif quote == '"':
            if c == '"':
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token == None:
                    current_token = Token(TT.DQUOTE, string="")
                current_token.append_char(c)
        else:
            print("unreachable quote state")
        i += 1
    if current_token != None and current_token.ty == TT.NQUOTE:
        tokens.append(current_token)
    return tokens
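
A brief usage sketch of str_to_token on its own (invented input; output inferred from the code above):

    import str_to_token

    for tok in str_to_token.str_to_token('echo "hi there"'):
        print(tok.ty.name, repr(tok.string))

    # Expected, under these assumptions:
    #   NQUOTE 'echo'
    #   WHITESPACE ' '
    #   DQUOTE 'hi there'

Note the surrounding quotes are consumed; the DQUOTE token carries only the inner text.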
parser/token.py/ttoken.py (new file, 83 lines)
@@ -0,0 +1,83 @@
from enum import Enum
from dataclasses import dataclass

TokenType = Enum(
    "TokenType",
    [
        "AMP",
        "AND",
        "CARRET",
        "DOLLAR",
        "DQUOTE",
        "EXPENSION",
        "LCARRET",
        "LCARRET_DOUBLE",
        "LPAREN",
        "NQUOTE",
        "OR",
        "PIPE",
        "RCARRET",
        "RCARRET_DOUBLE",
        "RPAREN",
        "SEMICOLON",
        "SQUOTE",
        "WHITESPACE",
        "WORD",
    ],
)


@dataclass
class Token:
    ty: TokenType
    string: str = None
    subtokens: list = None

    def is_subtoken(self) -> bool:
        return self.subtokens != None

    def append_char(self, c: str):
        if self.string is None:
            raise Exception(
                f"Tried to push a char on a token that contains subtokens, TT={self.ty}"
            )
        self.string += c

    def is_word(self):
        return (
            self.ty == TokenType.SQUOTE
            or self.ty == TokenType.DQUOTE
            or self.ty == TokenType.NQUOTE
            or self.ty == TokenType.DOLLAR
        )


def print_tokenlist(tokens: list[Token], *, between="", end="\n"):
    for tok in tokens:
        col = "0"
        if tok.ty == TokenType.SQUOTE:
            col = "33"
        if tok.ty == TokenType.DQUOTE:
            col = "32"
        if tok.ty == TokenType.WHITESPACE:
            col = "31;4"
        if tok.ty == TokenType.DOLLAR:
            col = "31"
        if tok.ty == TokenType.LPAREN:
            col = "35"
        if tok.ty == TokenType.RPAREN:
            col = "35"
        if tok.ty == TokenType.AMP:
            col = "35"
        if tok.ty == TokenType.PIPE:
            col = "35"
        if tok.ty == TokenType.SEMICOLON:
            col = "35"
        if tok.ty == TokenType.CARRET:
            col = "35"
        if tok.is_subtoken():
            # nested WORD tokens are printed recursively on a gray background
            print_tokenlist(tok.subtokens, between="\x1b[100m", end="")
        else:
            print(f"\x1b[{col}m{between}{tok.string}\x1b[0m", end="")
    # print(end)


__all__ = ["TokenType", "Token", "print_tokenlist"]
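
To show the two shapes a Token can take (leaf with string, container with subtokens), a small sketch; all names come from ttoken.py above, the sample values are invented:

    from ttoken import Token, TokenType, print_tokenlist

    leaf = Token(TokenType.NQUOTE, string="ls")
    leaf.append_char("!")            # leaf tokens accumulate characters

    word = Token(TokenType.WORD, subtokens=[leaf])
    print(word.is_subtoken())        # True: container tokens hold other tokens

    # append_char on a container raises, so the two shapes cannot be mixed:
    try:
        word.append_char("x")
    except Exception as e:
        print(e)

    print_tokenlist([word, Token(TokenType.PIPE, string="|")])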