update: pushed more readable tokens

This commit is contained in:
maix0 2024-09-26 17:37:57 +02:00
parent 06c2d19097
commit 285104a19a
6 changed files with 199 additions and 239 deletions

View file

@ -1,9 +1,10 @@
@@ -1,9 +1,10 @@
-import make_token
+import str_to_token
 import prettier
+import ttoken
 s = input("> ")
-print(s);
+print(s)
-first = make_token.me_tokenize(s)
+tokens = str_to_token.str_to_token(s)
-pass1 = prettier.pass1(first)
+concated_tokens = concat.concat(tokens)
-prettier.print_tokenlist(pass1)
+ttoken.print_tokenlist(concated_tokens)
[NOTE(review): the new code calls concat.concat() but the file never imports concat — add "import concat".]

20
parser/token.py/concat.py Normal file
View file

@ -0,0 +1,20 @@
from ttoken import *
def concat(tokens: list[Token]):
    """Merge runs of adjacent word-like tokens into single WORD tokens.

    Consecutive tokens for which Token.is_word() is true (quoted strings,
    bare words, `$`) are collapsed into one Token(TokenType.WORD) whose
    `subtokens` list holds the originals in order.  All other tokens are
    passed through unchanged.  Returns a new list; `tokens` is untouched.
    """
    out = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if not tok.is_word():
            out.append(tok)
            i += 1
            continue
        word = Token(TokenType.WORD, subtokens=[tok])
        j = 1
        while i + j < len(tokens) and tokens[i + j].is_word():
            word.subtokens.append(tokens[i + j])
            j += 1
        out.append(word)
        # BUG FIX: advance by exactly the j tokens consumed.  The old code
        # did `i += j` and then also fell through to a shared `i += 1`,
        # which silently dropped the first token after every word run
        # (e.g. the `|` in "a|b" vanished from the output).
        i += j
    return out

View file

@ -1,142 +0,0 @@
from enum import Enum
from dataclasses import dataclass
class TokenType(Enum):
    """Lexical categories produced by me_tokenize()."""

    AMP = 1
    DOLLAR = 2
    DQUOTE = 3
    LPAREN = 4
    NQUOTE = 5
    PIPE = 6
    CARRET = 7
    RPAREN = 8
    SEMICOLON = 9
    SQUOTE = 10
    WHITESPACE = 11
@dataclass
class Token:
    """A single lexeme: its raw text plus its lexical category."""

    # literal characters of the token (may be empty while it is being built)
    raw: str
    # lexical category, one of TokenType
    ty: TokenType
def print_tokenlist(tokens: list[Token]):
    """Echo the token stream, colour-coding each token with ANSI SGR codes.

    Quotes, whitespace, `$` and the operators each get a distinct colour;
    any unlisted type prints with attribute "0" (reset).
    """
    palette = {
        TokenType.SQUOTE: "33",
        TokenType.DQUOTE: "32",
        TokenType.WHITESPACE: "31;4",
        TokenType.DOLLAR: "31",
        TokenType.LPAREN: "35",
        TokenType.RPAREN: "35",
        TokenType.AMP: "35",
        TokenType.PIPE: "35",
        TokenType.SEMICOLON: "35",
        TokenType.CARRET: "35",
    }
    print("\n")
    for tok in tokens:
        col = palette.get(tok.ty, "0")
        print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
    print("\n")
def is_quote(c: str):
    """Return True when *c* is a single- or double-quote character."""
    return c in {"'", '"'}
def me_tokenize(s: str):
    """Tokenize shell-like input *s* into a flat list of Token objects.

    A small state machine driven by `quote`:
      * quote == 0   -- unquoted text: whitespace and the single-character
        operators below delimit bare-word (NQUOTE) tokens;
      * quote == "'" -- inside single quotes until the closing ';
      * quote == '"' -- inside double quotes until the closing ".
    The quote characters themselves are not stored in the token text.
    NOTE(review): a still-open quoted token at end of input is dropped
    (only a pending NQUOTE is flushed) -- pre-existing behaviour, confirm
    it is intended.
    """
    # Single-character operators and the token type each one produces.
    operators = {
        "$": TokenType.DOLLAR,
        "(": TokenType.LPAREN,
        ")": TokenType.RPAREN,
        "|": TokenType.PIPE,
        "&": TokenType.AMP,
        ";": TokenType.SEMICOLON,  # BUG FIX: ';' was mis-tagged CARRET
        ">": TokenType.CARRET,
        "<": TokenType.CARRET,
    }
    tokens = []
    current_token = None
    quote = 0  # 0 while unquoted, else the opening quote character
    for c in s:
        if quote == 0:
            if is_quote(c):
                # Flush whatever was building, then open a quoted token.
                if current_token is not None:
                    tokens.append(current_token)
                quote = c
                current_token = Token(
                    "", TokenType.DQUOTE if c == '"' else TokenType.SQUOTE
                )
            else:
                if current_token is None:
                    current_token = Token("", TokenType.NQUOTE)
                if c.isspace():
                    # BUG FIX: record the whitespace characters so the echoed
                    # line keeps its spacing -- the old code dropped them,
                    # leaving every WHITESPACE token with empty raw text even
                    # though print_tokenlist underlines whitespace.
                    if current_token.ty != TokenType.WHITESPACE:
                        if len(current_token.raw) != 0:
                            tokens.append(current_token)
                        current_token = Token("", TokenType.WHITESPACE)
                    current_token.raw += c
                else:
                    # A whitespace run ends at the first non-space character.
                    if current_token.ty == TokenType.WHITESPACE:
                        tokens.append(current_token)
                        current_token = Token("", TokenType.NQUOTE)
                    if c in operators:
                        tokens.append(current_token)
                        current_token = None
                        tokens.append(Token(c, operators[c]))
                    else:
                        current_token.raw += c
        elif quote == "'":
            if c == "'":
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token("", TokenType.SQUOTE)
                current_token.raw += c
        elif quote == '"':
            if c == '"':
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token("", TokenType.DQUOTE)
                current_token.raw += c
        else:
            print("you fucked up you quote thingy")
    # Flush a trailing bare word; other pending token kinds are dropped.
    if current_token is not None and current_token.ty == TokenType.NQUOTE:
        tokens.append(current_token)
    return tokens

View file

@ -1,92 +0,0 @@
from enum import Enum
from dataclasses import dataclass
import make_token as mt
class TokenType(Enum):
    """Token categories used by the prettifier passes."""

    AND = 1
    DOLLAR = 2
    DQUOTE = 3
    EXPENSION = 4
    LCARRET = 5
    LCARRET_DOUBLE = 6
    LPAREN = 7
    NQUOTE = 8
    OR = 9
    PIPE = 10
    RCARRET = 11
    RCARRET_DOUBLE = 12
    RPAREN = 13
    SEMICOLON = 14
    SQUOTE = 15
    WHITESPACE = 16
    WORD = 17
@dataclass
class Token:
    """A prettifier token: a leaf (text in `raw`) or a composite
    (child make_token tokens collected in `raw_list`)."""

    raw: str        # literal text, meaningful for leaf tokens
    raw_list: list  # child tokens, meaningful for composite tokens
    ty: TokenType

    def is_list(self):
        """True for composite tokens whose content lives in raw_list."""
        return self.ty in (TokenType.WORD, TokenType.EXPENSION)
def is_word_mt(tok: mt.Token):
    """Return True when a make_token token can form part of a word."""
    word_types = (
        mt.TokenType.SQUOTE,
        mt.TokenType.DQUOTE,
        mt.TokenType.NQUOTE,
        mt.TokenType.DOLLAR,
    )
    return tok.ty in word_types
def pass1(tokens: list[mt.Token]):
    """Group runs of word-like make_token tokens into WORD tokens.

    Consecutive tokens accepted by is_word_mt() are collapsed into a
    single Token(ty=WORD) carrying the originals in `raw_list`; every
    other token passes through unchanged.  Returns a new list.
    """
    out = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if not is_word_mt(tok):
            out.append(tok)
            i += 1
            continue
        concat = Token("", [tok], TokenType.WORD)
        j = 1
        while i + j < len(tokens) and is_word_mt(tokens[i + j]):
            concat.raw_list.append(tokens[i + j])
            j += 1
        # BUG FIX: the grouped WORD token was built but never emitted, so
        # every word silently vanished from the output.
        out.append(concat)
        # BUG FIX: advance by exactly the j tokens consumed; the old
        # `i += j` plus the shared trailing `i += 1` also skipped the
        # first token after each word run.
        i += j
    return out
def print_tokenlist(tokens: list[Token]):
    """Colour-print a token list; composite tokens print a placeholder."""
    palette = {
        TokenType.SQUOTE: "33",
        TokenType.DQUOTE: "32",
        TokenType.WHITESPACE: "31;4",
        TokenType.DOLLAR: "31",
        TokenType.LPAREN: "35",
        TokenType.RPAREN: "35",
        TokenType.PIPE: "35",
        TokenType.SEMICOLON: "35",
    }
    print("\n")
    for tok in tokens:
        if tok.is_list():
            # composite (WORD/EXPENSION) rendering not implemented yet
            print("NOT_PRINT_YET_LOL", end="")
        else:
            col = palette.get(tok.ty, "0")
            print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
    print("\n")

View file

@ -0,0 +1,90 @@
from ttoken import *
TT = TokenType
def is_quote(c: str):
    """Return True when *c* is a single- or double-quote character."""
    return c in {"'", '"'}
def str_to_token(s: str):
    """Tokenize shell-like input *s* into a flat list of ttoken.Tokens.

    A small state machine driven by `quote`:
      * quote == 0   -- unquoted text: whitespace and the single-character
        operators below delimit bare-word (NQUOTE) tokens;
      * quote == "'" -- inside single quotes until the closing ';
      * quote == '"' -- inside double quotes until the closing ".
    The quote characters themselves are not stored in the token text.
    NOTE(review): a still-open quoted token at end of input is dropped
    (only a pending NQUOTE is flushed) -- pre-existing behaviour, confirm
    it is intended.
    """
    # Single-character operators and the token type each one produces.
    operators = {
        "$": TT.DOLLAR,
        "(": TT.LPAREN,
        ")": TT.RPAREN,
        "|": TT.PIPE,
        "&": TT.AMP,
        ";": TT.SEMICOLON,  # BUG FIX: ';' was mis-tagged CARRET
        ">": TT.CARRET,
        "<": TT.CARRET,
    }
    tokens = []
    current_token = None
    quote = 0  # 0 while unquoted, else the opening quote character
    for c in s:
        if quote == 0:
            if is_quote(c):
                # Flush whatever was building, then open a quoted token.
                if current_token is not None:
                    tokens.append(current_token)
                quote = c
                current_token = Token(TT.DQUOTE if c == '"' else TT.SQUOTE, string="")
            else:
                if current_token is None:
                    current_token = Token(TT.NQUOTE, string="")
                if c.isspace():
                    # BUG FIX: record the whitespace characters so the echoed
                    # line keeps its spacing -- the old code dropped them,
                    # leaving every WHITESPACE token with an empty string.
                    if current_token.ty != TT.WHITESPACE:
                        if len(current_token.string) != 0:
                            tokens.append(current_token)
                        current_token = Token(TT.WHITESPACE, string="")
                    current_token.append_char(c)
                else:
                    # A whitespace run ends at the first non-space character.
                    if current_token.ty == TT.WHITESPACE:
                        tokens.append(current_token)
                        current_token = Token(TT.NQUOTE, string="")
                    if c in operators:
                        tokens.append(current_token)
                        current_token = None
                        tokens.append(Token(operators[c], string=c))
                    else:
                        current_token.append_char(c)
        elif quote == "'":
            if c == "'":
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token(TT.SQUOTE, string="")
                current_token.append_char(c)
        elif quote == '"':
            if c == '"':
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token(TT.DQUOTE, string="")
                current_token.append_char(c)
        else:
            print("you fucked up you quote thingy")
    # Flush a trailing bare word; other pending token kinds are dropped.
    if current_token is not None and current_token.ty == TT.NQUOTE:
        tokens.append(current_token)
    return tokens

83
parser/token.py/ttoken.py Normal file
View file

@ -0,0 +1,83 @@
from enum import Enum
from dataclasses import dataclass
class TokenType(Enum):
    """Every lexical category used across the tokenizer passes."""

    AMP = 1
    AND = 2
    CARRET = 3
    DOLLAR = 4
    DQUOTE = 5
    EXPENSION = 6
    LCARRET = 7
    LCARRET_DOUBLE = 8
    LPAREN = 9
    NQUOTE = 10
    OR = 11
    PIPE = 12
    RCARRET = 13
    RCARRET_DOUBLE = 14
    RPAREN = 15
    SEMICOLON = 16
    SQUOTE = 17
    WHITESPACE = 18
    WORD = 19


@dataclass
class Token:
    """A token: either a leaf (text in `string`) or a composite
    (children in `subtokens`); normally exactly one of the two is set."""

    ty: TokenType
    string: str = None
    subtokens: list = None

    def is_subtoken(self) -> bool:
        """True for composite tokens that carry a subtokens list."""
        return self.subtokens is not None

    def append_char(self, c: str):
        """Append one character to a leaf token's text.

        Raises Exception when called on a composite token.
        """
        if self.string is None:
            raise Exception(
                f"Tried to push a char on a token that contains subtokens, TT={self.ty}"
            )
        self.string += c

    def is_word(self):
        """True when this token can be merged into a WORD token."""
        return self.ty in (
            TokenType.SQUOTE,
            TokenType.DQUOTE,
            TokenType.NQUOTE,
            TokenType.DOLLAR,
        )
def print_tokenlist(tokens: list[Token], *, between="", end="\n"):
    """Colour-print a token stream.

    Leaf tokens print their text in a per-type ANSI colour, prefixed by
    *between*; composite tokens recurse with a grey-background prefix.
    NOTE(review): the *end* parameter is currently unused (the trailing
    print was commented out in the original) -- kept for interface
    compatibility.
    """
    palette = {
        TokenType.SQUOTE: "33",
        TokenType.DQUOTE: "32",
        TokenType.WHITESPACE: "31;4",
        TokenType.DOLLAR: "31",
        TokenType.LPAREN: "35",
        TokenType.RPAREN: "35",
        TokenType.AMP: "35",
        TokenType.PIPE: "35",
        TokenType.SEMICOLON: "35",
        TokenType.CARRET: "35",
    }
    for tok in tokens:
        if tok.is_subtoken():
            print_tokenlist(tok.subtokens, between="\x1b[100m", end="")
        else:
            col = palette.get(tok.ty, "0")
            print(f"\x1b[{col}m{between}{tok.string}\x1b[0m", end="")


__all__ = ["TokenType", "Token", "print_tokenlist"]