update: added python version of the tokenizer

parent 40a84e8248
commit 06c2d19097

6 changed files with 339 additions and 0 deletions
parser/token.py/.gitignore (vendored, new file)
@@ -0,0 +1 @@
+__pycache__
parser/token.py/app.py (new file)
@@ -0,0 +1,9 @@
+import make_token
+import prettier
+
+s = input("> ")
+print(s)
+first = make_token.me_tokenize(s)
+pass1 = prettier.pass1(first)
+
+prettier.print_tokenlist(pass1)
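app.py wires the two modules together: read a line, echo it, tokenize it, merge adjacent word pieces, and pretty-print the result. A minimal non-interactive sketch of the same pipeline (the input string here is made up for illustration):

    import make_token
    import prettier

    first = make_token.me_tokenize("echo 'hi there'")
    merged = prettier.pass1(first)
    prettier.print_tokenlist(merged)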
parser/token.py/flake.lock (generated, new file)
@@ -0,0 +1,60 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1726560853,
+        "narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1726931411,
+        "narHash": "sha256-Oxfw+YhT/RDdOmzYbtrFSkU2SwdO7UfbjXWuU6Bo4+o=",
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "rev": "c0e65bb8293c21f3aa0fdc9fae8dcccec187c1cf",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
parser/token.py/flake.nix (new file)
@@ -0,0 +1,35 @@
+{
+  description = "Flake utils demo";
+
+  inputs.nixpkgs.url = "github:nixos/nixpkgs";
+  inputs.flake-utils.url = "github:numtide/flake-utils";
+
+  outputs = {
+    self,
+    nixpkgs,
+    flake-utils,
+  }:
+    flake-utils.lib.eachDefaultSystem (
+      system: let
+        pkgs = nixpkgs.legacyPackages.${system};
+      in {
+        devShell = pkgs.mkShell {
+          packages = with pkgs;
+            [
+              gnumake
+              llvmPackages_18.bintools
+              tokei
+              coreutils
+              python312
+              tree
+            ]
+            ++ (
+              if system == "x86_64-linux"
+              then [valgrind valgrind.dev]
+              else []
+            );
+          #ASAN_OPTIONS = "strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1";
+        };
+      }
+    );
+}
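Assuming Nix with flakes enabled, running nix develop inside parser/token.py/ should drop into this dev shell (python312 is on the package list; valgrind is added only on x86_64-linux), after which python3 app.py starts the prompt.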
parser/token.py/make_token.py (new file)
@@ -0,0 +1,142 @@
+from enum import Enum
+from dataclasses import dataclass
+
+
+TokenType = Enum(
+    "TokenType",
+    [
+        "AMP",
+        "DOLLAR",
+        "DQUOTE",
+        "LPAREN",
+        "NQUOTE",
+        "PIPE",
+        "CARRET",
+        "RPAREN",
+        "SEMICOLON",
+        "SQUOTE",
+        "WHITESPACE",
+    ],
+)
+
+
+@dataclass
+class Token:
+    raw: str
+    ty: TokenType
+
+
+def print_tokenlist(tokens: list[Token]):
+    print("\n")
+    for tok in tokens:
+        col = "0"
+        if tok.ty == TokenType.SQUOTE:
+            col = "33"
+        if tok.ty == TokenType.DQUOTE:
+            col = "32"
+        if tok.ty == TokenType.WHITESPACE:
+            col = "31;4"
+        if tok.ty == TokenType.DOLLAR:
+            col = "31"
+        if tok.ty == TokenType.LPAREN:
+            col = "35"
+        if tok.ty == TokenType.RPAREN:
+            col = "35"
+        if tok.ty == TokenType.AMP:
+            col = "35"
+        if tok.ty == TokenType.PIPE:
+            col = "35"
+        if tok.ty == TokenType.SEMICOLON:
+            col = "35"
+        if tok.ty == TokenType.CARRET:
+            col = "35"
+        print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
+    print("\n")
+
+
+def is_quote(c: str):
+    return c == "'" or c == '"'
+
+
+def me_tokenize(s: str):
+    tokens = []
+    current_token = None
+    quote = 0
+    i = 0
+    while i < len(s):
+        c = s[i]
+        if quote == 0:
+            if is_quote(c):
+                if current_token is not None:
+                    tokens.append(current_token)
+                quote = c
+                current_token = Token(
+                    "", TokenType.DQUOTE if c == '"' else TokenType.SQUOTE
+                )
+            else:
+                if current_token is None:
+                    current_token = Token("", TokenType.NQUOTE)
+                if c.isspace():
+                    if (
+                        len(current_token.raw) != 0
+                        and current_token.ty != TokenType.WHITESPACE
+                    ):
+                        tokens.append(current_token)
+                        current_token = Token("", TokenType.WHITESPACE)
+                else:
+                    if current_token.ty == TokenType.WHITESPACE:
+                        tokens.append(current_token)
+                        current_token = Token("", TokenType.NQUOTE)
+                    if c == "$":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token("$", TokenType.DOLLAR))
+                    elif c == "(":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token("(", TokenType.LPAREN))
+                    elif c == ")":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(")", TokenType.RPAREN))
+                    elif c == "|":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token("|", TokenType.PIPE))
+                    elif c == "&":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token("&", TokenType.AMP))
+                    elif c == ";":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(";", TokenType.SEMICOLON))
+                    elif c == ">" or c == "<":
+                        tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(c, TokenType.CARRET))
+                    else:
+                        current_token.raw += c
+        elif quote == "'":
+            if c == "'":
+                tokens.append(current_token)
+                current_token = None
+                quote = 0
+            else:
+                if current_token is None:
+                    current_token = Token("", TokenType.SQUOTE)
+                current_token.raw += c
+
+        elif quote == '"':
+            if c == '"':
+                tokens.append(current_token)
+                current_token = None
+                quote = 0
+            else:
+                if current_token is None:
+                    current_token = Token("", TokenType.DQUOTE)
+                current_token.raw += c
+        else:
+            print("you fucked up your quote thingy")
+        i += 1
+    if current_token is not None and current_token.ty == TokenType.NQUOTE:
+        tokens.append(current_token)
+    return tokens
+
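For reference, a hand-traced sketch of what me_tokenize returns (traced from the code above; the input is illustrative). Two quirks are visible: WHITESPACE tokens keep an empty raw because the space characters themselves are never appended, and the specials flush whatever current token exists, even when its raw text is still empty, which yields the empty NQUOTE before DOLLAR below.

    import make_token as mt

    toks = mt.me_tokenize("ls $HOME")
    for t in toks:
        print(t)
    # Traced by hand, roughly:
    #   Token(raw='ls', ty=TokenType.NQUOTE)
    #   Token(raw='', ty=TokenType.WHITESPACE)
    #   Token(raw='', ty=TokenType.NQUOTE)
    #   Token(raw='$', ty=TokenType.DOLLAR)
    #   Token(raw='HOME', ty=TokenType.NQUOTE)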
parser/token.py/prettier.py (new file)
@@ -0,0 +1,92 @@
+from enum import Enum
+from dataclasses import dataclass
+import make_token as mt
+
+
+TokenType = Enum(
+    "TokenType",
+    [
+        "AND",
+        "DOLLAR",
+        "DQUOTE",
+        "EXPENSION",
+        "LCARRET",
+        "LCARRET_DOUBLE",
+        "LPAREN",
+        "NQUOTE",
+        "OR",
+        "PIPE",
+        "RCARRET",
+        "RCARRET_DOUBLE",
+        "RPAREN",
+        "SEMICOLON",
+        "SQUOTE",
+        "WHITESPACE",
+        "WORD",
+    ],
+)
+
+
+@dataclass
+class Token:
+    raw: str
+    raw_list: list
+    ty: TokenType
+
+    def is_list(self):
+        return self.ty == TokenType.WORD or self.ty == TokenType.EXPENSION
+
+
+def is_word_mt(tok: mt.Token):
+    return (
+        tok.ty == mt.TokenType.SQUOTE
+        or tok.ty == mt.TokenType.DQUOTE
+        or tok.ty == mt.TokenType.NQUOTE
+        or tok.ty == mt.TokenType.DOLLAR
+    )
+
+
+def pass1(tokens: list[mt.Token]):
+    i = 0
+    out = []
+    while i < len(tokens):
+        tok = tokens[i]
+        if is_word_mt(tok):
+            concat = Token("", [], TokenType.WORD)
+            concat.raw_list.append(tok)
+            j = 1
+            while i + j < len(tokens) and is_word_mt(tokens[i + j]):
+                concat.raw_list.append(tokens[i + j])
+                j += 1
+            out.append(concat)
+            i += j
+        else:
+            out.append(tok)
+            i += 1
+    return out
+
+
+def print_tokenlist(tokens: list[Token]):
+    print("\n")
+    for tok in tokens:
+        col = "0"
+        if tok.ty == TokenType.SQUOTE:
+            col = "33"
+        if tok.ty == TokenType.DQUOTE:
+            col = "32"
+        if tok.ty == TokenType.WHITESPACE:
+            col = "31;4"
+        if tok.ty == TokenType.DOLLAR:
+            col = "31"
+        if tok.ty == TokenType.LPAREN:
+            col = "35"
+        if tok.ty == TokenType.RPAREN:
+            col = "35"
+        if tok.ty == TokenType.PIPE:
+            col = "35"
+        if tok.ty == TokenType.SEMICOLON:
+            col = "35"
+        if not Token.is_list(tok):
+            print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
+        else:
+            print("NOT_PRINT_YET_LOL", end="")
+    print("\n")
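And a sketch of pass1's effect on such a stream (again hand-traced from the code above, with an illustrative input): adjacent word-ish tokens (SQUOTE, DQUOTE, NQUOTE, DOLLAR) collapse into a single WORD token whose raw_list keeps the original pieces, while everything else passes through unchanged.

    import make_token as mt
    import prettier

    merged = prettier.pass1(mt.me_tokenize("echo hi | cat"))
    for t in merged:
        # merged WORD tokens carry raw_list; passthrough mt tokens only have raw
        print(t.ty, getattr(t, "raw_list", t.raw))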