From 06c2d19097c991dde6b5ed434354113fe3441a0d Mon Sep 17 00:00:00 2001
From: maix0
Date: Thu, 26 Sep 2024 16:51:01 +0200
Subject: [PATCH] update: added python version of the tokenizer

---
 parser/token.py/.gitignore    |   1 +
 parser/token.py/app.py        |   9 +++
 parser/token.py/flake.lock    |  60 ++++++++++++++
 parser/token.py/flake.nix     |  35 +++++++++
 parser/token.py/make_token.py | 128 ++++++++++++++++++++++++++++
 parser/token.py/prettier.py   |  97 ++++++++++++++++++
 6 files changed, 330 insertions(+)
 create mode 100644 parser/token.py/.gitignore
 create mode 100644 parser/token.py/app.py
 create mode 100644 parser/token.py/flake.lock
 create mode 100644 parser/token.py/flake.nix
 create mode 100644 parser/token.py/make_token.py
 create mode 100644 parser/token.py/prettier.py

diff --git a/parser/token.py/.gitignore b/parser/token.py/.gitignore
new file mode 100644
index 00000000..bee8a64b
--- /dev/null
+++ b/parser/token.py/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/parser/token.py/app.py b/parser/token.py/app.py
new file mode 100644
index 00000000..6f121d82
--- /dev/null
+++ b/parser/token.py/app.py
@@ -0,0 +1,9 @@
+import make_token
+import prettier
+
+s = input("> ")
+print(s)
+first = make_token.me_tokenize(s)
+pass1 = prettier.pass1(first)
+
+prettier.print_tokenlist(pass1)
diff --git a/parser/token.py/flake.lock b/parser/token.py/flake.lock
new file mode 100644
index 00000000..74dbb08f
--- /dev/null
+++ b/parser/token.py/flake.lock
@@ -0,0 +1,60 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1726560853,
+        "narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1726931411,
+        "narHash": "sha256-Oxfw+YhT/RDdOmzYbtrFSkU2SwdO7UfbjXWuU6Bo4+o=",
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "rev": "c0e65bb8293c21f3aa0fdc9fae8dcccec187c1cf",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/parser/token.py/flake.nix b/parser/token.py/flake.nix
new file mode 100644
index 00000000..86d52a91
--- /dev/null
+++ b/parser/token.py/flake.nix
@@ -0,0 +1,35 @@
+{
+  description = "Flake utils demo";
+
+  inputs.nixpkgs.url = "github:nixos/nixpkgs";
+  inputs.flake-utils.url = "github:numtide/flake-utils";
+
+  outputs = {
+    self,
+    nixpkgs,
+    flake-utils,
+  }:
+    flake-utils.lib.eachDefaultSystem (
+      system: let
+        pkgs = nixpkgs.legacyPackages.${system};
+      in {
+        devShell = pkgs.mkShell {
+          packages = with pkgs;
+            [
+              gnumake
+              llvmPackages_18.bintools
+              tokei
+              coreutils
+              python312
+              tree
+            ]
+            ++ (
+              if system == "x86_64-linux"
+              then [valgrind valgrind.dev]
+              else []
+            );
+          #ASAN_OPTIONS = "strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1";
+        };
+      }
+    );
+}
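[Note: the flake above only provisions a dev shell; it does not build anything. A likely way to enter it and run the demo REPL, assuming Nix flakes are enabled on the machine:

    $ cd parser/token.py
    $ nix develop
    $ python3 app.py
    > echo 'hello world'
]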
diff --git a/parser/token.py/make_token.py b/parser/token.py/make_token.py
new file mode 100644
index 00000000..074dccd5
--- /dev/null
+++ b/parser/token.py/make_token.py
@@ -0,0 +1,128 @@
+from enum import Enum
+from dataclasses import dataclass
+
+
+TokenType = Enum(
+    "TokenType",
+    [
+        "AMP",
+        "DOLLAR",
+        "DQUOTE",
+        "LPAREN",
+        "NQUOTE",
+        "PIPE",
+        "CARRET",
+        "RPAREN",
+        "SEMICOLON",
+        "SQUOTE",
+        "WHITESPACE",
+    ],
+)
+
+
+@dataclass
+class Token:
+    raw: str
+    ty: TokenType
+
+
+# ANSI colour per token type, used to visualise the token stream.
+COLORS = {
+    TokenType.SQUOTE: "33",
+    TokenType.DQUOTE: "32",
+    TokenType.WHITESPACE: "31;4",
+    TokenType.DOLLAR: "31",
+    TokenType.LPAREN: "35",
+    TokenType.RPAREN: "35",
+    TokenType.AMP: "35",
+    TokenType.PIPE: "35",
+    TokenType.SEMICOLON: "35",
+    TokenType.CARRET: "35",
+}
+
+
+def print_tokenlist(tokens: list[Token]):
+    print("\n")
+    for tok in tokens:
+        col = COLORS.get(tok.ty, "0")
+        print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
+    print("\n")
+
+
+def is_quote(c: str):
+    return c == "'" or c == '"'
+
+
+# Single-character operators and the token type they map to.
+SPECIALS = {
+    "$": TokenType.DOLLAR,
+    "(": TokenType.LPAREN,
+    ")": TokenType.RPAREN,
+    "|": TokenType.PIPE,
+    "&": TokenType.AMP,
+    ";": TokenType.SEMICOLON,
+    ">": TokenType.CARRET,
+    "<": TokenType.CARRET,
+}
+
+
+def me_tokenize(s: str):
+    tokens = []
+    current_token = None
+    quote = 0
+    i = 0
+    while i < len(s):
+        c = s[i]
+        if quote == 0:
+            if is_quote(c):
+                # Entering a quoted section: flush whatever was pending.
+                if current_token is not None and current_token.raw:
+                    tokens.append(current_token)
+                quote = c
+                current_token = Token(
+                    "", TokenType.DQUOTE if c == '"' else TokenType.SQUOTE
+                )
+            else:
+                if current_token is None:
+                    current_token = Token("", TokenType.NQUOTE)
+                if c.isspace():
+                    # Collapse runs of whitespace into one WHITESPACE token.
+                    if current_token.ty != TokenType.WHITESPACE:
+                        if current_token.raw:
+                            tokens.append(current_token)
+                        current_token = Token("", TokenType.WHITESPACE)
+                    current_token.raw += c
+                else:
+                    if current_token.ty == TokenType.WHITESPACE:
+                        tokens.append(current_token)
+                        current_token = Token("", TokenType.NQUOTE)
+                    if c in SPECIALS:
+                        # Operators end the pending token and stand alone.
+                        if current_token.raw:
+                            tokens.append(current_token)
+                        current_token = None
+                        tokens.append(Token(c, SPECIALS[c]))
+                    else:
+                        current_token.raw += c
+        elif quote == "'":
+            # Inside single quotes everything is literal until the next '.
+            if c == "'":
+                tokens.append(current_token)
+                current_token = None
+                quote = 0
+            else:
+                current_token.raw += c
+        elif quote == '"':
+            if c == '"':
+                tokens.append(current_token)
+                current_token = None
+                quote = 0
+            else:
+                current_token.raw += c
+        else:
+            print("invalid quote state")
+        i += 1
+    # Flush a trailing unquoted word (trailing whitespace is dropped).
+    if current_token is not None and current_token.ty == TokenType.NQUOTE:
+        tokens.append(current_token)
+    return tokens
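[Note: a quick sanity check of what the tokenizer produces for a small pipeline, reflecting the code above (run from parser/token.py so the module resolves):

    >>> import make_token as mt
    >>> toks = mt.me_tokenize('echo "a b" | cat')
    >>> [(t.ty.name, t.raw) for t in toks]
    [('NQUOTE', 'echo'), ('WHITESPACE', ' '), ('DQUOTE', 'a b'),
     ('WHITESPACE', ' '), ('PIPE', '|'), ('WHITESPACE', ' '),
     ('NQUOTE', 'cat')]
]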
diff --git a/parser/token.py/prettier.py b/parser/token.py/prettier.py
new file mode 100644
index 00000000..a60ba8fc
--- /dev/null
+++ b/parser/token.py/prettier.py
@@ -0,0 +1,97 @@
+from enum import Enum
+from dataclasses import dataclass
+
+import make_token as mt
+
+
+TokenType = Enum(
+    "TokenType",
+    [
+        "AND",
+        "DOLLAR",
+        "DQUOTE",
+        "EXPANSION",
+        "LCARRET",
+        "LCARRET_DOUBLE",
+        "LPAREN",
+        "NQUOTE",
+        "OR",
+        "PIPE",
+        "RCARRET",
+        "RCARRET_DOUBLE",
+        "RPAREN",
+        "SEMICOLON",
+        "SQUOTE",
+        "WHITESPACE",
+        "WORD",
+    ],
+)
+
+
+@dataclass
+class Token:
+    raw: str
+    raw_list: list
+    ty: TokenType
+
+    def is_list(self):
+        return self.ty == TokenType.WORD or self.ty == TokenType.EXPANSION
+
+
+def is_word_mt(tok: mt.Token):
+    # Token types that may be glued to their neighbours to form one word.
+    return (
+        tok.ty == mt.TokenType.SQUOTE
+        or tok.ty == mt.TokenType.DQUOTE
+        or tok.ty == mt.TokenType.NQUOTE
+        or tok.ty == mt.TokenType.DOLLAR
+    )
+
+
+def pass1(tokens: list[mt.Token]):
+    # Merge runs of adjacent word-like tokens into a single WORD token;
+    # whitespace and operators pass through unchanged, so the output
+    # mixes mt.Token and Token values.
+    i = 0
+    out = []
+    while i < len(tokens):
+        tok = tokens[i]
+        if is_word_mt(tok):
+            concat = Token("", [], TokenType.WORD)
+            concat.raw_list.append(tok)
+            j = 1
+            while i + j < len(tokens) and is_word_mt(tokens[i + j]):
+                concat.raw_list.append(tokens[i + j])
+                j += 1
+            out.append(concat)
+            i += j
+        else:
+            out.append(tok)
+            i += 1
+    return out
+
+
+# ANSI colour per token type, used to visualise the token stream.
+COLORS = {
+    TokenType.SQUOTE: "33",
+    TokenType.DQUOTE: "32",
+    TokenType.WHITESPACE: "31;4",
+    TokenType.DOLLAR: "31",
+    TokenType.LPAREN: "35",
+    TokenType.RPAREN: "35",
+    TokenType.PIPE: "35",
+    TokenType.SEMICOLON: "35",
+}
+
+
+def print_tokenlist(tokens: list[Token]):
+    print("\n")
+    for tok in tokens:
+        col = COLORS.get(tok.ty, "0")
+        # Token.is_list is called unbound because the list may also hold
+        # mt.Token values, which have no is_list method of their own.
+        if not Token.is_list(tok):
+            print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
+        else:
+            print("NOT_PRINT_YET_LOL", end="")
+    print("\n")
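[Note: print_tokenlist above still prints a placeholder for merged WORD tokens. One possible flattening for display, sketched here as a hypothetical helper that is not part of the patch (it assumes raw_list holds make_token tokens, as pass1 builds it):

    import make_token as mt
    import prettier

    def word_to_str(word: prettier.Token) -> str:
        # Hypothetical helper: re-quote each piece so the merged
        # word stays readable when printed.
        parts = []
        for piece in word.raw_list:
            if piece.ty == mt.TokenType.SQUOTE:
                parts.append(f"'{piece.raw}'")
            elif piece.ty == mt.TokenType.DQUOTE:
                parts.append(f'"{piece.raw}"')
            else:
                parts.append(piece.raw)
        return "".join(parts)
]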