update: added python version of the tokenizer

maix0 2024-09-26 16:51:01 +02:00
parent 40a84e8248
commit 06c2d19097
6 changed files with 339 additions and 0 deletions

1
parser/token.py/.gitignore vendored Normal file

@@ -0,0 +1 @@
__pycache__

9
parser/token.py/app.py Normal file

@@ -0,0 +1,9 @@
import make_token
import prettier

# Read one line, tokenize it, merge word pieces, then print the result.
s = input("> ")
print(s)
first = make_token.me_tokenize(s)
pass1 = prettier.pass1(first)
prettier.print_tokenlist(pass1)
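For quick testing, a minimal non-interactive variant of app.py can push a fixed string through the same pipeline; the sample line is illustrative only and not part of the commit:

import make_token
import prettier

# Tokenize a hard-coded line instead of prompting on stdin.
line = 'echo "hello world" | cat'
tokens = make_token.me_tokenize(line)
merged = prettier.pass1(tokens)
prettier.print_tokenlist(merged)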

60
parser/token.py/flake.lock generated Normal file

@@ -0,0 +1,60 @@
{
  "nodes": {
    "flake-utils": {
      "inputs": {
        "systems": "systems"
      },
      "locked": {
        "lastModified": 1726560853,
        "narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1726931411,
        "narHash": "sha256-Oxfw+YhT/RDdOmzYbtrFSkU2SwdO7UfbjXWuU6Bo4+o=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "c0e65bb8293c21f3aa0fdc9fae8dcccec187c1cf",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs"
      }
    },
    "systems": {
      "locked": {
        "lastModified": 1681028828,
        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
        "owner": "nix-systems",
        "repo": "default",
        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
        "type": "github"
      },
      "original": {
        "owner": "nix-systems",
        "repo": "default",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}

35
parser/token.py/flake.nix Normal file

@@ -0,0 +1,35 @@
{
  description = "Flake utils demo";

  inputs.nixpkgs.url = "github:nixos/nixpkgs";
  inputs.flake-utils.url = "github:numtide/flake-utils";

  outputs = {
    self,
    nixpkgs,
    flake-utils,
  }:
    flake-utils.lib.eachDefaultSystem (
      system: let
        pkgs = nixpkgs.legacyPackages.${system};
      in {
        devShell = pkgs.mkShell {
          packages = with pkgs;
            [
              gnumake
              llvmPackages_18.bintools
              tokei
              coreutils
              python312
              tree
            ]
            ++ (
              if system == "x86_64-linux"
              then [valgrind valgrind.dev]
              else []
            );
          #ASAN_OPTIONS = "strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1";
        };
      }
    );
}

142
parser/token.py/make_token.py Normal file

@@ -0,0 +1,142 @@
from enum import Enum
from dataclasses import dataclass


# Raw token kinds produced by the first tokenizer pass.
TokenType = Enum(
    "TokenType",
    [
        "AMP",
        "DOLLAR",
        "DQUOTE",
        "LPAREN",
        "NQUOTE",
        "PIPE",
        "CARRET",
        "RPAREN",
        "SEMICOLON",
        "SQUOTE",
        "WHITESPACE",
    ],
)


@dataclass
class Token:
    raw: str
    ty: TokenType


def print_tokenlist(tokens: list[Token]):
    # Print each token with an ANSI color chosen from its type.
    print("\n")
    for tok in tokens:
        col = "0"
        if tok.ty == TokenType.SQUOTE:
            col = "33"
        if tok.ty == TokenType.DQUOTE:
            col = "32"
        if tok.ty == TokenType.WHITESPACE:
            col = "31;4"
        if tok.ty == TokenType.DOLLAR:
            col = "31"
        if tok.ty == TokenType.LPAREN:
            col = "35"
        if tok.ty == TokenType.RPAREN:
            col = "35"
        if tok.ty == TokenType.AMP:
            col = "35"
        if tok.ty == TokenType.PIPE:
            col = "35"
        if tok.ty == TokenType.SEMICOLON:
            col = "35"
        if tok.ty == TokenType.CARRET:
            col = "35"
        print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
    print("\n")


def is_quote(c: str):
    return c == "'" or c == '"'


def me_tokenize(s: str):
    # Split the input line into quoted, word, whitespace and operator tokens.
    tokens = []
    current_token = None
    quote = 0
    i = 0
    while i < len(s):
        c = s[i]
        if quote == 0:
            if is_quote(c):
                # Opening quote: flush the current token and start a quoted one.
                if current_token is not None:
                    tokens.append(current_token)
                quote = c
                current_token = Token(
                    "", TokenType.DQUOTE if c == '"' else TokenType.SQUOTE
                )
            else:
                if current_token is None:
                    current_token = Token("", TokenType.NQUOTE)
                if c.isspace():
                    if (
                        len(current_token.raw) != 0
                        and current_token.ty != TokenType.WHITESPACE
                    ):
                        tokens.append(current_token)
                        current_token = Token("", TokenType.WHITESPACE)
                else:
                    if current_token.ty == TokenType.WHITESPACE:
                        tokens.append(current_token)
                        current_token = Token("", TokenType.NQUOTE)
                # Operator characters end the current token and emit their own token.
                if c == "$":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token("$", TokenType.DOLLAR))
                elif c == "(":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token("(", TokenType.LPAREN))
                elif c == ")":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(")", TokenType.RPAREN))
                elif c == "|":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token("|", TokenType.PIPE))
                elif c == "&":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token("&", TokenType.AMP))
                elif c == ";":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(";", TokenType.SEMICOLON))
                elif c == ">" or c == "<":
                    tokens.append(current_token)
                    current_token = None
                    tokens.append(Token(c, TokenType.CARRET))
                else:
                    current_token.raw += c
        elif quote == "'":
            if c == "'":
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token("", TokenType.SQUOTE)
                current_token.raw += c
        elif quote == '"':
            if c == '"':
                tokens.append(current_token)
                current_token = None
                quote = 0
            else:
                if current_token is None:
                    current_token = Token("", TokenType.DQUOTE)
                current_token.raw += c
        else:
            print("invalid quote state")
        i += 1
    if current_token is not None and current_token.ty == TokenType.NQUOTE:
        tokens.append(current_token)
    return tokens
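A rough sketch of exercising me_tokenize on its own (assuming make_token.py is on the import path); it prints each token's type name and raw text instead of the colored view, and the sample command line is arbitrary:

import make_token

# Inspect how a simple pipeline is split into tokens.
for tok in make_token.me_tokenize("ls -l | wc -l"):
    print(tok.ty.name, repr(tok.raw))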

92
parser/token.py/prettier.py Normal file

@@ -0,0 +1,92 @@
from enum import Enum
from dataclasses import dataclass

import make_token as mt


# Token kinds for the second pass; WORD groups adjacent word-like raw tokens.
TokenType = Enum(
    "TokenType",
    [
        "AND",
        "DOLLAR",
        "DQUOTE",
        "EXPENSION",
        "LCARRET",
        "LCARRET_DOUBLE",
        "LPAREN",
        "NQUOTE",
        "OR",
        "PIPE",
        "RCARRET",
        "RCARRET_DOUBLE",
        "RPAREN",
        "SEMICOLON",
        "SQUOTE",
        "WHITESPACE",
        "WORD",
    ],
)


@dataclass
class Token:
    raw: str
    raw_list: list
    ty: TokenType

    def is_list(self):
        # WORD and EXPENSION tokens carry their pieces in raw_list, not raw.
        return self.ty == TokenType.WORD or self.ty == TokenType.EXPENSION


def is_word_mt(tok: mt.Token):
    return (
        tok.ty == mt.TokenType.SQUOTE
        or tok.ty == mt.TokenType.DQUOTE
        or tok.ty == mt.TokenType.NQUOTE
        or tok.ty == mt.TokenType.DOLLAR
    )


def pass1(tokens: list[mt.Token]):
    # Merge runs of adjacent word-like tokens into a single WORD token;
    # every other make_token token is passed through unchanged.
    i = 0
    out = []
    while i < len(tokens):
        tok = tokens[i]
        if is_word_mt(tok):
            concat = Token("", [], TokenType.WORD)
            concat.raw_list.append(tok)
            j = 1
            while i + j < len(tokens) and is_word_mt(tokens[i + j]):
                concat.raw_list.append(tokens[i + j])
                j += 1
            out.append(concat)
            i += j
        else:
            out.append(tok)
            i += 1
    return out


def print_tokenlist(tokens: list[Token]):
    print("\n")
    for tok in tokens:
        col = "0"
        if tok.ty == TokenType.SQUOTE:
            col = "33"
        if tok.ty == TokenType.DQUOTE:
            col = "32"
        if tok.ty == TokenType.WHITESPACE:
            col = "31;4"
        if tok.ty == TokenType.DOLLAR:
            col = "31"
        if tok.ty == TokenType.LPAREN:
            col = "35"
        if tok.ty == TokenType.RPAREN:
            col = "35"
        if tok.ty == TokenType.PIPE:
            col = "35"
        if tok.ty == TokenType.SEMICOLON:
            col = "35"
        # Called as a plain function because pass1 output also contains
        # make_token.Token objects, which have no is_list method.
        if not Token.is_list(tok):
            print(f"\x1b[{col}m{tok.raw}\x1b[0m", end="")
        else:
            print("NOT_PRINT_YET_LOL", end="")
    print("\n")
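A minimal sketch of pass1 in isolation (assuming both modules are importable): runs of adjacent word-like tokens come back as one WORD token whose raw_list holds the original make_token tokens, while operators and whitespace pass through unchanged; the input line is illustrative only:

import make_token
import prettier

# Only the grouping behaviour matters here, not the exact token stream.
merged = prettier.pass1(make_token.me_tokenize('cat file"name" > out'))
for tok in merged:
    if isinstance(tok, prettier.Token) and tok.is_list():
        print("WORD:", [t.raw for t in tok.raw_list])
    else:
        print(tok.ty.name, repr(tok.raw))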