update tokenizer to allow expansion

Maieul BOYER 2024-03-26 16:42:25 +01:00
parent d4a29b5bf2
commit d1f93444e0


@@ -1,5 +1,5 @@
 #![allow(dead_code)]
-use std::convert::Infallible as Never;
+use std::{borrow::Cow, collections::VecDeque, convert::Infallible as Never, pin::Pin};
 
 type WORD = String;
 type Rule2 = String;
@@ -88,6 +88,8 @@ struct TokenizerState<'input> {
     current_pos: usize,
     input: &'input str,
     remaining: &'input str,
+    invalid_quote: bool,
+    _marker: std::marker::PhantomPinned,
 }
 
 impl<'input> TokenizerState<'input> {
@@ -95,17 +97,34 @@ impl<'input> TokenizerState<'input> {
         Self {
             current_pos: 0,
             remaining: input,
+            invalid_quote: false,
             input,
+            _marker: std::marker::PhantomPinned,
         }
     }
 }
 
+// Cow<'input, str> is either an owned string (a `String`, so it will have to be freed by us)
+// or a borrowed &'input str, which we don't need to free (at least not ourselves).
 #[derive(Debug, Clone)]
 enum Token<'input> {
-    SingleQuote { val: &'input str, start_pos: usize },
-    DoubleQuote { val: &'input str, start_pos: usize },
-    WhiteSpace { val: &'input str, start_pos: usize },
-    Word { val: &'input str, start_pos: usize },
+    SingleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    DoubleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    WhiteSpace {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    Word {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
 }
 
 fn tokenizer<'state, 'input: 'state>(
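Note on the switch to Cow: ordinary tokens can keep borrowing slices of the input line, while tokens produced by a future substitution (which have no backing slice in the input) can carry an owned String behind the same type. A minimal standalone sketch of that distinction, separate from the commit:

use std::borrow::Cow;

fn main() {
    let input = "echo $HOME";
    // Borrowed: points into `input`, nothing for us to free.
    let word: Cow<'_, str> = Cow::from(&input[..4]);
    // Owned: a substitution result has no backing slice in `input`.
    let expanded: Cow<'_, str> = Cow::from(String::from("/home/user"));
    assert!(matches!(word, Cow::Borrowed(_)));
    assert!(matches!(expanded, Cow::Owned(_)));
    println!("{word} -> {expanded}");
}

Both values print and compare like strings; only their ownership differs, which is exactly what the new Token variants need.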
@@ -131,12 +150,13 @@ fn tokenizer<'state, 'input: 'state>(
                 chars.next();
             }
             let skip = chars.peek() == Some(&'\'');
+            state.invalid_quote |= !skip;
             let old_current = state.current_pos;
             state.current_pos += len;
             let old_remaining = state.remaining;
             state.remaining = &state.remaining[(len + skip as usize)..];
             return Some(Token::SingleQuote {
-                val: &old_remaining[1..len],
+                val: old_remaining[1..len].into(),
                 start_pos: old_current,
             });
         }
@@ -149,12 +169,13 @@ fn tokenizer<'state, 'input: 'state>(
                 escaped = chars.next() == Some('\\');
             }
             let skip = chars.peek() == Some(&'\"');
+            state.invalid_quote |= !skip;
             let old_current = state.current_pos;
             state.current_pos += len;
             let old_remaining = state.remaining;
             state.remaining = &state.remaining[(len + skip as usize)..];
             return Some(Token::DoubleQuote {
-                val: &old_remaining[1..len],
+                val: old_remaining[1..len].into(),
                 start_pos: old_current,
             });
         }
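Note on the new invalid_quote flag: `skip` is true only when the closing quote was actually found, so `state.invalid_quote |= !skip` latches to true the first time a quote is left unterminated and stays set for the rest of the line. A standalone sketch of that latching pattern (the inputs are made up):

fn main() {
    let mut invalid_quote = false;
    for (fragment, found_closing) in [("'ok'", true), ("'oops", false), ("'ok'", true)] {
        let skip = found_closing; // stands in for chars.peek() == Some(&'\'')
        invalid_quote |= !skip;
        println!("{fragment}: terminated={found_closing}, invalid_quote={invalid_quote}");
    }
    // Once set, the flag never resets within the line.
    assert!(invalid_quote);
}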
@@ -163,13 +184,12 @@ fn tokenizer<'state, 'input: 'state>(
         let was_whitespace = chr.is_ascii_whitespace();
         while let Some(&chr) = chars.peek() {
             if chr.is_ascii_whitespace() && !escaped && !was_whitespace {
-                dbg!(state.current_pos);
                 let old_current = state.current_pos;
                 state.current_pos += len;
                 let old_remaining = state.remaining;
                 state.remaining = &state.remaining[len..];
                 return Some(Token::Word {
-                    val: &old_remaining[..len],
+                    val: old_remaining[..len].into(),
                     start_pos: old_current,
                 });
             } else if !chr.is_ascii_whitespace() && was_whitespace {
@@ -178,7 +198,7 @@ fn tokenizer<'state, 'input: 'state>(
                 let old_remaining = state.remaining;
                 state.remaining = &state.remaining[len..];
                 return Some(Token::WhiteSpace {
-                    val: &old_remaining[..len],
+                    val: old_remaining[..len].into(),
                     start_pos: old_current,
                 });
             }
@@ -191,26 +211,118 @@ fn tokenizer<'state, 'input: 'state>(
         state.remaining = &state.remaining[len..];
         Some(if was_whitespace {
             Token::WhiteSpace {
-                val: &old_remaining[..len],
+                val: old_remaining[..len].into(),
                 start_pos: old_current,
             }
         } else {
             Token::Word {
-                val: &old_remaining[..len],
+                val: old_remaining[..len].into(),
                 start_pos: old_current,
             }
         })
     })
 }
 
+struct TokenizerWrapper<'input> {
+    _marker: std::marker::PhantomPinned,
+    first_pass: TokenizerState<'input>,
+    iter: Box<dyn Iterator<Item = Token<'input>> + 'input>,
+}
+
+impl<'input> TokenizerWrapper<'input> {
+    fn new(s: TokenizerState<'input>) -> Pin<Box<Self>> {
+        let mut value = Box::new(std::mem::MaybeUninit::<Self>::uninit());
+        unsafe {
+            //
+            let ptr: *mut Self = value.as_mut_ptr();
+            std::ptr::write(std::ptr::addr_of_mut!((*ptr).first_pass), s);
+            std::ptr::write(
+                std::ptr::addr_of_mut!((*ptr).iter),
+                Box::new(tokenizer(&mut *std::ptr::addr_of_mut!((*ptr).first_pass))),
+            );
+            std::mem::transmute(Pin::new_unchecked(value))
+        }
+    }
+}
+
+impl<'input> TokenizerWrapper<'input> {
+    fn next(self: &mut Pin<Box<Self>>) -> Option<Token<'input>> {
+        unsafe { Pin::into_inner_unchecked(self.as_mut()).iter.next() }
+    }
+}
+
+struct ExpenderState<'input> {
+    // These will be inserted when a substitution is made, like $HOME.
+    // If it is "$HOME" (quoted), no splitting should be done. So: if there is anything
+    // waiting to be pushed, push it; otherwise get the next token from `iter`, expand it
+    // if needed, and voilà.
+    need_push: VecDeque<ExpendedToken<'input>>,
+    // This field only exists because of how the Rust version is written, returning an
+    // iterator instead of one token per function call. It shouldn't be reflected in the
+    // C code, where we will just call 'get_next_token(&state)' and it will give us the
+    // next token (or EOF if no more tokens are present).
+    tokenizer: Pin<Box<TokenizerWrapper<'input>>>,
+}
+
+#[derive(Debug, Clone)]
+enum ExpendedToken<'input> {
+    SingleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    DoubleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    WhiteSpace {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    Word {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+}
+
+impl<'input> ExpenderState<'input> {
+    fn new(input: &'input str) -> Self {
+        let wrapper = TokenizerWrapper::new(TokenizerState::new(input));
+        Self {
+            need_push: VecDeque::new(),
+            tokenizer: wrapper,
+        }
+    }
+}
+
+fn expend<'state, 'input: 'state>(
+    input: &'state mut ExpenderState<'input>,
+) -> impl Iterator<Item = ExpendedToken<'input>> + 'state {
+    std::iter::from_fn(|| {
+        if !input.need_push.is_empty() {
+            input.need_push.pop_front()
+        } else {
+            input.tokenizer.next().map(|t| match t {
+                Token::Word { val, start_pos } => ExpendedToken::Word { val, start_pos },
+                Token::DoubleQuote { val, start_pos } => {
+                    ExpendedToken::DoubleQuote { val, start_pos }
+                }
+                Token::SingleQuote { val, start_pos } => {
+                    ExpendedToken::SingleQuote { val, start_pos }
+                }
+                Token::WhiteSpace { val, start_pos } => {
+                    ExpendedToken::WhiteSpace { val, start_pos }
+                }
+            })
+        }
+    })
+}
+
 fn main() {
     for line in std::io::stdin().lines() {
         let line = line.unwrap();
-        let mut state = TokenizerState::new(&line);
+        let mut state = ExpenderState::new(&line);
         println!("line is = '{line}'");
-        println!(
-            "token are = {:?}",
-            tokenizer(&mut state).collect::<Vec<_>>()
-        );
+        println!("token are = {:?}", expend(&mut state).collect::<Vec<_>>());
     }
 }
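Why the Pin/PhantomPinned machinery above: TokenizerWrapper stores both the TokenizerState and an iterator that borrows it inside the same allocation, so the wrapper must never move after construction. The commit builds this with MaybeUninit and a transmute; the sketch below shows the same pinning guarantee using the simpler pattern from the std::pin documentation (the types here are illustrative, not the commit's):

use std::marker::PhantomPinned;
use std::pin::Pin;

struct SelfRef {
    data: String,
    // Raw pointer into `data`; it would dangle if the struct were moved.
    ptr: *const String,
    _marker: PhantomPinned,
}

fn new_self_ref(data: String) -> Pin<Box<SelfRef>> {
    let mut boxed = Box::pin(SelfRef {
        data,
        ptr: std::ptr::null(),
        _marker: PhantomPinned,
    });
    let ptr = &boxed.data as *const String;
    // Safe here: we only fill in the pointer, we never move the pinned value.
    unsafe { Pin::get_unchecked_mut(Pin::as_mut(&mut boxed)).ptr = ptr };
    boxed
}

fn main() {
    let s = new_self_ref(String::from("hello"));
    // Valid because Pin<Box<_>> plus PhantomPinned forbid moving the value out.
    unsafe { println!("{}", *s.ptr) };
}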
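And the need_push queue: once substitution exists, one token may expand to several (an unquoted $HOME whose value is then word-split), and the extras must be handed out before the tokenizer is pulled again. A sketch of that drain-queue-first shape over plain strings, with a hypothetical expand_word standing in for substitution logic this commit has not written yet:

use std::collections::VecDeque;

// Hypothetical stand-in: one raw word may expand to several words.
fn expand_word(word: &str) -> Vec<String> {
    if word == "$WORDS" {
        vec!["several".into(), "words".into()]
    } else {
        vec![word.to_string()]
    }
}

fn main() {
    let mut source = ["echo", "$WORDS", "done"].into_iter();
    let mut need_push: VecDeque<String> = VecDeque::new();

    let tokens = std::iter::from_fn(|| {
        // Drain queued expansion results before pulling a new token.
        if let Some(queued) = need_push.pop_front() {
            return Some(queued);
        }
        source.next().map(|raw| {
            let mut expanded = expand_word(raw);
            let first = expanded.remove(0);
            need_push.extend(expanded); // extras wait for the next calls
            first
        })
    });

    println!("{:?}", tokens.collect::<Vec<_>>());
    // => ["echo", "several", "words", "done"]
}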