From d1f93444e0284538eb08f48e1ff591e2db9cb9ec Mon Sep 17 00:00:00 2001
From: Maieul BOYER
Date: Tue, 26 Mar 2024 16:42:25 +0100
Subject: [PATCH] update tokenizer to allow expansion

---
 rust/parser/src/main.rs | 146 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 129 insertions(+), 17 deletions(-)

diff --git a/rust/parser/src/main.rs b/rust/parser/src/main.rs
index b8b7ac4c..7881b4bd 100644
--- a/rust/parser/src/main.rs
+++ b/rust/parser/src/main.rs
@@ -1,5 +1,5 @@
 #![allow(dead_code)]
-use std::convert::Infallible as Never;
+use std::{borrow::Cow, collections::VecDeque, convert::Infallible as Never, pin::Pin};
 
 type WORD = String;
 type Rule2 = String;
@@ -88,6 +88,8 @@ struct TokenizerState<'input> {
     current_pos: usize,
     input: &'input str,
     remaining: &'input str,
+    invalid_quote: bool,
+    _marker: std::marker::PhantomPinned,
 }
 
 impl<'input> TokenizerState<'input> {
@@ -95,17 +97,34 @@
         Self {
             current_pos: 0,
             remaining: input,
+            invalid_quote: false,
             input,
+            _marker: std::marker::PhantomPinned,
         }
     }
 }
 
+// Cow<'input, str> is either a owned string (so it'll have to be free'd, basically it is an `String`
+// or it is an borrow string, so an &'input str, which doesn't need to be free (or at least by us)
+
 #[derive(Debug, Clone)]
 enum Token<'input> {
-    SingleQuote { val: &'input str, start_pos: usize },
-    DoubleQuote { val: &'input str, start_pos: usize },
-    WhiteSpace { val: &'input str, start_pos: usize },
-    Word { val: &'input str, start_pos: usize },
+    SingleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    DoubleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    WhiteSpace {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    Word {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
 }
 
 fn tokenizer<'state, 'input: 'state>(
@@ -131,12 +150,13 @@ fn tokenizer<'state, 'input: 'state>(
                 chars.next();
             }
             let skip = chars.peek() == Some(&'\'');
+            state.invalid_quote |= !skip;
             let old_current = state.current_pos;
             state.current_pos += len;
             let old_remaining = state.remaining;
             state.remaining = &state.remaining[(len + skip as usize)..];
             return Some(Token::SingleQuote {
-                val: &old_remaining[1..len],
+                val: old_remaining[1..len].into(),
                 start_pos: old_current,
             });
         }
@@ -149,12 +169,13 @@
                 escaped = chars.next() == Some('\\');
             }
             let skip = chars.peek() == Some(&'\"');
+            state.invalid_quote |= !skip;
             let old_current = state.current_pos;
             state.current_pos += len;
             let old_remaining = state.remaining;
             state.remaining = &state.remaining[(len + skip as usize)..];
             return Some(Token::DoubleQuote {
-                val: &old_remaining[1..len],
+                val: old_remaining[1..len].into(),
                 start_pos: old_current,
             });
         }
@@ -163,13 +184,12 @@
         let was_whitespace = chr.is_ascii_whitespace();
         while let Some(&chr) = chars.peek() {
             if chr.is_ascii_whitespace() && !escaped && !was_whitespace {
-                dbg!(state.current_pos);
                 let old_current = state.current_pos;
                 state.current_pos += len;
                 let old_remaining = state.remaining;
                 state.remaining = &state.remaining[len..];
                 return Some(Token::Word {
-                    val: &old_remaining[..len],
+                    val: old_remaining[..len].into(),
                     start_pos: old_current,
                 });
             } else if !chr.is_ascii_whitespace() && was_whitespace {
@@ -178,7 +198,7 @@
                 let old_remaining = state.remaining;
                 state.remaining = &state.remaining[len..];
                 return Some(Token::WhiteSpace {
-                    val: &old_remaining[..len],
+                    val: old_remaining[..len].into(),
                     start_pos: old_current,
                 });
             }
@@ -191,26 +211,118 @@
         state.remaining = &state.remaining[len..];
         Some(if was_whitespace {
             Token::WhiteSpace {
-                val: &old_remaining[..len],
+                val: old_remaining[..len].into(),
                 start_pos: old_current,
             }
         } else {
             Token::Word {
-                val: &old_remaining[..len],
+                val: old_remaining[..len].into(),
                 start_pos: old_current,
             }
         })
     })
 }
 
+struct TokenizerWrapper<'input> {
+    _marker: std::marker::PhantomPinned,
+    first_pass: TokenizerState<'input>,
+    iter: Box<dyn Iterator<Item = Token<'input>> + 'input>,
+}
+
+impl<'input> TokenizerWrapper<'input> {
+    fn new(s: TokenizerState<'input>) -> Pin<Box<Self>> {
+        let mut value = Box::new(std::mem::MaybeUninit::<Self>::uninit());
+        unsafe {
+            //
+            let ptr: *mut Self = value.as_mut_ptr();
+            std::ptr::write(std::ptr::addr_of_mut!((*ptr).first_pass), s);
+            std::ptr::write(
+                std::ptr::addr_of_mut!((*ptr).iter),
+                Box::new(tokenizer(&mut *std::ptr::addr_of_mut!((*ptr).first_pass))),
+            );
+            std::mem::transmute(Pin::new_unchecked(value))
+        }
+    }
+}
+
+impl<'input> TokenizerWrapper<'input> {
+    fn next(self: &mut Pin<Box<Self>>) -> Option<Token<'input>> {
+        unsafe { Pin::into_inner_unchecked(self.as_mut()).iter.next() }
+    }
+}
+
+struct ExpenderState<'input> {
+    // These will be inserted when a substitution is made, like $HOME
+    // if it is "$HOME", then no splitting should be done, so if there is any stuff that needs to
+    // be pushed, then push it, otherwise get the next token from `iter`, expend if needed and
+    // voila
+    need_push: VecDeque<ExpendedToken<'input>>,
+    // This is because of the way I wrote the rust
+    // stuff, returning iterator instead of an token everytime I call a function and stuff, it
+    // shouldn't be reflected into the C code, as we will just call 'get_next_token(&state)' and it
+    // will give us the next token (or EOF if no more token are present)
+    tokenizer: Pin<Box<TokenizerWrapper<'input>>>,
+}
+
+#[derive(Debug, Clone)]
+enum ExpendedToken<'input> {
+    SingleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    DoubleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    WhiteSpace {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    Word {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+}
+
+impl<'input> ExpenderState<'input> {
+    fn new(input: &'input str) -> Self {
+        let wrapper = TokenizerWrapper::new(TokenizerState::new(input));
+
+        Self {
+            need_push: VecDeque::new(),
+            tokenizer: wrapper,
+        }
+    }
+}
+
+fn expend<'state, 'input: 'state>(
+    input: &'state mut ExpenderState<'input>,
+) -> impl Iterator<Item = ExpendedToken<'input>> + 'state {
+    std::iter::from_fn(|| {
+        if !input.need_push.is_empty() {
+            input.need_push.pop_front()
+        } else {
+            input.tokenizer.next().map(|t| match t {
+                Token::Word { val, start_pos } => ExpendedToken::Word { val, start_pos },
+                Token::DoubleQuote { val, start_pos } => {
+                    ExpendedToken::DoubleQuote { val, start_pos }
+                }
+                Token::SingleQuote { val, start_pos } => {
+                    ExpendedToken::SingleQuote { val, start_pos }
+                }
+                Token::WhiteSpace { val, start_pos } => {
+                    ExpendedToken::WhiteSpace { val, start_pos }
+                }
+            })
+        }
+    })
+}
+
 fn main() {
     for line in std::io::stdin().lines() {
         let line = line.unwrap();
-        let mut state = TokenizerState::new(&line);
+        let mut state = ExpenderState::new(&line);
         println!("line is = '{line}'");
-        println!(
-            "token are = {:?}",
-            tokenizer(&mut state).collect::<Vec<_>>()
-        );
+        println!("token are = {:?}", expend(&mut state).collect::<Vec<_>>());
     }
 }