update tokenizer to allow expansion

Maieul BOYER 2024-03-26 16:42:25 +01:00
parent d4a29b5bf2
commit d1f93444e0


@@ -1,5 +1,5 @@
 #![allow(dead_code)]
-use std::convert::Infallible as Never;
+use std::{borrow::Cow, collections::VecDeque, convert::Infallible as Never, pin::Pin};
 
 type WORD = String;
 type Rule2 = String;
@@ -88,6 +88,8 @@ struct TokenizerState<'input> {
     current_pos: usize,
     input: &'input str,
     remaining: &'input str,
+    invalid_quote: bool,
+    _marker: std::marker::PhantomPinned,
 }
 
 impl<'input> TokenizerState<'input> {
@@ -95,17 +97,34 @@ impl<'input> TokenizerState<'input> {
         Self {
             current_pos: 0,
             remaining: input,
+            invalid_quote: false,
             input,
+            _marker: std::marker::PhantomPinned,
         }
     }
 }
 
+// Cow<'input, str> is either an owned string (a `String`, so it will have to be freed by us)
+// or a borrowed &'input str, which we don't need to free (at least not ourselves).
 #[derive(Debug, Clone)]
 enum Token<'input> {
-    SingleQuote { val: &'input str, start_pos: usize },
-    DoubleQuote { val: &'input str, start_pos: usize },
-    WhiteSpace { val: &'input str, start_pos: usize },
-    Word { val: &'input str, start_pos: usize },
+    SingleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    DoubleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    WhiteSpace {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    Word {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
 }
 
 fn tokenizer<'state, 'input: 'state>(
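Note on the switch to Cow: ordinary tokens can keep borrowing slices of the input line, while tokens produced by a future substitution (which have no backing slice in the input) can carry an owned String behind the same type. A minimal standalone sketch of that distinction, separate from the commit:

use std::borrow::Cow;

fn main() {
    let input = "echo $HOME";
    // Borrowed: points into `input`, nothing for us to free.
    let word: Cow<'_, str> = Cow::from(&input[..4]);
    // Owned: a substitution result has no backing slice in `input`.
    let expanded: Cow<'_, str> = Cow::from(String::from("/home/user"));
    assert!(matches!(word, Cow::Borrowed(_)));
    assert!(matches!(expanded, Cow::Owned(_)));
    println!("{word} -> {expanded}");
}

Both values print and compare like strings; only their ownership differs, which is exactly what the new Token variants need.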
@@ -131,12 +150,13 @@ fn tokenizer<'state, 'input: 'state>(
                 chars.next();
             }
             let skip = chars.peek() == Some(&'\'');
+            state.invalid_quote |= !skip;
             let old_current = state.current_pos;
             state.current_pos += len;
             let old_remaining = state.remaining;
             state.remaining = &state.remaining[(len + skip as usize)..];
             return Some(Token::SingleQuote {
-                val: &old_remaining[1..len],
+                val: old_remaining[1..len].into(),
                 start_pos: old_current,
             });
         }
@@ -149,12 +169,13 @@ fn tokenizer<'state, 'input: 'state>(
                 escaped = chars.next() == Some('\\');
             }
             let skip = chars.peek() == Some(&'\"');
+            state.invalid_quote |= !skip;
             let old_current = state.current_pos;
             state.current_pos += len;
             let old_remaining = state.remaining;
             state.remaining = &state.remaining[(len + skip as usize)..];
             return Some(Token::DoubleQuote {
-                val: &old_remaining[1..len],
+                val: old_remaining[1..len].into(),
                 start_pos: old_current,
             });
         }
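Note on the new invalid_quote flag: `skip` is true only when the closing quote was actually found, so `state.invalid_quote |= !skip` latches to true the first time a quote is left unterminated and stays set for the rest of the line. A standalone sketch of that latching pattern (the inputs are made up):

fn main() {
    let mut invalid_quote = false;
    for (fragment, found_closing) in [("'ok'", true), ("'oops", false), ("'ok'", true)] {
        let skip = found_closing; // stands in for chars.peek() == Some(&'\'')
        invalid_quote |= !skip;
        println!("{fragment}: terminated={found_closing}, invalid_quote={invalid_quote}");
    }
    // Once set, the flag never resets within the line.
    assert!(invalid_quote);
}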
@@ -163,13 +184,12 @@ fn tokenizer<'state, 'input: 'state>(
         let was_whitespace = chr.is_ascii_whitespace();
         while let Some(&chr) = chars.peek() {
             if chr.is_ascii_whitespace() && !escaped && !was_whitespace {
-                dbg!(state.current_pos);
                 let old_current = state.current_pos;
                 state.current_pos += len;
                 let old_remaining = state.remaining;
                 state.remaining = &state.remaining[len..];
                 return Some(Token::Word {
-                    val: &old_remaining[..len],
+                    val: old_remaining[..len].into(),
                     start_pos: old_current,
                 });
             } else if !chr.is_ascii_whitespace() && was_whitespace {
@@ -178,7 +198,7 @@ fn tokenizer<'state, 'input: 'state>(
                 let old_remaining = state.remaining;
                 state.remaining = &state.remaining[len..];
                 return Some(Token::WhiteSpace {
-                    val: &old_remaining[..len],
+                    val: old_remaining[..len].into(),
                     start_pos: old_current,
                 });
             }
@@ -191,26 +211,118 @@ fn tokenizer<'state, 'input: 'state>(
         state.remaining = &state.remaining[len..];
         Some(if was_whitespace {
             Token::WhiteSpace {
-                val: &old_remaining[..len],
+                val: old_remaining[..len].into(),
                 start_pos: old_current,
             }
         } else {
             Token::Word {
-                val: &old_remaining[..len],
+                val: old_remaining[..len].into(),
                 start_pos: old_current,
             }
         })
     })
 }
 
+struct TokenizerWrapper<'input> {
+    _marker: std::marker::PhantomPinned,
+    first_pass: TokenizerState<'input>,
+    iter: Box<dyn Iterator<Item = Token<'input>> + 'input>,
+}
+
+impl<'input> TokenizerWrapper<'input> {
+    fn new(s: TokenizerState<'input>) -> Pin<Box<Self>> {
+        let mut value = Box::new(std::mem::MaybeUninit::<Self>::uninit());
+        unsafe {
+            //
+            let ptr: *mut Self = value.as_mut_ptr();
+            std::ptr::write(std::ptr::addr_of_mut!((*ptr).first_pass), s);
+            std::ptr::write(
+                std::ptr::addr_of_mut!((*ptr).iter),
+                Box::new(tokenizer(&mut *std::ptr::addr_of_mut!((*ptr).first_pass))),
+            );
+            std::mem::transmute(Pin::new_unchecked(value))
+        }
+    }
+}
+
+impl<'input> TokenizerWrapper<'input> {
+    fn next(self: &mut Pin<Box<Self>>) -> Option<Token<'input>> {
+        unsafe { Pin::into_inner_unchecked(self.as_mut()).iter.next() }
+    }
+}
+
+struct ExpenderState<'input> {
+    // These will be inserted when a substitution is made, like $HOME.
+    // If it is "$HOME" (quoted), no splitting should be done. So: if there is anything
+    // waiting to be pushed, push it; otherwise get the next token from `iter`, expand it
+    // if needed, and voilà.
+    need_push: VecDeque<ExpendedToken<'input>>,
+    // This field only exists because of how the Rust version is written, returning an
+    // iterator instead of one token per function call. It shouldn't be reflected in the
+    // C code, where we will just call 'get_next_token(&state)' and it will give us the
+    // next token (or EOF if no more tokens are present).
+    tokenizer: Pin<Box<TokenizerWrapper<'input>>>,
+}
+
+#[derive(Debug, Clone)]
+enum ExpendedToken<'input> {
+    SingleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    DoubleQuote {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    WhiteSpace {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+    Word {
+        val: Cow<'input, str>,
+        start_pos: usize,
+    },
+}
+
+impl<'input> ExpenderState<'input> {
+    fn new(input: &'input str) -> Self {
+        let wrapper = TokenizerWrapper::new(TokenizerState::new(input));
+        Self {
+            need_push: VecDeque::new(),
+            tokenizer: wrapper,
+        }
+    }
+}
+
+fn expend<'state, 'input: 'state>(
+    input: &'state mut ExpenderState<'input>,
+) -> impl Iterator<Item = ExpendedToken<'input>> + 'state {
+    std::iter::from_fn(|| {
+        if !input.need_push.is_empty() {
+            input.need_push.pop_front()
+        } else {
+            input.tokenizer.next().map(|t| match t {
+                Token::Word { val, start_pos } => ExpendedToken::Word { val, start_pos },
+                Token::DoubleQuote { val, start_pos } => {
+                    ExpendedToken::DoubleQuote { val, start_pos }
+                }
+                Token::SingleQuote { val, start_pos } => {
+                    ExpendedToken::SingleQuote { val, start_pos }
+                }
+                Token::WhiteSpace { val, start_pos } => {
+                    ExpendedToken::WhiteSpace { val, start_pos }
+                }
+            })
+        }
+    })
+}
+
 fn main() {
     for line in std::io::stdin().lines() {
         let line = line.unwrap();
-        let mut state = TokenizerState::new(&line);
+        let mut state = ExpenderState::new(&line);
         println!("line is = '{line}'");
-        println!(
-            "token are = {:?}",
-            tokenizer(&mut state).collect::<Vec<_>>()
-        );
+        println!("token are = {:?}", expend(&mut state).collect::<Vec<_>>());
     }
 }
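Why the Pin/PhantomPinned machinery above: TokenizerWrapper stores both the TokenizerState and an iterator that borrows it inside the same allocation, so the wrapper must never move after construction. The commit builds this with MaybeUninit and a transmute; the sketch below shows the same pinning guarantee using the simpler pattern from the std::pin documentation (the types here are illustrative, not the commit's):

use std::marker::PhantomPinned;
use std::pin::Pin;

struct SelfRef {
    data: String,
    // Raw pointer into `data`; it would dangle if the struct were moved.
    ptr: *const String,
    _marker: PhantomPinned,
}

fn new_self_ref(data: String) -> Pin<Box<SelfRef>> {
    let mut boxed = Box::pin(SelfRef {
        data,
        ptr: std::ptr::null(),
        _marker: PhantomPinned,
    });
    let ptr = &boxed.data as *const String;
    // Safe here: we only fill in the pointer, we never move the pinned value.
    unsafe { Pin::get_unchecked_mut(Pin::as_mut(&mut boxed)).ptr = ptr };
    boxed
}

fn main() {
    let s = new_self_ref(String::from("hello"));
    // Valid because Pin<Box<_>> plus PhantomPinned forbid moving the value out.
    unsafe { println!("{}", *s.ptr) };
}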
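And the need_push queue: once substitution exists, one token may expand to several (an unquoted $HOME whose value is then word-split), and the extras must be handed out before the tokenizer is pulled again. A sketch of that drain-queue-first shape over plain strings, with a hypothetical expand_word standing in for substitution logic this commit has not written yet:

use std::collections::VecDeque;

// Hypothetical stand-in: one raw word may expand to several words.
fn expand_word(word: &str) -> Vec<String> {
    if word == "$WORDS" {
        vec!["several".into(), "words".into()]
    } else {
        vec![word.to_string()]
    }
}

fn main() {
    let mut source = ["echo", "$WORDS", "done"].into_iter();
    let mut need_push: VecDeque<String> = VecDeque::new();

    let tokens = std::iter::from_fn(|| {
        // Drain queued expansion results before pulling a new token.
        if let Some(queued) = need_push.pop_front() {
            return Some(queued);
        }
        source.next().map(|raw| {
            let mut expanded = expand_word(raw);
            let first = expanded.remove(0);
            need_push.extend(expanded); // extras wait for the next calls
            first
        })
    });

    println!("{:?}", tokens.collect::<Vec<_>>());
    // => ["echo", "several", "words", "done"]
}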