Update tokenizer to allow expansion
This commit is contained in:
parent
d4a29b5bf2
commit
d1f93444e0
1 changed files with 129 additions and 17 deletions
|
|
@ -1,5 +1,5 @@
|
||||||
#![allow(dead_code)]
|
#![allow(dead_code)]
|
||||||
use std::convert::Infallible as Never;
|
use std::{borrow::Cow, collections::VecDeque, convert::Infallible as Never, pin::Pin};
|
||||||
|
|
||||||
type WORD = String;
|
type WORD = String;
|
||||||
type Rule2 = String;
|
type Rule2 = String;
|
||||||
|
|
@ -88,6 +88,8 @@ struct TokenizerState<'input> {
|
||||||
current_pos: usize,
|
current_pos: usize,
|
||||||
input: &'input str,
|
input: &'input str,
|
||||||
remaining: &'input str,
|
remaining: &'input str,
|
||||||
|
invalid_quote: bool,
|
||||||
|
_marker: std::marker::PhantomPinned,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'input> TokenizerState<'input> {
|
impl<'input> TokenizerState<'input> {
|
||||||
|
|
@ -95,17 +97,34 @@ impl<'input> TokenizerState<'input> {
|
||||||
Self {
|
Self {
|
||||||
current_pos: 0,
|
current_pos: 0,
|
||||||
remaining: input,
|
remaining: input,
|
||||||
|
invalid_quote: false,
|
||||||
input,
|
input,
|
||||||
|
_marker: std::marker::PhantomPinned,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cow<'input, str> is either an owned string (basically a `String`, so it will have to be freed)
|
||||||
|
// or a borrowed string, i.e. an &'input str, which does not need to be freed (at least not by us)
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
enum Token<'input> {
|
enum Token<'input> {
|
||||||
SingleQuote { val: &'input str, start_pos: usize },
|
SingleQuote {
|
||||||
DoubleQuote { val: &'input str, start_pos: usize },
|
val: Cow<'input, str>,
|
||||||
WhiteSpace { val: &'input str, start_pos: usize },
|
start_pos: usize,
|
||||||
Word { val: &'input str, start_pos: usize },
|
},
|
||||||
|
DoubleQuote {
|
||||||
|
val: Cow<'input, str>,
|
||||||
|
start_pos: usize,
|
||||||
|
},
|
||||||
|
WhiteSpace {
|
||||||
|
val: Cow<'input, str>,
|
||||||
|
start_pos: usize,
|
||||||
|
},
|
||||||
|
Word {
|
||||||
|
val: Cow<'input, str>,
|
||||||
|
start_pos: usize,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tokenizer<'state, 'input: 'state>(
|
fn tokenizer<'state, 'input: 'state>(
|
||||||
|
|
@ -131,12 +150,13 @@ fn tokenizer<'state, 'input: 'state>(
|
||||||
chars.next();
|
chars.next();
|
||||||
}
|
}
|
||||||
let skip = chars.peek() == Some(&'\'');
|
let skip = chars.peek() == Some(&'\'');
|
||||||
|
state.invalid_quote |= !skip;
|
||||||
let old_current = state.current_pos;
|
let old_current = state.current_pos;
|
||||||
state.current_pos += len;
|
state.current_pos += len;
|
||||||
let old_remaining = state.remaining;
|
let old_remaining = state.remaining;
|
||||||
state.remaining = &state.remaining[(len + skip as usize)..];
|
state.remaining = &state.remaining[(len + skip as usize)..];
|
||||||
return Some(Token::SingleQuote {
|
return Some(Token::SingleQuote {
|
||||||
val: &old_remaining[1..len],
|
val: old_remaining[1..len].into(),
|
||||||
start_pos: old_current,
|
start_pos: old_current,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
@ -149,12 +169,13 @@ fn tokenizer<'state, 'input: 'state>(
|
||||||
escaped = chars.next() == Some('\\');
|
escaped = chars.next() == Some('\\');
|
||||||
}
|
}
|
||||||
let skip = chars.peek() == Some(&'\"');
|
let skip = chars.peek() == Some(&'\"');
|
||||||
|
state.invalid_quote |= !skip;
|
||||||
let old_current = state.current_pos;
|
let old_current = state.current_pos;
|
||||||
state.current_pos += len;
|
state.current_pos += len;
|
||||||
let old_remaining = state.remaining;
|
let old_remaining = state.remaining;
|
||||||
state.remaining = &state.remaining[(len + skip as usize)..];
|
state.remaining = &state.remaining[(len + skip as usize)..];
|
||||||
return Some(Token::DoubleQuote {
|
return Some(Token::DoubleQuote {
|
||||||
val: &old_remaining[1..len],
|
val: old_remaining[1..len].into(),
|
||||||
start_pos: old_current,
|
start_pos: old_current,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
@ -163,13 +184,12 @@ fn tokenizer<'state, 'input: 'state>(
|
||||||
let was_whitespace = chr.is_ascii_whitespace();
|
let was_whitespace = chr.is_ascii_whitespace();
|
||||||
while let Some(&chr) = chars.peek() {
|
while let Some(&chr) = chars.peek() {
|
||||||
if chr.is_ascii_whitespace() && !escaped && !was_whitespace {
|
if chr.is_ascii_whitespace() && !escaped && !was_whitespace {
|
||||||
dbg!(state.current_pos);
|
|
||||||
let old_current = state.current_pos;
|
let old_current = state.current_pos;
|
||||||
state.current_pos += len;
|
state.current_pos += len;
|
||||||
let old_remaining = state.remaining;
|
let old_remaining = state.remaining;
|
||||||
state.remaining = &state.remaining[len..];
|
state.remaining = &state.remaining[len..];
|
||||||
return Some(Token::Word {
|
return Some(Token::Word {
|
||||||
val: &old_remaining[..len],
|
val: old_remaining[..len].into(),
|
||||||
start_pos: old_current,
|
start_pos: old_current,
|
||||||
});
|
});
|
||||||
} else if !chr.is_ascii_whitespace() && was_whitespace {
|
} else if !chr.is_ascii_whitespace() && was_whitespace {
|
||||||
|
|
@ -178,7 +198,7 @@ fn tokenizer<'state, 'input: 'state>(
|
||||||
let old_remaining = state.remaining;
|
let old_remaining = state.remaining;
|
||||||
state.remaining = &state.remaining[len..];
|
state.remaining = &state.remaining[len..];
|
||||||
return Some(Token::WhiteSpace {
|
return Some(Token::WhiteSpace {
|
||||||
val: &old_remaining[..len],
|
val: old_remaining[..len].into(),
|
||||||
start_pos: old_current,
|
start_pos: old_current,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
@ -191,26 +211,118 @@ fn tokenizer<'state, 'input: 'state>(
|
||||||
state.remaining = &state.remaining[len..];
|
state.remaining = &state.remaining[len..];
|
||||||
Some(if was_whitespace {
|
Some(if was_whitespace {
|
||||||
Token::WhiteSpace {
|
Token::WhiteSpace {
|
||||||
val: &old_remaining[..len],
|
val: old_remaining[..len].into(),
|
||||||
start_pos: old_current,
|
start_pos: old_current,
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
Token::Word {
|
Token::Word {
|
||||||
val: &old_remaining[..len],
|
val: old_remaining[..len].into(),
|
||||||
start_pos: old_current,
|
start_pos: old_current,
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Bundles a `TokenizerState` together with the boxed token iterator that is
// built from a mutable reference to that very state (see `TokenizerWrapper::new`).
// The value therefore refers to its own `first_pass` field.
struct TokenizerWrapper<'input> {
|
||||||
|
// Marker field; `PhantomPinned` makes the type `!Unpin`.
_marker: std::marker::PhantomPinned,
|
||||||
|
// The tokenizer state that `iter` is created from.
first_pass: TokenizerState<'input>,
|
||||||
|
// Type-erased iterator returned by `tokenizer(..)`, yielding `Token<'input>`.
// NOTE(review): declared after `first_pass`, so it is dropped after the state it
// was built from — confirm this is acceptable for the iterator's captured state.
iter: Box<dyn Iterator<Item = Token<'input>> + 'input>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'input> TokenizerWrapper<'input> {
|
||||||
|
// Builds a pinned, heap-allocated wrapper around `s`.
// The fields are written in place into uninitialised boxed storage:
// `first_pass` first, then `iter`, which is created by calling `tokenizer`
// on a mutable reference to the already-written `first_pass`.
fn new(s: TokenizerState<'input>) -> Pin<Box<Self>> {
|
||||||
|
let mut value = Box::new(std::mem::MaybeUninit::<Self>::uninit());
|
||||||
|
unsafe {
|
||||||
|
// SAFETY (as far as this block shows): `ptr` points into the box allocated
// above, and each field is written through `addr_of_mut!` before it is read.
// NOTE(review): `_marker` is never written; presumably fine because
// `PhantomPinned` is zero-sized — confirm.
// NOTE(review): the `&mut` to `first_pass` handed to `tokenizer` is produced from
// a raw pointer, so its lifetime is not checked by the compiler — confirm that
// nothing else touches `first_pass` while `iter` is alive.
|
||||||
|
let ptr: *mut Self = value.as_mut_ptr();
|
||||||
|
std::ptr::write(std::ptr::addr_of_mut!((*ptr).first_pass), s);
|
||||||
|
std::ptr::write(
|
||||||
|
std::ptr::addr_of_mut!((*ptr).iter),
|
||||||
|
Box::new(tokenizer(&mut *std::ptr::addr_of_mut!((*ptr).first_pass))),
|
||||||
|
);
|
||||||
|
// Reinterprets `Pin<Box<MaybeUninit<Self>>>` as the returned `Pin<Box<Self>>`.
std::mem::transmute(Pin::new_unchecked(value))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'input> TokenizerWrapper<'input> {
|
||||||
|
// Returns the next token from the boxed iterator, or `None` once it is exhausted.
// `as_mut()` reborrows the pinned box as `Pin<&mut Self>`, and
// `into_inner_unchecked` turns that into a plain `&mut Self` so that the
// `iter` field can be advanced.
// NOTE(review): only `iter.next()` is called through this reference here; nothing
// is moved out of the wrapper in this method.
fn next(self: &mut Pin<Box<Self>>) -> Option<Token<'input>> {
|
||||||
|
unsafe { Pin::into_inner_unchecked(self.as_mut()).iter.next() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// State consumed by `expend`: a queue of already-produced tokens plus the
// pinned tokenizer that the remaining tokens are pulled from.
struct ExpenderState<'input> {
|
||||||
|
// Tokens queued here are the result of a substitution, such as `$HOME`.
|
||||||
|
// If the variable is quoted ("$HOME"), no splitting should be done. If any tokens are
|
||||||
|
// waiting in this queue they are emitted first; otherwise the next token is taken from
|
||||||
|
// the tokenizer, expanded if needed, and returned.
|
||||||
|
need_push: VecDeque<ExpendedToken<'input>>,
|
||||||
|
// This wrapper exists only because of how the Rust code is written:
|
||||||
|
// the tokenizer returns an iterator instead of one token per function call. This
|
||||||
|
// should not be mirrored in the C code, where we will simply call `get_next_token(&state)`
|
||||||
|
// to obtain the next token (or EOF if no more tokens are present).
|
||||||
|
tokenizer: Pin<Box<TokenizerWrapper<'input>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Token type yielded by `expend`. It has the same four variants and the same
// fields (`val`, `start_pos`) as `Token`; `val` is a `Cow` so it can be either
// borrowed from the input or owned.
#[derive(Debug, Clone)]
|
||||||
|
enum ExpendedToken<'input> {
|
||||||
|
// Counterpart of `Token::SingleQuote`.
SingleQuote {
|
||||||
|
val: Cow<'input, str>,
|
||||||
|
start_pos: usize,
|
||||||
|
},
|
||||||
|
// Counterpart of `Token::DoubleQuote`.
DoubleQuote {
|
||||||
|
val: Cow<'input, str>,
|
||||||
|
start_pos: usize,
|
||||||
|
},
|
||||||
|
// Counterpart of `Token::WhiteSpace`.
WhiteSpace {
|
||||||
|
val: Cow<'input, str>,
|
||||||
|
start_pos: usize,
|
||||||
|
},
|
||||||
|
// Counterpart of `Token::Word`.
Word {
|
||||||
|
val: Cow<'input, str>,
|
||||||
|
start_pos: usize,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'input> ExpenderState<'input> {
|
||||||
|
// Creates the state for `input`: a fresh `TokenizerState` wrapped in a pinned
// `TokenizerWrapper`, and an empty queue of pending tokens.
fn new(input: &'input str) -> Self {
|
||||||
|
let wrapper = TokenizerWrapper::new(TokenizerState::new(input));
|
||||||
|
|
||||||
|
Self {
|
||||||
|
need_push: VecDeque::new(),
|
||||||
|
tokenizer: wrapper,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns an iterator of `ExpendedToken`s driven by `input`.
// On each step, a token waiting in `need_push` is returned first; otherwise the
// next `Token` is pulled from the tokenizer and converted to the `ExpendedToken`
// variant of the same name, with `val` and `start_pos` passed through unchanged.
// The iterator ends when the queue is empty and the tokenizer yields `None`.
// As written here, nothing is ever pushed to `need_push` and no substitution is
// performed yet.
fn expend<'state, 'input: 'state>(
|
||||||
|
input: &'state mut ExpenderState<'input>,
|
||||||
|
) -> impl Iterator<Item = ExpendedToken<'input>> + 'state {
|
||||||
|
std::iter::from_fn(|| {
|
||||||
|
// Pending tokens take priority over the tokenizer.
if !input.need_push.is_empty() {
|
||||||
|
input.need_push.pop_front()
|
||||||
|
} else {
|
||||||
|
// One-to-one mapping from `Token` to `ExpendedToken`.
input.tokenizer.next().map(|t| match t {
|
||||||
|
Token::Word { val, start_pos } => ExpendedToken::Word { val, start_pos },
|
||||||
|
Token::DoubleQuote { val, start_pos } => {
|
||||||
|
ExpendedToken::DoubleQuote { val, start_pos }
|
||||||
|
}
|
||||||
|
Token::SingleQuote { val, start_pos } => {
|
||||||
|
ExpendedToken::SingleQuote { val, start_pos }
|
||||||
|
}
|
||||||
|
Token::WhiteSpace { val, start_pos } => {
|
||||||
|
ExpendedToken::WhiteSpace { val, start_pos }
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
for line in std::io::stdin().lines() {
|
for line in std::io::stdin().lines() {
|
||||||
let line = line.unwrap();
|
let line = line.unwrap();
|
||||||
let mut state = TokenizerState::new(&line);
|
let mut state = ExpenderState::new(&line);
|
||||||
println!("line is = '{line}'");
|
println!("line is = '{line}'");
|
||||||
println!(
|
println!("token are = {:?}", expend(&mut state).collect::<Vec<_>>());
|
||||||
"token are = {:?}",
|
|
||||||
tokenizer(&mut state).collect::<Vec<_>>()
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue