#![allow(dead_code)]

use std::{borrow::Cow, collections::VecDeque, convert::Infallible as Never, pin::Pin};

// ---------------------------------------------------------------------------
// Syntax-tree declarations.
//
// Nothing in this file builds these types yet; they only describe the shape
// the parser is expected to produce. The names appear to follow the
// productions of the POSIX shell grammar (complete_commands, pipeline,
// io_redirect, ...).
// ---------------------------------------------------------------------------

/// A plain word. The upper-case name mirrors the grammar token it stands for.
#[allow(non_camel_case_types)]
type WORD = String;
/// A word used as a redirection target (see the `filename` fields of [`IoFile`]).
type Rule2 = String;
/// A word used as a here-document delimiter (see the `here_end` fields of [`IoHere`]).
type Rule3 = String;

/// Root of the syntax tree: the list of complete commands of one source.
struct SourceFile {
    cmds: Vec<CompleteCommands>,
}

/// A list of pipelines with their separators.
struct CompleteCommands {
    // Every pipeline must be followed by a separator, except the last one,
    // for which the separator is optional.
    list: Vec<(Pipeline, Option<SeparatorOp>)>,
}

/// Operator written between two pipelines.
enum SeparatorOp {
    Semi, /* ; */
    Fork, /* & */
}

/// A sequence of commands, each one piped into the next.
struct Pipeline {
    /// `!` marker, written before the sequence.
    bang: bool,
    /// `||` / `&&` operator, written after the sequence.
    kind: Option<PipelineKind>,
    /// The commands; each one is piped into the next.
    seq: Vec<Commands>,
}

/// Operator linking a pipeline to the one that follows it.
enum PipelineKind {
    Or,  /* || */
    And, /* && */
}

/// One command of a pipeline.
///
/// `Compound` and `FuncDef` carry the uninhabited `Never` type, so they cannot
/// be constructed for now.
enum Commands {
    Simple(SimpleCommand),
    Compound(Never),
    FuncDef(Never),
}

/// A simple command: prefixes, command words, suffixes.
struct SimpleCommand {
    prefix: Vec<CmdPrefix>,
    /// The first word falls under rule 7a, the following ones under rule 7b.
    cmd: Vec<WORD>,
    suffix: Vec<CmdSuffix>,
}

/// Element allowed before the command word.
enum CmdPrefix {
    IoRedirect(IoRedirect),
    Assigment(Assigment),
}

/// Element allowed after the command word.
enum CmdSuffix {
    IoRedirect(IoRedirect),
    Word(WORD),
}

/// A redirection.
struct IoRedirect {
    // NOTE(review): the element type of this `Option` was lost when the file
    // was collapsed; `usize` (an optional descriptor number in front of the
    // operator) is an assumption - confirm against the original source.
    io_file: Option<usize>,
    io_kind: IoKind,
}

/// The two families of redirection.
enum IoKind {
    IoFile(IoFile),
    IoHere(IoHere),
}

/// File redirections, one variant per operator.
enum IoFile {
    Less { filename: Rule2 },          /* < */
    Greater { filename: Rule2 },       /* > */
    LessAnd { filename: Rule2 },       /* <& */
    GreaterAnd { filename: Rule2 },    /* >& */
    DoubleGreater { filename: Rule2 }, /* >> */
    LessGreater { filename: Rule2 },   /* <> */
    Clobber { filename: Rule2 },       /* >| */
}

/// Here-document redirections.
enum IoHere {
    Dless { here_end: Rule3 },     /* << */
    DlessDash { here_end: Rule3 }, /* <<- */
}

/// Rule 7: a `key=value` assignment.
struct Assigment {
    key: WORD,
    value: WORD,
}

// ---------------------------------------------------------------------------
// First pass: splitting the input into quote / whitespace / word tokens.
// ---------------------------------------------------------------------------

/// Mutable cursor over the text being tokenized.
#[derive(Default)]
struct TokenizerState<'input> {
    /// Byte offset of `remaining` from the text the current iterator started on.
    current_pos: usize,
    /// The complete text handed to [`TokenizerState::new`].
    input: &'input str,
    /// The part of the text that has not been turned into tokens yet.
    remaining: &'input str,
    /// Set once a quote is opened and the text ends before it is closed.
    invalid_quote: bool,
    /// Makes the type `!Unpin`; [`TokenizerWrapper`] keeps a reference into it.
    _marker: std::marker::PhantomPinned,
}

impl<'input> TokenizerState<'input> {
    /// Creates a state positioned at the start of `input`.
    fn new(input: &'input str) -> Self {
        Self {
            current_pos: 0,
            input,
            remaining: input,
            invalid_quote: false,
            _marker: std::marker::PhantomPinned,
        }
    }
}

/// A raw token.
///
/// `val` is a `Cow<'input, str>`: either a borrowed slice of the input (nothing
/// to free) or an owned `String` (which must be freed). The tokenizer itself
/// only ever produces borrowed values. `start_pos` is the byte offset of the
/// token's first character, opening quote included.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Token<'input> {
    /// Text between single quotes, quotes excluded.
    SingleQuote {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// Text between double quotes, quotes excluded, escapes left untouched.
    DoubleQuote {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// A run of ASCII whitespace.
    WhiteSpace {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// A run of characters outside of quotes.
    Word {
        val: Cow<'input, str>,
        start_pos: usize,
    },
}

/// Returns the byte index of the quote closing the quotation opened by the
/// first character of `rest`, or `None` when the text ends first.
///
/// A backslash escapes the next character inside double quotes only; inside
/// single quotes every character is literal.
fn closing_quote_index(rest: &str, quote: char) -> Option<usize> {
    let honours_escapes = quote == '"';
    let mut escaped = false;
    // `skip(1)` steps over the opening quote itself.
    for (idx, c) in rest.char_indices().skip(1) {
        if c == quote && !escaped {
            return Some(idx);
        }
        // A backslash that is itself escaped does not escape what follows.
        escaped = honours_escapes && !escaped && c == '\\';
    }
    None
}

/// Returns the byte length of the ASCII-whitespace run at the start of `rest`.
fn whitespace_run_len(rest: &str) -> usize {
    rest.find(|c: char| !c.is_ascii_whitespace())
        .unwrap_or(rest.len())
}

/// Returns the byte length of the word at the start of `rest`.
///
/// The word stops before the first unescaped ASCII whitespace or quote
/// character. The caller guarantees that the first character is neither.
fn word_len(rest: &str) -> usize {
    let mut escaped = false;
    for (idx, c) in rest.char_indices() {
        let ends_word = c.is_ascii_whitespace() || c == '\'' || c == '"';
        if ends_word && !escaped {
            return idx;
        }
        escaped = !escaped && c == '\\';
    }
    rest.len()
}

/// Cuts the next token off the front of `state.remaining`.
///
/// Returns `None` once the remaining text is empty. All lengths are byte
/// lengths taken on character boundaries, so non-ASCII text is sliced safely.
fn next_token<'input>(state: &mut TokenizerState<'input>) -> Option<Token<'input>> {
    let rest: &'input str = state.remaining;
    let first = rest.chars().next()?;
    let start_pos = state.current_pos;

    let (token, consumed) = match first {
        '\'' | '"' => {
            let close = closing_quote_index(rest, first);
            state.invalid_quote |= close.is_none();
            let body_end = close.unwrap_or(rest.len());
            let val = Cow::Borrowed(&rest[1..body_end]);
            // The closing quote, when there is one, is consumed as well so
            // that the positions of the following tokens stay exact.
            let consumed = close.map_or(rest.len(), |idx| idx + 1);
            let token = if first == '\'' {
                Token::SingleQuote { val, start_pos }
            } else {
                Token::DoubleQuote { val, start_pos }
            };
            (token, consumed)
        }
        c if c.is_ascii_whitespace() => {
            let len = whitespace_run_len(rest);
            let val = Cow::Borrowed(&rest[..len]);
            (Token::WhiteSpace { val, start_pos }, len)
        }
        _ => {
            let len = word_len(rest);
            let val = Cow::Borrowed(&rest[..len]);
            (Token::Word { val, start_pos }, len)
        }
    };

    state.current_pos += consumed;
    state.remaining = &rest[consumed..];
    Some(token)
}

/// Returns an iterator over the tokens of `state.remaining`.
///
/// `state.current_pos` is reset to 0 first, so every `start_pos` is a byte
/// offset from the text that was remaining when this function was called.
/// `state.invalid_quote` is raised when a quote is still open at the end of
/// the text; the unterminated quotation is still returned as a quote token.
///
/// Splitting rules:
/// * `'...'` and `"..."` each give one token holding the raw text between the
///   quotes. Inside double quotes a backslash escapes the next character.
/// * A run of ASCII whitespace gives one `WhiteSpace` token.
/// * Anything else gives a `Word`, which ends before the first unescaped
///   whitespace or quote character.
fn tokenizer<'state, 'input: 'state>(
    state: &'state mut TokenizerState<'input>,
) -> impl Iterator<Item = Token<'input>> + 'state {
    state.current_pos = 0;
    std::iter::from_fn(move || next_token(state))
}

// This wrapper is not something to port to C; it only exists to let Rust
// store the token iterator next to the state it borrows.
/// Owns a [`TokenizerState`] together with the iterator reading from it.
struct TokenizerWrapper<'input> {
    /// Makes the type `!Unpin`: `iter` points into `first_pass`.
    _marker: std::marker::PhantomPinned,
    /// Declared before `first_pass` so that it is dropped first.
    iter: Box<dyn Iterator<Item = Token<'input>> + 'input>,
    /// The state borrowed by `iter`.
    first_pass: TokenizerState<'input>,
}

impl<'input> TokenizerWrapper<'input> {
    /// Moves `s` onto the heap and starts a [`tokenizer`] over it.
    fn new(s: TokenizerState<'input>) -> Pin<Box<Self>> {
        let raw: *mut Self =
            Box::into_raw(Box::new(std::mem::MaybeUninit::<Self>::uninit())).cast();
        // SAFETY: `raw` comes from a live `Box` allocation with the size and
        // alignment of `Self`. Every field is written through `addr_of_mut!`
        // before the value is treated as initialized, and no reference to the
        // partially initialized struct is created. The reference handed to
        // `tokenizer` targets that heap allocation, which never moves again
        // because the box is pinned immediately and the type is `!Unpin`.
        unsafe {
            std::ptr::addr_of_mut!((*raw)._marker).write(std::marker::PhantomPinned);
            std::ptr::addr_of_mut!((*raw).first_pass).write(s);
            let state: &'input mut TokenizerState<'input> =
                &mut *std::ptr::addr_of_mut!((*raw).first_pass);
            let iter: Box<dyn Iterator<Item = Token<'input>> + 'input> =
                Box::new(tokenizer(state));
            std::ptr::addr_of_mut!((*raw).iter).write(iter);
            Pin::new_unchecked(Box::from_raw(raw))
        }
    }
}

impl<'input> TokenizerWrapper<'input> {
    /// Returns the next token, or `None` when the input is exhausted.
    fn next(self: &mut Pin<Box<Self>>) -> Option<Token<'input>> {
        // SAFETY: only `iter` is advanced; nothing is moved out of the pinned
        // value.
        unsafe { self.as_mut().get_unchecked_mut().iter.next() }
    }
}
// End of the Rust-only wrapper.

// ---------------------------------------------------------------------------
// Second pass: expansion.
// ---------------------------------------------------------------------------

/// State of the expansion pass.
struct ExpenderState<'input> {
    /// Tokens produced by a substitution (such as `$HOME`) that still have to
    /// be handed out. Inside double quotes (`"$HOME"`) no splitting should be
    /// done. When this queue is not empty its front is returned; otherwise the
    /// next token is pulled from `tokenizer` and expanded if needed.
    need_push: VecDeque<ExpendedToken<'input>>,
    /// Source of raw tokens. Holding an iterator here is an artifact of the
    /// Rust prototype and should not be reflected in the C code, which will
    /// simply call `get_next_token(&state)` and get the next token (or EOF
    /// when no token is left).
    tokenizer: Pin<Box<TokenizerWrapper<'input>>>,
}

/// A token after the expansion pass; the variants mirror those of [`Token`].
#[derive(Debug, Clone, PartialEq, Eq)]
enum ExpendedToken<'input> {
    SingleQuote {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    DoubleQuote {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    WhiteSpace {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    Word {
        val: Cow<'input, str>,
        start_pos: usize,
    },
}

impl<'input> From<Token<'input>> for ExpendedToken<'input> {
    /// Converts a raw token into the matching variant, value and position
    /// unchanged.
    fn from(token: Token<'input>) -> Self {
        match token {
            Token::SingleQuote { val, start_pos } => ExpendedToken::SingleQuote { val, start_pos },
            Token::DoubleQuote { val, start_pos } => ExpendedToken::DoubleQuote { val, start_pos },
            Token::WhiteSpace { val, start_pos } => ExpendedToken::WhiteSpace { val, start_pos },
            Token::Word { val, start_pos } => ExpendedToken::Word { val, start_pos },
        }
    }
}

impl<'input> ExpenderState<'input> {
    /// Creates the expansion state for `input`, with an empty pending queue.
    fn new(input: &'input str) -> Self {
        Self {
            need_push: VecDeque::new(),
            tokenizer: TokenizerWrapper::new(TokenizerState::new(input)),
        }
    }
}

/// Returns an iterator over the expanded tokens of `input`.
///
/// Tokens waiting in `need_push` are returned first, front to back; after that
/// each raw token is pulled from the tokenizer. No expansion is implemented
/// yet: raw tokens are converted one-to-one.
fn expend<'state, 'input: 'state>(
    input: &'state mut ExpenderState<'input>,
) -> impl Iterator<Item = ExpendedToken<'input>> + 'state {
    // `move` stores the `&mut` reference itself in the closure, so the
    // returned iterator does not borrow this function's local binding.
    std::iter::from_fn(move || {
        if let Some(pending) = input.need_push.pop_front() {
            return Some(pending);
        }
        input.tokenizer.next().map(ExpendedToken::from)
    })
}

/// Reads standard input line by line and prints the tokens of each line.
fn main() {
    for line in std::io::stdin().lines() {
        let line = match line {
            Ok(text) => text,
            Err(err) => {
                eprintln!("failed to read standard input: {err}");
                std::process::exit(1);
            }
        };
        let mut state = ExpenderState::new(&line);
        println!("line is = '{line}'");
        println!("token are = {:?}", expend(&mut state).collect::<Vec<_>>());
    }
}