minishell/rust/parser/src/main.rs

#![allow(dead_code)]
use std::{borrow::Cow, collections::VecDeque, convert::Infallible as Never, pin::Pin};

/// A shell word, stored as an owned string.
type WORD = String;
/// Target of a file redirection (used for the `filename` fields of `IoFile`).
/// NOTE(review): the name presumably refers to rule 2 of the POSIX shell
/// grammar (redirection to or from a filename) — confirm.
type Rule2 = String;
/// End marker of a here-document (used for the `here_end` fields of `IoHere`).
/// NOTE(review): the name presumably refers to rule 3 of the POSIX shell
/// grammar (redirection from a here-document) — confirm.
type Rule3 = String;

/// Top-level syntax-tree node: a list of complete commands.
struct SourceFile {
    /// The complete commands held by this node.
    cmds: Vec<CompleteCommands>,
}

/// A list of pipelines, each paired with the separator that follows it.
struct CompleteCommands {
    // Every pipeline must be followed by a separator, except the last one,
    // whose separator is optional (hence the `Option`).
    list: Vec<(Pipeline, Option<SeparatorOp>)>,
}

/// Operator that separates two pipelines inside a `CompleteCommands` list.
///
/// The enum is a plain tag, so it derives the usual value-type traits; this
/// keeps it printable and comparable like the token types of this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SeparatorOp {
    /// The `;` separator.
    Semi,
    /// The `&` separator.
    Fork,
}

/// A sequence of commands in which each command is piped into the next one.
struct Pipeline {
    bang: bool,                 // placed before the sequence
    kind: Option<PipelineKind>, // placed after the sequence
    seq: Vec<Commands>,         // each command is piped into the next
}

/// Logical operator that may follow a pipeline (see `Pipeline::kind`).
///
/// The enum is a plain tag, so it derives the usual value-type traits; this
/// keeps it printable and comparable like the token types of this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PipelineKind {
    /// The `||` operator.
    Or,
    /// The `&&` operator.
    And,
}

/// The kinds of command that can appear in a pipeline sequence.
enum Commands {
    /// A simple command: prefix, command words and suffix.
    Simple(SimpleCommand),
    // `Never` is an alias of `std::convert::Infallible` (see the `use` line at
    // the top of the file), a type without values: the two variants below are
    // placeholders that cannot be constructed yet.
    Compound(Never),
    FuncDef(Never),
}

/// A simple command: optional prefix items, the command words, then suffix items.
struct SimpleCommand {
    /// Redirections and assignments stored in front of the command words.
    prefix: Vec<CmdPrefix>,
    // The first word follows rule 7a, the following ones rule 7b.
    // NOTE(review): presumably rules 7a/7b of the POSIX shell grammar — confirm.
    cmd: Vec<WORD>,
    /// Redirections and plain words stored after the command words.
    suffix: Vec<CmdSuffix>,
}

/// One item of a simple command's prefix.
enum CmdPrefix {
    /// An I/O redirection.
    IoRedirect(IoRedirect),
    /// A `key`/`value` assignment.
    Assigment(Assigment),
}

/// One item of a simple command's suffix.
enum CmdSuffix {
    /// An I/O redirection.
    IoRedirect(IoRedirect),
    /// A plain word.
    Word(WORD),
}

/// A single I/O redirection.
struct IoRedirect {
    // Optional number attached to the redirection.
    // NOTE(review): presumably the file descriptor the redirection applies to
    // (e.g. the `2` of `2>file`) — confirm.
    io_file: Option<usize>,
    /// Which redirection this is: a file redirection or a here-document.
    io_kind: IoKind,
}

/// The two families of redirection.
enum IoKind {
    /// Redirection that carries a filename (see `IoFile`).
    IoFile(IoFile),
    /// Here-document redirection (see `IoHere`).
    IoHere(IoHere),
}

/// File redirection operators; every variant carries the filename operand.
///
/// Derives the usual value-type traits so redirections can be printed, cloned
/// and compared, like the token types of this file.
#[derive(Debug, Clone, PartialEq)]
enum IoFile {
    /// The `<` operator.
    Less { filename: Rule2 },
    /// The `>` operator.
    Greater { filename: Rule2 },
    /// The `<&` operator.
    LessAnd { filename: Rule2 },
    /// The `>&` operator.
    GreaterAnd { filename: Rule2 },
    /// The `>>` operator.
    DoubleGreater { filename: Rule2 },
    /// The `<>` operator.
    LessGreater { filename: Rule2 },
    /// The `>|` operator.
    Clobber { filename: Rule2 },
}
/// Here-document operators; every variant carries the end marker operand.
///
/// Derives the usual value-type traits so here-documents can be printed,
/// cloned and compared, like the token types of this file.
#[derive(Debug, Clone, PartialEq)]
enum IoHere {
    /// The `<<` operator.
    Dless { here_end: Rule3 },
    /// The `<<-` operator.
    DlessDash { here_end: Rule3 },
}

// Rule 7
/// A `key`/`value` assignment found in a simple command's prefix.
///
/// Derives the usual value-type traits so assignments can be printed, cloned
/// and compared, like the token types of this file.
#[derive(Debug, Clone, PartialEq)]
struct Assigment {
    /// Left-hand side of the assignment.
    key: WORD,
    /// Right-hand side of the assignment.
    value: WORD,
}

/// Mutable cursor used by `tokenizer` while it walks over one input string.
#[derive(Default)]
struct TokenizerState<'input> {
    /// Position of the next token; `tokenizer` resets it to 0 when it starts
    /// and advances it after every token.
    current_pos: usize,
    /// The input handed to `TokenizerState::new`.
    input: &'input str,
    /// The part of the input that has not been turned into tokens yet.
    remaining: &'input str,
    /// Set by `tokenizer` when a quoted section has no closing quote.
    invalid_quote: bool,
    // `PhantomPinned` makes the type `!Unpin`; `TokenizerWrapper` keeps an
    // iterator that borrows this state, so the state must not be moved.
    _marker: std::marker::PhantomPinned,
}

impl<'input> TokenizerState<'input> {
    /// Builds a state positioned at the very beginning of `input`: nothing has
    /// been consumed yet and no invalid quote has been seen.
    fn new(input: &'input str) -> Self {
        // Everything except the two string slices keeps its default value
        // (position 0, `invalid_quote == false`).
        Self {
            input,
            remaining: input,
            ..Self::default()
        }
    }
}

// `Cow<'input, str>` is either an owned string (basically a `String`, which has to be freed)
// or a borrowed string (an `&'input str`, which does not need to be freed, at least not by us).

/// A raw token produced by `tokenizer`.
///
/// Every variant carries the token text (`val`) and the position at which the
/// token starts (`start_pos`, taken from `TokenizerState::current_pos`).
#[derive(Debug, Clone)]
enum Token<'input> {
    /// Text found between single quotes; the quotes are not part of `val`.
    SingleQuote {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// Text found between double quotes; the quotes are not part of `val`.
    DoubleQuote {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// A run of ASCII whitespace.
    WhiteSpace {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// A run of unquoted, non-whitespace text.
    Word {
        val: Cow<'input, str>,
        start_pos: usize,
    },
}

/// Returns the byte offset, inside `body`, of the quote that closes a section
/// opened by `quote`, or `None` when the section is never closed.
///
/// `body` is the text that follows the opening quote. Backslash escapes are
/// only honoured inside double quotes; single-quoted text is taken literally.
fn closing_quote_offset(body: &str, quote: char) -> Option<usize> {
    let honours_escapes = quote == '"';
    let mut escaped = false;
    for (offset, c) in body.char_indices() {
        if c == quote && !escaped {
            return Some(offset);
        }
        // A backslash only escapes when it is not itself escaped, so `\\`
        // leaves the character that follows it unescaped.
        escaped = honours_escapes && !escaped && c == '\\';
    }
    None
}

/// Returns the byte length of the whitespace run (`is_whitespace_run == true`)
/// or bare word (`false`) that starts at the beginning of `rest`.
///
/// A whitespace run stops at the first non-whitespace character. A word stops
/// at the first unescaped ASCII whitespace or quote character, so that a quote
/// opening in the middle of a word becomes its own quote token.
fn plain_token_len(rest: &str, is_whitespace_run: bool) -> usize {
    let mut chars = rest.char_indices();
    let Some((_, first)) = chars.next() else {
        return 0;
    };
    // The first character may itself be the backslash that escapes the next one.
    let mut escaped = first == '\\';
    for (offset, c) in chars {
        let at_boundary = if is_whitespace_run {
            !c.is_ascii_whitespace()
        } else {
            !escaped && (c.is_ascii_whitespace() || c == '\'' || c == '"')
        };
        if at_boundary {
            return offset;
        }
        escaped = !escaped && c == '\\';
    }
    rest.len()
}

/// Lazily splits `state.remaining` into single-quoted, double-quoted,
/// whitespace and word tokens.
///
/// `state.current_pos` is reset to 0 when the function is called. Each call to
/// the returned iterator's `next` consumes one token from the front of
/// `state.remaining` and advances `state.current_pos` by the number of bytes
/// consumed (quote delimiters included), so every `start_pos` is the byte
/// offset of the token's first character, counted from where tokenizing
/// started. All lengths are measured in bytes, which keeps slicing on
/// character boundaries for non-ASCII input.
///
/// Quote tokens carry the text between the quotes, with backslashes left
/// untouched. When a quote is never closed, the token takes everything up to
/// the end of the input and `state.invalid_quote` is set.
///
/// The iterator ends (`None`) once `state.remaining` is empty.
fn tokenizer<'state, 'input: 'state>(
    state: &'state mut TokenizerState<'input>,
) -> impl Iterator<Item = Token<'input>> + 'state {
    state.current_pos = 0;
    std::iter::from_fn(move || {
        let rest = state.remaining;
        let first = rest.chars().next()?;
        let start_pos = state.current_pos;

        let (token, consumed) = match first {
            '\'' | '"' => {
                // Both quote characters are one byte long.
                let body = &rest[1..];
                let (val, consumed) = match closing_quote_offset(body, first) {
                    // Opening quote + quoted text + closing quote.
                    Some(offset) => (&body[..offset], offset + 2),
                    None => {
                        state.invalid_quote = true;
                        (body, rest.len())
                    }
                };
                let val = val.into();
                let token = if first == '\'' {
                    Token::SingleQuote { val, start_pos }
                } else {
                    Token::DoubleQuote { val, start_pos }
                };
                (token, consumed)
            }
            _ => {
                let is_whitespace_run = first.is_ascii_whitespace();
                let consumed = plain_token_len(rest, is_whitespace_run);
                let val = rest[..consumed].into();
                let token = if is_whitespace_run {
                    Token::WhiteSpace { val, start_pos }
                } else {
                    Token::Word { val, start_pos }
                };
                (token, consumed)
            }
        };

        state.remaining = &rest[consumed..];
        state.current_pos += consumed;
        Some(token)
    })
}

// This part has no equivalent in the C version; it is only needed to make Rust happy.

/// Bundles a `TokenizerState` together with the token iterator that borrows it.
///
/// The struct is self-referential: `TokenizerWrapper::new` builds `iter` from a
/// mutable reference to `first_pass`, so a value must stay at the same address
/// once constructed (it is only handed out as `Pin<Box<Self>>`).
struct TokenizerWrapper<'input> {
    // Makes the wrapper `!Unpin`, so it cannot be moved out of its `Pin`.
    _marker: std::marker::PhantomPinned,
    /// The tokenizer state that `iter` reads and updates.
    // NOTE(review): fields are dropped in declaration order, so `first_pass` is
    // dropped before the `iter` that points at it. `TokenizerState` currently
    // holds nothing that needs dropping, so this is harmless today; declaring
    // `iter` first would keep it harmless if that ever changes.
    first_pass: TokenizerState<'input>,
    /// Type-erased iterator returned by `tokenizer(&mut first_pass)`.
    iter: Box<dyn Iterator<Item = Token<'input>> + 'input>,
}

impl<'input> TokenizerWrapper<'input> {
    /// Builds a pinned, heap-allocated wrapper that owns `s` and an iterator
    /// over the tokens of `s`.
    ///
    /// The wrapper is initialised in place, field by field, because `iter`
    /// must point at the final address of `first_pass`.
    fn new(s: TokenizerState<'input>) -> Pin<Box<Self>> {
        // Reserve the final heap location first; nothing is initialised yet.
        let mut value = Box::new(std::mem::MaybeUninit::<Self>::uninit());
        // SAFETY (as far as this function shows):
        // - `ptr` points into the live allocation owned by `value`;
        // - `first_pass` is written before a reference to it is created;
        // - fields are written through `addr_of_mut!`, so no reference to
        //   uninitialised memory is formed;
        // - `_marker` is never written: `PhantomPinned` is zero-sized, so it
        //   has no bytes to initialise;
        // - the box is pinned right away, so `first_pass` keeps the address
        //   that `iter` borrows.
        // NOTE(review): the reference given to `tokenizer` gets its lifetime
        // from a raw-pointer dereference, so the compiler does not check that
        // `first_pass` outlives `iter`; that is upheld only by both living in
        // the same pinned allocation.
        unsafe {
            let ptr = value.as_mut_ptr();
            std::ptr::write(std::ptr::addr_of_mut!((*ptr).first_pass), s);
            std::ptr::write(
                std::ptr::addr_of_mut!((*ptr).iter),
                Box::new(tokenizer(&mut *std::ptr::addr_of_mut!((*ptr).first_pass))),
            );
            // Every field is initialised now: reinterpret
            // `Pin<Box<MaybeUninit<Self>>>` as `Pin<Box<Self>>`.
            std::mem::transmute(Pin::new_unchecked(value))
        }
    }
}

impl<'input> TokenizerWrapper<'input> {
    /// Returns the next token of the wrapped iterator, or `None` once the
    /// iterator is exhausted.
    fn next(self: &mut Pin<Box<Self>>) -> Option<Token<'input>> {
        // SAFETY: the unpinned reference is only used to call `next` on the
        // `iter` field; nothing is moved out of the pinned allocation.
        let this: &mut Self = unsafe { self.as_mut().get_unchecked_mut() };
        this.iter.next()
    }
}

// End of the Rust-only part.

/// State of the expansion pass that sits on top of the tokenizer.
struct ExpenderState<'input> {
    // Tokens waiting to be handed out. They are meant to be inserted when a
    // substitution such as $HOME is made; for a quoted "$HOME" no splitting
    // should be done. When this queue holds anything it is emptied first;
    // otherwise the next token is taken from `tokenizer`, expanded if needed,
    // and returned.
    need_push: VecDeque<ExpendedToken<'input>>,
    // This field only exists because of how the Rust side is written: it
    // returns an iterator instead of a single token per function call. It
    // should not be mirrored in the C code, which will simply call
    // `get_next_token(&state)` to get the next token (or EOF when no token is
    // left).
    tokenizer: Pin<Box<TokenizerWrapper<'input>>>,
}

/// A token as produced by `expend`.
///
/// The variants mirror those of `Token` one to one; `expend` currently copies
/// `val` and `start_pos` over unchanged.
#[derive(Debug, Clone)]
enum ExpendedToken<'input> {
    /// Counterpart of `Token::SingleQuote`.
    SingleQuote {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// Counterpart of `Token::DoubleQuote`.
    DoubleQuote {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// Counterpart of `Token::WhiteSpace`.
    WhiteSpace {
        val: Cow<'input, str>,
        start_pos: usize,
    },
    /// Counterpart of `Token::Word`.
    Word {
        val: Cow<'input, str>,
        start_pos: usize,
    },
}

impl<'input> ExpenderState<'input> {
    /// Creates an expander over `input`: the tokenizer starts at the beginning
    /// of `input` and no token is waiting in `need_push`.
    fn new(input: &'input str) -> Self {
        let tokenizer = TokenizerWrapper::new(TokenizerState::new(input));
        let need_push = VecDeque::new();
        Self {
            need_push,
            tokenizer,
        }
    }
}

/// Returns an iterator over the expanded tokens of `input`.
///
/// Tokens waiting in `input.need_push` are handed out first, front to back.
/// Once the queue is empty, the next raw token is pulled from
/// `input.tokenizer` and converted to the `ExpendedToken` variant of the same
/// name, keeping `val` and `start_pos` as they are. The iterator yields `None`
/// when the queue is empty and the tokenizer has no token left.
fn expend<'state, 'input: 'state>(
    input: &'state mut ExpenderState<'input>,
) -> impl Iterator<Item = ExpendedToken<'input>> + 'state {
    std::iter::from_fn(move || {
        // Queued tokens take precedence over fresh tokenizer output.
        if let Some(pending) = input.need_push.pop_front() {
            return Some(pending);
        }

        let converted = match input.tokenizer.next()? {
            Token::SingleQuote { val, start_pos } => {
                ExpendedToken::SingleQuote { val, start_pos }
            }
            Token::DoubleQuote { val, start_pos } => {
                ExpendedToken::DoubleQuote { val, start_pos }
            }
            Token::WhiteSpace { val, start_pos } => {
                ExpendedToken::WhiteSpace { val, start_pos }
            }
            Token::Word { val, start_pos } => ExpendedToken::Word { val, start_pos },
        };
        Some(converted)
    })
}

/// Reads standard input line by line and prints, for every line, the line
/// itself followed by the tokens produced by `expend`.
///
/// # Errors
///
/// Returns the I/O error reported by standard input (for example when a line
/// is not valid UTF-8) instead of panicking on it.
fn main() -> std::io::Result<()> {
    for line in std::io::stdin().lines() {
        // Reading can fail on input we do not control: propagate, don't unwrap.
        let line = line?;
        let mut state = ExpenderState::new(&line);
        println!("line is = '{line}'");
        let tokens: Vec<_> = expend(&mut state).collect();
        println!("token are = {:?}", tokens);
    }
    Ok(())
}