Work: Rust parser being updated (Tokenizer works™)

Maieul BOYER 2024-03-25 16:42:49 +01:00
parent 26ab7ff8b1
commit 8e8791e99f

@@ -90,9 +90,22 @@ struct TokenizerState<'input> {
     remaining: &'input str,
 }
 
+impl<'input> TokenizerState<'input> {
+    fn new(input: &'input str) -> Self {
+        Self {
+            current_pos: 0,
+            remaining: input,
+            input,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
 enum Token<'input> {
-    Thingy { val: &'input str, start_pos: usize },
     SingleQuote { val: &'input str, start_pos: usize },
+    DoubleQuote { val: &'input str, start_pos: usize },
+    WhiteSpace { val: &'input str, start_pos: usize },
+    Word { val: &'input str, start_pos: usize },
 }
 
 fn tokenizer<'state, 'input: 'state>(
@@ -103,34 +116,101 @@ fn tokenizer<'state, 'input: 'state>(
         let state = &mut *state;
         let mut chars = state.remaining.chars().peekable();
         let mut len = 1;
+        let mut escaped = false;
         let Some(chr) = chars.next() else {
             return None;
         };
         match chr {
             '\'' => {
-                while chars.peek().copied() != Some('\'') {
+                while let Some(s) = chars.peek().copied() {
+                    if s == '\'' {
+                        break;
+                    }
                     len += 1;
                     chars.next();
                 }
-                let old_current = state.current_pos;
-                state.current_pos += len;
-                let old_remaining = state.remaining;
-                state.remaining = &state.remaining[len..];
-                return (Some(Token::SingleQuote {
-                    val: &old_remaining[..len],
-                    start_pos: old_current,
-                }));
-            }
-            '"' => {}
-            _ => {}
-        }
-        Some(Token::Thingy {
-            val: state.input,
-            start_pos: 0,
+                let skip = chars.peek() == Some(&'\'');
+                let old_current = state.current_pos;
+                state.current_pos += len;
+                let old_remaining = state.remaining;
+                state.remaining = &state.remaining[(len + skip as usize)..];
+                return Some(Token::SingleQuote {
+                    val: &old_remaining[1..len],
+                    start_pos: old_current,
+                });
+            }
+            '"' => {
+                while let Some(s) = chars.peek().copied() {
+                    if !escaped && s == '\"' {
+                        break;
+                    }
+                    len += 1;
+                    escaped = chars.next() == Some('\\');
+                }
+                let skip = chars.peek() == Some(&'\"');
+                let old_current = state.current_pos;
+                state.current_pos += len;
+                let old_remaining = state.remaining;
+                state.remaining = &state.remaining[(len + skip as usize)..];
+                return Some(Token::DoubleQuote {
+                    val: &old_remaining[1..len],
+                    start_pos: old_current,
+                });
+            }
+            _ => {}
+        }
+        let was_whitespace = chr.is_ascii_whitespace();
+        while let Some(&chr) = chars.peek() {
+            if chr.is_ascii_whitespace() && !escaped && !was_whitespace {
+                dbg!(state.current_pos);
+                let old_current = state.current_pos;
+                state.current_pos += len;
+                let old_remaining = state.remaining;
+                state.remaining = &state.remaining[len..];
+                return Some(Token::Word {
+                    val: &old_remaining[..len],
+                    start_pos: old_current,
+                });
+            } else if !chr.is_ascii_whitespace() && was_whitespace {
+                let old_current = state.current_pos;
+                state.current_pos += len;
+                let old_remaining = state.remaining;
+                state.remaining = &state.remaining[len..];
+                return Some(Token::WhiteSpace {
+                    val: &old_remaining[..len],
+                    start_pos: old_current,
+                });
+            }
+            len += 1;
+            escaped = chars.next() == Some('\\');
+        }
+        let old_current = state.current_pos;
+        state.current_pos += len;
+        let old_remaining = state.remaining;
+        state.remaining = &state.remaining[len..];
+        Some(if was_whitespace {
+            Token::WhiteSpace {
+                val: &old_remaining[..len],
+                start_pos: old_current,
+            }
+        } else {
+            Token::Word {
+                val: &old_remaining[..len],
+                start_pos: old_current,
+            }
         })
     })
 }
-fn main() {}
+fn main() {
+    for line in std::io::stdin().lines() {
+        let line = line.unwrap();
+        let mut state = TokenizerState::new(&line);
+        println!("line is = '{line}'");
+        println!(
+            "token are = {:?}",
+            tokenizer(&mut state).collect::<Vec<_>>()
+        );
+    }
+}
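Not part of the commit above: a minimal test sketch showing how the new tokenizer could be exercised, assuming the code from the diff sits in main.rs; the sample line "echo 'hello world'" and the test name are illustrative only, with the expected tokens traced by hand from the new code.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn splits_words_whitespace_and_single_quotes() {
        // Hypothetical sample line: a word, a space, and a single-quoted string.
        let mut state = TokenizerState::new("echo 'hello world'");
        let tokens: Vec<_> = tokenizer(&mut state).collect();
        // Expect Word("echo"), WhiteSpace(" "), SingleQuote("hello world").
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Token::Word { val: "echo", start_pos: 0 }));
        assert!(matches!(tokens[1], Token::WhiteSpace { val: " ", start_pos: 4 }));
        // start_pos points at the opening quote; val excludes both quotes.
        assert!(matches!(
            tokens[2],
            Token::SingleQuote { val: "hello world", start_pos: 5 }
        ));
    }
}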