Work: Rust parser being updated (Tokenizer works™)
parent 26ab7ff8b1
commit 8e8791e99f
1 changed file with 91 additions and 11 deletions

@@ -90,9 +90,22 @@ struct TokenizerState<'input> {
     remaining: &'input str,
 }
 
+impl<'input> TokenizerState<'input> {
+    fn new(input: &'input str) -> Self {
+        Self {
+            current_pos: 0,
+            remaining: input,
+            input,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
 enum Token<'input> {
-    Thingy { val: &'input str, start_pos: usize },
     SingleQuote { val: &'input str, start_pos: usize },
+    DoubleQuote { val: &'input str, start_pos: usize },
+    WhiteSpace { val: &'input str, start_pos: usize },
+    Word { val: &'input str, start_pos: usize },
 }
 
 fn tokenizer<'state, 'input: 'state>(
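
Note: the first hunk starts in the middle of TokenizerState, so only the remaining field is visible above. From the new constructor (current_pos: 0, remaining: input, input) and the old state.input usage, the struct presumably has roughly the three fields sketched below; the field order and the comments are inferences, not part of the commit.

    struct TokenizerState<'input> {
        input: &'input str,     // the full line being tokenized
        current_pos: usize,     // byte offset of the next token
        remaining: &'input str, // unconsumed tail of `input`
    }
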
@@ -103,34 +116,101 @@ fn tokenizer<'state, 'input: 'state>(
         let state = &mut *state;
         let mut chars = state.remaining.chars().peekable();
         let mut len = 1;
+        let mut escaped = false;
 
         let Some(chr) = chars.next() else {
             return None;
         };
         match chr {
             '\'' => {
-                while chars.peek().copied() != Some('\'') {
+                while let Some(s) = chars.peek().copied() {
+                    if s == '\'' {
+                        break;
+                    }
                     len += 1;
                     chars.next();
                 }
+                let skip = chars.peek() == Some(&'\'');
                 let old_current = state.current_pos;
                 state.current_pos += len;
                 let old_remaining = state.remaining;
-                state.remaining = &state.remaining[len..];
-                return (Some(Token::SingleQuote {
-                    val: &old_remaining[..len],
+                state.remaining = &state.remaining[(len + skip as usize)..];
+                return Some(Token::SingleQuote {
+                    val: &old_remaining[1..len],
                     start_pos: old_current,
-                }));
+                });
             }
-            '"' => {}
+            '"' => {
+                while let Some(s) = chars.peek().copied() {
+                    if !escaped && s == '\"' {
+                        break;
+                    }
+                    len += 1;
+                    escaped = chars.next() == Some('\\');
+                }
+                let skip = chars.peek() == Some(&'\"');
+                let old_current = state.current_pos;
+                state.current_pos += len;
+                let old_remaining = state.remaining;
+                state.remaining = &state.remaining[(len + skip as usize)..];
+                return Some(Token::DoubleQuote {
+                    val: &old_remaining[1..len],
+                    start_pos: old_current,
+                });
+            }
             _ => {}
         }
-
-        Some(Token::Thingy {
-            val: state.input,
-            start_pos: 0,
+        let was_whitespace = chr.is_ascii_whitespace();
+        while let Some(&chr) = chars.peek() {
+            if chr.is_ascii_whitespace() && !escaped && !was_whitespace {
+                dbg!(state.current_pos);
+                let old_current = state.current_pos;
+                state.current_pos += len;
+                let old_remaining = state.remaining;
+                state.remaining = &state.remaining[len..];
+                return Some(Token::Word {
+                    val: &old_remaining[..len],
+                    start_pos: old_current,
+                });
+            } else if !chr.is_ascii_whitespace() && was_whitespace {
+                let old_current = state.current_pos;
+                state.current_pos += len;
+                let old_remaining = state.remaining;
+                state.remaining = &state.remaining[len..];
+                return Some(Token::WhiteSpace {
+                    val: &old_remaining[..len],
+                    start_pos: old_current,
+                });
+            }
+            len += 1;
+            escaped = chars.next() == Some('\\');
+        }
+        let old_current = state.current_pos;
+        state.current_pos += len;
+        let old_remaining = state.remaining;
+        state.remaining = &state.remaining[len..];
+        Some(if was_whitespace {
+            Token::WhiteSpace {
+                val: &old_remaining[..len],
+                start_pos: old_current,
+            }
+        } else {
+            Token::Word {
+                val: &old_remaining[..len],
+                start_pos: old_current,
+            }
         })
     })
 }
 
-fn main() {}
+fn main() {
+    for line in std::io::stdin().lines() {
+        let line = line.unwrap();
+        let mut state = TokenizerState::new(&line);
+        println!("line is = '{line}'");
+        println!(
+            "token are = {:?}",
+            tokenizer(&mut state).collect::<Vec<_>>()
+        );
+    }
+}
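
The diff does not show the lines between the two hunks (new lines 112-115), so the exact signature and wrapper of tokenizer are not visible here. Given the return Some(...) / return None pattern in the body and the tokenizer(&mut state).collect::<Vec<_>>() call in main, it presumably returns an iterator built from a closure that mutably borrows the tokenizer state. Below is a minimal, self-contained sketch of that shape using std::iter::from_fn, with a simplified whitespace-splitting stand-in; all names, the return type, and the wrapper are illustrative assumptions, not taken from the commit.

    // Hypothetical stand-in: split on whitespace and report byte offsets,
    // driving the iterator with std::iter::from_fn over a mutable borrow of
    // the state -- the same shape the commit's tokenizer appears to use.
    struct State<'a> {
        remaining: &'a str,
        pos: usize,
    }

    fn words<'s, 'a: 's>(
        state: &'s mut State<'a>,
    ) -> impl Iterator<Item = (usize, &'a str)> + 's {
        std::iter::from_fn(move || {
            // Copy the remaining slice out of the state first so the returned
            // &str keeps the 'a lifetime (the commit does the same thing with
            // old_remaining).
            let remaining: &'a str = state.remaining;
            let trimmed = remaining.trim_start();
            state.pos += remaining.len() - trimmed.len();
            if trimmed.is_empty() {
                state.remaining = trimmed;
                return None;
            }
            // Take everything up to the next whitespace character as one word.
            let end = trimmed.find(char::is_whitespace).unwrap_or(trimmed.len());
            let (word, rest) = trimmed.split_at(end);
            let start = state.pos;
            state.pos += end;
            state.remaining = rest;
            Some((start, word))
        })
    }

    fn main() {
        let mut state = State { remaining: "hello  world", pos: 0 };
        // Prints [(0, "hello"), (7, "world")]
        println!("{:?}", words(&mut state).collect::<Vec<_>>());
    }
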