The main part I'm unsure about is the code for reading strings, numbers and identifiers - it feels too complex, but I wasn't able to figure out a better way of doing it without adding a load of extra String allocations. Is there a way I could slim that code down?
use std::str::CharIndices;
use tokens::{DelimToken, Token};
usepub itertools;
usemod itertools::MultiPeek;tokens;
use std::str::CharIndices;
use self::tokens::Token;
/// Reports whether `ch` is allowed as the first character of an identifier:
/// an underscore or an ASCII letter.
#[inline]
fn is_id_start(ch: char) -> bool {
    match ch {
        '_' | 'a'..='z' | 'A'..='Z' => true,
        _ => false,
    }
}
/// Reports whether `ch` is an underscore or an ASCII digit.
///
/// Letters are deliberately not covered here; callers that want the full
/// "identifier body" set combine this with `is_id_start`.
#[inline]
fn is_id_continue(ch: char) -> bool {
    match ch {
        '_' | '0'..='9' => true,
        _ => false,
    }
}
/// A position in the source text. The lexer fills these in from the indices
/// produced by `str::char_indices`, so they are byte offsets into the input.
pub type Location = usize;
/// Errors the lexer can report while scanning source text.
#[derive(Debug, Fail, PartialEq)]
pub enum LexicalError {
/// A character the lexer could not turn into a token; `location` is where
/// that character was found.
#[fail(display = "Invalid character '{}' found at {}", ch, location)]
InvalidCharacter { ch: char, location: Location },
/// A string literal whose closing `"` was never found; `location` is where
/// the string starts.
#[fail(display = "String starting at {} was not terminated", location)]
UnterminatedString { location: Location },
}
/// What the lexer yields for each token: on success a `(start, token, end)`
/// triple of locations around the token, otherwise a `LexicalError`.
pub type SpanResult<'input> = Result<(Location, Token<'input>, Location), LexicalError>;
/// Streaming lexer over a borrowed source string.
///
/// Tokens borrow from `source`, so no per-token `String` is allocated. Two
/// characters of lookahead are buffered by hand, which is enough to tell a
/// decimal point from a field access without a peeking adaptor.
pub struct Lexer<'input> {
    /// The complete input; token text is sliced out of this.
    source: &'input str,
    /// Characters (with their offsets) that have not been buffered yet.
    chars: CharIndices<'input>,
    /// The next character to be consumed, if any.
    lookahead: Option<(usize, char)>,
    /// The character after `lookahead`, if any.
    lookahead2: Option<(usize, char)>,
}
impl<'input> Lexer<'input> {
pub fn new(source: &'input str) -> Lexer<'input> {
let mut chars = source.char_indices();
let lookahead = chars.next();
let lookahead2 = chars.next();
Lexer {
source,
chars:,
itertools::multipeek(source.char_indices()) lookahead,
lookahead2,
}
}
fn skip_to_line_endbump(&mut self) -> Option<(usize, char)> {
let next = self.lookahead;
self.lookahead = self.lookahead2;
self.lookahead2 = self.chars.next();
next
}
fn take_until<F>(&mut self, mut terminate: F) -> Option<usize>
where
F: FnMut(char) -> bool,
{
while let Some(&(_i, ch)) = self.chars.peek()lookahead {
if terminate(ch != '\n') {
self.chars.nextreturn Some(i);
} else {
break;self.bump();
}
}
None
}
fn read_stringtake_while<F>(&mut self, positionmut condition: usizeF) -> Result<Token<'input>Option<usize>
where
F: FnMut(char) -> bool,
String> {
letself.take_until(|ch| mut!condition(ch))
end = position; }
fn skip_to_line_end(&mut self) {
while let Some( self.take_while(i,|ch| ch != '\n');
}
fn skip_whitespace(&mut self) ={
self.charstake_while(|ch| ch.nextis_whitespace());
{ }
fn read_string(&mut self, pos: usize) -> SpanResult<'input> {
if match self.take_until(|ch| ch == '"') {
Some(i) => {
return Ok(Token::String(&self.source[position + 1..end + 1]));
self.bump();
} else {
Ok((pos, Token::String(&self.source[pos + 1..i]), endi =+ i;1))
}
None => Err(LexicalError::UnterminatedString { location: pos }),
}
Err(format!("Unterminated string"))
}
fn read_number(&mut self, positionpos: usize) -> Result<Token<'input>, String>SpanResult<'input> {
let mut end = position;
let mut consumed_dot =self.take_while(|ch| false;ch.is_ascii_digit());
whileif let Some(&(i_, ch'.')) = self.chars.peek() {
match chlookahead {
// If we encounter a dot, we need to do an extra character of
// lookahead to checkCheck whetherif it's a decimal or a field
// access
// TODO: This code could almost certainly be cleaner
'.' if !consumed_dot => match self.chars.peek() {
let Some(&(_, next_ch)) if= next_chself.is_ascii_digit() =>lookahead2 {
end = i;
consumed_dot = true;
if self.charsnext_ch.nextis_ascii_digit();
}
_ => {
break;
}
},
ch if chself.is_ascii_digitbump() => {;
end = i;
self.charstake_while(|ch| ch.nextis_ascii_digit());
}
_ => {
break;
}
}
}
Oklet end = end.unwrap_or_else(Token::Number|| self.source.len());
Ok((
self.source[position..end + 1]
pos,
Token::Number(self.source[pos..end].parse()
.expect("unparsable number")),
.expect("unparsable number")end,
))
}
fn read_identifier(&mut self, positionpos: usize) -> Result<Token<'input>, String>SpanResult<'input> {
let mut end = position;
while let Some(&(i, ch)) = self.chars.peektake_while() {
if|ch| is_id_start(ch) || is_id_continue(ch) {
end = i;)
.unwrap_or_else(|| self.charssource.nextlen());
} else {
break;
}
}
match &self.source[positionsource[pos..end + 1]end] {
"and""else" => Ok(Token::And)(pos,
"else" => Ok(Token::Else, end)),
"false" => Ok((pos, Token::False, end)),
"fn" => Ok((pos, Token::Fn, end)),
"for" => Ok((pos, Token::For, end)),
"if" => Ok((pos, Token::If, end)),
"nil" => Ok((pos, Token::Nil),
"or" => Ok(Token::Orend)),
"print" => Ok((pos, Token::Print, end)),
"return" => Ok((pos, Token::Return, end)),
"this" => Ok((pos, Token::This, end)),
"true" => Ok((pos, Token::True, end)),
"let" => Ok((pos, Token::Let, end)),
"while" => Ok((pos, Token::While, end)),
id => Ok((pos, Token::Identifier(id), end)),
}
}
}
impl<'input> Iterator for Lexer<'input> {
type Item = Result<Token<'input>, String>;SpanResult<'input>;
fn next(&mut self) -> Option<Result<Token<'input>, String>>Option<SpanResult<'input>> {
whileself.skip_whitespace();
if let Some((i, ch)) = self.chars.nextbump() {
return match ch {
'{' => Some(Ok(Token::OpenDelim(DelimTokeni, Token::BraceOpenBrace, i + 1))),
'}' => Some(Ok(Token::CloseDelim(DelimTokeni, Token::BraceCloseBrace, i + 1))),
'(' => Some(Ok(Token::OpenDelim(DelimTokeni, Token::ParenOpenParen, i + 1))),
')' => Some(Ok(Token::CloseDelim(DelimTokeni, Token::ParenCloseParen, i + 1))),
'[' => Some(Ok(Token::OpenDelim(DelimTokeni, Token::BracketOpenBracket, i + 1))),
']' => Some(Ok((i, Token::CloseDelimCloseBracket, i + 1))),
';' => Some(DelimTokenOk((i, Token::BracketSemicolon, i + 1))),
',' => Some(Ok((i, Token::Comma, i + 1))),
'.' => Some(Ok((i, Token::Dot, i + 1))),
'+' => Some(Ok((i, Token::Plus, i + 1))),
'-' => Some(Ok((i, Token::Minus, i + 1))),
'*' => Some(Ok((i, Token::Star, i + 1))),
'/' => {
if let Some(&(_, '/')) = self.chars.peek()lookahead {
self.skip_to_line_end();
continue;self.next()
} else {
Some(Ok((i, Token::Slash, i + 1)))
}
}
'!' => {
if let Some(&(_, '=')) = self.chars.peek()lookahead {
self.chars.nextbump();
Some(Ok((i, Token::NotEqual, i + 2)))
} else {
Some(Ok((i, Token::Not, i + 1)))
}
}
'=' => {
if let Some(&(_, '=')) = self.chars.peek()lookahead {
self.chars.nextbump();
Some(Ok((i, Token::EqualEqual, i + 2)))
} else {
Some(Ok((i, Token::Equal, i + 1)))
}
}
'>' => {
if let Some(&(_, '=')) = self.chars.peek()lookahead {
self.chars.nextbump();
Some(Ok((i, Token::GreaterEqual, i + 2)))
} else {
Some(Ok((i, Token::Greater, i + 1)))
}
}
'<' => {
if let Some(&(_, '=')) = self.chars.peek()lookahead {
self.chars.nextbump();
Some(Ok((i, Token::LessEqual, i + 2)))
} else {
Some(Ok((i, Token::Less, i + 1)))
}
}
'"''&' => {
if let Some((_, '&')) = self.read_stringlookahead {
self.bump(i)),;
'\n' => Some(Ok((i, Token::NewLineAmpAmp, i + 2)))
} else {
Some(Err(LexicalError::InvalidCharacter { ch, location: i }))
}
}
'|' => {
if let Some((_, '|')) = self.lookahead {
self.bump();
Some(Ok((i, Token::PipePipe, i + 2)))
} else {
Some(Err(LexicalError::InvalidCharacter { ch, location: i }))
}
}
'"' => Some(self.read_string(i)),
ch if is_id_start(ch) => Some(self.read_identifier(i)),
ch if ch.is_ascii_digit() => Some(self.read_number(i)),
ch if ch.is_whitespace() => continue,
ch => Some(Err(format!("Unexpected tokenLexicalError::InvalidCharacter {}", ch), location: i })),
};
} else {
None
}
None
}
}
(Before you mention - I know String errors aren't great, I'm working on it :p)