Lexer code deduplication and refactoring (#414)
This commit is contained in:
parent
0ad5574ecc
commit
d065d1c54f
99
src/lexer.rs
99
src/lexer.rs
@ -85,6 +85,21 @@ impl<'a> Lexer<'a> {
|
|||||||
&self.text[self.token_start.offset..self.token_end.offset]
|
&self.text[self.token_start.offset..self.token_end.offset]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Length of current token
|
||||||
|
fn current_token_length(&self) -> usize {
|
||||||
|
self.token_end.offset - self.token_start.offset
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Is next character c?
|
||||||
|
fn next_is(&self, c: char) -> bool {
|
||||||
|
self.next == Some(c)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Is next character ' ' or '\t'?
|
||||||
|
fn next_is_whitespace(&self) -> bool {
|
||||||
|
self.next_is(' ') || self.next_is('\t')
|
||||||
|
}
|
||||||
|
|
||||||
/// Un-lexed text
|
/// Un-lexed text
|
||||||
fn rest(&self) -> &'a str {
|
fn rest(&self) -> &'a str {
|
||||||
&self.text[self.token_end.offset..]
|
&self.text[self.token_end.offset..]
|
||||||
@ -95,9 +110,14 @@ impl<'a> Lexer<'a> {
|
|||||||
self.rest().starts_with(prefix)
|
self.rest().starts_with(prefix)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Length of current token
|
/// Does rest start with "\n" or "\r\n"?
|
||||||
fn current_token_length(&self) -> usize {
|
fn at_eol(&self) -> bool {
|
||||||
self.token_end.offset - self.token_start.offset
|
self.next_is('\n') || self.rest_starts_with("\r\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Are we at end-of-line or end-of-file?
|
||||||
|
fn at_eol_or_eof(&self) -> bool {
|
||||||
|
self.at_eol() || self.rest().is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get current state
|
/// Get current state
|
||||||
@ -237,7 +257,7 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
// Handle blank line
|
// Handle blank line
|
||||||
if rest.starts_with('\n') || rest.starts_with("\r\n") || rest.is_empty() {
|
if rest.starts_with('\n') || rest.starts_with("\r\n") || rest.is_empty() {
|
||||||
while let Some(' ') | Some('\t') = self.next {
|
while self.next_is_whitespace() {
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -250,7 +270,7 @@ impl<'a> Lexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Handle nonblank lines with no leading whitespace
|
// Handle nonblank lines with no leading whitespace
|
||||||
if self.next != Some(' ') && self.next != Some('\t') {
|
if !self.next_is_whitespace() {
|
||||||
if let State::Indented { .. } = self.state()? {
|
if let State::Indented { .. } = self.state()? {
|
||||||
self.token(Dedent);
|
self.token(Dedent);
|
||||||
self.pop_state()?;
|
self.pop_state()?;
|
||||||
@ -261,24 +281,9 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
// Handle continued indentation
|
// Handle continued indentation
|
||||||
if let State::Indented { indentation } = self.state()? {
|
if let State::Indented { indentation } = self.state()? {
|
||||||
let mut remaining = indentation.len();
|
if self.rest_starts_with(indentation) {
|
||||||
|
for _ in indentation.chars() {
|
||||||
// Advance over whitespace up to length of current indentation
|
|
||||||
while let Some(' ') | Some('\t') = self.next {
|
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
remaining -= 1;
|
|
||||||
if remaining == 0 {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let lexeme = self.lexeme();
|
|
||||||
|
|
||||||
if lexeme != indentation {
|
|
||||||
return Err(self.error(InconsistentLeadingWhitespace {
|
|
||||||
expected: indentation,
|
|
||||||
found: lexeme,
|
|
||||||
}));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Indentation matches, lex as whitespace
|
// Indentation matches, lex as whitespace
|
||||||
@ -287,6 +292,24 @@ impl<'a> Lexer<'a> {
|
|||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Consume whitespace characters, matching or not, up to the length
|
||||||
|
// of expected indentation
|
||||||
|
for _ in indentation.chars().zip(self.rest().chars()) {
|
||||||
|
if self.next_is_whitespace() {
|
||||||
|
self.advance()?;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We've either advanced over not enough whitespace or mismatching
|
||||||
|
// whitespace, so return an error
|
||||||
|
return Err(self.error(InconsistentLeadingWhitespace {
|
||||||
|
expected: indentation,
|
||||||
|
found: self.lexeme(),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
if self.state()? != State::Normal {
|
if self.state()? != State::Normal {
|
||||||
return Err(self.internal_error(format!(
|
return Err(self.internal_error(format!(
|
||||||
"Lexer::lex_line_start called in unexpected state: {:?}",
|
"Lexer::lex_line_start called in unexpected state: {:?}",
|
||||||
@ -295,7 +318,7 @@ impl<'a> Lexer<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Handle new indentation
|
// Handle new indentation
|
||||||
while let Some(' ') | Some('\t') = self.next {
|
while self.next_is_whitespace() {
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -356,7 +379,7 @@ impl<'a> Lexer<'a> {
|
|||||||
self.pop_state()?;
|
self.pop_state()?;
|
||||||
// Emit interpolation end token
|
// Emit interpolation end token
|
||||||
self.lex_double(InterpolationEnd)
|
self.lex_double(InterpolationEnd)
|
||||||
} else if self.rest_starts_with("\n") || self.rest_starts_with("\r\n") {
|
} else if self.at_eol_or_eof() {
|
||||||
// Return unterminated interpolation error that highlights the opening {{
|
// Return unterminated interpolation error that highlights the opening {{
|
||||||
Err(self.unterminated_interpolation_error(interpolation_start))
|
Err(self.unterminated_interpolation_error(interpolation_start))
|
||||||
} else {
|
} else {
|
||||||
@ -446,7 +469,7 @@ impl<'a> Lexer<'a> {
|
|||||||
fn lex_colon(&mut self) -> CompilationResult<'a, ()> {
|
fn lex_colon(&mut self) -> CompilationResult<'a, ()> {
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
|
|
||||||
if let Some('=') = self.next {
|
if self.next_is('=') {
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
self.token(ColonEquals);
|
self.token(ColonEquals);
|
||||||
} else {
|
} else {
|
||||||
@ -492,8 +515,10 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
/// Lex name: [a-zA-Z_][a-zA-Z0-9_]*
|
/// Lex name: [a-zA-Z_][a-zA-Z0-9_]*
|
||||||
fn lex_name(&mut self) -> CompilationResult<'a, ()> {
|
fn lex_name(&mut self) -> CompilationResult<'a, ()> {
|
||||||
while let Some('a'...'z') | Some('A'...'Z') | Some('0'...'9') | Some('_') | Some('-') =
|
while self
|
||||||
self.next
|
.next
|
||||||
|
.map(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
|
||||||
|
.unwrap_or(false)
|
||||||
{
|
{
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
}
|
}
|
||||||
@ -508,11 +533,7 @@ impl<'a> Lexer<'a> {
|
|||||||
// advance over #
|
// advance over #
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
|
|
||||||
loop {
|
while !self.at_eol_or_eof() {
|
||||||
if let Some('\r') | Some('\n') | None = self.next {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -523,22 +544,18 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
/// Lex backtick: `[^\r\n]*`
|
/// Lex backtick: `[^\r\n]*`
|
||||||
fn lex_backtick(&mut self) -> CompilationResult<'a, ()> {
|
fn lex_backtick(&mut self) -> CompilationResult<'a, ()> {
|
||||||
// advance over `
|
// advance over initial `
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
|
|
||||||
loop {
|
while !self.next_is('`') {
|
||||||
if let Some('\r') | Some('\n') | None = self.next {
|
if self.at_eol_or_eof() {
|
||||||
return Err(self.error(UnterminatedBacktick));
|
return Err(self.error(UnterminatedBacktick));
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some('`') = self.next {
|
|
||||||
self.advance()?;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
self.advance()?;
|
self.advance()?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.advance()?;
|
||||||
self.token(Backtick);
|
self.token(Backtick);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -546,7 +563,7 @@ impl<'a> Lexer<'a> {
|
|||||||
|
|
||||||
/// Lex whitespace: [ \t]+
|
/// Lex whitespace: [ \t]+
|
||||||
fn lex_whitespace(&mut self) -> CompilationResult<'a, ()> {
|
fn lex_whitespace(&mut self) -> CompilationResult<'a, ()> {
|
||||||
while let Some(' ') | Some('\t') = self.next {
|
while self.next_is_whitespace() {
|
||||||
self.advance()?
|
self.advance()?
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user