Reform indentation handling (#565)

Improve indentation handling in preparation for implementing inline
submodules. The lexer now parses freeform text only inside the first
indent after a ':', so that just can be extended with new indented
constructs that are not recipe bodies. The lexer also now handles
multiple levels of indentation correctly.
Casey Rodarmor 2019-12-11 20:25:16 -08:00 committed by GitHub
parent 66121d478b
commit bb4afe1481
4 changed files with 322 additions and 135 deletions
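
At a high level, the change replaces the lexer's single State stack with a stack of indentation prefixes: when a line's leading whitespace extends the current prefix, the lexer pushes it and emits an Indent token; when the whitespace falls back to an earlier prefix, it pops entries and emits one Dedent per popped level, which is what makes nested indentation work. The standalone sketch below illustrates that general technique only; it is not the just lexer, and it omits the mixed- and inconsistent-whitespace errors and the recipe-body handling of the real implementation. The Tok type and lex_indentation function are made up for the illustration.

// Illustrative sketch only (not the just lexer): emit Indent/Dedent tokens
// from a stack of indentation prefixes so that nested blocks are handled.

#[derive(Debug)]
enum Tok<'src> {
  Indent(&'src str),
  Dedent,
  Line(&'src str),
}

fn lex_indentation(src: &str) -> Vec<Tok<'_>> {
  // Bottom of the stack is the empty prefix, mirroring `indentation: vec![""]`.
  let mut stack: Vec<&str> = vec![""];
  let mut tokens = Vec::new();

  for line in src.lines() {
    let text = line.trim_start_matches(|c| c == ' ' || c == '\t');
    if text.is_empty() {
      continue; // blank lines leave the indentation stack untouched
    }
    let indent = &line[..line.len() - text.len()];

    if indent.len() > stack.last().unwrap().len() {
      // Deeper prefix: open a new indentation level.
      stack.push(indent);
      tokens.push(Tok::Indent(indent));
    } else {
      // Shallower or equal prefix: close levels until it matches.
      while stack.len() > 1 && *stack.last().unwrap() != indent {
        stack.pop();
        tokens.push(Tok::Dedent);
      }
    }
    tokens.push(Tok::Line(text));
  }

  // Close any levels still open at end of input.
  while stack.len() > 1 {
    stack.pop();
    tokens.push(Tok::Dedent);
  }

  tokens
}

fn main() {
  for token in lex_indentation("a\n  b\n    c\n  d\n") {
    println!("{:?}", token);
  }
}

On the input "a\n  b\n    c\n  d\n" the sketch emits an Indent before b, a second Indent before c, a Dedent before d, and a final Dedent at end of input — the same shape the indent_indent_dedent_indent test below expects from the real lexer.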

src/common.rs

@@ -60,7 +60,7 @@ pub(crate) use crate::{
   position::Position, positional::Positional, recipe::Recipe, recipe_context::RecipeContext,
   recipe_resolver::RecipeResolver, runtime_error::RuntimeError, scope::Scope, search::Search,
   search_config::SearchConfig, search_error::SearchError, set::Set, setting::Setting,
-  settings::Settings, shebang::Shebang, show_whitespace::ShowWhitespace, state::State,
+  settings::Settings, shebang::Shebang, show_whitespace::ShowWhitespace,
   string_literal::StringLiteral, subcommand::Subcommand, table::Table, thunk::Thunk, token::Token,
   token_kind::TokenKind, unresolved_dependency::UnresolvedDependency,
   unresolved_recipe::UnresolvedRecipe, use_color::UseColor, variables::Variables,

src/lexer.rs

@@ -18,14 +18,20 @@ pub(crate) struct Lexer<'src> {
   chars: Chars<'src>,
   /// Tokens
   tokens: Vec<Token<'src>>,
-  /// State stack
-  state: Vec<State<'src>>,
   /// Current token start
   token_start: Position,
   /// Current token end
   token_end: Position,
   /// Next character to be lexed
   next: Option<char>,
+  /// Next indent will start a recipe body
+  recipe_body_pending: bool,
+  /// Inside recipe body
+  recipe_body: bool,
+  /// Indentation stack
+  indentation: Vec<&'src str>,
+  /// Current interpolation start token
+  interpolation_start: Option<Token<'src>>,
 }
 
 impl<'src> Lexer<'src> {
@@ -46,10 +52,13 @@ impl<'src> Lexer<'src> {
     };
 
     Lexer {
-      state: vec![State::Normal],
+      indentation: vec![""],
       tokens: Vec::new(),
       token_start: start,
       token_end: start,
+      recipe_body_pending: false,
+      recipe_body: false,
+      interpolation_start: None,
       chars,
       next,
       src,
@@ -123,22 +132,14 @@ impl<'src> Lexer<'src> {
     self.at_eol() || self.rest().is_empty()
   }
 
-  /// Get current state
-  fn state(&self) -> CompilationResult<'src, State<'src>> {
-    if self.state.is_empty() {
-      Err(self.internal_error("Lexer state stack empty"))
-    } else {
-      Ok(self.state[self.state.len() - 1])
-    }
+  /// Get current indentation
+  fn indentation(&self) -> &'src str {
+    self.indentation.last().cloned().unwrap()
   }
 
-  /// Pop current state from stack
-  fn pop_state(&mut self) -> CompilationResult<'src, ()> {
-    if self.state.pop().is_none() {
-      Err(self.internal_error("Lexer attempted to pop in start state"))
-    } else {
-      Ok(())
-    }
+  /// Are we currently indented
+  fn indented(&self) -> bool {
+    !self.indentation().is_empty()
   }
 
   /// Create a new token with `kind` whose lexeme
@@ -260,39 +261,55 @@ impl<'src> Lexer<'src> {
       }
 
       match self.next {
-        Some(first) => match self.state()? {
-          State::Normal => self.lex_normal(first)?,
-          State::Interpolation {
-            interpolation_start,
-          } => self.lex_interpolation(interpolation_start, first)?,
-          State::Text => self.lex_text()?,
-          State::Indented { .. } => self.lex_indented()?,
-        },
+        Some(first) => {
+          if let Some(interpolation_start) = self.interpolation_start {
+            self.lex_interpolation(interpolation_start, first)?
+          } else if self.recipe_body {
+            self.lex_body()?
+          } else {
+            self.lex_normal(first)?
+          };
+        }
         None => break,
       }
     }
 
-    if let State::Interpolation {
-      interpolation_start,
-    } = self.state()?
-    {
+    if let Some(interpolation_start) = self.interpolation_start {
       return Err(self.unterminated_interpolation_error(interpolation_start));
     }
 
-    if let State::Indented { .. } | State::Text = self.state()? {
-      self.token(Dedent);
+    while self.indented() {
+      self.lex_dedent();
     }
 
     self.token(Eof);
 
     assert_eq!(self.token_start.offset, self.token_end.offset);
     assert_eq!(self.token_start.offset, self.src.len());
+    assert_eq!(self.indentation.len(), 1);
 
     Ok(self.tokens)
   }
 
   /// Handle blank lines and indentation
   fn lex_line_start(&mut self) -> CompilationResult<'src, ()> {
+    enum Indentation<'src> {
+      // Line only contains whitespace
+      Blank,
+      // Indentation continues
+      Continue,
+      // Indentation decreases
+      Decrease,
+      // Indentation isn't consistent
+      Inconsistent,
+      // Indentation increases
+      Increase,
+      // Indentation mixes spaces and tabs
+      Mixed { whitespace: &'src str },
+    }
+
+    use Indentation::*;
+
     let nonblank_index = self
       .rest()
       .char_indices()
@@ -303,92 +320,127 @@ impl<'src> Lexer<'src> {
 
     let rest = &self.rest()[nonblank_index..];
 
-    // Handle blank line
-    if rest.starts_with('\n') || rest.starts_with("\r\n") || rest.is_empty() {
-      while self.next_is_whitespace() {
-        self.advance()?;
-      }
-
-      // Lex a whitespace token if the blank line was nonempty
-      if self.current_token_length() > 0 {
-        self.token(Whitespace);
-      };
-
-      return Ok(());
-    }
-
-    // Handle nonblank lines with no leading whitespace
-    if !self.next_is_whitespace() {
-      if let State::Indented { .. } = self.state()? {
-        self.token(Dedent);
-        self.pop_state()?;
-      }
-
-      return Ok(());
-    }
-
-    // Handle continued indentation
-    if let State::Indented { indentation } = self.state()? {
-      if self.rest_starts_with(indentation) {
-        for _ in indentation.chars() {
-          self.advance()?;
-        }
-
-        // Indentation matches, lex as whitespace
-        self.token(Whitespace);
-
-        return Ok(());
-      }
-
-      // Consume whitespace characters, matching or not, up to the length
-      // of expected indentation
-      for _ in indentation.chars().zip(self.rest().chars()) {
-        if self.next_is_whitespace() {
-          self.advance()?;
-        } else {
-          break;
-        }
-      }
-
-      // We've either advanced over not enough whitespace or mismatching
-      // whitespace, so return an error
-      return Err(self.error(InconsistentLeadingWhitespace {
-        expected: indentation,
-        found: self.lexeme(),
-      }));
-    }
-
-    if self.state()? != State::Normal {
-      return Err(self.internal_error(format!(
-        "Lexer::lex_line_start called in unexpected state: {:?}",
-        self.state()
-      )));
-    }
-
-    // Handle new indentation
-    while self.next_is_whitespace() {
-      self.advance()?;
-    }
-
-    let indentation = self.lexeme();
-
-    let spaces = indentation.chars().any(|c| c == ' ');
-    let tabs = indentation.chars().any(|c| c == '\t');
-
-    if spaces && tabs {
-      return Err(self.error(MixedLeadingWhitespace {
-        whitespace: indentation,
-      }));
-    }
-
-    self.state.push(State::Indented { indentation });
-
-    self.token(Indent);
-
-    Ok(())
-  }
-
-  /// Lex token beginning with `start` in normal state
+    let whitespace = &self.rest()[..nonblank_index];
+
+    let body_whitespace = &whitespace[..whitespace
+      .char_indices()
+      .take(self.indentation().chars().count())
+      .map(|(i, _c)| i)
+      .next()
+      .unwrap_or(0)];
+
+    let spaces = whitespace.chars().any(|c| c == ' ');
+    let tabs = whitespace.chars().any(|c| c == '\t');
+
+    let body_spaces = body_whitespace.chars().any(|c| c == ' ');
+    let body_tabs = body_whitespace.chars().any(|c| c == '\t');
+
+    #[allow(clippy::if_same_then_else)]
+    let indentation = if rest.starts_with('\n') || rest.starts_with("\r\n") || rest.is_empty() {
+      Blank
+    } else if whitespace == self.indentation() {
+      Continue
+    } else if self.indentation.contains(&whitespace) {
+      Decrease
+    } else if self.recipe_body && whitespace.starts_with(self.indentation()) {
+      Continue
+    } else if self.recipe_body && body_spaces && body_tabs {
+      Mixed {
+        whitespace: body_whitespace,
+      }
+    } else if !self.recipe_body && spaces && tabs {
+      Mixed { whitespace }
+    } else if whitespace.len() < self.indentation().len() {
+      Inconsistent
+    } else if self.recipe_body
+      && body_whitespace.len() >= self.indentation().len()
+      && !body_whitespace.starts_with(self.indentation())
+    {
+      Inconsistent
+    } else if whitespace.len() >= self.indentation().len()
+      && !whitespace.starts_with(self.indentation())
+    {
+      Inconsistent
+    } else {
+      Increase
+    };
+
+    match indentation {
+      Blank => {
+        if !whitespace.is_empty() {
+          while self.next_is_whitespace() {
+            self.advance()?;
+          }
+
+          self.token(Whitespace);
+        };
+
+        Ok(())
+      }
+      Continue => {
+        if !self.indentation().is_empty() {
+          for _ in self.indentation().chars() {
+            self.advance()?;
+          }
+
+          self.token(Whitespace);
+        }
+
+        Ok(())
+      }
+      Decrease => {
+        while self.indentation() != whitespace {
+          self.lex_dedent();
+        }
+
+        if !whitespace.is_empty() {
+          while self.next_is_whitespace() {
+            self.advance()?;
+          }
+
+          self.token(Whitespace);
+        }
+
+        Ok(())
+      }
+      Mixed { whitespace } => {
+        for _ in whitespace.chars() {
+          self.advance()?;
+        }
+
+        Err(self.error(MixedLeadingWhitespace { whitespace }))
+      }
+      Inconsistent => {
+        for _ in whitespace.chars() {
+          self.advance()?;
+        }
+
+        Err(self.error(InconsistentLeadingWhitespace {
+          expected: self.indentation(),
+          found: whitespace,
+        }))
+      }
+      Increase => {
+        while self.next_is_whitespace() {
+          self.advance()?;
+        }
+
+        let indentation = self.lexeme();
+
+        self.indentation.push(indentation);
+
+        self.token(Indent);
+
+        if self.recipe_body_pending {
+          self.recipe_body = true;
+        }
+
+        Ok(())
+      }
+    }
+  }
+
+  /// Lex token beginning with `start` outside of a recipe body
   fn lex_normal(&mut self, start: char) -> CompilationResult<'src, ()> {
     match start {
       '@' => self.lex_single(At),
@@ -420,7 +472,7 @@ impl<'src> Lexer<'src> {
     }
   }
 
-  /// Lex token beginning with `start` in interpolation state
+  /// Lex token beginning with `start` inside an interpolation
   fn lex_interpolation(
     &mut self,
     interpolation_start: Token<'src>,
@@ -428,21 +480,21 @@ impl<'src> Lexer<'src> {
   ) -> CompilationResult<'src, ()> {
     // Check for end of interpolation
     if self.rest_starts_with("}}") {
-      // Pop interpolation state
-      self.pop_state()?;
+      // end current interpolation
+      self.interpolation_start = None;
       // Emit interpolation end token
       self.lex_double(InterpolationEnd)
     } else if self.at_eol_or_eof() {
       // Return unterminated interpolation error that highlights the opening {{
       Err(self.unterminated_interpolation_error(interpolation_start))
     } else {
-      // Otherwise lex as if we are in normal state
+      // Otherwise lex as per normal
       self.lex_normal(start)
     }
   }
 
-  /// Lex token beginning with `start` in text state
-  fn lex_text(&mut self) -> CompilationResult<'src, ()> {
+  /// Lex token while in recipe body
+  fn lex_body(&mut self) -> CompilationResult<'src, ()> {
     enum Terminator {
       Newline,
       NewlineCarriageReturn,
@@ -478,29 +530,23 @@ impl<'src> Lexer<'src> {
     }
 
     match terminator {
-      Newline => {
-        self.state.pop();
-        self.lex_single(Eol)
-      }
-      NewlineCarriageReturn => {
-        self.state.pop();
-        self.lex_double(Eol)
-      }
+      Newline => self.lex_single(Eol),
+      NewlineCarriageReturn => self.lex_double(Eol),
       Interpolation => {
         self.lex_double(InterpolationStart)?;
-        self.state.push(State::Interpolation {
-          interpolation_start: self.tokens[self.tokens.len() - 1],
-        });
+        self.interpolation_start = Some(self.tokens[self.tokens.len() - 1]);
         Ok(())
       }
-      EndOfFile => self.pop_state(),
+      EndOfFile => Ok(()),
     }
   }
 
-  /// Lex token beginning with `start` in indented state
-  fn lex_indented(&mut self) -> CompilationResult<'src, ()> {
-    self.state.push(State::Text);
-    Ok(())
+  fn lex_dedent(&mut self) {
+    assert_eq!(self.current_token_length(), 0);
+    self.token(Dedent);
+    self.indentation.pop();
+    self.recipe_body_pending = false;
+    self.recipe_body = false;
   }
 
   /// Lex a single character token
@@ -527,6 +573,7 @@ impl<'src> Lexer<'src> {
       self.token(ColonEquals);
     } else {
       self.token(Colon);
+      self.recipe_body_pending = true;
     }
 
     Ok(())
@@ -926,6 +973,126 @@ mod tests {
     tokens: (Identifier:"foo", Colon, Eol, Indent:"  ", Text:"a", Dedent),
   }
 
+  test! {
+    name: indented_normal,
+    text: "
+      a
+        b
+        c
+    ",
+    tokens: (
+      Identifier:"a",
+      Eol,
+      Indent:"  ",
+      Identifier:"b",
+      Eol,
+      Whitespace:"  ",
+      Identifier:"c",
+      Eol,
+      Dedent,
+    ),
+  }
+
+  test! {
+    name: indented_normal_nonempty_blank,
+    text: "a\n  b\n\t\t\n  c\n",
+    tokens: (
+      Identifier:"a",
+      Eol,
+      Indent:"  ",
+      Identifier:"b",
+      Eol,
+      Whitespace:"\t\t",
+      Eol,
+      Whitespace:"  ",
+      Identifier:"c",
+      Eol,
+      Dedent,
+    ),
+  }
+
+  test! {
+    name: indented_normal_multiple,
+    text: "
+      a
+        b
+          c
+    ",
+    tokens: (
+      Identifier:"a",
+      Eol,
+      Indent:"  ",
+      Identifier:"b",
+      Eol,
+      Indent:"    ",
+      Identifier:"c",
+      Eol,
+      Dedent,
+      Dedent,
+    ),
+  }
+
+  test! {
+    name: indent_indent_dedent_indent,
+    text: "
+      a
+        b
+          c
+        d
+          e
+    ",
+    tokens: (
+      Identifier:"a",
+      Eol,
+      Indent:"  ",
+      Identifier:"b",
+      Eol,
+      Indent:"    ",
+      Identifier:"c",
+      Eol,
+      Dedent,
+      Whitespace:"  ",
+      Identifier:"d",
+      Eol,
+      Indent:"    ",
+      Identifier:"e",
+      Eol,
+      Dedent,
+      Dedent,
+    ),
+  }
+
+  test! {
+    name: indent_recipe_dedent_indent,
+    text: "
+      a
+        b:
+          c
+        d
+          e
+    ",
+    tokens: (
+      Identifier:"a",
+      Eol,
+      Indent:"  ",
+      Identifier:"b",
+      Colon,
+      Eol,
+      Indent:"    ",
+      Text:"c",
+      Eol,
+      Dedent,
+      Whitespace:"  ",
+      Identifier:"d",
+      Eol,
+      Indent:"    ",
+      Identifier:"e",
+      Eol,
+      Dedent,
+      Dedent,
+    ),
+  }
+
   test! {
     name: indented_block,
     text: "
@@ -1646,7 +1813,7 @@ mod tests {
     offset: 12,
     line: 3,
     column: 0,
-    width: 2,
+    width: 3,
     kind: InconsistentLeadingWhitespace{expected: "\t\t", found: "\t "},
   }
@@ -1762,7 +1929,7 @@ mod tests {
   }
 
   error! {
-    name: mixed_leading_whitespace,
+    name: mixed_leading_whitespace_recipe,
     input: "a:\n\t echo hello",
     offset: 3,
     line: 1,
@@ -1771,6 +1938,36 @@ mod tests {
     kind: MixedLeadingWhitespace{whitespace: "\t "},
   }
 
+  error! {
+    name: mixed_leading_whitespace_normal,
+    input: "a\n\t echo hello",
+    offset: 2,
+    line: 1,
+    column: 0,
+    width: 2,
+    kind: MixedLeadingWhitespace{whitespace: "\t "},
+  }
+
+  error! {
+    name: mixed_leading_whitespace_indent,
+    input: "a\n foo\n \tbar",
+    offset: 7,
+    line: 2,
+    column: 0,
+    width: 2,
+    kind: MixedLeadingWhitespace{whitespace: " \t"},
+  }
+
+  error! {
+    name: bad_dedent,
+    input: "a\n foo\n   bar\n  baz",
+    offset: 14,
+    line: 3,
+    column: 0,
+    width: 2,
+    kind: InconsistentLeadingWhitespace{expected: "   ", found: "  "},
+  }
+
   error! {
     name: unclosed_interpolation_delimiter,
     input: "a:\n echo {{ foo",

src/lib.rs

@@ -78,7 +78,6 @@ mod setting;
 mod settings;
 mod shebang;
 mod show_whitespace;
-mod state;
 mod string_literal;
 mod subcommand;
 mod table;

src/state.rs (deleted)

@@ -1,9 +0,0 @@
-use crate::common::*;
-
-#[derive(Copy, Clone, PartialEq, Debug)]
-pub(crate) enum State<'src> {
-  Normal,
-  Indented { indentation: &'src str },
-  Text,
-  Interpolation { interpolation_start: Token<'src> },
-}