// Snapshot 3a287b864a (Rust, 709 lines, 17 KiB).
// Notes attached to this snapshot:
// - Upgrade to rust 2018
// - Update dependencies
// - Use BTree{Map,Set} instead of Map and Set
use crate::common::*;
|
|
|
|
use CompilationErrorKind::*;
|
|
use TokenKind::*;
|
|
|
|
fn re(pattern: &str) -> Regex {
|
|
Regex::new(pattern).unwrap()
|
|
}
|
|
|
|
fn token(pattern: &str) -> Regex {
|
|
let mut s = String::new();
|
|
s += r"^(?m)([ \t]*)(";
|
|
s += pattern;
|
|
s += ")";
|
|
re(&s)
|
|
}
|
|
|
|
/// Returns `true` when `text` is neither made up solely of spaces nor solely
/// of tabs.
///
/// The empty string counts as uniform, so it yields `false`.
fn mixed_whitespace(text: &str) -> bool {
  let uniform = |wanted: char| text.chars().all(|c| c == wanted);
  !uniform(' ') && !uniform('\t')
}
|
|
|
|
pub struct Lexer<'a> {
|
|
tokens: Vec<Token<'a>>,
|
|
text: &'a str,
|
|
rest: &'a str,
|
|
index: usize,
|
|
column: usize,
|
|
line: usize,
|
|
state: Vec<State<'a>>,
|
|
}
|
|
|
|
/// Lexing mode; the lexer keeps a stack of these and consults the top entry.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State<'a> {
  /// Outside any indented block. This is the initial state.
  Start,
  /// Inside an indented block; holds the leading whitespace that opened it.
  Indent(&'a str),
  /// Inside the free-form text of an indented line.
  Text,
  /// Between `{{` and `}}` within the text of an indented line.
  Interpolation,
}
|
|
|
|
impl<'a> Lexer<'a> {
|
|
pub fn lex(text: &'a str) -> CompilationResult<Vec<Token<'a>>> {
|
|
let lexer = Lexer {
|
|
tokens: vec![],
|
|
rest: text,
|
|
index: 0,
|
|
line: 0,
|
|
column: 0,
|
|
state: vec![State::Start],
|
|
text,
|
|
};
|
|
|
|
lexer.inner()
|
|
}
|
|
|
|
fn error(&self, kind: CompilationErrorKind<'a>) -> CompilationError<'a> {
|
|
CompilationError {
|
|
text: self.text,
|
|
index: self.index,
|
|
line: self.line,
|
|
column: self.column,
|
|
width: None,
|
|
kind,
|
|
}
|
|
}
|
|
|
|
fn token(&self, prefix: &'a str, lexeme: &'a str, kind: TokenKind) -> Token<'a> {
|
|
Token {
|
|
index: self.index,
|
|
line: self.line,
|
|
column: self.column,
|
|
text: self.text,
|
|
prefix,
|
|
lexeme,
|
|
kind,
|
|
}
|
|
}
|
|
|
|
fn lex_indent(&mut self) -> CompilationResult<'a, Option<Token<'a>>> {
|
|
lazy_static! {
|
|
static ref INDENT: Regex = re(r"^([ \t]*)[^ \t\n\r]");
|
|
}
|
|
|
|
let indentation = INDENT
|
|
.captures(self.rest)
|
|
.map(|captures| captures.get(1).unwrap().as_str());
|
|
|
|
if self.column == 0 {
|
|
if let Some(kind) = match (self.state.last().unwrap(), indentation) {
|
|
// ignore: was no indentation and there still isn't
|
|
// or current line is blank
|
|
(&State::Start, Some("")) | (_, None) => None,
|
|
// indent: was no indentation, now there is
|
|
(&State::Start, Some(current)) => {
|
|
if mixed_whitespace(current) {
|
|
return Err(self.error(MixedLeadingWhitespace {
|
|
whitespace: current,
|
|
}));
|
|
}
|
|
//indent = Some(current);
|
|
self.state.push(State::Indent(current));
|
|
Some(Indent)
|
|
}
|
|
// dedent: there was indentation and now there isn't
|
|
(&State::Indent(_), Some("")) => {
|
|
// indent = None;
|
|
self.state.pop();
|
|
Some(Dedent)
|
|
}
|
|
// was indentation and still is, check if the new indentation matches
|
|
(&State::Indent(previous), Some(current)) => {
|
|
if !current.starts_with(previous) {
|
|
return Err(self.error(InconsistentLeadingWhitespace {
|
|
expected: previous,
|
|
found: current,
|
|
}));
|
|
}
|
|
None
|
|
}
|
|
// at column 0 in some other state: this should never happen
|
|
(&State::Text, _) | (&State::Interpolation, _) => {
|
|
return Err(self.error(Internal {
|
|
message: "unexpected state at column 0".to_string(),
|
|
}));
|
|
}
|
|
} {
|
|
return Ok(Some(self.token("", "", kind)));
|
|
}
|
|
}
|
|
Ok(None)
|
|
}
|
|
|
|
pub fn inner(mut self) -> CompilationResult<'a, Vec<Token<'a>>> {
|
|
lazy_static! {
|
|
static ref AT: Regex = token(r"@");
|
|
static ref BACKTICK: Regex = token(r"`[^`\n\r]*`");
|
|
static ref COLON: Regex = token(r":");
|
|
static ref COMMA: Regex = token(r",");
|
|
static ref COMMENT: Regex = token(r"#([^\n\r][^\n\r]*)?\r?$");
|
|
static ref EOF: Regex = token(r"\z");
|
|
static ref EOL: Regex = token(r"\n|\r\n");
|
|
static ref EQUALS: Regex = token(r"=");
|
|
static ref INTERPOLATION_END: Regex = token(r"[}][}]");
|
|
static ref INTERPOLATION_START_TOKEN: Regex = token(r"[{][{]");
|
|
static ref NAME: Regex = token(r"([a-zA-Z_][a-zA-Z0-9_-]*)");
|
|
static ref PAREN_L: Regex = token(r"[(]");
|
|
static ref PAREN_R: Regex = token(r"[)]");
|
|
static ref PLUS: Regex = token(r"[+]");
|
|
static ref RAW_STRING: Regex = token(r#"'[^']*'"#);
|
|
static ref STRING: Regex = token(r#"["]"#);
|
|
static ref UNTERMINATED_RAW_STRING: Regex = token(r#"'[^']*"#);
|
|
static ref INTERPOLATION_START: Regex = re(r"^[{][{]");
|
|
static ref LEADING_TEXT: Regex = re(r"^(?m)(.+?)[{][{]");
|
|
static ref LINE: Regex = re(r"^(?m)[ \t]+[^ \t\n\r].*$");
|
|
static ref TEXT: Regex = re(r"^(?m)(.+)");
|
|
}
|
|
|
|
loop {
|
|
if let Some(token) = self.lex_indent()? {
|
|
self.tokens.push(token);
|
|
}
|
|
|
|
// insert a dedent if we're indented and we hit the end of the file
|
|
if &State::Start != self.state.last().unwrap() && EOF.is_match(self.rest) {
|
|
let token = self.token("", "", Dedent);
|
|
self.tokens.push(token);
|
|
}
|
|
|
|
let (prefix, lexeme, kind) = if let (0, &State::Indent(indent), Some(captures)) = (
|
|
self.column,
|
|
self.state.last().unwrap(),
|
|
LINE.captures(self.rest),
|
|
) {
|
|
let line = captures.get(0).unwrap().as_str();
|
|
if !line.starts_with(indent) {
|
|
return Err(self.error(Internal {
|
|
message: "unexpected indent".to_string(),
|
|
}));
|
|
}
|
|
self.state.push(State::Text);
|
|
(&line[0..indent.len()], "", Line)
|
|
} else if let Some(captures) = EOF.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Eof,
|
|
)
|
|
} else if let State::Text = *self.state.last().unwrap() {
|
|
if let Some(captures) = INTERPOLATION_START.captures(self.rest) {
|
|
self.state.push(State::Interpolation);
|
|
("", captures.get(0).unwrap().as_str(), InterpolationStart)
|
|
} else if let Some(captures) = LEADING_TEXT.captures(self.rest) {
|
|
("", captures.get(1).unwrap().as_str(), Text)
|
|
} else if let Some(captures) = TEXT.captures(self.rest) {
|
|
("", captures.get(1).unwrap().as_str(), Text)
|
|
} else if let Some(captures) = EOL.captures(self.rest) {
|
|
self.state.pop();
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Eol,
|
|
)
|
|
} else {
|
|
return Err(self.error(Internal {
|
|
message: format!("Could not match token in text state: \"{}\"", self.rest),
|
|
}));
|
|
}
|
|
} else if let Some(captures) = INTERPOLATION_START_TOKEN.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
InterpolationStart,
|
|
)
|
|
} else if let Some(captures) = INTERPOLATION_END.captures(self.rest) {
|
|
if self.state.last().unwrap() == &State::Interpolation {
|
|
self.state.pop();
|
|
}
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
InterpolationEnd,
|
|
)
|
|
} else if let Some(captures) = NAME.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Name,
|
|
)
|
|
} else if let Some(captures) = EOL.captures(self.rest) {
|
|
if self.state.last().unwrap() == &State::Interpolation {
|
|
return Err(self.error(UnterminatedInterpolation));
|
|
}
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Eol,
|
|
)
|
|
} else if let Some(captures) = BACKTICK.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Backtick,
|
|
)
|
|
} else if let Some(captures) = COLON.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Colon,
|
|
)
|
|
} else if let Some(captures) = AT.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
At,
|
|
)
|
|
} else if let Some(captures) = COMMA.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Comma,
|
|
)
|
|
} else if let Some(captures) = PAREN_L.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
ParenL,
|
|
)
|
|
} else if let Some(captures) = PAREN_R.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
ParenR,
|
|
)
|
|
} else if let Some(captures) = PLUS.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Plus,
|
|
)
|
|
} else if let Some(captures) = EQUALS.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Equals,
|
|
)
|
|
} else if let Some(captures) = COMMENT.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
Comment,
|
|
)
|
|
} else if let Some(captures) = RAW_STRING.captures(self.rest) {
|
|
(
|
|
captures.get(1).unwrap().as_str(),
|
|
captures.get(2).unwrap().as_str(),
|
|
RawString,
|
|
)
|
|
} else if UNTERMINATED_RAW_STRING.is_match(self.rest) {
|
|
return Err(self.error(UnterminatedString));
|
|
} else if let Some(captures) = STRING.captures(self.rest) {
|
|
let prefix = captures.get(1).unwrap().as_str();
|
|
let contents = &self.rest[prefix.len() + 1..];
|
|
if contents.is_empty() {
|
|
return Err(self.error(UnterminatedString));
|
|
}
|
|
let mut len = 0;
|
|
let mut escape = false;
|
|
for c in contents.chars() {
|
|
if c == '\n' || c == '\r' {
|
|
return Err(self.error(UnterminatedString));
|
|
} else if !escape && c == '"' {
|
|
break;
|
|
} else if !escape && c == '\\' {
|
|
escape = true;
|
|
} else if escape {
|
|
escape = false;
|
|
}
|
|
len += c.len_utf8();
|
|
}
|
|
let start = prefix.len();
|
|
let content_end = start + len + 1;
|
|
if escape || content_end >= self.rest.len() {
|
|
return Err(self.error(UnterminatedString));
|
|
}
|
|
(prefix, &self.rest[start..=content_end], StringToken)
|
|
} else {
|
|
return Err(self.error(UnknownStartOfToken));
|
|
};
|
|
|
|
let token = self.token(prefix, lexeme, kind);
|
|
self.tokens.push(token);
|
|
|
|
let len = prefix.len() + lexeme.len();
|
|
|
|
if len == 0 {
|
|
let last = self.tokens.last().unwrap();
|
|
match last.kind {
|
|
Eof => {}
|
|
_ => {
|
|
return Err(last.error(Internal {
|
|
message: format!("zero length token: {:?}", last),
|
|
}));
|
|
}
|
|
}
|
|
}
|
|
|
|
match self.tokens.last().unwrap().kind {
|
|
Eol => {
|
|
self.line += 1;
|
|
self.column = 0;
|
|
}
|
|
Eof => {
|
|
break;
|
|
}
|
|
RawString => {
|
|
let lexeme_lines = lexeme.lines().count();
|
|
self.line += lexeme_lines - 1;
|
|
if lexeme_lines == 1 {
|
|
self.column += len;
|
|
} else {
|
|
self.column = lexeme.lines().last().unwrap().len();
|
|
}
|
|
}
|
|
_ => {
|
|
self.column += len;
|
|
}
|
|
}
|
|
|
|
self.rest = &self.rest[len..];
|
|
self.index += len;
|
|
}
|
|
|
|
Ok(self.tokens)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod test {
  use super::*;

  /// Defines a test that lexes `$input`, checks that the one-character-per-
  /// token summary (see `token_summary`) equals `$expected`, and checks that
  /// concatenating every token's prefix and lexeme reproduces the input.
  macro_rules! summary_test {
    ($name:ident, $input:expr, $expected:expr $(,)*) => {
      #[test]
      fn $name() {
        let input = $input;
        let expected = $expected;
        let tokens = crate::lexer::Lexer::lex(input).unwrap();
        let roundtrip: String = tokens
          .iter()
          .map(|token| format!("{}{}", token.prefix, token.lexeme))
          .collect();
        let actual = token_summary(&tokens);
        if actual != expected {
          panic!(
            "token summary mismatch:\nexpected: {}\ngot: {}\n",
            expected, actual
          );
        }
        assert_eq!(input, roundtrip);
      }
    };
  }

  /// Renders a token sequence as a string with one character per token.
  fn token_summary(tokens: &[Token]) -> String {
    tokens
      .iter()
      .map(|token| match token.kind {
        At => "@",
        Backtick => "`",
        Colon => ":",
        Comma => ",",
        Comment => "#",
        Dedent => "<",
        Eof => ".",
        Eol => "$",
        Equals => "=",
        Indent => ">",
        InterpolationEnd => "}",
        InterpolationStart => "{",
        Line => "^",
        Name => "N",
        ParenL => "(",
        ParenR => ")",
        Plus => "+",
        RawString => "'",
        StringToken => "\"",
        Text => "_",
      })
      .collect()
  }

  /// Defines a test that lexes `input` and requires it to fail with exactly
  /// the described error.
  macro_rules! error_test {
    (
      name: $name:ident,
      input: $input:expr,
      index: $index:expr,
      line: $line:expr,
      column: $column:expr,
      width: $width:expr,
      kind: $kind:expr,
    ) => {
      #[test]
      fn $name() {
        let input = $input;

        let expected = CompilationError {
          text: input,
          index: $index,
          line: $line,
          column: $column,
          width: $width,
          kind: $kind,
        };

        match Lexer::lex(input) {
          Err(error) => {
            // Field-by-field first, so a failure names the field that differs.
            assert_eq!(error.text, expected.text);
            assert_eq!(error.index, expected.index);
            assert_eq!(error.line, expected.line);
            assert_eq!(error.column, expected.column);
            assert_eq!(error.kind, expected.kind);
            assert_eq!(error, expected);
          }
          Ok(_) => panic!("tokenize succeeded but expected: {}\n{}", expected, input),
        }
      }
    };
  }

  // NOTE: leading whitespace is significant to the lexer, so every fixture
  // below spells its line breaks and indentation with explicit escapes
  // instead of relying on the layout of a multi-line literal.

  summary_test! {
    tokenize_strings,
    r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#,
    r#"N="+'+"+'#."#,
  }

  summary_test! {
    tokenize_recipe_interpolation_eol,
    "foo: # some comment\n  {{hello}}\n",
    "N:#$>^{N}$<.",
  }

  summary_test! {
    tokenize_recipe_interpolation_eof,
    "foo: # more comments\n  {{hello}}\n# another comment\n",
    "N:#$>^{N}$<#$.",
  }

  summary_test! {
    tokenize_recipe_complex_interpolation_expression,
    "foo: #lol\n {{a + b + \"z\" + blarg}}",
    "N:#$>^{N+N+\"+N}<.",
  }

  summary_test! {
    tokenize_recipe_multiple_interpolations,
    "foo:,#ok\n {{a}}0{{b}}1{{c}}",
    "N:,#$>^{N}_{N}_{N}<.",
  }

  summary_test! {
    tokenize_junk,
    "bob\n\nhello blah blah blah : a b c #whatever\n",
    "N$$NNNN:NNN#$.",
  }

  summary_test! {
    tokenize_empty_lines,
    "\n# this does something\nhello:\n  asdf\n  bsdf\n\n  csdf\n\n  dsdf # whatever\n\n# yolo\n",
    "$#$N:$>^_$^_$$^_$$^_$$<#$.",
  }

  summary_test! {
    tokenize_comment_before_variable,
    "\n#\nA='1'\necho:\n  echo {{A}}\n",
    "$#$N='$N:$>^_{N}$<.",
  }

  summary_test! {
    tokenize_interpolation_backticks,
    "hello:\n echo {{`echo hello` + `echo goodbye`}}",
    "N:$>^_{`+`}<.",
  }

  summary_test! {
    tokenize_assignment_backticks,
    "a = `echo hello` + `echo goodbye`",
    "N=`+`.",
  }

  summary_test! {
    tokenize_multiple,
    "\nhello:\n  a\n  b\n\n  c\n\n  d\n\n# hello\nbob:\n  frank\n",
    "$N:$>^_$^_$$^_$$^_$$<#$N:$>^_$<.",
  }

  summary_test! {
    tokenize_comment,
    "a:=#",
    "N:=#."
  }

  summary_test! {
    tokenize_comment_with_bang,
    "a:=#foo!",
    "N:=#."
  }

  summary_test! {
    tokenize_order,
    "\nb: a\n  @mv a b\n\na:\n  @touch F\n  @touch a\n\nd: c\n  @rm c\n\nc: b\n  @mv b c",
    "$N:N$>^_$$<N:$>^_$^_$$<N:N$>^_$$<N:N$>^_<.",
  }

  summary_test! {
    tokenize_parens,
    r"((())) )abc(+",
    "((())))N(+.",
  }

  summary_test! {
    crlf_newline,
    "#\r\n#asdf\r\n",
    "#$#$.",
  }

  error_test! {
    name: tokenize_space_then_tab,
    // Single-space indentation: the expected index (9) and `expected: " "`
    // both depend on it.
    input: "a:\n 0\n 1\n\t2\n",
    index: 9,
    line: 3,
    column: 0,
    width: None,
    kind: InconsistentLeadingWhitespace{expected: " ", found: "\t"},
  }

  error_test! {
    name: tokenize_tabs_then_tab_space,
    input: "a:\n\t\t0\n\t\t 1\n\t 2\n",
    index: 12,
    line: 3,
    column: 0,
    width: None,
    kind: InconsistentLeadingWhitespace{expected: "\t\t", found: "\t "},
  }

  error_test! {
    name: tokenize_unknown,
    input: "~",
    index: 0,
    line: 0,
    column: 0,
    width: None,
    kind: UnknownStartOfToken,
  }

  error_test! {
    name: unterminated_string,
    input: r#"a = ""#,
    index: 3,
    line: 0,
    column: 3,
    width: None,
    kind: UnterminatedString,
  }

  error_test! {
    name: unterminated_string_with_escapes,
    input: r#"a = "\n\t\r\"\\"#,
    index: 3,
    line: 0,
    column: 3,
    width: None,
    kind: UnterminatedString,
  }

  error_test! {
    name: unterminated_raw_string,
    input: "r a='asdf",
    index: 4,
    line: 0,
    column: 4,
    width: None,
    kind: UnterminatedString,
  }

  error_test! {
    name: unterminated_interpolation,
    input: "foo:\n echo {{\n",
    index: 13,
    line: 1,
    column: 8,
    width: None,
    kind: UnterminatedInterpolation,
  }

  error_test! {
    name: mixed_leading_whitespace,
    input: "a:\n\t echo hello",
    index: 3,
    line: 1,
    column: 0,
    width: None,
    kind: MixedLeadingWhitespace{whitespace: "\t "},
  }
}
|