Refactor Tokenizer (#260)
This commit is contained in:
parent afe2c0f94e
commit 861173581c

src/main.rs (11 changed lines)
@@ -33,11 +33,9 @@ mod recipe;
 mod recipe_resolver;
 mod run;
 mod runtime_error;
+mod scanner;
 mod shebang;
 mod token;
-mod tokenizer;
-
-use tokenizer::tokenize;

 mod common {
   pub use std::borrow::Cow;
@@ -70,18 +68,13 @@ mod common {
   pub use recipe::Recipe;
   pub use recipe_resolver::RecipeResolver;
   pub use runtime_error::{RuntimeError, RunResult};
+  pub use scanner::Scanner;
   pub use shebang::Shebang;
   pub use token::{Token, TokenKind};
 }

 use common::*;

-fn compile(text: &str) -> CompilationResult<Justfile> {
-  let tokens = tokenize(text)?;
-  let parser = Parser::new(text, tokens);
-  parser.justfile()
-}
-
 fn main() {
   run::run();
 }
src/parser.rs
@@ -14,6 +14,12 @@ pub struct Parser<'a> {
 }

 impl<'a> Parser<'a> {
+  pub fn parse(text: &'a str) -> CompilationResult<'a, Justfile> {
+    let tokens = Scanner::scan(text)?;
+    let parser = Parser::new(text, tokens);
+    parser.justfile()
+  }
+
   pub fn new(text: &'a str, tokens: Vec<Token<'a>>) -> Parser<'a> {
     Parser {
       text: text,
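
The new `Parser::parse` above bundles the scan and parse steps that the deleted free function `compile` in src/main.rs used to perform. A minimal standalone sketch of that shape — the `Token`, `Justfile`, and `Error` types and the `scan` function below are placeholders so the sketch compiles on its own; the real ones live in this crate:

#[derive(Debug)] struct Token;
#[derive(Debug)] struct Justfile;
#[derive(Debug)] struct Error;

struct Parser { tokens: Vec<Token> }

impl Parser {
    // Shape of the new entry point: scan, construct, parse in one call.
    fn parse(text: &str) -> Result<Justfile, Error> {
        let tokens = scan(text)?;       // stand-in for Scanner::scan(text)?
        let parser = Parser { tokens }; // stand-in for Parser::new(text, tokens)
        parser.justfile()
    }

    fn justfile(self) -> Result<Justfile, Error> {
        let _ = self.tokens; // the real implementation consumes the token stream
        Ok(Justfile)
    }
}

fn scan(_text: &str) -> Result<Vec<Token>, Error> { Ok(vec![Token]) }

fn main() {
    println!("{:?}", Parser::parse("a = 'b'"));
}
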
src/run.rs
@@ -2,7 +2,6 @@ use common::*;

 use std::{convert, ffi};
 use clap::{App, Arg, ArgGroup, AppSettings};
-use compile;
 use misc::maybe_s;
 use configuration::DEFAULT_SHELL;

@@ -232,7 +231,7 @@ pub fn run() {
       .unwrap_or_else(|error| die!("Error reading justfile: {}", error));
   }

-  let justfile = compile(&text).unwrap_or_else(|error|
+  let justfile = Parser::parse(&text).unwrap_or_else(|error|
     if color.stderr().active() {
       die!("{:#}", error);
     } else {
src/scanner.rs (new file, 600 lines)
@@ -0,0 +1,600 @@
+use common::*;
+
+use TokenKind::*;
+use CompilationErrorKind::*;
+
+fn re(pattern: &str) -> Regex {
+  Regex::new(pattern).unwrap()
+}
+
+fn token(pattern: &str) -> Regex {
+  let mut s = String::new();
+  s += r"^(?m)([ \t]*)(";
+  s += pattern;
+  s += ")";
+  re(&s)
+}
+
+fn mixed_whitespace(text: &str) -> bool {
+  !(text.chars().all(|c| c == ' ') || text.chars().all(|c| c == '\t'))
+}
+
+pub struct Scanner<'a> {
+  tokens: Vec<Token<'a>>,
+  text: &'a str,
+  rest: &'a str,
+  index: usize,
+  column: usize,
+  line: usize,
+  state: Vec<State<'a>>,
+}
+
+#[derive(PartialEq)]
+enum State<'a> {
+  Start,
+  Indent(&'a str),
+  Text,
+  Interpolation,
+}
+
+impl<'a> Scanner<'a> {
+  pub fn scan(text: &'a str) -> CompilationResult<Vec<Token<'a>>> {
+    let scanner = Scanner{
+      tokens: vec![],
+      text: text,
+      rest: text,
+      index: 0,
+      line: 0,
+      column: 0,
+      state: vec![State::Start],
+    };
+
+    scanner.inner()
+  }
+
+  fn error(&self, kind: CompilationErrorKind<'a>) -> CompilationError<'a> {
+    CompilationError {
+      text: self.text,
+      index: self.index,
+      line: self.line,
+      column: self.column,
+      width: None,
+      kind: kind,
+    }
+  }
+
+  fn token(&self, prefix: &'a str, lexeme: &'a str, kind: TokenKind) -> Token<'a> {
+    Token {
+      index: self.index,
+      line: self.line,
+      column: self.column,
+      text: self.text,
+      prefix: prefix,
+      lexeme: lexeme,
+      kind: kind,
+    }
+  }
+
+  fn scan_indent(&mut self) -> CompilationResult<'a, Option<Token<'a>>> {
+    lazy_static! {
+      static ref INDENT: Regex = re(r"^([ \t]*)[^ \t\n\r]");
+    }
+
+    let indentation = INDENT.captures(self.rest).map(|captures| captures.get(1).unwrap().as_str());
+
+    if self.column == 0 {
+      if let Some(kind) = match (self.state.last().unwrap(), indentation) {
+        // ignore: was no indentation and there still isn't
+        // or current line is blank
+        (&State::Start, Some("")) | (_, None) => {
+          None
+        }
+        // indent: was no indentation, now there is
+        (&State::Start, Some(current)) => {
+          if mixed_whitespace(current) {
+            return Err(self.error(MixedLeadingWhitespace{whitespace: current}));
+          }
+          //indent = Some(current);
+          self.state.push(State::Indent(current));
+          Some(Indent)
+        }
+        // dedent: there was indentation and now there isn't
+        (&State::Indent(_), Some("")) => {
+          // indent = None;
+          self.state.pop();
+          Some(Dedent)
+        }
+        // was indentation and still is, check if the new indentation matches
+        (&State::Indent(previous), Some(current)) => {
+          if !current.starts_with(previous) {
+            return Err(self.error(InconsistentLeadingWhitespace{
+              expected: previous,
+              found: current
+            }));
+          }
+          None
+        }
+        // at column 0 in some other state: this should never happen
+        (&State::Text, _) | (&State::Interpolation, _) => {
+          return Err(self.error(Internal {
+            message: "unexpected state at column 0".to_string()
+          }));
+        }
+      } {
+        return Ok(Some(self.token("", "", kind)));
+      }
+    }
+    Ok(None)
+  }
+
+  pub fn inner(mut self) -> CompilationResult<'a, Vec<Token<'a>>> {
+    lazy_static! {
+      static ref BACKTICK: Regex = token(r"`[^`\n\r]*`");
+      static ref COLON: Regex = token(r":");
+      static ref AT: Regex = token(r"@");
+      static ref COMMENT: Regex = token(r"#([^!\n\r].*)?$");
+      static ref EOF: Regex = token(r"(?-m)$");
+      static ref EOL: Regex = token(r"\n|\r\n");
+      static ref EQUALS: Regex = token(r"=");
+      static ref INTERPOLATION_END: Regex = token(r"[}][}]");
+      static ref INTERPOLATION_START_TOKEN: Regex = token(r"[{][{]");
+      static ref NAME: Regex = token(r"([a-zA-Z_][a-zA-Z0-9_-]*)");
+      static ref PLUS: Regex = token(r"[+]");
+      static ref STRING: Regex = token("\"");
+      static ref RAW_STRING: Regex = token(r#"'[^']*'"#);
+      static ref UNTERMINATED_RAW_STRING: Regex = token(r#"'[^']*"#);
+      static ref INTERPOLATION_START: Regex = re(r"^[{][{]");
+      static ref LEADING_TEXT: Regex = re(r"^(?m)(.+?)[{][{]");
+      static ref LINE: Regex = re(r"^(?m)[ \t]+[^ \t\n\r].*$");
+      static ref TEXT: Regex = re(r"^(?m)(.+)");
+    }
+
+    loop {
+      if let Some(token) = self.scan_indent()? {
+        self.tokens.push(token);
+      }
+
+      // insert a dedent if we're indented and we hit the end of the file
+      if &State::Start != self.state.last().unwrap() && EOF.is_match(self.rest) {
+        let token = self.token("", "", Dedent);
+        self.tokens.push(token);
+      }
+
+      let (prefix, lexeme, kind) =
+        if let (0, &State::Indent(indent), Some(captures)) =
+          (self.column, self.state.last().unwrap(), LINE.captures(self.rest)) {
+          let line = captures.get(0).unwrap().as_str();
+          if !line.starts_with(indent) {
+            return Err(self.error(Internal{message: "unexpected indent".to_string()}));
+          }
+          self.state.push(State::Text);
+          (&line[0..indent.len()], "", Line)
+        } else if let Some(captures) = EOF.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eof)
+        } else if let State::Text = *self.state.last().unwrap() {
+          if let Some(captures) = INTERPOLATION_START.captures(self.rest) {
+            self.state.push(State::Interpolation);
+            ("", captures.get(0).unwrap().as_str(), InterpolationStart)
+          } else if let Some(captures) = LEADING_TEXT.captures(self.rest) {
+            ("", captures.get(1).unwrap().as_str(), Text)
+          } else if let Some(captures) = TEXT.captures(self.rest) {
+            ("", captures.get(1).unwrap().as_str(), Text)
+          } else if let Some(captures) = EOL.captures(self.rest) {
+            self.state.pop();
+            (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol)
+          } else {
+            return Err(self.error(Internal {
+              message: format!("Could not match token in text state: \"{}\"", self.rest)
+            }));
+          }
+        } else if let Some(captures) = INTERPOLATION_START_TOKEN.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationStart)
+        } else if let Some(captures) = INTERPOLATION_END.captures(self.rest) {
+          if self.state.last().unwrap() == &State::Interpolation {
+            self.state.pop();
+          }
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationEnd)
+        } else if let Some(captures) = NAME.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Name)
+        } else if let Some(captures) = EOL.captures(self.rest) {
+          if self.state.last().unwrap() == &State::Interpolation {
+            return Err(self.error(Internal {
+              message: "hit EOL while still in interpolation state".to_string()
+            }));
+          }
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol)
+        } else if let Some(captures) = BACKTICK.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Backtick)
+        } else if let Some(captures) = COLON.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Colon)
+        } else if let Some(captures) = AT.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), At)
+        } else if let Some(captures) = PLUS.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Plus)
+        } else if let Some(captures) = EQUALS.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Equals)
+        } else if let Some(captures) = COMMENT.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Comment)
+        } else if let Some(captures) = RAW_STRING.captures(self.rest) {
+          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), RawString)
+        } else if UNTERMINATED_RAW_STRING.is_match(self.rest) {
+          return Err(self.error(UnterminatedString));
+        } else if let Some(captures) = STRING.captures(self.rest) {
+          let prefix = captures.get(1).unwrap().as_str();
+          let contents = &self.rest[prefix.len()+1..];
+          if contents.is_empty() {
+            return Err(self.error(UnterminatedString));
+          }
+          let mut len = 0;
+          let mut escape = false;
+          for c in contents.chars() {
+            if c == '\n' || c == '\r' {
+              return Err(self.error(UnterminatedString));
+            } else if !escape && c == '"' {
+              break;
+            } else if !escape && c == '\\' {
+              escape = true;
+            } else if escape {
+              escape = false;
+            }
+            len += c.len_utf8();
+          }
+          let start = prefix.len();
+          let content_end = start + len + 1;
+          if escape || content_end >= self.rest.len() {
+            return Err(self.error(UnterminatedString));
+          }
+          (prefix, &self.rest[start..content_end + 1], StringToken)
+        } else if self.rest.starts_with("#!") {
+          return Err(self.error(OuterShebang));
+        } else {
+          return Err(self.error(UnknownStartOfToken));
+        };
+
+      let token = self.token(prefix, lexeme, kind);
+      self.tokens.push(token);
+
+      let len = prefix.len() + lexeme.len();
+
+      if len == 0 {
+        let last = self.tokens.last().unwrap();
+        match last.kind {
+          Eof => {},
+          _ => return Err(last.error(Internal {
+            message: format!("zero length token: {:?}", last)
+          })),
+        }
+      }
+
+      match self.tokens.last().unwrap().kind {
+        Eol => {
+          self.line += 1;
+          self.column = 0;
+        }
+        Eof => {
+          break;
+        }
+        RawString => {
+          let lexeme_lines = lexeme.lines().count();
+          self.line += lexeme_lines - 1;
+          if lexeme_lines == 1 {
+            self.column += len;
+          } else {
+            self.column = lexeme.lines().last().unwrap().len();
+          }
+        }
+        _ => {
+          self.column += len;
+        }
+      }
+
+      self.rest = &self.rest[len..];
+      self.index += len;
+    }
+
+    Ok(self.tokens)
+  }
+}
+
+#[cfg(test)]
+mod test {
+  use super::*;
+
+  macro_rules! summary_test {
+    ($name:ident, $input:expr, $expected:expr $(,)*) => {
+      #[test]
+      fn $name() {
+        let input = $input;
+        let expected = $expected;
+        let tokens = ::Scanner::scan(input).unwrap();
+        let roundtrip = tokens.iter().map(|t| {
+          let mut s = String::new();
+          s += t.prefix;
+          s += t.lexeme;
+          s
+        }).collect::<Vec<_>>().join("");
+        let actual = token_summary(&tokens);
+        if actual != expected {
+          panic!("token summary mismatch:\nexpected: {}\ngot: {}\n", expected, actual);
+        }
+        assert_eq!(input, roundtrip);
+      }
+    }
+  }
+
+  fn token_summary(tokens: &[Token]) -> String {
+    tokens.iter().map(|t| {
+      match t.kind {
+        At => "@",
+        Backtick => "`",
+        Colon => ":",
+        Comment{..} => "#",
+        Dedent => "<",
+        Eof => ".",
+        Eol => "$",
+        Equals => "=",
+        Indent{..} => ">",
+        InterpolationEnd => "}",
+        InterpolationStart => "{",
+        Line{..} => "^",
+        Name => "N",
+        Plus => "+",
+        RawString => "'",
+        StringToken => "\"",
+        Text => "_",
+      }
+    }).collect::<Vec<_>>().join("")
+  }
+
+  macro_rules! error_test {
+    (
+      name: $name:ident,
+      input: $input:expr,
+      index: $index:expr,
+      line: $line:expr,
+      column: $column:expr,
+      width: $width:expr,
+      kind: $kind:expr,
+    ) => {
+      #[test]
+      fn $name() {
+        let input = $input;
+
+        let expected = CompilationError {
+          text: input,
+          index: $index,
+          line: $line,
+          column: $column,
+          width: $width,
+          kind: $kind,
+        };
+
+        if let Err(error) = Scanner::scan(input) {
+          assert_eq!(error.text, expected.text);
+          assert_eq!(error.index, expected.index);
+          assert_eq!(error.line, expected.line);
+          assert_eq!(error.column, expected.column);
+          assert_eq!(error.kind, expected.kind);
+          assert_eq!(error, expected);
+        } else {
+          panic!("tokenize succeeded but expected: {}\n{}", expected, input);
+        }
+      }
+    }
+  }
+
+  summary_test! {
+    tokenize_strings,
+    r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#,
+    r#"N="+'+"+'#."#,
+  }
+
+  summary_test! {
+    tokenize_recipe_interpolation_eol,
+    "foo: # some comment
+ {{hello}}
+",
+    "N:#$>^{N}$<.",
+  }
+
+  summary_test! {
+    tokenize_recipe_interpolation_eof,
+    "foo: # more comments
+ {{hello}}
+# another comment
+",
+    "N:#$>^{N}$<#$.",
+  }
+
+  summary_test! {
+    tokenize_recipe_complex_interpolation_expression,
+    "foo: #lol\n {{a + b + \"z\" + blarg}}",
+    "N:#$>^{N+N+\"+N}<.",
+  }
+
+  summary_test! {
+    tokenize_recipe_multiple_interpolations,
+    "foo:#ok\n {{a}}0{{b}}1{{c}}",
+    "N:#$>^{N}_{N}_{N}<.",
+  }
+
+  summary_test! {
+    tokenize_junk,
+    "bob
+
+hello blah blah blah : a b c #whatever
+",
+    "N$$NNNN:NNN#$.",
+  }
+
+  summary_test! {
+    tokenize_empty_lines,
+    "
+# this does something
+hello:
+  asdf
+  bsdf
+
+  csdf
+
+  dsdf # whatever
+
+# yolo
+",
+    "$#$N:$>^_$^_$$^_$$^_$$<#$.",
+  }
+
+  summary_test! {
+    tokenize_comment_before_variable,
+    "
+#
+A='1'
+echo:
+  echo {{A}}
+",
+    "$#$N='$N:$>^_{N}$<.",
+  }
+
+  summary_test! {
+    tokenize_interpolation_backticks,
+    "hello:\n echo {{`echo hello` + `echo goodbye`}}",
+    "N:$>^_{`+`}<.",
+  }
+
+  summary_test! {
+    tokenize_assignment_backticks,
+    "a = `echo hello` + `echo goodbye`",
+    "N=`+`.",
+  }
+
+  summary_test! {
+    tokenize_multiple,
+    "
+hello:
+  a
+  b
+
+  c
+
+  d
+
+# hello
+bob:
+  frank
+",
+
+    "$N:$>^_$^_$$^_$$^_$$<#$N:$>^_$<.",
+  }
+
+  summary_test! {
+    tokenize_comment,
+    "a:=#",
+    "N:=#."
+  }
+
+  summary_test! {
+    tokenize_order,
+    r"
+b: a
+  @mv a b
+
+a:
+  @touch F
+  @touch a
+
+d: c
+  @rm c
+
+c: b
+  @mv b c",
+    "$N:N$>^_$$<N:$>^_$^_$$<N:N$>^_$$<N:N$>^_<.",
+  }
+
+  error_test! {
+    name: tokenize_space_then_tab,
+    input: "a:
+ 0
+ 1
+\t2
+",
+    index: 9,
+    line: 3,
+    column: 0,
+    width: None,
+    kind: InconsistentLeadingWhitespace{expected: " ", found: "\t"},
+  }
+
+  error_test! {
+    name: tokenize_tabs_then_tab_space,
+    input: "a:
+\t\t0
+\t\t 1
+\t 2
+",
+    index: 12,
+    line: 3,
+    column: 0,
+    width: None,
+    kind: InconsistentLeadingWhitespace{expected: "\t\t", found: "\t "},
+  }
+
+  error_test! {
+    name: tokenize_outer_shebang,
+    input: "#!/usr/bin/env bash",
+    index: 0,
+    line: 0,
+    column: 0,
+    width: None,
+    kind: OuterShebang,
+  }
+
+  error_test! {
+    name: tokenize_unknown,
+    input: "~",
+    index: 0,
+    line: 0,
+    column: 0,
+    width: None,
+    kind: UnknownStartOfToken,
+  }
+
+  error_test! {
+    name: unterminated_string,
+    input: r#"a = ""#,
+    index: 3,
+    line: 0,
+    column: 3,
+    width: None,
+    kind: UnterminatedString,
+  }
+
+  error_test! {
+    name: unterminated_string_with_escapes,
+    input: r#"a = "\n\t\r\"\\"#,
+    index: 3,
+    line: 0,
+    column: 3,
+    width: None,
+    kind: UnterminatedString,
+  }
+
+  error_test! {
+    name: unterminated_raw_string,
+    input: "r a='asdf",
+    index: 4,
+    line: 0,
+    column: 4,
+    width: None,
+    kind: UnterminatedString,
+  }
+
+  error_test! {
+    name: mixed_leading_whitespace,
+    input: "a:\n\t echo hello",
+    index: 3,
+    line: 1,
+    column: 0,
+    width: None,
+    kind: MixedLeadingWhitespace{whitespace: "\t "},
+  }
+}
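
Every token regex in scanner.rs above is built by the `token()` helper, which anchors the pattern as `^(?m)([ \t]*)(...)`: group 1 captures the whitespace prefix and group 2 the lexeme, which the scan loop reads back with `captures.get(1)` and `captures.get(2)`. A standalone illustration of that capture layout (assumes the `regex` crate, which the `Regex` type in the diff suggests):

use regex::Regex;

// Mirror of fn token() in src/scanner.rs: wrap a pattern so that
// capture 1 is the leading whitespace and capture 2 the lexeme.
fn token(pattern: &str) -> Regex {
    Regex::new(&format!(r"^(?m)([ \t]*)({})", pattern)).unwrap()
}

fn main() {
    let colon = token(r":");
    let captures = colon.captures("  : b c").unwrap();
    assert_eq!(captures.get(1).unwrap().as_str(), "  "); // prefix
    assert_eq!(captures.get(2).unwrap().as_str(), ":");  // lexeme

    // The scanner advances by prefix.len() + lexeme.len(), so every byte
    // of input lands in exactly one token -- this is what the summary
    // tests' roundtrip check relies on.
    let len = captures.get(1).unwrap().as_str().len()
        + captures.get(2).unwrap().as_str().len();
    println!("prefix + lexeme covers {} bytes", len); // 3
}
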
src/testing.rs
@@ -1,9 +1,7 @@
 use common::*;

-use compile;
-
 pub fn parse_success(text: &str) -> Justfile {
-  match compile(text) {
+  match Parser::parse(text) {
     Ok(justfile) => justfile,
     Err(error) => panic!("Expected successful parse but got error:\n{}", error),
   }
@@ -32,7 +30,7 @@ macro_rules! compilation_error_test {
       kind: $kind,
     };

-    let tokens = ::tokenizer::tokenize(input).unwrap();
+    let tokens = ::Scanner::scan(input).unwrap();
    let parser = ::Parser::new(input, tokens);

    if let Err(error) = parser.justfile() {
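
The core behavioral unit the scanner keeps from the old tokenizer is the indentation state machine: at column 0, leading whitespace is compared against the state stack (`State::Start` vs. `State::Indent`) and Indent/Dedent tokens are emitted, with a trailing Dedent at end of input. A simplified, dependency-free sketch of the same idea — the real `scan_indent` additionally rejects mixed and inconsistent whitespace:

#[derive(Debug, PartialEq)]
enum Marker { Indent, Dedent }

// Returns (line number, marker) pairs. Blank lines are skipped, matching
// the (_, None) arm of scan_indent.
fn markers(lines: &[&str]) -> Vec<(usize, Marker)> {
    let mut indented = false; // mirrors State::Start vs. State::Indent
    let mut out = Vec::new();
    for (n, line) in lines.iter().enumerate() {
        if line.trim().is_empty() {
            continue; // blank lines never change indentation state
        }
        let ws_len = line.len() - line.trim_start_matches(|c| c == ' ' || c == '\t').len();
        match (indented, ws_len > 0) {
            (false, true) => { indented = true;  out.push((n, Marker::Indent)); }
            (true, false) => { indented = false; out.push((n, Marker::Dedent)); }
            _ => {}
        }
    }
    // like the scan loop above: emit a dedent if still indented at EOF
    if indented {
        out.push((lines.len(), Marker::Dedent));
    }
    out
}

fn main() {
    let src = ["hello:", "  a", "  b", "", "bob:", "  frank"];
    println!("{:?}", markers(&src));
    // [(1, Indent), (4, Dedent), (5, Indent), (6, Dedent)]
}
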
src/tokenizer.rs (deleted, 585 lines)
@@ -1,585 +0,0 @@
-use common::*;
-
-use TokenKind::*;
-use CompilationErrorKind::*;
-
-fn re(pattern: &str) -> Regex {
-  Regex::new(pattern).unwrap()
-}
-
-fn token(pattern: &str) -> Regex {
-  let mut s = String::new();
-  s += r"^(?m)([ \t]*)(";
-  s += pattern;
-  s += ")";
-  re(&s)
-}
-
-fn mixed_whitespace(text: &str) -> bool {
-  !(text.chars().all(|c| c == ' ') || text.chars().all(|c| c == '\t'))
-}
-
-pub fn tokenize(text: &str) -> CompilationResult<Vec<Token>> {
-  lazy_static! {
-    static ref BACKTICK: Regex = token(r"`[^`\n\r]*`");
-    static ref COLON: Regex = token(r":");
-    static ref AT: Regex = token(r"@");
-    static ref COMMENT: Regex = token(r"#([^!\n\r].*)?$");
-    static ref EOF: Regex = token(r"(?-m)$");
-    static ref EOL: Regex = token(r"\n|\r\n");
-    static ref EQUALS: Regex = token(r"=");
-    static ref INTERPOLATION_END: Regex = token(r"[}][}]");
-    static ref INTERPOLATION_START_TOKEN: Regex = token(r"[{][{]");
-    static ref NAME: Regex = token(r"([a-zA-Z_][a-zA-Z0-9_-]*)");
-    static ref PLUS: Regex = token(r"[+]");
-    static ref STRING: Regex = token("\"");
-    static ref RAW_STRING: Regex = token(r#"'[^']*'"#);
-    static ref UNTERMINATED_RAW_STRING: Regex = token(r#"'[^']*"#);
-    static ref INDENT: Regex = re(r"^([ \t]*)[^ \t\n\r]");
-    static ref INTERPOLATION_START: Regex = re(r"^[{][{]");
-    static ref LEADING_TEXT: Regex = re(r"^(?m)(.+?)[{][{]");
-    static ref LINE: Regex = re(r"^(?m)[ \t]+[^ \t\n\r].*$");
-    static ref TEXT: Regex = re(r"^(?m)(.+)");
-  }
-
-  #[derive(PartialEq)]
-  enum State<'a> {
-    Start,
-    Indent(&'a str),
-    Text,
-    Interpolation,
-  }
-
-  fn indentation(text: &str) -> Option<&str> {
-    INDENT.captures(text).map(|captures| captures.get(1).unwrap().as_str())
-  }
-
-  let mut tokens = vec![];
-  let mut rest = text;
-  let mut index = 0;
-  let mut line = 0;
-  let mut column = 0;
-  let mut state = vec![State::Start];
-
-  macro_rules! error {
-    ($kind:expr) => {{
-      Err(CompilationError {
-        text: text,
-        index: index,
-        line: line,
-        column: column,
-        width: None,
-        kind: $kind,
-      })
-    }};
-  }
-
-  loop {
-    if column == 0 {
-      if let Some(kind) = match (state.last().unwrap(), indentation(rest)) {
-        // ignore: was no indentation and there still isn't
-        // or current line is blank
-        (&State::Start, Some("")) | (_, None) => {
-          None
-        }
-        // indent: was no indentation, now there is
-        (&State::Start, Some(current)) => {
-          if mixed_whitespace(current) {
-            return error!(MixedLeadingWhitespace{whitespace: current})
-          }
-          //indent = Some(current);
-          state.push(State::Indent(current));
-          Some(Indent)
-        }
-        // dedent: there was indentation and now there isn't
-        (&State::Indent(_), Some("")) => {
-          // indent = None;
-          state.pop();
-          Some(Dedent)
-        }
-        // was indentation and still is, check if the new indentation matches
-        (&State::Indent(previous), Some(current)) => {
-          if !current.starts_with(previous) {
-            return error!(InconsistentLeadingWhitespace{
-              expected: previous,
-              found: current
-            });
-          }
-          None
-        }
-        // at column 0 in some other state: this should never happen
-        (&State::Text, _) | (&State::Interpolation, _) => {
-          return error!(Internal {
-            message: "unexpected state at column 0".to_string()
-          });
-        }
-      } {
-        tokens.push(Token {
-          index: index,
-          line: line,
-          column: column,
-          text: text,
-          prefix: "",
-          lexeme: "",
-          kind: kind,
-        });
-      }
-    }
-
-    // insert a dedent if we're indented and we hit the end of the file
-    if &State::Start != state.last().unwrap() && EOF.is_match(rest) {
-      tokens.push(Token {
-        index: index,
-        line: line,
-        column: column,
-        text: text,
-        prefix: "",
-        lexeme: "",
-        kind: Dedent,
-      });
-    }
-
-    let (prefix, lexeme, kind) =
-      if let (0, &State::Indent(indent), Some(captures)) =
-        (column, state.last().unwrap(), LINE.captures(rest)) {
-        let line = captures.get(0).unwrap().as_str();
-        if !line.starts_with(indent) {
-          return error!(Internal{message: "unexpected indent".to_string()});
-        }
-        state.push(State::Text);
-        (&line[0..indent.len()], "", Line)
-      } else if let Some(captures) = EOF.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eof)
-      } else if let State::Text = *state.last().unwrap() {
-        if let Some(captures) = INTERPOLATION_START.captures(rest) {
-          state.push(State::Interpolation);
-          ("", captures.get(0).unwrap().as_str(), InterpolationStart)
-        } else if let Some(captures) = LEADING_TEXT.captures(rest) {
-          ("", captures.get(1).unwrap().as_str(), Text)
-        } else if let Some(captures) = TEXT.captures(rest) {
-          ("", captures.get(1).unwrap().as_str(), Text)
-        } else if let Some(captures) = EOL.captures(rest) {
-          state.pop();
-          (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol)
-        } else {
-          return error!(Internal {
-            message: format!("Could not match token in text state: \"{}\"", rest)
-          });
-        }
-      } else if let Some(captures) = INTERPOLATION_START_TOKEN.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationStart)
-      } else if let Some(captures) = INTERPOLATION_END.captures(rest) {
-        if state.last().unwrap() == &State::Interpolation {
-          state.pop();
-        }
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), InterpolationEnd)
-      } else if let Some(captures) = NAME.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Name)
-      } else if let Some(captures) = EOL.captures(rest) {
-        if state.last().unwrap() == &State::Interpolation {
-          return error!(Internal {
-            message: "hit EOL while still in interpolation state".to_string()
-          });
-        }
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Eol)
-      } else if let Some(captures) = BACKTICK.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Backtick)
-      } else if let Some(captures) = COLON.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Colon)
-      } else if let Some(captures) = AT.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), At)
-      } else if let Some(captures) = PLUS.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Plus)
-      } else if let Some(captures) = EQUALS.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Equals)
-      } else if let Some(captures) = COMMENT.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), Comment)
-      } else if let Some(captures) = RAW_STRING.captures(rest) {
-        (captures.get(1).unwrap().as_str(), captures.get(2).unwrap().as_str(), RawString)
-      } else if UNTERMINATED_RAW_STRING.is_match(rest) {
-        return error!(UnterminatedString);
-      } else if let Some(captures) = STRING.captures(rest) {
-        let prefix = captures.get(1).unwrap().as_str();
-        let contents = &rest[prefix.len()+1..];
-        if contents.is_empty() {
-          return error!(UnterminatedString);
-        }
-        let mut len = 0;
-        let mut escape = false;
-        for c in contents.chars() {
-          if c == '\n' || c == '\r' {
-            return error!(UnterminatedString);
-          } else if !escape && c == '"' {
-            break;
-          } else if !escape && c == '\\' {
-            escape = true;
-          } else if escape {
-            escape = false;
-          }
-          len += c.len_utf8();
-        }
-        let start = prefix.len();
-        let content_end = start + len + 1;
-        if escape || content_end >= rest.len() {
-          return error!(UnterminatedString);
-        }
-        (prefix, &rest[start..content_end + 1], StringToken)
-      } else if rest.starts_with("#!") {
-        return error!(OuterShebang)
-      } else {
-        return error!(UnknownStartOfToken)
-      };
-
-    tokens.push(Token {
-      index: index,
-      line: line,
-      column: column,
-      prefix: prefix,
-      text: text,
-      lexeme: lexeme,
-      kind: kind,
-    });
-
-    let len = prefix.len() + lexeme.len();
-
-    if len == 0 {
-      let last = tokens.last().unwrap();
-      match last.kind {
-        Eof => {},
-        _ => return Err(last.error(Internal {
-          message: format!("zero length token: {:?}", last)
-        })),
-      }
-    }
-
-    match tokens.last().unwrap().kind {
-      Eol => {
-        line += 1;
-        column = 0;
-      }
-      Eof => {
-        break;
-      }
-      RawString => {
-        let lexeme_lines = lexeme.lines().count();
-        line += lexeme_lines - 1;
-        if lexeme_lines == 1 {
-          column += len;
-        } else {
-          column = lexeme.lines().last().unwrap().len();
-        }
-      }
-      _ => {
-        column += len;
-      }
-    }
-
-    rest = &rest[len..];
-    index += len;
-  }
-
-  Ok(tokens)
-}
-
-#[cfg(test)]
-mod test {
-  use super::*;
-
-  macro_rules! summary_test {
-    ($name:ident, $input:expr, $expected:expr $(,)*) => {
-      #[test]
-      fn $name() {
-        let input = $input;
-        let expected = $expected;
-        let tokens = tokenize(input).unwrap();
-        let roundtrip = tokens.iter().map(|t| {
-          let mut s = String::new();
-          s += t.prefix;
-          s += t.lexeme;
-          s
-        }).collect::<Vec<_>>().join("");
-        let actual = token_summary(&tokens);
-        if actual != expected {
-          panic!("token summary mismatch:\nexpected: {}\ngot: {}\n", expected, actual);
-        }
-        assert_eq!(input, roundtrip);
-      }
-    }
-  }
-
-  fn token_summary(tokens: &[Token]) -> String {
-    tokens.iter().map(|t| {
-      match t.kind {
-        At => "@",
-        Backtick => "`",
-        Colon => ":",
-        Comment{..} => "#",
-        Dedent => "<",
-        Eof => ".",
-        Eol => "$",
-        Equals => "=",
-        Indent{..} => ">",
-        InterpolationEnd => "}",
-        InterpolationStart => "{",
-        Line{..} => "^",
-        Name => "N",
-        Plus => "+",
-        RawString => "'",
-        StringToken => "\"",
-        Text => "_",
-      }
-    }).collect::<Vec<_>>().join("")
-  }
-
-  macro_rules! error_test {
-    (
-      name: $name:ident,
-      input: $input:expr,
-      index: $index:expr,
-      line: $line:expr,
-      column: $column:expr,
-      width: $width:expr,
-      kind: $kind:expr,
-    ) => {
-      #[test]
-      fn $name() {
-        let input = $input;
-
-        let expected = CompilationError {
-          text: input,
-          index: $index,
-          line: $line,
-          column: $column,
-          width: $width,
-          kind: $kind,
-        };
-
-        if let Err(error) = tokenize(input) {
-          assert_eq!(error.text, expected.text);
-          assert_eq!(error.index, expected.index);
-          assert_eq!(error.line, expected.line);
-          assert_eq!(error.column, expected.column);
-          assert_eq!(error.kind, expected.kind);
-          assert_eq!(error, expected);
-        } else {
-          panic!("tokenize() succeeded but expected: {}\n{}", expected, input);
-        }
-      }
-    }
-  }
-
-  summary_test! {
-    tokenize_strings,
-    r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#,
-    r#"N="+'+"+'#."#,
-  }
-
-  summary_test! {
-    tokenize_recipe_interpolation_eol,
-    "foo: # some comment
- {{hello}}
-",
-    "N:#$>^{N}$<.",
-  }
-
-  summary_test! {
-    tokenize_recipe_interpolation_eof,
-    "foo: # more comments
- {{hello}}
-# another comment
-",
-    "N:#$>^{N}$<#$.",
-  }
-
-  summary_test! {
-    tokenize_recipe_complex_interpolation_expression,
-    "foo: #lol\n {{a + b + \"z\" + blarg}}",
-    "N:#$>^{N+N+\"+N}<.",
-  }
-
-  summary_test! {
-    tokenize_recipe_multiple_interpolations,
-    "foo:#ok\n {{a}}0{{b}}1{{c}}",
-    "N:#$>^{N}_{N}_{N}<.",
-  }
-
-  summary_test! {
-    tokenize_junk,
-    "bob
-
-hello blah blah blah : a b c #whatever
-",
-    "N$$NNNN:NNN#$.",
-  }
-
-  summary_test! {
-    tokenize_empty_lines,
-    "
-# this does something
-hello:
-  asdf
-  bsdf
-
-  csdf
-
-  dsdf # whatever
-
-# yolo
-",
-    "$#$N:$>^_$^_$$^_$$^_$$<#$.",
-  }
-
-  summary_test! {
-    tokenize_comment_before_variable,
-    "
-#
-A='1'
-echo:
-  echo {{A}}
-",
-    "$#$N='$N:$>^_{N}$<.",
-  }
-
-  summary_test! {
-    tokenize_interpolation_backticks,
-    "hello:\n echo {{`echo hello` + `echo goodbye`}}",
-    "N:$>^_{`+`}<.",
-  }
-
-  summary_test! {
-    tokenize_assignment_backticks,
-    "a = `echo hello` + `echo goodbye`",
-    "N=`+`.",
-  }
-
-  summary_test! {
-    tokenize_multiple,
-    "
-hello:
-  a
-  b
-
-  c
-
-  d
-
-# hello
-bob:
-  frank
-",
-
-    "$N:$>^_$^_$$^_$$^_$$<#$N:$>^_$<.",
-  }
-
-  summary_test! {
-    tokenize_comment,
-    "a:=#",
-    "N:=#."
-  }
-
-  summary_test! {
-    tokenize_order,
-    r"
-b: a
-  @mv a b
-
-a:
-  @touch F
-  @touch a
-
-d: c
-  @rm c
-
-c: b
-  @mv b c",
-    "$N:N$>^_$$<N:$>^_$^_$$<N:N$>^_$$<N:N$>^_<.",
-  }
-
-  error_test! {
-    name: tokenize_space_then_tab,
-    input: "a:
- 0
- 1
-\t2
-",
-    index: 9,
-    line: 3,
-    column: 0,
-    width: None,
-    kind: InconsistentLeadingWhitespace{expected: " ", found: "\t"},
-  }
-
-  error_test! {
-    name: tokenize_tabs_then_tab_space,
-    input: "a:
-\t\t0
-\t\t 1
-\t 2
-",
-    index: 12,
-    line: 3,
-    column: 0,
-    width: None,
-    kind: InconsistentLeadingWhitespace{expected: "\t\t", found: "\t "},
-  }
-
-  error_test! {
-    name: tokenize_outer_shebang,
-    input: "#!/usr/bin/env bash",
-    index: 0,
-    line: 0,
-    column: 0,
-    width: None,
-    kind: OuterShebang,
-  }
-
-  error_test! {
-    name: tokenize_unknown,
-    input: "~",
-    index: 0,
-    line: 0,
-    column: 0,
-    width: None,
-    kind: UnknownStartOfToken,
-  }
-
-  error_test! {
-    name: unterminated_string,
-    input: r#"a = ""#,
-    index: 3,
-    line: 0,
-    column: 3,
-    width: None,
-    kind: UnterminatedString,
-  }
-
-  error_test! {
-    name: unterminated_string_with_escapes,
-    input: r#"a = "\n\t\r\"\\"#,
-    index: 3,
-    line: 0,
-    column: 3,
-    width: None,
-    kind: UnterminatedString,
-  }
-
-  error_test! {
-    name: unterminated_raw_string,
-    input: "r a='asdf",
-    index: 4,
-    line: 0,
-    column: 4,
-    width: None,
-    kind: UnterminatedString,
-  }
-
-  error_test! {
-    name: mixed_leading_whitespace,
-    input: "a:\n\t echo hello",
-    index: 3,
-    line: 1,
-    column: 0,
-    width: None,
-    kind: MixedLeadingWhitespace{whitespace: "\t "},
-  }
-}
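
One subtle piece the old and new scanners share is the cooked-string branch: after the opening quote they walk the characters, tracking a backslash-escape flag, and report UnterminatedString on a newline, a trailing escape, or end of input. A standalone sketch of that walk (simplified — the real code also accounts for the leading-whitespace prefix captured by the STRING regex):

// Returns the lexeme including both quotes, or None if the string is
// unterminated (newline, trailing escape, or EOF), mirroring the
// UnterminatedString cases above.
fn scan_string(rest: &str) -> Option<&str> {
    let contents = rest.strip_prefix('"')?;
    let mut len = 0;
    let mut escape = false;
    for c in contents.chars() {
        if c == '\n' || c == '\r' {
            return None;                   // strings may not span lines
        } else if !escape && c == '"' {
            return Some(&rest[..len + 2]); // opening quote + body + closing quote
        } else if !escape && c == '\\' {
            escape = true;                 // next character is escaped
        } else {
            escape = false;
        }
        len += c.len_utf8();
    }
    None                                   // ran off the end of input
}

fn main() {
    assert_eq!(scan_string(r#""a\"b" tail"#), Some(r#""a\"b""#));
    assert_eq!(scan_string(r#""oops"#), None);
    println!("ok");
}
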