extern crate itertools;

use std::iter::Peekable;
use std::str::Chars;
use self::itertools::Itertools;
use std::rc::Rc;

/// A single lexical token produced by `tokenize`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    Newline,
    Semicolon,
    LParen,
    RParen,
    Comma,
    Period,
    Colon,
    NumLiteral(f64),
    StrLiteral(Rc<String>),
    Identifier(Rc<String>),
    Operator(Op),
    Keyword(Kw),
}

#[derive(Debug, Clone, PartialEq)]
pub struct Op(pub Rc<String>);

#[derive(Debug, Clone, PartialEq)]
pub enum Kw {
    If,
    Then,
    Else,
    While,
    End,
    Let,
    Fn,
    Null,
}

pub type TokenizeResult = Result<Vec<Token>, TokenizeError>;

#[derive(Debug)]
pub struct TokenizeError {
    pub msg: String,
}

impl TokenizeError {
    fn new(msg: &str) -> TokenizeError {
        TokenizeError { msg: msg.to_string() }
    }
}

fn is_digit(c: &char) -> bool {
    c.is_digit(10)
}

/// Tokenize `input`, skipping `#` line comments. Returns the full token
/// stream, or the first error encountered.
pub fn tokenize(input: &str) -> TokenizeResult {
    use self::Token::*;
    let mut tokens = Vec::new();
    let mut iter: Peekable<Chars> = input.chars().peekable();

    while let Some(c) = iter.next() {
        // A '#' begins a comment running to the end of the line. Skip it,
        // but leave the '\n' in the iterator so the next pass still
        // produces a Newline token.
        if c == '#' {
            while let Some(&next) = iter.peek() {
                if next == '\n' {
                    break;
                }
                iter.next();
            }
            continue;
        }
        let cur_tok = match c {
            c if char::is_whitespace(c) && c != '\n' => continue,
            '\n' => Newline,
            ';' => Semicolon,
            '(' => LParen,
            ')' => RParen,
            ':' => Colon,
            ',' => Comma,
            '"' => try!(tokenize_str(&mut iter)),
            // '.' and digits must be tried before the general operator arm
            // below, which would otherwise consume a lone '.' as an operator
            // and make the Period token unreachable.
            c if c == '.' || is_digit(&c) => try!(tokenize_number_or_period(c, &mut iter)),
            c if !char::is_alphanumeric(c) => try!(tokenize_operator(c, &mut iter)),
            c => try!(tokenize_identifier(c, &mut iter)),
        };
        tokens.push(cur_tok);
    }
    Ok(tokens)
}

fn tokenize_str(iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    let mut buffer = String::new();
    loop {
        // TODO handle string escapes, interpolation
        match iter.next() {
            Some('"') => break,
            Some(x) => buffer.push(x),
            None => return Err(TokenizeError::new("Unclosed quote")),
        }
    }
    Ok(Token::StrLiteral(Rc::new(buffer)))
}

fn tokenize_operator(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    let mut buffer = String::new();
    buffer.push(c);
    // Greedily consume the rest of the operator: every following character
    // that is neither alphanumeric nor whitespace.
    buffer.extend(iter.peeking_take_while(|x| !char::is_alphanumeric(*x) && !char::is_whitespace(*x)));
    Ok(Token::Operator(Op(Rc::new(buffer))))
}

fn tokenize_number_or_period(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    // A lone '.' not followed by a digit is the Period token.
    if c == '.' && !iter.peek().map_or(false, is_digit) {
        return Ok(Token::Period);
    }

    let mut buffer = String::new();
    buffer.push(c);
    buffer.extend(iter.peeking_take_while(|x| is_digit(x) || *x == '.'));

    match buffer.parse::<f64>() {
        Ok(f) => Ok(Token::NumLiteral(f)),
        Err(_) => Err(TokenizeError::new("Failed to parse number literal")),
    }
}

fn tokenize_identifier(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    fn ends_identifier(c: &char) -> bool {
        let c = *c;
        char::is_whitespace(c) || is_digit(&c) || c == ';' || c == '(' || c == ')' ||
        c == ',' || c == '.' || c == ':'
    }

    use self::Token::*;
    let mut buffer = String::new();
    buffer.push(c);
    buffer.extend(iter.peeking_take_while(|x| !ends_identifier(x)));

    Ok(match &buffer[..] {
        "if" => Keyword(Kw::If),
        "then" => Keyword(Kw::Then),
        "else" => Keyword(Kw::Else),
        "while" => Keyword(Kw::While),
        "end" => Keyword(Kw::End),
        "let" => Keyword(Kw::Let),
        "fn" => Keyword(Kw::Fn),
        "null" => Keyword(Kw::Null),
        b => Identifier(Rc::new(b.to_string())),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use super::Token::*;

    macro_rules! token_test {
        ($input: expr, $output: pat, $ifexpr: expr) => {
            let tokens = tokenize($input).unwrap();
            match tokens[..] {
                $output if $ifexpr => (),
                _ => panic!("Actual output: {:?}", tokens),
            }
        }
    }

    #[test]
    fn basic_tokenization_tests() {
        token_test!("let a = 3\n",
                    [Keyword(Kw::Let), Identifier(ref a), Operator(Op(ref b)), NumLiteral(3.0), Newline],
                    **a == "a" && **b == "=");

        token_test!("2+1",
                    [NumLiteral(2.0), Operator(Op(ref a)), NumLiteral(1.0)],
                    **a == "+");

        token_test!("2 + 1",
                    [NumLiteral(2.0), Operator(Op(ref a)), NumLiteral(1.0)],
                    **a == "+");

        token_test!("2.3*49.2",
                    [NumLiteral(2.3), Operator(Op(ref a)), NumLiteral(49.2)],
                    **a == "*");

        assert!(tokenize("2.4.5").is_err());
    }
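
    // Added check (not in the original suite): a '#' comment should be
    // skipped through the end of the line, while the '\n' itself still
    // yields a Newline token. Assumes the comment-skipping in `tokenize`
    // leaves the newline in the iterator.
    #[test]
    fn comment_test() {
        token_test!("1 + 2 # a comment\n",
                    [NumLiteral(1.0), Operator(Op(ref a)), NumLiteral(2.0), Newline],
                    **a == "+");
    }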

    #[test]
    fn string_test() {
        token_test!("null + \"a string\"",
                    [Keyword(Kw::Null), Operator(Op(ref a)), StrLiteral(ref b)],
                    **a == "+" && **b == "a string");
    }
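
    // Added check (not in the original suite): a '.' that is not followed
    // by a digit should come out as Period rather than being absorbed into
    // an operator. Relies on the number/period arm in `tokenize` preceding
    // the general operator arm.
    #[test]
    fn period_test() {
        token_test!("a.b",
                    [Identifier(ref a), Period, Identifier(ref b)],
                    **a == "a" && **b == "b");
    }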

    #[test]
    fn operator_test() {
        token_test!("a *> b",
                    [Identifier(ref a), Operator(Op(ref b)), Identifier(ref c)],
                    **a == "a" && **b == "*>" && **c == "b");
    }
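
    // Added check (not in the original suite): exact keyword strings map to
    // Keyword tokens, while any other alphanumeric run, even one sharing a
    // keyword prefix, stays an Identifier.
    #[test]
    fn keyword_test() {
        token_test!("if null lets",
                    [Keyword(Kw::If), Keyword(Kw::Null), Identifier(ref a)],
                    **a == "lets");
    }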
}
|