extern crate itertools;

use std::iter::Peekable;
use std::str::Chars;
use self::itertools::Itertools;
use std::rc::Rc;

/// A single lexical token produced by `tokenize`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    Newline,
    Semicolon,
    LParen,
    RParen,
    Comma,
    Period,
    Colon,
    NumLiteral(f64),
    StrLiteral(Rc<String>),
    Identifier(Rc<String>),
    Operator(Op),
    Keyword(Kw),
}

#[derive(Debug, Clone, PartialEq)]
pub struct Op(pub Rc<String>);

#[derive(Debug, Clone, PartialEq)]
pub enum Kw {
    If,
    Then,
    Else,
    While,
    End,
    Let,
    Fn,
    Null,
}

pub type TokenizeResult = Result<Vec<Token>, TokenizeError>;

#[derive(Debug)]
pub struct TokenizeError {
    pub msg: String,
}

impl TokenizeError {
    fn new(msg: &str) -> TokenizeError {
        TokenizeError { msg: msg.to_string() }
    }
}

fn is_digit(c: &char) -> bool {
    c.is_digit(10)
}

/// Tokenize `input`, skipping `#` line comments. Returns the full token
/// stream, or the first error encountered.
pub fn tokenize(input: &str) -> TokenizeResult {
    use self::Token::*;
    let mut tokens = Vec::new();
    let mut iter: Peekable<Chars> = input.chars().peekable();

    while let Some(c) = iter.next() {
        // A '#' begins a comment running to the end of the line. Skip it,
        // but leave the '\n' in the iterator so the next pass still
        // produces a Newline token.
        if c == '#' {
            while let Some(&next) = iter.peek() {
                if next == '\n' {
                    break;
                }
                iter.next();
            }
            continue;
        }
        let cur_tok = match c {
            c if char::is_whitespace(c) && c != '\n' => continue,
            '\n' => Newline,
            ';' => Semicolon,
            '(' => LParen,
            ')' => RParen,
            ':' => Colon,
            ',' => Comma,
            '"' => try!(tokenize_str(&mut iter)),
            // '.' and digits must be tried before the general operator arm
            // below, which would otherwise consume a lone '.' as an operator
            // and make the Period token unreachable.
            c if c == '.' || is_digit(&c) => try!(tokenize_number_or_period(c, &mut iter)),
            c if !char::is_alphanumeric(c) => try!(tokenize_operator(c, &mut iter)),
            c => try!(tokenize_identifier(c, &mut iter)),
        };
        tokens.push(cur_tok);
    }
    Ok(tokens)
}

fn tokenize_str(iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    let mut buffer = String::new();
    loop {
        // TODO handle string escapes, interpolation
        match iter.next() {
            Some('"') => break,
            Some(x) => buffer.push(x),
            None => return Err(TokenizeError::new("Unclosed quote")),
        }
    }
    Ok(Token::StrLiteral(Rc::new(buffer)))
}

fn tokenize_operator(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    let mut buffer = String::new();
    buffer.push(c);
    // Greedily consume the rest of the operator: every following character
    // that is neither alphanumeric nor whitespace.
    buffer.extend(iter.peeking_take_while(|x| !char::is_alphanumeric(*x) && !char::is_whitespace(*x)));
    Ok(Token::Operator(Op(Rc::new(buffer))))
}

fn tokenize_number_or_period(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    // A lone '.' not followed by a digit is the Period token.
    if c == '.' && !iter.peek().map_or(false, is_digit) {
        return Ok(Token::Period);
    }

    let mut buffer = String::new();
    buffer.push(c);
    buffer.extend(iter.peeking_take_while(|x| is_digit(x) || *x == '.'));

    match buffer.parse::<f64>() {
        Ok(f) => Ok(Token::NumLiteral(f)),
        Err(_) => Err(TokenizeError::new("Failed to parse number literal")),
    }
}

fn tokenize_identifier(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    fn ends_identifier(c: &char) -> bool {
        let c = *c;
        char::is_whitespace(c) || is_digit(&c) || c == ';' || c == '(' || c == ')' ||
        c == ',' || c == '.' || c == ':'
    }

    use self::Token::*;
    let mut buffer = String::new();
    buffer.push(c);
    buffer.extend(iter.peeking_take_while(|x| !ends_identifier(x)));

    Ok(match &buffer[..] {
        "if" => Keyword(Kw::If),
        "then" => Keyword(Kw::Then),
        "else" => Keyword(Kw::Else),
        "while" => Keyword(Kw::While),
        "end" => Keyword(Kw::End),
        "let" => Keyword(Kw::Let),
        "fn" => Keyword(Kw::Fn),
        "null" => Keyword(Kw::Null),
        b => Identifier(Rc::new(b.to_string())),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use super::Token::*;

    macro_rules! token_test {
        ($input: expr, $output: pat, $ifexpr: expr) => {
            let tokens = tokenize($input).unwrap();
            match tokens[..] {
                $output if $ifexpr => (),
                _ => panic!("Actual output: {:?}", tokens),
            }
        }
    }

    #[test]
    fn basic_tokenization_tests() {
        token_test!("let a = 3\n",
                    [Keyword(Kw::Let), Identifier(ref a), Operator(Op(ref b)), NumLiteral(3.0), Newline],
                    **a == "a" && **b == "=");

        token_test!("2+1",
                    [NumLiteral(2.0), Operator(Op(ref a)), NumLiteral(1.0)],
                    **a == "+");

        token_test!("2 + 1",
                    [NumLiteral(2.0), Operator(Op(ref a)), NumLiteral(1.0)],
                    **a == "+");

        token_test!("2.3*49.2",
                    [NumLiteral(2.3), Operator(Op(ref a)), NumLiteral(49.2)],
                    **a == "*");

        assert!(tokenize("2.4.5").is_err());
    }
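
    // Added check (not in the original suite): a '#' comment should be
    // skipped through the end of the line, while the '\n' itself still
    // yields a Newline token. Assumes the comment-skipping in `tokenize`
    // leaves the newline in the iterator.
    #[test]
    fn comment_test() {
        token_test!("1 + 2 # a comment\n",
                    [NumLiteral(1.0), Operator(Op(ref a)), NumLiteral(2.0), Newline],
                    **a == "+");
    }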

    #[test]
    fn string_test() {
        token_test!("null + \"a string\"",
                    [Keyword(Kw::Null), Operator(Op(ref a)), StrLiteral(ref b)],
                    **a == "+" && **b == "a string");
    }
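
    // Added check (not in the original suite): a '.' that is not followed
    // by a digit should come out as Period rather than being absorbed into
    // an operator. Relies on the number/period arm in `tokenize` preceding
    // the general operator arm.
    #[test]
    fn period_test() {
        token_test!("a.b",
                    [Identifier(ref a), Period, Identifier(ref b)],
                    **a == "a" && **b == "b");
    }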

    #[test]
    fn operator_test() {
        token_test!("a *> b",
                    [Identifier(ref a), Operator(Op(ref b)), Identifier(ref c)],
                    **a == "a" && **b == "*>" && **c == "b");
    }
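
    // Added check (not in the original suite): exact keyword strings map to
    // Keyword tokens, while any other alphanumeric run, even one sharing a
    // keyword prefix, stays an Identifier.
    #[test]
    fn keyword_test() {
        token_test!("if null lets",
                    [Keyword(Kw::If), Keyword(Kw::Null), Identifier(ref a)],
                    **a == "lets");
    }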
}
|