schala/src/tokenizer.rs

/// A single lexical unit produced by `tokenize`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A line break (`'\n'`). All other whitespace is skipped rather than tokenized.
    Newline,
    /// The `;` character.
    Semicolon,
    /// The `(` character.
    LParen,
    /// The `)` character.
    RParen,
    /// The `,` character.
    Comma,
    /// A `.` that is not immediately followed by a decimal digit.
    Period,
    /// The `:` character.
    Colon,
    /// A numeric literal: a run of decimal digits and `.` characters parsed as an `f64`.
    NumLiteral(f64),
    /// The text between a pair of double quotes, without the quotes themselves.
    /// Escape sequences are not interpreted.
    StrLiteral(String),
    /// A name that is not one of the reserved words in `Kw`.
    Identifier(String),
    /// A run of non-alphanumeric characters that did not match any other token.
    Operator(Op),
    /// One of the reserved words listed in `Kw`.
    Keyword(Kw)
}

/// Payload of `Token::Operator`: the operator exactly as it was written.
#[derive(Debug, Clone, PartialEq)]
pub struct Op {
    /// The operator's characters as they appeared in the input, e.g. `"+"` or `"="`.
    pub repr: String,
}

/// Reserved words recognised by `tokenize`; each variant corresponds to the
/// lowercase spelling of its name in the source text.
#[derive(Debug, Clone, PartialEq)]
pub enum Kw {
    /// The word `if`.
    If,
    /// The word `then`.
    Then,
    /// The word `else`.
    Else,
    /// The word `while`.
    While,
    /// The word `end`.
    End,
    /// The word `let`.
    Let,
    /// The word `fn`.
    Fn,
    /// The word `null`.
    Null,
}

/// Returns `true` when `c` is one of the decimal digits `0` through `9`.
///
/// Takes a reference so it can be applied directly to the `&char` handed out
/// by `Peekable::peek`.
fn is_digit(c: &char) -> bool {
    // A character is a base-10 digit exactly when it has a base-10 value.
    c.to_digit(10).is_some()
}

/// Reports whether `c` terminates an identifier (or keyword) that is being scanned.
///
/// An identifier stops in front of:
///
/// * any whitespace character,
/// * any decimal digit (`0`-`9`),
/// * one of the punctuation characters `;`, `(`, `)`, `,`, `.` or `:`.
///
/// Every other character is treated as part of the identifier.
fn ends_identifier(c: &char) -> bool {
    match *c {
        // Punctuation that is tokenized on its own.
        ';' | '(' | ')' | ',' | '.' | ':' => true,
        other => char::is_whitespace(other) || other.is_digit(10),
    }
}

/// Appends characters from `iter` onto `buffer` for as long as `keep` accepts
/// the next one. Stops quietly when the input is exhausted.
fn consume_while<F>(iter: &mut ::std::iter::Peekable<::std::str::Chars>,
                    buffer: &mut String,
                    keep: F)
    where F: Fn(&char) -> bool
{
    while iter.peek().map_or(false, |x| keep(x)) {
        if let Some(n) = iter.next() {
            buffer.push(n);
        }
    }
}

/// Splits `input` into a flat list of tokens.
///
/// Scanning rules, in priority order:
///
/// * Whitespace other than `'\n'` is skipped.
/// * `#` starts a comment running to the end of the line. The comment text is
///   discarded; the terminating newline is still reported as `Newline`.
/// * `'\n'`, `;`, `(`, `)`, `:` and `,` each produce their dedicated token.
/// * `"` starts a string literal running to the next `"` (no escape handling).
/// * `.` not followed by a digit is `Period`.
/// * A digit, or a `.` followed by a digit, starts a number made of digits and
///   `.` characters, parsed as `f64`.
/// * Any other non-alphanumeric character starts an operator. It extends over
///   the following characters that are neither alphanumeric nor whitespace,
///   and stops in front of `(`, `)`, `,`, `;` and `"` so those delimiters keep
///   their own tokens.
/// * Anything else starts an identifier running until `ends_identifier`
///   accepts the next character or the input ends. Reserved words become
///   `Keyword`s.
///
/// Returns `None` when a string literal is not terminated or when a numeric
/// literal cannot be parsed as an `f64` (for example `2.4.5`).
pub fn tokenize(input: &str) -> Option<Vec<Token>> {
    use self::Token::*;
    let mut tokens = Vec::new();
    let mut iter = input.chars().peekable();

    while let Some(c) = iter.next() {
        if char::is_whitespace(c) && c != '\n' {
            continue;
        } else if c == '#' {
            // Drop the comment text but leave the '\n' in the stream: the next
            // iteration reports it as `Newline`, so a trailing comment still
            // terminates its line. No token is produced for the comment.
            while iter.peek().map_or(false, |x| *x != '\n') {
                iter.next();
            }
            continue;
        }

        // Computed up front so no match guard needs to borrow `iter` mutably.
        let digit_follows = iter.peek().map_or(false, |x| is_digit(x));

        let cur_tok = match c {
            '\n' => Newline,
            ';' => Semicolon,
            '(' => LParen,
            ')' => RParen,
            ':' => Colon,
            ',' => Comma,
            '"' => {
                let mut buffer = String::with_capacity(20);
                loop {
                    //TODO handle string escapes, interpolation
                    match iter.next() {
                        Some('"') => break,
                        Some(x) => buffer.push(x),
                        // Input ended before the closing quote.
                        None => return None,
                    }
                }
                StrLiteral(buffer)
            }
            '.' if !digit_follows => Period,
            _ if is_digit(&c) || c == '.' => {
                let mut buffer = String::with_capacity(20);
                buffer.push(c);
                consume_while(&mut iter, &mut buffer, |x| is_digit(x) || *x == '.');
                match buffer.parse::<f64>() {
                    Ok(f) => NumLiteral(f),
                    Err(_) => return None,
                }
            }
            _ if !char::is_alphanumeric(c) => {
                let mut buffer = String::with_capacity(20);
                buffer.push(c);
                consume_while(&mut iter, &mut buffer, |x| match *x {
                    // Structural delimiters are never part of an operator.
                    '(' | ')' | ',' | ';' | '"' => false,
                    other => !char::is_alphanumeric(other) && !char::is_whitespace(other),
                });
                Operator(Op { repr: buffer })
            }
            _ => {
                let mut buffer = String::with_capacity(20);
                buffer.push(c);
                // `consume_while` also stops at end of input, so an identifier
                // that is the last thing in `input` is handled normally.
                consume_while(&mut iter, &mut buffer, |x| !ends_identifier(x));

                match &buffer[..] {
                    "if" => Keyword(Kw::If),
                    "then" => Keyword(Kw::Then),
                    "else" => Keyword(Kw::Else),
                    "while" => Keyword(Kw::While),
                    "end" => Keyword(Kw::End),
                    "let" => Keyword(Kw::Let),
                    "fn" => Keyword(Kw::Fn),
                    "null" => Keyword(Kw::Null),
                    b => Identifier(b.to_string()),
                }
            }
        };

        tokens.push(cur_tok);
    }

    Some(tokens)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `source` (panicking if tokenization fails) and checks that the
    /// `Debug` rendering of the resulting token list equals `rendered`.
    fn expect_tokens(source: &str, rendered: &str) {
        let produced = tokenize(source).unwrap();
        assert_eq!(format!("{:?}", produced), rendered);
    }

    #[test]
    fn tokeniziation_tests() {
        // NOTE(review): the expected strings depend on how `f64` is rendered by
        // `{:?}` (`3` here); that rendering may differ between compiler
        // versions -- confirm against the toolchain in use.
        let cases = [
            ("let a = 3\n",
             "[Keyword(Let), Identifier(\"a\"), Operator(Op { repr: \"=\" }), NumLiteral(3), Newline]"),
            ("2+1",
             "[NumLiteral(2), Operator(Op { repr: \"+\" }), NumLiteral(1)]"),
            ("2 + 1",
             "[NumLiteral(2), Operator(Op { repr: \"+\" }), NumLiteral(1)]"),
            ("2.3*49.2",
             "[NumLiteral(2.3), Operator(Op { repr: \"*\" }), NumLiteral(49.2)]"),
        ];
        for &(source, rendered) in cases.iter() {
            expect_tokens(source, rendered);
        }

        // A number containing two decimal points is rejected outright.
        assert_eq!(tokenize("2.4.5"), None);
    }

    #[test]
    #[ignore]
    fn more_tokenization() {
        // Wish-list: support complicated operators in a nice, haskell-ish way.
        // `Token` has no `EOF` variant and operators come out as `Operator`,
        // so this expectation cannot be met yet; the test stays ignored.
        expect_tokens("a *> b",
            "[Identifier(\"a\"), Identifier(\"*>\"), Identifier(\"b\"), EOF]");
    }
}
I can now parse one thing 2015-12-25 02:03:11 -08:00			`#[derive(Debug, Clone, PartialEq)]`
Move tokenizing into separate module 2015-07-22 03:02:55 -07:00			`pub enum Token {`
Get rid of Separator token Have separate newline and semicolon tokens 2015-12-31 22:20:59 -08:00			`Newline,`
			`Semicolon,`
Move tokenizing into separate module 2015-07-22 03:02:55 -07:00			`LParen,`
			`RParen,`
			`Comma,`
Tokenize periods separately 2015-07-26 01:51:15 -07:00			`Period,`
Start making tokenizer changes Hopefully this time iron out all the bugs from the last implementation 2016-01-05 22:00:29 -08:00			`Colon,`
Move tokenizing into separate module 2015-07-22 03:02:55 -07:00			`NumLiteral(f64),`
			`StrLiteral(String),`
Added Keyword lexical class 2015-07-22 04:01:56 -07:00			`Identifier(String),`
Introduce Op type For operator parsing 2016-01-15 03:27:24 -08:00			`Operator(Op),`
Added Keyword lexical class 2015-07-22 04:01:56 -07:00			`Keyword(Kw)`
			`}`

Introduce Op type For operator parsing 2016-01-15 03:27:24 -08:00			`#[derive(Debug, Clone, PartialEq)]`
			`pub struct Op {`
			`pub repr: String,`
			`}`

Added Keyword lexical class 2015-07-22 04:01:56 -07:00			`#[derive(Debug, Clone, PartialEq)]`
			`pub enum Kw {`
			`If,`
			`Then,`
			`Else,`
			`While,`
			`End,`
			`Let,`
			`Fn,`
Make = a keyword 2015-08-08 00:27:40 -07:00			`Null,`
Move tokenizing into separate module 2015-07-22 03:02:55 -07:00			`}`
Separate parsing into module 2015-07-22 03:12:01 -07:00
Start making tokenizer changes Hopefully this time iron out all the bugs from the last implementation 2016-01-05 22:00:29 -08:00			`fn is_digit(c: &char) -> bool {`
			`c.is_digit(10)`
			`}`

Finish tokenizing 2016-01-07 02:25:32 -08:00			`fn ends_identifier(c: &char) -> bool {`
			`let c = *c;`
			`char::is_whitespace(c) \|\|`
			`is_digit(&c) \|\|`
			`c == ';' \|\|`
			`c == '(' \|\|`
			`c == ')' \|\|`
			`c == ',' \|\|`
			`c == '.' \|\|`
Add comma tokenization 2016-01-09 23:22:46 -08:00			`c == ',' \|\|`
Finish tokenizing 2016-01-07 02:25:32 -08:00			`c == ':'`
			`}`

Make tokenize error-able 2016-01-06 23:48:53 -08:00			`pub fn tokenize(input: &str) -> Option<Vec<Token>> {`
Start making tokenizer changes Hopefully this time iron out all the bugs from the last implementation 2016-01-05 22:00:29 -08:00			`use self::Token::*;`
Separate parsing into module 2015-07-22 03:12:01 -07:00			`let mut tokens = Vec::new();`
Start making tokenizer changes Hopefully this time iron out all the bugs from the last implementation 2016-01-05 22:00:29 -08:00			`let mut iter = input.chars().peekable();`

			`while let Some(c) = iter.next() {`
			`if char::is_whitespace(c) && c != '\n' {`
			`continue;`
			`} else if c == '#' {`
			`while let Some(c) = iter.next() {`
			`if c == '\n' { break; }`
			`}`
			`}`

			`let cur_tok =`
			`if c == '\n' {`
			`Newline`
			`} else if c == ';' {`
			`Semicolon`
			`} else if c == '(' {`
			`LParen`
			`} else if c == ')' {`
			`RParen`
			`} else if c == ':' {`
			`Colon`
Fix bug 2016-01-09 23:24:10 -08:00			`} else if c == ',' {`
Add comma tokenization 2016-01-09 23:22:46 -08:00			`Comma`
Start making tokenizer changes Hopefully this time iron out all the bugs from the last implementation 2016-01-05 22:00:29 -08:00			`} else if c == '"' {`
			`let mut buffer = String::with_capacity(20);`
			`loop {`
			`//TODO handle string escapes, interpolation`
			`match iter.next() {`
			`Some(x) if x == '"' => break,`
			`Some(x) => buffer.push(x),`
Make tokenize error-able 2016-01-06 23:48:53 -08:00			`None => return None,`
Start making tokenizer changes Hopefully this time iron out all the bugs from the last implementation 2016-01-05 22:00:29 -08:00			`}`
			`}`
			`StrLiteral(buffer)`
Finish tokenizing 2016-01-07 02:25:32 -08:00			`} else if c == '.' && !iter.peek().map_or(false, \|x\| is_digit(x)) {`
			`Period`
			`} else if is_digit(&c) \|\| c == '.' {`
Tokenize number literals TODO: expand this bit of code to handle 0x12, etc. syntax 2016-01-07 01:09:18 -08:00			`let mut buffer = String::with_capacity(20);`
			`buffer.push(c);`
			`loop {`
			`if iter.peek().map_or(false, \|x\| is_digit(x) \|\| *x == '.') {`
			`let n = iter.next().unwrap();`
			`buffer.push(n);`
			`} else {`
			`break;`
			`}`
			`}`
			`match buffer.parse::<f64>() {`
			`Ok(f) => NumLiteral(f),`
			`Err(_) => return None`
			`}`
Finish tokenizing Op separately 2016-01-15 00:31:00 -08:00			`} else if !char::is_alphanumeric(c) {`
Tokenizer work to support operators work in progress but committing to transfer 2016-01-14 20:52:29 -08:00			`let mut buffer = String::with_capacity(20);`
			`buffer.push(c);`
			`loop {`
fix operator parsing 2016-01-15 01:20:43 -08:00			`if iter.peek().map_or(false, \|x\| !char::is_alphanumeric(x) && !char::is_whitespace(x)) {`
Tokenizer work to support operators work in progress but committing to transfer 2016-01-14 20:52:29 -08:00			`let n = iter.next().unwrap();`
			`buffer.push(n);`
			`} else {`
			`break;`
			`}`
			`}`
Introduce Op type For operator parsing 2016-01-15 03:27:24 -08:00			`Operator(Op {repr: buffer })`
Start making tokenizer changes Hopefully this time iron out all the bugs from the last implementation 2016-01-05 22:00:29 -08:00			`} else {`
Finish tokenizing 2016-01-07 02:25:32 -08:00			`let mut buffer = String::with_capacity(20);`
			`buffer.push(c);`
			`loop {`
			`if iter.peek().map_or(false, \|x\| ends_identifier(x)) {`
			`break;`
			`} else {`
			`buffer.push(iter.next().unwrap());`
			`}`
			`}`

			`match &buffer[..] {`
			`"if" => Keyword(Kw::If),`
			`"then" => Keyword(Kw::Then),`
Finish keyword tokenization 2016-01-09 02:19:05 -08:00			`"else" => Keyword(Kw::Else),`
			`"while" => Keyword(Kw::While),`
			`"end" => Keyword(Kw::End),`
			`"let" => Keyword(Kw::Let),`
			`"fn" => Keyword(Kw::Fn),`
			`"null" => Keyword(Kw::Null),`
Finish tokenizing 2016-01-07 02:25:32 -08:00			`b => Identifier(b.to_string())`
			`}`
Start making tokenizer changes Hopefully this time iron out all the bugs from the last implementation 2016-01-05 22:00:29 -08:00			`};`

			`tokens.push(cur_tok);`
			`}`

Make tokenize error-able 2016-01-06 23:48:53 -08:00			`Some(tokens)`
Separate parsing into module 2015-07-22 03:12:01 -07:00			`}`
Added Keyword lexical class 2015-07-22 04:01:56 -07:00
Add tokenization test 2015-12-20 17:03:03 -08:00			`#[cfg(test)]`
			`mod tests {`
Macro-ize token tests 2016-01-09 00:07:48 -08:00
			`macro_rules! tokentest {`
			`($input:expr, $output:expr) => {`
			`{`
			`let tokens = tokenize($input).unwrap();`
			`assert_eq!(format!("{:?}", tokens), $output);`
			`}`
			`}`
			`}`

Add tokenization test 2015-12-20 17:03:03 -08:00			`use super::*;`

			`#[test]`
			`fn tokeniziation_tests() {`
Macro-ize token tests 2016-01-09 00:07:48 -08:00			`tokentest!("let a = 3\n",`
Fix tokenizer tests 2016-01-16 03:12:06 -08:00			`"[Keyword(Let), Identifier(\"a\"), Operator(Op { repr: \"=\" }), NumLiteral(3), Newline]");`
Macro-ize token tests 2016-01-09 00:07:48 -08:00
			`tokentest!("2+1",`
Fix tokenizer tests 2016-01-16 03:12:06 -08:00			`"[NumLiteral(2), Operator(Op { repr: \"+\" }), NumLiteral(1)]");`
Macro-ize token tests 2016-01-09 00:07:48 -08:00
			`tokentest!("2 + 1",`
Fix tokenizer tests 2016-01-16 03:12:06 -08:00			`"[NumLiteral(2), Operator(Op { repr: \"+\" }), NumLiteral(1)]");`
Macro-ize token tests 2016-01-09 00:07:48 -08:00
Couple more tests 2016-01-09 00:09:48 -08:00			`tokentest!("2.3*49.2",`
Fix tokenizer tests 2016-01-16 03:12:06 -08:00			`"[NumLiteral(2.3), Operator(Op { repr: \"*\" }), NumLiteral(49.2)]");`
Couple more tests 2016-01-09 00:09:48 -08:00
			`assert_eq!(tokenize("2.4.5"), None);`
Add test to ignore For better handing of user-defined operators, which I will do in the future 2016-01-09 01:01:37 -08:00			`}`

			`#[test]`
			`#[ignore]`
			`fn more_tokenization() {`
			`//it would be nice to support complicated operators in a nice, haskell-ish way`
			`tokentest!("a *> b",`
			`"[Identifier(\"a\"), Identifier(\"*>\"), Identifier(\"b\"), EOF]");`
Couple more tests 2016-01-09 00:09:48 -08:00
Add tokenization test 2015-12-20 17:03:03 -08:00			`}`
			`}`