schala/src/tokenizer.rs

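/// A single lexical token produced by `tokenize`.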
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    Newline,
    Semicolon,
    LParen,
    RParen,
    Comma,
    Period,
    Colon,
    NumLiteral(f64),
    StrLiteral(String),
    Identifier(String),
    Operator(Op),
    Keyword(Kw),
}
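
/// An operator, stored by its source text.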
#[derive(Debug, Clone, PartialEq)]
pub struct Op {
    pub repr: String,
}
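
/// Reserved words recognized by the tokenizer.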
#[derive(Debug, Clone, PartialEq)]
pub enum Kw {
    If,
    Then,
    Else,
    While,
    End,
    Let,
    Fn,
    Null,
}
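
/// Either the complete token stream or the first error encountered.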
pub type TokenizeResult = Result<Vec<Token>, TokenizeError>;

#[derive(Debug)]
pub struct TokenizeError {
    pub msg: String,
}

impl TokenizeError {
    fn new(msg: &str) -> TokenizeError {
        TokenizeError { msg: msg.to_string() }
    }
}

fn is_digit(c: &char) -> bool {
    c.is_digit(10)
}
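
/// True for characters that end an identifier: whitespace, digits, and
/// single-character punctuation (so identifiers cannot contain digits).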
fn ends_identifier(c: &char) -> bool {
    let c = *c;
    char::is_whitespace(c) ||
        is_digit(&c) ||
        c == ';' ||
        c == '(' ||
        c == ')' ||
        c == ',' ||
        c == '.' ||
        c == ':'
}
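
/// Tokenizes `input` in a single pass. Whitespace other than newlines is
/// skipped, `#` begins a comment running to the end of the line, and each
/// remaining lexeme is dispatched on its first character.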
pub fn tokenize(input: &str) -> TokenizeResult {
    use self::Token::*;

    let mut tokens = Vec::new();
    let mut iter = input.chars().peekable();

    while let Some(c) = iter.next() {
        if char::is_whitespace(c) && c != '\n' {
            continue;
        } else if c == '#' {
            // Consume the comment through the end of the line, then restart the
            // loop so the '#' itself is not lexed as an operator.
            while let Some(c) = iter.next() {
                if c == '\n' { break; }
            }
            continue;
        }
        let cur_tok =
            if c == '\n' {
                Newline
            } else if c == ';' {
                Semicolon
            } else if c == '(' {
                LParen
            } else if c == ')' {
                RParen
            } else if c == ':' {
                Colon
            } else if c == ',' {
                Comma
            } else if c == '"' {
                let mut buffer = String::with_capacity(20);
                loop {
                    //TODO handle string escapes, interpolation
                    match iter.next() {
                        Some(x) if x == '"' => break,
                        Some(x) => buffer.push(x),
                        None => return Err(TokenizeError::new("Unclosed quote")),
                    }
                }
                StrLiteral(buffer)
            } else if c == '.' && !iter.peek().map_or(false, |x| is_digit(x)) {
                // A '.' not followed by a digit is the Period token; otherwise
                // it falls through to the number branch, so ".5" is a literal.
                Period
            } else if is_digit(&c) || c == '.' {
                let mut buffer = String::with_capacity(20);
                buffer.push(c);
                loop {
                    if iter.peek().map_or(false, |x| is_digit(x) || *x == '.') {
                        let n = iter.next().unwrap();
                        buffer.push(n);
                    } else {
                        break;
                    }
                }
                match buffer.parse::<f64>() {
                    Ok(f) => NumLiteral(f),
                    Err(_) => return Err(TokenizeError::new("Failed to parse number")),
                }
            } else if !char::is_alphanumeric(c) {
                // Any other non-alphanumeric character begins an operator;
                // consume greedily until whitespace or an alphanumeric character.
                let mut buffer = String::with_capacity(20);
                buffer.push(c);
                loop {
                    if iter.peek().map_or(false, |x| !char::is_alphanumeric(*x) && !char::is_whitespace(*x)) {
                        let n = iter.next().unwrap();
                        buffer.push(n);
                    } else {
                        break;
                    }
                }
                Operator(Op { repr: buffer })
            } else {
                let mut buffer = String::with_capacity(20);
                buffer.push(c);
                loop {
                    if iter.peek().map_or(true, |x| ends_identifier(x)) {
                        break;
                    } else {
                        buffer.push(iter.next().unwrap());
                    }
                }
                match &buffer[..] {
                    "if" => Keyword(Kw::If),
                    "then" => Keyword(Kw::Then),
                    "else" => Keyword(Kw::Else),
                    "while" => Keyword(Kw::While),
                    "end" => Keyword(Kw::End),
                    "let" => Keyword(Kw::Let),
                    "fn" => Keyword(Kw::Fn),
                    "null" => Keyword(Kw::Null),
                    b => Identifier(b.to_string()),
                }
            };
        tokens.push(cur_tok);
    }
    Ok(tokens)
}

#[cfg(test)]
mod tests {
    use super::*;

    macro_rules! tokentest {
        ($input:expr, $output:expr) => {
            {
                let tokens = tokenize($input).unwrap();
                assert_eq!(format!("{:?}", tokens), $output);
            }
        }
    }

    #[test]
    fn tokenization_tests() {
        tokentest!("let a = 3\n",
                   "[Keyword(Let), Identifier(\"a\"), Operator(Op { repr: \"=\" }), NumLiteral(3.0), Newline]");

        tokentest!("2+1",
                   "[NumLiteral(2.0), Operator(Op { repr: \"+\" }), NumLiteral(1.0)]");

        tokentest!("2 + 1",
                   "[NumLiteral(2.0), Operator(Op { repr: \"+\" }), NumLiteral(1.0)]");

        tokentest!("2.3*49.2",
                   "[NumLiteral(2.3), Operator(Op { repr: \"*\" }), NumLiteral(49.2)]");

        assert!(tokenize("2.4.5").is_err());
    }

    #[test]
    #[ignore]
    fn more_tokenization() {
        // it would be nice to support complicated operators in a nice, haskell-ish way
        tokentest!("a *> b",
                   "[Identifier(\"a\"), Identifier(\"*>\"), Identifier(\"b\"), EOF]");
    }
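
    // A few extra checks, sketched against the code above: comment skipping,
    // string literals, and the unclosed-quote error path.
    #[test]
    fn comments_strings_and_errors() {
        tokentest!("1 # a comment\n2",
                   "[NumLiteral(1.0), NumLiteral(2.0)]");
        tokentest!("\"hello\"",
                   "[StrLiteral(\"hello\")]");
        assert!(tokenize("\"unclosed").is_err());
    }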
}