Rewrite of tokenizer

greg 2017-01-01 20:15:17 -08:00
parent 59226eb731
commit b2e453a9de


@@ -1,3 +1,6 @@
+use std::iter::Peekable;
+use std::str::Chars;
+
 #[derive(Debug, Clone, PartialEq)]
 pub enum Token {
     Newline,
@@ -46,17 +49,10 @@ fn is_digit(c: &char) -> bool {
     c.is_digit(10)
 }
 
-fn ends_identifier(c: &char) -> bool {
-    let c = *c;
-    char::is_whitespace(c) || is_digit(&c) || c == ';' || c == '(' || c == ')' || c == ',' ||
-        c == '.' || c == ',' || c == ':'
-}
-
 pub fn tokenize(input: &str) -> TokenizeResult {
     use self::Token::*;
     let mut tokens = Vec::new();
-    let mut iter = input.chars().peekable();
+    let mut iter: Peekable<Chars> = input.chars().peekable();
     while let Some(c) = iter.next() {
         if c == '#' {
             while let Some(c) = iter.next() {
@@ -65,7 +61,6 @@ pub fn tokenize(input: &str) -> TokenizeResult {
                 }
             }
         }
-
         let cur_tok = match c {
             c if char::is_whitespace(c) && c != '\n' => continue,
             '\n' => Newline,
@@ -74,8 +69,18 @@ pub fn tokenize(input: &str) -> TokenizeResult {
             ')' => RParen,
             ':' => Colon,
             ',' => Comma,
-            '"' => {
-                let mut buffer = String::with_capacity(20);
+            '"' => try!(tokenize_str(&mut iter)),
+            c if !char::is_alphanumeric(c) => try!(tokenize_operator(c, &mut iter)),
+            c if is_digit(&c) || c == '.' => try!(tokenize_number_or_period(c, &mut iter)),
+            c => try!(tokenize_identifier(c, &mut iter)),
+        };
+        tokens.push(cur_tok);
+    }
+    Ok(tokens)
+}
+
+fn tokenize_str(iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
+    let mut buffer = String::new();
     loop {
         // TODO handle string escapes, interpolation
         match iter.next() {
@@ -84,10 +89,11 @@ pub fn tokenize(input: &str) -> TokenizeResult {
             None => return Err(TokenizeError::new("Unclosed quote")),
         }
     }
-    StrLiteral(buffer)
+    Ok(Token::StrLiteral(buffer))
 }
-c if !char::is_alphanumeric(c) => {
-    let mut buffer = String::with_capacity(20);
+
+fn tokenize_operator(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
+    let mut buffer = String::new();
     buffer.push(c);
     loop {
         if iter.peek().map_or(false,
@@ -98,13 +104,15 @@ pub fn tokenize(input: &str) -> TokenizeResult {
             break;
         }
     }
-    Operator(Op(buffer))
+    Ok(Token::Operator(Op(buffer)))
+}
+
+fn tokenize_number_or_period(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
+    if c == '.' && !iter.peek().map_or(false, is_digit) {
+        return Ok(Token::Period);
     }
-    c => {
-        if c == '.' && !iter.peek().map_or(false, |x| is_digit(x)) {
-            Period
-        } else if is_digit(&c) || c == '.' {
-            let mut buffer = String::with_capacity(20);
+
+    let mut buffer = String::new();
     buffer.push(c);
     loop {
         if iter.peek().map_or(false, |x| is_digit(x) || *x == '.') {
@@ -115,11 +123,20 @@ pub fn tokenize(input: &str) -> TokenizeResult {
         }
     }
     match buffer.parse::<f64>() {
-        Ok(f) => NumLiteral(f),
-        Err(_) => return Err(TokenizeError::new("Failed to pase digit")),
+        Ok(f) => Ok(Token::NumLiteral(f)),
+        Err(_) => Err(TokenizeError::new("Failed to parse digit")),
     }
-    } else {
-        let mut buffer = String::with_capacity(20);
+}
+
+fn tokenize_identifier(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
+    fn ends_identifier(c: &char) -> bool {
+        let c = *c;
+        char::is_whitespace(c) || is_digit(&c) || c == ';' || c == '(' || c == ')' ||
+            c == ',' || c == '.' || c == ',' || c == ':'
+    }
+
+    use self::Token::*;
+    let mut buffer = String::new();
     buffer.push(c);
     loop {
         if iter.peek().map_or(true, |x| ends_identifier(x)) {
@@ -128,8 +145,7 @@ pub fn tokenize(input: &str) -> TokenizeResult {
             buffer.push(iter.next().unwrap());
         }
     }
-    match &buffer[..] {
+    Ok(match &buffer[..] {
         "if" => Keyword(Kw::If),
         "then" => Keyword(Kw::Then),
         "else" => Keyword(Kw::Else),
@@ -139,15 +155,7 @@ pub fn tokenize(input: &str) -> TokenizeResult {
         "fn" => Keyword(Kw::Fn),
         "null" => Keyword(Kw::Null),
         b => Identifier(b.to_string()),
-    }
-    }
-    }
-    };
-    tokens.push(cur_tok);
-    }
-    Ok(tokens)
+    })
 }
 
 #[cfg(test)]
@@ -167,17 +175,17 @@ mod tests {
     #[test]
     fn tokeniziation_tests() {
         tokentest!("let a = 3\n",
-                   "[Keyword(Let), Identifier(\"a\"), Operator(Op { repr: \"=\" }), \
+                   "[Keyword(Let), Identifier(\"a\"), Operator(Op(\"=\")), \
                     NumLiteral(3), Newline]");
         tokentest!("2+1",
-                   "[NumLiteral(2), Operator(Op { repr: \"+\" }), NumLiteral(1)]");
+                   "[NumLiteral(2), Operator(Op(\"+\")), NumLiteral(1)]");
         tokentest!("2 + 1",
-                   "[NumLiteral(2), Operator(Op { repr: \"+\" }), NumLiteral(1)]");
+                   "[NumLiteral(2), Operator(Op(\"+\")), NumLiteral(1)]");
         tokentest!("2.3*49.2",
-                   "[NumLiteral(2.3), Operator(Op { repr: \"*\" }), NumLiteral(49.2)]");
+                   "[NumLiteral(2.3), Operator(Op(\"*\")), NumLiteral(49.2)]");
         assert!(tokenize("2.4.5").is_err());
     }
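
For reference, the shape this commit moves the tokenizer toward can be summarized in a small standalone sketch: tokenize scans character by character and delegates each token class to its own fallible helper, so an unclosed quote or a malformed number surfaces as an Err from tokenize itself. The sketch below is a simplification under stated assumptions, not the repository's code: the definitions of Op, TokenizeError, and TokenizeResult are reconstructed from the diff; keywords, comments, Period, and the paren/colon/comma tokens are omitted; and the modern ? operator stands in for the try! macro used in this 2017-era code.

use std::iter::Peekable;
use std::str::Chars;

// Reconstructed supporting types (assumptions, not the repository's definitions).
#[derive(Debug, Clone, PartialEq)]
pub struct Op(pub String);

#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    Newline,
    StrLiteral(String),
    NumLiteral(f64),
    Operator(Op),
    Identifier(String),
}

#[derive(Debug)]
pub struct TokenizeError(pub String);

pub type TokenizeResult = Result<Vec<Token>, TokenizeError>;

fn is_digit(c: &char) -> bool {
    c.is_digit(10)
}

pub fn tokenize(input: &str) -> TokenizeResult {
    let mut tokens = Vec::new();
    let mut iter: Peekable<Chars> = input.chars().peekable();
    while let Some(c) = iter.next() {
        // One match arm per token class; each helper returns Result, as in the commit.
        let cur_tok = match c {
            c if c.is_whitespace() && c != '\n' => continue,
            '\n' => Token::Newline,
            '"' => tokenize_str(&mut iter)?,
            c if is_digit(&c) => tokenize_number(c, &mut iter)?,
            c if !c.is_alphanumeric() => tokenize_operator(c, &mut iter)?,
            c => tokenize_identifier(c, &mut iter)?,
        };
        tokens.push(cur_tok);
    }
    Ok(tokens)
}

fn tokenize_str(iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    let mut buffer = String::new();
    loop {
        match iter.next() {
            Some('"') => break,
            Some(c) => buffer.push(c),
            None => return Err(TokenizeError("Unclosed quote".to_string())),
        }
    }
    Ok(Token::StrLiteral(buffer))
}

fn tokenize_operator(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    let mut buffer = String::new();
    buffer.push(c);
    // Greedily consume punctuation so e.g. `<=` lexes as one operator.
    while iter.peek().map_or(false, |x| !x.is_alphanumeric() && !x.is_whitespace()) {
        buffer.push(iter.next().unwrap());
    }
    Ok(Token::Operator(Op(buffer)))
}

fn tokenize_number(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    let mut buffer = String::new();
    buffer.push(c);
    while iter.peek().map_or(false, |x| is_digit(x) || *x == '.') {
        buffer.push(iter.next().unwrap());
    }
    // "2.4.5" accumulates into one buffer and fails to parse, as the test expects.
    buffer
        .parse::<f64>()
        .map(Token::NumLiteral)
        .map_err(|_| TokenizeError("Failed to parse number".to_string()))
}

fn tokenize_identifier(c: char, iter: &mut Peekable<Chars>) -> Result<Token, TokenizeError> {
    let mut buffer = String::new();
    buffer.push(c);
    while iter.peek().map_or(false, |x| x.is_alphanumeric()) {
        buffer.push(iter.next().unwrap());
    }
    Ok(Token::Identifier(buffer))
}

fn main() {
    println!("{:?}", tokenize("let a = 3\n"));
    assert!(tokenize("2.4.5").is_err());
}

Splitting the match arms into named helpers keeps each scanning loop's buffer local to one function, which is what lets the rewrite replace the deeply nested match/else chains of the old tokenize with flat early returns.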