use itertools::Itertools; use std::collections::HashMap; use std::rc::Rc; use std::iter::{Iterator, Peekable}; use std::fmt; #[derive(Debug, PartialEq, Clone)] pub enum TokenKind { Newline, Semicolon, LParen, RParen, LSquareBracket, RSquareBracket, LAngleBracket, RAngleBracket, LCurlyBrace, RCurlyBrace, Pipe, Backslash, Comma, Period, Colon, Underscore, Slash, Equals, Operator(Rc), DigitGroup(Rc), HexLiteral(Rc), BinNumberSigil, StrLiteral(Rc), Identifier(Rc), Keyword(Kw), EOF, Error(String), } use self::TokenKind::*; impl fmt::Display for TokenKind { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { &Operator(ref s) => write!(f, "Operator({})", **s), &DigitGroup(ref s) => write!(f, "DigitGroup({})", s), &HexLiteral(ref s) => write!(f, "HexLiteral({})", s), &StrLiteral(ref s) => write!(f, "StrLiteral({})", s), &Identifier(ref s) => write!(f, "Identifier({})", s), &Error(ref s) => write!(f, "Error({})", s), other => write!(f, "{:?}", other), } } } #[derive(Debug, Clone, Copy, PartialEq)] pub enum Kw { If, Then, Else, Is, Func, For, While, Const, Let, In, Mut, Return, Alias, Type, SelfType, SelfIdent, Interface, Impl, True, False, Module } lazy_static! { static ref KEYWORDS: HashMap<&'static str, Kw> = hashmap! { "if" => Kw::If, "then" => Kw::Then, "else" => Kw::Else, "is" => Kw::Is, "fn" => Kw::Func, "for" => Kw::For, "while" => Kw::While, "const" => Kw::Const, "let" => Kw::Let, "in" => Kw::In, "mut" => Kw::Mut, "return" => Kw::Return, "alias" => Kw::Alias, "type" => Kw::Type, "Self" => Kw::SelfType, "self" => Kw::SelfIdent, "interface" => Kw::Interface, "impl" => Kw::Impl, "true" => Kw::True, "false" => Kw::False, "module" => Kw::Module, }; } #[derive(Debug, Clone)] pub struct Token { pub kind: TokenKind, pub line_num: usize, pub char_num: usize } impl Token { pub fn get_error(&self) -> Option { match self.kind { TokenKind::Error(ref s) => Some(s.clone()), _ => None, } } pub fn to_string_with_metadata(&self) -> String { format!("{}(L:{},c:{})", self.kind, self.line_num, self.char_num) } pub fn get_kind(&self) -> TokenKind { self.kind.clone() } } const OPERATOR_CHARS: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '@', '^', '|', '~', '`']; fn is_operator(c: &char) -> bool { OPERATOR_CHARS.iter().any(|x| x == c) } type CharData = (usize, usize, char); pub fn tokenize(input: &str) -> Vec { let mut tokens: Vec = Vec::new(); let mut input = input.lines().enumerate() .intersperse((0, "\n")) .flat_map(|(line_idx, ref line)| { line.chars().enumerate().map(move |(ch_idx, ch)| (line_idx, ch_idx, ch)) }) .peekable(); while let Some((line_num, char_num, c)) = input.next() { let cur_tok_kind = match c { '/' => match input.peek().map(|t| t.2) { Some('/') => { while let Some((_, _, c)) = input.next() { if c == '\n' { break; } } continue; }, Some('*') => { input.next(); let mut comment_level = 1; while let Some((_, _, c)) = input.next() { if c == '*' && input.peek().map(|t| t.2) == Some('/') { input.next(); comment_level -= 1; } else if c == '/' && input.peek().map(|t| t.2) == Some('*') { input.next(); comment_level += 1; } if comment_level == 0 { break; } } continue; }, _ => Slash }, c if c.is_whitespace() && c != '\n' => continue, '\n' => Newline, ';' => Semicolon, ':' => Colon, ',' => Comma, '(' => LParen, ')' => RParen, '{' => LCurlyBrace, '}' => RCurlyBrace, '[' => LSquareBracket, ']' => RSquareBracket, '"' => handle_quote(&mut input), '\\' => Backslash, c if c.is_digit(10) => handle_digit(c, &mut input), c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input), c if is_operator(&c) => handle_operator(c, &mut input), unknown => Error(format!("Unexpected character: {}", unknown)), }; tokens.push(Token { kind: cur_tok_kind, line_num, char_num }); } tokens } fn handle_digit(c: char, input: &mut Peekable>) -> TokenKind { if c == '0' && input.peek().map_or(false, |&(_, _, c)| { c == 'x' }) { input.next(); let rest: String = input.peeking_take_while(|&(_, _, ref c)| c.is_digit(16) || *c == '_').map(|(_, _, c)| { c }).collect(); HexLiteral(Rc::new(rest)) } else if c == '0' && input.peek().map_or(false, |&(_, _, c)| { c == 'b' }) { input.next(); BinNumberSigil } else { let mut buf = c.to_string(); buf.extend(input.peeking_take_while(|&(_, _, ref c)| c.is_digit(10)).map(|(_, _, c)| { c })); DigitGroup(Rc::new(buf)) } } fn handle_quote(input: &mut Peekable>) -> TokenKind { let mut buf = String::new(); loop { match input.next().map(|(_, _, c)| { c }) { Some('"') => break, Some('\\') => { let next = input.peek().map(|&(_, _, c)| { c }); if next == Some('n') { input.next(); buf.push('\n') } else if next == Some('"') { input.next(); buf.push('"'); } else if next == Some('t') { input.next(); buf.push('\t'); } }, Some(c) => buf.push(c), None => return TokenKind::Error(format!("Unclosed string")), } } TokenKind::StrLiteral(Rc::new(buf)) } fn handle_alphabetic(c: char, input: &mut Peekable>) -> TokenKind { let mut buf = String::new(); buf.push(c); if c == '_' && input.peek().map(|&(_, _, c)| { !c.is_alphabetic() }).unwrap_or(true) { return TokenKind::Underscore } loop { match input.peek().map(|&(_, _, c)| { c }) { Some(c) if c.is_alphanumeric() || c == '_' => { input.next(); buf.push(c); }, _ => break, } } match KEYWORDS.get(buf.as_str()) { Some(kw) => TokenKind::Keyword(*kw), None => TokenKind::Identifier(Rc::new(buf)), } } fn handle_operator(c: char, input: &mut Peekable>) -> TokenKind { match c { '<' | '>' | '|' | '.' | '=' => { let ref next = input.peek().map(|&(_, _, c)| { c }); if !next.map(|n| { is_operator(&n) }).unwrap_or(false) { return match c { '<' => LAngleBracket, '>' => RAngleBracket, '|' => Pipe, '.' => Period, '=' => Equals, _ => unreachable!(), } } }, _ => (), }; let mut buf = String::new(); if c == '`' { loop { match input.peek().map(|&(_, _, c)| { c }) { Some(c) if c.is_alphabetic() || c == '_' => { input.next(); buf.push(c); }, Some('`') => { input.next(); break; }, _ => break } } } else { buf.push(c); loop { match input.peek().map(|&(_, _, c)| { c }) { Some(c) if is_operator(&c) => { input.next(); buf.push(c); }, _ => break } } } TokenKind::Operator(Rc::new(buf)) } #[cfg(test)] mod schala_tokenizer_tests { use super::*; use super::Kw::*; macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } } macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } } macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } } #[test] fn tokens() { let a = tokenize("let a: A = c ++ d"); let token_kinds: Vec = a.into_iter().map(move |t| t.kind).collect(); assert_eq!(token_kinds, vec![Keyword(Let), ident!("a"), Colon, ident!("A"), LAngleBracket, ident!("B"), RAngleBracket, Equals, ident!("c"), op!("++"), ident!("d")]); } #[test] fn underscores() { let token_kinds: Vec = tokenize("4_8").into_iter().map(move |t| t.kind).collect(); assert_eq!(token_kinds, vec![digit!("4"), Underscore, digit!("8")]); let token_kinds2: Vec = tokenize("aba_yo").into_iter().map(move |t| t.kind).collect(); assert_eq!(token_kinds2, vec![ident!("aba_yo")]); } #[test] fn comments() { let token_kinds: Vec = tokenize("1 + /* hella /* bro */ */ 2").into_iter().map(move |t| t.kind).collect(); assert_eq!(token_kinds, vec![digit!("1"), op!("+"), digit!("2")]); } #[test] fn backtick_operators() { let token_kinds: Vec = tokenize("1 `plus` 2").into_iter().map(move |t| t.kind).collect(); assert_eq!(token_kinds, vec![digit!("1"), op!("plus"), digit!("2")]); } }