schala/schala-lang/language/src/tokenizing.rs

use itertools::Itertools;
use std::collections::HashMap;
use std::rc::Rc;
use std::iter::{Iterator, Peekable};
use std::fmt;
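
/// The kinds of tokens produced by `tokenize`.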
#[derive(Debug, PartialEq, Clone)]
pub enum TokenKind {
    Newline, Semicolon,
    LParen, RParen,
    LSquareBracket, RSquareBracket,
    LAngleBracket, RAngleBracket,
    LCurlyBrace, RCurlyBrace,
    Pipe, Backslash,
    Comma, Period, Colon, Underscore,
    Slash, Equals,
    Operator(Rc<String>),
    DigitGroup(Rc<String>), HexLiteral(Rc<String>), BinNumberSigil,
    StrLiteral(Rc<String>),
    Identifier(Rc<String>),
    Keyword(Kw),
    EOF,
    Error(String),
}

use self::TokenKind::*;

impl fmt::Display for TokenKind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            &Operator(ref s) => write!(f, "Operator({})", **s),
            &DigitGroup(ref s) => write!(f, "DigitGroup({})", s),
            &HexLiteral(ref s) => write!(f, "HexLiteral({})", s),
            &StrLiteral(ref s) => write!(f, "StrLiteral({})", s),
            &Identifier(ref s) => write!(f, "Identifier({})", s),
            &Error(ref s) => write!(f, "Error({})", s),
            other => write!(f, "{:?}", other),
        }
    }
}
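
/// The language's reserved keywords.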
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kw {
    If, Then, Else,
    Is,
    Func,
    For, While,
    Const, Let, In,
    Mut,
    Return,
    Alias, Type, SelfType, SelfIdent,
    Interface, Impl,
    True, False,
    Module,
}
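
/// Maps keyword spellings in source text to their `Kw` variants.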
lazy_static! {
    static ref KEYWORDS: HashMap<&'static str, Kw> =
        hashmap! {
            "if" => Kw::If,
            "then" => Kw::Then,
            "else" => Kw::Else,
            "is" => Kw::Is,
            "fn" => Kw::Func,
            "for" => Kw::For,
            "while" => Kw::While,
            "const" => Kw::Const,
            "let" => Kw::Let,
            "in" => Kw::In,
            "mut" => Kw::Mut,
            "return" => Kw::Return,
            "alias" => Kw::Alias,
            "type" => Kw::Type,
            "Self" => Kw::SelfType,
            "self" => Kw::SelfIdent,
            "interface" => Kw::Interface,
            "impl" => Kw::Impl,
            "true" => Kw::True,
            "false" => Kw::False,
            "module" => Kw::Module,
        };
}
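
/// A token along with the line and column at which it starts.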
#[derive(Debug, Clone)]
pub struct Token {
    pub kind: TokenKind,
    pub line_num: usize,
    pub char_num: usize,
}

impl Token {
    pub fn get_error(&self) -> Option<String> {
        match self.kind {
            TokenKind::Error(ref s) => Some(s.clone()),
            _ => None,
        }
    }

    pub fn to_string_with_metadata(&self) -> String {
        format!("{}(L:{},c:{})", self.kind, self.line_num, self.char_num)
    }

    pub fn get_kind(&self) -> TokenKind {
        self.kind.clone()
    }
}
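
/// Characters that may appear in a user-defined operator.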
const OPERATOR_CHARS: [char; 18] = ['!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '@', '^', '|', '~', '`'];

fn is_operator(c: &char) -> bool {
    OPERATOR_CHARS.iter().any(|x| x == c)
}

type CharData = (usize, usize, char);
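
/// Turns the input source into a sequence of `Token`s, tagging each with the
/// line and column where it starts. Non-newline whitespace and comments
/// produce no tokens.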
pub fn tokenize(input: &str) -> Vec<Token> {
    let mut tokens: Vec<Token> = Vec::new();

    // Flatten the input into (line, column, char) triples; a "\n" pseudo-line is
    // re-inserted between lines so that Newline tokens are produced.
    let mut input = input.lines().enumerate()
        .intersperse((0, "\n"))
        .flat_map(|(line_idx, ref line)| {
            line.chars().enumerate().map(move |(ch_idx, ch)| (line_idx, ch_idx, ch))
        })
        .peekable();

    while let Some((line_num, char_num, c)) = input.next() {
        let cur_tok_kind = match c {
            '/' => match input.peek().map(|t| t.2) {
                Some('/') => {
                    // Line comment: skip the rest of the line.
                    while let Some((_, _, c)) = input.next() {
                        if c == '\n' {
                            break;
                        }
                    }
                    continue;
                },
                Some('*') => {
                    // Block comment: skip until the matching `*/`, allowing nesting.
                    input.next();
                    let mut comment_level = 1;
                    while let Some((_, _, c)) = input.next() {
                        if c == '*' && input.peek().map(|t| t.2) == Some('/') {
                            input.next();
                            comment_level -= 1;
                        } else if c == '/' && input.peek().map(|t| t.2) == Some('*') {
                            input.next();
                            comment_level += 1;
                        }
                        if comment_level == 0 {
                            break;
                        }
                    }
                    continue;
                },
                _ => Slash,
            },
            c if c.is_whitespace() && c != '\n' => continue,
            '\n' => Newline, ';' => Semicolon,
            ':' => Colon, ',' => Comma,
            '(' => LParen, ')' => RParen,
            '{' => LCurlyBrace, '}' => RCurlyBrace,
            '[' => LSquareBracket, ']' => RSquareBracket,
            '"' => handle_quote(&mut input),
            '\\' => Backslash,
            c if c.is_digit(10) => handle_digit(c, &mut input),
            c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input),
            c if is_operator(&c) => handle_operator(c, &mut input),
            unknown => Error(format!("Unexpected character: {}", unknown)),
        };
        tokens.push(Token { kind: cur_tok_kind, line_num, char_num });
    }
    tokens
}
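
/// Lexes a numeric literal: a hex literal after `0x`, the `0b` sigil for
/// binary literals, or a plain group of decimal digits.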
fn handle_digit(c: char, input: &mut Peekable<impl Iterator<Item=CharData>>) -> TokenKind {
    if c == '0' && input.peek().map_or(false, |&(_, _, c)| { c == 'x' }) {
        input.next();
        let rest: String = input.peeking_take_while(|&(_, _, ref c)| c.is_digit(16) || *c == '_').map(|(_, _, c)| { c }).collect();
        HexLiteral(Rc::new(rest))
    } else if c == '0' && input.peek().map_or(false, |&(_, _, c)| { c == 'b' }) {
        input.next();
        BinNumberSigil
    } else {
        let mut buf = c.to_string();
        buf.extend(input.peeking_take_while(|&(_, _, ref c)| c.is_digit(10)).map(|(_, _, c)| { c }));
        DigitGroup(Rc::new(buf))
    }
}
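
/// Lexes a double-quoted string literal, translating the `\n`, `\"`, and `\t`
/// escape sequences; returns an error token if the input ends before the
/// closing quote.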
fn handle_quote(input: &mut Peekable<impl Iterator<Item=CharData>>) -> TokenKind {
    let mut buf = String::new();
    loop {
        match input.next().map(|(_, _, c)| { c }) {
            Some('"') => break,
            Some('\\') => {
                let next = input.peek().map(|&(_, _, c)| { c });
                if next == Some('n') {
                    input.next();
                    buf.push('\n')
                } else if next == Some('"') {
                    input.next();
                    buf.push('"');
                } else if next == Some('t') {
                    input.next();
                    buf.push('\t');
                }
            },
            Some(c) => buf.push(c),
            None => return TokenKind::Error(format!("Unclosed string")),
        }
    }
    TokenKind::StrLiteral(Rc::new(buf))
}
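
/// Lexes an identifier or keyword starting at `c`; an underscore not followed
/// by a letter becomes the `Underscore` token.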
fn handle_alphabetic(c: char, input: &mut Peekable<impl Iterator<Item=CharData>>) -> TokenKind {
    let mut buf = String::new();
    buf.push(c);
    if c == '_' && input.peek().map(|&(_, _, c)| { !c.is_alphabetic() }).unwrap_or(true) {
        return TokenKind::Underscore
    }

    loop {
        match input.peek().map(|&(_, _, c)| { c }) {
            Some(c) if c.is_alphanumeric() || c == '_' => {
                input.next();
                buf.push(c);
            },
            _ => break,
        }
    }

    match KEYWORDS.get(buf.as_str()) {
        Some(kw) => TokenKind::Keyword(*kw),
        None => TokenKind::Identifier(Rc::new(buf)),
    }
}
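
/// Lexes an operator. A single `<`, `>`, `|`, `.`, or `=` not followed by
/// another operator character gets its own token kind, and a backtick-quoted
/// name becomes a named operator.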
fn handle_operator(c: char, input: &mut Peekable<impl Iterator<Item=CharData>>) -> TokenKind {
    match c {
        '<' | '>' | '|' | '.' | '=' => {
            let ref next = input.peek().map(|&(_, _, c)| { c });
            if !next.map(|n| { is_operator(&n) }).unwrap_or(false) {
                return match c {
                    '<' => LAngleBracket,
                    '>' => RAngleBracket,
                    '|' => Pipe,
                    '.' => Period,
                    '=' => Equals,
                    _ => unreachable!(),
                }
            }
        },
        _ => (),
    };

    let mut buf = String::new();
    if c == '`' {
        // Backtick operator: collect an identifier-like name up to the closing backtick.
        loop {
            match input.peek().map(|&(_, _, c)| { c }) {
                Some(c) if c.is_alphabetic() || c == '_' => {
                    input.next();
                    buf.push(c);
                },
                Some('`') => {
                    input.next();
                    break;
                },
                _ => break,
            }
        }
    } else {
        buf.push(c);
        loop {
            match input.peek().map(|&(_, _, c)| { c }) {
                Some(c) if is_operator(&c) => {
                    input.next();
                    buf.push(c);
                },
                _ => break,
            }
        }
    }
    TokenKind::Operator(Rc::new(buf))
}

#[cfg(test)]
mod schala_tokenizer_tests {
    use super::*;
    use super::Kw::*;

    macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } }
    macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } }
    macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } }

    #[test]
    fn tokens() {
        let a = tokenize("let a: A<B> = c ++ d");
        let token_kinds: Vec<TokenKind> = a.into_iter().map(move |t| t.kind).collect();
        assert_eq!(token_kinds, vec![Keyword(Let), ident!("a"), Colon, ident!("A"),
            LAngleBracket, ident!("B"), RAngleBracket, Equals, ident!("c"), op!("++"), ident!("d")]);
    }

    #[test]
    fn underscores() {
        let token_kinds: Vec<TokenKind> = tokenize("4_8").into_iter().map(move |t| t.kind).collect();
        assert_eq!(token_kinds, vec![digit!("4"), Underscore, digit!("8")]);

        let token_kinds2: Vec<TokenKind> = tokenize("aba_yo").into_iter().map(move |t| t.kind).collect();
        assert_eq!(token_kinds2, vec![ident!("aba_yo")]);
    }

    #[test]
    fn comments() {
        let token_kinds: Vec<TokenKind> = tokenize("1 + /* hella /* bro */ */ 2").into_iter().map(move |t| t.kind).collect();
        assert_eq!(token_kinds, vec![digit!("1"), op!("+"), digit!("2")]);
    }

    #[test]
    fn backtick_operators() {
        let token_kinds: Vec<TokenKind> = tokenize("1 `plus` 2").into_iter().map(move |t| t.kind).collect();
        assert_eq!(token_kinds, vec![digit!("1"), op!("plus"), digit!("2")]);
    }
}