372 lines
10 KiB
Rust
372 lines
10 KiB
Rust
use itertools::Itertools;
|
|
use std::{iter::{Iterator, Peekable}, convert::TryFrom, rc::Rc, fmt};
|
|
use std::convert::TryInto;
|
|
|
|
|
|
/// A position within a single source file.
///
/// The integer widths of the fields bound what can be represented: a file
/// may have up to 2^32 lines of at most 2^16 characters each, which should
/// be plenty big.
#[derive(Debug, Clone, Copy, PartialEq, Default)]
pub struct Location {
    /// Line component of the position.
    pub(crate) line_num: u32,
    /// Character component of the position, counted within the line.
    pub(crate) char_num: u16,
}

/// Renders a location as `line:char`.
impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Location { line_num, char_num } = *self;
        write!(f, "{}:{}", line_num, char_num)
    }
}
|
|
|
|
#[derive(Debug, PartialEq, Clone)]
|
|
pub enum TokenKind {
|
|
Newline, Semicolon,
|
|
|
|
LParen, RParen,
|
|
LSquareBracket, RSquareBracket,
|
|
LAngleBracket, RAngleBracket,
|
|
LCurlyBrace, RCurlyBrace,
|
|
Pipe, Backslash,
|
|
|
|
Comma, Period, Colon, Underscore,
|
|
Slash, Equals,
|
|
|
|
Operator(Rc<String>),
|
|
DigitGroup(Rc<String>), HexLiteral(Rc<String>), BinNumberSigil,
|
|
StrLiteral {
|
|
s: Rc<String>,
|
|
prefix: Option<Rc<String>>
|
|
},
|
|
Identifier(Rc<String>),
|
|
Keyword(Kw),
|
|
|
|
EOF,
|
|
|
|
Error(String),
|
|
}
|
|
use self::TokenKind::*;
|
|
|
|
impl fmt::Display for TokenKind {
|
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
match self {
|
|
&Operator(ref s) => write!(f, "Operator({})", **s),
|
|
&DigitGroup(ref s) => write!(f, "DigitGroup({})", s),
|
|
&HexLiteral(ref s) => write!(f, "HexLiteral({})", s),
|
|
&StrLiteral {ref s, .. } => write!(f, "StrLiteral({})", s),
|
|
&Identifier(ref s) => write!(f, "Identifier({})", s),
|
|
&Error(ref s) => write!(f, "Error({})", s),
|
|
other => write!(f, "{:?}", other),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// The reserved words recognised by the tokenizer.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kw {
    If,
    Then,
    Else,
    Is,
    Func,
    For,
    While,
    Const,
    Let,
    In,
    Mut,
    Return,
    Alias,
    Type,
    SelfType,
    SelfIdent,
    Interface,
    Impl,
    True,
    False,
    Module,
    Import,
}

/// Converts the exact source spelling of a keyword into its [`Kw`] value.
///
/// The comparison is case-sensitive (`Self` and `self` are distinct
/// keywords); any other text yields `Err(())`.
impl TryFrom<&str> for Kw {
    type Error = ();

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        // Source spelling paired with the keyword it denotes.
        const SPELLINGS: &[(&str, Kw)] = &[
            ("if", Kw::If),
            ("then", Kw::Then),
            ("else", Kw::Else),
            ("is", Kw::Is),
            ("fn", Kw::Func),
            ("for", Kw::For),
            ("while", Kw::While),
            ("const", Kw::Const),
            ("let", Kw::Let),
            ("in", Kw::In),
            ("mut", Kw::Mut),
            ("return", Kw::Return),
            ("alias", Kw::Alias),
            ("type", Kw::Type),
            ("Self", Kw::SelfType),
            ("self", Kw::SelfIdent),
            ("interface", Kw::Interface),
            ("impl", Kw::Impl),
            ("true", Kw::True),
            ("false", Kw::False),
            ("module", Kw::Module),
            ("import", Kw::Import),
        ];

        SPELLINGS
            .iter()
            .find(|(spelling, _)| *spelling == value)
            .map(|&(_, kw)| kw)
            .ok_or(())
    }
}
|
|
|
|
#[derive(Debug, Clone, PartialEq)]
|
|
pub struct Token {
|
|
pub kind: TokenKind,
|
|
pub(crate) location: Location,
|
|
}
|
|
|
|
impl Token {
|
|
pub fn to_string_with_metadata(&self) -> String {
|
|
format!("{}({})", self.kind, self.location)
|
|
}
|
|
|
|
pub fn get_kind(&self) -> TokenKind {
|
|
self.kind.clone()
|
|
}
|
|
}
|
|
|
|
/// Characters that can form part of a symbolic operator.
const OPERATOR_CHARS: [char; 18] = [
    '!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '@', '^', '|', '~', '`',
];

/// Reports whether `c` is one of the `OPERATOR_CHARS`.
fn is_operator(c: &char) -> bool {
    OPERATOR_CHARS.contains(c)
}
|
|
|
|
/// One input character as fed to the lexing helpers:
/// `(line index, character index within the line, character)`.
type CharData = (usize, usize, char);
|
|
|
|
pub fn tokenize(input: &str) -> Vec<Token> {
|
|
let mut tokens: Vec<Token> = Vec::new();
|
|
|
|
let mut input = Iterator::intersperse(input.lines().enumerate(), (0, "\n"))
|
|
.flat_map(|(line_idx, ref line)| {
|
|
line.chars().enumerate().map(move |(ch_idx, ch)| (line_idx, ch_idx, ch))
|
|
})
|
|
.peekable();
|
|
|
|
while let Some((line_num, char_num, c)) = input.next() {
|
|
let cur_tok_kind = match c {
|
|
'/' => match input.peek().map(|t| t.2) {
|
|
Some('/') => {
|
|
while let Some((_, _, c)) = input.next() {
|
|
if c == '\n' {
|
|
break;
|
|
}
|
|
}
|
|
continue;
|
|
},
|
|
Some('*') => {
|
|
input.next();
|
|
let mut comment_level = 1;
|
|
while let Some((_, _, c)) = input.next() {
|
|
if c == '*' && input.peek().map(|t| t.2) == Some('/') {
|
|
input.next();
|
|
comment_level -= 1;
|
|
} else if c == '/' && input.peek().map(|t| t.2) == Some('*') {
|
|
input.next();
|
|
comment_level += 1;
|
|
}
|
|
if comment_level == 0 {
|
|
break;
|
|
}
|
|
}
|
|
if comment_level != 0 {
|
|
Error("Unclosed comment".to_string())
|
|
} else {
|
|
continue;
|
|
}
|
|
},
|
|
_ => Slash
|
|
},
|
|
c if c.is_whitespace() && c != '\n' => continue,
|
|
'\n' => Newline, ';' => Semicolon,
|
|
':' => Colon, ',' => Comma,
|
|
'(' => LParen, ')' => RParen,
|
|
'{' => LCurlyBrace, '}' => RCurlyBrace,
|
|
'[' => LSquareBracket, ']' => RSquareBracket,
|
|
'"' => handle_quote(&mut input, None),
|
|
'\\' => Backslash,
|
|
c if c.is_digit(10) => handle_digit(c, &mut input),
|
|
c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input),
|
|
c if is_operator(&c) => handle_operator(c, &mut input),
|
|
unknown => Error(format!("Unexpected character: {}", unknown)),
|
|
};
|
|
let location = Location { line_num: line_num.try_into().unwrap(), char_num: char_num.try_into().unwrap() };
|
|
tokens.push(Token { kind: cur_tok_kind, location });
|
|
}
|
|
tokens
|
|
}
|
|
|
|
fn handle_digit(c: char, input: &mut Peekable<impl Iterator<Item=CharData>>) -> TokenKind {
|
|
if c == '0' && input.peek().map_or(false, |&(_, _, c)| { c == 'x' }) {
|
|
input.next();
|
|
let rest: String = input.peeking_take_while(|&(_, _, ref c)| c.is_digit(16) || *c == '_').map(|(_, _, c)| { c }).collect();
|
|
HexLiteral(Rc::new(rest))
|
|
} else if c == '0' && input.peek().map_or(false, |&(_, _, c)| { c == 'b' }) {
|
|
input.next();
|
|
BinNumberSigil
|
|
} else {
|
|
let mut buf = c.to_string();
|
|
buf.extend(input.peeking_take_while(|&(_, _, ref c)| c.is_digit(10)).map(|(_, _, c)| { c }));
|
|
DigitGroup(Rc::new(buf))
|
|
}
|
|
}
|
|
|
|
fn handle_quote(input: &mut Peekable<impl Iterator<Item=CharData>>, quote_prefix: Option<&str>) -> TokenKind {
|
|
let mut buf = String::new();
|
|
loop {
|
|
match input.next().map(|(_, _, c)| { c }) {
|
|
Some('"') => break,
|
|
Some('\\') => {
|
|
let next = input.peek().map(|&(_, _, c)| { c });
|
|
if next == Some('n') {
|
|
input.next();
|
|
buf.push('\n')
|
|
} else if next == Some('"') {
|
|
input.next();
|
|
buf.push('"');
|
|
} else if next == Some('t') {
|
|
input.next();
|
|
buf.push('\t');
|
|
}
|
|
},
|
|
Some(c) => buf.push(c),
|
|
None => return TokenKind::Error("Unclosed string".to_string()),
|
|
}
|
|
}
|
|
TokenKind::StrLiteral { s: Rc::new(buf), prefix: quote_prefix.map(|s| Rc::new(s.to_string())) }
|
|
}
|
|
|
|
fn handle_alphabetic(c: char, input: &mut Peekable<impl Iterator<Item=CharData>>) -> TokenKind {
|
|
let mut buf = String::new();
|
|
buf.push(c);
|
|
if c == '_' && input.peek().map(|&(_, _, c)| { !c.is_alphabetic() }).unwrap_or(true) {
|
|
return TokenKind::Underscore
|
|
}
|
|
|
|
loop {
|
|
match input.peek().map(|&(_, _, c)| { c }) {
|
|
Some(c) if c == '"' => {
|
|
input.next();
|
|
return handle_quote(input, Some(&buf));
|
|
},
|
|
Some(c) if c.is_alphanumeric() || c == '_' => {
|
|
input.next();
|
|
buf.push(c);
|
|
},
|
|
_ => break,
|
|
}
|
|
}
|
|
|
|
match Kw::try_from(buf.as_str()) {
|
|
Ok(kw) => TokenKind::Keyword(kw),
|
|
Err(()) => TokenKind::Identifier(Rc::new(buf)),
|
|
}
|
|
}
|
|
|
|
fn handle_operator(c: char, input: &mut Peekable<impl Iterator<Item=CharData>>) -> TokenKind {
|
|
match c {
|
|
'<' | '>' | '|' | '.' | '=' => {
|
|
let ref next = input.peek().map(|&(_, _, c)| { c });
|
|
if !next.map(|n| { is_operator(&n) }).unwrap_or(false) {
|
|
return match c {
|
|
'<' => LAngleBracket,
|
|
'>' => RAngleBracket,
|
|
'|' => Pipe,
|
|
'.' => Period,
|
|
'=' => Equals,
|
|
_ => unreachable!(),
|
|
}
|
|
}
|
|
},
|
|
_ => (),
|
|
};
|
|
|
|
let mut buf = String::new();
|
|
|
|
if c == '`' {
|
|
loop {
|
|
match input.peek().map(|&(_, _, c)| { c }) {
|
|
Some(c) if c.is_alphabetic() || c == '_' => {
|
|
input.next();
|
|
buf.push(c);
|
|
},
|
|
Some('`') => {
|
|
input.next();
|
|
break;
|
|
},
|
|
_ => break
|
|
}
|
|
}
|
|
} else {
|
|
buf.push(c);
|
|
loop {
|
|
match input.peek().map(|&(_, _, c)| { c }) {
|
|
Some(c) if is_operator(&c) => {
|
|
input.next();
|
|
buf.push(c);
|
|
},
|
|
_ => break
|
|
}
|
|
}
|
|
}
|
|
TokenKind::Operator(Rc::new(buf))
|
|
}
|
|
|
|
#[cfg(test)]
mod schala_tokenizer_tests {
    use super::*;
    use super::Kw::*;

    // Shorthand constructors for the text-carrying token kinds.
    macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } }
    macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } }
    macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } }

    /// Tokenizes `input` and keeps only the kinds, discarding locations.
    fn token_kinds(input: &str) -> Vec<TokenKind> {
        let mut kinds = Vec::new();
        for tok in tokenize(input) {
            kinds.push(tok.kind);
        }
        kinds
    }

    #[test]
    fn tokens() {
        let expected = vec![
            Keyword(Let),
            ident!("a"),
            Colon,
            ident!("A"),
            LAngleBracket,
            ident!("B"),
            RAngleBracket,
            Equals,
            ident!("c"),
            op!("++"),
            ident!("d"),
        ];
        assert_eq!(token_kinds("let a: A<B> = c ++ d"), expected);
    }

    #[test]
    fn underscores() {
        // An underscore between digits is its own token.
        assert_eq!(token_kinds("4_8"), vec![digit!("4"), Underscore, digit!("8")]);

        // Inside a word it is part of the identifier.
        assert_eq!(token_kinds("aba_yo"), vec![ident!("aba_yo")]);
    }

    #[test]
    fn comments() {
        // Properly nested block comments vanish entirely.
        let nested = token_kinds("1 + /* hella /* bro */ */ 2");
        assert_eq!(nested, vec![digit!("1"), op!("+"), digit!("2")]);

        // A missing closer turns the rest of the input into an error token.
        let unclosed = token_kinds("1 + /* hella /* bro */ 2");
        assert_eq!(unclosed, vec![digit!("1"), op!("+"), Error("Unclosed comment".to_string())]);

        //TODO not sure if I want this behavior
        let stray_closer = token_kinds("1 + /* hella */ bro */ 2");
        let expected = vec![
            digit!("1"),
            op!("+"),
            ident!("bro"),
            op!("*"),
            Slash,
            digit!("2"),
        ];
        assert_eq!(stray_closer, expected);
    }

    #[test]
    fn backtick_operators() {
        assert_eq!(token_kinds("1 `plus` 2"), vec![digit!("1"), op!("plus"), digit!("2")]);
    }

    #[test]
    fn string_literals() {
        let plain = token_kinds(r#""some string""#);
        let expected = vec![StrLiteral { s: Rc::new("some string".to_string()), prefix: None }];
        assert_eq!(plain, expected);

        let prefixed = token_kinds(r#"b"some bytestring""#);
        let expected = vec![StrLiteral {
            s: Rc::new("some bytestring".to_string()),
            prefix: Some(Rc::new("b".to_string())),
        }];
        assert_eq!(prefixed, expected);

        let escaped = token_kinds(r#""Do \n \" escapes work\t""#);
        let expected = vec![StrLiteral {
            s: Rc::new("Do \n \" escapes work\t".to_string()),
            prefix: None,
        }];
        assert_eq!(escaped, expected);
    }
}
|