2018-02-23 01:58:06 -08:00
use itertools ::Itertools ;
use std ::collections ::HashMap ;
use std ::rc ::Rc ;
2018-03-03 00:28:52 -08:00
use std ::iter ::{ Iterator , Peekable } ;
2018-03-02 22:11:25 -08:00
use std ::fmt ;
2018-02-23 01:58:06 -08:00
#[ derive(Debug, PartialEq, Clone) ]
pub enum TokenType {
Newline , Semicolon ,
LParen , RParen ,
LSquareBracket , RSquareBracket ,
LAngleBracket , RAngleBracket ,
LCurlyBrace , RCurlyBrace ,
Pipe ,
Comma , Period , Colon , Underscore ,
2018-03-17 19:12:58 -07:00
Slash ,
2018-02-23 01:58:06 -08:00
Operator ( Rc < String > ) ,
DigitGroup ( Rc < String > ) , HexLiteral ( Rc < String > ) , BinNumberSigil ,
StrLiteral ( Rc < String > ) ,
Identifier ( Rc < String > ) ,
Keyword ( Kw ) ,
EOF ,
Error ( String ) ,
}
use self ::TokenType ::* ;
2018-03-02 22:11:25 -08:00
impl fmt ::Display for TokenType {
fn fmt ( & self , f : & mut fmt ::Formatter ) -> fmt ::Result {
match self {
& Operator ( ref s ) = > write! ( f , " Operator({}) " , * * s ) ,
& DigitGroup ( ref s ) = > write! ( f , " DigitGroup({}) " , s ) ,
& HexLiteral ( ref s ) = > write! ( f , " HexLiteral({}) " , s ) ,
& StrLiteral ( ref s ) = > write! ( f , " StrLiteral({}) " , s ) ,
& Identifier ( ref s ) = > write! ( f , " Identifier({}) " , s ) ,
& Error ( ref s ) = > write! ( f , " Error({}) " , s ) ,
other = > write! ( f , " {:?} " , other ) ,
}
}
}
2018-02-23 01:58:06 -08:00
/// Reserved words of the language.
///
/// The source spelling of each keyword lives in the `KEYWORDS` table; the
/// variant names here are the internal identifiers only.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kw {
    // Control flow
    If,
    Else,
    Func,
    For,
    Match,
    // Bindings
    Var,
    Const,
    Let,
    In,
    Return,
    // Types and `self`
    Alias,
    Type,
    SelfType,
    SelfIdent,
    Trait,
    Impl,
    // Boolean literals
    True,
    False,
    // Modules
    Module,
}
// Reserved words, keyed by their exact source spelling.
//
// Keys must not carry any surrounding whitespace: the identifier lexer looks up
// the text it has just collected, which consists only of alphanumeric characters
// (optionally led by `_`), so a padded key such as " if " could never match and
// the word would be lexed as a plain identifier instead.
lazy_static! {
    static ref KEYWORDS: HashMap<&'static str, Kw> = hashmap! {
        "if" => Kw::If,
        "else" => Kw::Else,
        "fn" => Kw::Func,
        "for" => Kw::For,
        "match" => Kw::Match,
        "var" => Kw::Var,
        "const" => Kw::Const,
        "let" => Kw::Let,
        "in" => Kw::In,
        "return" => Kw::Return,
        "alias" => Kw::Alias,
        "type" => Kw::Type,
        "Self" => Kw::SelfType,
        "self" => Kw::SelfIdent,
        "trait" => Kw::Trait,
        "impl" => Kw::Impl,
        "true" => Kw::True,
        "false" => Kw::False,
        "module" => Kw::Module,
    };
}
2018-03-02 00:42:52 -08:00
#[ derive(Debug, Clone) ]
2018-02-23 01:58:06 -08:00
pub struct Token {
pub token_type : TokenType ,
2018-03-02 15:21:48 -08:00
pub offset : ( usize , usize ) ,
2018-02-23 01:58:06 -08:00
}
impl Token {
pub fn get_error ( & self ) -> Option < & String > {
match self . token_type {
TokenType ::Error ( ref s ) = > Some ( s ) ,
_ = > None ,
}
}
2018-03-02 22:11:25 -08:00
pub fn to_string_with_metadata ( & self ) -> String {
format! ( " {} (L: {} ,c: {} ) " , self . token_type , self . offset . 0 , self . offset . 1 )
}
2018-02-23 01:58:06 -08:00
}
2018-03-17 19:12:58 -07:00
/// Characters that may appear inside an operator token.
///
/// `/` is deliberately absent: the tokenizer treats it separately so that it
/// can recognise `//` and `/* */` comments.
const OPERATOR_CHARS: [char; 18] = [
    '!', '$', '%', '&', '*', '+', '-', '.', ':', '<', '>', '=', '?', '@', '^', '|', '~', '`',
];

/// Returns `true` when `c` is one of the characters in `OPERATOR_CHARS`.
fn is_operator(c: &char) -> bool {
    OPERATOR_CHARS.contains(c)
}
2018-03-02 15:15:12 -08:00
/// Peekable stream of `(line index, character index, character)` triples; this is
/// the input type shared by the `handle_*` helper functions.
type CharIter < I : Iterator < Item = ( usize , usize , char ) > > = Peekable < I > ;
2018-03-02 02:57:04 -08:00
2018-02-23 01:58:06 -08:00
pub fn tokenize ( input : & str ) -> Vec < Token > {
let mut tokens : Vec < Token > = Vec ::new ( ) ;
2018-03-02 02:57:04 -08:00
2018-03-02 15:15:12 -08:00
let mut input = input . lines ( ) . enumerate ( )
2018-03-02 02:57:04 -08:00
. flat_map ( | ( line_idx , ref line ) | {
line . chars ( ) . enumerate ( ) . map ( move | ( ch_idx , ch ) | ( line_idx , ch_idx , ch ) )
} ) . peekable ( ) ;
2018-03-02 15:21:48 -08:00
while let Some ( ( line_idx , ch_idx , c ) ) = input . next ( ) {
2018-02-23 01:58:06 -08:00
let cur_tok_type = match c {
2018-03-17 22:25:43 -07:00
'/' = > match input . peek ( ) . map ( | t | t . 2 ) {
Some ( '/' ) = > {
2018-03-02 15:15:12 -08:00
while let Some ( ( _ , _ , c ) ) = input . next ( ) {
2018-02-23 01:58:06 -08:00
if c = = '\n' {
break ;
}
}
2018-03-17 19:12:58 -07:00
continue ;
} ,
2018-03-17 22:25:43 -07:00
Some ( '*' ) = > {
input . next ( ) ;
let mut comment_level = 1 ;
while let Some ( ( _ , _ , c ) ) = input . next ( ) {
if c = = '*' & & input . peek ( ) . map ( | t | t . 2 ) = = Some ( '/' ) {
input . next ( ) ;
comment_level - = 1 ;
} else if c = = '/' & & input . peek ( ) . map ( | t | t . 2 ) = = Some ( '*' ) {
input . next ( ) ;
comment_level + = 1 ;
}
if comment_level = = 0 {
break ;
}
}
continue ;
2018-03-17 19:12:58 -07:00
} ,
_ = > Slash
2018-02-23 01:58:06 -08:00
} ,
c if c . is_whitespace ( ) & & c ! = '\n' = > continue ,
'\n' = > Newline , ';' = > Semicolon ,
':' = > Colon , ',' = > Comma ,
'(' = > LParen , ')' = > RParen ,
'{' = > LCurlyBrace , '}' = > RCurlyBrace ,
'[' = > LSquareBracket , ']' = > RSquareBracket ,
'"' = > handle_quote ( & mut input ) ,
c if c . is_digit ( 10 ) = > handle_digit ( c , & mut input ) ,
c if c . is_alphabetic ( ) | | c = = '_' = > handle_alphabetic ( c , & mut input ) , //TODO I'll probably have to rewrite this if I care about types being uppercase, also type parameterization
c if is_operator ( & c ) = > handle_operator ( c , & mut input ) ,
unknown = > Error ( format! ( " Unexpected character: {} " , unknown ) ) ,
} ;
2018-03-02 15:21:48 -08:00
tokens . push ( Token { token_type : cur_tok_type , offset : ( line_idx , ch_idx ) } ) ;
2018-02-23 01:58:06 -08:00
}
tokens
}
2018-03-02 15:15:12 -08:00
fn handle_digit < I : Iterator < Item = ( usize , usize , char ) > > ( c : char , input : & mut CharIter < I > ) -> TokenType {
if c = = '0' & & input . peek ( ) . map_or ( false , | & ( _ , _ , c ) | { c = = 'x' } ) {
2018-02-23 01:58:06 -08:00
input . next ( ) ;
2018-03-02 15:15:12 -08:00
let rest : String = input . peeking_take_while ( | & ( _ , _ , ref c ) | c . is_digit ( 16 ) | | * c = = '_' ) . map ( | ( _ , _ , c ) | { c } ) . collect ( ) ;
2018-02-23 01:58:06 -08:00
HexLiteral ( Rc ::new ( rest ) )
2018-03-02 15:15:12 -08:00
} else if c = = '0' & & input . peek ( ) . map_or ( false , | & ( _ , _ , c ) | { c = = 'b' } ) {
2018-02-23 01:58:06 -08:00
input . next ( ) ;
BinNumberSigil
} else {
let mut buf = c . to_string ( ) ;
2018-03-02 15:15:12 -08:00
buf . extend ( input . peeking_take_while ( | & ( _ , _ , ref c ) | c . is_digit ( 10 ) ) . map ( | ( _ , _ , c ) | { c } ) ) ;
2018-02-23 01:58:06 -08:00
DigitGroup ( Rc ::new ( buf ) )
}
}
2018-03-02 15:15:12 -08:00
fn handle_quote < I : Iterator < Item = ( usize , usize , char ) > > ( input : & mut CharIter < I > ) -> TokenType {
2018-02-23 01:58:06 -08:00
let mut buf = String ::new ( ) ;
loop {
2018-03-02 15:15:12 -08:00
match input . next ( ) . map ( | ( _ , _ , c ) | { c } ) {
2018-02-23 01:58:06 -08:00
Some ( '"' ) = > break ,
Some ( '\\' ) = > {
2018-03-02 15:15:12 -08:00
let next = input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) ;
2018-02-23 01:58:06 -08:00
if next = = Some ( 'n' ) {
input . next ( ) ;
buf . push ( '\n' )
} else if next = = Some ( '"' ) {
input . next ( ) ;
buf . push ( '"' ) ;
} else if next = = Some ( 't' ) {
input . next ( ) ;
buf . push ( '\t' ) ;
}
} ,
Some ( c ) = > buf . push ( c ) ,
None = > return TokenType ::Error ( format! ( " Unclosed string " ) ) ,
}
}
TokenType ::StrLiteral ( Rc ::new ( buf ) )
}
2018-03-02 15:15:12 -08:00
fn handle_alphabetic < I : Iterator < Item = ( usize , usize , char ) > > ( c : char , input : & mut CharIter < I > ) -> TokenType {
2018-02-23 01:58:06 -08:00
let mut buf = String ::new ( ) ;
buf . push ( c ) ;
2018-03-02 15:15:12 -08:00
if c = = '_' & & input . peek ( ) . map ( | & ( _ , _ , c ) | { ! c . is_alphabetic ( ) } ) . unwrap_or ( true ) {
2018-02-23 01:58:06 -08:00
return TokenType ::Underscore
}
loop {
2018-03-02 15:15:12 -08:00
match input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) {
2018-02-23 01:58:06 -08:00
Some ( c ) if c . is_alphanumeric ( ) = > {
input . next ( ) ;
buf . push ( c ) ;
} ,
_ = > break ,
}
}
match KEYWORDS . get ( buf . as_str ( ) ) {
Some ( kw ) = > TokenType ::Keyword ( * kw ) ,
None = > TokenType ::Identifier ( Rc ::new ( buf ) ) ,
}
}
2018-03-02 15:15:12 -08:00
fn handle_operator < I : Iterator < Item = ( usize , usize , char ) > > ( c : char , input : & mut CharIter < I > ) -> TokenType {
2018-02-23 01:58:06 -08:00
match c {
'<' | '>' | '|' | '.' = > {
2018-03-02 15:15:12 -08:00
let ref next = input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) ;
2018-02-23 01:58:06 -08:00
if ! next . map ( | n | { is_operator ( & n ) } ) . unwrap_or ( false ) {
return match c {
'<' = > LAngleBracket ,
'>' = > RAngleBracket ,
'|' = > Pipe ,
'.' = > Period ,
_ = > unreachable! ( ) ,
}
}
} ,
_ = > ( ) ,
} ;
let mut buf = String ::new ( ) ;
buf . push ( c ) ;
loop {
2018-03-02 15:15:12 -08:00
match input . peek ( ) . map ( | & ( _ , _ , c ) | { c } ) {
2018-02-23 01:58:06 -08:00
Some ( c ) if is_operator ( & c ) = > {
input . next ( ) ;
buf . push ( c ) ;
} ,
_ = > break
}
}
TokenType ::Operator ( Rc ::new ( buf ) )
}
#[cfg(test)]
mod schala_tokenizer_tests {
    use super::*;
    use super::Kw::*;

    // Shorthand constructors for the expected tokens. The text passed in must be
    // exactly what the tokenizer stores: whitespace is skipped during lexing, so
    // a padded payload such as " a " can never equal a real token.
    macro_rules! digit { ($ident:expr) => { DigitGroup(Rc::new($ident.to_string())) } }
    macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } }
    macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } }

    /// Tokenizes `src` and keeps only the token types, dropping the offsets.
    fn token_types(src: &str) -> Vec<TokenType> {
        tokenize(src).into_iter().map(|t| t.token_type).collect()
    }

    #[test]
    fn tokens() {
        let token_types = token_types(" let a: A<B> = c ++ d ");
        assert_eq!(token_types, vec![Keyword(Let), ident!("a"), Colon, ident!("A"),
            LAngleBracket, ident!("B"), RAngleBracket, op!("="), ident!("c"), op!("++"), ident!("d")]);
    }

    #[test]
    fn underscores() {
        let token_types = token_types(" 4_8 ");
        assert_eq!(token_types, vec![digit!("4"), Underscore, digit!("8")]);
    }

    #[test]
    fn comments() {
        let token_types = token_types(" 1 + /* hella /* bro */ */ 2 ");
        assert_eq!(token_types, vec![digit!("1"), op!("+"), digit!("2")]);
    }
}