schala/src/schala_lang/parsing.rs

310 lines
8.2 KiB
Rust
Raw Normal View History

2017-09-06 05:09:20 -07:00
extern crate itertools;
2017-09-07 23:43:04 -07:00
use language::{ParseError};
2017-09-07 19:38:22 -07:00
use std::collections::HashMap;
2017-08-29 05:08:09 -07:00
use std::rc::Rc;
2017-09-06 05:09:20 -07:00
use std::iter::{Enumerate, Peekable};
use self::itertools::Itertools;
use std::str::Chars;
2017-08-29 04:27:07 -07:00
#[derive(Debug, PartialEq)]
2017-09-04 12:17:20 -07:00
pub enum TokenType {
2017-09-06 05:09:20 -07:00
Newline, Semicolon,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
LParen, RParen,
LSquareBracket, RSquareBracket,
LAngleBracket, RAngleBracket,
LCurlyBrace, RCurlyBrace,
2017-09-07 22:29:23 -07:00
Pipe,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
Comma, Period, Colon, Underscore,
2017-09-04 12:17:20 -07:00
2017-09-06 05:09:20 -07:00
Operator(Rc<String>),
DigitGroup(Rc<String>), HexNumberSigil, BinNumberSigil,
2017-08-29 05:08:09 -07:00
StrLiteral(Rc<String>),
Identifier(Rc<String>),
2017-09-04 12:17:20 -07:00
Keyword(Kw),
2017-09-06 05:09:20 -07:00
Error(String),
2017-08-29 05:08:09 -07:00
}
/// Reserved words of the language.
///
/// The source spelling of each keyword is defined by the `KEYWORDS` table.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kw {
    /// `if`
    If,
    /// `else`
    Else,
    /// `fn`
    Func,
    /// `for`
    For,
    /// `match`
    Match,
    /// `var`
    Var,
    /// `const`
    Const,
    /// `let`
    Let,
    /// `in`
    In,
    /// `alias`
    Alias,
    /// `type`
    Type,
    /// `Self`
    SelfType,
    /// `self`
    SelfIdent,
    /// `trait`
    Trait,
    /// `impl`
    Impl,
    /// `true`
    True,
    /// `false`
    False,
}
2017-09-07 19:38:22 -07:00
lazy_static! {
    /// Maps the source spelling of every reserved word to its [`Kw`] value.
    /// Lookups are case-sensitive: `Self` and `self` are distinct keywords.
    static ref KEYWORDS: HashMap<&'static str, Kw> = hashmap! {
        // Control flow
        "if" => Kw::If,
        "else" => Kw::Else,
        "for" => Kw::For,
        "match" => Kw::Match,
        "in" => Kw::In,
        // Declarations and bindings
        "fn" => Kw::Func,
        "var" => Kw::Var,
        "const" => Kw::Const,
        "let" => Kw::Let,
        // Types
        "alias" => Kw::Alias,
        "type" => Kw::Type,
        "Self" => Kw::SelfType,
        "self" => Kw::SelfIdent,
        "trait" => Kw::Trait,
        "impl" => Kw::Impl,
        // Boolean literals
        "true" => Kw::True,
        "false" => Kw::False,
    };
}
2017-09-04 12:17:20 -07:00
#[derive(Debug)]
pub struct Token {
token_type: TokenType,
2017-09-06 05:09:20 -07:00
offset: usize,
2017-09-04 12:17:20 -07:00
}
impl Token {
pub fn get_error(&self) -> Option<&String> {
match self.token_type {
TokenType::Error(ref s) => Some(s),
_ => None,
}
}
}
2017-09-06 05:09:20 -07:00
/// Returns `true` when `c` is one of the decimal digits `0` through `9`.
fn is_digit(c: &char) -> bool {
    c.to_digit(10).is_some()
}
/// The character stream shared by the tokenizer helpers: yields
/// `(index, char)` pairs, where `index` counts `char`s (not bytes), and
/// supports one item of lookahead via `peek`.
type CharIter<'a> = Peekable<Enumerate<Chars<'a>>>;
pub fn tokenize(input: &str) -> Vec<Token> {
2017-09-06 05:09:20 -07:00
use self::TokenType::*;
let mut tokens: Vec<Token> = Vec::new();
let mut input: CharIter = input.chars().enumerate().peekable();
while let Some((idx, c)) = input.next() {
let cur_tok_type = match c {
'#' => {
if let Some(&(_, '{')) = input.peek() {
} else {
while let Some((_, c)) = input.next() {
if c == '\n' {
break;
}
}
}
continue;
},
2017-09-07 22:29:23 -07:00
c if c.is_whitespace() && c != '\n' => continue,
2017-09-06 05:09:20 -07:00
'\n' => Newline, ';' => Semicolon,
2017-09-06 09:42:29 -07:00
':' => Colon, ',' => Comma, '.' => Period,
2017-09-06 05:09:20 -07:00
'(' => LParen, ')' => RParen,
'{' => LCurlyBrace, '}' => RCurlyBrace,
'<' => LAngleBracket, '>' => RAngleBracket,
'[' => LSquareBracket, ']' => RSquareBracket,
2017-09-07 22:29:23 -07:00
'|' => Pipe,
2017-09-06 09:42:29 -07:00
'"' => handle_quote(&mut input),
2017-09-06 05:09:20 -07:00
c if is_digit(&c) => handle_digit(c, &mut input),
2017-09-08 01:33:27 -07:00
c if c.is_alphabetic() || c == '_' => handle_alphabetic(c, &mut input), //TODO I'll probably have to rewrite this if I care about types being uppercase, also type parameterization
2017-09-06 09:42:29 -07:00
c => handle_operator(c, &mut input),
2017-09-06 05:09:20 -07:00
};
tokens.push(Token { token_type: cur_tok_type, offset: idx });
}
tokens
2017-09-06 05:09:20 -07:00
}
fn handle_digit(c: char, input: &mut CharIter) -> TokenType {
use self::TokenType::*;
if c == '0' && input.peek().map_or(false, |&(_, c)| { c == 'x' }) {
input.next();
HexNumberSigil
} else if c == '0' && input.peek().map_or(false, |&(_, c)| { c == 'b' }) {
input.next();
BinNumberSigil
} else {
let mut buf = c.to_string();
buf.extend(input.peeking_take_while(|&(_, ref c)| is_digit(c)).map(|(_, c)| { c }));
DigitGroup(Rc::new(buf))
}
2017-08-29 05:08:09 -07:00
}
2017-09-06 09:42:29 -07:00
fn handle_quote(input: &mut CharIter) -> TokenType {
2017-09-06 16:52:49 -07:00
let mut buf = String::new();
2017-09-07 00:18:36 -07:00
loop {
match input.next().map(|(_, c)| { c }) {
Some('"') => break,
Some('\\') => {
let next = input.peek().map(|&(_, c)| { c });
if next == Some('n') {
input.next();
buf.push('\n')
} else if next == Some('"') {
input.next();
buf.push('"');
} else if next == Some('t') {
input.next();
buf.push('\t');
}
},
Some(c) => buf.push(c),
None => return TokenType::Error(format!("Unclosed string")),
2017-09-06 16:52:49 -07:00
}
}
TokenType::StrLiteral(Rc::new(buf))
2017-09-06 09:42:29 -07:00
}
fn handle_alphabetic(c: char, input: &mut CharIter) -> TokenType {
2017-09-07 19:38:22 -07:00
let mut buf = String::new();
buf.push(c);
2017-09-08 01:33:27 -07:00
if c == '_' && input.peek().map(|&(_, c)| { !c.is_alphabetic() }).unwrap_or(true) {
return TokenType::Identifier(Rc::new(format!("_")))
}
2017-09-07 19:38:22 -07:00
loop {
match input.peek().map(|&(_, c)| { c }) {
Some(c) if c.is_alphanumeric() => {
input.next();
buf.push(c);
},
_ => break,
}
}
match KEYWORDS.get(buf.as_str()) {
Some(kw) => TokenType::Keyword(kw.clone()),
None => TokenType::Identifier(Rc::new(buf)),
}
2017-09-06 09:42:29 -07:00
}
fn handle_operator(c: char, input: &mut CharIter) -> TokenType {
2017-09-07 22:29:23 -07:00
let mut buf = String::new();
buf.push(c);
loop {
match input.peek().map(|&(_, c)| { c }) {
Some(c) if !c.is_alphabetic() && !c.is_whitespace() => {
input.next();
buf.push(c);
},
_ => break
}
}
TokenType::Operator(Rc::new(buf))
2017-09-06 09:42:29 -07:00
}
#[cfg(test)]
mod schala_tokenizer_tests {
    use super::*;
    use super::TokenType::*;
    use super::Kw::*;

    // Shorthand constructors for the payload-carrying token kinds.
    macro_rules! ident { ($ident:expr) => { Identifier(Rc::new($ident.to_string())) } }
    macro_rules! op { ($ident:expr) => { Operator(Rc::new($ident.to_string())) } }

    /// A `let` binding with a type annotation and a binary operator lexes
    /// into the expected sequence of token kinds.
    #[test]
    fn tokens() {
        let expected = vec![
            Keyword(Let),
            ident!("a"),
            Colon,
            ident!("A"),
            LAngleBracket,
            ident!("B"),
            RAngleBracket,
            op!("="),
            ident!("c"),
            op!("++"),
            ident!("d"),
        ];
        let actual: Vec<TokenType> = tokenize("let a: A<B> = c ++ d")
            .into_iter()
            .map(|tok| tok.token_type)
            .collect();
        assert_eq!(actual, expected);
    }
}
2017-09-06 09:42:29 -07:00
2017-08-29 05:08:09 -07:00
/*
2017-08-30 04:28:52 -07:00
Schala EBNF grammar
type alias <name> = <other type>
type <name> = struct { <field> : <type>,* }
type <name> = Variant1 | Variant2(type, type) | Variant3 struct { }
'' = literal, all other symbols are nonterminals
2017-08-29 05:08:09 -07:00
program := (statement delimiter ?)*
2017-08-30 04:28:52 -07:00
delimiter := 'Newline' | ';'
2017-08-29 05:08:09 -07:00
statement := declaration | expression
2017-08-30 04:28:52 -07:00
declaration := module | function | type_decl
type_decl := 'type' type_format
type_format := 'alias' '=' type | type_constructor
type_constructor := capital_ident '=' type_rhs
type_rhs := struct_decl | type_variant ('|' type_variant)*
struct_decl := 'struct' '{' (ident ':' type)* '}'
type_variant := capital_ident | tuple_type | capital_ident struct_decl
tuple_type := // something like Variant(a,b)
type := // something like Type[A[b]]
ascription := expression (':' type)+
function := 'fn' prototype '{' (statement)* '}'
prototype := identifier '(' identlist ')'
identlist := identifier (',' identifier)* | ε
2017-08-29 05:08:09 -07:00
declaration := FN prototype LCurlyBrace (statement)* RCurlyBrace
prototype := identifier LParen identlist RParen
identlist := Ident (Comma Ident)* | ε
exprlist := Expression (Comma Expression)* | ε
itemlist := Ident COLON Expression (Comma Ident COLON Expression)* | ε
expression := postop_expression (op postop_expression)*
postop_expression := primary_expression postop
primary_expression := number_expr | String | identifier_expr | paren_expr | conditional_expr | while_expr | lambda_expr | list_expr | struct_expr
number_expr := (PLUS | MINUS ) number_expr | Number
identifier_expr := call_expression | Variable
list_expr := LSquareBracket exprlist RSquareBracket
struct_expr := LCurlyBrace itemlist RCurlyBrace
call_expression := Identifier LParen exprlist RParen
while_expr := WHILE primary_expression LCurlyBrace (expression delimiter)* RCurlyBrace
paren_expr := LParen expression RParen
conditional_expr := IF expression LCurlyBrace (expression delimiter)* RCurlyBrace (LCurlyBrace (expression delimiter)* RCurlyBrace)?
lambda_expr := FN LParen identlist RParen LCurlyBrace (expression delimiter)* RCurlyBrace
lambda_call := | LParen exprlist RParen
postop := ε | LParen exprlist RParen | LBracket expression RBracket
op := '+', '-', etc.
*/
2017-09-08 16:42:42 -07:00
/// Parser state: the token stream to be consumed.
struct Parser {
    /// Tokens to parse, presumably the output of `tokenize` (nothing in this
    /// file constructs a `Parser` yet).
    tokens: Vec<Token>,
}
2017-08-29 04:27:07 -07:00
#[derive(Debug)]
2017-09-08 16:42:42 -07:00
pub struct AST(Vec<Statement>);
/// One statement of a program: either an expression or a declaration.
#[derive(Debug, PartialEq)]
pub enum Statement {
    /// An expression used as a statement.
    Expression(Expression),
    /// A declaration; see [`Declaration`].
    Declaration(Declaration),
}
/// The kinds of declaration. The variants carry no data yet.
#[derive(Debug, PartialEq)]
pub enum Declaration {
    /// A function declaration.
    FuncDecl,
    /// A type declaration.
    TypeDecl,
}
/// Expression nodes. Only numeric literals exist so far.
#[derive(Debug, PartialEq)]
pub enum Expression {
    /// An unsigned integer literal.
    UnsignedIntLiteral(u64),
    /// A signed integer literal.
    SignedIntLiteral(i64),
    /// A floating-point literal.
    FloatLiteral(f64),
}
pub fn parse(input: Vec<Token>) -> Result<AST, ParseError> {
use self::Statement::*; use self::Declaration::*; use self::Expression::*;
2017-08-29 05:08:09 -07:00
2017-09-08 16:42:42 -07:00
let statements = vec![Expression(UnsignedIntLiteral(1))];
Ok(AST(statements))
2017-08-29 05:08:09 -07:00
}