Changeset - 9227f244da73
MH - 4 years ago 2021-04-16 09:44:06
henger@cwi.nl
WIP on compiler rearchitecting
4 files changed with 248 insertions and 53 deletions:
src/protocol/ast.rs
 
// TODO: @cleanup, rigorous cleanup of dead code and silly object-oriented
 
//  trait impls where I deem them unfit.
 

	
 
use std::fmt;
 
use std::fmt::{Debug, Display, Formatter};
 
use std::ops::{Index, IndexMut};
 

	
 
use super::arena::{Arena, Id};
 
use crate::collections::StringRef;
 
use crate::protocol::inputsource::*;
 
use crate::protocol::input_source2::{InputPosition2, InputSpan};
 

	
 
/// Global limits to the AST; these should be checked by the lexer and parser.
/// Some are arbitrary.
 
const MAX_LEVEL: usize = 128;
 
const MAX_NAMESPACES: usize = 64;
 

	
 
/// Helper macro that defines a type alias for an AST element ID. In this case
 
/// only used to alias the `Id<T>` types.
 
macro_rules! define_aliased_ast_id {
 
    // Variant where we just define the alias, without any indexing
 
    ($name:ident, $parent:ty) => {
 
        pub type $name = $parent;
 
    };
 
    // Variant where we define the type, and the Index and IndexMut traits
 
    (
 
        $name:ident, $parent:ty, 
 
        index($indexed_type:ty, $indexed_arena:ident)
 
    ) => {
 
        define_aliased_ast_id!($name, $parent);
 
        impl Index<$name> for Heap {
 
            type Output = $indexed_type;
 
            fn index(&self, index: $name) -> &Self::Output {
 
                &self.$indexed_arena[index]
 
            }
 
        }
 

	
 
        impl IndexMut<$name> for Heap {
 
            fn index_mut(&mut self, index: $name) -> &mut Self::Output {
 
                &mut self.$indexed_arena[index]
 
            }
 
        }
 
    };
 
    // Variant where we define type, Index(Mut) traits and an allocation function
 
    (
 
        $name:ident, $parent:ty,
 
        index($indexed_type:ty, $indexed_arena:ident),
 
        alloc($fn_name:ident)
 
    ) => {
 
        define_aliased_ast_id!($name, $parent, index($indexed_type, $indexed_arena));
 
        impl Heap {
 
            pub fn $fn_name(&mut self, f: impl FnOnce($name) -> $indexed_type) -> $name {
 
                self.$indexed_arena.alloc_with_id(|id| f(id))
 
            }
 
        }
 
    };
 
}
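
// Example (sketch): an invocation such as
//
//     define_aliased_ast_id!(PragmaId, Id<Pragma>, index(Pragma, pragmas), alloc(alloc_pragma));
//
// roughly expands to the alias plus the indexing/allocation helpers defined above:
//
//     pub type PragmaId = Id<Pragma>;
//     impl Index<PragmaId> for Heap { /* returns &self.pragmas[index] */ }
//     impl IndexMut<PragmaId> for Heap { /* returns &mut self.pragmas[index] */ }
//     impl Heap {
//         pub fn alloc_pragma(&mut self, f: impl FnOnce(PragmaId) -> Pragma) -> PragmaId {
//             self.pragmas.alloc_with_id(|id| f(id))
//         }
//     }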
 

	
 
/// Helper macro that defines a wrapper type for a particular variant of an AST
 
/// element ID. Only used to define single-wrapping IDs.
 
macro_rules! define_new_ast_id {
 
    // Variant where we just define the new type, without any indexing
 
    ($name:ident, $parent:ty) => {
 
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 
        pub struct $name (pub(crate) $parent);
 

	
 
        impl $name {
 
            pub(crate) fn new_invalid() -> Self     { Self($parent::new_invalid()) }
 
            pub(crate) fn is_invalid(&self) -> bool { self.0.is_invalid() }
 
            pub fn upcast(self) -> $parent          { self.0 }
 
        }
 
    };
 
    // Variant where we define the type, and the Index and IndexMut traits
 
    (
 
        $name:ident, $parent:ty, 
 
        index($indexed_type:ty, $wrapper_type:path, $indexed_arena:ident)
 
    ) => {
 
        define_new_ast_id!($name, $parent);
 
        impl Index<$name> for Heap {
 
            type Output = $indexed_type;
 
            fn index(&self, index: $name) -> &Self::Output {
 
                if let $wrapper_type(v) = &self.$indexed_arena[index.0] {
 
                    v
 
                } else {
 
                    unreachable!()
 
                }
 
            }
 
        }
 

	
 
        impl IndexMut<$name> for Heap {
 
            fn index_mut(&mut self, index: $name) -> &mut Self::Output {
 
                if let $wrapper_type(v) = &mut self.$indexed_arena[index.0] {
 
                    v
 
                } else {
 
                    unreachable!()
 
                }
 
            }
 
        }
 
    };
 
    // Variant where we define the type, the Index and IndexMut traits, and an allocation function
 
    (
 
        $name:ident, $parent:ty, 
 
        index($indexed_type:ty, $wrapper_type:path, $indexed_arena:ident),
 
        alloc($fn_name:ident)
 
    ) => {
 
        define_new_ast_id!($name, $parent, index($indexed_type, $wrapper_type, $indexed_arena));
 
        impl Heap {
 
            pub fn $fn_name(&mut self, f: impl FnOnce($name) -> $indexed_type) -> $name {
 
                $name(
 
                    self.$indexed_arena.alloc_with_id(|id| {
 
                        $wrapper_type(f($name(id)))
 
                    })
 
                )
 
            }
 
        }
 
    }
 
}
 

	
 
define_aliased_ast_id!(RootId, Id<Root>, index(Root, protocol_descriptions), alloc(alloc_protocol_description));
 
define_aliased_ast_id!(PragmaId, Id<Pragma>, index(Pragma, pragmas), alloc(alloc_pragma));
 
define_aliased_ast_id!(ImportId, Id<Import>, index(Import, imports), alloc(alloc_import));
 
define_aliased_ast_id!(ParserTypeId, Id<ParserType>, index(ParserType, parser_types), alloc(alloc_parser_type));
 

	
 
define_aliased_ast_id!(VariableId, Id<Variable>, index(Variable, variables));
 
define_new_ast_id!(ParameterId, VariableId, index(Parameter, Variable::Parameter, variables), alloc(alloc_parameter));
 
define_new_ast_id!(LocalId, VariableId, index(Local, Variable::Local, variables), alloc(alloc_local));
 

	
 
define_aliased_ast_id!(DefinitionId, Id<Definition>, index(Definition, definitions));
 
define_new_ast_id!(StructDefinitionId, DefinitionId, index(StructDefinition, Definition::Struct, definitions), alloc(alloc_struct_definition));
 
define_new_ast_id!(EnumDefinitionId, DefinitionId, index(EnumDefinition, Definition::Enum, definitions), alloc(alloc_enum_definition));
 
define_new_ast_id!(UnionDefinitionId, DefinitionId, index(UnionDefinition, Definition::Union, definitions), alloc(alloc_union_definition));
 
define_new_ast_id!(ComponentDefinitionId, DefinitionId, index(ComponentDefinition, Definition::Component, definitions), alloc(alloc_component_definition));
 
define_new_ast_id!(FunctionDefinitionId, DefinitionId, index(FunctionDefinition, Definition::Function, definitions), alloc(alloc_function_definition));
 

	
 
define_aliased_ast_id!(StatementId, Id<Statement>, index(Statement, statements));
 
define_new_ast_id!(BlockStatementId, StatementId, index(BlockStatement, Statement::Block, statements), alloc(alloc_block_statement));
 
define_new_ast_id!(LocalStatementId, StatementId, index(LocalStatement, Statement::Local, statements), alloc(alloc_local_statement));
 
define_new_ast_id!(MemoryStatementId, LocalStatementId);
 
define_new_ast_id!(ChannelStatementId, LocalStatementId);
 
define_new_ast_id!(SkipStatementId, StatementId, index(SkipStatement, Statement::Skip, statements), alloc(alloc_skip_statement));
 
define_new_ast_id!(LabeledStatementId, StatementId, index(LabeledStatement, Statement::Labeled, statements), alloc(alloc_labeled_statement));
 
define_new_ast_id!(IfStatementId, StatementId, index(IfStatement, Statement::If, statements), alloc(alloc_if_statement));
 
define_new_ast_id!(EndIfStatementId, StatementId, index(EndIfStatement, Statement::EndIf, statements), alloc(alloc_end_if_statement));
 
define_new_ast_id!(WhileStatementId, StatementId, index(WhileStatement, Statement::While, statements), alloc(alloc_while_statement));
 
define_new_ast_id!(EndWhileStatementId, StatementId, index(EndWhileStatement, Statement::EndWhile, statements), alloc(alloc_end_while_statement));
 
define_new_ast_id!(BreakStatementId, StatementId, index(BreakStatement, Statement::Break, statements), alloc(alloc_break_statement));
 
define_new_ast_id!(ContinueStatementId, StatementId, index(ContinueStatement, Statement::Continue, statements), alloc(alloc_continue_statement));
 
define_new_ast_id!(SynchronousStatementId, StatementId, index(SynchronousStatement, Statement::Synchronous, statements), alloc(alloc_synchronous_statement));
 
define_new_ast_id!(EndSynchronousStatementId, StatementId, index(EndSynchronousStatement, Statement::EndSynchronous, statements), alloc(alloc_end_synchronous_statement));
 
define_new_ast_id!(ReturnStatementId, StatementId, index(ReturnStatement, Statement::Return, statements), alloc(alloc_return_statement));
 
define_new_ast_id!(AssertStatementId, StatementId, index(AssertStatement, Statement::Assert, statements), alloc(alloc_assert_statement));
 
define_new_ast_id!(GotoStatementId, StatementId, index(GotoStatement, Statement::Goto, statements), alloc(alloc_goto_statement));
 
define_new_ast_id!(NewStatementId, StatementId, index(NewStatement, Statement::New, statements), alloc(alloc_new_statement));
 
define_new_ast_id!(ExpressionStatementId, StatementId, index(ExpressionStatement, Statement::Expression, statements), alloc(alloc_expression_statement));
 

	
 
define_aliased_ast_id!(ExpressionId, Id<Expression>, index(Expression, expressions));
 
define_new_ast_id!(AssignmentExpressionId, ExpressionId, index(AssignmentExpression, Expression::Assignment, expressions), alloc(alloc_assignment_expression));
 
define_new_ast_id!(BindingExpressionId, ExpressionId, index(BindingExpression, Expression::Binding, expressions), alloc(alloc_binding_expression));
 
define_new_ast_id!(ConditionalExpressionId, ExpressionId, index(ConditionalExpression, Expression::Conditional, expressions), alloc(alloc_conditional_expression));
 
define_new_ast_id!(BinaryExpressionId, ExpressionId, index(BinaryExpression, Expression::Binary, expressions), alloc(alloc_binary_expression));
 
define_new_ast_id!(UnaryExpressionId, ExpressionId, index(UnaryExpression, Expression::Unary, expressions), alloc(alloc_unary_expression));
 
define_new_ast_id!(IndexingExpressionId, ExpressionId, index(IndexingExpression, Expression::Indexing, expressions), alloc(alloc_indexing_expression));
 
define_new_ast_id!(SlicingExpressionId, ExpressionId, index(SlicingExpression, Expression::Slicing, expressions), alloc(alloc_slicing_expression));
 
define_new_ast_id!(SelectExpressionId, ExpressionId, index(SelectExpression, Expression::Select, expressions), alloc(alloc_select_expression));
 
define_new_ast_id!(ArrayExpressionId, ExpressionId, index(ArrayExpression, Expression::Array, expressions), alloc(alloc_array_expression));
 
define_new_ast_id!(LiteralExpressionId, ExpressionId, index(LiteralExpression, Expression::Literal, expressions), alloc(alloc_literal_expression));
 
define_new_ast_id!(CallExpressionId, ExpressionId, index(CallExpression, Expression::Call, expressions), alloc(alloc_call_expression));
 
define_new_ast_id!(VariableExpressionId, ExpressionId, index(VariableExpression, Expression::Variable, expressions), alloc(alloc_variable_expression));
 

	
 
// TODO: @cleanup - pub qualifiers can be removed once done
 
#[derive(Debug)]
 
pub struct Heap {
 
    // Root arena, contains the entry point for different modules. Each root
 
    // contains lists of IDs that correspond to the other arenas.
 
    pub(crate) protocol_descriptions: Arena<Root>,
 
    // Contents of a file, these are the elements the `Root` elements refer to
 
    pragmas: Arena<Pragma>,
 
    pub(crate) imports: Arena<Import>,
 
    identifiers: Arena<Identifier>,
 
    pub(crate) parser_types: Arena<ParserType>,
 
    pub(crate) variables: Arena<Variable>,
 
    pub(crate) definitions: Arena<Definition>,
 
    pub(crate) statements: Arena<Statement>,
 
    pub(crate) expressions: Arena<Expression>,
 
}
 

	
 
impl Heap {
 
    pub fn new() -> Heap {
 
        Heap {
 
            // string_alloc: StringAllocator::new(),
 
            protocol_descriptions: Arena::new(),
 
            pragmas: Arena::new(),
 
            imports: Arena::new(),
 
            identifiers: Arena::new(),
 
            parser_types: Arena::new(),
 
            variables: Arena::new(),
 
            definitions: Arena::new(),
 
            statements: Arena::new(),
 
            expressions: Arena::new(),
 
        }
 
    }
 
    pub fn alloc_memory_statement(
 
        &mut self,
 
        f: impl FnOnce(MemoryStatementId) -> MemoryStatement,
 
    ) -> MemoryStatementId {
 
        MemoryStatementId(LocalStatementId(self.statements.alloc_with_id(|id| {
 
            Statement::Local(LocalStatement::Memory(
 
                f(MemoryStatementId(LocalStatementId(id)))
 
            ))
 
        })))
 
    }
 
    pub fn alloc_channel_statement(
 
        &mut self,
 
        f: impl FnOnce(ChannelStatementId) -> ChannelStatement,
 
    ) -> ChannelStatementId {
 
        ChannelStatementId(LocalStatementId(self.statements.alloc_with_id(|id| {
 
            Statement::Local(LocalStatement::Channel(
 
                f(ChannelStatementId(LocalStatementId(id)))
 
            ))
 
        })))
 
    }
 
}
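
// Example (sketch): how the allocation helpers are used. The closure receives the
// final ID so that self-referential nodes can store their own `this`:
//
//     let mut heap = Heap::new();
//     let root_id = heap.alloc_protocol_description(|this| Root{
//         this,
//         pragmas: Vec::new(),
//         imports: Vec::new(),
//         definitions: Vec::new(),
//     });
//     let root = &heap[root_id]; // Index<RootId> resolves into `protocol_descriptions`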
 

	
 
impl Index<MemoryStatementId> for Heap {
 
    type Output = MemoryStatement;
 
    fn index(&self, index: MemoryStatementId) -> &Self::Output {
 
        self.statements[index.0.0].as_memory()
 
    }
 
}
 

	
 
impl Index<ChannelStatementId> for Heap {
 
    type Output = ChannelStatement;
 
    fn index(&self, index: ChannelStatementId) -> &Self::Output {
 
        self.statements[index.0.0].as_channel()
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct Root {
 
    pub this: RootId,
 
    // Phase 1: parser
 
    // pub position: InputPosition,
 
    pub pragmas: Vec<PragmaId>,
 
    pub imports: Vec<ImportId>,
 
    pub definitions: Vec<DefinitionId>,
 
}
 

	
 
impl Root {
 
    pub fn get_definition_ident(&self, h: &Heap, id: &[u8]) -> Option<DefinitionId> {
 
        for &def in self.definitions.iter() {
 
            if h[def].identifier().value == id {
 
                return Some(def);
 
            }
 
        }
 
        None
 
    }
 
}
 

	
 
// TODO: @cleanup `Root` no longer stores its position (see the commented-out field
//  above), so this impl is disabled until `SyntaxElement` is reworked for spans.
// impl SyntaxElement for Root {
//     fn position(&self) -> InputPosition {
//         self.position
//     }
// }
 

	
 
#[derive(Debug, Clone)]
 
pub enum Pragma {
 
    Version(PragmaVersion),
 
    Module(PragmaModule),
 
}
 

	
 
impl Pragma {
 
    pub(crate) fn as_module(&self) -> &PragmaModule {
 
        match self {
 
            Pragma::Module(pragma) => pragma,
 
            _ => unreachable!("Tried to obtain {:?} as PragmaModule", self),
 
        }
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct PragmaVersion {
 
    pub this: PragmaId,
 
    // Phase 1: parser
 
    pub span: InputSpan, // of full pragma
 
    pub version: u64,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct PragmaModule {
 
    pub this: PragmaId,
 
    // Phase 1: parser
 
    pub span: InputSpan, // of full pragma
 
    pub value: Identifier,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub enum Import {
 
    Module(ImportModule),
 
    Symbols(ImportSymbols)
 
}
 

	
 
impl Import {
 
    pub(crate) fn as_module(&self) -> &ImportModule {
 
        match self {
 
            Import::Module(m) => m,
 
            _ => panic!("Unable to cast 'Import' to 'ImportModule'")
 
        }
 
    }
 
    pub(crate) fn as_symbols(&self) -> &ImportSymbols {
 
        match self {
 
            Import::Symbols(m) => m,
 
            _ => panic!("Unable to cast 'Import' to 'ImportSymbols'")
 
        }
 
    }
 
}
 

	
 
// TODO: @cleanup `ImportModule`/`ImportSymbols` now track an `InputSpan` rather
//  than a position, so this impl is disabled until `SyntaxElement` is reworked.
// impl SyntaxElement for Import {
//     fn position(&self) -> InputPosition {
//         match self {
//             Import::Module(m) => m.position,
//             Import::Symbols(m) => m.position
//         }
//     }
// }
 

	
 
#[derive(Debug, Clone)]
 
pub struct ImportModule {
 
    pub this: ImportId,
 
    // Phase 1: parser
 
    pub span: InputSpan,
 
    pub module_name: Identifier,
 
    pub alias: Identifier,
 
    pub module_id: RootId,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct AliasedSymbol {
 
    // Phase 1: parser
 
    pub position: InputPosition,
 
    pub name: Identifier,
 
    pub alias: Identifier,
 
    // Phase 2: symbol resolving
 
    pub definition_id: DefinitionId,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct ImportSymbols {
 
    pub this: ImportId,
 
    // Phase 1: parser
 
    pub span: InputSpan,
 
    pub module_name: Vec<u8>,
 
    // Phase 2: module resolving
 
    pub module_id: Option<RootId>,
 
    // Phase 1&2
 
    // if symbols is empty, then we implicitly import all symbols without any
 
    // aliases for them. If it is not empty, then symbols are explicitly
 
    // specified, and optionally given an alias.
 
    pub symbols: Vec<AliasedSymbol>,
 
}
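
// Example (sketch): for `import some.module::*` the `symbols` vector stays empty
// (implicitly import everything, unaliased), while `import some.module::{Foo as Bar}`
// would yield a single `AliasedSymbol` with name `Foo` and alias `Bar`; `module_id`
// is only filled in once module resolving (phase 2) has run.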
 

	
 
#[derive(Debug, Clone)]
 
pub struct Identifier {
 
    pub span: InputSpan,
 
    pub value: StringRef<'static>,
 
}
 

	
 
impl PartialEq for Identifier {
 
    fn eq(&self, other: &Self) -> bool {
 
        return self.value == other.value
 
    }
 
}
 

	
 
impl Display for Identifier {
 
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
 
        // A source identifier is in ASCII range.
 
        write!(f, "{}", self.value.as_str())
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub enum NamespacedIdentifierPart {
 
    // Regular identifier
 
    Identifier{start: u16, end: u16},
 
    // Polyargs associated with a preceding identifier
 
    PolyArgs{start: u16, end: u16},
 
}
 

	
 
impl NamespacedIdentifierPart {
 
    pub(crate) fn is_identifier(&self) -> bool {
 
        match self {
 
            NamespacedIdentifierPart::Identifier{..} => true,
 
            NamespacedIdentifierPart::PolyArgs{..} => false,
 
        }
 
    }
 

	
 
    pub(crate) fn as_identifier(&self) -> (u16, u16) {
 
        match self {
 
            NamespacedIdentifierPart::Identifier{start, end} => (*start, *end),
 
            NamespacedIdentifierPart::PolyArgs{..} => {
 
                unreachable!("Tried to obtain {:?} as Identifier", self);
 
            }
 
        }
 
    }
 

	
 
    pub(crate) fn as_poly_args(&self) -> (u16, u16) {
 
        match self {
 
            NamespacedIdentifierPart::PolyArgs{start, end} => (*start, *end),
 
            NamespacedIdentifierPart::Identifier{..} => {
 
                unreachable!("Tried to obtain {:?} as PolyArgs", self)
 
            }
 
        }
 
    }
 
}
 

	
 
/// An identifier with optional namespaces and polymorphic variables. Note that 
 
/// we allow each identifier to be followed by polymorphic arguments during the 
 
/// parsing phase (e.g. Foo<A,B>::Bar<C,D>::Qux). But in our current language 
 
/// implementation a valid namespaced identifier may contain only one
 
/// set of polymorphic arguments at the appropriate position.
 
/// TODO: @tokens Reimplement/rename once we have a tokenizer
 
#[derive(Debug, Clone)]
 
pub struct NamespacedIdentifier {
 
    pub position: InputPosition,
 
    pub value: Vec<u8>, // Full name as it resides in the input source
 
    pub poly_args: Vec<ParserTypeId>, // All poly args littered throughout the namespaced identifier
 
    pub parts: Vec<NamespacedIdentifierPart>, // Indices into value/poly_args
 
}
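
// Example (sketch, byte offsets illustrative): for the source text `Foo<A,B>::Bar`
// the fields would hold roughly:
//
//     value:     b"Foo<A,B>::Bar"
//     poly_args: [<ParserTypeId of A>, <ParserTypeId of B>]
//     parts:     [
//         NamespacedIdentifierPart::Identifier{ start: 0, end: 3 },   // "Foo" in `value`
//         NamespacedIdentifierPart::PolyArgs{ start: 0, end: 2 },     // A, B in `poly_args`
//         NamespacedIdentifierPart::Identifier{ start: 10, end: 13 }, // "Bar" in `value`
//     ]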
 

	
 
impl NamespacedIdentifier {
 
    /// Returns the identifier value without any of the specific polymorphic
 
    /// arguments.
 
    pub fn strip_poly_args(&self) -> Vec<u8> {
 
        debug_assert!(!self.parts.is_empty() && self.parts[0].is_identifier());
 

	
 
        let mut result = Vec::with_capacity(self.value.len());
 
        let mut iter = self.iter();
 
        let (first_ident, _) = iter.next().unwrap();
 
        result.extend(first_ident);
 

	
 
        for (ident, _) in iter {
 
            result.push(b':');
 
            result.push(b':');
 
            result.extend(ident);
 
        }
 

	
 
        result
 
    }
 

	
 
    /// Returns an iterator of the elements in the namespaced identifier
 
    pub fn iter(&self) -> NamespacedIdentifierIter {
 
        return NamespacedIdentifierIter{
 
            identifier: self,
 
            element_idx: 0
 
        }
 
    }
 

	
 
    pub fn get_poly_args(&self) -> Option<&[ParserTypeId]> {
 
        let has_poly_args = self.parts.iter().any(|v| !v.is_identifier());
 
        if has_poly_args {
 
            Some(&self.poly_args)
 
        } else {
 
            None
 
        }
 
    }
 

	
 
    // Check if two namespaced identifiers match each other when not considering
 
    // the polymorphic arguments
 
    pub fn matches_namespaced_identifier(&self, other: &Self) -> bool {
 
        let mut iter_self = self.iter();
 
        let mut iter_other = other.iter();
 

	
 
        loop {
 
            let val_self = iter_self.next();
 
            let val_other = iter_other.next();
 
            if val_self.is_some() != val_other.is_some() {
 
                // One is longer than the other
 
                return false;
 
            }
 
            if val_self.is_none() {
 
                // Both are none
 
                return true;
 
            }
 

	
 
            // Both are something
 
            let (val_self, _) = val_self.unwrap();
 
            let (val_other, _) = val_other.unwrap();
 
            if val_self != val_other { return false; }
 
        }
 
    }
 

	
 
    // Check if the namespaced identifier matches an identifier when not 
 
    // considering the polymorphic arguments
 
    pub fn matches_identifier(&self, other: &Identifier) -> bool {
 
        let mut iter = self.iter();
 
        let (first_ident, _) = iter.next().unwrap();
 
        if first_ident != other.value { 
 
            return false;
 
        }
 

	
 
        if iter.next().is_some() {
 
            return false;
 
        }
 

	
 
        return true;
 
    }
 
}
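
// Example (sketch): iterating the `Foo<A,B>::Bar` identifier above yields
// `(b"Foo", Some(&poly_args[0..2]))` followed by `(b"Bar", None)`, and
// `strip_poly_args()` returns the bytes of `Foo::Bar`.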
 

	
 
/// Iterator over elements of the namespaced identifier. The element index will
 
/// only ever be at the start of an identifier element.
 
#[derive(Debug)]
 
pub struct NamespacedIdentifierIter<'a> {
 
    identifier: &'a NamespacedIdentifier,
 
    element_idx: usize,
 
}
 

	
 
impl<'a> Iterator for NamespacedIdentifierIter<'a> {
 
    type Item = (&'a [u8], Option<&'a [ParserTypeId]>);
 
    fn next(&mut self) -> Option<Self::Item> {
 
        match self.get(self.element_idx) {
 
            Some((ident, poly)) => {
 
                self.element_idx += 1;
 
                if poly.is_some() {
 
                    self.element_idx += 1;
 
                }
 
                Some((ident, poly))
 
            },
 
            None => None
 
        }
 
    }
 
}
 

	
 
impl<'a> NamespacedIdentifierIter<'a> {
 
    /// Returns the number of parts iterated over. This may not correspond to the
    /// number of times `next()` was called, because returning an identifier with
    /// polymorphic arguments increments the internal counter by 2.
 
    pub fn num_returned(&self) -> usize {
 
        return self.element_idx;
 
    }
 

	
 
    pub fn num_remaining(&self) -> usize {
 
        return self.identifier.parts.len() - self.element_idx;
 
    }
 

	
 
    pub fn returned_section(&self) -> &[u8] {
 
        if self.element_idx == 0 { return &self.identifier.value[0..0]; }
 

	
 
        let last_idx = match &self.identifier.parts[self.element_idx - 1] {
 
            NamespacedIdentifierPart::Identifier{end, ..} => *end,
 
            NamespacedIdentifierPart::PolyArgs{end, ..} => *end,
 
        };
 

	
 
        return &self.identifier.value[..last_idx as usize];
 
    }
 

	
 
    /// Returns a specific element from the namespaced identifier
 
    pub fn get(&self, idx: usize) -> Option<<Self as Iterator>::Item> {
 
        if idx >= self.identifier.parts.len() { 
 
            return None 
 
        }
 

	
 
        let cur_part = &self.identifier.parts[idx];
 
        let next_part = self.identifier.parts.get(idx + 1);
 

	
 
        let (ident_start, ident_end) = cur_part.as_identifier();
 
        let poly_slice = match next_part {
 
            Some(part) => match part {
 
                NamespacedIdentifierPart::Identifier{..} => None,
 
                NamespacedIdentifierPart::PolyArgs{start, end} => Some(
 
                    &self.identifier.poly_args[*start as usize..*end as usize]
 
                ),
 
            },
 
            None => None
 
        };
 

	
 
        Some((
 
            &self.identifier.value[ident_start as usize..ident_end as usize],
 
            poly_slice
 
        ))
 
    }
 

	
 
    /// Returns the previously returned index into the parts array of the
 
    /// identifier.
 
    pub fn prev_idx(&self) -> Option<usize> {
 
        if self.element_idx == 0 { 
 
            return None;
 
        };
 
        
 
        if self.identifier.parts[self.element_idx - 1].is_identifier() { 
 
            return Some(self.element_idx - 1);
 
        }
 

	
 
        // Previous part had polymorphic arguments, so the one before that must
 
        // be an identifier (if well formed)
 
        debug_assert!(self.element_idx >= 2 && self.identifier.parts[self.element_idx - 2].is_identifier());
 
        return Some(self.element_idx - 2)
 
    }
 

	
 
    /// Returns the previously returned result from `next()`
 
    pub fn prev(&self) -> Option<<Self as Iterator>::Item> {
 
        match self.prev_idx() {
 
            None => None,
 
            Some(idx) => self.get(idx)
 
        }
 
    }
 
}
 

	
 
/// TODO: @types Remove the Message -> Byte hack at some point...
 
#[derive(Debug, Clone)]
 
pub enum ParserTypeVariant {
 
    // Basic builtin
 
    Message,
 
    Bool,
 
    Byte,
 
    Short,
 
    Int,
 
    Long,
 
    String,
 
    // Literals (need to get concrete builtin type during typechecking)
 
    IntegerLiteral,
 
    Inferred,
 
    // Complex builtins
 
    Array(ParserTypeId), // array of a type
 
    Input(ParserTypeId), // typed input endpoint of a channel
 
    Output(ParserTypeId), // typed output endpoint of a channel
 
    Symbolic(SymbolicParserType), // symbolic type (definition or polyarg)
 
}
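
// Example (sketch): parser types nest through IDs into the `parser_types` arena,
// so an input port carrying an array of integers would take three allocations
// (assuming `pos` holds the relevant input position):
//
//     let elem_id  = heap.alloc_parser_type(|this| ParserType{ this, pos, variant: ParserTypeVariant::Int });
//     let array_id = heap.alloc_parser_type(|this| ParserType{ this, pos, variant: ParserTypeVariant::Array(elem_id) });
//     let input_id = heap.alloc_parser_type(|this| ParserType{ this, pos, variant: ParserTypeVariant::Input(array_id) });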
 

	
 
impl ParserTypeVariant {
 
    pub(crate) fn supports_polymorphic_args(&self) -> bool {
 
        use ParserTypeVariant::*;
 
        match self {
 
            Message | Bool | Byte | Short | Int | Long | String | IntegerLiteral | Inferred => false,
 
            _ => true
 
        }
 
    }
 
}
 

	
 
/// ParserType is a specification of a type during the parsing phase and initial
 
/// linker/validator phase of the compilation process. These types may be
 
/// (partially) inferred or represent literals (e.g. an integer whose byte size is
 
/// not yet determined).
 
#[derive(Debug, Clone)]
 
pub struct ParserType {
 
    pub this: ParserTypeId,
 
    pub pos: InputPosition,
 
    pub variant: ParserTypeVariant,
 
}
 

	
 
/// SymbolicParserType is the specification of a symbolic type. During the
 
/// parsing phase we will only store the identifier of the type. During the
 
/// validation phase we will determine whether it refers to a user-defined type,
 
/// or a polymorphic argument. After the validation phase it may still be the
 
/// case that the resulting `variant` will not pass the typechecker.
 
#[derive(Debug, Clone)]
 
pub struct SymbolicParserType {
 
    // Phase 1: parser
 
    pub identifier: NamespacedIdentifier,
 
    // Phase 2: validation/linking (for types in function/component bodies) and
 
    //  type table construction (for embedded types of structs/unions)
 
    pub poly_args2: Vec<ParserTypeId>, // taken from identifier or inferred
 
    pub variant: Option<SymbolicParserTypeVariant>
 
}
 

	
 
/// Specifies whether the symbolic type points to an actual user-defined type,
 
/// or whether it points to a polymorphic argument within the definition (e.g.
 
/// a defined variable `T var` within a function `int func<T>()`).
 
#[derive(Debug, Clone)]
 
pub enum SymbolicParserTypeVariant {
 
    Definition(DefinitionId),
 
    // TODO: figure out if I need the DefinitionId here
 
    PolyArg(DefinitionId, usize), // index of polyarg in the definition
 
}
 

	
 
/// ConcreteType is the representation of a type after resolving symbolic types
 
/// and performing type inference
 
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
 
pub enum ConcreteTypePart {
 
    // Markers for the use of polymorphic types within a procedure's body that
 
    // refer to polymorphic variables on the procedure's definition. Different
 
    // from markers in the `InferenceType`, these will not contain nested types.
 
    Marker(usize),
 
    // Special types (cannot be explicitly constructed by the programmer)
 
    Void,
 
    // Builtin types without nested types
 
    Message,
 
    Bool,
 
    Byte,
 
    Short,
 
    Int,
 
    Long,
 
    String,
 
    // Builtin types with one nested type
 
    Array,
 
    Slice,
 
    Input,
 
    Output,
 
    // User defined type with any number of nested types
 
    Instance(DefinitionId, usize),
 
}
 

	
 
#[derive(Debug, Clone, Eq, PartialEq)]
 
pub struct ConcreteType {
 
    pub(crate) parts: Vec<ConcreteTypePart>
 
}
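
// Example (sketch, assuming `parts` is a pre-order flattening of the type tree,
// which is what the nested-type counts above suggest): an array of booleans would
// be stored as
//
//     ConcreteType{ parts: vec![ConcreteTypePart::Array, ConcreteTypePart::Bool] }
//
// and an `Instance(definition_id, 2)` part would be followed by the parts of its
// two type arguments.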
 

	
 
impl Default for ConcreteType {
 
    fn default() -> Self {
 
        Self{ parts: Vec::new() }
 
    }
 
}
 

	
 
impl ConcreteType {
 
    pub(crate) fn has_marker(&self) -> bool {
 
        self.parts
 
            .iter()
 
            .any(|p| matches!(p, ConcreteTypePart::Marker(_)))
 
    }
 
}
 

	
 
// TODO: Remove at some point
 
#[derive(Debug, Clone, PartialEq, Eq)]
 
pub enum PrimitiveType {
 
    Unassigned,
 
    Input,
 
    Output,
 
    Message,
 
    Boolean,
 
    Byte,
 
    Short,
 
    Int,
 
    Long,
 
}
 

	
 
#[derive(Debug, Clone, PartialEq, Eq)]
 
pub struct Type {
 
    pub primitive: PrimitiveType,
 
    pub array: bool,
 
}
 

	
src/protocol/lexer2.rs
 
use crate::protocol::ast::*;
 
use crate::protocol::Heap;
 
use crate::collections::{StringPool, StringRef};
 
use crate::protocol::tokenizer::*;
 
use crate::protocol::input_source2::{InputSource2 as InputSource, InputPosition2 as InputPosition, InputSpan, ParseError};
 
use crate::protocol::symbol_table2::*;
 

	
 
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 
enum ModuleCompilationPhase {
 
    Source,                 // only source is set
 
    Tokenized,              // source is tokenized
 
    DefinitionsScanned,     // all definitions are linked to their type class
 
    ImportsResolved,        // all imports are added to the symbol table
 
    Parsed,                 // produced the AST for the module
 
    ValidatedAndLinked,     // AST is traversed and has linked the required AST nodes
 
    Typed,                  // Type inference and checking has been performed
 
}
 

	
 
#[derive(PartialEq, Eq)]
enum KeywordDefinition {
 
    Struct,
 
    Enum,
 
    Union,
 
    Function,
 
    Primitive,
 
    Composite,
 
}
 

	
 
struct Module {
 
    // Buffers
 
    source: InputSource,
 
    tokens: TokenBuffer,
 
    // Identifiers
 
    root_id: RootId,
 
    name: Option<(PragmaId, StringRef<'static>)>,
 
    version: Option<(PragmaId, i64)>,
 
    phase: ModuleCompilationPhase,
 
}
 

	
 
struct Ctx<'a> {
 
    heap: &'a mut Heap,
 
    symbols: &'a mut SymbolTable,
 
    pool: &'a mut StringPool,
 
}
 

	
 
/// Scans the module and finds all module-level type definitions. These will be
 
/// added to the symbol table such that during AST-construction we know which
 
/// identifiers point to types. Will also parse all pragmas to determine module
 
/// names.
 
pub(crate) struct PassPreSymbol {
 
    symbols: Vec<Symbol>,
 
    pragmas: Vec<PragmaId>,
 
    imports: Vec<ImportId>,
 
    definitions: Vec<DefinitionId>,
 
    buffer: String,
 
    has_pragma_version: bool,
 
    has_pragma_module: bool,
 
}
 

	
 
impl PassPreSymbol {
 
    pub(crate) fn new() -> Self {
 
        Self{
 
            symbols: Vec::with_capacity(128),
 
            pragmas: Vec::with_capacity(8),
 
            imports: Vec::with_capacity(32),
 
            definitions: Vec::with_capacity(128),
 
            buffer: String::with_capacity(128),
 
            has_pragma_version: false,
 
            has_pragma_module: false,
 
        }
 
    }
 

	
 
    fn reset(&mut self) {
 
        self.symbols.clear();
 
        self.pragmas.clear();
 
        self.imports.clear();
 
        self.definitions.clear();
 
        self.has_pragma_version = false;
 
        self.has_pragma_module = false;
 
    }
 

	
 
    pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> {
 
        self.reset();
 

	
 
        let module = &mut modules[module_idx];
 
        let module_range = &module.tokens.ranges[0];
 

	
 
        debug_assert_eq!(module.phase, ModuleCompilationPhase::Tokenized);
 
        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 
        debug_assert!(module.root_id.is_invalid()); // not set yet,
 

	
 
        // Preallocate root in the heap
 
        let root_id = ctx.heap.alloc_protocol_description(|this| {
 
            Root{
 
                this,
 
                pragmas: Vec::new(),
 
                imports: Vec::new(),
 
                definitions: Vec::new(),
 
            }
 
        });
 
        module.root_id = root_id;
 

	
 
        // Visit token ranges to detect definitions and pragmas
 
        let mut range_idx = module_range.first_child_idx;
 
        loop {
 
            let range_idx_usize = range_idx as usize;
 
            let cur_range = &module.tokens.ranges[range_idx_usize];
 

	
 
            // Parse if it is a definition or a pragma
 
            if cur_range.range_kind == TokenRangeKind::Definition {
 
                self.visit_definition_range(modules, module_idx, ctx, range_idx_usize)?;
 
            } else if cur_range.range_kind == TokenRangeKind::Pragma {
 
                self.visit_pragma_range(modules, module_idx, ctx, range_idx_usize)?;
 
            }
 

	
 
            match cur_range.next_sibling_idx {
 
                Some(idx) => { range_idx = idx; },
 
                None => { break; },
 
            }
 
        }
 

	
 
        // Add the module's symbol scope and the symbols we just parsed
 
        let module_scope = SymbolScope::Module(root_id);
 
        ctx.symbols.insert_scope(None, module_scope);
 
        for symbol in self.symbols.drain(..) {
 
            if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol(module_scope, symbol) {
 
                return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol))
 
            }
 
        }
 

	
 
        // Modify the preallocated root
 
        let root = &mut ctx.heap[root_id];
 
        root.pragmas.extend(self.pragmas.drain(..));
 
        root.definitions.extend(self.definitions.drain(..));
 
        module.phase = ModuleCompilationPhase::DefinitionsScanned;
 

	
 
        Ok(())
 
    }
 

	
 
    fn visit_pragma_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let range = &module.tokens.ranges[range_idx];
 
        let mut iter = module.tokens.iter_range(range);
 

	
 
        // Consume pragma name
 
        let (pragma_section, pragma_start, _) = consume_pragma(&module.source, &mut iter)?;
 

	
 
        // Consume pragma values
 
        if pragma_section == b"#module" {
 
            // Check if name is defined twice within the same file
 
            if self.has_pragma_module {
 
                return Err(ParseError::new_error(&module.source, pragma_start, "module name is defined twice"));
 
            }
 

	
 
            // Consume the domain-name
 
            let (module_name, module_span) = consume_domain_ident(&module.source, &mut iter)?;
 
            if iter.next().is_some() {
 
                return Err(ParseError::new_error(&module.source, iter.last_valid_pos(), "expected end of #module pragma after module name"));
 
            }
 

	
 
            // Add to heap and symbol table
 
            let pragma_span = InputSpan::from_positions(pragma_start, module_span.end);
 
            let module_name = ctx.pool.intern(module_name);
 
            let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Module(PragmaModule{
 
                this,
 
                span: pragma_span,
 
                value: Identifier{ span: module_span, value: module_name.clone() },
 
            }));
 
            self.pragmas.push(pragma_id);
 

	
 
            if let Err(other_module_root_id) = ctx.symbols.insert_module(module_name, module.root_id) {
 
                // Naming conflict
 
                let this_module = &modules[module_idx];
 
                let other_module = seek_module(modules, other_module_root_id).unwrap();
 
                let (other_module_pragma_id, _) = other_module.name.unwrap();
 
                let other_pragma = ctx.heap[other_module_pragma_id].as_module();
 
                return Err(ParseError::new_error_str_at_span(
 
                    &this_module.source, pragma_span, "conflict in module name"
 
                ).with_info_str_at_span(
 
                    &other_module.source, other_pragma.span, "other module is defined here"
 
                ));
 
            }
 
            self.has_pragma_module = true;
 
        } else if pragma_section == b"#version" {
 
            // Check if version is defined twice within the same file
 
            if self.has_pragma_version {
 
                return Err(ParseError::new_error(&module.source, pragma_start, "module version is defined twice"));
 
            }
 

	
 
            // Consume the version pragma
 
            let (version, version_span) = consume_integer_literal(&module.source, &mut iter, &mut self.buffer)?;
 
            let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Version(PragmaVersion{
 
                this,
 
                span: InputSpan::from_positions(pragma_start, version_span.end),
 
                version,
 
            }));
 
            self.pragmas.push(pragma_id);
 
            self.has_pragma_version = true;
 
        } else {
 
            // Custom pragma, maybe we support this in the future, but for now
 
            // we don't.
 
            return Err(ParseError::new_error(&module.source, pragma_start, "illegal pragma name"));
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    fn visit_definition_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let range = &module.tokens.ranges[range_idx];
 
        let definition_span = InputSpan::from_positions(
 
            module.tokens.start_pos(range),
 
            module.tokens.end_pos(range)
 
        );
 
        let mut iter = module.tokens.iter_range(range);
 

	
 
        // First ident must be type of symbol
 
        let (kw_text, _) = consume_any_ident(&module.source, &mut iter).unwrap();
 
        let kw = parse_definition_keyword(kw_text).unwrap();
 

	
 
        // Retrieve identifier of definition
 
        let (identifier_text, identifier_span) = consume_ident(&module.source, &mut iter)?;
 
        let ident_text = ctx.pool.intern(identifier_text);
 
        let identifier = Identifier{ span: identifier_span, value: ident_text.clone() };
 

	
 
        // Reserve space in AST for definition and add it to the symbol table
 
 
        let definition_class;
 
        let ast_definition_id;
 
        match kw {
 
            KeywordDefinition::Struct => {
 
                let struct_def_id = ctx.heap.alloc_struct_definition(|this| {
 
                    StructDefinition::new_empty(this, definition_span, identifier)
 
                });
 
 
                definition_class = DefinitionClass::Struct;
 
                ast_definition_id = struct_def_id.upcast();
 
            },
 
            KeywordDefinition::Enum => {
 
                let enum_def_id = ctx.heap.alloc_enum_definition(|this| {
 
                    EnumDefinition::new_empty(this, definition_span, identifier)
 
                });
 
 
                definition_class = DefinitionClass::Enum;
 
                ast_definition_id = enum_def_id.upcast();
 
            },
 
            KeywordDefinition::Union => {
 
                let union_def_id = ctx.heap.alloc_union_definition(|this| {
 
                    UnionDefinition::new_empty(this, definition_span, identifier)
 
                });
 
 
                definition_class = DefinitionClass::Union;
 
                ast_definition_id = union_def_id.upcast()
 
            },
 
            KeywordDefinition::Function => {
 
                let func_def_id = ctx.heap.alloc_function_definition(|this| {
 
                    FunctionDefinition::new_empty(this, definition_span, identifier)
 
                });
 
 
                definition_class = DefinitionClass::Function;
 
                ast_definition_id = func_def_id.upcast();
 
            },
 
            KeywordDefinition::Primitive | KeywordDefinition::Composite => {
 
                let component_variant = if kw == KeywordDefinition::Primitive {
 
                    ComponentVariant::Primitive
 
                } else {
 
                    ComponentVariant::Composite
 
                };
 
                let comp_def_id = ctx.heap.alloc_component_definition(|this| {
 
                    ComponentDefinition::new_empty(this, definition_span, component_variant, identifier)
 
                });
 
 
                definition_class = DefinitionClass::Component;
 
                ast_definition_id = comp_def_id.upcast();
 
            }
 
        }
 

	
 
        let symbol = Symbol{
 
 
            name: ident_text,
 
            data: SymbolVariant::Definition(SymbolDefinition{
 
                defined_in_module: module.root_id,
 
                defined_in_scope: SymbolScope::Module(module.root_id),
 
                definition_span,
 
                identifier_span,
 
                introduced_at: None,
 
                class: definition_class,
 
                definition_id: ast_definition_id,
 
            }),
 
        };
 
        self.symbols.push(symbol);
 
        self.definitions.push(ast_definition_id);
 

	
 
        Ok(())
 
    }
 
}
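
// Example (sketch, hypothetical driver code): `parse` resets the pass internally,
// so a single `PassPreSymbol` instance can be reused for every tokenized module
// before import resolution runs:
//
//     let mut pass = PassPreSymbol::new();
//     for module_idx in 0..modules.len() {
//         pass.parse(&mut modules, module_idx, &mut ctx)?;
//     }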
 

	
 
/// Parses all the imports in the module tokens. Is applied after the
 
/// definitions and names of modules are resolved. Hence we should be able to
 
/// resolve all symbols to their appropriate module/definition.
 
pub(crate) struct PassImport {
 
    imports: Vec<ImportId>,
 
}
 

	
 
impl PassImport {
 
    pub(crate) fn new() -> Self {
 
        Self{ imports: Vec::with_capacity(32) }
 
    }
 
    pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let module_range = &module.tokens.ranges[0];
 
        debug_assert!(modules.iter().all(|m| m.phase >= ModuleCompilationPhase::DefinitionsScanned));
 
        debug_assert_eq!(module.phase, ModuleCompilationPhase::DefinitionsScanned);
 
        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 

	
 
        let mut range_idx = module_range.first_child_idx;
 
        loop {
 
            let range_idx_usize = range_idx as usize;
 
            let cur_range = &module.tokens.ranges[range_idx_usize];
 

	
 
            if cur_range.range_kind == TokenRangeKind::Import {
 
                self.visit_import_range(modules, module_idx, ctx, range_idx_usize)?;
 
            }
 

	
 
            match cur_range.next_sibling_idx {
 
                Some(idx) => { range_idx = idx; },
 
                None => { break; }
 
            }
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    pub(crate) fn visit_import_range(
 
        &mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize
 
    ) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let import_range = &module.tokens.ranges[range_idx];
 
        debug_assert_eq!(import_range.range_kind, TokenRangeKind::Import);
 

	
 
        let mut iter = module.tokens.iter_range(import_range);
 

	
 
        // Consume "import"
 
        let (_import_ident, import_span) =
 
            consume_ident(&module.source, &mut iter)?;
 
        debug_assert_eq!(_import_ident, KW_IMPORT);
 

	
 
        // Consume module name
 
        let (module_name, module_name_span) = consume_domain_ident(&module.source, &mut iter)?;
 
        let target_root_id = ctx.symbols.get_module_by_name(module_name);
 
        if target_root_id.is_none() {
 
            return Err(ParseError::new_error_at_span(
 
                &module.source, module_name_span,
 
                format!("could not resolve module '{}'", String::from_utf8_lossy(module_name))
 
            ));
 
        }
 
        let module_name = ctx.pool.intern(module_name);
 
        let target_root_id = target_root_id.unwrap();
 

	
 
        // Check for subsequent characters
 
        let next = iter.next();
 
        if has_ident(&module.source, &mut iter, b"as") {
 
            iter.consume();
 
            let (alias_text, alias_span) = consume_ident(&module.source, &mut iter)?;
 
            let alias = ctx.pool.intern(alias_text);
 

	
 
            let import_id = ctx.heap.alloc_import(|this| Import::Module(ImportModule{
 
                this,
 
                span: import_span,
 
                module_name: Identifier{ span: module_name_span, value: module_name },
 
                alias: Identifier{ span: alias_span, value: alias.clone() },
 
                module_id: target_root_id
 
            }));
 
            ctx.symbols.insert_symbol(SymbolScope::Module(module.root_id), Symbol{
 
 
                name: alias,
 
                data: SymbolVariant::Module(SymbolModule{
 
                    root_id: target_root_id,
 
                    introduced_at: import_id,
 
                }),
 
            });
 
        } else if Some(TokenKind::ColonColon) == next {
 
            fn consume_symbol_and_maybe_alias<'a>(
 
                source: &'a InputSource, iter: &mut TokenIter, in_scope: SymbolScope, ctx: &Ctx
 
            ) -> Result<(&'a [u8], InputSpan, Option<(&'a [u8], InputSpan)>), ParseError> {
 
                // Consume symbol and make sure it points to something valid
 
                let (symbol, symbol_span) = consume_ident(source, iter)?;
 
                let target = ctx.symbols.get_symbol_by_name_defined_in_scope(in_scope, symbol);
 
                if target.is_none() {
 

	
 
                }
 

	
 
                if peek_ident(source, iter) == Some(b"as" as &[u8]) {
 
                    // Consume alias
 
                    iter.consume();
 
                    let (alias, alias_span) = consume_ident(source, iter)?;
 
                    Ok((symbol, symbol_span, Some((alias, alias_span))))
 
                } else {
 
                    Ok((symbol, symbol_span, None))
 
                }
 
            }
 

	
 
            iter.consume();
 

	
 
            let next = iter.next();
 
            if Some(TokenKind::Ident) == next {
 
                // Importing a single symbol
 
                iter.consume();
 
                let (symbol_text, symbol_span, maybe_alias) = consume_symbol_and_maybe_alias(&module.source, &mut iter)?;
 
                let target_symbol = ctx.symbols.get_symbol_by_name_defined_in_scope(
 
                    SymbolScope::Module(target_root_id))
 
            } else if Some(TokenKind::OpenCurly) == next {
 
                // Importing multiple symbols
 
                iter.consume();
 
            } else if Some(TokenKind::Star) == next {
 
                // Import all symbols from the module
 
                iter.consume();
 
            } else {
 
                return Err(ParseError::new_error_str_at_pos(
 
                    &module.source, iter.last_valid_pos(), "expected symbol name, '{' or '*'"
 
                ));
 
            }
 
        } else {
 
            // Assume implicit alias, then check if we get the semicolon next
 
            let module_name_str = module_name.as_str();
 
            let last_ident_start = module_name_str.rfind('.').map_or(0, |v| v + 1);
 
            let alias_text = &module_name_str.as_bytes()[last_ident_start..];
 
            let alias = ctx.pool.intern(alias_text);
 
            let alias_span = InputSpan::from_positions(
 
                module_name_span.begin.with_offset(last_ident_start as u32),
 
                module_name_span.end
 
            );
 
        }
 

	
 
        Ok(())
 
    }
 
}
 

	
 
fn consume_domain_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    let (_, mut span) = consume_ident(source, iter)?;
 
    while let Some(TokenKind::Dot) = iter.next() {
 
        consume_dot(source, iter)?;
 
        iter.consume();
 
        let (_, new_span) = consume_ident(source, iter)?;
 
        span.end = new_span.end;
 
    }
 

	
 
    // Not strictly necessary, but probably a reasonable restriction: this
 
    // simplifies parsing of module naming and imports.
 
    if span.begin.line != span.end.line {
 
        return Err(ParseError::new_error_str_at_span(source, span, "module names may not span multiple lines"));
 
    }
 

	
 
    // If module name consists of a single identifier, then it may not match any
 
    // of the reserved keywords
 
    let section = source.section(span.begin, span.end);
 
    if is_reserved_keyword(section) {
 
        return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"));
 
    }
 

	
 
    Ok((source.section(span.begin, span.end), span))
 
}
 

	
 
fn consume_dot<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(), ParseError> {
 
    if Some(TokenKind::Dot) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a dot"));
    }
    Ok(())
}

/// Consumes a specific expected token. Be careful to only call this with tokens that do not have a
 
/// variable length.
 
fn consume_token(source: &InputSource, iter: &mut TokenIter, expected: TokenKind) -> Result<(), ParseError> {
 
    if Some(expected) != iter.next() {
 
        return Err(ParseError::new_error_at_pos(
 
            source, iter.last_valid_pos(),
 
            format!("expected '{}'", expected.token_chars())
 
        ));
 
    }
 
    iter.consume();
 
    Ok(())
 
}
 

	
 
fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> {
 
    if Some(TokenKind::Integer) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal"));
 
    }
 
    let (start_pos, end_pos) = iter.next_range();
 
    iter.consume();
 

	
 
    let integer_text = source.section(start_pos, end_pos);
 

	
 
    // Determine radix and offset from prefix
 
    let (radix, input_offset, radix_name) =
 
        if integer_text.starts_with(b"0b") || integer_text.starts_with(b"0B") {
 
            // Binary number
 
            (2, 2, "binary")
 
        } else if integer_text.starts_with(b"0o") || integer_text.starts_with(b"0O") {
 
            // Octal number
 
            (8, 2, "octal")
 
        } else if integer_text.starts_with(b"0x") || integer_text.starts_with(b"0X") {
 
            // Hexadecimal number
 
            (16, 2, "hexadecimal")
 
        } else {
 
            (10, 0, "decimal")
 
        };
 

	
 
    // Take out any of the separating '_' characters
 
    buffer.clear();
 
    for char_idx in input_offset..integer_text.len() {
 
        let char = integer_text[char_idx];
 
        if char == b'_' {
 
            continue;
 
        }
 
        // Letters are allowed here so that hexadecimal digits pass; digits that are
        // invalid for the detected radix are rejected by `from_str_radix` below.
        if !char.is_ascii_hexdigit() {
 
            return Err(ParseError::new_error(source, start_pos, "incorrectly formatted integer"));
 
        }
 
        buffer.push(char::from(char));
 
    }
 

	
 
    // Use the cleaned up string to convert to integer
 
    match u64::from_str_radix(&buffer, radix) {
 
        Ok(number) => Ok((number, InputSpan::from_positions(start_pos, end_pos))),
 
        Err(_) => Err(
 
            ParseError::new_error(source, start_pos, "incorrectly formatted integer")
 
        ),
 
    }
 
}
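
// Example (sketch): `1_000` is cleaned to "1000" and parsed with radix 10, yielding
// 1000, while `0b10_10` drops the `0b` prefix, is cleaned to "1010" and parsed with
// radix 2, yielding 10.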
 

	
 
fn seek_module(modules: &[Module], root_id: RootId) -> Option<&Module> {
 
    for module in modules {
 
        if module.root_id == root_id {
 
            return Some(module)
 
        }
 
    }
 

	
 
    return None
 
}
 

	
 
fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> {
 
    if Some(TokenKind::Pragma) != iter.next() {
 
        return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected a pragma"));
 
    }
 
    let (pragma_start, pragma_end) = iter.next_range();
 
    iter.consume();
 
    Ok((source.section(pragma_start, pragma_end), pragma_start, pragma_end))
 
}
 

	
 
fn has_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> bool {
 
    if Some(TokenKind::Ident) == iter.next() {
 
        let (start, end) = iter.next_range();
 
        return source.section(start, end) == expected;
 
    }
 

	
 
    false
 
}
 

	
 
fn peek_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Option<&'a [u8]> {
 
    if Some(TokenKind::Ident) == iter.next() {
 
        let (start, end) = iter.next_range();
 
        return Some(source.section(start, end))
 
    }
 

	
 
    None
 
}
 

	
 
/// Consumes any identifier and returns it together with its span. Does not
 
/// check if the identifier is a reserved keyword.
 
fn consume_any_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    if Some(TokenKind::Ident) != iter.next() {
 
        return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected an identifier"));
 
    }
 
    let (ident_start, ident_end) = iter.next_range();
 
    iter.consume();
 
    Ok((source.section(ident_start, ident_end), InputSpan::from_positions(ident_start, ident_end)))
 
}
 

	
 
/// Consumes an identifier that is not a reserved keyword and returns it
 
/// together with its span.
 
fn consume_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    let (ident, span) = consume_any_ident(source, iter)?;
 
    if is_reserved_keyword(ident) {
 
        return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"));
 
    }
 

	
 
    Ok((ident, span))
 
}
 

	
 
fn is_reserved_definition_keyword(text: &[u8]) -> bool {
 
    return [
        b"struct" as &[u8], b"enum", b"union", b"function", b"primitive"
    ].contains(&text);
 
}
 

	
 
fn is_reserved_statement_keyword(text: &[u8]) -> bool {
 
    return [
        b"channel" as &[u8], b"import", b"as",
        b"if", b"while", b"break", b"continue", b"goto", b"return",
        b"synchronous", b"assert", b"new",
    ].contains(&text);
 
}
 

	
 
fn is_reserved_expression_keyword(text: &[u8]) -> bool {
 
    return [
        b"let" as &[u8], b"true", b"false", b"null", // literals
        b"get", b"put", b"fires", b"create", b"length", // functions
    ].contains(&text);
 
}
 

	
 
fn is_reserved_type_keyword(text: &[u8]) -> bool {
 
    return [
        b"in" as &[u8], b"out", b"msg",
        b"bool",
        b"u8", b"u16", b"u32", b"u64",
        b"s8", b"s16", b"s32", b"s64",
        b"auto"
    ].contains(&text);
 
}
 

	
 
fn is_reserved_keyword(text: &[u8]) -> bool {
 
    return
        is_reserved_definition_keyword(text) ||
        is_reserved_statement_keyword(text) ||
        is_reserved_expression_keyword(text) ||
        is_reserved_type_keyword(text);
 
}
 

	
 
/// Constructs a human-readable message indicating why there is a conflict of
 
/// symbols.
 
// Note: passing the `module_idx` is not strictly necessary, but helps prevent
// programmer mistakes during development: a conflict can only arise in the
// module we are currently parsing, so we can check that the caller passed the
// right one.
 
fn construct_symbol_conflict_error(modules: &[Module], module_idx: usize, ctx: &Ctx, new_symbol: &Symbol, old_symbol: &Symbol) -> ParseError {
 
    let module = &modules[module_idx];
 
    let get_symbol_span_and_msg = |symbol: &Symbol| -> (String, InputSpan) {
 
        match symbol.introduced_at {
 
            Some(import_id) => {
 
                // Symbol is being imported
 
                let import = &ctx.heap[import_id];
 
                match import {
 
                    Import::Module(import) => (
 
                        format!("the module aliased as '{}' imported here", symbol.name.as_str()),
 
                        import.span
 
                    ),
 
                    Import::Symbols(symbols) => (
 
                        format!("the type '{}' imported here", symbol.name.as_str()),
 
                        symbols.span
 
                    ),
 
                }
 
            },
 
            None => {
 
                // Symbol is being defined
 
                debug_assert_eq!(symbol.defined_in_module, module.root_id);
 
                debug_assert_ne!(symbol.definition.symbol_class(), SymbolClass::Module);
 
                (
 
                    format!("the type '{}' defined here", symbol.name.as_str()),
 
                    symbol.identifier_span
 
                )
 
            }
 
        }
 
    };
 

	
 
    let (new_symbol_msg, new_symbol_span) = get_symbol_span_and_msg(new_symbol);
 
    let (old_symbol_msg, old_symbol_span) = get_symbol_span_and_msg(old_symbol);
 
    return ParseError::new_error_at_span(
 
        &module.source, new_symbol_span, format!("symbol is defined twice: {}", new_symbol_msg)
 
    ).with_info_at_span(
 
        &module.source, old_symbol_span, format!("it conflicts with {}", old_symbol_msg)
 
    )
 
}
 

	
 
fn parse_definition_keyword(keyword: &[u8]) -> Option<KeywordDefinition> {
 
    match keyword {
 
        KW_STRUCT =>    Some(KeywordDefinition::Struct),
        KW_ENUM =>      Some(KeywordDefinition::Enum),
        KW_UNION =>     Some(KeywordDefinition::Union),
        KW_FUNCTION =>  Some(KeywordDefinition::Function),
        KW_PRIMITIVE => Some(KeywordDefinition::Primitive),
        KW_COMPOSITE => Some(KeywordDefinition::Composite),
 
        _ => None
 
    }
 
}
 
\ No newline at end of file
src/protocol/parser/symbol_table2.rs
Show inline comments
 
/// symbol_table.rs
 
///
 
/// The data structure used to look up symbols within particular scopes. Scopes
/// may be at the module level or at the definition level, although imports and
/// definitions nested within definitions are currently not allowed.
///
/// TODO: Once the compiler has matured, find ways to optimize away the
///     repeated HashMap lookups.
 

	
 
use std::collections::HashMap;
 
use std::collections::hash_map::Entry;
 

	
 
use crate::protocol::input_source2::*;
 
use crate::protocol::ast::*;
 
use crate::collections::*;
 

	
 
const RESERVED_SYMBOLS: usize = 32;
 

	
 
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 
pub enum SymbolScope {
 
    Module(RootId),
 
    Definition(DefinitionId),
 
}
 

	
 
#[derive(Clone, Copy, PartialEq, Eq)]
 
pub enum SymbolClass {
 
    Module,
 
    Struct,
 
    Enum,
 
    Union,
 
    Function,
 
    Component
 
}
 

	
 
#[derive(Clone, Copy, PartialEq, Eq)]
 
pub enum DefinitionClass {
 
    Struct,
 
    Enum,
 
    Union,
 
    Function,
 
    Component,
 
}
 

	
 
impl DefinitionClass {
 
    fn as_symbol_class(&self) -> SymbolClass {
 
        match self {
 
            DefinitionClass::Struct => SymbolClass::Struct,
 
            DefinitionClass::Enum => SymbolClass::Enum,
 
            DefinitionClass::Union => SymbolClass::Union,
 
            DefinitionClass::Function => SymbolClass::Function,
 
            DefinitionClass::Component => SymbolClass::Component,
 
        }
 
    }
 
}
 

	
 
struct ScopedSymbols {
 
    scope: SymbolScope,
 
    parent_scope: Option<SymbolScope>,
 
    child_scopes: Vec<SymbolScope>,
 
    symbols: Vec<Symbol>,
 
}
 

	
 
pub enum SymbolDefinition {
 
    Module(RootId),
 
    Struct(StructDefinitionId),
 
    Enum(EnumDefinitionId),
 
    Union(UnionDefinitionId),
 
    Function(FunctionDefinitionId),
 
    Component(ComponentDefinitionId),
}

impl ScopedSymbols {
 
    fn get_symbol<'a>(&'a self, name: &StringRef) -> Option<&'a Symbol> {
 
        for symbol in self.symbols.iter() {
 
            if symbol.name == *name {
 
                return Some(symbol);
 
            }
 
        }
 

	
 
        None
 
    }
 
}
 

	
 
impl SymbolDefinition {
 
    pub fn symbol_class(&self) -> SymbolClass {
 
        use SymbolDefinition as SD;
 
        use SymbolClass as SC;
 

	
 
        match self {
 
            SD::Module(_) => SC::Module,
 
            SD::Struct(_) => SC::Struct,
 
            SD::Enum(_) => SC::Enum,
 
            SD::Union(_) => SC::Union,
 
            SD::Function(_) => SC::Function,
 
            SD::Component(_) => SC::Component,
 
        }
 
    }
 
}
 

	
 
pub enum SymbolData {
 
    
 
pub struct SymbolModule {
 
    pub root_id: RootId,
 
    pub introduced_at: ImportId,
 
}
 

	
 
pub struct Symbol {
 
    // Definition location (may be different from the scope/module in which it
 
    // is used if the symbol is imported)
 
pub struct SymbolDefinition {
 
    // Definition location (not necessarily the place where the symbol
 
    // is introduced, as it may be imported)
 
    pub defined_in_module: RootId,
 
    pub defined_in_scope: SymbolScope,
 
    pub definition_span: InputSpan, // full span of definition, not just the name
 
    pub definition_span: InputSpan, // full span of definition
 
    pub identifier_span: InputSpan, // span of just the identifier
 
    // Introduction location (if imported instead of defined)
 
    pub introduced_at: Option<ImportId>,
 
    // Symbol properties
 
    // Location where the symbol is introduced in its scope
 
    pub imported_at: Option<ImportId>,
 
    // Definition in the heap, with a utility enum to determine its
 
    // class if the ID is not needed.
 
    pub class: DefinitionClass,
 
    pub definition_id: DefinitionId,
 
}
 

	
 
pub enum SymbolVariant {
 
    Module(SymbolModule),
 
    Definition(SymbolDefinition),
 
}
 

	
 
pub struct Symbol {
 
    pub name: StringRef<'static>,
 
    pub definition: SymbolDefinition,
 
    pub data: SymbolVariant,
 
}
 

	
 
impl Symbol {
 
    fn class(&self) -> SymbolClass {
 
        match &self.data {
 
            SymbolVariant::Module(_) => SymbolClass::Module,
 
            SymbolVariant::Definition(data) => data.class.as_symbol_class(),
 
        }
 
    }
 
}
 

	
 
pub struct SymbolTable {
 
    module_lookup: HashMap<StringRef<'static>, RootId>,
 
    scope_lookup: HashMap<SymbolScope, ScopedSymbols>,
 
}
 

	
 
impl SymbolTable {
 
    /// Inserts a new module by its name. Upon module naming conflict the
 
    /// previously associated `RootId` will be returned.
 
    pub(crate) fn insert_module(&mut self, module_name: StringRef<'static>, root_id: RootId) -> Result<(), RootId> {
 
        match self.module_lookup.entry(module_name) {
 
            Entry::Occupied(v) => {
 
                Err(*v.get())
 
            },
 
            Entry::Vacant(v) => {
 
                v.insert(root_id);
 
                Ok(())
 
            }
 
        }
 
    }
 

	
 
    /// Retrieves module `RootId` by name
 
    pub(crate) fn get_module_by_name(&self, name: &[u8]) -> Option<RootId> {
        let string_ref = StringRef::new(name);
        self.module_lookup.get(&string_ref).copied()
 
    }
 

	
 
    /// Inserts a new symbol scope. The parent must have been added to the
 
    /// symbol table before.
 
    pub(crate) fn insert_scope(&mut self, parent_scope: Option<SymbolScope>, new_scope: SymbolScope) {
 
        debug_assert!(
 
            parent_scope.is_none() || self.scope_lookup.contains_key(parent_scope.as_ref().unwrap()),
 
            "inserting scope {:?} but parent {:?} does not exist", new_scope, parent_scope
 
        );
 
        debug_assert!(!self.scope_lookup.contains_key(&new_scope), "inserting scope {:?}, but it already exists", new_scope);
 

	
 
        if let Some(parent_scope) = parent_scope {
 
            let parent = self.scope_lookup.get_mut(&parent_scope).unwrap();
 
            parent.child_scopes.push(new_scope);
 
        }
 

	
 
        let scope = ScopedSymbols {
 
            scope: new_scope,
 
            parent_scope,
 
            child_scopes: Vec::with_capacity(RESERVED_SYMBOLS),
 
            symbols: Vec::with_capacity(RESERVED_SYMBOLS)
 
        };
 
        self.scope_lookup.insert(new_scope, scope);
 
    }
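
    // A minimal sketch of the intended call order, with placeholder IDs: the
    // module scope is inserted without a parent, and each definition scope is
    // then nested underneath it, e.g.:
    //
    //     table.insert_scope(None, SymbolScope::Module(root_id));
    //     table.insert_scope(Some(SymbolScope::Module(root_id)), SymbolScope::Definition(definition_id));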
 

	
 
    /// Inserts a symbol into a particular scope. The symbol's name may not
    /// already exist in that scope or any of its parents. If it does collide
    /// then the new symbol is returned, together with a reference to the
    /// existing symbol that has the same name.
 
    pub(crate) fn insert_symbol(&mut self, in_scope: SymbolScope, symbol: Symbol) -> Result<(), (Symbol, &Symbol)> {
 
        debug_assert!(self.scope_lookup.contains_key(&in_scope), "inserting symbol {}, but scope {:?} does not exist", symbol.name.as_str(), in_scope);
 
        let mut seek_scope = in_scope;
 
        loop {
 
            let scoped_symbols = self.scope_lookup.get(&seek_scope).unwrap();
 
            for existing_symbol in scoped_symbols.symbols.iter() {
 
                if symbol.name == existing_symbol.name {
 
                    return Err((symbol, existing_symbol))
 
                }
 
            }
 

	
 
            match scoped_symbols.parent_scope {
 
                Some(parent_scope) => { seek_scope = parent_scope; },
 
                None => { break; }
 
            }
 
        }
 

	
 
        // If here, then there is no collision
 
        let scoped_symbols = self.scope_lookup.get_mut(&in_scope).unwrap();
 
        scoped_symbols.symbols.push(symbol);
 
        Ok(())
 
    }
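
    // A minimal sketch of handling a reported collision; `report_conflict`
    // stands in for whatever diagnostic routine ends up being used:
    //
    //     if let Err((new_symbol, old_symbol)) = table.insert_symbol(scope, symbol) {
    //         return Err(report_conflict(&new_symbol, old_symbol));
    //     }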
 

	
 
    /// Retrieves a particular scope. As this will be called by the compiler to
 
    /// retrieve scopes that MUST exist, this function will panic if the
 
    /// indicated scope does not exist.
 
    pub(crate) fn get_scope_by_id(&mut self, scope: &SymbolScope) -> &mut ScopedSymbols {
 
        debug_assert!(self.scope_lookup.contains_key(scope), "retrieving scope {:?}, but it doesn't exist", scope);
 
        self.scope_lookup.get_mut(scope).unwrap()
    }

    /// Retrieves a symbol by name by searching in a particular scope and that scope's parents. The
    /// returned symbol may be either imported into or defined within any of the searched scopes.
 
    pub(crate) fn get_symbol_by_name(
 
        &self, mut in_scope: SymbolScope, name: &[u8]
 
    ) -> Option<&Symbol> {
 
        let string_ref = StringRef::new(name);
 
        loop {
 
            let scope = self.scope_lookup.get(&in_scope);
 
            if scope.is_none() {
 
                return None;
 
            }
 
            let scope = scope.unwrap();
 

	
 
            if let Some(symbol) = scope.get_symbol(&string_ref) {
 
                return Some(symbol);
 
            } else {
 
                // Could not find symbol in current scope, seek in the parent scope if it exists
 
                match &scope.parent_scope {
 
                    Some(parent_scope) => { in_scope = *parent_scope; },
 
                    None => return None,
 
                }
 
            }
 
        }
 
    }
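
    // A minimal sketch of a lookup that starts in a definition's scope and
    // falls back to the enclosing module scope, with placeholder IDs:
    //
    //     let scope = SymbolScope::Definition(definition_id);
    //     let class = table.get_symbol_by_name(scope, b"SomeType").map(|symbol| symbol.class());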
 

	
 
    /// Retrieves a symbol by name by searching in a particular scope and that scope's parents. The
    /// returned symbol must be defined within one of the searched scopes and may not be imported.
    /// If only an imported symbol with that name exists then this function still returns `None`.
 
    pub(crate) fn get_symbol_by_name_defined_in_scope(
 
        &self, in_scope: SymbolScope, name: &[u8]
 
    ) -> Option<&Symbol> {
 
        match self.get_symbol_by_name(in_scope, name) {
 
            Some(symbol) => {
 
                match &symbol.data {
 
                    SymbolVariant::Module(_) => {
 
                        None // in-scope modules are always imported
 
                    },
 
                    SymbolVariant::Definition(variant) => {
 
                        if variant.imported_at.is_some() {
 
                            None
 
                        } else {
 
                            Some(symbol)
 
                        }
 
                    }
 
                }
 
            },
 
            None => None,
 
        }
 
    }
 
}
 
\ No newline at end of file
src/protocol/tokenizer/mod.rs
Show inline comments
 
use crate::protocol::input_source2::{
 
    InputSource2 as InputSource,
 
    ParseError,
 
    InputPosition2 as InputPosition,
 
    InputSpan
 
};
 

	
 
pub(crate) const KW_STRUCT:    &'static [u8] = b"struct";
 
pub(crate) const KW_ENUM:      &'static [u8] = b"enum";
 
pub(crate) const KW_UNION:     &'static [u8] = b"union";
 
pub(crate) const KW_FUNCTION:  &'static [u8] = b"function";
 
pub(crate) const KW_PRIMITIVE: &'static [u8] = b"primitive";
 
pub(crate) const KW_COMPOSITE: &'static [u8] = b"composite";
 
pub(crate) const KW_IMPORT:    &'static [u8] = b"import";
 

	
 
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 
pub(crate) enum TokenKind {
 
    // Variable-character tokens, followed by a SpanEnd token
 
    Ident,          // regular identifier
 
    Pragma,         // identifier prefixed with `#`, range includes the `#`
 
    Integer,        // integer literal
 
    String,         // string literal, range includes `"`
 
    Character,      // character literal, range includes `'`
 
    LineComment,    // line comment, range includes leading `//`, but not newline
 
    BlockComment,   // block comment, range includes leading `/*` and trailing `*/`
 
    // Punctuation (single character)
 
    Exclamation,    // !
 
    Question,       // ?
 
    Pound,          // #
 
    OpenAngle,      // <
 
    OpenCurly,      // {
 
    OpenParen,      // (
 
    OpenSquare,     // [
 
    CloseAngle,     // >
 
    CloseCurly,     // }
 
    CloseParen,     // )
 
    CloseSquare,    // ]
 
    Colon,          // :
 
    Comma,          // ,
 
    Dot,            // .
 
    SemiColon,      // ;
 
    Quote,          // '
 
    DoubleQuote,    // "
 
    // Operator-like (single character)
 
    At,             // @
 
    Plus,           // +
 
    Minus,          // -
 
    Star,           // *
 
    Slash,          // /
 
    Percent,        // %
 
    Caret,          // ^
 
    And,            // &
 
    Or,             // |
 
    Tilde,          // ~
 
    Equal,          // =
 
    // Punctuation (two characters)
 
    ColonColon,     // ::
 
    DotDot,         // ..
 
    ArrowRight,     // ->
 
    // Operator-like (two characters)
 
    PlusPlus,       // ++
 
    PlusEquals,     // +=
 
    MinusMinus,     // --
 
    MinusEquals,    // -=
 
    StarEquals,     // *=
 
    SlashEquals,    // /=
 
    PercentEquals,  // %=
 
    CaretEquals,    // ^=
 
    AndAnd,         // &&
 
    AndEquals,      // &=
 
    OrOr,           // ||
 
    OrEquals,       // |=
 
    EqualEqual,     // ==
 
    NotEqual,       // !=
 
    ShiftLeft,      // <<
 
    ShiftRight,     // >>
 
    // Operator-like (three characters)
 
    ShiftLeftEquals,// <<=
 
    ShiftRightEquals, // >>=
 
    // Special marker token to indicate end of variable-character tokens
 
    SpanEnd,
 
}
 

	
 
impl TokenKind {
 
    /// Returns true if the next expected token is the special `TokenKind::SpanEnd` token. This is
 
    /// the case for tokens of variable length (e.g. an identifier).
 
    fn has_span_end(&self) -> bool {
 
        return *self <= TokenKind::BlockComment
 
    }
 

	
 
    /// Returns the number of characters associated with the token. May only be called on tokens
 
    /// that do not have a variable length.
 
    fn num_characters(&self) -> u32 {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        if *self <= TokenKind::Equal {
 
            1
 
        } else if *self <= TokenKind::ShiftRight {
 
            2
 
        } else {
 
            3
 
        }
 
    }
 

	
 
    /// Returns the characters that are represented by the token, may only be called on tokens that
 
    /// do not have a variable length.
 
    pub fn token_chars(&self) -> &'static str {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        use TokenKind as TK;
 
        match self {
 
            TK::Exclamation => "!",
 
            TK::Question => "?",
 
            TK::Pound => "#",
 
            TK::OpenAngle => "<",
 
            TK::OpenCurly => "{",
 
            TK::OpenParen => "(",
 
            TK::OpenSquare => "[",
 
            TK::CloseAngle => ">",
 
            TK::CloseCurly => "}",
 
            TK::CloseParen => ")",
 
            TK::CloseSquare => "]",
 
            TK::Colon => ":",
 
            TK::Comma => ",",
 
            TK::Dot => ".",
 
            TK::SemiColon => ";",
 
            TK::Quote => "'",
 
            TK::DoubleQuote => "\"",
 
            TK::At => "@",
 
            TK::Plus => "+",
 
            TK::Minus => "-",
 
            TK::Star => "*",
 
            TK::Slash => "/",
 
            TK::Percent => "%",
 
            TK::Caret => "^",
 
            TK::And => "&",
 
            TK::Or => "|",
 
            TK::Tilde => "~",
 
            TK::Equal => "=",
 
            TK::ColonColon => "::",
 
            TK::DotDot => "..",
 
            TK::ArrowRight => "->",
 
            TK::PlusPlus => "++",
 
            TK::PlusEquals => "+=",
 
            TK::MinusMinus => "--",
 
            TK::MinusEquals => "-=",
 
            TK::StarEquals => "*=",
 
            TK::SlashEquals => "/=",
 
            TK::PercentEquals => "%=",
 
            TK::CaretEquals => "^=",
 
            TK::AndAnd => "&&",
 
            TK::AndEquals => "&=",
 
            TK::OrOr => "||",
 
            TK::OrEquals => "|=",
 
            TK::EqualEqual => "==",
 
            TK::NotEqual => "!=",
 
            TK::ShiftLeft => "<<",
 
            TK::ShiftRight => ">>",
 
            TK::ShiftLeftEquals => "<<=",
 
            TK::ShiftRightEquals => ">>=",
 
            // Let's keep these in explicitly for now, in case we want to add more symbols
 
            TK::Ident | TK::Pragma | TK::Integer | TK::String | TK::Character |
 
            TK::LineComment | TK::BlockComment | TK::SpanEnd => unreachable!(),
 
        }
 
    }
 
}
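
// A minimal sketch of the ordering invariants the two helpers above rely on:
// all variable-length tokens precede `BlockComment` in the enum, and the
// fixed-length tokens are grouped by character count. This is illustrative
// only and not called anywhere.
#[allow(dead_code)]
fn debug_check_token_kind_ordering() {
    debug_assert!(TokenKind::Ident.has_span_end());
    debug_assert!(TokenKind::BlockComment.has_span_end());
    debug_assert!(!TokenKind::Equal.has_span_end());
    debug_assert_eq!(TokenKind::Equal.num_characters(), 1);
    debug_assert_eq!(TokenKind::ShiftRight.num_characters(), 2);
    debug_assert_eq!(TokenKind::ShiftRightEquals.num_characters(), 3);
}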
 

	
 
pub(crate) struct Token {
 
    pub kind: TokenKind,
 
    pub pos: InputPosition,
 
}
 

	
 
impl Token {
 
    fn new(kind: TokenKind, pos: InputPosition) -> Self {
 
        Self{ kind, pos }
 
    }
 
}
 

	
 
#[derive(Debug, PartialEq, Eq)]
 
pub(crate) enum TokenRangeKind {
 
    Module,
 
    Pragma,
 
    Import,
 
    Definition,
 
    Code,
 
}
 

	
 
/// TODO: Add first_child and next_sibling indices for slightly faster traversal
 
#[derive(Debug)]
 
pub(crate) struct TokenRange {
 
    // Index of the parent in `TokenBuffer.ranges`. A range of kind `Module` has
    // no parent; in that case the parent index points to itself.
 
    pub parent_idx: usize,
 
    pub range_kind: TokenRangeKind,
 
    pub curly_depth: u32,
 
    // InputPosition offset is limited to u32, so token ranges can be as well.
 
    pub start: u32,             // first token (inclusive index)
 
    pub end: u32,               // last token (exclusive index)
 
    // Subranges and where they can be found
 
    pub subranges: u32,         // Number of subranges
 
    pub first_child_idx: u32,   // First subrange (or points to self if no subranges)
 
    pub last_child_idx: u32,    // Last subrange (or points to self if no subranges)
 
    pub next_sibling_idx: Option<u32>,
 
}
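
// A minimal sketch of walking the direct children of a range through the
// sibling links described above; `visit` is an illustrative callback and this
// helper is not used by the tokenizer itself.
#[allow(dead_code)]
fn for_each_child_range(ranges: &[TokenRange], parent_idx: usize, mut visit: impl FnMut(&TokenRange)) {
    let parent = &ranges[parent_idx];
    if parent.subranges == 0 {
        // Without subranges the first/last child indices point back to the
        // parent itself, so there is nothing to visit.
        return;
    }

    let mut child_idx = Some(parent.first_child_idx);
    while let Some(cur_idx) = child_idx {
        let child = &ranges[cur_idx as usize];
        visit(child);
        child_idx = child.next_sibling_idx;
    }
}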
 

	
 
pub(crate) struct TokenBuffer {
 
    pub tokens: Vec<Token>,
 
    pub ranges: Vec<TokenRange>,
 
}
 

	
 
impl TokenBuffer {
 
    pub(crate) fn new() -> Self {
 
        Self{ tokens: Vec::new(), ranges: Vec::new() }
 
    }
 

	
 
    pub(crate) fn iter_range<'a>(&'a self, range: &TokenRange) -> TokenIter<'a> {
 
        TokenIter::new(self, range.start as usize, range.end as usize)
 
    }
 

	
 
    pub(crate) fn start_pos(&self, range: &TokenRange) -> InputPosition {
 
        self.tokens[range.start as usize].pos
 
    }
 

	
 
    pub(crate) fn end_pos(&self, range: &TokenRange) -> InputPosition {
 
        let last_token = &self.tokens[(range.end - 1) as usize];
 
        if last_token.kind == TokenKind::SpanEnd {
 
            return last_token.pos
 
        } else {
 
            debug_assert!(!last_token.kind.has_span_end());
 
            return last_token.pos.with_offset(last_token.kind.num_characters());
 
        }
 
    }
 
}
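
// A minimal sketch that combines `start_pos` and `end_pos` into a single span,
// assuming `InputSpan::from_positions` as used elsewhere in the parser; the
// helper itself is illustrative only.
#[allow(dead_code)]
fn range_span(buffer: &TokenBuffer, range: &TokenRange) -> InputSpan {
    InputSpan::from_positions(buffer.start_pos(range), buffer.end_pos(range))
}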
 

	
 
pub(crate) struct TokenIter<'a> {
 
    tokens: &'a Vec<Token>,
 
    cur: usize,
 
    end: usize,
 
}
 

	
 
impl<'a> TokenIter<'a> {
 
    fn new(buffer: &'a TokenBuffer, start: usize, end: usize) -> Self {
 
        Self{ tokens: &buffer.tokens, cur: start, end }
 
    }
 

	
 
    /// Returns the next token (may include comments), or `None` if at the end
 
    /// of the range.
 
    pub(crate) fn next_including_comments(&self) -> Option<TokenKind> {
 
        if self.cur >= self.end {
 
            return None;
 
        }
 

	
 
        let token = &self.tokens[self.cur];
 
        Some(token.kind)
 
    }
 

	
 
    /// Returns the next token (but skips over comments), or `None` if at the
 
    /// end of the range
 
    pub(crate) fn next(&mut self) -> Option<TokenKind> {
 
        while let Some(token_kind) = self.next_including_comments() {
 
            if token_kind != TokenKind::LineComment && token_kind != TokenKind::BlockComment {
 
                return Some(token_kind);
 
            }
 
            self.consume();
 
        }
 

	
 
        return None
 
    }
 

	
 
    /// Returns the start position belonging to the token returned by `next`. If
 
    /// there is not a next token, then we return the end position of the
 
    /// previous token.
 
    pub(crate) fn last_valid_pos(&self) -> InputPosition {
 
        if self.cur < self.end {
 
            // Return token position
 
            return self.tokens[self.cur].pos
 
        }
 

	
 
        // Return previous token end
 
        let token = &self.tokens[self.cur - 1];
 
        return if token.kind == TokenKind::SpanEnd {
 
            token.pos
 
        } else {
 
            token.pos.with_offset(token.kind.num_characters())
 
        };
 
    }
 

	
 
    /// Returns the token range belonging to the token returned by `next`. This
 
    /// assumes that we're not at the end of the range we're iterating over.
 
    pub(crate) fn next_range(&self) -> (InputPosition, InputPosition) {
 
        debug_assert!(self.cur < self.end);
 
        let token = &self.tokens[self.cur];
 
        if token.kind.has_span_end() {
 
            let span_end = &self.tokens[self.cur + 1];
 
            debug_assert_eq!(span_end.kind, TokenKind::SpanEnd);
 
            (token.pos, span_end.pos)
 
        } else {
 
            let offset = token.kind.num_characters();
 
            (token.pos, token.pos.with_offset(offset))
 
        }
 
    }
 

	
 
    pub(crate) fn consume(&mut self) {
 
        if let Some(kind) = self.next() {
 
            if kind.has_span_end() {
 
                self.cur += 2;
 
            } else {
 
                self.cur += 1;
 
            }
 
        }
 
    }
 
}
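
// A minimal sketch of the intended iteration pattern: peek with `next`, inspect
// the token (and its range, if needed), then advance with `consume`. The
// function itself is illustrative only.
#[allow(dead_code)]
fn count_identifiers(iter: &mut TokenIter) -> usize {
    let mut count = 0;
    while let Some(kind) = iter.next() {
        if kind == TokenKind::Ident {
            count += 1;
        }
        iter.consume();
    }

    count
}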
 

	
 
/// Tokenizer is a reusable parser to tokenize multiple source files using the
/// same allocated buffers. In a well-formed program, we produce a consistent
/// tree of token ranges such that we may identify tokens that represent a
/// definition or an import before producing the entire AST.
///
/// If the program is not well-formed then the tree may be inconsistent, but we
/// will detect this once we transform the tokens into the AST. To ensure a
/// consistent AST-producing phase we require the input to have balanced
/// curly braces.
 
pub(crate) struct Tokenizer {
 
    // Stack of input positions of opening curly braces, used to detect
 
    // unmatched opening braces, unmatched closing braces are detected
 
    // immediately.
 
    curly_stack: Vec<InputPosition>,
    // Current curly-brace nesting depth, kept in sync with `curly_stack`.
    curly_depth: u32,
 
    // Points to an element in the `TokenBuffer.ranges` variable.
 
    stack_idx: usize,
 
}
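
// A minimal sketch of the intended reuse pattern: a single `Tokenizer` is used
// for all files, while each file gets its own fresh `TokenBuffer`. The
// `tokenize_all` helper is illustrative only.
#[allow(dead_code)]
fn tokenize_all(sources: &mut [InputSource]) -> Result<Vec<TokenBuffer>, ParseError> {
    let mut tokenizer = Tokenizer::new();
    let mut buffers = Vec::with_capacity(sources.len());
    for source in sources.iter_mut() {
        let mut buffer = TokenBuffer::new();
        tokenizer.tokenize(source, &mut buffer)?;
        buffers.push(buffer);
    }

    Ok(buffers)
}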
 

	
 
impl Tokenizer {
 
    pub(crate) fn new() -> Self {
 
        Self{ curly_stack: Vec::with_capacity(32), curly_depth: 0, stack_idx: 0 }
 
    }
 
    pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        // Assert source and buffer are at start
 
        debug_assert_eq!(source.pos().offset, 0);
 
        debug_assert!(target.tokens.is_empty());
 
        debug_assert!(target.ranges.is_empty());
 

	
 
        // Set up for tokenization by pushing the first range onto the stack.
 
        // This range may get transformed into the appropriate range kind later,
 
        // see `push_range` and `pop_range`.
 
        self.curly_depth = 0;
 
        self.stack_idx = 0;
 
        target.ranges.push(TokenRange{
 
            parent_idx: 0,
 
            range_kind: TokenRangeKind::Module,
 
            curly_depth: 0,
 
            start: 0,
 
            end: 0,
 
            subranges: 0,
 
            first_child_idx: 0,
 
            last_child_idx: 0,
 
            next_sibling_idx: None,
 
        });
 

	
 
        // Main tokenization loop
 
        while let Some(c) = source.next() {
 
            let token_index = target.tokens.len() as u32;
 

	
 
            if is_char_literal_start(c) {
 
                self.consume_char_literal(source, target)?;
 
            } else if is_string_literal_start(c) {
 
                self.consume_string_literal(source, target)?;
 
            } else if is_identifier_start(c) {
 
                let ident = self.consume_identifier(source, target)?;
 

	
 
                if demarks_definition(ident) {
 
                    self.push_range(target, TokenRangeKind::Definition, token_index);
 
                } else if demarks_import(ident) {
 
                    self.push_range(target, TokenRangeKind::Import, token_index);
 
                }
 
            } else if is_integer_literal_start(c) {
 
                self.consume_number(source, target)?;
 
            } else if is_pragma_start_or_pound(c) {
 
                let was_pragma = self.consume_pragma_or_pound(c, source, target)?;
 
                if was_pragma {
 
                    self.push_range(target, TokenRangeKind::Pragma, token_index);
 
                }
 
            } else if self.is_line_comment_start(c, source) {
 
                self.consume_line_comment(source, target)?;
 
            } else if self.is_block_comment_start(c, source) {
 
                self.consume_block_comment(source, target)?;
 
            } else if is_whitespace(c) {
 
                let contained_newline = self.consume_whitespace(source);
 
                if contained_newline {
 
                    let range = &target.ranges[self.stack_idx];
 
                    if range.range_kind == TokenRangeKind::Pragma {
 
                        self.pop_range(target, target.tokens.len() as u32);
 
                    }
 
                }
 
            } else {
 
                let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
 
                if let Some((token, token_pos)) = was_punctuation {
 
                    if token == TokenKind::OpenCurly {
 
                        self.curly_stack.push(token_pos);
                        self.curly_depth += 1;
 
                    } else if token == TokenKind::CloseCurly {
 
                        // Check if this marks the end of a range we're 
 
                        // currently processing
 
                        if self.curly_stack.is_empty() {
 
                            return Err(ParseError::new_error(
 
                                source, token_pos, "unmatched closing curly brace '}'"
 
                            ));
 
                        }
 

	
 
                        self.curly_stack.pop();
                        self.curly_depth -= 1;
 

	
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_depth {
 
                            self.pop_range(target, target.tokens.len() as u32);
 
                        }
 

	
 
                        // Exit early if we have more closing curly braces than
 
                        // opening curly braces
 
                    } else if token == TokenKind::SemiColon {
 
                        // Check if this marks the end of an import
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Import {
 
                            self.pop_range(target, target.tokens.len() as u32);
 
                        }
 
                    }
 
                } else {
 
                    return Err(ParseError::new_error(source, source.pos(), "unexpected character"));
 
                }
 
            }
 
        }
 

	
 
        // End of file, check if our state is correct
 
        if let Some(error) = source.had_error.take() {
 
            return Err(error);
 
        }
 

	
 
        if !self.curly_stack.is_empty() {
 
            // Let's not add a lot of heuristics and just tell the programmer
 
            // that something is wrong
 
            let last_unmatched_open = self.curly_stack.pop().unwrap();
 
            return Err(ParseError::new_error(
 
                source, last_unmatched_open, "unmatched opening curly brace '{'"
 
            ));
 
        }
 

	
 
        // TODO: @remove once I'm sure the algorithm works. For now it is better
 
        //  if the debugging is a little more expedient
 
        if cfg!(debug_assertions) {
 
            // For each range make sure its children make sense
 
            for parent_idx in 0..target.ranges.len() {
 
                let cur_range = &target.ranges[parent_idx];
 
                if cur_range.subranges == 0 {
 
                    assert_eq!(cur_range.first_child_idx, parent_idx as u32);
 
                    assert_eq!(cur_range.last_child_idx, parent_idx as u32);
 
                } else {
 
                    assert_ne!(cur_range.first_child_idx, parent_idx as u32);
 
                    assert_ne!(cur_range.last_child_idx, parent_idx as u32);
 

	
 
                    let mut child_counter = 0u32;
 
                    let mut last_child_idx = cur_range.first_child_idx;
 
                    let mut child_idx = Some(cur_range.first_child_idx);
 
                    while let Some(cur_child_idx) = child_idx {
 
                        let child_range = &target.ranges[cur_child_idx as usize];
 
                        assert_eq!(child_range.parent_idx, parent_idx);
                        last_child_idx = cur_child_idx;
                        child_idx = child_range.next_sibling_idx;
                        child_counter += 1;
 
                    }
 

	
 
                    assert_eq!(cur_range.last_child_idx, last_child_idx);
 
                    assert_eq!(cur_range.subranges, child_counter);
 
                }
 
            }
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    fn is_line_comment_start(&self, first_char: u8, source: &InputSource) -> bool {
 
        return first_char == b'/' && Some(b'/') == source.lookahead(1);
 
    }
 

	
 
    fn is_block_comment_start(&self, first_char: u8, source: &InputSource) -> bool {
 
        return first_char == b'/' && Some(b'*') == source.lookahead(1);
 
    }
 

	
 
    fn maybe_parse_punctuation(
 
        &mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer
 
    ) -> Result<Option<(TokenKind, InputPosition)>, ParseError> {
 
        debug_assert!(first_char != b'#', "'#' needs special handling");
 
        debug_assert!(first_char != b'\'', "'\'' needs special handling");
 
        debug_assert!(first_char != b'"', "'\"' needs special handling");
 

	
 
        let pos = source.pos();
 
        let token_kind;
 
        if first_char == b'!' {
 
            source.consume();
 
            if Some(b'=') == source.next() {
 
                source.consume();
 
                token_kind = TokenKind::NotEqual;
 
            } else {
 
                token_kind = TokenKind::Exclamation;
 
            }
 
        } else if first_char == b'%' {
 
            source.consume();
 
            if Some(b'=') == source.next() {
 
                source.consume();
 
                token_kind = TokenKind::PercentEquals;
 
            } else {
 
                token_kind = TokenKind::Percent;
 
            }
 
        } else if first_char == b'&' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'&') == next {
 
                source.consume();
 
                token_kind = TokenKind::AndAnd;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::AndEquals;
 
            } else {
 
                token_kind = TokenKind::And;
 
            }
 
        } else if first_char == b'(' {
 
            source.consume();
 
            token_kind = TokenKind::OpenParen;
 
        } else if first_char == b')' {
 
            source.consume();
 
            token_kind = TokenKind::CloseParen;
 
        } else if first_char == b'*' {
 
            source.consume();
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::StarEquals;
 
            } else {
 
                token_kind = TokenKind::Star;
 
            }
 
        } else if first_char == b'+' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'+') == next {
 
                source.consume();
 
                token_kind = TokenKind::PlusPlus;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::PlusEquals;
 
            } else {
 
                token_kind = TokenKind::Plus;
 
            }
 
        } else if first_char == b',' {
 
            source.consume();
 
            token_kind = TokenKind::Comma;
0 comments (0 inline, 0 general)