Changeset - ddddcd3cc9aa
[Not reviewed]
src/collections/string_pool.rs
 
@@ -8,38 +8,44 @@ const SLAB_SIZE: usize = u16::max_value() as usize;
 
#[derive(Clone)]
 
pub struct StringRef<'a> {
 
    data: *const u8,
 
    length: usize,
 
    _phantom: PhantomData<&'a [u8]>,
 
}
 

	
 
impl<'a> StringRef<'a> {
 
    /// `new` constructs a new StringRef whose data is not owned by the
 
    /// `StringPool`, hence cannot have a `'static` lifetime.
 
    pub(crate) fn new(data: &'a [u8]) -> StringRef<'a> {
 
        // This is an internal (compiler) function: so debug_assert that the
 
-        // string is valid UTF8. Most commonly the input will come from the

+        // string is valid ascii. Most commonly the input will come from the
 
        // code's source file, which is checked for ASCII-ness anyway.
 
        debug_assert!(data.is_ascii());
 
        let length = data.len();
 
        let data = data.as_ptr();
 
        StringRef{ data, length, _phantom: PhantomData }
 
    }
 

	
 
    pub fn as_str(&self) -> &'a str {
 
        unsafe {
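            // SAFETY (assumed invariants): `data` and `length` were derived
            // from a valid slice that outlives 'a -- either the borrowed
            // input (`new`) or a pool slab that is never moved or freed until
            // the pool drops -- and the bytes were checked to be ASCII, hence
            // valid UTF-8 for `from_utf8_unchecked`.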
 
            let slice = std::slice::from_raw_parts::<'a, u8>(self.data, self.length);
 
            std::str::from_utf8_unchecked(slice)
 
        }
 
    }
 

	
 
    pub fn as_bytes(&self) -> &'a [u8] {
 
        unsafe {
 
            std::slice::from_raw_parts::<'a, u8>(self.data, self.length)
 
        }
 
    }
 
}
 

	
 
impl PartialEq for StringRef<'_> {
 
    fn eq(&self, other: &StringRef) -> bool {
 
        self.as_str() == other.as_str()
 
    }
 
}
 

	
 
impl Eq for StringRef<'_> {}
 

	
 
impl Hash for StringRef<'_> {
 
    fn hash<H: Hasher>(&self, state: &mut H) {
 
@@ -70,25 +76,28 @@ pub(crate) struct StringPool {
 
}
 

	
 
impl StringPool {
 
    pub(crate) fn new() -> Self {
 
        // To have some stability we just turn a box into a raw ptr.
 
        let initial_slab = Box::new(StringPoolSlab::new(null_mut()));
 
        let initial_slab = Box::into_raw(initial_slab);
 
        StringPool{
 
            last: initial_slab,
 
        }
 
    }
 

	
 
-    // Interns a string to the StringPool, returning a reference to it

+    /// Interns a string to the `StringPool`, returning a reference to it. The

+    /// pointer owned by `StringRef` is `'static` as the `StringPool` doesn't

+    /// reallocate/deallocate until dropped (which only happens at the end of

+    /// the program).
 
    pub(crate) fn intern(&mut self, data: &[u8]) -> StringRef<'static> {
 
        // TODO: Large string allocations, if ever needed.
 
        let data_len = data.len();
 
        assert!(data_len <= SLAB_SIZE, "string is too large for slab");
 
        debug_assert!(std::str::from_utf8(data).is_ok(), "string to intern is not valid UTF-8");
 
        
 
        let mut last = unsafe{&mut *self.last};
 
        if data.len() > last.remaining {
 
            // Doesn't fit: allocate new slab
 
            self.alloc_new_slab();
 
            last = unsafe{&mut *self.last};
 
        }
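
// A minimal usage sketch (illustrative, not part of this changeset): equal
// contents intern to references that compare equal, and the returned
// `StringRef<'static>` remains valid until the pool itself is dropped.
//
//     let mut pool = StringPool::new();
//     let a = pool.intern(b"foo");
//     let b = pool.intern(b"foo");
//     assert_eq!(a, b);               // `PartialEq` compares string contents
//     assert_eq!(a.as_str(), "foo");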
src/protocol/ast.rs
 
@@ -167,40 +167,38 @@ define_new_ast_id!(LiteralExpressionId, ExpressionId, index(LiteralExpression, E
 
define_new_ast_id!(CallExpressionId, ExpressionId, index(CallExpression, Expression::Call, expressions), alloc(alloc_call_expression));
 
define_new_ast_id!(VariableExpressionId, ExpressionId, index(VariableExpression, Expression::Variable, expressions), alloc(alloc_variable_expression));
 

	
 
// TODO: @cleanup - pub qualifiers can be removed once done
 
#[derive(Debug)]
 
pub struct Heap {
 
    // Root arena, contains the entry point for different modules. Each root
 
    // contains lists of IDs that correspond to the other arenas.
 
    pub(crate) protocol_descriptions: Arena<Root>,
 
    // Contents of a file, these are the elements the `Root` elements refer to
 
    pragmas: Arena<Pragma>,
 
    pub(crate) imports: Arena<Import>,
 
    identifiers: Arena<Identifier>,
 
    pub(crate) parser_types: Arena<ParserType>,
 
    pub(crate) variables: Arena<Variable>,
 
    pub(crate) definitions: Arena<Definition>,
 
    pub(crate) statements: Arena<Statement>,
 
    pub(crate) expressions: Arena<Expression>,
 
}
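
// Usage pattern (illustrative): nodes are allocated through the typed
// `alloc_*` helpers below, which pass the final id into a closure so a node
// can store its own id; afterwards a node is reached by indexing the heap,
// e.g. `&heap[import_id]`.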
 

	
 
impl Heap {
 
    pub fn new() -> Heap {
 
        Heap {
 
            // string_alloc: StringAllocator::new(),
 
            protocol_descriptions: Arena::new(),
 
            pragmas: Arena::new(),
 
            imports: Arena::new(),
 
            identifiers: Arena::new(),
 
            parser_types: Arena::new(),
 
            variables: Arena::new(),
 
            definitions: Arena::new(),
 
            statements: Arena::new(),
 
            expressions: Arena::new(),
 
        }
 
    }
 
    pub fn alloc_memory_statement(
 
        &mut self,
 
        f: impl FnOnce(MemoryStatementId) -> MemoryStatement,
 
    ) -> MemoryStatementId {
 
        MemoryStatementId(LocalStatementId(self.statements.alloc_with_id(|id| {
 
@@ -294,70 +292,74 @@ pub struct PragmaModule {
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub enum Import {
 
    Module(ImportModule),
 
    Symbols(ImportSymbols)
 
}
 

	
 
impl Import {
 
    pub(crate) fn as_module(&self) -> &ImportModule {
 
        match self {
 
            Import::Module(m) => m,
 
-            _ => panic!("Unable to cast 'Import' to 'ImportModule'")

+            _ => unreachable!("Unable to cast 'Import' to 'ImportModule'")
 
        }
 
    }
 
    pub(crate) fn as_symbols(&self) -> &ImportSymbols {
 
        match self {
 
            Import::Symbols(m) => m,
 
-            _ => panic!("Unable to cast 'Import' to 'ImportSymbols'")

+            _ => unreachable!("Unable to cast 'Import' to 'ImportSymbols'")
 
        }
 
    }
 
    pub(crate) fn as_symbols_mut(&mut self) -> &mut ImportSymbols {
 
        match self {
 
            Import::Symbols(m) => m,
 
            _ => unreachable!("Unable to cast 'Import' to 'ImportSymbols'")
 
        }
 
    }
 
}
 

	
 
impl SyntaxElement for Import {
 
    fn position(&self) -> InputPosition {
 
        match self {
 
            Import::Module(m) => m.position,
 
            Import::Symbols(m) => m.position
 
        }
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct ImportModule {
 
    pub this: ImportId,
 
    // Phase 1: parser
 
    pub span: InputSpan,
 
-    pub module_name: Identifier,

+    pub module: Identifier,
 
    pub alias: Identifier,
 
    pub module_id: RootId,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct AliasedSymbol {
 
    // Phase 1: parser
 
    pub position: InputPosition,
 
    pub name: Identifier,
 
-    pub alias: Identifier,

+    pub alias: Option<Identifier>,
 
    pub definition_id: DefinitionId,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct ImportSymbols {
 
    pub this: ImportId,
 
    // Phase 1: parser
 
    pub span: InputSpan,
 
-    pub module_name: Vec<u8>,

-    pub module_id: Option<RootId>,

+    pub module: Identifier,

+    pub module_id: RootId,
 
    pub symbols: Vec<AliasedSymbol>,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct Identifier {
 
    pub span: InputSpan,
 
    pub value: StringRef<'static>,
 
}
 

	
 
impl PartialEq for Identifier {
 
    fn eq(&self, other: &Self) -> bool {
 
        return self.value == other.value
 
@@ -595,29 +597,27 @@ impl<'a> NamespacedIdentifierIter<'a> {
 
            None => None,
 
            Some(idx) => self.get(idx)
 
        }
 
    }
 
}
 

	
 
/// TODO: @types Remove the Message -> Byte hack at some point...
 
#[derive(Debug, Clone)]
 
pub enum ParserTypeVariant {
 
    // Basic builtin
 
    Message,
 
    Bool,
 
-    Byte,

-    Short,

-    Int,

-    Long,

-    String,

+    UInt8, UInt16, UInt32, UInt64,

+    SInt8, SInt16, SInt32, SInt64,

+    Character, String,
 
    // Literals (need to get concrete builtin type during typechecking)
 
    IntegerLiteral,
 
    Inferred,
 
    // Complex builtins
 
    Array(ParserTypeId), // array of a type
 
    Input(ParserTypeId), // typed input endpoint of a channel
 
    Output(ParserTypeId), // typed output endpoint of a channel
 
    Symbolic(SymbolicParserType), // symbolic type (definition or polyarg)
 
}
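
// Illustrative mapping (assumed surface syntax): a written type `in<u32[]>`
// becomes Input(t1) with t1 = Array(t2) and t2 = UInt32; each complex builtin
// stores the ParserTypeId of its element type.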
 

	
 
impl ParserTypeVariant {
 
    pub(crate) fn supports_polymorphic_args(&self) -> bool {
 
@@ -1070,25 +1070,24 @@ impl VariableScope for Definition {
 
        for &parameter_id in self.parameters().iter() {
 
            let parameter = &h[parameter_id];
 
            if parameter.identifier == *id {
 
                return Some(parameter_id.0);
 
            }
 
        }
 
        None
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct StructFieldDefinition {
 
    pub position: InputPosition,
 
    pub field: Identifier,
 
    pub parser_type: ParserTypeId,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct StructDefinition {
 
    pub this: StructDefinitionId,
 
    // Phase 1: parser
 
    pub span: InputSpan,
 
    pub identifier: Identifier,
 
    pub poly_vars: Vec<Identifier>,
 
    pub fields: Vec<StructFieldDefinition>
src/protocol/ast_printer.rs
 
@@ -229,34 +229,34 @@ impl ASTWriter {
 
        }
 
    }
 

	
 
    fn write_import(&mut self, heap: &Heap, import_id: ImportId, indent: usize) {
 
        let import = &heap[import_id];
 
        let indent2 = indent + 1;
 

	
 
        match import {
 
            Import::Module(import) => {
 
                self.kv(indent).with_id(PREFIX_IMPORT_ID, import.this.index)
 
                    .with_s_key("ImportModule");
 

	
 
-                self.kv(indent2).with_s_key("Name").with_ascii_val(&import.module_name);

+                self.kv(indent2).with_s_key("Name").with_ascii_val(&import.module);
 
                self.kv(indent2).with_s_key("Alias").with_ascii_val(&import.alias.value);
 
                self.kv(indent2).with_s_key("Target")
 
                    .with_opt_disp_val(import.module_id.as_ref().map(|v| &v.index));
 
            },
 
            Import::Symbols(import) => {
 
                self.kv(indent).with_id(PREFIX_IMPORT_ID, import.this.index)
 
                    .with_s_key("ImportSymbol");
 

	
 
-                self.kv(indent2).with_s_key("Name").with_ascii_val(&import.module_name);

+                self.kv(indent2).with_s_key("Name").with_ascii_val(&import.module);
 
                self.kv(indent2).with_s_key("Target")
 
                    .with_opt_disp_val(import.module_id.as_ref().map(|v| &v.index));
 

	
 
                self.kv(indent2).with_s_key("Symbols");
 

	
 
                let indent3 = indent2 + 1;
 
                let indent4 = indent3 + 1;
 
                for symbol in &import.symbols {
 
                    self.kv(indent3).with_s_key("AliasedSymbol");
 
                    self.kv(indent4).with_s_key("Name").with_ascii_val(&symbol.name.value);
 
                    self.kv(indent4).with_s_key("Alias").with_ascii_val(&symbol.alias.value);
 
                    self.kv(indent4).with_s_key("Definition")
src/protocol/input_source2.rs
 
@@ -74,28 +74,34 @@ impl InputSource2 {
 
        }
 
    }
 

	
 
    pub fn lookahead(&self, offset: usize) -> Option<u8> {
 
        let offset_pos = self.offset + offset;
 
        if offset_pos < self.input.len() {
 
            Some(self.input[offset_pos])
 
        } else {
 
            None
 
        }
 
    }
 

	
 
-    pub fn section(&self, start: InputPosition2, end: InputPosition2) -> &[u8] {

+    #[inline]

+    pub fn section_at_pos(&self, start: InputPosition2, end: InputPosition2) -> &[u8] {
 
        &self.input[start.offset as usize..end.offset as usize]
 
    }
 

	
 
    #[inline]
 
    pub fn section_at_span(&self, span: InputSpan) -> &[u8] {
 
        &self.input[span.begin.offset as usize..span.end.offset as usize]
 
    }
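
    // Illustrative use of the two accessors above (hypothetical `ident_span`):
    //     let bytes = source.section_at_span(ident_span);
    //     let same  = source.section_at_pos(ident_span.begin, ident_span.end);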
 

	
 
    // Consumes the next character. Will check well-formedness of newlines: \r
 
    // must be followed by a \n, because this is used for error reporting. Will
 
    // not check for ascii-ness of the file, better left to a tokenizer.
 
    pub fn consume(&mut self) {
 
        match self.next() {
 
            Some(b'\r') => {
 
                if Some(b'\n') == self.lookahead(1) {
 
                    // Well formed file
 
                    self.offset += 1;
 
                } else {
 
                    // Not a well-formed file, pretend like we can continue
 
                    self.offset += 1;
src/protocol/lexer.rs
 
@@ -2517,25 +2517,25 @@ impl Lexer<'_> {
 

	
 
        self.consume_whitespace(false)?;
 

	
 
        // Check for the potential aliasing or specific module imports
 
        let import = if self.has_string(b"as") {
 
            self.consume_string(b"as")?;
 
            self.consume_whitespace(true)?;
 
            let alias = self.consume_identifier()?;
 

	
 
            h.alloc_import(|this| Import::Module(ImportModule{
 
                this,
 
                position,
 
-                module_name: value,

+                module: value,
 
                alias,
 
                module_id: None,
 
            }))
 
        } else if self.has_string(b"::") {
 
            self.consume_string(b"::")?;
 
            self.consume_whitespace(false)?;
 

	
 
            let next = self.source.next();
 
            if Some(b'{') == next {
 
                let symbols = match self.consume_comma_separated(
 
                    h, b'{', b'}', "Expected end of import list",
 
                    |lexer, _heap| {
 
@@ -2566,71 +2566,71 @@ impl Lexer<'_> {
 
                                definition_id: None
 
                            })
 
                        }
 
                    }
 
                )? {
 
                    Some(symbols) => symbols,
 
                    None => unreachable!(), // because we checked for opening '{'
 
                };
 

	
 
                h.alloc_import(|this| Import::Symbols(ImportSymbols{
 
                    this,
 
                    position,
 
-                    module_name: value,

+                    module: value,
 
                    module_id: None,
 
                    symbols,
 
                }))
 
            } else if Some(b'*') == next {
 
                self.source.consume();
 
                h.alloc_import(|this| Import::Symbols(ImportSymbols{
 
                    this,
 
                    position,
 
-                    module_name: value,

+                    module: value,
 
                    module_id: None,
 
                    symbols: Vec::new()
 
                }))
 
            } else if self.has_identifier() {
 
                let position = self.source.pos();
 
                let name = self.consume_identifier()?;
 
                self.consume_whitespace(false)?;
 
                let alias = if self.has_string(b"as") {
 
                    self.consume_string(b"as")?;
 
                    self.consume_whitespace(true)?;
 
                    self.consume_identifier()?
 
                } else {
 
                    name.clone()
 
                };
 

	
 
                h.alloc_import(|this| Import::Symbols(ImportSymbols{
 
                    this,
 
                    position,
 
                    module_name: value,
 
                    module: value,
 
                    module_id: None,
 
                    symbols: vec![AliasedSymbol{
 
                        position,
 
                        name,
 
                        alias,
 
                        definition_id: None
 
                    }]
 
                }))
 
            } else {
 
                return Err(self.error_at_pos("Expected '*', '{' or a symbol name"));
 
            }
 
        } else {
 
            // No explicit alias or subimports, so implicit alias
 
            let alias_value = Vec::from(&value[last_ident_start..]);
 
            h.alloc_import(|this| Import::Module(ImportModule{
 
                this,
 
                position,
 
-                module_name: value,

+                module: value,
 
                alias: Identifier{
 
                    position: last_ident_pos,
 
                    value: Vec::from(alias_value),
 
                },
 
                module_id: None,
 
            }))
 
        };
 

	
 
        self.consume_whitespace(false)?;
 
        self.consume_string(b";")?;
 
        Ok(import)
 
    }
src/protocol/lexer2.rs
 
deleted file
src/protocol/mod.rs
 
mod arena;
 
// mod ast;
 
mod eval;
 
pub(crate) mod inputsource;
 
pub(crate) mod input_source2;
 
// mod lexer;
 
mod tokenizer;
 
mod parser;
 
#[cfg(test)] mod tests;
 

	
 
// TODO: Remove when not benchmarking
 
pub(crate) mod ast;
 
pub(crate) mod ast_printer;
 
pub(crate) mod lexer;
 
pub(crate) mod lexer2;
 

	
 
lazy_static::lazy_static! {
 
    /// Conveniently-provided protocol description initialized with a zero-length PDL string.
 
    /// Exposed to minimize repeated initializations of this common protocol description.
 
    pub static ref TRIVIAL_PD: std::sync::Arc<ProtocolDescription> = {
 
        std::sync::Arc::new(ProtocolDescription::parse(b"").unwrap())
 
    };
 
}
 

	
 
use crate::common::*;
 
use crate::protocol::ast::*;
 
use crate::protocol::eval::*;
src/protocol/parser/mod.rs
 
mod depth_visitor;
 
pub(crate) mod symbol_table;
 
pub(crate) mod symbol_table2;
 
pub(crate) mod type_table;
 
pub(crate) mod tokens;
 
pub(crate) mod token_parsing;
 
pub(crate) mod pass_tokenizer;
 
pub(crate) mod pass_symbols;
 
pub(crate) mod pass_imports;
 
pub(crate) mod pass_definitions;
 
mod type_resolver;
 
mod visitor;
 
mod visitor_linker;
 
mod utils;
 

	
 
use depth_visitor::*;
 
-use symbol_table::SymbolTable;

+use tokens::*;

+use crate::collections::*;

+use symbol_table2::SymbolTable;
 
use visitor::Visitor2;
 
use visitor_linker::ValidityAndLinkerVisitor;
 
use type_resolver::{TypeResolvingVisitor, ResolveQueue};
 
use type_table::{TypeTable, TypeCtx};
 

	
 
use crate::protocol::ast::*;
 
use crate::protocol::inputsource::*;
 
use crate::protocol::input_source2::{InputSource2 as InputSource};
 
use crate::protocol::lexer::*;
 

	
 
use std::collections::HashMap;
 
use crate::protocol::ast_printer::ASTWriter;
 

	
 
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 
pub enum ModuleCompilationPhase {
 
    Source,                 // only source is set
 
    Tokenized,              // source is tokenized
 
    SymbolsScanned,         // all definitions are linked to their type class
 
    ImportsResolved,        // all imports are added to the symbol table
 
    DefinitionsParsed,      // produced the AST for the entire module
 
    ValidatedAndLinked,     // AST is traversed and has linked the required AST nodes
 
    Typed,                  // Type inference and checking has been performed
 
}
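
// Note: the variant order is meaningful. Passes compare phases with `>=`
// (e.g. `phase >= ModuleCompilationPhase::SymbolsScanned`), which is why the
// ordering traits are derived above.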
 

	
 
pub struct Module {
 
    // Buffers
 
    source: InputSource,
 
    tokens: TokenBuffer,
 
    // Identifiers
 
    root_id: RootId,
 
    name: Option<(PragmaId, StringRef<'static>)>,
 
    version: Option<(PragmaId, i64)>,
 
    phase: ModuleCompilationPhase,
 
}
 

	
 
pub struct PassCtx<'a> {
 
    heap: &'a mut Heap,
 
    symbols: &'a mut SymbolTable,
 
    pool: &'a mut StringPool,
 
}
 

	
 
// TODO: @fixme, pub qualifier
 
pub(crate) struct LexedModule {
 
    pub(crate) source: InputSource,
 
    module_name: Vec<u8>,
 
    version: Option<u64>,
 
    pub(crate) root_id: RootId,
 
}
 

	
 
pub struct Parser {
 
    pub(crate) heap: Heap,
 
    pub(crate) modules: Vec<LexedModule>,
 
    pub(crate) module_lookup: HashMap<Vec<u8>, usize>, // from (optional) module name to `modules` idx
 
@@ -147,31 +183,31 @@ impl Parser {
 
                if import_index >= root.imports.len() {
 
                    module_index += 1;
 
                    import_index = 0;
 
                    continue
 
                }
 
                root.imports[import_index]
 
            };
 

	
 
            let import = &mut self.heap[import_id];
 
            match import {
 
                Import::Module(import) => {
 
                    debug_assert!(import.module_id.is_none(), "module import already resolved");
 
-                    let target_module_id = self.symbol_table.resolve_module(&import.module_name)

+                    let target_module_id = self.symbol_table.resolve_module(&import.module)
 
                        .expect("module import is resolved by symbol table");
 
                    import.module_id = Some(target_module_id)
 
                },
 
                Import::Symbols(import) => {
 
                    debug_assert!(import.module_id.is_none(), "module of symbol import already resolved");
 
-                    let target_module_id = self.symbol_table.resolve_module(&import.module_name)

+                    let target_module_id = self.symbol_table.resolve_module(&import.module)
 
                        .expect("symbol import's module is resolved by symbol table");
 
                    import.module_id = Some(target_module_id);
 

	
 
                    for symbol in &mut import.symbols {
 
                        debug_assert!(symbol.definition_id.is_none(), "symbol import already resolved");
 
                        let (_, target_definition_id) = self.symbol_table.resolve_identifier(module_root_id, &symbol.alias)
 
                            .expect("symbol import is resolved by symbol table")
 
                            .as_definition()
 
                            .expect("symbol import does not resolve to namespace symbol");
 
                        symbol.definition_id = Some(target_definition_id);
 
                    }
 
                }
src/protocol/parser/pass_definitions.rs
 
new file 100644
 
use crate::protocol::ast::*;
 
use super::symbol_table2::*;
 
use super::{Module, ModuleCompilationPhase, PassCtx};
 
use super::tokens::*;
 
use super::token_parsing::*;
 
use crate::protocol::input_source2::{InputSource2 as InputSource, InputSpan, ParseError};
 
use crate::collections::*;
 

	
 
/// Parses all the tokenized definitions into actual AST nodes.
 
pub(crate) struct PassDefinitions {
 
    identifiers: Vec<Identifier>,
 
    struct_fields: Vec<StructFieldDefinition>,
 
}
 

	
 
impl PassDefinitions {
 
    pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let module_range = &module.tokens.ranges[0];
 
        debug_assert_eq!(module.phase, ModuleCompilationPhase::ImportsResolved);
 
        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 

	
 
        let mut range_idx = module_range.first_child_idx;
 
        loop {
 
            let range_idx_usize = range_idx as usize;
 
            let cur_range = &module.tokens.ranges[range_idx_usize];
 

	
 
            if cur_range.range_kind == TokenRangeKind::Definition {
 
                self.visit_definition_range(modules, module_idx, ctx, range_idx_usize)?;
 
            }
 

	
 
            match cur_range.next_sibling_idx {
 
                Some(idx) => { range_idx = idx; },
 
                None => { break; },
 
            }
 
        }
 

	
 

	
 

	
 
        Ok(())
 
    }
 

	
 
    fn visit_definition_range(
 
        &mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize
 
    ) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let cur_range = &module.tokens.ranges[range_idx];
 
        debug_assert_eq!(cur_range.range_kind, TokenRangeKind::Definition);
 

	
 
        // Detect which definition we're parsing
 
        let mut iter = module.tokens.iter_range(cur_range);
 
        let keyword = peek_ident(&module.source, &mut iter).unwrap();
 
        match keyword {
 
            KW_STRUCT => {
 

	
 
            },
 
            KW_ENUM => {
 

	
 
            },
 
            KW_UNION => {
 

	
 
            },
 
            KW_FUNCTION => {
 

	
 
            },
 
            KW_PRIMITIVE => {
 

	
 
            },
 
            KW_COMPOSITE => {
 

	
 
            },
 
            _ => unreachable!("encountered keyword '{}' in definition range", String::from_utf8_lossy(keyword)),
 
        };
 

	
 
        Ok(())
 
    }
 

	
 
    fn visit_struct_definition(
 
        &mut self, module: &Module, iter: &mut TokenIter, ctx: &mut PassCtx
 
    ) -> Result<(), ParseError> {
 
        // Consume struct and name of struct
 
        let struct_span = consume_exact_ident(&module.source, iter, b"struct")?;
 
        let (ident_text, _) = consume_ident(&module.source, iter)?;
 

	
 
        // We should have preallocated the definition in the heap, retrieve its identifier
 
        let definition_id = ctx.symbols.get_symbol_by_name_defined_in_scope(SymbolScope::Module(module.root_id), ident_text)
 
            .unwrap().variant.as_definition().definition_id;
 

	
 
        consume_polymorphic_vars(&module.source, iter, ctx, &mut self.identifiers)?;
 
        debug_assert!(self.struct_fields.is_empty());
 
        consume_comma_separated(
 
            TokenKind::OpenCurly, TokenKind::CloseCurly, &module.source, iter,
 
            |source, iter| {
 
                let field = consume_ident_interned(source, iter, ctx)?;
 

	
 
                // TODO: the field's type is not parsed yet; `parser_type`
                // should come from consume_parser_type once that is finished
                Ok(StructFieldDefinition{ field, parser_type })
 
            },
 
            &mut self.struct_fields, "a struct field", "a list of struct fields"
 
        )
 
    }
 
}
 

	
 
enum TypeKind {
 
    Message,
 
    Bool,
 
    UInt8, UInt16, UInt32, UInt64,
 
    SInt8, SInt16, SInt32, SInt64,
 
    Character, String,
 
    Inferred,
 
    Array,
 
    Input,
 
    Output,
 
    SymbolicDefinition(DefinitionId),
 
    SymbolicPolyArg(DefinitionId, usize),
 
}
 

	
 
/// Consumes a type. A type always starts with an identifier which may indicate
 
/// a builtin type or a user-defined type. The fact that it may contain
 
/// polymorphic arguments makes it a tree-like structure. Because we cannot rely
 
/// on knowing the exact number of polymorphic arguments we do not check for
 
/// these.
 
fn consume_parser_type(
 
    source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx, poly_vars: &[Identifier]
 
) -> Result<(), ParseError> {
 
    struct StackEntry {
 
        angle_depth: i32,
 
    }
 
    let mut type_stack = Vec::new();
 

	
 
    Ok(())
 
}
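
// Inputs this is meant to accept (illustrative, assumed surface syntax):
//     u32                 // builtin leaf type
//     Foo<u8>[]           // user-defined type with a poly arg, in an array
//     Pair<u8, Opt<u8>>   // nested poly args: the trailing `>>` arrives as a
//                         // single ShiftRight token, hence the angle_depth
//                         // bookkeeping on the stack entries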
 

	
 
fn consume_parser_type_ident(
 
    source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx,
 
    mut scope: SymbolScope, wrapping_definition: DefinitionId, poly_vars: &[Identifier]
 
) -> Result<(TypeKind, InputSpan), ParseError> {
 
    let (type_text, type_span) = consume_any_ident(source, iter)?;
 

	
 
    let type_kind = match type_text {
 
        KW_TYPE_MESSAGE => TypeKind::Message,
 
        KW_TYPE_BOOL => TypeKind::Bool,
 
        KW_TYPE_UINT8 => TypeKind::UInt8,
 
        KW_TYPE_UINT16 => TypeKind::UInt16,
 
        KW_TYPE_UINT32 => TypeKind::UInt32,
 
        KW_TYPE_UINT64 => TypeKind::UInt64,
 
        KW_TYPE_SINT8 => TypeKind::SInt8,
 
        KW_TYPE_SINT16 => TypeKind::SInt16,
 
        KW_TYPE_SINT32 => TypeKind::SInt32,
 
        KW_TYPE_SINT64 => TypeKind::SInt64,
 
        KW_TYPE_IN_PORT => TypeKind::Input,
 
        KW_TYPE_OUT_PORT => TypeKind::Output,
 
        _ => {
 
            // Must be some kind of symbolic type
 
            let mut type_kind = None;
 
            for (poly_idx, poly_var) in poly_vars.iter().enumerate() {
 
                if poly_var.value.as_bytes() == type_text {
 
                    type_kind = Some(TypeKind::SymbolicPolyArg(wrapping_definition, poly_idx));
 
                }
 
            }
 

	
 
            if type_kind.is_none() {
 
                // Check symbol table for definition
 
                let last_symbol = ctx.symbols.get_symbol_by_name(scope, type_text);
 
                if last_symbol.is_none() {
 
                    return Err(ParseError::new_error_str_at_span(source, type_span, "unknown type"));
 
                }
 
                let last_symbol = last_symbol.unwrap();
 
                match last_symbol.variant {
 
                    SymbolVariant::Module(symbol_module) => {
 
                        // Keep seeking
 
                    },
 
                    SymbolVariant::Definition(symbol_definition) => {
 

	
 
                    }
 
                }
 
            }
 
        }
 
    }
 

	
 

	
 
    Ok(())
 
}
 

	
 
/// Consumes polymorphic variables (i.e. a list of identifiers). If the list is
 
/// absent then we simply return an empty array.
 
fn consume_polymorphic_vars(
 
    source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx, target: &mut Vec<Identifier>
 
) -> Result<(), ParseError> {
 
    // Note: because this is just a list of identifiers, we don't have to take
 
    // two `TokenKind::CloseAngle` interpreted as `TokenKind::ShiftRight` into
 
    // account.
 
    debug_assert!(target.is_empty());
 
    maybe_consume_comma_separated(
 
        TokenKind::OpenAngle, TokenKind::CloseAngle, source, iter,
 
        |source, iter| consume_ident_interned(source, iter, ctx),
 
        target, "a polymorphic variable"
 
    )?;
 

	
 
    Ok(())
 
}
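
// Illustrative: for a definition like `struct Pair<K, V> { ... }` the list
// `<K, V>` fills `target` with the identifiers `K` and `V`; without a `<...>`
// list the target vector is simply left empty.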
 
\ No newline at end of file
src/protocol/parser/pass_imports.rs
 
new file 100644
 
use crate::protocol::ast::*;
 
use super::symbol_table2::*;
 
use super::{Module, ModuleCompilationPhase, PassCtx};
 
use super::tokens::*;
 
use super::token_parsing::*;
 
use crate::protocol::input_source2::{InputSource2 as InputSource, InputSpan, ParseError};
 
use crate::collections::*;
 

	
 
/// Parses all the imports in the module tokens. Is applied after the
 
/// definitions and name of modules are resolved. Hence we should be able to
 
/// resolve all symbols to their appropriate module/definition.
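///
/// Import forms handled below (module/symbol names are illustrative):
///   `import some.module;`               implicit alias `module`
///   `import some.module as m;`          explicit module alias
///   `import some.module::Symbol;`       single symbol, optional `as` alias
///   `import some.module::{A, B as C};`  list of symbols
///   `import some.module::*;`            all symbols of the module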
 
pub(crate) struct PassImport {
 
    imports: Vec<ImportId>,
 
    found_symbols: Vec<(AliasedSymbol, SymbolDefinition)>,
 
    scoped_symbols: Vec<Symbol>,
 
}
 

	
 
impl PassImport {
 
    pub(crate) fn new() -> Self {
 
        Self{
 
            imports: Vec::with_capacity(32),
 
            found_symbols: Vec::with_capacity(32),
 
            scoped_symbols: Vec::with_capacity(32),
 
        }
 
    }
 
    pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let module_range = &module.tokens.ranges[0];
 
        debug_assert!(modules.iter().all(|m| m.phase >= ModuleCompilationPhase::SymbolsScanned));
 
        debug_assert_eq!(module.phase, ModuleCompilationPhase::SymbolsScanned);
 
        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 

	
 
        let mut range_idx = module_range.first_child_idx;
 
        loop {
 
            let range_idx_usize = range_idx as usize;
 
            let cur_range = &module.tokens.ranges[range_idx_usize];
 

	
 
            if cur_range.range_kind == TokenRangeKind::Import {
 
                self.visit_import_range(modules, module_idx, ctx, range_idx_usize)?;
 
            }
 

	
 
            match cur_range.next_sibling_idx {
 
                Some(idx) => { range_idx = idx; },
 
                None => { break; }
 
            }
 
        }
 

	
 
        let root = &mut ctx.heap[module.root_id];
 
        root.imports.extend(self.imports.drain(..));
 

	
 
        let module = &mut modules[module_idx];
 
        module.phase = ModuleCompilationPhase::ImportsResolved;
 

	
 
        Ok(())
 
    }
 

	
 
    pub(crate) fn visit_import_range(
 
        &mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize
 
    ) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let import_range = &module.tokens.ranges[range_idx];
 
        debug_assert_eq!(import_range.range_kind, TokenRangeKind::Import);
 

	
 
        let mut iter = module.tokens.iter_range(import_range);
 

	
 
        // Consume "import"
 
        let (_import_ident, import_span) =
 
            consume_any_ident(&module.source, &mut iter)?;
 
        debug_assert_eq!(_import_ident, KW_IMPORT);
 

	
 
        // Consume module name
 
        let (module_name, module_name_span) = consume_domain_ident(&module.source, &mut iter)?;
 
        let target_root_id = ctx.symbols.get_module_by_name(module_name);
 
        if target_root_id.is_none() {
 
            return Err(ParseError::new_error_at_span(
 
                &module.source, module_name_span,
 
                format!("could not resolve module '{}'", String::from_utf8_lossy(module_name))
 
            ));
 
        }
 
        let module_name = ctx.pool.intern(module_name);
 
        let module_identifier = Identifier{ span: module_name_span, value: module_name };
 
        let target_root_id = target_root_id.unwrap();
 

	
 
        // Check for subsequent characters (alias, multiple imported symbols)
 
        let next = iter.next();
 
        let import_id;
 

	
 
        if has_ident(&module.source, &mut iter, b"as") {
 
            // Alias for module
 
            iter.consume();
 
            let alias_identifier = consume_ident_interned(&module.source, &mut iter, ctx)?;
 
            let alias_name = alias_identifier.value.clone();
 

	
 
            import_id = ctx.heap.alloc_import(|this| Import::Module(ImportModule{
 
                this,
 
                span: import_span,
 
                module: module_identifier,
 
                alias: alias_identifier,
 
                module_id: target_root_id
 
            }));
 
            ctx.symbols.insert_symbol(SymbolScope::Module(module.root_id), Symbol{
 
                name: alias_name,
 
                variant: SymbolVariant::Module(SymbolModule{
 
                    root_id: target_root_id,
 
                    introduced_at: import_id,
 
                }),
 
            });
 
        } else if Some(TokenKind::ColonColon) == next {
 
            iter.consume();
 

	
 
            // Helper function to consume symbols, their alias, and the
 
            // definition the symbol is pointing to.
 
            fn consume_symbol_and_maybe_alias<'a>(
 
                source: &'a InputSource, iter: &mut TokenIter, ctx: &mut PassCtx,
 
                module_name: &StringRef<'static>, module_root_id: RootId,
 
            ) -> Result<(AliasedSymbol, SymbolDefinition), ParseError> {
 
                // Consume symbol name and make sure it points to an existing definition
 
                let symbol_identifier = consume_ident_interned(source, iter, ctx)?;
 
                let target = ctx.symbols.get_symbol_by_name_defined_in_scope(
 
                    SymbolScope::Module(module_root_id), symbol_identifier.value.as_bytes()
 
                );
 

	
 
                if target.is_none() {
 
                    return Err(ParseError::new_error_at_span(
 
                        source, symbol_identifier.span,
 
                        format!(
 
                            "could not find symbol '{}' within module '{}'",
 
                            symbol_identifier.value.as_str(), module_name.as_str()
 
                        )
 
                    ));
 
                }
 
                let target = target.unwrap();
 
                debug_assert_ne!(target.class(), SymbolClass::Module);
 
                let target_definition = target.variant.as_definition();
 

	
 
                // Consume alias text if specified
 
                let alias_identifier = if has_ident(source, iter, b"as") {
 
                    // Consume alias
 
                    iter.consume();
 
                    Some(consume_ident_interned(source, iter, ctx)?)
 
                } else {
 
                    None
 
                };
 

	
 
                Ok((
 
                    AliasedSymbol{
 
                        name: symbol_identifier,
 
                        alias: alias_identifier,
 
                        definition_id: target_definition.definition_id,
 
                    },
 
                    target_definition.clone()
 
                ))
 
            }
 

	
 
            let next = iter.next();
 

	
 
            if Some(TokenKind::Ident) == next {
 
                // Importing a single symbol
 
                iter.consume();
 
                let (imported_symbol, symbol_definition) = consume_symbol_and_maybe_alias(
 
                    &module.source, &mut iter, ctx, &module_identifier.value, target_root_id
 
                )?;
 
                let alias_identifier = imported_symbol.alias.clone().unwrap_or_else(|| imported_symbol.name.clone());
 
                import_id = ctx.heap.alloc_import(|this| Import::Symbols(ImportSymbols{
 
                    this,
 
                    span: InputSpan::from_positions(import_span.begin, alias_identifier.span.end),
 
                    module: module_identifier,
 
                    module_id: target_root_id,
 
                    symbols: vec![imported_symbol],
 
                }));
 
                if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol(
 
                    SymbolScope::Module(module.root_id),
 
                    Symbol{
 
                        name: alias_identifier.value,
 
                        variant: SymbolVariant::Definition(symbol_definition.into_imported(import_id))
 
                    }
 
                ) {
 
                    return Err(construct_symbol_conflict_error(
 
                        modules, module_idx, ctx, &new_symbol, old_symbol
 
                    ));
 
                }
 
            } else if Some(TokenKind::OpenCurly) == next {
 
                // Importing multiple symbols
 
                consume_comma_separated(
 
                    TokenKind::OpenCurly, TokenKind::CloseCurly, &module.source, &mut iter,
 
                    |source, iter| consume_symbol_and_maybe_alias(
 
                        source, iter, ctx, &module_identifier.value, target_root_id
 
                    ),
 
                    &mut self.found_symbols, "a symbol", "a list of symbols to import"
 
                )?;
 
                let end_of_list = iter.last_valid_pos();
 

	
 
                // Preallocate import
 
                import_id = ctx.heap.alloc_import(|this| Import::Symbols(ImportSymbols {
 
                    this,
 
                    span: InputSpan::from_positions(import_span.begin, end_of_list),
 
                    module: module_identifier,
 
                    module_id: target_root_id,
 
                    symbols: Vec::with_capacity(self.found_symbols.len()),
 
                }));
 

	
 
                // Fill import symbols while inserting symbols in the
 
                // appropriate scope in the symbol table.
 
                let import = ctx.heap[import_id].as_symbols_mut();
 

	
 
                for (imported_symbol, symbol_definition) in self.found_symbols.drain(..) {
 
                    let import_name = imported_symbol.alias.as_ref().map_or_else(

                        || imported_symbol.name.value.clone(),

                        |v| v.value.clone()

                    );
 
                    import.symbols.push(imported_symbol);
 
                    if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol(
 
                        SymbolScope::Module(module.root_id), Symbol{
 
                            name: import_name,
 
                            variant: SymbolVariant::Definition(symbol_definition.into_imported(import_id))
 
                        }
 
                    ) {
 
                        return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol));
 
                    }
 
                }
 
            } else if Some(TokenKind::Star) == next {
 
                // Import all symbols from the module
 
                let star_span = iter.next_span();
 

	
 
                iter.consume();
 
                self.scoped_symbols.clear();
 
                let _found = ctx.symbols.get_all_symbols_defined_in_scope(
 
                    SymbolScope::Module(target_root_id),
 
                    &mut self.scoped_symbols
 
                );
 
                debug_assert!(_found); // even modules without symbols should have a scope
 

	
 
                // Preallocate import
 
                import_id = ctx.heap.alloc_import(|this| Import::Symbols(ImportSymbols{
 
                    this,
 
                    span: InputSpan::from_positions(import_span.begin, star_span.end),
 
                    module: module_identifier,
 
                    module_id: target_root_id,
 
                    symbols: Vec::with_capacity(self.scoped_symbols.len())
 
                }));
 

	
 
                // Fill import AST node and symbol table
 
                let import = ctx.heap[import_id].as_symbols_mut();
 

	
 
                for symbol in self.scoped_symbols.drain(..) {
 
                    let symbol_name = symbol.name;
 
                    match symbol.variant {
 
                        SymbolVariant::Definition(symbol_definition) => {
 
                            import.symbols.push(AliasedSymbol{
 
                                name: Identifier{ span: star_span, value: symbol_name.clone() },
 
                                alias: None,
 
                                definition_id: symbol_definition.definition_id,
 
                            });
 

	
 
                            if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol(
 
                                SymbolScope::Module(module.root_id),
 
                                Symbol{
 
                                    name: symbol_name,
 
                                    variant: SymbolVariant::Definition(symbol_definition.into_imported(import_id))
 
                                }
 
                            ) {
 
                                return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol));
 
                            }
 
                        },
 
                        _ => unreachable!(),
 
                    }
 
                }
 
            } else {
 
                return Err(ParseError::new_error_str_at_pos(
 
                    &module.source, iter.last_valid_pos(), "expected symbol name, '{' or '*'"
 
                ));
 
            }
 
        } else {
 
            // Assume implicit alias
 
            let module_name_str = module_identifier.value.clone();
 
            let last_ident_start = module_name_str.as_str().rfind('.').map_or(0, |v| v + 1);
 
            let alias_text = &module_name_str.as_bytes()[last_ident_start..];
 
            let alias = ctx.pool.intern(alias_text);
 
            let alias_span = InputSpan::from_positions(
 
                module_name_span.begin.with_offset(last_ident_start as u32),
 
                module_name_span.end
 
            );
 
            let alias_identifier = Identifier{ span: alias_span, value: alias.clone() };
 

	
 
            import_id = ctx.heap.alloc_import(|this| Import::Module(ImportModule{
 
                this,
 
                span: InputSpan::from_positions(import_span.begin, module_identifier.span.end),
 
                module: module_identifier,
 
                alias: alias_identifier,
 
                module_id: target_root_id,
 
            }));
 
            ctx.symbols.insert_symbol(SymbolScope::Module(module.root_id), Symbol{
 
                name: alias,
 
                variant: SymbolVariant::Module(SymbolModule{
 
                    root_id: target_root_id,
 
                    introduced_at: import_id
 
                })
 
            });
 
        }
 

	
 
        // By now the `import_id` is set, just need to make sure that the import
 
        // properly ends with a semicolon
 
        consume_token(&module.source, &mut iter, TokenKind::SemiColon)?;
 
        self.imports.push(import_id);
 

	
 
        Ok(())
 
    }
 
}
src/protocol/parser/pass_symbols.rs
 
new file 100644
 
use crate::protocol::ast::*;
 
use super::symbol_table2::*;
 
use crate::protocol::input_source2::{ParseError, InputSpan};
 
use super::tokens::*;
 
use super::token_parsing::*;
 
use super::{Module, ModuleCompilationPhase, PassCtx};
 

	
 
/// Scans the module and finds all module-level type definitions. These will be
 
/// added to the symbol table such that during AST-construction we know which
 
/// identifiers point to types. Will also parse all pragmas to determine module
 
/// names.
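///
/// Pragmas recognized here (values illustrative): `#module some.module.name`
/// sets the module's name and `#version 123` its version; any other pragma is
/// rejected.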
 
pub(crate) struct PassSymbols {
 
    symbols: Vec<Symbol>,
 
    pragmas: Vec<PragmaId>,
 
    imports: Vec<ImportId>,
 
    definitions: Vec<DefinitionId>,
 
    buffer: String,
 
    has_pragma_version: bool,
 
    has_pragma_module: bool,
 
}
 

	
 
impl PassSymbols {
 
    pub(crate) fn new() -> Self {
 
        Self{
 
            symbols: Vec::with_capacity(128),
 
            pragmas: Vec::with_capacity(8),
 
            imports: Vec::with_capacity(32),
 
            definitions: Vec::with_capacity(128),
 
            buffer: String::with_capacity(128),
 
            has_pragma_version: false,
 
            has_pragma_module: false,
 
        }
 
    }
 

	
 
    fn reset(&mut self) {
 
        self.symbols.clear();
 
        self.pragmas.clear();
 
        self.imports.clear();
 
        self.definitions.clear();
 
        self.has_pragma_version = false;
 
        self.has_pragma_module = false;
 
    }
 

	
 
    pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx) -> Result<(), ParseError> {
 
        self.reset();
 

	
 
        let module = &mut modules[module_idx];
 
        let module_range = &module.tokens.ranges[0];
 

	
 
        debug_assert_eq!(module.phase, ModuleCompilationPhase::Tokenized);
 
        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 
        debug_assert!(module.root_id.is_invalid()); // not set yet,
 

	
 
        // Preallocate root in the heap
 
        let root_id = ctx.heap.alloc_protocol_description(|this| {
 
            Root{
 
                this,
 
                pragmas: Vec::new(),
 
                imports: Vec::new(),
 
                definitions: Vec::new(),
 
            }
 
        });
 
        module.root_id = root_id;
 

	
 
        // Visit token ranges to detect definitions and pragmas
 
        let mut range_idx = module_range.first_child_idx;
 
        loop {
 
            let range_idx_usize = range_idx as usize;
 
            let cur_range = &module.tokens.ranges[range_idx_usize];
 

	
 
            // Parse if it is a definition or a pragma
 
            if cur_range.range_kind == TokenRangeKind::Definition {
 
                self.visit_definition_range(modules, module_idx, ctx, range_idx_usize)?;
 
            } else if cur_range.range_kind == TokenRangeKind::Pragma {
 
                self.visit_pragma_range(modules, module_idx, ctx, range_idx_usize)?;
 
            }
 

	
 
            match cur_range.next_sibling_idx {
 
                Some(idx) => { range_idx = idx; },
 
                None => { break; },
 
            }
 
        }
 

	
 
        // Add the module's symbol scope and the symbols we just parsed
 
        let module_scope = SymbolScope::Module(root_id);
 
        ctx.symbols.insert_scope(None, module_scope);
 
        for symbol in self.symbols.drain(..) {
 
            if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol(module_scope, symbol) {
 
                return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol))
 
            }
 
        }
 

	
 
        // Modify the preallocated root
 
        let root = &mut ctx.heap[root_id];
 
        root.pragmas.extend(self.pragmas.drain(..));
 
        root.definitions.extend(self.definitions.drain(..));
 
        module.phase = ModuleCompilationPhase::SymbolsScanned;
 

	
 
        Ok(())
 
    }
 

	
 
    fn visit_pragma_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let range = &module.tokens.ranges[range_idx];
 
        let mut iter = module.tokens.iter_range(range);
 

	
 
        // Consume pragma name
 
        let (pragma_section, pragma_start, _) = consume_pragma(&module.source, &mut iter)?;
 

	
 
        // Consume pragma values
 
        if pragma_section == "#module" {
 
            // Check if name is defined twice within the same file
 
            if self.has_pragma_module {
 
                return Err(ParseError::new_error(&module.source, pragma_start, "module name is defined twice"));
 
            }
 

	
 
            // Consume the domain-name
 
            let (module_name, module_span) = consume_domain_ident(&module.source, &mut iter)?;
 
            if iter.next().is_some() {
 
                return Err(ParseError::new_error_str_at_pos(&module.source, iter.last_valid_pos(), "expected end of #module pragma after module name"));
 
            }
 

	
 
            // Add to heap and symbol table
 
            let pragma_span = InputSpan::from_positions(pragma_start, module_span.end);
 
            let module_name = ctx.pool.intern(module_name);
 
            let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Module(PragmaModule{
 
                this,
 
                span: pragma_span,
 
                value: Identifier{ span: module_span, value: module_name.clone() },
 
            }));
 
            self.pragmas.push(pragma_id);
 

	
 
            if let Err(other_module_root_id) = ctx.symbols.insert_module(module_name, module.root_id) {
 
                // Naming conflict
 
                let this_module = &modules[module_idx];
 
                let other_module = seek_module(modules, other_module_root_id).unwrap();
 
                let (other_module_pragma_id, _) = other_module.name.as_ref().unwrap();

                let other_pragma = ctx.heap[*other_module_pragma_id].as_module();
 
                return Err(ParseError::new_error_str_at_span(
 
                    &this_module.source, pragma_span, "conflict in module name"
 
                ).with_info_str_at_span(
 
                    &other_module.source, other_pragma.span, "other module is defined here"
 
                ));
 
            }
 
            self.has_pragma_module = true;
 
        } else if pragma_section == "#version" {
 
            // Check if version is defined twice within the same file
 
            if self.has_pragma_version {
 
                return Err(ParseError::new_error(&module.source, pragma_start, "module version is defined twice"));
 
            }
 

	
 
            // Consume the version pragma
 
            let (version, version_span) = consume_integer_literal(&module.source, &mut iter, &mut self.buffer)?;
 
            let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Version(PragmaVersion{
 
                this,
 
                span: InputSpan::from_positions(pragma_start, version_span.end),
 
                version,
 
            }));
 
            self.pragmas.push(pragma_id);
 
            self.has_pragma_version = true;
 
        } else {
 
            // Custom pragma, maybe we support this in the future, but for now
 
            // we don't.
 
            return Err(ParseError::new_error(&module.source, pragma_start, "illegal pragma name"));
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    fn visit_definition_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let range = &module.tokens.ranges[range_idx];
 
        let definition_span = InputSpan::from_positions(
 
            module.tokens.start_pos(range),
 
            module.tokens.end_pos(range)
 
        );
 
        let mut iter = module.tokens.iter_range(range);
 

	
 
        // First ident must be type of symbol
 
        let (kw_text, _) = consume_any_ident(&module.source, &mut iter).unwrap();
 

	
 
        // Retrieve identifier of definition
 
        let identifier = consume_ident_interned(&module.source, &mut iter, ctx)?;
 
        let ident_text = identifier.value.clone(); // because we need it later
        let identifier_span = identifier.span;     // likewise; `identifier` is moved below
 

	
 
        // Reserve space in AST for definition and add it to the symbol table
 
        let definition_class;
 
        let ast_definition_id;
 
        match kw_text {
 
            KW_STRUCT => {
 
                let struct_def_id = ctx.heap.alloc_struct_definition(|this| {
 
                    StructDefinition::new_empty(this, definition_span, identifier)
 
                });
 
                definition_class = DefinitionClass::Struct;
 
                ast_definition_id = struct_def_id.upcast();
 
            },
 
            KW_ENUM => {
 
                let enum_def_id = ctx.heap.alloc_enum_definition(|this| {
 
                    EnumDefinition::new_empty(this, definition_span, identifier)
 
                });
 
                definition_class = DefinitionClass::Enum;
 
                ast_definition_id = enum_def_id.upcast();
 
            },
 
            KW_UNION => {
 
                let union_def_id = ctx.heap.alloc_union_definition(|this| {
 
                    UnionDefinition::new_empty(this, definition_span, identifier)
 
                });
 
                definition_class = DefinitionClass::Union;
 
                ast_definition_id = union_def_id.upcast()
 
            },
 
            KW_FUNCTION => {
 
                let func_def_id = ctx.heap.alloc_function_definition(|this| {
 
                    FunctionDefinition::new_empty(this, definition_span, identifier)
 
                });
 
                definition_class = DefinitionClass::Function;
 
                ast_definition_id = func_def_id.upcast();
 
            },
 
            KW_PRIMITIVE | KW_COMPOSITE => {
 
                let component_variant = if kw_text == KW_PRIMITIVE {
 
                    ComponentVariant::Primitive
 
                } else {
 
                    ComponentVariant::Composite
 
                };
 
                let comp_def_id = ctx.heap.alloc_component_definition(|this| {
 
                    ComponentDefinition::new_empty(this, definition_span, component_variant, identifier)
 
                });
 
                definition_class = DefinitionClass::Component;
 
                ast_definition_id = comp_def_id.upcast();
 
            },
 
            _ => unreachable!("encountered keyword '{}' in definition range", String::from_utf8_lossy(kw_text)),
 
        }
 

	
 
        let symbol = Symbol{
 
            name: ident_text,
 
            variant: SymbolVariant::Definition(SymbolDefinition{
 
                defined_in_module: module.root_id,
 
                defined_in_scope: SymbolScope::Module(module.root_id),
 
                definition_span,
 
                identifier_span,
 
                imported_at: None,
 
                class: definition_class,
 
                definition_id: ast_definition_id,
 
            }),
 
        };
 
        self.symbols.push(symbol);
 
        self.definitions.push(ast_definition_id);
 

	
 
        Ok(())
 
    }
 
}
 
\ No newline at end of file
src/protocol/parser/pass_tokenizer.rs
 
file renamed from src/protocol/tokenizer/mod.rs to src/protocol/parser/pass_tokenizer.rs
 
use crate::protocol::input_source2::{
 
    InputSource2 as InputSource,
 
    ParseError,
 
    InputPosition2 as InputPosition,
 
    InputSpan
 
};
 

	
 
pub(crate) const KW_STRUCT:    &'static [u8] = b"struct";
 
pub(crate) const KW_ENUM:      &'static [u8] = b"enum";
 
pub(crate) const KW_UNION:     &'static [u8] = b"union";
 
pub(crate) const KW_FUNCTION:  &'static [u8] = b"function";
 
pub(crate) const KW_PRIMITIVE: &'static [u8] = b"primitive";
 
pub(crate) const KW_COMPOSITE: &'static [u8] = b"composite";
 
pub(crate) const KW_IMPORT:    &'static [u8] = b"import";
 

	
 
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 
pub(crate) enum TokenKind {
 
    // Variable-character tokens, followed by a SpanEnd token
 
    Ident,          // regular identifier
 
    Pragma,         // identifier with prefixed `#`, range includes `#`
 
    Integer,        // integer literal
 
    String,         // string literal, range includes `"`
 
    Character,      // character literal, range includes `'`
 
    LineComment,    // line comment, range includes leading `//`, but not newline
 
    BlockComment,   // block comment, range includes leading `/*` and trailing `*/`
 
    // Punctuation (single character)
 
    Exclamation,    // !
 
    Question,       // ?
 
    Pound,          // #
 
    OpenAngle,      // <
 
    OpenCurly,      // {
 
    OpenParen,      // (
 
    OpenSquare,     // [
 
    CloseAngle,     // >
 
    CloseCurly,     // }
 
    CloseParen,     // )
 
    CloseSquare,    // ]
 
    Colon,          // :
 
    Comma,          // ,
 
    Dot,            // .
 
    SemiColon,      // ;
 
    Quote,          // '
 
    DoubleQuote,    // "
 
    // Operator-like (single character)
 
    At,             // @
 
    Plus,           // +
 
    Minus,          // -
 
    Star,           // *
 
    Slash,          // /
 
    Percent,        // %
 
    Caret,          // ^
 
    And,            // &
 
    Or,             // |
 
    Tilde,          // ~
 
    Equal,          // =
 
    // Punctuation (two characters)
 
    ColonColon,     // ::
 
    DotDot,         // ..
 
    ArrowRight,     // ->
 
    // Operator-like (two characters)
 
    PlusPlus,       // ++
 
    PlusEquals,     // +=
 
    MinusMinus,     // --
 
    MinusEquals,    // -=
 
    StarEquals,     // *=
 
    SlashEquals,    // /=
 
    PercentEquals,  // %=
 
    CaretEquals,    // ^=
 
    AndAnd,         // &&
 
    AndEquals,      // &=
 
    OrOr,           // ||
 
    OrEquals,       // |=
 
    EqualEqual,     // ==
 
    NotEqual,       // !=
 
    ShiftLeft,      // <<
 
    ShiftRight,     // >>
 
    // Operator-like (three characters)
 
    ShiftLeftEquals,// <<=
 
    ShiftRightEquals, // >>=
 
    // Special marker token to indicate end of variable-character tokens
 
    SpanEnd,
 
}
 

	
 
impl TokenKind {
 
    /// Returns true if the next expected token is the special `TokenKind::SpanEnd` token. This is
 
    /// the case for tokens of variable length (e.g. an identifier).
 
    fn has_span_end(&self) -> bool {
 
        return *self <= TokenKind::BlockComment
 
    }
 

	
 
    /// Returns the number of characters associated with the token. May only be called on tokens
 
    /// that do not have a variable length.
 
    fn num_characters(&self) -> u32 {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        if *self <= TokenKind::Equal {
 
            1
 
        } else if *self <= TokenKind::ShiftRight {
 
            2
 
        } else {
 
            3
 
        }
 
    }
 

	
 
    /// Returns the characters that are represented by the token. May only be called on tokens that
 
    /// do not have a variable length.
 
    pub fn token_chars(&self) -> &'static str {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        use TokenKind as TK;
 
        match self {
 
            TK::Exclamation => "!",
 
            TK::Question => "?",
 
            TK::Pound => "#",
 
            TK::OpenAngle => "<",
 
            TK::OpenCurly => "{",
 
            TK::OpenParen => "(",
 
            TK::OpenSquare => "[",
 
            TK::CloseAngle => ">",
 
            TK::CloseCurly => "}",
 
            TK::CloseParen => ")",
 
            TK::CloseSquare => "]",
 
            TK::Colon => ":",
 
            TK::Comma => ",",
 
            TK::Dot => ".",
 
            TK::SemiColon => ";",
 
            TK::Quote => "'",
 
            TK::DoubleQuote => "\"",
 
            TK::At => "@",
 
            TK::Plus => "+",
 
            TK::Minus => "-",
 
            TK::Star => "*",
 
            TK::Slash => "/",
 
            TK::Percent => "%",
 
            TK::Caret => "^",
 
            TK::And => "&",
 
            TK::Or => "|",
 
            TK::Tilde => "~",
 
            TK::Equal => "=",
 
            TK::ColonColon => "::",
 
            TK::DotDot => "..",
 
            TK::ArrowRight => "->",
 
            TK::PlusPlus => "++",
 
            TK::PlusEquals => "+=",
 
            TK::MinusMinus => "--",
 
            TK::MinusEquals => "-=",
 
            TK::StarEquals => "*=",
 
            TK::SlashEquals => "/=",
 
            TK::PercentEquals => "%=",
 
            TK::CaretEquals => "^=",
 
            TK::AndAnd => "&&",
 
            TK::AndEquals => "&=",
 
            TK::OrOr => "||",
 
            TK::OrEquals => "|=",
 
            TK::EqualEqual => "==",
 
            TK::NotEqual => "!=",
 
            TK::ShiftLeft => "<<",
 
            TK::ShiftRight => ">>",
 
            TK::ShiftLeftEquals => "<<=",
 
            TK::ShiftRightEquals => ">>=",
 
            // Let's keep these explicit for now, in case we want to add more symbols
 
            TK::Ident | TK::Pragma | TK::Integer | TK::String | TK::Character |
 
            TK::LineComment | TK::BlockComment | TK::SpanEnd => unreachable!(),
 
        }
 
    }
 
}
 

	
 
pub(crate) struct Token {
 
    pub kind: TokenKind,
 
    pub pos: InputPosition,
 
}
 

	
 
impl Token {
 
    fn new(kind: TokenKind, pos: InputPosition) -> Self {
 
        Self{ kind, pos }
 
    }
 
}
 

	
 
#[derive(Debug, PartialEq, Eq)]
 
pub(crate) enum TokenRangeKind {
 
    Module,
 
    Pragma,
 
    Import,
 
    Definition,
 
    Code,
 
}
 

	
 
/// TODO: Add first_child and next_sibling indices for slightly faster traversal
 
#[derive(Debug)]
 
pub(crate) struct TokenRange {
 
    // Index of parent in `TokenBuffer.ranges`, does not have a parent if the 
 
    // range kind is Module, in that case the parent index points to itself.
 
    pub parent_idx: usize,
 
    pub range_kind: TokenRangeKind,
 
    pub curly_depth: u32,
 
    // InputPosition offsets are limited to u32, so token indices can be as well.
 
    pub start: u32,             // first token (inclusive index)
 
    pub end: u32,               // last token (exclusive index)
 
    // Subranges and where they can be found
 
    pub subranges: u32,         // Number of subranges
 
    pub first_child_idx: u32,   // First subrange (or points to self if no subranges)
 
    pub last_child_idx: u32,    // Last subrange (or points to self if no subranges)
 
    pub next_sibling_idx: Option<u32>,
 
}
 

	
 
pub(crate) struct TokenBuffer {
 
    pub tokens: Vec<Token>,
 
    pub ranges: Vec<TokenRange>,
 
}
 

	
 
impl TokenBuffer {
 
    pub(crate) fn new() -> Self {
 
        Self{ tokens: Vec::new(), ranges: Vec::new() }
 
    }
 

	
 
    pub(crate) fn iter_range<'a>(&'a self, range: &TokenRange) -> TokenIter<'a> {
 
        TokenIter::new(self, range.start as usize, range.end as usize)
 
    }
 

	
 
    pub(crate) fn start_pos(&self, range: &TokenRange) -> InputPosition {
 
        self.tokens[range.start as usize].pos
 
    }
 

	
 
    pub(crate) fn end_pos(&self, range: &TokenRange) -> InputPosition {
 
        let last_token = &self.tokens[(range.end - 1) as usize];
 
        if last_token.kind == TokenKind::SpanEnd {
 
            return last_token.pos
 
        } else {
 
            debug_assert!(!last_token.kind.has_span_end());
 
            return last_token.pos.with_offset(last_token.kind.num_characters());
 
        }
 
    }
 
}
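// Illustrative sketch, not part of the changeset: how `end_pos` resolves the
// exclusive end of a range. A variable-length token such as `Ident` occupies
// two entries -- the token itself plus a `SpanEnd` carrying the end position
// -- while fixed-length tokens derive their end from `num_characters()`:
//
//   tokens: [ Ident @ 0, SpanEnd @ 5, Comma @ 5 ]
//   start_pos(range over all three) == position 0
//   end_pos(range over all three)   == position 5 + 1 (the width of `Comma`)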
 

	
 
pub(crate) struct TokenIter<'a> {
 
    tokens: &'a Vec<Token>,
 
    cur: usize,
 
    end: usize,
 
}
 

	
 
impl<'a> TokenIter<'a> {
 
    fn new(buffer: &'a TokenBuffer, start: usize, end: usize) -> Self {
 
        Self{ tokens: &buffer.tokens, cur: start, end }
 
    }
 

	
 
    /// Peeks at the next token without advancing the iterator (comments
    /// included), or returns `None` if at the end of the range.
 
    pub(crate) fn next_including_comments(&self) -> Option<TokenKind> {
 
        if self.cur >= self.end {
 
            return None;
 
        }
 

	
 
        let token = &self.tokens[self.cur];
 
        Some(token.kind)
 
    }
 

	
 
    /// Peeks at the next non-comment token (consuming any comments it skips
    /// over), or returns `None` if at the end of the range.
 
    pub(crate) fn next(&mut self) -> Option<TokenKind> {
 
        while let Some(token_kind) = self.next_including_comments() {
 
            if token_kind != TokenKind::LineComment && token_kind != TokenKind::BlockComment {
 
                return Some(token_kind);
 
            }
 
            self.consume();
 
        }
 

	
 
        return None
 
    }
 

	
 
    /// Returns the start position belonging to the token returned by `next`. If
 
    /// there is not a next token, then we return the end position of the
 
    /// previous token.
 
    pub(crate) fn last_valid_pos(&self) -> InputPosition {
 
        if self.cur < self.end {
 
            // Return token position
 
            return self.tokens[self.cur].pos
 
        }
 

	
 
        // Return previous token end
 
        let token = &self.tokens[self.cur - 1];
 
        return if token.kind == TokenKind::SpanEnd {
 
            token.pos
 
        } else {
 
            token.pos.with_offset(token.kind.num_characters())
 
        };
 
    }
 

	
 
    /// Returns the token range belonging to the token returned by `next`. This
 
    /// assumes that we're not at the end of the range we're iterating over.
 
    pub(crate) fn next_range(&self) -> (InputPosition, InputPosition) {
 
        debug_assert!(self.cur < self.end);
 
        let token = &self.tokens[self.cur];
 
        if token.kind.has_span_end() {
 
            let span_end = &self.tokens[self.cur + 1];
 
            debug_assert_eq!(span_end.kind, TokenKind::SpanEnd);
 
            (token.pos, span_end.pos)
 
        } else {
 
            let offset = token.kind.num_characters();
 
            (token.pos, token.pos.with_offset(offset))
 
        }
 
    }
 

	
 
    pub(crate) fn consume(&mut self) {
 
        if let Some(kind) = self.next_including_comments() {
 
            if kind.has_span_end() {
 
                self.cur += 2;
 
            } else {
 
                self.cur += 1;
 
            }
 
        }
 
    }
 
}
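// Illustrative usage sketch, not part of the changeset (assumes a tokenized
// `buffer: TokenBuffer` and a `range: TokenRange` taken from it):
//
//   let mut iter = buffer.iter_range(&range);
//   while let Some(kind) = iter.next() {       // peeks, skipping comments
//       let (start, end) = iter.next_range();  // span of the peeked token
//       // ... dispatch on `kind`, e.g. slice the source at start..end ...
//       iter.consume();                        // also steps over `SpanEnd`
//   }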
 
use crate::protocol::parser::tokens::*;
 

	
 
/// Tokenizer is a reusable parser to tokenize multiple source files using the
 
/// same allocated buffers. In a well-formed program, we produce a consistent
 
/// tree of token ranges such that we may identify tokens that represent a
 
/// definition or an import before producing the entire AST.
 
///
 
/// If the program is not well-formed then the tree may be inconsistent, but we
 
/// will detect this once we transform the tokens into the AST. To ensure a
 
/// consistent AST-producing phase we require the input to have balanced
/// curly braces.
 
pub(crate) struct Tokenizer {
 
pub(crate) struct PassTokenizer {
 
    // Stack of input positions of opening curly braces, used to detect
 
    // unmatched opening braces, unmatched closing braces are detected
 
    // immediately.
 
    curly_stack: Vec<InputPosition>,
    // Current curly-brace nesting depth. Assumed field: `tokenize` and
    // `push_range` read and compare `self.curly_depth`, so it must live on
    // this struct.
    curly_depth: u32,
 
    // Points to an element in the `TokenBuffer.ranges` variable.
 
    stack_idx: usize,
 
}
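// Illustrative sketch, not part of the changeset: the range tree built for a
// small module. Indices refer to entries in `TokenBuffer.ranges`, and every
// child range lies fully within its parent:
//
//   #version 500            //  [0] Module (spans the entire file)
//   import std.reo::*;      //   +- [1] Pragma
//   struct Thing { s32 a }  //   +- [2] Import
//                           //   +- [3] Definition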
 

	
 
impl Tokenizer {
 
impl PassTokenizer {
 
    pub(crate) fn new() -> Self {
 
        Self{ curly_stack: Vec::with_capacity(32), curly_depth: 0, stack_idx: 0 }
 
    }
 
    pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        // Assert source and buffer are at start
 
        debug_assert_eq!(source.pos().offset, 0);
 
        debug_assert!(target.tokens.is_empty());
 
        debug_assert!(target.ranges.is_empty());
 

	
 
        // Set up for tokenization by pushing the first range onto the stack.
 
        // This range may get transformed into the appropriate range kind later,
 
        // see `push_range` and `pop_range`.
 
        self.curly_depth = 0;
 
        self.stack_idx = 0;
 
        target.ranges.push(TokenRange{
 
            parent_idx: 0,
 
            range_kind: TokenRangeKind::Module,
 
            curly_depth: 0,
 
            start: 0,
 
            end: 0,
 
            subranges: 0,
 
            num_child_ranges: 0,
 
            first_child_idx: 0,
 
            last_child_idx: 0,
 
            next_sibling_idx: None,
 
        });
 

	
 
        // Main tokenization loop
 
        while let Some(c) = source.next() {
 
            let token_index = target.tokens.len() as u32;
 

	
 
            if is_char_literal_start(c) {
 
                self.consume_char_literal(source, target)?;
 
            } else if is_string_literal_start(c) {
 
@@ -386,25 +84,25 @@ impl Tokenizer {
 
                if contained_newline {
 
                    let range = &target.ranges[self.stack_idx];
 
                    if range.range_kind == TokenRangeKind::Pragma {
 
                        self.pop_range(target, target.tokens.len() as u32);
 
                    }
 
                }
 
            } else {
 
                let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
 
                if let Some((token, token_pos)) = was_punctuation {
 
                    if token == TokenKind::OpenCurly {
 
                        self.curly_stack.push(token_pos);
 
                    } else if token == TokenKind::CloseCurly {
 
                        // Check if this marks the end of a range we're 
 
                        // Check if this marks the end of a range we're
 
                        // currently processing
 
                        if self.curly_stack.is_empty() {
 
                            return Err(ParseError::new_error(
 
                                source, token_pos, "unmatched closing curly brace '}'"
 
                            ));
 
                        }
 

	
 
                        self.curly_stack.pop();
 

	
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_depth {
 
                            self.pop_range(target, target.tokens.len() as u32);
 
@@ -436,43 +134,43 @@ impl Tokenizer {
 
            let last_unmatched_open = self.curly_stack.pop().unwrap();
 
            return Err(ParseError::new_error(
 
                source, last_unmatched_open, "unmatched opening curly brace '{'"
 
            ));
 
        }
 

	
 
        // TODO: @remove once I'm sure the algorithm works. For now it is better
 
        //  if the debugging is a little more expedient
 
        if cfg!(debug_assertions) {
 
            // For each range make sure its children make sense
 
            for parent_idx in 0..target.ranges.len() {
 
                let cur_range = &target.ranges[parent_idx];
 
                if cur_range.subranges == 0 {
 
                if cur_range.num_child_ranges == 0 {
 
                    assert_eq!(cur_range.first_child_idx, parent_idx as u32);
 
                    assert_eq!(cur_range.last_child_idx, parent_idx as u32);
 
                } else {
 
                    assert_ne!(cur_range.first_child_idx, parent_idx as u32);
 
                    assert_ne!(cur_range.last_child_idx, parent_idx as u32);
 

	
 
                    let mut child_counter = 0u32;
 
                    let mut last_child_idx = cur_range.first_child_idx;
 
                    let mut child_idx = Some(cur_range.first_child_idx);
 
                    while let Some(cur_child_idx) = child_idx {
 
                        let child_range = &target.ranges[cur_child_idx as usize];
 
                        assert_eq!(child_range.parent_idx, parent_idx);
                        last_child_idx = cur_child_idx; // track the final child for the check below
                        child_idx = child_range.next_sibling_idx;
 
                        child_counter += 1;
 
                    }
 

	
 
                    assert_eq!(cur_range.last_child_idx, last_child_idx);
 
                    assert_eq!(cur_range.subranges, child_counter);
 
                    assert_eq!(cur_range.num_child_ranges, child_counter);
 
                }
 
            }
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    fn is_line_comment_start(&self, first_char: u8, source: &InputSource) -> bool {
 
        return first_char == b'/' && Some(b'/') == source.lookahead(1);
 
    }
 

	
 
    fn is_block_comment_start(&self, first_char: u8, source: &InputSource) -> bool {
 
@@ -628,25 +326,25 @@ impl Tokenizer {
 
            token_kind = TokenKind::Question;
 
        } else if first_char == b'@' {
 
            source.consume();
 
            token_kind = TokenKind::At;
 
        } else if first_char == b'[' {
 
            source.consume();
 
            token_kind = TokenKind::OpenSquare;
 
        } else if first_char == b']' {
 
            source.consume();
 
            token_kind = TokenKind::CloseSquare;
 
        } else if first_char == b'^' {
 
            source.consume();
 
            if let Some(b'=') = source.next() { 
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::CaretEquals;
 
            } else {
 
                token_kind = TokenKind::Caret;
 
            }
 
        } else if first_char == b'{' {
 
            source.consume();
 
            token_kind = TokenKind::OpenCurly;
 
        } else if first_char == b'|' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'|') == next {
 
@@ -674,25 +372,25 @@ impl Tokenizer {
 
        let begin_pos = source.pos();
 

	
 
        // Consume the leading quote
 
        debug_assert!(source.next().unwrap() == b'\'');
 
        source.consume();
 

	
 
        let mut prev_char = b'\'';
 
        while let Some(c) = source.next() {
 
            if !c.is_ascii() {
 
                return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in char literal"));
 
            }
 
            source.consume();
 
            
 

	
 
            // Make sure ending quote was not escaped
 
            if c == b'\'' && prev_char != b'\\' {
 
                prev_char = c;
 
                break;
 
            }
 

	
 
            prev_char = c;
 
        }
 

	
 
        if prev_char != b'\'' {
 
            // Unterminated character literal, reached end of file.
 
            return Err(ParseError::new_error(source, begin_pos, "encountered unterminated character literal"));
 
@@ -856,25 +554,25 @@ impl Tokenizer {
 
        while let Some(c) = source.next() {
 
            if !is_identifier_remaining(c) {
 
                break;
 
            }
 

	
 
            source.consume();
 
        }
 
        self.check_ascii(source)?;
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::Ident, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 
        Ok(source.section(begin_pos, end_pos))
 
        Ok(source.section_at_pos(begin_pos, end_pos))
 
    }
 

	
 
    fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 
        debug_assert!(is_integer_literal_start(source.next().unwrap()));
 
        source.consume();
 

	
 
        // Keep reading until it doesn't look like a number anymore
 
        while let Some(c) = source.next() {
 
            if !maybe_number_remaining(c) {
 
                break;
 
            }
 
@@ -906,48 +604,48 @@ impl Tokenizer {
 
            }
 
            source.consume();
 
        }
 

	
 
        has_newline
 
    }
 

	
 
    /// Pushes a new token range onto the stack in the buffers.
 
    fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: u32) {
 
        let cur_range = &mut target.ranges[self.stack_idx];
 

	
 
        // If we have just popped a range and then push a new range, then the
 
        // first token is equal to the last token registered on the current 
 
        // range. If not, then we had some intermediate tokens that did not 
 
        // belong to a particular kind of token range: hence we insert an 
 
        // first token is equal to the last token registered on the current
 
        // range. If not, then we had some intermediate tokens that did not
 
        // belong to a particular kind of token range: hence we insert an
 
        // intermediate "code" range.
 
        if cur_range.end != first_token {
 
            let code_start = cur_range.end;
 
            let code_range_idx = target.ranges.len() as u32;
 

	
 
            if cur_range.first_child_idx == self.stack_idx as u32 {
 
                // The parent of the new "code" range we're going to push does
 
                // not have any registered children yet.
 
                cur_range.first_child_idx = code_range_idx;
 
            }
 
            cur_range.last_child_idx = code_range_idx + 1;
 

	
 
            cur_range.end = first_token;
 
            cur_range.subranges += 1;
 
            cur_range.num_child_ranges += 1;
 
            target.ranges.push(TokenRange{
 
                parent_idx: self.stack_idx,
 
                range_kind: TokenRangeKind::Code,
 
                curly_depth: self.curly_depth,
 
                start: code_start,
 
                end: first_token,
 
                subranges: 0,
 
                num_child_ranges: 0,
 
                first_child_idx: code_range_idx,
 
                last_child_idx: code_range_idx,
 
                next_sibling_idx: Some(code_range_idx + 1), // we're going to push this thing next
 
            });
 
        } else {
 
            // We're going to push the range in the code below, but while we
 
            // have the `cur_range` borrowed mutably, we fix up its children
 
            // indices.
 
            let new_range_idx = target.ranges.len() as u32;
 
            if cur_range.first_child_idx == self.stack_idx as u32 {
 
                cur_range.first_child_idx = new_range_idx;
 
            }
 
@@ -955,125 +653,125 @@ impl Tokenizer {
 
        }
 

	
 
        // Insert a new range
 
        let parent_idx = self.stack_idx;
 
        self.stack_idx = target.ranges.len();
 
        let new_range_idx = self.stack_idx as u32;
 
        target.ranges.push(TokenRange{
 
            parent_idx,
 
            range_kind,
 
            curly_depth: self.curly_depth,
 
            start: first_token,
 
            end: first_token,
 
            subranges: 0,
 
            num_child_ranges: 0,
 
            first_child_idx: new_range_idx,
 
            last_child_idx: new_range_idx,
 
            next_sibling_idx: None,
 
        });
 
    }
 

	
 
    fn pop_range(&mut self, target: &mut TokenBuffer, end_index: u32) {
 
        let last = &mut target.ranges[self.stack_idx];
 
        debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range");
 

	
 
        // Fix up the current range before going back to parent
 
        last.end = end_index;
 
        debug_assert_ne!(last.start, end_index);
 
        
 

	
 
        // Go back to parent
 
        self.stack_idx = last.parent_idx;
 
        let parent = &mut target.ranges[self.stack_idx];
 
        parent.end = end_index;
 
        parent.subranges += 1;
 
        parent.num_child_ranges += 1;
 
    }
 

	
 

	
 
    fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> {
 
        match source.next() {
 
            Some(c) if !c.is_ascii() => {
 
                Err(ParseError::new_error(source, source.pos(), "encountered a non-ASCII character"))
 
            },
 
            _else => {
 
                Ok(())
 
            },
 
        }
 
    }
 
}
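// Illustrative sketch, not part of the changeset: the intermediate "code"
// range that `push_range` inserts. Suppose tokens 0..4 formed an import and a
// definition starts at token 7; tokens 4..7 belonged to no special range:
//
//   ranges: [0] Module                        (parent of all below)
//           [1] Import      start=0, end=4
//           [2] Code        start=4, end=7    <- inserted by push_range
//           [3] Definition  start=7, end=...  <- the range actually pushed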
 

	
 
// Helpers for characters
 
fn demarks_definition(ident: &[u8]) -> bool {
 
    return
 
        ident == KW_STRUCT ||
 
        ident == KW_ENUM ||
 
        ident == KW_UNION ||
 
        ident == KW_FUNCTION ||
 
        ident == KW_PRIMITIVE ||
 
        ident == KW_COMPOSITE
 
            ident == KW_ENUM ||
 
            ident == KW_UNION ||
 
            ident == KW_FUNCTION ||
 
            ident == KW_PRIMITIVE ||
 
            ident == KW_COMPOSITE
 
}
 

	
 
fn demarks_import(ident: &[u8]) -> bool {
 
    return ident == KW_IMPORT;
 
}
 

	
 
fn is_whitespace(c: u8) -> bool {
 
    c.is_ascii_whitespace()
 
}
 

	
 
fn is_char_literal_start(c: u8) -> bool {
 
    return c == b'\'';
 
}
 

	
 
fn is_string_literal_start(c: u8) -> bool {
 
    return c == b'"';
 
}
 

	
 
fn is_pragma_start_or_pound(c: u8) -> bool {
 
    return c == b'#';
 
}
 

	
 
fn is_identifier_start(c: u8) -> bool {
 
    return 
 
    return
 
        (c >= b'a' && c <= b'z') ||
 
        (c >= b'A' && c <= b'Z') ||
 
        c == b'_' 
 
            (c >= b'A' && c <= b'Z') ||
 
            c == b'_'
 
}
 

	
 
fn is_identifier_remaining(c: u8) -> bool {
 
    return 
 
    return
 
        (c >= b'0' && c <= b'9') ||
 
        (c >= b'a' && c <= b'z') ||
 
        (c >= b'A' && c <= b'Z') ||
 
        c == b'_' 
 
            (c >= b'a' && c <= b'z') ||
 
            (c >= b'A' && c <= b'Z') ||
 
            c == b'_'
 
}
 

	
 
fn is_integer_literal_start(c: u8) -> bool {
 
    return c >= b'0' && c <= b'9';
 
}
 

	
 
fn maybe_number_remaining(c: u8) -> bool {
 
    return 
 
    return
 
        (c == b'b' || c == b'B' || c == b'o' || c == b'O' || c == b'x' || c == b'X') ||
 
        (c >= b'0' && c <= b'9') ||
 
        c == b'_';
 
            (c >= b'0' && c <= b'9') ||
 
            c == b'_';
 
}
 

	
 
#[cfg(test)]
 
mod tests {
 
    use super::*;
 

	
 
    // TODO: Remove at some point
 
    #[test]
 
    fn test_tokenizer() {
 
        let mut source = InputSource::new_test("
 
        
 

	
 
        #version 500
 
        # hello 2
 

	
 
        import std.reo::*;
 

	
 
        struct Thing {
 
            int a: 5,
 
        }
 
        enum Hello {
 
            A,
 
            B
 
        }
 
@@ -1091,41 +789,41 @@ mod tests {
 
            }
 
            /* Waza
 
            How are you doing
 
            Things in here yo
 
            /* */ */
 

	
 
            a = a + 5 * 2;
 
            struct Pressure {
 
                int d
 
            }
 
        }
 
        ");
 
        let mut t = Tokenizer::new();
 
        let mut t = PassTokenizer::new();
 
        let mut buffer = TokenBuffer::new();
 
        t.tokenize(&mut source, &mut buffer).expect("tokenize");
 

	
 
        println!("Ranges:\n");
 
        for (idx, range) in buffer.ranges.iter().enumerate() {
 
            println!("[{}] {:?}", idx, range)
 
        }
 

	
 
        println!("Tokens:\n");
 
        let mut iter = buffer.tokens.iter().enumerate();
 
        while let Some((idx, token)) = iter.next() {
 
            match token.kind {
 
                TokenKind::Ident | TokenKind::Pragma | TokenKind::Integer |
 
                TokenKind::String | TokenKind::Character | TokenKind::LineComment |
 
                TokenKind::BlockComment => {
 
                    let (_, end) = iter.next().unwrap();
 
                    println!("[{}] {:?} ......", idx, token.kind);
 
                    assert_eq!(end.kind, TokenKind::SpanEnd);
 
                    let text = source.section(token.pos, end.pos);
 
                    let text = source.section_at_pos(token.pos, end.pos);
 
                    println!("{}", String::from_utf8_lossy(text));
 
                },
 
                _ => {
 
                    println!("[{}] {:?}", idx, token.kind);
 
                }
 
            }
 
        }
 
    }
 
}
 
\ No newline at end of file
src/protocol/parser/symbol_table.rs
Show inline comments
 
@@ -114,25 +114,25 @@ impl SymbolTable {
 
        // Preparation: determine total number of imports we will be inserting
 
        // into the lookup table. We could just iterate over the arena, but then
 
        // we don't know the source file the import belongs to.
 
        let mut lookup_reserve_size = 0;
 
        for module in modules {
 
            let module_root = &heap[module.root_id];
 
            for import_id in &module_root.imports {
 
                match &heap[*import_id] {
 
                    Import::Module(_) => lookup_reserve_size += 1,
 
                    Import::Symbols(import) => {
 
                        if import.symbols.is_empty() {
 
                            // Add all symbols from the other module
 
                            match self.module_lookup.get(&import.module_name) {
 
                            match self.module_lookup.get(&import.module) {
 
                                Some(target_module_id) => {
 
                                    lookup_reserve_size += heap[*target_module_id].definitions.len()
 
                                },
 
                                None => {
 
                                    return Err(
 
                                        ParseError::new_error(&module.source, import.position, "Cannot resolve module")
 
                                    );
 
                                }
 
                            }
 
                        } else {
 
                            lookup_reserve_size += import.symbols.len();
 
                        }
 
@@ -165,47 +165,47 @@ impl SymbolTable {
 
            }
 
        }
 

	
 
        // Second pass: now that we can find symbols in modules, we can resolve
 
        // all imports (if they're correct, that is)
 
        for module in modules {
 
            let root = &heap[module.root_id];
 
            for import_id in &root.imports {
 
                let import = &heap[*import_id];
 
                match import {
 
                    Import::Module(import) => {
 
                        // Find the module using its name
 
                        let target_root_id = self.resolve_module(&import.module_name);
 
                        let target_root_id = self.resolve_module(&import.module);
 
                        if target_root_id.is_none() {
 
                            return Err(ParseError::new_error(&module.source, import.position, "Could not resolve module"));
 
                        }
 
                        let target_root_id = target_root_id.unwrap();
 
                        if target_root_id == module.root_id {
 
                            return Err(ParseError::new_error(&module.source, import.position, "Illegal import of self"));
 
                        }
 

	
 
                        // Add the target module under its alias
 
                        if let Err(previous_position) = self.add_namespace_symbol(
 
                            import.position, SymbolKey::from_identifier(module.root_id, &import.alias),
 
                            target_root_id
 
                        ) {
 
                            return Err(
 
                                ParseError::new_error(&module.source, import.position, "Symbol is multiply defined")
 
                                    .with_postfixed_info(&module.source, previous_position, "Previous definition was here")
 
                            );
 
                        }
 
                    },
 
                    Import::Symbols(import) => {
 
                        // Find the target module using its name
 
                        let target_root_id = self.resolve_module(&import.module_name);
 
                        let target_root_id = self.resolve_module(&import.module);
 
                        if target_root_id.is_none() {
 
                            return Err(ParseError::new_error(&module.source, import.position, "Could not resolve module of symbol imports"));
 
                        }
 
                        let target_root_id = target_root_id.unwrap();
 
                        if target_root_id == module.root_id {
 
                            return Err(ParseError::new_error(&module.source, import.position, "Illegal import of self"));
 
                        }
 

	
 
                        // Determine which symbols to import
 
                        if import.symbols.is_empty() {
 
                            // Import of all symbols, not using any aliases
 
                            for definition_id in &heap[target_root_id].definitions {
src/protocol/parser/symbol_table2.rs
Show inline comments
 
@@ -79,57 +79,93 @@ impl SymbolDefinition {
 

	
 
        match self {
 
            SD::Module(_) => SC::Module,
 
            SD::Struct(_) => SC::Struct,
 
            SD::Enum(_) => SC::Enum,
 
            SD::Union(_) => SC::Union,
 
            SD::Function(_) => SC::Function,
 
            SD::Component(_) => SC::Component,
 
        }
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct SymbolModule {
 
    pub root_id: RootId,
 
    pub introduced_at: ImportId,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct SymbolDefinition {
 
    // Definition location (not necessarily the place where the symbol
 
    // is introduced, as it may be imported)
 
    pub defined_in_module: RootId,
 
    pub defined_in_scope: SymbolScope,
 
    pub definition_span: InputSpan, // full span of definition
 
    pub identifier_span: InputSpan, // span of just the identifier
 
    // Location where the symbol is introduced in its scope
 
    pub imported_at: Option<ImportId>,
 
    // Definition in the heap, with a utility enum to determine its
 
    // class if the ID is not needed.
 
    pub class: DefinitionClass,
 
    pub definition_id: DefinitionId,
 
}
 

	
 
impl SymbolDefinition {
 
    /// Clones the entire data structure, but replaces the `imported_at` field
 
    /// with the supplied `ImportId`.
 
    pub(crate) fn into_imported(mut self, imported_at: ImportId) -> Self {
 
        self.imported_at = Some(imported_at);
 
        self
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub enum SymbolVariant {
 
    Module(SymbolModule),
 
    Definition(SymbolDefinition),
 
}
 

	
 
impl SymbolVariant {
 
    pub(crate) fn as_module(&self) -> &SymbolModule {
 
        match self {
 
            SymbolVariant::Module(v) => v,
 
            SymbolVariant::Definition(_) => unreachable!("called 'as_module' on {:?}", self),
 
        }
 
    }
 

	
 
    pub(crate) fn as_definition(&self) -> &SymbolDefinition {
 
        match self {
 
            SymbolVariant::Module(v) => unreachable!("called 'as_definition' on {:?}", self),
 
            SymbolVariant::Definition(v) => v,
 
        }
 
    }
 

	
 
    pub(crate) fn as_definition_mut(&mut self) -> &mut SymbolDefinition {
 
        match self {
 
            SymbolVariant::Module(v) => unreachable!("called 'as_definition_mut' on {:?}", self),
 
            SymbolVariant::Definition(v) => v,
 
        }
 
    }
 
}
 

	
 
#[derive(Clone)]
 
pub struct Symbol {
 
    pub name: StringRef<'static>,
 
    pub data: SymbolVariant,
 
    pub variant: SymbolVariant,
 
}
 

	
 
impl Symbol {
 
    fn class(&self) -> SymbolClass {
 
        match &self.data {
 
    pub(crate) fn class(&self) -> SymbolClass {
 
        match &self.variant {
 
            SymbolVariant::Module(_) => SymbolClass::Module,
 
            SymbolVariant::Definition(data) => data.class.as_symbol_class(),
 
        }
 
    }
 
}
 

	
 
pub struct SymbolTable {
 
    module_lookup: HashMap<StringRef<'static>, RootId>,
 
    scope_lookup: HashMap<SymbolScope, ScopedSymbols>,
 
}
 

	
 
impl SymbolTable {
 
@@ -227,29 +263,52 @@ impl SymbolTable {
 
            }
 
        }
 
    }
 

	
 
    /// Retrieves a symbol by name by searching a particular scope and that scope's parents. The
    /// returned symbol must be defined within one of the searched scopes; imported symbols are
    /// ignored, so if only an imported symbol matches the name this function returns `None`.
 
    pub(crate) fn get_symbol_by_name_defined_in_scope(
 
        &self, in_scope: SymbolScope, name: &[u8]
 
    ) -> Option<&Symbol> {
 
        match self.get_symbol_by_name(in_scope, name) {
 
            Some(symbol) => {
 
                match &symbol.data {
 
                match &symbol.variant {
 
                    SymbolVariant::Module(_) => {
 
                        None // in-scope modules are always imported
 
                    },
 
                    SymbolVariant::Definition(variant) => {
 
                        if variant.imported_at.is_some() {
 
                            None
 
                        } else {
 
                            Some(symbol)
 
                        }
 
                    }
 
                }
 
            },
 
            None => None,
 
        }
 
    }
 

	
 
    /// Retrieves all symbols that are defined within a particular scope. Imported symbols are
 
    /// ignored. Returns `true` if the scope was found (which may contain 0 defined symbols) and
 
    /// `false` if the scope was not found.
 
    pub(crate) fn get_all_symbols_defined_in_scope(&self, in_scope: SymbolScope, target: &mut Vec<Symbol>) -> bool {
 
        match self.scope_lookup.get(&in_scope) {
 
            Some(scope) => {
 
                for symbol in &scope.symbols {
 
                    if let SymbolVariant::Definition(definition) = &symbol.variant {
 
                        if definition.imported_at.is_some() {
 
                            continue;
 
                        }
 

	
 
                        // Defined in scope, so push onto target
 
                        target.push(symbol.clone());
 
                    }
 
                }
 

	
 
                true
 
            },
 
            None => false,
 
        }
 
    }
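    // Illustrative usage sketch, not part of the changeset (assumes a filled
    // `table: SymbolTable` and a module's `root_id: RootId`):
    //
    //   let mut defined = Vec::new();
    //   let scope_found = table.get_all_symbols_defined_in_scope(
    //       SymbolScope::Module(root_id), &mut defined);
    //   // `scope_found` is true even if the module defines nothing;
    //   // imported symbols never end up in `defined`.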
 
}
 
\ No newline at end of file
src/protocol/parser/token_parsing.rs
Show inline comments
 
new file 100644
 
use crate::collections::StringRef;
 
use crate::protocol::ast::*;
 
use crate::protocol::input_source2::{
 
    InputSource2 as InputSource,
 
    InputPosition2 as InputPosition,
 
    InputSpan,
 
    ParseError,
 
};
 
use super::tokens::*;
 
use super::symbol_table2::*;
 
use super::{Module, ModuleCompilationPhase, PassCtx};
 

	
 
// Keywords
 
pub(crate) const KW_LET:       &'static [u8] = b"let";
 
pub(crate) const KW_AS:        &'static [u8] = b"as";
 
pub(crate) const KW_STRUCT:    &'static [u8] = b"struct";
 
pub(crate) const KW_ENUM:      &'static [u8] = b"enum";
 
pub(crate) const KW_UNION:     &'static [u8] = b"union";
 
pub(crate) const KW_FUNCTION:  &'static [u8] = b"function";
 
pub(crate) const KW_PRIMITIVE: &'static [u8] = b"primitive";
 
pub(crate) const KW_COMPOSITE: &'static [u8] = b"composite";
 
pub(crate) const KW_IMPORT:    &'static [u8] = b"import";
 

	
 
// Keywords - literals
 
pub(crate) const KW_LIT_TRUE:  &'static [u8] = b"true";
 
pub(crate) const KW_LIT_FALSE: &'static [u8] = b"false";
 
pub(crate) const KW_LIT_NULL:  &'static [u8] = b"null";
 

	
 
// Keywords - functions
 
pub(crate) const KW_FUNC_GET:    &'static [u8] = b"get";
 
pub(crate) const KW_FUNC_PUT:    &'static [u8] = b"put";
 
pub(crate) const KW_FUNC_FIRES:  &'static [u8] = b"fires";
 
pub(crate) const KW_FUNC_CREATE: &'static [u8] = b"create";
 
pub(crate) const KW_FUNC_LENGTH: &'static [u8] = b"length";
 

	
 
// Keywords - statements
 
pub(crate) const KW_STMT_CHANNEL:  &'static [u8] = b"channel";
 
pub(crate) const KW_STMT_IF:       &'static [u8] = b"if";
 
pub(crate) const KW_STMT_WHILE:    &'static [u8] = b"while";
 
pub(crate) const KW_STMT_BREAK:    &'static [u8] = b"break";
 
pub(crate) const KW_STMT_CONTINUE: &'static [u8] = b"continue";
 
pub(crate) const KW_STMT_GOTO:     &'static [u8] = b"goto";
 
pub(crate) const KW_STMT_RETURN:   &'static [u8] = b"return";
 
pub(crate) const KW_STMT_SYNC:     &'static [u8] = b"synchronous";
 
pub(crate) const KW_STMT_ASSERT:   &'static [u8] = b"assert";
 
pub(crate) const KW_STMT_NEW:      &'static [u8] = b"new";
 

	
 
// Keywords - types
 
pub(crate) const KW_TYPE_IN_PORT:  &'static [u8] = b"in";
 
pub(crate) const KW_TYPE_OUT_PORT: &'static [u8] = b"out";
 
pub(crate) const KW_TYPE_MESSAGE:  &'static [u8] = b"msg";
 
pub(crate) const KW_TYPE_BOOL:     &'static [u8] = b"bool";
 
pub(crate) const KW_TYPE_UINT8:    &'static [u8] = b"u8";
 
pub(crate) const KW_TYPE_UINT16:   &'static [u8] = b"u16";
 
pub(crate) const KW_TYPE_UINT32:   &'static [u8] = b"u32";
 
pub(crate) const KW_TYPE_UINT64:   &'static [u8] = b"u64";
 
pub(crate) const KW_TYPE_SINT8:    &'static [u8] = b"s8";
pub(crate) const KW_TYPE_SINT16:   &'static [u8] = b"s16";
pub(crate) const KW_TYPE_SINT32:   &'static [u8] = b"s32";
pub(crate) const KW_TYPE_SINT64:   &'static [u8] = b"s64";
 
pub(crate) const KW_TYPE_CHAR:     &'static [u8] = b"char";
 
pub(crate) const KW_TYPE_STRING:   &'static [u8] = b"string";
 
pub(crate) const KW_TYPE_INFERRED: &'static [u8] = b"auto";
 

	
 
/// Consumes a domain-name identifier: identifiers separated by a dot. For
 
/// simplification of later parsing and span identification the domain-name may
 
/// contain whitespace, but must reside on the same line.
 
pub(crate) fn consume_domain_ident<'a>(
 
    source: &'a InputSource, iter: &mut TokenIter
 
) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    let (_, mut span) = consume_ident(source, iter)?;
 
    while let Some(TokenKind::Dot) = iter.next() {
 
        iter.consume();
 
        let (_, new_span) = consume_ident(source, iter)?;
 
        span.end = new_span.end;
 
    }
 

	
 
    // Not strictly necessary, but probably a reasonable restriction: this
 
    // simplifies parsing of module naming and imports.
 
    if span.begin.line != span.end.line {
 
        return Err(ParseError::new_error_str_at_span(source, span, "module names may not span multiple lines"));
 
    }
 

	
 
    // If module name consists of a single identifier, then it may not match any
 
    // of the reserved keywords
 
    let section = source.section_at_pos(span.begin, span.end);
 
    if is_reserved_keyword(section) {
 
        return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"));
 
    }
 

	
 
    Ok((source.section_at_pos(span.begin, span.end), span))
 
}
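// Illustrative examples for `consume_domain_ident`, not part of the changeset:
//
//   std.reo            -> Ok: identifiers joined by dots
//   std . reo          -> Ok: interior whitespace is fine on a single line
//   std.<line break>   -> Err: module names may not span multiple lines
//   struct             -> Err: a lone identifier may not be a reserved keyword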
 

	
 
/// Consumes a specific expected token. Be careful to only call this with tokens
 
/// that do not have a variable length.
 
pub(crate) fn consume_token(source: &InputSource, iter: &mut TokenIter, expected: TokenKind) -> Result<(), ParseError> {
 
    if Some(expected) != iter.next() {
 
        return Err(ParseError::new_error_at_pos(
 
            source, iter.last_valid_pos(),
 
            format!("expected '{}'", expected.token_chars())
 
        ));
 
    }
 
    iter.consume();
 
    Ok(())
 
}
 

	
 
/// Consumes a comma-separated list of items if the opening delimiting token is
 
/// encountered. If not, then the iterator will remain at its current position.
 
/// Note that the potential cases may be:
 
/// - No opening delimiter encountered, then we return `false`.
 
/// - Both opening and closing delimiter encountered, but no items.
 
/// - Opening and closing delimiter encountered, and items were processed.
 
/// - Found an opening delimiter, but processing an item failed.
 
pub(crate) fn maybe_consume_comma_separated<T, F>(
 
    open_delim: TokenKind, close_delim: TokenKind, source: &InputSource, iter: &mut TokenIter,
 
    consumer_fn: F, target: &mut Vec<T>, item_name_and_article: &'static str
 
) -> Result<bool, ParseError>
 
    where F: Fn(&InputSource, &mut TokenIter) -> Result<T, ParseError>
 
{
 
    let mut next = iter.next();
 
    if Some(open_delim) != next {
 
        return Ok(false);
 
    }
 

	
 
    // Opening delimiter encountered, so must parse the comma-separated list.
 
    iter.consume();
 
    target.clear();
 
    let mut had_comma = true;
 
    loop {
 
        next = iter.next();
 
        if Some(close_delim) == next {
 
            iter.consume();
 
            break;
 
        } else if !had_comma {
 
            return Err(ParseError::new_error_at_pos(
 
                source, iter.last_valid_pos(),
 
                format!("expected a '{}', or {}", close_delim.token_chars(), item_name_and_article)
 
            ));
 
        }
 

	
 
        let new_item = consumer_fn(source, iter)?;
 
        target.push(new_item);
 

	
 
        next = iter.next();
 
        had_comma = next == Some(TokenKind::Comma);
 
        if had_comma {
 
            iter.consume();
 
        }
 
    }
 

	
 
    Ok(true)
 
}
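// Illustrative usage sketch, not part of the changeset: parsing an optional
// parenthesized identifier list (the `idents` vector and the closure are
// hypothetical):
//
//   let mut idents = Vec::new();
//   let had_list = maybe_consume_comma_separated(
//       TokenKind::OpenParen, TokenKind::CloseParen, source, iter,
//       |source, iter| consume_ident(source, iter).map(|(text, _)| text.to_vec()),
//       &mut idents, "an identifier"
//   )?;
//   // had_list == false means no '(' was seen and `iter` was not advanced;
//   // an empty pair "()" yields Ok(true) with `idents` left empty.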
 

	
 
/// Consumes a comma-separated list and expects the opening and closing
/// delimiters to be present. The filled `target` vector may still be empty.
 
pub(crate) fn consume_comma_separated<T, F>(
 
    open_delim: TokenKind, close_delim: TokenKind, source: &InputSource, iter: &mut TokenIter,
 
    consumer_fn: F, target: &mut Vec<T>, item_name_and_article: &'static str,
 
    list_name_and_article: &'static str
 
) -> Result<(), ParseError>
 
    where F: Fn(&InputSource, &mut TokenIter) -> Result<T, ParseError>
 
{
 
    let first_pos = iter.last_valid_pos();
 
    match maybe_consume_comma_separated(open_delim, close_delim, source, iter, consumer_fn, target, item_name_and_article) {
 
        Ok(true) => Ok(()),
 
        Ok(false) => {
 
            return Err(ParseError::new_error_at_pos(
 
                source, first_pos,
 
                format!("expected a {}", list_name_and_article)
 
            ));
 
        },
 
        Err(err) => Err(err)
 
    }
 
}
 

	
 
/// Consumes an integer literal, may be binary, octal, hexadecimal or decimal,
 
/// and may have separating '_'-characters.
 
pub(crate) fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> {
 
    if Some(TokenKind::Integer) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal"));
 
    }
 
    let integer_span = iter.next_span();
 
    iter.consume();
 

	
 
    let integer_text = source.section_at_span(integer_span);
 

	
 
    // Determine radix and offset from prefix
 
    let (radix, input_offset, radix_name) =
 
        if integer_text.starts_with(b"0b") || integer_text.starts_with(b"0B") {
 
            // Binary number
 
            (2, 2, "binary")
 
        } else if integer_text.starts_with(b"0o") || integer_text.starts_with(b"0O") {
 
            // Octal number
 
            (8, 2, "octal")
 
        } else if integer_text.starts_with(b"0x") || integer_text.starts_with(b"0X") {
 
            // Hexadecimal number
 
            (16, 2, "hexadecimal")
 
        } else {
 
            (10, 0, "decimal")
 
        };
 

	
 
    // Take out any of the separating '_' characters
 
    buffer.clear();
 
    for char_idx in input_offset..integer_text.len() {
 
        let char = integer_text[char_idx];
 
        if char == b'_' {
 
            continue;
 
        }
 
        // Radix-specific digit validation (e.g. rejecting 'f' in a decimal
        // literal) is deferred to `from_str_radix` below; checking
        // `is_ascii_digit` here would wrongly reject hexadecimal digits a-f.
        if !char.is_ascii_alphanumeric() {
 
            return Err(ParseError::new_error_at_span(
 
                source, integer_span,
 
                format!("incorrectly formatted {} number", radix_name)
 
            ));
 
        }
 
        buffer.push(char::from(char));
 
    }
 

	
 
    // Use the cleaned up string to convert to integer
 
    match u64::from_str_radix(&buffer, radix) {
 
        Ok(number) => Ok((number, integer_span)),
 
        Err(_) => Err(ParseError::new_error_at_span(
 
            source, integer_span,
 
            format!("incorrectly formatted {} number", radix_name)
 
        )),
 
    }
 
}
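// Worked example, not part of the changeset: for the literal text "0x1_F" the
// prefix "0x" selects radix 16 and offset 2, stripping the '_' separator
// copies "1F" into `buffer`, and u64::from_str_radix("1F", 16) yields Ok(31).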
 

	
 
pub(crate) fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> {
 
    if Some(TokenKind::Pragma) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a pragma"));
 
    }
 
    let (pragma_start, pragma_end) = iter.next_positions();
 
    iter.consume();
 
    Ok((source.section_at_pos(pragma_start, pragma_end), pragma_start, pragma_end))
 
}
 

	
 
pub(crate) fn has_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> bool {
 
    peek_ident(source, iter).map_or(false, |section| section == expected)
 
}
 

	
 
pub(crate) fn peek_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Option<&'a [u8]> {
 
    if Some(TokenKind::Ident) == iter.next() {
 
        let (start, end) = iter.next_positions();
 
        return Some(source.section_at_pos(start, end))
 
    }
 

	
 
    None
 
}
 

	
 
/// Consumes any identifier and returns it together with its span. Does not
 
/// check if the identifier is a reserved keyword.
 
pub(crate) fn consume_any_ident<'a>(
 
    source: &'a InputSource, iter: &mut TokenIter
 
) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    if Some(TokenKind::Ident) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an identifier"));
 
    }
 
    let (ident_start, ident_end) = iter.next_positions();
 
    iter.consume();
 
    Ok((source.section_at_pos(ident_start, ident_end), InputSpan::from_positions(ident_start, ident_end)))
 
}
 

	
 
/// Consumes a specific identifier. May or may not be a reserved keyword.
 
pub(crate) fn consume_exact_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> Result<InputSpan, ParseError> {
 
    let (ident, span) = consume_any_ident(source, iter)?;
 
    if ident != expected {
 
        debug_assert!(expected.is_ascii());
 
        return Err(ParseError::new_error_at_pos(
 
            source, iter.last_valid_pos(),
 
            format!("expected the text '{}'", &String::from_utf8_lossy(expected))
 
        ));
 
    }
 
    Ok(span)
 
}
 

	
 
/// Consumes an identifier that is not a reserved keyword and returns it
 
/// together with its span.
 
pub(crate) fn consume_ident<'a>(
 
    source: &'a InputSource, iter: &mut TokenIter
 
) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    let (ident, span) = consume_any_ident(source, iter)?;
 
    if is_reserved_keyword(ident) {
 
        return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"));
 
    }
 

	
 
    Ok((ident, span))
 
}
 

	
 
/// Consumes an identifier and immediately interns it into the `StringPool`.
 
pub(crate) fn consume_ident_interned(
 
    source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx
 
) -> Result<Identifier, ParseError> {
 
    let (value, span) = consume_ident(source, iter)?;
 
    let value = ctx.pool.intern(value);
 
    Ok(Identifier{ span, value })
 
}
 

	
 
fn is_reserved_definition_keyword(text: &[u8]) -> bool {
 
    match text {
 
        KW_STRUCT | KW_ENUM | KW_UNION | KW_FUNCTION | KW_PRIMITIVE | KW_COMPOSITE => true,
 
        _ => false,
 
    }
 
}
 

	
 
fn is_reserved_statement_keyword(text: &[u8]) -> bool {
 
    match text {
 
        KW_IMPORT | KW_AS |
 
        KW_STMT_CHANNEL | KW_STMT_IF | KW_STMT_WHILE |
 
        KW_STMT_BREAK | KW_STMT_CONTINUE | KW_STMT_GOTO | KW_STMT_RETURN |
 
        KW_STMT_SYNC | KW_STMT_ASSERT | KW_STMT_NEW => true,
 
        _ => false,
 
    }
 
}
 

	
 
fn is_reserved_expression_keyword(text: &[u8]) -> bool {
 
    match text {
 
        KW_LET |
 
        KW_LIT_TRUE | KW_LIT_FALSE | KW_LIT_NULL |
 
        KW_FUNC_GET | KW_FUNC_PUT | KW_FUNC_FIRES | KW_FUNC_CREATE | KW_FUNC_LENGTH => true,
 
        _ => false,
 
    }
 
}
 

	
 
fn is_reserved_type_keyword(text: &[u8]) -> bool {
 
    match text {
 
        KW_TYPE_IN_PORT | KW_TYPE_OUT_PORT | KW_TYPE_MESSAGE | KW_TYPE_BOOL |
 
        KW_TYPE_UINT8 | KW_TYPE_UINT16 | KW_TYPE_UINT32 | KW_TYPE_UINT64 |
 
        KW_TYPE_SINT8 | KW_TYPE_SINT16 | KW_TYPE_SINT32 | KW_TYPE_SINT64 |
 
        KW_TYPE_CHAR | KW_TYPE_STRING |
 
        KW_TYPE_INFERRED => true,
 
        _ => false,
 
    }
 
}
 

	
 
fn is_reserved_keyword(text: &[u8]) -> bool {
 
    return
 
        is_reserved_definition_keyword(text) ||
 
        is_reserved_statement_keyword(text) ||
 
        is_reserved_expression_keyword(text) ||
 
        is_reserved_type_keyword(text);
 
}
 

	
 
pub(crate) fn seek_module(modules: &[Module], root_id: RootId) -> Option<&Module> {
 
    for module in modules {
 
        if module.root_id == root_id {
 
            return Some(module)
 
        }
 
    }
 

	
 
    return None
 
}
 

	
 
/// Constructs a human-readable message indicating why there is a conflict of
 
/// symbols.
 
// Note: passing the `module_idx` is not strictly necessary, but will prevent
 
// programmer mistakes during development: we get a conflict because we're
 
// currently parsing a particular module.
 
pub(crate) fn construct_symbol_conflict_error(
 
    modules: &[Module], module_idx: usize, ctx: &PassCtx, new_symbol: &Symbol, old_symbol: &Symbol
 
) -> ParseError {
 
    let module = &modules[module_idx];
 
    let get_symbol_span_and_msg = |symbol: &Symbol| -> (String, InputSpan) {
        // A module symbol is always introduced by an import; a definition
        // symbol only carries an import ID if it was brought into scope by one.
        let imported_at = match &symbol.variant {
            SymbolVariant::Module(module) => Some(module.introduced_at),
            SymbolVariant::Definition(definition) => definition.imported_at,
        };

        match imported_at {
            Some(import_id) => {
                // Symbol is being imported
                let import = &ctx.heap[import_id];
                match import {
                    Import::Module(import) => (
                        format!("the module aliased as '{}' imported here", symbol.name.as_str()),
                        import.span
                    ),
                    Import::Symbols(symbols) => (
                        format!("the type '{}' imported here", symbol.name.as_str()),
                        symbols.span
                    ),
                }
            },
            None => {
                // Symbol is being defined in the module itself
                let definition = symbol.variant.as_definition();
                debug_assert_eq!(definition.defined_in_module, module.root_id);
                (
                    format!("the type '{}' defined here", symbol.name.as_str()),
                    definition.identifier_span
                )
            }
        }
    };
 

	
 
    let (new_symbol_msg, new_symbol_span) = get_symbol_span_and_msg(new_symbol);
 
    let (old_symbol_msg, old_symbol_span) = get_symbol_span_and_msg(old_symbol);
 
    return ParseError::new_error_at_span(
 
        &module.source, new_symbol_span, format!("symbol is defined twice: {}", new_symbol_msg)
 
    ).with_info_at_span(
 
        &module.source, old_symbol_span, format!("it conflicts with {}", old_symbol_msg)
 
    )
 
}
 
\ No newline at end of file
src/protocol/parser/tokens.rs
Show inline comments
 
new file 100644
 
use crate::protocol::input_source2::{
 
    InputPosition2 as InputPosition,
 
    InputSpan
 
};
 

	
 
/// Represents a particular kind of token. Some tokens represent
 
/// variable-character tokens. Such a token is always followed by a
 
/// `TokenKind::SpanEnd` token.
 
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 
pub(crate) enum TokenKind {
 
    // Variable-character tokens, followed by a SpanEnd token
 
    Ident,          // regular identifier
 
    Pragma,         // identifier with prefixed `#`, range includes `#`
 
    Integer,        // integer literal
 
    String,         // string literal, range includes `"`
 
    Character,      // character literal, range includes `'`
 
    LineComment,    // line comment, range includes leading `//`, but not newline
 
    BlockComment,   // block comment, range includes leading `/*` and trailing `*/`
 
    // Punctuation (single character)
 
    Exclamation,    // !
 
    Question,       // ?
 
    Pound,          // #
 
    OpenAngle,      // <
 
    OpenCurly,      // {
 
    OpenParen,      // (
 
    OpenSquare,     // [
 
    CloseAngle,     // >
 
    CloseCurly,     // }
 
    CloseParen,     // )
 
    CloseSquare,    // ]
 
    Colon,          // :
 
    Comma,          // ,
 
    Dot,            // .
 
    SemiColon,      // ;
 
    Quote,          // '
 
    DoubleQuote,    // "
 
    // Operator-like (single character)
 
    At,             // @
 
    Plus,           // +
 
    Minus,          // -
 
    Star,           // *
 
    Slash,          // /
 
    Percent,        // %
 
    Caret,          // ^
 
    And,            // &
 
    Or,             // |
 
    Tilde,          // ~
 
    Equal,          // =
 
    // Punctuation (two characters)
 
    ColonColon,     // ::
 
    DotDot,         // ..
 
    ArrowRight,     // ->
 
    // Operator-like (two characters)
 
    PlusPlus,       // ++
 
    PlusEquals,     // +=
 
    MinusMinus,     // --
 
    MinusEquals,    // -=
 
    StarEquals,     // *=
 
    SlashEquals,    // /=
 
    PercentEquals,  // %=
 
    CaretEquals,    // ^=
 
    AndAnd,         // &&
 
    AndEquals,      // &=
 
    OrOr,           // ||
 
    OrEquals,       // |=
 
    EqualEqual,     // ==
 
    NotEqual,       // !=
 
    ShiftLeft,      // <<
 
    ShiftRight,     // >>
 
    // Operator-like (three characters)
 
    ShiftLeftEquals,// <<=
 
    ShiftRightEquals, // >>=
 
    // Special marker token to indicate end of variable-character tokens
 
    SpanEnd,
 
}
 

	
 
impl TokenKind {
 
    /// Returns true if the next expected token is the special `TokenKind::SpanEnd` token. This is
 
    /// the case for tokens of variable length (e.g. an identifier).
 
    fn has_span_end(&self) -> bool {
 
        return *self <= TokenKind::BlockComment
 
    }
 

	
 
    /// Returns the number of characters associated with the token. May only be called on tokens
 
    /// that do not have a variable length.
 
    fn num_characters(&self) -> u32 {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        if *self <= TokenKind::Equal {
 
            1
 
        } else if *self <= TokenKind::ShiftRight {
 
            2
 
        } else {
 
            3
 
        }
 
    }
 

	
 
    /// Returns the characters that are represented by the token. May only be called on tokens that
 
    /// do not have a variable length.
 
    pub fn token_chars(&self) -> &'static str {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        use TokenKind as TK;
 
        match self {
 
            TK::Exclamation => "!",
 
            TK::Question => "?",
 
            TK::Pound => "#",
 
            TK::OpenAngle => "<",
 
            TK::OpenCurly => "{",
 
            TK::OpenParen => "(",
 
            TK::OpenSquare => "[",
 
            TK::CloseAngle => ">",
 
            TK::CloseCurly => "}",
 
            TK::CloseParen => ")",
 
            TK::CloseSquare => "]",
 
            TK::Colon => ":",
 
            TK::Comma => ",",
 
            TK::Dot => ".",
 
            TK::SemiColon => ";",
 
            TK::Quote => "'",
 
            TK::DoubleQuote => "\"",
 
            TK::At => "@",
 
            TK::Plus => "+",
 
            TK::Minus => "-",
 
            TK::Star => "*",
 
            TK::Slash => "/",
 
            TK::Percent => "%",
 
            TK::Caret => "^",
 
            TK::And => "&",
 
            TK::Or => "|",
 
            TK::Tilde => "~",
 
            TK::Equal => "=",
 
            TK::ColonColon => "::",
 
            TK::DotDot => "..",
 
            TK::ArrowRight => "->",
 
            TK::PlusPlus => "++",
 
            TK::PlusEquals => "+=",
 
            TK::MinusMinus => "--",
 
            TK::MinusEquals => "-=",
 
            TK::StarEquals => "*=",
 
            TK::SlashEquals => "/=",
 
            TK::PercentEquals => "%=",
 
            TK::CaretEquals => "^=",
 
            TK::AndAnd => "&&",
 
            TK::AndEquals => "&=",
 
            TK::OrOr => "||",
 
            TK::OrEquals => "|=",
 
            TK::EqualEqual => "==",
 
            TK::NotEqual => "!=",
 
            TK::ShiftLeft => "<<",
 
            TK::ShiftRight => ">>",
 
            TK::ShiftLeftEquals => "<<=",
 
            TK::ShiftRightEquals => ">>=",
 
            // Let's keep these match arms explicit for now, in case we want to add more symbols
 
            TK::Ident | TK::Pragma | TK::Integer | TK::String | TK::Character |
 
            TK::LineComment | TK::BlockComment | TK::SpanEnd => unreachable!(),
 
        }
 
    }
 
}
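// A hypothetical sanity check (illustrative names, not part of the changeset)
// for the ordering trick the methods above rely on: the derived
// `PartialOrd`/`Ord` follow declaration order, so variable-length kinds
// compare `<= BlockComment`, single-character kinds `<= Equal`, and
// two-character kinds `<= ShiftRight`.
#[cfg(test)]
mod token_kind_order_sketch {
    use super::*;

    #[test]
    fn classification_follows_declaration_order() {
        assert!(TokenKind::Ident.has_span_end());               // variable-length
        assert!(!TokenKind::Plus.has_span_end());               // fixed-length
        assert_eq!(TokenKind::Plus.num_characters(), 1);        // <= Equal
        assert_eq!(TokenKind::ShiftLeft.num_characters(), 2);   // <= ShiftRight
        assert_eq!(TokenKind::ShiftLeftEquals.num_characters(), 3);
        assert_eq!(TokenKind::ArrowRight.token_chars(), "->");
    }
}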
 

	
 
/// Represents a single token at a particular position.
 
pub(crate) struct Token {
 
    pub kind: TokenKind,
 
    pub pos: InputPosition,
 
}
 

	
 
impl Token {
 
    pub(crate) fn new(kind: TokenKind, pos: InputPosition) -> Self {
 
        Self{ kind, pos }
 
    }
 
}
 

	
 
/// The kind of token ranges that are specially parsed by the tokenizer.
 
#[derive(Debug, PartialEq, Eq)]
 
pub(crate) enum TokenRangeKind {
 
    Module,
 
    Pragma,
 
    Import,
 
    Definition,
 
    Code,
 
}
 

	
 
/// A range of tokens with a specific meaning. Such a range is part of a tree
 
/// where each parent range envelops all of its children.
 
#[derive(Debug)]
 
pub(crate) struct TokenRange {
 
    // Index of the parent in `TokenBuffer.ranges`. A range of kind `Module`
 
    // has no parent; in that case the parent index points to itself.
 
    pub parent_idx: usize,
 
    pub range_kind: TokenRangeKind,
 
    pub curly_depth: u32,
 
    // Offsets into `TokenBuffer.tokens`: the tokens belonging to this range.
 
    pub start: u32,             // first token (inclusive index)
 
    pub end: u32,               // one past the last token (exclusive index)
 
    // Child ranges
 
    pub num_child_ranges: u32,  // Number of subranges
 
    pub first_child_idx: u32,   // First subrange (or points to self if no subranges)
 
    pub last_child_idx: u32,    // Last subrange (or points to self if no subranges)
 
    pub next_sibling_idx: Option<u32>,
 
}
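// Hypothetical layout sketch: for a module consisting of one pragma and one
// definition, `TokenBuffer.ranges` could contain:
//
//   ranges[0]: Module     { parent_idx: 0, first_child_idx: 1, last_child_idx: 2, .. }
//   ranges[1]: Pragma     { parent_idx: 0, .. }
//   ranges[2]: Definition { parent_idx: 0, .. }
//
// where the `start..end` token range of the Module envelops those of its
// children.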
 

	
 
pub(crate) struct TokenBuffer {
 
    pub tokens: Vec<Token>,
 
    pub ranges: Vec<TokenRange>,
 
}
 

	
 
impl TokenBuffer {
 
    pub(crate) fn new() -> Self {
 
        Self{ tokens: Vec::new(), ranges: Vec::new() }
 
    }
 

	
 
    pub(crate) fn iter_range<'a>(&'a self, range: &TokenRange) -> TokenIter<'a> {
 
        TokenIter::new(self, range.start as usize, range.end as usize)
 
    }
 

	
 
    pub(crate) fn start_pos(&self, range: &TokenRange) -> InputPosition {
 
        self.tokens[range.start as usize].pos
 
    }
 

	
 
    pub(crate) fn end_pos(&self, range: &TokenRange) -> InputPosition {
 
        let last_token = &self.tokens[(range.end - 1) as usize];
 
        if last_token.kind == TokenKind::SpanEnd {
 
            return last_token.pos
 
        } else {
 
            debug_assert!(!last_token.kind.has_span_end());
 
            return last_token.pos.with_offset(last_token.kind.num_characters());
 
        }
 
    }
 
}
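// Sketch of the two `end_pos` cases (hypothetical token positions):
//
//   [.., Ident @ 10, SpanEnd @ 13]   // ends on a variable-length token:
//                                    //   end_pos == 13 (the SpanEnd's pos)
//   [.., ColonColon @ 10]            // ends on a fixed-width token:
//                                    //   end_pos == 10 + num_characters() == 12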
 

	
 
/// Iterator over tokens within a specific `TokenRange`.
 
pub(crate) struct TokenIter<'a> {
 
    tokens: &'a Vec<Token>,
 
    cur: usize,
 
    end: usize,
 
}
 

	
 
impl<'a> TokenIter<'a> {
 
    fn new(buffer: &'a TokenBuffer, start: usize, end: usize) -> Self {
 
        Self{ tokens: &buffer.tokens, cur: start, end }
 
    }
 

	
 
    /// Returns the kind of the next token (possibly a comment) without
 
    /// advancing the iterator, or `None` if at the end of the range.
 
    pub(crate) fn next_including_comments(&self) -> Option<TokenKind> {
 
        if self.cur >= self.end {
 
            return None;
 
        }
 

	
 
        let token = &self.tokens[self.cur];
 
        Some(token.kind)
 
    }
 

	
 
    /// Returns the kind of the next non-comment token (consuming any comments
 
    /// along the way), or `None` if at the end of the range.
 
    pub(crate) fn next(&mut self) -> Option<TokenKind> {
 
        while let Some(token_kind) = self.next_including_comments() {
 
            if token_kind != TokenKind::LineComment && token_kind != TokenKind::BlockComment {
 
                return Some(token_kind);
 
            }
 
            self.consume();
 
        }
 

	
 
        return None
 
    }
 

	
 
    /// Returns the start position belonging to the token returned by `next`. If
 
    /// there is no next token, then we return the end position of the
 
    /// previous token.
 
    pub(crate) fn last_valid_pos(&self) -> InputPosition {
 
        if self.cur < self.end {
 
            // Return token position
 
            return self.tokens[self.cur].pos
 
        }
 

	
 
        // Return previous token end
 
        let token = &self.tokens[self.cur - 1];
 
        return if token.kind == TokenKind::SpanEnd {
 
            token.pos
 
        } else {
 
            token.pos.with_offset(token.kind.num_characters())
 
        };
 
    }
 

	
 
    /// Returns the start and end position of the token that would be returned
 
    /// by `next`. This assumes that we're not at the end of the range we're
 
    /// iterating over.
 
    /// TODO: @cleanup Phase out?
 
    pub(crate) fn next_positions(&self) -> (InputPosition, InputPosition) {
 
        debug_assert!(self.cur < self.end);
 
        let token = &self.tokens[self.cur];
 
        if token.kind.has_span_end() {
 
            let span_end = &self.tokens[self.cur + 1];
 
            debug_assert_eq!(span_end.kind, TokenKind::SpanEnd);
 
            (token.pos, span_end.pos)
 
        } else {
 
            let offset = token.kind.num_characters();
 
            (token.pos, token.pos.with_offset(offset))
 
        }
 
    }
 

	
 
    /// See `next_positions`
 
    pub(crate) fn next_span(&self) -> InputSpan {
 
        let (begin, end) = self.next_positions();
 
        return InputSpan::from_positions(begin, end)
 
    }
 

	
 
    /// Advances the iterator past the next token (and its `SpanEnd` token, if
    /// it has one).
 
    pub(crate) fn consume(&mut self) {
 
        if let Some(kind) = self.next_including_comments() {
 
            if kind.has_span_end() {
 
                self.cur += 2;
 
            } else {
 
                self.cur += 1;
 
            }
 
        }
 
    }
 
}
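// Hypothetical usage sketch, assuming a populated `TokenBuffer`:
//
//   let range = &buffer.ranges[0];
//   let mut iter = buffer.iter_range(range);
//   while let Some(kind) = iter.next() {   // peek next non-comment token
//       let span = iter.next_span();       // span of that same token
//       // ... dispatch on `kind` ...
//       iter.consume();                    // advance past the token
//   }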
 
\ No newline at end of file
0 comments (0 inline, 0 general)