Changeset - 7f25ee16c39b
MH - 2021-04-14 19:30:38
contact@maxhenger.nl
WIP on compiler rearchitecting
7 files changed with 964 insertions and 158 deletions:
src/collections/string_pool.rs
 
use std::ptr::null_mut;
 
use std::collections::hash_map::DefaultHasher;
 
use std::hash::{Hash, Hasher};
 

	
 
const SLAB_SIZE: usize = u16::max_value() as usize;
 

	
 
#[derive(Clone)]
 
pub struct StringRef {
 
    data: *const u8,
 
    length: usize,
 
@@ -16,6 +19,22 @@ impl StringRef {
 
    }
 
}
 

	
 
impl PartialEq for StringRef {
 
    fn eq(&self, other: &StringRef) -> bool {
 
        self.as_str() == other.as_str()
 
    }
 
}
 

	
 
impl Eq for StringRef {}
 

	
 
impl Hash for StringRef {
 
    fn hash<H: Hasher>(&self, state: &mut H) {
 
        unsafe{
 
            state.write(std::slice::from_raw_parts(self.data, self.length));
 
        }
 
    }
 
}
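
// Note: hashing the raw byte slice keeps this impl consistent with the
// `PartialEq` impl above (equal strings hash equally), which is what allows
// `StringRef` to be used as a `HashMap` key in the symbol table.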
 

	
 
struct StringPoolSlab {
 
    prev: *mut StringPoolSlab,
 
    data: Vec<u8>,
 
@@ -60,7 +79,7 @@ impl StringPool {
 
            last = unsafe{&mut *self.last};
 
        }
 

	
 
        // Must fit now
 
        // Must fit now, compute hash and put in buffer
 
        debug_assert!(data_len <= last.remaining);
 
        let range_start = last.data.len();
 
        last.data.extend_from_slice(data);
src/protocol/ast.rs
 
@@ -6,7 +6,9 @@ use std::fmt::{Debug, Display, Formatter};
 
use std::ops::{Index, IndexMut};
 

	
 
use super::arena::{Arena, Id};
 
use crate::collections::StringRef;
 
use crate::protocol::inputsource::*;
 
use crate::protocol::input_source2::{InputPosition2, InputSpan};
 

	
 
/// Global limits to the AST, should be checked by lexer and parser. Some are
 
/// arbitrary
 
@@ -238,7 +240,7 @@ impl Index<ChannelStatementId> for Heap {
 
pub struct Root {
 
    pub this: RootId,
 
    // Phase 1: parser
 
    pub position: InputPosition,
 
    // pub position: InputPosition,
 
    pub pragmas: Vec<PragmaId>,
 
    pub imports: Vec<ImportId>,
 
    pub definitions: Vec<DefinitionId>,
 
@@ -264,14 +266,23 @@ impl SyntaxElement for Root {
 
#[derive(Debug, Clone)]
 
pub enum Pragma {
 
    Version(PragmaVersion),
 
    Module(PragmaModule)
 
    Module(PragmaModule),
 
}
 

	
 
impl Pragma {
 
    pub(crate) fn as_module(&self) -> &PragmaModule {
 
        match self {
 
            Pragma::Module(pragma) => pragma,
 
            _ => unreachable!("Tried to obtain {:?} as PragmaModule", self),
 
        }
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct PragmaVersion {
 
    pub this: PragmaId,
 
    // Phase 1: parser
 
    pub position: InputPosition,
 
    pub span: InputSpan, // of full pragma
 
    pub version: u64,
 
}
 

	
 
@@ -279,22 +290,8 @@ pub struct PragmaVersion {
 
pub struct PragmaModule {
 
    pub this: PragmaId,
 
    // Phase 1: parser
 
    pub position: InputPosition,
 
    pub value: Vec<u8>,
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct PragmaOld {
 
    pub this: PragmaId,
 
    // Phase 1: parser
 
    pub position: InputPosition,
 
    pub value: Vec<u8>,
 
}
 

	
 
impl SyntaxElement for PragmaOld {
 
    fn position(&self) -> InputPosition {
 
        self.position
 
    }
 
    pub span: InputSpan, // of full pragma
 
    pub value: Identifier,
 
}
 

	
 
#[derive(Debug, Clone)]
 
@@ -365,8 +362,8 @@ pub struct ImportSymbols {
 

	
 
#[derive(Debug, Clone)]
 
pub struct Identifier {
 
    pub position: InputPosition,
 
    pub value: Vec<u8>
 
    pub span: InputSpan,
 
    pub value: StringRef,
 
}
 

	
 
impl PartialEq for Identifier {
src/protocol/input_source2.rs
 
use std::fmt;
 
use std::cell::{Ref, RefCell};
 
use std::fmt::Write;
 

	
 
#[derive(Debug, Clone, Copy)]
 
pub struct InputPosition2 {
 
@@ -7,6 +8,13 @@ pub struct InputPosition2 {
 
    pub offset: u32,
 
}
 

	
 
impl InputPosition2 {
 
    pub(crate) fn with_offset(&self, offset: u32) -> Self {
 
        InputPosition2{ line: self.line, offset: self.offset + offset }
 
    }
 
}
 

	
 
#[derive(Debug, Clone, Copy)]
 
pub struct InputSpan {
 
    pub begin: InputPosition2,
 
    pub end: InputPosition2,
 
@@ -14,7 +22,7 @@ pub struct InputSpan {
 

	
 
impl InputSpan {
 
    #[inline]
 
    fn from_positions(begin: InputPosition2, end: InputPosition2) -> Self {
 
    pub fn from_positions(begin: InputPosition2, end: InputPosition2) -> Self {
 
        Self { begin, end }
 
    }
 
}
 
@@ -75,8 +83,8 @@ impl InputSource2 {
 
        }
 
    }
 

	
 
    pub fn section(&self, start: u32, end: u32) -> &[u8] {
 
        &self.input[start as usize..end as usize]
 
    pub fn section(&self, start: InputPosition2, end: InputPosition2) -> &[u8] {
 
        &self.input[start.offset as usize..end.offset as usize]
 
    }
 

	
 
    // Consumes the next character. Will check well-formedness of newlines: \r
 
@@ -145,11 +153,14 @@ impl InputSource2 {
 
        return lookup;
 
    }
 

	
 
    /// Retrieves offset at which line starts (right after newline)
 
    fn lookup_line_start_offset(&self, line_number: u32) -> u32 {
 
        let lookup = self.get_lookup();
 
        lookup[line_number as usize]
 
    }
 

	
 
    /// Retrieves offset at which line ends (at the newline character or the
 
    /// preceding carriage feed for \r\n-encoded newlines)
 
    fn lookup_line_end_offset(&self, line_number: u32) -> u32 {
 
        let lookup = self.get_lookup();
 
        let offset = lookup[(line_number + 1) as usize] - 1;
 
@@ -169,79 +180,186 @@ impl InputSource2 {
 
}
 

	
 
#[derive(Debug)]
 
pub enum ParseErrorType {
 
pub enum StatementKind {
 
    Info,
 
    Error
 
}
 

	
 
#[derive(Debug)]
 
pub enum ContextKind {
 
    SingleLine,
 
    MultiLine,
 
}
 

	
 
#[derive(Debug)]
 
pub struct ParseErrorStatement {
 
    pub(crate) error_type: ParseErrorType,
 
    pub(crate) line: u32,
 
    pub(crate) column: u32,
 
    pub(crate) offset: u32,
 
    pub(crate) statement_kind: StatementKind,
 
    pub(crate) context_kind: ContextKind,
 
    pub(crate) start_line: u32,
 
    pub(crate) start_column: u32,
 
    pub(crate) end_line: u32,
 
    pub(crate) end_column: u32,
 
    pub(crate) filename: String,
 
    pub(crate) context: String,
 
    pub(crate) message: String,
 
}
 

	
 
impl ParseErrorStatement {
 
    fn from_source(error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
    fn from_source_at_pos(statement_kind: StatementKind, source: &InputSource2, position: InputPosition2, message: String) -> Self {
 
        // Seek line start and end
 
        let line_start = source.lookup_line_start_offset(position.line);
 
        let line_end = source.lookup_line_end_offset(position.line);
 
        let context = Self::create_context(source, line_start as usize, line_end as usize);
 
        debug_assert!(position.offset >= line_start);
 
        let column = position.offset - line_start + 1;
 

	
 
        Self{
 
            error_type,
 
            line: position.line,
 
            column,
 
            offset: position.offset,
 
            statement_kind,
 
            context_kind: ContextKind::SingleLine,
 
            start_line: position.line,
 
            start_column: column,
 
            end_line: position.line,
 
            end_column: column + 1,
 
            filename: source.filename.clone(),
 
            context,
 
            message,
 
        }
 
    }
 

	
 
    fn from_source_at_span(statement_kind: StatementKind, source: &InputSource2, span: InputSpan, message: String) -> Self {
 
        debug_assert!(span.end.line >= span.begin.line);
 
        debug_assert!(span.end.offset >= span.begin.offset);
 

	
 
        let first_line_start = source.lookup_line_start_offset(span.begin.line);
 
        let last_line_start = source.lookup_line_start_offset(span.end.line);
 
        let last_line_end = source.lookup_line_end_offset(span.end.line);
 
        let context = Self::create_context(source, first_line_start as usize, last_line_end as usize);
 
        debug_assert!(span.begin.offset >= first_line_start);
 
        let start_column = span.begin.offset - first_line_start + 1;
 
        let end_column = span.end.offset - last_line_start + 1;
 

	
 
        let context_kind = if span.begin.line == span.end.line {
 
            ContextKind::SingleLine
 
        } else {
 
            ContextKind::MultiLine
 
        };
 

	
 
        Self{
 
            statement_kind,
 
            context_kind,
 
            start_line: span.begin.line,
 
            start_column,
 
            end_line: span.end.line,
 
            end_column,
 
            filename: source.filename.clone(),
 
            context: String::from_utf8_lossy(&source.input[line_start as usize..line_end as usize]).to_string(),
 
            message: msg.to_string()
 
            context,
 
            message,
 
        }
 
    }
 

	
 
    /// Produces context from source
 
    fn create_context(source: &InputSource2, start: usize, end: usize) -> String {
 
        let context_raw = &source.input[start..end];
 
        String::from_utf8_lossy(context_raw).to_string()
 
    }
 
}
 

	
 
impl fmt::Display for ParseErrorStatement {
 
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 
        // Write message
 
        match self.error_type {
 
            ParseErrorType::Info => write!(f, " INFO: ")?,
 
            ParseErrorType::Error => write!(f, "ERROR: ")?,
 
        // Write kind of statement and message
 
        match self.statement_kind {
 
            StatementKind::Info => f.write_str(" INFO: ")?,
 
            StatementKind::Error => f.write_str("ERROR: ")?,
 
        }
 
        writeln!(f, "{}", &self.message)?;
 
        f.write_str(&self.message)?;
 
        f.write_char('\n')?;
 

	
 
        // Write originating file/line/column
 
        if self.filename.is_empty() {
 
            writeln!(f, " +- at {}:{}", self.line, self.column)?;
 
        } else {
 
            writeln!(f, " +- at {}:{}:{}", self.filename, self.line, self.column)?;
 
        f.write_str(" +- ")?;
 
        if !self.filename.is_empty() {
 
            write!(f, "in {} ", self.filename)?;
 
        }
 

	
 
        match self.context_kind {
 
            ContextKind::SingleLine => writeln!(f, " at {}:{}", self.start_line, self.start_column),
 
            ContextKind::MultiLine => writeln!(
 
                f, " from {}:{} to {}:{}",
 
                self.start_line, self.start_column, self.end_line, self.end_column
 
            )
 
        }?;
 

	
 
        // Helper function for writing context: converting tabs into 4 spaces
 
        // (oh, the controversy!) and creating an annotated line
 
        fn transform_context(source: &str, target: &mut String) {
 
            for char in source.chars() {
 
                if char == '\t' {
 
                    target.push_str("    ");
 
                } else {
 
                    target.push(char);
 
                }
 
            }
 
        }
 

	
 
        fn extend_annotation(first_col: u32, last_col: u32, source: &str, target: &mut String, extend_char: char) {
 
            debug_assert!(first_col > 0 && last_col >= first_col);
 
            for (char_idx, char) in source.chars().enumerate().skip(first_col as usize - 1) {
 
                if char_idx == last_col as usize {
 
                    break;
 
                }
 

	
 
                if char == '\t' {
 
                    for _ in 0..4 { target.push(extend_char); }
 
                } else {
 
                    target.push(extend_char);
 
                }
 
            }
 
        }
 

	
 
        // Write source context
 
        writeln!(f, " | ")?;
 
        writeln!(f, " | {}", self.context)?;
 

	
 
        // Write underline indicating where the error occurred
 
        debug_assert!(self.column as usize <= self.context.chars().count());
 
        let mut arrow = String::with_capacity(self.context.len() + 3);
 
        arrow.push_str(" | ");
 
        let mut char_col = 1;
 
        for char in self.context.chars() {
 
            if char_col == self.column { break; }
 
            if char == '\t' {
 
                arrow.push('\t');
 
            } else {
 
                arrow.push(' ');
 
            }
 

	
 
            char_col += 1;
 
        let mut context = String::with_capacity(128);
 
        let mut annotation = String::with_capacity(128);
 

	
 
        match self.context_kind {
 
            ContextKind::SingleLine => {
 
                // Write single line of context with indicator for the offending
 
                // span underneath.
 
                transform_context(&self.context, &mut context);
 
                context.push('\n');
 
                f.write_str(&context)?;
 

	
 
                annotation.push_str(" | ");
 
                extend_annotation(1, self.start_column, &self.context, &mut annotation, ' ');

                extend_annotation(self.start_column, self.end_column, &self.context, &mut annotation, '~');
 
                annotation.push('\n');
 

	
 
                f.write_str(&annotation)?;
 
            },
 
            ContextKind::MultiLine => {
 
                // Annotate all offending lines
 
                // - first line
 
                let mut lines = self.context.lines();
 
                let first_line = lines.next().unwrap();
 
                transform_context(first_line, &mut context);
 
                writeln!(f, " |- {}", &context)?;
 

	
 
                // - remaining lines
 
                let mut last_line = first_line;
 
                while let Some(cur_line) = lines.next() {
 
                    context.clear();
 
                    transform_context(cur_line, &mut context);
 
                    writeln!(f, " |  {}", &context)?;
 
                    last_line = cur_line;
 
                }
 

	
 
                // - underline beneath last line
 
                annotation.push_str(" \\__");
 
                extend_annotation(1, self.end_column, &last_line, &mut annotation, '_');
 
                annotation.push_str("/\n");
 
                f.write_str(&annotation)?;
 
            }
 
        }
 
        arrow.push('^');
 
        writeln!(f, "{}", arrow)?;
 

	
 
        Ok(())
 
    }
 
@@ -273,21 +391,53 @@ impl ParseError {
 
        Self{ statements: Vec::new() }
 
    }
 

	
 
    pub fn new_error(source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        Self{ statements: vec!(ParseErrorStatement::from_source(ParseErrorType::Error, source, position, msg))}
 
    pub fn new_error_at_pos(source: &InputSource2, position: InputPosition2, message: String) -> Self {
 
        Self{ statements: vec!(ParseErrorStatement::from_source_at_pos(
 
            StatementKind::Error, source, position, message
 
        )) }
 
    }
 

	
 
    pub fn new_error_str_at_pos(source: &InputSource2, position: InputPosition2, message: &str) -> Self {
 
        Self{ statements: vec!(ParseErrorStatement::from_source_at_pos(
 
            StatementKind::Error, source, position, message.to_string()
 
        )) }
 
    }
 

	
 
    pub fn new_error_at_span(source: &InputSource2, span: InputSpan, message: String) -> Self {
 
        Self{ statements: vec!(ParseErrorStatement::from_source_at_span(
 
            StatementKind::Error, source, span, message
 
        )) }
 
    }
 

	
 
    pub fn new_error_str_at_span(source: &InputSource2, span: InputSpan, message: &str) -> Self {
 
        Self{ statements: vec!(ParseErrorStatement::from_source_at_span(
 
            StatementKind::Error, source, span, message.to_string()
 
        )) }
 
    }
 

	
 
    pub fn with_prefixed(mut self, error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        self.statements.insert(0, ParseErrorStatement::from_source(error_type, source, position, msg));
 
    pub fn with_at_pos(mut self, error_type: StatementKind, source: &InputSource2, position: InputPosition2, message: String) -> Self {
 
        self.statements.push(ParseErrorStatement::from_source_at_pos(error_type, source, position, message));
 
        self
 
    }
 

	
 
    pub fn with_postfixed(mut self, error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        self.statements.push(ParseErrorStatement::from_source(error_type, source, position, msg));
 
    pub fn with_at_span(mut self, error_type: StatementKind, source: &InputSource2, span: InputSpan, message: String) -> Self {
 
        self.statements.push(ParseErrorStatement::from_source_at_span(error_type, source, span, message));
 
        self
 
    }
 

	
 
    pub fn with_postfixed_info(self, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        self.with_postfixed(ParseErrorType::Info, source, position, msg)
 
    pub fn with_info_at_pos(self, source: &InputSource2, position: InputPosition2, msg: String) -> Self {
 
        self.with_at_pos(StatementKind::Info, source, position, msg)
 
    }
 

	
 
    pub fn with_info_str_at_pos(self, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        self.with_at_pos(StatementKind::Info, source, position, msg.to_string())
 
    }
 

	
 
    pub fn with_info_at_span(self, source: &InputSource2, span: InputSpan, msg: String) -> Self {
 
        self.with_at_span(StatementKind::Info, source, span, msg)
 
    }
 

	
 
    pub fn with_info_str_at_span(self, source: &InputSource2, span: InputSpan, msg: &str) -> Self {
 
        self.with_at_span(StatementKind::Info, source, span, msg.to_string())
 
    }
 
}
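
// Usage sketch (hypothetical call site, not part of this changeset): the
// builder-style `with_*` methods append secondary statements to an error,
// which `Display` then renders one after the other:
//
//     return Err(ParseError::new_error_str_at_span(source, span, "conflict in module name")
//         .with_info_str_at_span(other_source, other_span, "other module is defined here"));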
src/protocol/lexer2.rs
 
use crate::protocol::ast::*;
 
use crate::protocol::Heap;
 
use crate::protocol::tokenizer::{TokenBuffer, Token};
 
use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError};
 
use crate::collections::{StringPool, StringRef};
 
use crate::protocol::tokenizer::*;
 
use crate::protocol::input_source2::{InputSource2 as InputSource, InputPosition2 as InputPosition, InputSpan, ParseError};
 
use crate::protocol::symbol_table2::*;
 

	
 
#[derive(PartialEq, Eq)]
 
enum ModuleCompilationPhase {
 
    Source,                 // only source is set
 
    Tokenized,              // source is tokenized
 
    DefinitionsScanned,     // all definitions are linked to their type class
 
    ImportsResolved,        // all imports are added to the symbol table
 
    Parsed,                 // produced the AST for the module
 
    ValidatedAndLinked,     // AST is traversed and has linked the required AST nodes
 
    Typed,                  // Type inference and checking has been performed
 
}
 

	
 
enum KeywordDefinition {
 
    Struct,
 
    Enum,
 
    Union,
 
    Function,
 
    Primitive,
 
    Composite,
 
}
 

	
 
impl KeywordDefinition {
 
    fn as_symbol_class(&self) -> SymbolClass {
 
        use KeywordDefinition as KD;
 
        use SymbolClass as SC;
 

	
 
        match self {
 
            KD::Struct => SC::Struct,
 
            KD::Enum => SC::Enum,
 
            KD::Union => SC::Union,
 
            KD::Function => SC::Function,
 
            KD::Primitive | KD::Composite => SC::Component,
 
        }
 
    }
 
}
 

	
 
struct Module {
 
    // Buffers
 
    source: InputSource,
 
    tokens: TokenBuffer,
 
    // Identifiers
 
    root_id: RootId,
 
    name: Option<(PragmaId, StringRef)>,
 
    version: Option<(PragmaId, u64)>,
 
    phase: ModuleCompilationPhase,
 
}
 

	
 
struct Ctx<'a> {
 
    heap: &'a mut Heap,
 
    source: &'a InputSource,
 
    tokens: &'a TokenBuffer,
 
    symbols: &'a mut SymbolTable,
 
    pool: &'a mut StringPool,
 
}
 

	
 
/// Scans the module and finds all module-level type definitions. These will be
 
/// added to the symbol table such that during AST-construction we know which
 
/// identifiers point to types. Will also parse all pragmas to determine module
 
/// names.
 
pub(crate) struct ASTSymbolPrePass {
 
    symbols: Vec<Symbol>,
 
    pragmas: Vec<PragmaId>,
 
    buffer: String,
 
    has_pragma_version: bool,
 
    has_pragma_module: bool,
 
}
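
// Intended driving sequence (sketch; the surrounding compiler driver is an
// assumption, not part of this changeset): one reusable pre-pass instance is
// run over every tokenized module before the full AST pass.
//
//     let mut pre_pass = ASTSymbolPrePass::new();
//     for idx in 0..modules.len() {
//         pre_pass.parse(&mut modules, idx, &mut ctx)?;
//     }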
 

	
 
impl ASTSymbolPrePass {
 
    pub(crate) fn new() -> Self {
 
        Self{
 
            symbols: Vec::with_capacity(128),
 
            pragmas: Vec::with_capacity(8),
 
            buffer: String::with_capacity(128),
 
            has_pragma_version: false,
 
            has_pragma_module: false,
 
        }
 
    }
 

	
 
    fn reset(&mut self) {
 
        self.symbols.clear();
 
        self.pragmas.clear();
 
        self.has_pragma_version = false;
 
        self.has_pragma_module = false;
 
    }
 

	
 
    pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> {
 
        self.reset();
 

	
 
        let module = &mut modules[module_idx];
 
        let module_range = &module.tokens.ranges[0];
 
        let expected_parent_idx = 0;
 
        let expected_subranges = module_range.subranges;
 
        debug_assert_eq!(module.phase, ModuleCompilationPhase::Tokenized);
 
        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 
        debug_assert_eq!(module.root_id.index, 0);
 

	
 
        // Preallocate root in the heap
 
        let root_id = ctx.heap.alloc_protocol_description(|this| {
 
            Root{
 
                this,
 
                pragmas: Vec::new(),
 
                imports: Vec::new(),
 
                definitions: Vec::new(),
 
            }
 
        });
 
        module.root_id = root_id;
 

	
 
        // Visit token ranges to detect definitions
 
        let mut visited_subranges = 0;
 
        for range_idx in expected_parent_idx + 1..module.tokens.ranges.len() {
 
            // Skip any ranges that do not belong to the module
 
            let cur_range = &module.tokens.ranges[range_idx];
 
            if cur_range.parent_idx != expected_parent_idx {
 
                continue;
 
            }
 

	
 
            // Parse if it is a definition or a pragma
 
            if cur_range.range_kind == TokenRangeKind::Definition {
 
                self.visit_definition_range(modules, module_idx, ctx, range_idx)?;
 
            } else if cur_range.range_kind == TokenRangeKind::Pragma {
 
                self.visit_pragma_range(modules, module_idx, ctx, range_idx)?;
 
            }
 

	
 
            visited_subranges += 1;
 
            if visited_subranges == expected_subranges {
 
                break;
 
            }
 
        }
 

	
 
        // By now all symbols should have been found: add to symbol table and
 
        // add the parsed pragmas to the preallocated root in the heap.
 
        debug_assert_eq!(visited_subranges, expected_subranges);
 
        ctx.symbols.insert_scoped_symbols(None, SymbolScope::Module(module.root_id), &self.symbols)?;
 

	
 
        let root = &mut ctx.heap[root_id];
 
        debug_assert!(root.pragmas.is_empty());
 
        root.pragmas.extend(&self.pragmas);
 

	
 
        module.phase = ModuleCompilationPhase::DefinitionsScanned;
 

	
 
        Ok(())
 
    }
 

	
 
    fn visit_pragma_range(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> {
 
        let module = &mut modules[module_idx];
 
        let range = &module.tokens.ranges[range_idx];
 
        let mut iter = module.tokens.iter_range(range);
 

	
 
        // Consume pragma name
 
        let (pragma_section, pragma_start, _) = consume_pragma(&module.source, &mut iter)?;
 

	
 
        // Consume pragma values
 
        if pragma_section == b"#module" {
 
            // Check if name is defined twice within the same file
 
            if self.has_pragma_module {
 
                return Err(ParseError::new_error(&module.source, pragma_start, "module name is defined twice"));
 
            }
 

	
 
            // Consume the domain-name
 
            let (module_name, module_span) = consume_domain_ident(&module.source, &mut iter)?;
 
            if iter.next().is_some() {
 
                return Err(ParseError::new_error_str_at_pos(&module.source, iter.last_valid_pos(), "expected end of #module pragma after module name"));
 
            }
 

	
 
            // Add to heap and symbol table
 
            let pragma_span = InputSpan::from_positions(pragma_start, module_span.end);
 
            let module_name = ctx.pool.intern(module_name);
 
            let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Module(PragmaModule{
 
                this,
 
                span: pragma_span,
 
                value: Identifier{ span: module_span, value: module_name.clone() },
 
            }));
 
            self.pragmas.push(pragma_id);
 

	
 
            if let Err(other_module_root_id) = ctx.symbols.insert_module(module_name, module.root_id) {
 
                // Naming conflict
 
                let this_module = &modules[module_idx];
 
                let other_module = seek_module(modules, other_module_root_id).unwrap();
 
                let (other_module_pragma_id, _) = other_module.name.unwrap();
 
                let other_pragma = ctx.heap[other_module_pragma_id].as_module();
 
                return Err(ParseError::new_error_str_at_span(
 
                    &this_module.source, pragma_span, "conflict in module name"
 
                ).with_info_str_at_span(
 
                    &other_module.source, other_pragma.span, "other module is defined here"
 
                ));
 
            }
 
            self.has_pragma_module = true;
 
        } else if pragma_section == b"#version" {
 
            // Check if version is defined twice within the same file
 
            if self.has_pragma_version {
 
                return Err(ParseError::new_error(&module.source, pragma_start, "module version is defined twice"));
 
            }
 

	
 
            // Consume the version pragma
 
            let (version, version_span) = consume_integer_literal(&module.source, &mut iter, &mut self.buffer)?;
 
            let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Version(PragmaVersion{
 
                this,
 
                span: InputSpan::from_positions(pragma_start, version_span.end),
 
                version,
 
            }));
 
            self.pragmas.push(pragma_id);
 
            self.has_pragma_version = true;
 
        } else {
 
            // Custom pragma, maybe we support this in the future, but for now
 
            // we don't.
 
            return Err(ParseError::new_error(&module.source, pragma_start, "illegal pragma name"));
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    fn visit_definition_range(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let range = &module.tokens.ranges[range_idx];
 
        let definition_span = InputSpan::from_positions(
 
            module.tokens.start_pos(range),
 
            module.tokens.end_pos(range)
 
        );
 
        let mut iter = module.tokens.iter_range(range);
 

	
 
        // Because we're visiting a definition, we expect an ident that resolves
 
        // to a keyword indicating a definition.
 
        let kw_text = consume_ident_text(&module.source, &mut iter).unwrap();
 
        let kw = parse_definition_keyword(kw_text).unwrap();
 

	
 
        // Retrieve identifier and put in temp symbol table
 
        let definition_ident = consume_ident_text(&module.source, &mut iter)?;
 
        let definition_ident = ctx.pool.intern(definition_ident);
 
        let symbol_class = kw.as_symbol_class();
 

	
 
        // Get the token indicating the end of the definition to get the full
 
        // span of the definition
 
        let last_token = &module.tokens.tokens[(range.end - 1) as usize];
 
        debug_assert_eq!(last_token.kind, TokenKind::CloseCurly);
 

	
 
        self.symbols.push(Symbol::new(
 
            module.root_id,
 
            SymbolScope::Module(module.root_id),
 
            definition_span,
 
            symbol_class,
 
            definition_ident
 
        ));
 

	
 
        Ok(())
 
    }
 
}
 

	
 
pub(crate) struct ASTImportPrePass {
 
}
 

	
 
impl ASTImportPrePass {
 
    pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let module_range = &module.tokens.ranges[0];
 
        debug_assert_eq!(module.phase, ModuleCompilationPhase::DefinitionsScanned);
 
        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 

	
 
        let expected_parent_idx = 0;
 
        let expected_subranges = module_range.subranges;
 
        let mut visited_subranges = 0;
 

	
 
        for range_idx in expected_parent_idx + 1..module.tokens.ranges.len() {
 
            let cur_range = &module.tokens.ranges[range_idx];
 
            if cur_range.parent_idx != expected_parent_idx {
 
                continue;
 
            }
 

	
 
            visited_subranges += 1;
 
            if cur_range.range_kind == TokenRangeKind::Import {
 
                self.visit_import_range(modules, module_idx, ctx, range_idx)?;
 
            }
 

	
 
            if visited_subranges == expected_subranges {
 
                break;
 
            }
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    pub(crate) fn visit_import_range(
 
        &mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize
 
    ) -> Result<(), ParseError> {
 
        let module = &modules[module_idx];
 
        let import_range = &module.tokens.ranges[range_idx];
 
        debug_assert_eq!(import_range.range_kind, TokenRangeKind::Import);
 

	
 
        let mut iter = module.tokens.iter_range(import_range);
 

	
 
        // Consume "import"
 
        let _import_ident = consume_ident_text(&module.source, &mut iter)?;
 
        debug_assert_eq!(_import_ident, KW_IMPORT);
 

	
 
        // Consume module name
 
        let (module_name, _) = consume_domain_ident(&module.source, &mut iter)?;
 

	
 

	
 
        Ok(())
 
    }
 
}
 

	
 
// Lexes definitions. Should be the first pass over each of the module files 
 
// after tokenization. Only once all definitions are parsed can we do the full
 
// AST creation pass.
 
struct LexerDefinitions {
 
fn consume_domain_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    let (_, name_start, mut name_end) = consume_ident(source, iter)?;
 
    while let Some(TokenKind::Dot) = iter.next() {
 
        consume_dot(source, iter)?;
 
        let (_, _, new_end) = consume_ident(source, iter)?;
 
        name_end = new_end;
 
    }
 

	
 
    Ok((source.section(name_start, name_end), InputSpan::from_positions(name_start, name_end)))
 
}
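
// Example (sketch): for the input `std.random` this consumes the token
// sequence `Ident Dot Ident` and returns the raw bytes `std.random` together
// with a span from the first identifier's start to the last one's end.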
 

	
 
fn consume_dot<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(), ParseError> {
 
    if Some(TokenKind::Dot) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a dot"));
 
    }
 
    iter.consume();
 
    Ok(())
 
}
 

	
 
impl LexerDefinitions {
 
    pub(crate) fn parse(ctx: &mut Ctx) -> Result<(), ParseError> {
 
        debug_assert!(ctx.tokens.ranges.len() > 0);
 
fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> {
 
    if Some(TokenKind::Integer) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal"));
 
    }
 
    let (start_pos, end_pos) = iter.next_range();
 
    iter.consume();
 

	
 
    pub(crate) fn parse_definition(heap: &mut Heap, source: &InputSource, range: &TokenRang)
 
    let integer_text = source.section(start_pos, end_pos);
 

	
 
    // Determine radix and offset from prefix
 
    let (radix, input_offset, radix_name) =
 
        if integer_text.starts_with(b"0b") || integer_text.starts_with(b"0B") {
 
            // Binary number
 
            (2, 2, "binary")
 
        } else if integer_text.starts_with(b"0o") || integer_text.starts_with(b"0O") {
 
            // Octal number
 
            (8, 2, "octal")
 
        } else if integer_text.starts_with(b"0x") || integer_text.starts_with(b"0X") {
 
            // Hexadecimal number
 
            (16, 2, "hexadecimal")
 
        } else {
 
            (10, 0, "decimal")
 
        };
 

	
 
    // Take out any of the separating '_' characters
 
    buffer.clear();
 
    for char_idx in input_offset..integer_text.len() {
 
        let char = integer_text[char_idx];
 
        if char == b'_' {
 
            continue;
 
        }
 
        // Letter "digits" must pass this check too: binary/octal/hex literals
        // use them, and `from_str_radix` below rejects anything that is
        // invalid for the chosen radix.
        if !char.is_ascii_alphanumeric() {

            return Err(ParseError::new_error_str_at_pos(source, start_pos, "incorrectly formatted integer"));
 
        }
 
        buffer.push(char::from(char));
 
    }
 

	
 
    // Use the cleaned up string to convert to integer
 
    match u64::from_str_radix(&buffer, radix) {
 
        Ok(number) => Ok((number, InputSpan::from_positions(start_pos, end_pos))),
 
        Err(_) => Err(
 
            ParseError::new_error_str_at_pos(source, start_pos, "incorrectly formatted integer")
 
        ),
 
    }
 
}
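
// Example (sketch): for the token text `0x1_F` the prefix selects radix 16
// with a two-byte offset, the `_` separator is dropped while filling the
// buffer, and `u64::from_str_radix("1F", 16)` yields 31.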
 

	
 
fn seek_module(modules: &[Module], root_id: RootId) -> Option<&Module> {
 
    for module in modules {
 
        if module.root_id == root_id {
 
            return Some(module)
 
        }
 
    }
 

	
 
    return None
 
}
 

	
 
fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> {
 
    if Some(TokenKind::Pragma) != iter.next() {
 
        return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected a pragma"));
 
    }
 
    let (pragma_start, pragma_end) = iter.next_range();
 
    iter.consume();
 
    Ok((source.section(pragma_start, pragma_end), pragma_start, pragma_end))
 
}
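
// Example (sketch): for `#module std.random` the returned section is the
// bytes `#module` (leading `#` included), which is why the pre-pass compares
// it against `b"#module"` / `b"#version"` directly.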
 

	
 
fn consume_ident_text<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<&'a [u8], ParseError> {
 
    if Some(TokenKind::Ident) != iter.next() {
 
        return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected an identifier"));
 
    }
 
    let (ident_start, ident_end) = iter.next_range();
 
    iter.consume();
 
    Ok(source.section(ident_start, ident_end))
 
}
 

	
 
fn consume_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> {
 
    if Some(TokenKind::Ident) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an identifier"));
 
    }
 
    let (ident_start, ident_end) = iter.next_range();
 
    iter.consume();
 
    Ok((source.section(ident_start, ident_end), ident_start, ident_end))
 
}
 

	
 
fn parse_definition_keyword(keyword: &[u8]) -> Option<KeywordDefinition> {
 
    match keyword {
 
        KW_STRUCT =>    Some(KeywordDefinition::Struct),

        KW_ENUM =>      Some(KeywordDefinition::Enum),

        KW_UNION =>     Some(KeywordDefinition::Union),

        KW_FUNCTION =>  Some(KeywordDefinition::Function),

        KW_PRIMITIVE => Some(KeywordDefinition::Primitive),

        KW_COMPOSITE => Some(KeywordDefinition::Composite),
 
        _ => None
 
    }
 
}
 
\ No newline at end of file
src/protocol/parser/mod.rs
 
mod depth_visitor;
 
pub(crate) mod symbol_table;
 
pub(crate) mod symbol_table2;
 
pub(crate) mod type_table;
 
mod type_resolver;
 
mod visitor;
src/protocol/parser/symbol_table2.rs
 
new file 100644
 
use std::collections::HashMap;
 
use std::collections::hash_map::Entry;
 

	
 
use crate::protocol::input_source2::*;
 
use crate::protocol::ast::*;
 
use crate::collections::*;
 

	
 
#[derive(Clone, Copy, PartialEq, Eq)]
 
pub enum SymbolScope {
 
    Module(RootId),
 
    Definition(DefinitionId),
 
}
 

	
 
#[derive(Clone, Copy, PartialEq, Eq)]
 
pub enum SymbolClass {
 
    Module,
 
    Struct,
 
    Enum,
 
    Union,
 
    Function,
 
    Component
 
}
 

	
 
struct ScopedSymbols {
 
    scope: SymbolScope,
 
    parent_scope: Option<SymbolScope>,
 
    child_scopes: Vec<SymbolScope>,
 
    start: usize,
 
    end: usize,
 
}
 

	
 
#[derive(Clone)]
pub struct Symbol {
 
    // Definition location
 
    pub defined_in_module: RootId,
 
    pub defined_in_scope: SymbolScope,
 
    pub definition_span: InputSpan, // full span of definition
 
    // Introduction location (if imported instead of defined)
 

	
 
    // Symbol properties
 
    pub class: SymbolClass,
 
    pub name: StringRef,
 
    pub definition: Option<DefinitionId>,
 
}
 

	
 
impl Symbol {
 
    pub(crate) fn new(root_id: RootId, scope: SymbolScope, span: InputSpan, class: SymbolClass, name: StringRef) -> Self {
 
        Self{
 
            defined_in_module: root_id,
 
            defined_in_scope: scope,
 
            definition_span: span,
 
            class,
 
            name,
 
            definition: None,
 
        }
 
    }
 
}
 

	
 
pub struct SymbolTable {
 
    module_lookup: HashMap<StringRef, RootId>,
 
    scope_lookup: HashMap<SymbolScope, ScopedSymbols>,
 
    symbols: Vec<Symbol>,
 
}
 

	
 
impl SymbolTable {
 
    /// Inserts a new module by its name. Upon module naming conflict the
 
    /// previously associated `RootId` will be returned.
 
    pub(crate) fn insert_module(&mut self, module_name: StringRef, root_id: RootId) -> Result<(), RootId> {
 
        match self.module_lookup.entry(module_name) {
 
            Entry::Occupied(v) => {
 
                Err(*v.get())
 
            },
 
            Entry::Vacant(v) => {
 
                v.insert(root_id);
 
                Ok(())
 
            }
 
        }
 
    }
 

	
 
    /// Inserts a new scope with defined symbols. The `parent_scope` must
 
    /// already be added to the symbol table. The symbols are expected to come
 
    /// from a temporary buffer and are copied inside the symbol table. Will
 
    /// return an error if there is a naming conflict.
 
    pub(crate) fn insert_scoped_symbols(
 
        &mut self, parent_scope: Option<SymbolScope>, within_scope: SymbolScope, symbols: &[Symbol]
 
    ) -> Result<(), ParseError> {
 
        // Add scoped symbols
 
        let old_num_symbols = self.symbols.len();
 

	
 
        let new_scope = ScopedSymbols {
 
            scope: within_scope,
 
            parent_scope,
 
            child_scopes: Vec::new(),
 
            start: old_num_symbols,
 
            end: old_num_symbols + symbols.len(),
 
        };
 

	
 
        self.symbols.extend_from_slice(symbols);
 
        self.scope_lookup.insert(within_scope, new_scope);
 

	
 
        if let Some(parent_scope) = parent_scope.as_ref() {
 
            let parent = self.scope_lookup.get_mut(parent_scope).unwrap();
 
            parent.child_scopes.push(within_scope);
 
        }
 

	
 
        Ok(())
 
    }
 
}
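
// Usage sketch (hypothetical names): on a module naming conflict the `Err`
// value carries the `RootId` that already claimed the name, which the
// pre-pass uses to point its error at the other definition site.
//
//     let name = ctx.pool.intern(b"std.random");
//     if let Err(other_root_id) = ctx.symbols.insert_module(name, module.root_id) {
//         // report "conflict in module name" against `other_root_id`
//     }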
 
\ No newline at end of file
src/protocol/tokenizer/mod.rs
 

	
 
use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError, InputPosition2 as InputPosition, InputSpan};
 

	
 
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 
use crate::protocol::input_source2::{
 
    InputSource2 as InputSource,
 
    ParseError,
 
    InputPosition2 as InputPosition,
 
    InputSpan
 
};
 

	
 
pub(crate) const KW_STRUCT:    &'static [u8] = b"struct";
 
pub(crate) const KW_ENUM:      &'static [u8] = b"enum";
 
pub(crate) const KW_UNION:     &'static [u8] = b"union";
 
pub(crate) const KW_FUNCTION:  &'static [u8] = b"func";
 
pub(crate) const KW_PRIMITIVE: &'static [u8] = b"primitive";
 
pub(crate) const KW_COMPOSITE: &'static [u8] = b"composite";
 
pub(crate) const KW_IMPORT:    &'static [u8] = b"import";
 

	
 
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 
pub(crate) enum TokenKind {
 
    // Variable-character tokens, followed by a SpanEnd token
 
    Ident,          // regular identifier
 
@@ -11,7 +23,7 @@ pub(crate) enum TokenKind {
 
    Character,      // character literal, range includes `'`
 
    LineComment,    // line comment, range includes leading `//`, but not newline
 
    BlockComment,   // block comment, range includes leading `/*` and trailing `*/`
 
    // Punctuation
 
    // Punctuation (single character)
 
    Exclamation,    // !
 
    Question,       // ?
 
    Pound,          // #
 
@@ -24,48 +36,68 @@ pub(crate) enum TokenKind {
 
    CloseParen,     // )
 
    CloseSquare,    // ]
 
    Colon,          // :
 
    ColonColon,     // ::
 
    Comma,          // ,
 
    Dot,            // .
 
    DotDot,         // ..
 
    SemiColon,      // ;
 
    Quote,          // '
 
    DoubleQuote,    // "
 
    // Operator-like
 
    // Operator-like (single character)
 
    At,             // @
 
    Plus,           // +
 
    PlusPlus,       // ++
 
    PlusEquals,     // +=
 
    Minus,          // -
 
    Star,           // *
 
    Slash,          // /
 
    Percent,        // %
 
    Caret,          // ^
 
    And,            // &
 
    Or,             // |
 
    Tilde,          // ~
 
    Equal,          // =
 
    // Punctuation (two characters)
 
    ColonColon,     // ::
 
    DotDot,         // ..
 
    ArrowRight,     // ->
 
    // Operator-like (two characters)
 
    PlusPlus,       // ++
 
    PlusEquals,     // +=
 
    MinusMinus,     // --
 
    MinusEquals,    // -=
 
    Star,           // *
 
    StarEquals,     // *=
 
    Slash,          // /
 
    SlashEquals,    // /=
 
    Percent,        // %
 
    PercentEquals,  // %=
 
    Caret,          // ^
 
    CaretEquals,    // ^=
 
    And,            // &
 
    AndAnd,         // &&
 
    AndEquals,      // &=
 
    Or,             // |
 
    OrOr,           // ||
 
    OrEquals,       // |=
 
    Tilde,          // ~
 
    Equal,          // =
 
    EqualEqual,     // ==
 
    NotEqual,       // !=
 
    ShiftLeft,      // <<
 
    ShiftLeftEquals,// <<=
 
    ShiftRight,     // >>
 
    // Operator-like (three characters)
 
    ShiftLeftEquals,// <<=
 
    ShiftRightEquals, // >>=
 
    // Special marker token to indicate end of variable-character tokens
 
    SpanEnd,
 
}
 

	
 
impl TokenKind {
 
    fn has_span_end(&self) -> bool {
 
        return *self <= TokenKind::BlockComment
 
    }
 

	
 
    fn num_characters(&self) -> u32 {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        if *self <= TokenKind::Equal {
 
            1
 
        } else if *self <= TokenKind::ShiftRight {
 
            2
 
        } else {
 
            3
 
        }
 
    }
 
}
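
// Encoding sketch: the variable-width kinds (`Ident` through `BlockComment`)
// are stored as a start token followed by a `SpanEnd` token carrying the end
// position, while the fixed-width kinds store only their start and recover
// their width through `num_characters()`. So `foo ->` tokenizes as
// `Ident, SpanEnd, ArrowRight`.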
 

	
 
pub(crate) struct Token {
 
    pub kind: TokenKind,
 
    pub pos: InputPosition,
 
@@ -86,16 +118,18 @@ pub(crate) enum TokenRangeKind {
 
    Code,
 
}
 

	
 
/// TODO: Add first_child and next_sibling indices for slightly faster traversal
 
#[derive(Debug)]
 
struct TokenRange {
 
pub(crate) struct TokenRange {
 
    // Index of parent in `TokenBuffer.ranges`, does not have a parent if the 
 
    // range kind is Module, in that case the parent index points to itself.
 
    parent_idx: usize,
 
    range_kind: TokenRangeKind,
 
    curly_depth: i32,
 
    start: usize,
 
    end: usize,
 
    subranges: usize,
 
    pub parent_idx: usize,
 
    pub range_kind: TokenRangeKind,
 
    pub curly_depth: u32,
 
    // InputPosition offset is limited to u32, so token ranges can be as well.
 
    pub start: u32,
 
    pub end: u32,
 
    pub subranges: u32,
 
}
 

	
 
pub(crate) struct TokenBuffer {
 
@@ -107,26 +141,126 @@ impl TokenBuffer {
 
    pub(crate) fn new() -> Self {
 
        Self{ tokens: Vec::new(), ranges: Vec::new() }
 
    }
 

	
 
    pub(crate) fn iter_range<'a>(&'a self, range: &TokenRange) -> TokenIter<'a> {
 
        TokenIter::new(self, range.start as usize, range.end as usize)
 
    }
 

	
 
    pub(crate) fn start_pos(&self, range: &TokenRange) -> InputPosition {
 
        self.tokens[range.start as usize].pos
 
    }
 

	
 
    pub(crate) fn end_pos(&self, range: &TokenRange) -> InputPosition {
 
        let last_token = &self.tokens[(range.end - 1) as usize];
 
        if last_token.kind == TokenKind::SpanEnd {
 
            return last_token.pos
 
        } else {
 
            debug_assert!(!last_token.kind.has_span_end());
 
            return last_token.pos.with_offset(last_token.kind.num_characters());
 
        }
 
    }
 
}
 

	
 
pub(crate) struct TokenIter<'a> {
 
    tokens: &'a Vec<Token>,
 
    cur: usize,
 
    end: usize,
 
}
 

	
 
// Tokenizer is a reusable parser to tokenize multiple source files using the
 
// same allocated buffers. In a well-formed program, we produce a consistent
 
// tree of token ranges such that we may identify tokens that represent a 
 
// definition or an import before producing the entire AST.
 
//
 
// If the program is not well-formed then the tree may be inconsistent, but we
 
// will detect this once we transform the tokens into the AST. Maybe we want to
 
// detect a mismatch in opening/closing curly braces in the future?
 
impl<'a> TokenIter<'a> {
 
    fn new(buffer: &'a TokenBuffer, start: usize, end: usize) -> Self {
 
        Self{ tokens: &buffer.tokens, cur: start, end }
 
    }
 

	
 
    /// Returns the next token (may include comments), or `None` if at the end
 
    /// of the range.
 
    pub(crate) fn next_including_comments(&self) -> Option<TokenKind> {
 
        if self.cur >= self.end {
 
            return None;
 
        }
 

	
 
        let token = &self.tokens[self.cur];
 
        Some(token.kind)
 
    }
 

	
 
    /// Returns the next token (but skips over comments), or `None` if at the
 
    /// end of the range
 
    pub(crate) fn next(&mut self) -> Option<TokenKind> {
 
        while let Some(token_kind) = self.next_including_comments() {
 
            if token_kind != TokenKind::LineComment && token_kind != TokenKind::BlockComment {
 
                return Some(token_kind);
 
            }
 
            self.consume();
 
        }
 

	
 
        return None
 
    }
 

	
 
    /// Returns the start position belonging to the token returned by `next`. If
 
    /// there is not a next token, then we return the end position of the
 
    /// previous token.
 
    pub(crate) fn last_valid_pos(&self) -> InputPosition {
 
        if self.cur < self.end {
 
            // Return token position
 
            return self.tokens[self.cur].pos
 
        }
 

	
 
        // Return previous token end
 
        let token = &self.tokens[self.cur - 1];
 
        return if token.kind == TokenKind::SpanEnd {
 
            token.pos
 
        } else {
 
            token.pos.with_offset(token.kind.num_characters())
 
        };
 
    }
 

	
 
    /// Returns the token range belonging to the token returned by `next`. This
 
    /// assumes that we're not at the end of the range we're iterating over.
 
    pub(crate) fn next_range(&self) -> (InputPosition, InputPosition) {
 
        debug_assert!(self.cur < self.end);
 
        let token = &self.tokens[self.cur];
 
        if token.kind.has_span_end() {
 
            let span_end = &self.tokens[self.cur + 1];
 
            debug_assert_eq!(span_end.kind, TokenKind::SpanEnd);
 
            (token.pos, span_end.pos)
 
        } else {
 
            let offset = token.kind.num_characters();
 
            (token.pos, token.pos.with_offset(offset))
 
        }
 
    }
 

	
 
    pub(crate) fn consume(&mut self) {
 
        if let Some(kind) = self.next() {
 
            if kind.has_span_end() {
 
                self.cur += 2;
 
            } else {
 
                self.cur += 1;
 
            }
 
        }
 
    }
 
}
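
// Minimal iteration sketch (assumes a populated `TokenBuffer` and one of its
// ranges): `next()` peeks at the current non-comment token without advancing,
// `next_range()` yields its positions, and `consume()` advances past the
// token (and its `SpanEnd`, if it has one).
//
//     let mut iter = buffer.iter_range(range);
//     while let Some(kind) = iter.next() {
//         let (start, end) = iter.next_range();
//         // ... inspect source.section(start, end) based on `kind` ...
//         iter.consume();
//     }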
 

	
 
/// Tokenizer is a reusable parser to tokenize multiple source files using the
 
/// same allocated buffers. In a well-formed program, we produce a consistent
 
/// tree of token ranges such that we may identify tokens that represent a
 
/// definition or an import before producing the entire AST.
 
///
 
/// If the program is not well-formed then the tree may be inconsistent, but we
 
/// will detect this once we transform the tokens into the AST. To ensure a
 
/// consistent AST-producing phase we will require the import to have balanced
 
/// curly braces
 
pub(crate) struct Tokenizer {
 
    // Signed because programmer might have placed too many closing curly braces
 
    curly_depth: i32,
 
    // Stack of input positions of opening curly braces, used to detect
 
    // unmatched opening braces, unmatched closing braces are detected
 
    // immediately.
 
    curly_stack: Vec<InputPosition>,
 
    // Points to an element in the `TokenBuffer.ranges` variable.
 
    stack_idx: usize,
 
}
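
// Range-tree sketch for a small module (positions elided, field syntax
// illustrative): each top-level construct becomes a subrange of the single
// Module range, and tokens between them are collected into Code ranges.
//
//     #module example              -> Pragma range
//     import other.module;         -> Import range
//     struct Foo { /* fields */ }  -> Definition range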
 

	
 
impl Tokenizer {
 
    pub(crate) fn new() -> Self {
 
        Self{ curly_depth: 0, stack_idx: 0 }
 
        Self{ curly_depth: 0, curly_stack: Vec::with_capacity(32), stack_idx: 0 }
 
    }
 
    pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        // Assert source and buffer are at start
 
@@ -150,7 +284,7 @@ impl Tokenizer {
 

	
 
        // Main tokenization loop
 
        while let Some(c) = source.next() {
 
            let token_index = target.tokens.len();
 
            let token_index = target.tokens.len() as u32;
 

	
 
            if is_char_literal_start(c) {
 
                self.consume_char_literal(source, target)?;
 
@@ -180,28 +314,37 @@ impl Tokenizer {
 
                if contained_newline {
 
                    let range = &target.ranges[self.stack_idx];
 
                    if range.range_kind == TokenRangeKind::Pragma {
 
                        self.pop_range(target, target.tokens.len());
 
                        self.pop_range(target, target.tokens.len() as u32);
 
                    }
 
                }
 
            } else {
 
                let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
 
                if let Some(token) = was_punctuation {
 
                if let Some((token, token_pos)) = was_punctuation {
 
                    if token == TokenKind::OpenCurly {
 
                        self.curly_depth += 1;
 
                        self.curly_stack.push(token_pos);
 
                    } else if token == TokenKind::CloseCurly {
 
                        // Check if this marks the end of a range we're 
 
                        // currently processing
 
                        self.curly_depth -= 1;
 
                        if self.curly_stack.is_empty() {
 
                        return Err(ParseError::new_error_str_at_pos(
 
                                source, token_pos, "unmatched closing curly brace '}'"
 
                            ));
 
                        }
 

	
 
                        self.curly_stack.pop();
 

	
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_depth {
 
                            self.pop_range(target, target.tokens.len());
 
                            self.pop_range(target, target.tokens.len() as u32);
 
                        }
 

	
 
 
                    } else if token == TokenKind::SemiColon {
 
                        // Check if this marks the end of an import
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Import {
 
                            self.pop_range(target, target.tokens.len());
 
                            self.pop_range(target, target.tokens.len() as u32);
 
                        }
 
                    }
 
                } else {
 
@@ -215,6 +358,15 @@ impl Tokenizer {
 
            return Err(error);
 
        }
 

	
 
        if !self.curly_stack.is_empty() {
 
            // Let's not add a lot of heuristics and just tell the programmer
 
            // that something is wrong
 
            let last_unmatched_open = self.curly_stack.pop().unwrap();
 
            return Err(ParseError::new_error_str_at_pos(
 
                source, last_unmatched_open, "unmatched opening curly brace '{'"
 
            ));
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
@@ -226,7 +378,9 @@ impl Tokenizer {
 
        return first_char == b'/' && Some(b'*') == source.lookahead(1);
 
    }
 

	
 
    fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<Option<TokenKind>, ParseError> {
 
    fn maybe_parse_punctuation(
 
        &mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer
 
    ) -> Result<Option<(TokenKind, InputPosition)>, ParseError> {
 
        debug_assert!(first_char != b'#', "'#' needs special handling");
 
        debug_assert!(first_char != b'\'', "'\'' needs special handling");
 
        debug_assert!(first_char != b'"', "'\"' needs special handling");
 
@@ -412,7 +566,7 @@ impl Tokenizer {
 
        }
 

	
 
        target.tokens.push(Token::new(token_kind, pos));
 
        Ok(Some(token_kind))
 
        Ok(Some((token_kind, pos)))
 
    }
 

	
 
    fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
@@ -610,7 +764,7 @@ impl Tokenizer {
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::Ident, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 
        Ok(source.section(begin_pos.offset, end_pos.offset))
 
        Ok(source.section(begin_pos, end_pos))
 
    }
 

	
 
    fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
@@ -656,23 +810,18 @@ impl Tokenizer {
 
    }
 

	
 
    /// Pushes a new token range onto the stack in the buffers.
 
    fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
 
    fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: u32) {
 
        let cur_range = &mut target.ranges[self.stack_idx];
 

	
 
        println!(
 
            "DEBUG: push_range [1] | stack_idx: {}, range_end: {}, first_token: {}", 
 
            self.stack_idx, cur_range.end, first_token
 
        );
 

	
 
        // If we have just popped a range and then push a new range, then the
 
        // first token is equal to the last token registered on the current 
 
        // range. If not, then we had some intermediate tokens that did not 
 
        // belong to a particular kind of token range: hence we insert an 
 
        // intermediate "code" range.
 
        if cur_range.end != first_token {
 
            println!("DEBUG: push_range [2] | inserting code range");
 
            let code_start = cur_range.end;
 
            cur_range.end = first_token;
 
            debug_assert_ne!(code_start, first_token);
 
            cur_range.subranges += 1;
 
            target.ranges.push(TokenRange{
 
                parent_idx: self.stack_idx,
 
@@ -685,10 +834,6 @@ impl Tokenizer {
 
        }
 

	
 
        // Insert a new range
 
        println!(
 
            "DEBUG: push_range [3] | kind: {:?}, parent_idx: {}, stack_idx: {}", 
 
            range_kind, self.stack_idx, target.ranges.len()
 
        );
 
        let parent_idx = self.stack_idx;
 
        self.stack_idx = target.ranges.len();
 
        target.ranges.push(TokenRange{
 
@@ -701,26 +846,19 @@ impl Tokenizer {
 
        });
 
    }
 

	
 
    fn pop_range(&mut self, target: &mut TokenBuffer, end_index: usize) {
 
    fn pop_range(&mut self, target: &mut TokenBuffer, end_index: u32) {
 
        let last = &mut target.ranges[self.stack_idx];
 
        debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range");
 

	
 
        // Fix up the current range before going back to parent
 
        println!(
 
            "DEBUG: pop_range  [1] | stack_idx: {}, kind: {:?}, start: {}, old_end: {}, new_end: {}",
 
            self.stack_idx, last.range_kind, last.start, last.end, end_index
 
        );
 
        last.end = end_index;
 
        debug_assert_ne!(last.start, end_index);
 
        
 
        // Go back to parent
 
        self.stack_idx = last.parent_idx;
 
        let parent = &mut target.ranges[self.stack_idx];
 
        parent.end = end_index;
 
        parent.subranges += 1;
 
        println!(
 
            "DEBUG: pop_range  [2] | returning to kind: {:?}, idx: {}, new_end: {}",
 
            parent.range_kind, self.stack_idx, end_index
 
        );
 
    }
 

	
 

	
 
@@ -739,16 +877,16 @@ impl Tokenizer {
 
// Helpers for characters
 
fn demarks_definition(ident: &[u8]) -> bool {
 
    return
 
        ident == b"struct" ||
 
        ident == b"enum" ||
 
        ident == b"union" ||
 
        ident == b"func" ||
 
        ident == b"primitive" ||
 
        ident == b"composite"
 
        ident == KW_STRUCT ||
 
        ident == KW_ENUM ||
 
        ident == KW_UNION ||
 
        ident == KW_FUNCTION ||
 
        ident == KW_PRIMITIVE ||
 
        ident == KW_COMPOSITE
 
}
 

	
 
fn demarks_import(ident: &[u8]) -> bool {
 
    return ident == b"import";
 
    return ident == KW_IMPORT;
 
}
 

	
 
fn is_whitespace(c: u8) -> bool {
 
@@ -856,7 +994,7 @@ mod tests {
 
                    let (_, end) = iter.next().unwrap();
 
                    println!("[{}] {:?} ......", idx, token.kind);
 
                    assert_eq!(end.kind, TokenKind::SpanEnd);
 
                    let text = source.section(token.pos.offset, end.pos.offset);
 
                    let text = source.section(token.pos, end.pos);
 
                    println!("{}", String::from_utf8_lossy(text));
 
                },
 
                _ => {