Changeset - 9227f244da73
[Not reviewed]
0 4 0
MH - 4 years ago 2021-04-16 09:44:06
henger@cwi.nl
WIP on compiler rearchitecting
4 files changed with 243 insertions and 48 deletions:
0 comments (0 inline, 0 general)
src/protocol/ast.rs
Show inline comments
 
@@ -330,40 +330,34 @@ pub struct ImportModule {
 
    pub span: InputSpan,
 
    pub module_name: Identifier,
 
    pub alias: Identifier,
 
    pub module_id: RootId,
 
}
 

	
 
/// A symbol name paired with an alias, filled in over two compiler phases
/// (parsing, then symbol resolving).
#[derive(Debug, Clone)]
pub struct AliasedSymbol {
    // Phase 1: parser
    pub position: InputPosition,
    pub name: Identifier,
    pub alias: Identifier,
    // Phase 2: symbol resolving
    // NOTE(review): `definition_id` is declared twice below, once as
    // `Option<DefinitionId>` and once as `DefinitionId`. This looks like the
    // before/after pair of a changeset view; a struct cannot compile with
    // both, so confirm which declaration is the surviving one.
    pub definition_id: Option<DefinitionId>,
    pub definition_id: DefinitionId,
}
 

	
 
/// Import of symbols from a module: either all of them (empty `symbols`) or
/// an explicit list of optionally aliased symbols.
#[derive(Debug, Clone)]
pub struct ImportSymbols {
    // presumably the ID of this import node itself — TODO confirm
    pub this: ImportId,
    // Phase 1: parser
    pub span: InputSpan,
    pub module_name: Vec<u8>,
    // Phase 2: module resolving
    pub module_id: Option<RootId>,
    // Phase 1&2
    // if symbols is empty, then we implicitly import all symbols without any
    // aliases for them. If it is not empty, then symbols are explicitly
    // specified, and optionally given an alias.
    pub symbols: Vec<AliasedSymbol>,
}
 

	
 
/// An identifier: its text plus the input span associated with it.
#[derive(Debug, Clone)]
pub struct Identifier {
    // Input span of the identifier.
    pub span: InputSpan,
    // Text of the identifier as a `StringRef`.
    pub value: StringRef<'static>,
}
 

	
 
impl PartialEq for Identifier {
 
    fn eq(&self, other: &Self) -> bool {
 
        return self.value == other.value
src/protocol/lexer2.rs
Show inline comments
 
@@ -211,80 +211,83 @@ impl PassPreSymbol {
 
            module.tokens.start_pos(range),
 
            module.tokens.end_pos(range)
 
        );
 
        let mut iter = module.tokens.iter_range(range);
 

	
 
        // First ident must be type of symbol
 
        let (kw_text, _) = consume_any_ident(&module.source, &mut iter).unwrap();
 
        let kw = parse_definition_keyword(kw_text).unwrap();
 

	
 
        // Retrieve identifier of definition
 
        let (identifier_text, identifier_span) = consume_ident(&module.source, &mut iter)?;
 
        let ident_text = ctx.pool.intern(identifier_text);
 
        let identifier = Identifier{ span: identifier_span, value: ident_text };
 
        let identifier = Identifier{ span: identifier_span, value: ident_text.clone() };
 

	
 
        // Reserve space in AST for definition and add it to the symbol table
 
        let symbol_definition;
 
        let definition_class;
 
        let ast_definition_id;
 
        match kw {
 
            KeywordDefinition::Struct => {
 
                let struct_def_id = ctx.heap.alloc_struct_definition(|this| {
 
                    StructDefinition::new_empty(this, definition_span, identifier)
 
                });
 
                symbol_definition = SymbolDefinition::Struct(struct_def_id);
 
                definition_class = DefinitionClass::Struct;
 
                ast_definition_id = struct_def_id.upcast();
 
            },
 
            KeywordDefinition::Enum => {
 
                let enum_def_id = ctx.heap.alloc_enum_definition(|this| {
 
                    EnumDefinition::new_empty(this, definition_span, identifier)
 
                });
 
                symbol_definition = SymbolDefinition::Enum(enum_def_id);
 
                definition_class = DefinitionClass::Enum;
 
                ast_definition_id = enum_def_id.upcast();
 
            },
 
            KeywordDefinition::Union => {
 
                let union_def_id = ctx.heap.alloc_union_definition(|this| {
 
                    UnionDefinition::new_empty(this, definition_span, identifier)
 
                });
 
                symbol_definition = SymbolDefinition::Union(union_def_id);
 
                definition_class = DefinitionClass::Union;
 
                ast_definition_id = union_def_id.upcast()
 
            },
 
            KeywordDefinition::Function => {
 
                let func_def_id = ctx.heap.alloc_function_definition(|this| {
 
                    FunctionDefinition::new_empty(this, definition_span, identifier)
 
                });
 
                symbol_definition = SymbolDefinition::Function(func_def_id);
 
                definition_class = DefinitionClass::Function;
 
                ast_definition_id = func_def_id.upcast();
 
            },
 
            KeywordDefinition::Primitive | KeywordDefinition::Composite => {
 
                let component_variant = if kw == KeywordDefinition::Primitive {
 
                    ComponentVariant::Primitive
 
                } else {
 
                    ComponentVariant::Composite
 
                };
 
                let comp_def_id = ctx.heap.alloc_component_definition(|this| {
 
                    ComponentDefinition::new_empty(this, definition_span, component_variant, identifier)
 
                });
 
                symbol_definition = SymbolDefinition::Component(comp_def_id);
 
                definition_class = DefinitionClass::Component;
 
                ast_definition_id = comp_def_id.upcast();
 
            }
 
        }
 

	
 
        let symbol = Symbol{
 
            name: ident_text,
 
            data: SymbolVariant::Definition(SymbolDefinition{
 
                defined_in_module: module.root_id,
 
                defined_in_scope: SymbolScope::Module(module.root_id),
 
                definition_span,
 
                identifier_span,
 
                introduced_at: None,
 
            name: definition_ident,
 
            definition: symbol_definition
 
                class: definition_class,
 
                definition_id: ast_definition_id,
 
            }),
 
        };
 
        self.symbols.push(symbol);
 
        self.definitions.push(ast_definition_id);
 

	
 
        Ok(())
 
    }
 
}
 

	
 
/// Parses all the imports in the module tokens. Is applied after the
 
/// definitions and name of modules are resolved. Hence we should be able to
 
/// resolve all symbols to their appropriate module/definition.
 
pub(crate) struct PassImport {
 
@@ -348,77 +351,123 @@ impl PassImport {
 

	
 
        // Check for subsequent characters
 
        let next = iter.next();
 
        if has_ident(&module.source, &mut iter, b"as") {
 
            iter.consume();
 
            let (alias_text, alias_span) = consume_ident(source, &mut iter)?;
 
            let alias = ctx.pool.intern(alias_text);
 

	
 
            let import_id = ctx.heap.alloc_import(|this| Import::Module(ImportModule{
 
                this,
 
                span: import_span,
 
                module_name: Identifier{ span: module_name_span, value: module_name },
 
                alias: Identifier{ span: alias_span, value: alias },
 
                alias: Identifier{ span: alias_span, value: alias.clone() },
 
                module_id: target_root_id
 
            }));
 
            ctx.symbols.insert_symbol(SymbolScope::Module(module.root_id), Symbol{
 
                defined_in_module: target_root_id,
 
                defined_in_scope: SymbolScope::Module(target_root_id),
 
                definition_span
 
            })
 
                name: alias,
 
                data: SymbolVariant::Module(SymbolModule{
 
                    root_id: target_root_id,
 
                    introduced_at: import_id,
 
                }),
 
            });
 
        } else if Some(TokenKind::ColonColon) == next {
 
            fn consume_symbol_and_maybe_alias<'a>(
 
                source: &'a InputSource, iter: &mut TokenIter, in_scope: SymbolScope, ctx: &Ctx
 
            ) -> Result<(&'a [u8], InputSpan, Option<(&'a [u8], InputSpan)>), ParseError> {
 
                // Consume symbol and make sure it points to something valid
 
                let (symbol, symbol_span) = consume_ident(source, iter)?;
 
                let target = ctx.symbols.get_symbol_by_name_defined_in_scope(in_scope, symbol);
 
                if target.is_none() {
 

	
 
                }
 

	
 
                if peek_ident(source, iter) == b"as" {
 
                    // Consume alias
 
                    iter.consume();
 
                    let (alias, alias_span) = consume_ident(source, iter)?;
 
                    Ok((symbol, symbol_span, Some((alias, alias_span))))
 
                } else {
 
                    Ok((symbol, symbol_span, None))
 
                }
 
            }
 

	
 
            iter.consume();
 

	
 
            let next = iter.next();
 
            if Some(TokenKind::Ident) = next {
 
                // Importing a single symbol
 
                iter.consume();
 
                let (symbol_text, symbol_span, maybe_alias) = consume_symbol_and_maybe_alias(&module.source, &mut iter)?;
 
                let target_symbol = ctx.symbols.get_symbol_by_name_defined_in_scope(
 
                    SymbolScope::Module(target_root_id))
 
            } else if Some(TokenKind::OpenCurly) = next {
 
                // Importing multiple symbols
 
                iter.consume();
 
            } else if Some(TokenKind::Star) = next {
 
                // Import all symbols from the module
 
                iter.consume();
 
            } else {
 
                return Err(ParseError::new_error_str_at_pos(
 
                    &module.source, iter.last_valid_pos(), "expected symbol name, '{' or '*'"
 
                ));
 
            }
 
        } else {
 
            // Assume implicit alias, then check if we get the semicolon next
 
            let module_name_str = module_name.as_str();
 
            let last_ident_start = module_name_str.rfind('.').map_or(0, |v| v + 1);
 
            let alias_text = &module_name_str.as_bytes()[last_ident_start..];
 
            let alias = ctx.pool.intern(alias_text);
 
            let alias_span = InputSpan::from_positions(
 
                module_name_span.begin.with_offset(last_ident_start as u32),
 
                module_name_span.end
 
            );
 
        }
 

	
 
        Ok(())
 
    }
 
}
 

	
 
/// Consumes a dot-separated run of identifiers and returns the source bytes
/// covering the whole run together with its span.
///
/// Errors from `consume_ident`/`consume_dot` are propagated. In addition the
/// name is rejected when it spans more than one line, or when the consumed
/// section is a reserved keyword.
fn consume_domain_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> {
    // The span starts at the first identifier; only its end is moved forward
    // while further `.ident` segments are consumed.
    let (_, mut span) = consume_ident(source, iter)?;
    while let Some(TokenKind::Dot) = iter.next() {
        // NOTE(review): both `consume_dot(..)` and `iter.consume()` appear
        // here. This looks like the before/after pair of a changeset view;
        // confirm that only one of them advances past the dot.
        consume_dot(source, iter)?;
        iter.consume();
        let (_, new_span) = consume_ident(source, iter)?;
        span.end = new_span.end;
    }

    // Not strictly necessary, but probably a reasonable restriction: this
    // simplifies parsing of module naming and imports.
    if span.begin.line != span.end.line {
        return Err(ParseError::new_error_str_at_span(source, span, "module names may not span multiple lines"));
    }

    // If module name consists of a single identifier, then it may not match any
    // of the reserved keywords
    let section = source.section(span.begin, span.end);
    if is_reserved_keyword(section) {
        return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"));
    }

    // Same section as checked above, returned together with the full span.
    Ok((source.section(span.begin, span.end), span))
}
 

	
 
fn consume_dot<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(), ParseError> {
 
    if Some(TokenKind::Dot) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a dot"));
 
/// Consumes the next token if it is exactly `expected`, otherwise reports a
/// parse error at the iterator's last valid position. Only pass token kinds
/// with a fixed textual form: the error message is built from
/// `expected.token_chars()`, which is not defined for variable-length tokens.
fn consume_token(source: &InputSource, iter: &mut TokenIter, expected: TokenKind) -> Result<(), ParseError> {
    // Happy path first: the upcoming token matches, so step over it.
    if iter.next() == Some(expected) {
        iter.consume();
        return Ok(());
    }

    let error_position = iter.last_valid_pos();
    let message = format!("expected '{}'", expected.token_chars());
    Err(ParseError::new_error_at_pos(source, error_position, message))
}
 

	
 
fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> {
 
    if Some(TokenKind::Integer) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal"));
 
    }
 
    let (start_pos, end_pos) = iter.next_range();
 
    iter.consume();
 

	
src/protocol/parser/symbol_table2.rs
Show inline comments
 
@@ -23,72 +23,117 @@ pub enum SymbolScope {
 
}
 

	
 
/// Coarse classification of a symbol: either a module or one of the kinds of
/// definition. `Debug` is derived so the class can be printed in diagnostics
/// and used directly in `assert_eq!`-style checks.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SymbolClass {
    Module,
    Struct,
    Enum,
    Union,
    Function,
    Component
}
 

	
 
/// Classification of a definition (everything a symbol can be except a
/// module). `Debug` is derived so the class can be printed in diagnostics and
/// used directly in `assert_eq!`-style checks.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DefinitionClass {
    Struct,
    Enum,
    Union,
    Function,
    Component,
}
 

	
 
impl DefinitionClass {
 
    fn as_symbol_class(&self) -> SymbolClass {
 
        match self {
 
            DefinitionClass::Struct => SymbolClass::Struct,
 
            DefinitionClass::Enum => SymbolClass::Enum,
 
            DefinitionClass::Union => SymbolClass::Union,
 
            DefinitionClass::Function => SymbolClass::Function,
 
            DefinitionClass::Component => SymbolClass::Component,
 
        }
 
    }
 
}
 

	
 
/// The symbols stored for one scope, together with links to the scopes around
/// it.
struct ScopedSymbols {
    // The scope these symbols belong to.
    scope: SymbolScope,
    // Enclosing scope, if any; `None` means there is no parent to continue a
    // lookup in.
    parent_scope: Option<SymbolScope>,
    child_scopes: Vec<SymbolScope>,
    // Symbols stored for this scope.
    symbols: Vec<Symbol>,
}
 

	
 
pub enum SymbolDefinition {
 
    Module(RootId),
 
    Struct(StructDefinitionId),
 
    Enum(EnumDefinitionId),
 
    Union(UnionDefinitionId),
 
    Function(FunctionDefinitionId),
 
    Component(ComponentDefinitionId),
 
impl ScopedSymbols {
    /// Returns the first symbol stored in this scope whose name equals `name`,
    /// or `None` when this scope holds no such symbol. Parent scopes are not
    /// consulted.
    fn get_symbol<'a>(&'a self, name: &StringRef) -> Option<&'a Symbol> {
        self.symbols.iter().find(|candidate| candidate.name == *name)
    }
}
 

	
 
// NOTE(review): this impl matches on enum-style variants (`SD::Module(_)`,
// `SD::Struct(_)`, ...), while a `pub struct SymbolDefinition` with named
// fields also appears in this view. It presumably belongs to the enum form of
// `SymbolDefinition` — confirm whether it is still part of the file.
impl SymbolDefinition {
    /// Returns the `SymbolClass` with the same name as this variant; the
    /// wrapped ID is ignored.
    pub fn symbol_class(&self) -> SymbolClass {
        use SymbolDefinition as SD;
        use SymbolClass as SC;

        match self {
            SD::Module(_) => SC::Module,
            SD::Struct(_) => SC::Struct,
            SD::Enum(_) => SC::Enum,
            SD::Union(_) => SC::Union,
            SD::Function(_) => SC::Function,
            SD::Component(_) => SC::Component,
        }
    }
}
 

	
 
pub enum SymbolData {
 
    
 
/// Payload of a symbol that refers to a module.
pub struct SymbolModule {
    // Root ID of the module the symbol refers to.
    pub root_id: RootId,
    // The import associated with this symbol (presumably the import that
    // brought the module into scope — TODO confirm).
    pub introduced_at: ImportId,
}
 

	
 
pub struct Symbol {
 
    // Definition location (may be different from the scope/module in which it
 
    // is used if the symbol is imported)
 
/// Payload of a symbol that refers to a definition (struct, enum, union,
/// function or component).
///
/// NOTE(review): `definition_span` is declared twice, and both
/// `introduced_at` and `imported_at` are present with the same type. This
/// looks like the before/after lines of a changeset view; confirm which
/// declarations survive.
pub struct SymbolDefinition {
    // Definition location (not necessarily the place where the symbol
    // is introduced, as it may be imported)
    pub defined_in_module: RootId,
    pub defined_in_scope: SymbolScope,
    pub definition_span: InputSpan, // full span of definition, not just the name
    pub definition_span: InputSpan, // full span of definition
    pub identifier_span: InputSpan, // span of just the identifier
    // Introduction location (if imported instead of defined)
    pub introduced_at: Option<ImportId>,
    // Symbol properties
    // Location where the symbol is introduced in its scope
    pub imported_at: Option<ImportId>,
    // Definition in the heap, with a utility enum to determine its
    // class if the ID is not needed.
    pub class: DefinitionClass,
    pub definition_id: DefinitionId,
}
 

	
 
/// What a symbol refers to: a module or a definition.
pub enum SymbolVariant {
    // The symbol names a module.
    Module(SymbolModule),
    // The symbol names a definition.
    Definition(SymbolDefinition),
}
 

	
 
/// A named entry in the symbol table.
///
/// NOTE(review): both `definition` and `data` are declared here. This looks
/// like the before/after pair of a changeset view; confirm which field
/// remains.
pub struct Symbol {
    // Name the symbol is looked up by.
    pub name: StringRef<'static>,
    pub definition: SymbolDefinition,
    // Module or definition payload.
    pub data: SymbolVariant,
}
 

	
 
impl Symbol {
    /// Classifies this symbol: module symbols are `SymbolClass::Module`,
    /// definition symbols report the symbol class matching their stored
    /// definition class.
    fn class(&self) -> SymbolClass {
        match self.data {
            SymbolVariant::Definition(ref definition) => definition.class.as_symbol_class(),
            SymbolVariant::Module(_) => SymbolClass::Module,
        }
    }
}
 

	
 
/// Lookup tables for modules and for the symbols stored per scope.
pub struct SymbolTable {
    // Module name -> root ID of that module.
    module_lookup: HashMap<StringRef<'static>, RootId>,
    // Scope -> the symbols (and parent/child links) stored for that scope.
    scope_lookup: HashMap<SymbolScope, ScopedSymbols>,
}
 

	
 
impl SymbolTable {
 
    /// Inserts a new module by its name. Upon module naming conflict the
 
    /// previously associated `RootId` will be returned.
 
    pub(crate) fn insert_module(&mut self, module_name: StringRef<'static>, root_id: RootId) -> Result<(), RootId> {
 
        match self.module_lookup.entry(module_name) {
 
@@ -149,20 +194,62 @@ impl SymbolTable {
 
            match scoped_symbols.parent_scope {
 
                Some(parent_scope) => { seek_scope = parent_scope; },
 
                None => { break; }
 
            }
 
        }
 

	
 
        // If here, then there is no collision
 
        let scoped_symbols = self.scope_lookup.get_mut(&in_scope).unwrap();
 
        scoped_symbols.symbols.push(symbol);
 
        Ok(())
 
    }
 

	
 
    /// Retrieves a particular scope. As this will be called by the compiler to
 
    /// retrieve scopes that MUST exist, this function will panic if the
 
    /// indicated scope does not exist.
 
    pub(crate) fn get_scope_by_id(&mut self, scope: &SymbolScope) -> &mut ScopedSymbols {
 
        debug_assert!(self.scope_lookup.contains_key(scope), "retrieving scope {:?}, but it doesn't exist", scope);
 
        self.scope_lookup.get_mut(scope).unwrap()
 
    /// Looks up `name` starting in `in_scope` and then walking outwards through the chain of
    /// parent scopes. The first match wins, regardless of whether that symbol was defined or
    /// imported in the scope it is found in. Yields `None` when a scope on the chain is unknown
    /// to the table or when the chain ends without a match.
    pub(crate) fn get_symbol_by_name(
        &self, mut in_scope: SymbolScope, name: &[u8]
    ) -> Option<&Symbol> {
        let needle = StringRef::new(name);
        loop {
            // An unknown scope ends the search immediately.
            let current = self.scope_lookup.get(&in_scope)?;

            if let Some(found) = current.get_symbol(&needle) {
                return Some(found);
            }

            // Not in this scope: continue in the parent, or give up at the outermost scope.
            in_scope = current.parent_scope?;
        }
    }
 

	
 
    /// Same search as `get_symbol_by_name` (the given scope, then its parents), but only accepts
    /// a symbol that is a definition which was not imported. If the name resolves to a module
    /// symbol or to an imported definition the result is `None`; the search does not continue
    /// past that first match.
    pub(crate) fn get_symbol_by_name_defined_in_scope(
        &self, in_scope: SymbolScope, name: &[u8]
    ) -> Option<&Symbol> {
        let symbol = self.get_symbol_by_name(in_scope, name)?;

        match &symbol.data {
            // in-scope modules are always imported
            SymbolVariant::Module(_) => None,
            SymbolVariant::Definition(definition) if definition.imported_at.is_some() => None,
            SymbolVariant::Definition(_) => Some(symbol),
        }
    }
 
}
 
\ No newline at end of file
src/protocol/tokenizer/mod.rs
Show inline comments
 
@@ -73,38 +73,103 @@ pub(crate) enum TokenKind {
 
    EqualEqual,     // ==
 
    NotEqual,       // !=
 
    ShiftLeft,      // <<
 
    ShiftRight,     // >>
 
    // Operator-like (three characters)
 
    ShiftLeftEquals,// <<=
 
    ShiftRightEquals, // >>=
 
    // Special marker token to indicate end of variable-character tokens
 
    SpanEnd,
 
}
 

	
 
impl TokenKind {
    /// Reports whether a token of this kind is followed by the special `TokenKind::SpanEnd`
    /// marker, as is the case for tokens of variable length (e.g. an identifier). The check
    /// relies on the ordering of the variants: every kind up to and including `BlockComment`
    /// qualifies.
    fn has_span_end(&self) -> bool {
        *self <= TokenKind::BlockComment
    }

    /// Length in characters of a fixed-length token. Must not be called on variable-length
    /// tokens or on `SpanEnd` (checked in debug builds). Like `has_span_end`, this relies on
    /// the ordering of the variants: kinds up to `Equal` are one character, kinds up to
    /// `ShiftRight` are two, and the rest are three.
    fn num_characters(&self) -> u32 {
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
        match self {
            kind if *kind <= TokenKind::Equal => 1,
            kind if *kind <= TokenKind::ShiftRight => 2,
            _ => 3,
        }
    }

    /// Textual form of a fixed-length token. Must not be called on variable-length tokens or
    /// on `SpanEnd`: debug builds assert this, and those kinds hit `unreachable!()`.
    pub fn token_chars(&self) -> &'static str {
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
        match self {
            // One character
            Self::Exclamation => "!",
            Self::Question => "?",
            Self::Pound => "#",
            Self::OpenAngle => "<",
            Self::OpenCurly => "{",
            Self::OpenParen => "(",
            Self::OpenSquare => "[",
            Self::CloseAngle => ">",
            Self::CloseCurly => "}",
            Self::CloseParen => ")",
            Self::CloseSquare => "]",
            Self::Colon => ":",
            Self::Comma => ",",
            Self::Dot => ".",
            Self::SemiColon => ";",
            Self::Quote => "'",
            Self::DoubleQuote => "\"",
            Self::At => "@",
            Self::Plus => "+",
            Self::Minus => "-",
            Self::Star => "*",
            Self::Slash => "/",
            Self::Percent => "%",
            Self::Caret => "^",
            Self::And => "&",
            Self::Or => "|",
            Self::Tilde => "~",
            Self::Equal => "=",
            // Two characters
            Self::ColonColon => "::",
            Self::DotDot => "..",
            Self::ArrowRight => "->",
            Self::PlusPlus => "++",
            Self::PlusEquals => "+=",
            Self::MinusMinus => "--",
            Self::MinusEquals => "-=",
            Self::StarEquals => "*=",
            Self::SlashEquals => "/=",
            Self::PercentEquals => "%=",
            Self::CaretEquals => "^=",
            Self::AndAnd => "&&",
            Self::AndEquals => "&=",
            Self::OrOr => "||",
            Self::OrEquals => "|=",
            Self::EqualEqual => "==",
            Self::NotEqual => "!=",
            Self::ShiftLeft => "<<",
            Self::ShiftRight => ">>",
            // Three characters
            Self::ShiftLeftEquals => "<<=",
            Self::ShiftRightEquals => ">>=",
            // Listed explicitly (no wildcard) so that adding a new token kind forces a decision
            // here.
            Self::Ident | Self::Pragma | Self::Integer | Self::String | Self::Character |
            Self::LineComment | Self::BlockComment | Self::SpanEnd => unreachable!(),
        }
    }
}
 

	
 
/// A single token: its kind plus an input position.
pub(crate) struct Token {
    // Kind of the token.
    pub kind: TokenKind,
    // Input position associated with the token (presumably where the token
    // starts — TODO confirm).
    pub pos: InputPosition,
}
 

	
 
impl Token {
 
    fn new(kind: TokenKind, pos: InputPosition) -> Self {
 
        Self{ kind, pos }
 
    }
 
}
0 comments (0 inline, 0 general)