From 1a42eb33aa761ebad2152020e4be27dbcfdac6d9 2021-04-15 17:47:02 From: MH Date: 2021-04-15 17:47:02 Subject: [PATCH] WIP on compiler rearchitecting --- diff --git a/src/collections/string_pool.rs b/src/collections/string_pool.rs index 30cb8a5c6451ca390b91ed9c5a3d93bf989a2406..54505dba6aa01de3f3c4567a3e23d362c6226e5c 100644 --- a/src/collections/string_pool.rs +++ b/src/collections/string_pool.rs @@ -1,17 +1,31 @@ use std::ptr::null_mut; use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; const SLAB_SIZE: usize = u16::max_value() as usize; #[derive(Clone)] -pub struct StringRef { +pub struct StringRef<'a> { data: *const u8, length: usize, + _phantom: PhantomData<&'a [u8]>, } -impl StringRef { - pub fn as_str<'a>(&'a self) -> &'a str { +impl<'a> StringRef<'a> { + /// `new` constructs a new StringRef whose data is not owned by the + /// `StringPool`, hence cannot have a `'static` lifetime. + pub(crate) fn new(data: &'a [u8]) -> StringRef<'a> { + // This is an internal (compiler) function: so debug_assert that the + // string is valid UTF8. Most commonly the input will come from the + // code's source file, which is checked for ASCII-ness anyway. + debug_assert!(data.is_ascii()); + let length = data.len(); + let data = data.as_ptr(); + StringRef{ data, length, _phantom: PhantomData } + } + + pub fn as_str(&self) -> &'a str { unsafe { let slice = std::slice::from_raw_parts::<'a, u8>(self.data, self.length); std::str::from_utf8_unchecked(slice) @@ -66,7 +80,7 @@ impl StringPool { } // Interns a string to the StringPool, returning a reference to it - pub(crate) fn intern(&mut self, data: &[u8]) -> StringRef { + pub(crate) fn intern(&mut self, data: &[u8]) -> StringRef<'static> { // TODO: Large string allocations, if ever needed. 
let data_len = data.len(); assert!(data_len <= SLAB_SIZE, "string is too large for slab"); @@ -88,7 +102,7 @@ impl StringPool { unsafe { let start = last.data.as_ptr().offset(range_start as isize); - StringRef{ data: start, length: data_len } + StringRef{ data: start, length: data_len, _phantom: PhantomData } } } diff --git a/src/protocol/arena.rs b/src/protocol/arena.rs index df2d5ce1cd2ce9d45e97f3d4260f835a6fe7009a..68dc8b5cd6f8ad83dbeef578b351f0d2055e557a 100644 --- a/src/protocol/arena.rs +++ b/src/protocol/arena.rs @@ -3,14 +3,16 @@ use core::hash::Hash; use core::marker::PhantomData; pub struct Id { - pub(crate) index: u32, + // Not actually a signed index into the heap. But the index is set to -1 if + // we don't know an ID yet. This is checked during debug mode. + pub(crate) index: i32, _phantom: PhantomData, } impl Id { - pub(crate) fn new(index: u32) -> Self { - Self{ index, _phantom: Default::default() } - } + pub(crate) fn new_invalid() -> Self { Self{ index: -1, _phantom: Default::default() } } + pub(crate) fn new(index: i32) -> Self { Self{ index, _phantom: Default::default() } } + pub(crate) fn is_invalid(&self) -> bool { self.index < 0 } } #[derive(Debug)] @@ -46,15 +48,19 @@ impl Arena { pub fn new() -> Self { Self { store: vec![] } } + pub fn alloc_with_id(&mut self, f: impl FnOnce(Id) -> T) -> Id { - use std::convert::TryFrom; - let id = Id::new(u32::try_from(self.store.len()).expect("Out of capacity!")); + // Lets keep this a runtime assert. 
+ assert!(self.store.len() < i32::max_value() as usize, "Arena out of capacity"); + let id = Id::new(self.store.len() as i32); self.store.push(f(id)); id } + pub fn iter(&self) -> impl Iterator { self.store.iter() } + pub fn len(&self) -> usize { self.store.len() } @@ -62,11 +68,13 @@ impl Arena { impl core::ops::Index> for Arena { type Output = T; fn index(&self, id: Id) -> &Self::Output { + debug_assert!(!id.is_invalid(), "attempted to index into Arena with an invalid id (index < 0)"); self.store.index(id.index as usize) } } impl core::ops::IndexMut> for Arena { fn index_mut(&mut self, id: Id) -> &mut Self::Output { + debug_assert!(!id.is_invalid(), "attempted to index_mut into Arena with an invalid id (index < 0)"); self.store.index_mut(id.index as usize) } } \ No newline at end of file diff --git a/src/protocol/ast.rs b/src/protocol/ast.rs index 2d42e0c68ffae8f1eb38fba278842d7de83d7a54..37c977fdd1de92cc52c1398305ff9553be0bc4a3 100644 --- a/src/protocol/ast.rs +++ b/src/protocol/ast.rs @@ -15,7 +15,6 @@ use crate::protocol::input_source2::{InputPosition2, InputSpan}; const MAX_LEVEL: usize = 128; const MAX_NAMESPACES: usize = 64; - /// Helper macro that defines a type alias for a AST element ID. In this case /// only used to alias the `Id` types. macro_rules! define_aliased_ast_id { @@ -66,9 +65,9 @@ macro_rules! 
define_new_ast_id { pub struct $name (pub(crate) $parent); impl $name { - pub fn upcast(self) -> $parent { - self.0 - } + pub(crate) fn new_invalid() -> Self { Self($parent::new_invalid()) } + pub(crate) fn is_invalid(&self) -> bool { self.0.is_invalid() } + pub fn upcast(self) -> $parent { self.0 } } }; // Variant where we define the type, and the Index and IndexMut traits @@ -127,11 +126,11 @@ define_new_ast_id!(ParameterId, VariableId, index(Parameter, Variable::Parameter define_new_ast_id!(LocalId, VariableId, index(Local, Variable::Local, variables), alloc(alloc_local)); define_aliased_ast_id!(DefinitionId, Id, index(Definition, definitions)); -define_new_ast_id!(StructId, DefinitionId, index(StructDefinition, Definition::Struct, definitions), alloc(alloc_struct_definition)); -define_new_ast_id!(EnumId, DefinitionId, index(EnumDefinition, Definition::Enum, definitions), alloc(alloc_enum_definition)); -define_new_ast_id!(UnionId, DefinitionId, index(UnionDefinition, Definition::Union, definitions), alloc(alloc_union_definition)); -define_new_ast_id!(ComponentId, DefinitionId, index(Component, Definition::Component, definitions), alloc(alloc_component)); -define_new_ast_id!(FunctionId, DefinitionId, index(Function, Definition::Function, definitions), alloc(alloc_function)); +define_new_ast_id!(StructDefinitionId, DefinitionId, index(StructDefinition, Definition::Struct, definitions), alloc(alloc_struct_definition)); +define_new_ast_id!(EnumDefinitionId, DefinitionId, index(EnumDefinition, Definition::Enum, definitions), alloc(alloc_enum_definition)); +define_new_ast_id!(UnionDefinitionId, DefinitionId, index(UnionDefinition, Definition::Union, definitions), alloc(alloc_union_definition)); +define_new_ast_id!(ComponentDefinitionId, DefinitionId, index(ComponentDefinition, Definition::Component, definitions), alloc(alloc_component_definition)); +define_new_ast_id!(FunctionDefinitionId, DefinitionId, index(FunctionDefinition, Definition::Function, definitions), 
alloc(alloc_function_definition)); define_aliased_ast_id!(StatementId, Id, index(Statement, statements)); define_new_ast_id!(BlockStatementId, StatementId, index(BlockStatement, Statement::Block, statements), alloc(alloc_block_statement)); @@ -328,11 +327,10 @@ impl SyntaxElement for Import { pub struct ImportModule { pub this: ImportId, // Phase 1: parser - pub position: InputPosition, - pub module_name: Vec, + pub span: InputSpan, + pub module_name: Identifier, pub alias: Identifier, - // Phase 2: module resolving - pub module_id: Option, + pub module_id: RootId, } #[derive(Debug, Clone)] @@ -349,7 +347,7 @@ pub struct AliasedSymbol { pub struct ImportSymbols { pub this: ImportId, // Phase 1: parser - pub position: InputPosition, + pub span: InputSpan, pub module_name: Vec, // Phase 2: module resolving pub module_id: Option, @@ -363,7 +361,7 @@ pub struct ImportSymbols { #[derive(Debug, Clone)] pub struct Identifier { pub span: InputSpan, - pub value: StringRef, + pub value: StringRef<'static>, } impl PartialEq for Identifier { @@ -965,8 +963,8 @@ pub enum Definition { Struct(StructDefinition), Enum(EnumDefinition), Union(UnionDefinition), - Component(Component), - Function(Function), + Component(ComponentDefinition), + Function(FunctionDefinition), } impl Definition { @@ -1012,7 +1010,7 @@ impl Definition { _ => false, } } - pub fn as_component(&self) -> &Component { + pub fn as_component(&self) -> &ComponentDefinition { match self { Definition::Component(result) => result, _ => panic!("Unable to cast `Definition` to `Component`"), @@ -1024,7 +1022,7 @@ impl Definition { _ => false, } } - pub fn as_function(&self) -> &Function { + pub fn as_function(&self) -> &FunctionDefinition { match self { Definition::Function(result) => result, _ => panic!("Unable to cast `Definition` to `Function`"), @@ -1094,14 +1092,20 @@ pub struct StructFieldDefinition { #[derive(Debug, Clone)] pub struct StructDefinition { - pub this: StructId, + pub this: StructDefinitionId, // Phase 
1: parser - pub position: InputPosition, + pub span: InputSpan, pub identifier: Identifier, pub poly_vars: Vec, pub fields: Vec } +impl StructDefinition { + pub(crate) fn new_empty(this: StructDefinitionId, span: InputSpan, identifier: Identifier) -> Self { + Self{ this, span, identifier, poly_vars: Vec::new(), fields: Vec::new() } + } +} + #[derive(Debug, Clone)] pub enum EnumVariantValue { None, @@ -1117,14 +1121,20 @@ pub struct EnumVariantDefinition { #[derive(Debug, Clone)] pub struct EnumDefinition { - pub this: EnumId, + pub this: EnumDefinitionId, // Phase 1: parser - pub position: InputPosition, + pub span: InputSpan, pub identifier: Identifier, pub poly_vars: Vec, pub variants: Vec, } +impl EnumDefinition { + pub(crate) fn new_empty(this: EnumDefinitionId, span: InputSpan, identifier: Identifier) -> Self { + Self{ this, span, identifier, poly_vars: Vec::new(), variants: Vec::new() } + } +} + #[derive(Debug, Clone)] pub enum UnionVariantValue { None, @@ -1140,14 +1150,20 @@ pub struct UnionVariantDefinition { #[derive(Debug, Clone)] pub struct UnionDefinition { - pub this: UnionId, + pub this: UnionDefinitionId, // Phase 1: parser - pub position: InputPosition, + pub span: InputSpan, pub identifier: Identifier, pub poly_vars: Vec, pub variants: Vec, } +impl UnionDefinition { + pub(crate) fn new_empty(this: UnionDefinitionId, span: InputSpan, identifier: Identifier) -> Self { + Self{ this, span, identifier, poly_vars: Vec::new(), variants: Vec::new() } + } +} + #[derive(Debug, Clone, Copy)] pub enum ComponentVariant { Primitive, @@ -1155,10 +1171,10 @@ pub enum ComponentVariant { } #[derive(Debug, Clone)] -pub struct Component { - pub this: ComponentId, +pub struct ComponentDefinition { + pub this: ComponentDefinitionId, // Phase 1: parser - pub position: InputPosition, + pub span: InputSpan, pub variant: ComponentVariant, pub identifier: Identifier, pub poly_vars: Vec, @@ -1166,17 +1182,28 @@ pub struct Component { pub body: StatementId, } -impl 
SyntaxElement for Component { +impl ComponentDefinition { + pub(crate) fn new_empty(this: ComponentDefinitionId, span: InputSpan, variant: ComponentVariant, identifier: Identifier) -> Self { + Self{ + this, span, variant, identifier, + poly_vars: Vec::new(), + parameters: Vec::new(), + body: StatementId::new_invalid() + } + } +} + +impl SyntaxElement for ComponentDefinition { fn position(&self) -> InputPosition { self.position } } #[derive(Debug, Clone)] -pub struct Function { - pub this: FunctionId, +pub struct FunctionDefinition { + pub this: FunctionDefinitionId, // Phase 1: parser - pub position: InputPosition, + pub span: InputSpan, pub return_type: ParserTypeId, pub identifier: Identifier, pub poly_vars: Vec, @@ -1184,7 +1211,19 @@ pub struct Function { pub body: StatementId, } -impl SyntaxElement for Function { +impl FunctionDefinition { + pub(crate) fn new_empty(this: FunctionDefinitionId, span: InputSpan, identifier: Identifier) -> Self { + Self { + this, span, identifier, + return_type: ParserTypeId::new_invalid(), + poly_vars: Vec::new(), + parameters: Vec::new(), + body: StatementId::new_invalid(), + } + } +} + +impl SyntaxElement for FunctionDefinition { fn position(&self) -> InputPosition { self.position } diff --git a/src/protocol/lexer.rs b/src/protocol/lexer.rs index 29ed1c92942d206de98b701953917367d665bd00..9642d69a11378626d2dfacc5f3bb509327c77541 100644 --- a/src/protocol/lexer.rs +++ b/src/protocol/lexer.rs @@ -2378,7 +2378,7 @@ impl Lexer<'_> { // Parse body let body = self.consume_block_statement(h)?; - Ok(h.alloc_component(|this| Component { + Ok(h.alloc_component(|this| ComponentDefinition { this, variant: ComponentVariant::Composite, position, @@ -2404,7 +2404,7 @@ impl Lexer<'_> { // Consume body let body = self.consume_block_statement(h)?; - Ok(h.alloc_component(|this| Component { + Ok(h.alloc_component(|this| ComponentDefinition { this, variant: ComponentVariant::Primitive, position, @@ -2430,7 +2430,7 @@ impl Lexer<'_> { // Consume body 
let body = self.consume_block_statement(h)?; - Ok(h.alloc_function(|this| Function { + Ok(h.alloc_function(|this| FunctionDefinition { this, position, return_type, diff --git a/src/protocol/lexer2.rs b/src/protocol/lexer2.rs index 1cbb06f354f4a7b846c6992ac58ba0cd033f80a6..52d6b62b3674fd8f2b1b682c25a686dd95e81b10 100644 --- a/src/protocol/lexer2.rs +++ b/src/protocol/lexer2.rs @@ -25,28 +25,13 @@ enum KeywordDefinition { Composite, } -impl KeywordDefinition { - fn as_symbol_class(&self) -> SymbolClass { - use KeywordDefinition as KD; - use SymbolClass as SC; - - match self { - KD::Struct => SC::Struct, - KD::Enum => SC::Enum, - KD::Union => SC::Union, - KD::Function => SC::Function, - KD::Primitive | KD::Composite => SC::Component, - } - } -} - struct Module { // Buffers source: InputSource, tokens: TokenBuffer, // Identifiers root_id: RootId, - name: Option<(PragmaId, StringRef)>, + name: Option<(PragmaId, StringRef<'static>)>, version: Option<(PragmaId, i64)>, phase: ModuleCompilationPhase, } @@ -61,19 +46,23 @@ struct Ctx<'a> { /// added to the symbol table such that during AST-construction we know which /// identifiers point to types. Will also parse all pragmas to determine module /// names. 
-pub(crate) struct ASTSymbolPrePass { +pub(crate) struct PassPreSymbol { symbols: Vec, pragmas: Vec, + imports: Vec, + definitions: Vec, buffer: String, has_pragma_version: bool, has_pragma_module: bool, } -impl ASTSymbolPrePass { +impl PassPreSymbol { pub(crate) fn new() -> Self { Self{ symbols: Vec::with_capacity(128), pragmas: Vec::with_capacity(8), + imports: Vec::with_capacity(32), + definitions: Vec::with_capacity(128), buffer: String::with_capacity(128), has_pragma_version: false, has_pragma_module: false, @@ -83,6 +72,8 @@ impl ASTSymbolPrePass { fn reset(&mut self) { self.symbols.clear(); self.pragmas.clear(); + self.imports.clear(); + self.definitions.clear(); self.has_pragma_version = false; self.has_pragma_module = false; } @@ -92,11 +83,10 @@ impl ASTSymbolPrePass { let module = &mut modules[module_idx]; let module_range = &module.tokens.ranges[0]; - let expected_parent_idx = 0; - let expected_subranges = module_range.subranges; + debug_assert_eq!(module.phase, ModuleCompilationPhase::Tokenized); debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); - debug_assert_eq!(module.root_id.index, 0); + debug_assert!(module.root_id.is_invalid()); // not set yet, // Preallocate root in the heap let root_id = ctx.heap.alloc_protocol_description(|this| { @@ -109,44 +99,45 @@ impl ASTSymbolPrePass { }); module.root_id = root_id; - // Visit token ranges to detect defintions - let mut visited_subranges = 0; - for range_idx in expected_parent_idx + 1..module.tokens.ranges.len() { - // Skip any ranges that do not belong to the module - let cur_range = &module.tokens.ranges[range_idx]; - if cur_range.parent_idx != expected_parent_idx { - continue; - } + // Visit token ranges to detect definitions and pragmas + let mut range_idx = module_range.first_child_idx; + loop { + let range_idx_usize = range_idx as usize; + let cur_range = &module.tokens.ranges[range_idx_usize]; // Parse if it is a definition or a pragma if cur_range.range_kind == 
TokenRangeKind::Definition { - self.visit_definition_range(modules, module_idx, ctx, range_idx)?; + self.visit_definition_range(modules, module_idx, ctx, range_idx_usize)?; } else if cur_range.range_kind == TokenRangeKind::Pragma { - self.visit_pragma_range(modules, module_idx, ctx, range_idx)?; + self.visit_pragma_range(modules, module_idx, ctx, range_idx_usize)?; } - visited_subranges += 1; - if visited_subranges == expected_subranges { - break; + match cur_range.next_sibling_idx { + Some(idx) => { range_idx = idx; }, + None => { break; }, } } - // By now all symbols should have been found: add to symbol table and - // add the parsed pragmas to the preallocated root in the heap. - debug_assert_eq!(visited_subranges, expected_subranges); - ctx.symbols.insert_scoped_symbols(None, SymbolScope::Module(module.root_id), &self.symbols); + // Add the module's symbol scope and the symbols we just parsed + let module_scope = SymbolScope::Module(root_id); + ctx.symbols.insert_scope(None, module_scope); + for symbol in self.symbols.drain(..) 
{ + if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol(module_scope, symbol) { + return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol)) + } + } + // Modify the preallocated root let root = &mut ctx.heap[root_id]; - debug_assert!(root.pragmas.is_empty()); - root.pragmas.extend(&self.pragmas); - + root.pragmas.extend(self.pragmas.drain(..)); + root.definitions.extend(self.definitions.drain(..)); module.phase = ModuleCompilationPhase::DefinitionsScanned; Ok(()) } - fn visit_pragma_range(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> { - let module = &mut modules[module_idx]; + fn visit_pragma_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> { + let module = &modules[module_idx]; let range = &module.tokens.ranges[range_idx]; let mut iter = module.tokens.iter_range(range); @@ -213,7 +204,7 @@ impl ASTSymbolPrePass { Ok(()) } - fn visit_definition_range(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> { + fn visit_definition_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> { let module = &modules[module_idx]; let range = &module.tokens.ranges[range_idx]; let definition_span = InputSpan::from_positions( @@ -222,60 +213,107 @@ impl ASTSymbolPrePass { ); let mut iter = module.tokens.iter_range(range); - // Because we're visiting a definition, we expect an ident that resolves - // to a keyword indicating a definition. 
- let kw_text = consume_ident_text(&module.source, &mut iter).unwrap(); + // First ident must be type of symbol + let (kw_text, _) = consume_any_ident(&module.source, &mut iter).unwrap(); let kw = parse_definition_keyword(kw_text).unwrap(); - // Retrieve identifier and put in temp symbol table - let definition_ident = consume_ident_text(&module.source, &mut iter)?; - let definition_ident = ctx.pool.intern(definition_ident); - let symbol_class = kw.as_symbol_class(); - - // Get the token indicating the end of the definition to get the full - // span of the definition - let last_token = &module.tokens.tokens[range.end - 1]; - debug_assert_eq!(last_token.kind, TokenKind::CloseCurly); + // Retrieve identifier of definition + let (identifier_text, identifier_span) = consume_ident(&module.source, &mut iter)?; + let ident_text = ctx.pool.intern(identifier_text); + let identifier = Identifier{ span: identifier_span, value: ident_text }; + + // Reserve space in AST for definition and add it to the symbol table + let symbol_definition; + let ast_definition_id; + match kw { + KeywordDefinition::Struct => { + let struct_def_id = ctx.heap.alloc_struct_definition(|this| { + StructDefinition::new_empty(this, definition_span, identifier) + }); + symbol_definition = SymbolDefinition::Struct(struct_def_id); + ast_definition_id = struct_def_id.upcast(); + }, + KeywordDefinition::Enum => { + let enum_def_id = ctx.heap.alloc_enum_definition(|this| { + EnumDefinition::new_empty(this, definition_span, identifier) + }); + symbol_definition = SymbolDefinition::Enum(enum_def_id); + ast_definition_id = enum_def_id.upcast(); + }, + KeywordDefinition::Union => { + let union_def_id = ctx.heap.alloc_union_definition(|this| { + UnionDefinition::new_empty(this, definition_span, identifier) + }); + symbol_definition = SymbolDefinition::Union(union_def_id); + ast_definition_id = union_def_id.upcast() + }, + KeywordDefinition::Function => { + let func_def_id = 
ctx.heap.alloc_function_definition(|this| { + FunctionDefinition::new_empty(this, definition_span, identifier) + }); + symbol_definition = SymbolDefinition::Function(func_def_id); + ast_definition_id = func_def_id.upcast(); + }, + KeywordDefinition::Primitive | KeywordDefinition::Composite => { + let component_variant = if kw == KeywordDefinition::Primitive { + ComponentVariant::Primitive + } else { + ComponentVariant::Composite + }; + let comp_def_id = ctx.heap.alloc_component_definition(|this| { + ComponentDefinition::new_empty(this, definition_span, component_variant, identifier) + }); + symbol_definition = SymbolDefinition::Component(comp_def_id); + ast_definition_id = comp_def_id.upcast(); + } + } - self.symbols.push(Symbol::new( - module.root_id, - SymbolScope::Module(module.root_id), + let symbol = Symbol{ + defined_in_module: module.root_id, + defined_in_scope: SymbolScope::Module(module.root_id), definition_span, - symbol_class, - definition_ident - )); + identifier_span, + introduced_at: None, + name: ctx.pool.intern(identifier_text), + definition: symbol_definition + }; + self.symbols.push(symbol); + self.definitions.push(ast_definition_id); Ok(()) } } -pub(crate) struct ASTImportPrePass { +/// Parses all the imports in the module tokens. Is applied after the +/// definitions and name of modules are resolved. Hence we should be able to +/// resolve all symbols to their appropriate module/definition.
+pub(crate) struct PassImport { + imports: Vec, } -impl ASTImportPrePass { +impl PassImport { + pub(crate) fn new() -> Self { + Self{ imports: Vec::with_capacity(32) } + } pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> { let module = &modules[module_idx]; let module_range = &module.tokens.ranges[0]; + debug_assert!(modules.iter().all(|m| m.phase >= ModuleCompilationPhase::DefinitionsScanned)); debug_assert_eq!(module.phase, ModuleCompilationPhase::DefinitionsScanned); debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); - let expected_parent_idx = 0; - let expected_subranges = module_range.subranges; - let mut visited_subranges = 0; + let mut range_idx = module_range.first_child_idx; + loop { + let range_idx_usize = range_idx as usize; + let cur_range = &module.tokens.ranges[range_idx_usize]; - for range_idx in expected_parent_idx + 1..module.tokens.ranges.len() { - let cur_range = &module.tokens.ranges[range_idx]; - if cur_range.parent_idx != expected_parent_idx { - continue; - } - - visited_subranges += 1; if cur_range.range_kind == TokenRangeKind::Import { - self.visit_import_range(modules, module_idx, ctx, range_idx)?; + self.visit_import_range(modules, module_idx, ctx, range_idx_usize)?; } - if visited_subranges == expected_subranges { - break; + match cur_range.next_sibling_idx { + Some(idx) => { range_idx = idx; }, + None => { break; } } } @@ -292,26 +330,81 @@ impl ASTImportPrePass { let mut iter = module.tokens.iter_range(import_range); // Consume "import" - let _import_ident = consume_ident_text(&module.source, &mut iter)?; + let (_import_ident, import_span) = + consume_ident(&module.source, &mut iter)?; debug_assert_eq!(_import_ident, KW_IMPORT); // Consume module name - let (module_name, _) = consume_domain_ident(&module.source, &mut iter)?; + let (module_name, module_name_span) = consume_domain_ident(&module.source, &mut iter)?; + let target_root_id = 
ctx.symbols.get_module_by_name(module_name); + if target_root_id.is_none() { + return Err(ParseError::new_error_at_span( + &module.source, module_name_span, + format!("could not resolve module '{}'", String::from_utf8_lossy(module_name)) + )); + } + let module_name = ctx.pool.intern(module_name); + let target_root_id = target_root_id.unwrap(); + + // Check for subsequent characters + let next = iter.next(); + if has_ident(&module.source, &mut iter, b"as") { + iter.consume(); + let (alias_text, alias_span) = consume_ident(source, &mut iter)?; + let alias = ctx.pool.intern(alias_text); + let import_id = ctx.heap.alloc_import(|this| Import::Module(ImportModule{ + this, + span: import_span, + module_name: Identifier{ span: module_name_span, value: module_name }, + alias: Identifier{ span: alias_span, value: alias }, + module_id: target_root_id + })); + ctx.symbols.insert_symbol(SymbolScope::Module(module.root_id), Symbol{ + defined_in_module: target_root_id, + defined_in_scope: SymbolScope::Module(target_root_id), + definition_span + }) + } else if Some(TokenKind::ColonColon) == next { + iter.consume(); + } else { + // Assume implicit alias, then check if we get the semicolon next + let module_name_str = module_name.as_str(); + let last_ident_start = module_name_str.rfind('.').map_or(0, |v| v + 1); + let alias_text = &module_name_str.as_bytes()[last_ident_start..]; + let alias = ctx.pool.intern(alias_text); + let alias_span = InputSpan::from_positions( + module_name_span.begin.with_offset(last_ident_start as u32), + module_name_span.end + ); + } Ok(()) } } fn consume_domain_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> { - let (_, name_start, mut name_end) = consume_ident(source, iter)?; + let (_, mut span) = consume_ident(source, iter)?; while let Some(TokenKind::Dot) = iter.next() { consume_dot(source, iter)?; - let (_, _, new_end) = consume_ident(source, iter)?; - name_end = new_end; + let (_, new_span) = 
consume_ident(source, iter)?; + span.end = new_span.end; } - Ok((source.section(name_start, name_end), InputSpan::from_positions(name_start, name_end))) + // Not strictly necessary, but probably a reasonable restriction: this + // simplifies parsing of module naming and imports. + if span.begin.line != span.end.line { + return Err(ParseError::new_error_str_at_span(source, span, "module names may not span multiple lines")); + } + + // If module name consists of a single identifier, then it may not match any + // of the reserved keywords + let section = source.section(span.begin, span.end); + if is_reserved_keyword(section) { + return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword")); + } + + Ok((source.section(span.begin, span.end), span)) } fn consume_dot<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(), ParseError> { @@ -387,22 +480,126 @@ fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<( Ok((source.section(pragma_start, pragma_end), pragma_start, pragma_end)) } -fn consume_ident_text<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<&'a [u8], ParseError> { - if Some(TokenKind::Ident) != iter.next() { - return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected an identifier")); +fn has_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> bool { + if Some(TokenKind::Ident) == iter.next() { + let (start, end) = iter.next_range(); + return source.section(start, end) == expected; } - let (ident_start, ident_end) = iter.next_range(); - iter.consume(); - Ok(source.section(ident_start, ident_end)) + + false +} + +fn peek_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Option<&'a [u8]> { + if Some(TokenKind::Ident) == iter.next() { + let (start, end) = iter.next_range(); + return Some(source.section(start, end)) + } + + None } -fn consume_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, 
InputPosition), ParseError> { +/// Consumes any identifier and returns it together with its span. Does not +/// check if the identifier is a reserved keyword. +fn consume_any_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> { if Some(TokenKind::Ident) != iter.next() { return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected an identifier")); } let (ident_start, ident_end) = iter.next_range(); iter.consume(); - Ok((source.section(ident_start, ident_end), ident_start, ident_end)) + Ok((source.section(ident_start, ident_end), InputSpan::from_positions(ident_start, ident_end))) +} + +/// Consumes an identifier that is not a reserved keyword and returns it +/// together with its span. +fn consume_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> { + let (ident, span) = consume_any_ident(source, iter)?; + if is_reserved_keyword(ident) { + return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword")); + } + + Ok((ident, span)) +} + +fn is_reserved_definition_keyword(text: &[u8]) -> bool { + return ([ + b"struct", b"enum", b"union", b"function", b"primitive" + ] as &[[u8]]).contains(text) +} + +fn is_reserved_statement_keyword(text: &[u8]) -> bool { + return ([ + b"channel", b"import", b"as", + b"if", b"while", b"break", b"continue", b"goto", b"return", + b"synchronous", b"assert", b"new", + ] as &[[u8]]).contains(text) +} + +fn is_reserved_expression_keyword(text: &[u8]) -> bool { + return ([ + b"let", b"true", b"false", b"null", // literals + b"get", b"put", b"fires", b"create", b"length", // functions + ] as &[[u8]]).contains(text) +} + +fn is_reserved_type_keyword(text: &[u8]) -> bool { + return ([ + b"in", b"out", b"msg", + b"bool", + b"u8", b"u16", b"u32", b"u64", + b"s8", b"s16", b"s32", b"s64", + b"auto" + ] as &[[u8]]).contains(text) +} + +fn is_reserved_keyword(text: &[u8]) -> bool { + return
is_reserved_definition_keyword(text) || + is_reserved_statement_keyword(text) || + is_reserved_type_keyword(text); +} + +/// Constructs a human-readable message indicating why there is a conflict of +/// symbols. +// Note: passing the `module_idx` is not strictly necessary, but will prevent +// programmer mistakes during development: we get a conflict because we're +// currently parsing a particular module. +fn construct_symbol_conflict_error(modules: &[Module], module_idx: usize, ctx: &Ctx, new_symbol: &Symbol, old_symbol: &Symbol) -> ParseError { + let module = &modules[module_idx]; + let get_symbol_span_and_msg = |symbol: &Symbol| -> (String, InputSpan) { + match symbol.introduced_at { + Some(import_id) => { + // Symbol is being imported + let import = &ctx.heap[import_id]; + match import { + Import::Module(import) => ( + format!("the module aliased as '{}' imported here", symbol.name.as_str()), + import.span + ), + Import::Symbols(symbols) => ( + format!("the type '{}' imported here", symbol.name.as_str()), + symbols.span + ), + } + }, + None => { + // Symbol is being defined + debug_assert_eq!(symbol.defined_in_module, module.root_id); + debug_assert_ne!(symbol.definition.symbol_class(), SymbolClass::Module); + ( + format!("the type '{}' defined here", symbol.name.as_str()), + symbol.identifier_span + ) + } + } + }; + + let (new_symbol_msg, new_symbol_span) = get_symbol_span_and_msg(new_symbol); + let (old_symbol_msg, old_symbol_span) = get_symbol_span_and_msg(old_symbol); + return ParseError::new_error_at_span( + &module.source, new_symbol_span, format!("symbol is defined twice: {}", new_symbol_msg) + ).with_info_at_span( + &module.source, old_symbol_span, format!("it conflicts with {}", old_symbol_msg) + ) } fn parse_definition_keyword(keyword: &[u8]) -> Option { diff --git a/src/protocol/parser/symbol_table2.rs b/src/protocol/parser/symbol_table2.rs index fecb9d079df26a592365855742a6eecb355f8620..f2f6839a73bd6eedb2c0390b57140d2006a52598 100644 --- 
a/src/protocol/parser/symbol_table2.rs +++ b/src/protocol/parser/symbol_table2.rs @@ -1,3 +1,12 @@ +/// symbol_table.rs +/// +/// The datastructure used to lookup symbols within particular scopes. Scopes +/// may be module-level or definition level, although imports and definitions +/// within definitions are currently not allowed. +/// +/// TODO: Once the compiler has matured, find out ways to optimize to prevent +/// the repeated HashMap lookup. + use std::collections::HashMap; use std::collections::hash_map::Entry; @@ -5,7 +14,9 @@ use crate::protocol::input_source2::*; use crate::protocol::ast::*; use crate::collections::*; -#[derive(Clone, Copy, PartialEq, Eq)] +const RESERVED_SYMBOLS: usize = 32; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum SymbolScope { Module(RootId), Definition(DefinitionId), @@ -25,46 +36,61 @@ struct ScopedSymbols { scope: SymbolScope, parent_scope: Option, child_scopes: Vec, - start: usize, - end: usize, + symbols: Vec, +} + +pub enum SymbolDefinition { + Module(RootId), + Struct(StructDefinitionId), + Enum(EnumDefinitionId), + Union(UnionDefinitionId), + Function(FunctionDefinitionId), + Component(ComponentDefinitionId), +} + +impl SymbolDefinition { + pub fn symbol_class(&self) -> SymbolClass { + use SymbolDefinition as SD; + use SymbolClass as SC; + + match self { + SD::Module(_) => SC::Module, + SD::Struct(_) => SC::Struct, + SD::Enum(_) => SC::Enum, + SD::Union(_) => SC::Union, + SD::Function(_) => SC::Function, + SD::Component(_) => SC::Component, + } + } +} + +pub enum SymbolData { + } pub struct Symbol { - // Definition location + // Definition location (may be different from the scope/module in which it + // is used if the symbol is imported) pub defined_in_module: RootId, pub defined_in_scope: SymbolScope, - pub definition_span: InputSpan, // full span of definition + pub definition_span: InputSpan, // full span of definition, not just the name + pub identifier_span: InputSpan, // span of just the identifier // 
Introduction location (if imported instead of defined) - + pub introduced_at: Option, // Symbol properties - pub class: SymbolClass, - pub name: StringRef, - pub definition: Option, -} - -impl Symbol { - pub(crate) fn new(root_id: RootId, scope: SymbolScope, span: InputSpan, class: SymbolClass, name: StringRef) -> Self { - Self{ - defined_in_module: root_id, - defined_in_scope: scope, - definition_span: span, - class, - name, - definition: None, - } - } + pub name: StringRef<'static>, + pub definition: SymbolDefinition, } pub struct SymbolTable { - module_lookup: HashMap, + module_lookup: HashMap, RootId>, scope_lookup: HashMap, - symbols: Vec, } impl SymbolTable { /// Inserts a new module by its name. Upon module naming conflict the /// previously associated `RootId` will be returned. - pub(crate) fn insert_module(&mut self, module_name: StringRef, root_id: RootId) -> Result<(), RootId> { + pub(crate) fn insert_module(&mut self, module_name: StringRef<'static>, root_id: RootId) -> Result<(), RootId> { match self.module_lookup.entry(module_name) { Entry::Occupied(v) => { Err(*v.get()) @@ -76,32 +102,67 @@ impl SymbolTable { } } - /// Inserts a new scope with defined symbols. The `parent_scope` must - /// already be added to the symbol table. The symbols are expected to come - /// from a temporary buffer and are copied inside the symbol table. Will - /// return an error if there is a naming conflict. - pub(crate) fn insert_scoped_symbols( - &mut self, parent_scope: Option, within_scope: SymbolScope, symbols: &[Symbol] - ) -> Result<(), ParseError> { - // Add scoped symbols - let old_num_symbols = self.symbols.len(); - - let new_scope = ScopedSymbols { - scope: within_scope, + /// Retrieves module `RootId` by name + pub(crate) fn get_module_by_name(&mut self, name: &[u8]) -> Option { + let string_ref = StringRef::new(name); + self.module_lookup.get(&string_ref).map(|v| *v) + } + + /// Inserts a new symbol scope. 
The parent must have been added to the + /// symbol table before. + pub(crate) fn insert_scope(&mut self, parent_scope: Option, new_scope: SymbolScope) { + debug_assert!( + parent_scope.is_none() || self.scope_lookup.contains_key(parent_scope.as_ref().unwrap()), + "inserting scope {:?} but parent {:?} does not exist", new_scope, parent_scope + ); + debug_assert!(!self.scope_lookup.contains_key(&new_scope), "inserting scope {:?}, but it already exists", new_scope); + + if let Some(parent_scope) = parent_scope { + let parent = self.scope_lookup.get_mut(&parent_scope).unwrap(); + parent.child_scopes.push(new_scope); + } + + let scope = ScopedSymbols { + scope: new_scope, parent_scope, - child_scopes: Vec::new(), - start: old_num_symbols, - end: old_num_symbols + symbols.len(), + child_scopes: Vec::with_capacity(RESERVED_SYMBOLS), + symbols: Vec::with_capacity(RESERVED_SYMBOLS) }; + self.scope_lookup.insert(new_scope, scope); + } - self.symbols.extend(symbols); - self.scope_lookup.insert(within_scope, new_scope); + /// Inserts a symbol into a particular scope. The symbol's name may not + /// exist in the scope or any of its parents. If it does collide then the + /// symbol will be returned, together with the symbol that has the same + /// name. 
+ pub(crate) fn insert_symbol(&mut self, in_scope: SymbolScope, symbol: Symbol) -> Result<(), (Symbol, &Symbol)> { + debug_assert!(self.scope_lookup.contains_key(&in_scope), "inserting symbol {}, but scope {:?} does not exist", symbol.name.as_str(), in_scope); + let mut seek_scope = in_scope; + loop { + let scoped_symbols = self.scope_lookup.get(&seek_scope).unwrap(); + for existing_symbol in scoped_symbols.symbols.iter() { + if symbol.name == existing_symbol.name { + return Err((symbol, existing_symbol)) + } + } - if let Some(parent_scope) = parent_scope.as_ref() { - let parent = self.scope_lookup.get_mut(parent_scope).unwrap(); - parent.child_scopes.push(within_scope); + match scoped_symbols.parent_scope { + Some(parent_scope) => { seek_scope = parent_scope; }, + None => { break; } + } } + // If here, then there is no collision + let scoped_symbols = self.scope_lookup.get_mut(&in_scope).unwrap(); + scoped_symbols.symbols.push(symbol); Ok(()) } + + /// Retrieves a particular scope. As this will be called by the compiler to + /// retrieve scopes that MUST exist, this function will panic if the + /// indicated scope does not exist. 
+ pub(crate) fn get_scope_by_id(&mut self, scope: &SymbolScope) -> &mut ScopedSymbols { + debug_assert!(self.scope_lookup.contains_key(scope), "retrieving scope {:?}, but it doesn't exist", scope); + self.scope_lookup.get_mut(scope).unwrap() + } } \ No newline at end of file diff --git a/src/protocol/tests/utils.rs b/src/protocol/tests/utils.rs index 248844a38e89aa14307369f43b870804a1610c50..ece23a2f6126c52d67029237c90612ac3d21ddfb 100644 --- a/src/protocol/tests/utils.rs +++ b/src/protocol/tests/utils.rs @@ -450,11 +450,11 @@ impl<'a> UnionTester<'a> { pub(crate) struct FunctionTester<'a> { ctx: TestCtx<'a>, - def: &'a Function, + def: &'a FunctionDefinition, } impl<'a> FunctionTester<'a> { - fn new(ctx: TestCtx<'a>, def: &'a Function) -> Self { + fn new(ctx: TestCtx<'a>, def: &'a FunctionDefinition) -> Self { Self{ ctx, def } } diff --git a/src/protocol/tokenizer/mod.rs b/src/protocol/tokenizer/mod.rs index ebd4ee5555bd52674797aebbdc998b35c0d3e5d9..096cd3e853ea3e20b6c6116c2e1d23e6bfb6201e 100644 --- a/src/protocol/tokenizer/mod.rs +++ b/src/protocol/tokenizer/mod.rs @@ -8,7 +8,7 @@ use crate::protocol::input_source2::{ pub(crate) const KW_STRUCT: &'static [u8] = b"struct"; pub(crate) const KW_ENUM: &'static [u8] = b"enum"; pub(crate) const KW_UNION: &'static [u8] = b"union"; -pub(crate) const KW_FUNCTION: &'static [u8] = b"func"; +pub(crate) const KW_FUNCTION: &'static [u8] = b"function"; pub(crate) const KW_PRIMITIVE: &'static [u8] = b"primitive"; pub(crate) const KW_COMPOSITE: &'static [u8] = b"composite"; pub(crate) const KW_IMPORT: &'static [u8] = b"import"; @@ -127,9 +127,13 @@ pub(crate) struct TokenRange { pub range_kind: TokenRangeKind, pub curly_depth: u32, // InputPosition offset is limited to u32, so token ranges can be as well. 
- pub start: u32, - pub end: u32, - pub subranges: u32, + pub start: u32, // first token (inclusive index) + pub end: u32, // last token (exclusive index) + // Subranges and where they can be found + pub subranges: u32, // Number of subranges + pub first_child_idx: u32, // First subrange (or points to self if no subranges) + pub last_child_idx: u32, // Last subrange (or points to self if no subranges) + pub next_sibling_idx: Option, } pub(crate) struct TokenBuffer { @@ -280,6 +284,9 @@ impl Tokenizer { start: 0, end: 0, subranges: 0, + first_child_idx: 0, + last_child_idx: 0, + next_sibling_idx: None, }); // Main tokenization loop @@ -367,6 +374,35 @@ impl Tokenizer { )); } + // TODO: @remove once I'm sure the algorithm works. For now it is better + // if the debugging is a little more expedient + if cfg!(debug_assertions) { + // For each range make sure its children make sense + for parent_idx in 0..target.ranges.len() { + let cur_range = &target.ranges[parent_idx]; + if cur_range.subranges == 0 { + assert_eq!(cur_range.first_child_idx, parent_idx as u32); + assert_eq!(cur_range.last_child_idx, parent_idx as u32); + } else { + assert_ne!(cur_range.first_child_idx, parent_idx as u32); + assert_ne!(cur_range.last_child_idx, parent_idx as u32); + + let mut child_counter = 0u32; + let mut last_child_idx = cur_range.first_child_idx; + let mut child_idx = Some(cur_range.first_child_idx); + while let Some(cur_child_idx) = child_idx { + let child_range = &target.ranges[cur_child_idx as usize]; + assert_eq!(child_range.parent_idx, parent_idx); + child_idx = child_range.next_sibling_idx; + child_counter += 1; + } + + assert_eq!(cur_range.last_child_idx, last_child_idx); + assert_eq!(cur_range.subranges, child_counter); + } + } + } + Ok(()) } @@ -820,8 +856,16 @@ impl Tokenizer { // intermediate "code" range. 
if cur_range.end != first_token { let code_start = cur_range.end; + let code_range_idx = target.ranges.len() as u32; + + if cur_range.first_child_idx == self.stack_idx as u32 { + // The parent of the new "code" range we're going to push does + // not have any registered children yet. + cur_range.first_child_idx = code_range_idx; + } + cur_range.last_child_idx = code_range_idx + 1; + cur_range.end = first_token; - debug_assert_ne!(code_start, first_token); cur_range.subranges += 1; target.ranges.push(TokenRange{ parent_idx: self.stack_idx, @@ -830,12 +874,25 @@ impl Tokenizer { start: code_start, end: first_token, subranges: 0, + first_child_idx: code_range_idx, + last_child_idx: code_range_idx, + next_sibling_idx: Some(code_range_idx + 1), // we're going to push this thing next }); + } else { + // We're going to push the range in the code below, but while we + // have the `cur_range` borrowed mutably, we fix up its children + // indices. + let new_range_idx = target.ranges.len() as u32; + if cur_range.first_child_idx == self.stack_idx as u32 { + cur_range.first_child_idx = new_range_idx; + } + cur_range.last_child_idx = new_range_idx; } // Insert a new range let parent_idx = self.stack_idx; self.stack_idx = target.ranges.len(); + let new_range_idx = self.stack_idx as u32; target.ranges.push(TokenRange{ parent_idx, range_kind, @@ -843,6 +900,9 @@ impl Tokenizer { start: first_token, end: first_token, subranges: 0, + first_child_idx: new_range_idx, + last_child_idx: new_range_idx, + next_sibling_idx: None, }); }