From ddddcd3cc9aab55e98e7b671c326625501aa07e9 2021-04-18 10:47:33 From: MH Date: 2021-04-18 10:47:33 Subject: [PATCH] WIP on compiler rearchitecting --- diff --git a/src/collections/string_pool.rs b/src/collections/string_pool.rs index 54505dba6aa01de3f3c4567a3e23d362c6226e5c..91de3c466cf5cfedb0efcb9c6194897c51bdad6a 100644 --- a/src/collections/string_pool.rs +++ b/src/collections/string_pool.rs @@ -17,7 +17,7 @@ impl<'a> StringRef<'a> { /// `StringPool`, hence cannot have a `'static` lifetime. pub(crate) fn new(data: &'a [u8]) -> StringRef<'a> { // This is an internal (compiler) function: so debug_assert that the - // string is valid UTF8. Most commonly the input will come from the + // string is valid ASCII. Most commonly the input will come from the // code's source file, which is checked for ASCII-ness anyway. debug_assert!(data.is_ascii()); let length = data.len(); @@ -31,6 +31,12 @@ impl<'a> StringRef<'a> { std::str::from_utf8_unchecked(slice) } } + + pub fn as_bytes(&self) -> &'a [u8] { + unsafe { + std::slice::from_raw_parts::<'a, u8>(self.data, self.length) + } + } } impl PartialEq for StringRef { @@ -79,7 +85,10 @@ impl StringPool { } } - // Interns a string to the StringPool, returning a reference to it + /// Interns a string to the `StringPool`, returning a reference to it. The + /// pointer owned by `StringRef` is `'static` as the `StringPool` doesn't + /// reallocate/deallocate until dropped (which only happens at the end of + /// the program.) pub(crate) fn intern(&mut self, data: &[u8]) -> StringRef<'static> { // TODO: Large string allocations, if ever needed. let data_len = data.len(); diff --git a/src/protocol/ast.rs b/src/protocol/ast.rs index 3a07106e8d89de2e295b473c3ebbc6f723135a9c..05d78f666bb6b336eee0c943a2bca7e9467d546e 100644 --- a/src/protocol/ast.rs +++ b/src/protocol/ast.rs @@ -176,7 +176,6 @@ pub struct Heap { // Contents of a file, these are the elements the `Root` elements refer to pragmas: Arena<Pragma>, pub(crate) imports: Arena<Import>, - identifiers: Arena<Identifier>, pub(crate) parser_types: Arena<ParserType>, pub(crate) variables: Arena<Variable>, pub(crate) definitions: Arena<Definition>, @@ -191,7 +190,6 @@ impl Heap { protocol_descriptions: Arena::new(), pragmas: Arena::new(), imports: Arena::new(), - identifiers: Arena::new(), parser_types: Arena::new(), variables: Arena::new(), definitions: Arena::new(), @@ -303,13 +301,19 @@ impl Import { pub(crate) fn as_module(&self) -> &ImportModule { match self { Import::Module(m) => m, - _ => panic!("Unable to cast 'Import' to 'ImportModule'") + _ => unreachable!("Unable to cast 'Import' to 'ImportModule'") } } pub(crate) fn as_symbols(&self) -> &ImportSymbols { match self { Import::Symbols(m) => m, - _ => panic!("Unable to cast 'Import' to 'ImportSymbols'") + _ => unreachable!("Unable to cast 'Import' to 'ImportSymbols'") + } + } + pub(crate) fn as_symbols_mut(&mut self) -> &mut ImportSymbols { + match self { + Import::Symbols(m) => m, + _ => unreachable!("Unable to cast 'Import' to 'ImportSymbols'") } } } @@ -328,17 +332,15 @@ pub struct ImportModule { pub this: ImportId, // Phase 1: parser pub span: InputSpan, - pub module_name: Identifier, + pub module: Identifier, pub alias: Identifier, pub module_id: RootId, } #[derive(Debug, Clone)] pub struct AliasedSymbol { - // Phase 1: parser - pub position: InputPosition, pub name: Identifier, - pub alias: Identifier, + pub alias: Option<Identifier>, pub definition_id: DefinitionId, } @@ -347,8 +349,8 @@ pub struct ImportSymbols { pub this: ImportId, // Phase 1: parser pub span: InputSpan, - pub module_name: Vec<u8>, - pub module_id: Option<RootId>,
+ pub module: Identifier, + pub module_id: RootId, pub symbols: Vec<AliasedSymbol>, } @@ -604,11 +606,9 @@ pub enum ParserTypeVariant { // Basic builtin Message, Bool, - Byte, - Short, - Int, - Long, - String, + UInt8, UInt16, UInt32, UInt64, + SInt8, SInt16, SInt32, SInt64, + Character, String, // Literals (need to get concrete builtin type during typechecking) IntegerLiteral, Inferred, @@ -1079,7 +1079,6 @@ impl VariableScope for Definition { #[derive(Debug, Clone)] pub struct StructFieldDefinition { - pub position: InputPosition, pub field: Identifier, pub parser_type: ParserTypeId, } diff --git a/src/protocol/ast_printer.rs b/src/protocol/ast_printer.rs index dd338abccd1d9a1dcaddf15254327f9f8320b7b9..ddb1fa2942d742b89be6198f1962e30cee85809d 100644 --- a/src/protocol/ast_printer.rs +++ b/src/protocol/ast_printer.rs @@ -238,7 +238,7 @@ impl ASTWriter { self.kv(indent).with_id(PREFIX_IMPORT_ID, import.this.index) .with_s_key("ImportModule"); - self.kv(indent2).with_s_key("Name").with_ascii_val(&import.module_name); + self.kv(indent2).with_s_key("Name").with_ascii_val(&import.module); self.kv(indent2).with_s_key("Alias").with_ascii_val(&import.alias.value); self.kv(indent2).with_s_key("Target") .with_opt_disp_val(import.module_id.as_ref().map(|v| &v.index)); @@ -247,7 +247,7 @@ impl ASTWriter { self.kv(indent).with_id(PREFIX_IMPORT_ID, import.this.index) .with_s_key("ImportSymbol"); - self.kv(indent2).with_s_key("Name").with_ascii_val(&import.module_name); + self.kv(indent2).with_s_key("Name").with_ascii_val(&import.module); self.kv(indent2).with_s_key("Target") .with_opt_disp_val(import.module_id.as_ref().map(|v| &v.index)); diff --git a/src/protocol/input_source2.rs b/src/protocol/input_source2.rs index f2be5f9812a61dc4a6485c25c288b354cdcd47ca..ca14cb91c4ca6c00e130029c16f926d451862731 100644 --- a/src/protocol/input_source2.rs +++ b/src/protocol/input_source2.rs @@ -83,10 +83,16 @@ impl InputSource2 { } } - pub fn section(&self, start: InputPosition2, end: InputPosition2) -> &[u8] { + #[inline] + pub fn section_at_pos(&self, start: InputPosition2, end: InputPosition2) -> &[u8] { &self.input[start.offset as usize..end.offset as usize] } + #[inline] + pub fn section_at_span(&self, span: InputSpan) -> &[u8] { + &self.input[span.begin.offset as usize..span.end.offset as usize] + } + // Consumes the next character. Will check well-formedness of newlines: \r // must be followed by a \n, because this is used for error reporting. Will // not check for ascii-ness of the file, better left to a tokenizer.
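A note on the two `section` helpers added above: they are how later passes recover source text for a token range, and `section_at_span(span)` is simply shorthand for `section_at_pos(span.begin, span.end)`. A minimal sketch of that relationship, not part of the patch, using simplified stand-ins for `InputSource2`, `InputPosition2` and `InputSpan` (the real types also carry line information):

// Simplified stand-ins for the real position/span types.
#[derive(Clone, Copy)]
struct Pos { offset: u32 }
#[derive(Clone, Copy)]
struct Span { begin: Pos, end: Pos }

struct Source { input: Vec<u8> }

impl Source {
    // Slice of the raw input between two positions.
    fn section_at_pos(&self, start: Pos, end: Pos) -> &[u8] {
        &self.input[start.offset as usize..end.offset as usize]
    }
    // Same lookup, driven by a span instead of two positions.
    fn section_at_span(&self, span: Span) -> &[u8] {
        self.section_at_pos(span.begin, span.end)
    }
}

fn main() {
    let source = Source{ input: b"import std.io;".to_vec() };
    let span = Span{ begin: Pos{ offset: 7 }, end: Pos{ offset: 13 } };
    assert_eq!(source.section_at_span(span), b"std.io");
}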
diff --git a/src/protocol/lexer.rs b/src/protocol/lexer.rs index 9642d69a11378626d2dfacc5f3bb509327c77541..4b8af9794cce90972f97907da61a9c9749bb52ca 100644 --- a/src/protocol/lexer.rs +++ b/src/protocol/lexer.rs @@ -2526,7 +2526,7 @@ impl Lexer<'_> { h.alloc_import(|this| Import::Module(ImportModule{ this, position, - module_name: value, + module: value, alias, module_id: None, })) @@ -2575,7 +2575,7 @@ impl Lexer<'_> { h.alloc_import(|this| Import::Symbols(ImportSymbols{ this, position, - module_name: value, + module: value, module_id: None, symbols, })) @@ -2584,7 +2584,7 @@ impl Lexer<'_> { h.alloc_import(|this| Import::Symbols(ImportSymbols{ this, position, - module_name: value, + module: value, module_id: None, symbols: Vec::new() })) @@ -2603,7 +2603,7 @@ impl Lexer<'_> { h.alloc_import(|this| Import::Symbols(ImportSymbols{ this, position, - module_name: value, + module: value, module_id: None, symbols: vec![AliasedSymbol{ position, @@ -2621,7 +2621,7 @@ impl Lexer<'_> { h.alloc_import(|this| Import::Module(ImportModule{ this, position, - module_name: value, + module: value, alias: Identifier{ position: last_ident_pos, value: Vec::from(alias_value), diff --git a/src/protocol/lexer2.rs b/src/protocol/lexer2.rs deleted file mode 100644 index 4ccf5b8e8601ef99d96b2b8acb77d5ed5304ef08..0000000000000000000000000000000000000000 --- a/src/protocol/lexer2.rs +++ /dev/null @@ -1,664 +0,0 @@ -use crate::protocol::ast::*; -use crate::protocol::Heap; -use crate::collections::{StringPool, StringRef}; -use crate::protocol::tokenizer::*; -use crate::protocol::input_source2::{InputSource2 as InputSource, InputPosition2 as InputPosition, InputSpan, ParseError}; -use crate::protocol::symbol_table2::*; - -#[derive(PartialEq, Eq)] -enum ModuleCompilationPhase { - Source, // only source is set - Tokenized, // source is tokenized - DefinitionsScanned, // all definitions are linked to their type class - ImportsResolved, // all imports are added to the symbol table - Parsed, // produced the AST for the module - ValidatedAndLinked, // AST is traversed and has linked the required AST nodes - Typed, // Type inference and checking has been performed -} - -enum KeywordDefinition { - Struct, - Enum, - Union, - Function, - Primitive, - Composite, -} - -struct Module { - // Buffers - source: InputSource, - tokens: TokenBuffer, - // Identifiers - root_id: RootId, - name: Option<(PragmaId, StringRef<'static>)>, - version: Option<(PragmaId, i64)>, - phase: ModuleCompilationPhase, -} - -struct Ctx<'a> { - heap: &'a mut Heap, - symbols: &'a mut SymbolTable, - pool: &'a mut StringPool, -} - -/// Scans the module and finds all module-level type definitions. These will be -/// added to the symbol table such that during AST-construction we know which -/// identifiers point to types. Will also parse all pragmas to determine module -/// names. 
-pub(crate) struct PassPreSymbol { - symbols: Vec<Symbol>, - pragmas: Vec<PragmaId>, - imports: Vec<ImportId>, - definitions: Vec<DefinitionId>, - buffer: String, - has_pragma_version: bool, - has_pragma_module: bool, -} - -impl PassPreSymbol { - pub(crate) fn new() -> Self { - Self{ - symbols: Vec::with_capacity(128), - pragmas: Vec::with_capacity(8), - imports: Vec::with_capacity(32), - definitions: Vec::with_capacity(128), - buffer: String::with_capacity(128), - has_pragma_version: false, - has_pragma_module: false, - } - } - - fn reset(&mut self) { - self.symbols.clear(); - self.pragmas.clear(); - self.imports.clear(); - self.definitions.clear(); - self.has_pragma_version = false; - self.has_pragma_module = false; - } - - pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> { - self.reset(); - - let module = &mut modules[module_idx]; - let module_range = &module.tokens.ranges[0]; - - debug_assert_eq!(module.phase, ModuleCompilationPhase::Tokenized); - debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); - debug_assert!(module.root_id.is_invalid()); // not set yet, - - // Preallocate root in the heap - let root_id = ctx.heap.alloc_protocol_description(|this| { - Root{ - this, - pragmas: Vec::new(), - imports: Vec::new(), - definitions: Vec::new(), - } - }); - module.root_id = root_id; - - // Visit token ranges to detect definitions and pragmas - let mut range_idx = module_range.first_child_idx; - loop { - let range_idx_usize = range_idx as usize; - let cur_range = &module.tokens.ranges[range_idx_usize]; - - // Parse if it is a definition or a pragma - if cur_range.range_kind == TokenRangeKind::Definition { - self.visit_definition_range(modules, module_idx, ctx, range_idx_usize)?; - } else if cur_range.range_kind == TokenRangeKind::Pragma { - self.visit_pragma_range(modules, module_idx, ctx, range_idx_usize)?; - } - - match cur_range.next_sibling_idx { - Some(idx) => { range_idx = idx; }, - None => { break; }, - } - } - - // Add the module's symbol scope and the symbols we just parsed - let module_scope = SymbolScope::Module(root_id); - ctx.symbols.insert_scope(None, module_scope); - for symbol in self.symbols.drain(..)
{ - if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol(module_scope, symbol) { - return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol)) - } - } - - // Modify the preallocated root - let root = &mut ctx.heap[root_id]; - root.pragmas.extend(self.pragmas.drain(..)); - root.definitions.extend(self.definitions.drain(..)); - module.phase = ModuleCompilationPhase::DefinitionsScanned; - - Ok(()) - } - - fn visit_pragma_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> { - let module = &modules[module_idx]; - let range = &module.tokens.ranges[range_idx]; - let mut iter = module.tokens.iter_range(range); - - // Consume pragma name - let (pragma_section, pragma_start, _) = consume_pragma(&self.source, &mut iter)?; - - // Consume pragma values - if pragma_section == "#module" { - // Check if name is defined twice within the same file - if self.has_pragma_module { - return Err(ParseError::new_error(&module.source, pragma_start, "module name is defined twice")); - } - - // Consume the domain-name - let (module_name, module_span) = consume_domain_ident(&module.source, &mut iter)?; - if iter.next().is_some() { - return Err(ParseError::new_error(&module.source, iter.last_valid_pos(), "expected end of #module pragma after module name")); - } - - // Add to heap and symbol table - let pragma_span = InputSpan::from_positions(pragma_start, module_span.end); - let module_name = ctx.pool.intern(module_name); - let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Module(PragmaModule{ - this, - span: pragma_span, - value: Identifier{ span: module_span, value: module_name.clone() }, - })); - self.pragmas.push(pragma_id); - - if let Err(other_module_root_id) = ctx.symbols.insert_module(module_name, module.root_id) { - // Naming conflict - let this_module = &modules[module_idx]; - let other_module = seek_module(modules, other_module_root_id).unwrap(); - let (other_module_pragma_id, _) = other_module.name.unwrap(); - let other_pragma = ctx.heap[other_module_pragma_id].as_module(); - return Err(ParseError::new_error_str_at_span( - &this_module.source, pragma_span, "conflict in module name" - ).with_info_str_at_span( - &other_module.source, other_pragma.span, "other module is defined here" - )); - } - self.has_pragma_module = true; - } else if pragma_section == "#version" { - // Check if version is defined twice within the same file - if self.has_pragma_version { - return Err(ParseError::new_error(&module.source, pragma_start, "module version is defined twice")); - } - - // Consume the version pragma - let (version, version_span) = consume_integer_literal(&module.source, &mut iter, &mut self.buffer)?; - let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Version(PragmaVersion{ - this, - span: InputSpan::from_positions(pragma_start, version_span.end), - version, - })); - self.pragmas.push(pragma_id); - self.has_pragma_version = true; - } else { - // Custom pragma, maybe we support this in the future, but for now - // we don't. 
- return Err(ParseError::new_error(&module.source, pragma_start, "illegal pragma name")); - } - - Ok(()) - } - - fn visit_definition_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> { - let module = &modules[module_idx]; - let range = &module.tokens.ranges[range_idx]; - let definition_span = InputSpan::from_positions( - module.tokens.start_pos(range), - module.tokens.end_pos(range) - ); - let mut iter = module.tokens.iter_range(range); - - // First ident must be type of symbol - let (kw_text, _) = consume_any_ident(&module.source, &mut iter).unwrap(); - let kw = parse_definition_keyword(kw_text).unwrap(); - - // Retrieve identifier of definition - let (identifier_text, identifier_span) = consume_ident(&module.source, &mut iter)?; - let ident_text = ctx.pool.intern(identifier_text); - let identifier = Identifier{ span: identifier_span, value: ident_text.clone() }; - - // Reserve space in AST for definition and add it to the symbol table - let definition_class; - let ast_definition_id; - match kw { - KeywordDefinition::Struct => { - let struct_def_id = ctx.heap.alloc_struct_definition(|this| { - StructDefinition::new_empty(this, definition_span, identifier) - }); - definition_class = DefinitionClass::Struct; - ast_definition_id = struct_def_id.upcast(); - }, - KeywordDefinition::Enum => { - let enum_def_id = ctx.heap.alloc_enum_definition(|this| { - EnumDefinition::new_empty(this, definition_span, identifier) - }); - definition_class = DefinitionClass::Enum; - ast_definition_id = enum_def_id.upcast(); - }, - KeywordDefinition::Union => { - let union_def_id = ctx.heap.alloc_union_definition(|this| { - UnionDefinition::new_empty(this, definition_span, identifier) - }); - definition_class = DefinitionClass::Union; - ast_definition_id = union_def_id.upcast() - }, - KeywordDefinition::Function => { - let func_def_id = ctx.heap.alloc_function_definition(|this| { - FunctionDefinition::new_empty(this, definition_span, identifier) - }); - definition_class = DefinitionClass::Function; - ast_definition_id = func_def_id.upcast(); - }, - KeywordDefinition::Primitive | KeywordDefinition::Composite => { - let component_variant = if kw == KeywordDefinition::Primitive { - ComponentVariant::Primitive - } else { - ComponentVariant::Composite - }; - let comp_def_id = ctx.heap.alloc_component_definition(|this| { - ComponentDefinition::new_empty(this, definition_span, component_variant, identifier) - }); - definition_class = DefinitionClass::Component; - ast_definition_id = comp_def_id.upcast(); - } - } - - let symbol = Symbol{ - name: ident_text, - data: SymbolVariant::Definition(SymbolDefinition{ - defined_in_module: module.root_id, - defined_in_scope: SymbolScope::Module(module.root_id), - definition_span, - identifier_span, - introduced_at: None, - class: definition_class, - definition_id: ast_definition_id, - }), - }; - self.symbols.push(symbol); - self.definitions.push(ast_definition_id); - - Ok(()) - } -} - -/// Parses all the imports in the module tokens. Is applied after the -/// definitions and name of modules are resolved. Hence we should be able to -/// resolve all symbols to their appropriate module/definition. 
-pub(crate) struct PassImport { - imports: Vec<ImportId>, -} - -impl PassImport { - pub(crate) fn new() -> Self { - Self{ imports: Vec::with_capacity(32) } - } - pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> { - let module = &modules[module_idx]; - let module_range = &module.tokens.ranges[0]; - debug_assert!(modules.iter().all(|m| m.phase >= ModuleCompilationPhase::DefinitionsScanned)); - debug_assert_eq!(module.phase, ModuleCompilationPhase::DefinitionsScanned); - debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); - - let mut range_idx = module_range.first_child_idx; - loop { - let range_idx_usize = range_idx as usize; - let cur_range = &module.tokens.ranges[range_idx_usize]; - - if cur_range.range_kind == TokenRangeKind::Import { - self.visit_import_range(modules, module_idx, ctx, range_idx_usize)?; - } - - match cur_range.next_sibling_idx { - Some(idx) => { range_idx = idx; }, - None => { break; } - } - } - - Ok(()) - } - - pub(crate) fn visit_import_range( - &mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize - ) -> Result<(), ParseError> { - let module = &modules[module_idx]; - let import_range = &module.tokens.ranges[range_idx]; - debug_assert_eq!(import_range.range_kind, TokenRangeKind::Import); - - let mut iter = module.tokens.iter_range(import_range); - - // Consume "import" - let (_import_ident, import_span) = - consume_ident(&module.source, &mut iter)?; - debug_assert_eq!(_import_ident, KW_IMPORT); - - // Consume module name - let (module_name, module_name_span) = consume_domain_ident(&module.source, &mut iter)?; - let target_root_id = ctx.symbols.get_module_by_name(module_name); - if target_root_id.is_none() { - return Err(ParseError::new_error_at_span( - &module.source, module_name_span, - format!("could not resolve module '{}'", String::from_utf8_lossy(module_name)) - )); - } - let module_name = ctx.pool.intern(module_name); - let target_root_id = target_root_id.unwrap(); - - // Check for subsequent characters - let next = iter.next(); - if has_ident(&module.source, &mut iter, b"as") { - iter.consume(); - let (alias_text, alias_span) = consume_ident(source, &mut iter)?; - let alias = ctx.pool.intern(alias_text); - - let import_id = ctx.heap.alloc_import(|this| Import::Module(ImportModule{ - this, - span: import_span, - module_name: Identifier{ span: module_name_span, value: module_name }, - alias: Identifier{ span: alias_span, value: alias.clone() }, - module_id: target_root_id - })); - ctx.symbols.insert_symbol(SymbolScope::Module(module.root_id), Symbol{ - name: alias, - data: SymbolVariant::Module(SymbolModule{ - root_id: target_root_id, - introduced_at: import_id, - }), - }); - } else if Some(TokenKind::ColonColon) == next { - fn consume_symbol_and_maybe_alias<'a>( - source: &'a InputSource, iter: &mut TokenIter, in_scope: SymbolScope, ctx: &Ctx - ) -> Result<(&'a [u8], InputSpan, Option<(&'a [u8], InputSpan)>), ParseError> { - // Consume symbol and make sure it points to something valid - let (symbol, symbol_span) = consume_ident(source, iter)?; - let target = ctx.symbols.get_symbol_by_name_defined_in_scope(in_scope, symbol); - if target.is_none() { - - } - - if peek_ident(source, iter) == b"as" { - // Consume alias - iter.consume(); - let (alias, alias_span) = consume_ident(source, iter)?; - Ok((symbol, symbol_span, Some((alias, alias_span)))) - } else { - Ok((symbol, symbol_span, None)) - } - } - - iter.consume(); - - let next = iter.next(); - if
Some(TokenKind::Ident) = next { - // Importing a single symbol - iter.consume(); - let (symbol_text, symbol_span, maybe_alias) = consume_symbol_and_maybe_alias(&module.source, &mut iter)?; - let target_symbol = ctx.symbols.get_symbol_by_name_defined_in_scope( - SymbolScope::Module(target_root_id)) - } else if Some(TokenKind::OpenCurly) = next { - // Importing multiple symbols - iter.consume(); - } else if Some(TokenKind::Star) = next { - // Import all symbols from the module - iter.consume(); - } else { - return Err(ParseError::new_error_str_at_pos( - &module.source, iter.last_valid_pos(), "expected symbol name, '{' or '*'" - )); - } - } else { - // Assume implicit alias, then check if we get the semicolon next - let module_name_str = module_name.as_str(); - let last_ident_start = module_name_str.rfind('.').map_or(0, |v| v + 1); - let alias_text = &module_name_str.as_bytes()[last_ident_start..]; - let alias = ctx.pool.intern(alias_text); - let alias_span = InputSpan::from_positions( - module_name_span.begin.with_offset(last_ident_start as u32), - module_name_span.end - ); - } - - Ok(()) - } -} - -fn consume_domain_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> { - let (_, mut span) = consume_ident(source, iter)?; - while let Some(TokenKind::Dot) = iter.next() { - iter.consume(); - let (_, new_span) = consume_ident(source, iter)?; - span.end = new_span.end; - } - - // Not strictly necessary, but probably a reasonable restriction: this - // simplifies parsing of module naming and imports. - if span.begin.line != span.end.line { - return Err(ParseError::new_error_str_at_span(source, span, "module names may not span multiple lines")); - } - - // If module name consists of a single identifier, then it may not match any - // of the reserved keywords - let section = source.section(span.begin, span.end); - if is_reserved_keyword(section) { - return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword")); - } - - Ok((source.section(span.begin, span.end), span)) -} - -/// Consumes a specific expected token. Be careful to only call this with tokens that do not have a -/// variable length. 
-fn consume_token(source: &InputSource, iter: &mut TokenIter, expected: TokenKind) -> Result<(), ParseError> { - if Some(expected) != iter.next() { - return Err(ParseError::new_error_at_pos( - source, iter.last_valid_pos(), - format!("expected '{}'", expected.token_chars()) - )); - } - iter.consume(); - Ok(()) -} - -fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> { - if Some(TokenKind::Integer) != iter.next() { - return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal")); - } - let (start_pos, end_pos) = iter.next_range(); - iter.consume(); - - let integer_text = source.section(start_pos, end_pos); - - // Determine radix and offset from prefix - let (radix, input_offset, radix_name) = - if integer_text.starts_with(b"0b") || integer_text.starts_with(b"0B") { - // Binary number - (2, 2, "binary") - } else if integer_text.starts_with(b"0o") || integer_text.starts_with(b"0O") { - // Octal number - (8, 2, "octal") - } else if integer_text.starts_with(b"0x") || integer_text.starts_with(b"0X") { - // Hexadecimal number - (16, 2, "hexadecimal") - } else { - (10, 0, "decimal") - }; - - // Take out any of the separating '_' characters - buffer.clear(); - for char_idx in input_offset..integer_text.len() { - let char = integer_text[char_idx]; - if char == b'_' { - continue; - } - if !char.is_ascii_digit() { - return Err(ParseError::new_error(source, start_pos, "incorrectly formatted integer")); - } - buffer.push(char::from(char)); - } - - // Use the cleaned up string to convert to integer - match u64::from_str_radix(&buffer, radix) { - Ok(number) => Ok((number, InputSpan::from_positions(start_pos, end_pos))), - Err(_) => Err( - ParseError::new_error(source, start_pos, "incorrectly formatted integer") - ), - } -} - -fn seek_module(modules: &[Module], root_id: RootId) -> Option<&Module> { - for module in modules { - if module.root_id == root_id { - return Some(module) - } - } - - return None -} - -fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> { - if Some(TokenKind::Pragma) != iter.next() { - return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected a pragma")); - } - let (pragma_start, pragma_end) = iter.next_range(); - iter.consume(); - Ok((source.section(pragma_start, pragma_end), pragma_start, pragma_end)) -} - -fn has_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> bool { - if Some(TokenKind::Ident) == iter.next() { - let (start, end) = iter.next_range(); - return source.section(start, end) == expected; - } - - false -} - -fn peek_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Option<&'a [u8]> { - if Some(TokenKind::Ident) == iter.next() { - let (start, end) = iter.next_range(); - return Some(source.section(start, end)) - } - - None -} - -/// Consumes any identifier and returns it together with its span. Does not -/// check if the identifier is a reserved keyword. 
-fn consume_any_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> { - if Some(TokenKind::Ident) != iter.next() { - return Err(ParseError::new_error(sourcee, iter.last_valid_pos(), "expected an identifier")); - } - let (ident_start, ident_end) = iter.next_range(); - iter.consume(); - Ok((source.section(ident_start, ident_end), InputSpan::from_positions(ident_start, ident_end))) -} - -/// Consumes an identifier that is not a reserved keyword and returns it -/// together with its span. -fn consume_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> { - let (ident, span) = consume_any_ident(source, iter)?; - if is_reserved_keyword(ident) { - return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword")); - } - - Ok((ident, span)) -} - -fn is_reserved_definition_keyword(text: &[u8]) -> bool { - return ([ - b"struct", b"enum", b"union", b"function", b"primitive" - ] as &[[u8]]).contains(text) -} - -fn is_reserved_statement_keyword(text: &[u8]) -> bool { - return ([ - b"channel", b"import", b"as", - b"if", b"while", b"break", b"continue", b"goto", b"return", - b"synchronous", b"assert", b"new", - ] as &[[u8]]).contains(text) -} - -fn is_reserved_expression_keyword(text: &[u8]) -> bool { - return ([ - b"let", b"true", b"false", b"null", // literals - b"get", b"put", b"fires", b"create", b"length", // functions - ] as &[[u8]]).contains(text) -} - -fn is_reserved_type_keyword(text: &[u8]) -> bool { - return ([ - b"in", b"out", b"msg", - b"bool", - b"u8", b"u16", b"u32", b"u64", - b"s8", b"s16", b"s32", b"s64", - b"auto" - ] as &[[u8]]).contains(text) -} - -fn is_reserved_keyword(text: &[u8]) -> bool { - return - is_reserved_definition_keyword(text) || - is_reserved_statement_keyword(text) || - is_reserved_type_keyword(text); -} - -/// Constructs a human-readable message indicating why there is a conflict of -/// symbols. -// Note: passing the `module_idx` is not strictly necessary, but will prevent -// programmer mistakes during development: we get a conflict because we're -// currently parsing a particular module. 
-fn construct_symbol_conflict_error(modules: &[Module], module_idx: usize, ctx: &Ctx, new_symbol: &Symbol, old_symbol: &Symbol) -> ParseError { - let module = &modules[module_idx]; - let get_symbol_span_and_msg = |symbol: &Symbol| -> (String, InputSpan) { - match symbol.introduced_at { - Some(import_id) => { - // Symbol is being imported - let import = &ctx.heap[import_id]; - match import { - Import::Module(import) => ( - format!("the module aliased as '{}' imported here", symbol.name.as_str()), - import.span - ), - Import::Symbols(symbols) => ( - format!("the type '{}' imported here", symbol.name.as_str()), - symbols.span - ), - } - }, - None => { - // Symbol is being defined - debug_assert_eq!(symbol.defined_in_module, module.root_id); - debug_assert_ne!(symbol.definition.symbol_class(), SymbolClass::Module); - ( - format!("the type '{}' defined here", symbol.name.as_str()), - symbol.identifier_span - ) - } - } - }; - - let (new_symbol_msg, new_symbol_span) = get_symbol_span_and_msg(new_symbol); - let (old_symbol_msg, old_symbol_span) = get_symbol_span_and_msg(old_symbol); - return ParseError::new_error_at_span( - &module.source, new_symbol_span, format!("symbol is defined twice: {}", new_symbol_msg) - ).with_info_at_span( - &module.source, old_symbol_span, format!("it conflicts with {}", old_symbol_msg) - ) -} - -fn parse_definition_keyword(keyword: &[u8]) -> Option<KeywordDefinition> { - match keyword { - KW_STRUCT => Some(Keyword::Struct), - KW_ENUM => Some(Keyword::Enum), - KW_UNION => Some(Keyword::Union), - KW_FUNCTION => Some(Keyword::Function), - KW_PRIMITIVE => Some(Keyword::Primitive), - KW_COMPOSITE => Some(Keyword::Composite), - _ => None - } -} \ No newline at end of file diff --git a/src/protocol/mod.rs b/src/protocol/mod.rs index 62e11f9b99130635e5d871827f4e855f9d6f7a83..95f635752057d7e6aade690a6d91755d5d978c48 100644 --- a/src/protocol/mod.rs +++ b/src/protocol/mod.rs @@ -4,7 +4,6 @@ mod eval; pub(crate) mod inputsource; pub(crate) mod input_source2; // mod lexer; -mod tokenizer; mod parser; #[cfg(test)] mod tests; @@ -12,7 +11,6 @@ mod parser; pub(crate) mod ast; pub(crate) mod ast_printer; pub(crate) mod lexer; -pub(crate) mod lexer2; lazy_static::lazy_static! { /// Conveniently-provided protocol description initialized with a zero-length PDL string.
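The parser rework in the next diff replaces the deleted single-file lexer with per-module passes that each run over every module before the next pass starts. A sketch of how such a driver loop could look, not part of the patch, assuming the `parse(modules, module_idx, ctx)` signatures shown in this patch (the `run_passes` helper itself is hypothetical, and tokenization is assumed to have happened already):

fn run_passes(
    modules: &mut [Module], ctx: &mut PassCtx,
    symbols: &mut PassSymbols, imports: &mut PassImport, definitions: &mut PassDefinitions,
) -> Result<(), ParseError> {
    // Tokenized -> SymbolsScanned: preallocate each module's AST root and
    // register its module-level definitions in the symbol table.
    for idx in 0..modules.len() { symbols.parse(modules, idx, ctx)?; }
    // SymbolsScanned -> ImportsResolved: every module's symbols now exist,
    // so imports can be resolved across module boundaries.
    for idx in 0..modules.len() { imports.parse(modules, idx, ctx)?; }
    // ImportsResolved -> DefinitionsParsed: build the AST bodies.
    for idx in 0..modules.len() { definitions.parse(modules, idx, ctx)?; }
    Ok(())
}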
diff --git a/src/protocol/parser/mod.rs b/src/protocol/parser/mod.rs index 0110d950b8d80c1db45d1a5286816baf75a23042..4809745fac292421922b8a6c613c774c14c72fe1 100644 --- a/src/protocol/parser/mod.rs +++ b/src/protocol/parser/mod.rs @@ -2,25 +2,61 @@ mod depth_visitor; pub(crate) mod symbol_table; pub(crate) mod symbol_table2; pub(crate) mod type_table; +pub(crate) mod tokens; +pub(crate) mod token_parsing; +pub(crate) mod pass_tokenizer; +pub(crate) mod pass_symbols; +pub(crate) mod pass_imports; +pub(crate) mod pass_definitions; mod type_resolver; mod visitor; mod visitor_linker; mod utils; use depth_visitor::*; -use symbol_table::SymbolTable; +use tokens::*; +use crate::collections::*; +use symbol_table2::SymbolTable; use visitor::Visitor2; use visitor_linker::ValidityAndLinkerVisitor; use type_resolver::{TypeResolvingVisitor, ResolveQueue}; use type_table::{TypeTable, TypeCtx}; use crate::protocol::ast::*; -use crate::protocol::inputsource::*; +use crate::protocol::input_source2::{InputSource2 as InputSource}; use crate::protocol::lexer::*; use std::collections::HashMap; use crate::protocol::ast_printer::ASTWriter; +// Debug for the debug_assert_eq! calls, Ord so passes can assert that modules +// have reached at least a given phase. +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum ModuleCompilationPhase { + Source, // only source is set + Tokenized, // source is tokenized + SymbolsScanned, // all definitions are linked to their type class + ImportsResolved, // all imports are added to the symbol table + DefinitionsParsed, // produced the AST for the entire module + ValidatedAndLinked, // AST is traversed and has linked the required AST nodes + Typed, // Type inference and checking has been performed +} + +pub struct Module { + // Buffers + source: InputSource, + tokens: TokenBuffer, + // Identifiers + root_id: RootId, + name: Option<(PragmaId, StringRef<'static>)>, + version: Option<(PragmaId, i64)>, + phase: ModuleCompilationPhase, +} + +pub struct PassCtx<'a> { + heap: &'a mut Heap, + symbols: &'a mut SymbolTable, + pool: &'a mut StringPool, +} + // TODO: @fixme, pub qualifier pub(crate) struct LexedModule { pub(crate) source: InputSource, @@ -156,13 +192,13 @@ impl Parser { match import { Import::Module(import) => { debug_assert!(import.module_id.is_none(), "module import already resolved"); - let target_module_id = self.symbol_table.resolve_module(&import.module_name) + let target_module_id = self.symbol_table.resolve_module(&import.module) .expect("module import is resolved by symbol table"); import.module_id = Some(target_module_id) }, Import::Symbols(import) => { debug_assert!(import.module_id.is_none(), "module of symbol import already resolved"); - let target_module_id = self.symbol_table.resolve_module(&import.module_name) + let target_module_id = self.symbol_table.resolve_module(&import.module) .expect("symbol import's module is resolved by symbol table"); import.module_id = Some(target_module_id); diff --git a/src/protocol/parser/pass_definitions.rs b/src/protocol/parser/pass_definitions.rs new file mode 100644 index 0000000000000000000000000000000000000000..3bbfe219acd57fc1d373aadbeca584564733403d --- /dev/null +++ b/src/protocol/parser/pass_definitions.rs @@ -0,0 +1,199 @@ +use crate::protocol::ast::*; +use super::symbol_table2::*; +use super::{Module, ModuleCompilationPhase, PassCtx}; +use super::tokens::*; +use super::token_parsing::*; +use crate::protocol::input_source2::{InputSource2 as InputSource, InputSpan, ParseError}; +use crate::collections::*; + +/// Parses all the tokenized definitions into actual AST nodes.
+pub(crate) struct PassDefinitions { + identifiers: Vec<Identifier>, + struct_fields: Vec<StructFieldDefinition>, +} + +impl PassDefinitions { + pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let module_range = &module.tokens.ranges[0]; + debug_assert_eq!(module.phase, ModuleCompilationPhase::ImportsResolved); + debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); + + let mut range_idx = module_range.first_child_idx; + loop { + let range_idx_usize = range_idx as usize; + let cur_range = &module.tokens.ranges[range_idx_usize]; + + if cur_range.range_kind == TokenRangeKind::Definition { + self.visit_definition_range(modules, module_idx, ctx, range_idx_usize)?; + } + + match cur_range.next_sibling_idx { + Some(idx) => { range_idx = idx; }, + None => { break; }, + } + } + + Ok(()) + } + + fn visit_definition_range( + &mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize + ) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let cur_range = &module.tokens.ranges[range_idx]; + debug_assert_eq!(cur_range.range_kind, TokenRangeKind::Definition); + + // Detect which definition we're parsing + let mut iter = module.tokens.iter_range(cur_range); + let keyword = peek_ident(&module.source, &mut iter).unwrap(); + match keyword { + KW_STRUCT => { + + }, + KW_ENUM => { + + }, + KW_UNION => { + + }, + KW_FUNCTION => { + + }, + KW_PRIMITIVE => { + + }, + KW_COMPOSITE => { + + }, + _ => unreachable!("encountered keyword '{}' in definition range", String::from_utf8_lossy(keyword)), + }; + + Ok(()) + } + + fn visit_struct_definition( + &mut self, module: &Module, iter: &mut TokenIter, ctx: &mut PassCtx + ) -> Result<(), ParseError> { + // Consume struct and name of struct + let struct_span = consume_exact_ident(&module.source, iter, b"struct")?; + let (ident_text, _) = consume_ident(&module.source, iter)?; + + // We should have preallocated the definition in the heap, retrieve its identifier + let definition_id = ctx.symbols.get_symbol_by_name_defined_in_scope(SymbolScope::Module(module.root_id), ident_text) + .unwrap().variant.as_definition().definition_id; + + consume_polymorphic_vars(&module.source, iter, ctx, &mut self.identifiers)?; + debug_assert!(self.struct_fields.is_empty()); + consume_comma_separated( + TokenKind::OpenCurly, TokenKind::CloseCurly, &module.source, iter, + |source, iter| { + let field = consume_ident_interned(source, iter, ctx)?; + + StructFieldDefinition{ field, parser_type } + }, + &mut self.struct_fields, "a struct field", "a list of struct fields" + ) + } +} + +enum TypeKind { + Message, + Bool, + UInt8, UInt16, UInt32, UInt64, + SInt8, SInt16, SInt32, SInt64, + Character, String, + Inferred, + Array, + Input, + Output, + SymbolicDefinition(DefinitionId), + SymbolicPolyArg(DefinitionId, usize), +}
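A note on why `consume_parser_type` below keeps an explicit stack: a type such as `Map<String,Array<SInt32>>` is a tree, and its trailing `>>` is tokenized as a single `ShiftRight` token, so any depth bookkeeping has to treat that token as closing two levels at once. A self-contained sketch of that counting, not part of the patch, over a simplified token enum (the real code works on `TokenKind` and a `StackEntry` per nesting level):

enum Tok { Ident, OpenAngle, CloseAngle, ShiftRight, Comma }

// Returns the angle-bracket nesting depth after a token stream, or an error
// if more angle brackets are closed than were opened.
fn final_angle_depth(tokens: &[Tok]) -> Result<i32, &'static str> {
    let mut depth = 0i32;
    for token in tokens {
        match token {
            Tok::OpenAngle => depth += 1,
            Tok::CloseAngle => depth -= 1,
            Tok::ShiftRight => depth -= 2, // `>>` closes two levels at once
            _ => {},
        }
        if depth < 0 { return Err("unmatched '>'"); }
    }
    Ok(depth)
}

fn main() {
    // Token stream of Map<String,Array<SInt32>>
    let tokens = [
        Tok::Ident, Tok::OpenAngle, Tok::Ident, Tok::Comma,
        Tok::Ident, Tok::OpenAngle, Tok::Ident, Tok::ShiftRight,
    ];
    assert_eq!(final_angle_depth(&tokens), Ok(0));
}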
+ +/// Consumes a type. A type always starts with an identifier which may indicate +/// a builtin type or a user-defined type. The fact that it may contain +/// polymorphic arguments makes it a tree-like structure. Because we cannot rely +/// on knowing the exact number of polymorphic arguments we do not check for +/// these. +fn consume_parser_type( + source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx, poly_vars: &[Identifier] +) -> Result<(), ParseError> { + struct StackEntry { + angle_depth: i32, + } + let mut type_stack = Vec::new(); + + Ok(()) +} + +fn consume_parser_type_ident( + source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx, + mut scope: SymbolScope, wrapping_definition: DefinitionId, poly_vars: &[Identifier] +) -> Result<(TypeKind, InputSpan), ParseError> { + let (type_text, type_span) = consume_any_ident(source, iter)?; + + let type_kind = match type_text { + KW_TYPE_MESSAGE => TypeKind::Message, + KW_TYPE_BOOL => TypeKind::Bool, + KW_TYPE_UINT8 => TypeKind::UInt8, + KW_TYPE_UINT16 => TypeKind::UInt16, + KW_TYPE_UINT32 => TypeKind::UInt32, + KW_TYPE_UINT64 => TypeKind::UInt64, + KW_TYPE_SINT8 => TypeKind::SInt8, + KW_TYPE_SINT16 => TypeKind::SInt16, + KW_TYPE_SINT32 => TypeKind::SInt32, + KW_TYPE_SINT64 => TypeKind::SInt64, + KW_TYPE_IN_PORT => TypeKind::Input, + KW_TYPE_OUT_PORT => TypeKind::Output, + _ => { + // Must be some kind of symbolic type + let mut type_kind = None; + for (poly_idx, poly_var) in poly_vars.iter().enumerate() { + if poly_var.value.as_bytes() == type_text { + type_kind = Some(TypeKind::SymbolicPolyArg(wrapping_definition, poly_idx)); + } + } + + if type_kind.is_none() { + // Check symbol table for definition + let last_symbol = ctx.symbols.get_symbol_by_name(scope, type_text); + if last_symbol.is_none() { + return Err(ParseError::new_error_str_at_span(source, type_span, "unknown type")); + } + let last_symbol = last_symbol.unwrap(); + match last_symbol.variant { + SymbolVariant::Module(symbol_module) => { + // Keep seeking + }, + SymbolVariant::Definition(symbol_definition) => { + + } + } + } + } + } + + + Ok(()) +} + +/// Consumes polymorphic variables (i.e. a list of identifiers). If the list is +/// absent then we simply return an empty array. +fn consume_polymorphic_vars( + source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx, target: &mut Vec<Identifier> +) -> Result<(), ParseError> { + // Note: because this is just a list of identifiers, we don't have to take + // two `TokenKind::CloseAngle` interpreted as `TokenKind::ShiftRight` into + // account. + debug_assert!(target.is_empty()); + maybe_consume_comma_separated( + TokenKind::OpenAngle, TokenKind::CloseAngle, source, iter, + |source, iter| consume_ident_interned(source, iter, ctx), + target, "a polymorphic variable" + )?; + + Ok(()) +} \ No newline at end of file diff --git a/src/protocol/parser/pass_imports.rs b/src/protocol/parser/pass_imports.rs new file mode 100644 index 0000000000000000000000000000000000000000..388d6a3f33119cf93551e89096e2eff8dd65a86c --- /dev/null +++ b/src/protocol/parser/pass_imports.rs @@ -0,0 +1,308 @@ +use crate::protocol::ast::*; +use super::symbol_table2::*; +use super::{Module, ModuleCompilationPhase, PassCtx}; +use super::tokens::*; +use super::token_parsing::*; +use crate::protocol::input_source2::{InputSource2 as InputSource, InputSpan, ParseError}; +use crate::collections::*; + +/// Parses all the imports in the module tokens. Is applied after the +/// definitions and name of modules are resolved. Hence we should be able to +/// resolve all symbols to their appropriate module/definition.
+pub(crate) struct PassImport { + imports: Vec<ImportId>, + found_symbols: Vec<(AliasedSymbol, SymbolDefinition)>, + scoped_symbols: Vec<Symbol>, +} + +impl PassImport { + pub(crate) fn new() -> Self { + Self{ + imports: Vec::with_capacity(32), + found_symbols: Vec::with_capacity(32), + scoped_symbols: Vec::with_capacity(32), + } + } + pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let module_range = &module.tokens.ranges[0]; + debug_assert!(modules.iter().all(|m| m.phase >= ModuleCompilationPhase::SymbolsScanned)); + debug_assert_eq!(module.phase, ModuleCompilationPhase::SymbolsScanned); + debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); + + let mut range_idx = module_range.first_child_idx; + loop { + let range_idx_usize = range_idx as usize; + let cur_range = &module.tokens.ranges[range_idx_usize]; + + if cur_range.range_kind == TokenRangeKind::Import { + self.visit_import_range(modules, module_idx, ctx, range_idx_usize)?; + } + + match cur_range.next_sibling_idx { + Some(idx) => { range_idx = idx; }, + None => { break; } + } + } + + let root = &mut ctx.heap[module.root_id]; + root.imports.extend(self.imports.drain(..)); + + let module = &mut modules[module_idx]; + module.phase = ModuleCompilationPhase::ImportsResolved; + + Ok(()) + } + + pub(crate) fn visit_import_range( + &mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize + ) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let import_range = &module.tokens.ranges[range_idx]; + debug_assert_eq!(import_range.range_kind, TokenRangeKind::Import); + + let mut iter = module.tokens.iter_range(import_range); + + // Consume "import" + let (_import_ident, import_span) = + consume_any_ident(&module.source, &mut iter)?; + debug_assert_eq!(_import_ident, KW_IMPORT); + + // Consume module name + let (module_name, module_name_span) = consume_domain_ident(&module.source, &mut iter)?; + let target_root_id = ctx.symbols.get_module_by_name(module_name); + if target_root_id.is_none() { + return Err(ParseError::new_error_at_span( + &module.source, module_name_span, + format!("could not resolve module '{}'", String::from_utf8_lossy(module_name)) + )); + } + let module_name = ctx.pool.intern(module_name); + let module_identifier = Identifier{ span: module_name_span, value: module_name }; + let target_root_id = target_root_id.unwrap(); + + // Check for subsequent characters (alias, multiple imported symbols) + let next = iter.next(); + let import_id; + + if has_ident(&module.source, &mut iter, b"as") { + // Alias for module + iter.consume(); + let alias_identifier = consume_ident_interned(&module.source, &mut iter, ctx)?; + let alias_name = alias_identifier.value.clone(); + + import_id = ctx.heap.alloc_import(|this| Import::Module(ImportModule{ + this, + span: import_span, + module: module_identifier, + alias: alias_identifier, + module_id: target_root_id + })); + ctx.symbols.insert_symbol(SymbolScope::Module(module.root_id), Symbol{ + name: alias_name, + variant: SymbolVariant::Module(SymbolModule{ + root_id: target_root_id, + introduced_at: import_id, + }), + }); + } else if Some(TokenKind::ColonColon) == next { + iter.consume(); + + // Helper function to consume symbols, their alias, and the + // definition the symbol is pointing to.
+ fn consume_symbol_and_maybe_alias<'a>( + source: &'a InputSource, iter: &mut TokenIter, ctx: &mut PassCtx, + module_name: &StringRef<'static>, module_root_id: RootId, + ) -> Result<(AliasedSymbol, SymbolDefinition), ParseError> { + // Consume symbol name and make sure it points to an existing definition + let symbol_identifier = consume_ident_interned(source, iter, ctx)?; + let target = ctx.symbols.get_symbol_by_name_defined_in_scope( + SymbolScope::Module(module_root_id), symbol_identifier.value.as_bytes() + ); + + if target.is_none() { + return Err(ParseError::new_error_at_span( + source, symbol_identifier.span, + format!( + "could not find symbol '{}' within module '{}'", + symbol_identifier.value.as_str(), module_name.as_str() + ) + )); + } + let target = target.unwrap(); + debug_assert_ne!(target.class(), SymbolClass::Module); + // Clone the definition now: consuming the alias below needs `ctx` again + let target_definition = target.variant.as_definition().clone(); + + // Consume alias text if specified + let alias_identifier = if peek_ident(source, iter) == Some(&b"as"[..]) { + // Consume alias + iter.consume(); + Some(consume_ident_interned(source, iter, ctx)?) + } else { + None + }; + + Ok(( + AliasedSymbol{ + name: symbol_identifier, + alias: alias_identifier, + definition_id: target_definition.definition_id, + }, + target_definition + )) + } + + let next = iter.next(); + + if Some(TokenKind::Ident) == next { + // Importing a single symbol; the identifier itself is consumed by + // the helper above + let (imported_symbol, symbol_definition) = consume_symbol_and_maybe_alias( + &module.source, &mut iter, ctx, &module_identifier.value, target_root_id + )?; + let alias_identifier = imported_symbol.alias.clone().unwrap_or_else(|| imported_symbol.name.clone()); + import_id = ctx.heap.alloc_import(|this| Import::Symbols(ImportSymbols{ + this, + span: InputSpan::from_positions(import_span.begin, alias_identifier.span.end), + module: module_identifier, + module_id: target_root_id, + symbols: vec![imported_symbol], + })); + if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol( + SymbolScope::Module(module.root_id), + Symbol{ + name: alias_identifier.value, + variant: SymbolVariant::Definition(symbol_definition.into_imported(import_id)) + } + ) { + return Err(construct_symbol_conflict_error( + modules, module_idx, ctx, &new_symbol, old_symbol + )); + } + } else if Some(TokenKind::OpenCurly) == next { + // Importing multiple symbols + consume_comma_separated( + TokenKind::OpenCurly, TokenKind::CloseCurly, &module.source, &mut iter, + |source, iter| consume_symbol_and_maybe_alias( + source, iter, ctx, &module_identifier.value, target_root_id + ), + &mut self.found_symbols, "a symbol", "a list of symbols to import" + )?; + let end_of_list = iter.last_valid_pos(); + + // Preallocate import + import_id = ctx.heap.alloc_import(|this| Import::Symbols(ImportSymbols { + this, + span: InputSpan::from_positions(import_span.begin, end_of_list), + module: module_identifier, + module_id: target_root_id, + symbols: Vec::with_capacity(self.found_symbols.len()), + })); + + // Fill import symbols while inserting symbols in the + // appropriate scope in the symbol table. + let import = ctx.heap[import_id].as_symbols_mut(); + + for (imported_symbol, symbol_definition) in self.found_symbols.drain(..)
{ + let import_name = imported_symbol.alias.as_ref().map_or_else( + || imported_symbol.name.value.clone(), + |v| v.value.clone() + ); + import.symbols.push(imported_symbol); + if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol( + SymbolScope::Module(module.root_id), Symbol{ + name: import_name, + variant: SymbolVariant::Definition(symbol_definition.into_imported(import_id)) + } + ) { + return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol)); + } + } + } else if Some(TokenKind::Star) == next { + // Import all symbols from the module + let star_span = iter.next_span(); + + iter.consume(); + self.scoped_symbols.clear(); + let _found = ctx.symbols.get_all_symbols_defined_in_scope( + SymbolScope::Module(target_root_id), + &mut self.scoped_symbols + ); + debug_assert!(_found); // even modules without symbols should have a scope + + // Preallocate import + import_id = ctx.heap.alloc_import(|this| Import::Symbols(ImportSymbols{ + this, + span: InputSpan::from_positions(import_span.begin, star_span.end), + module: module_identifier, + module_id: target_root_id, + symbols: Vec::with_capacity(self.scoped_symbols.len()) + })); + + // Fill import AST node and symbol table + let import = ctx.heap[import_id].as_symbols_mut(); + + for symbol in self.scoped_symbols.drain(..) { + let symbol_name = symbol.name; + match symbol.variant { + SymbolVariant::Definition(symbol_definition) => { + import.symbols.push(AliasedSymbol{ + name: Identifier{ span: star_span, value: symbol_name.clone() }, + alias: None, + definition_id: symbol_definition.definition_id, + }); + + if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol( + SymbolScope::Module(module.root_id), + Symbol{ + name: symbol_name, + variant: SymbolVariant::Definition(symbol_definition.into_imported(import_id)) + } + ) { + return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol)); + } + }, + _ => unreachable!(), + } + } + } else { + return Err(ParseError::new_error_str_at_pos( + &module.source, iter.last_valid_pos(), "expected symbol name, '{' or '*'" + )); + } + } else { + // Assume implicit alias + let module_name_str = module_identifier.value.as_str(); + let last_ident_start = module_name_str.rfind('.').map_or(0, |v| v + 1); + let alias_text = &module_name_str.as_bytes()[last_ident_start..]; + let alias = ctx.pool.intern(alias_text); + let alias_span = InputSpan::from_positions( + module_name_span.begin.with_offset(last_ident_start as u32), + module_name_span.end + ); + let alias_identifier = Identifier{ span: alias_span, value: alias.clone() }; + + import_id = ctx.heap.alloc_import(|this| Import::Module(ImportModule{ + this, + span: InputSpan::from_positions(import_span.begin, module_identifier.span.end), + module: module_identifier, + alias: alias_identifier, + module_id: target_root_id, + })); + ctx.symbols.insert_symbol(SymbolScope::Module(module.root_id), Symbol{ + name: alias, + variant: SymbolVariant::Module(SymbolModule{ + root_id: target_root_id, + introduced_at: import_id + }) + }); + } + + // By now the `import_id` is set, just need to make sure that the import + // properly ends with a semicolon + consume_token(&module.source, &mut iter, TokenKind::SemiColon)?; + self.imports.push(import_id); + + Ok(()) + } +}
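For reference, the implicit-alias rule from the final `else` branch above, extracted into a self-contained sketch (not part of the patch): when an import carries no explicit alias and names no symbols, the module is exposed under its last dot-separated segment.

fn implicit_alias(module_name: &str) -> &str {
    // Alias starts right after the last '.', or at 0 if there is no dot.
    let last_ident_start = module_name.rfind('.').map_or(0, |v| v + 1);
    &module_name[last_ident_start..]
}

fn main() {
    assert_eq!(implicit_alias("std.io"), "io");   // `import std.io;` binds `io`
    assert_eq!(implicit_alias("runtime"), "runtime");
}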
diff --git a/src/protocol/parser/pass_symbols.rs b/src/protocol/parser/pass_symbols.rs new file mode 100644 index 0000000000000000000000000000000000000000..e4acb219a132d1130240084feea0ddfc77ecee73 --- /dev/null +++ b/src/protocol/parser/pass_symbols.rs @@ -0,0 +1,250 @@ +use crate::protocol::ast::*; +use super::symbol_table2::*; +use crate::protocol::input_source2::{ParseError, InputSpan}; +use super::tokens::*; +use super::token_parsing::*; +use super::{Module, ModuleCompilationPhase, PassCtx}; + +/// Scans the module and finds all module-level type definitions. These will be +/// added to the symbol table such that during AST-construction we know which +/// identifiers point to types. Will also parse all pragmas to determine module +/// names. +pub(crate) struct PassSymbols { + symbols: Vec<Symbol>, + pragmas: Vec<PragmaId>, + imports: Vec<ImportId>, + definitions: Vec<DefinitionId>, + buffer: String, + has_pragma_version: bool, + has_pragma_module: bool, +} + +impl PassSymbols { + pub(crate) fn new() -> Self { + Self{ + symbols: Vec::with_capacity(128), + pragmas: Vec::with_capacity(8), + imports: Vec::with_capacity(32), + definitions: Vec::with_capacity(128), + buffer: String::with_capacity(128), + has_pragma_version: false, + has_pragma_module: false, + } + } + + fn reset(&mut self) { + self.symbols.clear(); + self.pragmas.clear(); + self.imports.clear(); + self.definitions.clear(); + self.has_pragma_version = false; + self.has_pragma_module = false; + } + + pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx) -> Result<(), ParseError> { + self.reset(); + + let module = &mut modules[module_idx]; + let module_range = &module.tokens.ranges[0]; + + debug_assert_eq!(module.phase, ModuleCompilationPhase::Tokenized); + debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); + debug_assert!(module.root_id.is_invalid()); // not set yet, + + // Preallocate root in the heap + let root_id = ctx.heap.alloc_protocol_description(|this| { + Root{ + this, + pragmas: Vec::new(), + imports: Vec::new(), + definitions: Vec::new(), + } + }); + module.root_id = root_id; + + // Visit token ranges to detect definitions and pragmas + let mut range_idx = module_range.first_child_idx; + loop { + let range_idx_usize = range_idx as usize; + let cur_range = &module.tokens.ranges[range_idx_usize]; + + // Parse if it is a definition or a pragma + if cur_range.range_kind == TokenRangeKind::Definition { + self.visit_definition_range(modules, module_idx, ctx, range_idx_usize)?; + } else if cur_range.range_kind == TokenRangeKind::Pragma { + self.visit_pragma_range(modules, module_idx, ctx, range_idx_usize)?; + } + + match cur_range.next_sibling_idx { + Some(idx) => { range_idx = idx; }, + None => { break; }, + } + } + + // Add the module's symbol scope and the symbols we just parsed + let module_scope = SymbolScope::Module(root_id); + ctx.symbols.insert_scope(None, module_scope); + for symbol in self.symbols.drain(..)
{ + if let Err((new_symbol, old_symbol)) = ctx.symbols.insert_symbol(module_scope, symbol) { + return Err(construct_symbol_conflict_error(modules, module_idx, ctx, &new_symbol, old_symbol)) + } + } + + // Modify the preallocated root + let root = &mut ctx.heap[root_id]; + root.pragmas.extend(self.pragmas.drain(..)); + root.definitions.extend(self.definitions.drain(..)); + module.phase = ModuleCompilationPhase::SymbolsScanned; + + Ok(()) + } + + fn visit_pragma_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let range = &module.tokens.ranges[range_idx]; + let mut iter = module.tokens.iter_range(range); + + // Consume pragma name + let (pragma_section, pragma_start, _) = consume_pragma(&module.source, &mut iter)?; + + // Consume pragma values + if pragma_section == b"#module" { + // Check if name is defined twice within the same file + if self.has_pragma_module { + return Err(ParseError::new_error(&module.source, pragma_start, "module name is defined twice")); + } + + // Consume the domain-name + let (module_name, module_span) = consume_domain_ident(&module.source, &mut iter)?; + if iter.next().is_some() { + return Err(ParseError::new_error(&module.source, iter.last_valid_pos(), "expected end of #module pragma after module name")); + } + + // Add to heap and symbol table + let pragma_span = InputSpan::from_positions(pragma_start, module_span.end); + let module_name = ctx.pool.intern(module_name); + let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Module(PragmaModule{ + this, + span: pragma_span, + value: Identifier{ span: module_span, value: module_name.clone() }, + })); + self.pragmas.push(pragma_id); + + if let Err(other_module_root_id) = ctx.symbols.insert_module(module_name, module.root_id) { + // Naming conflict + let this_module = &modules[module_idx]; + let other_module = seek_module(modules, other_module_root_id).unwrap(); + let (other_module_pragma_id, _) = other_module.name.clone().unwrap(); + let other_pragma = ctx.heap[other_module_pragma_id].as_module(); + return Err(ParseError::new_error_str_at_span( + &this_module.source, pragma_span, "conflict in module name" + ).with_info_str_at_span( + &other_module.source, other_pragma.span, "other module is defined here" + )); + } + self.has_pragma_module = true; + } else if pragma_section == b"#version" { + // Check if version is defined twice within the same file + if self.has_pragma_version { + return Err(ParseError::new_error(&module.source, pragma_start, "module version is defined twice")); + } + + // Consume the version pragma + let (version, version_span) = consume_integer_literal(&module.source, &mut iter, &mut self.buffer)?; + let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Version(PragmaVersion{ + this, + span: InputSpan::from_positions(pragma_start, version_span.end), + version, + })); + self.pragmas.push(pragma_id); + self.has_pragma_version = true; + } else { + // Custom pragma, maybe we support this in the future, but for now + // we don't.
+ return Err(ParseError::new_error(&module.source, pragma_start, "illegal pragma name")); + } + + Ok(()) + } + + fn visit_definition_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let range = &module.tokens.ranges[range_idx]; + let definition_span = InputSpan::from_positions( + module.tokens.start_pos(range), + module.tokens.end_pos(range) + ); + let mut iter = module.tokens.iter_range(range); + + // First ident must be type of symbol + let (kw_text, _) = consume_any_ident(&module.source, &mut iter).unwrap(); + + // Retrieve identifier of definition + let identifier = consume_ident_interned(&module.source, &mut iter, ctx)?; + let ident_text = identifier.value.clone(); // because we need it later + let identifier_span = identifier.span; // ditto: `identifier` is moved below + + // Reserve space in AST for definition and add it to the symbol table + let definition_class; + let ast_definition_id; + match kw_text { + KW_STRUCT => { + let struct_def_id = ctx.heap.alloc_struct_definition(|this| { + StructDefinition::new_empty(this, definition_span, identifier) + }); + definition_class = DefinitionClass::Struct; + ast_definition_id = struct_def_id.upcast(); + }, + KW_ENUM => { + let enum_def_id = ctx.heap.alloc_enum_definition(|this| { + EnumDefinition::new_empty(this, definition_span, identifier) + }); + definition_class = DefinitionClass::Enum; + ast_definition_id = enum_def_id.upcast(); + }, + KW_UNION => { + let union_def_id = ctx.heap.alloc_union_definition(|this| { + UnionDefinition::new_empty(this, definition_span, identifier) + }); + definition_class = DefinitionClass::Union; + ast_definition_id = union_def_id.upcast() + }, + KW_FUNCTION => { + let func_def_id = ctx.heap.alloc_function_definition(|this| { + FunctionDefinition::new_empty(this, definition_span, identifier) + }); + definition_class = DefinitionClass::Function; + ast_definition_id = func_def_id.upcast(); + }, + KW_PRIMITIVE | KW_COMPOSITE => { + let component_variant = if kw_text == KW_PRIMITIVE { + ComponentVariant::Primitive + } else { + ComponentVariant::Composite + }; + let comp_def_id = ctx.heap.alloc_component_definition(|this| { + ComponentDefinition::new_empty(this, definition_span, component_variant, identifier) + }); + definition_class = DefinitionClass::Component; + ast_definition_id = comp_def_id.upcast(); + }, + _ => unreachable!("encountered keyword '{}' in definition range", String::from_utf8_lossy(kw_text)), + } + + let symbol = Symbol{ + name: ident_text, + variant: SymbolVariant::Definition(SymbolDefinition{ + defined_in_module: module.root_id, + defined_in_scope: SymbolScope::Module(module.root_id), + definition_span, + identifier_span, + imported_at: None, + class: definition_class, + definition_id: ast_definition_id, + }), + }; + self.symbols.push(symbol); + self.definitions.push(ast_definition_id); + + Ok(()) + } +} \ No newline at end of file diff --git a/src/protocol/tokenizer/mod.rs b/src/protocol/parser/pass_tokenizer.rs similarity index 72% rename from src/protocol/tokenizer/mod.rs rename to src/protocol/parser/pass_tokenizer.rs index 371138f42ebda9441a93b236428802eba6b6938f..f0a8c44190f142a38b65583fd1836c70c933ae90 100644 --- a/src/protocol/tokenizer/mod.rs +++ b/src/protocol/parser/pass_tokenizer.rs @@ -5,309 +5,7 @@ use crate::protocol::input_source2::{ InputSpan }; -pub(crate) const KW_STRUCT: &'static [u8] = b"struct"; -pub(crate) const KW_ENUM: &'static [u8] = b"enum"; -pub(crate) const KW_UNION: &'static [u8] = b"union"; -pub(crate) const KW_FUNCTION:
&'static [u8] = b"function"; -pub(crate) const KW_PRIMITIVE: &'static [u8] = b"primitive"; -pub(crate) const KW_COMPOSITE: &'static [u8] = b"composite"; -pub(crate) const KW_IMPORT: &'static [u8] = b"import"; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub(crate) enum TokenKind { - // Variable-character tokens, followed by a SpanEnd token - Ident, // regular identifier - Pragma, // identifier with prefixed `#`, range includes `#` - Integer, // integer literal - String, // string literal, range includes `"` - Character, // character literal, range includes `'` - LineComment, // line comment, range includes leading `//`, but not newline - BlockComment, // block comment, range includes leading `/*` and trailing `*/` - // Punctuation (single character) - Exclamation, // ! - Question, // ? - Pound, // # - OpenAngle, // < - OpenCurly, // { - OpenParen, // ( - OpenSquare, // [ - CloseAngle, // > - CloseCurly, // } - CloseParen, // ) - CloseSquare, // ] - Colon, // : - Comma, // , - Dot, // . - SemiColon, // ; - Quote, // ' - DoubleQuote, // " - // Operator-like (single character) - At, // @ - Plus, // + - Minus, // - - Star, // * - Slash, // / - Percent, // % - Caret, // ^ - And, // & - Or, // | - Tilde, // ~ - Equal, // = - // Punctuation (two characters) - ColonColon, // :: - DotDot, // .. - ArrowRight, // -> - // Operator-like (two characters) - PlusPlus, // ++ - PlusEquals, // += - MinusMinus, // -- - MinusEquals, // -= - StarEquals, // *= - SlashEquals, // /= - PercentEquals, // %= - CaretEquals, // ^= - AndAnd, // && - AndEquals, // &= - OrOr, // || - OrEquals, // |= - EqualEqual, // == - NotEqual, // != - ShiftLeft, // << - ShiftRight, // >> - // Operator-like (three characters) - ShiftLeftEquals,// <<= - ShiftRightEquals, // >>= - // Special marker token to indicate end of variable-character tokens - SpanEnd, -} - -impl TokenKind { - /// Returns true if the next expected token is the special `TokenKind::SpanEnd` token. This is - /// the case for tokens of variable length (e.g. an identifier). - fn has_span_end(&self) -> bool { - return *self <= TokenKind::BlockComment - } - - /// Returns the number of characters associated with the token. May only be called on tokens - /// that do not have a variable length. - fn num_characters(&self) -> u32 { - debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd); - if *self <= TokenKind::Equal { - 1 - } else if *self <= TokenKind::ShiftRight { - 2 - } else { - 3 - } - } - - /// Returns the characters that are represented by the token, may only be called on tokens that - /// do not have a variable length. 
- pub fn token_chars(&self) -> &'static str { - debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd); - use TokenKind as TK; - match self { - TK::Exclamation => "!", - TK::Question => "?", - TK::Pound => "#", - TK::OpenAngle => "<", - TK::OpenCurly => "{", - TK::OpenParen => "(", - TK::OpenSquare => "[", - TK::CloseAngle => ">", - TK::CloseCurly => "}", - TK::CloseParen => ")", - TK::CloseSquare => "]", - TK::Colon => ":", - TK::Comma => ",", - TK::Dot => ".", - TK::SemiColon => ";", - TK::Quote => "'", - TK::DoubleQuote => "\"", - TK::At => "@", - TK::Plus => "+", - TK::Minus => "-", - TK::Star => "*", - TK::Slash => "/", - TK::Percent => "%", - TK::Caret => "^", - TK::And => "&", - TK::Or => "|", - TK::Tilde => "~", - TK::Equal => "=", - TK::ColonColon => "::", - TK::DotDot => "..", - TK::ArrowRight => "->", - TK::PlusPlus => "++", - TK::PlusEquals => "+=", - TK::MinusMinus => "--", - TK::MinusEquals => "-=", - TK::StarEquals => "*=", - TK::SlashEquals => "/=", - TK::PercentEquals => "%=", - TK::CaretEquals => "^=", - TK::AndAnd => "&&", - TK::AndEquals => "&=", - TK::OrOr => "||", - TK::OrEquals => "|=", - TK::EqualEqual => "==", - TK::NotEqual => "!=", - TK::ShiftLeft => "<<", - TK::ShiftRight => ">>", - TK::ShiftLeftEquals => "<<=", - TK::ShiftRightEquals => ">>=", - // Lets keep these in explicitly for now, in case we want to add more symbols - TK::Ident | TK::Pragma | TK::Integer | TK::String | TK::Character | - TK::LineComment | TK::BlockComment | TK::SpanEnd => unreachable!(), - } - } -} - -pub(crate) struct Token { - pub kind: TokenKind, - pub pos: InputPosition, -} - -impl Token { - fn new(kind: TokenKind, pos: InputPosition) -> Self { - Self{ kind, pos } - } -} - -#[derive(Debug, PartialEq, Eq)] -pub(crate) enum TokenRangeKind { - Module, - Pragma, - Import, - Definition, - Code, -} - -/// TODO: Add first_child and next_sibling indices for slightly faster traversal -#[derive(Debug)] -pub(crate) struct TokenRange { - // Index of parent in `TokenBuffer.ranges`, does not have a parent if the - // range kind is Module, in that case the parent index points to itself. - pub parent_idx: usize, - pub range_kind: TokenRangeKind, - pub curly_depth: u32, - // InputPosition offset is limited to u32, so token ranges can be as well. 
-    pub start: u32, // first token (inclusive index)
-    pub end: u32, // last token (exclusive index)
-    // Subranges and where they can be found
-    pub subranges: u32, // Number of subranges
-    pub first_child_idx: u32, // First subrange (or points to self if no subranges)
-    pub last_child_idx: u32, // Last subrange (or points to self if no subranges)
-    pub next_sibling_idx: Option<u32>,
-}
-
-pub(crate) struct TokenBuffer {
-    pub tokens: Vec<Token>,
-    pub ranges: Vec<TokenRange>,
-}
-
-impl TokenBuffer {
-    pub(crate) fn new() -> Self {
-        Self{ tokens: Vec::new(), ranges: Vec::new() }
-    }
-
-    pub(crate) fn iter_range<'a>(&'a self, range: &TokenRange) -> TokenIter<'a> {
-        TokenIter::new(self, range.start as usize, range.end as usize)
-    }
-
-    pub(crate) fn start_pos(&self, range: &TokenRange) -> InputPosition {
-        self.tokens[range.start].pos
-    }
-
-    pub(crate) fn end_pos(&self, range: &TokenRange) -> InputPosition {
-        let last_token = &self.tokens[range.end - 1];
-        if last_token.kind == TokenKind::SpanEnd {
-            return last_token.pos
-        } else {
-            debug_assert!(!last_token.kind.has_span_end());
-            return last_token.pos.with_offset(last_token.kind.num_characters());
-        }
-    }
-}
-
-pub(crate) struct TokenIter<'a> {
-    tokens: &'a Vec<Token>,
-    cur: usize,
-    end: usize,
-}
-
-impl<'a> TokenIter<'a> {
-    fn new(buffer: &'a TokenBuffer, start: usize, end: usize) -> Self {
-        Self{ tokens: &buffer.tokens, cur: start, end }
-    }
-
-    /// Returns the next token (may include comments), or `None` if at the end
-    /// of the range.
-    pub(crate) fn next_including_comments(&self) -> Option<TokenKind> {
-        if self.cur >= self.end {
-            return None;
-        }
-
-        let token = &self.tokens[self.cur];
-        Some(token.kind)
-    }
-
-    /// Returns the next token (but skips over comments), or `None` if at the
-    /// end of the range
-    pub(crate) fn next(&mut self) -> Option<TokenKind> {
-        while let Some(token_kind) = self.next() {
-            if token_kind != TokenKind::LineComment && token_kind != TokenKind::BlockComment {
-                return Some(token_kind);
-            }
-            self.consume();
-        }
-
-        return None
-    }
-
-    /// Returns the start position belonging to the token returned by `next`. If
-    /// there is not a next token, then we return the end position of the
-    /// previous token.
-    pub(crate) fn last_valid_pos(&self) -> InputPosition {
-        if self.cur < self.end {
-            // Return token position
-            return self.tokens[self.cur].pos
-        }
-
-        // Return previous token end
-        let token = &self.tokens[self.cur - 1];
-        return if token.kind == TokenKind::SpanEnd {
-            token.pos
-        } else {
-            token.pos.with_offset(token.kind.num_characters());
-        };
-    }
-
-    /// Returns the token range belonging to the token returned by `next`. This
-    /// assumes that we're not at the end of the range we're iterating over.
-    pub(crate) fn next_range(&self) -> (InputPosition, InputPosition) {
-        debug_assert!(self.cur < self.end);
-        let token = &self.tokens[self.cur];
-        if token.kind.has_span_end() {
-            let span_end = &self.tokens[self.cur + 1];
-            debug_assert_eq!(span_end.kind, TokenKind::SpanEnd);
-            (token.pos, span_end.pos)
-        } else {
-            let offset = token.kind.num_characters();
-            (token.pos, token.pos.with_offset(offset))
-        }
-    }
-
-    pub(crate) fn consume(&mut self) {
-        if let Some(kind) = self.next() {
-            if kind.has_span_end() {
-                self.cur += 2;
-            } else {
-                self.cur += 1;
-            }
-        }
-    }
-}
+use crate::protocol::parser::tokens::*;
 
 /// Tokenizer is a reusable parser to tokenize multiple source files using the
 /// same allocated buffers.
In a well-formed program, we produce a consistent @@ -318,7 +16,7 @@ impl<'a> TokenIter<'a> { /// will detect this once we transform the tokens into the AST. To ensure a /// consistent AST-producing phase we will require the import to have balanced /// curly braces -pub(crate) struct Tokenizer { +pub(crate) struct PassTokenizer { // Stack of input positions of opening curly braces, used to detect // unmatched opening braces, unmatched closing braces are detected // immediately. @@ -327,7 +25,7 @@ pub(crate) struct Tokenizer { stack_idx: usize, } -impl Tokenizer { +impl PassTokenizer { pub(crate) fn new() -> Self { Self{ curly_stack: Vec::with_capacity(32), stack_idx: 0 } } @@ -348,7 +46,7 @@ impl Tokenizer { curly_depth: 0, start: 0, end: 0, - subranges: 0, + num_child_ranges: 0, first_child_idx: 0, last_child_idx: 0, next_sibling_idx: None, @@ -395,7 +93,7 @@ impl Tokenizer { if token == TokenKind::OpenCurly { self.curly_stack.push(token_pos); } else if token == TokenKind::CloseCurly { - // Check if this marks the end of a range we're + // Check if this marks the end of a range we're // currently processing if self.curly_stack.is_empty() { return Err(ParseError::new_error( @@ -445,7 +143,7 @@ impl Tokenizer { // For each range make sure its children make sense for parent_idx in 0..target.ranges.len() { let cur_range = &target.ranges[parent_idx]; - if cur_range.subranges == 0 { + if cur_range.num_child_ranges == 0 { assert_eq!(cur_range.first_child_idx, parent_idx as u32); assert_eq!(cur_range.last_child_idx, parent_idx as u32); } else { @@ -463,7 +161,7 @@ impl Tokenizer { } assert_eq!(cur_range.last_child_idx, last_child_idx); - assert_eq!(cur_range.subranges, child_counter); + assert_eq!(cur_range.num_child_ranges, child_counter); } } } @@ -637,7 +335,7 @@ impl Tokenizer { token_kind = TokenKind::CloseSquare; } else if first_char == b'^' { source.consume(); - if let Some(b'=') = source.next() { + if let Some(b'=') = source.next() { source.consume(); token_kind = TokenKind::CaretEquals; } else { @@ -683,7 +381,7 @@ impl Tokenizer { return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in char literal")); } source.consume(); - + // Make sure ending quote was not escaped if c == b'\'' && prev_char != b'\\' { prev_char = c; @@ -865,7 +563,7 @@ impl Tokenizer { let end_pos = source.pos(); target.tokens.push(Token::new(TokenKind::Ident, begin_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); - Ok(source.section(begin_pos, end_pos)) + Ok(source.section_at_pos(begin_pos, end_pos)) } fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { @@ -915,9 +613,9 @@ impl Tokenizer { let cur_range = &mut target.ranges[self.stack_idx]; // If we have just popped a range and then push a new range, then the - // first token is equal to the last token registered on the current - // range. If not, then we had some intermediate tokens that did not - // belong to a particular kind of token range: hence we insert an + // first token is equal to the last token registered on the current + // range. If not, then we had some intermediate tokens that did not + // belong to a particular kind of token range: hence we insert an // intermediate "code" range. 
if cur_range.end != first_token { let code_start = cur_range.end; @@ -931,14 +629,14 @@ impl Tokenizer { cur_range.last_child_idx = code_range_idx + 1; cur_range.end = first_token; - cur_range.subranges += 1; + cur_range.num_child_ranges += 1; target.ranges.push(TokenRange{ parent_idx: self.stack_idx, range_kind: TokenRangeKind::Code, curly_depth: self.curly_depth, start: code_start, end: first_token, - subranges: 0, + num_child_ranges: 0, first_child_idx: code_range_idx, last_child_idx: code_range_idx, next_sibling_idx: Some(code_range_idx + 1), // we're going to push this thing next @@ -964,7 +662,7 @@ impl Tokenizer { curly_depth: self.curly_depth, start: first_token, end: first_token, - subranges: 0, + num_child_ranges: 0, first_child_idx: new_range_idx, last_child_idx: new_range_idx, next_sibling_idx: None, @@ -978,12 +676,12 @@ impl Tokenizer { // Fix up the current range before going back to parent last.end = end_index; debug_assert_ne!(last.start, end_index); - + // Go back to parent self.stack_idx = last.parent_idx; let parent = &mut target.ranges[self.stack_idx]; parent.end = end_index; - parent.subranges += 1; + parent.num_child_ranges += 1; } @@ -1003,11 +701,11 @@ impl Tokenizer { fn demarks_definition(ident: &[u8]) -> bool { return ident == KW_STRUCT || - ident == KW_ENUM || - ident == KW_UNION || - ident == KW_FUNCTION || - ident == KW_PRIMITIVE || - ident == KW_COMPOSITE + ident == KW_ENUM || + ident == KW_UNION || + ident == KW_FUNCTION || + ident == KW_PRIMITIVE || + ident == KW_COMPOSITE } fn demarks_import(ident: &[u8]) -> bool { @@ -1031,18 +729,18 @@ fn is_pragma_start_or_pound(c: u8) -> bool { } fn is_identifier_start(c: u8) -> bool { - return + return (c >= b'a' && c <= b'z') || - (c >= b'A' && c <= b'Z') || - c == b'_' + (c >= b'A' && c <= b'Z') || + c == b'_' } fn is_identifier_remaining(c: u8) -> bool { - return + return (c >= b'0' && c <= b'9') || - (c >= b'a' && c <= b'z') || - (c >= b'A' && c <= b'Z') || - c == b'_' + (c >= b'a' && c <= b'z') || + (c >= b'A' && c <= b'Z') || + c == b'_' } fn is_integer_literal_start(c: u8) -> bool { @@ -1050,10 +748,10 @@ fn is_integer_literal_start(c: u8) -> bool { } fn maybe_number_remaining(c: u8) -> bool { - return + return (c == b'b' || c == b'B' || c == b'o' || c == b'O' || c == b'x' || c == b'X') || - (c >= b'0' && c <= b'9') || - c == b'_'; + (c >= b'0' && c <= b'9') || + c == b'_'; } #[cfg(test)] @@ -1064,7 +762,7 @@ mod tests { #[test] fn test_tokenizer() { let mut source = InputSource::new_test(" - + #version 500 # hello 2 @@ -1100,7 +798,7 @@ mod tests { } } "); - let mut t = Tokenizer::new(); + let mut t = PassTokenizer::new(); let mut buffer = TokenBuffer::new(); t.tokenize(&mut source, &mut buffer).expect("tokenize"); @@ -1119,7 +817,7 @@ mod tests { let (_, end) = iter.next().unwrap(); println!("[{}] {:?} ......", idx, token.kind); assert_eq!(end.kind, TokenKind::SpanEnd); - let text = source.section(token.pos, end.pos); + let text = source.section_at_pos(token.pos, end.pos); println!("{}", String::from_utf8_lossy(text)); }, _ => { diff --git a/src/protocol/parser/symbol_table.rs b/src/protocol/parser/symbol_table.rs index b12a28b14736b03d1840194f7a2f9c4a21e79987..43d7c6d22d7cc859ed1d851d25fc00899012fdaa 100644 --- a/src/protocol/parser/symbol_table.rs +++ b/src/protocol/parser/symbol_table.rs @@ -123,7 +123,7 @@ impl SymbolTable { Import::Symbols(import) => { if import.symbols.is_empty() { // Add all symbols from the other module - match self.module_lookup.get(&import.module_name) { + match 
self.module_lookup.get(&import.module) {
                             Some(target_module_id) => {
                                 lookup_reserve_size += heap[*target_module_id].definitions.len()
                             },
@@ -174,7 +174,7 @@ impl SymbolTable {
             match import {
                 Import::Module(import) => {
                     // Find the module using its name
-                    let target_root_id = self.resolve_module(&import.module_name);
+                    let target_root_id = self.resolve_module(&import.module);
                     if target_root_id.is_none() {
                         return Err(ParseError::new_error(&module.source, import.position, "Could not resolve module"));
                     }
@@ -196,7 +196,7 @@ impl SymbolTable {
                 },
                 Import::Symbols(import) => {
                     // Find the target module using its name
-                    let target_root_id = self.resolve_module(&import.module_name);
+                    let target_root_id = self.resolve_module(&import.module);
                     if target_root_id.is_none() {
                         return Err(ParseError::new_error(&module.source, import.position, "Could not resolve module of symbol imports"));
                     }
diff --git a/src/protocol/parser/symbol_table2.rs b/src/protocol/parser/symbol_table2.rs
index bc40c7b5b9f2666321f7903f2e98c7246d7bbd21..967c44cde8d40bb846caa0977afd513f46cfa239 100644
--- a/src/protocol/parser/symbol_table2.rs
+++ b/src/protocol/parser/symbol_table2.rs
@@ -88,11 +88,13 @@ impl SymbolDefinition {
     }
 }
 
+#[derive(Debug, Clone)]
 pub struct SymbolModule {
     pub root_id: RootId,
     pub introduced_at: ImportId,
 }
 
+#[derive(Debug, Clone)]
 pub struct SymbolDefinition {
     // Definition location (not necessarily the place where the symbol
     // is introduced, as it may be imported)
@@ -108,19 +110,53 @@ pub struct SymbolDefinition {
     pub definition_id: DefinitionId,
 }
 
+impl SymbolDefinition {
+    /// Consumes this definition and returns it again with the `imported_at`
+    /// field replaced by the supplied `ImportId`. Callers that need to keep
+    /// the original should clone it first.
+    pub(crate) fn into_imported(mut self, imported_at: ImportId) -> Self {
+        self.imported_at = Some(imported_at);
+        self
+    }
+}
+
+#[derive(Debug, Clone)]
 pub enum SymbolVariant {
     Module(SymbolModule),
     Definition(SymbolDefinition),
 }
 
+impl SymbolVariant {
+    pub(crate) fn as_module(&self) -> &SymbolModule {
+        match self {
+            SymbolVariant::Module(v) => v,
+            SymbolVariant::Definition(_) => unreachable!("called 'as_module' on {:?}", self),
+        }
+    }
+
+    pub(crate) fn as_definition(&self) -> &SymbolDefinition {
+        match self {
+            SymbolVariant::Module(_) => unreachable!("called 'as_definition' on {:?}", self),
+            SymbolVariant::Definition(v) => v,
+        }
+    }
+
+    pub(crate) fn as_definition_mut(&mut self) -> &mut SymbolDefinition {
+        match self {
+            SymbolVariant::Module(_) => unreachable!("called 'as_definition_mut' on {:?}", self),
+            SymbolVariant::Definition(v) => v,
+        }
+    }
+}
+
+#[derive(Clone)]
 pub struct Symbol {
     pub name: StringRef<'static>,
-    pub data: SymbolVariant,
+    pub variant: SymbolVariant,
 }
 
 impl Symbol {
-    fn class(&self) -> SymbolClass {
-        match &self.data {
+    pub(crate) fn class(&self) -> SymbolClass {
+        match &self.variant {
             SymbolVariant::Module(_) => SymbolClass::Module,
             SymbolVariant::Definition(data) => data.class.as_symbol_class(),
         }
@@ -236,7 +272,7 @@ impl SymbolTable {
     ) -> Option<&Symbol> {
         match self.get_symbol_by_name(in_scope, name) {
             Some(symbol) => {
                match &symbol.variant {
                     SymbolVariant::Module(_) => {
                         None // in-scope modules are always imported
                     },
@@ -252,4 +288,27 @@ impl SymbolTable {
             None => None,
         }
    }
+
+    /// Retrieves all symbols that are defined within a particular scope.
+    /// Imported symbols are ignored. Returns `true` if the scope was found
+    /// (it may still contain zero defined symbols) and `false` if the scope
+    /// was not found.
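+    ///
+    /// A minimal usage sketch (the `table`, `root_id`, and scratch `buffer`
+    /// names are hypothetical):
+    ///
+    /// ```ignore
+    /// let mut buffer = Vec::new();
+    /// if table.get_all_symbols_defined_in_scope(SymbolScope::Module(root_id), &mut buffer) {
+    ///     // `buffer` now holds clones of all non-imported symbols in the module
+    /// }
+    /// ```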
+    pub(crate) fn get_all_symbols_defined_in_scope(&self, in_scope: SymbolScope, target: &mut Vec<Symbol>) -> bool {
+        match self.scope_lookup.get(&in_scope) {
+            Some(scope) => {
+                for symbol in &scope.symbols {
+                    if let SymbolVariant::Definition(definition) = &symbol.variant {
+                        if definition.imported_at.is_some() {
+                            continue;
+                        }
+
+                        // Defined in scope, so push onto target
+                        target.push(symbol.clone());
+                    }
+                }
+
+                true
+            },
+            None => false,
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/protocol/parser/token_parsing.rs b/src/protocol/parser/token_parsing.rs
new file mode 100644
index 0000000000000000000000000000000000000000..bf7b4e734bde19af993f20eb553d449ac2b2584b
--- /dev/null
+++ b/src/protocol/parser/token_parsing.rs
@@ -0,0 +1,397 @@
+use crate::collections::StringRef;
+use crate::protocol::ast::*;
+use crate::protocol::input_source2::{
+    InputSource2 as InputSource,
+    InputPosition2 as InputPosition,
+    InputSpan,
+    ParseError,
+};
+use super::tokens::*;
+use super::symbol_table2::*;
+use super::{Module, ModuleCompilationPhase, PassCtx};
+
+// Keywords
+pub(crate) const KW_LET: &'static [u8] = b"let";
+pub(crate) const KW_AS: &'static [u8] = b"as";
+pub(crate) const KW_STRUCT: &'static [u8] = b"struct";
+pub(crate) const KW_ENUM: &'static [u8] = b"enum";
+pub(crate) const KW_UNION: &'static [u8] = b"union";
+pub(crate) const KW_FUNCTION: &'static [u8] = b"function";
+pub(crate) const KW_PRIMITIVE: &'static [u8] = b"primitive";
+pub(crate) const KW_COMPOSITE: &'static [u8] = b"composite";
+pub(crate) const KW_IMPORT: &'static [u8] = b"import";
+
+// Keywords - literals
+pub(crate) const KW_LIT_TRUE: &'static [u8] = b"true";
+pub(crate) const KW_LIT_FALSE: &'static [u8] = b"false";
+pub(crate) const KW_LIT_NULL: &'static [u8] = b"null";
+
+// Keywords - functions
+pub(crate) const KW_FUNC_GET: &'static [u8] = b"get";
+pub(crate) const KW_FUNC_PUT: &'static [u8] = b"put";
+pub(crate) const KW_FUNC_FIRES: &'static [u8] = b"fires";
+pub(crate) const KW_FUNC_CREATE: &'static [u8] = b"create";
+pub(crate) const KW_FUNC_LENGTH: &'static [u8] = b"length";
+
+// Keywords - statements
+pub(crate) const KW_STMT_CHANNEL: &'static [u8] = b"channel";
+pub(crate) const KW_STMT_IF: &'static [u8] = b"if";
+pub(crate) const KW_STMT_WHILE: &'static [u8] = b"while";
+pub(crate) const KW_STMT_BREAK: &'static [u8] = b"break";
+pub(crate) const KW_STMT_CONTINUE: &'static [u8] = b"continue";
+pub(crate) const KW_STMT_GOTO: &'static [u8] = b"goto";
+pub(crate) const KW_STMT_RETURN: &'static [u8] = b"return";
+pub(crate) const KW_STMT_SYNC: &'static [u8] = b"synchronous";
+pub(crate) const KW_STMT_ASSERT: &'static [u8] = b"assert";
+pub(crate) const KW_STMT_NEW: &'static [u8] = b"new";
+
+// Keywords - types
+pub(crate) const KW_TYPE_IN_PORT: &'static [u8] = b"in";
+pub(crate) const KW_TYPE_OUT_PORT: &'static [u8] = b"out";
+pub(crate) const KW_TYPE_MESSAGE: &'static [u8] = b"msg";
+pub(crate) const KW_TYPE_BOOL: &'static [u8] = b"bool";
+pub(crate) const KW_TYPE_UINT8: &'static [u8] = b"u8";
+pub(crate) const KW_TYPE_UINT16: &'static [u8] = b"u16";
+pub(crate) const KW_TYPE_UINT32: &'static [u8] = b"u32";
+pub(crate) const KW_TYPE_UINT64: &'static [u8] = b"u64";
+pub(crate) const KW_TYPE_SINT8: &'static [u8] = b"s8";
+pub(crate) const KW_TYPE_SINT16: &'static [u8] = b"s16";
+pub(crate) const KW_TYPE_SINT32: &'static [u8] = b"s32";
+pub(crate) const KW_TYPE_SINT64: &'static [u8] = b"s64";
+pub(crate) const KW_TYPE_CHAR: &'static [u8] = b"char";
+pub(crate) const KW_TYPE_STRING: &'static [u8] = b"string";
+pub(crate) const KW_TYPE_INFERRED: &'static [u8] = b"auto";
+
+/// Consumes a domain-name identifier: identifiers separated by dots. To
+/// simplify later parsing and span identification, the domain-name may
+/// contain whitespace, but must reside on a single line.
+pub(crate) fn consume_domain_ident<'a>(
+    source: &'a InputSource, iter: &mut TokenIter
+) -> Result<(&'a [u8], InputSpan), ParseError> {
+    let (_, mut span) = consume_ident(source, iter)?;
+    while let Some(TokenKind::Dot) = iter.next() {
+        iter.consume();
+        let (_, new_span) = consume_ident(source, iter)?;
+        span.end = new_span.end;
+    }
+
+    // Not strictly necessary, but probably a reasonable restriction: this
+    // simplifies parsing of module naming and imports.
+    if span.begin.line != span.end.line {
+        return Err(ParseError::new_error_str_at_span(source, span, "module names may not span multiple lines"));
+    }
+
+    // If the module name consists of a single identifier, then it may not
+    // match any of the reserved keywords
+    let section = source.section_at_pos(span.begin, span.end);
+    if is_reserved_keyword(section) {
+        return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"));
+    }
+
+    Ok((section, span))
+}
+
+/// Consumes a specific expected token. Be careful to only call this with
+/// tokens that do not have a variable length.
+pub(crate) fn consume_token(source: &InputSource, iter: &mut TokenIter, expected: TokenKind) -> Result<(), ParseError> {
+    if Some(expected) != iter.next() {
+        return Err(ParseError::new_error_at_pos(
+            source, iter.last_valid_pos(),
+            format!("expected '{}'", expected.token_chars())
+        ));
+    }
+    iter.consume();
+    Ok(())
+}
+
+/// Consumes a comma-separated list of items if the opening delimiting token is
+/// encountered. If not, then the iterator will remain at its current position.
+/// Note that the potential cases are:
+/// - No opening delimiter encountered: we return `false`.
+/// - Both opening and closing delimiter encountered, but no items.
+/// - Opening and closing delimiter encountered, and items were processed.
+/// - Found an opening delimiter, but processing an item failed.
+pub(crate) fn maybe_consume_comma_separated<T, F>(
+    open_delim: TokenKind, close_delim: TokenKind, source: &InputSource, iter: &mut TokenIter,
+    consumer_fn: F, target: &mut Vec<T>, item_name_and_article: &'static str
+) -> Result<bool, ParseError>
+    where F: Fn(&InputSource, &mut TokenIter) -> Result<T, ParseError>
+{
+    let mut next = iter.next();
+    if Some(open_delim) != next {
+        return Ok(false);
+    }
+
+    // Opening delimiter encountered, so must parse the comma-separated list.
+    iter.consume();
+    target.clear();
+    let mut had_comma = true;
+    loop {
+        next = iter.next();
+        if Some(close_delim) == next {
+            iter.consume();
+            break;
+        } else if !had_comma {
+            return Err(ParseError::new_error_at_pos(
+                source, iter.last_valid_pos(),
+                format!("expected a '{}', or {}", close_delim.token_chars(), item_name_and_article)
+            ));
+        }
+
+        let new_item = consumer_fn(source, iter)?;
+        target.push(new_item);
+
+        next = iter.next();
+        had_comma = next == Some(TokenKind::Comma);
+        if had_comma {
+            iter.consume();
+        }
+    }
+
+    Ok(true)
+}
+
+/// Consumes a comma-separated list and expects the opening and closing
+/// delimiters to be present. The target vector may still end up empty.
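+///
+/// A usage sketch (the `consume_field` item parser and `fields` buffer are
+/// hypothetical), e.g. for a curly-brace delimited field list:
+///
+/// ```ignore
+/// consume_comma_separated(
+///     TokenKind::OpenCurly, TokenKind::CloseCurly, source, iter,
+///     consume_field, &mut fields, "a field", "a field list"
+/// )?;
+/// ```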
+pub(crate) fn consume_comma_separated<T, F>(
+    open_delim: TokenKind, close_delim: TokenKind, source: &InputSource, iter: &mut TokenIter,
+    consumer_fn: F, target: &mut Vec<T>, item_name_and_article: &'static str,
+    list_name_and_article: &'static str
+) -> Result<(), ParseError>
+    where F: Fn(&InputSource, &mut TokenIter) -> Result<T, ParseError>
+{
+    let first_pos = iter.last_valid_pos();
+    match maybe_consume_comma_separated(open_delim, close_delim, source, iter, consumer_fn, target, item_name_and_article) {
+        Ok(true) => Ok(()),
+        Ok(false) => Err(ParseError::new_error_at_pos(
+            source, first_pos,
+            format!("expected a {}", list_name_and_article)
+        )),
+        Err(err) => Err(err)
+    }
+}
+
+/// Consumes an integer literal. The literal may be binary, octal, hexadecimal
+/// or decimal, and may contain separating '_'-characters.
+pub(crate) fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> {
+    if Some(TokenKind::Integer) != iter.next() {
+        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal"));
+    }
+    let integer_span = iter.next_span();
+    iter.consume();
+
+    let integer_text = source.section_at_span(integer_span);
+
+    // Determine radix and offset from prefix
+    let (radix, input_offset, radix_name) =
+        if integer_text.starts_with(b"0b") || integer_text.starts_with(b"0B") {
+            // Binary number
+            (2, 2, "binary")
+        } else if integer_text.starts_with(b"0o") || integer_text.starts_with(b"0O") {
+            // Octal number
+            (8, 2, "octal")
+        } else if integer_text.starts_with(b"0x") || integer_text.starts_with(b"0X") {
+            // Hexadecimal number
+            (16, 2, "hexadecimal")
+        } else {
+            (10, 0, "decimal")
+        };
+
+    // Take out any of the separating '_' characters
+    buffer.clear();
+    for char_idx in input_offset..integer_text.len() {
+        let char = integer_text[char_idx];
+        if char == b'_' {
+            continue;
+        }
+        // Accept any hexadecimal digit here (e.g. `a`-`f` for hexadecimal
+        // literals); `from_str_radix` below performs the per-radix validation.
+        if !char.is_ascii_hexdigit() {
+            return Err(ParseError::new_error_at_span(
+                source, integer_span,
+                format!("incorrectly formatted {} number", radix_name)
+            ));
+        }
+        buffer.push(char::from(char));
+    }
+
+    // Use the cleaned up string to convert to integer
+    match u64::from_str_radix(&buffer, radix) {
+        Ok(number) => Ok((number, integer_span)),
+        Err(_) => Err(ParseError::new_error_at_span(
+            source, integer_span,
+            format!("incorrectly formatted {} number", radix_name)
+        )),
+    }
+}
+
+pub(crate) fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> {
+    if Some(TokenKind::Pragma) != iter.next() {
+        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a pragma"));
+    }
+    let (pragma_start, pragma_end) = iter.next_positions();
+    iter.consume();
+    Ok((source.section_at_pos(pragma_start, pragma_end), pragma_start, pragma_end))
+}
+
+pub(crate) fn has_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> bool {
+    peek_ident(source, iter).map_or(false, |section| section == expected)
+}
+
+pub(crate) fn peek_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Option<&'a [u8]> {
+    if Some(TokenKind::Ident) == iter.next() {
+        let (start, end) = iter.next_positions();
+        return Some(source.section_at_pos(start, end))
+    }
+
+    None
+}
+
+/// Consumes any identifier and returns it together with its span. Does not
+/// check whether the identifier is a reserved keyword.
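+///
+/// A small sketch of the returned pair's invariant (the returned slice is
+/// exactly the source section covered by the returned span):
+///
+/// ```ignore
+/// let (text, span) = consume_any_ident(source, iter)?;
+/// debug_assert_eq!(text, source.section_at_pos(span.begin, span.end));
+/// ```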
+pub(crate) fn consume_any_ident<'a>(
+    source: &'a InputSource, iter: &mut TokenIter
+) -> Result<(&'a [u8], InputSpan), ParseError> {
+    if Some(TokenKind::Ident) != iter.next() {
+        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an identifier"));
+    }
+    let (ident_start, ident_end) = iter.next_positions();
+    iter.consume();
+    Ok((source.section_at_pos(ident_start, ident_end), InputSpan::from_positions(ident_start, ident_end)))
+}
+
+/// Consumes a specific identifier. May or may not be a reserved keyword.
+pub(crate) fn consume_exact_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> Result<InputSpan, ParseError> {
+    let (ident, span) = consume_any_ident(source, iter)?;
+    if ident != expected {
+        debug_assert!(expected.is_ascii());
+        return Err(ParseError::new_error_at_pos(
+            source, iter.last_valid_pos(),
+            format!("expected the text '{}'", &String::from_utf8_lossy(expected))
+        ));
+    }
+    Ok(span)
+}
+
+/// Consumes an identifier that is not a reserved keyword and returns it
+/// together with its span.
+pub(crate) fn consume_ident<'a>(
+    source: &'a InputSource, iter: &mut TokenIter
+) -> Result<(&'a [u8], InputSpan), ParseError> {
+    let (ident, span) = consume_any_ident(source, iter)?;
+    if is_reserved_keyword(ident) {
+        return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"));
+    }
+
+    Ok((ident, span))
+}
+
+/// Consumes an identifier and immediately interns it into the `StringPool`.
+pub(crate) fn consume_ident_interned(
+    source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx
+) -> Result<Identifier, ParseError> {
+    let (value, span) = consume_ident(source, iter)?;
+    let value = ctx.pool.intern(value);
+    Ok(Identifier{ span, value })
+}
+
+fn is_reserved_definition_keyword(text: &[u8]) -> bool {
+    match text {
+        KW_STRUCT | KW_ENUM | KW_UNION | KW_FUNCTION | KW_PRIMITIVE | KW_COMPOSITE => true,
+        _ => false,
+    }
+}
+
+fn is_reserved_statement_keyword(text: &[u8]) -> bool {
+    match text {
+        KW_IMPORT | KW_AS |
+        KW_STMT_CHANNEL | KW_STMT_IF | KW_STMT_WHILE |
+        KW_STMT_BREAK | KW_STMT_CONTINUE | KW_STMT_GOTO | KW_STMT_RETURN |
+        KW_STMT_SYNC | KW_STMT_ASSERT | KW_STMT_NEW => true,
+        _ => false,
+    }
+}
+
+fn is_reserved_expression_keyword(text: &[u8]) -> bool {
+    match text {
+        KW_LET |
+        KW_LIT_TRUE | KW_LIT_FALSE | KW_LIT_NULL |
+        KW_FUNC_GET | KW_FUNC_PUT | KW_FUNC_FIRES | KW_FUNC_CREATE | KW_FUNC_LENGTH => true,
+        _ => false,
+    }
+}
+
+fn is_reserved_type_keyword(text: &[u8]) -> bool {
+    match text {
+        KW_TYPE_IN_PORT | KW_TYPE_OUT_PORT | KW_TYPE_MESSAGE | KW_TYPE_BOOL |
+        KW_TYPE_UINT8 | KW_TYPE_UINT16 | KW_TYPE_UINT32 | KW_TYPE_UINT64 |
+        KW_TYPE_SINT8 | KW_TYPE_SINT16 | KW_TYPE_SINT32 | KW_TYPE_SINT64 |
+        KW_TYPE_CHAR | KW_TYPE_STRING |
+        KW_TYPE_INFERRED => true,
+        _ => false,
+    }
+}
+
+fn is_reserved_keyword(text: &[u8]) -> bool {
+    return
+        is_reserved_definition_keyword(text) ||
+        is_reserved_statement_keyword(text) ||
+        is_reserved_expression_keyword(text) ||
+        is_reserved_type_keyword(text);
+}
+
+pub(crate) fn seek_module(modules: &[Module], root_id: RootId) -> Option<&Module> {
+    for module in modules {
+        if module.root_id == root_id {
+            return Some(module)
+        }
+    }
+
+    return None
+}
+
+/// Constructs a human-readable message indicating why there is a conflict of
+/// symbols.
+// Note: passing the `module_idx` is not strictly necessary, but guards
+// against programmer mistakes during development: the conflict must arise in
+// the module we are currently parsing.
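+// As a sketch (the exact rendering depends on how `ParseError` is formatted),
+// a clash on a type `Foo` produces messages along the lines of:
+//   error: symbol is defined twice: the type 'Foo' imported here
+//   info:  it conflicts with the type 'Foo' defined here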
+pub(crate) fn construct_symbol_conflict_error(
+    modules: &[Module], module_idx: usize, ctx: &PassCtx, new_symbol: &Symbol, old_symbol: &Symbol
+) -> ParseError {
+    let module = &modules[module_idx];
+    let get_symbol_span_and_msg = |symbol: &Symbol| -> (String, InputSpan) {
+        match &symbol.variant {
+            SymbolVariant::Module(symbol_module) => {
+                // Module symbols are always introduced by an import
+                let import = ctx.heap[symbol_module.introduced_at].as_module();
+                (
+                    format!("the module aliased as '{}' imported here", symbol.name.as_str()),
+                    import.span
+                )
+            },
+            SymbolVariant::Definition(definition) => match definition.imported_at {
+                Some(import_id) => {
+                    // Symbol is introduced by a symbol import
+                    let import = ctx.heap[import_id].as_symbols();
+                    (
+                        format!("the type '{}' imported here", symbol.name.as_str()),
+                        import.span
+                    )
+                },
+                None => {
+                    // Symbol is defined in the module itself
+                    debug_assert_eq!(definition.defined_in_module, module.root_id);
+                    (
+                        format!("the type '{}' defined here", symbol.name.as_str()),
+                        definition.identifier_span
+                    )
+                },
+            },
+        }
+    };
+
+    let (new_symbol_msg, new_symbol_span) = get_symbol_span_and_msg(new_symbol);
+    let (old_symbol_msg, old_symbol_span) = get_symbol_span_and_msg(old_symbol);
+    return ParseError::new_error_at_span(
+        &module.source, new_symbol_span, format!("symbol is defined twice: {}", new_symbol_msg)
+    ).with_info_at_span(
+        &module.source, old_symbol_span, format!("it conflicts with {}", old_symbol_msg)
+    )
+}
\ No newline at end of file
diff --git a/src/protocol/parser/tokens.rs b/src/protocol/parser/tokens.rs
new file mode 100644
index 0000000000000000000000000000000000000000..710548b32f8365e009f48646c294251faadbb1f5
--- /dev/null
+++ b/src/protocol/parser/tokens.rs
@@ -0,0 +1,315 @@
+use crate::protocol::input_source2::{
+    InputPosition2 as InputPosition,
+    InputSpan
+};
+
+/// Represents a particular kind of token. Some tokens are variable-length;
+/// such a token is always followed by a `TokenKind::SpanEnd` token.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) enum TokenKind {
+    // Variable-character tokens, followed by a SpanEnd token
+    Ident,          // regular identifier
+    Pragma,         // identifier with prefixed `#`, range includes `#`
+    Integer,        // integer literal
+    String,         // string literal, range includes `"`
+    Character,      // character literal, range includes `'`
+    LineComment,    // line comment, range includes leading `//`, but not newline
+    BlockComment,   // block comment, range includes leading `/*` and trailing `*/`
+    // Punctuation (single character)
+    Exclamation,    // !
+    Question,       // ?
+    Pound,          // #
+    OpenAngle,      // <
+    OpenCurly,      // {
+    OpenParen,      // (
+    OpenSquare,     // [
+    CloseAngle,     // >
+    CloseCurly,     // }
+    CloseParen,     // )
+    CloseSquare,    // ]
+    Colon,          // :
+    Comma,          // ,
+    Dot,            // .
+    SemiColon,      // ;
+    Quote,          // '
+    DoubleQuote,    // "
+    // Operator-like (single character)
+    At,             // @
+    Plus,           // +
+    Minus,          // -
+    Star,           // *
+    Slash,          // /
+    Percent,        // %
+    Caret,          // ^
+    And,            // &
+    Or,             // |
+    Tilde,          // ~
+    Equal,          // =
+    // Punctuation (two characters)
+    ColonColon,     // ::
+    DotDot,         // ..
+    ArrowRight,     // ->
+    // Operator-like (two characters)
+    PlusPlus,       // ++
+    PlusEquals,     // +=
+    MinusMinus,     // --
+    MinusEquals,    // -=
+    StarEquals,     // *=
+    SlashEquals,    // /=
+    PercentEquals,  // %=
+    CaretEquals,    // ^=
+    AndAnd,         // &&
+    AndEquals,      // &=
+    OrOr,           // ||
+    OrEquals,       // |=
+    EqualEqual,     // ==
+    NotEqual,       // !=
+    ShiftLeft,      // <<
+    ShiftRight,     // >>
+    // Operator-like (three characters)
+    ShiftLeftEquals,    // <<=
+    ShiftRightEquals,   // >>=
+    // Special marker token to indicate end of variable-character tokens
+    SpanEnd,
+}
+
+impl TokenKind {
+    /// Returns true if the next expected token is the special `TokenKind::SpanEnd` token. This is
+    /// the case for tokens of variable length (e.g. an identifier).
+    fn has_span_end(&self) -> bool {
+        return *self <= TokenKind::BlockComment
+    }
+
+    /// Returns the number of characters associated with the token. May only be called on tokens
+    /// that do not have a variable length.
+    fn num_characters(&self) -> u32 {
+        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
+        if *self <= TokenKind::Equal {
+            1
+        } else if *self <= TokenKind::ShiftRight {
+            2
+        } else {
+            3
+        }
+    }
+
+    /// Returns the characters that are represented by the token; may only be called on tokens
+    /// that do not have a variable length.
+    pub fn token_chars(&self) -> &'static str {
+        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
+        use TokenKind as TK;
+        match self {
+            TK::Exclamation => "!",
+            TK::Question => "?",
+            TK::Pound => "#",
+            TK::OpenAngle => "<",
+            TK::OpenCurly => "{",
+            TK::OpenParen => "(",
+            TK::OpenSquare => "[",
+            TK::CloseAngle => ">",
+            TK::CloseCurly => "}",
+            TK::CloseParen => ")",
+            TK::CloseSquare => "]",
+            TK::Colon => ":",
+            TK::Comma => ",",
+            TK::Dot => ".",
+            TK::SemiColon => ";",
+            TK::Quote => "'",
+            TK::DoubleQuote => "\"",
+            TK::At => "@",
+            TK::Plus => "+",
+            TK::Minus => "-",
+            TK::Star => "*",
+            TK::Slash => "/",
+            TK::Percent => "%",
+            TK::Caret => "^",
+            TK::And => "&",
+            TK::Or => "|",
+            TK::Tilde => "~",
+            TK::Equal => "=",
+            TK::ColonColon => "::",
+            TK::DotDot => "..",
+            TK::ArrowRight => "->",
+            TK::PlusPlus => "++",
+            TK::PlusEquals => "+=",
+            TK::MinusMinus => "--",
+            TK::MinusEquals => "-=",
+            TK::StarEquals => "*=",
+            TK::SlashEquals => "/=",
+            TK::PercentEquals => "%=",
+            TK::CaretEquals => "^=",
+            TK::AndAnd => "&&",
+            TK::AndEquals => "&=",
+            TK::OrOr => "||",
+            TK::OrEquals => "|=",
+            TK::EqualEqual => "==",
+            TK::NotEqual => "!=",
+            TK::ShiftLeft => "<<",
+            TK::ShiftRight => ">>",
+            TK::ShiftLeftEquals => "<<=",
+            TK::ShiftRightEquals => ">>=",
+            // Let's keep these explicit for now, in case we want to add more symbols
+            TK::Ident | TK::Pragma | TK::Integer | TK::String | TK::Character |
+            TK::LineComment | TK::BlockComment | TK::SpanEnd => unreachable!(),
+        }
+    }
+}
+
+/// Represents a single token at a particular position.
+pub(crate) struct Token {
+    pub kind: TokenKind,
+    pub pos: InputPosition,
+}
+
+impl Token {
+    pub(crate) fn new(kind: TokenKind, pos: InputPosition) -> Self {
+        Self{ kind, pos }
+    }
+}
+
+/// The kinds of token ranges that are specially parsed by the tokenizer.
+#[derive(Debug, PartialEq, Eq)]
+pub(crate) enum TokenRangeKind {
+    Module,
+    Pragma,
+    Import,
+    Definition,
+    Code,
+}
+
+/// A range of tokens with a specific meaning. Such a range is part of a tree
+/// where each parent range envelops all of its children.
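+///
+/// As a sketch (the token indices below are hypothetical), a file holding a
+/// `#version` pragma followed by a struct definition produces:
+///
+/// ```text
+/// Module     { start: 0, end: 14, num_child_ranges: 2 }
+///   Pragma     { start: 0, end: 3 }   // first child
+///   Definition { start: 3, end: 14 }  // last child
+/// ```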
+#[derive(Debug)]
+pub(crate) struct TokenRange {
+    // Index of the parent in `TokenBuffer.ranges`. A range only lacks a
+    // parent if its kind is Module; in that case the parent index points to
+    // itself.
+    pub parent_idx: usize,
+    pub range_kind: TokenRangeKind,
+    pub curly_depth: u32,
+    // Offsets into `TokenBuffer.tokens`: the tokens belonging to this range.
+    pub start: u32, // first token (inclusive index)
+    pub end: u32, // last token (exclusive index)
+    // Child ranges
+    pub num_child_ranges: u32, // Number of subranges
+    pub first_child_idx: u32, // First subrange (or points to self if no subranges)
+    pub last_child_idx: u32, // Last subrange (or points to self if no subranges)
+    pub next_sibling_idx: Option<u32>,
+}
+
+pub(crate) struct TokenBuffer {
+    pub tokens: Vec<Token>,
+    pub ranges: Vec<TokenRange>,
+}
+
+impl TokenBuffer {
+    pub(crate) fn new() -> Self {
+        Self{ tokens: Vec::new(), ranges: Vec::new() }
+    }
+
+    pub(crate) fn iter_range<'a>(&'a self, range: &TokenRange) -> TokenIter<'a> {
+        TokenIter::new(self, range.start as usize, range.end as usize)
+    }
+
+    pub(crate) fn start_pos(&self, range: &TokenRange) -> InputPosition {
+        self.tokens[range.start as usize].pos
+    }
+
+    pub(crate) fn end_pos(&self, range: &TokenRange) -> InputPosition {
+        let last_token = &self.tokens[range.end as usize - 1];
+        if last_token.kind == TokenKind::SpanEnd {
+            return last_token.pos
+        } else {
+            debug_assert!(!last_token.kind.has_span_end());
+            return last_token.pos.with_offset(last_token.kind.num_characters());
+        }
+    }
+}
+
+/// Iterator over tokens within a specific `TokenRange`.
+pub(crate) struct TokenIter<'a> {
+    tokens: &'a Vec<Token>,
+    cur: usize,
+    end: usize,
+}
+
+impl<'a> TokenIter<'a> {
+    fn new(buffer: &'a TokenBuffer, start: usize, end: usize) -> Self {
+        Self{ tokens: &buffer.tokens, cur: start, end }
+    }
+
+    /// Returns the next token (which may be a comment), or `None` if at the
+    /// end of the range.
+    pub(crate) fn next_including_comments(&self) -> Option<TokenKind> {
+        if self.cur >= self.end {
+            return None;
+        }
+
+        let token = &self.tokens[self.cur];
+        Some(token.kind)
+    }
+
+    /// Returns the next token (but skips over comments), or `None` if at the
+    /// end of the range.
+    pub(crate) fn next(&mut self) -> Option<TokenKind> {
+        while let Some(token_kind) = self.next_including_comments() {
+            if token_kind != TokenKind::LineComment && token_kind != TokenKind::BlockComment {
+                return Some(token_kind);
+            }
+            self.consume();
+        }
+
+        return None
+    }
+
+    /// Returns the start position belonging to the token returned by `next`.
+    /// If there is no next token, then we return the end position of the
+    /// previous token.
+    pub(crate) fn last_valid_pos(&self) -> InputPosition {
+        if self.cur < self.end {
+            // Return token position
+            return self.tokens[self.cur].pos
+        }
+
+        // Return previous token end
+        let token = &self.tokens[self.cur - 1];
+        return if token.kind == TokenKind::SpanEnd {
+            token.pos
+        } else {
+            token.pos.with_offset(token.kind.num_characters())
+        };
+    }
+
+    /// Returns the positions belonging to the token returned by `next`. This
+    /// assumes that we're not at the end of the range we're iterating over.
+    /// TODO: @cleanup Phase out?
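+    ///
+    /// A usage sketch, pairing it with `InputSource::section_at_pos` the way
+    /// the tokenizer tests do:
+    ///
+    /// ```ignore
+    /// let (begin, end) = iter.next_positions();
+    /// let text = source.section_at_pos(begin, end);
+    /// ```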
+    pub(crate) fn next_positions(&self) -> (InputPosition, InputPosition) {
+        debug_assert!(self.cur < self.end);
+        let token = &self.tokens[self.cur];
+        if token.kind.has_span_end() {
+            let span_end = &self.tokens[self.cur + 1];
+            debug_assert_eq!(span_end.kind, TokenKind::SpanEnd);
+            (token.pos, span_end.pos)
+        } else {
+            let offset = token.kind.num_characters();
+            (token.pos, token.pos.with_offset(offset))
+        }
+    }
+
+    /// See `next_positions`.
+    pub(crate) fn next_span(&self) -> InputSpan {
+        let (begin, end) = self.next_positions();
+        return InputSpan::from_positions(begin, end)
+    }
+
+    /// Advances the iterator past the current token (the one `next` would
+    /// return).
+    pub(crate) fn consume(&mut self) {
+        // Note: this must use `next_including_comments`. Calling `next` here
+        // would recurse endlessly, as `next` itself calls `consume` to skip
+        // over comment tokens.
+        if let Some(kind) = self.next_including_comments() {
+            if kind.has_span_end() {
+                self.cur += 2;
+            } else {
+                self.cur += 1;
+            }
+        }
+    }
+}
\ No newline at end of file