diff --git a/src/protocol/lexer2.rs b/src/protocol/lexer2.rs index 50e7ff55cb147ab4c7668d0576badd1e67713cc2..1cbb06f354f4a7b846c6992ac58ba0cd033f80a6 100644 --- a/src/protocol/lexer2.rs +++ b/src/protocol/lexer2.rs @@ -1,24 +1,418 @@ +use crate::protocol::ast::*; use crate::protocol::Heap; -use crate::protocol::tokenizer::{TokenBuffer, Token}; -use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError}; +use crate::collections::{StringPool, StringRef}; +use crate::protocol::tokenizer::*; +use crate::protocol::input_source2::{InputSource2 as InputSource, InputPosition2 as InputPosition, InputSpan, ParseError}; +use crate::protocol::symbol_table2::*; + +#[derive(PartialEq, Eq)] +enum ModuleCompilationPhase { + Source, // only source is set + Tokenized, // source is tokenized + DefinitionsScanned, // all definitions are linked to their type class + ImportsResolved, // all imports are added to the symbol table + Parsed, // produced the AST for the module + ValidatedAndLinked, // AST is traversed and has linked the required AST nodes + Typed, // Type inference and checking has been performed +} + +enum KeywordDefinition { + Struct, + Enum, + Union, + Function, + Primitive, + Composite, +} + +impl KeywordDefinition { + fn as_symbol_class(&self) -> SymbolClass { + use KeywordDefinition as KD; + use SymbolClass as SC; + + match self { + KD::Struct => SC::Struct, + KD::Enum => SC::Enum, + KD::Union => SC::Union, + KD::Function => SC::Function, + KD::Primitive | KD::Composite => SC::Component, + } + } +} + +struct Module { + // Buffers + source: InputSource, + tokens: TokenBuffer, + // Identifiers + root_id: RootId, + name: Option<(PragmaId, StringRef)>, + version: Option<(PragmaId, i64)>, + phase: ModuleCompilationPhase, +} struct Ctx<'a> { heap: &'a mut Heap, - source: &'a InputSource, - tokens: &'a TokenBuffer, + symbols: &'a mut SymbolTable, + pool: &'a mut StringPool, +} + +/// Scans the module and finds all module-level type definitions. These will be +/// added to the symbol table such that during AST-construction we know which +/// identifiers point to types. Will also parse all pragmas to determine module +/// names. +pub(crate) struct ASTSymbolPrePass { + symbols: Vec, + pragmas: Vec, + buffer: String, + has_pragma_version: bool, + has_pragma_module: bool, +} + +impl ASTSymbolPrePass { + pub(crate) fn new() -> Self { + Self{ + symbols: Vec::with_capacity(128), + pragmas: Vec::with_capacity(8), + buffer: String::with_capacity(128), + has_pragma_version: false, + has_pragma_module: false, + } + } + + fn reset(&mut self) { + self.symbols.clear(); + self.pragmas.clear(); + self.has_pragma_version = false; + self.has_pragma_module = false; + } + + pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> { + self.reset(); + + let module = &mut modules[module_idx]; + let module_range = &module.tokens.ranges[0]; + let expected_parent_idx = 0; + let expected_subranges = module_range.subranges; + debug_assert_eq!(module.phase, ModuleCompilationPhase::Tokenized); + debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); + debug_assert_eq!(module.root_id.index, 0); + + // Preallocate root in the heap + let root_id = ctx.heap.alloc_protocol_description(|this| { + Root{ + this, + pragmas: Vec::new(), + imports: Vec::new(), + definitions: Vec::new(), + } + }); + module.root_id = root_id; + + // Visit token ranges to detect defintions + let mut visited_subranges = 0; + for range_idx in expected_parent_idx + 1..module.tokens.ranges.len() { + // Skip any ranges that do not belong to the module + let cur_range = &module.tokens.ranges[range_idx]; + if cur_range.parent_idx != expected_parent_idx { + continue; + } + + // Parse if it is a definition or a pragma + if cur_range.range_kind == TokenRangeKind::Definition { + self.visit_definition_range(modules, module_idx, ctx, range_idx)?; + } else if cur_range.range_kind == TokenRangeKind::Pragma { + self.visit_pragma_range(modules, module_idx, ctx, range_idx)?; + } + + visited_subranges += 1; + if visited_subranges == expected_subranges { + break; + } + } + + // By now all symbols should have been found: add to symbol table and + // add the parsed pragmas to the preallocated root in the heap. + debug_assert_eq!(visited_subranges, expected_subranges); + ctx.symbols.insert_scoped_symbols(None, SymbolScope::Module(module.root_id), &self.symbols); + + let root = &mut ctx.heap[root_id]; + debug_assert!(root.pragmas.is_empty()); + root.pragmas.extend(&self.pragmas); + + module.phase = ModuleCompilationPhase::DefinitionsScanned; + + Ok(()) + } + + fn visit_pragma_range(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> { + let module = &mut modules[module_idx]; + let range = &module.tokens.ranges[range_idx]; + let mut iter = module.tokens.iter_range(range); + + // Consume pragma name + let (pragma_section, pragma_start, _) = consume_pragma(&self.source, &mut iter)?; + + // Consume pragma values + if pragma_section == "#module" { + // Check if name is defined twice within the same file + if self.has_pragma_module { + return Err(ParseError::new_error(&module.source, pragma_start, "module name is defined twice")); + } + + // Consume the domain-name + let (module_name, module_span) = consume_domain_ident(&module.source, &mut iter)?; + if iter.next().is_some() { + return Err(ParseError::new_error(&module.source, iter.last_valid_pos(), "expected end of #module pragma after module name")); + } + + // Add to heap and symbol table + let pragma_span = InputSpan::from_positions(pragma_start, module_span.end); + let module_name = ctx.pool.intern(module_name); + let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Module(PragmaModule{ + this, + span: pragma_span, + value: Identifier{ span: module_span, value: module_name.clone() }, + })); + self.pragmas.push(pragma_id); + + if let Err(other_module_root_id) = ctx.symbols.insert_module(module_name, module.root_id) { + // Naming conflict + let this_module = &modules[module_idx]; + let other_module = seek_module(modules, other_module_root_id).unwrap(); + let (other_module_pragma_id, _) = other_module.name.unwrap(); + let other_pragma = ctx.heap[other_module_pragma_id].as_module(); + return Err(ParseError::new_error_str_at_span( + &this_module.source, pragma_span, "conflict in module name" + ).with_info_str_at_span( + &other_module.source, other_pragma.span, "other module is defined here" + )); + } + self.has_pragma_module = true; + } else if pragma_section == "#version" { + // Check if version is defined twice within the same file + if self.has_pragma_version { + return Err(ParseError::new_error(&module.source, pragma_start, "module version is defined twice")); + } + + // Consume the version pragma + let (version, version_span) = consume_integer_literal(&module.source, &mut iter, &mut self.buffer)?; + let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Version(PragmaVersion{ + this, + span: InputSpan::from_positions(pragma_start, version_span.end), + version, + })); + self.pragmas.push(pragma_id); + self.has_pragma_version = true; + } else { + // Custom pragma, maybe we support this in the future, but for now + // we don't. + return Err(ParseError::new_error(&module.source, pragma_start, "illegal pragma name")); + } + + Ok(()) + } + + fn visit_definition_range(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let range = &module.tokens.ranges[range_idx]; + let definition_span = InputSpan::from_positions( + module.tokens.start_pos(range), + module.tokens.end_pos(range) + ); + let mut iter = module.tokens.iter_range(range); + + // Because we're visiting a definition, we expect an ident that resolves + // to a keyword indicating a definition. + let kw_text = consume_ident_text(&module.source, &mut iter).unwrap(); + let kw = parse_definition_keyword(kw_text).unwrap(); + + // Retrieve identifier and put in temp symbol table + let definition_ident = consume_ident_text(&module.source, &mut iter)?; + let definition_ident = ctx.pool.intern(definition_ident); + let symbol_class = kw.as_symbol_class(); + + // Get the token indicating the end of the definition to get the full + // span of the definition + let last_token = &module.tokens.tokens[range.end - 1]; + debug_assert_eq!(last_token.kind, TokenKind::CloseCurly); + + self.symbols.push(Symbol::new( + module.root_id, + SymbolScope::Module(module.root_id), + definition_span, + symbol_class, + definition_ident + )); + + Ok(()) + } +} + +pub(crate) struct ASTImportPrePass { +} + +impl ASTImportPrePass { + pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let module_range = &module.tokens.ranges[0]; + debug_assert_eq!(module.phase, ModuleCompilationPhase::DefinitionsScanned); + debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module); + + let expected_parent_idx = 0; + let expected_subranges = module_range.subranges; + let mut visited_subranges = 0; + + for range_idx in expected_parent_idx + 1..module.tokens.ranges.len() { + let cur_range = &module.tokens.ranges[range_idx]; + if cur_range.parent_idx != expected_parent_idx { + continue; + } + + visited_subranges += 1; + if cur_range.range_kind == TokenRangeKind::Import { + self.visit_import_range(modules, module_idx, ctx, range_idx)?; + } + + if visited_subranges == expected_subranges { + break; + } + } + + Ok(()) + } + + pub(crate) fn visit_import_range( + &mut self, modules: &mut [Module], module_idx: usize, ctx: &mut Ctx, range_idx: usize + ) -> Result<(), ParseError> { + let module = &modules[module_idx]; + let import_range = &module.tokens.ranges[range_idx]; + debug_assert_eq!(import_range.range_kind, TokenRangeKind::Import); + + let mut iter = module.tokens.iter_range(import_range); + + // Consume "import" + let _import_ident = consume_ident_text(&module.source, &mut iter)?; + debug_assert_eq!(_import_ident, KW_IMPORT); + + // Consume module name + let (module_name, _) = consume_domain_ident(&module.source, &mut iter)?; + + + Ok(()) + } } -// Lexes definitions. Should be the first pass over each of the module files -// after tokenization. Only once all definitions are parsed can we do the full -// AST creation pass. -struct LexerDefinitions { +fn consume_domain_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> { + let (_, name_start, mut name_end) = consume_ident(source, iter)?; + while let Some(TokenKind::Dot) = iter.next() { + consume_dot(source, iter)?; + let (_, _, new_end) = consume_ident(source, iter)?; + name_end = new_end; + } + Ok((source.section(name_start, name_end), InputSpan::from_positions(name_start, name_end))) +} + +fn consume_dot<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(), ParseError> { + if Some(TokenKind::Dot) != iter.next() { + return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a dot")); + } + iter.consume(); + Ok(()) } -impl LexerDefinitions { - pub(crate) fn parse(ctx: &mut Ctx) -> Result<(), ParseError> { - debug_assert!(ctx.tokens.ranges.len() > 0); +fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> { + if Some(TokenKind::Integer) != iter.next() { + return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal")); } + let (start_pos, end_pos) = iter.next_range(); + iter.consume(); - pub(crate) fn parse_definition(heap: &mut Heap, source: &InputSource, range: &TokenRang) + let integer_text = source.section(start_pos, end_pos); + + // Determine radix and offset from prefix + let (radix, input_offset, radix_name) = + if integer_text.starts_with(b"0b") || integer_text.starts_with(b"0B") { + // Binary number + (2, 2, "binary") + } else if integer_text.starts_with(b"0o") || integer_text.starts_with(b"0O") { + // Octal number + (8, 2, "octal") + } else if integer_text.starts_with(b"0x") || integer_text.starts_with(b"0X") { + // Hexadecimal number + (16, 2, "hexadecimal") + } else { + (10, 0, "decimal") + }; + + // Take out any of the separating '_' characters + buffer.clear(); + for char_idx in input_offset..integer_text.len() { + let char = integer_text[char_idx]; + if char == b'_' { + continue; + } + if !char.is_ascii_digit() { + return Err(ParseError::new_error(source, start_pos, "incorrectly formatted integer")); + } + buffer.push(char::from(char)); + } + + // Use the cleaned up string to convert to integer + match u64::from_str_radix(&buffer, radix) { + Ok(number) => Ok((number, InputSpan::from_positions(start_pos, end_pos))), + Err(_) => Err( + ParseError::new_error(source, start_pos, "incorrectly formatted integer") + ), + } +} + +fn seek_module(modules: &[Module], root_id: RootId) -> Option<&Module> { + for module in modules { + if module.root_id == root_id { + return Some(module) + } + } + + return None +} + +fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> { + if Some(TokenKind::Pragma) != iter.next() { + return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected a pragma")); + } + let (pragma_start, pragma_end) = iter.next_range(); + iter.consume(); + Ok((source.section(pragma_start, pragma_end), pragma_start, pragma_end)) +} + +fn consume_ident_text<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<&'a [u8], ParseError> { + if Some(TokenKind::Ident) != iter.next() { + return Err(ParseError::new_error(source, iter.last_valid_pos(), "expected an identifier")); + } + let (ident_start, ident_end) = iter.next_range(); + iter.consume(); + Ok(source.section(ident_start, ident_end)) +} + +fn consume_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> { + if Some(TokenKind::Ident) != iter.next() { + return Err(ParseError::new_error(sourcee, iter.last_valid_pos(), "expected an identifier")); + } + let (ident_start, ident_end) = iter.next_range(); + iter.consume(); + Ok((source.section(ident_start, ident_end), ident_start, ident_end)) +} + +fn parse_definition_keyword(keyword: &[u8]) -> Option { + match keyword { + KW_STRUCT => Some(Keyword::Struct), + KW_ENUM => Some(Keyword::Enum), + KW_UNION => Some(Keyword::Union), + KW_FUNCTION => Some(Keyword::Function), + KW_PRIMITIVE => Some(Keyword::Primitive), + KW_COMPOSITE => Some(Keyword::Composite), + _ => None + } } \ No newline at end of file