From c9800c8f19d701a8c008aa954a7f84141dbc3bda 2022-03-29 16:28:33
From: mh
Date: 2022-03-29 16:28:33
Subject: [PATCH] Rewrite tokenizer to emit markers instead of ranges

---
diff --git a/src/protocol/ast.rs b/src/protocol/ast.rs
index 6ad5f1ca98cfa39f02cad26fa0314df2c6f14413..87e0135bc0e117077ba4a62a7682c557ca1ae55d 100644
--- a/src/protocol/ast.rs
+++ b/src/protocol/ast.rs
@@ -932,7 +932,6 @@ pub struct StructDefinition {
     pub this: StructDefinitionId,
     pub defined_in: RootId,
     // Symbol scanning
-    pub span: InputSpan,
     pub identifier: Identifier,
     pub poly_vars: Vec<Identifier>,
     // Parsing
@@ -941,10 +940,10 @@ pub struct StructDefinition {
 
 impl StructDefinition {
     pub(crate) fn new_empty(
-        this: StructDefinitionId, defined_in: RootId, span: InputSpan,
+        this: StructDefinitionId, defined_in: RootId,
         identifier: Identifier, poly_vars: Vec<Identifier>
     ) -> Self {
-        Self{ this, defined_in, span, identifier, poly_vars, fields: Vec::new() }
+        Self{ this, defined_in, identifier, poly_vars, fields: Vec::new() }
     }
 }
@@ -965,7 +964,6 @@ pub struct EnumDefinition {
     pub this: EnumDefinitionId,
     pub defined_in: RootId,
     // Symbol scanning
-    pub span: InputSpan,
     pub identifier: Identifier,
     pub poly_vars: Vec<Identifier>,
     // Parsing
@@ -974,10 +972,10 @@ pub struct EnumDefinition {
 
 impl EnumDefinition {
     pub(crate) fn new_empty(
-        this: EnumDefinitionId, defined_in: RootId, span: InputSpan,
+        this: EnumDefinitionId, defined_in: RootId,
         identifier: Identifier, poly_vars: Vec<Identifier>
     ) -> Self {
-        Self{ this, defined_in, span, identifier, poly_vars, variants: Vec::new() }
+        Self{ this, defined_in, identifier, poly_vars, variants: Vec::new() }
     }
 }
@@ -993,7 +991,6 @@ pub struct UnionDefinition {
     pub this: UnionDefinitionId,
     pub defined_in: RootId,
     // Phase 1: symbol scanning
-    pub span: InputSpan,
     pub identifier: Identifier,
     pub poly_vars: Vec<Identifier>,
     // Phase 2: parsing
@@ -1002,10 +999,10 @@ pub struct UnionDefinition {
 
 impl UnionDefinition {
     pub(crate) fn new_empty(
-        this: UnionDefinitionId, defined_in: RootId, span: InputSpan,
+        this: UnionDefinitionId, defined_in: RootId,
        identifier: Identifier, poly_vars: Vec<Identifier>
     ) -> Self {
-        Self{ this, defined_in, span, identifier, poly_vars, variants: Vec::new() }
+        Self{ this, defined_in, identifier, poly_vars, variants: Vec::new() }
     }
 }
@@ -1111,7 +1108,6 @@ pub struct ProcedureDefinition {
     pub defined_in: RootId,
     // Symbol scanning
     pub kind: ProcedureKind,
-    pub span: InputSpan,
     pub identifier: Identifier,
     pub poly_vars: Vec<Identifier>,
     // Parser
@@ -1126,12 +1122,11 @@ pub struct ProcedureDefinition {
 
 impl ProcedureDefinition {
     pub(crate) fn new_empty(
-        this: ProcedureDefinitionId, defined_in: RootId, span: InputSpan,
+        this: ProcedureDefinitionId, defined_in: RootId,
         kind: ProcedureKind, identifier: Identifier, poly_vars: Vec<Identifier>
     ) -> Self {
         Self {
             this, defined_in,
-            span,
             kind, identifier, poly_vars,
             source: ProcedureSource::FuncUserDefined,
             return_type: None,
diff --git a/src/protocol/mod.rs b/src/protocol/mod.rs
index da5b60c9ceadb1d990a1c7bc6fa7a78e2b4ea2c9..2e46cc167c05753e9aee603c74506c57a3fd5c9b 100644
--- a/src/protocol/mod.rs
+++ b/src/protocol/mod.rs
@@ -60,7 +60,6 @@ impl ProtocolDescription {
             return Err(format!("{}", err))
         }
 
-        debug_assert_eq!(parser.modules.len(), 1, "only supporting one module here for now");
         let modules: Vec<Module> = parser.modules.into_iter()
             .map(|module| Module{
                 source: module.source,
diff --git a/src/protocol/parser/mod.rs b/src/protocol/parser/mod.rs
index 59235a8cd2397b44eea99458f897fb910aeb2959..39de7073241552963fc106ab248880774a9e45fa 100644
--- a/src/protocol/parser/mod.rs
+++ b/src/protocol/parser/mod.rs
@@ -52,10 +52,9 @@ pub enum ModuleCompilationPhase {
 }
 
 pub struct Module {
-    // Buffers
     pub source: InputSource,
     pub tokens: TokenBuffer,
-    // Identifiers
+    pub is_compiler_file: bool, // TODO: @Hack
     pub root_id: RootId,
     pub name: Option<(PragmaId, StringRef<'static>)>,
     pub version: Option<(PragmaId, i64)>,
@@ -156,7 +155,7 @@ impl Parser {
             pass_typing: PassTyping::new(),
             pass_rewriting: PassRewriting::new(),
             pass_stack_size: PassStackSize::new(),
-            write_tokens_to: Some("tokens.txt".to_string()),
+            write_tokens_to: None,
             write_ast_to: None,
             arch: TargetArch::new(),
         };
@@ -194,21 +193,7 @@ impl Parser {
     /// it internally for later parsing (when all modules are present). Returns
     /// the index of the new module.
     pub fn feed(&mut self, mut source: InputSource) -> Result<usize, ParseError> {
-        let mut token_buffer = TokenBuffer::new();
-        self.pass_tokenizer.tokenize(&mut source, &mut token_buffer)?;
-
-        let module = Module{
-            source,
-            tokens: token_buffer,
-            root_id: RootId::new_invalid(),
-            name: None,
-            version: None,
-            phase: ModuleCompilationPhase::Tokenized,
-        };
-        let module_index = self.modules.len();
-        self.modules.push(module);
-
-        return Ok(module_index);
+        return self.feed_internal(source, false);
     }
 
     pub fn parse(&mut self) -> Result<(), ParseError> {
@@ -353,7 +338,7 @@ impl Parser {
             let source = source.unwrap();
             let input_source = InputSource::new(file.to_string(), source);
-            let module_index = self.feed(input_source);
+            let module_index = self.feed_internal(input_source, true);
             if let Err(err) = module_index {
                 // A bit of a hack, but shouldn't really happen anyway: the
                 // compiler should ship with a decent standard library (at some
@@ -370,6 +355,25 @@ impl Parser {
 
         return Ok(())
     }
+
+    fn feed_internal(&mut self, mut source: InputSource, is_compiler_file: bool) -> Result<usize, ParseError> {
+        let mut token_buffer = TokenBuffer::new();
+        self.pass_tokenizer.tokenize(&mut source, &mut token_buffer)?;
+
+        let module = Module{
+            source,
+            tokens: token_buffer,
+            is_compiler_file,
+            root_id: RootId::new_invalid(),
+            name: None,
+            version: None,
+            phase: ModuleCompilationPhase::Tokenized,
+        };
+        let module_index = self.modules.len();
+        self.modules.push(module);
+
+        return Ok(module_index);
+    }
 }
 
 fn insert_builtin_type(type_table: &mut TypeTable, parts: Vec, has_poly_var: bool, size: usize, alignment: usize) -> TypeId {
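The `feed`/`feed_internal` split above is the usual public-wrapper pattern: the public entry
point pins `is_compiler_file` to `false`, while the standard-library loader calls the internal
path with `true` so later passes can special-case those modules. A minimal self-contained
sketch of the same shape (types reduced to the one relevant field; not the crate's real API):

```rust
struct Module {
    is_compiler_file: bool,
    // source, tokens, root_id, ... omitted
}

struct Parser {
    modules: Vec<Module>,
}

impl Parser {
    /// Public path: user-supplied modules are never compiler files.
    fn feed(&mut self) -> usize {
        self.feed_internal(false)
    }

    /// Internal path: the standard-library loader passes `true` so that
    /// later passes (symbol scanning, definition parsing) may allow the
    /// `#type_*` pragma types and global-scope symbol insertion.
    fn feed_internal(&mut self, is_compiler_file: bool) -> usize {
        self.modules.push(Module { is_compiler_file });
        self.modules.len() - 1
    }
}
```
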
diff --git a/src/protocol/parser/pass_definitions.rs b/src/protocol/parser/pass_definitions.rs
index 607f8caacc21fb71b60c8bca23e60ec45a27fa5d..7f9817270909d65c1cfc3c9fa702aef204e1506e 100644
--- a/src/protocol/parser/pass_definitions.rs
+++ b/src/protocol/parser/pass_definitions.rs
@@ -43,35 +43,33 @@ impl PassDefinitions {
     pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx) -> Result<(), ParseError> {
         let module = &modules[module_idx];
-        let module_range = &module.tokens.ranges[0];
         debug_assert_eq!(module.phase, ModuleCompilationPhase::ImportsResolved);
-        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 
-        // Although we only need to parse the definitions, we want to go through
-        // code ranges as well such that we can throw errors if we get
-        // unexpected tokens at the module level of the source.
-        let mut range_idx = module_range.first_child_idx;
-        loop {
-            let range_idx_usize = range_idx as usize;
-            let cur_range = &module.tokens.ranges[range_idx_usize];
-
-            match cur_range.range_kind {
-                TokenRangeKind::Module => unreachable!(), // should not be reachable
-                TokenRangeKind::Pragma | TokenRangeKind::Import => {
-                    // Already fully parsed, fall through and go to next range
-                },
-                TokenRangeKind::Definition | TokenRangeKind::Code => {
-                    // Visit range even if it is a "code" range to provide
-                    // proper error messages.
-                    self.visit_range(modules, module_idx, ctx, range_idx_usize)?;
-                },
+        // We iterate through the entire document. If we find a marker that has
+        // been handled then we skip over it. It is important that we properly
+        // parse all other tokens in the document to ensure that we throw the
+        // correct kind of errors.
+        let num_tokens = module.tokens.tokens.len() as u32;
+        let num_markers = module.tokens.markers.len();
+
+        let mut marker_index = 0;
+        let mut first_token_index = 0;
+        while first_token_index < num_tokens {
+            // Seek ahead to the next marker that was already handled.
+            let mut last_token_index = num_tokens;
+            let mut new_first_token_index = num_tokens;
+            while marker_index < num_markers {
+                let marker = &module.tokens.markers[marker_index];
+                marker_index += 1;
+                if marker.handled {
+                    last_token_index = marker.first_token;
+                    new_first_token_index = marker.last_token;
+                    break;
+                }
             }
 
-            if cur_range.next_sibling_idx == NO_SIBLING {
-                break;
-            } else {
-                range_idx = cur_range.next_sibling_idx;
-            }
+            self.visit_token_range(modules, module_idx, ctx, first_token_index, last_token_index)?;
+            first_token_index = new_first_token_index;
         }
 
         modules[module_idx].phase = ModuleCompilationPhase::DefinitionsParsed;
@@ -79,15 +77,14 @@ impl PassDefinitions {
         Ok(())
     }
 
-    fn visit_range(
-        &mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize
+    fn visit_token_range(
+        &mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx,
+        token_range_begin: u32, token_range_end: u32,
     ) -> Result<(), ParseError> {
         let module = &modules[module_idx];
-        let cur_range = &module.tokens.ranges[range_idx];
-        debug_assert!(cur_range.range_kind == TokenRangeKind::Definition || cur_range.range_kind == TokenRangeKind::Code);
 
         // Detect which definition we're parsing
-        let mut iter = module.tokens.iter_range(cur_range.start, cur_range.end);
+        let mut iter = module.tokens.iter_range(token_range_begin, Some(token_range_end));
         loop {
             let next = iter.next();
             if next.is_none() {
@@ -134,7 +131,7 @@ impl PassDefinitions {
             let start_pos = iter.last_valid_pos();
             let parser_type = self.type_parser.consume_parser_type(
                 iter, &ctx.heap, source, &ctx.symbols, poly_vars, definition_id,
-                module_scope, false, None
+                module_scope, false, false, None
             )?;
             let field = consume_ident_interned(source, iter, ctx)?;
             Ok(StructFieldDefinition{
@@ -221,7 +218,7 @@ impl PassDefinitions {
                     let poly_vars = ctx.heap[definition_id].poly_vars();
                     self.type_parser.consume_parser_type(
                         iter, &ctx.heap, source, &ctx.symbols, poly_vars, definition_id,
-                        module_scope, false, None
+                        module_scope, false, false, None
                     )
                 },
                 &mut types_section, "an embedded type", Some(&mut close_pos)
@@ -255,20 +252,21 @@ impl PassDefinitions {
         // Retrieve function name
         consume_exact_ident(&module.source, iter, KW_FUNCTION)?;
         let (ident_text, _) = consume_ident(&module.source, iter)?;
-        let stringy = String::from_utf8_lossy(ident_text).to_string();
 
         // Retrieve preallocated DefinitionId
         let module_scope = SymbolScope::Module(module.root_id);
         let definition_id = ctx.symbols.get_symbol_by_name_defined_in_scope(module_scope, ident_text)
             .unwrap().variant.as_definition().definition_id;
         self.cur_definition = definition_id;
+        let allow_compiler_types = module.is_compiler_file;
 
         consume_polymorphic_vars_spilled(&module.source, iter, ctx)?;
 
         // Parse function's argument list
         let mut parameter_section = self.variables.start_section();
         consume_parameter_list(
-            &mut self.type_parser, &module.source, iter, ctx, &mut parameter_section, module_scope, definition_id
+            &mut self.type_parser, &module.source, iter, ctx, &mut parameter_section,
+            module_scope, definition_id, allow_compiler_types
         )?;
         let parameters = parameter_section.into_vec();
@@ -277,7 +275,7 @@ impl PassDefinitions {
         let poly_vars = ctx.heap[definition_id].poly_vars();
         let parser_type = self.type_parser.consume_parser_type(
             iter, &ctx.heap, &module.source, &ctx.symbols, poly_vars, definition_id,
-            module_scope, false, None
+            module_scope, false, allow_compiler_types, None
         )?;
 
         // Consume body
@@ -308,13 +306,15 @@ impl PassDefinitions {
         let definition_id = ctx.symbols.get_symbol_by_name_defined_in_scope(module_scope, ident_text)
             .unwrap().variant.as_definition().definition_id;
         self.cur_definition = definition_id;
+        let allow_compiler_types = module.is_compiler_file;
 
         consume_polymorphic_vars_spilled(&module.source, iter, ctx)?;
 
         // Parse component's argument list
         let mut parameter_section = self.variables.start_section();
         consume_parameter_list(
-            &mut self.type_parser, &module.source, iter, ctx, &mut parameter_section, module_scope, definition_id
+            &mut self.type_parser, &module.source, iter, ctx, &mut parameter_section,
+            module_scope, definition_id, allow_compiler_types
         )?;
         let parameters = parameter_section.into_vec();
@@ -346,10 +346,10 @@ impl PassDefinitions {
         if iter.next() == Some(TokenKind::OpenCurly) && iter.peek() == Some(TokenKind::Pragma) {
             // Consume the placeholder "{ #builtin }" tokens
             iter.consume(); // opening curly brace
-            let (pragma, pragma_start, pragma_end) = consume_pragma(&module.source, iter)?;
+            let (pragma, pragma_span) = consume_pragma(&module.source, iter)?;
             if pragma != b"#builtin" {
                 return Err(ParseError::new_error_str_at_span(
-                    &module.source, InputSpan::from_positions(pragma_start, pragma_end),
+                    &module.source, pragma_span,
                     "expected a '#builtin' pragma, or a function body"
                 ));
             }
@@ -863,7 +863,7 @@ impl PassDefinitions {
                 let parser_type = self.type_parser.consume_parser_type(
                     iter, &ctx.heap, &module.source, &ctx.symbols, poly_vars,
                     definition_id, SymbolScope::Module(module.root_id),
-                    true, Some(angle_start_pos)
+                    true, false, Some(angle_start_pos)
                 )?;
 
                 (parser_type.elements, parser_type.full_span.end)
@@ -959,7 +959,8 @@ impl PassDefinitions {
             let parser_type = self.type_parser.consume_parser_type(
                 iter, &ctx.heap, &module.source, &ctx.symbols, poly_vars,
-                definition_id, SymbolScope::Definition(definition_id), true, None
+                definition_id, SymbolScope::Definition(definition_id),
+                true, false, None
             );
 
             if let Ok(parser_type) = parser_type {
@@ -1566,7 +1567,7 @@ impl PassDefinitions {
         let poly_vars = ctx.heap[self.cur_definition].poly_vars();
         let parser_type = self.type_parser.consume_parser_type(
             iter, &ctx.heap, &module.source, &ctx.symbols, poly_vars, self.cur_definition,
-            symbol_scope, true, None
+            symbol_scope, true, false, None
         )?;
         debug_assert!(!parser_type.elements.is_empty());
         match parser_type.elements[0].variant {
@@ -1731,7 +1732,7 @@ impl PassDefinitions {
             self.type_parser.consume_parser_type(
                 iter, &ctx.heap, &module.source, &ctx.symbols, poly_vars, definition_id, SymbolScope::Module(module.root_id),
-                true, Some(angle_start_pos)
+                true, false, Some(angle_start_pos)
             )?
         } else {
             // Automatic casting with inferred target type
@@ -1867,7 +1868,7 @@ fn consume_polymorphic_vars_spilled(source: &InputSource, iter: &mut TokenIter,
 fn consume_parameter_list(
     parser: &mut ParserTypeParser, source: &InputSource, iter: &mut TokenIter,
     ctx: &mut PassCtx, target: &mut ScopedSection,
-    scope: SymbolScope, definition_id: DefinitionId
+    scope: SymbolScope, definition_id: DefinitionId, allow_compiler_types: bool
 ) -> Result<(), ParseError> {
     consume_comma_separated(
         TokenKind::OpenParen, TokenKind::CloseParen, source, iter, ctx,
@@ -1875,7 +1876,7 @@ fn consume_parameter_list(
             let poly_vars = ctx.heap[definition_id].poly_vars(); // Rust being rust, multiple lookups
             let parser_type = parser.consume_parser_type(
                 iter, &ctx.heap, source, &ctx.symbols, poly_vars, definition_id,
-                scope, false, None
+                scope, false, allow_compiler_types, None
             )?;
             let identifier = consume_ident_interned(source, iter, ctx)?;
             let parameter_id = ctx.heap.alloc_variable(|this| Variable{
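The parse loop above alternates between stretches of unhandled tokens (parsed here) and
marker spans that earlier passes already consumed (skipped). A compilable sketch of just
that skipping logic, with the marker reduced to the fields the loop touches and everything
else invented for illustration:

```rust
struct TokenMarker {
    first_token: u32, // inclusive start, set by the tokenizer
    last_token: u32,  // exclusive end, set by the pass that handled the marker
    handled: bool,
}

/// Returns the (begin, end) token stretches that still need parsing,
/// skipping every marker that an earlier pass already handled.
fn unhandled_ranges(num_tokens: u32, markers: &[TokenMarker]) -> Vec<(u32, u32)> {
    let mut ranges = Vec::new();
    let mut first = 0u32;
    for marker in markers.iter().filter(|m| m.handled) {
        ranges.push((first, marker.first_token)); // parse up to the marker
        first = marker.last_token;                // then resume after it
    }
    if first < num_tokens {
        ranges.push((first, num_tokens)); // trailing stretch, if any
    }
    ranges
}
```

Stretches may come out empty when two handled markers are adjacent; like the pass above,
a caller can visit them anyway, since an empty token range simply parses to nothing.
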
diff --git a/src/protocol/parser/pass_definitions_types.rs b/src/protocol/parser/pass_definitions_types.rs
index 11603e2ffe11da4a97c02a6c51ee3186070a0b61..0f574fc99222e1b195e5f437c2970d4d362ad125 100644
--- a/src/protocol/parser/pass_definitions_types.rs
+++ b/src/protocol/parser/pass_definitions_types.rs
@@ -59,7 +59,8 @@ impl ParserTypeParser {
         &mut self, iter: &mut TokenIter, heap: &Heap, source: &InputSource,
         symbols: &SymbolTable, poly_vars: &[Identifier],
         wrapping_definition: DefinitionId, cur_scope: SymbolScope,
-        allow_inference: bool, inside_angular_bracket: Option<InputPosition>,
+        allow_inference: bool, allow_compiler_types: bool,
+        inside_angular_bracket: Option<InputPosition>,
     ) -> Result<ParserType, ParseError> {
         // Prepare
         self.entries.clear();
@@ -71,9 +72,10 @@ impl ParserTypeParser {
         }
 
         let initial_state = match iter.next() {
-            Some(TokenKind::Ident) => {
+            Some(TokenKind::Ident) | Some(TokenKind::Pragma) => {
                 let element = Self::consume_parser_type_element(
-                    iter, source, heap, symbols, wrapping_definition, poly_vars, cur_scope, allow_inference
+                    iter, source, heap, symbols, wrapping_definition, poly_vars, cur_scope,
+                    allow_inference, allow_compiler_types
                 )?;
                 self.first_pos = element.element_span.begin;
                 self.last_pos = element.element_span.end;
@@ -154,7 +156,8 @@ impl ParserTypeParser {
                     // Allowed tokens: ident (
                     match next {
                         Some(TokenKind::Ident) => self.consume_type_idents(
-                            source, heap, symbols, wrapping_definition, poly_vars, cur_scope, allow_inference, iter
+                            source, heap, symbols, wrapping_definition, poly_vars, cur_scope,
+                            allow_inference, allow_compiler_types, iter
                         )?,
                         Some(TokenKind::OpenParen) => self.consume_open_paren(iter),
                         _ => return Err(ParseError::new_error_str_at_pos(
@@ -168,7 +171,8 @@ impl ParserTypeParser {
                     // We'll strip the nested tuple later in this function
                     match next {
                         Some(TokenKind::Ident) => self.consume_type_idents(
-                            source, heap, symbols, wrapping_definition, poly_vars, cur_scope, allow_inference, iter
+                            source, heap, symbols, wrapping_definition, poly_vars, cur_scope,
+                            allow_inference, allow_compiler_types, iter
                         )?,
                         Some(TokenKind::OpenParen) => self.consume_open_paren(iter),
                         Some(TokenKind::CloseParen) => self.consume_close_paren(source, iter)?,
@@ -182,7 +186,8 @@ impl ParserTypeParser {
                     // Allowed tokens: ident ( > >> )
                     match next {
                         Some(TokenKind::Ident) => self.consume_type_idents(
-                            source, heap, symbols, wrapping_definition, poly_vars, cur_scope, allow_inference, iter
+                            source, heap, symbols, wrapping_definition, poly_vars, cur_scope,
+                            allow_inference, allow_compiler_types, iter
                         )?,
                         Some(TokenKind::OpenParen) => self.consume_open_paren(iter),
                         Some(TokenKind::CloseAngle) => self.consume_close_angle(source, iter)?,
@@ -288,10 +293,12 @@ impl ParserTypeParser {
     fn consume_type_idents(
         &mut self, source: &InputSource, heap: &Heap, symbols: &SymbolTable,
         wrapping_definition: DefinitionId, poly_vars: &[Identifier],
-        cur_scope: SymbolScope, allow_inference: bool, iter: &mut TokenIter
+        cur_scope: SymbolScope, allow_inference: bool, allow_compiler_types: bool,
+        iter: &mut TokenIter
     ) -> Result<(), ParseError> {
         let element = Self::consume_parser_type_element(
-            iter, source, heap, symbols, wrapping_definition, poly_vars, cur_scope, allow_inference
+            iter, source, heap, symbols, wrapping_definition, poly_vars, cur_scope,
+            allow_inference, allow_compiler_types
         )?;
         let depth = self.cur_depth();
         self.last_pos = element.element_span.end;
@@ -428,11 +435,35 @@ impl ParserTypeParser {
     fn consume_parser_type_element(
         iter: &mut TokenIter, source: &InputSource, heap: &Heap,
         symbols: &SymbolTable, wrapping_definition: DefinitionId, poly_vars: &[Identifier],
-        mut scope: SymbolScope, allow_inference: bool,
+        mut scope: SymbolScope, allow_inference: bool, allow_compiler_types: bool,
     ) -> Result<ParserTypeElement, ParseError> {
         use ParserTypeVariant as PTV;
 
-        let (mut type_text, mut type_span) = consume_any_ident(source, iter)?;
+        // Early check for special builtin types available to the compiler
+        if iter.next() == Some(TokenKind::Pragma) {
+            let (type_text, pragma_span) = consume_pragma(source, iter)?;
+            let variant = match type_text {
+                PRAGMA_TYPE_VOID => Some(PTV::Void),
+                PRAGMA_TYPE_PORTLIKE => Some(PTV::InputOrOutput),
+                PRAGMA_TYPE_INTEGERLIKE => Some(PTV::IntegerLike),
+                PRAGMA_TYPE_ARRAYLIKE => Some(PTV::ArrayLike),
+                _ => None,
+            };
+
+            if !allow_compiler_types || variant.is_none() {
+                return Err(ParseError::new_error_str_at_span(
+                    source, pragma_span, "unexpected pragma in type"
+                ));
+            }
+
+            return Ok(ParserTypeElement{
+                variant: variant.unwrap(),
+                element_span: pragma_span,
+            });
+        }
+
+        // No special type, parse as normal
+        let (mut type_text, mut type_span) = consume_any_ident(source, iter)?;
         let variant = match type_text {
             KW_TYPE_MESSAGE => PTV::Message,
             KW_TYPE_BOOL => PTV::Bool,
diff --git a/src/protocol/parser/pass_imports.rs b/src/protocol/parser/pass_imports.rs
index e64e07e87634183cc240d0cf0709336f8c0a7017..7fe1d8c77451ed17ffe7335cbc31969f15b27d68 100644
--- a/src/protocol/parser/pass_imports.rs
+++ b/src/protocol/parser/pass_imports.rs
@@ -25,28 +25,23 @@ impl PassImport {
     }
     pub(crate) fn parse(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx) -> Result<(), ParseError> {
         let module = &modules[module_idx];
-        let module_range = &module.tokens.ranges[0];
         debug_assert!(modules.iter().all(|m| m.phase >= ModuleCompilationPhase::SymbolsScanned));
         debug_assert_eq!(module.phase, ModuleCompilationPhase::SymbolsScanned);
-        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
 
-        let mut range_idx = module_range.first_child_idx;
-        loop {
-            let range_idx_usize = range_idx as usize;
-            let cur_range = &module.tokens.ranges[range_idx_usize];
+        let module_root_id = module.root_id;
+        let num_markers = module.tokens.markers.len();
 
-            if cur_range.range_kind == TokenRangeKind::Import {
-                self.visit_import_range(modules, module_idx, ctx, range_idx_usize)?;
-            }
-
-            if cur_range.next_sibling_idx == NO_SIBLING {
-                break;
-            } else {
-                range_idx = cur_range.next_sibling_idx;
+        for marker_index in 0..num_markers {
+            let marker = &modules[module_idx].tokens.markers[marker_index];
+            match marker.kind {
+                TokenMarkerKind::Import => {
+                    self.visit_import_marker(modules, module_idx, ctx, marker_index)?;
+                },
+                TokenMarkerKind::Definition | TokenMarkerKind::Pragma => {},
             }
         }
 
-        let root = &mut ctx.heap[module.root_id];
+        let root = &mut ctx.heap[module_root_id];
         root.imports.extend(self.imports.drain(..));
 
         let module = &mut modules[module_idx];
@@ -55,14 +50,13 @@ impl PassImport {
         Ok(())
     }
 
-    pub(crate) fn visit_import_range(
-        &mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize
+    pub(crate) fn visit_import_marker(
+        &mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx, marker_index: usize
     ) -> Result<(), ParseError> {
         let module = &modules[module_idx];
-        let import_range = &module.tokens.ranges[range_idx];
-        debug_assert_eq!(import_range.range_kind, TokenRangeKind::Import);
+        let marker = &module.tokens.markers[marker_index];
 
-        let mut iter = module.tokens.iter_range(import_range.start, import_range.end);
+        let mut iter = module.tokens.iter_range(marker.first_token, None);
 
         // Consume "import"
         let (_import_ident, import_span) =
@@ -315,6 +309,12 @@ impl PassImport {
         consume_token(&module.source, &mut iter, TokenKind::SemiColon)?;
         self.imports.push(import_id);
 
+        // Update the marker
+        let marker_last_token = iter.token_index();
+        let marker = &mut modules[module_idx].tokens.markers[marker_index];
+        marker.last_token = marker_last_token;
+        marker.handled = true;
+
         Ok(())
     }
 }
diff --git a/src/protocol/parser/pass_rewriting.rs b/src/protocol/parser/pass_rewriting.rs
index a9f869d38c905cdbbe880ecdbd4bc522a8bf5b82..82702bd1056e5706249f1636604aba452aeb820c 100644
--- a/src/protocol/parser/pass_rewriting.rs
+++ b/src/protocol/parser/pass_rewriting.rs
@@ -49,6 +49,10 @@ impl Visitor for PassRewriting {
     fn visit_procedure_definition(&mut self, ctx: &mut Ctx, id: ProcedureDefinitionId) -> VisitorResult {
         let definition = &ctx.heap[id];
+        if definition.source.is_builtin() {
+            return Ok(());
+        }
+
         let body_id = definition.body;
         self.current_scope = definition.scope;
         self.current_procedure_id = id;
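The import pass above and the symbol pass below finish a marker the same way: parse from
`marker.first_token`, write back where parsing stopped, and flag the marker as handled so
that `PassDefinitions` will skip it later. A reduced sketch of that handshake (the closure
stands in for the real token iterator; error type simplified):

```rust
struct TokenMarker {
    first_token: u32,
    last_token: u32,
    handled: bool,
}

/// `parse` receives the first token index and returns the index one past
/// the last token it consumed (the role played by `iter.token_index()` in
/// the hunks above).
fn finish_marker<E>(
    marker: &mut TokenMarker,
    parse: impl FnOnce(u32) -> Result<u32, E>,
) -> Result<(), E> {
    let end = parse(marker.first_token)?;
    marker.last_token = end;
    marker.handled = true;
    Ok(())
}
```
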
diff --git a/src/protocol/parser/pass_symbols.rs b/src/protocol/parser/pass_symbols.rs
index 28d7ba5fce047bdc25d8133f5b7cb6beb42f66e8..3c3b628f105947d8748d9318ac4a237d1606e4fe 100644
--- a/src/protocol/parser/pass_symbols.rs
+++ b/src/protocol/parser/pass_symbols.rs
@@ -45,11 +45,10 @@ impl PassSymbols {
         self.reset();
 
         let module = &mut modules[module_idx];
-        let module_range = &module.tokens.ranges[0];
+        let module_is_compiler_file = module.is_compiler_file;
 
         debug_assert_eq!(module.phase, ModuleCompilationPhase::Tokenized);
-        debug_assert_eq!(module_range.range_kind, TokenRangeKind::Module);
-        debug_assert!(module.root_id.is_invalid()); // not set yet,
+        debug_assert!(module.root_id.is_invalid()); // not set yet
 
         // Preallocate root in the heap
         let root_id = ctx.heap.alloc_protocol_description(|this| {
@@ -62,28 +61,21 @@ impl PassSymbols {
         });
         module.root_id = root_id;
 
-        // Retrieve first range index, then make immutable borrow
-        let mut range_idx = module_range.first_child_idx;
-
-        // Visit token ranges to detect definitions and pragmas
-        loop {
+        // Use the token markers to detect symbol definitions and pragmas
+        let num_markers = module.tokens.markers.len();
+        for marker_index in 0..num_markers {
             let module = &modules[module_idx];
-            let range_idx_usize = range_idx as usize;
-            let cur_range = &module.tokens.ranges[range_idx_usize];
-            let next_sibling_idx = cur_range.next_sibling_idx;
-            let range_kind = cur_range.range_kind;
+            let marker = &module.tokens.markers[marker_index];
 
             // Parse if it is a definition or a pragma
-            if range_kind == TokenRangeKind::Definition {
-                self.visit_definition_range(modules, module_idx, ctx, range_idx_usize)?;
-            } else if range_kind == TokenRangeKind::Pragma {
-                self.visit_pragma_range(modules, module_idx, ctx, range_idx_usize)?;
-            }
-
-            if next_sibling_idx == NO_SIBLING {
-                break;
-            } else {
-                range_idx = next_sibling_idx;
+            match marker.kind {
+                TokenMarkerKind::Pragma => {
+                    self.visit_pragma_marker(modules, module_idx, ctx, marker_index)?;
+                },
+                TokenMarkerKind::Definition => {
+                    self.visit_definition_marker(modules, module_idx, ctx, marker_index)?;
+                }
+                TokenMarkerKind::Import => {}, // we don't care yet
             }
         }
@@ -97,6 +89,14 @@ impl PassSymbols {
             }
         }
 
+        if module_is_compiler_file {
+            debug_assert!(self.symbols.is_empty());
+            ctx.symbols.get_all_symbols_defined_in_scope(module_scope, &mut self.symbols);
+            for symbol in self.symbols.drain(..) {
+                ctx.symbols.insert_symbol_in_global_scope(symbol);
+            }
+        }
+
         // Modify the preallocated root
         let root = &mut ctx.heap[root_id];
         root.pragmas.extend(self.pragmas.drain(..));
@@ -109,32 +109,27 @@ impl PassSymbols {
         Ok(())
     }
 
-    fn visit_pragma_range(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize) -> Result<(), ParseError> {
+    fn visit_pragma_marker(&mut self, modules: &mut [Module], module_idx: usize, ctx: &mut PassCtx, marker_index: usize) -> Result<(), ParseError> {
         let module = &mut modules[module_idx];
-        let range = &module.tokens.ranges[range_idx];
-        let mut iter = module.tokens.iter_range(range.start, module.tokens.tokens.len() as u32);
+        let marker = &module.tokens.markers[marker_index];
+        let mut iter = module.tokens.iter_range(marker.first_token, None);
 
         // Consume pragma name
-        let (pragma_section, pragma_start, _) = consume_pragma(&module.source, &mut iter)?;
+        let (pragma_section, mut pragma_span) = consume_pragma(&module.source, &mut iter)?;
 
         // Consume pragma values
         if pragma_section == b"#module" {
             // Check if name is defined twice within the same file
             if self.has_pragma_module {
-                return Err(ParseError::new_error_str_at_pos(&module.source, pragma_start, "module name is defined twice"));
+                return Err(ParseError::new_error_str_at_span(&module.source, pragma_span, "module name is defined twice"));
             }
 
-            // Consume the domain-name
+            // Consume the domain-name, then record end of pragma
             let (module_name, module_span) = consume_domain_ident(&module.source, &mut iter)?;
-
-            // TODO: Fix with newer token range parsing
-            module.tokens.ranges[range_idx as usize].end = iter.token_index();
-            // if iter.next().is_some() {
-            //     return Err(ParseError::new_error_str_at_pos(&module.source, iter.last_valid_pos(), "expected end of #module pragma after module name"));
-            // }
+            let marker_last_token = iter.token_index();
 
             // Add to heap and symbol table
-            let pragma_span = InputSpan::from_positions(pragma_start, module_span.end);
+            pragma_span.end = module_span.end;
             let module_name = ctx.pool.intern(module_name);
             let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Module(PragmaModule{
                 this,
@@ -156,49 +151,51 @@ impl PassSymbols {
                 ));
             }
 
+            let marker = &mut module.tokens.markers[marker_index];
+            marker.last_token = marker_last_token;
+            marker.handled = true;
+
             module.name = Some((pragma_id, module_name));
             self.has_pragma_module = true;
         } else if pragma_section == b"#version" {
             // Check if version is defined twice within the same file
             if self.has_pragma_version {
-                return Err(ParseError::new_error_str_at_pos(&module.source, pragma_start, "module version is defined twice"));
+                return Err(ParseError::new_error_str_at_span(&module.source, pragma_span, "module version is defined twice"));
             }
 
             // Consume the version pragma
             let (version, version_span) = consume_integer_literal(&module.source, &mut iter, &mut self.buffer)?;
+            let marker_last_token = iter.token_index();
+
+            pragma_span.end = version_span.end;
             let pragma_id = ctx.heap.alloc_pragma(|this| Pragma::Version(PragmaVersion{
                 this,
-                span: InputSpan::from_positions(pragma_start, version_span.end),
+                span: pragma_span,
                 version,
             }));
             self.pragmas.push(pragma_id);
 
+            let marker = &mut module.tokens.markers[marker_index];
+            marker.last_token = marker_last_token;
+            marker.handled = true;
+
             module.version = Some((pragma_id, version as i64));
             self.has_pragma_version = true;
-        } else {
-            // Custom pragma, maybe we support this in the future, but for now
-            // we don't.
-            return Err(ParseError::new_error_str_at_pos(&module.source, pragma_start, "illegal pragma name"));
-        }
+        } // else: custom pragma used for something else, will be handled later (or rejected with an error)
 
         Ok(())
     }
 
-    fn visit_definition_range(&mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, range_idx: usize) -> Result<(), ParseError> {
+    fn visit_definition_marker(&mut self, modules: &[Module], module_idx: usize, ctx: &mut PassCtx, marker_index: usize) -> Result<(), ParseError> {
         let module = &modules[module_idx];
-        let range = &module.tokens.ranges[range_idx];
-        let definition_span = InputSpan::from_positions(
-            module.tokens.start_pos(range),
-            module.tokens.end_pos(range)
-        );
-        let mut iter = module.tokens.iter_range(range.start, range.end);
+        let marker = &module.tokens.markers[marker_index];
+        let mut iter = module.tokens.iter_range(marker.first_token, None);
 
         // First ident must be type of symbol
         let (kw_text, _) = consume_any_ident(&module.source, &mut iter).unwrap();
 
         // Retrieve identifier of definition
         let identifier = consume_ident_interned(&module.source, &mut iter, ctx)?;
-        println!("DEBUG: Parsing {} --- {}", String::from_utf8_lossy(kw_text).to_string(), identifier.value.as_str());
 
         let mut poly_vars = Vec::new();
         maybe_consume_comma_separated(
             TokenKind::OpenAngle, TokenKind::CloseAngle, &module.source, &mut iter, ctx,
@@ -214,28 +211,28 @@ impl PassSymbols {
         match kw_text {
             KW_STRUCT => {
                 let struct_def_id = ctx.heap.alloc_struct_definition(|this| {
-                    StructDefinition::new_empty(this, module.root_id, definition_span, identifier, poly_vars)
+                    StructDefinition::new_empty(this, module.root_id, identifier, poly_vars)
                 });
                 definition_class = DefinitionClass::Struct;
                 ast_definition_id = struct_def_id.upcast();
             },
             KW_ENUM => {
                 let enum_def_id = ctx.heap.alloc_enum_definition(|this| {
-                    EnumDefinition::new_empty(this, module.root_id, definition_span, identifier, poly_vars)
+                    EnumDefinition::new_empty(this, module.root_id, identifier, poly_vars)
                 });
                 definition_class = DefinitionClass::Enum;
                 ast_definition_id = enum_def_id.upcast();
             },
             KW_UNION => {
                 let union_def_id = ctx.heap.alloc_union_definition(|this| {
-                    UnionDefinition::new_empty(this, module.root_id, definition_span, identifier, poly_vars)
+                    UnionDefinition::new_empty(this, module.root_id, identifier, poly_vars)
                 });
                 definition_class = DefinitionClass::Union;
                 ast_definition_id = union_def_id.upcast()
             },
             KW_FUNCTION => {
                 let proc_def_id = ctx.heap.alloc_procedure_definition(|this| {
-                    ProcedureDefinition::new_empty(this, module.root_id, definition_span, ProcedureKind::Function, identifier, poly_vars)
+                    ProcedureDefinition::new_empty(this, module.root_id, ProcedureKind::Function, identifier, poly_vars)
                 });
                 definition_class = DefinitionClass::Function;
                 ast_definition_id = proc_def_id.upcast();
@@ -247,7 +244,7 @@ impl PassSymbols {
                     ProcedureKind::Composite
                 };
                 let proc_def_id = ctx.heap.alloc_procedure_definition(|this| {
-                    ProcedureDefinition::new_empty(this, module.root_id, definition_span, procedure_kind, identifier, poly_vars)
+                    ProcedureDefinition::new_empty(this, module.root_id, procedure_kind, identifier, poly_vars)
                 });
                 definition_class = DefinitionClass::Component;
                 ast_definition_id = proc_def_id.upcast();
@@ -260,7 +257,6 @@ impl PassSymbols {
             variant: SymbolVariant::Definition(SymbolDefinition{
                 defined_in_module: module.root_id,
                 defined_in_scope: SymbolScope::Module(module.root_id),
-                definition_span,
                 identifier_span: ident_span,
                 imported_at: None,
                 class: definition_class,
diff --git a/src/protocol/parser/pass_tokenizer.rs b/src/protocol/parser/pass_tokenizer.rs
index 6cb83d224693328d20a184c3c0bb26eb8c9b2482..e00569be823498ec4cb3e1b2258be896d29a78c0 100644
--- a/src/protocol/parser/pass_tokenizer.rs
+++ b/src/protocol/parser/pass_tokenizer.rs
@@ -66,9 +66,11 @@ impl PassTokenizer {
             } else if is_identifier_start(c) {
                 let ident = self.consume_identifier(source, target)?;
 
-                if demarks_definition(ident) {
+                if demarks_symbol(ident) {
+                    self.emit_marker(target, TokenMarkerKind::Definition, token_index);
                     self.push_range(target, TokenRangeKind::Definition, token_index);
                 } else if demarks_import(ident) {
+                    self.emit_marker(target, TokenMarkerKind::Import, token_index);
                     self.push_range(target, TokenRangeKind::Import, token_index);
                 }
             } else if is_integer_literal_start(c) {
@@ -76,6 +78,7 @@ impl PassTokenizer {
             } else if is_pragma_start_or_pound(c) {
                 let was_pragma = self.consume_pragma_or_pound(c, source, target)?;
                 if was_pragma {
+                    self.emit_marker(target, TokenMarkerKind::Pragma, token_index);
                     self.push_range(target, TokenRangeKind::Pragma, token_index);
                 }
             } else if self.is_line_comment_start(c, source) {
@@ -655,6 +658,22 @@ impl PassTokenizer {
         }
     }
 
+    fn emit_marker(&mut self, target: &mut TokenBuffer, kind: TokenMarkerKind, first_token: u32) {
+        debug_assert!(
+            target.markers
+                .last().map(|v| v.first_token < first_token)
+                .unwrap_or(true)
+        );
+
+        target.markers.push(TokenMarker{
+            kind,
+            curly_depth: self.curly_stack.len() as u32,
+            first_token,
+            last_token: u32::MAX,
+            handled: false,
+        });
+    }
+
     fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token_idx: u32) {
         let new_range_idx = target.ranges.len() as i32;
         let parent_idx = self.stack_idx as i32;
@@ -727,7 +746,7 @@ impl PassTokenizer {
 }
 
 // Helpers for characters
-fn demarks_definition(ident: &[u8]) -> bool {
+fn demarks_symbol(ident: &[u8]) -> bool {
     return
         ident == KW_STRUCT ||
         ident == KW_ENUM ||
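`emit_marker` above encodes two details worth noting: markers must be pushed in strictly
increasing token order (the `debug_assert!`), and each marker records the curly-brace depth
at which its keyword appeared, presumably so later passes can distinguish top-level keywords
from nested ones. A condensed sketch of the ordering invariant (simplified types, hypothetical
free function):

```rust
#[derive(Debug)]
struct TokenMarker {
    curly_depth: u32,
    first_token: u32,
}

/// Push a marker, asserting the strictly increasing token order that the
/// tokenizer guarantees by walking the source front to back.
fn emit_marker(markers: &mut Vec<TokenMarker>, curly_depth: u32, first_token: u32) {
    debug_assert!(markers.last().map_or(true, |m| m.first_token < first_token));
    markers.push(TokenMarker { curly_depth, first_token });
}
```
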
diff --git a/src/protocol/parser/pass_typing.rs b/src/protocol/parser/pass_typing.rs
index d5cb3b9311832333ccdf6af675baabc98e6c5b32..99234f7eb2f1dec4b63694868262d7f8d24f1e9c 100644
--- a/src/protocol/parser/pass_typing.rs
+++ b/src/protocol/parser/pass_typing.rs
@@ -1223,6 +1223,7 @@ impl PassTyping {
         self.procedure_id = id;
         self.procedure_kind = procedure_def.kind;
         let body_id = procedure_def.body;
+        let procedure_is_builtin = procedure_def.source.is_builtin();
 
         debug_log!("{}", "-".repeat(50));
         debug_log!("Visiting procedure: '{}' (id: {}, kind: {:?})", procedure_def.identifier.value.as_str(), id.0.index, procedure_def.kind);
@@ -1245,7 +1246,11 @@ impl PassTyping {
 
         // Visit all of the expressions within the body
         self.parent_index = None;
-        return self.visit_block_stmt(ctx, body_id);
+        if !procedure_is_builtin {
+            return self.visit_block_stmt(ctx, body_id);
+        } else {
+            return Ok(());
+        }
     }
 
     // Statements
diff --git a/src/protocol/parser/pass_validation_linking.rs b/src/protocol/parser/pass_validation_linking.rs
index 5490af080f9b8eec88f98caa0ec7b0be6ad14862..19b463957c8388f64b0bd8eefdd1ea888cf44554 100644
--- a/src/protocol/parser/pass_validation_linking.rs
+++ b/src/protocol/parser/pass_validation_linking.rs
@@ -200,6 +200,7 @@ impl Visitor for PassValidationLinking {
         let definition = &ctx.heap[id];
 
         let body_id = definition.body;
+        let definition_is_builtin = definition.source.is_builtin();
         let section = self.variable_buffer.start_section_initialized(&definition.parameters);
         for variable_idx in 0..section.len() {
             let variable_id = section[variable_idx];
@@ -207,8 +208,11 @@ impl Visitor for PassValidationLinking {
         }
         section.forget();
 
-        // Visit statements in function body
-        self.visit_block_stmt(ctx, body_id)?;
+        // Visit statements in function body, if present at all
+        if !definition_is_builtin {
+            self.visit_block_stmt(ctx, body_id)?;
+        }
+
         self.pop_scope(old_scope);
 
         self.resolve_pending_control_flow_targets(ctx)?;
diff --git a/src/protocol/parser/symbol_table.rs b/src/protocol/parser/symbol_table.rs
index 0d64b4d7d5515fa4c0c530f8dc43dff3d74f36b6..088dc52d102199b6cf2e4b2f981cb4aebc30af6f 100644
--- a/src/protocol/parser/symbol_table.rs
+++ b/src/protocol/parser/symbol_table.rs
@@ -85,7 +85,6 @@ pub struct SymbolDefinition {
     // spans and module IDs
     pub defined_in_module: RootId,
     pub defined_in_scope: SymbolScope,
-    pub definition_span: InputSpan, // full span of definition
     pub identifier_span: InputSpan, // span of just the identifier
     // Location where the symbol is introduced in its scope
     pub imported_at: Option<InputSpan>,
@@ -231,6 +230,14 @@ impl SymbolTable {
         Ok(())
     }
 
+    /// Inserts a symbol into the global scope. The symbol necessarily
+    /// collides with its original definition in a module scope, so we
+    /// deliberately do *not* check for collisions here.
+    pub(crate) fn insert_symbol_in_global_scope(&mut self, symbol: Symbol) {
+        let scoped_symbols = self.scope_lookup.get_mut(&SymbolScope::Global).unwrap();
+        scoped_symbols.symbols.push(symbol);
+    }
+
     /// Retrieves a symbol by name by searching in a particular scope and that scope's parents. The
     /// returned symbol may both be imported as defined within any of the searched scopes.
     pub(crate) fn get_symbol_by_name(
diff --git a/src/protocol/parser/token_parsing.rs b/src/protocol/parser/token_parsing.rs
index 1a81840b3c5a4c112d01c7cd0e032cc66ba87ecd..4de8f5c7636b1fc002d1ebaf7473666a274234b6 100644
--- a/src/protocol/parser/token_parsing.rs
+++ b/src/protocol/parser/token_parsing.rs
@@ -86,6 +86,15 @@ pub(crate) const KW_TYPE_CHAR: &'static [u8] = KW_TYPE_CHAR_STR.as_bytes();
 pub(crate) const KW_TYPE_STRING: &'static [u8] = KW_TYPE_STRING_STR.as_bytes();
 pub(crate) const KW_TYPE_INFERRED: &'static [u8] = KW_TYPE_INFERRED_STR.as_bytes();
 
+// Builtin pragma types. These are not usable by the programmer, only by the
+// standard library. Their existence hints at the fact that we need a
+// different system (e.g. function overloading).
+pub(crate) const PRAGMA_TYPE_VOID: &'static [u8] = b"#type_void";
+pub(crate) const PRAGMA_TYPE_PORTLIKE: &'static [u8] = b"#type_portlike";
+pub(crate) const PRAGMA_TYPE_INTEGERLIKE: &'static [u8] = b"#type_integerlike";
+pub(crate) const PRAGMA_TYPE_ARRAYLIKE: &'static [u8] = b"#type_arraylike";
+
+
 /// A special trait for when consuming comma-separated things such that we can
 /// push them onto a `Vec` and onto a `ScopedSection`. As we monomorph for
 /// very specific comma-separated cases I don't expect polymorph bloat.
@@ -449,13 +458,13 @@ fn parse_escaped_character(source: &InputSource, literal_span: InputSpan, v: u8)
     Ok(result)
 }
 
-pub(crate) fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> {
+pub(crate) fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputSpan), ParseError> {
     if Some(TokenKind::Pragma) != iter.next() {
         return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a pragma"));
     }
-    let (pragma_start, pragma_end) = iter.next_positions();
+    let pragma_span = iter.next_span();
     iter.consume();
-    Ok((source.section_at_pos(pragma_start, pragma_end), pragma_start, pragma_end))
+    Ok((source.section_at_span(pragma_span), pragma_span))
 }
 
 pub(crate) fn has_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> bool {
diff --git a/src/protocol/parser/tokens.rs b/src/protocol/parser/tokens.rs
index d8c6b7d25b01b89aea578664be187660c0a025bb..72c019ae259a40e7a4d0e92e0e6b4ea93e8183f8 100644
--- a/src/protocol/parser/tokens.rs
+++ b/src/protocol/parser/tokens.rs
@@ -170,6 +170,29 @@ impl Token {
     }
 }
 
+#[derive(Debug, Clone, Copy)]
+pub enum TokenMarkerKind {
+    Pragma,
+    Import,
+    Definition,
+}
+
+/// A marker for a specific token. Markers are stored separately from the
+/// array of tokens, and are used for the initial discovery of symbols, the
+/// module name, and imports.
+#[derive(Debug)]
+pub struct TokenMarker {
+    pub kind: TokenMarkerKind,
+    pub curly_depth: u32,
+    // Indices into the token buffer. The first token is inclusive and set upon
+    // tokenization, the last token is set at a later stage in parsing (e.g.
+    // at symbol discovery we may parse some of the `Pragma` tokens and set the
+    // last parsed token)
+    pub first_token: u32,
+    pub last_token: u32,
+    pub handled: bool,
+}
+
 /// The kind of token ranges that are specially parsed by the tokenizer.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum TokenRangeKind {
@@ -204,31 +227,25 @@ pub struct TokenRange {
 
 pub struct TokenBuffer {
     pub tokens: Vec<Token>,
+    pub markers: Vec<TokenMarker>,
     pub ranges: Vec<TokenRange>,
 }
 
 impl TokenBuffer {
     pub(crate) fn new() -> Self {
-        Self{ tokens: Vec::new(), ranges: Vec::new() }
-    }
-
-    pub(crate) fn iter_range<'a>(&'a self, inclusive_start: u32, exclusive_end: u32) -> TokenIter<'a> {
-        debug_assert!(exclusive_end as usize <= self.tokens.len());
-        TokenIter::new(self, inclusive_start as usize, exclusive_end as usize)
-    }
-
-    pub(crate) fn start_pos(&self, range: &TokenRange) -> InputPosition {
-        self.tokens[range.start as usize].pos
+        return Self{
+            tokens: Vec::new(),
+            markers: Vec::new(),
+            ranges: Vec::new()
+        };
     }
 
-    pub(crate) fn end_pos(&self, range: &TokenRange) -> InputPosition {
-        let last_token = &self.tokens[range.end as usize - 1];
-        if last_token.kind == TokenKind::SpanEnd {
-            return last_token.pos
-        } else {
-            debug_assert!(!last_token.kind.has_span_end());
-            return last_token.pos.with_offset(last_token.kind.num_characters());
-        }
+    pub(crate) fn iter_range(
+        &self, inclusive_start: u32, exclusive_end: Option<u32>
+    ) -> TokenIter {
+        let exclusive_end = exclusive_end.unwrap_or(self.tokens.len() as u32) as usize;
+        debug_assert!(exclusive_end <= self.tokens.len());
+        TokenIter::new(self, inclusive_start as usize, exclusive_end)
     }
 }
diff --git a/src/protocol/tests/utils.rs b/src/protocol/tests/utils.rs
index 735cbdc11c4a0e1c5eeb1c13f21d34f281ea3d49..d0694aa883873531183aedd7c23ba71dee352d16 100644
--- a/src/protocol/tests/utils.rs
+++ b/src/protocol/tests/utils.rs
@@ -600,7 +600,8 @@ impl<'a> FunctionTester<'a> {
 
         // Find the first occurrence of the expression after the definition of
         // the function, we'll check that it is included in the body later.
-        let mut outer_match_idx = self.def.span.begin.offset as usize;
+        let body = &self.ctx.heap[self.def.body];
+        let mut outer_match_idx = body.span.begin.offset as usize;
         while outer_match_idx < module.source.input.len() {
             if module.source.input[outer_match_idx..].starts_with(outer_match.as_bytes()) {
                 break;
diff --git a/src/protocol/token_writer.rs b/src/protocol/token_writer.rs
index 37d52770c34a4efb22b06de483e15060d773c12b..ac28761f48e5a136126b1c0addd1b37782400b73 100644
--- a/src/protocol/token_writer.rs
+++ b/src/protocol/token_writer.rs
@@ -5,7 +5,7 @@ use std::io::Write as IOWrite;
 
 use crate::protocol::input_source::{InputSource, InputSpan};
 use crate::protocol::parser::Module;
-use crate::protocol::tokens::{Token, TokenBuffer, TokenKind, TokenRange, TokenIter, TokenRangeKind};
+use crate::protocol::tokens::{Token, TokenKind, TokenRange};
 
 pub(crate) struct TokenWriter {
     buffer: String,
diff --git a/std/std.global.pdl b/std/std.global.pdl
index 49e3269e94b9c2443b72716c0e2981f16edd1ee0..6ad5a3aeb2b8fb5597794a4d8ac15bd4462728b4 100644
--- a/std/std.global.pdl
+++ b/std/std.global.pdl
@@ -7,8 +7,8 @@
 func get(in input) -> T { #builtin }
 func put(out output, T value) -> #type_void { #builtin }
-func fires(#type_portlike ) -> bool { #builtin }
-func create(#type_integerlike length) -> T[] { #builtin }
-func length(#type_arraylike array) -> u32 { #builtin }
+func fires(#type_portlike port) -> bool { #builtin }
+func create(#type_integerlike len) -> T[] { #builtin }
+func length(#type_arraylike array) -> u32 { #builtin }
 func assert(bool condition) -> #type_void { #builtin }
 func print(string message) -> #type_void { #builtin }
\ No newline at end of file
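
The `#type_*` signatures in the standard library above only parse because
`consume_parser_type_element` (see the pass_definitions_types.rs hunks) now tries a pragma
token before falling back to ordinary identifiers. A condensed, self-contained sketch of
that dispatch; the constants mirror token_parsing.rs, while the error type is simplified:

```rust
const PRAGMA_TYPE_VOID: &[u8] = b"#type_void";
const PRAGMA_TYPE_PORTLIKE: &[u8] = b"#type_portlike";
const PRAGMA_TYPE_INTEGERLIKE: &[u8] = b"#type_integerlike";
const PRAGMA_TYPE_ARRAYLIKE: &[u8] = b"#type_arraylike";

#[derive(Debug, PartialEq)]
enum ParserTypeVariant { Void, InputOrOutput, IntegerLike, ArrayLike }

/// Maps a pragma's text to a builtin type variant. Both unknown pragmas and
/// known ones outside compiler files (`allow_compiler_types == false`) are
/// rejected, matching the patch's "unexpected pragma in type" error.
fn pragma_type(text: &[u8], allow_compiler_types: bool) -> Result<ParserTypeVariant, &'static str> {
    use ParserTypeVariant as PTV;
    let variant = match text {
        t if t == PRAGMA_TYPE_VOID => Some(PTV::Void),
        t if t == PRAGMA_TYPE_PORTLIKE => Some(PTV::InputOrOutput),
        t if t == PRAGMA_TYPE_INTEGERLIKE => Some(PTV::IntegerLike),
        t if t == PRAGMA_TYPE_ARRAYLIKE => Some(PTV::ArrayLike),
        _ => None,
    };
    match variant {
        Some(v) if allow_compiler_types => Ok(v),
        _ => Err("unexpected pragma in type"),
    }
}
```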