diff --git a/src/protocol/parser/mod.rs b/src/protocol/parser/mod.rs index 39de7073241552963fc106ab248880774a9e45fa..6d2a3281c144df2d3547c8bf2622498bdb2dd6a0 100644 --- a/src/protocol/parser/mod.rs +++ b/src/protocol/parser/mod.rs @@ -204,12 +204,6 @@ impl Parser { arch: &self.arch, }; - if let Some(filename) = &self.write_tokens_to { - let mut writer = TokenWriter::new(); - let mut file = std::fs::File::create(std::path::Path::new(filename)).unwrap(); - writer.write(&mut file, &self.modules); - } - // Advance all modules to the phase where all symbols are scanned for module_idx in 0..self.modules.len() { self.pass_symbols.parse(&mut self.modules, module_idx, &mut pass_ctx)?; @@ -222,6 +216,12 @@ impl Parser { self.pass_definitions.parse(&mut self.modules, module_idx, &mut pass_ctx)?; } + if let Some(filename) = &self.write_tokens_to { + let mut writer = TokenWriter::new(); + let mut file = std::fs::File::create(std::path::Path::new(filename)).unwrap(); + writer.write(&mut file, &self.modules); + } + // Add every known type to the type table self.type_table.build_base_types(&mut self.modules, &mut pass_ctx)?; diff --git a/src/protocol/parser/pass_tokenizer.rs b/src/protocol/parser/pass_tokenizer.rs index e00569be823498ec4cb3e1b2258be896d29a78c0..07f7dbc2c2eef970825fd2681526cac56e3e701f 100644 --- a/src/protocol/parser/pass_tokenizer.rs +++ b/src/protocol/parser/pass_tokenizer.rs @@ -21,15 +21,12 @@ pub(crate) struct PassTokenizer { // unmatched opening braces, unmatched closing braces are detected // immediately. curly_stack: Vec, - // Points to an element in the `TokenBuffer.ranges` variable. - stack_idx: usize, } impl PassTokenizer { pub(crate) fn new() -> Self { Self{ curly_stack: Vec::with_capacity(32), - stack_idx: 0 } } @@ -37,23 +34,6 @@ impl PassTokenizer { // Assert source and buffer are at start debug_assert_eq!(source.pos().offset, 0); debug_assert!(target.tokens.is_empty()); - debug_assert!(target.ranges.is_empty()); - - // Set up for tokenization by pushing the first range onto the stack. - // This range may get transformed into the appropriate range kind later, - // see `push_range` and `pop_range`. - self.stack_idx = 0; - target.ranges.push(TokenRange{ - parent_idx: NO_RELATION, - range_kind: TokenRangeKind::Module, - curly_depth: 0, - start: 0, - end: 0, - num_child_ranges: 0, - first_child_idx: NO_RELATION, - last_child_idx: NO_RELATION, - next_sibling_idx: NO_RELATION, - }); // Main tokenization loop while let Some(c) = source.next() { @@ -68,10 +48,8 @@ impl PassTokenizer { if demarks_symbol(ident) { self.emit_marker(target, TokenMarkerKind::Definition, token_index); - self.push_range(target, TokenRangeKind::Definition, token_index); } else if demarks_import(ident) { self.emit_marker(target, TokenMarkerKind::Import, token_index); - self.push_range(target, TokenRangeKind::Import, token_index); } } else if is_integer_literal_start(c) { self.consume_number(source, target)?; @@ -79,7 +57,6 @@ impl PassTokenizer { let was_pragma = self.consume_pragma_or_pound(c, source, target)?; if was_pragma { self.emit_marker(target, TokenMarkerKind::Pragma, token_index); - self.push_range(target, TokenRangeKind::Pragma, token_index); } } else if self.is_line_comment_start(c, source) { self.consume_line_comment(source, target)?; @@ -87,10 +64,6 @@ impl PassTokenizer { self.consume_block_comment(source, target)?; } else if is_whitespace(c) { self.consume_whitespace(source); - let range = &target.ranges[self.stack_idx]; - if range.range_kind == TokenRangeKind::Pragma { - self.pop_range(target, target.tokens.len() as u32); - } } else { let was_punctuation = self.maybe_parse_punctuation(c, source, target)?; if let Some((token, token_pos)) = was_punctuation { @@ -106,20 +79,6 @@ impl PassTokenizer { } self.curly_stack.pop(); - - let range = &target.ranges[self.stack_idx]; - if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_stack.len() as u32 { - self.pop_range(target, target.tokens.len() as u32); - } - - // Exit early if we have more closing curly braces than - // opening curly braces - } else if token == TokenKind::SemiColon { - // Check if this marks the end of an import - let range = &target.ranges[self.stack_idx]; - if range.range_kind == TokenRangeKind::Import { - self.pop_range(target, target.tokens.len() as u32); - } } } else { return Err(ParseError::new_error_str_at_pos( @@ -143,21 +102,6 @@ impl PassTokenizer { )); } - // Ranges that did not depend on curly braces may have missing tokens. - // So close all of the active tokens - while self.stack_idx != 0 { - self.pop_range(target, target.tokens.len() as u32); - } - - // And finally, we may have a token range at the end that doesn't belong - // to a range yet, so insert a "code" range if this is the case. - debug_assert_eq!(self.stack_idx, 0); - let last_registered_idx = target.ranges[0].end; - let last_token_idx = target.tokens.len() as u32; - if last_registered_idx != last_token_idx { - self.add_code_range(target, 0, last_registered_idx, last_token_idx, NO_RELATION); - } - Ok(()) } @@ -624,40 +568,6 @@ impl PassTokenizer { has_newline } - fn add_code_range( - &mut self, target: &mut TokenBuffer, parent_idx: i32, - code_start_idx: u32, code_end_idx: u32, next_sibling_idx: i32 - ) { - let new_range_idx = target.ranges.len() as i32; - let parent_range = &mut target.ranges[parent_idx as usize]; - debug_assert_ne!(parent_range.end, code_end_idx, "called push_code_range without a need to do so"); - - let sibling_idx = parent_range.last_child_idx; - - parent_range.last_child_idx = new_range_idx; - parent_range.end = code_end_idx; - parent_range.num_child_ranges += 1; - - let curly_depth = self.curly_stack.len() as u32; - target.ranges.push(TokenRange{ - parent_idx, - range_kind: TokenRangeKind::Code, - curly_depth, - start: code_start_idx, - end: code_end_idx, - num_child_ranges: 0, - first_child_idx: NO_RELATION, - last_child_idx: NO_RELATION, - next_sibling_idx, - }); - - // Fix up the sibling indices - if sibling_idx != NO_RELATION { - let sibling_range = &mut target.ranges[sibling_idx as usize]; - sibling_range.next_sibling_idx = new_range_idx; - } - } - fn emit_marker(&mut self, target: &mut TokenBuffer, kind: TokenMarkerKind, first_token: u32) { debug_assert!( target.markers @@ -674,65 +584,6 @@ impl PassTokenizer { }); } - fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token_idx: u32) { - let new_range_idx = target.ranges.len() as i32; - let parent_idx = self.stack_idx as i32; - let parent_range = &mut target.ranges[self.stack_idx]; - - if parent_range.first_child_idx == NO_RELATION { - parent_range.first_child_idx = new_range_idx; - } - - let last_registered_idx = parent_range.end; - if last_registered_idx != first_token_idx { - self.add_code_range(target, parent_idx, last_registered_idx, first_token_idx, new_range_idx + 1); - } - - // Push the new range - self.stack_idx = target.ranges.len(); - let curly_depth = self.curly_stack.len() as u32; - target.ranges.push(TokenRange{ - parent_idx, - range_kind, - curly_depth, - start: first_token_idx, - end: first_token_idx, // modified when popped - num_child_ranges: 0, - first_child_idx: NO_RELATION, - last_child_idx: NO_RELATION, - next_sibling_idx: NO_RELATION - }) - } - - fn pop_range(&mut self, target: &mut TokenBuffer, end_token_idx: u32) { - let popped_idx = self.stack_idx as i32; - let popped_range = &mut target.ranges[self.stack_idx]; - debug_assert!(self.stack_idx != 0, "attempting to pop top-level range"); - - // Fix up the current range before going back to parent - popped_range.end = end_token_idx; - debug_assert_ne!(popped_range.start, end_token_idx); - - // Go back to parent and fix up its child pointers, but remember the - // last child, so we can link it to the newly popped range. - self.stack_idx = popped_range.parent_idx as usize; - let parent = &mut target.ranges[self.stack_idx]; - if parent.first_child_idx == NO_RELATION { - parent.first_child_idx = popped_idx; - } - let prev_sibling_idx = parent.last_child_idx; - parent.last_child_idx = popped_idx; - parent.end = end_token_idx; - parent.num_child_ranges += 1; - - // Fix up the sibling (if it exists) - if prev_sibling_idx != NO_RELATION { - let sibling = &mut target.ranges[prev_sibling_idx as usize]; - sibling.next_sibling_idx = popped_idx; - } - } - - fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> { match source.next() { Some(c) if !c.is_ascii() => { diff --git a/src/protocol/parser/tokens.rs b/src/protocol/parser/tokens.rs index 72c019ae259a40e7a4d0e92e0e6b4ea93e8183f8..d64b9572963b0fd63ea7ee850c2d4bae92e39f1a 100644 --- a/src/protocol/parser/tokens.rs +++ b/src/protocol/parser/tokens.rs @@ -193,42 +193,9 @@ pub struct TokenMarker { pub handled: bool, } -/// The kind of token ranges that are specially parsed by the tokenizer. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum TokenRangeKind { - Module, - Pragma, - Import, - Definition, - Code, -} - -pub const NO_RELATION: i32 = -1; -pub const NO_SIBLING: i32 = NO_RELATION; - -/// A range of tokens with a specific meaning. Such a range is part of a tree -/// where each parent tree envelops all of its children. -#[derive(Debug)] -pub struct TokenRange { - // Index of parent in `TokenBuffer.ranges`, does not have a parent if the - // range kind is Module, in that case the parent index is -1. - pub parent_idx: i32, - pub range_kind: TokenRangeKind, - pub curly_depth: u32, - // Offsets into `TokenBuffer.ranges`: the tokens belonging to this range. - pub start: u32, // first token (inclusive index) - pub end: u32, // last token (exclusive index) - // Child ranges - pub num_child_ranges: u32, // Number of subranges - pub first_child_idx: i32, // First subrange (or -1 if no subranges) - pub last_child_idx: i32, // Last subrange (or -1 if no subranges) - pub next_sibling_idx: i32, // Next subrange (or -1 if no next subrange) -} - pub struct TokenBuffer { pub tokens: Vec, pub markers: Vec, - pub ranges: Vec, } impl TokenBuffer { @@ -236,7 +203,6 @@ impl TokenBuffer { return Self{ tokens: Vec::new(), markers: Vec::new(), - ranges: Vec::new() }; } diff --git a/src/protocol/token_writer.rs b/src/protocol/token_writer.rs index ac28761f48e5a136126b1c0addd1b37782400b73..a6028e9b5a315217ffc3d38ff94bf4076b908393 100644 --- a/src/protocol/token_writer.rs +++ b/src/protocol/token_writer.rs @@ -1,11 +1,11 @@ #![allow(dead_code)] -use std::fmt::Write; +use std::fmt::{Write, Error as FmtError}; use std::io::Write as IOWrite; use crate::protocol::input_source::{InputSource, InputSpan}; use crate::protocol::parser::Module; -use crate::protocol::tokens::{Token, TokenKind, TokenRange}; +use crate::protocol::tokens::{Token, TokenKind, TokenMarker}; pub(crate) struct TokenWriter { buffer: String, @@ -35,77 +35,33 @@ impl TokenWriter { None => self.buffer.push_str("Unnamed module\n"), } - - let mut range_index = -1; - if !module.tokens.ranges.is_empty() { - range_index = 0; - } - - while range_index >= 0 { - range_index = self.write_token_range( - &module.source, &module.tokens.tokens, &module.tokens.ranges, range_index, 1 - ); - } + self.write_marker_array(&module.tokens.markers, 1).expect("write markers"); + self.write_token_array(&module.source, &module.tokens.tokens, 1).expect("write tokens"); } - /// Writes a single token range. Recurses if there are any child ranges. - /// Returns the next token range index to iterate over (or a negative - /// number, if there are no more sibling ranges). - fn write_token_range(&mut self, source: &InputSource, tokens: &[Token], ranges: &[TokenRange], range_index: i32, indent: u32) -> i32 { - // Write range kind - let range = &ranges[range_index as usize]; - self.write_dashed_indent(indent); - writeln!(self.buffer, "Range: {:?}", range.range_kind); - - // Write tokens/lines it spans - let first_token_pos = tokens[range.start as usize].pos; - - let last_token_pos = if (range.end as usize) < tokens.len() { - tokens[range.end as usize].pos - } else { - tokens.last().unwrap().pos - }; - let first_source_col = source.get_column(first_token_pos); - let last_source_col = source.get_column(last_token_pos); - + fn write_marker_array(&mut self, markers: &[TokenMarker], indent: u32) -> Result<(), FmtError> { self.write_indent(indent); - writeln!( - self.buffer, "Source: token {} to {}, file {}:{}:{} to {}:{}", - range.start, range.end, source.filename, - first_token_pos.line, first_source_col, - last_token_pos.line, last_source_col - ); - - let next_sibling_index = range.next_sibling_idx; - if range.num_child_ranges == 0 { - // No child ranges, so dump the tokens here - debug_assert!(range.first_child_idx < 0); - self.write_token_array(source, tokens, range, indent); - } else { - // Child ranges - debug_assert!(range.first_child_idx >= 0); - self.write_indent(indent); - writeln!(self.buffer, "Children: ["); - - let mut range_index = range.first_child_idx; - while range_index >= 0 { - range_index = self.write_token_range(source, tokens, ranges, range_index, indent + 1); - } + writeln!(self.buffer, "Markers: [")?; - self.write_indent(indent); - writeln!(self.buffer, "]"); + let marker_indent = indent + 1; + for marker in markers { + self.write_indent(marker_indent); + writeln!(self.buffer, "{:?}", marker)?; } - // Wrote everything, return the next sibling token range - return next_sibling_index; + self.write_indent(indent); + writeln!(self.buffer, "]")?; + + return Ok(()); } - fn write_token_array(&mut self, source: &InputSource, tokens: &[Token], range: &TokenRange, indent: u32) { + fn write_token_array(&mut self, source: &InputSource, tokens: &[Token], indent: u32) -> Result<(), FmtError> { self.write_indent(indent); - writeln!(self.buffer, "Tokens: ["); + writeln!(self.buffer, "Tokens: [")?; + let num_tokens = tokens.len(); let token_indent = indent + 1; - for token_index in range.start as usize..range.end as usize { + for token_index in 0..num_tokens { // Skip uninteresting tokens let token = &tokens[token_index]; if token.kind == TokenKind::SpanEnd { @@ -113,19 +69,21 @@ impl TokenWriter { } self.write_indent(token_indent); - write!(self.buffer, "{:?} (index {})", token.kind, token_index); + write!(self.buffer, "{:?} (index {})", token.kind, token_index)?; if token.kind.has_span_end() { let token_start = token.pos; let token_end = tokens[token_index + 1].pos; let section = source.section_at_span(InputSpan::from_positions(token_start, token_end)); - writeln!(self.buffer, " text: {}", String::from_utf8_lossy(section)); + writeln!(self.buffer, " text: {}", String::from_utf8_lossy(section))?; } else { self.buffer.push('\n'); } } self.write_indent(indent); - writeln!(self.buffer, "]"); + writeln!(self.buffer, "]")?; + + return Ok(()); } fn write_dashed_indent(&mut self, indent: u32) {