Changeset - 2693293dc537
mh <contact@maxhenger.nl>, 2022-03-29 16:43:55
Remove code that produced token ranges
4 files changed with 27 insertions and 252 deletions:
src/protocol/parser/mod.rs
@@ -204,12 +204,6 @@ impl Parser {
             arch: &self.arch,
         };
 
-        if let Some(filename) = &self.write_tokens_to {
-            let mut writer = TokenWriter::new();
-            let mut file = std::fs::File::create(std::path::Path::new(filename)).unwrap();
-            writer.write(&mut file, &self.modules);
-        }
-
         // Advance all modules to the phase where all symbols are scanned
         for module_idx in 0..self.modules.len() {
             self.pass_symbols.parse(&mut self.modules, module_idx, &mut pass_ctx)?;
@@ -222,6 +216,12 @@ impl Parser {
             self.pass_definitions.parse(&mut self.modules, module_idx, &mut pass_ctx)?;
         }
 
+        if let Some(filename) = &self.write_tokens_to {
+            let mut writer = TokenWriter::new();
+            let mut file = std::fs::File::create(std::path::Path::new(filename)).unwrap();
+            writer.write(&mut file, &self.modules);
+        }
+
         // Add every known type to the type table
        self.type_table.build_base_types(&mut self.modules, &mut pass_ctx)?;
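Net effect of the two hunks above: the debug token dump moves from before symbol scanning to after definition parsing, so the `TokenWriter` output reflects the token markers after the parse passes have run over them. A condensed sketch of the resulting driver order (pass names and `write_tokens_to` are taken from this diff; the surrounding method is assumed):

    // Condensed sketch of the reordered driver; not the real method body.
    // 1. Run the symbol and definition passes over every module first.
    for module_idx in 0..self.modules.len() {
        self.pass_symbols.parse(&mut self.modules, module_idx, &mut pass_ctx)?;
    }
    for module_idx in 0..self.modules.len() {
        self.pass_definitions.parse(&mut self.modules, module_idx, &mut pass_ctx)?;
    }
    // 2. Only then dump the tokens, so the dump can show each marker's
    //    final state (e.g. its `handled` flag) rather than its initial one.
    if let Some(filename) = &self.write_tokens_to {
        let mut writer = TokenWriter::new();
        let mut file = std::fs::File::create(std::path::Path::new(filename)).unwrap();
        writer.write(&mut file, &self.modules);
    }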
src/protocol/parser/pass_tokenizer.rs
@@ -21,15 +21,12 @@ pub(crate) struct PassTokenizer {
     // unmatched opening braces, unmatched closing braces are detected
     // immediately.
     curly_stack: Vec<InputPosition>,
-    // Points to an element in the `TokenBuffer.ranges` variable.
-    stack_idx: usize,
 }
 
 impl PassTokenizer {
     pub(crate) fn new() -> Self {
         Self{
-            curly_stack: Vec::with_capacity(32),
-            stack_idx: 0
+            curly_stack: Vec::with_capacity(32)
         }
     }
@@ -37,23 +34,6 @@ impl PassTokenizer {
         // Assert source and buffer are at start
         debug_assert_eq!(source.pos().offset, 0);
         debug_assert!(target.tokens.is_empty());
-        debug_assert!(target.ranges.is_empty());
-
-        // Set up for tokenization by pushing the first range onto the stack.
-        // This range may get transformed into the appropriate range kind later,
-        // see `push_range` and `pop_range`.
-        self.stack_idx = 0;
-        target.ranges.push(TokenRange{
-            parent_idx: NO_RELATION,
-            range_kind: TokenRangeKind::Module,
-            curly_depth: 0,
-            start: 0,
-            end: 0,
-            num_child_ranges: 0,
-            first_child_idx: NO_RELATION,
-            last_child_idx: NO_RELATION,
-            next_sibling_idx: NO_RELATION,
-        });
-
         // Main tokenization loop
         while let Some(c) = source.next() {
@@ -68,10 +48,8 @@ impl PassTokenizer {
 
                 if demarks_symbol(ident) {
                     self.emit_marker(target, TokenMarkerKind::Definition, token_index);
-                    self.push_range(target, TokenRangeKind::Definition, token_index);
                 } else if demarks_import(ident) {
                     self.emit_marker(target, TokenMarkerKind::Import, token_index);
-                    self.push_range(target, TokenRangeKind::Import, token_index);
                 }
             } else if is_integer_literal_start(c) {
                 self.consume_number(source, target)?;
@@ -79,7 +57,6 @@ impl PassTokenizer {
                 let was_pragma = self.consume_pragma_or_pound(c, source, target)?;
                 if was_pragma {
                     self.emit_marker(target, TokenMarkerKind::Pragma, token_index);
-                    self.push_range(target, TokenRangeKind::Pragma, token_index);
                 }
             } else if self.is_line_comment_start(c, source) {
                 self.consume_line_comment(source, target)?;
@@ -87,10 +64,6 @@ impl PassTokenizer {
                 self.consume_block_comment(source, target)?;
             } else if is_whitespace(c) {
                 self.consume_whitespace(source);
-                let range = &target.ranges[self.stack_idx];
-                if range.range_kind == TokenRangeKind::Pragma {
-                    self.pop_range(target, target.tokens.len() as u32);
-                }
             } else {
                 let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
                 if let Some((token, token_pos)) = was_punctuation {
@@ -106,20 +79,6 @@ impl PassTokenizer {
                         }
 
                         self.curly_stack.pop();
-
-                        let range = &target.ranges[self.stack_idx];
-                        if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_stack.len() as u32 {
-                            self.pop_range(target, target.tokens.len() as u32);
-                        }
-
-                        // Exit early if we have more closing curly braces than
-                        // opening curly braces
-                    } else if token == TokenKind::SemiColon {
-                        // Check if this marks the end of an import
-                        let range = &target.ranges[self.stack_idx];
-                        if range.range_kind == TokenRangeKind::Import {
-                            self.pop_range(target, target.tokens.len() as u32);
-                        }
-                    }
                     }
                 } else {
                     return Err(ParseError::new_error_str_at_pos(
@@ -143,21 +102,6 @@ impl PassTokenizer {
             ));
         }
 
-        // Ranges that did not depend on curly braces may have missing tokens.
-        // So close all of the active tokens
-        while self.stack_idx != 0 {
-            self.pop_range(target, target.tokens.len() as u32);
-        }
-
-        // And finally, we may have a token range at the end that doesn't belong
-        // to a range yet, so insert a "code" range if this is the case.
-        debug_assert_eq!(self.stack_idx, 0);
-        let last_registered_idx = target.ranges[0].end;
-        let last_token_idx = target.tokens.len() as u32;
-        if last_registered_idx != last_token_idx {
-            self.add_code_range(target, 0, last_registered_idx, last_token_idx, NO_RELATION);
-        }
-
         Ok(())
     }
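With the hunks above applied, `tokenize` keeps no range state at all: the curly stack tracks brace depth, and the only record of interesting positions is the flat marker list. A minimal self-contained model of what survives (`handled` and the `emit_marker` parameter names appear verbatim in this changeset; the exact `TokenMarker` layout does not, so treat the fields as assumptions):

    // Minimal model of the surviving bookkeeping: a flat marker list
    // instead of a range tree. `kind`/`first_token` mirror emit_marker's
    // parameters; any further fields are assumptions.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub enum TokenMarkerKind { Pragma, Import, Definition }

    pub struct TokenMarker {
        pub kind: TokenMarkerKind,
        pub first_token: u32,
        pub handled: bool,
    }

    fn emit_marker(markers: &mut Vec<TokenMarker>, kind: TokenMarkerKind, first_token: u32) {
        // Where the old code also called push_range (and later pop_range on
        // the matching `}` or `;`), only the start point is recorded now.
        markers.push(TokenMarker{ kind, first_token, handled: false });
    }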
@@ -624,40 +568,6 @@ impl PassTokenizer {
         has_newline
     }
 
-    fn add_code_range(
-        &mut self, target: &mut TokenBuffer, parent_idx: i32,
-        code_start_idx: u32, code_end_idx: u32, next_sibling_idx: i32
-    ) {
-        let new_range_idx = target.ranges.len() as i32;
-        let parent_range = &mut target.ranges[parent_idx as usize];
-        debug_assert_ne!(parent_range.end, code_end_idx, "called push_code_range without a need to do so");
-
-        let sibling_idx = parent_range.last_child_idx;
-
-        parent_range.last_child_idx = new_range_idx;
-        parent_range.end = code_end_idx;
-        parent_range.num_child_ranges += 1;
-
-        let curly_depth = self.curly_stack.len() as u32;
-        target.ranges.push(TokenRange{
-            parent_idx,
-            range_kind: TokenRangeKind::Code,
-            curly_depth,
-            start: code_start_idx,
-            end: code_end_idx,
-            num_child_ranges: 0,
-            first_child_idx: NO_RELATION,
-            last_child_idx: NO_RELATION,
-            next_sibling_idx,
-        });
-
-        // Fix up the sibling indices
-        if sibling_idx != NO_RELATION {
-            let sibling_range = &mut target.ranges[sibling_idx as usize];
-            sibling_range.next_sibling_idx = new_range_idx;
-        }
-    }
-
     fn emit_marker(&mut self, target: &mut TokenBuffer, kind: TokenMarkerKind, first_token: u32) {
         debug_assert!(
             target.markers
@@ -674,65 +584,6 @@ impl PassTokenizer {
         });
     }
 
-    fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token_idx: u32) {
-        let new_range_idx = target.ranges.len() as i32;
-        let parent_idx = self.stack_idx as i32;
-        let parent_range = &mut target.ranges[self.stack_idx];
-
-        if parent_range.first_child_idx == NO_RELATION {
-            parent_range.first_child_idx = new_range_idx;
-        }
-
-        let last_registered_idx = parent_range.end;
-        if last_registered_idx != first_token_idx {
-            self.add_code_range(target, parent_idx, last_registered_idx, first_token_idx, new_range_idx + 1);
-        }
-
-        // Push the new range
-        self.stack_idx = target.ranges.len();
-        let curly_depth = self.curly_stack.len() as u32;
-        target.ranges.push(TokenRange{
-            parent_idx,
-            range_kind,
-            curly_depth,
-            start: first_token_idx,
-            end: first_token_idx, // modified when popped
-            num_child_ranges: 0,
-            first_child_idx: NO_RELATION,
-            last_child_idx: NO_RELATION,
-            next_sibling_idx: NO_RELATION
-        })
-    }
-
-    fn pop_range(&mut self, target: &mut TokenBuffer, end_token_idx: u32) {
-        let popped_idx = self.stack_idx as i32;
-        let popped_range = &mut target.ranges[self.stack_idx];
-        debug_assert!(self.stack_idx != 0, "attempting to pop top-level range");
-
-        // Fix up the current range before going back to parent
-        popped_range.end = end_token_idx;
-        debug_assert_ne!(popped_range.start, end_token_idx);
-
-        // Go back to parent and fix up its child pointers, but remember the
-        // last child, so we can link it to the newly popped range.
-        self.stack_idx = popped_range.parent_idx as usize;
-        let parent = &mut target.ranges[self.stack_idx];
-        if parent.first_child_idx == NO_RELATION {
-            parent.first_child_idx = popped_idx;
-        }
-        let prev_sibling_idx = parent.last_child_idx;
-        parent.last_child_idx = popped_idx;
-        parent.end = end_token_idx;
-        parent.num_child_ranges += 1;
-
-        // Fix up the sibling (if it exists)
-        if prev_sibling_idx != NO_RELATION {
-            let sibling = &mut target.ranges[prev_sibling_idx as usize];
-            sibling.next_sibling_idx = popped_idx;
-        }
-    }
-
     fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> {
         match source.next() {
            Some(c) if !c.is_ascii() => {
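For comparison with the old revision: the deleted `push_range`/`pop_range` pair maintained a stack discipline over `TokenBuffer.ranges`, with `stack_idx` always pointing at the innermost open range and every pop patching the parent's `end` so that parents enveloped their children. A toy reconstruction of just that invariant (sibling and child-count bookkeeping omitted; the deleted code used `i32` indices with `NO_RELATION == -1` rather than `Option<usize>`):

    // Toy model of the deleted range stack; assumes a root range was pushed
    // at index 0 before tokenization started, as the old tokenize() did.
    struct Range { parent: Option<usize>, start: u32, end: u32 }

    struct RangeStack { ranges: Vec<Range>, top: usize /* the old stack_idx */ }

    impl RangeStack {
        fn push(&mut self, start: u32) {
            let parent = Some(self.top);
            self.top = self.ranges.len();
            self.ranges.push(Range{ parent, start, end: start /* patched on pop */ });
        }

        fn pop(&mut self, end: u32) {
            assert_ne!(self.top, 0, "attempting to pop top-level range");
            self.ranges[self.top].end = end;
            self.top = self.ranges[self.top].parent.unwrap();
            self.ranges[self.top].end = end; // parents envelop their children
        }
    }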
src/protocol/parser/tokens.rs
@@ -193,42 +193,9 @@ pub struct TokenMarker {
     pub handled: bool,
 }
 
-/// The kind of token ranges that are specially parsed by the tokenizer.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum TokenRangeKind {
-    Module,
-    Pragma,
-    Import,
-    Definition,
-    Code,
-}
-
-pub const NO_RELATION: i32 = -1;
-pub const NO_SIBLING: i32 = NO_RELATION;
-
-/// A range of tokens with a specific meaning. Such a range is part of a tree
-/// where each parent tree envelops all of its children.
-#[derive(Debug)]
-pub struct TokenRange {
-    // Index of parent in `TokenBuffer.ranges`, does not have a parent if the
-    // range kind is Module, in that case the parent index is -1.
-    pub parent_idx: i32,
-    pub range_kind: TokenRangeKind,
-    pub curly_depth: u32,
-    // Offsets into `TokenBuffer.ranges`: the tokens belonging to this range.
-    pub start: u32,             // first token (inclusive index)
-    pub end: u32,               // last token (exclusive index)
-    // Child ranges
-    pub num_child_ranges: u32,  // Number of subranges
-    pub first_child_idx: i32,   // First subrange (or -1 if no subranges)
-    pub last_child_idx: i32,    // Last subrange (or -1 if no subranges)
-    pub next_sibling_idx: i32,  // Next subrange (or -1 if no next subrange)
-}
-
 pub struct TokenBuffer {
     pub tokens: Vec<Token>,
     pub markers: Vec<TokenMarker>,
-    pub ranges: Vec<TokenRange>,
 }
 
 impl TokenBuffer {
 
@@ -236,7 +203,6 @@ impl TokenBuffer {
         return Self{
             tokens: Vec::new(),
-            markers: Vec::new(),
-            ranges: Vec::new()
+            markers: Vec::new()
         };
     }
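With `TokenRangeKind` and `TokenRange` gone, `TokenBuffer` is a flat structure, and passes that used to walk the range tree can scan `markers` linearly instead. A hypothetical consumer, reusing the marker model sketched earlier; nothing in this diff shows the real consuming code:

    // Hypothetical: collect the first token index of every unhandled
    // import marker, replacing a walk over Import-kind token ranges.
    fn unhandled_imports(buffer: &TokenBuffer) -> Vec<u32> {
        buffer.markers.iter()
            .filter(|marker| marker.kind == TokenMarkerKind::Import && !marker.handled)
            .map(|marker| marker.first_token)
            .collect()
    }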
src/protocol/token_writer.rs
 #![allow(dead_code)]
 
-use std::fmt::Write;
+use std::fmt::{Write, Error as FmtError};
 use std::io::Write as IOWrite;
 
 use crate::protocol::input_source::{InputSource, InputSpan};
 use crate::protocol::parser::Module;
-use crate::protocol::tokens::{Token, TokenKind, TokenRange};
+use crate::protocol::tokens::{Token, TokenKind, TokenMarker};
 
 pub(crate) struct TokenWriter {
     buffer: String,
 
@@ -35,77 +35,33 @@ impl TokenWriter {
             None => self.buffer.push_str("Unnamed module\n"),
         }
 
-
-        let mut range_index = -1;
-        if !module.tokens.ranges.is_empty() {
-            range_index = 0;
-        }
-
-        while range_index >= 0 {
-            range_index = self.write_token_range(
-                &module.source, &module.tokens.tokens, &module.tokens.ranges, range_index, 1
-            );
-        }
+        self.write_marker_array(&module.tokens.markers, 1).expect("write markers");
+        self.write_token_array(&module.source, &module.tokens.tokens, 1).expect("write tokens");
     }
 
-    /// Writes a single token range. Recurses if there are any child ranges.
-    /// Returns the next token range index to iterate over (or a negative
-    /// number, if there are no more sibling ranges).
-    fn write_token_range(&mut self, source: &InputSource, tokens: &[Token], ranges: &[TokenRange], range_index: i32, indent: u32) -> i32 {
-        // Write range kind
-        let range = &ranges[range_index as usize];
-        self.write_dashed_indent(indent);
-        writeln!(self.buffer, "Range: {:?}", range.range_kind);
-
-        // Write tokens/lines it spans
-        let first_token_pos = tokens[range.start as usize].pos;
-
-        let last_token_pos = if (range.end as usize) < tokens.len() {
-            tokens[range.end as usize].pos
-        } else {
-            tokens.last().unwrap().pos
-        };
-        let first_source_col = source.get_column(first_token_pos);
-        let last_source_col = source.get_column(last_token_pos);
-
-        self.write_indent(indent);
-        writeln!(
-            self.buffer, "Source: token {} to {}, file {}:{}:{} to {}:{}",
-            range.start, range.end, source.filename,
-            first_token_pos.line, first_source_col,
-            last_token_pos.line, last_source_col
-        );
-
-        let next_sibling_index = range.next_sibling_idx;
-        if range.num_child_ranges == 0 {
-            // No child ranges, so dump the tokens here
-            debug_assert!(range.first_child_idx < 0);
-            self.write_token_array(source, tokens, range, indent);
-        } else {
-            // Child ranges
-            debug_assert!(range.first_child_idx >= 0);
+    fn write_marker_array(&mut self, markers: &[TokenMarker], indent: u32) -> Result<(), FmtError> {
         self.write_indent(indent);
-            writeln!(self.buffer, "Children: [");
+        writeln!(self.buffer, "Markers: [")?;
 
-            let mut range_index = range.first_child_idx;
-            while range_index >= 0 {
-                range_index = self.write_token_range(source, tokens, ranges, range_index, indent + 1);
+        let marker_indent = indent + 1;
+        for marker in markers {
+            self.write_indent(marker_indent);
+            writeln!(self.buffer, "{:?}", marker)?;
         }
 
         self.write_indent(indent);
-            writeln!(self.buffer, "]");
-        }
+        writeln!(self.buffer, "]")?;
 
-        // Wrote everything, return the next sibling token range
-        return next_sibling_index;
+        return Ok(());
     }
 
-    fn write_token_array(&mut self, source: &InputSource, tokens: &[Token], range: &TokenRange, indent: u32) {
+    fn write_token_array(&mut self, source: &InputSource, tokens: &[Token], indent: u32) -> Result<(), FmtError> {
         self.write_indent(indent);
-        writeln!(self.buffer, "Tokens: [");
+        writeln!(self.buffer, "Tokens: [")?;
 
+        let num_tokens = tokens.len();
         let token_indent = indent + 1;
-        for token_index in range.start as usize..range.end as usize {
+        for token_index in 0..num_tokens {
             // Skip uninteresting tokens
             let token = &tokens[token_index];
             if token.kind == TokenKind::SpanEnd {
@@ -113,19 +69,21 @@ impl TokenWriter {
             }
 
             self.write_indent(token_indent);
-            write!(self.buffer, "{:?} (index {})", token.kind, token_index);
+            write!(self.buffer, "{:?} (index {})", token.kind, token_index)?;
            if token.kind.has_span_end() {
                 let token_start = token.pos;
                 let token_end = tokens[token_index + 1].pos;
                 let section = source.section_at_span(InputSpan::from_positions(token_start, token_end));
-                writeln!(self.buffer, " text: {}", String::from_utf8_lossy(section));
+                writeln!(self.buffer, " text: {}", String::from_utf8_lossy(section))?;
             } else {
                 self.buffer.push('\n');
             }
         }
 
         self.write_indent(indent);
-        writeln!(self.buffer, "]");
+        writeln!(self.buffer, "]")?;
+
+        return Ok(());
     }
 
     fn write_dashed_indent(&mut self, indent: u32) {
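The reworked writer emits two flat sections per module, `Markers: [...]` followed by `Tokens: [...]`, instead of the old recursive `Range:`/`Children:` tree, and it now propagates `std::fmt::Error` via `?` where the old code discarded the `write!` results. A usage sketch (`TokenWriter::new` and `write` appear in this changeset; the module list and output path are placeholders):

    // Dump every parsed module to a file; per module the output is roughly:
    //   Markers: [ one Debug-formatted TokenMarker per line ]
    //   Tokens:  [ TokenKind (index n), plus source text for spanned tokens ]
    let mut writer = TokenWriter::new();
    let mut file = std::fs::File::create(std::path::Path::new("tokens.txt")).unwrap();
    writer.write(&mut file, &modules);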