Changeset - 2693293dc537
mh <contact@maxhenger.nl> - 2022-03-29 16:43:55
Remove code that produced token ranges
4 files changed with 27 insertions and 252 deletions:
src/protocol/parser/mod.rs
 
@@ -15,379 +15,379 @@ pub(crate) mod pass_stack_size;
 

	
 
use tokens::*;
 
use crate::collections::*;
 
use visitor::Visitor;
 
use pass_tokenizer::PassTokenizer;
 
use pass_symbols::PassSymbols;
 
use pass_imports::PassImport;
 
use pass_definitions::PassDefinitions;
 
use pass_validation_linking::PassValidationLinking;
 
use pass_typing::{PassTyping, ResolveQueue};
 
use pass_rewriting::PassRewriting;
 
use pass_stack_size::PassStackSize;
 
use symbol_table::*;
 
use type_table::*;
 

	
 
use crate::protocol::ast::*;
 
use crate::protocol::input_source::*;
 

	
 
use crate::protocol::ast_writer::ASTWriter;
 
use crate::protocol::parser::type_table::PolymorphicVariable;
 
use crate::protocol::token_writer::TokenWriter;
 

	
 
const REOWOLF_PATH_ENV: &'static str = "REOWOLF_ROOT"; // first: look up the Reowolf path in this environment variable

const REOWOLF_PATH_DIR: &'static str = "std"; // then: try this folder in the current working directory
 

	
 
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 
pub enum ModuleCompilationPhase {
 
    Tokenized,              // source is tokenized
 
    SymbolsScanned,         // all definitions are linked to their type class
 
    ImportsResolved,        // all imports are added to the symbol table
 
    DefinitionsParsed,      // produced the AST for the entire module
 
    TypesAddedToTable,      // added all definitions to the type table
 
    ValidatedAndLinked,     // AST is traversed and has linked the required AST nodes
 
    Typed,                  // Type inference and checking has been performed
 
    Rewritten,              // Special AST nodes are rewritten into regular AST nodes
 
    // When we continue with the compiler:
 
    // StackSize
 
}
 

	
 
pub struct Module {
 
    pub source: InputSource,
 
    pub tokens: TokenBuffer,
 
    pub is_compiler_file: bool, // TODO: @Hack
 
    pub root_id: RootId,
 
    pub name: Option<(PragmaId, StringRef<'static>)>,
 
    pub version: Option<(PragmaId, i64)>,
 
    pub phase: ModuleCompilationPhase,
 
}
 

	
 
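/// Type IDs of the builtin types as registered in the type table. Every ID
/// starts out invalid (see `TargetArch::new`) and is assigned its real value
/// when `Parser::new` inserts the builtin types.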
pub struct TargetArch {
 
    pub void_type_id: TypeId,
 
    pub message_type_id: TypeId,
 
    pub bool_type_id: TypeId,
 
    pub uint8_type_id: TypeId,
 
    pub uint16_type_id: TypeId,
 
    pub uint32_type_id: TypeId,
 
    pub uint64_type_id: TypeId,
 
    pub sint8_type_id: TypeId,
 
    pub sint16_type_id: TypeId,
 
    pub sint32_type_id: TypeId,
 
    pub sint64_type_id: TypeId,
 
    pub char_type_id: TypeId,
 
    pub string_type_id: TypeId,
 
    pub array_type_id: TypeId,
 
    pub slice_type_id: TypeId,
 
    pub input_type_id: TypeId,
 
    pub output_type_id: TypeId,
 
    pub pointer_type_id: TypeId,
 
}
 

	
 
impl TargetArch {
 
    fn new() -> Self {
 
        return Self{
 
            void_type_id: TypeId::new_invalid(),
 
            bool_type_id: TypeId::new_invalid(),
 
            message_type_id: TypeId::new_invalid(),
 
            uint8_type_id: TypeId::new_invalid(),
 
            uint16_type_id: TypeId::new_invalid(),
 
            uint32_type_id: TypeId::new_invalid(),
 
            uint64_type_id: TypeId::new_invalid(),
 
            sint8_type_id: TypeId::new_invalid(),
 
            sint16_type_id: TypeId::new_invalid(),
 
            sint32_type_id: TypeId::new_invalid(),
 
            sint64_type_id: TypeId::new_invalid(),
 
            char_type_id: TypeId::new_invalid(),
 
            string_type_id: TypeId::new_invalid(),
 
            array_type_id: TypeId::new_invalid(),
 
            slice_type_id: TypeId::new_invalid(),
 
            input_type_id: TypeId::new_invalid(),
 
            output_type_id: TypeId::new_invalid(),
 
            pointer_type_id: TypeId::new_invalid(),
 
        }
 
    }
 
}
 

	
 
pub struct PassCtx<'a> {
 
    heap: &'a mut Heap,
 
    symbols: &'a mut SymbolTable,
 
    pool: &'a mut StringPool,
 
    arch: &'a TargetArch,
 
}
 

	
 
pub struct Parser {
 
    // Storage of all information created/gathered during compilation.
 
    pub(crate) heap: Heap,
 
    pub(crate) string_pool: StringPool, // Do not deallocate, holds all strings
 
    pub(crate) modules: Vec<Module>,
 
    pub(crate) symbol_table: SymbolTable,
 
    pub(crate) type_table: TypeTable,
 
    pub(crate) global_module_index: usize, // contains globals, implicitly imported everywhere
 
    // Compiler passes, used as little state machines that keep their memory
 
    // around.
 
    pass_tokenizer: PassTokenizer,
 
    pass_symbols: PassSymbols,
 
    pass_import: PassImport,
 
    pass_definitions: PassDefinitions,
 
    pass_validation: PassValidationLinking,
 
    pass_typing: PassTyping,
 
    pass_rewriting: PassRewriting,
 
    pass_stack_size: PassStackSize,
 
    // Compiler options
 
    pub write_tokens_to: Option<String>,
 
    pub write_ast_to: Option<String>,
 
    pub(crate) arch: TargetArch,
 
}
 

	
 
impl Parser {
 
    pub fn new() -> Result<Self, String> {
 
        let mut parser = Parser{
 
            heap: Heap::new(),
 
            string_pool: StringPool::new(),
 
            modules: Vec::new(),
 
            symbol_table: SymbolTable::new(),
 
            type_table: TypeTable::new(),
 
            global_module_index: 0,
 
            pass_tokenizer: PassTokenizer::new(),
 
            pass_symbols: PassSymbols::new(),
 
            pass_import: PassImport::new(),
 
            pass_definitions: PassDefinitions::new(),
 
            pass_validation: PassValidationLinking::new(),
 
            pass_typing: PassTyping::new(),
 
            pass_rewriting: PassRewriting::new(),
 
            pass_stack_size: PassStackSize::new(),
 
            write_tokens_to: None,
 
            write_ast_to: None,
 
            arch: TargetArch::new(),
 
        };
 

	
 
        parser.symbol_table.insert_scope(None, SymbolScope::Global);
 

	
 
        // Insert builtin types
 
        // TODO: At some point use correct values for size/alignment
 
        parser.arch.void_type_id    = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Void], false, 0, 1);
 
        parser.arch.message_type_id = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Message], false, 24, 8);
 
        parser.arch.bool_type_id    = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Bool], false, 1, 1);
 
        parser.arch.uint8_type_id   = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::UInt8], false, 1, 1);
 
        parser.arch.uint16_type_id  = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::UInt16], false, 2, 2);
 
        parser.arch.uint32_type_id  = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::UInt32], false, 4, 4);
 
        parser.arch.uint64_type_id  = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::UInt64], false, 8, 8);
 
        parser.arch.sint8_type_id   = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::SInt8], false, 1, 1);
 
        parser.arch.sint16_type_id  = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::SInt16], false, 2, 2);
 
        parser.arch.sint32_type_id  = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::SInt32], false, 4, 4);
 
        parser.arch.sint64_type_id  = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::SInt64], false, 8, 8);
 
        parser.arch.char_type_id    = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Character], false, 4, 4);
 
        parser.arch.string_type_id  = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::String], false, 24, 8);
 
        parser.arch.array_type_id   = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Array, ConcreteTypePart::Void], true, 24, 8);
 
        parser.arch.slice_type_id   = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Slice, ConcreteTypePart::Void], true, 16, 4);
 
        parser.arch.input_type_id   = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Input, ConcreteTypePart::Void], true, 8, 8);
 
        parser.arch.output_type_id  = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Output, ConcreteTypePart::Void], true, 8, 8);
 
        parser.arch.pointer_type_id = insert_builtin_type(&mut parser.type_table, vec![ConcreteTypePart::Pointer, ConcreteTypePart::Void], true, 8, 8);
 

	
 
        // Parse standard library
 
        parser.feed_standard_library()?;
 

	
 
        return Ok(parser)
 
    }
 

	
 
    /// Feeds a new InputSource to the parser, which will tokenize it and store
 
    /// it internally for later parsing (when all modules are present). Returns
 
    /// the index of the new module.
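    ///
    /// A usage sketch (illustrative; the module source here is hypothetical):
    ///
    /// ```ignore
    /// let mut parser = Parser::new().expect("standard library found");
    /// let source = InputSource::new("main.pdl".to_string(), b"primitive main() {}".to_vec());
    /// let module_index = parser.feed(source).expect("tokenization succeeds");
    /// parser.parse().expect("compilation succeeds");
    /// ```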
 
    pub fn feed(&mut self, source: InputSource) -> Result<usize, ParseError> {
 
        return self.feed_internal(source, false);
 
    }
 

	
 
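    /// Runs the remaining compilation passes over all modules fed so far, in
    /// dependency order: symbol scanning, import resolution, definition
    /// parsing, base type construction, validation/linking, type inference
    /// (driven by the `ResolveQueue`), rewriting, and stack-size computation.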
    pub fn parse(&mut self) -> Result<(), ParseError> {
 
        let mut pass_ctx = PassCtx{
 
            heap: &mut self.heap,
 
            symbols: &mut self.symbol_table,
 
            pool: &mut self.string_pool,
 
            arch: &self.arch,
 
        };
 

	
 
 

	
 
        // Advance all modules to the phase where all symbols are scanned
 
        for module_idx in 0..self.modules.len() {
 
            self.pass_symbols.parse(&mut self.modules, module_idx, &mut pass_ctx)?;
 
        }
 

	
 
        // With all symbols scanned, perform further compilation until we can
 
        // add all base types to the type table.
 
        for module_idx in 0..self.modules.len() {
 
            self.pass_import.parse(&mut self.modules, module_idx, &mut pass_ctx)?;
 
            self.pass_definitions.parse(&mut self.modules, module_idx, &mut pass_ctx)?;
 
        }
 

	
 
        if let Some(filename) = &self.write_tokens_to {
 
            let mut writer = TokenWriter::new();
 
            let mut file = std::fs::File::create(std::path::Path::new(filename)).unwrap();
 
            writer.write(&mut file, &self.modules);
 
        }
 

	
 
        // Add every known type to the type table
 
        self.type_table.build_base_types(&mut self.modules, &mut pass_ctx)?;
 

	
 
        // Continue compilation with the remaining phases now that the types
 
        // are all in the type table
 
        for module_idx in 0..self.modules.len() {
 
            let mut ctx = visitor::Ctx{
 
                heap: &mut self.heap,
 
                modules: &mut self.modules,
 
                module_idx,
 
                symbols: &mut self.symbol_table,
 
                types: &mut self.type_table,
 
                arch: &self.arch,
 
            };
 
            self.pass_validation.visit_module(&mut ctx)?;
 
        }
 

	
 
        // Perform typechecking on all modules
 
        let mut queue = ResolveQueue::new();
 
        for module_idx in 0..self.modules.len() {
 
            let mut ctx = visitor::Ctx{
 
                heap: &mut self.heap,
 
                modules: &mut self.modules,
 
                module_idx,
 
                symbols: &mut self.symbol_table,
 
                types: &mut self.type_table,
 
                arch: &self.arch,
 
            };
 
            self.pass_typing.queue_module_definitions(&mut ctx, &mut queue);
 
        }
 
        while !queue.is_empty() {
 
            let top = queue.pop_front().unwrap();
 
            let mut ctx = visitor::Ctx{
 
                heap: &mut self.heap,
 
                modules: &mut self.modules,
 
                module_idx: top.root_id.index as usize,
 
                symbols: &mut self.symbol_table,
 
                types: &mut self.type_table,
 
                arch: &self.arch,
 
            };
 
            self.pass_typing.handle_module_definition(&mut ctx, &mut queue, top)?;
 
        }
 

	
 
        // Rewrite nodes in tree, then prepare for execution of code
 
        for module_idx in 0..self.modules.len() {
 
            self.modules[module_idx].phase = ModuleCompilationPhase::Typed;
 
            let mut ctx = visitor::Ctx{
 
                heap: &mut self.heap,
 
                modules: &mut self.modules,
 
                module_idx,
 
                symbols: &mut self.symbol_table,
 
                types: &mut self.type_table,
 
                arch: &self.arch,
 
            };
 
            self.pass_rewriting.visit_module(&mut ctx)?;
 
            self.pass_stack_size.visit_module(&mut ctx)?;
 
        }
 

	
 
        // Write out desired information
 
        if let Some(filename) = &self.write_ast_to {
 
            let mut writer = ASTWriter::new();
 
            let mut file = std::fs::File::create(std::path::Path::new(filename)).unwrap();
 
            writer.write_ast(&mut file, &self.heap);
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    /// Tries to find the standard library and add the files for parsing.
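    ///
    /// The lookup order is: the directory named by the `REOWOLF_ROOT`
    /// environment variable first, then the `./std` directory relative to
    /// the current working directory (see `REOWOLF_PATH_ENV` and
    /// `REOWOLF_PATH_DIR` above).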
 
    fn feed_standard_library(&mut self) -> Result<(), String> {
 
        use std::env;
 
        use std::path::{Path, PathBuf};
 
        use std::fs;
 

	
 
        const FILES: [&'static str; 1] = [
 
            "std.global.pdl",
 
        ];
 

	
 
        // Determine base directory
 
        let (base_path, from_env) = if let Ok(path) = env::var(REOWOLF_PATH_ENV) {
 
            // Path variable is set
 
            (path, true)
 
        } else {
 
            let mut path = String::with_capacity(REOWOLF_PATH_DIR.len() + 2);
 
            path.push_str("./");
 
            path.push_str(REOWOLF_PATH_DIR);
 
            (path, false)
 
        };
 

	
 
        // Make sure directory exists
 
        let path = Path::new(&base_path);
 
        if !path.exists() {
 
            return Err(format!("std lib root directory '{}' does not exist", base_path));
 
        }
 

	
 
        // Try to load all standard library files. We might need a more unified
 
        // way to do this in the future (e.g. a "std" package containing all
 
        // of the modules)
 
        let mut file_path = PathBuf::new();
 
        let mut first_file = true;
 

	
 
        for file in FILES {
 
            file_path.clear(); // otherwise `push` appends to the previous iteration's path

            file_path.push(path);
 
            file_path.push(file);
 

	
 
            let source = fs::read(file_path.as_path());
 
            if let Err(err) = source {
 
                return Err(format!(
 
                    "failed to read std lib file '{}' in root directory '{}', because: {}",
 
                    file, base_path, err
 
                ));
 
            }
 

	
 
            let source = source.unwrap();
 
            let input_source = InputSource::new(file.to_string(), source);
 

	
 
            let module_index = self.feed_internal(input_source, true);
 
            if let Err(err) = module_index {
 
                // A bit of a hack, but shouldn't really happen anyway: the
 
                // compiler should ship with a decent standard library (at some
 
                // point)
 
                return Err(format!("{}", err));
 
            }
 
            let module_index = module_index.unwrap();
 

	
 
            if first_file {
 
                self.global_module_index = module_index;
 
                first_file = false;
 
            }
 
        }
 

	
 
        return Ok(())
 
    }
 

	
 
    fn feed_internal(&mut self, mut source: InputSource, is_compiler_file: bool) -> Result<usize, ParseError> {
 
        let mut token_buffer = TokenBuffer::new();
 
        self.pass_tokenizer.tokenize(&mut source, &mut token_buffer)?;
 

	
 
        let module = Module{
 
            source,
 
            tokens: token_buffer,
 
            is_compiler_file,
 
            root_id: RootId::new_invalid(),
 
            name: None,
 
            version: None,
 
            phase: ModuleCompilationPhase::Tokenized,
 
        };
 
        let module_index = self.modules.len();
 
        self.modules.push(module);
 

	
 
        return Ok(module_index);
 
    }
 
}
 

	
 
fn insert_builtin_type(type_table: &mut TypeTable, parts: Vec<ConcreteTypePart>, has_poly_var: bool, size: usize, alignment: usize) -> TypeId {
 
    const POLY_VARS: [PolymorphicVariable; 1] = [PolymorphicVariable{
 
        identifier: Identifier::new_empty(InputSpan::new()),
 
        is_in_use: false,
 
    }];
 

	
 
    let concrete_type = ConcreteType{ parts };
 
    let poly_var = if has_poly_var {
 
        POLY_VARS.as_slice()
 
    } else {
 
        &[]
 
    };
 

	
 
    return type_table.add_builtin_data_type(concrete_type, poly_var, size, alignment);
 
}
 
\ No newline at end of file
src/protocol/parser/pass_tokenizer.rs
 
use crate::protocol::input_source::{
 
    InputSource as InputSource,
 
    ParseError,
 
    InputPosition as InputPosition,
 
};
 

	
 
use super::tokens::*;
 
use super::token_parsing::*;
 

	
 
/// Tokenizer is a reusable parser to tokenize multiple source files using the
 
/// same allocated buffers. In a well-formed program, we produce a consistent
 
/// tree of token ranges such that we may identify tokens that represent a
 
/// definition or an import before producing the entire AST.
 
///
 
/// If the program is not well-formed then the tree may be inconsistent, but we
 
/// will detect this once we transform the tokens into the AST. To ensure a
 
/// consistent AST-producing phase we will require the input to have balanced

/// curly braces.
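///
/// As an illustration, tokenizing
///
/// ```text
/// import std.global;
/// primitive main() { /* ... */ }
/// ```
///
/// yields a root Module range with an Import child (closed at the `;`) and a
/// Definition child (closed at the matching `}`); any tokens in between end
/// up in Code ranges.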
 
pub(crate) struct PassTokenizer {
 
    // Stack of input positions of opening curly braces, used to detect
 
    // unmatched opening braces, unmatched closing braces are detected
 
    // immediately.
 
    curly_stack: Vec<InputPosition>,
 
    // Points to an element in the `TokenBuffer.ranges` variable.
 
    stack_idx: usize,
 
}
 

	
 
impl PassTokenizer {
 
    pub(crate) fn new() -> Self {
 
        Self{
 
            curly_stack: Vec::with_capacity(32),
 
            stack_idx: 0
 
        }
 
    }
 

	
 
    pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        // Assert source and buffer are at start
 
        debug_assert_eq!(source.pos().offset, 0);
 
        debug_assert!(target.tokens.is_empty());
 
        debug_assert!(target.ranges.is_empty());
 

	
 
        // Set up for tokenization by pushing the first range onto the stack.
 
        // This range may get transformed into the appropriate range kind later,
 
        // see `push_range` and `pop_range`.
 
        self.stack_idx = 0;
 
        target.ranges.push(TokenRange{
 
            parent_idx: NO_RELATION,
 
            range_kind: TokenRangeKind::Module,
 
            curly_depth: 0,
 
            start: 0,
 
            end: 0,
 
            num_child_ranges: 0,
 
            first_child_idx: NO_RELATION,
 
            last_child_idx: NO_RELATION,
 
            next_sibling_idx: NO_RELATION,
 
        });
 

	
 
        // Main tokenization loop
 
        while let Some(c) = source.next() {
 
            let token_index = target.tokens.len() as u32;
 

	
 
            if is_char_literal_start(c) {
 
                self.consume_char_literal(source, target)?;
 
            } else if is_string_literal_start(c) {
 
                self.consume_string_literal(source, target)?;
 
            } else if is_identifier_start(c) {
 
                let ident = self.consume_identifier(source, target)?;
 

	
 
                if demarks_symbol(ident) {
 
                    self.emit_marker(target, TokenMarkerKind::Definition, token_index);
 
                    self.push_range(target, TokenRangeKind::Definition, token_index);
 
                } else if demarks_import(ident) {
 
                    self.emit_marker(target, TokenMarkerKind::Import, token_index);
 
                    self.push_range(target, TokenRangeKind::Import, token_index);
 
                }
 
            } else if is_integer_literal_start(c) {
 
                self.consume_number(source, target)?;
 
            } else if is_pragma_start_or_pound(c) {
 
                let was_pragma = self.consume_pragma_or_pound(c, source, target)?;
 
                if was_pragma {
 
                    self.emit_marker(target, TokenMarkerKind::Pragma, token_index);
 
                    self.push_range(target, TokenRangeKind::Pragma, token_index);
 
                }
 
            } else if self.is_line_comment_start(c, source) {
 
                self.consume_line_comment(source, target)?;
 
            } else if self.is_block_comment_start(c, source) {
 
                self.consume_block_comment(source, target)?;
 
            } else if is_whitespace(c) {
 
                self.consume_whitespace(source);
 
                let range = &target.ranges[self.stack_idx];
 
                if range.range_kind == TokenRangeKind::Pragma {
 
                    self.pop_range(target, target.tokens.len() as u32);
 
                }
 
            } else {
 
                let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
 
                if let Some((token, token_pos)) = was_punctuation {
 
                    if token == TokenKind::OpenCurly {
 
                        self.curly_stack.push(token_pos);
 
                    } else if token == TokenKind::CloseCurly {
 
                        // Check if this marks the end of a range we're
 
                        // currently processing
 
                        if self.curly_stack.is_empty() {
 
                            return Err(ParseError::new_error_str_at_pos(
 
                                source, token_pos, "unmatched closing curly brace '}'"
 
                            ));
 
                        }
 

	
 
                        self.curly_stack.pop();
 

	
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_stack.len() as u32 {
 
                            self.pop_range(target, target.tokens.len() as u32);
 
                        }
 

	
 
 
                    } else if token == TokenKind::SemiColon {
 
                        // Check if this marks the end of an import
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Import {
 
                            self.pop_range(target, target.tokens.len() as u32);
 
                        }
 
                    }
 
                } else {
 
                    return Err(ParseError::new_error_str_at_pos(
 
                        source, source.pos(), "unexpected character"
 
                    ));
 
                }
 
            }
 
        }
 

	
 
        // End of file, check if our state is correct
 
        if let Some(error) = source.had_error.take() {
 
            return Err(error);
 
        }
 

	
 
        if !self.curly_stack.is_empty() {
 
            // Let's not add a lot of heuristics and just tell the programmer
 
            // that something is wrong
 
            let last_unmatched_open = self.curly_stack.pop().unwrap();
 
            return Err(ParseError::new_error_str_at_pos(
 
                source, last_unmatched_open, "unmatched opening curly brace '{'"
 
            ));
 
        }
 

	
 
        // Ranges that did not depend on curly braces may have missing tokens.
 
        // So close all of the active ranges.
 
        while self.stack_idx != 0 {
 
            self.pop_range(target, target.tokens.len() as u32);
 
        }
 

	
 
        // And finally, we may have tokens at the end that do not yet belong

        // to a range, so insert a "code" range if this is the case.
 
        debug_assert_eq!(self.stack_idx, 0);
 
        let last_registered_idx = target.ranges[0].end;
 
        let last_token_idx = target.tokens.len() as u32;
 
        if last_registered_idx != last_token_idx {
 
            self.add_code_range(target, 0, last_registered_idx, last_token_idx, NO_RELATION);
 
        }
 

	
 
        Ok(())
 
    }
 

	
 
    fn is_line_comment_start(&self, first_char: u8, source: &InputSource) -> bool {
 
        return first_char == b'/' && Some(b'/') == source.lookahead(1);
 
    }
 

	
 
    fn is_block_comment_start(&self, first_char: u8, source: &InputSource) -> bool {
 
        return first_char == b'/' && Some(b'*') == source.lookahead(1);
 
    }
 

	
 
    fn maybe_parse_punctuation(
 
        &mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer
 
    ) -> Result<Option<(TokenKind, InputPosition)>, ParseError> {
 
        debug_assert!(first_char != b'#', "'#' needs special handling");
 
        debug_assert!(first_char != b'\'', "'\'' needs special handling");
 
        debug_assert!(first_char != b'"', "'\"' needs special handling");
 

	
 
        let pos = source.pos();
 
        let token_kind;
 
        if first_char == b'!' {
 
            source.consume();
 
            if Some(b'=') == source.next() {
 
                source.consume();
 
                token_kind = TokenKind::NotEqual;
 
            } else {
 
                token_kind = TokenKind::Exclamation;
 
            }
 
        } else if first_char == b'%' {
 
            source.consume();
 
            if Some(b'=') == source.next() {
 
                source.consume();
 
                token_kind = TokenKind::PercentEquals;
 
            } else {
 
                token_kind = TokenKind::Percent;
 
            }
 
        } else if first_char == b'&' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'&') == next {
 
                source.consume();
 
                token_kind = TokenKind::AndAnd;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::AndEquals;
 
            } else {
 
                token_kind = TokenKind::And;
 
            }
 
        } else if first_char == b'(' {
 
            source.consume();
 
            token_kind = TokenKind::OpenParen;
 
        } else if first_char == b')' {
 
            source.consume();
 
            token_kind = TokenKind::CloseParen;
 
        } else if first_char == b'*' {
 
            source.consume();
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::StarEquals;
 
            } else {
 
                token_kind = TokenKind::Star;
 
            }
 
        } else if first_char == b'+' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'+') == next {
 
                source.consume();
 
                token_kind = TokenKind::PlusPlus;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::PlusEquals;
 
            } else {
 
                token_kind = TokenKind::Plus;
 
            }
 
        } else if first_char == b',' {
 
            source.consume();
 
            token_kind = TokenKind::Comma;
 
        } else if first_char == b'-' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'-') == next {
 
                source.consume();
 
                token_kind = TokenKind::MinusMinus;
 
            } else if Some(b'>') == next {
 
                source.consume();
 
                token_kind = TokenKind::ArrowRight;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::MinusEquals;
 
            } else {
 
                token_kind = TokenKind::Minus;
 
            }
 
        } else if first_char == b'.' {
 
            source.consume();
 
            if let Some(b'.') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::DotDot;
 
            } else {
 
                token_kind = TokenKind::Dot
 
            }
 
        } else if first_char == b'/' {
 
            source.consume();
 
            debug_assert_ne!(Some(b'/'), source.next());
 
            debug_assert_ne!(Some(b'*'), source.next());
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::SlashEquals;
 
            } else {
 
                token_kind = TokenKind::Slash;
 
            }
 
        } else if first_char == b':' {
 
            source.consume();
 
            if let Some(b':') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::ColonColon;
 
            } else {
 
                token_kind = TokenKind::Colon;
 
            }
 
        } else if first_char == b';' {
 
            source.consume();
 
            token_kind = TokenKind::SemiColon;
 
        } else if first_char == b'<' {
 
            source.consume();
 
            let next = source.next();
 
            if let Some(b'<') = next {
 
                source.consume();
 
                if let Some(b'=') = source.next() {
 
                    source.consume();
 
                    token_kind = TokenKind::ShiftLeftEquals;
 
                } else {
 
                    token_kind = TokenKind::ShiftLeft;
 
                }
 
            } else if let Some(b'=') = next {
 
                source.consume();
 
                token_kind = TokenKind::LessEquals;
 
            } else {
 
                token_kind = TokenKind::OpenAngle;
 
            }
 
        } else if first_char == b'=' {
 
            source.consume();
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::EqualEqual;
 
            } else {
 
                token_kind = TokenKind::Equal;
 
            }
 
        } else if first_char == b'>' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'>') == next {
 
                source.consume();
 
                if Some(b'=') == source.next() {
 
                    source.consume();
 
                    token_kind = TokenKind::ShiftRightEquals;
 
                } else {
 
                    token_kind = TokenKind::ShiftRight;
 
                }
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::GreaterEquals;
 
            } else {
 
                token_kind = TokenKind::CloseAngle;
 
            }
 
        } else if first_char == b'?' {
 
            source.consume();
 
            token_kind = TokenKind::Question;
 
        } else if first_char == b'@' {
 
            source.consume();
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::AtEquals;
 
            } else {
 
                token_kind = TokenKind::At;
 
            }
 
        } else if first_char == b'[' {
 
            source.consume();
 
            token_kind = TokenKind::OpenSquare;
 
        } else if first_char == b']' {
 
            source.consume();
 
            token_kind = TokenKind::CloseSquare;
 
        } else if first_char == b'^' {
 
            source.consume();
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::CaretEquals;
 
            } else {
 
                token_kind = TokenKind::Caret;
 
            }
 
        } else if first_char == b'{' {
 
            source.consume();
 
            token_kind = TokenKind::OpenCurly;
 
        } else if first_char == b'|' {
 
@@ -435,370 +379,277 @@ impl PassTokenizer {
 
            if prev_char == b'\\' && c == b'\\' {
 
                // Escaped backslash, set prev_char to bogus to not conflict
 
                // with escaped-" and unterminated string literal detection.
 
                prev_char = b'\0';
 
            } else {
 
                prev_char = c;
 
            }
 
        }
 

	
 
        if prev_char != b'"' {
 
            // Unterminated string literal
 
            return Err(ParseError::new_error_str_at_pos(source, begin_pos, "encountered unterminated string literal"));
 
        }
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::String, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    fn consume_pragma_or_pound(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<bool, ParseError> {
 
        let start_pos = source.pos();
 
        debug_assert_eq!(first_char, b'#');
 
        source.consume();
 

	
 
        let next = source.next();
 
        if next.is_none() || !is_identifier_start(next.unwrap()) {
 
            // Just a pound sign
 
            target.tokens.push(Token::new(TokenKind::Pound, start_pos));
 
            Ok(false)
 
        } else {
 
            // Pound sign followed by identifier
 
            source.consume();
 
            while let Some(c) = source.next() {
 
                if !is_identifier_remaining(c) {
 
                    break;
 
                }
 
                source.consume();
 
            }
 

	
 
            self.check_ascii(source)?;
 

	
 
            let end_pos = source.pos();
 
            target.tokens.push(Token::new(TokenKind::Pragma, start_pos));
 
            target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 
            Ok(true)
 
        }
 
    }
 

	
 
    fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 

	
 
        // Consume the leading "//"
 
        debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'/');
 
        source.consume();
 
        source.consume();
 

	
 
        let mut prev_char = b'/';
 
        let mut cur_char = b'/';
 
        while let Some(c) = source.next() {
 
            prev_char = cur_char;
 
            cur_char = c;
 

	
 
            if c == b'\n' {
 
                // End of line, note that the newline is not consumed
 
                break;
 
            }
 

	
 
            source.consume();
 
        }
 

	
 
        let mut end_pos = source.pos();
 
        debug_assert_eq!(begin_pos.line, end_pos.line);
 

	
 
        // Modify offset to not include the newline characters
 
        if cur_char == b'\n' {
 
            if prev_char == b'\r' {
 
                end_pos.offset -= 1;
 
            }
 

	
 
            // Consume final newline
 
            source.consume();
 
        } else {
 
            // End of comment was due to EOF
 
            debug_assert!(source.next().is_none())
 
        }
 

	
 
        target.tokens.push(Token::new(TokenKind::LineComment, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 

	
 
        // Consume the leading "/*"
 
        debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'*');
 
        source.consume();
 
        source.consume();
 

	
 
        // Explicitly do not initialize prev_char to '*', because then "/*/"

        // would be parsed as a valid, closed block comment.
 
        let mut prev_char = b' ';
 
        let mut is_closed = false;
 
        while let Some(c) = source.next() {
 
            source.consume();
 
            if prev_char == b'*' && c == b'/' {
 
                // End of block comment
 
                is_closed = true;
 
                break;
 
            }
 
            prev_char = c;
 
        }
 

	
 
        if !is_closed {
 
            return Err(ParseError::new_error_str_at_pos(
 
                source, source.pos(), "encountered unterminated block comment")
 
            );
 
        }
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::BlockComment, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
 
        let begin_pos = source.pos();
 
        debug_assert!(is_identifier_start(source.next().unwrap()));
 
        source.consume();
 

	
 
        // Keep reading until no more identifier
 
        while let Some(c) = source.next() {
 
            if !is_identifier_remaining(c) {
 
                break;
 
            }
 

	
 
            source.consume();
 
        }
 
        self.check_ascii(source)?;
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::Ident, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 
        Ok(source.section_at_pos(begin_pos, end_pos))
 
    }
 

	
 
    fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 
        debug_assert!(is_integer_literal_start(source.next().unwrap()));
 
        source.consume();
 

	
 
        // Keep reading until it doesn't look like a number anymore
 
        while let Some(c) = source.next() {
 
            if !maybe_number_remaining(c) {
 
                break;
 
            }
 

	
 
            source.consume();
 
        }
 
        self.check_ascii(source)?;
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::Integer, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    // Consumes whitespace and returns whether or not the whitespace contained
 
    // a newline.
 
    fn consume_whitespace(&self, source: &mut InputSource) -> bool {
 
        debug_assert!(is_whitespace(source.next().unwrap()));
 

	
 
        let mut has_newline = false;
 
        while let Some(c) = source.next() {
 
            if !is_whitespace(c) {
 
                break;
 
            }
 

	
 
            if c == b'\n' {
 
                has_newline = true;
 
            }
 
            source.consume();
 
        }
 

	
 
        has_newline
 
    }
 

	
 
    fn add_code_range(
 
        &mut self, target: &mut TokenBuffer, parent_idx: i32,
 
        code_start_idx: u32, code_end_idx: u32, next_sibling_idx: i32
 
    ) {
 
        let new_range_idx = target.ranges.len() as i32;
 
        let parent_range = &mut target.ranges[parent_idx as usize];
 
        debug_assert_ne!(parent_range.end, code_end_idx, "called add_code_range without a need to do so");
 

	
 
        let sibling_idx = parent_range.last_child_idx;
 

	
 
        parent_range.last_child_idx = new_range_idx;
 
        parent_range.end = code_end_idx;
 
        parent_range.num_child_ranges += 1;
 

	
 
        let curly_depth = self.curly_stack.len() as u32;
 
        target.ranges.push(TokenRange{
 
            parent_idx,
 
            range_kind: TokenRangeKind::Code,
 
            curly_depth,
 
            start: code_start_idx,
 
            end: code_end_idx,
 
            num_child_ranges: 0,
 
            first_child_idx: NO_RELATION,
 
            last_child_idx: NO_RELATION,
 
            next_sibling_idx,
 
        });
 

	
 
        // Fix up the sibling indices
 
        if sibling_idx != NO_RELATION {
 
            let sibling_range = &mut target.ranges[sibling_idx as usize];
 
            sibling_range.next_sibling_idx = new_range_idx;
 
        }
 
    }
 

	
 
    fn emit_marker(&mut self, target: &mut TokenBuffer, kind: TokenMarkerKind, first_token: u32) {
 
        debug_assert!(
 
            target.markers
 
                .last().map(|v| v.first_token < first_token)
 
                .unwrap_or(true)
 
        );
 

	
 
        target.markers.push(TokenMarker{
 
            kind,
 
            curly_depth: self.curly_stack.len() as u32,
 
            first_token,
 
            last_token: u32::MAX,
 
            handled: false,
 
        });
 
    }
 

	
 
    fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token_idx: u32) {
 
        let new_range_idx = target.ranges.len() as i32;
 
        let parent_idx = self.stack_idx as i32;
 
        let parent_range = &mut target.ranges[self.stack_idx];
 

	
 
        if parent_range.first_child_idx == NO_RELATION {
 
            parent_range.first_child_idx = new_range_idx;
 
        }
 

	
 
        let last_registered_idx = parent_range.end;
 
        if last_registered_idx != first_token_idx {
 
            self.add_code_range(target, parent_idx, last_registered_idx, first_token_idx, new_range_idx + 1);
 
        }
 

	
 
        // Push the new range
 
        self.stack_idx = target.ranges.len();
 
        let curly_depth = self.curly_stack.len() as u32;
 
        target.ranges.push(TokenRange{
 
            parent_idx,
 
            range_kind,
 
            curly_depth,
 
            start: first_token_idx,
 
            end: first_token_idx, // modified when popped
 
            num_child_ranges: 0,
 
            first_child_idx: NO_RELATION,
 
            last_child_idx: NO_RELATION,
 
            next_sibling_idx: NO_RELATION
 
        })
 
    }
 

	
 
    fn pop_range(&mut self, target: &mut TokenBuffer, end_token_idx: u32) {
 
        let popped_idx = self.stack_idx as i32;
 
        let popped_range = &mut target.ranges[self.stack_idx];
 
        debug_assert!(self.stack_idx != 0, "attempting to pop top-level range");
 

	
 
        // Fix up the current range before going back to parent
 
        popped_range.end = end_token_idx;
 
        debug_assert_ne!(popped_range.start, end_token_idx);
 

	
 
        // Go back to parent and fix up its child pointers, but remember the
 
        // last child, so we can link it to the newly popped range.
 
        self.stack_idx = popped_range.parent_idx as usize;
 
        let parent = &mut target.ranges[self.stack_idx];
 
        if parent.first_child_idx == NO_RELATION {
 
            parent.first_child_idx = popped_idx;
 
        }
 
        let prev_sibling_idx = parent.last_child_idx;
 
        parent.last_child_idx = popped_idx;
 
        parent.end = end_token_idx;
 
        parent.num_child_ranges += 1;
 

	
 
        // Fix up the sibling (if it exists)
 
        if prev_sibling_idx != NO_RELATION {
 
            let sibling = &mut target.ranges[prev_sibling_idx as usize];
 
            sibling.next_sibling_idx = popped_idx;
 
        }
 
    }
 

	
 

	
 
    fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> {
 
        match source.next() {
 
            Some(c) if !c.is_ascii() => {
 
                Err(ParseError::new_error_str_at_pos(source, source.pos(), "encountered a non-ASCII character"))
 
            },
 
            _else => {
 
                Ok(())
 
            },
 
        }
 
    }
 
}
 

	
 
// Helpers for characters
 
fn demarks_symbol(ident: &[u8]) -> bool {
 
    return
 
        ident == KW_STRUCT ||
 
            ident == KW_ENUM ||
 
            ident == KW_UNION ||
 
            ident == KW_FUNCTION ||
 
            ident == KW_PRIMITIVE ||
 
            ident == KW_COMPOSITE
 
}
 

	
 
fn demarks_import(ident: &[u8]) -> bool {
 
    return ident == KW_IMPORT;
 
}
 

	
 
fn is_whitespace(c: u8) -> bool {
 
    c.is_ascii_whitespace()
 
}
 

	
 
fn is_char_literal_start(c: u8) -> bool {
 
    return c == b'\'';
 
}
 

	
 
fn is_string_literal_start(c: u8) -> bool {
 
    return c == b'"';
 
}
 

	
 
fn is_pragma_start_or_pound(c: u8) -> bool {
 
    return c == b'#';
 
}
 

	
 
fn is_identifier_start(c: u8) -> bool {
 
    return
 
        (c >= b'a' && c <= b'z') ||
 
            (c >= b'A' && c <= b'Z') ||
 
            c == b'_'
 
}
 

	
 
fn is_identifier_remaining(c: u8) -> bool {
 
    return
 
        (c >= b'0' && c <= b'9') ||
 
            (c >= b'a' && c <= b'z') ||
 
            (c >= b'A' && c <= b'Z') ||
 
            c == b'_'
 
}
 

	
 
fn is_integer_literal_start(c: u8) -> bool {
 
    return c >= b'0' && c <= b'9';
 
}
 

	
 
fn maybe_number_remaining(c: u8) -> bool {
 
    // Note: the hex digit ranges below also cover the binary indicators 'b' and 'B'.
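    // Illustrative examples: "0x1F", "0b1010" and "1_000" are each consumed
    // as a single Integer token here; whether the characters actually form a
    // valid literal is presumably checked later, when the token is parsed.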
 
    return
 
        (c == b'o' || c == b'O' || c == b'x' || c == b'X') ||
 
            (c >= b'0' && c <= b'9') || (c >= b'A' && c <= b'F') || (c >= b'a' && c <= b'f') ||
 
            c == b'_';
 
}
src/protocol/parser/tokens.rs
 
@@ -4,369 +4,335 @@ use crate::protocol::input_source::{
 
};
 

	
 
/// Represents a particular kind of token. Some kinds of token have a

/// variable character length; such a token is always followed by a
 
/// `TokenKind::SpanEnd` token.
 
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 
pub enum TokenKind {
 
    // Variable-character tokens, followed by a SpanEnd token
 
    Ident,          // regular identifier
 
    Pragma,         // identifier with prefixed `#`, range includes `#`
 
    Integer,        // integer literal
 
    String,         // string literal, range includes `"`
 
    Character,      // character literal, range includes `'`
 
    LineComment,    // line comment, range includes leading `//`, but not newline
 
    BlockComment,   // block comment, range includes leading `/*` and trailing `*/`
 
    // Punctuation (single character)
 
    Exclamation,    // !
 
    Question,       // ?
 
    Pound,          // #
 
    OpenAngle,      // <
 
    OpenCurly,      // {
 
    OpenParen,      // (
 
    OpenSquare,     // [
 
    CloseAngle,     // >
 
    CloseCurly,     // }
 
    CloseParen,     // )
 
    CloseSquare,    // ]
 
    Colon,          // :
 
    Comma,          // ,
 
    Dot,            // .
 
    SemiColon,      // ;
 
    // Operator-like (single character)
 
    At,             // @
 
    Plus,           // +
 
    Minus,          // -
 
    Star,           // *
 
    Slash,          // /
 
    Percent,        // %
 
    Caret,          // ^
 
    And,            // &
 
    Or,             // |
 
    Tilde,          // ~
 
    Equal,          // =
 
    // Punctuation (two characters)
 
    ColonColon,     // ::
 
    DotDot,         // ..
 
    ArrowRight,     // ->
 
    // Operator-like (two characters)
 
    AtEquals,       // @=
 
    PlusPlus,       // ++
 
    PlusEquals,     // +=
 
    MinusMinus,     // --
 
    MinusEquals,    // -=
 
    StarEquals,     // *=
 
    SlashEquals,    // /=
 
    PercentEquals,  // %=
 
    CaretEquals,    // ^=
 
    AndAnd,         // &&
 
    AndEquals,      // &=
 
    OrOr,           // ||
 
    OrEquals,       // |=
 
    EqualEqual,     // ==
 
    NotEqual,       // !=
 
    ShiftLeft,      // <<
 
    LessEquals,     // <=
 
    ShiftRight,     // >>
 
    GreaterEquals,  // >=
 
    // Operator-like (three characters)
 
    ShiftLeftEquals,// <<=
 
    ShiftRightEquals, // >>=
 
    // Special marker token to indicate end of variable-character tokens
 
    SpanEnd,
 
}
 

	
 
impl TokenKind {
 
    /// Returns true if the next expected token is the special `TokenKind::SpanEnd` token. This is
 
    /// the case for tokens of variable length (e.g. an identifier).
 
    pub(crate) fn has_span_end(&self) -> bool {
 
        return *self <= TokenKind::BlockComment
 
    }
 

	
 
    /// Returns the number of characters associated with the token. May only be called on tokens
 
    /// that do not have a variable length.
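    ///
    /// Relies on the declaration order of `TokenKind`: single-character
    /// variants come first (up to `Equal`), then two-character variants
    /// (up to `GreaterEquals`), then the three-character `<<=` and `>>=`.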
 
    fn num_characters(&self) -> u32 {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        if *self <= TokenKind::Equal {
 
            1
 
        } else if *self <= TokenKind::GreaterEquals {
 
            2
 
        } else {
 
            3
 
        }
 
    }
 

	
 
    /// Returns the characters that are represented by the token. May only be called on tokens that
 
    /// do not have a variable length.
 
    pub fn token_chars(&self) -> &'static str {
 
        debug_assert!(!self.has_span_end() && *self != TokenKind::SpanEnd);
 
        use TokenKind as TK;
 
        match self {
 
            TK::Exclamation => "!",
 
            TK::Question => "?",
 
            TK::Pound => "#",
 
            TK::OpenAngle => "<",
 
            TK::OpenCurly => "{",
 
            TK::OpenParen => "(",
 
            TK::OpenSquare => "[",
 
            TK::CloseAngle => ">",
 
            TK::CloseCurly => "}",
 
            TK::CloseParen => ")",
 
            TK::CloseSquare => "]",
 
            TK::Colon => ":",
 
            TK::Comma => ",",
 
            TK::Dot => ".",
 
            TK::SemiColon => ";",
 
            TK::At => "@",
 
            TK::Plus => "+",
 
            TK::Minus => "-",
 
            TK::Star => "*",
 
            TK::Slash => "/",
 
            TK::Percent => "%",
 
            TK::Caret => "^",
 
            TK::And => "&",
 
            TK::Or => "|",
 
            TK::Tilde => "~",
 
            TK::Equal => "=",
 
            TK::ColonColon => "::",
 
            TK::DotDot => "..",
 
            TK::ArrowRight => "->",
 
            TK::AtEquals => "@=",
 
            TK::PlusPlus => "++",
 
            TK::PlusEquals => "+=",
 
            TK::MinusMinus => "--",
 
            TK::MinusEquals => "-=",
 
            TK::StarEquals => "*=",
 
            TK::SlashEquals => "/=",
 
            TK::PercentEquals => "%=",
 
            TK::CaretEquals => "^=",
 
            TK::AndAnd => "&&",
 
            TK::AndEquals => "&=",
 
            TK::OrOr => "||",
 
            TK::OrEquals => "|=",
 
            TK::EqualEqual => "==",
 
            TK::NotEqual => "!=",
 
            TK::ShiftLeft => "<<",
 
            TK::LessEquals => "<=",
 
            TK::ShiftRight => ">>",
 
            TK::GreaterEquals => ">=",
 
            TK::ShiftLeftEquals => "<<=",
 
            TK::ShiftRightEquals => ">>=",
 
            // Let's keep these in explicitly for now, in case we want to add more symbols
 
            TK::Ident | TK::Pragma | TK::Integer | TK::String | TK::Character |
 
            TK::LineComment | TK::BlockComment | TK::SpanEnd => unreachable!(),
 
        }
 
    }
 
}
 

	
 
/// Represents a single token at a particular position.
 
pub struct Token {
 
    pub kind: TokenKind,
 
    pub pos: InputPosition,
 
}
 

	
 
impl Token {
 
    pub(crate) fn new(kind: TokenKind, pos: InputPosition) -> Self {
 
        Self{ kind, pos }
 
    }
 
}
 

	
 
#[derive(Debug, Clone, Copy)]
 
pub enum TokenMarkerKind {
 
    Pragma,
 
    Import,
 
    Definition,
 
}
 

	
 
/// A marker for a specific token. These are stored separately from the array of
 
/// tokens. These are used for initial symbol, module name, and import
 
/// discovery.
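///
/// For example (illustrative): for the source `import std.global;` the
/// tokenizer emits a marker with `kind: TokenMarkerKind::Import` whose
/// `first_token` is the index of the `import` identifier token, while
/// `last_token` starts out as `u32::MAX` until a later pass fills it in.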
 
#[derive(Debug)]
 
pub struct TokenMarker {
 
    pub kind: TokenMarkerKind,
 
    pub curly_depth: u32,
 
    // Indices into token buffer. The first token is inclusive and set upon
 
    // tokenization, the last token is set at a later stage in parsing (e.g.
 
    // at symbol discovery we may parse some of the `Pragma` tokens and set the
 
    // last parsed token)
 
    pub first_token: u32,
 
    pub last_token: u32,
 
    pub handled: bool,
 
}
 

	
 
/// The kind of token ranges that are specially parsed by the tokenizer.
 
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 
pub enum TokenRangeKind {
 
    Module,
 
    Pragma,
 
    Import,
 
    Definition,
 
    Code,
 
}
 

	
 
pub const NO_RELATION: i32 = -1;
 
pub const NO_SIBLING: i32 = NO_RELATION;
 

	
 
/// A range of tokens with a specific meaning. Such a range is part of a tree
 
/// where each parent tree envelops all of its children.
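///
/// An illustrative layout for a module containing one import and one
/// definition (indices into `TokenBuffer.ranges`, -1 meaning NO_RELATION):
///
/// ```text
/// [0] Module      parent: -1, first_child: 1, last_child: 2
/// [1] Import      parent:  0, next_sibling: 2
/// [2] Definition  parent:  0, next_sibling: -1
/// ```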
 
#[derive(Debug)]
 
pub struct TokenRange {
 
    // Index of the parent in `TokenBuffer.ranges`. Only a range of kind

    // Module has no parent, in which case the parent index is -1.
 
    pub parent_idx: i32,
 
    pub range_kind: TokenRangeKind,
 
    pub curly_depth: u32,
 
    // Offsets into `TokenBuffer.tokens`: the tokens belonging to this range.
 
    pub start: u32,             // first token (inclusive index)
 
    pub end: u32,               // last token (exclusive index)
 
    // Child ranges
 
    pub num_child_ranges: u32,  // Number of subranges
 
    pub first_child_idx: i32,   // First subrange (or -1 if no subranges)
 
    pub last_child_idx: i32,    // Last subrange (or -1 if no subranges)
 
    pub next_sibling_idx: i32,  // Next subrange (or -1 if no next subrange)
 
}
 

	
 
pub struct TokenBuffer {
 
    pub tokens: Vec<Token>,
 
    pub markers: Vec<TokenMarker>,
 
    pub ranges: Vec<TokenRange>,
 
}
 

	
 
impl TokenBuffer {
 
    pub(crate) fn new() -> Self {
 
        return Self{
 
            tokens: Vec::new(),
 
            markers: Vec::new(),
 
            ranges: Vec::new()
 
        };
 
    }
 

	
 
    pub(crate) fn iter_range(
 
        &self, inclusive_start: u32, exclusive_end: Option<u32>
 
    ) -> TokenIter {
 
        let exclusive_end = exclusive_end.unwrap_or(self.tokens.len() as u32) as usize;
 
        debug_assert!(exclusive_end <= self.tokens.len());
 
        TokenIter::new(self, inclusive_start as usize, exclusive_end)
 
    }
 
}
 

	
 
/// Iterator over tokens within a specific `TokenRange`.
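///
/// A usage sketch (illustrative) of walking the meaningful tokens in a range:
///
/// ```ignore
/// let mut iter = buffer.iter_range(range.start, Some(range.end));
/// while let Some(kind) = iter.next() {
///     let span = iter.next_span(); // safe: `next` returned Some
///     // ... handle token `kind` at `span` ...
///     iter.consume();
/// }
/// ```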
 
pub(crate) struct TokenIter<'a> {
 
    tokens: &'a Vec<Token>,
 
    cur: usize,
 
    end: usize,
 
}
 

	
 
impl<'a> TokenIter<'a> {
 
    fn new(buffer: &'a TokenBuffer, start: usize, end: usize) -> Self {
 
        Self{ tokens: &buffer.tokens, cur: start, end }
 
    }
 

	
 
    /// Returns the next token (may include comments), or `None` if at the end
 
    /// of the range.
 
    pub(crate) fn next_including_comments(&self) -> Option<TokenKind> {
 
        if self.cur >= self.end {
 
            return None;
 
        }
 

	
 
        let token = &self.tokens[self.cur];
 
        Some(token.kind)
 
    }
 

	
 
    /// Returns the next token (but skips over comments), or `None` if at the
 
    /// end of the range
 
    pub(crate) fn next(&mut self) -> Option<TokenKind> {
 
        while let Some(token_kind) = self.next_including_comments() {
 
            if token_kind != TokenKind::LineComment && token_kind != TokenKind::BlockComment {
 
                return Some(token_kind);
 
            }
 
            self.consume();
 
        }
 

	
 
        return None
 
    }
 

	
 
    /// Peeks ahead by one token (i.e. the one that comes after `next()`), and
 
    /// skips over comments
 
    pub(crate) fn peek(&self) -> Option<TokenKind> {
 
        for next_idx in self.cur + 1..self.end {
 
            let next_kind = self.tokens[next_idx].kind;
 
            if next_kind != TokenKind::LineComment && next_kind != TokenKind::BlockComment && next_kind != TokenKind::SpanEnd {
 
                return Some(next_kind);
 
            }
 
        }
 

	
 
        return None;
 
    }
 

	
 
    /// Returns the start position belonging to the token returned by `next`. If
 
    /// there is not a next token, then we return the end position of the
 
    /// previous token.
 
    pub(crate) fn last_valid_pos(&self) -> InputPosition {
 
        if self.cur < self.end {
 
            // Return token position
 
            return self.tokens[self.cur].pos
 
        }
 

	
 
        // Return previous token end
 
        let token = &self.tokens[self.cur - 1];
 
        return if token.kind == TokenKind::SpanEnd {
 
            token.pos
 
        } else {
 
            token.pos.with_offset(token.kind.num_characters())
 
        };
 
    }
 

	
 
    /// Assumes the iterator is not at the end and returns the starting position
 
    /// belonging to the token returned by `next`.
 
    pub(crate) fn next_start_position(&self) -> InputPosition {
 
        debug_assert!(self.cur < self.end);
 
        return self.tokens[self.cur].pos;
 
    }
 

	
 
    /// Returns the begin and end positions of the token returned by `next`. This
 
    /// assumes that we're not at the end of the range we're iterating over.
 
    pub(crate) fn next_positions(&self) -> (InputPosition, InputPosition) {
 
        debug_assert!(self.cur < self.end);
 
        let token = &self.tokens[self.cur];
 
        if token.kind.has_span_end() {
 
            let span_end = &self.tokens[self.cur + 1];
 
            debug_assert_eq!(span_end.kind, TokenKind::SpanEnd);
 
            (token.pos, span_end.pos)
 
        } else {
 
            let offset = token.kind.num_characters();
 
            (token.pos, token.pos.with_offset(offset))
 
        }
 
    }
 

	
 
    /// See `next_positions`
 
    pub(crate) fn next_span(&self) -> InputSpan {
 
        let (begin, end) = self.next_positions();
 
        return InputSpan::from_positions(begin, end)
 
    }
 

	
 
    /// Advances the iterator to the next (meaningful) token.
 
    pub(crate) fn consume(&mut self) {
 
        if let Some(kind) = self.next_including_comments() {
 
            if kind.has_span_end() {
 
                self.cur += 2;
 
            } else {
 
                self.cur += 1;
 
            }
 
        }
 
    }
 

	
 
    pub(crate) fn token_index(&self) -> u32 {
 
        return self.cur as u32;
 
    }
 

	
 
    /// Saves the current iteration position, may be passed to `load` to return
 
    /// the iterator to a previous position.
 
    pub(crate) fn save(&self) -> (usize, usize) {
 
        (self.cur, self.end)
 
    }
 

	
 
    pub(crate) fn load(&mut self, saved: (usize, usize)) {
 
        self.cur = saved.0;
 
        self.end = saved.1;
 
    }
 
}
 
\ No newline at end of file
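For context, a minimal sketch of how a parser pass might drive this iterator. Only `TokenBuffer`, `TokenIter` and `TokenKind` come from this file; the function itself is hypothetical:

// Hypothetical usage sketch, not part of this changeset: walk all
// meaningful tokens of a tokenized module.
fn dump_meaningful_tokens(buffer: &TokenBuffer) {
    // Iterate from the first token to the end of the buffer
    let mut iter = buffer.iter_range(0, None);
    while let Some(kind) = iter.next() { // skips line/block comments
        // `next` does not advance, so position queries still refer to
        // the token we just matched on
        println!("{:?} at line {}", kind, iter.next_start_position().line);
        iter.consume(); // step over the token (and its SpanEnd, if any)
    }
}

A speculative parse would bracket such a loop with `iter.save()` and `iter.load(..)` to backtrack after a failed attempt.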
src/protocol/token_writer.rs
#![allow(dead_code)]

-use std::fmt::Write;
+use std::fmt::{Write, Error as FmtError};
use std::io::Write as IOWrite;

use crate::protocol::input_source::{InputSource, InputSpan};
use crate::protocol::parser::Module;
-use crate::protocol::tokens::{Token, TokenKind, TokenRange};
+use crate::protocol::tokens::{Token, TokenKind, TokenMarker};

pub(crate) struct TokenWriter {
    buffer: String,
}

impl TokenWriter {
    pub(crate) fn new() -> Self {
        return Self{
            buffer: String::with_capacity(4096),
        }
    }

    pub(crate) fn write<W: IOWrite>(&mut self, w: &mut W, modules: &[Module]) {
        self.buffer.clear();
        for module in modules {
            self.write_module_tokens(module);
        }

        w.write_all(self.buffer.as_bytes()).expect("write tokens");
    }

    fn write_module_tokens(&mut self, module: &Module) {
        self.write_dashed_indent(0);

        match &module.name {
            Some(name) => writeln!(self.buffer, "Module: {}", name.1.as_str()).unwrap(),
            None => self.buffer.push_str("Unnamed module\n"),
        }

-        let mut range_index = -1;
-        if !module.tokens.ranges.is_empty() {
-            range_index = 0;
-        }
-
-        while range_index >= 0 {
-            range_index = self.write_token_range(
-                &module.source, &module.tokens.tokens, &module.tokens.ranges, range_index, 1
-            );
-        }
+        self.write_marker_array(&module.tokens.markers, 1).expect("write markers");
+        self.write_token_array(&module.source, &module.tokens.tokens, 1).expect("write tokens");
    }

-    /// Writes a single token range. Recurses if there are any child ranges.
-    /// Returns the next token range index to iterate over (or a negative
-    /// number, if there are no more sibling ranges).
-    fn write_token_range(&mut self, source: &InputSource, tokens: &[Token], ranges: &[TokenRange], range_index: i32, indent: u32) -> i32 {
-        // Write range kind
-        let range = &ranges[range_index as usize];
-        self.write_dashed_indent(indent);
-        writeln!(self.buffer, "Range: {:?}", range.range_kind);
-
-        // Write tokens/lines it spans
-        let first_token_pos = tokens[range.start as usize].pos;
-        let last_token_pos = if (range.end as usize) < tokens.len() {
-            tokens[range.end as usize].pos
-        } else {
-            tokens.last().unwrap().pos
-        };
-        let first_source_col = source.get_column(first_token_pos);
-        let last_source_col = source.get_column(last_token_pos);
-
-        self.write_indent(indent);
-        writeln!(
-            self.buffer, "Source: token {} to {}, file {}:{}:{} to {}:{}",
-            range.start, range.end, source.filename,
-            first_token_pos.line, first_source_col,
-            last_token_pos.line, last_source_col
-        );
-
-        let next_sibling_index = range.next_sibling_idx;
-        if range.num_child_ranges == 0 {
-            // No child ranges, so dump the tokens here
-            debug_assert!(range.first_child_idx < 0);
-            self.write_token_array(source, tokens, range, indent);
-        } else {
-            // Child ranges
-            debug_assert!(range.first_child_idx >= 0);
-            self.write_indent(indent);
-            writeln!(self.buffer, "Children: [");
-
-            let mut range_index = range.first_child_idx;
-            while range_index >= 0 {
-                range_index = self.write_token_range(source, tokens, ranges, range_index, indent + 1);
-            }
-
-            self.write_indent(indent);
-            writeln!(self.buffer, "]");
-        }
-
-        // Wrote everything, return the next sibling token range
-        return next_sibling_index;
-    }
+    fn write_marker_array(&mut self, markers: &[TokenMarker], indent: u32) -> Result<(), FmtError> {
+        self.write_indent(indent);
+        writeln!(self.buffer, "Markers: [")?;
+
+        let marker_indent = indent + 1;
+        for marker in markers {
+            self.write_indent(marker_indent);
+            writeln!(self.buffer, "{:?}", marker)?;
+        }
+
+        self.write_indent(indent);
+        writeln!(self.buffer, "]")?;
+
+        return Ok(());
+    }

-    fn write_token_array(&mut self, source: &InputSource, tokens: &[Token], range: &TokenRange, indent: u32) {
+    fn write_token_array(&mut self, source: &InputSource, tokens: &[Token], indent: u32) -> Result<(), FmtError> {
        self.write_indent(indent);
-        writeln!(self.buffer, "Tokens: [");
+        writeln!(self.buffer, "Tokens: [")?;

+        let num_tokens = tokens.len();
        let token_indent = indent + 1;
-        for token_index in range.start as usize..range.end as usize {
+        for token_index in 0..num_tokens {
            // Skip uninteresting tokens
            let token = &tokens[token_index];
            if token.kind == TokenKind::SpanEnd {
                continue;
            }

            self.write_indent(token_indent);
-            write!(self.buffer, "{:?} (index {})", token.kind, token_index);
+            write!(self.buffer, "{:?} (index {})", token.kind, token_index)?;
            if token.kind.has_span_end() {
                let token_start = token.pos;
                let token_end = tokens[token_index + 1].pos;
                let section = source.section_at_span(InputSpan::from_positions(token_start, token_end));
-                writeln!(self.buffer, " text: {}", String::from_utf8_lossy(section));
+                writeln!(self.buffer, " text: {}", String::from_utf8_lossy(section))?;
            } else {
                self.buffer.push('\n');
            }
        }

        self.write_indent(indent);
-        writeln!(self.buffer, "]");
+        writeln!(self.buffer, "]")?;
+
+        return Ok(());
    }

    fn write_dashed_indent(&mut self, indent: u32) {
        for _ in 0..indent * 2 {
            self.buffer.push(' ');
        }
        self.buffer.push('-');
        self.buffer.push(' ');
    }

    fn write_indent(&mut self, indent: u32) {
        for _ in 0..(indent + 1)*2 {
            self.buffer.push(' ');
        }
    }
}
\ No newline at end of file
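As a rough illustration of the writer's entry point after this change; `TokenWriter`, `Module` and the `write` signature are from this changeset, the surrounding driver is assumed:

// Hypothetical driver, not part of this changeset: dump the markers and
// tokens of every tokenized module to stdout.
fn debug_print_tokens(modules: &[Module]) {
    let mut writer = TokenWriter::new();
    let mut stdout = std::io::stdout();
    writer.write(&mut stdout, modules);
}

With the ranges gone, each module now prints as a flat `Markers: [...]` block followed by a single `Tokens: [...]` block, instead of the old recursive `Range:`/`Children:` tree.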