Changeset - 13bac4f35619
[Not reviewed]
1 4 4
MH - 4 years ago 2021-04-10 01:23:23
contact@maxhenger.nl
WIP on tokenization
9 files changed with 1146 insertions and 188 deletions:
0 comments (0 inline, 0 general)
src/collections/mod.rs
Show inline comments
 
new file 100644
 
mod string_pool;
 

	
 
pub(crate) use string_pool::{StringPool, StringRef};
 
\ No newline at end of file
src/collections/string_pool.rs
Show inline comments
 
new file 100644
 
use std::ptr::null_mut;
 

	
 
const SLAB_SIZE: usize = u16::max_value() as usize;
 

	
 
pub struct StringRef {
 
    data: *const u8,
 
    length: usize,
 
}
 

	
 
impl StringRef {
 
    pub fn as_str<'a>(&'a self) -> &'a str {
 
        unsafe {
 
            let slice = std::slice::from_raw_parts::<'a, u8>(self.data, self.length);
 
            std::str::from_utf8_unchecked(slice)
 
        }
 
    }
 
}
 

	
 
struct StringPoolSlab {
 
    prev: *mut StringPoolSlab,
 
    data: Vec<u8>,
 
    remaining: usize,
 
}
 

	
 
impl StringPoolSlab {
 
    fn new(prev: *mut StringPoolSlab) -> Self {
 
        Self{ prev, data: Vec::with_capacity(SLAB_SIZE), remaining: SLAB_SIZE }
 
    }
 
}
 

	
 
/// StringPool is a ever-growing pool of strings. Strings have a maximum size
 
/// equal to the slab size. The slabs are essentially a linked list to maintain
 
/// pointer-stability of the strings themselves.
 
/// All `StringRef` instances are invalidated when the string pool is dropped
 
pub(crate) struct StringPool {
 
    last: *mut StringPoolSlab,
 
}
 

	
 
impl StringPool {
 
    pub(crate) fn new() -> Self {
 
        // To have some stability we just turn a box into a raw ptr.
 
        let initial_slab = Box::new(StringPoolSlab::new(null_mut()));
 
        let initial_slab = Box::into_raw(initial_slab);
 
        StringPool{
 
            last: initial_slab,
 
        }
 
    }
 

	
 
    // Interns a string to the StringPool, returning a reference to it
 
    pub(crate) fn intern(&mut self, data: &[u8]) -> StringRef {
 
        // TODO: Large string allocations, if ever needed.
 
        let data_len = data.len();
 
        assert!(data_len <= SLAB_SIZE, "string is too large for slab");
 
        debug_assert!(std::str::from_utf8(data).is_ok(), "string to intern is not valid UTF-8 encoded");
 
        
 
        let mut last = unsafe{&mut *self.last};
 
        if data.len() > last.remaining {
 
            // Doesn't fit: allocate new slab
 
            self.alloc_new_slab();
 
            last = unsafe{&mut *self.last};
 
        }
 

	
 
        // Must fit now
 
        debug_assert!(data_len <= last.remaining);
 
        let range_start = last.data.len();
 
        last.data.extend_from_slice(data);
 
        last.remaining -= data_len;
 
        debug_assert_eq!(range_start + data_len, last.data.len());
 

	
 
        unsafe {
 
            let start = last.data.as_ptr().offset(range_start as isize);
 
            StringRef{ data: start, length: data_len }
 
        }
 
    }
 

	
 
    fn alloc_new_slab(&mut self) {
 
        let new_slab = Box::new(StringPoolSlab::new(self.last));
 
        let new_slab = Box::into_raw(new_slab);
 
        self.last = new_slab;
 
    }
 
}
 

	
 
impl Drop for StringPool {
 
    fn drop(&mut self) {
 
        let mut new_slab = self.last;
 
        while !new_slab.is_null() {
 
            let cur_slab = new_slab;
 
            unsafe {
 
                new_slab = (*cur_slab).prev;
 
                Box::from_raw(cur_slab); // consume and deallocate
 
            }
 
        }
 
    }
 
}
 

	
 
#[cfg(test)]
 
mod tests {
 
    use super::*;
 

	
 
    #[test]
 
    fn test_string_just_fits() {
 
        let large = "0".repeat(SLAB_SIZE);
 
        let mut pool = StringPool::new();
 
        let interned = pool.intern(large.as_bytes());
 
        assert_eq!(interned.as_str(), large);
 
    }
 

	
 
    #[test]
 
    #[should_panic]
 
    fn test_string_too_large() {
 
        let large = "0".repeat(SLAB_SIZE + 1);
 
        let mut pool = StringPool::new();
 
        let _interned = pool.intern(large.as_bytes());
 
    }
 

	
 
    #[test]
 
    fn test_lots_of_small_allocations() {
 
        const NUM_PER_SLAB: usize = 32;
 
        const NUM_SLABS: usize = 4;
 

	
 
        let to_intern = "0".repeat(SLAB_SIZE / NUM_PER_SLAB);
 
        let mut pool = StringPool::new();
 

	
 
        let mut last_slab = pool.last;
 
        let mut all_refs = Vec::new();
 

	
 
        // Fill up first slab
 
        for _alloc_idx in 0..NUM_PER_SLAB {
 
            let interned = pool.intern(to_intern.as_bytes());
 
            all_refs.push(interned);
 
            assert!(std::ptr::eq(last_slab, pool.last));
 
        }
 

	
 
        for _slab_idx in 0..NUM_SLABS-1 {
 
            for alloc_idx in 0..NUM_PER_SLAB {
 
                let interned = pool.intern(to_intern.as_bytes());
 
                all_refs.push(interned);
 

	
 
                if alloc_idx == 0 {
 
                    // First allocation produces a new slab
 
                    assert!(!std::ptr::eq(last_slab, pool.last));
 
                    last_slab = pool.last;
 
                } else {
 
                    assert!(std::ptr::eq(last_slab, pool.last));
 
                }
 
            }
 
        }
 

	
 
        // All strings are still correct
 
        for string_ref in all_refs {
 
            assert_eq!(string_ref.as_str(), to_intern);
 
        }
 
    }
 
}
 
\ No newline at end of file
src/lib.rs
Show inline comments
 
@@ -4,6 +4,7 @@ mod macros;
 
mod common;
 
mod protocol;
 
mod runtime;
 
mod collections;
 

	
 
pub use common::{ConnectorId, EndpointPolarity, Payload, Polarity, PortId};
 
pub use protocol::{ProtocolDescription, TRIVIAL_PD};
src/protocol/input_source2.rs
Show inline comments
 
new file 100644
 
use std::fmt;
 
use std::cell::{Ref, RefCell};
 

	
 
#[derive(Debug, Clone, Copy)]
 
pub struct InputPosition2 {
 
    pub line: u32,
 
    pub offset: u32,
 
}
 

	
 
pub struct InputSpan {
 
    pub begin: InputPosition2,
 
    pub end: InputPosition2,
 
}
 

	
 
impl InputSpan {
 
    #[inline]
 
    fn from_positions(begin: InputPosition2, end: InputPosition2) -> Self {
 
        Self { begin, end }
 
    }
 
}
 

	
 
/// Wrapper around source file with optional filename. Ensures that the file is
 
/// only scanned once.
 
pub struct InputSource2 {
 
    pub(crate) filename: String,
 
    pub(crate) input: Vec<u8>,
 
    // Iteration
 
    line: u32,
 
    offset: usize,
 
    // State tracking
 
    had_error: Option<ParseError>,
 
    // The offset_lookup is built on-demand upon attempting to report an error.
 
    // As the compiler is currently not multithreaded, we simply put it in a 
 
    // RefCell to allow interior mutability.
 
    offset_lookup: RefCell<Vec<u32>>,
 
}
 

	
 
impl InputSource2 {
 
    pub fn new(filename: String, input: Vec<u8>) -> Self {
 
        Self{
 
            filename,
 
            input,
 
            line: 1,
 
            offset: 0,
 
            had_error: None,
 
            offset_lookup: RefCell::new(Vec::new()),
 
        }
 
    }
 

	
 
    #[inline]
 
    pub fn pos(&self) -> InputPosition2 {
 
        InputPosition2{ line: self.line, offset: self.offset as u32 }
 
    }
 

	
 
    pub fn next(&self) -> Option<u8> {
 
        if self.offset < self.input.len() {
 
            Some(self.input[self.offset])
 
        } else {
 
            None
 
        }
 
    }
 

	
 
    pub fn lookahead(&self, offset: usize) -> Option<u8> {
 
        let offset_pos = self.offset + offset;
 
        if offset_pos < self.input.len() {
 
            Some(self.input[offset_pos])
 
        } else {
 
            None
 
        }
 
    }
 

	
 
    pub fn section(&self, start: u32, end: u32) -> &[u8] {
 
        &self.input[start as usize..end as usize]
 
    }
 

	
 
    // Consumes the next character. Will check well-formedness of newlines: \r
 
    // must be followed by a \n, because this is used for error reporting. Will
 
    // not check for ascii-ness of the file, better left to a tokenizer.
 
    pub fn consume(&mut self) {
 
        match self.next() {
 
            Some(b'\r') => {
 
                if Some(b'\n') == self.lookahead(1) {
 
                    // Well formed file
 
                    self.offset += 1;
 
                } else {
 
                    // Not a well-formed file, pretend like we can continue
 
                    self.offset += 1;
 
                    self.set_error("Encountered carriage-feed without a following newline");
 
                }
 
            },
 
            Some(b'\n') => {
 
                self.line += 1;
 
                self.offset += 1;
 
            },
 
            Some(_) => {
 
                self.offset += 1;
 
            }
 
            None => {}
 
        }
 

	
 
        // Maybe we actually want to check this in release mode. Then again:
 
        // a 4 gigabyte source file... Really?
 
        debug_assert!(self.offset < u32::max_value() as usize);
 
    }
 

	
 
    fn set_error(&mut self, msg: &str) {
 
        if self.had_error.is_none() {
 
            self.had_error = Some(ParseError::new_error(self, self.pos(), msg));
 
        }
 
    }
 

	
 
    fn get_lookup(&self) -> Ref<Vec<u32>> {
 
        // Once constructed the lookup always contains one element. We use this
 
        // to see if it is constructed already.
 
        let lookup = self.offset_lookup.borrow();
 
        if !lookup.is_empty() {
 
            return lookup;
 
        }
 

	
 
        // Build the line number (!) to offset lookup, so offset by 1. We 
 
        // assume the entire source file is scanned (most common case) for
 
        // preallocation.
 
        let lookup = self.offset_lookup.borrow_mut();
 
        lookup.reserve(self.line as usize + 2);
 
        lookup.push(0); // line 0: never used
 
        lookup.push(0); // first line: first character
 

	
 
        for char_idx in 0..self.input.len() {
 
            if self.input[char_idx] == b'\n' {
 
                lookup.push(char_idx as u32 + 1);
 
            }
 
        }
 

	
 
        lookup.push(self.input.len() as u32); // for lookup_line_end
 
        debug_assert_eq!(self.line as usize + 2, lookup.len(), "remove me: i am a testing assert and sometimes invalid");
 

	
 
        // Return created lookup
 
        let lookup = self.offset_lookup.borrow();
 
        return lookup;
 
    }
 

	
 
    fn lookup_line_start_offset(&self, line_number: u32) -> u32 {
 
        let lookup = self.get_lookup();
 
        lookup[line_number as usize]
 
    }
 

	
 
    fn lookup_line_end_offset(&self, line_number: u32) -> u32 {
 
        let lookup = self.get_lookup();
 
        let offset = lookup[(line_number + 1) as usize] - 1;
 
        let offset_usize = offset as usize;
 

	
 
        // Compensate for newlines and a potential carriage feed
 
        if self.input[offset_usize] == b'\n' {
 
            if offset_usize > 0 && self.input[offset_usize] == b'\r' {
 
                offset - 2
 
            } else {
 
                offset - 1
 
            }
 
        } else {
 
            offset
 
        }
 
    }
 
}
 

	
 
#[derive(Debug)]
 
pub enum ParseErrorType {
 
    Info,
 
    Error
 
}
 

	
 
#[derive(Debug)]
 
pub struct ParseErrorStatement {
 
    pub(crate) error_type: ParseErrorType,
 
    pub(crate) line: u32,
 
    pub(crate) column: u32,
 
    pub(crate) offset: u32,
 
    pub(crate) filename: String,
 
    pub(crate) context: String,
 
    pub(crate) message: String,
 
}
 

	
 
impl ParseErrorStatement {
 
    fn from_source(error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        // Seek line start and end
 
        let line_start = source.lookup_line_start_offset(position.line);
 
        let line_end = source.lookup_line_end_offset(position.line);
 
        debug_assert!(position.offset >= line_start);
 
        let column = position.offset - line_start + 1;
 

	
 
        Self{
 
            error_type,
 
            line: position.line,
 
            column,
 
            offset: position.offset,
 
            filename: source.filename.clone(),
 
            context: String::from_utf8_lossy(&source.input[line_start as usize..line_end as usize]).to_string(),
 
            message: msg.to_string()
 
        }
 
    }
 
}
 

	
 
impl fmt::Display for ParseErrorStatement {
 
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 
        // Write message
 
        match self.error_type {
 
            ParseErrorType::Info => write!(f, " INFO: ")?,
 
            ParseErrorType::Error => write!(f, "ERROR: ")?,
 
        }
 
        writeln!(f, "{}", &self.message)?;
 

	
 
        // Write originating file/line/column
 
        if self.filename.is_empty() {
 
            writeln!(f, " +- at {}:{}", self.line, self.column)?;
 
        } else {
 
            writeln!(f, " +- at {}:{}:{}", self.filename, self.line, self.column)?;
 
        }
 

	
 
        // Write source context
 
        writeln!(f, " | ")?;
 
        writeln!(f, " | {}", self.context)?;
 

	
 
        // Write underline indicating where the error ocurred
 
        debug_assert!(self.column as usize <= self.context.chars().count());
 
        let mut arrow = String::with_capacity(self.context.len() + 3);
 
        arrow.push_str(" | ");
 
        let mut char_col = 1;
 
        for char in self.context.chars() {
 
            if char_col == self.column { break; }
 
            if char == '\t' {
 
                arrow.push('\t');
 
            } else {
 
                arrow.push(' ');
 
            }
 

	
 
            char_col += 1;
 
        }
 
        arrow.push('^');
 
        writeln!(f, "{}", arrow)?;
 

	
 
        Ok(())
 
    }
 
}
 

	
 
#[derive(Debug)]
 
pub struct ParseError {
 
    pub(crate) statements: Vec<ParseErrorStatement>
 
}
 

	
 
impl fmt::Display for ParseError {
 
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 
        if self.statements.is_empty() {
 
            return Ok(())
 
        }
 

	
 
        self.statements[0].fmt(f)?;
 
        for statement in self.statements.iter().skip(1) {
 
            writeln!(f)?;
 
            statement.fmt(f)?;
 
        }
 

	
 
        Ok(())
 
    }
 
}
 

	
 
impl ParseError {
 
    pub fn empty() -> Self {
 
        Self{ statements: Vec::new() }
 
    }
 

	
 
    pub fn new_error(source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        Self{ statements: vec!(ParseErrorStatement::from_source(ParseErrorType::Error, source, position, msg))}
 
    }
 

	
 
    pub fn with_prefixed(mut self, error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        self.statements.insert(0, ParseErrorStatement::from_source(error_type, source, position, msg));
 
        self
 
    }
 

	
 
    pub fn with_postfixed(mut self, error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        self.statements.push(ParseErrorStatement::from_source(error_type, source, position, msg));
 
        self
 
    }
 

	
 
    pub fn with_postfixed_info(self, source: &InputSource2, position: InputPosition2, msg: &str) -> Self {
 
        self.with_postfixed(ParseErrorType::Info, source, position, msg)
 
    }
 
}
src/protocol/inputsource.rs
Show inline comments
 
@@ -164,11 +164,6 @@ impl InputPosition {
 
        }
 
        &source.input[start..end]
 
    }
 
    fn eval_error<S: ToString>(&self, message: S) -> EvalError {
 
        EvalError { position: *self, message: message.to_string(), backtrace: Backtrace::new() }
 
    }
 

	
 
    pub(crate) fn col(&self) -> usize { self.column }
 
}
 

	
 
impl Default for InputPosition {
 
@@ -185,9 +180,6 @@ impl fmt::Display for InputPosition {
 

	
 
pub trait SyntaxElement {
 
    fn position(&self) -> InputPosition;
 
    fn error<S: ToString>(&self, message: S) -> EvalError {
 
        self.position().eval_error(message)
 
    }
 
}
 

	
 
#[derive(Debug)]
 
@@ -315,172 +307,3 @@ impl ParseError {
 
        self.with_postfixed(ParseErrorType::Info, source, position, msg)
 
    }
 
}
 

	
 
#[derive(Debug, Clone)]
 
pub struct EvalError {
 
    position: InputPosition,
 
    message: String,
 
    backtrace: Backtrace,
 
}
 

	
 
impl EvalError {
 
    pub fn new<S: ToString>(position: InputPosition, message: S) -> EvalError {
 
        EvalError { position, message: message.to_string(), backtrace: Backtrace::new() }
 
    }
 
    // Diagnostic methods
 
    pub fn write<A: io::Write>(&self, source: &InputSource, writer: &mut A) -> io::Result<()> {
 
        if !source.filename.is_empty() {
 
            writeln!(
 
                writer,
 
                "Evaluation error at {}:{}: {}",
 
                source.filename, self.position, self.message
 
            )?;
 
        } else {
 
            writeln!(writer, "Evaluation error at {}: {}", self.position, self.message)?;
 
        }
 
        let line = self.position.context(source);
 
        writeln!(writer, "{}", String::from_utf8_lossy(line))?;
 
        let mut arrow: Vec<u8> = Vec::new();
 
        for pos in 1..self.position.column {
 
            let c = line[pos - 1];
 
            if c == b'\t' {
 
                arrow.push(b'\t')
 
            } else {
 
                arrow.push(b' ')
 
            }
 
        }
 
        arrow.push(b'^');
 
        writeln!(writer, "{}", String::from_utf8_lossy(&arrow))
 
    }
 
    pub fn print(&self, source: &InputSource) {
 
        self.write(source, &mut std::io::stdout()).unwrap()
 
    }
 
    pub fn display<'a>(&'a self, source: &'a InputSource) -> DisplayEvalError<'a> {
 
        DisplayEvalError::new(self, source)
 
    }
 
}
 

	
 
impl From<EvalError> for io::Error {
 
    fn from(_: EvalError) -> io::Error {
 
        io::Error::new(io::ErrorKind::InvalidInput, "eval error")
 
    }
 
}
 

	
 
#[derive(Clone, Copy)]
 
pub struct DisplayEvalError<'a> {
 
    error: &'a EvalError,
 
    source: &'a InputSource,
 
}
 

	
 
impl DisplayEvalError<'_> {
 
    fn new<'a>(error: &'a EvalError, source: &'a InputSource) -> DisplayEvalError<'a> {
 
        DisplayEvalError { error, source }
 
    }
 
}
 

	
 
impl fmt::Display for DisplayEvalError<'_> {
 
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 
        let mut vec: Vec<u8> = Vec::new();
 
        match self.error.write(self.source, &mut vec) {
 
            Err(_) => {
 
                return fmt::Result::Err(fmt::Error);
 
            }
 
            Ok(_) => {}
 
        }
 
        write!(f, "{}", String::from_utf8_lossy(&vec))
 
    }
 
}
 

	
 
// #[cfg(test)]
 
// mod tests {
 
//     use super::*;
 

	
 
//     #[test]
 
//     fn test_from_string() {
 
//         let mut is = InputSource::from_string("#version 100\n").unwrap();
 
//         assert!(is.input.len() == 13);
 
//         assert!(is.line == 1);
 
//         assert!(is.column == 1);
 
//         assert!(is.offset == 0);
 
//         let ps = is.pos();
 
//         assert!(ps.line == 1);
 
//         assert!(ps.column == 1);
 
//         assert!(ps.offset == 0);
 
//         assert!(is.next() == Some(b'#'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'v'));
 
//         assert!(is.lookahead(1) == Some(b'e'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'e'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'r'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b's'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'i'));
 
//         is.consume();
 
//         {
 
//             let ps = is.pos();
 
//             assert_eq!(b"#version 100", ps.context(&is));
 
//             let er = is.error("hello world!");
 
//             let mut vec: Vec<u8> = Vec::new();
 
//             er.write(&is, &mut vec).unwrap();
 
//             assert_eq!(
 
//                 "Parse error at 1:7: hello world!\n#version 100\n      ^\n",
 
//                 String::from_utf8_lossy(&vec)
 
//             );
 
//         }
 
//         assert!(is.next() == Some(b'o'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'n'));
 
//         is.consume();
 
//         assert!(is.input.len() == 13);
 
//         assert!(is.line == 1);
 
//         assert!(is.column == 9);
 
//         assert!(is.offset == 8);
 
//         assert!(is.next() == Some(b' '));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'1'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'0'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'0'));
 
//         is.consume();
 
//         assert!(is.input.len() == 13);
 
//         assert!(is.line == 1);
 
//         assert!(is.column == 13);
 
//         assert!(is.offset == 12);
 
//         assert!(is.next() == Some(b'\n'));
 
//         is.consume();
 
//         assert!(is.input.len() == 13);
 
//         assert!(is.line == 2);
 
//         assert!(is.column == 1);
 
//         assert!(is.offset == 13);
 
//         {
 
//             let ps = is.pos();
 
//             assert_eq!(b"", ps.context(&is));
 
//         }
 
//         assert!(is.next() == None);
 
//         is.consume();
 
//         assert!(is.next() == None);
 
//     }
 

	
 
//     #[test]
 
//     fn test_split() {
 
//         let mut is = InputSource::from_string("#version 100\n").unwrap();
 
//         let backup = is.clone();
 
//         assert!(is.next() == Some(b'#'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'v'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'e'));
 
//         is.consume();
 
//         is = backup;
 
//         assert!(is.next() == Some(b'#'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'v'));
 
//         is.consume();
 
//         assert!(is.next() == Some(b'e'));
 
//         is.consume();
 
//     }
 
// }
src/protocol/lexer2.rs
Show inline comments
 
new file 100644
 

	
 
pub struct Lexer<'a> {
 
    source: &'a mut InputSource,
 
}
 
\ No newline at end of file
src/protocol/mod.rs
Show inline comments
 
@@ -2,16 +2,17 @@ mod arena;
 
// mod ast;
 
mod eval;
 
pub(crate) mod inputsource;
 
pub(crate) mod input_source2;
 
// mod lexer;
 
mod tokenizer;
 
mod parser;
 
mod pools;
 
#[cfg(test)] mod tests;
 

	
 
// TODO: Remove when not benchmarking
 
pub(crate) mod ast;
 
pub(crate) mod ast_printer;
 
pub(crate) mod lexer;
 
pub(crate) mod lexer2;
 

	
 
lazy_static::lazy_static! {
 
    /// Conveniently-provided protocol description initialized with a zero-length PDL string.
src/protocol/pools.rs
Show inline comments
 
deleted file
src/protocol/tokenizer/mod.rs
Show inline comments
 
enum Token {
 
    
 

	
 
use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError, InputPosition2 as InputPosition, InputSpan};
 

	
 
#[derive(Clone, Copy, PartialEq, Eq)]
 
enum TokenKind {
 
    // Variable-character tokens, followed by a SpanEnd token
 
    Ident,
 
    Integer,
 
    String,
 
    Character,
 
    LineComment,
 
    BlockComment,
 
    // Punctuation
 
    Exclamation,    // !
 
    Question,       // ?
 
    Pound,          // #
 
    OpenAngle,      // <
 
    OpenCurly,      // {
 
    OpenParen,      // (
 
    OpenSquare,     // [
 
    CloseAngle,     // >
 
    CloseCurly,     // }
 
    CloseParen,     // )
 
    CloseSquare,    // ]
 
    Colon,          // :
 
    ColonColon,     // ::
 
    Comma,          // ,
 
    Dot,            // .
 
    DotDot,         // ..
 
    SemiColon,      // ;
 
    Quote,          // '
 
    DoubleQuote,    // "
 
    // Operator-like
 
    At,             // @
 
    Plus,           // +
 
    PlusPlus,       // ++
 
    PlusEquals,     // +=
 
    Minus,          // -
 
    ArrowRight,     // ->
 
    MinusMinus,     // --
 
    MinusEquals,    // -=
 
    Star,           // *
 
    StarEquals,     // *=
 
    Slash,          // /
 
    SlashEquals,    // /=
 
    Percent,        // %
 
    PercentEquals,  // %=
 
    Caret,          // ^
 
    CaretEquals,    // ^=
 
    And,            // &
 
    AndAnd,         // &&
 
    AndEquals,      // &=
 
    Or,             // |
 
    OrOr,           // ||
 
    OrEquals,       // |=
 
    Tilde,          // ~
 
    Equal,          // =
 
    EqualEqual,     // ==
 
    NotEqual,       // !=
 
    ShiftLeft,      // <<
 
    ShiftLeftEquals,// <<=
 
    ShiftRight,     // >>
 
    ShiftRightEquals, // >>=
 
    // Special marker token to indicate end of variable-character tokens
 
    SpanEnd,
 
}
 

	
 
struct Token {
 
    kind: TokenKind,
 
    pos: InputPosition, // probably need something different
 
}
 

	
 
impl Token {
 
    fn new(kind: TokenKind, pos: InputPosition) -> Self {
 
        Self{ kind, pos }
 
    }
 
}
 

	
 
enum TokenRangeType {
 
#[derive(Debug, PartialEq, Eq)]
 
enum TokenRangeKind {
 
    Module,
 
    Pragma,
 
    Import,
 
    Defintion,
 
    Definition,
 
    Code,
 
}
 

	
 
struct TokenRange {
 
    range_type: TokenRangeType,
 
    // Index of parent in `TokenBuffer.ranges`, does not have a parent if the 
 
    // range kind is Module, in that case the parent index points to itself.
 
    parent_idx: usize,
 
    range_kind: TokenRangeKind,
 
    curly_depth: u8,
 
    start: usize,
 
    end: usize,
 
    subranges: usize,
 
}
 

	
 
struct TokenBuffer {
 
@@ -19,3 +101,611 @@ struct TokenBuffer {
 
    ranges: Vec<TokenRange>,
 
}
 

	
 
struct ParseState {
 
    kind: TokenRangeKind,
 
    start: usize,
 
}
 

	
 
// Tokenizer is a reusable parser to tokenize multiple source files using the
 
// same allocated buffers. In a well-formed program, we produce a consistent
 
// tree of token ranges such that we may identify tokens that represent a 
 
// defintion or an import before producing the entire AST.
 
//
 
// If the program is not well-formed then the tree may be inconsistent, but we
 
// will detect this once we transform the tokens into the AST.
 
pub(crate) struct Tokenizer {
 
    curly_depth: u8,
 
    stack_idx: usize,
 
}
 

	
 
impl Tokenizer {
 
    pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        // Assert source and buffer are at start
 
        debug_assert_eq!(source.pos().offset, 0);
 
        debug_assert!(target.tokens.is_empty());
 
        debug_assert!(target.ranges.is_empty());
 

	
 
        // Set up for tokenization by pushing the first range onto the stack.
 
        // This range may get transformed into the appropriate range kind later,
 
        // see `push_range` and `pop_range`.
 
        self.curly_depth = 0;
 
        self.stack_idx = 1;
 
        target.ranges.push(TokenRange{
 
            parent_idx: 0,
 
            range_kind: TokenRangeKind::Module,
 
            curly_depth: 0,
 
            start: 0,
 
            end: 0,
 
            subranges: 0,
 
        });
 

	
 
        // Main processing loop
 
        while let Some(c) = source.next() {
 
            let token_index = target.tokens.len();
 

	
 
            if is_char_literal_start(c) {
 
                self.consume_char_literal(source, target)?;
 
            } else if is_string_literal_start(c) {
 
                self.consume_string_literal(source, target)?;
 
            } else if is_identifier_start(c) {
 
                let ident = self.consume_identifier(source, target)?;
 

	
 
                if demarks_definition(ident) {
 
                    self.push_range(target, TokenRangeKind::Definition, token_index);
 
                } else if demarks_import(ident) {
 
                    self.push_range(target, TokenRangeKind::Import, token_index);
 
                }
 
            } else if is_integer_literal_start(c) {
 
                self.consume_number(source, target)?;
 
            } else if is_pragma_start(c) {
 
                self.consume_pragma(c, source, target);
 
                self.push_range(target, TokenRangeKind::Pragma, token_index);
 
            } else if self.is_line_comment_start(c, source) {
 
                self.consume_line_comment(source, target)?;
 
            } else if self.is_block_comment_start(c, source) {
 
                self.consume_block_comment(source, target)?;
 
            } else if is_whitespace(c) {
 
                let contained_newline = self.consume_whitespace(source);
 
            } else {
 
                let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
 
                if let Some(token) = was_punctuation {
 
                    if token == TokenKind::OpenCurly {
 
                        self.curly_depth += 1;
 
                    } else if token == TokenKind::CloseCurly {
 
                        // Check if this marks the end of a range we're 
 
                        // currently processing
 
                        self.curly_depth -= 1;
 

	
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_depth {
 
                            self.pop_range(target, target.tokens.len());
 
                        }
 
                    } else if token == TokenKind::SemiColon {
 
                        // Check if this marks the end of an import
 
                        let range = &target.ranges[self.stack_idx];
 
                        if range.range_kind == TokenRangeKind::Import {
 
                            self.pop_range(target, target.tokens.len());
 
                        }
 
                    }
 
                } else {
 
                    return Err(ParseError::new_error(source, source.pos(), "unexpected character"));
 
                }
 
            }
 
        }
 

	
 
        // End of file, check if our state is correct
 
        Ok(())
 
    }
 

	
 
    fn is_line_comment_start(&self, first_char: u8, source: &InputSource) -> bool {
 
        return first_char == b'/' && Some(b'/') == source.lookahead(1);
 
    }
 

	
 
    fn is_block_comment_start(&self, first_char: u8, source: &InputSource) -> bool {
 
        return first_char == b'/' && Some(b'*') == source.lookahead(1);
 
    }
 

	
 
    pub(crate) fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<Option<TokenKind>, ParseError> {
 
        debug_assert!(first_char != b'#', "'#' needs special handling");
 
        debug_assert!(first_char != b'\'', "'\'' needs special handling");
 
        debug_assert!(first_char != b'"', "'\"' needs special handling");
 

	
 
        let pos = source.pos();
 
        let token_kind;
 
        if first_char == b'!' {
 
            source.consume();
 
            if Some(b'=') == source.next() {
 
                source.consume();
 
                token_kind = TokenKind::NotEqual;
 
            } else {
 
                token_kind = TokenKind::Exclamation;
 
            }
 
        } else if first_char == b'%' {
 
            source.consume();
 
            if Some(b'=') == source.next() {
 
                source.consume();
 
                token_kind = TokenKind::PercentEquals;
 
            } else {
 
                token_kind = TokenKind::Percent;
 
            }
 
        } else if first_char == b'&' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'&') == next {
 
                source.consume();
 
                token_kind = TokenKind::AndAnd;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::AndEquals;
 
            } else {
 
                token_kind = TokenKind::And;
 
            }
 
        } else if first_char == b'(' {
 
            source.consume();
 
            token_kind = TokenKind::OpenParen;
 
        } else if first_char == b')' {
 
            source.consume();
 
            token_kind = TokenKind::CloseParen;
 
        } else if first_char == b'*' {
 
            source.consume();
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::StarEquals;
 
            } else {
 
                token_kind = TokenKind::Star;
 
            }
 
        } else if first_char == b'+' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'+') == next {
 
                source.consume();
 
                token_kind = TokenKind::PlusPlus;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::PlusEquals;
 
            } else {
 
                token_kind = TokenKind::Plus;
 
            }
 
        } else if first_char == b',' {
 
            source.consume();
 
            token_kind = TokenKind::Comma;
 
        } else if first_char == b'-' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'-') == next {
 
                source.consume();
 
                token_kind = TokenKind::MinusMinus;
 
            } else if Some(b'>') == next {
 
                source.consume();
 
                token_kind = TokenKind::ArrowRight;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::MinusEquals;
 
            } else {
 
                token_kind = TokenKind::Minus;
 
            }
 
        } else if first_char == b'.' {
 
            source.consume();
 
            if let Some(b'.') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::DotDot;
 
            } else {
 
                token_kind = TokenKind::Dot
 
            }
 
        } else if first_char == b'/' {
 
            source.consume();
 
            debug_assert_ne!(Some(b'/'), source.next());
 
            debug_assert_ne!(Some(b'*'), source.next());
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::SlashEquals;
 
            } else {
 
                token_kind = TokenKind::Slash;
 
            }
 
        } else if first_char == b':' {
 
            source.consume();
 
            if let Some(b':') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::ColonColon;
 
            } else {
 
                token_kind = TokenKind::Colon;
 
            }
 
        } else if first_char == b';' {
 
            source.consume();
 
            token_kind = TokenKind::SemiColon;
 
        } else if first_char == b'<' {
 
            source.consume();
 
            if let Some(b'<') = source.next() {
 
                source.consume();
 
                if let Some(b'=') = source.next() {
 
                    source.consume();
 
                    token_kind = TokenKind::ShiftLeftEquals;
 
                } else {
 
                    token_kind = TokenKind::ShiftLeft;
 
                }
 
            } else {
 
                token_kind = TokenKind::OpenAngle;
 
            }
 
        } else if first_char == b'=' {
 
            source.consume();
 
            if let Some(b'=') = source.next() {
 
                source.consume();
 
                token_kind = TokenKind::EqualEqual;
 
            } else {
 
                token_kind = TokenKind::Equal;
 
            }
 
        } else if first_char == b'>' {
 
            source.consume();
 
            if let Some(b'>') = source.next() {
 
                source.consume();
 
                if let Some(b'=') = source.next() {
 
                    source.consume();
 
                    token_kind = TokenKind::ShiftRightEquals;
 
                } else {
 
                    token_kind = TokenKind::ShiftRight;
 
                }
 
            } else {
 
                token_kind = TokenKind::CloseAngle;
 
            }
 
        } else if first_char == b'?' {
 
            source.consume();
 
            token_kind = TokenKind::Question;
 
        } else if first_char == b'@' {
 
            source.consume();
 
            token_kind = TokenKind::At;
 
        } else if first_char == b'[' {
 
            source.consume();
 
            token_kind = TokenKind::OpenSquare;
 
        } else if first_char == b']' {
 
            source.consume();
 
            token_kind = TokenKind::CloseSquare;
 
        } else if first_char == b'^' {
 
            source.consume();
 
            if let Some(b'=') = source.next() { 
 
                source.consume();
 
                token_kind = TokenKind::CaretEquals;
 
            } else {
 
                token_kind = TokenKind::Caret;
 
            }
 
        } else if first_char == b'{' {
 
            source.consume();
 
            token_kind = TokenKind::OpenCurly;
 
        } else if first_char == b'|' {
 
            source.consume();
 
            let next = source.next();
 
            if Some(b'|') == next {
 
                source.consume();
 
                token_kind = TokenKind::OrOr;
 
            } else if Some(b'=') == next {
 
                source.consume();
 
                token_kind = TokenKind::OrEquals;
 
            } else {
 
                token_kind = TokenKind::Or;
 
            }
 
        } else if first_char == b'}' {
 
            source.consume();
 
            token_kind = TokenKind::CloseCurly
 
        } else {
 
            self.check_ascii(source)?;
 
            return Ok(None);
 
        }
 

	
 
        target.tokens.push(Token::new(token_kind, pos));
 
        Ok(Some(token_kind))
 
    }
 

	
 
    pub(crate) fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 

	
 
        // Consume the leading quote
 
        debug_assert!(source.next().unwrap() == b'\'');
 
        source.consume();
 

	
 
        let mut prev_char = b'\'';
 
        while let Some(c) = source.next() {
 
            source.consume();
 
            if c == b'\'' && prev_char != b'\\' {
 
                prev_char = c;
 
                break;
 
            }
 

	
 
            prev_char = c;
 
        }
 

	
 
        if prev_char != b'\'' {
 
            // Unterminated character literal
 
            return Err(ParseError::new_error(source, begin_pos, "encountered unterminated character literal"));
 
        }
 

	
 
        let end_pos = source.pos();
 

	
 
        target.tokens.push(Token::new(TokenKind::Character, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    pub(crate) fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 

	
 
        // Consume the leading double quotes
 
        debug_assert!(source.next().unwrap() == b'"');
 
        source.consume();
 

	
 
        let mut prev_char = b'"';
 
        while let Some(c) = source.next() {
 
            source.consume();
 
            if c == b'"' && prev_char != b'\\' {
 
                prev_char = c;
 
                break;
 
            }
 

	
 
            prev_char = c;
 
        }
 

	
 
        if prev_char != b'"' {
 
            // Unterminated string literal
 
            return Err(ParseError::new_error(source, begin_pos, "encountered unterminated string literal"));
 
        }
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::String, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    fn consume_pragma(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) {
 
        let pos = source.pos();
 
        debug_assert_eq!(first_char, b'#');
 
        source.consume();
 

	
 
        target.tokens.push(Token::new(TokenKind::Pound, pos));
 
    }
 

	
 
    pub(crate) fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 

	
 
        // Consume the leading "//"
 
        debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'/');
 
        source.consume();
 
        source.consume();
 

	
 
        let mut prev_char = b'/';
 
        let mut cur_char = b'/';
 
        while let Some(c) = source.next() {
 
            source.consume();
 
            cur_char = c;
 
            if c == b'\n' {
 
                // End of line
 
                break;
 
            }
 
            prev_char = c;
 
        }
 

	
 
        let mut end_pos = source.pos();
 
        debug_assert_eq!(begin_pos.line, end_pos.line);
 

	
 
        // Modify offset to not include the newline characters
 
        if cur_char == b'\n' {
 
            if prev_char == b'\r' {
 
                end_pos.offset -= 2;
 
            } else {
 
                end_pos.offset -= 1;
 
            }
 
        } else {
 
            debug_assert!(source.next().is_none())
 
        }
 

	
 
        target.tokens.push(Token::new(TokenKind::LineComment, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    pub(crate) fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 

	
 
        // Consume the leading "/*"
 
        debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'*');
 
        source.consume();
 
        source.consume();
 

	
 
        // Explicitly do not put prev_char at "*", because then "/*/" would
 
        // represent a valid and closed block comment
 
        let mut prev_char = b' ';
 
        let mut is_closed = false;
 
        while let Some(c) = source.next() {
 
            source.consume();
 
            if prev_char == b'*' && c == b'/' {
 
                // End of block comment
 
                is_closed = true;
 
                break;
 
            }
 
            prev_char = c;
 
        }
 

	
 
        if !is_closed {
 
            return Err(ParseError::new_error(source, source.pos(), "encountered unterminated block comment"));
 
        }
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::BlockComment, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    pub(crate) fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
 
        let begin_pos = source.pos();
 
        debug_assert!(is_identifier_start(source.next().unwrap()));
 
        source.consume();
 

	
 
        // Keep reading until no more identifier
 
        while let Some(c) = source.next() {
 
            if !is_identifier_remaining(c) {
 
                break;
 
            }
 

	
 
            source.consume();
 
        }
 
        self.check_ascii(source)?;
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::Ident, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 
        Ok(source.section(begin_pos.offset, end_pos.offset))
 
    }
 

	
 
    pub(crate) fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
 
        let begin_pos = source.pos();
 
        debug_assert!(is_integer_literal_start(source.next().unwrap()));
 
        source.consume();
 

	
 
        // Keep reading until it doesn't look like a number anymore
 
        while let Some(c) = source.next() {
 
            if !maybe_number_remaining(c) {
 
                break;
 
            }
 

	
 
            source.consume();
 
        }
 
        self.check_ascii(source)?;
 

	
 
        let end_pos = source.pos();
 
        target.tokens.push(Token::new(TokenKind::Integer, begin_pos));
 
        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 

	
 
        Ok(())
 
    }
 

	
 
    // Consumes whitespace and returns whether or not the whitespace contained
 
    // a newline.
 
    fn consume_whitespace(&self, source: &mut InputSource) -> bool {
 
        debug_assert!(is_whitespace(source.next().unwrap()));
 

	
 
        let mut has_newline = false;
 
        while let Some(c) = source.next() {
 
            if !is_whitespace(c) {
 
                break;
 
            }
 

	
 
            if c == b'\n' {
 
                has_newline = true;
 
            }
 
        }
 

	
 
        has_newline
 
    }
 

	
 
    /// Pushes a new token range onto the stack in the buffers.
 
    fn push_range(&self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
 
        let cur_range = &target.ranges[self.stack_idx];
 
        let parent_idx = cur_range.parent_idx;
 
        let parent_range = &target.ranges[parent_idx];
 
        if parent_range.end != first_token {
 
            // Insert intermediate range
 
            target.ranges.push(TokenRange{
 
                parent_idx,
 
                range_kind: TokenRangeKind::Code,
 
                curly_depth: cur_range.curly_depth,
 
                start: parent_range.end,
 
                end: first_token,
 
                subranges: 0,
 
            });
 
        }
 

	
 
        // Insert a new range
 
        let parent_idx = self.stack_idx;
 
        self.stack_idx = target.ranges.len();
 
        target.ranges.push(TokenRange{
 
            parent_idx,
 
            range_kind,
 
            curly_depth: self.curly_depth,
 
            start: first_token,
 
            end: first_token,
 
            subranges: 0,
 
        });
 
    }
 

	
 
    fn pop_range(&self, target: &mut TokenBuffer, end_index: usize) {
 
        // Pop all the dummy ranges that are left on the range stack
 
        let last = &mut target.ranges[self.stack_idx];
 
        debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range");
 

	
 
        last.end = end_index;
 
        self.stack_idx = last.parent_idx as usize;
 
        let parent = &mut target.ranges[self.stack_idx];
 
        parent.end = end_index;
 
        parent.subranges += 1;
 
    }
 

	
 

	
 
    fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> {
 
        match source.next() {
 
            Some(c) if !c.is_ascii() => {
 
                Err(ParseError::new_error(source, source.pos(), "encountered a non-ASCII character"))
 
            },
 
            _else => {
 
                Ok(())
 
            },
 
        }
 
    }
 
}
 

	
 
// Helpers for characters
 
fn demarks_definition(ident: &[u8]) -> bool {
 
    return
 
        ident == b"struct" ||
 
        ident == b"enum" ||
 
        ident == b"union" ||
 
        ident == b"func" ||
 
        ident == b"primitive" ||
 
        ident == b"composite"
 
}
 

	
 
fn demarks_import(ident: &[u8]) -> bool {
 
    return ident == b"import";
 
}
 

	
 
fn is_whitespace(c: u8) -> bool {
 
    c.is_ascii_whitespace()
 
}
 

	
 
fn is_char_literal_start(c: u8) -> bool {
 
    return c == b'\'';
 
}
 

	
 
fn is_string_literal_start(c: u8) -> bool {
 
    return c == b'"';
 
}
 

	
 
fn is_pragma_start(c: u8) -> bool {
 
    return c == b'#';
 
}
 

	
 
fn is_identifier_start(c: u8) -> bool {
 
    return 
 
        (c >= b'a' && c <= b'z') ||
 
        (c >= b'A' && c <= b'Z') ||
 
        c == b'_' 
 
}
 

	
 
fn is_identifier_remaining(c: u8) -> bool {
 
    return 
 
        (c >= b'0' && c <= b'9') ||
 
        (c >= b'a' && c <= b'z') ||
 
        (c >= b'A' && c <= b'Z') ||
 
        c == b'_' 
 
}
 

	
 
fn is_integer_literal_start(c: u8) -> bool {
 
    return c >= b'0' && c <= b'9';
 
}
 

	
 
fn maybe_number_remaining(c: u8) -> bool {
 
    return 
 
        (c == b'b' || c == b'B' || c == b'o' || c == b'O' || c == b'x' || c == b'X') ||
 
        (c >= b'0' && c <= b'9') ||
 
        c == b'_';
 
}
 
\ No newline at end of file
0 comments (0 inline, 0 general)