From 13bac4f3561979d2ab317502cbec886c9fb3a1f3 2021-04-10 01:23:23 From: MH Date: 2021-04-10 01:23:23 Subject: [PATCH] WIP on tokenization --- diff --git a/src/collections/mod.rs b/src/collections/mod.rs new file mode 100644 index 0000000000000000000000000000000000000000..b066ddec6c67ef24a2dbdd310c354577cadbf4e4 --- /dev/null +++ b/src/collections/mod.rs @@ -0,0 +1,3 @@ +mod string_pool; + +pub(crate) use string_pool::{StringPool, StringRef}; \ No newline at end of file diff --git a/src/collections/string_pool.rs b/src/collections/string_pool.rs new file mode 100644 index 0000000000000000000000000000000000000000..5cd82b91d55b8ebff11b98bd504745ceb5d1616d --- /dev/null +++ b/src/collections/string_pool.rs @@ -0,0 +1,154 @@ +use std::ptr::null_mut; + +const SLAB_SIZE: usize = u16::max_value() as usize; + +pub struct StringRef { + data: *const u8, + length: usize, +} + +impl StringRef { + pub fn as_str<'a>(&'a self) -> &'a str { + unsafe { + let slice = std::slice::from_raw_parts::<'a, u8>(self.data, self.length); + std::str::from_utf8_unchecked(slice) + } + } +} + +struct StringPoolSlab { + prev: *mut StringPoolSlab, + data: Vec, + remaining: usize, +} + +impl StringPoolSlab { + fn new(prev: *mut StringPoolSlab) -> Self { + Self{ prev, data: Vec::with_capacity(SLAB_SIZE), remaining: SLAB_SIZE } + } +} + +/// StringPool is a ever-growing pool of strings. Strings have a maximum size +/// equal to the slab size. The slabs are essentially a linked list to maintain +/// pointer-stability of the strings themselves. +/// All `StringRef` instances are invalidated when the string pool is dropped +pub(crate) struct StringPool { + last: *mut StringPoolSlab, +} + +impl StringPool { + pub(crate) fn new() -> Self { + // To have some stability we just turn a box into a raw ptr. 
+ let initial_slab = Box::new(StringPoolSlab::new(null_mut())); + let initial_slab = Box::into_raw(initial_slab); + StringPool{ + last: initial_slab, + } + } + + // Interns a string to the StringPool, returning a reference to it + pub(crate) fn intern(&mut self, data: &[u8]) -> StringRef { + // TODO: Large string allocations, if ever needed. + let data_len = data.len(); + assert!(data_len <= SLAB_SIZE, "string is too large for slab"); + debug_assert!(std::str::from_utf8(data).is_ok(), "string to intern is not valid UTF-8 encoded"); + + let mut last = unsafe{&mut *self.last}; + if data.len() > last.remaining { + // Doesn't fit: allocate new slab + self.alloc_new_slab(); + last = unsafe{&mut *self.last}; + } + + // Must fit now + debug_assert!(data_len <= last.remaining); + let range_start = last.data.len(); + last.data.extend_from_slice(data); + last.remaining -= data_len; + debug_assert_eq!(range_start + data_len, last.data.len()); + + unsafe { + let start = last.data.as_ptr().offset(range_start as isize); + StringRef{ data: start, length: data_len } + } + } + + fn alloc_new_slab(&mut self) { + let new_slab = Box::new(StringPoolSlab::new(self.last)); + let new_slab = Box::into_raw(new_slab); + self.last = new_slab; + } +} + +impl Drop for StringPool { + fn drop(&mut self) { + let mut new_slab = self.last; + while !new_slab.is_null() { + let cur_slab = new_slab; + unsafe { + new_slab = (*cur_slab).prev; + Box::from_raw(cur_slab); // consume and deallocate + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_string_just_fits() { + let large = "0".repeat(SLAB_SIZE); + let mut pool = StringPool::new(); + let interned = pool.intern(large.as_bytes()); + assert_eq!(interned.as_str(), large); + } + + #[test] + #[should_panic] + fn test_string_too_large() { + let large = "0".repeat(SLAB_SIZE + 1); + let mut pool = StringPool::new(); + let _interned = pool.intern(large.as_bytes()); + } + + #[test] + fn test_lots_of_small_allocations() { + const 
NUM_PER_SLAB: usize = 32; + const NUM_SLABS: usize = 4; + + let to_intern = "0".repeat(SLAB_SIZE / NUM_PER_SLAB); + let mut pool = StringPool::new(); + + let mut last_slab = pool.last; + let mut all_refs = Vec::new(); + + // Fill up first slab + for _alloc_idx in 0..NUM_PER_SLAB { + let interned = pool.intern(to_intern.as_bytes()); + all_refs.push(interned); + assert!(std::ptr::eq(last_slab, pool.last)); + } + + for _slab_idx in 0..NUM_SLABS-1 { + for alloc_idx in 0..NUM_PER_SLAB { + let interned = pool.intern(to_intern.as_bytes()); + all_refs.push(interned); + + if alloc_idx == 0 { + // First allocation produces a new slab + assert!(!std::ptr::eq(last_slab, pool.last)); + last_slab = pool.last; + } else { + assert!(std::ptr::eq(last_slab, pool.last)); + } + } + } + + // All strings are still correct + for string_ref in all_refs { + assert_eq!(string_ref.as_str(), to_intern); + } + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 8674f5eedb3dcf874836957d45bde0a956e0503a..becc35e33199c98bd851df96cde81cbbbf21b786 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ mod macros; mod common; mod protocol; mod runtime; +mod collections; pub use common::{ConnectorId, EndpointPolarity, Payload, Polarity, PortId}; pub use protocol::{ProtocolDescription, TRIVIAL_PD}; diff --git a/src/protocol/input_source2.rs b/src/protocol/input_source2.rs new file mode 100644 index 0000000000000000000000000000000000000000..025b86ddbfb0a0f77f8fd9f3dffe5a5cfad59218 --- /dev/null +++ b/src/protocol/input_source2.rs @@ -0,0 +1,287 @@ +use std::fmt; +use std::cell::{Ref, RefCell}; + +#[derive(Debug, Clone, Copy)] +pub struct InputPosition2 { + pub line: u32, + pub offset: u32, +} + +pub struct InputSpan { + pub begin: InputPosition2, + pub end: InputPosition2, +} + +impl InputSpan { + #[inline] + fn from_positions(begin: InputPosition2, end: InputPosition2) -> Self { + Self { begin, end } + } +} + +/// Wrapper around source file with optional filename. 
Ensures that the file is +/// only scanned once. +pub struct InputSource2 { + pub(crate) filename: String, + pub(crate) input: Vec, + // Iteration + line: u32, + offset: usize, + // State tracking + had_error: Option, + // The offset_lookup is built on-demand upon attempting to report an error. + // As the compiler is currently not multithreaded, we simply put it in a + // RefCell to allow interior mutability. + offset_lookup: RefCell>, +} + +impl InputSource2 { + pub fn new(filename: String, input: Vec) -> Self { + Self{ + filename, + input, + line: 1, + offset: 0, + had_error: None, + offset_lookup: RefCell::new(Vec::new()), + } + } + + #[inline] + pub fn pos(&self) -> InputPosition2 { + InputPosition2{ line: self.line, offset: self.offset as u32 } + } + + pub fn next(&self) -> Option { + if self.offset < self.input.len() { + Some(self.input[self.offset]) + } else { + None + } + } + + pub fn lookahead(&self, offset: usize) -> Option { + let offset_pos = self.offset + offset; + if offset_pos < self.input.len() { + Some(self.input[offset_pos]) + } else { + None + } + } + + pub fn section(&self, start: u32, end: u32) -> &[u8] { + &self.input[start as usize..end as usize] + } + + // Consumes the next character. Will check well-formedness of newlines: \r + // must be followed by a \n, because this is used for error reporting. Will + // not check for ascii-ness of the file, better left to a tokenizer. + pub fn consume(&mut self) { + match self.next() { + Some(b'\r') => { + if Some(b'\n') == self.lookahead(1) { + // Well formed file + self.offset += 1; + } else { + // Not a well-formed file, pretend like we can continue + self.offset += 1; + self.set_error("Encountered carriage-feed without a following newline"); + } + }, + Some(b'\n') => { + self.line += 1; + self.offset += 1; + }, + Some(_) => { + self.offset += 1; + } + None => {} + } + + // Maybe we actually want to check this in release mode. Then again: + // a 4 gigabyte source file... Really? 
+ debug_assert!(self.offset < u32::max_value() as usize); + } + + fn set_error(&mut self, msg: &str) { + if self.had_error.is_none() { + self.had_error = Some(ParseError::new_error(self, self.pos(), msg)); + } + } + + fn get_lookup(&self) -> Ref> { + // Once constructed the lookup always contains one element. We use this + // to see if it is constructed already. + let lookup = self.offset_lookup.borrow(); + if !lookup.is_empty() { + return lookup; + } + + // Build the line number (!) to offset lookup, so offset by 1. We + // assume the entire source file is scanned (most common case) for + // preallocation. + let lookup = self.offset_lookup.borrow_mut(); + lookup.reserve(self.line as usize + 2); + lookup.push(0); // line 0: never used + lookup.push(0); // first line: first character + + for char_idx in 0..self.input.len() { + if self.input[char_idx] == b'\n' { + lookup.push(char_idx as u32 + 1); + } + } + + lookup.push(self.input.len() as u32); // for lookup_line_end + debug_assert_eq!(self.line as usize + 2, lookup.len(), "remove me: i am a testing assert and sometimes invalid"); + + // Return created lookup + let lookup = self.offset_lookup.borrow(); + return lookup; + } + + fn lookup_line_start_offset(&self, line_number: u32) -> u32 { + let lookup = self.get_lookup(); + lookup[line_number as usize] + } + + fn lookup_line_end_offset(&self, line_number: u32) -> u32 { + let lookup = self.get_lookup(); + let offset = lookup[(line_number + 1) as usize] - 1; + let offset_usize = offset as usize; + + // Compensate for newlines and a potential carriage feed + if self.input[offset_usize] == b'\n' { + if offset_usize > 0 && self.input[offset_usize] == b'\r' { + offset - 2 + } else { + offset - 1 + } + } else { + offset + } + } +} + +#[derive(Debug)] +pub enum ParseErrorType { + Info, + Error +} + +#[derive(Debug)] +pub struct ParseErrorStatement { + pub(crate) error_type: ParseErrorType, + pub(crate) line: u32, + pub(crate) column: u32, + pub(crate) offset: u32, + 
pub(crate) filename: String, + pub(crate) context: String, + pub(crate) message: String, +} + +impl ParseErrorStatement { + fn from_source(error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + // Seek line start and end + let line_start = source.lookup_line_start_offset(position.line); + let line_end = source.lookup_line_end_offset(position.line); + debug_assert!(position.offset >= line_start); + let column = position.offset - line_start + 1; + + Self{ + error_type, + line: position.line, + column, + offset: position.offset, + filename: source.filename.clone(), + context: String::from_utf8_lossy(&source.input[line_start as usize..line_end as usize]).to_string(), + message: msg.to_string() + } + } +} + +impl fmt::Display for ParseErrorStatement { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Write message + match self.error_type { + ParseErrorType::Info => write!(f, " INFO: ")?, + ParseErrorType::Error => write!(f, "ERROR: ")?, + } + writeln!(f, "{}", &self.message)?; + + // Write originating file/line/column + if self.filename.is_empty() { + writeln!(f, " +- at {}:{}", self.line, self.column)?; + } else { + writeln!(f, " +- at {}:{}:{}", self.filename, self.line, self.column)?; + } + + // Write source context + writeln!(f, " | ")?; + writeln!(f, " | {}", self.context)?; + + // Write underline indicating where the error ocurred + debug_assert!(self.column as usize <= self.context.chars().count()); + let mut arrow = String::with_capacity(self.context.len() + 3); + arrow.push_str(" | "); + let mut char_col = 1; + for char in self.context.chars() { + if char_col == self.column { break; } + if char == '\t' { + arrow.push('\t'); + } else { + arrow.push(' '); + } + + char_col += 1; + } + arrow.push('^'); + writeln!(f, "{}", arrow)?; + + Ok(()) + } +} + +#[derive(Debug)] +pub struct ParseError { + pub(crate) statements: Vec +} + +impl fmt::Display for ParseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) 
-> fmt::Result { + if self.statements.is_empty() { + return Ok(()) + } + + self.statements[0].fmt(f)?; + for statement in self.statements.iter().skip(1) { + writeln!(f)?; + statement.fmt(f)?; + } + + Ok(()) + } +} + +impl ParseError { + pub fn empty() -> Self { + Self{ statements: Vec::new() } + } + + pub fn new_error(source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + Self{ statements: vec!(ParseErrorStatement::from_source(ParseErrorType::Error, source, position, msg))} + } + + pub fn with_prefixed(mut self, error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + self.statements.insert(0, ParseErrorStatement::from_source(error_type, source, position, msg)); + self + } + + pub fn with_postfixed(mut self, error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + self.statements.push(ParseErrorStatement::from_source(error_type, source, position, msg)); + self + } + + pub fn with_postfixed_info(self, source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + self.with_postfixed(ParseErrorType::Info, source, position, msg) + } +} diff --git a/src/protocol/inputsource.rs b/src/protocol/inputsource.rs index 106b6d76c9ac813bdc99bff0158ca7be8a3c7eec..756f51cc32c3153a8d629f85f7c3ef02942d0521 100644 --- a/src/protocol/inputsource.rs +++ b/src/protocol/inputsource.rs @@ -164,11 +164,6 @@ impl InputPosition { } &source.input[start..end] } - fn eval_error(&self, message: S) -> EvalError { - EvalError { position: *self, message: message.to_string(), backtrace: Backtrace::new() } - } - - pub(crate) fn col(&self) -> usize { self.column } } impl Default for InputPosition { @@ -185,9 +180,6 @@ impl fmt::Display for InputPosition { pub trait SyntaxElement { fn position(&self) -> InputPosition; - fn error(&self, message: S) -> EvalError { - self.position().eval_error(message) - } } #[derive(Debug)] @@ -315,172 +307,3 @@ impl ParseError { 
self.with_postfixed(ParseErrorType::Info, source, position, msg) } } - -#[derive(Debug, Clone)] -pub struct EvalError { - position: InputPosition, - message: String, - backtrace: Backtrace, -} - -impl EvalError { - pub fn new(position: InputPosition, message: S) -> EvalError { - EvalError { position, message: message.to_string(), backtrace: Backtrace::new() } - } - // Diagnostic methods - pub fn write(&self, source: &InputSource, writer: &mut A) -> io::Result<()> { - if !source.filename.is_empty() { - writeln!( - writer, - "Evaluation error at {}:{}: {}", - source.filename, self.position, self.message - )?; - } else { - writeln!(writer, "Evaluation error at {}: {}", self.position, self.message)?; - } - let line = self.position.context(source); - writeln!(writer, "{}", String::from_utf8_lossy(line))?; - let mut arrow: Vec = Vec::new(); - for pos in 1..self.position.column { - let c = line[pos - 1]; - if c == b'\t' { - arrow.push(b'\t') - } else { - arrow.push(b' ') - } - } - arrow.push(b'^'); - writeln!(writer, "{}", String::from_utf8_lossy(&arrow)) - } - pub fn print(&self, source: &InputSource) { - self.write(source, &mut std::io::stdout()).unwrap() - } - pub fn display<'a>(&'a self, source: &'a InputSource) -> DisplayEvalError<'a> { - DisplayEvalError::new(self, source) - } -} - -impl From for io::Error { - fn from(_: EvalError) -> io::Error { - io::Error::new(io::ErrorKind::InvalidInput, "eval error") - } -} - -#[derive(Clone, Copy)] -pub struct DisplayEvalError<'a> { - error: &'a EvalError, - source: &'a InputSource, -} - -impl DisplayEvalError<'_> { - fn new<'a>(error: &'a EvalError, source: &'a InputSource) -> DisplayEvalError<'a> { - DisplayEvalError { error, source } - } -} - -impl fmt::Display for DisplayEvalError<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut vec: Vec = Vec::new(); - match self.error.write(self.source, &mut vec) { - Err(_) => { - return fmt::Result::Err(fmt::Error); - } - Ok(_) => {} - } - write!(f, "{}", 
String::from_utf8_lossy(&vec)) - } -} - -// #[cfg(test)] -// mod tests { -// use super::*; - -// #[test] -// fn test_from_string() { -// let mut is = InputSource::from_string("#version 100\n").unwrap(); -// assert!(is.input.len() == 13); -// assert!(is.line == 1); -// assert!(is.column == 1); -// assert!(is.offset == 0); -// let ps = is.pos(); -// assert!(ps.line == 1); -// assert!(ps.column == 1); -// assert!(ps.offset == 0); -// assert!(is.next() == Some(b'#')); -// is.consume(); -// assert!(is.next() == Some(b'v')); -// assert!(is.lookahead(1) == Some(b'e')); -// is.consume(); -// assert!(is.next() == Some(b'e')); -// is.consume(); -// assert!(is.next() == Some(b'r')); -// is.consume(); -// assert!(is.next() == Some(b's')); -// is.consume(); -// assert!(is.next() == Some(b'i')); -// is.consume(); -// { -// let ps = is.pos(); -// assert_eq!(b"#version 100", ps.context(&is)); -// let er = is.error("hello world!"); -// let mut vec: Vec = Vec::new(); -// er.write(&is, &mut vec).unwrap(); -// assert_eq!( -// "Parse error at 1:7: hello world!\n#version 100\n ^\n", -// String::from_utf8_lossy(&vec) -// ); -// } -// assert!(is.next() == Some(b'o')); -// is.consume(); -// assert!(is.next() == Some(b'n')); -// is.consume(); -// assert!(is.input.len() == 13); -// assert!(is.line == 1); -// assert!(is.column == 9); -// assert!(is.offset == 8); -// assert!(is.next() == Some(b' ')); -// is.consume(); -// assert!(is.next() == Some(b'1')); -// is.consume(); -// assert!(is.next() == Some(b'0')); -// is.consume(); -// assert!(is.next() == Some(b'0')); -// is.consume(); -// assert!(is.input.len() == 13); -// assert!(is.line == 1); -// assert!(is.column == 13); -// assert!(is.offset == 12); -// assert!(is.next() == Some(b'\n')); -// is.consume(); -// assert!(is.input.len() == 13); -// assert!(is.line == 2); -// assert!(is.column == 1); -// assert!(is.offset == 13); -// { -// let ps = is.pos(); -// assert_eq!(b"", ps.context(&is)); -// } -// assert!(is.next() == None); -// 
is.consume(); -// assert!(is.next() == None); -// } - -// #[test] -// fn test_split() { -// let mut is = InputSource::from_string("#version 100\n").unwrap(); -// let backup = is.clone(); -// assert!(is.next() == Some(b'#')); -// is.consume(); -// assert!(is.next() == Some(b'v')); -// is.consume(); -// assert!(is.next() == Some(b'e')); -// is.consume(); -// is = backup; -// assert!(is.next() == Some(b'#')); -// is.consume(); -// assert!(is.next() == Some(b'v')); -// is.consume(); -// assert!(is.next() == Some(b'e')); -// is.consume(); -// } -// } diff --git a/src/protocol/lexer2.rs b/src/protocol/lexer2.rs new file mode 100644 index 0000000000000000000000000000000000000000..68d39e006e7b820c0aa6e376dff1b21a7d4f3974 --- /dev/null +++ b/src/protocol/lexer2.rs @@ -0,0 +1,4 @@ + +pub struct Lexer<'a> { + source: &'a mut InputSource, +} \ No newline at end of file diff --git a/src/protocol/mod.rs b/src/protocol/mod.rs index 876376e589b9d46bbf539143a5e296ea51307cea..62e11f9b99130635e5d871827f4e855f9d6f7a83 100644 --- a/src/protocol/mod.rs +++ b/src/protocol/mod.rs @@ -2,16 +2,17 @@ mod arena; // mod ast; mod eval; pub(crate) mod inputsource; +pub(crate) mod input_source2; // mod lexer; mod tokenizer; mod parser; -mod pools; #[cfg(test)] mod tests; // TODO: Remove when not benchmarking pub(crate) mod ast; pub(crate) mod ast_printer; pub(crate) mod lexer; +pub(crate) mod lexer2; lazy_static::lazy_static! { /// Conveniently-provided protocol description initialized with a zero-length PDL string. 
diff --git a/src/protocol/pools.rs b/src/protocol/pools.rs deleted file mode 100644 index 5850d9c547e8361df2b42c2d09bd2eff435c91cf..0000000000000000000000000000000000000000 --- a/src/protocol/pools.rs +++ /dev/null @@ -1,5 +0,0 @@ - - -struct StringPool { - -} \ No newline at end of file diff --git a/src/protocol/tokenizer/mod.rs b/src/protocol/tokenizer/mod.rs index 685cac45f4a39d478de1a9d19352935b10ba16ac..9f5cf247e7558d0a1025d48109be77c9fdd10546 100644 --- a/src/protocol/tokenizer/mod.rs +++ b/src/protocol/tokenizer/mod.rs @@ -1,17 +1,99 @@ -enum Token { - + +use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError, InputPosition2 as InputPosition, InputSpan}; + +#[derive(Clone, Copy, PartialEq, Eq)] +enum TokenKind { + // Variable-character tokens, followed by a SpanEnd token + Ident, + Integer, + String, + Character, + LineComment, + BlockComment, + // Punctuation + Exclamation, // ! + Question, // ? + Pound, // # + OpenAngle, // < + OpenCurly, // { + OpenParen, // ( + OpenSquare, // [ + CloseAngle, // > + CloseCurly, // } + CloseParen, // ) + CloseSquare, // ] + Colon, // : + ColonColon, // :: + Comma, // , + Dot, // . + DotDot, // .. 
+ SemiColon, // ; + Quote, // ' + DoubleQuote, // " + // Operator-like + At, // @ + Plus, // + + PlusPlus, // ++ + PlusEquals, // += + Minus, // - + ArrowRight, // -> + MinusMinus, // -- + MinusEquals, // -= + Star, // * + StarEquals, // *= + Slash, // / + SlashEquals, // /= + Percent, // % + PercentEquals, // %= + Caret, // ^ + CaretEquals, // ^= + And, // & + AndAnd, // && + AndEquals, // &= + Or, // | + OrOr, // || + OrEquals, // |= + Tilde, // ~ + Equal, // = + EqualEqual, // == + NotEqual, // != + ShiftLeft, // << + ShiftLeftEquals,// <<= + ShiftRight, // >> + ShiftRightEquals, // >>= + // Special marker token to indicate end of variable-character tokens + SpanEnd, +} + +struct Token { + kind: TokenKind, + pos: InputPosition, // probably need something different +} + +impl Token { + fn new(kind: TokenKind, pos: InputPosition) -> Self { + Self{ kind, pos } + } } -enum TokenRangeType { +#[derive(Debug, PartialEq, Eq)] +enum TokenRangeKind { + Module, Pragma, Import, - Defintion, + Definition, + Code, } struct TokenRange { - range_type: TokenRangeType, + // Index of parent in `TokenBuffer.ranges`, does not have a parent if the + // range kind is Module, in that case the parent index points to itself. + parent_idx: usize, + range_kind: TokenRangeKind, + curly_depth: u8, start: usize, end: usize, + subranges: usize, } struct TokenBuffer { @@ -19,3 +101,611 @@ struct TokenBuffer { ranges: Vec, } +struct ParseState { + kind: TokenRangeKind, + start: usize, +} + +// Tokenizer is a reusable parser to tokenize multiple source files using the +// same allocated buffers. In a well-formed program, we produce a consistent +// tree of token ranges such that we may identify tokens that represent a +// defintion or an import before producing the entire AST. +// +// If the program is not well-formed then the tree may be inconsistent, but we +// will detect this once we transform the tokens into the AST. 
+pub(crate) struct Tokenizer { + curly_depth: u8, + stack_idx: usize, +} + +impl Tokenizer { + pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { + // Assert source and buffer are at start + debug_assert_eq!(source.pos().offset, 0); + debug_assert!(target.tokens.is_empty()); + debug_assert!(target.ranges.is_empty()); + + // Set up for tokenization by pushing the first range onto the stack. + // This range may get transformed into the appropriate range kind later, + // see `push_range` and `pop_range`. + self.curly_depth = 0; + self.stack_idx = 1; + target.ranges.push(TokenRange{ + parent_idx: 0, + range_kind: TokenRangeKind::Module, + curly_depth: 0, + start: 0, + end: 0, + subranges: 0, + }); + + // Main processing loop + while let Some(c) = source.next() { + let token_index = target.tokens.len(); + + if is_char_literal_start(c) { + self.consume_char_literal(source, target)?; + } else if is_string_literal_start(c) { + self.consume_string_literal(source, target)?; + } else if is_identifier_start(c) { + let ident = self.consume_identifier(source, target)?; + + if demarks_definition(ident) { + self.push_range(target, TokenRangeKind::Definition, token_index); + } else if demarks_import(ident) { + self.push_range(target, TokenRangeKind::Import, token_index); + } + } else if is_integer_literal_start(c) { + self.consume_number(source, target)?; + } else if is_pragma_start(c) { + self.consume_pragma(c, source, target); + self.push_range(target, TokenRangeKind::Pragma, token_index); + } else if self.is_line_comment_start(c, source) { + self.consume_line_comment(source, target)?; + } else if self.is_block_comment_start(c, source) { + self.consume_block_comment(source, target)?; + } else if is_whitespace(c) { + let contained_newline = self.consume_whitespace(source); + } else { + let was_punctuation = self.maybe_parse_punctuation(c, source, target)?; + if let Some(token) = was_punctuation { + if token == 
TokenKind::OpenCurly { + self.curly_depth += 1; + } else if token == TokenKind::CloseCurly { + // Check if this marks the end of a range we're + // currently processing + self.curly_depth -= 1; + + let range = &target.ranges[self.stack_idx]; + if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_depth { + self.pop_range(target, target.tokens.len()); + } + } else if token == TokenKind::SemiColon { + // Check if this marks the end of an import + let range = &target.ranges[self.stack_idx]; + if range.range_kind == TokenRangeKind::Import { + self.pop_range(target, target.tokens.len()); + } + } + } else { + return Err(ParseError::new_error(source, source.pos(), "unexpected character")); + } + } + } + + // End of file, check if our state is correct + Ok(()) + } + + fn is_line_comment_start(&self, first_char: u8, source: &InputSource) -> bool { + return first_char == b'/' && Some(b'/') == source.lookahead(1); + } + + fn is_block_comment_start(&self, first_char: u8, source: &InputSource) -> bool { + return first_char == b'/' && Some(b'*') == source.lookahead(1); + } + + pub(crate) fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result, ParseError> { + debug_assert!(first_char != b'#', "'#' needs special handling"); + debug_assert!(first_char != b'\'', "'\'' needs special handling"); + debug_assert!(first_char != b'"', "'\"' needs special handling"); + + let pos = source.pos(); + let token_kind; + if first_char == b'!' 
{ + source.consume(); + if Some(b'=') == source.next() { + source.consume(); + token_kind = TokenKind::NotEqual; + } else { + token_kind = TokenKind::Exclamation; + } + } else if first_char == b'%' { + source.consume(); + if Some(b'=') == source.next() { + source.consume(); + token_kind = TokenKind::PercentEquals; + } else { + token_kind = TokenKind::Percent; + } + } else if first_char == b'&' { + source.consume(); + let next = source.next(); + if Some(b'&') == next { + source.consume(); + token_kind = TokenKind::AndAnd; + } else if Some(b'=') == next { + source.consume(); + token_kind = TokenKind::AndEquals; + } else { + token_kind = TokenKind::And; + } + } else if first_char == b'(' { + source.consume(); + token_kind = TokenKind::OpenParen; + } else if first_char == b')' { + source.consume(); + token_kind = TokenKind::CloseParen; + } else if first_char == b'*' { + source.consume(); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::StarEquals; + } else { + token_kind = TokenKind::Star; + } + } else if first_char == b'+' { + source.consume(); + let next = source.next(); + if Some(b'+') == next { + source.consume(); + token_kind = TokenKind::PlusPlus; + } else if Some(b'=') == next { + source.consume(); + token_kind = TokenKind::PlusEquals; + } else { + token_kind = TokenKind::Plus; + } + } else if first_char == b',' { + source.consume(); + token_kind = TokenKind::Comma; + } else if first_char == b'-' { + source.consume(); + let next = source.next(); + if Some(b'-') == next { + source.consume(); + token_kind = TokenKind::MinusMinus; + } else if Some(b'>') == next { + source.consume(); + token_kind = TokenKind::ArrowRight; + } else if Some(b'=') == next { + source.consume(); + token_kind = TokenKind::MinusEquals; + } else { + token_kind = TokenKind::Minus; + } + } else if first_char == b'.' 
{ + source.consume(); + if let Some(b'.') = source.next() { + source.consume(); + token_kind = TokenKind::DotDot; + } else { + token_kind = TokenKind::Dot + } + } else if first_char == b'/' { + source.consume(); + debug_assert_ne!(Some(b'/'), source.next()); + debug_assert_ne!(Some(b'*'), source.next()); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::SlashEquals; + } else { + token_kind = TokenKind::Slash; + } + } else if first_char == b':' { + source.consume(); + if let Some(b':') = source.next() { + source.consume(); + token_kind = TokenKind::ColonColon; + } else { + token_kind = TokenKind::Colon; + } + } else if first_char == b';' { + source.consume(); + token_kind = TokenKind::SemiColon; + } else if first_char == b'<' { + source.consume(); + if let Some(b'<') = source.next() { + source.consume(); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::ShiftLeftEquals; + } else { + token_kind = TokenKind::ShiftLeft; + } + } else { + token_kind = TokenKind::OpenAngle; + } + } else if first_char == b'=' { + source.consume(); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::EqualEqual; + } else { + token_kind = TokenKind::Equal; + } + } else if first_char == b'>' { + source.consume(); + if let Some(b'>') = source.next() { + source.consume(); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::ShiftRightEquals; + } else { + token_kind = TokenKind::ShiftRight; + } + } else { + token_kind = TokenKind::CloseAngle; + } + } else if first_char == b'?' 
{
        source.consume();
        token_kind = TokenKind::Question;
    } else if first_char == b'@' {
        source.consume();
        token_kind = TokenKind::At;
    } else if first_char == b'[' {
        source.consume();
        token_kind = TokenKind::OpenSquare;
    } else if first_char == b']' {
        source.consume();
        token_kind = TokenKind::CloseSquare;
    } else if first_char == b'^' {
        source.consume();
        // '^' may combine with '=' into a compound-assignment token.
        if let Some(b'=') = source.next() {
            source.consume();
            token_kind = TokenKind::CaretEquals;
        } else {
            token_kind = TokenKind::Caret;
        }
    } else if first_char == b'{' {
        source.consume();
        token_kind = TokenKind::OpenCurly;
    } else if first_char == b'|' {
        source.consume();
        // '|' may combine into '||' or '|='.
        let next = source.next();
        if next == Some(b'|') {
            source.consume();
            token_kind = TokenKind::OrOr;
        } else if next == Some(b'=') {
            source.consume();
            token_kind = TokenKind::OrEquals;
        } else {
            token_kind = TokenKind::Or;
        }
    } else if first_char == b'}' {
        source.consume();
        token_kind = TokenKind::CloseCurly;
    } else {
        // Not a punctuation character we recognize. Make sure it is at
        // least valid ASCII before reporting "no token produced".
        self.check_ascii(source)?;
        return Ok(None);
    }

    target.tokens.push(Token::new(token_kind, pos));
    Ok(Some(token_kind))
}

/// Consumes a character literal (e.g. `'a'`), including both quotes.
///
/// Pushes a `Character` token followed by a `SpanEnd` marker. An escaped
/// quote (`\'`) does not terminate the literal.
///
/// # Errors
/// Returns a `ParseError` if the input ends before the closing quote.
pub(crate) fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();

    // Consume the leading quote
    debug_assert!(source.next().unwrap() == b'\'');
    source.consume();

    // Track termination explicitly: initializing `prev_char` to the quote
    // and testing it after the loop would wrongly accept a lone opening
    // quote at end of input as a complete literal.
    let mut prev_char = b'\'';
    let mut is_terminated = false;
    while let Some(c) = source.next() {
        source.consume();
        if c == b'\'' && prev_char != b'\\' {
            is_terminated = true;
            break;
        }

        prev_char = c;
    }

    if !is_terminated {
        // Unterminated character literal
        return Err(ParseError::new_error(source, begin_pos, "encountered unterminated character literal"));
    }

    let end_pos = source.pos();

    target.tokens.push(Token::new(TokenKind::Character, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes a string literal (e.g. `"text"`), including both double quotes.
///
/// Pushes a `String` token followed by a `SpanEnd` marker. An escaped
/// double quote (`\"`) does not terminate the literal.
///
/// # Errors
/// Returns a `ParseError` if the input ends before the closing quote.
pub(crate) fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();

    // Consume the leading double quote
    debug_assert!(source.next().unwrap() == b'"');
    source.consume();

    // Same explicit-termination scheme as `consume_char_literal`: a lone
    // `"` at end of input must be reported as unterminated.
    let mut prev_char = b'"';
    let mut is_terminated = false;
    while let Some(c) = source.next() {
        source.consume();
        if c == b'"' && prev_char != b'\\' {
            is_terminated = true;
            break;
        }

        prev_char = c;
    }

    if !is_terminated {
        // Unterminated string literal
        return Err(ParseError::new_error(source, begin_pos, "encountered unterminated string literal"));
    }

    let end_pos = source.pos();
    target.tokens.push(Token::new(TokenKind::String, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes the leading `#` of a pragma and pushes a `Pound` token.
///
/// Only the `#` itself is consumed here; the pragma's identifier and body
/// are tokenized by the regular tokenizing loop.
fn consume_pragma(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) {
    let pos = source.pos();
    debug_assert_eq!(first_char, b'#');
    source.consume();

    target.tokens.push(Token::new(TokenKind::Pound, pos));
}

/// Consumes a `//` line comment up to and including its terminating
/// newline (or end of input).
///
/// Pushes a `LineComment` token and a `SpanEnd` marker whose offset is
/// adjusted to exclude the `\n` / `\r\n` line terminator.
pub(crate) fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();

    // Consume the leading "//"
    debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'/');
    source.consume();
    source.consume();

    let mut prev_char = b'/';
    let mut cur_char = b'/';
    while let Some(c) = source.next() {
        source.consume();
        cur_char = c;
        if c == b'\n' {
            // End of line
            break;
        }
        prev_char = c;
    }

    let mut end_pos = source.pos();
    // NOTE(review): this assumes `pos()` still reports the comment's line
    // even after the '\n' was consumed — confirm `InputSource` advances
    // its line counter lazily, otherwise this assert fires in debug builds.
    debug_assert_eq!(begin_pos.line, end_pos.line);

    // Modify offset to not include the newline characters
    if cur_char == b'\n' {
        if prev_char == b'\r' {
            end_pos.offset -= 2;
        } else {
            end_pos.offset -= 1;
        }
    } else {
        // No newline seen: the comment must have run to end of input.
        debug_assert!(source.next().is_none())
    }

    target.tokens.push(Token::new(TokenKind::LineComment, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes a `/* ... */` block comment, including both delimiters.
///
/// Pushes a `BlockComment` token and a `SpanEnd` marker.
///
/// # Errors
/// Returns a `ParseError` if the input ends before `*/` is found.
pub(crate) fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();

    // Consume the leading "/*"
    debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'*');
    source.consume();
    source.consume();

    // Explicitly do not put prev_char at "*", because then "/*/" would
    // represent a valid and closed block comment
    let mut prev_char = b' ';
    let mut is_closed = false;
    while let Some(c) = source.next() {
        source.consume();
        if prev_char == b'*' && c == b'/' {
            // End of block comment
            is_closed = true;
            break;
        }
        prev_char = c;
    }

    if !is_closed {
        return Err(ParseError::new_error(source, source.pos(), "encountered unterminated block comment"));
    }

    let end_pos = source.pos();
    target.tokens.push(Token::new(TokenKind::BlockComment, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes an identifier and returns the bytes that make it up.
///
/// Pushes an `Ident` token and a `SpanEnd` marker.
///
/// # Errors
/// Returns a `ParseError` if the character that terminated the identifier
/// is not valid ASCII.
pub(crate) fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
    let begin_pos = source.pos();
    debug_assert!(is_identifier_start(source.next().unwrap()));
    source.consume();

    // Keep reading until no more identifier characters follow
    while let Some(c) = source.next() {
        if !is_identifier_remaining(c) {
            break;
        }

        source.consume();
    }
    self.check_ascii(source)?;

    let end_pos = source.pos();
    target.tokens.push(Token::new(TokenKind::Ident, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
    Ok(source.section(begin_pos.offset, end_pos.offset))
}

/// Consumes something that looks like a number.
///
/// The scan is deliberately permissive (see `maybe_number_remaining`);
/// strict validation of the literal is deferred to a later parsing stage.
/// Pushes an `Integer` token and a `SpanEnd` marker.
///
/// # Errors
/// Returns a `ParseError` if the character that terminated the number is
/// not valid ASCII.
pub(crate) fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();
    debug_assert!(is_integer_literal_start(source.next().unwrap()));
    source.consume();

    // Keep reading until it doesn't look like a number anymore
    while let Some(c) = source.next() {
        if !maybe_number_remaining(c) {
            break;
        }

        source.consume();
    }
    self.check_ascii(source)?;

    let end_pos = source.pos();
    target.tokens.push(Token::new(TokenKind::Integer, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes a run of whitespace and returns whether or not the whitespace
/// contained a newline.
fn consume_whitespace(&self, source: &mut InputSource) -> bool {
    debug_assert!(is_whitespace(source.next().unwrap()));

    let mut has_newline = false;
    while let Some(c) = source.next() {
        if !is_whitespace(c) {
            break;
        }

        if c == b'\n' {
            has_newline = true;
        }
        // Fix: advance past the whitespace byte — without this consume the
        // loop would peek the same character forever.
        source.consume();
    }

    has_newline
}

/// Pushes a new token range onto the range stack in the buffers.
///
/// If tokens exist between the parent range's current end and
/// `first_token`, an intermediate `Code` range is inserted first so the
/// ranges stay contiguous.
fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
    // Copy the fields we need up front so no borrows into `target.ranges`
    // are held across the `push` calls below (pushing may reallocate).
    let cur_curly_depth = target.ranges[self.stack_idx].curly_depth;
    let parent_idx = target.ranges[self.stack_idx].parent_idx;
    let parent_end = target.ranges[parent_idx].end;
    if parent_end != first_token {
        // Insert intermediate range covering the not-yet-claimed tokens.
        // NOTE(review): this intermediate range is not counted in its
        // parent's `subranges` — confirm whether that is intentional.
        target.ranges.push(TokenRange{
            parent_idx,
            range_kind: TokenRangeKind::Code,
            curly_depth: cur_curly_depth,
            start: parent_end,
            end: first_token,
            subranges: 0,
        });
    }

    // Insert the new range; `end` is a placeholder until `pop_range` runs.
    let parent_idx = self.stack_idx;
    self.stack_idx = target.ranges.len();
    target.ranges.push(TokenRange{
        parent_idx,
        range_kind,
        curly_depth: self.curly_depth,
        start: first_token,
        end: first_token,
        subranges: 0,
    });
}

/// Closes the range currently on top of the range stack: sets its end to
/// `end_index`, makes its parent the new top, and credits the popped range
/// as one of the parent's subranges.
fn pop_range(&mut self, target: &mut TokenBuffer, end_index: usize) {
    let last = &mut target.ranges[self.stack_idx];
    // The top-level range is its own parent and must never be popped.
    debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range");

    last.end = end_index;
    self.stack_idx = last.parent_idx as usize;
    let parent = &mut target.ranges[self.stack_idx];
    parent.end = end_index;
    parent.subranges += 1;
}

/// Returns an error if the next character in the source exists and is not
/// valid ASCII; end of input is fine.
fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> {
    match source.next() {
        Some(c) if !c.is_ascii() => {
            Err(ParseError::new_error(source, source.pos(), "encountered a non-ASCII character"))
        },
        _ => {
            Ok(())
        },
    }
}
}

// Helpers for classifying characters and keywords

/// True if the identifier starts a top-level definition.
fn demarks_definition(ident: &[u8]) -> bool {
    ident == b"struct" ||
    ident == b"enum" ||
    ident == b"union" ||
    ident == b"func" ||
    ident == b"primitive" ||
    ident == b"composite"
}

/// True if the identifier starts an import statement.
fn demarks_import(ident: &[u8]) -> bool {
    ident == b"import"
}

fn is_whitespace(c: u8) -> bool {
    c.is_ascii_whitespace()
}

fn is_char_literal_start(c: u8) -> bool {
    c == b'\''
}

fn is_string_literal_start(c: u8) -> bool {
    c == b'"'
}

fn is_pragma_start(c: u8) -> bool {
    c == b'#'
}

fn is_identifier_start(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_'
}

fn is_identifier_remaining(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'_'
}

fn is_integer_literal_start(c: u8) -> bool {
    c.is_ascii_digit()
}

/// Deliberately permissive "could still be part of a number" test: radix
/// markers (b/B, o/O, x/X), decimal digits and '_' anywhere.
/// NOTE(review): hex digits a-f (beyond the radix letters) are not
/// accepted here — confirm that is intended.
fn maybe_number_remaining(c: u8) -> bool {
    matches!(c, b'b' | b'B' | b'o' | b'O' | b'x' | b'X') ||
    c.is_ascii_digit() ||
    c == b'_'
}