From 13bac4f3561979d2ab317502cbec886c9fb3a1f3 2021-04-10 01:23:23 From: MH Date: 2021-04-10 01:23:23 Subject: [PATCH] WIP on tokenization --- diff --git a/src/collections/mod.rs b/src/collections/mod.rs new file mode 100644 index 0000000000000000000000000000000000000000..b066ddec6c67ef24a2dbdd310c354577cadbf4e4 --- /dev/null +++ b/src/collections/mod.rs @@ -0,0 +1,3 @@ +mod string_pool; + +pub(crate) use string_pool::{StringPool, StringRef}; \ No newline at end of file diff --git a/src/collections/string_pool.rs b/src/collections/string_pool.rs new file mode 100644 index 0000000000000000000000000000000000000000..5cd82b91d55b8ebff11b98bd504745ceb5d1616d --- /dev/null +++ b/src/collections/string_pool.rs @@ -0,0 +1,154 @@ +use std::ptr::null_mut; + +const SLAB_SIZE: usize = u16::max_value() as usize; + +pub struct StringRef { + data: *const u8, + length: usize, +} + +impl StringRef { + pub fn as_str<'a>(&'a self) -> &'a str { + unsafe { + let slice = std::slice::from_raw_parts::<'a, u8>(self.data, self.length); + std::str::from_utf8_unchecked(slice) + } + } +} + +struct StringPoolSlab { + prev: *mut StringPoolSlab, + data: Vec, + remaining: usize, +} + +impl StringPoolSlab { + fn new(prev: *mut StringPoolSlab) -> Self { + Self{ prev, data: Vec::with_capacity(SLAB_SIZE), remaining: SLAB_SIZE } + } +} + +/// StringPool is a ever-growing pool of strings. Strings have a maximum size +/// equal to the slab size. The slabs are essentially a linked list to maintain +/// pointer-stability of the strings themselves. +/// All `StringRef` instances are invalidated when the string pool is dropped +pub(crate) struct StringPool { + last: *mut StringPoolSlab, +} + +impl StringPool { + pub(crate) fn new() -> Self { + // To have some stability we just turn a box into a raw ptr. 
+ let initial_slab = Box::new(StringPoolSlab::new(null_mut())); + let initial_slab = Box::into_raw(initial_slab); + StringPool{ + last: initial_slab, + } + } + + // Interns a string to the StringPool, returning a reference to it + pub(crate) fn intern(&mut self, data: &[u8]) -> StringRef { + // TODO: Large string allocations, if ever needed. + let data_len = data.len(); + assert!(data_len <= SLAB_SIZE, "string is too large for slab"); + debug_assert!(std::str::from_utf8(data).is_ok(), "string to intern is not valid UTF-8 encoded"); + + let mut last = unsafe{&mut *self.last}; + if data.len() > last.remaining { + // Doesn't fit: allocate new slab + self.alloc_new_slab(); + last = unsafe{&mut *self.last}; + } + + // Must fit now + debug_assert!(data_len <= last.remaining); + let range_start = last.data.len(); + last.data.extend_from_slice(data); + last.remaining -= data_len; + debug_assert_eq!(range_start + data_len, last.data.len()); + + unsafe { + let start = last.data.as_ptr().offset(range_start as isize); + StringRef{ data: start, length: data_len } + } + } + + fn alloc_new_slab(&mut self) { + let new_slab = Box::new(StringPoolSlab::new(self.last)); + let new_slab = Box::into_raw(new_slab); + self.last = new_slab; + } +} + +impl Drop for StringPool { + fn drop(&mut self) { + let mut new_slab = self.last; + while !new_slab.is_null() { + let cur_slab = new_slab; + unsafe { + new_slab = (*cur_slab).prev; + Box::from_raw(cur_slab); // consume and deallocate + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_string_just_fits() { + let large = "0".repeat(SLAB_SIZE); + let mut pool = StringPool::new(); + let interned = pool.intern(large.as_bytes()); + assert_eq!(interned.as_str(), large); + } + + #[test] + #[should_panic] + fn test_string_too_large() { + let large = "0".repeat(SLAB_SIZE + 1); + let mut pool = StringPool::new(); + let _interned = pool.intern(large.as_bytes()); + } + + #[test] + fn test_lots_of_small_allocations() { + const 
NUM_PER_SLAB: usize = 32; + const NUM_SLABS: usize = 4; + + let to_intern = "0".repeat(SLAB_SIZE / NUM_PER_SLAB); + let mut pool = StringPool::new(); + + let mut last_slab = pool.last; + let mut all_refs = Vec::new(); + + // Fill up first slab + for _alloc_idx in 0..NUM_PER_SLAB { + let interned = pool.intern(to_intern.as_bytes()); + all_refs.push(interned); + assert!(std::ptr::eq(last_slab, pool.last)); + } + + for _slab_idx in 0..NUM_SLABS-1 { + for alloc_idx in 0..NUM_PER_SLAB { + let interned = pool.intern(to_intern.as_bytes()); + all_refs.push(interned); + + if alloc_idx == 0 { + // First allocation produces a new slab + assert!(!std::ptr::eq(last_slab, pool.last)); + last_slab = pool.last; + } else { + assert!(std::ptr::eq(last_slab, pool.last)); + } + } + } + + // All strings are still correct + for string_ref in all_refs { + assert_eq!(string_ref.as_str(), to_intern); + } + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 8674f5eedb3dcf874836957d45bde0a956e0503a..becc35e33199c98bd851df96cde81cbbbf21b786 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ mod macros; mod common; mod protocol; mod runtime; +mod collections; pub use common::{ConnectorId, EndpointPolarity, Payload, Polarity, PortId}; pub use protocol::{ProtocolDescription, TRIVIAL_PD}; diff --git a/src/protocol/input_source2.rs b/src/protocol/input_source2.rs new file mode 100644 index 0000000000000000000000000000000000000000..025b86ddbfb0a0f77f8fd9f3dffe5a5cfad59218 --- /dev/null +++ b/src/protocol/input_source2.rs @@ -0,0 +1,287 @@ +use std::fmt; +use std::cell::{Ref, RefCell}; + +#[derive(Debug, Clone, Copy)] +pub struct InputPosition2 { + pub line: u32, + pub offset: u32, +} + +pub struct InputSpan { + pub begin: InputPosition2, + pub end: InputPosition2, +} + +impl InputSpan { + #[inline] + fn from_positions(begin: InputPosition2, end: InputPosition2) -> Self { + Self { begin, end } + } +} + +/// Wrapper around source file with optional filename. 
Ensures that the file is +/// only scanned once. +pub struct InputSource2 { + pub(crate) filename: String, + pub(crate) input: Vec, + // Iteration + line: u32, + offset: usize, + // State tracking + had_error: Option, + // The offset_lookup is built on-demand upon attempting to report an error. + // As the compiler is currently not multithreaded, we simply put it in a + // RefCell to allow interior mutability. + offset_lookup: RefCell>, +} + +impl InputSource2 { + pub fn new(filename: String, input: Vec) -> Self { + Self{ + filename, + input, + line: 1, + offset: 0, + had_error: None, + offset_lookup: RefCell::new(Vec::new()), + } + } + + #[inline] + pub fn pos(&self) -> InputPosition2 { + InputPosition2{ line: self.line, offset: self.offset as u32 } + } + + pub fn next(&self) -> Option { + if self.offset < self.input.len() { + Some(self.input[self.offset]) + } else { + None + } + } + + pub fn lookahead(&self, offset: usize) -> Option { + let offset_pos = self.offset + offset; + if offset_pos < self.input.len() { + Some(self.input[offset_pos]) + } else { + None + } + } + + pub fn section(&self, start: u32, end: u32) -> &[u8] { + &self.input[start as usize..end as usize] + } + + // Consumes the next character. Will check well-formedness of newlines: \r + // must be followed by a \n, because this is used for error reporting. Will + // not check for ascii-ness of the file, better left to a tokenizer. + pub fn consume(&mut self) { + match self.next() { + Some(b'\r') => { + if Some(b'\n') == self.lookahead(1) { + // Well formed file + self.offset += 1; + } else { + // Not a well-formed file, pretend like we can continue + self.offset += 1; + self.set_error("Encountered carriage-feed without a following newline"); + } + }, + Some(b'\n') => { + self.line += 1; + self.offset += 1; + }, + Some(_) => { + self.offset += 1; + } + None => {} + } + + // Maybe we actually want to check this in release mode. Then again: + // a 4 gigabyte source file... Really? 
+ debug_assert!(self.offset < u32::max_value() as usize); + } + + fn set_error(&mut self, msg: &str) { + if self.had_error.is_none() { + self.had_error = Some(ParseError::new_error(self, self.pos(), msg)); + } + } + + fn get_lookup(&self) -> Ref> { + // Once constructed the lookup always contains one element. We use this + // to see if it is constructed already. + let lookup = self.offset_lookup.borrow(); + if !lookup.is_empty() { + return lookup; + } + + // Build the line number (!) to offset lookup, so offset by 1. We + // assume the entire source file is scanned (most common case) for + // preallocation. + let lookup = self.offset_lookup.borrow_mut(); + lookup.reserve(self.line as usize + 2); + lookup.push(0); // line 0: never used + lookup.push(0); // first line: first character + + for char_idx in 0..self.input.len() { + if self.input[char_idx] == b'\n' { + lookup.push(char_idx as u32 + 1); + } + } + + lookup.push(self.input.len() as u32); // for lookup_line_end + debug_assert_eq!(self.line as usize + 2, lookup.len(), "remove me: i am a testing assert and sometimes invalid"); + + // Return created lookup + let lookup = self.offset_lookup.borrow(); + return lookup; + } + + fn lookup_line_start_offset(&self, line_number: u32) -> u32 { + let lookup = self.get_lookup(); + lookup[line_number as usize] + } + + fn lookup_line_end_offset(&self, line_number: u32) -> u32 { + let lookup = self.get_lookup(); + let offset = lookup[(line_number + 1) as usize] - 1; + let offset_usize = offset as usize; + + // Compensate for newlines and a potential carriage feed + if self.input[offset_usize] == b'\n' { + if offset_usize > 0 && self.input[offset_usize] == b'\r' { + offset - 2 + } else { + offset - 1 + } + } else { + offset + } + } +} + +#[derive(Debug)] +pub enum ParseErrorType { + Info, + Error +} + +#[derive(Debug)] +pub struct ParseErrorStatement { + pub(crate) error_type: ParseErrorType, + pub(crate) line: u32, + pub(crate) column: u32, + pub(crate) offset: u32, + 
pub(crate) filename: String, + pub(crate) context: String, + pub(crate) message: String, +} + +impl ParseErrorStatement { + fn from_source(error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + // Seek line start and end + let line_start = source.lookup_line_start_offset(position.line); + let line_end = source.lookup_line_end_offset(position.line); + debug_assert!(position.offset >= line_start); + let column = position.offset - line_start + 1; + + Self{ + error_type, + line: position.line, + column, + offset: position.offset, + filename: source.filename.clone(), + context: String::from_utf8_lossy(&source.input[line_start as usize..line_end as usize]).to_string(), + message: msg.to_string() + } + } +} + +impl fmt::Display for ParseErrorStatement { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Write message + match self.error_type { + ParseErrorType::Info => write!(f, " INFO: ")?, + ParseErrorType::Error => write!(f, "ERROR: ")?, + } + writeln!(f, "{}", &self.message)?; + + // Write originating file/line/column + if self.filename.is_empty() { + writeln!(f, " +- at {}:{}", self.line, self.column)?; + } else { + writeln!(f, " +- at {}:{}:{}", self.filename, self.line, self.column)?; + } + + // Write source context + writeln!(f, " | ")?; + writeln!(f, " | {}", self.context)?; + + // Write underline indicating where the error ocurred + debug_assert!(self.column as usize <= self.context.chars().count()); + let mut arrow = String::with_capacity(self.context.len() + 3); + arrow.push_str(" | "); + let mut char_col = 1; + for char in self.context.chars() { + if char_col == self.column { break; } + if char == '\t' { + arrow.push('\t'); + } else { + arrow.push(' '); + } + + char_col += 1; + } + arrow.push('^'); + writeln!(f, "{}", arrow)?; + + Ok(()) + } +} + +#[derive(Debug)] +pub struct ParseError { + pub(crate) statements: Vec +} + +impl fmt::Display for ParseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) 
-> fmt::Result { + if self.statements.is_empty() { + return Ok(()) + } + + self.statements[0].fmt(f)?; + for statement in self.statements.iter().skip(1) { + writeln!(f)?; + statement.fmt(f)?; + } + + Ok(()) + } +} + +impl ParseError { + pub fn empty() -> Self { + Self{ statements: Vec::new() } + } + + pub fn new_error(source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + Self{ statements: vec!(ParseErrorStatement::from_source(ParseErrorType::Error, source, position, msg))} + } + + pub fn with_prefixed(mut self, error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + self.statements.insert(0, ParseErrorStatement::from_source(error_type, source, position, msg)); + self + } + + pub fn with_postfixed(mut self, error_type: ParseErrorType, source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + self.statements.push(ParseErrorStatement::from_source(error_type, source, position, msg)); + self + } + + pub fn with_postfixed_info(self, source: &InputSource2, position: InputPosition2, msg: &str) -> Self { + self.with_postfixed(ParseErrorType::Info, source, position, msg) + } +} diff --git a/src/protocol/inputsource.rs b/src/protocol/inputsource.rs index 106b6d76c9ac813bdc99bff0158ca7be8a3c7eec..756f51cc32c3153a8d629f85f7c3ef02942d0521 100644 --- a/src/protocol/inputsource.rs +++ b/src/protocol/inputsource.rs @@ -164,11 +164,6 @@ impl InputPosition { } &source.input[start..end] } - fn eval_error(&self, message: S) -> EvalError { - EvalError { position: *self, message: message.to_string(), backtrace: Backtrace::new() } - } - - pub(crate) fn col(&self) -> usize { self.column } } impl Default for InputPosition { @@ -185,9 +180,6 @@ impl fmt::Display for InputPosition { pub trait SyntaxElement { fn position(&self) -> InputPosition; - fn error(&self, message: S) -> EvalError { - self.position().eval_error(message) - } } #[derive(Debug)] @@ -315,172 +307,3 @@ impl ParseError { 
self.with_postfixed(ParseErrorType::Info, source, position, msg) } } - -#[derive(Debug, Clone)] -pub struct EvalError { - position: InputPosition, - message: String, - backtrace: Backtrace, -} - -impl EvalError { - pub fn new(position: InputPosition, message: S) -> EvalError { - EvalError { position, message: message.to_string(), backtrace: Backtrace::new() } - } - // Diagnostic methods - pub fn write(&self, source: &InputSource, writer: &mut A) -> io::Result<()> { - if !source.filename.is_empty() { - writeln!( - writer, - "Evaluation error at {}:{}: {}", - source.filename, self.position, self.message - )?; - } else { - writeln!(writer, "Evaluation error at {}: {}", self.position, self.message)?; - } - let line = self.position.context(source); - writeln!(writer, "{}", String::from_utf8_lossy(line))?; - let mut arrow: Vec = Vec::new(); - for pos in 1..self.position.column { - let c = line[pos - 1]; - if c == b'\t' { - arrow.push(b'\t') - } else { - arrow.push(b' ') - } - } - arrow.push(b'^'); - writeln!(writer, "{}", String::from_utf8_lossy(&arrow)) - } - pub fn print(&self, source: &InputSource) { - self.write(source, &mut std::io::stdout()).unwrap() - } - pub fn display<'a>(&'a self, source: &'a InputSource) -> DisplayEvalError<'a> { - DisplayEvalError::new(self, source) - } -} - -impl From for io::Error { - fn from(_: EvalError) -> io::Error { - io::Error::new(io::ErrorKind::InvalidInput, "eval error") - } -} - -#[derive(Clone, Copy)] -pub struct DisplayEvalError<'a> { - error: &'a EvalError, - source: &'a InputSource, -} - -impl DisplayEvalError<'_> { - fn new<'a>(error: &'a EvalError, source: &'a InputSource) -> DisplayEvalError<'a> { - DisplayEvalError { error, source } - } -} - -impl fmt::Display for DisplayEvalError<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut vec: Vec = Vec::new(); - match self.error.write(self.source, &mut vec) { - Err(_) => { - return fmt::Result::Err(fmt::Error); - } - Ok(_) => {} - } - write!(f, "{}", 
String::from_utf8_lossy(&vec)) - } -} - -// #[cfg(test)] -// mod tests { -// use super::*; - -// #[test] -// fn test_from_string() { -// let mut is = InputSource::from_string("#version 100\n").unwrap(); -// assert!(is.input.len() == 13); -// assert!(is.line == 1); -// assert!(is.column == 1); -// assert!(is.offset == 0); -// let ps = is.pos(); -// assert!(ps.line == 1); -// assert!(ps.column == 1); -// assert!(ps.offset == 0); -// assert!(is.next() == Some(b'#')); -// is.consume(); -// assert!(is.next() == Some(b'v')); -// assert!(is.lookahead(1) == Some(b'e')); -// is.consume(); -// assert!(is.next() == Some(b'e')); -// is.consume(); -// assert!(is.next() == Some(b'r')); -// is.consume(); -// assert!(is.next() == Some(b's')); -// is.consume(); -// assert!(is.next() == Some(b'i')); -// is.consume(); -// { -// let ps = is.pos(); -// assert_eq!(b"#version 100", ps.context(&is)); -// let er = is.error("hello world!"); -// let mut vec: Vec = Vec::new(); -// er.write(&is, &mut vec).unwrap(); -// assert_eq!( -// "Parse error at 1:7: hello world!\n#version 100\n ^\n", -// String::from_utf8_lossy(&vec) -// ); -// } -// assert!(is.next() == Some(b'o')); -// is.consume(); -// assert!(is.next() == Some(b'n')); -// is.consume(); -// assert!(is.input.len() == 13); -// assert!(is.line == 1); -// assert!(is.column == 9); -// assert!(is.offset == 8); -// assert!(is.next() == Some(b' ')); -// is.consume(); -// assert!(is.next() == Some(b'1')); -// is.consume(); -// assert!(is.next() == Some(b'0')); -// is.consume(); -// assert!(is.next() == Some(b'0')); -// is.consume(); -// assert!(is.input.len() == 13); -// assert!(is.line == 1); -// assert!(is.column == 13); -// assert!(is.offset == 12); -// assert!(is.next() == Some(b'\n')); -// is.consume(); -// assert!(is.input.len() == 13); -// assert!(is.line == 2); -// assert!(is.column == 1); -// assert!(is.offset == 13); -// { -// let ps = is.pos(); -// assert_eq!(b"", ps.context(&is)); -// } -// assert!(is.next() == None); -// 
is.consume(); -// assert!(is.next() == None); -// } - -// #[test] -// fn test_split() { -// let mut is = InputSource::from_string("#version 100\n").unwrap(); -// let backup = is.clone(); -// assert!(is.next() == Some(b'#')); -// is.consume(); -// assert!(is.next() == Some(b'v')); -// is.consume(); -// assert!(is.next() == Some(b'e')); -// is.consume(); -// is = backup; -// assert!(is.next() == Some(b'#')); -// is.consume(); -// assert!(is.next() == Some(b'v')); -// is.consume(); -// assert!(is.next() == Some(b'e')); -// is.consume(); -// } -// } diff --git a/src/protocol/lexer2.rs b/src/protocol/lexer2.rs new file mode 100644 index 0000000000000000000000000000000000000000..68d39e006e7b820c0aa6e376dff1b21a7d4f3974 --- /dev/null +++ b/src/protocol/lexer2.rs @@ -0,0 +1,4 @@ + +pub struct Lexer<'a> { + source: &'a mut InputSource, +} \ No newline at end of file diff --git a/src/protocol/mod.rs b/src/protocol/mod.rs index 876376e589b9d46bbf539143a5e296ea51307cea..62e11f9b99130635e5d871827f4e855f9d6f7a83 100644 --- a/src/protocol/mod.rs +++ b/src/protocol/mod.rs @@ -2,16 +2,17 @@ mod arena; // mod ast; mod eval; pub(crate) mod inputsource; +pub(crate) mod input_source2; // mod lexer; mod tokenizer; mod parser; -mod pools; #[cfg(test)] mod tests; // TODO: Remove when not benchmarking pub(crate) mod ast; pub(crate) mod ast_printer; pub(crate) mod lexer; +pub(crate) mod lexer2; lazy_static::lazy_static! { /// Conveniently-provided protocol description initialized with a zero-length PDL string. 
diff --git a/src/protocol/pools.rs b/src/protocol/pools.rs deleted file mode 100644 index 5850d9c547e8361df2b42c2d09bd2eff435c91cf..0000000000000000000000000000000000000000 --- a/src/protocol/pools.rs +++ /dev/null @@ -1,5 +0,0 @@ - - -struct StringPool { - -} \ No newline at end of file diff --git a/src/protocol/tokenizer/mod.rs b/src/protocol/tokenizer/mod.rs index 685cac45f4a39d478de1a9d19352935b10ba16ac..9f5cf247e7558d0a1025d48109be77c9fdd10546 100644 --- a/src/protocol/tokenizer/mod.rs +++ b/src/protocol/tokenizer/mod.rs @@ -1,17 +1,99 @@ -enum Token { - + +use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError, InputPosition2 as InputPosition, InputSpan}; + +#[derive(Clone, Copy, PartialEq, Eq)] +enum TokenKind { + // Variable-character tokens, followed by a SpanEnd token + Ident, + Integer, + String, + Character, + LineComment, + BlockComment, + // Punctuation + Exclamation, // ! + Question, // ? + Pound, // # + OpenAngle, // < + OpenCurly, // { + OpenParen, // ( + OpenSquare, // [ + CloseAngle, // > + CloseCurly, // } + CloseParen, // ) + CloseSquare, // ] + Colon, // : + ColonColon, // :: + Comma, // , + Dot, // . + DotDot, // .. 
+ SemiColon, // ; + Quote, // ' + DoubleQuote, // " + // Operator-like + At, // @ + Plus, // + + PlusPlus, // ++ + PlusEquals, // += + Minus, // - + ArrowRight, // -> + MinusMinus, // -- + MinusEquals, // -= + Star, // * + StarEquals, // *= + Slash, // / + SlashEquals, // /= + Percent, // % + PercentEquals, // %= + Caret, // ^ + CaretEquals, // ^= + And, // & + AndAnd, // && + AndEquals, // &= + Or, // | + OrOr, // || + OrEquals, // |= + Tilde, // ~ + Equal, // = + EqualEqual, // == + NotEqual, // != + ShiftLeft, // << + ShiftLeftEquals,// <<= + ShiftRight, // >> + ShiftRightEquals, // >>= + // Special marker token to indicate end of variable-character tokens + SpanEnd, +} + +struct Token { + kind: TokenKind, + pos: InputPosition, // probably need something different +} + +impl Token { + fn new(kind: TokenKind, pos: InputPosition) -> Self { + Self{ kind, pos } + } } -enum TokenRangeType { +#[derive(Debug, PartialEq, Eq)] +enum TokenRangeKind { + Module, Pragma, Import, - Defintion, + Definition, + Code, } struct TokenRange { - range_type: TokenRangeType, + // Index of parent in `TokenBuffer.ranges`, does not have a parent if the + // range kind is Module, in that case the parent index points to itself. + parent_idx: usize, + range_kind: TokenRangeKind, + curly_depth: u8, start: usize, end: usize, + subranges: usize, } struct TokenBuffer { @@ -19,3 +101,611 @@ struct TokenBuffer { ranges: Vec, } +struct ParseState { + kind: TokenRangeKind, + start: usize, +} + +// Tokenizer is a reusable parser to tokenize multiple source files using the +// same allocated buffers. In a well-formed program, we produce a consistent +// tree of token ranges such that we may identify tokens that represent a +// defintion or an import before producing the entire AST. +// +// If the program is not well-formed then the tree may be inconsistent, but we +// will detect this once we transform the tokens into the AST. 
+pub(crate) struct Tokenizer { + curly_depth: u8, + stack_idx: usize, +} + +impl Tokenizer { + pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { + // Assert source and buffer are at start + debug_assert_eq!(source.pos().offset, 0); + debug_assert!(target.tokens.is_empty()); + debug_assert!(target.ranges.is_empty()); + + // Set up for tokenization by pushing the first range onto the stack. + // This range may get transformed into the appropriate range kind later, + // see `push_range` and `pop_range`. + self.curly_depth = 0; + self.stack_idx = 1; + target.ranges.push(TokenRange{ + parent_idx: 0, + range_kind: TokenRangeKind::Module, + curly_depth: 0, + start: 0, + end: 0, + subranges: 0, + }); + + // Main processing loop + while let Some(c) = source.next() { + let token_index = target.tokens.len(); + + if is_char_literal_start(c) { + self.consume_char_literal(source, target)?; + } else if is_string_literal_start(c) { + self.consume_string_literal(source, target)?; + } else if is_identifier_start(c) { + let ident = self.consume_identifier(source, target)?; + + if demarks_definition(ident) { + self.push_range(target, TokenRangeKind::Definition, token_index); + } else if demarks_import(ident) { + self.push_range(target, TokenRangeKind::Import, token_index); + } + } else if is_integer_literal_start(c) { + self.consume_number(source, target)?; + } else if is_pragma_start(c) { + self.consume_pragma(c, source, target); + self.push_range(target, TokenRangeKind::Pragma, token_index); + } else if self.is_line_comment_start(c, source) { + self.consume_line_comment(source, target)?; + } else if self.is_block_comment_start(c, source) { + self.consume_block_comment(source, target)?; + } else if is_whitespace(c) { + let contained_newline = self.consume_whitespace(source); + } else { + let was_punctuation = self.maybe_parse_punctuation(c, source, target)?; + if let Some(token) = was_punctuation { + if token == 
TokenKind::OpenCurly { + self.curly_depth += 1; + } else if token == TokenKind::CloseCurly { + // Check if this marks the end of a range we're + // currently processing + self.curly_depth -= 1; + + let range = &target.ranges[self.stack_idx]; + if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_depth { + self.pop_range(target, target.tokens.len()); + } + } else if token == TokenKind::SemiColon { + // Check if this marks the end of an import + let range = &target.ranges[self.stack_idx]; + if range.range_kind == TokenRangeKind::Import { + self.pop_range(target, target.tokens.len()); + } + } + } else { + return Err(ParseError::new_error(source, source.pos(), "unexpected character")); + } + } + } + + // End of file, check if our state is correct + Ok(()) + } + + fn is_line_comment_start(&self, first_char: u8, source: &InputSource) -> bool { + return first_char == b'/' && Some(b'/') == source.lookahead(1); + } + + fn is_block_comment_start(&self, first_char: u8, source: &InputSource) -> bool { + return first_char == b'/' && Some(b'*') == source.lookahead(1); + } + + pub(crate) fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result, ParseError> { + debug_assert!(first_char != b'#', "'#' needs special handling"); + debug_assert!(first_char != b'\'', "'\'' needs special handling"); + debug_assert!(first_char != b'"', "'\"' needs special handling"); + + let pos = source.pos(); + let token_kind; + if first_char == b'!' 
{ + source.consume(); + if Some(b'=') == source.next() { + source.consume(); + token_kind = TokenKind::NotEqual; + } else { + token_kind = TokenKind::Exclamation; + } + } else if first_char == b'%' { + source.consume(); + if Some(b'=') == source.next() { + source.consume(); + token_kind = TokenKind::PercentEquals; + } else { + token_kind = TokenKind::Percent; + } + } else if first_char == b'&' { + source.consume(); + let next = source.next(); + if Some(b'&') == next { + source.consume(); + token_kind = TokenKind::AndAnd; + } else if Some(b'=') == next { + source.consume(); + token_kind = TokenKind::AndEquals; + } else { + token_kind = TokenKind::And; + } + } else if first_char == b'(' { + source.consume(); + token_kind = TokenKind::OpenParen; + } else if first_char == b')' { + source.consume(); + token_kind = TokenKind::CloseParen; + } else if first_char == b'*' { + source.consume(); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::StarEquals; + } else { + token_kind = TokenKind::Star; + } + } else if first_char == b'+' { + source.consume(); + let next = source.next(); + if Some(b'+') == next { + source.consume(); + token_kind = TokenKind::PlusPlus; + } else if Some(b'=') == next { + source.consume(); + token_kind = TokenKind::PlusEquals; + } else { + token_kind = TokenKind::Plus; + } + } else if first_char == b',' { + source.consume(); + token_kind = TokenKind::Comma; + } else if first_char == b'-' { + source.consume(); + let next = source.next(); + if Some(b'-') == next { + source.consume(); + token_kind = TokenKind::MinusMinus; + } else if Some(b'>') == next { + source.consume(); + token_kind = TokenKind::ArrowRight; + } else if Some(b'=') == next { + source.consume(); + token_kind = TokenKind::MinusEquals; + } else { + token_kind = TokenKind::Minus; + } + } else if first_char == b'.' 
{ + source.consume(); + if let Some(b'.') = source.next() { + source.consume(); + token_kind = TokenKind::DotDot; + } else { + token_kind = TokenKind::Dot + } + } else if first_char == b'/' { + source.consume(); + debug_assert_ne!(Some(b'/'), source.next()); + debug_assert_ne!(Some(b'*'), source.next()); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::SlashEquals; + } else { + token_kind = TokenKind::Slash; + } + } else if first_char == b':' { + source.consume(); + if let Some(b':') = source.next() { + source.consume(); + token_kind = TokenKind::ColonColon; + } else { + token_kind = TokenKind::Colon; + } + } else if first_char == b';' { + source.consume(); + token_kind = TokenKind::SemiColon; + } else if first_char == b'<' { + source.consume(); + if let Some(b'<') = source.next() { + source.consume(); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::ShiftLeftEquals; + } else { + token_kind = TokenKind::ShiftLeft; + } + } else { + token_kind = TokenKind::OpenAngle; + } + } else if first_char == b'=' { + source.consume(); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::EqualEqual; + } else { + token_kind = TokenKind::Equal; + } + } else if first_char == b'>' { + source.consume(); + if let Some(b'>') = source.next() { + source.consume(); + if let Some(b'=') = source.next() { + source.consume(); + token_kind = TokenKind::ShiftRightEquals; + } else { + token_kind = TokenKind::ShiftRight; + } + } else { + token_kind = TokenKind::CloseAngle; + } + } else if first_char == b'?' 
{
        source.consume();
        token_kind = TokenKind::Question;
    } else if first_char == b'@' {
        source.consume();
        token_kind = TokenKind::At;
    } else if first_char == b'[' {
        source.consume();
        token_kind = TokenKind::OpenSquare;
    } else if first_char == b']' {
        source.consume();
        token_kind = TokenKind::CloseSquare;
    } else if first_char == b'^' {
        source.consume();
        // '^' may combine with '=' into a compound-assignment token.
        if let Some(b'=') = source.next() {
            source.consume();
            token_kind = TokenKind::CaretEquals;
        } else {
            token_kind = TokenKind::Caret;
        }
    } else if first_char == b'{' {
        source.consume();
        token_kind = TokenKind::OpenCurly;
    } else if first_char == b'|' {
        source.consume();
        // '|' may combine into '||' or '|='.
        let next = source.next();
        if next == Some(b'|') {
            source.consume();
            token_kind = TokenKind::OrOr;
        } else if next == Some(b'=') {
            source.consume();
            token_kind = TokenKind::OrEquals;
        } else {
            token_kind = TokenKind::Or;
        }
    } else if first_char == b'}' {
        source.consume();
        token_kind = TokenKind::CloseCurly;
    } else {
        // Not a punctuation character we recognize. Make sure it is at
        // least valid ASCII before reporting "no token produced".
        self.check_ascii(source)?;
        return Ok(None);
    }

    target.tokens.push(Token::new(token_kind, pos));
    Ok(Some(token_kind))
}

/// Consumes a character literal (e.g. `'a'`), including both quotes.
///
/// Pushes a `Character` token followed by a `SpanEnd` marker. An escaped
/// quote (`\'`) does not terminate the literal.
///
/// # Errors
/// Returns a `ParseError` if the input ends before the closing quote.
pub(crate) fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();

    // Consume the leading quote
    debug_assert!(source.next().unwrap() == b'\'');
    source.consume();

    // Track termination explicitly: initializing `prev_char` to the quote
    // and testing it after the loop would wrongly accept a lone opening
    // quote at end of input as a complete literal.
    let mut prev_char = b'\'';
    let mut is_terminated = false;
    while let Some(c) = source.next() {
        source.consume();
        if c == b'\'' && prev_char != b'\\' {
            is_terminated = true;
            break;
        }

        prev_char = c;
    }

    if !is_terminated {
        // Unterminated character literal
        return Err(ParseError::new_error(source, begin_pos, "encountered unterminated character literal"));
    }

    let end_pos = source.pos();

    target.tokens.push(Token::new(TokenKind::Character, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes a string literal (e.g. `"text"`), including both double quotes.
///
/// Pushes a `String` token followed by a `SpanEnd` marker. An escaped
/// double quote (`\"`) does not terminate the literal.
///
/// # Errors
/// Returns a `ParseError` if the input ends before the closing quote.
pub(crate) fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();

    // Consume the leading double quote
    debug_assert!(source.next().unwrap() == b'"');
    source.consume();

    // Same explicit-termination scheme as `consume_char_literal`: a lone
    // `"` at end of input must be reported as unterminated.
    let mut prev_char = b'"';
    let mut is_terminated = false;
    while let Some(c) = source.next() {
        source.consume();
        if c == b'"' && prev_char != b'\\' {
            is_terminated = true;
            break;
        }

        prev_char = c;
    }

    if !is_terminated {
        // Unterminated string literal
        return Err(ParseError::new_error(source, begin_pos, "encountered unterminated string literal"));
    }

    let end_pos = source.pos();
    target.tokens.push(Token::new(TokenKind::String, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes the leading `#` of a pragma and pushes a `Pound` token.
///
/// Only the `#` itself is consumed here; the pragma's identifier and body
/// are tokenized by the regular tokenizing loop.
fn consume_pragma(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) {
    let pos = source.pos();
    debug_assert_eq!(first_char, b'#');
    source.consume();

    target.tokens.push(Token::new(TokenKind::Pound, pos));
}

/// Consumes a `//` line comment up to and including its terminating
/// newline (or end of input).
///
/// Pushes a `LineComment` token and a `SpanEnd` marker whose offset is
/// adjusted to exclude the `\n` / `\r\n` line terminator.
pub(crate) fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();

    // Consume the leading "//"
    debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'/');
    source.consume();
    source.consume();

    let mut prev_char = b'/';
    let mut cur_char = b'/';
    while let Some(c) = source.next() {
        source.consume();
        cur_char = c;
        if c == b'\n' {
            // End of line
            break;
        }
        prev_char = c;
    }

    let mut end_pos = source.pos();
    // NOTE(review): this assumes `pos()` still reports the comment's line
    // even after the '\n' was consumed — confirm `InputSource` advances
    // its line counter lazily, otherwise this assert fires in debug builds.
    debug_assert_eq!(begin_pos.line, end_pos.line);

    // Modify offset to not include the newline characters
    if cur_char == b'\n' {
        if prev_char == b'\r' {
            end_pos.offset -= 2;
        } else {
            end_pos.offset -= 1;
        }
    } else {
        // No newline seen: the comment must have run to end of input.
        debug_assert!(source.next().is_none())
    }

    target.tokens.push(Token::new(TokenKind::LineComment, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes a `/* ... */` block comment, including both delimiters.
///
/// Pushes a `BlockComment` token and a `SpanEnd` marker.
///
/// # Errors
/// Returns a `ParseError` if the input ends before `*/` is found.
pub(crate) fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();

    // Consume the leading "/*"
    debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'*');
    source.consume();
    source.consume();

    // Explicitly do not put prev_char at "*", because then "/*/" would
    // represent a valid and closed block comment
    let mut prev_char = b' ';
    let mut is_closed = false;
    while let Some(c) = source.next() {
        source.consume();
        if prev_char == b'*' && c == b'/' {
            // End of block comment
            is_closed = true;
            break;
        }
        prev_char = c;
    }

    if !is_closed {
        return Err(ParseError::new_error(source, source.pos(), "encountered unterminated block comment"));
    }

    let end_pos = source.pos();
    target.tokens.push(Token::new(TokenKind::BlockComment, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes an identifier and returns the bytes that make it up.
///
/// Pushes an `Ident` token and a `SpanEnd` marker.
///
/// # Errors
/// Returns a `ParseError` if the character that terminated the identifier
/// is not valid ASCII.
pub(crate) fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
    let begin_pos = source.pos();
    debug_assert!(is_identifier_start(source.next().unwrap()));
    source.consume();

    // Keep reading until no more identifier characters follow
    while let Some(c) = source.next() {
        if !is_identifier_remaining(c) {
            break;
        }

        source.consume();
    }
    self.check_ascii(source)?;

    let end_pos = source.pos();
    target.tokens.push(Token::new(TokenKind::Ident, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
    Ok(source.section(begin_pos.offset, end_pos.offset))
}

/// Consumes something that looks like a number.
///
/// The scan is deliberately permissive (see `maybe_number_remaining`);
/// strict validation of the literal is deferred to a later parsing stage.
/// Pushes an `Integer` token and a `SpanEnd` marker.
///
/// # Errors
/// Returns a `ParseError` if the character that terminated the number is
/// not valid ASCII.
pub(crate) fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
    let begin_pos = source.pos();
    debug_assert!(is_integer_literal_start(source.next().unwrap()));
    source.consume();

    // Keep reading until it doesn't look like a number anymore
    while let Some(c) = source.next() {
        if !maybe_number_remaining(c) {
            break;
        }

        source.consume();
    }
    self.check_ascii(source)?;

    let end_pos = source.pos();
    target.tokens.push(Token::new(TokenKind::Integer, begin_pos));
    target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));

    Ok(())
}

/// Consumes a run of whitespace and returns whether or not the whitespace
/// contained a newline.
fn consume_whitespace(&self, source: &mut InputSource) -> bool {
    debug_assert!(is_whitespace(source.next().unwrap()));

    let mut has_newline = false;
    while let Some(c) = source.next() {
        if !is_whitespace(c) {
            break;
        }

        if c == b'\n' {
            has_newline = true;
        }
        // Fix: advance past the whitespace byte — without this consume the
        // loop would peek the same character forever.
        source.consume();
    }

    has_newline
}

/// Pushes a new token range onto the range stack in the buffers.
///
/// If tokens exist between the parent range's current end and
/// `first_token`, an intermediate `Code` range is inserted first so the
/// ranges stay contiguous.
fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
    // Copy the fields we need up front so no borrows into `target.ranges`
    // are held across the `push` calls below (pushing may reallocate).
    let cur_curly_depth = target.ranges[self.stack_idx].curly_depth;
    let parent_idx = target.ranges[self.stack_idx].parent_idx;
    let parent_end = target.ranges[parent_idx].end;
    if parent_end != first_token {
        // Insert intermediate range covering the not-yet-claimed tokens.
        // NOTE(review): this intermediate range is not counted in its
        // parent's `subranges` — confirm whether that is intentional.
        target.ranges.push(TokenRange{
            parent_idx,
            range_kind: TokenRangeKind::Code,
            curly_depth: cur_curly_depth,
            start: parent_end,
            end: first_token,
            subranges: 0,
        });
    }

    // Insert the new range; `end` is a placeholder until `pop_range` runs.
    let parent_idx = self.stack_idx;
    self.stack_idx = target.ranges.len();
    target.ranges.push(TokenRange{
        parent_idx,
        range_kind,
        curly_depth: self.curly_depth,
        start: first_token,
        end: first_token,
        subranges: 0,
    });
}

/// Closes the range currently on top of the range stack: sets its end to
/// `end_index`, makes its parent the new top, and credits the popped range
/// as one of the parent's subranges.
fn pop_range(&mut self, target: &mut TokenBuffer, end_index: usize) {
    let last = &mut target.ranges[self.stack_idx];
    // The top-level range is its own parent and must never be popped.
    debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range");

    last.end = end_index;
    self.stack_idx = last.parent_idx as usize;
    let parent = &mut target.ranges[self.stack_idx];
    parent.end = end_index;
    parent.subranges += 1;
}

/// Returns an error if the next character in the source exists and is not
/// valid ASCII; end of input is fine.
fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> {
    match source.next() {
        Some(c) if !c.is_ascii() => {
            Err(ParseError::new_error(source, source.pos(), "encountered a non-ASCII character"))
        },
        _ => {
            Ok(())
        },
    }
}
}

// Helpers for classifying characters and keywords

/// True if the identifier starts a top-level definition.
fn demarks_definition(ident: &[u8]) -> bool {
    ident == b"struct" ||
    ident == b"enum" ||
    ident == b"union" ||
    ident == b"func" ||
    ident == b"primitive" ||
    ident == b"composite"
}

/// True if the identifier starts an import statement.
fn demarks_import(ident: &[u8]) -> bool {
    ident == b"import"
}

fn is_whitespace(c: u8) -> bool {
    c.is_ascii_whitespace()
}

fn is_char_literal_start(c: u8) -> bool {
    c == b'\''
}

fn is_string_literal_start(c: u8) -> bool {
    c == b'"'
}

fn is_pragma_start(c: u8) -> bool {
    c == b'#'
}

fn is_identifier_start(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_'
}

fn is_identifier_remaining(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'_'
}

fn is_integer_literal_start(c: u8) -> bool {
    c.is_ascii_digit()
}

/// Deliberately permissive "could still be part of a number" test: radix
/// markers (b/B, o/O, x/X), decimal digits and '_' anywhere.
/// NOTE(review): hex digits a-f (beyond the radix letters) are not
/// accepted here — confirm that is intended.
fn maybe_number_remaining(c: u8) -> bool {
    matches!(c, b'b' | b'B' | b'o' | b'O' | b'x' | b'X') ||
    c.is_ascii_digit() ||
    c == b'_'
}