use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError, InputPosition2 as InputPosition, InputSpan}; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(crate) enum TokenKind { // Variable-character tokens, followed by a SpanEnd token Ident, // regular identifier Pragma, // identifier with prefixed `#`, range includes `#` Integer, // integer literal String, // string literal, range includes `"` Character, // character literal, range includes `'` LineComment, // line comment, range includes leading `//`, but not newline BlockComment, // block comment, range includes leading `/*` and trailing `*/` // Punctuation Exclamation, // ! Question, // ? Pound, // # OpenAngle, // < OpenCurly, // { OpenParen, // ( OpenSquare, // [ CloseAngle, // > CloseCurly, // } CloseParen, // ) CloseSquare, // ] Colon, // : ColonColon, // :: Comma, // , Dot, // . DotDot, // .. SemiColon, // ; Quote, // ' DoubleQuote, // " // Operator-like At, // @ Plus, // + PlusPlus, // ++ PlusEquals, // += Minus, // - ArrowRight, // -> MinusMinus, // -- MinusEquals, // -= Star, // * StarEquals, // *= Slash, // / SlashEquals, // /= Percent, // % PercentEquals, // %= Caret, // ^ CaretEquals, // ^= And, // & AndAnd, // && AndEquals, // &= Or, // | OrOr, // || OrEquals, // |= Tilde, // ~ Equal, // = EqualEqual, // == NotEqual, // != ShiftLeft, // << ShiftLeftEquals,// <<= ShiftRight, // >> ShiftRightEquals, // >>= // Special marker token to indicate end of variable-character tokens SpanEnd, } pub(crate) struct Token { pub kind: TokenKind, pub pos: InputPosition, } impl Token { fn new(kind: TokenKind, pos: InputPosition) -> Self { Self{ kind, pos } } } #[derive(Debug, PartialEq, Eq)] pub(crate) enum TokenRangeKind { Module, Pragma, Import, Definition, Code, } #[derive(Debug)] struct TokenRange { // Index of parent in `TokenBuffer.ranges`, does not have a parent if the // range kind is Module, in that case the parent index points to itself. parent_idx: usize, range_kind: TokenRangeKind, curly_depth: i32, start: usize, end: usize, subranges: usize, } pub(crate) struct TokenBuffer { pub tokens: Vec, pub ranges: Vec, } impl TokenBuffer { pub(crate) fn new() -> Self { Self{ tokens: Vec::new(), ranges: Vec::new() } } } // Tokenizer is a reusable parser to tokenize multiple source files using the // same allocated buffers. In a well-formed program, we produce a consistent // tree of token ranges such that we may identify tokens that represent a // defintion or an import before producing the entire AST. // // If the program is not well-formed then the tree may be inconsistent, but we // will detect this once we transform the tokens into the AST. Maybe we want to // detect a mismatch in opening/closing curly braces in the future? pub(crate) struct Tokenizer { // Signed because programmer might have placed too many closing curly braces curly_depth: i32, // Points to an element in the `TokenBuffer.ranges` variable. stack_idx: usize, } impl Tokenizer { pub(crate) fn new() -> Self { Self{ curly_depth: 0, stack_idx: 0 } } pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { // Assert source and buffer are at start debug_assert_eq!(source.pos().offset, 0); debug_assert!(target.tokens.is_empty()); debug_assert!(target.ranges.is_empty()); // Set up for tokenization by pushing the first range onto the stack. // This range may get transformed into the appropriate range kind later, // see `push_range` and `pop_range`. self.curly_depth = 0; self.stack_idx = 0; target.ranges.push(TokenRange{ parent_idx: 0, range_kind: TokenRangeKind::Module, curly_depth: 0, start: 0, end: 0, subranges: 0, }); // Main tokenization loop while let Some(c) = source.next() { let token_index = target.tokens.len(); if is_char_literal_start(c) { self.consume_char_literal(source, target)?; } else if is_string_literal_start(c) { self.consume_string_literal(source, target)?; } else if is_identifier_start(c) { let ident = self.consume_identifier(source, target)?; if demarks_definition(ident) { self.push_range(target, TokenRangeKind::Definition, token_index); } else if demarks_import(ident) { self.push_range(target, TokenRangeKind::Import, token_index); } } else if is_integer_literal_start(c) { self.consume_number(source, target)?; } else if is_pragma_start_or_pound(c) { let was_pragma = self.consume_pragma_or_pound(c, source, target)?; if was_pragma { self.push_range(target, TokenRangeKind::Pragma, token_index); } } else if self.is_line_comment_start(c, source) { self.consume_line_comment(source, target)?; } else if self.is_block_comment_start(c, source) { self.consume_block_comment(source, target)?; } else if is_whitespace(c) { let contained_newline = self.consume_whitespace(source); if contained_newline { let range = &target.ranges[self.stack_idx]; if range.range_kind == TokenRangeKind::Pragma { self.pop_range(target, target.tokens.len()); } } } else { let was_punctuation = self.maybe_parse_punctuation(c, source, target)?; if let Some(token) = was_punctuation { if token == TokenKind::OpenCurly { self.curly_depth += 1; } else if token == TokenKind::CloseCurly { // Check if this marks the end of a range we're // currently processing self.curly_depth -= 1; let range = &target.ranges[self.stack_idx]; if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_depth { self.pop_range(target, target.tokens.len()); } } else if token == TokenKind::SemiColon { // Check if this marks the end of an import let range = &target.ranges[self.stack_idx]; if range.range_kind == TokenRangeKind::Import { self.pop_range(target, target.tokens.len()); } } } else { return Err(ParseError::new_error(source, source.pos(), "unexpected character")); } } } // End of file, check if our state is correct if let Some(error) = source.had_error.take() { return Err(error); } Ok(()) } fn is_line_comment_start(&self, first_char: u8, source: &InputSource) -> bool { return first_char == b'/' && Some(b'/') == source.lookahead(1); } fn is_block_comment_start(&self, first_char: u8, source: &InputSource) -> bool { return first_char == b'/' && Some(b'*') == source.lookahead(1); } fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result, ParseError> { debug_assert!(first_char != b'#', "'#' needs special handling"); debug_assert!(first_char != b'\'', "'\'' needs special handling"); debug_assert!(first_char != b'"', "'\"' needs special handling"); let pos = source.pos(); let token_kind; if first_char == b'!' { source.consume(); if Some(b'=') == source.next() { source.consume(); token_kind = TokenKind::NotEqual; } else { token_kind = TokenKind::Exclamation; } } else if first_char == b'%' { source.consume(); if Some(b'=') == source.next() { source.consume(); token_kind = TokenKind::PercentEquals; } else { token_kind = TokenKind::Percent; } } else if first_char == b'&' { source.consume(); let next = source.next(); if Some(b'&') == next { source.consume(); token_kind = TokenKind::AndAnd; } else if Some(b'=') == next { source.consume(); token_kind = TokenKind::AndEquals; } else { token_kind = TokenKind::And; } } else if first_char == b'(' { source.consume(); token_kind = TokenKind::OpenParen; } else if first_char == b')' { source.consume(); token_kind = TokenKind::CloseParen; } else if first_char == b'*' { source.consume(); if let Some(b'=') = source.next() { source.consume(); token_kind = TokenKind::StarEquals; } else { token_kind = TokenKind::Star; } } else if first_char == b'+' { source.consume(); let next = source.next(); if Some(b'+') == next { source.consume(); token_kind = TokenKind::PlusPlus; } else if Some(b'=') == next { source.consume(); token_kind = TokenKind::PlusEquals; } else { token_kind = TokenKind::Plus; } } else if first_char == b',' { source.consume(); token_kind = TokenKind::Comma; } else if first_char == b'-' { source.consume(); let next = source.next(); if Some(b'-') == next { source.consume(); token_kind = TokenKind::MinusMinus; } else if Some(b'>') == next { source.consume(); token_kind = TokenKind::ArrowRight; } else if Some(b'=') == next { source.consume(); token_kind = TokenKind::MinusEquals; } else { token_kind = TokenKind::Minus; } } else if first_char == b'.' { source.consume(); if let Some(b'.') = source.next() { source.consume(); token_kind = TokenKind::DotDot; } else { token_kind = TokenKind::Dot } } else if first_char == b'/' { source.consume(); debug_assert_ne!(Some(b'/'), source.next()); debug_assert_ne!(Some(b'*'), source.next()); if let Some(b'=') = source.next() { source.consume(); token_kind = TokenKind::SlashEquals; } else { token_kind = TokenKind::Slash; } } else if first_char == b':' { source.consume(); if let Some(b':') = source.next() { source.consume(); token_kind = TokenKind::ColonColon; } else { token_kind = TokenKind::Colon; } } else if first_char == b';' { source.consume(); token_kind = TokenKind::SemiColon; } else if first_char == b'<' { source.consume(); if let Some(b'<') = source.next() { source.consume(); if let Some(b'=') = source.next() { source.consume(); token_kind = TokenKind::ShiftLeftEquals; } else { token_kind = TokenKind::ShiftLeft; } } else { token_kind = TokenKind::OpenAngle; } } else if first_char == b'=' { source.consume(); if let Some(b'=') = source.next() { source.consume(); token_kind = TokenKind::EqualEqual; } else { token_kind = TokenKind::Equal; } } else if first_char == b'>' { source.consume(); if let Some(b'>') = source.next() { source.consume(); if let Some(b'=') = source.next() { source.consume(); token_kind = TokenKind::ShiftRightEquals; } else { token_kind = TokenKind::ShiftRight; } } else { token_kind = TokenKind::CloseAngle; } } else if first_char == b'?' { source.consume(); token_kind = TokenKind::Question; } else if first_char == b'@' { source.consume(); token_kind = TokenKind::At; } else if first_char == b'[' { source.consume(); token_kind = TokenKind::OpenSquare; } else if first_char == b']' { source.consume(); token_kind = TokenKind::CloseSquare; } else if first_char == b'^' { source.consume(); if let Some(b'=') = source.next() { source.consume(); token_kind = TokenKind::CaretEquals; } else { token_kind = TokenKind::Caret; } } else if first_char == b'{' { source.consume(); token_kind = TokenKind::OpenCurly; } else if first_char == b'|' { source.consume(); let next = source.next(); if Some(b'|') == next { source.consume(); token_kind = TokenKind::OrOr; } else if Some(b'=') == next { source.consume(); token_kind = TokenKind::OrEquals; } else { token_kind = TokenKind::Or; } } else if first_char == b'}' { source.consume(); token_kind = TokenKind::CloseCurly } else { self.check_ascii(source)?; return Ok(None); } target.tokens.push(Token::new(token_kind, pos)); Ok(Some(token_kind)) } fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { let begin_pos = source.pos(); // Consume the leading quote debug_assert!(source.next().unwrap() == b'\''); source.consume(); let mut prev_char = b'\''; while let Some(c) = source.next() { if !c.is_ascii() { return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in char literal")); } source.consume(); // Make sure ending quote was not escaped if c == b'\'' && prev_char != b'\\' { prev_char = c; break; } prev_char = c; } if prev_char != b'\'' { // Unterminated character literal, reached end of file. return Err(ParseError::new_error(source, begin_pos, "encountered unterminated character literal")); } let end_pos = source.pos(); target.tokens.push(Token::new(TokenKind::Character, begin_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); Ok(()) } fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { let begin_pos = source.pos(); // Consume the leading double quotes debug_assert!(source.next().unwrap() == b'"'); source.consume(); let mut prev_char = b'"'; while let Some(c) = source.next() { if !c.is_ascii() { return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in string literal")); } source.consume(); if c == b'"' && prev_char != b'\\' { prev_char = c; break; } prev_char = c; } if prev_char != b'"' { // Unterminated string literal return Err(ParseError::new_error(source, begin_pos, "encountered unterminated string literal")); } let end_pos = source.pos(); target.tokens.push(Token::new(TokenKind::String, begin_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); Ok(()) } fn consume_pragma_or_pound(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result { let start_pos = source.pos(); debug_assert_eq!(first_char, b'#'); source.consume(); let next = source.next(); if next.is_none() || !is_identifier_start(next.unwrap()) { // Just a pound sign target.tokens.push(Token::new(TokenKind::Pound, start_pos)); Ok(false) } else { // Pound sign followed by identifier source.consume(); while let Some(c) = source.next() { if !is_identifier_remaining(c) { break; } source.consume(); } self.check_ascii(source)?; let end_pos = source.pos(); target.tokens.push(Token::new(TokenKind::Pragma, start_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); Ok(true) } } fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { let begin_pos = source.pos(); // Consume the leading "//" debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'/'); source.consume(); source.consume(); let mut prev_char = b'/'; let mut cur_char = b'/'; while let Some(c) = source.next() { prev_char = cur_char; cur_char = c; if c == b'\n' { // End of line, note that the newline is not consumed break; } source.consume(); } let mut end_pos = source.pos(); debug_assert_eq!(begin_pos.line, end_pos.line); // Modify offset to not include the newline characters if cur_char == b'\n' { if prev_char == b'\r' { end_pos.offset -= 2; } else { end_pos.offset -= 1; } // Consume final newline source.consume(); } else { // End of comment was due to EOF debug_assert!(source.next().is_none()) } target.tokens.push(Token::new(TokenKind::LineComment, begin_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); Ok(()) } fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { let begin_pos = source.pos(); // Consume the leading "/*" debug_assert!(source.next().unwrap() == b'/' && source.lookahead(1).unwrap() == b'*'); source.consume(); source.consume(); // Explicitly do not put prev_char at "*", because then "/*/" would // represent a valid and closed block comment let mut prev_char = b' '; let mut is_closed = false; while let Some(c) = source.next() { source.consume(); if prev_char == b'*' && c == b'/' { // End of block comment is_closed = true; break; } prev_char = c; } if !is_closed { return Err(ParseError::new_error(source, source.pos(), "encountered unterminated block comment")); } let end_pos = source.pos(); target.tokens.push(Token::new(TokenKind::BlockComment, begin_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); Ok(()) } fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> { let begin_pos = source.pos(); debug_assert!(is_identifier_start(source.next().unwrap())); source.consume(); // Keep reading until no more identifier while let Some(c) = source.next() { if !is_identifier_remaining(c) { break; } source.consume(); } self.check_ascii(source)?; let end_pos = source.pos(); target.tokens.push(Token::new(TokenKind::Ident, begin_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); Ok(source.section(begin_pos.offset, end_pos.offset)) } fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { let begin_pos = source.pos(); debug_assert!(is_integer_literal_start(source.next().unwrap())); source.consume(); // Keep reading until it doesn't look like a number anymore while let Some(c) = source.next() { if !maybe_number_remaining(c) { break; } source.consume(); } self.check_ascii(source)?; let end_pos = source.pos(); target.tokens.push(Token::new(TokenKind::Integer, begin_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); Ok(()) } // Consumes whitespace and returns whether or not the whitespace contained // a newline. fn consume_whitespace(&self, source: &mut InputSource) -> bool { debug_assert!(is_whitespace(source.next().unwrap())); let mut has_newline = false; while let Some(c) = source.next() { if !is_whitespace(c) { break; } if c == b'\n' { has_newline = true; } source.consume(); } has_newline } /// Pushes a new token range onto the stack in the buffers. fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) { let cur_range = &mut target.ranges[self.stack_idx]; println!( "DEBUG: push_range [1] | stack_idx: {}, range_end: {}, first_token: {}", self.stack_idx, cur_range.end, first_token ); // If we have just popped a range and then push a new range, then the // first token is equal to the last token registered on the current // range. If not, then we had some intermediate tokens that did not // belong to a particular kind of token range: hence we insert an // intermediate "code" range. if cur_range.end != first_token { println!("DEBUG: push_range [2] | inserting code range"); let code_start = cur_range.end; cur_range.end = first_token; cur_range.subranges += 1; target.ranges.push(TokenRange{ parent_idx: self.stack_idx, range_kind: TokenRangeKind::Code, curly_depth: self.curly_depth, start: code_start, end: first_token, subranges: 0, }); } // Insert a new range println!( "DEBUG: push_range [3] | kind: {:?}, parent_idx: {}, stack_idx: {}", range_kind, self.stack_idx, target.ranges.len() ); let parent_idx = self.stack_idx; self.stack_idx = target.ranges.len(); target.ranges.push(TokenRange{ parent_idx, range_kind, curly_depth: self.curly_depth, start: first_token, end: first_token, subranges: 0, }); } fn pop_range(&mut self, target: &mut TokenBuffer, end_index: usize) { let last = &mut target.ranges[self.stack_idx]; debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range"); // Fix up the current range before going back to parent println!( "DEBUG: pop_range [1] | stack_idx: {}, kind: {:?}, start: {}, old_end: {}, new_end: {}", self.stack_idx, last.range_kind, last.start, last.end, end_index ); last.end = end_index; // Go back to parent self.stack_idx = last.parent_idx; let parent = &mut target.ranges[self.stack_idx]; parent.end = end_index; parent.subranges += 1; println!( "DEBUG: pop_range [2] | returning to kind: {:?}, idx: {}, new_end: {}", parent.range_kind, self.stack_idx, end_index ); } fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> { match source.next() { Some(c) if !c.is_ascii() => { Err(ParseError::new_error(source, source.pos(), "encountered a non-ASCII character")) }, _else => { Ok(()) }, } } } // Helpers for characters fn demarks_definition(ident: &[u8]) -> bool { return ident == b"struct" || ident == b"enum" || ident == b"union" || ident == b"func" || ident == b"primitive" || ident == b"composite" } fn demarks_import(ident: &[u8]) -> bool { return ident == b"import"; } fn is_whitespace(c: u8) -> bool { c.is_ascii_whitespace() } fn is_char_literal_start(c: u8) -> bool { return c == b'\''; } fn is_string_literal_start(c: u8) -> bool { return c == b'"'; } fn is_pragma_start_or_pound(c: u8) -> bool { return c == b'#'; } fn is_identifier_start(c: u8) -> bool { return (c >= b'a' && c <= b'z') || (c >= b'A' && c <= b'Z') || c == b'_' } fn is_identifier_remaining(c: u8) -> bool { return (c >= b'0' && c <= b'9') || (c >= b'a' && c <= b'z') || (c >= b'A' && c <= b'Z') || c == b'_' } fn is_integer_literal_start(c: u8) -> bool { return c >= b'0' && c <= b'9'; } fn maybe_number_remaining(c: u8) -> bool { return (c == b'b' || c == b'B' || c == b'o' || c == b'O' || c == b'x' || c == b'X') || (c >= b'0' && c <= b'9') || c == b'_'; } #[cfg(test)] mod tests { use super::*; // TODO: Remove at some point #[test] fn test_tokenizer() { let mut source = InputSource::new_test(" #version 500 # hello 2 import std.reo::*; struct Thing { int a: 5, } enum Hello { A, B } // Hello hello, is it me you are looking for? // I can seee it in your eeeyes func something(int a, int b, int c) -> byte { int a = 5; struct Inner { int a } struct City { int b } /* Waza How are you doing Things in here yo /* */ */ a = a + 5 * 2; struct Pressure { int d } } "); let mut t = Tokenizer::new(); let mut buffer = TokenBuffer::new(); t.tokenize(&mut source, &mut buffer).expect("tokenize"); println!("Ranges:\n"); for (idx, range) in buffer.ranges.iter().enumerate() { println!("[{}] {:?}", idx, range) } println!("Tokens:\n"); let mut iter = buffer.tokens.iter().enumerate(); while let Some((idx, token)) = iter.next() { match token.kind { TokenKind::Ident | TokenKind::Pragma | TokenKind::Integer | TokenKind::String | TokenKind::Character | TokenKind::LineComment | TokenKind::BlockComment => { let (_, end) = iter.next().unwrap(); println!("[{}] {:?} ......", idx, token.kind); assert_eq!(end.kind, TokenKind::SpanEnd); let text = source.section(token.pos.offset, end.pos.offset); println!("{}", String::from_utf8_lossy(text)); }, _ => { println!("[{}] {:?}", idx, token.kind); } } } } }