From 33aa370aeb00f0dc954d522a28efd6dd6d50e55b 2021-04-12 10:29:43
From: MH
Date: 2021-04-12 10:29:43
Subject: [PATCH] Finished tokenizer, pending new lexer

---

diff --git a/src/protocol/input_source2.rs b/src/protocol/input_source2.rs
index 025b86ddbfb0a0f77f8fd9f3dffe5a5cfad59218..54888d0e7e255ce861097b47abbf4503a7c3ae2e 100644
--- a/src/protocol/input_source2.rs
+++ b/src/protocol/input_source2.rs
@@ -28,7 +28,7 @@ pub struct InputSource2 {
     line: u32,
     offset: usize,
     // State tracking
-    had_error: Option<ParseError>,
+    pub(crate) had_error: Option<ParseError>,
     // The offset_lookup is built on-demand upon attempting to report an error.
     // As the compiler is currently not multithreaded, we simply put it in a
     // RefCell to allow interior mutability.
@@ -47,6 +47,12 @@ impl InputSource2 {
         }
     }
 
+    #[cfg(test)]
+    pub fn new_test(input: &str) -> Self {
+        let bytes = Vec::from(input.as_bytes());
+        return Self::new(String::from("test"), bytes)
+    }
+
     #[inline]
     pub fn pos(&self) -> InputPosition2 {
         InputPosition2{ line: self.line, offset: self.offset as u32 }
@@ -120,7 +126,7 @@ impl InputSource2 {
         // Build the line number (!) to offset lookup, so offset by 1. We
         // assume the entire source file is scanned (most common case) for
         // preallocation.
-        let lookup = self.offset_lookup.borrow_mut();
+        let mut lookup = self.offset_lookup.borrow_mut();
         lookup.reserve(self.line as usize + 2);
         lookup.push(0); // line 0: never used
         lookup.push(0); // first line: first character
diff --git a/src/protocol/inputsource.rs b/src/protocol/inputsource.rs
index 756f51cc32c3153a8d629f85f7c3ef02942d0521..1442d6452d2892b47b7b5d09b2f72b3bbcf9d591 100644
--- a/src/protocol/inputsource.rs
+++ b/src/protocol/inputsource.rs
@@ -146,9 +146,9 @@ impl fmt::Display for InputSource {
 
 #[derive(Debug, Clone, Copy)]
 pub struct InputPosition {
-    line: usize,
-    column: usize,
-    pub(crate) offset: usize,
+    pub line: usize,
+    pub column: usize,
+    pub offset: usize,
 }
 
 impl InputPosition {
diff --git a/src/protocol/lexer2.rs b/src/protocol/lexer2.rs
index 68d39e006e7b820c0aa6e376dff1b21a7d4f3974..50e7ff55cb147ab4c7668d0576badd1e67713cc2 100644
--- a/src/protocol/lexer2.rs
+++ b/src/protocol/lexer2.rs
@@ -1,4 +1,24 @@
+use crate::protocol::Heap;
+use crate::protocol::tokenizer::{TokenBuffer, Token};
+use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError};
-pub struct Lexer<'a> {
-    source: &'a mut InputSource,
+struct Ctx<'a> {
+    heap: &'a mut Heap,
+    source: &'a InputSource,
+    tokens: &'a TokenBuffer,
+}
+
+// Lexes definitions. Should be the first pass over each of the module files
+// after tokenization. Only once all definitions are parsed can we do the full
+// AST creation pass.
+struct LexerDefinitions {
+
+}
+
+impl LexerDefinitions {
+    pub(crate) fn parse(ctx: &mut Ctx) -> Result<(), ParseError> {
+        debug_assert!(ctx.tokens.ranges.len() > 0);
+    }
+
+    pub(crate) fn parse_definition(heap: &mut Heap, source: &InputSource, range: &TokenRange)
 }
\ No newline at end of file
diff --git a/src/protocol/tests/utils.rs b/src/protocol/tests/utils.rs
index 34bba8faaac0ac6023bdd85dad38e5dc1e67f7ab..248844a38e89aa14307369f43b870804a1610c50 100644
--- a/src/protocol/tests/utils.rs
+++ b/src/protocol/tests/utils.rs
@@ -757,7 +757,7 @@ impl<'a> ErrorTester<'a> {
             self.test_name, pattern, self.assert_postfix()
         );
         let pos = pos.unwrap();
-        let col = self.error.statements[idx].position.col();
+        let col = self.error.statements[idx].position.column;
         assert_eq!(
             pos + 1, col,
             "[{}] Expected error to occur at column {}, but found it at {} for {}",
diff --git a/src/protocol/tokenizer/mod.rs b/src/protocol/tokenizer/mod.rs
index 9f5cf247e7558d0a1025d48109be77c9fdd10546..fe6105ad9dae355136099a86932474e7a287bed0 100644
--- a/src/protocol/tokenizer/mod.rs
+++ b/src/protocol/tokenizer/mod.rs
@@ -1,15 +1,16 @@
 use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError, InputPosition2 as InputPosition, InputSpan};
 
-#[derive(Clone, Copy, PartialEq, Eq)]
-enum TokenKind {
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum TokenKind {
     // Variable-character tokens, followed by a SpanEnd token
-    Ident,
-    Integer,
-    String,
-    Character,
-    LineComment,
-    BlockComment,
+    Ident,        // regular identifier
+    Pragma,       // identifier with prefixed `#`, range includes `#`
+    Integer,      // integer literal
+    String,       // string literal, range includes `"`
+    Character,    // character literal, range includes `'`
+    LineComment,  // line comment, range includes leading `//`, but not newline
+    BlockComment, // block comment, range includes leading `/*` and trailing `*/`
     // Punctuation
     Exclamation,  // !
     Question,     // ?
@@ -65,9 +66,9 @@ enum TokenKind {
     SpanEnd,
 }
 
-struct Token {
-    kind: TokenKind,
-    pos: InputPosition, // probably need something different
+pub(crate) struct Token {
+    pub kind: TokenKind,
+    pub pos: InputPosition,
 }
 
 impl Token {
@@ -77,7 +78,7 @@ impl Token {
 }
 
 #[derive(Debug, PartialEq, Eq)]
-enum TokenRangeKind {
+pub(crate) enum TokenRangeKind {
     Module,
     Pragma,
     Import,
@@ -85,25 +86,27 @@ enum TokenRangeKind {
     Code,
 }
 
+#[derive(Debug)]
 struct TokenRange {
     // Index of parent in `TokenBuffer.ranges`, does not have a parent if the
     // range kind is Module, in that case the parent index points to itself.
     parent_idx: usize,
     range_kind: TokenRangeKind,
-    curly_depth: u8,
+    curly_depth: i32,
     start: usize,
     end: usize,
     subranges: usize,
 }
 
-struct TokenBuffer {
-    tokens: Vec<Token>,
-    ranges: Vec<TokenRange>,
+pub(crate) struct TokenBuffer {
+    pub tokens: Vec<Token>,
+    pub ranges: Vec<TokenRange>,
 }
 
-struct ParseState {
-    kind: TokenRangeKind,
-    start: usize,
+impl TokenBuffer {
+    pub(crate) fn new() -> Self {
+        Self{ tokens: Vec::new(), ranges: Vec::new() }
+    }
 }
 
 // Tokenizer is a reusable parser to tokenize multiple source files using the
@@ -112,13 +115,19 @@ struct ParseState {
 // defintion or an import before producing the entire AST.
 //
 // If the program is not well-formed then the tree may be inconsistent, but we
-// will detect this once we transform the tokens into the AST.
+// will detect this once we transform the tokens into the AST. Maybe we want to
+// detect a mismatch in opening/closing curly braces in the future?
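+//
+// Illustrative sketch (an assumption for documentation purposes, not verified
+// by this patch's tests): for a well-formed module along the lines of
+//
+//     #version 500
+//     import std.reo::*;
+//     struct Thing { ... }
+//
+// `TokenBuffer.ranges` is expected to form a small tree rooted at a single
+// `Module` range at index 0 (whose `parent_idx` points to itself), with a
+// `Pragma`, an `Import` and a `Definition` subrange each pointing back to
+// index 0, and any leftover tokens wrapped in intermediate `Code` ranges by
+// `push_range`.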
 pub(crate) struct Tokenizer {
-    curly_depth: u8,
+    // Signed because the programmer might have placed too many closing curly braces
+    curly_depth: i32,
+    // Points to an element in the `TokenBuffer.ranges` variable.
     stack_idx: usize,
 }
 
 impl Tokenizer {
+    pub(crate) fn new() -> Self {
+        Self{ curly_depth: 0, stack_idx: 0 }
+    }
+
     pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         // Assert source and buffer are at start
         debug_assert_eq!(source.pos().offset, 0);
@@ -129,7 +138,7 @@ impl Tokenizer {
         // This range may get transformed into the appropriate range kind later,
         // see `push_range` and `pop_range`.
         self.curly_depth = 0;
-        self.stack_idx = 1;
+        self.stack_idx = 0;
         target.ranges.push(TokenRange{
             parent_idx: 0,
             range_kind: TokenRangeKind::Module,
@@ -139,7 +148,7 @@ impl Tokenizer {
             subranges: 0,
         });
 
-        // Main processing loop
+        // Main tokenization loop
         while let Some(c) = source.next() {
             let token_index = target.tokens.len();
 
@@ -157,15 +166,23 @@ impl Tokenizer {
                 }
             } else if is_integer_literal_start(c) {
                 self.consume_number(source, target)?;
-            } else if is_pragma_start(c) {
-                self.consume_pragma(c, source, target);
-                self.push_range(target, TokenRangeKind::Pragma, token_index);
+            } else if is_pragma_start_or_pound(c) {
+                let was_pragma = self.consume_pragma_or_pound(c, source, target)?;
+                if was_pragma {
+                    self.push_range(target, TokenRangeKind::Pragma, token_index);
+                }
             } else if self.is_line_comment_start(c, source) {
                 self.consume_line_comment(source, target)?;
             } else if self.is_block_comment_start(c, source) {
                 self.consume_block_comment(source, target)?;
             } else if is_whitespace(c) {
                 let contained_newline = self.consume_whitespace(source);
+                if contained_newline {
+                    let range = &target.ranges[self.stack_idx];
+                    if range.range_kind == TokenRangeKind::Pragma {
+                        self.pop_range(target, target.tokens.len());
+                    }
+                }
             } else {
                 let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
                 if let Some(token) = was_punctuation {
@@ -194,6 +211,10 @@ impl Tokenizer {
         }
 
         // End of file, check if our state is correct
+        if let Some(error) = source.had_error.take() {
+            return Err(error);
+        }
+
         Ok(())
     }
 
@@ -205,7 +226,7 @@ impl Tokenizer {
         return first_char == b'/' && Some(b'*') == source.lookahead(1);
     }
 
-    pub(crate) fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<Option<TokenKind>, ParseError> {
+    fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<Option<TokenKind>, ParseError> {
         debug_assert!(first_char != b'#', "'#' needs special handling");
         debug_assert!(first_char != b'\'', "'\'' needs special handling");
         debug_assert!(first_char != b'"', "'\"' needs special handling");
@@ -394,7 +415,7 @@ impl Tokenizer {
         Ok(Some(token_kind))
     }
 
-    pub(crate) fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading quote
@@ -403,7 +424,12 @@ impl Tokenizer {
         let mut prev_char = b'\'';
         while let Some(c) = source.next() {
+            if !c.is_ascii() {
+                return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in char literal"));
+            }
             source.consume();
+
+            // Make sure ending quote was not escaped
             if c == b'\'' && prev_char != b'\\' {
                 prev_char = c;
                 break;
             }
@@ -413,7 +439,7 @@ impl Tokenizer {
         }
 
         if prev_char != b'\'' {
-            // Unterminated character literal
+            // Unterminated character literal, reached end of file.
             return Err(ParseError::new_error(source, begin_pos, "encountered unterminated character literal"));
         }
 
@@ -425,7 +451,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading double quotes
@@ -434,6 +460,10 @@ impl Tokenizer {
         let mut prev_char = b'"';
         while let Some(c) = source.next() {
+            if !c.is_ascii() {
+                return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in string literal"));
+            }
+
             source.consume();
             if c == b'"' && prev_char != b'\\' {
                 prev_char = c;
@@ -455,15 +485,36 @@ impl Tokenizer {
         Ok(())
     }
 
-    fn consume_pragma(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) {
-        let pos = source.pos();
+    fn consume_pragma_or_pound(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<bool, ParseError> {
+        let start_pos = source.pos();
         debug_assert_eq!(first_char, b'#');
         source.consume();
 
-        target.tokens.push(Token::new(TokenKind::Pound, pos));
+        let next = source.next();
+        if next.is_none() || !is_identifier_start(next.unwrap()) {
+            // Just a pound sign
+            target.tokens.push(Token::new(TokenKind::Pound, start_pos));
+            Ok(false)
+        } else {
+            // Pound sign followed by identifier
+            source.consume();
+            while let Some(c) = source.next() {
+                if !is_identifier_remaining(c) {
+                    break;
+                }
+                source.consume();
+            }
+
+            self.check_ascii(source)?;
+
+            let end_pos = source.pos();
+            target.tokens.push(Token::new(TokenKind::Pragma, start_pos));
+            target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
+            Ok(true)
+        }
     }
 
-    pub(crate) fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading "//"
@@ -474,13 +525,15 @@ impl Tokenizer {
         let mut prev_char = b'/';
         let mut cur_char = b'/';
         while let Some(c) = source.next() {
-            source.consume();
+            prev_char = cur_char;
             cur_char = c;
+
             if c == b'\n' {
-                // End of line
+                // End of line, note that the newline is not consumed
                 break;
             }
-            prev_char = c;
+
+            source.consume();
         }
 
         let mut end_pos = source.pos();
@@ -493,7 +546,10 @@ impl Tokenizer {
             } else {
                 end_pos.offset -= 1;
             }
+            // Consume final newline
+            source.consume();
         } else {
+            // End of comment was due to EOF
            debug_assert!(source.next().is_none())
         }
 
@@ -503,7 +559,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading "/*"
@@ -536,7 +592,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
+    fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
         let begin_pos = source.pos();
         debug_assert!(is_identifier_start(source.next().unwrap()));
         source.consume();
@@ -557,7 +613,7 @@ impl Tokenizer {
         Ok(source.section(begin_pos.offset, end_pos.offset))
     }
 
-    pub(crate) fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
         debug_assert!(is_integer_literal_start(source.next().unwrap()));
         source.consume();
@@ -593,29 +649,46 @@ impl Tokenizer {
             if c == b'\n' {
                 has_newline = true;
             }
+            source.consume();
         }
 
         has_newline
     }
 
     /// Pushes a new token range onto the stack in the buffers.
-    fn push_range(&self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
-        let cur_range = &target.ranges[self.stack_idx];
-        let parent_idx = cur_range.parent_idx;
-        let parent_range = &target.ranges[parent_idx];
-        if parent_range.end != first_token {
-            // Insert intermediate range
+    fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
+        let cur_range = &mut target.ranges[self.stack_idx];
+
+        println!(
+            "DEBUG: push_range [1] | stack_idx: {}, range_end: {}, first_token: {}",
+            self.stack_idx, cur_range.end, first_token
+        );
+
+        // If we have just popped a range and then push a new range, then the
+        // first token is equal to the last token registered on the current
+        // range. If not, then we had some intermediate tokens that did not
+        // belong to a particular kind of token range: hence we insert an
+        // intermediate "code" range.
+        if cur_range.end != first_token {
+            println!("DEBUG: push_range [2] | inserting code range");
+            let code_start = cur_range.end;
+            cur_range.end = first_token;
+            cur_range.subranges += 1;
             target.ranges.push(TokenRange{
-                parent_idx,
+                parent_idx: self.stack_idx,
                 range_kind: TokenRangeKind::Code,
-                curly_depth: cur_range.curly_depth,
-                start: parent_range.end,
+                curly_depth: self.curly_depth,
+                start: code_start,
                 end: first_token,
                 subranges: 0,
             });
         }
 
         // Insert a new range
+        println!(
+            "DEBUG: push_range [3] | kind: {:?}, parent_idx: {}, stack_idx: {}",
+            range_kind, self.stack_idx, target.ranges.len()
+        );
         let parent_idx = self.stack_idx;
         self.stack_idx = target.ranges.len();
         target.ranges.push(TokenRange{
@@ -628,16 +701,26 @@ impl Tokenizer {
         });
     }
 
-    fn pop_range(&self, target: &mut TokenBuffer, end_index: usize) {
-        // Pop all the dummy ranges that are left on the range stack
+    fn pop_range(&mut self, target: &mut TokenBuffer, end_index: usize) {
         let last = &mut target.ranges[self.stack_idx];
         debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range");
 
+        // Fix up the current range before going back to parent
+        println!(
+            "DEBUG: pop_range [1] | stack_idx: {}, kind: {:?}, start: {}, old_end: {}, new_end: {}",
+            self.stack_idx, last.range_kind, last.start, last.end, end_index
+        );
         last.end = end_index;
-        self.stack_idx = last.parent_idx as usize;
+
+        // Go back to parent
+        self.stack_idx = last.parent_idx;
         let parent = &mut target.ranges[self.stack_idx];
         parent.end = end_index;
         parent.subranges += 1;
+        println!(
+            "DEBUG: pop_range [2] | returning to kind: {:?}, idx: {}, new_end: {}",
+            parent.range_kind, self.stack_idx, end_index
+        );
     }
@@ -680,7 +763,7 @@ fn is_string_literal_start(c: u8) -> bool {
     return c == b'"';
 }
 
-fn is_pragma_start(c: u8) -> bool {
+fn is_pragma_start_or_pound(c: u8) -> bool {
     return c == b'#';
 }
 
@@ -708,4 +791,78 @@ fn maybe_number_remaining(c: u8) -> bool {
     return (c == b'b' || c == b'B' || c == b'o' || c == b'O' || c == b'x' || c == b'X') ||
         (c >= b'0' && c <= b'9') || c == b'_';
 }
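+
+// Illustrative note (sketch only): every variable-length token is followed by
+// a `SpanEnd` token, so the original text of e.g. an identifier at index `i`
+// can be recovered from the source as
+//
+//     let end = &buffer.tokens[i + 1]; // expected to be TokenKind::SpanEnd
+//     let text = source.section(buffer.tokens[i].pos.offset, end.pos.offset);
+//
+// The test below uses exactly this pattern when printing the token stream.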
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // TODO: Remove at some point
+    #[test]
+    fn test_tokenizer() {
+        let mut source = InputSource::new_test("
+
+        #version 500
+        # hello 2
+
+        import std.reo::*;
+
+        struct Thing {
+            int a: 5,
+        }
+        enum Hello {
+            A,
+            B
+        }
+
+        // Hello hello, is it me you are looking for?
+        // I can seee it in your eeeyes
+
+        func something(int a, int b, int c) -> byte {
+            int a = 5;
+            struct Inner {
+                int a
+            }
+            struct City {
+                int b
+            }
+            /* Waza
+            How are you doing
+            Things in here yo
+            /* */ */
+
+            a = a + 5 * 2;
+            struct Pressure {
+                int d
+            }
+        }
+        ");
+        let mut t = Tokenizer::new();
+        let mut buffer = TokenBuffer::new();
+        t.tokenize(&mut source, &mut buffer).expect("tokenize");
+
+        println!("Ranges:\n");
+        for (idx, range) in buffer.ranges.iter().enumerate() {
+            println!("[{}] {:?}", idx, range)
+        }
+
+        println!("Tokens:\n");
+        let mut iter = buffer.tokens.iter().enumerate();
+        while let Some((idx, token)) = iter.next() {
+            match token.kind {
+                TokenKind::Ident | TokenKind::Pragma | TokenKind::Integer |
+                TokenKind::String | TokenKind::Character | TokenKind::LineComment |
+                TokenKind::BlockComment => {
+                    let (_, end) = iter.next().unwrap();
+                    println!("[{}] {:?} ......", idx, token.kind);
+                    assert_eq!(end.kind, TokenKind::SpanEnd);
+                    let text = source.section(token.pos.offset, end.pos.offset);
+                    println!("{}", String::from_utf8_lossy(text));
+                },
+                _ => {
+                    println!("[{}] {:?}", idx, token.kind);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
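
A minimal sketch, not part of the patch: the definition-lexing pass named in the
subject line is still a stub (`LexerDefinitions::parse`), but given the structures
above it would plausibly walk the token ranges marked as definitions before the
full AST pass. The helper below is an illustration only; it assumes the
`TokenRange` fields become accessible to the lexer (as `TokenBuffer`'s fields
already are), and that a definition range starts at its defining keyword.

    use crate::protocol::tokenizer::{TokenBuffer, TokenKind, TokenRangeKind};
    use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError};

    // Sketch: visit every range the tokenizer marked as a definition.
    fn visit_definitions(source: &InputSource, buffer: &TokenBuffer) -> Result<(), ParseError> {
        for range in buffer.ranges.iter() {
            if range.range_kind != TokenRangeKind::Definition {
                continue;
            }
            // `start`/`end` are token indices; the first token is assumed to be
            // the defining keyword (e.g. `struct`, `enum`, `func`), stored as an Ident.
            let first = &buffer.tokens[range.start];
            debug_assert_eq!(first.kind, TokenKind::Ident);
            // ...pre-register the definition's identifier in the Heap here...
        }
        Ok(())
    }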