Changeset - 33aa370aeb00
MH <henger@cwi.nl> - 2021-04-12 10:29:43
Finished tokenizer, pending new lexer
5 files changed with 244 insertions and 61 deletions:
src/protocol/input_source2.rs
 
@@ -28,7 +28,7 @@ pub struct InputSource2 {
     line: u32,
     offset: usize,
     // State tracking
-    had_error: Option<ParseError>,
+    pub(crate) had_error: Option<ParseError>,
     // The offset_lookup is built on-demand upon attempting to report an error.
     // As the compiler is currently not multithreaded, we simply put it in a
     // RefCell to allow interior mutability.
 
@@ -47,6 +47,12 @@ impl InputSource2 {
         }
     }
 
+    #[cfg(test)]
+    pub fn new_test(input: &str) -> Self {
+        let bytes = Vec::from(input.as_bytes());
+        return Self::new(String::from("test"), bytes)
+    }
+
     #[inline]
     pub fn pos(&self) -> InputPosition2 {
         InputPosition2{ line: self.line, offset: self.offset as u32 }
 
@@ -120,7 +126,7 @@ impl InputSource2 {
         // Build the line number (!) to offset lookup, so offset by 1. We
         // assume the entire source file is scanned (most common case) for
         // preallocation.
-        let lookup = self.offset_lookup.borrow_mut();
+        let mut lookup = self.offset_lookup.borrow_mut();
         lookup.reserve(self.line as usize + 2);
         lookup.push(0); // line 0: never used
         lookup.push(0); // first line: first character
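Editorial note on the `let mut lookup` fix above: `borrow_mut()` hands back a `RefMut` guard, and calling `&mut self` methods such as `reserve` and `push` through it requires the binding itself to be mutable. A minimal sketch of the on-demand lookup behind a `RefCell` (hypothetical names, simplified from the real `InputSource2`):

    use std::cell::RefCell;

    struct Source {
        text: Vec<u8>,
        // Built lazily; the compiler is single-threaded, so a RefCell giving
        // interior mutability is enough.
        offset_lookup: RefCell<Vec<u32>>,
    }

    impl Source {
        // Maps a 1-based line number to the byte offset of its first character.
        fn line_offset(&self, line: u32) -> u32 {
            let mut lookup = self.offset_lookup.borrow_mut(); // binding must be `mut`
            if lookup.is_empty() {
                lookup.push(0); // line 0: never used
                lookup.push(0); // first line: first character
                for (offset, &b) in self.text.iter().enumerate() {
                    if b == b'\n' {
                        lookup.push(offset as u32 + 1);
                    }
                }
            }
            lookup[line as usize]
        }
    }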
src/protocol/inputsource.rs
 
@@ -146,9 +146,9 @@ impl fmt::Display for InputSource {
 
 #[derive(Debug, Clone, Copy)]
 pub struct InputPosition {
-    line: usize,
-    column: usize,
-    pub(crate) offset: usize,
+    pub line: usize,
+    pub column: usize,
+    pub offset: usize,
 }
 
 impl InputPosition {
src/protocol/lexer2.rs
 
 use crate::protocol::Heap;
 use crate::protocol::tokenizer::{TokenBuffer, Token};
 use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError};
 
-pub struct Lexer<'a> {
-    source: &'a mut InputSource,
+struct Ctx<'a> {
+    heap: &'a mut Heap,
+    source: &'a InputSource,
+    tokens: &'a TokenBuffer,
 }
 
 // Lexes definitions. Should be the first pass over each of the module files
 // after tokenization. Only once all definitions are parsed can we do the full
 // AST creation pass.
 struct LexerDefinitions {
 
 }
 
 impl LexerDefinitions {
     pub(crate) fn parse(ctx: &mut Ctx) -> Result<(), ParseError> {
         debug_assert!(ctx.tokens.ranges.len() > 0);
     }
 
     pub(crate) fn parse_definition(heap: &mut Heap, source: &InputSource, range: &TokenRang)
 }
\ No newline at end of file
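The comment block above pins down the pass ordering; a hypothetical illustration (invented names) of why collecting definitions first makes forward references resolvable during AST creation:

    // Pass 1 produces the full list of definition names; pass 2 resolves
    // identifiers against it, so source order no longer matters.
    fn resolve(name: &str, definitions: &[&str]) -> Result<usize, String> {
        definitions.iter().position(|d| *d == name)
            .ok_or_else(|| format!("unknown definition '{}'", name))
    }

    // E.g. for `func a() { b(); }  func b() {}` pass 1 yields ["a", "b"],
    // so resolving "b" inside the body of `a` succeeds even though `b` is
    // defined later in the file.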
src/protocol/tests/utils.rs
 
@@ -757,7 +757,7 @@ impl<'a> ErrorTester<'a> {
             self.test_name, pattern, self.assert_postfix()
         );
         let pos = pos.unwrap();
-        let col = self.error.statements[idx].position.col();
+        let col = self.error.statements[idx].position.column;
         assert_eq!(
             pos + 1, col,
             "[{}] Expected error to occur at column {}, but found it at {} for {}",
src/protocol/tokenizer/mod.rs
 
 use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError, InputPosition2 as InputPosition, InputSpan};
 
-#[derive(Clone, Copy, PartialEq, Eq)]
-enum TokenKind {
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum TokenKind {
     // Variable-character tokens, followed by a SpanEnd token
-    Ident,
-    Integer,
-    String,
-    Character,
-    LineComment,
-    BlockComment,
+    Ident,          // regular identifier
+    Pragma,         // identifier with prefixed `#`, range includes `#`
+    Integer,        // integer literal
+    String,         // string literal, range includes `"`
+    Character,      // character literal, range includes `'`
+    LineComment,    // line comment, range includes leading `//`, but not newline
+    BlockComment,   // block comment, range includes leading `/*` and trailing `*/`
     // Punctuation
     Exclamation,    // !
     Question,       // ?
 
@@ -65,9 +66,9 @@ enum TokenKind {
     SpanEnd,
 }
 
-struct Token {
-    kind: TokenKind,
-    pos: InputPosition, // probably need something different
+pub(crate) struct Token {
+    pub kind: TokenKind,
+    pub pos: InputPosition,
 }
 
 impl Token {
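Note on the span encoding: a variable-length token stores only its start position, and the `SpanEnd` token pushed right after it carries the end. A sketch of recovering the token text this way, mirroring the test at the bottom of this file (the exact signature of `section` is assumed):

    fn token_text<'a>(source: &'a InputSource, tokens: &[Token], idx: usize) -> &'a [u8] {
        let end = &tokens[idx + 1]; // the SpanEnd companion token
        debug_assert_eq!(end.kind, TokenKind::SpanEnd);
        source.section(tokens[idx].pos.offset, end.pos.offset)
    }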
 
@@ -77,7 +78,7 @@ impl Token {
 }
 
 #[derive(Debug, PartialEq, Eq)]
-enum TokenRangeKind {
+pub(crate) enum TokenRangeKind {
     Module,
     Pragma,
     Import,
 
@@ -85,25 +86,27 @@ enum TokenRangeKind {
     Code,
 }
 
 #[derive(Debug)]
 struct TokenRange {
     // Index of parent in `TokenBuffer.ranges`, does not have a parent if the
     // range kind is Module, in that case the parent index points to itself.
     parent_idx: usize,
     range_kind: TokenRangeKind,
-    curly_depth: u8,
+    curly_depth: i32,
     start: usize,
     end: usize,
     subranges: usize,
 }
 
-struct TokenBuffer {
-    tokens: Vec<Token>,
-    ranges: Vec<TokenRange>,
+pub(crate) struct TokenBuffer {
+    pub tokens: Vec<Token>,
+    pub ranges: Vec<TokenRange>,
 }
 
-struct ParseState {
-    kind: TokenRangeKind,
-    start: usize,
+impl TokenBuffer {
+    pub(crate) fn new() -> Self {
+        Self{ tokens: Vec::new(), ranges: Vec::new() }
+    }
+}
 
 // Tokenizer is a reusable parser to tokenize multiple source files using the
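The ranges above form a flattened tree: `parent_idx` indexes into the same `ranges` vector, and the `Module` range points at itself. A hypothetical helper showing how direct children can be collected under that layout (children are always pushed after their parent):

    fn direct_children(ranges: &[TokenRange], idx: usize) -> Vec<usize> {
        let mut children = Vec::with_capacity(ranges[idx].subranges);
        for (child_idx, range) in ranges.iter().enumerate().skip(idx + 1) {
            if range.parent_idx == idx {
                children.push(child_idx);
            }
        }
        children
    }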
 
@@ -112,13 +115,19 @@ struct ParseState {
 // defintion or an import before producing the entire AST.
 //
 // If the program is not well-formed then the tree may be inconsistent, but we
-// will detect this once we transform the tokens into the AST.
+// will detect this once we transform the tokens into the AST. Maybe we want to
+// detect a mismatch in opening/closing curly braces in the future?
 pub(crate) struct Tokenizer {
-    curly_depth: u8,
+    // Signed because programmer might have placed too many closing curly braces
+    curly_depth: i32,
+    // Points to an element in the `TokenBuffer.ranges` variable.
     stack_idx: usize,
 }
 
 impl Tokenizer {
+    pub(crate) fn new() -> Self {
+        Self{ curly_depth: 0, stack_idx: 0 }
+    }
     pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         // Assert source and buffer are at start
         debug_assert_eq!(source.pos().offset, 0);
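The `u8` to `i32` switch for `curly_depth` made concrete (illustrative snippet): with an unsigned counter a stray closing brace underflows, panicking in debug builds, while a signed counter just goes negative so the mismatch can be reported later.

    let mut depth: i32 = 0;
    for &c in b"}{ }".iter() {
        match c {
            b'{' => depth += 1,
            b'}' => depth -= 1,
            _ => {}
        }
    }
    // depth dipped to -1 after the first '}', a value u8 cannot represent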
 
@@ -129,7 +138,7 @@ impl Tokenizer {
         // This range may get transformed into the appropriate range kind later,
         // see `push_range` and `pop_range`.
         self.curly_depth = 0;
-        self.stack_idx = 1;
+        self.stack_idx = 0;
         target.ranges.push(TokenRange{
             parent_idx: 0,
             range_kind: TokenRangeKind::Module,
 
@@ -139,7 +148,7 @@ impl Tokenizer {
             subranges: 0,
         });
 
-        // Main processing loop
+        // Main tokenization loop
         while let Some(c) = source.next() {
             let token_index = target.tokens.len();
 
 
@@ -157,15 +166,23 @@ impl Tokenizer {
                 }
             } else if is_integer_literal_start(c) {
                 self.consume_number(source, target)?;
-            } else if is_pragma_start(c) {
-                self.consume_pragma(c, source, target);
-                self.push_range(target, TokenRangeKind::Pragma, token_index);
+            } else if is_pragma_start_or_pound(c) {
+                let was_pragma = self.consume_pragma_or_pound(c, source, target)?;
+                if was_pragma {
+                    self.push_range(target, TokenRangeKind::Pragma, token_index);
+                }
             } else if self.is_line_comment_start(c, source) {
                 self.consume_line_comment(source, target)?;
             } else if self.is_block_comment_start(c, source) {
                 self.consume_block_comment(source, target)?;
             } else if is_whitespace(c) {
                 let contained_newline = self.consume_whitespace(source);
+                if contained_newline {
+                    let range = &target.ranges[self.stack_idx];
+                    if range.range_kind == TokenRangeKind::Pragma {
+                        self.pop_range(target, target.tokens.len());
+                    }
+                }
             } else {
                 let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
                 if let Some(token) = was_punctuation {
 
@@ -194,6 +211,10 @@ impl Tokenizer {
         }
 
         // End of file, check if our state is correct
+        if let Some(error) = source.had_error.take() {
+            return Err(error);
+        }
 
         Ok(())
     }
 
@@ -205,7 +226,7 @@ impl Tokenizer {
         return first_char == b'/' && Some(b'*') == source.lookahead(1);
     }
 
-    pub(crate) fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<Option<TokenKind>, ParseError> {
+    fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<Option<TokenKind>, ParseError> {
         debug_assert!(first_char != b'#', "'#' needs special handling");
         debug_assert!(first_char != b'\'', "'\'' needs special handling");
         debug_assert!(first_char != b'"', "'\"' needs special handling");
 
@@ -394,7 +415,7 @@ impl Tokenizer {
         Ok(Some(token_kind))
     }
 
-    pub(crate) fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading quote
 
@@ -403,7 +424,12 @@ impl Tokenizer {
         let mut prev_char = b'\'';
         while let Some(c) = source.next() {
+            if !c.is_ascii() {
+                return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in char literal"));
+            }
+
             source.consume();
 
+            // Make sure ending quote was not escaped
             if c == b'\'' && prev_char != b'\\' {
                 prev_char = c;
                 break;
@@ -413,7 +439,7 @@ impl Tokenizer {
         }
 
         if prev_char != b'\'' {
-            // Unterminated character literal
+            // Unterminated character literal, reached end of file.
             return Err(ParseError::new_error(source, begin_pos, "encountered unterminated character literal"));
         }
 
 
@@ -425,7 +451,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading double quotes
 
@@ -434,6 +460,10 @@ impl Tokenizer {
 
         let mut prev_char = b'"';
         while let Some(c) = source.next() {
+            if !c.is_ascii() {
+                return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in string literal"));
+            }
+
             source.consume();
             if c == b'"' && prev_char != b'\\' {
                 prev_char = c;
 
@@ -455,15 +485,36 @@ impl Tokenizer {
         Ok(())
     }
 
-    fn consume_pragma(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) {
-        let pos = source.pos();
+    fn consume_pragma_or_pound(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<bool, ParseError> {
+        let start_pos = source.pos();
         debug_assert_eq!(first_char, b'#');
         source.consume();
 
-        target.tokens.push(Token::new(TokenKind::Pound, pos));
+        let next = source.next();
+        if next.is_none() || !is_identifier_start(next.unwrap()) {
+            // Just a pound sign
+            target.tokens.push(Token::new(TokenKind::Pound, start_pos));
+            Ok(false)
+        } else {
+            // Pound sign followed by identifier
+            source.consume();
+            while let Some(c) = source.next() {
+                if !is_identifier_remaining(c) {
+                    break;
+                }
+                source.consume();
+            }
+
+            self.check_ascii(source)?;
+
+            let end_pos = source.pos();
+            target.tokens.push(Token::new(TokenKind::Pragma, start_pos));
+            target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
+            Ok(true)
+        }
     }
 
-    pub(crate) fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
        let begin_pos = source.pos();
 
        // Consume the leading "//"
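The behaviour change above in test form, a sketch written in the style of the test at the bottom of this file: a `#` directly followed by an identifier now becomes a single `Pragma` token plus its `SpanEnd`, while a bare `#` still yields `Pound`.

    let mut source = InputSource::new_test("#version 500");
    let mut t = Tokenizer::new();
    let mut buffer = TokenBuffer::new();
    t.tokenize(&mut source, &mut buffer).expect("tokenize");
    assert_eq!(buffer.tokens[0].kind, TokenKind::Pragma); // spans "#version"
    assert_eq!(buffer.tokens[1].kind, TokenKind::SpanEnd);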
 
@@ -474,13 +525,15 @@ impl Tokenizer {
 
         let mut prev_char = b'/';
+        let mut cur_char = b'/';
         while let Some(c) = source.next() {
+            source.consume();
+            prev_char = cur_char;
+            cur_char = c;
+
             if c == b'\n' {
-                // End of line
+                // End of line, note that the newline is not consumed
                 break;
             }
-            prev_char = c;
-
-            source.consume();
         }
 
         let mut end_pos = source.pos();
 
@@ -493,7 +546,10 @@ impl Tokenizer {
             } else {
                 end_pos.offset -= 1;
             }
+            // Consume final newline
+            source.consume();
         } else {
+            // End of comment was due to EOF
             debug_assert!(source.next().is_none())
         }
 
 
@@ -503,7 +559,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading "/*"
 
@@ -536,7 +592,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
+    fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
         let begin_pos = source.pos();
         debug_assert!(is_identifier_start(source.next().unwrap()));
         source.consume();
 
@@ -557,7 +613,7 @@ impl Tokenizer {
         Ok(source.section(begin_pos.offset, end_pos.offset))
     }
 
-    pub(crate) fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
         debug_assert!(is_integer_literal_start(source.next().unwrap()));
         source.consume();
 
@@ -593,29 +649,46 @@ impl Tokenizer {
             if c == b'\n' {
                 has_newline = true;
             }
             source.consume();
         }
 
         has_newline
     }
 
     /// Pushes a new token range onto the stack in the buffers.
-    fn push_range(&self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
-        let cur_range = &target.ranges[self.stack_idx];
-        let parent_idx = cur_range.parent_idx;
-        let parent_range = &target.ranges[parent_idx];
-        if parent_range.end != first_token {
-            // Insert intermediate range
+    fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
+        let cur_range = &mut target.ranges[self.stack_idx];
+
+        println!(
+            "DEBUG: push_range [1] | stack_idx: {}, range_end: {}, first_token: {}",
+            self.stack_idx, cur_range.end, first_token
+        );
+
+        // If we have just popped a range and then push a new range, then the
+        // first token is equal to the last token registered on the current
+        // range. If not, then we had some intermediate tokens that did not
+        // belong to a particular kind of token range: hence we insert an
+        // intermediate "code" range.
+        if cur_range.end != first_token {
+            println!("DEBUG: push_range [2] | inserting code range");
+            let code_start = cur_range.end;
+            cur_range.end = first_token;
+            cur_range.subranges += 1;
             target.ranges.push(TokenRange{
-                parent_idx,
+                parent_idx: self.stack_idx,
                 range_kind: TokenRangeKind::Code,
-                curly_depth: cur_range.curly_depth,
-                start: parent_range.end,
+                curly_depth: self.curly_depth,
+                start: code_start,
                 end: first_token,
                 subranges: 0,
             });
         }
 
         // Insert a new range
+        println!(
+            "DEBUG: push_range [3] | kind: {:?}, parent_idx: {}, stack_idx: {}",
+            range_kind, self.stack_idx, target.ranges.len()
+        );
+        let parent_idx = self.stack_idx;
         self.stack_idx = target.ranges.len();
         target.ranges.push(TokenRange{
 
@@ -628,16 +701,26 @@ impl Tokenizer {
         });
     }
 
-    fn pop_range(&self, target: &mut TokenBuffer, end_index: usize) {
-        // Pop all the dummy ranges that are left on the range stack
+    fn pop_range(&mut self, target: &mut TokenBuffer, end_index: usize) {
         let last = &mut target.ranges[self.stack_idx];
         debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range");
 
         // Fix up the current range before going back to parent
+        println!(
+            "DEBUG: pop_range  [1] | stack_idx: {}, kind: {:?}, start: {}, old_end: {}, new_end: {}",
+            self.stack_idx, last.range_kind, last.start, last.end, end_index
+        );
         last.end = end_index;
-        self.stack_idx = last.parent_idx as usize;
-
+
+        // Go back to parent
+        self.stack_idx = last.parent_idx;
+        let parent = &mut target.ranges[self.stack_idx];
+        parent.end = end_index;
+        parent.subranges += 1;
+        println!(
+            "DEBUG: pop_range  [2] | returning to kind: {:?}, idx: {}, new_end: {}",
+            parent.range_kind, self.stack_idx, end_index
+        );
     }
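To make the `push_range`/`pop_range` bookkeeping concrete, an illustrative layout (hypothetical token indices, assuming the module range starts with `end: 0`) for a file whose first tokens are plain code followed by a pragma:

    // int a = 5;      -> tokens 0..5, no dedicated range while scanning
    // #version 500    -> tokens 5..9, pragma range popped at the newline
    //
    // [0] Module { parent_idx: 0, start: 0, end: 9, subranges: 2 }
    // [1] Code   { parent_idx: 0, start: 0, end: 5 } // inserted by push_range
    // [2] Pragma { parent_idx: 0, start: 5, end: 9 }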
 
 
@@ -680,7 +763,7 @@ fn is_string_literal_start(c: u8) -> bool {
     return c == b'"';
 }
 
-fn is_pragma_start(c: u8) -> bool {
+fn is_pragma_start_or_pound(c: u8) -> bool {
     return c == b'#';
 }
 
 
@@ -708,4 +791,78 @@ fn maybe_number_remaining(c: u8) -> bool {
         (c == b'b' || c == b'B' || c == b'o' || c == b'O' || c == b'x' || c == b'X') ||
         (c >= b'0' && c <= b'9') ||
         c == b'_';
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // TODO: Remove at some point
+    #[test]
+    fn test_tokenizer() {
+        let mut source = InputSource::new_test("
+
+        #version 500
+        # hello 2
+
+        import std.reo::*;
+
+        struct Thing {
+            int a: 5,
+        }
+        enum Hello {
+            A,
+            B
+        }
+
+        // Hello hello, is it me you are looking for?
+        // I can seee it in your eeeyes
+
+        func something(int a, int b, int c) -> byte {
+            int a = 5;
+            struct Inner {
+                int a
+            }
+            struct City {
+                int b
+            }
+            /* Waza
+            How are you doing
+            Things in here yo
+            /* */ */
+
+            a = a + 5 * 2;
+            struct Pressure {
+                int d
+            }
+        }
+        ");
+        let mut t = Tokenizer::new();
+        let mut buffer = TokenBuffer::new();
+        t.tokenize(&mut source, &mut buffer).expect("tokenize");
+
+        println!("Ranges:\n");
+        for (idx, range) in buffer.ranges.iter().enumerate() {
+            println!("[{}] {:?}", idx, range)
+        }
+
+        println!("Tokens:\n");
+        let mut iter = buffer.tokens.iter().enumerate();
+        while let Some((idx, token)) = iter.next() {
+            match token.kind {
+                TokenKind::Ident | TokenKind::Pragma | TokenKind::Integer |
+                TokenKind::String | TokenKind::Character | TokenKind::LineComment |
+                TokenKind::BlockComment => {
+                    let (_, end) = iter.next().unwrap();
+                    println!("[{}] {:?} ......", idx, token.kind);
+                    assert_eq!(end.kind, TokenKind::SpanEnd);
+                    let text = source.section(token.pos.offset, end.pos.offset);
+                    println!("{}", String::from_utf8_lossy(text));
+                },
+                _ => {
+                    println!("[{}] {:?}", idx, token.kind);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file