From 33aa370aeb00f0dc954d522a28efd6dd6d50e55b 2021-04-12 10:29:43
From: MH
Date: 2021-04-12 10:29:43
Subject: [PATCH] Finished tokenizer, pending new lexer

---

diff --git a/src/protocol/input_source2.rs b/src/protocol/input_source2.rs
index 025b86ddbfb0a0f77f8fd9f3dffe5a5cfad59218..54888d0e7e255ce861097b47abbf4503a7c3ae2e 100644
--- a/src/protocol/input_source2.rs
+++ b/src/protocol/input_source2.rs
@@ -28,7 +28,7 @@ pub struct InputSource2 {
     line: u32,
     offset: usize,
     // State tracking
-    had_error: Option<ParseError>,
+    pub(crate) had_error: Option<ParseError>,
     // The offset_lookup is built on-demand upon attempting to report an error.
     // As the compiler is currently not multithreaded, we simply put it in a
     // RefCell to allow interior mutability.
@@ -47,6 +47,12 @@ impl InputSource2 {
         }
     }
 
+    #[cfg(test)]
+    pub fn new_test(input: &str) -> Self {
+        let bytes = Vec::from(input.as_bytes());
+        return Self::new(String::from("test"), bytes)
+    }
+
     #[inline]
     pub fn pos(&self) -> InputPosition2 {
         InputPosition2{ line: self.line, offset: self.offset as u32 }
@@ -120,7 +126,7 @@ impl InputSource2 {
         // Build the line number (!) to offset lookup, so offset by 1. We
         // assume the entire source file is scanned (most common case) for
         // preallocation.
-        let lookup = self.offset_lookup.borrow_mut();
+        let mut lookup = self.offset_lookup.borrow_mut();
         lookup.reserve(self.line as usize + 2);
         lookup.push(0); // line 0: never used
         lookup.push(0); // first line: first character
diff --git a/src/protocol/inputsource.rs b/src/protocol/inputsource.rs
index 756f51cc32c3153a8d629f85f7c3ef02942d0521..1442d6452d2892b47b7b5d09b2f72b3bbcf9d591 100644
--- a/src/protocol/inputsource.rs
+++ b/src/protocol/inputsource.rs
@@ -146,9 +146,9 @@ impl fmt::Display for InputSource {
 
 #[derive(Debug, Clone, Copy)]
 pub struct InputPosition {
-    line: usize,
-    column: usize,
-    pub(crate) offset: usize,
+    pub line: usize,
+    pub column: usize,
+    pub offset: usize,
 }
 
 impl InputPosition {
diff --git a/src/protocol/lexer2.rs b/src/protocol/lexer2.rs
index 68d39e006e7b820c0aa6e376dff1b21a7d4f3974..50e7ff55cb147ab4c7668d0576badd1e67713cc2 100644
--- a/src/protocol/lexer2.rs
+++ b/src/protocol/lexer2.rs
@@ -1,4 +1,24 @@
+use crate::protocol::Heap;
+use crate::protocol::tokenizer::{TokenBuffer, Token};
+use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError};
-pub struct Lexer<'a> {
-    source: &'a mut InputSource,
+struct Ctx<'a> {
+    heap: &'a mut Heap,
+    source: &'a InputSource,
+    tokens: &'a TokenBuffer,
+}
+
+// Lexes definitions. Should be the first pass over each of the module files
+// after tokenization. Only once all definitions are parsed can we do the full
+// AST creation pass.
+struct LexerDefinitions {
+
+}
+
+impl LexerDefinitions {
+    pub(crate) fn parse(ctx: &mut Ctx) -> Result<(), ParseError> {
+        debug_assert!(ctx.tokens.ranges.len() > 0);
+    }
+
+    pub(crate) fn parse_definition(heap: &mut Heap, source: &InputSource, range: &TokenRange)
 }
\ No newline at end of file
diff --git a/src/protocol/tests/utils.rs b/src/protocol/tests/utils.rs
index 34bba8faaac0ac6023bdd85dad38e5dc1e67f7ab..248844a38e89aa14307369f43b870804a1610c50 100644
--- a/src/protocol/tests/utils.rs
+++ b/src/protocol/tests/utils.rs
@@ -757,7 +757,7 @@ impl<'a> ErrorTester<'a> {
             self.test_name, pattern, self.assert_postfix()
         );
         let pos = pos.unwrap();
-        let col = self.error.statements[idx].position.col();
+        let col = self.error.statements[idx].position.column;
         assert_eq!(
             pos + 1, col,
             "[{}] Expected error to occur at column {}, but found it at {} for {}",
diff --git a/src/protocol/tokenizer/mod.rs b/src/protocol/tokenizer/mod.rs
index 9f5cf247e7558d0a1025d48109be77c9fdd10546..fe6105ad9dae355136099a86932474e7a287bed0 100644
--- a/src/protocol/tokenizer/mod.rs
+++ b/src/protocol/tokenizer/mod.rs
@@ -1,15 +1,16 @@
 use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError, InputPosition2 as InputPosition, InputSpan};
 
-#[derive(Clone, Copy, PartialEq, Eq)]
-enum TokenKind {
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum TokenKind {
     // Variable-character tokens, followed by a SpanEnd token
-    Ident,
-    Integer,
-    String,
-    Character,
-    LineComment,
-    BlockComment,
+    Ident,        // regular identifier
+    Pragma,       // identifier with prefixed `#`, range includes `#`
+    Integer,      // integer literal
+    String,       // string literal, range includes `"`
+    Character,    // character literal, range includes `'`
+    LineComment,  // line comment, range includes leading `//`, but not newline
+    BlockComment, // block comment, range includes leading `/*` and trailing `*/`
     // Punctuation
     Exclamation,  // !
     Question,     // ?
@@ -65,9 +66,9 @@ enum TokenKind {
     SpanEnd,
 }
 
-struct Token {
-    kind: TokenKind,
-    pos: InputPosition, // probably need something different
+pub(crate) struct Token {
+    pub kind: TokenKind,
+    pub pos: InputPosition,
 }
 
 impl Token {
@@ -77,7 +78,7 @@ impl Token {
 }
 
 #[derive(Debug, PartialEq, Eq)]
-enum TokenRangeKind {
+pub(crate) enum TokenRangeKind {
     Module,
     Pragma,
     Import,
@@ -85,25 +86,27 @@ enum TokenRangeKind {
     Code,
 }
 
+#[derive(Debug)]
 struct TokenRange {
     // Index of parent in `TokenBuffer.ranges`, does not have a parent if the
     // range kind is Module, in that case the parent index points to itself.
     parent_idx: usize,
     range_kind: TokenRangeKind,
-    curly_depth: u8,
+    curly_depth: i32,
     start: usize,
     end: usize,
     subranges: usize,
 }
 
-struct TokenBuffer {
-    tokens: Vec<Token>,
-    ranges: Vec<TokenRange>,
+pub(crate) struct TokenBuffer {
+    pub tokens: Vec<Token>,
+    pub ranges: Vec<TokenRange>,
 }
 
-struct ParseState {
-    kind: TokenRangeKind,
-    start: usize,
+impl TokenBuffer {
+    pub(crate) fn new() -> Self {
+        Self{ tokens: Vec::new(), ranges: Vec::new() }
+    }
 }
 
 // Tokenizer is a reusable parser to tokenize multiple source files using the
@@ -112,13 +115,19 @@ struct ParseState {
 // defintion or an import before producing the entire AST.
 //
 // If the program is not well-formed then the tree may be inconsistent, but we
-// will detect this once we transform the tokens into the AST.
+// will detect this once we transform the tokens into the AST. Maybe we want to
+// detect a mismatch in opening/closing curly braces in the future?
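+//
+// Illustrative sketch (an assumption for documentation purposes, not verified
+// by this patch's tests): for a well-formed module along the lines of
+//
+//     #version 500
+//     import std.reo::*;
+//     struct Thing { ... }
+//
+// `TokenBuffer.ranges` is expected to form a small tree rooted at a single
+// `Module` range at index 0 (whose `parent_idx` points to itself), with a
+// `Pragma`, an `Import` and a `Definition` subrange each pointing back to
+// index 0, and any leftover tokens wrapped in intermediate `Code` ranges by
+// `push_range`.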
 pub(crate) struct Tokenizer {
-    curly_depth: u8,
+    // Signed because the programmer might have placed too many closing curly braces
+    curly_depth: i32,
+    // Points to an element in the `TokenBuffer.ranges` variable.
     stack_idx: usize,
 }
 
 impl Tokenizer {
+    pub(crate) fn new() -> Self {
+        Self{ curly_depth: 0, stack_idx: 0 }
+    }
+
     pub(crate) fn tokenize(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         // Assert source and buffer are at start
         debug_assert_eq!(source.pos().offset, 0);
@@ -129,7 +138,7 @@ impl Tokenizer {
         // This range may get transformed into the appropriate range kind later,
         // see `push_range` and `pop_range`.
         self.curly_depth = 0;
-        self.stack_idx = 1;
+        self.stack_idx = 0;
         target.ranges.push(TokenRange{
             parent_idx: 0,
             range_kind: TokenRangeKind::Module,
@@ -139,7 +148,7 @@ impl Tokenizer {
             subranges: 0,
         });
 
-        // Main processing loop
+        // Main tokenization loop
         while let Some(c) = source.next() {
             let token_index = target.tokens.len();
 
@@ -157,15 +166,23 @@ impl Tokenizer {
                 }
             } else if is_integer_literal_start(c) {
                 self.consume_number(source, target)?;
-            } else if is_pragma_start(c) {
-                self.consume_pragma(c, source, target);
-                self.push_range(target, TokenRangeKind::Pragma, token_index);
+            } else if is_pragma_start_or_pound(c) {
+                let was_pragma = self.consume_pragma_or_pound(c, source, target)?;
+                if was_pragma {
+                    self.push_range(target, TokenRangeKind::Pragma, token_index);
+                }
             } else if self.is_line_comment_start(c, source) {
                 self.consume_line_comment(source, target)?;
             } else if self.is_block_comment_start(c, source) {
                 self.consume_block_comment(source, target)?;
             } else if is_whitespace(c) {
                 let contained_newline = self.consume_whitespace(source);
+                if contained_newline {
+                    let range = &target.ranges[self.stack_idx];
+                    if range.range_kind == TokenRangeKind::Pragma {
+                        self.pop_range(target, target.tokens.len());
+                    }
+                }
             } else {
                 let was_punctuation = self.maybe_parse_punctuation(c, source, target)?;
                 if let Some(token) = was_punctuation {
@@ -194,6 +211,10 @@ impl Tokenizer {
         }
 
         // End of file, check if our state is correct
+        if let Some(error) = source.had_error.take() {
+            return Err(error);
+        }
+
         Ok(())
     }
 
@@ -205,7 +226,7 @@ impl Tokenizer {
         return first_char == b'/' && Some(b'*') == source.lookahead(1);
     }
 
-    pub(crate) fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<Option<TokenKind>, ParseError> {
+    fn maybe_parse_punctuation(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<Option<TokenKind>, ParseError> {
         debug_assert!(first_char != b'#', "'#' needs special handling");
         debug_assert!(first_char != b'\'', "'\'' needs special handling");
         debug_assert!(first_char != b'"', "'\"' needs special handling");
@@ -394,7 +415,7 @@ impl Tokenizer {
         Ok(Some(token_kind))
     }
 
-    pub(crate) fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_char_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading quote
@@ -403,7 +424,12 @@ impl Tokenizer {
         let mut prev_char = b'\'';
         while let Some(c) = source.next() {
+            if !c.is_ascii() {
+                return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in char literal"));
+            }
             source.consume();
+
+            // Make sure ending quote was not escaped
             if c == b'\'' && prev_char != b'\\' {
                 prev_char = c;
                 break;
             }
@@ -413,7 +439,7 @@ impl Tokenizer {
         }
 
         if prev_char != b'\'' {
-            // Unterminated character literal
+            // Unterminated character literal, reached end of file.
             return Err(ParseError::new_error(source, begin_pos, "encountered unterminated character literal"));
         }
 
@@ -425,7 +451,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading double quotes
@@ -434,6 +460,10 @@ impl Tokenizer {
         let mut prev_char = b'"';
         while let Some(c) = source.next() {
+            if !c.is_ascii() {
+                return Err(ParseError::new_error(source, source.pos(), "non-ASCII character in string literal"));
+            }
+
             source.consume();
             if c == b'"' && prev_char != b'\\' {
                 prev_char = c;
@@ -455,15 +485,36 @@ impl Tokenizer {
         Ok(())
     }
 
-    fn consume_pragma(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) {
-        let pos = source.pos();
+    fn consume_pragma_or_pound(&mut self, first_char: u8, source: &mut InputSource, target: &mut TokenBuffer) -> Result<bool, ParseError> {
+        let start_pos = source.pos();
         debug_assert_eq!(first_char, b'#');
         source.consume();
 
-        target.tokens.push(Token::new(TokenKind::Pound, pos));
+        let next = source.next();
+        if next.is_none() || !is_identifier_start(next.unwrap()) {
+            // Just a pound sign
+            target.tokens.push(Token::new(TokenKind::Pound, start_pos));
+            Ok(false)
+        } else {
+            // Pound sign followed by identifier
+            source.consume();
+            while let Some(c) = source.next() {
+                if !is_identifier_remaining(c) {
+                    break;
+                }
+                source.consume();
+            }
+
+            self.check_ascii(source)?;
+
+            let end_pos = source.pos();
+            target.tokens.push(Token::new(TokenKind::Pragma, start_pos));
+            target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
+            Ok(true)
+        }
     }
 
-    pub(crate) fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_line_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading "//"
@@ -474,13 +525,15 @@ impl Tokenizer {
         let mut prev_char = b'/';
         let mut cur_char = b'/';
         while let Some(c) = source.next() {
-            source.consume();
+            prev_char = cur_char;
             cur_char = c;
+
             if c == b'\n' {
-                // End of line
+                // End of line, note that the newline is not consumed
                 break;
             }
-            prev_char = c;
+
+            source.consume();
         }
 
         let mut end_pos = source.pos();
@@ -493,7 +546,10 @@ impl Tokenizer {
             } else {
                 end_pos.offset -= 1;
             }
+            // Consume final newline
+            source.consume();
         } else {
+            // End of comment was due to EOF
            debug_assert!(source.next().is_none())
         }
 
@@ -503,7 +559,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_block_comment(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
 
         // Consume the leading "/*"
@@ -536,7 +592,7 @@ impl Tokenizer {
         Ok(())
     }
 
-    pub(crate) fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
+    fn consume_identifier<'a>(&mut self, source: &'a mut InputSource, target: &mut TokenBuffer) -> Result<&'a [u8], ParseError> {
         let begin_pos = source.pos();
         debug_assert!(is_identifier_start(source.next().unwrap()));
         source.consume();
@@ -557,7 +613,7 @@ impl Tokenizer {
         Ok(source.section(begin_pos.offset, end_pos.offset))
     }
 
-    pub(crate) fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_number(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
         debug_assert!(is_integer_literal_start(source.next().unwrap()));
         source.consume();
@@ -593,29 +649,46 @@ impl Tokenizer {
             if c == b'\n' {
                 has_newline = true;
             }
+            source.consume();
         }
 
         has_newline
     }
 
     /// Pushes a new token range onto the stack in the buffers.
-    fn push_range(&self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
-        let cur_range = &target.ranges[self.stack_idx];
-        let parent_idx = cur_range.parent_idx;
-        let parent_range = &target.ranges[parent_idx];
-        if parent_range.end != first_token {
-            // Insert intermediate range
+    fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token: usize) {
+        let cur_range = &mut target.ranges[self.stack_idx];
+
+        println!(
+            "DEBUG: push_range [1] | stack_idx: {}, range_end: {}, first_token: {}",
+            self.stack_idx, cur_range.end, first_token
+        );
+
+        // If we have just popped a range and then push a new range, then the
+        // first token is equal to the last token registered on the current
+        // range. If not, then we had some intermediate tokens that did not
+        // belong to a particular kind of token range: hence we insert an
+        // intermediate "code" range.
+        if cur_range.end != first_token {
+            println!("DEBUG: push_range [2] | inserting code range");
+            let code_start = cur_range.end;
+            cur_range.end = first_token;
+            cur_range.subranges += 1;
             target.ranges.push(TokenRange{
-                parent_idx,
+                parent_idx: self.stack_idx,
                 range_kind: TokenRangeKind::Code,
-                curly_depth: cur_range.curly_depth,
-                start: parent_range.end,
+                curly_depth: self.curly_depth,
+                start: code_start,
                 end: first_token,
                 subranges: 0,
             });
         }
 
         // Insert a new range
+        println!(
+            "DEBUG: push_range [3] | kind: {:?}, parent_idx: {}, stack_idx: {}",
+            range_kind, self.stack_idx, target.ranges.len()
+        );
         let parent_idx = self.stack_idx;
         self.stack_idx = target.ranges.len();
         target.ranges.push(TokenRange{
@@ -628,16 +701,26 @@ impl Tokenizer {
         });
     }
 
-    fn pop_range(&self, target: &mut TokenBuffer, end_index: usize) {
-        // Pop all the dummy ranges that are left on the range stack
+    fn pop_range(&mut self, target: &mut TokenBuffer, end_index: usize) {
         let last = &mut target.ranges[self.stack_idx];
         debug_assert!(self.stack_idx != last.parent_idx, "attempting to pop top-level range");
 
+        // Fix up the current range before going back to parent
+        println!(
+            "DEBUG: pop_range [1] | stack_idx: {}, kind: {:?}, start: {}, old_end: {}, new_end: {}",
+            self.stack_idx, last.range_kind, last.start, last.end, end_index
+        );
         last.end = end_index;
-        self.stack_idx = last.parent_idx as usize;
+
+        // Go back to parent
+        self.stack_idx = last.parent_idx;
         let parent = &mut target.ranges[self.stack_idx];
         parent.end = end_index;
         parent.subranges += 1;
+        println!(
+            "DEBUG: pop_range [2] | returning to kind: {:?}, idx: {}, new_end: {}",
+            parent.range_kind, self.stack_idx, end_index
+        );
     }
@@ -680,7 +763,7 @@ fn is_string_literal_start(c: u8) -> bool {
     return c == b'"';
 }
 
-fn is_pragma_start(c: u8) -> bool {
+fn is_pragma_start_or_pound(c: u8) -> bool {
     return c == b'#';
 }
 
@@ -708,4 +791,78 @@ fn maybe_number_remaining(c: u8) -> bool {
     return (c == b'b' || c == b'B' || c == b'o' || c == b'O' || c == b'x' || c == b'X') ||
         (c >= b'0' && c <= b'9') || c == b'_';
 }
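+
+// Illustrative note (sketch only): every variable-length token is followed by
+// a `SpanEnd` token, so the original text of e.g. an identifier at index `i`
+// can be recovered from the source as
+//
+//     let end = &buffer.tokens[i + 1]; // expected to be TokenKind::SpanEnd
+//     let text = source.section(buffer.tokens[i].pos.offset, end.pos.offset);
+//
+// The test below uses exactly this pattern when printing the token stream.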
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // TODO: Remove at some point
+    #[test]
+    fn test_tokenizer() {
+        let mut source = InputSource::new_test("
+
+        #version 500
+        # hello 2
+
+        import std.reo::*;
+
+        struct Thing {
+            int a: 5,
+        }
+        enum Hello {
+            A,
+            B
+        }
+
+        // Hello hello, is it me you are looking for?
+        // I can seee it in your eeeyes
+
+        func something(int a, int b, int c) -> byte {
+            int a = 5;
+            struct Inner {
+                int a
+            }
+            struct City {
+                int b
+            }
+            /* Waza
+            How are you doing
+            Things in here yo
+            /* */ */
+
+            a = a + 5 * 2;
+            struct Pressure {
+                int d
+            }
+        }
+        ");
+        let mut t = Tokenizer::new();
+        let mut buffer = TokenBuffer::new();
+        t.tokenize(&mut source, &mut buffer).expect("tokenize");
+
+        println!("Ranges:\n");
+        for (idx, range) in buffer.ranges.iter().enumerate() {
+            println!("[{}] {:?}", idx, range)
+        }
+
+        println!("Tokens:\n");
+        let mut iter = buffer.tokens.iter().enumerate();
+        while let Some((idx, token)) = iter.next() {
+            match token.kind {
+                TokenKind::Ident | TokenKind::Pragma | TokenKind::Integer |
+                TokenKind::String | TokenKind::Character | TokenKind::LineComment |
+                TokenKind::BlockComment => {
+                    let (_, end) = iter.next().unwrap();
+                    println!("[{}] {:?} ......", idx, token.kind);
+                    assert_eq!(end.kind, TokenKind::SpanEnd);
+                    let text = source.section(token.pos.offset, end.pos.offset);
+                    println!("{}", String::from_utf8_lossy(text));
+                },
+                _ => {
+                    println!("[{}] {:?}", idx, token.kind);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
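
A minimal sketch, not part of the patch: the definition-lexing pass named in the
subject line is still a stub (`LexerDefinitions::parse`), but given the structures
above it would plausibly walk the token ranges marked as definitions before the
full AST pass. The helper below is an illustration only; it assumes the
`TokenRange` fields become accessible to the lexer (as `TokenBuffer`'s fields
already are), and that a definition range starts at its defining keyword.

    use crate::protocol::tokenizer::{TokenBuffer, TokenKind, TokenRangeKind};
    use crate::protocol::input_source2::{InputSource2 as InputSource, ParseError};

    // Sketch: visit every range the tokenizer marked as a definition.
    fn visit_definitions(source: &InputSource, buffer: &TokenBuffer) -> Result<(), ParseError> {
        for range in buffer.ranges.iter() {
            if range.range_kind != TokenRangeKind::Definition {
                continue;
            }
            // `start`/`end` are token indices; the first token is assumed to be
            // the defining keyword (e.g. `struct`, `enum`, `func`), stored as an Ident.
            let first = &buffer.tokens[range.start];
            debug_assert_eq!(first.kind, TokenKind::Ident);
            // ...pre-register the definition's identifier in the Heap here...
        }
        Ok(())
    }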