diff --git a/src/protocol/parser/pass_tokenizer.rs b/src/protocol/parser/pass_tokenizer.rs index d41b6f65939a7267f7a38fb635d78b51fb634e1d..c611c9c4dc6b79e5d39c2f23c19a8742c7f8db3c 100644 --- a/src/protocol/parser/pass_tokenizer.rs +++ b/src/protocol/parser/pass_tokenizer.rs @@ -21,15 +21,12 @@ pub(crate) struct PassTokenizer { // unmatched opening braces, unmatched closing braces are detected // immediately. curly_stack: Vec, - // Points to an element in the `TokenBuffer.ranges` variable. - stack_idx: usize, } impl PassTokenizer { pub(crate) fn new() -> Self { Self{ curly_stack: Vec::with_capacity(32), - stack_idx: 0 } } @@ -37,23 +34,6 @@ impl PassTokenizer { // Assert source and buffer are at start debug_assert_eq!(source.pos().offset, 0); debug_assert!(target.tokens.is_empty()); - debug_assert!(target.ranges.is_empty()); - - // Set up for tokenization by pushing the first range onto the stack. - // This range may get transformed into the appropriate range kind later, - // see `push_range` and `pop_range`. 
- self.stack_idx = 0; - target.ranges.push(TokenRange{ - parent_idx: NO_RELATION, - range_kind: TokenRangeKind::Module, - curly_depth: 0, - start: 0, - end: 0, - num_child_ranges: 0, - first_child_idx: NO_RELATION, - last_child_idx: NO_RELATION, - next_sibling_idx: NO_RELATION, - }); // Main tokenization loop while let Some(c) = source.next() { @@ -61,35 +41,31 @@ impl PassTokenizer { if is_char_literal_start(c) { self.consume_char_literal(source, target)?; + } else if is_bytestring_literal_start(c, source) { + self.consume_bytestring_literal(source, target)?; } else if is_string_literal_start(c) { self.consume_string_literal(source, target)?; } else if is_identifier_start(c) { let ident = self.consume_identifier(source, target)?; - if demarks_definition(ident) { - self.push_range(target, TokenRangeKind::Definition, token_index); + if demarks_symbol(ident) { + self.emit_marker(target, TokenMarkerKind::Definition, token_index); } else if demarks_import(ident) { - self.push_range(target, TokenRangeKind::Import, token_index); + self.emit_marker(target, TokenMarkerKind::Import, token_index); } } else if is_integer_literal_start(c) { self.consume_number(source, target)?; } else if is_pragma_start_or_pound(c) { let was_pragma = self.consume_pragma_or_pound(c, source, target)?; if was_pragma { - self.push_range(target, TokenRangeKind::Pragma, token_index); + self.emit_marker(target, TokenMarkerKind::Pragma, token_index); } } else if self.is_line_comment_start(c, source) { self.consume_line_comment(source, target)?; } else if self.is_block_comment_start(c, source) { self.consume_block_comment(source, target)?; } else if is_whitespace(c) { - let contained_newline = self.consume_whitespace(source); - if contained_newline { - let range = &target.ranges[self.stack_idx]; - if range.range_kind == TokenRangeKind::Pragma { - self.pop_range(target, target.tokens.len() as u32); - } - } + self.consume_whitespace(source); } else { let was_punctuation = self.maybe_parse_punctuation(c, 
source, target)?; if let Some((token, token_pos)) = was_punctuation { @@ -105,20 +81,6 @@ impl PassTokenizer { } self.curly_stack.pop(); - - let range = &target.ranges[self.stack_idx]; - if range.range_kind == TokenRangeKind::Definition && range.curly_depth == self.curly_stack.len() as u32 { - self.pop_range(target, target.tokens.len() as u32); - } - - // Exit early if we have more closing curly braces than - // opening curly braces - } else if token == TokenKind::SemiColon { - // Check if this marks the end of an import - let range = &target.ranges[self.stack_idx]; - if range.range_kind == TokenRangeKind::Import { - self.pop_range(target, target.tokens.len() as u32); - } } } else { return Err(ParseError::new_error_str_at_pos( @@ -142,21 +104,6 @@ impl PassTokenizer { )); } - // Ranges that did not depend on curly braces may have missing tokens. - // So close all of the active tokens - while self.stack_idx != 0 { - self.pop_range(target, target.tokens.len() as u32); - } - - // And finally, we may have a token range at the end that doesn't belong - // to a range yet, so insert a "code" range if this is the case. 
- debug_assert_eq!(self.stack_idx, 0); - let last_registered_idx = target.ranges[0].end; - let last_token_idx = target.tokens.len() as u32; - if last_registered_idx != last_token_idx { - self.add_code_range(target, 0, last_registered_idx, last_token_idx, NO_RELATION); - } - Ok(()) } @@ -411,41 +358,21 @@ impl PassTokenizer { Ok(()) } - fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { + fn consume_bytestring_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { let begin_pos = source.pos(); - - // Consume the leading double quotes - debug_assert!(source.next().unwrap() == b'"'); + debug_assert!(source.next().unwrap() == b'b'); source.consume(); - let mut prev_char = b'"'; - while let Some(c) = source.next() { - if !c.is_ascii() { - return Err(ParseError::new_error_str_at_pos(source, source.pos(), "non-ASCII character in string literal")); - } - - source.consume(); - if c == b'"' && prev_char != b'\\' { - // Unescaped string terminator - prev_char = c; - break; - } - - if prev_char == b'\\' && c == b'\\' { - // Escaped backslash, set prev_char to bogus to not conflict - // with escaped-" and unterminated string literal detection. 
- prev_char = b'\0'; - } else { - prev_char = c; - } - } + let end_pos = self.consume_ascii_string(begin_pos, source)?; + target.tokens.push(Token::new(TokenKind::Bytestring, begin_pos)); + target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); - if prev_char != b'"' { - // Unterminated string literal - return Err(ParseError::new_error_str_at_pos(source, begin_pos, "encountered unterminated string literal")); - } + Ok(()) + } - let end_pos = source.pos(); + fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> { + let begin_pos = source.pos(); + let end_pos = self.consume_ascii_string(begin_pos, source)?; target.tokens.push(Token::new(TokenKind::String, begin_pos)); target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos)); @@ -509,10 +436,9 @@ // Modify offset to not include the newline characters if cur_char == b'\n' { if prev_char == b'\r' { - end_pos.offset -= 2; - } else { end_pos.offset -= 1; } + // Consume final newline source.consume(); } else { @@ -604,6 +530,44 @@ Ok(()) } + // Consumes the ascii string (including leading and trailing quotation + // marks) and returns the input position *after* the last quotation mark (or + // an error, if something went wrong). + fn consume_ascii_string(&self, begin_pos: InputPosition, source: &mut InputSource) -> Result<InputPosition, ParseError> { + debug_assert!(source.next().unwrap() == b'"'); + source.consume(); + + let mut prev_char = b'"'; + while let Some(c) = source.next() { + if !c.is_ascii() { + return Err(ParseError::new_error_str_at_pos(source, source.pos(), "non-ASCII character in string literal")); + } + + source.consume(); + if c == b'"' && prev_char != b'\\' { + // Unescaped string terminator + prev_char = c; + break; + } + + if prev_char == b'\\' && c == b'\\' { + // Escaped backslash, set prev_char to bogus to not conflict + // with escaped-" and unterminated string literal detection. 
+ prev_char = b'\0'; + } else { + prev_char = c; + } + } + + if prev_char != b'"' { + // Unterminated string literal + return Err(ParseError::new_error_str_at_pos(source, begin_pos, "encountered unterminated string literal")); + } + + let end_pos = source.pos(); + return Ok(end_pos) + } + // Consumes whitespace and returns whether or not the whitespace contained // a newline. fn consume_whitespace(&self, source: &mut InputSource) -> bool { @@ -624,99 +588,22 @@ impl PassTokenizer { has_newline } - fn add_code_range( - &mut self, target: &mut TokenBuffer, parent_idx: i32, - code_start_idx: u32, code_end_idx: u32, next_sibling_idx: i32 - ) { - let new_range_idx = target.ranges.len() as i32; - let parent_range = &mut target.ranges[parent_idx as usize]; - debug_assert_ne!(parent_range.end, code_end_idx, "called push_code_range without a need to do so"); - - let sibling_idx = parent_range.last_child_idx; - - parent_range.last_child_idx = new_range_idx; - parent_range.end = code_end_idx; - parent_range.num_child_ranges += 1; - - let curly_depth = self.curly_stack.len() as u32; - target.ranges.push(TokenRange{ - parent_idx, - range_kind: TokenRangeKind::Code, - curly_depth, - start: code_start_idx, - end: code_end_idx, - num_child_ranges: 0, - first_child_idx: NO_RELATION, - last_child_idx: NO_RELATION, - next_sibling_idx, + fn emit_marker(&mut self, target: &mut TokenBuffer, kind: TokenMarkerKind, first_token: u32) { + debug_assert!( + target.markers + .last().map(|v| v.first_token < first_token) + .unwrap_or(true) + ); + + target.markers.push(TokenMarker{ + kind, + curly_depth: self.curly_stack.len() as u32, + first_token, + last_token: u32::MAX, + handled: false, }); - - // Fix up the sibling indices - if sibling_idx != NO_RELATION { - let sibling_range = &mut target.ranges[sibling_idx as usize]; - sibling_range.next_sibling_idx = new_range_idx; - } - } - - fn push_range(&mut self, target: &mut TokenBuffer, range_kind: TokenRangeKind, first_token_idx: u32) { - let 
new_range_idx = target.ranges.len() as i32; - let parent_idx = self.stack_idx as i32; - let parent_range = &mut target.ranges[self.stack_idx]; - - if parent_range.first_child_idx == NO_RELATION { - parent_range.first_child_idx = new_range_idx; - } - - let last_registered_idx = parent_range.end; - if last_registered_idx != first_token_idx { - self.add_code_range(target, parent_idx, last_registered_idx, first_token_idx, new_range_idx + 1); - } - - // Push the new range - self.stack_idx = target.ranges.len(); - let curly_depth = self.curly_stack.len() as u32; - target.ranges.push(TokenRange{ - parent_idx, - range_kind, - curly_depth, - start: first_token_idx, - end: first_token_idx, // modified when popped - num_child_ranges: 0, - first_child_idx: NO_RELATION, - last_child_idx: NO_RELATION, - next_sibling_idx: NO_RELATION - }) - } - - fn pop_range(&mut self, target: &mut TokenBuffer, end_token_idx: u32) { - let popped_idx = self.stack_idx as i32; - let popped_range = &mut target.ranges[self.stack_idx]; - debug_assert!(self.stack_idx != 0, "attempting to pop top-level range"); - - // Fix up the current range before going back to parent - popped_range.end = end_token_idx; - debug_assert_ne!(popped_range.start, end_token_idx); - - // Go back to parent and fix up its child pointers, but remember the - // last child, so we can link it to the newly popped range. 
- self.stack_idx = popped_range.parent_idx as usize; - let parent = &mut target.ranges[self.stack_idx]; - if parent.first_child_idx == NO_RELATION { - parent.first_child_idx = popped_idx; - } - let prev_sibling_idx = parent.last_child_idx; - parent.last_child_idx = popped_idx; - parent.end = end_token_idx; - parent.num_child_ranges += 1; - - // Fix up the sibling (if it exists) - if prev_sibling_idx != NO_RELATION { - let sibling = &mut target.ranges[prev_sibling_idx as usize]; - sibling.next_sibling_idx = popped_idx; - } } - fn check_ascii(&self, source: &InputSource) -> Result<(), ParseError> { match source.next() { Some(c) if !c.is_ascii() => { @@ -730,7 +617,7 @@ impl PassTokenizer { } // Helpers for characters -fn demarks_definition(ident: &[u8]) -> bool { +fn demarks_symbol(ident: &[u8]) -> bool { return ident == KW_STRUCT || ident == KW_ENUM || @@ -740,22 +627,32 @@ fn demarks_definition(ident: &[u8]) -> bool { ident == KW_COMPOSITE } +#[inline] fn demarks_import(ident: &[u8]) -> bool { return ident == KW_IMPORT; } +#[inline] fn is_whitespace(c: u8) -> bool { c.is_ascii_whitespace() } +#[inline] fn is_char_literal_start(c: u8) -> bool { return c == b'\''; } +#[inline] +fn is_bytestring_literal_start(c: u8, source: &InputSource) -> bool { + return c == b'b' && source.lookahead(1) == Some(b'"'); +} + +#[inline] fn is_string_literal_start(c: u8) -> bool { return c == b'"'; } +#[inline] fn is_pragma_start_or_pound(c: u8) -> bool { return c == b'#'; } @@ -775,6 +672,7 @@ fn is_identifier_remaining(c: u8) -> bool { c == b'_' } +#[inline] fn is_integer_literal_start(c: u8) -> bool { return c >= b'0' && c <= b'9'; }