diff --git a/src/protocol/parser/pass_tokenizer.rs b/src/protocol/parser/pass_tokenizer.rs
index 07f7dbc2c2eef970825fd2681526cac56e3e701f..c611c9c4dc6b79e5d39c2f23c19a8742c7f8db3c 100644
--- a/src/protocol/parser/pass_tokenizer.rs
+++ b/src/protocol/parser/pass_tokenizer.rs
@@ -41,6 +41,8 @@ impl PassTokenizer {
 
             if is_char_literal_start(c) {
                 self.consume_char_literal(source, target)?;
+            } else if is_bytestring_literal_start(c, source) {
+                self.consume_bytestring_literal(source, target)?;
             } else if is_string_literal_start(c) {
                 self.consume_string_literal(source, target)?;
             } else if is_identifier_start(c) {
@@ -356,41 +358,21 @@ impl PassTokenizer {
         Ok(())
     }
 
-    fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+    fn consume_bytestring_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
         let begin_pos = source.pos();
-
-        // Consume the leading double quotes
-        debug_assert!(source.next().unwrap() == b'"');
+        debug_assert!(source.next().unwrap() == b'b');
         source.consume();
 
-        let mut prev_char = b'"';
-        while let Some(c) = source.next() {
-            if !c.is_ascii() {
-                return Err(ParseError::new_error_str_at_pos(source, source.pos(), "non-ASCII character in string literal"));
-            }
-
-            source.consume();
-            if c == b'"' && prev_char != b'\\' {
-                // Unescaped string terminator
-                prev_char = c;
-                break;
-            }
-
-            if prev_char == b'\\' && c == b'\\' {
-                // Escaped backslash, set prev_char to bogus to not conflict
-                // with escaped-" and unterminated string literal detection.
-                prev_char = b'\0';
-            } else {
-                prev_char = c;
-            }
-        }
+        let end_pos = self.consume_ascii_string(begin_pos, source)?;
+        target.tokens.push(Token::new(TokenKind::Bytestring, begin_pos));
+        target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 
-        if prev_char != b'"' {
-            // Unterminated string literal
-            return Err(ParseError::new_error_str_at_pos(source, begin_pos, "encountered unterminated string literal"));
-        }
+        Ok(())
+    }
 
-        let end_pos = source.pos();
+    fn consume_string_literal(&mut self, source: &mut InputSource, target: &mut TokenBuffer) -> Result<(), ParseError> {
+        let begin_pos = source.pos();
+        let end_pos = self.consume_ascii_string(begin_pos, source)?;
         target.tokens.push(Token::new(TokenKind::String, begin_pos));
         target.tokens.push(Token::new(TokenKind::SpanEnd, end_pos));
 
@@ -548,6 +530,44 @@ impl PassTokenizer {
         Ok(())
     }
 
+    // Consumes an ASCII string literal, including its leading and trailing
+    // quotation marks, and returns the input position *after* the closing
+    // quotation mark. A backslash escapes the character that follows it.
+    // Returns an error on a non-ASCII character, or when the input ends
+    // before an unescaped closing quotation mark is found; `begin_pos` is
+    // the position at which the latter error is reported.
+    fn consume_ascii_string(&self, begin_pos: InputPosition, source: &mut InputSource) -> Result<InputPosition, ParseError> {
+        debug_assert!(source.next().unwrap() == b'"');
+        source.consume();
+
+        // Track termination explicitly rather than inferring it from the last
+        // character seen: neither the opening quote nor an escaped quote at the
+        // end of the input may count as the terminator.
+        let mut escaped = false;
+        let mut terminated = false;
+        while let Some(c) = source.next() {
+            if !c.is_ascii() {
+                return Err(ParseError::new_error_str_at_pos(source, source.pos(), "non-ASCII character in string literal"));
+            }
+
+            source.consume();
+            if escaped {
+                escaped = false;
+            } else if c == b'\\' {
+                escaped = true;
+            } else if c == b'"' {
+                terminated = true;
+                break;
+            }
+        }
+
+        if !terminated {
+            return Err(ParseError::new_error_str_at_pos(source, begin_pos, "encountered unterminated string literal"));
+        }
+
+        Ok(source.pos())
+    }
+
     // Consumes whitespace and returns whether or not the whitespace contained
     // a newline.
     fn consume_whitespace(&self, source: &mut InputSource) -> bool {
@@ -607,22 +627,32 @@ fn demarks_symbol(ident: &[u8]) -> bool {
         ident == KW_COMPOSITE
 }
 
+#[inline]
 fn demarks_import(ident: &[u8]) -> bool {
     return ident == KW_IMPORT;
 }
 
+#[inline]
 fn is_whitespace(c: u8) -> bool {
     c.is_ascii_whitespace()
 }
 
+#[inline]
 fn is_char_literal_start(c: u8) -> bool {
     return c == b'\'';
 }
 
+#[inline]
+fn is_bytestring_literal_start(c: u8, source: &InputSource) -> bool {
+    return c == b'b' && source.lookahead(1) == Some(b'"');
+}
+
+#[inline]
 fn is_string_literal_start(c: u8) -> bool {
     return c == b'"';
 }
 
+#[inline]
 fn is_pragma_start_or_pound(c: u8) -> bool {
     return c == b'#';
 }
@@ -642,6 +672,7 @@ fn is_identifier_remaining(c: u8) -> bool {
         c == b'_'
 }
 
+#[inline]
 fn is_integer_literal_start(c: u8) -> bool {
     return c >= b'0' && c <= b'9';
 }