Files
@ fc987660fdee
Branch filter:
Location: CSY/reowolf/src/protocol/parser/token_parsing.rs
fc987660fdee
17.6 KiB
application/rls-services+xml
WIP on compiler rearchitecting
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 | use crate::collections::StringRef;
use crate::protocol::ast::*;
use crate::protocol::input_source2::{
InputSource2 as InputSource,
InputPosition2 as InputPosition,
InputSpan,
ParseError,
};
use super::tokens::*;
use super::symbol_table2::*;
use super::{Module, ModuleCompilationPhase, PassCtx};
// Reserved words: general purpose and definition-introducing
pub(crate) const KW_LET: &[u8] = b"let";
pub(crate) const KW_AS: &[u8] = b"as";
pub(crate) const KW_STRUCT: &[u8] = b"struct";
pub(crate) const KW_ENUM: &[u8] = b"enum";
pub(crate) const KW_UNION: &[u8] = b"union";
pub(crate) const KW_FUNCTION: &[u8] = b"function";
pub(crate) const KW_PRIMITIVE: &[u8] = b"primitive";
pub(crate) const KW_COMPOSITE: &[u8] = b"composite";
pub(crate) const KW_IMPORT: &[u8] = b"import";
// Reserved words: literal values
pub(crate) const KW_LIT_TRUE: &[u8] = b"true";
pub(crate) const KW_LIT_FALSE: &[u8] = b"false";
pub(crate) const KW_LIT_NULL: &[u8] = b"null";
// Reserved words: builtin functions
pub(crate) const KW_FUNC_GET: &[u8] = b"get";
pub(crate) const KW_FUNC_PUT: &[u8] = b"put";
pub(crate) const KW_FUNC_FIRES: &[u8] = b"fires";
pub(crate) const KW_FUNC_CREATE: &[u8] = b"create";
pub(crate) const KW_FUNC_LENGTH: &[u8] = b"length";
// Reserved words: statements
pub(crate) const KW_STMT_CHANNEL: &[u8] = b"channel";
pub(crate) const KW_STMT_IF: &[u8] = b"if";
pub(crate) const KW_STMT_WHILE: &[u8] = b"while";
pub(crate) const KW_STMT_BREAK: &[u8] = b"break";
pub(crate) const KW_STMT_CONTINUE: &[u8] = b"continue";
pub(crate) const KW_STMT_GOTO: &[u8] = b"goto";
pub(crate) const KW_STMT_RETURN: &[u8] = b"return";
pub(crate) const KW_STMT_SYNC: &[u8] = b"synchronous";
pub(crate) const KW_STMT_ASSERT: &[u8] = b"assert";
pub(crate) const KW_STMT_NEW: &[u8] = b"new";
// Reserved words: builtin types
pub(crate) const KW_TYPE_IN_PORT: &[u8] = b"in";
pub(crate) const KW_TYPE_OUT_PORT: &[u8] = b"out";
pub(crate) const KW_TYPE_MESSAGE: &[u8] = b"msg";
pub(crate) const KW_TYPE_BOOL: &[u8] = b"bool";
pub(crate) const KW_TYPE_UINT8: &[u8] = b"u8";
pub(crate) const KW_TYPE_UINT16: &[u8] = b"u16";
pub(crate) const KW_TYPE_UINT32: &[u8] = b"u32";
pub(crate) const KW_TYPE_UINT64: &[u8] = b"u64";
pub(crate) const KW_TYPE_SINT8: &[u8] = b"s8";
pub(crate) const KW_TYPE_SINT16: &[u8] = b"s16";
pub(crate) const KW_TYPE_SINT32: &[u8] = b"s32";
pub(crate) const KW_TYPE_SINT64: &[u8] = b"s64";
pub(crate) const KW_TYPE_CHAR: &[u8] = b"char";
pub(crate) const KW_TYPE_STRING: &[u8] = b"string";
pub(crate) const KW_TYPE_INFERRED: &[u8] = b"auto";
/// Parses a dotted "domain-name": one or more non-keyword identifiers joined
/// by `.` tokens. Whitespace between the parts is tolerated (which keeps span
/// handling simple later on), but the whole name has to sit on a single line.
///
/// On success returns the source bytes covered by the entire name together
/// with the span running from the first to the last identifier.
pub(crate) fn consume_domain_ident<'a>(
    source: &'a InputSource, iter: &mut TokenIter
) -> Result<(&'a [u8], InputSpan), ParseError> {
    let (_, mut full_span) = consume_ident(source, iter)?;

    // Each dot has to be followed by another identifier; stretch the span so
    // that it ends where the most recently parsed identifier ends.
    while matches!(iter.next(), Some(TokenKind::Dot)) {
        iter.consume();
        let (_, part_span) = consume_ident(source, iter)?;
        full_span.end = part_span.end;
    }

    // A restriction of convenience rather than necessity: single-line names
    // make module naming and import parsing easier.
    if full_span.begin.line != full_span.end.line {
        return Err(ParseError::new_error_str_at_span(source, full_span, "module names may not span multiple lines"));
    }

    // A name that is just one identifier must not collide with a reserved
    // keyword.
    let name = source.section_at_pos(full_span.begin, full_span.end);
    if is_reserved_keyword(name) {
        return Err(ParseError::new_error_str_at_span(source, full_span, "encountered reserved keyword"));
    }

    Ok((name, full_span))
}
/// Consumes the next token if it is exactly `expected` and returns its span.
/// Otherwise an error naming the expected token's characters is produced at
/// the iterator's last valid position. Only intended for token kinds whose
/// textual length is fixed.
pub(crate) fn consume_token(source: &InputSource, iter: &mut TokenIter, expected: TokenKind) -> Result<InputSpan, ParseError> {
    if iter.next() == Some(expected) {
        // Grab the span before stepping past the token.
        let token_span = iter.next_span();
        iter.consume();
        return Ok(token_span);
    }

    Err(ParseError::new_error_at_pos(
        source, iter.last_valid_pos(),
        format!("expected '{}'", expected.token_chars())
    ))
}
/// Optionally parses a delimited, comma-separated list of items. When the
/// next token is not `open_delim` nothing is consumed and `Ok(false)` is
/// returned. Otherwise the possible outcomes are:
/// - opening and closing delimiter with nothing in between: `Ok(true)` and an
///   empty `target`,
/// - opening and closing delimiter with items in between: `Ok(true)` and the
///   items stored in `target` (whose previous contents are discarded),
/// - a missing comma/closing delimiter or a failing `consumer_fn`: `Err(..)`.
///
/// A trailing comma before the closing delimiter is accepted. If `close_pos`
/// is provided it receives the second of the two positions the iterator
/// reports for the closing delimiter token.
pub(crate) fn maybe_consume_comma_separated<T, F>(
    open_delim: TokenKind, close_delim: TokenKind, source: &InputSource, iter: &mut TokenIter,
    consumer_fn: F, target: &mut Vec<T>, item_name_and_article: &'static str,
    close_pos: Option<&mut InputPosition>
) -> Result<bool, ParseError>
    where F: Fn(&InputSource, &mut TokenIter) -> Result<T, ParseError>
{
    if iter.next() != Some(open_delim) {
        return Ok(false);
    }

    // We are committed to parsing a list from this point onwards.
    iter.consume();
    target.clear();

    // Starts out `true` so that the very first item needs no leading comma.
    let mut separator_seen = true;
    loop {
        let upcoming = iter.next();
        if upcoming == Some(close_delim) {
            if let Some(close_pos) = close_pos {
                // Caller asked for the position of the closing delimiter.
                let (_, closing_pos) = iter.next_positions();
                *close_pos = closing_pos;
            }
            iter.consume();
            return Ok(true);
        }

        // Either the input ended, or two items were not separated by a comma.
        if upcoming.is_none() || !separator_seen {
            return Err(ParseError::new_error_at_pos(
                source, iter.last_valid_pos(),
                format!("expected a '{}', or {}", close_delim.token_chars(), item_name_and_article)
            ));
        }

        target.push(consumer_fn(source, iter)?);

        separator_seen = iter.next() == Some(TokenKind::Comma);
        if separator_seen {
            iter.consume();
        }
    }
}
/// Optionally parses a delimited, comma-separated list whose items are not
/// collected by this function: `consumer_fn` is called once per item and
/// yields no value (presumably it stores its results elsewhere itself).
///
/// Returns `Ok(false)` without consuming anything when the next token is not
/// `open_delim`, and `Ok(true)` once the matching `close_delim` has been
/// consumed. A trailing comma before the closing delimiter is accepted.
///
/// # Errors
/// Fails when the input ends before the closing delimiter, when two items are
/// not separated by a comma, or when `consumer_fn` fails.
pub(crate) fn maybe_consume_comma_separated_spilled<F: Fn(&InputSource, &mut TokenIter) -> Result<(), ParseError>>(
    open_delim: TokenKind, close_delim: TokenKind, source: &InputSource, iter: &mut TokenIter,
    consumer_fn: F, item_name_and_article: &'static str
) -> Result<bool, ParseError> {
    if Some(open_delim) != iter.next() {
        return Ok(false);
    }
    iter.consume();

    // Starts out `true` so that the very first item needs no leading comma.
    let mut had_comma = true;
    loop {
        let next = iter.next();
        if Some(close_delim) == next {
            iter.consume();
            break;
        } else if !had_comma || next.is_none() {
            // Running out of tokens is reported here, exactly like
            // `maybe_consume_comma_separated` does, instead of handing the
            // end of the input to `consumer_fn`.
            return Err(ParseError::new_error_at_pos(
                source, iter.last_valid_pos(),
                format!("expected a '{}', or {}", close_delim.token_chars(), item_name_and_article)
            ));
        }

        consumer_fn(source, iter)?;

        had_comma = iter.next() == Some(TokenKind::Comma);
        if had_comma {
            iter.consume();
        }
    }

    Ok(true)
}
/// Parses a comma-separated list that must be present: both the opening and
/// the closing delimiter are required, although the list between them may be
/// empty. If the opening delimiter is missing an error mentioning
/// `list_name_and_article` is reported at the position the iterator was at
/// when this function was entered.
pub(crate) fn consume_comma_separated<T, F>(
    open_delim: TokenKind, close_delim: TokenKind, source: &InputSource, iter: &mut TokenIter,
    consumer_fn: F, target: &mut Vec<T>, item_name_and_article: &'static str,
    list_name_and_article: &'static str, close_pos: Option<&mut InputPosition>
) -> Result<(), ParseError>
    where F: Fn(&InputSource, &mut TokenIter) -> Result<T, ParseError>
{
    // Remember where we started, the error below should point there.
    let start_pos = iter.last_valid_pos();

    let list_found = maybe_consume_comma_separated(
        open_delim, close_delim, source, iter, consumer_fn, target,
        item_name_and_article, close_pos
    )?;

    if list_found {
        Ok(())
    } else {
        Err(ParseError::new_error_at_pos(
            source, start_pos,
            format!("expected {}", list_name_and_article)
        ))
    }
}
/// Consumes an integer literal and returns its value together with its span.
///
/// The literal may be binary (`0b`/`0B` prefix), octal (`0o`/`0O`),
/// hexadecimal (`0x`/`0X`) or decimal (no prefix), and may contain `_`
/// characters as visual separators. `buffer` is scratch space: it is cleared
/// and then filled with the digits of the literal (prefix and separators
/// stripped).
///
/// # Errors
/// Fails when the next token is not an integer, when a character is not a
/// valid digit for the detected radix, or when the digits cannot be converted
/// to a `u64` (e.g. no digits at all, or the value does not fit).
pub(crate) fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> {
    if Some(TokenKind::Integer) != iter.next() {
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal"));
    }
    let integer_span = iter.next_span();
    iter.consume();

    let integer_text = source.section_at_span(integer_span);

    // Determine the radix and the number of prefix bytes to skip.
    let (radix, input_offset, radix_name): (u32, usize, &str) =
        if integer_text.starts_with(b"0b") || integer_text.starts_with(b"0B") {
            (2, 2, "binary")
        } else if integer_text.starts_with(b"0o") || integer_text.starts_with(b"0O") {
            (8, 2, "octal")
        } else if integer_text.starts_with(b"0x") || integer_text.starts_with(b"0X") {
            (16, 2, "hexadecimal")
        } else {
            (10, 0, "decimal")
        };

    let malformed = || ParseError::new_error_at_span(
        source, integer_span,
        format!("incorrectly formatted {} number", radix_name)
    );

    // Copy the digits over, dropping the '_' separators. Each digit is checked
    // against the radix of the literal, so that the letters of a hexadecimal
    // number are accepted while e.g. a '2' in a binary number is not.
    buffer.clear();
    for &byte in &integer_text[input_offset..] {
        if byte == b'_' {
            continue;
        }

        let digit = char::from(byte);
        if !digit.is_digit(radix) {
            return Err(malformed());
        }
        buffer.push(digit);
    }

    // Convert the cleaned-up digits. This still fails for an empty digit
    // sequence and for values that overflow a `u64`.
    match u64::from_str_radix(buffer.as_str(), radix) {
        Ok(number) => Ok((number, integer_span)),
        Err(_) => Err(malformed()),
    }
}
/// Consumes a character literal token. The stated intent is to support a
/// limited number of backslash-escaped characters.
// NOTE(review): this function looks unfinished. After the token is consumed
// nothing is decoded and the body does not end in a `Result` value, and both
// `buffer` and `char_text` are never used. Confirm before relying on it.
pub(crate) fn consume_character_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<char, ParseError> {
// Reject anything that is not a character token, without consuming it.
if Some(TokenKind::Character) != iter.next() {
return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a character literal"));
}
// Remember the span of the literal before stepping past the token.
let char_span = iter.next_span();
iter.consume();
// Source bytes covered by the span of the literal.
let char_text = source.section_at_span(char_span);
// TODO: decode `char_text` (including escape sequences) and return the character.
}
/// Consumes a string literal. We currently support a limited number of
/// backslash-escaped characters.
pub(crate) fn consume_string_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(>
/// Consumes a pragma token. Returns the source bytes of the pragma together
/// with the two positions the iterator reports for the token. If the next
/// token is not a pragma nothing is consumed and an error is returned.
pub(crate) fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> {
    match iter.next() {
        Some(TokenKind::Pragma) => {
            let (begin, end) = iter.next_positions();
            iter.consume();
            Ok((source.section_at_pos(begin, end), begin, end))
        },
        _ => Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a pragma")),
    }
}
/// Checks, without consuming anything, whether the next token is an
/// identifier whose text is exactly `expected`.
pub(crate) fn has_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> bool {
    match peek_ident(source, iter) {
        Some(found) => found == expected,
        None => false,
    }
}
/// Returns the text of the next token if it is an identifier, `None`
/// otherwise. The token is not consumed.
pub(crate) fn peek_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Option<&'a [u8]> {
    match iter.next() {
        Some(TokenKind::Ident) => {
            let (begin, end) = iter.next_positions();
            Some(source.section_at_pos(begin, end))
        },
        _ => None,
    }
}
/// Consumes the next token if it is an identifier and returns its text along
/// with its span. No reserved-keyword check is performed here.
pub(crate) fn consume_any_ident<'a>(
    source: &'a InputSource, iter: &mut TokenIter
) -> Result<(&'a [u8], InputSpan), ParseError> {
    if iter.next() != Some(TokenKind::Ident) {
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an identifier"));
    }

    // Read the positions before stepping past the token.
    let (begin, end) = iter.next_positions();
    iter.consume();

    let text = source.section_at_pos(begin, end);
    let span = InputSpan::from_positions(begin, end);
    Ok((text, span))
}
/// Consumes an identifier and requires its text to equal `expected`, which
/// is allowed to be a reserved keyword. Returns the identifier's span.
pub(crate) fn consume_exact_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> Result<InputSpan, ParseError> {
    let (found, span) = consume_any_ident(source, iter)?;
    if found == expected {
        return Ok(span);
    }

    // The expected text ends up in the error message, so it should be ASCII.
    debug_assert!(expected.is_ascii());
    Err(ParseError::new_error_at_pos(
        source, iter.last_valid_pos(),
        format!("expected the text '{}'", &String::from_utf8_lossy(expected))
    ))
}
/// Consumes an identifier and returns its text and span, rejecting it with an
/// error at the identifier's span when it is one of the reserved keywords.
pub(crate) fn consume_ident<'a>(
    source: &'a InputSource, iter: &mut TokenIter
) -> Result<(&'a [u8], InputSpan), ParseError> {
    let (text, span) = consume_any_ident(source, iter)?;

    if is_reserved_keyword(text) {
        Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"))
    } else {
        Ok((text, span))
    }
}
/// Consumes a non-keyword identifier and interns its text through `ctx.pool`,
/// producing an `Identifier` that carries the interned value and the span.
pub(crate) fn consume_ident_interned(
    source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx
) -> Result<Identifier, ParseError> {
    let (text, span) = consume_ident(source, iter)?;
    Ok(Identifier{ span, value: ctx.pool.intern(text) })
}
/// Whether `text` is one of the keywords that introduce a definition.
fn is_reserved_definition_keyword(text: &[u8]) -> bool {
    matches!(
        text,
        KW_STRUCT | KW_ENUM | KW_UNION | KW_FUNCTION | KW_PRIMITIVE | KW_COMPOSITE
    )
}
/// Whether `text` is one of the import- or statement-related keywords.
fn is_reserved_statement_keyword(text: &[u8]) -> bool {
    matches!(
        text,
        KW_IMPORT | KW_AS |
        KW_STMT_CHANNEL | KW_STMT_IF | KW_STMT_WHILE |
        KW_STMT_BREAK | KW_STMT_CONTINUE | KW_STMT_GOTO | KW_STMT_RETURN |
        KW_STMT_SYNC | KW_STMT_ASSERT | KW_STMT_NEW
    )
}
/// Whether `text` is `let`, a literal keyword or a builtin function name.
fn is_reserved_expression_keyword(text: &[u8]) -> bool {
    matches!(
        text,
        KW_LET |
        KW_LIT_TRUE | KW_LIT_FALSE | KW_LIT_NULL |
        KW_FUNC_GET | KW_FUNC_PUT | KW_FUNC_FIRES | KW_FUNC_CREATE | KW_FUNC_LENGTH
    )
}
/// Whether `text` is the name of one of the builtin types.
fn is_reserved_type_keyword(text: &[u8]) -> bool {
    matches!(
        text,
        KW_TYPE_IN_PORT | KW_TYPE_OUT_PORT | KW_TYPE_MESSAGE | KW_TYPE_BOOL |
        KW_TYPE_UINT8 | KW_TYPE_UINT16 | KW_TYPE_UINT32 | KW_TYPE_UINT64 |
        KW_TYPE_SINT8 | KW_TYPE_SINT16 | KW_TYPE_SINT32 | KW_TYPE_SINT64 |
        KW_TYPE_CHAR | KW_TYPE_STRING |
        KW_TYPE_INFERRED
    )
}
fn is_reserved_keyword(text: &[u8]) -> bool {
return
is_reserved_definition_keyword(text) ||
is_reserved_statement_keyword(text) ||
is_reserved_expression_keyword(text) ||
is_reserved_type_keyword(text);
}
/// Finds the first module in `modules` whose `root_id` equals the given one.
pub(crate) fn seek_module(modules: &[Module], root_id: RootId) -> Option<&Module> {
    modules.iter().find(|module| module.root_id == root_id)
}
/// Builds the error reported when `new_symbol` clashes with the previously
/// registered `old_symbol`. The primary message points at the new symbol, an
/// additional info message points at the old one; both spans are resolved
/// against the source of `modules[module_idx]`.
// Note: `module_idx` is not strictly required, but it guards against
// programmer mistakes during development: a conflict arises because of the
// module that is being parsed right now.
pub(crate) fn construct_symbol_conflict_error(
    modules: &[Module], module_idx: usize, ctx: &PassCtx, new_symbol: &Symbol, old_symbol: &Symbol
) -> ParseError {
    let module = &modules[module_idx];

    // Produces a description of how the symbol came into scope, plus the span
    // the description refers to.
    let describe_symbol = |symbol: &Symbol| -> (String, InputSpan) {
        if let Some(import_id) = symbol.introduced_at {
            // The symbol entered the scope through an import
            return match &ctx.heap[import_id] {
                Import::Module(import) => (
                    format!("the module aliased as '{}' imported here", symbol.name.as_str()),
                    import.span
                ),
                Import::Symbols(import) => (
                    format!("the type '{}' imported here", symbol.name.as_str()),
                    import.span
                ),
            };
        }

        // Not imported, so the symbol is defined in the current module
        debug_assert_eq!(symbol.defined_in_module, module.root_id);
        debug_assert_ne!(symbol.definition.symbol_class(), SymbolClass::Module);
        (
            format!("the type '{}' defined here", symbol.name.as_str()),
            symbol.identifier_span
        )
    };

    let (new_msg, new_span) = describe_symbol(new_symbol);
    let (old_msg, old_span) = describe_symbol(old_symbol);

    ParseError::new_error_at_span(
        &module.source, new_span, format!("symbol is defined twice: {}", new_msg)
    ).with_info_at_span(
        &module.source, old_span, format!("it conflicts with {}", old_msg)
    )
}
|