Changeset - a52ba09256ad
[Not reviewed]
0 2 0
MH - 4 years ago 2021-04-23 10:26:53
henger@cwi.nl
WIP on compiler rearchitecting
2 files changed with 102 insertions and 8 deletions:
0 comments (0 inline, 0 general)
src/protocol/parser/pass_definitions.rs
Show inline comments
 
@@ -620,195 +620,212 @@ impl PassDefinitions {
 
            } else if token == TokenKind::OpenSquare {
 
                let subject = result;
 
                let from_index = self.consume_expression(module, iter, ctx)?;
 

	
 
                // Check if we have an indexing or slicing operation
 
                next = iter.next();
 
                if Some(TokenKind::DotDot) = next {
 
                    iter.consume();
 

	
 
                    let to_index = self.consume_expression(module, iter, ctx)?;
 
                    let end_span = consume_token(&module.source, iter, TokenKind::CloseSquare)?;
 
                    span.end = end_span.end;
 

	
 
                    result = ctx.heap.alloc_slicing_expression(|this| SlicingExpression{
 
                        this, span, subject, from_index, to_index,
 
                        parent: ExpressionParent::None,
 
                        concrete_type: ConcreteType::default()
 
                    }).upcast();
 
                } else if Some(TokenKind::CloseSquare) {
 
                    let end_span = consume_token(&module.source, iter, TokenKind::CloseSquare)?;
 
                    span.end = end_span.end;
 

	
 
                    result = ctx.heap.alloc_indexing_expression(|this| IndexingExpression{
 
                        this, span, subject,
 
                        index: from_index,
 
                        parent: ExpressionParent::None,
 
                        concrete_type: ConcreteType::default()
 
                    }).upcast();
 
                } else {
 
                    return Err(ParseError::new_error_str_at_pos(
 
                        &module.source, iter.last_valid_pos(), "unexpected token: expected ']' or '..'"
 
                    ));
 
                }
 
            } else {
 
                debug_assert_eq!(token, TokenKind::Dot);
 
                let subject = result;
 
                let (field_text, field_span) = consume_ident(&module.source, iter)?;
 
                let field = if field_text == b"length" {
 
                    Field::Length
 
                } else {
 
                    let value = ctx.pool.intern(field_text);
 
                    let identifier = Identifier{ value, span: field_span };
 
                    Field::Symbolic(FieldSymbolic{ identifier, definition: None, field_idx: 0 });
 
                };
 

	
 
                result = ctx.heap.alloc_select_expression(|this| SelectExpression{
 
                    this, span, subject, field,
 
                    parent: ExpressionParent::None,
 
                    concrete_type: ConcreteType::default()
 
                }).upcast();
 
            }
 

	
 
            next = iter.next();
 
        }
 

	
 
        Ok(result)
 
    }
 

	
 
    fn consume_primary_expression(
 
        &mut self, module: &Module, iter: &mut TokenIter, ctx: &mut PassCtx
 
    ) -> Result<ExpressionId, ParseError> {
 
        let next = iter.next();
 

	
 
        let result;
 
        if next == Some(TokenKind::OpenParen) {
 
            // Expression between parentheses
 
            iter.consume();
 
            result = self.consume_expression(module, iter, ctx)?;
 
            consume_token(&module.source, iter, TokenKind::CloseParen)?;
 
        } else if next == Some(TokenKind::OpenCurly) {
 
            // Array literal
 
            let (start_pos, mut end_pos) = iter.next_positions();
 
            let mut expressions = Vec::new();
 
            consume_comma_separated(
 
                TokenKind::OpenCurly, TokenKind::CloseCurly, &module.source, iter,
 
                |source, iter| self.consume_expression(module, iter, ctx),
 
                &mut expressions, "an expression", "a list of expressions", Some(&mut end_pos)
 
            )?;
 

	
 
            // TODO: Turn into literal
 
            result = ctx.heap.alloc_array_expression(|this| ArrayExpression{
 
                this,
 
                span: InputSpan::from_positions(start_pos, end_pos),
 
                elements: expressions,
 
                parent: ExpressionParent::None,
 
                concrete_type: ConcreteType::default(),
 
            }).upcast();
 
        } else if next == Some(TokenKind::Integer) {
 
            let (literal, span) = consume_integer_literal(&module.source, iter, &mut self.buffer)?;
 
            result = ctx.heap.alloc_literal_expression(|this| LiteralExpression{
 
                this, span,
 
                value: Literal::Integer(LiteralInteger{ unsigned_value: literal, negated: false }),
 
                parent: ExpressionParent::None,
 
                concrete_type: ConcreteType::default(),
 
            }).upcast();
 
        } else if next == Some(TokenKind::String) {
 
            let (text, span) = consume_string_literal(&module.source, iter, &mut self.buffer)?;
 
            let span = consume_string_literal(&module.source, iter, &mut self.buffer)?;
 
            let interned = ctx.pool.intern(self.buffer.as_bytes());
 
            result = ctx.heap.alloc_literal_expression(|this| LiteralExpression{
 
                this, span,
 
                value: Literal::String(interned),
 
                parent: ExpressionParent::None,
 
                concrete_type: ConcreteType::default(),
 
            }).upcast();
 
        } else if next == Some(TokenKind::Character) {
 

	
 
            let (character, span) = consume_character_literal(&module.source, iter)?;
 
            result = ctx.heap.alloc_literal_expression(|this| LiteralExpression{
 
                this, span,
 
                value: Literal::Character(character),
 
                parent: ExpressionParent::None,
 
                concrete_type: ConcreteType::default(),
 
            }).upcast();
 
        } else if next == Some(TokenKind::Ident) {
 
            // May be a variable, a type instantiation or a function call. If we
 
            // have a single identifier that we cannot find in the type table
 
            // then we're going to assume that we're dealign with a variable.
 
        }
 

	
 
        Ok(result)
 
    }
 

	
 
    //--------------------------------------------------------------------------
 
    // Expression Utilities
 
    //--------------------------------------------------------------------------
 

	
 
    #[inline]
 
    fn consume_generic_binary_expression<
 
        M: Fn(Option<TokenKind>) -> Option<BinaryOperator>,
 
        F: Fn(&mut PassDefinitions, &Module, &mut TokenIter, &mut PassCtx) -> Result<ExpressionId, ParseError>
 
    >(
 
        &mut self, module: &Module, iter: &mut TokenIter, ctx: &mut PassCtx, match_fn: M, higher_precedence_fn: F
 
    ) -> Result<ExpressionId, ParseError> {
 
        let mut result = higher_precedence_fn(self, module, iter, ctx)?;
 
        while let Some(operation) = match_fn(iter.next()) {
 
            let span = iter.next_span();
 
            iter.consume();
 

	
 
            let left = result;
 
            let right = higher_precedence_fn(self, module, iter, ctx)?;
 

	
 
            result = ctx.heap.alloc_binary_expression(|this| BinaryExpression{
 
                this, span, left, operation, right,
 
                parent: ExpressionParent::None,
 
                concrete_type: ConcreteType::default()
 
            }).upcast();
 
        }
 

	
 
        Ok(result)
 
    }
 
}
 

	
 
/// Consumes a type. A type always starts with an identifier which may indicate
 
/// a builtin type or a user-defined type. The fact that it may contain
 
/// polymorphic arguments makes it a tree-like structure. Because we cannot rely
 
/// on knowing the exact number of polymorphic arguments we do not check for
 
/// these.
 
// TODO: @Optimize, and fix spans if needed
 
fn consume_parser_type(
 
    source: &InputSource, iter: &mut TokenIter, symbols: &SymbolTable, heap: &Heap, poly_vars: &[Identifier],
 
    cur_scope: SymbolScope, wrapping_definition: DefinitionId, allow_inference: bool
 
) -> Result<ParserType, ParseError> {
 
    struct Entry{
 
        element: ParserTypeElement,
 
        depth: i32,
 
    }
 

	
 
    fn insert_array_before(elements: &mut Vec<Entry>, depth: i32, span: InputSpan) {
 
        let index = elements.iter().rposition(|e| e.depth == depth).unwrap();
 
        elements.insert(index, Entry{
 
            element: ParserTypeElement{ full_span: span, variant: ParserTypeVariant::Array },
 
            depth,
 
        });
 
    }
 

	
 
    // Most common case we just have one type, perhaps with some array
 
    // annotations.
 
    let element = consume_parser_type_ident(source, iter, symbols, heap, poly_vars, cur_scope, wrapping_definition, allow_inference)?;
 
    if iter.next() != Some(TokenKind::OpenAngle) {
 
        let mut num_array = 0;
 
        while iter.next() == Some(TokenKind::OpenSquare) {
 
            iter.consume();
 
            consume_token(source, iter, TokenKind::CloseSquare)?;
 
            num_array += 1;
 
        }
 

	
 
        let array_span = element.full_span;
 
        let mut elements = Vec::with_capacity(num_array + 1);
 
        for _ in 0..num_array {
 
            elements.push(ParserTypeElement{ full_span: array_span, variant: ParserTypeVariant::Array });
 
        }
 
        elements.push(element);
 

	
 
        return Ok(ParserType{ elements });
 
    };
 

	
 
    // We have a polymorphic specification. So we start by pushing the item onto
 
    // our stack, then start adding entries together with the angle-brace depth
 
    // at which they're found.
 
    let mut elements = Vec::new();
 
    elements.push(Entry{ element, depth: 0 });
 

	
 
    // Start out with the first '<' consumed.
 
    iter.consume();
 
    enum State { Ident, Open, Close, Comma };
 
    let mut state = State::Open;
 
    let mut angle_depth = 1;
 

	
 
    loop {
 
        let next = iter.next();
 

	
 
        match state {
 
            State::Ident => {
src/protocol/parser/token_parsing.rs
Show inline comments
 
@@ -178,207 +178,284 @@ pub(crate) fn maybe_consume_comma_separated_spilled<F: Fn(&InputSource, &mut Tok
 
            return Err(ParseError::new_error_at_pos(
 
                source, iter.last_valid_pos(),
 
                format!("expected a '{}', or {}", close_delim.token_chars(), item_name_and_article)
 
            ));
 
        }
 

	
 
        consumer_fn(source, iter)?;
 
        next = iter.next();
 
        had_comma = next == Some(TokenKind::Comma);
 
        if had_comma {
 
            iter.consume();
 
        }
 
    }
 

	
 
    Ok(true)
 
}
 

	
 
/// Consumes a comma-separated list and expected the opening and closing
 
/// characters to be present. The returned array may still be empty
 
pub(crate) fn consume_comma_separated<T, F>(
 
    open_delim: TokenKind, close_delim: TokenKind, source: &InputSource, iter: &mut TokenIter,
 
    consumer_fn: F, target: &mut Vec<T>, item_name_and_article: &'static str,
 
    list_name_and_article: &'static str, close_pos: Option<&mut InputPosition>
 
) -> Result<(), ParseError>
 
    where F: Fn(&InputSource, &mut TokenIter) -> Result<T, ParseError>
 
{
 
    let first_pos = iter.last_valid_pos();
 
    match maybe_consume_comma_separated(
 
        open_delim, close_delim, source, iter, consumer_fn, target,
 
        item_name_and_article, close_pos
 
    ) {
 
        Ok(true) => Ok(()),
 
        Ok(false) => {
 
            return Err(ParseError::new_error_at_pos(
 
                source, first_pos,
 
                format!("expected {}", list_name_and_article)
 
            ));
 
        },
 
        Err(err) => Err(err)
 
    }
 
}
 

	
 
/// Consumes an integer literal, may be binary, octal, hexadecimal or decimal,
 
/// and may have separating '_'-characters.
 
pub(crate) fn consume_integer_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(u64, InputSpan), ParseError> {
 
    if Some(TokenKind::Integer) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an integer literal"));
 
    }
 
    let integer_span = iter.next_span();
 
    iter.consume();
 

	
 
    let integer_text = source.section_at_span(integer_span);
 

	
 
    // Determine radix and offset from prefix
 
    let (radix, input_offset, radix_name) =
 
        if integer_text.starts_with(b"0b") || integer_text.starts_with(b"0B") {
 
            // Binary number
 
            (2, 2, "binary")
 
        } else if integer_text.starts_with(b"0o") || integer_text.starts_with(b"0O") {
 
            // Octal number
 
            (8, 2, "octal")
 
        } else if integer_text.starts_with(b"0x") || integer_text.starts_with(b"0X") {
 
            // Hexadecimal number
 
            (16, 2, "hexadecimal")
 
        } else {
 
            (10, 0, "decimal")
 
        };
 

	
 
    // Take out any of the separating '_' characters
 
    buffer.clear();
 
    for char_idx in input_offset..integer_text.len() {
 
        let char = integer_text[char_idx];
 
        if char == b'_' {
 
            continue;
 
        }
 
        if !char.is_ascii_digit() {
 
            return Err(ParseError::new_error_at_span(
 
                source, integer_span,
 
                format!("incorrectly formatted {} number", radix_name)
 
            ));
 
        }
 
        buffer.push(char::from(char));
 
    }
 

	
 
    // Use the cleaned up string to convert to integer
 
    match u64::from_str_radix(&buffer, radix) {
 
        Ok(number) => Ok((number, integer_span)),
 
        Err(_) => Err(ParseError::new_error_at_span(
 
            source, integer_span,
 
            format!("incorrectly formatted {} number", radix_name)
 
        )),
 
    }
 
}
 

	
 
/// Consumes a character literal. We currently support a limited number of
 
/// backslash-escaped characters
 
pub(crate) fn consume_character_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<char, ParseError> {
 
pub(crate) fn consume_character_literal(
 
    source: &InputSource, iter: &mut TokenIter
 
) -> Result<(char, InputSpan), ParseError> {
 
    if Some(TokenKind::Character) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a character literal"));
 
    }
 
    let char_span = iter.next_span();
 
    let span = iter.next_span();
 
    iter.consume();
 

	
 
    let char_text = source.section_at_span(char_span);
 
    let char_text = source.section_at_span(span);
 
    if !char_text.is_ascii() {
 
        return Err(ParseError::new_error_str_at_span(
 
            source, span, "expected an ASCII character literal"
 
        ));
 
    }
 

	
 
    match char_text.len() {
 
        0 => return Err(ParseError::new_error_str_at_span(source, span, "too little characters in character literal")),
 
        1 => {
 
            // We already know the text is ascii, so just throw an error if we have the escape
 
            // character.
 
            if char_text[0] == b'\\' {
 
                return Err(ParseError::new_error_str_at_span(source, span, "escape character without subsequent character"));
 
            }
 
            return Ok((char_text[0] as char, span));
 
        },
 
        2 => {
 
            if char_text[0] == b'\\' {
 
                let result = parse_escaped_character(char_text[1])?;
 
                return Ok((result, span))
 
            }
 
        },
 
        _ => {}
 
    }
 

	
 
    //
 
    return Err(ParseError::new_error_str_at_span(source, span, "too many characters in character literal"))
 
}
 

	
 
/// Consumes a string literal. We currently support a limited number of
 
/// backslash-escaped characters.
 
pub(crate) fn consume_string_literal(source: &InputSource, iter: &mut TokenIter, buffer: &mut String) -> Result<(>
 
/// backslash-escaped characters. Note that the result is stored in the
 
/// buffer.
 
pub(crate) fn consume_string_literal(
 
    source: &InputSource, iter: &mut TokenIter, buffer: &mut String
 
) -> Result<InputSpan, ParseError> {
 
    if Some(TokenKind::String) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a string literal"));
 
    }
 

	
 
    buffer.clear();
 
    let span = iter.next_span();
 
    iter.consume();
 

	
 
    let text = source.section_at_span(span);
 
    if !text.is_ascii() {
 
        return Err(ParseError::new_error_str_at_span(source, span, "expected an ASCII string literal"));
 
    }
 
    buffer.reserve(text.len());
 

	
 
    let mut was_escape = false;
 
    for idx in 0..text.len() {
 
        let cur = text[idx];
 
        if cur != b'\\' {
 
            if was_escape {
 
                let to_push = parse_escaped_character(cur)?;
 
                buffer.push(to_push);
 
            } else {
 
                buffer.push(cur as char);
 
            }
 
            was_escape = false;
 
        } else {
 
            was_escape = true;
 
        }
 
    }
 

	
 
    Ok(span)
 
}
 

	
 
fn parse_escaped_character(v: u8) -> Result<char, ParseError> {
 
    let result = match v {
 
        b'r' => '\r',
 
        b'n' => '\n',
 
        b't' => '\t',
 
        b'0' => '\0',
 
        b'\\' => '\\',
 
        b'\'' => '\'',
 
        b'"' => '"',
 
        v => return Err(ParseError::new_error_at_span(
 
            source, span, format!("unexpected escaped character '{}'", v)
 
        )),
 
    };
 
    Ok(result)
 
}
 

	
 
pub(crate) fn consume_pragma<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Result<(&'a [u8], InputPosition, InputPosition), ParseError> {
 
    if Some(TokenKind::Pragma) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected a pragma"));
 
    }
 
    let (pragma_start, pragma_end) = iter.next_positions();
 
    iter.consume();
 
    Ok((source.section_at_pos(pragma_start, pragma_end), pragma_start, pragma_end))
 
}
 

	
 
pub(crate) fn has_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> bool {
 
    peek_ident(source, iter).map_or(false, |section| section == expected)
 
}
 

	
 
pub(crate) fn peek_ident<'a>(source: &'a InputSource, iter: &mut TokenIter) -> Option<&'a [u8]> {
 
    if Some(TokenKind::Ident) == iter.next() {
 
        let (start, end) = iter.next_positions();
 
        return Some(source.section_at_pos(start, end))
 
    }
 

	
 
    None
 
}
 

	
 
/// Consumes any identifier and returns it together with its span. Does not
 
/// check if the identifier is a reserved keyword.
 
pub(crate) fn consume_any_ident<'a>(
 
    source: &'a InputSource, iter: &mut TokenIter
 
) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    if Some(TokenKind::Ident) != iter.next() {
 
        return Err(ParseError::new_error_str_at_pos(source, iter.last_valid_pos(), "expected an identifier"));
 
    }
 
    let (ident_start, ident_end) = iter.next_positions();
 
    iter.consume();
 
    Ok((source.section_at_pos(ident_start, ident_end), InputSpan::from_positions(ident_start, ident_end)))
 
}
 

	
 
/// Consumes a specific identifier. May or may not be a reserved keyword.
 
pub(crate) fn consume_exact_ident(source: &InputSource, iter: &mut TokenIter, expected: &[u8]) -> Result<InputSpan, ParseError> {
 
    let (ident, pos) = consume_any_ident(source, iter)?;
 
    if ident != expected {
 
        debug_assert!(expected.is_ascii());
 
        return Err(ParseError::new_error_at_pos(
 
            source, iter.last_valid_pos(),
 
            format!("expected the text '{}'", &String::from_utf8_lossy(expected))
 
        ));
 
    }
 
    Ok(pos)
 
}
 

	
 
/// Consumes an identifier that is not a reserved keyword and returns it
 
/// together with its span.
 
pub(crate) fn consume_ident<'a>(
 
    source: &'a InputSource, iter: &mut TokenIter
 
) -> Result<(&'a [u8], InputSpan), ParseError> {
 
    let (ident, span) = consume_any_ident(source, iter)?;
 
    if is_reserved_keyword(ident) {
 
        return Err(ParseError::new_error_str_at_span(source, span, "encountered reserved keyword"));
 
    }
 

	
 
    Ok((ident, span))
 
}
 

	
 
/// Consumes an identifier and immediately intern it into the `StringPool`
 
pub(crate) fn consume_ident_interned(
 
    source: &InputSource, iter: &mut TokenIter, ctx: &mut PassCtx
 
) -> Result<Identifier, ParseError> {
 
    let (value, span) = consume_ident(source, iter)?;
 
    let value = ctx.pool.intern(value);
 
    Ok(Identifier{ span, value })
 
}
 

	
 
fn is_reserved_definition_keyword(text: &[u8]) -> bool {
 
    match text {
 
        KW_STRUCT | KW_ENUM | KW_UNION | KW_FUNCTION | KW_PRIMITIVE | KW_COMPOSITE => true,
 
        _ => false,
 
    }
 
}
 

	
 
fn is_reserved_statement_keyword(text: &[u8]) -> bool {
 
    match text {
 
        KW_IMPORT | KW_AS |
 
        KW_STMT_CHANNEL | KW_STMT_IF | KW_STMT_WHILE |
 
        KW_STMT_BREAK | KW_STMT_CONTINUE | KW_STMT_GOTO | KW_STMT_RETURN |
 
        KW_STMT_SYNC | KW_STMT_ASSERT | KW_STMT_NEW => true,
 
        _ => false,
 
    }
 
}
 

	
 
fn is_reserved_expression_keyword(text: &[u8]) -> bool {
 
    match text {
 
        KW_LET |
 
        KW_LIT_TRUE | KW_LIT_FALSE | KW_LIT_NULL |
 
        KW_FUNC_GET | KW_FUNC_PUT | KW_FUNC_FIRES | KW_FUNC_CREATE | KW_FUNC_LENGTH => true,
 
        _ => false,
 
    }
 
}
0 comments (0 inline, 0 general)