Source code for ratus.token
from dataclasses import dataclass
from enum import Enum
from typing import Any, List

class TokeniserError(Exception):
    """Error raised in the tokeniser."""

class TokenType(Enum):
    """Token type."""

    LEFT_PAREN = "("
    RIGHT_PAREN = ")"
    COMMA = ","
    PLUS = "+"
    MINUS = "-"
    SLASH = "/"
    STAR = "*"
    BANG = "!"
    EQUAL = "="
    BANG_EQUAL = "!="
    GREATER = ">"
    GREATER_EQUAL = ">="
    LESS = "<"
    LESS_EQUAL = "<="
    INT = "int"
    FLOAT = "float"
    STRING = "string"
    IDENT = "ident"
    AND = "and"
    OR = "or"
    EOF = "eof"

@dataclass
class Token:
    """Representation of a token."""

    token_type: TokenType
    lexeme: str

@dataclass
class TokenLiteral(Token):
    """Representation of a literal."""

    literal: Any

class Tokeniser:
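    """Convert an input string into a flat list of Token objects."""
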
    def __init__(self):
        self.start: int = 0
        self.current: int = 0
        self.source: str = ""
        self.tokens: List[Token] = []

    def tokenise(self, source: str) -> List[Token]:
        """Tokenise an input into a list of tokens."""
        self.source = source
        self.current = 0
        self.start = 0
        self.tokens = []  # Reset so repeated calls don't accumulate tokens
        while self.current < len(self.source):
            self.start = self.current
            self.scan_token()
        return self.tokens

    def scan_token(self):
        c = self.source[self.current]
        self.current += 1
        if c.strip() == "":
            # Skip whitespace
            return
        if c in ("(", ")", ",", "+", "-", "*", "=", "/"):
            # Match characters that are unambiguously single-character tokens
            self.add_token(TokenType(c))
        elif c == "!":
            # "!" may start the two-character operator "!="; guard against
            # running off the end of the source before peeking ahead, and
            # advance past the "=" before adding the token so the lexeme
            # covers both characters
            if self.current < len(self.source) and self.source[self.current] == "=":
                self.current += 1
                self.add_token(TokenType.BANG_EQUAL)
            else:
                self.add_token(TokenType.BANG)
        elif c == "<":
            if self.current < len(self.source) and self.source[self.current] == "=":
                self.current += 1
                self.add_token(TokenType.LESS_EQUAL)
            else:
                self.add_token(TokenType.LESS)
        elif c == ">":
            if self.current < len(self.source) and self.source[self.current] == "=":
                self.current += 1
                self.add_token(TokenType.GREATER_EQUAL)
            else:
                self.add_token(TokenType.GREATER)
        elif c in ("'", '"'):
            self.string()
        elif c.isdigit():
            self.numeric()
        elif c.isalpha():
            self.identifier()
        else:
            raise TokeniserError(f"Unexpected character: '{c}'")

    def add_token(self, token_type: TokenType):
        lexeme = self.source[self.start : self.current]
        token = Token(token_type, lexeme)
        self.tokens.append(token)

    def string(self):
        # The opening quote has already been consumed; the string is only
        # terminated by the same quote character that opened it
        quote = self.source[self.start]
        while self.current < len(self.source) and self.source[self.current] != quote:
            self.current += 1
        if self.current >= len(self.source):
            raise TokeniserError("Unterminated string")
        self.current += 1  # Consume closing quote
        lexeme = self.source[self.start : self.current]
        string = self.source[self.start + 1 : self.current - 1]
        token = TokenLiteral(TokenType.STRING, lexeme, string)
        self.tokens.append(token)

    def numeric(self):
        while self.current < len(self.source) and self.source[self.current].isdigit():
            self.current += 1
        # Invalid to finish expression with "."
        if self.current == len(self.source) - 1 and self.source[self.current] == ".":
            raise TokeniserError("Expression cannot finish with '.'")
        if self.current < len(self.source) and self.source[self.current] == ".":
            # Consume the "." so we can start consuming digits again
            self.current += 1
            if not self.source[self.current].isdigit():
                raise TokeniserError(
                    f"Expected digit after '.', found '{self.source[self.current]}'"
                )
            # Match a float
            while (
                self.current < len(self.source) and self.source[self.current].isdigit()
            ):
                self.current += 1
            float_ = self.source[self.start : self.current]
            token = TokenLiteral(TokenType.FLOAT, float_, float(float_))
            self.tokens.append(token)
        else:
            int_ = self.source[self.start : self.current]
            token = TokenLiteral(TokenType.INT, int_, int(int_))
            self.tokens.append(token)

    def identifier(self):
        # Identifiers can be made up of letters, numbers and '_'
        while self.current < len(self.source) and (
            self.source[self.current].isalpha()
            or self.source[self.current].isdigit()
            or self.source[self.current] == "_"
        ):
            self.current += 1
        ident = self.source[self.start : self.current]
        token = TokenLiteral(TokenType.IDENT, ident, ident)
        self.tokens.append(token)
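
# A minimal usage sketch (not part of the module above): run the tokeniser
# over a small expression and print each token's type and lexeme. The
# expected output below is what the implementation as written here produces;
# "pow" is just an arbitrary identifier, not a function the tokeniser knows.
if __name__ == "__main__":
    tokeniser = Tokeniser()
    for token in tokeniser.tokenise("pow(2, 10) >= 1000"):
        print(token.token_type.name, repr(token.lexeme))
    # Prints, in order: IDENT 'pow', LEFT_PAREN '(', INT '2', COMMA ',',
    # INT '10', RIGHT_PAREN ')', GREATER_EQUAL '>=', INT '1000'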