Source code for ratus.token

from dataclasses import dataclass
from enum import Enum
from typing import Any, List


class TokeniserError(Exception):
    """Error raised in the tokeniser."""
class TokenType(Enum):
    """Token type.

    For punctuation and operators, each member's value is the exact source
    text of the token, so ``TokenType("(")`` looks up the member for a
    character. The remaining members use a descriptive lowercase name as
    their value.
    """

    # Grouping and separators
    LEFT_PAREN = "("
    RIGHT_PAREN = ")"
    COMMA = ","

    # Arithmetic operators
    PLUS = "+"
    MINUS = "-"
    SLASH = "/"
    STAR = "*"

    # Logical / comparison operators
    BANG = "!"
    EQUAL = "="
    BANG_EQUAL = "!="
    GREATER = ">"
    GREATER_EQUAL = ">="
    LESS = "<"
    LESS_EQUAL = "<="

    # Literals and names
    INT = "int"
    FLOAT = "float"
    STRING = "string"
    IDENT = "ident"

    # Keywords
    AND = "and"
    OR = "or"

    # End of input
    EOF = "eof"
@dataclass
class Token:
    """Representation of a token."""

    # Which kind of token this is.
    token_type: TokenType
    # The slice of the source text this token was scanned from.
    lexeme: str
@dataclass
class TokenLiteral(Token):
    """Representation of a literal.

    Extends :class:`Token` with the value the lexeme stands for.
    """

    # Parsed value of the lexeme. The tokeniser stores an ``int`` for INT,
    # a ``float`` for FLOAT, the unquoted text for STRING and the name
    # itself for IDENT.
    literal: Any
class Tokeniser:
    """Scanner that turns an expression string into a list of tokens.

    Scanning state kept on the instance:

    * ``source`` - the text being scanned.
    * ``start`` - index of the first character of the token being scanned.
    * ``current`` - index of the next unread character.
    * ``tokens`` - tokens produced so far for the current ``source``.

    NOTE(review): nothing in this class emits ``TokenType.AND``,
    ``TokenType.OR`` or ``TokenType.EOF``; the words ``and`` / ``or`` leave
    :meth:`identifier` as ``IDENT`` tokens. Confirm the consumer expects that.
    """

    def __init__(self):
        self.start: int = 0
        self.current: int = 0
        self.source: str = ""
        self.tokens: List[Token] = []

    def tokenise(self, source: str) -> List[Token]:
        """Tokenise an input in a list of tokens.

        Args:
            source: The expression text to scan.

        Returns:
            The tokens found in ``source``, in order of appearance.

        Raises:
            TokeniserError: On an unexpected character, an unterminated
                string or a malformed number.
        """
        self.source = source
        self.current = 0
        self.start = 0
        # Start from a fresh list so that reusing the instance neither returns
        # tokens from an earlier call nor mutates a list handed out before.
        self.tokens = []
        while self.current < len(self.source):
            self.start = self.current
            self.scan_token()
        return self.tokens

    def _match(self, expected: str) -> bool:
        """Consume the next character if, and only if, it is ``expected``."""
        if (
            self.current < len(self.source)
            and self.source[self.current] == expected
        ):
            self.current += 1
            return True
        return False

    def _consume_digits(self) -> None:
        """Advance ``current`` past a (possibly empty) run of decimal digits."""
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int()/float() cannot convert.
        while (
            self.current < len(self.source)
            and self.source[self.current].isdecimal()
        ):
            self.current += 1

    def scan_token(self):
        """Scan one token starting at ``current`` and append it to ``tokens``.

        Whitespace is consumed without producing a token.

        Raises:
            TokeniserError: If the character cannot start any token.
        """
        c = self.source[self.current]
        self.current += 1
        if c.strip() == "":
            # Skip whitespace
            return
        if c in ("(", ")", ",", "+", "-", "*", "=", "/"):
            # Match characters that are unambiguously only single characters
            self.add_token(TokenType(c))
        elif c == "!":
            # _match() consumes the "=" *before* add_token() slices the
            # lexeme, and is safe when the operator is the last character.
            self.add_token(
                TokenType.BANG_EQUAL if self._match("=") else TokenType.BANG
            )
        elif c == "<":
            self.add_token(
                TokenType.LESS_EQUAL if self._match("=") else TokenType.LESS
            )
        elif c == ">":
            self.add_token(
                TokenType.GREATER_EQUAL
                if self._match("=")
                else TokenType.GREATER
            )
        elif c in ("'", '"'):
            self.string()
        elif c.isdecimal():
            self.numeric()
        elif c.isalpha():
            self.identifier()
        else:
            raise TokeniserError(f"Unexpected character: '{c}'")

    def add_token(self, token_type: TokenType):
        """Append a plain token whose lexeme is ``source[start:current]``."""
        lexeme = self.source[self.start : self.current]
        token = Token(token_type, lexeme)
        self.tokens.append(token)

    def string(self):
        """Scan a quoted string whose opening quote is at ``start``.

        The string ends at the next occurrence of the *same* quote character
        that opened it, so the other quote character may appear inside it.
        There is no escape syntax.

        Raises:
            TokeniserError: If no closing quote is found.
        """
        quote = self.source[self.start]
        closing = self.source.find(quote, self.current)
        if closing == -1:
            raise TokeniserError("Unterminated string")
        # Consume everything up to and including the closing quote
        self.current = closing + 1
        lexeme = self.source[self.start : self.current]
        # The literal is the lexeme without its surrounding quotes
        token = TokenLiteral(TokenType.STRING, lexeme, lexeme[1:-1])
        self.tokens.append(token)

    def numeric(self):
        """Scan an integer or float literal whose first digit is at ``start``.

        Raises:
            TokeniserError: If a ``.`` is the last character of the input or
                is not followed by a digit.
        """
        self._consume_digits()
        if self._match("."):
            # Invalid to finish expression with "."
            if self.current >= len(self.source):
                raise TokeniserError("Expression cannot finish with '.'")
            if not self.source[self.current].isdecimal():
                raise TokeniserError(
                    f"Expected digit after '.', found '{self.source[self.current]}'"
                )
            # Match the fractional part of a float
            self._consume_digits()
            float_ = self.source[self.start : self.current]
            token = TokenLiteral(TokenType.FLOAT, float_, float(float_))
        else:
            int_ = self.source[self.start : self.current]
            token = TokenLiteral(TokenType.INT, int_, int(int_))
        self.tokens.append(token)

    def identifier(self):
        """Scan an identifier whose first letter is at ``start``.

        The token's literal is the identifier text itself.
        """
        # Identifiers can be made up of letters, numbers and '_'
        while self.current < len(self.source) and (
            self.source[self.current].isalpha()
            or self.source[self.current].isdigit()
            or self.source[self.current] == "_"
        ):
            self.current += 1
        ident = self.source[self.start : self.current]
        token = TokenLiteral(TokenType.IDENT, ident, ident)
        self.tokens.append(token)