import re
import string
from functools import partial
from collections.abc import Sequence
from mau import text_buffer
[docs]class TokenTypes:
EOL = "EOL"
EOF = "EOF"
LITERAL = "LITERAL"
TEXT = "TEXT"
WHITESPACE = "WHITESPACE"
[docs]class LexerError(ValueError):
pass
[docs]class Token:
"""
This represents a token.
Tokens have a type, a value (the actual characters), and a position
in the global text, expressed as a tuple of line and column
"""
def __init__(self, _type, value=None, position=None):
self.type = _type
self.position = position
# This ensures things like numbers or other
# strange beasts are just treated as text.
self.value = str(value) if value is not None else None
def __str__(self):
position_string = ""
if self.position:
position_string = f", line={self.position[0]}, col={self.position[1]}"
value_string = ""
if self.value is not None:
value_string = f", '{self.value}'"
return f"Token({self.type}{value_string}{position_string})"
__repr__ = __str__
def __eq__(self, other):
if other.value is None:
return self.type == other.type
return (self.type, self.value) == (
other.type,
other.value,
)
def __hash__(self):
return hash((self.type, self.value))
def __len__(self):
if self.value:
return len(self.value)
return 0
def __bool__(self):
return True
# These are convenient shortcuts
EOL = Token(TokenTypes.EOL)
EOF = Token(TokenTypes.EOF)
WS = partial(Token, TokenTypes.WHITESPACE)
Text = partial(Token, TokenTypes.TEXT)
Literal = partial(Token, TokenTypes.LITERAL)
[docs]class BaseLexer:
"""
The base class for lexers.
The lexer decomposes the input text into a list of tokens
and provides basic navigation functions in the
output results.
"""
def __init__(self, initial_position=None):
# Use a TextBuffer internally to manage the text
self._text_buffer = text_buffer.TextBuffer()
# A buffer of tokens, useful when you need to
# collect them but later post-process them
# before you actually store them as result.
self._buffer = []
self._initial_position = initial_position or (0, 0)
# These are the tokens identified so far
self.tokens = []
@property
def _token_position(self):
# This returns the token position taking into
# account that the initial position might
# not be (0,0)
return tuple(map(sum, zip(self._text_buffer.position, self._initial_position)))
[docs] def context(self, token):
"""Returns the context of a token for error reporting purposes"""
return self._text_buffer.context(*token.position)
[docs] def process(self, text):
# Reset the lexer
self.tokens = []
# Load the text into the TextBuffer
self._text_buffer.load(text)
# Process tokens until we reach the end of file
self._process()
while True:
# Preprocess functions can return no tokens
if len(self.tokens) > 0 and self.tokens[-1].type is TokenTypes.EOF:
break
self._process()
def _process(self):
# This should not be touched by child classes
# as it is the core of the lexer. It tries
# each function in the list returned by
# _process_functions and stores all the resulting
# tokens. A parsing function must return None
# when characters do not match the rules.
process_functions = self._process_functions()
process_functions.append(self._process_error)
for process_func in process_functions:
# This ensures result is always either None or a list
result = self._wrap(process_func())
if result is None:
continue
self.tokens.extend(result)
return
def _wrap(self, result):
# Makes sure the result is either None or a list of tokens
# which makes processing function that return a single token
# more readable.
if result is None:
return
if not isinstance(result, Sequence):
return [result]
return result
def _nextline(self):
# Carriage return =) go to column 0
self._initial_position = (self._initial_position[0], 0)
# Skip the whole line including the EOL
self._text_buffer.nextline()
def _skip(self, steps=1):
# Skip only the given amount of characters
self._text_buffer.skip(steps)
@property
def _current_char(self):
# Return the current character
return self._text_buffer.current_char
@property
def _current_line(self):
# Return the current line
return self._text_buffer.current_line
@property
def _tail(self):
# A wrapper to return the rest of the line
return self._text_buffer.tail
def _create_token(self, token_type, token_value=None):
# A wrapper to create a token with the current position.
# This doesn't affect the position in the text being lexed
return Token(token_type, token_value, position=self._token_position)
def _create_token_and_skip(self, token_type, token_value=None, skip_value=None):
# Create the token and skip the characters in the text
# This skips the first non-None value between skip_value, token_type and the empty string
skip = next(x for x in [skip_value, token_value, ""] if x is not None)
# This creates the token
token = self._create_token(token_type, token_value)
# Perform the right skip
if token_type == TokenTypes.EOL:
self._nextline()
else:
self._skip(len(skip))
return token
def _store(self, token_type, token_value=None, skip_value=None):
# Create and skip a token, then store it in the buffer
self._buffer.append(
self._create_token_and_skip(token_type, token_value, skip_value)
)
def _pop(self):
# Get the content of the buffer
tokens = list(self._buffer)
self._buffer = []
return tokens
def _process_eof(self):
try:
self._current_line
except text_buffer.EOFError:
return self._create_token_and_skip(TokenTypes.EOF)
def _process_eol(self):
try:
self._current_char
except text_buffer.EOLError:
return self._create_token_and_skip(TokenTypes.EOL)
def _process_character(self):
if self._current_char not in string.ascii_letters:
return None
self._store(TokenTypes.TEXT, self._current_char)
return self._pop()
def _process_whitespace(self):
regexp = re.compile(r"\ +")
match = regexp.match(self._tail)
if not match:
return None
self._store(TokenTypes.WHITESPACE, match.group())
return self._pop()
def _process_functions(self):
return [
self._process_eof,
self._process_eol,
self._process_whitespace,
self._process_character,
]
def _process_error(self):
raise LexerError(f'Can\'t process "{self._tail}"')
def _insert(self, text):
self._text_buffer.insert(text)
def _rematch(self, regexp):
# Compile the regexp and get a match on the current line
regexp = re.compile(regexp)
return regexp.match(self._current_line)