import re
import copy
from mau.lexers.base_lexer import TokenTypes, Token
from mau.lexers.main_lexer import MainLexer
from mau.parsers.base_parser import (
BaseParser,
TokenError,
ConfigurationError,
parser,
)
from mau.parsers.text_parser import TextParser
from mau.parsers.arguments_parser import ArgumentsParser
from mau.parsers.preprocess_variables_parser import PreprocessVariablesParser
from mau.parsers.nodes import (
HorizontalRuleNode,
TextNode,
BlockNode,
ContentNode,
ContentImageNode,
CommandNode,
HeaderNode,
ListNode,
ListItemNode,
ParagraphNode,
TocNode,
TocEntryNode,
FootnotesNode,
)
class EngineError(ValueError):
    """Signal that the engine selected for a block is not known."""
# The MainParser is in charge of parsing
# the whole input, calling other parsers
# to manage single paragraphs or other
# things like variables.
class MainParser(BaseParser):
def __init__(self, variables=None):
    """Create the lexer, the arguments parser and the block registry.

    ``variables`` is deep-copied when it is truthy, otherwise an empty
    dictionary is used. The resulting dictionary always contains the
    "mau" namespace.

    Raises:
        ConfigurationError: if an entry of mau.block_definitions
            has no "blocktype" or no "kwargs" key.
    """
    super().__init__()

    self.lexer = MainLexer()

    # Attributes are written before the element they belong to,
    # so the arguments parser holds them until the following
    # element consumes them.
    self.argsparser = ArgumentsParser()

    # Work on a copy of the variables and guarantee
    # that the "mau" namespace is always there.
    self.variables = copy.deepcopy(variables) if variables else {}
    self.variables.setdefault("mau", {})

    self.headers = []
    self.footnote_defs = []
    self.blocks = {}
    self.toc = None

    # Three registries describe the known blocks:
    #   block_aliases  {alias: actual_block_name}
    #   block_defaults {actual_block_name: kwargs}
    #   block_names    {actual_block_name: names of unnamed arguments}
    #
    # The "source" entry keeps backward compatibility with Mau 1.x,
    # where [source] was used to format source code. Mau 2.x uses
    # [myblock, engine=source], so the default definition makes
    # [source] equivalent to [source, engine=source].
    # In Mau 2.x this block uses the template "block-source",
    # so any template called "source" (e.g. "source.html")
    # must be renamed.
    # All these definitions can be overridden by custom ones.
    self.block_aliases = {
        "source": "source",
        "admonition": "admonition",
        "quote": "quote",
    }
    self.block_defaults = {
        "source": {"engine": "source", "language": "text"},
        "quote": {"attribution": None},
    }
    self.block_names = {
        "source": ["language"],
        "admonition": ["class", "icon", "label"],
        "quote": ["attribution"],
    }

    # Register the block definitions passed through the variables
    configured_blocks = self.variables["mau"].get("block_definitions", {})
    for alias, definition in configured_blocks.items():
        try:
            blocktype = definition["blocktype"]
        except KeyError:
            raise ConfigurationError(
                f"Block definition '{alias}' is missing key 'blocktype'"
            )
        self.block_aliases[alias] = blocktype

        try:
            defaults = definition["kwargs"]
        except KeyError:
            raise ConfigurationError(
                f"Block definition '{alias}' is missing key 'kwargs'"
            )
        self.block_defaults[blocktype] = defaults

    # Buffer for the title of the next block
    self._title = None

    # Function used to create header anchors; the default one
    # can be replaced through mau.header_anchor_function.
    self.header_anchor = self.variables["mau"].get(
        "header_anchor_function", header_anchor
    )

    self.v1_backward_compatibility = self.variables["mau"].get(
        "v1_backward_compatibility", False
    )
def _pop_title(self):
# This return the title and resets the
# cached one, so no other block will
# use it.
title = self._title
self._title = None
return title
def _push_title(self, title):
# When we parse a title we can store it here
# so that it is available to the next block
# that will use it.
self._title = title
def _collect_lines(self, stop_tokens):
    """Return the lines found before the first stop token.

    Lines are gathered one at a time until the next line begins
    with one of the tokens listed in ``stop_tokens``. This is handy
    for blocks and other elements clearly surrounded by delimiters.
    """
    collected = []

    while True:
        if self.peek_token() in stop_tokens:
            return collected

        # Everything up to the end of line is one entry,
        # then the end of line itself is consumed.
        collected.append(self.collect_join([Token(TokenTypes.EOL)]))
        self.get_token(TokenTypes.EOL)
def _collect_text_content(self):
    """Join all adjacent text lines into a single string.

    Returns None when the next token is not a text token,
    otherwise the values of the text tokens separated by spaces.
    """
    if not self.peek_token_is(TokenTypes.TEXT):
        return None

    pieces = []

    # Each text token is followed by its end of line
    while self.peek_token_is(TokenTypes.TEXT):
        text_token = self.get_token()
        pieces.append(text_token.value)
        self.get_token(TokenTypes.EOL)

    return " ".join(pieces)
def _parse_text_content(self, text):
    """Replace variables in ``text`` and parse it with the TextParser.

    The footnotes found in the text are appended to
    self.footnote_defs and the first node produced by
    the TextParser is returned.
    """
    # First pass: variables replacement
    preprocessor = PreprocessVariablesParser(self.variables)
    preprocessed = preprocessor.analyse(text)
    replaced_text = preprocessed.nodes[0].value

    # Second pass: the actual text parsing. Footnotes are
    # numbered starting after the ones already collected.
    text_parser = TextParser(
        footnotes_start_with=len(self.footnote_defs) + 1,
        v1_backward_compatibility=self.v1_backward_compatibility,
    )
    parsed = text_parser.analyse(replaced_text)

    # The text parser should return a single sentence node
    sentence = parsed.nodes[0]

    # Keep track of the new footnotes
    self.footnote_defs.extend(parsed.footnote_defs)

    return sentence
@parser
def _parse_eol(self):
    """Consume a single end of line."""
    self.get_token(TokenTypes.EOL)
@parser
def _parse_horizontal_rule(self):
    """Parse the horizontal rule "---" and save a HorizontalRuleNode."""
    rule = "---"

    self.get_token(TokenTypes.LITERAL, rule)
    self.get_token(TokenTypes.EOL)

    self._save(HorizontalRuleNode())
@parser
def _parse_single_line_comment(self):
    """Consume a comment on a single line, e.g. ``// A comment``.

    Nothing is saved for a comment.
    """

    def is_comment(value):
        return value.startswith("//")

    self.get_token(TokenTypes.TEXT, check=is_comment)
    self.get_token(TokenTypes.EOL)
@parser
def _parse_multi_line_comment(self):
    """Consume a comment that spans multiple lines.

    The comment is surrounded by two "////" lines

        ////
        A comment
        on multiple lines
        ////

    The collected lines are discarded.
    """
    fence = "////"

    self.get_token(TokenTypes.LITERAL, fence)
    self._collect_lines([Token(TokenTypes.LITERAL, fence), Token(TokenTypes.EOF)])
    self.force_token(TokenTypes.LITERAL, fence)
@parser
def _parse_variable_definition(self):
    """Parse a variable definition and store it in self.variables.

    Supported forms

        :name:value            a variable with a value
        :name:                 a flag set to True
        :!name:                a flag set to False
        :namespace.name:value  a variable inside a namespace

    Only the first dot of the name separates the namespace,
    any other dot is part of the variable name.
    """
    # The name is mandatory and surrounded by colons
    self.get_token(TokenTypes.LITERAL, ":")
    name = self.get_token(TokenTypes.TEXT).value
    self.get_token(TokenTypes.LITERAL, ":")

    # A leading "!" marks a false flag
    negated = name.startswith("!")
    if negated:
        name = name[1:]

    # The value is optional
    raw_value = self.collect_join([Token(TokenTypes.EOL)])

    # A false flag ignores any value, a variable
    # without a value is a true flag.
    if negated:
        value = False
    elif len(raw_value) > 0:
        value = raw_value
    else:
        value = True

    namespace, dot, key = name.partition(".")

    if not dot:
        # No namespace involved
        self.variables[name] = value
        return

    # Create the namespace when it doesn't exist yet
    try:
        self.variables[namespace][key] = value
    except KeyError:
        self.variables[namespace] = {key: value}
@parser
def _parse_command(self):
    """Parse a command in the form ``::command:arguments``.

    The command "defblock" defines a new block: its first two
    unnamed arguments are the alias and the block type, the
    remaining unnamed arguments become the names of the block's
    unnamed arguments and the named ones its defaults.
    Any other command is saved as a CommandNode.
    """
    self.get_token(TokenTypes.LITERAL, "::")
    command = self.get_token(TokenTypes.TEXT).value
    self.get_token(TokenTypes.LITERAL, ":")

    # Arguments are optional
    with self:
        raw_arguments = self.get_token(TokenTypes.TEXT).value
        self.argsparser.analyse(raw_arguments)

    # Consume the arguments
    args, kwargs = self.argsparser.get_arguments_and_reset()

    if command != "defblock":
        self._save(CommandNode(name=command, args=args, kwargs=kwargs))
        return None

    # A block definition needs at least the alias and the block type
    if len(args) < 2:
        self.error(
            "Block definitions require at least two unnamed arguments: ALIAS and BLOCKTYPE"
        )

    alias = args.pop(0)
    blocktype = args.pop(0)

    self.block_aliases[alias] = blocktype
    self.block_defaults[blocktype] = kwargs
    self.block_names[blocktype] = args

    return None
@parser
def _parse_title(self):
    """Parse a title and buffer it for the next block.

    A title is written in one of the forms

        . This is a title
        .This is a title

    The text of the title can contain Mau code, so it is parsed
    with the TextParser. Footnotes defined in the title are
    appended to self.footnote_defs, so that they are not lost and
    the following footnotes are numbered after them.
    """
    # Parse the mandatory dot
    self.get_token(TokenTypes.LITERAL, ".")

    # Parse the optional white spaces
    with self:
        self.get_token(TokenTypes.WHITESPACE)

    # Get the text of the title
    text = self.get_token(TokenTypes.TEXT).value
    self.get_token(TokenTypes.EOL)

    # Titles can contain Mau code
    p = TextParser(
        footnotes_start_with=len(self.footnote_defs) + 1,
        v1_backward_compatibility=self.v1_backward_compatibility,
    ).analyse(text)
    title = p.nodes[0]

    # Store the footnotes: the numbering above is based on the
    # length of self.footnote_defs, so skipping this step would
    # assign the same numbers to the next footnotes.
    self.footnote_defs.extend(p.footnote_defs)

    self._push_title(title)
@parser
def _parse_attributes(self):
    """Parse the attributes written before an element.

    Attributes have the form

        [unnamed1, unnamed2, ..., named1=value1, name2=value2, ...]

    Variables are replaced before the attributes are handed to
    the arguments parser, which stores them for the next element.
    """
    self.get_token(TokenTypes.LITERAL, "[")
    raw_attributes = self.get_token(TokenTypes.TEXT).value
    self.get_token(TokenTypes.LITERAL, "]")

    # Attributes can use variables
    preprocessed = PreprocessVariablesParser(self.variables).analyse(
        raw_attributes,
    )
    replaced_attributes = preprocessed.nodes[0].value

    # Parse the arguments
    self.argsparser.analyse(replaced_attributes)
@parser
def _parse_header(self):
    """Parse a header and save a HeaderNode.

    A header has the form

        = Header

    where the number of equal signs is arbitrary and represents
    the level of the header. The anchor of the header is created
    with the function stored in self.header_anchor.

    Headers in the form

        =! Header

    are rendered but not included in the TOC.
    """
    # All the equal signs (and the optional "!")
    marker = self.get_token(
        TokenTypes.LITERAL, check=lambda value: value.startswith("=")
    ).value

    # White spaces are mandatory here
    self.get_token(TokenTypes.WHITESPACE)

    # A trailing "!" keeps the header out of the TOC
    in_toc = not marker.endswith("!")
    if not in_toc:
        marker = marker[:-1]

    # The level is the number of equal signs
    text = self.get_token(TokenTypes.TEXT).value
    level = len(marker)

    anchor = self.header_anchor(text, level)

    # Consume the attributes, only the named ones are used
    _, kwargs = self.argsparser.get_arguments_and_reset()

    node = HeaderNode(value=text, level=level, anchor=anchor, kwargs=kwargs)

    if in_toc:
        self.headers.append(node)

    self._save(node)
@parser
def _parse_block(self):
    """Parse a delimited block and save a BlockNode.

    A block has the form

        [block_type]
        ----
        Content
        ----
        Optional secondary content

    Blocks are delimited by 4 consecutive identical characters.
    The secondary content is made of the lines that follow the
    closing delimiter, up to the first empty line or the end of
    the input.

    The following named attributes are extracted and processed
    here: "classes" (comma-separated list), "condition"
    ("test:variable:value" or "test:variable:"), "preprocessor"
    (default "none") and "engine" (default "default").
    When the condition is not satisfied nothing is saved.

    Raises:
        TokenError: if the first token is not made of 4
            identical characters.
        EngineError: if the requested engine is not one of
            "raw", "mau", "source", "default".
    """
    # Get the delimiter and check the length
    delimiter = self.get_token(TokenTypes.TEXT).value

    if len(delimiter) != 4 or len(set(delimiter)) != 1:
        raise TokenError

    self.get_token(TokenTypes.EOL)

    # Collect everything until the next delimiter
    content = self._collect_lines(
        [Token(TokenTypes.TEXT, delimiter), Token(TokenTypes.EOF)]
    )
    self.force_token(TokenTypes.TEXT, delimiter)
    self.get_token(TokenTypes.EOL)

    # Get the optional secondary content
    secondary_content = self._collect_lines(
        [Token(TokenTypes.EOL), Token(TokenTypes.EOF)]
    )

    # Consume the title
    title = self._pop_title()

    # The first unnamed argument is the block type
    blocktype = self.argsparser.pop()

    # If there is a block alias for blocktype replace it
    # otherwise use the blocktype we already have
    blocktype = self.block_aliases.get(blocktype, blocktype)

    # Assign names
    self.argsparser.set_names_and_defaults(
        self.block_names.get(blocktype, []), self.block_defaults.get(blocktype, {})
    )

    # Consume the attributes
    args, kwargs = self.argsparser.get_arguments_and_reset()

    # Extract classes and convert them into a list
    classes = [i for i in kwargs.pop("classes", "").split(",") if len(i) > 0]

    # Extract condition if present and process it
    condition = kwargs.pop("condition", "")

    # Run this only if there is a condition on this block
    if len(condition) > 0:
        try:
            # The condition should be either test:variable:value or test:variable:
            test, variable, value = condition.split(":")
        except ValueError:
            self.error(
                f'Condition {condition} is not in the form "test:variable:value" or "test:variable:"'
            )

        # If there is no value use True
        if len(value) == 0:
            value = True

        # Check if the variable matches the value and apply the
        # requested test. Any test different from "if" requires
        # the variable NOT to match the value.
        match = self.variables.get(variable) == value
        result = test == "if"

        # If the condition is not satisfied return
        if match is not result:
            return

    # Extract the preprocessor
    preprocessor = kwargs.pop("preprocessor", "none")

    # Extract the engine
    engine = kwargs.pop("engine", "default")

    # Create the node parameters according to the engine
    if engine in ["raw", "mau"]:
        # Engine "raw" doesn't process the content,
        # so we just pass it untouched in the form of
        # a TextNode per line. The same is true for "mau"
        # as the visitor will have to fire up a new parser
        # to process the content.
        content = [TextNode(line) for line in content]
        secondary_content = [TextNode(line) for line in secondary_content]
    elif engine == "source":
        # Engine "source" extracts the content (source code),
        # the callouts, and the highlights.
        # The default language is "text".
        content, callouts, highlights = self._parse_source_engine(
            content, secondary_content, kwargs
        )
        secondary_content = []

        kwargs["callouts"] = callouts
        kwargs["highlights"] = highlights
        kwargs["language"] = kwargs.get("language", "text")
    elif engine == "default":
        # This is the default engine and it parses
        # both content and secondary content using a new parser
        # but then merges headers and footnotes of the primary
        # content into the current one.
        # NOTE(review): footnotes and headers found in the
        # secondary content are not merged - confirm this is intended.
        pc = MainParser(variables=self.variables).analyse("\n".join(content))
        ps = MainParser(variables=self.variables).analyse(
            "\n".join(secondary_content)
        )
        content = pc.nodes
        secondary_content = ps.nodes

        self.footnote_defs.extend(pc.footnote_defs)
        self.headers.extend(pc.headers)
    else:
        raise EngineError(f"Engine {engine} is not available")

    self._save(
        BlockNode(
            blocktype=blocktype,
            content=content,
            secondary_content=secondary_content,
            args=args,
            classes=classes,
            engine=engine,
            preprocessor=preprocessor,
            kwargs=kwargs,
            title=title,
        )
    )
def _parse_source_engine(self, content, secondary_content, kwargs):
    """Extract callouts and highlights from the lines of a source block.

    A source block has the form

        [source, language, attributes...]
        ----
        content
        ----

    and supports the following attributes, which are removed
    from ``kwargs``

        callouts=":"   The separator used by callouts
        highlight="@"  The special character to turn on highlight

    A line that ends with ``<separator>name<separator>`` carries
    a callout

        content:1:

    unless the name is the highlight marker, in which case
    the line is highlighted

        content:@:

    Callout descriptions can be added to the block as secondary
    content, one per line, with the syntax

        <name>: <description>

    The description is everything after the first colon, so it
    can contain colons itself. An empty separator disables the
    detection of callouts and highlights.

    Callout and highlight markers are stripped from ``content``
    in place.

    Returns:
        A tuple ``(textlines, callouts, highlighted_lines)`` where
        textlines is a list with a TextNode for each line,
        callouts is the dictionary
        ``{"markers": {linenum: name}, "contents": {name: text}}``
        and highlighted_lines is the list of the indexes of the
        highlighted lines.
    """
    # Get the delimiter for callouts (":" by default)
    delimiter = kwargs.pop("callouts", ":")

    # Get the marker for highlighted lines ("@" by default)
    highlight_marker = kwargs.pop("highlight", "@")

    # A dictionary that contains callout markers in
    # the form {linenum:name}
    callout_markers = {}

    # A list of highlighted lines
    highlighted_lines = []

    for linenum, line in enumerate(content):
        # An empty delimiter can't mark anything, and only lines
        # that end with the delimiter might contain a callout.
        if not delimiter or not line.endswith(delimiter):
            continue

        # Remove the final delimiter, whatever its length
        stripped = line[: -len(delimiter)]

        # Split the callout name from the rest of the line
        code, separator, callout_name = stripped.rpartition(delimiter)

        if not separator:
            # It's a trap! There are no separators left
            continue

        content[linenum] = code

        # Check if we want to just highlight the line
        if callout_name == highlight_marker:
            highlighted_lines.append(linenum)
        else:
            callout_markers[linenum] = callout_name

    # A dictionary that contains the text for each
    # marker in the form {name:text}
    callout_contents = {}

    # The names of the callouts created in the source code
    known_names = set(callout_markers.values())

    # If there was secondary content it should be formatted
    # with callout names followed by colon and the
    # callout text.
    for line in secondary_content:
        name, separator, text = line.partition(":")

        if not separator:
            self.error(
                f"Callout description should be written as 'name: text'. Missing ':' in '{line}'"
            )

        if name not in known_names:
            self.error(f"Callout {name} has not been created in the source code")

        callout_contents[name] = text.strip()

    # Put markers and contents together
    callouts = {"markers": callout_markers, "contents": callout_contents}

    # Source blocks must preserve the content literally
    textlines = [TextNode(line) for line in content]

    return textlines, callouts, highlighted_lines
@parser
def _parse_content(self):
    """Parse attached content.

    Attached content has the form

        [attributes]
        << content_type:uri

    Only the first colon separates the content type from the URI.
    The type "image" is handled by _parse_content_image, any other
    type by _parse_standard_content.
    """
    # The mandatory "<<" followed by white spaces
    self.get_token(TokenTypes.LITERAL, check=lambda value: value.startswith("<<"))
    self.get_token(TokenTypes.WHITESPACE)

    # Content type and URI come in a single text token
    descriptor = self.get_token(TokenTypes.TEXT).value
    content_type, uri = descriptor.split(":", maxsplit=1)

    title = self._pop_title()

    if content_type != "image":
        return self._parse_standard_content(content_type, uri, title)

    return self._parse_content_image(uri, title)
def _parse_content_image(self, uri, title):
    """Save a ContentImageNode for an attached image.

    An attached image has the form

        [alt_text, classes]
        << image:uri

    where alt_text is the alternate text to use if the image is
    not reachable and classes is a comma-separated list of classes.
    Both default to None.
    """
    # Name the unnamed arguments, then consume the attributes
    self.argsparser.set_names_and_defaults(
        ["alt_text", "classes"], {"alt_text": None, "classes": None}
    )
    _, kwargs = self.argsparser.get_arguments_and_reset()

    alt_text = kwargs.pop("alt_text")
    raw_classes = kwargs.pop("classes")

    # Classes become a list only when some are given
    classes = raw_classes.split(",") if raw_classes else raw_classes

    node = ContentImageNode(
        uri=uri,
        alt_text=alt_text,
        classes=classes,
        title=title,
        kwargs=kwargs,
    )
    self._save(node)
def _parse_standard_content(self, content_type, uri, title):
    """Save a ContentNode, the fallback for an unknown content type.

    The stored attributes are consumed and passed to the node.
    ``content_type`` is accepted but not forwarded to the node.
    """
    args, kwargs = self.argsparser.get_arguments_and_reset()

    node = ContentNode(
        uri=uri,
        title=title,
        args=args,
        kwargs=kwargs,
    )
    self._save(node)
@parser
def _parse_list(self):
    """Parse a list and save the main ListNode.

    Lists can be unordered (using bullets)

        * One item
        * Another item

    or ordered (using numbers)

        # Item 1
        # Item 2

    The number of header characters gives the depth of each item

        # Item 1
        ## Sub-Item 1.1

    White spaces before the header are ignored, so nested
    items can be indented

        # Item 1
          ## Sub-Item 1.1

    Ordered and unordered lists can be mixed

        * One item
        ## Sub Item 1
        ## Sub Item 2
    """
    # Initial white spaces are optional
    with self:
        self.get_token(TokenTypes.WHITESPACE)

    # The first header decides if the list is numbered
    first_header = self.peek_token(
        TokenTypes.LITERAL, check=lambda value: value[0] in "*#"
    )
    numbered = first_header.value[0] == "#"

    # Parse all the items
    items = self._parse_list_nodes()

    self._save(ListNode(numbered, items, main_node=True))
def _parse_list_nodes(self):
    """Parse the items of a list and return them as ListItemNodes.

    The level of the list is the length of the header of its
    first item. Items with a longer header are parsed recursively
    as a new list, which is appended to the content of the
    previous item. An item with a shorter header ends the list,
    as does an empty line or the end of the input.
    """
    # Initial white spaces are optional
    with self:
        self.get_token(TokenTypes.WHITESPACE)

    # The header is followed by mandatory white spaces
    first_header = self.get_token(
        TokenTypes.LITERAL, check=lambda value: value[0] in "*#"
    ).value
    self.get_token(TokenTypes.WHITESPACE)

    # Collect and parse the text of the first item
    first_text = self._collect_text_content()
    first_content = self._parse_text_content(first_text)

    # The first item sets the level of this list
    level = len(first_header)

    items = [ListItemNode(level, first_content)]

    while self.peek_token() not in [Token(TokenTypes.EOF), Token(TokenTypes.EOL)]:
        # The sentence inside the latest item, where
        # a potential nested list is appended
        parent_sentence = items[-1].content

        # Initial white spaces are optional
        with self:
            self.get_token(TokenTypes.WHITESPACE)

        next_level = len(self.peek_token().value)

        if next_level < level:
            # The item belongs to an outer list
            break

        if next_level == level:
            # Another item of this list
            header = self.get_token().value
            self.get_token(TokenTypes.WHITESPACE)

            text = self._collect_text_content()
            content = self._parse_text_content(text)

            items.append(ListItemNode(len(header), content))
        else:
            # A deeper item starts a nested list
            numbered = self.peek_token().value[0] == "#"
            nested_items = self._parse_list_nodes()

            parent_sentence.content.append(ListNode(numbered, nested_items))

    return items
@parser
def _parse_paragraph(self):
    """Parse a paragraph and save a ParagraphNode.

    Paragraphs can be written on multiple lines
    and end with an empty line.
    """
    terminators = [Token(TokenTypes.EOL), Token(TokenTypes.EOF)]

    # All the lines of the paragraph become a single text
    joined = " ".join(self._collect_lines(terminators))
    sentence = self._parse_text_content(joined)

    # Consume the attributes
    args, kwargs = self.argsparser.get_arguments_and_reset()

    self._save(ParagraphNode(sentence, args=args, kwargs=kwargs))
def _parse_functions(self):
# All the functions that this parser provides.
return [
self._parse_eol,
self._parse_horizontal_rule,
self._parse_single_line_comment,
self._parse_multi_line_comment,
self._parse_variable_definition,
self._parse_command,
self._parse_title,
self._parse_attributes,
self._parse_header,
self._parse_block,
self._parse_content,
self._parse_list,
self._parse_paragraph,
]
def _create_toc(self):
    """Create the TOC from the list of headers.

    Each header in self.headers becomes a TocEntryNode that is
    appended to the children of the closest preceding header
    with a lower level. Headers without such an ancestor are
    entries of the TOC itself.

    Returns:
        A TocNode that contains the top-level entries.
    """
    entries = []

    # The chain of ancestors currently open, in the
    # form {level: node}
    open_by_level = {}

    for header_node in self.headers:
        node = TocEntryNode(header_node)
        level = header_node.level

        # A header closes all the entries with the same or a
        # deeper level. Without this step a header that skips a
        # level (e.g. levels 1, 2, 1, 3) would be attached to
        # an entry that belongs to a previous branch.
        for closed in [i for i in open_by_level if i >= level]:
            del open_by_level[closed]

        if open_by_level:
            # All the remaining levels are lower than
            # this one, the nearest is the highest.
            parent = open_by_level[max(open_by_level)]
            parent.children.append(node)
        else:
            entries.append(node)

        open_by_level[level] = node

    return TocNode(entries=entries)
def parse(self):
    """Run the parent class' parse(), then build TOC and footnotes.

    The TOC is stored in self.toc and the FootnotesNode that
    contains the collected footnotes in self.footnotes.
    """
    super().parse()

    self.toc = self._create_toc()
    self.footnotes = FootnotesNode(entries=self.footnote_defs)