Source code for rads.config.xml_parsers

"""Parser combinators for reading XML files.

This module is heavily based on PEGTL_, a parser combinator library for C++.

.. _PEGTL: https://github.com/taocpp/PEGTL


Exceptions
----------

==============================  ===========================================
Name                            Description
==============================  ===========================================
:class:`TerminalXMLParseError`  Parse error that fails parsing of XML file.
:class:`XMLParseError`          Parse error that fails a single parser.
==============================  ===========================================


Parser Combinators (class based API)
------------------------------------

===================  ==============================================
Name                 Description
===================  ==============================================
:class:`Parser`      Base class of all parser combinators.
:class:`Apply`       Apply a function the value result of a parser.
:class:`Lazy`        Delay parser construction until evaluation.
:class:`Must`        Require parser to succeed.
:class:`At`          Non consuming match.
:class:`Not`         Invert a parser match, non consuming.
:class:`Repeat`      Match zero or more times (greedily).
:class:`Sequence`    Chain parsers together, all or nothing match.
:class:`Alternate`   Chain parsers together, taking first success.
:class:`Success`     Always succeeds, non consuming.
:class:`Failure`     Always fails, non consuming.
:class:`Start`       Match beginning of XML block.
:class:`End`         Match end of XML block.
:class:`AnyElement`  Match any XML element.
:class:`Tag`         Match an XML element by it's tag name.
===================  ==============================================


Parser Combinators (function based API)
---------------------------------------

===================  ===================================================
Name                 Description
===================  ===================================================
:func:`lazy`         Delay parser construction until evaluation.
:func:`at`           Non consuming match.
:func:`not_at`       Invert a parser match, non consuming.
:func:`opt`          Optional match, always succeeds.
:func:`plus`         Match one or more times (greedily).
:func:`seq`          Match a sequence of parsers, all or nothing.
:func:`sor`          Use first matching parser.
:func:`star`         Match zero ore more times (greedily).
:func:`must`         Require parser to succeed.
:func:`rep`          Match N times.
:func:`until`        Consume elements until a match, match not consumed.
:func:`failure`      Always fails, non consuming.
:func:`success`      Always succeeds, non consuming.
:func:`start`        Match beginning of XML block.
:func:`end`          Match end of XML block.
:func:`any`          Match any XML element.
:func:`tag`          Match an XML element by it's tag name.
===================  ===================================================
"""
import typing
from abc import ABC, abstractmethod
from typing import (
    Any,
    Callable,
    Collection,
    List,
    MutableSequence,
    NoReturn,
    Optional,
    Tuple,
    Union,
    cast,
)

import yzal

from rads.xml.base import Element

__all__ = [
    "TerminalXMLParseError",
    "XMLParseError",
    "next_element",
    "first_child",
    "Parser",
    "Apply",
    "Lazy",
    "Must",
    "At",
    "Not",
    "Repeat",
    "Sequence",
    "Alternate",
    "Success",
    "Failure",
    "Start",
    "End",
    "AnyElement",
    "Tag",
    "lazy",
    "at",
    "not_at",
    "opt",
    "plus",
    "seq",
    "sor",
    "star",
    "must",
    "rep",
    "until",
    "failure",
    "success",
    "start",
    "end",
    "any",
    "tag",
]


[docs]class TerminalXMLParseError(Exception): """A parse error that should fail parsing of the entire file.""" file: Optional[str] = None """Filename where the error occurred.""" line: Optional[int] = None """Line number in the :attr:`file` where the error occurred.""" message: str = "parsing failed" """The message provided when the error was constructed.""" def __init__( self, file: Optional[str], line: Optional[int], message: Optional[str] = None ): """ :param file: Name of file that was being parsed when the error occurred. :param line: Line number in the `file` that was being parsed when the error occurred. :param message: An optional message (instead of the default 'parsing failed') detailing why the parse failed. """ super().__init__() self.file = file self.line = line if message is not None: self.message = message def __str__(self) -> str: """Convert error to a string. :return: Error as a string in the following format: .. code:: text <filename>:<line number>: <message> """ # the reason this is not done in the constructor is to speed up # exception handling when reporting is not necessary. file = self.file if self.file else "" line = str(self.line) if self.line else "" return f"{file}:{line}: {self.message}"
[docs]class XMLParseError(TerminalXMLParseError): """A parse error that signals that a given parser failed. Unlike :class:`TerminalXMLParseError` is expected and simply signals that the parser did not match and another parser should be tried. """
[docs] def terminal(self, message: Optional[str] = None) -> TerminalXMLParseError: """Raise this local failure to a global failure. :param message: Optionally a new message. :return: A global parse failure with the same file and line, and possibly message as this exception. """ if message: return TerminalXMLParseError(self.file, self.line, message) return TerminalXMLParseError(self.file, self.line, self.message)
[docs]@yzal.lazy def next_element(pos: Element) -> Element: """Get next element lazily. :param pos: Current element. :return: Next sibling XML element. :raises XMLParseError: If there is no next sibling element. """ try: return pos.next() # noqa: B304 except StopIteration: raise XMLParseError(pos.file, pos.closing_line, "No more elements.")
[docs]@yzal.lazy def first_child(pos: Element) -> Element: """Get first child of element, lazily.""" try: return pos.down() except StopIteration: raise XMLParseError(pos.file, pos.opening_line, f"<{pos.tag}> has no children")
[docs]class Parser(ABC): """Base parser combinator."""
[docs] @abstractmethod def __call__(self, position: Element) -> Tuple[Any, Element]: """Call the parser, trying to match at the given `position`. If the match fails a :class:`XMLParseError` will be raised. This call will only return if the parser matches at the given `position`. :param position: An XML element that the parser should attempt to match at. :return: A tuple giving the value result of the match (which depends on the particular parser) and the element to match at. The next element can be the same element as given in `position` (a non consuming parser) or any later sibling element. Further, it will actually be a :class:`yzal.Thunk` and will therefore delay it's construction until it is needed. Therefore, any :class:`XMLParseError` that may be generated by moving to a later element will occur when the returned element is used. :raises XMLParseError: If the parser does not match at the given `position`. :raises TerminalXMLParseError: If the parser encounters an unrecoverable error. """
[docs] def __add__(self, other: "Parser") -> "Sequence": """Combine two parsers, matching the first followed by the second. Multiple consecutive uses of '+' will result in a single :class:`Sequence` because the :class:`Sequence` class automatically flattens itself. :param other: The parser to match after this one. :return: A new parser that will match this parser followed by the `other` parser (if the this parser matched). """ return Sequence(self, other)
[docs] def __or__(self, other: "Parser") -> "Alternate": """Combine two parsers, matching the first or the second. Multiple consecutive uses of '|' will result in a single :class:`Alternate` because the :class:`Alternate` class automatically flattens itself. :param other: The parser to match if this parser does not. :return: A new parser that will match either this parser or the `other` parser (if the this parser did not match). """ return Alternate(self, other)
[docs] def __xor__(self, func: Callable[[Any], Any]) -> "Apply": """Apply a function to the value result of this parser. :param func: The function to apply to the value result of matching this parser. .. note:: This will not be ran until this parser is matched. :return: A new parser that will match this parser and upon a successful match apply the given `func` to the value result. """ return Apply(self, func)
[docs] def __invert__(self) -> "Not": """Invert this parser. If this parser would match a given position, now it will not. If it would not match now it will, but it will not consume any elements. :return: A new parser that will not match whenever this parser does, and will match whenever this parser does not. However, it will not consume any elements. """ return Not(self)
[docs] def __lshift__(self, message: str) -> "Must": """Require the parser to succeed. This will convert all :class:`XMLParseError` s to :class:`TerminalXMLParseError` s. :param message: The message that will be raised if the parser does not match. :return: A new parser that will elevate any local parse failures to global failures and overwrite their message with `message`. """ return Must(self, message)
[docs]class Apply(Parser): """Apply a function to the value result of the parser.""" _catch: Optional[Union[type, Tuple[type, ...]]] = None def __init__( self, parser: Parser, func: Callable[[Any], Any], catch: Optional[Collection[type]] = None, ): """ :param parser: The parser whose value result to apply the given `func` to the value result of. :param func: The function to apply. :param catch: An exception or iterable of exceptions to convert into :class:`XMLParseError` s. The default is not to catch any exceptions. To catch all exceptions simply pass :class:`Exception` as it is the base class of all exceptions that should be caught. .. note:: Any exceptions that are derived from the exceptions given will also be caught. """ self._parser = parser self._function = func if catch: if isinstance(catch, type): self._catch = cast(type, catch) else: self._catch = tuple(catch)
[docs] def __call__(self, position: Element) -> Tuple[Any, Element]: # noqa: D102 value, position = self._parser(position) if not self._catch: return self._function(value), position try: return self._function(value), position except Exception as err: if not isinstance(err, self._catch): raise # don't catch this exception raise XMLParseError(position.file, position.opening_line, str(err)) from err
[docs]class Lazy(Parser): """Delay construction of parser until evaluated. .. note:: This lazy behavior is useful when constructing recursive parsers in order to avoid infinite recursion. """ def __init__(self, parser_func: Callable[[], Parser]): """ :param parser_func: A zero argument function that returns a parser when called. This will be used to delay construction of the parser. """ self._parser_func = parser_func self._parser: Optional[Parser] = None
[docs] def __call__(self, position: Element) -> Tuple[Any, Element]: # noqa: D102 if self._parser is None: self._parser = self._parser_func() return self._parser(position)
[docs]class Must(Parser): """Raise a XMLParseError to a TerminalXMLParseError ending parsing.""" def __init__(self, parser: Parser, message: Optional[str] = None): """ :param parser: Parser that must match. :param message: New message to apply to the :class:`GlobalParserFailure` if the parser does not match. """ self._parser = parser self._message = message
[docs] def __call__(self, position: Element) -> Tuple[Any, Element]: # noqa: D102 try: return self._parser(position) except XMLParseError as err: raise err.terminal(self._message)
[docs]class At(Parser): """Match a parser, consuming nothing.""" def __init__(self, parser: Parser): """ :param parser: Parser to match. """ self._parser = parser
[docs] def __call__(self, position: Element) -> Tuple[Any, Element]: # noqa: D102 value, _ = self._parser(position) return value, position
[docs]class Not(Parser): """Invert a parser match, consuming nothing.""" def __init__(self, parser: Parser): """ :param parser: Parser to invert the match of. """ self._parser = parser
[docs] def __call__(self, position: Element) -> Tuple[None, Element]: # noqa: D102 try: self._parser(position) except XMLParseError: return None, position raise XMLParseError(position.file, position.opening_line)
[docs]class Repeat(Parser): """Match a parser zero or more times (greedily).""" def __init__(self, parser: Parser): """ :param parser: Parser to match repeatedly. """ self._parser = parser
[docs] def __call__(self, position: Element) -> Tuple[List[Any], Element]: # noqa: D102 values = [] try: while True: # loop until parse failure value, position = self._parser(position) values.append(value) except XMLParseError: pass # no more matches return values, position
class _MultiParser(Parser, ABC): """Base class of multiple parser combinators.""" def __init__(self, subtype: type, *parsers: Parser): r""" :param subtype: The type of the child parser (the type of parser that subclasses this). :param \*parsers: Parsers to store in the multi parser. """ assert issubclass(subtype, _MultiParser) self._subtype = subtype self._parsers: List[Parser] = [] for parser in parsers: if isinstance(parser, subtype): self._parsers.extend(parser._parsers) else: self._parsers.append(parser) def _append(self, other: Parser) -> None: if isinstance(other, self._subtype): self._parsers.extend(other._parsers) else: self._parsers.append(other)
[docs]class Sequence(_MultiParser): """Chain parsers together, succeeding only if all succeed in order. .. note:: Consecutive Sequence's are automatically flattened. """ def __init__(self, *parsers: Parser): r""" :param \*parsers: Parsers to match in sequence. """ super().__init__(Sequence, *parsers)
[docs] def __call__(self, position: Element) -> Tuple[List[Any], Element]: # noqa: D102 values = [] for parser in self._parsers: value, position = parser(position) values.append(value) return values, position
[docs] def __add__(self, other: Parser) -> "Sequence": """Combine this sequence and a parser, returning a new sequence. .. note:: If the `other` parser is a :class:`Sequence` then the parsers in the `other` :class:`Sequence` will be unwrapped and appended individually. :param other: The parser to combine with this sequence to form the new sequence. :return: A new sequence which matches this sequence followed by the given parser (if the sequence matched). """ return Sequence(*self._parsers, other)
[docs] def __iadd__(self, other: Parser) -> "Sequence": """Combine this sequence with the given parser (in place). .. note:: If the `other` parser is a :class:`Sequence` then the parsers in the `other` :class:`Sequence` will be unwrapped and appended to this sequence individually. :param other: The parser to combine with (append to) this sequence. :return: This sequence parser. """ self._append(other) return self
[docs]class Alternate(_MultiParser): """Match any one of the parsers, stops on first match. .. note:: Consecutive Alternate's are automatically flattened. """ def __init__(self, *parsers: Parser): r""" :param \*parsers: Pool of parsers to find a match in. """ super().__init__(Alternate, *parsers)
[docs] def __call__(self, position: Element) -> Tuple[Any, Element]: # noqa: D102 for parser in self._parsers: try: return parser(position) except XMLParseError: pass raise XMLParseError(position.file, position.opening_line)
[docs] def __or__(self, other: Parser) -> "Alternate": """Combine this alternate and a parser, returning a new alternate. .. note:: If the `other` parser is a :class:`Alternate` then the parsers in the `other` :class:`Alternate` will be unwrapped and added individually. :param other: The parser to combine with this alternate to form the new alternate. :return: A new alternate which matches any parser from this alternate or the given parser (if no parser of this alternate matches). """ return Alternate(*self._parsers, other)
def __ior__(self, other: Parser) -> "Alternate": """Combine this alternate with the given parser (in place). .. note:: If the `other` parser is a :class:`Alternate` then the parsers in the `other` :class:`Alternate` will be unwrapped and added to this alternate individually. :param other: The parser to combine with (append to) this alternate. :return: This alternate parser. """ self._append(other) return self
[docs]class Success(Parser): """Parser that always succeeds, consuming nothing."""
[docs] def __call__(self, position: Element) -> Tuple[None, Element]: # noqa: D102 return None, position
[docs]class Failure(Parser): """Parser that always fails, consuming nothing."""
[docs] def __call__(self, position: Element) -> NoReturn: # noqa: D102 raise XMLParseError(position.file, position.opening_line)
[docs]class Start(Parser): """Match start of an element, consuming nothing."""
[docs] def __call__(self, position: Element) -> Tuple[None, Element]: # noqa: D102 try: prev = position.prev() raise XMLParseError( prev.file, prev.opening_line, "Expected start of element." ) except StopIteration: return None, position
[docs]class End(Parser): """Match end of an element, consuming nothing."""
[docs] def __call__(self, position: Element) -> Tuple[None, Element]: # noqa: D102 try: # Element is really a Thunk[Element] so we need to cast it and # force it's evaluation to determine if a parse error is thrown, # indicating the end of the current level of elements. yzal.strict(cast(yzal.Thunk[Element], position)) except XMLParseError: return None, position raise XMLParseError( position.file, position.closing_line, "Expected end of element." )
[docs]class AnyElement(Parser): """Parser that matches any element."""
[docs] def __call__(self, position: Element) -> Tuple[Element, Element]: # noqa: D102 return yzal.strict(position), next_element(position)
[docs]class Tag(Parser): """Match an element by it's tag name.""" def __init__(self, name: str): """ :param name: Tag name to match. """ self._name = name
[docs] def __call__(self, position: Element) -> Tuple[Element, Element]: # noqa: D102 if position.tag == self._name: return yzal.strict(position), next_element(position) raise XMLParseError(position.file, position.opening_line)
[docs]def lazy(parser_func: Callable[[], Parser]) -> Parser: """Delays construction of parser until evaluated. :param parser_func: A zero argument function that returns a parser when called. This will be used to delay construction of the parser. :return: A new parser that is equivalent to the parser returned by `parser_func`. """ return Lazy(parser_func)
[docs]def at(parser: Parser) -> Parser: """Succeeds if and only if the given parser succeeds, consumes nothing. :param parser: The parser that must succeed. :return: A new parser that succeeds if and only if `parser` succeeds, but does not consume input. """ return At(parser)
[docs]def not_at(parser: Parser) -> Parser: """Succeeds if and only if the given parser fails, consumes nothing. :param parser: The parser that must fail. :return: A new parser that succeeds if and only if `parser` fails, but does not consume input. """ return ~At(parser)
[docs]def opt(parser: Parser) -> Parser: """Parser that always succeeds, regardless of the given parser. :param parser: An optional parser that can succeed or fail. :return: A new parser that optionally matches `parser`. If `parser` succeeds this parser will be transparent, as if `parser` was called directly. If `parser` fails this :func:`opt` returns None as the result and does not consume anything. """ return parser | Success()
[docs]def plus(parser: Parser) -> Parser: """Match the given parser as much as possible, must match at least once. :param parser: Parser to match one or more times (greedily). :return: A new parser that matches `parser` one or more times. Failing if no matches are made. """ return parser + Repeat(parser)
[docs]def seq(*parsers: Parser) -> Parser: r"""Match sequence of parsers in order, succeeding if and only if all succeed. :param \*parsers: One or more parsers to match in order. :return: A new parser that matches all the given `parser`'s in order, failing if any one of the `parser`'s fail. """ return Sequence(*parsers)
[docs]def sor(*parsers: Parser) -> Parser: r"""Match the first of the given parsers, failing if all fail. :param \*parsers: One or more parsers to match. The first parser that succeeds will take the place of this parser. If all fail then this parser will also fail. :return: A new parser that matches the first `parser` that succeeds or fails if all `parser`'s fail. """ return Alternate(*parsers)
[docs]def star(parser: Parser) -> Parser: """Match the given parser as much as possible, can match zero times. :param parser: Parser to match zero or more times (greedily). :return: A new parser that matches `parser` one or more times. Failing if no matches are made. """ return Repeat(parser)
[docs]def must(parser: Parser) -> Parser: """Raise a local parse failure to a global parse failure. Local parse failures (:class:`XMLParseError`) are typically caught by :class:`Alternate` or other such parsers that allow some parser's to fail. In particular, local failures are an expected part of parser combinators and simply signal that a particular parser could not parse the given elements. A global parse failure (:class:`TerminalXMLParseError`) should only be caught at the top level and signals that the entire parse is a failure. :param parser: A parser that must match, else the entire parse is failed. :return: A parser that must succeed, if it fails a :class:`GlobalParserFailure` is raised. """ return Must(parser)
[docs]def rep(parser: Parser, times: int) -> Parser: """Match the given parser a given number of times. Fails if the parser does not succeed the given number of times. :param parser: The parser to match `times`. :param times: Number of times the `parser` must succeed. :return: A parser that succeeds only if the given `parser` matches the given number of `times`. """ return Sequence(*([parser] * times))
[docs]def until(parser: Parser) -> Parser: """Match all elements until the given `parser` matches. Does not consume the elements that the given `parser` matches. :param parser: The parser to end matching with. :return: A parser that will consume all elements until the given `parser` matches. It will not consume the elements that the given `parser` matched. """ def process(elements: typing.Sequence[Element]) -> Element: return elements[-1] def process2( elements: Tuple[MutableSequence[Element], Element] ) -> typing.Sequence[Element]: start_elements, last_element = elements start_elements.append(last_element) return start_elements return ( star(not_at(parser) + not_at(end()) + any() ^ process) + at(parser) ^ process2 )
[docs]def failure() -> Parser: """Parser that always fails. :return: A new parser that always fails, consuming nothing. """ return Failure()
[docs]def success() -> Parser: """Parser that always succeeds. :return: A new parser that always succeeds, consuming nothing. """ return Success()
[docs]def start() -> Parser: """Match the beginning of an element. :return: A new parser that matches the beginning of an element, consuming nothing. """ return Start()
[docs]def end() -> Parser: """Match the end of an element. :return: A new parser that matches the end of an element, consuming nothing. """ return End()
[docs]def any() -> Parser: """Match any element. :return: A new parser that matches any single element. """ return AnyElement()
[docs]def tag(name: str) -> Parser: """Match an element by tag name. :param name: Tag name to match. :return: Parser matching the given `tag` name. """ return Tag(name)