Source code for masci_tools.util.xml.common_functions

###############################################################################
# Copyright (c), Forschungszentrum Jülich GmbH, IAS-1/PGI-1, Germany.         #
#                All rights reserved.                                         #
# This file is part of the Masci-tools package.                               #
# (Material science tools)                                                    #
#                                                                             #
# The code is hosted on GitHub at https://github.com/judftteam/masci-tools.   #
# For further information on the license, see the LICENSE.txt file.           #
# For further information please visit http://judft.de/.                      #
#                                                                             #
###############################################################################
"""
Common functions for parsing input/output files or XMLschemas from FLEUR
"""
from __future__ import annotations

from masci_tools.util.typing import XMLLike, XPathLike, TXPathLike
from lxml import etree
import warnings
import copy
import logging
from typing import Any, TypeVar, cast, overload

from .xpathbuilder import FilterType, XPathBuilder


[docs]def clear_xml(tree: etree._ElementTree) -> tuple[etree._ElementTree, set[str]]: """ Removes comments and executes xinclude tags of an xml tree. :param tree: an xml-tree which will be processed :returns: cleared_tree, an xmltree without comments and with replaced xinclude tags """ cleared_tree = copy.deepcopy(tree) #Remove comments outside the root element (Since they have no parents this would lead to a crash) root = cleared_tree.getroot() prev_sibling = root.getprevious() while prev_sibling is not None: next_elem = prev_sibling.getprevious() if prev_sibling.tag is etree.Comment: root.append(prev_sibling) root.remove(prev_sibling) prev_sibling = next_elem next_sibling = root.getnext() while next_sibling is not None: next_elem = next_sibling.getnext() if next_sibling.tag is etree.Comment: root.append(next_sibling) root.remove(next_sibling) next_sibling = next_elem #find any include tags include_tags = eval_xpath_all(cleared_tree, '//xi:include', etree._Element, namespaces={'xi': 'http://www.w3.org/2001/XInclude'}) parents = [] known_tags = [] for tag in include_tags: parent = tag.getparent() if parent is None: raise ValueError('Could not find parent of included tag') parents.append(parent) known_tags.append({elem.tag for elem in parent if isinstance(elem.tag, str)}) # replace XInclude parts to validate against schema if len(include_tags) != 0: cleared_tree.xinclude() all_included_tags: set[str] = set() # get rid of xml:base attribute in the included parts for parent, old_tags in zip(parents, known_tags): new_tags = {elem.tag for elem in parent if isinstance(elem.tag, str)} #determine the elements not in old_tags, which are in tags #so what should have been included included_tag_names = new_tags.difference(old_tags) #Check for empty set (relax.xml include may not insert something) if not included_tag_names: continue all_included_tags = all_included_tags.union(included_tag_names) for tag_name in included_tag_names: for elem in parent.iterchildren(tag=tag_name): for attribute in elem.attrib.keys(): if 'base' in attribute: elem.attrib.pop(attribute, '') # remove comments from inp.xml comments: list[etree._Element] = cleared_tree.xpath('//comment()') #type:ignore for comment in comments: com_parent = comment.getparent() if com_parent is None: raise ValueError('Could not find parent of comment tag') com_parent.remove(comment) etree.indent(cleared_tree) return cleared_tree, all_included_tags
[docs]def get_inpgen_comments(xmltree: etree._ElementTree) -> list[etree._Element]: """ Get the XML comment element appended after the root of the inp.xml file These contain at the moment the inpgen command line and the content of the inpgen file :param xmltree: representation of the inp.xml :returns: list of XML comments, which appear after the fleurInput tag """ root = xmltree.getroot() comments = [] next_sibling = root.getnext() while next_sibling is not None: next_elem = next_sibling.getnext() if next_sibling.tag is etree.Comment: comments.append(next_sibling) next_sibling = next_elem return comments
[docs]def readd_inpgen_comments(xmltree: etree._ElementTree, comments: list[etree._Element]) -> etree._ElementTree: """ Add the given comments after the fleurInput tag of the inp.xml These contain at the moment the inpgen command line and the content of the inpgen file :param xmltree: representation of the inp.xml :param comments: list of XML comments """ previous = xmltree.getroot() for comment in comments: if not comment.tag is etree.Comment: raise ValueError(f'Invalid tag type. Only Comments allowed. Got: {comment.tag}') previous.addnext(comment) previous = comment return xmltree
[docs]def reverse_xinclude(xmltree, schema_dict, included_tags, **kwargs): """ DEPRECATED ALIAS: Moved to masci_tools.util.schema_dict_util Split the xmltree back up according to the given included tags. The original xmltree will be returned with the corresponding xinclude tags and the included trees are returned in a dict mapping the inserted filename to the extracted tree Tags for which no known filename is known are returned under unknown-1.xml, ... The following tags have known filenames: - `relaxation`: ``relax.xml`` - `kPointLists`: ``kpts.xml`` - `symmetryOperations`: ``sym.xml`` - `atomSpecies`: ``species.xml`` - `atomGroups`: ``atoms.xml`` Additional mappings can be given in the keyword arguments :param xmltree: an xml-tree which will be processed :param schema_dict: Schema dictionary containing all the necessary information :param included_tags: Iterable of str, containing the names of the tags to be excluded :returns: xmltree with the inseerted xinclude tags and a dict mapping the filenames to the excluded trees :raises ValueError: if the tag can not be found in the given xmltree """ from masci_tools.util.schema_dict_util import reverse_xinclude #pylint: disable=redefined-outer-name,cyclic-import warnings.warn('DEPRECATED: reverse_xinclude moved to masci_tools.util.schema_dict_util', DeprecationWarning) return reverse_xinclude(xmltree, schema_dict, included_tags, **kwargs)
[docs]def validate_xml(xmltree: etree._ElementTree, schema: etree.XMLSchema, error_header: str = 'File does not validate') -> None: """ Checks a given xmltree against a schema and produces a nice error message with all the validation errors collected :param xmltree: xmltree of the file to validate :param schema: etree.XMLSchema to validate against :param error_header: str to lead a evtl error message with :raises: etree.DocumentInvalid if the schema does not validate """ from itertools import groupby try: cleared_tree, _ = clear_xml(xmltree) schema.assertValid(cleared_tree) except etree.DocumentInvalid as exc: error_log = sorted(schema.error_log, key=lambda x: x.message) #type: ignore[call-overload] error_output = [] first_occurence = [] for message, group in groupby(error_log, key=lambda x: cast(object, x.message)): err_occurences = list(group) error_message = f'Line {err_occurences[0].line}: {message}' error_lines = '' if len(err_occurences) > 1: error_lines = f"; This error also occurred on the lines {', '.join([str(x.line) for x in err_occurences[1:]])}" error_output.append(f'{error_message}{error_lines} \n') first_occurence.append(err_occurences[0].line) error_output = [line for _, line in sorted(zip(first_occurence, error_output))] errmsg = f"{error_header}: \n{''.join(error_output)}" raise etree.DocumentInvalid(errmsg) from exc
[docs]def eval_xpath(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, logger: logging.Logger | None = None, list_return: bool = False, namespaces: dict[str, str] | None = None, **variables: etree._XPathObject) -> Any: """ Tries to evaluate an xpath expression. If it fails it logs it. If a absolute path is given (starting with '/') and the tag of the node does not match the root. It will try to find the tag in the path and convert it into a relative path :param node: root node of an etree :param xpath: xpath expression (relative, or absolute) :param logger: logger object for logging warnings, errors, if not provided all errors will be raised :param list_return: if True, the returned quantity is always a list even if only one element is in it :param namespaces: dict, passed to namespaces argument in xpath call :returns: text, attribute or a node list """ if isinstance(xpath, XPathBuilder): xpath_str = xpath.path variables = {**variables, **xpath.path_variables} xpath = xpath_str if logger is not None: logger.debug('XPath: %s', xpath) logger.debug('XPath Variables: %s', variables) if not isinstance(node, (etree._Element, etree._ElementTree, etree.XPathElementEvaluator)): if logger is not None: logger.error('Wrong Type for xpath eval; Got: %s', type(node)) raise TypeError(f'Wrong Type for xpath eval; Got: {type(node)}') if namespaces is not None and (isinstance(xpath, etree.XPath) or isinstance(node, etree.XPathElementEvaluator)): if logger is not None: logger.exception( 'Passing namespaces is only supported for string xpaths and nodes. for etree.XPath or XPathEvaluatore use namespaces in the init function' ) raise ValueError( 'Passing namespaces is only supported for string xpaths and nodes. for etree.XPath or XPathEvaluatore use namespaces in the init function' ) try: if isinstance(node, etree.XPathElementEvaluator): if isinstance(xpath, etree.XPath): if logger is not None: logger.error('Got an XPath object and an XPathEvaluator in eval_xpath') raise TypeError('Got an XPath object and an XPathEvaluator in eval_xpath') return_value = node(xpath, **variables) #[arg-type] elif isinstance(xpath, etree.XPath): return_value = xpath(node, **variables) else: return_value = node.xpath(xpath, namespaces=namespaces, smart_strings=True, extensions=None, **variables) except etree.XPathEvalError as err: if logger is not None: logger.exception( 'There was a XpathEvalError on the xpath: %s \n' 'The following variables were passed: %s \n' 'Either it does not exist, or something is wrong with the expression.', xpath, variables) raise ValueError(f'There was a XpathEvalError on the xpath: {str(xpath)} \n' f'The following variables were passed: {variables} \n' 'Either it does not exist, or something is wrong with the expression.') from err if logger is not None: logger.debug('XPath Result: %s', return_value) if list_return and not isinstance(return_value, list): return [return_value] if isinstance(return_value, list): if len(return_value) == 1 and not list_return: return return_value[0] return return_value
[docs]def get_xml_attribute(node: etree._Element, attributename: str, logger: logging.Logger | None = None) -> str | None: """ Get an attribute value from a node. :param node: a node from etree :param attributename: a string with the attribute name. :param logger: logger object for logging warnings, errors, if not provided all errors will be raised :returns: either attributevalue, or None """ if etree.iselement(node): attrib_value = node.get(attributename) if attrib_value: return attrib_value if logger is None: raise ValueError(f'Tried to get attribute: "{attributename}" from element {node.tag}.\n ' f'I received "{attrib_value}", maybe the attribute does not exist') logger.warning( 'Tried to get attribute: "%s" from element %s.\n ' 'I received "%s", maybe the attribute does not exist', attributename, node.tag, attrib_value) else: # something doesn't work here, some nodes get through here if logger is None: raise TypeError(f'Can not get attributename: "{attributename}" from node of type {type(node)}, ' f'because node is not an element of etree.') logger.error( 'Can not get attributename: "%s" from node of type %s, ' 'because node is not an element of etree.', attributename, type(node)) return None
[docs]def split_off_tag(xpath: TXPathLike) -> tuple[TXPathLike, str]: """ Splits off the last part of the given xpath .. note:: etree.XPath objects could lose context in here, i.e. non-default options passed at init :param xpath: xpath to split up """ if isinstance(xpath, XPathBuilder): xpath = copy.deepcopy(xpath) #type:ignore[assignment] tag = xpath.strip_off_tag() return xpath, tag #type:ignore[return-value] if isinstance(xpath, etree.XPath): xpath_str = xpath.path else: xpath_str = xpath #type:ignore[assignment] split_xpath = xpath_str.split('/') if split_xpath[-1] == '': xpath_str, tag = '/'.join(split_xpath[:-2]), split_xpath[-2] else: xpath_str, tag = '/'.join(split_xpath[:-1]), split_xpath[-1] if isinstance(xpath, etree.XPath): xpath = etree.XPath(xpath_str) #type:ignore [assignment] else: xpath = xpath_str #type:ignore[assignment] return xpath, tag
[docs]def add_tag(xpath: TXPathLike, tag: str) -> TXPathLike: """ Add tag to xpath .. note:: etree.XPath objects could lose context in here, i.e. non-default options passed at init :param xpath: xpath to change :param tag: str of the tag to add :returns: xpath with the form {old_xpath}/tag """ if isinstance(xpath, XPathBuilder): xpath = copy.deepcopy(xpath) #type:ignore[assignment] xpath.append_tag(tag) elif isinstance(xpath, etree.XPath): xpath = etree.XPath(f'{str(xpath.path)}/{tag}') #type:ignore [assignment] else: xpath = f"{str(xpath).rstrip('/')}/{tag}" #type:ignore[assignment] return xpath
[docs]def split_off_attrib(xpath: TXPathLike) -> tuple[TXPathLike, str]: """ Splits off attribute of the given xpath (part after @) .. note:: etree.XPath objects could lose context in here, i.e. non-default options passed at init :param xpath: xpath to split up """ if isinstance(xpath, XPathBuilder): xpath = copy.deepcopy(xpath) #type:ignore[assignment] attrib = xpath.strip_off_tag() if '@' not in attrib: raise ValueError('Path does not end with an attribute') return xpath, attrib.lstrip('@') #type:ignore[return-value] if isinstance(xpath, etree.XPath): xpath_str = xpath.path else: xpath_str = xpath #type:ignore[assignment] split_xpath = xpath_str.split('/@') if len(split_xpath) != 2: raise ValueError(f"Splitting off attribute failed for: '{split_xpath}'") xpath_str, attrib = tuple(split_xpath) if isinstance(xpath, etree.XPath): xpath = etree.XPath(xpath_str) #type:ignore [assignment] else: xpath = xpath_str #type:ignore[assignment] return xpath, attrib
[docs]def check_complex_xpath(node: XMLLike | etree.XPathElementEvaluator, base_xpath: XPathLike, complex_xpath: XPathLike) -> None: """ Check that the given complex xpath produces a subset of the results for the simple xpath :param node: root node of an etree or an etree :param base_xpath: str of the xpath without complex syntax :param complex_xpath: str of the xpath to check :raises ValueError: If the complex_xpath does not produce a subset of the results of the base_xpath """ results_base = set(eval_xpath_all(node, base_xpath)) results_complex = set(eval_xpath_all(node, complex_xpath)) if not results_base.issuperset(results_complex): raise ValueError(f"Complex xpath '{complex_xpath!r}' is not compatible with the base_xpath '{base_xpath!r}'")
[docs]def abs_to_rel_xpath(xpath: str, new_root: str) -> str: """ Convert a given xpath to be relative from a tag appearing in the original xpath. :param xpath: str of the xpath to convert :param new_root: str of the tag from which the new xpath should be relative :returns: str of the relative xpath """ if contains_tag(xpath, new_root): xpath = xpath + '/' xpath_to_root = '/'.join(xpath.split(new_root + '/')[:-1]) + new_root xpath = xpath.replace(f'{xpath_to_root}/', './') xpath = xpath.rstrip('/') else: raise ValueError(f'New root element {new_root} does not appear in xpath {xpath}') return xpath
[docs]def normalize_xmllike(xmllike: XMLLike) -> etree._Element: """ Returns the root of the xmltree """ if etree.iselement(xmllike): return xmllike xmllike, _ = clear_xml(xmllike) #type:ignore[arg-type] return xmllike.getroot()
[docs]def contains_tag(xpath: XPathLike, tag: str) -> bool: """ Return whether a given xpath contains a given tag This assumes that predicates of xpaths can't be nested since otherwise the regex for removing them could fail This function will only return True if one of the tags exactly matches the tag argument not if one tag contains the given name in it's name :param xpath: xpath expression :param tag: tag to check for :returns: whether a tag is contained in the xpath """ import re if isinstance(xpath, XPathBuilder): return tag in xpath.components if isinstance(xpath, etree.XPath): xpath_str = xpath.path else: xpath_str = str(xpath) #Strip out predicates xpath_str = re.sub(r'[\[].*?[\]]', '', xpath_str) return tag in xpath_str.split('/')
[docs]def is_valid_tag(tag: str) -> bool: """ Return whether the given string is a valid XML tag name :param tag: tag to check """ try: etree.QName(tag) return True except ValueError: return False
T = TypeVar('T') """Generic Type""" @overload def eval_xpath_all(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: type[T], *, logger: logging.Logger | None = ..., namespaces: dict[str, str] | None = ..., **variables: etree._XPathObject) -> list[T]: ... @overload def eval_xpath_all(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: None = ..., *, logger: logging.Logger | None = ..., namespaces: dict[str, str] | None = ..., **variables: etree._XPathObject) -> list[Any]: ... def eval_xpath_all(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: type[T] | None = None, *, logger: logging.Logger | None = None, namespaces: dict[str, str] | None = None, **variables: etree._XPathObject) -> list[T] | list[Any]: result = eval_xpath(node, xpath, logger=logger, namespaces=namespaces, list_return=True, **variables) if expected_type is not None and not all(isinstance(x, expected_type) for x in result): all_types = {type(x) for x in result} if logger is not None: logger.error(f'Expected XPath results of type {expected_type!r}. Got: {all_types!r}') raise TypeError(f'Expected XPath results of type {expected_type!r}. Got: {all_types!r}') return result @overload def eval_xpath_first(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: type[T], *, logger: logging.Logger | None = ..., namespaces: dict[str, str] | None = ..., **variables: etree._XPathObject) -> T: ... @overload def eval_xpath_first(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: None = ..., *, logger: logging.Logger | None = ..., namespaces: dict[str, str] | None = ..., **variables: etree._XPathObject) -> Any: ... def eval_xpath_first(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: type[T] | None = None, *, logger: logging.Logger | None = None, namespaces: dict[str, str] | None = None, **variables: etree._XPathObject) -> T | Any: result = eval_xpath(node, xpath, logger=logger, namespaces=namespaces, list_return=True, **variables) if len(result) == 0: if logger is not None: logger.error(f'Expected atleast one result. Found {len(result)}') raise ValueError(f'Expected atleast one result. Found {len(result)}') result = result[0] if expected_type is not None and not isinstance(result, expected_type): if logger is not None: logger.error(f'Expected XPath results of type {expected_type!r}. Got: {type(result)}') raise TypeError(f'Expected XPath results of type {expected_type!r}. Got: {type(result)}') return result @overload def eval_xpath_one(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: type[T], *, logger: logging.Logger | None = ..., namespaces: dict[str, str] | None = ..., **variables: etree._XPathObject) -> T: ... @overload def eval_xpath_one(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: None = ..., *, logger: logging.Logger | None = ..., namespaces: dict[str, str] | None = ..., **variables: etree._XPathObject) -> Any: ... def eval_xpath_one(node: XMLLike | etree.XPathElementEvaluator, xpath: XPathLike, expected_type: type[T] | None = None, *, logger: logging.Logger | None = None, namespaces: dict[str, str] | None = None, **variables: etree._XPathObject) -> T | Any: result = eval_xpath(node, xpath, logger=logger, namespaces=namespaces, list_return=True, **variables) if len(result) != 1: if logger is not None: logger.error(f'Expected one result. Found {len(result)}') raise ValueError(f'Expected one result. Found {len(result)}') result = result[0] if expected_type is not None and not isinstance(result, expected_type): if logger is not None: logger.error(f'Expected XPath results of type {expected_type!r}. Got: {type(result)}') raise TypeError(f'Expected XPath results of type {expected_type!r}. Got: {type(result)}') return result
[docs]def serialize_xml_objects(args: tuple[Any, ...], kwargs: dict[str, Any]) -> tuple[tuple[Any, ...], dict[str, Any]]: """ Convert every XML element/tree in the given args/kwargs to string using :py:func:`lxml.etree.tostring()` :param args: positional arguments :param kwargs: keyword arguments """ def tostring(elem: XMLLike) -> str: return etree.tostring(elem, encoding='unicode', pretty_print=True) args = tuple(tostring(x) if isinstance(x, (etree._Element, etree._ElementTree)) else x for x in args) kwargs = {k: tostring(x) if isinstance(x, (etree._Element, etree._ElementTree)) else x for k, x in kwargs.items()} return args, kwargs
[docs]def process_xpath_argument(simple_xpath: str | bytes | etree.XPath, complex_xpath: XPathLike | None, filters: FilterType | None) -> XPathLike: """ Process the simple and complex Xpath expressions and given filters Used for unifying the logic for all xml setters/evaluators using these arguments :param simple_xpath: THe simple XPath (no predicates) expression to base the paths on :param complex_xpath: Optional XPath given with no restrictions :param filters: Dict specifying constraints to apply on the xpath. See :py:class:`~masci_tools.util.xml.xpathbuilder.XPathBuilder` for details :returns: Complex XPath expression """ if complex_xpath is None: complex_xpath = XPathBuilder(simple_xpath, filters=filters, strict=True) elif filters is not None: if not isinstance(complex_xpath, XPathBuilder): raise ValueError( 'Provide only one of filters or complex_xpath (Except when complx_xpath is given as a XPathBuilder)') for key, val in filters.items(): complex_xpath.add_filter(key, val) return complex_xpath