# Source code for masci_tools.util.xml.common_functions

# -*- coding: utf-8 -*-
###############################################################################
# Copyright (c), Forschungszentrum Jülich GmbH, IAS-1/PGI-1, Germany.         #
#                All rights reserved.                                         #
# This file is part of the Masci-tools package.                               #
# (Material science tools)                                                    #
#                                                                             #
# The code is hosted on GitHub at https://github.com/judftteam/masci-tools.   #
# For further information on the license, see the LICENSE.txt file.           #
# For further information please visit http://judft.de/.                      #
#                                                                             #
###############################################################################
"""
Common functions for parsing input/output files or XMLschemas from FLEUR
"""
from lxml import etree
import warnings


def clear_xml(tree):
    """
    Removes comments and executes xinclude tags of an
    xml tree.

    The given tree is left untouched, all modifications are done on a deep copy.

    :param tree: an xml-tree which will be processed

    :returns: tuple of (cleared_tree, all_included_tags). cleared_tree is an xmltree without
              comments and with replaced xinclude tags, all_included_tags is the set of
              tag names, which were newly found next to the xinclude tags after they were executed
    """
    import copy

    cleared_tree = copy.deepcopy(tree)
    root = cleared_tree.getroot()

    # Remove comments outside the root element (Since they have no parents the
    # generic comment removal at the end would crash on them).
    # The neighbouring node is looked up BEFORE the comment is moved; a detached
    # node no longer has siblings, which would end the loop after the first comment
    sibling = root.getprevious()
    while sibling is not None:
        upcoming = sibling.getprevious()
        if sibling.tag is etree.Comment:
            root.append(sibling)
            root.remove(sibling)
        sibling = upcoming

    sibling = root.getnext()
    while sibling is not None:
        upcoming = sibling.getnext()
        if sibling.tag is etree.Comment:
            root.append(sibling)
            root.remove(sibling)
        sibling = upcoming

    # find any include tags
    include_tags = eval_xpath(cleared_tree,
                              '//xi:include',
                              namespaces={'xi': 'http://www.w3.org/2001/XInclude'},
                              list_return=True)

    # Remember which tags each parent of an include tag contains before the includes
    # are executed. Everything that shows up afterwards was included
    parents = [tag.getparent() for tag in include_tags]
    known_tags = [{elem.tag for elem in parent if isinstance(elem.tag, str)} for parent in parents]

    # replace XInclude parts to validate against schema
    if include_tags:
        cleared_tree.xinclude()

    all_included_tags = set()
    # get rid of xml:base attribute in the included parts
    for parent, old_tags in zip(parents, known_tags):
        new_tags = {elem.tag for elem in parent if isinstance(elem.tag, str)}

        # the tags, which are only there after the xinclude call, are the included ones
        included_tag_names = new_tags.difference(old_tags)

        # Check for empty set (relax.xml include may not insert something)
        if not included_tag_names:
            continue

        all_included_tags.update(included_tag_names)

        for elem in parent:
            if elem.tag not in included_tag_names:
                continue
            for attribute in elem.keys():
                if 'base' in attribute:
                    try:
                        del elem.attrib[attribute]
                    except KeyError:
                        # attribute is already gone; nothing to clean up
                        pass

    # remove comments from inp.xml
    for comment in cleared_tree.xpath('//comment()'):
        comment.getparent().remove(comment)

    etree.indent(cleared_tree)
    return cleared_tree, all_included_tags
def reverse_xinclude(xmltree, schema_dict, included_tags, **kwargs):
    """
    Split the xmltree back up according to the given included tags.
    The original xmltree will be returned with the corresponding xinclude tags
    and the included trees are returned in a dict mapping the inserted filename
    to the extracted tree

    Tags for which no known filename is known are returned under unknown-1.xml, ...
    The following tags have known filenames:

        - `relaxation`: ``relax.xml``
        - `kPointLists`: ``kpts.xml``
        - `symmetryOperations`: ``sym.xml``
        - `atomSpecies`: ``species.xml``
        - `atomGroups`: ``atoms.xml``

    Additional mappings can be given in the keyword arguments

    :param xmltree: an xml-tree which will be processed (it is not modified, a deep copy is used)
    :param schema_dict: Schema dictionary containing all the necessary information
    :param included_tags: Iterable of str, containing the names of the tags to be excluded

    :returns: xmltree with the inserted xinclude tags and a dict mapping the filenames
              to the excluded trees

    :raises ValueError: if included_tags contains entries, which are not strings, or if
                        a tag can not be found unambiguously in the given xmltree
    """
    from masci_tools.util.schema_dict_util import get_tag_xpath
    import copy

    INCLUDE_NSMAP = {'xi': 'http://www.w3.org/2001/XInclude'}
    INCLUDE_TAG = etree.QName(INCLUDE_NSMAP['xi'], 'include')
    FALLBACK_TAG = etree.QName(INCLUDE_NSMAP['xi'], 'fallback')

    def _make_xinclude(href):
        """Create a xi:include element for the given file with an empty xi:fallback child"""
        elem = etree.Element(INCLUDE_TAG, href=href, nsmap=INCLUDE_NSMAP)
        elem.append(etree.Element(FALLBACK_TAG))
        return elem

    # The tags are iterated twice (validation and processing). A one-shot iterator
    # would be exhausted by the validation, so the names are collected once up front
    tag_names = list(included_tags)
    if not all(isinstance(tag, str) for tag in tag_names):
        raise ValueError(f'included_tags is not made up of strings: {included_tags}')

    excluded_tree = copy.deepcopy(xmltree)
    root = excluded_tree.getroot()

    include_file_names = {
        'relaxation': 'relax.xml',
        'kPointLists': 'kpts.xml',
        'symmetryOperations': 'sym.xml',
        'atomSpecies': 'species.xml',
        'atomGroups': 'atoms.xml',
        **kwargs,
    }

    unknown_file_names = 0
    included_trees = {}

    for tag in tag_names:
        if tag in include_file_names:
            file_name = include_file_names[tag]
        else:
            warnings.warn(f'No filename known for tag {tag}')
            unknown_file_names += 1
            file_name = f'unknown-{unknown_file_names}.xml'

        try:
            tag_xpath = get_tag_xpath(schema_dict, tag)
        except ValueError as exc:
            raise ValueError(f'Cannot determine place of included tag {tag}') from exc

        matches = eval_xpath(root, tag_xpath, list_return=True)
        if len(matches) != 1:
            raise ValueError(f'Cannot determine place of included tag {tag}')
        included_tag = matches[0]

        included_trees[file_name] = etree.ElementTree(included_tag)

        # Put the xinclude tag at the place where the tag was taken out
        parent = included_tag.getparent()
        parent.replace(included_tag, _make_xinclude(file_name))

    if 'relax.xml' not in included_trees:
        # The relax.xml include should always be there
        root.append(_make_xinclude('relax.xml'))

    etree.indent(excluded_tree)
    for tree in included_trees.values():
        etree.indent(tree)

    return excluded_tree, included_trees
def validate_xml(xmltree, schema, error_header='File does not validate'):
    """
    Checks a given xmltree against a schema and produces a nice error message
    with all the validation errors collected

    The tree is passed through :py:func:`clear_xml()` before it is validated. Identical
    error messages are combined into one entry, and the entries are ordered by the
    line of their first occurrence.

    :param xmltree: xmltree of the file to validate
    :param schema: etree.XMLSchema to validate against
    :param error_header: str to lead a evtl error message with

    :raises: etree.DocumentInvalid if the schema does not validate
    """
    from itertools import groupby

    def _message_of(entry):
        return entry.message

    try:
        cleared_tree, _ = clear_xml(xmltree)
        schema.assertValid(cleared_tree)
    except etree.DocumentInvalid as exc:
        # groupby only merges neighbouring entries, so the log is ordered by message first
        entries_by_message = sorted(schema.error_log, key=_message_of)

        report = []
        for message, occurrences in groupby(entries_by_message, key=_message_of):
            line_numbers = [entry.line for entry in occurrences]
            text = f'Line {line_numbers[0]}: {message}'
            if len(line_numbers) > 1:
                further_lines = ', '.join(str(number) for number in line_numbers[1:])
                text += f'; This error also occured on the lines {further_lines}'
            report.append((line_numbers[0], f'{text} \n'))

        # order the combined messages by the line of their first occurrence
        report.sort()
        combined = ''.join(text for _, text in report)
        errmsg = f'{error_header}: \n{combined}'
        raise etree.DocumentInvalid(errmsg) from exc
def eval_xpath(node, xpath, logger=None, list_return=False, namespaces=None):
    """
    Tries to evaluate an xpath expression on the given node. If it fails the error
    is logged (if a logger is given) and raised.

    :param node: element or element tree of an etree, on which the xpath is evaluated
    :param xpath: xpath expression (relative, or absolute)
    :param logger: logger object for logging errors; the errors are raised in any case
    :param list_return: if True, a list of results is not unpacked, even if only one element is in it
    :param namespaces: dict, passed to namespaces argument in xpath call

    :returns: list of results, the single result if the list has exactly one entry and
              list_return is False, or the plain value if the xpath expression does not
              evaluate to a list (e.g. a number or boolean from an xpath function)

    :raises TypeError: if node is neither an etree element nor an etree element tree
    :raises ValueError: if the xpath expression could not be evaluated
    """
    if not isinstance(node, (etree._Element, etree._ElementTree)):
        if logger is not None:
            logger.error('Wrong Type for xpath eval; Got: %s', type(node))
        raise TypeError(f'Wrong Type for xpath eval; Got: {type(node)}')

    try:
        return_value = node.xpath(xpath, namespaces=namespaces)
    except etree.XPathEvalError as err:
        if logger is not None:
            logger.exception(
                'There was a XpathEvalError on the xpath: %s \n'
                'Either it does not exist, or something is wrong with the expression.', xpath)
        raise ValueError(f'There was a XpathEvalError on the xpath: {xpath} \n'
                         'Either it does not exist, or something is wrong with the expression.') from err

    # Expressions like count(...) do not produce a list. Such results have no
    # length (number/boolean), so they are handed back without unpacking
    if not isinstance(return_value, list):
        return return_value

    if len(return_value) == 1 and not list_return:
        return return_value[0]
    return return_value
def get_xml_attribute(node, attributename, logger=None):
    """
    Get an attribute value from a node.

    Without a logger a problem is raised as an exception; with a logger it is
    logged and None is returned instead.

    :param node: a node from etree
    :param attributename: a string with the attribute name.
    :param logger: logger object for logging warnings, errors, if not provided all errors will be raised

    :returns: either attributevalue, or None

    :raises TypeError: if node is not an etree element (and no logger is given)
    :raises ValueError: if the attribute value is missing or empty (and no logger is given)
    """
    if not etree.iselement(node):
        if logger is None:
            raise TypeError(f'Can not get attributename: "{attributename}" from node of type {type(node)}, '
                            f'because node is not an element of etree.')
        logger.error(
            'Can not get attributename: "%s" from node of type %s, '
            'because node is not an element of etree.', attributename, type(node))
        return None

    attrib_value = node.get(attributename)
    if attrib_value:
        return attrib_value

    # Missing attributes and attributes with an empty value both end up here
    if logger is None:
        raise ValueError(f'Tried to get attribute: "{attributename}" from element {node.tag}.\n '
                         f'I received "{attrib_value}", maybe the attribute does not exist')
    logger.warning(
        'Tried to get attribute: "%s" from element %s.\n '
        'I received "%s", maybe the attribute does not exist', attributename, node.tag, attrib_value)
    return None
def split_off_tag(xpath):
    """
    Splits off the last part of the given xpath

    A single trailing ``/`` is ignored when determining the last part.

    :param xpath: str of the xpath to split up

    :returns: tuple of (xpath without the last part, last part of the xpath)
    """
    steps = xpath.split('/')

    # A trailing slash produces an empty last entry, which is not a tag
    if not steps[-1]:
        steps.pop()

    tag = steps.pop()
    return '/'.join(steps), tag
def split_off_attrib(xpath):
    """
    Splits off attribute of the given xpath (part after @)

    :param xpath: str of the xpath to split up

    :returns: tuple of (xpath to the tag, name of the attribute)

    :raises AssertionError: if the xpath does not contain exactly one ``/@`` separator
    """
    split_xpath = xpath.split('/@')
    if len(split_xpath) != 2:
        # Raised explicitly instead of via an assert statement, so the check is not
        # dropped when python runs with -O. The exception type stays the same
        raise AssertionError(f"Splitting off attribute failed for: '{split_xpath}'")
    return tuple(split_xpath)
def check_complex_xpath(node, base_xpath, complex_xpath):
    """
    Check that the given complex xpath produces a subset of the results
    for the simple xpath

    Both expressions are evaluated on the given node and the results are compared as sets.

    :param node: root node of an etree
    :param base_xpath: str of the xpath without complex syntax
    :param complex_xpath: str of the xpath to check

    :raises ValueError: If the complex_xpath does not produce a subset of the results
                        of the base_xpath
    """
    base_results = set(eval_xpath(node, base_xpath, list_return=True))
    complex_results = set(eval_xpath(node, complex_xpath, list_return=True))

    if complex_results.issubset(base_results):
        return

    raise ValueError(f"Complex xpath '{complex_xpath}' is not compatible with the base_xpath '{base_xpath}'")
def abs_to_rel_xpath(xpath, new_root):
    """
    Convert a given xpath to be relative from a tag appearing in the original xpath.

    The new root has to match complete steps of the path (a tag name, which only
    contains ``new_root`` as a part of its name, is not accepted). If it appears
    more than once, the path is made relative to the last occurrence.

    :param xpath: str of the xpath to convert
    :param new_root: str of the tag from which the new xpath should be relative

    :returns: str of the relative xpath

    :raises ValueError: if new_root does not appear as a step in the xpath
    """
    # Surrounding everything with slashes makes sure that only complete path
    # steps are matched, also at the very beginning or end of the xpath
    padded_xpath = f'/{xpath}/'
    root_step = f'/{new_root}/'

    root_position = padded_xpath.rfind(root_step)
    if root_position == -1:
        raise ValueError(f'New root element {new_root} does not appear in xpath {xpath}')

    remainder = padded_xpath[root_position + len(root_step):]

    # If nothing is left after the new root, './' is reduced to '.'
    return f'./{remainder}'.rstrip('/')