# Source code for masci_tools.io.parsers.fleur.fleur_inpxml_parser

# -*- coding: utf-8 -*-
###############################################################################
# Copyright (c), Forschungszentrum Jülich GmbH, IAS-1/PGI-1, Germany.         #
#                All rights reserved.                                         #
# This file is part of the Masci-tools package.                               #
# (Material science tools)                                                    #
#                                                                             #
# The code is hosted on GitHub at https://github.com/judftteam/masci-tools.   #
# For further information on the license, see the LICENSE.txt file.           #
# For further information please visit http://judft.de/.                      #
#                                                                             #
###############################################################################
"""
This module contains functions to load a Fleur inp.xml file, parse it with a schema
and convert its content to a dict.
"""
from lxml import etree
from pprint import pprint
from masci_tools.io.parsers.fleur.fleur_schema.schema_dict import InputSchemaDict
from masci_tools.util.xml.common_functions import clear_xml, validate_xml, eval_xpath
from masci_tools.util.xml.converters import convert_xml_attribute, convert_xml_text
from masci_tools.util.schema_dict_util import read_constants
from masci_tools.util.logging_util import DictHandler
import logging


def inpxml_parser(inpxmlfile, version=None, parser_info_out=None, strict=False, debug=False):
    """
    Parses the given inp.xml file to a python dictionary utilizing the schema
    defined by the version number to validate and correctly convert to the dictionary

    :param inpxmlfile: either path to the inp.xml file, opened file handle or a xml etree to be parsed
    :param version: version string to enforce that a given schema is used
    :param parser_info_out: dict, with warnings, info, errors, ...
    :param strict: bool if True  and no parser_info_out is provided any encountered error will immediately be raised
    :param debug: bool if True the logging level of the parser is set to DEBUG instead of INFO

    :return: python dictionary with the parsed inp.xml

    :raises ValueError: If the validation against the schema failed, or an irrecoverable error
                        occurred during parsing
    :raises FileNotFoundError: If no Schema file for the given version was found

    """

    __parser_version__ = '0.3.0'

    # ``logger`` is rebound to None in strict mode, so keep a separate handle on the
    # real logger object: it is the one the handler is attached to and has to be
    # detached from again.
    module_logger = logging.getLogger(__name__)
    logger = module_logger

    parser_log_handler = None
    if parser_info_out is not None or not strict:
        if parser_info_out is None:
            parser_info_out = {}

        logging_level = logging.DEBUG if debug else logging.INFO
        module_logger.setLevel(logging_level)

        parser_log_handler = DictHandler(parser_info_out,
                                         WARNING='parser_warnings',
                                         ERROR='parser_errors',
                                         INFO='parser_info',
                                         DEBUG='parser_debug',
                                         CRITICAL='parser_critical',
                                         ignore_unknown_levels=True,
                                         level=logging_level)

        module_logger.addHandler(parser_log_handler)

    if strict:
        # No logger is handed to the helper functions in strict mode
        logger = None

    try:
        if logger is not None:
            logger.info('Masci-Tools Fleur inp.xml Parser v%s', __parser_version__)

        if isinstance(inpxmlfile, etree._ElementTree):
            xmltree = inpxmlfile
        else:
            parser = etree.XMLParser(attribute_defaults=True, encoding='utf-8')
            try:
                xmltree = etree.parse(inpxmlfile, parser)
            except etree.XMLSyntaxError as msg:
                if logger is not None:
                    logger.exception('Failed to parse input file')
                raise ValueError(f'Failed to parse input file: {msg}') from msg

        if version is None:
            version = eval_xpath(xmltree, '//@fleurInputVersion', logger=logger)
            # The check has to happen before the str() conversion, afterwards the value
            # can never be None. An empty sequence is treated as "attribute not found"
            # (NOTE(review): assumes eval_xpath hands back the empty xpath result list
            # in that case - confirm against eval_xpath).
            if version is None or (isinstance(version, (list, tuple)) and not version):
                if logger is not None:
                    logger.error('Failed to extract inputVersion')
                raise ValueError('Failed to extract inputVersion')
            version = str(version)

        if logger is not None:
            logger.info('Got Fleur input file with file version %s', version)
        schema_dict = InputSchemaDict.fromVersion(version, logger=logger)

        # If the schema that was loaded is for a different version than the file,
        # validation failures are tolerated
        ignore_validation = schema_dict['inp_version'] != version

        xmltree, _ = clear_xml(xmltree)
        root = xmltree.getroot()

        constants = read_constants(root, schema_dict, logger=logger)

        try:
            validate_xml(xmltree, schema_dict.xmlschema, error_header='Input file does not validate against the schema')
        except etree.DocumentInvalid as err:
            errmsg = str(err)
            if logger is not None:
                logger.warning(errmsg)
            if not ignore_validation:
                if logger is not None:
                    logger.exception(errmsg)
                raise ValueError(errmsg) from err

        if not (schema_dict.xmlschema.validate(xmltree) or ignore_validation):
            msg = 'Input file does not validate against the schema: Reason is unknown'
            if logger is not None:
                logger.warning(msg)
                # No exception is being handled here, so log a plain error
                logger.error(msg)
            raise ValueError(msg)

        inp_dict = inpxml_todict(root, schema_dict, constants, logger=logger)
    finally:
        # Always detach the handler again (also on errors and in strict mode),
        # otherwise it stays attached to the module logger across calls
        if parser_log_handler is not None:
            module_logger.removeHandler(parser_log_handler)

    return inp_dict


def inpxml_todict(parent, schema_dict, constants, omitted_tags=False, base_xpath=None, logger=None):
    """
    Recursive operation which transforms an xml etree to
    python nested dictionaries and lists.
    Decision to add a list is if the tag name is in the ``several`` list of the
    ``tag_info`` entry for the xpath of the current element

    :param parent: some xmltree, or xml element
    :param schema_dict: structure/layout of the xml file in python dictionary
    :param constants: dict with all the defined constants
    :param omitted_tags: switch. If True only a list of the contained tags is returned
                         Used to omit useless tags like e.g ['atomSpecies']['species'][3]
                         becomes ['atomSpecies'][3]
    :param base_xpath: str, keeps track of the place in the inp.xml currently being processed
    :param logger: logger object for logging warnings and errors, if None nothing is logged

    :return: a python dictionary (or a list if omitted_tags is True, or the converted
             text if the element has text but no attributes)

    :raises ValueError: If an element with text is not in ``simple_elements`` or the
                        attributes of text elements cannot be collected on the parent level
    """

    #Check if this is the first call to this routine
    if base_xpath is None:
        base_xpath = f'/{parent.tag}'

    # Start from the raw attributes (all strings) of the element
    return_dict = dict(parent.items())
    # Now we have to convert lazy fortran style into pretty things for the Database
    for key in return_dict:
        if key in schema_dict['attrib_types']:
            return_dict[key], suc = convert_xml_attribute(return_dict[key],
                                                          schema_dict['attrib_types'][key],
                                                          constants,
                                                          logger=logger)
            if not suc and logger is not None:
                logger.warning("Failed to convert attribute '%s' Got: '%s'", key, return_dict[key])

    # has text, but we don't want all the '\n' s and empty strings in the database
    if parent.text and parent.text.strip() != '':
        if parent.tag not in schema_dict['simple_elements']:
            if logger is not None:
                logger.error('Something is wrong in the schema_dict: %s is not in simple_elements, but it has text',
                             parent.tag)
            raise ValueError(
                f'Something is wrong in the schema_dict: {parent.tag} is not in simple_elements, but it has text')

        converted_text, suc = convert_xml_text(parent.text,
                                               schema_dict['simple_elements'][parent.tag],
                                               constants,
                                               logger=logger)
        if not suc and logger is not None:
            logger.warning("Failed to convert text of '%s' Got: '%s'", parent.tag, parent.text)

        if not return_dict:
            # No attributes: the element is represented by its converted text alone
            return_dict = converted_text
        else:
            return_dict['text_value'] = converted_text
            if 'label' in return_dict:
                return_dict['text_label'] = return_dict.pop('label')

    if base_xpath in schema_dict['tag_info']:
        tag_info = schema_dict['tag_info'][base_xpath]
    else:
        tag_info = {'several': []}

    for element in parent:

        new_base_xpath = f'{base_xpath}/{element.tag}'
        omitt_contained_tags = element.tag in schema_dict['omitt_contained_tags']
        new_return_dict = inpxml_todict(element,
                                        schema_dict,
                                        constants,
                                        base_xpath=new_base_xpath,
                                        omitted_tags=omitt_contained_tags,
                                        logger=logger)

        if element.tag not in tag_info['several']:
            return_dict[element.tag] = new_return_dict
            continue

        # make a list, otherwise the tag will be overwritten in the dict
        if element.tag not in return_dict:  # is this the first occurrence?
            if omitted_tags:
                if len(return_dict) == 0:
                    return_dict = []
            else:
                return_dict[element.tag] = []

        if omitted_tags:
            return_dict.append(new_return_dict)
        elif isinstance(new_return_dict, dict) and 'text_value' in new_return_dict:
            # The child has text and attributes. The text goes into the list of the tag,
            # the attributes are collected in parallel lists on this level.
            # The isinstance guard is needed since a text-only child is returned as the
            # converted text itself and not as a dict.
            for key, value in new_return_dict.items():
                if key == 'text_value':
                    return_dict[element.tag].append(value)
                elif key == 'text_label':
                    if 'labels' not in return_dict:
                        return_dict['labels'] = {}
                    return_dict['labels'][value] = new_return_dict['text_value']
                else:
                    if key not in return_dict:
                        return_dict[key] = []
                    elif not isinstance(return_dict[key], list):  #Key seems to be defined already
                        if logger is not None:
                            logger.error('%s cannot be extracted to the next level', key)
                        raise ValueError(f'{key} cannot be extracted to the next level')
                    return_dict[key].append(value)
            # The extracted attribute lists have to stay aligned with the list of text values
            for key in new_return_dict:
                if key in ('text_value', 'text_label'):
                    continue
                if len(return_dict[key]) != len(return_dict[element.tag]):
                    if logger is not None:
                        logger.error('Extracted optional argument %s at the moment only label is supported correctly',
                                     key)
                    raise ValueError(
                        f'Extracted optional argument {key} at the moment only label is supported correctly')
        else:
            return_dict[element.tag].append(new_return_dict)

    return return_dict