Source code for masci_tools.util.parse_tasks

# -*- coding: utf-8 -*-
###############################################################################
# Copyright (c), Forschungszentrum Jülich GmbH, IAS-1/PGI-1, Germany.         #
#                All rights reserved.                                         #
# This file is part of the Masci-tools package.                               #
# (Material science tools)                                                    #
#                                                                             #
# The code is hosted on GitHub at https://github.com/judftteam/masci-tools.   #
# For further information on the license, see the LICENSE.txt file.           #
# For further information please visit http://judft.de/.                      #
#                                                                             #
###############################################################################
"""
This module contains a class which organizes the known parsing tasks for outxml files
and provides fuctionality for adding custom tasks easily
"""

from pprint import pprint
import importlib.util
import copy
import os

PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
DEFAULT_TASK_FILE = os.path.abspath(os.path.join(PACKAGE_DIRECTORY, '../io/parsers/fleur/default_parse_tasks.py'))


[docs]def find_migration(start, target, migrations):
    """
    Tries to find a migration path from the start to the target version
    via the defined migration functions

    :param start: str of the starting version
    :param target: str of the target version
    :param migrations: dict of funcs registered via the register_migration_function decorator

    :returns: list of migration functions to be called to go from start to target
    """

    if start == target:
        return []

    if start not in migrations:
        return None

    if target in migrations[start]:
        if isinstance(migrations[start][target], str):
            if migrations[start][target] == 'compatible':
                return []
            return None
        else:
            return [migrations[start][target]]

    for possible_stop in migrations[start].keys():
        new_call_list = find_migration(possible_stop, target, migrations)

        if new_call_list is None:
            continue

        if isinstance(migrations[start][possible_stop], str):
            if migrations[start][possible_stop] == 'compatible':
                call_list = []
        else:
            call_list = [migrations[start][possible_stop]]
        call_list += new_call_list
        return call_list


[docs]class ParseTasks(object):
    """
    Representation of all known parsing tasks for the out.xml file

    When set up it will initialize the known default tasks and check if they work
    for the given output version

    Accesing definition of task example

    .. code-block:: python

        from masci_tools.io.parsers.fleur import ParseTasks

        p = ParseTasks('0.33')
        totE_definition = p.tasks['total_energy']
    """

    CONTROL_KEYS = {'_general', '_modes', '_minimal', '_special', '_conversions'}
    REQUIRED_KEYS = {'parse_type', 'path_spec'}
    ALLOWED_KEYS = {'parse_type', 'path_spec', 'subdict', 'overwrite_last'}
    ALLOWED_KEYS_ALLATTRIBS = {
        'parse_type', 'path_spec', 'subdict', 'base_value', 'ignore', 'overwrite', 'flat', 'only_required'
    }

    _version = '0.2.0'

    def __init__(self, version, task_file=None, validate_defaults=False):
        """
        Initialize the default parse tasks
        Terminates if the version is not marked as working with the default tasks

        :param version: str of the wanted ouput version
        :param task_file: optional, file to override default_parse_tasks
        :param validate_defaults: bool, if True all tasks from the default tasks
                                  are added one by one and are checked for
                                  inconsistent keys
        """

        if task_file is None:
            task_file = DEFAULT_TASK_FILE

        #import task definitions
        spec = importlib.util.spec_from_file_location('tasks', task_file)
        tasks = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(tasks)

        self._iteration_tasks = []
        self._general_tasks = []

        tasks_dict = copy.deepcopy(tasks.TASKS_DEFINITION)
        if validate_defaults:
            #Manually add each task to make sure that there are no typos/inconsitencies in the keys
            self.tasks = {}
            for task_name, task in tasks_dict.items():
                self.add_task(task_name, task)
        else:
            self.tasks = tasks_dict

        #Catch initializations, where decorators have not been triggered
        #(if we do this at import we produce circular imports)
        #The alternative is moving concrete definitions for the output parser away from the parser which
        #I do not like at all
        if getattr(self, '_migrations', None) is None:
            import masci_tools.io.parsers.fleur.task_migrations  # pylint: disable=cyclic-import

        if getattr(self, '_conversion_functions', None) is None:
            import masci_tools.io.parsers.fleur.outxml_conversions  # pylint: disable=cyclic-import

        if getattr(self, '_parse_functions', None) is None:
            import masci_tools.util.schema_dict_util  # pylint: disable=cyclic-import

        #Look if the base version is compatible if not look for a migration
        if version not in tasks.__working_out_versions__:

            migration_list = find_migration(tasks.__base_version__, version, self._migrations)

            if migration_list is None:
                raise ValueError(f'Unsupported output version: {version}')

            for migration in migration_list:
                self.tasks = migration(self.tasks)

    @property
    def iteration_tasks(self):
        """
        Tasks to perform for each iteration
        """
        return self._iteration_tasks

    @property
    def general_tasks(self):
        """
        Tasks to perform for the root node
        """
        return self._general_tasks

    @iteration_tasks.setter
    def iteration_tasks(self, task_list):
        """
        Setter for iteration_tasks
        """
        self._iteration_tasks = task_list

    @general_tasks.setter
    def general_tasks(self, task_list):
        """
        Setter for general_tasks
        """
        self._general_tasks = task_list

[docs]    def add_task(self, task_name, task_definition, **kwargs):
        """
        Add a new task definition to the tasks dictionary

        Will first check if the definition has all the required keys

        :param task_name: str, key in the tasks dict
        :param task_definition: dict with the defined tasks
        :param overwrite: bool (optional), if True and the key is present in the dictionary it will be
                          overwritten with the new definition
        :param append: bool (optional), if True and the key is present in the dictionary the new defintions
                       will be inserted into this dictionary (inner keys WILL BE OVERWRITTEN). Additionally
                       if an inner key is overwritten with an empty dict the inner key will be removed

        The following keys are expected in each entry of the task_definition dictionary:
            :param parse_type: str, defines which methods to use when extracting the information
            :param path_spec: dict with all the arguments that should be passed to get_tag_xpath
                              or get_attrib_xpath to get the correct path
            :param subdict: str, if present the parsed values are put into this key in the output dictionary
            :param overwrite_last: bool, if True no list is inserted and each entry overwrites the last

        For the allAttribs parse_type there are more keys that can appear:
            :param base_value: str, optional. If given the attribute
                               with this name will be inserted into the key from the task_definition
                               all other keys are formatted as {task_key}_{attribute_name}
            :param ignore: list of str, these attributes will be ignored
            :param overwrite: list of str, these attributes will not create a list and overwrite any value
                              that might be there
            :param flat: bool, if False the dict parsed from the tag is inserted as a dict into the correspondin key
                               if True the values will be extracted and put into the output dictionary with the
                               format {task_key}_{attribute_name}

        """

        append = kwargs.get('append', False)
        overwrite = kwargs.get('overwrite', False)

        if task_name in self.tasks and not (append or overwrite):
            raise ValueError(f"Task '{task_name}' is already defined."
                             'Use append=True to append them (conflicting keys are overwritten)'
                             'or overwrite=True to remove all existing tasks')

        for task_key, definition in task_definition.items():

            if task_key.startswith('_'):
                if task_key not in self.CONTROL_KEYS:
                    raise ValueError(f'Unknown control key: {task_key}')
                continue

            task_keys = set(definition.keys())

            if not task_keys and task_key in self.tasks[task_name]:
                continue

            missing_required = self.REQUIRED_KEYS.difference(task_keys)
            if missing_required:
                raise ValueError(f'Reqired Keys missing: {missing_required}')

            if not definition['parse_type'] in self._parse_functions.keys():
                raise ValueError(f"Unknown parse_type: {definition['parse_type']}")

            if definition['parse_type'] in self._all_attribs_function:
                extra_keys = task_keys.difference(self.ALLOWED_KEYS_ALLATTRIBS)
            else:
                extra_keys = task_keys.difference(self.ALLOWED_KEYS)

            if extra_keys:
                raise ValueError(f'Got extra Keys: {extra_keys}')

        if append:
            if task_name not in self.tasks:
                self.tasks[task_name] = {}
            for task_key, definition in task_definition.items():
                if definition:
                    self.tasks[task_name][task_key] = definition
                elif task_key in self.tasks[task_name]:
                    self.tasks[task_name].pop(task_key)
        else:
            self.tasks[task_name] = task_definition

[docs]    def determine_tasks(self, fleurmodes, minimal=False):
        """
        Determine, which tasks to perform based on the fleur_modes

        :param fleurmodes: dict with the calculation modes
        :param minimal: bool, whether to only perform minimal tasks
        """

        for task_name, definition in self.tasks.items():

            if minimal:
                task_minimal = definition.get('_minimal', False)
                if not task_minimal:
                    continue

            #These tasks are always added manually
            special = definition.get('_special', False)
            if special:
                continue

            mode_req = definition.get('_modes', [])

            check = [fleurmodes[mode] == required_value for mode, required_value in mode_req]

            if not all(check):
                continue

            general_task = definition.get('_general', False)

            if general_task:
                self._general_tasks.append(task_name)
            else:
                self._iteration_tasks.append(task_name)

[docs]    def perform_task(self, task_name, node, out_dict, schema_dict, constants, logger=None, use_lists=True):
        """
        Evaluates the task given in the tasks_definition dict

        :param task_name: str, specifies the task to perform
        :param node: etree.Element, the xpath expressions are evaluated from this node
        :param out_dict: dict, output will be put in this dictionary
        :param schema_dict: dict, here all paths and attributes are stored according to the
                            outputschema
        :param constants: dict with all the defined mathematical constants
        :param logger: logger object for logging warnings, errors
        :param root_tag: str, this string will be appended in front of any xpath before it is evaluated
        :param use_lists: bool, if True lists are created for each key if not otherwise specified

        """
        from masci_tools.io.common_functions import camel_to_snake

        try:
            tasks_definition = self.tasks[task_name]
        except KeyError as exc:
            raise KeyError(f'Unknown Task: {task_name}') from exc

        for task_key, spec in tasks_definition.items():

            if task_key.startswith('_'):
                continue

            action = self._parse_functions[spec['parse_type']]

            args = spec['path_spec'].copy()

            if spec['parse_type'] in ['attrib', 'text', 'allAttribs', 'parentAttribs', 'singleValue']:
                args['constants'] = constants

            if 'only_required' in spec:
                args['only_required'] = spec['only_required']

            if spec['parse_type'] == 'singleValue':
                args['ignore'] = ['comment']
            elif spec['parse_type'] in ['allAttribs', 'parentAttribs']:
                args['ignore'] = spec.get('ignore', [])

            parsed_dict = out_dict
            if 'subdict' in spec:
                parsed_dict = out_dict.get(spec['subdict'], {})

            parsed_value = action(node, schema_dict, logger=logger, **args)

            if isinstance(parsed_value, dict):

                if spec['parse_type'] == 'singleValue':
                    base_value = 'value'
                    no_list = ['units']
                    flat = True
                elif spec['parse_type'] in ['allAttribs', 'parentAttribs']:
                    base_value = spec.get('base_value', '')
                    no_list = spec.get('overwrite', [])
                    flat = spec.get('flat', True)

                if flat:
                    for key, val in parsed_value.items():

                        if key == base_value:
                            current_key = task_key
                        else:
                            current_key = f'{task_key}_{camel_to_snake(key)}'

                        if current_key not in parsed_dict and use_lists:
                            parsed_dict[current_key] = []

                        if key in no_list or not use_lists:
                            parsed_dict[current_key] = val
                        else:
                            parsed_dict[current_key].append(val)

                else:
                    parsed_dict[task_key] = {camel_to_snake(key): val for key, val in parsed_value.items()}

            else:
                overwrite = spec.get('overwrite_last', False)
                if task_key not in parsed_dict and use_lists:
                    if overwrite:
                        parsed_dict[task_key] = None
                    else:
                        parsed_dict[task_key] = []

                if use_lists and not overwrite:
                    parsed_dict[task_key].append(parsed_value)
                elif overwrite:
                    if parsed_value is not None:
                        parsed_dict[task_key] = parsed_value
                else:
                    if parsed_value is not None or\
                       task_key not in parsed_dict:
                        parsed_dict[task_key] = parsed_value

            if 'subdict' in spec:
                out_dict[spec['subdict']] = parsed_dict
            else:
                out_dict = parsed_dict

        conversions = tasks_definition.get('_conversions', [])
        for conversion in conversions:
            action = self._conversion_functions[conversion]
            out_dict = action(out_dict, logger=logger)

        return out_dict

[docs]    def show_available_tasks(self, show_definitions=False):
        """
        Print all currently available task keys.
        If show_definitions is True also the corresponding defintions will be printed
        """
        if show_definitions:
            pprint(self.tasks)
        else:
            pprint(self.tasks.keys())