Source code for masci_tools.io.parsers.hdf5.reader

# -*- coding: utf-8 -*-
###############################################################################
# Copyright (c), Forschungszentrum Jülich GmbH, IAS-1/PGI-1, Germany.         #
#                All rights reserved.                                         #
# This file is part of the Masci-tools package.                               #
# (Material science tools)                                                    #
#                                                                             #
# The code is hosted on GitHub at https://github.com/judftteam/masci-tools.   #
# For further information on the license, see the LICENSE.txt file.           #
# For further information please visit http://judft.de/.                      #
#                                                                             #
###############################################################################
"""
This module contains a generic HDF5 reader
"""
import io
import logging
import os
import warnings
from collections import namedtuple

import h5py

# Module-wide logger, named after the import path of this module
logger = logging.getLogger(__name__)

# One step of a recipe: the name under which the transformation is looked up,
# plus the positional and keyword arguments that are handed to it
Transformation = namedtuple('Transformation', 'name args kwargs')
# Like Transformation, but additionally names a previously parsed attribute
# whose value is passed to the transformation in front of ``args``
AttribTransformation = namedtuple('AttribTransformation', 'name attrib_name args kwargs')


class HDF5Reader:
    """Class for reading in data from hdf5 files using a specified recipe

    :param file: filepath (``str`` or ``os.PathLike``) to hdf file or opened file handle (mode 'rb')
    :param move_to_memory: bool if True after reading and transforming the data
                           all leftover h5py.Datasets are moved into np.arrays

    The recipe is passed to the :py:meth:`HDF5Reader.read()` method and consists
    of a dict specifying which attributes and datasets to read in and how to transform them

    Each attribute/dataset entry corresponds to one entry point in the given `.hdf` file
    Available transformations can either be found in :py:mod:`~masci_tools.io.parsers.hdf5.transforms`
    or can be defined by the user with the :py:func:`~masci_tools.io.parsers.hdf5.transforms.hdf5_transformation`
    decorator

    Basic Usage:

    .. code-block:: python

        from masci_tools.io.parsers.hdf5 import HDF5Reader
        import masci_tools.io.parsers.hdf5.recipes as recipes

        #This example shows the usage for producing data from a bandstructure calculation
        #in Fleur
        with HDF5Reader('/path/to/hdf/banddos.hdf') as h5reader:
            data, attributes = h5reader.read(recipe=recipes.FleurBands)
        print(data, attributes)

    """

    # NOTE(review): ``_transforms`` (name -> callable) and ``_attribute_transforms``
    # (names of transformations needing an attribute value) are used below but are not
    # defined in this class; presumably they are registered from the transforms
    # module -- confirm.

    def __init__(self, file, move_to_memory=True):

        self._file = file

        if isinstance(file, io.IOBase):
            # Opened file handles expose the path via their ``name`` attribute
            self._filename = file.name
        elif isinstance(file, os.PathLike):
            # pathlib.Path and friends: convert to the plain path for the suffix check
            self._filename = os.fspath(file)
        else:
            self._filename = file

        if not self._filename.endswith('.hdf'):
            # No exception is being handled here, so logger.exception would only
            # attach a meaningless 'NoneType: None' traceback
            logger.error('Wrong File Type for %s: Got %s', self.__class__.__name__, self._filename)
            raise ValueError(f'Wrong File Type for {self.__class__.__name__}: Got {self._filename}')

        logger.info('Instantiated %s with file %s', self.__class__.__name__, self._filename)

        self._move_to_memory = move_to_memory
        self._h5_file = None

    def __enter__(self):
        """Open the file read-only with h5py and return the reader itself"""
        self._h5_file = h5py.File(self._file, 'r')
        logger.debug('Opened h5py.File with id %s', self._h5_file.id)
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the h5py file opened in :py:meth:`__enter__`; exceptions are not suppressed"""
        self._h5_file.close()
        logger.debug('Closed h5py.File with id %s', self._h5_file.id)

    def _read_dataset(self, h5path, strict=True):
        """Return the dataset specified by the given h5path

        :param h5path: str, HDF5 group path in file.
        :param strict: bool if no dataset is at the path and strict is True raise Error
                       else return None

        :returns: h5py.Dataset or None. For the root/empty path (``'/'`` or ``''``)
                  ``None`` is returned in strict mode without accessing the file

        :raises: ValueError if no dataset is at path and strict is True
        """
        if strict and h5path in ('/', ''):
            # Root/empty paths are skipped silently in strict mode instead of raising
            return None

        logger.debug('Reading dataset from path %s', h5path)

        dset = self._h5_file.get(h5path)
        if dset is None and strict:
            logger.error('HDF5 input file %s has no Dataset at %s.', self._file, h5path)
            raise ValueError(f'HDF5 input file {self._file} has no Dataset at {h5path}.')
        return dset

    def _transform_dataset(self, transforms, dataset, attributes=None, dataset_name=None):
        """
        Transforms the given dataset with the given list of tasks

        :param transforms: list of namedtuples defining the tasks to perform
        :param dataset: h5py.Dataset, on which to perform the operations
        :param attributes: dict of previously processed attributes.
                           Only available to the entries in the datasets
                           section of the recipe. This allows for operations with
                           the previously parsed attributes
        :param dataset_name: name of the recipe entry, only used in log messages

        :returns: the dataset with all the transformations applied

        :raises: ValueError if a transformation needing an attribute value is used
                 while no attributes are available
        """
        transformed_dset = dataset
        for spec in transforms:

            args = spec.args
            if spec.name in self._attribute_transforms:
                if attributes is None:
                    raise ValueError('Attribute transform not allowed for attributes')
                # The attribute value is passed right after the dataset itself
                args = attributes[spec.attrib_name], *args

            logger.debug('Applying transformation %s to dataset %s of type %s', spec.name, dataset_name,
                         type(transformed_dset))

            try:
                transformed_dset = self._transforms[spec.name](transformed_dset, *args, **spec.kwargs)
            except Exception as err:
                logger.exception(str(err))
                raise

        return transformed_dset

    @staticmethod
    def _unpack_dataset(output_dict, dataset_name):
        """
        Unpack the entries of the dictionary dataset in the entry dataset_name into the
        output_dict

        :param output_dict: dict with the dataset entries. On success the entry
                            ``dataset_name`` is removed from it; on failure it is left untouched
        :param dataset_name: key of the dataset to unpack into output_dict

        :returns: new dict with the remaining entries of output_dict and the entries
                  of the unpacked dataset

        :raises: ValueError if the entry is no dict or if unpacking would overwrite
                 another entry of output_dict
        """

        logger.debug('Unpacking dict dataset %s after transformations', dataset_name)

        unpack_dict = output_dict[dataset_name]
        if not isinstance(unpack_dict, dict):
            raise ValueError(f'{dataset_name} cannot be unpacked: Got {type(unpack_dict)}')

        # The unpacked entry itself disappears, so a key equal to dataset_name is no clash.
        # dict views only support the set operators, there is no .intersection() method
        overlap = (unpack_dict.keys() & output_dict.keys()) - {dataset_name}
        if overlap:
            raise ValueError('Unpacking would result in lost information: \n'
                             f"Intersection of keys: '{overlap}'")

        output_dict.pop(dataset_name)
        return {**output_dict, **unpack_dict}

    def _process_entries(self, entries, extracted_datasets, attributes=None):
        """Transform (and unpack if requested) all entries of one section of the recipe

        :param entries: dict of the recipe section (``'attributes'`` or ``'datasets'``)
        :param extracted_datasets: dict mapping h5path to the object read from the file
        :param attributes: dict of already processed attributes or None, passed on
                           to :py:meth:`_transform_dataset`

        :returns: dict with the transformed entries
        """
        output = {}
        for key, val in entries.items():
            output[key] = self._transform_dataset(val.get('transforms', []),
                                                  extracted_datasets[val['h5path']],
                                                  attributes=attributes,
                                                  dataset_name=key)
            if val.get('unpack_dict', False):
                try:
                    output = self._unpack_dataset(output, dataset_name=key)
                except Exception as err:
                    logger.exception(str(err))
                    raise
        return output

    def read(self, recipe=None):
        """Extracts datasets from HDF5 file, transforms them and returns them in two dicts.

        :param recipe: dict with the format given in :py:mod:`~masci_tools.io.parsers.hdf5.recipes`
                       If None the result of ``masci_tools.io.hdf5_util.read_hdf_simple``
                       is returned instead and a warning is issued

        :returns: two dicts with the datasets/attributes read in and transformed according to the recipe
        """
        from itertools import chain

        logger.info('Started reading HDF file: %s', self._filename)

        if recipe is None:
            # Only this fallback needs the simple reader
            from masci_tools.io.hdf5_util import read_hdf_simple

            msg = 'Using the HDF5Reader without a recipe falling back to simple HDF reader'
            logger.warning(msg)
            warnings.warn(msg)
            res = read_hdf_simple(self._file)
            logger.info('Finished reading .hdf file')
            return res

        datasets = recipe.get('datasets', {})
        attributes = recipe.get('attributes', {})

        # Every distinct h5path is read only once, even if several entries refer to it
        h5paths = {item['h5path'] for item in chain(datasets.values(), attributes.values())}
        extracted_datasets = {h5path: self._read_dataset(h5path) for h5path in h5paths}

        # Attributes come first, since dataset transformations may use their values
        output_attrs = self._process_entries(attributes, extracted_datasets)
        output_data = self._process_entries(datasets, extracted_datasets, attributes=output_attrs)

        if self._move_to_memory:
            logger.debug('Moving remaining h5py.Datasets to memory')
            try:
                self._transforms['move_to_memory'](output_data)
                self._transforms['move_to_memory'](output_attrs)
            except Exception as err:
                logger.exception(str(err))
                raise

        logger.info('Finished reading HDF file: %s', self._filename)

        return output_data, output_attrs