Source code for masci_tools.io.parsers.hdf5.reader

# -*- coding: utf-8 -*-
###############################################################################
# Copyright (c), Forschungszentrum Jülich GmbH, IAS-1/PGI-1, Germany.         #
#                All rights reserved.                                         #
# This file is part of the Masci-tools package.                               #
# (Material science tools)                                                    #
#                                                                             #
# The code is hosted on GitHub at https://github.com/judftteam/masci-tools.   #
# For further information on the license, see the LICENSE.txt file.           #
# For further information please visit http://judft.de/.                      #
#                                                                             #
###############################################################################
"""
This module contains a generic HDF5 reader
"""
import io
import logging
import os
import warnings
from collections import namedtuple

import h5py

# Recipe entry describing a single transformation step: the name under which
# the transformation is registered, plus the positional and keyword arguments
# that are passed to it after the dataset.
Transformation = namedtuple('Transformation', ('name', 'args', 'kwargs'))

# Like ``Transformation`` but with an additional ``attrib_name``: the reader
# looks this key up in the already processed attributes and hands the value to
# the transformation in front of ``args``.
AttribTransformation = namedtuple('AttribTransformation', ('name', 'attrib_name', 'args', 'kwargs'))

# Module-level logger shared by everything in this module
logger = logging.getLogger(__name__)


class HDF5Reader:
    """Class for reading in data from hdf5 files using a specified recipe

    :param file: filepath to hdf file (str or path-like object)
                 or opened file handle (mode 'rb')
    :param move_to_memory: bool if True after reading and transforming the data
                           all leftover h5py.Datasets are moved into np.arrays

    The recipe is passed to the :py:meth:`HDF5Reader.read()` method and consists
    of a dict specifying which attributes and datasets to read in and how to
    transform them

    Each attribute/dataset entry corresponds to one entry point in the given
    `.hdf` file. Available transformations can either be found in
    :py:mod:`~masci_tools.io.parsers.hdf5.transforms` or can be defined by the
    user with the
    :py:func:`~masci_tools.io.parsers.hdf5.transforms.hdf5_transformation`
    decorator

    Basic Usage:

    .. code-block:: python

        from masci_tools.io.parsers.hdf5 import HDF5Reader
        import masci_tools.io.parsers.hdf5.recipes as recipes

        #This example shows the usage for producing data from a bandstructure calculation
        #in Fleur
        with HDF5Reader('/path/to/hdf/banddos.hdf') as h5reader:
            data, attributes = h5reader.read(recipe=recipes.FleurBands)
        print(data, attributes)

    """

    # NOTE(review): ``_transforms`` (name -> callable) and ``_attribute_transforms``
    # (collection of names) are used below but are not defined in this class body;
    # presumably they are registered from outside (e.g. by the
    # ``hdf5_transformation`` decorator) - confirm.

    def __init__(self, file, move_to_memory=True):
        self._file = file

        # Opened file handles carry their path in ``name``;
        # everything else is treated as the path itself
        filename = file.name if isinstance(file, io.IOBase) else file
        if isinstance(filename, os.PathLike):
            # e.g. pathlib.Path: normalise so that the suffix check below works
            filename = os.fspath(filename)
        self._filename = filename

        if not self._filename.endswith('.hdf'):
            # No exception is being handled here, so ``logger.exception`` would
            # only append a meaningless ``NoneType: None`` traceback
            logger.error('Wrong File Type for %s: Got %s', self.__class__.__name__, self._filename)
            raise ValueError(f'Wrong File Type for {self.__class__.__name__}: Got {self._filename}')

        logger.info('Instantiated %s with file %s', self.__class__.__name__, self._filename)

        self._move_to_memory = move_to_memory
        self._h5_file = None

    def __enter__(self):
        """Open the file with h5py in read-only mode and return the reader itself"""
        self._h5_file = h5py.File(self._file, 'r')
        logger.debug('Opened h5py.File with id %s', self._h5_file.id)
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the h5py file opened in ``__enter__``"""
        self._h5_file.close()
        logger.debug('Closed h5py.File with id %s', self._h5_file.id)

    def _read_dataset(self, h5path, strict=True):
        """Return the dataset specified by the given h5path

        :param h5path: str, HDF5 group path in file.
        :param strict: bool, if no dataset is at the path and strict is True
                       raise an error, else return None

        :returns: h5py.Dataset or None. For the root paths ``'/'`` and ``''``
                  None is returned directly if strict is True

        :raises: ValueError if no dataset is at path and strict is True
        """
        # The root path is never looked up in strict mode
        if strict and h5path in ('/', ''):
            return None

        logger.debug('Reading dataset from path %s', h5path)
        dset = self._h5_file.get(h5path)
        if dset is not None:
            return dset

        if strict:
            logger.error('HDF5 input file %s has no Dataset at %s.', self._file, h5path)
            raise ValueError(f'HDF5 input file {self._file} has no Dataset at {h5path}.')
        return None

    def _transform_dataset(self, transforms, dataset, attributes=None, dataset_name=None):
        """
        Transforms the given dataset with the given list of tasks

        :param transforms: list of namedtuples defining the tasks to perform
        :param dataset: h5py.Dataset, on which to perform the operations
        :param attributes: dict of previously processed attributes. Only available
                           to the entries in the datasets section of the recipe.
                           This allows for operations with the previously parsed
                           attributes
        :param dataset_name: name of the recipe entry, only used for logging

        :returns: the dataset with all the transformations applied

        :raises: ValueError if a transformation needing an attribute is used
                 while no attributes are available
        """
        transformed_dset = dataset
        for spec in transforms:

            args = spec.args
            if spec.name in self._attribute_transforms:
                if attributes is None:
                    raise ValueError('Attribute transform not allowed for attributes')
                # The attribute value is passed in front of the explicit arguments
                args = (attributes[spec.attrib_name], *args)

            logger.debug('Applying transformation %s to dataset %s of type %s', spec.name, dataset_name,
                         type(transformed_dset))
            try:
                transformed_dset = self._transforms[spec.name](transformed_dset, *args, **spec.kwargs)
            except Exception as err:
                logger.exception(str(err))
                raise

        return transformed_dset

    @staticmethod
    def _unpack_dataset(output_dict, dataset_name):
        """
        Unpack the entries of the dictionary dataset in the entry dataset_name
        into the output_dict

        The given ``output_dict`` is not modified; a new dict is returned.

        :param output_dict: dict with the dataset entries
        :param dataset_name: key of the dataset to unpack into output_dict

        :returns: new dict with the entry ``dataset_name`` replaced by its entries

        :raises: ValueError if the entry is not a dict or if unpacking would
                 overwrite another entry
        """
        logger.debug('Unpacking dict dataset %s after transformations', dataset_name)

        unpack_dict = output_dict[dataset_name]
        if not isinstance(unpack_dict, dict):
            raise ValueError(f'{dataset_name} cannot be unpacked: Got {type(unpack_dict)}')

        remaining = {key: val for key, val in output_dict.items() if key != dataset_name}

        # dict views support the set operators but have no ``intersection`` method
        overlap = unpack_dict.keys() & remaining.keys()
        if overlap:
            raise ValueError('Unpacking would result in lost information: \n'
                             f"Intersection of keys: '{sorted(overlap, key=str)}'")

        return {**remaining, **unpack_dict}

    def _process_entries(self, entries, extracted_datasets, attributes=None):
        """
        Transform (and unpack if requested) all entries of one section of the recipe

        :param entries: dict, one section (``attributes`` or ``datasets``) of the recipe
        :param extracted_datasets: dict mapping h5path to the dataset read from the file
        :param attributes: dict of previously processed attributes or None

        :returns: dict with the processed entries of the section
        """
        output = {}
        for key, val in entries.items():
            output[key] = self._transform_dataset(val.get('transforms', []),
                                                  extracted_datasets[val['h5path']],
                                                  attributes=attributes,
                                                  dataset_name=key)
            if val.get('unpack_dict', False):
                try:
                    output = self._unpack_dataset(output, dataset_name=key)
                except Exception as err:
                    logger.exception(str(err))
                    raise
        return output

    def read(self, recipe=None):
        """Extracts datasets from HDF5 file, transforms them and returns them in two dicts.

        :param recipe: dict with the format given in :py:mod:`~masci_tools.io.parsers.hdf5.recipes`.
                       If no recipe is given a warning is issued and the result of
                       ``read_hdf_simple`` is returned instead

        :returns: two dicts with the datasets/attributes read in and transformed according to the recipe
        """
        from itertools import chain

        logger.info('Started reading HDF file: %s', self._filename)

        if recipe is None:
            from masci_tools.io.hdf5_util import read_hdf_simple

            msg = 'Using the HDF5Reader without a recipe falling back to simple HDF reader'
            # Use the module logger; the root-level ``logging.warning`` would
            # implicitly configure the root logger from library code
            logger.warning(msg)
            warnings.warn(msg)
            res = read_hdf_simple(self._file)
            logger.info('Finished reading .hdf file')
            return res

        datasets = recipe.get('datasets', {})
        attributes = recipe.get('attributes', {})

        # Every distinct path is read only once, even if several entries refer to it
        h5paths = {item['h5path'] for item in chain(datasets.values(), attributes.values())}
        extracted_datasets = {h5path: self._read_dataset(h5path) for h5path in h5paths}

        # Attributes are processed first since the dataset transformations may use them
        output_attrs = self._process_entries(attributes, extracted_datasets)
        output_data = self._process_entries(datasets, extracted_datasets, attributes=output_attrs)

        if self._move_to_memory:
            logger.debug('Moving remaining h5py.Datasets to memory')
            try:
                self._transforms['move_to_memory'](output_data)
                self._transforms['move_to_memory'](output_attrs)
            except Exception as err:
                logger.exception(str(err))
                raise

        logger.info('Finished reading HDF file: %s', self._filename)

        return output_data, output_attrs