###############################################################################
# Copyright (c), Forschungszentrum Jülich GmbH, IAS-1/PGI-1, Germany.         #
#                All rights reserved.                                         #
# This file is part of the Masci-tools package.                               #
# (Material science tools)                                                    #
#                                                                             #
# The code is hosted on GitHub at https://github.com/judftteam/masci-tools.   #
# For further information on the license, see the LICENSE.txt file.           #
# For further information please visit http://judft.de/.                      #
#                                                                             #
###############################################################################
"""
Small utility functions for inspecting hdf files and converting the
complete file structure into a python dictionary
"""
from __future__ import annotations
from typing import Any
import h5py
import numpy as np
from masci_tools.util.typing import FileLike


def hdfList(name: str, obj: h5py.HLObject) -> None:
    """
    Print the name of the current object (indented to create a nice tree structure)
    Also prints attribute values and dataset shapes and datatypes
    """
    print(f"{name.split('/')[-1]:>{len(name)-1}}: {type(obj)}")

    ref_length = len(name) - len(name.split('/')[-1]) + 4
    if isinstance(obj, h5py.Dataset):
        print(f"{'Datatype:':>{ref_length+9}} {obj.dtype}")
        print(f"{'Shape:':>{ref_length+6}} {obj.shape}\n")

    if obj.attrs:
        print(f"{'Attributes:':>{ref_length+9}}")
        for attr_name, attr_val in obj.attrs.items():
            # h5py returns attribute values as numpy arrays, not lists;
            # unwrap single-element arrays so they print as scalars
            if isinstance(attr_val, np.ndarray) and len(attr_val) == 1:
                attr_val = attr_val[0]
            print(f'{attr_name:>{ref_length+len(attr_name)}}: {attr_val}')
        print('')
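

# Illustrative sketch (not part of the original module): hdfList is written
# as a callback for h5py's visititems traversal, which is exactly how h5dump
# below uses it. The file name 'banddos.hdf' is a hypothetical example.
def _example_hdfList() -> None:
    with h5py.File('banddos.hdf', 'r') as hdf_file:
        # visititems calls hdfList once for every group and dataset,
        # passing the full path name and the object itself
        hdf_file.visititems(hdfList)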


def h5dump(file: FileLike, group: str = '/') -> None:
    """
    Shows the overall file structure of an HDF5 file
    Goes through all groups and subgroups and prints the attributes
    or the shape and datatype of the datasets

    :param file: path or filehandle to the hdf file
    :param group: optional group path to start the dump from (default: root '/')
    """
    with h5py.File(file, 'r') as file_hdf:
        if group != '/':
            print(f'Starting from path {group}')
            hdfList(group, file_hdf[group])

        print('This path contains:\n')
        file_hdf[group].visititems(hdfList)
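

# Usage sketch (an assumption, not part of the original module; the file
# name is hypothetical): dump the complete tree, or start from a subgroup.
def _example_h5dump() -> None:
    h5dump('banddos.hdf')                    # whole file from the root
    h5dump('banddos.hdf', group='/general')  # only this group and below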


def read_hdf_simple(file: FileLike, flatten: bool = False) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Reads in an hdf file and returns its contents in a nested dictionary

    :param file: path or filehandle to the hdf file
    :param flatten: bool, if True the dictionary will be flattened (does not check for lost information)

    :returns: two dictionaries, one with the datasets the other
              with the attributes in the file

    **Non-unique group attribute or dataset names will be overwritten in the return dict**
    """
    with h5py.File(file, 'r') as file_hdf:
        datasets, group_attrs = read_groups(file_hdf, flatten=flatten)
    return datasets, group_attrs
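

# Illustrative sketch (not part of the original module): how the two
# dictionaries returned by read_hdf_simple are typically consumed. The file
# name is hypothetical; the keys depend entirely on the file's contents.
def _example_read_hdf_simple() -> None:
    datasets, attrs = read_hdf_simple('banddos.hdf')
    # Nested dictionaries mirror the group structure of the file
    for group_name, content in datasets.items():
        print(group_name, type(content))
    # With flatten=True all keys end up in one flat dictionary; duplicate
    # names across groups silently overwrite each other
    flat_datasets, flat_attrs = read_hdf_simple('banddos.hdf', flatten=True)
    print(sorted(flat_datasets))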


def read_groups(hdfdata: h5py.Group, flatten: bool = False) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Recursive function to read a hdf datastructure and extract the datasets
    and attributes

    :param hdfdata: current hdf group to process
    :param flatten: bool, if True the dictionary will be flattened (does not check for lost information)

    :returns: two dictionaries, one with the datasets the other
              with the attributes in the file
    """
    datasets: dict[str, Any] = {}
    attrs: dict[str, Any] = {}
    for name, attr_val in hdfdata.attrs.items():
        # Unwrap single-element array attributes to scalars; scalar and
        # string attributes have no meaningful len() here and are kept as-is
        if isinstance(attr_val, np.ndarray) and len(attr_val) == 1:
            attr_val = attr_val[0]
        attrs[name] = attr_val

    for key, val in hdfdata.items():
        if isinstance(val, h5py.Dataset):
            datasets[key] = np.array(val)
        else:
            new_datasets, new_attrs = read_groups(val, flatten=flatten)
            if not flatten:
                if new_datasets:
                    datasets[key] = new_datasets
                if new_attrs:
                    attrs[key] = new_attrs
            else:
                datasets.update(new_datasets)
                attrs.update(new_attrs)

    return datasets, attrs
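

# Minimal self-contained demonstration (an assumption, not part of the
# original module): build a small in-memory HDF5 file with h5py's 'core'
# driver and compare the nested and flattened results of read_groups.
def _demo_read_groups() -> None:
    with h5py.File('demo.h5', 'w', driver='core', backing_store=False) as hdf_file:
        group = hdf_file.create_group('results')
        group.attrs['version'] = np.array([1])
        group.create_dataset('energies', data=np.linspace(0.0, 1.0, 5))

        nested, nested_attrs = read_groups(hdf_file)
        flat, flat_attrs = read_groups(hdf_file, flatten=True)

    # nested == {'results': {'energies': array([0., 0.25, 0.5, 0.75, 1.])}}
    # flat   == {'energies': array([0., 0.25, 0.5, 0.75, 1.])}
    # nested_attrs == {'results': {'version': 1}}; flat_attrs == {'version': 1}
    print(nested, flat, nested_attrs, flat_attrs)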