Source code for pyNSID.io.hdf_io

# -*- coding: utf-8 -*-
"""
Utilities that assist in writing NSID related data to HDF5 files

Created on Thu August 20 2020

@author: Suhas Somnath, Gerd Duscher
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import sys
from warnings import warn

import ase
import h5py
import numpy as np

__all__ = ['create_empty_dataset', 'write_nsid_dataset', 'write_results']

from dask import array as da
from sidpy import Dataset, Dimension
from sidpy.base.dict_utils import flatten_dict
from sidpy.base.num_utils import contains_integers
from sidpy.hdf.hdf_utils import (is_editable_h5, write_book_keeping_attrs,
                                 write_dict_to_h5_group, write_simple_attrs)
from sidpy.hdf.prov_utils import create_indexed_group

from .hdf_utils import link_as_main, write_pynsid_book_keeping_attrs

if sys.version_info.major == 3:
    unicode = str


[docs]def create_empty_dataset(shape, h5_group, name='nDIM_Data'): """ returns a h5py.Dataset filled with zeros according to required shape list. Parameters ---------- shape: list List of integers denoting the shape of the main dataset h5_group: h5py.Group HDF5 group into which the datasets will be written into name: str, optional. Default: "nDIM_Data" Name of the main HDF5 dataset Returns ------- h5py.Dataset HDF5 dataset of desired shape written according to NSID format """ if not contains_integers(shape): raise ValueError('dimensions of shape need to be all integers') if not isinstance(h5_group, h5py.Group): raise TypeError('h5_group should be a h5py.Group object') return write_nsid_dataset(Dataset.from_array(np.zeros(shape)), h5_group, name)
[docs]def write_nsid_dataset(dataset, h5_group, main_data_name='', verbose=False, **kwargs): """ Writes the provided sid dataset as a 'Main' dataset with all appropriate linking. Parameters ---------- dataset : sidpy.Dataset Dataset to be written to HDF5 in NSID format h5_group : class:`h5py.Group` Parent group under which the datasets will be created main_data_name : String / Unicode Name to give to the main dataset. This cannot contain the '-' character Use this to provide better context about the dataset in the HDF5 file verbose : bool, Optional. Default = False Whether or not to write logs to standard out kwargs: dict additional keyword arguments passed on to h5py when writing data Return ------ h5py dataset """ if not isinstance(dataset, Dataset): raise TypeError('data to write should be sidpy Dataset') if not isinstance(h5_group, (h5py.Group, h5py.File)): raise TypeError('h5_parent_group should be a h5py.File or h5py.Group ' 'object') if not isinstance(main_data_name, str): raise TypeError('main_data_name should be a string, but it instead it' ' is {}'.format(type(main_data_name))) if not is_editable_h5(h5_group): raise ValueError('The provided file is not editable') if verbose: print('h5 group and file OK') if not isinstance(main_data_name, str): raise TypeError('main_data_name must be a string') if main_data_name == '': if dataset.title.strip() == '': main_data_name = 'nDim_Data' else: main_data_name = dataset.title.split('/')[-1] main_data_name = main_data_name.strip() if '-' in main_data_name: warn('main_data_name should not contain the "-" character. Reformatted' ' name from:{} to ' '{}'.format(main_data_name, main_data_name.replace('-', '_'))) main_data_name = main_data_name.replace('-', '_') h5_group = h5_group.create_group(main_data_name) write_book_keeping_attrs(h5_group) write_pynsid_book_keeping_attrs(h5_group) ##################### # Write Main Dataset #################### if h5_group.file.driver == 'mpio': if kwargs.pop('compression', None) is not None: warn('This HDF5 file has been opened wth the "mpio" communicator. ' 'mpi4py does not allow creation of compressed datasets. ' 'Compression kwarg has been removed') if main_data_name in h5_group: raise ValueError('h5 dataset of that name already exists, choose ' 'different name or delete first') _ = kwargs.pop('dtype', None) # step 1 - create the empty dataset: h5_main = h5_group.create_dataset(main_data_name, shape=dataset.shape, dtype=dataset.dtype, **kwargs) if verbose: print('Created empty dataset: {} for writing Dask dataset: {}' ''.format(h5_main, dataset)) print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"' ''.format(h5_main.name, h5_main.file.filename)) # Step 2 - now ask Dask to dump data to disk da.to_hdf5(h5_main.file.filename, {h5_main.name: dataset}) if verbose: print('Created dataset for Main') ################# # Add Dimensions ################# dimensional_dict = {} for i, this_dim in dataset._axes.items(): if not isinstance(this_dim, Dimension): raise ValueError('Dimensions {} is not a sidpy Dimension') this_dim_dset = h5_group.create_dataset(this_dim.name, data=this_dim.values) attrs_to_write = {'name': this_dim.name, 'units': this_dim.units, 'quantity': this_dim.quantity, 'dimension_type': this_dim.dimension_type.name} write_simple_attrs(this_dim_dset, attrs_to_write) dimensional_dict[i] = this_dim_dset attrs_to_write = {'quantity': dataset.quantity, 'units': dataset.units, 'main_data_name': dataset.title, 'data_type': dataset.data_type.name, 'modality': dataset.modality, 'source': dataset.source} write_simple_attrs(h5_main, attrs_to_write) write_pynsid_book_keeping_attrs(h5_main) for attr_name in dir(dataset): attr_val = getattr(dataset, attr_name) if attr_name == 'structures': if verbose: print('Writing structure attributes {} of the ' 'sidpy.Dataset'.format(attr_val.keys)) structure_dict = structures_to_dict(attr_val) write_dict_to_h5_group(h5_group, attr_val, structure_dict) elif isinstance(attr_val, dict) and attr_name[0] != '_': if verbose: print('Writing attributes from property: {} of the ' 'sidpy.Dataset'.format(attr_name)) write_dict_to_h5_group(h5_group, attr_val, attr_name) # This will attach the dimensions nsid_data_main = link_as_main(h5_main, dimensional_dict) if verbose: print('Successfully linked datasets - dataset should be main now') dataset.h5_dataset = nsid_data_main return nsid_data_main
[docs]def write_results(h5_group, dataset=None, attributes=None, process_name=None): """ Writes results of a processing step back to HDF5 in NSID format Parameters ---------- h5_group : h5py.Group HDF5 Group into which results will be written dataset : sidpy.Dataset, optional. Default = None Dataset ?? attributes : dict, optional. Default = None Metadata regarding processing step process_name : str, optional. Default = "Log_" Name of the prefix for group containing process results Returns ------- log_group : h5py.Group HDF5 group containing results """ found_valid_dataset = False if dataset is not None: if isinstance(dataset, Dataset): dataset = [dataset] if isinstance(dataset, list): if not all([isinstance(itm, Dataset) for itm in dataset]): raise TypeError('List contains non-Sidpy dataset entries! ' 'Should only contain sidpy datasets') found_valid_dataset = True found_valid_attributes = False if attributes is not None: if isinstance(attributes, dict): if len(attributes) > 0: found_valid_attributes = True else: raise TypeError("Provided attributes is type {} but should be type" " dict".format(type(attributes))) if not (found_valid_dataset or found_valid_attributes): raise ValueError('results should contain at least a sidpy Dataset or ' 'a dictionary in results') log_name = 'Log_' if process_name is not None: log_name = log_name+process_name log_group = create_indexed_group(h5_group, log_name) write_book_keeping_attrs(log_group) write_pynsid_book_keeping_attrs(log_group) if found_valid_dataset: for dset in dataset: write_nsid_dataset(dset, log_group) if found_valid_attributes: write_simple_attrs(log_group, attributes) return log_group
[docs]def structures_to_dict(structures): structure_dict = {} for key, structure in structures.items(): structure_dict[key] = ase_to_dict() return structure_dict
[docs]def ase_to_dict(atoms): """ converts ase.Atoms object to dictionary """ tags = {'unit_cell': atoms.cell.array, 'elements': atoms.get_chemical_formula(), 'base': atoms.get_scaled_positions(), 'metadata': atoms.info} return tags