# -*- coding: utf-8 -*-
"""
Utilities for reading and writing USID datasets that are highly model-dependent (with or without N-dimensional form)
Created on Tue Nov 3 21:14:25 2015
@author: Suhas Somnath, Chris Smith
"""
from __future__ import division, print_function, absolute_import, unicode_literals
from warnings import warn
import sys
import h5py
import numpy as np
from dask import array as da
from sidpy.hdf.hdf_utils import get_attr, write_simple_attrs, is_editable_h5, \
copy_dataset, lazy_load_array
from sidpy.base.num_utils import contains_integers
from sidpy.base.dict_utils import flatten_dict
from sidpy.base.string_utils import validate_single_string_arg, \
validate_list_of_strings, validate_string_args
from sidpy.hdf.dtype_utils import validate_dtype
from sidpy import sid
from .base import write_book_keeping_attrs
from .simple import link_as_main, check_if_main, write_ind_val_dsets, validate_dims_against_main, validate_anc_h5_dsets
from ..dimension import Dimension, validate_dimensions
from ..anc_build_utils import INDICES_DTYPE, make_indices_matrix
if sys.version_info.major == 3:
unicode = str
def reshape_to_n_dims(h5_main, h5_pos=None, h5_spec=None, get_labels=False, verbose=False, sort_dims=False,
lazy=False):
"""
Reshape the input 2D matrix to be N-dimensions based on the
position and spectroscopic datasets.
Parameters
----------
h5_main : HDF5 Dataset
2D data to be reshaped
h5_pos : HDF5 Dataset, optional
Position indices corresponding to rows in `h5_main`
h5_spec : HDF5 Dataset, optional
Spectroscopic indices corresponding to columns in `h5_main`
get_labels : bool, optional
Whether or not to return the dimension labels. Default False
verbose : bool, optional
Whether or not to print debugging statements
sort_dims : bool
If True, the data is sorted so that the dimensions are in order from slowest to fastest
If False, the data is kept in the original order
If `get_labels` is also True, the labels are sorted as well.
lazy : bool, optional. Default = False
If False, ds_Nd will be a numpy.ndarray object - suitable if the HDF5 dataset fits into memory
If True, ds_Nd will be a dask.array object - suitable if the HDF5 dataset is too large to fit into
memory. Note that this will be a lazy computation, meaning that the returned object only contains the
instructions. To get the actual values as numpy arrays, call ds_Nd.compute()
Returns
-------
ds_Nd : N-D numpy array or dask.array object
N dimensional array arranged as [positions slowest to fastest, spectroscopic slowest to fastest]
success : boolean or string
True if full reshape was successful
"Positions" if it was only possible to reshape by
the position dimensions
False if no reshape was possible
ds_labels : list of str
List of the labels of each dimension of `ds_Nd`
Notes
-----
If either `h5_pos` or `h5_spec` are not provided, the function will first
attempt to find them as attributes of `h5_main`. If that fails, it will
generate dummy values for them.
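Examples
--------
A minimal sketch, assuming ``h5_main`` is a Main dataset in an open HDF5
file (variable names here are hypothetical):

>>> ds_nd, success, labels = reshape_to_n_dims(h5_main, get_labels=True)
>>> if success is True:
...     print(ds_nd.shape, labels)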
"""
# TODO: automatically switch on lazy if the data is larger than memory
# TODO: sort_dims does not appear to do much. Functions as though it was always True
if h5_pos is None and h5_spec is None:
if not check_if_main(h5_main):
raise ValueError('h5_main must be a Main dataset when h5_pos and h5_spec are not provided')
else:
if not isinstance(h5_main, (h5py.Dataset, np.ndarray, da.core.Array)):
raise TypeError('h5_main should be a h5py.Dataset, numpy array, or dask array')
if h5_pos is not None:
if not isinstance(h5_pos, (h5py.Dataset, np.ndarray, da.core.Array)):
raise TypeError('h5_pos should be a h5py.Dataset, numpy array, or dask array')
if h5_pos.shape[0] != h5_main.shape[0]:
raise ValueError('The size of h5_pos: {} does not match with h5_main: {}'.format(h5_pos.shape,
h5_main.shape))
if h5_spec is not None:
if not isinstance(h5_spec, (h5py.Dataset, np.ndarray, da.core.Array)):
raise TypeError('h5_spec should be a h5py.Dataset, numpy array, or dask array')
if h5_spec.shape[1] != h5_main.shape[1]:
raise ValueError('The size of h5_spec: {} does not match with h5_main: {}'.format(h5_spec.shape,
h5_main.shape))
pos_labs = np.array(['Positions'])
spec_labs = np.array(['Spectral_Step'])
if h5_pos is None:
"""
Get the Position datasets from the references if possible
"""
if isinstance(h5_main, h5py.Dataset):
try:
h5_pos = h5_main.file[h5_main.attrs['Position_Indices']]
ds_pos = h5_pos[()]
pos_labs = get_attr(h5_pos, 'labels')
except KeyError:
print('No position datasets found as attributes of {}'.format(h5_main.name))
if len(h5_main.shape) > 1:
ds_pos = np.arange(h5_main.shape[0], dtype=INDICES_DTYPE).reshape(-1, 1)
pos_labs = np.array(['Position Dimension {}'.format(ipos) for ipos in range(ds_pos.shape[1])])
else:
ds_pos = np.array(0, dtype=INDICES_DTYPE).reshape(-1, 1)
else:
ds_pos = np.arange(h5_main.shape[0], dtype=INDICES_DTYPE).reshape(-1, 1)
pos_labs = np.array(['Position Dimension {}'.format(ipos) for ipos in range(ds_pos.shape[1])])
elif isinstance(h5_pos, h5py.Dataset):
"""
Position Indices dataset was provided
"""
ds_pos = h5_pos[()]
pos_labs = get_attr(h5_pos, 'labels')
elif isinstance(h5_pos, (np.ndarray, da.core.Array)):
ds_pos = np.atleast_2d(h5_pos)
pos_labs = np.array(['Position Dimension {}'.format(ipos) for ipos in range(ds_pos.shape[1])])
else:
raise TypeError('Position Indices must be a h5py.Dataset, numpy array, dask array, or None')
if h5_spec is None:
"""
Get the Spectroscopic datasets from the references if possible
"""
if isinstance(h5_main, h5py.Dataset):
try:
h5_spec = h5_main.file[h5_main.attrs['Spectroscopic_Indices']]
ds_spec = h5_spec[()]
spec_labs = get_attr(h5_spec, 'labels')
except KeyError:
print('No spectroscopic datasets found as attributes of {}'.format(h5_main.name))
if len(h5_main.shape) > 1:
ds_spec = np.arange(h5_main.shape[1], dtype=INDICES_DTYPE).reshape([1, -1])
spec_labs = np.array(['Spectral Dimension {}'.format(ispec) for ispec in range(ds_spec.shape[0])])
else:
ds_spec = np.array(0, dtype=INDICES_DTYPE).reshape([1, 1])
else:
ds_spec = np.arange(h5_main.shape[1], dtype=INDICES_DTYPE).reshape([1, -1])
spec_labs = np.array(['Spectral Dimension {}'.format(ispec) for ispec in range(ds_spec.shape[0])])
elif isinstance(h5_spec, h5py.Dataset):
"""
Spectroscopic Indices dataset was provided
"""
ds_spec = h5_spec[()]
spec_labs = get_attr(h5_spec, 'labels')
elif isinstance(h5_spec, (np.ndarray, da.core.Array)):
ds_spec = h5_spec
spec_labs = np.array(['Spectral Dimension {}'.format(ispec) for ispec in range(ds_spec.shape[0])])
else:
raise TypeError('Spectroscopic Indices must be a h5py.Dataset, numpy array, dask array, or None')
'''
Sort the indices from fastest to slowest
'''
pos_sort = get_sort_order(np.transpose(ds_pos))
spec_sort = get_sort_order(ds_spec)
if verbose:
print('Position dimensions:', pos_labs)
print('Position sort order:', pos_sort)
print('Spectroscopic Dimensions:', spec_labs)
print('Spectroscopic sort order:', spec_sort)
'''
Get the size of each dimension in the sorted order
'''
pos_dims = get_dimensionality(np.transpose(ds_pos), pos_sort)
spec_dims = get_dimensionality(ds_spec, spec_sort)
if np.prod(pos_dims) != h5_main.shape[0]:
mesg = 'Product of position dimension sizes: {} = {} not matching ' \
'with size of first axis of main dataset: {}. One or more ' \
'dimensions are dependent dimensions and not marked as such' \
'.'.format(pos_dims, np.prod(pos_dims), h5_main.shape[0])
raise ValueError(mesg)
if np.prod(spec_dims) != h5_main.shape[1]:
mesg = 'Product of spectroscopic dimension sizes: {} = {} not matching ' \
'with size of second axis of main dataset: {}. One or more ' \
'dimensions are dependent dimensions and not marked as such' \
'.'.format(spec_dims, np.prod(spec_dims), h5_main.shape[1])
raise ValueError(mesg)
if verbose:
print('\nPosition dimensions (sort applied):', pos_labs[pos_sort])
print('Position dimensionality (sort applied):', pos_dims)
print('Spectroscopic dimensions (sort applied):', spec_labs[spec_sort])
print('Spectroscopic dimensionality (sort applied):', spec_dims)
if lazy:
ds_main = lazy_load_array(h5_main)
else:
ds_main = h5_main[()]
"""
Now we reshape the dataset based on those dimensions
numpy reshapes correctly when the dimensions are arranged from slowest to fastest.
Since the sort orders we have are from fastest to slowest, we need to reverse the orders
for both the position and spectroscopic dimensions
"""
if verbose:
print('Will attempt to reshape main dataset from:\n{} to {}'.format(ds_main.shape, pos_dims[::-1] + spec_dims[::-1]))
try:
ds_Nd = ds_main.reshape(pos_dims[::-1] + spec_dims[::-1])
except ValueError:
warn('Could not reshape dataset to full N-dimensional form. Attempting reshape based on position only.')
try:
ds_Nd = ds_main.reshape(pos_dims[::-1] + [-1])
except ValueError:
warn('Reshape by position only also failed. Will keep dataset in 2d form.')
if get_labels:
return ds_main, False, ['Position', 'Spectral Step']
else:
return ds_main, False
# No exception
else:
if get_labels:
# Labels: position axes from slowest to fastest, then the single flattened spectral axis
return ds_Nd, 'Positions', list(pos_labs[pos_sort][::-1]) + ['Spectral Step']
else:
return ds_Nd, 'Positions'
all_labels = np.hstack((pos_labs[pos_sort][::-1],
spec_labs[spec_sort][::-1]))
if verbose:
print('\nAfter reshaping, labels are', all_labels)
print('Data shape is', ds_Nd.shape)
"""
At this point, the data is arranged from slowest to fastest dimension in both pos and spec
"""
if sort_dims:
results = [ds_Nd, True]
if get_labels:
results.append(all_labels)
return results
if verbose:
print('\nGoing to put dimensions back in the same order as in the file:')
swap_axes = list()
# Compare the original order of the pos / spec labels with where these dimensions occur in the sorted labels
for lab in pos_labs:
swap_axes.append(np.argwhere(all_labels == lab).squeeze())
for lab in spec_labs:
swap_axes.append(np.argwhere(all_labels == lab).squeeze())
swap_axes = np.array(swap_axes)
# If there are empty arrays, remove them
swap_axes = np.array([ax for ax in swap_axes if ax.size > 0])
if verbose:
print('Axes will be permuted in this order:', swap_axes)
print('New labels ordering:', all_labels[swap_axes])
ds_Nd = ds_Nd.transpose(tuple(swap_axes))
results = [ds_Nd, True]
if verbose:
print('Dataset now of shape:', ds_Nd.shape)
if get_labels:
'''
Get the labels in the proper order
'''
results.append(all_labels[swap_axes])
return results
def reshape_from_n_dims(data_n_dim, h5_pos=None, h5_spec=None, verbose=False):
"""
Reshape the input 2D matrix to be N-dimensions based on the
position and spectroscopic datasets.
Parameters
----------
data_n_dim : numpy.array or dask.array.core.Array
N dimensional array arranged as [positions dimensions..., spectroscopic dimensions]
If h5_pos and h5_spec are not provided, this function will have to assume that the dimensions
are arranged as [positions slowest to fastest, spectroscopic slowest to fastest].
This restriction is removed if h5_pos and h5_spec are provided
h5_pos : HDF5 Dataset, numpy.array or dask.array.core.Array
Position indices corresponding to rows in the final 2d array
The dimensions should be arranged in terms of rate of change corresponding to data_n_dim.
In other words if data_n_dim had two position dimensions arranged as [pos_fast, pos_slow, spec_dim_1....],
h5_pos should be arranged as [pos_fast, pos_slow]
h5_spec : HDF5 Dataset, numpy. array or dask.array.core.Array
Spectroscopic indices corresponding to columns in the final 2d array
The dimensions should be arranged in terms of rate of change corresponding to data_n_dim.
In other words if data_n_dim had two spectral dimensions arranged as [pos_dim_1,..., spec_fast, spec_slow],
h5_spec should be arranged as [spec_fast, spec_slow]
verbose : bool, optional. Default = False
Whether or not to print log statements
Returns
-------
ds_2d : numpy.array
2 dimensional numpy array arranged as [positions, spectroscopic]
success : boolean or string
True if full reshape was successful
"Positions" if it was only possible to reshape by
the position dimensions
False if no reshape was possible
Notes
-----
If either `h5_pos` or `h5_spec` is not provided, the function will
assume that the corresponding dimensions of `data_n_dim` are already
arranged from slowest to fastest.
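Examples
--------
A minimal sketch, assuming ``data_4d`` is the N-dimensional output of
``reshape_to_n_dims`` and ``h5_pos`` is the matching Position Indices
dataset (hypothetical names):

>>> ds_2d, success = reshape_from_n_dims(data_4d, h5_pos=h5_pos)
>>> success
True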
"""
if not isinstance(data_n_dim, (np.ndarray, da.core.Array)):
raise TypeError('data_n_dim is not a numpy or dask array')
if h5_spec is None and h5_pos is None:
raise ValueError('at least one of h5_pos or h5_spec must be specified for an attempt to reshape to 2D')
if data_n_dim.ndim < 2:
return data_n_dim, True
if h5_pos is None:
pass
elif isinstance(h5_pos, h5py.Dataset):
'''
Position Indices dataset was provided
'''
ds_pos = h5_pos[()]
elif isinstance(h5_pos, da.core.Array):
ds_pos = h5_pos.compute()
elif isinstance(h5_pos, np.ndarray):
ds_pos = h5_pos
else:
raise TypeError('Position Indices must be a h5py.Dataset, numpy array, dask array, or None')
if h5_spec is None:
pass
elif isinstance(h5_spec, h5py.Dataset):
'''
Spectroscopic Indices dataset was provided
'''
ds_spec = h5_spec[()]
elif isinstance(h5_spec, da.core.Array):
ds_spec = h5_spec.compute()
elif isinstance(h5_spec, np.ndarray):
ds_spec = h5_spec
else:
raise TypeError('Spectroscopic Indices must be a h5py.Dataset, numpy array, dask array, or None')
if h5_spec is None and h5_pos is not None:
if verbose:
print('Spectral indices not provided but position indices provided.\n'
'Building spectral indices assuming that dimensions are arranged as slow -> fast')
pos_dims = get_dimensionality(ds_pos, index_sort=get_sort_order(ds_pos))
if not np.all([x in data_n_dim.shape for x in pos_dims]):
raise ValueError('Dimension sizes in pos_dims: {} do not exist in data_n_dim shape: '
'{}'.format(pos_dims, data_n_dim.shape))
spec_dims = list(data_n_dim.shape[len(pos_dims):])
if verbose:
print('data has dimensions: {}. Provided position indices had dimensions of size: {}. Spectral dimensions '
'will be built with sizes: {}'.format(data_n_dim.shape, pos_dims, spec_dims))
ds_spec = make_indices_matrix(spec_dims, is_position=False)
elif h5_pos is None and h5_spec is not None:
if verbose:
print('Position indices not provided but spectral indices provided.\n'
'Building position indices assuming that dimensions are arranged as slow -> fast')
spec_dims = get_dimensionality(ds_spec, index_sort=get_sort_order(ds_spec))
if not np.all([x in data_n_dim.shape for x in spec_dims]):
raise ValueError('Dimension sizes in spec_dims: {} do not exist in data_n_dim shape: '
'{}'.format(spec_dims, data_n_dim.shape))
pos_dims = list(data_n_dim.shape[:data_n_dim.ndim - len(spec_dims)])
if verbose:
print('data has dimensions: {}. Provided spectroscopic indices had dimensions of size: {}. Position '
'dimensions will be built with sizes: {}'.format(data_n_dim.shape, spec_dims, pos_dims))
ds_pos = make_indices_matrix(pos_dims, is_position=True)
elif h5_spec is not None and h5_pos is not None:
if ds_pos.shape[0] * ds_spec.shape[1] != np.prod(data_n_dim.shape):
raise ValueError('The product ({}) of the number of positions ({}) and spectroscopic ({}) observations is '
'not equal to the product ({}) of the data shape ({})'
'.'.format(ds_pos.shape[0] * ds_spec.shape[1], ds_pos.shape[0], ds_spec.shape[1],
np.prod(data_n_dim.shape), data_n_dim.shape))
if ds_pos.shape[1] + ds_spec.shape[0] != data_n_dim.ndim:
# This may mean that the dummy position or spectroscopic axes has been squeezed out!
# Dask does NOT allow singular dimensions apparently. So cannot do expand_dims. Handle later
if ds_pos.size == 1 or ds_spec.size == 1:
if verbose:
print('ALL Position dimensions squeezed: {}. ALL Spectroscopic dimensions squeezed: {}'
'.'.format(ds_pos.size == 1, ds_spec.size == 1))
else:
raise ValueError('The number of position ({}) and spectroscopic ({}) dimensions do not match with the '
'dimensionality of the N-dimensional dataset: {}'
'.'.format(ds_pos.shape[1], ds_spec.shape[0], data_n_dim.ndim))
'''
Sort the indices from fastest to slowest
'''
if ds_pos.size == 1:
# Position dimension squeezed out:
pos_sort = np.array([], dtype=int)  # empty array so the arithmetic on sort orders below still works
else:
pos_sort = get_sort_order(np.transpose(ds_pos))
if ds_spec.size == 1:
# Spectroscopic axis squeezed out:
spec_sort = np.array([], dtype=int)
else:
spec_sort = get_sort_order(ds_spec)
if h5_spec is None:
spec_sort = spec_sort[::-1]
if h5_pos is None:
pos_sort = pos_sort[::-1]
if verbose:
print('Position sort order: {}'.format(pos_sort))
print('Spectroscopic sort order: {}'.format(spec_sort))
'''
Now we transpose the axes associated with the spectroscopic dimensions
so that they are in the same order as in the index array
'''
swap_axes = np.uint16(np.append(pos_sort[::-1], spec_sort[::-1] + len(pos_sort)))
if verbose:
print('swap axes: {} to be applied to N dimensional data of shape {}'.format(swap_axes, data_n_dim.shape))
data_n_dim_2 = data_n_dim.transpose(tuple(swap_axes))
if verbose:
print('N dimensional data shape after axes swap: {}'.format(data_n_dim_2.shape))
'''
Now we reshape the dataset based on those dimensions
We must use the spectroscopic dimensions in reverse order
'''
try:
ds_2d = data_n_dim_2.reshape([ds_pos.shape[0], ds_spec.shape[1]])
except ValueError:
raise ValueError('Could not reshape the N-dimensional dataset to 2D form')
return ds_2d, True
def get_dimensionality(ds_index, index_sort=None):
"""
Get the size of each index dimension in a specified sort order
Parameters
----------
ds_index : 2D HDF5 Dataset or numpy array
Row matrix of indices
index_sort : Iterable of unsigned integers (Optional)
Sort that can be applied to dimensionality.
For example - Order of rows sorted from fastest to slowest
Returns
-------
sorted_dims : list of unsigned integers
Dimensionality of each row in ds_index. If index_sort is supplied, it will be in the sorted order
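Examples
--------
A small self-contained sketch using a synthetic indices matrix with a
3-valued fast dimension and a 2-valued slow dimension:

>>> import numpy as np
>>> inds = np.vstack((np.tile(np.arange(3), 2),
...                   np.repeat(np.arange(2), 3)))
>>> get_dimensionality(inds)
[3, 2]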
"""
if isinstance(ds_index, da.core.Array):
ds_index = ds_index.compute()
if not isinstance(ds_index, (np.ndarray, h5py.Dataset)):
raise TypeError('ds_index should either be a numpy array or h5py.Dataset')
if ds_index.shape[0] > ds_index.shape[1]:
# must be spectroscopic like in shape (few rows, more cols)
ds_index = np.transpose(ds_index)
if index_sort is None:
index_sort = np.arange(ds_index.shape[0])
else:
if not contains_integers(index_sort, min_val=0):
raise ValueError('index_sort should contain integers >= 0')
index_sort = np.array(index_sort)
if index_sort.ndim != 1:
raise ValueError('index_sort should be a 1D array')
if len(np.unique(index_sort)) > ds_index.shape[0]:
raise ValueError('length of index_sort ({}) should not exceed the number of dimensions in the provided'
' dataset ({})'.format(len(np.unique(index_sort)), ds_index.shape[0]))
if set(np.arange(ds_index.shape[0])) != set(index_sort):
raise ValueError('Sort order of dimensions ({}) not matching with number of dimensions ({})'
''.format(index_sort, ds_index.shape[0]))
sorted_dims = [len(np.unique(row)) for row in np.array(ds_index, ndmin=2)[index_sort]]
return sorted_dims
def get_sort_order(ds_spec):
"""
Find how quickly the spectroscopic values are changing in each row
and the order of rows from fastest changing to slowest.
Parameters
----------
ds_spec : 2D HDF5 dataset or numpy array
Rows of indices to be sorted from fastest changing to slowest
Returns
-------
change_sort : List of unsigned integers
Order of rows sorted from fastest changing to slowest
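Examples
--------
A small sketch where the faster-changing dimension is in the second row:

>>> import numpy as np
>>> inds = np.vstack((np.repeat(np.arange(2), 3),
...                   np.tile(np.arange(3), 2)))
>>> get_sort_order(inds)
array([1, 0])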
"""
if isinstance(ds_spec, da.core.Array):
ds_spec = ds_spec.compute()
if not isinstance(ds_spec, (np.ndarray, h5py.Dataset)):
raise TypeError('ds_spec should either be a numpy array or h5py.Dataset')
if ds_spec.shape[0] > ds_spec.shape[1]:
# must be spectroscopic like in shape (few rows, more cols)
ds_spec = np.transpose(ds_spec)
change_count = [len(np.where([row[i] != row[i - 1] for i in range(len(row))])[0]) for row in ds_spec]
change_sort = np.argsort(change_count)[::-1]
return change_sort
def get_unit_values(ds_inds, ds_vals, dim_names=None, all_dim_names=None, is_spec=None, verbose=False):
"""
Gets the unit arrays of values that describe the spectroscopic dimensions
Parameters
----------
ds_inds : h5py.Dataset or numpy.ndarray
Spectroscopic or Position Indices dataset
ds_vals : h5py.Dataset or numpy.ndarray
Spectroscopic or Position Values dataset
dim_names : str, or list of str, Optional
Names of the dimensions of interest. Default = all
all_dim_names : list of str, Optional
Names of all the dimensions in these datasets. Use this if supplying numpy arrays instead of h5py.Dataset
objects for ds_inds, ds_vals since there is no other way of getting the dimension names.
is_spec : bool, optional
Whether or not the provided ancillary datasets are position or spectroscopic
The user is recommended to supply this parameter whenever it is known
By default, this function will attempt to recognize the answer based on the shape of the datasets.
verbose : bool, optional
Whether or not to print debugging statements. Default - off
Returns
-------
unit_values : dict
Dictionary containing the unit array for each dimension. The name of the dimensions are the keys.
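Notes
-----
This function can be extended / modified for ancillary position dimensions as well

Examples
--------
A minimal sketch, assuming ``h5_spec_inds`` and ``h5_spec_vals`` are the
Spectroscopic Indices and Values datasets of a Main dataset (hypothetical
names):

>>> unit_vals = get_unit_values(h5_spec_inds, h5_spec_vals)
>>> for dim_name, vec in unit_vals.items():
...     print(dim_name, vec.shape)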
"""
if all_dim_names is None:
allowed_types = h5py.Dataset
else:
all_dim_names = validate_list_of_strings(all_dim_names, 'all_dim_names')
all_dim_names = np.array(all_dim_names)
allowed_types = (h5py.Dataset, np.ndarray)
for dset, dset_name in zip([ds_inds, ds_vals], ['ds_inds', 'ds_vals']):
if not isinstance(dset, allowed_types):
raise TypeError(dset_name + ' should be of type: {}'.format(allowed_types))
# For now, we will throw an error if even a single dimension is listed as an incomplete dimension:
if isinstance(ds_inds, h5py.Dataset):
if np.any(['incomplete_dimensions' in dset.attrs.keys() for dset in [ds_inds, ds_vals]]):
try:
incomp_dims_inds = get_attr(ds_inds, 'incomplete_dimensions')
except KeyError:
incomp_dims_inds = None
try:
incomp_dims_vals = get_attr(ds_vals, 'incomplete_dimensions')
except KeyError:
incomp_dims_vals = None
if incomp_dims_inds is None and incomp_dims_vals is not None:
incomp_dims = incomp_dims_vals
elif incomp_dims_inds is not None and incomp_dims_vals is None:
incomp_dims = incomp_dims_inds
else:
# ensure that both attributes are the same
if incomp_dims_vals != incomp_dims_inds:
raise ValueError('Provided indices ({}) and values ({}) datasets were marked with different values '
'for incomplete_dimensions.'.format(incomp_dims_inds, incomp_dims_vals))
incomp_dims = incomp_dims_vals
all_dim_names = get_attr(ds_inds, 'labels')
raise ValueError('Among all dimensions: {}, these dimensions were marked as incomplete: {}'
'. You are recommended to find unit values manually'.format(all_dim_names, incomp_dims))
# Do we need to check that the provided inds and vals correspond to the same main dataset?
if ds_inds.shape != ds_vals.shape:
raise ValueError('ds_inds: {} and ds_vals: {} should have the same shapes'.format(ds_inds.shape, ds_vals.shape))
if all_dim_names is None:
all_dim_names = get_attr(ds_inds, 'labels')
if verbose:
print('All dimensions: {}'.format(all_dim_names))
# First load to memory
inds_mat = ds_inds[()]
vals_mat = ds_vals[()]
if is_spec is None:
# Attempt to recognize the type automatically
is_spec = False
if inds_mat.shape[0] < inds_mat.shape[1]:
is_spec = True
else:
if not isinstance(is_spec, bool):
raise TypeError('is_spec should be a boolean. Provided object is of type: {}'.format(type(is_spec)))
if verbose:
print(
'Ancillary matrices of shape: {}, hence determined to be Spectroscopic:{}'.format(inds_mat.shape, is_spec))
if not is_spec:
# Convert to spectral shape
inds_mat = np.transpose(inds_mat)
vals_mat = np.transpose(vals_mat)
if len(all_dim_names) != inds_mat.shape[0]:
raise ValueError('Length of dimension names list: {} not matching with shape of dataset: {}'
'.'.format(len(all_dim_names), inds_mat.shape[0]))
if dim_names is None:
dim_names = all_dim_names
if verbose:
print('Going to return unit values for all dimensions: {}'.format(all_dim_names))
else:
dim_names = validate_list_of_strings(dim_names, 'dim_names')
if verbose:
print('Checking to make sure that the target dimension names: {} exist in the datasets attributes: {}'
'.'.format(dim_names, all_dim_names))
# check to make sure that the dimension names exist in the datasets:
for dim_name in dim_names:
if dim_name not in all_dim_names:
raise KeyError('Dimension {} does not exist in the provided ancillary datasets'.format(dim_name))
unit_values = dict()
for dim_name in all_dim_names:
# Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
if verbose:
print('Looking for dimension: {} in {}'.format(dim_name, dim_names))
desired_row_ind = np.where(all_dim_names == dim_name)[0][0]
inds_for_dim = inds_mat[desired_row_ind]
# Wherever this dimension goes to 0 - start of a new tile
starts = np.where(inds_for_dim == np.min(inds_for_dim))[0]
if starts[0] != 0:
raise ValueError('Spectroscopic Indices for dimension: "{}" not '
'starting with 0. Please fix this and try again'
'.'.format(dim_name))
# There may be repetitions in addition to tiling. Find how the indices increase.
# 1 = repetition, > 1 = new tile
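# Example (hypothetical): for inds_for_dim = [0, 0, 1, 1, 0, 0, 1, 1],
# starts = [0, 1, 4, 5] and step_sizes = [1, 1, 3, 1]; the step of 3
# marks the start of the second tile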
step_sizes = np.hstack(([1], np.diff(starts)))
# This array has the same length as starts - one entry per repetition / tile boundary
# We should expect only two values of step sizes for a regular dimension (tiles of the same size):
# 1 for same value repeating and a big jump in indices when the next tile starts
# If the repeats / tiles are of different lengths, then this is not a regular dimension.
# What does a Unit Values vector even mean in this case? Just raise an error for now
if np.where(np.unique(step_sizes) - 1)[0].size > 1:
raise ValueError('Non constant step sizes')
# Finding Start of a new tile
tile_starts = np.where(step_sizes > 1)[0]
# Convert these indices (which index into starts) into positions in the full indices vector
if len(tile_starts) < 1:
# Dimension(s) with no tiling at all
# Make it look as though the next tile starts at the end of the whole indices vector
tile_starts = np.array([0, len(inds_for_dim)])
else:
# Dimension with some form of repetition
tile_starts = np.hstack(([0], starts[tile_starts]))
# Verify that each tile is identical here
# Last tile will not be checked unless we add the length of the indices vector as the start of next tile
tile_starts = np.hstack((tile_starts, [len(inds_for_dim)]))
subsections = [inds_for_dim[tile_starts[ind]: tile_starts[ind + 1]] for ind in range(len(tile_starts) - 1)]
if np.max(np.diff(subsections, axis=0)) != 0:
# Should get unit values for ALL dimensions regardless of expectations to catch such scenarios.
raise ValueError('Values in each tile of dimension: {} are different'.format(dim_name))
# Now looking within the first tile:
subsection = inds_for_dim[tile_starts[0]:tile_starts[1]]
# remove all repetitions. ie - take indices only where jump == 1
step_inds = np.hstack(([0], np.where(np.hstack(([0], np.diff(subsection))))[0]))
# Finally, use these indices to get the values
if dim_name in dim_names:
# Only add this dimension to the dictionary if requested.
unit_values[dim_name] = vals_mat[desired_row_ind, step_inds]
return unit_values
def write_main_dataset(h5_parent_group, main_data, main_data_name, quantity, units, pos_dims, spec_dims,
main_dset_attrs=None, h5_pos_inds=None, h5_pos_vals=None, h5_spec_inds=None, h5_spec_vals=None,
aux_spec_prefix='Spectroscopic_', aux_pos_prefix='Position_', verbose=False,
slow_to_fast=False, **kwargs):
"""
Writes the provided data as a 'Main' dataset with all appropriate linking.
By default, the instructions for generating the ancillary datasets should be specified using the pos_dims and
spec_dims arguments as dictionary objects. Alternatively, if both the indices and values datasets are already
available for either/or the positions / spectroscopic, they can be specified using the keyword arguments. In this
case, fresh datasets will not be generated.
Parameters
----------
h5_parent_group : :class:`h5py.Group`
Parent group under which the datasets will be created
main_data : numpy.ndarray, dask.array.core.Array, list or tuple
2D matrix formatted as [position, spectral] or a list / tuple with the shape for an empty dataset.
If creating an empty dataset - the dtype must be specified via a kwarg.
main_data_name : String / Unicode
Name to give to the main dataset. This cannot contain the '-' character.
quantity : String / Unicode
Name of the physical quantity stored in the dataset. Example - 'Current'
units : String / Unicode
Name of units for the quantity stored in the dataset. Example - 'A' for amperes
pos_dims : Dimension or array-like of Dimension objects
Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
datasets
Object specifying the instructions necessary for building the Position indices and values datasets
spec_dims : Dimension or array-like of Dimension objects
Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
datasets
Object specifying the instructions necessary for building the Spectroscopic indices and values datasets
main_dset_attrs : dictionary, Optional
Dictionary of parameters that will be written to the main dataset. Do NOT include region references here.
h5_pos_inds : h5py.Dataset, Optional
Dataset that will be linked with the name "Position_Indices"
h5_pos_vals : h5py.Dataset, Optional
Dataset that will be linked with the name "Position_Values"
h5_spec_inds : h5py.Dataset, Optional
Dataset that will be linked with the name "Spectroscopic_Indices"
h5_spec_vals : h5py.Dataset, Optional
Dataset that will be linked with the name "Spectroscopic_Values"
aux_spec_prefix : str or unicode, Optional
Default prefix for Spectroscopic datasets. Default = "Spectroscopic_"
aux_pos_prefix : str or unicode, Optional
Default prefix for Position datasets. Default = "Position_"
verbose : bool, Optional, default=False
If set to true - prints debugging logs
slow_to_fast : bool, Optional. Default=False
Set to True if the dimensions are arranged from slowest varying to fastest varying.
Set to False otherwise.
kwargs will be passed onto the creation of the dataset. Please pass chunking, compression, dtype, and other
arguments this way
Returns
-------
h5_main : USIDataset
Reference to the main dataset
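Examples
--------
A minimal sketch writing a small 2D dataset with one position and one
spectroscopic dimension (the file name and import locations are
assumptions):

>>> import numpy as np
>>> import h5py
>>> from pyUSID import Dimension
>>> with h5py.File('test.h5', 'w') as h5_f:
...     h5_main = write_main_dataset(h5_f, np.random.rand(5, 7), 'Raw_Data',
...                                  'Current', 'nA',
...                                  Dimension('X', 'um', np.arange(5)),
...                                  Dimension('Bias', 'V', np.linspace(-1, 1, 7)))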
"""
def __check_anc_before_creation(aux_prefix, dim_type='pos'):
aux_prefix = validate_single_string_arg(aux_prefix, 'aux_' + dim_type + '_prefix')
if not aux_prefix.endswith('_'):
aux_prefix += '_'
if '-' in aux_prefix:
warn('aux_' + dim_type + ' should not contain the "-" character. Reformatted name from: {} to '
'{}'.format(aux_prefix, aux_prefix.replace('-', '_')))
aux_prefix = aux_prefix.replace('-', '_')
for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
if dset_name in h5_parent_group.keys():
# TODO: What if the contained data was correct?
raise KeyError('Dataset named: ' + dset_name + ' already exists in group: '
'{}. Consider passing these datasets using kwargs (if they are correct) instead of providing the pos_dims and spec_dims arguments'.format(h5_parent_group.name))
return aux_prefix
def __ensure_anc_in_correct_file(h5_inds, h5_vals, prefix):
if h5_inds.file != h5_vals.file:
raise ValueError('Provided ' + prefix + ' datasets are present in different HDF5 files!')
if h5_inds.file != h5_parent_group.file:
# Need to copy over the anc datasets to the new group
if verbose:
print('Need to copy over ancillary datasets: {} and {} to '
'destination group: {} which is in a different HDF5 '
'file'.format(h5_inds, h5_vals, h5_parent_group))
ret_vals = [copy_dataset(x, h5_parent_group, verbose=verbose) for x in [h5_inds, h5_vals]]
else:
ret_vals = [h5_inds, h5_vals]
return tuple(ret_vals)
if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
raise TypeError('h5_parent_group should be a h5py.File or h5py.Group object')
if not is_editable_h5(h5_parent_group):
raise ValueError('The provided file is not editable')
if verbose:
print('h5 group and file OK')
quantity, units, main_data_name = validate_string_args([quantity, units, main_data_name],
['quantity', 'units', 'main_data_name'])
if verbose:
print('quantity, units, main_data_name all OK')
quantity = quantity.strip()
units = units.strip()
main_data_name = main_data_name.strip()
if '-' in main_data_name:
warn('main_data_name should not contain the "-" character. Reformatted name from: {} to '
'{}'.format(main_data_name, main_data_name.replace('-', '_')))
main_data_name = main_data_name.replace('-', '_')
if isinstance(main_data, (list, tuple)):
if not contains_integers(main_data, min_val=1):
raise ValueError('main_data if specified as a shape should be a list / tuple of integers >= 1')
if len(main_data) != 2:
raise ValueError('main_data if specified as a shape should contain 2 numbers')
if 'dtype' not in kwargs:
raise ValueError('dtype must be included as a kwarg when creating an empty dataset')
_ = validate_dtype(kwargs.get('dtype'))
main_shape = main_data
if verbose:
print('Selected empty dataset creation. OK so far')
elif isinstance(main_data, (np.ndarray, da.core.Array)):
if main_data.ndim != 2:
raise ValueError('main_data should be a 2D array')
main_shape = main_data.shape
if verbose:
print('Provided numpy or Dask array for main_data OK so far')
else:
raise TypeError('main_data should either be a numpy array or a tuple / list with the shape of the data')
if h5_pos_inds is not None and h5_pos_vals is not None:
# The provided datasets override fresh building instructions.
validate_anc_h5_dsets(h5_pos_inds, h5_pos_vals, main_shape, is_spectroscopic=False)
if verbose:
print('The shapes of the provided h5 position indices and values are OK')
h5_pos_inds, h5_pos_vals = __ensure_anc_in_correct_file(h5_pos_inds, h5_pos_vals, 'Position')
else:
aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix, dim_type='pos')
pos_dims = validate_dimensions(pos_dims, dim_type='Position')
validate_dims_against_main(main_shape, pos_dims, is_spectroscopic=False)
if verbose:
print('Passed all pre-tests for creating position datasets')
h5_pos_inds, h5_pos_vals = write_ind_val_dsets(h5_parent_group, pos_dims, is_spectral=False, verbose=verbose,
slow_to_fast=slow_to_fast, base_name=aux_pos_prefix)
if verbose:
print('Created position datasets!')
if h5_spec_inds is not None and h5_spec_vals is not None:
# The provided datasets override fresh building instructions.
validate_anc_h5_dsets(h5_spec_inds, h5_spec_vals, main_shape, is_spectroscopic=True)
if verbose:
print('The shapes of the provided h5 spectroscopic indices and values '
'are OK')
h5_spec_inds, h5_spec_vals = __ensure_anc_in_correct_file(h5_spec_inds, h5_spec_vals,
'Spectroscopic')
else:
aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix, dim_type='spec')
spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
validate_dims_against_main(main_shape, spec_dims, is_spectroscopic=True)
if verbose:
print('Passed all pre-tests for creating spectroscopic datasets')
h5_spec_inds, h5_spec_vals = write_ind_val_dsets(h5_parent_group, spec_dims, is_spectral=True, verbose=verbose,
slow_to_fast=slow_to_fast, base_name=aux_spec_prefix)
if verbose:
print('Created Spectroscopic datasets')
if h5_parent_group.file.driver == 'mpio':
if kwargs.pop('compression', None) is not None:
warn('This HDF5 file has been opened with the "mpio" driver. '
'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed')
if isinstance(main_data, np.ndarray):
# Case 1 - simple small dataset
h5_main = h5_parent_group.create_dataset(main_data_name, data=main_data, **kwargs)
if verbose:
print('Created main dataset with provided data')
elif isinstance(main_data, da.core.Array):
# Case 2 - Dask dataset
# step 0 - get rid of any automated dtype specification:
_ = kwargs.pop('dtype', None)
# step 1 - create the empty dataset:
h5_main = h5_parent_group.create_dataset(main_data_name, shape=main_data.shape, dtype=main_data.dtype,
**kwargs)
if verbose:
print('Created empty dataset: {} for writing Dask dataset: {}'.format(h5_main, main_data))
print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'.format(h5_main.name,
h5_main.file.filename))
# Step 2 - now ask Dask to dump data to disk
da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
# main_data.to_hdf5(h5_main.file.filename, h5_main.name) # Does not work with python 2 for some reason
else:
# Case 3 - large empty dataset
h5_main = h5_parent_group.create_dataset(main_data_name, main_data, **kwargs)
if verbose:
print('Created empty dataset for Main')
write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
if verbose:
print('Wrote quantity and units attributes to main dataset')
if isinstance(main_dset_attrs, dict):
write_simple_attrs(h5_main, main_dset_attrs)
if verbose:
print('Wrote provided attributes to main dataset')
write_book_keeping_attrs(h5_main)
# make it main
link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)
if verbose:
print('Successfully linked datasets - dataset should be main now')
from ..usi_data import USIDataset
return USIDataset(h5_main)
def map_grid_to_cartesian(h5_main, grid_shape, mode='histogram', **kwargs):
"""
Map an incomplete measurement, such as a spiral scan, to a cartesian grid.
Parameters
----------
h5_main : :class:`pyUSID.USIDataset`
Dataset containing the sparse measurement
grid_shape : int or [int, int]
Shape of the output :class:`numpy.ndarray`.
mode : str, optional. Default = 'histogram'
Method used for building a cartesian grid.
Available methods = 'histogram', 'linear', 'nearest', 'cubic'
Use kwargs to pass onto each of the techniques
Note
----
UNDER DEVELOPMENT!
Currently only valid for 2 position dimensions
@author: Patrik Marschalik
Returns
-------
cart_data : :class:`numpy.ndarray`
Sparse measurements mapped onto the dense cartesian grid
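Examples
--------
A minimal sketch, assuming ``h5_main`` is a USIDataset from a sparse scan
with two position dimensions (hypothetical name):

>>> img = map_grid_to_cartesian(h5_main, 64, mode='histogram')
>>> img.shape
(64, 64)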
"""
try:
from scipy.interpolate import griddata
except ImportError as expn:
griddata = None
warn('map_grid_to_cartesian() requires scipy')
raise expn
from ..usi_data import USIDataset
if not isinstance(h5_main, USIDataset):
raise TypeError('Provided object is not a pyUSID.USIDataset object')
if mode not in ['histogram', 'linear', 'nearest', 'cubic']:
raise ValueError('mode must be one of ["histogram", "linear", "nearest", "cubic"]')
ds_main = h5_main[()].squeeze()
ds_pos_vals = h5_main.h5_pos_vals[()]
if ds_pos_vals.shape[1] != 2:
raise TypeError("Only working for 2 position dimensions.")
# Transform to row, col image format
rotation = np.array([[0, 1], [-1, 0]])
ds_pos_vals = np.dot(ds_pos_vals, rotation)
try:
grid_n = len(grid_shape)
except TypeError:
grid_n = 1
if grid_n != 1 and grid_n != 2:
raise ValueError("grid_shape must be of type int or [int, int].")
if grid_n == 1:
grid_shape = 2 * [grid_shape]
def interpolate(points, values, grid_shape, method):
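# Multiplying each grid size by 1j turns it into a complex "step" so that
# np.mgrid interprets it as a number of samples rather than a step size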
grid_shape = list(map((1j).__mul__, grid_shape))
grid_x, grid_y = np.mgrid[
np.amin(points[:, 0]):np.amax(points[:, 0]):grid_shape[0],
np.amin(points[:, 1]):np.amax(points[:, 1]):grid_shape[1]
]
ndim_data = griddata(points, values, (grid_x, grid_y), method=method)
return ndim_data
if mode == "histogram":
histogram_weighted, _, _ = np.histogram2d(*ds_pos_vals.T, bins=grid_shape, weights=ds_main)
histogram, _, _ = np.histogram2d(*ds_pos_vals.T, bins=grid_shape)
cart_data = np.divide(histogram_weighted, histogram)
else:
cart_data = interpolate(ds_pos_vals, ds_main, grid_shape, method=mode)
return cart_data
def write_sidpy_dataset(si_dset, h5_parent_group, verbose=False,
**kwargs):
"""
Writes a sidpy.Dataset as a USID dataset in the provided HDF5 Group.
Please see notes about dimension types
Parameters
----------
si_dset: sidpy.Dataset
Dataset to be written to HDF5 in USID format
h5_parent_group : class:`h5py.Group`
Parent group under which the datasets will be created
verbose : bool, Optional. Default = False
Whether or not to write logs to standard out
kwargs: dict
additional keyword arguments passed on to h5py when writing data
Returns
------
h5_main : USIDataset
Reference to the main dataset
Notes
-----
USID only has two dimension types - Position and Spectroscopic.
Consider changing the types of dimensions of all other dimensions to either
"SPATIAL" or "SPECTRAL".
"""
if not isinstance(si_dset, sid.Dataset):
raise TypeError('Data to write is not a sidpy dataset')
if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
raise TypeError('h5_parent_group is not a h5py.File or '
'h5py.Group object')
spatial_dims, spectral_dims, spatial_size, spectral_size = [], [], 1, 1
for dim_ind, dime in si_dset._axes.items():
if dime._dimension_type == sid.DimensionType.SPATIAL:
spatial_dims.append(Dimension(dime._name,
dime._units,
dime.values,
dime._quantity,
dime._dimension_type))
spatial_size *= np.size(dime.values)
else:
if not dime._dimension_type == sid.DimensionType.SPECTRAL:
warn('Will consider dimension: {} of type: {} as a '
'spectroscopic dimension'.format(dime._name,
dime._dimension_type))
spectral_dims.append(Dimension(dime._name,
dime._units,
dime.values,
dime._quantity,
dime._dimension_type))
spectral_size *= np.size(dime.values)
main_dataset = da.reshape(si_dset, [spatial_size, spectral_size])
# TODO : Consider writing this out as a separate group
main_dset_attr = {}
for attr_name in dir(si_dset):
attr_val = getattr(si_dset, attr_name)
if isinstance(attr_val, dict):
main_dset_attr.update(attr_val)
h5_main = write_main_dataset(h5_parent_group=h5_parent_group,
main_data=main_dataset,
main_data_name=si_dset.name,
quantity=si_dset.quantity,
units=si_dset.units,
pos_dims=spatial_dims,
spec_dims=spectral_dims,
main_dset_attrs=flatten_dict(main_dset_attr),
slow_to_fast=True,
verbose=verbose,
**kwargs)
return h5_main