Source code for pyUSID.io.hdf_utils.model

# -*- coding: utf-8 -*-
"""
Utilities for reading and writing USID datasets that are highly model-dependent (with or without N-dimensional form)

Created on Tue Nov  3 21:14:25 2015

@author: Suhas Somnath, Chris Smith
"""
from __future__ import division, print_function, absolute_import, unicode_literals
from warnings import warn
import sys
import h5py
import numpy as np
from dask import array as da
from sidpy.hdf.hdf_utils import get_attr, write_simple_attrs, is_editable_h5, \
    copy_dataset, lazy_load_array
from sidpy.base.num_utils import contains_integers
from sidpy.base.dict_utils import flatten_dict
from sidpy.base.string_utils import validate_single_string_arg, \
    validate_list_of_strings, validate_string_args
from sidpy.hdf.dtype_utils import validate_dtype
from sidpy import sid

from .base import write_book_keeping_attrs
from .simple import link_as_main, check_if_main, write_ind_val_dsets, validate_dims_against_main, validate_anc_h5_dsets
from ..dimension import Dimension, validate_dimensions
from ..anc_build_utils import INDICES_DTYPE, make_indices_matrix

if sys.version_info.major == 3:
    unicode = str


def reshape_to_n_dims(h5_main, h5_pos=None, h5_spec=None, get_labels=False,
                      verbose=False, sort_dims=False, lazy=False):
    """
    Reshape the input 2D matrix to be N-dimensions based on the
    position and spectroscopic datasets.

    Parameters
    ----------
    h5_main : HDF5 Dataset
        2D data to be reshaped
    h5_pos : HDF5 Dataset, optional
        Position indices corresponding to rows in `h5_main`
    h5_spec : HDF5 Dataset, optional
        Spectroscopic indices corresponding to columns in `h5_main`
    get_labels : bool, optional
        Whether or not to return the dimension labels. Default = False
    verbose : bool, optional
        Whether or not to print debugging statements
    sort_dims : bool
        If True, the data is sorted so that the dimensions are in order from slowest to fastest
        If False, the data is kept in the original order
        If `get_labels` is also True, the labels are sorted as well.
    lazy : bool, optional. Default = False
        If False, ds_Nd will be a numpy.ndarray object - this is suitable if the HDF5 dataset
        fits into memory.
        If True, ds_Nd will be a dask.array object - this is suitable if the HDF5 dataset is
        too large to fit into memory. Note that this will be a lazy computation, meaning that
        the returned object just contains the instructions. In order to get the actual values
        as numpy arrays, call ds_Nd.compute()

    Returns
    -------
    ds_Nd : N-D numpy array or dask.array object
        N dimensional array arranged as [positions slowest to fastest, spectroscopic slowest to fastest]
    success : boolean or string
        True if full reshape was successful
        "Positions" if it was only possible to reshape by the position dimensions
        False if no reshape was possible
    ds_labels : list of str
        List of the labels of each dimension of `ds_Nd`

    Notes
    -----
    If either `h5_pos` or `h5_spec` are not provided, the function will first
    attempt to find them as attributes of `h5_main`. If that fails, it will
    generate dummy values for them.
    """
    # TODO: automatically switch on lazy if the data is larger than memory
    # TODO: sort_dims does not appear to do much. Functions as though it was always True

    if h5_pos is None and h5_spec is None:
        if not check_if_main(h5_main):
            raise ValueError('if h5_main is a h5py.Dataset it should be a Main dataset')
    else:
        if not isinstance(h5_main, (h5py.Dataset, np.ndarray, da.core.Array)):
            raise TypeError('h5_main should either be a h5py.Dataset or numpy array')

    if h5_pos is not None:
        if not isinstance(h5_pos, (h5py.Dataset, np.ndarray, da.core.Array)):
            raise TypeError('h5_pos should either be a h5py.Dataset or numpy array')
        if h5_pos.shape[0] != h5_main.shape[0]:
            raise ValueError('The size of h5_pos: {} does not match with h5_main: '
                             '{}'.format(h5_pos.shape, h5_main.shape))

    if h5_spec is not None:
        if not isinstance(h5_spec, (h5py.Dataset, np.ndarray, da.core.Array)):
            raise TypeError('h5_spec should either be a h5py.Dataset or numpy array')
        if h5_spec.shape[1] != h5_main.shape[1]:
            raise ValueError('The size of h5_spec: {} does not match with h5_main: '
                             '{}'.format(h5_spec.shape, h5_main.shape))

    pos_labs = np.array(['Positions'])
    spec_labs = np.array(['Spectral_Step'])

    if h5_pos is None:
        """
        Get the Position datasets from the references if possible
        """
        if isinstance(h5_main, h5py.Dataset):
            try:
                h5_pos = h5_main.file[h5_main.attrs['Position_Indices']]
                ds_pos = h5_pos[()]
                pos_labs = get_attr(h5_pos, 'labels')
            except KeyError:
                print('No position datasets found as attributes of {}'.format(h5_main.name))
                if len(h5_main.shape) > 1:
                    ds_pos = np.arange(h5_main.shape[0], dtype=INDICES_DTYPE).reshape(-1, 1)
                    pos_labs = np.array(['Position Dimension {}'.format(ipos)
                                         for ipos in range(ds_pos.shape[1])])
                else:
                    ds_pos = np.array(0, dtype=INDICES_DTYPE).reshape(-1, 1)
        else:
            ds_pos = np.arange(h5_main.shape[0], dtype=INDICES_DTYPE).reshape(-1, 1)
            pos_labs = np.array(['Position Dimension {}'.format(ipos)
                                 for ipos in range(ds_pos.shape[1])])
    elif isinstance(h5_pos, h5py.Dataset):
        """
        Position Indices dataset was provided
        """
        ds_pos = h5_pos[()]
        pos_labs = get_attr(h5_pos, 'labels')
    elif isinstance(h5_pos, (np.ndarray, da.core.Array)):
        ds_pos = np.atleast_2d(h5_pos)
        pos_labs = np.array(['Position Dimension {}'.format(ipos)
                             for ipos in range(ds_pos.shape[1])])
    else:
        raise TypeError('Position Indices must be a h5py.Dataset, numpy array, dask array, or None')

    if h5_spec is None:
        """
        Get the Spectroscopic datasets from the references if possible
        """
        if isinstance(h5_main, h5py.Dataset):
            try:
                h5_spec = h5_main.file[h5_main.attrs['Spectroscopic_Indices']]
                ds_spec = h5_spec[()]
                spec_labs = get_attr(h5_spec, 'labels')
            except KeyError:
                print('No spectroscopic datasets found as attributes of {}'.format(h5_main.name))
                if len(h5_main.shape) > 1:
                    ds_spec = np.arange(h5_main.shape[1], dtype=INDICES_DTYPE).reshape([1, -1])
                    spec_labs = np.array(['Spectral Dimension {}'.format(ispec)
                                          for ispec in range(ds_spec.shape[0])])
                else:
                    ds_spec = np.array(0, dtype=INDICES_DTYPE).reshape([1, 1])
        else:
            ds_spec = np.arange(h5_main.shape[1], dtype=INDICES_DTYPE).reshape([1, -1])
            spec_labs = np.array(['Spectral Dimension {}'.format(ispec)
                                  for ispec in range(ds_spec.shape[0])])
    elif isinstance(h5_spec, h5py.Dataset):
        """
        Spectroscopic Indices dataset was provided
        """
        ds_spec = h5_spec[()]
        spec_labs = get_attr(h5_spec, 'labels')
    elif isinstance(h5_spec, (np.ndarray, da.core.Array)):
        ds_spec = h5_spec
        spec_labs = np.array(['Spectral Dimension {}'.format(ispec)
                              for ispec in range(ds_spec.shape[0])])
    else:
        raise TypeError('Spectroscopic Indices must be a h5py.Dataset, numpy array, dask array, or None')

    '''
    Sort the indices from fastest to slowest
    '''
    pos_sort = get_sort_order(np.transpose(ds_pos))
    spec_sort = get_sort_order(ds_spec)

    if verbose:
        print('Position dimensions:', pos_labs)
        print('Position sort order:', pos_sort)
        print('Spectroscopic Dimensions:', spec_labs)
        print('Spectroscopic sort order:', spec_sort)

    '''
    Get the size of each dimension in the sorted order
    '''
    pos_dims = get_dimensionality(np.transpose(ds_pos), pos_sort)
    spec_dims = get_dimensionality(ds_spec, spec_sort)

    if np.prod(pos_dims) != h5_main.shape[0]:
        mesg = 'Product of position dimension sizes: {} = {} not matching ' \
               'with size of first axis of main dataset: {}. One or more ' \
               'dimensions are dependent dimensions and not marked as such' \
               '.'.format(pos_dims, np.prod(pos_dims), h5_main.shape[0])
        raise ValueError(mesg)

    if np.prod(spec_dims) != h5_main.shape[1]:
        mesg = 'Product of spectroscopic dimension sizes: {} = {} not matching ' \
               'with size of second axis of main dataset: {}. One or more ' \
               'dimensions are dependent dimensions and not marked as such' \
               '.'.format(spec_dims, np.prod(spec_dims), h5_main.shape[1])
        raise ValueError(mesg)

    if verbose:
        print('\nPosition dimensions (sort applied):', pos_labs[pos_sort])
        print('Position dimensionality (sort applied):', pos_dims)
        print('Spectroscopic dimensions (sort applied):', spec_labs[spec_sort])
        print('Spectroscopic dimensionality (sort applied):', spec_dims)

    if lazy:
        ds_main = lazy_load_array(h5_main)
    else:
        ds_main = h5_main[()]

    """
    Now we reshape the dataset based on those dimensions
    numpy reshapes correctly when the dimensions are arranged from slowest to fastest.
    Since the sort orders we have are from fastest to slowest, we need to reverse the
    orders for both the position and spectroscopic dimensions
    """
    if verbose:
        print('Will attempt to reshape main dataset from:\n{} to '
              '{}'.format(ds_main.shape, pos_dims[::-1] + spec_dims[::-1]))

    try:
        ds_Nd = ds_main.reshape(pos_dims[::-1] + spec_dims[::-1])
    except ValueError:
        warn('Could not reshape dataset to full N-dimensional form. '
             'Attempting reshape based on position only.')
        try:
            ds_Nd = ds_main.reshape(pos_dims[::-1] + [-1])
        except ValueError:
            warn('Reshape by position only also failed. Will keep dataset in 2d form.')
            if get_labels:
                return ds_main, False, ['Position', 'Spectral Step']
            else:
                return ds_main, False
        # No exception
        else:
            if get_labels:
                return ds_Nd, 'Positions', ['Position'] + spec_labs
            else:
                return ds_Nd, 'Positions'

    all_labels = np.hstack((pos_labs[pos_sort][::-1], spec_labs[spec_sort][::-1]))

    if verbose:
        print('\nAfter reshaping, labels are', all_labels)
        print('Data shape is', ds_Nd.shape)

    """
    At this point, the data is arranged from slowest to fastest dimension in both pos and spec
    """
    if sort_dims:
        results = [ds_Nd, True]
        if get_labels:
            results.append(all_labels)
        return results

    if verbose:
        print('\nGoing to put dimensions back in the same order as in the file:')

    swap_axes = list()
    # Compare the original order of the pos / spec labels with where these
    # dimensions occur in the sorted labels
    for lab in pos_labs:
        swap_axes.append(np.argwhere(all_labels == lab).squeeze())
    for lab in spec_labs:
        swap_axes.append(np.argwhere(all_labels == lab).squeeze())

    swap_axes = np.array(swap_axes)
    # If there are empty arrays, remove them.
    swap_axes = np.array([ax for ax in swap_axes if ax.size > 0])

    if verbose:
        print('Axes will be permuted in this order:', swap_axes)
        print('New labels ordering:', all_labels[swap_axes])

    ds_Nd = ds_Nd.transpose(tuple(swap_axes))
    results = [ds_Nd, True]

    if verbose:
        print('Dataset now of shape:', ds_Nd.shape)

    if get_labels:
        '''
        Get the labels in the proper order
        '''
        results.append(all_labels[swap_axes])

    return results
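
For illustration, a minimal usage sketch (the file name and internal dataset path below are hypothetical; assumes the file follows the USID layout so that the dataset is a Main dataset):

import h5py
from pyUSID.io.hdf_utils import reshape_to_n_dims

with h5py.File('measurement.h5', mode='r') as h5_f:  # hypothetical file
    h5_main = h5_f['Measurement_000/Channel_000/Raw_Data']  # hypothetical path
    ds_nd, success, labels = reshape_to_n_dims(h5_main, get_labels=True)
    print(success)      # True if the full N-dimensional reshape worked
    print(ds_nd.shape)  # e.g. (rows, cols, spectral steps)
    print(labels)       # dimension names, one per axis of ds_nd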
def reshape_from_n_dims(data_n_dim, h5_pos=None, h5_spec=None, verbose=False):
    """
    Reshape the input N-dimensional array to a 2D matrix arranged as
    [positions, spectroscopic], based on the position and spectroscopic
    datasets.

    Parameters
    ----------
    data_n_dim : numpy.array or dask.array.core.Array
        N dimensional array arranged as [positions dimensions..., spectroscopic dimensions]
        If h5_pos and h5_spec are not provided, this function will have to assume that the
        dimensions are arranged as [positions slowest to fastest, spectroscopic slowest to
        fastest]. This restriction is removed if h5_pos and h5_spec are provided
    h5_pos : HDF5 Dataset, numpy.array or dask.array.core.Array
        Position indices corresponding to rows in the final 2d array
        The dimensions should be arranged in terms of rate of change corresponding to
        data_n_dim. In other words if data_n_dim had two position dimensions arranged as
        [pos_fast, pos_slow, spec_dim_1....], h5_pos should be arranged as [pos_fast, pos_slow]
    h5_spec : HDF5 Dataset, numpy.array or dask.array.core.Array
        Spectroscopic indices corresponding to columns in the final 2d array
        The dimensions should be arranged in terms of rate of change corresponding to
        data_n_dim. In other words if data_n_dim had two spectral dimensions arranged as
        [pos_dim_1,..., spec_fast, spec_slow], h5_spec should be arranged as [spec_fast, spec_slow]
    verbose : bool, optional. Default = False
        Whether or not to print log statements

    Returns
    -------
    ds_2d : numpy.array
        2 dimensional numpy array arranged as [positions, spectroscopic]
    success : boolean or string
        True if full reshape was successful
        "Positions" if it was only possible to reshape by the position dimensions
        False if no reshape was possible

    Notes
    -----
    If either `h5_pos` or `h5_spec` are not provided, the function will assume
    the first dimension is position and the remaining are spectroscopic already
    in order from fastest to slowest.
    """
    if not isinstance(data_n_dim, (np.ndarray, da.core.Array)):
        raise TypeError('data_n_dim is not a numpy or dask array')

    if h5_spec is None and h5_pos is None:
        raise ValueError('at least one of h5_pos or h5_spec must be specified for an attempt '
                         'to reshape to 2D')

    if data_n_dim.ndim < 2:
        return data_n_dim, True

    if h5_pos is None:
        pass
    elif isinstance(h5_pos, h5py.Dataset):
        '''
        Position Indices dataset was provided
        '''
        ds_pos = h5_pos[()]
    elif isinstance(h5_pos, da.core.Array):
        ds_pos = h5_pos.compute()
    elif isinstance(h5_pos, np.ndarray):
        ds_pos = h5_pos
    else:
        raise TypeError('Position Indices must be a h5py.Dataset, numpy array, dask array, or None')

    if h5_spec is None:
        pass
    elif isinstance(h5_spec, h5py.Dataset):
        '''
        Spectroscopic Indices dataset was provided
        '''
        ds_spec = h5_spec[()]
    elif isinstance(h5_spec, da.core.Array):
        ds_spec = h5_spec.compute()
    elif isinstance(h5_spec, np.ndarray):
        ds_spec = h5_spec
    else:
        raise TypeError('Spectroscopic Indices must be a h5py.Dataset, numpy array, dask array, or None')

    if h5_spec is None and h5_pos is not None:
        if verbose:
            print('Spectral indices not provided but position indices provided.\n'
                  'Building spectral indices assuming that dimensions are arranged as slow -> fast')
        pos_dims = get_dimensionality(ds_pos, index_sort=get_sort_order(ds_pos))
        if not np.all([x in data_n_dim.shape for x in pos_dims]):
            raise ValueError('Dimension sizes in pos_dims: {} do not exist in data_n_dim shape: '
                             '{}'.format(pos_dims, data_n_dim.shape))
        spec_dims = [col for col in list(data_n_dim.shape[len(pos_dims):])]
        if verbose:
            print('data has dimensions: {}. Provided position indices had dimensions of size: {}. '
                  'Spectral dimensions will be built with dimensions: '
                  '{}'.format(data_n_dim.shape, pos_dims, spec_dims))
        ds_spec = make_indices_matrix(spec_dims, is_position=False)

    elif h5_pos is None and h5_spec is not None:
        if verbose:
            print('Position indices not provided but spectral indices provided.\n'
                  'Building position indices assuming that dimensions are arranged as slow -> fast')
        spec_dims = get_dimensionality(ds_spec, index_sort=get_sort_order(ds_spec))
        if not np.all([x in data_n_dim.shape for x in spec_dims]):
            raise ValueError('Dimension sizes in spec_dims: {} do not exist in data_n_dim shape: '
                             '{}'.format(spec_dims, data_n_dim.shape))
        pos_dims = [col for col in list(data_n_dim.shape[:data_n_dim.ndim - len(spec_dims)])]
        if verbose:
            print('data has dimensions: {}. Provided spectroscopic indices had dimensions of size: '
                  '{}. Position dimensions will be built with dimensions: '
                  '{}'.format(data_n_dim.shape, spec_dims, pos_dims))
        ds_pos = make_indices_matrix(pos_dims, is_position=True)

    elif h5_spec is not None and h5_pos is not None:
        if ds_pos.shape[0] * ds_spec.shape[1] != np.prod(data_n_dim.shape):
            raise ValueError('The product ({}) of the number of positions ({}) and spectroscopic '
                             '({}) observations is not equal to the product ({}) of the data shape '
                             '({}).'.format(ds_pos.shape[0] * ds_spec.shape[1], ds_pos.shape[0],
                                            ds_spec.shape[1], np.prod(data_n_dim.shape),
                                            data_n_dim.shape))
        if ds_pos.shape[1] + ds_spec.shape[0] != data_n_dim.ndim:
            # This may mean that the dummy position or spectroscopic axes have been squeezed out!
            # Dask does NOT allow singular dimensions, so we cannot do expand_dims. Handle later
            if ds_pos.size == 1 or ds_spec.size == 1:
                if verbose:
                    print('ALL Position dimensions squeezed: {}. ALL Spectroscopic dimensions '
                          'squeezed: {}.'.format(ds_pos.size == 1, ds_spec.size == 1))
            else:
                raise ValueError('The number of position ({}) and spectroscopic ({}) dimensions do '
                                 'not match with the dimensionality of the N-dimensional dataset: '
                                 '{}.'.format(ds_pos.shape[1], ds_spec.shape[0], data_n_dim.ndim))

    '''
    Sort the indices from fastest to slowest
    '''
    if ds_pos.size == 1:
        # Position dimension squeezed out:
        pos_sort = []
    else:
        pos_sort = get_sort_order(np.transpose(ds_pos))
    if ds_spec.size == 1:
        # Spectroscopic axis squeezed out:
        spec_sort = []
    else:
        spec_sort = get_sort_order(ds_spec)

    if h5_spec is None:
        spec_sort = spec_sort[::-1]
    if h5_pos is None:
        pos_sort = pos_sort[::-1]

    if verbose:
        print('Position sort order: {}'.format(pos_sort))
        print('Spectroscopic sort order: {}'.format(spec_sort))

    '''
    Now we transpose the axes associated with the spectroscopic dimensions
    so that they are in the same order as in the index array
    '''
    swap_axes = np.uint16(np.append(pos_sort[::-1], spec_sort[::-1] + len(pos_sort)))

    if verbose:
        print('swap axes: {} to be applied to N dimensional data of shape '
              '{}'.format(swap_axes, data_n_dim.shape))

    data_n_dim_2 = data_n_dim.transpose(tuple(swap_axes))

    if verbose:
        print('N dimensional data shape after axes swap: {}'.format(data_n_dim_2.shape))

    '''
    Now we reshape the dataset based on those dimensions
    We must use the spectroscopic dimensions in reverse order
    '''
    try:
        ds_2d = data_n_dim_2.reshape([ds_pos.shape[0], ds_spec.shape[1]])
    except ValueError:
        raise ValueError('Could not reshape dataset to 2D form')

    return ds_2d, True
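
A self-contained round-trip sketch with synthetic indices (assumes make_indices_matrix builds index matrices with the first dimension varying fastest):

import numpy as np
from pyUSID.io.hdf_utils import reshape_from_n_dims
from pyUSID.io.anc_build_utils import make_indices_matrix

# N-dimensional data arranged as [pos_fast (3), pos_slow (5), spectral (7)]
data_nd = np.random.rand(3, 5, 7)
# Position indices for the two position dimensions - shape (15, 2)
pos_inds = make_indices_matrix([3, 5], is_position=True)

ds_2d, success = reshape_from_n_dims(data_nd, h5_pos=pos_inds)
print(ds_2d.shape, success)  # (15, 7) True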
def get_dimensionality(ds_index, index_sort=None):
    """
    Get the size of each index dimension in a specified sort order

    Parameters
    ----------
    ds_index : 2D HDF5 Dataset or numpy array
        Row matrix of indices
    index_sort : Iterable of unsigned integers (Optional)
        Sort that can be applied to dimensionality.
        For example - Order of rows sorted from fastest to slowest

    Returns
    -------
    sorted_dims : list of unsigned integers
        Dimensionality of each row in ds_index.
        If index_sort is supplied, it will be in the sorted order
    """
    if isinstance(ds_index, da.core.Array):
        ds_index = ds_index.compute()
    if not isinstance(ds_index, (np.ndarray, h5py.Dataset)):
        raise TypeError('ds_index should either be a numpy array or h5py.Dataset')

    if ds_index.shape[0] > ds_index.shape[1]:
        # must be spectroscopic like in shape (few rows, more cols)
        ds_index = np.transpose(ds_index)

    if index_sort is None:
        index_sort = np.arange(ds_index.shape[0])
    else:
        if not contains_integers(index_sort, min_val=0):
            raise ValueError('index_sort should contain integers >= 0')
        index_sort = np.array(index_sort)
        if index_sort.ndim != 1:
            raise ValueError('index_sort should be a 1D array')
        if len(np.unique(index_sort)) > ds_index.shape[0]:
            raise ValueError('length of index_sort ({}) should not be greater than the number of '
                             'dimensions in the provided dataset '
                             '({})'.format(len(np.unique(index_sort)), ds_index.shape[0]))
        if set(np.arange(ds_index.shape[0])) != set(index_sort):
            raise ValueError('Sort order of dimensions ({}) not matching with number of dimensions '
                             '({})'.format(index_sort, ds_index.shape[0]))

    sorted_dims = [len(np.unique(row)) for row in np.array(ds_index, ndmin=2)[index_sort]]
    return sorted_dims
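
For example, for a spectroscopic-style index matrix whose first row cycles fastest:

import numpy as np
from pyUSID.io.hdf_utils import get_dimensionality

# Two spectroscopic dimensions: row 0 cycles through 2 values quickly,
# row 1 steps through 3 values slowly
spec_inds = np.vstack((np.tile(np.arange(2), 3),
                       np.repeat(np.arange(3), 2)))
print(get_dimensionality(spec_inds))          # [2, 3]
print(get_dimensionality(spec_inds, [1, 0]))  # [3, 2] - with the sort applied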
def get_sort_order(ds_spec):
    """
    Find how quickly the spectroscopic values are changing in each row
    and the order of rows from fastest changing to slowest.

    Parameters
    ----------
    ds_spec : 2D HDF5 dataset or numpy array
        Rows of indices to be sorted from fastest changing to slowest

    Returns
    -------
    change_sort : list of unsigned integers
        Order of rows sorted from fastest changing to slowest
    """
    if isinstance(ds_spec, da.core.Array):
        ds_spec = ds_spec.compute()
    if not isinstance(ds_spec, (np.ndarray, h5py.Dataset)):
        raise TypeError('ds_spec should either be a numpy array or h5py.Dataset')

    if ds_spec.shape[0] > ds_spec.shape[1]:
        # must be spectroscopic like in shape (few rows, more cols)
        ds_spec = np.transpose(ds_spec)

    change_count = [len(np.where([row[i] != row[i - 1] for i in range(len(row))])[0])
                    for row in ds_spec]
    change_sort = np.argsort(change_count)[::-1]

    return change_sort
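
For example:

import numpy as np
from pyUSID.io.hdf_utils import get_sort_order

# Row 0 repeats each value (slow); row 1 changes at every step (fast)
spec_inds = np.vstack((np.repeat(np.arange(3), 2),
                       np.tile(np.arange(2), 3)))
print(get_sort_order(spec_inds))  # [1 0] - row 1 changes fastest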
def get_unit_values(ds_inds, ds_vals, dim_names=None, all_dim_names=None, is_spec=None,
                    verbose=False):
    """
    Gets the unit arrays of values that describe the spectroscopic dimensions

    Parameters
    ----------
    ds_inds : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Indices dataset
    ds_vals : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Values dataset
    dim_names : str, or list of str, Optional
        Names of the dimensions of interest. Default = all
    all_dim_names : list of str, Optional
        Names of all the dimensions in these datasets. Use this if supplying numpy arrays
        instead of h5py.Dataset objects for ds_inds, ds_vals since there is no other way
        of getting the dimension names.
    is_spec : bool, optional
        Whether the provided ancillary datasets are spectroscopic (True) or position (False).
        The user is recommended to supply this parameter whenever it is known.
        By default, this function will attempt to recognize the answer based on the shape
        of the datasets.
    verbose : bool, optional
        Whether or not to print debugging statements. Default - off

    Note - this function can be extended / modified for ancillary position dimensions as well

    Returns
    -------
    unit_values : dict
        Dictionary containing the unit array for each dimension. The names of the
        dimensions are the keys.
    """
    if all_dim_names is None:
        allowed_types = h5py.Dataset
    else:
        all_dim_names = validate_list_of_strings(all_dim_names, 'all_dim_names')
        all_dim_names = np.array(all_dim_names)
        allowed_types = (h5py.Dataset, np.ndarray)

    for dset, dset_name in zip([ds_inds, ds_vals], ['ds_inds', 'ds_vals']):
        if not isinstance(dset, allowed_types):
            raise TypeError(dset_name + ' should be of type: {}'.format(allowed_types))

    # For now, we will throw an error if even a single dimension is listed as
    # an incomplete dimension:
    if isinstance(ds_inds, h5py.Dataset):
        if np.any(['incomplete_dimensions' in dset.attrs.keys() for dset in [ds_inds, ds_vals]]):
            try:
                incomp_dims_inds = get_attr(ds_inds, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_inds = None
            try:
                incomp_dims_vals = get_attr(ds_vals, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_vals = None
            if incomp_dims_inds is None and incomp_dims_vals is not None:
                incomp_dims = incomp_dims_vals
            elif incomp_dims_inds is not None and incomp_dims_vals is None:
                incomp_dims = incomp_dims_inds
            else:
                # ensure that both attributes are the same
                if incomp_dims_vals != incomp_dims_inds:
                    raise ValueError('Provided indices ({}) and values ({}) datasets were marked '
                                     'with different values for incomplete_dimensions'
                                     '.'.format(incomp_dims_inds, incomp_dims_vals))
                incomp_dims = incomp_dims_vals

            all_dim_names = get_attr(ds_inds, 'labels')
            raise ValueError('Among all dimensions: {}, these dimensions were marked as incomplete '
                             'dimensions: {}. You are recommended to find unit values '
                             'manually'.format(all_dim_names, incomp_dims))

    # Do we need to check that the provided inds and vals correspond to the same main dataset?
    if ds_inds.shape != ds_vals.shape:
        raise ValueError('ds_inds: {} and ds_vals: {} should have the same '
                         'shapes'.format(ds_inds.shape, ds_vals.shape))

    if all_dim_names is None:
        all_dim_names = get_attr(ds_inds, 'labels')
    if verbose:
        print('All dimensions: {}'.format(all_dim_names))

    # First load to memory
    inds_mat = ds_inds[()]
    vals_mat = ds_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically
        is_spec = False
        if inds_mat.shape[0] < inds_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError('is_spec should be a boolean. Provided object is of type: '
                            '{}'.format(type(is_spec)))

    if verbose:
        print('Ancillary matrices of shape: {}, hence determined to be Spectroscopic: '
              '{}'.format(inds_mat.shape, is_spec))

    if not is_spec:
        # Convert to spectral shape
        inds_mat = np.transpose(inds_mat)
        vals_mat = np.transpose(vals_mat)

    if len(all_dim_names) != inds_mat.shape[0]:
        raise ValueError('Length of dimension names list: {} not matching with shape of dataset: '
                         '{}.'.format(len(all_dim_names), inds_mat.shape[0]))

    if dim_names is None:
        dim_names = all_dim_names
        if verbose:
            print('Going to return unit values for all dimensions: {}'.format(all_dim_names))
    else:
        dim_names = validate_list_of_strings(dim_names, 'dim_names')
        if verbose:
            print('Checking to make sure that the target dimension names: {} exist in the datasets '
                  'attributes: {}.'.format(dim_names, all_dim_names))
        # check to make sure that the dimension names exist in the datasets:
        for dim_name in dim_names:
            if dim_name not in all_dim_names:
                raise KeyError('Dimension {} does not exist in the provided ancillary '
                               'datasets'.format(dim_name))

    unit_values = dict()
    for dim_name in all_dim_names:
        # Find the row in the spectroscopic indices that corresponds to the
        # dimension we want to slice:
        if verbose:
            print('Looking for dimension: {} in {}'.format(dim_name, dim_names))
        desired_row_ind = np.where(all_dim_names == dim_name)[0][0]

        inds_for_dim = inds_mat[desired_row_ind]

        # Wherever this dimension goes to 0 - start of a new tile
        starts = np.where(inds_for_dim == np.min(inds_for_dim))[0]
        if starts[0] != 0:
            raise ValueError('Spectroscopic Indices for dimension: "{}" not starting with 0. '
                             'Please fix this and try again.'.format(dim_name))

        # There may be repetitions in addition to tiling. Find how the positions increase.
        # 1 = repetition, > 1 = new tile
        step_sizes = np.hstack(([1], np.diff(starts)))
        # This array is of the same length as the full indices array

        # We should expect only two values of step sizes for a regular dimension
        # (tiles of the same size): 1 for the same value repeating and a big jump
        # in indices when the next tile starts.
        # If the repeats / tiles are of different lengths, then this is not a regular
        # dimension. What does a Unit Values vector even mean in this case?
        # Just raise an error for now
        if np.where(np.unique(step_sizes) - 1)[0].size > 1:
            raise ValueError('Non constant step sizes')

        # Finding the start of a new tile
        tile_starts = np.where(step_sizes > 1)[0]

        # converting these indices to correct indices that can be mapped straight to
        if len(tile_starts) < 1:
            # Dimension(s) with no tiling at all
            # Make it look as though the next tile starts at the end of the whole indices vector
            tile_starts = np.array([0, len(inds_for_dim)])
        else:
            # Dimension with some form of repetition
            tile_starts = np.hstack(([0], starts[tile_starts]))

            # Verify that each tile is identical here
            # The last tile will not be checked unless we add the length of the indices
            # vector as the start of the next tile
            tile_starts = np.hstack((tile_starts, [len(inds_for_dim)]))
            subsections = [inds_for_dim[tile_starts[ind]: tile_starts[ind + 1]]
                           for ind in range(len(tile_starts) - 1)]
            if np.max(np.diff(subsections, axis=0)) != 0:
                # Should get unit values for ALL dimensions regardless of expectations
                # to catch such scenarios.
                raise ValueError('Values in each tile of dimension: {} are '
                                 'different'.format(dim_name))

        # Now looking within the first tile:
        subsection = inds_for_dim[tile_starts[0]:tile_starts[1]]
        # remove all repetitions, i.e. take indices only where jump == 1
        step_inds = np.hstack(([0], np.where(np.hstack(([0], np.diff(subsection))))[0]))

        # Finally, use these indices to get the values
        if dim_name in dim_names:
            # Only add this dimension to the dictionary if requested.
            unit_values[dim_name] = vals_mat[desired_row_ind, step_inds]

    return unit_values
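
A small self-contained example with numpy arrays (dimension names must then be supplied via all_dim_names):

import numpy as np
from pyUSID.io.hdf_utils import get_unit_values

# Two spectroscopic dimensions: 'Bias' (3 values, fast) and 'Cycle' (2 values, slow)
spec_inds = np.vstack((np.tile(np.arange(3), 2),
                       np.repeat(np.arange(2), 3)))
spec_vals = np.vstack((np.tile([-1.0, 0.0, 1.0], 2),
                       np.repeat([0.0, 1.0], 3)))
unit_vals = get_unit_values(spec_inds, spec_vals, all_dim_names=['Bias', 'Cycle'])
print(unit_vals)  # {'Bias': array([-1., 0., 1.]), 'Cycle': array([0., 1.])}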
def write_main_dataset(h5_parent_group, main_data, main_data_name, quantity, units, pos_dims,
                       spec_dims, main_dset_attrs=None, h5_pos_inds=None, h5_pos_vals=None,
                       h5_spec_inds=None, h5_spec_vals=None, aux_spec_prefix='Spectroscopic_',
                       aux_pos_prefix='Position_', verbose=False, slow_to_fast=False, **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the ancillary datasets should be specified
    using the pos_dims and spec_dims arguments as dictionary objects. Alternatively, if both
    the indices and values datasets are already available for either/or the positions /
    spectroscopic, they can be specified using the keyword arguments. In this case, fresh
    datasets will not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the shape for an
        empty dataset. If creating an empty dataset - the dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for
        constructing the Position indices and values datasets
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for
        constructing the Spectroscopic indices and values datasets
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset.
        Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Default prefix for Spectroscopic datasets. Default = "Spectroscopic_"
    aux_pos_prefix : str or unicode, Optional
        Default prefix for Position datasets. Default = "Position_"
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    slow_to_fast : bool, Optional. Default=False
        Set to True if the dimensions are arranged from slowest varying to fastest varying.
        Set to False otherwise.
    kwargs will be passed onto the creation of the dataset. Please pass chunking, compression,
        dtype, and other arguments this way

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset
    """

    def __check_anc_before_creation(aux_prefix, dim_type='pos'):
        aux_prefix = validate_single_string_arg(aux_prefix, 'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn('aux_' + dim_type + ' should not contain the "-" character. Reformatted name '
                 'from:{} to {}'.format(aux_prefix, aux_prefix.replace('-', '_')))
        aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                # TODO: What if the contained data was correct?
                raise KeyError('Dataset named: ' + dset_name + ' already exists in group: '
                               '{}. Consider passing these datasets using kwargs (if they are '
                               'correct) instead of providing the pos_dims and spec_dims '
                               'arguments'.format(h5_parent_group.name))
        return aux_prefix

    def __ensure_anc_in_correct_file(h5_inds, h5_vals, prefix):
        if h5_inds.file != h5_vals.file:
            raise ValueError('Provided ' + prefix + ' datasets are present in different HDF5 files!')
        if h5_inds.file != h5_parent_group.file:
            # Need to copy over the anc datasets to the new group
            if verbose:
                print('Need to copy over ancillary datasets: {} and {} to destination group: {} '
                      'which is in a different HDF5 file'.format(h5_inds, h5_vals, h5_parent_group))
            ret_vals = [copy_dataset(x, h5_parent_group, verbose=verbose)
                        for x in [h5_inds, h5_vals]]
        else:
            ret_vals = [h5_inds, h5_vals]
        return tuple(ret_vals)

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    quantity, units, main_data_name = validate_string_args([quantity, units, main_data_name],
                                                           ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn('main_data_name should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError('main_data if specified as a shape should be a list / tuple of '
                             'integers >= 1')
        if len(main_data) != 2:
            raise ValueError('main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError('dtype must be included as a kwarg when creating an empty dataset')
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError('main_data should either be a numpy array or a tuple / list with the '
                        'shape of the data')

    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds, h5_pos_vals, main_shape, is_spectroscopic=False)
        if verbose:
            print('The shapes of the provided h5 position indices and values are OK')
        h5_pos_inds, h5_pos_vals = __ensure_anc_in_correct_file(h5_pos_inds, h5_pos_vals,
                                                                'Position')
    else:
        aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix, dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape, pos_dims, is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(h5_parent_group, pos_dims,
                                                       is_spectral=False, verbose=verbose,
                                                       slow_to_fast=slow_to_fast,
                                                       base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds, h5_spec_vals, main_shape, is_spectroscopic=True)
        if verbose:
            print('The shapes of the provided h5 spectroscopic indices and values are OK')
        h5_spec_inds, h5_spec_vals = __ensure_anc_in_correct_file(h5_spec_inds, h5_spec_vals,
                                                                  'Spectroscopic')
    else:
        aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix, dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape, spec_dims, is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(h5_parent_group, spec_dims,
                                                         is_spectral=True, verbose=verbose,
                                                         slow_to_fast=slow_to_fast,
                                                         base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened with the "mpio" communicator. mpi4py does not '
                 'allow creation of compressed datasets. Compression kwarg has been removed')

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, data=main_data, **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name, shape=main_data.shape,
                                                 dtype=main_data.dtype, **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.format(h5_main,
                                                                                  main_data))
            print('Dask array will be written to HDF5 dataset: "{}" in file: '
                  '"{}"'.format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)
        # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data, **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # make it main
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    from ..usi_data import USIDataset
    return USIDataset(h5_main)
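
A minimal end-to-end sketch (the file and dimension names below are hypothetical):

import h5py
import numpy as np
from pyUSID import Dimension
from pyUSID.io.hdf_utils import write_main_dataset

raw = np.random.rand(2 * 3, 5)  # 6 positions x 5 spectral steps
pos_dims = [Dimension('X', 'um', np.arange(2)),  # fastest position dimension
            Dimension('Y', 'um', np.arange(3))]  # slowest position dimension
spec_dims = Dimension('Bias', 'V', np.linspace(-1, 1, 5))

with h5py.File('example.h5', mode='w') as h5_f:  # hypothetical file
    h5_main = write_main_dataset(h5_f, raw, 'Raw_Data', 'Current', 'nA',
                                 pos_dims, spec_dims)
    print(h5_main)  # USIDataset with linked ancillary datasets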
def map_grid_to_cartesian(h5_main, grid_shape, mode='histogram', **kwargs):
    """
    Map an incomplete measurement, such as a spiral scan, to a cartesian grid.

    Parameters
    ----------
    h5_main : :class:`pyUSID.USIDataset`
        Dataset containing the sparse measurement
    grid_shape : int or [int, int]
        Shape of the output :class:`numpy.ndarray`.
    mode : str, optional. Default = 'histogram'
        Method used for building a cartesian grid.
        Available methods = 'histogram', 'linear', 'nearest', 'cubic'
        Use kwargs to pass onto each of the techniques

    Note
    ----
    UNDER DEVELOPMENT! Currently only valid for 2 position dimensions

    @author: Patrik Marschalik

    Returns
    -------
    :class:`numpy.ndarray` but could be a h5py.Dataset or dask.array.core.Array object
    """
    try:
        from scipy.interpolate import griddata
    except ImportError as expn:
        griddata = None
        warn('map_grid_to_cartesian() requires scipy')
        raise expn

    from ..usi_data import USIDataset

    if not isinstance(h5_main, USIDataset):
        raise TypeError('Provided object is not a pyUSID.USIDataset object')
    if mode not in ['histogram', 'linear', 'nearest', 'cubic']:
        raise ValueError('mode must be a string among ["histogram", "linear", "nearest", "cubic"]')

    ds_main = h5_main[()].squeeze()
    ds_pos_vals = h5_main.h5_pos_vals[()]

    if ds_pos_vals.shape[1] != 2:
        raise TypeError("Only working for 2 position dimensions.")

    # Transform to row, col image format
    rotation = np.array([[0, 1], [-1, 0]])
    ds_pos_vals = np.dot(ds_pos_vals, rotation)

    try:
        grid_n = len(grid_shape)
    except TypeError:
        grid_n = 1
    if grid_n != 1 and grid_n != 2:
        raise ValueError("grid_shape must be of type int or [int, int].")
    if grid_n == 1:
        grid_shape = 2 * [grid_shape]

    def interpolate(points, values, grid_shape, method):
        grid_shape = list(map((1j).__mul__, grid_shape))
        grid_x, grid_y = np.mgrid[
            np.amin(points[:, 0]):np.amax(points[:, 0]):grid_shape[0],
            np.amin(points[:, 1]):np.amax(points[:, 1]):grid_shape[1]
        ]
        ndim_data = griddata(points, values, (grid_x, grid_y), method=method)
        return ndim_data

    if mode == "histogram":
        histogram_weighted, _, _ = np.histogram2d(*ds_pos_vals.T, bins=grid_shape,
                                                  weights=ds_main)
        histogram, _, _ = np.histogram2d(*ds_pos_vals.T, bins=grid_shape)
        cart_data = np.divide(histogram_weighted, histogram)
    else:
        cart_data = interpolate(ds_pos_vals, ds_main, grid_shape, method=mode)

    return cart_data
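
A toy sketch of the histogram mode (hedged: a complete 4 x 4 grid stands in for a sparse scan here; a real use case would pass scattered positions such as a spiral, and the file name is hypothetical):

import h5py
import numpy as np
from pyUSID import Dimension
from pyUSID.io.hdf_utils import write_main_dataset, map_grid_to_cartesian

with h5py.File('sparse_scan.h5', mode='w') as h5_f:  # hypothetical file
    h5_main = write_main_dataset(h5_f, np.random.rand(16, 1), 'Raw_Data',
                                 'Height', 'nm',
                                 [Dimension('X', 'um', 4), Dimension('Y', 'um', 4)],
                                 Dimension('arb', 'a.u.', 1))
    cart_img = map_grid_to_cartesian(h5_main, grid_shape=4, mode='histogram')
    print(cart_img.shape)  # (4, 4)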
def write_sidpy_dataset(si_dset, h5_parent_group, verbose=False, **kwargs):
    """
    Writes a sidpy.Dataset as a USID dataset in the provided HDF5 Group.
    Please see notes about dimension types

    Parameters
    ----------
    si_dset : sidpy.Dataset
        Dataset to be written to HDF5 in USID format
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    verbose : bool, Optional. Default = False
        Whether or not to write logs to standard out
    kwargs : dict
        additional keyword arguments passed on to h5py when writing data

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset

    Notes
    -----
    USID only has two dimension types - Position and Spectroscopic. Consider
    changing the types of all other dimensions to either "SPATIAL" or "SPECTRAL".
    """
    if not isinstance(si_dset, sid.Dataset):
        raise TypeError('Data to write is not a sidpy dataset')
    if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
        raise TypeError('h5_parent_group is not a h5py.File or h5py.Group object')

    spatial_dims, spectral_dims, spatial_size, spectral_size = [], [], 1, 1
    for dim_ind, dime in si_dset._axes.items():
        if dime._dimension_type == sid.DimensionType.SPATIAL:
            spatial_dims.append(Dimension(dime._name, dime._units, dime.values,
                                          dime._quantity, dime._dimension_type))
            spatial_size *= np.size(dime.values)
        else:
            if not dime._dimension_type == sid.DimensionType.SPECTRAL:
                warn('Will consider dimension: {} of type: {} as a spectroscopic '
                     'dimension'.format(dime._name, dime._dimension_type))
            spectral_dims.append(Dimension(dime._name, dime._units, dime.values,
                                           dime._quantity, dime._dimension_type))
            spectral_size *= np.size(dime.values)

    main_dataset = da.reshape(si_dset, [spatial_size, spectral_size])

    # TODO: Consider writing this out as a separate group
    main_dset_attr = {}
    for attr_name in dir(si_dset):
        attr_val = getattr(si_dset, attr_name)
        if isinstance(attr_val, dict):
            main_dset_attr.update(attr_val)

    h5_main = write_main_dataset(h5_parent_group=h5_parent_group,
                                 main_data=main_dataset,
                                 main_data_name=si_dset.name,
                                 quantity=si_dset.quantity,
                                 units=si_dset.units,
                                 pos_dims=spatial_dims,
                                 spec_dims=spectral_dims,
                                 main_dset_attrs=flatten_dict(main_dset_attr),
                                 slow_to_fast=True,
                                 verbose=verbose,
                                 **kwargs)
    return h5_main
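
A hedged sketch (assumes sidpy's Dimension(values, name=..., units=..., quantity=..., dimension_type=...) constructor; dimension 1 is deliberately marked SPECTRAL so that USID gets one position and one spectroscopic dimension; the file name is hypothetical):

import h5py
import numpy as np
import sidpy
from pyUSID.io.hdf_utils import write_sidpy_dataset

dset = sidpy.Dataset.from_array(np.random.rand(4, 6), name='Image')
dset.quantity = 'Height'
dset.units = 'nm'
dset.set_dimension(0, sidpy.Dimension(np.arange(4), name='y', units='um',
                                      quantity='Length', dimension_type='spatial'))
dset.set_dimension(1, sidpy.Dimension(np.linspace(-1, 1, 6), name='bias', units='V',
                                      quantity='Voltage', dimension_type='spectral'))

with h5py.File('from_sidpy.h5', mode='w') as h5_f:  # hypothetical file
    h5_main = write_sidpy_dataset(dset, h5_f)
    print(h5_main)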