Source code for skg.util

"""
Shared utility functions used by the fitting routines.
"""

from numpy import (
    add, argsort, array, concatenate, empty, flatnonzero, float_, inexact,
    issubdtype, mean, median, sqrt, __version__ as __np_version__
)
from numpy.core.multiarray import normalize_axis_index
from numpy.lib import NumpyVersion


__all__ = [
    'bias_mean', 'counts', 'ends', 'medians', 'moveaxis', 'preprocess',
    'preprocess_pair', 'preprocess_npair', 'roots', 'stds', 'sums', 'vars'
]


if NumpyVersion(__np_version__) >= '1.11.0':
    from numpy import moveaxis
else:
    from numpy import rollaxis
    def moveaxis(a, start, end):
        return rollaxis(a, start, normalize_axis_index(end, a.ndim + 1))


[docs]def preprocess(x, copy=False, float=False, axis=None):
    """
    Ensure that `x` is a properly formatted numpy array.

    Proper formatting means at least one dimension, and may include
    optional copying, reshaping and coersion into a floating point
    datatype.

    Parameters
    ----------
    x : array-like
        The array to process. If not already a numpy array, it will be
        converted to one.
    copy : bool, optional
        If True, a copy is made regardless of whether `x` is already a
        numpy array or not. The default is False.
    float : bool, optional
        If True, and `x` is not an inexact array already
        (:py:attr:`numpy.float16`, :py:attr:`numpy.float32`,
        :py:attr:`numpy.float64`, :py:attr:`numpy.float96`,
        :py:attr:`numpy.float128`, etc), coerce to be of type
        :py:attr:`numpy.float_`. Defaults to False.
    axis : int, optional
        If specified, the specified axis is moved to the end of the
        shape. Default is to return `x` with the original dimensions.

    Return
    ------
    x : ~numpy.ndarray
        Processed version of the input.
    """
    if float:
        dtype = x.dtype if hasattr(x, 'dtype') and \
                           issubdtype(x.dtype, inexact) else float_
    else:
        dtype=None

    x = array(x, copy=copy, subok=not copy, ndmin=1, dtype=dtype)
    if axis is not None and axis not in (-1, x.ndim - 1):
        # moveaxis always returns a new view, never the same object
        x = moveaxis(x, axis, -1)
    return x


[docs]def preprocess_pair(x, y, sorted=True, xcopy=False, ycopy=False):
    """
    Ensure that `x` and `y` are floating point arrays of the same size,
    ranked in increasing order by `x`.

    Parameters
    ----------
    x : array-like
        The x-values of the data points. The array will be converted to
        floating point, raveled and sorted, only as necessary.
    y : array-like
        The y-values of the data points corresponding to `x`. Must be
        the same size as `x`. Will be converted to floating point and
        raveled only as necessary. Will be sorted if `x` gets sorted.
    sorted : bool
        Set to True if `x` is already monotonically increasing or
        decreasing. If False, `x` will be sorted into increasing order,
        and `y` will be sorted along with it.
    xcopy : bool, optional
        Ensure that `x` gets copied even if it is already an array. The
        default is to leave arrays untouched as much as possible.
    ycopy : bool
        Ensure that `y` gets copied even if it is already an array. The
        default is to leave arrays untouched as much as possible.

    Return
    ------
    x, y : ~numpy.ndarray
        Processed versions of the inputs.

    See Also
    --------
    preprocess_npair : Similar function but for `x` containing vectors
        and `y` scalars.
    """
    x = preprocess(x, copy=xcopy, float=True)
    y = preprocess(y, copy=ycopy, float=True)
    if x.shape != y.shape:
        raise ValueError('x and y must be the same shape')
    x = x.ravel()
    y = y.ravel()
    if not sorted:
        # Is there a better way to do this in scipy?
        ind = argsort(x)
        x = x[ind]
        y = y[ind]
    return x, y


[docs]def preprocess_npair(x, y, axis=-1, xcopy=False, ycopy=False):
    """
    Ensure that `x` and `y` are floating point arrays of compatible
    size.

    `x` is an array containing vectors along dimension `axis`. `y`
    contains scalar elements. The shape of `y` must match that of `x`
    exactly except for `axis`.

    Parameters
    ----------
    x : array-like
        The vector x-values of the data points. The array will be
        converted to floating point, and raveled along all dimensions
        but `axis`, which will be the last dimension.
    y : array-like
        The y-values of the data points corresponding to `x`. Must have
        one fewer dimension than `x`, and its shape must match all
        elements of `x`'s shape except `axis`. Will be converted to
        floating point and raveled.
    xcopy : bool, optional
        Ensure that `x` gets copied even if it is already an array. The
        default is to leave arrays untouched as much as possible.
    ycopy : bool
        Ensure that `y` gets copied even if it is already an array. The
        default is to leave arrays untouched as much as possible.

    Return
    ------
    x, y : ~numpy.ndarray
        Processed versions of the inputs.

    See Also
    --------
    preprocess_pair : For cases when `x` and `y` both contain scalars,
        and are the exact same size.
    """
    if axis is None:
        raise ValueError('Axis must be an integer, not None')
    x = preprocess(x, copy=xcopy, float=True, axis=axis)
    y = preprocess(y, copy=ycopy, float=True)
    if x.shape[:-1] != y.shape:
        raise ValueError('x and y must be the same shape besides axis in x')
    x = x.reshape(-1, x.shape[-1])
    y = y.ravel()
    return x, y


[docs]def ends(p, size):
    """
    Compute the exclusive end indices of each cluster for a dataset of
    `size` elements.

    Parameters
    ----------
    p : array-like
        The start points of each cluster. Clusters continue until the
        start of the next cluster.
    size : int
        The size of the dataset (not of `p`). Necessary because `p`
        omits the end index.

    Return
    ------
    q : numpy.ndarray
        The exclusive indices of the end of each cluster.
        ``q.size == p.size``.
    """
    return concatenate((p.ravel()[1:], [size]))


[docs]def counts(p, size):
    """
    Compute the number of elements in each cluster in a dataset of `size`
    elements.

    Parameters
    ----------
    p : array-like
        The start points of each cluster. Clusters continue until the
        start of the next cluster.
    size : int
        The size of the dataset (not of `p`). Necessary because `p`
        omits the end index.

    Return
    ------
    c : numpy.ndarray
        The number of elements in each cluster. ``c.size == p.size``.
    """
    c = ends(p, size)
    c -= p.ravel()
    return c


[docs]def sums(x, p):
    """
    Compute the sums of each cluster in `x`.

    Parameters
    ----------
    x : array-like
        The array to compute sums over.
    p : array-like
        The start points of each cluster. Clusters continue until the
        start of the next cluster. The indices should start with zero
        and increase monotonically.

    Return
    ------
    s : numpy.ndarray
        The sum of each cluster identified by `p`.
    """
    return add.reduceat(preprocess(x).ravel(), p.ravel())


[docs]def means(x, p):
    """
    Compute the means of each cluster in `x`.

    Parameters
    ----------
    x : array-like
        The array to compute means over.
    p : array-like
        The start points of each cluster. Clusters continue until the
        start of the next cluster. The indices should start with zero
        and increase monotonically.

    Return
    ------
    m : numpy.ndarray
        The mean of each cluster identified by `p`.
    """
    x = preprocess(x)
    return sums(x, p) / counts(p, x.size)


[docs]def vars(x, p):
    """
    Compute the variances of each cluster in `x`.

    Parameters
    ----------
    x : array-like
        The array to compute variances over.
    p : array-like
        The start points of each cluster. Clusters continue until the
        start of the next cluster. The indices should start with zero
        and increase monotonically.

    Return
    ------
    v : numpy.ndarray
        The variance of each cluster identified by `p`.
    """
    x = preprocess(x)
    c = counts(p, x.size)
    return (sums(x**2, p) - sums(x, p)**2 / c) / c


[docs]def stds(x, p):
    """
    Compute the standard deviations of each cluster in `x`.

    Parameters
    ----------
    x : array-like
        The array to compute standard deviations over.
    p : array-like
        The start points of each cluster. Clusters continue until the
        start of the next cluster. The indices should start with zero
        and increase monotonically.

    Return
    ------
    s : numpy.ndarray
        The standard deviation of each cluster identified by `p`.
    """
    return sqrt(vars(x, p))


[docs]def medians(x, p):
    """
    Compute the median of each cluster in `x`.

    Parameters
    ----------
    x : array-like
        The array to compute medians over.
    p : array-like
        The start points of each cluster. Clusters continue until the
        start of the next cluster. The indices should start with zero
        and increase monotonically.

    Return
    ------
    m : numpy.ndarray
        The median of each cluster identified by `p`.
    """
    x = preprocess(x).ravel()
    m = empty(p.size)
    for i, (start, end) in enumerate(zip(p, ends(p, x.size))):
        m[i] = median(x[start:end])
    return m


[docs]def bias_mean(x, y):
    """
    Computes the bias of a dataset as the mean of the `y` values.

    This function is a wrapper to be used as `bias` for `roots`.

    Parameters
    ----------
    x : array-like
        The x-values, passed in to fulfill the correct interface and
        otherwise ignored.
    y : array-like
        The y-values of the data, treated as a 1D raveled array.

    Return
    ------
    The mean of `y`.
    """
    return mean(y)


[docs]def roots(x, y, sorted=True, bias=bias_mean, return_indices=False, return_bias=False):
    """
    Interpolate the roots of a 1-D dataset.

    Roots are interpolated using linear interpolation about an arbitrary
    bias, possibly generated from the data.

    Parameters
    ----------
    x : array-like
        The x-values of the data. Must be monotonically increasing or
        decreasing. Treated as a 1D raveled array.
    y : array-like
        The y-values of the data. Treated as a 1D raveled array. Must be
        the same lendth as `x`.
    sorted : bool
        Set to True if `x` is already monotonically increasing or
        decreasing. If False, `x` will be sorted into increasing order,
        and `y` will be sorted along with it.
    bias : scalar or array-like or callable, optional
        Either a fixed y-value that the data is offset by, or a callable
        that generates the value from `x` and `y`. The roots are
        computed for the y-values with the bias subtracted off. Defaults
        to the mean of the data.
    return_indices : bool, optional
        If `True`, return the indices of the interpolation points.
    return_bias : bool, optional
        If `True`, return the bias as an additional output parameter.
        This parameter is especially useful if `bias` is a callable.

    Return
    ------
    roots : numpy.ndarray
        The x-values of the interpolated intersections between the data
        and the bias.
    indices : numpy.ndarray
        If `return_indices` is set, this will be the indices of the
        interpolation points in `x`, as if returned by
        `numpy.searchsorted`. This will be the index of the point on the
        right of the interval containing the root.
    bias : numpy.ndarray or scalar
        If `return_bias` is set, this will be the actual scalar or array
        that is subtraced from the y-values to compute the roots.
    """
    if callable(bias):
        bias = bias(x, y)

    x, y = preprocess_pair(x, y, sorted, ycopy=True)
    # Speed tested
    y -= bias

    # Speed tested
    le = (y <= 0)
    gt = ~le

    mask = (gt[:-1] & le[1:]) | (le[:-1] & gt[1:])

    # Speed tested
    m0 = flatnonzero(mask)
    m1 = m0 + 1

    x0 = x[m0]
    x1 = x[m1]
    y0 = y[m0]
    y1 = y[m1]

    roots = (x0 * y1 - x1 * y0) / (y1 - y0)

    if return_indices:
        if return_bias:
            return roots, m1, bias
        return roots, m1
    if return_bias:
        return roots, bias
    return roots