Source code for skg.cluster1d

"""
A one-dimensional clustering algorithm that is useful for many
applications. It can be used to decode UART packets and seed iterative
minimizers for sinusoidal data.

The algorithm is very simple, relying on one sort, two diffs, and either
a max or partition of the data. The sort makes the asymptotic complexity
increase as O(n log(n)), but with a remarkably small load factor.

This module is not exactly a fitting rougine, although it is used by
:py:func:`~skg.sin_fit`, so it is not exported through the main
:py:mod:`skg` namespace.
"""

from numpy import (
    argmax, argpartition, argsort, array, concatenate, diff, log1p
)


__all__ = ['cluster1d', 'dscale_log']


[docs]def cluster1d(x, absolute=True, dscale=None, sensitivity=0):
    """
    Perform a 1-D clustering on the data.

    Data is assumed to be sorted, either increasing or decreasing
    monotonically. All data will be treated as a raveled 1-D array.

    Parameters
    ----------
    x : array-like
        The data points. Treated as a one-dimensional raveled array.
        Must be sorted.
    absolute : bool, optional
        Whether or not to introduce an absolute scaling term to the
        edge length computation. The term will determine whether cluster
        boundaries are subject to minute fluctuations in the data. It is
        generally a good idea to have this flag turned on (it is by
        default).
    dscale : callable, optional
        A function that accepts an array as input, to be applied
        *in-place* to the edges lengths before partitioning. This
        callable will adjust the location of the partition, and
        therefore what is considered to be a cluster boundary. Useful
        choices are `None` (no-op), `dscale_log`.
    sensitivity : int, optional
        The number non-maximal second-order edges to consider. With
        ``sensitivity=0`` or ``sensitivity=1``, only the largest gaps
        are considered. For ``sensitivity=2``, the second largest
        category of gaps will be included.

    Return
    ------
    q : numpy.ndarray
        An array of indices of the start of each cluster. The first
        element is always zero. The indices are always monotonically
        increasing.

    Notes
    -----
    `q` is intended to be used as the `index` argument to
    :py:meth:`numpy.ufunc.reduceat`. It can also be used as the input to
    :py:func:`numpy.split` by stripping off the leading element:
    ``q[1:]``. Use the functions in :py:mod:`skg.util` to slice and dice
    the indices further.

    Although funcionally identical, it is likely that
    ``sensitivity=0`` is slightly more efficient than ``sensitivity=1``.

    References
    ----------
    - Currently none. This function, is entirely the work of the author.
      A peer reviewed paper is currently in the works.
    """
    x = array(x, copy=False, subok=True).ravel()

    dx = diff(x)

    if dscale:
        dscale(dx)

    ind = argsort(dx)
    s = dx[ind]

    if absolute:
        # Speed tested
        ds = s.copy()
        ds[1:] -= s[:-1]
    else:
        ds = diff(s)

    m = argpartition(ds, -sensitivity)[-sensitivity:].min() \
        if sensitivity else argmax(ds)

    if not absolute:
        m += 1

    ind[m:] += 1
    ind[m:].sort()
    if m > 0:
        ind[m - 1] = 0
        return ind[m - 1:]
    return concatenate(([0], ind))


[docs]def dscale_log(x):
    """
    Perform an in-place log-scaling of the input, adjusted so that zeros
    are handled correctly.

    In practice this function computes

    .. math::

       log(x + 1)

    There is no return value, as the function operates in-place.

    Parameters
    ----------
    x : array-like
        The array to scale logarithmically.
    """
    log1p(x, out=x)