Skip to content

Spikes Module

pyspc.spikes

Functions

find_spikes(y, ndiff=1, method='zscore', threshold=None, iqr_factor=7)

Detect spikes in input data based on the specified outlier detection method.

Parameters:

Name Type Description Default
y ndarray

Input data 1D array of a single spectrum or 2D matrix (spectra in rows)

required
ndiff int

Order of differentiation, by default 1

1
method str

Outlier detection method, by default "zscore" Available options: * 'zscore' for z-score (x[i,j]-mean(x[i,:]))/std(x[i,:]) * 'mzscore' for modified z-score (x[i,j]-median(x[i,:]))/mad(x[i,:]) * 'iqr' for inter-quartile range [q1 - factoriqr, q3 + factoriqr]

'zscore'
threshold float

Threshold value for outlier/spike detection, by default None. Used only with 'zscore' and 'mzscore' methods.

None
iqr_factor float

Factor for the IQR calculation, by default 7. Used only with 'iqr' method.

7

Returns:

Type Description
ndarray

A boolean matrix indicating the spike locations

Source code in pyspc/spikes.py
def find_spikes(
    y: np.ndarray,
    ndiff: int = 1,
    method: str = "zscore",
    threshold: float = None,
    iqr_factor: float = 7,
):
    """
    Detect spikes in input data based on the specified outlier detection method.

    Parameters
    ----------
    y : np.ndarray
        Input data 1D array of a single spectrum or 2D matrix (spectra in rows)
    ndiff : int, optional
        Order of differentiation, by default 1
    method : str, optional
        Outlier detection method, by default "zscore"
        Available options:
        * 'zscore' for z-score (x[i,j]-mean(x[i,:]))/std(x[i,:])
        * 'mzscore' for modified z-score (x[i,j]-median(x[i,:]))/mad(x[i,:])
        * 'iqr' for inter-quartile range [q1 - factor*iqr, q3 + factor*iqr]
    threshold : float, optional
        Threshold value for outlier/spike detection, by default None.
        Used only with 'zscore' and 'mzscore' methods.
    iqr_factor : float, optional
        Factor for the IQR calculation, by default 7.
        Used only with 'iqr' method.

    Returns
    -------
    np.ndarray
        A boolean matrix indicating the spike locations
    """
    if y.ndim == 1:
        y = y.reshape((1, -1))

    if y.ndim != 2:
        raise ValueError(
            "y expected to be either 2D matrix (spectra in rows) or 1D vector"
        )

    if ndiff == 0:
        data = y
    elif ndiff == 1:
        data = np.diff(y, n=1, axis=1, append=y[:, [-2]])
    elif ndiff == 2:
        data = np.diff(y, n=2, axis=1, prepend=y[:, [1]], append=y[:, [-2]])
    else:
        raise ValueError("Unexpected order of differentiation. Must be one of: 0, 1, 2")

    if method == "zscore":
        mn = np.nanmean(data, axis=1, keepdims=True)
        std = np.nanstd(data, axis=1, keepdims=True)
        is_spiky_mat = np.abs((data - mn) / std) > threshold
    elif method == "mzscore":
        med = np.nanmedian(data, axis=1, keepdims=True)
        mad = np.nanmedian(np.abs(data - med), axis=1, keepdims=True)
        # The multiplier 0.6745 is the 0.75th quartile of the standard normal
        # distribution, to which the MAD converges to.
        is_spiky_mat = np.abs(0.6745 * (data - med) / mad) > threshold
    elif method == "iqr":
        is_spiky_mat = np.apply_along_axis(
            lambda x: is_iqr_outlier(x, factor=iqr_factor), axis=1, arr=data
        )
    else:
        raise ValueError(
            "Unexpected spike detection method. Supported options: zscore, mzscore, iqr"
        )

    is_spiky_row = np.any(is_spiky_mat, axis=1)

    if np.any(is_spiky_row):
        # Span spikes
        for i in np.where(is_spiky_row)[0]:
            is_spiky_mat[i, :] = _span_spikes(y[i, :], is_spiky_mat[i, :])

        # Recursively keep searching untill all spikes are cleaned
        spiky_y = y[is_spiky_row, :].copy()
        spiky_y[is_spiky_mat[is_spiky_row, :]] = np.nan
        spiky_y = np.apply_along_axis(fillna, axis=1, arr=spiky_y)

        is_spiky_mat[is_spiky_row, :] = np.bitwise_or(
            is_spiky_mat[is_spiky_row, :],
            find_spikes(
                spiky_y,
                ndiff=ndiff,
                method=method,
                threshold=threshold,
                iqr_factor=iqr_factor,
            ),
        )

    return is_spiky_mat