Source code for fit_checks

""" This file contains functions that check if two signals fit or not. They can be used to check a
gluing or molecular fit regions.
"""
import numpy as np
from scipy.stats import shapiro
from scipy.stats.mstats import normaltest
from scipy.stats import linregress


[docs]def check_correlation(first_signal, second_signal, threshold=None): """ Returns the correlation coefficient between the two signals. The signals can be either 1D arrays or 2D arrays containing the rolling slices of the input signals. In the 2D case, the function returns the sliding correlation between the original signals. If a threshold is provided, returns True if the correlation is above the specified threshold. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array threshold: float or None Threshold for the correlation coefficient. Returns ------- correlation: float or boolean If threshold is None, then the function returns an the correlation coefficient. If a threshold is provided, the function returns True if the correlation value is above the threshold. """ correlation_matrix = np.corrcoef(first_signal, second_signal) # This is needed to make the function work both for 1D vectors and rolling-window 2D matrices if first_signal.ndim == 1: number_of_slices = 1 else: number_of_slices = first_signal.shape[0] first_signal_idxs = np.arange(number_of_slices) second_signal_idxs = first_signal_idxs + number_of_slices correlation = correlation_matrix[first_signal_idxs, second_signal_idxs] if threshold: correlation = correlation > threshold return correlation
[docs]def sliding_check_correlation(first_signal, second_signal, window_length=11, threshold=None): """ Returns the sliding correlation coefficient between the two signals. If a threshold is provided, returns True if the correlation is above the specified threshold. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array window_length: int The length of the window. It should be an odd number. threshold: float or None Threshold for the correlation coefficient. Returns ------- correlation: float or boolean If threshold is None, then the function returns an the correlation coefficient. If a threshold is provided, the function returns True if the correlation value is above the threshold. """ # Make window length odd if window_length % 2: window_length += 1 sliding_first_signal = _rolling_window(first_signal, window_length) sliding_second_signal = _rolling_window(second_signal, window_length) correlation = check_correlation(sliding_first_signal, sliding_second_signal, threshold) correlation = _restore_array_length(correlation, window_length) return correlation
[docs]def check_linear_fit_intercept_and_correlation(first_signal, second_signal): """ Check if the intercept of a linear fit is near zero, and the correlation coefficient of the two signals. Performs a linear fit to the data, assuming y = ax + b, with x the first_signal and y the second_signal. It will return the value np.abs(b / np.mean(y) * 100) If the intercept is far from zero, it indicates that the two signals do not differ from a multiplication constant. Parameters ---------- first_signal : array The first signal array second_signal : array The second signal array Returns ------- intercept_percent : float or boolean The value of the intercept b, relative to the mean value of the second_signal. correlation : float Correlation coefficient between the two samples """ a, b, correlation, _, _ = linregress(first_signal, second_signal) # Fit the function y = ax + b intercept_percent = np.abs(b / np.mean(second_signal) * 100) return intercept_percent, correlation
[docs]def sliding_check_linear_fit_intercept_and_correlation(first_signal, second_signal, window_length=11): """ Check if the intercept of a linear fit is near zero. Performs a linear fit to the data, assuming y = ax + b, with x the first_signal and y the second_signal. It will return the value np.abs(b / np.mean(y) * 100) and the correlation of the two signals. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array window_length: int The length of the window. It should be an odd number. Returns ------- intercepts : float or boolean The value of the intercept b, relative to the mean value of the second_signal. correlations : float Correlation coefficient between the two samples """ # Make window length odd if window_length % 2: window_length += 1 results = _apply_sliding_check(check_linear_fit_intercept_and_correlation, first_signal, second_signal, window_length) intercepts = results[:, 0] correlations = results[:, 1] intercepts = _restore_array_length(intercepts, window_length) correlations = _restore_array_length(correlations, window_length) return intercepts, correlations
[docs]def check_residuals_not_gaussian(first_signal, second_signal, threshold=None): """ Check if the residuals of the linear fit are not from a normal distribution. The function uses a Shapiro-Wilk test on the residuals of a linear fit. Specifically, the function performs a linear fit to the data, assuming y = ax, and then calculates the residuals r = y - ax. It will return the p value of the Shapiro-Wilk test on the residuals. If a threshold is provided, returns True if the p value is below the specified threshold, i.e. if the residuals are probably not gaussian. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array threshold: float or None Threshold for the Shapiro-Wilk p-value. Returns ------- p_value: float or boolean If threshold is None, then the function returns the p-value of the Shapiro-Wilk test on the residuals. If a threshold is provided, the function returns True if p-value is below the threshold. """ # Estimate the residual from a linear fit. residuals = _residuals(first_signal, second_signal) # Get p-value _, p_value = shapiro(residuals) if threshold: p_value = p_value < threshold return p_value
[docs]def sliding_check_residuals_not_gaussian(first_signal, second_signal, window_length, threshold=None): """ Check if the residuals of the linear fit are not from a normal distribution. The function uses a Shapiro-Wilk test on the residuals of a linear fit. Specifically, the function performs a linear fit to the data, assuming y = ax, and then calculates the residuals r = y - ax. It will return the p value of the Shapiro-Wilk test on the residuals. If a threshold is provided, returns True if the p value is below the specified threshold, i.e. if the residuals are probably not gaussian. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array window_length: int The length of the window. It should be an odd number. threshold: float or None Threshold for the Shapiro-Wilk p-value. Returns ------- p_value: array If threshold is None, then the function returns the p-value of the Shapiro-Wilk test on the residuals. If a threshold is provided, the function returns True if p-value is below the threshold. """ # Make window length odd if window_length % 2: window_length += 1 p_values = _apply_sliding_check(check_residuals_not_gaussian, first_signal, second_signal, window_length, threshold) p_values = _restore_array_length(p_values, window_length) return p_values
[docs]def check_min_max_ratio(first_signal, second_signal, threshold=None): """ Returns the ration between minimum and maximum values (i.e. min / max). The operation is performed for both signals and the minimum is returned. The aim is to detect regions of large variation e.g. edges of clouds. Similar large values will be returned when the signals are near 0, so the relative difference is large. Consequently, this test should be used in parallel with checks e.g. about signal to noise ratio. If a threshold is provided, returns True if the reltio is above the specified threshold. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array threshold: float or None Threshold for the correlation coefficient. Returns ------- minmax: float or boolean If threshold is None, then the function returns the min/max ratio. If a threshold is provided, the function returns True if the correlation value is above the threshold. """ minmax_first = np.min(first_signal, axis=-1) / np.max(first_signal, axis=-1) minmax_second = np.min(second_signal, axis=-1) / np.max(second_signal, axis=-1) combined_minmax = np.min([minmax_first, minmax_second], axis=0) if threshold: combined_minmax = combined_minmax > threshold return combined_minmax
[docs]def sliding_check_min_max_ratio(first_signal, second_signal, window_length=11, threshold=None): """ Returns the sliding min/max ratio for both signals If a threshold is provided, returns True if the min/max ratio is above the specified threshold. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array window_length: int The length of the window. It should be an odd number. threshold: float or None Threshold for the correlation coefficient. Returns ------- correlation: float or boolean If threshold is None, then the function returns an the correlation coefficient. If a threshold is provided, the function returns True if the correlation value is above the threshold. """ # Make window length odd if window_length % 2: window_length += 1 sliding_first_signal = _rolling_window(first_signal, window_length) sliding_second_signal = _rolling_window(second_signal, window_length) minmax_ratio = check_min_max_ratio(sliding_first_signal, sliding_second_signal, threshold) minmax_ratio = _restore_array_length(minmax_ratio, window_length) return minmax_ratio
[docs]def check_residuals_not_gaussian_dagostino(first_signal, second_signal, threshold=None): """ Check if the residuals of the linear fit are not from a normal distribution. The function uses a D'agostino - Pearsons's test on the residuals of a linear fit. Specifically, the function performs a linear fit to the data, assuming y = ax, and then calculates the residuals r = y - ax. It will return the p value of the D'agostino - Pearsons's omnibus test on the residuals. If a threshold is provided, returns True if the p value is below the specified threshold, i.e. if the residuals are probably not gaussian. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array threshold: float or None Threshold for the Shapiro-Wilk p-value. Returns ------- p_value: float or boolean If threshold is None, then the function returns the p-value of the D'agostino - Pearsons's test on the residuals. If a threshold is provided, the function returns True if p-value is below the threshold. """ residuals = _residuals(first_signal, second_signal) # Get p-value _, p_value = normaltest(residuals) if threshold: p_value = p_value < threshold return p_value
[docs]def sliding_check_residuals_not_gaussian_dagostino(first_signal, second_signal, window_length, threshold=None): """ Check if the residuals of the linear fit are not from a normal distribution. The function uses a Shapiro-Wilk test on the residuals of a linear fit. Specifically, the function performs a linear fit to the data, assuming y = ax, and then calculates the residuals r = y - ax. It will return the p value of the Shapiro-Wilk test on the residuals. If a threshold is provided, returns True if the p value is below the specified threshold, i.e. if the residuals are probably not gaussian. Parameters ---------- first_signal: array The first signal array second_signal: array The second signal array window_length: int The length of the window. It should be an odd number. threshold: float or None Threshold for the Shapiro-Wilk p-value. Returns ------- p_value: array If threshold is None, then the function returns the p-value of the Shapiro-Wilk test on the residuals. If a threshold is provided, the function returns True if p-value is below the threshold. """ # Make window length odd if window_length % 2: window_length += 1 p_values = _apply_sliding_check(check_residuals_not_gaussian_dagostino, first_signal, second_signal, window_length, threshold) p_values = _restore_array_length(p_values, window_length) return p_values
def _residuals(first_signal, second_signal): # Fit y = ax a, _, _, _ = np.linalg.lstsq(first_signal[:, np.newaxis], second_signal) # Calculate residuals y_predicted = first_signal * a residuals = y_predicted - second_signal return residuals def _rolling_window(a, window): """ Return a rolling window view of the input array. This can be used to calculate moving window statistics efficiently. The code is from http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html Parameters ---------- a: array Input array window: int Window length Returns ------- : array Rolling window view of the array. One dimension larger than the input array. """ shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) strides = a.strides + (a.strides[-1],) return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) # Maybe add writeable=False in the future def _apply_sliding_check(check_function, first_signal, second_signal, window_length, threshold=None): """ Apply the callable check function to a sliding window. """ signal_length = len(first_signal) output_list = [] if window_length >= signal_length: raise RuntimeError("Window length should be smaller than signal length.") for min_idx in range(signal_length - window_length + 1): max_idx = min_idx + window_length first_slice = first_signal[min_idx:max_idx] second_slice = second_signal[min_idx:max_idx] if threshold: check_output = check_function(first_slice, second_slice, threshold) else: check_output = check_function(first_slice, second_slice) output_list.append(check_output) output_array = np.array(output_list) return output_array def _restore_array_length(input_array, window_length): """ Pad an array with the missing values. It is used to restore the shape of the array after applying sliding window operation. Parameters ---------- input_array: array The input 1D numpy array. window_length: Size of the window used for the sliding operation/ Returns ------- output_array: array The padded array. """ prepend_region_length = window_length // 2 append_region_length = window_length - prepend_region_length - 1 # Create nan arrays fill the begining and end of the array prepend_array = np.ma.masked_all(prepend_region_length, dtype=input_array.dtype) append_array = np.ma.masked_all(append_region_length, dtype=input_array.dtype) output_array = np.ma.append(prepend_array, input_array) output_array = np.ma.append(output_array, append_array) output_array = np.ma.masked_invalid(output_array) return output_array