import numpy
import warnings
from handwriting_features.data.utils.cleanup import remove_outliers
from handwriting_features.data.exceptions.statistics import *
# Silence warnings at import time. NOTE: the filter below has no category or
# module restriction, so it suppresses every warning raised in the process,
# not only the ones emitted by numpy.
warnings.filterwarnings("ignore")
def mean(array):
    """
    Computes the arithmetic mean of an input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :return: mean value, or NaN when the input holds no finite entry
    :rtype: float
    """

    # Nothing to average unless at least one entry is finite
    if not numpy.isfinite(array).any():
        return numpy.nan

    return numpy.nanmean(array)
def std(array):
    """
    Computes the standard deviation of an input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :return: std value, or NaN when the input holds no finite entry
    :rtype: float
    """

    # No spread can be computed unless at least one entry is finite
    if not numpy.isfinite(array).any():
        return numpy.nan

    return numpy.nanstd(array)
def cv_parametric(array, as_percentage=False):
    """
    Computes the parametric coefficient of variation (std divided by mean)
    of an input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :param as_percentage: when True the ratio is multiplied by 100, defaults to False
    :type as_percentage: bool, optional
    :return: parametric cv value, or NaN when the mean or the std is not finite
    :rtype: float
    """

    # Location and spread of the data
    center = mean(array)
    spread = std(array)

    # Both statistics have to be finite for the ratio to be computed
    if not (numpy.isfinite(center) and numpy.isfinite(spread)):
        return numpy.nan

    # Machine epsilon keeps the denominator non-zero when the mean is exactly 0
    scale = 100 if as_percentage else 1
    return (spread / (center + numpy.finfo(float).eps)) * scale
def iqr(array):
    """
    Computes the interquartile range (3rd quartile minus 1st quartile) of an
    input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :return: iqr value, or NaN when either quartile is not finite
    :rtype: float
    """

    # Lower and upper quartile
    lower = numpy.nanquantile(array, 0.25)
    upper = numpy.nanquantile(array, 0.75)

    # The range is only defined when both quartiles are finite
    if not (numpy.isfinite(lower) and numpy.isfinite(upper)):
        return numpy.nan

    return upper - lower
def cv_nonparametric(array, as_percentage=False):
    """
    Computes the non-parametric coefficient of variation (iqr divided by
    median) of an input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :param as_percentage: when True the ratio is multiplied by 100, defaults to False
    :type as_percentage: bool, optional
    :return: non-parametric cv value, or NaN when the median or the iqr is not finite
    :rtype: float
    """

    # Get the iqr and median
    _med = median(array)
    _iqr = iqr(array)

    # Both statistics have to be finite for the ratio to be computed
    if not (numpy.isfinite(_med) and numpy.isfinite(_iqr)):
        return numpy.nan

    # Machine epsilon belongs in the denominator (as in cv_parametric) so that
    # a median of exactly 0 does not produce a division by zero; adding it to
    # the finished ratio would not protect the division at all.
    scale = 100 if as_percentage else 1
    return (_iqr / (_med + numpy.finfo(float).eps)) * scale
def quartile_1(array):
    """
    Computes the 1st quartile (0.25 quantile) of an input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :return: 1st quartile value, or NaN when the input holds no finite entry
    :rtype: float
    """

    # A quantile needs at least one finite entry
    if not numpy.isfinite(array).any():
        return numpy.nan

    return numpy.nanquantile(array, 0.25)
def quartile_3(array):
    """
    Computes the 3rd quartile (0.75 quantile) of an input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :return: 3rd quartile value, or NaN when the input holds no finite entry
    :rtype: float
    """

    # A quantile needs at least one finite entry
    if not numpy.isfinite(array).any():
        return numpy.nan

    return numpy.nanquantile(array, 0.75)
def percentile_5(array):
    """
    Computes the 5th percentile of an input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :return: 5th percentile value, or NaN when the input holds no finite entry
    :rtype: float
    """

    # A percentile needs at least one finite entry
    if not numpy.isfinite(array).any():
        return numpy.nan

    return numpy.nanpercentile(array, 5)
def percentile_95(array):
    """
    Computes the 95th percentile of an input array (ignoring NaNs).

    :param array: input array
    :type array: numpy.ndarray
    :return: 95th percentile value, or NaN when the input holds no finite entry
    :rtype: float
    """

    # A percentile needs at least one finite entry
    if not numpy.isfinite(array).any():
        return numpy.nan

    return numpy.nanpercentile(array, 95)
def slope_of_linear_regression(array, window_size=5, min_samples=3, center=True, threshold=3):
    """
    Computes slope of linear regression of an input array (ignoring NaNs).

    The finite samples are regressed on their position (0, 1, 2, ...) in the
    cleaned sequence, so the result is the average change per sample.

    :param array: input array
    :type array: numpy.ndarray
    :param window_size: size of the moving window, defaults to 5
    :type window_size: int, optional
    :param min_samples: minimum number of samples in a window, defaults to 3
    :type min_samples: int, optional
    :param center: labels at the center of the window flag, defaults to True
    :type center: bool, optional
    :param threshold: outlier removal threshold, defaults to 3
    :type threshold: int, optional
    :return: slope of linear regression value, or NaN when fewer than two
        usable samples remain or the fit fails
    :rtype: float
    """

    # Handle NaN/Inf value only
    if not isinstance(array, numpy.ndarray) and not numpy.isfinite(array):
        return numpy.nan

    # Create a local deepcopy of the input array
    data = numpy.copy(array)

    # Remove NaN/Inf values and flatten the array
    data = data[numpy.isfinite(data)]
    data = data.flatten("F")

    # Remove outliers (only when there is more data than one window)
    if data.size > window_size:
        data = remove_outliers(data, window_size, min_samples=min_samples, center=center, threshold=threshold)

    # A slope is undefined for an empty array or a single sample
    if data.size < 2:
        return numpy.nan

    # Fit the 1st order regression curve of the values against their sample
    # index; regressing the data on itself would always yield a slope of 1
    try:
        fitted = numpy.polyfit(numpy.arange(data.size), data, 1)
    except numpy.linalg.LinAlgError:
        return numpy.nan

    # Return the slope (highest-order coefficient comes first)
    return fitted[0]
class Statistics(object):
    """Class implementing statistics computation interface"""

    # Mapping between statistics and computational functions
    mapping = {
        "mean": mean,
        "std": std,
        "cv_parametric": cv_parametric,
        "median": median,
        "iqr": iqr,
        "cv_nonparametric": cv_nonparametric,
        "quartile_1": quartile_1,
        "quartile_3": quartile_3,
        "percentile_5": percentile_5,
        "percentile_95": percentile_95,
        "slope_of_linear_regression": slope_of_linear_regression
    }

    @classmethod
    def compute(cls, array, statistical_function):
        """
        Computes the <statistical_function> of an input <array>.

        :param array: input array (or a single floating-point value)
        :type array: numpy.ndarray or float
        :param statistical_function: statistical function name (a key of ``mapping``)
        :type statistical_function: str
        :return: computed statistics
        :rtype: float
        :raises StatisticsNameNotInMappingError: if the name is not in ``mapping``
        :raises UnsupportedDataForStatisticsError: if ``array`` is neither a
            floating-point scalar nor a ``numpy.ndarray``
        """

        # Validate input arguments
        if statistical_function not in cls.mapping:
            raise StatisticsNameNotInMappingError(f"Unsupported <statistical_function> {statistical_function}")

        # `numpy.float` was only an alias of the builtin `float` and was removed
        # in NumPy 1.24, so referencing it raises AttributeError there. The
        # builtin `float` accepts everything the alias did (numpy.float64
        # subclasses it); `numpy.floating` also admits other numpy float widths.
        if not isinstance(array, (float, numpy.floating, numpy.ndarray)):
            raise UnsupportedDataForStatisticsError(
                f"Unsupported <array> type {type(array)}; "
                f"must be any of the following: `numpy.ndarray`, `numpy.float`")

        # Compute the statistical function
        return cls.mapping[statistical_function](array)